--- /dev/null
+From a8e6015d9534f39abc08e6804566af059e498a60 Mon Sep 17 00:00:00 2001
+Date: Wed, 4 Aug 2021 01:31:34 -0600
+Subject: [PATCH 01/10] mm: x86, arm64: add arch_has_hw_pte_young()
+
+Some architectures automatically set the accessed bit in PTEs, e.g.,
+x86 and arm64 v8.2. On architectures that do not have this capability,
+clearing the accessed bit in a PTE triggers a page fault following the
+TLB miss of this PTE.
+
+Being aware of this capability can help make better decisions, i.e.,
+whether to limit the size of each batch of PTEs and the burst of
+batches when clearing the accessed bit.
+
+Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
+---
+ arch/arm64/include/asm/cpufeature.h | 5 +++++
+ arch/arm64/include/asm/pgtable.h | 13 ++++++++-----
+ arch/arm64/kernel/cpufeature.c | 10 ++++++++++
+ arch/arm64/tools/cpucaps | 1 +
+ arch/x86/include/asm/pgtable.h | 6 +++---
+ include/linux/pgtable.h | 13 +++++++++++++
+ mm/memory.c | 14 +-------------
+ 7 files changed, 41 insertions(+), 21 deletions(-)
+
+--- a/arch/arm64/include/asm/cpufeature.h
++++ b/arch/arm64/include/asm/cpufeature.h
+@@ -808,6 +808,11 @@ static inline bool system_supports_tlb_r
+ cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
+ }
+
++static inline bool system_has_hw_af(void)
++{
++ return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF);
++}
++
+ extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
+
+ static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
+--- a/arch/arm64/include/asm/pgtable.h
++++ b/arch/arm64/include/asm/pgtable.h
+@@ -999,13 +999,16 @@ static inline void update_mmu_cache(stru
+ * page after fork() + CoW for pfn mappings. We don't always have a
+ * hardware-managed access flag on arm64.
+ */
+-static inline bool arch_faults_on_old_pte(void)
++static inline bool arch_has_hw_pte_young(bool local)
+ {
+- WARN_ON(preemptible());
++ if (local) {
++ WARN_ON(preemptible());
++ return cpu_has_hw_af();
++ }
+
+- return !cpu_has_hw_af();
++ return system_has_hw_af();
+ }
+-#define arch_faults_on_old_pte arch_faults_on_old_pte
++#define arch_has_hw_pte_young arch_has_hw_pte_young
+
+ /*
+ * Experimentally, it's cheap to set the access flag in hardware and we
+@@ -1013,7 +1016,7 @@ static inline bool arch_faults_on_old_pt
+ */
+ static inline bool arch_wants_old_prefaulted_pte(void)
+ {
+- return !arch_faults_on_old_pte();
++ return arch_has_hw_pte_young(true);
+ }
+ #define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte
+
+--- a/arch/arm64/kernel/cpufeature.c
++++ b/arch/arm64/kernel/cpufeature.c
+@@ -2187,6 +2187,16 @@ static const struct arm64_cpu_capabiliti
+ .matches = has_hw_dbm,
+ .cpu_enable = cpu_enable_hw_dbm,
+ },
++ {
++ .desc = "Hardware update of the Access flag",
++ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
++ .capability = ARM64_HW_AF,
++ .sys_reg = SYS_ID_AA64MMFR1_EL1,
++ .sign = FTR_UNSIGNED,
++ .field_pos = ID_AA64MMFR1_HADBS_SHIFT,
++ .min_field_value = 1,
++ .matches = has_cpuid_feature,
++ },
+ #endif
+ {
+ .desc = "CRC32 instructions",
+--- a/arch/arm64/tools/cpucaps
++++ b/arch/arm64/tools/cpucaps
+@@ -35,6 +35,7 @@ HAS_STAGE2_FWB
+ HAS_SYSREG_GIC_CPUIF
+ HAS_TLB_RANGE
+ HAS_VIRT_HOST_EXTN
++HW_AF
+ HW_DBM
+ KVM_PROTECTED_MODE
+ MISMATCHED_CACHE_TYPE
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c
+ return boot_cpu_has_bug(X86_BUG_L1TF);
+ }
+
+-#define arch_faults_on_old_pte arch_faults_on_old_pte
+-static inline bool arch_faults_on_old_pte(void)
++#define arch_has_hw_pte_young arch_has_hw_pte_young
++static inline bool arch_has_hw_pte_young(bool local)
+ {
+- return false;
++ return true;
+ }
+
+ #endif /* __ASSEMBLY__ */
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young
+ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ #endif
+
++#ifndef arch_has_hw_pte_young
++/*
++ * Return whether the accessed bit is supported by the local CPU or all CPUs.
++ *
++ * Architectures with a hardware access flag need to implement their own
++ * helper. The default, "false", means a page fault is taken on an old PTE.
++ */
++static inline bool arch_has_hw_pte_young(bool local)
++{
++ return false;
++}
++#endif
++
+ #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
+ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long address,
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly =
+ 2;
+ #endif
+
+-#ifndef arch_faults_on_old_pte
+-static inline bool arch_faults_on_old_pte(void)
+-{
+- /*
+- * Those arches which don't have hw access flag feature need to
+- * implement their own helper. By default, "true" means pagefault
+- * will be hit on old pte.
+- */
+- return true;
+-}
+-#endif
+-
+ #ifndef arch_wants_old_prefaulted_pte
+ static inline bool arch_wants_old_prefaulted_pte(void)
+ {
+@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct
+ * On architectures with software "accessed" bits, we would
+ * take a double page fault, so mark it accessed here.
+ */
+- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
++ if (!arch_has_hw_pte_young(true) && !pte_young(vmf->orig_pte)) {
+ pte_t entry;
+
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
--- /dev/null
+From f8b663bbfa30af5515e222fd74df20ea4e8393a2 Mon Sep 17 00:00:00 2001
+Date: Sat, 26 Sep 2020 21:17:18 -0600
+Subject: [PATCH 02/10] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
+
+Some architectures support the accessed bit on non-leaf PMD entries,
+e.g., x86_64 sets the accessed bit on a non-leaf PMD entry when using
+it as part of linear address translation [1]. As an optimization, page
+table walkers that are interested in the accessed bit can skip the PTEs
+under a non-leaf PMD entry if the accessed bit is cleared on this PMD
+entry.
+
+Although an inline function may be preferable, this capability is
+added as a configuration option to look consistent when used with the
+existing macros.
+
+[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
+ Volume 3 (June 2021), section 4.8
+
+Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8
+---
+ arch/Kconfig | 9 +++++++++
+ arch/x86/Kconfig | 1 +
+ arch/x86/include/asm/pgtable.h | 3 ++-
+ arch/x86/mm/pgtable.c | 5 ++++-
+ include/linux/pgtable.h | 4 ++--
+ 5 files changed, 18 insertions(+), 4 deletions(-)
+
+--- a/arch/Kconfig
++++ b/arch/Kconfig
+@@ -1295,6 +1295,15 @@ config ARCH_HAS_ELFCORE_COMPAT
+ config ARCH_HAS_PARANOID_L1D_FLUSH
+ bool
+
++config ARCH_HAS_NONLEAF_PMD_YOUNG
++ bool
++ depends on PGTABLE_LEVELS > 2
++ help
++ Architectures that select this are able to set the accessed bit on
++ non-leaf PMD entries in addition to leaf PTE entries where pages are
++ mapped. For them, page table walkers that clear the accessed bit may
++ stop at non-leaf PMD entries if they do not see the accessed bit.
++
+ source "kernel/gcov/Kconfig"
+
+ source "scripts/gcc-plugins/Kconfig"
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -84,6 +84,7 @@ config X86
+ select ARCH_HAS_PMEM_API if X86_64
+ select ARCH_HAS_PTE_DEVMAP if X86_64
+ select ARCH_HAS_PTE_SPECIAL
++ select ARCH_HAS_NONLEAF_PMD_YOUNG if X86_64
+ select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
+ select ARCH_HAS_COPY_MC if X86_64
+ select ARCH_HAS_SET_MEMORY
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vad
+
+ static inline int pmd_bad(pmd_t pmd)
+ {
+- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
++ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
++ (_KERNPG_TABLE & ~_PAGE_ACCESSED);
+ }
+
+ static inline unsigned long pages_to_mb(unsigned long npg)
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_
+ return ret;
+ }
+
+-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+ {
+@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_
+
+ return ret;
+ }
++#endif
++
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ int pudp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pudp)
+ {
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_yo
+ #endif
+
+ #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmdp)
+@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_yo
+ BUILD_BUG();
+ return 0;
+ }
+-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
++#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
+ #endif
+
+ #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
--- /dev/null
+From a810f8e2f1bdd0707eaf05c8b4ba84a3ff2801bd Mon Sep 17 00:00:00 2001
+Date: Sun, 27 Sep 2020 20:49:08 -0600
+Subject: [PATCH 03/10] mm/vmscan.c: refactor shrink_node()
+
+Move the scan balance setup from shrink_node() into a new helper,
+prepare_scan_count(), to make upcoming mm/vmscan.c changes more readable.
+
+Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
+---
+ mm/vmscan.c | 186 +++++++++++++++++++++++++++-------------------------
+ 1 file changed, 98 insertions(+), 88 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2562,6 +2562,103 @@ enum scan_balance {
+ SCAN_FILE,
+ };
+
++static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
++{
++ unsigned long file;
++ struct lruvec *target_lruvec;
++
++ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
++
++ /*
++ * Determine the scan balance between anon and file LRUs.
++ */
++ spin_lock_irq(&target_lruvec->lru_lock);
++ sc->anon_cost = target_lruvec->anon_cost;
++ sc->file_cost = target_lruvec->file_cost;
++ spin_unlock_irq(&target_lruvec->lru_lock);
++
++ /*
++ * Target desirable inactive:active list ratios for the anon
++ * and file LRU lists.
++ */
++ if (!sc->force_deactivate) {
++ unsigned long refaults;
++
++ refaults = lruvec_page_state(target_lruvec,
++ WORKINGSET_ACTIVATE_ANON);
++ if (refaults != target_lruvec->refaults[0] ||
++ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
++ sc->may_deactivate |= DEACTIVATE_ANON;
++ else
++ sc->may_deactivate &= ~DEACTIVATE_ANON;
++
++ /*
++ * When refaults are being observed, it means a new
++ * workingset is being established. Deactivate to get
++ * rid of any stale active pages quickly.
++ */
++ refaults = lruvec_page_state(target_lruvec,
++ WORKINGSET_ACTIVATE_FILE);
++ if (refaults != target_lruvec->refaults[1] ||
++ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
++ sc->may_deactivate |= DEACTIVATE_FILE;
++ else
++ sc->may_deactivate &= ~DEACTIVATE_FILE;
++ } else
++ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
++
++ /*
++ * If we have plenty of inactive file pages that aren't
++ * thrashing, try to reclaim those first before touching
++ * anonymous pages.
++ */
++ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
++ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
++ sc->cache_trim_mode = 1;
++ else
++ sc->cache_trim_mode = 0;
++
++ /*
++ * Prevent the reclaimer from falling into the cache trap: as
++ * cache pages start out inactive, every cache fault will tip
++ * the scan balance towards the file LRU. And as the file LRU
++ * shrinks, so does the window for rotation from references.
++ * This means we have a runaway feedback loop where a tiny
++ * thrashing file LRU becomes infinitely more attractive than
++ * anon pages. Try to detect this based on file LRU size.
++ */
++ if (!cgroup_reclaim(sc)) {
++ unsigned long total_high_wmark = 0;
++ unsigned long free, anon;
++ int z;
++
++ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
++ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
++ node_page_state(pgdat, NR_INACTIVE_FILE);
++
++ for (z = 0; z < MAX_NR_ZONES; z++) {
++ struct zone *zone = &pgdat->node_zones[z];
++
++ if (!managed_zone(zone))
++ continue;
++
++ total_high_wmark += high_wmark_pages(zone);
++ }
++
++ /*
++ * Consider anon: if that's low too, this isn't a
++ * runaway file reclaim problem, but rather just
++ * extreme pressure. Reclaim as per usual then.
++ */
++ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
++
++ sc->file_is_tiny =
++ file + free <= total_high_wmark &&
++ !(sc->may_deactivate & DEACTIVATE_ANON) &&
++ anon >> sc->priority;
++ }
++}
++
+ /*
+ * Determine how aggressively the anon and file LRU lists should be
+ * scanned. The relative value of each set of LRU lists is determined
+@@ -3032,7 +3129,6 @@ static void shrink_node(pg_data_t *pgdat
+ unsigned long nr_reclaimed, nr_scanned;
+ struct lruvec *target_lruvec;
+ bool reclaimable = false;
+- unsigned long file;
+
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+@@ -3048,93 +3144,7 @@ again:
+ nr_reclaimed = sc->nr_reclaimed;
+ nr_scanned = sc->nr_scanned;
+
+- /*
+- * Determine the scan balance between anon and file LRUs.
+- */
+- spin_lock_irq(&target_lruvec->lru_lock);
+- sc->anon_cost = target_lruvec->anon_cost;
+- sc->file_cost = target_lruvec->file_cost;
+- spin_unlock_irq(&target_lruvec->lru_lock);
+-
+- /*
+- * Target desirable inactive:active list ratios for the anon
+- * and file LRU lists.
+- */
+- if (!sc->force_deactivate) {
+- unsigned long refaults;
+-
+- refaults = lruvec_page_state(target_lruvec,
+- WORKINGSET_ACTIVATE_ANON);
+- if (refaults != target_lruvec->refaults[0] ||
+- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+- sc->may_deactivate |= DEACTIVATE_ANON;
+- else
+- sc->may_deactivate &= ~DEACTIVATE_ANON;
+-
+- /*
+- * When refaults are being observed, it means a new
+- * workingset is being established. Deactivate to get
+- * rid of any stale active pages quickly.
+- */
+- refaults = lruvec_page_state(target_lruvec,
+- WORKINGSET_ACTIVATE_FILE);
+- if (refaults != target_lruvec->refaults[1] ||
+- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+- sc->may_deactivate |= DEACTIVATE_FILE;
+- else
+- sc->may_deactivate &= ~DEACTIVATE_FILE;
+- } else
+- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+-
+- /*
+- * If we have plenty of inactive file pages that aren't
+- * thrashing, try to reclaim those first before touching
+- * anonymous pages.
+- */
+- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+- sc->cache_trim_mode = 1;
+- else
+- sc->cache_trim_mode = 0;
+-
+- /*
+- * Prevent the reclaimer from falling into the cache trap: as
+- * cache pages start out inactive, every cache fault will tip
+- * the scan balance towards the file LRU. And as the file LRU
+- * shrinks, so does the window for rotation from references.
+- * This means we have a runaway feedback loop where a tiny
+- * thrashing file LRU becomes infinitely more attractive than
+- * anon pages. Try to detect this based on file LRU size.
+- */
+- if (!cgroup_reclaim(sc)) {
+- unsigned long total_high_wmark = 0;
+- unsigned long free, anon;
+- int z;
+-
+- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+- file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+- node_page_state(pgdat, NR_INACTIVE_FILE);
+-
+- for (z = 0; z < MAX_NR_ZONES; z++) {
+- struct zone *zone = &pgdat->node_zones[z];
+- if (!managed_zone(zone))
+- continue;
+-
+- total_high_wmark += high_wmark_pages(zone);
+- }
+-
+- /*
+- * Consider anon: if that's low too, this isn't a
+- * runaway file reclaim problem, but rather just
+- * extreme pressure. Reclaim as per usual then.
+- */
+- anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+-
+- sc->file_is_tiny =
+- file + free <= total_high_wmark &&
+- !(sc->may_deactivate & DEACTIVATE_ANON) &&
+- anon >> sc->priority;
+- }
++ prepare_scan_count(pgdat, sc);
+
+ shrink_node_memcgs(pgdat, sc);
+
--- /dev/null
+From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001
+Date: Mon, 25 Jan 2021 21:12:33 -0700
+Subject: [PATCH 04/10] mm: multigenerational lru: groundwork
+
+For each lruvec, evictable pages are divided into multiple
+generations. The youngest generation number is stored in
+lrugen->max_seq for both anon and file types as they are aged on an
+equal footing. The oldest generation numbers are stored in
+lrugen->min_seq[] separately for anon and file types as clean file
+pages can be evicted regardless of swap constraints. These three
+variables are monotonically increasing. Generation numbers are
+truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into
+page->flags. The sliding window technique is used to prevent truncated
+generation numbers from overlapping. Each truncated generation number
+is an index to
+lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES].
+
+The framework comprises two conceptually independent components: the
+aging, which produces young generations, and the eviction, which
+consumes old generations. Both can be invoked independently from user
+space for the purpose of working set estimation and proactive reclaim.
+
+The protection of hot pages and the selection of cold pages are based
+on page access types and patterns. There are two access types: one via
+page tables and the other via file descriptors. The protection of the
+former type is by design stronger because:
+ 1) The uncertainty in determining the access patterns of the former
+ type is higher due to the coalesced nature of the accessed bit.
+ 2) The cost of evicting the former type is higher due to the TLB
+ flushes required and the likelihood of involving I/O.
+ 3) The penalty of under-protecting the former type is higher because
+ applications usually do not prepare themselves for major faults like
+ they do for blocked I/O. For example, client applications commonly
+ dedicate blocked I/O to separate threads to avoid UI janks that
+ negatively affect user experience.
+
+There are also two access patterns: one with temporal locality and the
+other without. The latter pattern, e.g., random and sequential, needs
+to be explicitly excluded to avoid weakening the protection of the
+former pattern. Generally the former type follows the former pattern
+unless MADV_SEQUENTIAL is specified and the latter type follows the
+latter pattern unless outlying refaults have been observed.
+
+Upon faulting, a page is added to the youngest generation, which
+provides the strongest protection as the eviction will not consider
+this page before the aging has scanned it at least twice. The first
+scan clears the accessed bit set during the initial fault. And the
+second scan makes sure this page has not been used since the first
+scan. A page from any other generation is brought back to the
+youngest generation whenever the aging finds the accessed bit set on
+any of the PTEs mapping this page.
+
+Unmapped pages are initially added to the oldest generation and then
+conditionally protected by tiers. This is done later [PATCH 07/10].
+
+Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
+---
+ fs/fuse/dev.c | 3 +-
+ include/linux/cgroup.h | 15 +-
+ include/linux/mm.h | 36 ++++
+ include/linux/mm_inline.h | 182 ++++++++++++++++++++
+ include/linux/mmzone.h | 70 ++++++++
+ include/linux/page-flags-layout.h | 19 ++-
+ include/linux/page-flags.h | 4 +-
+ include/linux/sched.h | 3 +
+ kernel/bounds.c | 3 +
+ kernel/cgroup/cgroup-internal.h | 1 -
+ mm/huge_memory.c | 3 +-
+ mm/memcontrol.c | 1 +
+ mm/memory.c | 7 +
+ mm/mm_init.c | 6 +-
+ mm/page_alloc.c | 1 +
+ mm/swap.c | 9 +-
+ mm/swapfile.c | 2 +
+ mm/vmscan.c | 268 ++++++++++++++++++++++++++++++
+ 18 files changed, 618 insertions(+), 15 deletions(-)
+
+--- a/fs/fuse/dev.c
++++ b/fs/fuse/dev.c
+@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
+ 1 << PG_active |
+ 1 << PG_workingset |
+ 1 << PG_reclaim |
+- 1 << PG_waiters))) {
++ 1 << PG_waiters |
++ LRU_GEN_MASK | LRU_REFS_MASK))) {
+ dump_page(page, "fuse: trying to steal weird page");
+ return 1;
+ }
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr
+ css_put(&cgrp->self);
+ }
+
++extern struct mutex cgroup_mutex;
++
++static inline void cgroup_lock(void)
++{
++ mutex_lock(&cgroup_mutex);
++}
++
++static inline void cgroup_unlock(void)
++{
++ mutex_unlock(&cgroup_mutex);
++}
++
+ /**
+ * task_css_set_check - obtain a task's css_set with extra access conditions
+ * @task: the task to obtain css_set for
+@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr
+ * as locks used during the cgroup_subsys::attach() methods.
+ */
+ #ifdef CONFIG_PROVE_RCU
+-extern struct mutex cgroup_mutex;
+ extern spinlock_t css_set_lock;
+ #define task_css_set_check(task, __c) \
+ rcu_dereference_check((task)->cgroups, \
+@@ -707,6 +718,8 @@ struct cgroup;
+ static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
+ static inline void css_get(struct cgroup_subsys_state *css) {}
+ static inline void css_put(struct cgroup_subsys_state *css) {}
++static inline void cgroup_lock(void) {}
++static inline void cgroup_unlock(void) {}
+ static inline int cgroup_attach_task_all(struct task_struct *from,
+ struct task_struct *t) { return 0; }
+ static inline int cgroupstats_build(struct cgroupstats *stats,
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
+ #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
+ #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
+ #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
++#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
++#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
+
+ /*
+ * Define the bit shifts to access each section. For non-existent
+@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s
+ loff_t const holebegin, loff_t const holelen, int even_cows) { }
+ #endif
+
++#ifdef CONFIG_LRU_GEN
++static inline void task_enter_nonseq_fault(void)
++{
++ WARN_ON(current->in_nonseq_fault);
++
++ current->in_nonseq_fault = 1;
++}
++
++static inline void task_exit_nonseq_fault(void)
++{
++ WARN_ON(!current->in_nonseq_fault);
++
++ current->in_nonseq_fault = 0;
++}
++
++static inline bool task_in_nonseq_fault(void)
++{
++ return current->in_nonseq_fault;
++}
++#else
++static inline void task_enter_nonseq_fault(void)
++{
++}
++
++static inline void task_exit_nonseq_fault(void)
++{
++}
++
++static inline bool task_in_nonseq_fault(void)
++{
++ return false;
++}
++#endif /* CONFIG_LRU_GEN */
++
+ static inline void unmap_shared_mapping_range(struct address_space *mapping,
+ loff_t const holebegin, loff_t const holelen)
+ {
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag
+ return lru;
+ }
+
++#ifdef CONFIG_LRU_GEN
++
++static inline bool lru_gen_enabled(void)
++{
++#ifdef CONFIG_LRU_GEN_ENABLED
++ DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
++
++ return static_branch_likely(&lru_gen_static_key);
++#else
++ DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
++
++ return static_branch_unlikely(&lru_gen_static_key);
++#endif
++}
++
++/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */
++static inline int lru_gen_from_seq(unsigned long seq)
++{
++ return seq % MAX_NR_GENS;
++}
++
++/* The youngest and the second youngest generations are counted as active. */
++static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
++{
++ unsigned long max_seq = lruvec->evictable.max_seq;
++
++ VM_BUG_ON(gen >= MAX_NR_GENS);
++
++ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
++}
++
++/* Update the sizes of the multigenerational lru lists. */
++static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
++ int old_gen, int new_gen)
++{
++ int type = page_is_file_lru(page);
++ int zone = page_zonenum(page);
++ int delta = thp_nr_pages(page);
++ enum lru_list lru = type * LRU_FILE;
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ lockdep_assert_held(&lruvec->lru_lock);
++ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
++ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
++ VM_BUG_ON(old_gen == -1 && new_gen == -1);
++
++ if (old_gen >= 0)
++ WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
++ lrugen->sizes[old_gen][type][zone] - delta);
++ if (new_gen >= 0)
++ WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
++ lrugen->sizes[new_gen][type][zone] + delta);
++
++ if (old_gen < 0) {
++ if (lru_gen_is_active(lruvec, new_gen))
++ lru += LRU_ACTIVE;
++ update_lru_size(lruvec, lru, zone, delta);
++ return;
++ }
++
++ if (new_gen < 0) {
++ if (lru_gen_is_active(lruvec, old_gen))
++ lru += LRU_ACTIVE;
++ update_lru_size(lruvec, lru, zone, -delta);
++ return;
++ }
++
++ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
++ update_lru_size(lruvec, lru, zone, -delta);
++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
++ }
++
++ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
++}
++
++/* Add a page to one of the multigenerational lru lists. Return true on success. */
++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
++{
++ int gen;
++ unsigned long old_flags, new_flags;
++ int type = page_is_file_lru(page);
++ int zone = page_zonenum(page);
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ if (PageUnevictable(page) || !lrugen->enabled[type])
++ return false;
++ /*
++ * If a page shouldn't be considered for eviction, i.e., a page mapped
++ * upon fault during which the accessed bit is set, add it to the
++ * youngest generation.
++ *
++ * If a page can't be evicted immediately, i.e., an anon page not in
++ * swap cache or a dirty page pending writeback, add it to the second
++ * oldest generation.
++ *
++ * If a page could be evicted immediately, e.g., a clean page, add it to
++ * the oldest generation.
++ */
++ if (PageActive(page))
++ gen = lru_gen_from_seq(lrugen->max_seq);
++ else if ((!type && !PageSwapCache(page)) ||
++ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
++ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
++ else
++ gen = lru_gen_from_seq(lrugen->min_seq[type]);
++
++ do {
++ new_flags = old_flags = READ_ONCE(page->flags);
++ VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);
++
++ new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
++
++ lru_gen_update_size(page, lruvec, -1, gen);
++ /* for rotate_reclaimable_page() */
++ if (reclaiming)
++ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
++ else
++ list_add(&page->lru, &lrugen->lists[gen][type][zone]);
++
++ return true;
++}
++
++/* Delete a page from one of the multigenerational lru lists. Return true on success. */
++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
++{
++ int gen;
++ unsigned long old_flags, new_flags;
++
++ do {
++ new_flags = old_flags = READ_ONCE(page->flags);
++ if (!(new_flags & LRU_GEN_MASK))
++ return false;
++
++ VM_BUG_ON_PAGE(PageActive(page), page);
++ VM_BUG_ON_PAGE(PageUnevictable(page), page);
++
++ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
++
++ new_flags &= ~LRU_GEN_MASK;
++ /* for shrink_page_list() */
++ if (reclaiming)
++ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
++ else if (lru_gen_is_active(lruvec, gen))
++ new_flags |= BIT(PG_active);
++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
++
++ lru_gen_update_size(page, lruvec, gen, -1);
++ list_del(&page->lru);
++
++ return true;
++}
++
++#else
++
++static inline bool lru_gen_enabled(void)
++{
++ return false;
++}
++
++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
++{
++ return false;
++}
++
++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
++{
++ return false;
++}
++
++#endif /* CONFIG_LRU_GEN */
++
+ static __always_inline void add_page_to_lru_list(struct page *page,
+ struct lruvec *lruvec)
+ {
+ enum lru_list lru = page_lru(page);
+
++ if (lru_gen_add_page(page, lruvec, false))
++ return;
++
+ update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
+ list_add(&page->lru, &lruvec->lists[lru]);
+ }
+@@ -93,6 +269,9 @@ static __always_inline void add_page_to_
+ {
+ enum lru_list lru = page_lru(page);
+
++ if (lru_gen_add_page(page, lruvec, true))
++ return;
++
+ update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
+ list_add_tail(&page->lru, &lruvec->lists[lru]);
+ }
+@@ -100,6 +279,9 @@ static __always_inline void add_page_to_
+ static __always_inline void del_page_from_lru_list(struct page *page,
+ struct lruvec *lruvec)
+ {
++ if (lru_gen_del_page(page, lruvec, false))
++ return;
++
+ list_del(&page->lru);
+ update_lru_size(lruvec, page_lru(page), page_zonenum(page),
+ -thp_nr_pages(page));
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -294,6 +294,72 @@ enum lruvec_flags {
+ */
+ };
+
++struct lruvec;
++
++#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
++#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
++
++#ifdef CONFIG_LRU_GEN
++
++/*
++ * For each lruvec, evictable pages are divided into multiple generations. The
++ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are
++ * monotonically increasing. The sliding window technique is used to track at
++ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the
++ * window, AKA gen, indexes an array of per-type and per-zone lists for the
++ * corresponding generation. The counter in page->flags stores gen+1 while a
++ * page is on one of the multigenerational lru lists. Otherwise, it stores 0.
++ *
++ * After a page is faulted in, the aging must check the accessed bit at least
++ * twice before the eviction would consider it. The first check clears the
++ * accessed bit set during the initial fault. The second check makes sure this
++ * page hasn't been used since then.
++ */
++#define MIN_NR_GENS 2
++#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
++
++struct lrugen {
++ /* the aging increments the max generation number */
++ unsigned long max_seq;
++ /* the eviction increments the min generation numbers */
++ unsigned long min_seq[ANON_AND_FILE];
++ /* the birth time of each generation in jiffies */
++ unsigned long timestamps[MAX_NR_GENS];
++ /* the multigenerational lru lists */
++ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
++ /* the sizes of the multigenerational lru lists in pages */
++ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
++ /* whether the multigenerational lru is enabled */
++ bool enabled[ANON_AND_FILE];
++};
++
++#define MAX_BATCH_SIZE 8192
++
++void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
++void lru_gen_change_state(bool enable, bool main, bool swap);
++
++#ifdef CONFIG_MEMCG
++void lru_gen_init_memcg(struct mem_cgroup *memcg);
++#endif
++
++#else /* !CONFIG_LRU_GEN */
++
++static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
++{
++}
++
++static inline void lru_gen_change_state(bool enable, bool main, bool swap)
++{
++}
++
++#ifdef CONFIG_MEMCG
++static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
++{
++}
++#endif
++
++#endif /* CONFIG_LRU_GEN */
++
+ struct lruvec {
+ struct list_head lists[NR_LRU_LISTS];
+ /* per lruvec lru_lock for memcg */
+@@ -311,6 +377,10 @@ struct lruvec {
+ unsigned long refaults[ANON_AND_FILE];
+ /* Various lruvec state flags (enum lruvec_flags) */
+ unsigned long flags;
++#ifdef CONFIG_LRU_GEN
++ /* unevictable pages are on LRU_UNEVICTABLE */
++ struct lrugen evictable;
++#endif
+ #ifdef CONFIG_MEMCG
+ struct pglist_data *pgdat;
+ #endif
+--- a/include/linux/page-flags-layout.h
++++ b/include/linux/page-flags-layout.h
+@@ -26,6 +26,14 @@
+
+ #define ZONES_WIDTH ZONES_SHIFT
+
++#ifdef CONFIG_LRU_GEN
++/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */
++#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2)
++#else
++#define LRU_GEN_WIDTH 0
++#define LRU_REFS_WIDTH 0
++#endif /* CONFIG_LRU_GEN */
++
+ #ifdef CONFIG_SPARSEMEM
+ #include <asm/sparsemem.h>
+ #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
+@@ -55,7 +63,8 @@
+ #define SECTIONS_WIDTH 0
+ #endif
+
+-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
++ <= BITS_PER_LONG - NR_PAGEFLAGS
+ #define NODES_WIDTH NODES_SHIFT
+ #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
+ #error "Vmemmap: No space for nodes field in page flags"
+@@ -89,8 +98,8 @@
+ #define LAST_CPUPID_SHIFT 0
+ #endif
+
+-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
+- <= BITS_PER_LONG - NR_PAGEFLAGS
++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
++ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+ #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
+ #else
+ #define LAST_CPUPID_WIDTH 0
+@@ -100,8 +109,8 @@
+ #define LAST_CPUPID_NOT_IN_PAGE_FLAGS
+ #endif
+
+-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
+- > BITS_PER_LONG - NR_PAGEFLAGS
++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
++ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
+ #error "Not enough bits in page flags"
+ #endif
+
+--- a/include/linux/page-flags.h
++++ b/include/linux/page-flags.h
+@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
+ 1UL << PG_private | 1UL << PG_private_2 | \
+ 1UL << PG_writeback | 1UL << PG_reserved | \
+ 1UL << PG_slab | 1UL << PG_active | \
+- 1UL << PG_unevictable | __PG_MLOCKED)
++ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
+
+ /*
+ * Flags checked when a page is prepped for return by the page allocator.
+@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
+ * alloc-free cycle to prevent from reusing the page.
+ */
+ #define PAGE_FLAGS_CHECK_AT_PREP \
+- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
++ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
+
+ #define PAGE_FLAGS_PRIVATE \
+ (1UL << PG_private | 1UL << PG_private_2)
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -911,6 +911,9 @@ struct task_struct {
+ #ifdef CONFIG_MEMCG
+ unsigned in_user_fault:1;
+ #endif
++#ifdef CONFIG_LRU_GEN
++ unsigned in_nonseq_fault:1;
++#endif
+ #ifdef CONFIG_COMPAT_BRK
+ unsigned brk_randomized:1;
+ #endif
+--- a/kernel/bounds.c
++++ b/kernel/bounds.c
+@@ -22,6 +22,9 @@ int main(void)
+ DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+ #endif
+ DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
++#ifdef CONFIG_LRU_GEN
++ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1));
++#endif
+ /* End of constants */
+
+ return 0;
+--- a/kernel/cgroup/cgroup-internal.h
++++ b/kernel/cgroup/cgroup-internal.h
+@@ -165,7 +165,6 @@ struct cgroup_mgctx {
+ #define DEFINE_CGROUP_MGCTX(name) \
+ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
+
+-extern struct mutex cgroup_mutex;
+ extern spinlock_t css_set_lock;
+ extern struct cgroup_subsys *cgroup_subsys[];
+ extern struct list_head cgroup_roots;
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc
+ #ifdef CONFIG_64BIT
+ (1L << PG_arch_2) |
+ #endif
+- (1L << PG_dirty)));
++ (1L << PG_dirty) |
++ LRU_GEN_MASK | LRU_REFS_MASK));
+
+ /* ->mapping in first tail page is compound_mapcount */
+ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -5226,6 +5226,7 @@ static struct mem_cgroup *mem_cgroup_all
+ memcg->deferred_split_queue.split_queue_len = 0;
+ #endif
+ idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
++ lru_gen_init_memcg(memcg);
+ return memcg;
+ fail:
+ mem_cgroup_id_remove(memcg);
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are
+ unsigned int flags, struct pt_regs *regs)
+ {
+ vm_fault_t ret;
++ bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ);
+
+ __set_current_state(TASK_RUNNING);
+
+@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are
+ if (flags & FAULT_FLAG_USER)
+ mem_cgroup_enter_user_fault();
+
++ if (nonseq_fault)
++ task_enter_nonseq_fault();
++
+ if (unlikely(is_vm_hugetlb_page(vma)))
+ ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
+ else
+ ret = __handle_mm_fault(vma, address, flags);
+
++ if (nonseq_fault)
++ task_exit_nonseq_fault();
++
+ if (flags & FAULT_FLAG_USER) {
+ mem_cgroup_exit_user_fault();
+ /*
+--- a/mm/mm_init.c
++++ b/mm/mm_init.c
+@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
+
+ shift = 8 * sizeof(unsigned long);
+ width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
+- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
++ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
+- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
++ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
+ SECTIONS_WIDTH,
+ NODES_WIDTH,
+ ZONES_WIDTH,
+ LAST_CPUPID_WIDTH,
+ KASAN_TAG_WIDTH,
++ LRU_GEN_WIDTH,
++ LRU_REFS_WIDTH,
+ NR_PAGEFLAGS);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -7456,6 +7456,7 @@ static void __meminit pgdat_init_interna
+
+ pgdat_page_ext_init(pgdat);
+ lruvec_init(&pgdat->__lruvec);
++ lru_gen_init_state(NULL, &pgdat->__lruvec);
+ }
+
+ static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
+ VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
++ /* see the comment in lru_gen_add_page() */
++ if (lru_gen_enabled() && !PageUnevictable(page) &&
++ task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC))
++ SetPageActive(page);
++
+ get_page(page);
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_add);
+@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
+
+ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
+ {
+- if (PageActive(page) && !PageUnevictable(page)) {
++ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
+ int nr_pages = thp_nr_pages(page);
+
+ del_page_from_lru_list(page, lruvec);
+@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p
+ */
+ void deactivate_page(struct page *page)
+ {
+- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
++ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
+ struct pagevec *pvec;
+
+ local_lock(&lru_pvecs.lock);
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
+ err = 0;
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
++ lru_gen_change_state(false, false, true);
+
+ out_dput:
+ filp_close(victim, NULL);
+@@ -3349,6 +3350,7 @@ SYSCALL_DEFINE2(swapon, const char __use
+ mutex_unlock(&swapon_mutex);
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
++ lru_gen_change_state(true, false, true);
+
+ error = 0;
+ goto out;
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -50,6 +50,7 @@
+ #include <linux/printk.h>
+ #include <linux/dax.h>
+ #include <linux/psi.h>
++#include <linux/memory.h>
+
+ #include <asm/tlbflush.h>
+ #include <asm/div64.h>
+@@ -2880,6 +2881,273 @@ static bool can_age_anon_pages(struct pg
+ return can_demote(pgdat->node_id, sc);
+ }
+
++#ifdef CONFIG_LRU_GEN
++
++/******************************************************************************
++ * shorthand helpers
++ ******************************************************************************/
++
++#define for_each_gen_type_zone(gen, type, zone) \
++ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
++ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
++ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
++
++static int page_lru_gen(struct page *page)
++{
++ unsigned long flags = READ_ONCE(page->flags); /* single snapshot of the flags word */
++
++ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; /* field stores gen+1; 0 gives -1 */
++}
++
++static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg)
++{
++ struct pglist_data *pgdat = NODE_DATA(nid);
++
++#ifdef CONFIG_MEMCG
++ if (memcg) {
++ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
++
++ if (lruvec->pgdat != pgdat)
++ lruvec->pgdat = pgdat; /* written only when the back pointer differs */
++
++ return lruvec;
++ }
++#endif
++ return pgdat ? &pgdat->__lruvec : NULL; /* no memcg: the node's own lruvec, if any */
++}
++
++static int get_nr_gens(struct lruvec *lruvec, int type)
++{
++ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1; /* inclusive span */
++}
++
++static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
++{
++ return get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
++ get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) &&
++ get_nr_gens(lruvec, 0) <= MAX_NR_GENS;
++}
++
++/******************************************************************************
++ * state change
++ ******************************************************************************/
++
++#ifdef CONFIG_LRU_GEN_ENABLED
++DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
++#else
++DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
++#endif
++
++static int lru_gen_nr_swapfiles;
++
++static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
++{
++ int gen, type, zone;
++ enum lru_list lru;
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ for_each_evictable_lru(lru) {
++ type = is_file_lru(lru);
++
++ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
++ return false; /* enabled type still has pages on lruvec->lists[] */
++ }
++
++ for_each_gen_type_zone(gen, type, zone) {
++ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
++ return false; /* disabled type still has pages on lrugen->lists[] */
++
++ /* unlikely but not a bug when reset_batch_size() is pending */
++ VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
++ }
++
++ return true;
++}
++
++static bool fill_lists(struct lruvec *lruvec)
++{
++ enum lru_list lru;
++ int remaining = MAX_BATCH_SIZE; /* budget of pages to move in this call */
++
++ for_each_evictable_lru(lru) {
++ int type = is_file_lru(lru);
++ bool active = is_active_lru(lru);
++ struct list_head *head = &lruvec->lists[lru];
++
++ if (!lruvec->evictable.enabled[type])
++ continue; /* type not enabled: leave its lruvec->lists[] alone */
++
++ while (!list_empty(head)) {
++ bool success;
++ struct page *page = lru_to_page(head);
++
++ VM_BUG_ON_PAGE(PageTail(page), page);
++ VM_BUG_ON_PAGE(PageUnevictable(page), page);
++ VM_BUG_ON_PAGE(PageActive(page) != active, page);
++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
++ VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page);
++
++ prefetchw_prev_lru_page(page, head, flags);
++
++ del_page_from_lru_list(page, lruvec);
++ success = lru_gen_add_page(page, lruvec, false);
++ VM_BUG_ON(!success);
++
++ if (!--remaining)
++ return false; /* budget spent; the caller calls again */
++ }
++ }
++
++ return true; /* no enabled type has pages left on lruvec->lists[] */
++}
++
++static bool drain_lists(struct lruvec *lruvec)
++{
++ int gen, type, zone;
++ int remaining = MAX_BATCH_SIZE; /* budget of pages to move in this call */
++
++ for_each_gen_type_zone(gen, type, zone) {
++ struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
++
++ if (lruvec->evictable.enabled[type])
++ continue; /* type still enabled: keep its generation lists */
++
++ while (!list_empty(head)) {
++ bool success;
++ struct page *page = lru_to_page(head);
++
++ VM_BUG_ON_PAGE(PageTail(page), page);
++ VM_BUG_ON_PAGE(PageUnevictable(page), page);
++ VM_BUG_ON_PAGE(PageActive(page), page);
++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
++
++ prefetchw_prev_lru_page(page, head, flags);
++
++ success = lru_gen_del_page(page, lruvec, false);
++ VM_BUG_ON(!success);
++ add_page_to_lru_list(page, lruvec);
++
++ if (!--remaining)
++ return false; /* budget spent; the caller calls again */
++ }
++ }
++
++ return true; /* no disabled type has pages left on the generation lists */
++}
++
++/*
++ * For file page tracking, we enable/disable it according to the main switch.
++ * For anon page tracking, we only enable it when the main switch is on and
++ * there is at least one swapfile; we disable it when there are no swapfiles
++ * regardless of the value of the main switch. Otherwise, we will eventually
++ * reach the max size of the sliding window and have to call inc_min_seq().
++ */
++void lru_gen_change_state(bool enable, bool main, bool swap)
++{
++ static DEFINE_MUTEX(state_mutex);
++
++ struct mem_cgroup *memcg;
++
++ mem_hotplug_begin();
++ cgroup_lock();
++ mutex_lock(&state_mutex);
++
++ if (swap) {
++ if (enable)
++ swap = !lru_gen_nr_swapfiles++; /* true only for the first swapfile */
++ else
++ swap = !--lru_gen_nr_swapfiles; /* true only when the count drops to zero */
++ }
++
++ if (main && enable != lru_gen_enabled()) {
++ if (enable)
++ static_branch_enable(&lru_gen_static_key);
++ else
++ static_branch_disable(&lru_gen_static_key);
++ } else if (!swap || !lru_gen_enabled())
++ goto unlock; /* main switch not flipped and no effective swap change */
++
++ memcg = mem_cgroup_iter(NULL, NULL, NULL);
++ do {
++ int nid;
++
++ for_each_node(nid) {
++ struct lruvec *lruvec = get_lruvec(nid, memcg);
++
++ if (!lruvec)
++ continue;
++
++ spin_lock_irq(&lruvec->lru_lock);
++
++ VM_BUG_ON(!seq_is_valid(lruvec));
++ VM_BUG_ON(!state_is_valid(lruvec));
++
++ lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; /* anon */
++ lruvec->evictable.enabled[1] = lru_gen_enabled(); /* file */
++
++ while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) {
++ spin_unlock_irq(&lruvec->lru_lock);
++ cond_resched(); /* a batch ran out: yield with lru_lock dropped */
++ spin_lock_irq(&lruvec->lru_lock);
++ }
++
++ spin_unlock_irq(&lruvec->lru_lock);
++ }
++
++ cond_resched();
++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
++unlock:
++ mutex_unlock(&state_mutex);
++ cgroup_unlock();
++ mem_hotplug_done();
++}
++
++/******************************************************************************
++ * initialization
++ ******************************************************************************/
++
++void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
++{
++ int i;
++ int gen, type, zone;
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ lrugen->max_seq = MIN_NR_GENS + 1;
++ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
++ lrugen->enabled[1] = lru_gen_enabled();
++
++ for (i = 0; i <= MIN_NR_GENS + 1; i++)
++ lrugen->timestamps[i] = jiffies;
++
++ for_each_gen_type_zone(gen, type, zone)
++ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
++}
++
++#ifdef CONFIG_MEMCG
++void lru_gen_init_memcg(struct mem_cgroup *memcg)
++{
++ int nid;
++
++ for_each_node(nid) {
++ struct lruvec *lruvec = get_lruvec(nid, memcg);
++
++ lru_gen_init_state(memcg, lruvec);
++ }
++}
++#endif
++
++static int __init init_lru_gen(void)
++{
++ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
++ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
++
++ return 0;
++};
++late_initcall(init_lru_gen);
++
++#endif /* CONFIG_LRU_GEN */
++
+ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ {
+ unsigned long nr[NR_LRU_LISTS];
--- /dev/null
+From 534bcc4a0bb5b24600891ce793f0295a142e9dae Mon Sep 17 00:00:00 2001
+Date: Mon, 5 Apr 2021 04:17:41 -0600
+Subject: [PATCH 05/10] mm: multigenerational lru: mm_struct list
+
+To scan PTEs for accessed pages, an mm_struct list is maintained for
+each memcg. When multiple threads traverse the same memcg->mm_list,
+each of them gets a unique mm_struct and therefore they can run
+walk_page_range() concurrently to reach page tables of all processes
+of this memcg.
+
+This infrastructure also provides the following optimizations:
+ 1) it allows walkers to skip processes that have been sleeping since
+ the last walk by tracking the usage of mm_struct between context
+ switches.
+ 2) it allows walkers to add interesting items they find during a
+ walk to a Bloom filter so that they can skip uninteresting items
+ during the next walk by testing whether an item is in this Bloom
+ filter.
+
+Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8
+---
+ fs/exec.c | 2 +
+ include/linux/memcontrol.h | 4 +
+ include/linux/mm_inline.h | 6 +
+ include/linux/mm_types.h | 75 +++++++++
+ include/linux/mmzone.h | 63 +++++++
+ kernel/exit.c | 1 +
+ kernel/fork.c | 9 +
+ kernel/sched/core.c | 1 +
+ mm/memcontrol.c | 25 +++
+ mm/vmscan.c | 331 +++++++++++++++++++++++++++++++++++++
+ 10 files changed, 517 insertions(+)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m
+ active_mm = tsk->active_mm;
+ tsk->active_mm = mm;
+ tsk->mm = mm;
++ lru_gen_add_mm(mm);
+ /*
+ * This prevents preemption while active_mm is being loaded and
+ * it and mm are being updated, which could cause problems for
+@@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *m
+ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+ local_irq_enable();
+ activate_mm(active_mm, mm);
++ lru_gen_activate_mm(mm);
+ if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+ local_irq_enable();
+ tsk->mm->vmacache_seqnum = 0;
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -348,6 +348,10 @@ struct mem_cgroup {
+ struct deferred_split deferred_split_queue;
+ #endif
+
++#ifdef CONFIG_LRU_GEN
++ struct lru_gen_mm_list mm_list;
++#endif
++
+ struct mem_cgroup_per_node *nodeinfo[];
+ };
+
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -100,6 +100,12 @@ static inline int lru_gen_from_seq(unsig
+ return seq % MAX_NR_GENS;
+ }
+
++/* Return a proper index regardless of whether we keep stats for historical generations. */
++static inline int lru_hist_from_seq(unsigned long seq)
++{
++ return seq % NR_HIST_GENS;
++}
++
+ /* The youngest and the second youngest generations are counted as active. */
+ static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+ {
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -3,6 +3,7 @@
+ #define _LINUX_MM_TYPES_H
+
+ #include <linux/mm_types_task.h>
++#include <linux/sched.h>
+
+ #include <linux/auxvec.h>
+ #include <linux/list.h>
+@@ -15,6 +16,8 @@
+ #include <linux/page-flags-layout.h>
+ #include <linux/workqueue.h>
+ #include <linux/seqlock.h>
++#include <linux/nodemask.h>
++#include <linux/mmdebug.h>
+
+ #include <asm/mmu.h>
+
+@@ -580,6 +583,18 @@ struct mm_struct {
+ #ifdef CONFIG_IOMMU_SUPPORT
+ u32 pasid;
+ #endif
++#ifdef CONFIG_LRU_GEN
++ struct {
++ /* the node of a global or per-memcg mm_struct list */
++ struct list_head list;
++#ifdef CONFIG_MEMCG
++ /* points to the memcg of the owner task above */
++ struct mem_cgroup *memcg;
++#endif
++ /* whether this mm_struct has been used since the last walk */
++ nodemask_t nodes;
++ } lrugen;
++#endif /* CONFIG_LRU_GEN */
+ } __randomize_layout;
+
+ /*
+@@ -606,6 +621,66 @@ static inline cpumask_t *mm_cpumask(stru
+ return (struct cpumask *)&mm->cpu_bitmap;
+ }
+
++#ifdef CONFIG_LRU_GEN
++
++struct lru_gen_mm_list {
++ /* a global or per-memcg mm_struct list */
++ struct list_head fifo;
++ /* protects the list above */
++ spinlock_t lock;
++};
++
++void lru_gen_add_mm(struct mm_struct *mm);
++void lru_gen_del_mm(struct mm_struct *mm);
++#ifdef CONFIG_MEMCG
++void lru_gen_migrate_mm(struct mm_struct *mm);
++#endif
++
++static inline void lru_gen_init_mm(struct mm_struct *mm)
++{
++ INIT_LIST_HEAD(&mm->lrugen.list);
++#ifdef CONFIG_MEMCG
++ mm->lrugen.memcg = NULL;
++#endif
++ nodes_clear(mm->lrugen.nodes);
++}
++
++/* Track the usage of each mm_struct so that we can skip inactive ones. */
++static inline void lru_gen_activate_mm(struct mm_struct *mm)
++{
++ /* unlikely but not a bug when racing with lru_gen_migrate_mm() */
++ VM_WARN_ON(list_empty(&mm->lrugen.list));
++
++ if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lrugen.nodes))
++ nodes_setall(mm->lrugen.nodes); /* mark the mm as used on every node */
++}
++
++#else /* !CONFIG_LRU_GEN */
++
++static inline void lru_gen_add_mm(struct mm_struct *mm)
++{
++}
++
++static inline void lru_gen_del_mm(struct mm_struct *mm)
++{
++}
++
++#ifdef CONFIG_MEMCG
++static inline void lru_gen_migrate_mm(struct mm_struct *mm)
++{
++}
++#endif
++
++static inline void lru_gen_init_mm(struct mm_struct *mm)
++{
++}
++
++static inline void lru_gen_activate_mm(struct mm_struct *mm)
++{
++}
++
++#endif /* CONFIG_LRU_GEN */
++
+ struct mmu_gather;
+ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
+ extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -318,6 +318,13 @@ struct lruvec;
+ #define MIN_NR_GENS 2
+ #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+
++/* Whether to keep stats for historical generations. */
++#ifdef CONFIG_LRU_GEN_STATS
++#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
++#else
++#define NR_HIST_GENS 1U
++#endif
++
+ struct lrugen {
+ /* the aging increments the max generation number */
+ unsigned long max_seq;
+@@ -333,13 +340,63 @@ struct lrugen {
+ bool enabled[ANON_AND_FILE];
+ };
+
++enum {
++ MM_LEAF_TOTAL, /* total leaf entries */
++ MM_LEAF_OLD, /* old leaf entries */
++ MM_LEAF_YOUNG, /* young leaf entries */
++ MM_NONLEAF_TOTAL, /* total non-leaf entries */
++ MM_NONLEAF_PREV, /* previously worthy non-leaf entries */
++ MM_NONLEAF_CUR, /* currently worthy non-leaf entries */
++ NR_MM_STATS
++};
++
++/* mnemonic codes for the stats above */
++#define MM_STAT_CODES "toydpc"
++
++/* double buffering bloom filters */
++#define NR_BLOOM_FILTERS 2
++
++struct lru_gen_mm_walk {
++ /* set to max_seq after each round of walk */
++ unsigned long seq;
++ /* the next mm_struct on the list to walk */
++ struct list_head *head;
++ /* the first mm_struct never walked before */
++ struct list_head *tail;
++ /* to wait for the last walker to finish */
++ struct wait_queue_head wait;
++ /* bloom filters flip after each round of walk */
++ unsigned long *filters[NR_BLOOM_FILTERS];
++ /* page table stats for debugging */
++ unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
++ /* the number of concurrent walkers */
++ int nr_walkers;
++};
++
++#define MIN_BATCH_SIZE 64
+ #define MAX_BATCH_SIZE 8192
+
++struct mm_walk_args {
++ struct mem_cgroup *memcg;
++ unsigned long max_seq;
++ unsigned long start_pfn;
++ unsigned long end_pfn;
++ unsigned long next_addr;
++ unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)];
++ int node_id;
++ int swappiness;
++ int batch_size;
++ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
++ int mm_stats[NR_MM_STATS];
++ bool use_filter;
++};
++
+ void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
+ void lru_gen_change_state(bool enable, bool main, bool swap);
+
+ #ifdef CONFIG_MEMCG
+ void lru_gen_init_memcg(struct mem_cgroup *memcg);
++void lru_gen_free_memcg(struct mem_cgroup *memcg);
+ #endif
+
+ #else /* !CONFIG_LRU_GEN */
+@@ -356,6 +413,10 @@ static inline void lru_gen_change_state(
+ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+ {
+ }
++
++static inline void lru_gen_free_memcg(struct mem_cgroup *memcg)
++{
++}
+ #endif
+
+ #endif /* CONFIG_LRU_GEN */
+@@ -380,6 +441,8 @@ struct lruvec {
+ #ifdef CONFIG_LRU_GEN
+ /* unevictable pages are on LRU_UNEVICTABLE */
+ struct lrugen evictable;
++ /* state for mm list and page table walks */
++ struct lru_gen_mm_walk mm_walk;
+ #endif
+ #ifdef CONFIG_MEMCG
+ struct pglist_data *pgdat;
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -422,6 +422,7 @@ assign_new_owner:
+ goto retry;
+ }
+ WRITE_ONCE(mm->owner, c);
++ lru_gen_migrate_mm(mm);
+ task_unlock(c);
+ put_task_struct(c);
+ }
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1080,6 +1080,7 @@ static struct mm_struct *mm_init(struct
+ goto fail_nocontext;
+
+ mm->user_ns = get_user_ns(user_ns);
++ lru_gen_init_mm(mm);
+ return mm;
+
+ fail_nocontext:
+@@ -1122,6 +1123,7 @@ static inline void __mmput(struct mm_str
+ }
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
++ lru_gen_del_mm(mm);
+ mmdrop(mm);
+ }
+
+@@ -2617,6 +2619,13 @@ pid_t kernel_clone(struct kernel_clone_a
+ get_task_struct(p);
+ }
+
++ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
++ /* lock the task to synchronize with memcg migration */
++ task_lock(p);
++ lru_gen_add_mm(p->mm);
++ task_unlock(p);
++ }
++
+ wake_up_new_task(p);
+
+ /* forking complete and child started to run, tell ptracer */
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -4978,6 +4978,7 @@ context_switch(struct rq *rq, struct tas
+ * finish_task_switch()'s mmdrop().
+ */
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
++ lru_gen_activate_mm(next->mm);
+
+ if (!prev->mm) { // from kernel
+ /* will mmdrop() in finish_task_switch(). */
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -5163,6 +5163,7 @@ static void __mem_cgroup_free(struct mem
+
+ static void mem_cgroup_free(struct mem_cgroup *memcg)
+ {
++ lru_gen_free_memcg(memcg);
+ memcg_wb_domain_exit(memcg);
+ __mem_cgroup_free(memcg);
+ }
+@@ -6195,6 +6196,29 @@ static void mem_cgroup_move_task(void)
+ }
+ #endif
+
++#ifdef CONFIG_LRU_GEN
++static void mem_cgroup_attach(struct cgroup_taskset *tset)
++{
++ struct cgroup_subsys_state *css;
++ struct task_struct *task = NULL;
++
++ cgroup_taskset_for_each_leader(task, css, tset)
++ break; /* stop at the first leader; only that task is examined */
++
++ if (!task)
++ return; /* the taskset yielded no leader */
++
++ task_lock(task);
++ if (task->mm && task->mm->owner == task)
++ lru_gen_migrate_mm(task->mm); /* only the owner task moves its mm */
++ task_unlock(task);
++}
++#else /* !CONFIG_LRU_GEN */
++static void mem_cgroup_attach(struct cgroup_taskset *tset)
++{
++}
++#endif /* CONFIG_LRU_GEN */
++
+ static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
+ {
+ if (value == PAGE_COUNTER_MAX)
+@@ -6538,6 +6562,7 @@ struct cgroup_subsys memory_cgrp_subsys
+ .css_reset = mem_cgroup_css_reset,
+ .css_rstat_flush = mem_cgroup_css_rstat_flush,
+ .can_attach = mem_cgroup_can_attach,
++ .attach = mem_cgroup_attach,
+ .cancel_attach = mem_cgroup_cancel_attach,
+ .post_attach = mem_cgroup_move_task,
+ .dfl_cftypes = memory_files,
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2929,6 +2929,306 @@ static bool __maybe_unused seq_is_valid(
+ }
+
+ /******************************************************************************
++ * mm_struct list
++ ******************************************************************************/
++
++static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
++{
++ static struct lru_gen_mm_list mm_list = {
++ .fifo = LIST_HEAD_INIT(mm_list.fifo),
++ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
++ };
++
++#ifdef CONFIG_MEMCG
++ if (memcg)
++ return &memcg->mm_list; /* per-memcg list */
++#endif
++ return &mm_list; /* the single static list, used when there is no memcg */
++}
++
++void lru_gen_add_mm(struct mm_struct *mm)
++{
++ int nid;
++ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
++
++ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm);
++#ifdef CONFIG_MEMCG
++ VM_BUG_ON_MM(mm->lrugen.memcg, mm);
++ mm->lrugen.memcg = memcg; /* lru_gen_del_mm() does the mem_cgroup_put() */
++#endif
++ spin_lock(&mm_list->lock);
++
++ list_add_tail(&mm->lrugen.list, &mm_list->fifo);
++
++ for_each_node(nid) {
++ struct lruvec *lruvec = get_lruvec(nid, memcg);
++
++ if (!lruvec)
++ continue;
++
++ if (lruvec->mm_walk.tail == &mm_list->fifo)
++ lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev; /* i.e. the mm just added */
++ }
++
++ spin_unlock(&mm_list->lock);
++}
++
++void lru_gen_del_mm(struct mm_struct *mm)
++{
++ int nid;
++ struct lru_gen_mm_list *mm_list;
++ struct mem_cgroup *memcg = NULL;
++
++ if (list_empty(&mm->lrugen.list))
++ return; /* not on any mm_struct list */
++
++#ifdef CONFIG_MEMCG
++ memcg = mm->lrugen.memcg;
++#endif
++ mm_list = get_mm_list(memcg);
++
++ spin_lock(&mm_list->lock);
++
++ for_each_node(nid) {
++ struct lruvec *lruvec = get_lruvec(nid, memcg);
++
++ if (!lruvec)
++ continue;
++
++ if (lruvec->mm_walk.tail == &mm->lrugen.list)
++ lruvec->mm_walk.tail = lruvec->mm_walk.tail->next; /* move tail off this mm */
++
++ if (lruvec->mm_walk.head != &mm->lrugen.list)
++ continue;
++
++ lruvec->mm_walk.head = lruvec->mm_walk.head->next; /* move head off this mm */
++ if (lruvec->mm_walk.head == &mm_list->fifo)
++ WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1); /* head hit the list head */
++ }
++
++ list_del_init(&mm->lrugen.list); /* leaves the node empty for the list_empty() checks */
++
++ spin_unlock(&mm_list->lock);
++
++#ifdef CONFIG_MEMCG
++ mem_cgroup_put(mm->lrugen.memcg);
++ mm->lrugen.memcg = NULL;
++#endif
++}
++
++#ifdef CONFIG_MEMCG
++void lru_gen_migrate_mm(struct mm_struct *mm)
++{
++ struct mem_cgroup *memcg;
++
++ lockdep_assert_held(&mm->owner->alloc_lock);
++
++ if (mem_cgroup_disabled())
++ return;
++
++ rcu_read_lock();
++ memcg = mem_cgroup_from_task(mm->owner);
++ rcu_read_unlock();
++ if (memcg == mm->lrugen.memcg)
++ return; /* the owner's memcg is the one already recorded */
++
++ VM_BUG_ON_MM(!mm->lrugen.memcg, mm);
++ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm);
++
++ lru_gen_del_mm(mm);
++ lru_gen_add_mm(mm); /* looks the memcg up again via get_mem_cgroup_from_mm() */
++}
++#endif /* CONFIG_MEMCG */
++
++#define BLOOM_FILTER_SHIFT 15
++
++static inline int filter_gen_from_seq(unsigned long seq)
++{
++ return seq % NR_BLOOM_FILTERS;
++}
++
++static void get_item_key(void *item, int *key)
++{
++ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); /* asks for 2 * BLOOM_FILTER_SHIFT bits */
++
++ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
++
++ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); /* low BLOOM_FILTER_SHIFT bits */
++ key[1] = hash >> BLOOM_FILTER_SHIFT; /* the bits above them */
++}
++
++static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq)
++{
++ unsigned long *filter;
++ int gen = filter_gen_from_seq(seq);
++
++ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
++
++ filter = lruvec->mm_walk.filters[gen];
++ if (filter) {
++ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); /* reuse the existing bitmap */
++ return;
++ }
++
++ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC); /* result is not checked here */
++ WRITE_ONCE(lruvec->mm_walk.filters[gen], filter); /* set/test_bloom_filter() handle NULL */
++}
++
++static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
++{
++ int key[2];
++ unsigned long *filter;
++ int gen = filter_gen_from_seq(seq);
++
++ filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
++ if (!filter)
++ return; /* no bitmap in this slot: the item is not recorded */
++
++ get_item_key(item, key);
++
++ if (!test_bit(key[0], filter))
++ set_bit(key[0], filter); /* written only if not already set */
++ if (!test_bit(key[1], filter))
++ set_bit(key[1], filter); /* written only if not already set */
++}
++
++static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
++{
++ int key[2];
++ unsigned long *filter;
++ int gen = filter_gen_from_seq(seq);
++
++ filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
++ if (!filter)
++ return false; /* no bitmap in this slot: nothing tests positive */
++
++ get_item_key(item, key);
++
++ return test_bit(key[0], filter) && test_bit(key[1], filter); /* both key bits must be set */
++}
++
++static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args)
++{
++ int i;
++ int hist = lru_hist_from_seq(args->max_seq);
++
++ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
++
++ for (i = 0; i < NR_MM_STATS; i++) {
++ WRITE_ONCE(lruvec->mm_walk.stats[hist][i],
++ lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]);
++ args->mm_stats[i] = 0; /* already added to the lruvec counter above */
++ }
++
++ if (!last || NR_HIST_GENS == 1)
++ return; /* not the last call of the round, or only one stats slot exists */
++
++ hist = lru_hist_from_seq(args->max_seq + 1); /* slot for the next sequence number */
++ for (i = 0; i < NR_MM_STATS; i++)
++ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0);
++}
++
++static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args)
++{
++ int type;
++ unsigned long size = 0;
++
++ if (cpumask_empty(mm_cpumask(mm)) && !node_isset(args->node_id, mm->lrugen.nodes))
++ return true; /* empty cpumask and this node's bit is clear */
++
++ if (mm_is_oom_victim(mm))
++ return true;
++
++ for (type = !args->swappiness; type < ANON_AND_FILE; type++) { /* swappiness 0: file only */
++ size += type ? get_mm_counter(mm, MM_FILEPAGES) :
++ get_mm_counter(mm, MM_ANONPAGES) +
++ get_mm_counter(mm, MM_SHMEMPAGES);
++ }
++
++ if (size < MIN_BATCH_SIZE)
++ return true; /* fewer than MIN_BATCH_SIZE pages of the counted types */
++
++ if (!mmget_not_zero(mm))
++ return true;
++
++ node_clear(args->node_id, mm->lrugen.nodes); /* clear this node's bit before the walk */
++
++ return false;
++}
++
++/* To support multiple walkers that concurrently walk an mm_struct list. */
++static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args,
++ struct mm_struct **iter)
++{
++ bool first = false;
++ bool last = true;
++ struct mm_struct *mm = NULL;
++ struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk;
++ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg);
++
++ if (*iter)
++ mmput_async(*iter); /* pairs with mmget_not_zero() in should_skip_mm() */
++ else if (args->max_seq <= READ_ONCE(mm_walk->seq))
++ return false; /* lockless check: this round is already complete */
++
++ spin_lock(&mm_list->lock);
++
++ VM_BUG_ON(args->max_seq > mm_walk->seq + 1);
++ VM_BUG_ON(*iter && args->max_seq < mm_walk->seq);
++ VM_BUG_ON(*iter && !mm_walk->nr_walkers);
++
++ if (args->max_seq <= mm_walk->seq) {
++ if (!*iter)
++ last = false; /* caller holds no mm and the round is over */
++ goto done;
++ }
++
++ if (mm_walk->head == &mm_list->fifo) {
++ VM_BUG_ON(mm_walk->nr_walkers);
++ mm_walk->head = mm_walk->head->next;
++ first = true; /* head was on the list head: this call starts the round */
++ }
++
++ while (!mm && mm_walk->head != &mm_list->fifo) {
++ mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list);
++
++ mm_walk->head = mm_walk->head->next;
++
++ if (mm_walk->tail == &mm->lrugen.list) {
++ mm_walk->tail = mm_walk->tail->next;
++ args->use_filter = false; /* reached the first mm never walked before */
++ }
++
++ if (should_skip_mm(mm, args))
++ mm = NULL; /* skipped: keep scanning */
++ }
++
++ if (mm_walk->head == &mm_list->fifo)
++ WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1); /* head is back on the list head */
++done:
++ if (*iter && !mm)
++ mm_walk->nr_walkers--; /* caller had an mm and gets none */
++ if (!*iter && mm)
++ mm_walk->nr_walkers++; /* caller had none and gets one */
++
++ if (mm_walk->nr_walkers)
++ last = false; /* walkers are still counted */
++
++ if (mm && first)
++ clear_bloom_filter(lruvec, args->max_seq + 1);
++
++ if (*iter || last)
++ reset_mm_stats(lruvec, last, args);
++
++ spin_unlock(&mm_list->lock);
++
++ *iter = mm;
++
++ return last;
++}
++
++/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+@@ -3112,6 +3412,7 @@ void lru_gen_init_state(struct mem_cgrou
+ int i;
+ int gen, type, zone;
+ struct lrugen *lrugen = &lruvec->evictable;
++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+ lrugen->max_seq = MIN_NR_GENS + 1;
+ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
+@@ -3122,6 +3423,17 @@ void lru_gen_init_state(struct mem_cgrou
+
+ for_each_gen_type_zone(gen, type, zone)
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
++
++ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
++ spin_lock(&mm_list->lock);
++
++ lruvec->mm_walk.seq = MIN_NR_GENS;
++ lruvec->mm_walk.head = &mm_list->fifo;
++ lruvec->mm_walk.tail = &mm_list->fifo;
++ init_waitqueue_head(&lruvec->mm_walk.wait);
++
++ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
++ spin_unlock(&mm_list->lock);
+ }
+
+ #ifdef CONFIG_MEMCG
+@@ -3129,18 +3441,37 @@ void lru_gen_init_memcg(struct mem_cgrou
+ {
+ int nid;
+
++ INIT_LIST_HEAD(&memcg->mm_list.fifo);
++ spin_lock_init(&memcg->mm_list.lock);
++
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+ lru_gen_init_state(memcg, lruvec);
+ }
+ }
++
++void lru_gen_free_memcg(struct mem_cgroup *memcg)
++{
++ int nid;
++
++ for_each_node(nid) {
++ int i;
++ struct lruvec *lruvec = get_lruvec(nid, memcg);
++
++ for (i = 0; i < NR_BLOOM_FILTERS; i++) {
++ bitmap_free(lruvec->mm_walk.filters[i]);
++ lruvec->mm_walk.filters[i] = NULL;
++ }
++ }
++}
+ #endif
+
+ static int __init init_lru_gen(void)
+ {
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
++ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
+
+ return 0;
+ };
--- /dev/null
+From 8217cd2238c40cf77208aa27a7cc09879e685890 Mon Sep 17 00:00:00 2001
+Date: Mon, 5 Apr 2021 04:35:07 -0600
+Subject: [PATCH 06/10] mm: multigenerational lru: aging
+
+The aging produces young generations. Given an lruvec, the aging
+traverses lruvec_memcg()->mm_list and calls walk_page_range() to scan
+PTEs for accessed pages. Upon finding one, the aging updates its
+generation number to max_seq (modulo MAX_NR_GENS). After each round of
+traversal, the aging increments max_seq. The aging is due when
+min_seq[] reaches max_seq-1.
+
+The aging uses the following optimizations when walking page tables:
+ 1) It skips non-leaf PMD entries that have the accessed bit cleared
+ when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
+ 2) It does not zigzag between a PGD table and the same PMD or PTE
+ table spanning multiple VMAs. In other words, it finishes all the
+ VMAs within the range of the same PMD or PTE table before it returns
+ to this PGD table. This optimizes workloads that have large numbers
+ of tiny VMAs, especially when CONFIG_PGTABLE_LEVELS=5.
+
+Change-Id: I3ae8abc3100d023cecb3a699d86020ae6fc10a45
+---
+ include/linux/memcontrol.h | 3 +
+ include/linux/mmzone.h | 9 +
+ include/linux/oom.h | 16 +
+ include/linux/swap.h | 3 +
+ mm/memcontrol.c | 5 +
+ mm/oom_kill.c | 4 +-
+ mm/rmap.c | 8 +
+ mm/vmscan.c | 948 +++++++++++++++++++++++++++++++++++++
+ 8 files changed, 994 insertions(+), 2 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -1367,10 +1367,13 @@ mem_cgroup_print_oom_meminfo(struct mem_
+
+ static inline void lock_page_memcg(struct page *page)
+ {
++ /* to match page_memcg_rcu() */
++ rcu_read_lock();
+ }
+
+ static inline void unlock_page_memcg(struct page *page)
+ {
++ rcu_read_unlock();
+ }
+
+ static inline void mem_cgroup_handle_over_high(void)
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -295,6 +295,7 @@ enum lruvec_flags {
+ };
+
+ struct lruvec;
++struct page_vma_mapped_walk;
+
+ #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+ #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+@@ -393,6 +394,7 @@ struct mm_walk_args {
+
+ void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
+ void lru_gen_change_state(bool enable, bool main, bool swap);
++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+
+ #ifdef CONFIG_MEMCG
+ void lru_gen_init_memcg(struct mem_cgroup *memcg);
+@@ -409,6 +411,10 @@ static inline void lru_gen_change_state(
+ {
+ }
+
++static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
++{
++}
++
+ #ifdef CONFIG_MEMCG
+ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+ {
+@@ -1028,6 +1034,9 @@ typedef struct pglist_data {
+
+ unsigned long flags;
+
++#ifdef CONFIG_LRU_GEN
++ struct mm_walk_args mm_walk_args;
++#endif
+ ZONE_PADDING(_pad2_)
+
+ /* Per-node vmstats */
+--- a/include/linux/oom.h
++++ b/include/linux/oom.h
+@@ -57,6 +57,22 @@ struct oom_control {
+ extern struct mutex oom_lock;
+ extern struct mutex oom_adj_mutex;
+
++#ifdef CONFIG_MMU
++extern struct task_struct *oom_reaper_list;
++extern struct wait_queue_head oom_reaper_wait;
++
++static inline bool oom_reaping_in_progress(void)
++{
++ /* racy check to see if oom reaping could be in progress */
++ return READ_ONCE(oom_reaper_list) || !waitqueue_active(&oom_reaper_wait);
++}
++#else
++static inline bool oom_reaping_in_progress(void)
++{
++ return false;
++}
++#endif
++
+ static inline void set_current_oom_origin(void)
+ {
+ current->signal->oom_flag_origin = true;
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -137,6 +137,9 @@ union swap_header {
+ */
+ struct reclaim_state {
+ unsigned long reclaimed_slab;
++#ifdef CONFIG_LRU_GEN
++ struct mm_walk_args *mm_walk_args;
++#endif
+ };
+
+ #ifdef __KERNEL__
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -1304,12 +1304,17 @@ void mem_cgroup_update_lru_size(struct l
+ *lru_size += nr_pages;
+
+ size = *lru_size;
++#ifdef CONFIG_LRU_GEN
++ /* unlikely but not a bug when reset_batch_size() is pending */
++ VM_WARN_ON(size + MAX_BATCH_SIZE < 0);
++#else
+ if (WARN_ONCE(size < 0,
+ "%s(%p, %d, %d): lru_size %ld\n",
+ __func__, lruvec, lru, nr_pages, size)) {
+ VM_BUG_ON(1);
+ *lru_size = 0;
+ }
++#endif
+
+ if (nr_pages > 0)
+ *lru_size += nr_pages;
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -508,8 +508,8 @@ bool process_shares_mm(struct task_struc
+ * victim (if that is possible) to help the OOM killer to move on.
+ */
+ static struct task_struct *oom_reaper_th;
+-static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+-static struct task_struct *oom_reaper_list;
++DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
++struct task_struct *oom_reaper_list;
+ static DEFINE_SPINLOCK(oom_reaper_lock);
+
+ bool __oom_reap_task_mm(struct mm_struct *mm)
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -73,6 +73,7 @@
+ #include <linux/page_idle.h>
+ #include <linux/memremap.h>
+ #include <linux/userfaultfd_k.h>
++#include <linux/mm_inline.h>
+
+ #include <asm/tlbflush.h>
+
+@@ -793,6 +794,13 @@ static bool page_referenced_one(struct p
+ }
+
+ if (pvmw.pte) {
++ /* the multigenerational lru exploits the spatial locality */
++ if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
++ !(vma->vm_flags & VM_SEQ_READ)) {
++ lru_gen_look_around(&pvmw);
++ referenced++;
++ }
++
+ if (ptep_clear_flush_young_notify(vma, address,
+ pvmw.pte)) {
+ /*
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -51,6 +51,8 @@
+ #include <linux/dax.h>
+ #include <linux/psi.h>
+ #include <linux/memory.h>
++#include <linux/pagewalk.h>
++#include <linux/shmem_fs.h>
+
+ #include <asm/tlbflush.h>
+ #include <asm/div64.h>
+@@ -2887,6 +2889,15 @@ static bool can_age_anon_pages(struct pg
+ * shorthand helpers
+ ******************************************************************************/
+
++#define DEFINE_MAX_SEQ(lruvec) \
++ unsigned long max_seq = READ_ONCE((lruvec)->evictable.max_seq)
++
++#define DEFINE_MIN_SEQ(lruvec) \
++ unsigned long min_seq[ANON_AND_FILE] = { \
++ READ_ONCE((lruvec)->evictable.min_seq[0]), \
++ READ_ONCE((lruvec)->evictable.min_seq[1]), \
++ }
++
+ #define for_each_gen_type_zone(gen, type, zone) \
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+@@ -2899,6 +2910,12 @@ static int page_lru_gen(struct page *pag
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+ }
+
++static int get_swappiness(struct mem_cgroup *memcg)
++{
++ return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ?
++ mem_cgroup_swappiness(memcg) : 0;
++}
++
+ static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg)
+ {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+@@ -3229,6 +3246,926 @@ done:
+ }
+
+ /******************************************************************************
++ * the aging
++ ******************************************************************************/
++
++static int page_update_gen(struct page *page, int gen)
++{
++ unsigned long old_flags, new_flags;
++
++ VM_BUG_ON(gen >= MAX_NR_GENS);
++
++ do {
++ new_flags = old_flags = READ_ONCE(page->flags);
++
++ if (!(new_flags & LRU_GEN_MASK)) {
++ new_flags |= BIT(PG_referenced);
++ continue;
++ }
++
++ new_flags &= ~LRU_GEN_MASK;
++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
++ } while (new_flags != old_flags &&
++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
++
++ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
++}
++
++static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool reclaiming)
++{
++ int old_gen, new_gen;
++ unsigned long old_flags, new_flags;
++ int type = page_is_file_lru(page);
++ int zone = page_zonenum(page);
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
++
++ do {
++ new_flags = old_flags = READ_ONCE(page->flags);
++ VM_BUG_ON_PAGE(!(new_flags & LRU_GEN_MASK), page);
++
++ new_gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
++ /* page_update_gen() has updated this page? */
++ if (new_gen >= 0 && new_gen != old_gen) {
++ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]);
++ return;
++ }
++
++ new_gen = (old_gen + 1) % MAX_NR_GENS;
++
++ new_flags &= ~LRU_GEN_MASK;
++ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
++ /* for end_page_writeback() */
++ if (reclaiming)
++ new_flags |= BIT(PG_reclaim);
++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
++
++ lru_gen_update_size(page, lruvec, old_gen, new_gen);
++ if (reclaiming)
++ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]);
++ else
++ list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]);
++}
++
++static void update_batch_size(struct page *page, int old_gen, int new_gen,
++ struct mm_walk_args *args)
++{
++ int type = page_is_file_lru(page);
++ int zone = page_zonenum(page);
++ int delta = thp_nr_pages(page);
++
++ VM_BUG_ON(old_gen >= MAX_NR_GENS);
++ VM_BUG_ON(new_gen >= MAX_NR_GENS);
++
++ args->batch_size++;
++
++ args->nr_pages[old_gen][type][zone] -= delta;
++ args->nr_pages[new_gen][type][zone] += delta;
++}
++
++static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args)
++{
++ int gen, type, zone;
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ args->batch_size = 0;
++
++ for_each_gen_type_zone(gen, type, zone) {
++ enum lru_list lru = type * LRU_FILE;
++ int delta = args->nr_pages[gen][type][zone];
++
++ if (!delta)
++ continue;
++
++ args->nr_pages[gen][type][zone] = 0;
++ WRITE_ONCE(lrugen->sizes[gen][type][zone],
++ lrugen->sizes[gen][type][zone] + delta);
++
++ if (lru_gen_is_active(lruvec, gen))
++ lru += LRU_ACTIVE;
++ update_lru_size(lruvec, lru, zone, delta);
++ }
++}
++
++static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
++{
++ struct address_space *mapping;
++ struct vm_area_struct *vma = walk->vma;
++ struct mm_walk_args *args = walk->private;
++
++ if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) ||
++ (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ)))
++ return true;
++
++ if (vma_is_anonymous(vma))
++ return !args->swappiness;
++
++ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
++ return true;
++
++ mapping = vma->vm_file->f_mapping;
++ if (!mapping->a_ops->writepage)
++ return true;
++
++ return (shmem_mapping(mapping) && !args->swappiness) || mapping_unevictable(mapping);
++}
++
++/*
++ * Some userspace memory allocators create many single-page VMAs. So instead of
++ * returning to the PGD table for each such VMA, we finish at least an entire
++ * PMD table and therefore avoid many zigzags.
++ */
++static bool get_next_vma(struct mm_walk *walk, unsigned long mask, unsigned long size,
++ unsigned long *start, unsigned long *end)
++{
++ unsigned long next = round_up(*end, size);
++
++ VM_BUG_ON(mask & size);
++ VM_BUG_ON(*start >= *end);
++ VM_BUG_ON((next & mask) != (*start & mask));
++
++ while (walk->vma) {
++ if (next >= walk->vma->vm_end) {
++ walk->vma = walk->vma->vm_next;
++ continue;
++ }
++
++ if ((next & mask) != (walk->vma->vm_start & mask))
++ return false;
++
++ if (should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) {
++ walk->vma = walk->vma->vm_next;
++ continue;
++ }
++
++ *start = max(next, walk->vma->vm_start);
++ next = (next | ~mask) + 1;
++ /* rounded-up boundaries can wrap to 0 */
++ *end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end;
++
++ return true;
++ }
++
++ return false;
++}
++
++static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
++ struct mm_walk *walk)
++{
++ int i;
++ pte_t *pte;
++ spinlock_t *ptl;
++ unsigned long addr;
++ int worth = 0;
++ struct mm_walk_args *args = walk->private;
++ int old_gen, new_gen = lru_gen_from_seq(args->max_seq);
++
++ VM_BUG_ON(pmd_leaf(*pmd));
++
++ pte = pte_offset_map_lock(walk->mm, pmd, start & PMD_MASK, &ptl);
++ arch_enter_lazy_mmu_mode();
++restart:
++ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
++ struct page *page;
++ unsigned long pfn = pte_pfn(pte[i]);
++
++ args->mm_stats[MM_LEAF_TOTAL]++;
++
++ if (!pte_present(pte[i]) || is_zero_pfn(pfn))
++ continue;
++
++ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i])))
++ continue;
++
++ if (!pte_young(pte[i])) {
++ args->mm_stats[MM_LEAF_OLD]++;
++ continue;
++ }
++
++ VM_BUG_ON(!pfn_valid(pfn));
++ if (pfn < args->start_pfn || pfn >= args->end_pfn)
++ continue;
++
++ page = compound_head(pfn_to_page(pfn));
++ if (page_to_nid(page) != args->node_id)
++ continue;
++
++ if (page_memcg_rcu(page) != args->memcg)
++ continue;
++
++ VM_BUG_ON(addr < walk->vma->vm_start || addr >= walk->vma->vm_end);
++ if (!ptep_test_and_clear_young(walk->vma, addr, pte + i))
++ continue;
++
++ args->mm_stats[MM_LEAF_YOUNG]++;
++
++ if (pte_dirty(pte[i]) && !PageDirty(page) &&
++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page)))
++ set_page_dirty(page);
++
++ old_gen = page_update_gen(page, new_gen);
++ if (old_gen >= 0 && old_gen != new_gen)
++ update_batch_size(page, old_gen, new_gen, args);
++
++ worth++;
++ }
++
++ if (i < PTRS_PER_PTE && get_next_vma(walk, PMD_MASK, PAGE_SIZE, &start, &end))
++ goto restart;
++
++ arch_leave_lazy_mmu_mode();
++ pte_unmap_unlock(pte, ptl);
++
++ return worth >= MIN_BATCH_SIZE / 2;
++}
++
++/*
++ * We scan PMD entries in two passes. The first pass descends to PTE tables and
++ * doesn't take the PMD lock. The second pass clears the accessed bit on PMD
++ * entries and needs to take the PMD lock.
++ */
++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
++static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset,
++ struct vm_area_struct *vma, struct mm_walk *walk)
++{
++ int i;
++ pmd_t *pmd;
++ spinlock_t *ptl;
++ struct mm_walk_args *args = walk->private;
++ int old_gen, new_gen = lru_gen_from_seq(args->max_seq);
++
++ VM_BUG_ON(pud_leaf(*pud));
++
++ start = (start & PUD_MASK) + offset * PMD_SIZE;
++ pmd = pmd_offset(pud, start);
++ ptl = pmd_lock(walk->mm, pmd);
++ arch_enter_lazy_mmu_mode();
++
++ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) {
++ struct page *page;
++ unsigned long pfn = pmd_pfn(pmd[i]);
++ unsigned long addr = start + i * PMD_SIZE;
++
++ if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i]))
++ continue;
++
++ if (WARN_ON_ONCE(pmd_devmap(pmd[i])))
++ continue;
++
++ if (!pmd_trans_huge(pmd[i])) {
++ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
++ pmdp_test_and_clear_young(vma, addr, pmd + i);
++ continue;
++ }
++
++ VM_BUG_ON(!pfn_valid(pfn));
++ if (pfn < args->start_pfn || pfn >= args->end_pfn)
++ continue;
++
++ page = pfn_to_page(pfn);
++ VM_BUG_ON_PAGE(PageTail(page), page);
++ if (page_to_nid(page) != args->node_id)
++ continue;
++
++ if (page_memcg_rcu(page) != args->memcg)
++ continue;
++
++ VM_BUG_ON(addr < vma->vm_start || addr >= vma->vm_end);
++ if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
++ continue;
++
++ args->mm_stats[MM_LEAF_YOUNG]++;
++
++ if (pmd_dirty(pmd[i]) && !PageDirty(page) &&
++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page)))
++ set_page_dirty(page);
++
++ old_gen = page_update_gen(page, new_gen);
++ if (old_gen >= 0 && old_gen != new_gen)
++ update_batch_size(page, old_gen, new_gen, args);
++ }
++
++ arch_leave_lazy_mmu_mode();
++ spin_unlock(ptl);
++
++ bitmap_zero(args->bitmap, MIN_BATCH_SIZE);
++}
++#else
++static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset,
++ struct vm_area_struct *vma, struct mm_walk *walk)
++{
++}
++#endif
++
++static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
++ struct mm_walk *walk)
++{
++ int i;
++ pmd_t *pmd;
++ unsigned long next;
++ unsigned long addr;
++ struct vm_area_struct *vma;
++ int offset = -1;
++ bool reset = false;
++ struct mm_walk_args *args = walk->private;
++ struct lruvec *lruvec = get_lruvec(args->node_id, args->memcg);
++
++ VM_BUG_ON(pud_leaf(*pud));
++
++ pmd = pmd_offset(pud, start & PUD_MASK);
++restart:
++ vma = walk->vma;
++ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
++ pmd_t val = pmd_read_atomic(pmd + i);
++
++ /* for pmd_read_atomic() */
++ barrier();
++
++ next = pmd_addr_end(addr, end);
++
++ if (!pmd_present(val)) {
++ args->mm_stats[MM_LEAF_TOTAL]++;
++ continue;
++ }
++
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++ if (pmd_trans_huge(val)) {
++ unsigned long pfn = pmd_pfn(val);
++
++ args->mm_stats[MM_LEAF_TOTAL]++;
++
++ if (is_huge_zero_pmd(val))
++ continue;
++
++ if (!pmd_young(val)) {
++ args->mm_stats[MM_LEAF_OLD]++;
++ continue;
++ }
++
++ if (pfn < args->start_pfn || pfn >= args->end_pfn)
++ continue;
++
++ if (offset < 0)
++ offset = i;
++ else if (i - offset >= MIN_BATCH_SIZE) {
++ walk_pmd_range_locked(pud, start, offset, vma, walk);
++ offset = i;
++ }
++ __set_bit(i - offset, args->bitmap);
++ reset = true;
++ continue;
++ }
++#endif
++ args->mm_stats[MM_NONLEAF_TOTAL]++;
++
++#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
++ if (!pmd_young(val))
++ continue;
++
++ if (offset < 0)
++ offset = i;
++ else if (i - offset >= MIN_BATCH_SIZE) {
++ walk_pmd_range_locked(pud, start, offset, vma, walk);
++ offset = i;
++ reset = false;
++ }
++ __set_bit(i - offset, args->bitmap);
++#endif
++ if (args->use_filter && !test_bloom_filter(lruvec, args->max_seq, pmd + i))
++ continue;
++
++ args->mm_stats[MM_NONLEAF_PREV]++;
++
++ if (!walk_pte_range(&val, addr, next, walk))
++ continue;
++
++ args->mm_stats[MM_NONLEAF_CUR]++;
++
++ set_bloom_filter(lruvec, args->max_seq + 1, pmd + i);
++ }
++
++ if (reset) {
++ walk_pmd_range_locked(pud, start, offset, vma, walk);
++ offset = -1;
++ reset = false;
++ }
++
++ if (i < PTRS_PER_PMD && get_next_vma(walk, PUD_MASK, PMD_SIZE, &start, &end))
++ goto restart;
++
++ if (offset >= 0)
++ walk_pmd_range_locked(pud, start, offset, vma, walk);
++}
++
++static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
++ struct mm_walk *walk)
++{
++ int i;
++ pud_t *pud;
++ unsigned long addr;
++ unsigned long next;
++ struct mm_walk_args *args = walk->private;
++
++ VM_BUG_ON(p4d_leaf(*p4d));
++
++ pud = pud_offset(p4d, start & P4D_MASK);
++restart:
++ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
++ pud_t val = READ_ONCE(pud[i]);
++
++ next = pud_addr_end(addr, end);
++
++ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
++ continue;
++
++ walk_pmd_range(&val, addr, next, walk);
++
++ if (args->batch_size >= MAX_BATCH_SIZE) {
++ end = (addr | ~PUD_MASK) + 1;
++ goto done;
++ }
++ }
++
++ if (i < PTRS_PER_PUD && get_next_vma(walk, P4D_MASK, PUD_SIZE, &start, &end))
++ goto restart;
++
++ end = round_up(end, P4D_SIZE);
++done:
++ /* rounded-up boundaries can wrap to 0 */
++ args->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0;
++
++ return -EAGAIN;
++}
++
++static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct mm_walk_args *args)
++{
++ static const struct mm_walk_ops mm_walk_ops = {
++ .test_walk = should_skip_vma,
++ .p4d_entry = walk_pud_range,
++ };
++
++ int err;
++
++ args->next_addr = FIRST_USER_ADDRESS;
++
++ do {
++ unsigned long start = args->next_addr;
++ unsigned long end = mm->highest_vm_end;
++
++ err = -EBUSY;
++
++ rcu_read_lock();
++#ifdef CONFIG_MEMCG
++ if (args->memcg && atomic_read(&args->memcg->moving_account))
++ goto contended;
++#endif
++ if (!mmap_read_trylock(mm))
++ goto contended;
++
++ err = walk_page_range(mm, start, end, &mm_walk_ops, args);
++
++ mmap_read_unlock(mm);
++
++ if (args->batch_size) {
++ spin_lock_irq(&lruvec->lru_lock);
++ reset_batch_size(lruvec, args);
++ spin_unlock_irq(&lruvec->lru_lock);
++ }
++contended:
++ rcu_read_unlock();
++
++ cond_resched();
++ } while (err == -EAGAIN && args->next_addr && !mm_is_oom_victim(mm));
++}
++
++static struct mm_walk_args *alloc_mm_walk_args(void)
++{
++ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args)
++ return kvzalloc(sizeof(struct mm_walk_args), GFP_KERNEL);
++
++ return current->reclaim_state->mm_walk_args;
++}
++
++static void free_mm_walk_args(struct mm_walk_args *args)
++{
++ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args)
++ kvfree(args);
++}
++
++static bool inc_min_seq(struct lruvec *lruvec, int type)
++{
++ int gen, zone;
++ int remaining = MAX_BATCH_SIZE;
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ VM_BUG_ON(!seq_is_valid(lruvec));
++
++ if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
++ return true;
++
++ gen = lru_gen_from_seq(lrugen->min_seq[type]);
++
++ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
++ struct list_head *head = &lrugen->lists[gen][type][zone];
++
++ while (!list_empty(head)) {
++ struct page *page = lru_to_page(head);
++
++ VM_BUG_ON_PAGE(PageTail(page), page);
++ VM_BUG_ON_PAGE(PageUnevictable(page), page);
++ VM_BUG_ON_PAGE(PageActive(page), page);
++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
++
++ prefetchw_prev_lru_page(page, head, flags);
++
++ page_inc_gen(page, lruvec, false);
++
++ if (!--remaining)
++ return false;
++ }
++ }
++
++ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
++
++ return true;
++}
++
++static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
++{
++ int gen, type, zone;
++ bool success = false;
++ struct lrugen *lrugen = &lruvec->evictable;
++ DEFINE_MIN_SEQ(lruvec);
++
++ VM_BUG_ON(!seq_is_valid(lruvec));
++
++ for (type = 0; type < ANON_AND_FILE; type++) {
++ while (lrugen->max_seq - min_seq[type] >= MIN_NR_GENS) {
++ gen = lru_gen_from_seq(min_seq[type]);
++
++ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
++ if (!list_empty(&lrugen->lists[gen][type][zone]))
++ goto next;
++ }
++
++ min_seq[type]++;
++ }
++next:
++ ;
++ }
++
++ min_seq[0] = min(min_seq[0], min_seq[1]);
++ if (swappiness)
++ min_seq[1] = max(min_seq[0], lrugen->min_seq[1]);
++
++ for (type = 0; type < ANON_AND_FILE; type++) {
++ if (min_seq[type] == lrugen->min_seq[type])
++ continue;
++
++ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
++ success = true;
++ }
++
++ return success;
++}
++
++static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq)
++{
++ int gen, type, zone;
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ spin_lock_irq(&lruvec->lru_lock);
++
++ VM_BUG_ON(!seq_is_valid(lruvec));
++
++ if (max_seq != lrugen->max_seq)
++ goto unlock;
++
++ if (!try_to_inc_min_seq(lruvec, true)) {
++ for (type = ANON_AND_FILE - 1; type >= 0; type--) {
++ while (!inc_min_seq(lruvec, type)) {
++ spin_unlock_irq(&lruvec->lru_lock);
++ cond_resched();
++ spin_lock_irq(&lruvec->lru_lock);
++ }
++ }
++ }
++
++ gen = lru_gen_from_seq(lrugen->max_seq - 1);
++ for (type = 0; type < ANON_AND_FILE; type++) {
++ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
++ enum lru_list lru = type * LRU_FILE;
++ long delta = lrugen->sizes[gen][type][zone];
++
++ if (!delta)
++ continue;
++
++ WARN_ON_ONCE(delta != (int)delta);
++
++ update_lru_size(lruvec, lru, zone, delta);
++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
++ }
++ }
++
++ gen = lru_gen_from_seq(lrugen->max_seq + 1);
++ for (type = 0; type < ANON_AND_FILE; type++) {
++ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
++ enum lru_list lru = type * LRU_FILE;
++ long delta = lrugen->sizes[gen][type][zone];
++
++ if (!delta)
++ continue;
++
++ WARN_ON_ONCE(delta != (int)delta);
++
++ update_lru_size(lruvec, lru, zone, -delta);
++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
++ }
++ }
++
++ WRITE_ONCE(lrugen->timestamps[gen], jiffies);
++ /* make sure all preceding modifications appear first */
++ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
++unlock:
++ spin_unlock_irq(&lruvec->lru_lock);
++}
++
++/* Main function used by the foreground, the background and the user-triggered aging. */
++static bool try_to_inc_max_seq(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
++ unsigned long max_seq, bool use_filter)
++{
++ bool last;
++ struct mm_walk_args *args;
++ struct mm_struct *mm = NULL;
++ struct lrugen *lrugen = &lruvec->evictable;
++ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
++ int nid = pgdat->node_id;
++
++ VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq));
++
++ /*
++ * If we are not from run_aging() and clearing the accessed bit may
++ * trigger page faults, then don't proceed to clearing all accessed
++ * PTEs. Instead, fall back to lru_gen_look_around(), which only clears a
++ * handful of accessed PTEs. This is less efficient but causes fewer
++ * page faults on CPUs that don't have the capability.
++ */
++ if ((current->flags & PF_MEMALLOC) && !arch_has_hw_pte_young(false)) {
++ inc_max_seq(lruvec, max_seq);
++ return true;
++ }
++
++ args = alloc_mm_walk_args();
++ if (!args)
++ return false;
++
++ args->memcg = memcg;
++ args->max_seq = max_seq;
++ args->start_pfn = pgdat->node_start_pfn;
++ args->end_pfn = pgdat_end_pfn(pgdat);
++ args->node_id = nid;
++ args->swappiness = swappiness;
++ args->use_filter = use_filter;
++
++ do {
++ last = get_next_mm(lruvec, args, &mm);
++ if (mm)
++ walk_mm(lruvec, mm, args);
++
++ cond_resched();
++ } while (mm);
++
++ free_mm_walk_args(args);
++
++ if (!last) {
++ /* don't wait unless we may have trouble reclaiming */
++ if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2)
++ wait_event_killable(lruvec->mm_walk.wait,
++ max_seq < READ_ONCE(lrugen->max_seq));
++
++ return max_seq < READ_ONCE(lrugen->max_seq);
++ }
++
++ VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq));
++
++ inc_max_seq(lruvec, max_seq);
++ /* either we see any waiters or they will see updated max_seq */
++ if (wq_has_sleeper(&lruvec->mm_walk.wait))
++ wake_up_all(&lruvec->mm_walk.wait);
++
++ wakeup_flusher_threads(WB_REASON_VMSCAN);
++
++ return true;
++}
++
++static long get_nr_evictable(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
++ unsigned long max_seq, unsigned long *min_seq, bool *low)
++{
++ int gen, type, zone;
++ long max = 0;
++ long min = 0;
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ for (type = !swappiness; type < ANON_AND_FILE; type++) {
++ unsigned long seq;
++
++ for (seq = min_seq[type]; seq <= max_seq; seq++) {
++ long size = 0;
++
++ gen = lru_gen_from_seq(seq);
++
++ for (zone = 0; zone <= sc->reclaim_idx; zone++)
++ size += READ_ONCE(lrugen->sizes[gen][type][zone]);
++
++ max += size;
++ if (type && max_seq - seq >= MIN_NR_GENS)
++ min += size;
++ }
++ }
++
++ *low = max_seq - min_seq[1] <= MIN_NR_GENS && min < MIN_BATCH_SIZE;
++
++ return max > 0 ? max : 0;
++}
++
++static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc,
++ unsigned long min_ttl)
++{
++ bool low;
++ long nr_to_scan;
++ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++ int swappiness = get_swappiness(memcg);
++ DEFINE_MAX_SEQ(lruvec);
++ DEFINE_MIN_SEQ(lruvec);
++
++ if (mem_cgroup_below_min(memcg))
++ return false;
++
++ if (min_ttl) {
++ int gen = lru_gen_from_seq(min_seq[1]);
++ unsigned long birth = READ_ONCE(lruvec->evictable.timestamps[gen]);
++
++ if (time_is_after_jiffies(birth + min_ttl))
++ return false;
++ }
++
++ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low);
++ if (!nr_to_scan)
++ return false;
++
++ nr_to_scan >>= sc->priority;
++
++ if (!mem_cgroup_online(memcg))
++ nr_to_scan++;
++
++ if (nr_to_scan && low && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim))
++ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true);
++
++ return true;
++}
++
++/* Protect the working set accessed within the last N milliseconds. */
++static unsigned long lru_gen_min_ttl __read_mostly;
++
++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
++{
++ struct mem_cgroup *memcg;
++ bool success = false;
++ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
++
++ VM_BUG_ON(!current_is_kswapd());
++
++ if (!sc->force_deactivate) {
++ sc->force_deactivate = 1;
++ return;
++ }
++
++ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args;
++
++ memcg = mem_cgroup_iter(NULL, NULL, NULL);
++ do {
++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
++
++ if (age_lruvec(lruvec, sc, min_ttl))
++ success = true;
++
++ cond_resched();
++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
++
++ if (!success && mutex_trylock(&oom_lock)) {
++ struct oom_control oc = {
++ .gfp_mask = sc->gfp_mask,
++ .order = sc->order,
++ };
++
++ /* to avoid overkilling */
++ if (!oom_reaping_in_progress())
++ out_of_memory(&oc);
++
++ mutex_unlock(&oom_lock);
++ }
++
++ current->reclaim_state->mm_walk_args = NULL;
++}
++
++/* Scan the vicinity of an accessed PTE when shrink_page_list() uses the rmap. */
++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
++{
++ int i;
++ pte_t *pte;
++ struct page *page;
++ int old_gen, new_gen;
++ unsigned long start;
++ unsigned long end;
++ unsigned long addr;
++ struct mm_walk_args *args;
++ int worth = 0;
++ struct mem_cgroup *memcg = page_memcg(pvmw->page);
++ struct pglist_data *pgdat = page_pgdat(pvmw->page);
++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
++ DEFINE_MAX_SEQ(lruvec);
++
++ lockdep_assert_held(pvmw->ptl);
++ VM_BUG_ON_PAGE(PageLRU(pvmw->page), pvmw->page);
++
++ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL;
++ if (!args)
++ return;
++
++ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
++ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
++
++ if (end - start > MIN_BATCH_SIZE * PAGE_SIZE) {
++ if (pvmw->address - start < MIN_BATCH_SIZE * PAGE_SIZE / 2)
++ end = start + MIN_BATCH_SIZE * PAGE_SIZE;
++ else if (end - pvmw->address < MIN_BATCH_SIZE * PAGE_SIZE / 2)
++ start = end - MIN_BATCH_SIZE * PAGE_SIZE;
++ else {
++ start = pvmw->address - MIN_BATCH_SIZE * PAGE_SIZE / 2;
++ end = pvmw->address + MIN_BATCH_SIZE * PAGE_SIZE / 2;
++ }
++ }
++
++ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
++ new_gen = lru_gen_from_seq(max_seq);
++
++ lock_page_memcg(pvmw->page);
++ arch_enter_lazy_mmu_mode();
++
++ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
++ unsigned long pfn = pte_pfn(pte[i]);
++
++ if (!pte_present(pte[i]) || is_zero_pfn(pfn))
++ continue;
++
++ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i])))
++ continue;
++
++ VM_BUG_ON(!pfn_valid(pfn));
++ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
++ continue;
++
++ worth++;
++
++ if (!pte_young(pte[i]))
++ continue;
++
++ page = compound_head(pfn_to_page(pfn));
++ if (page_to_nid(page) != pgdat->node_id)
++ continue;
++
++ if (page_memcg_rcu(page) != memcg)
++ continue;
++
++ VM_BUG_ON(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end);
++ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
++ continue;
++
++ if (pte_dirty(pte[i]) && !PageDirty(page) &&
++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page)))
++ __set_bit(i, args->bitmap);
++
++ old_gen = page_update_gen(page, new_gen);
++ if (old_gen >= 0 && old_gen != new_gen)
++ update_batch_size(page, old_gen, new_gen, args);
++ }
++
++ arch_leave_lazy_mmu_mode();
++ unlock_page_memcg(pvmw->page);
++
++ if (worth >= MIN_BATCH_SIZE / 2)
++ set_bloom_filter(lruvec, max_seq, pvmw->pmd);
++
++ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE)
++ set_page_dirty(pte_page(pte[i]));
++
++ bitmap_zero(args->bitmap, MIN_BATCH_SIZE);
++}
++
++/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+@@ -3477,6 +4414,12 @@ static int __init init_lru_gen(void)
+ };
+ late_initcall(init_lru_gen);
+
++#else
++
++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
++{
++}
++
+ #endif /* CONFIG_LRU_GEN */
+
+ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+@@ -4333,6 +5276,11 @@ static void age_active_anon(struct pglis
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
++ if (lru_gen_enabled()) {
++ lru_gen_age_node(pgdat, sc);
++ return;
++ }
++
+ if (!can_age_anon_pages(pgdat, sc))
+ return;
+
--- /dev/null
+From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001
+Date: Tue, 29 Jun 2021 20:46:47 -0600
+Subject: [PATCH 07/10] mm: multigenerational lru: eviction
+
+The eviction consumes old generations. Given an lruvec, the eviction
+scans pages on lrugen->lists indexed by anon and file min_seq[]
+(modulo MAX_NR_GENS). It first tries to select a type based on the
+values of min_seq[]. If they are equal, it selects the type that has
+a lower refaulted %. The eviction sorts a page according to its
+updated generation number if the aging has found this page accessed.
+It also moves a page to the next generation if this page is from an
+upper tier that has a higher refaulted % than the base tier. The
+eviction increments min_seq[] of a selected type when it finds
+lrugen->lists indexed by min_seq[] of this selected type are empty.
+
+Each generation is divided into multiple tiers. Tiers represent
+different ranges of numbers of accesses from file descriptors only.
+Pages accessed N times via file descriptors belong to tier
+order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers,
+and they require additional MAX_NR_TIERS-2 bits in page->flags. In
+contrast to moving between generations which requires list operations,
+moving between tiers only involves operations on page->flags and
+therefore has a negligible cost. A feedback loop modeled after the PID
+controller monitors refaulted % across all tiers and decides when to
+protect pages from which tiers.
+
+Unmapped pages are initially added to the oldest generation and then
+conditionally protected by tiers. Each tier keeps track of how many
+pages from it have refaulted. Tier 0 is the base tier and pages from
+it are evicted unconditionally because there are no better candidates.
+Pages from an upper tier are either evicted or moved to the next
+generation, depending on whether this upper tier has a higher
+refaulted % than the base tier. This model has the following
+advantages:
+ 1) It removes the cost in the buffered access path and reduces the
+ overall cost of protection because pages are conditionally protected
+ in the reclaim path.
+ 2) It takes mapped pages into account and avoids overprotecting
+ pages accessed multiple times via file descriptors.
+ 3) Additional tiers improve the protection of pages accessed more
+ than twice.
+
+Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac
+---
+ include/linux/mm_inline.h | 10 +
+ include/linux/mmzone.h | 33 +++
+ mm/swap.c | 42 +++
+ mm/vmscan.c | 555 +++++++++++++++++++++++++++++++++++++-
+ mm/workingset.c | 120 ++++++++-
+ 5 files changed, 757 insertions(+), 3 deletions(-)
+
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi
+ return seq % NR_HIST_GENS;
+ }
+
++/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */
++static inline int lru_tier_from_refs(int refs)
++{
++ VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH));
++
++ return order_base_2(refs + 1);
++}
++
+ /* The youngest and the second youngest generations are counted as active. */
+ static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+ {
+@@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru
+ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+
+ new_flags &= ~LRU_GEN_MASK;
++ if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS)
++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
+ /* for shrink_page_list() */
+ if (reclaiming)
+ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -319,6 +319,30 @@ struct page_vma_mapped_walk;
+ #define MIN_NR_GENS 2
+ #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+
++/*
++ * Each generation is divided into multiple tiers. Tiers represent different
++ * ranges of numbers of accesses from file descriptors, i.e.,
++ * mark_page_accessed(). In contrast to moving between generations which
++ * requires the lru lock, moving between tiers only involves an atomic
++ * operation on page->flags and therefore has a negligible cost.
++ *
++ * The purposes of tiers are to:
++ * 1) estimate whether pages accessed multiple times via file descriptors are
++ * more active than pages accessed only via page tables by separating the two
++ * access types into upper tiers and the base tier, and comparing refaulted %
++ * across all tiers.
++ * 2) improve buffered io performance by deferring the protection of pages
++ * accessed multiple times until the eviction. That is, the protection happens
++ * in the reclaim path, not the access path.
++ *
++ * Pages accessed N times via file descriptors belong to tier order_base_2(N).
++ * The base tier may be marked by PageReferenced(). All upper tiers are marked
++ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are
++ * used to support more than one upper tier.
++ */
++#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN)
++#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
++
+ /* Whether to keep stats for historical generations. */
+ #ifdef CONFIG_LRU_GEN_STATS
+ #define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+@@ -337,6 +361,15 @@ struct lrugen {
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* the sizes of the multigenerational lru lists in pages */
+ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
++ /* the exponential moving average of refaulted */
++ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
++ /* the exponential moving average of protected+evicted */
++ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
++ /* the base tier isn't protected, hence the minus one */
++ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
++ /* incremented without holding the lru lock */
++ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
++ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ /* whether the multigenerational lru is enabled */
+ bool enabled[ANON_AND_FILE];
+ };
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st
+ local_unlock(&lru_pvecs.lock);
+ }
+
++#ifdef CONFIG_LRU_GEN
++static void page_inc_refs(struct page *page)
++{
++ unsigned long refs;
++ unsigned long old_flags, new_flags;
++
++ if (PageUnevictable(page))
++ return;
++
++ /* see the comment on MAX_NR_TIERS */
++ do {
++ new_flags = old_flags = READ_ONCE(page->flags);
++
++ if (!(new_flags & BIT(PG_referenced))) {
++ new_flags |= BIT(PG_referenced);
++ continue;
++ }
++
++ if (!(new_flags & BIT(PG_workingset))) {
++ new_flags |= BIT(PG_workingset);
++ continue;
++ }
++
++ refs = new_flags & LRU_REFS_MASK;
++ refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK);
++
++ new_flags &= ~LRU_REFS_MASK;
++ new_flags |= refs;
++ } while (new_flags != old_flags &&
++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
++}
++#else
++static void page_inc_refs(struct page *page)
++{
++}
++#endif /* CONFIG_LRU_GEN */
++
+ /*
+ * Mark a page as having seen activity.
+ *
+@@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag
+ {
+ page = compound_head(page);
+
++ if (lru_gen_enabled()) {
++ page_inc_refs(page);
++ return;
++ }
++
+ if (!PageReferenced(page)) {
+ SetPageReferenced(page);
+ } else if (PageUnevictable(page)) {
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre
+
+ if (PageSwapCache(page)) {
+ swp_entry_t swap = { .val = page_private(page) };
+- mem_cgroup_swapout(page, swap);
++
++ /* get a shadow entry before page_memcg() is cleared */
+ if (reclaimed && !mapping_exiting(mapping))
+ shadow = workingset_eviction(page, target_memcg);
++ mem_cgroup_swapout(page, swap);
+ __delete_from_swap_cache(page, swap, shadow);
+ xa_unlock_irq(&mapping->i_pages);
+ put_swap_page(page, swap);
+@@ -1410,6 +1412,11 @@ retry:
+ if (!sc->may_unmap && page_mapped(page))
+ goto keep_locked;
+
++ /* lru_gen_look_around() has updated this page? */
++ if (lru_gen_enabled() && !ignore_references &&
++ page_mapped(page) && PageReferenced(page))
++ goto keep_locked;
++
+ may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+ (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
+@@ -2570,6 +2577,9 @@ static void prepare_scan_count(pg_data_t
+ unsigned long file;
+ struct lruvec *target_lruvec;
+
++ if (lru_gen_enabled())
++ return;
++
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+ /*
+@@ -2910,6 +2920,17 @@ static int page_lru_gen(struct page *pag
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+ }
+
++static int page_lru_tier(struct page *page)
++{
++ int refs;
++ unsigned long flags = READ_ONCE(page->flags);
++
++ refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ?
++ ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0;
++
++ return lru_tier_from_refs(refs);
++}
++
+ static int get_swappiness(struct mem_cgroup *memcg)
+ {
+ return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ?
+@@ -3246,6 +3267,91 @@ done:
+ }
+
+ /******************************************************************************
++ * refault feedback loop
++ ******************************************************************************/
++
++/*
++ * A feedback loop modeled after the PID controller. Currently supports the
++ * proportional (P) and the integral (I) terms; the derivative (D) term can be
++ * added if necessary. The setpoint (SP) is the desired position; the process
++ * variable (PV) is the measured position. The error is the difference between
++ * the SP and the PV. A positive error results in a positive control output
++ * correction, which, in our case, is to allow eviction.
++ *
++ * The P term is refaulted % of the current generation being evicted. The I
++ * term is the exponential moving average of refaulted % of previously evicted
++ * generations, using the smoothing factor 1/2.
++ *
++ * Our goal is to maintain proportional refaulted % across all tiers.
++ */
++struct ctrl_pos {
++ unsigned long refaulted;
++ unsigned long total;
++ int gain;
++};
++
++static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
++ struct ctrl_pos *pos)
++{
++ struct lrugen *lrugen = &lruvec->evictable;
++ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
++
++ pos->refaulted = lrugen->avg_refaulted[type][tier] +
++ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
++ pos->total = lrugen->avg_total[type][tier] +
++ atomic_long_read(&lrugen->evicted[hist][type][tier]);
++ if (tier)
++ pos->total += lrugen->protected[hist][type][tier - 1];
++ pos->gain = gain;
++}
++
++static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type)
++{
++ int tier;
++ int hist = lru_hist_from_seq(gen);
++ struct lrugen *lrugen = &lruvec->evictable;
++ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]);
++ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
++
++ if (!carryover && !clear)
++ return;
++
++ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
++ if (carryover) {
++ unsigned long sum;
++
++ sum = lrugen->avg_refaulted[type][tier] +
++ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
++ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
++
++ sum = lrugen->avg_total[type][tier] +
++ atomic_long_read(&lrugen->evicted[hist][type][tier]);
++ if (tier)
++ sum += lrugen->protected[hist][type][tier - 1];
++ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
++ }
++
++ if (clear) {
++ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
++ atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
++ if (tier)
++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
++ }
++ }
++}
++
++static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
++{
++ /*
++ * Allow eviction if the PV has a limited number of refaulted pages or a
++ * lower refaulted % than the SP.
++ */
++ return pv->refaulted < MIN_BATCH_SIZE ||
++ pv->refaulted * max(sp->total, 1UL) * sp->gain <=
++ sp->refaulted * max(pv->total, 1UL) * pv->gain;
++}
++
++/******************************************************************************
+ * the aging
+ ******************************************************************************/
+
+@@ -3265,6 +3371,7 @@ static int page_update_gen(struct page *
+
+ new_flags &= ~LRU_GEN_MASK;
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
+ } while (new_flags != old_flags &&
+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+@@ -3296,6 +3403,7 @@ static void page_inc_gen(struct page *pa
+
+ new_flags &= ~LRU_GEN_MASK;
+ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
+ /* for end_page_writeback() */
+ if (reclaiming)
+ new_flags |= BIT(PG_reclaim);
+@@ -3787,6 +3895,7 @@ static bool inc_min_seq(struct lruvec *l
+ }
+ }
+
++ reset_ctrl_pos(lruvec, gen, type);
+ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
+
+ return true;
+@@ -3824,6 +3933,8 @@ next:
+ if (min_seq[type] == lrugen->min_seq[type])
+ continue;
+
++ gen = lru_gen_from_seq(lrugen->min_seq[type]);
++ reset_ctrl_pos(lruvec, gen, type);
+ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
+ success = true;
+ }
+@@ -3885,6 +3996,9 @@ static void inc_max_seq(struct lruvec *l
+ }
+ }
+
++ for (type = 0; type < ANON_AND_FILE; type++)
++ reset_ctrl_pos(lruvec, gen, type);
++
+ WRITE_ONCE(lrugen->timestamps[gen], jiffies);
+ /* make sure all preceding modifications appear first */
+ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
+@@ -4166,6 +4280,433 @@ void lru_gen_look_around(struct page_vma
+ }
+
+ /******************************************************************************
++ * the eviction
++ ******************************************************************************/
++
++static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx)
++{
++ bool success;
++ int gen = page_lru_gen(page);
++ int type = page_is_file_lru(page);
++ int zone = page_zonenum(page);
++ int tier = page_lru_tier(page);
++ int delta = thp_nr_pages(page);
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page);
++
++ /* an mlocked page? */
++ if (!page_evictable(page)) {
++ success = lru_gen_del_page(page, lruvec, true);
++ VM_BUG_ON_PAGE(!success, page);
++ SetPageUnevictable(page);
++ add_page_to_lru_list(page, lruvec);
++ __count_vm_events(UNEVICTABLE_PGCULLED, delta);
++ return true;
++ }
++
++ /* a lazy-free page that has been written into? */
++ if (type && PageDirty(page) && PageAnon(page)) {
++ success = lru_gen_del_page(page, lruvec, true);
++ VM_BUG_ON_PAGE(!success, page);
++ SetPageSwapBacked(page);
++ add_page_to_lru_list_tail(page, lruvec);
++ return true;
++ }
++
++ /* page_update_gen() has updated this page? */
++ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
++ list_move(&page->lru, &lrugen->lists[gen][type][zone]);
++ return true;
++ }
++
++ /* protect this page if its tier has a higher refaulted % */
++ if (tier > tier_idx) {
++ int hist = lru_hist_from_seq(gen);
++
++ page_inc_gen(page, lruvec, false);
++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
++ lrugen->protected[hist][type][tier - 1] + delta);
++ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
++ return true;
++ }
++
++ /* mark this page for reclaim if it's pending writeback */
++ if (PageWriteback(page) || (type && PageDirty(page))) {
++ page_inc_gen(page, lruvec, true);
++ return true;
++ }
++
++ return false;
++}
++
++static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc)
++{
++ bool success;
++
++ if (!sc->may_unmap && page_mapped(page))
++ return false;
++
++ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
++ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page))))
++ return false;
++
++ if (!get_page_unless_zero(page))
++ return false;
++
++ if (!TestClearPageLRU(page)) {
++ put_page(page);
++ return false;
++ }
++
++ success = lru_gen_del_page(page, lruvec, true);
++ VM_BUG_ON_PAGE(!success, page);
++
++ return true;
++}
++
++static int scan_pages(struct lruvec *lruvec, struct scan_control *sc,
++ int type, int tier, struct list_head *list)
++{
++ int gen, zone;
++ enum vm_event_item item;
++ int sorted = 0;
++ int scanned = 0;
++ int isolated = 0;
++ int remaining = MAX_BATCH_SIZE;
++ struct lrugen *lrugen = &lruvec->evictable;
++ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++
++ VM_BUG_ON(!list_empty(list));
++
++ if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
++ return 0;
++
++ gen = lru_gen_from_seq(lrugen->min_seq[type]);
++
++ for (zone = sc->reclaim_idx; zone >= 0; zone--) {
++ LIST_HEAD(moved);
++ int skipped = 0;
++ struct list_head *head = &lrugen->lists[gen][type][zone];
++
++ while (!list_empty(head)) {
++ struct page *page = lru_to_page(head);
++ int delta = thp_nr_pages(page);
++
++ VM_BUG_ON_PAGE(PageTail(page), page);
++ VM_BUG_ON_PAGE(PageUnevictable(page), page);
++ VM_BUG_ON_PAGE(PageActive(page), page);
++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
++
++ prefetchw_prev_lru_page(page, head, flags);
++
++ scanned += delta;
++
++ if (sort_page(page, lruvec, tier))
++ sorted += delta;
++ else if (isolate_page(page, lruvec, sc)) {
++ list_add(&page->lru, list);
++ isolated += delta;
++ } else {
++ list_move(&page->lru, &moved);
++ skipped += delta;
++ }
++
++ if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE)
++ break;
++ }
++
++ if (skipped) {
++ list_splice(&moved, head);
++ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
++ }
++
++ if (!remaining || isolated >= MIN_BATCH_SIZE)
++ break;
++ }
++
++ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
++ if (!cgroup_reclaim(sc)) {
++ __count_vm_events(item, isolated);
++ __count_vm_events(PGREFILL, sorted);
++ }
++ __count_memcg_events(memcg, item, isolated);
++ __count_memcg_events(memcg, PGREFILL, sorted);
++ __count_vm_events(PGSCAN_ANON + type, isolated);
++
++ /*
++ * We may have trouble finding eligible pages due to reclaim_idx,
++ * may_unmap and may_writepage. Check `remaining` to make sure we won't
++ * be stuck if we aren't making enough progress.
++ */
++ return isolated || !remaining ? scanned : 0;
++}
++
++static int get_tier_idx(struct lruvec *lruvec, int type)
++{
++ int tier;
++ struct ctrl_pos sp, pv;
++
++ /*
++ * Ideally we don't want to evict upper tiers that have higher refaulted
++ * %. However, we need to leave a margin for the fluctuation in
++ * refaulted %. So we use a larger gain factor to make sure upper tiers
++ * are indeed more active. We choose 2 because the lowest upper tier
++ * would have twice the refaulted % of the base tier, according to their
++ * numbers of accesses.
++ */
++ read_ctrl_pos(lruvec, type, 0, 1, &sp);
++ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
++ read_ctrl_pos(lruvec, type, tier, 2, &pv);
++ if (!positive_ctrl_err(&sp, &pv))
++ break;
++ }
++
++ return tier - 1;
++}
++
++static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
++{
++ int type, tier;
++ struct ctrl_pos sp, pv;
++ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
++
++ /*
++ * Compare refaulted % between the base tiers of anon and file to
++ * determine which type to evict. Also need to compare refaulted % of
++ * the upper tiers of the selected type with that of the base tier of
++ * the other type to determine which tier of the selected type to evict.
++ */
++ read_ctrl_pos(lruvec, 0, 0, gain[0], &sp);
++ read_ctrl_pos(lruvec, 1, 0, gain[1], &pv);
++ type = positive_ctrl_err(&sp, &pv);
++
++ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
++ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
++ read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
++ if (!positive_ctrl_err(&sp, &pv))
++ break;
++ }
++
++ *tier_idx = tier - 1;
++
++ return type;
++}
++
++static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
++ int *type_scanned, struct list_head *list)
++{
++ int i;
++ int type;
++ int scanned;
++ int tier = -1;
++ DEFINE_MIN_SEQ(lruvec);
++
++ VM_BUG_ON(!seq_is_valid(lruvec));
++
++ /*
++ * Try to select a type based on generations and swappiness, and if that
++ * fails, fall back to get_type_to_scan(). When anon and file are both
++ * available from the same generation, swappiness 200 is interpreted as
++ * anon first and swappiness 1 is interpreted as file first.
++ */
++ if (!swappiness)
++ type = 1;
++ else if (min_seq[0] < min_seq[1])
++ type = 0;
++ else if (swappiness == 1)
++ type = 1;
++ else if (swappiness == 200)
++ type = 0;
++ else
++ type = get_type_to_scan(lruvec, swappiness, &tier);
++
++ for (i = !swappiness; i < ANON_AND_FILE; i++) {
++ if (tier < 0)
++ tier = get_tier_idx(lruvec, type);
++
++ scanned = scan_pages(lruvec, sc, type, tier, list);
++ if (scanned)
++ break;
++
++ type = !type;
++ tier = -1;
++ }
++
++ *type_scanned = type;
++
++ return scanned;
++}
++
++/* Main function used by the foreground, the background and the user-triggered eviction. */
++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
++{
++ int type;
++ int scanned;
++ int reclaimed;
++ LIST_HEAD(list);
++ struct page *page;
++ enum vm_event_item item;
++ struct reclaim_stat stat;
++ struct mm_walk_args *args;
++ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
++
++ spin_lock_irq(&lruvec->lru_lock);
++
++ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list);
++
++ if (try_to_inc_min_seq(lruvec, swappiness))
++ scanned++;
++
++ if (get_nr_gens(lruvec, 1) == MIN_NR_GENS)
++ scanned = 0;
++
++ spin_unlock_irq(&lruvec->lru_lock);
++
++ if (list_empty(&list))
++ return scanned;
++
++ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
++ /*
++ * We need to prevent rejected pages from being added back to the same
++ * lists they were isolated from. Otherwise we may risk looping on them
++ * forever.
++ */
++ list_for_each_entry(page, &list, lru) {
++ if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page)))
++ SetPageActive(page);
++
++ ClearPageReferenced(page);
++ ClearPageWorkingset(page);
++ }
++
++ spin_lock_irq(&lruvec->lru_lock);
++
++ move_pages_to_lru(lruvec, &list);
++
++ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL;
++ if (args && args->batch_size)
++ reset_batch_size(lruvec, args);
++
++ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
++ if (!cgroup_reclaim(sc))
++ __count_vm_events(item, reclaimed);
++ __count_memcg_events(memcg, item, reclaimed);
++ __count_vm_events(PGSTEAL_ANON + type, reclaimed);
++
++ spin_unlock_irq(&lruvec->lru_lock);
++
++ mem_cgroup_uncharge_list(&list);
++ free_unref_page_list(&list);
++
++ sc->nr_reclaimed += reclaimed;
++
++ return scanned;
++}
++
++static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
++{
++ bool low;
++ long nr_to_scan;
++ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++ int priority = sc->priority;
++ DEFINE_MAX_SEQ(lruvec);
++ DEFINE_MIN_SEQ(lruvec);
++
++ if (mem_cgroup_below_min(memcg) ||
++ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
++ return 0;
++
++ if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
++ priority = DEF_PRIORITY;
++ sc->force_deactivate = 0;
++ }
++
++ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low);
++ if (!nr_to_scan)
++ return 0;
++
++ nr_to_scan >>= priority;
++
++ if (!mem_cgroup_online(memcg))
++ nr_to_scan++;
++
++ if (!nr_to_scan)
++ return 0;
++
++ if (current_is_kswapd()) {
++ /* leave the work to lru_gen_age_node() */
++ if (max_seq - min_seq[1] < MIN_NR_GENS)
++ return 0;
++
++ if (!low)
++ sc->force_deactivate = 0;
++
++ return nr_to_scan;
++ }
++
++ if (max_seq - min_seq[1] >= MIN_NR_GENS)
++ return nr_to_scan;
++
++ /* move onto slab and other memcgs if we haven't tried them all */
++ if (!sc->force_deactivate) {
++ sc->skipped_deactivate = 1;
++ return 0;
++ }
++
++ return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0;
++}
++
++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++{
++ struct blk_plug plug;
++ long scanned = 0;
++ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
++
++ lru_add_drain();
++
++ if (current_is_kswapd())
++ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args;
++
++ blk_start_plug(&plug);
++
++ while (true) {
++ int delta;
++ int swappiness;
++ long nr_to_scan;
++
++ if (sc->may_swap)
++ swappiness = get_swappiness(memcg);
++ else if (!cgroup_reclaim(sc) && get_swappiness(memcg))
++ swappiness = 1;
++ else
++ swappiness = 0;
++
++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
++ if (!nr_to_scan)
++ break;
++
++ delta = evict_pages(lruvec, sc, swappiness);
++ if (!delta)
++ break;
++
++ scanned += delta;
++ if (scanned >= nr_to_scan)
++ break;
++
++ cond_resched();
++ }
++
++ blk_finish_plug(&plug);
++
++ if (current_is_kswapd())
++ current->reclaim_state->mm_walk_args = NULL;
++}
++
++/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+@@ -4420,6 +4961,10 @@ static void lru_gen_age_node(struct pgli
+ {
+ }
+
++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++{
++}
++
+ #endif /* CONFIG_LRU_GEN */
+
+ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+@@ -4433,6 +4978,11 @@ static void shrink_lruvec(struct lruvec
+ struct blk_plug plug;
+ bool scan_adjusted;
+
++ if (lru_gen_enabled()) {
++ lru_gen_shrink_lruvec(lruvec, sc);
++ return;
++ }
++
+ get_scan_count(lruvec, sc, nr);
+
+ /* Record the original scan target for proportional adjustments later */
+@@ -4906,6 +5456,9 @@ static void snapshot_refaults(struct mem
+ struct lruvec *target_lruvec;
+ unsigned long refaults;
+
++ if (lru_gen_enabled())
++ return;
++
+ target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
+ target_lruvec->refaults[0] = refaults;
+--- a/mm/workingset.c
++++ b/mm/workingset.c
+@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_
+ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
+ bool workingset)
+ {
+- eviction >>= bucket_order;
+ eviction &= EVICTION_MASK;
+ eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
+ eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+@@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow,
+
+ *memcgidp = memcgid;
+ *pgdat = NODE_DATA(nid);
+- *evictionp = entry << bucket_order;
++ *evictionp = entry;
+ *workingsetp = workingset;
+ }
+
++#ifdef CONFIG_LRU_GEN
++
++static int page_lru_refs(struct page *page)
++{
++ unsigned long flags = READ_ONCE(page->flags);
++
++ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
++
++ /* see the comment on MAX_NR_TIERS */
++ return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0;
++}
++
++/* Return a token to be stored in the shadow entry of a page being evicted. */
++static void *lru_gen_eviction(struct page *page)
++{
++ int hist, tier;
++ unsigned long token;
++ unsigned long min_seq;
++ struct lruvec *lruvec;
++ struct lrugen *lrugen;
++ int type = page_is_file_lru(page);
++ int refs = page_lru_refs(page);
++ int delta = thp_nr_pages(page);
++ bool workingset = PageWorkingset(page);
++ struct mem_cgroup *memcg = page_memcg(page);
++ struct pglist_data *pgdat = page_pgdat(page);
++
++ lruvec = mem_cgroup_lruvec(memcg, pgdat);
++ lrugen = &lruvec->evictable;
++ min_seq = READ_ONCE(lrugen->min_seq[type]);
++ token = (min_seq << LRU_REFS_WIDTH) | refs;
++
++ hist = lru_hist_from_seq(min_seq);
++ tier = lru_tier_from_refs(refs + workingset);
++ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
++
++ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
++}
++
++/* Count a refaulted page based on the token stored in its shadow entry. */
++static void lru_gen_refault(struct page *page, void *shadow)
++{
++ int hist, tier, refs;
++ int memcg_id;
++ bool workingset;
++ unsigned long token;
++ unsigned long min_seq;
++ struct lruvec *lruvec;
++ struct lrugen *lrugen;
++ struct mem_cgroup *memcg;
++ struct pglist_data *pgdat;
++ int type = page_is_file_lru(page);
++ int delta = thp_nr_pages(page);
++
++ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
++ if (page_pgdat(page) != pgdat)
++ return;
++
++ rcu_read_lock();
++ memcg = page_memcg_rcu(page);
++ if (mem_cgroup_id(memcg) != memcg_id)
++ goto unlock;
++
++ refs = token & (BIT(LRU_REFS_WIDTH) - 1);
++ if (refs && !workingset)
++ goto unlock;
++
++ token >>= LRU_REFS_WIDTH;
++ lruvec = mem_cgroup_lruvec(memcg, pgdat);
++ lrugen = &lruvec->evictable;
++ min_seq = READ_ONCE(lrugen->min_seq[type]);
++ if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
++ goto unlock;
++
++ hist = lru_hist_from_seq(min_seq);
++ tier = lru_tier_from_refs(refs + workingset);
++ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
++ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
++
++ /*
++ * Tiers don't offer any protection to pages accessed via page tables.
++ * That's what generations do. Tiers can't fully protect pages after
++ * their numbers of accesses have exceeded the max value. Conservatively
++ * count these two conditions as stalls even though they might not
++ * indicate any real memory pressure.
++ */
++ if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) {
++ SetPageWorkingset(page);
++ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
++ }
++unlock:
++ rcu_read_unlock();
++}
++
++#else
++
++static void *lru_gen_eviction(struct page *page)
++{
++ return NULL;
++}
++
++static void lru_gen_refault(struct page *page, void *shadow)
++{
++}
++
++#endif /* CONFIG_LRU_GEN */
++
+ /**
+ * workingset_age_nonresident - age non-resident entries as LRU ages
+ * @lruvec: the lruvec that was aged
+@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
++ if (lru_gen_enabled())
++ return lru_gen_eviction(page);
++
+ lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ /* XXX: target_memcg can be NULL, go through lruvec */
+ memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
+ eviction = atomic_long_read(&lruvec->nonresident_age);
++ eviction >>= bucket_order;
+ workingset_age_nonresident(lruvec, thp_nr_pages(page));
+ return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
+ }
+@@ -296,7 +406,13 @@ void workingset_refault(struct page *pag
+ bool workingset;
+ int memcgid;
+
++ if (lru_gen_enabled()) {
++ lru_gen_refault(page, shadow);
++ return;
++ }
++
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
++ eviction <<= bucket_order;
+
+ rcu_read_lock();
+ /*
--- /dev/null
+From 5cc7fdec54e87e32b4fb0f07d84b21769d5f8d92 Mon Sep 17 00:00:00 2001
+Date: Mon, 25 Jan 2021 21:38:02 -0700
+Subject: [PATCH 08/10] mm: multigenerational lru: user interface
+
+Add /sys/kernel/mm/lru_gen/enabled to enable and disable the
+multigenerational lru at runtime.
+
+Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set of a
+given number of milliseconds. The OOM killer is invoked if this
+working set cannot be kept in memory.
+
+Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and
+invoke the aging and the eviction. This file has the following output:
+ memcg memcg_id memcg_path
+ node node_id
+ min_gen birth_time anon_size file_size
+ ...
+ max_gen birth_time anon_size file_size
+
+min_gen is the oldest generation number and max_gen is the youngest
+generation number. birth_time is in milliseconds. anon_size and
+file_size are in pages.
+
+This file takes the following input:
+ + memcg_id node_id max_gen [swappiness] [use_bloom_filter]
+ - memcg_id node_id min_gen [swappiness] [nr_to_reclaim]
+
+The first command line invokes the aging, which scans PTEs for
+accessed pages and then creates the next generation max_gen+1. A swap
+file and a non-zero swappiness, which overrides vm.swappiness, are
+required to scan PTEs mapping anon pages. The second command line
+invokes the eviction, which evicts generations less than or equal to
+min_gen. min_gen should be less than max_gen-1 as max_gen and
+max_gen-1 are not fully aged and therefore cannot be evicted.
+Setting nr_to_reclaim to N limits the number of pages to evict.
+Setting use_bloom_filter to 0 overrides the default behavior which
+only scans PTE tables found populated. Multiple command lines are
+supported, as is concatenation with delimiters "," and ";".
+
+Change-Id: I4448e60029badbe347aa3b624f429b280cc3a3d3
+---
+ include/linux/nodemask.h | 1 +
+ mm/vmscan.c | 415 +++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 416 insertions(+)
+
+--- a/include/linux/nodemask.h
++++ b/include/linux/nodemask.h
+@@ -485,6 +485,7 @@ static inline int num_node_state(enum no
+ #define first_online_node 0
+ #define first_memory_node 0
+ #define next_online_node(nid) (MAX_NUMNODES)
++#define next_memory_node(nid) (MAX_NUMNODES)
+ #define nr_node_ids 1U
+ #define nr_online_nodes 1U
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -53,6 +53,8 @@
+ #include <linux/memory.h>
+ #include <linux/pagewalk.h>
+ #include <linux/shmem_fs.h>
++#include <linux/ctype.h>
++#include <linux/debugfs.h>
+
+ #include <asm/tlbflush.h>
+ #include <asm/div64.h>
+@@ -4882,6 +4884,413 @@ unlock:
+ }
+
+ /******************************************************************************
++ * sysfs interface
++ ******************************************************************************/
++
++static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
++{
++ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
++}
++
++static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
++ const char *buf, size_t len)
++{
++ unsigned int msecs;
++
++ if (kstrtouint(buf, 10, &msecs))
++ return -EINVAL;
++
++ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
++
++ return len;
++}
++
++static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
++ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
++);
++
++static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
++{
++ return sysfs_emit(buf, "%d\n", lru_gen_enabled());
++}
++
++static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr,
++ const char *buf, size_t len)
++{
++ bool enable;
++
++ if (kstrtobool(buf, &enable))
++ return -EINVAL;
++
++ lru_gen_change_state(enable, true, false);
++
++ return len;
++}
++
++static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
++ enabled, 0644, show_enable, store_enable
++);
++
++static struct attribute *lru_gen_attrs[] = {
++ &lru_gen_min_ttl_attr.attr,
++ &lru_gen_enabled_attr.attr,
++ NULL
++};
++
++static struct attribute_group lru_gen_attr_group = {
++ .name = "lru_gen",
++ .attrs = lru_gen_attrs,
++};
++
++/******************************************************************************
++ * debugfs interface
++ ******************************************************************************/
++
++static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
++{
++ struct mem_cgroup *memcg;
++ loff_t nr_to_skip = *pos;
++
++ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
++ if (!m->private)
++ return ERR_PTR(-ENOMEM);
++
++ memcg = mem_cgroup_iter(NULL, NULL, NULL);
++ do {
++ int nid;
++
++ for_each_node_state(nid, N_MEMORY) {
++ if (!nr_to_skip--)
++ return get_lruvec(nid, memcg);
++ }
++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
++
++ return NULL;
++}
++
++static void lru_gen_seq_stop(struct seq_file *m, void *v)
++{
++ if (!IS_ERR_OR_NULL(v))
++ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
++
++ kvfree(m->private);
++ m->private = NULL;
++}
++
++static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++ int nid = lruvec_pgdat(v)->node_id;
++ struct mem_cgroup *memcg = lruvec_memcg(v);
++
++ ++*pos;
++
++ nid = next_memory_node(nid);
++ if (nid == MAX_NUMNODES) {
++ memcg = mem_cgroup_iter(NULL, memcg, NULL);
++ if (!memcg)
++ return NULL;
++
++ nid = first_memory_node;
++ }
++
++ return get_lruvec(nid, memcg);
++}
++
++static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
++ unsigned long max_seq, unsigned long *min_seq,
++ unsigned long seq)
++{
++ int i;
++ int type, tier;
++ int hist = lru_hist_from_seq(seq);
++ struct lrugen *lrugen = &lruvec->evictable;
++
++ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
++ seq_printf(m, " %10d", tier);
++ for (type = 0; type < ANON_AND_FILE; type++) {
++ unsigned long n[3] = {};
++
++ if (seq == max_seq) {
++ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
++ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
++
++ seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]);
++ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
++ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
++ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
++ if (tier)
++ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
++
++ seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]);
++ } else
++ seq_puts(m, " 0 0 0 ");
++ }
++ seq_putc(m, '\n');
++ }
++
++ seq_puts(m, " ");
++ for (i = 0; i < NR_MM_STATS; i++) {
++ if (seq == max_seq && NR_HIST_GENS == 1)
++ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
++ toupper(MM_STAT_CODES[i]));
++ else if (seq != max_seq && NR_HIST_GENS > 1)
++ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
++ MM_STAT_CODES[i]);
++ else
++ seq_puts(m, " 0 ");
++ }
++ seq_putc(m, '\n');
++}
++
++static int lru_gen_seq_show(struct seq_file *m, void *v)
++{
++ unsigned long seq;
++ bool full = !debugfs_real_fops(m->file)->write;
++ struct lruvec *lruvec = v;
++ struct lrugen *lrugen = &lruvec->evictable;
++ int nid = lruvec_pgdat(lruvec)->node_id;
++ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++ DEFINE_MAX_SEQ(lruvec);
++ DEFINE_MIN_SEQ(lruvec);
++
++ if (nid == first_memory_node) {
++ const char *path = memcg ? m->private : "";
++
++#ifdef CONFIG_MEMCG
++ if (memcg)
++ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
++#endif
++ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
++ }
++
++ seq_printf(m, " node %5d\n", nid);
++
++ if (!full)
++ seq = min_seq[0];
++ else if (max_seq >= MAX_NR_GENS)
++ seq = max_seq - MAX_NR_GENS + 1;
++ else
++ seq = 0;
++
++ for (; seq <= max_seq; seq++) {
++ int gen, type, zone;
++ unsigned int msecs;
++
++ gen = lru_gen_from_seq(seq);
++ msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen]));
++
++ seq_printf(m, " %10lu %10u", seq, msecs);
++
++ for (type = 0; type < ANON_AND_FILE; type++) {
++ long size = 0;
++
++ if (seq < min_seq[type]) {
++ seq_puts(m, " -0 ");
++ continue;
++ }
++
++ for (zone = 0; zone < MAX_NR_ZONES; zone++)
++ size += READ_ONCE(lrugen->sizes[gen][type][zone]);
++
++ seq_printf(m, " %10lu ", max(size, 0L));
++ }
++
++ seq_putc(m, '\n');
++
++ if (full)
++ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
++ }
++
++ return 0;
++}
++
++static const struct seq_operations lru_gen_seq_ops = {
++ .start = lru_gen_seq_start,
++ .stop = lru_gen_seq_stop,
++ .next = lru_gen_seq_next,
++ .show = lru_gen_seq_show,
++};
++
++static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
++ unsigned long seq, bool use_filter)
++{
++ DEFINE_MAX_SEQ(lruvec);
++
++ if (seq == max_seq)
++ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter);
++
++ return seq > max_seq ? -EINVAL : 0;
++}
++
++static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
++ unsigned long seq, unsigned long nr_to_reclaim)
++{
++ struct blk_plug plug;
++ int err = -EINTR;
++ DEFINE_MAX_SEQ(lruvec);
++
++ if (seq >= max_seq - 1)
++ return -EINVAL;
++
++ sc->nr_reclaimed = 0;
++
++ blk_start_plug(&plug);
++
++ while (!signal_pending(current)) {
++ DEFINE_MIN_SEQ(lruvec);
++
++ if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim ||
++ !evict_pages(lruvec, sc, swappiness)) {
++ err = 0;
++ break;
++ }
++
++ cond_resched();
++ }
++
++ blk_finish_plug(&plug);
++
++ return err;
++}
++
++static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc,
++ int swappiness, unsigned long seq, unsigned long opt)
++{
++ struct lruvec *lruvec;
++ int err = -EINVAL;
++ struct mem_cgroup *memcg = NULL;
++
++ if (!mem_cgroup_disabled()) {
++ rcu_read_lock();
++ memcg = mem_cgroup_from_id(memcg_id);
++#ifdef CONFIG_MEMCG
++ if (memcg && !css_tryget(&memcg->css))
++ memcg = NULL;
++#endif
++ rcu_read_unlock();
++
++ if (!memcg)
++ goto done;
++ }
++ if (memcg_id != mem_cgroup_id(memcg))
++ goto done;
++
++ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
++ goto done;
++
++ lruvec = get_lruvec(nid, memcg);
++
++ if (swappiness < 0)
++ swappiness = get_swappiness(memcg);
++ else if (swappiness > 200)
++ goto done;
++
++ switch (cmd) {
++ case '+':
++ err = run_aging(lruvec, sc, swappiness, seq, opt);
++ break;
++ case '-':
++ err = run_eviction(lruvec, sc, swappiness, seq, opt);
++ break;
++ }
++done:
++ mem_cgroup_put(memcg);
++
++ return err;
++}
++
++static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
++ size_t len, loff_t *pos)
++{
++ void *buf;
++ char *cur, *next;
++ unsigned int flags;
++ int err = 0;
++ struct scan_control sc = {
++ .may_writepage = 1,
++ .may_unmap = 1,
++ .may_swap = 1,
++ .reclaim_idx = MAX_NR_ZONES - 1,
++ .gfp_mask = GFP_KERNEL,
++ };
++
++ buf = kvmalloc(len + 1, GFP_KERNEL);
++ if (!buf)
++ return -ENOMEM;
++
++ if (copy_from_user(buf, src, len)) {
++ kvfree(buf);
++ return -EFAULT;
++ }
++
++ next = buf;
++ next[len] = '\0';
++
++ sc.reclaim_state.mm_walk_args = alloc_mm_walk_args();
++ if (!sc.reclaim_state.mm_walk_args) {
++ kvfree(buf);
++ return -ENOMEM;
++ }
++
++ flags = memalloc_noreclaim_save();
++ set_task_reclaim_state(current, &sc.reclaim_state);
++
++ while ((cur = strsep(&next, ",;\n"))) {
++ int n;
++ int end;
++ char cmd;
++ unsigned int memcg_id;
++ unsigned int nid;
++ unsigned long seq;
++ unsigned int swappiness = -1;
++ unsigned long opt = -1;
++
++ cur = skip_spaces(cur);
++ if (!*cur)
++ continue;
++
++ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
++ &seq, &end, &swappiness, &end, &opt, &end);
++ if (n < 4 || cur[end]) {
++ err = -EINVAL;
++ break;
++ }
++
++ err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt);
++ if (err)
++ break;
++ }
++
++ set_task_reclaim_state(current, NULL);
++ memalloc_noreclaim_restore(flags);
++
++ free_mm_walk_args(sc.reclaim_state.mm_walk_args);
++ kvfree(buf);
++
++ return err ? : len;
++}
++
++static int lru_gen_seq_open(struct inode *inode, struct file *file)
++{
++ return seq_open(file, &lru_gen_seq_ops);
++}
++
++static const struct file_operations lru_gen_rw_fops = {
++ .open = lru_gen_seq_open,
++ .read = seq_read,
++ .write = lru_gen_seq_write,
++ .llseek = seq_lseek,
++ .release = seq_release,
++};
++
++static const struct file_operations lru_gen_ro_fops = {
++ .open = lru_gen_seq_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
++};
++
++/******************************************************************************
+ * initialization
+ ******************************************************************************/
+
+@@ -4951,6 +5360,12 @@ static int __init init_lru_gen(void)
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
+
++ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
++ pr_err("lru_gen: failed to create sysfs group\n");
++
++ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
++ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
++
+ return 0;
+ };
+ late_initcall(init_lru_gen);
--- /dev/null
+From 3008095eb835d207dd7e5b60899aad17f32aa9f7 Mon Sep 17 00:00:00 2001
+Date: Mon, 25 Jan 2021 21:47:24 -0700
+Subject: [PATCH 09/10] mm: multigenerational lru: Kconfig
+
+Add configuration options for the multigenerational lru.
+
+Change-Id: Ic74ea07f8fb5f56e6904a1b80c3c286bc2911635
+---
+ mm/Kconfig | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 59 insertions(+)
+
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -899,4 +899,63 @@ config SECRETMEM
+
+ source "mm/damon/Kconfig"
+
++# the multigenerational lru {
++config LRU_GEN
++ bool "Multigenerational LRU"
++ depends on MMU
++ # the following options may not leave enough spare bits in page->flags
++ depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP)
++ help
++ A high performance LRU implementation to heavily overcommit workloads
++ that are not IO bound. See Documentation/vm/multigen_lru.rst for
++ details.
++
++ Warning: do not enable this option unless you plan to use it because
++ it introduces a small per-process and per-memcg and per-node memory
++ overhead.
++
++config LRU_GEN_ENABLED
++ bool "Turn on by default"
++ depends on LRU_GEN
++ help
++ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option
++ changes it to 1.
++
++ Warning: the default value is the fast path. See
++ Documentation/static-keys.txt for details.
++
++config LRU_GEN_STATS
++ bool "Full stats for debugging"
++ depends on LRU_GEN
++ help
++ This option keeps full stats for each generation, which can be read
++ from /sys/kernel/debug/lru_gen_full.
++
++ Warning: do not enable this option unless you plan to use it because
++ it introduces an additional small per-process and per-memcg and
++ per-node memory overhead.
++
++config NR_LRU_GENS
++ int "Max number of generations"
++ depends on LRU_GEN
++ range 4 31
++ default 7
++ help
++ This will use order_base_2(N+1) spare bits from page flags.
++
++ Warning: do not use numbers larger than necessary because each
++ generation introduces a small per-node and per-memcg memory overhead.
++
++config TIERS_PER_GEN
++ int "Number of tiers per generation"
++ depends on LRU_GEN
++ range 2 5
++ default 4
++ help
++ This will use N-2 spare bits from page flags.
++
++ Larger values generally offer better protection to active pages under
++ heavy buffered I/O workloads.
++# }
++
+ endmenu
--- /dev/null
+From f59c618ed70a1e48accc4cad91a200966f2569c9 Mon Sep 17 00:00:00 2001
+Date: Tue, 2 Feb 2021 01:27:45 -0700
+Subject: [PATCH 10/10] mm: multigenerational lru: documentation
+
+Add Documentation/vm/multigen_lru.rst.
+
+Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e
+---
+ Documentation/vm/index.rst | 1 +
+ Documentation/vm/multigen_lru.rst | 132 ++++++++++++++++++++++++++++++
+ 2 files changed, 133 insertions(+)
+ create mode 100644 Documentation/vm/multigen_lru.rst
+
+--- a/Documentation/vm/index.rst
++++ b/Documentation/vm/index.rst
+@@ -17,6 +17,7 @@ various features of the Linux memory man
+
+ swap_numa
+ zswap
++ multigen_lru
+
+ Kernel developers MM documentation
+ ==================================
+--- /dev/null
++++ b/Documentation/vm/multigen_lru.rst
+@@ -0,0 +1,132 @@
++.. SPDX-License-Identifier: GPL-2.0
++
++=====================
++Multigenerational LRU
++=====================
++
++Quick Start
++===========
++Build Configurations
++--------------------
++:Required: Set ``CONFIG_LRU_GEN=y``.
++
++:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by
++ default.
++
++Runtime Configurations
++----------------------
++:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enabled`` if the
++ feature was not turned on by default.
++
++:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to
++ protect the working set of ``N`` milliseconds. The OOM killer is
++ invoked if this working set cannot be kept in memory.
++
++:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature
++ is turned on. This file has the following output:
++
++::
++
++ memcg memcg_id memcg_path
++ node node_id
++ min_gen birth_time anon_size file_size
++ ...
++ max_gen birth_time anon_size file_size
++
++``min_gen`` is the oldest generation number and ``max_gen`` is the
++youngest generation number. ``birth_time`` is in milliseconds.
++``anon_size`` and ``file_size`` are in pages.
++
++Phones/Laptops/Workstations
++---------------------------
++No additional configurations required.
++
++Servers/Data Centers
++--------------------
++:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a
++ larger number.
++
++:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger
++ number.
++
++:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``.
++
++:Working set estimation: Write ``+ memcg_id node_id max_gen
++ [swappiness] [use_bloom_filter]`` to ``/sys/kernel/debug/lru_gen`` to
++ invoke the aging, which scans PTEs for accessed pages and then
++ creates the next generation ``max_gen+1``. A swap file and a non-zero
++ ``swappiness``, which overrides ``vm.swappiness``, are required to
++ scan PTEs mapping anon pages. Set ``use_bloom_filter`` to 0 to
++ override the default behavior which only scans PTE tables found
++ populated.
++
++:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness]
++ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the
++ eviction, which evicts generations less than or equal to ``min_gen``.
++ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and
++ ``max_gen-1`` are not fully aged and therefore cannot be evicted.
++ Use ``nr_to_reclaim`` to limit the number of pages to evict. Multiple
++ command lines are supported, as is concatenation with delimiters
++ ``,`` and ``;``.
++
++Framework
++=========
++For each ``lruvec``, evictable pages are divided into multiple
++generations. The youngest generation number is stored in
++``lrugen->max_seq`` for both anon and file types as they are aged on
++an equal footing. The oldest generation numbers are stored in
++``lrugen->min_seq[]`` separately for anon and file types as clean
++file pages can be evicted regardless of swap and writeback
++constraints. These three variables are monotonically increasing.
++Generation numbers are truncated into
++``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into
++``page->flags``. The sliding window technique is used to prevent
++truncated generation numbers from overlapping. Each truncated
++generation number is an index to an array of per-type and per-zone
++lists ``lrugen->lists``.
++
++Each generation is divided into multiple tiers. Tiers represent
++different ranges of numbers of accesses from file descriptors only.
++Pages accessed ``N`` times via file descriptors belong to tier
++``order_base_2(N)``. Each generation contains at most
++``CONFIG_TIERS_PER_GEN`` tiers, and they require additional
++``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast to
++moving between generations which requires list operations, moving
++between tiers only involves operations on ``page->flags`` and
++therefore has a negligible cost. A feedback loop modeled after the PID
++controller monitors refaulted % across all tiers and decides when to
++protect pages from which tiers.
++
++The framework comprises two conceptually independent components: the
++aging and the eviction, which can be invoked separately from user
++space for the purpose of working set estimation and proactive reclaim.
++
++Aging
++-----
++The aging produces young generations. Given an ``lruvec``, the aging
++traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()``
++to scan PTEs for accessed pages (a ``mm_struct`` list is maintained
++for each ``memcg``). Upon finding one, the aging updates its
++generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``).
++After each round of traversal, the aging increments ``max_seq``. The
++aging is due when ``min_seq[]`` reaches ``max_seq-1``.
++
++Eviction
++--------
++The eviction consumes old generations. Given an ``lruvec``, the
++eviction scans pages on the per-zone lists indexed by anon and file
++``min_seq[]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to
++select a type based on the values of ``min_seq[]``. If they are
++equal, it selects the type that has a lower refaulted %. The eviction
++sorts a page according to its updated generation number if the aging
++has found this page accessed. It also moves a page to the next
++generation if this page is from an upper tier that has a higher
++refaulted % than the base tier. The eviction increments ``min_seq[]``
++of a selected type when it finds all the per-zone lists indexed by
++``min_seq[]`` of this selected type are empty.
++
++To-do List
++==========
++KVM Optimization
++----------------
++Support shadow page table walk.
--- /dev/null
+From 14aa8b2d5c2ebead01b542f62d68029023054774 Mon Sep 17 00:00:00 2001
+Date: Wed, 28 Sep 2022 13:36:58 -0600
+Subject: [PATCH 1/1] mm/mglru: don't sync disk for each aging cycle
+
+wakeup_flusher_threads() was added under the assumption that if a system
+runs out of clean cold pages, it might want to write back dirty pages more
+aggressively so that they can become clean and be dropped.
+
+However, doing so can breach the rate limit a system wants to impose on
+writeback, resulting in early SSD wearout.
+
+Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
+---
+ mm/vmscan.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -4072,8 +4072,6 @@ static bool try_to_inc_max_seq(struct lr
+ if (wq_has_sleeper(&lruvec->mm_walk.wait))
+ wake_up_all(&lruvec->mm_walk.wait);
+
+- wakeup_flusher_threads(WB_REASON_VMSCAN);
+-
+ return true;
+ }
+
+++ /dev/null
-From a8e6015d9534f39abc08e6804566af059e498a60 Mon Sep 17 00:00:00 2001
-Date: Wed, 4 Aug 2021 01:31:34 -0600
-Subject: [PATCH 01/10] mm: x86, arm64: add arch_has_hw_pte_young()
-
-Some architectures automatically set the accessed bit in PTEs, e.g.,
-x86 and arm64 v8.2. On architectures that do not have this capability,
-clearing the accessed bit in a PTE triggers a page fault following the
-TLB miss of this PTE.
-
-Being aware of this capability can help make better decisions, i.e.,
-whether to limit the size of each batch of PTEs and the burst of
-batches when clearing the accessed bit.
-
-Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
----
- arch/arm64/include/asm/cpufeature.h | 5 +++++
- arch/arm64/include/asm/pgtable.h | 13 ++++++++-----
- arch/arm64/kernel/cpufeature.c | 10 ++++++++++
- arch/arm64/tools/cpucaps | 1 +
- arch/x86/include/asm/pgtable.h | 6 +++---
- include/linux/pgtable.h | 13 +++++++++++++
- mm/memory.c | 14 +-------------
- 7 files changed, 41 insertions(+), 21 deletions(-)
-
---- a/arch/arm64/include/asm/cpufeature.h
-+++ b/arch/arm64/include/asm/cpufeature.h
-@@ -808,6 +808,11 @@ static inline bool system_supports_tlb_r
- cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
- }
-
-+static inline bool system_has_hw_af(void)
-+{
-+ return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF);
-+}
-+
- extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
-
- static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
---- a/arch/arm64/include/asm/pgtable.h
-+++ b/arch/arm64/include/asm/pgtable.h
-@@ -999,13 +999,16 @@ static inline void update_mmu_cache(stru
- * page after fork() + CoW for pfn mappings. We don't always have a
- * hardware-managed access flag on arm64.
- */
--static inline bool arch_faults_on_old_pte(void)
-+static inline bool arch_has_hw_pte_young(bool local)
- {
-- WARN_ON(preemptible());
-+ if (local) {
-+ WARN_ON(preemptible());
-+ return cpu_has_hw_af();
-+ }
-
-- return !cpu_has_hw_af();
-+ return system_has_hw_af();
- }
--#define arch_faults_on_old_pte arch_faults_on_old_pte
-+#define arch_has_hw_pte_young arch_has_hw_pte_young
-
- /*
- * Experimentally, it's cheap to set the access flag in hardware and we
-@@ -1013,7 +1016,7 @@ static inline bool arch_faults_on_old_pt
- */
- static inline bool arch_wants_old_prefaulted_pte(void)
- {
-- return !arch_faults_on_old_pte();
-+ return arch_has_hw_pte_young(true);
- }
- #define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte
-
---- a/arch/arm64/kernel/cpufeature.c
-+++ b/arch/arm64/kernel/cpufeature.c
-@@ -2187,6 +2187,16 @@ static const struct arm64_cpu_capabiliti
- .matches = has_hw_dbm,
- .cpu_enable = cpu_enable_hw_dbm,
- },
-+ {
-+ .desc = "Hardware update of the Access flag",
-+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
-+ .capability = ARM64_HW_AF,
-+ .sys_reg = SYS_ID_AA64MMFR1_EL1,
-+ .sign = FTR_UNSIGNED,
-+ .field_pos = ID_AA64MMFR1_HADBS_SHIFT,
-+ .min_field_value = 1,
-+ .matches = has_cpuid_feature,
-+ },
- #endif
- {
- .desc = "CRC32 instructions",
---- a/arch/arm64/tools/cpucaps
-+++ b/arch/arm64/tools/cpucaps
-@@ -35,6 +35,7 @@ HAS_STAGE2_FWB
- HAS_SYSREG_GIC_CPUIF
- HAS_TLB_RANGE
- HAS_VIRT_HOST_EXTN
-+HW_AF
- HW_DBM
- KVM_PROTECTED_MODE
- MISMATCHED_CACHE_TYPE
---- a/arch/x86/include/asm/pgtable.h
-+++ b/arch/x86/include/asm/pgtable.h
-@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c
- return boot_cpu_has_bug(X86_BUG_L1TF);
- }
-
--#define arch_faults_on_old_pte arch_faults_on_old_pte
--static inline bool arch_faults_on_old_pte(void)
-+#define arch_has_hw_pte_young arch_has_hw_pte_young
-+static inline bool arch_has_hw_pte_young(bool local)
- {
-- return false;
-+ return true;
- }
-
- #endif /* __ASSEMBLY__ */
---- a/include/linux/pgtable.h
-+++ b/include/linux/pgtable.h
-@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young
- #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
- #endif
-
-+#ifndef arch_has_hw_pte_young
-+/*
-+ * Return whether the accessed bit is supported by the local CPU or all CPUs.
-+ *
-+ * Those arches which have hw access flag feature need to implement their own
-+ * helper. By default, "false" means pagefault will be hit on old pte.
-+ */
-+static inline bool arch_has_hw_pte_young(bool local)
-+{
-+ return false;
-+}
-+#endif
-+
- #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
- static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
- unsigned long address,
---- a/mm/memory.c
-+++ b/mm/memory.c
-@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly =
- 2;
- #endif
-
--#ifndef arch_faults_on_old_pte
--static inline bool arch_faults_on_old_pte(void)
--{
-- /*
-- * Those arches which don't have hw access flag feature need to
-- * implement their own helper. By default, "true" means pagefault
-- * will be hit on old pte.
-- */
-- return true;
--}
--#endif
--
- #ifndef arch_wants_old_prefaulted_pte
- static inline bool arch_wants_old_prefaulted_pte(void)
- {
-@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct
- * On architectures with software "accessed" bits, we would
- * take a double page fault, so mark it accessed here.
- */
-- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
-+ if (!arch_has_hw_pte_young(true) && !pte_young(vmf->orig_pte)) {
- pte_t entry;
-
- vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+++ /dev/null
-From f8b663bbfa30af5515e222fd74df20ea4e8393a2 Mon Sep 17 00:00:00 2001
-Date: Sat, 26 Sep 2020 21:17:18 -0600
-Subject: [PATCH 02/10] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
-
-Some architectures support the accessed bit on non-leaf PMD entries,
-e.g., x86_64 sets the accessed bit on a non-leaf PMD entry when using
-it as part of linear address translation [1]. As an optimization, page
-table walkers who are interested in the accessed bit can skip the PTEs
-under a non-leaf PMD entry if the accessed bit is cleared on this PMD
-entry.
-
-Although an inline function may be preferable, this capability is
-added as a configuration option to look consistent when used with the
-existing macros.
-
-[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
- Volume 3 (June 2021), section 4.8
-
-Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8
----
- arch/Kconfig | 9 +++++++++
- arch/x86/Kconfig | 1 +
- arch/x86/include/asm/pgtable.h | 3 ++-
- arch/x86/mm/pgtable.c | 5 ++++-
- include/linux/pgtable.h | 4 ++--
- 5 files changed, 18 insertions(+), 4 deletions(-)
-
---- a/arch/Kconfig
-+++ b/arch/Kconfig
-@@ -1295,6 +1295,15 @@ config ARCH_HAS_ELFCORE_COMPAT
- config ARCH_HAS_PARANOID_L1D_FLUSH
- bool
-
-+config ARCH_HAS_NONLEAF_PMD_YOUNG
-+ bool
-+ depends on PGTABLE_LEVELS > 2
-+ help
-+ Architectures that select this are able to set the accessed bit on
-+ non-leaf PMD entries in addition to leaf PTE entries where pages are
-+ mapped. For them, page table walkers that clear the accessed bit may
-+ stop at non-leaf PMD entries if they do not see the accessed bit.
-+
- source "kernel/gcov/Kconfig"
-
- source "scripts/gcc-plugins/Kconfig"
---- a/arch/x86/Kconfig
-+++ b/arch/x86/Kconfig
-@@ -84,6 +84,7 @@ config X86
- select ARCH_HAS_PMEM_API if X86_64
- select ARCH_HAS_PTE_DEVMAP if X86_64
- select ARCH_HAS_PTE_SPECIAL
-+ select ARCH_HAS_NONLEAF_PMD_YOUNG if X86_64
- select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
- select ARCH_HAS_COPY_MC if X86_64
- select ARCH_HAS_SET_MEMORY
---- a/arch/x86/include/asm/pgtable.h
-+++ b/arch/x86/include/asm/pgtable.h
-@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vad
-
- static inline int pmd_bad(pmd_t pmd)
- {
-- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
-+ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
-+ (_KERNPG_TABLE & ~_PAGE_ACCESSED);
- }
-
- static inline unsigned long pages_to_mb(unsigned long npg)
---- a/arch/x86/mm/pgtable.c
-+++ b/arch/x86/mm/pgtable.c
-@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_
- return ret;
- }
-
--#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
- int pmdp_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp)
- {
-@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_
-
- return ret;
- }
-+#endif
-+
-+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- int pudp_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long addr, pud_t *pudp)
- {
---- a/include/linux/pgtable.h
-+++ b/include/linux/pgtable.h
-@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_yo
- #endif
-
- #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
--#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
- static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long address,
- pmd_t *pmdp)
-@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_yo
- BUILD_BUG();
- return 0;
- }
--#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
- #endif
-
- #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+++ /dev/null
-From a810f8e2f1bdd0707eaf05c8b4ba84a3ff2801bd Mon Sep 17 00:00:00 2001
-Date: Sun, 27 Sep 2020 20:49:08 -0600
-Subject: [PATCH 03/10] mm/vmscan.c: refactor shrink_node()
-
-This patch refactors shrink_node(). This will make the upcoming
-changes to mm/vmscan.c more readable.
-
-Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
----
- mm/vmscan.c | 186 +++++++++++++++++++++++++++-------------------------
- 1 file changed, 98 insertions(+), 88 deletions(-)
-
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -2562,6 +2562,103 @@ enum scan_balance {
- SCAN_FILE,
- };
-
-+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
-+{
-+ unsigned long file;
-+ struct lruvec *target_lruvec;
-+
-+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
-+
-+ /*
-+ * Determine the scan balance between anon and file LRUs.
-+ */
-+ spin_lock_irq(&target_lruvec->lru_lock);
-+ sc->anon_cost = target_lruvec->anon_cost;
-+ sc->file_cost = target_lruvec->file_cost;
-+ spin_unlock_irq(&target_lruvec->lru_lock);
-+
-+ /*
-+ * Target desirable inactive:active list ratios for the anon
-+ * and file LRU lists.
-+ */
-+ if (!sc->force_deactivate) {
-+ unsigned long refaults;
-+
-+ refaults = lruvec_page_state(target_lruvec,
-+ WORKINGSET_ACTIVATE_ANON);
-+ if (refaults != target_lruvec->refaults[0] ||
-+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
-+ sc->may_deactivate |= DEACTIVATE_ANON;
-+ else
-+ sc->may_deactivate &= ~DEACTIVATE_ANON;
-+
-+ /*
-+ * When refaults are being observed, it means a new
-+ * workingset is being established. Deactivate to get
-+ * rid of any stale active pages quickly.
-+ */
-+ refaults = lruvec_page_state(target_lruvec,
-+ WORKINGSET_ACTIVATE_FILE);
-+ if (refaults != target_lruvec->refaults[1] ||
-+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
-+ sc->may_deactivate |= DEACTIVATE_FILE;
-+ else
-+ sc->may_deactivate &= ~DEACTIVATE_FILE;
-+ } else
-+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-+
-+ /*
-+ * If we have plenty of inactive file pages that aren't
-+ * thrashing, try to reclaim those first before touching
-+ * anonymous pages.
-+ */
-+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
-+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
-+ sc->cache_trim_mode = 1;
-+ else
-+ sc->cache_trim_mode = 0;
-+
-+ /*
-+ * Prevent the reclaimer from falling into the cache trap: as
-+ * cache pages start out inactive, every cache fault will tip
-+ * the scan balance towards the file LRU. And as the file LRU
-+ * shrinks, so does the window for rotation from references.
-+ * This means we have a runaway feedback loop where a tiny
-+ * thrashing file LRU becomes infinitely more attractive than
-+ * anon pages. Try to detect this based on file LRU size.
-+ */
-+ if (!cgroup_reclaim(sc)) {
-+ unsigned long total_high_wmark = 0;
-+ unsigned long free, anon;
-+ int z;
-+
-+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
-+ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
-+ node_page_state(pgdat, NR_INACTIVE_FILE);
-+
-+ for (z = 0; z < MAX_NR_ZONES; z++) {
-+ struct zone *zone = &pgdat->node_zones[z];
-+
-+ if (!managed_zone(zone))
-+ continue;
-+
-+ total_high_wmark += high_wmark_pages(zone);
-+ }
-+
-+ /*
-+ * Consider anon: if that's low too, this isn't a
-+ * runaway file reclaim problem, but rather just
-+ * extreme pressure. Reclaim as per usual then.
-+ */
-+ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-+
-+ sc->file_is_tiny =
-+ file + free <= total_high_wmark &&
-+ !(sc->may_deactivate & DEACTIVATE_ANON) &&
-+ anon >> sc->priority;
-+ }
-+}
-+
- /*
- * Determine how aggressively the anon and file LRU lists should be
- * scanned. The relative value of each set of LRU lists is determined
-@@ -3032,7 +3129,6 @@ static void shrink_node(pg_data_t *pgdat
- unsigned long nr_reclaimed, nr_scanned;
- struct lruvec *target_lruvec;
- bool reclaimable = false;
-- unsigned long file;
-
- target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
-
-@@ -3048,93 +3144,7 @@ again:
- nr_reclaimed = sc->nr_reclaimed;
- nr_scanned = sc->nr_scanned;
-
-- /*
-- * Determine the scan balance between anon and file LRUs.
-- */
-- spin_lock_irq(&target_lruvec->lru_lock);
-- sc->anon_cost = target_lruvec->anon_cost;
-- sc->file_cost = target_lruvec->file_cost;
-- spin_unlock_irq(&target_lruvec->lru_lock);
--
-- /*
-- * Target desirable inactive:active list ratios for the anon
-- * and file LRU lists.
-- */
-- if (!sc->force_deactivate) {
-- unsigned long refaults;
--
-- refaults = lruvec_page_state(target_lruvec,
-- WORKINGSET_ACTIVATE_ANON);
-- if (refaults != target_lruvec->refaults[0] ||
-- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
-- sc->may_deactivate |= DEACTIVATE_ANON;
-- else
-- sc->may_deactivate &= ~DEACTIVATE_ANON;
--
-- /*
-- * When refaults are being observed, it means a new
-- * workingset is being established. Deactivate to get
-- * rid of any stale active pages quickly.
-- */
-- refaults = lruvec_page_state(target_lruvec,
-- WORKINGSET_ACTIVATE_FILE);
-- if (refaults != target_lruvec->refaults[1] ||
-- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
-- sc->may_deactivate |= DEACTIVATE_FILE;
-- else
-- sc->may_deactivate &= ~DEACTIVATE_FILE;
-- } else
-- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
--
-- /*
-- * If we have plenty of inactive file pages that aren't
-- * thrashing, try to reclaim those first before touching
-- * anonymous pages.
-- */
-- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
-- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
-- sc->cache_trim_mode = 1;
-- else
-- sc->cache_trim_mode = 0;
--
-- /*
-- * Prevent the reclaimer from falling into the cache trap: as
-- * cache pages start out inactive, every cache fault will tip
-- * the scan balance towards the file LRU. And as the file LRU
-- * shrinks, so does the window for rotation from references.
-- * This means we have a runaway feedback loop where a tiny
-- * thrashing file LRU becomes infinitely more attractive than
-- * anon pages. Try to detect this based on file LRU size.
-- */
-- if (!cgroup_reclaim(sc)) {
-- unsigned long total_high_wmark = 0;
-- unsigned long free, anon;
-- int z;
--
-- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
-- file = node_page_state(pgdat, NR_ACTIVE_FILE) +
-- node_page_state(pgdat, NR_INACTIVE_FILE);
--
-- for (z = 0; z < MAX_NR_ZONES; z++) {
-- struct zone *zone = &pgdat->node_zones[z];
-- if (!managed_zone(zone))
-- continue;
--
-- total_high_wmark += high_wmark_pages(zone);
-- }
--
-- /*
-- * Consider anon: if that's low too, this isn't a
-- * runaway file reclaim problem, but rather just
-- * extreme pressure. Reclaim as per usual then.
-- */
-- anon = node_page_state(pgdat, NR_INACTIVE_ANON);
--
-- sc->file_is_tiny =
-- file + free <= total_high_wmark &&
-- !(sc->may_deactivate & DEACTIVATE_ANON) &&
-- anon >> sc->priority;
-- }
-+ prepare_scan_count(pgdat, sc);
-
- shrink_node_memcgs(pgdat, sc);
-
+++ /dev/null
-From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001
-Date: Mon, 25 Jan 2021 21:12:33 -0700
-Subject: [PATCH 04/10] mm: multigenerational lru: groundwork
-
-For each lruvec, evictable pages are divided into multiple
-generations. The youngest generation number is stored in
-lrugen->max_seq for both anon and file types as they are aged on an
-equal footing. The oldest generation numbers are stored in
-lrugen->min_seq[] separately for anon and file types as clean file
-pages can be evicted regardless of swap constraints. These three
-variables are monotonically increasing. Generation numbers are
-truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into
-page->flags. The sliding window technique is used to prevent truncated
-generation numbers from overlapping. Each truncated generation number
-is an index to
-lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES].
-
-The framework comprises two conceptually independent components: the
-aging, which produces young generations, and the eviction, which
-consumes old generations. Both can be invoked independently from user
-space for the purpose of working set estimation and proactive reclaim.
-
-The protection of hot pages and the selection of cold pages are based
-on page access types and patterns. There are two access types: one via
-page tables and the other via file descriptors. The protection of the
-former type is by design stronger because:
- 1) The uncertainty in determining the access patterns of the former
- type is higher due to the coalesced nature of the accessed bit.
- 2) The cost of evicting the former type is higher due to the TLB
- flushes required and the likelihood of involving I/O.
- 3) The penalty of under-protecting the former type is higher because
- applications usually do not prepare themselves for major faults like
- they do for blocked I/O. For example, client applications commonly
- dedicate blocked I/O to separate threads to avoid UI janks that
- negatively affect user experience.
-
-There are also two access patterns: one with temporal locality and the
-other without. The latter pattern, e.g., random and sequential, needs
-to be explicitly excluded to avoid weakening the protection of the
-former pattern. Generally the former type follows the former pattern
-unless MADV_SEQUENTIAL is specified and the latter type follows the
-latter pattern unless outlying refaults have been observed.
-
-Upon faulting, a page is added to the youngest generation, which
-provides the strongest protection as the eviction will not consider
-this page before the aging has scanned it at least twice. The first
-scan clears the accessed bit set during the initial fault. And the
-second scan makes sure this page has not been used since the first
-scan. A page from any other generations is brought back to the
-youngest generation whenever the aging finds the accessed bit set on
-any of the PTEs mapping this page.
-
-Unmapped pages are initially added to the oldest generation and then
-conditionally protected by tiers. This is done later [PATCH 07/10].
-
-Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
----
- fs/fuse/dev.c | 3 +-
- include/linux/cgroup.h | 15 +-
- include/linux/mm.h | 36 ++++
- include/linux/mm_inline.h | 182 ++++++++++++++++++++
- include/linux/mmzone.h | 70 ++++++++
- include/linux/page-flags-layout.h | 19 ++-
- include/linux/page-flags.h | 4 +-
- include/linux/sched.h | 3 +
- kernel/bounds.c | 3 +
- kernel/cgroup/cgroup-internal.h | 1 -
- mm/huge_memory.c | 3 +-
- mm/memcontrol.c | 1 +
- mm/memory.c | 7 +
- mm/mm_init.c | 6 +-
- mm/page_alloc.c | 1 +
- mm/swap.c | 9 +-
- mm/swapfile.c | 2 +
- mm/vmscan.c | 268 ++++++++++++++++++++++++++++++
- 18 files changed, 618 insertions(+), 15 deletions(-)
-
---- a/fs/fuse/dev.c
-+++ b/fs/fuse/dev.c
-@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
- 1 << PG_active |
- 1 << PG_workingset |
- 1 << PG_reclaim |
-- 1 << PG_waiters))) {
-+ 1 << PG_waiters |
-+ LRU_GEN_MASK | LRU_REFS_MASK))) {
- dump_page(page, "fuse: trying to steal weird page");
- return 1;
- }
---- a/include/linux/cgroup.h
-+++ b/include/linux/cgroup.h
-@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr
- css_put(&cgrp->self);
- }
-
-+extern struct mutex cgroup_mutex;
-+
-+static inline void cgroup_lock(void)
-+{
-+ mutex_lock(&cgroup_mutex);
-+}
-+
-+static inline void cgroup_unlock(void)
-+{
-+ mutex_unlock(&cgroup_mutex);
-+}
-+
- /**
- * task_css_set_check - obtain a task's css_set with extra access conditions
- * @task: the task to obtain css_set for
-@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr
- * as locks used during the cgroup_subsys::attach() methods.
- */
- #ifdef CONFIG_PROVE_RCU
--extern struct mutex cgroup_mutex;
- extern spinlock_t css_set_lock;
- #define task_css_set_check(task, __c) \
- rcu_dereference_check((task)->cgroups, \
-@@ -707,6 +718,8 @@ struct cgroup;
- static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
- static inline void css_get(struct cgroup_subsys_state *css) {}
- static inline void css_put(struct cgroup_subsys_state *css) {}
-+static inline void cgroup_lock(void) {}
-+static inline void cgroup_unlock(void) {}
- static inline int cgroup_attach_task_all(struct task_struct *from,
- struct task_struct *t) { return 0; }
- static inline int cgroupstats_build(struct cgroupstats *stats,
---- a/include/linux/mm.h
-+++ b/include/linux/mm.h
-@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
- #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
- #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
- #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
-+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
-+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
-
- /*
- * Define the bit shifts to access each section. For non-existent
-@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s
- loff_t const holebegin, loff_t const holelen, int even_cows) { }
- #endif
-
-+#ifdef CONFIG_LRU_GEN
-+static inline void task_enter_nonseq_fault(void)
-+{
-+ WARN_ON(current->in_nonseq_fault);
-+
-+ current->in_nonseq_fault = 1;
-+}
-+
-+static inline void task_exit_nonseq_fault(void)
-+{
-+ WARN_ON(!current->in_nonseq_fault);
-+
-+ current->in_nonseq_fault = 0;
-+}
-+
-+static inline bool task_in_nonseq_fault(void)
-+{
-+ return current->in_nonseq_fault;
-+}
-+#else
-+static inline void task_enter_nonseq_fault(void)
-+{
-+}
-+
-+static inline void task_exit_nonseq_fault(void)
-+{
-+}
-+
-+static inline bool task_in_nonseq_fault(void)
-+{
-+ return false;
-+}
-+#endif /* CONFIG_LRU_GEN */
-+
- static inline void unmap_shared_mapping_range(struct address_space *mapping,
- loff_t const holebegin, loff_t const holelen)
- {
---- a/include/linux/mm_inline.h
-+++ b/include/linux/mm_inline.h
-@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag
- return lru;
- }
-
-+#ifdef CONFIG_LRU_GEN
-+
-+static inline bool lru_gen_enabled(void)
-+{
-+#ifdef CONFIG_LRU_GEN_ENABLED
-+ DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
-+
-+ return static_branch_likely(&lru_gen_static_key);
-+#else
-+ DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
-+
-+ return static_branch_unlikely(&lru_gen_static_key);
-+#endif
-+}
-+
-+/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */
-+static inline int lru_gen_from_seq(unsigned long seq)
-+{
-+ return seq % MAX_NR_GENS;
-+}
-+
-+/* The youngest and the second youngest generations are counted as active. */
-+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
-+{
-+ unsigned long max_seq = lruvec->evictable.max_seq;
-+
-+ VM_BUG_ON(gen >= MAX_NR_GENS);
-+
-+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
-+}
-+
-+/* Update the sizes of the multigenerational lru lists. */
-+static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
-+ int old_gen, int new_gen)
-+{
-+ int type = page_is_file_lru(page);
-+ int zone = page_zonenum(page);
-+ int delta = thp_nr_pages(page);
-+ enum lru_list lru = type * LRU_FILE;
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ lockdep_assert_held(&lruvec->lru_lock);
-+ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
-+ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
-+ VM_BUG_ON(old_gen == -1 && new_gen == -1);
-+
-+ if (old_gen >= 0)
-+ WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
-+ lrugen->sizes[old_gen][type][zone] - delta);
-+ if (new_gen >= 0)
-+ WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
-+ lrugen->sizes[new_gen][type][zone] + delta);
-+
-+ if (old_gen < 0) {
-+ if (lru_gen_is_active(lruvec, new_gen))
-+ lru += LRU_ACTIVE;
-+ update_lru_size(lruvec, lru, zone, delta);
-+ return;
-+ }
-+
-+ if (new_gen < 0) {
-+ if (lru_gen_is_active(lruvec, old_gen))
-+ lru += LRU_ACTIVE;
-+ update_lru_size(lruvec, lru, zone, -delta);
-+ return;
-+ }
-+
-+ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
-+ update_lru_size(lruvec, lru, zone, -delta);
-+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
-+ }
-+
-+ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
-+}
-+
-+/* Add a page to one of the multigenerational lru lists. Return true on success. */
-+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
-+{
-+ int gen;
-+ unsigned long old_flags, new_flags;
-+ int type = page_is_file_lru(page);
-+ int zone = page_zonenum(page);
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ if (PageUnevictable(page) || !lrugen->enabled[type])
-+ return false;
-+ /*
-+ * If a page shouldn't be considered for eviction, i.e., a page mapped
-+ * upon fault during which the accessed bit is set, add it to the
-+ * youngest generation.
-+ *
-+ * If a page can't be evicted immediately, i.e., an anon page not in
-+ * swap cache or a dirty page pending writeback, add it to the second
-+ * oldest generation.
-+ *
-+ * If a page could be evicted immediately, e.g., a clean page, add it to
-+ * the oldest generation.
-+ */
-+ if (PageActive(page))
-+ gen = lru_gen_from_seq(lrugen->max_seq);
-+ else if ((!type && !PageSwapCache(page)) ||
-+ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
-+ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
-+ else
-+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
-+
-+ do {
-+ new_flags = old_flags = READ_ONCE(page->flags);
-+ VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);
-+
-+ new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
-+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
-+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
-+
-+ lru_gen_update_size(page, lruvec, -1, gen);
-+ /* for rotate_reclaimable_page() */
-+ if (reclaiming)
-+ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
-+ else
-+ list_add(&page->lru, &lrugen->lists[gen][type][zone]);
-+
-+ return true;
-+}
-+
-+/* Delete a page from one of the multigenerational lru lists. Return true on success. */
-+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
-+{
-+ int gen;
-+ unsigned long old_flags, new_flags;
-+
-+ do {
-+ new_flags = old_flags = READ_ONCE(page->flags);
-+ if (!(new_flags & LRU_GEN_MASK))
-+ return false;
-+
-+ VM_BUG_ON_PAGE(PageActive(page), page);
-+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
-+
-+ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
-+
-+ new_flags &= ~LRU_GEN_MASK;
-+ /* for shrink_page_list() */
-+ if (reclaiming)
-+ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
-+ else if (lru_gen_is_active(lruvec, gen))
-+ new_flags |= BIT(PG_active);
-+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
-+
-+ lru_gen_update_size(page, lruvec, gen, -1);
-+ list_del(&page->lru);
-+
-+ return true;
-+}
-+
-+#else
-+
-+static inline bool lru_gen_enabled(void)
-+{
-+ return false;
-+}
-+
-+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
-+{
-+ return false;
-+}
-+
-+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
-+{
-+ return false;
-+}
-+
-+#endif /* CONFIG_LRU_GEN */
-+
- static __always_inline void add_page_to_lru_list(struct page *page,
- struct lruvec *lruvec)
- {
- enum lru_list lru = page_lru(page);
-
-+ if (lru_gen_add_page(page, lruvec, false))
-+ return;
-+
- update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
- list_add(&page->lru, &lruvec->lists[lru]);
- }
-@@ -93,6 +269,9 @@ static __always_inline void add_page_to_
- {
- enum lru_list lru = page_lru(page);
-
-+ if (lru_gen_add_page(page, lruvec, true))
-+ return;
-+
- update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
- list_add_tail(&page->lru, &lruvec->lists[lru]);
- }
-@@ -100,6 +279,9 @@ static __always_inline void add_page_to_
- static __always_inline void del_page_from_lru_list(struct page *page,
- struct lruvec *lruvec)
- {
-+ if (lru_gen_del_page(page, lruvec, false))
-+ return;
-+
- list_del(&page->lru);
- update_lru_size(lruvec, page_lru(page), page_zonenum(page),
- -thp_nr_pages(page));
---- a/include/linux/mmzone.h
-+++ b/include/linux/mmzone.h
-@@ -294,6 +294,72 @@ enum lruvec_flags {
- */
- };
-
-+struct lruvec;
-+
-+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
-+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
-+
-+#ifdef CONFIG_LRU_GEN
-+
-+/*
-+ * For each lruvec, evictable pages are divided into multiple generations. The
-+ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are
-+ * monotonically increasing. The sliding window technique is used to track at
-+ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the
-+ * window, AKA gen, indexes an array of per-type and per-zone lists for the
-+ * corresponding generation. The counter in page->flags stores gen+1 while a
-+ * page is on one of the multigenerational lru lists. Otherwise, it stores 0.
-+ *
-+ * After a page is faulted in, the aging must check the accessed bit at least
-+ * twice before the eviction would consider it. The first check clears the
-+ * accessed bit set during the initial fault. The second check makes sure this
-+ * page hasn't been used since then.
-+ */
-+#define MIN_NR_GENS 2
-+#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
-+
-+struct lrugen {
-+ /* the aging increments the max generation number */
-+ unsigned long max_seq;
-+ /* the eviction increments the min generation numbers */
-+ unsigned long min_seq[ANON_AND_FILE];
-+ /* the birth time of each generation in jiffies */
-+ unsigned long timestamps[MAX_NR_GENS];
-+ /* the multigenerational lru lists */
-+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
-+ /* the sizes of the multigenerational lru lists in pages */
-+ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
-+ /* whether the multigenerational lru is enabled */
-+ bool enabled[ANON_AND_FILE];
-+};
-+
-+#define MAX_BATCH_SIZE 8192
-+
-+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
-+void lru_gen_change_state(bool enable, bool main, bool swap);
-+
-+#ifdef CONFIG_MEMCG
-+void lru_gen_init_memcg(struct mem_cgroup *memcg);
-+#endif
-+
-+#else /* !CONFIG_LRU_GEN */
-+
-+static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
-+{
-+}
-+
-+static inline void lru_gen_change_state(bool enable, bool main, bool swap)
-+{
-+}
-+
-+#ifdef CONFIG_MEMCG
-+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
-+{
-+}
-+#endif
-+
-+#endif /* CONFIG_LRU_GEN */
-+
- struct lruvec {
- struct list_head lists[NR_LRU_LISTS];
- /* per lruvec lru_lock for memcg */
-@@ -311,6 +377,10 @@ struct lruvec {
- unsigned long refaults[ANON_AND_FILE];
- /* Various lruvec state flags (enum lruvec_flags) */
- unsigned long flags;
-+#ifdef CONFIG_LRU_GEN
-+ /* unevictable pages are on LRU_UNEVICTABLE */
-+ struct lrugen evictable;
-+#endif
- #ifdef CONFIG_MEMCG
- struct pglist_data *pgdat;
- #endif
---- a/include/linux/page-flags-layout.h
-+++ b/include/linux/page-flags-layout.h
-@@ -26,6 +26,14 @@
-
- #define ZONES_WIDTH ZONES_SHIFT
-
-+#ifdef CONFIG_LRU_GEN
-+/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */
-+#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2)
-+#else
-+#define LRU_GEN_WIDTH 0
-+#define LRU_REFS_WIDTH 0
-+#endif /* CONFIG_LRU_GEN */
-+
- #ifdef CONFIG_SPARSEMEM
- #include <asm/sparsemem.h>
- #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
-@@ -55,7 +63,8 @@
- #define SECTIONS_WIDTH 0
- #endif
-
--#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
-+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
-+ <= BITS_PER_LONG - NR_PAGEFLAGS
- #define NODES_WIDTH NODES_SHIFT
- #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
- #error "Vmemmap: No space for nodes field in page flags"
-@@ -89,8 +98,8 @@
- #define LAST_CPUPID_SHIFT 0
- #endif
-
--#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
-- <= BITS_PER_LONG - NR_PAGEFLAGS
-+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
-+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
- #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
- #else
- #define LAST_CPUPID_WIDTH 0
-@@ -100,8 +109,8 @@
- #define LAST_CPUPID_NOT_IN_PAGE_FLAGS
- #endif
-
--#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
-- > BITS_PER_LONG - NR_PAGEFLAGS
-+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
-+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
- #error "Not enough bits in page flags"
- #endif
-
---- a/include/linux/page-flags.h
-+++ b/include/linux/page-flags.h
-@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
- 1UL << PG_private | 1UL << PG_private_2 | \
- 1UL << PG_writeback | 1UL << PG_reserved | \
- 1UL << PG_slab | 1UL << PG_active | \
-- 1UL << PG_unevictable | __PG_MLOCKED)
-+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
-
- /*
- * Flags checked when a page is prepped for return by the page allocator.
-@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
- * alloc-free cycle to prevent from reusing the page.
- */
- #define PAGE_FLAGS_CHECK_AT_PREP \
-- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
-+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
-
- #define PAGE_FLAGS_PRIVATE \
- (1UL << PG_private | 1UL << PG_private_2)
---- a/include/linux/sched.h
-+++ b/include/linux/sched.h
-@@ -911,6 +911,9 @@ struct task_struct {
- #ifdef CONFIG_MEMCG
- unsigned in_user_fault:1;
- #endif
-+#ifdef CONFIG_LRU_GEN
-+ unsigned in_nonseq_fault:1;
-+#endif
- #ifdef CONFIG_COMPAT_BRK
- unsigned brk_randomized:1;
- #endif
---- a/kernel/bounds.c
-+++ b/kernel/bounds.c
-@@ -22,6 +22,9 @@ int main(void)
- DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
- #endif
- DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
-+#ifdef CONFIG_LRU_GEN
-+ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1));
-+#endif
- /* End of constants */
-
- return 0;
---- a/kernel/cgroup/cgroup-internal.h
-+++ b/kernel/cgroup/cgroup-internal.h
-@@ -165,7 +165,6 @@ struct cgroup_mgctx {
- #define DEFINE_CGROUP_MGCTX(name) \
- struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-
--extern struct mutex cgroup_mutex;
- extern spinlock_t css_set_lock;
- extern struct cgroup_subsys *cgroup_subsys[];
- extern struct list_head cgroup_roots;
---- a/mm/huge_memory.c
-+++ b/mm/huge_memory.c
-@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc
- #ifdef CONFIG_64BIT
- (1L << PG_arch_2) |
- #endif
-- (1L << PG_dirty)));
-+ (1L << PG_dirty) |
-+ LRU_GEN_MASK | LRU_REFS_MASK));
-
- /* ->mapping in first tail page is compound_mapcount */
- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
---- a/mm/memcontrol.c
-+++ b/mm/memcontrol.c
-@@ -5226,6 +5226,7 @@ static struct mem_cgroup *mem_cgroup_all
- memcg->deferred_split_queue.split_queue_len = 0;
- #endif
- idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
-+ lru_gen_init_memcg(memcg);
- return memcg;
- fail:
- mem_cgroup_id_remove(memcg);
---- a/mm/memory.c
-+++ b/mm/memory.c
-@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are
- unsigned int flags, struct pt_regs *regs)
- {
- vm_fault_t ret;
-+ bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ);
-
- __set_current_state(TASK_RUNNING);
-
-@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are
- if (flags & FAULT_FLAG_USER)
- mem_cgroup_enter_user_fault();
-
-+ if (nonseq_fault)
-+ task_enter_nonseq_fault();
-+
- if (unlikely(is_vm_hugetlb_page(vma)))
- ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
- else
- ret = __handle_mm_fault(vma, address, flags);
-
-+ if (nonseq_fault)
-+ task_exit_nonseq_fault();
-+
- if (flags & FAULT_FLAG_USER) {
- mem_cgroup_exit_user_fault();
- /*
---- a/mm/mm_init.c
-+++ b/mm/mm_init.c
-@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
-
- shift = 8 * sizeof(unsigned long);
- width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
-- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
-+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
- mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
-- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
-+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
- SECTIONS_WIDTH,
- NODES_WIDTH,
- ZONES_WIDTH,
- LAST_CPUPID_WIDTH,
- KASAN_TAG_WIDTH,
-+ LRU_GEN_WIDTH,
-+ LRU_REFS_WIDTH,
- NR_PAGEFLAGS);
- mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
---- a/mm/page_alloc.c
-+++ b/mm/page_alloc.c
-@@ -7456,6 +7456,7 @@ static void __meminit pgdat_init_interna
-
- pgdat_page_ext_init(pgdat);
- lruvec_init(&pgdat->__lruvec);
-+ lru_gen_init_state(NULL, &pgdat->__lruvec);
- }
-
- static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
---- a/mm/swap.c
-+++ b/mm/swap.c
-@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
- VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
- VM_BUG_ON_PAGE(PageLRU(page), page);
-
-+ /* see the comment in lru_gen_add_page() */
-+ if (lru_gen_enabled() && !PageUnevictable(page) &&
-+ task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC))
-+ SetPageActive(page);
-+
- get_page(page);
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_add);
-@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
-
- static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
- {
-- if (PageActive(page) && !PageUnevictable(page)) {
-+ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
- int nr_pages = thp_nr_pages(page);
-
- del_page_from_lru_list(page, lruvec);
-@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p
- */
- void deactivate_page(struct page *page)
- {
-- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
-+ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
- struct pagevec *pvec;
-
- local_lock(&lru_pvecs.lock);
---- a/mm/swapfile.c
-+++ b/mm/swapfile.c
-@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
- err = 0;
- atomic_inc(&proc_poll_event);
- wake_up_interruptible(&proc_poll_wait);
-+ lru_gen_change_state(false, false, true);
-
- out_dput:
- filp_close(victim, NULL);
-@@ -3349,6 +3350,7 @@ SYSCALL_DEFINE2(swapon, const char __use
- mutex_unlock(&swapon_mutex);
- atomic_inc(&proc_poll_event);
- wake_up_interruptible(&proc_poll_wait);
-+ lru_gen_change_state(true, false, true);
-
- error = 0;
- goto out;
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -50,6 +50,7 @@
- #include <linux/printk.h>
- #include <linux/dax.h>
- #include <linux/psi.h>
-+#include <linux/memory.h>
-
- #include <asm/tlbflush.h>
- #include <asm/div64.h>
-@@ -2880,6 +2881,273 @@ static bool can_age_anon_pages(struct pg
- return can_demote(pgdat->node_id, sc);
- }
-
-+#ifdef CONFIG_LRU_GEN
-+
-+/******************************************************************************
-+ * shorthand helpers
-+ ******************************************************************************/
-+
-+#define for_each_gen_type_zone(gen, type, zone) \
-+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
-+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
-+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
-+
-+static int page_lru_gen(struct page *page)
-+{
-+ unsigned long flags = READ_ONCE(page->flags);
-+
-+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
-+}
-+
-+static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg)
-+{
-+ struct pglist_data *pgdat = NODE_DATA(nid);
-+
-+#ifdef CONFIG_MEMCG
-+ if (memcg) {
-+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
-+
-+ if (lruvec->pgdat != pgdat)
-+ lruvec->pgdat = pgdat;
-+
-+ return lruvec;
-+ }
-+#endif
-+ return pgdat ? &pgdat->__lruvec : NULL;
-+}
-+
-+static int get_nr_gens(struct lruvec *lruvec, int type)
-+{
-+ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1;
-+}
-+
-+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
-+{
-+ return get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
-+ get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) &&
-+ get_nr_gens(lruvec, 0) <= MAX_NR_GENS;
-+}
-+
-+/******************************************************************************
-+ * state change
-+ ******************************************************************************/
-+
-+#ifdef CONFIG_LRU_GEN_ENABLED
-+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
-+#else
-+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
-+#endif
-+
-+static int lru_gen_nr_swapfiles;
-+
-+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
-+{
-+ int gen, type, zone;
-+ enum lru_list lru;
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ for_each_evictable_lru(lru) {
-+ type = is_file_lru(lru);
-+
-+ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
-+ return false;
-+ }
-+
-+ for_each_gen_type_zone(gen, type, zone) {
-+ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
-+ return false;
-+
-+ /* unlikely but not a bug when reset_batch_size() is pending */
-+ VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
-+ }
-+
-+ return true;
-+}
-+
-+static bool fill_lists(struct lruvec *lruvec)
-+{
-+ enum lru_list lru;
-+ int remaining = MAX_BATCH_SIZE;
-+
-+ for_each_evictable_lru(lru) {
-+ int type = is_file_lru(lru);
-+ bool active = is_active_lru(lru);
-+ struct list_head *head = &lruvec->lists[lru];
-+
-+ if (!lruvec->evictable.enabled[type])
-+ continue;
-+
-+ while (!list_empty(head)) {
-+ bool success;
-+ struct page *page = lru_to_page(head);
-+
-+ VM_BUG_ON_PAGE(PageTail(page), page);
-+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
-+ VM_BUG_ON_PAGE(PageActive(page) != active, page);
-+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
-+ VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page);
-+
-+ prefetchw_prev_lru_page(page, head, flags);
-+
-+ del_page_from_lru_list(page, lruvec);
-+ success = lru_gen_add_page(page, lruvec, false);
-+ VM_BUG_ON(!success);
-+
-+ if (!--remaining)
-+ return false;
-+ }
-+ }
-+
-+ return true;
-+}
-+
-+static bool drain_lists(struct lruvec *lruvec)
-+{
-+ int gen, type, zone;
-+ int remaining = MAX_BATCH_SIZE;
-+
-+ for_each_gen_type_zone(gen, type, zone) {
-+ struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
-+
-+ if (lruvec->evictable.enabled[type])
-+ continue;
-+
-+ while (!list_empty(head)) {
-+ bool success;
-+ struct page *page = lru_to_page(head);
-+
-+ VM_BUG_ON_PAGE(PageTail(page), page);
-+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
-+ VM_BUG_ON_PAGE(PageActive(page), page);
-+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
-+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
-+
-+ prefetchw_prev_lru_page(page, head, flags);
-+
-+ success = lru_gen_del_page(page, lruvec, false);
-+ VM_BUG_ON(!success);
-+ add_page_to_lru_list(page, lruvec);
-+
-+ if (!--remaining)
-+ return false;
-+ }
-+ }
-+
-+ return true;
-+}
-+
-+/*
-+ * For file page tracking, we enable/disable it according to the main switch.
-+ * For anon page tracking, we only enabled it when the main switch is on and
-+ * there is at least one swapfile; we disable it when there are no swapfiles
-+ * regardless of the value of the main switch. Otherwise, we will eventually
-+ * reach the max size of the sliding window and have to call inc_min_seq().
-+ */
-+void lru_gen_change_state(bool enable, bool main, bool swap)
-+{
-+ static DEFINE_MUTEX(state_mutex);
-+
-+ struct mem_cgroup *memcg;
-+
-+ mem_hotplug_begin();
-+ cgroup_lock();
-+ mutex_lock(&state_mutex);
-+
-+ if (swap) {
-+ if (enable)
-+ swap = !lru_gen_nr_swapfiles++;
-+ else
-+ swap = !--lru_gen_nr_swapfiles;
-+ }
-+
-+ if (main && enable != lru_gen_enabled()) {
-+ if (enable)
-+ static_branch_enable(&lru_gen_static_key);
-+ else
-+ static_branch_disable(&lru_gen_static_key);
-+ } else if (!swap || !lru_gen_enabled())
-+ goto unlock;
-+
-+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
-+ do {
-+ int nid;
-+
-+ for_each_node(nid) {
-+ struct lruvec *lruvec = get_lruvec(nid, memcg);
-+
-+ if (!lruvec)
-+ continue;
-+
-+ spin_lock_irq(&lruvec->lru_lock);
-+
-+ VM_BUG_ON(!seq_is_valid(lruvec));
-+ VM_BUG_ON(!state_is_valid(lruvec));
-+
-+ lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
-+ lruvec->evictable.enabled[1] = lru_gen_enabled();
-+
-+ while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) {
-+ spin_unlock_irq(&lruvec->lru_lock);
-+ cond_resched();
-+ spin_lock_irq(&lruvec->lru_lock);
-+ }
-+
-+ spin_unlock_irq(&lruvec->lru_lock);
-+ }
-+
-+ cond_resched();
-+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
-+unlock:
-+ mutex_unlock(&state_mutex);
-+ cgroup_unlock();
-+ mem_hotplug_done();
-+}
-+
-+/******************************************************************************
-+ * initialization
-+ ******************************************************************************/
-+
-+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
-+{
-+ int i;
-+ int gen, type, zone;
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ lrugen->max_seq = MIN_NR_GENS + 1;
-+ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
-+ lrugen->enabled[1] = lru_gen_enabled();
-+
-+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
-+ lrugen->timestamps[i] = jiffies;
-+
-+ for_each_gen_type_zone(gen, type, zone)
-+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
-+}
-+
-+#ifdef CONFIG_MEMCG
-+void lru_gen_init_memcg(struct mem_cgroup *memcg)
-+{
-+ int nid;
-+
-+ for_each_node(nid) {
-+ struct lruvec *lruvec = get_lruvec(nid, memcg);
-+
-+ lru_gen_init_state(memcg, lruvec);
-+ }
-+}
-+#endif
-+
-+static int __init init_lru_gen(void)
-+{
-+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
-+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
-+
-+ return 0;
-+};
-+late_initcall(init_lru_gen);
-+
-+#endif /* CONFIG_LRU_GEN */
-+
- static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
- {
- unsigned long nr[NR_LRU_LISTS];
+++ /dev/null
-From 534bcc4a0bb5b24600891ce793f0295a142e9dae Mon Sep 17 00:00:00 2001
-Date: Mon, 5 Apr 2021 04:17:41 -0600
-Subject: [PATCH 05/10] mm: multigenerational lru: mm_struct list
-
-To scan PTEs for accessed pages, a mm_struct list is maintained for
-each memcg. When multiple threads traverse the same memcg->mm_list,
-each of them gets a unique mm_struct and therefore they can run
-walk_page_range() concurrently to reach page tables of all processes
-of this memcg.
-
-This infrastructure also provides the following optimizations:
- 1) it allows walkers to skip processes that have been sleeping since
- the last walk by tracking the usage of mm_struct between context
- switches.
- 2) it allows walkers to add interesting items they find during a
- walk to a Bloom filter so that they can skip uninteresting items
- during the next walk by testing whether an item is in this Bloom
- filter.
-
-Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8
----
- fs/exec.c | 2 +
- include/linux/memcontrol.h | 4 +
- include/linux/mm_inline.h | 6 +
- include/linux/mm_types.h | 75 +++++++++
- include/linux/mmzone.h | 63 +++++++
- kernel/exit.c | 1 +
- kernel/fork.c | 9 +
- kernel/sched/core.c | 1 +
- mm/memcontrol.c | 25 +++
- mm/vmscan.c | 331 +++++++++++++++++++++++++++++++++++++
- 10 files changed, 517 insertions(+)
-
---- a/fs/exec.c
-+++ b/fs/exec.c
-@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m
- active_mm = tsk->active_mm;
- tsk->active_mm = mm;
- tsk->mm = mm;
-+ lru_gen_add_mm(mm);
- /*
- * This prevents preemption while active_mm is being loaded and
- * it and mm are being updated, which could cause problems for
-@@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *m
- if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
- local_irq_enable();
- activate_mm(active_mm, mm);
-+ lru_gen_activate_mm(mm);
- if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
- local_irq_enable();
- tsk->mm->vmacache_seqnum = 0;
---- a/include/linux/memcontrol.h
-+++ b/include/linux/memcontrol.h
-@@ -348,6 +348,10 @@ struct mem_cgroup {
- struct deferred_split deferred_split_queue;
- #endif
-
-+#ifdef CONFIG_LRU_GEN
-+ struct lru_gen_mm_list mm_list;
-+#endif
-+
- struct mem_cgroup_per_node *nodeinfo[];
- };
-
---- a/include/linux/mm_inline.h
-+++ b/include/linux/mm_inline.h
-@@ -100,6 +100,12 @@ static inline int lru_gen_from_seq(unsig
- return seq % MAX_NR_GENS;
- }
-
-+/* Return a proper index regardless whether we keep stats for historical generations. */
-+static inline int lru_hist_from_seq(unsigned long seq)
-+{
-+ return seq % NR_HIST_GENS;
-+}
-+
- /* The youngest and the second youngest generations are counted as active. */
- static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
- {
---- a/include/linux/mm_types.h
-+++ b/include/linux/mm_types.h
-@@ -3,6 +3,7 @@
- #define _LINUX_MM_TYPES_H
-
- #include <linux/mm_types_task.h>
-+#include <linux/sched.h>
-
- #include <linux/auxvec.h>
- #include <linux/list.h>
-@@ -15,6 +16,8 @@
- #include <linux/page-flags-layout.h>
- #include <linux/workqueue.h>
- #include <linux/seqlock.h>
-+#include <linux/nodemask.h>
-+#include <linux/mmdebug.h>
-
- #include <asm/mmu.h>
-
-@@ -580,6 +583,18 @@ struct mm_struct {
- #ifdef CONFIG_IOMMU_SUPPORT
- u32 pasid;
- #endif
-+#ifdef CONFIG_LRU_GEN
-+ struct {
-+ /* the node of a global or per-memcg mm_struct list */
-+ struct list_head list;
-+#ifdef CONFIG_MEMCG
-+ /* points to the memcg of the owner task above */
-+ struct mem_cgroup *memcg;
-+#endif
-+ /* whether this mm_struct has been used since the last walk */
-+ nodemask_t nodes;
-+ } lrugen;
-+#endif /* CONFIG_LRU_GEN */
- } __randomize_layout;
-
- /*
-@@ -606,6 +621,66 @@ static inline cpumask_t *mm_cpumask(stru
- return (struct cpumask *)&mm->cpu_bitmap;
- }
-
-+#ifdef CONFIG_LRU_GEN
-+
-+struct lru_gen_mm_list {
-+ /* a global or per-memcg mm_struct list */
-+ struct list_head fifo;
-+ /* protects the list above */
-+ spinlock_t lock;
-+};
-+
-+void lru_gen_add_mm(struct mm_struct *mm);
-+void lru_gen_del_mm(struct mm_struct *mm);
-+#ifdef CONFIG_MEMCG
-+void lru_gen_migrate_mm(struct mm_struct *mm);
-+#endif
-+
-+static inline void lru_gen_init_mm(struct mm_struct *mm)
-+{
-+ INIT_LIST_HEAD(&mm->lrugen.list);
-+#ifdef CONFIG_MEMCG
-+ mm->lrugen.memcg = NULL;
-+#endif
-+ nodes_clear(mm->lrugen.nodes);
-+}
-+
-+/* Track the usage of each mm_struct so that we can skip inactive ones. */
-+static inline void lru_gen_activate_mm(struct mm_struct *mm)
-+{
-+ /* unlikely but not a bug when racing with lru_gen_migrate_mm() */
-+ VM_WARN_ON(list_empty(&mm->lrugen.list));
-+
-+ if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lrugen.nodes))
-+ nodes_setall(mm->lrugen.nodes);
-+}
-+
-+#else /* !CONFIG_LRU_GEN */
-+
-+static inline void lru_gen_add_mm(struct mm_struct *mm)
-+{
-+}
-+
-+static inline void lru_gen_del_mm(struct mm_struct *mm)
-+{
-+}
-+
-+#ifdef CONFIG_MEMCG
-+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
-+{
-+}
-+#endif
-+
-+static inline void lru_gen_init_mm(struct mm_struct *mm)
-+{
-+}
-+
-+static inline void lru_gen_activate_mm(struct mm_struct *mm)
-+{
-+}
-+
-+#endif /* CONFIG_LRU_GEN */
-+
- struct mmu_gather;
- extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
- extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
---- a/include/linux/mmzone.h
-+++ b/include/linux/mmzone.h
-@@ -318,6 +318,13 @@ struct lruvec;
- #define MIN_NR_GENS 2
- #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
-
-+/* Whether to keep stats for historical generations. */
-+#ifdef CONFIG_LRU_GEN_STATS
-+#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
-+#else
-+#define NR_HIST_GENS 1U
-+#endif
-+
- struct lrugen {
- /* the aging increments the max generation number */
- unsigned long max_seq;
-@@ -333,13 +340,63 @@ struct lrugen {
- bool enabled[ANON_AND_FILE];
- };
-
-+enum {
-+ MM_LEAF_TOTAL, /* total leaf entries */
-+ MM_LEAF_OLD, /* old leaf entries */
-+ MM_LEAF_YOUNG, /* young leaf entries */
-+ MM_NONLEAF_TOTAL, /* total non-leaf entries */
-+ MM_NONLEAF_PREV, /* previously worthy non-leaf entries */
-+ MM_NONLEAF_CUR, /* currently worthy non-leaf entries */
-+ NR_MM_STATS
-+};
-+
-+/* mnemonic codes for the stats above */
-+#define MM_STAT_CODES "toydpc"
-+
-+/* double buffering bloom filters */
-+#define NR_BLOOM_FILTERS 2
-+
-+struct lru_gen_mm_walk {
-+ /* set to max_seq after each round of walk */
-+ unsigned long seq;
-+ /* the next mm_struct on the list to walk */
-+ struct list_head *head;
-+ /* the first mm_struct never walked before */
-+ struct list_head *tail;
-+ /* to wait for the last walker to finish */
-+ struct wait_queue_head wait;
-+ /* bloom filters flip after each round of walk */
-+ unsigned long *filters[NR_BLOOM_FILTERS];
-+ /* page table stats for debugging */
-+ unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
-+ /* the number of concurrent walkers */
-+ int nr_walkers;
-+};
-+
-+#define MIN_BATCH_SIZE 64
- #define MAX_BATCH_SIZE 8192
-
-+struct mm_walk_args {
-+ struct mem_cgroup *memcg;
-+ unsigned long max_seq;
-+ unsigned long start_pfn;
-+ unsigned long end_pfn;
-+ unsigned long next_addr;
-+ unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)];
-+ int node_id;
-+ int swappiness;
-+ int batch_size;
-+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
-+ int mm_stats[NR_MM_STATS];
-+ bool use_filter;
-+};
-+
- void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
- void lru_gen_change_state(bool enable, bool main, bool swap);
-
- #ifdef CONFIG_MEMCG
- void lru_gen_init_memcg(struct mem_cgroup *memcg);
-+void lru_gen_free_memcg(struct mem_cgroup *memcg);
- #endif
-
- #else /* !CONFIG_LRU_GEN */
-@@ -356,6 +413,10 @@ static inline void lru_gen_change_state(
- static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
- {
- }
-+
-+static inline void lru_gen_free_memcg(struct mem_cgroup *memcg)
-+{
-+}
- #endif
-
- #endif /* CONFIG_LRU_GEN */
-@@ -380,6 +441,8 @@ struct lruvec {
- #ifdef CONFIG_LRU_GEN
- /* unevictable pages are on LRU_UNEVICTABLE */
- struct lrugen evictable;
-+ /* state for mm list and page table walks */
-+ struct lru_gen_mm_walk mm_walk;
- #endif
- #ifdef CONFIG_MEMCG
- struct pglist_data *pgdat;
---- a/kernel/exit.c
-+++ b/kernel/exit.c
-@@ -422,6 +422,7 @@ assign_new_owner:
- goto retry;
- }
- WRITE_ONCE(mm->owner, c);
-+ lru_gen_migrate_mm(mm);
- task_unlock(c);
- put_task_struct(c);
- }
---- a/kernel/fork.c
-+++ b/kernel/fork.c
-@@ -1080,6 +1080,7 @@ static struct mm_struct *mm_init(struct
- goto fail_nocontext;
-
- mm->user_ns = get_user_ns(user_ns);
-+ lru_gen_init_mm(mm);
- return mm;
-
- fail_nocontext:
-@@ -1122,6 +1123,7 @@ static inline void __mmput(struct mm_str
- }
- if (mm->binfmt)
- module_put(mm->binfmt->module);
-+ lru_gen_del_mm(mm);
- mmdrop(mm);
- }
-
-@@ -2617,6 +2619,13 @@ pid_t kernel_clone(struct kernel_clone_a
- get_task_struct(p);
- }
-
-+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
-+ /* lock the task to synchronize with memcg migration */
-+ task_lock(p);
-+ lru_gen_add_mm(p->mm);
-+ task_unlock(p);
-+ }
-+
- wake_up_new_task(p);
-
- /* forking complete and child started to run, tell ptracer */
---- a/kernel/sched/core.c
-+++ b/kernel/sched/core.c
-@@ -4978,6 +4978,7 @@ context_switch(struct rq *rq, struct tas
- * finish_task_switch()'s mmdrop().
- */
- switch_mm_irqs_off(prev->active_mm, next->mm, next);
-+ lru_gen_activate_mm(next->mm);
-
- if (!prev->mm) { // from kernel
- /* will mmdrop() in finish_task_switch(). */
---- a/mm/memcontrol.c
-+++ b/mm/memcontrol.c
-@@ -5163,6 +5163,7 @@ static void __mem_cgroup_free(struct mem
-
- static void mem_cgroup_free(struct mem_cgroup *memcg)
- {
-+ lru_gen_free_memcg(memcg);
- memcg_wb_domain_exit(memcg);
- __mem_cgroup_free(memcg);
- }
-@@ -6195,6 +6196,29 @@ static void mem_cgroup_move_task(void)
- }
- #endif
-
-+#ifdef CONFIG_LRU_GEN
-+static void mem_cgroup_attach(struct cgroup_taskset *tset)
-+{
-+ struct cgroup_subsys_state *css;
-+ struct task_struct *task = NULL;
-+
-+ cgroup_taskset_for_each_leader(task, css, tset)
-+ break;
-+
-+ if (!task)
-+ return;
-+
-+ task_lock(task);
-+ if (task->mm && task->mm->owner == task)
-+ lru_gen_migrate_mm(task->mm);
-+ task_unlock(task);
-+}
-+#else
-+static void mem_cgroup_attach(struct cgroup_taskset *tset)
-+{
-+}
-+#endif /* CONFIG_LRU_GEN */
-+
- static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
- {
- if (value == PAGE_COUNTER_MAX)
-@@ -6538,6 +6562,7 @@ struct cgroup_subsys memory_cgrp_subsys
- .css_reset = mem_cgroup_css_reset,
- .css_rstat_flush = mem_cgroup_css_rstat_flush,
- .can_attach = mem_cgroup_can_attach,
-+ .attach = mem_cgroup_attach,
- .cancel_attach = mem_cgroup_cancel_attach,
- .post_attach = mem_cgroup_move_task,
- .dfl_cftypes = memory_files,
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -2929,6 +2929,306 @@ static bool __maybe_unused seq_is_valid(
- }
-
- /******************************************************************************
-+ * mm_struct list
-+ ******************************************************************************/
-+
-+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
-+{
-+ static struct lru_gen_mm_list mm_list = {
-+ .fifo = LIST_HEAD_INIT(mm_list.fifo),
-+ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
-+ };
-+
-+#ifdef CONFIG_MEMCG
-+ if (memcg)
-+ return &memcg->mm_list;
-+#endif
-+ return &mm_list;
-+}
-+
-+void lru_gen_add_mm(struct mm_struct *mm)
-+{
-+ int nid;
-+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
-+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
-+
-+ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm);
-+#ifdef CONFIG_MEMCG
-+ VM_BUG_ON_MM(mm->lrugen.memcg, mm);
-+ mm->lrugen.memcg = memcg;
-+#endif
-+ spin_lock(&mm_list->lock);
-+
-+ list_add_tail(&mm->lrugen.list, &mm_list->fifo);
-+
-+ for_each_node(nid) {
-+ struct lruvec *lruvec = get_lruvec(nid, memcg);
-+
-+ if (!lruvec)
-+ continue;
-+
-+ if (lruvec->mm_walk.tail == &mm_list->fifo)
-+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev;
-+ }
-+
-+ spin_unlock(&mm_list->lock);
-+}
-+
-+void lru_gen_del_mm(struct mm_struct *mm)
-+{
-+ int nid;
-+ struct lru_gen_mm_list *mm_list;
-+ struct mem_cgroup *memcg = NULL;
-+
-+ if (list_empty(&mm->lrugen.list))
-+ return;
-+
-+#ifdef CONFIG_MEMCG
-+ memcg = mm->lrugen.memcg;
-+#endif
-+ mm_list = get_mm_list(memcg);
-+
-+ spin_lock(&mm_list->lock);
-+
-+ for_each_node(nid) {
-+ struct lruvec *lruvec = get_lruvec(nid, memcg);
-+
-+ if (!lruvec)
-+ continue;
-+
-+ if (lruvec->mm_walk.tail == &mm->lrugen.list)
-+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->next;
-+
-+ if (lruvec->mm_walk.head != &mm->lrugen.list)
-+ continue;
-+
-+ lruvec->mm_walk.head = lruvec->mm_walk.head->next;
-+ if (lruvec->mm_walk.head == &mm_list->fifo)
-+ WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1);
-+ }
-+
-+ list_del_init(&mm->lrugen.list);
-+
-+ spin_unlock(&mm_list->lock);
-+
-+#ifdef CONFIG_MEMCG
-+ mem_cgroup_put(mm->lrugen.memcg);
-+ mm->lrugen.memcg = NULL;
-+#endif
-+}
-+
-+#ifdef CONFIG_MEMCG
-+void lru_gen_migrate_mm(struct mm_struct *mm)
-+{
-+ struct mem_cgroup *memcg;
-+
-+ lockdep_assert_held(&mm->owner->alloc_lock);
-+
-+ if (mem_cgroup_disabled())
-+ return;
-+
-+ rcu_read_lock();
-+ memcg = mem_cgroup_from_task(mm->owner);
-+ rcu_read_unlock();
-+ if (memcg == mm->lrugen.memcg)
-+ return;
-+
-+ VM_BUG_ON_MM(!mm->lrugen.memcg, mm);
-+ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm);
-+
-+ lru_gen_del_mm(mm);
-+ lru_gen_add_mm(mm);
-+}
-+#endif
-+
-+#define BLOOM_FILTER_SHIFT 15
-+
-+static inline int filter_gen_from_seq(unsigned long seq)
-+{
-+ return seq % NR_BLOOM_FILTERS;
-+}
-+
-+static void get_item_key(void *item, int *key)
-+{
-+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
-+
-+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
-+
-+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
-+ key[1] = hash >> BLOOM_FILTER_SHIFT;
-+}
-+
-+static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq)
-+{
-+ unsigned long *filter;
-+ int gen = filter_gen_from_seq(seq);
-+
-+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
-+
-+ filter = lruvec->mm_walk.filters[gen];
-+ if (filter) {
-+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
-+ return;
-+ }
-+
-+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC);
-+ WRITE_ONCE(lruvec->mm_walk.filters[gen], filter);
-+}
-+
-+static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
-+{
-+ int key[2];
-+ unsigned long *filter;
-+ int gen = filter_gen_from_seq(seq);
-+
-+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
-+ if (!filter)
-+ return;
-+
-+ get_item_key(item, key);
-+
-+ if (!test_bit(key[0], filter))
-+ set_bit(key[0], filter);
-+ if (!test_bit(key[1], filter))
-+ set_bit(key[1], filter);
-+}
-+
-+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
-+{
-+ int key[2];
-+ unsigned long *filter;
-+ int gen = filter_gen_from_seq(seq);
-+
-+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
-+ if (!filter)
-+ return false;
-+
-+ get_item_key(item, key);
-+
-+ return test_bit(key[0], filter) && test_bit(key[1], filter);
-+}
-+
-+static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args)
-+{
-+ int i;
-+ int hist = lru_hist_from_seq(args->max_seq);
-+
-+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
-+
-+ for (i = 0; i < NR_MM_STATS; i++) {
-+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i],
-+ lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]);
-+ args->mm_stats[i] = 0;
-+ }
-+
-+ if (!last || NR_HIST_GENS == 1)
-+ return;
-+
-+ hist = lru_hist_from_seq(args->max_seq + 1);
-+ for (i = 0; i < NR_MM_STATS; i++)
-+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0);
-+}
-+
-+static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args)
-+{
-+ int type;
-+ unsigned long size = 0;
-+
-+ if (cpumask_empty(mm_cpumask(mm)) && !node_isset(args->node_id, mm->lrugen.nodes))
-+ return true;
-+
-+ if (mm_is_oom_victim(mm))
-+ return true;
-+
-+ for (type = !args->swappiness; type < ANON_AND_FILE; type++) {
-+ size += type ? get_mm_counter(mm, MM_FILEPAGES) :
-+ get_mm_counter(mm, MM_ANONPAGES) +
-+ get_mm_counter(mm, MM_SHMEMPAGES);
-+ }
-+
-+ if (size < MIN_BATCH_SIZE)
-+ return true;
-+
-+ if (!mmget_not_zero(mm))
-+ return true;
-+
-+ node_clear(args->node_id, mm->lrugen.nodes);
-+
-+ return false;
-+}
-+
-+/* To support multiple walkers that concurrently walk an mm_struct list. */
-+static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args,
-+ struct mm_struct **iter)
-+{
-+ bool first = false;
-+ bool last = true;
-+ struct mm_struct *mm = NULL;
-+ struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk;
-+ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg);
-+
-+ if (*iter)
-+ mmput_async(*iter);
-+ else if (args->max_seq <= READ_ONCE(mm_walk->seq))
-+ return false;
-+
-+ spin_lock(&mm_list->lock);
-+
-+ VM_BUG_ON(args->max_seq > mm_walk->seq + 1);
-+ VM_BUG_ON(*iter && args->max_seq < mm_walk->seq);
-+ VM_BUG_ON(*iter && !mm_walk->nr_walkers);
-+
-+ if (args->max_seq <= mm_walk->seq) {
-+ if (!*iter)
-+ last = false;
-+ goto done;
-+ }
-+
-+ if (mm_walk->head == &mm_list->fifo) {
-+ VM_BUG_ON(mm_walk->nr_walkers);
-+ mm_walk->head = mm_walk->head->next;
-+ first = true;
-+ }
-+
-+ while (!mm && mm_walk->head != &mm_list->fifo) {
-+ mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list);
-+
-+ mm_walk->head = mm_walk->head->next;
-+
-+ if (mm_walk->tail == &mm->lrugen.list) {
-+ mm_walk->tail = mm_walk->tail->next;
-+ args->use_filter = false;
-+ }
-+
-+ if (should_skip_mm(mm, args))
-+ mm = NULL;
-+ }
-+
-+ if (mm_walk->head == &mm_list->fifo)
-+ WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1);
-+done:
-+ if (*iter && !mm)
-+ mm_walk->nr_walkers--;
-+ if (!*iter && mm)
-+ mm_walk->nr_walkers++;
-+
-+ if (mm_walk->nr_walkers)
-+ last = false;
-+
-+ if (mm && first)
-+ clear_bloom_filter(lruvec, args->max_seq + 1);
-+
-+ if (*iter || last)
-+ reset_mm_stats(lruvec, last, args);
-+
-+ spin_unlock(&mm_list->lock);
-+
-+ *iter = mm;
-+
-+ return last;
-+}
-+
-+/******************************************************************************
- * state change
- ******************************************************************************/
-
-@@ -3112,6 +3412,7 @@ void lru_gen_init_state(struct mem_cgrou
- int i;
- int gen, type, zone;
- struct lrugen *lrugen = &lruvec->evictable;
-+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
-
- lrugen->max_seq = MIN_NR_GENS + 1;
- lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
-@@ -3122,6 +3423,17 @@ void lru_gen_init_state(struct mem_cgrou
-
- for_each_gen_type_zone(gen, type, zone)
- INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
-+
-+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
-+ spin_lock(&mm_list->lock);
-+
-+ lruvec->mm_walk.seq = MIN_NR_GENS;
-+ lruvec->mm_walk.head = &mm_list->fifo;
-+ lruvec->mm_walk.tail = &mm_list->fifo;
-+ init_waitqueue_head(&lruvec->mm_walk.wait);
-+
-+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
-+ spin_unlock(&mm_list->lock);
- }
-
- #ifdef CONFIG_MEMCG
-@@ -3129,18 +3441,37 @@ void lru_gen_init_memcg(struct mem_cgrou
- {
- int nid;
-
-+ INIT_LIST_HEAD(&memcg->mm_list.fifo);
-+ spin_lock_init(&memcg->mm_list.lock);
-+
- for_each_node(nid) {
- struct lruvec *lruvec = get_lruvec(nid, memcg);
-
- lru_gen_init_state(memcg, lruvec);
- }
- }
-+
-+void lru_gen_free_memcg(struct mem_cgroup *memcg)
-+{
-+ int nid;
-+
-+ for_each_node(nid) {
-+ int i;
-+ struct lruvec *lruvec = get_lruvec(nid, memcg);
-+
-+ for (i = 0; i < NR_BLOOM_FILTERS; i++) {
-+ bitmap_free(lruvec->mm_walk.filters[i]);
-+ lruvec->mm_walk.filters[i] = NULL;
-+ }
-+ }
-+}
- #endif
-
- static int __init init_lru_gen(void)
- {
- BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
- BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
-+ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
-
- return 0;
- };
+++ /dev/null
-From 8217cd2238c40cf77208aa27a7cc09879e685890 Mon Sep 17 00:00:00 2001
-Date: Mon, 5 Apr 2021 04:35:07 -0600
-Subject: [PATCH 06/10] mm: multigenerational lru: aging
-
-The aging produces young generations. Given an lruvec, the aging
-traverses lruvec_memcg()->mm_list and calls walk_page_range() to scan
-PTEs for accessed pages. Upon finding one, the aging updates its
-generation number to max_seq (modulo MAX_NR_GENS). After each round of
-traversal, the aging increments max_seq. The aging is due when
-min_seq[] reaches max_seq-1.
-
-The aging uses the following optimizations when walking page tables:
- 1) It skips non-leaf PMD entries that have the accessed bit cleared
- when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
- 2) It does not zigzag between a PGD table and the same PMD or PTE
- table spanning multiple VMAs. In other words, it finishes all the
- VMAs within the range of the same PMD or PTE table before it returns
- to this PGD table. This optimizes workloads that have large numbers
- of tiny VMAs, especially when CONFIG_PGTABLE_LEVELS=5.
-
-Change-Id: I3ae8abc3100d023cecb3a699d86020ae6fc10a45
----
- include/linux/memcontrol.h | 3 +
- include/linux/mmzone.h | 9 +
- include/linux/oom.h | 16 +
- include/linux/swap.h | 3 +
- mm/memcontrol.c | 5 +
- mm/oom_kill.c | 4 +-
- mm/rmap.c | 8 +
- mm/vmscan.c | 948 +++++++++++++++++++++++++++++++++++++
- 8 files changed, 994 insertions(+), 2 deletions(-)
-
---- a/include/linux/memcontrol.h
-+++ b/include/linux/memcontrol.h
-@@ -1367,10 +1367,13 @@ mem_cgroup_print_oom_meminfo(struct mem_
-
- static inline void lock_page_memcg(struct page *page)
- {
-+ /* to match page_memcg_rcu() */
-+ rcu_read_lock();
- }
-
- static inline void unlock_page_memcg(struct page *page)
- {
-+ rcu_read_unlock();
- }
-
- static inline void mem_cgroup_handle_over_high(void)
---- a/include/linux/mmzone.h
-+++ b/include/linux/mmzone.h
-@@ -295,6 +295,7 @@ enum lruvec_flags {
- };
-
- struct lruvec;
-+struct page_vma_mapped_walk;
-
- #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
- #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
-@@ -393,6 +394,7 @@ struct mm_walk_args {
-
- void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
- void lru_gen_change_state(bool enable, bool main, bool swap);
-+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
-
- #ifdef CONFIG_MEMCG
- void lru_gen_init_memcg(struct mem_cgroup *memcg);
-@@ -409,6 +411,10 @@ static inline void lru_gen_change_state(
- {
- }
-
-+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
-+{
-+}
-+
- #ifdef CONFIG_MEMCG
- static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
- {
-@@ -1028,6 +1034,9 @@ typedef struct pglist_data {
-
- unsigned long flags;
-
-+#ifdef CONFIG_LRU_GEN
-+ struct mm_walk_args mm_walk_args;
-+#endif
- ZONE_PADDING(_pad2_)
-
- /* Per-node vmstats */
---- a/include/linux/oom.h
-+++ b/include/linux/oom.h
-@@ -57,6 +57,22 @@ struct oom_control {
- extern struct mutex oom_lock;
- extern struct mutex oom_adj_mutex;
-
-+#ifdef CONFIG_MMU
-+extern struct task_struct *oom_reaper_list;
-+extern struct wait_queue_head oom_reaper_wait;
-+
-+static inline bool oom_reaping_in_progress(void)
-+{
-+ /* racy check to see if oom reaping could be in progress */
-+ return READ_ONCE(oom_reaper_list) || !waitqueue_active(&oom_reaper_wait);
-+}
-+#else
-+static inline bool oom_reaping_in_progress(void)
-+{
-+ return false;
-+}
-+#endif
-+
- static inline void set_current_oom_origin(void)
- {
- current->signal->oom_flag_origin = true;
---- a/include/linux/swap.h
-+++ b/include/linux/swap.h
-@@ -137,6 +137,9 @@ union swap_header {
- */
- struct reclaim_state {
- unsigned long reclaimed_slab;
-+#ifdef CONFIG_LRU_GEN
-+ struct mm_walk_args *mm_walk_args;
-+#endif
- };
-
- #ifdef __KERNEL__
---- a/mm/memcontrol.c
-+++ b/mm/memcontrol.c
-@@ -1304,12 +1304,17 @@ void mem_cgroup_update_lru_size(struct l
- *lru_size += nr_pages;
-
- size = *lru_size;
-+#ifdef CONFIG_LRU_GEN
-+ /* unlikely but not a bug when reset_batch_size() is pending */
-+ VM_WARN_ON(size + MAX_BATCH_SIZE < 0);
-+#else
- if (WARN_ONCE(size < 0,
- "%s(%p, %d, %d): lru_size %ld\n",
- __func__, lruvec, lru, nr_pages, size)) {
- VM_BUG_ON(1);
- *lru_size = 0;
- }
-+#endif
-
- if (nr_pages > 0)
- *lru_size += nr_pages;
---- a/mm/oom_kill.c
-+++ b/mm/oom_kill.c
-@@ -508,8 +508,8 @@ bool process_shares_mm(struct task_struc
- * victim (if that is possible) to help the OOM killer to move on.
- */
- static struct task_struct *oom_reaper_th;
--static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
--static struct task_struct *oom_reaper_list;
-+DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
-+struct task_struct *oom_reaper_list;
- static DEFINE_SPINLOCK(oom_reaper_lock);
-
- bool __oom_reap_task_mm(struct mm_struct *mm)
---- a/mm/rmap.c
-+++ b/mm/rmap.c
-@@ -73,6 +73,7 @@
- #include <linux/page_idle.h>
- #include <linux/memremap.h>
- #include <linux/userfaultfd_k.h>
-+#include <linux/mm_inline.h>
-
- #include <asm/tlbflush.h>
-
-@@ -793,6 +794,13 @@ static bool page_referenced_one(struct p
- }
-
- if (pvmw.pte) {
-+ /* the multigenerational lru exploits the spatial locality */
-+ if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
-+ !(vma->vm_flags & VM_SEQ_READ)) {
-+ lru_gen_look_around(&pvmw);
-+ referenced++;
-+ }
-+
- if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- /*
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -51,6 +51,8 @@
- #include <linux/dax.h>
- #include <linux/psi.h>
- #include <linux/memory.h>
-+#include <linux/pagewalk.h>
-+#include <linux/shmem_fs.h>
-
- #include <asm/tlbflush.h>
- #include <asm/div64.h>
-@@ -2887,6 +2889,15 @@ static bool can_age_anon_pages(struct pg
- * shorthand helpers
- ******************************************************************************/
-
-+#define DEFINE_MAX_SEQ(lruvec) \
-+ unsigned long max_seq = READ_ONCE((lruvec)->evictable.max_seq)
-+
-+#define DEFINE_MIN_SEQ(lruvec) \
-+ unsigned long min_seq[ANON_AND_FILE] = { \
-+ READ_ONCE((lruvec)->evictable.min_seq[0]), \
-+ READ_ONCE((lruvec)->evictable.min_seq[1]), \
-+ }
-+
- #define for_each_gen_type_zone(gen, type, zone) \
- for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
- for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
-@@ -2899,6 +2910,12 @@ static int page_lru_gen(struct page *pag
- return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
- }
-
-+static int get_swappiness(struct mem_cgroup *memcg)
-+{
-+ return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ?
-+ mem_cgroup_swappiness(memcg) : 0;
-+}
-+
- static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg)
- {
- struct pglist_data *pgdat = NODE_DATA(nid);
-@@ -3229,6 +3246,926 @@ done:
- }
-
- /******************************************************************************
-+ * the aging
-+ ******************************************************************************/
-+
-+static int page_update_gen(struct page *page, int gen)
-+{
-+ unsigned long old_flags, new_flags;
-+
-+ VM_BUG_ON(gen >= MAX_NR_GENS);
-+
-+ do {
-+ new_flags = old_flags = READ_ONCE(page->flags);
-+
-+ if (!(new_flags & LRU_GEN_MASK)) {
-+ new_flags |= BIT(PG_referenced);
-+ continue;
-+ }
-+
-+ new_flags &= ~LRU_GEN_MASK;
-+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
-+ } while (new_flags != old_flags &&
-+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
-+
-+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
-+}
-+
-+static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool reclaiming)
-+{
-+ int old_gen, new_gen;
-+ unsigned long old_flags, new_flags;
-+ int type = page_is_file_lru(page);
-+ int zone = page_zonenum(page);
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
-+
-+ do {
-+ new_flags = old_flags = READ_ONCE(page->flags);
-+ VM_BUG_ON_PAGE(!(new_flags & LRU_GEN_MASK), page);
-+
-+ new_gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
-+ /* page_update_gen() has updated this page? */
-+ if (new_gen >= 0 && new_gen != old_gen) {
-+ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]);
-+ return;
-+ }
-+
-+ new_gen = (old_gen + 1) % MAX_NR_GENS;
-+
-+ new_flags &= ~LRU_GEN_MASK;
-+ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
-+ /* for end_page_writeback() */
-+ if (reclaiming)
-+ new_flags |= BIT(PG_reclaim);
-+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
-+
-+ lru_gen_update_size(page, lruvec, old_gen, new_gen);
-+ if (reclaiming)
-+ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]);
-+ else
-+ list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]);
-+}
-+
-+static void update_batch_size(struct page *page, int old_gen, int new_gen,
-+ struct mm_walk_args *args)
-+{
-+ int type = page_is_file_lru(page);
-+ int zone = page_zonenum(page);
-+ int delta = thp_nr_pages(page);
-+
-+ VM_BUG_ON(old_gen >= MAX_NR_GENS);
-+ VM_BUG_ON(new_gen >= MAX_NR_GENS);
-+
-+ args->batch_size++;
-+
-+ args->nr_pages[old_gen][type][zone] -= delta;
-+ args->nr_pages[new_gen][type][zone] += delta;
-+}
-+
-+static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args)
-+{
-+ int gen, type, zone;
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ args->batch_size = 0;
-+
-+ for_each_gen_type_zone(gen, type, zone) {
-+ enum lru_list lru = type * LRU_FILE;
-+ int delta = args->nr_pages[gen][type][zone];
-+
-+ if (!delta)
-+ continue;
-+
-+ args->nr_pages[gen][type][zone] = 0;
-+ WRITE_ONCE(lrugen->sizes[gen][type][zone],
-+ lrugen->sizes[gen][type][zone] + delta);
-+
-+ if (lru_gen_is_active(lruvec, gen))
-+ lru += LRU_ACTIVE;
-+ update_lru_size(lruvec, lru, zone, delta);
-+ }
-+}
-+
-+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
-+{
-+ struct address_space *mapping;
-+ struct vm_area_struct *vma = walk->vma;
-+ struct mm_walk_args *args = walk->private;
-+
-+ if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) ||
-+ (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ)))
-+ return true;
-+
-+ if (vma_is_anonymous(vma))
-+ return !args->swappiness;
-+
-+ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
-+ return true;
-+
-+ mapping = vma->vm_file->f_mapping;
-+ if (!mapping->a_ops->writepage)
-+ return true;
-+
-+ return (shmem_mapping(mapping) && !args->swappiness) || mapping_unevictable(mapping);
-+}
-+
-+/*
-+ * Some userspace memory allocators create many single-page VMAs. So instead of
-+ * returning back to the PGD table for each of such VMAs, we finish at least an
-+ * entire PMD table and therefore avoid many zigzags.
-+ */
-+static bool get_next_vma(struct mm_walk *walk, unsigned long mask, unsigned long size,
-+ unsigned long *start, unsigned long *end)
-+{
-+ unsigned long next = round_up(*end, size);
-+
-+ VM_BUG_ON(mask & size);
-+ VM_BUG_ON(*start >= *end);
-+ VM_BUG_ON((next & mask) != (*start & mask));
-+
-+ while (walk->vma) {
-+ if (next >= walk->vma->vm_end) {
-+ walk->vma = walk->vma->vm_next;
-+ continue;
-+ }
-+
-+ if ((next & mask) != (walk->vma->vm_start & mask))
-+ return false;
-+
-+ if (should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) {
-+ walk->vma = walk->vma->vm_next;
-+ continue;
-+ }
-+
-+ *start = max(next, walk->vma->vm_start);
-+ next = (next | ~mask) + 1;
-+ /* rounded-up boundaries can wrap to 0 */
-+ *end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end;
-+
-+ return true;
-+ }
-+
-+ return false;
-+}
-+
-+static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
-+ struct mm_walk *walk)
-+{
-+ int i;
-+ pte_t *pte;
-+ spinlock_t *ptl;
-+ unsigned long addr;
-+ int worth = 0;
-+ struct mm_walk_args *args = walk->private;
-+ int old_gen, new_gen = lru_gen_from_seq(args->max_seq);
-+
-+ VM_BUG_ON(pmd_leaf(*pmd));
-+
-+ pte = pte_offset_map_lock(walk->mm, pmd, start & PMD_MASK, &ptl);
-+ arch_enter_lazy_mmu_mode();
-+restart:
-+ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
-+ struct page *page;
-+ unsigned long pfn = pte_pfn(pte[i]);
-+
-+ args->mm_stats[MM_LEAF_TOTAL]++;
-+
-+ if (!pte_present(pte[i]) || is_zero_pfn(pfn))
-+ continue;
-+
-+ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i])))
-+ continue;
-+
-+ if (!pte_young(pte[i])) {
-+ args->mm_stats[MM_LEAF_OLD]++;
-+ continue;
-+ }
-+
-+ VM_BUG_ON(!pfn_valid(pfn));
-+ if (pfn < args->start_pfn || pfn >= args->end_pfn)
-+ continue;
-+
-+ page = compound_head(pfn_to_page(pfn));
-+ if (page_to_nid(page) != args->node_id)
-+ continue;
-+
-+ if (page_memcg_rcu(page) != args->memcg)
-+ continue;
-+
-+ VM_BUG_ON(addr < walk->vma->vm_start || addr >= walk->vma->vm_end);
-+ if (!ptep_test_and_clear_young(walk->vma, addr, pte + i))
-+ continue;
-+
-+ args->mm_stats[MM_LEAF_YOUNG]++;
-+
-+ if (pte_dirty(pte[i]) && !PageDirty(page) &&
-+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page)))
-+ set_page_dirty(page);
-+
-+ old_gen = page_update_gen(page, new_gen);
-+ if (old_gen >= 0 && old_gen != new_gen)
-+ update_batch_size(page, old_gen, new_gen, args);
-+
-+ worth++;
-+ }
-+
-+ if (i < PTRS_PER_PTE && get_next_vma(walk, PMD_MASK, PAGE_SIZE, &start, &end))
-+ goto restart;
-+
-+ arch_leave_lazy_mmu_mode();
-+ pte_unmap_unlock(pte, ptl);
-+
-+ return worth >= MIN_BATCH_SIZE / 2;
-+}
-+
-+/*
-+ * We scan PMD entries in two passes. The first pass reaches to PTE tables and
-+ * doesn't take the PMD lock. The second pass clears the accessed bit on PMD
-+ * entries and needs to take the PMD lock.
-+ */
-+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
-+static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset,
-+ struct vm_area_struct *vma, struct mm_walk *walk)
-+{
-+ int i;
-+ pmd_t *pmd;
-+ spinlock_t *ptl;
-+ struct mm_walk_args *args = walk->private;
-+ int old_gen, new_gen = lru_gen_from_seq(args->max_seq);
-+
-+ VM_BUG_ON(pud_leaf(*pud));
-+
-+ start = (start & PUD_MASK) + offset * PMD_SIZE;
-+ pmd = pmd_offset(pud, start);
-+ ptl = pmd_lock(walk->mm, pmd);
-+ arch_enter_lazy_mmu_mode();
-+
-+ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) {
-+ struct page *page;
-+ unsigned long pfn = pmd_pfn(pmd[i]);
-+ unsigned long addr = start + i * PMD_SIZE;
-+
-+ if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i]))
-+ continue;
-+
-+ if (WARN_ON_ONCE(pmd_devmap(pmd[i])))
-+ continue;
-+
-+ if (!pmd_trans_huge(pmd[i])) {
-+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
-+ pmdp_test_and_clear_young(vma, addr, pmd + i);
-+ continue;
-+ }
-+
-+ VM_BUG_ON(!pfn_valid(pfn));
-+ if (pfn < args->start_pfn || pfn >= args->end_pfn)
-+ continue;
-+
-+ page = pfn_to_page(pfn);
-+ VM_BUG_ON_PAGE(PageTail(page), page);
-+ if (page_to_nid(page) != args->node_id)
-+ continue;
-+
-+ if (page_memcg_rcu(page) != args->memcg)
-+ continue;
-+
-+ VM_BUG_ON(addr < vma->vm_start || addr >= vma->vm_end);
-+ if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
-+ continue;
-+
-+ args->mm_stats[MM_LEAF_YOUNG]++;
-+
-+ if (pmd_dirty(pmd[i]) && !PageDirty(page) &&
-+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page)))
-+ set_page_dirty(page);
-+
-+ old_gen = page_update_gen(page, new_gen);
-+ if (old_gen >= 0 && old_gen != new_gen)
-+ update_batch_size(page, old_gen, new_gen, args);
-+ }
-+
-+ arch_leave_lazy_mmu_mode();
-+ spin_unlock(ptl);
-+
-+ bitmap_zero(args->bitmap, MIN_BATCH_SIZE);
-+}
-+#else
-+static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset,
-+ struct vm_area_struct *vma, struct mm_walk *walk)
-+{
-+}
-+#endif
-+
-+static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
-+ struct mm_walk *walk)
-+{
-+ int i;
-+ pmd_t *pmd;
-+ unsigned long next;
-+ unsigned long addr;
-+ struct vm_area_struct *vma;
-+ int offset = -1;
-+ bool reset = false;
-+ struct mm_walk_args *args = walk->private;
-+ struct lruvec *lruvec = get_lruvec(args->node_id, args->memcg);
-+
-+ VM_BUG_ON(pud_leaf(*pud));
-+
-+ pmd = pmd_offset(pud, start & PUD_MASK);
-+restart:
-+ vma = walk->vma;
-+ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
-+ pmd_t val = pmd_read_atomic(pmd + i);
-+
-+ /* for pmd_read_atomic() */
-+ barrier();
-+
-+ next = pmd_addr_end(addr, end);
-+
-+ if (!pmd_present(val)) {
-+ args->mm_stats[MM_LEAF_TOTAL]++;
-+ continue;
-+ }
-+
-+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-+ if (pmd_trans_huge(val)) {
-+ unsigned long pfn = pmd_pfn(val);
-+
-+ args->mm_stats[MM_LEAF_TOTAL]++;
-+
-+ if (is_huge_zero_pmd(val))
-+ continue;
-+
-+ if (!pmd_young(val)) {
-+ args->mm_stats[MM_LEAF_OLD]++;
-+ continue;
-+ }
-+
-+ if (pfn < args->start_pfn || pfn >= args->end_pfn)
-+ continue;
-+
-+ if (offset < 0)
-+ offset = i;
-+ else if (i - offset >= MIN_BATCH_SIZE) {
-+ walk_pmd_range_locked(pud, start, offset, vma, walk);
-+ offset = i;
-+ }
-+ __set_bit(i - offset, args->bitmap);
-+ reset = true;
-+ continue;
-+ }
-+#endif
-+ args->mm_stats[MM_NONLEAF_TOTAL]++;
-+
-+#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
-+ if (!pmd_young(val))
-+ continue;
-+
-+ if (offset < 0)
-+ offset = i;
-+ else if (i - offset >= MIN_BATCH_SIZE) {
-+ walk_pmd_range_locked(pud, start, offset, vma, walk);
-+ offset = i;
-+ reset = false;
-+ }
-+ __set_bit(i - offset, args->bitmap);
-+#endif
-+ if (args->use_filter && !test_bloom_filter(lruvec, args->max_seq, pmd + i))
-+ continue;
-+
-+ args->mm_stats[MM_NONLEAF_PREV]++;
-+
-+ if (!walk_pte_range(&val, addr, next, walk))
-+ continue;
-+
-+ args->mm_stats[MM_NONLEAF_CUR]++;
-+
-+ set_bloom_filter(lruvec, args->max_seq + 1, pmd + i);
-+ }
-+
-+ if (reset) {
-+ walk_pmd_range_locked(pud, start, offset, vma, walk);
-+ offset = -1;
-+ reset = false;
-+ }
-+
-+ if (i < PTRS_PER_PMD && get_next_vma(walk, PUD_MASK, PMD_SIZE, &start, &end))
-+ goto restart;
-+
-+ if (offset >= 0)
-+ walk_pmd_range_locked(pud, start, offset, vma, walk);
-+}
-+
-+static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
-+ struct mm_walk *walk)
-+{
-+ int i;
-+ pud_t *pud;
-+ unsigned long addr;
-+ unsigned long next;
-+ struct mm_walk_args *args = walk->private;
-+
-+ VM_BUG_ON(p4d_leaf(*p4d));
-+
-+ pud = pud_offset(p4d, start & P4D_MASK);
-+restart:
-+ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
-+ pud_t val = READ_ONCE(pud[i]);
-+
-+ next = pud_addr_end(addr, end);
-+
-+ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
-+ continue;
-+
-+ walk_pmd_range(&val, addr, next, walk);
-+
-+ if (args->batch_size >= MAX_BATCH_SIZE) {
-+ end = (addr | ~PUD_MASK) + 1;
-+ goto done;
-+ }
-+ }
-+
-+ if (i < PTRS_PER_PUD && get_next_vma(walk, P4D_MASK, PUD_SIZE, &start, &end))
-+ goto restart;
-+
-+ end = round_up(end, P4D_SIZE);
-+done:
-+ /* rounded-up boundaries can wrap to 0 */
-+ args->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0;
-+
-+ return -EAGAIN;
-+}
-+
-+static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct mm_walk_args *args)
-+{
-+ static const struct mm_walk_ops mm_walk_ops = {
-+ .test_walk = should_skip_vma,
-+ .p4d_entry = walk_pud_range,
-+ };
-+
-+ int err;
-+
-+ args->next_addr = FIRST_USER_ADDRESS;
-+
-+ do {
-+ unsigned long start = args->next_addr;
-+ unsigned long end = mm->highest_vm_end;
-+
-+ err = -EBUSY;
-+
-+ rcu_read_lock();
-+#ifdef CONFIG_MEMCG
-+ if (args->memcg && atomic_read(&args->memcg->moving_account))
-+ goto contended;
-+#endif
-+ if (!mmap_read_trylock(mm))
-+ goto contended;
-+
-+ err = walk_page_range(mm, start, end, &mm_walk_ops, args);
-+
-+ mmap_read_unlock(mm);
-+
-+ if (args->batch_size) {
-+ spin_lock_irq(&lruvec->lru_lock);
-+ reset_batch_size(lruvec, args);
-+ spin_unlock_irq(&lruvec->lru_lock);
-+ }
-+contended:
-+ rcu_read_unlock();
-+
-+ cond_resched();
-+ } while (err == -EAGAIN && args->next_addr && !mm_is_oom_victim(mm));
-+}
-+
-+static struct mm_walk_args *alloc_mm_walk_args(void)
-+{
-+ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args)
-+ return kvzalloc(sizeof(struct mm_walk_args), GFP_KERNEL);
-+
-+ return current->reclaim_state->mm_walk_args;
-+}
-+
-+static void free_mm_walk_args(struct mm_walk_args *args)
-+{
-+ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args)
-+ kvfree(args);
-+}
-+
-+static bool inc_min_seq(struct lruvec *lruvec, int type)
-+{
-+ int gen, zone;
-+ int remaining = MAX_BATCH_SIZE;
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ VM_BUG_ON(!seq_is_valid(lruvec));
-+
-+ if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
-+ return true;
-+
-+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
-+
-+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-+ struct list_head *head = &lrugen->lists[gen][type][zone];
-+
-+ while (!list_empty(head)) {
-+ struct page *page = lru_to_page(head);
-+
-+ VM_BUG_ON_PAGE(PageTail(page), page);
-+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
-+ VM_BUG_ON_PAGE(PageActive(page), page);
-+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
-+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
-+
-+ prefetchw_prev_lru_page(page, head, flags);
-+
-+ page_inc_gen(page, lruvec, false);
-+
-+ if (!--remaining)
-+ return false;
-+ }
-+ }
-+
-+ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
-+
-+ return true;
-+}
-+
-+static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
-+{
-+ int gen, type, zone;
-+ bool success = false;
-+ struct lrugen *lrugen = &lruvec->evictable;
-+ DEFINE_MIN_SEQ(lruvec);
-+
-+ VM_BUG_ON(!seq_is_valid(lruvec));
-+
-+ for (type = 0; type < ANON_AND_FILE; type++) {
-+ while (lrugen->max_seq - min_seq[type] >= MIN_NR_GENS) {
-+ gen = lru_gen_from_seq(min_seq[type]);
-+
-+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-+ if (!list_empty(&lrugen->lists[gen][type][zone]))
-+ goto next;
-+ }
-+
-+ min_seq[type]++;
-+ }
-+next:
-+ ;
-+ }
-+
-+ min_seq[0] = min(min_seq[0], min_seq[1]);
-+ if (swappiness)
-+ min_seq[1] = max(min_seq[0], lrugen->min_seq[1]);
-+
-+ for (type = 0; type < ANON_AND_FILE; type++) {
-+ if (min_seq[type] == lrugen->min_seq[type])
-+ continue;
-+
-+ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
-+ success = true;
-+ }
-+
-+ return success;
-+}
-+
-+static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq)
-+{
-+ int gen, type, zone;
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ spin_lock_irq(&lruvec->lru_lock);
-+
-+ VM_BUG_ON(!seq_is_valid(lruvec));
-+
-+ if (max_seq != lrugen->max_seq)
-+ goto unlock;
-+
-+ if (!try_to_inc_min_seq(lruvec, true)) {
-+ for (type = ANON_AND_FILE - 1; type >= 0; type--) {
-+ while (!inc_min_seq(lruvec, type)) {
-+ spin_unlock_irq(&lruvec->lru_lock);
-+ cond_resched();
-+ spin_lock_irq(&lruvec->lru_lock);
-+ }
-+ }
-+ }
-+
-+ gen = lru_gen_from_seq(lrugen->max_seq - 1);
-+ for (type = 0; type < ANON_AND_FILE; type++) {
-+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-+ enum lru_list lru = type * LRU_FILE;
-+ long delta = lrugen->sizes[gen][type][zone];
-+
-+ if (!delta)
-+ continue;
-+
-+ WARN_ON_ONCE(delta != (int)delta);
-+
-+ update_lru_size(lruvec, lru, zone, delta);
-+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
-+ }
-+ }
-+
-+ gen = lru_gen_from_seq(lrugen->max_seq + 1);
-+ for (type = 0; type < ANON_AND_FILE; type++) {
-+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-+ enum lru_list lru = type * LRU_FILE;
-+ long delta = lrugen->sizes[gen][type][zone];
-+
-+ if (!delta)
-+ continue;
-+
-+ WARN_ON_ONCE(delta != (int)delta);
-+
-+ update_lru_size(lruvec, lru, zone, -delta);
-+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
-+ }
-+ }
-+
-+ WRITE_ONCE(lrugen->timestamps[gen], jiffies);
-+ /* make sure all preceding modifications appear first */
-+ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
-+unlock:
-+ spin_unlock_irq(&lruvec->lru_lock);
-+}
-+
-+/* Main function used by the foreground, the background and the user-triggered aging. */
-+static bool try_to_inc_max_seq(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
-+ unsigned long max_seq, bool use_filter)
-+{
-+ bool last;
-+ struct mm_walk_args *args;
-+ struct mm_struct *mm = NULL;
-+ struct lrugen *lrugen = &lruvec->evictable;
-+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
-+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-+ int nid = pgdat->node_id;
-+
-+ VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq));
-+
-+ /*
-+ * If we are not from run_aging() and clearing the accessed bit may
-+ * trigger page faults, then don't proceed to clearing all accessed
-+ * PTEs. Instead, fall back to lru_gen_look_around(), which only clears a
-+ * handful of accessed PTEs. This is less efficient but causes fewer
-+ * page faults on CPUs that don't have the capability.
-+ */
-+ if ((current->flags & PF_MEMALLOC) && !arch_has_hw_pte_young(false)) {
-+ inc_max_seq(lruvec, max_seq);
-+ return true;
-+ }
-+
-+ args = alloc_mm_walk_args();
-+ if (!args)
-+ return false;
-+
-+ args->memcg = memcg;
-+ args->max_seq = max_seq;
-+ args->start_pfn = pgdat->node_start_pfn;
-+ args->end_pfn = pgdat_end_pfn(pgdat);
-+ args->node_id = nid;
-+ args->swappiness = swappiness;
-+ args->use_filter = use_filter;
-+
-+ do {
-+ last = get_next_mm(lruvec, args, &mm);
-+ if (mm)
-+ walk_mm(lruvec, mm, args);
-+
-+ cond_resched();
-+ } while (mm);
-+
-+ free_mm_walk_args(args);
-+
-+ if (!last) {
-+ /* don't wait unless we may have trouble reclaiming */
-+ if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2)
-+ wait_event_killable(lruvec->mm_walk.wait,
-+ max_seq < READ_ONCE(lrugen->max_seq));
-+
-+ return max_seq < READ_ONCE(lrugen->max_seq);
-+ }
-+
-+ VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq));
-+
-+ inc_max_seq(lruvec, max_seq);
-+ /* either we see any waiters or they will see updated max_seq */
-+ if (wq_has_sleeper(&lruvec->mm_walk.wait))
-+ wake_up_all(&lruvec->mm_walk.wait);
-+
-+ wakeup_flusher_threads(WB_REASON_VMSCAN);
-+
-+ return true;
-+}
-+
-+static long get_nr_evictable(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
-+ unsigned long max_seq, unsigned long *min_seq, bool *low)
-+{
-+ int gen, type, zone;
-+ long max = 0;
-+ long min = 0;
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ for (type = !swappiness; type < ANON_AND_FILE; type++) {
-+ unsigned long seq;
-+
-+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
-+ long size = 0;
-+
-+ gen = lru_gen_from_seq(seq);
-+
-+ for (zone = 0; zone <= sc->reclaim_idx; zone++)
-+ size += READ_ONCE(lrugen->sizes[gen][type][zone]);
-+
-+ max += size;
-+ if (type && max_seq - seq >= MIN_NR_GENS)
-+ min += size;
-+ }
-+ }
-+
-+ *low = max_seq - min_seq[1] <= MIN_NR_GENS && min < MIN_BATCH_SIZE;
-+
-+ return max > 0 ? max : 0;
-+}
-+
-+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc,
-+ unsigned long min_ttl)
-+{
-+ bool low;
-+ long nr_to_scan;
-+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
-+ int swappiness = get_swappiness(memcg);
-+ DEFINE_MAX_SEQ(lruvec);
-+ DEFINE_MIN_SEQ(lruvec);
-+
-+ if (mem_cgroup_below_min(memcg))
-+ return false;
-+
-+ if (min_ttl) {
-+ int gen = lru_gen_from_seq(min_seq[1]);
-+ unsigned long birth = READ_ONCE(lruvec->evictable.timestamps[gen]);
-+
-+ if (time_is_after_jiffies(birth + min_ttl))
-+ return false;
-+ }
-+
-+ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low);
-+ if (!nr_to_scan)
-+ return false;
-+
-+ nr_to_scan >>= sc->priority;
-+
-+ if (!mem_cgroup_online(memcg))
-+ nr_to_scan++;
-+
-+ if (nr_to_scan && low && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim))
-+ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true);
-+
-+ return true;
-+}
-+
-+/* Protect the working set accessed within the last N milliseconds. */
-+static unsigned long lru_gen_min_ttl __read_mostly;
-+
-+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
-+{
-+ struct mem_cgroup *memcg;
-+ bool success = false;
-+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
-+
-+ VM_BUG_ON(!current_is_kswapd());
-+
-+ if (!sc->force_deactivate) {
-+ sc->force_deactivate = 1;
-+ return;
-+ }
-+
-+ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args;
-+
-+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
-+ do {
-+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
-+
-+ if (age_lruvec(lruvec, sc, min_ttl))
-+ success = true;
-+
-+ cond_resched();
-+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
-+
-+ if (!success && mutex_trylock(&oom_lock)) {
-+ struct oom_control oc = {
-+ .gfp_mask = sc->gfp_mask,
-+ .order = sc->order,
-+ };
-+
-+ /* to avoid overkilling */
-+ if (!oom_reaping_in_progress())
-+ out_of_memory(&oc);
-+
-+ mutex_unlock(&oom_lock);
-+ }
-+
-+ current->reclaim_state->mm_walk_args = NULL;
-+}
-+
-+/* Scan the vicinity of an accessed PTE when shrink_page_list() uses the rmap. */
-+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
-+{
-+ int i;
-+ pte_t *pte;
-+ struct page *page;
-+ int old_gen, new_gen;
-+ unsigned long start;
-+ unsigned long end;
-+ unsigned long addr;
-+ struct mm_walk_args *args;
-+ int worth = 0;
-+ struct mem_cgroup *memcg = page_memcg(pvmw->page);
-+ struct pglist_data *pgdat = page_pgdat(pvmw->page);
-+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
-+ DEFINE_MAX_SEQ(lruvec);
-+
-+ lockdep_assert_held(pvmw->ptl);
-+ VM_BUG_ON_PAGE(PageLRU(pvmw->page), pvmw->page);
-+
-+ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL;
-+ if (!args)
-+ return;
-+
-+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
-+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
-+
-+ if (end - start > MIN_BATCH_SIZE * PAGE_SIZE) {
-+ if (pvmw->address - start < MIN_BATCH_SIZE * PAGE_SIZE / 2)
-+ end = start + MIN_BATCH_SIZE * PAGE_SIZE;
-+ else if (end - pvmw->address < MIN_BATCH_SIZE * PAGE_SIZE / 2)
-+ start = end - MIN_BATCH_SIZE * PAGE_SIZE;
-+ else {
-+ start = pvmw->address - MIN_BATCH_SIZE * PAGE_SIZE / 2;
-+ end = pvmw->address + MIN_BATCH_SIZE * PAGE_SIZE / 2;
-+ }
-+ }
-+
-+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
-+ new_gen = lru_gen_from_seq(max_seq);
-+
-+ lock_page_memcg(pvmw->page);
-+ arch_enter_lazy_mmu_mode();
-+
-+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
-+ unsigned long pfn = pte_pfn(pte[i]);
-+
-+ if (!pte_present(pte[i]) || is_zero_pfn(pfn))
-+ continue;
-+
-+ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i])))
-+ continue;
-+
-+ VM_BUG_ON(!pfn_valid(pfn));
-+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
-+ continue;
-+
-+ worth++;
-+
-+ if (!pte_young(pte[i]))
-+ continue;
-+
-+ page = compound_head(pfn_to_page(pfn));
-+ if (page_to_nid(page) != pgdat->node_id)
-+ continue;
-+
-+ if (page_memcg_rcu(page) != memcg)
-+ continue;
-+
-+ VM_BUG_ON(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end);
-+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
-+ continue;
-+
-+ if (pte_dirty(pte[i]) && !PageDirty(page) &&
-+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page)))
-+ __set_bit(i, args->bitmap);
-+
-+ old_gen = page_update_gen(page, new_gen);
-+ if (old_gen >= 0 && old_gen != new_gen)
-+ update_batch_size(page, old_gen, new_gen, args);
-+ }
-+
-+ arch_leave_lazy_mmu_mode();
-+ unlock_page_memcg(pvmw->page);
-+
-+ if (worth >= MIN_BATCH_SIZE / 2)
-+ set_bloom_filter(lruvec, max_seq, pvmw->pmd);
-+
-+ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE)
-+ set_page_dirty(pte_page(pte[i]));
-+
-+ bitmap_zero(args->bitmap, MIN_BATCH_SIZE);
-+}
-+
-+/******************************************************************************
- * state change
- ******************************************************************************/
-
-@@ -3477,6 +4414,12 @@ static int __init init_lru_gen(void)
- };
- late_initcall(init_lru_gen);
-
-+#else
-+
-+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
-+{
-+}
-+
- #endif /* CONFIG_LRU_GEN */
-
- static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
-@@ -4333,6 +5276,11 @@ static void age_active_anon(struct pglis
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
-+ if (lru_gen_enabled()) {
-+ lru_gen_age_node(pgdat, sc);
-+ return;
-+ }
-+
- if (!can_age_anon_pages(pgdat, sc))
- return;
-
+++ /dev/null
-From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001
-Date: Tue, 29 Jun 2021 20:46:47 -0600
-Subject: [PATCH 07/10] mm: multigenerational lru: eviction
-
-The eviction consumes old generations. Given an lruvec, the eviction
-scans pages on lrugen->lists indexed by anon and file min_seq[]
-(modulo MAX_NR_GENS). It first tries to select a type based on the
-values of min_seq[]. If they are equal, it selects the type that has
-a lower refaulted %. The eviction sorts a page according to its
-updated generation number if the aging has found this page accessed.
-It also moves a page to the next generation if this page is from an
-upper tier that has a higher refaulted % than the base tier. The
-eviction increments min_seq[] of a selected type when it finds
-lrugen->lists indexed by min_seq[] of this selected type are empty.
-
-Each generation is divided into multiple tiers. Tiers represent
-different ranges of numbers of accesses from file descriptors only.
-Pages accessed N times via file descriptors belong to tier
-order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers,
-and they require additional MAX_NR_TIERS-2 bits in page->flags. In
-contrast to moving between generations which requires list operations,
-moving between tiers only involves operations on page->flags and
-therefore has a negligible cost. A feedback loop modeled after the PID
-controller monitors refaulted % across all tiers and decides when to
-protect pages from which tiers.
-
-Unmapped pages are initially added to the oldest generation and then
-conditionally protected by tiers. Each tier keeps track of how many
-pages from it have refaulted. Tier 0 is the base tier and pages from
-it are evicted unconditionally because there are no better candidates.
-Pages from an upper tier are either evicted or moved to the next
-generation, depending on whether this upper tier has a higher
-refaulted % than the base tier. This model has the following
-advantages:
- 1) It removes the cost in the buffered access path and reduces the
- overall cost of protection because pages are conditionally protected
- in the reclaim path.
- 2) It takes mapped pages into account and avoids overprotecting
- pages accessed multiple times via file descriptors.
- 3) Additional tiers improve the protection of pages accessed more
- than twice.
-
-Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac
----
- include/linux/mm_inline.h | 10 +
- include/linux/mmzone.h | 33 +++
- mm/swap.c | 42 +++
- mm/vmscan.c | 555 +++++++++++++++++++++++++++++++++++++-
- mm/workingset.c | 120 ++++++++-
- 5 files changed, 757 insertions(+), 3 deletions(-)
-
---- a/include/linux/mm_inline.h
-+++ b/include/linux/mm_inline.h
-@@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi
- return seq % NR_HIST_GENS;
- }
-
-+/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */
-+static inline int lru_tier_from_refs(int refs)
-+{
-+ VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH));
-+
-+ return order_base_2(refs + 1);
-+}
-+
- /* The youngest and the second youngest generations are counted as active. */
- static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
- {
-@@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru
- gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
-
- new_flags &= ~LRU_GEN_MASK;
-+ if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS)
-+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
- /* for shrink_page_list() */
- if (reclaiming)
- new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
---- a/include/linux/mmzone.h
-+++ b/include/linux/mmzone.h
-@@ -319,6 +319,30 @@ struct page_vma_mapped_walk;
- #define MIN_NR_GENS 2
- #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
-
-+/*
-+ * Each generation is divided into multiple tiers. Tiers represent different
-+ * ranges of numbers of accesses from file descriptors, i.e.,
-+ * mark_page_accessed(). In contrast to moving between generations which
-+ * requires the lru lock, moving between tiers only involves an atomic
-+ * operation on page->flags and therefore has a negligible cost.
-+ *
-+ * The purposes of tiers are to:
-+ * 1) estimate whether pages accessed multiple times via file descriptors are
-+ * more active than pages accessed only via page tables by separating the two
-+ * access types into upper tiers and the base tier, and comparing refaulted %
-+ * across all tiers.
-+ * 2) improve buffered io performance by deferring the protection of pages
-+ * accessed multiple times until the eviction. That is, the protection happens
-+ * in the reclaim path, not the access path.
-+ *
-+ * Pages accessed N times via file descriptors belong to tier order_base_2(N).
-+ * The base tier may be marked by PageReferenced(). All upper tiers are marked
-+ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are
-+ * used to support more than one upper tier.
-+ */
-+#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN)
-+#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
-+
- /* Whether to keep stats for historical generations. */
- #ifdef CONFIG_LRU_GEN_STATS
- #define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
-@@ -337,6 +361,15 @@ struct lrugen {
- struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
- /* the sizes of the multigenerational lru lists in pages */
- unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
-+ /* the exponential moving average of refaulted */
-+ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
-+ /* the exponential moving average of protected+evicted */
-+ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
-+ /* the base tier isn't protected, hence the minus one */
-+ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
-+ /* incremented without holding the lru lock */
-+ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
-+ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
- /* whether the multigenerational lru is enabled */
- bool enabled[ANON_AND_FILE];
- };
---- a/mm/swap.c
-+++ b/mm/swap.c
-@@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st
- local_unlock(&lru_pvecs.lock);
- }
-
-+#ifdef CONFIG_LRU_GEN
-+static void page_inc_refs(struct page *page)
-+{
-+ unsigned long refs;
-+ unsigned long old_flags, new_flags;
-+
-+ if (PageUnevictable(page))
-+ return;
-+
-+ /* see the comment on MAX_NR_TIERS */
-+ do {
-+ new_flags = old_flags = READ_ONCE(page->flags);
-+
-+ if (!(new_flags & BIT(PG_referenced))) {
-+ new_flags |= BIT(PG_referenced);
-+ continue;
-+ }
-+
-+ if (!(new_flags & BIT(PG_workingset))) {
-+ new_flags |= BIT(PG_workingset);
-+ continue;
-+ }
-+
-+ refs = new_flags & LRU_REFS_MASK;
-+ refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK);
-+
-+ new_flags &= ~LRU_REFS_MASK;
-+ new_flags |= refs;
-+ } while (new_flags != old_flags &&
-+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
-+}
-+#else
-+static void page_inc_refs(struct page *page)
-+{
-+}
-+#endif /* CONFIG_LRU_GEN */
-+
- /*
- * Mark a page as having seen activity.
- *
-@@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag
- {
- page = compound_head(page);
-
-+ if (lru_gen_enabled()) {
-+ page_inc_refs(page);
-+ return;
-+ }
-+
- if (!PageReferenced(page)) {
- SetPageReferenced(page);
- } else if (PageUnevictable(page)) {
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre
-
- if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page_private(page) };
-- mem_cgroup_swapout(page, swap);
-+
-+ /* get a shadow entry before page_memcg() is cleared */
- if (reclaimed && !mapping_exiting(mapping))
- shadow = workingset_eviction(page, target_memcg);
-+ mem_cgroup_swapout(page, swap);
- __delete_from_swap_cache(page, swap, shadow);
- xa_unlock_irq(&mapping->i_pages);
- put_swap_page(page, swap);
-@@ -1410,6 +1412,11 @@ retry:
- if (!sc->may_unmap && page_mapped(page))
- goto keep_locked;
-
-+ /* lru_gen_look_around() has updated this page? */
-+ if (lru_gen_enabled() && !ignore_references &&
-+ page_mapped(page) && PageReferenced(page))
-+ goto keep_locked;
-+
- may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
- (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
-
-@@ -2570,6 +2577,9 @@ static void prepare_scan_count(pg_data_t
- unsigned long file;
- struct lruvec *target_lruvec;
-
-+ if (lru_gen_enabled())
-+ return;
-+
- target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
-
- /*
-@@ -2910,6 +2920,17 @@ static int page_lru_gen(struct page *pag
- return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
- }
-
-+static int page_lru_tier(struct page *page)
-+{
-+ int refs;
-+ unsigned long flags = READ_ONCE(page->flags);
-+
-+ refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ?
-+ ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0;
-+
-+ return lru_tier_from_refs(refs);
-+}
-+
- static int get_swappiness(struct mem_cgroup *memcg)
- {
- return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ?
-@@ -3246,6 +3267,91 @@ done:
- }
-
- /******************************************************************************
-+ * refault feedback loop
-+ ******************************************************************************/
-+
-+/*
-+ * A feedback loop modeled after the PID controller. Currently supports the
-+ * proportional (P) and the integral (I) terms; the derivative (D) term can be
-+ * added if necessary. The setpoint (SP) is the desired position; the process
-+ * variable (PV) is the measured position. The error is the difference between
-+ * the SP and the PV. A positive error results in a positive control output
-+ * correction, which, in our case, is to allow eviction.
-+ *
-+ * The P term is refaulted % of the current generation being evicted. The I
-+ * term is the exponential moving average of refaulted % of previously evicted
-+ * generations, using the smoothing factor 1/2.
-+ *
-+ * Our goal is to maintain proportional refaulted % across all tiers.
-+ */
-+struct ctrl_pos {
-+ unsigned long refaulted;
-+ unsigned long total;
-+ int gain;
-+};
-+
-+static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
-+ struct ctrl_pos *pos)
-+{
-+ struct lrugen *lrugen = &lruvec->evictable;
-+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
-+
-+ pos->refaulted = lrugen->avg_refaulted[type][tier] +
-+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
-+ pos->total = lrugen->avg_total[type][tier] +
-+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
-+ if (tier)
-+ pos->total += lrugen->protected[hist][type][tier - 1];
-+ pos->gain = gain;
-+}
-+
-+static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type)
-+{
-+ int tier;
-+ int hist = lru_hist_from_seq(gen);
-+ struct lrugen *lrugen = &lruvec->evictable;
-+ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]);
-+ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
-+
-+ if (!carryover && !clear)
-+ return;
-+
-+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
-+ if (carryover) {
-+ unsigned long sum;
-+
-+ sum = lrugen->avg_refaulted[type][tier] +
-+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
-+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
-+
-+ sum = lrugen->avg_total[type][tier] +
-+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
-+ if (tier)
-+ sum += lrugen->protected[hist][type][tier - 1];
-+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
-+ }
-+
-+ if (clear) {
-+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
-+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
-+ if (tier)
-+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
-+ }
-+ }
-+}
-+
-+static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
-+{
-+ /*
-+ * Allow eviction if the PV has a limited number of refaulted pages or a
-+ * lower refaulted % than the SP.
-+ */
-+ return pv->refaulted < MIN_BATCH_SIZE ||
-+ pv->refaulted * max(sp->total, 1UL) * sp->gain <=
-+ sp->refaulted * max(pv->total, 1UL) * pv->gain;
-+}
-+
-+/******************************************************************************
- * the aging
- ******************************************************************************/
-
-@@ -3265,6 +3371,7 @@ static int page_update_gen(struct page *
-
- new_flags &= ~LRU_GEN_MASK;
- new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
-+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
- } while (new_flags != old_flags &&
- cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
-
-@@ -3296,6 +3403,7 @@ static void page_inc_gen(struct page *pa
-
- new_flags &= ~LRU_GEN_MASK;
- new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
-+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
- /* for end_page_writeback() */
- if (reclaiming)
- new_flags |= BIT(PG_reclaim);
-@@ -3787,6 +3895,7 @@ static bool inc_min_seq(struct lruvec *l
- }
- }
-
-+ reset_ctrl_pos(lruvec, gen, type);
- WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
-
- return true;
-@@ -3824,6 +3933,8 @@ next:
- if (min_seq[type] == lrugen->min_seq[type])
- continue;
-
-+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
-+ reset_ctrl_pos(lruvec, gen, type);
- WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
- success = true;
- }
-@@ -3885,6 +3996,9 @@ static void inc_max_seq(struct lruvec *l
- }
- }
-
-+ for (type = 0; type < ANON_AND_FILE; type++)
-+ reset_ctrl_pos(lruvec, gen, type);
-+
- WRITE_ONCE(lrugen->timestamps[gen], jiffies);
- /* make sure all preceding modifications appear first */
- smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
-@@ -4166,6 +4280,433 @@ void lru_gen_look_around(struct page_vma
- }
-
- /******************************************************************************
-+ * the eviction
-+ ******************************************************************************/
-+
-+static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx)
-+{
-+ bool success;
-+ int gen = page_lru_gen(page);
-+ int type = page_is_file_lru(page);
-+ int zone = page_zonenum(page);
-+ int tier = page_lru_tier(page);
-+ int delta = thp_nr_pages(page);
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page);
-+
-+ /* an mlocked page? */
-+ if (!page_evictable(page)) {
-+ success = lru_gen_del_page(page, lruvec, true);
-+ VM_BUG_ON_PAGE(!success, page);
-+ SetPageUnevictable(page);
-+ add_page_to_lru_list(page, lruvec);
-+ __count_vm_events(UNEVICTABLE_PGCULLED, delta);
-+ return true;
-+ }
-+
-+ /* a lazy-free page that has been written into? */
-+ if (type && PageDirty(page) && PageAnon(page)) {
-+ success = lru_gen_del_page(page, lruvec, true);
-+ VM_BUG_ON_PAGE(!success, page);
-+ SetPageSwapBacked(page);
-+ add_page_to_lru_list_tail(page, lruvec);
-+ return true;
-+ }
-+
-+ /* page_update_gen() has updated this page? */
-+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
-+ list_move(&page->lru, &lrugen->lists[gen][type][zone]);
-+ return true;
-+ }
-+
-+ /* protect this page if its tier has a higher refaulted % */
-+ if (tier > tier_idx) {
-+ int hist = lru_hist_from_seq(gen);
-+
-+ page_inc_gen(page, lruvec, false);
-+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
-+ lrugen->protected[hist][type][tier - 1] + delta);
-+ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
-+ return true;
-+ }
-+
-+ /* mark this page for reclaim if it's pending writeback */
-+ if (PageWriteback(page) || (type && PageDirty(page))) {
-+ page_inc_gen(page, lruvec, true);
-+ return true;
-+ }
-+
-+ return false;
-+}
-+
-+static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc)
-+{
-+ bool success;
-+
-+ if (!sc->may_unmap && page_mapped(page))
-+ return false;
-+
-+ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
-+ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page))))
-+ return false;
-+
-+ if (!get_page_unless_zero(page))
-+ return false;
-+
-+ if (!TestClearPageLRU(page)) {
-+ put_page(page);
-+ return false;
-+ }
-+
-+ success = lru_gen_del_page(page, lruvec, true);
-+ VM_BUG_ON_PAGE(!success, page);
-+
-+ return true;
-+}
-+
-+static int scan_pages(struct lruvec *lruvec, struct scan_control *sc,
-+ int type, int tier, struct list_head *list)
-+{
-+ int gen, zone;
-+ enum vm_event_item item;
-+ int sorted = 0;
-+ int scanned = 0;
-+ int isolated = 0;
-+ int remaining = MAX_BATCH_SIZE;
-+ struct lrugen *lrugen = &lruvec->evictable;
-+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
-+
-+ VM_BUG_ON(!list_empty(list));
-+
-+ if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
-+ return 0;
-+
-+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
-+
-+ for (zone = sc->reclaim_idx; zone >= 0; zone--) {
-+ LIST_HEAD(moved);
-+ int skipped = 0;
-+ struct list_head *head = &lrugen->lists[gen][type][zone];
-+
-+ while (!list_empty(head)) {
-+ struct page *page = lru_to_page(head);
-+ int delta = thp_nr_pages(page);
-+
-+ VM_BUG_ON_PAGE(PageTail(page), page);
-+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
-+ VM_BUG_ON_PAGE(PageActive(page), page);
-+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
-+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
-+
-+ prefetchw_prev_lru_page(page, head, flags);
-+
-+ scanned += delta;
-+
-+ if (sort_page(page, lruvec, tier))
-+ sorted += delta;
-+ else if (isolate_page(page, lruvec, sc)) {
-+ list_add(&page->lru, list);
-+ isolated += delta;
-+ } else {
-+ list_move(&page->lru, &moved);
-+ skipped += delta;
-+ }
-+
-+ if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE)
-+ break;
-+ }
-+
-+ if (skipped) {
-+ list_splice(&moved, head);
-+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
-+ }
-+
-+ if (!remaining || isolated >= MIN_BATCH_SIZE)
-+ break;
-+ }
-+
-+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
-+ if (!cgroup_reclaim(sc)) {
-+ __count_vm_events(item, isolated);
-+ __count_vm_events(PGREFILL, sorted);
-+ }
-+ __count_memcg_events(memcg, item, isolated);
-+ __count_memcg_events(memcg, PGREFILL, sorted);
-+ __count_vm_events(PGSCAN_ANON + type, isolated);
-+
-+ /*
-+ * We may have trouble finding eligible pages due to reclaim_idx,
-+ * may_unmap and may_writepage. Check `remaining` to make sure we won't
-+ * be stuck if we aren't making enough progress.
-+ */
-+ return isolated || !remaining ? scanned : 0;
-+}
-+
-+static int get_tier_idx(struct lruvec *lruvec, int type)
-+{
-+ int tier;
-+ struct ctrl_pos sp, pv;
-+
-+ /*
-+ * Ideally we don't want to evict upper tiers that have higher refaulted
-+ * %. However, we need to leave a margin for the fluctuation in
-+ * refaulted %. So we use a larger gain factor to make sure upper tiers
-+ * are indeed more active. We choose 2 because the lowest upper tier
-+ * would have twice the refaulted % of the base tier, according to their
-+ * numbers of accesses.
-+ */
-+ read_ctrl_pos(lruvec, type, 0, 1, &sp);
-+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
-+ read_ctrl_pos(lruvec, type, tier, 2, &pv);
-+ if (!positive_ctrl_err(&sp, &pv))
-+ break;
-+ }
-+
-+ return tier - 1;
-+}
-+
-+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
-+{
-+ int type, tier;
-+ struct ctrl_pos sp, pv;
-+ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
-+
-+ /*
-+ * Compare refaulted % between the base tiers of anon and file to
-+ * determine which type to evict. Also need to compare refaulted % of
-+ * the upper tiers of the selected type with that of the base tier of
-+ * the other type to determine which tier of the selected type to evict.
-+ */
-+ read_ctrl_pos(lruvec, 0, 0, gain[0], &sp);
-+ read_ctrl_pos(lruvec, 1, 0, gain[1], &pv);
-+ type = positive_ctrl_err(&sp, &pv);
-+
-+ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
-+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
-+ read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
-+ if (!positive_ctrl_err(&sp, &pv))
-+ break;
-+ }
-+
-+ *tier_idx = tier - 1;
-+
-+ return type;
-+}
-+
-+static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
-+ int *type_scanned, struct list_head *list)
-+{
-+ int i;
-+ int type;
-+ int scanned;
-+ int tier = -1;
-+ DEFINE_MIN_SEQ(lruvec);
-+
-+ VM_BUG_ON(!seq_is_valid(lruvec));
-+
-+ /*
-+ * Try to select a type based on generations and swappiness, and if that
-+ * fails, fall back to get_type_to_scan(). When anon and file are both
-+ * available from the same generation, swappiness 200 is interpreted as
-+ * anon first and swappiness 1 is interpreted as file first.
-+ */
-+ if (!swappiness)
-+ type = 1;
-+ else if (min_seq[0] < min_seq[1])
-+ type = 0;
-+ else if (swappiness == 1)
-+ type = 1;
-+ else if (swappiness == 200)
-+ type = 0;
-+ else
-+ type = get_type_to_scan(lruvec, swappiness, &tier);
-+
-+ for (i = !swappiness; i < ANON_AND_FILE; i++) {
-+ if (tier < 0)
-+ tier = get_tier_idx(lruvec, type);
-+
-+ scanned = scan_pages(lruvec, sc, type, tier, list);
-+ if (scanned)
-+ break;
-+
-+ type = !type;
-+ tier = -1;
-+ }
-+
-+ *type_scanned = type;
-+
-+ return scanned;
-+}
-+
-+/* Main function used by the foreground, the background and the user-triggered eviction. */
-+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
-+{
-+ int type;
-+ int scanned;
-+ int reclaimed;
-+ LIST_HEAD(list);
-+ struct page *page;
-+ enum vm_event_item item;
-+ struct reclaim_stat stat;
-+ struct mm_walk_args *args;
-+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
-+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-+
-+ spin_lock_irq(&lruvec->lru_lock);
-+
-+ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list);
-+
-+ if (try_to_inc_min_seq(lruvec, swappiness))
-+ scanned++;
-+
-+ if (get_nr_gens(lruvec, 1) == MIN_NR_GENS)
-+ scanned = 0;
-+
-+ spin_unlock_irq(&lruvec->lru_lock);
-+
-+ if (list_empty(&list))
-+ return scanned;
-+
-+ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
-+ /*
-+ * We need to prevent rejected pages from being added back to the same
-+ * lists they were isolated from. Otherwise we may risk looping on them
-+ * forever.
-+ */
-+ list_for_each_entry(page, &list, lru) {
-+ if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page)))
-+ SetPageActive(page);
-+
-+ ClearPageReferenced(page);
-+ ClearPageWorkingset(page);
-+ }
-+
-+ spin_lock_irq(&lruvec->lru_lock);
-+
-+ move_pages_to_lru(lruvec, &list);
-+
-+ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL;
-+ if (args && args->batch_size)
-+ reset_batch_size(lruvec, args);
-+
-+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
-+ if (!cgroup_reclaim(sc))
-+ __count_vm_events(item, reclaimed);
-+ __count_memcg_events(memcg, item, reclaimed);
-+ __count_vm_events(PGSTEAL_ANON + type, reclaimed);
-+
-+ spin_unlock_irq(&lruvec->lru_lock);
-+
-+ mem_cgroup_uncharge_list(&list);
-+ free_unref_page_list(&list);
-+
-+ sc->nr_reclaimed += reclaimed;
-+
-+ return scanned;
-+}
-+
-+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
-+{
-+ bool low;
-+ long nr_to_scan;
-+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
-+ int priority = sc->priority;
-+ DEFINE_MAX_SEQ(lruvec);
-+ DEFINE_MIN_SEQ(lruvec);
-+
-+ if (mem_cgroup_below_min(memcg) ||
-+ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
-+ return 0;
-+
-+ if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
-+ priority = DEF_PRIORITY;
-+ sc->force_deactivate = 0;
-+ }
-+
-+ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low);
-+ if (!nr_to_scan)
-+ return 0;
-+
-+ nr_to_scan >>= priority;
-+
-+ if (!mem_cgroup_online(memcg))
-+ nr_to_scan++;
-+
-+ if (!nr_to_scan)
-+ return 0;
-+
-+ if (current_is_kswapd()) {
-+ /* leave the work to lru_gen_age_node() */
-+ if (max_seq - min_seq[1] < MIN_NR_GENS)
-+ return 0;
-+
-+ if (!low)
-+ sc->force_deactivate = 0;
-+
-+ return nr_to_scan;
-+ }
-+
-+ if (max_seq - min_seq[1] >= MIN_NR_GENS)
-+ return nr_to_scan;
-+
-+ /* move onto slab and other memcgs if we haven't tried them all */
-+ if (!sc->force_deactivate) {
-+ sc->skipped_deactivate = 1;
-+ return 0;
-+ }
-+
-+ return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0;
-+}
-+
-+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
-+{
-+ struct blk_plug plug;
-+ long scanned = 0;
-+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
-+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-+
-+ lru_add_drain();
-+
-+ if (current_is_kswapd())
-+ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args;
-+
-+ blk_start_plug(&plug);
-+
-+ while (true) {
-+ int delta;
-+ int swappiness;
-+ long nr_to_scan;
-+
-+ if (sc->may_swap)
-+ swappiness = get_swappiness(memcg);
-+ else if (!cgroup_reclaim(sc) && get_swappiness(memcg))
-+ swappiness = 1;
-+ else
-+ swappiness = 0;
-+
-+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
-+ if (!nr_to_scan)
-+ break;
-+
-+ delta = evict_pages(lruvec, sc, swappiness);
-+ if (!delta)
-+ break;
-+
-+ scanned += delta;
-+ if (scanned >= nr_to_scan)
-+ break;
-+
-+ cond_resched();
-+ }
-+
-+ blk_finish_plug(&plug);
-+
-+ if (current_is_kswapd())
-+ current->reclaim_state->mm_walk_args = NULL;
-+}
-+
-+/******************************************************************************
- * state change
- ******************************************************************************/
-
-@@ -4420,6 +4961,10 @@ static void lru_gen_age_node(struct pgli
- {
- }
-
-+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
-+{
-+}
-+
- #endif /* CONFIG_LRU_GEN */
-
- static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
-@@ -4433,6 +4978,11 @@ static void shrink_lruvec(struct lruvec
- struct blk_plug plug;
- bool scan_adjusted;
-
-+ if (lru_gen_enabled()) {
-+ lru_gen_shrink_lruvec(lruvec, sc);
-+ return;
-+ }
-+
- get_scan_count(lruvec, sc, nr);
-
- /* Record the original scan target for proportional adjustments later */
-@@ -4906,6 +5456,9 @@ static void snapshot_refaults(struct mem
- struct lruvec *target_lruvec;
- unsigned long refaults;
-
-+ if (lru_gen_enabled())
-+ return;
-+
- target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
- refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
- target_lruvec->refaults[0] = refaults;
---- a/mm/workingset.c
-+++ b/mm/workingset.c
-@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_
- static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
- bool workingset)
- {
-- eviction >>= bucket_order;
- eviction &= EVICTION_MASK;
- eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
- eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
-@@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow,
-
- *memcgidp = memcgid;
- *pgdat = NODE_DATA(nid);
-- *evictionp = entry << bucket_order;
-+ *evictionp = entry;
- *workingsetp = workingset;
- }
-
-+#ifdef CONFIG_LRU_GEN
-+
-+static int page_lru_refs(struct page *page)
-+{
-+ unsigned long flags = READ_ONCE(page->flags);
-+
-+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
-+
-+ /* see the comment on MAX_NR_TIERS */
-+ return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0;
-+}
-+
-+/* Return a token to be stored in the shadow entry of a page being evicted. */
-+static void *lru_gen_eviction(struct page *page)
-+{
-+ int hist, tier;
-+ unsigned long token;
-+ unsigned long min_seq;
-+ struct lruvec *lruvec;
-+ struct lrugen *lrugen;
-+ int type = page_is_file_lru(page);
-+ int refs = page_lru_refs(page);
-+ int delta = thp_nr_pages(page);
-+ bool workingset = PageWorkingset(page);
-+ struct mem_cgroup *memcg = page_memcg(page);
-+ struct pglist_data *pgdat = page_pgdat(page);
-+
-+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
-+ lrugen = &lruvec->evictable;
-+ min_seq = READ_ONCE(lrugen->min_seq[type]);
-+ token = (min_seq << LRU_REFS_WIDTH) | refs;
-+
-+ hist = lru_hist_from_seq(min_seq);
-+ tier = lru_tier_from_refs(refs + workingset);
-+ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
-+
-+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
-+}
-+
-+/* Count a refaulted page based on the token stored in its shadow entry. */
-+static void lru_gen_refault(struct page *page, void *shadow)
-+{
-+ int hist, tier, refs;
-+ int memcg_id;
-+ bool workingset;
-+ unsigned long token;
-+ unsigned long min_seq;
-+ struct lruvec *lruvec;
-+ struct lrugen *lrugen;
-+ struct mem_cgroup *memcg;
-+ struct pglist_data *pgdat;
-+ int type = page_is_file_lru(page);
-+ int delta = thp_nr_pages(page);
-+
-+ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
-+ if (page_pgdat(page) != pgdat)
-+ return;
-+
-+ rcu_read_lock();
-+ memcg = page_memcg_rcu(page);
-+ if (mem_cgroup_id(memcg) != memcg_id)
-+ goto unlock;
-+
-+ refs = token & (BIT(LRU_REFS_WIDTH) - 1);
-+ if (refs && !workingset)
-+ goto unlock;
-+
-+ token >>= LRU_REFS_WIDTH;
-+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
-+ lrugen = &lruvec->evictable;
-+ min_seq = READ_ONCE(lrugen->min_seq[type]);
-+ if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
-+ goto unlock;
-+
-+ hist = lru_hist_from_seq(min_seq);
-+ tier = lru_tier_from_refs(refs + workingset);
-+ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
-+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
-+
-+ /*
-+ * Tiers don't offer any protection to pages accessed via page tables.
-+ * That's what generations do. Tiers can't fully protect pages after
-+ * their numbers of accesses have exceeded the max value. Conservatively
-+ * count these two conditions as stalls even though they might not
-+ * indicate any real memory pressure.
-+ */
-+ if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) {
-+ SetPageWorkingset(page);
-+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
-+ }
-+unlock:
-+ rcu_read_unlock();
-+}
-+
-+#else
-+
-+static void *lru_gen_eviction(struct page *page)
-+{
-+ return NULL;
-+}
-+
-+static void lru_gen_refault(struct page *page, void *shadow)
-+{
-+}
-+
-+#endif /* CONFIG_LRU_GEN */
-+
- /**
- * workingset_age_nonresident - age non-resident entries as LRU ages
- * @lruvec: the lruvec that was aged
-@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p
- VM_BUG_ON_PAGE(page_count(page), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
-
-+ if (lru_gen_enabled())
-+ return lru_gen_eviction(page);
-+
- lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
- /* XXX: target_memcg can be NULL, go through lruvec */
- memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
- eviction = atomic_long_read(&lruvec->nonresident_age);
-+ eviction >>= bucket_order;
- workingset_age_nonresident(lruvec, thp_nr_pages(page));
- return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
- }
-@@ -296,7 +406,13 @@ void workingset_refault(struct page *pag
- bool workingset;
- int memcgid;
-
-+ if (lru_gen_enabled()) {
-+ lru_gen_refault(page, shadow);
-+ return;
-+ }
-+
- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
-+ eviction <<= bucket_order;
-
- rcu_read_lock();
- /*
+++ /dev/null
-From 5cc7fdec54e87e32b4fb0f07d84b21769d5f8d92 Mon Sep 17 00:00:00 2001
-Date: Mon, 25 Jan 2021 21:38:02 -0700
-Subject: [PATCH 08/10] mm: multigenerational lru: user interface
-
-Add /sys/kernel/mm/lru_gen/enabled to enable and disable the
-multigenerational lru at runtime.
-
-Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set of a
-given number of milliseconds. The OOM killer is invoked if this
-working set cannot be kept in memory.
-
-Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and
-invoke the aging and the eviction. This file has the following output:
- memcg memcg_id memcg_path
- node node_id
- min_gen birth_time anon_size file_size
- ...
- max_gen birth_time anon_size file_size
-
-min_gen is the oldest generation number and max_gen is the youngest
-generation number. birth_time is in milliseconds. anon_size and
-file_size are in pages.
-
-This file takes the following input:
- + memcg_id node_id max_gen [swappiness] [use_bloom_filter]
- - memcg_id node_id min_gen [swappiness] [nr_to_reclaim]
-
-The first command line invokes the aging, which scans PTEs for
-accessed pages and then creates the next generation max_gen+1. A swap
-file and a non-zero swappiness, which overrides vm.swappiness, are
-required to scan PTEs mapping anon pages. The second command line
-invokes the eviction, which evicts generations less than or equal to
-min_gen. min_gen should be less than max_gen-1 as max_gen and
-max_gen-1 are not fully aged and therefore cannot be evicted.
-Setting nr_to_reclaim to N limits the number of pages to evict.
-Setting use_bloom_filter to 0 overrides the default behavior which
-only scans PTE tables found populated. Multiple command lines are
-supported, as is concatenation with delimiters "," and ";".
-
-Change-Id: I4448e60029badbe347aa3b624f429b280cc3a3d3
----
- include/linux/nodemask.h | 1 +
- mm/vmscan.c | 415 +++++++++++++++++++++++++++++++++++++++
- 2 files changed, 416 insertions(+)
-
---- a/include/linux/nodemask.h
-+++ b/include/linux/nodemask.h
-@@ -485,6 +485,7 @@ static inline int num_node_state(enum no
- #define first_online_node 0
- #define first_memory_node 0
- #define next_online_node(nid) (MAX_NUMNODES)
-+#define next_memory_node(nid) (MAX_NUMNODES)
- #define nr_node_ids 1U
- #define nr_online_nodes 1U
-
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -53,6 +53,8 @@
- #include <linux/memory.h>
- #include <linux/pagewalk.h>
- #include <linux/shmem_fs.h>
-+#include <linux/ctype.h>
-+#include <linux/debugfs.h>
-
- #include <asm/tlbflush.h>
- #include <asm/div64.h>
-@@ -4882,6 +4884,413 @@ unlock:
- }
-
- /******************************************************************************
-+ * sysfs interface
-+ ******************************************************************************/
-+
-+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
-+{
-+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
-+}
-+
-+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
-+ const char *buf, size_t len)
-+{
-+ unsigned int msecs;
-+
-+ if (kstrtouint(buf, 10, &msecs))
-+ return -EINVAL;
-+
-+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
-+
-+ return len;
-+}
-+
-+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
-+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
-+);
-+
-+static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
-+{
-+ return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled());
-+}
-+
-+static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr,
-+ const char *buf, size_t len)
-+{
-+ bool enable;
-+
-+ if (kstrtobool(buf, &enable))
-+ return -EINVAL;
-+
-+ lru_gen_change_state(enable, true, false);
-+
-+ return len;
-+}
-+
-+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
-+ enabled, 0644, show_enable, store_enable
-+);
-+
-+static struct attribute *lru_gen_attrs[] = {
-+ &lru_gen_min_ttl_attr.attr,
-+ &lru_gen_enabled_attr.attr,
-+ NULL
-+};
-+
-+static struct attribute_group lru_gen_attr_group = {
-+ .name = "lru_gen",
-+ .attrs = lru_gen_attrs,
-+};
-+
-+/******************************************************************************
-+ * debugfs interface
-+ ******************************************************************************/
-+
-+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
-+{
-+ struct mem_cgroup *memcg;
-+ loff_t nr_to_skip = *pos;
-+
-+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
-+ if (!m->private)
-+ return ERR_PTR(-ENOMEM);
-+
-+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
-+ do {
-+ int nid;
-+
-+ for_each_node_state(nid, N_MEMORY) {
-+ if (!nr_to_skip--)
-+ return get_lruvec(nid, memcg);
-+ }
-+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
-+
-+ return NULL;
-+}
-+
-+static void lru_gen_seq_stop(struct seq_file *m, void *v)
-+{
-+ if (!IS_ERR_OR_NULL(v))
-+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
-+
-+ kvfree(m->private);
-+ m->private = NULL;
-+}
-+
-+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
-+{
-+ int nid = lruvec_pgdat(v)->node_id;
-+ struct mem_cgroup *memcg = lruvec_memcg(v);
-+
-+ ++*pos;
-+
-+ nid = next_memory_node(nid);
-+ if (nid == MAX_NUMNODES) {
-+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
-+ if (!memcg)
-+ return NULL;
-+
-+ nid = first_memory_node;
-+ }
-+
-+ return get_lruvec(nid, memcg);
-+}
-+
-+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
-+ unsigned long max_seq, unsigned long *min_seq,
-+ unsigned long seq)
-+{
-+ int i;
-+ int type, tier;
-+ int hist = lru_hist_from_seq(seq);
-+ struct lrugen *lrugen = &lruvec->evictable;
-+
-+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
-+ seq_printf(m, " %10d", tier);
-+ for (type = 0; type < ANON_AND_FILE; type++) {
-+ unsigned long n[3] = {};
-+
-+ if (seq == max_seq) {
-+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
-+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
-+
-+ seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]);
-+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
-+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
-+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
-+ if (tier)
-+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
-+
-+ seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]);
-+ } else
-+ seq_puts(m, " 0 0 0 ");
-+ }
-+ seq_putc(m, '\n');
-+ }
-+
-+ seq_puts(m, " ");
-+ for (i = 0; i < NR_MM_STATS; i++) {
-+ if (seq == max_seq && NR_HIST_GENS == 1)
-+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
-+ toupper(MM_STAT_CODES[i]));
-+ else if (seq != max_seq && NR_HIST_GENS > 1)
-+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
-+ MM_STAT_CODES[i]);
-+ else
-+ seq_puts(m, " 0 ");
-+ }
-+ seq_putc(m, '\n');
-+}
-+
-+static int lru_gen_seq_show(struct seq_file *m, void *v)
-+{
-+ unsigned long seq;
-+ bool full = !debugfs_real_fops(m->file)->write;
-+ struct lruvec *lruvec = v;
-+ struct lrugen *lrugen = &lruvec->evictable;
-+ int nid = lruvec_pgdat(lruvec)->node_id;
-+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
-+ DEFINE_MAX_SEQ(lruvec);
-+ DEFINE_MIN_SEQ(lruvec);
-+
-+ if (nid == first_memory_node) {
-+ const char *path = memcg ? m->private : "";
-+
-+#ifdef CONFIG_MEMCG
-+ if (memcg)
-+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
-+#endif
-+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
-+ }
-+
-+ seq_printf(m, " node %5d\n", nid);
-+
-+ if (!full)
-+ seq = min_seq[0];
-+ else if (max_seq >= MAX_NR_GENS)
-+ seq = max_seq - MAX_NR_GENS + 1;
-+ else
-+ seq = 0;
-+
-+ for (; seq <= max_seq; seq++) {
-+ int gen, type, zone;
-+ unsigned int msecs;
-+
-+ gen = lru_gen_from_seq(seq);
-+ msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen]));
-+
-+ seq_printf(m, " %10lu %10u", seq, msecs);
-+
-+ for (type = 0; type < ANON_AND_FILE; type++) {
-+ long size = 0;
-+
-+ if (seq < min_seq[type]) {
-+ seq_puts(m, " -0 ");
-+ continue;
-+ }
-+
-+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
-+ size += READ_ONCE(lrugen->sizes[gen][type][zone]);
-+
-+ seq_printf(m, " %10lu ", max(size, 0L));
-+ }
-+
-+ seq_putc(m, '\n');
-+
-+ if (full)
-+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
-+ }
-+
-+ return 0;
-+}
-+
-+static const struct seq_operations lru_gen_seq_ops = {
-+ .start = lru_gen_seq_start,
-+ .stop = lru_gen_seq_stop,
-+ .next = lru_gen_seq_next,
-+ .show = lru_gen_seq_show,
-+};
-+
-+static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
-+ unsigned long seq, bool use_filter)
-+{
-+ DEFINE_MAX_SEQ(lruvec);
-+
-+ if (seq == max_seq)
-+ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter);
-+
-+ return seq > max_seq ? -EINVAL : 0;
-+}
-+
-+static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
-+ unsigned long seq, unsigned long nr_to_reclaim)
-+{
-+ struct blk_plug plug;
-+ int err = -EINTR;
-+ DEFINE_MAX_SEQ(lruvec);
-+
-+ if (seq >= max_seq - 1)
-+ return -EINVAL;
-+
-+ sc->nr_reclaimed = 0;
-+
-+ blk_start_plug(&plug);
-+
-+ while (!signal_pending(current)) {
-+ DEFINE_MIN_SEQ(lruvec);
-+
-+ if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim ||
-+ !evict_pages(lruvec, sc, swappiness)) {
-+ err = 0;
-+ break;
-+ }
-+
-+ cond_resched();
-+ }
-+
-+ blk_finish_plug(&plug);
-+
-+ return err;
-+}
-+
-+static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc,
-+ int swappiness, unsigned long seq, unsigned long opt)
-+{
-+ struct lruvec *lruvec;
-+ int err = -EINVAL;
-+ struct mem_cgroup *memcg = NULL;
-+
-+ if (!mem_cgroup_disabled()) {
-+ rcu_read_lock();
-+ memcg = mem_cgroup_from_id(memcg_id);
-+#ifdef CONFIG_MEMCG
-+ if (memcg && !css_tryget(&memcg->css))
-+ memcg = NULL;
-+#endif
-+ rcu_read_unlock();
-+
-+ if (!memcg)
-+ goto done;
-+ }
-+ if (memcg_id != mem_cgroup_id(memcg))
-+ goto done;
-+
-+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
-+ goto done;
-+
-+ lruvec = get_lruvec(nid, memcg);
-+
-+ if (swappiness < 0)
-+ swappiness = get_swappiness(memcg);
-+ else if (swappiness > 200)
-+ goto done;
-+
-+ switch (cmd) {
-+ case '+':
-+ err = run_aging(lruvec, sc, swappiness, seq, opt);
-+ break;
-+ case '-':
-+ err = run_eviction(lruvec, sc, swappiness, seq, opt);
-+ break;
-+ }
-+done:
-+ mem_cgroup_put(memcg);
-+
-+ return err;
-+}
-+
-+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
-+ size_t len, loff_t *pos)
-+{
-+ void *buf;
-+ char *cur, *next;
-+ unsigned int flags;
-+ int err = 0;
-+ struct scan_control sc = {
-+ .may_writepage = 1,
-+ .may_unmap = 1,
-+ .may_swap = 1,
-+ .reclaim_idx = MAX_NR_ZONES - 1,
-+ .gfp_mask = GFP_KERNEL,
-+ };
-+
-+ buf = kvmalloc(len + 1, GFP_KERNEL);
-+ if (!buf)
-+ return -ENOMEM;
-+
-+ if (copy_from_user(buf, src, len)) {
-+ kvfree(buf);
-+ return -EFAULT;
-+ }
-+
-+ next = buf;
-+ next[len] = '\0';
-+
-+ sc.reclaim_state.mm_walk_args = alloc_mm_walk_args();
-+ if (!sc.reclaim_state.mm_walk_args) {
-+ kvfree(buf);
-+ return -ENOMEM;
-+ }
-+
-+ flags = memalloc_noreclaim_save();
-+ set_task_reclaim_state(current, &sc.reclaim_state);
-+
-+ while ((cur = strsep(&next, ",;\n"))) {
-+ int n;
-+ int end;
-+ char cmd;
-+ unsigned int memcg_id;
-+ unsigned int nid;
-+ unsigned long seq;
-+ unsigned int swappiness = -1;
-+ unsigned long opt = -1;
-+
-+ cur = skip_spaces(cur);
-+ if (!*cur)
-+ continue;
-+
-+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
-+ &seq, &end, &swappiness, &end, &opt, &end);
-+ if (n < 4 || cur[end]) {
-+ err = -EINVAL;
-+ break;
-+ }
-+
-+ err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt);
-+ if (err)
-+ break;
-+ }
-+
-+ set_task_reclaim_state(current, NULL);
-+ memalloc_noreclaim_restore(flags);
-+
-+ free_mm_walk_args(sc.reclaim_state.mm_walk_args);
-+ kvfree(buf);
-+
-+ return err ? : len;
-+}
-+
-+static int lru_gen_seq_open(struct inode *inode, struct file *file)
-+{
-+ return seq_open(file, &lru_gen_seq_ops);
-+}
-+
-+static const struct file_operations lru_gen_rw_fops = {
-+ .open = lru_gen_seq_open,
-+ .read = seq_read,
-+ .write = lru_gen_seq_write,
-+ .llseek = seq_lseek,
-+ .release = seq_release,
-+};
-+
-+static const struct file_operations lru_gen_ro_fops = {
-+ .open = lru_gen_seq_open,
-+ .read = seq_read,
-+ .llseek = seq_lseek,
-+ .release = seq_release,
-+};
-+
-+/******************************************************************************
- * initialization
- ******************************************************************************/
-
-@@ -4951,6 +5360,12 @@ static int __init init_lru_gen(void)
- BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
- BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
-
-+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
-+ pr_err("lru_gen: failed to create sysfs group\n");
-+
-+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
-+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
-+
- return 0;
- };
- late_initcall(init_lru_gen);
+++ /dev/null
-From 3008095eb835d207dd7e5b60899aad17f32aa9f7 Mon Sep 17 00:00:00 2001
-Date: Mon, 25 Jan 2021 21:47:24 -0700
-Subject: [PATCH 09/10] mm: multigenerational lru: Kconfig
-
-Add configuration options for the multigenerational lru.
-
-Change-Id: Ic74ea07f8fb5f56e6904a1b80c3c286bc2911635
----
- mm/Kconfig | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
- 1 file changed, 59 insertions(+)
-
---- a/mm/Kconfig
-+++ b/mm/Kconfig
-@@ -899,4 +899,63 @@ config SECRETMEM
-
- source "mm/damon/Kconfig"
-
-+# the multigenerational lru {
-+config LRU_GEN
-+ bool "Multigenerational LRU"
-+ depends on MMU
-+ # the following options may leave not enough spare bits in page->flags
-+ depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP)
-+ help
-+ A high performance LRU implementation to heavily overcommit workloads
-+ that are not IO bound. See Documentation/vm/multigen_lru.rst for
-+ details.
-+
-+ Warning: do not enable this option unless you plan to use it because
-+ it introduces a small per-process and per-memcg and per-node memory
-+ overhead.
-+
-+config LRU_GEN_ENABLED
-+ bool "Turn on by default"
-+ depends on LRU_GEN
-+ help
-+ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option
-+ changes it to 1.
-+
-+ Warning: the default value is the fast path. See
-+ Documentation/static-keys.txt for details.
-+
-+config LRU_GEN_STATS
-+ bool "Full stats for debugging"
-+ depends on LRU_GEN
-+ help
-+ This option keeps full stats for each generation, which can be read
-+ from /sys/kernel/debug/lru_gen_full.
-+
-+ Warning: do not enable this option unless you plan to use it because
-+ it introduces an additional small per-process and per-memcg and
-+ per-node memory overhead.
-+
-+config NR_LRU_GENS
-+ int "Max number of generations"
-+ depends on LRU_GEN
-+ range 4 31
-+ default 7
-+ help
-+ This will use order_base_2(N+1) spare bits from page flags.
-+
-+ Warning: do not use numbers larger than necessary because each
-+ generation introduces a small per-node and per-memcg memory overhead.
-+
-+config TIERS_PER_GEN
-+ int "Number of tiers per generation"
-+ depends on LRU_GEN
-+ range 2 5
-+ default 4
-+ help
-+ This will use N-2 spare bits from page flags.
-+
-+ Larger values generally offer better protection to active pages under
-+ heavy buffered I/O workloads.
-+# }
-+
- endmenu
+++ /dev/null
-From f59c618ed70a1e48accc4cad91a200966f2569c9 Mon Sep 17 00:00:00 2001
-Date: Tue, 2 Feb 2021 01:27:45 -0700
-Subject: [PATCH 10/10] mm: multigenerational lru: documentation
-
-Add Documentation/vm/multigen_lru.rst.
-
-Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e
----
- Documentation/vm/index.rst | 1 +
- Documentation/vm/multigen_lru.rst | 132 ++++++++++++++++++++++++++++++
- 2 files changed, 133 insertions(+)
- create mode 100644 Documentation/vm/multigen_lru.rst
-
---- a/Documentation/vm/index.rst
-+++ b/Documentation/vm/index.rst
-@@ -17,6 +17,7 @@ various features of the Linux memory man
-
- swap_numa
- zswap
-+ multigen_lru
-
- Kernel developers MM documentation
- ==================================
---- /dev/null
-+++ b/Documentation/vm/multigen_lru.rst
-@@ -0,0 +1,132 @@
-+.. SPDX-License-Identifier: GPL-2.0
-+
-+=====================
-+Multigenerational LRU
-+=====================
-+
-+Quick Start
-+===========
-+Build Configurations
-+--------------------
-+:Required: Set ``CONFIG_LRU_GEN=y``.
-+
-+:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by
-+ default.
-+
-+Runtime Configurations
-+----------------------
-+:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the
-+ feature was not turned on by default.
-+
-+:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to
-+ protect the working set of ``N`` milliseconds. The OOM killer is
-+ invoked if this working set cannot be kept in memory.
-+
-+:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature
-+ is turned on. This file has the following output:
-+
-+::
-+
-+ memcg memcg_id memcg_path
-+ node node_id
-+ min_gen birth_time anon_size file_size
-+ ...
-+ max_gen birth_time anon_size file_size
-+
-+``min_gen`` is the oldest generation number and ``max_gen`` is the
-+youngest generation number. ``birth_time`` is in milliseconds.
-+``anon_size`` and ``file_size`` are in pages.
-+
-+Phones/Laptops/Workstations
-+---------------------------
-+No additional configurations required.
-+
-+Servers/Data Centers
-+--------------------
-+:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a
-+ larger number.
-+
-+:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger
-+ number.
-+
-+:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``.
-+
-+:Working set estimation: Write ``+ memcg_id node_id max_gen
-+ [swappiness] [use_bloom_filter]`` to ``/sys/kernel/debug/lru_gen`` to
-+ invoke the aging, which scans PTEs for accessed pages and then
-+ creates the next generation ``max_gen+1``. A swap file and a non-zero
-+ ``swappiness``, which overrides ``vm.swappiness``, are required to
-+ scan PTEs mapping anon pages. Set ``use_bloom_filter`` to 0 to
-+ override the default behavior which only scans PTE tables found
-+ populated.
-+
-+:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness]
-+ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the
-+ eviction, which evicts generations less than or equal to ``min_gen``.
-+ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and
-+ ``max_gen-1`` are not fully aged and therefore cannot be evicted.
-+ Use ``nr_to_reclaim`` to limit the number of pages to evict. Multiple
-+ command lines are supported, so does concatenation with delimiters
-+ ``,`` and ``;``.
-+
-+Framework
-+=========
-+For each ``lruvec``, evictable pages are divided into multiple
-+generations. The youngest generation number is stored in
-+``lrugen->max_seq`` for both anon and file types as they are aged on
-+an equal footing. The oldest generation numbers are stored in
-+``lrugen->min_seq[]`` separately for anon and file types as clean
-+file pages can be evicted regardless of swap and writeback
-+constraints. These three variables are monotonically increasing.
-+Generation numbers are truncated into
-+``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into
-+``page->flags``. The sliding window technique is used to prevent
-+truncated generation numbers from overlapping. Each truncated
-+generation number is an index to an array of per-type and per-zone
-+lists ``lrugen->lists``.
-+
-+Each generation is divided into multiple tiers. Tiers represent
-+different ranges of numbers of accesses from file descriptors only.
-+Pages accessed ``N`` times via file descriptors belong to tier
-+``order_base_2(N)``. Each generation contains at most
-+``CONFIG_TIERS_PER_GEN`` tiers, and they require additional
-+``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast to
-+moving between generations which requires list operations, moving
-+between tiers only involves operations on ``page->flags`` and
-+therefore has a negligible cost. A feedback loop modeled after the PID
-+controller monitors refaulted % across all tiers and decides when to
-+protect pages from which tiers.
-+
-+The framework comprises two conceptually independent components: the
-+aging and the eviction, which can be invoked separately from user
-+space for the purpose of working set estimation and proactive reclaim.
-+
-+Aging
-+-----
-+The aging produces young generations. Given an ``lruvec``, the aging
-+traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()``
-+to scan PTEs for accessed pages (a ``mm_struct`` list is maintained
-+for each ``memcg``). Upon finding one, the aging updates its
-+generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``).
-+After each round of traversal, the aging increments ``max_seq``. The
-+aging is due when ``min_seq[]`` reaches ``max_seq-1``.
-+
-+Eviction
-+--------
-+The eviction consumes old generations. Given an ``lruvec``, the
-+eviction scans pages on the per-zone lists indexed by anon and file
-+``min_seq[]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to
-+select a type based on the values of ``min_seq[]``. If they are
-+equal, it selects the type that has a lower refaulted %. The eviction
-+sorts a page according to its updated generation number if the aging
-+has found this page accessed. It also moves a page to the next
-+generation if this page is from an upper tier that has a higher
-+refaulted % than the base tier. The eviction increments ``min_seq[]``
-+of a selected type when it finds all the per-zone lists indexed by
-+``min_seq[]`` of this selected type are empty.
-+
-+To-do List
-+==========
-+KVM Optimization
-+----------------
-+Support shadow page table walk.