Commit 36c7b4db authored by T.J. Alumbaugh's avatar T.J. Alumbaugh Committed by Andrew Morton
Browse files

mm: multi-gen LRU: section for memcg LRU

Move memcg LRU code into a dedicated section.  Improve the design doc to
outline its architecture.

Link: https://lkml.kernel.org/r/20230118001827.1040870-5-talumbau@google.com


Signed-off-by: default avatarT.J. Alumbaugh <talumbau@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
parent ccbbbb85
Loading
Loading
Loading
Loading
+32 −1
Original line number Diff line number Diff line
@@ -186,9 +186,40 @@ is false positive, the cost is an additional scan of a range of PTEs,
which may yield hot pages anyway. Parameters of the filter itself can
control the false positive rate in the limit.

Memcg LRU
---------
An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
since each node and memcg combination has an LRU of folios (see
``mem_cgroup_lruvec()``). Its goal is to improve the scalability of
global reclaim, which is critical to system-wide memory overcommit in
data centers. Note that memcg LRU only applies to global reclaim.

The basic structure of an memcg LRU can be understood by an analogy to
the active/inactive LRU (of folios):

1. It has the young and the old (generations), i.e., the counterparts
   to the active and the inactive;
2. The increment of ``max_seq`` triggers promotion, i.e., the
   counterpart to activation;
3. Other events trigger similar operations, e.g., offlining an memcg
   triggers demotion, i.e., the counterpart to deactivation.

In terms of global reclaim, it has two distinct features:

1. Sharding, which allows each thread to start at a random memcg (in
   the old generation) and improves parallelism;
2. Eventual fairness, which allows direct reclaim to bail out at will
   and reduces latency without affecting fairness over some time.

In terms of traversing memcgs during global reclaim, it improves the
best-case complexity from O(n) to O(1) and does not affect the
worst-case complexity O(n). Therefore, on average, it has a sublinear
complexity.

Summary
-------
The multi-gen LRU can be disassembled into the following parts:
The multi-gen LRU (of folios) can be disassembled into the following
parts:

* Generations
* Rmap walks
+0 −17
Original line number Diff line number Diff line
@@ -122,18 +122,6 @@ static inline bool lru_gen_in_fault(void)
	return current->in_lru_fault;
}

#ifdef CONFIG_MEMCG
static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
{
	return READ_ONCE(lruvec->lrugen.seg);
}
#else
static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
{
	return 0;
}
#endif

static inline int lru_gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
@@ -309,11 +297,6 @@ static inline bool lru_gen_in_fault(void)
	return false;
}

static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
{
	return 0;
}

static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
	return false;
+2 −11
Original line number Diff line number Diff line
@@ -368,15 +368,6 @@ struct page_vma_mapped_walk;
#define LRU_GEN_MASK		((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK		((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)

/* see the comment on MEMCG_NR_GENS */
enum {
	MEMCG_LRU_NOP,
	MEMCG_LRU_HEAD,
	MEMCG_LRU_TAIL,
	MEMCG_LRU_OLD,
	MEMCG_LRU_YOUNG,
};

#ifdef CONFIG_LRU_GEN

enum {
@@ -557,7 +548,7 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg);
void lru_gen_online_memcg(struct mem_cgroup *memcg);
void lru_gen_offline_memcg(struct mem_cgroup *memcg);
void lru_gen_release_memcg(struct mem_cgroup *memcg);
void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
void lru_gen_soft_reclaim(struct lruvec *lruvec);

#else /* !CONFIG_MEMCG */

@@ -608,7 +599,7 @@ static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
static inline void lru_gen_soft_reclaim(struct lruvec *lruvec)
{
}

+2 −6
Original line number Diff line number Diff line
@@ -476,12 +476,8 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
	struct mem_cgroup_tree_per_node *mctz;

	if (lru_gen_enabled()) {
		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;

		/* see the comment on MEMCG_NR_GENS */
		if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
			lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);

		if (soft_limit_excess(memcg))
			lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec);
		return;
	}

+142 −108
Original line number Diff line number Diff line
@@ -4705,6 +4705,148 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
	mem_cgroup_unlock_pages();
}

/******************************************************************************
 *                          memcg LRU
 ******************************************************************************/

/* see the comment on MEMCG_NR_GENS */
enum {
	MEMCG_LRU_NOP,
	MEMCG_LRU_HEAD,
	MEMCG_LRU_TAIL,
	MEMCG_LRU_OLD,
	MEMCG_LRU_YOUNG,
};

#ifdef CONFIG_MEMCG

static int lru_gen_memcg_seg(struct lruvec *lruvec)
{
	return READ_ONCE(lruvec->lrugen.seg);
}

static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
{
	int seg;
	int old, new;
	int bin = get_random_u32_below(MEMCG_NR_BINS);
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);

	spin_lock(&pgdat->memcg_lru.lock);

	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));

	seg = 0;
	new = old = lruvec->lrugen.gen;

	/* see the comment on MEMCG_NR_GENS */
	if (op == MEMCG_LRU_HEAD)
		seg = MEMCG_LRU_HEAD;
	else if (op == MEMCG_LRU_TAIL)
		seg = MEMCG_LRU_TAIL;
	else if (op == MEMCG_LRU_OLD)
		new = get_memcg_gen(pgdat->memcg_lru.seq);
	else if (op == MEMCG_LRU_YOUNG)
		new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
	else
		VM_WARN_ON_ONCE(true);

	hlist_nulls_del_rcu(&lruvec->lrugen.list);

	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
	else
		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);

	pgdat->memcg_lru.nr_memcgs[old]--;
	pgdat->memcg_lru.nr_memcgs[new]++;

	lruvec->lrugen.gen = new;
	WRITE_ONCE(lruvec->lrugen.seg, seg);

	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);

	spin_unlock(&pgdat->memcg_lru.lock);
}

void lru_gen_online_memcg(struct mem_cgroup *memcg)
{
	int gen;
	int nid;
	int bin = get_random_u32_below(MEMCG_NR_BINS);

	for_each_node(nid) {
		struct pglist_data *pgdat = NODE_DATA(nid);
		struct lruvec *lruvec = get_lruvec(memcg, nid);

		spin_lock(&pgdat->memcg_lru.lock);

		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));

		gen = get_memcg_gen(pgdat->memcg_lru.seq);

		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
		pgdat->memcg_lru.nr_memcgs[gen]++;

		lruvec->lrugen.gen = gen;

		spin_unlock(&pgdat->memcg_lru.lock);
	}
}

void lru_gen_offline_memcg(struct mem_cgroup *memcg)
{
	int nid;

	for_each_node(nid) {
		struct lruvec *lruvec = get_lruvec(memcg, nid);

		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
	}
}

void lru_gen_release_memcg(struct mem_cgroup *memcg)
{
	int gen;
	int nid;

	for_each_node(nid) {
		struct pglist_data *pgdat = NODE_DATA(nid);
		struct lruvec *lruvec = get_lruvec(memcg, nid);

		spin_lock(&pgdat->memcg_lru.lock);

		VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));

		gen = lruvec->lrugen.gen;

		hlist_nulls_del_rcu(&lruvec->lrugen.list);
		pgdat->memcg_lru.nr_memcgs[gen]--;

		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);

		spin_unlock(&pgdat->memcg_lru.lock);
	}
}

void lru_gen_soft_reclaim(struct lruvec *lruvec)
{
	/* see the comment on MEMCG_NR_GENS */
	if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
}

#else /* !CONFIG_MEMCG */

static int lru_gen_memcg_seg(struct lruvec *lruvec)
{
	return 0;
}

#endif

/******************************************************************************
 *                          the eviction
 ******************************************************************************/
@@ -5397,53 +5539,6 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
	pgdat->kswapd_failures = 0;
}

#ifdef CONFIG_MEMCG
void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
{
	int seg;
	int old, new;
	int bin = get_random_u32_below(MEMCG_NR_BINS);
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);

	spin_lock(&pgdat->memcg_lru.lock);

	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));

	seg = 0;
	new = old = lruvec->lrugen.gen;

	/* see the comment on MEMCG_NR_GENS */
	if (op == MEMCG_LRU_HEAD)
		seg = MEMCG_LRU_HEAD;
	else if (op == MEMCG_LRU_TAIL)
		seg = MEMCG_LRU_TAIL;
	else if (op == MEMCG_LRU_OLD)
		new = get_memcg_gen(pgdat->memcg_lru.seq);
	else if (op == MEMCG_LRU_YOUNG)
		new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
	else
		VM_WARN_ON_ONCE(true);

	hlist_nulls_del_rcu(&lruvec->lrugen.list);

	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
	else
		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);

	pgdat->memcg_lru.nr_memcgs[old]--;
	pgdat->memcg_lru.nr_memcgs[new]++;

	lruvec->lrugen.gen = new;
	WRITE_ONCE(lruvec->lrugen.seg, seg);

	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);

	spin_unlock(&pgdat->memcg_lru.lock);
}
#endif

/******************************************************************************
 *                          state change
 ******************************************************************************/
@@ -6086,67 +6181,6 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
	}
}

void lru_gen_online_memcg(struct mem_cgroup *memcg)
{
	int gen;
	int nid;
	int bin = get_random_u32_below(MEMCG_NR_BINS);

	for_each_node(nid) {
		struct pglist_data *pgdat = NODE_DATA(nid);
		struct lruvec *lruvec = get_lruvec(memcg, nid);

		spin_lock(&pgdat->memcg_lru.lock);

		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));

		gen = get_memcg_gen(pgdat->memcg_lru.seq);

		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
		pgdat->memcg_lru.nr_memcgs[gen]++;

		lruvec->lrugen.gen = gen;

		spin_unlock(&pgdat->memcg_lru.lock);
	}
}

void lru_gen_offline_memcg(struct mem_cgroup *memcg)
{
	int nid;

	for_each_node(nid) {
		struct lruvec *lruvec = get_lruvec(memcg, nid);

		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
	}
}

void lru_gen_release_memcg(struct mem_cgroup *memcg)
{
	int gen;
	int nid;

	for_each_node(nid) {
		struct pglist_data *pgdat = NODE_DATA(nid);
		struct lruvec *lruvec = get_lruvec(memcg, nid);

		spin_lock(&pgdat->memcg_lru.lock);

		VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));

		gen = lruvec->lrugen.gen;

		hlist_nulls_del_rcu(&lruvec->lrugen.list);
		pgdat->memcg_lru.nr_memcgs[gen]--;

		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);

		spin_unlock(&pgdat->memcg_lru.lock);
	}
}

#endif /* CONFIG_MEMCG */

static int __init init_lru_gen(void)