mm/migrate: move node demotion code to near its user (dcee9bf5) · Commits · jan.koester / Linux

mm/migrate.c

+132 −133

Original line number	Diff line number	Diff line
		@@ -1093,139 +1093,6 @@ static int __unmap_and_move(struct page page, struct page newpage,
		return rc;
		}


		/*
		* node_demotion[] example:
		*
		* Consider a system with two sockets. Each socket has
		* three classes of memory attached: fast, medium and slow.
		* Each memory class is placed in its own NUMA node. The
		* CPUs are placed in the node with the "fast" memory. The
		* 6 NUMA nodes (0-5) might be split among the sockets like
		* this:
		*
		* Socket A: 0, 1, 2
		* Socket B: 3, 4, 5
		*
		* When Node 0 fills up, its memory should be migrated to
		* Node 1. When Node 1 fills up, it should be migrated to
		* Node 2. The migration path starts on the nodes with the
		* processors (since allocations default to this node) and
		* fast memory, progresses through medium and ends with the
		* slow memory:
		*
		* 0 -> 1 -> 2 -> stop
		* 3 -> 4 -> 5 -> stop
		*
		* This is represented in the node_demotion[] like this:
		*
		* { nr=1, nodes[0]=1 }, // Node 0 migrates to 1
		* { nr=1, nodes[0]=2 }, // Node 1 migrates to 2
		* { nr=0, nodes[0]=-1 }, // Node 2 does not migrate
		* { nr=1, nodes[0]=4 }, // Node 3 migrates to 4
		* { nr=1, nodes[0]=5 }, // Node 4 migrates to 5
		* { nr=0, nodes[0]=-1 }, // Node 5 does not migrate
		*
		* Moreover, some systems may have multiple slow memory nodes.
		* Suppose a system has one socket with 3 memory nodes: node 0
		* is the fast memory type, nodes 1 and 2 are both the slow
		* memory type, and the distance from the fast memory node to
		* each slow memory node is the same. So the migration path should be:
		*
		* 0 -> 1/2 -> stop
		*
		* This is represented in the node_demotion[] like this:
		* { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
		* { nr=0, nodes[0]=-1, }, // Node 1 does not migrate
		* { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
		*/

		/*
		* Writes to this array occur without locking. Cycles are
		* not allowed: Node X demotes to Y which demotes to X...
		*
		* If multiple reads are performed, a single rcu_read_lock()
		* must be held over all reads to ensure that no cycles are
		* observed.
		*/
		/* Default upper bound on the number of demotion targets stored per node. */
		#define DEFAULT_DEMOTION_TARGET_NODES 15

		/*
		 * DEMOTION_TARGET_NODES is the capacity of demotion_nodes.nodes[]:
		 * MAX_NUMNODES - 1 when MAX_NUMNODES is below the default bound,
		 * otherwise the default bound itself.
		 * NOTE(review): the "- 1" presumably excludes the source node itself;
		 * confirm against the code that fills the table.
		 */
		#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
		#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1)
		#else
		#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES
		#endif

		/*
		 * Demotion targets of a single node.
		 *
		 * nr:    number of entries of nodes[] that next_demotion_node() may
		 *        pick from; 0 means the node has no demotion target.
		 * nodes: candidate target node ids, read at indexes 0..nr-1.
		 */
		struct demotion_nodes {
		unsigned short nr;
		short nodes[DEMOTION_TARGET_NODES];
		};

		/*
		 * Demotion table, indexed by source node id. While the pointer is NULL,
		 * next_demotion_node() reports NUMA_NO_NODE for every node.
		 * NOTE(review): allocation and updates happen outside this part of the
		 * file.
		 */
		static struct demotion_nodes *node_demotion __read_mostly;

		/**
		 * next_demotion_node() - Look up where memory of a node is demoted to
		 * @node: Node id whose demotion target is requested
		 *
		 * When @node has more than one recorded target, one of them is picked
		 * at random on every call.
		 *
		 * Return: node id of the next memory node in the demotion path of
		 * @node, or NUMA_NO_NODE if the demotion table is not set up or @node
		 * is terminal. This does not keep @node online or guarantee that the
		 * returned node continues to be the next demotion target.
		 */
		int next_demotion_node(int node)
		{
			struct demotion_nodes *entry;
			unsigned short nr_targets;
			unsigned short pick = 0;	/* a single target always sits in slot 0 */
			int next = NUMA_NO_NODE;	/* result for a terminal node */

			if (!node_demotion)
				return NUMA_NO_NODE;

			entry = &node_demotion[node];

			/*
			 * Writers update node_demotion[] while this function may be
			 * running. RCU supplies no compiler barriers, so each field
			 * is fetched with READ_ONCE() to prevent the compiler from
			 * reordering or merging the loads.
			 *
			 * Callers that need several node_demotion[] reads to be
			 * consistent with each other must wrap the whole sequence in
			 * their own RCU read-side section.
			 */
			rcu_read_lock();
			nr_targets = READ_ONCE(entry->nr);
			if (nr_targets != 0) {
				/*
				 * With several candidates, choose one at random.
				 *
				 * Round-robin would need a "last selected" field in
				 * node_demotion[], whose constant updates may cause
				 * cache ping-pong, or per-cpu data to avoid that,
				 * which seems more complicated. Random selection
				 * seems the better choice for now.
				 */
				if (nr_targets > 1)
					pick = get_random_int() % nr_targets;

				next = READ_ONCE(entry->nodes[pick]);
			}
			rcu_read_unlock();

			return next;
		}

		/*
		* Obtain the lock on page, remove all ptes and migrate the page
		* to the newly allocated page in newpage.
		@@ -3059,6 +2926,138 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
		EXPORT_SYMBOL(migrate_vma_finalize);
		#endif /* CONFIG_DEVICE_PRIVATE */

		/*
		* node_demotion[] example:
		*
		* Consider a system with two sockets. Each socket has
		* three classes of memory attached: fast, medium and slow.
		* Each memory class is placed in its own NUMA node. The
		* CPUs are placed in the node with the "fast" memory. The
		* 6 NUMA nodes (0-5) might be split among the sockets like
		* this:
		*
		* Socket A: 0, 1, 2
		* Socket B: 3, 4, 5
		*
		* When Node 0 fills up, its memory should be migrated to
		* Node 1. When Node 1 fills up, it should be migrated to
		* Node 2. The migration path starts on the nodes with the
		* processors (since allocations default to this node) and
		* fast memory, progresses through medium and ends with the
		* slow memory:
		*
		* 0 -> 1 -> 2 -> stop
		* 3 -> 4 -> 5 -> stop
		*
		* This is represented in the node_demotion[] like this:
		*
		* { nr=1, nodes[0]=1 }, // Node 0 migrates to 1
		* { nr=1, nodes[0]=2 }, // Node 1 migrates to 2
		* { nr=0, nodes[0]=-1 }, // Node 2 does not migrate
		* { nr=1, nodes[0]=4 }, // Node 3 migrates to 4
		* { nr=1, nodes[0]=5 }, // Node 4 migrates to 5
		* { nr=0, nodes[0]=-1 }, // Node 5 does not migrate
		*
		* Moreover, some systems may have multiple slow memory nodes.
		* Suppose a system has one socket with 3 memory nodes: node 0
		* is the fast memory type, nodes 1 and 2 are both the slow
		* memory type, and the distance from the fast memory node to
		* each slow memory node is the same. So the migration path should be:
		*
		* 0 -> 1/2 -> stop
		*
		* This is represented in the node_demotion[] like this:
		* { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
		* { nr=0, nodes[0]=-1, }, // Node 1 does not migrate
		* { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
		*/

		/*
		* Writes to this array occur without locking. Cycles are
		* not allowed: Node X demotes to Y which demotes to X...
		*
		* If multiple reads are performed, a single rcu_read_lock()
		* must be held over all reads to ensure that no cycles are
		* observed.
		*/
		/* Default upper bound on the number of demotion targets stored per node. */
		#define DEFAULT_DEMOTION_TARGET_NODES 15

		/*
		 * DEMOTION_TARGET_NODES is the capacity of demotion_nodes.nodes[]:
		 * MAX_NUMNODES - 1 when MAX_NUMNODES is below the default bound,
		 * otherwise the default bound itself.
		 * NOTE(review): the "- 1" presumably excludes the source node itself;
		 * confirm against the code that fills the table.
		 */
		#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
		#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1)
		#else
		#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES
		#endif

		/*
		 * Demotion targets of a single node.
		 *
		 * nr:    number of entries of nodes[] that next_demotion_node() may
		 *        pick from; 0 means the node has no demotion target.
		 * nodes: candidate target node ids, read at indexes 0..nr-1.
		 */
		struct demotion_nodes {
		unsigned short nr;
		short nodes[DEMOTION_TARGET_NODES];
		};

		/*
		 * Demotion table, indexed by source node id. While the pointer is NULL,
		 * next_demotion_node() reports NUMA_NO_NODE for every node.
		 * NOTE(review): allocation and updates happen outside this part of the
		 * file.
		 */
		static struct demotion_nodes *node_demotion __read_mostly;

		/**
		 * next_demotion_node() - Look up where memory of a node is demoted to
		 * @node: Node id whose demotion target is requested
		 *
		 * When @node has more than one recorded target, one of them is picked
		 * at random on every call.
		 *
		 * Return: node id of the next memory node in the demotion path of
		 * @node, or NUMA_NO_NODE if the demotion table is not set up or @node
		 * is terminal. This does not keep @node online or guarantee that the
		 * returned node continues to be the next demotion target.
		 */
		int next_demotion_node(int node)
		{
			struct demotion_nodes *entry;
			unsigned short nr_targets;
			unsigned short pick = 0;	/* a single target always sits in slot 0 */
			int next = NUMA_NO_NODE;	/* result for a terminal node */

			if (!node_demotion)
				return NUMA_NO_NODE;

			entry = &node_demotion[node];

			/*
			 * Writers update node_demotion[] while this function may be
			 * running. RCU supplies no compiler barriers, so each field
			 * is fetched with READ_ONCE() to prevent the compiler from
			 * reordering or merging the loads.
			 *
			 * Callers that need several node_demotion[] reads to be
			 * consistent with each other must wrap the whole sequence in
			 * their own RCU read-side section.
			 */
			rcu_read_lock();
			nr_targets = READ_ONCE(entry->nr);
			if (nr_targets != 0) {
				/*
				 * With several candidates, choose one at random.
				 *
				 * Round-robin would need a "last selected" field in
				 * node_demotion[], whose constant updates may cause
				 * cache ping-pong, or per-cpu data to avoid that,
				 * which seems more complicated. Random selection
				 * seems the better choice for now.
				 */
				if (nr_targets > 1)
					pick = get_random_int() % nr_targets;

				next = READ_ONCE(entry->nodes[pick]);
			}
			rcu_read_unlock();

			return next;
		}

		#if defined(CONFIG_HOTPLUG_CPU)
		/* Disable reclaim-based migration. */
		static void __disable_all_migrate_targets(void)