• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * ACPI 3.0 based NUMA setup
3  * Copyright 2004 Andi Kleen, SuSE Labs.
4  *
5  * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6  *
7  * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8  * Assumes all memory regions belonging to a single proximity domain
9  * are in one chunk. Holes between them will be included in the node.
10  */
11 
12 #include <linux/kernel.h>
13 #include <linux/acpi.h>
14 #include <linux/mmzone.h>
15 #include <linux/bitmap.h>
16 #include <linux/module.h>
17 #include <linux/topology.h>
18 #include <linux/bootmem.h>
19 #include <linux/mm.h>
20 #include <asm/proto.h>
21 #include <asm/numa.h>
22 #include <asm/e820.h>
23 #include <asm/genapic.h>
24 
25 int acpi_numa __initdata;
26 
27 static struct acpi_table_slit *acpi_slit;
28 
29 static nodemask_t nodes_parsed __initdata;
30 static struct bootnode nodes[MAX_NUMNODES] __initdata;
31 static struct bootnode nodes_add[MAX_NUMNODES];
32 static int found_add_area __initdata;
33 int hotadd_percent __initdata = 0;
34 
35 static int num_node_memblks __initdata;
36 static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
37 static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
38 
39 /* Too small nodes confuse the VM badly. Usually they result
40    from BIOS bugs. */
41 #define NODE_MIN_SIZE (4*1024*1024)
42 
setup_node(int pxm)43 static __init int setup_node(int pxm)
44 {
45 	return acpi_map_pxm_to_node(pxm);
46 }
47 
conflicting_memblks(unsigned long start,unsigned long end)48 static __init int conflicting_memblks(unsigned long start, unsigned long end)
49 {
50 	int i;
51 	for (i = 0; i < num_node_memblks; i++) {
52 		struct bootnode *nd = &node_memblk_range[i];
53 		if (nd->start == nd->end)
54 			continue;
55 		if (nd->end > start && nd->start < end)
56 			return memblk_nodeid[i];
57 		if (nd->end == end && nd->start == start)
58 			return memblk_nodeid[i];
59 	}
60 	return -1;
61 }
62 
cutoff_node(int i,unsigned long start,unsigned long end)63 static __init void cutoff_node(int i, unsigned long start, unsigned long end)
64 {
65 	struct bootnode *nd = &nodes[i];
66 
67 	if (found_add_area)
68 		return;
69 
70 	if (nd->start < start) {
71 		nd->start = start;
72 		if (nd->end < nd->start)
73 			nd->start = nd->end;
74 	}
75 	if (nd->end > end) {
76 		nd->end = end;
77 		if (nd->start > nd->end)
78 			nd->start = nd->end;
79 	}
80 }
81 
bad_srat(void)82 static __init void bad_srat(void)
83 {
84 	int i;
85 	printk(KERN_ERR "SRAT: SRAT not used.\n");
86 	acpi_numa = -1;
87 	found_add_area = 0;
88 	for (i = 0; i < MAX_LOCAL_APIC; i++)
89 		apicid_to_node[i] = NUMA_NO_NODE;
90 	for (i = 0; i < MAX_NUMNODES; i++)
91 		nodes_add[i].start = nodes[i].end = 0;
92 	remove_all_active_ranges();
93 }
94 
srat_disabled(void)95 static __init inline int srat_disabled(void)
96 {
97 	return numa_off || acpi_numa < 0;
98 }
99 
100 /* Callback for SLIT parsing */
acpi_numa_slit_init(struct acpi_table_slit * slit)101 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
102 {
103 	unsigned length;
104 	unsigned long phys;
105 
106 	length = slit->header.length;
107 	phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
108 		 PAGE_SIZE);
109 
110 	if (phys == -1L)
111 		panic(" Can not save slit!\n");
112 
113 	acpi_slit = __va(phys);
114 	memcpy(acpi_slit, slit, length);
115 	reserve_early(phys, phys + length, "ACPI SLIT");
116 }
117 
118 /* Callback for Proximity Domain -> LAPIC mapping */
119 void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity * pa)120 acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
121 {
122 	int pxm, node;
123 	int apic_id;
124 
125 	if (srat_disabled())
126 		return;
127 	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
128 		bad_srat();
129 		return;
130 	}
131 	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
132 		return;
133 	pxm = pa->proximity_domain_lo;
134 	node = setup_node(pxm);
135 	if (node < 0) {
136 		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
137 		bad_srat();
138 		return;
139 	}
140 
141 	if (get_uv_system_type() >= UV_X2APIC)
142 		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
143 	else
144 		apic_id = pa->apic_id;
145 	apicid_to_node[apic_id] = node;
146 	acpi_numa = 1;
147 	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
148 	       pxm, apic_id, node);
149 }
150 
/* Stub: ignores @end and always reports failure (-1). */
static int update_end_of_memory(unsigned long end)
{
	return -1;
}

/* Stub: ignores @nd and always reports that there is enough memory (1). */
static int hotadd_enough_memory(struct bootnode *nd)
{
	return 1;
}

/*
 * Whether hot-pluggable SRAT memory entries should be processed at all:
 * 1 when CONFIG_MEMORY_HOTPLUG_SPARSE is enabled, 0 otherwise.
 */
static inline int save_add_info(void)
{
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
	return 1;
#else
	return 0;
#endif
}
158 /*
159  * Update nodes_add and decide if to include add are in the zone.
160  * Both SPARSE and RESERVE need nodes_add information.
161  * This code supports one contiguous hot add area per node.
162  */
163 static int __init
reserve_hotadd(int node,unsigned long start,unsigned long end)164 reserve_hotadd(int node, unsigned long start, unsigned long end)
165 {
166 	unsigned long s_pfn = start >> PAGE_SHIFT;
167 	unsigned long e_pfn = end >> PAGE_SHIFT;
168 	int ret = 0, changed = 0;
169 	struct bootnode *nd = &nodes_add[node];
170 
171 	/* I had some trouble with strange memory hotadd regions breaking
172 	   the boot. Be very strict here and reject anything unexpected.
173 	   If you want working memory hotadd write correct SRATs.
174 
175 	   The node size check is a basic sanity check to guard against
176 	   mistakes */
177 	if ((signed long)(end - start) < NODE_MIN_SIZE) {
178 		printk(KERN_ERR "SRAT: Hotplug area too small\n");
179 		return -1;
180 	}
181 
182 	/* This check might be a bit too strict, but I'm keeping it for now. */
183 	if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
184 		printk(KERN_ERR
185 			"SRAT: Hotplug area %lu -> %lu has existing memory\n",
186 			s_pfn, e_pfn);
187 		return -1;
188 	}
189 
190 	if (!hotadd_enough_memory(&nodes_add[node]))  {
191 		printk(KERN_ERR "SRAT: Hotplug area too large\n");
192 		return -1;
193 	}
194 
195 	/* Looks good */
196 
197 	if (nd->start == nd->end) {
198 		nd->start = start;
199 		nd->end = end;
200 		changed = 1;
201 	} else {
202 		if (nd->start == end) {
203 			nd->start = start;
204 			changed = 1;
205 		}
206 		if (nd->end == start) {
207 			nd->end = end;
208 			changed = 1;
209 		}
210 		if (!changed)
211 			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
212 	}
213 
214 	ret = update_end_of_memory(nd->end);
215 
216 	if (changed)
217 	 	printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
218 	return ret;
219 }
220 
221 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
222 void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity * ma)223 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
224 {
225 	struct bootnode *nd, oldnode;
226 	unsigned long start, end;
227 	int node, pxm;
228 	int i;
229 
230 	if (srat_disabled())
231 		return;
232 	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
233 		bad_srat();
234 		return;
235 	}
236 	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
237 		return;
238 
239 	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
240 		return;
241 	start = ma->base_address;
242 	end = start + ma->length;
243 	pxm = ma->proximity_domain;
244 	node = setup_node(pxm);
245 	if (node < 0) {
246 		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
247 		bad_srat();
248 		return;
249 	}
250 	i = conflicting_memblks(start, end);
251 	if (i == node) {
252 		printk(KERN_WARNING
253 		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
254 			pxm, start, end, nodes[i].start, nodes[i].end);
255 	} else if (i >= 0) {
256 		printk(KERN_ERR
257 		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
258 		       pxm, start, end, node_to_pxm(i),
259 			nodes[i].start, nodes[i].end);
260 		bad_srat();
261 		return;
262 	}
263 	nd = &nodes[node];
264 	oldnode = *nd;
265 	if (!node_test_and_set(node, nodes_parsed)) {
266 		nd->start = start;
267 		nd->end = end;
268 	} else {
269 		if (start < nd->start)
270 			nd->start = start;
271 		if (nd->end < end)
272 			nd->end = end;
273 	}
274 
275 	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
276 	       start, end);
277 	e820_register_active_regions(node, start >> PAGE_SHIFT,
278 				     end >> PAGE_SHIFT);
279 	push_node_boundaries(node, nd->start >> PAGE_SHIFT,
280 						nd->end >> PAGE_SHIFT);
281 
282 	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
283 	    (reserve_hotadd(node, start, end) < 0)) {
284 		/* Ignore hotadd region. Undo damage */
285 		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
286 		*nd = oldnode;
287 		if ((nd->start | nd->end) == 0)
288 			node_clear(node, nodes_parsed);
289 	}
290 
291 	node_memblk_range[num_node_memblks].start = start;
292 	node_memblk_range[num_node_memblks].end = end;
293 	memblk_nodeid[num_node_memblks] = node;
294 	num_node_memblks++;
295 }
296 
297 /* Sanity check to catch more bad SRATs (they are amazingly common).
298    Make sure the PXMs cover all memory. */
nodes_cover_memory(const struct bootnode * nodes)299 static int __init nodes_cover_memory(const struct bootnode *nodes)
300 {
301 	int i;
302 	unsigned long pxmram, e820ram;
303 
304 	pxmram = 0;
305 	for_each_node_mask(i, nodes_parsed) {
306 		unsigned long s = nodes[i].start >> PAGE_SHIFT;
307 		unsigned long e = nodes[i].end >> PAGE_SHIFT;
308 		pxmram += e - s;
309 		pxmram -= absent_pages_in_range(s, e);
310 		if ((long)pxmram < 0)
311 			pxmram = 0;
312 	}
313 
314 	e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
315 	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
316 	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
317 		printk(KERN_ERR
318 	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
319 			(pxmram << PAGE_SHIFT) >> 20,
320 			(e820ram << PAGE_SHIFT) >> 20);
321 		return 0;
322 	}
323 	return 1;
324 }
325 
unparse_node(int node)326 static void __init unparse_node(int node)
327 {
328 	int i;
329 	node_clear(node, nodes_parsed);
330 	for (i = 0; i < MAX_LOCAL_APIC; i++) {
331 		if (apicid_to_node[i] == node)
332 			apicid_to_node[i] = NUMA_NO_NODE;
333 	}
334 }
335 
acpi_numa_arch_fixup(void)336 void __init acpi_numa_arch_fixup(void) {}
337 
338 /* Use the information discovered above to actually set up the nodes. */
acpi_scan_nodes(unsigned long start,unsigned long end)339 int __init acpi_scan_nodes(unsigned long start, unsigned long end)
340 {
341 	int i;
342 
343 	if (acpi_numa <= 0)
344 		return -1;
345 
346 	/* First clean up the node list */
347 	for (i = 0; i < MAX_NUMNODES; i++) {
348 		cutoff_node(i, start, end);
349 		/*
350 		 * don't confuse VM with a node that doesn't have the
351 		 * minimum memory.
352 		 */
353 		if (nodes[i].end &&
354 			(nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
355 			unparse_node(i);
356 			node_set_offline(i);
357 		}
358 	}
359 
360 	if (!nodes_cover_memory(nodes)) {
361 		bad_srat();
362 		return -1;
363 	}
364 
365 	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
366 					   memblk_nodeid);
367 	if (memnode_shift < 0) {
368 		printk(KERN_ERR
369 		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
370 		bad_srat();
371 		return -1;
372 	}
373 
374 	node_possible_map = nodes_parsed;
375 
376 	/* Finally register nodes */
377 	for_each_node_mask(i, node_possible_map)
378 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
379 	/* Try again in case setup_node_bootmem missed one due
380 	   to missing bootmem */
381 	for_each_node_mask(i, node_possible_map)
382 		if (!node_online(i))
383 			setup_node_bootmem(i, nodes[i].start, nodes[i].end);
384 
385 	for (i = 0; i < nr_cpu_ids; i++) {
386 		int node = early_cpu_to_node(i);
387 
388 		if (node == NUMA_NO_NODE)
389 			continue;
390 		if (!node_isset(node, node_possible_map))
391 			numa_clear_node(i);
392 	}
393 	numa_init_array();
394 	return 0;
395 }
396 
397 #ifdef CONFIG_NUMA_EMU
398 static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
399 	[0 ... MAX_NUMNODES-1] = PXM_INVAL
400 };
401 static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
402 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
403 };
find_node_by_addr(unsigned long addr)404 static int __init find_node_by_addr(unsigned long addr)
405 {
406 	int ret = NUMA_NO_NODE;
407 	int i;
408 
409 	for_each_node_mask(i, nodes_parsed) {
410 		/*
411 		 * Find the real node that this emulated node appears on.  For
412 		 * the sake of simplicity, we only use a real node's starting
413 		 * address to determine which emulated node it appears on.
414 		 */
415 		if (addr >= nodes[i].start && addr < nodes[i].end) {
416 			ret = i;
417 			break;
418 		}
419 	}
420 	return ret;
421 }
422 
423 /*
424  * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
425  * mappings that respect the real ACPI topology but reflect our emulated
426  * environment.  For each emulated node, we find which real node it appears on
427  * and create PXM to NID mappings for those fake nodes which mirror that
428  * locality.  SLIT will now represent the correct distances between emulated
429  * nodes as a result of the real topology.
430  */
acpi_fake_nodes(const struct bootnode * fake_nodes,int num_nodes)431 void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
432 {
433 	int i, j;
434 
435 	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
436 			 "topology.\n");
437 	for (i = 0; i < num_nodes; i++) {
438 		int nid, pxm;
439 
440 		nid = find_node_by_addr(fake_nodes[i].start);
441 		if (nid == NUMA_NO_NODE)
442 			continue;
443 		pxm = node_to_pxm(nid);
444 		if (pxm == PXM_INVAL)
445 			continue;
446 		fake_node_to_pxm_map[i] = pxm;
447 		/*
448 		 * For each apicid_to_node mapping that exists for this real
449 		 * node, it must now point to the fake node ID.
450 		 */
451 		for (j = 0; j < MAX_LOCAL_APIC; j++)
452 			if (apicid_to_node[j] == nid)
453 				fake_apicid_to_node[j] = i;
454 	}
455 	for (i = 0; i < num_nodes; i++)
456 		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
457 	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
458 
459 	nodes_clear(nodes_parsed);
460 	for (i = 0; i < num_nodes; i++)
461 		if (fake_nodes[i].start != fake_nodes[i].end)
462 			node_set(i, nodes_parsed);
463 	WARN_ON(!nodes_cover_memory(fake_nodes));
464 }
465 
/*
 * Used by __node_distance() when there is no SLIT: with NUMA emulation two
 * nodes count as local to each other when they map to the same proximity
 * domain.  Returns 1 if so, 0 otherwise.
 */
static int null_slit_node_compare(int a, int b)
{
	if (node_to_pxm(a) != node_to_pxm(b))
		return 0;
	return 1;
}
470 #else
/*
 * Used by __node_distance() when there is no SLIT: without NUMA emulation
 * a node is only local to itself.  Returns 1 if a and b are the same node,
 * 0 otherwise.
 */
static int null_slit_node_compare(int a, int b)
{
	if (a != b)
		return 0;
	return 1;
}
475 #endif /* CONFIG_NUMA_EMU */
476 
srat_reserve_add_area(int nodeid)477 void __init srat_reserve_add_area(int nodeid)
478 {
479 	if (found_add_area && nodes_add[nodeid].end) {
480 		u64 total_mb;
481 
482 		printk(KERN_INFO "SRAT: Reserving hot-add memory space "
483 				"for node %d at %Lx-%Lx\n",
484 			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
485 		total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
486 					>> PAGE_SHIFT;
487 		total_mb *= sizeof(struct page);
488 		total_mb >>= 20;
489 		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
490 				"pre-allocated memory.\n", (unsigned long long)total_mb);
491 		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
492 			       nodes_add[nodeid].end - nodes_add[nodeid].start,
493 			       BOOTMEM_DEFAULT);
494 	}
495 }
496 
__node_distance(int a,int b)497 int __node_distance(int a, int b)
498 {
499 	int index;
500 
501 	if (!acpi_slit)
502 		return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
503 						      REMOTE_DISTANCE;
504 	index = acpi_slit->locality_count * node_to_pxm(a);
505 	return acpi_slit->entry[index + node_to_pxm(b)];
506 }
507 
508 EXPORT_SYMBOL(__node_distance);
509 
510 #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
/*
 * Map a physical address to the node whose recorded hot-add area
 * [start, end) contains it.  If several areas match, the highest node id
 * wins; if none matches, node 0 is returned.
 */
int memory_add_physaddr_to_nid(u64 start)
{
	int nid, match = 0;

	for_each_node(nid) {
		if (start < nodes_add[nid].start)
			continue;
		if (start < nodes_add[nid].end)
			match = nid;
	}

	return match;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
522 #endif
523