Here's the big patch. Sparse abstracts the use of discontiguous mem_maps[]. This kind of mem_map[] is needed by discontiguous memory machines (like in the old CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. This patch also introduces CONFIG_FLATMEM. It is used in almost all cases where there used to be an ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM often have to compile out the same areas of code. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen --- memhotplug-dave/arch/i386/mm/discontig.c | 2 memhotplug-dave/include/linux/mm.h | 86 ++++++++++++++++++++++++---- memhotplug-dave/include/linux/mmzone.h | 93 ++++++++++++++++++++++++++++++- memhotplug-dave/include/linux/numa.h | 2 memhotplug-dave/mm/Kconfig | 19 ++++++ memhotplug-dave/mm/Makefile | 1 memhotplug-dave/mm/bootmem.c | 8 ++ memhotplug-dave/mm/memory.c | 2 memhotplug-dave/mm/page_alloc.c | 32 ++++++++-- memhotplug-dave/mm/sparse.c | 68 ++++++++++++++++++++++ 10 files changed, 285 insertions(+), 28 deletions(-) diff -puN arch/i386/mm/discontig.c~B-sparse-150-sparsemem arch/i386/mm/discontig.c --- memhotplug/arch/i386/mm/discontig.c~B-sparse-150-sparsemem 2005-02-17 15:25:36.000000000 -0800 +++ memhotplug-dave/arch/i386/mm/discontig.c 2005-02-17 15:25:36.000000000 -0800 @@ -46,6 +46,8 @@ bootmem_data_t node0_bdata; * 3) node_start_pfn - the starting page frame number for a node * 3) node_end_pfn - the ending page fram number for a node */ +unsigned long node_start_pfn[MAX_NUMNODES]; +unsigned long node_end_pfn[MAX_NUMNODES]; /* * physnode_map keeps track of the physical memory layout of a generic diff -puN include/linux/mm.h~B-sparse-150-sparsemem include/linux/mm.h --- memhotplug/include/linux/mm.h~B-sparse-150-sparsemem 2005-02-17 15:25:36.000000000 -0800 +++ memhotplug-dave/include/linux/mm.h 2005-02-17 15:25:36.000000000 -0800 @@ -400,39 +400,91 @@ static inline void put_page(struct page * sets it, so none of the operations on it need to be atomic. 
*/ -/* Page flags: | NODE | ZONE | ... | FLAGS | */ -#define NODES_PGOFF ((sizeof(page_flags_t)*8) - NODES_SHIFT) -#define ZONES_PGOFF (NODES_PGOFF - ZONES_SHIFT) + +/* + * page->flags layout: + * + * There are three possibilities for how page->flags get + * laid out. The first is for the normal case, without + * sparsemem. The second is for sparsemem when there is + * plenty of space for node and section. The last is when + * we have run out of space and have to fall back to an + * alternate (slower) way of determining the node. + * + * No sparsemem: | NODE | ZONE | ... | FLAGS | + * with space for node: | SECTION | NODE | ZONE | ... | FLAGS | + * no space for node: | SECTION | ZONE | ... | FLAGS | + */ +#if SECTIONS_SHIFT+NODES_SHIFT+ZONES_SHIFT <= FLAGS_RESERVED +#define NODES_WIDTH NODES_SHIFT +#else +#define NODES_WIDTH 0 +#endif + +#ifdef CONFIG_SPARSEMEM +#define SECTIONS_WIDTH SECTIONS_SHIFT +#else +#define SECTIONS_WIDTH 0 +#endif + +#define ZONES_WIDTH ZONES_SHIFT + +/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */ +#define SECTIONS_PGOFF ((sizeof(page_flags_t)*8) - SECTIONS_WIDTH) +#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) +#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) + +/* + * We are going to use the flags for the page to node mapping if its in + * there. This includes the case where there is no node, so it is implicit. + */ +#define FLAGS_HAS_NODE (NODES_WIDTH > 0 || NODES_SHIFT == 0) + +#ifndef PFN_SECTION_SHIFT +#define PFN_SECTION_SHIFT 0 +#endif /* * Define the bit shifts to access each section. For non-existant * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. 
*/ -#define NODES_PGSHIFT (NODES_PGOFF * (NODES_SHIFT != 0)) -#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_SHIFT != 0)) +#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) +#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) +#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -/* NODE:ZONE is used to lookup the zone from a page. */ +/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. */ +#if FLAGS_HAS_NODE #define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#else +#define ZONETABLE_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) +#endif #define ZONETABLE_PGSHIFT ZONES_PGSHIFT -#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED -#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED +#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED +#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED #endif -#define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) - -#define ZONES_MASK ((1UL << ZONES_SHIFT) - 1) -#define NODES_MASK ((1UL << NODES_SHIFT) - 1) +#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) +#define NODES_MASK ((1UL << NODES_WIDTH) - 1) +#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1) static inline unsigned long page_zonenum(struct page *page) { return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } +static inline struct zone *page_zone(struct page *page); static inline unsigned long page_to_nid(struct page *page) { - return (page->flags >> NODES_PGSHIFT) & NODES_MASK; + if (FLAGS_HAS_NODE) + return (page->flags >> NODES_PGSHIFT) & NODES_MASK; + else + return page_zone(page)->zone_pgdat->node_id; +} +static inline unsigned long page_to_section(struct page *page) +{ + return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } struct zone; @@ -454,12 +506,18 @@ static inline void set_page_node(struct page->flags &= ~(NODES_MASK << NODES_PGSHIFT); page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } +static inline void set_page_section(struct page *page, unsigned long 
section) +{ + page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); + page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; +} static inline void set_page_links(struct page *page, unsigned long zone, - unsigned long node) + unsigned long node, unsigned long pfn) { set_page_zone(page, zone); set_page_node(page, node); + set_page_section(page, pfn >> PFN_SECTION_SHIFT); } #ifndef CONFIG_DISCONTIGMEM diff -puN include/linux/mmzone.h~B-sparse-150-sparsemem include/linux/mmzone.h --- memhotplug/include/linux/mmzone.h~B-sparse-150-sparsemem 2005-02-17 15:25:36.000000000 -0800 +++ memhotplug-dave/include/linux/mmzone.h 2005-02-17 15:25:36.000000000 -0800 @@ -372,7 +372,7 @@ int lowmem_reserve_ratio_sysctl_handler( /* Returns the number of the current Node. */ #define numa_node_id() (cpu_to_node(_smp_processor_id())) -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM extern struct pglist_data contig_page_data; #define NODE_DATA(nid) (&contig_page_data) @@ -380,11 +380,11 @@ extern struct pglist_data contig_page_da #define MAX_NODES_SHIFT 1 #define pfn_to_nid(pfn) (0) -#else /* CONFIG_DISCONTIGMEM */ +#else /* !CONFIG_FLATMEM */ #include -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_FLATMEM */ #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) /* @@ -405,6 +405,93 @@ extern struct pglist_data contig_page_da #endif +#ifdef CONFIG_SPARSEMEM + +/* + * SECTION_SHIFT #bits space required to store a section # + * + * PA_SECTION_SHIFT physical address to/from section number + * PFN_SECTION_SHIFT pfn to/from section number + */ +#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) + +#define PA_SECTION_SHIFT (SECTION_SIZE_BITS) +#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) + +#define NR_MEM_SECTIONS (1 << SECTIONS_SHIFT) + +#define PAGES_PER_SECTION (1 << PFN_SECTION_SHIFT) +#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) + +#if MAX_ORDER > SECTION_SIZE_BITS +#error MAX_ORDER exceeds SECTION_SIZE_BITS +#endif + +struct page; 
+struct mem_section { + struct page *section_mem_map; +}; + +extern struct mem_section mem_section[NR_MEM_SECTIONS]; + +/* + * Given a kernel address, find the home node of the underlying memory. + */ +#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) + +static inline struct mem_section *__pfn_to_section(unsigned long pfn) +{ + return &mem_section[pfn >> PFN_SECTION_SHIFT]; +} + +#define pfn_to_page(pfn) \ +({ \ + unsigned long __pfn = (pfn); \ + __pfn_to_section(__pfn)->section_mem_map + __pfn; \ +}) +#define page_to_pfn(page) \ +({ \ + page - mem_section[page_to_section(page)].section_mem_map; \ +}) + +static inline int pfn_valid(unsigned long pfn) +{ + if ((pfn >> PFN_SECTION_SHIFT) >= NR_MEM_SECTIONS) + return 0; + return mem_section[pfn >> PFN_SECTION_SHIFT].section_mem_map != 0; +} + +/* + * APW/XXX: these are _only_ used during initialisation, therefore they + * can use __initdata ... they should have names to indicate this + * restriction. + */ +#ifdef CONFIG_NUMA +#define pfn_to_nid early_pfn_to_nid +#else +#define pfn_to_nid(pfn) 0 +#endif + +#define pfn_to_pgdat(pfn) \ +({ \ + NODE_DATA(pfn_to_nid(pfn)); \ +}) + +#define early_pfn_valid(pfn) pfn_valid(pfn) +void sparse_init(void); +#define memmodel_init sparse_init + +#endif /* CONFIG_SPARSEMEM */ + +#ifndef early_pfn_valid +#define early_pfn_valid(pfn) (1) +#endif +#ifndef memmodel_init +#define memmodel_init(x) do { } while (0) +#endif + +unsigned long memory_present(int nid, unsigned long start, unsigned long end); + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ diff -puN include/linux/numa.h~B-sparse-150-sparsemem include/linux/numa.h --- memhotplug/include/linux/numa.h~B-sparse-150-sparsemem 2005-02-17 15:25:36.000000000 -0800 +++ memhotplug-dave/include/linux/numa.h 2005-02-17 15:25:36.000000000 -0800 @@ -3,7 +3,7 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#ifndef CONFIG_FLATMEM #include #endif diff -puN mm/Makefile~B-sparse-150-sparsemem mm/Makefile 
--- memhotplug/mm/Makefile~B-sparse-150-sparsemem 2005-02-17 15:25:36.000000000 -0800 +++ memhotplug-dave/mm/Makefile 2005-02-17 15:25:36.000000000 -0800 @@ -15,6 +15,7 @@ obj-y := bootmem.o filemap.o mempool.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o +obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o diff -puN mm/bootmem.c~B-sparse-150-sparsemem mm/bootmem.c --- memhotplug/mm/bootmem.c~B-sparse-150-sparsemem 2005-02-17 15:25:36.000000000 -0800 +++ memhotplug-dave/mm/bootmem.c 2005-02-17 15:25:36.000000000 -0800 @@ -256,6 +256,7 @@ found: static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) { struct page *page; + unsigned long pfn; bootmem_data_t *bdata = pgdat->bdata; unsigned long i, count, total = 0; unsigned long idx; @@ -266,7 +267,7 @@ static unsigned long __init free_all_boo count = 0; /* first extant page of the node */ - page = virt_to_page(phys_to_virt(bdata->node_boot_start)); + pfn = bdata->node_boot_start >> PAGE_SHIFT; idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); map = bdata->node_bootmem_map; /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ @@ -275,6 +276,9 @@ static unsigned long __init free_all_boo gofast = 1; for (i = 0; i < idx; ) { unsigned long v = ~map[i / BITS_PER_LONG]; + + page = pfn_to_page(pfn); + if (gofast && v == ~0UL) { int j, order; @@ -302,8 +306,8 @@ static unsigned long __init free_all_boo } } else { i+=BITS_PER_LONG; - page += BITS_PER_LONG; } + pfn += BITS_PER_LONG; } total += count; diff -puN mm/memory.c~B-sparse-150-sparsemem mm/memory.c --- memhotplug/mm/memory.c~B-sparse-150-sparsemem 2005-02-17 15:25:36.000000000 -0800 +++ memhotplug-dave/mm/memory.c 2005-02-17 15:25:36.000000000 -0800 @@ -59,7 +59,7 @@ #include #include -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM /* use the per-pgdat data instead for discontigmem - 
mbligh */ unsigned long max_mapnr; struct page *mem_map; diff -puN mm/page_alloc.c~B-sparse-150-sparsemem mm/page_alloc.c --- memhotplug/mm/page_alloc.c~B-sparse-150-sparsemem 2005-02-17 15:25:36.000000000 -0800 +++ memhotplug-dave/mm/page_alloc.c 2005-02-17 15:25:36.000000000 -0800 @@ -61,7 +61,7 @@ EXPORT_SYMBOL(nr_swap_pages); * Used by page_zone() to look up the address of the struct zone whose * id is encoded in the upper bits of page->flags */ -struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; +struct zone *zone_table[1 << ZONETABLE_SHIFT]; EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; @@ -1557,11 +1557,14 @@ static void __init calculate_zone_totalp void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { - struct page *start = pfn_to_page(start_pfn); struct page *page; + int pfn; - for (page = start; page < (start + size); page++) { - set_page_links(page, zone, nid); + for (pfn = start_pfn; pfn < (start_pfn + size); pfn++) { + if (!early_pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + set_page_links(page, zone, nid, pfn); set_page_count(page, 0); reset_page_mapcount(page); SetPageReserved(page); @@ -1585,6 +1588,20 @@ void zone_init_free_lists(struct pglist_ } } +#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) +void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, + unsigned long size) +{ + unsigned long snum = pfn >> PFN_SECTION_SHIFT; + unsigned long end = (pfn + size) >> PFN_SECTION_SHIFT; + + if (FLAGS_HAS_NODE) + zone_table[ZONETABLE_INDEX(nid, zid)] = zone; + else + for (; snum <= end; snum++) + zone_table[ZONETABLE_INDEX(snum, zid)] = zone; +} + #ifndef __HAVE_ARCH_MEMMAP_INIT #define memmap_init(size, nid, zone, start_pfn) \ memmap_init_zone((size), (nid), (zone), (start_pfn)) @@ -1692,7 +1709,6 @@ static void __init free_area_init_core(s struct zone *zone = pgdat->node_zones + j; unsigned long 
size, realsize; - zone_table[NODEZONE(nid, j)] = zone; realsize = size = zones_size[j]; if (zholes_size) realsize -= zholes_size[j]; @@ -1722,6 +1738,8 @@ static void __init free_area_init_core(s memmap_init(size, nid, j, zone_start_pfn); + zonetable_add(zone, nid, j, zone_start_pfn, size); + zone_wait_table_init(zone, size); init_currently_empty_zone(zone, zone_start_pfn, size); zone_start_pfn += size; @@ -1746,7 +1764,7 @@ static void __init alloc_node_mem_map(st map = alloc_bootmem_node(pgdat, size); pgdat->node_mem_map = map; -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM /* * With no DISCONTIG, the global mem_map is just set as node 0's */ @@ -1768,7 +1786,7 @@ void __init free_area_init_node(int nid, free_area_init_core(pgdat, zones_size, zholes_size); } -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM static bootmem_data_t contig_bootmem_data; struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; diff -puN /dev/null mm/sparse.c --- /dev/null 2004-11-08 15:18:04.000000000 -0800 +++ memhotplug-dave/mm/sparse.c 2005-02-17 15:25:36.000000000 -0800 @@ -0,0 +1,68 @@ +/* + * Non-linear memory mappings. + */ +#include +#include +#include +#include +#include + +/* + * Permenant non-linear data: + * + * 1) mem_section - memory sections, mem_map's for valid memory + */ +struct mem_section mem_section[NR_MEM_SECTIONS]; +EXPORT_SYMBOL(mem_section); + +/* Record a memory area against a node. 
*/ +unsigned long memory_present(int nid, unsigned long start, unsigned long end) +{ + unsigned long pfn = start; + unsigned long size = 0; + + start &= PAGE_SECTION_MASK; + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { + int section = pfn >> PFN_SECTION_SHIFT; + if (!mem_section[section].section_mem_map) { + mem_section[section].section_mem_map = (void *) -1; + size += (PAGES_PER_SECTION * sizeof (struct page)); + } + } + + return size; +} + +/* + * Allocate the accumulated non-linear sections, allocate a mem_map + * for each and record the physical to section mapping. + */ +void sparse_init(void) +{ + int pnum; + struct page *map; + int nid; + + for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + if (!mem_section[pnum].section_mem_map) + continue; + + nid = early_pfn_to_nid(pnum << PFN_SECTION_SHIFT); + map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); + if (!map) + map = alloc_bootmem_node(NODE_DATA(nid), + sizeof(struct page) * PAGES_PER_SECTION); + if (!map) { + mem_section[pnum].section_mem_map = 0; + continue; + } + + /* + * Subtle, we encode the real pfn into the mem_map such that + * the identity pfn - section_mem_map will return the actual + * physical page frame number. + */ + mem_section[pnum].section_mem_map = map - + (pnum << PFN_SECTION_SHIFT); + } +} diff -puN /dev/null mm/Kconfig --- /dev/null 2004-11-08 15:18:04.000000000 -0800 +++ memhotplug-dave/mm/Kconfig 2005-02-17 15:25:36.000000000 -0800 @@ -0,0 +1,19 @@ +choice + prompt "Memory model" + default SPARSEMEM if ARCH_SPARSEMEM_DEFAULT + default FLATMEM + +config DISCONTIGMEM + bool "Discontigious Memory" + depends on !ARCH_DISCONTIGMEM_DISABLE + +config SPARSEMEM + bool "Sparse Memory" + depends on !ARCH_SPARSEMEM_DISABLE + +config FLATMEM + bool "Flat Memory" + +endchoice + + _