I'm not sure that this is a viable general case that we'll run into all that often, at least until we start doing NUMA node hotplug. So, leave this in the testing code for now, and keep it available in case we ever need it. It's handy for testing on an x86 machine that has <896MB of RAM, because you can boot with mem=512MB, and then add to (what was) an empty HIGHMEM zone. Signed-off-by: Dave Hansen Signed-off-by: Dave Hansen --- memhotplug-dave/include/linux/memory.h | 1 memhotplug-dave/include/linux/mmzone.h | 2 memhotplug-dave/mm/memory_hotplug.c | 2 memhotplug-dave/mm/page_alloc.c | 75 +++++++++++++++++++++++++++------ 4 files changed, 68 insertions(+), 12 deletions(-) diff -puN include/linux/memory.h~E2-for-debugging-handle-add-to-empty-zone include/linux/memory.h --- memhotplug/include/linux/memory.h~E2-for-debugging-handle-add-to-empty-zone 2005-09-30 12:38:19.000000000 -0700 +++ memhotplug-dave/include/linux/memory.h 2005-09-30 12:38:19.000000000 -0700 @@ -79,6 +79,7 @@ extern int attach_device_to_memsection(u #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION< #include #include +#include /* Free memory management - zoned buddy allocator. 
*/ #ifndef CONFIG_FORCE_MAX_ZONEORDER @@ -243,6 +244,7 @@ struct zone { * rarely used fields: */ char *name; + struct semaphore init_sem; } ____cacheline_maxaligned_in_smp; diff -puN mm/memory_hotplug.c~E2-for-debugging-handle-add-to-empty-zone mm/memory_hotplug.c --- memhotplug/mm/memory_hotplug.c~E2-for-debugging-handle-add-to-empty-zone 2005-09-30 12:38:19.000000000 -0700 +++ memhotplug-dave/mm/memory_hotplug.c 2005-09-30 12:38:19.000000000 -0700 @@ -44,6 +44,8 @@ static int __add_section(struct zone *zo ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); + hot_add_zone_init(zone, phys_start_pfn, PAGES_PER_SECTION); + if (ret < 0) return ret; diff -puN mm/page_alloc.c~E2-for-debugging-handle-add-to-empty-zone mm/page_alloc.c --- memhotplug/mm/page_alloc.c~E2-for-debugging-handle-add-to-empty-zone 2005-09-30 12:38:19.000000000 -0700 +++ memhotplug-dave/mm/page_alloc.c 2005-09-30 12:38:19.000000000 -0700 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -1424,7 +1425,7 @@ void show_free_areas(void) /* * Builds allocation fallback zone lists. 
*/ -static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) +int __devinit build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) { switch (k) { struct zone *zone; @@ -1432,7 +1433,12 @@ static int __init build_zonelists_node(p BUG(); case ZONE_HIGHMEM: zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->present_pages) { + /* + * with mem hotplug we don't increment present_pages + * until the pages are actually freed into the zone, + * but we increment spanned pages much earlier + */ + if (zone->spanned_pages) { #ifndef CONFIG_HIGHMEM BUG(); #endif @@ -1440,11 +1446,11 @@ static int __init build_zonelists_node(p } case ZONE_NORMAL: zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->present_pages) + if (zone->spanned_pages) zonelist->zones[j++] = zone; case ZONE_DMA: zone = pgdat->node_zones + ZONE_DMA; - if (zone->present_pages) + if (zone->spanned_pages) zonelist->zones[j++] = zone; } @@ -1465,12 +1471,12 @@ static inline int zone_index_to_type(int #define MAX_NODE_LOAD (num_online_nodes()) #ifdef CONFIG_NUMA -static int __initdata node_load[MAX_NUMNODES]; -static int __init get_node_load(int node) +static int __devinitdata node_load[MAX_NUMNODES]; +static int __devinit get_node_load(int node) { return node_load[node]; } -static void __init increment_node_load(int node, int load) +static void __devinit increment_node_load(int node, int load) { node_load[node] += load; } @@ -1495,7 +1501,7 @@ static inline void increment_node_load(i * on them otherwise. * It returns -1 if no node is found. 
*/ -static int __init find_next_best_node(int node, nodemask_t *used_node_mask) +static int __devinit find_next_best_node(int node, nodemask_t *used_node_mask) { int i, n, val; int min_val = INT_MAX; @@ -1541,7 +1547,7 @@ static int __init find_next_best_node(in return best_node; } -static void __init build_zonelists(pg_data_t *pgdat) +void __devinit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; int prev_node, load; @@ -1865,7 +1871,6 @@ void __init setup_per_cpu_pageset() #endif -static __devinit void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; @@ -1888,7 +1893,7 @@ void zone_wait_table_init(struct zone *z init_waitqueue_head(zone->wait_table + i); } -static __devinit void zone_pcp_init(struct zone *zone) +void zone_pcp_init(struct zone *zone) { int cpu; unsigned long batch = zone_batchsize(zone); @@ -1920,6 +1925,7 @@ static __devinit void init_currently_emp memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); zone_init_free_lists(pgdat, zone, zone->spanned_pages); + zone->spanned_pages = size; } /* @@ -1952,12 +1958,12 @@ static void __init free_area_init_core(s nr_kernel_pages += realsize; nr_all_pages += realsize; - zone->spanned_pages = size; zone->present_pages = realsize; zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); zone_seqlock_init(zone); + init_MUTEX(&zone->init_sem); zone->zone_pgdat = pgdat; zone->free_pages = 0; @@ -2586,3 +2592,48 @@ void *__init alloc_large_system_hash(con return table; } + +static inline int zone_previously_initialized(struct zone *zone) +{ + if (zone->wait_table_size) + return 1; + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static int __build_zonelists(void *__pgdat) +{ + pg_data_t *pgdat = __pgdat; + build_zonelists(pgdat); + return 0; +} + +int hot_add_zone_init(struct zone *zone, unsigned long phys_start_pfn, unsigned long size_pages) +{ + int ret = 0; + + down(&zone->init_sem); + if 
(zone_previously_initialized(zone)) { + ret = -EEXIST; + goto out; + } + + zone_wait_table_init(zone, size_pages); + init_currently_empty_zone(zone, phys_start_pfn, size_pages); + zone_pcp_init(zone); + + /* + * This is an awfully blunt way to do this. But, the + * zonelists are accessed many times over large areas + * of performance-critical code in the allocator. + * That makes it very hard to get a conventional lock + * to work. Think of this as a rw lock with a huge + * write cost. + */ + stop_machine_run(__build_zonelists, zone->zone_pgdat, NR_CPUS); +out: + up(&zone->init_sem); + return ret; +} +#endif _