If a zone is empty at boot-time and then hot-added to later, it needs to run the same init code that would have been run on it at boot. This patch breaks out zone table and per-cpu-pages functions for use by the hotplug code. You can almost see all of the free_area_init_core() function on one page now. :) Signed-off-by: Dave Hansen --- memhotplug-dave/mm/page_alloc.c | 148 +++++++++++++++++++++++----------------- 1 files changed, 86 insertions(+), 62 deletions(-) diff -puN mm/page_alloc.c~A1-pcp_zone_init mm/page_alloc.c --- memhotplug/mm/page_alloc.c~A1-pcp_zone_init 2005-02-17 15:25:27.000000000 -0800 +++ memhotplug-dave/mm/page_alloc.c 2005-02-17 15:25:27.000000000 -0800 @@ -1565,6 +1565,87 @@ void zone_init_free_lists(struct pglist_ memmap_init_zone((size), (nid), (zone), (start_pfn)) #endif +static __devinit void zone_pcp_init(struct zone *zone) +{ + unsigned long batch; + int cpu; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/4 of a meg - there's + * no point in going beyond the size of L2 cache. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 256 * 1024) + batch = (256 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct per_cpu_pages *pcp; + + pcp = &zone->pageset[cpu].pcp[0]; /* hot */ + pcp->count = 0; + pcp->low = 2 * batch; + pcp->high = 6 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + + pcp = &zone->pageset[cpu].pcp[1]; /* cold */ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + } + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); +} + +static __devinit void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +{ + int table_size_bytes; + int i; + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. + */ + zone->wait_table_size = wait_table_size(zone_size_pages); + zone->wait_table_bits = + wait_table_bits(zone->wait_table_size); + table_size_bytes = zone->wait_table_size * sizeof(wait_queue_head_t); + if (system_state < SYSTEM_RUNNING) + zone->wait_table = alloc_bootmem_node(zone->zone_pgdat, + table_size_bytes); + else + zone->wait_table = kmalloc(table_size_bytes, GFP_KERNEL); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); +} + +static void init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long size) +{ + const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); + struct pglist_data *pgdat = zone->zone_pgdat; + int nid = pgdat->node_id; + + zone->zone_mem_map = pfn_to_page(zone_start_pfn); + zone->zone_start_pfn = zone_start_pfn; + + if ((zone_start_pfn) & (zone_required_alignment-1)) + printk("BUG: wrong zone alignment, it will crash\n"); + + memmap_init(size, nid, zone_idx(zone), zone_start_pfn); + + zone_init_free_lists(pgdat, zone, zone->spanned_pages); + + pgdat->nr_zones++; +} + /* * Set up the zone data structures: * - mark all pages reserved @@ -1574,9 +1655,8 @@ void zone_init_free_lists(struct pglist_ static void __init free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { - unsigned long i, j; - const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); - int cpu, nid = pgdat->node_id; + unsigned long j; + int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; pgdat->nr_zones = 0; @@ -1586,7 +1666,6 @@ static void __init free_area_init_core(s for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize; - unsigned long batch; zone_table[NODEZONE(nid, j)] = zone; realsize = size = zones_size[j]; @@ -1606,40 +1685,7 @@ static void __init free_area_init_core(s zone->free_pages = 0; zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - - /* - * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/4 of a meg - there's - * no point in going beyond the size of L2 cache. - * - * OK, so we don't know how big the cache is. So guess. - */ - batch = zone->present_pages / 1024; - if (batch * PAGE_SIZE > 256 * 1024) - batch = (256 * 1024) / PAGE_SIZE; - batch /= 4; /* We effectively *= 4 below */ - if (batch < 1) - batch = 1; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - struct per_cpu_pages *pcp; - - pcp = &zone->pageset[cpu].pcp[0]; /* hot */ - pcp->count = 0; - pcp->low = 2 * batch; - pcp->high = 6 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - - pcp = &zone->pageset[cpu].pcp[1]; /* cold */ - pcp->count = 0; - pcp->low = 0; - pcp->high = 2 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); + zone_pcp_init(zone); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); zone->nr_scan_active = 0; @@ -1649,33 +1695,11 @@ static void __init free_area_init_core(s if (!size) continue; - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. - */ - zone->wait_table_size = wait_table_size(size); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); - - for(i = 0; i < zone->wait_table_size; ++i) - init_waitqueue_head(zone->wait_table + i); - - pgdat->nr_zones = j+1; - - zone->zone_mem_map = pfn_to_page(zone_start_pfn); - zone->zone_start_pfn = zone_start_pfn; - - if ((zone_start_pfn) & (zone_required_alignment-1)) - printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n"); - memmap_init(size, nid, j, zone_start_pfn); + zone_wait_table_init(zone, size); + init_currently_empty_zone(zone, zone_start_pfn, size); zone_start_pfn += size; - - zone_init_free_lists(pgdat, zone, zone->spanned_pages); } } _