From: Matt Tolentino This adds generic memory add/remove and supporting functions for memory hotplug into a new file as well as a memory hotplug kernel config option. These functions allow supporting archs to decide the affected zone for memory hotplug operations. Additionally, this patch includes a stab at the sysfs representation of memory ranges as tied to the rest of config_nonlinear and memory hotplug. The idea here is to allow memory hotplug operations to be initiated not only by hardware/firmware events (I'm specifically thinking ACPI here), but also by userspace. Note, this provides the initial framework for the sysfs support and is not yet complete. Signed-off-by: Matt Tolentino Signed-off-by: Dave Hansen --- memhotplug-dave/arch/i386/mm/init.c | 100 +++++ memhotplug-dave/drivers/base/Makefile | 1 memhotplug-dave/drivers/base/init.c | 2 memhotplug-dave/drivers/base/memory.c | 484 +++++++++++++++++++++++++ memhotplug-dave/include/asm-i386/highmem.h | 1 memhotplug-dave/include/linux/highmem.h | 1 memhotplug-dave/include/linux/memory.h | 77 +++ memhotplug-dave/include/linux/memory_hotplug.h | 41 ++ memhotplug-dave/include/linux/mmzone.h | 6 memhotplug-dave/mm/Kconfig | 4 memhotplug-dave/mm/Makefile | 1 memhotplug-dave/mm/highmem.c | 2 memhotplug-dave/mm/memory_hotplug.c | 193 +++++++++ memhotplug-dave/mm/page_alloc.c | 2 14 files changed, 912 insertions(+), 3 deletions(-) diff -puN arch/i386/mm/init.c~L0-sysfs-memory-class arch/i386/mm/init.c --- memhotplug/arch/i386/mm/init.c~L0-sysfs-memory-class 2005-03-23 17:59:56.000000000 -0800 +++ memhotplug-dave/arch/i386/mm/init.c 2005-03-23 17:59:56.000000000 -0800 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -540,6 +541,7 @@ void __init mem_init(void) int tmp; int bad_ppro; + #ifdef CONFIG_FLATMEM if (!mem_map) BUG(); @@ -614,6 +616,104 @@ void __init mem_init(void) #endif } +int add_one_highpage(struct page *page, int pfn, int bad_ppro) +{ + /* + * there's no page_is_ram() check 
because that only covers ram + * from boot-time. We learned about this ram later + */ + if ( !(bad_ppro && page_kills_ppro(pfn))) { + set_bit(PG_highmem, &page->flags); + set_page_count(page, 1); + __free_page(page); + totalhigh_pages++; + } else { + SetPageReserved(page); + BUG(); /* for debugging. remove later */ + } + totalram_pages++; +#ifdef CONFIG_FLATMEM + max_mapnr++; +#endif + num_physpages++; + return 0; +} + + +/* + * Not currently handling the NUMA case. + * Assuming single node and all memory that + * has been added dynamically that would be + * onlined here is in HIGHMEM + */ + +void online_page(struct page *page) +{ + ClearPageReserved(page); + add_one_highpage(page, page_to_pfn(page), 0); +} + +/* + * this is for the non-NUMA, single node SMP system case. + * Specifically, in the case of x86, we will always add + * memory to the highmem for now. + */ +#ifndef CONFIG_NUMA +int add_memory(u64 start, u64 size, unsigned long attr) +{ + struct pglist_data *pgdata = &contig_page_data; + struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + return __add_pages(zone, start_pfn, nr_pages, attr); +} + +int remove_memory(u64 start, u64 size, unsigned long attr) +{ + struct zone *zone; + unsigned long start_pfn, end_pfn, nr_pages; + + start_pfn = start >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; + end_pfn = start_pfn + nr_pages; + + /* + * check to see which zone the page range is in. If + * not in a zone where we allow hotplug (i.e. highmem), + * just fail it right now. 
+ */ + zone = page_zone(pfn_to_page(start_pfn)); + + printk(KERN_DEBUG "%s(): memory will be removed from " + "the %s zone\n", __func__, zone->name); + + /* + * not handling removing memory ranges that + * overlap multiple zones yet + */ + if (zone != page_zone(pfn_to_page(end_pfn-1))) + goto overlap; + + /* make sure it is in highmem */ + if (!is_highmem(zone)) { + printk(KERN_DEBUG "%s(): range to be removed must be in highmem!\n", + __func__); + goto not_highmem; + } + + return __remove_pages(zone, start_pfn, nr_pages, attr); + +overlap: + printk(KERN_DEBUG "%s(): memory range to be removed overlaps " + "multiple zones!!!\n", __func__); +not_highmem: + return -EINVAL; +} +#endif + + + kmem_cache_t *pgd_cache; kmem_cache_t *pmd_cache; diff -puN drivers/base/Makefile~L0-sysfs-memory-class drivers/base/Makefile --- memhotplug/drivers/base/Makefile~L0-sysfs-memory-class 2005-03-23 17:59:56.000000000 -0800 +++ memhotplug-dave/drivers/base/Makefile 2005-03-23 17:59:56.000000000 -0800 @@ -7,6 +7,7 @@ obj-y := core.o sys.o interface.o bus. 
obj-y += power/ obj-$(CONFIG_FW_LOADER) += firmware_class.o obj-$(CONFIG_NUMA) += node.o +obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o ifeq ($(CONFIG_DEBUG_DRIVER),y) EXTRA_CFLAGS += -DDEBUG diff -puN drivers/base/init.c~L0-sysfs-memory-class drivers/base/init.c --- memhotplug/drivers/base/init.c~L0-sysfs-memory-class 2005-03-23 17:59:56.000000000 -0800 +++ memhotplug-dave/drivers/base/init.c 2005-03-23 17:59:56.000000000 -0800 @@ -9,6 +9,7 @@ #include #include +#include extern int devices_init(void); extern int buses_init(void); @@ -39,5 +40,6 @@ void __init driver_init(void) platform_bus_init(); system_bus_init(); cpu_dev_init(); + memory_dev_init(); attribute_container_init(); } diff -puN /dev/null drivers/base/memory.c --- /dev/null 2004-11-08 15:18:04.000000000 -0800 +++ memhotplug-dave/drivers/base/memory.c 2005-03-23 17:59:56.000000000 -0800 @@ -0,0 +1,484 @@ +/* + * drivers/base/memory.c - basic Memory class support + */ + +#include +#include +#include +#include /* capable() */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEMORY_CLASS_NAME "memory" + +struct sysdev_class memory_sysdev_class = { + set_kset_name(MEMORY_CLASS_NAME), +}; +EXPORT_SYMBOL(memory_sysdev_class); + +/* + * With these ops structures, we can override actions for things + * like merging or splitting + */ +static int memory_hotplug_filter(struct kset *kset, struct kobject *kobj) +{ +/* struct kobj_type *ktype = get_ktype(kobj); */ + return 1; +} + +static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj) +{ + return MEMORY_CLASS_NAME; +} + +static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp, + int num_envp, char *buffer, int buffer_size) +{ + int retval = 0; + + return retval; +} + +static struct kset_hotplug_ops memory_hotplug_ops = { + .filter = memory_hotplug_filter, + .name = memory_hotplug_name, + .hotplug = memory_hotplug, +}; + + +/* + * register_memory - Setup a sysfs device for a memory block 
+ */ +int +register_memory(struct memory_block *memory, struct mem_section *section, + struct node *root) +{ + int error; + + memory->sysdev.cls = &memory_sysdev_class; + memory->sysdev.id = __section_nr(section); + + error = sysdev_register(&memory->sysdev); + + if (root && !error) + error = sysfs_create_link(&root->sysdev.kobj, + &memory->sysdev.kobj, + kobject_name(&memory->sysdev.kobj)); + + return error; +} + +void +unregister_memory(struct memory_block *memory, struct mem_section *section, + struct node *root) +{ + BUG_ON(memory->sysdev.cls != &memory_sysdev_class); + BUG_ON(memory->sysdev.id != __section_nr(section)); + + sysdev_unregister(&memory->sysdev); + if (root) + sysfs_remove_link(&root->sysdev.kobj, kobject_name(&memory->sysdev.kobj)); +} + +/* + * use this as the physical section index that this memsection + * uses. + */ + +static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + return sprintf(buf, "%08lx\n", mem->phys_index); +} + +/* + * online, offline, going offline, etc. + */ +static ssize_t show_mem_state(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + ssize_t len = 0; + + /* + * We can probably put these states in a nice little array + * so that they're not open-coded + */ + switch (mem->state) { + case MEM_ONLINE: + len = sprintf(buf, "online\n"); + break; + case MEM_OFFLINE: + len = sprintf(buf, "offline\n"); + break; + case MEM_GOING_OFFLINE: + len = sprintf(buf, "going-offline\n"); + break; + case MEM_INVALID: + len = sprintf(buf, "invalid\n"); + break; + default: + len = sprintf(buf, "ERROR\n"); + break; + } + + return len; +} + +#ifdef CONFIG_SPARSEMEM +/* this can't stay here. 
it needs to go into nonlinear.c or something */ +static int +memory_block_action(struct memory_block *mem, unsigned long action) +{ + int i; + unsigned long psection; + unsigned long start_pfn, start_paddr; + struct page *first_page; + int ret; + int old_state = mem->state; + + /* + * this eventually needs to be a loop so that a memory_block + * can contain more than a single section + */ + psection = mem->phys_index; //pfn_to_section()?? + first_page = pfn_to_page(psection << PFN_SECTION_SHIFT); + printk(KERN_DEBUG "%s()\n" + KERN_DEBUG "\tpsection: %ld\n" + KERN_DEBUG "\tfirst_page: %p\n" + KERN_DEBUG "\tphys_index: %08lx\n", + __func__, psection, first_page, mem->phys_index); + for (i = 0; i < PAGES_PER_SECTION; i++) { + if ((action == MEM_ONLINE) && !PageReserved(first_page+i)) { + printk(KERN_WARNING "%s: section number %ld page number %d " + "not reserved, was it already online? \n", + __func__, psection, i); + return -EBUSY; + } + } + + switch (action) { + case MEM_ONLINE: + start_pfn = page_to_pfn(first_page); + ret = online_pages(start_pfn, PAGES_PER_SECTION); + break; + case MEM_OFFLINE: + mem->state = MEM_GOING_OFFLINE; + start_paddr = page_to_pfn(first_page) <state = old_state; + break; + default: + printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", __func__, + mem, action, action); + ret = -EINVAL; + } + + return ret; +} +#else +static int +memory_block_action(struct memory_block *mem, unsigned long action) +{ + printk(KERN_WARNING "%s() failed to perform action: %d, SPAARSE is " + "compiled out\n", __FUNCTION__, action); + return -ENOSYS; +} +#endif + +/* + * These to_state and from_state things really are just state + * machine changes. It might just be better to declare them + * all in a table instead of in code like this. 
+ */ +static int memory_block_change_state(struct memory_block *mem, + unsigned long to_state, unsigned long from_state_req) +{ + int ret = 0; + down(&mem->state_sem); + + if (mem->state != from_state_req) { + ret = -EINVAL; + goto out; + } + + ret = memory_block_action(mem, to_state); + if (!ret) + mem->state = to_state; + +out: + up(&mem->state_sem); + return ret; +} + +static ssize_t +store_mem_state(struct sys_device *dev, const char *buf, size_t count) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + unsigned int phys_section_nr = mem->phys_index; + int ret = -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (!valid_section_nr(phys_section_nr)) { + printk(KERN_DEBUG "%s: section (%d) is not valid\n", + __func__, phys_section_nr); + goto out; + } + + if (!strncmp(buf, "online", min((int)count, 6))) + ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); + else if(!strncmp(buf, "offline", min((int)count, 7))) + ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); + +out: + if (ret) + return ret; + return count; +} + +/* + * phys_device is a bad name for this. What I really want + * is a way to differentiate between memory ranges that + * are part of physical devices that constitute + * a complete removable unit or fru. + * i.e. do these ranges belong to the same physical device, + * s.t. if I offline all of these sections I can then + * remove the physical device? 
+ */ +static ssize_t show_phys_device(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + return sprintf(buf, "%d\n", mem->phys_device); +} + +SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL); +SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); +SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL); + +#define mem_create_simple_file(mem, attr_name) \ + sysdev_create_file(&mem->sysdev, &attr_##attr_name) +#define mem_remove_simple_file(mem, attr_name) \ + sysdev_remove_file(&mem->sysdev, &attr_##attr_name) + +/* + * Block size attribute stuff + */ + +static ssize_t +print_block_size(struct class *class, char *buf) +{ + return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION*PAGE_SIZE); +} + +static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); + +static int block_size_init(void) +{ + sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_block_size_bytes.attr); + return 0; +} + +/* + * All the probe stuff here + */ + +/* define this off in some header somewhere ... */ +#ifdef CONFIG_ARCH_MEMORY_PROBE +static ssize_t +memory_probe_store(struct class *class, const char __user *buf, size_t count) +{ + u64 phys_addr; + int ret; + /* + * Hmmm... what do we really want this to do? + */ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + phys_addr = simple_strtoull(buf, NULL, 0); + + // a hardware check for the ram? + //if (!ram_present(phys_addr, PAGES_PER_SECTION)) + // return -EINVAL; + + ret = add_memory(phys_addr, (PAGES_PER_SECTION << PAGE_SHIFT), 0); + + if (ret) + count = ret; + + return count; +} +static CLASS_ATTR(probe, 0700, NULL, memory_probe_store); + +static int memory_probe_init(void) +{ + sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_probe.attr); + return 0; +} +#else +#define memory_probe_init(...) (1) +#endif + +/* + * Note that phys_device is optional. 
It is here to allow for + * differentiation between which *physical* devices each + * section belongs to... + */ + +int add_memory_block(unsigned long node_id, struct mem_section *section, + unsigned long state, int phys_device) +{ + size_t size = sizeof(struct memory_block); + struct memory_block *mem = kmalloc(size, GFP_KERNEL); + int ret = 0; + + if (!mem) + return -ENOMEM; + + memset(mem, 0, size); + + mem->phys_index = __section_nr(section); + mem->state = state; + init_MUTEX(&mem->state_sem); + mem->phys_device = phys_device; + +#if 0 + /* not yet sure how this can be optimally structured + * to get the fru information from hw/fw specific drivers + */ + if (mem->callback) + callback(mem); +#endif + + ret = register_memory(mem, section, NULL); + if (!ret) + ret = mem_create_simple_file(mem, phys_index); + if (!ret) + ret = mem_create_simple_file(mem, state); + if (!ret) + ret = mem_create_simple_file(mem, phys_device); + + return ret; +} + +/* + * For now, we have a linear search to go find the appropriate + * memory_block corresponding to a particular phys_index. If + * this gets to be a real problem, we can always use a radix + * tree or something here. + * + * This could be made generic for all sysdev classes. 
+ */ +struct memory_block *find_memory_block(struct mem_section *section) +{ + struct kobject *kobj; + struct sys_device *sysdev; + struct memory_block *mem; + char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; + + /* + * This only works because we know that section == sysdev->id + * slightly redundant with sysdev_register() + */ + sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section)); + printk(KERN_DEBUG "%s() looking for name: \"%s\"\n", __func__, name); + + kobj = kset_find_obj(&memory_sysdev_class.kset, name); + if (!kobj) + return NULL; + + sysdev = container_of(kobj, struct sys_device, kobj); + mem = container_of(sysdev, struct memory_block, sysdev); + + return mem; +} + +int remove_memory_block(unsigned long node_id, struct mem_section *section, + int phys_device) +{ + struct memory_block *mem; + + mem = find_memory_block(section); + +#if 0 + /* not yet sure how this can be optimally structured + * to get the fru information from hw/fw specific drivers + */ + if (mem->callback) + callback(mem); +#endif + + mem_remove_simple_file(mem, phys_index); + mem_remove_simple_file(mem, state); + mem_remove_simple_file(mem, phys_device); + unregister_memory(mem, section, NULL); + + return 0; +} + +/* + * need an interface for the VM to add new memory regions, + * but without onlining it. + */ +int register_new_memory(struct mem_section *section) +{ + printk(KERN_DEBUG "%s(%p)\n", __func__, section); + + /* need some node info here and some sort of callback .... */ + return add_memory_block(0, section, MEM_OFFLINE, 0); +} + +int unregister_memory_section(struct mem_section *section) +{ + if (!valid_section(section)) { + printk(KERN_WARNING "%s: section %d is already invalid\n", + __func__, __section_nr(section)); + return -EINVAL; + } + + /* need some node info here and some sort of callback .... */ + return remove_memory_block(0, section, 0); +} + +/* + * Initialize the sysfs support for memory devices... 
+ */ +int __init memory_dev_init(void) +{ + unsigned int i; + int ret; + + memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops; + ret = sysdev_class_register(&memory_sysdev_class); + + /* + * Create entries for memory sections that were found + * during boot and have been initialized + */ + for (i = 0; i < NR_MEM_SECTIONS; i++) { + if (!valid_section_nr(i)) + break; + add_memory_block(0, &mem_section[i], MEM_ONLINE, 0); + } + + memory_probe_init(); + block_size_init(); + + return ret; +} diff -puN include/asm-i386/highmem.h~L0-sysfs-memory-class include/asm-i386/highmem.h --- memhotplug/include/asm-i386/highmem.h~L0-sysfs-memory-class 2005-03-23 17:59:56.000000000 -0800 +++ memhotplug-dave/include/asm-i386/highmem.h 2005-03-23 17:59:56.000000000 -0800 @@ -65,6 +65,7 @@ extern pte_t *pkmap_page_table; extern void * FASTCALL(kmap_high(struct page *page)); extern void FASTCALL(kunmap_high(struct page *page)); +extern void flush_all_zero_pkmaps(void); void *kmap(struct page *page); void kunmap(struct page *page); diff -puN include/linux/highmem.h~L0-sysfs-memory-class include/linux/highmem.h --- memhotplug/include/linux/highmem.h~L0-sysfs-memory-class 2005-03-23 17:59:56.000000000 -0800 +++ memhotplug-dave/include/linux/highmem.h 2005-03-23 17:59:56.000000000 -0800 @@ -29,6 +29,7 @@ static inline void *kmap(struct page *pa #define kmap_atomic(page, idx) page_address(page) #define kunmap_atomic(addr, idx) do { } while (0) #define kmap_atomic_to_page(ptr) virt_to_page(ptr) +#define flush_all_zero_pkmaps() do { } while (0) #endif /* CONFIG_HIGHMEM */ diff -puN /dev/null include/linux/memory.h --- /dev/null 2004-11-08 15:18:04.000000000 -0800 +++ memhotplug-dave/include/linux/memory.h 2005-03-23 17:59:56.000000000 -0800 @@ -0,0 +1,77 @@ +/* + * include/linux/memory.h - generic memory definition + * + * This is mainly for topological representation. 
We define the + * basic "struct memory_block" here, which can be embedded in per-arch + * definitions or NUMA information. + * + * Basic handling of the devices is done in drivers/base/memory.c + * and system devices are handled in drivers/base/sys.c. + * + * Memory blocks are exported via sysfs in the class/memory/devices/ + * directory. + * + */ +#ifndef _LINUX_MEMORY_H_ +#define _LINUX_MEMORY_H_ + +#include +#include +#include + +#include + +struct memory_block { + unsigned long phys_index; + unsigned long state; + struct semaphore state_sem; + int phys_device; /* to which fru does this belong? */ + void *hw; /* optional pointer to fw/hw data */ + int (*phys_callback)(struct memory_block *); + struct sys_device sysdev; +}; + +#ifndef CONFIG_MEMORY_HOTPLUG +static inline int memory_dev_init(void) +{ + return 0; +} +#else +extern int register_memory(struct memory_block *, struct mem_section *section, struct node *); +extern int register_new_memory(struct mem_section *); +extern int unregister_memory_section(struct mem_section *); +extern int memory_dev_init(void); +#endif + +#ifndef CONFIG_SPARSEMEM +#define CONFIG_MEM_BLOCK_SIZE (1<<27) +#else /* tie this to nonlinear */ +#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<> PAGE_SHIFT) - static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return &mem_section[pfn >> PFN_SECTION_SHIFT]; } +static inline int __section_nr(struct mem_section* ms) +{ + return ms - &mem_section[0]; +} + #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = (pfn); \ diff -puN mm/Kconfig~L0-sysfs-memory-class mm/Kconfig --- memhotplug/mm/Kconfig~L0-sysfs-memory-class 2005-03-23 17:59:56.000000000 -0800 +++ memhotplug-dave/mm/Kconfig 2005-03-23 17:59:56.000000000 -0800 @@ -57,3 +57,7 @@ config MEMORY_MIGRATE comment "Selecting Memory Migration automatically enables CONFIG_SWAP" depends on !SWAP + +config MEMORY_HOTPLUG + bool "Allow for memory hot-add" + depends on SPARSEMEM && HOTPLUG diff -puN 
mm/Makefile~L0-sysfs-memory-class mm/Makefile --- memhotplug/mm/Makefile~L0-sysfs-memory-class 2005-03-23 17:59:56.000000000 -0800 +++ memhotplug-dave/mm/Makefile 2005-03-23 17:59:56.000000000 -0800 @@ -19,4 +19,5 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o obj-$(CONFIG_MEMORY_MIGRATE) += mmigrate.o +obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o diff -puN mm/highmem.c~L0-sysfs-memory-class mm/highmem.c --- memhotplug/mm/highmem.c~L0-sysfs-memory-class 2005-03-23 17:59:56.000000000 -0800 +++ memhotplug-dave/mm/highmem.c 2005-03-23 17:59:56.000000000 -0800 @@ -59,7 +59,7 @@ pte_t * pkmap_page_table; static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); -static void flush_all_zero_pkmaps(void) +void flush_all_zero_pkmaps(void) { int i; diff -puN /dev/null mm/memory_hotplug.c --- /dev/null 2004-11-08 15:18:04.000000000 -0800 +++ memhotplug-dave/mm/memory_hotplug.c 2005-03-23 17:59:56.000000000 -0800 @@ -0,0 +1,193 @@ +/* + * linux/mm/memory_hotplug.c + * + * Copyright (C) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static struct page *__kmalloc_section_memmap(unsigned long nr_pages) +{ + struct page *page, *ret; + unsigned long memmap_size = sizeof(struct page) * nr_pages; + + page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); + if (page) + goto got_map_page; + + ret = vmalloc(memmap_size); + if (ret) + goto got_map_ptr; + + return NULL; +got_map_page: + ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); +got_map_ptr: + memset(ret, 0, memmap_size); + + return ret; +} + +extern int sparse_add_one_section(int, int, struct page *); /* FIXME header*/ +int __add_section(struct zone *zone, unsigned long phys_start_pfn, + unsigned long attr) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nr_pages = PAGES_PER_SECTION; + struct page *memmap; + int zone_type; + 
int nid = 0; + int ret; + + printk(KERN_DEBUG "%s(%p, %08lx, %08lx)\n", __func__, zone, + phys_start_pfn, attr); + + /* + * don't check this for failure because it is possible that the + * section already has a mem_map. The sparse code will fix this up + */ + memmap = __kmalloc_section_memmap(nr_pages); + + down(&zone->resize_sem); + + printk(KERN_DEBUG "%s() phys_start_pfn: %08lx\n", __func__, phys_start_pfn); + ret = sparse_add_one_section(phys_start_pfn, nr_pages, memmap); + + if (ret <= 0) { + /* the mem_map didn't get used */ + if (memmap >= (struct page *)VMALLOC_START && + memmap < (struct page *)VMALLOC_END) + vfree(memmap); + else + free_pages((unsigned long)memmap, + get_order(sizeof(struct page) * nr_pages)); + } + + up(&zone->resize_sem); + + if (ret < 0) { + printk(KERN_WARNING "%s(): error onlining section: %d\n", + __func__, ret); + return ret; + } + + zone_type = zone - pgdat->node_zones; + memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); + zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); + + /* + * Actually, we don't want to online the pages here at all. We + * will enable the new regions to be available via sysfs and thus + * onlined from user space. + */ + { + struct mem_section *ms = __pfn_to_section(phys_start_pfn); + register_new_memory(ms); + } + + return 0; +} + +/* + * Reasonably generic function for adding memory. It is + * expected that archs that support memory hotplug will + * call this function after deciding the zone to which to + * add the new pages. 
+ */ +int __add_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages, unsigned long attr) +{ + unsigned long i; + int err = 0; + + printk(KERN_DEBUG "%s(%p, %08lx, %ld, %08lx)\n", __func__, + zone, phys_start_pfn, nr_pages, attr); + + for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { + printk(KERN_DEBUG "\tfor: i: %ld\n", i); + err = __add_section(zone, phys_start_pfn + i, attr); + + if (err) + break; + } + + /* + * Should we back the ones out that succeeded if any part of + * the addition fails? + */ + + return err; +} + +int online_pages(unsigned long pfn, unsigned long nr_pages) +{ + int i; + + printk(KERN_DEBUG "%s: onlining 0x%lx pages starting from pfn: 0x%lx\n", + __func__, nr_pages, pfn); + + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(pfn + i); + if (page_is_ram(pfn + i)) + online_page(page); + } + + page_zone(pfn_to_page(pfn))->present_pages += nr_pages; + + /* need error checking */ + return 0; +} + +extern void flush_all_zero_pkmaps(void); +int __remove_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages, unsigned long attr) +{ + int order = get_order(nr_pages< %ld\n", + __func__, start_pfn, start_pfn + nr_pages); + + return -EAGAIN; + } +#else + return -EINVAL; +#endif + + unregister_memory_section(ms); + + /* + * Permanent kmaps keep ptes to a page long after a kunmap() to + * keep global tlb flushes to a minimum. When it flushes, it + * works out a pfn and a struct page from that pte which can be + * long after the page is removed. Flush before removal. 
+ */ + flush_all_zero_pkmaps(); +// invalidate_phys_mapping(start_pfn, nr_pages); + ms->section_mem_map &= ~SECTION_MARKED_PRESENT; + return 0; +} diff -puN mm/page_alloc.c~L0-sysfs-memory-class mm/page_alloc.c --- memhotplug/mm/page_alloc.c~L0-sysfs-memory-class 2005-03-23 17:59:56.000000000 -0800 +++ memhotplug-dave/mm/page_alloc.c 2005-03-23 17:59:56.000000000 -0800 @@ -1799,7 +1799,7 @@ static void __init calculate_zone_totalp * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ -void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { struct page *page; _