Here's a consolidated patch that replaces the three x86-64 patches currently in the patchset and adds the ones that were missing. This restores x86-64 functionality for sparsemem and memory hot-plug (well, hot-add only). It applies cleanly once the H-* patches have been reverted. I haven't yet looked at the complete impact on DISCONTIGMEM, but hot-add does work for logical and physical operations. Further, this patch only affects x86-64-specific parts of the tree. Please apply. include/asm-x86_64/bitops.h | 2 include/asm-x86_64/io.h | 2 include/asm-x86_64/mman.h | 1 include/asm-x86_64/page.h | 4 Signed-off-by: Matt Tolentino --- memhotplug-dave/arch/x86_64/Kconfig | 14 + memhotplug-dave/arch/x86_64/kernel/setup.c | 7 memhotplug-dave/arch/x86_64/mm/init.c | 197 +++++++++++++++++++++++----- memhotplug-dave/include/asm-x86_64/bitops.h | 2 memhotplug-dave/include/asm-x86_64/io.h | 2 memhotplug-dave/include/asm-x86_64/mman.h | 1 memhotplug-dave/include/asm-x86_64/mmzone.h | 20 ++ memhotplug-dave/include/asm-x86_64/page.h | 4 8 files changed, 206 insertions(+), 41 deletions(-) diff -puN arch/x86_64/Kconfig~H-sparsemem-x86_64 arch/x86_64/Kconfig --- memhotplug/arch/x86_64/Kconfig~H-sparsemem-x86_64 2005-02-08 14:21:54.000000000 -0800 +++ memhotplug-dave/arch/x86_64/Kconfig 2005-02-08 14:21:54.000000000 -0800 @@ -282,15 +282,19 @@ config NUMA_EMU into virtual nodes when booted with "numa=fake=N", where N is the number of nodes. This is only useful for debugging. 
-config DISCONTIGMEM - bool - depends on NUMA - default y - config NUMA bool default n +config ARCH_SPARSEMEM_DEFAULT + bool + depends on NUMA + +config ARCH_DISCONTIGMEM_DISABLE + depends on !NUMA + +source "mm/Kconfig" + config HAVE_DEC_LOCK bool depends on SMP diff -puN arch/x86_64/kernel/setup.c~H-sparsemem-x86_64 arch/x86_64/kernel/setup.c --- memhotplug/arch/x86_64/kernel/setup.c~H-sparsemem-x86_64 2005-02-08 14:21:54.000000000 -0800 +++ memhotplug-dave/arch/x86_64/kernel/setup.c 2005-02-08 14:21:54.000000000 -0800 @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -511,6 +512,10 @@ void __init setup_arch(char **cmdline_p) */ end_pfn = e820_end_of_ram(); +#ifdef CONFIG_SPARSEMEM + memory_present(0, 0, end_pfn); +#endif + check_efer(); init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); @@ -599,6 +604,8 @@ void __init setup_arch(char **cmdline_p) reserve_bootmem(crashk_res.start, crashk_res.end - crashk_res.start + 1); } #endif + + sparse_init(); paging_init(); check_ioapic(); diff -puN arch/x86_64/mm/init.c~H-sparsemem-x86_64 arch/x86_64/mm/init.c --- memhotplug/arch/x86_64/mm/init.c~H-sparsemem-x86_64 2005-02-08 14:21:54.000000000 -0800 +++ memhotplug-dave/arch/x86_64/mm/init.c 2005-02-08 14:21:54.000000000 -0800 @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -179,13 +181,19 @@ static struct temp_map { {} }; -static __init void *alloc_low_page(int *index, unsigned long *phys) +static __devinit void *alloc_low_page(int *index, unsigned long *phys) { struct temp_map *ti; int i; unsigned long pfn = table_end++, paddr; void *adr; + if (after_bootmem) { + adr = (void *)get_zeroed_page(GFP_ATOMIC); + *phys = __pa(adr); + return adr; + } + if (pfn >= end_pfn) panic("alloc_low_page: ran out of memory"); for (i = 0; temp_mappings[i].allocated; i++) { @@ -198,55 +206,95 @@ static __init void *alloc_low_page(int * ti->allocated = 1; __flush_tlb(); adr = ti->address + ((pfn << PAGE_SHIFT) & 
~PMD_MASK); + memset(adr, 0, PAGE_SIZE); *index = i; *phys = pfn * PAGE_SIZE; return adr; } -static __init void unmap_low_page(int i) +static __devinit void unmap_low_page(int i) { - struct temp_map *ti = &temp_mappings[i]; + struct temp_map *ti; + + if (after_bootmem) + return; + ti = &temp_mappings[i]; set_pmd(ti->pmd, __pmd(0)); ti->allocated = 0; } -static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) + +static void __devinit +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) +{ + int i; + + printk("%s: pmd: 0x%p, address: 0x%lx end: 0x%lx\n", + __func__, pmd, address, end); + + for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) { + unsigned long entry; + + if (address > end) { + for (; i < PTRS_PER_PMD; i++, pmd++) + set_pmd(pmd, __pmd(0)); + break; + } + entry = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | address; + entry &= __supported_pte_mask; + set_pmd(pmd, __pmd(entry)); + } +} + + +static void __devinit +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address)); + + printk("%s: addr: 0x%lx end: 0x%lx pmd: 0x%p\n", + __func__, address, end, pmd); + + if (pmd_none(*pmd)) { + spin_lock(&init_mm.page_table_lock); + phys_pmd_init(pmd, address, end); + spin_unlock(&init_mm.page_table_lock); + __flush_tlb_all(); + } +} + + + +static void __devinit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) { - long i, j; + long i = pud_index(address); - i = pud_index(address); pud = pud + i; + + if (after_bootmem && pud_val(*pud)) { + phys_pmd_update(pud, address, end); + return; + } + for (; i < PTRS_PER_PUD; pud++, i++) { int map; unsigned long paddr, pmd_phys; pmd_t *pmd; - paddr = address + i*PUD_SIZE; - if (paddr >= end) { - for (; i < PTRS_PER_PUD; i++, pud++) - set_pud(pud, __pud(0)); + paddr = (address & PGDIR_MASK) + i*PUD_SIZE; + if (paddr >= end) break; - } - if (!e820_mapped(paddr, 
paddr+PUD_SIZE, 0)) { + if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) { set_pud(pud, __pud(0)); continue; } pmd = alloc_low_page(&map, &pmd_phys); + if (after_bootmem) spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) { - for (; j < PTRS_PER_PMD; j++, pmd++) - set_pmd(pmd, __pmd(0)); - break; - } - pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } + phys_pmd_init(pmd, paddr, end); + if (after_bootmem) spin_unlock(&init_mm.page_table_lock); unmap_low_page(map); } __flush_tlb(); @@ -267,12 +315,16 @@ static void __init find_early_table_spac table_start >>= PAGE_SHIFT; table_end = table_start; + + early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, + table_start< end) next = end; phys_pud_init(pud, __pa(start), __pa(next)); - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); + if (!after_bootmem) + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); unmap_low_page(map); } - asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); + if (!after_bootmem) + asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); __flush_tlb_all(); - early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, - table_start<node_zones + MAX_NR_ZONES - 2; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int ret; + + ret = __add_pages(zone, start_pfn, nr_pages, attr); + if (ret) + goto error; + + printk("%s: just before init_memory_mapping...\n", __func__); + init_memory_mapping(start, (start + size - 1)); + + return ret; +error: + printk("%s: Problem encountered in __add_pages!\n", __func__); + return ret; +} +EXPORT_SYMBOL(add_memory); + +int remove_memory(u64 start, u64 size, unsigned long attr) +{ + struct zone *zone; + unsigned long start_pfn, end_pfn, nr_pages; + + 
printk("%s: start: 0x%llx size: 0x%llx attr: 0x%lx\n", + __func__, start, size, attr); + + start_pfn = start >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; + /* end_pfn is the last *valid* pfn */ + end_pfn = start_pfn + nr_pages - 1; + + zone = page_zone(pfn_to_page(start_pfn)); + + printk("%s: memory will be removed from the %s zone\n", + __func__, zone->name); + printk("%s: start_pfn: 0x%lx nr_pages: 0x%lx end_pfn: 0x%lx\n", + __func__, start_pfn, nr_pages, end_pfn); + + if (zone != page_zone(pfn_to_page(end_pfn))) + goto overlap; + + printk("%s: just before remove pages\n", __func__); + + return __remove_pages(zone, start_pfn, nr_pages, attr); +overlap: + printk("%s: memory range overlaps multiple zones?\n", __func__); + return -ENOSYS; +} +EXPORT_SYMBOL(remove_memory); + +#endif + +/* * devmem_is_allowed() checks to see if /dev/mem access to a certain address is * valid. The argument is a physical page number. * @@ -452,8 +582,11 @@ void __init mem_init(void) tmp = 0; /* should count reserved pages here for all nodes */ #else + +#ifdef CONFIG_FLATMEM max_mapnr = end_pfn; if (!mem_map) BUG(); +#endif totalram_pages += free_all_bootmem(); diff -puN include/asm-x86_64/bitops.h~H-sparsemem-x86_64 include/asm-x86_64/bitops.h --- memhotplug/include/asm-x86_64/bitops.h~H-sparsemem-x86_64 2005-02-08 14:21:54.000000000 -0800 +++ memhotplug-dave/include/asm-x86_64/bitops.h 2005-02-08 14:21:54.000000000 -0800 @@ -411,8 +411,6 @@ static __inline__ int ffs(int x) /* find last set bit */ #define fls(x) generic_fls(x) -#define ARCH_HAS_ATOMIC_UNSIGNED 1 - #endif /* __KERNEL__ */ #endif /* _X86_64_BITOPS_H */ diff -puN include/asm-x86_64/io.h~H-sparsemem-x86_64 include/asm-x86_64/io.h --- memhotplug/include/asm-x86_64/io.h~H-sparsemem-x86_64 2005-02-08 14:21:54.000000000 -0800 +++ memhotplug-dave/include/asm-x86_64/io.h 2005-02-08 14:21:54.000000000 -0800 @@ -132,7 +132,7 @@ extern inline void * phys_to_virt(unsign #include #define page_to_phys(page) 
((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) #else -#define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT) +#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) #endif #include diff -puN include/asm-x86_64/mman.h~H-sparsemem-x86_64 include/asm-x86_64/mman.h --- memhotplug/include/asm-x86_64/mman.h~H-sparsemem-x86_64 2005-02-08 14:21:54.000000000 -0800 +++ memhotplug-dave/include/asm-x86_64/mman.h 2005-02-08 14:21:54.000000000 -0800 @@ -23,6 +23,7 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_IMMOVABLE 0x20000 #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_INVALIDATE 2 /* invalidate the caches */ diff -puN include/asm-x86_64/mmzone.h~H-sparsemem-x86_64 include/asm-x86_64/mmzone.h --- memhotplug/include/asm-x86_64/mmzone.h~H-sparsemem-x86_64 2005-02-08 14:21:54.000000000 -0800 +++ memhotplug-dave/include/asm-x86_64/mmzone.h 2005-02-08 14:21:54.000000000 -0800 @@ -60,4 +60,24 @@ static inline __attribute__((pure)) int ({ u8 nid__ = pfn_to_nid(pfn); \ nid__ != 0xff && (pfn) >= node_start_pfn(nid__) && (pfn) <= node_end_pfn(nid__); })) #endif + +#ifdef CONFIG_SPARSEMEM + +/* generic non-linear memory support: + * + * 1) we will not split memory into more chunks than will fit into the + * flags field of the struct page + */ + + /* + * SECTION_SIZE_BITS 2^N: how big each section will be + * MAX_PHYSADDR_BITS 2^N: how much physical address space we have + * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space + */ +#define SECTION_SIZE_BITS 27 /* matt - 128MB is convenient right now */ +#define MAX_PHYSADDR_BITS 40 +#define MAX_PHYSMEM_BITS 40 + +#endif /* CONFIG_SPARSEMEM */ + #endif diff -puN include/asm-x86_64/page.h~H-sparsemem-x86_64 include/asm-x86_64/page.h --- memhotplug/include/asm-x86_64/page.h~H-sparsemem-x86_64 2005-02-08 14:21:54.000000000 -0800 +++ 
memhotplug-dave/include/asm-x86_64/page.h 2005-02-08 14:21:54.000000000 -0800 @@ -126,7 +126,9 @@ extern int devmem_is_allowed(unsigned lo __pa(v); }) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#ifndef CONFIG_DISCONTIGMEM +#define __boot_va(x) __va(x) +#define __boot_pa(x) __pa(x) +#ifdef CONFIG_FLATMEM #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) _