
From: Dave McCracken <dmccr@us.ibm.com>

The kernel currently uses a linear translation between physical and
kernel virtual addresses.  This means that the first physical page
is mapped to the first virtual page at a certain address: PAGE_OFFSET.

This patch allows for these translations to be done in a nonlinear fashion,
which allows for things like sparse memory layouts without wasting any
virtual space, or data structures.

As a side-effect of this patch, the mem_map[] may be split up, and is no
longer required to be laid out in one virtually contiguous section.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 drivers/isdn/tpam/tpam_memory.c           |    0 
 memhotplug-dave/include/linux/bootmem.h   |    4 
 memhotplug-dave/include/linux/mm.h        |    3 
 memhotplug-dave/include/linux/mmzone.h    |   11 ++
 memhotplug-dave/include/linux/nonlinear.h |  119 +++++++++++++++++++++++++++
 memhotplug-dave/mm/Makefile               |    1 
 memhotplug-dave/mm/nonlinear.c            |  130 ++++++++++++++++++++++++++++++
 memhotplug-dave/mm/page_alloc.c           |    5 -
 8 files changed, 270 insertions(+), 3 deletions(-)

diff -puN include/linux/bootmem.h~C1-nonlinear-base include/linux/bootmem.h
--- memhotplug/include/linux/bootmem.h~C1-nonlinear-base	2004-11-12 16:51:42.000000000 -0800
+++ memhotplug-dave/include/linux/bootmem.h	2004-11-12 16:51:42.000000000 -0800
@@ -61,9 +61,9 @@ extern unsigned long __init free_all_boo
 extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 #define alloc_bootmem_node(pgdat, x) \
-	__alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
+	__alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __boot_pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_node(pgdat, x) \
-	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, __boot_pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages_node(pgdat, x) \
 	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
diff -puN include/linux/mm.h~C1-nonlinear-base include/linux/mm.h
--- memhotplug/include/linux/mm.h~C1-nonlinear-base	2004-11-12 16:51:42.000000000 -0800
+++ memhotplug-dave/include/linux/mm.h	2004-11-12 16:51:42.000000000 -0800
@@ -274,6 +274,9 @@ struct page {
 	void *virtual;			/* Kernel virtual address (NULL if
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
+#ifdef CONFIG_NONLINEAR
+	unsigned short section;		/* memory section id */
+#endif
 };
 
 /*
diff -puN include/linux/mmzone.h~C1-nonlinear-base include/linux/mmzone.h
--- memhotplug/include/linux/mmzone.h~C1-nonlinear-base	2004-11-12 16:51:42.000000000 -0800
+++ memhotplug-dave/include/linux/mmzone.h	2004-11-12 16:51:42.000000000 -0800
@@ -11,6 +11,7 @@
 #include <linux/cache.h>
 #include <linux/threads.h>
 #include <linux/numa.h>
+#include <linux/nonlinear.h>
 #include <asm/atomic.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -202,7 +203,9 @@ struct zone {
 	 * Discontig memory support fields.
 	 */
 	struct pglist_data	*zone_pgdat;
+#ifndef CONFIG_NONLINEAR
 	struct page		*zone_mem_map;
+#endif
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
 
@@ -255,7 +258,9 @@ typedef struct pglist_data {
 	struct zone node_zones[MAX_NR_ZONES];
 	struct zonelist node_zonelists[GFP_ZONETYPES];
 	int nr_zones;
+#ifndef CONFIG_NONLINEAR
 	struct page *node_mem_map;
+#endif
 	struct bootmem_data *bdata;
 	unsigned long node_start_pfn;
 	unsigned long node_present_pages; /* total number of physical pages */
@@ -380,7 +385,13 @@ int lower_zone_protection_sysctl_handler
 
 extern struct pglist_data contig_page_data;
 #define NODE_DATA(nid)		(&contig_page_data)
+#ifndef CONFIG_NONLINEAR
+/*
+ * nodes don't have their own mem_map with NONLINEAR.
+ * Use node_start_pfn and node_spanned_pages instead.
+ */
 #define NODE_MEM_MAP(nid)	mem_map
+#endif
 #define MAX_NODES_SHIFT		1
 #define pfn_to_nid(pfn)		(0)
 
diff -puN /dev/null include/linux/nonlinear.h
--- /dev/null	2004-11-08 15:18:04.000000000 -0800
+++ memhotplug-dave/include/linux/nonlinear.h	2004-11-12 16:51:42.000000000 -0800
@@ -0,0 +1,119 @@
+#ifndef __LINUX_NONLINEAR_H_
+#define __LINUX_NONLINEAR_H_
+
+struct page;
+
+#ifndef CONFIG_NONLINEAR
+
+/*
+ * For now, this keeps us from having to do this to each and
+ * every arch.
+ */
+#ifndef CONFIG_ARCH_HAS_BOOTPA
+#define __boot_pa(x)	__pa(x)
+#define __boot_va(x)	__va(x)
+#endif
+
+static inline void setup_memsections(void) {}
+static inline void alloc_memsections(unsigned long start_pfn, unsigned long start_phys_pfn, unsigned long size) {}
+static inline void alloc_memmap(struct page *page, unsigned long start_pfn, unsigned long size) {}
+
+#else
+#include <asm/nonlinear.h>
+
+#define	__HAVE_ARCH_MEMMAP_INIT	1
+
+#define SECTION_SIZE		(1<<SECTION_SHIFT)
+#define SECTION_MASK		(~(SECTION_SIZE-1))
+#define PAGES_PER_SECTION	(1<<(SECTION_SHIFT-PAGE_SHIFT))
+#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION-1))
+#define	NR_SECTIONS		(1<<(MAX_MEM_SHIFT-SECTION_SHIFT))
+#define	NR_PHYS_SECTIONS	(1<<(MAX_PHYS_SHIFT-SECTION_SHIFT))
+
+#define	INVALID_PHYS_SECTION	((unsigned short)0xffff)
+#define	INVALID_SECTION		((unsigned int)0xffffffff)
+
+struct mem_section {
+	unsigned int	phys_section;	/* physical section backing this one, or INVALID_SECTION */
+	struct page	*mem_map;	/* this section's struct page array; NULL until alloc_memmap() */
+};
+
+extern struct mem_section mem_section[];
+extern unsigned short phys_section[];
+
+/* Start address of section @nr; unsigned int so phys section ids fit too. */
+static inline unsigned long section_to_addr(unsigned int nr)
+{
+	return ((unsigned long)nr) << SECTION_SHIFT;
+}
+
+static inline unsigned int
+addr_to_section(unsigned long addr)
+{
+	return addr >> SECTION_SHIFT;	/* index of the SECTION_SIZE chunk holding addr */
+}
+
+/* First pfn of section @nr; unsigned int so phys section ids fit too. */
+static inline unsigned long section_to_pfn(unsigned int nr)
+{
+	return ((unsigned long)nr) << (SECTION_SHIFT - PAGE_SHIFT);
+}
+
+static inline unsigned int
+pfn_to_section(unsigned long addr)	/* addr is a page frame number here */
+{
+	return addr >> (SECTION_SHIFT - PAGE_SHIFT);	/* pfn / PAGES_PER_SECTION */
+}
+
+static inline unsigned int
+pfn_to_section_roundup(unsigned long addr)	/* addr is a count of pages */
+{
+	return (addr+(PAGES_PER_SECTION-1)) >> (SECTION_SHIFT - PAGE_SHIFT);	/* sections needed, rounded up */
+}
+
+static inline unsigned long
+section_offset(unsigned long addr)
+{
+	return addr & ~SECTION_MASK;	/* keep the low bits: byte offset inside the section */
+}
+
+static inline unsigned long
+section_offset_pfn(unsigned long pfn)
+{
+	return pfn & ~PAGE_SECTION_MASK;	/* page index inside the section */
+}
+
+/* Kernel virtual to physical; PAGE_OFFSET is removed first, mirroring __va(). */
+static inline unsigned long __pa(void *ptr_addr)
+{
+	unsigned long addr = (unsigned long)ptr_addr - PAGE_OFFSET;
+	return section_to_addr(mem_section[addr_to_section(addr)].phys_section) |
+		section_offset(addr);
+}
+
+/* Physical to kernel virtual: remap the section number, keep the offset. */
+static inline void *__va(unsigned long addr)
+{
+	return (void *)(PAGE_OFFSET + (section_offset(addr) |
+		section_to_addr(phys_section[addr_to_section(addr)])));
+}
+
+extern struct page *pfn_to_page(unsigned long pfn);
+extern unsigned long page_to_pfn(struct page *page);
+
+static inline int
+pfn_valid(unsigned long pfn)
+{
+	/* full-width section nr; pfns beyond phys_section[] are never valid */
+	unsigned long nr = pfn >> (SECTION_SHIFT - PAGE_SHIFT);
+
+	return nr < NR_PHYS_SECTIONS && phys_section[nr] != INVALID_PHYS_SECTION;
+}
+
+extern void setup_memsections(void);
+extern void alloc_memsections(unsigned long start_pfn, unsigned long start_phys_pfn, unsigned long size);
+extern void alloc_memmap(struct page *page, unsigned long start_pfn, unsigned long size);
+extern void memmap_init(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn);
+
+#endif /* CONFIG_NONLINEAR */
+#endif /* __LINUX_NONLINEAR_H_ */
diff -puN mm/Makefile~C1-nonlinear-base mm/Makefile
--- memhotplug/mm/Makefile~C1-nonlinear-base	2004-11-12 16:51:42.000000000 -0800
+++ memhotplug-dave/mm/Makefile	2004-11-12 16:51:42.000000000 -0800
@@ -17,4 +17,5 @@ obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_NONLINEAR)	+= nonlinear.o
 
diff -puN /dev/null mm/nonlinear.c
--- /dev/null	2004-11-08 15:18:04.000000000 -0800
+++ memhotplug-dave/mm/nonlinear.c	2004-11-12 16:51:42.000000000 -0800
@@ -0,0 +1,130 @@
+/*
+ * Written by: Dave McCracken <dmccr@us.ibm.com>, IBM Corporation
+ *
+ * Copyright (C) 2004, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+
+struct mem_section mem_section[NR_SECTIONS];
+unsigned short phys_section[NR_PHYS_SECTIONS];
+
+/* Reset both section tables so that nothing is mapped yet. */
+void
+setup_memsections(void)
+{
+	int	nr;
+
+	/* no section has a physical section or a mem_map */
+	for (nr = 0; nr < NR_SECTIONS; nr++) {
+		mem_section[nr].phys_section = INVALID_SECTION;
+		mem_section[nr].mem_map = NULL;
+	}
+	/* no physical section maps back to a section */
+	for (nr = 0; nr < NR_PHYS_SECTIONS; nr++)
+		phys_section[nr] = INVALID_PHYS_SECTION;
+}
+
+/*
+ * Record the two-way mapping between a run of sections and the run of
+ * physical sections behind it.
+ *
+ * start_pfn and start_phys_pfn pick the first section on each side;
+ * pfn_count is rounded up to a whole number of sections.
+ */
+void
+alloc_memsections(unsigned long start_pfn,
+		  unsigned long start_phys_pfn,
+		  unsigned long pfn_count)
+{
+	unsigned int nr_sects = pfn_to_section_roundup(pfn_count);
+	unsigned int first = pfn_to_section(start_pfn);
+	unsigned int first_phys = pfn_to_section(start_phys_pfn);
+	unsigned int i;
+
+	/* mem_section[]: section -> physical section */
+	for (i = 0; i < nr_sects; i++) {
+		mem_section[first + i].phys_section = first_phys + i;
+	}
+	/* phys_section[]: physical section -> section (16-bit ids) */
+	for (i = 0; i < nr_sects; i++) {
+		phys_section[first_phys + i] = (unsigned short)(first + i);
+	}
+}
+
+/*
+ * Hand out consecutive PAGES_PER_SECTION-sized slices of @page to the
+ * sections covering @size pages (rounded up) starting at @start_pfn.
+ */
+void
+alloc_memmap(struct page *page, unsigned long start_pfn, unsigned long size)
+{
+	unsigned int first = pfn_to_section(start_pfn);
+	unsigned int nr_sects = pfn_to_section_roundup(size);
+	unsigned int i;
+
+	for (i = 0; i < nr_sects; i++)
+		mem_section[first + i].mem_map = page + (unsigned long)i * PAGES_PER_SECTION;
+}
+
+/*
+ * Call memmap_init_zone() on a pfn range one section at a time, so that
+ * sections lying in a hole (invalid phys_section[] entry) are skipped.
+ */
+void
+memmap_init(unsigned long num_pages, int nid,
+	    unsigned long zone, unsigned long start_pfn)
+{
+	unsigned long offset = section_offset_pfn(start_pfn);
+
+	while (num_pages) {
+		/*
+		 * Never cross a section boundary: only the first chunk can
+		 * start mid-section, and the last one may be short.
+		 */
+		unsigned long npages = PAGES_PER_SECTION - offset;
+
+		if (npages > num_pages)
+			npages = num_pages;
+		if (pfn_valid(start_pfn))
+			memmap_init_zone(npages, nid, zone, start_pfn);
+
+		start_pfn += npages;
+		num_pages -= npages;
+		offset = 0;
+	}
+}
+
+/* Look up the struct page for a physical pfn via its section's mem_map. */
+struct page *pfn_to_page(unsigned long pfn)
+{
+	struct mem_section *ms = &mem_section[phys_section[pfn_to_section(pfn)]];
+	return ms->mem_map + section_offset_pfn(pfn);
+}
+
+/* Base pfn of the page's physical section plus its index in the mem_map. */
+unsigned long page_to_pfn(struct page *page)
+{
+	struct mem_section *ms = &mem_section[page->section];
+
+	return section_to_pfn(ms->phys_section) + (page - ms->mem_map);
+}
+
diff -puN mm/page_alloc.c~C1-nonlinear-base mm/page_alloc.c
--- memhotplug/mm/page_alloc.c~C1-nonlinear-base	2004-11-12 16:51:42.000000000 -0800
+++ memhotplug-dave/mm/page_alloc.c	2004-11-12 16:51:42.000000000 -0800
@@ -333,7 +333,7 @@ free_pages_bulk(struct zone *zone, int c
 	struct page *base, *page = NULL;
 	int ret = 0;
 
-	base = zone->zone_mem_map;
+	base = pfn_to_page(zone->zone_start_pfn);
 	spin_lock_irqsave(&zone->lock, flags);
 	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
@@ -1715,7 +1715,10 @@ static void init_currently_empty_zone(st
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int nid = pgdat->node_id;
 
+#ifndef CONFIG_NONLINEAR
+	/* most uses of zone->zone_mem_map can be removed */
 	zone->zone_mem_map = pfn_to_page(zone_start_pfn);
+#endif
 	zone->zone_start_pfn = zone_start_pfn;
 
 	if ((zone_start_pfn) & (zone_required_alignment-1))
diff -puN drivers/i2c/busses/i2c-keywest.c~C1-nonlinear-base drivers/i2c/busses/i2c-keywest.c
diff -puN drivers/isdn/tpam/tpam_memory.c~C1-nonlinear-base drivers/isdn/tpam/tpam_memory.c
_
