
From: Bradley Christiansen <bradc1@us.ibm.com>

The following adds a page->flags bit to keep track of pages that are under
a memory removal operation.  Note that this is just a simple, fast way to
keep track of which pages are targeted for capture (removal).  We can do
the same thing, with the same interfaces (page_under_capture(),
set_page_under_capture(), ...), with a list, for instance; that
implementation would be slightly more complex, but it would not use
a page->flags bit.

Some of these functions could probably stand to be moved out of mm/page_alloc.c
if a more suitable place is found.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 memhotplug-dave/include/linux/mm.h         |    4 
 memhotplug-dave/include/linux/page-flags.h |   37 +++++
 memhotplug-dave/mm/page_alloc.c            |  184 ++++++++++++++++++++++++++++-
 3 files changed, 222 insertions(+), 3 deletions(-)

diff -puN arch/i386/Kconfig~K1-removal-capture_pages arch/i386/Kconfig
diff -puN include/linux/mm.h~K1-removal-capture_pages include/linux/mm.h
--- memhotplug/include/linux/mm.h~K1-removal-capture_pages	2004-12-10 13:52:52.000000000 -0800
+++ memhotplug-dave/include/linux/mm.h	2004-12-10 13:52:52.000000000 -0800
@@ -270,6 +270,10 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+extern int capture_page_range(unsigned long pfn, int order);
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
diff -puN include/linux/page-flags.h~K1-removal-capture_pages include/linux/page-flags.h
--- memhotplug/include/linux/page-flags.h~K1-removal-capture_pages	2004-12-10 13:52:52.000000000 -0800
+++ memhotplug-dave/include/linux/page-flags.h	2004-12-10 13:52:52.000000000 -0800
@@ -77,7 +77,14 @@
 #define PG_reclaim		18	/* To be reclaimed asap */
 #define PG_sharedpolicy         19      /* Page was allocated for a file
 					   mapping using a shared_policy */
-
+/*
+ * Note that this is just a simple, fast way to keep track of which pages
+ * are targeted for capture (removal).  We can do the same thing, with the
+ * same interfaces, with a list, for instance, but the implementation
+ * would be slightly more complex, but would not waste a page->flags bit.
+ * -- daveh
+ */
+#define PG_capture		20	/* Remove page for memory hotplug */
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -333,4 +340,32 @@ static inline void set_page_writeback(st
 #define ClearPageFsMisc(page)		clear_bit(PG_fs_misc, &(page)->flags)
 #define TestClearPageFsMisc(page)	test_and_clear_bit(PG_fs_misc, &(page)->flags)
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+/* PG_capture is set on pages that are targeted for capture (removal). */
+#define PageUnderCapture(page)	test_bit(PG_capture, &(page)->flags)
+#define SetPageUnderCapture(page)	set_bit(PG_capture, &(page)->flags)
+#define ClearPageUnderCapture(page)	clear_bit(PG_capture, &(page)->flags)
+
+static inline int page_under_capture(struct page *page)
+{
+	return test_bit(PG_capture, &page->flags);
+}
+
+static inline void set_page_under_capture(struct page *page)
+{
+	set_bit(PG_capture, &page->flags);
+}
+
+static inline void clear_page_under_capture(struct page *page)
+{
+	clear_bit(PG_capture, &page->flags);
+}
+#else /* !CONFIG_MEMORY_HOTPLUG: no page is ever under capture */
+#define PageUnderCapture(page)	0
+static inline int page_under_capture(struct page *page)
+{
+	return 0;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
 #endif	/* PAGE_FLAGS_H */
diff -puN mm/page_alloc.c~K1-removal-capture_pages mm/page_alloc.c
--- memhotplug/mm/page_alloc.c~K1-removal-capture_pages	2004-12-10 13:52:52.000000000 -0800
+++ memhotplug-dave/mm/page_alloc.c	2004-12-10 13:52:52.000000000 -0800
@@ -33,6 +33,8 @@
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/nodemask.h>
+#include <linux/suspend.h>
+#include <linux/delay.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -359,6 +361,78 @@ static inline void extract_pages(struct 
 	area->nr_free--;
 }
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Takes a block of 2^order pages out of circulation: drops the capture mark
+ * on each page and subtracts the block from the zone and global page counts.
+ */
+static inline void capture_pages(struct page *page, int order)
+{
+	unsigned long idx, nr = 1UL << order;
+	for (idx = 0; idx < nr; idx++)
+		clear_page_under_capture(page + idx);
+	page_zone(page)->present_pages -= nr;
+	totalram_pages -= nr;
+}
+
+static inline int test_remove_range(struct page *page, struct page *base,
+		int order, int p_order)
+{
+	struct page *base_end = base + (1 << order);
+	struct page *page_end = page + (1 << p_order);
+
+	/* A marked free-block head lying inside the range being removed */
+	if (page >= base && page < base_end && page_under_capture(page))
+		return 1;
+	/* The first page of the range lies inside this free block */
+	return base >= page && base < page_end;
+}
+
+/*
+ * Walks every order of the zone's free lists looking for free blocks that
+ * overlap the 2^order pages starting at base (see test_remove_range()).
+ * Each matching block is taken off its free list and captured as a whole,
+ * even when it is larger than the requested range.
+ * base also selects the zone that is searched.  Always returns 1.
+ */
+static int remove_page_freearea(struct page *base, int order)
+{
+	struct zone *zone = page_zone(base);
+	struct list_head *pos, *next;
+	struct free_area *area;
+	struct page *page;
+	unsigned long flags;
+	int p_order;
+
+	/*
+	 * Speed is not a concern here: the zone lock is dropped and retaken
+	 * for each order so that other users of the zone are not held off
+	 * for the duration of the whole scan.
+	 */
+	for (p_order = 0; p_order < MAX_ORDER; p_order++) {
+		area = zone->free_area + p_order;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		/* _safe walk: extract_pages() presumably unlinks page->lru */
+		list_for_each_safe(pos, next, &area->free_list) {
+			page = list_entry(pos, struct page, lru);
+			if (!test_remove_range(page, base, order, p_order))
+				continue;
+			extract_pages(page, zone, p_order, p_order,
+					area);
+			capture_pages(page, p_order);
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+
+	return 1;
+}
+#else
+static inline void capture_pages(struct page *page, int order)
+{	/* no-op stub for builds without CONFIG_MEMORY_HOTPLUG */
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
 void __free_pages_ok(struct page *page, unsigned int order)
 {
 	LIST_HEAD(list);
@@ -376,6 +450,10 @@ void __free_pages_ok(struct page *page, 
 
 	for (i = 0 ; i < (1 << order) ; ++i)
 		free_pages_check(__FUNCTION__, page + i);
+	if (page_under_capture(page)) {
+		capture_pages(page, order);
+		return;
+	}
 	list_add(&page->lru, &list);
 	kernel_map_pages(page, 1<<order, 0);
 	free_pages_bulk(page_zone(page), 1, &list, order);
@@ -472,6 +550,11 @@ static struct page *__rmqueue(struct zon
 
 		page = list_entry(area->free_list.next, struct page, lru);
 		extract_pages(page, zone, current_order, order, area);
+		if (unlikely(page_under_capture(page))) {
+			capture_pages(page, current_order);
+			current_order--;
+			continue;
+		}
 		return expand(zone, page, order, current_order, area);
 	}
 
@@ -503,7 +586,7 @@ static int rmqueue_bulk(struct zone *zon
 	return allocated;
 }
 
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
+#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
 static void __drain_pages(unsigned int cpu)
 {
 	struct zone *zone;
@@ -522,7 +605,7 @@ static void __drain_pages(unsigned int c
 		}
 	}
 }
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
+#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU || CONFIG_MEMORY_HOTPLUG */
 
 #ifdef CONFIG_PM
 int is_head_of_free_region(struct page *page)
@@ -560,6 +643,98 @@ void drain_local_pages(void)
 }
 #endif /* CONFIG_PM */
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Returns the first pfn in [start_pfn, end_pfn) whose page still has its
+ * capture bit set (i.e. is not captured yet), or end_pfn if there is none.
+ */
+static inline unsigned long first_uncaptured_page(unsigned long start_pfn,
+		unsigned long end_pfn)
+{
+	unsigned long pfn;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn++)
+		if (page_under_capture(pfn_to_page(pfn)))
+			break;
+	return pfn;
+}
+
+static void
+rmb_and_drain_cpu_pages(void * __unused)
+{
+	unsigned long flags;
+
+	/* make this CPU see the capture bits */
+	smp_rmb();
+
+	/*
+	 * Open-coded drain_local_pages(): that helper is only built when
+	 * CONFIG_PM is set, while __drain_pages() is available here.
+	 */
+	local_irq_save(flags);
+	__drain_pages(smp_processor_id());
+	local_irq_restore(flags);
+}
+
+/*
+ * Flags a given order of pages to be removed from memory, then removes any
+ * of those pages that are currently in cpu cache or free lists.  The page
+ * pfn passed must be aligned according to the given order.
+ *
+ * Pages of the range that are still in use are captured as they get freed;
+ * this function sleeps, without a timeout, until the whole range is gone.
+ *
+ * This function assumes that it has received a valid range of pfns.
+ * Returns 0 on success, -EINVAL if start_pfn is not aligned to the order.
+ */
+int capture_page_range(unsigned long start_pfn, int order)
+{
+	unsigned long nr_pages = 1UL << order;
+	unsigned long end_pfn = start_pfn + nr_pages;
+	unsigned long fup = start_pfn, i;
+	unsigned int tries = 0;
+
+	/* If the start_pfn is not aligned with the order return failure */
+	if (start_pfn & (nr_pages - 1))
+		return -EINVAL;
+
+	for (i = 0; i < nr_pages; i++)
+		set_page_under_capture(pfn_to_page(start_pfn + i));
+	/*
+	 * the set_page_under_capture() operations are not barriers, so
+	 * make sure that all the other CPUs can see the capture bits
+	 */
+	smp_wmb();
+
+	/*
+	 * This drains the per-cpu caches, and makes sure that each
+	 * CPU does see the capture bits.  Wait for the other CPUs to
+	 * finish so the free lists are complete before they are searched.
+	 */
+	on_each_cpu(rmb_and_drain_cpu_pages, NULL, 1, 1);
+
+	remove_page_freearea(pfn_to_page(start_pfn), order);
+	/*
+	 * storing the last result (fup) keeps us from having
+	 * to walk the entire range each time
+	 */
+	while ((fup = first_uncaptured_page(fup, end_pfn)) < end_pfn) {
+		/* only push reclaim for the first 100 polls, then just wait */
+		if (tries++ < 100)
+			while (shrink_all_memory(10000));
+		msleep(100);
+	}
+
+	return 0;
+}
+
+#else /* CONFIG_MEMORY_HOTPLUG */
+int capture_page_range(unsigned long pfn, int order)
+{
+	return -ENOSYS;	/* memory hotplug is not configured */
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
 static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 {
 #ifdef CONFIG_NUMA
@@ -603,6 +778,10 @@ static void fastcall free_hot_cold_page(
 	if (PageAnon(page))
 		page->mapping = NULL;
 	free_pages_check(__FUNCTION__, page);
+	if (page_under_capture(page)) {
+		capture_pages(page, 0);
+		return;
+	}
 	pcp = &zone->pageset[get_cpu()].pcp[cold];
 	local_irq_save(flags);
 	if (pcp->count >= pcp->high)
@@ -2317,3 +2496,4 @@ void *__init alloc_large_system_hash(con
 
 	return table;
 }
+
_
