/*
* mm/simp.c -- simple allocator for cached objects
*
* (C) 1997 Thomas Schoebel-Theuer
*/
#include <linux/simp.h>
#include <linux/tasks.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <asm/spinlock.h>
#include <asm/hardirq.h>
/*#define BENCHMARK*/
#ifdef BENCHMARK
#include <linux/slab.h>
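/*
 * Optional head-to-head timing of SIMP against the stock SLAB
 * allocator, run from the first simp_create() call (and again from
 * get_simpinfo()).  Four allocation patterns are measured, 10000
 * objects each:
 *   straight:    allocate everything, then free everything
 *   oscillating: one extra short-lived object per long-lived one
 *   heavy:       ten extra short-lived objects per long-lived one
 *   plain:       immediate allocate/free pairs
 * The __cons_alloc()/__cons_dealloc() wrappers used below are
 * presumably generated by the DEF_SIMP_HEADER invocation.
 */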
struct cons {
struct cons *car, *cdr;
};
static struct cons * table[10000];
DEF_SIMP_HEADER(cons,struct cons,sizeof(struct cons),SIMP_SMALLOBJS,0,(structor)0,(structor)0,(structor)0,0)
static struct cons_simp * cons_simp;
static kmem_cache_t * slab;
static void simp_straight(struct cons_simp * cons_simp)
{
int j;
for(j = 0; j < 10000; j++) {
table[j] = __cons_alloc(cons_simp);
}
for(j = 0; j < 10000; j++) {
__cons_dealloc(cons_simp, table[j]);
}
}
static void slab_straight(kmem_cache_t * slab)
{
int j;
for(j = 0; j < 10000; j++) {
table[j] = kmem_cache_alloc(slab, SLAB_KERNEL);
}
for(j = 0; j < 10000; j++) {
kmem_cache_free(slab, table[j]);
}
}
static void simp_osc(struct cons_simp * cons_simp)
{
int j;
for(j = 0; j < 10000; j++) {
struct cons * tmp = __cons_alloc(cons_simp);
table[j] = __cons_alloc(cons_simp);
__cons_dealloc(cons_simp, tmp);
}
for(j = 0; j < 10000; j++) {
__cons_dealloc(cons_simp, table[j]);
}
}
static void slab_osc(kmem_cache_t * slab)
{
int j;
for(j = 0; j < 10000; j++) {
struct cons * tmp = kmem_cache_alloc(slab, SLAB_KERNEL);
table[j] = kmem_cache_alloc(slab, SLAB_KERNEL);
kmem_cache_free(slab, tmp);
}
for(j = 0; j < 10000; j++) {
kmem_cache_free(slab, table[j]);
}
}
static void simp_heavy(struct cons_simp * cons_simp)
{
int j;
struct cons * buf[10];
for(j = 0; j < 10000; j++) {
int k;
for(k = 0; k < 10; k++)
buf[k] = __cons_alloc(cons_simp);
table[j] = __cons_alloc(cons_simp);
for(k = 0; k < 10; k++)
__cons_dealloc(cons_simp, buf[k]);
}
for(j = 0; j < 10000; j++) {
__cons_dealloc(cons_simp, table[j]);
}
}
static void slab_heavy(kmem_cache_t * slab)
{
int j;
struct cons * buf[10];
for(j = 0; j < 10000; j++) {
int k;
for(k = 0; k < 10; k++)
buf[k] = kmem_cache_alloc(slab, SLAB_KERNEL);
table[j] = kmem_cache_alloc(slab, SLAB_KERNEL);
for(k = 0; k < 10; k++)
kmem_cache_free(slab, buf[k]);
}
for(j = 0; j < 10000; j++) {
kmem_cache_free(slab, table[j]);
}
}
static void simp_plain(struct cons_simp * cons_simp)
{
int j;
for(j = 0; j < 10000; j++) {
__cons_dealloc(cons_simp, __cons_alloc(cons_simp));
}
}
static void slab_plain(kmem_cache_t * slab)
{
int j;
for(j = 0; j < 10000; j++) {
kmem_cache_free(slab, kmem_cache_alloc(slab, SLAB_KERNEL));
}
}
void benchmark(void)
{
int i;
long time;
cons_simp = cons_create("cons");
slab = kmem_cache_create("cons", sizeof(struct cons), 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
time = jiffies;
for(i = 0; i < 1000; i++) {
simp_straight(cons_simp);
}
printk("SIMP benchmark (straight): %5ld jiffies\n", jiffies - time);
time = jiffies;
for(i = 0; i < 1000; i++) {
slab_straight(slab);
}
printk("SLAB benchmark (straight): %5ld jiffies\n", jiffies - time);
time = jiffies;
for(i = 0; i < 1000; i++) {
simp_osc(cons_simp);
}
printk("SIMP benchmark (oscillating): %5ld jiffies\n", jiffies - time);
time = jiffies;
for(i = 0; i < 1000; i++) {
slab_osc(slab);
}
printk("SLAB benchmark (oscillating): %5ld jiffies\n", jiffies - time);
time = jiffies;
for(i = 0; i < 300; i++) {
simp_heavy(cons_simp);
}
printk("SIMP benchmark (heavy): %5ld jiffies\n", jiffies - time);
time = jiffies;
for(i = 0; i < 300; i++) {
slab_heavy(slab);
}
printk("SLAB benchmark (heavy): %5ld jiffies\n", jiffies - time);
time = jiffies;
for(i = 0; i < 6000; i++) {
simp_plain(cons_simp);
}
printk("SIMP benchmark (plain): %5ld jiffies\n", jiffies - time);
time = jiffies;
for(i = 0; i < 6000; i++) {
slab_plain(slab);
}
printk("SLAB benchmark (plain): %5ld jiffies\n", jiffies - time);
if(cons_try_delete(cons_simp))
printk("deletion of cons simp failed!\n");
}
#endif
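/*
 * Chunk layout: each chunk is CHUNK_SIZE(order) bytes and naturally
 * aligned, so __CHUNK_BASE() can recover the header from any object
 * pointer by masking with simp->hdr_mask.  The struct simp_header
 * sits at the chunk start, the objects follow at a per-chunk cache
 * coloring offset, and an index of free-object pointers grows
 * downward from the chunk end.
 *
 * ALIGN_CACHE() rounds up to a multiple of COLOR_INCREMENT: with
 * 4-byte pointers COLOR_INCREMENT is 32, so e.g. ALIGN_CACHE(20) == 32
 * and ALIGN_CACHE(33) == 64.
 */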
#define GLOBAL_SIZE (PAGE_SIZE << (GLOBAL_ORDER))
#define CHUNK_END(hdr,size) (void**)((char*)(hdr) + (size))
#define COLOR_INCREMENT (8*sizeof(void*)) /* should be 1 cache line */
#define ALIGN_CACHE(adr) ((((((unsigned long)(adr)) - 1) / COLOR_INCREMENT) + 1) * COLOR_INCREMENT)
#define HEADER_SIZE ALIGN_CACHE(sizeof(struct simp_header))
#define MAX_SIMPS ((GLOBAL_SIZE / sizeof(struct simp)) - 1)
struct global_data {
/* 1st cache line */
long nr_simps;
spinlock_t lock;
char fill[15*sizeof(void*) - sizeof(spinlock_t)];
/* rest */
struct simp simps[MAX_SIMPS];
};
static struct global_data * global = NULL;
#ifdef SIMP_DEBUG
static char global_magic[32] = "SIMP header SdC581oi9rY20051962\n";
#endif
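/*
 * DEAD_BEEF debugging aid: make_dead_beef() stamps an object with a
 * 0xdeadbeef pattern, and test_dead_beef() reports when the pattern
 * has been disturbed, i.e. when somebody wrote to an object the
 * allocator still owns.  (test_dead_beef() has no caller in this
 * file; presumably the inline wrappers from linux/simp.h use it.)
 */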
#ifdef DEAD_BEEF
void make_dead_beef(struct simp_header * hdr, void * objp)
{
unsigned int * ptr = (unsigned int*)objp;
int count = hdr->father->real_size / sizeof(unsigned int);
while(count--)
*ptr++ = 0xdeadbeef;
}
void test_dead_beef(struct simp * simp, void * objp)
{
unsigned int * ptr = (unsigned int*)objp;
int size = simp->real_size / sizeof(unsigned int);
while(size--) {
if(*ptr++ != 0xdeadbeef) {
printk("SIMP: deadbeef of object %p is touched\n",
objp);
printk("SIMP: pool is %s\n", simp->name);
break;
}
}
}
#endif
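/*
 * Allocate pages, never failing: retry until __get_free_pages()
 * succeeds, reclaiming empty simp chunks in between.  In interrupt
 * context we cannot schedule(), so busy-wait with interrupts enabled
 * instead (the kludge below); otherwise yield the CPU.
 */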
void * always_get_pages(int gfp_mask, unsigned long gfporder)
{
void * res;
for(;;) {
res = (void*)__get_free_pages(gfp_mask, gfporder);
if(res)
break;
if(simp_garbage())
continue;
if(in_interrupt()) {
int i;
			unsigned long flags;
/* Kludge, to be reworked */
__save_flags(flags);
__sti();
for(i = 0; i < 100000; i++) {
__sti();
}
__restore_flags(flags);
} else {
schedule();
}
}
return res;
}
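/*
 * Create a new object pool.
 * @name:       identifies the pool; a NULL name marks a free slot
 *              in the global table
 * @size:       requested object size; rounded up to a cache-aligned
 *              real_size, or packed into a power-of-two fraction of
 *              a cache line for very small objects
 * @order:      page order of each chunk
 * @first_ctor: run once on each object when it is first handed out
 *              from a fresh chunk
 * @dtor:       run on a chunk's objects before the garbage collector
 *              returns the chunk to the page allocator
 * @reserve:    number of chunks the garbage collector keeps back
 *
 * The first call also sets up the global pool table without locking,
 * so the very first simp_create() presumably has to happen before any
 * concurrency, e.g. at boot.
 *
 * A sketch of typical use, borrowing the benchmark's cons pool above
 * (cons_create() being the presumed DEF_SIMP_HEADER-generated wrapper
 * around simp_create()):
 *
 *	struct cons_simp * pool = cons_create("cons");
 *	struct cons * c = __cons_alloc(pool);
 *	...
 *	__cons_dealloc(pool, c);
 *	cons_try_delete(pool);
 */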
struct simp * simp_create(char * name, long size, long order,
structor first_ctor,
structor dtor, int reserve)
{
struct simp * simp = NULL;
long fraction;
long real_size;
int cpu;
int i;
if(!global) {
global = (struct global_data*)always_get_pages(GFP_KERNEL,GLOBAL_ORDER);
memset(global, 0, GLOBAL_SIZE);
spin_lock_init(&global->lock);
#ifdef BENCHMARK
benchmark();
#endif
}
	spin_lock(&global->lock);
	for(i = 0; i < global->nr_simps; i++) {
		simp = &global->simps[i];
		if(!simp->name)
			break;
	}
	if(i >= MAX_SIMPS) {
		spin_unlock(&global->lock);
		printk("SIMP: too many simps allocated\n");
		return NULL;
	}
	if(i >= global->nr_simps)
		global->nr_simps = i+1;
	simp = &global->simps[i];
	/* claim the slot before dropping the lock, so a concurrent
	 * simp_create() cannot pick the same entry */
	simp->name = name;
	spin_unlock(&global->lock);
	spin_lock_init(&simp->lock);
simp->size = size;
real_size = ALIGN_CACHE(size);
	/* allow packing of very small objects into power-of-two
	 * fractions of a cache line */
fraction = COLOR_INCREMENT / 2;
while(size <= fraction && fraction >= sizeof(void*)) {
real_size = fraction;
fraction >>= 1;
}
simp->real_size = real_size;
simp->order = order;
simp->chunksize = CHUNK_SIZE(order);
simp->hdr_mask = ~(CHUNK_SIZE(order)-1);
simp->first_ctor = first_ctor;
simp->dtor = dtor;
simp->reserve = reserve;
real_size += sizeof(void*);
simp->max_elems = (CHUNK_SIZE(order) - HEADER_SIZE) / real_size;
simp->max_color = (CHUNK_SIZE(order) - HEADER_SIZE) % real_size;
for(cpu = 0; cpu < NR_PROCESSORS; cpu++) {
struct per_cpu * private = &simp->private[cpu];
private->buffer_pos = private->postbuffer;
}
return simp;
}
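/*
 * Allocate and initialize a fresh chunk and put it on the usable
 * list.  Called with simp->lock held; the lock is dropped around the
 * page allocation and retaken afterwards.  The free-object index is
 * built at the chunk end, growing downward, with the objects
 * themselves offset by the current cache color.
 */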
/* Do *not* inline this, it clobbers too many registers... */
static void alloc_simp_header(struct simp * simp)
{
struct simp_header * hdr;
char * ptr;
void ** index;
long count;
spin_unlock(&simp->lock);
	/* always_get_pages() never returns NULL: it loops internally,
	 * reclaiming garbage, until the allocation succeeds */
	hdr = (struct simp_header*)always_get_pages(GFP_KERNEL, simp->order);
#ifdef SIMP_DEBUG
if(__CHUNK_BASE(hdr,simp) != hdr)
panic("simp: bad kernel page alignment");
#endif
memset(hdr, 0, HEADER_SIZE);
#ifdef SIMP_DEBUG
memcpy(hdr->magic, global_magic, sizeof(global_magic));
#endif
hdr->father = simp;
hdr->first_ctor = simp->first_ctor;
/* note: races on simp->color don't produce any error :-) */
ptr = ((char*)hdr) + HEADER_SIZE + simp->color;
index = CHUNK_END(hdr,simp->chunksize);
for(count = 0; count < simp->max_elems; count++) {
*--index = ptr;
ptr += simp->real_size;
}
hdr->index = hdr->fresh = hdr->emptypos = index;
spin_lock(&simp->lock);
simp->color += COLOR_INCREMENT;
if(simp->color >= simp->max_color)
simp->color = 0;
hdr->next = simp->usable_list;
simp->usable_list = hdr;
simp->nr_header++;
}
/* The current x86 memcpy() shuffles registers around for nothing,
 * does unnecessary work when the size is divisible by a power of two,
 * and clobbers way too many registers.
 * The result is that nearly every other register gets spilled to the
 * stack.  Fixing this would be a major win for the whole kernel!
 */
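/*
 * Refill a per-CPU postbuffer: copy up to POSTBUFFER_SIZE/2 object
 * pointers out of the first usable chunk with a single locked
 * memcpy(), take the chunk off the usable list once its index reaches
 * the chunk end (next == -1 means "not on the list"), and run
 * first_ctor on objects that are handed out for the first time.
 * Returns the new buffer position.
 */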
void ** bunch_alloc(struct simp * simp, void ** buffer)
{
struct simp_header * hdr;
void ** index;
void ** to;
void ** end;
structor todo;
long length;
spin_lock(&simp->lock);
hdr = simp->usable_list;
if(!hdr) {
alloc_simp_header(simp);
hdr = simp->usable_list;
}
index = hdr->index;
end = hdr->fresh;
todo = NULL;
if(index == end) {
end = CHUNK_END(hdr,simp->chunksize);
todo = hdr->first_ctor;
}
to = index + POSTBUFFER_SIZE/2;
if(to >= end) {
to = end;
if(to == CHUNK_END(hdr,simp->chunksize)) {
simp->usable_list = hdr->next;
hdr->next = (struct simp_header*)-1;
}
}
if(to > hdr->fresh)
hdr->fresh = to;
hdr->index = to;
length = ((unsigned long)to) - (unsigned long)index;
memcpy(buffer, index, length);
spin_unlock(&simp->lock);
to = buffer + (length/sizeof(void*));
#ifdef DEAD_BEEF
do {
make_dead_beef(hdr, *buffer);
} while(++buffer < to);
#else
if(todo) {
do {
todo(*buffer);
} while(++buffer < to);
}
#endif
return to;
}
#ifdef SIMP_DEBUG
long check_header(struct simp_header * hdr, void * ptr)
{
void ** test;
if(!hdr) {
printk("SIMP: simp_free() with NULL pointer\n");
return 1;
}
if(strncmp(hdr->magic, global_magic, 32)) {
printk("SIMP: simpe_free() with bad ptr %p, or header corruption\n", ptr);
return 1;
}
/* This is brute force, but I don't want to pay for any
* overhead if debugging is not enabled, in particular
* no space overhead for keeping hashtables etc. */
test = hdr->index;
while(test < CHUNK_END(hdr,hdr->father->chunksize)) {
if(*test++ == ptr) {
printk("SIMP: trying to simp_free(%p) again\n", ptr);
return 1;
}
}
return 0;
}
#endif
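/*
 * Give objects from a postbuffer back to their home chunks, which are
 * found via __CHUNK_BASE().  A chunk that was completely exhausted
 * (next == -1) goes back on the usable list.  The loop is unrolled to
 * two objects per iteration unless 'one' forces single stepping,
 * which the garbage collector needs for draining buffers of arbitrary
 * length.  Caller must hold simp->lock.
 */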
static inline
void ** __bunch_free(struct simp * simp, void ** buffer, void ** stop, int one)
{
while(buffer > stop) {
void * elem = buffer[-1];
struct simp_header * hdr = __CHUNK_BASE(elem,simp);
void ** index = hdr->index;
index--;
hdr->index = index;
*index = elem;
if(hdr->next == (struct simp_header*)-1) {
hdr->next = simp->usable_list;
simp->usable_list = hdr;
}
if(one) {
buffer--;
continue;
}
buffer -= 2;
elem = *buffer;
hdr = __CHUNK_BASE(elem,simp);
index = hdr->index;
index--;
hdr->index = index;
*index = elem;
if(hdr->next == (struct simp_header*)-1) {
hdr->next = simp->usable_list;
simp->usable_list = hdr;
}
}
return buffer;
}
void ** bunch_free(struct simp * simp, void ** buffer)
{
void ** stop = buffer - POSTBUFFER_SIZE/3;
void ** res;
spin_lock(&simp->lock);
res = __bunch_free(simp, buffer, stop, 0);
spin_unlock(&simp->lock);
return res;
}
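/*
 * Garbage-collect one pool: drain all per-CPU postbuffers, then hand
 * completely empty chunks (index back at emptypos) beyond the reserve
 * back to the page allocator, running dtor on their objects first.
 * Returns the number of chunks freed.
 */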
long __simp_garbage(struct simp * simp)
{
struct simp_header ** base = &simp->usable_list;
struct simp_header * del;
int res = 0;
int j;
spin_lock(&simp->lock);
for(j = 0; j < NR_PROCESSORS; j++) {
struct per_cpu * private = &simp->private[j];
private->buffer_pos =
__bunch_free(simp,
private->buffer_pos,
private->postbuffer, 1);
}
del = *base;
while(del && simp->nr_header > simp->reserve) {
if(del->index == del->emptypos) {
if(simp->dtor) {
void ** ptr = del->index;
while(ptr < CHUNK_END(del,simp->chunksize)) {
simp->dtor(*ptr++);
}
}
*base = del->next;
#ifdef SIMP_DEBUG
memset(del, 0, CHUNK_SIZE(simp->order));
#endif
free_pages((unsigned long)del, simp->order);
res++;
simp->nr_header--;
} else
base = &del->next;
del = *base;
}
spin_unlock(&simp->lock);
return res;
}
long simp_garbage(void)
{
int i;
int res;
	/* Note: costs do not matter here.  Any heavy thrashing of
	 * simp chunks caused by pools stealing each other's memory
	 * has to be considered a BUG :-)
	 * Simply avoid memory shortages by conservative allocation
	 * policies.
	 */
res = 0;
if(global) {
for(i = 0; i < global->nr_simps; i++) {
struct simp * simp = &global->simps[i];
if(simp->name)
res += __simp_garbage(simp);
}
}
return res;
}
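/*
 * Count a pool's free objects (*nr_free) and the pointers posted in
 * its per-CPU buffers (*posted); the return value is the number of
 * usable chunks.  Runs without taking simp->lock, so the numbers are
 * only approximate under concurrency.
 */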
int __simp_statistics(struct simp * simp, long * nr_free, int * posted)
{
struct simp_header * hdr;
long free = 0;
int nr_usable = 0;
int post = 0;
int j;
for(j = 0; j < NR_PROCESSORS; j++) {
struct per_cpu * private = &simp->private[j];
long len = (unsigned long)private->buffer_pos -
(unsigned long)private->postbuffer;
post += len / sizeof(void*);
}
*posted = post;
for(hdr = simp->usable_list; hdr; hdr = hdr->next) {
long len = (long)CHUNK_END(hdr,simp->chunksize) -
(long)hdr->index;
free += len / sizeof(void*);
nr_usable++;
}
*nr_free = free;
return nr_usable;
}
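/*
 * Try to delete a pool: drop the reserve, collect all empty chunks,
 * and clear the descriptor once nothing is left.  Returns the number
 * of objects still live (0 on success), computed like the "Alloced"
 * column of get_simpinfo().
 */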
long __try_delete(struct simp * simp)
{
long nr_free;
int posted;
simp->reserve = 0;
__simp_garbage(simp);
__simp_statistics(simp, &nr_free, &posted);
if(!simp->nr_header) {
memset(simp, 0, sizeof(struct simp));
}
return simp->nr_header * simp->max_elems - nr_free - posted;
}
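/*
 * Produce the pool statistics table, one line per active pool
 * (presumably backing a /proc file, analogous to slabinfo).  Returns
 * the length of the generated text.
 */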
int get_simpinfo(char * buf)
{
int i;
char * pos;
#ifdef BENCHMARK
benchmark();
#endif
pos = buf + sprintf(buf, " PoolName ElemSize RealSize ChnkSize El/Ch Alloced Pstd Hdrs Usabl FreeElms\n");
if (!global)
goto out;
for(i = 0; i < global->nr_simps; i++) {
struct simp * simp = &global->simps[i];
int posted;
int nr_usable;
long nr_free;
if(!simp->name)
continue;
nr_usable = __simp_statistics(simp, &nr_free, &posted);
pos += sprintf(pos, "%10s %8ld %8ld %8ld %5ld %8ld %4d %5ld %5d %8ld\n",
simp->name, simp->size, simp->real_size,
PAGE_SIZE << simp->order, simp->max_elems,
simp->nr_header * simp->max_elems - nr_free - posted,
posted, simp->nr_header, nr_usable, nr_free);
}
out:
*pos = '\0';
return (long)pos - (long)buf;
}