sl[auo]b: retry allocation once in case of failure.

When we are out of space in the caches, we will try to allocate a new
page.  If we still fail, the page allocator will try to free pages
through direct reclaim. This means that if an object allocation failed,
we can be sure that no new pages could be given to us, even though
direct reclaim was likely invoked.

However, direct reclaim will also try to shrink objects from registered
shrinkers. They won't necessarily free a full page, but if our cache
happens to be one with a shrinker, this may very well open up the space
we need. So we retry the allocation in this case.

We can't know for sure whether this happened, so the best we can do is
derive from our allocation flags how likely it is that direct reclaim
was invoked, and retry if we conclude that this is highly likely
(__GFP_WAIT && __GFP_FS && !__GFP_NORETRY).
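
Schematically, the retry amounts to the sketch below, where allocate()
is just a stand-in for the cache's usual allocation path (the real check
is slab_should_retry() in mm/slab.h):

	obj = allocate();		/* hypothetical helper */
	if (!obj && (flags & __GFP_WAIT) && (flags & __GFP_FS) &&
	    !(flags & __GFP_NORETRY))
		obj = allocate();	/* shrinkers may have made room */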

The common case is for the allocation to succeed, so we carefully insert
a likely() branch for that case.

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Christoph Lameter <cl@linux.com>
CC: David Rientjes <rientjes@google.com>
CC: Pekka Enberg <penberg@cs.helsinki.fi>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Mel Gorman <mgorman@suse.de>
diff --git a/mm/slab.c b/mm/slab.c
index a98295f..7e82f99 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3535,6 +3535,12 @@
 	cache_alloc_debugcheck_before(cachep, flags);
 	local_irq_save(save_flags);
 	objp = __do_slab_alloc_node(cachep, flags, nodeid);
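+	/*
+	 * If the first attempt failed, direct reclaim was likely invoked and
+	 * may have shrunk objects back into this cache; retry once.
+	 */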
+	if (slab_should_retry(objp, flags))
+		objp = __do_slab_alloc_node(cachep, flags, nodeid);
 	local_irq_restore(save_flags);
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
 	kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
diff --git a/mm/slab.h b/mm/slab.h
index 34a98d6..03d1590 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -104,6 +104,48 @@
 ssize_t slabinfo_write(struct file *file, const char __user *buffer,
 		       size_t count, loff_t *ppos);
 
+/*
+ * When we are out of space in the caches, we will try to allocate a new page.
+ * If we still fail, the page allocator will try to free pages through direct
+ * reclaim, which means that if an object allocation failed, we can be sure
+ * that no new pages could be given to us.
+ *
+ * However, direct reclaim will also try to shrink objects from registered
+ * shrinkers. They won't necessarily free a full page, but if our cache happens
+ * to be one with a shrinker, this may very well open up the space we need. So
+ * we retry the allocation in this case.
+ *
+ * We can't know for sure whether this is the case, so the best we can do is
+ * derive from our allocation flags how likely it is that direct reclaim has
+ * been invoked.
+ *
+ * Most of the time the allocation will succeed, and this will be just a branch
+ * with a very high hit ratio.
+ */
+static inline bool slab_should_retry(void *obj, gfp_t flags)
+{
+	if (likely(obj))
+		return false;
+
+	/*
+	 * These are obvious: we can't retry if the flags either explicitly
+	 * prohibit retrying or disallow waiting.
+	 */
+	if ((flags & __GFP_NORETRY) || !(flags & __GFP_WAIT))
+		return false;
+
+	/*
+	 * If this is not a __GFP_FS allocation, we are unlikely to have
+	 * reclaimed many objects - if at all. We may have succeeded in
+	 * allocating a new page, in which case the object allocation would
+	 * have succeeded, but most of the shrinkable objects would still be
+	 * in their caches. So retrying is likely futile.
+	 */
+	if (!(flags & __GFP_FS))
+		return false;
+	return true;
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 static inline bool is_root_cache(struct kmem_cache *s)
 {
diff --git a/mm/slob.c b/mm/slob.c
index a99fdf7..f00127b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -424,7 +424,7 @@
  */
 
 static __always_inline void *
-__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
+___do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
 {
 	unsigned int *m;
 	int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
@@ -462,6 +462,15 @@
 	return ret;
 }
 
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
+{
+	void *obj = ___do_kmalloc_node(size, gfp, node, caller);
+	if (slab_should_retry(obj, gfp))
+		obj = ___do_kmalloc_node(size, gfp, node, caller);
+	return obj;
+}
+
 void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 {
 	return __do_kmalloc_node(size, gfp, node, _RET_IP_);
@@ -534,7 +544,9 @@
 	return 0;
 }
 
-void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
+static __always_inline void *
+slab_alloc_node(struct kmem_cache *c, gfp_t flags, int node,
+		unsigned long caller)
 {
 	void *b;
 
@@ -544,12 +556,12 @@
 
 	if (c->size < PAGE_SIZE) {
 		b = slob_alloc(c->size, flags, c->align, node);
-		trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
+		trace_kmem_cache_alloc_node(caller, b, c->object_size,
 					    SLOB_UNITS(c->size) * SLOB_UNIT,
 					    flags, node);
 	} else {
 		b = slob_new_pages(flags, get_order(c->size), node);
-		trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
+		trace_kmem_cache_alloc_node(caller, b, c->object_size,
 					    PAGE_SIZE << get_order(c->size),
 					    flags, node);
 	}
@@ -562,6 +574,14 @@
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
+void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
+{
+	void *obj = slab_alloc_node(c, flags, node, _RET_IP_);
+	if (slab_should_retry(obj, flags))
+		obj = slab_alloc_node(c, flags, node, _RET_IP_);
+	return obj;
+}
+
 static void __kmem_cache_free(void *b, int size)
 {
 	if (size < PAGE_SIZE)
diff --git a/mm/slub.c b/mm/slub.c
index b72569c..580dfa8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2318,18 +2318,15 @@
  *
  * Otherwise we can simply pick the next object from the lockless free list.
  */
-static __always_inline void *slab_alloc_node(struct kmem_cache *s,
-		gfp_t gfpflags, int node, unsigned long addr)
+static __always_inline void *
+do_slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node,
+		   unsigned long addr)
 {
 	void **object;
 	struct kmem_cache_cpu *c;
 	struct page *page;
 	unsigned long tid;
 
-	if (slab_pre_alloc_hook(s, gfpflags))
-		return NULL;
-
-	s = memcg_kmem_get_cache(s, gfpflags);
 redo:
 
 	/*
@@ -2389,6 +2386,28 @@
 	return object;
 }
 
+static __always_inline void *
+slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
+		int node, unsigned long addr)
+{
+	void *obj;
+
+	if (slab_pre_alloc_hook(s, gfpflags))
+		return NULL;
+
+	s = memcg_kmem_get_cache(s, gfpflags);
+
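+	/*
+	 * The pre-alloc hook and memcg cache selection run only once; only
+	 * the raw allocation is retried, since a failed attempt has likely
+	 * invoked direct reclaim and shrunk objects back into this cache.
+	 */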
+	obj = do_slab_alloc_node(s, gfpflags, node, addr);
+	if (slab_should_retry(obj, gfpflags))
+		obj = do_slab_alloc_node(s, gfpflags, node, addr);
+	return obj;
+}
+
 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
 {
 	void *ret = slab_alloc_node(s, gfpflags, NUMA_NO_NODE, _RET_IP_);