bcache: support storing bcache btree nodes into NVDIMM meta device

This patch adds support for storing bcache btree nodes on the NVDIMM
meta device when the nvdimm_meta feature is enabled. It is still a
work in progress.

Btree node allocation tries __bch_nvmpg_bucket_alloc() first: it
allocates bucket-sized pages from the NVDIMM pages allocator
(nvmpg.h), encodes the nvmpg offset into PTR_OFFSET of the node's
bkey, and sets the KEY_NVMPG bit. If the feature is disabled or the
NVDIMM allocation fails, allocation falls back to the existing SSD
bucket path.

NVDIMM-backed btree nodes are accessed by memory copy instead of
block I/O:
- __bch_nvmpg_btree_node_read() copies the node from the mapped
  NVDIMM range with memcpy().
- __do_nvmpg_btree_node_write() writes the last bset in place,
  using memcpy_flushcache() for non-leaf nodes.

Since KEY_NVMPG keys do not reference SSD buckets, code paths which
dereference PTR_BUCKET()/PTR_BUCKET_NR(), update bucket generations
or GC marks, or test ptr_stale() are changed to skip NVMPG keys. The
bset magic check in bch_btree_node_read_done() is moved ahead of the
version check, so invalid content is rejected before any other bset
field is trusted.

Signed-off-by: Coly Li <colyli@suse.de>
---
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ce13c27..54ccf83 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -63,6 +63,7 @@
 
 #include "bcache.h"
 #include "btree.h"
+#include "nvmpg.h"
 
 #include <linux/blkdev.h>
 #include <linux/kthread.h>
@@ -477,14 +478,58 @@
 	}
 }
 
+static void __bch_nvmpg_bucket_free(struct cache_set *c, struct bkey *k)
+{
+	int order;
+	unsigned long nvmpg_offset;
+
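+	/*
+	 * The order is in pages: a bucket is bucket_size sectors, i.e.
+	 * bucket_size / PAGE_SECTORS pages. Zero the whole node before
+	 * returning its pages to the NVDIMM allocator.
+	 */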
+	order = ilog2(c->cache->sb.bucket_size / PAGE_SECTORS);
+	nvmpg_offset = bkey_offset_to_nvmpg_offset(PTR_OFFSET(k, 0));
+	memset(bch_nvmpg_offset_to_ptr(nvmpg_offset), 0, PAGE_SIZE << order);
+	bch_nvmpg_free_pages(nvmpg_offset, order, c->set_uuid);
+}
+
 void bch_bucket_free(struct cache_set *c, struct bkey *k)
 {
 	unsigned int i;
 
+	if (KEY_NVMPG(k)) {
+		__bch_nvmpg_bucket_free(c, k);
+		return;
+	}
+
 	for (i = 0; i < KEY_PTRS(k); i++)
 		__bch_bucket_free(c->cache, PTR_BUCKET(c, k, i));
 }
 
+int __bch_nvmpg_bucket_alloc(struct cache_set *c, struct bkey *k)
+{
+	struct cache *ca;
+	unsigned long nvmpg_offset, bkey_offset;
+	int order;
+
+	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
+		return -1;
+
+	lockdep_assert_held(&c->bucket_lock);
+
+	ca = c->cache;
+	order = ilog2(ca->sb.bucket_size / PAGE_SECTORS);
+	nvmpg_offset = bch_nvmpg_alloc_pages(order, c->set_uuid);
+	if (!nvmpg_offset)
+		goto err;
+
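+	/*
+	 * Encode the nvmpg offset into PTR_OFFSET of a single-pointer
+	 * bkey and mark it with KEY_NVMPG, so that the rest of the code
+	 * can tell an NVDIMM-backed node from an SSD bucket.
+	 */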
+	bkey_offset = nvmpg_offset_to_bkey_offset(nvmpg_offset);
+	bkey_init(k);
+	SET_KEY_NVMPG(k, true);
+	k->ptr[0] = MAKE_PTR(0, bkey_offset, ca->sb.nr_this_dev);
+	SET_KEY_PTRS(k, 1);
+
+	return 0;
+err:
+	return -1;
+}
+
 int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
 			   struct bkey *k, bool wait)
 {
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 2acda9c..395c923 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -991,6 +991,7 @@
 		       unsigned int sectors, unsigned int write_point,
 		       unsigned int write_prio, bool wait);
 bool bch_cached_dev_error(struct cached_dev *dc);
+int __bch_nvmpg_bucket_alloc(struct cache_set *c, struct bkey *k);
 
 __printf(2, 3)
 bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 6a90c33..022a227 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -25,6 +25,8 @@
 #include "btree.h"
 #include "debug.h"
 #include "extents.h"
+#include "features.h"
+#include "nvmpg.h"
 
 #include <linux/slab.h>
 #include <linux/bitops.h>
@@ -129,6 +131,9 @@
 {
 	unsigned int i;
 
+	/* NVMPG keys do not pin SSD buckets; nothing to put */
+	if (KEY_NVMPG(k))
+		return;
+
 	for (i = 0; i < KEY_PTRS(k); i++)
 		if (ptr_available(c, k, i))
 			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
@@ -170,6 +175,10 @@
 	for (;
 	     b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
 	     i = write_block(b)) {
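+		/*
+		 * Check the magic before any other bset field: the
+		 * content may have been copied straight from NVDIMM
+		 * memory, so prove it is a valid bset first.
+		 */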
+		err = "bad magic";
+		if (i->magic != bset_magic(&b->c->cache->sb))
+			goto err;
+
 		err = "unsupported bset version";
 		if (i->version > BCACHE_BSET_VERSION)
 			goto err;
@@ -179,10 +188,6 @@
 		    btree_blocks(b))
 			goto err;
 
-		err = "bad magic";
-		if (i->magic != bset_magic(&b->c->cache->sb))
-			goto err;
-
 		err = "bad checksum";
 		switch (i->version) {
 		case 0:
@@ -227,9 +232,15 @@
 	return;
 err:
 	set_btree_node_io_error(b);
-	bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
-			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
-			    bset_block_offset(b, i), i->keys);
+	if (!KEY_NVMPG(&b->key))
+		bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
+				    err, PTR_BUCKET_NR(b->c, &b->key, 0),
+				    bset_block_offset(b, i), i->keys);
+	else
+		bch_cache_set_error(b->c, "%s at addr %p, block %u, %u keys",
+			err, bkey_offset_to_nvmpg_ptr(PTR_OFFSET(&b->key, 0)),
+			bset_block_offset(b, i), i->keys);
+
 	goto out;
 }
 
@@ -240,7 +251,7 @@
 	closure_put(cl);
 }
 
-static void bch_btree_node_read(struct btree *b)
+static void __bch_btree_node_read(struct btree *b)
 {
 	uint64_t start_time = local_clock();
 	struct closure cl;
@@ -278,6 +289,28 @@
 			    PTR_BUCKET_NR(b->c, &b->key, 0));
 }
 
+static void __bch_nvmpg_btree_node_read(struct btree *b)
+{
+	uint64_t start_time = local_clock();
+	void *ptr;
+
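+	/*
+	 * An NVDIMM-backed node is read with a plain memcpy() from the
+	 * mapped range instead of a bio; KEY_SIZE() is in sectors,
+	 * hence the shift by 9 to get bytes.
+	 */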
+	ptr = bkey_offset_to_nvmpg_ptr(PTR_OFFSET(&b->key, 0));
+	memcpy(b->keys.set[0].data, ptr, KEY_SIZE(&b->key) << 9);
+
+	bch_btree_node_read_done(b);
+	bch_time_stats_update(&b->c->btree_read_time, start_time);
+}
+
+static void bch_btree_node_read(struct btree *b)
+{
+	trace_bcache_btree_read(b);
+
+	if (!KEY_NVMPG(&b->key))
+		__bch_btree_node_read(b);
+	else
+		__bch_nvmpg_btree_node_read(b);
+}
+
 static void btree_complete_write(struct btree *b, struct btree_write *w)
 {
 	if (w->prio_blocked &&
@@ -335,7 +368,7 @@
 	closure_put(cl);
 }
 
-static void do_btree_node_write(struct btree *b)
+static void __do_btree_node_write(struct btree *b)
 {
 	struct closure *cl = &b->io;
 	struct bset *i = btree_bset_last(b);
@@ -400,6 +433,68 @@
 	}
 }
 
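+/*
+ * Completion bookkeeping for an NVMPG btree node write: release the
+ * blocked prio count and drop the journal reference. No bio was
+ * involved, so there is no block I/O to complete here.
+ */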
+static void btree_nvmpg_complete_write(struct btree *b, struct btree_write *w)
+{
+	atomic_sub(w->prio_blocked, &b->c->prio_blocked);
+
+	if (w->journal) {
+		atomic_dec_bug(w->journal);
+		__closure_wake_up(&b->c->journal.wait);
+	}
+
+	w->prio_blocked = 0;
+	w->journal	= NULL;
+}
+
+static void btree_nvmpg_node_write_done(struct closure *cl)
+{
+	struct btree *b = container_of(cl, struct btree, io);
+	struct btree_write *w = btree_prev_write(b);
+
+	btree_nvmpg_complete_write(b, w);
+
+	if (btree_node_dirty(b))
+		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
+
+	closure_return_with_destructor(cl, btree_node_write_unlock);
+}
+
+static void __do_nvmpg_btree_node_write(struct btree *b)
+{
+	struct closure *cl = &b->io;
+	struct bset *i = btree_bset_last(b);
+	unsigned long nvmpg_offset;
+	void *nvmpg_ptr;
+
+	i->version	= BCACHE_BSET_VERSION;
+	i->csum		= btree_csum_set(b, i);
+
+	BUG_ON(b->bio);
+
+	/* Calculate location to write */
+	nvmpg_offset = bkey_offset_to_nvmpg_offset(PTR_OFFSET(&b->key, 0));
+	nvmpg_ptr = bch_nvmpg_offset_to_ptr(nvmpg_offset) +
+		bset_byte_offset(&b->keys, i);
+
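+	/*
+	 * memcpy_flushcache() makes non-leaf nodes persistent on the
+	 * NVDIMM before the write completes; leaf node keys are also
+	 * kept in the journal and can be replayed, so a plain memcpy()
+	 * is sufficient for them.
+	 */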
+	if (b->level > 0)
+		memcpy_flushcache(nvmpg_ptr, i,
+			roundup(set_bytes(i), block_bytes(b->c->cache)));
+	else
+		memcpy(nvmpg_ptr, i,
+		       roundup(set_bytes(i), block_bytes(b->c->cache)));
+
+	closure_sync(cl);
+	continue_at_nobarrier(cl, btree_nvmpg_node_write_done, NULL);
+}
+
+static void do_btree_node_write(struct btree *b)
+{
+	if (!KEY_NVMPG(&b->key))
+		__do_btree_node_write(b);
+	else
+		__do_nvmpg_btree_node_write(b);
+}
+
 void __bch_btree_node_write(struct btree *b, struct closure *parent)
 {
 	struct bset *i = btree_bset_last(b);
@@ -535,6 +630,9 @@
 {
 	BUG_ON(btree_node_dirty(b));
 
+	/* Clear the NVMPG bit so the freed in-memory node can be reused */
+	if (KEY_NVMPG(&b->key))
+		SET_KEY_NVMPG(&b->key, false);
+
 	b->key.ptr[0] = 0;
 	hlist_del_init_rcu(&b->hash);
 	list_move(&b->list, &b->c->btree_cache_freeable);
@@ -1091,13 +1189,25 @@
 {
 	BKEY_PADDED(key) k;
 	struct btree *b = ERR_PTR(-EAGAIN);
+	int err = -1;
 
 	mutex_lock(&c->bucket_lock);
 retry:
-	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
-		goto err;
+	/*
+	 * If the nvdimm_meta feature is enabled, try to allocate the
+	 * btree node from NVDIMM pages first; on success the KEY_NVMPG
+	 * bit is set in the returned key.
+	 */
+	if (bch_has_feature_nvdimm_meta(&(c->cache->sb)))
+		err = __bch_nvmpg_bucket_alloc(c, &k.key);
 
-	bkey_put(c, &k.key);
+	if (err < 0) {
+		err = __bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait);
+		if (!err)
+			bkey_put(c, &k.key);
+		else
+			goto err;
+	}
+
 	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
 
 	b = mca_alloc(c, op, &k.key, level);
@@ -1159,10 +1269,12 @@
 	bkey_copy(k, &b->key);
 	bkey_copy_key(k, &ZERO_KEY);
 
-	for (i = 0; i < KEY_PTRS(k); i++)
-		SET_PTR_GEN(k, i,
-			    bch_inc_gen(b->c->cache,
-					PTR_BUCKET(b->c, &b->key, i)));
+	if (!KEY_NVMPG(&b->key)) {
+		for (i = 0; i < KEY_PTRS(k); i++)
+			SET_PTR_GEN(k, i,
+				    bch_inc_gen(b->c->cache,
+						PTR_BUCKET(b->c, &b->key, i)));
+	}
 
 	mutex_unlock(&b->c->bucket_lock);
 }
@@ -1205,6 +1317,9 @@
 	if (!bkey_cmp(k, &ZERO_KEY))
 		return stale;
 
+	if (KEY_NVMPG(k))
+		return stale;
+
 	for (i = 0; i < KEY_PTRS(k); i++) {
 		if (!ptr_available(c, k, i))
 			continue;
@@ -1248,6 +1363,9 @@
 {
 	unsigned int i;
 
+	if (KEY_NVMPG(k))
+		return;
+
 	for (i = 0; i < KEY_PTRS(k); i++)
 		if (ptr_available(c, k, i) &&
 		    !ptr_stale(c, k, i)) {
@@ -1748,10 +1866,14 @@
 
 		spin_lock(&dc->writeback_keys.lock);
 		rbtree_postorder_for_each_entry_safe(w, n,
-					&dc->writeback_keys.keys, node)
+					&dc->writeback_keys.keys, node) {
+			if (KEY_NVMPG(&w->key))
+				continue;
+
 			for (j = 0; j < KEY_PTRS(&w->key); j++)
 				SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
 					    GC_MARK_DIRTY);
+		}
 		spin_unlock(&dc->writeback_keys.lock);
 	}
 	rcu_read_unlock();
@@ -2480,8 +2602,11 @@
 
 	BUG_ON(!b->written);
 
-	for (i = 0; i < KEY_PTRS(&b->key); i++)
-		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
+	if (!KEY_NVMPG(&b->key)) {
+		for (i = 0; i < KEY_PTRS(&b->key); i++)
+			BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio !=
+			       BTREE_PRIO);
+	}
 
 	mutex_lock(&b->c->bucket_lock);
 	list_del_init(&b->list);
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index d626ffc..4b11d85 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -51,13 +51,18 @@
 	for (i = 0; i < KEY_PTRS(k); i++)
 		if (ptr_available(c, k, i)) {
 			struct cache *ca = c->cache;
-			size_t bucket = PTR_BUCKET_NR(c, k, i);
 			size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
 
-			if (KEY_SIZE(k) + r > c->cache->sb.bucket_size ||
-			    bucket <  ca->sb.first_bucket ||
-			    bucket >= ca->sb.nbuckets)
+			if (KEY_SIZE(k) + r > c->cache->sb.bucket_size)
 				return true;
+
+			if (!KEY_NVMPG(k)) {
+				size_t bucket = PTR_BUCKET_NR(c, k, i);
+
+				if (bucket <  ca->sb.first_bucket ||
+				    bucket >= ca->sb.nbuckets)
+					return true;
+			}
 		}
 
 	return false;
@@ -72,17 +77,20 @@
 	for (i = 0; i < KEY_PTRS(k); i++)
 		if (ptr_available(c, k, i)) {
 			struct cache *ca = c->cache;
-			size_t bucket = PTR_BUCKET_NR(c, k, i);
 			size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
 
 			if (KEY_SIZE(k) + r > c->cache->sb.bucket_size)
 				return "bad, length too big";
-			if (bucket <  ca->sb.first_bucket)
-				return "bad, short offset";
-			if (bucket >= ca->sb.nbuckets)
-				return "bad, offset past end of device";
-			if (ptr_stale(c, k, i))
-				return "stale";
+			if (!KEY_NVMPG(k)) {
+				size_t bucket = PTR_BUCKET_NR(c, k, i);
+
+				if (bucket <  ca->sb.first_bucket)
+					return "bad, short offset";
+				if (bucket >= ca->sb.nbuckets)
+					return "bad, offset past end of device";
+				if (ptr_stale(c, k, i))
+					return "stale";
+			}
 		}
 
 	if (!bkey_cmp(k, &ZERO_KEY))
@@ -129,6 +137,9 @@
 	unsigned int j;
 	char buf[80];
 
+	if (KEY_NVMPG(k))
+		return;
+
 	bch_extent_to_text(buf, sizeof(buf), k);
 	pr_cont(" %s", buf);
 
@@ -176,6 +187,9 @@
 	char buf[80];
 	struct bucket *g;
 
+	if (KEY_NVMPG(k))
+		return false;
+
 	if (mutex_trylock(&b->c->bucket_lock)) {
 		for (i = 0; i < KEY_PTRS(k); i++)
 			if (ptr_available(b->c, k, i)) {
@@ -212,10 +226,12 @@
 	    bch_ptr_invalid(bk, k))
 		return true;
 
-	for (i = 0; i < KEY_PTRS(k); i++)
-		if (!ptr_available(b->c, k, i) ||
-		    ptr_stale(b->c, k, i))
+	for (i = 0; i < KEY_PTRS(k); i++) {
+		if (!ptr_available(b->c, k, i))
 			return true;
+		if (!KEY_NVMPG(k) && ptr_stale(b->c, k, i))
+			return true;
+	}
 
 	if (expensive_debug_checks(b->c) &&
 	    btree_ptr_bad_expensive(b, k))
@@ -507,9 +523,13 @@
 static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
 				     unsigned int ptr)
 {
-	struct bucket *g = PTR_BUCKET(b->c, k, ptr);
+	struct bucket *g;
 	char buf[80];
 
+	if (KEY_NVMPG(k))
+		return false;
+
+	g = PTR_BUCKET(b->c, k, ptr);
 	if (mutex_trylock(&b->c->bucket_lock)) {
 		if (b->c->gc_mark_valid &&
 		    (!GC_MARK(g) ||
@@ -548,7 +568,7 @@
 		if (!ptr_available(b->c, k, i))
 			return true;
 
+	/* NVMPG extents have no bucket generation to go stale */
+	if (KEY_NVMPG(k))
+		return false;
+
 	for (i = 0; i < KEY_PTRS(k); i++) {
 		stale = ptr_stale(b->c, k, i);
 
 		if (stale && KEY_DIRTY(k)) {
@@ -588,6 +608,9 @@
 	if (key_merging_disabled(b->c))
 		return false;
 
+	if (KEY_NVMPG(l) || KEY_NVMPG(r))
+		return false;
+
 	for (i = 0; i < KEY_PTRS(l); i++)
 		if (l->ptr[i] + MAKE_PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
 		    PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 24615df..85a20e0 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -382,6 +382,9 @@
 			if (!__bch_extent_invalid(c, k)) {
 				unsigned int j;
 
+				/* NVMPG keys take no bucket pin */
+				if (KEY_NVMPG(k))
+					continue;
+
 				for (j = 0; j < KEY_PTRS(k); j++)
 					if (ptr_available(c, k, j))
 						atomic_inc(&PTR_BUCKET(c, k, j)->pin);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index f2c5a7e..4a5d75e 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -232,9 +232,11 @@
 		if (op->writeback) {
 			SET_KEY_DIRTY(k, true);
 
-			for (i = 0; i < KEY_PTRS(k); i++)
-				SET_GC_MARK(PTR_BUCKET(op->c, k, i),
-					    GC_MARK_DIRTY);
+			if (!KEY_NVMPG(k)) {
+				for (i = 0; i < KEY_PTRS(k); i++)
+					SET_GC_MARK(PTR_BUCKET(op->c, k, i),
+						    GC_MARK_DIRTY);
+			}
 		}
 
 		SET_KEY_CSUM(k, op->csum);
@@ -542,7 +544,10 @@
 	/* XXX: figure out best pointer - for multiple cache devices */
 	ptr = 0;
 
-	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
+	if (!KEY_NVMPG(k))
+		PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
+	else
+		pr_err("nvmpg key should not show up here.\n");
 
 	if (KEY_DIRTY(k))
 		s->read_dirty_data = true;