| /* |
| * Blockconsole - write kernel console to a block device |
| * |
| * Copyright (C) 2012 Joern Engel <joern@logfs.org> |
| * |
| * For usage and disk format, please see |
| * Documentation/block/blockconsole.txt |
| * |
| * Blockconsole allocates a 1MB buffer at init time. printk() calls |
| * bcon_write(), which simply writes to the buffer and wakes up a |
| * writeback thread. bcon_writeback() will then write the console out |
| * to the blockdevice, in chunks of sector_size. |
| * |
| * All allocations for blockconsole happen at init time. All means |
| * 1MB worth of buffer, a struct bio for every 512B sector, plus |
| * another page and struct bio to zero the current 1MB of the device |
| * before writing to it. |
| * |
 * The block layer and device drivers may require further memory
 * allocations, make sleeping calls and require all sorts of
 * infrastructure. In such cases, writeback may be delayed or fail
| * altogether. But since writeback is decoupled from printk(), the |
| * worst consequence should be loss of debug information - which would |
| * be lost without blockconsole as well. |
| * |
| * On top of the writeback thread, bcon_write() also schedules a 1s |
| * timer. When the timer expires, the current partial sector will be |
| * padded and written out as well. Blockconsole does no overwrites, |
| * so a spurious line of up to 510 spaces and a newline is the result. |
| * |
| * There is a panic handler that tries to push out the last dying |
| * breath. Sometimes that works, sometimes it causes a secondary |
| * oops. I have never seen it do harm and when it does work it |
| * provides useful crash information. |
| * |
| * In cases where more printk data comes in the front door than the |
| * backing device can handle, there will be data loss. Again, |
 * blockconsole is best-effort. Given the 1MB buffer, it takes an
 * extremely slow device or a deliberate attempt to overflow the
 * buffer, so this is almost never a problem in practice.
| * |
| * In case the device is already filled with data (particularly after |
| * blockconsole wraps around), each 1MB tile is written with zeroes |
 * once before the regular sector writes happen. No one should
| * interpret stale data from before the wrap-around as current - or |
| * have to guess where the current data ends and the stale data |
| * begins. |
| * |
| * Detection of console devices currently works by having the partition |
| * scanning code call bcon_add() once for every partition and once for |
| * every device. If we find a valid header, the device is automatically |
| * used. |
| * |
| * Removal of console devices is also automatic - sooner or later a |
| * removed device will cause write errors and any device that |
| * consistently returns write errors will get removed. Main goal here |
| * was to be resilient against flaky hardware, using the same code to |
| * handle device removal is a bonus and ensures test coverage. |
| */ |
| #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| |
| #include <linux/bio.h> |
| #include <linux/blockconsole.h> |
| #include <linux/console.h> |
| #include <linux/fs.h> |
| #include <linux/kref.h> |
| #include <linux/kthread.h> |
| #include <linux/mm.h> |
| #include <linux/mount.h> |
| #include <linux/random.h> |
| #include <linux/slab.h> |
| #include <linux/string.h> |
| #include <linux/workqueue.h> |
| #include <linux/sched.h> |
| #include <linux/ctype.h> |
| #include <linux/device.h> |
| #include <linux/genhd.h> |
| |
/* On-disk magic string; a valid blockconsole device starts with this */
#define BLOCKCONSOLE_MAGIC "\nLinux blockconsole version 1.1\n"
/* Byte offsets of the 8-digit hex header fields within sector 0 */
#define BCON_UUID_OFS (32)
#define BCON_ROUND_OFS (41)
#define BCON_TILE_OFS (50)
#define BCON_HEADERSIZE (50)
#define BCON_LONG_HEADERSIZE (59) /* with tile index */

/* Geometry of the in-memory buffer: 256 pages = 1MB, split into 512B sectors */
#define PAGE_COUNT (256)
#define SECTOR_COUNT (PAGE_COUNT * (PAGE_SIZE >> 9))
#define CACHE_PAGE_MASK (PAGE_COUNT - 1)
#define CACHE_SECTOR_MASK (SECTOR_COUNT - 1)
#define CACHE_SIZE (PAGE_COUNT << PAGE_SHIFT)
#define CACHE_MASK (CACHE_SIZE - 1)
#define SECTOR_SHIFT (9)
#define SECTOR_SIZE (1u << SECTOR_SHIFT)
#define SECTOR_MASK (~(SECTOR_SIZE-1))
#define PG_SECTOR_MASK ((PAGE_SIZE >> 9) - 1)
| |
/* One 512B sector of the console buffer plus the bio used to write it out */
struct bcon_bio {
	struct bio bio;
	struct bio_vec bvec;
	void *sector;	/* points into the 1MB buffer (bc->pages) */
	int in_flight;	/* nonzero while the bio is submitted, see wmb/rmb users */
};
| |
/*
 * Per-device console state; one instance per block device carrying a
 * valid blockconsole header.
 */
struct blockconsole {
	char devname[32];		/* space-padded device name for messages */
	atomic64_t console_bytes;	/* printk-side cursor into the device */
	spinlock_t end_io_lock;		/* serializes error accounting in end_io */
	struct timer_list pad_timer;	/* pads out a partial sector after 1s */
	int error_count;		/* write errors, reset on success */
	struct kref kref;
	u64 write_bytes;		/* writeback-side cursor into the device */
	u64 max_bytes;			/* device size, rounded down to 1MB */
	u32 round;			/* how often we wrapped around the device */
	u32 uuid;
	struct bcon_bio bio_array[SECTOR_COUNT];	/* the 1MB buffer */
	struct page *pages;		/* order-8 allocation backing bio_array */
	struct bcon_bio zero_bios[PAGE_COUNT];	/* erase the next 1MB tile */
	struct page *zero_page;		/* shared all-zeroes source for zero_bios */
	struct block_device *bdev;
	struct console console;
	struct work_struct unregister_work;	/* teardown after write errors */
	struct work_struct release_work;	/* deferred final free */
	struct task_struct *writeback_thread;
	struct notifier_block panic_block;
};
| |
/* Take an additional reference on @bc */
static void bcon_get(struct blockconsole *bc)
{
	kref_get(&bc->kref);
}
| |
/*
 * Final teardown, run from a workqueue: free the console buffer and
 * zero page, drop any cached pages of the device and give the device
 * back. Never call directly - last kref_put via bcon_release()
 * schedules this, because blkdev_put() may sleep.
 */
static void __bcon_release(struct work_struct *work)
{
	struct blockconsole *bc = container_of(work, struct blockconsole,
			release_work);

	__free_pages(bc->zero_page, 0);
	__free_pages(bc->pages, 8);
	invalidate_mapping_pages(bc->bdev->bd_inode->i_mapping, 0, -1);
	blkdev_put(bc->bdev, FMODE_READ|FMODE_WRITE);
	kfree(bc);
}
| |
/*
 * Kref release function. The real teardown sleeps, but the last
 * reference may be dropped from atomic context (bio completion), so
 * defer the work to a workqueue.
 */
static void bcon_release(struct kref *kref)
{
	struct blockconsole *bc = container_of(kref, struct blockconsole, kref);

	/* bcon_release can be called from atomic context */
	schedule_work(&bc->release_work);
}
| |
/* Drop a reference; the final put schedules __bcon_release() */
static void bcon_put(struct blockconsole *bc)
{
	kref_put(&bc->kref, bcon_release);
}
| |
/* Byte offset of the console (printk) cursor within its 512B sector */
static unsigned int __bcon_console_ofs(u64 console_bytes)
{
	return console_bytes & ~SECTOR_MASK;
}

static unsigned int bcon_console_ofs(struct blockconsole *bc)
{
	return __bcon_console_ofs(atomic64_read(&bc->console_bytes));
}

/* Sector index of the console cursor within the 1MB buffer */
static unsigned int __bcon_console_sector(u64 console_bytes)
{
	return (console_bytes >> SECTOR_SHIFT) & CACHE_SECTOR_MASK;
}

static unsigned int bcon_console_sector(struct blockconsole *bc)
{
	return __bcon_console_sector(atomic64_read(&bc->console_bytes));
}

/* Sector index of the writeback cursor within the 1MB buffer */
static unsigned int bcon_write_sector(struct blockconsole *bc)
{
	return (bc->write_bytes >> SECTOR_SHIFT) & CACHE_SECTOR_MASK;
}
| |
/* Fill a 512B sector with padding: 511 spaces followed by one newline */
static void clear_sector(void *sector)
{
	char *buf = sector;

	memset(buf, ' ', 511);
	buf[511] = '\n';
}
| |
/*
 * Write the header into the first sector of the buffer: the magic
 * string, then uuid, round and tile index as 8-digit hex numbers,
 * each terminated by a newline.
 */
static void bcon_init_first_page(struct blockconsole *bc)
{
	char *buf = page_address(bc->pages);
	size_t len = strlen(BLOCKCONSOLE_MAGIC);
	u32 tile = atomic64_read(&bc->console_bytes) >> 20; /* We overflow after 4TB - fine */

	clear_sector(buf);
	memcpy(buf, BLOCKCONSOLE_MAGIC, len);
	/* each sprintf NUL-terminates; the NUL is overwritten below */
	sprintf(buf + BCON_UUID_OFS, "%08x", bc->uuid);
	sprintf(buf + BCON_ROUND_OFS, "%08x", bc->round);
	sprintf(buf + BCON_TILE_OFS, "%08x", tile);
	/* replace NUL with newline */
	buf[BCON_UUID_OFS + 8] = 10;
	buf[BCON_ROUND_OFS + 8] = 10;
	buf[BCON_TILE_OFS + 8] = 10;
}
| |
/*
 * Advance the printk-side cursor by @bytes, lock-free via cmpxchg.
 * Wraps to 0 at the end of the device; whenever a new 1MB tile is
 * entered, the header sector is regenerated and skipped.
 * NOTE(review): on cmpxchg retry bcon_init_first_page() may run more
 * than once - it rewrites identical content, so apparently harmless.
 */
static void bcon_advance_console_bytes(struct blockconsole *bc, int bytes)
{
	u64 old, new;

	do {
		old = atomic64_read(&bc->console_bytes);
		new = old + bytes;
		if (new >= bc->max_bytes)
			new = 0;
		if ((new & CACHE_MASK) == 0) {
			bcon_init_first_page(bc);
			new += BCON_LONG_HEADERSIZE;
		}
	} while (atomic64_cmpxchg(&bc->console_bytes, old, new) != old);
}
| |
/* bi_end_io for sync_read(): just wake the waiting completion */
static void request_complete(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}
| |
/*
 * Synchronously read the 512B sector at byte offset @ofs into the
 * first page of the console buffer. Uses an on-stack bio and waits
 * for completion. Returns 0 on success, -EIO on error.
 */
static int sync_read(struct blockconsole *bc, u64 ofs)
{
	struct bio bio;
	struct bio_vec bio_vec;
	struct completion complete;

	bio_init(&bio);
	bio.bi_io_vec = &bio_vec;
	bio_vec.bv_page = bc->pages;
	bio_vec.bv_len = SECTOR_SIZE;
	bio_vec.bv_offset = 0;
	bio.bi_vcnt = 1;
	bio.bi_idx = 0;
	bio.bi_size = SECTOR_SIZE;
	bio.bi_bdev = bc->bdev;
	bio.bi_sector = ofs >> SECTOR_SHIFT;
	init_completion(&complete);
	bio.bi_private = &complete;
	bio.bi_end_io = request_complete;

	submit_bio(READ, &bio);
	wait_for_completion(&complete);
	return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
}
| |
| static void bcon_erase_segment(struct blockconsole *bc) |
| { |
| int i; |
| |
| for (i = 0; i < PAGE_COUNT; i++) { |
| struct bcon_bio *bcon_bio = bc->zero_bios + i; |
| struct bio *bio = &bcon_bio->bio; |
| |
| /* |
| * If the last erase hasn't finished yet, just skip it. The log |
| * will look messy, but that's all. |
| */ |
| rmb(); |
| if (bcon_bio->in_flight) |
| continue; |
| bio_init(bio); |
| bio->bi_io_vec = &bcon_bio->bvec; |
| bio->bi_vcnt = 1; |
| bio->bi_size = PAGE_SIZE; |
| bio->bi_bdev = bc->bdev; |
| bio->bi_private = bc; |
| bio->bi_idx = 0; |
| bio->bi_sector = (bc->write_bytes + i * PAGE_SIZE) >> 9; |
| bcon_bio->in_flight = 1; |
| wmb(); |
| /* We want the erase to go to the device first somehow */ |
| submit_bio(WRITE | REQ_SOFTBARRIER, bio); |
| } |
| } |
| |
| static void bcon_advance_write_bytes(struct blockconsole *bc, int bytes) |
| { |
| bc->write_bytes += bytes; |
| if (bc->write_bytes >= bc->max_bytes) { |
| bc->write_bytes = 0; |
| bc->round++; |
| bcon_init_first_page(bc); |
| } |
| } |
| |
| /* |
| * Check if we have an 8-digit hex number followed by newline |
| */ |
| static bool is_four_byte_hex(const void *data) |
| { |
| const char *str = data; |
| int len = 0; |
| |
| while (isxdigit(*str) && len++ < 9) |
| str++; |
| |
| if (len != 8) |
| return false; |
| |
| /* str should point to a \n now */ |
| if (*str != 0xa) |
| return false; |
| |
| return true; |
| } |
| |
/*
 * Check whether @data contains a valid blockconsole header: the magic
 * string followed by three 8-digit hex fields (uuid, round, tile).
 * Returns 0 if absent, otherwise a nonzero version number
 * (11 presumably encodes on-disk format 1.1 - matches the magic).
 */
static int bcon_magic_present(const void *data)
{
	size_t len = strlen(BLOCKCONSOLE_MAGIC);

	if (memcmp(data, BLOCKCONSOLE_MAGIC, len))
		return 0;
	if (!is_four_byte_hex(data + BCON_UUID_OFS))
		return 0;
	if (!is_four_byte_hex(data + BCON_ROUND_OFS))
		return 0;
	if (!is_four_byte_hex(data + BCON_TILE_OFS))
		return 0;
	return 11;
}
| |
/*
 * Locate the end of the current log by binary search. Sector 0 holds
 * the reference header; any 1MB tile whose header matches it has been
 * written during this round, any mismatch means writing stopped
 * earlier. Initializes uuid, round and both cursors, then erases the
 * next tile. Returns 0 or a negative errno from sync_read().
 */
static int bcon_find_end_of_log(struct blockconsole *bc)
{
	u64 start = 0, end = bc->max_bytes, middle;
	void *sec0 = bc->bio_array[0].sector;
	void *sec1 = bc->bio_array[1].sector;
	int err, version;

	err = sync_read(bc, 0);
	if (err)
		return err;
	/* Second sanity check, out of sheer paranoia */
	version = bcon_magic_present(sec0);
	if (!version)
		return -EINVAL;

	bc->uuid = simple_strtoull(sec0 + BCON_UUID_OFS, NULL, 16);
	bc->round = simple_strtoull(sec0 + BCON_ROUND_OFS, NULL, 16);

	/* keep a copy of the reference header; sync_read clobbers sec0 */
	memcpy(sec1, sec0, BCON_HEADERSIZE);
	for (;;) {
		middle = (start + end) / 2;
		middle &= ~CACHE_MASK;	/* probe only 1MB-aligned tiles */
		if (middle == start)
			break;
		err = sync_read(bc, middle);
		if (err)
			return err;
		if (memcmp(sec1, sec0, BCON_HEADERSIZE)) {
			/* If the two differ, we haven't written that far yet */
			end = middle;
		} else {
			start = middle;
		}
	}
	bc->write_bytes = end;
	atomic64_set(&bc->console_bytes, end);
	bcon_advance_console_bytes(bc, 0); /* To skip the header */
	bcon_advance_write_bytes(bc, 0); /* To wrap around, if necessary */
	bcon_erase_segment(bc);
	return 0;
}
| |
/*
 * Tear down a console after repeated write errors. Runs from a
 * workqueue because the error is detected in bio completion context
 * while unregister_console() and friends may sleep. Ordering matters:
 * stop new output first, then the timer and thread, then drop the
 * initial reference.
 */
static void bcon_unregister(struct work_struct *work)
{
	struct blockconsole *bc = container_of(work, struct blockconsole,
			unregister_work);

	atomic_notifier_chain_unregister(&panic_notifier_list, &bc->panic_block);
	unregister_console(&bc->console);
	del_timer_sync(&bc->pad_timer);
	kthread_stop(bc->writeback_thread);
	/* No new io will be scheduled anymore now */
	bcon_put(bc);
}
| |
#define BCON_MAX_ERRORS 10
/*
 * Completion handler for regular sector writes. Tracks write errors,
 * unregisters the console once too many accumulate, and re-pads the
 * sector so it can be reused by printk.
 */
static void bcon_end_io(struct bio *bio, int err)
{
	struct bcon_bio *bcon_bio = container_of(bio, struct bcon_bio, bio);
	struct blockconsole *bc = bio->bi_private;
	unsigned long flags;

	/*
	 * We want to assume the device broken and free this console if
	 * we accumulate too many errors. But if errors are transient,
	 * we also want to forget about them once writes succeed again.
	 * Oh, and we only want to reset the counter if it hasn't reached
	 * the limit yet, so we don't bcon_put() twice from here.
	 */
	spin_lock_irqsave(&bc->end_io_lock, flags);
	if (err) {
		if (bc->error_count++ == BCON_MAX_ERRORS) {
			pr_info("no longer logging to %s\n", bc->devname);
			schedule_work(&bc->unregister_work);
		}
	} else {
		if (bc->error_count && bc->error_count < BCON_MAX_ERRORS)
			bc->error_count = 0;
	}
	/*
	 * Add padding (a bunch of spaces and a newline) early so bcon_pad
	 * only has to advance a pointer.
	 */
	clear_sector(bcon_bio->sector);
	bcon_bio->in_flight = 0;
	spin_unlock_irqrestore(&bc->end_io_lock, flags);
	/* drop the reference taken in bcon_writesector() */
	bcon_put(bc);
}
| |
/*
 * Submit buffer sector @index for writing at the current writeback
 * position. Silently skipped if the sector is still in flight from an
 * earlier submission. Takes a reference that bcon_end_io() drops.
 */
static void bcon_writesector(struct blockconsole *bc, int index)
{
	struct bcon_bio *bcon_bio = bc->bio_array + index;
	struct bio *bio = &bcon_bio->bio;

	/* paired with the wmb() below and the one in bcon_end_io() */
	rmb();
	if (bcon_bio->in_flight)
		return;
	bcon_get(bc);

	bio_init(bio);
	bio->bi_io_vec = &bcon_bio->bvec;
	bio->bi_vcnt = 1;
	bio->bi_size = SECTOR_SIZE;
	bio->bi_bdev = bc->bdev;
	bio->bi_private = bc;
	bio->bi_end_io = bcon_end_io;

	bio->bi_idx = 0;
	bio->bi_sector = bc->write_bytes >> 9;
	bcon_bio->in_flight = 1;
	wmb();
	submit_bio(WRITE, bio);
}
| |
| /** |
| * bcon_writeback - the writeback thread |
| * @_bc: The struct blockconsole |
| * |
| * Will loop and writeback any full sectors, then go back to sleep. |
| */ |
| static int bcon_writeback(void *_bc) |
| { |
| struct blockconsole *bc = _bc; |
| struct sched_param(sp); |
| |
| sp.sched_priority = MAX_RT_PRIO - 1; /* Highest realtime prio */ |
| sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); |
| for (;;) { |
| set_current_state(TASK_INTERRUPTIBLE); |
| schedule(); |
| if (kthread_should_stop()) |
| break; |
| while (bcon_write_sector(bc) != bcon_console_sector(bc)) { |
| bcon_writesector(bc, bcon_write_sector(bc)); |
| bcon_advance_write_bytes(bc, SECTOR_SIZE); |
| if (bcon_write_sector(bc) == 0) |
| bcon_erase_segment(bc); |
| } |
| } |
| return 0; |
| } |
| |
/*
 * Timer callback, armed for 1s after every bcon_write(). If the
 * current sector is only partially filled, advance the cursor past
 * the pre-written padding (spaces plus newline) and flush it.
 */
static void bcon_pad(unsigned long data)
{
	struct blockconsole *bc = (void *)data;
	unsigned int n;

	/*
	 * We deliberately race against bcon_write here. If we lose the race,
	 * our padding is no longer where we expected it to be, i.e. it is
	 * no longer a bunch of spaces with a newline at the end. There could
	 * not be a newline at all or it could be somewhere in the middle.
	 * Either way, the log corruption is fairly obvious to spot and ignore
	 * for human readers.
	 */
	n = SECTOR_SIZE - bcon_console_ofs(bc);
	if (n != SECTOR_SIZE) {
		bcon_advance_console_bytes(bc, n);
		wake_up_process(bc->writeback_thread);
	}
}
| |
/*
 * Console ->write method, called from printk. Copies @msg into the
 * 1MB buffer sector by sector, then wakes the writeback thread and
 * re-arms the pad timer. If the target sector is still in flight the
 * rest of the message is dropped - blockconsole is best-effort.
 */
static void bcon_write(struct console *console, const char *msg,
		unsigned int len)
{
	struct blockconsole *bc = container_of(console, struct blockconsole,
			console);
	unsigned int n;
	u64 console_bytes;
	int i;

	while (len) {
		console_bytes = atomic64_read(&bc->console_bytes);
		i = __bcon_console_sector(console_bytes);
		/* paired with the wmb() in bcon_writesector() */
		rmb();
		if (bc->bio_array[i].in_flight)
			break;
		/* copy at most up to the end of the current sector */
		n = min(len, SECTOR_SIZE - __bcon_console_ofs(console_bytes));
		memcpy(bc->bio_array[i].sector +
				__bcon_console_ofs(console_bytes), msg, n);
		len -= n;
		msg += n;
		bcon_advance_console_bytes(bc, n);
	}
	wake_up_process(bc->writeback_thread);
	mod_timer(&bc->pad_timer, jiffies + HZ);
}
| |
| /** |
| * bcon_init_bios - initialize the struct bio array |
| */ |
| static void bcon_init_bios(struct blockconsole *bc) |
| { |
| int i; |
| |
| for (i = 0; i < SECTOR_COUNT; i++) { |
| int page_index = i >> (PAGE_SHIFT - SECTOR_SHIFT); |
| struct page *page = bc->pages + page_index; |
| struct bcon_bio *bcon_bio = bc->bio_array + i; |
| struct bio_vec *bvec = &bcon_bio->bvec; |
| |
| bcon_bio->in_flight = 0; |
| bcon_bio->sector = page_address(bc->pages + page_index) |
| + SECTOR_SIZE * (i & PG_SECTOR_MASK); |
| clear_sector(bcon_bio->sector); |
| bvec->bv_page = page; |
| bvec->bv_len = SECTOR_SIZE; |
| bvec->bv_offset = SECTOR_SIZE * (i & PG_SECTOR_MASK); |
| } |
| } |
| |
| static void bcon_init_zero_bio(struct blockconsole *bc) |
| { |
| int i; |
| |
| memset(page_address(bc->zero_page), 0, PAGE_SIZE); |
| for (i = 0; i < PAGE_COUNT; i++) { |
| struct bcon_bio *bcon_bio = bc->zero_bios + i; |
| struct bio_vec *bvec = &bcon_bio->bvec; |
| |
| bcon_bio->in_flight = 0; |
| bvec->bv_page = bc->zero_page; |
| bvec->bv_len = PAGE_SIZE; |
| bvec->bv_offset = 0; |
| } |
| } |
| |
| /** |
| * blockconsole_panic - panic notifier |
| * |
| * Tries to write back any crash information. This fails fairly |
| * regularly. As always, blockconsole is best-effort. |
| */ |
| static int blockconsole_panic(struct notifier_block *this, unsigned long event, |
| void *ptr) |
| { |
| struct blockconsole *bc = container_of(this, struct blockconsole, |
| panic_block); |
| unsigned int n; |
| |
| n = SECTOR_SIZE - bcon_console_ofs(bc); |
| if (n != SECTOR_SIZE) |
| bcon_advance_console_bytes(bc, n); |
| bcon_writeback(bc); |
| return NOTIFY_DONE; |
| } |
| |
| static int bcon_create(dev_t devt) |
| { |
| const fmode_t mode = FMODE_READ | FMODE_WRITE; |
| struct blockconsole *bc; |
| int err; |
| |
| bc = kzalloc(sizeof(*bc), GFP_KERNEL); |
| if (!bc) |
| return -ENOMEM; |
| spin_lock_init(&bc->end_io_lock); |
| strcpy(bc->console.name, "bcon"); |
| bc->console.flags = CON_PRINTBUFFER | CON_ENABLED | CON_ALLDATA; |
| bc->console.write = bcon_write; |
| |
| bc->bdev = blkdev_get_by_dev(devt, mode, NULL); |
| if (IS_ERR(bc->bdev)) |
| goto out; |
| |
| memset(bc->devname, ' ', sizeof(bc->devname)); |
| strlcpy(bc->devname, dev_name(part_to_dev(bc->bdev->bd_part)), |
| sizeof(bc->devname)); |
| bc->pages = alloc_pages(GFP_KERNEL, 8); |
| if (!bc->pages) |
| goto out; |
| bc->zero_page = alloc_pages(GFP_KERNEL, 0); |
| if (!bc->zero_page) |
| goto out1; |
| bcon_init_bios(bc); |
| bcon_init_zero_bio(bc); |
| setup_timer(&bc->pad_timer, bcon_pad, (unsigned long)bc); |
| bc->max_bytes = bc->bdev->bd_inode->i_size & ~CACHE_MASK; |
| err = bcon_find_end_of_log(bc); |
| if (err) |
| goto out2; |
| kref_init(&bc->kref); /* This reference gets freed on errors */ |
| bc->writeback_thread = kthread_run(bcon_writeback, bc, "bcon_%s", |
| bc->devname); |
| if (IS_ERR(bc->writeback_thread)) |
| goto out2; |
| INIT_WORK(&bc->unregister_work, bcon_unregister); |
| INIT_WORK(&bc->release_work, __bcon_release); |
| register_console(&bc->console); |
| bc->panic_block.notifier_call = blockconsole_panic; |
| bc->panic_block.priority = INT_MAX; |
| atomic_notifier_chain_register(&panic_notifier_list, &bc->panic_block); |
| pr_info("now logging to %s at %llx\n", bc->devname, |
| atomic64_read(&bc->console_bytes) >> 20); |
| return 0; |
| |
| out2: |
| __free_pages(bc->zero_page, 0); |
| out1: |
| __free_pages(bc->pages, 8); |
| out: |
| kfree(bc); |
| /* Not strictly correct, be the caller doesn't care */ |
| return -ENOMEM; |
| } |
| |
/* Deferred probe request: carries the devt from bcon_add() to bcon_do_add() */
struct bcon_candidate {
	struct work_struct work;
	dev_t devt;
};
| |
| /* |
| * Calling bcon_create directly would cause a deadlock. __blkdev_get will |
| * take bdev->bd_mutex, which is already held by the partitioning code. |
| * Hence go through the indirection of a work queue. |
| */ |
/* Work handler: try to attach a console to the device, then free the request */
static void bcon_do_add(struct work_struct *work)
{
	struct bcon_candidate *cand = container_of(work, struct bcon_candidate,
			work);

	bcon_create(cand->devt);
	kfree(cand);
}
| |
| void bcon_add(dev_t devt) |
| { |
| struct bcon_candidate *cand; |
| |
| cand = kmalloc(sizeof(cand), GFP_KERNEL); |
| if (!cand) |
| return; |
| cand->devt = devt; |
| INIT_WORK(&cand->work, bcon_do_add); |
| schedule_work(&cand->work); |
| } |