| /* |
| * Blockconsole - write kernel console to a block device |
| * |
| * Copyright (C) 2012 Joern Engel <joern@logfs.org> |
| * |
| * For usage and disk format, please see |
| * Documentation/block/blockconsole.txt |
| * |
| * Blockconsole allocates a 1MB buffer at init time. printk() calls |
| * bcon_write(), which simply writes to the buffer and wakes up a |
| * writeback thread. bcon_writeback() will then write the console out |
| * to the blockdevice, in chunks of sector_size. |
| * |
| * All allocations for blockconsole happen at init time. All means |
| * 1MB worth of buffer, a struct bio for every 512B sector, plus |
| * another page and struct bio to zero the current 1MB of the device |
| * before writing to it. |
| * |
 * The block layer and device drivers may require further memory
 * allocations, make sleeping calls and require all sorts of
 * infrastructure. In such cases, writeback may be delayed or fail
| * altogether. But since writeback is decoupled from printk(), the |
| * worst consequence should be loss of debug information - which would |
| * be lost without blockconsole as well. |
| * |
| * On top of the writeback thread, bcon_write() also schedules a 1s |
| * timer. When the timer expires, the current partial sector will be |
| * padded and written out as well. Blockconsole does no overwrites, |
| * so a spurious line of up to 510 spaces and a newline is the result. |
| * |
| * There is a panic handler that tries to push out the last dying |
| * breath. Sometimes that works, sometimes it causes a secondary |
| * oops. I have never seen it do harm and when it does work it |
| * provides useful crash information. |
| * |
| * In cases where more printk data comes in the front door than the |
| * backing device can handle, there will be data loss. Again, |
 * blockconsole is best-effort. Given the 1MB buffer, it takes an
 * extremely slow device or a deliberate attempt to overflow the
 * buffer, so this is almost never a problem in practice.
| * |
| * In case the device is already filled with data (particularly after |
| * blockconsole wraps around), each 1MB tile is written with zeroes |
 * once before the regular sector writes happen. No one should
| * interpret stale data from before the wrap-around as current - or |
| * have to guess where the current data ends and the stale data |
| * begins. |
| * |
| * Detection of console devices currently works by having the partition |
| * scanning code call bcon_add() once for every partition and once for |
| * every device. If we find a valid header, the device is automatically |
| * used. |
| * |
| * Removal of console devices is also automatic - sooner or later a |
| * removed device will cause write errors and any device that |
| * consistently returns write errors will get removed. Main goal here |
| * was to be resilient against flaky hardware, using the same code to |
| * handle device removal is a bonus and ensures test coverage. |
| */ |
| #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| |
| #include <linux/bio.h> |
| #include <linux/blockconsole.h> |
| #include <linux/console.h> |
| #include <linux/fs.h> |
| #include <linux/kref.h> |
| #include <linux/kthread.h> |
| #include <linux/mm.h> |
| #include <linux/mount.h> |
| #include <linux/random.h> |
| #include <linux/slab.h> |
| #include <linux/string.h> |
| #include <linux/workqueue.h> |
| #include <linux/sched.h> |
| #include <linux/ctype.h> |
| #include <linux/device.h> |
| #include <linux/genhd.h> |
| |
/* On-disk magic string; a valid blockconsole device starts with this */
#define BLOCKCONSOLE_MAGIC "\nLinux blockconsole version 1.1\n"
/* Byte offsets of the 8-digit hex header fields within sector 0 */
#define BCON_UUID_OFS (32)
#define BCON_ROUND_OFS (41)
#define BCON_TILE_OFS (50)
#define BCON_HEADERSIZE (50)
#define BCON_LONG_HEADERSIZE (59) /* with tile index */

/* Geometry of the in-memory buffer: 256 pages = 1MB, split into 512B sectors */
#define PAGE_COUNT (256)
#define SECTOR_COUNT (PAGE_COUNT * (PAGE_SIZE >> 9))
#define CACHE_PAGE_MASK (PAGE_COUNT - 1)
#define CACHE_SECTOR_MASK (SECTOR_COUNT - 1)
#define CACHE_SIZE (PAGE_COUNT << PAGE_SHIFT)
#define CACHE_MASK (CACHE_SIZE - 1)
#define SECTOR_SHIFT (9)
#define SECTOR_SIZE (1u << SECTOR_SHIFT)
#define SECTOR_MASK (~(SECTOR_SIZE-1))
#define PG_SECTOR_MASK ((PAGE_SIZE >> 9) - 1)
| |
/* One 512B sector of the console buffer plus the bio used to write it out */
struct bcon_bio {
	struct bio bio;
	struct bio_vec bvec;
	void *sector;	/* points into the 1MB buffer (bc->pages) */
	int in_flight;	/* nonzero while the bio is submitted, see wmb/rmb users */
};
| |
/*
 * Per-device console state; one instance per block device carrying a
 * valid blockconsole header.
 */
struct blockconsole {
	char devname[32];		/* space-padded device name for messages */
	atomic64_t console_bytes;	/* printk-side cursor into the device */
	spinlock_t end_io_lock;		/* serializes error accounting in end_io */
	struct timer_list pad_timer;	/* pads out a partial sector after 1s */
	int error_count;		/* write errors, reset on success */
	struct kref kref;
	u64 write_bytes;		/* writeback-side cursor into the device */
	u64 max_bytes;			/* device size, rounded down to 1MB */
	u32 round;			/* how often we wrapped around the device */
	u32 uuid;
	struct bcon_bio bio_array[SECTOR_COUNT];	/* the 1MB buffer */
	struct page *pages;		/* order-8 allocation backing bio_array */
	struct bcon_bio zero_bios[PAGE_COUNT];	/* erase the next 1MB tile */
	struct page *zero_page;		/* shared all-zeroes source for zero_bios */
	struct block_device *bdev;
	struct console console;
	struct work_struct unregister_work;	/* teardown after write errors */
	struct work_struct release_work;	/* deferred final free */
	struct task_struct *writeback_thread;
	struct notifier_block panic_block;
};
| |
/* Take an additional reference on @bc */
static void bcon_get(struct blockconsole *bc)
{
	kref_get(&bc->kref);
}
| |
/*
 * Final teardown, run from a workqueue: free the console buffer and
 * zero page, drop any cached pages of the device and give the device
 * back. Never call directly - last kref_put via bcon_release()
 * schedules this, because blkdev_put() may sleep.
 */
static void __bcon_release(struct work_struct *work)
{
	struct blockconsole *bc = container_of(work, struct blockconsole,
			release_work);

	__free_pages(bc->zero_page, 0);
	__free_pages(bc->pages, 8);
	invalidate_mapping_pages(bc->bdev->bd_inode->i_mapping, 0, -1);
	blkdev_put(bc->bdev, FMODE_READ|FMODE_WRITE);
	kfree(bc);
}
| |
/*
 * Kref release function. The real teardown sleeps, but the last
 * reference may be dropped from atomic context (bio completion), so
 * defer the work to a workqueue.
 */
static void bcon_release(struct kref *kref)
{
	struct blockconsole *bc = container_of(kref, struct blockconsole, kref);

	/* bcon_release can be called from atomic context */
	schedule_work(&bc->release_work);
}
| |
/* Drop a reference; the final put schedules __bcon_release() */
static void bcon_put(struct blockconsole *bc)
{
	kref_put(&bc->kref, bcon_release);
}
| |
/* Byte offset of the console (printk) cursor within its 512B sector */
static unsigned int __bcon_console_ofs(u64 console_bytes)
{
	return console_bytes & ~SECTOR_MASK;
}

static unsigned int bcon_console_ofs(struct blockconsole *bc)
{
	return __bcon_console_ofs(atomic64_read(&bc->console_bytes));
}

/* Sector index of the console cursor within the 1MB buffer */
static unsigned int __bcon_console_sector(u64 console_bytes)
{
	return (console_bytes >> SECTOR_SHIFT) & CACHE_SECTOR_MASK;
}

static unsigned int bcon_console_sector(struct blockconsole *bc)
{
	return __bcon_console_sector(atomic64_read(&bc->console_bytes));
}

/* Sector index of the writeback cursor within the 1MB buffer */
static unsigned int bcon_write_sector(struct blockconsole *bc)
{
	return (bc->write_bytes >> SECTOR_SHIFT) & CACHE_SECTOR_MASK;
}
| |
/* Fill a 512B sector with padding: 511 spaces followed by one newline */
static void clear_sector(void *sector)
{
	char *buf = sector;

	memset(buf, ' ', 511);
	buf[511] = '\n';
}
| |
/*
 * Write the header into the first sector of the buffer: the magic
 * string, then uuid, round and tile index as 8-digit hex numbers,
 * each terminated by a newline.
 */
static void bcon_init_first_page(struct blockconsole *bc)
{
	char *buf = page_address(bc->pages);
	size_t len = strlen(BLOCKCONSOLE_MAGIC);
	u32 tile = atomic64_read(&bc->console_bytes) >> 20; /* We overflow after 4TB - fine */

	clear_sector(buf);
	memcpy(buf, BLOCKCONSOLE_MAGIC, len);
	/* each sprintf NUL-terminates; the NUL is overwritten below */
	sprintf(buf + BCON_UUID_OFS, "%08x", bc->uuid);
	sprintf(buf + BCON_ROUND_OFS, "%08x", bc->round);
	sprintf(buf + BCON_TILE_OFS, "%08x", tile);
	/* replace NUL with newline */
	buf[BCON_UUID_OFS + 8] = 10;
	buf[BCON_ROUND_OFS + 8] = 10;
	buf[BCON_TILE_OFS + 8] = 10;
}
| |
/*
 * Advance the printk-side cursor by @bytes, lock-free via cmpxchg.
 * Wraps to 0 at the end of the device; whenever a new 1MB tile is
 * entered, the header sector is regenerated and skipped.
 * NOTE(review): on cmpxchg retry bcon_init_first_page() may run more
 * than once - it rewrites identical content, so apparently harmless.
 */
static void bcon_advance_console_bytes(struct blockconsole *bc, int bytes)
{
	u64 old, new;

	do {
		old = atomic64_read(&bc->console_bytes);
		new = old + bytes;
		if (new >= bc->max_bytes)
			new = 0;
		if ((new & CACHE_MASK) == 0) {
			bcon_init_first_page(bc);
			new += BCON_LONG_HEADERSIZE;
		}
	} while (atomic64_cmpxchg(&bc->console_bytes, old, new) != old);
}
| |
/* bi_end_io for sync_read(): just wake the waiting completion */
static void request_complete(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}
| |
/*
 * Synchronously read the 512B sector at byte offset @ofs into the
 * first page of the console buffer. Uses an on-stack bio and waits
 * for completion. Returns 0 on success, -EIO on error.
 */
static int sync_read(struct blockconsole *bc, u64 ofs)
{
	struct bio bio;
	struct bio_vec bio_vec;
	struct completion complete;

	bio_init(&bio);
	bio.bi_io_vec = &bio_vec;
	bio_vec.bv_page = bc->pages;
	bio_vec.bv_len = SECTOR_SIZE;
	bio_vec.bv_offset = 0;
	bio.bi_vcnt = 1;
	bio.bi_idx = 0;
	bio.bi_size = SECTOR_SIZE;
	bio.bi_bdev = bc->bdev;
	bio.bi_sector = ofs >> SECTOR_SHIFT;
	init_completion(&complete);
	bio.bi_private = &complete;
	bio.bi_end_io = request_complete;

	submit_bio(READ, &bio);
	wait_for_completion(&complete);
	return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
}
| |
| static void bcon_erase_segment(struct blockconsole *bc) |
| { |
| int i; |
| |
| for (i = 0; i < PAGE_COUNT; i++) { |
| struct bcon_bio *bcon_bio = bc->zero_bios + i; |
| struct bio *bio = &bcon_bio->bio; |
| |
| /* |
| * If the last erase hasn't finished yet, just skip it. The log |
| * will look messy, but that's all. |
| */ |
| rmb(); |
| if (bcon_bio->in_flight) |
| continue; |
| bio_init(bio); |
| bio->bi_io_vec = &bcon_bio->bvec; |
| bio->bi_vcnt = 1; |
| bio->bi_size = PAGE_SIZE; |
| bio->bi_bdev = bc->bdev; |
| bio->bi_private = bc; |
| bio->bi_idx = 0; |
| bio->bi_sector = (bc->write_bytes + i * PAGE_SIZE) >> 9; |
| bcon_bio->in_flight = 1; |
| wmb(); |
| /* We want the erase to go to the device first somehow */ |
| submit_bio(WRITE | REQ_SOFTBARRIER, bio); |
| } |
| } |
| |
| static void bcon_advance_write_bytes(struct blockconsole *bc, int bytes) |
| { |
| bc->write_bytes += bytes; |
| if (bc->write_bytes >= bc->max_bytes) { |
| bc->write_bytes = 0; |
| bc->round++; |
| bcon_init_first_page(bc); |
| } |
| } |
| |
| /* |
| * Check if we have an 8-digit hex number followed by newline |
| */ |
| static bool is_four_byte_hex(const void *data) |
| { |
| const char *str = data; |
| int len = 0; |
| |
| while (isxdigit(*str) && len++ < 9) |
| str++; |
| |
| if (len != 8) |
| return false; |
| |
| /* str should point to a \n now */ |
| if (*str != 0xa) |
| return false; |
| |
| return true; |
| } |
| |
/*
 * Check whether @data contains a valid blockconsole header: the magic
 * string followed by three 8-digit hex fields (uuid, round, tile).
 * Returns 0 if absent, otherwise a nonzero version number
 * (11 presumably encodes on-disk format 1.1 - matches the magic).
 */
static int bcon_magic_present(const void *data)
{
	size_t len = strlen(BLOCKCONSOLE_MAGIC);

	if (memcmp(data, BLOCKCONSOLE_MAGIC, len))
		return 0;
	if (!is_four_byte_hex(data + BCON_UUID_OFS))
		return 0;
	if (!is_four_byte_hex(data + BCON_ROUND_OFS))
		return 0;
	if (!is_four_byte_hex(data + BCON_TILE_OFS))
		return 0;
	return 11;
}
| |
/*
 * Locate the end of the current log by binary search. Sector 0 holds
 * the reference header; any 1MB tile whose header matches it has been
 * written during this round, any mismatch means writing stopped
 * earlier. Initializes uuid, round and both cursors, then erases the
 * next tile. Returns 0 or a negative errno from sync_read().
 */
static int bcon_find_end_of_log(struct blockconsole *bc)
{
	u64 start = 0, end = bc->max_bytes, middle;
	void *sec0 = bc->bio_array[0].sector;
	void *sec1 = bc->bio_array[1].sector;
	int err, version;

	err = sync_read(bc, 0);
	if (err)
		return err;
	/* Second sanity check, out of sheer paranoia */
	version = bcon_magic_present(sec0);
	if (!version)
		return -EINVAL;

	bc->uuid = simple_strtoull(sec0 + BCON_UUID_OFS, NULL, 16);
	bc->round = simple_strtoull(sec0 + BCON_ROUND_OFS, NULL, 16);

	/* keep a copy of the reference header; sync_read clobbers sec0 */
	memcpy(sec1, sec0, BCON_HEADERSIZE);
	for (;;) {
		middle = (start + end) / 2;
		middle &= ~CACHE_MASK;	/* probe only 1MB-aligned tiles */
		if (middle == start)
			break;
		err = sync_read(bc, middle);
		if (err)
			return err;
		if (memcmp(sec1, sec0, BCON_HEADERSIZE)) {
			/* If the two differ, we haven't written that far yet */
			end = middle;
		} else {
			start = middle;
		}
	}
	bc->write_bytes = end;
	atomic64_set(&bc->console_bytes, end);
	bcon_advance_console_bytes(bc, 0); /* To skip the header */
	bcon_advance_write_bytes(bc, 0); /* To wrap around, if necessary */
	bcon_erase_segment(bc);
	return 0;
}
| |
/*
 * Tear down a console after repeated write errors. Runs from a
 * workqueue because the error is detected in bio completion context
 * while unregister_console() and friends may sleep. Ordering matters:
 * stop new output first, then the timer and thread, then drop the
 * initial reference.
 */
static void bcon_unregister(struct work_struct *work)
{
	struct blockconsole *bc = container_of(work, struct blockconsole,
			unregister_work);

	atomic_notifier_chain_unregister(&panic_notifier_list, &bc->panic_block);
	unregister_console(&bc->console);
	del_timer_sync(&bc->pad_timer);
	kthread_stop(bc->writeback_thread);
	/* No new io will be scheduled anymore now */
	bcon_put(bc);
}
| |
#define BCON_MAX_ERRORS 10
/*
 * Completion handler for regular sector writes. Tracks write errors,
 * unregisters the console once too many accumulate, and re-pads the
 * sector so it can be reused by printk.
 */
static void bcon_end_io(struct bio *bio, int err)
{
	struct bcon_bio *bcon_bio = container_of(bio, struct bcon_bio, bio);
	struct blockconsole *bc = bio->bi_private;
	unsigned long flags;

	/*
	 * We want to assume the device broken and free this console if
	 * we accumulate too many errors. But if errors are transient,
	 * we also want to forget about them once writes succeed again.
	 * Oh, and we only want to reset the counter if it hasn't reached
	 * the limit yet, so we don't bcon_put() twice from here.
	 */
	spin_lock_irqsave(&bc->end_io_lock, flags);
	if (err) {
		if (bc->error_count++ == BCON_MAX_ERRORS) {
			pr_info("no longer logging to %s\n", bc->devname);
			schedule_work(&bc->unregister_work);
		}
	} else {
		if (bc->error_count && bc->error_count < BCON_MAX_ERRORS)
			bc->error_count = 0;
	}
	/*
	 * Add padding (a bunch of spaces and a newline) early so bcon_pad
	 * only has to advance a pointer.
	 */
	clear_sector(bcon_bio->sector);
	bcon_bio->in_flight = 0;
	spin_unlock_irqrestore(&bc->end_io_lock, flags);
	/* drop the reference taken in bcon_writesector() */
	bcon_put(bc);
}
| |
/*
 * Submit buffer sector @index for writing at the current writeback
 * position. Silently skipped if the sector is still in flight from an
 * earlier submission. Takes a reference that bcon_end_io() drops.
 */
static void bcon_writesector(struct blockconsole *bc, int index)
{
	struct bcon_bio *bcon_bio = bc->bio_array + index;
	struct bio *bio = &bcon_bio->bio;

	/* paired with the wmb() below and the one in bcon_end_io() */
	rmb();
	if (bcon_bio->in_flight)
		return;
	bcon_get(bc);

	bio_init(bio);
	bio->bi_io_vec = &bcon_bio->bvec;
	bio->bi_vcnt = 1;
	bio->bi_size = SECTOR_SIZE;
	bio->bi_bdev = bc->bdev;
	bio->bi_private = bc;
	bio->bi_end_io = bcon_end_io;

	bio->bi_idx = 0;
	bio->bi_sector = bc->write_bytes >> 9;
	bcon_bio->in_flight = 1;
	wmb();
	submit_bio(WRITE, bio);
}
| |
| /** |
| * bcon_writeback - the writeback thread |
| * @_bc: The struct blockconsole |
| * |
| * Will loop and writeback any full sectors, then go back to sleep. |
| */ |
| static int bcon_writeback(void *_bc) |
| { |
| struct blockconsole *bc = _bc; |
| struct sched_param(sp); |
| |
| sp.sched_priority = MAX_RT_PRIO - 1; /* Highest realtime prio */ |
| sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); |
| for (;;) { |
| set_current_state(TASK_INTERRUPTIBLE); |
| schedule(); |
| if (kthread_should_stop()) |
| break; |
| while (bcon_write_sector(bc) != bcon_console_sector(bc)) { |
| bcon_writesector(bc, bcon_write_sector(bc)); |
| bcon_advance_write_bytes(bc, SECTOR_SIZE); |
| if (bcon_write_sector(bc) == 0) |
| bcon_erase_segment(bc); |
| } |
| } |
| return 0; |
| } |
| |
/*
 * Timer callback, armed for 1s after every bcon_write(). If the
 * current sector is only partially filled, advance the cursor past
 * the pre-written padding (spaces plus newline) and flush it.
 */
static void bcon_pad(unsigned long data)
{
	struct blockconsole *bc = (void *)data;
	unsigned int n;

	/*
	 * We deliberately race against bcon_write here. If we lose the race,
	 * our padding is no longer where we expected it to be, i.e. it is
	 * no longer a bunch of spaces with a newline at the end. There could
	 * not be a newline at all or it could be somewhere in the middle.
	 * Either way, the log corruption is fairly obvious to spot and ignore
	 * for human readers.
	 */
	n = SECTOR_SIZE - bcon_console_ofs(bc);
	if (n != SECTOR_SIZE) {
		bcon_advance_console_bytes(bc, n);
		wake_up_process(bc->writeback_thread);
	}
}
| |
/*
 * Console ->write method, called from printk. Copies @msg into the
 * 1MB buffer sector by sector, then wakes the writeback thread and
 * re-arms the pad timer. If the target sector is still in flight the
 * rest of the message is dropped - blockconsole is best-effort.
 */
static void bcon_write(struct console *console, const char *msg,
		unsigned int len)
{
	struct blockconsole *bc = container_of(console, struct blockconsole,
			console);
	unsigned int n;
	u64 console_bytes;
	int i;

	while (len) {
		console_bytes = atomic64_read(&bc->console_bytes);
		i = __bcon_console_sector(console_bytes);
		/* paired with the wmb() in bcon_writesector() */
		rmb();
		if (bc->bio_array[i].in_flight)
			break;
		/* copy at most up to the end of the current sector */
		n = min(len, SECTOR_SIZE - __bcon_console_ofs(console_bytes));
		memcpy(bc->bio_array[i].sector +
				__bcon_console_ofs(console_bytes), msg, n);
		len -= n;
		msg += n;
		bcon_advance_console_bytes(bc, n);
	}
	wake_up_process(bc->writeback_thread);
	mod_timer(&bc->pad_timer, jiffies + HZ);
}
| |
| /** |
| * bcon_init_bios - initialize the struct bio array |
| */ |
| static void bcon_init_bios(struct blockconsole *bc) |
| { |
| int i; |
| |
| for (i = 0; i < SECTOR_COUNT; i++) { |
| int page_index = i >> (PAGE_SHIFT - SECTOR_SHIFT); |
| struct page *page = bc->pages + page_index; |
| struct bcon_bio *bcon_bio = bc->bio_array + i; |
| struct bio_vec *bvec = &bcon_bio->bvec; |
| |
| bcon_bio->in_flight = 0; |
| bcon_bio->sector = page_address(bc->pages + page_index) |
| + SECTOR_SIZE * (i & PG_SECTOR_MASK); |
| clear_sector(bcon_bio->sector); |
| bvec->bv_page = page; |
| bvec->bv_len = SECTOR_SIZE; |
| bvec->bv_offset = SECTOR_SIZE * (i & PG_SECTOR_MASK); |
| } |
| } |
| |
| static void bcon_init_zero_bio(struct blockconsole *bc) |
| { |
| int i; |
| |
| memset(page_address(bc->zero_page), 0, PAGE_SIZE); |
| for (i = 0; i < PAGE_COUNT; i++) { |
| struct bcon_bio *bcon_bio = bc->zero_bios + i; |
| struct bio_vec *bvec = &bcon_bio->bvec; |
| |
| bcon_bio->in_flight = 0; |
| bvec->bv_page = bc->zero_page; |
| bvec->bv_len = PAGE_SIZE; |
| bvec->bv_offset = 0; |
| } |
| } |
| |
| /** |
| * blockconsole_panic - panic notifier |
| * |
| * Tries to write back any crash information. This fails fairly |
| * regularly. As always, blockconsole is best-effort. |
| */ |
| static int blockconsole_panic(struct notifier_block *this, unsigned long event, |
| void *ptr) |
| { |
| struct blockconsole *bc = container_of(this, struct blockconsole, |
| panic_block); |
| unsigned int n; |
| |
| n = SECTOR_SIZE - bcon_console_ofs(bc); |
| if (n != SECTOR_SIZE) |
| bcon_advance_console_bytes(bc, n); |
| bcon_writeback(bc); |
| return NOTIFY_DONE; |
| } |
| |
| static int bcon_create(dev_t devt) |
| { |
| const fmode_t mode = FMODE_READ | FMODE_WRITE; |
| struct blockconsole *bc; |
| int err; |
| |
| bc = kzalloc(sizeof(*bc), GFP_KERNEL); |
| if (!bc) |
| return -ENOMEM; |
| spin_lock_init(&bc->end_io_lock); |
| strcpy(bc->console.name, "bcon"); |
| bc->console.flags = CON_PRINTBUFFER | CON_ENABLED | CON_ALLDATA; |
| bc->console.write = bcon_write; |
| |
| bc->bdev = blkdev_get_by_dev(devt, mode, NULL); |
| if (IS_ERR(bc->bdev)) |
| goto out; |
| |
| memset(bc->devname, ' ', sizeof(bc->devname)); |
| strlcpy(bc->devname, dev_name(part_to_dev(bc->bdev->bd_part)), |
| sizeof(bc->devname)); |
| bc->pages = alloc_pages(GFP_KERNEL, 8); |
| if (!bc->pages) |
| goto out; |
| bc->zero_page = alloc_pages(GFP_KERNEL, 0); |
| if (!bc->zero_page) |
| goto out1; |
| bcon_init_bios(bc); |
| bcon_init_zero_bio(bc); |
| setup_timer(&bc->pad_timer, bcon_pad, (unsigned long)bc); |
| bc->max_bytes = bc->bdev->bd_inode->i_size & ~CACHE_MASK; |
| err = bcon_find_end_of_log(bc); |
| if (err) |
| goto out2; |
| kref_init(&bc->kref); /* This reference gets freed on errors */ |
| bc->writeback_thread = kthread_run(bcon_writeback, bc, "bcon_%s", |
| bc->devname); |
| if (IS_ERR(bc->writeback_thread)) |
| goto out2; |
| INIT_WORK(&bc->unregister_work, bcon_unregister); |
| INIT_WORK(&bc->release_work, __bcon_release); |
| register_console(&bc->console); |
| bc->panic_block.notifier_call = blockconsole_panic; |
| bc->panic_block.priority = INT_MAX; |
| atomic_notifier_chain_register(&panic_notifier_list, &bc->panic_block); |
| pr_info("now logging to %s at %llx\n", bc->devname, |
| atomic64_read(&bc->console_bytes) >> 20); |
| return 0; |
| |
| out2: |
| __free_pages(bc->zero_page, 0); |
| out1: |
| __free_pages(bc->pages, 8); |
| out: |
| kfree(bc); |
| /* Not strictly correct, be the caller doesn't care */ |
| return -ENOMEM; |
| } |
| |
/* Deferred probe request: carries the devt from bcon_add() to bcon_do_add() */
struct bcon_candidate {
	struct work_struct work;
	dev_t devt;
};
| |
| /* |
| * Calling bcon_create directly would cause a deadlock. __blkdev_get will |
| * take bdev->bd_mutex, which is already held by the partitioning code. |
| * Hence go through the indirection of a work queue. |
| */ |
/* Work handler: try to attach a console to the device, then free the request */
static void bcon_do_add(struct work_struct *work)
{
	struct bcon_candidate *cand = container_of(work, struct bcon_candidate,
			work);

	bcon_create(cand->devt);
	kfree(cand);
}
| |
| void bcon_add(dev_t devt) |
| { |
| struct bcon_candidate *cand; |
| |
| cand = kmalloc(sizeof(cand), GFP_KERNEL); |
| if (!cand) |
| return; |
| cand->devt = devt; |
| INIT_WORK(&cand->work, bcon_do_add); |
| schedule_work(&cand->work); |
| } |