efi: capsule pstore backend

The EFI capsule mechanism allows data blobs to be passed to the EFI
firmware. If we set the EFI_CAPSULE_POPULATE_SYSTEM_TABLE and
EFI_CAPSULE_PERSIST_ACROSS_RESET flags, the firmware will place a
pointer to our data blob in the EFI System Table on the next boot.

We can get access to the array of EFI capsules when parsing the
configuration tables. Store a pointer to the table.

We can utilise this facility to save crash dumps, call traces, etc.
and pick them up after reboot.

Cc: Andi Kleen <ak@linux.intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index f7fd282..fe6f335 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -45,4 +45,11 @@
 
 	  If unsure, say N.
 
+config EFI_CAPSULE_PSTORE
+	bool "EFI capsule pstore backend"
+	depends on EFI_CAPSULE && PSTORE
+	help
+	  The EFI capsule mechanism can be used to store crash dumps and
+	  function tracing data.
+
 endmenu
diff --git a/drivers/firmware/efi/capsule.c b/drivers/firmware/efi/capsule.c
index c9c5624..1c79983 100644
--- a/drivers/firmware/efi/capsule.c
+++ b/drivers/firmware/efi/capsule.c
@@ -40,6 +40,9 @@
 
 static unsigned long capsule_total_size;
 
+static int efi_update_capsule(efi_capsule_header_t *capsule,
+			      struct page **pages, size_t size);
+
 /**
  * efi_capsule_pending - has a capsule been passed to the firmware?
  * @reset_type: store the type of EFI reset if capsule is pending
@@ -59,6 +62,563 @@
 	return true;
 }
 
+#ifdef CONFIG_EFI_CAPSULE_PSTORE
+struct efi_capsule_ctx {	/* one capsule built by efi_capsule_build() */
+	struct page **pages;	/* pages backing the capsule */
+	unsigned int nr_pages;	/* number of valid entries in @pages */
+	efi_capsule_header_t *capsule;	/* vmap() of @pages; the header comes first */
+	size_t capsule_size;	/* header + data, in bytes */
+	void *data;	/* payload, directly after the header */
+	size_t data_size;	/* payload size in bytes */
+};
+
+struct efi_capsule_pstore_buf {	/* write cursor over one capsule's payload */
+	void *buf;	/* start of the writable area */
+	size_t size;	/* size of @buf in bytes */
+	atomic_long_t offset;	/* byte offset of the next write into @buf */
+};
+
+struct efi_capsule_pstore {	/* backend state, reachable through pstore_info.data */
+	/* Previous records */
+	efi_capsule_header_t **hdrs;	/* capsules returned by efi_capsule_lookup() */
+	uint32_t hdrs_num;	/* capsules still to be read; consumed from the end */
+	off_t hdr_offset;	/* Offset into current header */
+
+	/* New records */
+	struct efi_capsule_pstore_buf console;	/* PSTORE_TYPE_CONSOLE ring-buffer */
+	struct efi_capsule_pstore_buf ftrace;	/* PSTORE_TYPE_FTRACE ring-buffer */
+	struct efi_capsule_pstore_buf dmesg;	/* records appended by ->write() */
+};
+
+struct efi_capsule_pstore_record {	/* layout of one record inside a capsule */
+	u64 timestamp;	/* get_seconds() at write time */
+	u64 id;	/* record id handed back to pstore on read */
+	enum pstore_type_id type;	/* pstore record type */
+	size_t size;	/* bytes in @data; zero marks the end of the records */
+	char data[];	/* record payload */
+} __packed;
+
+static struct pstore_info efi_capsule_info;
+/* Upper bound enforced by efi_capsule_build(). NOTE(review): it is passed to */
+/* query_capsule_caps() outside this #ifdef - confirm the =n config builds. */
+static u64 efi_capsule_max_size;
+
+/* Information about capsules we pulled from the EFI System Table. */
+static efi_capsule_header_t **prev_capsules;
+static u32 efi_capsule_num;
+
+/**
+ * efi_capsule_build - alloc data buffer and fill out the header
+ * @guid: vendor's guid
+ * @data_size: size in bytes of the capsule data
+ *
+ * Allocate zeroed pages with room for an EFI capsule header plus
+ * @data_size bytes of user data, map them and fill out the header.
+ *
+ * Returns a pointer to the new capsule context on success, an
+ * ERR_PTR() value on error.
+ */
+static struct efi_capsule_ctx *
+efi_capsule_build(efi_guid_t guid, size_t data_size)
+{
+	struct efi_capsule_ctx *ctx;
+	size_t capsule_size, needed_pages;
+
+	capsule_size = data_size + sizeof(efi_capsule_header_t);
+	if (capsule_size > efi_capsule_max_size)
+		return ERR_PTR(-ENOSPC);
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx) {
+		pr_err("failed to allocate capsule context memory\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	pr_info("allocating: %zu\n", capsule_size);
+
+	needed_pages = ALIGN(capsule_size, PAGE_SIZE) >> PAGE_SHIFT;
+	ctx->pages = kcalloc(needed_pages, sizeof(*ctx->pages), GFP_KERNEL);
+	if (!ctx->pages)
+		goto fail;
+
+	while (needed_pages--) {
+		struct page *page;
+
+		/* Zeroed: a record with zero size marks the end of the data. */
+		page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+		if (!page)
+			goto fail;
+
+		ctx->pages[ctx->nr_pages++] = page;
+	}
+
+	/* The pages may be highmem and scattered: map them contiguously. */
+	ctx->capsule = vmap(ctx->pages, ctx->nr_pages, 0, PAGE_KERNEL);
+	if (!ctx->capsule)
+		goto fail;
+
+	ctx->capsule_size = capsule_size;
+	ctx->data = (void *)ctx->capsule + sizeof(efi_capsule_header_t);
+	ctx->data_size = data_size;
+
+	pr_info("allocated %zu bytes of capsule memory\n", data_size);
+
+	/* Setup the EFI capsule header. */
+	memcpy(&ctx->capsule->guid, &guid, sizeof(guid));
+
+	ctx->capsule->flags = EFI_CAPSULE_PERSIST_ACROSS_RESET |
+		EFI_CAPSULE_POPULATE_SYSTEM_TABLE;
+
+	ctx->capsule->headersize = sizeof(*ctx->capsule);
+	ctx->capsule->imagesize = capsule_size;
+
+	return ctx;
+
+fail:
+	while (ctx->nr_pages--)
+		__free_page(ctx->pages[ctx->nr_pages]);
+
+	kfree(ctx->pages);
+	kfree(ctx);
+	return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * efi_capsule_lookup - search capsule array for entries.
+ * @guid: the guid to search for.
+ * @nr_caps: the number of entries found.
+ *
+ * Map every capsule and inspect its guid. Capsules with guid @guid stay
+ * mapped and are returned in an array; all others are unmapped again.
+ *
+ * Returns an array of @nr_caps capsule headers with guid @guid, %NULL
+ * (and zero in @nr_caps) if no capsules were found, or ERR_PTR(-ENOMEM)
+ * (and zero in @nr_caps, nothing left mapped) if the array can't grow.
+ */
+static efi_capsule_header_t **
+efi_capsule_lookup(efi_guid_t guid, uint32_t *nr_caps)
+{
+	efi_capsule_header_t **capsules = NULL, **tmp;
+	uint32_t i;
+
+	*nr_caps = 0;
+	for (i = 0; i < efi_capsule_num; i++) {
+		efi_capsule_header_t *c;
+		size_t size;
+
+		c = ioremap((resource_size_t)prev_capsules[i], sizeof(*c));
+		if (!c) {
+			pr_err("failed to ioremap capsule\n");
+			continue;
+		}
+
+		size = c->imagesize;
+		iounmap(c);
+
+		c = ioremap((resource_size_t)prev_capsules[i], size);
+		if (!c) {
+			pr_err("failed to ioremap header + data\n");
+			continue;
+		}
+
+		if (efi_guidcmp(c->guid, guid)) {
+			iounmap(c);
+			continue;
+		}
+		/* The array holds pointers: grow it by one pointer slot. */
+		tmp = krealloc(capsules, (*nr_caps + 1) * sizeof(*tmp), GFP_KERNEL);
+		if (!tmp) {
+			iounmap(c);
+			while (*nr_caps)
+				iounmap(capsules[--(*nr_caps)]);
+			kfree(capsules);
+			return ERR_PTR(-ENOMEM);
+		}
+		capsules = tmp;
+		capsules[(*nr_caps)++] = c;
+	}
+
+	return capsules;
+}
+
+/*
+ * Map the capsule table the firmware published at efi.capsule (count
+ * prefix plus pointer array) into prev_capsules. Returns 0 or -ENOMEM.
+ */
+static int extract_capsules(void)
+{
+	void *capsule;
+
+	if (efi.capsule == EFI_INVALID_TABLE_ADDR)
+		return 0;
+
+	capsule = ioremap(efi.capsule, sizeof(efi_capsule_num));
+	if (!capsule)
+		return -ENOMEM;
+
+	/* The array of capsules is prefixed with the number of entries. */
+	efi_capsule_num = *(uint32_t *)capsule;
+	iounmap(capsule);
+	if (!efi_capsule_num) {
+		pr_info("no capsules on extraction\n");
+		return 0;
+	}
+
+	capsule = ioremap(efi.capsule, sizeof(uint32_t *) +
+			  efi_capsule_num * sizeof(*prev_capsules));
+	if (!capsule) {
+		efi_capsule_num = 0;	/* leave no entries for lookups to walk */
+		return -ENOMEM;
+	}
+
+	prev_capsules = capsule + sizeof(uint32_t *);
+	if (!*prev_capsules)
+		pr_err("capsule array has no entries\n");
+
+	return 0;
+}
+
+/* Undo efi_capsule_build(); a NULL or ERR_PTR() @ctx is ignored. */
+static void efi_capsule_free(struct efi_capsule_ctx *ctx)
+{
+	if (IS_ERR_OR_NULL(ctx))
+		return;
+
+	vunmap(ctx->capsule);
+	while (ctx->nr_pages--)
+		__free_page(ctx->pages[ctx->nr_pages]);
+	kfree(ctx->pages);
+	kfree(ctx);
+}
+
+/*
+ * We may not be in a position to allocate memory at the time of a
+ * crash, so pre-allocate some space now and register it with the
+ * firmware via efi_update_capsule().
+ *
+ * Also, iterate through the array of capsules pointed to from the EFI
+ * system table and take note of any LINUX_EFI_CRASH_GUID
+ * capsules. They will be parsed by efi_capsule_pstore_read().
+ */
+static int efi_capsule_pstore_setup(void)
+{
+	struct efi_capsule_pstore_record *rec;
+	struct efi_capsule_pstore *pctx = NULL;
+	struct efi_capsule_ctx *dmesg_ctx = NULL, *ftrace_ctx = NULL;
+	struct efi_capsule_ctx *console_ctx = NULL;
+	efi_capsule_header_t **hdrs;
+	uint32_t hdrs_num;
+	void *crash_buf = NULL;
+	size_t size, crash_size;
+	int rv;
+
+	extract_capsules();
+
+	pctx = kzalloc(sizeof(*pctx), GFP_KERNEL);
+	if (!pctx)
+		return -ENOMEM;
+
+	size = 16 * 1024;
+	if (WARN_ON_ONCE(size > efi_capsule_max_size))
+		size = efi_capsule_max_size;
+
+	/* Allocate all the capsules upfront */
+	dmesg_ctx = efi_capsule_build(LINUX_EFI_CRASH_GUID, size);
+	if (IS_ERR(dmesg_ctx)) {
+		rv = PTR_ERR(dmesg_ctx);
+		goto fail;
+	}
+
+	ftrace_ctx = efi_capsule_build(LINUX_EFI_CRASH_GUID, size);
+	if (IS_ERR(ftrace_ctx)) {
+		rv = PTR_ERR(ftrace_ctx);
+		goto fail;
+	}
+
+	console_ctx = efi_capsule_build(LINUX_EFI_CRASH_GUID, size);
+	if (IS_ERR(console_ctx)) {
+		rv = PTR_ERR(console_ctx);
+		goto fail;
+	}
+
+	crash_size = 4096;
+	crash_buf = kmalloc(crash_size, GFP_KERNEL);
+	if (!crash_buf) {
+		rv = -ENOMEM;
+		goto fail;
+	}
+
+	/* Register with the firmware. */
+	rv = efi_update_capsule(dmesg_ctx->capsule, dmesg_ctx->pages,
+				dmesg_ctx->capsule_size);
+	if (rv)
+		goto fail;
+
+	pr_info("Registered dmesg with firmware\n");
+	rv = efi_update_capsule(ftrace_ctx->capsule, ftrace_ctx->pages,
+				ftrace_ctx->capsule_size);
+	if (rv)
+		goto fail_ftrace;
+
+	pr_info("Registered ftrace with firmware\n");
+	rv = efi_update_capsule(console_ctx->capsule, console_ctx->pages,
+				console_ctx->capsule_size);
+	if (rv)
+		goto fail_console;
+
+	pr_info("Registered console with firmware\n");
+	pctx->dmesg.size = dmesg_ctx->data_size;
+	pctx->dmesg.buf = dmesg_ctx->data;
+	atomic_long_set(&pctx->dmesg.offset, 0);
+
+	/* Setup the pstore records for the ring-buffers. */
+	pctx->ftrace.size = ftrace_ctx->data_size - offsetof(typeof(*rec), data);
+	pctx->ftrace.buf = ftrace_ctx->data + offsetof(typeof(*rec), data);
+	atomic_long_set(&pctx->ftrace.offset, 0);
+	rec = ftrace_ctx->data;
+	rec->type = PSTORE_TYPE_FTRACE;
+	rec->size = pctx->ftrace.size;
+
+	pctx->console.size = console_ctx->data_size - offsetof(typeof(*rec), data);
+	pctx->console.buf = console_ctx->data + offsetof(typeof(*rec), data);
+	atomic_long_set(&pctx->console.offset, 0);
+	rec = console_ctx->data;
+	rec->type = PSTORE_TYPE_CONSOLE;
+	rec->size = pctx->console.size;
+
+	/* Read any pstore entries that were passed across a reboot. */
+	pr_info("looking up old capsules\n");
+	hdrs = efi_capsule_lookup(LINUX_EFI_CRASH_GUID, &hdrs_num);
+	pctx->hdrs = IS_ERR(hdrs) ? NULL : hdrs;
+	pctx->hdrs_num = pctx->hdrs ? hdrs_num : 0;	/* never count without an array */
+
+	if (pctx->hdrs_num)
+		pr_info("found Linux Crash Capsule\n");
+
+	/* Register the capsule backend with pstore. */
+	spin_lock_init(&efi_capsule_info.buf_lock);
+	efi_capsule_info.buf = crash_buf;
+	efi_capsule_info.bufsize = crash_size;
+	efi_capsule_info.data = pctx;
+
+	pr_info("registering with pstore\n");
+	rv = pstore_register(&efi_capsule_info);
+	if (rv)
+		pr_err("capsule support registration failed for pstore: %d\n", rv);
+
+	return rv;
+
+fail:
+	efi_capsule_free(dmesg_ctx);
+fail_ftrace:	/* dmesg was already handed to the firmware above: keep it */
+	efi_capsule_free(ftrace_ctx);
+fail_console:	/* dmesg and ftrace were already handed over: keep them */
+	efi_capsule_free(console_ctx);
+
+	kfree(crash_buf);
+	kfree(pctx);
+	return rv;
+}
+
+/*
+ * Return the next pstore record that was passed to us across a reboot
+ * in an EFI capsule, or NULL once every capsule has been consumed.
+ *
+ * This is expected to be called under the pstore
+ * read_mutex. Therefore, no serialisation is done here.
+ */
+static struct efi_capsule_pstore_record *
+get_pstore_read_record(struct efi_capsule_pstore *pctx)
+{
+	struct efi_capsule_pstore_record *rec;
+	efi_capsule_header_t *hdr;
+	off_t remaining;
+
+next:
+	if (!pctx->hdrs || !pctx->hdrs_num)
+		return NULL;
+
+	hdr = pctx->hdrs[pctx->hdrs_num - 1];
+	rec = (void *)hdr + hdr->headersize + pctx->hdr_offset;
+
+	remaining = hdr->imagesize - hdr->headersize - pctx->hdr_offset - offsetof(typeof(*rec), data);
+
+	/*
+	 * A single EFI capsule may contain multiple pstore
+	 * records. It may also only be partially filled with pstore
+	 * records, which we can detect by checking for a record with
+	 * zero size. A record whose header or data would not fit in
+	 * what is left of the capsule is treated the same way.
+	 *
+	 * If there are no more entries in this capsule try the next.
+	 */
+	if (remaining <= 0 || !rec->size || rec->size > (size_t)remaining) {
+		pctx->hdrs_num--;
+		pctx->hdr_offset = 0;
+		goto next;
+	}
+
+	/*
+	 * Last record in this capsule? Then move onto the next capsule,
+	 * otherwise advance pctx->hdr_offset past this record.
+	 */
+	if (rec->size == remaining) {
+		pctx->hdrs_num--;
+		pctx->hdr_offset = 0;
+	} else
+		pctx->hdr_offset += rec->size + offsetof(typeof(*rec), data);
+
+	return rec;
+}
+
+/*
+ * pstore ->read(): return a kmalloc()ed copy of the next record left over
+ * from the previous boot in *buf. Returns 0 when none remain, or -ENOMEM.
+ */
+static ssize_t efi_capsule_pstore_read(u64 *id, enum pstore_type_id *type,
+				       int *count, struct timespec *time,
+				       char **buf, struct pstore_info *psi)
+{
+	struct efi_capsule_pstore_record *rec;
+
+	pr_debug("%s:%d\n", __func__, __LINE__);
+	rec = get_pstore_read_record(psi->data);
+	if (!rec)
+		return 0;
+
+	*type = rec->type;
+	time->tv_sec = rec->timestamp;
+	time->tv_nsec = 0;
+	*id = rec->id;
+
+	*buf = kmalloc(rec->size, GFP_KERNEL);
+	if (!*buf)
+		return -ENOMEM;
+
+	memcpy(*buf, rec->data, rec->size);
+	return rec->size;
+}
+
+/*
+ * Reserve space in @pbuf for a record header plus up to *@size bytes,
+ * trimming *@size to what fits; NULL if not even one data byte fits.
+ * Called with ->buf_lock held, so no serialisation is done here.
+ */
+static notrace struct efi_capsule_pstore_record *
+get_pstore_write_record(struct efi_capsule_pstore_buf *pbuf, size_t *size)
+{
+	struct efi_capsule_pstore_record *rec;
+	const size_t hdr = offsetof(typeof(*rec), data);
+	size_t offset = atomic_long_read(&pbuf->offset);
+
+	if (offset >= pbuf->size || pbuf->size - offset <= hdr)
+		return NULL;
+
+	/* Trim 'size' to the space left once the header is accounted for */
+	if (*size > pbuf->size - offset - hdr)
+		*size = pbuf->size - offset - hdr;
+
+	rec = pbuf->buf + offset;
+	atomic_long_add(hdr + *size, &pbuf->offset);
+	return rec;
+}
+
+/*
+ * pstore ->write(): append up to @size bytes from psi->buf to the dmesg
+ * capsule as one record. Returns 0, -EINVAL (empty) or -ENOSPC (full).
+ */
+static int notrace
+efi_capsule_pstore_write(enum pstore_type_id type,
+			 enum kmsg_dump_reason reason, u64 *id,
+			 unsigned int part, int count, size_t hsize,
+			 size_t size, struct pstore_info *psi)
+{
+	struct efi_capsule_pstore_record *rec;
+	struct efi_capsule_pstore *pctx = psi->data;
+
+	pr_debug("%s:%d\n", __func__, __LINE__);
+	/* A zero size record would break detection of partially-filled capsules. */
+	if (!size)
+		return -EINVAL;
+
+	rec = get_pstore_write_record(&pctx->dmesg, &size);
+	if (!rec)
+		return -ENOSPC;
+
+	pr_debug("got record %p, %p %zu\n", rec, rec->data, size);
+	rec->type = type;
+	rec->timestamp = get_seconds();
+	rec->size = size;
+	rec->id = (*id)++;
+	memcpy(rec->data, psi->buf, size);
+
+	return 0;
+}
+
+/*
+ * Lock-free reservation of @size bytes in the ring-buffer @pbuf. The
+ * write offset wraps to the start of the buffer when the request does
+ * not fit in the space left at the end. Returns NULL if @size exceeds
+ * the whole buffer.
+ */
+static notrace void *
+get_pstore_buf(struct efi_capsule_pstore_buf *pbuf, size_t size)
+{
+	long curr, start, next;
+
+	if (size > pbuf->size)
+		return NULL;
+
+	do {
+		curr = atomic_long_read(&pbuf->offset);
+		start = curr;
+
+		/* Wrap? */
+		if (start + size > pbuf->size)
+			start = 0;
+		next = start + size;
+		/* cmpxchg returns the old value: retry if someone raced us */
+	} while (atomic_long_cmpxchg(&pbuf->offset, curr, next) != curr);
+
+	return pbuf->buf + start;
+}
+
+/*
+ * ->write_buf() for the ftrace/console ring-buffers. Must not printk():
+ * it runs for every console message, so logging here would recurse.
+ */
+static int notrace
+efi_capsule_pstore_write_buf(enum pstore_type_id type,
+			     enum kmsg_dump_reason reason,
+			     u64 *id, unsigned int part,
+			     const char *buf, size_t hsize,
+			     size_t size, struct pstore_info *psi)
+{
+	struct efi_capsule_pstore *pctx = psi->data;
+	void *dst;
+
+	if (type == PSTORE_TYPE_FTRACE)
+		dst = get_pstore_buf(&pctx->ftrace, size);
+	else if (type == PSTORE_TYPE_CONSOLE)
+		dst = get_pstore_buf(&pctx->console, size);
+	else
+		return -EINVAL;
+	if (!dst)
+		return -ENOSPC;
+	memcpy(dst, buf, size);
+	return 0;
+}
+
+static struct pstore_info efi_capsule_info = {	/* .buf, .bufsize and .data are set in efi_capsule_pstore_setup() */
+	.owner     = THIS_MODULE,
+	.name      = "efi-capsule",
+	.read      = efi_capsule_pstore_read,	/* records from earlier capsules */
+	.write     = efi_capsule_pstore_write,	/* appends to the dmesg capsule */
+	.write_buf = efi_capsule_pstore_write_buf,	/* ftrace and console ring-buffers */
+};
+
+#else
+static int efi_capsule_pstore_setup(void) { return 0; } /* no-op stub: backend not configured */
+#endif /* CONFIG_EFI_CAPSULE_PSTORE */
+
 /*
  * Construct a fake capsule header to query capsule support.
  */
@@ -67,8 +627,7 @@
 	efi_capsule_header_t *capsule;
 	efi_status_t status;
 	efi_guid_t guid = LINUX_EFI_CRASH_GUID;
-	u64 max;
-	int reset_type, rv = 0;
+	int rv = 0;
 
 	if (!efi_enabled(EFI_RUNTIME_SERVICES))
 		return -ENODEV;
@@ -82,13 +641,14 @@
 		EFI_CAPSULE_POPULATE_SYSTEM_TABLE;
 	memcpy(&capsule->guid, &guid, sizeof(guid));
 
-	status = efi.query_capsule_caps(&capsule, 1, &max, &reset_type);
+	status = efi.query_capsule_caps(&capsule, 1, &efi_capsule_max_size,
+					&efi_reset_type);
 	if (status != EFI_SUCCESS) {
 		rv = -ENODEV;
 		goto out;
 	}
 
-	switch (reset_type) {
+	switch (efi_reset_type) {
 	case EFI_RESET_COLD:
 	case EFI_RESET_WARM:
 	case EFI_RESET_SHUTDOWN:
@@ -183,6 +743,7 @@
 	}
 
 	kfree(block_pgs);
+	set_bit(CAPSULE_PENDING, &capsule_status);
 	return 0;
 
 fail:
@@ -385,6 +946,8 @@
 
 	pr_info("EFI capsule support enabled\n");
 
+	efi_capsule_pstore_setup();
+
 	return 0;
 }
 device_initcall(efi_capsule_init);
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index b0b3f4a..9379b32 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -32,6 +32,7 @@
 	.hcdp       = EFI_INVALID_TABLE_ADDR,
 	.uga        = EFI_INVALID_TABLE_ADDR,
 	.uv_systab  = EFI_INVALID_TABLE_ADDR,
+	.capsule    = EFI_INVALID_TABLE_ADDR,
 };
 EXPORT_SYMBOL(efi);
 
@@ -64,6 +65,8 @@
 		str += sprintf(str, "BOOTINFO=0x%lx\n", efi.boot_info);
 	if (efi.uga != EFI_INVALID_TABLE_ADDR)
 		str += sprintf(str, "UGA=0x%lx\n", efi.uga);
+	if (efi.capsule != EFI_INVALID_TABLE_ADDR)
+		str += sprintf(str, "CAPSULE=0x%lx\n", efi.capsule);
 
 	return str - buf;
 }
@@ -190,6 +193,7 @@
 	{SAL_SYSTEM_TABLE_GUID, "SALsystab", &efi.sal_systab},
 	{SMBIOS_TABLE_GUID, "SMBIOS", &efi.smbios},
 	{UGA_IO_PROTOCOL_GUID, "UGA", &efi.uga},
+	{LINUX_EFI_CRASH_GUID, "CAPSULE", &efi.capsule},
 	{NULL_GUID, NULL, 0},
 };
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 07f6fc4..a71f91e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -191,6 +191,19 @@
 #ifdef CONFIG_MAGIC_SYSRQ
 /* Note: sysrq code uses it's own private copy */
 static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
+static int mjf_foo = 0;	/* referenced as .data of the "mjf" sysctl entry */
+/* NOTE(review): defined under CONFIG_MAGIC_SYSRQ but the "mjf" entry looks unconditional - verify */
+static int mjf_handler(ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	char *killer = NULL;
+
+	*killer = 1;	/* unconditional write through a NULL pointer */
+	/* NOTE(review): presumably a deliberate crash trigger for testing - confirm before merging */
+	return 0;
+}
+
 
 static int sysrq_sysctl_handler(ctl_table *table, int write,
 				void __user *buffer, size_t *lenp,
@@ -662,6 +675,14 @@
 		.proc_handler	= sysrq_sysctl_handler,
 	},
 #endif
+	{
+		.procname	= "mjf",
+		.data		= &mjf_foo,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= mjf_handler,
+	},
+
 #ifdef CONFIG_PROC_SYSCTL
 	{
 		.procname	= "cad_pid",