Merge branch 'x86/urgent'
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 27e67a9..1750fce 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -287,6 +287,8 @@
 	- semantics and behavior of local atomic operations.
 lockdep-design.txt
 	- documentation on the runtime locking correctness validator.
+locking/
+	- directory with info about kernel locking primitives
 lockstat.txt
 	- info on collecting statistics on locks (and contention).
 lockup-watchdogs.txt
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
index e584ee1..7c9cc48 100644
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -1972,7 +1972,7 @@
    <itemizedlist>
     <listitem>
      <para>
-       <filename>Documentation/spinlocks.txt</filename>: 
+       <filename>Documentation/locking/spinlocks.txt</filename>:
        Linus Torvalds' spinlocking tutorial in the kernel sources.
      </para>
     </listitem>
diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index 0a523c9..482c749 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -794,6 +794,7 @@
   <http://www.kroah.com/log/linux/maintainer-03.html>
   <http://www.kroah.com/log/linux/maintainer-04.html>
   <http://www.kroah.com/log/linux/maintainer-05.html>
+  <http://www.kroah.com/log/linux/maintainer-06.html>
 
 NO!!!! No more huge patch bombs to linux-kernel@vger.kernel.org people!
   <https://lkml.org/lkml/2005/7/11/336>
diff --git a/Documentation/devicetree/bindings/input/atmel,maxtouch.txt b/Documentation/devicetree/bindings/input/atmel,maxtouch.txt
index baef432..0ac23f2 100644
--- a/Documentation/devicetree/bindings/input/atmel,maxtouch.txt
+++ b/Documentation/devicetree/bindings/input/atmel,maxtouch.txt
@@ -15,6 +15,17 @@
     keycode generated by each GPIO. Linux keycodes are defined in
     <dt-bindings/input/input.h>.
 
+- linux,gpio-keymap: When enabled, the SPT_GPIOPWM_T19 object sends messages
+    on GPIO bit changes. An array of up to 8 entries can be provided
+    indicating the Linux keycode mapped to each bit of the status byte,
+    starting at the LSB. Linux keycodes are defined in
+    <dt-bindings/input/input.h>.
+
+    Note: the numbering of the GPIOs and the bit they start at varies between
+    maXTouch devices. You must either refer to the documentation, or
+    experiment to determine which bit corresponds to which input. Use
+    KEY_RESERVED for unused padding values.
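+
+    For example, a device routing bits 2 and 3 to the volume keys might
+    use (keycode values here are illustrative only):
+
+        linux,gpio-keymap = <KEY_RESERVED KEY_RESERVED
+                             KEY_VOLUMEUP KEY_VOLUMEDOWN>;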
+
 Example:
 
 	touch@4b {
diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index 9b03c57..e45ac3f 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -39,6 +39,10 @@
   further clocks may be specified in derived bindings.
 - clock-names: One name for each entry in the clocks property, the
   first one should be "stmmaceth".
+- clk_ptp_ref: the PTP reference clock. If PTP is available, this clock is
+  used for programming the Timestamp Addend Register. If not passed, the
+  system clock is used instead, which is fine on some platforms.
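+
+  A node using a dedicated PTP reference clock might list it as follows
+  (phandle names here are illustrative):
+
+	clocks = <&sys_clk>, <&ptp_clk>;
+	clock-names = "stmmaceth", "clk_ptp_ref";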
 
 Examples:
 
diff --git a/Documentation/devicetree/bindings/regulator/tps65090.txt b/Documentation/devicetree/bindings/regulator/tps65090.txt
index 34098023..ca69f5e 100644
--- a/Documentation/devicetree/bindings/regulator/tps65090.txt
+++ b/Documentation/devicetree/bindings/regulator/tps65090.txt
@@ -45,8 +45,8 @@
 		infet5-supply = <&some_reg>;
 		infet6-supply = <&some_reg>;
 		infet7-supply = <&some_reg>;
-		vsys_l1-supply = <&some_reg>;
-		vsys_l2-supply = <&some_reg>;
+		vsys-l1-supply = <&some_reg>;
+		vsys-l2-supply = <&some_reg>;
 
 		regulators {
 			dcdc1 {
diff --git a/Documentation/devicetree/bindings/sound/adi,axi-spdif-tx.txt b/Documentation/devicetree/bindings/sound/adi,axi-spdif-tx.txt
index 46f3449..4eb7997 100644
--- a/Documentation/devicetree/bindings/sound/adi,axi-spdif-tx.txt
+++ b/Documentation/devicetree/bindings/sound/adi,axi-spdif-tx.txt
@@ -1,7 +1,7 @@
 ADI AXI-SPDIF controller
 
 Required properties:
- - compatible : Must be "adi,axi-spdif-1.00.a"
+ - compatible : Must be "adi,axi-spdif-tx-1.00.a"
  - reg : Must contain SPDIF core's registers location and length
  - clocks : Pairs of phandle and specifier referencing the controller's clocks.
    The controller expects two clocks, the clock used for the AXI interface and
diff --git a/Documentation/filesystems/nfs/nfs-rdma.txt b/Documentation/filesystems/nfs/nfs-rdma.txt
index e386f7e..7240438 100644
--- a/Documentation/filesystems/nfs/nfs-rdma.txt
+++ b/Documentation/filesystems/nfs/nfs-rdma.txt
@@ -138,9 +138,9 @@
   - Build, install, reboot
 
     The NFS/RDMA code will be enabled automatically if NFS and RDMA
-    are turned on. The NFS/RDMA client and server are configured via the hidden
-    SUNRPC_XPRT_RDMA config option that depends on SUNRPC and INFINIBAND. The
-    value of SUNRPC_XPRT_RDMA will be:
+    are turned on. The NFS/RDMA client and server are configured via the
+    SUNRPC_XPRT_RDMA_CLIENT and SUNRPC_XPRT_RDMA_SERVER config options that both
+    depend on SUNRPC and INFINIBAND. The default value of both options will be:
 
      - N if either SUNRPC or INFINIBAND are N, in this case the NFS/RDMA client
        and server will not be built
@@ -235,8 +235,9 @@
 
   - Start the NFS server
 
-    If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
-    kernel config), load the RDMA transport module:
+    If the NFS/RDMA server was built as a module
+    (CONFIG_SUNRPC_XPRT_RDMA_SERVER=m in kernel config), load the RDMA
+    transport module:
 
     $ modprobe svcrdma
 
@@ -255,8 +256,9 @@
 
   - On the client system
 
-    If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
-    kernel config), load the RDMA client module:
+    If the NFS/RDMA client was built as a module
+    (CONFIG_SUNRPC_XPRT_RDMA_CLIENT=m in kernel config), load the RDMA client
+    module:
 
     $ modprobe xprtrdma.ko
 
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
index 1fe0ccb..8ea3e90 100644
--- a/Documentation/filesystems/seq_file.txt
+++ b/Documentation/filesystems/seq_file.txt
@@ -235,6 +235,39 @@
 private field of the seq_file structure; that value can then be retrieved
 by the iterator functions.
 
+There is also a wrapper around seq_open() called seq_open_private(). It
+kmallocs a zero-filled block of memory and stores a pointer to it in the
+private field of the seq_file structure, returning 0 on success. The
+block size is specified in a third parameter to the function, e.g.:
+
+	static int ct_open(struct inode *inode, struct file *file)
+	{
+		return seq_open_private(file, &ct_seq_ops,
+					sizeof(struct mystruct));
+	}
+
+There is also a variant function, __seq_open_private(), which is functionally
+identical except that, if successful, it returns the pointer to the allocated
+memory block, allowing further initialisation, e.g.:
+
+	static int ct_open(struct inode *inode, struct file *file)
+	{
+		struct mystruct *p =
+			__seq_open_private(file, &ct_seq_ops, sizeof(*p));
+
+		if (!p)
+			return -ENOMEM;
+
+		p->foo = bar; /* initialize my stuff */
+			...
+		p->baz = true;
+
+		return 0;
+	}
+
+A corresponding close function, seq_release_private(), is available which
+frees the memory allocated in the corresponding open.
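+
+For example (a sketch based on the ct_open() above), the file would be
+wired up with seq_release_private() as its release method:
+
+	static const struct file_operations ct_file_ops = {
+		.owner   = THIS_MODULE,
+		.open    = ct_open,
+		.read    = seq_read,
+		.llseek  = seq_lseek,
+		.release = seq_release_private,
+	};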
+
 The other operations of interest - read(), llseek(), and release() - are
 all implemented by the seq_file code itself. So a virtual file's
 file_operations structure will look like:
diff --git a/Documentation/gpio/consumer.txt b/Documentation/gpio/consumer.txt
index 7654632..6ce5441 100644
--- a/Documentation/gpio/consumer.txt
+++ b/Documentation/gpio/consumer.txt
@@ -53,7 +53,20 @@
 if and only if no GPIO has been assigned to the device/function/index triplet,
 other error codes are used for cases where a GPIO has been assigned but an error
 occurred while trying to acquire it. This is useful to discriminate between mere
-errors and an absence of GPIO for optional GPIO parameters.
+errors and an absence of GPIO for optional GPIO parameters. For the common
+pattern where a GPIO is optional, the gpiod_get_optional() and
+gpiod_get_index_optional() functions can be used. These functions return NULL
+instead of -ENOENT if no GPIO has been assigned to the requested function:
+
+	struct gpio_desc *gpiod_get_optional(struct device *dev,
+					     const char *con_id,
+					     enum gpiod_flags flags)
+
+	struct gpio_desc *gpiod_get_index_optional(struct device *dev,
+						   const char *con_id,
+						   unsigned int index,
+						   enum gpiod_flags flags)
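+
+	For example (a sketch, using a hypothetical "reset" line; error
+	handling beyond the lookup is elided):
+
+	struct gpio_desc *desc;
+
+	desc = gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
+	if (IS_ERR(desc))
+		return PTR_ERR(desc);
+	if (desc)	/* NULL simply means no reset GPIO was mapped */
+		gpiod_set_value(desc, 1);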
 
 Device-managed variants of these functions are also defined:
 
@@ -65,6 +78,15 @@
 					       unsigned int idx,
 					       enum gpiod_flags flags)
 
+	struct gpio_desc *devm_gpiod_get_optional(struct device *dev,
+						  const char *con_id,
+						  enum gpiod_flags flags)
+
+	struct gpio_desc *devm_gpiod_get_index_optional(struct device *dev,
+							const char *con_id,
+							unsigned int index,
+							enum gpiod_flags flags)
+
 A GPIO descriptor can be disposed of using the gpiod_put() function:
 
 	void gpiod_put(struct gpio_desc *desc)
diff --git a/Documentation/i2c/dev-interface b/Documentation/i2c/dev-interface
index 3e742ba..2ac78ae 100644
--- a/Documentation/i2c/dev-interface
+++ b/Documentation/i2c/dev-interface
@@ -57,12 +57,12 @@
 I2C to communicate with your device. SMBus commands are preferred if
 the device supports them. Both are illustrated below.
 
-  __u8 register = 0x10; /* Device register to access */
+  __u8 reg = 0x10; /* Device register to access */
   __s32 res;
   char buf[10];
 
   /* Using SMBus commands */
-  res = i2c_smbus_read_word_data(file, register);
+  res = i2c_smbus_read_word_data(file, reg);
   if (res < 0) {
     /* ERROR HANDLING: i2c transaction failed */
   } else {
@@ -70,11 +70,11 @@
   }
 
   /* Using I2C Write, equivalent of 
-     i2c_smbus_write_word_data(file, register, 0x6543) */
-  buf[0] = register;
+     i2c_smbus_write_word_data(file, reg, 0x6543) */
+  buf[0] = reg;
   buf[1] = 0x43;
   buf[2] = 0x65;
-  if (write(file, buf, 3) ! =3) {
+  if (write(file, buf, 3) != 3) {
     /* ERROR HANDLING: i2c transaction failed */
   }
 
diff --git a/Documentation/lockdep-design.txt b/Documentation/locking/lockdep-design.txt
similarity index 100%
rename from Documentation/lockdep-design.txt
rename to Documentation/locking/lockdep-design.txt
diff --git a/Documentation/lockstat.txt b/Documentation/locking/lockstat.txt
similarity index 98%
rename from Documentation/lockstat.txt
rename to Documentation/locking/lockstat.txt
index 72d0106..7428773 100644
--- a/Documentation/lockstat.txt
+++ b/Documentation/locking/lockstat.txt
@@ -12,7 +12,7 @@
 - HOW
 
 Lockdep already has hooks in the lock functions and maps lock instances to
-lock classes. We build on that (see Documentation/lockdep-design.txt).
+lock classes. We build on that (see Documentation/locking/lockdep-design.txt).
 The graph below shows the relation between the lock functions and the various
 hooks therein.
 
diff --git a/Documentation/mutex-design.txt b/Documentation/locking/mutex-design.txt
similarity index 96%
rename from Documentation/mutex-design.txt
rename to Documentation/locking/mutex-design.txt
index ee231ed..60c482d 100644
--- a/Documentation/mutex-design.txt
+++ b/Documentation/locking/mutex-design.txt
@@ -145,9 +145,9 @@
 
 Unlike its original design and purpose, 'struct mutex' is larger than
 most locks in the kernel. E.g: on x86-64 it is 40 bytes, almost twice
-as large as 'struct semaphore' (24 bytes) and 8 bytes shy of the
-'struct rw_semaphore' variant. Larger structure sizes mean more CPU
-cache and memory footprint.
+as large as 'struct semaphore' (24 bytes) and tied, along with rwsems,
+for the largest lock in the kernel. Larger structure sizes mean more
+CPU cache and memory footprint.
 
 When to use mutexes
 -------------------
diff --git a/Documentation/rt-mutex-design.txt b/Documentation/locking/rt-mutex-design.txt
similarity index 100%
rename from Documentation/rt-mutex-design.txt
rename to Documentation/locking/rt-mutex-design.txt
diff --git a/Documentation/rt-mutex.txt b/Documentation/locking/rt-mutex.txt
similarity index 100%
rename from Documentation/rt-mutex.txt
rename to Documentation/locking/rt-mutex.txt
diff --git a/Documentation/spinlocks.txt b/Documentation/locking/spinlocks.txt
similarity index 97%
rename from Documentation/spinlocks.txt
rename to Documentation/locking/spinlocks.txt
index 97eaf57..ff35e40 100644
--- a/Documentation/spinlocks.txt
+++ b/Documentation/locking/spinlocks.txt
@@ -105,9 +105,9 @@
 	spin_unlock(&lock);
 
 (and the equivalent read-write versions too, of course). The spinlock will
-guarantee the same kind of exclusive access, and it will be much faster. 
+guarantee the same kind of exclusive access, and it will be much faster.
 This is useful if you know that the data in question is only ever
-manipulated from a "process context", ie no interrupts involved. 
+manipulated from a "process context", ie no interrupts involved.
 
 The reasons you mustn't use these versions if you have interrupts that
 play with the spinlock is that you can get deadlocks:
@@ -122,21 +122,21 @@
 interrupt happens on the same CPU that already holds the lock, because the
 lock will obviously never be released (because the interrupt is waiting
 for the lock, and the lock-holder is interrupted by the interrupt and will
-not continue until the interrupt has been processed). 
+not continue until the interrupt has been processed).
 
 (This is also the reason why the irq-versions of the spinlocks only need
 to disable the _local_ interrupts - it's ok to use spinlocks in interrupts
 on other CPU's, because an interrupt on another CPU doesn't interrupt the
 CPU that holds the lock, so the lock-holder can continue and eventually
-releases the lock). 
+releases the lock).
 
 Note that you can be clever with read-write locks and interrupts. For
 example, if you know that the interrupt only ever gets a read-lock, then
 you can use a non-irq version of read locks everywhere - because they
-don't block on each other (and thus there is no dead-lock wrt interrupts. 
-But when you do the write-lock, you have to use the irq-safe version. 
+don't block on each other (and thus there is no dead-lock wrt interrupts.
+But when you do the write-lock, you have to use the irq-safe version.
 
-For an example of being clever with rw-locks, see the "waitqueue_lock" 
+For an example of being clever with rw-locks, see the "waitqueue_lock"
 handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from
 within an interrupt, they only read the queue in order to know whom to
 wake up. So read-locks are safe (which is good: they are very common
diff --git a/Documentation/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt
similarity index 100%
rename from Documentation/ww-mutex-design.txt
rename to Documentation/locking/ww-mutex-design.txt
diff --git a/Documentation/misc-devices/lis3lv02d b/Documentation/misc-devices/lis3lv02d
index af815b9..f89960a 100644
--- a/Documentation/misc-devices/lis3lv02d
+++ b/Documentation/misc-devices/lis3lv02d
@@ -59,7 +59,7 @@
 from the device. It supports blocking operations, poll/select and
 fasync operation modes. You must read 1 byte from the device.  The
 result is number of free-fall interrupts since the last successful
-read (or 255 if number of interrupts would not fit). See the hpfall.c
+read (or 255 if number of interrupts would not fit). See the freefall.c
 file for an example on using the device.
 
 
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt
index 81c0e2b..8afb236c 100644
--- a/Documentation/power/regulator/consumer.txt
+++ b/Documentation/power/regulator/consumer.txt
@@ -143,8 +143,9 @@
 on all its consumers) and change operating mode (if necessary and permitted)
 to best match the current operating load.
 
-The load_uA value can be determined from the consumers datasheet. e.g.most
-datasheets have tables showing the max current consumed in certain situations.
+The load_uA value can be determined from the consumer's datasheet. e.g. most
+datasheets have tables showing the maximum current consumed in certain
+situations.
 
 Most consumers will use indirect operating mode control since they have no
 knowledge of the regulator or whether the regulator is shared with other
@@ -173,7 +174,7 @@
 int regulator_register_notifier(struct regulator *regulator,
 			      struct notifier_block *nb);
 
-Consumers can uregister interest by calling :-
+Consumers can unregister interest by calling :-
 
 int regulator_unregister_notifier(struct regulator *regulator,
 				struct notifier_block *nb);
diff --git a/Documentation/power/regulator/design.txt b/Documentation/power/regulator/design.txt
index f9b56b7..fdd919b 100644
--- a/Documentation/power/regulator/design.txt
+++ b/Documentation/power/regulator/design.txt
@@ -9,14 +9,14 @@
 
  - Errors in regulator configuration can have very serious consequences
    for the system, potentially including lasting hardware damage.
- - It is not possible to automatically determine the power confugration
+ - It is not possible to automatically determine the power configuration
    of the system - software-equivalent variants of the same chip may
-   have different power requirments, and not all components with power
+   have different power requirements, and not all components with power
    requirements are visible to software.
 
   => The API should make no changes to the hardware state unless it has
-     specific knowledge that these changes are safe to do perform on
-     this particular system.
+     specific knowledge that these changes are safe to perform on this
+     particular system.
 
 Consumer use cases
 ------------------
diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt
index ce63af0..757e3b5 100644
--- a/Documentation/power/regulator/machine.txt
+++ b/Documentation/power/regulator/machine.txt
@@ -11,7 +11,7 @@
                +-> [Consumer B @ 3.3V]
 
 The drivers for consumers A & B must be mapped to the correct regulator in
-order to control their power supply. This mapping can be achieved in machine
+order to control their power supplies. This mapping can be achieved in machine
 initialisation code by creating a struct regulator_consumer_supply for
 each regulator.
 
@@ -39,7 +39,7 @@
 
 Constraints can now be registered by defining a struct regulator_init_data
 for each regulator power domain. This structure also maps the consumers
-to their supply regulator :-
+to their supply regulators :-
 
 static struct regulator_init_data regulator1_data = {
 	.constraints = {
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt
index 8ed1758..40ca2d6 100644
--- a/Documentation/power/regulator/overview.txt
+++ b/Documentation/power/regulator/overview.txt
@@ -36,11 +36,11 @@
                    Consumers can be classified into two types:-
 
                    Static: consumer does not change its supply voltage or
-                   current limit. It only needs to enable or disable it's
+                   current limit. It only needs to enable or disable its
                    power supply. Its supply voltage is set by the hardware,
                    bootloader, firmware or kernel board initialisation code.
 
-                   Dynamic: consumer needs to change it's supply voltage or
+                   Dynamic: consumer needs to change its supply voltage or
                    current limit to meet operation demands.
 
 
@@ -156,7 +156,7 @@
       This interface is for machine specific code and allows the creation of
       voltage/current domains (with constraints) for each regulator. It can
       provide regulator constraints that will prevent device damage through
-      overvoltage or over current caused by buggy client drivers. It also
+      overvoltage or overcurrent caused by buggy client drivers. It also
       allows the creation of a regulator tree whereby some regulators are
       supplied by others (similar to a clock tree).
 
diff --git a/Documentation/power/regulator/regulator.txt b/Documentation/power/regulator/regulator.txt
index 1390277..b17e583 100644
--- a/Documentation/power/regulator/regulator.txt
+++ b/Documentation/power/regulator/regulator.txt
@@ -13,7 +13,7 @@
 struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
 					 const struct regulator_config *config);
 
-This will register the regulators capabilities and operations to the regulator
+This will register the regulator's capabilities and operations to the regulator
 core.
 
 Regulators can be unregistered by calling :-
@@ -23,8 +23,8 @@
 
 Regulator Events
 ================
-Regulators can send events (e.g. over temp, under voltage, etc) to consumer
-drivers by calling :-
+Regulators can send events (e.g. overtemperature, undervoltage, etc) to
+consumer drivers by calling :-
 
 int regulator_notifier_call_chain(struct regulator_dev *rdev,
 				  unsigned long event, void *data);
diff --git a/MAINTAINERS b/MAINTAINERS
index cf24bb5..7d72ba9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5594,8 +5594,8 @@
 L:	linux-kernel@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/locking
 S:	Maintained
-F:	Documentation/lockdep*.txt
-F:	Documentation/lockstat.txt
+F:	Documentation/locking/lockdep*.txt
+F:	Documentation/locking/lockstat.txt
 F:	include/linux/lockdep.h
 F:	kernel/locking/
 
@@ -6145,6 +6145,14 @@
 W:	http://www.native-instruments.com
 F:	sound/usb/caiaq/
 
+NATIVE LINUX KVM TOOL
+M:	Pekka Enberg <penberg@kernel.org>
+M:	Sasha Levin <levinsasha928@gmail.com>
+M:	Asias He <asias.hejun@gmail.com>
+L:	kvm@vger.kernel.org
+S:	Maintained
+F:	tools/kvm/
+
 NCP FILESYSTEM
 M:	Petr Vandrovec <petr@vandrovec.name>
 S:	Odd Fixes
@@ -10070,9 +10078,9 @@
 F:	arch/x86/
 
 X86 PLATFORM DRIVERS
-M:	Matthew Garrett <matthew.garrett@nebula.com>
+M:	Darren Hart <dvhart@infradead.org>
 L:	platform-driver-x86@vger.kernel.org
-T:	git git://cavan.codon.org.uk/platform-drivers-x86.git
+T:	git git://git.infradead.org/users/dvhart/linux-platform-drivers-x86.git
 S:	Maintained
 F:	drivers/platform/x86/
 
diff --git a/Makefile b/Makefile
index 2893d7f..1a60bdd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 3
 PATCHLEVEL = 17
 SUBLEVEL = 0
-EXTRAVERSION = -rc3
+EXTRAVERSION = -rc4
 NAME = Shuffling Zombie Juror
 
 # *DOCUMENTATION*
diff --git a/arch/Kconfig b/arch/Kconfig
index 0eae9df..05d7a8a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -323,6 +323,17 @@
 	    results in the system call being skipped immediately.
 	  - seccomp syscall wired up
 
+	  For best performance, an arch should use seccomp_phase1 and
+	  seccomp_phase2 directly.  It should call seccomp_phase1 for all
+	  syscalls if TIF_SECCOMP is set, but seccomp_phase1 does not
+	  need to be called from a ptrace-safe context.  It must then
+	  call seccomp_phase2 if seccomp_phase1 returns anything other
+	  than SECCOMP_PHASE1_OK or SECCOMP_PHASE1_SKIP.
+
+	  As an additional optimization, an arch may provide seccomp_data
+	  directly to seccomp_phase1; this avoids multiple calls
+	  to the syscall_xyz helpers for every syscall.
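+
+	  A sketch of that fast path (illustrative only, not taken from
+	  any particular arch; "skip_syscall" is a hypothetical label):
+
+		u32 phase1 = seccomp_phase1(NULL); /* NULL: no seccomp_data */
+
+		if (phase1 != SECCOMP_PHASE1_OK &&
+		    (phase1 == SECCOMP_PHASE1_SKIP ||
+		     seccomp_phase2(phase1) == -1))
+			goto skip_syscall;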
+
 config SECCOMP_FILTER
 	def_bool y
 	depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index ed60a1e..6fbb53a 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -29,145 +29,92 @@
  * branch back to restart the operation.
  */
 
-static __inline__ void atomic_add(int i, atomic_t * v)
-{
-	unsigned long temp;
-	__asm__ __volatile__(
-	"1:	ldl_l %0,%1\n"
-	"	addl %0,%2,%0\n"
-	"	stl_c %0,%1\n"
-	"	beq %0,2f\n"
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	:"=&r" (temp), "=m" (v->counter)
-	:"Ir" (i), "m" (v->counter));
+#define ATOMIC_OP(op)							\
+static __inline__ void atomic_##op(int i, atomic_t * v)			\
+{									\
+	unsigned long temp;						\
+	__asm__ __volatile__(						\
+	"1:	ldl_l %0,%1\n"						\
+	"	" #op "l %0,%2,%0\n"					\
+	"	stl_c %0,%1\n"						\
+	"	beq %0,2f\n"						\
+	".subsection 2\n"						\
+	"2:	br 1b\n"						\
+	".previous"							\
+	:"=&r" (temp), "=m" (v->counter)				\
+	:"Ir" (i), "m" (v->counter));					\
+}									\
+
+#define ATOMIC_OP_RETURN(op)						\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	long temp, result;						\
+	smp_mb();							\
+	__asm__ __volatile__(						\
+	"1:	ldl_l %0,%1\n"						\
+	"	" #op "l %0,%3,%2\n"					\
+	"	" #op "l %0,%3,%0\n"					\
+	"	stl_c %0,%1\n"						\
+	"	beq %0,2f\n"						\
+	".subsection 2\n"						\
+	"2:	br 1b\n"						\
+	".previous"							\
+	:"=&r" (temp), "=m" (v->counter), "=&r" (result)		\
+	:"Ir" (i), "m" (v->counter) : "memory");			\
+	smp_mb();							\
+	return result;							\
 }
 
-static __inline__ void atomic64_add(long i, atomic64_t * v)
-{
-	unsigned long temp;
-	__asm__ __volatile__(
-	"1:	ldq_l %0,%1\n"
-	"	addq %0,%2,%0\n"
-	"	stq_c %0,%1\n"
-	"	beq %0,2f\n"
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	:"=&r" (temp), "=m" (v->counter)
-	:"Ir" (i), "m" (v->counter));
+#define ATOMIC64_OP(op)							\
+static __inline__ void atomic64_##op(long i, atomic64_t * v)		\
+{									\
+	unsigned long temp;						\
+	__asm__ __volatile__(						\
+	"1:	ldq_l %0,%1\n"						\
+	"	" #op "q %0,%2,%0\n"					\
+	"	stq_c %0,%1\n"						\
+	"	beq %0,2f\n"						\
+	".subsection 2\n"						\
+	"2:	br 1b\n"						\
+	".previous"							\
+	:"=&r" (temp), "=m" (v->counter)				\
+	:"Ir" (i), "m" (v->counter));					\
+}									\
+
+#define ATOMIC64_OP_RETURN(op)						\
+static __inline__ long atomic64_##op##_return(long i, atomic64_t * v)	\
+{									\
+	long temp, result;						\
+	smp_mb();							\
+	__asm__ __volatile__(						\
+	"1:	ldq_l %0,%1\n"						\
+	"	" #op "q %0,%3,%2\n"					\
+	"	" #op "q %0,%3,%0\n"					\
+	"	stq_c %0,%1\n"						\
+	"	beq %0,2f\n"						\
+	".subsection 2\n"						\
+	"2:	br 1b\n"						\
+	".previous"							\
+	:"=&r" (temp), "=m" (v->counter), "=&r" (result)		\
+	:"Ir" (i), "m" (v->counter) : "memory");			\
+	smp_mb();							\
+	return result;							\
 }
 
-static __inline__ void atomic_sub(int i, atomic_t * v)
-{
-	unsigned long temp;
-	__asm__ __volatile__(
-	"1:	ldl_l %0,%1\n"
-	"	subl %0,%2,%0\n"
-	"	stl_c %0,%1\n"
-	"	beq %0,2f\n"
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	:"=&r" (temp), "=m" (v->counter)
-	:"Ir" (i), "m" (v->counter));
-}
+#define ATOMIC_OPS(opg)							\
+	ATOMIC_OP(opg)							\
+	ATOMIC_OP_RETURN(opg)						\
+	ATOMIC64_OP(opg)						\
+	ATOMIC64_OP_RETURN(opg)
 
-static __inline__ void atomic64_sub(long i, atomic64_t * v)
-{
-	unsigned long temp;
-	__asm__ __volatile__(
-	"1:	ldq_l %0,%1\n"
-	"	subq %0,%2,%0\n"
-	"	stq_c %0,%1\n"
-	"	beq %0,2f\n"
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	:"=&r" (temp), "=m" (v->counter)
-	:"Ir" (i), "m" (v->counter));
-}
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
 
-
-/*
- * Same as above, but return the result value
- */
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	long temp, result;
-	smp_mb();
-	__asm__ __volatile__(
-	"1:	ldl_l %0,%1\n"
-	"	addl %0,%3,%2\n"
-	"	addl %0,%3,%0\n"
-	"	stl_c %0,%1\n"
-	"	beq %0,2f\n"
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	:"=&r" (temp), "=m" (v->counter), "=&r" (result)
-	:"Ir" (i), "m" (v->counter) : "memory");
-	smp_mb();
-	return result;
-}
-
-static __inline__ long atomic64_add_return(long i, atomic64_t * v)
-{
-	long temp, result;
-	smp_mb();
-	__asm__ __volatile__(
-	"1:	ldq_l %0,%1\n"
-	"	addq %0,%3,%2\n"
-	"	addq %0,%3,%0\n"
-	"	stq_c %0,%1\n"
-	"	beq %0,2f\n"
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	:"=&r" (temp), "=m" (v->counter), "=&r" (result)
-	:"Ir" (i), "m" (v->counter) : "memory");
-	smp_mb();
-	return result;
-}
-
-static __inline__ long atomic_sub_return(int i, atomic_t * v)
-{
-	long temp, result;
-	smp_mb();
-	__asm__ __volatile__(
-	"1:	ldl_l %0,%1\n"
-	"	subl %0,%3,%2\n"
-	"	subl %0,%3,%0\n"
-	"	stl_c %0,%1\n"
-	"	beq %0,2f\n"
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	:"=&r" (temp), "=m" (v->counter), "=&r" (result)
-	:"Ir" (i), "m" (v->counter) : "memory");
-	smp_mb();
-	return result;
-}
-
-static __inline__ long atomic64_sub_return(long i, atomic64_t * v)
-{
-	long temp, result;
-	smp_mb();
-	__asm__ __volatile__(
-	"1:	ldq_l %0,%1\n"
-	"	subq %0,%3,%2\n"
-	"	subq %0,%3,%0\n"
-	"	stq_c %0,%1\n"
-	"	beq %0,2f\n"
-	".subsection 2\n"
-	"2:	br 1b\n"
-	".previous"
-	:"=&r" (temp), "=m" (v->counter), "=&r" (result)
-	:"Ir" (i), "m" (v->counter) : "memory");
-	smp_mb();
-	return result;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
 #define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), old, new))
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 83f03ca..173f303 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -25,79 +25,36 @@
 
 #define atomic_set(v, i) (((v)->counter) = (i))
 
-static inline void atomic_add(int i, atomic_t *v)
-{
-	unsigned int temp;
+#define ATOMIC_OP(op, c_op, asm_op)					\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	unsigned int temp;						\
+									\
+	__asm__ __volatile__(						\
+	"1:	llock   %0, [%1]	\n"				\
+	"	" #asm_op " %0, %0, %2	\n"				\
+	"	scond   %0, [%1]	\n"				\
+	"	bnz     1b		\n"				\
+	: "=&r"(temp)	/* Early clobber, to prevent reg reuse */	\
+	: "r"(&v->counter), "ir"(i)					\
+	: "cc");							\
+}									\
 
-	__asm__ __volatile__(
-	"1:	llock   %0, [%1]	\n"
-	"	add     %0, %0, %2	\n"
-	"	scond   %0, [%1]	\n"
-	"	bnz     1b		\n"
-	: "=&r"(temp)	/* Early clobber, to prevent reg reuse */
-	: "r"(&v->counter), "ir"(i)
-	: "cc");
-}
-
-static inline void atomic_sub(int i, atomic_t *v)
-{
-	unsigned int temp;
-
-	__asm__ __volatile__(
-	"1:	llock   %0, [%1]	\n"
-	"	sub     %0, %0, %2	\n"
-	"	scond   %0, [%1]	\n"
-	"	bnz     1b		\n"
-	: "=&r"(temp)
-	: "r"(&v->counter), "ir"(i)
-	: "cc");
-}
-
-/* add and also return the new value */
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	unsigned int temp;
-
-	__asm__ __volatile__(
-	"1:	llock   %0, [%1]	\n"
-	"	add     %0, %0, %2	\n"
-	"	scond   %0, [%1]	\n"
-	"	bnz     1b		\n"
-	: "=&r"(temp)
-	: "r"(&v->counter), "ir"(i)
-	: "cc");
-
-	return temp;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	unsigned int temp;
-
-	__asm__ __volatile__(
-	"1:	llock   %0, [%1]	\n"
-	"	sub     %0, %0, %2	\n"
-	"	scond   %0, [%1]	\n"
-	"	bnz     1b		\n"
-	: "=&r"(temp)
-	: "r"(&v->counter), "ir"(i)
-	: "cc");
-
-	return temp;
-}
-
-static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
-{
-	unsigned int temp;
-
-	__asm__ __volatile__(
-	"1:	llock   %0, [%1]	\n"
-	"	bic     %0, %0, %2	\n"
-	"	scond   %0, [%1]	\n"
-	"	bnz     1b		\n"
-	: "=&r"(temp)
-	: "r"(addr), "ir"(mask)
-	: "cc");
+#define ATOMIC_OP_RETURN(op, c_op, asm_op)				\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	unsigned int temp;						\
+									\
+	__asm__ __volatile__(						\
+	"1:	llock   %0, [%1]	\n"				\
+	"	" #asm_op " %0, %0, %2	\n"				\
+	"	scond   %0, [%1]	\n"				\
+	"	bnz     1b		\n"				\
+	: "=&r"(temp)							\
+	: "r"(&v->counter), "ir"(i)					\
+	: "cc");							\
+									\
+	return temp;							\
 }
 
 #else	/* !CONFIG_ARC_HAS_LLSC */
@@ -126,6 +83,7 @@
 	v->counter = i;
 	atomic_ops_unlock(flags);
 }
+
 #endif
 
 /*
@@ -133,63 +91,47 @@
  * Locking would change to irq-disabling only (UP) and spinlocks (SMP)
  */
 
-static inline void atomic_add(int i, atomic_t *v)
-{
-	unsigned long flags;
-
-	atomic_ops_lock(flags);
-	v->counter += i;
-	atomic_ops_unlock(flags);
+#define ATOMIC_OP(op, c_op, asm_op)					\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	unsigned long flags;						\
+									\
+	atomic_ops_lock(flags);						\
+	v->counter c_op i;						\
+	atomic_ops_unlock(flags);					\
 }
 
-static inline void atomic_sub(int i, atomic_t *v)
-{
-	unsigned long flags;
-
-	atomic_ops_lock(flags);
-	v->counter -= i;
-	atomic_ops_unlock(flags);
-}
-
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	unsigned long flags;
-	unsigned long temp;
-
-	atomic_ops_lock(flags);
-	temp = v->counter;
-	temp += i;
-	v->counter = temp;
-	atomic_ops_unlock(flags);
-
-	return temp;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	unsigned long flags;
-	unsigned long temp;
-
-	atomic_ops_lock(flags);
-	temp = v->counter;
-	temp -= i;
-	v->counter = temp;
-	atomic_ops_unlock(flags);
-
-	return temp;
-}
-
-static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
-{
-	unsigned long flags;
-
-	atomic_ops_lock(flags);
-	*addr &= ~mask;
-	atomic_ops_unlock(flags);
+#define ATOMIC_OP_RETURN(op, c_op, asm_op)				\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	unsigned long flags;						\
+	unsigned long temp;						\
+									\
+	atomic_ops_lock(flags);						\
+	temp = v->counter;						\
+	temp c_op i;							\
+	v->counter = temp;						\
+	atomic_ops_unlock(flags);					\
+									\
+	return temp;							\
 }
 
 #endif /* !CONFIG_ARC_HAS_LLSC */
 
+#define ATOMIC_OPS(op, c_op, asm_op)					\
+	ATOMIC_OP(op, c_op, asm_op)					\
+	ATOMIC_OP_RETURN(op, c_op, asm_op)
+
+ATOMIC_OPS(add, +=, add)
+ATOMIC_OPS(sub, -=, sub)
+ATOMIC_OP(and, &=, and)
+
+#define atomic_clear_mask(mask, v) atomic_and(~(mask), (v))
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
 /**
  * __atomic_add_unless - add unless the number is a given value
  * @v: pointer of type atomic_t
diff --git a/arch/arc/mm/cache_arc700.c b/arch/arc/mm/cache_arc700.c
index e88ddbf..9e11427 100644
--- a/arch/arc/mm/cache_arc700.c
+++ b/arch/arc/mm/cache_arc700.c
@@ -427,7 +427,7 @@
 
 static void __ic_line_inv_vaddr_helper(void *info)
 {
-        struct ic_inv *ic_inv_args = (struct ic_inv_args *) info;
+        struct ic_inv_args *ic_inv = info;
 
         __ic_line_inv_vaddr_local(ic_inv->paddr, ic_inv->vaddr, ic_inv->sz);
 }
diff --git a/arch/arm/boot/dts/am4372.dtsi b/arch/arm/boot/dts/am4372.dtsi
index 9b3d2ba..8689949 100644
--- a/arch/arm/boot/dts/am4372.dtsi
+++ b/arch/arm/boot/dts/am4372.dtsi
@@ -804,7 +804,7 @@
 
 			usb1: usb@48390000 {
 				compatible = "synopsys,dwc3";
-				reg = <0x48390000 0x17000>;
+				reg = <0x48390000 0x10000>;
 				interrupts = <GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>;
 				phys = <&usb2_phy1>;
 				phy-names = "usb2-phy";
@@ -826,7 +826,7 @@
 
 			usb2: usb@483d0000 {
 				compatible = "synopsys,dwc3";
-				reg = <0x483d0000 0x17000>;
+				reg = <0x483d0000 0x10000>;
 				interrupts = <GIC_SPI 174 IRQ_TYPE_LEVEL_HIGH>;
 				phys = <&usb2_phy2>;
 				phy-names = "usb2-phy";
diff --git a/arch/arm/boot/dts/am437x-gp-evm.dts b/arch/arm/boot/dts/am437x-gp-evm.dts
index 646a6ea..e7ac47f 100644
--- a/arch/arm/boot/dts/am437x-gp-evm.dts
+++ b/arch/arm/boot/dts/am437x-gp-evm.dts
@@ -260,7 +260,7 @@
 	status = "okay";
 	pinctrl-names = "default";
 	pinctrl-0 = <&i2c0_pins>;
-	clock-frequency = <400000>;
+	clock-frequency = <100000>;
 
 	tps65218: tps65218@24 {
 		reg = <0x24>;
@@ -424,7 +424,7 @@
 	ranges = <0 0 0 0x01000000>;	/* minimum GPMC partition = 16MB */
 	nand@0,0 {
 		reg = <0 0 4>;		/* device IO registers */
-		ti,nand-ecc-opt = "bch8";
+		ti,nand-ecc-opt = "bch16";
 		ti,elm-id = <&elm>;
 		nand-bus-width = <8>;
 		gpmc,device-width = <1>;
@@ -443,8 +443,6 @@
 		gpmc,rd-cycle-ns = <40>;
 		gpmc,wr-cycle-ns = <40>;
 		gpmc,wait-pin = <0>;
-		gpmc,wait-on-read;
-		gpmc,wait-on-write;
 		gpmc,bus-turnaround-ns = <0>;
 		gpmc,cycle2cycle-delay-ns = <0>;
 		gpmc,clk-activation-ns = <0>;
diff --git a/arch/arm/boot/dts/am43x-epos-evm.dts b/arch/arm/boot/dts/am43x-epos-evm.dts
index ed7dd23..ac3e485 100644
--- a/arch/arm/boot/dts/am43x-epos-evm.dts
+++ b/arch/arm/boot/dts/am43x-epos-evm.dts
@@ -435,13 +435,13 @@
 };
 
 &gpmc {
-	status = "okay";
+	status = "okay";	/* Disable QSPI when enabling GPMC (NAND) */
 	pinctrl-names = "default";
 	pinctrl-0 = <&nand_flash_x8>;
 	ranges = <0 0 0x08000000 0x10000000>;	/* CS0: NAND */
 	nand@0,0 {
 		reg = <0 0 0>; /* CS0, offset 0 */
-		ti,nand-ecc-opt = "bch8";
+		ti,nand-ecc-opt = "bch16";
 		ti,elm-id = <&elm>;
 		nand-bus-width = <8>;
 		gpmc,device-width = <1>;
@@ -459,8 +459,7 @@
 		gpmc,access-ns = <30>; /* tCEA + 4*/
 		gpmc,rd-cycle-ns = <40>;
 		gpmc,wr-cycle-ns = <40>;
-		gpmc,wait-on-read = "true";
-		gpmc,wait-on-write = "true";
+		gpmc,wait-pin = <0>;
 		gpmc,bus-turnaround-ns = <0>;
 		gpmc,cycle2cycle-delay-ns = <0>;
 		gpmc,clk-activation-ns = <0>;
@@ -557,7 +556,7 @@
 };
 
 &qspi {
-	status = "okay";
+	status = "disabled";	/* Disable GPMC (NAND) when enabling QSPI */
 	pinctrl-names = "default";
 	pinctrl-0 = <&qspi1_default>;
 
diff --git a/arch/arm/boot/dts/at91rm9200.dtsi b/arch/arm/boot/dts/at91rm9200.dtsi
index 65ccf56..6c97d4a 100644
--- a/arch/arm/boot/dts/at91rm9200.dtsi
+++ b/arch/arm/boot/dts/at91rm9200.dtsi
@@ -149,7 +149,7 @@
 				usb: usbck {
 					compatible = "atmel,at91rm9200-clk-usb";
 					#clock-cells = <0>;
-					atmel,clk-divisors = <1 2>;
+					atmel,clk-divisors = <1 2 0 0>;
 					clocks = <&pllb>;
 				};
 
diff --git a/arch/arm/boot/dts/at91sam9g20.dtsi b/arch/arm/boot/dts/at91sam9g20.dtsi
index 31f7652..4e0abbd 100644
--- a/arch/arm/boot/dts/at91sam9g20.dtsi
+++ b/arch/arm/boot/dts/at91sam9g20.dtsi
@@ -40,6 +40,7 @@
 				};
 
 				pllb: pllbck {
+					compatible = "atmel,at91sam9g20-clk-pllb";
 					atmel,clk-input-range = <2000000 32000000>;
 					atmel,pll-clk-output-ranges = <30000000 100000000 0 0>;
 				};
diff --git a/arch/arm/boot/dts/dra7-evm.dts b/arch/arm/boot/dts/dra7-evm.dts
index 50f8022..e03fbf3 100644
--- a/arch/arm/boot/dts/dra7-evm.dts
+++ b/arch/arm/boot/dts/dra7-evm.dts
@@ -8,6 +8,7 @@
 /dts-v1/;
 
 #include "dra74x.dtsi"
+#include <dt-bindings/gpio/gpio.h>
 
 / {
 	model = "TI DRA742";
@@ -24,9 +25,29 @@
 		regulator-min-microvolt = <3300000>;
 		regulator-max-microvolt = <3300000>;
 	};
+
+	vtt_fixed: fixedregulator-vtt {
+		compatible = "regulator-fixed";
+		regulator-name = "vtt_fixed";
+		regulator-min-microvolt = <1350000>;
+		regulator-max-microvolt = <1350000>;
+		regulator-always-on;
+		regulator-boot-on;
+		enable-active-high;
+		gpio = <&gpio7 11 GPIO_ACTIVE_HIGH>;
+	};
 };
 
 &dra7_pmx_core {
+	pinctrl-names = "default";
+	pinctrl-0 = <&vtt_pin>;
+
+	vtt_pin: pinmux_vtt_pin {
+		pinctrl-single,pins = <
+			0x3b4 (PIN_OUTPUT | MUX_MODE14) /* spi1_cs1.gpio7_11 */
+		>;
+	};
+
 	i2c1_pins: pinmux_i2c1_pins {
 		pinctrl-single,pins = <
 			0x400 (PIN_INPUT | MUX_MODE0) /* i2c1_sda */
@@ -43,20 +64,19 @@
 
 	i2c3_pins: pinmux_i2c3_pins {
 		pinctrl-single,pins = <
-			0x410 (PIN_INPUT | MUX_MODE0) /* i2c3_sda */
-			0x414 (PIN_INPUT | MUX_MODE0) /* i2c3_scl */
+			0x288 (PIN_INPUT | MUX_MODE9) /* gpio6_14.i2c3_sda */
+			0x28c (PIN_INPUT | MUX_MODE9) /* gpio6_15.i2c3_scl */
 		>;
 	};
 
 	mcspi1_pins: pinmux_mcspi1_pins {
 		pinctrl-single,pins = <
-			0x3a4 (PIN_INPUT | MUX_MODE0) /* spi2_clk */
-			0x3a8 (PIN_INPUT | MUX_MODE0) /* spi2_d1 */
-			0x3ac (PIN_INPUT | MUX_MODE0) /* spi2_d0 */
-			0x3b0 (PIN_INPUT_SLEW | MUX_MODE0) /* spi2_cs0 */
-			0x3b4 (PIN_INPUT_SLEW | MUX_MODE0) /* spi2_cs1 */
-			0x3b8 (PIN_INPUT_SLEW | MUX_MODE6) /* spi2_cs2 */
-			0x3bc (PIN_INPUT_SLEW | MUX_MODE6) /* spi2_cs3 */
+			0x3a4 (PIN_INPUT | MUX_MODE0) /* spi1_sclk */
+			0x3a8 (PIN_INPUT | MUX_MODE0) /* spi1_d1 */
+			0x3ac (PIN_INPUT | MUX_MODE0) /* spi1_d0 */
+			0x3b0 (PIN_INPUT_SLEW | MUX_MODE0) /* spi1_cs0 */
+			0x3b8 (PIN_INPUT_SLEW | MUX_MODE6) /* spi1_cs2.hdmi1_hpd */
+			0x3bc (PIN_INPUT_SLEW | MUX_MODE6) /* spi1_cs3.hdmi1_cec */
 		>;
 	};
 
@@ -284,7 +304,7 @@
 	status = "okay";
 	pinctrl-names = "default";
 	pinctrl-0 = <&i2c3_pins>;
-	clock-frequency = <3400000>;
+	clock-frequency = <400000>;
 };
 
 &mcspi1 {
@@ -483,7 +503,7 @@
 			reg = <0x001c0000 0x00020000>;
 		};
 		partition@7 {
-			label = "NAND.u-boot-env";
+			label = "NAND.u-boot-env.backup1";
 			reg = <0x001e0000 0x00020000>;
 		};
 		partition@8 {
@@ -504,3 +524,8 @@
 &usb2_phy2 {
 	phy-supply = <&ldousb_reg>;
 };
+
+&gpio7 {
+	ti,no-reset-on-init;
+	ti,no-idle-on-init;
+};
diff --git a/arch/arm/boot/dts/omap3xxx-clocks.dtsi b/arch/arm/boot/dts/omap3xxx-clocks.dtsi
index e47ff69..5c37500 100644
--- a/arch/arm/boot/dts/omap3xxx-clocks.dtsi
+++ b/arch/arm/boot/dts/omap3xxx-clocks.dtsi
@@ -467,6 +467,7 @@
 		ti,bit-shift = <0x1e>;
 		reg = <0x0d00>;
 		ti,set-bit-to-disable;
+		ti,set-rate-parent;
 	};
 
 	dpll4_m6_ck: dpll4_m6_ck {
diff --git a/arch/arm/boot/dts/ste-snowball.dts b/arch/arm/boot/dts/ste-snowball.dts
index 4a2000c..3e97a66 100644
--- a/arch/arm/boot/dts/ste-snowball.dts
+++ b/arch/arm/boot/dts/ste-snowball.dts
@@ -116,7 +116,6 @@
 		msp2: msp@80117000 {
 			pinctrl-names = "default";
 			pinctrl-0 = <&msp2_default_mode>;
-			status = "okay";
 		};
 
 		msp3: msp@80125000 {
diff --git a/arch/arm/common/edma.c b/arch/arm/common/edma.c
index 8809917..d86771a 100644
--- a/arch/arm/common/edma.c
+++ b/arch/arm/common/edma.c
@@ -1443,14 +1443,14 @@
 EXPORT_SYMBOL(edma_assign_channel_eventq);
 
 static int edma_setup_from_hw(struct device *dev, struct edma_soc_info *pdata,
-			      struct edma *edma_cc)
+			      struct edma *edma_cc, int cc_id)
 {
 	int i;
 	u32 value, cccfg;
 	s8 (*queue_priority_map)[2];
 
 	/* Decode the eDMA3 configuration from CCCFG register */
-	cccfg = edma_read(0, EDMA_CCCFG);
+	cccfg = edma_read(cc_id, EDMA_CCCFG);
 
 	value = GET_NUM_REGN(cccfg);
 	edma_cc->num_region = BIT(value);
@@ -1464,7 +1464,8 @@
 	value = GET_NUM_EVQUE(cccfg);
 	edma_cc->num_tc = value + 1;
 
-	dev_dbg(dev, "eDMA3 HW configuration (cccfg: 0x%08x):\n", cccfg);
+	dev_dbg(dev, "eDMA3 CC%d HW configuration (cccfg: 0x%08x):\n", cc_id,
+		cccfg);
 	dev_dbg(dev, "num_region: %u\n", edma_cc->num_region);
 	dev_dbg(dev, "num_channel: %u\n", edma_cc->num_channels);
 	dev_dbg(dev, "num_slot: %u\n", edma_cc->num_slots);
@@ -1684,7 +1685,7 @@
 			return -ENOMEM;
 
 		/* Get eDMA3 configuration from IP */
-		ret = edma_setup_from_hw(dev, info[j], edma_cc[j]);
+		ret = edma_setup_from_hw(dev, info[j], edma_cc[j], j);
 		if (ret)
 			return ret;
 
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 3040359..832f1cd 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -37,84 +37,47 @@
  * store exclusive to ensure that these are atomic.  We may loop
  * to ensure that the update happens.
  */
-static inline void atomic_add(int i, atomic_t *v)
-{
-	unsigned long tmp;
-	int result;
 
-	prefetchw(&v->counter);
-	__asm__ __volatile__("@ atomic_add\n"
-"1:	ldrex	%0, [%3]\n"
-"	add	%0, %0, %4\n"
-"	strex	%1, %0, [%3]\n"
-"	teq	%1, #0\n"
-"	bne	1b"
-	: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
-	: "cc");
-}
+#define ATOMIC_OP(op, c_op, asm_op)					\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	unsigned long tmp;						\
+	int result;							\
+									\
+	prefetchw(&v->counter);						\
+	__asm__ __volatile__("@ atomic_" #op "\n"			\
+"1:	ldrex	%0, [%3]\n"						\
+"	" #asm_op "	%0, %0, %4\n"					\
+"	strex	%1, %0, [%3]\n"						\
+"	teq	%1, #0\n"						\
+"	bne	1b"							\
+	: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)		\
+	: "r" (&v->counter), "Ir" (i)					\
+	: "cc");							\
+}									\
 
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	unsigned long tmp;
-	int result;
-
-	smp_mb();
-	prefetchw(&v->counter);
-
-	__asm__ __volatile__("@ atomic_add_return\n"
-"1:	ldrex	%0, [%3]\n"
-"	add	%0, %0, %4\n"
-"	strex	%1, %0, [%3]\n"
-"	teq	%1, #0\n"
-"	bne	1b"
-	: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
-	: "cc");
-
-	smp_mb();
-
-	return result;
-}
-
-static inline void atomic_sub(int i, atomic_t *v)
-{
-	unsigned long tmp;
-	int result;
-
-	prefetchw(&v->counter);
-	__asm__ __volatile__("@ atomic_sub\n"
-"1:	ldrex	%0, [%3]\n"
-"	sub	%0, %0, %4\n"
-"	strex	%1, %0, [%3]\n"
-"	teq	%1, #0\n"
-"	bne	1b"
-	: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
-	: "cc");
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	unsigned long tmp;
-	int result;
-
-	smp_mb();
-	prefetchw(&v->counter);
-
-	__asm__ __volatile__("@ atomic_sub_return\n"
-"1:	ldrex	%0, [%3]\n"
-"	sub	%0, %0, %4\n"
-"	strex	%1, %0, [%3]\n"
-"	teq	%1, #0\n"
-"	bne	1b"
-	: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
-	: "r" (&v->counter), "Ir" (i)
-	: "cc");
-
-	smp_mb();
-
-	return result;
+#define ATOMIC_OP_RETURN(op, c_op, asm_op)				\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	unsigned long tmp;						\
+	int result;							\
+									\
+	smp_mb();							\
+	prefetchw(&v->counter);						\
+									\
+	__asm__ __volatile__("@ atomic_" #op "_return\n"		\
+"1:	ldrex	%0, [%3]\n"						\
+"	" #asm_op "	%0, %0, %4\n"					\
+"	strex	%1, %0, [%3]\n"						\
+"	teq	%1, #0\n"						\
+"	bne	1b"							\
+	: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)		\
+	: "r" (&v->counter), "Ir" (i)					\
+	: "cc");							\
+									\
+	smp_mb();							\
+									\
+	return result;							\
 }
 
 static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
@@ -174,33 +137,29 @@
 #error SMP not supported on pre-ARMv6 CPUs
 #endif
 
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	unsigned long flags;
-	int val;
+#define ATOMIC_OP(op, c_op, asm_op)					\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	unsigned long flags;						\
+									\
+	raw_local_irq_save(flags);					\
+	v->counter c_op i;						\
+	raw_local_irq_restore(flags);					\
+}									\
 
-	raw_local_irq_save(flags);
-	val = v->counter;
-	v->counter = val += i;
-	raw_local_irq_restore(flags);
-
-	return val;
+#define ATOMIC_OP_RETURN(op, c_op, asm_op)				\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	unsigned long flags;						\
+	int val;							\
+									\
+	raw_local_irq_save(flags);					\
+	v->counter c_op i;						\
+	val = v->counter;						\
+	raw_local_irq_restore(flags);					\
+									\
+	return val;							\
 }
-#define atomic_add(i, v)	(void) atomic_add_return(i, v)
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	unsigned long flags;
-	int val;
-
-	raw_local_irq_save(flags);
-	val = v->counter;
-	v->counter = val -= i;
-	raw_local_irq_restore(flags);
-
-	return val;
-}
-#define atomic_sub(i, v)	(void) atomic_sub_return(i, v)
 
 static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
@@ -228,6 +187,17 @@
 
 #endif /* __LINUX_ARM_ARCH__ */
 
+#define ATOMIC_OPS(op, c_op, asm_op)					\
+	ATOMIC_OP(op, c_op, asm_op)					\
+	ATOMIC_OP_RETURN(op, c_op, asm_op)
+
+ATOMIC_OPS(add, +=, add)
+ATOMIC_OPS(sub, -=, sub)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 #define atomic_inc(v)		atomic_add(1, v)
@@ -300,89 +270,60 @@
 }
 #endif
 
-static inline void atomic64_add(long long i, atomic64_t *v)
-{
-	long long result;
-	unsigned long tmp;
+#define ATOMIC64_OP(op, op1, op2)					\
+static inline void atomic64_##op(long long i, atomic64_t *v)		\
+{									\
+	long long result;						\
+	unsigned long tmp;						\
+									\
+	prefetchw(&v->counter);						\
+	__asm__ __volatile__("@ atomic64_" #op "\n"			\
+"1:	ldrexd	%0, %H0, [%3]\n"					\
+"	" #op1 " %Q0, %Q0, %Q4\n"					\
+"	" #op2 " %R0, %R0, %R4\n"					\
+"	strexd	%1, %0, %H0, [%3]\n"					\
+"	teq	%1, #0\n"						\
+"	bne	1b"							\
+	: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)		\
+	: "r" (&v->counter), "r" (i)					\
+	: "cc");							\
+}									\
 
-	prefetchw(&v->counter);
-	__asm__ __volatile__("@ atomic64_add\n"
-"1:	ldrexd	%0, %H0, [%3]\n"
-"	adds	%Q0, %Q0, %Q4\n"
-"	adc	%R0, %R0, %R4\n"
-"	strexd	%1, %0, %H0, [%3]\n"
-"	teq	%1, #0\n"
-"	bne	1b"
-	: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
-	: "r" (&v->counter), "r" (i)
-	: "cc");
+#define ATOMIC64_OP_RETURN(op, op1, op2)				\
+static inline long long atomic64_##op##_return(long long i, atomic64_t *v) \
+{									\
+	long long result;						\
+	unsigned long tmp;						\
+									\
+	smp_mb();							\
+	prefetchw(&v->counter);						\
+									\
+	__asm__ __volatile__("@ atomic64_" #op "_return\n"		\
+"1:	ldrexd	%0, %H0, [%3]\n"					\
+"	" #op1 " %Q0, %Q0, %Q4\n"					\
+"	" #op2 " %R0, %R0, %R4\n"					\
+"	strexd	%1, %0, %H0, [%3]\n"					\
+"	teq	%1, #0\n"						\
+"	bne	1b"							\
+	: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)		\
+	: "r" (&v->counter), "r" (i)					\
+	: "cc");							\
+									\
+	smp_mb();							\
+									\
+	return result;							\
 }
 
-static inline long long atomic64_add_return(long long i, atomic64_t *v)
-{
-	long long result;
-	unsigned long tmp;
+#define ATOMIC64_OPS(op, op1, op2)					\
+	ATOMIC64_OP(op, op1, op2)					\
+	ATOMIC64_OP_RETURN(op, op1, op2)
 
-	smp_mb();
-	prefetchw(&v->counter);
+ATOMIC64_OPS(add, adds, adc)
+ATOMIC64_OPS(sub, subs, sbc)
 
-	__asm__ __volatile__("@ atomic64_add_return\n"
-"1:	ldrexd	%0, %H0, [%3]\n"
-"	adds	%Q0, %Q0, %Q4\n"
-"	adc	%R0, %R0, %R4\n"
-"	strexd	%1, %0, %H0, [%3]\n"
-"	teq	%1, #0\n"
-"	bne	1b"
-	: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
-	: "r" (&v->counter), "r" (i)
-	: "cc");
-
-	smp_mb();
-
-	return result;
-}
-
-static inline void atomic64_sub(long long i, atomic64_t *v)
-{
-	long long result;
-	unsigned long tmp;
-
-	prefetchw(&v->counter);
-	__asm__ __volatile__("@ atomic64_sub\n"
-"1:	ldrexd	%0, %H0, [%3]\n"
-"	subs	%Q0, %Q0, %Q4\n"
-"	sbc	%R0, %R0, %R4\n"
-"	strexd	%1, %0, %H0, [%3]\n"
-"	teq	%1, #0\n"
-"	bne	1b"
-	: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
-	: "r" (&v->counter), "r" (i)
-	: "cc");
-}
-
-static inline long long atomic64_sub_return(long long i, atomic64_t *v)
-{
-	long long result;
-	unsigned long tmp;
-
-	smp_mb();
-	prefetchw(&v->counter);
-
-	__asm__ __volatile__("@ atomic64_sub_return\n"
-"1:	ldrexd	%0, %H0, [%3]\n"
-"	subs	%Q0, %Q0, %Q4\n"
-"	sbc	%R0, %R0, %R4\n"
-"	strexd	%1, %0, %H0, [%3]\n"
-"	teq	%1, #0\n"
-"	bne	1b"
-	: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
-	: "r" (&v->counter), "r" (i)
-	: "cc");
-
-	smp_mb();
-
-	return result;
-}
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
 
 static inline long long atomic64_cmpxchg(atomic64_t *ptr, long long old,
 					long long new)
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 0c27ed6..5e772a2 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -933,8 +933,13 @@
 	current_thread_info()->syscall = scno;
 
 	/* Do the secure computing check first; failures should be fast. */
-	if (secure_computing(scno) == -1)
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+	if (secure_computing() == -1)
 		return -1;
+#else
+	/* XXX: remove this once OABI gets fixed */
+	secure_computing_strict(scno);
+#endif
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
 		tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index 4c979d4..a96a804 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -93,6 +93,8 @@
 	else
 		kvm_vcpu_block(vcpu);
 
+	kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+
 	return 1;
 }
 
diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S
index 991415d..3988e72 100644
--- a/arch/arm/kvm/init.S
+++ b/arch/arm/kvm/init.S
@@ -99,6 +99,10 @@
 	mrc	p15, 0, r0, c10, c2, 1
 	mcr	p15, 4, r0, c10, c2, 1
 
+	@ Invalidate the stale TLBs from Bootloader
+	mcr	p15, 4, r0, c8, c7, 0	@ TLBIALLH
+	dsb	ish
+
 	@ Set the HSCTLR to:
 	@  - ARM/THUMB exceptions: Kernel config (Thumb-2 kernel)
 	@  - Endianness: Kernel config
diff --git a/arch/arm/mach-at91/board-dt-rm9200.c b/arch/arm/mach-at91/board-dt-rm9200.c
index 3a185fa..f4b6e91 100644
--- a/arch/arm/mach-at91/board-dt-rm9200.c
+++ b/arch/arm/mach-at91/board-dt-rm9200.c
@@ -14,6 +14,7 @@
 #include <linux/gpio.h>
 #include <linux/of.h>
 #include <linux/of_irq.h>
+#include <linux/clk-provider.h>
 
 #include <asm/setup.h>
 #include <asm/irq.h>
@@ -35,13 +36,21 @@
 	of_irq_init(irq_of_match);
 }
 
+static void __init at91rm9200_dt_timer_init(void)
+{
+#if defined(CONFIG_COMMON_CLK)
+	of_clk_init(NULL);
+#endif
+	at91rm9200_timer_init();
+}
+
 static const char *at91rm9200_dt_board_compat[] __initdata = {
 	"atmel,at91rm9200",
 	NULL
 };
 
 DT_MACHINE_START(at91rm9200_dt, "Atmel AT91RM9200 (Device Tree)")
-	.init_time      = at91rm9200_timer_init,
+	.init_time      = at91rm9200_dt_timer_init,
 	.map_io		= at91_map_io,
 	.handle_irq	= at91_aic_handle_irq,
 	.init_early	= at91rm9200_dt_initialize,
diff --git a/arch/arm/mach-omap2/gpmc.c b/arch/arm/mach-omap2/gpmc.c
index 9f42d54..2f97228 100644
--- a/arch/arm/mach-omap2/gpmc.c
+++ b/arch/arm/mach-omap2/gpmc.c
@@ -1207,8 +1207,7 @@
 		}
 	}
 
-	if ((p->wait_on_read || p->wait_on_write) &&
-	    (p->wait_pin > gpmc_nr_waitpins)) {
+	if (p->wait_pin > gpmc_nr_waitpins) {
 		pr_err("%s: invalid wait-pin (%d)\n", __func__, p->wait_pin);
 		return -EINVAL;
 	}
@@ -1288,8 +1287,8 @@
 		p->wait_on_write = of_property_read_bool(np,
 							 "gpmc,wait-on-write");
 		if (!p->wait_on_read && !p->wait_on_write)
-			pr_warn("%s: read/write wait monitoring not enabled!\n",
-				__func__);
+			pr_debug("%s: rd/wr wait monitoring not enabled!\n",
+				 __func__);
 	}
 }
 
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index c294e67..ae67e88 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -150,7 +150,6 @@
 	kernel_neon_begin_partial(28);
 	sha2_ce_transform(blocks, data, sctx->state, NULL, len);
 	kernel_neon_end();
-	data += blocks * SHA256_BLOCK_SIZE;
 }
 
 static int sha224_finup(struct shash_desc *desc, const u8 *data,
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index 65f1569..b83c325 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -43,69 +43,51 @@
  * store exclusive to ensure that these are atomic.  We may loop
  * to ensure that the update happens.
  */
-static inline void atomic_add(int i, atomic_t *v)
-{
-	unsigned long tmp;
-	int result;
 
-	asm volatile("// atomic_add\n"
-"1:	ldxr	%w0, %2\n"
-"	add	%w0, %w0, %w3\n"
-"	stxr	%w1, %w0, %2\n"
-"	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
-	: "Ir" (i));
+#define ATOMIC_OP(op, asm_op)						\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	unsigned long tmp;						\
+	int result;							\
+									\
+	asm volatile("// atomic_" #op "\n"				\
+"1:	ldxr	%w0, %2\n"						\
+"	" #asm_op "	%w0, %w0, %w3\n"				\
+"	stxr	%w1, %w0, %2\n"						\
+"	cbnz	%w1, 1b"						\
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
+	: "Ir" (i));							\
+}									\
+
+#define ATOMIC_OP_RETURN(op, asm_op)					\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	unsigned long tmp;						\
+	int result;							\
+									\
+	asm volatile("// atomic_" #op "_return\n"			\
+"1:	ldxr	%w0, %2\n"						\
+"	" #asm_op "	%w0, %w0, %w3\n"				\
+"	stlxr	%w1, %w0, %2\n"						\
+"	cbnz	%w1, 1b"						\
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
+	: "Ir" (i)							\
+	: "memory");							\
+									\
+	smp_mb();							\
+	return result;							\
 }
 
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	unsigned long tmp;
-	int result;
+#define ATOMIC_OPS(op, asm_op)						\
+	ATOMIC_OP(op, asm_op)						\
+	ATOMIC_OP_RETURN(op, asm_op)
 
-	asm volatile("// atomic_add_return\n"
-"1:	ldxr	%w0, %2\n"
-"	add	%w0, %w0, %w3\n"
-"	stlxr	%w1, %w0, %2\n"
-"	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
-	: "Ir" (i)
-	: "memory");
+ATOMIC_OPS(add, add)
+ATOMIC_OPS(sub, sub)
 
-	smp_mb();
-	return result;
-}
-
-static inline void atomic_sub(int i, atomic_t *v)
-{
-	unsigned long tmp;
-	int result;
-
-	asm volatile("// atomic_sub\n"
-"1:	ldxr	%w0, %2\n"
-"	sub	%w0, %w0, %w3\n"
-"	stxr	%w1, %w0, %2\n"
-"	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
-	: "Ir" (i));
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	unsigned long tmp;
-	int result;
-
-	asm volatile("// atomic_sub_return\n"
-"1:	ldxr	%w0, %2\n"
-"	sub	%w0, %w0, %w3\n"
-"	stlxr	%w1, %w0, %2\n"
-"	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
-	: "Ir" (i)
-	: "memory");
-
-	smp_mb();
-	return result;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
 static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
 {
@@ -160,69 +142,50 @@
 #define atomic64_read(v)	(*(volatile long *)&(v)->counter)
 #define atomic64_set(v,i)	(((v)->counter) = (i))
 
-static inline void atomic64_add(u64 i, atomic64_t *v)
-{
-	long result;
-	unsigned long tmp;
+#define ATOMIC64_OP(op, asm_op)						\
+static inline void atomic64_##op(long i, atomic64_t *v)			\
+{									\
+	long result;							\
+	unsigned long tmp;						\
+									\
+	asm volatile("// atomic64_" #op "\n"				\
+"1:	ldxr	%0, %2\n"						\
+"	" #asm_op "	%0, %0, %3\n"					\
+"	stxr	%w1, %0, %2\n"						\
+"	cbnz	%w1, 1b"						\
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
+	: "Ir" (i));							\
+}									\
 
-	asm volatile("// atomic64_add\n"
-"1:	ldxr	%0, %2\n"
-"	add	%0, %0, %3\n"
-"	stxr	%w1, %0, %2\n"
-"	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
-	: "Ir" (i));
+#define ATOMIC64_OP_RETURN(op, asm_op)					\
+static inline long atomic64_##op##_return(long i, atomic64_t *v)	\
+{									\
+	long result;							\
+	unsigned long tmp;						\
+									\
+	asm volatile("// atomic64_" #op "_return\n"			\
+"1:	ldxr	%0, %2\n"						\
+"	" #asm_op "	%0, %0, %3\n"					\
+"	stlxr	%w1, %0, %2\n"						\
+"	cbnz	%w1, 1b"						\
+	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
+	: "Ir" (i)							\
+	: "memory");							\
+									\
+	smp_mb();							\
+	return result;							\
 }
 
-static inline long atomic64_add_return(long i, atomic64_t *v)
-{
-	long result;
-	unsigned long tmp;
+#define ATOMIC64_OPS(op, asm_op)					\
+	ATOMIC64_OP(op, asm_op)						\
+	ATOMIC64_OP_RETURN(op, asm_op)
 
-	asm volatile("// atomic64_add_return\n"
-"1:	ldxr	%0, %2\n"
-"	add	%0, %0, %3\n"
-"	stlxr	%w1, %0, %2\n"
-"	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
-	: "Ir" (i)
-	: "memory");
+ATOMIC64_OPS(add, add)
+ATOMIC64_OPS(sub, sub)
 
-	smp_mb();
-	return result;
-}
-
-static inline void atomic64_sub(u64 i, atomic64_t *v)
-{
-	long result;
-	unsigned long tmp;
-
-	asm volatile("// atomic64_sub\n"
-"1:	ldxr	%0, %2\n"
-"	sub	%0, %0, %3\n"
-"	stxr	%w1, %0, %2\n"
-"	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
-	: "Ir" (i));
-}
-
-static inline long atomic64_sub_return(long i, atomic64_t *v)
-{
-	long result;
-	unsigned long tmp;
-
-	asm volatile("// atomic64_sub_return\n"
-"1:	ldxr	%0, %2\n"
-"	sub	%0, %0, %3\n"
-"	stlxr	%w1, %0, %2\n"
-"	cbnz	%w1, 1b"
-	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
-	: "Ir" (i)
-	: "memory");
-
-	smp_mb();
-	return result;
-}
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
 
 static inline long atomic64_cmpxchg(atomic64_t *ptr, long old, long new)
 {
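
The easiest way to read the new table is to expand it: ATOMIC_OPS(add, add) emits
both atomic_add() and atomic_add_return(). Below is a stand-alone sketch of the
same generator, with GCC/Clang __atomic builtins standing in for the ldxr/stxr
and stlxr + smp_mb() sequences (illustrative stand-ins only, not the kernel header):

#include <stdio.h>

typedef struct { int counter; } atomic_t;

#define ATOMIC_OP(op)							\
static inline void atomic_##op(int i, atomic_t *v)			\
{									\
	/* relaxed, like the ldxr/stxr loop with no barrier */		\
	__atomic_fetch_##op(&v->counter, i, __ATOMIC_RELAXED);		\
}

#define ATOMIC_OP_RETURN(op)						\
static inline int atomic_##op##_return(int i, atomic_t *v)		\
{									\
	/* fully ordered, like stlxr followed by smp_mb() */		\
	return __atomic_##op##_fetch(&v->counter, i, __ATOMIC_SEQ_CST);	\
}

#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)

ATOMIC_OPS(add)
ATOMIC_OPS(sub)

int main(void)
{
	atomic_t v = { 40 };

	atomic_add(1, &v);
	printf("%d\n", atomic_add_return(1, &v));	/* 42 */
	return 0;
}

The design point the macros preserve: only the _return variants pay for full
ordering (stlxr plus smp_mb() and a memory clobber above); the void forms stay
relaxed.
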
diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h
index d064047..52b484b 100644
--- a/arch/arm64/include/asm/hw_breakpoint.h
+++ b/arch/arm64/include/asm/hw_breakpoint.h
@@ -79,7 +79,6 @@
  */
 #define ARM_MAX_BRP		16
 #define ARM_MAX_WRP		16
-#define ARM_MAX_HBP_SLOTS	(ARM_MAX_BRP + ARM_MAX_WRP)
 
 /* Virtual debug register bases. */
 #define AARCH64_DBG_REG_BVR	0
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 3df21fe..286b1be 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -139,7 +139,7 @@
 	((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1)
 
 #define KSTK_EIP(tsk)	((unsigned long)task_pt_regs(tsk)->pc)
-#define KSTK_ESP(tsk)	((unsigned long)task_pt_regs(tsk)->sp)
+#define KSTK_ESP(tsk)	user_stack_pointer(task_pt_regs(tsk))
 
 /*
  * Prefetching support
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 501000f..41ed9e1 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -137,7 +137,7 @@
 	(!((regs)->pstate & PSR_F_BIT))
 
 #define user_stack_pointer(regs) \
-	(!compat_user_mode(regs)) ? ((regs)->sp) : ((regs)->compat_sp)
+	(!compat_user_mode(regs) ? (regs)->sp : (regs)->compat_sp)
 
 static inline unsigned long regs_return_value(struct pt_regs *regs)
 {
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index ad8aebb..3dca156 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -270,6 +270,7 @@
 	case CPU_PM_ENTER:
 		if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))
 			fpsimd_save_state(&current->thread.fpsimd_state);
+		this_cpu_write(fpsimd_last_state, NULL);
 		break;
 	case CPU_PM_EXIT:
 		if (current->mm)
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index bed0283..8730690 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -373,10 +373,6 @@
 	.long	0
 	.popsection
 
-	.align	3
-2:	.quad	.
-	.quad	PAGE_OFFSET
-
 #ifdef CONFIG_SMP
 	.align	3
 1:	.quad	.
diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c
index 422ebd6..6762ad7 100644
--- a/arch/arm64/kernel/perf_regs.c
+++ b/arch/arm64/kernel/perf_regs.c
@@ -24,6 +24,12 @@
 			return regs->compat_lr;
 	}
 
+	if ((u32)idx == PERF_REG_ARM64_SP)
+		return regs->sp;
+
+	if ((u32)idx == PERF_REG_ARM64_PC)
+		return regs->pc;
+
 	return regs->regs[idx];
 }
 
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 70526cf..fe63ac5 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -87,7 +87,8 @@
 			break;
 		}
 	}
-	for (i = ARM_MAX_BRP; i < ARM_MAX_HBP_SLOTS && !bp; ++i) {
+
+	for (i = 0; i < ARM_MAX_WRP; ++i) {
 		if (current->thread.debug.hbp_watch[i] == bp) {
 			info.si_errno = -((i << 1) + 1);
 			break;
@@ -662,8 +663,10 @@
 			kbuf += sizeof(reg);
 		} else {
 			ret = copy_to_user(ubuf, &reg, sizeof(reg));
-			if (ret)
+			if (ret) {
+				ret = -EFAULT;
 				break;
+			}
 
 			ubuf += sizeof(reg);
 		}
@@ -701,8 +704,10 @@
 			kbuf += sizeof(reg);
 		} else {
 			ret = copy_from_user(&reg, ubuf, sizeof(reg));
-			if (ret)
-				return ret;
+			if (ret) {
+				ret = -EFAULT;
+				break;
+			}
 
 			ubuf += sizeof(reg);
 		}
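
The -EFAULT conversions above exist because copy_to_user()/copy_from_user()
return the number of bytes left uncopied, not an errno, so a nonzero result
must be translated before it reaches the caller. A toy user-space model of
that convention (copy_from_user() here is a hypothetical stand-in):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Toy stand-in: "fails" by leaving all n bytes uncopied when src is NULL. */
static unsigned long copy_from_user(void *dst, const void *src, unsigned long n)
{
	if (!src)
		return n;		/* bytes NOT copied */
	memcpy(dst, src, n);
	return 0;			/* success */
}

static long read_reg(const void *ubuf)
{
	unsigned long reg;
	long ret = 0;

	if (copy_from_user(&reg, ubuf, sizeof(reg)))
		ret = -EFAULT;		/* translate "bytes left" into an errno */
	return ret;
}

int main(void)
{
	printf("%ld\n", read_reg("somedata"));	/* 0 */
	printf("%ld\n", read_reg(NULL));	/* -EFAULT */
	return 0;
}

Returning the raw result directly, as the old code did, would hand the caller
a positive byte count instead of an error.
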
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index f6f0ccf..edb146d 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -78,6 +78,7 @@
 #endif
 
 static const char *cpu_name;
+static const char *machine_name;
 phys_addr_t __fdt_pointer __initdata;
 
 /*
@@ -309,6 +310,8 @@
 		while (true)
 			cpu_relax();
 	}
+
+	machine_name = of_flat_dt_get_machine_name();
 }
 
 /*
@@ -447,21 +450,10 @@
 {
 	int i;
 
-	/*
-	 * Dump out the common processor features in a single line. Userspace
-	 * should read the hwcaps with getauxval(AT_HWCAP) rather than
-	 * attempting to parse this.
-	 */
-	seq_puts(m, "features\t:");
-	for (i = 0; hwcap_str[i]; i++)
-		if (elf_hwcap & (1 << i))
-			seq_printf(m, " %s", hwcap_str[i]);
-	seq_puts(m, "\n\n");
+	seq_printf(m, "Processor\t: %s rev %d (%s)\n",
+		   cpu_name, read_cpuid_id() & 15, ELF_PLATFORM);
 
 	for_each_online_cpu(i) {
-		struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i);
-		u32 midr = cpuinfo->reg_midr;
-
 		/*
 		 * glibc reads /proc/cpuinfo to determine the number of
 		 * online processors, looking for lines beginning with
@@ -470,13 +462,25 @@
 #ifdef CONFIG_SMP
 		seq_printf(m, "processor\t: %d\n", i);
 #endif
-		seq_printf(m, "implementer\t: 0x%02x\n",
-			   MIDR_IMPLEMENTOR(midr));
-		seq_printf(m, "variant\t\t: 0x%x\n", MIDR_VARIANT(midr));
-		seq_printf(m, "partnum\t\t: 0x%03x\n", MIDR_PARTNUM(midr));
-		seq_printf(m, "revision\t: 0x%x\n\n", MIDR_REVISION(midr));
 	}
 
+	/* dump out the processor features */
+	seq_puts(m, "Features\t: ");
+
+	for (i = 0; hwcap_str[i]; i++)
+		if (elf_hwcap & (1 << i))
+			seq_printf(m, "%s ", hwcap_str[i]);
+
+	seq_printf(m, "\nCPU implementer\t: 0x%02x\n", read_cpuid_id() >> 24);
+	seq_printf(m, "CPU architecture: AArch64\n");
+	seq_printf(m, "CPU variant\t: 0x%x\n", (read_cpuid_id() >> 20) & 15);
+	seq_printf(m, "CPU part\t: 0x%03x\n", (read_cpuid_id() >> 4) & 0xfff);
+	seq_printf(m, "CPU revision\t: %d\n", read_cpuid_id() & 15);
+
+	seq_puts(m, "\n");
+
+	seq_printf(m, "Hardware\t: %s\n", machine_name);
+
 	return 0;
 }
 
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index e28be51..34b8bd0 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -66,6 +66,8 @@
 	else
 		kvm_vcpu_block(vcpu);
 
+	kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+
 	return 1;
 }
 
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index d968796..c319116 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -80,6 +80,10 @@
 	msr	mair_el2, x4
 	isb
 
+	/* Invalidate stale TLB entries inherited from the bootloader */
+	tlbi	alle2
+	dsb	sy
+
 	mrs	x4, sctlr_el2
 	and	x4, x4, #SCTLR_EL2_EE	// preserve endianness of EL2
 	ldr	x5, =SCTLR_EL2_FLAGS
diff --git a/arch/avr32/include/asm/atomic.h b/arch/avr32/include/asm/atomic.h
index 0780f3f..83e980a 100644
--- a/arch/avr32/include/asm/atomic.h
+++ b/arch/avr32/include/asm/atomic.h
@@ -22,31 +22,44 @@
 #define atomic_read(v)		(*(volatile int *)&(v)->counter)
 #define atomic_set(v, i)	(((v)->counter) = i)
 
-/*
- * atomic_sub_return - subtract the atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v. Returns the resulting value.
- */
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	int result;
-
-	asm volatile(
-		"/* atomic_sub_return */\n"
-		"1:	ssrf	5\n"
-		"	ld.w	%0, %2\n"
-		"	sub	%0, %3\n"
-		"	stcond	%1, %0\n"
-		"	brne	1b"
-		: "=&r"(result), "=o"(v->counter)
-		: "m"(v->counter), "rKs21"(i)
-		: "cc");
-
-	return result;
+#define ATOMIC_OP_RETURN(op, asm_op, asm_con)				\
+static inline int __atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	int result;							\
+									\
+	asm volatile(							\
+		"/* atomic_" #op "_return */\n"				\
+		"1:	ssrf	5\n"					\
+		"	ld.w	%0, %2\n"				\
+		"	" #asm_op "	%0, %3\n"			\
+		"	stcond	%1, %0\n"				\
+		"	brne	1b"					\
+		: "=&r" (result), "=o" (v->counter)			\
+		: "m" (v->counter), #asm_con (i)			\
+		: "cc");						\
+									\
+	return result;							\
 }
 
+ATOMIC_OP_RETURN(sub, sub, rKs21)
+ATOMIC_OP_RETURN(add, add, r)
+
+#undef ATOMIC_OP_RETURN
+
+/*
+ * The likely reason for preferring sub with its signed 21-bit immediate
+ * is that it uses one register fewer than the add instruction, which
+ * adds two full 32-bit register values.
+ *
+ * Both instructions are 32-bit encodings; the 16-bit encodings only
+ * take a very small (4-bit) immediate.
+ *
+ * sub, 32-bit, type IV: takes a register and subtracts a 21-bit immediate.
+ * add, 32-bit, type II: adds two register values together.
+ */
+#define IS_21BIT_CONST(i)						\
+	(__builtin_constant_p(i) && ((i) >= -1048575) && ((i) <= 1048576))
+
 /*
  * atomic_add_return - add integer to atomic variable
  * @i: integer value to add
@@ -56,51 +69,25 @@
  */
 static inline int atomic_add_return(int i, atomic_t *v)
 {
-	int result;
+	if (IS_21BIT_CONST(i))
+		return __atomic_sub_return(-i, v);
 
-	if (__builtin_constant_p(i) && (i >= -1048575) && (i <= 1048576))
-		result = atomic_sub_return(-i, v);
-	else
-		asm volatile(
-			"/* atomic_add_return */\n"
-			"1:	ssrf	5\n"
-			"	ld.w	%0, %1\n"
-			"	add	%0, %3\n"
-			"	stcond	%2, %0\n"
-			"	brne	1b"
-			: "=&r"(result), "=o"(v->counter)
-			: "m"(v->counter), "r"(i)
-			: "cc", "memory");
-
-	return result;
+	return __atomic_add_return(i, v);
 }
 
 /*
- * atomic_sub_unless - sub unless the number is a given value
+ * atomic_sub_return - subtract the atomic variable
+ * @i: integer value to subtract
  * @v: pointer of type atomic_t
- * @a: the amount to subtract from v...
- * @u: ...unless v is equal to u.
  *
- * Atomically subtract @a from @v, so long as it was not @u.
- * Returns the old value of @v.
-*/
-static inline void atomic_sub_unless(atomic_t *v, int a, int u)
+ * Atomically subtracts @i from @v. Returns the resulting value.
+ */
+static inline int atomic_sub_return(int i, atomic_t *v)
 {
-	int tmp;
+	if (IS_21BIT_CONST(i))
+		return __atomic_sub_return(i, v);
 
-	asm volatile(
-		"/* atomic_sub_unless */\n"
-		"1:	ssrf	5\n"
-		"	ld.w	%0, %2\n"
-		"	cp.w	%0, %4\n"
-		"	breq	1f\n"
-		"	sub	%0, %3\n"
-		"	stcond	%1, %0\n"
-		"	brne	1b\n"
-		"1:"
-		: "=&r"(tmp), "=o"(v->counter)
-		: "m"(v->counter), "rKs21"(a), "rKs21"(u)
-		: "cc", "memory");
+	return __atomic_add_return(-i, v);
 }
 
 /*
@@ -116,9 +103,21 @@
 {
 	int tmp, old = atomic_read(v);
 
-	if (__builtin_constant_p(a) && (a >= -1048575) && (a <= 1048576))
-		atomic_sub_unless(v, -a, u);
-	else {
+	if (IS_21BIT_CONST(a)) {
+		asm volatile(
+			"/* __atomic_sub_unless */\n"
+			"1:	ssrf	5\n"
+			"	ld.w	%0, %2\n"
+			"	cp.w	%0, %4\n"
+			"	breq	1f\n"
+			"	sub	%0, %3\n"
+			"	stcond	%1, %0\n"
+			"	brne	1b\n"
+			"1:"
+			: "=&r"(tmp), "=o"(v->counter)
+			: "m"(v->counter), "rKs21"(-a), "rKs21"(u)
+			: "cc", "memory");
+	} else {
 		asm volatile(
 			"/* __atomic_add_unless */\n"
 			"1:	ssrf	5\n"
@@ -137,6 +136,8 @@
 	return old;
 }
 
+#undef IS_21BIT_CONST
+
 /*
  * atomic_sub_if_positive - conditionally subtract integer from atomic variable
  * @i: integer value to subtract
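
The dispatch IS_21BIT_CONST() drives is easier to follow outside the header.
A plain-C model, in which the __atomic_*_return helpers are non-atomic
stand-ins for the avr32 asm and __builtin_constant_p() only resolves to true
after inlining under optimization:

#include <stdio.h>

typedef struct { int counter; } atomic_t;

/* Non-atomic stand-ins for the asm-generated helpers above. */
static int __atomic_add_return(int i, atomic_t *v) { return v->counter += i; }
static int __atomic_sub_return(int i, atomic_t *v) { return v->counter -= i; }

#define IS_21BIT_CONST(i) \
	(__builtin_constant_p(i) && ((i) >= -1048575) && ((i) <= 1048576))

static inline int atomic_add_return(int i, atomic_t *v)
{
	if (IS_21BIT_CONST(i))
		return __atomic_sub_return(-i, v);	/* sub-immediate form */
	return __atomic_add_return(i, v);		/* register-register add */
}

int main(void)
{
	atomic_t v = { 0 };
	int n = 7;

	printf("%d\n", atomic_add_return(5, &v));	/* constant: sub -5 path */
	printf("%d\n", atomic_add_return(n, &v));	/* variable: add path */
	return 0;
}
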
diff --git a/arch/cris/include/asm/atomic.h b/arch/cris/include/asm/atomic.h
index aa429ba..0033f9d 100644
--- a/arch/cris/include/asm/atomic.h
+++ b/arch/cris/include/asm/atomic.h
@@ -22,44 +22,37 @@
 
 /* These should be written in asm but we do it in C for now. */
 
-static inline void atomic_add(int i, volatile atomic_t *v)
-{
-	unsigned long flags;
-	cris_atomic_save(v, flags);
-	v->counter += i;
-	cris_atomic_restore(v, flags);
+#define ATOMIC_OP(op, c_op)						\
+static inline void atomic_##op(int i, volatile atomic_t *v)		\
+{									\
+	unsigned long flags;						\
+	cris_atomic_save(v, flags);					\
+	v->counter c_op i;						\
+	cris_atomic_restore(v, flags);					\
+}									\
+
+#define ATOMIC_OP_RETURN(op, c_op)					\
+static inline int atomic_##op##_return(int i, volatile atomic_t *v)	\
+{									\
+	unsigned long flags;						\
+	int retval;							\
+	cris_atomic_save(v, flags);					\
+	retval = (v->counter c_op i);					\
+	cris_atomic_restore(v, flags);					\
+	return retval;							\
 }
 
-static inline void atomic_sub(int i, volatile atomic_t *v)
-{
-	unsigned long flags;
-	cris_atomic_save(v, flags);
-	v->counter -= i;
-	cris_atomic_restore(v, flags);
-}
+#define ATOMIC_OPS(op, c_op) ATOMIC_OP(op, c_op) ATOMIC_OP_RETURN(op, c_op)
 
-static inline int atomic_add_return(int i, volatile atomic_t *v)
-{
-	unsigned long flags;
-	int retval;
-	cris_atomic_save(v, flags);
-	retval = (v->counter += i);
-	cris_atomic_restore(v, flags);
-	return retval;
-}
+ATOMIC_OPS(add, +=)
+ATOMIC_OPS(sub, -=)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
 #define atomic_add_negative(a, v)	(atomic_add_return((a), (v)) < 0)
 
-static inline int atomic_sub_return(int i, volatile atomic_t *v)
-{
-	unsigned long flags;
-	int retval;
-	cris_atomic_save(v, flags);
-	retval = (v->counter -= i);
-	cris_atomic_restore(v, flags);
-	return retval;
-}
-
 static inline int atomic_sub_and_test(int i, volatile atomic_t *v)
 {
 	int retval;
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index de916b1..93d0702 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -94,41 +94,47 @@
 	return __oldval;
 }
 
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	int output;
+#define ATOMIC_OP(op)							\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	int output;							\
+									\
+	__asm__ __volatile__ (						\
+		"1:	%0 = memw_locked(%1);\n"			\
+		"	%0 = "#op "(%0,%2);\n"				\
+		"	memw_locked(%1,P3)=%0;\n"			\
+		"	if !P3 jump 1b;\n"				\
+		: "=&r" (output)					\
+		: "r" (&v->counter), "r" (i)				\
+		: "memory", "p3"					\
+	);								\
+}									\
 
-	__asm__ __volatile__ (
-		"1:	%0 = memw_locked(%1);\n"
-		"	%0 = add(%0,%2);\n"
-		"	memw_locked(%1,P3)=%0;\n"
-		"	if !P3 jump 1b;\n"
-		: "=&r" (output)
-		: "r" (&v->counter), "r" (i)
-		: "memory", "p3"
-	);
-	return output;
-
+#define ATOMIC_OP_RETURN(op)							\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	int output;							\
+									\
+	__asm__ __volatile__ (						\
+		"1:	%0 = memw_locked(%1);\n"			\
+		"	%0 = "#op "(%0,%2);\n"				\
+		"	memw_locked(%1,P3)=%0;\n"			\
+		"	if !P3 jump 1b;\n"				\
+		: "=&r" (output)					\
+		: "r" (&v->counter), "r" (i)				\
+		: "memory", "p3"					\
+	);								\
+	return output;							\
 }
 
-#define atomic_add(i, v) atomic_add_return(i, (v))
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
 
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	int output;
-	__asm__ __volatile__ (
-		"1:	%0 = memw_locked(%1);\n"
-		"	%0 = sub(%0,%2);\n"
-		"	memw_locked(%1,P3)=%0\n"
-		"	if !P3 jump 1b;\n"
-		: "=&r" (output)
-		: "r" (&v->counter), "r" (i)
-		: "memory", "p3"
-	);
-	return output;
-}
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
 
-#define atomic_sub(i, v) atomic_sub_return(i, (v))
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
 /**
  * __atomic_add_unless - add unless the number is a given value
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index 0f8bf48..42919a8 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -27,61 +27,93 @@
 #define atomic_set(v,i)		(((v)->counter) = (i))
 #define atomic64_set(v,i)	(((v)->counter) = (i))
 
-static __inline__ int
-ia64_atomic_add (int i, atomic_t *v)
-{
-	__s32 old, new;
-	CMPXCHG_BUGCHECK_DECL
-
-	do {
-		CMPXCHG_BUGCHECK(v);
-		old = atomic_read(v);
-		new = old + i;
-	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic_t)) != old);
-	return new;
+#define ATOMIC_OP(op, c_op)						\
+static __inline__ int							\
+ia64_atomic_##op (int i, atomic_t *v)					\
+{									\
+	__s32 old, new;							\
+	CMPXCHG_BUGCHECK_DECL						\
+									\
+	do {								\
+		CMPXCHG_BUGCHECK(v);					\
+		old = atomic_read(v);					\
+		new = old c_op i;					\
+	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic_t)) != old); \
+	return new;							\
 }
 
-static __inline__ long
-ia64_atomic64_add (__s64 i, atomic64_t *v)
-{
-	__s64 old, new;
-	CMPXCHG_BUGCHECK_DECL
+ATOMIC_OP(add, +)
+ATOMIC_OP(sub, -)
 
-	do {
-		CMPXCHG_BUGCHECK(v);
-		old = atomic64_read(v);
-		new = old + i;
-	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic64_t)) != old);
-	return new;
+#undef ATOMIC_OP
+
+#define atomic_add_return(i,v)						\
+({									\
+	int __ia64_aar_i = (i);						\
+	(__builtin_constant_p(i)					\
+	 && (   (__ia64_aar_i ==  1) || (__ia64_aar_i ==   4)		\
+	     || (__ia64_aar_i ==  8) || (__ia64_aar_i ==  16)		\
+	     || (__ia64_aar_i == -1) || (__ia64_aar_i ==  -4)		\
+	     || (__ia64_aar_i == -8) || (__ia64_aar_i == -16)))		\
+		? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter)	\
+		: ia64_atomic_add(__ia64_aar_i, v);			\
+})
+
+#define atomic_sub_return(i,v)						\
+({									\
+	int __ia64_asr_i = (i);						\
+	(__builtin_constant_p(i)					\
+	 && (   (__ia64_asr_i ==   1) || (__ia64_asr_i ==   4)		\
+	     || (__ia64_asr_i ==   8) || (__ia64_asr_i ==  16)		\
+	     || (__ia64_asr_i ==  -1) || (__ia64_asr_i ==  -4)		\
+	     || (__ia64_asr_i ==  -8) || (__ia64_asr_i == -16)))	\
+		? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter)	\
+		: ia64_atomic_sub(__ia64_asr_i, v);			\
+})
+
+#define ATOMIC64_OP(op, c_op)						\
+static __inline__ long							\
+ia64_atomic64_##op (__s64 i, atomic64_t *v)				\
+{									\
+	__s64 old, new;							\
+	CMPXCHG_BUGCHECK_DECL						\
+									\
+	do {								\
+		CMPXCHG_BUGCHECK(v);					\
+		old = atomic64_read(v);					\
+		new = old c_op i;					\
+	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic64_t)) != old); \
+	return new;							\
 }
 
-static __inline__ int
-ia64_atomic_sub (int i, atomic_t *v)
-{
-	__s32 old, new;
-	CMPXCHG_BUGCHECK_DECL
+ATOMIC64_OP(add, +)
+ATOMIC64_OP(sub, -)
 
-	do {
-		CMPXCHG_BUGCHECK(v);
-		old = atomic_read(v);
-		new = old - i;
-	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic_t)) != old);
-	return new;
-}
+#undef ATOMIC64_OP
 
-static __inline__ long
-ia64_atomic64_sub (__s64 i, atomic64_t *v)
-{
-	__s64 old, new;
-	CMPXCHG_BUGCHECK_DECL
+#define atomic64_add_return(i,v)					\
+({									\
+	long __ia64_aar_i = (i);					\
+	(__builtin_constant_p(i)					\
+	 && (   (__ia64_aar_i ==  1) || (__ia64_aar_i ==   4)		\
+	     || (__ia64_aar_i ==  8) || (__ia64_aar_i ==  16)		\
+	     || (__ia64_aar_i == -1) || (__ia64_aar_i ==  -4)		\
+	     || (__ia64_aar_i == -8) || (__ia64_aar_i == -16)))		\
+		? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter)	\
+		: ia64_atomic64_add(__ia64_aar_i, v);			\
+})
 
-	do {
-		CMPXCHG_BUGCHECK(v);
-		old = atomic64_read(v);
-		new = old - i;
-	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic64_t)) != old);
-	return new;
-}
+#define atomic64_sub_return(i,v)					\
+({									\
+	long __ia64_asr_i = (i);					\
+	(__builtin_constant_p(i)					\
+	 && (   (__ia64_asr_i ==   1) || (__ia64_asr_i ==   4)		\
+	     || (__ia64_asr_i ==   8) || (__ia64_asr_i ==  16)		\
+	     || (__ia64_asr_i ==  -1) || (__ia64_asr_i ==  -4)		\
+	     || (__ia64_asr_i ==  -8) || (__ia64_asr_i == -16)))	\
+		? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter)	\
+		: ia64_atomic64_sub(__ia64_asr_i, v);			\
+})
 
 #define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), old, new))
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
@@ -123,30 +155,6 @@
 
 #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
 
-#define atomic_add_return(i,v)						\
-({									\
-	int __ia64_aar_i = (i);						\
-	(__builtin_constant_p(i)					\
-	 && (   (__ia64_aar_i ==  1) || (__ia64_aar_i ==   4)		\
-	     || (__ia64_aar_i ==  8) || (__ia64_aar_i ==  16)		\
-	     || (__ia64_aar_i == -1) || (__ia64_aar_i ==  -4)		\
-	     || (__ia64_aar_i == -8) || (__ia64_aar_i == -16)))		\
-		? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter)	\
-		: ia64_atomic_add(__ia64_aar_i, v);			\
-})
-
-#define atomic64_add_return(i,v)					\
-({									\
-	long __ia64_aar_i = (i);					\
-	(__builtin_constant_p(i)					\
-	 && (   (__ia64_aar_i ==  1) || (__ia64_aar_i ==   4)		\
-	     || (__ia64_aar_i ==  8) || (__ia64_aar_i ==  16)		\
-	     || (__ia64_aar_i == -1) || (__ia64_aar_i ==  -4)		\
-	     || (__ia64_aar_i == -8) || (__ia64_aar_i == -16)))		\
-		? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter)	\
-		: ia64_atomic64_add(__ia64_aar_i, v);			\
-})
-
 /*
  * Atomically add I to V and return TRUE if the resulting value is
  * negative.
@@ -163,30 +171,6 @@
 	return atomic64_add_return(i, v) < 0;
 }
 
-#define atomic_sub_return(i,v)						\
-({									\
-	int __ia64_asr_i = (i);						\
-	(__builtin_constant_p(i)					\
-	 && (   (__ia64_asr_i ==   1) || (__ia64_asr_i ==   4)		\
-	     || (__ia64_asr_i ==   8) || (__ia64_asr_i ==  16)		\
-	     || (__ia64_asr_i ==  -1) || (__ia64_asr_i ==  -4)		\
-	     || (__ia64_asr_i ==  -8) || (__ia64_asr_i == -16)))	\
-		? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter)	\
-		: ia64_atomic_sub(__ia64_asr_i, v);			\
-})
-
-#define atomic64_sub_return(i,v)					\
-({									\
-	long __ia64_asr_i = (i);					\
-	(__builtin_constant_p(i)					\
-	 && (   (__ia64_asr_i ==   1) || (__ia64_asr_i ==   4)		\
-	     || (__ia64_asr_i ==   8) || (__ia64_asr_i ==  16)		\
-	     || (__ia64_asr_i ==  -1) || (__ia64_asr_i ==  -4)		\
-	     || (__ia64_asr_i ==  -8) || (__ia64_asr_i == -16)))	\
-		? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter)	\
-		: ia64_atomic64_sub(__ia64_asr_i, v);			\
-})
-
 #define atomic_dec_return(v)		atomic_sub_return(1, (v))
 #define atomic_inc_return(v)		atomic_add_return(1, (v))
 #define atomic64_dec_return(v)		atomic64_sub_return(1, (v))
@@ -199,13 +183,13 @@
 #define atomic64_dec_and_test(v)	(atomic64_sub_return(1, (v)) == 0)
 #define atomic64_inc_and_test(v)	(atomic64_add_return(1, (v)) == 0)
 
-#define atomic_add(i,v)			atomic_add_return((i), (v))
-#define atomic_sub(i,v)			atomic_sub_return((i), (v))
+#define atomic_add(i,v)			(void)atomic_add_return((i), (v))
+#define atomic_sub(i,v)			(void)atomic_sub_return((i), (v))
 #define atomic_inc(v)			atomic_add(1, (v))
 #define atomic_dec(v)			atomic_sub(1, (v))
 
-#define atomic64_add(i,v)		atomic64_add_return((i), (v))
-#define atomic64_sub(i,v)		atomic64_sub_return((i), (v))
+#define atomic64_add(i,v)		(void)atomic64_add_return((i), (v))
+#define atomic64_sub(i,v)		(void)atomic64_sub_return((i), (v))
 #define atomic64_inc(v)			atomic64_add(1, (v))
 #define atomic64_dec(v)			atomic64_sub(1, (v))
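
The constant lists above are exactly the increments the ia64 fetchadd
instruction can encode as an immediate (+/-1, 4, 8, 16); anything else falls
back to the cmpxchg loop. A user-space model of the dispatch, with plain-C
stand-ins for both paths:

#include <stdio.h>

typedef struct { int counter; } atomic_t;

/* Plain-C stand-ins: the real ones are a single fetchadd vs. a cmpxchg loop. */
static int ia64_fetch_and_add(int i, int *p)   { return *p += i; }
static int ia64_atomic_add(int i, atomic_t *v) { return v->counter += i; }

#define atomic_add_return(i, v)						\
({									\
	int __i = (i);							\
	(__builtin_constant_p(i) &&					\
	 (__i == 1 || __i == 4 || __i == 8 || __i == 16 ||		\
	  __i == -1 || __i == -4 || __i == -8 || __i == -16))		\
		? ia64_fetch_and_add(__i, &(v)->counter)		\
		: ia64_atomic_add(__i, v);				\
})

int main(void)
{
	atomic_t v = { 0 };

	printf("%d\n", atomic_add_return(4, &v));	/* fetchadd-encodable */
	printf("%d\n", atomic_add_return(3, &v));	/* cmpxchg-loop fallback */
	return 0;
}
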
 
diff --git a/arch/m32r/include/asm/atomic.h b/arch/m32r/include/asm/atomic.h
index 8ad0ed4..3946b2c 100644
--- a/arch/m32r/include/asm/atomic.h
+++ b/arch/m32r/include/asm/atomic.h
@@ -39,85 +39,64 @@
  */
 #define atomic_set(v,i)	(((v)->counter) = (i))
 
-/**
- * atomic_add_return - add integer to atomic variable and return it
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v and return (@i + @v).
- */
-static __inline__ int atomic_add_return(int i, atomic_t *v)
-{
-	unsigned long flags;
-	int result;
-
-	local_irq_save(flags);
-	__asm__ __volatile__ (
-		"# atomic_add_return		\n\t"
-		DCACHE_CLEAR("%0", "r4", "%1")
-		M32R_LOCK" %0, @%1;		\n\t"
-		"add	%0, %2;			\n\t"
-		M32R_UNLOCK" %0, @%1;		\n\t"
-		: "=&r" (result)
-		: "r" (&v->counter), "r" (i)
-		: "memory"
 #ifdef CONFIG_CHIP_M32700_TS1
-		, "r4"
-#endif	/* CONFIG_CHIP_M32700_TS1 */
-	);
-	local_irq_restore(flags);
+#define __ATOMIC_CLOBBER	, "r4"
+#else
+#define __ATOMIC_CLOBBER
+#endif
 
-	return result;
+#define ATOMIC_OP(op)							\
+static __inline__ void atomic_##op(int i, atomic_t *v)			\
+{									\
+	unsigned long flags;						\
+	int result;							\
+									\
+	local_irq_save(flags);						\
+	__asm__ __volatile__ (						\
+		"# atomic_" #op "		\n\t"			\
+		DCACHE_CLEAR("%0", "r4", "%1")				\
+		M32R_LOCK" %0, @%1;		\n\t"			\
+		#op " %0, %2;			\n\t"			\
+		M32R_UNLOCK" %0, @%1;		\n\t"			\
+		: "=&r" (result)					\
+		: "r" (&v->counter), "r" (i)				\
+		: "memory"						\
+		__ATOMIC_CLOBBER					\
+	);								\
+	local_irq_restore(flags);					\
+}									\
+
+#define ATOMIC_OP_RETURN(op)						\
+static __inline__ int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	unsigned long flags;						\
+	int result;							\
+									\
+	local_irq_save(flags);						\
+	__asm__ __volatile__ (						\
+		"# atomic_" #op "_return	\n\t"			\
+		DCACHE_CLEAR("%0", "r4", "%1")				\
+		M32R_LOCK" %0, @%1;		\n\t"			\
+		#op " %0, %2;			\n\t"			\
+		M32R_UNLOCK" %0, @%1;		\n\t"			\
+		: "=&r" (result)					\
+		: "r" (&v->counter), "r" (i)				\
+		: "memory"						\
+		__ATOMIC_CLOBBER					\
+	);								\
+	local_irq_restore(flags);					\
+									\
+	return result;							\
 }
 
-/**
- * atomic_sub_return - subtract integer from atomic variable and return it
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v and return (@v - @i).
- */
-static __inline__ int atomic_sub_return(int i, atomic_t *v)
-{
-	unsigned long flags;
-	int result;
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
 
-	local_irq_save(flags);
-	__asm__ __volatile__ (
-		"# atomic_sub_return		\n\t"
-		DCACHE_CLEAR("%0", "r4", "%1")
-		M32R_LOCK" %0, @%1;		\n\t"
-		"sub	%0, %2;			\n\t"
-		M32R_UNLOCK" %0, @%1;		\n\t"
-		: "=&r" (result)
-		: "r" (&v->counter), "r" (i)
-		: "memory"
-#ifdef CONFIG_CHIP_M32700_TS1
-		, "r4"
-#endif	/* CONFIG_CHIP_M32700_TS1 */
-	);
-	local_irq_restore(flags);
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
 
-	return result;
-}
-
-/**
- * atomic_add - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v.
- */
-#define atomic_add(i,v) ((void) atomic_add_return((i), (v)))
-
-/**
- * atomic_sub - subtract the atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v.
- */
-#define atomic_sub(i,v) ((void) atomic_sub_return((i), (v)))
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
 /**
  * atomic_sub_and_test - subtract value from variable and test result
@@ -151,9 +130,7 @@
 		: "=&r" (result)
 		: "r" (&v->counter)
 		: "memory"
-#ifdef CONFIG_CHIP_M32700_TS1
-		, "r4"
-#endif	/* CONFIG_CHIP_M32700_TS1 */
+		__ATOMIC_CLOBBER
 	);
 	local_irq_restore(flags);
 
@@ -181,9 +158,7 @@
 		: "=&r" (result)
 		: "r" (&v->counter)
 		: "memory"
-#ifdef CONFIG_CHIP_M32700_TS1
-		, "r4"
-#endif	/* CONFIG_CHIP_M32700_TS1 */
+		__ATOMIC_CLOBBER
 	);
 	local_irq_restore(flags);
 
@@ -280,9 +255,7 @@
 		: "=&r" (tmp)
 		: "r" (addr), "r" (~mask)
 		: "memory"
-#ifdef CONFIG_CHIP_M32700_TS1
-		, "r5"
-#endif	/* CONFIG_CHIP_M32700_TS1 */
+		__ATOMIC_CLOBBER
 	);
 	local_irq_restore(flags);
 }
@@ -302,9 +275,7 @@
 		: "=&r" (tmp)
 		: "r" (addr), "r" (mask)
 		: "memory"
-#ifdef CONFIG_CHIP_M32700_TS1
-		, "r5"
-#endif	/* CONFIG_CHIP_M32700_TS1 */
+		__ATOMIC_CLOBBER
 	);
 	local_irq_restore(flags);
 }
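
__ATOMIC_CLOBBER pastes an extra register clobber into every asm only when the
TS1 workaround is configured, replacing the per-site #ifdef blocks. A
compilable demonstration of the trick, with a "memory" clobber standing in for
the m32r "r4" register (which most host targets do not have):

#include <stdio.h>

#define CONFIG_CHIP_M32700_TS1 1	/* toggle to see both expansions */

#ifdef CONFIG_CHIP_M32700_TS1
#define __ATOMIC_CLOBBER	, "memory"	/* stands in for , "r4" */
#else
#define __ATOMIC_CLOBBER
#endif

static int counter;

static void touch(int i)
{
	/* Empty asm body: the pasted clobber list is the point. */
	asm volatile("" : "+r" (i) : : "cc" __ATOMIC_CLOBBER);
	counter += i;
}

int main(void)
{
	touch(5);
	printf("%d\n", counter);	/* 5 */
	return 0;
}
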
diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h
index 5569521..663d4ba 100644
--- a/arch/m68k/include/asm/atomic.h
+++ b/arch/m68k/include/asm/atomic.h
@@ -30,16 +30,57 @@
 #define	ASM_DI	"di"
 #endif
 
-static inline void atomic_add(int i, atomic_t *v)
-{
-	__asm__ __volatile__("addl %1,%0" : "+m" (*v) : ASM_DI (i));
+#define ATOMIC_OP(op, c_op, asm_op)					\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	__asm__ __volatile__(#asm_op "l %1,%0" : "+m" (*v) : ASM_DI (i));\
+}									\
+
+#ifdef CONFIG_RMW_INSNS
+
+#define ATOMIC_OP_RETURN(op, c_op, asm_op)				\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	int t, tmp;							\
+									\
+	__asm__ __volatile__(						\
+			"1:	movel %2,%1\n"				\
+			"	" #asm_op "l %3,%1\n"			\
+			"	casl %2,%1,%0\n"			\
+			"	jne 1b"					\
+			: "+m" (*v), "=&d" (t), "=&d" (tmp)		\
+			: "g" (i), "2" (atomic_read(v)));		\
+	return t;							\
 }
 
-static inline void atomic_sub(int i, atomic_t *v)
-{
-	__asm__ __volatile__("subl %1,%0" : "+m" (*v) : ASM_DI (i));
+#else
+
+#define ATOMIC_OP_RETURN(op, c_op, asm_op)				\
+static inline int atomic_##op##_return(int i, atomic_t * v)		\
+{									\
+	unsigned long flags;						\
+	int t;								\
+									\
+	local_irq_save(flags);						\
+	t = (v->counter c_op i);					\
+	local_irq_restore(flags);					\
+									\
+	return t;							\
 }
 
+#endif /* CONFIG_RMW_INSNS */
+
+#define ATOMIC_OPS(op, c_op, asm_op)					\
+	ATOMIC_OP(op, c_op, asm_op)					\
+	ATOMIC_OP_RETURN(op, c_op, asm_op)
+
+ATOMIC_OPS(add, +=, add)
+ATOMIC_OPS(sub, -=, sub)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
 static inline void atomic_inc(atomic_t *v)
 {
 	__asm__ __volatile__("addql #1,%0" : "+m" (*v));
@@ -76,67 +117,11 @@
 
 #ifdef CONFIG_RMW_INSNS
 
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	int t, tmp;
-
-	__asm__ __volatile__(
-			"1:	movel %2,%1\n"
-			"	addl %3,%1\n"
-			"	casl %2,%1,%0\n"
-			"	jne 1b"
-			: "+m" (*v), "=&d" (t), "=&d" (tmp)
-			: "g" (i), "2" (atomic_read(v)));
-	return t;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	int t, tmp;
-
-	__asm__ __volatile__(
-			"1:	movel %2,%1\n"
-			"	subl %3,%1\n"
-			"	casl %2,%1,%0\n"
-			"	jne 1b"
-			: "+m" (*v), "=&d" (t), "=&d" (tmp)
-			: "g" (i), "2" (atomic_read(v)));
-	return t;
-}
-
 #define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 
 #else /* !CONFIG_RMW_INSNS */
 
-static inline int atomic_add_return(int i, atomic_t * v)
-{
-	unsigned long flags;
-	int t;
-
-	local_irq_save(flags);
-	t = atomic_read(v);
-	t += i;
-	atomic_set(v, t);
-	local_irq_restore(flags);
-
-	return t;
-}
-
-static inline int atomic_sub_return(int i, atomic_t * v)
-{
-	unsigned long flags;
-	int t;
-
-	local_irq_save(flags);
-	t = atomic_read(v);
-	t -= i;
-	atomic_set(v, t);
-	local_irq_restore(flags);
-
-	return t;
-}
-
 static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
 	unsigned long flags;
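
The same ATOMIC_OP_RETURN name thus generates two very different bodies
depending on CONFIG_RMW_INSNS. A stand-alone model in which a __atomic
compare-exchange loop plays the role of casl and the irq-off fallback is
reduced to placeholder macros:

#include <stdio.h>

typedef struct { int counter; } atomic_t;

#define CONFIG_RMW_INSNS 1		/* undef to exercise the fallback */

#ifdef CONFIG_RMW_INSNS

#define ATOMIC_OP_RETURN(op, c_op)					\
static inline int atomic_##op##_return(int i, atomic_t *v)		\
{									\
	int old, new;							\
									\
	do {				/* casl-style retry loop */	\
		old = v->counter;					\
		new = old;						\
		new c_op i;						\
	} while (!__atomic_compare_exchange_n(&v->counter, &old, new,	\
			0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));	\
	return new;							\
}

#else

#define local_irq_save(f)	((void)(f))	/* placeholders */
#define local_irq_restore(f)	((void)(f))

#define ATOMIC_OP_RETURN(op, c_op)					\
static inline int atomic_##op##_return(int i, atomic_t *v)		\
{									\
	unsigned long flags;						\
	int t;								\
									\
	local_irq_save(flags);						\
	t = (v->counter c_op i);					\
	local_irq_restore(flags);					\
	return t;							\
}

#endif

ATOMIC_OP_RETURN(add, +=)
ATOMIC_OP_RETURN(sub, -=)

int main(void)
{
	atomic_t v = { 10 };
	int a = atomic_add_return(5, &v);	/* 15 */
	int b = atomic_sub_return(3, &v);	/* 12 */

	printf("%d %d\n", a, b);
	return 0;
}
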
diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h
index 1fcdd34..4ef7a54 100644
--- a/arch/m68k/include/asm/unistd.h
+++ b/arch/m68k/include/asm/unistd.h
@@ -4,7 +4,7 @@
 #include <uapi/asm/unistd.h>
 
 
-#define NR_syscalls		352
+#define NR_syscalls		354
 
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_OLD_STAT
diff --git a/arch/m68k/include/uapi/asm/unistd.h b/arch/m68k/include/uapi/asm/unistd.h
index 9cd82fb..b419c6b 100644
--- a/arch/m68k/include/uapi/asm/unistd.h
+++ b/arch/m68k/include/uapi/asm/unistd.h
@@ -357,5 +357,7 @@
 #define __NR_sched_setattr	349
 #define __NR_sched_getattr	350
 #define __NR_renameat2		351
+#define __NR_getrandom		352
+#define __NR_memfd_create	353
 
 #endif /* _UAPI_ASM_M68K_UNISTD_H_ */
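
For illustration, a hypothetical m68k user-space caller of the two syscalls
being wired up; the numbers 352/353 are m68k-specific, so this sketch is only
meaningful on a kernel carrying this table:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>

#define __NR_getrandom_m68k	352	/* from the table above */
#define __NR_memfd_create_m68k	353

int main(void)
{
	unsigned char buf[16];

	/* On m68k this invokes sys_getrandom(buf, sizeof(buf), 0). */
	long n = syscall(__NR_getrandom_m68k, buf, sizeof(buf), 0);

	printf("getrandom: %ld\n", n);
	return 0;
}
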
diff --git a/arch/m68k/kernel/syscalltable.S b/arch/m68k/kernel/syscalltable.S
index 501e102..05b46c2 100644
--- a/arch/m68k/kernel/syscalltable.S
+++ b/arch/m68k/kernel/syscalltable.S
@@ -372,4 +372,6 @@
 	.long sys_sched_setattr
 	.long sys_sched_getattr		/* 350 */
 	.long sys_renameat2
+	.long sys_getrandom
+	.long sys_memfd_create
 
diff --git a/arch/metag/include/asm/atomic_lnkget.h b/arch/metag/include/asm/atomic_lnkget.h
index d2e60a1..948d868 100644
--- a/arch/metag/include/asm/atomic_lnkget.h
+++ b/arch/metag/include/asm/atomic_lnkget.h
@@ -27,85 +27,56 @@
 	return temp;
 }
 
-static inline void atomic_add(int i, atomic_t *v)
-{
-	int temp;
+#define ATOMIC_OP(op)							\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	int temp;							\
+									\
+	asm volatile (							\
+		"1:	LNKGETD %0, [%1]\n"				\
+		"	" #op "	%0, %0, %2\n"				\
+		"	LNKSETD [%1], %0\n"				\
+		"	DEFR	%0, TXSTAT\n"				\
+		"	ANDT	%0, %0, #HI(0x3f000000)\n"		\
+		"	CMPT	%0, #HI(0x02000000)\n"			\
+		"	BNZ	1b\n"					\
+		: "=&d" (temp)						\
+		: "da" (&v->counter), "bd" (i)				\
+		: "cc");						\
+}									\
 
-	asm volatile (
-		"1:	LNKGETD %0, [%1]\n"
-		"	ADD	%0, %0, %2\n"
-		"	LNKSETD [%1], %0\n"
-		"	DEFR	%0, TXSTAT\n"
-		"	ANDT	%0, %0, #HI(0x3f000000)\n"
-		"	CMPT	%0, #HI(0x02000000)\n"
-		"	BNZ	1b\n"
-		: "=&d" (temp)
-		: "da" (&v->counter), "bd" (i)
-		: "cc");
+#define ATOMIC_OP_RETURN(op)						\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	int result, temp;						\
+									\
+	smp_mb();							\
+									\
+	asm volatile (							\
+		"1:	LNKGETD %1, [%2]\n"				\
+		"	" #op "	%1, %1, %3\n"				\
+		"	LNKSETD [%2], %1\n"				\
+		"	DEFR	%0, TXSTAT\n"				\
+		"	ANDT	%0, %0, #HI(0x3f000000)\n"		\
+		"	CMPT	%0, #HI(0x02000000)\n"			\
+		"	BNZ 1b\n"					\
+		: "=&d" (temp), "=&da" (result)				\
+		: "da" (&v->counter), "bd" (i)				\
+		: "cc");						\
+									\
+	smp_mb();							\
+									\
+	return result;							\
 }
 
-static inline void atomic_sub(int i, atomic_t *v)
-{
-	int temp;
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
 
-	asm volatile (
-		"1:	LNKGETD %0, [%1]\n"
-		"	SUB	%0, %0, %2\n"
-		"	LNKSETD [%1], %0\n"
-		"	DEFR	%0, TXSTAT\n"
-		"	ANDT	%0, %0, #HI(0x3f000000)\n"
-		"	CMPT	%0, #HI(0x02000000)\n"
-		"	BNZ 1b\n"
-		: "=&d" (temp)
-		: "da" (&v->counter), "bd" (i)
-		: "cc");
-}
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
 
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	int result, temp;
-
-	smp_mb();
-
-	asm volatile (
-		"1:	LNKGETD %1, [%2]\n"
-		"	ADD	%1, %1, %3\n"
-		"	LNKSETD [%2], %1\n"
-		"	DEFR	%0, TXSTAT\n"
-		"	ANDT	%0, %0, #HI(0x3f000000)\n"
-		"	CMPT	%0, #HI(0x02000000)\n"
-		"	BNZ 1b\n"
-		: "=&d" (temp), "=&da" (result)
-		: "da" (&v->counter), "bd" (i)
-		: "cc");
-
-	smp_mb();
-
-	return result;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	int result, temp;
-
-	smp_mb();
-
-	asm volatile (
-		"1:	LNKGETD %1, [%2]\n"
-		"	SUB	%1, %1, %3\n"
-		"	LNKSETD [%2], %1\n"
-		"	DEFR	%0, TXSTAT\n"
-		"	ANDT	%0, %0, #HI(0x3f000000)\n"
-		"	CMPT	%0, #HI(0x02000000)\n"
-		"	BNZ	1b\n"
-		: "=&d" (temp), "=&da" (result)
-		: "da" (&v->counter), "bd" (i)
-		: "cc");
-
-	smp_mb();
-
-	return result;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
 static inline void atomic_clear_mask(unsigned int mask, atomic_t *v)
 {
diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h
index e578955..f5d5898 100644
--- a/arch/metag/include/asm/atomic_lock1.h
+++ b/arch/metag/include/asm/atomic_lock1.h
@@ -37,55 +37,41 @@
 	return i;
 }
 
-static inline void atomic_add(int i, atomic_t *v)
-{
-	unsigned long flags;
+#define ATOMIC_OP(op, c_op)						\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	unsigned long flags;						\
+									\
+	__global_lock1(flags);						\
+	fence();							\
+	v->counter c_op i;						\
+	__global_unlock1(flags);					\
+}									\
 
-	__global_lock1(flags);
-	fence();
-	v->counter += i;
-	__global_unlock1(flags);
+#define ATOMIC_OP_RETURN(op, c_op)					\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	unsigned long result;						\
+	unsigned long flags;						\
+									\
+	__global_lock1(flags);						\
+	result = v->counter;						\
+	result c_op i;							\
+	fence();							\
+	v->counter = result;						\
+	__global_unlock1(flags);					\
+									\
+	return result;							\
 }
 
-static inline void atomic_sub(int i, atomic_t *v)
-{
-	unsigned long flags;
+#define ATOMIC_OPS(op, c_op) ATOMIC_OP(op, c_op) ATOMIC_OP_RETURN(op, c_op)
 
-	__global_lock1(flags);
-	fence();
-	v->counter -= i;
-	__global_unlock1(flags);
-}
+ATOMIC_OPS(add, +=)
+ATOMIC_OPS(sub, -=)
 
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	unsigned long result;
-	unsigned long flags;
-
-	__global_lock1(flags);
-	result = v->counter;
-	result += i;
-	fence();
-	v->counter = result;
-	__global_unlock1(flags);
-
-	return result;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	unsigned long result;
-	unsigned long flags;
-
-	__global_lock1(flags);
-	result = v->counter;
-	result -= i;
-	fence();
-	v->counter = result;
-	__global_unlock1(flags);
-
-	return result;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
 static inline void atomic_clear_mask(unsigned int mask, atomic_t *v)
 {
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index 37b2bef..476fe3b 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -40,195 +40,103 @@
  */
 #define atomic_set(v, i)		((v)->counter = (i))
 
-/*
- * atomic_add - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v.
- */
-static __inline__ void atomic_add(int i, atomic_t * v)
-{
-	if (kernel_uses_llsc && R10000_LLSC_WAR) {
-		int temp;
+#define ATOMIC_OP(op, c_op, asm_op)						\
+static __inline__ void atomic_##op(int i, atomic_t * v)				\
+{										\
+	if (kernel_uses_llsc && R10000_LLSC_WAR) {				\
+		int temp;							\
+										\
+		__asm__ __volatile__(						\
+		"	.set	arch=r4000				\n"	\
+		"1:	ll	%0, %1		# atomic_" #op "	\n"	\
+		"	" #asm_op " %0, %2				\n"	\
+		"	sc	%0, %1					\n"	\
+		"	beqzl	%0, 1b					\n"	\
+		"	.set	mips0					\n"	\
+		: "=&r" (temp), "+m" (v->counter)				\
+		: "Ir" (i));							\
+	} else if (kernel_uses_llsc) {						\
+		int temp;							\
+										\
+		do {								\
+			__asm__ __volatile__(					\
+			"	.set	arch=r4000			\n"	\
+			"	ll	%0, %1		# atomic_" #op "\n"	\
+			"	" #asm_op " %0, %2			\n"	\
+			"	sc	%0, %1				\n"	\
+			"	.set	mips0				\n"	\
+			: "=&r" (temp), "+m" (v->counter)			\
+			: "Ir" (i));						\
+		} while (unlikely(!temp));					\
+	} else {								\
+		unsigned long flags;						\
+										\
+		raw_local_irq_save(flags);					\
+		v->counter c_op i;						\
+		raw_local_irq_restore(flags);					\
+	}									\
+}										\
 
-		__asm__ __volatile__(
-		"	.set	arch=r4000				\n"
-		"1:	ll	%0, %1		# atomic_add		\n"
-		"	addu	%0, %2					\n"
-		"	sc	%0, %1					\n"
-		"	beqzl	%0, 1b					\n"
-		"	.set	mips0					\n"
-		: "=&r" (temp), "+m" (v->counter)
-		: "Ir" (i));
-	} else if (kernel_uses_llsc) {
-		int temp;
-
-		do {
-			__asm__ __volatile__(
-			"	.set	arch=r4000			\n"
-			"	ll	%0, %1		# atomic_add	\n"
-			"	addu	%0, %2				\n"
-			"	sc	%0, %1				\n"
-			"	.set	mips0				\n"
-			: "=&r" (temp), "+m" (v->counter)
-			: "Ir" (i));
-		} while (unlikely(!temp));
-	} else {
-		unsigned long flags;
-
-		raw_local_irq_save(flags);
-		v->counter += i;
-		raw_local_irq_restore(flags);
-	}
+#define ATOMIC_OP_RETURN(op, c_op, asm_op)					\
+static __inline__ int atomic_##op##_return(int i, atomic_t * v)			\
+{										\
+	int result;								\
+										\
+	smp_mb__before_llsc();							\
+										\
+	if (kernel_uses_llsc && R10000_LLSC_WAR) {				\
+		int temp;							\
+										\
+		__asm__ __volatile__(						\
+		"	.set	arch=r4000				\n"	\
+		"1:	ll	%1, %2		# atomic_" #op "_return	\n"	\
+		"	" #asm_op " %0, %1, %3				\n"	\
+		"	sc	%0, %2					\n"	\
+		"	beqzl	%0, 1b					\n"	\
+		"	addu	%0, %1, %3				\n"	\
+		"	.set	mips0					\n"	\
+		: "=&r" (result), "=&r" (temp), "+m" (v->counter)		\
+		: "Ir" (i));							\
+	} else if (kernel_uses_llsc) {						\
+		int temp;							\
+										\
+		do {								\
+			__asm__ __volatile__(					\
+			"	.set	arch=r4000			\n"	\
+			"	ll	%1, %2	# atomic_" #op "_return	\n"	\
+			"	" #asm_op " %0, %1, %3			\n"	\
+			"	sc	%0, %2				\n"	\
+			"	.set	mips0				\n"	\
+			: "=&r" (result), "=&r" (temp), "+m" (v->counter)	\
+			: "Ir" (i));						\
+		} while (unlikely(!result));					\
+										\
+		result = temp + i;						\
+	} else {								\
+		unsigned long flags;						\
+										\
+		raw_local_irq_save(flags);					\
+		result = v->counter;						\
+		result c_op i;							\
+		v->counter = result;						\
+		raw_local_irq_restore(flags);					\
+	}									\
+										\
+	smp_llsc_mb();								\
+										\
+	return result;								\
 }
 
-/*
- * atomic_sub - subtract the atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v.
- */
-static __inline__ void atomic_sub(int i, atomic_t * v)
-{
-	if (kernel_uses_llsc && R10000_LLSC_WAR) {
-		int temp;
+#define ATOMIC_OPS(op, c_op, asm_op)						\
+	ATOMIC_OP(op, c_op, asm_op)						\
+	ATOMIC_OP_RETURN(op, c_op, asm_op)
 
-		__asm__ __volatile__(
-		"	.set	arch=r4000				\n"
-		"1:	ll	%0, %1		# atomic_sub		\n"
-		"	subu	%0, %2					\n"
-		"	sc	%0, %1					\n"
-		"	beqzl	%0, 1b					\n"
-		"	.set	mips0					\n"
-		: "=&r" (temp), "+m" (v->counter)
-		: "Ir" (i));
-	} else if (kernel_uses_llsc) {
-		int temp;
+ATOMIC_OPS(add, +=, addu)
+ATOMIC_OPS(sub, -=, subu)
 
-		do {
-			__asm__ __volatile__(
-			"	.set	arch=r4000			\n"
-			"	ll	%0, %1		# atomic_sub	\n"
-			"	subu	%0, %2				\n"
-			"	sc	%0, %1				\n"
-			"	.set	mips0				\n"
-			: "=&r" (temp), "+m" (v->counter)
-			: "Ir" (i));
-		} while (unlikely(!temp));
-	} else {
-		unsigned long flags;
-
-		raw_local_irq_save(flags);
-		v->counter -= i;
-		raw_local_irq_restore(flags);
-	}
-}
-
-/*
- * Same as above, but return the result value
- */
-static __inline__ int atomic_add_return(int i, atomic_t * v)
-{
-	int result;
-
-	smp_mb__before_llsc();
-
-	if (kernel_uses_llsc && R10000_LLSC_WAR) {
-		int temp;
-
-		__asm__ __volatile__(
-		"	.set	arch=r4000				\n"
-		"1:	ll	%1, %2		# atomic_add_return	\n"
-		"	addu	%0, %1, %3				\n"
-		"	sc	%0, %2					\n"
-		"	beqzl	%0, 1b					\n"
-		"	addu	%0, %1, %3				\n"
-		"	.set	mips0					\n"
-		: "=&r" (result), "=&r" (temp), "+m" (v->counter)
-		: "Ir" (i));
-	} else if (kernel_uses_llsc) {
-		int temp;
-
-		do {
-			__asm__ __volatile__(
-			"	.set	arch=r4000			\n"
-			"	ll	%1, %2	# atomic_add_return	\n"
-			"	addu	%0, %1, %3			\n"
-			"	sc	%0, %2				\n"
-			"	.set	mips0				\n"
-			: "=&r" (result), "=&r" (temp), "+m" (v->counter)
-			: "Ir" (i));
-		} while (unlikely(!result));
-
-		result = temp + i;
-	} else {
-		unsigned long flags;
-
-		raw_local_irq_save(flags);
-		result = v->counter;
-		result += i;
-		v->counter = result;
-		raw_local_irq_restore(flags);
-	}
-
-	smp_llsc_mb();
-
-	return result;
-}
-
-static __inline__ int atomic_sub_return(int i, atomic_t * v)
-{
-	int result;
-
-	smp_mb__before_llsc();
-
-	if (kernel_uses_llsc && R10000_LLSC_WAR) {
-		int temp;
-
-		__asm__ __volatile__(
-		"	.set	arch=r4000				\n"
-		"1:	ll	%1, %2		# atomic_sub_return	\n"
-		"	subu	%0, %1, %3				\n"
-		"	sc	%0, %2					\n"
-		"	beqzl	%0, 1b					\n"
-		"	subu	%0, %1, %3				\n"
-		"	.set	mips0					\n"
-		: "=&r" (result), "=&r" (temp), "=m" (v->counter)
-		: "Ir" (i), "m" (v->counter)
-		: "memory");
-
-		result = temp - i;
-	} else if (kernel_uses_llsc) {
-		int temp;
-
-		do {
-			__asm__ __volatile__(
-			"	.set	arch=r4000			\n"
-			"	ll	%1, %2	# atomic_sub_return	\n"
-			"	subu	%0, %1, %3			\n"
-			"	sc	%0, %2				\n"
-			"	.set	mips0				\n"
-			: "=&r" (result), "=&r" (temp), "+m" (v->counter)
-			: "Ir" (i));
-		} while (unlikely(!result));
-
-		result = temp - i;
-	} else {
-		unsigned long flags;
-
-		raw_local_irq_save(flags);
-		result = v->counter;
-		result -= i;
-		v->counter = result;
-		raw_local_irq_restore(flags);
-	}
-
-	smp_llsc_mb();
-
-	return result;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
 /*
  * atomic_sub_if_positive - conditionally subtract integer from atomic variable
@@ -407,195 +315,104 @@
  */
 #define atomic64_set(v, i)	((v)->counter = (i))
 
-/*
- * atomic64_add - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic64_t
- *
- * Atomically adds @i to @v.
- */
-static __inline__ void atomic64_add(long i, atomic64_t * v)
-{
-	if (kernel_uses_llsc && R10000_LLSC_WAR) {
-		long temp;
+#define ATOMIC64_OP(op, c_op, asm_op)						\
+static __inline__ void atomic64_##op(long i, atomic64_t * v)			\
+{										\
+	if (kernel_uses_llsc && R10000_LLSC_WAR) {				\
+		long temp;							\
+										\
+		__asm__ __volatile__(						\
+		"	.set	arch=r4000				\n"	\
+		"1:	lld	%0, %1		# atomic64_" #op "	\n"	\
+		"	" #asm_op " %0, %2				\n"	\
+		"	scd	%0, %1					\n"	\
+		"	beqzl	%0, 1b					\n"	\
+		"	.set	mips0					\n"	\
+		: "=&r" (temp), "+m" (v->counter)				\
+		: "Ir" (i));							\
+	} else if (kernel_uses_llsc) {						\
+		long temp;							\
+										\
+		do {								\
+			__asm__ __volatile__(					\
+			"	.set	arch=r4000			\n"	\
+			"	lld	%0, %1		# atomic64_" #op "\n"	\
+			"	" #asm_op " %0, %2			\n"	\
+			"	scd	%0, %1				\n"	\
+			"	.set	mips0				\n"	\
+			: "=&r" (temp), "+m" (v->counter)			\
+			: "Ir" (i));						\
+		} while (unlikely(!temp));					\
+	} else {								\
+		unsigned long flags;						\
+										\
+		raw_local_irq_save(flags);					\
+		v->counter c_op i;						\
+		raw_local_irq_restore(flags);					\
+	}									\
+}										\
 
-		__asm__ __volatile__(
-		"	.set	arch=r4000				\n"
-		"1:	lld	%0, %1		# atomic64_add		\n"
-		"	daddu	%0, %2					\n"
-		"	scd	%0, %1					\n"
-		"	beqzl	%0, 1b					\n"
-		"	.set	mips0					\n"
-		: "=&r" (temp), "+m" (v->counter)
-		: "Ir" (i));
-	} else if (kernel_uses_llsc) {
-		long temp;
-
-		do {
-			__asm__ __volatile__(
-			"	.set	arch=r4000			\n"
-			"	lld	%0, %1		# atomic64_add	\n"
-			"	daddu	%0, %2				\n"
-			"	scd	%0, %1				\n"
-			"	.set	mips0				\n"
-			: "=&r" (temp), "+m" (v->counter)
-			: "Ir" (i));
-		} while (unlikely(!temp));
-	} else {
-		unsigned long flags;
-
-		raw_local_irq_save(flags);
-		v->counter += i;
-		raw_local_irq_restore(flags);
-	}
+#define ATOMIC64_OP_RETURN(op, c_op, asm_op)					\
+static __inline__ long atomic64_##op##_return(long i, atomic64_t * v)		\
+{										\
+	long result;								\
+										\
+	smp_mb__before_llsc();							\
+										\
+	if (kernel_uses_llsc && R10000_LLSC_WAR) {				\
+		long temp;							\
+										\
+		__asm__ __volatile__(						\
+		"	.set	arch=r4000				\n"	\
+		"1:	lld	%1, %2		# atomic64_" #op "_return\n"	\
+		"	" #asm_op " %0, %1, %3				\n"	\
+		"	scd	%0, %2					\n"	\
+		"	beqzl	%0, 1b					\n"	\
+		"	" #asm_op " %0, %1, %3				\n"	\
+		"	.set	mips0					\n"	\
+		: "=&r" (result), "=&r" (temp), "+m" (v->counter)		\
+		: "Ir" (i));							\
+	} else if (kernel_uses_llsc) {						\
+		long temp;							\
+										\
+		do {								\
+			__asm__ __volatile__(					\
+			"	.set	arch=r4000			\n"	\
+			"	lld	%1, %2	# atomic64_" #op "_return\n"	\
+			"	" #asm_op " %0, %1, %3			\n"	\
+			"	scd	%0, %2				\n"	\
+			"	.set	mips0				\n"	\
+			: "=&r" (result), "=&r" (temp), "=m" (v->counter)	\
+			: "Ir" (i), "m" (v->counter)				\
+			: "memory");						\
+		} while (unlikely(!result));					\
+										\
+		result = temp + i;						\
+	} else {								\
+		unsigned long flags;						\
+										\
+		raw_local_irq_save(flags);					\
+		result = v->counter;						\
+		result c_op i;							\
+		v->counter = result;						\
+		raw_local_irq_restore(flags);					\
+	}									\
+										\
+	smp_llsc_mb();								\
+										\
+	return result;								\
 }
 
-/*
- * atomic64_sub - subtract the atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic64_t
- *
- * Atomically subtracts @i from @v.
- */
-static __inline__ void atomic64_sub(long i, atomic64_t * v)
-{
-	if (kernel_uses_llsc && R10000_LLSC_WAR) {
-		long temp;
+#define ATOMIC64_OPS(op, c_op, asm_op)						\
+	ATOMIC64_OP(op, c_op, asm_op)						\
+	ATOMIC64_OP_RETURN(op, c_op, asm_op)
 
-		__asm__ __volatile__(
-		"	.set	arch=r4000				\n"
-		"1:	lld	%0, %1		# atomic64_sub		\n"
-		"	dsubu	%0, %2					\n"
-		"	scd	%0, %1					\n"
-		"	beqzl	%0, 1b					\n"
-		"	.set	mips0					\n"
-		: "=&r" (temp), "+m" (v->counter)
-		: "Ir" (i));
-	} else if (kernel_uses_llsc) {
-		long temp;
+ATOMIC64_OPS(add, +=, daddu)
+ATOMIC64_OPS(sub, -=, dsubu)
 
-		do {
-			__asm__ __volatile__(
-			"	.set	arch=r4000			\n"
-			"	lld	%0, %1		# atomic64_sub	\n"
-			"	dsubu	%0, %2				\n"
-			"	scd	%0, %1				\n"
-			"	.set	mips0				\n"
-			: "=&r" (temp), "+m" (v->counter)
-			: "Ir" (i));
-		} while (unlikely(!temp));
-	} else {
-		unsigned long flags;
-
-		raw_local_irq_save(flags);
-		v->counter -= i;
-		raw_local_irq_restore(flags);
-	}
-}
-
-/*
- * Same as above, but return the result value
- */
-static __inline__ long atomic64_add_return(long i, atomic64_t * v)
-{
-	long result;
-
-	smp_mb__before_llsc();
-
-	if (kernel_uses_llsc && R10000_LLSC_WAR) {
-		long temp;
-
-		__asm__ __volatile__(
-		"	.set	arch=r4000				\n"
-		"1:	lld	%1, %2		# atomic64_add_return	\n"
-		"	daddu	%0, %1, %3				\n"
-		"	scd	%0, %2					\n"
-		"	beqzl	%0, 1b					\n"
-		"	daddu	%0, %1, %3				\n"
-		"	.set	mips0					\n"
-		: "=&r" (result), "=&r" (temp), "+m" (v->counter)
-		: "Ir" (i));
-	} else if (kernel_uses_llsc) {
-		long temp;
-
-		do {
-			__asm__ __volatile__(
-			"	.set	arch=r4000			\n"
-			"	lld	%1, %2	# atomic64_add_return	\n"
-			"	daddu	%0, %1, %3			\n"
-			"	scd	%0, %2				\n"
-			"	.set	mips0				\n"
-			: "=&r" (result), "=&r" (temp), "=m" (v->counter)
-			: "Ir" (i), "m" (v->counter)
-			: "memory");
-		} while (unlikely(!result));
-
-		result = temp + i;
-	} else {
-		unsigned long flags;
-
-		raw_local_irq_save(flags);
-		result = v->counter;
-		result += i;
-		v->counter = result;
-		raw_local_irq_restore(flags);
-	}
-
-	smp_llsc_mb();
-
-	return result;
-}
-
-static __inline__ long atomic64_sub_return(long i, atomic64_t * v)
-{
-	long result;
-
-	smp_mb__before_llsc();
-
-	if (kernel_uses_llsc && R10000_LLSC_WAR) {
-		long temp;
-
-		__asm__ __volatile__(
-		"	.set	arch=r4000				\n"
-		"1:	lld	%1, %2		# atomic64_sub_return	\n"
-		"	dsubu	%0, %1, %3				\n"
-		"	scd	%0, %2					\n"
-		"	beqzl	%0, 1b					\n"
-		"	dsubu	%0, %1, %3				\n"
-		"	.set	mips0					\n"
-		: "=&r" (result), "=&r" (temp), "=m" (v->counter)
-		: "Ir" (i), "m" (v->counter)
-		: "memory");
-	} else if (kernel_uses_llsc) {
-		long temp;
-
-		do {
-			__asm__ __volatile__(
-			"	.set	arch=r4000			\n"
-			"	lld	%1, %2	# atomic64_sub_return	\n"
-			"	dsubu	%0, %1, %3			\n"
-			"	scd	%0, %2				\n"
-			"	.set	mips0				\n"
-			: "=&r" (result), "=&r" (temp), "=m" (v->counter)
-			: "Ir" (i), "m" (v->counter)
-			: "memory");
-		} while (unlikely(!result));
-
-		result = temp - i;
-	} else {
-		unsigned long flags;
-
-		raw_local_irq_save(flags);
-		result = v->counter;
-		result -= i;
-		v->counter = result;
-		raw_local_irq_restore(flags);
-	}
-
-	smp_llsc_mb();
-
-	return result;
-}
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
 
 /*
  * atomic64_sub_if_positive - conditionally subtract integer from atomic variable
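
Every generated MIPS op keeps the same three-way shape: a branch-likely
(beqzl) loop for the R10000 workaround, a plain ll/sc retry loop, and an
irq-off fallback, with the _return variants bracketed by
smp_mb__before_llsc()/smp_llsc_mb(). A compressed plain-C model, where a
__atomic CAS loop stands in for both ll/sc paths:

#include <stdbool.h>
#include <stdio.h>

typedef struct { int counter; } atomic_t;

static const bool kernel_uses_llsc = true;	/* stand-in for the real checks */

static int atomic_add_return(int i, atomic_t *v)
{
	int result;

	/* smp_mb__before_llsc() would sit here */
	if (kernel_uses_llsc) {
		int old = v->counter;

		/* CAS loop standing in for "ll; addu; sc; beqz(l)" */
		while (!__atomic_compare_exchange_n(&v->counter, &old, old + i,
				0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
			;
		result = old + i;
	} else {
		/* raw_local_irq_save()/restore() path for non-ll/sc cores */
		result = (v->counter += i);
	}
	/* smp_llsc_mb() would sit here */

	return result;
}

int main(void)
{
	atomic_t v = { 1 };

	printf("%d\n", atomic_add_return(41, &v));	/* 42 */
	return 0;
}
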
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 645b3c4..f7aac5b 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -770,7 +770,7 @@
 	long ret = 0;
 	user_exit();
 
-	if (secure_computing(syscall) == -1)
+	if (secure_computing() == -1)
 		return -1;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h
index cadeb1e..5be655e 100644
--- a/arch/mn10300/include/asm/atomic.h
+++ b/arch/mn10300/include/asm/atomic.h
@@ -33,7 +33,6 @@
  * @v: pointer of type atomic_t
  *
- * Atomically reads the value of @v.  Note that the guaranteed
- * useful range of an atomic_t is only 24 bits.
+ * Atomically reads the value of @v.
  */
 #define atomic_read(v)	(ACCESS_ONCE((v)->counter))
 
@@ -43,102 +42,62 @@
  * @i: required value
  *
- * Atomically sets the value of @v to @i.  Note that the guaranteed
- * useful range of an atomic_t is only 24 bits.
+ * Atomically sets the value of @v to @i.
  */
 #define atomic_set(v, i) (((v)->counter) = (i))
 
-/**
- * atomic_add_return - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v and returns the result
- * Note that the guaranteed useful range of an atomic_t is only 24 bits.
- */
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	int retval;
-#ifdef CONFIG_SMP
-	int status;
-
-	asm volatile(
-		"1:	mov	%4,(_AAR,%3)	\n"
-		"	mov	(_ADR,%3),%1	\n"
-		"	add	%5,%1		\n"
-		"	mov	%1,(_ADR,%3)	\n"
-		"	mov	(_ADR,%3),%0	\n"	/* flush */
-		"	mov	(_ASR,%3),%0	\n"
-		"	or	%0,%0		\n"
-		"	bne	1b		\n"
-		: "=&r"(status), "=&r"(retval), "=m"(v->counter)
-		: "a"(ATOMIC_OPS_BASE_ADDR), "r"(&v->counter), "r"(i)
-		: "memory", "cc");
-
-#else
-	unsigned long flags;
-
-	flags = arch_local_cli_save();
-	retval = v->counter;
-	retval += i;
-	v->counter = retval;
-	arch_local_irq_restore(flags);
-#endif
-	return retval;
+#define ATOMIC_OP(op)							\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	int retval, status;						\
+									\
+	asm volatile(							\
+		"1:	mov	%4,(_AAR,%3)	\n"			\
+		"	mov	(_ADR,%3),%1	\n"			\
+		"	" #op "	%5,%1		\n"			\
+		"	mov	%1,(_ADR,%3)	\n"			\
+		"	mov	(_ADR,%3),%0	\n"	/* flush */	\
+		"	mov	(_ASR,%3),%0	\n"			\
+		"	or	%0,%0		\n"			\
+		"	bne	1b		\n"			\
+		: "=&r"(status), "=&r"(retval), "=m"(v->counter)	\
+		: "a"(ATOMIC_OPS_BASE_ADDR), "r"(&v->counter), "r"(i)	\
+		: "memory", "cc");					\
 }
 
-/**
- * atomic_sub_return - subtract integer from atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v and returns the result
- * Note that the guaranteed useful range of an atomic_t is only 24 bits.
- */
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	int retval;
-#ifdef CONFIG_SMP
-	int status;
-
-	asm volatile(
-		"1:	mov	%4,(_AAR,%3)	\n"
-		"	mov	(_ADR,%3),%1	\n"
-		"	sub	%5,%1		\n"
-		"	mov	%1,(_ADR,%3)	\n"
-		"	mov	(_ADR,%3),%0	\n"	/* flush */
-		"	mov	(_ASR,%3),%0	\n"
-		"	or	%0,%0		\n"
-		"	bne	1b		\n"
-		: "=&r"(status), "=&r"(retval), "=m"(v->counter)
-		: "a"(ATOMIC_OPS_BASE_ADDR), "r"(&v->counter), "r"(i)
-		: "memory", "cc");
-
-#else
-	unsigned long flags;
-	flags = arch_local_cli_save();
-	retval = v->counter;
-	retval -= i;
-	v->counter = retval;
-	arch_local_irq_restore(flags);
-#endif
-	return retval;
+#define ATOMIC_OP_RETURN(op)						\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	int retval, status;						\
+									\
+	asm volatile(							\
+		"1:	mov	%4,(_AAR,%3)	\n"			\
+		"	mov	(_ADR,%3),%1	\n"			\
+		"	" #op "	%5,%1		\n"			\
+		"	mov	%1,(_ADR,%3)	\n"			\
+		"	mov	(_ADR,%3),%0	\n"	/* flush */	\
+		"	mov	(_ASR,%3),%0	\n"			\
+		"	or	%0,%0		\n"			\
+		"	bne	1b		\n"			\
+		: "=&r"(status), "=&r"(retval), "=m"(v->counter)	\
+		: "a"(ATOMIC_OPS_BASE_ADDR), "r"(&v->counter), "r"(i)	\
+		: "memory", "cc");					\
+	return retval;							\
 }
 
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
+
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
 static inline int atomic_add_negative(int i, atomic_t *v)
 {
 	return atomic_add_return(i, v) < 0;
 }
 
-static inline void atomic_add(int i, atomic_t *v)
-{
-	atomic_add_return(i, v);
-}
-
-static inline void atomic_sub(int i, atomic_t *v)
-{
-	atomic_sub_return(i, v);
-}
-
 static inline void atomic_inc(atomic_t *v)
 {
 	atomic_add_return(1, v);
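
All of the conversions in this series follow the template visible above: ATOMIC_OP(op) token-pastes the operation name into a void atomic_<op>(), ATOMIC_OP_RETURN(op) does the same for the value-returning variant, ATOMIC_OPS() instantiates both for add and sub, and the generator macros are #undef'd immediately afterwards. A minimal user-space sketch of the technique (illustrative only, not kernel code; the plain read-modify-write below is not actually atomic):

#include <stdio.h>

typedef struct { int counter; } atomic_t;

/* Token pasting builds atomic_add()/atomic_sub() from one template;
 * a plain read-modify-write stands in for the arch-specific asm. */
#define ATOMIC_OP(op, c_op)						\
static void atomic_##op(int i, atomic_t *v)				\
{									\
	v->counter c_op i;						\
}

#define ATOMIC_OP_RETURN(op, c_op)					\
static int atomic_##op##_return(int i, atomic_t *v)			\
{									\
	return v->counter c_op i;					\
}

#define ATOMIC_OPS(op, c_op) ATOMIC_OP(op, c_op) ATOMIC_OP_RETURN(op, c_op)

ATOMIC_OPS(add, +=)
ATOMIC_OPS(sub, -=)

#undef ATOMIC_OPS
#undef ATOMIC_OP_RETURN
#undef ATOMIC_OP

int main(void)
{
	atomic_t v = { 38 };

	atomic_add(3, &v);				/* 41 */
	atomic_sub(1, &v);				/* 40 */
	printf("%d\n", atomic_add_return(2, &v));	/* 42 */
	printf("%d\n", atomic_sub_return(0, &v));	/* 42 */
	return 0;
}
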
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 0be2db2..219750b 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -55,24 +55,7 @@
  * are atomic, so a reader never sees inconsistent values.
  */
 
-/* It's possible to reduce all atomic operations to either
- * __atomic_add_return, atomic_set and atomic_read (the latter
- * is there only for consistency).
- */
-
-static __inline__ int __atomic_add_return(int i, atomic_t *v)
-{
-	int ret;
-	unsigned long flags;
-	_atomic_spin_lock_irqsave(v, flags);
-
-	ret = (v->counter += i);
-
-	_atomic_spin_unlock_irqrestore(v, flags);
-	return ret;
-}
-
-static __inline__ void atomic_set(atomic_t *v, int i) 
+static __inline__ void atomic_set(atomic_t *v, int i)
 {
 	unsigned long flags;
 	_atomic_spin_lock_irqsave(v, flags);
@@ -115,16 +98,43 @@
 	return c;
 }
 
+#define ATOMIC_OP(op, c_op)						\
+static __inline__ void atomic_##op(int i, atomic_t *v)			\
+{									\
+	unsigned long flags;						\
+									\
+	_atomic_spin_lock_irqsave(v, flags);				\
+	v->counter c_op i;						\
+	_atomic_spin_unlock_irqrestore(v, flags);			\
+}									\
 
-#define atomic_add(i,v)	((void)(__atomic_add_return(        (i),(v))))
-#define atomic_sub(i,v)	((void)(__atomic_add_return(-((int) (i)),(v))))
-#define atomic_inc(v)	((void)(__atomic_add_return(   1,(v))))
-#define atomic_dec(v)	((void)(__atomic_add_return(  -1,(v))))
+#define ATOMIC_OP_RETURN(op, c_op)					\
+static __inline__ int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	unsigned long flags;						\
+	int ret;							\
+									\
+	_atomic_spin_lock_irqsave(v, flags);				\
+	ret = (v->counter c_op i);					\
+	_atomic_spin_unlock_irqrestore(v, flags);			\
+									\
+	return ret;							\
+}
 
-#define atomic_add_return(i,v)	(__atomic_add_return( (i),(v)))
-#define atomic_sub_return(i,v)	(__atomic_add_return(-(i),(v)))
-#define atomic_inc_return(v)	(__atomic_add_return(   1,(v)))
-#define atomic_dec_return(v)	(__atomic_add_return(  -1,(v)))
+#define ATOMIC_OPS(op, c_op) ATOMIC_OP(op, c_op) ATOMIC_OP_RETURN(op, c_op)
+
+ATOMIC_OPS(add, +=)
+ATOMIC_OPS(sub, -=)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
+#define atomic_inc(v)	(atomic_add(   1,(v)))
+#define atomic_dec(v)	(atomic_add(  -1,(v)))
+
+#define atomic_inc_return(v)	(atomic_add_return(   1,(v)))
+#define atomic_dec_return(v)	(atomic_add_return(  -1,(v)))
 
 #define atomic_add_negative(a, v)	(atomic_add_return((a), (v)) < 0)
 
@@ -148,19 +158,38 @@
 
 #define ATOMIC64_INIT(i) { (i) }
 
-static __inline__ s64
-__atomic64_add_return(s64 i, atomic64_t *v)
-{
-	s64 ret;
-	unsigned long flags;
-	_atomic_spin_lock_irqsave(v, flags);
+#define ATOMIC64_OP(op, c_op)						\
+static __inline__ void atomic64_##op(s64 i, atomic64_t *v)		\
+{									\
+	unsigned long flags;						\
+									\
+	_atomic_spin_lock_irqsave(v, flags);				\
+	v->counter c_op i;						\
+	_atomic_spin_unlock_irqrestore(v, flags);			\
+}									\
 
-	ret = (v->counter += i);
-
-	_atomic_spin_unlock_irqrestore(v, flags);
-	return ret;
+#define ATOMIC64_OP_RETURN(op, c_op)					\
+static __inline__ s64 atomic64_##op##_return(s64 i, atomic64_t *v)	\
+{									\
+	unsigned long flags;						\
+	s64 ret;							\
+									\
+	_atomic_spin_lock_irqsave(v, flags);				\
+	ret = (v->counter c_op i);					\
+	_atomic_spin_unlock_irqrestore(v, flags);			\
+									\
+	return ret;							\
 }
 
+#define ATOMIC64_OPS(op, c_op) ATOMIC64_OP(op, c_op) ATOMIC64_OP_RETURN(op, c_op)
+
+ATOMIC64_OPS(add, +=)
+ATOMIC64_OPS(sub, -=)
+
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
+
 static __inline__ void
 atomic64_set(atomic64_t *v, s64 i)
 {
@@ -178,15 +207,11 @@
 	return (*(volatile long *)&(v)->counter);
 }
 
-#define atomic64_add(i,v)	((void)(__atomic64_add_return( ((s64)(i)),(v))))
-#define atomic64_sub(i,v)	((void)(__atomic64_add_return(-((s64)(i)),(v))))
-#define atomic64_inc(v)		((void)(__atomic64_add_return(   1,(v))))
-#define atomic64_dec(v)		((void)(__atomic64_add_return(  -1,(v))))
+#define atomic64_inc(v)		(atomic64_add(   1,(v)))
+#define atomic64_dec(v)		(atomic64_add(  -1,(v)))
 
-#define atomic64_add_return(i,v)	(__atomic64_add_return( ((s64)(i)),(v)))
-#define atomic64_sub_return(i,v)	(__atomic64_add_return(-((s64)(i)),(v)))
-#define atomic64_inc_return(v)		(__atomic64_add_return(   1,(v)))
-#define atomic64_dec_return(v)		(__atomic64_add_return(  -1,(v)))
+#define atomic64_inc_return(v)		(atomic64_add_return(   1,(v)))
+#define atomic64_dec_return(v)		(atomic64_add_return(  -1,(v)))
 
 #define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
 
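parisc has no suitable atomic read-modify-write instruction here, so each atomic_t is serialized by one spinlock out of a small array, chosen by hashing the variable's address; that is what _atomic_spin_lock_irqsave()/_atomic_spin_unlock_irqrestore() hide. A rough user-space approximation of the idea, with POSIX mutexes standing in for the kernel's irq-disabling spinlocks:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { int counter; } atomic_t;

/* Pick one lock out of a small array by hashing the variable's
 * address, loosely mirroring the kernel's ATOMIC_HASH(). */
#define ATOMIC_HASH_SIZE 4
#define ATOMIC_HASH(a) \
	(&hash_locks[((uintptr_t)(a) >> 4) & (ATOMIC_HASH_SIZE - 1)])

static pthread_mutex_t hash_locks[ATOMIC_HASH_SIZE] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static int atomic_add_return(int i, atomic_t *v)
{
	int ret;

	pthread_mutex_lock(ATOMIC_HASH(v));
	ret = (v->counter += i);
	pthread_mutex_unlock(ATOMIC_HASH(v));
	return ret;
}

int main(void)
{
	atomic_t v = { 40 };

	printf("%d\n", atomic_add_return(2, &v));	/* 42 */
	return 0;
}
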
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 28992d0..512d278 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -26,76 +26,53 @@
 	__asm__ __volatile__("stw%U0%X0 %1,%0" : "=m"(v->counter) : "r"(i));
 }
 
-static __inline__ void atomic_add(int a, atomic_t *v)
-{
-	int t;
+#define ATOMIC_OP(op, asm_op)						\
+static __inline__ void atomic_##op(int a, atomic_t *v)			\
+{									\
+	int t;								\
+									\
+	__asm__ __volatile__(						\
+"1:	lwarx	%0,0,%3		# atomic_" #op "\n"			\
+	#asm_op " %0,%2,%0\n"						\
+	PPC405_ERR77(0,%3)						\
+"	stwcx.	%0,0,%3 \n"						\
+"	bne-	1b\n"							\
+	: "=&r" (t), "+m" (v->counter)					\
+	: "r" (a), "r" (&v->counter)					\
+	: "cc");							\
+}									\
 
-	__asm__ __volatile__(
-"1:	lwarx	%0,0,%3		# atomic_add\n\
-	add	%0,%2,%0\n"
-	PPC405_ERR77(0,%3)
-"	stwcx.	%0,0,%3 \n\
-	bne-	1b"
-	: "=&r" (t), "+m" (v->counter)
-	: "r" (a), "r" (&v->counter)
-	: "cc");
+#define ATOMIC_OP_RETURN(op, asm_op)					\
+static __inline__ int atomic_##op##_return(int a, atomic_t *v)		\
+{									\
+	int t;								\
+									\
+	__asm__ __volatile__(						\
+	PPC_ATOMIC_ENTRY_BARRIER					\
+"1:	lwarx	%0,0,%2		# atomic_" #op "_return\n"		\
+	#asm_op " %0,%1,%0\n"						\
+	PPC405_ERR77(0,%2)						\
+"	stwcx.	%0,0,%2 \n"						\
+"	bne-	1b\n"							\
+	PPC_ATOMIC_EXIT_BARRIER						\
+	: "=&r" (t)							\
+	: "r" (a), "r" (&v->counter)					\
+	: "cc", "memory");						\
+									\
+	return t;							\
 }
 
-static __inline__ int atomic_add_return(int a, atomic_t *v)
-{
-	int t;
+#define ATOMIC_OPS(op, asm_op) ATOMIC_OP(op, asm_op) ATOMIC_OP_RETURN(op, asm_op)
 
-	__asm__ __volatile__(
-	PPC_ATOMIC_ENTRY_BARRIER
-"1:	lwarx	%0,0,%2		# atomic_add_return\n\
-	add	%0,%1,%0\n"
-	PPC405_ERR77(0,%2)
-"	stwcx.	%0,0,%2 \n\
-	bne-	1b"
-	PPC_ATOMIC_EXIT_BARRIER
-	: "=&r" (t)
-	: "r" (a), "r" (&v->counter)
-	: "cc", "memory");
+ATOMIC_OPS(add, add)
+ATOMIC_OPS(sub, subf)
 
-	return t;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
 #define atomic_add_negative(a, v)	(atomic_add_return((a), (v)) < 0)
 
-static __inline__ void atomic_sub(int a, atomic_t *v)
-{
-	int t;
-
-	__asm__ __volatile__(
-"1:	lwarx	%0,0,%3		# atomic_sub\n\
-	subf	%0,%2,%0\n"
-	PPC405_ERR77(0,%3)
-"	stwcx.	%0,0,%3 \n\
-	bne-	1b"
-	: "=&r" (t), "+m" (v->counter)
-	: "r" (a), "r" (&v->counter)
-	: "cc");
-}
-
-static __inline__ int atomic_sub_return(int a, atomic_t *v)
-{
-	int t;
-
-	__asm__ __volatile__(
-	PPC_ATOMIC_ENTRY_BARRIER
-"1:	lwarx	%0,0,%2		# atomic_sub_return\n\
-	subf	%0,%1,%0\n"
-	PPC405_ERR77(0,%2)
-"	stwcx.	%0,0,%2 \n\
-	bne-	1b"
-	PPC_ATOMIC_EXIT_BARRIER
-	: "=&r" (t)
-	: "r" (a), "r" (&v->counter)
-	: "cc", "memory");
-
-	return t;
-}
-
 static __inline__ void atomic_inc(atomic_t *v)
 {
 	int t;
@@ -289,72 +266,51 @@
 	__asm__ __volatile__("std%U0%X0 %1,%0" : "=m"(v->counter) : "r"(i));
 }
 
-static __inline__ void atomic64_add(long a, atomic64_t *v)
-{
-	long t;
-
-	__asm__ __volatile__(
-"1:	ldarx	%0,0,%3		# atomic64_add\n\
-	add	%0,%2,%0\n\
-	stdcx.	%0,0,%3 \n\
-	bne-	1b"
-	: "=&r" (t), "+m" (v->counter)
-	: "r" (a), "r" (&v->counter)
-	: "cc");
+#define ATOMIC64_OP(op, asm_op)						\
+static __inline__ void atomic64_##op(long a, atomic64_t *v)		\
+{									\
+	long t;								\
+									\
+	__asm__ __volatile__(						\
+"1:	ldarx	%0,0,%3		# atomic64_" #op "\n"			\
+	#asm_op " %0,%2,%0\n"						\
+"	stdcx.	%0,0,%3 \n"						\
+"	bne-	1b\n"							\
+	: "=&r" (t), "+m" (v->counter)					\
+	: "r" (a), "r" (&v->counter)					\
+	: "cc");							\
 }
 
-static __inline__ long atomic64_add_return(long a, atomic64_t *v)
-{
-	long t;
-
-	__asm__ __volatile__(
-	PPC_ATOMIC_ENTRY_BARRIER
-"1:	ldarx	%0,0,%2		# atomic64_add_return\n\
-	add	%0,%1,%0\n\
-	stdcx.	%0,0,%2 \n\
-	bne-	1b"
-	PPC_ATOMIC_EXIT_BARRIER
-	: "=&r" (t)
-	: "r" (a), "r" (&v->counter)
-	: "cc", "memory");
-
-	return t;
+#define ATOMIC64_OP_RETURN(op, asm_op)					\
+static __inline__ long atomic64_##op##_return(long a, atomic64_t *v)	\
+{									\
+	long t;								\
+									\
+	__asm__ __volatile__(						\
+	PPC_ATOMIC_ENTRY_BARRIER					\
+"1:	ldarx	%0,0,%2		# atomic64_" #op "_return\n"		\
+	#asm_op " %0,%1,%0\n"						\
+"	stdcx.	%0,0,%2 \n"						\
+"	bne-	1b\n"							\
+	PPC_ATOMIC_EXIT_BARRIER						\
+	: "=&r" (t)							\
+	: "r" (a), "r" (&v->counter)					\
+	: "cc", "memory");						\
+									\
+	return t;							\
 }
 
+#define ATOMIC64_OPS(op, asm_op) ATOMIC64_OP(op, asm_op) ATOMIC64_OP_RETURN(op, asm_op)
+
+ATOMIC64_OPS(add, add)
+ATOMIC64_OPS(sub, subf)
+
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
+
 #define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
 
-static __inline__ void atomic64_sub(long a, atomic64_t *v)
-{
-	long t;
-
-	__asm__ __volatile__(
-"1:	ldarx	%0,0,%3		# atomic64_sub\n\
-	subf	%0,%2,%0\n\
-	stdcx.	%0,0,%3 \n\
-	bne-	1b"
-	: "=&r" (t), "+m" (v->counter)
-	: "r" (a), "r" (&v->counter)
-	: "cc");
-}
-
-static __inline__ long atomic64_sub_return(long a, atomic64_t *v)
-{
-	long t;
-
-	__asm__ __volatile__(
-	PPC_ATOMIC_ENTRY_BARRIER
-"1:	ldarx	%0,0,%2		# atomic64_sub_return\n\
-	subf	%0,%1,%0\n\
-	stdcx.	%0,0,%2 \n\
-	bne-	1b"
-	PPC_ATOMIC_EXIT_BARRIER
-	: "=&r" (t)
-	: "r" (a), "r" (&v->counter)
-	: "cc", "memory");
-
-	return t;
-}
-
 static __inline__ void atomic64_inc(atomic64_t *v)
 {
 	long t;
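
The lwarx/stwcx. (and ldarx/stdcx.) pairs generated above are PowerPC's load-linked/store-conditional: the conditional store fails if the reservation was disturbed in the meantime, and bne- 1b retries the whole sequence. The same control flow restated with C11 atomics; a sketch only, since the kernel's _return variants additionally wrap the sequence in PPC_ATOMIC_ENTRY_BARRIER/PPC_ATOMIC_EXIT_BARRIER for memory ordering:

#include <stdatomic.h>
#include <stdio.h>

/* Keep trying until the conditional store (here: a weak CAS)
 * succeeds without interference. On failure 'old' is refreshed
 * with the current value, like the reload after bne- 1b. */
static int atomic_add_return_c11(int i, _Atomic int *v)
{
	int old = atomic_load_explicit(v, memory_order_relaxed);

	while (!atomic_compare_exchange_weak_explicit(v, &old, old + i,
						      memory_order_seq_cst,
						      memory_order_relaxed))
		;
	return old + i;
}

int main(void)
{
	_Atomic int v = 40;

	printf("%d\n", atomic_add_return_c11(2, &v));	/* 42 */
	return 0;
}
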
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 72c20bb..79294c4 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -62,10 +62,10 @@
 	}
 
 	kvm->arch.hpt_cma_alloc = 0;
-	page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
+	page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT));
 	if (page) {
 		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
-		memset((void *)hpt, 0, (1 << order));
+		memset((void *)hpt, 0, (1ul << order));
 		kvm->arch.hpt_cma_alloc = 1;
 	}
 
diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h
index 2fcccc0..c81661e 100644
--- a/arch/s390/include/asm/ipl.h
+++ b/arch/s390/include/asm/ipl.h
@@ -17,12 +17,12 @@
 #define IPL_PARM_BLK_FCP_LEN (sizeof(struct ipl_list_hdr) + \
 			      sizeof(struct ipl_block_fcp))
 
-#define IPL_PARM_BLK0_FCP_LEN (sizeof(struct ipl_block_fcp) + 8)
+#define IPL_PARM_BLK0_FCP_LEN (sizeof(struct ipl_block_fcp) + 16)
 
 #define IPL_PARM_BLK_CCW_LEN (sizeof(struct ipl_list_hdr) + \
 			      sizeof(struct ipl_block_ccw))
 
-#define IPL_PARM_BLK0_CCW_LEN (sizeof(struct ipl_block_ccw) + 8)
+#define IPL_PARM_BLK0_CCW_LEN (sizeof(struct ipl_block_ccw) + 16)
 
 #define IPL_MAX_SUPPORTED_VERSION (0)
 
@@ -38,10 +38,11 @@
 	u8  pbt;
 	u8  flags;
 	u16 reserved2;
+	u8  loadparm[8];
 } __attribute__((packed));
 
 struct ipl_block_fcp {
-	u8  reserved1[313-1];
+	u8  reserved1[305-1];
 	u8  opt;
 	u8  reserved2[3];
 	u16 reserved3;
@@ -62,7 +63,6 @@
 				 offsetof(struct ipl_block_fcp, scp_data)))
 
 struct ipl_block_ccw {
-	u8  load_parm[8];
 	u8  reserved1[84];
 	u8  reserved2[2];
 	u16 devno;
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index b76317c..5efb2fe 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1127,7 +1127,7 @@
 					    unsigned long addr, pte_t *ptep)
 {
 	pgste_t pgste;
-	pte_t pte;
+	pte_t pte, oldpte;
 	int young;
 
 	if (mm_has_pgste(vma->vm_mm)) {
@@ -1135,12 +1135,13 @@
 		pgste = pgste_ipte_notify(vma->vm_mm, ptep, pgste);
 	}
 
-	pte = *ptep;
+	oldpte = pte = *ptep;
 	ptep_flush_direct(vma->vm_mm, addr, ptep);
 	young = pte_young(pte);
 	pte = pte_mkold(pte);
 
 	if (mm_has_pgste(vma->vm_mm)) {
+		pgste = pgste_update_all(&oldpte, pgste, vma->vm_mm);
 		pgste = pgste_set_pte(ptep, pgste, pte);
 		pgste_set_unlock(ptep, pgste);
 	} else
@@ -1330,6 +1331,7 @@
 	ptep_flush_direct(vma->vm_mm, address, ptep);
 
 	if (mm_has_pgste(vma->vm_mm)) {
+		pgste_set_key(ptep, pgste, entry, vma->vm_mm);
 		pgste = pgste_set_pte(ptep, pgste, entry);
 		pgste_set_unlock(ptep, pgste);
 	} else
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 22aac58..39badb9 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -455,22 +455,6 @@
 DEFINE_IPL_ATTR_RO(ipl_fcp, br_lba, "%lld\n", (unsigned long long)
 		   IPL_PARMBLOCK_START->ipl_info.fcp.br_lba);
 
-static struct attribute *ipl_fcp_attrs[] = {
-	&sys_ipl_type_attr.attr,
-	&sys_ipl_device_attr.attr,
-	&sys_ipl_fcp_wwpn_attr.attr,
-	&sys_ipl_fcp_lun_attr.attr,
-	&sys_ipl_fcp_bootprog_attr.attr,
-	&sys_ipl_fcp_br_lba_attr.attr,
-	NULL,
-};
-
-static struct attribute_group ipl_fcp_attr_group = {
-	.attrs = ipl_fcp_attrs,
-};
-
-/* CCW ipl device attributes */
-
 static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj,
 				     struct kobj_attribute *attr, char *page)
 {
@@ -487,6 +471,23 @@
 static struct kobj_attribute sys_ipl_ccw_loadparm_attr =
 	__ATTR(loadparm, 0444, ipl_ccw_loadparm_show, NULL);
 
+static struct attribute *ipl_fcp_attrs[] = {
+	&sys_ipl_type_attr.attr,
+	&sys_ipl_device_attr.attr,
+	&sys_ipl_fcp_wwpn_attr.attr,
+	&sys_ipl_fcp_lun_attr.attr,
+	&sys_ipl_fcp_bootprog_attr.attr,
+	&sys_ipl_fcp_br_lba_attr.attr,
+	&sys_ipl_ccw_loadparm_attr.attr,
+	NULL,
+};
+
+static struct attribute_group ipl_fcp_attr_group = {
+	.attrs = ipl_fcp_attrs,
+};
+
+/* CCW ipl device attributes */
+
 static struct attribute *ipl_ccw_attrs_vm[] = {
 	&sys_ipl_type_attr.attr,
 	&sys_ipl_device_attr.attr,
@@ -765,28 +766,10 @@
 DEFINE_IPL_ATTR_RW(reipl_fcp, device, "0.0.%04llx\n", "0.0.%llx\n",
 		   reipl_block_fcp->ipl_info.fcp.devno);
 
-static struct attribute *reipl_fcp_attrs[] = {
-	&sys_reipl_fcp_device_attr.attr,
-	&sys_reipl_fcp_wwpn_attr.attr,
-	&sys_reipl_fcp_lun_attr.attr,
-	&sys_reipl_fcp_bootprog_attr.attr,
-	&sys_reipl_fcp_br_lba_attr.attr,
-	NULL,
-};
-
-static struct attribute_group reipl_fcp_attr_group = {
-	.attrs = reipl_fcp_attrs,
-};
-
-/* CCW reipl device attributes */
-
-DEFINE_IPL_ATTR_RW(reipl_ccw, device, "0.0.%04llx\n", "0.0.%llx\n",
-	reipl_block_ccw->ipl_info.ccw.devno);
-
 static void reipl_get_ascii_loadparm(char *loadparm,
 				     struct ipl_parameter_block *ibp)
 {
-	memcpy(loadparm, ibp->ipl_info.ccw.load_parm, LOADPARM_LEN);
+	memcpy(loadparm, ibp->hdr.loadparm, LOADPARM_LEN);
 	EBCASC(loadparm, LOADPARM_LEN);
 	loadparm[LOADPARM_LEN] = 0;
 	strim(loadparm);
@@ -821,13 +804,50 @@
 		return -EINVAL;
 	}
 	/* initialize loadparm with blanks */
-	memset(ipb->ipl_info.ccw.load_parm, ' ', LOADPARM_LEN);
+	memset(ipb->hdr.loadparm, ' ', LOADPARM_LEN);
 	/* copy and convert to ebcdic */
-	memcpy(ipb->ipl_info.ccw.load_parm, buf, lp_len);
-	ASCEBC(ipb->ipl_info.ccw.load_parm, LOADPARM_LEN);
+	memcpy(ipb->hdr.loadparm, buf, lp_len);
+	ASCEBC(ipb->hdr.loadparm, LOADPARM_LEN);
 	return len;
 }
 
+/* FCP wrapper */
+static ssize_t reipl_fcp_loadparm_show(struct kobject *kobj,
+				       struct kobj_attribute *attr, char *page)
+{
+	return reipl_generic_loadparm_show(reipl_block_fcp, page);
+}
+
+static ssize_t reipl_fcp_loadparm_store(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					const char *buf, size_t len)
+{
+	return reipl_generic_loadparm_store(reipl_block_fcp, buf, len);
+}
+
+static struct kobj_attribute sys_reipl_fcp_loadparm_attr =
+	__ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_fcp_loadparm_show,
+					    reipl_fcp_loadparm_store);
+
+static struct attribute *reipl_fcp_attrs[] = {
+	&sys_reipl_fcp_device_attr.attr,
+	&sys_reipl_fcp_wwpn_attr.attr,
+	&sys_reipl_fcp_lun_attr.attr,
+	&sys_reipl_fcp_bootprog_attr.attr,
+	&sys_reipl_fcp_br_lba_attr.attr,
+	&sys_reipl_fcp_loadparm_attr.attr,
+	NULL,
+};
+
+static struct attribute_group reipl_fcp_attr_group = {
+	.attrs = reipl_fcp_attrs,
+};
+
+/* CCW reipl device attributes */
+
+DEFINE_IPL_ATTR_RW(reipl_ccw, device, "0.0.%04llx\n", "0.0.%llx\n",
+	reipl_block_ccw->ipl_info.ccw.devno);
+
 /* NSS wrapper */
 static ssize_t reipl_nss_loadparm_show(struct kobject *kobj,
 				       struct kobj_attribute *attr, char *page)
@@ -1125,11 +1145,10 @@
 	/* LOADPARM */
 	/* check if read scp info worked and set loadparm */
 	if (sclp_ipl_info.is_valid)
-		memcpy(ipb->ipl_info.ccw.load_parm,
-				&sclp_ipl_info.loadparm, LOADPARM_LEN);
+		memcpy(ipb->hdr.loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN);
 	else
 		/* read scp info failed: set empty loadparm (EBCDIC blanks) */
-		memset(ipb->ipl_info.ccw.load_parm, 0x40, LOADPARM_LEN);
+		memset(ipb->hdr.loadparm, 0x40, LOADPARM_LEN);
 	ipb->hdr.flags = DIAG308_FLAGS_LP_VALID;
 
 	/* VM PARM */
@@ -1251,9 +1270,16 @@
 		return rc;
 	}
 
-	if (ipl_info.type == IPL_TYPE_FCP)
+	if (ipl_info.type == IPL_TYPE_FCP) {
 		memcpy(reipl_block_fcp, IPL_PARMBLOCK_START, PAGE_SIZE);
-	else {
+		/*
+		 * Fix loadparm: on some systems the (SCSI) LOADPARM in the
+		 * SCSI IPL parameter block is invalid, so always take it
+		 * from sclp_ipl_info.
+		 */
+		memcpy(reipl_block_fcp->hdr.loadparm, sclp_ipl_info.loadparm,
+		       LOADPARM_LEN);
+	} else {
 		reipl_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN;
 		reipl_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION;
 		reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN;
@@ -1864,7 +1890,23 @@
 
 static int __init s390_ipl_init(void)
 {
+	char str[8] = {0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40};
+
 	sclp_get_ipl_info(&sclp_ipl_info);
+	/*
+	 * Fix loadparm: on some systems the (SCSI) LOADPARM returned by
+	 * READ SCP info is invalid (contains only EBCDIC blanks) when the
+	 * system has been booted via diag308. In that case we use the
+	 * value from diag308, if available.
+	 *
+	 * There are also systems where diag308 store does not work when
+	 * the system is booted from the HMC. Fortunately, in that case
+	 * READ SCP info provides the correct value.
+	 */
+	if (memcmp(sclp_ipl_info.loadparm, str, sizeof(str)) == 0 &&
+	    diag308_set_works)
+		memcpy(sclp_ipl_info.loadparm, ipl_block.hdr.loadparm,
+		       LOADPARM_LEN);
 	shutdown_actions_init();
 	shutdown_triggers_init();
 	return 0;
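
LOADPARM is an 8-byte, blank-padded EBCDIC field, and an all-blank value (eight 0x40 bytes, the str[] initializer above) carries no information; that is the "invalid" condition the memcmp() tests before preferring the diag308 copy. The check in isolation, with a helper name made up for illustration:

#include <stdio.h>
#include <string.h>

#define LOADPARM_LEN 8

/* Hypothetical helper, not from the patch: an all-blank LOADPARM
 * (eight EBCDIC blanks, 0x40) is treated as invalid. */
static int loadparm_is_blank(const unsigned char *lp)
{
	static const unsigned char blanks[LOADPARM_LEN] = {
		0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40
	};

	return memcmp(lp, blanks, LOADPARM_LEN) == 0;
}

int main(void)
{
	unsigned char lp[LOADPARM_LEN];

	memset(lp, 0x40, LOADPARM_LEN);
	printf("%d\n", loadparm_is_blank(lp));	/* 1 -> fall back */
	return 0;
}
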
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 5dc7ad9..bebacad 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -803,7 +803,7 @@
 	long ret = 0;
 
 	/* Do the secure computing check first. */
-	if (secure_computing(regs->gprs[2])) {
+	if (secure_computing()) {
 		/* seccomp failures shouldn't expose any additional code. */
 		ret = -1;
 		goto out;
diff --git a/arch/s390/kernel/vdso32/clock_gettime.S b/arch/s390/kernel/vdso32/clock_gettime.S
index 65fc397..7cf18f8 100644
--- a/arch/s390/kernel/vdso32/clock_gettime.S
+++ b/arch/s390/kernel/vdso32/clock_gettime.S
@@ -22,13 +22,11 @@
 	basr	%r5,0
 0:	al	%r5,21f-0b(%r5)			/* get &_vdso_data */
 	chi	%r2,__CLOCK_REALTIME
-	je	10f
+	je	11f
 	chi	%r2,__CLOCK_MONOTONIC
 	jne	19f
 
 	/* CLOCK_MONOTONIC */
-	ltr	%r3,%r3
-	jz	9f				/* tp == NULL */
 1:	l	%r4,__VDSO_UPD_COUNT+4(%r5)	/* load update counter */
 	tml	%r4,0x0001			/* pending update ? loop */
 	jnz	1b
@@ -67,12 +65,10 @@
 	j	6b
 8:	st	%r2,0(%r3)			/* store tp->tv_sec */
 	st	%r1,4(%r3)			/* store tp->tv_nsec */
-9:	lhi	%r2,0
+	lhi	%r2,0
 	br	%r14
 
 	/* CLOCK_REALTIME */
-10:	ltr	%r3,%r3				/* tp == NULL */
-	jz	18f
 11:	l	%r4,__VDSO_UPD_COUNT+4(%r5)	/* load update counter */
 	tml	%r4,0x0001			/* pending update ? loop */
 	jnz	11b
@@ -111,7 +107,7 @@
 	j	15b
 17:	st	%r2,0(%r3)			/* store tp->tv_sec */
 	st	%r1,4(%r3)			/* store tp->tv_nsec */
-18:	lhi	%r2,0
+	lhi	%r2,0
 	br	%r14
 
 	/* Fallback to system call */
diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S
index 91940ed..3f34e09 100644
--- a/arch/s390/kernel/vdso64/clock_gettime.S
+++ b/arch/s390/kernel/vdso64/clock_gettime.S
@@ -21,7 +21,7 @@
 	.cfi_startproc
 	larl	%r5,_vdso_data
 	cghi	%r2,__CLOCK_REALTIME
-	je	4f
+	je	5f
 	cghi	%r2,__CLOCK_THREAD_CPUTIME_ID
 	je	9f
 	cghi	%r2,-2		/* Per-thread CPUCLOCK with PID=0, VIRT=1 */
@@ -30,8 +30,6 @@
 	jne	12f
 
 	/* CLOCK_MONOTONIC */
-	ltgr	%r3,%r3
-	jz	3f				/* tp == NULL */
 0:	lg	%r4,__VDSO_UPD_COUNT(%r5)	/* load update counter */
 	tmll	%r4,0x0001			/* pending update ? loop */
 	jnz	0b
@@ -53,12 +51,10 @@
 	j	1b
 2:	stg	%r0,0(%r3)			/* store tp->tv_sec */
 	stg	%r1,8(%r3)			/* store tp->tv_nsec */
-3:	lghi	%r2,0
+	lghi	%r2,0
 	br	%r14
 
 	/* CLOCK_REALTIME */
-4:	ltr	%r3,%r3				/* tp == NULL */
-	jz	8f
 5:	lg	%r4,__VDSO_UPD_COUNT(%r5)	/* load update counter */
 	tmll	%r4,0x0001			/* pending update ? loop */
 	jnz	5b
@@ -80,7 +76,7 @@
 	j	6b
 7:	stg	%r0,0(%r3)			/* store tp->tv_sec */
 	stg	%r1,8(%r3)			/* store tp->tv_nsec */
-8:	lghi	%r2,0
+	lghi	%r2,0
 	br	%r14
 
 	/* CLOCK_THREAD_CPUTIME_ID for this thread */
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ce81eb2..81b0e11 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1317,19 +1317,6 @@
 		return -EINVAL;
 	}
 
-	switch (kvm_run->exit_reason) {
-	case KVM_EXIT_S390_SIEIC:
-	case KVM_EXIT_UNKNOWN:
-	case KVM_EXIT_INTR:
-	case KVM_EXIT_S390_RESET:
-	case KVM_EXIT_S390_UCONTROL:
-	case KVM_EXIT_S390_TSCH:
-	case KVM_EXIT_DEBUG:
-		break;
-	default:
-		BUG();
-	}
-
 	vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
 	vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
 	if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) {
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 19daa53..5404a62 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -986,11 +986,21 @@
 	pte_t *ptep;
 
 	down_read(&mm->mmap_sem);
+retry:
 	ptep = get_locked_pte(current->mm, addr, &ptl);
 	if (unlikely(!ptep)) {
 		up_read(&mm->mmap_sem);
 		return -EFAULT;
 	}
+	if (!(pte_val(*ptep) & _PAGE_INVALID) &&
+	    (pte_val(*ptep) & _PAGE_PROTECT)) {
+		pte_unmap_unlock(ptep, ptl);
+		if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
+			up_read(&mm->mmap_sem);
+			return -EFAULT;
+		}
+		goto retry;
+	}
 
 	new = old = pgste_get_lock(ptep);
 	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
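
The fix above is the classic fault-and-retry shape: take the page-table lock, and if the PTE turns out to be write-protected, drop the lock, let fixup_user_fault() resolve the write fault (for example by breaking COW), then jump back and redo the lookup with the lock reacquired. The skeleton of that pattern in isolation; every name below is illustrative, none of it is from the patch:

#include <stdbool.h>
#include <stdio.h>

/* A flag stands in for _PAGE_PROTECT and comments stand in for the
 * page-table lock. */
static bool page_write_protected = true;

static void fixup_write_fault(void)
{
	page_write_protected = false;	/* e.g. COW has been broken */
}

static int set_storage_key(void)
{
retry:
	/* take page-table lock */
	if (page_write_protected) {
		/* drop lock before faulting */
		fixup_write_fault();
		goto retry;		/* reacquire lock, re-check */
	}
	/* ... update the key under the lock ... */
	/* drop lock */
	return 0;
}

int main(void)
{
	printf("%d\n", set_storage_key());	/* 0 after one retry */
	return 0;
}
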
diff --git a/arch/sh/include/asm/atomic-grb.h b/arch/sh/include/asm/atomic-grb.h
index a273c88..97a5fda 100644
--- a/arch/sh/include/asm/atomic-grb.h
+++ b/arch/sh/include/asm/atomic-grb.h
@@ -1,85 +1,56 @@
 #ifndef __ASM_SH_ATOMIC_GRB_H
 #define __ASM_SH_ATOMIC_GRB_H
 
-static inline void atomic_add(int i, atomic_t *v)
-{
-	int tmp;
+#define ATOMIC_OP(op)							\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	int tmp;							\
+									\
+	__asm__ __volatile__ (						\
+		"   .align 2              \n\t"				\
+		"   mova    1f,   r0      \n\t" /* r0 = end point */	\
+		"   mov    r15,   r1      \n\t" /* r1 = saved sp */	\
+		"   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */	\
+		"   mov.l  @%1,   %0      \n\t" /* load  old value */	\
+		" " #op "   %2,   %0      \n\t" /* $op */		\
+		"   mov.l   %0,   @%1     \n\t" /* store new value */	\
+		"1: mov     r1,   r15     \n\t" /* LOGOUT */		\
+		: "=&r" (tmp),						\
+		  "+r"  (v)						\
+		: "r"   (i)						\
+		: "memory" , "r0", "r1");				\
+}									\
 
-	__asm__ __volatile__ (
-		"   .align 2              \n\t"
-		"   mova    1f,   r0      \n\t" /* r0 = end point */
-		"   mov    r15,   r1      \n\t" /* r1 = saved sp */
-		"   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */
-		"   mov.l  @%1,   %0      \n\t" /* load  old value */
-		"   add     %2,   %0      \n\t" /* add */
-		"   mov.l   %0,   @%1     \n\t" /* store new value */
-		"1: mov     r1,   r15     \n\t" /* LOGOUT */
-		: "=&r" (tmp),
-		  "+r"  (v)
-		: "r"   (i)
-		: "memory" , "r0", "r1");
+#define ATOMIC_OP_RETURN(op)						\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	int tmp;							\
+									\
+	__asm__ __volatile__ (						\
+		"   .align 2              \n\t"				\
+		"   mova    1f,   r0      \n\t" /* r0 = end point */	\
+		"   mov    r15,   r1      \n\t" /* r1 = saved sp */	\
+		"   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */	\
+		"   mov.l  @%1,   %0      \n\t" /* load  old value */	\
+		" " #op "   %2,   %0      \n\t" /* $op */		\
+		"   mov.l   %0,   @%1     \n\t" /* store new value */	\
+		"1: mov     r1,   r15     \n\t" /* LOGOUT */		\
+		: "=&r" (tmp),						\
+		  "+r"  (v)						\
+		: "r"   (i)						\
+		: "memory" , "r0", "r1");				\
+									\
+	return tmp;							\
 }
 
-static inline void atomic_sub(int i, atomic_t *v)
-{
-	int tmp;
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
 
-	__asm__ __volatile__ (
-		"   .align 2              \n\t"
-		"   mova    1f,   r0      \n\t" /* r0 = end point */
-		"   mov     r15,  r1      \n\t" /* r1 = saved sp */
-		"   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */
-		"   mov.l  @%1,   %0      \n\t" /* load  old value */
-		"   sub     %2,   %0      \n\t" /* sub */
-		"   mov.l   %0,   @%1     \n\t" /* store new value */
-		"1: mov     r1,   r15     \n\t" /* LOGOUT */
-		: "=&r" (tmp),
-		  "+r"  (v)
-		: "r"   (i)
-		: "memory" , "r0", "r1");
-}
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
 
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	int tmp;
-
-	__asm__ __volatile__ (
-		"   .align 2              \n\t"
-		"   mova    1f,   r0      \n\t" /* r0 = end point */
-		"   mov    r15,   r1      \n\t" /* r1 = saved sp */
-		"   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */
-		"   mov.l  @%1,   %0      \n\t" /* load  old value */
-		"   add     %2,   %0      \n\t" /* add */
-		"   mov.l   %0,   @%1     \n\t" /* store new value */
-		"1: mov     r1,   r15     \n\t" /* LOGOUT */
-		: "=&r" (tmp),
-		  "+r"  (v)
-		: "r"   (i)
-		: "memory" , "r0", "r1");
-
-	return tmp;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	int tmp;
-
-	__asm__ __volatile__ (
-		"   .align 2              \n\t"
-		"   mova    1f,   r0      \n\t" /* r0 = end point */
-		"   mov    r15,   r1      \n\t" /* r1 = saved sp */
-		"   mov    #-6,   r15     \n\t" /* LOGIN: r15 = size */
-		"   mov.l  @%1,   %0      \n\t" /* load  old value */
-		"   sub     %2,   %0      \n\t" /* sub */
-		"   mov.l   %0,   @%1     \n\t" /* store new value */
-		"1: mov     r1,   r15     \n\t" /* LOGOUT */
-		: "=&r" (tmp),
-		  "+r"  (v)
-		: "r"   (i)
-		: "memory", "r0", "r1");
-
-	return tmp;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
 static inline void atomic_clear_mask(unsigned int mask, atomic_t *v)
 {
diff --git a/arch/sh/include/asm/atomic-irq.h b/arch/sh/include/asm/atomic-irq.h
index 9f7c566..61d1075 100644
--- a/arch/sh/include/asm/atomic-irq.h
+++ b/arch/sh/include/asm/atomic-irq.h
@@ -8,49 +8,39 @@
  * forward to code at the end of this object's .text section, then
  * branch back to restart the operation.
  */
-static inline void atomic_add(int i, atomic_t *v)
-{
-	unsigned long flags;
 
-	raw_local_irq_save(flags);
-	v->counter += i;
-	raw_local_irq_restore(flags);
+#define ATOMIC_OP(op, c_op)						\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	unsigned long flags;						\
+									\
+	raw_local_irq_save(flags);					\
+	v->counter c_op i;						\
+	raw_local_irq_restore(flags);					\
 }
 
-static inline void atomic_sub(int i, atomic_t *v)
-{
-	unsigned long flags;
-
-	raw_local_irq_save(flags);
-	v->counter -= i;
-	raw_local_irq_restore(flags);
+#define ATOMIC_OP_RETURN(op, c_op)					\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	unsigned long temp, flags;					\
+									\
+	raw_local_irq_save(flags);					\
+	temp = v->counter;						\
+	temp c_op i;							\
+	v->counter = temp;						\
+	raw_local_irq_restore(flags);					\
+									\
+	return temp;							\
 }
 
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	unsigned long temp, flags;
+#define ATOMIC_OPS(op, c_op) ATOMIC_OP(op, c_op) ATOMIC_OP_RETURN(op, c_op)
 
-	raw_local_irq_save(flags);
-	temp = v->counter;
-	temp += i;
-	v->counter = temp;
-	raw_local_irq_restore(flags);
+ATOMIC_OPS(add, +=)
+ATOMIC_OPS(sub, -=)
 
-	return temp;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	unsigned long temp, flags;
-
-	raw_local_irq_save(flags);
-	temp = v->counter;
-	temp -= i;
-	v->counter = temp;
-	raw_local_irq_restore(flags);
-
-	return temp;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
 static inline void atomic_clear_mask(unsigned int mask, atomic_t *v)
 {
diff --git a/arch/sh/include/asm/atomic-llsc.h b/arch/sh/include/asm/atomic-llsc.h
index 4b00b78..8575dcc 100644
--- a/arch/sh/include/asm/atomic-llsc.h
+++ b/arch/sh/include/asm/atomic-llsc.h
@@ -2,39 +2,6 @@
 #define __ASM_SH_ATOMIC_LLSC_H
 
 /*
- * To get proper branch prediction for the main line, we must branch
- * forward to code at the end of this object's .text section, then
- * branch back to restart the operation.
- */
-static inline void atomic_add(int i, atomic_t *v)
-{
-	unsigned long tmp;
-
-	__asm__ __volatile__ (
-"1:	movli.l @%2, %0		! atomic_add	\n"
-"	add	%1, %0				\n"
-"	movco.l	%0, @%2				\n"
-"	bf	1b				\n"
-	: "=&z" (tmp)
-	: "r" (i), "r" (&v->counter)
-	: "t");
-}
-
-static inline void atomic_sub(int i, atomic_t *v)
-{
-	unsigned long tmp;
-
-	__asm__ __volatile__ (
-"1:	movli.l @%2, %0		! atomic_sub	\n"
-"	sub	%1, %0				\n"
-"	movco.l	%0, @%2				\n"
-"	bf	1b				\n"
-	: "=&z" (tmp)
-	: "r" (i), "r" (&v->counter)
-	: "t");
-}
-
-/*
  * SH-4A note:
  *
  * We basically get atomic_xxx_return() for free compared with
@@ -42,40 +9,54 @@
  * encoding, so the retval is automatically set without having to
  * do any special work.
  */
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	unsigned long temp;
+/*
+ * To get proper branch prediction for the main line, we must branch
+ * forward to code at the end of this object's .text section, then
+ * branch back to restart the operation.
+ */
 
-	__asm__ __volatile__ (
-"1:	movli.l @%2, %0		! atomic_add_return	\n"
-"	add	%1, %0					\n"
-"	movco.l	%0, @%2					\n"
-"	bf	1b					\n"
-"	synco						\n"
-	: "=&z" (temp)
-	: "r" (i), "r" (&v->counter)
-	: "t");
-
-	return temp;
+#define ATOMIC_OP(op)							\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	unsigned long tmp;						\
+									\
+	__asm__ __volatile__ (						\
+"1:	movli.l @%2, %0		! atomic_" #op "\n"			\
+"	" #op "	%1, %0				\n"			\
+"	movco.l	%0, @%2				\n"			\
+"	bf	1b				\n"			\
+	: "=&z" (tmp)							\
+	: "r" (i), "r" (&v->counter)					\
+	: "t");								\
 }
 
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	unsigned long temp;
-
-	__asm__ __volatile__ (
-"1:	movli.l @%2, %0		! atomic_sub_return	\n"
-"	sub	%1, %0					\n"
-"	movco.l	%0, @%2					\n"
-"	bf	1b					\n"
-"	synco						\n"
-	: "=&z" (temp)
-	: "r" (i), "r" (&v->counter)
-	: "t");
-
-	return temp;
+#define ATOMIC_OP_RETURN(op)						\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	unsigned long temp;						\
+									\
+	__asm__ __volatile__ (						\
+"1:	movli.l @%2, %0		! atomic_" #op "_return	\n"		\
+"	" #op "	%1, %0					\n"		\
+"	movco.l	%0, @%2					\n"		\
+"	bf	1b					\n"		\
+"	synco						\n"		\
+	: "=&z" (temp)							\
+	: "r" (i), "r" (&v->counter)					\
+	: "t");								\
+									\
+	return temp;							\
 }
 
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
+
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
 static inline void atomic_clear_mask(unsigned int mask, atomic_t *v)
 {
 	unsigned long tmp;
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index 7aed2be..7b024f0 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -20,7 +20,7 @@
 
 #define ATOMIC_INIT(i)  { (i) }
 
-int __atomic_add_return(int, atomic_t *);
+int atomic_add_return(int, atomic_t *);
 int atomic_cmpxchg(atomic_t *, int, int);
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 int __atomic_add_unless(atomic_t *, int, int);
@@ -28,15 +28,14 @@
 
 #define atomic_read(v)          (*(volatile int *)&(v)->counter)
 
-#define atomic_add(i, v)	((void)__atomic_add_return( (int)(i), (v)))
-#define atomic_sub(i, v)	((void)__atomic_add_return(-(int)(i), (v)))
-#define atomic_inc(v)		((void)__atomic_add_return(        1, (v)))
-#define atomic_dec(v)		((void)__atomic_add_return(       -1, (v)))
+#define atomic_add(i, v)	((void)atomic_add_return( (int)(i), (v)))
+#define atomic_sub(i, v)	((void)atomic_add_return(-(int)(i), (v)))
+#define atomic_inc(v)		((void)atomic_add_return(        1, (v)))
+#define atomic_dec(v)		((void)atomic_add_return(       -1, (v)))
 
-#define atomic_add_return(i, v)	(__atomic_add_return( (int)(i), (v)))
-#define atomic_sub_return(i, v)	(__atomic_add_return(-(int)(i), (v)))
-#define atomic_inc_return(v)	(__atomic_add_return(        1, (v)))
-#define atomic_dec_return(v)	(__atomic_add_return(       -1, (v)))
+#define atomic_sub_return(i, v)	(atomic_add_return(-(int)(i), (v)))
+#define atomic_inc_return(v)	(atomic_add_return(        1, (v)))
+#define atomic_dec_return(v)	(atomic_add_return(       -1, (v)))
 
 #define atomic_add_negative(a, v)	(atomic_add_return((a), (v)) < 0)
 
diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index bb894c8..7e4ca1e 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -20,27 +20,28 @@
 #define atomic_set(v, i)	(((v)->counter) = i)
 #define atomic64_set(v, i)	(((v)->counter) = i)
 
-void atomic_add(int, atomic_t *);
-void atomic64_add(long, atomic64_t *);
-void atomic_sub(int, atomic_t *);
-void atomic64_sub(long, atomic64_t *);
+#define ATOMIC_OP(op)							\
+void atomic_##op(int, atomic_t *);					\
+void atomic64_##op(long, atomic64_t *);
 
-int atomic_add_ret(int, atomic_t *);
-long atomic64_add_ret(long, atomic64_t *);
-int atomic_sub_ret(int, atomic_t *);
-long atomic64_sub_ret(long, atomic64_t *);
+#define ATOMIC_OP_RETURN(op)						\
+int atomic_##op##_return(int, atomic_t *);				\
+long atomic64_##op##_return(long, atomic64_t *);
 
-#define atomic_dec_return(v) atomic_sub_ret(1, v)
-#define atomic64_dec_return(v) atomic64_sub_ret(1, v)
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
 
-#define atomic_inc_return(v) atomic_add_ret(1, v)
-#define atomic64_inc_return(v) atomic64_add_ret(1, v)
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
 
-#define atomic_sub_return(i, v) atomic_sub_ret(i, v)
-#define atomic64_sub_return(i, v) atomic64_sub_ret(i, v)
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
-#define atomic_add_return(i, v) atomic_add_ret(i, v)
-#define atomic64_add_return(i, v) atomic64_add_ret(i, v)
+#define atomic_dec_return(v)   atomic_sub_return(1, v)
+#define atomic64_dec_return(v) atomic64_sub_return(1, v)
+
+#define atomic_inc_return(v)   atomic_add_return(1, v)
+#define atomic64_inc_return(v) atomic64_add_return(1, v)
 
 /*
  * atomic_inc_and_test - increment and test
@@ -53,11 +54,11 @@
 #define atomic_inc_and_test(v) (atomic_inc_return(v) == 0)
 #define atomic64_inc_and_test(v) (atomic64_inc_return(v) == 0)
 
-#define atomic_sub_and_test(i, v) (atomic_sub_ret(i, v) == 0)
-#define atomic64_sub_and_test(i, v) (atomic64_sub_ret(i, v) == 0)
+#define atomic_sub_and_test(i, v) (atomic_sub_return(i, v) == 0)
+#define atomic64_sub_and_test(i, v) (atomic64_sub_return(i, v) == 0)
 
-#define atomic_dec_and_test(v) (atomic_sub_ret(1, v) == 0)
-#define atomic64_dec_and_test(v) (atomic64_sub_ret(1, v) == 0)
+#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0)
+#define atomic64_dec_and_test(v) (atomic64_sub_return(1, v) == 0)
 
 #define atomic_inc(v) atomic_add(1, v)
 #define atomic64_inc(v) atomic64_add(1, v)
@@ -65,8 +66,8 @@
 #define atomic_dec(v) atomic_sub(1, v)
 #define atomic64_dec(v) atomic64_sub(1, v)
 
-#define atomic_add_negative(i, v) (atomic_add_ret(i, v) < 0)
-#define atomic64_add_negative(i, v) (atomic64_add_ret(i, v) < 0)
+#define atomic_add_negative(i, v) (atomic_add_return(i, v) < 0)
+#define atomic64_add_negative(i, v) (atomic64_add_return(i, v) < 0)
 
 #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index f7ba875..6a9ec5c 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1138,7 +1138,7 @@
 
 void smp_capture(void)
 {
-	int result = atomic_add_ret(1, &smp_capture_depth);
+	int result = atomic_add_return(1, &smp_capture_depth);
 
 	if (result == 1) {
 		int ncpus = num_online_cpus();
diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c
index 1d32b54..a7c418a 100644
--- a/arch/sparc/lib/atomic32.c
+++ b/arch/sparc/lib/atomic32.c
@@ -27,18 +27,23 @@
 
 #endif /* SMP */
 
-int __atomic_add_return(int i, atomic_t *v)
-{
-	int ret;
-	unsigned long flags;
-	spin_lock_irqsave(ATOMIC_HASH(v), flags);
+#define ATOMIC_OP(op, cop)						\
+int atomic_##op##_return(int i, atomic_t *v)				\
+{									\
+	int ret;							\
+	unsigned long flags;						\
+	spin_lock_irqsave(ATOMIC_HASH(v), flags);			\
+									\
+	ret = (v->counter cop i);					\
+									\
+	spin_unlock_irqrestore(ATOMIC_HASH(v), flags);			\
+	return ret;							\
+}									\
+EXPORT_SYMBOL(atomic_##op##_return);
 
-	ret = (v->counter += i);
+ATOMIC_OP(add, +=)
 
-	spin_unlock_irqrestore(ATOMIC_HASH(v), flags);
-	return ret;
-}
-EXPORT_SYMBOL(__atomic_add_return);
+#undef ATOMIC_OP
 
 int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
diff --git a/arch/sparc/lib/atomic_64.S b/arch/sparc/lib/atomic_64.S
index 85c233d..96d70b4 100644
--- a/arch/sparc/lib/atomic_64.S
+++ b/arch/sparc/lib/atomic_64.S
@@ -14,109 +14,80 @@
 	 * memory barriers, and a second which returns
 	 * a value and does the barriers.
 	 */
-ENTRY(atomic_add) /* %o0 = increment, %o1 = atomic_ptr */
-	BACKOFF_SETUP(%o2)
-1:	lduw	[%o1], %g1
-	add	%g1, %o0, %g7
-	cas	[%o1], %g1, %g7
-	cmp	%g1, %g7
-	bne,pn	%icc, BACKOFF_LABEL(2f, 1b)
-	 nop
-	retl
-	 nop
-2:	BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic_add)
 
-ENTRY(atomic_sub) /* %o0 = decrement, %o1 = atomic_ptr */
-	BACKOFF_SETUP(%o2)
-1:	lduw	[%o1], %g1
-	sub	%g1, %o0, %g7
-	cas	[%o1], %g1, %g7
-	cmp	%g1, %g7
-	bne,pn	%icc, BACKOFF_LABEL(2f, 1b)
-	 nop
-	retl
-	 nop
-2:	BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic_sub)
+#define ATOMIC_OP(op)							\
+ENTRY(atomic_##op) /* %o0 = increment, %o1 = atomic_ptr */		\
+	BACKOFF_SETUP(%o2);						\
+1:	lduw	[%o1], %g1;						\
+	op	%g1, %o0, %g7;						\
+	cas	[%o1], %g1, %g7;					\
+	cmp	%g1, %g7;						\
+	bne,pn	%icc, BACKOFF_LABEL(2f, 1b);				\
+	 nop;								\
+	retl;								\
+	 nop;								\
+2:	BACKOFF_SPIN(%o2, %o3, 1b);					\
+ENDPROC(atomic_##op);							\
 
-ENTRY(atomic_add_ret) /* %o0 = increment, %o1 = atomic_ptr */
-	BACKOFF_SETUP(%o2)
-1:	lduw	[%o1], %g1
-	add	%g1, %o0, %g7
-	cas	[%o1], %g1, %g7
-	cmp	%g1, %g7
-	bne,pn	%icc, BACKOFF_LABEL(2f, 1b)
-	 add	%g1, %o0, %g1
-	retl
-	 sra	%g1, 0, %o0
-2:	BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic_add_ret)
+#define ATOMIC_OP_RETURN(op)						\
+ENTRY(atomic_##op##_return) /* %o0 = increment, %o1 = atomic_ptr */	\
+	BACKOFF_SETUP(%o2);						\
+1:	lduw	[%o1], %g1;						\
+	op	%g1, %o0, %g7;						\
+	cas	[%o1], %g1, %g7;					\
+	cmp	%g1, %g7;						\
+	bne,pn	%icc, BACKOFF_LABEL(2f, 1b);				\
+	 add	%g1, %o0, %g1;						\
+	retl;								\
+	 sra	%g1, 0, %o0;						\
+2:	BACKOFF_SPIN(%o2, %o3, 1b);					\
+ENDPROC(atomic_##op##_return);
 
-ENTRY(atomic_sub_ret) /* %o0 = decrement, %o1 = atomic_ptr */
-	BACKOFF_SETUP(%o2)
-1:	lduw	[%o1], %g1
-	sub	%g1, %o0, %g7
-	cas	[%o1], %g1, %g7
-	cmp	%g1, %g7
-	bne,pn	%icc, BACKOFF_LABEL(2f, 1b)
-	 sub	%g1, %o0, %g1
-	retl
-	 sra	%g1, 0, %o0
-2:	BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic_sub_ret)
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
 
-ENTRY(atomic64_add) /* %o0 = increment, %o1 = atomic_ptr */
-	BACKOFF_SETUP(%o2)
-1:	ldx	[%o1], %g1
-	add	%g1, %o0, %g7
-	casx	[%o1], %g1, %g7
-	cmp	%g1, %g7
-	bne,pn	%xcc, BACKOFF_LABEL(2f, 1b)
-	 nop
-	retl
-	 nop
-2:	BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic64_add)
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
 
-ENTRY(atomic64_sub) /* %o0 = decrement, %o1 = atomic_ptr */
-	BACKOFF_SETUP(%o2)
-1:	ldx	[%o1], %g1
-	sub	%g1, %o0, %g7
-	casx	[%o1], %g1, %g7
-	cmp	%g1, %g7
-	bne,pn	%xcc, BACKOFF_LABEL(2f, 1b)
-	 nop
-	retl
-	 nop
-2:	BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic64_sub)
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
-ENTRY(atomic64_add_ret) /* %o0 = increment, %o1 = atomic_ptr */
-	BACKOFF_SETUP(%o2)
-1:	ldx	[%o1], %g1
-	add	%g1, %o0, %g7
-	casx	[%o1], %g1, %g7
-	cmp	%g1, %g7
-	bne,pn	%xcc, BACKOFF_LABEL(2f, 1b)
-	 nop
-	retl
-	 add	%g1, %o0, %o0
-2:	BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic64_add_ret)
+#define ATOMIC64_OP(op)							\
+ENTRY(atomic64_##op) /* %o0 = increment, %o1 = atomic_ptr */		\
+	BACKOFF_SETUP(%o2);						\
+1:	ldx	[%o1], %g1;						\
+	op	%g1, %o0, %g7;						\
+	casx	[%o1], %g1, %g7;					\
+	cmp	%g1, %g7;						\
+	bne,pn	%xcc, BACKOFF_LABEL(2f, 1b);				\
+	 nop;								\
+	retl;								\
+	 nop;								\
+2:	BACKOFF_SPIN(%o2, %o3, 1b);					\
+ENDPROC(atomic64_##op);							\
 
-ENTRY(atomic64_sub_ret) /* %o0 = decrement, %o1 = atomic_ptr */
-	BACKOFF_SETUP(%o2)
-1:	ldx	[%o1], %g1
-	sub	%g1, %o0, %g7
-	casx	[%o1], %g1, %g7
-	cmp	%g1, %g7
-	bne,pn	%xcc, BACKOFF_LABEL(2f, 1b)
-	 nop
-	retl
-	 sub	%g1, %o0, %o0
-2:	BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic64_sub_ret)
+#define ATOMIC64_OP_RETURN(op)						\
+ENTRY(atomic64_##op##_return) /* %o0 = increment, %o1 = atomic_ptr */	\
+	BACKOFF_SETUP(%o2);						\
+1:	ldx	[%o1], %g1;						\
+	op	%g1, %o0, %g7;						\
+	casx	[%o1], %g1, %g7;					\
+	cmp	%g1, %g7;						\
+	bne,pn	%xcc, BACKOFF_LABEL(2f, 1b);				\
+	 nop;								\
+	retl;								\
+	 add	%g1, %o0, %o0;						\
+2:	BACKOFF_SPIN(%o2, %o3, 1b);					\
+ENDPROC(atomic64_##op##_return);
+
+#define ATOMIC64_OPS(op) ATOMIC64_OP(op) ATOMIC64_OP_RETURN(op)
+
+ATOMIC64_OPS(add)
+ATOMIC64_OPS(sub)
+
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
 
 ENTRY(atomic64_dec_if_positive) /* %o0 = atomic_ptr */
 	BACKOFF_SETUP(%o2)
diff --git a/arch/sparc/lib/ksyms.c b/arch/sparc/lib/ksyms.c
index 323335b..1d649a9 100644
--- a/arch/sparc/lib/ksyms.c
+++ b/arch/sparc/lib/ksyms.c
@@ -99,14 +99,23 @@
 EXPORT_SYMBOL(__clear_user);
 
 /* Atomic counter implementation. */
-EXPORT_SYMBOL(atomic_add);
-EXPORT_SYMBOL(atomic_add_ret);
-EXPORT_SYMBOL(atomic_sub);
-EXPORT_SYMBOL(atomic_sub_ret);
-EXPORT_SYMBOL(atomic64_add);
-EXPORT_SYMBOL(atomic64_add_ret);
-EXPORT_SYMBOL(atomic64_sub);
-EXPORT_SYMBOL(atomic64_sub_ret);
+#define ATOMIC_OP(op)							\
+EXPORT_SYMBOL(atomic_##op);						\
+EXPORT_SYMBOL(atomic64_##op);
+
+#define ATOMIC_OP_RETURN(op)						\
+EXPORT_SYMBOL(atomic_##op##_return);					\
+EXPORT_SYMBOL(atomic64_##op##_return);
+
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
+
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
 EXPORT_SYMBOL(atomic64_dec_if_positive);
 
 /* Atomic bit operations. */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 778178f..4b663e1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -552,6 +552,39 @@
 
 	  If in doubt, say "Y".
 
+config KVMTOOL_TEST_ENABLE
+	bool "Enable options to create a bootable tools/kvm/ kernel"
+	select NET
+	select NETDEVICES
+	select PCI
+	select BLOCK
+	select BLK_DEV
+	select NETWORK_FILESYSTEMS
+	select INET
+	select EXPERIMENTAL
+	select TTY
+	select SERIAL_8250
+	select SERIAL_8250_CONSOLE
+	select IP_PNP
+	select IP_PNP_DHCP
+	select BINFMT_ELF
+	select PCI_MSI
+	select HAVE_ARCH_KGDB
+	select DEBUG_KERNEL
+	select KGDB
+	select KGDB_SERIAL_CONSOLE
+	select VIRTUALIZATION
+	select VIRTIO
+	select VIRTIO_RING
+	select VIRTIO_PCI
+	select VIRTIO_BLK
+	select VIRTIO_CONSOLE
+	select VIRTIO_NET
+	select AVERAGE
+	select 9P_FS
+	select NET_9P
+	select NET_9P_VIRTIO
+
 menuconfig HYPERVISOR_GUEST
 	bool "Linux guest support"
 	---help---
@@ -2443,9 +2476,19 @@
 	depends on STA2X11
 
 config IOSF_MBI
-	tristate
-	default m
+	tristate "Intel System On Chip IOSF Sideband support"
 	depends on PCI
+	---help---
+	  Enables sideband access to mailbox registers on SoCs. The sideband is
+	  available on the following platforms. This list is not meant to be
+	  exhaustive.
+	   - BayTrail
+	   - Cherryview
+	   - Braswell
+	   - Quark
+
+	  You should say Y if you are running a kernel on one of these
+	  platforms.
 
 config PMC_ATOM
 	def_bool y
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 6dd1c7d..bf20c81 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -219,21 +219,6 @@
 	return *v;
 }
 
-#ifdef CONFIG_X86_64
-/**
- * atomic_or_long - OR of two long integers
- * @v1: pointer to type unsigned long
- * @v2: pointer to type unsigned long
- *
- * Atomically ORs @v1 and @v2
- * Returns the result of the OR
- */
-static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
-{
-	asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
-}
-#endif
-
 /* These are x86-specific, used by some header files */
 #define atomic_clear_mask(mask, addr)				\
 	asm volatile(LOCK_PREFIX "andl %0,%1"			\
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index cb4c73b..76659b6 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -85,7 +85,7 @@
 #define ARGOFFSET	R11
 #define SWFRAME		ORIG_RAX
 
-	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
+	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
 	subq  $9*8+\addskip, %rsp
 	CFI_ADJUST_CFA_OFFSET	9*8+\addskip
 	movq_cfi rdi, 8*8
@@ -96,7 +96,11 @@
 	movq_cfi rcx, 5*8
 	.endif
 
+	.if \rax_enosys
+	movq $-ENOSYS, 4*8(%rsp)
+	.else
 	movq_cfi rax, 4*8
+	.endif
 
 	.if \save_r891011
 	movq_cfi r8,  3*8
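
The new rax_enosys argument lets the entry code preload the saved-rax slot of pt_regs with -ENOSYS, so any path that never dispatches a system call (bad syscall number, seccomp rejection, and so on) naturally returns that default without an extra store. A miniature restatement of the idea under that assumption, with made-up names (the real dispatch lives in entry_64.S, outside this hunk):

#include <stdio.h>

#define ENOSYS 38
#define NR_MAX 2

static long sys_getanswer(void) { return 42; }

/* Sparse table: slot 1 is intentionally empty. */
static long (*const sys_table[NR_MAX])(void) = { sys_getanswer };

static long do_syscall(unsigned long nr)
{
	long rax = -ENOSYS;	/* what SAVE_ARGS rax_enosys=1 preloads */

	if (nr < NR_MAX && sys_table[nr])
		rax = sys_table[nr]();	/* dispatched: slot overwritten */
	return rax;
}

int main(void)
{
	printf("%ld %ld\n", do_syscall(0), do_syscall(7));	/* 42 -38 */
	return 0;
}
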
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 412ecec..e97622f 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -344,7 +344,7 @@
 
 static inline void __thread_fpu_begin(struct task_struct *tsk)
 {
-	if (!static_cpu_has_safe(X86_FEATURE_EAGER_FPU))
+	if (!use_eager_fpu())
 		clts();
 	__thread_set_has_fpu(tsk);
 }
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index 9067166..bbe296e 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -43,7 +43,7 @@
 #define DWSIZE			(sizeof(u32))
 
 #define get_totalsize(mc) \
-	(((struct microcode_intel *)mc)->hdr.totalsize ? \
+	(((struct microcode_intel *)mc)->hdr.datasize ? \
 	 ((struct microcode_intel *)mc)->hdr.totalsize : \
 	 DEFAULT_UCODE_TOTALSIZE)
 
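The change matters because, in Intel's microcode update format, a Data Size field of zero denotes the original fixed-size layout (2000 bytes of data, 2048 bytes total) and the Total Size field is only meaningful when Data Size is non-zero, so datasize, not totalsize, is the correct discriminator for applying the default. Restated as a plain function over an abbreviated header (a sketch; field names shortened):

#include <stdio.h>

#define DEFAULT_UCODE_TOTALSIZE 2048

/* Abbreviated stand-in for struct microcode_header_intel. */
struct ucode_hdr {
	unsigned int datasize;
	unsigned int totalsize;
};

static unsigned int get_totalsize(const struct ucode_hdr *h)
{
	return h->datasize ? h->totalsize : DEFAULT_UCODE_TOTALSIZE;
}

int main(void)
{
	struct ucode_hdr old_fmt = { 0, 0 };		/* fixed-size format */
	struct ucode_hdr new_fmt = { 4000, 4096 };

	printf("%u\n", get_totalsize(&old_fmt));	/* 2048 */
	printf("%u\n", get_totalsize(&new_fmt));	/* 4096 */
	return 0;
}
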
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 8249df4..8dfc9fd 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -51,6 +51,14 @@
 	 ARCH_PERFMON_EVENTSEL_EDGE  |	\
 	 ARCH_PERFMON_EVENTSEL_INV   |	\
 	 ARCH_PERFMON_EVENTSEL_CMASK)
+#define X86_ALL_EVENT_FLAGS  			\
+	(ARCH_PERFMON_EVENTSEL_EDGE |  		\
+	 ARCH_PERFMON_EVENTSEL_INV | 		\
+	 ARCH_PERFMON_EVENTSEL_CMASK | 		\
+	 ARCH_PERFMON_EVENTSEL_ANY | 		\
+	 ARCH_PERFMON_EVENTSEL_PIN_CONTROL | 	\
+	 HSW_IN_TX | 				\
+	 HSW_IN_TX_CHECKPOINTED)
 #define AMD64_RAW_EVENT_MASK		\
 	(X86_RAW_EVENT_MASK          |  \
 	 AMD64_EVENTSEL_EVENT)
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 9ee3221..b6c0b40 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -32,9 +32,6 @@
 static inline void check_pgt_cache(void) { }
 void paging_init(void);
 
-extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
-
-
 /*
  * Define this if things work differently on an i386 and an i486:
  * it will (on an i486) warn about kernel memory accesses that are
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6205f0c..86fc2bb 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -75,6 +75,11 @@
 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 			 int error_code, int si_code);
 
+
+extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch);
+extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,
+				       unsigned long phase1_result);
+
 extern long syscall_trace_enter(struct pt_regs *);
 extern void syscall_trace_leave(struct pt_regs *);
 
diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h
index 628c801..460b84f 100644
--- a/arch/x86/include/asm/serial.h
+++ b/arch/x86/include/asm/serial.h
@@ -6,24 +6,24 @@
  *
  * It'd be nice if someone built a serial card with a 24.576 MHz
  * clock, since the 16550A is capable of handling a top speed of 1.5
- * megabits/second; but this requires the faster clock.
+ * megabits/second; but this requires a faster clock.
  */
-#define BASE_BAUD ( 1843200 / 16 )
+#define BASE_BAUD (1843200/16)
 
 /* Standard COM flags (except for COM4, because of the 8514 problem) */
 #ifdef CONFIG_SERIAL_DETECT_IRQ
-#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ)
-#define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_AUTO_IRQ)
+# define STD_COMX_FLAGS	(ASYNC_BOOT_AUTOCONF |	ASYNC_SKIP_TEST	| ASYNC_AUTO_IRQ)
+# define STD_COM4_FLAGS	(ASYNC_BOOT_AUTOCONF |	0		| ASYNC_AUTO_IRQ)
 #else
-#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST)
-#define STD_COM4_FLAGS ASYNC_BOOT_AUTOCONF
+# define STD_COMX_FLAGS	(ASYNC_BOOT_AUTOCONF |	ASYNC_SKIP_TEST	| 0		)
+# define STD_COM4_FLAGS	(ASYNC_BOOT_AUTOCONF |	0		| 0		)
 #endif
 
-#define SERIAL_PORT_DFNS			\
-	/* UART CLK   PORT IRQ     FLAGS        */			\
-	{ 0, BASE_BAUD, 0x3F8, 4, STD_COM_FLAGS },	/* ttyS0 */	\
-	{ 0, BASE_BAUD, 0x2F8, 3, STD_COM_FLAGS },	/* ttyS1 */	\
-	{ 0, BASE_BAUD, 0x3E8, 4, STD_COM_FLAGS },	/* ttyS2 */	\
-	{ 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS },	/* ttyS3 */
+#define SERIAL_PORT_DFNS								\
+	/* UART		CLK		PORT	IRQ	FLAGS			    */	\
+	{ .uart = 0,	BASE_BAUD,	0x3F8,	4,	STD_COMX_FLAGS	}, /* ttyS0 */	\
+	{ .uart = 0,	BASE_BAUD,	0x2F8,	3,	STD_COMX_FLAGS	}, /* ttyS1 */	\
+	{ .uart = 0,	BASE_BAUD,	0x3E8,	4,	STD_COMX_FLAGS	}, /* ttyS2 */	\
+	{ .uart = 0,	BASE_BAUD,	0x2E8,	3,	STD_COM4_FLAGS	}, /* ttyS3 */
 
 #endif /* _ASM_X86_SERIAL_H */
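
A side note on the rewritten SERIAL_PORT_DFNS table: mixing a single designated initializer (.uart = 0) with subsequent positional initializers is valid C99, since initialization simply resumes with the member after the designator. A tiny demonstration with a made-up struct:

#include <stdio.h>

struct port { int uart; int baud; int base; int irq; };

int main(void)
{
	/* After .uart, the positional values fill baud, base, irq. */
	struct port p = { .uart = 0, 115200, 0x3F8, 4 };

	printf("%d %d\n", p.baud, p.irq);	/* 115200 4 */
	return 0;
}
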
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index bbae024..d993e33 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -21,11 +21,6 @@
  * this size.
  */
 
-/*
- * Odd: 'make headers_check' complains about numa.h if I try
- * to collapse the next two #ifdef lines to a single line:
- *	#if defined(__KERNEL__) && defined(CONFIG_EFI)
- */
 #ifndef __KERNEL__
 #define E820_X_MAX E820MAX
 #endif
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 004f017..8e9dcfd 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -204,7 +204,6 @@
 
 static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
 {
-#ifdef CONFIG_SMP
 	unsigned long val;
 	int pnode;
 
@@ -223,7 +222,6 @@
 	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 
 	atomic_set(&init_deasserted, 1);
-#endif
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 7fd54f0..7e1fd4e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -36,7 +36,9 @@
 endif
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore.o perf_event_intel_rapl.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore.o perf_event_intel_uncore_snb.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore_snbep.o perf_event_intel_uncore_nhmex.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o
 endif
 
 
diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
index 617a9e2..7aa1acc 100644
--- a/arch/x86/kernel/cpu/microcode/amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -27,7 +27,7 @@
 u8 amd_ucode_patch[PATCH_MAX_SIZE];
 static u16 this_equiv_id;
 
-struct cpio_data ucode_cpio;
+static struct cpio_data ucode_cpio;
 
 /*
  * Microcode patch container file is prepended to the initrd in cpio format.
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index a276fa7..c6826d1 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -127,7 +127,7 @@
 	return get_matching_microcode(csig, cpf, mc_intel, crev);
 }
 
-int apply_microcode(int cpu)
+static int apply_microcode_intel(int cpu)
 {
 	struct microcode_intel *mc_intel;
 	struct ucode_cpu_info *uci;
@@ -314,7 +314,7 @@
 	.request_microcode_user		  = request_microcode_user,
 	.request_microcode_fw             = request_microcode_fw,
 	.collect_cpu_info                 = collect_cpu_info,
-	.apply_microcode                  = apply_microcode,
+	.apply_microcode                  = apply_microcode_intel,
 	.microcode_fini_cpu               = microcode_fini_cpu,
 };
 
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 18f7391..b88343f 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -28,8 +28,8 @@
 #include <asm/tlbflush.h>
 #include <asm/setup.h>
 
-unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
-struct mc_saved_data {
+static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
+static struct mc_saved_data {
 	unsigned int mc_saved_count;
 	struct microcode_intel **mc_saved;
 } mc_saved_data;
@@ -415,7 +415,7 @@
 	struct ucode_cpu_info uci;
 
 	if (mc_saved_data.mc_saved_count == 0) {
-		pr_debug("no micorcode data saved.\n");
+		pr_debug("no microcode data saved.\n");
 		return;
 	}
 	pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
@@ -506,7 +506,7 @@
 
 	if (mc_saved && mc_saved_count)
 		memcpy(mc_saved_tmp, mc_saved,
-		       mc_saved_count * sizeof(struct mirocode_intel *));
+		       mc_saved_count * sizeof(struct microcode_intel *));
 	/*
 	 * Save the microcode patch mc in mc_save_tmp structure if it's a newer
 	 * version.
@@ -526,7 +526,7 @@
 	show_saved_mc();
 
 	/*
-	 * Free old saved microcod data.
+	 * Free old saved microcode data.
 	 */
 	if (mc_saved) {
 		for (i = 0; i < mc_saved_count_init; i++)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2879ecd..0646d3b 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -387,7 +387,7 @@
 			precise++;
 
 			/* Support for IP fixup */
-			if (x86_pmu.lbr_nr)
+			if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
 				precise++;
 		}
 
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 8ade931..fc5eb39 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -67,8 +67,10 @@
  */
 #define PERF_X86_EVENT_PEBS_LDLAT	0x1 /* ld+ldlat data address sampling */
 #define PERF_X86_EVENT_PEBS_ST		0x2 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW	0x4 /* haswell style st data sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW	0x4 /* haswell style datala, store */
 #define PERF_X86_EVENT_COMMITTED	0x8 /* event passed commit_txn */
+#define PERF_X86_EVENT_PEBS_LD_HSW	0x10 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW	0x20 /* haswell style datala, unknown */
 
 struct amd_nb {
 	int nb_id;  /* NorthBridge id */
@@ -252,18 +254,52 @@
 	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
 
 #define INTEL_PLD_CONSTRAINT(c, n)	\
-	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
 			   HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
 
 #define INTEL_PST_CONSTRAINT(c, n)	\
-	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
 			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
 
-/* DataLA version of store sampling without extra enable bit. */
-#define INTEL_PST_HSW_CONSTRAINT(c, n)	\
-	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+/* Event constraint, but match on all event flags too. */
+#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
+	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
+/* Check only flags, but allow all event/umask */
+#define INTEL_ALL_EVENT_CONSTRAINT(code, n)	\
+	EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
+
+/* Check flags and event code, and set the HSW store flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
+	__EVENT_CONSTRAINT(code, n, 			\
+			  ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
 			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
 
+/* Check flags and event code, and set the HSW load flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
+	__EVENT_CONSTRAINT(code, n, 			\
+			  ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+/* Check flags and event code/umask, and set the HSW store flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
+	__EVENT_CONSTRAINT(code, n, 			\
+			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+/* Check flags and event code/umask, and set the HSW load flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
+	__EVENT_CONSTRAINT(code, n, 			\
+			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+/* Check flags and event code/umask, and set the HSW N/A flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
+	__EVENT_CONSTRAINT(code, n, 			\
+			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
+
 /*
  * We define the end marker as having a weight of -1
  * to enable blacklisting of events using a counter bitmask
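Editor's note: all of the wrappers added above bottom out in __EVENT_CONSTRAINT(c, n, m, w, o, f): event code, usable-counter bitmask, the config mask the code is compared under, the bitmask's Hamming weight, an overlap flag, and the per-event flag set when the constraint matches. As a hedged illustration (the initializer field names are assumptions based on how the macros are used here), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf) produces roughly:

    struct event_constraint c = {
            .idxmsk64 = 0xf,        /* may schedule on counters 0-3     */
            .code     = 0x82d0,     /* event 0xd0, umask 0x82           */
            .cmask    = INTEL_ARCH_EVENT_MASK | X86_ALL_EVENT_FLAGS,
            .weight   = 4,          /* HWEIGHT(0xf)                     */
            .overlap  = 0,
            .flags    = PERF_X86_EVENT_PEBS_ST_HSW, /* tag as HSW store */
    };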
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 2502d0d..89bc750 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2367,15 +2367,15 @@
 	 * Install the hw-cache-events table:
 	 */
 	switch (boot_cpu_data.x86_model) {
-	case 14: /* 65 nm core solo/duo, "Yonah" */
+	case 14: /* 65nm Core "Yonah" */
 		pr_cont("Core events, ");
 		break;
 
-	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+	case 15: /* 65nm Core2 "Merom"          */
 		x86_add_quirk(intel_clovertown_quirk);
-	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
-	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
-	case 29: /* six-core 45 nm xeon "Dunnington" */
+	case 22: /* 65nm Core2 "Merom-L"        */
+	case 23: /* 45nm Core2 "Penryn"         */
+	case 29: /* 45nm Core2 "Dunnington" (MP) */
 		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
@@ -2386,9 +2386,9 @@
 		pr_cont("Core2 events, ");
 		break;
 
-	case 26: /* 45 nm nehalem, "Bloomfield" */
-	case 30: /* 45 nm nehalem, "Lynnfield" */
-	case 46: /* 45 nm nehalem-ex, "Beckton" */
+	case 30: /* 45nm Nehalem    */
+	case 26: /* 45nm Nehalem-EP */
+	case 46: /* 45nm Nehalem-EX */
 		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -2415,11 +2415,11 @@
 		pr_cont("Nehalem events, ");
 		break;
 
-	case 28: /* Atom */
-	case 38: /* Lincroft */
-	case 39: /* Penwell */
-	case 53: /* Cloverview */
-	case 54: /* Cedarview */
+	case 28: /* 45nm Atom "Pineview"   */
+	case 38: /* 45nm Atom "Lincroft"   */
+	case 39: /* 32nm Atom "Penwell"    */
+	case 53: /* 32nm Atom "Cloverview" */
+	case 54: /* 32nm Atom "Cedarview"  */
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
@@ -2430,8 +2430,8 @@
 		pr_cont("Atom events, ");
 		break;
 
-	case 55: /* Atom 22nm "Silvermont" */
-	case 77: /* Avoton "Silvermont" */
+	case 55: /* 22nm Atom "Silvermont"                */
+	case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
 		memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
 			sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
@@ -2446,9 +2446,9 @@
 		pr_cont("Silvermont events, ");
 		break;
 
-	case 37: /* 32 nm nehalem, "Clarkdale" */
-	case 44: /* 32 nm nehalem, "Gulftown" */
-	case 47: /* 32 nm Xeon E7 */
+	case 37: /* 32nm Westmere    */
+	case 44: /* 32nm Westmere-EP */
+	case 47: /* 32nm Westmere-EX */
 		memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -2474,8 +2474,8 @@
 		pr_cont("Westmere events, ");
 		break;
 
-	case 42: /* SandyBridge */
-	case 45: /* SandyBridge, "Romely-EP" */
+	case 42: /* 32nm SandyBridge         */
+	case 45: /* 32nm SandyBridge-E/EN/EP */
 		x86_add_quirk(intel_sandybridge_quirk);
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
@@ -2506,8 +2506,9 @@
 
 		pr_cont("SandyBridge events, ");
 		break;
-	case 58: /* IvyBridge */
-	case 62: /* IvyBridge EP */
+
+	case 58: /* 22nm IvyBridge       */
+	case 62: /* 22nm IvyBridge-EP/EX */
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		/* dTLB-load-misses on IVB is different than SNB */
@@ -2539,11 +2540,11 @@
 		break;
 
 
-	case 60: /* Haswell Client */
-	case 70:
-	case 71:
+	case 60: /* 22nm Haswell */
 	case 63:
 	case 69:
+	case 70:
+	case 71:
 		x86_pmu.late_ack = true;
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -2552,7 +2553,7 @@
 
 		x86_pmu.event_constraints = intel_hsw_event_constraints;
 		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
-		x86_pmu.extra_regs = intel_snb_extra_regs;
+		x86_pmu.extra_regs = intel_snbep_extra_regs;
 		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
 		/* all extra regs are per-cpu when HT is on */
 		x86_pmu.er_flags |= ERF_HAS_RSP_1;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 696ade3..b1553d0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -108,14 +108,16 @@
 	return val;
 }
 
-static u64 precise_store_data_hsw(struct perf_event *event, u64 status)
+static u64 precise_datala_hsw(struct perf_event *event, u64 status)
 {
 	union perf_mem_data_src dse;
-	u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK;
 
-	dse.val = 0;
-	dse.mem_op = PERF_MEM_OP_STORE;
-	dse.mem_lvl = PERF_MEM_LVL_NA;
+	dse.val = PERF_MEM_NA;
+
+	if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+		dse.mem_op = PERF_MEM_OP_STORE;
+	else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW)
+		dse.mem_op = PERF_MEM_OP_LOAD;
 
 	/*
 	 * L1 info only valid for following events:
@@ -125,15 +127,12 @@
 	 * MEM_UOPS_RETIRED.SPLIT_STORES
 	 * MEM_UOPS_RETIRED.ALL_STORES
 	 */
-	if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0)
-		return dse.mem_lvl;
-
-	if (status & 1)
-		dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
-	else
-		dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
-
-	/* Nothing else supported. Sorry. */
+	if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) {
+		if (status & 1)
+			dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+		else
+			dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+	}
 	return dse.val;
 }
 
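Editor's note: precise_datala_hsw() now handles loads, stores, and unknown ops in one place: it starts from PERF_MEM_NA (all data-source fields marked not-available), picks the memory op from the per-event flag, and only trusts the PEBS status bit for the L1 hit/miss level on the store events listed above. The union perf_mem_data_src it fills is part of the perf UAPI; a small userspace demo of the same bitfield assembly (PERF_MEM_NA itself is a kernel-internal shorthand, so the demo starts from zero):

    #include <linux/perf_event.h>   /* UAPI header */
    #include <stdio.h>

    int main(void)
    {
            union perf_mem_data_src dse;

            /* A store that hit in L1, encoded as in the function above. */
            dse.val     = 0;
            dse.mem_op  = PERF_MEM_OP_STORE;
            dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;

            printf("data_src = %#llx\n", (unsigned long long)dse.val);
            return 0;
    }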
@@ -569,28 +568,10 @@
 };
 
 struct event_constraint intel_slm_pebs_event_constraints[] = {
-	INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */
-	INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */
-	INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */
-	INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */
-	INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */
-	INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */
-	INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */
-	INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */
-	INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */
-	INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */
-	INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */
-	INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */
-	INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */
-	INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */
-	INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */
-	INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */
-	INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */
-	INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */
-	INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */
-	INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */
-	INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */
-	INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */
+	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+	/* Allow all events as PEBS with no flags */
+	INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
 	EVENT_CONSTRAINT_END
 };
 
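Editor's note: the rewritten tables depend on first-match-wins ordering. The specific entries come first, and the trailing INTEL_ALL_EVENT_CONSTRAINT(0, ...) (code 0 compared under a flags-only mask) acts as a catch-all, so any event may now be used with PEBS instead of only a whitelist. A sketch of how such a table might be scanned, assuming the end-marker convention stated in perf_event.h above (terminator weight of -1):

    /* First matching constraint wins; NULL means no PEBS for this event. */
    static struct event_constraint *
    find_pebs_constraint(struct event_constraint *table, u64 config)
    {
            struct event_constraint *c;

            for (c = table; c->weight != -1; c++) {
                    if ((config & c->cmask) == c->code)
                            return c;
            }
            return NULL;
    }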
@@ -626,68 +607,44 @@
 
 struct event_constraint intel_snb_pebs_event_constraints[] = {
 	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
-	INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
-	INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
-	INTEL_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
-	INTEL_EVENT_CONSTRAINT(0xc5, 0xf),    /* BR_MISP_RETIRED.* */
 	INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
 	INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
-	INTEL_EVENT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
-	INTEL_EVENT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-	INTEL_EVENT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-	INTEL_EVENT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
-	INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
+	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+	/* Allow all events as PEBS with no flags */
+	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
 	EVENT_CONSTRAINT_END
 };
 
 struct event_constraint intel_ivb_pebs_event_constraints[] = {
         INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
-        INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
-        INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
-        INTEL_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
-        INTEL_EVENT_CONSTRAINT(0xc5, 0xf),    /* BR_MISP_RETIRED.* */
         INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
 	INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
-        INTEL_EVENT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
-        INTEL_EVENT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-        INTEL_EVENT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-        INTEL_EVENT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+	/* Allow all events as PEBS with no flags */
+	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
         EVENT_CONSTRAINT_END
 };
 
 struct event_constraint intel_hsw_pebs_event_constraints[] = {
 	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
-	INTEL_PST_HSW_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
-	INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
-	INTEL_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */
-	INTEL_UEVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */
-	INTEL_UEVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */
-	INTEL_UEVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.NEAR_TAKEN */
-	INTEL_PLD_CONSTRAINT(0x01cd, 0x8),    /* MEM_TRANS_RETIRED.* */
-	/* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
-	INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf),
-	/* MEM_UOPS_RETIRED.STLB_MISS_STORES */
-	INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf),
-	INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
-	INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
-	/* MEM_UOPS_RETIRED.SPLIT_STORES */
-	INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf),
-	INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
-	INTEL_PST_HSW_CONSTRAINT(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
-	INTEL_UEVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */
-	INTEL_UEVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */
-	INTEL_UEVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L3_HIT */
-	/* MEM_LOAD_UOPS_RETIRED.HIT_LFB */
-	INTEL_UEVENT_CONSTRAINT(0x40d1, 0xf),
-	/* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */
-	INTEL_UEVENT_CONSTRAINT(0x01d2, 0xf),
-	/* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */
-	INTEL_UEVENT_CONSTRAINT(0x02d2, 0xf),
-	/* MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM */
-	INTEL_UEVENT_CONSTRAINT(0x01d3, 0xf),
-	INTEL_UEVENT_CONSTRAINT(0x04c8, 0xf), /* HLE_RETIRED.Abort */
-	INTEL_UEVENT_CONSTRAINT(0x04c9, 0xf), /* RTM_RETIRED.Abort */
-
+	INTEL_PLD_CONSTRAINT(0x01cd, 0xf),    /* MEM_TRANS_RETIRED.* */
+	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+	/* Allow all events as PEBS with no flags */
+	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
 	EVENT_CONSTRAINT_END
 };
 
@@ -864,6 +821,10 @@
 static void __intel_pmu_pebs_event(struct perf_event *event,
 				   struct pt_regs *iregs, void *__pebs)
 {
+#define PERF_X86_EVENT_PEBS_HSW_PREC \
+		(PERF_X86_EVENT_PEBS_ST_HSW | \
+		 PERF_X86_EVENT_PEBS_LD_HSW | \
+		 PERF_X86_EVENT_PEBS_NA_HSW)
 	/*
 	 * We cast to the biggest pebs_record but are careful not to
 	 * unconditionally access the 'extra' entries.
@@ -873,42 +834,40 @@
 	struct perf_sample_data data;
 	struct pt_regs regs;
 	u64 sample_type;
-	int fll, fst;
+	int fll, fst, dsrc;
+	int fl = event->hw.flags;
 
 	if (!intel_pmu_save_and_restart(event))
 		return;
 
-	fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT;
-	fst = event->hw.flags & (PERF_X86_EVENT_PEBS_ST |
-				 PERF_X86_EVENT_PEBS_ST_HSW);
+	sample_type = event->attr.sample_type;
+	dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
+
+	fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
+	fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
 
 	perf_sample_data_init(&data, 0, event->hw.last_period);
 
 	data.period = event->hw.last_period;
-	sample_type = event->attr.sample_type;
 
 	/*
-	 * if PEBS-LL or PreciseStore
+	 * Use latency for weight (only avail with PEBS-LL)
 	 */
-	if (fll || fst) {
-		/*
-		 * Use latency for weight (only avail with PEBS-LL)
-		 */
-		if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
-			data.weight = pebs->lat;
+	if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
+		data.weight = pebs->lat;
 
-		/*
-		 * data.data_src encodes the data source
-		 */
-		if (sample_type & PERF_SAMPLE_DATA_SRC) {
-			if (fll)
-				data.data_src.val = load_latency_data(pebs->dse);
-			else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
-				data.data_src.val =
-					precise_store_data_hsw(event, pebs->dse);
-			else
-				data.data_src.val = precise_store_data(pebs->dse);
-		}
+	/*
+	 * data.data_src encodes the data source
+	 */
+	if (dsrc) {
+		u64 val = PERF_MEM_NA;
+		if (fll)
+			val = load_latency_data(pebs->dse);
+		else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
+			val = precise_datala_hsw(event, pebs->dse);
+		else if (fst)
+			val = precise_store_data(pebs->dse);
+		data.data_src.val = val;
 	}
 
 	/*
@@ -935,16 +894,16 @@
 	else
 		regs.flags &= ~PERF_EFLAGS_EXACT;
 
-	if ((event->attr.sample_type & PERF_SAMPLE_ADDR) &&
+	if ((sample_type & PERF_SAMPLE_ADDR) &&
 	    x86_pmu.intel_cap.pebs_format >= 1)
 		data.addr = pebs->dla;
 
 	if (x86_pmu.intel_cap.pebs_format >= 2) {
 		/* Only set the TSX weight when no memory weight. */
-		if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll)
+		if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
 			data.weight = intel_hsw_weight(pebs);
 
-		if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION)
+		if (sample_type & PERF_SAMPLE_TRANSACTION)
 			data.txn = intel_hsw_transaction(pebs);
 	}
 
@@ -1055,7 +1014,7 @@
  * BTS, PEBS probe and setup
  */
 
-void intel_ds_init(void)
+void __init intel_ds_init(void)
 {
 	/*
 	 * No support for 32bit formats
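Editor's note: intel_ds_init() runs once, from the PMU setup path at boot, so the __init annotation lets the kernel place it in .init.text and reclaim that memory after initialization. A minimal, self-contained example of the annotation (standard kernel-module boilerplate, not code from this patch):

    #include <linux/init.h>
    #include <linux/module.h>

    /* __init code is discarded after boot/module load, so it must not
     * be reachable from any later code path. */
    static int __init example_init(void)
    {
            pr_info("one-time setup\n");
            return 0;
    }

    static void __exit example_exit(void) { }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");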
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 9dd2459..4af1061 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -697,7 +697,7 @@
 };
 
 /* core */
-void intel_pmu_lbr_init_core(void)
+void __init intel_pmu_lbr_init_core(void)
 {
 	x86_pmu.lbr_nr     = 4;
 	x86_pmu.lbr_tos    = MSR_LBR_TOS;
@@ -712,7 +712,7 @@
 }
 
 /* nehalem/westmere */
-void intel_pmu_lbr_init_nhm(void)
+void __init intel_pmu_lbr_init_nhm(void)
 {
 	x86_pmu.lbr_nr     = 16;
 	x86_pmu.lbr_tos    = MSR_LBR_TOS;
@@ -733,7 +733,7 @@
 }
 
 /* sandy bridge */
-void intel_pmu_lbr_init_snb(void)
+void __init intel_pmu_lbr_init_snb(void)
 {
 	x86_pmu.lbr_nr	 = 16;
 	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
@@ -753,7 +753,7 @@
 }
 
 /* atom */
-void intel_pmu_lbr_init_atom(void)
+void __init intel_pmu_lbr_init_atom(void)
 {
 	/*
 	 * only models starting at stepping 10 seem
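Editor's note: the same __init treatment is applied to these LBR setup helpers; each merely records the model's Last Branch Record geometry -- entry count (lbr_nr), top-of-stack MSR (lbr_tos), and the FROM/TO MSR banks -- which the sampling path walks later. A hypothetical sketch of such a walk (helper names are stand-ins, not kernel API):

    extern unsigned long long read_msr(unsigned int msr);
    extern void record_branch(unsigned long long from, unsigned long long to);

    struct lbr_desc {
            unsigned int nr, tos_msr, from_msr, to_msr; /* nr: power of 2 */
    };

    /* Read entries newest-first, starting from the top-of-stack index. */
    void read_lbr_stack(const struct lbr_desc *d)
    {
            unsigned long long tos = read_msr(d->tos_msr);
            unsigned int i;

            for (i = 0; i < d->nr; i++) {
                    unsigned int idx = (tos - i) & (d->nr - 1);
                    record_branch(read_msr(d->from_msr + idx),
                                  read_msr(d->to_msr + idx));
            }
    }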
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 0939f86..812ec5d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -1,83 +1,39 @@
 #include "perf_event_intel_uncore.h"
 
 static struct intel_uncore_type *empty_uncore[] = { NULL, };
-static struct intel_uncore_type **msr_uncores = empty_uncore;
-static struct intel_uncore_type **pci_uncores = empty_uncore;
-/* pci bus to socket mapping */
-static int pcibus_to_physid[256] = { [0 ... 255] = -1, };
+struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
+struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 
-static struct pci_dev *extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
+static bool pcidrv_registered;
+struct pci_driver *uncore_pci_driver;
+/* pci bus to socket mapping */
+int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
-
 /* mask of cpus that collect uncore events */
 static cpumask_t uncore_cpu_mask;
 
 /* constraint for the fixed counter */
-static struct event_constraint constraint_fixed =
+static struct event_constraint uncore_constraint_fixed =
 	EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
-static struct event_constraint constraint_empty =
+struct event_constraint uncore_constraint_empty =
 	EVENT_CONSTRAINT(0, 0, 0);
 
-#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
-				((1ULL << (n)) - 1)))
+ssize_t uncore_event_show(struct kobject *kobj,
+			  struct kobj_attribute *attr, char *buf)
+{
+	struct uncore_event_desc *event =
+		container_of(attr, struct uncore_event_desc, attr);
+	return sprintf(buf, "%s", event->config);
+}
 
-DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
-DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
-DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
-DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
-DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
-DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
-DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
-DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
-DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
-DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
-DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
-DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
-DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
-DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
-DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
-DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
-
-static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
-static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
-static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
-static void uncore_pmu_event_read(struct perf_event *event);
-
-static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
 {
 	return container_of(event->pmu, struct intel_uncore_pmu, pmu);
 }
 
-static struct intel_uncore_box *
-uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
 {
 	struct intel_uncore_box *box;
 
@@ -98,7 +54,7 @@
 	return *per_cpu_ptr(pmu->box, cpu);
 }
 
-static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
 {
 	/*
 	 * perf core schedules event on the basis of cpu, uncore events are
@@ -107,7 +63,7 @@
 	return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
 }
 
-static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
 {
 	u64 count;
 
@@ -119,7 +75,7 @@
 /*
  * generic get constraint function for shared match/mask registers.
  */
-static struct event_constraint *
+struct event_constraint *
 uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
 {
 	struct intel_uncore_extra_reg *er;
@@ -154,10 +110,10 @@
 		return NULL;
 	}
 
-	return &constraint_empty;
+	return &uncore_constraint_empty;
 }
 
-static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
 {
 	struct intel_uncore_extra_reg *er;
 	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
@@ -178,7 +134,7 @@
 	reg1->alloc = 0;
 }
 
-static u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
 {
 	struct intel_uncore_extra_reg *er;
 	unsigned long flags;
@@ -193,2936 +149,6 @@
 	return config;
 }
 
-/* Sandy Bridge-EP uncore support */
-static struct intel_uncore_type snbep_uncore_cbox;
-static struct intel_uncore_type snbep_uncore_pcu;
-
-static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
-{
-	struct pci_dev *pdev = box->pci_dev;
-	int box_ctl = uncore_pci_box_ctl(box);
-	u32 config = 0;
-
-	if (!pci_read_config_dword(pdev, box_ctl, &config)) {
-		config |= SNBEP_PMON_BOX_CTL_FRZ;
-		pci_write_config_dword(pdev, box_ctl, config);
-	}
-}
-
-static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
-{
-	struct pci_dev *pdev = box->pci_dev;
-	int box_ctl = uncore_pci_box_ctl(box);
-	u32 config = 0;
-
-	if (!pci_read_config_dword(pdev, box_ctl, &config)) {
-		config &= ~SNBEP_PMON_BOX_CTL_FRZ;
-		pci_write_config_dword(pdev, box_ctl, config);
-	}
-}
-
-static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct pci_dev *pdev = box->pci_dev;
-	struct hw_perf_event *hwc = &event->hw;
-
-	pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct pci_dev *pdev = box->pci_dev;
-	struct hw_perf_event *hwc = &event->hw;
-
-	pci_write_config_dword(pdev, hwc->config_base, hwc->config);
-}
-
-static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct pci_dev *pdev = box->pci_dev;
-	struct hw_perf_event *hwc = &event->hw;
-	u64 count = 0;
-
-	pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
-	pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
-
-	return count;
-}
-
-static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
-{
-	struct pci_dev *pdev = box->pci_dev;
-
-	pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
-	u64 config;
-	unsigned msr;
-
-	msr = uncore_msr_box_ctl(box);
-	if (msr) {
-		rdmsrl(msr, config);
-		config |= SNBEP_PMON_BOX_CTL_FRZ;
-		wrmsrl(msr, config);
-	}
-}
-
-static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
-	u64 config;
-	unsigned msr;
-
-	msr = uncore_msr_box_ctl(box);
-	if (msr) {
-		rdmsrl(msr, config);
-		config &= ~SNBEP_PMON_BOX_CTL_FRZ;
-		wrmsrl(msr, config);
-	}
-}
-
-static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-	if (reg1->idx != EXTRA_REG_NONE)
-		wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
-
-	wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
-					struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-
-	wrmsrl(hwc->config_base, hwc->config);
-}
-
-static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-	unsigned msr = uncore_msr_box_ctl(box);
-
-	if (msr)
-		wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static struct attribute *snbep_uncore_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_inv.attr,
-	&format_attr_thresh8.attr,
-	NULL,
-};
-
-static struct attribute *snbep_uncore_ubox_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_inv.attr,
-	&format_attr_thresh5.attr,
-	NULL,
-};
-
-static struct attribute *snbep_uncore_cbox_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_tid_en.attr,
-	&format_attr_inv.attr,
-	&format_attr_thresh8.attr,
-	&format_attr_filter_tid.attr,
-	&format_attr_filter_nid.attr,
-	&format_attr_filter_state.attr,
-	&format_attr_filter_opc.attr,
-	NULL,
-};
-
-static struct attribute *snbep_uncore_pcu_formats_attr[] = {
-	&format_attr_event_ext.attr,
-	&format_attr_occ_sel.attr,
-	&format_attr_edge.attr,
-	&format_attr_inv.attr,
-	&format_attr_thresh5.attr,
-	&format_attr_occ_invert.attr,
-	&format_attr_occ_edge.attr,
-	&format_attr_filter_band0.attr,
-	&format_attr_filter_band1.attr,
-	&format_attr_filter_band2.attr,
-	&format_attr_filter_band3.attr,
-	NULL,
-};
-
-static struct attribute *snbep_uncore_qpi_formats_attr[] = {
-	&format_attr_event_ext.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_inv.attr,
-	&format_attr_thresh8.attr,
-	&format_attr_match_rds.attr,
-	&format_attr_match_rnid30.attr,
-	&format_attr_match_rnid4.attr,
-	&format_attr_match_dnid.attr,
-	&format_attr_match_mc.attr,
-	&format_attr_match_opc.attr,
-	&format_attr_match_vnw.attr,
-	&format_attr_match0.attr,
-	&format_attr_match1.attr,
-	&format_attr_mask_rds.attr,
-	&format_attr_mask_rnid30.attr,
-	&format_attr_mask_rnid4.attr,
-	&format_attr_mask_dnid.attr,
-	&format_attr_mask_mc.attr,
-	&format_attr_mask_opc.attr,
-	&format_attr_mask_vnw.attr,
-	&format_attr_mask0.attr,
-	&format_attr_mask1.attr,
-	NULL,
-};
-
-static struct uncore_event_desc snbep_uncore_imc_events[] = {
-	INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0xff,umask=0x00"),
-	INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
-	INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
-	{ /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc snbep_uncore_qpi_events[] = {
-	INTEL_UNCORE_EVENT_DESC(clockticks,       "event=0x14"),
-	INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
-	INTEL_UNCORE_EVENT_DESC(drs_data,         "event=0x102,umask=0x08"),
-	INTEL_UNCORE_EVENT_DESC(ncb_data,         "event=0x103,umask=0x04"),
-	{ /* end: all zeroes */ },
-};
-
-static struct attribute_group snbep_uncore_format_group = {
-	.name = "format",
-	.attrs = snbep_uncore_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_ubox_format_group = {
-	.name = "format",
-	.attrs = snbep_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_cbox_format_group = {
-	.name = "format",
-	.attrs = snbep_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_pcu_format_group = {
-	.name = "format",
-	.attrs = snbep_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_qpi_format_group = {
-	.name = "format",
-	.attrs = snbep_uncore_qpi_formats_attr,
-};
-
-#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT()			\
-	.init_box	= snbep_uncore_msr_init_box,		\
-	.disable_box	= snbep_uncore_msr_disable_box,		\
-	.enable_box	= snbep_uncore_msr_enable_box,		\
-	.disable_event	= snbep_uncore_msr_disable_event,	\
-	.enable_event	= snbep_uncore_msr_enable_event,	\
-	.read_counter	= uncore_msr_read_counter
-
-static struct intel_uncore_ops snbep_uncore_msr_ops = {
-	SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT()			\
-	.init_box	= snbep_uncore_pci_init_box,		\
-	.disable_box	= snbep_uncore_pci_disable_box,		\
-	.enable_box	= snbep_uncore_pci_enable_box,		\
-	.disable_event	= snbep_uncore_pci_disable_event,	\
-	.read_counter	= snbep_uncore_pci_read_counter
-
-static struct intel_uncore_ops snbep_uncore_pci_ops = {
-	SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
-	.enable_event	= snbep_uncore_pci_enable_event,	\
-};
-
-static struct event_constraint snbep_uncore_cbox_constraints[] = {
-	UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
-	UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
-	UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
-	UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
-	UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
-	UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
-	EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
-	UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
-	UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
-	EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
-	UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
-	UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-	EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
-	UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
-	UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
-	UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
-	EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snbep_uncore_ubox = {
-	.name		= "ubox",
-	.num_counters   = 2,
-	.num_boxes	= 1,
-	.perf_ctr_bits	= 44,
-	.fixed_ctr_bits	= 48,
-	.perf_ctr	= SNBEP_U_MSR_PMON_CTR0,
-	.event_ctl	= SNBEP_U_MSR_PMON_CTL0,
-	.event_mask	= SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
-	.fixed_ctr	= SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
-	.fixed_ctl	= SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
-	.ops		= &snbep_uncore_msr_ops,
-	.format_group	= &snbep_uncore_ubox_format_group,
-};
-
-static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
-	SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-				  SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
-	EVENT_EXTRA_END
-};
-
-static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-	struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-	int i;
-
-	if (uncore_box_is_fake(box))
-		return;
-
-	for (i = 0; i < 5; i++) {
-		if (reg1->alloc & (0x1 << i))
-			atomic_sub(1 << (i * 6), &er->ref);
-	}
-	reg1->alloc = 0;
-}
-
-static struct event_constraint *
-__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
-			    u64 (*cbox_filter_mask)(int fields))
-{
-	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-	struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-	int i, alloc = 0;
-	unsigned long flags;
-	u64 mask;
-
-	if (reg1->idx == EXTRA_REG_NONE)
-		return NULL;
-
-	raw_spin_lock_irqsave(&er->lock, flags);
-	for (i = 0; i < 5; i++) {
-		if (!(reg1->idx & (0x1 << i)))
-			continue;
-		if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
-			continue;
-
-		mask = cbox_filter_mask(0x1 << i);
-		if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
-		    !((reg1->config ^ er->config) & mask)) {
-			atomic_add(1 << (i * 6), &er->ref);
-			er->config &= ~mask;
-			er->config |= reg1->config & mask;
-			alloc |= (0x1 << i);
-		} else {
-			break;
-		}
-	}
-	raw_spin_unlock_irqrestore(&er->lock, flags);
-	if (i < 5)
-		goto fail;
-
-	if (!uncore_box_is_fake(box))
-		reg1->alloc |= alloc;
-
-	return NULL;
-fail:
-	for (; i >= 0; i--) {
-		if (alloc & (0x1 << i))
-			atomic_sub(1 << (i * 6), &er->ref);
-	}
-	return &constraint_empty;
-}
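Editor's note: this helper packs five 6-bit reference counters into a single atomic_t: filter field i owns bits [6i, 6i+5], __BITS_VALUE(x, i, 6) (defined at the top of this file) extracts field i, and atomic_add(1 << (i * 6), ...) increments it. A self-contained demo of the extraction:

    #include <stdio.h>

    /* Same idea as __BITS_VALUE(x, i, n): the i-th n-bit field of x,
     * counted from the least significant end. */
    #define BITS_VALUE(x, i, n) (((x) >> ((i) * (n))) & ((1ULL << (n)) - 1))

    int main(void)
    {
            /* Five 6-bit refcounts packed as 3, 0, 1, 0, 2. */
            unsigned long long ref = 3ULL | (1ULL << 12) | (2ULL << 24);
            int i;

            for (i = 0; i < 5; i++)
                    printf("field %d = %llu\n", i, BITS_VALUE(ref, i, 6));
            return 0;
    }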
-
-static u64 snbep_cbox_filter_mask(int fields)
-{
-	u64 mask = 0;
-
-	if (fields & 0x1)
-		mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
-	if (fields & 0x2)
-		mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
-	if (fields & 0x4)
-		mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
-	if (fields & 0x8)
-		mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
-
-	return mask;
-}
-
-static struct event_constraint *
-snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-	return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
-}
-
-static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-	struct extra_reg *er;
-	int idx = 0;
-
-	for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
-		if (er->event != (event->hw.config & er->config_mask))
-			continue;
-		idx |= er->idx;
-	}
-
-	if (idx) {
-		reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
-			SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
-		reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
-		reg1->idx = idx;
-	}
-	return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_cbox_ops = {
-	SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-	.hw_config		= snbep_cbox_hw_config,
-	.get_constraint		= snbep_cbox_get_constraint,
-	.put_constraint		= snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_cbox = {
-	.name			= "cbox",
-	.num_counters		= 4,
-	.num_boxes		= 8,
-	.perf_ctr_bits		= 44,
-	.event_ctl		= SNBEP_C0_MSR_PMON_CTL0,
-	.perf_ctr		= SNBEP_C0_MSR_PMON_CTR0,
-	.event_mask		= SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
-	.box_ctl		= SNBEP_C0_MSR_PMON_BOX_CTL,
-	.msr_offset		= SNBEP_CBO_MSR_OFFSET,
-	.num_shared_regs	= 1,
-	.constraints		= snbep_uncore_cbox_constraints,
-	.ops			= &snbep_uncore_cbox_ops,
-	.format_group		= &snbep_uncore_cbox_format_group,
-};
-
-static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-	u64 config = reg1->config;
-
-	if (new_idx > reg1->idx)
-		config <<= 8 * (new_idx - reg1->idx);
-	else
-		config >>= 8 * (reg1->idx - new_idx);
-
-	if (modify) {
-		hwc->config += new_idx - reg1->idx;
-		reg1->config = config;
-		reg1->idx = new_idx;
-	}
-	return config;
-}
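Editor's note: the PCU filter register packs four 8-bit occupancy bands, one byte per event-select index (0xb..0xe), so re-homing an event from reg1->idx to new_idx is just a shift of 8 bits per step, as above. A runnable illustration of the same arithmetic:

    #include <stdio.h>

    /* Mirrors snbep_pcu_alter_er(): each index owns one byte of the
     * packed config, so moving a band value is a byte-wise shift. */
    static unsigned long long move_band(unsigned long long config,
                                        int old_idx, int new_idx)
    {
            if (new_idx > old_idx)
                    return config << (8 * (new_idx - old_idx));
            return config >> (8 * (old_idx - new_idx));
    }

    int main(void)
    {
            unsigned long long cfg = 0x40ULL << 8; /* band 0x40 at index 1 */

            printf("%#llx -> %#llx\n", cfg, move_band(cfg, 1, 3));
            return 0;
    }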
-
-static struct event_constraint *
-snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-	struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-	unsigned long flags;
-	int idx = reg1->idx;
-	u64 mask, config1 = reg1->config;
-	bool ok = false;
-
-	if (reg1->idx == EXTRA_REG_NONE ||
-	    (!uncore_box_is_fake(box) && reg1->alloc))
-		return NULL;
-again:
-	mask = 0xffULL << (idx * 8);
-	raw_spin_lock_irqsave(&er->lock, flags);
-	if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
-	    !((config1 ^ er->config) & mask)) {
-		atomic_add(1 << (idx * 8), &er->ref);
-		er->config &= ~mask;
-		er->config |= config1 & mask;
-		ok = true;
-	}
-	raw_spin_unlock_irqrestore(&er->lock, flags);
-
-	if (!ok) {
-		idx = (idx + 1) % 4;
-		if (idx != reg1->idx) {
-			config1 = snbep_pcu_alter_er(event, idx, false);
-			goto again;
-		}
-		return &constraint_empty;
-	}
-
-	if (!uncore_box_is_fake(box)) {
-		if (idx != reg1->idx)
-			snbep_pcu_alter_er(event, idx, true);
-		reg1->alloc = 1;
-	}
-	return NULL;
-}
-
-static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-	struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-
-	if (uncore_box_is_fake(box) || !reg1->alloc)
-		return;
-
-	atomic_sub(1 << (reg1->idx * 8), &er->ref);
-	reg1->alloc = 0;
-}
-
-static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-	int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
-
-	if (ev_sel >= 0xb && ev_sel <= 0xe) {
-		reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
-		reg1->idx = ev_sel - 0xb;
-		reg1->config = event->attr.config1 & (0xff << reg1->idx);
-	}
-	return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_pcu_ops = {
-	SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-	.hw_config		= snbep_pcu_hw_config,
-	.get_constraint		= snbep_pcu_get_constraint,
-	.put_constraint		= snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_pcu = {
-	.name			= "pcu",
-	.num_counters		= 4,
-	.num_boxes		= 1,
-	.perf_ctr_bits		= 48,
-	.perf_ctr		= SNBEP_PCU_MSR_PMON_CTR0,
-	.event_ctl		= SNBEP_PCU_MSR_PMON_CTL0,
-	.event_mask		= SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
-	.box_ctl		= SNBEP_PCU_MSR_PMON_BOX_CTL,
-	.num_shared_regs	= 1,
-	.ops			= &snbep_uncore_pcu_ops,
-	.format_group		= &snbep_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *snbep_msr_uncores[] = {
-	&snbep_uncore_ubox,
-	&snbep_uncore_cbox,
-	&snbep_uncore_pcu,
-	NULL,
-};
-
-enum {
-	SNBEP_PCI_QPI_PORT0_FILTER,
-	SNBEP_PCI_QPI_PORT1_FILTER,
-};
-
-static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-	if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
-		reg1->idx = 0;
-		reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
-		reg1->config = event->attr.config1;
-		reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
-		reg2->config = event->attr.config2;
-	}
-	return 0;
-}
-
-static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct pci_dev *pdev = box->pci_dev;
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-	if (reg1->idx != EXTRA_REG_NONE) {
-		int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
-		struct pci_dev *filter_pdev = extra_pci_dev[box->phys_id][idx];
-		WARN_ON_ONCE(!filter_pdev);
-		if (filter_pdev) {
-			pci_write_config_dword(filter_pdev, reg1->reg,
-						(u32)reg1->config);
-			pci_write_config_dword(filter_pdev, reg1->reg + 4,
-						(u32)(reg1->config >> 32));
-			pci_write_config_dword(filter_pdev, reg2->reg,
-						(u32)reg2->config);
-			pci_write_config_dword(filter_pdev, reg2->reg + 4,
-						(u32)(reg2->config >> 32));
-		}
-	}
-
-	pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops snbep_uncore_qpi_ops = {
-	SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
-	.enable_event		= snbep_qpi_enable_event,
-	.hw_config		= snbep_qpi_hw_config,
-	.get_constraint		= uncore_get_constraint,
-	.put_constraint		= uncore_put_constraint,
-};
-
-#define SNBEP_UNCORE_PCI_COMMON_INIT()				\
-	.perf_ctr	= SNBEP_PCI_PMON_CTR0,			\
-	.event_ctl	= SNBEP_PCI_PMON_CTL0,			\
-	.event_mask	= SNBEP_PMON_RAW_EVENT_MASK,		\
-	.box_ctl	= SNBEP_PCI_PMON_BOX_CTL,		\
-	.ops		= &snbep_uncore_pci_ops,		\
-	.format_group	= &snbep_uncore_format_group
-
-static struct intel_uncore_type snbep_uncore_ha = {
-	.name		= "ha",
-	.num_counters   = 4,
-	.num_boxes	= 1,
-	.perf_ctr_bits	= 48,
-	SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_imc = {
-	.name		= "imc",
-	.num_counters   = 4,
-	.num_boxes	= 4,
-	.perf_ctr_bits	= 48,
-	.fixed_ctr_bits	= 48,
-	.fixed_ctr	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-	.fixed_ctl	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-	.event_descs	= snbep_uncore_imc_events,
-	SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_qpi = {
-	.name			= "qpi",
-	.num_counters		= 4,
-	.num_boxes		= 2,
-	.perf_ctr_bits		= 48,
-	.perf_ctr		= SNBEP_PCI_PMON_CTR0,
-	.event_ctl		= SNBEP_PCI_PMON_CTL0,
-	.event_mask		= SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
-	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL,
-	.num_shared_regs	= 1,
-	.ops			= &snbep_uncore_qpi_ops,
-	.event_descs		= snbep_uncore_qpi_events,
-	.format_group		= &snbep_uncore_qpi_format_group,
-};
-
-
-static struct intel_uncore_type snbep_uncore_r2pcie = {
-	.name		= "r2pcie",
-	.num_counters   = 4,
-	.num_boxes	= 1,
-	.perf_ctr_bits	= 44,
-	.constraints	= snbep_uncore_r2pcie_constraints,
-	SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_r3qpi = {
-	.name		= "r3qpi",
-	.num_counters   = 3,
-	.num_boxes	= 2,
-	.perf_ctr_bits	= 44,
-	.constraints	= snbep_uncore_r3qpi_constraints,
-	SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-	SNBEP_PCI_UNCORE_HA,
-	SNBEP_PCI_UNCORE_IMC,
-	SNBEP_PCI_UNCORE_QPI,
-	SNBEP_PCI_UNCORE_R2PCIE,
-	SNBEP_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *snbep_pci_uncores[] = {
-	[SNBEP_PCI_UNCORE_HA]		= &snbep_uncore_ha,
-	[SNBEP_PCI_UNCORE_IMC]		= &snbep_uncore_imc,
-	[SNBEP_PCI_UNCORE_QPI]		= &snbep_uncore_qpi,
-	[SNBEP_PCI_UNCORE_R2PCIE]	= &snbep_uncore_r2pcie,
-	[SNBEP_PCI_UNCORE_R3QPI]	= &snbep_uncore_r3qpi,
-	NULL,
-};
-
-static const struct pci_device_id snbep_uncore_pci_ids[] = {
-	{ /* Home Agent */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
-		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
-	},
-	{ /* MC Channel 0 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
-		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
-	},
-	{ /* MC Channel 1 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
-		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
-	},
-	{ /* MC Channel 2 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
-		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
-	},
-	{ /* MC Channel 3 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
-		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
-	},
-	{ /* QPI Port 0 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
-		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
-	},
-	{ /* QPI Port 1 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
-		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
-	},
-	{ /* R2PCIe */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
-		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
-	},
-	{ /* R3QPI Link 0 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
-		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
-	},
-	{ /* R3QPI Link 1 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
-		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
-	},
-	{ /* QPI Port 0 filter  */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
-		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-						   SNBEP_PCI_QPI_PORT0_FILTER),
-	},
-	{ /* QPI Port 1 filter  */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
-		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-						   SNBEP_PCI_QPI_PORT1_FILTER),
-	},
-	{ /* end: all zeroes */ }
-};
-
-static struct pci_driver snbep_uncore_pci_driver = {
-	.name		= "snbep_uncore",
-	.id_table	= snbep_uncore_pci_ids,
-};
-
-/*
- * build pci bus to socket mapping
- */
-static int snbep_pci2phy_map_init(int devid)
-{
-	struct pci_dev *ubox_dev = NULL;
-	int i, bus, nodeid;
-	int err = 0;
-	u32 config = 0;
-
-	while (1) {
-		/* find the UBOX device */
-		ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
-		if (!ubox_dev)
-			break;
-		bus = ubox_dev->bus->number;
-		/* get the Node ID of the local register */
-		err = pci_read_config_dword(ubox_dev, 0x40, &config);
-		if (err)
-			break;
-		nodeid = config;
-		/* get the Node ID mapping */
-		err = pci_read_config_dword(ubox_dev, 0x54, &config);
-		if (err)
-			break;
-		/*
-		 * every three bits in the Node ID mapping register maps
-		 * to a particular node.
-		 */
-		for (i = 0; i < 8; i++) {
-			if (nodeid == ((config >> (3 * i)) & 0x7)) {
-				pcibus_to_physid[bus] = i;
-				break;
-			}
-		}
-	}
-
-	if (!err) {
-		/*
-		 * For PCI bus with no UBOX device, find the next bus
-		 * that has UBOX device and use its mapping.
-		 */
-		i = -1;
-		for (bus = 255; bus >= 0; bus--) {
-			if (pcibus_to_physid[bus] >= 0)
-				i = pcibus_to_physid[bus];
-			else
-				pcibus_to_physid[bus] = i;
-		}
-	}
-
-	if (ubox_dev)
-		pci_dev_put(ubox_dev);
-
-	return err ? pcibios_err_to_errno(err) : 0;
-}
-/* end of Sandy Bridge-EP uncore support */
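Editor's note: the bus-to-socket map is built from the UBOX's Node ID registers: config offset 0x40 holds this socket's node id, and 0x54 holds a mapping word carrying one 3-bit node-id field per slot, so the loop above scans eight fields for a match. A runnable decode of that register layout:

    #include <stdio.h>

    /* Mirrors the inner loop of snbep_pci2phy_map_init(): find which
     * 3-bit slot of the mapping word names this node. */
    static int node_to_socket(unsigned int mapping, unsigned int nodeid)
    {
            int i;

            for (i = 0; i < 8; i++)
                    if (((mapping >> (3 * i)) & 0x7) == nodeid)
                            return i;
            return -1;
    }

    int main(void)
    {
            unsigned int mapping = 0, i;

            for (i = 0; i < 8; i++)  /* hypothetical: slot i holds node i */
                    mapping |= i << (3 * i);

            printf("node 5 -> socket %d\n", node_to_socket(mapping, 5));
            return 0;
    }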
-
-/* IvyTown uncore support */
-static void ivt_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-	unsigned msr = uncore_msr_box_ctl(box);
-	if (msr)
-		wrmsrl(msr, IVT_PMON_BOX_CTL_INT);
-}
-
-static void ivt_uncore_pci_init_box(struct intel_uncore_box *box)
-{
-	struct pci_dev *pdev = box->pci_dev;
-
-	pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVT_PMON_BOX_CTL_INT);
-}
-
-#define IVT_UNCORE_MSR_OPS_COMMON_INIT()			\
-	.init_box	= ivt_uncore_msr_init_box,		\
-	.disable_box	= snbep_uncore_msr_disable_box,		\
-	.enable_box	= snbep_uncore_msr_enable_box,		\
-	.disable_event	= snbep_uncore_msr_disable_event,	\
-	.enable_event	= snbep_uncore_msr_enable_event,	\
-	.read_counter	= uncore_msr_read_counter
-
-static struct intel_uncore_ops ivt_uncore_msr_ops = {
-	IVT_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-static struct intel_uncore_ops ivt_uncore_pci_ops = {
-	.init_box	= ivt_uncore_pci_init_box,
-	.disable_box	= snbep_uncore_pci_disable_box,
-	.enable_box	= snbep_uncore_pci_enable_box,
-	.disable_event	= snbep_uncore_pci_disable_event,
-	.enable_event	= snbep_uncore_pci_enable_event,
-	.read_counter	= snbep_uncore_pci_read_counter,
-};
-
-#define IVT_UNCORE_PCI_COMMON_INIT()				\
-	.perf_ctr	= SNBEP_PCI_PMON_CTR0,			\
-	.event_ctl	= SNBEP_PCI_PMON_CTL0,			\
-	.event_mask	= IVT_PMON_RAW_EVENT_MASK,		\
-	.box_ctl	= SNBEP_PCI_PMON_BOX_CTL,		\
-	.ops		= &ivt_uncore_pci_ops,			\
-	.format_group	= &ivt_uncore_format_group
-
-static struct attribute *ivt_uncore_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_inv.attr,
-	&format_attr_thresh8.attr,
-	NULL,
-};
-
-static struct attribute *ivt_uncore_ubox_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_inv.attr,
-	&format_attr_thresh5.attr,
-	NULL,
-};
-
-static struct attribute *ivt_uncore_cbox_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_tid_en.attr,
-	&format_attr_thresh8.attr,
-	&format_attr_filter_tid.attr,
-	&format_attr_filter_link.attr,
-	&format_attr_filter_state2.attr,
-	&format_attr_filter_nid2.attr,
-	&format_attr_filter_opc2.attr,
-	NULL,
-};
-
-static struct attribute *ivt_uncore_pcu_formats_attr[] = {
-	&format_attr_event_ext.attr,
-	&format_attr_occ_sel.attr,
-	&format_attr_edge.attr,
-	&format_attr_thresh5.attr,
-	&format_attr_occ_invert.attr,
-	&format_attr_occ_edge.attr,
-	&format_attr_filter_band0.attr,
-	&format_attr_filter_band1.attr,
-	&format_attr_filter_band2.attr,
-	&format_attr_filter_band3.attr,
-	NULL,
-};
-
-static struct attribute *ivt_uncore_qpi_formats_attr[] = {
-	&format_attr_event_ext.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_thresh8.attr,
-	&format_attr_match_rds.attr,
-	&format_attr_match_rnid30.attr,
-	&format_attr_match_rnid4.attr,
-	&format_attr_match_dnid.attr,
-	&format_attr_match_mc.attr,
-	&format_attr_match_opc.attr,
-	&format_attr_match_vnw.attr,
-	&format_attr_match0.attr,
-	&format_attr_match1.attr,
-	&format_attr_mask_rds.attr,
-	&format_attr_mask_rnid30.attr,
-	&format_attr_mask_rnid4.attr,
-	&format_attr_mask_dnid.attr,
-	&format_attr_mask_mc.attr,
-	&format_attr_mask_opc.attr,
-	&format_attr_mask_vnw.attr,
-	&format_attr_mask0.attr,
-	&format_attr_mask1.attr,
-	NULL,
-};
-
-static struct attribute_group ivt_uncore_format_group = {
-	.name = "format",
-	.attrs = ivt_uncore_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_ubox_format_group = {
-	.name = "format",
-	.attrs = ivt_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_cbox_format_group = {
-	.name = "format",
-	.attrs = ivt_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_pcu_format_group = {
-	.name = "format",
-	.attrs = ivt_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_qpi_format_group = {
-	.name = "format",
-	.attrs = ivt_uncore_qpi_formats_attr,
-};
-
-static struct intel_uncore_type ivt_uncore_ubox = {
-	.name		= "ubox",
-	.num_counters   = 2,
-	.num_boxes	= 1,
-	.perf_ctr_bits	= 44,
-	.fixed_ctr_bits	= 48,
-	.perf_ctr	= SNBEP_U_MSR_PMON_CTR0,
-	.event_ctl	= SNBEP_U_MSR_PMON_CTL0,
-	.event_mask	= IVT_U_MSR_PMON_RAW_EVENT_MASK,
-	.fixed_ctr	= SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
-	.fixed_ctl	= SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
-	.ops		= &ivt_uncore_msr_ops,
-	.format_group	= &ivt_uncore_ubox_format_group,
-};
-
-static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
-	SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
-				  SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
-
-	SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
-	SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
-	EVENT_EXTRA_END
-};
-
-static u64 ivt_cbox_filter_mask(int fields)
-{
-	u64 mask = 0;
-
-	if (fields & 0x1)
-		mask |= IVT_CB0_MSR_PMON_BOX_FILTER_TID;
-	if (fields & 0x2)
-		mask |= IVT_CB0_MSR_PMON_BOX_FILTER_LINK;
-	if (fields & 0x4)
-		mask |= IVT_CB0_MSR_PMON_BOX_FILTER_STATE;
-	if (fields & 0x8)
-		mask |= IVT_CB0_MSR_PMON_BOX_FILTER_NID;
-	if (fields & 0x10)
-		mask |= IVT_CB0_MSR_PMON_BOX_FILTER_OPC;
-
-	return mask;
-}
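-
-/*
- * A minimal sketch of the fields encoding (values illustrative): fields
- * 0x5 == 0x1 | 0x4 selects the TID and STATE filter fields, so
- * ivt_cbox_filter_mask(0x5) returns
- * IVT_CB0_MSR_PMON_BOX_FILTER_TID | IVT_CB0_MSR_PMON_BOX_FILTER_STATE.
- */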
-
-static struct event_constraint *
-ivt_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-	return __snbep_cbox_get_constraint(box, event, ivt_cbox_filter_mask);
-}
-
-static int ivt_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-	struct extra_reg *er;
-	int idx = 0;
-
-	for (er = ivt_uncore_cbox_extra_regs; er->msr; er++) {
-		if (er->event != (event->hw.config & er->config_mask))
-			continue;
-		idx |= er->idx;
-	}
-
-	if (idx) {
-		reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
-			SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
-		reg1->config = event->attr.config1 & ivt_cbox_filter_mask(idx);
-		reg1->idx = idx;
-	}
-	return 0;
-}
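-
-/*
- * For example, an event matching the 0x1134 entry in the extra_regs table
- * above picks up idx 0x4 (the STATE field); config1 is then masked down to
- * the STATE filter bits and staged in reg1 for the per-box filter register.
- */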
-
-static void ivt_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-	if (reg1->idx != EXTRA_REG_NONE) {
-		u64 filter = uncore_shared_reg_config(box, 0);
-		wrmsrl(reg1->reg, filter & 0xffffffff);
-		wrmsrl(reg1->reg + 6, filter >> 32);
-	}
-
-	wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
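-
-/*
- * Note how the 64-bit filter value is split across two MSRs spaced 6
- * apart; for a hypothetical filter of 0x1234567890abcdef this amounts to:
- *
- *	wrmsrl(reg1->reg,     0x90abcdef);	(low 32 bits)
- *	wrmsrl(reg1->reg + 6, 0x12345678);	(high 32 bits)
- */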
-
-static struct intel_uncore_ops ivt_uncore_cbox_ops = {
-	.init_box		= ivt_uncore_msr_init_box,
-	.disable_box		= snbep_uncore_msr_disable_box,
-	.enable_box		= snbep_uncore_msr_enable_box,
-	.disable_event		= snbep_uncore_msr_disable_event,
-	.enable_event		= ivt_cbox_enable_event,
-	.read_counter		= uncore_msr_read_counter,
-	.hw_config		= ivt_cbox_hw_config,
-	.get_constraint		= ivt_cbox_get_constraint,
-	.put_constraint		= snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type ivt_uncore_cbox = {
-	.name			= "cbox",
-	.num_counters		= 4,
-	.num_boxes		= 15,
-	.perf_ctr_bits		= 44,
-	.event_ctl		= SNBEP_C0_MSR_PMON_CTL0,
-	.perf_ctr		= SNBEP_C0_MSR_PMON_CTR0,
-	.event_mask		= IVT_CBO_MSR_PMON_RAW_EVENT_MASK,
-	.box_ctl		= SNBEP_C0_MSR_PMON_BOX_CTL,
-	.msr_offset		= SNBEP_CBO_MSR_OFFSET,
-	.num_shared_regs	= 1,
-	.constraints		= snbep_uncore_cbox_constraints,
-	.ops			= &ivt_uncore_cbox_ops,
-	.format_group		= &ivt_uncore_cbox_format_group,
-};
-
-static struct intel_uncore_ops ivt_uncore_pcu_ops = {
-	IVT_UNCORE_MSR_OPS_COMMON_INIT(),
-	.hw_config		= snbep_pcu_hw_config,
-	.get_constraint		= snbep_pcu_get_constraint,
-	.put_constraint		= snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type ivt_uncore_pcu = {
-	.name			= "pcu",
-	.num_counters		= 4,
-	.num_boxes		= 1,
-	.perf_ctr_bits		= 48,
-	.perf_ctr		= SNBEP_PCU_MSR_PMON_CTR0,
-	.event_ctl		= SNBEP_PCU_MSR_PMON_CTL0,
-	.event_mask		= IVT_PCU_MSR_PMON_RAW_EVENT_MASK,
-	.box_ctl		= SNBEP_PCU_MSR_PMON_BOX_CTL,
-	.num_shared_regs	= 1,
-	.ops			= &ivt_uncore_pcu_ops,
-	.format_group		= &ivt_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *ivt_msr_uncores[] = {
-	&ivt_uncore_ubox,
-	&ivt_uncore_cbox,
-	&ivt_uncore_pcu,
-	NULL,
-};
-
-static struct intel_uncore_type ivt_uncore_ha = {
-	.name		= "ha",
-	.num_counters   = 4,
-	.num_boxes	= 2,
-	.perf_ctr_bits	= 48,
-	IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivt_uncore_imc = {
-	.name		= "imc",
-	.num_counters   = 4,
-	.num_boxes	= 8,
-	.perf_ctr_bits	= 48,
-	.fixed_ctr_bits	= 48,
-	.fixed_ctr	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
-	.fixed_ctl	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
-	IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-/* registers in IRP boxes are not properly aligned */
-static unsigned ivt_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
-static unsigned ivt_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
-
-static void ivt_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct pci_dev *pdev = box->pci_dev;
-	struct hw_perf_event *hwc = &event->hw;
-
-	pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx],
-			       hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void ivt_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct pci_dev *pdev = box->pci_dev;
-	struct hw_perf_event *hwc = &event->hw;
-
-	pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], hwc->config);
-}
-
-static u64 ivt_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct pci_dev *pdev = box->pci_dev;
-	struct hw_perf_event *hwc = &event->hw;
-	u64 count = 0;
-
-	pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
-	pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
-
-	return count;
-}
-
-static struct intel_uncore_ops ivt_uncore_irp_ops = {
-	.init_box	= ivt_uncore_pci_init_box,
-	.disable_box	= snbep_uncore_pci_disable_box,
-	.enable_box	= snbep_uncore_pci_enable_box,
-	.disable_event	= ivt_uncore_irp_disable_event,
-	.enable_event	= ivt_uncore_irp_enable_event,
-	.read_counter	= ivt_uncore_irp_read_counter,
-};
-
-static struct intel_uncore_type ivt_uncore_irp = {
-	.name			= "irp",
-	.num_counters		= 4,
-	.num_boxes		= 1,
-	.perf_ctr_bits		= 48,
-	.event_mask		= IVT_PMON_RAW_EVENT_MASK,
-	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL,
-	.ops			= &ivt_uncore_irp_ops,
-	.format_group		= &ivt_uncore_format_group,
-};
-
-static struct intel_uncore_ops ivt_uncore_qpi_ops = {
-	.init_box	= ivt_uncore_pci_init_box,
-	.disable_box	= snbep_uncore_pci_disable_box,
-	.enable_box	= snbep_uncore_pci_enable_box,
-	.disable_event	= snbep_uncore_pci_disable_event,
-	.enable_event	= snbep_qpi_enable_event,
-	.read_counter	= snbep_uncore_pci_read_counter,
-	.hw_config	= snbep_qpi_hw_config,
-	.get_constraint	= uncore_get_constraint,
-	.put_constraint	= uncore_put_constraint,
-};
-
-static struct intel_uncore_type ivt_uncore_qpi = {
-	.name			= "qpi",
-	.num_counters		= 4,
-	.num_boxes		= 3,
-	.perf_ctr_bits		= 48,
-	.perf_ctr		= SNBEP_PCI_PMON_CTR0,
-	.event_ctl		= SNBEP_PCI_PMON_CTL0,
-	.event_mask		= IVT_QPI_PCI_PMON_RAW_EVENT_MASK,
-	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL,
-	.num_shared_regs	= 1,
-	.ops			= &ivt_uncore_qpi_ops,
-	.format_group		= &ivt_uncore_qpi_format_group,
-};
-
-static struct intel_uncore_type ivt_uncore_r2pcie = {
-	.name		= "r2pcie",
-	.num_counters   = 4,
-	.num_boxes	= 1,
-	.perf_ctr_bits	= 44,
-	.constraints	= snbep_uncore_r2pcie_constraints,
-	IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivt_uncore_r3qpi = {
-	.name		= "r3qpi",
-	.num_counters   = 3,
-	.num_boxes	= 2,
-	.perf_ctr_bits	= 44,
-	.constraints	= snbep_uncore_r3qpi_constraints,
-	IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
-	IVT_PCI_UNCORE_HA,
-	IVT_PCI_UNCORE_IMC,
-	IVT_PCI_UNCORE_IRP,
-	IVT_PCI_UNCORE_QPI,
-	IVT_PCI_UNCORE_R2PCIE,
-	IVT_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *ivt_pci_uncores[] = {
-	[IVT_PCI_UNCORE_HA]	= &ivt_uncore_ha,
-	[IVT_PCI_UNCORE_IMC]	= &ivt_uncore_imc,
-	[IVT_PCI_UNCORE_IRP]	= &ivt_uncore_irp,
-	[IVT_PCI_UNCORE_QPI]	= &ivt_uncore_qpi,
-	[IVT_PCI_UNCORE_R2PCIE]	= &ivt_uncore_r2pcie,
-	[IVT_PCI_UNCORE_R3QPI]	= &ivt_uncore_r3qpi,
-	NULL,
-};
-
-static const struct pci_device_id ivt_uncore_pci_ids[] = {
-	{ /* Home Agent 0 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0),
-	},
-	{ /* Home Agent 1 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 1),
-	},
-	{ /* MC0 Channel 0 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 0),
-	},
-	{ /* MC0 Channel 1 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 1),
-	},
-	{ /* MC0 Channel 3 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 2),
-	},
-	{ /* MC0 Channel 4 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 3),
-	},
-	{ /* MC1 Channel 0 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 4),
-	},
-	{ /* MC1 Channel 1 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 5),
-	},
-	{ /* MC1 Channel 3 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 6),
-	},
-	{ /* MC1 Channel 4 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7),
-	},
-	{ /* IRP */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IRP, 0),
-	},
-	{ /* QPI0 Port 0 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0),
-	},
-	{ /* QPI0 Port 1 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 1),
-	},
-	{ /* QPI1 Port 2 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 2),
-	},
-	{ /* R2PCIe */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R2PCIE, 0),
-	},
-	{ /* R3QPI0 Link 0 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 0),
-	},
-	{ /* R3QPI0 Link 1 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 1),
-	},
-	{ /* R3QPI1 Link 2 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
-		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2),
-	},
-	{ /* QPI Port 0 filter  */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
-		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-						   SNBEP_PCI_QPI_PORT0_FILTER),
-	},
-	{ /* QPI Port 0 filter  */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
-		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
-						   SNBEP_PCI_QPI_PORT1_FILTER),
-	},
-	{ /* end: all zeroes */ }
-};
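-
-/*
- * UNCORE_PCI_DEV_DATA(type, idx) evidently packs an uncore type and box
- * index into driver_data; e.g. the 0xe30 entry above binds Home Agent 0
- * to (IVT_PCI_UNCORE_HA, 0), to be unpacked later with
- * UNCORE_PCI_DEV_TYPE() and UNCORE_PCI_DEV_IDX().
- */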
-
-static struct pci_driver ivt_uncore_pci_driver = {
-	.name		= "ivt_uncore",
-	.id_table	= ivt_uncore_pci_ids,
-};
-/* end of IvyTown uncore support */
-
-/* Sandy Bridge uncore support */
-static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-
-	if (hwc->idx < UNCORE_PMC_IDX_FIXED)
-		wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
-	else
-		wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
-}
-
-static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	wrmsrl(event->hw.config_base, 0);
-}
-
-static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-	if (box->pmu->pmu_idx == 0) {
-		wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
-			SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
-	}
-}
-
-static struct uncore_event_desc snb_uncore_events[] = {
-	INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
-	{ /* end: all zeroes */ },
-};
-
-static struct attribute *snb_uncore_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_inv.attr,
-	&format_attr_cmask5.attr,
-	NULL,
-};
-
-static struct attribute_group snb_uncore_format_group = {
-	.name		= "format",
-	.attrs		= snb_uncore_formats_attr,
-};
-
-static struct intel_uncore_ops snb_uncore_msr_ops = {
-	.init_box	= snb_uncore_msr_init_box,
-	.disable_event	= snb_uncore_msr_disable_event,
-	.enable_event	= snb_uncore_msr_enable_event,
-	.read_counter	= uncore_msr_read_counter,
-};
-
-static struct event_constraint snb_uncore_cbox_constraints[] = {
-	UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
-	UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
-	EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snb_uncore_cbox = {
-	.name		= "cbox",
-	.num_counters   = 2,
-	.num_boxes	= 4,
-	.perf_ctr_bits	= 44,
-	.fixed_ctr_bits	= 48,
-	.perf_ctr	= SNB_UNC_CBO_0_PER_CTR0,
-	.event_ctl	= SNB_UNC_CBO_0_PERFEVTSEL0,
-	.fixed_ctr	= SNB_UNC_FIXED_CTR,
-	.fixed_ctl	= SNB_UNC_FIXED_CTR_CTRL,
-	.single_fixed	= 1,
-	.event_mask	= SNB_UNC_RAW_EVENT_MASK,
-	.msr_offset	= SNB_UNC_CBO_MSR_OFFSET,
-	.constraints	= snb_uncore_cbox_constraints,
-	.ops		= &snb_uncore_msr_ops,
-	.format_group	= &snb_uncore_format_group,
-	.event_descs	= snb_uncore_events,
-};
-
-static struct intel_uncore_type *snb_msr_uncores[] = {
-	&snb_uncore_cbox,
-	NULL,
-};
-
-enum {
-	SNB_PCI_UNCORE_IMC,
-};
-
-static struct uncore_event_desc snb_uncore_imc_events[] = {
-	INTEL_UNCORE_EVENT_DESC(data_reads,  "event=0x01"),
-	INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
-	INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
-
-	INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
-	INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
-	INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
-
-	{ /* end: all zeroes */ },
-};
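-
-/*
- * The scale of 6.103515625e-5 is 64 / 2^20, which suggests each counter
- * increment corresponds to one 64-byte cache line: with the unit set to
- * MiB, MiB = count * 64 / 1048576.
- */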
-
-#define SNB_UNCORE_PCI_IMC_EVENT_MASK		0xff
-#define SNB_UNCORE_PCI_IMC_BAR_OFFSET		0x48
-
-/* page size multiple covering all config regs */
-#define SNB_UNCORE_PCI_IMC_MAP_SIZE		0x6000
-
-#define SNB_UNCORE_PCI_IMC_DATA_READS		0x1
-#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE	0x5050
-#define SNB_UNCORE_PCI_IMC_DATA_WRITES		0x2
-#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE	0x5054
-#define SNB_UNCORE_PCI_IMC_CTR_BASE		SNB_UNCORE_PCI_IMC_DATA_READS_BASE
-
-static struct attribute *snb_uncore_imc_formats_attr[] = {
-	&format_attr_event.attr,
-	NULL,
-};
-
-static struct attribute_group snb_uncore_imc_format_group = {
-	.name = "format",
-	.attrs = snb_uncore_imc_formats_attr,
-};
-
-static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
-{
-	struct pci_dev *pdev = box->pci_dev;
-	int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
-	resource_size_t addr;
-	u32 pci_dword;
-
-	pci_read_config_dword(pdev, where, &pci_dword);
-	addr = pci_dword;
-
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
-	pci_read_config_dword(pdev, where + 4, &pci_dword);
-	addr |= ((resource_size_t)pci_dword << 32);
-#endif
-
-	addr &= ~(PAGE_SIZE - 1);
-
-	box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
-	box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
-}
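-
-/*
- * Sketch of the BAR decode above, assuming 4KiB pages and a hypothetical
- * config-space value of 0xfed15001: masking off the low PAGE_SIZE bits
- * yields the map base 0xfed15000, which is then ioremap()ed for the MMIO
- * counter reads below.
- */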
-
-static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
-{}
-
-static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
-{}
-
-static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{}
-
-static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{}
-
-static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-
-	return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
-}
-
-/*
- * Custom event_init() function: we define our own fixed, free-running
- * counters, so we do not want to conflict with the generic uncore
- * logic. This also simplifies processing.
- */
-static int snb_uncore_imc_event_init(struct perf_event *event)
-{
-	struct intel_uncore_pmu *pmu;
-	struct intel_uncore_box *box;
-	struct hw_perf_event *hwc = &event->hw;
-	u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
-	int idx, base;
-
-	if (event->attr.type != event->pmu->type)
-		return -ENOENT;
-
-	pmu = uncore_event_to_pmu(event);
-	/* no device found for this pmu */
-	if (pmu->func_id < 0)
-		return -ENOENT;
-
-	/* Sampling not supported yet */
-	if (hwc->sample_period)
-		return -EINVAL;
-
-	/* unsupported modes and filters */
-	if (event->attr.exclude_user   ||
-	    event->attr.exclude_kernel ||
-	    event->attr.exclude_hv     ||
-	    event->attr.exclude_idle   ||
-	    event->attr.exclude_host   ||
-	    event->attr.exclude_guest  ||
-	    event->attr.sample_period) /* no sampling */
-		return -EINVAL;
-
-	/*
-	 * Place all uncore events for a particular physical package
-	 * onto a single cpu
-	 */
-	if (event->cpu < 0)
-		return -EINVAL;
-
-	/* check only supported bits are set */
-	if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
-		return -EINVAL;
-
-	box = uncore_pmu_to_box(pmu, event->cpu);
-	if (!box || box->cpu < 0)
-		return -EINVAL;
-
-	event->cpu = box->cpu;
-
-	event->hw.idx = -1;
-	event->hw.last_tag = ~0ULL;
-	event->hw.extra_reg.idx = EXTRA_REG_NONE;
-	event->hw.branch_reg.idx = EXTRA_REG_NONE;
-	/*
-	 * check event is known (whitelist, determines counter)
-	 */
-	switch (cfg) {
-	case SNB_UNCORE_PCI_IMC_DATA_READS:
-		base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
-		idx = UNCORE_PMC_IDX_FIXED;
-		break;
-	case SNB_UNCORE_PCI_IMC_DATA_WRITES:
-		base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
-		idx = UNCORE_PMC_IDX_FIXED + 1;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	/* must be done before validate_group */
-	event->hw.event_base = base;
-	event->hw.config = cfg;
-	event->hw.idx = idx;
-
-	/* no group validation needed, we have free running counters */
-
-	return 0;
-}
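-
-/*
- * Usage sketch, assuming the usual "uncore_" prefix once this PMU is
- * registered: the two whitelisted events can be counted system-wide with
- * something like
- *
- *	perf stat -a -e uncore_imc/data_reads/,uncore_imc/data_writes/ sleep 1
- */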
-
-static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-	return 0;
-}
-
-static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
-{
-	struct intel_uncore_box *box = uncore_event_to_box(event);
-	u64 count;
-
-	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
-		return;
-
-	event->hw.state = 0;
-	box->n_active++;
-
-	list_add_tail(&event->active_entry, &box->active_list);
-
-	count = snb_uncore_imc_read_counter(box, event);
-	local64_set(&event->hw.prev_count, count);
-
-	if (box->n_active == 1)
-		uncore_pmu_start_hrtimer(box);
-}
-
-static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
-{
-	struct intel_uncore_box *box = uncore_event_to_box(event);
-	struct hw_perf_event *hwc = &event->hw;
-
-	if (!(hwc->state & PERF_HES_STOPPED)) {
-		box->n_active--;
-
-		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-		hwc->state |= PERF_HES_STOPPED;
-
-		list_del(&event->active_entry);
-
-		if (box->n_active == 0)
-			uncore_pmu_cancel_hrtimer(box);
-	}
-
-	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
-		/*
-		 * Drain the remaining delta count out of an event
-		 * that we are disabling:
-		 */
-		uncore_perf_event_update(box, event);
-		hwc->state |= PERF_HES_UPTODATE;
-	}
-}
-
-static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
-{
-	struct intel_uncore_box *box = uncore_event_to_box(event);
-	struct hw_perf_event *hwc = &event->hw;
-
-	if (!box)
-		return -ENODEV;
-
-	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-	if (!(flags & PERF_EF_START))
-		hwc->state |= PERF_HES_ARCH;
-
-	snb_uncore_imc_event_start(event, 0);
-
-	box->n_events++;
-
-	return 0;
-}
-
-static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
-{
-	struct intel_uncore_box *box = uncore_event_to_box(event);
-	int i;
-
-	snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
-
-	for (i = 0; i < box->n_events; i++) {
-		if (event == box->event_list[i]) {
-			--box->n_events;
-			break;
-		}
-	}
-}
-
-static int snb_pci2phy_map_init(int devid)
-{
-	struct pci_dev *dev = NULL;
-	int bus;
-
-	dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
-	if (!dev)
-		return -ENOTTY;
-
-	bus = dev->bus->number;
-
-	pcibus_to_physid[bus] = 0;
-
-	pci_dev_put(dev);
-
-	return 0;
-}
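-
-/*
- * These client parts are single-socket, which is presumably why the bus
- * can simply be mapped to physical package id 0 above.
- */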
-
-static struct pmu snb_uncore_imc_pmu = {
-	.task_ctx_nr	= perf_invalid_context,
-	.event_init	= snb_uncore_imc_event_init,
-	.add		= snb_uncore_imc_event_add,
-	.del		= snb_uncore_imc_event_del,
-	.start		= snb_uncore_imc_event_start,
-	.stop		= snb_uncore_imc_event_stop,
-	.read		= uncore_pmu_event_read,
-};
-
-static struct intel_uncore_ops snb_uncore_imc_ops = {
-	.init_box	= snb_uncore_imc_init_box,
-	.enable_box	= snb_uncore_imc_enable_box,
-	.disable_box	= snb_uncore_imc_disable_box,
-	.disable_event	= snb_uncore_imc_disable_event,
-	.enable_event	= snb_uncore_imc_enable_event,
-	.hw_config	= snb_uncore_imc_hw_config,
-	.read_counter	= snb_uncore_imc_read_counter,
-};
-
-static struct intel_uncore_type snb_uncore_imc = {
-	.name		= "imc",
-	.num_counters   = 2,
-	.num_boxes	= 1,
-	.fixed_ctr_bits	= 32,
-	.fixed_ctr	= SNB_UNCORE_PCI_IMC_CTR_BASE,
-	.event_descs	= snb_uncore_imc_events,
-	.format_group	= &snb_uncore_imc_format_group,
-	.perf_ctr	= SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
-	.event_mask	= SNB_UNCORE_PCI_IMC_EVENT_MASK,
-	.ops		= &snb_uncore_imc_ops,
-	.pmu		= &snb_uncore_imc_pmu,
-};
-
-static struct intel_uncore_type *snb_pci_uncores[] = {
-	[SNB_PCI_UNCORE_IMC]	= &snb_uncore_imc,
-	NULL,
-};
-
-static const struct pci_device_id snb_uncore_pci_ids[] = {
-	{ /* IMC */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
-		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-	},
-	{ /* end: all zeroes */ },
-};
-
-static const struct pci_device_id ivb_uncore_pci_ids[] = {
-	{ /* IMC */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
-		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-	},
-	{ /* end: all zeroes */ },
-};
-
-static const struct pci_device_id hsw_uncore_pci_ids[] = {
-	{ /* IMC */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
-		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
-	},
-	{ /* end: all zeroes */ },
-};
-
-static struct pci_driver snb_uncore_pci_driver = {
-	.name		= "snb_uncore",
-	.id_table	= snb_uncore_pci_ids,
-};
-
-static struct pci_driver ivb_uncore_pci_driver = {
-	.name		= "ivb_uncore",
-	.id_table	= ivb_uncore_pci_ids,
-};
-
-static struct pci_driver hsw_uncore_pci_driver = {
-	.name		= "hsw_uncore",
-	.id_table	= hsw_uncore_pci_ids,
-};
-
-/* end of Sandy Bridge uncore support */
-
-/* Nehalem uncore support */
-static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
-	wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
-}
-
-static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
-	wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
-}
-
-static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-
-	if (hwc->idx < UNCORE_PMC_IDX_FIXED)
-		wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
-	else
-		wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
-}
-
-static struct attribute *nhm_uncore_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_inv.attr,
-	&format_attr_cmask8.attr,
-	NULL,
-};
-
-static struct attribute_group nhm_uncore_format_group = {
-	.name = "format",
-	.attrs = nhm_uncore_formats_attr,
-};
-
-static struct uncore_event_desc nhm_uncore_events[] = {
-	INTEL_UNCORE_EVENT_DESC(clockticks,                "event=0xff,umask=0x00"),
-	INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any,       "event=0x2f,umask=0x0f"),
-	INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any,      "event=0x2c,umask=0x0f"),
-	INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads,     "event=0x20,umask=0x01"),
-	INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes,    "event=0x20,umask=0x02"),
-	INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads,  "event=0x20,umask=0x04"),
-	INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
-	INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads,   "event=0x20,umask=0x10"),
-	INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes,  "event=0x20,umask=0x20"),
-	{ /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhm_uncore_msr_ops = {
-	.disable_box	= nhm_uncore_msr_disable_box,
-	.enable_box	= nhm_uncore_msr_enable_box,
-	.disable_event	= snb_uncore_msr_disable_event,
-	.enable_event	= nhm_uncore_msr_enable_event,
-	.read_counter	= uncore_msr_read_counter,
-};
-
-static struct intel_uncore_type nhm_uncore = {
-	.name		= "",
-	.num_counters   = 8,
-	.num_boxes	= 1,
-	.perf_ctr_bits	= 48,
-	.fixed_ctr_bits	= 48,
-	.event_ctl	= NHM_UNC_PERFEVTSEL0,
-	.perf_ctr	= NHM_UNC_UNCORE_PMC0,
-	.fixed_ctr	= NHM_UNC_FIXED_CTR,
-	.fixed_ctl	= NHM_UNC_FIXED_CTR_CTRL,
-	.event_mask	= NHM_UNC_RAW_EVENT_MASK,
-	.event_descs	= nhm_uncore_events,
-	.ops		= &nhm_uncore_msr_ops,
-	.format_group	= &nhm_uncore_format_group,
-};
-
-static struct intel_uncore_type *nhm_msr_uncores[] = {
-	&nhm_uncore,
-	NULL,
-};
-/* end of Nehalem uncore support */
-
-/* Nehalem-EX uncore support */
-DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
-DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
-DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
-
-static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
-{
-	wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
-}
-
-static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
-	unsigned msr = uncore_msr_box_ctl(box);
-	u64 config;
-
-	if (msr) {
-		rdmsrl(msr, config);
-		config &= ~((1ULL << uncore_num_counters(box)) - 1);
-		/* WBox has a fixed counter */
-		if (uncore_msr_fixed_ctl(box))
-			config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
-		wrmsrl(msr, config);
-	}
-}
-
-static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
-	unsigned msr = uncore_msr_box_ctl(box);
-	u64 config;
-
-	if (msr) {
-		rdmsrl(msr, config);
-		config |= (1ULL << uncore_num_counters(box)) - 1;
-		/* WBox has a fixed counter */
-		if (uncore_msr_fixed_ctl(box))
-			config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
-		wrmsrl(msr, config);
-	}
-}
-
-static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	wrmsrl(event->hw.config_base, 0);
-}
-
-static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-
-	if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
-		wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
-	else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
-		wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
-	else
-		wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-#define NHMEX_UNCORE_OPS_COMMON_INIT()				\
-	.init_box	= nhmex_uncore_msr_init_box,		\
-	.disable_box	= nhmex_uncore_msr_disable_box,		\
-	.enable_box	= nhmex_uncore_msr_enable_box,		\
-	.disable_event	= nhmex_uncore_msr_disable_event,	\
-	.read_counter	= uncore_msr_read_counter
-
-static struct intel_uncore_ops nhmex_uncore_ops = {
-	NHMEX_UNCORE_OPS_COMMON_INIT(),
-	.enable_event	= nhmex_uncore_msr_enable_event,
-};
-
-static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_edge.attr,
-	NULL,
-};
-
-static struct attribute_group nhmex_uncore_ubox_format_group = {
-	.name		= "format",
-	.attrs		= nhmex_uncore_ubox_formats_attr,
-};
-
-static struct intel_uncore_type nhmex_uncore_ubox = {
-	.name		= "ubox",
-	.num_counters	= 1,
-	.num_boxes	= 1,
-	.perf_ctr_bits	= 48,
-	.event_ctl	= NHMEX_U_MSR_PMON_EV_SEL,
-	.perf_ctr	= NHMEX_U_MSR_PMON_CTR,
-	.event_mask	= NHMEX_U_PMON_RAW_EVENT_MASK,
-	.box_ctl	= NHMEX_U_MSR_PMON_GLOBAL_CTL,
-	.ops		= &nhmex_uncore_ops,
-	.format_group	= &nhmex_uncore_ubox_format_group
-};
-
-static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_inv.attr,
-	&format_attr_thresh8.attr,
-	NULL,
-};
-
-static struct attribute_group nhmex_uncore_cbox_format_group = {
-	.name = "format",
-	.attrs = nhmex_uncore_cbox_formats_attr,
-};
-
-/* msr offset for each instance of cbox */
-static unsigned nhmex_cbox_msr_offsets[] = {
-	0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
-};
-
-static struct intel_uncore_type nhmex_uncore_cbox = {
-	.name			= "cbox",
-	.num_counters		= 6,
-	.num_boxes		= 10,
-	.perf_ctr_bits		= 48,
-	.event_ctl		= NHMEX_C0_MSR_PMON_EV_SEL0,
-	.perf_ctr		= NHMEX_C0_MSR_PMON_CTR0,
-	.event_mask		= NHMEX_PMON_RAW_EVENT_MASK,
-	.box_ctl		= NHMEX_C0_MSR_PMON_GLOBAL_CTL,
-	.msr_offsets		= nhmex_cbox_msr_offsets,
-	.pair_ctr_ctl		= 1,
-	.ops			= &nhmex_uncore_ops,
-	.format_group		= &nhmex_uncore_cbox_format_group
-};
-
-static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
-	INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
-	{ /* end: all zeroes */ },
-};
-
-static struct intel_uncore_type nhmex_uncore_wbox = {
-	.name			= "wbox",
-	.num_counters		= 4,
-	.num_boxes		= 1,
-	.perf_ctr_bits		= 48,
-	.event_ctl		= NHMEX_W_MSR_PMON_CNT0,
-	.perf_ctr		= NHMEX_W_MSR_PMON_EVT_SEL0,
-	.fixed_ctr		= NHMEX_W_MSR_PMON_FIXED_CTR,
-	.fixed_ctl		= NHMEX_W_MSR_PMON_FIXED_CTL,
-	.event_mask		= NHMEX_PMON_RAW_EVENT_MASK,
-	.box_ctl		= NHMEX_W_MSR_GLOBAL_CTL,
-	.pair_ctr_ctl		= 1,
-	.event_descs		= nhmex_uncore_wbox_events,
-	.ops			= &nhmex_uncore_ops,
-	.format_group		= &nhmex_uncore_cbox_format_group
-};
-
-static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-	int ctr, ev_sel;
-
-	ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
-		NHMEX_B_PMON_CTR_SHIFT;
-	ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
-		  NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
-
-	/* events that do not use the match/mask registers */
-	if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
-	    (ctr == 2 && ev_sel != 0x4) || ctr == 3)
-		return 0;
-
-	if (box->pmu->pmu_idx == 0)
-		reg1->reg = NHMEX_B0_MSR_MATCH;
-	else
-		reg1->reg = NHMEX_B1_MSR_MATCH;
-	reg1->idx = 0;
-	reg1->config = event->attr.config1;
-	reg2->config = event->attr.config2;
-	return 0;
-}
-
-static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-	if (reg1->idx != EXTRA_REG_NONE) {
-		wrmsrl(reg1->reg, reg1->config);
-		wrmsrl(reg1->reg + 1, reg2->config);
-	}
-	wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
-		(hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
-}
-
-/*
- * The Bbox has 4 counters, but each counter monitors different events.
- * Use bits 6-7 in the event config to select counter.
- */
-static struct event_constraint nhmex_uncore_bbox_constraints[] = {
-	EVENT_CONSTRAINT(0x00, 1, 0xc0),
-	EVENT_CONSTRAINT(0x40, 2, 0xc0),
-	EVENT_CONSTRAINT(0x80, 4, 0xc0),
-	EVENT_CONSTRAINT(0xc0, 8, 0xc0),
-	EVENT_CONSTRAINT_END,
-};
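-
-/*
- * Worked example: an event with config bits 6-7 equal to 0x40 matches the
- * second constraint above and may only be scheduled on counter 1
- * (counter mask 0x2).
- */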
-
-static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
-	&format_attr_event5.attr,
-	&format_attr_counter.attr,
-	&format_attr_match.attr,
-	&format_attr_mask.attr,
-	NULL,
-};
-
-static struct attribute_group nhmex_uncore_bbox_format_group = {
-	.name = "format",
-	.attrs = nhmex_uncore_bbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
-	NHMEX_UNCORE_OPS_COMMON_INIT(),
-	.enable_event		= nhmex_bbox_msr_enable_event,
-	.hw_config		= nhmex_bbox_hw_config,
-	.get_constraint		= uncore_get_constraint,
-	.put_constraint		= uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_bbox = {
-	.name			= "bbox",
-	.num_counters		= 4,
-	.num_boxes		= 2,
-	.perf_ctr_bits		= 48,
-	.event_ctl		= NHMEX_B0_MSR_PMON_CTL0,
-	.perf_ctr		= NHMEX_B0_MSR_PMON_CTR0,
-	.event_mask		= NHMEX_B_PMON_RAW_EVENT_MASK,
-	.box_ctl		= NHMEX_B0_MSR_PMON_GLOBAL_CTL,
-	.msr_offset		= NHMEX_B_MSR_OFFSET,
-	.pair_ctr_ctl		= 1,
-	.num_shared_regs	= 1,
-	.constraints		= nhmex_uncore_bbox_constraints,
-	.ops			= &nhmex_uncore_bbox_ops,
-	.format_group		= &nhmex_uncore_bbox_format_group
-};
-
-static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-	/* only TO_R_PROG_EV event uses the match/mask register */
-	if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
-	    NHMEX_S_EVENT_TO_R_PROG_EV)
-		return 0;
-
-	if (box->pmu->pmu_idx == 0)
-		reg1->reg = NHMEX_S0_MSR_MM_CFG;
-	else
-		reg1->reg = NHMEX_S1_MSR_MM_CFG;
-	reg1->idx = 0;
-	reg1->config = event->attr.config1;
-	reg2->config = event->attr.config2;
-	return 0;
-}
-
-static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
-	if (reg1->idx != EXTRA_REG_NONE) {
-		wrmsrl(reg1->reg, 0);
-		wrmsrl(reg1->reg + 1, reg1->config);
-		wrmsrl(reg1->reg + 2, reg2->config);
-		wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
-	}
-	wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
-}
-
-static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_inv.attr,
-	&format_attr_thresh8.attr,
-	&format_attr_match.attr,
-	&format_attr_mask.attr,
-	NULL,
-};
-
-static struct attribute_group nhmex_uncore_sbox_format_group = {
-	.name			= "format",
-	.attrs			= nhmex_uncore_sbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
-	NHMEX_UNCORE_OPS_COMMON_INIT(),
-	.enable_event		= nhmex_sbox_msr_enable_event,
-	.hw_config		= nhmex_sbox_hw_config,
-	.get_constraint		= uncore_get_constraint,
-	.put_constraint		= uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_sbox = {
-	.name			= "sbox",
-	.num_counters		= 4,
-	.num_boxes		= 2,
-	.perf_ctr_bits		= 48,
-	.event_ctl		= NHMEX_S0_MSR_PMON_CTL0,
-	.perf_ctr		= NHMEX_S0_MSR_PMON_CTR0,
-	.event_mask		= NHMEX_PMON_RAW_EVENT_MASK,
-	.box_ctl		= NHMEX_S0_MSR_PMON_GLOBAL_CTL,
-	.msr_offset		= NHMEX_S_MSR_OFFSET,
-	.pair_ctr_ctl		= 1,
-	.num_shared_regs	= 1,
-	.ops			= &nhmex_uncore_sbox_ops,
-	.format_group		= &nhmex_uncore_sbox_format_group
-};
-
-enum {
-	EXTRA_REG_NHMEX_M_FILTER,
-	EXTRA_REG_NHMEX_M_DSP,
-	EXTRA_REG_NHMEX_M_ISS,
-	EXTRA_REG_NHMEX_M_MAP,
-	EXTRA_REG_NHMEX_M_MSC_THR,
-	EXTRA_REG_NHMEX_M_PGT,
-	EXTRA_REG_NHMEX_M_PLD,
-	EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
-};
-
-static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
-	MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
-	MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
-	MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
-	MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
-	/* event 0xa uses two extra registers */
-	MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
-	MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
-	MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
-	/* events 0xd ~ 0x10 use the same extra register */
-	MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
-	MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
-	MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
-	MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
-	MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
-	MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
-	MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
-	MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
-	MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
-	EVENT_EXTRA_END
-};
-
-/* Nehalem-EX or Westmere-EX? */
-static bool uncore_nhmex;
-
-static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
-{
-	struct intel_uncore_extra_reg *er;
-	unsigned long flags;
-	bool ret = false;
-	u64 mask;
-
-	if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
-		er = &box->shared_regs[idx];
-		raw_spin_lock_irqsave(&er->lock, flags);
-		if (!atomic_read(&er->ref) || er->config == config) {
-			atomic_inc(&er->ref);
-			er->config = config;
-			ret = true;
-		}
-		raw_spin_unlock_irqrestore(&er->lock, flags);
-
-		return ret;
-	}
-	/*
-	 * The ZDP_CTL_FVC MSR has 4 fields which are used to control
-	 * events 0xd ~ 0x10. Besides these 4 fields, there are additional
-	 * fields which are shared.
-	 */
-	idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-	if (WARN_ON_ONCE(idx >= 4))
-		return false;
-
-	/* mask of the shared fields */
-	if (uncore_nhmex)
-		mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
-	else
-		mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
-	er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-
-	raw_spin_lock_irqsave(&er->lock, flags);
-	/* add mask of the non-shared field if it's in use */
-	if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
-		if (uncore_nhmex)
-			mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-		else
-			mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-	}
-
-	if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
-		atomic_add(1 << (idx * 8), &er->ref);
-		if (uncore_nhmex)
-			mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
-				NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-		else
-			mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
-				WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-		er->config &= ~mask;
-		er->config |= (config & mask);
-		ret = true;
-	}
-	raw_spin_unlock_irqrestore(&er->lock, flags);
-
-	return ret;
-}
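-
-/*
- * Note on the refcounting above: er->ref packs four 8-bit reference
- * counts, one per ZDP_CTL_FVC event field, so atomic_add(1 << (idx * 8),
- * &er->ref) bumps only the count for field idx and __BITS_VALUE() reads
- * a single field back out.
- */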
-
-static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
-{
-	struct intel_uncore_extra_reg *er;
-
-	if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
-		er = &box->shared_regs[idx];
-		atomic_dec(&er->ref);
-		return;
-	}
-
-	idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-	er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-	atomic_sub(1 << (idx * 8), &er->ref);
-}
-
-static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-	u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
-	u64 config = reg1->config;
-
-	/* get the non-shared control bits and shift them */
-	idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-	if (uncore_nhmex)
-		config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-	else
-		config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
-	if (new_idx > orig_idx) {
-		idx = new_idx - orig_idx;
-		config <<= 3 * idx;
-	} else {
-		idx = orig_idx - new_idx;
-		config >>= 3 * idx;
-	}
-
-	/* add the shared control bits back */
-	if (uncore_nhmex)
-		config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
-	else
-		config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
-	if (modify) {
-		/* adjust the main event selector */
-		if (new_idx > orig_idx)
-			hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
-		else
-			hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
-		reg1->config = config;
-		reg1->idx = ~0xff | new_idx;
-	}
-	return config;
-}
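-
-/*
- * The 3 * idx shift reflects that each ZDP_CTL_FVC event field appears to
- * be 3 bits wide: moving an event from field 1 to field 3, for instance,
- * shifts its selector left by 6 bits and bumps inc_sel by 2.
- */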
-
-static struct event_constraint *
-nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-	int i, idx[2], alloc = 0;
-	u64 config1 = reg1->config;
-
-	idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
-	idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
-again:
-	for (i = 0; i < 2; i++) {
-		if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
-			idx[i] = 0xff;
-
-		if (idx[i] == 0xff)
-			continue;
-
-		if (!nhmex_mbox_get_shared_reg(box, idx[i],
-				__BITS_VALUE(config1, i, 32)))
-			goto fail;
-		alloc |= (0x1 << i);
-	}
-
-	/* for the match/mask registers */
-	if (reg2->idx != EXTRA_REG_NONE &&
-	    (uncore_box_is_fake(box) || !reg2->alloc) &&
-	    !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
-		goto fail;
-
-	/*
-	 * If it's a fake box -- as per validate_{group,event}() -- we
-	 * shouldn't touch the event state. Both callers invoke
-	 * get_event_constraints() only once per event, so skipping the
-	 * update here avoids the need for reg->alloc.
-	 */
-	if (!uncore_box_is_fake(box)) {
-		if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
-			nhmex_mbox_alter_er(event, idx[0], true);
-		reg1->alloc |= alloc;
-		if (reg2->idx != EXTRA_REG_NONE)
-			reg2->alloc = 1;
-	}
-	return NULL;
-fail:
-	if (idx[0] != 0xff && !(alloc & 0x1) &&
-	    idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
-		/*
-		 * events 0xd ~ 0x10 are functionally identical, but are
-		 * controlled by different fields in the ZDP_CTL_FVC
-		 * register. If we failed to take one field, try the
-		 * remaining 3 choices.
-		 */
-		BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
-		idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-		idx[0] = (idx[0] + 1) % 4;
-		idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
-		if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
-			config1 = nhmex_mbox_alter_er(event, idx[0], false);
-			goto again;
-		}
-	}
-
-	if (alloc & 0x1)
-		nhmex_mbox_put_shared_reg(box, idx[0]);
-	if (alloc & 0x2)
-		nhmex_mbox_put_shared_reg(box, idx[1]);
-	return &constraint_empty;
-}
-
-static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-
-	if (uncore_box_is_fake(box))
-		return;
-
-	if (reg1->alloc & 0x1)
-		nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
-	if (reg1->alloc & 0x2)
-		nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
-	reg1->alloc = 0;
-
-	if (reg2->alloc) {
-		nhmex_mbox_put_shared_reg(box, reg2->idx);
-		reg2->alloc = 0;
-	}
-}
-
-static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
-{
-	if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
-		return er->idx;
-	return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
-}
-
-static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct intel_uncore_type *type = box->pmu->type;
-	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-	struct extra_reg *er;
-	unsigned msr;
-	int reg_idx = 0;
-	/*
-	 * The mbox events may require at most 2 extra MSRs. Only the
-	 * lower 32 bits in these MSRs are significant, so we can use
-	 * config1 to pass both MSRs' configs.
-	 */
-	for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
-		if (er->event != (event->hw.config & er->config_mask))
-			continue;
-		if (event->attr.config1 & ~er->valid_mask)
-			return -EINVAL;
-
-		msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
-		if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
-			return -EINVAL;
-
-		/* always use bits 32~63 to pass the PLD config */
-		if (er->idx == EXTRA_REG_NHMEX_M_PLD)
-			reg_idx = 1;
-		else if (WARN_ON_ONCE(reg_idx > 0))
-			return -EINVAL;
-
-		reg1->idx &= ~(0xff << (reg_idx * 8));
-		reg1->reg &= ~(0xffff << (reg_idx * 16));
-		reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
-		reg1->reg |= msr << (reg_idx * 16);
-		reg1->config = event->attr.config1;
-		reg_idx++;
-	}
-	/*
-	 * The mbox only provides the ability to perform address
-	 * matching for the PLD events.
-	 */
-	if (reg_idx == 2) {
-		reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
-		if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
-			reg2->config = event->attr.config2;
-		else
-			reg2->config = ~0ULL;
-		if (box->pmu->pmu_idx == 0)
-			reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
-		else
-			reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
-	}
-	return 0;
-}
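-
-/*
- * Packing sketch: reg1->idx carries one 8-bit extra-register index per
- * slot and reg1->reg one 16-bit MSR address per slot, so slot 1 (the PLD
- * config) lands in bits 8-15 of idx and bits 16-31 of reg.
- */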
-
-static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
-{
-	struct intel_uncore_extra_reg *er;
-	unsigned long flags;
-	u64 config;
-
-	if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
-		return box->shared_regs[idx].config;
-
-	er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-	raw_spin_lock_irqsave(&er->lock, flags);
-	config = er->config;
-	raw_spin_unlock_irqrestore(&er->lock, flags);
-	return config;
-}
-
-static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-	int idx;
-
-	idx = __BITS_VALUE(reg1->idx, 0, 8);
-	if (idx != 0xff)
-		wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
-			nhmex_mbox_shared_reg_config(box, idx));
-	idx = __BITS_VALUE(reg1->idx, 1, 8);
-	if (idx != 0xff)
-		wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
-			nhmex_mbox_shared_reg_config(box, idx));
-
-	if (reg2->idx != EXTRA_REG_NONE) {
-		wrmsrl(reg2->reg, 0);
-		if (reg2->config != ~0ULL) {
-			wrmsrl(reg2->reg + 1,
-				reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
-			wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
-				(reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
-			wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
-		}
-	}
-
-	wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-DEFINE_UNCORE_FORMAT_ATTR(count_mode,		count_mode,	"config:2-3");
-DEFINE_UNCORE_FORMAT_ATTR(storage_mode,		storage_mode,	"config:4-5");
-DEFINE_UNCORE_FORMAT_ATTR(wrap_mode,		wrap_mode,	"config:6");
-DEFINE_UNCORE_FORMAT_ATTR(flag_mode,		flag_mode,	"config:7");
-DEFINE_UNCORE_FORMAT_ATTR(inc_sel,		inc_sel,	"config:9-13");
-DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel,		set_flag_sel,	"config:19-21");
-DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en,	filter_cfg_en,	"config2:63");
-DEFINE_UNCORE_FORMAT_ATTR(filter_match,		filter_match,	"config2:0-33");
-DEFINE_UNCORE_FORMAT_ATTR(filter_mask,		filter_mask,	"config2:34-61");
-DEFINE_UNCORE_FORMAT_ATTR(dsp,			dsp,		"config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(thr,			thr,		"config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(fvc,			fvc,		"config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pgt,			pgt,		"config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(map,			map,		"config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(iss,			iss,		"config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pld,			pld,		"config1:32-63");
-
-static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
-	&format_attr_count_mode.attr,
-	&format_attr_storage_mode.attr,
-	&format_attr_wrap_mode.attr,
-	&format_attr_flag_mode.attr,
-	&format_attr_inc_sel.attr,
-	&format_attr_set_flag_sel.attr,
-	&format_attr_filter_cfg_en.attr,
-	&format_attr_filter_match.attr,
-	&format_attr_filter_mask.attr,
-	&format_attr_dsp.attr,
-	&format_attr_thr.attr,
-	&format_attr_fvc.attr,
-	&format_attr_pgt.attr,
-	&format_attr_map.attr,
-	&format_attr_iss.attr,
-	&format_attr_pld.attr,
-	NULL,
-};
-
-static struct attribute_group nhmex_uncore_mbox_format_group = {
-	.name		= "format",
-	.attrs		= nhmex_uncore_mbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
-	INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
-	INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
-	{ /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
-	INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
-	INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
-	{ /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
-	NHMEX_UNCORE_OPS_COMMON_INIT(),
-	.enable_event	= nhmex_mbox_msr_enable_event,
-	.hw_config	= nhmex_mbox_hw_config,
-	.get_constraint	= nhmex_mbox_get_constraint,
-	.put_constraint	= nhmex_mbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_mbox = {
-	.name			= "mbox",
-	.num_counters		= 6,
-	.num_boxes		= 2,
-	.perf_ctr_bits		= 48,
-	.event_ctl		= NHMEX_M0_MSR_PMU_CTL0,
-	.perf_ctr		= NHMEX_M0_MSR_PMU_CNT0,
-	.event_mask		= NHMEX_M_PMON_RAW_EVENT_MASK,
-	.box_ctl		= NHMEX_M0_MSR_GLOBAL_CTL,
-	.msr_offset		= NHMEX_M_MSR_OFFSET,
-	.pair_ctr_ctl		= 1,
-	.num_shared_regs	= 8,
-	.event_descs		= nhmex_uncore_mbox_events,
-	.ops			= &nhmex_uncore_mbox_ops,
-	.format_group		= &nhmex_uncore_mbox_format_group,
-};
-
-static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
-	/* adjust the main event selector and extra register index */
-	if (reg1->idx % 2) {
-		reg1->idx--;
-		hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
-	} else {
-		reg1->idx++;
-		hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
-	}
-
-	/* adjust extra register config */
-	switch (reg1->idx % 6) {
-	case 2:
-		/* shift the 8~15 bits to the 0~7 bits */
-		reg1->config >>= 8;
-		break;
-	case 3:
-		/* shift the 0~7 bits to the 8~15 bits */
-		reg1->config <<= 8;
-		break;
-	}
-}
-
-/*
- * Each rbox has 4 event sets, which monitor PQI ports 0~3 or 4~7.
- * An event set consists of 6 events; the 3rd and 4th events in
- * a set use the same extra register, so an event set uses
- * 5 extra registers.
- */
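-
-/*
- * Illustrative mapping, derived from the code below: within a set, the
- * idx % 6 values 0,1,2,3,4,5 map to shared-register offsets 0,1,2,2,3,4,
- * and each further set adds 5.
- */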
-static struct event_constraint *
-nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-	struct intel_uncore_extra_reg *er;
-	unsigned long flags;
-	int idx, er_idx;
-	u64 config1;
-	bool ok = false;
-
-	if (!uncore_box_is_fake(box) && reg1->alloc)
-		return NULL;
-
-	idx = reg1->idx % 6;
-	config1 = reg1->config;
-again:
-	er_idx = idx;
-	/* the 3rd and 4th events use the same extra register */
-	if (er_idx > 2)
-		er_idx--;
-	er_idx += (reg1->idx / 6) * 5;
-
-	er = &box->shared_regs[er_idx];
-	raw_spin_lock_irqsave(&er->lock, flags);
-	if (idx < 2) {
-		if (!atomic_read(&er->ref) || er->config == reg1->config) {
-			atomic_inc(&er->ref);
-			er->config = reg1->config;
-			ok = true;
-		}
-	} else if (idx == 2 || idx == 3) {
-		/*
-		 * these two events use different fields in an extra register:
-		 * bits 0~7 and bits 8~15 respectively.
-		 */
-		u64 mask = 0xff << ((idx - 2) * 8);
-		if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
-				!((er->config ^ config1) & mask)) {
-			atomic_add(1 << ((idx - 2) * 8), &er->ref);
-			er->config &= ~mask;
-			er->config |= config1 & mask;
-			ok = true;
-		}
-	} else {
-		if (!atomic_read(&er->ref) ||
-				(er->config == (hwc->config >> 32) &&
-				 er->config1 == reg1->config &&
-				 er->config2 == reg2->config)) {
-			atomic_inc(&er->ref);
-			er->config = (hwc->config >> 32);
-			er->config1 = reg1->config;
-			er->config2 = reg2->config;
-			ok = true;
-		}
-	}
-	raw_spin_unlock_irqrestore(&er->lock, flags);
-
-	if (!ok) {
-		/*
-		 * The Rbox events are always in pairs. The paired
-		 * events are functionally identical, but use different
-		 * extra registers. If we failed to take an extra
-		 * register, try the alternative.
-		 */
-		idx ^= 1;
-		if (idx != reg1->idx % 6) {
-			if (idx == 2)
-				config1 >>= 8;
-			else if (idx == 3)
-				config1 <<= 8;
-			goto again;
-		}
-	} else {
-		if (!uncore_box_is_fake(box)) {
-			if (idx != reg1->idx % 6)
-				nhmex_rbox_alter_er(box, event);
-			reg1->alloc = 1;
-		}
-		return NULL;
-	}
-	return &constraint_empty;
-}
-
-static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct intel_uncore_extra_reg *er;
-	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-	int idx, er_idx;
-
-	if (uncore_box_is_fake(box) || !reg1->alloc)
-		return;
-
-	idx = reg1->idx % 6;
-	er_idx = idx;
-	if (er_idx > 2)
-		er_idx--;
-	er_idx += (reg1->idx / 6) * 5;
-
-	er = &box->shared_regs[er_idx];
-	if (idx == 2 || idx == 3)
-		atomic_sub(1 << ((idx - 2) * 8), &er->ref);
-	else
-		atomic_dec(&er->ref);
-
-	reg1->alloc = 0;
-}
-
-static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-	int idx;
-
-	idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
-		NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
-	if (idx >= 0x18)
-		return -EINVAL;
-
-	reg1->idx = idx;
-	reg1->config = event->attr.config1;
-
-	switch (idx % 6) {
-	case 4:
-	case 5:
-		hwc->config |= event->attr.config & (~0ULL << 32);
-		reg2->config = event->attr.config2;
-		break;
-	}
-	return 0;
-}
-
-static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-	int idx, port;
-
-	idx = reg1->idx;
-	port = idx / 6 + box->pmu->pmu_idx * 4;
-
-	switch (idx % 6) {
-	case 0:
-		wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
-		break;
-	case 1:
-		wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
-		break;
-	case 2:
-	case 3:
-		wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
-			uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
-		break;
-	case 4:
-		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
-			hwc->config >> 32);
-		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
-		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
-		break;
-	case 5:
-		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
-			hwc->config >> 32);
-		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
-		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
-		break;
-	}
-
-	wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
-		(hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
-}
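-
-/*
- * Port computation example: with two rboxes of four ports each, an event
- * at idx 7 on rbox 1 (pmu_idx == 1) lands on port 7 / 6 + 1 * 4 == 5.
- */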
-
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
-DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
-
-static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
-	&format_attr_event5.attr,
-	&format_attr_xbr_mm_cfg.attr,
-	&format_attr_xbr_match.attr,
-	&format_attr_xbr_mask.attr,
-	&format_attr_qlx_cfg.attr,
-	&format_attr_iperf_cfg.attr,
-	NULL,
-};
-
-static struct attribute_group nhmex_uncore_rbox_format_group = {
-	.name = "format",
-	.attrs = nhmex_uncore_rbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
-	INTEL_UNCORE_EVENT_DESC(qpi0_flit_send,		"event=0x0,iperf_cfg=0x80000000"),
-	INTEL_UNCORE_EVENT_DESC(qpi1_filt_send,		"event=0x6,iperf_cfg=0x80000000"),
-	INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt,		"event=0x0,iperf_cfg=0x40000000"),
-	INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt,		"event=0x6,iperf_cfg=0x40000000"),
-	INTEL_UNCORE_EVENT_DESC(qpi0_date_response,	"event=0x0,iperf_cfg=0xc4"),
-	INTEL_UNCORE_EVENT_DESC(qpi1_date_response,	"event=0x6,iperf_cfg=0xc4"),
-	{ /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
-	NHMEX_UNCORE_OPS_COMMON_INIT(),
-	.enable_event		= nhmex_rbox_msr_enable_event,
-	.hw_config		= nhmex_rbox_hw_config,
-	.get_constraint		= nhmex_rbox_get_constraint,
-	.put_constraint		= nhmex_rbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_rbox = {
-	.name			= "rbox",
-	.num_counters		= 8,
-	.num_boxes		= 2,
-	.perf_ctr_bits		= 48,
-	.event_ctl		= NHMEX_R_MSR_PMON_CTL0,
-	.perf_ctr		= NHMEX_R_MSR_PMON_CNT0,
-	.event_mask		= NHMEX_R_PMON_RAW_EVENT_MASK,
-	.box_ctl		= NHMEX_R_MSR_GLOBAL_CTL,
-	.msr_offset		= NHMEX_R_MSR_OFFSET,
-	.pair_ctr_ctl		= 1,
-	.num_shared_regs	= 20,
-	.event_descs		= nhmex_uncore_rbox_events,
-	.ops			= &nhmex_uncore_rbox_ops,
-	.format_group		= &nhmex_uncore_rbox_format_group
-};
-
-static struct intel_uncore_type *nhmex_msr_uncores[] = {
-	&nhmex_uncore_ubox,
-	&nhmex_uncore_cbox,
-	&nhmex_uncore_bbox,
-	&nhmex_uncore_sbox,
-	&nhmex_uncore_mbox,
-	&nhmex_uncore_rbox,
-	&nhmex_uncore_wbox,
-	NULL,
-};
-/* end of Nehalem-EX uncore support */
-
 static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx)
 {
 	struct hw_perf_event *hwc = &event->hw;
@@ -3140,7 +166,7 @@
 	hwc->event_base  = uncore_perf_ctr(box, hwc->idx);
 }
 
-static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
 {
 	u64 prev_count, new_count, delta;
 	int shift;
@@ -3201,14 +227,14 @@
 	return HRTIMER_RESTART;
 }
 
-static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
 {
 	__hrtimer_start_range_ns(&box->hrtimer,
 			ns_to_ktime(box->hrtimer_duration), 0,
 			HRTIMER_MODE_REL_PINNED, 0);
 }
 
-static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
 {
 	hrtimer_cancel(&box->hrtimer);
 }
@@ -3291,7 +317,7 @@
 	}
 
 	if (event->attr.config == UNCORE_FIXED_EVENT)
-		return &constraint_fixed;
+		return &uncore_constraint_fixed;
 
 	if (type->constraints) {
 		for_each_event_constraint(c, type->constraints) {
@@ -3496,7 +522,7 @@
 	event->hw.last_tag = ~0ULL;
 }
 
-static void uncore_pmu_event_read(struct perf_event *event)
+void uncore_pmu_event_read(struct perf_event *event)
 {
 	struct intel_uncore_box *box = uncore_event_to_box(event);
 	uncore_perf_event_update(box, event);
@@ -3635,7 +661,7 @@
 	.attrs = uncore_pmu_attrs,
 };
 
-static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
+static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
 {
 	int ret;
 
@@ -3758,9 +784,6 @@
 	return ret;
 }
 
-static struct pci_driver *uncore_pci_driver;
-static bool pcidrv_registered;
-
 /*
  * add a pci uncore device
  */
@@ -3771,17 +794,18 @@
 	struct intel_uncore_type *type;
 	int phys_id;
 
-	phys_id = pcibus_to_physid[pdev->bus->number];
+	phys_id = uncore_pcibus_to_physid[pdev->bus->number];
 	if (phys_id < 0)
 		return -ENODEV;
 
 	if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
-		extra_pci_dev[phys_id][UNCORE_PCI_DEV_IDX(id->driver_data)] = pdev;
+		int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
+		uncore_extra_pci_dev[phys_id][idx] = pdev;
 		pci_set_drvdata(pdev, NULL);
 		return 0;
 	}
 
-	type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
+	type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
 	box = uncore_alloc_box(type, NUMA_NO_NODE);
 	if (!box)
 		return -ENOMEM;
@@ -3813,13 +837,13 @@
 {
 	struct intel_uncore_box *box = pci_get_drvdata(pdev);
 	struct intel_uncore_pmu *pmu;
-	int i, cpu, phys_id = pcibus_to_physid[pdev->bus->number];
+	int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
 
 	box = pci_get_drvdata(pdev);
 	if (!box) {
 		for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
-			if (extra_pci_dev[phys_id][i] == pdev) {
-				extra_pci_dev[phys_id][i] = NULL;
+			if (uncore_extra_pci_dev[phys_id][i] == pdev) {
+				uncore_extra_pci_dev[phys_id][i] = NULL;
 				break;
 			}
 		}
@@ -3854,46 +878,29 @@
 
 	switch (boot_cpu_data.x86_model) {
 	case 45: /* Sandy Bridge-EP */
-		ret = snbep_pci2phy_map_init(0x3ce0);
-		if (ret)
-			return ret;
-		pci_uncores = snbep_pci_uncores;
-		uncore_pci_driver = &snbep_uncore_pci_driver;
+		ret = snbep_uncore_pci_init();
 		break;
-	case 62: /* IvyTown */
-		ret = snbep_pci2phy_map_init(0x0e1e);
-		if (ret)
-			return ret;
-		pci_uncores = ivt_pci_uncores;
-		uncore_pci_driver = &ivt_uncore_pci_driver;
+	case 62: /* Ivy Bridge-EP */
+		ret = ivbep_uncore_pci_init();
 		break;
 	case 42: /* Sandy Bridge */
-		ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC);
-		if (ret)
-			return ret;
-		pci_uncores = snb_pci_uncores;
-		uncore_pci_driver = &snb_uncore_pci_driver;
+		ret = snb_uncore_pci_init();
 		break;
 	case 58: /* Ivy Bridge */
-		ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC);
-		if (ret)
-			return ret;
-		pci_uncores = snb_pci_uncores;
-		uncore_pci_driver = &ivb_uncore_pci_driver;
+		ret = ivb_uncore_pci_init();
 		break;
 	case 60: /* Haswell */
 	case 69: /* Haswell Celeron */
-		ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC);
-		if (ret)
-			return ret;
-		pci_uncores = snb_pci_uncores;
-		uncore_pci_driver = &hsw_uncore_pci_driver;
+		ret = hsw_uncore_pci_init();
 		break;
 	default:
 		return 0;
 	}
 
-	ret = uncore_types_init(pci_uncores);
+	if (ret)
+		return ret;
+
+	ret = uncore_types_init(uncore_pci_uncores);
 	if (ret)
 		return ret;
 
@@ -3904,7 +911,7 @@
 	if (ret == 0)
 		pcidrv_registered = true;
 	else
-		uncore_types_exit(pci_uncores);
+		uncore_types_exit(uncore_pci_uncores);
 
 	return ret;
 }
@@ -3914,7 +921,7 @@
 	if (pcidrv_registered) {
 		pcidrv_registered = false;
 		pci_unregister_driver(uncore_pci_driver);
-		uncore_types_exit(pci_uncores);
+		uncore_types_exit(uncore_pci_uncores);
 	}
 }
 
@@ -3940,8 +947,8 @@
 	struct intel_uncore_box *box;
 	int i, j;
 
-	for (i = 0; msr_uncores[i]; i++) {
-		type = msr_uncores[i];
+	for (i = 0; uncore_msr_uncores[i]; i++) {
+		type = uncore_msr_uncores[i];
 		for (j = 0; j < type->num_boxes; j++) {
 			pmu = &type->pmus[j];
 			box = *per_cpu_ptr(pmu->box, cpu);
@@ -3961,8 +968,8 @@
 
 	phys_id = topology_physical_package_id(cpu);
 
-	for (i = 0; msr_uncores[i]; i++) {
-		type = msr_uncores[i];
+	for (i = 0; uncore_msr_uncores[i]; i++) {
+		type = uncore_msr_uncores[i];
 		for (j = 0; j < type->num_boxes; j++) {
 			pmu = &type->pmus[j];
 			box = *per_cpu_ptr(pmu->box, cpu);
@@ -4002,8 +1009,8 @@
 	struct intel_uncore_box *box;
 	int i, j;
 
-	for (i = 0; msr_uncores[i]; i++) {
-		type = msr_uncores[i];
+	for (i = 0; uncore_msr_uncores[i]; i++) {
+		type = uncore_msr_uncores[i];
 		for (j = 0; j < type->num_boxes; j++) {
 			pmu = &type->pmus[j];
 			if (pmu->func_id < 0)
@@ -4083,8 +1090,8 @@
 	if (target >= 0)
 		cpumask_set_cpu(target, &uncore_cpu_mask);
 
-	uncore_change_context(msr_uncores, cpu, target);
-	uncore_change_context(pci_uncores, cpu, target);
+	uncore_change_context(uncore_msr_uncores, cpu, target);
+	uncore_change_context(uncore_pci_uncores, cpu, target);
 }
 
 static void uncore_event_init_cpu(int cpu)
@@ -4099,8 +1106,8 @@
 
 	cpumask_set_cpu(cpu, &uncore_cpu_mask);
 
-	uncore_change_context(msr_uncores, -1, cpu);
-	uncore_change_context(pci_uncores, -1, cpu);
+	uncore_change_context(uncore_msr_uncores, -1, cpu);
+	uncore_change_context(uncore_pci_uncores, -1, cpu);
 }
 
 static int uncore_cpu_notifier(struct notifier_block *self,
@@ -4160,47 +1167,35 @@
 
 static int __init uncore_cpu_init(void)
 {
-	int ret, max_cores;
+	int ret;
 
-	max_cores = boot_cpu_data.x86_max_cores;
 	switch (boot_cpu_data.x86_model) {
 	case 26: /* Nehalem */
 	case 30:
 	case 37: /* Westmere */
 	case 44:
-		msr_uncores = nhm_msr_uncores;
+		nhm_uncore_cpu_init();
 		break;
 	case 42: /* Sandy Bridge */
 	case 58: /* Ivy Bridge */
-		if (snb_uncore_cbox.num_boxes > max_cores)
-			snb_uncore_cbox.num_boxes = max_cores;
-		msr_uncores = snb_msr_uncores;
+		snb_uncore_cpu_init();
 		break;
 	case 45: /* Sandy Bridge-EP */
-		if (snbep_uncore_cbox.num_boxes > max_cores)
-			snbep_uncore_cbox.num_boxes = max_cores;
-		msr_uncores = snbep_msr_uncores;
+		snbep_uncore_cpu_init();
 		break;
 	case 46: /* Nehalem-EX */
-		uncore_nhmex = true;
 	case 47: /* Westmere-EX aka. Xeon E7 */
-		if (!uncore_nhmex)
-			nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
-		if (nhmex_uncore_cbox.num_boxes > max_cores)
-			nhmex_uncore_cbox.num_boxes = max_cores;
-		msr_uncores = nhmex_msr_uncores;
+		nhmex_uncore_cpu_init();
 		break;
-	case 62: /* IvyTown */
-		if (ivt_uncore_cbox.num_boxes > max_cores)
-			ivt_uncore_cbox.num_boxes = max_cores;
-		msr_uncores = ivt_msr_uncores;
+	case 62: /* Ivy Bridge-EP */
+		ivbep_uncore_cpu_init();
 		break;
 
 	default:
 		return 0;
 	}
 
-	ret = uncore_types_init(msr_uncores);
+	ret = uncore_types_init(uncore_msr_uncores);
 	if (ret)
 		return ret;
 
@@ -4213,16 +1208,16 @@
 	struct intel_uncore_type *type;
 	int i, j;
 
-	for (i = 0; msr_uncores[i]; i++) {
-		type = msr_uncores[i];
+	for (i = 0; uncore_msr_uncores[i]; i++) {
+		type = uncore_msr_uncores[i];
 		for (j = 0; j < type->num_boxes; j++) {
 			pmu = &type->pmus[j];
 			uncore_pmu_register(pmu);
 		}
 	}
 
-	for (i = 0; pci_uncores[i]; i++) {
-		type = pci_uncores[i];
+	for (i = 0; uncore_pci_uncores[i]; i++) {
+		type = uncore_pci_uncores[i];
 		for (j = 0; j < type->num_boxes; j++) {
 			pmu = &type->pmus[j];
 			uncore_pmu_register(pmu);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 90236f0..1d7e894 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -24,395 +24,6 @@
 
 #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
 
-/* SNB event control */
-#define SNB_UNC_CTL_EV_SEL_MASK			0x000000ff
-#define SNB_UNC_CTL_UMASK_MASK			0x0000ff00
-#define SNB_UNC_CTL_EDGE_DET			(1 << 18)
-#define SNB_UNC_CTL_EN				(1 << 22)
-#define SNB_UNC_CTL_INVERT			(1 << 23)
-#define SNB_UNC_CTL_CMASK_MASK			0x1f000000
-#define NHM_UNC_CTL_CMASK_MASK			0xff000000
-#define NHM_UNC_FIXED_CTR_CTL_EN		(1 << 0)
-
-#define SNB_UNC_RAW_EVENT_MASK			(SNB_UNC_CTL_EV_SEL_MASK | \
-						 SNB_UNC_CTL_UMASK_MASK | \
-						 SNB_UNC_CTL_EDGE_DET | \
-						 SNB_UNC_CTL_INVERT | \
-						 SNB_UNC_CTL_CMASK_MASK)
-
-#define NHM_UNC_RAW_EVENT_MASK			(SNB_UNC_CTL_EV_SEL_MASK | \
-						 SNB_UNC_CTL_UMASK_MASK | \
-						 SNB_UNC_CTL_EDGE_DET | \
-						 SNB_UNC_CTL_INVERT | \
-						 NHM_UNC_CTL_CMASK_MASK)
-
-/* SNB global control register */
-#define SNB_UNC_PERF_GLOBAL_CTL                 0x391
-#define SNB_UNC_FIXED_CTR_CTRL                  0x394
-#define SNB_UNC_FIXED_CTR                       0x395
-
-/* SNB uncore global control */
-#define SNB_UNC_GLOBAL_CTL_CORE_ALL             ((1 << 4) - 1)
-#define SNB_UNC_GLOBAL_CTL_EN                   (1 << 29)
-
-/* SNB Cbo register */
-#define SNB_UNC_CBO_0_PERFEVTSEL0               0x700
-#define SNB_UNC_CBO_0_PER_CTR0                  0x706
-#define SNB_UNC_CBO_MSR_OFFSET                  0x10
-
-/* NHM global control register */
-#define NHM_UNC_PERF_GLOBAL_CTL                 0x391
-#define NHM_UNC_FIXED_CTR                       0x394
-#define NHM_UNC_FIXED_CTR_CTRL                  0x395
-
-/* NHM uncore global control */
-#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL            ((1ULL << 8) - 1)
-#define NHM_UNC_GLOBAL_CTL_EN_FC                (1ULL << 32)
-
-/* NHM uncore register */
-#define NHM_UNC_PERFEVTSEL0                     0x3c0
-#define NHM_UNC_UNCORE_PMC0                     0x3b0
-
-/* SNB-EP Box level control */
-#define SNBEP_PMON_BOX_CTL_RST_CTRL	(1 << 0)
-#define SNBEP_PMON_BOX_CTL_RST_CTRS	(1 << 1)
-#define SNBEP_PMON_BOX_CTL_FRZ		(1 << 8)
-#define SNBEP_PMON_BOX_CTL_FRZ_EN	(1 << 16)
-#define SNBEP_PMON_BOX_CTL_INT		(SNBEP_PMON_BOX_CTL_RST_CTRL | \
-					 SNBEP_PMON_BOX_CTL_RST_CTRS | \
-					 SNBEP_PMON_BOX_CTL_FRZ_EN)
-/* SNB-EP event control */
-#define SNBEP_PMON_CTL_EV_SEL_MASK	0x000000ff
-#define SNBEP_PMON_CTL_UMASK_MASK	0x0000ff00
-#define SNBEP_PMON_CTL_RST		(1 << 17)
-#define SNBEP_PMON_CTL_EDGE_DET		(1 << 18)
-#define SNBEP_PMON_CTL_EV_SEL_EXT	(1 << 21)
-#define SNBEP_PMON_CTL_EN		(1 << 22)
-#define SNBEP_PMON_CTL_INVERT		(1 << 23)
-#define SNBEP_PMON_CTL_TRESH_MASK	0xff000000
-#define SNBEP_PMON_RAW_EVENT_MASK	(SNBEP_PMON_CTL_EV_SEL_MASK | \
-					 SNBEP_PMON_CTL_UMASK_MASK | \
-					 SNBEP_PMON_CTL_EDGE_DET | \
-					 SNBEP_PMON_CTL_INVERT | \
-					 SNBEP_PMON_CTL_TRESH_MASK)
-
-/* SNB-EP Ubox event control */
-#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK		0x1f000000
-#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK		\
-				(SNBEP_PMON_CTL_EV_SEL_MASK | \
-				 SNBEP_PMON_CTL_UMASK_MASK | \
-				 SNBEP_PMON_CTL_EDGE_DET | \
-				 SNBEP_PMON_CTL_INVERT | \
-				 SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-
-#define SNBEP_CBO_PMON_CTL_TID_EN		(1 << 19)
-#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK	(SNBEP_PMON_RAW_EVENT_MASK | \
-						 SNBEP_CBO_PMON_CTL_TID_EN)
-
-/* SNB-EP PCU event control */
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK	0x0000c000
-#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK	0x1f000000
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT	(1 << 30)
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET	(1 << 31)
-#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK	\
-				(SNBEP_PMON_CTL_EV_SEL_MASK | \
-				 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
-				 SNBEP_PMON_CTL_EDGE_DET | \
-				 SNBEP_PMON_CTL_EV_SEL_EXT | \
-				 SNBEP_PMON_CTL_INVERT | \
-				 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
-				 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
-				 SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-
-#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK	\
-				(SNBEP_PMON_RAW_EVENT_MASK | \
-				 SNBEP_PMON_CTL_EV_SEL_EXT)
-
-/* SNB-EP pci control register */
-#define SNBEP_PCI_PMON_BOX_CTL			0xf4
-#define SNBEP_PCI_PMON_CTL0			0xd8
-/* SNB-EP pci counter register */
-#define SNBEP_PCI_PMON_CTR0			0xa0
-
-/* SNB-EP home agent register */
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0	0x40
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1	0x44
-#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH	0x48
-/* SNB-EP memory controller register */
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL		0xf0
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR		0xd0
-/* SNB-EP QPI register */
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0		0x228
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1		0x22c
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0		0x238
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1		0x23c
-
-/* SNB-EP Ubox register */
-#define SNBEP_U_MSR_PMON_CTR0			0xc16
-#define SNBEP_U_MSR_PMON_CTL0			0xc10
-
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL		0xc08
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR		0xc09
-
-/* SNB-EP Cbo register */
-#define SNBEP_C0_MSR_PMON_CTR0			0xd16
-#define SNBEP_C0_MSR_PMON_CTL0			0xd10
-#define SNBEP_C0_MSR_PMON_BOX_CTL		0xd04
-#define SNBEP_C0_MSR_PMON_BOX_FILTER		0xd14
-#define SNBEP_CBO_MSR_OFFSET			0x20
-
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID	0x1f
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID	0x3fc00
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE	0x7c0000
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC	0xff800000
-
-#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) {	\
-	.event = (e),				\
-	.msr = SNBEP_C0_MSR_PMON_BOX_FILTER,	\
-	.config_mask = (m),			\
-	.idx = (i)				\
-}
-
-/* SNB-EP PCU register */
-#define SNBEP_PCU_MSR_PMON_CTR0			0xc36
-#define SNBEP_PCU_MSR_PMON_CTL0			0xc30
-#define SNBEP_PCU_MSR_PMON_BOX_CTL		0xc24
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER		0xc34
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK	0xffffffff
-#define SNBEP_PCU_MSR_CORE_C3_CTR		0x3fc
-#define SNBEP_PCU_MSR_CORE_C6_CTR		0x3fd
-
-/* IVT event control */
-#define IVT_PMON_BOX_CTL_INT		(SNBEP_PMON_BOX_CTL_RST_CTRL | \
-					 SNBEP_PMON_BOX_CTL_RST_CTRS)
-#define IVT_PMON_RAW_EVENT_MASK		(SNBEP_PMON_CTL_EV_SEL_MASK | \
-					 SNBEP_PMON_CTL_UMASK_MASK | \
-					 SNBEP_PMON_CTL_EDGE_DET | \
-					 SNBEP_PMON_CTL_TRESH_MASK)
-/* IVT Ubox */
-#define IVT_U_MSR_PMON_GLOBAL_CTL		0xc00
-#define IVT_U_PMON_GLOBAL_FRZ_ALL		(1 << 31)
-#define IVT_U_PMON_GLOBAL_UNFRZ_ALL		(1 << 29)
-
-#define IVT_U_MSR_PMON_RAW_EVENT_MASK	\
-				(SNBEP_PMON_CTL_EV_SEL_MASK | \
-				 SNBEP_PMON_CTL_UMASK_MASK | \
-				 SNBEP_PMON_CTL_EDGE_DET | \
-				 SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-/* IVT Cbo */
-#define IVT_CBO_MSR_PMON_RAW_EVENT_MASK		(IVT_PMON_RAW_EVENT_MASK | \
-						 SNBEP_CBO_PMON_CTL_TID_EN)
-
-#define IVT_CB0_MSR_PMON_BOX_FILTER_TID		(0x1fULL << 0)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_LINK	(0xfULL << 5)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_STATE	(0x3fULL << 17)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_NID		(0xffffULL << 32)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_OPC		(0x1ffULL << 52)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_C6		(0x1ULL << 61)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_NC		(0x1ULL << 62)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_IOSC	(0x1ULL << 63)
-
-/* IVT home agent */
-#define IVT_HA_PCI_PMON_CTL_Q_OCC_RST		(1 << 16)
-#define IVT_HA_PCI_PMON_RAW_EVENT_MASK		\
-				(IVT_PMON_RAW_EVENT_MASK | \
-				 IVT_HA_PCI_PMON_CTL_Q_OCC_RST)
-/* IVT PCU */
-#define IVT_PCU_MSR_PMON_RAW_EVENT_MASK	\
-				(SNBEP_PMON_CTL_EV_SEL_MASK | \
-				 SNBEP_PMON_CTL_EV_SEL_EXT | \
-				 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
-				 SNBEP_PMON_CTL_EDGE_DET | \
-				 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
-				 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
-				 SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-/* IVT QPI */
-#define IVT_QPI_PCI_PMON_RAW_EVENT_MASK	\
-				(IVT_PMON_RAW_EVENT_MASK | \
-				 SNBEP_PMON_CTL_EV_SEL_EXT)
-
-/* NHM-EX event control */
-#define NHMEX_PMON_CTL_EV_SEL_MASK	0x000000ff
-#define NHMEX_PMON_CTL_UMASK_MASK	0x0000ff00
-#define NHMEX_PMON_CTL_EN_BIT0		(1 << 0)
-#define NHMEX_PMON_CTL_EDGE_DET		(1 << 18)
-#define NHMEX_PMON_CTL_PMI_EN		(1 << 20)
-#define NHMEX_PMON_CTL_EN_BIT22		(1 << 22)
-#define NHMEX_PMON_CTL_INVERT		(1 << 23)
-#define NHMEX_PMON_CTL_TRESH_MASK	0xff000000
-#define NHMEX_PMON_RAW_EVENT_MASK	(NHMEX_PMON_CTL_EV_SEL_MASK | \
-					 NHMEX_PMON_CTL_UMASK_MASK | \
-					 NHMEX_PMON_CTL_EDGE_DET | \
-					 NHMEX_PMON_CTL_INVERT | \
-					 NHMEX_PMON_CTL_TRESH_MASK)
-
-/* NHM-EX Ubox */
-#define NHMEX_U_MSR_PMON_GLOBAL_CTL		0xc00
-#define NHMEX_U_MSR_PMON_CTR			0xc11
-#define NHMEX_U_MSR_PMON_EV_SEL			0xc10
-
-#define NHMEX_U_PMON_GLOBAL_EN			(1 << 0)
-#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL	0x0000001e
-#define NHMEX_U_PMON_GLOBAL_EN_ALL		(1 << 28)
-#define NHMEX_U_PMON_GLOBAL_RST_ALL		(1 << 29)
-#define NHMEX_U_PMON_GLOBAL_FRZ_ALL		(1 << 31)
-
-#define NHMEX_U_PMON_RAW_EVENT_MASK		\
-		(NHMEX_PMON_CTL_EV_SEL_MASK |	\
-		 NHMEX_PMON_CTL_EDGE_DET)
-
-/* NHM-EX Cbox */
-#define NHMEX_C0_MSR_PMON_GLOBAL_CTL		0xd00
-#define NHMEX_C0_MSR_PMON_CTR0			0xd11
-#define NHMEX_C0_MSR_PMON_EV_SEL0		0xd10
-#define NHMEX_C_MSR_OFFSET			0x20
-
-/* NHM-EX Bbox */
-#define NHMEX_B0_MSR_PMON_GLOBAL_CTL		0xc20
-#define NHMEX_B0_MSR_PMON_CTR0			0xc31
-#define NHMEX_B0_MSR_PMON_CTL0			0xc30
-#define NHMEX_B_MSR_OFFSET			0x40
-#define NHMEX_B0_MSR_MATCH			0xe45
-#define NHMEX_B0_MSR_MASK			0xe46
-#define NHMEX_B1_MSR_MATCH			0xe4d
-#define NHMEX_B1_MSR_MASK			0xe4e
-
-#define NHMEX_B_PMON_CTL_EN			(1 << 0)
-#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT		1
-#define NHMEX_B_PMON_CTL_EV_SEL_MASK		\
-		(0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_B_PMON_CTR_SHIFT		6
-#define NHMEX_B_PMON_CTR_MASK		\
-		(0x3 << NHMEX_B_PMON_CTR_SHIFT)
-#define NHMEX_B_PMON_RAW_EVENT_MASK		\
-		(NHMEX_B_PMON_CTL_EV_SEL_MASK | \
-		 NHMEX_B_PMON_CTR_MASK)
-
-/* NHM-EX Sbox */
-#define NHMEX_S0_MSR_PMON_GLOBAL_CTL		0xc40
-#define NHMEX_S0_MSR_PMON_CTR0			0xc51
-#define NHMEX_S0_MSR_PMON_CTL0			0xc50
-#define NHMEX_S_MSR_OFFSET			0x80
-#define NHMEX_S0_MSR_MM_CFG			0xe48
-#define NHMEX_S0_MSR_MATCH			0xe49
-#define NHMEX_S0_MSR_MASK			0xe4a
-#define NHMEX_S1_MSR_MM_CFG			0xe58
-#define NHMEX_S1_MSR_MATCH			0xe59
-#define NHMEX_S1_MSR_MASK			0xe5a
-
-#define NHMEX_S_PMON_MM_CFG_EN			(0x1ULL << 63)
-#define NHMEX_S_EVENT_TO_R_PROG_EV		0
-
-/* NHM-EX Mbox */
-#define NHMEX_M0_MSR_GLOBAL_CTL			0xca0
-#define NHMEX_M0_MSR_PMU_DSP			0xca5
-#define NHMEX_M0_MSR_PMU_ISS			0xca6
-#define NHMEX_M0_MSR_PMU_MAP			0xca7
-#define NHMEX_M0_MSR_PMU_MSC_THR		0xca8
-#define NHMEX_M0_MSR_PMU_PGT			0xca9
-#define NHMEX_M0_MSR_PMU_PLD			0xcaa
-#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC		0xcab
-#define NHMEX_M0_MSR_PMU_CTL0			0xcb0
-#define NHMEX_M0_MSR_PMU_CNT0			0xcb1
-#define NHMEX_M_MSR_OFFSET			0x40
-#define NHMEX_M0_MSR_PMU_MM_CFG			0xe54
-#define NHMEX_M1_MSR_PMU_MM_CFG			0xe5c
-
-#define NHMEX_M_PMON_MM_CFG_EN			(1ULL << 63)
-#define NHMEX_M_PMON_ADDR_MATCH_MASK		0x3ffffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_MASK		0x7ffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_SHIFT		34
-
-#define NHMEX_M_PMON_CTL_EN			(1 << 0)
-#define NHMEX_M_PMON_CTL_PMI_EN			(1 << 1)
-#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT	2
-#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK	\
-	(0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT	4
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK	\
-	(0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_WRAP_MODE		(1 << 6)
-#define NHMEX_M_PMON_CTL_FLAG_MODE		(1 << 7)
-#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT		9
-#define NHMEX_M_PMON_CTL_INC_SEL_MASK		\
-	(0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT	19
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK	\
-	(0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
-#define NHMEX_M_PMON_RAW_EVENT_MASK			\
-		(NHMEX_M_PMON_CTL_COUNT_MODE_MASK |	\
-		 NHMEX_M_PMON_CTL_STORAGE_MODE_MASK |	\
-		 NHMEX_M_PMON_CTL_WRAP_MODE |		\
-		 NHMEX_M_PMON_CTL_FLAG_MODE |		\
-		 NHMEX_M_PMON_CTL_INC_SEL_MASK |	\
-		 NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
-
-#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK		(((1 << 11) - 1) | (1 << 23))
-#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n)	(0x7ULL << (11 + 3 * (n)))
-
-#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK		(((1 << 12) - 1) | (1 << 24))
-#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n)	(0x7ULL << (12 + 3 * (n)))
-
-/*
- * use the 9~13 bits to select event If the 7th bit is not set,
- * otherwise use the 19~21 bits to select event.
- */
-#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
-				NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
-			   NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
-				NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_EXTAR_REG(c, r) \
-		EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
-				MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
-#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
-		EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
-				MBOX_SET_FLAG_SEL_MASK, \
-				(u64)-1, NHMEX_M_##r)
-
-/* NHM-EX Rbox */
-#define NHMEX_R_MSR_GLOBAL_CTL			0xe00
-#define NHMEX_R_MSR_PMON_CTL0			0xe10
-#define NHMEX_R_MSR_PMON_CNT0			0xe11
-#define NHMEX_R_MSR_OFFSET			0x20
-
-#define NHMEX_R_MSR_PORTN_QLX_CFG(n)		\
-		((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n)		(0xe04 + (n))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n)		(0xe24 + (n))
-#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n)		\
-		(((n) < 4 ? 0 : 0x10) + (n) * 4)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n)	\
-		(0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n)	\
-		(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n)	\
-		(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n)	\
-		(0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n)	\
-		(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n)	\
-		(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
-
-#define NHMEX_R_PMON_CTL_EN			(1 << 0)
-#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT		1
-#define NHMEX_R_PMON_CTL_EV_SEL_MASK		\
-		(0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_R_PMON_CTL_PMI_EN			(1 << 6)
-#define NHMEX_R_PMON_RAW_EVENT_MASK		NHMEX_R_PMON_CTL_EV_SEL_MASK
-
-/* NHM-EX Wbox */
-#define NHMEX_W_MSR_GLOBAL_CTL			0xc80
-#define NHMEX_W_MSR_PMON_CNT0			0xc90
-#define NHMEX_W_MSR_PMON_EVT_SEL0		0xc91
-#define NHMEX_W_MSR_PMON_FIXED_CTR		0x394
-#define NHMEX_W_MSR_PMON_FIXED_CTL		0x395
-
-#define NHMEX_W_PMON_GLOBAL_FIXED_EN		(1ULL << 31)
-
 struct intel_uncore_ops;
 struct intel_uncore_pmu;
 struct intel_uncore_box;
@@ -505,6 +116,9 @@
 	const char *config;
 };
 
+ssize_t uncore_event_show(struct kobject *kobj,
+			  struct kobj_attribute *attr, char *buf);
+
 #define INTEL_UNCORE_EVENT_DESC(_name, _config)			\
 {								\
 	.attr	= __ATTR(_name, 0444, uncore_event_show, NULL),	\
@@ -522,15 +136,6 @@
 static struct kobj_attribute format_attr_##_var =			\
 	__ATTR(_name, 0444, __uncore_##_var##_show, NULL)
 
-
-static ssize_t uncore_event_show(struct kobject *kobj,
-				struct kobj_attribute *attr, char *buf)
-{
-	struct uncore_event_desc *event =
-		container_of(attr, struct uncore_event_desc, attr);
-	return sprintf(buf, "%s", event->config);
-}
-
 static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
 {
 	return box->pmu->type->box_ctl;
@@ -694,3 +299,39 @@
 {
 	return (box->phys_id < 0);
 }
+
+struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event);
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
+struct intel_uncore_box *uncore_event_to_box(struct perf_event *event);
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_event_read(struct perf_event *event);
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
+struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
+
+extern struct intel_uncore_type **uncore_msr_uncores;
+extern struct intel_uncore_type **uncore_pci_uncores;
+extern struct pci_driver *uncore_pci_driver;
+extern int uncore_pcibus_to_physid[256];
+extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
+extern struct event_constraint uncore_constraint_empty;
+
+/* perf_event_intel_uncore_snb.c */
+int snb_uncore_pci_init(void);
+int ivb_uncore_pci_init(void);
+int hsw_uncore_pci_init(void);
+void snb_uncore_cpu_init(void);
+void nhm_uncore_cpu_init(void);
+
+/* perf_event_intel_uncore_snbep.c */
+int snbep_uncore_pci_init(void);
+void snbep_uncore_cpu_init(void);
+int ivbep_uncore_pci_init(void);
+void ivbep_uncore_cpu_init(void);
+
+/* perf_event_intel_uncore_nhmex.c */
+void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c
new file mode 100644
index 0000000..2749965
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c
@@ -0,0 +1,1221 @@
+/* Nehalem-EX/Westmere-EX uncore support */
+#include "perf_event_intel_uncore.h"
+
+/* NHM-EX event control */
+#define NHMEX_PMON_CTL_EV_SEL_MASK	0x000000ff
+#define NHMEX_PMON_CTL_UMASK_MASK	0x0000ff00
+#define NHMEX_PMON_CTL_EN_BIT0		(1 << 0)
+#define NHMEX_PMON_CTL_EDGE_DET		(1 << 18)
+#define NHMEX_PMON_CTL_PMI_EN		(1 << 20)
+#define NHMEX_PMON_CTL_EN_BIT22		(1 << 22)
+#define NHMEX_PMON_CTL_INVERT		(1 << 23)
+#define NHMEX_PMON_CTL_TRESH_MASK	0xff000000
+#define NHMEX_PMON_RAW_EVENT_MASK	(NHMEX_PMON_CTL_EV_SEL_MASK | \
+					 NHMEX_PMON_CTL_UMASK_MASK | \
+					 NHMEX_PMON_CTL_EDGE_DET | \
+					 NHMEX_PMON_CTL_INVERT | \
+					 NHMEX_PMON_CTL_TRESH_MASK)
+
+/* NHM-EX Ubox */
+#define NHMEX_U_MSR_PMON_GLOBAL_CTL		0xc00
+#define NHMEX_U_MSR_PMON_CTR			0xc11
+#define NHMEX_U_MSR_PMON_EV_SEL			0xc10
+
+#define NHMEX_U_PMON_GLOBAL_EN			(1 << 0)
+#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL	0x0000001e
+#define NHMEX_U_PMON_GLOBAL_EN_ALL		(1 << 28)
+#define NHMEX_U_PMON_GLOBAL_RST_ALL		(1 << 29)
+#define NHMEX_U_PMON_GLOBAL_FRZ_ALL		(1 << 31)
+
+#define NHMEX_U_PMON_RAW_EVENT_MASK		\
+		(NHMEX_PMON_CTL_EV_SEL_MASK |	\
+		 NHMEX_PMON_CTL_EDGE_DET)
+
+/* NHM-EX Cbox */
+#define NHMEX_C0_MSR_PMON_GLOBAL_CTL		0xd00
+#define NHMEX_C0_MSR_PMON_CTR0			0xd11
+#define NHMEX_C0_MSR_PMON_EV_SEL0		0xd10
+#define NHMEX_C_MSR_OFFSET			0x20
+
+/* NHM-EX Bbox */
+#define NHMEX_B0_MSR_PMON_GLOBAL_CTL		0xc20
+#define NHMEX_B0_MSR_PMON_CTR0			0xc31
+#define NHMEX_B0_MSR_PMON_CTL0			0xc30
+#define NHMEX_B_MSR_OFFSET			0x40
+#define NHMEX_B0_MSR_MATCH			0xe45
+#define NHMEX_B0_MSR_MASK			0xe46
+#define NHMEX_B1_MSR_MATCH			0xe4d
+#define NHMEX_B1_MSR_MASK			0xe4e
+
+#define NHMEX_B_PMON_CTL_EN			(1 << 0)
+#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT		1
+#define NHMEX_B_PMON_CTL_EV_SEL_MASK		\
+		(0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_B_PMON_CTR_SHIFT		6
+#define NHMEX_B_PMON_CTR_MASK		\
+		(0x3 << NHMEX_B_PMON_CTR_SHIFT)
+#define NHMEX_B_PMON_RAW_EVENT_MASK		\
+		(NHMEX_B_PMON_CTL_EV_SEL_MASK | \
+		 NHMEX_B_PMON_CTR_MASK)
+
+/* NHM-EX Sbox */
+#define NHMEX_S0_MSR_PMON_GLOBAL_CTL		0xc40
+#define NHMEX_S0_MSR_PMON_CTR0			0xc51
+#define NHMEX_S0_MSR_PMON_CTL0			0xc50
+#define NHMEX_S_MSR_OFFSET			0x80
+#define NHMEX_S0_MSR_MM_CFG			0xe48
+#define NHMEX_S0_MSR_MATCH			0xe49
+#define NHMEX_S0_MSR_MASK			0xe4a
+#define NHMEX_S1_MSR_MM_CFG			0xe58
+#define NHMEX_S1_MSR_MATCH			0xe59
+#define NHMEX_S1_MSR_MASK			0xe5a
+
+#define NHMEX_S_PMON_MM_CFG_EN			(0x1ULL << 63)
+#define NHMEX_S_EVENT_TO_R_PROG_EV		0
+
+/* NHM-EX Mbox */
+#define NHMEX_M0_MSR_GLOBAL_CTL			0xca0
+#define NHMEX_M0_MSR_PMU_DSP			0xca5
+#define NHMEX_M0_MSR_PMU_ISS			0xca6
+#define NHMEX_M0_MSR_PMU_MAP			0xca7
+#define NHMEX_M0_MSR_PMU_MSC_THR		0xca8
+#define NHMEX_M0_MSR_PMU_PGT			0xca9
+#define NHMEX_M0_MSR_PMU_PLD			0xcaa
+#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC		0xcab
+#define NHMEX_M0_MSR_PMU_CTL0			0xcb0
+#define NHMEX_M0_MSR_PMU_CNT0			0xcb1
+#define NHMEX_M_MSR_OFFSET			0x40
+#define NHMEX_M0_MSR_PMU_MM_CFG			0xe54
+#define NHMEX_M1_MSR_PMU_MM_CFG			0xe5c
+
+#define NHMEX_M_PMON_MM_CFG_EN			(1ULL << 63)
+#define NHMEX_M_PMON_ADDR_MATCH_MASK		0x3ffffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_MASK		0x7ffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_SHIFT		34
+
+#define NHMEX_M_PMON_CTL_EN			(1 << 0)
+#define NHMEX_M_PMON_CTL_PMI_EN			(1 << 1)
+#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT	2
+#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK	\
+	(0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT	4
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK	\
+	(0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_WRAP_MODE		(1 << 6)
+#define NHMEX_M_PMON_CTL_FLAG_MODE		(1 << 7)
+#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT		9
+#define NHMEX_M_PMON_CTL_INC_SEL_MASK		\
+	(0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT	19
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK	\
+	(0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
+#define NHMEX_M_PMON_RAW_EVENT_MASK			\
+		(NHMEX_M_PMON_CTL_COUNT_MODE_MASK |	\
+		 NHMEX_M_PMON_CTL_STORAGE_MODE_MASK |	\
+		 NHMEX_M_PMON_CTL_WRAP_MODE |		\
+		 NHMEX_M_PMON_CTL_FLAG_MODE |		\
+		 NHMEX_M_PMON_CTL_INC_SEL_MASK |	\
+		 NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
+
+#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK		(((1 << 11) - 1) | (1 << 23))
+#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n)	(0x7ULL << (11 + 3 * (n)))
+
+#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK		(((1 << 12) - 1) | (1 << 24))
+#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n)	(0x7ULL << (12 + 3 * (n)))
+
+/*
+ * Use bits 9~13 to select the event if the 7th bit is not set;
+ * otherwise use bits 19~21 to select the event.
+ */
+#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
+				NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+			   NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
+				NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_EXTAR_REG(c, r) \
+		EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+				MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
+#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
+		EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+				MBOX_SET_FLAG_SEL_MASK, \
+				(u64)-1, NHMEX_M_##r)
+
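+/*
+ * Worked example (illustrative): MBOX_INC_SEL(0xd) places 0xd in the
+ * inc_sel field (bits 9-13), i.e. 0xd << 9 = 0x1a00, with FLAG_MODE
+ * (bit 7) clear; this is how the bbox_cmds_read event description
+ * below encodes inc_sel=0xd.
+ */
+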
+/* NHM-EX Rbox */
+#define NHMEX_R_MSR_GLOBAL_CTL			0xe00
+#define NHMEX_R_MSR_PMON_CTL0			0xe10
+#define NHMEX_R_MSR_PMON_CNT0			0xe11
+#define NHMEX_R_MSR_OFFSET			0x20
+
+#define NHMEX_R_MSR_PORTN_QLX_CFG(n)		\
+		((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n)		(0xe04 + (n))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n)		(0xe24 + (n))
+#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n)		\
+		(((n) < 4 ? 0 : 0x10) + (n) * 4)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n)	\
+		(0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n)	\
+		(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n)	\
+		(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n)	\
+		(0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n)	\
+		(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n)	\
+		(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
+
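+/*
+ * Worked example (illustrative): for port 5, XBR_OFFSET(5) =
+ * 0x10 + 5 * 4 = 0x24, so SET1_MM_CFG(5) = 0xe60 + 0x24 = 0xe84,
+ * with the MATCH and MASK registers following at 0xe85 and 0xe86.
+ */
+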
+#define NHMEX_R_PMON_CTL_EN			(1 << 0)
+#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT		1
+#define NHMEX_R_PMON_CTL_EV_SEL_MASK		\
+		(0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_R_PMON_CTL_PMI_EN			(1 << 6)
+#define NHMEX_R_PMON_RAW_EVENT_MASK		NHMEX_R_PMON_CTL_EV_SEL_MASK
+
+/* NHM-EX Wbox */
+#define NHMEX_W_MSR_GLOBAL_CTL			0xc80
+#define NHMEX_W_MSR_PMON_CNT0			0xc90
+#define NHMEX_W_MSR_PMON_EVT_SEL0		0xc91
+#define NHMEX_W_MSR_PMON_FIXED_CTR		0x394
+#define NHMEX_W_MSR_PMON_FIXED_CTL		0x395
+
+#define NHMEX_W_PMON_GLOBAL_FIXED_EN		(1ULL << 31)
+
+#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
+				((1ULL << (n)) - 1)))
+
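+/*
+ * E.g. (illustrative) __BITS_VALUE(0xaabbccdd, 1, 8) extracts field 1
+ * of width 8: (0xaabbccdd >> 8) & 0xff = 0xcc.
+ */
+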
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
+DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
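+
+/*
+ * These attributes are exported under the PMU's sysfs "format"
+ * directory, which is what lets the perf tool translate strings such
+ * as "event=0xff,umask=0" (the wbox clockticks alias below) into the
+ * config bit ranges given here.
+ */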
+
+static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+	wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
+}
+
+static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+	unsigned msr = uncore_msr_box_ctl(box);
+	u64 config;
+
+	if (msr) {
+		rdmsrl(msr, config);
+		config &= ~((1ULL << uncore_num_counters(box)) - 1);
+		/* WBox has a fixed counter */
+		if (uncore_msr_fixed_ctl(box))
+			config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
+		wrmsrl(msr, config);
+	}
+}
+
+static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+	unsigned msr = uncore_msr_box_ctl(box);
+	u64 config;
+
+	if (msr) {
+		rdmsrl(msr, config);
+		config |= (1ULL << uncore_num_counters(box)) - 1;
+		/* WBox has a fixed counter */
+		if (uncore_msr_fixed_ctl(box))
+			config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
+		wrmsrl(msr, config);
+	}
+}
+
+static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	wrmsrl(event->hw.config_base, 0);
+}
+
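+/*
+ * Note: most NHM-EX boxes use bit 0 of the control MSR as the enable
+ * bit; boxes whose raw event mask already claims bit 0 for the event
+ * select use bit 22 instead, which is what the checks below sort out.
+ */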
+static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
+		wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
+	else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
+		wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+	else
+		wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+#define NHMEX_UNCORE_OPS_COMMON_INIT()				\
+	.init_box	= nhmex_uncore_msr_init_box,		\
+	.disable_box	= nhmex_uncore_msr_disable_box,		\
+	.enable_box	= nhmex_uncore_msr_enable_box,		\
+	.disable_event	= nhmex_uncore_msr_disable_event,	\
+	.read_counter	= uncore_msr_read_counter
+
+static struct intel_uncore_ops nhmex_uncore_ops = {
+	NHMEX_UNCORE_OPS_COMMON_INIT(),
+	.enable_event	= nhmex_uncore_msr_enable_event,
+};
+
+static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_edge.attr,
+	NULL,
+};
+
+static struct attribute_group nhmex_uncore_ubox_format_group = {
+	.name		= "format",
+	.attrs		= nhmex_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type nhmex_uncore_ubox = {
+	.name		= "ubox",
+	.num_counters	= 1,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 48,
+	.event_ctl	= NHMEX_U_MSR_PMON_EV_SEL,
+	.perf_ctr	= NHMEX_U_MSR_PMON_CTR,
+	.event_mask	= NHMEX_U_PMON_RAW_EVENT_MASK,
+	.box_ctl	= NHMEX_U_MSR_PMON_GLOBAL_CTL,
+	.ops		= &nhmex_uncore_ops,
+	.format_group	= &nhmex_uncore_ubox_format_group
+};
+
+static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	NULL,
+};
+
+static struct attribute_group nhmex_uncore_cbox_format_group = {
+	.name = "format",
+	.attrs = nhmex_uncore_cbox_formats_attr,
+};
+
+/* msr offset for each instance of cbox */
+static unsigned nhmex_cbox_msr_offsets[] = {
+	0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
+};
+
+static struct intel_uncore_type nhmex_uncore_cbox = {
+	.name			= "cbox",
+	.num_counters		= 6,
+	.num_boxes		= 10,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= NHMEX_C0_MSR_PMON_EV_SEL0,
+	.perf_ctr		= NHMEX_C0_MSR_PMON_CTR0,
+	.event_mask		= NHMEX_PMON_RAW_EVENT_MASK,
+	.box_ctl		= NHMEX_C0_MSR_PMON_GLOBAL_CTL,
+	.msr_offsets		= nhmex_cbox_msr_offsets,
+	.pair_ctr_ctl		= 1,
+	.ops			= &nhmex_uncore_ops,
+	.format_group		= &nhmex_uncore_cbox_format_group
+};
+
+static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
+	INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
+	{ /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type nhmex_uncore_wbox = {
+	.name			= "wbox",
+	.num_counters		= 4,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= NHMEX_W_MSR_PMON_CNT0,
+	.perf_ctr		= NHMEX_W_MSR_PMON_EVT_SEL0,
+	.fixed_ctr		= NHMEX_W_MSR_PMON_FIXED_CTR,
+	.fixed_ctl		= NHMEX_W_MSR_PMON_FIXED_CTL,
+	.event_mask		= NHMEX_PMON_RAW_EVENT_MASK,
+	.box_ctl		= NHMEX_W_MSR_GLOBAL_CTL,
+	.pair_ctr_ctl		= 1,
+	.event_descs		= nhmex_uncore_wbox_events,
+	.ops			= &nhmex_uncore_ops,
+	.format_group		= &nhmex_uncore_cbox_format_group
+};
+
+static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+	int ctr, ev_sel;
+
+	ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
+		NHMEX_B_PMON_CTR_SHIFT;
+	ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
+		  NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
+
+	/* events that do not use the match/mask registers */
+	if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
+	    (ctr == 2 && ev_sel != 0x4) || ctr == 3)
+		return 0;
+
+	if (box->pmu->pmu_idx == 0)
+		reg1->reg = NHMEX_B0_MSR_MATCH;
+	else
+		reg1->reg = NHMEX_B1_MSR_MATCH;
+	reg1->idx = 0;
+	reg1->config = event->attr.config1;
+	reg2->config = event->attr.config2;
+	return 0;
+}
+
+static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+	if (reg1->idx != EXTRA_REG_NONE) {
+		wrmsrl(reg1->reg, reg1->config);
+		wrmsrl(reg1->reg + 1, reg2->config);
+	}
+	wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+		(hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
+}
+
+/*
+ * The Bbox has 4 counters, but each counter monitors different events.
+ * Use bits 6-7 in the event config to select the counter.
+ */
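+/*
+ * E.g. (illustrative) a config of 0x40 has bits 6-7 = 01 and is
+ * therefore constrained to counter 1 (counter bitmask 0x2) by the
+ * second entry below.
+ */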
+static struct event_constraint nhmex_uncore_bbox_constraints[] = {
+	EVENT_CONSTRAINT(0, 1, 0xc0),
+	EVENT_CONSTRAINT(0x40, 2, 0xc0),
+	EVENT_CONSTRAINT(0x80, 4, 0xc0),
+	EVENT_CONSTRAINT(0xc0, 8, 0xc0),
+	EVENT_CONSTRAINT_END,
+};
+
+static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
+	&format_attr_event5.attr,
+	&format_attr_counter.attr,
+	&format_attr_match.attr,
+	&format_attr_mask.attr,
+	NULL,
+};
+
+static struct attribute_group nhmex_uncore_bbox_format_group = {
+	.name = "format",
+	.attrs = nhmex_uncore_bbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
+	NHMEX_UNCORE_OPS_COMMON_INIT(),
+	.enable_event		= nhmex_bbox_msr_enable_event,
+	.hw_config		= nhmex_bbox_hw_config,
+	.get_constraint		= uncore_get_constraint,
+	.put_constraint		= uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_bbox = {
+	.name			= "bbox",
+	.num_counters		= 4,
+	.num_boxes		= 2,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= NHMEX_B0_MSR_PMON_CTL0,
+	.perf_ctr		= NHMEX_B0_MSR_PMON_CTR0,
+	.event_mask		= NHMEX_B_PMON_RAW_EVENT_MASK,
+	.box_ctl		= NHMEX_B0_MSR_PMON_GLOBAL_CTL,
+	.msr_offset		= NHMEX_B_MSR_OFFSET,
+	.pair_ctr_ctl		= 1,
+	.num_shared_regs	= 1,
+	.constraints		= nhmex_uncore_bbox_constraints,
+	.ops			= &nhmex_uncore_bbox_ops,
+	.format_group		= &nhmex_uncore_bbox_format_group
+};
+
+static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+	/* only TO_R_PROG_EV event uses the match/mask register */
+	if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
+	    NHMEX_S_EVENT_TO_R_PROG_EV)
+		return 0;
+
+	if (box->pmu->pmu_idx == 0)
+		reg1->reg = NHMEX_S0_MSR_MM_CFG;
+	else
+		reg1->reg = NHMEX_S1_MSR_MM_CFG;
+	reg1->idx = 0;
+	reg1->config = event->attr.config1;
+	reg2->config = event->attr.config2;
+	return 0;
+}
+
+static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+	if (reg1->idx != EXTRA_REG_NONE) {
+		wrmsrl(reg1->reg, 0);
+		wrmsrl(reg1->reg + 1, reg1->config);
+		wrmsrl(reg1->reg + 2, reg2->config);
+		wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
+	}
+	wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+}
+
+static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	&format_attr_match.attr,
+	&format_attr_mask.attr,
+	NULL,
+};
+
+static struct attribute_group nhmex_uncore_sbox_format_group = {
+	.name			= "format",
+	.attrs			= nhmex_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
+	NHMEX_UNCORE_OPS_COMMON_INIT(),
+	.enable_event		= nhmex_sbox_msr_enable_event,
+	.hw_config		= nhmex_sbox_hw_config,
+	.get_constraint		= uncore_get_constraint,
+	.put_constraint		= uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_sbox = {
+	.name			= "sbox",
+	.num_counters		= 4,
+	.num_boxes		= 2,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= NHMEX_S0_MSR_PMON_CTL0,
+	.perf_ctr		= NHMEX_S0_MSR_PMON_CTR0,
+	.event_mask		= NHMEX_PMON_RAW_EVENT_MASK,
+	.box_ctl		= NHMEX_S0_MSR_PMON_GLOBAL_CTL,
+	.msr_offset		= NHMEX_S_MSR_OFFSET,
+	.pair_ctr_ctl		= 1,
+	.num_shared_regs	= 1,
+	.ops			= &nhmex_uncore_sbox_ops,
+	.format_group		= &nhmex_uncore_sbox_format_group
+};
+
+enum {
+	EXTRA_REG_NHMEX_M_FILTER,
+	EXTRA_REG_NHMEX_M_DSP,
+	EXTRA_REG_NHMEX_M_ISS,
+	EXTRA_REG_NHMEX_M_MAP,
+	EXTRA_REG_NHMEX_M_MSC_THR,
+	EXTRA_REG_NHMEX_M_PGT,
+	EXTRA_REG_NHMEX_M_PLD,
+	EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
+};
+
+static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
+	MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
+	MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
+	MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
+	MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
+	/* event 0xa uses two extra registers */
+	MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
+	MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
+	MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
+	/* events 0xd ~ 0x10 use the same extra register */
+	MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
+	MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
+	MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
+	MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
+	MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
+	MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
+	MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
+	MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
+	MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
+	EVENT_EXTRA_END
+};
+
+/* Nehalem-EX or Westmere-EX? */
+static bool uncore_nhmex;
+
+static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
+{
+	struct intel_uncore_extra_reg *er;
+	unsigned long flags;
+	bool ret = false;
+	u64 mask;
+
+	if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+		er = &box->shared_regs[idx];
+		raw_spin_lock_irqsave(&er->lock, flags);
+		if (!atomic_read(&er->ref) || er->config == config) {
+			atomic_inc(&er->ref);
+			er->config = config;
+			ret = true;
+		}
+		raw_spin_unlock_irqrestore(&er->lock, flags);
+
+		return ret;
+	}
+	/*
+	 * The ZDP_CTL_FVC MSR has 4 fields which are used to control
+	 * events 0xd ~ 0x10. Besides these 4 fields, there are additional
+	 * fields which are shared.
+	 */
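+	/*
+	 * The per-field reference counts are packed into er->ref one
+	 * byte each, which is why the code below uses
+	 * atomic_add(1 << (idx * 8)) instead of a plain atomic_inc().
+	 */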
+	idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+	if (WARN_ON_ONCE(idx >= 4))
+		return false;
+
+	/* mask of the shared fields */
+	if (uncore_nhmex)
+		mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
+	else
+		mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
+	er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+
+	raw_spin_lock_irqsave(&er->lock, flags);
+	/* add mask of the non-shared field if it's in use */
+	if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
+		if (uncore_nhmex)
+			mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+		else
+			mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	}
+
+	if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
+		atomic_add(1 << (idx * 8), &er->ref);
+		if (uncore_nhmex)
+			mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
+				NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+		else
+			mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
+				WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+		er->config &= ~mask;
+		er->config |= (config & mask);
+		ret = true;
+	}
+	raw_spin_unlock_irqrestore(&er->lock, flags);
+
+	return ret;
+}
+
+static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
+{
+	struct intel_uncore_extra_reg *er;
+
+	if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+		er = &box->shared_regs[idx];
+		atomic_dec(&er->ref);
+		return;
+	}
+
+	idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+	er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+	atomic_sub(1 << (idx * 8), &er->ref);
+}
+
+static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
+	u64 config = reg1->config;
+
+	/* get the non-shared control bits and shift them */
+	idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+	if (uncore_nhmex)
+		config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	else
+		config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	if (new_idx > orig_idx) {
+		idx = new_idx - orig_idx;
+		config <<= 3 * idx;
+	} else {
+		idx = orig_idx - new_idx;
+		config >>= 3 * idx;
+	}
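+	/*
+	 * Note: each per-event FVC field is 3 bits wide (see
+	 * NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK), hence the shift by
+	 * 3 * idx above.
+	 */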
+
+	/* add the shared control bits back */
+	if (uncore_nhmex)
+		config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+	else
+		config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+	if (modify) {
+		/* adjust the main event selector */
+		if (new_idx > orig_idx)
+			hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+		else
+			hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+		reg1->config = config;
+		reg1->idx = ~0xff | new_idx;
+	}
+	return config;
+}
+
+static struct event_constraint *
+nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+	int i, idx[2], alloc = 0;
+	u64 config1 = reg1->config;
+
+	idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
+	idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
+again:
+	for (i = 0; i < 2; i++) {
+		if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+			idx[i] = 0xff;
+
+		if (idx[i] == 0xff)
+			continue;
+
+		if (!nhmex_mbox_get_shared_reg(box, idx[i],
+				__BITS_VALUE(config1, i, 32)))
+			goto fail;
+		alloc |= (0x1 << i);
+	}
+
+	/* for the match/mask registers */
+	if (reg2->idx != EXTRA_REG_NONE &&
+	    (uncore_box_is_fake(box) || !reg2->alloc) &&
+	    !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
+		goto fail;
+
+	/*
+	 * If it's a fake box -- as per validate_{group,event}() -- we
+	 * shouldn't touch event state, and we can avoid doing so since
+	 * both will only call get_event_constraints() once on each
+	 * event; this avoids the need for reg->alloc.
+	 */
+	if (!uncore_box_is_fake(box)) {
+		if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
+			nhmex_mbox_alter_er(event, idx[0], true);
+		reg1->alloc |= alloc;
+		if (reg2->idx != EXTRA_REG_NONE)
+			reg2->alloc = 1;
+	}
+	return NULL;
+fail:
+	if (idx[0] != 0xff && !(alloc & 0x1) &&
+	    idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+		/*
+		 * Events 0xd ~ 0x10 are functionally identical, but are
+		 * controlled by different fields in the ZDP_CTL_FVC
+		 * register. If we failed to take one field, try the
+		 * remaining 3 choices.
+		 */
+		BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
+		idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+		idx[0] = (idx[0] + 1) % 4;
+		idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+		if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
+			config1 = nhmex_mbox_alter_er(event, idx[0], false);
+			goto again;
+		}
+	}
+
+	if (alloc & 0x1)
+		nhmex_mbox_put_shared_reg(box, idx[0]);
+	if (alloc & 0x2)
+		nhmex_mbox_put_shared_reg(box, idx[1]);
+	return &uncore_constraint_empty;
+}
+
+static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+
+	if (uncore_box_is_fake(box))
+		return;
+
+	if (reg1->alloc & 0x1)
+		nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
+	if (reg1->alloc & 0x2)
+		nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
+	reg1->alloc = 0;
+
+	if (reg2->alloc) {
+		nhmex_mbox_put_shared_reg(box, reg2->idx);
+		reg2->alloc = 0;
+	}
+}
+
+static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
+{
+	if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+		return er->idx;
+	return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
+}
+
+static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct intel_uncore_type *type = box->pmu->type;
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+	struct extra_reg *er;
+	unsigned msr;
+	int reg_idx = 0;
+	/*
+	 * The mbox events may require at most 2 extra MSRs. But only
+	 * the lower 32 bits in these MSRs are significant, so we can use
+	 * config1 to pass both MSRs' configs.
+	 */
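+	/*
+	 * Packing (illustrative): reg1->reg holds the two MSR addresses
+	 * in 16-bit halves and reg1->idx the two extra-reg indices in
+	 * 8-bit halves; __BITS_VALUE() unpacks them again at event
+	 * enable time.
+	 */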
+	for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
+		if (er->event != (event->hw.config & er->config_mask))
+			continue;
+		if (event->attr.config1 & ~er->valid_mask)
+			return -EINVAL;
+
+		msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
+		if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
+			return -EINVAL;
+
+		/* always use bits 32~63 to pass the PLD config */
+		if (er->idx == EXTRA_REG_NHMEX_M_PLD)
+			reg_idx = 1;
+		else if (WARN_ON_ONCE(reg_idx > 0))
+			return -EINVAL;
+
+		reg1->idx &= ~(0xff << (reg_idx * 8));
+		reg1->reg &= ~(0xffff << (reg_idx * 16));
+		reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
+		reg1->reg |= msr << (reg_idx * 16);
+		reg1->config = event->attr.config1;
+		reg_idx++;
+	}
+	/*
+	 * The mbox only provides the ability to perform address matching
+	 * for the PLD events.
+	 */
+	if (reg_idx == 2) {
+		reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
+		if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
+			reg2->config = event->attr.config2;
+		else
+			reg2->config = ~0ULL;
+		if (box->pmu->pmu_idx == 0)
+			reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
+		else
+			reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
+	}
+	return 0;
+}
+
+static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+	struct intel_uncore_extra_reg *er;
+	unsigned long flags;
+	u64 config;
+
+	if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+		return box->shared_regs[idx].config;
+
+	er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+	raw_spin_lock_irqsave(&er->lock, flags);
+	config = er->config;
+	raw_spin_unlock_irqrestore(&er->lock, flags);
+	return config;
+}
+
+static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+	int idx;
+
+	idx = __BITS_VALUE(reg1->idx, 0, 8);
+	if (idx != 0xff)
+		wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
+			nhmex_mbox_shared_reg_config(box, idx));
+	idx = __BITS_VALUE(reg1->idx, 1, 8);
+	if (idx != 0xff)
+		wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
+			nhmex_mbox_shared_reg_config(box, idx));
+
+	if (reg2->idx != EXTRA_REG_NONE) {
+		wrmsrl(reg2->reg, 0);
+		if (reg2->config != ~0ULL) {
+			wrmsrl(reg2->reg + 1,
+				reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
+			wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
+				(reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
+			wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
+		}
+	}
+
+	wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(count_mode,		count_mode,	"config:2-3");
+DEFINE_UNCORE_FORMAT_ATTR(storage_mode,		storage_mode,	"config:4-5");
+DEFINE_UNCORE_FORMAT_ATTR(wrap_mode,		wrap_mode,	"config:6");
+DEFINE_UNCORE_FORMAT_ATTR(flag_mode,		flag_mode,	"config:7");
+DEFINE_UNCORE_FORMAT_ATTR(inc_sel,		inc_sel,	"config:9-13");
+DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel,		set_flag_sel,	"config:19-21");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en,	filter_cfg_en,	"config2:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_match,		filter_match,	"config2:0-33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_mask,		filter_mask,	"config2:34-61");
+DEFINE_UNCORE_FORMAT_ATTR(dsp,			dsp,		"config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(thr,			thr,		"config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(fvc,			fvc,		"config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pgt,			pgt,		"config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(map,			map,		"config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(iss,			iss,		"config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pld,			pld,		"config1:32-63");
+
+static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
+	&format_attr_count_mode.attr,
+	&format_attr_storage_mode.attr,
+	&format_attr_wrap_mode.attr,
+	&format_attr_flag_mode.attr,
+	&format_attr_inc_sel.attr,
+	&format_attr_set_flag_sel.attr,
+	&format_attr_filter_cfg_en.attr,
+	&format_attr_filter_match.attr,
+	&format_attr_filter_mask.attr,
+	&format_attr_dsp.attr,
+	&format_attr_thr.attr,
+	&format_attr_fvc.attr,
+	&format_attr_pgt.attr,
+	&format_attr_map.attr,
+	&format_attr_iss.attr,
+	&format_attr_pld.attr,
+	NULL,
+};
+
+static struct attribute_group nhmex_uncore_mbox_format_group = {
+	.name		= "format",
+	.attrs		= nhmex_uncore_mbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
+	INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
+	INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
+	{ /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
+	INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
+	INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
+	{ /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
+	NHMEX_UNCORE_OPS_COMMON_INIT(),
+	.enable_event	= nhmex_mbox_msr_enable_event,
+	.hw_config	= nhmex_mbox_hw_config,
+	.get_constraint	= nhmex_mbox_get_constraint,
+	.put_constraint	= nhmex_mbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_mbox = {
+	.name			= "mbox",
+	.num_counters		= 6,
+	.num_boxes		= 2,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= NHMEX_M0_MSR_PMU_CTL0,
+	.perf_ctr		= NHMEX_M0_MSR_PMU_CNT0,
+	.event_mask		= NHMEX_M_PMON_RAW_EVENT_MASK,
+	.box_ctl		= NHMEX_M0_MSR_GLOBAL_CTL,
+	.msr_offset		= NHMEX_M_MSR_OFFSET,
+	.pair_ctr_ctl		= 1,
+	.num_shared_regs	= 8,
+	.event_descs		= nhmex_uncore_mbox_events,
+	.ops			= &nhmex_uncore_mbox_ops,
+	.format_group		= &nhmex_uncore_mbox_format_group,
+};
+
+static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+	/* adjust the main event selector and extra register index */
+	if (reg1->idx % 2) {
+		reg1->idx--;
+		hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+	} else {
+		reg1->idx++;
+		hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+	}
+
+	/* adjust extra register config */
+	switch (reg1->idx % 6) {
+	case 2:
+		/* shift bits 8~15 down to bits 0~7 */
+		reg1->config >>= 8;
+		break;
+	case 3:
+		/* shift bits 0~7 up to bits 8~15 */
+		reg1->config <<= 8;
+		break;
+	}
+}
+
+/*
+ * Each rbox has 4 event sets, which monitor QPI ports 0~3 or 4~7.
+ * An event set consists of 6 events; the 3rd and 4th events in
+ * an event set use the same extra register, so an event set uses
+ * 5 extra registers.
+ */
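+/*
+ * Worked example of the mapping below: reg1->idx = 8 gives idx = 2 and
+ * er_idx = 2 + (8 / 6) * 5 = 7; reg1->idx = 9 gives idx = 3, which is
+ * decremented to 2, so er_idx = 7 as well; i.e. the 3rd and 4th events
+ * of the second set share shared_regs[7].  4 sets of 5 registers each
+ * is also where .num_shared_regs = 20 in nhmex_uncore_rbox comes from.
+ */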
+static struct event_constraint *
+nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+	struct intel_uncore_extra_reg *er;
+	unsigned long flags;
+	int idx, er_idx;
+	u64 config1;
+	bool ok = false;
+
+	if (!uncore_box_is_fake(box) && reg1->alloc)
+		return NULL;
+
+	idx = reg1->idx % 6;
+	config1 = reg1->config;
+again:
+	er_idx = idx;
+	/* the 3rd and 4th events use the same extra register */
+	if (er_idx > 2)
+		er_idx--;
+	er_idx += (reg1->idx / 6) * 5;
+
+	er = &box->shared_regs[er_idx];
+	raw_spin_lock_irqsave(&er->lock, flags);
+	if (idx < 2) {
+		if (!atomic_read(&er->ref) || er->config == reg1->config) {
+			atomic_inc(&er->ref);
+			er->config = reg1->config;
+			ok = true;
+		}
+	} else if (idx == 2 || idx == 3) {
+		/*
+		 * these two events use different fields of the same extra
+		 * register: bits 0~7 and bits 8~15 respectively.
+		 */
+		u64 mask = 0xff << ((idx - 2) * 8);
+		if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
+				!((er->config ^ config1) & mask)) {
+			atomic_add(1 << ((idx - 2) * 8), &er->ref);
+			er->config &= ~mask;
+			er->config |= config1 & mask;
+			ok = true;
+		}
+	} else {
+		if (!atomic_read(&er->ref) ||
+				(er->config == (hwc->config >> 32) &&
+				 er->config1 == reg1->config &&
+				 er->config2 == reg2->config)) {
+			atomic_inc(&er->ref);
+			er->config = (hwc->config >> 32);
+			er->config1 = reg1->config;
+			er->config2 = reg2->config;
+			ok = true;
+		}
+	}
+	raw_spin_unlock_irqrestore(&er->lock, flags);
+
+	if (!ok) {
+		/*
+		 * The Rbox events are always in pairs. The paired
+		 * events are functionally identical, but use different
+		 * extra registers. If we failed to take an extra
+		 * register, try the alternative.
+		 */
+		idx ^= 1;
+		if (idx != reg1->idx % 6) {
+			if (idx == 2)
+				config1 >>= 8;
+			else if (idx == 3)
+				config1 <<= 8;
+			goto again;
+		}
+	} else {
+		if (!uncore_box_is_fake(box)) {
+			if (idx != reg1->idx % 6)
+				nhmex_rbox_alter_er(box, event);
+			reg1->alloc = 1;
+		}
+		return NULL;
+	}
+	return &uncore_constraint_empty;
+}
+
+static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct intel_uncore_extra_reg *er;
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	int idx, er_idx;
+
+	if (uncore_box_is_fake(box) || !reg1->alloc)
+		return;
+
+	idx = reg1->idx % 6;
+	er_idx = idx;
+	if (er_idx > 2)
+		er_idx--;
+	er_idx += (reg1->idx / 6) * 5;
+
+	er = &box->shared_regs[er_idx];
+	if (idx == 2 || idx == 3)
+		atomic_sub(1 << ((idx - 2) * 8), &er->ref);
+	else
+		atomic_dec(&er->ref);
+
+	reg1->alloc = 0;
+}
+
+static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+	int idx;
+
+	idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
+		NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+	if (idx >= 0x18)
+		return -EINVAL;
+
+	reg1->idx = idx;
+	reg1->config = event->attr.config1;
+
+	switch (idx % 6) {
+	case 4:
+	case 5:
+		hwc->config |= event->attr.config & (~0ULL << 32);
+		reg2->config = event->attr.config2;
+		break;
+	}
+	return 0;
+}
+
+static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+	int idx, port;
+
+	idx = reg1->idx;
+	port = idx / 6 + box->pmu->pmu_idx * 4;
+
+	switch (idx % 6) {
+	case 0:
+		wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
+		break;
+	case 1:
+		wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
+		break;
+	case 2:
+	case 3:
+		wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
+			uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
+		break;
+	case 4:
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
+			hwc->config >> 32);
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
+		break;
+	case 5:
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
+			hwc->config >> 32);
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
+		wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
+		break;
+	}
+
+	wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+		(hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
+DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
+
+static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
+	&format_attr_event5.attr,
+	&format_attr_xbr_mm_cfg.attr,
+	&format_attr_xbr_match.attr,
+	&format_attr_xbr_mask.attr,
+	&format_attr_qlx_cfg.attr,
+	&format_attr_iperf_cfg.attr,
+	NULL,
+};
+
+static struct attribute_group nhmex_uncore_rbox_format_group = {
+	.name = "format",
+	.attrs = nhmex_uncore_rbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
+	INTEL_UNCORE_EVENT_DESC(qpi0_flit_send,		"event=0x0,iperf_cfg=0x80000000"),
+	INTEL_UNCORE_EVENT_DESC(qpi1_flit_send,		"event=0x6,iperf_cfg=0x80000000"),
+	INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt,		"event=0x0,iperf_cfg=0x40000000"),
+	INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt,		"event=0x6,iperf_cfg=0x40000000"),
+	INTEL_UNCORE_EVENT_DESC(qpi0_data_response,	"event=0x0,iperf_cfg=0xc4"),
+	INTEL_UNCORE_EVENT_DESC(qpi1_data_response,	"event=0x6,iperf_cfg=0xc4"),
+	{ /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
+	NHMEX_UNCORE_OPS_COMMON_INIT(),
+	.enable_event		= nhmex_rbox_msr_enable_event,
+	.hw_config		= nhmex_rbox_hw_config,
+	.get_constraint		= nhmex_rbox_get_constraint,
+	.put_constraint		= nhmex_rbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_rbox = {
+	.name			= "rbox",
+	.num_counters		= 8,
+	.num_boxes		= 2,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= NHMEX_R_MSR_PMON_CTL0,
+	.perf_ctr		= NHMEX_R_MSR_PMON_CNT0,
+	.event_mask		= NHMEX_R_PMON_RAW_EVENT_MASK,
+	.box_ctl		= NHMEX_R_MSR_GLOBAL_CTL,
+	.msr_offset		= NHMEX_R_MSR_OFFSET,
+	.pair_ctr_ctl		= 1,
+	.num_shared_regs	= 20,
+	.event_descs		= nhmex_uncore_rbox_events,
+	.ops			= &nhmex_uncore_rbox_ops,
+	.format_group		= &nhmex_uncore_rbox_format_group
+};
+
+static struct intel_uncore_type *nhmex_msr_uncores[] = {
+	&nhmex_uncore_ubox,
+	&nhmex_uncore_cbox,
+	&nhmex_uncore_bbox,
+	&nhmex_uncore_sbox,
+	&nhmex_uncore_mbox,
+	&nhmex_uncore_rbox,
+	&nhmex_uncore_wbox,
+	NULL,
+};
+
+void nhmex_uncore_cpu_init(void)
+{
+	if (boot_cpu_data.x86_model == 46)
+		uncore_nhmex = true;
+	else
+		nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
+	if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+		nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+	uncore_msr_uncores = nhmex_msr_uncores;
+}
+/* end of Nehalem-EX uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
new file mode 100644
index 0000000..e0e934c
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -0,0 +1,603 @@
+/* Nehalem/Sandy Bridge/Haswell uncore support */
+#include "perf_event_intel_uncore.h"
+
+/* SNB event control */
+#define SNB_UNC_CTL_EV_SEL_MASK			0x000000ff
+#define SNB_UNC_CTL_UMASK_MASK			0x0000ff00
+#define SNB_UNC_CTL_EDGE_DET			(1 << 18)
+#define SNB_UNC_CTL_EN				(1 << 22)
+#define SNB_UNC_CTL_INVERT			(1 << 23)
+#define SNB_UNC_CTL_CMASK_MASK			0x1f000000
+#define NHM_UNC_CTL_CMASK_MASK			0xff000000
+#define NHM_UNC_FIXED_CTR_CTL_EN		(1 << 0)
+
+#define SNB_UNC_RAW_EVENT_MASK			(SNB_UNC_CTL_EV_SEL_MASK | \
+						 SNB_UNC_CTL_UMASK_MASK | \
+						 SNB_UNC_CTL_EDGE_DET | \
+						 SNB_UNC_CTL_INVERT | \
+						 SNB_UNC_CTL_CMASK_MASK)
+
+#define NHM_UNC_RAW_EVENT_MASK			(SNB_UNC_CTL_EV_SEL_MASK | \
+						 SNB_UNC_CTL_UMASK_MASK | \
+						 SNB_UNC_CTL_EDGE_DET | \
+						 SNB_UNC_CTL_INVERT | \
+						 NHM_UNC_CTL_CMASK_MASK)
+
+/* SNB global control register */
+#define SNB_UNC_PERF_GLOBAL_CTL                 0x391
+#define SNB_UNC_FIXED_CTR_CTRL                  0x394
+#define SNB_UNC_FIXED_CTR                       0x395
+
+/* SNB uncore global control */
+#define SNB_UNC_GLOBAL_CTL_CORE_ALL             ((1 << 4) - 1)
+#define SNB_UNC_GLOBAL_CTL_EN                   (1 << 29)
+
+/* SNB Cbo register */
+#define SNB_UNC_CBO_0_PERFEVTSEL0               0x700
+#define SNB_UNC_CBO_0_PER_CTR0                  0x706
+#define SNB_UNC_CBO_MSR_OFFSET                  0x10
+
+/* NHM global control register */
+#define NHM_UNC_PERF_GLOBAL_CTL                 0x391
+#define NHM_UNC_FIXED_CTR                       0x394
+#define NHM_UNC_FIXED_CTR_CTRL                  0x395
+
+/* NHM uncore global control */
+#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL            ((1ULL << 8) - 1)
+#define NHM_UNC_GLOBAL_CTL_EN_FC                (1ULL << 32)
+
+/* NHM uncore register */
+#define NHM_UNC_PERFEVTSEL0                     0x3c0
+#define NHM_UNC_UNCORE_PMC0                     0x3b0
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+
+/* Sandy Bridge uncore support */
+static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+		wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+	else
+		wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
+}
+
+static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	wrmsrl(event->hw.config_base, 0);
+}
+
+static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+	if (box->pmu->pmu_idx == 0) {
+		wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
+			SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+	}
+}
+
+static struct uncore_event_desc snb_uncore_events[] = {
+	INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+	{ /* end: all zeroes */ },
+};
+
+static struct attribute *snb_uncore_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask5.attr,
+	NULL,
+};
+
+static struct attribute_group snb_uncore_format_group = {
+	.name		= "format",
+	.attrs		= snb_uncore_formats_attr,
+};
+
+static struct intel_uncore_ops snb_uncore_msr_ops = {
+	.init_box	= snb_uncore_msr_init_box,
+	.disable_event	= snb_uncore_msr_disable_event,
+	.enable_event	= snb_uncore_msr_enable_event,
+	.read_counter	= uncore_msr_read_counter,
+};
+
+static struct event_constraint snb_uncore_cbox_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
+	EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snb_uncore_cbox = {
+	.name		= "cbox",
+	.num_counters   = 2,
+	.num_boxes	= 4,
+	.perf_ctr_bits	= 44,
+	.fixed_ctr_bits	= 48,
+	.perf_ctr	= SNB_UNC_CBO_0_PER_CTR0,
+	.event_ctl	= SNB_UNC_CBO_0_PERFEVTSEL0,
+	.fixed_ctr	= SNB_UNC_FIXED_CTR,
+	.fixed_ctl	= SNB_UNC_FIXED_CTR_CTRL,
+	.single_fixed	= 1,
+	.event_mask	= SNB_UNC_RAW_EVENT_MASK,
+	.msr_offset	= SNB_UNC_CBO_MSR_OFFSET,
+	.constraints	= snb_uncore_cbox_constraints,
+	.ops		= &snb_uncore_msr_ops,
+	.format_group	= &snb_uncore_format_group,
+	.event_descs	= snb_uncore_events,
+};
+
+static struct intel_uncore_type *snb_msr_uncores[] = {
+	&snb_uncore_cbox,
+	NULL,
+};
+
+void snb_uncore_cpu_init(void)
+{
+	uncore_msr_uncores = snb_msr_uncores;
+	if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+		snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+}
+
+enum {
+	SNB_PCI_UNCORE_IMC,
+};
+
+static struct uncore_event_desc snb_uncore_imc_events[] = {
+	INTEL_UNCORE_EVENT_DESC(data_reads,  "event=0x01"),
+	INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
+	INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
+
+	INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
+	INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
+	INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
+
+	{ /* end: all zeroes */ },
+};
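+/*
+ * The .scale values above convert raw counts to MiB:
+ * 6.103515625e-5 = 64 / 2^20, i.e. each raw count is one 64-byte
+ * cache line and the scaled value is in units of 2^20 bytes.
+ */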
+
+#define SNB_UNCORE_PCI_IMC_EVENT_MASK		0xff
+#define SNB_UNCORE_PCI_IMC_BAR_OFFSET		0x48
+
+/* page size multiple covering all config regs */
+#define SNB_UNCORE_PCI_IMC_MAP_SIZE		0x6000
+
+#define SNB_UNCORE_PCI_IMC_DATA_READS		0x1
+#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE	0x5050
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES		0x2
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE	0x5054
+#define SNB_UNCORE_PCI_IMC_CTR_BASE		SNB_UNCORE_PCI_IMC_DATA_READS_BASE
+
+static struct attribute *snb_uncore_imc_formats_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group snb_uncore_imc_format_group = {
+	.name = "format",
+	.attrs = snb_uncore_imc_formats_attr,
+};
+
+static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
+	resource_size_t addr;
+	u32 pci_dword;
+
+	pci_read_config_dword(pdev, where, &pci_dword);
+	addr = pci_dword;
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+	pci_read_config_dword(pdev, where + 4, &pci_dword);
+	addr |= ((resource_size_t)pci_dword << 32);
+#endif
+
+	addr &= ~(PAGE_SIZE - 1);
+
+	box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+	box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
+}
+
+static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
+}
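+/*
+ * A counter read (above) is a single 32-bit load at the event's fixed
+ * MMIO offset: the IMC counters are free running and never stop.  The
+ * hrtimer whose period is set in snb_uncore_imc_init_box() updates
+ * active events often enough to stay well within one 32-bit
+ * wraparound period at realistic memory bandwidths.
+ */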
+
+/*
+ * Custom event_init() function: we define our own fixed, free-running
+ * counters, so we do not want to conflict with the generic uncore
+ * logic.  This also simplifies processing.
+ */
+static int snb_uncore_imc_event_init(struct perf_event *event)
+{
+	struct intel_uncore_pmu *pmu;
+	struct intel_uncore_box *box;
+	struct hw_perf_event *hwc = &event->hw;
+	u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
+	int idx, base;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	pmu = uncore_event_to_pmu(event);
+	/* no device found for this pmu */
+	if (pmu->func_id < 0)
+		return -ENOENT;
+
+	/* Sampling not supported yet */
+	if (hwc->sample_period)
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest  ||
+	    event->attr.sample_period) /* no sampling */
+		return -EINVAL;
+
+	/*
+	 * Place all uncore events for a particular physical package
+	 * onto a single cpu
+	 */
+	if (event->cpu < 0)
+		return -EINVAL;
+
+	/* check only supported bits are set */
+	if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
+		return -EINVAL;
+
+	box = uncore_pmu_to_box(pmu, event->cpu);
+	if (!box || box->cpu < 0)
+		return -EINVAL;
+
+	event->cpu = box->cpu;
+
+	event->hw.idx = -1;
+	event->hw.last_tag = ~0ULL;
+	event->hw.extra_reg.idx = EXTRA_REG_NONE;
+	event->hw.branch_reg.idx = EXTRA_REG_NONE;
+	/*
+	 * check that the event is known (whitelist); the event also
+	 * determines which fixed counter is used
+	 */
+	switch (cfg) {
+	case SNB_UNCORE_PCI_IMC_DATA_READS:
+		base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
+		idx = UNCORE_PMC_IDX_FIXED;
+		break;
+	case SNB_UNCORE_PCI_IMC_DATA_WRITES:
+		base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
+		idx = UNCORE_PMC_IDX_FIXED + 1;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* must be done before validate_group */
+	event->hw.event_base = base;
+	event->hw.config = cfg;
+	event->hw.idx = idx;
+
+	/* no group validation needed, we have free running counters */
+
+	return 0;
+}
+
+static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	return 0;
+}
+
+static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+	u64 count;
+
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+		return;
+
+	event->hw.state = 0;
+	box->n_active++;
+
+	list_add_tail(&event->active_entry, &box->active_list);
+
+	count = snb_uncore_imc_read_counter(box, event);
+	local64_set(&event->hw.prev_count, count);
+
+	if (box->n_active == 1)
+		uncore_pmu_start_hrtimer(box);
+}
+
+static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (!(hwc->state & PERF_HES_STOPPED)) {
+		box->n_active--;
+
+		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+		hwc->state |= PERF_HES_STOPPED;
+
+		list_del(&event->active_entry);
+
+		if (box->n_active == 0)
+			uncore_pmu_cancel_hrtimer(box);
+	}
+
+	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+		/*
+		 * Drain the remaining delta count out of an event
+		 * that we are disabling:
+		 */
+		uncore_perf_event_update(box, event);
+		hwc->state |= PERF_HES_UPTODATE;
+	}
+}
+
+static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (!box)
+		return -ENODEV;
+
+	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+	if (!(flags & PERF_EF_START))
+		hwc->state |= PERF_HES_ARCH;
+
+	snb_uncore_imc_event_start(event, 0);
+
+	box->n_events++;
+
+	return 0;
+}
+
+static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+	int i;
+
+	snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
+
+	for (i = 0; i < box->n_events; i++) {
+		if (event == box->event_list[i]) {
+			--box->n_events;
+			break;
+		}
+	}
+}
+
+static int snb_pci2phy_map_init(int devid)
+{
+	struct pci_dev *dev = NULL;
+	int bus;
+
+	dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
+	if (!dev)
+		return -ENOTTY;
+
+	bus = dev->bus->number;
+
+	uncore_pcibus_to_physid[bus] = 0;
+
+	pci_dev_put(dev);
+
+	return 0;
+}
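+/*
+ * Client parts have a single socket, which is why the bus of the IMC
+ * device is always mapped to physical package id 0 above.
+ */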
+
+static struct pmu snb_uncore_imc_pmu = {
+	.task_ctx_nr	= perf_invalid_context,
+	.event_init	= snb_uncore_imc_event_init,
+	.add		= snb_uncore_imc_event_add,
+	.del		= snb_uncore_imc_event_del,
+	.start		= snb_uncore_imc_event_start,
+	.stop		= snb_uncore_imc_event_stop,
+	.read		= uncore_pmu_event_read,
+};
+
+static struct intel_uncore_ops snb_uncore_imc_ops = {
+	.init_box	= snb_uncore_imc_init_box,
+	.enable_box	= snb_uncore_imc_enable_box,
+	.disable_box	= snb_uncore_imc_disable_box,
+	.disable_event	= snb_uncore_imc_disable_event,
+	.enable_event	= snb_uncore_imc_enable_event,
+	.hw_config	= snb_uncore_imc_hw_config,
+	.read_counter	= snb_uncore_imc_read_counter,
+};
+
+static struct intel_uncore_type snb_uncore_imc = {
+	.name		= "imc",
+	.num_counters   = 2,
+	.num_boxes	= 1,
+	.fixed_ctr_bits	= 32,
+	.fixed_ctr	= SNB_UNCORE_PCI_IMC_CTR_BASE,
+	.event_descs	= snb_uncore_imc_events,
+	.format_group	= &snb_uncore_imc_format_group,
+	.perf_ctr	= SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
+	.event_mask	= SNB_UNCORE_PCI_IMC_EVENT_MASK,
+	.ops		= &snb_uncore_imc_ops,
+	.pmu		= &snb_uncore_imc_pmu,
+};
+
+static struct intel_uncore_type *snb_pci_uncores[] = {
+	[SNB_PCI_UNCORE_IMC]	= &snb_uncore_imc,
+	NULL,
+};
+
+static const struct pci_device_id snb_uncore_pci_ids[] = {
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* end: all zeroes */ },
+};
+
+static const struct pci_device_id ivb_uncore_pci_ids[] = {
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* end: all zeroes */ },
+};
+
+static const struct pci_device_id hsw_uncore_pci_ids[] = {
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* end: all zeroes */ },
+};
+
+static struct pci_driver snb_uncore_pci_driver = {
+	.name		= "snb_uncore",
+	.id_table	= snb_uncore_pci_ids,
+};
+
+static struct pci_driver ivb_uncore_pci_driver = {
+	.name		= "ivb_uncore",
+	.id_table	= ivb_uncore_pci_ids,
+};
+
+static struct pci_driver hsw_uncore_pci_driver = {
+	.name		= "hsw_uncore",
+	.id_table	= hsw_uncore_pci_ids,
+};
+
+int snb_uncore_pci_init(void)
+{
+	int ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC);
+	if (ret)
+		return ret;
+	uncore_pci_uncores = snb_pci_uncores;
+	uncore_pci_driver = &snb_uncore_pci_driver;
+	return 0;
+}
+
+int ivb_uncore_pci_init(void)
+{
+	int ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC);
+	if (ret)
+		return ret;
+	uncore_pci_uncores = snb_pci_uncores;
+	uncore_pci_driver = &ivb_uncore_pci_driver;
+	return 0;
+}
+
+int hsw_uncore_pci_init(void)
+{
+	int ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC);
+	if (ret)
+		return ret;
+	uncore_pci_uncores = snb_pci_uncores;
+	uncore_pci_driver = &hsw_uncore_pci_driver;
+	return 0;
+}
+
+/* end of Sandy Bridge uncore support */
+
+/* Nehalem uncore support */
+static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+	wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+	wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
+}
+
+static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+		wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+	else
+		wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
+}
+
+static struct attribute *nhm_uncore_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_cmask8.attr,
+	NULL,
+};
+
+static struct attribute_group nhm_uncore_format_group = {
+	.name = "format",
+	.attrs = nhm_uncore_formats_attr,
+};
+
+static struct uncore_event_desc nhm_uncore_events[] = {
+	INTEL_UNCORE_EVENT_DESC(clockticks,                "event=0xff,umask=0x00"),
+	INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any,       "event=0x2f,umask=0x0f"),
+	INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any,      "event=0x2c,umask=0x0f"),
+	INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads,     "event=0x20,umask=0x01"),
+	INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes,    "event=0x20,umask=0x02"),
+	INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads,  "event=0x20,umask=0x04"),
+	INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
+	INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads,   "event=0x20,umask=0x10"),
+	INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes,  "event=0x20,umask=0x20"),
+	{ /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhm_uncore_msr_ops = {
+	.disable_box	= nhm_uncore_msr_disable_box,
+	.enable_box	= nhm_uncore_msr_enable_box,
+	.disable_event	= snb_uncore_msr_disable_event,
+	.enable_event	= nhm_uncore_msr_enable_event,
+	.read_counter	= uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type nhm_uncore = {
+	.name		= "",
+	.num_counters   = 8,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 48,
+	.fixed_ctr_bits	= 48,
+	.event_ctl	= NHM_UNC_PERFEVTSEL0,
+	.perf_ctr	= NHM_UNC_UNCORE_PMC0,
+	.fixed_ctr	= NHM_UNC_FIXED_CTR,
+	.fixed_ctl	= NHM_UNC_FIXED_CTR_CTRL,
+	.event_mask	= NHM_UNC_RAW_EVENT_MASK,
+	.event_descs	= nhm_uncore_events,
+	.ops		= &nhm_uncore_msr_ops,
+	.format_group	= &nhm_uncore_format_group,
+};
+
+static struct intel_uncore_type *nhm_msr_uncores[] = {
+	&nhm_uncore,
+	NULL,
+};
+
+void nhm_uncore_cpu_init(void)
+{
+	uncore_msr_uncores = nhm_msr_uncores;
+}
+
+/* end of Nehalem uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
new file mode 100644
index 0000000..6606ed0
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -0,0 +1,1645 @@
+/* SandyBridge-EP/IvyTown uncore support */
+#include "perf_event_intel_uncore.h"
+
+/* SNB-EP Box level control */
+#define SNBEP_PMON_BOX_CTL_RST_CTRL	(1 << 0)
+#define SNBEP_PMON_BOX_CTL_RST_CTRS	(1 << 1)
+#define SNBEP_PMON_BOX_CTL_FRZ		(1 << 8)
+#define SNBEP_PMON_BOX_CTL_FRZ_EN	(1 << 16)
+#define SNBEP_PMON_BOX_CTL_INT		(SNBEP_PMON_BOX_CTL_RST_CTRL | \
+					 SNBEP_PMON_BOX_CTL_RST_CTRS | \
+					 SNBEP_PMON_BOX_CTL_FRZ_EN)
+/* SNB-EP event control */
+#define SNBEP_PMON_CTL_EV_SEL_MASK	0x000000ff
+#define SNBEP_PMON_CTL_UMASK_MASK	0x0000ff00
+#define SNBEP_PMON_CTL_RST		(1 << 17)
+#define SNBEP_PMON_CTL_EDGE_DET		(1 << 18)
+#define SNBEP_PMON_CTL_EV_SEL_EXT	(1 << 21)
+#define SNBEP_PMON_CTL_EN		(1 << 22)
+#define SNBEP_PMON_CTL_INVERT		(1 << 23)
+#define SNBEP_PMON_CTL_TRESH_MASK	0xff000000
+#define SNBEP_PMON_RAW_EVENT_MASK	(SNBEP_PMON_CTL_EV_SEL_MASK | \
+					 SNBEP_PMON_CTL_UMASK_MASK | \
+					 SNBEP_PMON_CTL_EDGE_DET | \
+					 SNBEP_PMON_CTL_INVERT | \
+					 SNBEP_PMON_CTL_TRESH_MASK)
+
+/* SNB-EP Ubox event control */
+#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK		0x1f000000
+#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK		\
+				(SNBEP_PMON_CTL_EV_SEL_MASK | \
+				 SNBEP_PMON_CTL_UMASK_MASK | \
+				 SNBEP_PMON_CTL_EDGE_DET | \
+				 SNBEP_PMON_CTL_INVERT | \
+				 SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+
+#define SNBEP_CBO_PMON_CTL_TID_EN		(1 << 19)
+#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK	(SNBEP_PMON_RAW_EVENT_MASK | \
+						 SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* SNB-EP PCU event control */
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK	0x0000c000
+#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK	0x1f000000
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT	(1 << 30)
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET	(1 << 31)
+#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK	\
+				(SNBEP_PMON_CTL_EV_SEL_MASK | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+				 SNBEP_PMON_CTL_EDGE_DET | \
+				 SNBEP_PMON_CTL_EV_SEL_EXT | \
+				 SNBEP_PMON_CTL_INVERT | \
+				 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK	\
+				(SNBEP_PMON_RAW_EVENT_MASK | \
+				 SNBEP_PMON_CTL_EV_SEL_EXT)
+
+/* SNB-EP pci control register */
+#define SNBEP_PCI_PMON_BOX_CTL			0xf4
+#define SNBEP_PCI_PMON_CTL0			0xd8
+/* SNB-EP pci counter register */
+#define SNBEP_PCI_PMON_CTR0			0xa0
+
+/* SNB-EP home agent register */
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0	0x40
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1	0x44
+#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH	0x48
+/* SNB-EP memory controller register */
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL		0xf0
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR		0xd0
+/* SNB-EP QPI register */
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0		0x228
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1		0x22c
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0		0x238
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1		0x23c
+
+/* SNB-EP Ubox register */
+#define SNBEP_U_MSR_PMON_CTR0			0xc16
+#define SNBEP_U_MSR_PMON_CTL0			0xc10
+
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL		0xc08
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR		0xc09
+
+/* SNB-EP Cbo register */
+#define SNBEP_C0_MSR_PMON_CTR0			0xd16
+#define SNBEP_C0_MSR_PMON_CTL0			0xd10
+#define SNBEP_C0_MSR_PMON_BOX_CTL		0xd04
+#define SNBEP_C0_MSR_PMON_BOX_FILTER		0xd14
+#define SNBEP_CBO_MSR_OFFSET			0x20
+
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID	0x1f
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID	0x3fc00
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE	0x7c0000
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC	0xff800000
+
+#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) {	\
+	.event = (e),				\
+	.msr = SNBEP_C0_MSR_PMON_BOX_FILTER,	\
+	.config_mask = (m),			\
+	.idx = (i)				\
+}
+
+/* SNB-EP PCU register */
+#define SNBEP_PCU_MSR_PMON_CTR0			0xc36
+#define SNBEP_PCU_MSR_PMON_CTL0			0xc30
+#define SNBEP_PCU_MSR_PMON_BOX_CTL		0xc24
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER		0xc34
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK	0xffffffff
+#define SNBEP_PCU_MSR_CORE_C3_CTR		0x3fc
+#define SNBEP_PCU_MSR_CORE_C6_CTR		0x3fd
+
+/* IVBEP event control */
+#define IVBEP_PMON_BOX_CTL_INT		(SNBEP_PMON_BOX_CTL_RST_CTRL | \
+					 SNBEP_PMON_BOX_CTL_RST_CTRS)
+#define IVBEP_PMON_RAW_EVENT_MASK		(SNBEP_PMON_CTL_EV_SEL_MASK | \
+					 SNBEP_PMON_CTL_UMASK_MASK | \
+					 SNBEP_PMON_CTL_EDGE_DET | \
+					 SNBEP_PMON_CTL_TRESH_MASK)
+/* IVBEP Ubox */
+#define IVBEP_U_MSR_PMON_GLOBAL_CTL		0xc00
+#define IVBEP_U_PMON_GLOBAL_FRZ_ALL		(1 << 31)
+#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL		(1 << 29)
+
+#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK	\
+				(SNBEP_PMON_CTL_EV_SEL_MASK | \
+				 SNBEP_PMON_CTL_UMASK_MASK | \
+				 SNBEP_PMON_CTL_EDGE_DET | \
+				 SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+/* IVBEP Cbo */
+#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK		(IVBEP_PMON_RAW_EVENT_MASK | \
+						 SNBEP_CBO_PMON_CTL_TID_EN)
+
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID		(0x1fULL << 0)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK	(0xfULL << 5)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE	(0x3fULL << 17)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID		(0xffffULL << 32)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC		(0x1ffULL << 52)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6		(0x1ULL << 61)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC		(0x1ULL << 62)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_IOSC	(0x1ULL << 63)
+
+/* IVBEP home agent */
+#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST		(1 << 16)
+#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK		\
+				(IVBEP_PMON_RAW_EVENT_MASK | \
+				 IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST)
+/* IVBEP PCU */
+#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK	\
+				(SNBEP_PMON_CTL_EV_SEL_MASK | \
+				 SNBEP_PMON_CTL_EV_SEL_EXT | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+				 SNBEP_PMON_CTL_EDGE_DET | \
+				 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+				 SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+/* IVBEP QPI */
+#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK	\
+				(IVBEP_PMON_RAW_EVENT_MASK | \
+				 SNBEP_PMON_CTL_EV_SEL_EXT)
+
+#define __BITS_VALUE(x, i, n)  ((typeof(x))(((x) >> ((i) * (n))) & \
+				((1ULL << (n)) - 1)))
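+/*
+ * For illustration: __BITS_VALUE(x, i, n) extracts the i-th n-bit field
+ * of x, e.g. __BITS_VALUE(0x123456, 1, 8) == 0x34.  It is used below to
+ * unpack per-field reference counts that are packed into one atomic_t.
+ */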
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
+DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
+DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
+DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
+
+static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	int box_ctl = uncore_pci_box_ctl(box);
+	u32 config = 0;
+
+	if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+		config |= SNBEP_PMON_BOX_CTL_FRZ;
+		pci_write_config_dword(pdev, box_ctl, config);
+	}
+}
+
+static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	int box_ctl = uncore_pci_box_ctl(box);
+	u32 config = 0;
+
+	if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+		config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+		pci_write_config_dword(pdev, box_ctl, config);
+	}
+}
+
+static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+
+	pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+
+	pci_write_config_dword(pdev, hwc->config_base, hwc->config);
+}
+
+static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+	u64 count = 0;
+
+	pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
+	pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
+
+	return count;
+}
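+/*
+ * (A counter read, above, is two 32-bit config space accesses: low
+ * dword first, then high.)
+ */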
+
+static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+	struct pci_dev *pdev = box->pci_dev;
+
+	pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+	u64 config;
+	unsigned msr;
+
+	msr = uncore_msr_box_ctl(box);
+	if (msr) {
+		rdmsrl(msr, config);
+		config |= SNBEP_PMON_BOX_CTL_FRZ;
+		wrmsrl(msr, config);
+	}
+}
+
+static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+	u64 config;
+	unsigned msr;
+
+	msr = uncore_msr_box_ctl(box);
+	if (msr) {
+		rdmsrl(msr, config);
+		config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+		wrmsrl(msr, config);
+	}
+}
+
+static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+	if (reg1->idx != EXTRA_REG_NONE)
+		wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
+
+	wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
+					struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	wrmsrl(hwc->config_base, hwc->config);
+}
+
+static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+	unsigned msr = uncore_msr_box_ctl(box);
+
+	if (msr)
+		wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static struct attribute *snbep_uncore_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	NULL,
+};
+
+static struct attribute *snbep_uncore_ubox_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh5.attr,
+	NULL,
+};
+
+static struct attribute *snbep_uncore_cbox_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_tid_en.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	&format_attr_filter_tid.attr,
+	&format_attr_filter_nid.attr,
+	&format_attr_filter_state.attr,
+	&format_attr_filter_opc.attr,
+	NULL,
+};
+
+static struct attribute *snbep_uncore_pcu_formats_attr[] = {
+	&format_attr_event_ext.attr,
+	&format_attr_occ_sel.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh5.attr,
+	&format_attr_occ_invert.attr,
+	&format_attr_occ_edge.attr,
+	&format_attr_filter_band0.attr,
+	&format_attr_filter_band1.attr,
+	&format_attr_filter_band2.attr,
+	&format_attr_filter_band3.attr,
+	NULL,
+};
+
+static struct attribute *snbep_uncore_qpi_formats_attr[] = {
+	&format_attr_event_ext.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	&format_attr_match_rds.attr,
+	&format_attr_match_rnid30.attr,
+	&format_attr_match_rnid4.attr,
+	&format_attr_match_dnid.attr,
+	&format_attr_match_mc.attr,
+	&format_attr_match_opc.attr,
+	&format_attr_match_vnw.attr,
+	&format_attr_match0.attr,
+	&format_attr_match1.attr,
+	&format_attr_mask_rds.attr,
+	&format_attr_mask_rnid30.attr,
+	&format_attr_mask_rnid4.attr,
+	&format_attr_mask_dnid.attr,
+	&format_attr_mask_mc.attr,
+	&format_attr_mask_opc.attr,
+	&format_attr_mask_vnw.attr,
+	&format_attr_mask0.attr,
+	&format_attr_mask1.attr,
+	NULL,
+};
+
+static struct uncore_event_desc snbep_uncore_imc_events[] = {
+	INTEL_UNCORE_EVENT_DESC(clockticks,      "event=0xff,umask=0x00"),
+	INTEL_UNCORE_EVENT_DESC(cas_count_read,  "event=0x04,umask=0x03"),
+	INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+	{ /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc snbep_uncore_qpi_events[] = {
+	INTEL_UNCORE_EVENT_DESC(clockticks,       "event=0x14"),
+	INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
+	INTEL_UNCORE_EVENT_DESC(drs_data,         "event=0x102,umask=0x08"),
+	INTEL_UNCORE_EVENT_DESC(ncb_data,         "event=0x103,umask=0x04"),
+	{ /* end: all zeroes */ },
+};
+
+static struct attribute_group snbep_uncore_format_group = {
+	.name = "format",
+	.attrs = snbep_uncore_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_ubox_format_group = {
+	.name = "format",
+	.attrs = snbep_uncore_ubox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_cbox_format_group = {
+	.name = "format",
+	.attrs = snbep_uncore_cbox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_pcu_format_group = {
+	.name = "format",
+	.attrs = snbep_uncore_pcu_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_qpi_format_group = {
+	.name = "format",
+	.attrs = snbep_uncore_qpi_formats_attr,
+};
+
+#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT()			\
+	.init_box	= snbep_uncore_msr_init_box,		\
+	.disable_box	= snbep_uncore_msr_disable_box,		\
+	.enable_box	= snbep_uncore_msr_enable_box,		\
+	.disable_event	= snbep_uncore_msr_disable_event,	\
+	.enable_event	= snbep_uncore_msr_enable_event,	\
+	.read_counter	= uncore_msr_read_counter
+
+static struct intel_uncore_ops snbep_uncore_msr_ops = {
+	SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT()			\
+	.init_box	= snbep_uncore_pci_init_box,		\
+	.disable_box	= snbep_uncore_pci_disable_box,		\
+	.enable_box	= snbep_uncore_pci_enable_box,		\
+	.disable_event	= snbep_uncore_pci_disable_event,	\
+	.read_counter	= snbep_uncore_pci_read_counter
+
+static struct intel_uncore_ops snbep_uncore_pci_ops = {
+	SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+	.enable_event	= snbep_uncore_pci_enable_event,
+};
+
+static struct event_constraint snbep_uncore_cbox_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
+	UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
+	UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
+	UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
+	EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
+	UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+	EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snbep_uncore_ubox = {
+	.name		= "ubox",
+	.num_counters   = 2,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 44,
+	.fixed_ctr_bits	= 48,
+	.perf_ctr	= SNBEP_U_MSR_PMON_CTR0,
+	.event_ctl	= SNBEP_U_MSR_PMON_CTL0,
+	.event_mask	= SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+	.fixed_ctr	= SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+	.fixed_ctl	= SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+	.ops		= &snbep_uncore_msr_ops,
+	.format_group	= &snbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
+	SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+				  SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
+	EVENT_EXTRA_END
+};
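+/*
+ * In the table above, .idx is a bitmask of the filter fields an event
+ * needs, decoded by snbep_cbox_filter_mask() below: 0x1 = TID,
+ * 0x2 = NID, 0x4 = STATE, 0x8 = OPC.  E.g. {0x0334, 0xffff, 0x4} says
+ * that an event with ev_sel 0x34 and umask 0x03 needs the STATE field,
+ * while .idx = 0x6 requests both NID and STATE.
+ */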
+
+static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+	int i;
+
+	if (uncore_box_is_fake(box))
+		return;
+
+	for (i = 0; i < 5; i++) {
+		if (reg1->alloc & (0x1 << i))
+			atomic_sub(1 << (i * 6), &er->ref);
+	}
+	reg1->alloc = 0;
+}
+
+static struct event_constraint *
+__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
+			    u64 (*cbox_filter_mask)(int fields))
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+	int i, alloc = 0;
+	unsigned long flags;
+	u64 mask;
+
+	if (reg1->idx == EXTRA_REG_NONE)
+		return NULL;
+
+	raw_spin_lock_irqsave(&er->lock, flags);
+	for (i = 0; i < 5; i++) {
+		if (!(reg1->idx & (0x1 << i)))
+			continue;
+		if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+			continue;
+
+		mask = cbox_filter_mask(0x1 << i);
+		if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
+		    !((reg1->config ^ er->config) & mask)) {
+			atomic_add(1 << (i * 6), &er->ref);
+			er->config &= ~mask;
+			er->config |= reg1->config & mask;
+			alloc |= (0x1 << i);
+		} else {
+			break;
+		}
+	}
+	raw_spin_unlock_irqrestore(&er->lock, flags);
+	if (i < 5)
+		goto fail;
+
+	if (!uncore_box_is_fake(box))
+		reg1->alloc |= alloc;
+
+	return NULL;
+fail:
+	for (; i >= 0; i--) {
+		if (alloc & (0x1 << i))
+			atomic_sub(1 << (i * 6), &er->ref);
+	}
+	return &uncore_constraint_empty;
+}
+
+static u64 snbep_cbox_filter_mask(int fields)
+{
+	u64 mask = 0;
+
+	if (fields & 0x1)
+		mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+	if (fields & 0x2)
+		mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+	if (fields & 0x4)
+		mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+	if (fields & 0x8)
+		mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+
+	return mask;
+}
+
+static struct event_constraint *
+snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
+}
+
+static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct extra_reg *er;
+	int idx = 0;
+
+	for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
+		if (er->event != (event->hw.config & er->config_mask))
+			continue;
+		idx |= er->idx;
+	}
+
+	if (idx) {
+		reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+			SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+		reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
+		reg1->idx = idx;
+	}
+	return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_cbox_ops = {
+	SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+	.hw_config		= snbep_cbox_hw_config,
+	.get_constraint		= snbep_cbox_get_constraint,
+	.put_constraint		= snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_cbox = {
+	.name			= "cbox",
+	.num_counters		= 4,
+	.num_boxes		= 8,
+	.perf_ctr_bits		= 44,
+	.event_ctl		= SNBEP_C0_MSR_PMON_CTL0,
+	.perf_ctr		= SNBEP_C0_MSR_PMON_CTR0,
+	.event_mask		= SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_C0_MSR_PMON_BOX_CTL,
+	.msr_offset		= SNBEP_CBO_MSR_OFFSET,
+	.num_shared_regs	= 1,
+	.constraints		= snbep_uncore_cbox_constraints,
+	.ops			= &snbep_uncore_cbox_ops,
+	.format_group		= &snbep_uncore_cbox_format_group,
+};
+
+static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	u64 config = reg1->config;
+
+	if (new_idx > reg1->idx)
+		config <<= 8 * (new_idx - reg1->idx);
+	else
+		config >>= 8 * (reg1->idx - new_idx);
+
+	if (modify) {
+		hwc->config += new_idx - reg1->idx;
+		reg1->config = config;
+		reg1->idx = new_idx;
+	}
+	return config;
+}
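+/*
+ * E.g. moving an occupancy filter from band 0 to band 2 (reg1->idx = 0,
+ * new_idx = 2) shifts the 8-bit band threshold left by 16 bits and,
+ * when modify is set, bumps the event select from 0xb to 0xd so that
+ * the event and its filter byte stay in sync.
+ */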
+
+static struct event_constraint *
+snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+	unsigned long flags;
+	int idx = reg1->idx;
+	u64 mask, config1 = reg1->config;
+	bool ok = false;
+
+	if (reg1->idx == EXTRA_REG_NONE ||
+	    (!uncore_box_is_fake(box) && reg1->alloc))
+		return NULL;
+again:
+	mask = 0xffULL << (idx * 8);
+	raw_spin_lock_irqsave(&er->lock, flags);
+	if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
+	    !((config1 ^ er->config) & mask)) {
+		atomic_add(1 << (idx * 8), &er->ref);
+		er->config &= ~mask;
+		er->config |= config1 & mask;
+		ok = true;
+	}
+	raw_spin_unlock_irqrestore(&er->lock, flags);
+
+	if (!ok) {
+		idx = (idx + 1) % 4;
+		if (idx != reg1->idx) {
+			config1 = snbep_pcu_alter_er(event, idx, false);
+			goto again;
+		}
+		return &uncore_constraint_empty;
+	}
+
+	if (!uncore_box_is_fake(box)) {
+		if (idx != reg1->idx)
+			snbep_pcu_alter_er(event, idx, true);
+		reg1->alloc = 1;
+	}
+	return NULL;
+}
+
+static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+
+	if (uncore_box_is_fake(box) || !reg1->alloc)
+		return;
+
+	atomic_sub(1 << (reg1->idx * 8), &er->ref);
+	reg1->alloc = 0;
+}
+
+static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+	if (ev_sel >= 0xb && ev_sel <= 0xe) {
+		reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
+		reg1->idx = ev_sel - 0xb;
+		reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8));
+	}
+	return 0;
+}
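+/*
+ * PCU events 0xb~0xe are the four frequency-band occupancy events;
+ * ev_sel - 0xb selects which 8-bit band threshold of the filter
+ * register (filter_band0~3 in config1) belongs to the event.  Note the
+ * byte-wide mask above, matching the shifts in snbep_pcu_alter_er()
+ * and snbep_pcu_get_constraint().
+ */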
+
+static struct intel_uncore_ops snbep_uncore_pcu_ops = {
+	SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+	.hw_config		= snbep_pcu_hw_config,
+	.get_constraint		= snbep_pcu_get_constraint,
+	.put_constraint		= snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_pcu = {
+	.name			= "pcu",
+	.num_counters		= 4,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.perf_ctr		= SNBEP_PCU_MSR_PMON_CTR0,
+	.event_ctl		= SNBEP_PCU_MSR_PMON_CTL0,
+	.event_mask		= SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_PCU_MSR_PMON_BOX_CTL,
+	.num_shared_regs	= 1,
+	.ops			= &snbep_uncore_pcu_ops,
+	.format_group		= &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *snbep_msr_uncores[] = {
+	&snbep_uncore_ubox,
+	&snbep_uncore_cbox,
+	&snbep_uncore_pcu,
+	NULL,
+};
+
+void snbep_uncore_cpu_init(void)
+{
+	if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+		snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+	uncore_msr_uncores = snbep_msr_uncores;
+}
+
+enum {
+	SNBEP_PCI_QPI_PORT0_FILTER,
+	SNBEP_PCI_QPI_PORT1_FILTER,
+};
+
+static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+	if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
+		reg1->idx = 0;
+		reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
+		reg1->config = event->attr.config1;
+		reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
+		reg2->config = event->attr.config2;
+	}
+	return 0;
+}
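+/*
+ * Event 0x38 is the QPI match/mask event: config1 and config2 carry
+ * the 64-bit packet match and mask words, which snbep_qpi_enable_event()
+ * below writes to the separate filter PCI function.
+ */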
+
+static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+	if (reg1->idx != EXTRA_REG_NONE) {
+		int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
+		struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx];
+		WARN_ON_ONCE(!filter_pdev);
+		if (filter_pdev) {
+			pci_write_config_dword(filter_pdev, reg1->reg,
+						(u32)reg1->config);
+			pci_write_config_dword(filter_pdev, reg1->reg + 4,
+						(u32)(reg1->config >> 32));
+			pci_write_config_dword(filter_pdev, reg2->reg,
+						(u32)reg2->config);
+			pci_write_config_dword(filter_pdev, reg2->reg + 4,
+						(u32)(reg2->config >> 32));
+		}
+	}
+
+	pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops snbep_uncore_qpi_ops = {
+	SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+	.enable_event		= snbep_qpi_enable_event,
+	.hw_config		= snbep_qpi_hw_config,
+	.get_constraint		= uncore_get_constraint,
+	.put_constraint		= uncore_put_constraint,
+};
+
+#define SNBEP_UNCORE_PCI_COMMON_INIT()				\
+	.perf_ctr	= SNBEP_PCI_PMON_CTR0,			\
+	.event_ctl	= SNBEP_PCI_PMON_CTL0,			\
+	.event_mask	= SNBEP_PMON_RAW_EVENT_MASK,		\
+	.box_ctl	= SNBEP_PCI_PMON_BOX_CTL,		\
+	.ops		= &snbep_uncore_pci_ops,		\
+	.format_group	= &snbep_uncore_format_group
+
+static struct intel_uncore_type snbep_uncore_ha = {
+	.name		= "ha",
+	.num_counters   = 4,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 48,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_imc = {
+	.name		= "imc",
+	.num_counters   = 4,
+	.num_boxes	= 4,
+	.perf_ctr_bits	= 48,
+	.fixed_ctr_bits	= 48,
+	.fixed_ctr	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+	.fixed_ctl	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+	.event_descs	= snbep_uncore_imc_events,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_qpi = {
+	.name			= "qpi",
+	.num_counters		= 4,
+	.num_boxes		= 2,
+	.perf_ctr_bits		= 48,
+	.perf_ctr		= SNBEP_PCI_PMON_CTR0,
+	.event_ctl		= SNBEP_PCI_PMON_CTL0,
+	.event_mask		= SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL,
+	.num_shared_regs	= 1,
+	.ops			= &snbep_uncore_qpi_ops,
+	.event_descs		= snbep_uncore_qpi_events,
+	.format_group		= &snbep_uncore_qpi_format_group,
+};
+
+static struct intel_uncore_type snbep_uncore_r2pcie = {
+	.name		= "r2pcie",
+	.num_counters   = 4,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 44,
+	.constraints	= snbep_uncore_r2pcie_constraints,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_r3qpi = {
+	.name		= "r3qpi",
+	.num_counters   = 3,
+	.num_boxes	= 2,
+	.perf_ctr_bits	= 44,
+	.constraints	= snbep_uncore_r3qpi_constraints,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+	SNBEP_PCI_UNCORE_HA,
+	SNBEP_PCI_UNCORE_IMC,
+	SNBEP_PCI_UNCORE_QPI,
+	SNBEP_PCI_UNCORE_R2PCIE,
+	SNBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *snbep_pci_uncores[] = {
+	[SNBEP_PCI_UNCORE_HA]		= &snbep_uncore_ha,
+	[SNBEP_PCI_UNCORE_IMC]		= &snbep_uncore_imc,
+	[SNBEP_PCI_UNCORE_QPI]		= &snbep_uncore_qpi,
+	[SNBEP_PCI_UNCORE_R2PCIE]	= &snbep_uncore_r2pcie,
+	[SNBEP_PCI_UNCORE_R3QPI]	= &snbep_uncore_r3qpi,
+	NULL,
+};
+
+static const struct pci_device_id snbep_uncore_pci_ids[] = {
+	{ /* Home Agent */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
+	},
+	{ /* MC Channel 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
+	},
+	{ /* MC Channel 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
+	},
+	{ /* MC Channel 2 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
+	},
+	{ /* MC Channel 3 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
+	},
+	{ /* QPI Port 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
+	},
+	{ /* QPI Port 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
+	},
+	{ /* R2PCIe */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
+	},
+	{ /* R3QPI Link 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
+	},
+	{ /* R3QPI Link 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
+	},
+	{ /* QPI Port 0 filter  */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+						   SNBEP_PCI_QPI_PORT0_FILTER),
+	},
+	{ /* QPI Port 1 filter  */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+						   SNBEP_PCI_QPI_PORT1_FILTER),
+	},
+	{ /* end: all zeroes */ }
+};
+
+static struct pci_driver snbep_uncore_pci_driver = {
+	.name		= "snbep_uncore",
+	.id_table	= snbep_uncore_pci_ids,
+};
+
+/*
+ * build pci bus to socket mapping
+ */
+static int snbep_pci2phy_map_init(int devid)
+{
+	struct pci_dev *ubox_dev = NULL;
+	int i, bus, nodeid;
+	int err = 0;
+	u32 config = 0;
+
+	while (1) {
+		/* find the UBOX device */
+		ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
+		if (!ubox_dev)
+			break;
+		bus = ubox_dev->bus->number;
+		/* get the Node ID of the local register */
+		err = pci_read_config_dword(ubox_dev, 0x40, &config);
+		if (err)
+			break;
+		nodeid = config;
+		/* get the Node ID mapping */
+		err = pci_read_config_dword(ubox_dev, 0x54, &config);
+		if (err)
+			break;
+		/*
+		 * every three bits in the Node ID mapping register maps
+		 * to a particular node.
+		 */
+		for (i = 0; i < 8; i++) {
+			if (nodeid == ((config >> (3 * i)) & 0x7)) {
+				uncore_pcibus_to_physid[bus] = i;
+				break;
+			}
+		}
+	}
+
+	if (!err) {
+		/*
+		 * For PCI buses with no UBOX device, find the next bus
+		 * that has a UBOX device and use its mapping.
+		 */
+		i = -1;
+		for (bus = 255; bus >= 0; bus--) {
+			if (uncore_pcibus_to_physid[bus] >= 0)
+				i = uncore_pcibus_to_physid[bus];
+			else
+				uncore_pcibus_to_physid[bus] = i;
+		}
+	}
+
+	if (ubox_dev)
+		pci_dev_put(ubox_dev);
+
+	return err ? pcibios_err_to_errno(err) : 0;
+}
+
+int snbep_uncore_pci_init(void)
+{
+	int ret = snbep_pci2phy_map_init(0x3ce0);
+	if (ret)
+		return ret;
+	uncore_pci_uncores = snbep_pci_uncores;
+	uncore_pci_driver = &snbep_uncore_pci_driver;
+	return 0;
+}
+/* end of Sandy Bridge-EP uncore support */
+
+/* IvyTown uncore support */
+static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+	unsigned msr = uncore_msr_box_ctl(box);
+	if (msr)
+		wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+	struct pci_dev *pdev = box->pci_dev;
+
+	pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT()			\
+	.init_box	= ivbep_uncore_msr_init_box,		\
+	.disable_box	= snbep_uncore_msr_disable_box,		\
+	.enable_box	= snbep_uncore_msr_enable_box,		\
+	.disable_event	= snbep_uncore_msr_disable_event,	\
+	.enable_event	= snbep_uncore_msr_enable_event,	\
+	.read_counter	= uncore_msr_read_counter
+
+static struct intel_uncore_ops ivbep_uncore_msr_ops = {
+	IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+static struct intel_uncore_ops ivbep_uncore_pci_ops = {
+	.init_box	= ivbep_uncore_pci_init_box,
+	.disable_box	= snbep_uncore_pci_disable_box,
+	.enable_box	= snbep_uncore_pci_enable_box,
+	.disable_event	= snbep_uncore_pci_disable_event,
+	.enable_event	= snbep_uncore_pci_enable_event,
+	.read_counter	= snbep_uncore_pci_read_counter,
+};
+
+#define IVBEP_UNCORE_PCI_COMMON_INIT()				\
+	.perf_ctr	= SNBEP_PCI_PMON_CTR0,			\
+	.event_ctl	= SNBEP_PCI_PMON_CTL0,			\
+	.event_mask	= IVBEP_PMON_RAW_EVENT_MASK,		\
+	.box_ctl	= SNBEP_PCI_PMON_BOX_CTL,		\
+	.ops		= &ivbep_uncore_pci_ops,			\
+	.format_group	= &ivbep_uncore_format_group
+
+static struct attribute *ivbep_uncore_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh8.attr,
+	NULL,
+};
+
+static struct attribute *ivbep_uncore_ubox_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_inv.attr,
+	&format_attr_thresh5.attr,
+	NULL,
+};
+
+static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_tid_en.attr,
+	&format_attr_thresh8.attr,
+	&format_attr_filter_tid.attr,
+	&format_attr_filter_link.attr,
+	&format_attr_filter_state2.attr,
+	&format_attr_filter_nid2.attr,
+	&format_attr_filter_opc2.attr,
+	NULL,
+};
+
+static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
+	&format_attr_event_ext.attr,
+	&format_attr_occ_sel.attr,
+	&format_attr_edge.attr,
+	&format_attr_thresh5.attr,
+	&format_attr_occ_invert.attr,
+	&format_attr_occ_edge.attr,
+	&format_attr_filter_band0.attr,
+	&format_attr_filter_band1.attr,
+	&format_attr_filter_band2.attr,
+	&format_attr_filter_band3.attr,
+	NULL,
+};
+
+static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
+	&format_attr_event_ext.attr,
+	&format_attr_umask.attr,
+	&format_attr_edge.attr,
+	&format_attr_thresh8.attr,
+	&format_attr_match_rds.attr,
+	&format_attr_match_rnid30.attr,
+	&format_attr_match_rnid4.attr,
+	&format_attr_match_dnid.attr,
+	&format_attr_match_mc.attr,
+	&format_attr_match_opc.attr,
+	&format_attr_match_vnw.attr,
+	&format_attr_match0.attr,
+	&format_attr_match1.attr,
+	&format_attr_mask_rds.attr,
+	&format_attr_mask_rnid30.attr,
+	&format_attr_mask_rnid4.attr,
+	&format_attr_mask_dnid.attr,
+	&format_attr_mask_mc.attr,
+	&format_attr_mask_opc.attr,
+	&format_attr_mask_vnw.attr,
+	&format_attr_mask0.attr,
+	&format_attr_mask1.attr,
+	NULL,
+};
+
+static struct attribute_group ivbep_uncore_format_group = {
+	.name = "format",
+	.attrs = ivbep_uncore_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_ubox_format_group = {
+	.name = "format",
+	.attrs = ivbep_uncore_ubox_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_cbox_format_group = {
+	.name = "format",
+	.attrs = ivbep_uncore_cbox_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_pcu_format_group = {
+	.name = "format",
+	.attrs = ivbep_uncore_pcu_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_qpi_format_group = {
+	.name = "format",
+	.attrs = ivbep_uncore_qpi_formats_attr,
+};
+
+static struct intel_uncore_type ivbep_uncore_ubox = {
+	.name		= "ubox",
+	.num_counters   = 2,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 44,
+	.fixed_ctr_bits	= 48,
+	.perf_ctr	= SNBEP_U_MSR_PMON_CTR0,
+	.event_ctl	= SNBEP_U_MSR_PMON_CTL0,
+	.event_mask	= IVBEP_U_MSR_PMON_RAW_EVENT_MASK,
+	.fixed_ctr	= SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+	.fixed_ctl	= SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+	.ops		= &ivbep_uncore_msr_ops,
+	.format_group	= &ivbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg ivbep_uncore_cbox_extra_regs[] = {
+	SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+				  SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+	SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+	EVENT_EXTRA_END
+};
+
+static u64 ivbep_cbox_filter_mask(int fields)
+{
+	u64 mask = 0;
+
+	if (fields & 0x1)
+		mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+	if (fields & 0x2)
+		mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK;
+	if (fields & 0x4)
+		mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+	if (fields & 0x8)
+		mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+	if (fields & 0x10)
+		mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+
+	return mask;
+}
+
+static struct event_constraint *
+ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask);
+}
+
+static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct extra_reg *er;
+	int idx = 0;
+
+	for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) {
+		if (er->event != (event->hw.config & er->config_mask))
+			continue;
+		idx |= er->idx;
+	}
+
+	if (idx) {
+		reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+			SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+		reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx);
+		reg1->idx = idx;
+	}
+	return 0;
+}
+
+static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+	if (reg1->idx != EXTRA_REG_NONE) {
+		u64 filter = uncore_shared_reg_config(box, 0);
+		wrmsrl(reg1->reg, filter & 0xffffffff);
+		wrmsrl(reg1->reg + 6, filter >> 32);
+	}
+
+	wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops ivbep_uncore_cbox_ops = {
+	.init_box		= ivbep_uncore_msr_init_box,
+	.disable_box		= snbep_uncore_msr_disable_box,
+	.enable_box		= snbep_uncore_msr_enable_box,
+	.disable_event		= snbep_uncore_msr_disable_event,
+	.enable_event		= ivbep_cbox_enable_event,
+	.read_counter		= uncore_msr_read_counter,
+	.hw_config		= ivbep_cbox_hw_config,
+	.get_constraint		= ivbep_cbox_get_constraint,
+	.put_constraint		= snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_cbox = {
+	.name			= "cbox",
+	.num_counters		= 4,
+	.num_boxes		= 15,
+	.perf_ctr_bits		= 44,
+	.event_ctl		= SNBEP_C0_MSR_PMON_CTL0,
+	.perf_ctr		= SNBEP_C0_MSR_PMON_CTR0,
+	.event_mask		= IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_C0_MSR_PMON_BOX_CTL,
+	.msr_offset		= SNBEP_CBO_MSR_OFFSET,
+	.num_shared_regs	= 1,
+	.constraints		= snbep_uncore_cbox_constraints,
+	.ops			= &ivbep_uncore_cbox_ops,
+	.format_group		= &ivbep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_pcu_ops = {
+	IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+	.hw_config		= snbep_pcu_hw_config,
+	.get_constraint		= snbep_pcu_get_constraint,
+	.put_constraint		= snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_pcu = {
+	.name			= "pcu",
+	.num_counters		= 4,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.perf_ctr		= SNBEP_PCU_MSR_PMON_CTR0,
+	.event_ctl		= SNBEP_PCU_MSR_PMON_CTL0,
+	.event_mask		= IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_PCU_MSR_PMON_BOX_CTL,
+	.num_shared_regs	= 1,
+	.ops			= &ivbep_uncore_pcu_ops,
+	.format_group		= &ivbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *ivbep_msr_uncores[] = {
+	&ivbep_uncore_ubox,
+	&ivbep_uncore_cbox,
+	&ivbep_uncore_pcu,
+	NULL,
+};
+
+void ivbep_uncore_cpu_init(void)
+{
+	if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+		ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+	uncore_msr_uncores = ivbep_msr_uncores;
+}
+
+static struct intel_uncore_type ivbep_uncore_ha = {
+	.name		= "ha",
+	.num_counters   = 4,
+	.num_boxes	= 2,
+	.perf_ctr_bits	= 48,
+	IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_imc = {
+	.name		= "imc",
+	.num_counters   = 4,
+	.num_boxes	= 8,
+	.perf_ctr_bits	= 48,
+	.fixed_ctr_bits	= 48,
+	.fixed_ctr	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+	.fixed_ctl	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+	.event_descs	= snbep_uncore_imc_events,
+	IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+/* registers in IRP boxes are not properly aligned */
+static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
+static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
+
+static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+
+	pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx],
+			       hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+
+	pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config);
+}
+
+static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+	u64 count = 0;
+
+	pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+	pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+	return count;
+}
+
+static struct intel_uncore_ops ivbep_uncore_irp_ops = {
+	.init_box	= ivbep_uncore_pci_init_box,
+	.disable_box	= snbep_uncore_pci_disable_box,
+	.enable_box	= snbep_uncore_pci_enable_box,
+	.disable_event	= ivbep_uncore_irp_disable_event,
+	.enable_event	= ivbep_uncore_irp_enable_event,
+	.read_counter	= ivbep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type ivbep_uncore_irp = {
+	.name			= "irp",
+	.num_counters		= 4,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.event_mask		= IVBEP_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL,
+	.ops			= &ivbep_uncore_irp_ops,
+	.format_group		= &ivbep_uncore_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_qpi_ops = {
+	.init_box	= ivbep_uncore_pci_init_box,
+	.disable_box	= snbep_uncore_pci_disable_box,
+	.enable_box	= snbep_uncore_pci_enable_box,
+	.disable_event	= snbep_uncore_pci_disable_event,
+	.enable_event	= snbep_qpi_enable_event,
+	.read_counter	= snbep_uncore_pci_read_counter,
+	.hw_config	= snbep_qpi_hw_config,
+	.get_constraint	= uncore_get_constraint,
+	.put_constraint	= uncore_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_qpi = {
+	.name			= "qpi",
+	.num_counters		= 4,
+	.num_boxes		= 3,
+	.perf_ctr_bits		= 48,
+	.perf_ctr		= SNBEP_PCI_PMON_CTR0,
+	.event_ctl		= SNBEP_PCI_PMON_CTL0,
+	.event_mask		= IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL,
+	.num_shared_regs	= 1,
+	.ops			= &ivbep_uncore_qpi_ops,
+	.format_group		= &ivbep_uncore_qpi_format_group,
+};
+
+static struct intel_uncore_type ivbep_uncore_r2pcie = {
+	.name		= "r2pcie",
+	.num_counters   = 4,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 44,
+	.constraints	= snbep_uncore_r2pcie_constraints,
+	IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_r3qpi = {
+	.name		= "r3qpi",
+	.num_counters   = 3,
+	.num_boxes	= 2,
+	.perf_ctr_bits	= 44,
+	.constraints	= snbep_uncore_r3qpi_constraints,
+	IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+	IVBEP_PCI_UNCORE_HA,
+	IVBEP_PCI_UNCORE_IMC,
+	IVBEP_PCI_UNCORE_IRP,
+	IVBEP_PCI_UNCORE_QPI,
+	IVBEP_PCI_UNCORE_R2PCIE,
+	IVBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *ivbep_pci_uncores[] = {
+	[IVBEP_PCI_UNCORE_HA]	= &ivbep_uncore_ha,
+	[IVBEP_PCI_UNCORE_IMC]	= &ivbep_uncore_imc,
+	[IVBEP_PCI_UNCORE_IRP]	= &ivbep_uncore_irp,
+	[IVBEP_PCI_UNCORE_QPI]	= &ivbep_uncore_qpi,
+	[IVBEP_PCI_UNCORE_R2PCIE]	= &ivbep_uncore_r2pcie,
+	[IVBEP_PCI_UNCORE_R3QPI]	= &ivbep_uncore_r3qpi,
+	NULL,
+};
+
+static const struct pci_device_id ivbep_uncore_pci_ids[] = {
+	{ /* Home Agent 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0),
+	},
+	{ /* Home Agent 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1),
+	},
+	{ /* MC0 Channel 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0),
+	},
+	{ /* MC0 Channel 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1),
+	},
+	{ /* MC0 Channel 3 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2),
+	},
+	{ /* MC0 Channel 4 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3),
+	},
+	{ /* MC1 Channel 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4),
+	},
+	{ /* MC1 Channel 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5),
+	},
+	{ /* MC1 Channel 3 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6),
+	},
+	{ /* MC1 Channel 4 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7),
+	},
+	{ /* IRP */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0),
+	},
+	{ /* QPI0 Port 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0),
+	},
+	{ /* QPI0 Port 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1),
+	},
+	{ /* QPI1 Port 2 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2),
+	},
+	{ /* R2PCIe */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0),
+	},
+	{ /* R3QPI0 Link 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0),
+	},
+	{ /* R3QPI0 Link 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1),
+	},
+	{ /* R3QPI1 Link 2 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
+		.driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2),
+	},
+	{ /* QPI Port 0 filter  */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+						   SNBEP_PCI_QPI_PORT0_FILTER),
+	},
+	{ /* QPI Port 1 filter  */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
+		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+						   SNBEP_PCI_QPI_PORT1_FILTER),
+	},
+	{ /* end: all zeroes */ }
+};
+
+static struct pci_driver ivbep_uncore_pci_driver = {
+	.name		= "ivbep_uncore",
+	.id_table	= ivbep_uncore_pci_ids,
+};
+
+int ivbep_uncore_pci_init(void)
+{
+	int ret = snbep_pci2phy_map_init(0x0e1e);
+	if (ret)
+		return ret;
+	uncore_pci_uncores = ivbep_pci_uncores;
+	uncore_pci_driver = &ivbep_uncore_pci_driver;
+	return 0;
+}
+/* end of IvyTown uncore support */
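
The snbep_pci2phy_map_init() hunk above builds the bus-to-socket map from
two UBOX config registers: offset 0x40 holds the local Node ID and offset
0x54 packs one 3-bit Node ID per slot, so the slot whose field matches the
local ID is the physical package id for that bus. A minimal user-space
sketch of the same decode, assuming that register layout (the helper and
the sample values are illustrative, not part of the patch):

	#include <stdio.h>

	/* Return the slot whose 3-bit field in the mapping register
	 * matches the local node id, or -1 if none does. */
	static int decode_phys_id(unsigned int nodeid, unsigned int mapping)
	{
		int i;

		for (i = 0; i < 8; i++) {
			if (((mapping >> (3 * i)) & 0x7) == (nodeid & 0x7))
				return i;
		}
		return -1;
	}

	int main(void)
	{
		/* e.g. node 2 stored in slot 1, node 5 in slot 0 */
		unsigned int mapping = (0x2 << 3) | 0x5;

		printf("phys id = %d\n", decode_phys_id(2, mapping));
		return 0;
	}
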
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2fac134..df088bb 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -404,8 +404,8 @@
 	 * and short:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8,0
-	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
+	SAVE_ARGS 8, 0, rax_enosys=1
+	movq_cfi rax,(ORIG_RAX-ARGOFFSET)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
@@ -417,7 +417,7 @@
 	andl $__SYSCALL_MASK,%eax
 	cmpl $__NR_syscall_max,%eax
 #endif
-	ja badsys
+	ja ret_from_sys_call  /* and return regs->ax */
 	movq %r10,%rcx
 	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
 	movq %rax,RAX-ARGOFFSET(%rsp)
@@ -476,28 +476,8 @@
 	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
 	jmp int_check_syscall_exit_work
 
-badsys:
-	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
-	jmp ret_from_sys_call
-
 #ifdef CONFIG_AUDITSYSCALL
 	/*
-	 * Fast path for syscall audit without full syscall trace.
-	 * We just call __audit_syscall_entry() directly, and then
-	 * jump back to the normal fast path.
-	 */
-auditsys:
-	movq %r10,%r9			/* 6th arg: 4th syscall arg */
-	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
-	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
-	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
-	movq %rax,%rsi			/* 2nd arg: syscall number */
-	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
-	call __audit_syscall_entry
-	LOAD_ARGS 0		/* reload call-clobbered registers */
-	jmp system_call_fastpath
-
-	/*
 	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
 	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
 	 * masked off.
@@ -514,18 +494,25 @@
 
 	/* Do syscall tracing */
 tracesys:
-#ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	jz auditsys
-#endif
+	leaq -REST_SKIP(%rsp), %rdi
+	movq $AUDIT_ARCH_X86_64, %rsi
+	call syscall_trace_enter_phase1
+	test %rax, %rax
+	jnz tracesys_phase2		/* if needed, run the slow path */
+	LOAD_ARGS 0			/* else restore clobbered regs */
+	jmp system_call_fastpath	/*      and return to the fast path */
+
+tracesys_phase2:
 	SAVE_REST
-	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
 	FIXUP_TOP_OF_STACK %rdi
-	movq %rsp,%rdi
-	call syscall_trace_enter
+	movq %rsp, %rdi
+	movq $AUDIT_ARCH_X86_64, %rsi
+	movq %rax,%rdx
+	call syscall_trace_enter_phase2
+
 	/*
 	 * Reload arg registers from stack in case ptrace changed them.
-	 * We don't reload %rax because syscall_trace_enter() returned
+	 * We don't reload %rax because syscall_trace_enter_phase2() returned
 	 * the value it wants us to use in the table lookup.
 	 */
 	LOAD_ARGS ARGOFFSET, 1
@@ -536,7 +523,7 @@
 	andl $__SYSCALL_MASK,%eax
 	cmpl $__NR_syscall_max,%eax
 #endif
-	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
+	ja   int_ret_from_sys_call	/* RAX(%rsp) is already set */
 	movq %r10,%rcx	/* fixup for C */
 	call *sys_call_table(,%rax,8)
 	movq %rax,RAX-ARGOFFSET(%rsp)
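
With SAVE_ARGS 8, 0, rax_enosys=1 the saved regs->ax slot starts out as
-ENOSYS, so the out-of-range syscall number case no longer needs a badsys
label: "ja ret_from_sys_call" just returns the pre-loaded value. A rough,
compilable C model of that control flow (the one-entry table and the names
are illustrative only, not kernel code):

	#include <stdio.h>

	#define ENOSYS 38
	#define NR_syscall_max 0		/* one syscall, number 0 */

	static long sys_demo(void) { return 42; }
	static long (*sys_call_table[])(void) = { sys_demo };

	static long do_syscall(unsigned long nr)
	{
		long ax = -ENOSYS;		/* rax_enosys=1 */

		if (nr <= NR_syscall_max)	/* cmpl/ja range check */
			ax = sys_call_table[nr]();

		return ax;			/* ret_from_sys_call */
	}

	int main(void)
	{
		printf("%ld %ld\n", do_syscall(0), do_syscall(99));
		return 0;
	}
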
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
index 9030e83..0a2faa3 100644
--- a/arch/x86/kernel/iosf_mbi.c
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -22,6 +22,8 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/pci.h>
+#include <linux/debugfs.h>
+#include <linux/capability.h>
 
 #include <asm/iosf_mbi.h>
 
@@ -187,6 +189,75 @@
 }
 EXPORT_SYMBOL(iosf_mbi_available);
 
+/********************** debugfs begin ****************************/
+static u32	dbg_mdr;
+static u32	dbg_mcr;
+static u32	dbg_mcrx;
+
+static int mcr_get(void *data, u64 *val)
+{
+	*val = *(u32 *)data;
+	return 0;
+}
+
+static int mcr_set(void *data, u64 val)
+{
+	u8 command = ((u32)val & 0xFF000000) >> 24,
+	   port	   = ((u32)val & 0x00FF0000) >> 16,
+	   offset  = ((u32)val & 0x0000FF00) >> 8;
+	int err;
+
+	*(u32 *)data = val;
+
+	if (!capable(CAP_SYS_RAWIO))
+		return -EACCES;
+
+	if (command & 1u)
+		err = iosf_mbi_write(port,
+			       command,
+			       dbg_mcrx | offset,
+			       dbg_mdr);
+	else
+		err = iosf_mbi_read(port,
+			      command,
+			      dbg_mcrx | offset,
+			      &dbg_mdr);
+
+	return err;
+}
+DEFINE_SIMPLE_ATTRIBUTE(iosf_mcr_fops, mcr_get, mcr_set, "%llx\n");
+
+static struct dentry *iosf_dbg;
+static void iosf_sideband_debug_init(void)
+{
+	struct dentry *d;
+
+	iosf_dbg = debugfs_create_dir("iosf_sb", NULL);
+	if (IS_ERR_OR_NULL(iosf_dbg))
+		return;
+
+	/* mdr */
+	d = debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr);
+	if (IS_ERR_OR_NULL(d))
+		goto cleanup;
+
+	/* mcrx */
+	debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx);
+	if (IS_ERR_OR_NULL(d))
+		goto cleanup;
+
+	/* mcr - initiates mailbox transaction */
+	d = debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops);
+	if (IS_ERR_OR_NULL(d))
+		goto cleanup;
+
+	return;
+
+cleanup:
+	debugfs_remove_recursive(iosf_dbg);
+}
+/********************** debugfs end ****************************/
+
 static int iosf_mbi_probe(struct pci_dev *pdev,
 			  const struct pci_device_id *unused)
 {
@@ -217,11 +288,14 @@
 
 static int __init iosf_mbi_init(void)
 {
+	iosf_sideband_debug_init();
 	return pci_register_driver(&iosf_mbi_pci_driver);
 }
 
 static void __exit iosf_mbi_exit(void)
 {
+	debugfs_remove_recursive(iosf_dbg);
+
 	pci_unregister_driver(&iosf_mbi_pci_driver);
 	if (mbi_pdev) {
 		pci_dev_put(mbi_pdev);
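
The new mcr debugfs file decodes its value as a command opcode in bits
31:24, a port in bits 23:16 and a register offset in bits 15:8; an odd
opcode triggers iosf_mbi_write() with the current mdr contents, an even one
reads back into mdr. A small helper showing how such a control word could
be composed (sketch only; the opcode and port numbers below are made up):

	#include <stdint.h>
	#include <stdio.h>

	/* Pack an IOSF sideband request the way mcr_set() unpacks it. */
	static uint32_t iosf_mcr(uint8_t command, uint8_t port, uint8_t offset)
	{
		return ((uint32_t)command << 24) |
		       ((uint32_t)port << 16) |
		       ((uint32_t)offset << 8);
	}

	int main(void)
	{
		/* hypothetical read (even opcode 0x10), port 0x04, offset 0x34 */
		printf("0x%08x\n", iosf_mcr(0x10, 0x04, 0x34));
		return 0;
	}
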
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index f304773..f1314d0 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -338,8 +338,10 @@
 	 * a relative jump.
 	 */
 	rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
-	if (abs(rel) > 0x7fffffff)
+	if (abs(rel) > 0x7fffffff) {
+		__arch_remove_optimized_kprobe(op, 0);
 		return -ERANGE;
+	}
 
 	buf = (u8 *)op->optinsn.insn;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f804dc9..e127dda 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -64,14 +64,16 @@
  */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	int ret;
-
 	*dst = *src;
-	if (fpu_allocated(&src->thread.fpu)) {
-		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
-		ret = fpu_alloc(&dst->thread.fpu);
-		if (ret)
-			return ret;
+
+	dst->thread.fpu_counter = 0;
+	dst->thread.fpu.has_fpu = 0;
+	dst->thread.fpu.last_cpu = ~0;
+	dst->thread.fpu.state = NULL;
+	if (tsk_used_math(src)) {
+		int err = fpu_alloc(&dst->thread.fpu);
+		if (err)
+			return err;
 		fpu_copy(dst, src);
 	}
 	return 0;
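
The rewritten arch_dup_task_struct() makes the FPU bookkeeping explicit:
after the struct copy the child must not alias the parent's FPU state
pointer, and fresh state is allocated only when the parent has actually
used math. A toy model of that pointer-hygiene pattern (the struct layout
and sizes are invented for the example):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct fpu { int has_fpu; void *state; };
	struct task { int used_math; struct fpu fpu; };

	/* Never inherit the parent's FPU pointer from the struct copy;
	 * allocate and copy real state only when it exists. */
	static int dup_task(struct task *dst, const struct task *src)
	{
		*dst = *src;		/* wholesale copy, like the kernel */
		dst->fpu.has_fpu = 0;
		dst->fpu.state = NULL;	/* drop the aliased pointer */

		if (src->used_math) {
			dst->fpu.state = malloc(64);
			if (!dst->fpu.state)
				return -1;
			memcpy(dst->fpu.state, src->fpu.state, 64);
		}
		return 0;
	}

	int main(void)
	{
		struct task parent = { 1, { 1, calloc(1, 64) } };
		struct task child;

		printf("%d %d\n", dup_task(&child, &parent),
		       child.fpu.state != parent.fpu.state);	/* 0 1 */
		return 0;
	}
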
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 7bc86bb..8f3ebfe 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -138,6 +138,7 @@
 
 	p->thread.sp = (unsigned long) childregs;
 	p->thread.sp0 = (unsigned long) (childregs+1);
+	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 
 	if (unlikely(p->flags & PF_KTHREAD)) {
 		/* kernel thread */
@@ -152,9 +153,7 @@
 		childregs->orig_ax = -1;
 		childregs->cs = __KERNEL_CS | get_kernel_rpl();
 		childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
-		p->thread.fpu_counter = 0;
 		p->thread.io_bitmap_ptr = NULL;
-		memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 		return 0;
 	}
 	*childregs = *current_pt_regs();
@@ -165,13 +164,10 @@
 	p->thread.ip = (unsigned long) ret_from_fork;
 	task_user_gs(p) = get_user_gs(current_pt_regs());
 
-	p->thread.fpu_counter = 0;
 	p->thread.io_bitmap_ptr = NULL;
 	tsk = current;
 	err = -ENOMEM;
 
-	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
-
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
 						IO_BITMAP_BYTES, GFP_KERNEL);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ca5b02d..3ed4a68 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -163,7 +163,6 @@
 	p->thread.sp = (unsigned long) childregs;
 	p->thread.usersp = me->thread.usersp;
 	set_tsk_thread_flag(p, TIF_FORK);
-	p->thread.fpu_counter = 0;
 	p->thread.io_bitmap_ptr = NULL;
 
 	savesegment(gs, p->thread.gsindex);
@@ -193,8 +192,6 @@
 		childregs->sp = sp;
 
 	err = -ENOMEM;
-	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
-
 	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
 						  IO_BITMAP_BYTES, GFP_KERNEL);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 678c0ad..29576c2 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1441,24 +1441,126 @@
 	force_sig_info(SIGTRAP, &info, tsk);
 }
 
-
-#ifdef CONFIG_X86_32
-# define IS_IA32	1
-#elif defined CONFIG_IA32_EMULATION
-# define IS_IA32	is_compat_task()
-#else
-# define IS_IA32	0
+static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
+{
+#ifdef CONFIG_X86_64
+	if (arch == AUDIT_ARCH_X86_64) {
+		audit_syscall_entry(arch, regs->orig_ax, regs->di,
+				    regs->si, regs->dx, regs->r10);
+	} else
 #endif
+	{
+		audit_syscall_entry(arch, regs->orig_ax, regs->bx,
+				    regs->cx, regs->dx, regs->si);
+	}
+}
 
 /*
- * We must return the syscall number to actually look up in the table.
- * This can be -1L to skip running any syscall at all.
+ * We can return 0 to resume the syscall or anything else to go to phase
+ * 2.  If we resume the syscall, we need to put something appropriate in
+ * regs->orig_ax.
+ *
+ * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
+ * are fully functional.
+ *
+ * For phase 2's benefit, our return value is:
+ * 0:			resume the syscall
+ * 1:			go to phase 2; no seccomp phase 2 needed
+ * anything else:	go to phase 2; pass return value to seccomp
  */
-long syscall_trace_enter(struct pt_regs *regs)
+unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
+{
+	unsigned long ret = 0;
+	u32 work;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
+
+	/*
+	 * If TIF_NOHZ is set, we are required to call user_exit() before
+	 * doing anything that could touch RCU.
+	 */
+	if (work & _TIF_NOHZ) {
+		user_exit();
+		work &= ~TIF_NOHZ;
+	}
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Do seccomp first -- it should minimize exposure of other
+	 * code, and keeping seccomp fast is probably more valuable
+	 * than the rest of this.
+	 */
+	if (work & _TIF_SECCOMP) {
+		struct seccomp_data sd;
+
+		sd.arch = arch;
+		sd.nr = regs->orig_ax;
+		sd.instruction_pointer = regs->ip;
+#ifdef CONFIG_X86_64
+		if (arch == AUDIT_ARCH_X86_64) {
+			sd.args[0] = regs->di;
+			sd.args[1] = regs->si;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->r10;
+			sd.args[4] = regs->r8;
+			sd.args[5] = regs->r9;
+		} else
+#endif
+		{
+			sd.args[0] = regs->bx;
+			sd.args[1] = regs->cx;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->si;
+			sd.args[4] = regs->di;
+			sd.args[5] = regs->bp;
+		}
+
+		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
+		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
+
+		ret = seccomp_phase1(&sd);
+		if (ret == SECCOMP_PHASE1_SKIP) {
+			regs->orig_ax = -1;
+			ret = 0;
+		} else if (ret != SECCOMP_PHASE1_OK) {
+			return ret;  /* Go directly to phase 2 */
+		}
+
+		work &= ~_TIF_SECCOMP;
+	}
+#endif
+
+	/* Do our best to finish without phase 2. */
+	if (work == 0)
+		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (work == _TIF_SYSCALL_AUDIT) {
+		/*
+		 * If there is no more work to be done except auditing,
+		 * then audit in phase 1.  Phase 2 always audits, so, if
+		 * we audit here, then we can't go on to phase 2.
+		 */
+		do_audit_syscall_entry(regs, arch);
+		return 0;
+	}
+#endif
+
+	return 1;  /* Something is enabled that we can't handle in phase 1 */
+}
+
+/* Returns the syscall nr to run (which should match regs->orig_ax). */
+long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
+				unsigned long phase1_result)
 {
 	long ret = 0;
+	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
 
-	user_exit();
+	BUG_ON(regs != task_pt_regs(current));
 
 	/*
 	 * If we stepped into a sysenter/syscall insn, it trapped in
@@ -1467,17 +1569,21 @@
 	 * do_debug() and we need to set it again to restore the user
 	 * state.  If we entered on the slow path, TF was already set.
 	 */
-	if (test_thread_flag(TIF_SINGLESTEP))
+	if (work & _TIF_SINGLESTEP)
 		regs->flags |= X86_EFLAGS_TF;
 
-	/* do the secure computing check first */
-	if (secure_computing(regs->orig_ax)) {
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Call seccomp_phase2 before running the other hooks so that
+	 * they can see any changes made by a seccomp tracer.
+	 */
+	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
 		/* seccomp failures shouldn't expose any additional code. */
-		ret = -1L;
-		goto out;
+		return -1;
 	}
+#endif
 
-	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+	if (unlikely(work & _TIF_SYSCALL_EMU))
 		ret = -1L;
 
 	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
@@ -1487,23 +1593,22 @@
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->orig_ax);
 
-	if (IS_IA32)
-		audit_syscall_entry(AUDIT_ARCH_I386,
-				    regs->orig_ax,
-				    regs->bx, regs->cx,
-				    regs->dx, regs->si);
-#ifdef CONFIG_X86_64
-	else
-		audit_syscall_entry(AUDIT_ARCH_X86_64,
-				    regs->orig_ax,
-				    regs->di, regs->si,
-				    regs->dx, regs->r10);
-#endif
+	do_audit_syscall_entry(regs, arch);
 
-out:
 	return ret ?: regs->orig_ax;
 }
 
+long syscall_trace_enter(struct pt_regs *regs)
+{
+	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
+
+	if (phase1_result == 0)
+		return regs->orig_ax;
+	else
+		return syscall_trace_enter_phase2(regs, arch, phase1_result);
+}
+
 void syscall_trace_leave(struct pt_regs *regs)
 {
 	bool step;
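
Phase 1 above snapshots the thread-info flags once, masks them down to the
syscall-entry work bits, and clears each bit as it is handled inline;
ending with work == 0 (or only the audit bit) avoids phase 2 entirely. A
minimal model of that bit-peeling pattern (the flag values are made up for
the example; only the structure mirrors the patch):

	#include <stdio.h>

	#define WORK_NOHZ	0x1
	#define WORK_SECCOMP	0x2
	#define WORK_AUDIT	0x4
	#define WORK_TRACE	0x8

	/* Returns 0 when everything was handled inline,
	 * 1 when the slow path (phase 2) is still needed. */
	static int phase1(unsigned int flags)
	{
		unsigned int work = flags &
			(WORK_NOHZ | WORK_SECCOMP | WORK_AUDIT | WORK_TRACE);

		if (work & WORK_NOHZ)
			work &= ~WORK_NOHZ;	/* user_exit() done */
		if (work & WORK_SECCOMP)
			work &= ~WORK_SECCOMP;	/* seccomp_phase1() done */
		if (work == 0)
			return 0;
		if (work == WORK_AUDIT)
			return 0;		/* audit handled in phase 1 */
		return 1;
	}

	int main(void)
	{
		printf("%d %d\n", phase1(WORK_NOHZ | WORK_AUDIT),
		       phase1(WORK_AUDIT | WORK_TRACE));	/* 0 1 */
		return 0;
	}
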
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 2851d63..ed37a76 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -675,6 +675,11 @@
 		 * handler too.
 		 */
 		regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
+		/*
+		 * Ensure the signal handler starts with the new fpu state.
+		 */
+		if (used_math())
+			drop_init_fpu(current);
 	}
 	signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
 }
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index e1e1e80..957779f 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -216,7 +216,7 @@
 	 */
 	regs->orig_ax = syscall_nr;
 	regs->ax = -ENOSYS;
-	tmp = secure_computing(syscall_nr);
+	tmp = secure_computing();
 	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
 		warn_bad_vsyscall(KERN_DEBUG, regs,
 				  "seccomp tried to change syscall nr or ip");
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 940b142..4c540c4 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -271,8 +271,6 @@
 	if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
 		return -1;
 
-	drop_init_fpu(tsk);	/* trigger finit */
-
 	return 0;
 }
 
@@ -402,8 +400,11 @@
 			set_used_math();
 		}
 
-		if (use_eager_fpu())
+		if (use_eager_fpu()) {
+			preempt_disable();
 			math_state_restore();
+			preempt_enable();
+		}
 
 		return err;
 	} else {
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 4dd8cf6..75cc097 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -59,41 +59,6 @@
 	__flush_tlb_one(vaddr);
 }
 
-/*
- * Associate a large virtual page frame with a given physical page frame 
- * and protection flags for that frame. pfn is for the base of the page,
- * vaddr is what the page gets mapped to - both must be properly aligned. 
- * The pmd must already be instantiated. Assumes PAE mode.
- */ 
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-
-	if (vaddr & (PMD_SIZE-1)) {		/* vaddr is misaligned */
-		printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
-		return; /* BUG(); */
-	}
-	if (pfn & (PTRS_PER_PTE-1)) {		/* pfn is misaligned */
-		printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
-		return; /* BUG(); */
-	}
-	pgd = swapper_pg_dir + pgd_index(vaddr);
-	if (pgd_none(*pgd)) {
-		printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
-		return; /* BUG(); */
-	}
-	pud = pud_offset(pgd, vaddr);
-	pmd = pmd_offset(pud, vaddr);
-	set_pmd(pmd, pfn_pmd(pfn, flags));
-	/*
-	 * It's enough to flush this one mapping.
-	 * (PGE mappings get flushed as well)
-	 */
-	__flush_tlb_one(vaddr);
-}
-
 unsigned long __FIXADDR_TOP = 0xfffff000;
 EXPORT_SYMBOL(__FIXADDR_TOP);
 
diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h
index e5103b4..6266766 100644
--- a/arch/xtensa/include/asm/atomic.h
+++ b/arch/xtensa/include/asm/atomic.h
@@ -58,165 +58,96 @@
  */
 #define atomic_set(v,i)		((v)->counter = (i))
 
-/**
- * atomic_add - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v.
- */
-static inline void atomic_add(int i, atomic_t * v)
-{
 #if XCHAL_HAVE_S32C1I
-	unsigned long tmp;
-	int result;
+#define ATOMIC_OP(op)							\
+static inline void atomic_##op(int i, atomic_t * v)			\
+{									\
+	unsigned long tmp;						\
+	int result;							\
+									\
+	__asm__ __volatile__(						\
+			"1:     l32i    %1, %3, 0\n"			\
+			"       wsr     %1, scompare1\n"		\
+			"       " #op " %0, %1, %2\n"			\
+			"       s32c1i  %0, %3, 0\n"			\
+			"       bne     %0, %1, 1b\n"			\
+			: "=&a" (result), "=&a" (tmp)			\
+			: "a" (i), "a" (v)				\
+			: "memory"					\
+			);						\
+}									\
 
-	__asm__ __volatile__(
-			"1:     l32i    %1, %3, 0\n"
-			"       wsr     %1, scompare1\n"
-			"       add     %0, %1, %2\n"
-			"       s32c1i  %0, %3, 0\n"
-			"       bne     %0, %1, 1b\n"
-			: "=&a" (result), "=&a" (tmp)
-			: "a" (i), "a" (v)
-			: "memory"
-			);
-#else
-	unsigned int vval;
-
-	__asm__ __volatile__(
-			"       rsil    a15, "__stringify(LOCKLEVEL)"\n"
-			"       l32i    %0, %2, 0\n"
-			"       add     %0, %0, %1\n"
-			"       s32i    %0, %2, 0\n"
-			"       wsr     a15, ps\n"
-			"       rsync\n"
-			: "=&a" (vval)
-			: "a" (i), "a" (v)
-			: "a15", "memory"
-			);
-#endif
+#define ATOMIC_OP_RETURN(op)						\
+static inline int atomic_##op##_return(int i, atomic_t * v)		\
+{									\
+	unsigned long tmp;						\
+	int result;							\
+									\
+	__asm__ __volatile__(						\
+			"1:     l32i    %1, %3, 0\n"			\
+			"       wsr     %1, scompare1\n"		\
+			"       " #op " %0, %1, %2\n"			\
+			"       s32c1i  %0, %3, 0\n"			\
+			"       bne     %0, %1, 1b\n"			\
+			"       " #op " %0, %0, %2\n"			\
+			: "=&a" (result), "=&a" (tmp)			\
+			: "a" (i), "a" (v)				\
+			: "memory"					\
+			);						\
+									\
+	return result;							\
 }
 
-/**
- * atomic_sub - subtract the atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v.
- */
-static inline void atomic_sub(int i, atomic_t *v)
-{
-#if XCHAL_HAVE_S32C1I
-	unsigned long tmp;
-	int result;
+#else /* XCHAL_HAVE_S32C1I */
 
-	__asm__ __volatile__(
-			"1:     l32i    %1, %3, 0\n"
-			"       wsr     %1, scompare1\n"
-			"       sub     %0, %1, %2\n"
-			"       s32c1i  %0, %3, 0\n"
-			"       bne     %0, %1, 1b\n"
-			: "=&a" (result), "=&a" (tmp)
-			: "a" (i), "a" (v)
-			: "memory"
-			);
-#else
-	unsigned int vval;
+#define ATOMIC_OP(op)							\
+static inline void atomic_##op(int i, atomic_t * v)			\
+{									\
+	unsigned int vval;						\
+									\
+	__asm__ __volatile__(						\
+			"       rsil    a15, "__stringify(LOCKLEVEL)"\n"\
+			"       l32i    %0, %2, 0\n"			\
+			"       " #op " %0, %0, %1\n"			\
+			"       s32i    %0, %2, 0\n"			\
+			"       wsr     a15, ps\n"			\
+			"       rsync\n"				\
+			: "=&a" (vval)					\
+			: "a" (i), "a" (v)				\
+			: "a15", "memory"				\
+			);						\
+}									\
 
-	__asm__ __volatile__(
-			"       rsil    a15, "__stringify(LOCKLEVEL)"\n"
-			"       l32i    %0, %2, 0\n"
-			"       sub     %0, %0, %1\n"
-			"       s32i    %0, %2, 0\n"
-			"       wsr     a15, ps\n"
-			"       rsync\n"
-			: "=&a" (vval)
-			: "a" (i), "a" (v)
-			: "a15", "memory"
-			);
-#endif
+#define ATOMIC_OP_RETURN(op)						\
+static inline int atomic_##op##_return(int i, atomic_t * v)		\
+{									\
+	unsigned int vval;						\
+									\
+	__asm__ __volatile__(						\
+			"       rsil    a15,"__stringify(LOCKLEVEL)"\n"	\
+			"       l32i    %0, %2, 0\n"			\
+			"       " #op " %0, %0, %1\n"			\
+			"       s32i    %0, %2, 0\n"			\
+			"       wsr     a15, ps\n"			\
+			"       rsync\n"				\
+			: "=&a" (vval)					\
+			: "a" (i), "a" (v)				\
+			: "a15", "memory"				\
+			);						\
+									\
+	return vval;							\
 }
 
-/*
- * We use atomic_{add|sub}_return to define other functions.
- */
+#endif /* XCHAL_HAVE_S32C1I */
 
-static inline int atomic_add_return(int i, atomic_t * v)
-{
-#if XCHAL_HAVE_S32C1I
-	unsigned long tmp;
-	int result;
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
 
-	__asm__ __volatile__(
-			"1:     l32i    %1, %3, 0\n"
-			"       wsr     %1, scompare1\n"
-			"       add     %0, %1, %2\n"
-			"       s32c1i  %0, %3, 0\n"
-			"       bne     %0, %1, 1b\n"
-			"       add     %0, %0, %2\n"
-			: "=&a" (result), "=&a" (tmp)
-			: "a" (i), "a" (v)
-			: "memory"
-			);
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
 
-	return result;
-#else
-	unsigned int vval;
-
-	__asm__ __volatile__(
-			"       rsil    a15,"__stringify(LOCKLEVEL)"\n"
-			"       l32i    %0, %2, 0\n"
-			"       add     %0, %0, %1\n"
-			"       s32i    %0, %2, 0\n"
-			"       wsr     a15, ps\n"
-			"       rsync\n"
-			: "=&a" (vval)
-			: "a" (i), "a" (v)
-			: "a15", "memory"
-			);
-
-	return vval;
-#endif
-}
-
-static inline int atomic_sub_return(int i, atomic_t * v)
-{
-#if XCHAL_HAVE_S32C1I
-	unsigned long tmp;
-	int result;
-
-	__asm__ __volatile__(
-			"1:     l32i    %1, %3, 0\n"
-			"       wsr     %1, scompare1\n"
-			"       sub     %0, %1, %2\n"
-			"       s32c1i  %0, %3, 0\n"
-			"       bne     %0, %1, 1b\n"
-			"       sub     %0, %0, %2\n"
-			: "=&a" (result), "=&a" (tmp)
-			: "a" (i), "a" (v)
-			: "memory"
-			);
-
-	return result;
-#else
-	unsigned int vval;
-
-	__asm__ __volatile__(
-			"       rsil    a15,"__stringify(LOCKLEVEL)"\n"
-			"       l32i    %0, %2, 0\n"
-			"       sub     %0, %0, %1\n"
-			"       s32i    %0, %2, 0\n"
-			"       wsr     a15, ps\n"
-			"       rsync\n"
-			: "=&a" (vval)
-			: "a" (i), "a" (v)
-			: "a15", "memory"
-			);
-
-	return vval;
-#endif
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
 
 /**
  * atomic_sub_and_test - subtract value from variable and test result
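
ATOMIC_OPS(add) above expands to ATOMIC_OP(add) plus ATOMIC_OP_RETURN(add),
so each operation is written once and stamped out twice. As a check, the
S32C1I flavour of ATOMIC_OP_RETURN(add) expands to byte-for-byte the old
hand-written atomic_add_return() that this hunk deletes (preprocessor
output, shown for illustration):

	static inline int atomic_add_return(int i, atomic_t * v)
	{
		unsigned long tmp;
		int result;

		__asm__ __volatile__(
				"1:     l32i    %1, %3, 0\n"
				"       wsr     %1, scompare1\n"
				"       add     %0, %1, %2\n"
				"       s32c1i  %0, %3, 0\n"
				"       bne     %0, %1, 1b\n"
				"       add     %0, %0, %2\n"
				: "=&a" (result), "=&a" (tmp)
				: "a" (i), "a" (v)
				: "memory"
				);

		return result;
	}
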
diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c
index 97eb0019..2f6e4fb 100644
--- a/crypto/asymmetric_keys/public_key.c
+++ b/crypto/asymmetric_keys/public_key.c
@@ -121,6 +121,7 @@
 struct asymmetric_key_subtype public_key_subtype = {
 	.owner			= THIS_MODULE,
 	.name			= "public_key",
+	.name_len		= sizeof("public_key") - 1,
 	.describe		= public_key_describe,
 	.destroy		= public_key_destroy,
 	.verify_signature	= public_key_verify_signature_2,
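
The new name_len field uses sizeof("public_key") - 1, the usual
compile-time strlen of a string literal (sizeof counts the trailing NUL,
hence the -1). A one-liner demonstrating the idiom (standalone example, not
from the patch):

	#include <stdio.h>

	#define STRLEN_CONST(s) (sizeof(s) - 1)	/* literal length, no NUL */

	int main(void)
	{
		printf("%zu\n", STRLEN_CONST("public_key"));	/* prints 10 */
		return 0;
	}
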
diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c
index 79175e6..2421f46 100644
--- a/crypto/asymmetric_keys/verify_pefile.c
+++ b/crypto/asymmetric_keys/verify_pefile.c
@@ -128,6 +128,7 @@
 {
 	struct win_certificate wrapper;
 	const u8 *pkcs7;
+	unsigned len;
 
 	if (ctx->sig_len < sizeof(wrapper)) {
 		pr_debug("Signature wrapper too short\n");
@@ -154,33 +155,49 @@
 		return -ENOTSUPP;
 	}
 
-	/* Looks like actual pkcs signature length is in wrapper->length.
-	 * size obtained from data dir entries lists the total size of
-	 * certificate table which is also aligned to octawrod boundary.
-	 *
-	 * So set signature length field appropriately.
+	/* It looks like the pkcs signature length in wrapper->length and the
+	 * size obtained from the data dir entries, which lists the total size
+	 * of the certificate table, are both aligned to an octaword boundary, so
+	 * we may have to deal with some padding.
 	 */
 	ctx->sig_len = wrapper.length;
 	ctx->sig_offset += sizeof(wrapper);
 	ctx->sig_len -= sizeof(wrapper);
-	if (ctx->sig_len == 0) {
+	if (ctx->sig_len < 4) {
 		pr_debug("Signature data missing\n");
 		return -EKEYREJECTED;
 	}
 
-	/* What's left should a PKCS#7 cert */
+	/* What's left should be a PKCS#7 cert */
 	pkcs7 = pebuf + ctx->sig_offset;
-	if (pkcs7[0] == (ASN1_CONS_BIT | ASN1_SEQ)) {
-		if (pkcs7[1] == 0x82 &&
-		    pkcs7[2] == (((ctx->sig_len - 4) >> 8) & 0xff) &&
-		    pkcs7[3] ==  ((ctx->sig_len - 4)       & 0xff))
-			return 0;
-		if (pkcs7[1] == 0x80)
-			return 0;
-		if (pkcs7[1] > 0x82)
-			return -EMSGSIZE;
+	if (pkcs7[0] != (ASN1_CONS_BIT | ASN1_SEQ))
+		goto not_pkcs7;
+
+	switch (pkcs7[1]) {
+	case 0 ... 0x7f:
+		len = pkcs7[1] + 2;
+		goto check_len;
+	case ASN1_INDEFINITE_LENGTH:
+		return 0;
+	case 0x81:
+		len = pkcs7[2] + 3;
+		goto check_len;
+	case 0x82:
+		len = ((pkcs7[2] << 8) | pkcs7[3]) + 4;
+		goto check_len;
+	case 0x83 ... 0xff:
+		return -EMSGSIZE;
+	default:
+		goto not_pkcs7;
 	}
 
+check_len:
+	if (len <= ctx->sig_len) {
+		/* There may be padding */
+		ctx->sig_len = len;
+		return 0;
+	}
+not_pkcs7:
 	pr_debug("Signature data not PKCS#7\n");
 	return -ELIBBAD;
 }
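
The switch above is a standard BER/DER length parse of the byte following
the SEQUENCE tag: 0x00-0x7f is a short-form length, 0x80 marks an
indefinite length, 0x81/0x82 prefix one- and two-byte long-form lengths,
and more than two length bytes is rejected as oversized. A standalone
decoder following the same rules (sketch; uses the same GCC case-range
extension as the kernel, returns the total size including tag and length
bytes, 0 for indefinite, -1 on error):

	#include <stdio.h>

	static long asn1_total_len(const unsigned char *p)
	{
		switch (p[1]) {
		case 0x00 ... 0x7f:
			return p[1] + 2;		 /* short form */
		case 0x80:
			return 0;			 /* indefinite */
		case 0x81:
			return p[2] + 3;		 /* 1 length byte */
		case 0x82:
			return ((p[2] << 8) | p[3]) + 4; /* 2 length bytes */
		default:
			return -1;			 /* too long / bogus */
		}
	}

	int main(void)
	{
		unsigned char seq[] = { 0x30, 0x82, 0x01, 0x00 };

		printf("%ld\n", asn1_total_len(seq));	/* 256 + 4 = 260 */
		return 0;
	}
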
diff --git a/drivers/acpi/acpica/nsprepkg.c b/drivers/acpi/acpica/nsprepkg.c
index 68f7258..1b13b92 100644
--- a/drivers/acpi/acpica/nsprepkg.c
+++ b/drivers/acpi/acpica/nsprepkg.c
@@ -316,6 +316,45 @@
 		    acpi_ns_check_package_list(info, package, elements, count);
 		break;
 
+	case ACPI_PTYPE2_UUID_PAIR:
+
+		/* The package must contain pairs of (UUID + type) */
+
+		if (count & 1) {
+			expected_count = count + 1;
+			goto package_too_small;
+		}
+
+		while (count > 0) {
+			status = acpi_ns_check_object_type(info, elements,
+							   package->ret_info.
+							   object_type1, 0);
+			if (ACPI_FAILURE(status)) {
+				return (status);
+			}
+
+			/* Validate length of the UUID buffer */
+
+			if ((*elements)->buffer.length != 16) {
+				ACPI_WARN_PREDEFINED((AE_INFO,
+						      info->full_pathname,
+						      info->node_flags,
+						      "Invalid length for UUID Buffer"));
+				return (AE_AML_OPERAND_VALUE);
+			}
+
+			status = acpi_ns_check_object_type(info, elements + 1,
+							   package->ret_info.
+							   object_type2, 0);
+			if (ACPI_FAILURE(status)) {
+				return (status);
+			}
+
+			elements += 2;
+			count -= 2;
+		}
+		break;
+
 	default:
 
 		/* Should not get here if predefined info table is correct */
diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 9922cc4..cb6066c 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -1030,6 +1030,10 @@
 	DMI_MATCH(DMI_SYS_VENDOR, "Quanta"),
 	DMI_MATCH(DMI_PRODUCT_NAME, "TW9/SW9"),}, NULL},
 	{
+	ec_flag_msi, "Clevo W350etq", {
+	DMI_MATCH(DMI_SYS_VENDOR, "CLEVO CO."),
+	DMI_MATCH(DMI_PRODUCT_NAME, "W35_37ET"),}, NULL},
+	{
 	ec_validate_ecdt, "ASUS hardware", {
 	DMI_MATCH(DMI_BIOS_VENDOR, "ASUS") }, NULL},
 	{
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 3dca36d..17f9ec5 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -1071,9 +1071,9 @@
 
 	if (pr->id == 0 && cpuidle_get_driver() == &acpi_idle_driver) {
 
-		cpuidle_pause_and_lock();
 		/* Protect against cpu-hotplug */
 		get_online_cpus();
+		cpuidle_pause_and_lock();
 
 		/* Disable all cpuidle devices */
 		for_each_online_cpu(cpu) {
@@ -1100,8 +1100,8 @@
 				cpuidle_enable_device(dev);
 			}
 		}
-		put_online_cpus();
 		cpuidle_resume_and_unlock();
+		put_online_cpus();
 	}
 
 	return 0;
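
The processor_idle fix is purely about lock nesting: get_online_cpus() and
put_online_cpus() now strictly bracket cpuidle_pause_and_lock() and
cpuidle_resume_and_unlock(), so the two lock pairs are always taken and
released in one consistent order. A generic user-space model of the same
discipline (pthread mutexes stand in for the kernel primitives):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t hotplug = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t cpuidle = PTHREAD_MUTEX_INITIALIZER;

	/* Outer lock first in, last out, so the pairs stay nested. */
	static void walk_online_cpus(void)
	{
		pthread_mutex_lock(&hotplug);	/* get_online_cpus() */
		pthread_mutex_lock(&cpuidle);	/* cpuidle_pause_and_lock() */

		/* ... operate on a stable set of online CPUs ... */

		pthread_mutex_unlock(&cpuidle);	/* cpuidle_resume_and_unlock() */
		pthread_mutex_unlock(&hotplug);	/* put_online_cpus() */
	}

	int main(void)
	{
		walk_online_cpus();
		puts("ok");
		return 0;
	}
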
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 9a92989..3bf7764 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -667,8 +667,14 @@
 acpi_device_sun_show(struct device *dev, struct device_attribute *attr,
 		     char *buf) {
 	struct acpi_device *acpi_dev = to_acpi_device(dev);
+	acpi_status status;
+	unsigned long long sun;
 
-	return sprintf(buf, "%lu\n", acpi_dev->pnp.sun);
+	status = acpi_evaluate_integer(acpi_dev->handle, "_SUN", NULL, &sun);
+	if (ACPI_FAILURE(status))
+		return -ENODEV;
+
+	return sprintf(buf, "%llu\n", sun);
 }
 static DEVICE_ATTR(sun, 0444, acpi_device_sun_show, NULL);
 
@@ -690,7 +696,6 @@
 {
 	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
 	acpi_status status;
-	unsigned long long sun;
 	int result = 0;
 
 	/*
@@ -731,14 +736,10 @@
 	if (dev->pnp.unique_id)
 		result = device_create_file(&dev->dev, &dev_attr_uid);
 
-	status = acpi_evaluate_integer(dev->handle, "_SUN", NULL, &sun);
-	if (ACPI_SUCCESS(status)) {
-		dev->pnp.sun = (unsigned long)sun;
+	if (acpi_has_method(dev->handle, "_SUN")) {
 		result = device_create_file(&dev->dev, &dev_attr_sun);
 		if (result)
 			goto end;
-	} else {
-		dev->pnp.sun = (unsigned long)-1;
 	}
 
 	if (acpi_has_method(dev->handle, "_STA")) {
diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
index 8268843..fcbda10 100644
--- a/drivers/acpi/video.c
+++ b/drivers/acpi/video.c
@@ -82,9 +82,9 @@
  * For Windows 8 systems: used to decide if video module
  * should skip registering backlight interface of its own.
  */
-static int use_native_backlight_param = 1;
+static int use_native_backlight_param = -1;
 module_param_named(use_native_backlight, use_native_backlight_param, int, 0444);
-static bool use_native_backlight_dmi = false;
+static bool use_native_backlight_dmi = true;
 
 static int register_count;
 static struct mutex video_list_lock;
@@ -417,6 +417,12 @@
 	return 0;
 }
 
+static int __init video_disable_native_backlight(const struct dmi_system_id *d)
+{
+	use_native_backlight_dmi = false;
+	return 0;
+}
+
 static struct dmi_system_id video_dmi_table[] __initdata = {
 	/*
 	 * Broken _BQC workaround http://bugzilla.kernel.org/show_bug.cgi?id=13121
@@ -720,6 +726,41 @@
 		DMI_MATCH(DMI_PRODUCT_NAME, "HP EliteBook 8780w"),
 		},
 	},
+
+	/*
+	 * These models have a working acpi_video backlight control, and using
+	 * native backlight causes a regression where backlight does not work
+	 * when userspace is not handling brightness key events. Disable
+	 * native_backlight on these to fix this:
+	 * https://bugzilla.kernel.org/show_bug.cgi?id=81691
+	 */
+	{
+	 .callback = video_disable_native_backlight,
+	 .ident = "ThinkPad T420",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T420"),
+		},
+	},
+	{
+	 .callback = video_disable_native_backlight,
+	 .ident = "ThinkPad T520",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T520"),
+		},
+	},
+
+	/* The native backlight controls do not work on some older machines */
+	{
+	 /* https://bugs.freedesktop.org/show_bug.cgi?id=81515 */
+	 .callback = video_disable_native_backlight,
+	 .ident = "HP ENVY 15 Notebook",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "HP ENVY 15 Notebook PC"),
+		},
+	},
 	{}
 };
 
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index a29f801..a0cc0ed 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -305,6 +305,14 @@
 	{ PCI_VDEVICE(INTEL, 0x9c85), board_ahci }, /* Wildcat Point-LP RAID */
 	{ PCI_VDEVICE(INTEL, 0x9c87), board_ahci }, /* Wildcat Point-LP RAID */
 	{ PCI_VDEVICE(INTEL, 0x9c8f), board_ahci }, /* Wildcat Point-LP RAID */
+	{ PCI_VDEVICE(INTEL, 0x8c82), board_ahci }, /* 9 Series AHCI */
+	{ PCI_VDEVICE(INTEL, 0x8c83), board_ahci }, /* 9 Series AHCI */
+	{ PCI_VDEVICE(INTEL, 0x8c84), board_ahci }, /* 9 Series RAID */
+	{ PCI_VDEVICE(INTEL, 0x8c85), board_ahci }, /* 9 Series RAID */
+	{ PCI_VDEVICE(INTEL, 0x8c86), board_ahci }, /* 9 Series RAID */
+	{ PCI_VDEVICE(INTEL, 0x8c87), board_ahci }, /* 9 Series RAID */
+	{ PCI_VDEVICE(INTEL, 0x8c8e), board_ahci }, /* 9 Series RAID */
+	{ PCI_VDEVICE(INTEL, 0x8c8f), board_ahci }, /* 9 Series RAID */
 
 	/* JMicron 360/1/3/5/6, match class to avoid IDE function */
 	{ PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
@@ -442,6 +450,8 @@
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL_EXT, 0x917a),
 	  .driver_data = board_ahci_yes_fbs },			/* 88se9172 */
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL_EXT, 0x9172),
+	  .driver_data = board_ahci_yes_fbs },			/* 88se9172 */
+	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL_EXT, 0x9182),
 	  .driver_data = board_ahci_yes_fbs },			/* 88se9172 */
 	{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL_EXT, 0x9192),
 	  .driver_data = board_ahci_yes_fbs },			/* 88se9172 on some Gigabyte */
@@ -1329,6 +1339,18 @@
 	else if (pdev->vendor == 0x1c44 && pdev->device == 0x8000)
 		ahci_pci_bar = AHCI_PCI_BAR_ENMOTUS;
 
+	/*
+	 * The JMicron chip 361/363 contains one SATA controller and one
+	 * PATA controller,for powering on these both controllers, we must
+	 * follow the sequence one by one, otherwise one of them can not be
+	 * powered on successfully, so here we disable the async suspend
+	 * method for these chips.
+	 */
+	if (pdev->vendor == PCI_VENDOR_ID_JMICRON &&
+		(pdev->device == PCI_DEVICE_ID_JMICRON_JMB363 ||
+		pdev->device == PCI_DEVICE_ID_JMICRON_JMB361))
+		device_disable_async_suspend(&pdev->dev);
+
 	/* acquire resources */
 	rc = pcim_enable_device(pdev);
 	if (rc)
diff --git a/drivers/ata/ahci_tegra.c b/drivers/ata/ahci_tegra.c
index f1fef74..0329044 100644
--- a/drivers/ata/ahci_tegra.c
+++ b/drivers/ata/ahci_tegra.c
@@ -18,14 +18,17 @@
  */
 
 #include <linux/ahci_platform.h>
-#include <linux/reset.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/regulator/consumer.h>
+#include <linux/reset.h>
+
+#include <soc/tegra/fuse.h>
 #include <soc/tegra/pmc.h>
+
 #include "ahci.h"
 
 #define SATA_CONFIGURATION_0				0x180
@@ -180,9 +183,12 @@
 
 	/* Pad calibration */
 
-	/* FIXME Always use calibration 0. Change this to read the calibration
-	 * fuse once the fuse driver has landed. */
-	val = 0;
+	ret = tegra_fuse_readl(FUSE_SATA_CALIB, &val);
+	if (ret) {
+		dev_err(&tegra->pdev->dev,
+			"failed to read calibration fuse: %d\n", ret);
+		return ret;
+	}
 
 	calib = tegra124_pad_calibration[val & FUSE_SATA_CALIB_MASK];
 
diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c
index c696230..f03aab1 100644
--- a/drivers/ata/ahci_xgene.c
+++ b/drivers/ata/ahci_xgene.c
@@ -78,6 +78,9 @@
 #define CFG_MEM_RAM_SHUTDOWN		0x00000070
 #define BLOCK_MEM_RDY			0x00000074
 
+/* Max retry for link down */
+#define MAX_LINK_DOWN_RETRY 3
+
 struct xgene_ahci_context {
 	struct ahci_host_priv *hpriv;
 	struct device *dev;
@@ -145,6 +148,14 @@
 	return rc;
 }
 
+static bool xgene_ahci_is_memram_inited(struct xgene_ahci_context *ctx)
+{
+	void __iomem *diagcsr = ctx->csr_diag;
+
+	return (readl(diagcsr + CFG_MEM_RAM_SHUTDOWN) == 0 &&
+	        readl(diagcsr + BLOCK_MEM_RDY) == 0xFFFFFFFF);
+}
+
 /**
  * xgene_ahci_read_id - Read ID data from the specified device
  * @dev: device
@@ -229,8 +240,11 @@
  * and Gen1 (1.5Gbps). Otherwise during long IO stress test, the PHY will
  * report disparity error and etc. In addition, during COMRESET, there can
  * be error reported in the register PORT_SCR_ERR. For SERR_DISPARITY and
- * SERR_10B_8B_ERR, the PHY receiver line must be reseted. The following
- * algorithm is followed to proper configure the hardware PHY during COMRESET:
+ * SERR_10B_8B_ERR, the PHY receiver line must be reset. Also, during long
+ * reboot-cycle regressions, the PHY sometimes reports link down even though
+ * a device is present, because speed negotiation failed; the COMRESET must
+ * then be retried to bring the link up. The following algorithm is used to
+ * properly configure the hardware PHY during COMRESET:
  *
  * Alg Part 1:
  * 1. Start the PHY at Gen3 speed (default setting)
@@ -246,9 +260,15 @@
  * Alg Part 2:
  * 1. On link up, if there are any SERR_DISPARITY and SERR_10B_8B_ERR error
  *    reported in the register PORT_SCR_ERR, then reset the PHY receiver line
- * 2. Go to Alg Part 3
+ * 2. Go to Alg Part 4
  *
  * Alg Part 3:
+ * 1. Check PORT_SCR_STAT: if device presence was detected but PHY
+ *    communication could not be established, and fewer than the maximum
+ *    of 3 link-down attempts (MAX_LINK_DOWN_RETRY) have been made, go to
+ *    Alg Part 1.
+ * 2. Go to Alg Part 4.
+ *
+ * Alg Part 4:
  * 1. Clear any pending from register PORT_SCR_ERR.
  *
  * NOTE: For the initial version, we will NOT support Gen1/Gen2. In addition
@@ -267,19 +287,27 @@
 	u8 *d2h_fis = pp->rx_fis + RX_FIS_D2H_REG;
 	void __iomem *port_mmio = ahci_port_base(ap);
 	struct ata_taskfile tf;
+	int link_down_retry = 0;
 	int rc;
-	u32 val;
+	u32 val, sstatus;
 
-	/* clear D2H reception area to properly wait for D2H FIS */
-	ata_tf_init(link->device, &tf);
-	tf.command = ATA_BUSY;
-	ata_tf_to_fis(&tf, 0, 0, d2h_fis);
-	rc = sata_link_hardreset(link, timing, deadline, online,
+	do {
+		/* clear D2H reception area to properly wait for D2H FIS */
+		ata_tf_init(link->device, &tf);
+		tf.command = ATA_BUSY;
+		ata_tf_to_fis(&tf, 0, 0, d2h_fis);
+		rc = sata_link_hardreset(link, timing, deadline, online,
 				 ahci_check_ready);
+		if (*online) {
+			val = readl(port_mmio + PORT_SCR_ERR);
+			if (val & (SERR_DISPARITY | SERR_10B_8B_ERR))
+				dev_warn(ctx->dev, "link has error\n");
+			break;
+		}
 
-	val = readl(port_mmio + PORT_SCR_ERR);
-	if (val & (SERR_DISPARITY | SERR_10B_8B_ERR))
-		dev_warn(ctx->dev, "link has error\n");
+		sata_scr_read(link, SCR_STATUS, &sstatus);
+	} while (link_down_retry++ < MAX_LINK_DOWN_RETRY &&
+		 (sstatus & 0xff) == 0x1);
 
 	/* clear all errors if any pending */
 	val = readl(port_mmio + PORT_SCR_ERR);
@@ -467,6 +495,11 @@
 		return -ENODEV;
 	}
 
+	if (xgene_ahci_is_memram_inited(ctx)) {
+		dev_info(dev, "skip clock and PHY initialization\n");
+		goto skip_clk_phy;
+	}
+
 	/* Due to errata, HW requires full toggle transition */
 	rc = ahci_platform_enable_clks(hpriv);
 	if (rc)
@@ -479,7 +512,7 @@
 
 	/* Configure the host controller */
 	xgene_ahci_hw_init(hpriv);
-
+skip_clk_phy:
 	hpriv->flags = AHCI_HFLAG_NO_PMP | AHCI_HFLAG_NO_NCQ;
 
 	rc = ahci_platform_init_host(pdev, hpriv, &xgene_ahci_port_info);
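
For reference, the retry condition in the hardreset hunk keys off the SATA
SStatus DET field: DET = 0x1 means device presence was detected but PHY
communication was not established (the speed-negotiation failure described
in the comment), while 0x0 (no device) or 0x3 (communication established)
make a retry pointless. A one-line sketch of the predicate, matching the
(sstatus & 0xff) == 0x1 test above:

	static inline bool xgene_link_worth_retry(u32 sstatus)
	{
		/* device detected, PHY communication not established */
		return (sstatus & 0xff) == 0x1;
	}
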
diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 893e30e..ffbe625 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -340,6 +340,14 @@
 	{ 0x8086, 0x0F21, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_2port_sata_byt },
 	/* SATA Controller IDE (Coleto Creek) */
 	{ 0x8086, 0x23a6, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_2port_sata },
+	/* SATA Controller IDE (9 Series) */
+	{ 0x8086, 0x8c88, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_2port_sata_snb },
+	/* SATA Controller IDE (9 Series) */
+	{ 0x8086, 0x8c89, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_2port_sata_snb },
+	/* SATA Controller IDE (9 Series) */
+	{ 0x8086, 0x8c80, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_sata_snb },
+	/* SATA Controller IDE (9 Series) */
+	{ 0x8086, 0x8c81, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_sata_snb },
 
 	{ }	/* terminate list */
 };
diff --git a/drivers/ata/pata_jmicron.c b/drivers/ata/pata_jmicron.c
index 4d1a5d2..47e418b 100644
--- a/drivers/ata/pata_jmicron.c
+++ b/drivers/ata/pata_jmicron.c
@@ -143,6 +143,18 @@
 	};
 	const struct ata_port_info *ppi[] = { &info, NULL };
 
+	/*
+	 * The JMicron 361/363 chips contain one SATA controller and one
+	 * PATA controller. To power on both controllers, they must be
+	 * brought up one at a time; otherwise one of them may fail to
+	 * power on. Disable the async suspend method for these chips so
+	 * that the ordering is preserved.
+	 */
+	if (pdev->vendor == PCI_VENDOR_ID_JMICRON &&
+		(pdev->device == PCI_DEVICE_ID_JMICRON_JMB363 ||
+		pdev->device == PCI_DEVICE_ID_JMICRON_JMB361))
+		device_disable_async_suspend(&pdev->dev);
+
 	return ata_pci_bmdma_init_one(pdev, ppi, &jmicron_sht, NULL, 0);
 }
 
diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index 7d13269..bfc90b8 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -146,6 +146,9 @@
 	enum regcache_type type;
 	int (*init)(struct regmap *map);
 	int (*exit)(struct regmap *map);
+#ifdef CONFIG_DEBUG_FS
+	void (*debugfs_init)(struct regmap *map);
+#endif
 	int (*read)(struct regmap *map, unsigned int reg, unsigned int *value);
 	int (*write)(struct regmap *map, unsigned int reg, unsigned int value);
 	int (*sync)(struct regmap *map, unsigned int min, unsigned int max);
diff --git a/drivers/base/regmap/regcache-rbtree.c b/drivers/base/regmap/regcache-rbtree.c
index 6a7e4fa..f3e8fe0 100644
--- a/drivers/base/regmap/regcache-rbtree.c
+++ b/drivers/base/regmap/regcache-rbtree.c
@@ -194,10 +194,6 @@
 {
 	debugfs_create_file("rbtree", 0400, map->debugfs, map, &rbtree_fops);
 }
-#else
-static void rbtree_debugfs_init(struct regmap *map)
-{
-}
 #endif
 
 static int regcache_rbtree_init(struct regmap *map)
@@ -222,8 +218,6 @@
 			goto err;
 	}
 
-	rbtree_debugfs_init(map);
-
 	return 0;
 
 err:
@@ -532,6 +526,9 @@
 	.name = "rbtree",
 	.init = regcache_rbtree_init,
 	.exit = regcache_rbtree_exit,
+#ifdef CONFIG_DEBUG_FS
+	.debugfs_init = rbtree_debugfs_init,
+#endif
 	.read = regcache_rbtree_read,
 	.write = regcache_rbtree_write,
 	.sync = regcache_rbtree_sync,
diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c
index 29b4128..5617da6 100644
--- a/drivers/base/regmap/regcache.c
+++ b/drivers/base/regmap/regcache.c
@@ -698,7 +698,7 @@
 			unsigned int block_base, unsigned int start,
 			unsigned int end)
 {
-	if (regmap_can_raw_write(map))
+	if (regmap_can_raw_write(map) && !map->use_single_rw)
 		return regcache_sync_block_raw(map, block, cache_present,
 					       block_base, start, end);
 	else
diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c
index 45d812c..65ea7b2 100644
--- a/drivers/base/regmap/regmap-debugfs.c
+++ b/drivers/base/regmap/regmap-debugfs.c
@@ -538,6 +538,9 @@
 
 		next = rb_next(&range_node->node);
 	}
+
+	if (map->cache_ops && map->cache_ops->debugfs_init)
+		map->cache_ops->debugfs_init(map);
 }
 
 void regmap_debugfs_exit(struct regmap *map)
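
Together, the regcache hunks let a cache implementation publish its debugfs
hook through struct regcache_ops instead of creating files from its own
->init() (which can run before the map's debugfs directory exists). A sketch
of how another cache type would wire up the callback; the flat-cache names
and fops here are hypothetical:

	#ifdef CONFIG_DEBUG_FS
	static void flat_debugfs_init(struct regmap *map)
	{
		debugfs_create_file("flat", 0400, map->debugfs, map, &flat_fops);
	}
	#endif

	static struct regcache_ops regcache_flat_ops = {
		.type = REGCACHE_FLAT,
		.name = "flat",
		.init = regcache_flat_init,
		.exit = regcache_flat_exit,
	#ifdef CONFIG_DEBUG_FS
		.debugfs_init = flat_debugfs_init,
	#endif
	};
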
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 78f43fb..1cf427b 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -109,7 +109,7 @@
 
 bool regmap_volatile(struct regmap *map, unsigned int reg)
 {
-	if (!regmap_readable(map, reg))
+	if (!map->format.format_write && !regmap_readable(map, reg))
 		return false;
 
 	if (map->volatile_reg)
diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c
index 294a7dd..f032ed6 100644
--- a/drivers/bcma/host_pci.c
+++ b/drivers/bcma/host_pci.c
@@ -282,6 +282,7 @@
 	{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x43a9) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x43aa) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4727) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 43227) },	/* 0xA8DB */
 	{ 0, },
 };
 MODULE_DEVICE_TABLE(pci, bcma_pci_bridge_tbl);
diff --git a/drivers/bus/arm-ccn.c b/drivers/bus/arm-ccn.c
index 6f550d9..a60f264 100644
--- a/drivers/bus/arm-ccn.c
+++ b/drivers/bus/arm-ccn.c
@@ -586,6 +586,30 @@
 	return 0;
 }
 
+static void arm_ccn_pmu_event_destroy(struct perf_event *event)
+{
+	struct arm_ccn *ccn = pmu_to_arm_ccn(event->pmu);
+	struct hw_perf_event *hw = &event->hw;
+
+	if (hw->idx == CCN_IDX_PMU_CYCLE_COUNTER) {
+		clear_bit(CCN_IDX_PMU_CYCLE_COUNTER, ccn->dt.pmu_counters_mask);
+	} else {
+		struct arm_ccn_component *source =
+				ccn->dt.pmu_counters[hw->idx].source;
+
+		if (CCN_CONFIG_TYPE(event->attr.config) == CCN_TYPE_XP &&
+				CCN_CONFIG_EVENT(event->attr.config) ==
+				CCN_EVENT_WATCHPOINT)
+			clear_bit(hw->config_base, source->xp.dt_cmp_mask);
+		else
+			clear_bit(hw->config_base, source->pmu_events_mask);
+		clear_bit(hw->idx, ccn->dt.pmu_counters_mask);
+	}
+
+	ccn->dt.pmu_counters[hw->idx].source = NULL;
+	ccn->dt.pmu_counters[hw->idx].event = NULL;
+}
+
 static int arm_ccn_pmu_event_init(struct perf_event *event)
 {
 	struct arm_ccn *ccn;
@@ -599,6 +623,7 @@
 		return -ENOENT;
 
 	ccn = pmu_to_arm_ccn(event->pmu);
+	event->destroy = arm_ccn_pmu_event_destroy;
 
 	if (hw->sample_period) {
 		dev_warn(ccn->dev, "Sampling not supported!\n");
@@ -731,30 +756,6 @@
 	return 0;
 }
 
-static void arm_ccn_pmu_event_free(struct perf_event *event)
-{
-	struct arm_ccn *ccn = pmu_to_arm_ccn(event->pmu);
-	struct hw_perf_event *hw = &event->hw;
-
-	if (hw->idx == CCN_IDX_PMU_CYCLE_COUNTER) {
-		clear_bit(CCN_IDX_PMU_CYCLE_COUNTER, ccn->dt.pmu_counters_mask);
-	} else {
-		struct arm_ccn_component *source =
-				ccn->dt.pmu_counters[hw->idx].source;
-
-		if (CCN_CONFIG_TYPE(event->attr.config) == CCN_TYPE_XP &&
-				CCN_CONFIG_EVENT(event->attr.config) ==
-				CCN_EVENT_WATCHPOINT)
-			clear_bit(hw->config_base, source->xp.dt_cmp_mask);
-		else
-			clear_bit(hw->config_base, source->pmu_events_mask);
-		clear_bit(hw->idx, ccn->dt.pmu_counters_mask);
-	}
-
-	ccn->dt.pmu_counters[hw->idx].source = NULL;
-	ccn->dt.pmu_counters[hw->idx].event = NULL;
-}
-
 static u64 arm_ccn_pmu_read_counter(struct arm_ccn *ccn, int idx)
 {
 	u64 res;
@@ -1027,8 +1028,6 @@
 static void arm_ccn_pmu_event_del(struct perf_event *event, int flags)
 {
 	arm_ccn_pmu_event_stop(event, PERF_EF_UPDATE);
-
-	arm_ccn_pmu_event_free(event);
 }
 
 static void arm_ccn_pmu_event_read(struct perf_event *event)
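
The arm-ccn move above relies on the perf core contract: a callback
installed in event->destroy during event_init() is invoked exactly once
when the event is torn down, whereas ->del() runs on every disable, so
releasing counters from ->del() could free state the event still owns.
A minimal sketch of the pairing, with placeholder names:

	static void demo_pmu_event_destroy(struct perf_event *event)
	{
		/* release the counter claimed in demo_pmu_event_init() */
	}

	static int demo_pmu_event_init(struct perf_event *event)
	{
		if (event->attr.type != event->pmu->type)
			return -ENOENT;

		event->destroy = demo_pmu_event_destroy;
		/* ... claim a counter for this event ... */
		return 0;
	}
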
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index e396ad3..0668b38 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -708,10 +708,6 @@
 
 static int intel_pstate_set_policy(struct cpufreq_policy *policy)
 {
-	struct cpudata *cpu;
-
-	cpu = all_cpu_data[policy->cpu];
-
 	if (!policy->cpuinfo.max_freq)
 		return -ENODEV;
 
diff --git a/drivers/gpio/gpio-bt8xx.c b/drivers/gpio/gpio-bt8xx.c
index 6557147..7e4c43c 100644
--- a/drivers/gpio/gpio-bt8xx.c
+++ b/drivers/gpio/gpio-bt8xx.c
@@ -241,9 +241,6 @@
 	bgwrite(~0x0, BT848_INT_STAT);
 	bgwrite(0x0, BT848_GPIO_OUT_EN);
 
-	iounmap(bg->mmio);
-	release_mem_region(pci_resource_start(pdev, 0),
-			   pci_resource_len(pdev, 0));
 	pci_disable_device(pdev);
 }
 
diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c
index 0dc57d5..3a02e5e 100644
--- a/drivers/gpu/drm/drm_modeset_lock.c
+++ b/drivers/gpu/drm/drm_modeset_lock.c
@@ -35,7 +35,7 @@
  * of extra utility/tracking out of our acquire-ctx.  This is provided
  * by drm_modeset_lock / drm_modeset_acquire_ctx.
  *
- * For basic principles of ww_mutex, see: Documentation/ww-mutex-design.txt
+ * For basic principles of ww_mutex, see: Documentation/locking/ww-mutex-design.txt
  *
  * The basic usage pattern is to:
  *
diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c
index a669550..eee79e1 100644
--- a/drivers/gpu/drm/i915/intel_bios.c
+++ b/drivers/gpu/drm/i915/intel_bios.c
@@ -1123,7 +1123,7 @@
 	}
 }
 
-static int __init intel_no_opregion_vbt_callback(const struct dmi_system_id *id)
+static int intel_no_opregion_vbt_callback(const struct dmi_system_id *id)
 {
 	DRM_DEBUG_KMS("Falling back to manually reading VBT from "
 		      "VBIOS ROM for %s\n",
diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c
index e8abfce..9212e65 100644
--- a/drivers/gpu/drm/i915/intel_crt.c
+++ b/drivers/gpu/drm/i915/intel_crt.c
@@ -804,7 +804,7 @@
 	.destroy = intel_encoder_destroy,
 };
 
-static int __init intel_no_crt_dmi_callback(const struct dmi_system_id *id)
+static int intel_no_crt_dmi_callback(const struct dmi_system_id *id)
 {
 	DRM_INFO("Skipping CRT initialization for %s\n", id->ident);
 	return 1;
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index d074d70..d8324c6 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -2233,6 +2233,15 @@
 	if (need_vtd_wa(dev) && alignment < 256 * 1024)
 		alignment = 256 * 1024;
 
+	/*
+	 * Global gtt pte registers are special registers which actually
+	 * forward writes to a chunk of system memory, which means there is
+	 * no risk that the register values disappear as soon as we call
+	 * intel_runtime_pm_put(). It is therefore correct to wrap only the
+	 * pin/unpin/fence and nothing more.
+	 */
+	intel_runtime_pm_get(dev_priv);
+
 	dev_priv->mm.interruptible = false;
 	ret = i915_gem_object_pin_to_display_plane(obj, alignment, pipelined);
 	if (ret)
@@ -2250,12 +2259,14 @@
 	i915_gem_object_pin_fence(obj);
 
 	dev_priv->mm.interruptible = true;
+	intel_runtime_pm_put(dev_priv);
 	return 0;
 
 err_unpin:
 	i915_gem_object_unpin_from_display_plane(obj);
 err_interruptible:
 	dev_priv->mm.interruptible = true;
+	intel_runtime_pm_put(dev_priv);
 	return ret;
 }
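
Both intel_display.c hunks above follow the same bracketing discipline:
take the runtime-pm reference immediately before touching the global GTT,
and drop it on every exit path, error or success, so the reference count
stays balanced. A condensed sketch of the pattern (pin_and_fence() is a
stand-in for the pin/fence calls):

	intel_runtime_pm_get(dev_priv);

	ret = pin_and_fence(obj);
	if (ret) {
		intel_runtime_pm_put(dev_priv);	/* balanced on error */
		return ret;
	}

	intel_runtime_pm_put(dev_priv);		/* and on success */
	return 0;
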
 
@@ -4188,10 +4199,6 @@
 		intel_set_pch_fifo_underrun_reporting(dev, pipe, false);
 
 	intel_disable_pipe(dev_priv, pipe);
-
-	if (intel_crtc->config.dp_encoder_is_mst)
-		intel_ddi_set_vc_payload_alloc(crtc, false);
-
 	ironlake_pfit_disable(intel_crtc);
 
 	for_each_encoder_on_crtc(dev, crtc, encoder)
@@ -4256,6 +4263,9 @@
 		intel_set_pch_fifo_underrun_reporting(dev, TRANSCODER_A, false);
 	intel_disable_pipe(dev_priv, pipe);
 
+	if (intel_crtc->config.dp_encoder_is_mst)
+		intel_ddi_set_vc_payload_alloc(crtc, false);
+
 	intel_ddi_disable_transcoder_func(dev_priv, cpu_transcoder);
 
 	ironlake_pfit_disable(intel_crtc);
@@ -8240,6 +8250,15 @@
 			goto fail_locked;
 		}
 
+		/*
+		 * Global gtt pte registers are special registers which
+		 * actually forward writes to a chunk of system memory, which
+		 * means there is no risk that the register values disappear
+		 * as soon as we call intel_runtime_pm_put(). It is therefore
+		 * correct to wrap only the pin/unpin/fence and nothing more.
+		 */
+		intel_runtime_pm_get(dev_priv);
+
 		/* Note that the w/a also requires 2 PTE of padding following
 		 * the bo. We currently fill all unused PTE with the shadow
 		 * page and so we should always have valid PTE following the
@@ -8252,16 +8271,20 @@
 		ret = i915_gem_object_pin_to_display_plane(obj, alignment, NULL);
 		if (ret) {
 			DRM_DEBUG_KMS("failed to move cursor bo into the GTT\n");
+			intel_runtime_pm_put(dev_priv);
 			goto fail_locked;
 		}
 
 		ret = i915_gem_object_put_fence(obj);
 		if (ret) {
 			DRM_DEBUG_KMS("failed to release fence for cursor");
+			intel_runtime_pm_put(dev_priv);
 			goto fail_unpin;
 		}
 
 		addr = i915_gem_obj_ggtt_offset(obj);
+
+		intel_runtime_pm_put(dev_priv);
 	} else {
 		int align = IS_I830(dev) ? 16 * 1024 : 256;
 		ret = i915_gem_object_attach_phys(obj, align);
@@ -12481,6 +12504,9 @@
 	/* Acer C720 and C720P Chromebooks (Celeron 2955U) have backlights */
 	{ 0x0a06, 0x1025, 0x0a11, quirk_backlight_present },
 
+	/* Acer C720 Chromebook (Core i3 4005U) */
+	{ 0x0a16, 0x1025, 0x0a11, quirk_backlight_present },
+
 	/* Toshiba CB35 Chromebook (Celeron 2955U) */
 	{ 0x0a06, 0x1179, 0x0a88, quirk_backlight_present },
 
diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c
index 67cfed6..81d7681 100644
--- a/drivers/gpu/drm/i915/intel_dp.c
+++ b/drivers/gpu/drm/i915/intel_dp.c
@@ -3661,24 +3661,12 @@
 	return intel_dp_detect_dpcd(intel_dp);
 }
 
-static enum drm_connector_status
-g4x_dp_detect(struct intel_dp *intel_dp)
+static int g4x_digital_port_connected(struct drm_device *dev,
+				       struct intel_digital_port *intel_dig_port)
 {
-	struct drm_device *dev = intel_dp_to_dev(intel_dp);
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	struct intel_digital_port *intel_dig_port = dp_to_dig_port(intel_dp);
 	uint32_t bit;
 
-	/* Can't disconnect eDP, but you can close the lid... */
-	if (is_edp(intel_dp)) {
-		enum drm_connector_status status;
-
-		status = intel_panel_detect(dev);
-		if (status == connector_status_unknown)
-			status = connector_status_connected;
-		return status;
-	}
-
 	if (IS_VALLEYVIEW(dev)) {
 		switch (intel_dig_port->port) {
 		case PORT_B:
@@ -3691,7 +3679,7 @@
 			bit = PORTD_HOTPLUG_LIVE_STATUS_VLV;
 			break;
 		default:
-			return connector_status_unknown;
+			return -EINVAL;
 		}
 	} else {
 		switch (intel_dig_port->port) {
@@ -3705,11 +3693,36 @@
 			bit = PORTD_HOTPLUG_LIVE_STATUS_G4X;
 			break;
 		default:
-			return connector_status_unknown;
+			return -EINVAL;
 		}
 	}
 
 	if ((I915_READ(PORT_HOTPLUG_STAT) & bit) == 0)
+		return 0;
+	return 1;
+}
+
+static enum drm_connector_status
+g4x_dp_detect(struct intel_dp *intel_dp)
+{
+	struct drm_device *dev = intel_dp_to_dev(intel_dp);
+	struct intel_digital_port *intel_dig_port = dp_to_dig_port(intel_dp);
+	int ret;
+
+	/* Can't disconnect eDP, but you can close the lid... */
+	if (is_edp(intel_dp)) {
+		enum drm_connector_status status;
+
+		status = intel_panel_detect(dev);
+		if (status == connector_status_unknown)
+			status = connector_status_connected;
+		return status;
+	}
+
+	ret = g4x_digital_port_connected(dev, intel_dig_port);
+	if (ret == -EINVAL)
+		return connector_status_unknown;
+	else if (ret == 0)
 		return connector_status_disconnected;
 
 	return intel_dp_detect_dpcd(intel_dp);
@@ -4066,8 +4079,14 @@
 	intel_display_power_get(dev_priv, power_domain);
 
 	if (long_hpd) {
-		if (!ibx_digital_port_connected(dev_priv, intel_dig_port))
-			goto mst_fail;
+
+		if (HAS_PCH_SPLIT(dev)) {
+			if (!ibx_digital_port_connected(dev_priv, intel_dig_port))
+				goto mst_fail;
+		} else {
+			if (g4x_digital_port_connected(dev, intel_dig_port) != 1)
+				goto mst_fail;
+		}
 
 		if (!intel_dp_get_dpcd(intel_dp)) {
 			goto mst_fail;
diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c
index 881361c..fdf4026 100644
--- a/drivers/gpu/drm/i915/intel_lvds.c
+++ b/drivers/gpu/drm/i915/intel_lvds.c
@@ -538,7 +538,7 @@
 	.destroy = intel_encoder_destroy,
 };
 
-static int __init intel_no_lvds_dmi_callback(const struct dmi_system_id *id)
+static int intel_no_lvds_dmi_callback(const struct dmi_system_id *id)
 {
 	DRM_INFO("Skipping LVDS initialization for %s\n", id->ident);
 	return 1;
diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c
index 59b028f..8e37444 100644
--- a/drivers/gpu/drm/i915/intel_panel.c
+++ b/drivers/gpu/drm/i915/intel_panel.c
@@ -801,7 +801,7 @@
 
 	cpu_ctl2 = I915_READ(BLC_PWM_CPU_CTL2);
 	if (cpu_ctl2 & BLM_PWM_ENABLE) {
-		WARN(1, "cpu backlight already enabled\n");
+		DRM_DEBUG_KMS("cpu backlight already enabled\n");
 		cpu_ctl2 &= ~BLM_PWM_ENABLE;
 		I915_WRITE(BLC_PWM_CPU_CTL2, cpu_ctl2);
 	}
@@ -845,7 +845,7 @@
 
 	ctl = I915_READ(BLC_PWM_CTL);
 	if (ctl & BACKLIGHT_DUTY_CYCLE_MASK_PNV) {
-		WARN(1, "backlight already enabled\n");
+		DRM_DEBUG_KMS("backlight already enabled\n");
 		I915_WRITE(BLC_PWM_CTL, 0);
 	}
 
@@ -876,7 +876,7 @@
 
 	ctl2 = I915_READ(BLC_PWM_CTL2);
 	if (ctl2 & BLM_PWM_ENABLE) {
-		WARN(1, "backlight already enabled\n");
+		DRM_DEBUG_KMS("backlight already enabled\n");
 		ctl2 &= ~BLM_PWM_ENABLE;
 		I915_WRITE(BLC_PWM_CTL2, ctl2);
 	}
@@ -910,7 +910,7 @@
 
 	ctl2 = I915_READ(VLV_BLC_PWM_CTL2(pipe));
 	if (ctl2 & BLM_PWM_ENABLE) {
-		WARN(1, "backlight already enabled\n");
+		DRM_DEBUG_KMS("backlight already enabled\n");
 		ctl2 &= ~BLM_PWM_ENABLE;
 		I915_WRITE(VLV_BLC_PWM_CTL2(pipe), ctl2);
 	}
diff --git a/drivers/gpu/drm/i915/intel_tv.c b/drivers/gpu/drm/i915/intel_tv.c
index 32186a6..c69d3ce 100644
--- a/drivers/gpu/drm/i915/intel_tv.c
+++ b/drivers/gpu/drm/i915/intel_tv.c
@@ -1311,6 +1311,7 @@
 {
 	struct drm_display_mode mode;
 	struct intel_tv *intel_tv = intel_attached_tv(connector);
+	enum drm_connector_status status;
 	int type;
 
 	DRM_DEBUG_KMS("[CONNECTOR:%d:%s] force=%d\n",
@@ -1328,16 +1329,19 @@
 		if (intel_get_load_detect_pipe(connector, &mode, &tmp, &ctx)) {
 			type = intel_tv_detect_type(intel_tv, connector);
 			intel_release_load_detect_pipe(connector, &tmp);
+			status = type < 0 ?
+				connector_status_disconnected :
+				connector_status_connected;
 		} else
-			return connector_status_unknown;
+			status = connector_status_unknown;
 
 		drm_modeset_drop_locks(&ctx);
 		drm_modeset_acquire_fini(&ctx);
 	} else
 		return connector->status;
 
-	if (type < 0)
-		return connector_status_disconnected;
+	if (status != connector_status_connected)
+		return status;
 
 	intel_tv->type = type;
 	intel_tv_find_better_format(connector);
diff --git a/drivers/gpu/drm/nouveau/core/core/parent.c b/drivers/gpu/drm/nouveau/core/core/parent.c
index 8701968..30a2911 100644
--- a/drivers/gpu/drm/nouveau/core/core/parent.c
+++ b/drivers/gpu/drm/nouveau/core/core/parent.c
@@ -86,7 +86,7 @@
 	sclass = nv_parent(parent)->sclass;
 	while (sclass) {
 		if (++nr < size)
-			lclass[nr] = sclass->oclass->handle;
+			lclass[nr] = sclass->oclass->handle & 0xffff;
 		sclass = sclass->sclass;
 	}
 
@@ -96,7 +96,7 @@
 		if (engine && (oclass = engine->sclass)) {
 			while (oclass->ofuncs) {
 				if (++nr < size)
-					lclass[nr] = oclass->handle;
+					lclass[nr] = oclass->handle & 0xffff;
 				oclass++;
 			}
 		}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
index 7bfdaa1..36b8716 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
@@ -450,11 +450,11 @@
 					  res,
 					  id_loc - sw_context->buf_start);
 	if (unlikely(ret != 0))
-		goto out_err;
+		return ret;
 
 	ret = vmw_resource_val_add(sw_context, res, &node);
 	if (unlikely(ret != 0))
-		goto out_err;
+		return ret;
 
 	if (res_type == vmw_res_context && dev_priv->has_mob &&
 	    node->first_usage) {
@@ -468,13 +468,13 @@
 
 		ret = vmw_resource_context_res_add(dev_priv, sw_context, res);
 		if (unlikely(ret != 0))
-			goto out_err;
+			return ret;
 		node->staged_bindings =
 			kzalloc(sizeof(*node->staged_bindings), GFP_KERNEL);
 		if (node->staged_bindings == NULL) {
 			DRM_ERROR("Failed to allocate context binding "
 				  "information.\n");
-			goto out_err;
+			return -ENOMEM;
 		}
 		INIT_LIST_HEAD(&node->staged_bindings->list);
 	}
@@ -482,8 +482,7 @@
 	if (p_val)
 		*p_val = node;
 
-out_err:
-	return ret;
+	return 0;
 }
 
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
index 6ccd993..6eae14d 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
@@ -180,8 +180,9 @@
 
 	mutex_lock(&dev_priv->hw_mutex);
 
+	vmw_write(dev_priv, SVGA_REG_SYNC, SVGA_SYNC_GENERIC);
 	while (vmw_read(dev_priv, SVGA_REG_BUSY) != 0)
-		vmw_write(dev_priv, SVGA_REG_SYNC, SVGA_SYNC_GENERIC);
+		;
 
 	dev_priv->last_read_seqno = ioread32(fifo_mem + SVGA_FIFO_FENCE);
 
diff --git a/drivers/hwmon/ds1621.c b/drivers/hwmon/ds1621.c
index fc6f5d5..8890870 100644
--- a/drivers/hwmon/ds1621.c
+++ b/drivers/hwmon/ds1621.c
@@ -309,6 +309,7 @@
 	data->conf |= (resol << DS1621_REG_CONFIG_RESOL_SHIFT);
 	i2c_smbus_write_byte_data(client, DS1621_REG_CONF, data->conf);
 	data->update_interval = ds1721_convrates[resol];
+	data->zbits = 7 - resol;
 	mutex_unlock(&data->update_lock);
 
 	return count;
diff --git a/drivers/i2c/busses/i2c-at91.c b/drivers/i2c/busses/i2c-at91.c
index 79a6899..917d545 100644
--- a/drivers/i2c/busses/i2c-at91.c
+++ b/drivers/i2c/busses/i2c-at91.c
@@ -101,6 +101,7 @@
 	unsigned twi_cwgr_reg;
 	struct at91_twi_pdata *pdata;
 	bool use_dma;
+	bool recv_len_abort;
 	struct at91_twi_dma dma;
 };
 
@@ -267,12 +268,24 @@
 	*dev->buf = at91_twi_read(dev, AT91_TWI_RHR) & 0xff;
 	--dev->buf_len;
 
+	/* return if aborting, we only needed to read RHR to clear RXRDY */
+	if (dev->recv_len_abort)
+		return;
+
 	/* handle I2C_SMBUS_BLOCK_DATA */
 	if (unlikely(dev->msg->flags & I2C_M_RECV_LEN)) {
-		dev->msg->flags &= ~I2C_M_RECV_LEN;
-		dev->buf_len += *dev->buf;
-		dev->msg->len = dev->buf_len + 1;
-		dev_dbg(dev->dev, "received block length %d\n", dev->buf_len);
+		/* ensure length byte is a valid value */
+		if (*dev->buf <= I2C_SMBUS_BLOCK_MAX && *dev->buf > 0) {
+			dev->msg->flags &= ~I2C_M_RECV_LEN;
+			dev->buf_len += *dev->buf;
+			dev->msg->len = dev->buf_len + 1;
+			dev_dbg(dev->dev, "received block length %d\n",
+					 dev->buf_len);
+		} else {
+			/* abort and send the stop by reading one more byte */
+			dev->recv_len_abort = true;
+			dev->buf_len = 1;
+		}
 	}
 
 	/* send stop if second but last byte has been read */
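
Per the SMBus specification, a block-read length byte must lie in
1..I2C_SMBUS_BLOCK_MAX (32); anything else means the slave misbehaved, so
the handler above aborts by reading one final byte (clearing RXRDY and
letting the stop go out) and the transfer path then returns -EPROTO. The
validity test, as a standalone sketch:

	static bool smbus_block_len_valid(u8 len)
	{
		return len > 0 && len <= I2C_SMBUS_BLOCK_MAX;
	}
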
@@ -421,8 +434,8 @@
 		}
 	}
 
-	ret = wait_for_completion_interruptible_timeout(&dev->cmd_complete,
-							dev->adapter.timeout);
+	ret = wait_for_completion_io_timeout(&dev->cmd_complete,
+					     dev->adapter.timeout);
 	if (ret == 0) {
 		dev_err(dev->dev, "controller timed out\n");
 		at91_init_twi_bus(dev);
@@ -444,6 +457,12 @@
 		ret = -EIO;
 		goto error;
 	}
+	if (dev->recv_len_abort) {
+		dev_err(dev->dev, "invalid smbus block length recvd\n");
+		ret = -EPROTO;
+		goto error;
+	}
+
 	dev_dbg(dev->dev, "transfer complete\n");
 
 	return 0;
@@ -500,6 +519,7 @@
 	dev->buf_len = m_start->len;
 	dev->buf = m_start->buf;
 	dev->msg = m_start;
+	dev->recv_len_abort = false;
 
 	ret = at91_do_twi_transfer(dev);
 
diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c
index 6dc5ded..2f64273 100644
--- a/drivers/i2c/busses/i2c-mv64xxx.c
+++ b/drivers/i2c/busses/i2c-mv64xxx.c
@@ -746,8 +746,7 @@
 	}
 	tclk = clk_get_rate(drv_data->clk);
 
-	rc = of_property_read_u32(np, "clock-frequency", &bus_freq);
-	if (rc)
+	if (of_property_read_u32(np, "clock-frequency", &bus_freq))
 		bus_freq = 100000; /* 100kHz by default */
 
 	if (!mv64xxx_find_baud_factors(bus_freq, tclk,
diff --git a/drivers/i2c/busses/i2c-rcar.c b/drivers/i2c/busses/i2c-rcar.c
index f3c7139..1cc146c 100644
--- a/drivers/i2c/busses/i2c-rcar.c
+++ b/drivers/i2c/busses/i2c-rcar.c
@@ -34,6 +34,7 @@
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 
 /* register offsets */
 #define ICSCR	0x00	/* slave ctrl */
@@ -95,6 +96,7 @@
 	struct i2c_msg	*msg;
 	struct clk *clk;
 
+	spinlock_t lock;
 	wait_queue_head_t wait;
 
 	int pos;
@@ -365,20 +367,20 @@
 	struct rcar_i2c_priv *priv = ptr;
 	u32 msr;
 
+	/*-------------- spin lock -----------------*/
+	spin_lock(&priv->lock);
+
 	msr = rcar_i2c_read(priv, ICMSR);
 
+	/* Only handle interrupts that are currently enabled */
+	msr &= rcar_i2c_read(priv, ICMIER);
+
 	/* Arbitration lost */
 	if (msr & MAL) {
 		rcar_i2c_flags_set(priv, (ID_DONE | ID_ARBLOST));
 		goto out;
 	}
 
-	/* Stop */
-	if (msr & MST) {
-		rcar_i2c_flags_set(priv, ID_DONE);
-		goto out;
-	}
-
 	/* Nack */
 	if (msr & MNR) {
 		/* go to stop phase */
@@ -388,6 +390,12 @@
 		goto out;
 	}
 
+	/* Stop */
+	if (msr & MST) {
+		rcar_i2c_flags_set(priv, ID_DONE);
+		goto out;
+	}
+
 	if (rcar_i2c_is_recv(priv))
 		rcar_i2c_flags_set(priv, rcar_i2c_irq_recv(priv, msr));
 	else
@@ -400,6 +408,9 @@
 		wake_up(&priv->wait);
 	}
 
+	spin_unlock(&priv->lock);
+	/*-------------- spin unlock -----------------*/
+
 	return IRQ_HANDLED;
 }
 
@@ -409,14 +420,21 @@
 {
 	struct rcar_i2c_priv *priv = i2c_get_adapdata(adap);
 	struct device *dev = rcar_i2c_priv_to_dev(priv);
+	unsigned long flags;
 	int i, ret, timeout;
 
 	pm_runtime_get_sync(dev);
 
+	/*-------------- spin lock -----------------*/
+	spin_lock_irqsave(&priv->lock, flags);
+
 	rcar_i2c_init(priv);
 	/* start clock */
 	rcar_i2c_write(priv, ICCCR, priv->icccr);
 
+	spin_unlock_irqrestore(&priv->lock, flags);
+	/*-------------- spin unlock -----------------*/
+
 	ret = rcar_i2c_bus_barrier(priv);
 	if (ret < 0)
 		goto out;
@@ -428,6 +446,9 @@
 			break;
 		}
 
+		/*-------------- spin lock -----------------*/
+		spin_lock_irqsave(&priv->lock, flags);
+
 		/* init each data */
 		priv->msg	= &msgs[i];
 		priv->pos	= 0;
@@ -437,6 +458,9 @@
 
 		ret = rcar_i2c_prepare_msg(priv);
 
+		spin_unlock_irqrestore(&priv->lock, flags);
+		/*-------------- spin unlock -----------------*/
+
 		if (ret < 0)
 			break;
 
@@ -540,6 +564,7 @@
 
 	irq = platform_get_irq(pdev, 0);
 	init_waitqueue_head(&priv->wait);
+	spin_lock_init(&priv->lock);
 
 	adap = &priv->adap;
 	adap->nr = pdev->id;
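
The locking scheme introduced above is the standard ISR-vs-process split:
the interrupt handler takes the plain spin_lock() (it already runs in
hard-irq context), while the transfer path uses spin_lock_irqsave() so the
interrupt cannot fire on the same CPU mid-update. A minimal sketch with
placeholder names:

	struct demo_priv {
		spinlock_t lock;	/* protects controller registers/state */
		u32 flags;
	};

	static irqreturn_t demo_isr(int irq, void *ptr)
	{
		struct demo_priv *priv = ptr;

		spin_lock(&priv->lock);		/* hard-irq context */
		/* ... read and ack status, update priv->flags ... */
		spin_unlock(&priv->lock);
		return IRQ_HANDLED;
	}

	static void demo_start_xfer(struct demo_priv *priv)
	{
		unsigned long flags;

		spin_lock_irqsave(&priv->lock, flags);	/* blocks the ISR */
		/* ... set up per-message state, kick the controller ... */
		spin_unlock_irqrestore(&priv->lock, flags);
	}
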
diff --git a/drivers/i2c/busses/i2c-rk3x.c b/drivers/i2c/busses/i2c-rk3x.c
index 69e1185..e637c32 100644
--- a/drivers/i2c/busses/i2c-rk3x.c
+++ b/drivers/i2c/busses/i2c-rk3x.c
@@ -323,6 +323,10 @@
 	/* ack interrupt */
 	i2c_writel(i2c, REG_INT_MBRF, REG_IPD);
 
+	/* Can only handle a maximum of 32 bytes at a time */
+	if (len > 32)
+		len = 32;
+
 	/* read the data from receive buffer */
 	for (i = 0; i < len; ++i) {
 		if (i % 4 == 0)
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index e1e558a..af82563 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1089,6 +1089,30 @@
 	return err;
 }
 
+static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_attr,
+				    u64 *reg_id)
+{
+	void *ib_flow;
+	union ib_flow_spec *ib_spec;
+	struct mlx4_dev	*dev = to_mdev(qp->device)->dev;
+	int err = 0;
+
+	if (dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
+		return 0; /* do nothing */
+
+	ib_flow = flow_attr + 1;
+	ib_spec = (union ib_flow_spec *)ib_flow;
+
+	if (ib_spec->type !=  IB_FLOW_SPEC_ETH || flow_attr->num_of_specs != 1)
+		return 0; /* do nothing */
+
+	err = mlx4_tunnel_steer_add(to_mdev(qp->device)->dev, ib_spec->eth.val.dst_mac,
+				    flow_attr->port, qp->qp_num,
+				    MLX4_DOMAIN_UVERBS | (flow_attr->priority & 0xff),
+				    reg_id);
+	return err;
+}
+
 static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp,
 				    struct ib_flow_attr *flow_attr,
 				    int domain)
@@ -1136,6 +1160,12 @@
 		i++;
 	}
 
+	if (i < ARRAY_SIZE(type) && flow_attr->type == IB_FLOW_ATTR_NORMAL) {
+		err = mlx4_ib_tunnel_steer_add(qp, flow_attr, &mflow->reg_id[i]);
+		if (err)
+			goto err_free;
+	}
+
 	return &mflow->ibflow;
 
 err_free:
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 6778045..efb9eff 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1677,9 +1677,15 @@
 		}
 	}
 
-	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET)
+	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
 		context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |
 					MLX4_IB_LINK_TYPE_ETH;
+		if (dev->dev->caps.tunnel_offload_mode ==  MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+			/* set QP to receive both tunneled & non-tunneled packets */
+			if (!(context->flags & (1 << MLX4_RSS_QPC_FLAG_OFFSET)))
+				context->srqn = cpu_to_be32(7 << 28);
+		}
+	}
 
 	if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) {
 		int is_eth = rdma_port_get_link_layer(
diff --git a/drivers/input/input-mt.c b/drivers/input/input-mt.c
index c30204f..fbe29fc 100644
--- a/drivers/input/input-mt.c
+++ b/drivers/input/input-mt.c
@@ -236,6 +236,18 @@
 }
 EXPORT_SYMBOL(input_mt_report_pointer_emulation);
 
+static void __input_mt_drop_unused(struct input_dev *dev, struct input_mt *mt)
+{
+	int i;
+
+	for (i = 0; i < mt->num_slots; i++) {
+		if (!input_mt_is_used(mt, &mt->slots[i])) {
+			input_mt_slot(dev, i);
+			input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1);
+		}
+	}
+}
+
 /**
  * input_mt_drop_unused() - Inactivate slots not seen in this frame
  * @dev: input device with allocated MT slots
@@ -245,19 +257,11 @@
 void input_mt_drop_unused(struct input_dev *dev)
 {
 	struct input_mt *mt = dev->mt;
-	int i;
 
-	if (!mt)
-		return;
-
-	for (i = 0; i < mt->num_slots; i++) {
-		if (!input_mt_is_used(mt, &mt->slots[i])) {
-			input_mt_slot(dev, i);
-			input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1);
-		}
+	if (mt) {
+		__input_mt_drop_unused(dev, mt);
+		mt->frame++;
 	}
-
-	mt->frame++;
 }
 EXPORT_SYMBOL(input_mt_drop_unused);
 
@@ -278,12 +282,14 @@
 		return;
 
 	if (mt->flags & INPUT_MT_DROP_UNUSED)
-		input_mt_drop_unused(dev);
+		__input_mt_drop_unused(dev, mt);
 
 	if ((mt->flags & INPUT_MT_POINTER) && !(mt->flags & INPUT_MT_SEMI_MT))
 		use_count = true;
 
 	input_mt_report_pointer_emulation(dev, use_count);
+
+	mt->frame++;
 }
 EXPORT_SYMBOL(input_mt_sync_frame);
 
diff --git a/drivers/input/mouse/alps.c b/drivers/input/mouse/alps.c
index a59a1a6..a956b98 100644
--- a/drivers/input/mouse/alps.c
+++ b/drivers/input/mouse/alps.c
@@ -2234,8 +2234,8 @@
 		return 0;
 	}
 
-	psmouse_info(psmouse,
-		     "Unknown ALPS touchpad: E7=%3ph, EC=%3ph\n", e7, ec);
+	psmouse_dbg(psmouse,
+		    "Likely not an ALPS touchpad: E7=%3ph, EC=%3ph\n", e7, ec);
 
 	return -EINVAL;
 }
diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c
index ee2a04d..da51738 100644
--- a/drivers/input/mouse/elantech.c
+++ b/drivers/input/mouse/elantech.c
@@ -18,6 +18,7 @@
 #include <linux/input/mt.h>
 #include <linux/serio.h>
 #include <linux/libps2.h>
+#include <asm/unaligned.h>
 #include "psmouse.h"
 #include "elantech.h"
 
@@ -403,6 +404,68 @@
 	input_sync(dev);
 }
 
+static void elantech_report_trackpoint(struct psmouse *psmouse,
+				       int packet_type)
+{
+	/*
+	 * byte 0:  0   0  sx  sy   0   M   R   L
+	 * byte 1:~sx   0   0   0   0   0   0   0
+	 * byte 2:~sy   0   0   0   0   0   0   0
+	 * byte 3:  0   0 ~sy ~sx   0   1   1   0
+	 * byte 4: x7  x6  x5  x4  x3  x2  x1  x0
+	 * byte 5: y7  y6  y5  y4  y3  y2  y1  y0
+	 *
+	 * x and y are written in two's complement spread
+	 * over 9 bits with sx/sy the relative top bit and
+	 * x7..x0 and y7..y0 the lower bits.
+	 * The sign of y is opposite to what the input driver
+	 * expects for a relative movement
+	 */
+
+	struct elantech_data *etd = psmouse->private;
+	struct input_dev *tp_dev = etd->tp_dev;
+	unsigned char *packet = psmouse->packet;
+	int x, y;
+	u32 t;
+
+	if (dev_WARN_ONCE(&psmouse->ps2dev.serio->dev,
+			  !tp_dev,
+			  psmouse_fmt("Unexpected trackpoint message\n"))) {
+		if (etd->debug == 1)
+			elantech_packet_dump(psmouse);
+		return;
+	}
+
+	t = get_unaligned_le32(&packet[0]);
+
+	switch (t & ~7U) {
+	case 0x06000030U:
+	case 0x16008020U:
+	case 0x26800010U:
+	case 0x36808000U:
+		x = packet[4] - (int)((packet[1]^0x80) << 1);
+		y = (int)((packet[2]^0x80) << 1) - packet[5];
+
+		input_report_key(tp_dev, BTN_LEFT, packet[0] & 0x01);
+		input_report_key(tp_dev, BTN_RIGHT, packet[0] & 0x02);
+		input_report_key(tp_dev, BTN_MIDDLE, packet[0] & 0x04);
+
+		input_report_rel(tp_dev, REL_X, x);
+		input_report_rel(tp_dev, REL_Y, y);
+
+		input_sync(tp_dev);
+
+		break;
+
+	default:
+		/* Dump unexpected packet sequences if debug=1 (default) */
+		if (etd->debug == 1)
+			elantech_packet_dump(psmouse);
+
+		break;
+	}
+}
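
To make the 9-bit two's-complement packing concrete, here is a worked
decode for an invented packet matching the 0x06000030 signature above
(sx = sy = 1, left button held):

	/*
	 * packet = { 0x31, 0x00, 0x00, 0x06, 0xfb, 0xfe }
	 *
	 * x = packet[4] - (int)((packet[1] ^ 0x80) << 1)
	 *   = 0xfb - (0x80 << 1) = 251 - 256 = -5
	 * y = (int)((packet[2] ^ 0x80) << 1) - packet[5]
	 *   = (0x80 << 1) - 0xfe = 256 - 254 = 2
	 *
	 * packet[0] & 0x01 -> BTN_LEFT reported as pressed
	 */
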
+
 /*
  * Interpret complete data packets and report absolute mode input events for
  * hardware version 3. (12 byte packets for two fingers)
@@ -715,6 +778,8 @@
 
 		if ((packet[0] & 0x0c) == 0x0c && (packet[3] & 0xce) == 0x0c)
 			return PACKET_V3_TAIL;
+		if ((packet[3] & 0x0f) == 0x06)
+			return PACKET_TRACKPOINT;
 	}
 
 	return PACKET_UNKNOWN;
@@ -791,14 +856,23 @@
 
 	case 3:
 		packet_type = elantech_packet_check_v3(psmouse);
-		/* ignore debounce */
-		if (packet_type == PACKET_DEBOUNCE)
-			return PSMOUSE_FULL_PACKET;
-
-		if (packet_type == PACKET_UNKNOWN)
+		switch (packet_type) {
+		case PACKET_UNKNOWN:
 			return PSMOUSE_BAD_DATA;
 
-		elantech_report_absolute_v3(psmouse, packet_type);
+		case PACKET_DEBOUNCE:
+			/* ignore debounce */
+			break;
+
+		case PACKET_TRACKPOINT:
+			elantech_report_trackpoint(psmouse, packet_type);
+			break;
+
+		default:
+			elantech_report_absolute_v3(psmouse, packet_type);
+			break;
+		}
+
 		break;
 
 	case 4:
@@ -1018,8 +1092,10 @@
  * Asus UX31               0x361f00        20, 15, 0e      clickpad
  * Asus UX32VD             0x361f02        00, 15, 0e      clickpad
  * Avatar AVIU-145A2       0x361f00        ?               clickpad
+ * Fujitsu H730            0x570f00        c0, 14, 0c      3 hw buttons (**)
  * Gigabyte U2442          0x450f01        58, 17, 0c      2 hw buttons
  * Lenovo L430             0x350f02        b9, 15, 0c      2 hw buttons (*)
+ * Lenovo L530             0x350f02        b9, 15, 0c      2 hw buttons (*)
  * Samsung NF210           0x150b00        78, 14, 0a      2 hw buttons
  * Samsung NP770Z5E        0x575f01        10, 15, 0f      clickpad
  * Samsung NP700Z5B        0x361f06        21, 15, 0f      clickpad
@@ -1029,6 +1105,8 @@
  * Samsung RF710           0x450f00        ?               2 hw buttons
  * System76 Pangolin       0x250f01        ?               2 hw buttons
  * (*) + 3 trackpoint buttons
+ * (**) + 0 trackpoint buttons
+ * Note: Lenovo L430 and Lenovo L530 have the same fw_version/caps
  */
 static void elantech_set_buttonpad_prop(struct psmouse *psmouse)
 {
@@ -1324,6 +1402,10 @@
  */
 static void elantech_disconnect(struct psmouse *psmouse)
 {
+	struct elantech_data *etd = psmouse->private;
+
+	if (etd->tp_dev)
+		input_unregister_device(etd->tp_dev);
 	sysfs_remove_group(&psmouse->ps2dev.serio->dev.kobj,
 			   &elantech_attr_group);
 	kfree(psmouse->private);
@@ -1438,8 +1520,10 @@
 int elantech_init(struct psmouse *psmouse)
 {
 	struct elantech_data *etd;
-	int i, error;
+	int i;
+	int error = -EINVAL;
 	unsigned char param[3];
+	struct input_dev *tp_dev;
 
 	psmouse->private = etd = kzalloc(sizeof(struct elantech_data), GFP_KERNEL);
 	if (!etd)
@@ -1498,14 +1582,49 @@
 		goto init_fail;
 	}
 
+	/* The MSB indicates the presence of the trackpoint */
+	if ((etd->capabilities[0] & 0x80) == 0x80) {
+		tp_dev = input_allocate_device();
+
+		if (!tp_dev) {
+			error = -ENOMEM;
+			goto init_fail_tp_alloc;
+		}
+
+		etd->tp_dev = tp_dev;
+		snprintf(etd->tp_phys, sizeof(etd->tp_phys), "%s/input1",
+			psmouse->ps2dev.serio->phys);
+		tp_dev->phys = etd->tp_phys;
+		tp_dev->name = "Elantech PS/2 TrackPoint";
+		tp_dev->id.bustype = BUS_I8042;
+		tp_dev->id.vendor  = 0x0002;
+		tp_dev->id.product = PSMOUSE_ELANTECH;
+		tp_dev->id.version = 0x0000;
+		tp_dev->dev.parent = &psmouse->ps2dev.serio->dev;
+		tp_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL);
+		tp_dev->relbit[BIT_WORD(REL_X)] =
+			BIT_MASK(REL_X) | BIT_MASK(REL_Y);
+		tp_dev->keybit[BIT_WORD(BTN_LEFT)] =
+			BIT_MASK(BTN_LEFT) | BIT_MASK(BTN_MIDDLE) |
+			BIT_MASK(BTN_RIGHT);
+		error = input_register_device(etd->tp_dev);
+		if (error < 0)
+			goto init_fail_tp_reg;
+	}
+
 	psmouse->protocol_handler = elantech_process_byte;
 	psmouse->disconnect = elantech_disconnect;
 	psmouse->reconnect = elantech_reconnect;
 	psmouse->pktsize = etd->hw_version > 1 ? 6 : 4;
 
 	return 0;
-
+ init_fail_tp_reg:
+	input_free_device(tp_dev);
+ init_fail_tp_alloc:
+	sysfs_remove_group(&psmouse->ps2dev.serio->dev.kobj,
+			   &elantech_attr_group);
  init_fail:
+	psmouse_reset(psmouse);
 	kfree(etd);
-	return -1;
+	return error;
 }
diff --git a/drivers/input/mouse/elantech.h b/drivers/input/mouse/elantech.h
index 9e0e2a1..6f3afec 100644
--- a/drivers/input/mouse/elantech.h
+++ b/drivers/input/mouse/elantech.h
@@ -94,6 +94,7 @@
 #define PACKET_V4_HEAD			0x05
 #define PACKET_V4_MOTION		0x06
 #define PACKET_V4_STATUS		0x07
+#define PACKET_TRACKPOINT		0x08
 
 /*
  * track up to 5 fingers for v4 hardware
@@ -114,6 +115,8 @@
 };
 
 struct elantech_data {
+	struct input_dev *tp_dev;	/* Relative device for trackpoint */
+	char tp_phys[32];
 	unsigned char reg_07;
 	unsigned char reg_10;
 	unsigned char reg_11;
diff --git a/drivers/input/serio/i8042-sparcio.h b/drivers/input/serio/i8042-sparcio.h
index d6aa4c6..93cb791 100644
--- a/drivers/input/serio/i8042-sparcio.h
+++ b/drivers/input/serio/i8042-sparcio.h
@@ -17,7 +17,6 @@
 #define I8042_MUX_PHYS_DESC "sparcps2/serio%d"
 
 static void __iomem *kbd_iobase;
-static struct resource *kbd_res;
 
 #define I8042_COMMAND_REG	(kbd_iobase + 0x64UL)
 #define I8042_DATA_REG		(kbd_iobase + 0x60UL)
@@ -44,6 +43,8 @@
 
 #ifdef CONFIG_PCI
 
+static struct resource *kbd_res;
+
 #define OBP_PS2KBD_NAME1	"kb_ps2"
 #define OBP_PS2KBD_NAME2	"keyboard"
 #define OBP_PS2MS_NAME1		"kdmouse"
diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 129729d..aa29198 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -15,10 +15,10 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/device.h>
+#include <linux/timer.h>
 #include <linux/err.h>
 #include <linux/ctype.h>
 #include <linux/leds.h>
-#include <linux/workqueue.h>
 #include "leds.h"
 
 static struct class *leds_class;
@@ -97,10 +97,9 @@
 	NULL,
 };
 
-static void led_work_function(struct work_struct *ws)
+static void led_timer_function(unsigned long data)
 {
-	struct led_classdev *led_cdev =
-		container_of(ws, struct led_classdev, blink_work.work);
+	struct led_classdev *led_cdev = (void *)data;
 	unsigned long brightness;
 	unsigned long delay;
 
@@ -144,8 +143,7 @@
 		}
 	}
 
-	queue_delayed_work(system_wq, &led_cdev->blink_work,
-			   msecs_to_jiffies(delay));
+	mod_timer(&led_cdev->blink_timer, jiffies + msecs_to_jiffies(delay));
 }
 
 static void set_brightness_delayed(struct work_struct *ws)
@@ -233,7 +231,9 @@
 
 	INIT_WORK(&led_cdev->set_brightness_work, set_brightness_delayed);
 
-	INIT_DELAYED_WORK(&led_cdev->blink_work, led_work_function);
+	init_timer(&led_cdev->blink_timer);
+	led_cdev->blink_timer.function = led_timer_function;
+	led_cdev->blink_timer.data = (unsigned long)led_cdev;
 
 #ifdef CONFIG_LEDS_TRIGGERS
 	led_trigger_set_default(led_cdev);
diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c
index 4bb1168..71b40d3 100644
--- a/drivers/leds/led-core.c
+++ b/drivers/leds/led-core.c
@@ -16,7 +16,6 @@
 #include <linux/module.h>
 #include <linux/rwsem.h>
 #include <linux/leds.h>
-#include <linux/workqueue.h>
 #include "leds.h"
 
 DECLARE_RWSEM(leds_list_lock);
@@ -52,7 +51,7 @@
 		return;
 	}
 
-	queue_delayed_work(system_wq, &led_cdev->blink_work, 1);
+	mod_timer(&led_cdev->blink_timer, jiffies + 1);
 }
 
 
@@ -76,7 +75,7 @@
 		   unsigned long *delay_on,
 		   unsigned long *delay_off)
 {
-	cancel_delayed_work_sync(&led_cdev->blink_work);
+	del_timer_sync(&led_cdev->blink_timer);
 
 	led_cdev->flags &= ~LED_BLINK_ONESHOT;
 	led_cdev->flags &= ~LED_BLINK_ONESHOT_STOP;
@@ -91,7 +90,7 @@
 			   int invert)
 {
 	if ((led_cdev->flags & LED_BLINK_ONESHOT) &&
-	     delayed_work_pending(&led_cdev->blink_work))
+	     timer_pending(&led_cdev->blink_timer))
 		return;
 
 	led_cdev->flags |= LED_BLINK_ONESHOT;
@@ -108,7 +107,7 @@
 
 void led_stop_software_blink(struct led_classdev *led_cdev)
 {
-	cancel_delayed_work_sync(&led_cdev->blink_work);
+	del_timer_sync(&led_cdev->blink_timer);
 	led_cdev->blink_delay_on = 0;
 	led_cdev->blink_delay_off = 0;
 }
@@ -117,7 +116,7 @@
 void led_set_brightness(struct led_classdev *led_cdev,
 			enum led_brightness brightness)
 {
-	/* delay brightness setting if need to stop soft-blink work */
+	/* delay brightness setting if need to stop soft-blink timer */
 	if (led_cdev->blink_delay_on || led_cdev->blink_delay_off) {
 		led_cdev->delayed_set_value = brightness;
 		schedule_work(&led_cdev->set_brightness_work);
diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index 5a4bfe3..46c4643 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -1434,6 +1434,10 @@
 
 				mutex_lock(&chip->mutex);
 				ret = get_chip(map, chip, base, FL_LOCKING);
+				if (ret) {
+					mutex_unlock(&chip->mutex);
+					return ret;
+				}
 
 				/* Enter lock register command */
 				cfi_send_gen_cmd(0xAA, cfi->addr_unlock1,
diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
index 059c741..3fe45c7 100644
--- a/drivers/net/ethernet/3com/3c59x.c
+++ b/drivers/net/ethernet/3com/3c59x.c
@@ -2177,10 +2177,10 @@
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
 			vp->tx_ring[entry].frag[i+1].addr =
-					cpu_to_le32(pci_map_single(
-						VORTEX_PCI(vp),
-						(void *)skb_frag_address(frag),
-						skb_frag_size(frag), PCI_DMA_TODEVICE));
+					cpu_to_le32(skb_frag_dma_map(
+						&VORTEX_PCI(vp)->dev,
+						frag,
+						frag->page_offset, frag->size, DMA_TO_DEVICE));
 
 			if (i == skb_shinfo(skb)->nr_frags-1)
 					vp->tx_ring[entry].frag[i+1].length = cpu_to_le32(skb_frag_size(frag)|LAST_FRAG);
diff --git a/drivers/net/ethernet/aeroflex/greth.c b/drivers/net/ethernet/aeroflex/greth.c
index 23578df..3005155 100644
--- a/drivers/net/ethernet/aeroflex/greth.c
+++ b/drivers/net/ethernet/aeroflex/greth.c
@@ -123,6 +123,12 @@
 	GRETH_REGORIN(greth->regs->control, GRETH_TXEN);
 }
 
+static inline void greth_enable_tx_and_irq(struct greth_private *greth)
+{
+	wmb(); /* BDs must have been written to memory before enabling TX */
+	GRETH_REGORIN(greth->regs->control, GRETH_TXEN | GRETH_TXI);
+}
+
 static inline void greth_disable_tx(struct greth_private *greth)
 {
 	GRETH_REGANDIN(greth->regs->control, ~GRETH_TXEN);
@@ -447,29 +453,30 @@
 	return err;
 }
 
+static inline u16 greth_num_free_bds(u16 tx_last, u16 tx_next)
+{
+	if (tx_next < tx_last)
+		return (tx_last - tx_next) - 1;
+	else
+		return GRETH_TXBD_NUM - (tx_next - tx_last) - 1;
+}
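
greth_num_free_bds() above is standard circular-ring arithmetic with one
descriptor deliberately kept unused so that a full ring can be told apart
from an empty one. A worked example, assuming GRETH_TXBD_NUM is 128 as in
this driver:

	/*
	 * tx_last = 14, tx_next = 10 (no wrap):
	 *	free = (14 - 10) - 1 = 3
	 * tx_last = 10, tx_next = 14 (wrapped):
	 *	free = 128 - (14 - 10) - 1 = 123
	 * The "- 1" reserves one slot, so tx_next == tx_last can only
	 * ever mean "ring empty".
	 */
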
 
 static netdev_tx_t
 greth_start_xmit_gbit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct greth_private *greth = netdev_priv(dev);
 	struct greth_bd *bdp;
-	u32 status = 0, dma_addr, ctrl;
+	u32 status, dma_addr;
 	int curr_tx, nr_frags, i, err = NETDEV_TX_OK;
 	unsigned long flags;
+	u16 tx_last;
 
 	nr_frags = skb_shinfo(skb)->nr_frags;
+	tx_last = greth->tx_last;
+	rmb(); /* tx_last is updated by the poll task */
 
-	/* Clean TX Ring */
-	greth_clean_tx_gbit(dev);
-
-	if (greth->tx_free < nr_frags + 1) {
-		spin_lock_irqsave(&greth->devlock, flags);/*save from poll/irq*/
-		ctrl = GRETH_REGLOAD(greth->regs->control);
-		/* Enable TX IRQ only if not already in poll() routine */
-		if (ctrl & GRETH_RXI)
-			GRETH_REGSAVE(greth->regs->control, ctrl | GRETH_TXI);
+	if (greth_num_free_bds(tx_last, greth->tx_next) < nr_frags + 1) {
 		netif_stop_queue(dev);
-		spin_unlock_irqrestore(&greth->devlock, flags);
 		err = NETDEV_TX_BUSY;
 		goto out;
 	}
@@ -488,6 +495,8 @@
 	/* Linear buf */
 	if (nr_frags != 0)
 		status = GRETH_TXBD_MORE;
+	else
+		status = GRETH_BD_IE;
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		status |= GRETH_TXBD_CSALL;
@@ -545,14 +554,12 @@
 
 	/* Enable the descriptor chain by enabling the first descriptor */
 	bdp = greth->tx_bd_base + greth->tx_next;
-	greth_write_bd(&bdp->stat, greth_read_bd(&bdp->stat) | GRETH_BD_EN);
-	greth->tx_next = curr_tx;
-	greth->tx_free -= nr_frags + 1;
-
-	wmb();
+	greth_write_bd(&bdp->stat,
+		       greth_read_bd(&bdp->stat) | GRETH_BD_EN);
 
 	spin_lock_irqsave(&greth->devlock, flags); /*save from poll/irq*/
-	greth_enable_tx(greth);
+	greth->tx_next = curr_tx;
+	greth_enable_tx_and_irq(greth);
 	spin_unlock_irqrestore(&greth->devlock, flags);
 
 	return NETDEV_TX_OK;
@@ -648,7 +655,6 @@
 	if (greth->tx_free > 0) {
 		netif_wake_queue(dev);
 	}
-
 }
 
 static inline void greth_update_tx_stats(struct net_device *dev, u32 stat)
@@ -670,20 +676,22 @@
 {
 	struct greth_private *greth;
 	struct greth_bd *bdp, *bdp_last_frag;
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	u32 stat;
 	int nr_frags, i;
+	u16 tx_last;
 
 	greth = netdev_priv(dev);
+	tx_last = greth->tx_last;
 
-	while (greth->tx_free < GRETH_TXBD_NUM) {
+	while (tx_last != greth->tx_next) {
 
-		skb = greth->tx_skbuff[greth->tx_last];
+		skb = greth->tx_skbuff[tx_last];
 
 		nr_frags = skb_shinfo(skb)->nr_frags;
 
 		/* We only clean fully completed SKBs */
-		bdp_last_frag = greth->tx_bd_base + SKIP_TX(greth->tx_last, nr_frags);
+		bdp_last_frag = greth->tx_bd_base + SKIP_TX(tx_last, nr_frags);
 
 		GRETH_REGSAVE(greth->regs->status, GRETH_INT_TE | GRETH_INT_TX);
 		mb();
@@ -692,14 +700,14 @@
 		if (stat & GRETH_BD_EN)
 			break;
 
-		greth->tx_skbuff[greth->tx_last] = NULL;
+		greth->tx_skbuff[tx_last] = NULL;
 
 		greth_update_tx_stats(dev, stat);
 		dev->stats.tx_bytes += skb->len;
 
-		bdp = greth->tx_bd_base + greth->tx_last;
+		bdp = greth->tx_bd_base + tx_last;
 
-		greth->tx_last = NEXT_TX(greth->tx_last);
+		tx_last = NEXT_TX(tx_last);
 
 		dma_unmap_single(greth->dev,
 				 greth_read_bd(&bdp->addr),
@@ -708,21 +716,26 @@
 
 		for (i = 0; i < nr_frags; i++) {
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-			bdp = greth->tx_bd_base + greth->tx_last;
+			bdp = greth->tx_bd_base + tx_last;
 
 			dma_unmap_page(greth->dev,
 				       greth_read_bd(&bdp->addr),
 				       skb_frag_size(frag),
 				       DMA_TO_DEVICE);
 
-			greth->tx_last = NEXT_TX(greth->tx_last);
+			tx_last = NEXT_TX(tx_last);
 		}
-		greth->tx_free += nr_frags+1;
 		dev_kfree_skb(skb);
 	}
+	if (skb) { /* skb is set only if the above while loop was entered */
+		wmb();
+		greth->tx_last = tx_last;
 
-	if (netif_queue_stopped(dev) && (greth->tx_free > (MAX_SKB_FRAGS+1)))
-		netif_wake_queue(dev);
+		if (netif_queue_stopped(dev) &&
+		    (greth_num_free_bds(tx_last, greth->tx_next) >
+		    (MAX_SKB_FRAGS+1)))
+			netif_wake_queue(dev);
+	}
 }
 
 static int greth_rx(struct net_device *dev, int limit)
@@ -965,16 +978,12 @@
 	greth = container_of(napi, struct greth_private, napi);
 
 restart_txrx_poll:
-	if (netif_queue_stopped(greth->netdev)) {
-		if (greth->gbit_mac)
-			greth_clean_tx_gbit(greth->netdev);
-		else
-			greth_clean_tx(greth->netdev);
-	}
-
 	if (greth->gbit_mac) {
+		greth_clean_tx_gbit(greth->netdev);
 		work_done += greth_rx_gbit(greth->netdev, budget - work_done);
 	} else {
+		if (netif_queue_stopped(greth->netdev))
+			greth_clean_tx(greth->netdev);
 		work_done += greth_rx(greth->netdev, budget - work_done);
 	}
 
@@ -983,7 +992,8 @@
 		spin_lock_irqsave(&greth->devlock, flags);
 
 		ctrl = GRETH_REGLOAD(greth->regs->control);
-		if (netif_queue_stopped(greth->netdev)) {
+		if ((greth->gbit_mac && (greth->tx_last != greth->tx_next)) ||
+		    (!greth->gbit_mac && netif_queue_stopped(greth->netdev))) {
 			GRETH_REGSAVE(greth->regs->control,
 					ctrl | GRETH_TXI | GRETH_RXI);
 			mask = GRETH_INT_RX | GRETH_INT_RE |
diff --git a/drivers/net/ethernet/aeroflex/greth.h b/drivers/net/ethernet/aeroflex/greth.h
index 232a622..ae16ac9 100644
--- a/drivers/net/ethernet/aeroflex/greth.h
+++ b/drivers/net/ethernet/aeroflex/greth.h
@@ -107,7 +107,7 @@
 
 	u16 tx_next;
 	u16 tx_last;
-	u16 tx_free;
+	u16 tx_free; /* only used on 10/100Mbit */
 	u16 rx_cur;
 
 	struct greth_regs *regs;	/* Address of controller registers. */
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
index 346592d..a3c1135 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
@@ -272,8 +272,8 @@
 	struct xgbe_prv_data *pdata = filp->private_data;
 	unsigned int value;
 
-	value = pdata->hw_if.read_mmd_regs(pdata, pdata->debugfs_xpcs_mmd,
-					   pdata->debugfs_xpcs_reg);
+	value = XMDIO_READ(pdata, pdata->debugfs_xpcs_mmd,
+			   pdata->debugfs_xpcs_reg);
 
 	return xgbe_common_read(buffer, count, ppos, value);
 }
@@ -290,8 +290,8 @@
 	if (len < 0)
 		return len;
 
-	pdata->hw_if.write_mmd_regs(pdata, pdata->debugfs_xpcs_mmd,
-				    pdata->debugfs_xpcs_reg, value);
+	XMDIO_WRITE(pdata, pdata->debugfs_xpcs_mmd, pdata->debugfs_xpcs_reg,
+		    value);
 
 	return len;
 }
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
index edaca44..ea27383 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
@@ -348,7 +348,7 @@
 
 	/* Clear MAC flow control */
 	max_q_count = XGMAC_MAX_FLOW_CONTROL_QUEUES;
-	q_count = min_t(unsigned int, pdata->rx_q_count, max_q_count);
+	q_count = min_t(unsigned int, pdata->tx_q_count, max_q_count);
 	reg = MAC_Q0TFCR;
 	for (i = 0; i < q_count; i++) {
 		reg_val = XGMAC_IOREAD(pdata, reg);
@@ -373,7 +373,7 @@
 
 	/* Set MAC flow control */
 	max_q_count = XGMAC_MAX_FLOW_CONTROL_QUEUES;
-	q_count = min_t(unsigned int, pdata->rx_q_count, max_q_count);
+	q_count = min_t(unsigned int, pdata->tx_q_count, max_q_count);
 	reg = MAC_Q0TFCR;
 	for (i = 0; i < q_count; i++) {
 		reg_val = XGMAC_IOREAD(pdata, reg);
@@ -509,8 +509,8 @@
 	XGMAC_IOWRITE(pdata, MAC_IER, mac_ier);
 
 	/* Enable all counter interrupts */
-	XGMAC_IOWRITE_BITS(pdata, MMC_RIER, ALL_INTERRUPTS, 0xff);
-	XGMAC_IOWRITE_BITS(pdata, MMC_TIER, ALL_INTERRUPTS, 0xff);
+	XGMAC_IOWRITE_BITS(pdata, MMC_RIER, ALL_INTERRUPTS, 0xffffffff);
+	XGMAC_IOWRITE_BITS(pdata, MMC_TIER, ALL_INTERRUPTS, 0xffffffff);
 }
 
 static int xgbe_set_gmii_speed(struct xgbe_prv_data *pdata)
@@ -1633,6 +1633,9 @@
 {
 	unsigned int i, count;
 
+	if (XGMAC_GET_BITS(pdata->hw_feat.version, MAC_VR, SNPSVER) < 0x21)
+		return 0;
+
 	for (i = 0; i < pdata->tx_q_count; i++)
 		XGMAC_MTL_IOWRITE_BITS(pdata, i, MTL_Q_TQOMR, FTQ, 1);
 
@@ -1703,8 +1706,8 @@
 	XGMAC_IOWRITE_BITS(pdata, MTL_OMR, RAA, MTL_RAA_SP);
 }
 
-static unsigned int xgbe_calculate_per_queue_fifo(unsigned long fifo_size,
-						  unsigned char queue_count)
+static unsigned int xgbe_calculate_per_queue_fifo(unsigned int fifo_size,
+						  unsigned int queue_count)
 {
 	unsigned int q_fifo_size = 0;
 	enum xgbe_mtl_fifo_size p_fifo = XGMAC_MTL_FIFO_SIZE_256;
@@ -1748,6 +1751,10 @@
 		q_fifo_size = XGBE_FIFO_SIZE_KB(256);
 		break;
 	}
+
+	/* The configured value is not the actual amount of fifo RAM */
+	q_fifo_size = min_t(unsigned int, XGBE_FIFO_MAX, q_fifo_size);
+
 	q_fifo_size = q_fifo_size / queue_count;
 
 	/* Set the queue fifo size programmable value */
@@ -1947,6 +1954,32 @@
 		xgbe_disable_rx_vlan_stripping(pdata);
 }
 
+static u64 xgbe_mmc_read(struct xgbe_prv_data *pdata, unsigned int reg_lo)
+{
+	bool read_hi;
+	u64 val;
+
+	switch (reg_lo) {
+	/* These registers are always 64 bit */
+	case MMC_TXOCTETCOUNT_GB_LO:
+	case MMC_TXOCTETCOUNT_G_LO:
+	case MMC_RXOCTETCOUNT_GB_LO:
+	case MMC_RXOCTETCOUNT_G_LO:
+		read_hi = true;
+		break;
+
+	default:
+		read_hi = false;
+	}
+
+	val = XGMAC_IOREAD(pdata, reg_lo);
+
+	if (read_hi)
+		val |= ((u64)XGMAC_IOREAD(pdata, reg_lo + 4) << 32);
+
+	return val;
+}
+
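
xgbe_mmc_read() above widens only the counters the hardware implements as
64-bit register pairs, fetching the high word from reg_lo + 4. A quick
worked example with invented raw values:

	/*
	 * XGMAC_IOREAD(pdata, MMC_TXOCTETCOUNT_GB_LO)     -> 0x89abcdef
	 * XGMAC_IOREAD(pdata, MMC_TXOCTETCOUNT_GB_LO + 4) -> 0x00000001
	 *
	 * val = 0x89abcdef | ((u64)0x00000001 << 32)
	 *     = 0x0000000189abcdef
	 *
	 * so a counter that has wrapped past 32 bits is accumulated
	 * without loss.
	 */
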
 static void xgbe_tx_mmc_int(struct xgbe_prv_data *pdata)
 {
 	struct xgbe_mmc_stats *stats = &pdata->mmc_stats;
@@ -1954,75 +1987,75 @@
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXOCTETCOUNT_GB))
 		stats->txoctetcount_gb +=
-			XGMAC_IOREAD(pdata, MMC_TXOCTETCOUNT_GB_LO);
+			xgbe_mmc_read(pdata, MMC_TXOCTETCOUNT_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXFRAMECOUNT_GB))
 		stats->txframecount_gb +=
-			XGMAC_IOREAD(pdata, MMC_TXFRAMECOUNT_GB_LO);
+			xgbe_mmc_read(pdata, MMC_TXFRAMECOUNT_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXBROADCASTFRAMES_G))
 		stats->txbroadcastframes_g +=
-			XGMAC_IOREAD(pdata, MMC_TXBROADCASTFRAMES_G_LO);
+			xgbe_mmc_read(pdata, MMC_TXBROADCASTFRAMES_G_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXMULTICASTFRAMES_G))
 		stats->txmulticastframes_g +=
-			XGMAC_IOREAD(pdata, MMC_TXMULTICASTFRAMES_G_LO);
+			xgbe_mmc_read(pdata, MMC_TXMULTICASTFRAMES_G_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TX64OCTETS_GB))
 		stats->tx64octets_gb +=
-			XGMAC_IOREAD(pdata, MMC_TX64OCTETS_GB_LO);
+			xgbe_mmc_read(pdata, MMC_TX64OCTETS_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TX65TO127OCTETS_GB))
 		stats->tx65to127octets_gb +=
-			XGMAC_IOREAD(pdata, MMC_TX65TO127OCTETS_GB_LO);
+			xgbe_mmc_read(pdata, MMC_TX65TO127OCTETS_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TX128TO255OCTETS_GB))
 		stats->tx128to255octets_gb +=
-			XGMAC_IOREAD(pdata, MMC_TX128TO255OCTETS_GB_LO);
+			xgbe_mmc_read(pdata, MMC_TX128TO255OCTETS_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TX256TO511OCTETS_GB))
 		stats->tx256to511octets_gb +=
-			XGMAC_IOREAD(pdata, MMC_TX256TO511OCTETS_GB_LO);
+			xgbe_mmc_read(pdata, MMC_TX256TO511OCTETS_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TX512TO1023OCTETS_GB))
 		stats->tx512to1023octets_gb +=
-			XGMAC_IOREAD(pdata, MMC_TX512TO1023OCTETS_GB_LO);
+			xgbe_mmc_read(pdata, MMC_TX512TO1023OCTETS_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TX1024TOMAXOCTETS_GB))
 		stats->tx1024tomaxoctets_gb +=
-			XGMAC_IOREAD(pdata, MMC_TX1024TOMAXOCTETS_GB_LO);
+			xgbe_mmc_read(pdata, MMC_TX1024TOMAXOCTETS_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXUNICASTFRAMES_GB))
 		stats->txunicastframes_gb +=
-			XGMAC_IOREAD(pdata, MMC_TXUNICASTFRAMES_GB_LO);
+			xgbe_mmc_read(pdata, MMC_TXUNICASTFRAMES_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXMULTICASTFRAMES_GB))
 		stats->txmulticastframes_gb +=
-			XGMAC_IOREAD(pdata, MMC_TXMULTICASTFRAMES_GB_LO);
+			xgbe_mmc_read(pdata, MMC_TXMULTICASTFRAMES_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXBROADCASTFRAMES_GB))
 		stats->txbroadcastframes_g +=
-			XGMAC_IOREAD(pdata, MMC_TXBROADCASTFRAMES_GB_LO);
+			xgbe_mmc_read(pdata, MMC_TXBROADCASTFRAMES_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXUNDERFLOWERROR))
 		stats->txunderflowerror +=
-			XGMAC_IOREAD(pdata, MMC_TXUNDERFLOWERROR_LO);
+			xgbe_mmc_read(pdata, MMC_TXUNDERFLOWERROR_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXOCTETCOUNT_G))
 		stats->txoctetcount_g +=
-			XGMAC_IOREAD(pdata, MMC_TXOCTETCOUNT_G_LO);
+			xgbe_mmc_read(pdata, MMC_TXOCTETCOUNT_G_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXFRAMECOUNT_G))
 		stats->txframecount_g +=
-			XGMAC_IOREAD(pdata, MMC_TXFRAMECOUNT_G_LO);
+			xgbe_mmc_read(pdata, MMC_TXFRAMECOUNT_G_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXPAUSEFRAMES))
 		stats->txpauseframes +=
-			XGMAC_IOREAD(pdata, MMC_TXPAUSEFRAMES_LO);
+			xgbe_mmc_read(pdata, MMC_TXPAUSEFRAMES_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXVLANFRAMES_G))
 		stats->txvlanframes_g +=
-			XGMAC_IOREAD(pdata, MMC_TXVLANFRAMES_G_LO);
+			xgbe_mmc_read(pdata, MMC_TXVLANFRAMES_G_LO);
 }
 
 static void xgbe_rx_mmc_int(struct xgbe_prv_data *pdata)
@@ -2032,95 +2065,95 @@
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXFRAMECOUNT_GB))
 		stats->rxframecount_gb +=
-			XGMAC_IOREAD(pdata, MMC_RXFRAMECOUNT_GB_LO);
+			xgbe_mmc_read(pdata, MMC_RXFRAMECOUNT_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXOCTETCOUNT_GB))
 		stats->rxoctetcount_gb +=
-			XGMAC_IOREAD(pdata, MMC_RXOCTETCOUNT_GB_LO);
+			xgbe_mmc_read(pdata, MMC_RXOCTETCOUNT_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXOCTETCOUNT_G))
 		stats->rxoctetcount_g +=
-			XGMAC_IOREAD(pdata, MMC_RXOCTETCOUNT_G_LO);
+			xgbe_mmc_read(pdata, MMC_RXOCTETCOUNT_G_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXBROADCASTFRAMES_G))
 		stats->rxbroadcastframes_g +=
-			XGMAC_IOREAD(pdata, MMC_RXBROADCASTFRAMES_G_LO);
+			xgbe_mmc_read(pdata, MMC_RXBROADCASTFRAMES_G_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXMULTICASTFRAMES_G))
 		stats->rxmulticastframes_g +=
-			XGMAC_IOREAD(pdata, MMC_RXMULTICASTFRAMES_G_LO);
+			xgbe_mmc_read(pdata, MMC_RXMULTICASTFRAMES_G_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXCRCERROR))
 		stats->rxcrcerror +=
-			XGMAC_IOREAD(pdata, MMC_RXCRCERROR_LO);
+			xgbe_mmc_read(pdata, MMC_RXCRCERROR_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXRUNTERROR))
 		stats->rxrunterror +=
-			XGMAC_IOREAD(pdata, MMC_RXRUNTERROR);
+			xgbe_mmc_read(pdata, MMC_RXRUNTERROR);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXJABBERERROR))
 		stats->rxjabbererror +=
-			XGMAC_IOREAD(pdata, MMC_RXJABBERERROR);
+			xgbe_mmc_read(pdata, MMC_RXJABBERERROR);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXUNDERSIZE_G))
 		stats->rxundersize_g +=
-			XGMAC_IOREAD(pdata, MMC_RXUNDERSIZE_G);
+			xgbe_mmc_read(pdata, MMC_RXUNDERSIZE_G);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXOVERSIZE_G))
 		stats->rxoversize_g +=
-			XGMAC_IOREAD(pdata, MMC_RXOVERSIZE_G);
+			xgbe_mmc_read(pdata, MMC_RXOVERSIZE_G);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RX64OCTETS_GB))
 		stats->rx64octets_gb +=
-			XGMAC_IOREAD(pdata, MMC_RX64OCTETS_GB_LO);
+			xgbe_mmc_read(pdata, MMC_RX64OCTETS_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RX65TO127OCTETS_GB))
 		stats->rx65to127octets_gb +=
-			XGMAC_IOREAD(pdata, MMC_RX65TO127OCTETS_GB_LO);
+			xgbe_mmc_read(pdata, MMC_RX65TO127OCTETS_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RX128TO255OCTETS_GB))
 		stats->rx128to255octets_gb +=
-			XGMAC_IOREAD(pdata, MMC_RX128TO255OCTETS_GB_LO);
+			xgbe_mmc_read(pdata, MMC_RX128TO255OCTETS_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RX256TO511OCTETS_GB))
 		stats->rx256to511octets_gb +=
-			XGMAC_IOREAD(pdata, MMC_RX256TO511OCTETS_GB_LO);
+			xgbe_mmc_read(pdata, MMC_RX256TO511OCTETS_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RX512TO1023OCTETS_GB))
 		stats->rx512to1023octets_gb +=
-			XGMAC_IOREAD(pdata, MMC_RX512TO1023OCTETS_GB_LO);
+			xgbe_mmc_read(pdata, MMC_RX512TO1023OCTETS_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RX1024TOMAXOCTETS_GB))
 		stats->rx1024tomaxoctets_gb +=
-			XGMAC_IOREAD(pdata, MMC_RX1024TOMAXOCTETS_GB_LO);
+			xgbe_mmc_read(pdata, MMC_RX1024TOMAXOCTETS_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXUNICASTFRAMES_G))
 		stats->rxunicastframes_g +=
-			XGMAC_IOREAD(pdata, MMC_RXUNICASTFRAMES_G_LO);
+			xgbe_mmc_read(pdata, MMC_RXUNICASTFRAMES_G_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXLENGTHERROR))
 		stats->rxlengtherror +=
-			XGMAC_IOREAD(pdata, MMC_RXLENGTHERROR_LO);
+			xgbe_mmc_read(pdata, MMC_RXLENGTHERROR_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXOUTOFRANGETYPE))
 		stats->rxoutofrangetype +=
-			XGMAC_IOREAD(pdata, MMC_RXOUTOFRANGETYPE_LO);
+			xgbe_mmc_read(pdata, MMC_RXOUTOFRANGETYPE_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXPAUSEFRAMES))
 		stats->rxpauseframes +=
-			XGMAC_IOREAD(pdata, MMC_RXPAUSEFRAMES_LO);
+			xgbe_mmc_read(pdata, MMC_RXPAUSEFRAMES_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXFIFOOVERFLOW))
 		stats->rxfifooverflow +=
-			XGMAC_IOREAD(pdata, MMC_RXFIFOOVERFLOW_LO);
+			xgbe_mmc_read(pdata, MMC_RXFIFOOVERFLOW_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXVLANFRAMES_GB))
 		stats->rxvlanframes_gb +=
-			XGMAC_IOREAD(pdata, MMC_RXVLANFRAMES_GB_LO);
+			xgbe_mmc_read(pdata, MMC_RXVLANFRAMES_GB_LO);
 
 	if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXWATCHDOGERROR))
 		stats->rxwatchdogerror +=
-			XGMAC_IOREAD(pdata, MMC_RXWATCHDOGERROR);
+			xgbe_mmc_read(pdata, MMC_RXWATCHDOGERROR);
 }
 
 static void xgbe_read_mmc_stats(struct xgbe_prv_data *pdata)
@@ -2131,127 +2164,127 @@
 	XGMAC_IOWRITE_BITS(pdata, MMC_CR, MCF, 1);
 
 	stats->txoctetcount_gb +=
-		XGMAC_IOREAD(pdata, MMC_TXOCTETCOUNT_GB_LO);
+		xgbe_mmc_read(pdata, MMC_TXOCTETCOUNT_GB_LO);
 
 	stats->txframecount_gb +=
-		XGMAC_IOREAD(pdata, MMC_TXFRAMECOUNT_GB_LO);
+		xgbe_mmc_read(pdata, MMC_TXFRAMECOUNT_GB_LO);
 
 	stats->txbroadcastframes_g +=
-		XGMAC_IOREAD(pdata, MMC_TXBROADCASTFRAMES_G_LO);
+		xgbe_mmc_read(pdata, MMC_TXBROADCASTFRAMES_G_LO);
 
 	stats->txmulticastframes_g +=
-		XGMAC_IOREAD(pdata, MMC_TXMULTICASTFRAMES_G_LO);
+		xgbe_mmc_read(pdata, MMC_TXMULTICASTFRAMES_G_LO);
 
 	stats->tx64octets_gb +=
-		XGMAC_IOREAD(pdata, MMC_TX64OCTETS_GB_LO);
+		xgbe_mmc_read(pdata, MMC_TX64OCTETS_GB_LO);
 
 	stats->tx65to127octets_gb +=
-		XGMAC_IOREAD(pdata, MMC_TX65TO127OCTETS_GB_LO);
+		xgbe_mmc_read(pdata, MMC_TX65TO127OCTETS_GB_LO);
 
 	stats->tx128to255octets_gb +=
-		XGMAC_IOREAD(pdata, MMC_TX128TO255OCTETS_GB_LO);
+		xgbe_mmc_read(pdata, MMC_TX128TO255OCTETS_GB_LO);
 
 	stats->tx256to511octets_gb +=
-		XGMAC_IOREAD(pdata, MMC_TX256TO511OCTETS_GB_LO);
+		xgbe_mmc_read(pdata, MMC_TX256TO511OCTETS_GB_LO);
 
 	stats->tx512to1023octets_gb +=
-		XGMAC_IOREAD(pdata, MMC_TX512TO1023OCTETS_GB_LO);
+		xgbe_mmc_read(pdata, MMC_TX512TO1023OCTETS_GB_LO);
 
 	stats->tx1024tomaxoctets_gb +=
-		XGMAC_IOREAD(pdata, MMC_TX1024TOMAXOCTETS_GB_LO);
+		xgbe_mmc_read(pdata, MMC_TX1024TOMAXOCTETS_GB_LO);
 
 	stats->txunicastframes_gb +=
-		XGMAC_IOREAD(pdata, MMC_TXUNICASTFRAMES_GB_LO);
+		xgbe_mmc_read(pdata, MMC_TXUNICASTFRAMES_GB_LO);
 
 	stats->txmulticastframes_gb +=
-		XGMAC_IOREAD(pdata, MMC_TXMULTICASTFRAMES_GB_LO);
+		xgbe_mmc_read(pdata, MMC_TXMULTICASTFRAMES_GB_LO);
 
 	stats->txbroadcastframes_g +=
-		XGMAC_IOREAD(pdata, MMC_TXBROADCASTFRAMES_GB_LO);
+		xgbe_mmc_read(pdata, MMC_TXBROADCASTFRAMES_GB_LO);
 
 	stats->txunderflowerror +=
-		XGMAC_IOREAD(pdata, MMC_TXUNDERFLOWERROR_LO);
+		xgbe_mmc_read(pdata, MMC_TXUNDERFLOWERROR_LO);
 
 	stats->txoctetcount_g +=
-		XGMAC_IOREAD(pdata, MMC_TXOCTETCOUNT_G_LO);
+		xgbe_mmc_read(pdata, MMC_TXOCTETCOUNT_G_LO);
 
 	stats->txframecount_g +=
-		XGMAC_IOREAD(pdata, MMC_TXFRAMECOUNT_G_LO);
+		xgbe_mmc_read(pdata, MMC_TXFRAMECOUNT_G_LO);
 
 	stats->txpauseframes +=
-		XGMAC_IOREAD(pdata, MMC_TXPAUSEFRAMES_LO);
+		xgbe_mmc_read(pdata, MMC_TXPAUSEFRAMES_LO);
 
 	stats->txvlanframes_g +=
-		XGMAC_IOREAD(pdata, MMC_TXVLANFRAMES_G_LO);
+		xgbe_mmc_read(pdata, MMC_TXVLANFRAMES_G_LO);
 
 	stats->rxframecount_gb +=
-		XGMAC_IOREAD(pdata, MMC_RXFRAMECOUNT_GB_LO);
+		xgbe_mmc_read(pdata, MMC_RXFRAMECOUNT_GB_LO);
 
 	stats->rxoctetcount_gb +=
-		XGMAC_IOREAD(pdata, MMC_RXOCTETCOUNT_GB_LO);
+		xgbe_mmc_read(pdata, MMC_RXOCTETCOUNT_GB_LO);
 
 	stats->rxoctetcount_g +=
-		XGMAC_IOREAD(pdata, MMC_RXOCTETCOUNT_G_LO);
+		xgbe_mmc_read(pdata, MMC_RXOCTETCOUNT_G_LO);
 
 	stats->rxbroadcastframes_g +=
-		XGMAC_IOREAD(pdata, MMC_RXBROADCASTFRAMES_G_LO);
+		xgbe_mmc_read(pdata, MMC_RXBROADCASTFRAMES_G_LO);
 
 	stats->rxmulticastframes_g +=
-		XGMAC_IOREAD(pdata, MMC_RXMULTICASTFRAMES_G_LO);
+		xgbe_mmc_read(pdata, MMC_RXMULTICASTFRAMES_G_LO);
 
 	stats->rxcrcerror +=
-		XGMAC_IOREAD(pdata, MMC_RXCRCERROR_LO);
+		xgbe_mmc_read(pdata, MMC_RXCRCERROR_LO);
 
 	stats->rxrunterror +=
-		XGMAC_IOREAD(pdata, MMC_RXRUNTERROR);
+		xgbe_mmc_read(pdata, MMC_RXRUNTERROR);
 
 	stats->rxjabbererror +=
-		XGMAC_IOREAD(pdata, MMC_RXJABBERERROR);
+		xgbe_mmc_read(pdata, MMC_RXJABBERERROR);
 
 	stats->rxundersize_g +=
-		XGMAC_IOREAD(pdata, MMC_RXUNDERSIZE_G);
+		xgbe_mmc_read(pdata, MMC_RXUNDERSIZE_G);
 
 	stats->rxoversize_g +=
-		XGMAC_IOREAD(pdata, MMC_RXOVERSIZE_G);
+		xgbe_mmc_read(pdata, MMC_RXOVERSIZE_G);
 
 	stats->rx64octets_gb +=
-		XGMAC_IOREAD(pdata, MMC_RX64OCTETS_GB_LO);
+		xgbe_mmc_read(pdata, MMC_RX64OCTETS_GB_LO);
 
 	stats->rx65to127octets_gb +=
-		XGMAC_IOREAD(pdata, MMC_RX65TO127OCTETS_GB_LO);
+		xgbe_mmc_read(pdata, MMC_RX65TO127OCTETS_GB_LO);
 
 	stats->rx128to255octets_gb +=
-		XGMAC_IOREAD(pdata, MMC_RX128TO255OCTETS_GB_LO);
+		xgbe_mmc_read(pdata, MMC_RX128TO255OCTETS_GB_LO);
 
 	stats->rx256to511octets_gb +=
-		XGMAC_IOREAD(pdata, MMC_RX256TO511OCTETS_GB_LO);
+		xgbe_mmc_read(pdata, MMC_RX256TO511OCTETS_GB_LO);
 
 	stats->rx512to1023octets_gb +=
-		XGMAC_IOREAD(pdata, MMC_RX512TO1023OCTETS_GB_LO);
+		xgbe_mmc_read(pdata, MMC_RX512TO1023OCTETS_GB_LO);
 
 	stats->rx1024tomaxoctets_gb +=
-		XGMAC_IOREAD(pdata, MMC_RX1024TOMAXOCTETS_GB_LO);
+		xgbe_mmc_read(pdata, MMC_RX1024TOMAXOCTETS_GB_LO);
 
 	stats->rxunicastframes_g +=
-		XGMAC_IOREAD(pdata, MMC_RXUNICASTFRAMES_G_LO);
+		xgbe_mmc_read(pdata, MMC_RXUNICASTFRAMES_G_LO);
 
 	stats->rxlengtherror +=
-		XGMAC_IOREAD(pdata, MMC_RXLENGTHERROR_LO);
+		xgbe_mmc_read(pdata, MMC_RXLENGTHERROR_LO);
 
 	stats->rxoutofrangetype +=
-		XGMAC_IOREAD(pdata, MMC_RXOUTOFRANGETYPE_LO);
+		xgbe_mmc_read(pdata, MMC_RXOUTOFRANGETYPE_LO);
 
 	stats->rxpauseframes +=
-		XGMAC_IOREAD(pdata, MMC_RXPAUSEFRAMES_LO);
+		xgbe_mmc_read(pdata, MMC_RXPAUSEFRAMES_LO);
 
 	stats->rxfifooverflow +=
-		XGMAC_IOREAD(pdata, MMC_RXFIFOOVERFLOW_LO);
+		xgbe_mmc_read(pdata, MMC_RXFIFOOVERFLOW_LO);
 
 	stats->rxvlanframes_gb +=
-		XGMAC_IOREAD(pdata, MMC_RXVLANFRAMES_GB_LO);
+		xgbe_mmc_read(pdata, MMC_RXVLANFRAMES_GB_LO);
 
 	stats->rxwatchdogerror +=
-		XGMAC_IOREAD(pdata, MMC_RXWATCHDOGERROR);
+		xgbe_mmc_read(pdata, MMC_RXWATCHDOGERROR);
 
 	/* Un-freeze counters */
 	XGMAC_IOWRITE_BITS(pdata, MMC_CR, MCF, 0);
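The xgbe_mmc_read() helper introduced above exists because a few of the MMC counters (the four octet counters listed in its switch) are 64 bits wide and exposed as a LO/HI register pair at consecutive 32-bit offsets; reading only the LO register, as the old XGMAC_IOREAD() calls did, truncated those statistics. A minimal sketch of the composition, with read32() as a hypothetical stand-in for XGMAC_IOREAD():

	#include <stdint.h>

	extern uint32_t read32(unsigned int reg);	/* stand-in for XGMAC_IOREAD() */

	static uint64_t read_counter64(unsigned int reg_lo)
	{
		uint64_t val = read32(reg_lo);

		/* the HI half sits 4 bytes above the LO half */
		val |= (uint64_t)read32(reg_lo + 4) << 32;
		return val;
	}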
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index dc84f71..b26d758 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -361,6 +361,8 @@
 
 	memset(hw_feat, 0, sizeof(*hw_feat));
 
+	hw_feat->version = XGMAC_IOREAD(pdata, MAC_VR);
+
 	/* Hardware feature register 0 */
 	hw_feat->gmii        = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, GMIISEL);
 	hw_feat->vlhash      = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, VLHASH);
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index a076aca..46f6130 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -361,15 +361,16 @@
 			     struct ethtool_drvinfo *drvinfo)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
+	struct xgbe_hw_features *hw_feat = &pdata->hw_feat;
 
 	strlcpy(drvinfo->driver, XGBE_DRV_NAME, sizeof(drvinfo->driver));
 	strlcpy(drvinfo->version, XGBE_DRV_VERSION, sizeof(drvinfo->version));
 	strlcpy(drvinfo->bus_info, dev_name(pdata->dev),
 		sizeof(drvinfo->bus_info));
 	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d.%d.%d",
-		 XGMAC_IOREAD_BITS(pdata, MAC_VR, USERVER),
-		 XGMAC_IOREAD_BITS(pdata, MAC_VR, DEVID),
-		 XGMAC_IOREAD_BITS(pdata, MAC_VR, SNPSVER));
+		 XGMAC_GET_BITS(hw_feat->version, MAC_VR, USERVER),
+		 XGMAC_GET_BITS(hw_feat->version, MAC_VR, DEVID),
+		 XGMAC_GET_BITS(hw_feat->version, MAC_VR, SNPSVER));
 	drvinfo->n_stats = XGBE_STATS_COUNT;
 }
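Together with the xgbe-drv.c hunk above, this caches the MAC version register once at probe time (hw_feat->version), so ethtool's ->get_drvinfo, which userspace can invoke at any time, decodes the fields from the cached word via XGMAC_GET_BITS() instead of issuing three MMIO reads per call. The decode is an ordinary shift-and-mask; sketched below with invented field positions, since the real MAC_VR layout lives in the driver's register headers:

	/* illustrative only: pos/len values here are not the real MAC_VR layout */
	static unsigned int get_bits(unsigned int word, unsigned int pos,
				     unsigned int len)
	{
		return (word >> pos) & ((1U << len) - 1);
	}

	unsigned int snpsver = get_bits(version, 8, 8);	/* hypothetical field */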
 
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
index 8aa6a93..bdf9cfa 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
@@ -172,7 +172,7 @@
 		}
 
 		if (i < pdata->rx_ring_count) {
-			spin_lock_init(&tx_ring->lock);
+			spin_lock_init(&rx_ring->lock);
 			channel->rx_ring = rx_ring++;
 		}
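A one-line fix, but worth spelling out: the RX branch was initializing the TX ring's spinlock, so rx_ring->lock was used without ever being initialized, exactly the kind of copy-paste bug lockdep reports on first contention. The corrected pairing looks like this (the TX branch is reconstructed from context and may differ in detail):

	if (i < pdata->tx_ring_count) {
		spin_lock_init(&tx_ring->lock);		/* TX branch inits the TX lock */
		channel->tx_ring = tx_ring++;
	}

	if (i < pdata->rx_ring_count) {
		spin_lock_init(&rx_ring->lock);		/* RX branch inits the RX lock */
		channel->rx_ring = rx_ring++;
	}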
 
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 07bf70a..e9fe6e6 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -183,6 +183,7 @@
 #define XGMAC_DRIVER_CONTEXT	1
 #define XGMAC_IOCTL_CONTEXT	2
 
+#define XGBE_FIFO_MAX		81920
 #define XGBE_FIFO_SIZE_B(x)	(x)
 #define XGBE_FIFO_SIZE_KB(x)	(x * 1024)
 
@@ -526,6 +527,9 @@
  * or configurations are present in the device.
  */
 struct xgbe_hw_features {
+	/* HW Version */
+	unsigned int version;
+
 	/* HW Feature Register0 */
 	unsigned int gmii;		/* 1000 Mbps support */
 	unsigned int vlhash;		/* VLAN Hash Filter */
diff --git a/drivers/net/ethernet/apm/xgene/Kconfig b/drivers/net/ethernet/apm/xgene/Kconfig
index 616dff6..f4054d24 100644
--- a/drivers/net/ethernet/apm/xgene/Kconfig
+++ b/drivers/net/ethernet/apm/xgene/Kconfig
@@ -1,5 +1,6 @@
 config NET_XGENE
 	tristate "APM X-Gene SoC Ethernet Driver"
+	depends on HAS_DMA
 	select PHYLIB
 	help
 	  This is the Ethernet driver for the on-chip ethernet interface on the
diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
index 7dcfb19..d8d07a8 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -84,7 +84,7 @@
 
 config CNIC
 	tristate "QLogic CNIC support"
-	depends on PCI
+	depends on PCI && (IPV6 || IPV6=n)
 	select BNX2
 	select UIO
 	---help---
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h
index 5ba8af5..c4daa06 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h
@@ -2233,7 +2233,12 @@
 	u32 reserved3;				/* Offset 0x14C */
 	u32 reserved4;				/* Offset 0x150 */
 	u32 link_attr_sync[PORT_MAX];		/* Offset 0x154 */
-	#define LINK_ATTR_SYNC_KR2_ENABLE	(1<<0)
+	#define LINK_ATTR_SYNC_KR2_ENABLE	0x00000001
+	#define LINK_SFP_EEPROM_COMP_CODE_MASK	0x0000ff00
+	#define LINK_SFP_EEPROM_COMP_CODE_SHIFT		 8
+	#define LINK_SFP_EEPROM_COMP_CODE_SR	0x00001000
+	#define LINK_SFP_EEPROM_COMP_CODE_LR	0x00002000
+	#define LINK_SFP_EEPROM_COMP_CODE_LRM	0x00004000
 
 	u32 reserved5[2];
 	u32 reserved6[PORT_MAX];
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
index 53fb4fa..549549e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
@@ -154,15 +154,22 @@
 			 LINK_STATUS_LINK_PARTNER_ASYMMETRIC_PAUSE)
 
 #define SFP_EEPROM_CON_TYPE_ADDR		0x2
+	#define SFP_EEPROM_CON_TYPE_VAL_UNKNOWN	0x0
 	#define SFP_EEPROM_CON_TYPE_VAL_LC	0x7
 	#define SFP_EEPROM_CON_TYPE_VAL_COPPER	0x21
 	#define SFP_EEPROM_CON_TYPE_VAL_RJ45	0x22
 
 
-#define SFP_EEPROM_COMP_CODE_ADDR		0x3
-	#define SFP_EEPROM_COMP_CODE_SR_MASK	(1<<4)
-	#define SFP_EEPROM_COMP_CODE_LR_MASK	(1<<5)
-	#define SFP_EEPROM_COMP_CODE_LRM_MASK	(1<<6)
+#define SFP_EEPROM_10G_COMP_CODE_ADDR		0x3
+	#define SFP_EEPROM_10G_COMP_CODE_SR_MASK	(1<<4)
+	#define SFP_EEPROM_10G_COMP_CODE_LR_MASK	(1<<5)
+	#define SFP_EEPROM_10G_COMP_CODE_LRM_MASK	(1<<6)
+
+#define SFP_EEPROM_1G_COMP_CODE_ADDR		0x6
+	#define SFP_EEPROM_1G_COMP_CODE_SX	(1<<0)
+	#define SFP_EEPROM_1G_COMP_CODE_LX	(1<<1)
+	#define SFP_EEPROM_1G_COMP_CODE_CX	(1<<2)
+	#define SFP_EEPROM_1G_COMP_CODE_BASE_T	(1<<3)
 
 #define SFP_EEPROM_FC_TX_TECH_ADDR		0x8
 	#define SFP_EEPROM_FC_TX_TECH_BITMASK_COPPER_PASSIVE 0x4
@@ -3633,8 +3640,8 @@
 				 reg_set[i].val);
 
 	/* Start KR2 work-around timer which handles BCM8073 link-partner */
-	vars->link_attr_sync |= LINK_ATTR_SYNC_KR2_ENABLE;
-	bnx2x_update_link_attr(params, vars->link_attr_sync);
+	params->link_attr_sync |= LINK_ATTR_SYNC_KR2_ENABLE;
+	bnx2x_update_link_attr(params, params->link_attr_sync);
 }
 
 static void bnx2x_disable_kr2(struct link_params *params,
@@ -3666,8 +3673,8 @@
 	for (i = 0; i < ARRAY_SIZE(reg_set); i++)
 		bnx2x_cl45_write(bp, phy, reg_set[i].devad, reg_set[i].reg,
 				 reg_set[i].val);
-	vars->link_attr_sync &= ~LINK_ATTR_SYNC_KR2_ENABLE;
-	bnx2x_update_link_attr(params, vars->link_attr_sync);
+	params->link_attr_sync &= ~LINK_ATTR_SYNC_KR2_ENABLE;
+	bnx2x_update_link_attr(params, params->link_attr_sync);
 
 	vars->check_kr2_recovery_cnt = CHECK_KR2_RECOVERY_CNT;
 }
@@ -4810,7 +4817,7 @@
 					~FEATURE_CONFIG_PFC_ENABLED;
 
 	if (SHMEM2_HAS(bp, link_attr_sync))
-		vars->link_attr_sync = SHMEM2_RD(bp,
+		params->link_attr_sync = SHMEM2_RD(bp,
 						 link_attr_sync[params->port]);
 
 	DP(NETIF_MSG_LINK, "link_status 0x%x  phy_link_up %x int_mask 0x%x\n",
@@ -8057,21 +8064,24 @@
 {
 	struct bnx2x *bp = params->bp;
 	u32 sync_offset = 0, phy_idx, media_types;
-	u8 gport, val[2], check_limiting_mode = 0;
+	u8 val[SFP_EEPROM_FC_TX_TECH_ADDR + 1], check_limiting_mode = 0;
 	*edc_mode = EDC_MODE_LIMITING;
 	phy->media_type = ETH_PHY_UNSPECIFIED;
 	/* First check for copper cable */
 	if (bnx2x_read_sfp_module_eeprom(phy,
 					 params,
 					 I2C_DEV_ADDR_A0,
-					 SFP_EEPROM_CON_TYPE_ADDR,
-					 2,
+					 0,
+					 SFP_EEPROM_FC_TX_TECH_ADDR + 1,
 					 (u8 *)val) != 0) {
 		DP(NETIF_MSG_LINK, "Failed to read from SFP+ module EEPROM\n");
 		return -EINVAL;
 	}
-
-	switch (val[0]) {
+	params->link_attr_sync &= ~LINK_SFP_EEPROM_COMP_CODE_MASK;
+	params->link_attr_sync |= val[SFP_EEPROM_10G_COMP_CODE_ADDR] <<
+		LINK_SFP_EEPROM_COMP_CODE_SHIFT;
+	bnx2x_update_link_attr(params, params->link_attr_sync);
+	switch (val[SFP_EEPROM_CON_TYPE_ADDR]) {
 	case SFP_EEPROM_CON_TYPE_VAL_COPPER:
 	{
 		u8 copper_module_type;
@@ -8079,17 +8089,7 @@
 		/* Check if it's an active cable (includes SFP+ module)
 		 * or a passive cable
 		 */
-		if (bnx2x_read_sfp_module_eeprom(phy,
-					       params,
-					       I2C_DEV_ADDR_A0,
-					       SFP_EEPROM_FC_TX_TECH_ADDR,
-					       1,
-					       &copper_module_type) != 0) {
-			DP(NETIF_MSG_LINK,
-				"Failed to read copper-cable-type"
-				" from SFP+ EEPROM\n");
-			return -EINVAL;
-		}
+		copper_module_type = val[SFP_EEPROM_FC_TX_TECH_ADDR];
 
 		if (copper_module_type &
 		    SFP_EEPROM_FC_TX_TECH_BITMASK_COPPER_ACTIVE) {
@@ -8115,16 +8115,18 @@
 		}
 		break;
 	}
+	case SFP_EEPROM_CON_TYPE_VAL_UNKNOWN:
 	case SFP_EEPROM_CON_TYPE_VAL_LC:
 	case SFP_EEPROM_CON_TYPE_VAL_RJ45:
 		check_limiting_mode = 1;
-		if ((val[1] & (SFP_EEPROM_COMP_CODE_SR_MASK |
-			       SFP_EEPROM_COMP_CODE_LR_MASK |
-			       SFP_EEPROM_COMP_CODE_LRM_MASK)) == 0) {
+		if ((val[SFP_EEPROM_10G_COMP_CODE_ADDR] &
+		     (SFP_EEPROM_10G_COMP_CODE_SR_MASK |
+		      SFP_EEPROM_10G_COMP_CODE_LR_MASK |
+		      SFP_EEPROM_10G_COMP_CODE_LRM_MASK)) == 0) {
 			DP(NETIF_MSG_LINK, "1G SFP module detected\n");
-			gport = params->port;
 			phy->media_type = ETH_PHY_SFP_1G_FIBER;
 			if (phy->req_line_speed != SPEED_1000) {
+				u8 gport = params->port;
 				phy->req_line_speed = SPEED_1000;
 				if (!CHIP_IS_E1x(bp)) {
 					gport = BP_PATH(bp) +
@@ -8134,6 +8136,12 @@
 					   "Warning: Link speed was forced to 1000Mbps. Current SFP module in port %d is not compliant with 10G Ethernet\n",
 					   gport);
 			}
+			if (val[SFP_EEPROM_1G_COMP_CODE_ADDR] &
+			    SFP_EEPROM_1G_COMP_CODE_BASE_T) {
+				bnx2x_sfp_set_transmitter(params, phy, 0);
+				msleep(40);
+				bnx2x_sfp_set_transmitter(params, phy, 1);
+			}
 		} else {
 			int idx, cfg_idx = 0;
 			DP(NETIF_MSG_LINK, "10G Optic module detected\n");
@@ -8149,7 +8157,7 @@
 		break;
 	default:
 		DP(NETIF_MSG_LINK, "Unable to determine module type 0x%x !!!\n",
-			 val[0]);
+			 val[SFP_EEPROM_CON_TYPE_ADDR]);
 		return -EINVAL;
 	}
 	sync_offset = params->shmem_base +
@@ -13507,7 +13515,7 @@
 
 	sigdet = bnx2x_warpcore_get_sigdet(phy, params);
 	if (!sigdet) {
-		if (!(vars->link_attr_sync & LINK_ATTR_SYNC_KR2_ENABLE)) {
+		if (!(params->link_attr_sync & LINK_ATTR_SYNC_KR2_ENABLE)) {
 			bnx2x_kr2_recovery(params, vars, phy);
 			DP(NETIF_MSG_LINK, "No sigdet\n");
 		}
@@ -13525,7 +13533,7 @@
 
 	/* CL73 has not begun yet */
 	if (base_page == 0) {
-		if (!(vars->link_attr_sync & LINK_ATTR_SYNC_KR2_ENABLE)) {
+		if (!(params->link_attr_sync & LINK_ATTR_SYNC_KR2_ENABLE)) {
 			bnx2x_kr2_recovery(params, vars, phy);
 			DP(NETIF_MSG_LINK, "No BP\n");
 		}
@@ -13541,7 +13549,7 @@
 			    ((next_page & 0xe0) == 0x20))));
 
 	/* In case KR2 is already disabled, check if we need to re-enable it */
-	if (!(vars->link_attr_sync & LINK_ATTR_SYNC_KR2_ENABLE)) {
+	if (!(params->link_attr_sync & LINK_ATTR_SYNC_KR2_ENABLE)) {
 		if (!not_kr2_device) {
 			DP(NETIF_MSG_LINK, "BP=0x%x, NP=0x%x\n", base_page,
 			   next_page);
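Two things happen in this file: link_attr_sync moves from the output structure (link_vars) to the input structure (link_params), matching how it is actually consumed, and the SFP+ module detection now reads EEPROM bytes 0 through SFP_EEPROM_FC_TX_TECH_ADDR in a single transaction instead of issuing a separate read per field. A sketch of the bulk-read pattern, with eeprom_read() standing in for bnx2x_read_sfp_module_eeprom():

	u8 val[SFP_EEPROM_FC_TX_TECH_ADDR + 1];

	if (eeprom_read(/* offset */ 0, sizeof(val), val) != 0)
		return -EINVAL;

	/* every field of interest is now just an index into the buffer */
	con_type    = val[SFP_EEPROM_CON_TYPE_ADDR];		/* byte 0x2 */
	comp_10g    = val[SFP_EEPROM_10G_COMP_CODE_ADDR];	/* byte 0x3 */
	comp_1g     = val[SFP_EEPROM_1G_COMP_CODE_ADDR];	/* byte 0x6 */
	copper_tech = val[SFP_EEPROM_FC_TX_TECH_ADDR];		/* byte 0x8 */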
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h
index 389f5f8..d9cce4c 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h
@@ -323,6 +323,9 @@
 #define LINK_FLAGS_INT_DISABLED		(1<<0)
 #define PHY_INITIALIZED		(1<<1)
 	u32 lfa_base;
+
+	/* The same definitions as the shmem2 parameter */
+	u32 link_attr_sync;
 };
 
 /* Output parameters */
@@ -364,8 +367,6 @@
 	u8 rx_tx_asic_rst;
 	u8 turn_to_run_wc_rt;
 	u16 rsrv2;
-	/* The same definitions as the shmem2 parameter */
-	u32 link_attr_sync;
 };
 
 /***********************************************************/
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 900cab4..d1c093d 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -6849,6 +6849,37 @@
 	bnx2x_release_phy_lock(bp);
 }
 
+static void bnx2x_config_endianity(struct bnx2x *bp, u32 val)
+{
+	REG_WR(bp, PXP2_REG_RQ_QM_ENDIAN_M, val);
+	REG_WR(bp, PXP2_REG_RQ_TM_ENDIAN_M, val);
+	REG_WR(bp, PXP2_REG_RQ_SRC_ENDIAN_M, val);
+	REG_WR(bp, PXP2_REG_RQ_CDU_ENDIAN_M, val);
+	REG_WR(bp, PXP2_REG_RQ_DBG_ENDIAN_M, val);
+
+	/* make sure this value is 0 */
+	REG_WR(bp, PXP2_REG_RQ_HC_ENDIAN_M, 0);
+
+	REG_WR(bp, PXP2_REG_RD_QM_SWAP_MODE, val);
+	REG_WR(bp, PXP2_REG_RD_TM_SWAP_MODE, val);
+	REG_WR(bp, PXP2_REG_RD_SRC_SWAP_MODE, val);
+	REG_WR(bp, PXP2_REG_RD_CDURD_SWAP_MODE, val);
+}
+
+static void bnx2x_set_endianity(struct bnx2x *bp)
+{
+#ifdef __BIG_ENDIAN
+	bnx2x_config_endianity(bp, 1);
+#else
+	bnx2x_config_endianity(bp, 0);
+#endif
+}
+
+static void bnx2x_reset_endianity(struct bnx2x *bp)
+{
+	bnx2x_config_endianity(bp, 0);
+}
+
 /**
  * bnx2x_init_hw_common - initialize the HW at the COMMON phase.
  *
@@ -6915,23 +6946,7 @@
 
 	bnx2x_init_block(bp, BLOCK_PXP2, PHASE_COMMON);
 	bnx2x_init_pxp(bp);
-
-#ifdef __BIG_ENDIAN
-	REG_WR(bp, PXP2_REG_RQ_QM_ENDIAN_M, 1);
-	REG_WR(bp, PXP2_REG_RQ_TM_ENDIAN_M, 1);
-	REG_WR(bp, PXP2_REG_RQ_SRC_ENDIAN_M, 1);
-	REG_WR(bp, PXP2_REG_RQ_CDU_ENDIAN_M, 1);
-	REG_WR(bp, PXP2_REG_RQ_DBG_ENDIAN_M, 1);
-	/* make sure this value is 0 */
-	REG_WR(bp, PXP2_REG_RQ_HC_ENDIAN_M, 0);
-
-/*	REG_WR(bp, PXP2_REG_RD_PBF_SWAP_MODE, 1); */
-	REG_WR(bp, PXP2_REG_RD_QM_SWAP_MODE, 1);
-	REG_WR(bp, PXP2_REG_RD_TM_SWAP_MODE, 1);
-	REG_WR(bp, PXP2_REG_RD_SRC_SWAP_MODE, 1);
-	REG_WR(bp, PXP2_REG_RD_CDURD_SWAP_MODE, 1);
-#endif
-
+	bnx2x_set_endianity(bp);
 	bnx2x_ilt_init_page_size(bp, INITOP_SET);
 
 	if (CHIP_REV_IS_FPGA(bp) && CHIP_IS_E1H(bp))
@@ -13169,9 +13184,15 @@
 	bnx2x_iov_remove_one(bp);
 
 	/* Power on: we can't let PCI layer write to us while we are in D3 */
-	if (IS_PF(bp))
+	if (IS_PF(bp)) {
 		bnx2x_set_power_state(bp, PCI_D0);
 
+		/* Set endianity registers to reset values in case the next driver
 		 * boots in a different endianity environment.
+		 */
+		bnx2x_reset_endianity(bp);
+	}
+
 	/* Disable MSI/MSI-X */
 	bnx2x_disable_msi(bp);
 
diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c
index 27861a6..a6a9f28 100644
--- a/drivers/net/ethernet/broadcom/cnic.c
+++ b/drivers/net/ethernet/broadcom/cnic.c
@@ -31,7 +31,7 @@
 #include <linux/if_vlan.h>
 #include <linux/prefetch.h>
 #include <linux/random.h>
-#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
+#if IS_ENABLED(CONFIG_VLAN_8021Q)
 #define BCM_VLAN 1
 #endif
 #include <net/ip.h>
@@ -3685,7 +3685,7 @@
 static int cnic_get_v6_route(struct sockaddr_in6 *dst_addr,
 			     struct dst_entry **dst)
 {
-#if defined(CONFIG_IPV6) || (defined(CONFIG_IPV6_MODULE) && defined(MODULE))
+#if IS_ENABLED(CONFIG_IPV6)
 	struct flowi6 fl6;
 
 	memset(&fl6, 0, sizeof(fl6));
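IS_ENABLED(CONFIG_FOO) evaluates to 1 when the option is built in or modular, which is what the two hand-rolled tests above were trying (and in the IPv6 case failing) to express: the old defined(CONFIG_IPV6_MODULE) && defined(MODULE) clause compiled the IPv6 path out when CNIC was built in while IPv6 was a module, and the Kconfig hunk above now forbids that combination outright. Simplified, the macro behaves like:

	/* simplified model of IS_ENABLED(); the real kconfig.h version differs */
	#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	# define IPV6_AVAILABLE 1
	#else
	# define IPV6_AVAILABLE 0
	#endif

	#if IPV6_AVAILABLE	/* i.e. IS_ENABLED(CONFIG_IPV6) */
		/* IPv6 route lookup path */
	#endif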
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 3ac5d23..cb77ae9 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -11617,6 +11617,12 @@
 	struct tg3 *tp = netdev_priv(dev);
 	int err;
 
+	if (tp->pcierr_recovery) {
+		netdev_err(dev, "Failed to open device. PCI error recovery "
+			   "in progress\n");
+		return -EAGAIN;
+	}
+
 	if (tp->fw_needed) {
 		err = tg3_request_firmware(tp);
 		if (tg3_asic_rev(tp) == ASIC_REV_57766) {
@@ -11674,6 +11680,12 @@
 {
 	struct tg3 *tp = netdev_priv(dev);
 
+	if (tp->pcierr_recovery) {
+		netdev_err(dev, "Failed to close device. PCI error recovery "
+			   "in progress\n");
+		return -EAGAIN;
+	}
+
 	tg3_ptp_fini(tp);
 
 	tg3_stop(tp);
@@ -17561,6 +17573,7 @@
 	tp->rx_mode = TG3_DEF_RX_MODE;
 	tp->tx_mode = TG3_DEF_TX_MODE;
 	tp->irq_sync = 1;
+	tp->pcierr_recovery = false;
 
 	if (tg3_debug > 0)
 		tp->msg_enable = tg3_debug;
@@ -18071,6 +18084,8 @@
 
 	rtnl_lock();
 
+	tp->pcierr_recovery = true;
+
 	/* We probably don't have netdev yet */
 	if (!netdev || !netif_running(netdev))
 		goto done;
@@ -18195,6 +18210,7 @@
 	tg3_phy_start(tp);
 
 done:
+	tp->pcierr_recovery = false;
 	rtnl_unlock();
 }
 
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index 461acca..31c9f82 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -3407,6 +3407,7 @@
 
 	struct device			*hwmon_dev;
 	bool				link_up;
+	bool				pcierr_recovery;
 };
 
 /* Accessor macros for chip and asic attributes
diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c
index ff8cae5..ffc92a4 100644
--- a/drivers/net/ethernet/brocade/bna/bnad.c
+++ b/drivers/net/ethernet/brocade/bna/bnad.c
@@ -2506,7 +2506,7 @@
 	 * For TSO, the TCP checksum field is seeded with the pseudo-header sum
 	 * excluding the length field.
 	 */
-	if (skb->protocol == htons(ETH_P_IP)) {
+	if (vlan_get_protocol(skb) == htons(ETH_P_IP)) {
 		struct iphdr *iph = ip_hdr(skb);
 
 		/* Do we really need these? */
@@ -2870,12 +2870,13 @@
 		}
 
 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			__be16 net_proto = vlan_get_protocol(skb);
 			u8 proto = 0;
 
-			if (skb->protocol == htons(ETH_P_IP))
+			if (net_proto == htons(ETH_P_IP))
 				proto = ip_hdr(skb)->protocol;
 #ifdef NETIF_F_IPV6_CSUM
-			else if (skb->protocol == htons(ETH_P_IPV6)) {
+			else if (net_proto == htons(ETH_P_IPV6)) {
 				/* nexthdr may not be TCP immediately. */
 				proto = ipv6_hdr(skb)->nexthdr;
 			}
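The bnad change above is the first of a series repeated below in ehea, e1000, e1000e, i40e/i40evf, mvneta and qlge: when a frame still carries an 802.1Q tag, skb->protocol is ETH_P_8021Q, so tests such as skb->protocol == htons(ETH_P_IP) fail and the TSO/checksum-offload setup is silently skipped for VLAN traffic. vlan_get_protocol() returns the encapsulated EtherType in that case. The canonical pattern (the setup helpers here are illustrative, not real API):

	#include <linux/if_vlan.h>

	__be16 proto = vlan_get_protocol(skb);	/* looks inside an 802.1Q tag */

	if (proto == htons(ETH_P_IP))
		setup_ipv4_csum(skb);
	else if (proto == htons(ETH_P_IPV6))
		setup_ipv6_csum(skb);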
diff --git a/drivers/net/ethernet/calxeda/Kconfig b/drivers/net/ethernet/calxeda/Kconfig
index 184a063..07d2201 100644
--- a/drivers/net/ethernet/calxeda/Kconfig
+++ b/drivers/net/ethernet/calxeda/Kconfig
@@ -1,6 +1,7 @@
 config NET_CALXEDA_XGMAC
 	tristate "Calxeda 1G/10G XGMAC Ethernet driver"
 	depends on HAS_IOMEM && HAS_DMA
+	depends on ARCH_HIGHBANK || COMPILE_TEST
 	select CRC32
 	help
 	  This is the driver for the XGMAC Ethernet IP block found on Calxeda
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 18fb9c6..8c34811 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -1253,7 +1253,9 @@
 			goto freeout;
 	}
 
-	t4_write_reg(adap, MPS_TRC_RSS_CONTROL,
+	t4_write_reg(adap, is_t4(adap->params.chip) ?
+				MPS_TRC_RSS_CONTROL :
+				MPS_T5_TRC_RSS_CONTROL,
 		     RSSCONTROL(netdev2pinfo(adap->port[0])->tx_chan) |
 		     QUEUENUMBER(s->ethrxq[0].rspq.abs_id));
 	return 0;
@@ -1761,7 +1763,8 @@
 		0xd004, 0xd03c,
 		0xdfc0, 0xdfe0,
 		0xe000, 0xea7c,
-		0xf000, 0x11190,
+		0xf000, 0x11110,
+		0x11118, 0x11190,
 		0x19040, 0x1906c,
 		0x19078, 0x19080,
 		0x1908c, 0x19124,
@@ -1968,7 +1971,8 @@
 		0xd004, 0xd03c,
 		0xdfc0, 0xdfe0,
 		0xe000, 0x11088,
-		0x1109c, 0x1117c,
+		0x1109c, 0x11110,
+		0x11118, 0x1117c,
 		0x11190, 0x11204,
 		0x19040, 0x1906c,
 		0x19078, 0x19080,
@@ -5955,7 +5959,8 @@
 		params[3] = FW_PARAM_PFVF(CQ_END);
 		params[4] = FW_PARAM_PFVF(OCQ_START);
 		params[5] = FW_PARAM_PFVF(OCQ_END);
-		ret = t4_query_params(adap, 0, 0, 0, 6, params, val);
+		ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 6, params,
+				      val);
 		if (ret < 0)
 			goto bye;
 		adap->vres.qp.start = val[0];
@@ -5967,7 +5972,8 @@
 
 		params[0] = FW_PARAM_DEV(MAXORDIRD_QP);
 		params[1] = FW_PARAM_DEV(MAXIRD_ADAPTER);
-		ret = t4_query_params(adap, 0, 0, 0, 2, params, val);
+		ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 2, params,
+				      val);
 		if (ret < 0) {
 			adap->params.max_ordird_qp = 8;
 			adap->params.max_ird_adapter = 32 * adap->tids.ntids;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index a853133..41d0446 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -168,6 +168,34 @@
 }
 
 /*
+ * t4_report_fw_error - report firmware error
+ * @adap: the adapter
+ *
+ * The adapter firmware can indicate error conditions to the host.
+ * If the firmware has indicated an error, print out the reason for
+ * the firmware error.
+ */
+static void t4_report_fw_error(struct adapter *adap)
+{
+	static const char *const reason[] = {
+		"Crash",                        /* PCIE_FW_EVAL_CRASH */
+		"During Device Preparation",    /* PCIE_FW_EVAL_PREP */
+		"During Device Configuration",  /* PCIE_FW_EVAL_CONF */
+		"During Device Initialization", /* PCIE_FW_EVAL_INIT */
+		"Unexpected Event",             /* PCIE_FW_EVAL_UNEXPECTEDEVENT */
+		"Insufficient Airflow",         /* PCIE_FW_EVAL_OVERHEAT */
+		"Device Shutdown",              /* PCIE_FW_EVAL_DEVICESHUTDOWN */
+		"Reserved",                     /* reserved */
+	};
+	u32 pcie_fw;
+
+	pcie_fw = t4_read_reg(adap, MA_PCIE_FW);
+	if (pcie_fw & FW_PCIE_FW_ERR)
+		dev_err(adap->pdev_dev, "Firmware reports adapter error: %s\n",
+			reason[FW_PCIE_FW_EVAL_GET(pcie_fw)]);
+}
+
+/*
  * Get the reply to a mailbox command and store it in @rpl in big-endian order.
  */
 static void get_mbox_rpl(struct adapter *adap, __be64 *rpl, int nflit,
@@ -300,6 +328,7 @@
 	dump_mbox(adap, mbox, data_reg);
 	dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n",
 		*(const u8 *)cmd, mbox);
+	t4_report_fw_error(adap);
 	return -ETIMEDOUT;
 }
 
@@ -566,6 +595,7 @@
 #define VPD_BASE           0x400
 #define VPD_BASE_OLD       0
 #define VPD_LEN            1024
+#define CHELSIO_VPD_UNIQUE_ID 0x82
 
 /**
  *	t4_seeprom_wp - enable/disable EEPROM write protection
@@ -603,7 +633,14 @@
 	ret = pci_read_vpd(adapter->pdev, VPD_BASE, sizeof(u32), vpd);
 	if (ret < 0)
 		goto out;
-	addr = *vpd == 0x82 ? VPD_BASE : VPD_BASE_OLD;
+
+	/* The VPD shall have a unique identifier specified by the PCI SIG.
+	 * For Chelsio adapters that identifier is CHELSIO_VPD_UNIQUE_ID (0x82),
+	 * and the VPD programming software is expected to put this entry at
+	 * the beginning of the VPD.
+	 */
+	addr = *vpd == CHELSIO_VPD_UNIQUE_ID ? VPD_BASE : VPD_BASE_OLD;
 
 	ret = pci_read_vpd(adapter->pdev, addr, VPD_LEN, vpd);
 	if (ret < 0)
@@ -667,6 +704,7 @@
 	i = pci_vpd_info_field_size(vpd + sn - PCI_VPD_INFO_FLD_HDR_SIZE);
 	memcpy(p->sn, vpd + sn, min(i, SERNUM_LEN));
 	strim(p->sn);
+	i = pci_vpd_info_field_size(vpd + pn - PCI_VPD_INFO_FLD_HDR_SIZE);
 	memcpy(p->pn, vpd + pn, min(i, PN_LEN));
 	strim(p->pn);
 
@@ -1394,15 +1432,18 @@
 
 	int fat;
 
-	fat = t4_handle_intr_status(adapter,
-				    PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS,
-				    sysbus_intr_info) +
-	      t4_handle_intr_status(adapter,
-				    PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS,
-				    pcie_port_intr_info) +
-	      t4_handle_intr_status(adapter, PCIE_INT_CAUSE,
-				    is_t4(adapter->params.chip) ?
-				    pcie_intr_info : t5_pcie_intr_info);
+	if (is_t4(adapter->params.chip))
+		fat = t4_handle_intr_status(adapter,
+					    PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS,
+					    sysbus_intr_info) +
+			t4_handle_intr_status(adapter,
+					      PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS,
+					      pcie_port_intr_info) +
+			t4_handle_intr_status(adapter, PCIE_INT_CAUSE,
+					      pcie_intr_info);
+	else
+		fat = t4_handle_intr_status(adapter, PCIE_INT_CAUSE,
+					    t5_pcie_intr_info);
 
 	if (fat)
 		t4_fatal_err(adapter);
@@ -1521,6 +1562,9 @@
 
 	int fat;
 
+	if (t4_read_reg(adapter, MA_PCIE_FW) & FW_PCIE_FW_ERR)
+		t4_report_fw_error(adapter);
+
 	fat = t4_handle_intr_status(adapter, CIM_HOST_INT_CAUSE,
 				    cim_intr_info) +
 	      t4_handle_intr_status(adapter, CIM_HOST_UPACC_INT_CAUSE,
@@ -1768,10 +1812,16 @@
 {
 	u32 v, status = t4_read_reg(adap, MA_INT_CAUSE);
 
-	if (status & MEM_PERR_INT_CAUSE)
+	if (status & MEM_PERR_INT_CAUSE) {
 		dev_alert(adap->pdev_dev,
 			  "MA parity error, parity status %#x\n",
 			  t4_read_reg(adap, MA_PARITY_ERROR_STATUS));
+		if (is_t5(adap->params.chip))
+			dev_alert(adap->pdev_dev,
+				  "MA parity error, parity status %#x\n",
+				  t4_read_reg(adap,
+					      MA_PARITY_ERROR_STATUS2));
+	}
 	if (status & MEM_WRAP_INT_CAUSE) {
 		v = t4_read_reg(adap, MA_INT_WRAP_STATUS);
 		dev_alert(adap->pdev_dev, "MA address wrap-around error by "
@@ -2733,12 +2783,16 @@
 	/*
 	 * Issue the HELLO command to the firmware.  If it's not successful
 	 * but indicates that we got a "busy" or "timeout" condition, retry
-	 * the HELLO until we exhaust our retry limit.
+	 * the HELLO until we exhaust our retry limit.  If we do exceed our
+	 * retry limit, check to see if the firmware left us any error
+	 * information and report that if so.
 	 */
 	ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c);
 	if (ret < 0) {
 		if ((ret == -EBUSY || ret == -ETIMEDOUT) && retries-- > 0)
 			goto retry;
+		if (t4_read_reg(adap, MA_PCIE_FW) & FW_PCIE_FW_ERR)
+			t4_report_fw_error(adap);
 		return ret;
 	}
 
@@ -3742,6 +3796,7 @@
 			lc->link_ok = link_ok;
 			lc->speed = speed;
 			lc->fc = fc;
+			lc->supported = be16_to_cpu(p->u.info.pcap);
 			t4_os_link_changed(adap, port, link_ok);
 		}
 		if (mod != pi->mod_type) {
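t4_report_fw_error() decodes the PCIE_FW scratch register into which the firmware latches its failure reason: one valid bit (FW_PCIE_FW_ERR) plus a 3-bit reason code pulled out by FW_PCIE_FW_EVAL_GET() (shift 24, mask 0x7, per the t4fw_api.h hunk below) that indexes the eight-entry reason[] table. The decode itself is plain shift-and-mask; a standalone sketch, with the ERR bit position assumed rather than taken from the headers:

	#include <stdint.h>
	#include <stdio.h>

	#define EVAL_SHIFT	24
	#define EVAL_MASK	0x7
	#define ERR_BIT		(1u << 31)	/* assumed position of FW_PCIE_FW_ERR */

	static void report_fw_error(uint32_t pcie_fw)
	{
		static const char *const reason[8] = {
			"Crash", "During Device Preparation",
			"During Device Configuration", "During Device Initialization",
			"Unexpected Event", "Insufficient Airflow",
			"Device Shutdown", "Reserved",
		};

		if (pcie_fw & ERR_BIT)
			printf("Firmware reports adapter error: %s\n",
			       reason[(pcie_fw >> EVAL_SHIFT) & EVAL_MASK]);
	}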
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
index e3146e8..39fb325 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
@@ -511,6 +511,7 @@
 #define  MEM_WRAP_CLIENT_NUM_GET(x) (((x) & MEM_WRAP_CLIENT_NUM_MASK) >> MEM_WRAP_CLIENT_NUM_SHIFT)
 #define MA_PCIE_FW 0x30b8
 #define MA_PARITY_ERROR_STATUS 0x77f4
+#define MA_PARITY_ERROR_STATUS2 0x7804
 
 #define MA_EXT_MEMORY1_BAR 0x7808
 #define EDC_0_BASE_ADDR 0x7900
@@ -959,6 +960,7 @@
 #define  TRCMULTIFILTER     0x00000001U
 
 #define MPS_TRC_RSS_CONTROL 0x9808
+#define MPS_T5_TRC_RSS_CONTROL 0xa00c
 #define  RSSCONTROL_MASK    0x00ff0000U
 #define  RSSCONTROL_SHIFT   16
 #define  RSSCONTROL(x)      ((x) << RSSCONTROL_SHIFT)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index 5f2729e..3409756 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -2228,6 +2228,10 @@
 #define FW_PCIE_FW_MASTER(x)     ((x) << FW_PCIE_FW_MASTER_SHIFT)
 #define FW_PCIE_FW_MASTER_GET(x) (((x) >> FW_PCIE_FW_MASTER_SHIFT) & \
 				 FW_PCIE_FW_MASTER_MASK)
+#define FW_PCIE_FW_EVAL_MASK   0x7
+#define FW_PCIE_FW_EVAL_SHIFT  24
+#define FW_PCIE_FW_EVAL_GET(x) (((x) >> FW_PCIE_FW_EVAL_SHIFT) & \
+				 FW_PCIE_FW_EVAL_MASK)
 
 struct fw_hdr {
 	u8 ver;
diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index a0b418e..566b17d 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -1994,7 +1994,7 @@
 {
 	swqe->tx_control |= EHEA_SWQE_IMM_DATA_PRESENT | EHEA_SWQE_CRC;
 
-	if (skb->protocol != htons(ETH_P_IP))
+	if (vlan_get_protocol(skb) != htons(ETH_P_IP))
 		return;
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index cbc330b..ad3d5d1 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -2674,7 +2674,8 @@
 #define E1000_TX_FLAGS_VLAN_SHIFT	16
 
 static int e1000_tso(struct e1000_adapter *adapter,
-		     struct e1000_tx_ring *tx_ring, struct sk_buff *skb)
+		     struct e1000_tx_ring *tx_ring, struct sk_buff *skb,
+		     __be16 protocol)
 {
 	struct e1000_context_desc *context_desc;
 	struct e1000_buffer *buffer_info;
@@ -2692,7 +2693,7 @@
 
 		hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
 		mss = skb_shinfo(skb)->gso_size;
-		if (skb->protocol == htons(ETH_P_IP)) {
+		if (protocol == htons(ETH_P_IP)) {
 			struct iphdr *iph = ip_hdr(skb);
 			iph->tot_len = 0;
 			iph->check = 0;
@@ -2702,7 +2703,7 @@
 								 0);
 			cmd_length = E1000_TXD_CMD_IP;
 			ipcse = skb_transport_offset(skb) - 1;
-		} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		} else if (skb_is_gso_v6(skb)) {
 			ipv6_hdr(skb)->payload_len = 0;
 			tcp_hdr(skb)->check =
 				~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
@@ -2745,7 +2746,8 @@
 }
 
 static bool e1000_tx_csum(struct e1000_adapter *adapter,
-			  struct e1000_tx_ring *tx_ring, struct sk_buff *skb)
+			  struct e1000_tx_ring *tx_ring, struct sk_buff *skb,
+			  __be16 protocol)
 {
 	struct e1000_context_desc *context_desc;
 	struct e1000_buffer *buffer_info;
@@ -2756,7 +2758,7 @@
 	if (skb->ip_summed != CHECKSUM_PARTIAL)
 		return false;
 
-	switch (skb->protocol) {
+	switch (protocol) {
 	case cpu_to_be16(ETH_P_IP):
 		if (ip_hdr(skb)->protocol == IPPROTO_TCP)
 			cmd_len |= E1000_TXD_CMD_TCP;
@@ -3097,6 +3099,7 @@
 	int count = 0;
 	int tso;
 	unsigned int f;
+	__be16 protocol = vlan_get_protocol(skb);
 
 	/* This goes back to the question of how to logically map a Tx queue
 	 * to a flow.  Right now, performance is impacted slightly negatively
@@ -3210,7 +3213,7 @@
 
 	first = tx_ring->next_to_use;
 
-	tso = e1000_tso(adapter, tx_ring, skb);
+	tso = e1000_tso(adapter, tx_ring, skb, protocol);
 	if (tso < 0) {
 		dev_kfree_skb_any(skb);
 		return NETDEV_TX_OK;
@@ -3220,10 +3223,10 @@
 		if (likely(hw->mac_type != e1000_82544))
 			tx_ring->last_tx_tso = true;
 		tx_flags |= E1000_TX_FLAGS_TSO;
-	} else if (likely(e1000_tx_csum(adapter, tx_ring, skb)))
+	} else if (likely(e1000_tx_csum(adapter, tx_ring, skb, protocol)))
 		tx_flags |= E1000_TX_FLAGS_CSUM;
 
-	if (likely(skb->protocol == htons(ETH_P_IP)))
+	if (protocol == htons(ETH_P_IP))
 		tx_flags |= E1000_TX_FLAGS_IPV4;
 
 	if (unlikely(skb->no_fcs))
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 65c3aef..247335d 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -5164,7 +5164,8 @@
 #define E1000_TX_FLAGS_VLAN_MASK	0xffff0000
 #define E1000_TX_FLAGS_VLAN_SHIFT	16
 
-static int e1000_tso(struct e1000_ring *tx_ring, struct sk_buff *skb)
+static int e1000_tso(struct e1000_ring *tx_ring, struct sk_buff *skb,
+		     __be16 protocol)
 {
 	struct e1000_context_desc *context_desc;
 	struct e1000_buffer *buffer_info;
@@ -5183,7 +5184,7 @@
 
 	hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
 	mss = skb_shinfo(skb)->gso_size;
-	if (skb->protocol == htons(ETH_P_IP)) {
+	if (protocol == htons(ETH_P_IP)) {
 		struct iphdr *iph = ip_hdr(skb);
 		iph->tot_len = 0;
 		iph->check = 0;
@@ -5231,7 +5232,8 @@
 	return 1;
 }
 
-static bool e1000_tx_csum(struct e1000_ring *tx_ring, struct sk_buff *skb)
+static bool e1000_tx_csum(struct e1000_ring *tx_ring, struct sk_buff *skb,
+			  __be16 protocol)
 {
 	struct e1000_adapter *adapter = tx_ring->adapter;
 	struct e1000_context_desc *context_desc;
@@ -5239,16 +5241,10 @@
 	unsigned int i;
 	u8 css;
 	u32 cmd_len = E1000_TXD_CMD_DEXT;
-	__be16 protocol;
 
 	if (skb->ip_summed != CHECKSUM_PARTIAL)
 		return false;
 
-	if (skb->protocol == cpu_to_be16(ETH_P_8021Q))
-		protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
-	else
-		protocol = skb->protocol;
-
 	switch (protocol) {
 	case cpu_to_be16(ETH_P_IP):
 		if (ip_hdr(skb)->protocol == IPPROTO_TCP)
@@ -5546,6 +5542,7 @@
 	int count = 0;
 	int tso;
 	unsigned int f;
+	__be16 protocol = vlan_get_protocol(skb);
 
 	if (test_bit(__E1000_DOWN, &adapter->state)) {
 		dev_kfree_skb_any(skb);
@@ -5620,7 +5617,7 @@
 
 	first = tx_ring->next_to_use;
 
-	tso = e1000_tso(tx_ring, skb);
+	tso = e1000_tso(tx_ring, skb, protocol);
 	if (tso < 0) {
 		dev_kfree_skb_any(skb);
 		return NETDEV_TX_OK;
@@ -5628,14 +5625,14 @@
 
 	if (tso)
 		tx_flags |= E1000_TX_FLAGS_TSO;
-	else if (e1000_tx_csum(tx_ring, skb))
+	else if (e1000_tx_csum(tx_ring, skb, protocol))
 		tx_flags |= E1000_TX_FLAGS_CSUM;
 
 	/* Old method was to assume IPv4 packet by default if TSO was enabled.
 	 * 82571 hardware supports TSO capabilities for IPv6 as well...
 	 * no longer assume, we must.
 	 */
-	if (skb->protocol == htons(ETH_P_IP))
+	if (protocol == htons(ETH_P_IP))
 		tx_flags |= E1000_TX_FLAGS_IPV4;
 
 	if (unlikely(skb->no_fcs))
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index a51aa37..369848e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2295,7 +2295,7 @@
 		goto out_drop;
 
 	/* obtain protocol of skb */
-	protocol = skb->protocol;
+	protocol = vlan_get_protocol(skb);
 
 	/* record the location of the first descriptor for this packet */
 	first = &tx_ring->tx_bi[tx_ring->next_to_use];
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 79bf96c..95a3ec2 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1597,7 +1597,7 @@
 		goto out_drop;
 
 	/* obtain protocol of skb */
-	protocol = skb->protocol;
+	protocol = vlan_get_protocol(skb);
 
 	/* record the location of the first descriptor for this packet */
 	first = &tx_ring->tx_bi[tx_ring->next_to_use];
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index c9f1d1b..ade067d 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -20,6 +20,7 @@
 #include <linux/mbus.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
+#include <linux/if_vlan.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
 #include <linux/io.h>
@@ -1371,15 +1372,16 @@
 {
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		int ip_hdr_len = 0;
+		__be16 l3_proto = vlan_get_protocol(skb);
 		u8 l4_proto;
 
-		if (skb->protocol == htons(ETH_P_IP)) {
+		if (l3_proto == htons(ETH_P_IP)) {
 			struct iphdr *ip4h = ip_hdr(skb);
 
 			/* Calculate IPv4 checksum and L4 checksum */
 			ip_hdr_len = ip4h->ihl;
 			l4_proto = ip4h->protocol;
-		} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		} else if (l3_proto == htons(ETH_P_IPV6)) {
 			struct ipv6hdr *ip6h = ipv6_hdr(skb);
 
 			/* Read l4_protocol from one of IPv6 extra headers */
@@ -1390,7 +1392,7 @@
 			return MVNETA_TX_L4_CSUM_NOT;
 
 		return mvneta_txq_desc_csum(skb_network_offset(skb),
-				skb->protocol, ip_hdr_len, l4_proto);
+					    l3_proto, ip_hdr_len, l4_proto);
 	}
 
 	return MVNETA_TX_L4_CSUM_NOT;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index bb536aa..abddcf8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -474,39 +474,12 @@
 				    int qpn, u64 *reg_id)
 {
 	int err;
-	struct mlx4_spec_list spec_eth_outer = { {NULL} };
-	struct mlx4_spec_list spec_vxlan     = { {NULL} };
-	struct mlx4_spec_list spec_eth_inner = { {NULL} };
-
-	struct mlx4_net_trans_rule rule = {
-		.queue_mode = MLX4_NET_TRANS_Q_FIFO,
-		.exclusive = 0,
-		.allow_loopback = 1,
-		.promisc_mode = MLX4_FS_REGULAR,
-		.priority = MLX4_DOMAIN_NIC,
-	};
-
-	__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
 
 	if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
 		return 0; /* do nothing */
 
-	rule.port = priv->port;
-	rule.qpn = qpn;
-	INIT_LIST_HEAD(&rule.list);
-
-	spec_eth_outer.id = MLX4_NET_TRANS_RULE_ID_ETH;
-	memcpy(spec_eth_outer.eth.dst_mac, addr, ETH_ALEN);
-	memcpy(spec_eth_outer.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
-
-	spec_vxlan.id = MLX4_NET_TRANS_RULE_ID_VXLAN;    /* any vxlan header */
-	spec_eth_inner.id = MLX4_NET_TRANS_RULE_ID_ETH;	 /* any inner eth header */
-
-	list_add_tail(&spec_eth_outer.list, &rule.list);
-	list_add_tail(&spec_vxlan.list,     &rule.list);
-	list_add_tail(&spec_eth_inner.list, &rule.list);
-
-	err = mlx4_flow_attach(priv->mdev->dev, &rule, reg_id);
+	err = mlx4_tunnel_steer_add(priv->mdev->dev, addr, priv->port, qpn,
+				    MLX4_DOMAIN_NIC, reg_id);
 	if (err) {
 		en_err(priv, "failed to add vxlan steering rule, err %d\n", err);
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c
index d80e7a6..ca0f98c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mcg.c
+++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c
@@ -1020,6 +1020,44 @@
 }
 EXPORT_SYMBOL_GPL(mlx4_flow_detach);
 
+int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr,
+			  int port, int qpn, u16 prio, u64 *reg_id)
+{
+	int err;
+	struct mlx4_spec_list spec_eth_outer = { {NULL} };
+	struct mlx4_spec_list spec_vxlan     = { {NULL} };
+	struct mlx4_spec_list spec_eth_inner = { {NULL} };
+
+	struct mlx4_net_trans_rule rule = {
+		.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+		.exclusive = 0,
+		.allow_loopback = 1,
+		.promisc_mode = MLX4_FS_REGULAR,
+	};
+
+	__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+	rule.port = port;
+	rule.qpn = qpn;
+	rule.priority = prio;
+	INIT_LIST_HEAD(&rule.list);
+
+	spec_eth_outer.id = MLX4_NET_TRANS_RULE_ID_ETH;
+	memcpy(spec_eth_outer.eth.dst_mac, addr, ETH_ALEN);
+	memcpy(spec_eth_outer.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+
+	spec_vxlan.id = MLX4_NET_TRANS_RULE_ID_VXLAN;    /* any vxlan header */
+	spec_eth_inner.id = MLX4_NET_TRANS_RULE_ID_ETH;	 /* any inner eth header */
+
+	list_add_tail(&spec_eth_outer.list, &rule.list);
+	list_add_tail(&spec_vxlan.list,     &rule.list);
+	list_add_tail(&spec_eth_inner.list, &rule.list);
+
+	err = mlx4_flow_attach(dev, &rule, reg_id);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_tunnel_steer_add);
+
 int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn,
 				      u32 max_range_qpn)
 {
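mlx4_tunnel_steer_add() hoists the VXLAN steering-rule construction out of mlx4_en (see the en_netdev.c hunk above) into the core, so any consumer can attach an outer-dst-MAC + any-VXLAN-header + any-inner-Ethernet rule with one call. Usage mirrors the en_netdev.c caller:

	u64 reg_id;
	int err;

	err = mlx4_tunnel_steer_add(mdev->dev, addr, port, qpn,
				    MLX4_DOMAIN_NIC, &reg_id);
	if (err)
		return err;

	/* reg_id identifies the rule for a later mlx4_flow_detach() */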
diff --git a/drivers/net/ethernet/moxa/moxart_ether.c b/drivers/net/ethernet/moxa/moxart_ether.c
index 5020fd4..2f12c88 100644
--- a/drivers/net/ethernet/moxa/moxart_ether.c
+++ b/drivers/net/ethernet/moxa/moxart_ether.c
@@ -206,7 +206,7 @@
 	int rx_head = priv->rx_head;
 	int rx = 0;
 
-	while (1) {
+	while (rx < budget) {
 		desc = priv->rx_desc_base + (RX_REG_DESC_SIZE * rx_head);
 		desc0 = readl(desc + RX_REG_OFFSET_DESC0);
 
@@ -218,7 +218,7 @@
 			net_dbg_ratelimited("packet error\n");
 			priv->stats.rx_dropped++;
 			priv->stats.rx_errors++;
-			continue;
+			goto rx_next;
 		}
 
 		len = desc0 & RX_DESC0_FRAME_LEN_MASK;
@@ -226,13 +226,19 @@
 		if (len > RX_BUF_SIZE)
 			len = RX_BUF_SIZE;
 
-		skb = build_skb(priv->rx_buf[rx_head], priv->rx_buf_size);
+		dma_sync_single_for_cpu(&ndev->dev,
+					priv->rx_mapping[rx_head],
+					priv->rx_buf_size, DMA_FROM_DEVICE);
+		skb = netdev_alloc_skb_ip_align(ndev, len);
+
 		if (unlikely(!skb)) {
-			net_dbg_ratelimited("build_skb failed\n");
+			net_dbg_ratelimited("netdev_alloc_skb_ip_align failed\n");
 			priv->stats.rx_dropped++;
 			priv->stats.rx_errors++;
+			goto rx_next;
 		}
 
+		memcpy(skb->data, priv->rx_buf[rx_head], len);
 		skb_put(skb, len);
 		skb->protocol = eth_type_trans(skb, ndev);
 		napi_gro_receive(&priv->napi, skb);
@@ -244,18 +250,15 @@
 		if (desc0 & RX_DESC0_MULTICAST)
 			priv->stats.multicast++;
 
+rx_next:
 		writel(RX_DESC0_DMA_OWN, desc + RX_REG_OFFSET_DESC0);
 
 		rx_head = RX_NEXT(rx_head);
 		priv->rx_head = rx_head;
-
-		if (rx >= budget)
-			break;
 	}
 
 	if (rx < budget) {
-		napi_gro_flush(napi, false);
-		__napi_complete(napi);
+		napi_complete(napi);
 	}
 
 	priv->reg_imr |= RPKT_FINISH_M;
@@ -346,10 +349,12 @@
 		len = ETH_ZLEN;
 	}
 
-	txdes1 = readl(desc + TX_REG_OFFSET_DESC1);
-	txdes1 |= TX_DESC1_LTS | TX_DESC1_FTS;
-	txdes1 &= ~(TX_DESC1_FIFO_COMPLETE | TX_DESC1_INTR_COMPLETE);
-	txdes1 |= (len & TX_DESC1_BUF_SIZE_MASK);
+	dma_sync_single_for_device(&ndev->dev, priv->tx_mapping[tx_head],
+				   priv->tx_buf_size, DMA_TO_DEVICE);
+
+	txdes1 = TX_DESC1_LTS | TX_DESC1_FTS | (len & TX_DESC1_BUF_SIZE_MASK);
+	if (tx_head == TX_DESC_NUM_MASK)
+		txdes1 |= TX_DESC1_END;
 	writel(txdes1, desc + TX_REG_OFFSET_DESC1);
 	writel(TX_DESC0_DMA_OWN, desc + TX_REG_OFFSET_DESC0);
 
@@ -465,8 +470,7 @@
 	spin_lock_init(&priv->txlock);
 
 	priv->tx_buf_size = TX_BUF_SIZE;
-	priv->rx_buf_size = RX_BUF_SIZE +
-			    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	priv->rx_buf_size = RX_BUF_SIZE;
 
 	priv->tx_desc_base = dma_alloc_coherent(NULL, TX_REG_DESC_SIZE *
 						TX_DESC_NUM, &priv->tx_base,
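The moxart RX rewrite fixes two real bugs: the old while (1) loop only checked the budget at the bottom, and continue on a bad descriptor skipped the head-advance and descriptor-return code, so a single poisoned descriptor could spin the poll loop forever. It also stops handing DMA-mapped buffers to build_skb() and instead copies into a freshly allocated skb with an explicit dma_sync around the access. The new shape is the standard NAPI idiom, in outline (helper names invented for brevity):

	while (rx < budget) {		/* never exceed the NAPI budget */
		desc = next_rx_desc();
		if (!owned_by_cpu(desc))
			break;
		if (!desc_has_error(desc)) {
			deliver_to_stack(desc);
			rx++;
		}
		/* reached on success and on error alike */
		return_desc_to_hw(desc);
		advance_rx_head();
	}
	if (rx < budget)
		napi_complete(napi);	/* done for now; re-enable RX interrupts */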
diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c
index 8706c0d..a44a03c 100644
--- a/drivers/net/ethernet/nxp/lpc_eth.c
+++ b/drivers/net/ethernet/nxp/lpc_eth.c
@@ -1220,6 +1220,9 @@
 
 	__lpc_eth_clock_enable(pldat, true);
 
+	/* A suspended PHY makes the LPC Ethernet core block, so resume it now */
+	phy_resume(pldat->phy_dev);
+
 	/* Reset and initialize */
 	__lpc_eth_reset(pldat);
 	__lpc_eth_init(pldat);
diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
index 188626e..3e96f26 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c
+++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
@@ -2556,6 +2556,7 @@
 
 	if (skb_is_gso(skb)) {
 		int err;
+		__be16 l3_proto = vlan_get_protocol(skb);
 
 		err = skb_cow_head(skb, 0);
 		if (err < 0)
@@ -2572,7 +2573,7 @@
 				<< OB_MAC_TRANSPORT_HDR_SHIFT);
 		mac_iocb_ptr->mss = cpu_to_le16(skb_shinfo(skb)->gso_size);
 		mac_iocb_ptr->flags2 |= OB_MAC_TSO_IOCB_LSO;
-		if (likely(skb->protocol == htons(ETH_P_IP))) {
+		if (likely(l3_proto == htons(ETH_P_IP))) {
 			struct iphdr *iph = ip_hdr(skb);
 			iph->check = 0;
 			mac_iocb_ptr->flags1 |= OB_MAC_TSO_IOCB_IP4;
@@ -2580,7 +2581,7 @@
 								 iph->daddr, 0,
 								 IPPROTO_TCP,
 								 0);
-		} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		} else if (l3_proto == htons(ETH_P_IPV6)) {
 			mac_iocb_ptr->flags1 |= OB_MAC_TSO_IOCB_IP6;
 			tcp_hdr(skb)->check =
 			    ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
diff --git a/drivers/net/ethernet/renesas/Kconfig b/drivers/net/ethernet/renesas/Kconfig
index 9e757c7..196e98a 100644
--- a/drivers/net/ethernet/renesas/Kconfig
+++ b/drivers/net/ethernet/renesas/Kconfig
@@ -5,6 +5,7 @@
 config SH_ETH
 	tristate "Renesas SuperH Ethernet support"
 	depends on HAS_DMA
+	depends on ARCH_SHMOBILE || SUPERH || COMPILE_TEST
 	select CRC32
 	select MII
 	select MDIO_BITBANG
diff --git a/drivers/net/ethernet/stmicro/stmmac/chain_mode.c b/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
index c553f6b..cf28dab 100644
--- a/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
+++ b/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
@@ -28,7 +28,7 @@
 
 #include "stmmac.h"
 
-static unsigned int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
+static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 {
 	struct stmmac_priv *priv = (struct stmmac_priv *)p;
 	unsigned int txsize = priv->dma_tx_size;
@@ -47,7 +47,9 @@
 
 	desc->des2 = dma_map_single(priv->device, skb->data,
 				    bmax, DMA_TO_DEVICE);
-	priv->tx_skbuff_dma[entry] = desc->des2;
+	if (dma_mapping_error(priv->device, desc->des2))
+		return -1;
+	priv->tx_skbuff_dma[entry].buf = desc->des2;
 	priv->hw->desc->prepare_tx_desc(desc, 1, bmax, csum, STMMAC_CHAIN_MODE);
 
 	while (len != 0) {
@@ -59,7 +61,9 @@
 			desc->des2 = dma_map_single(priv->device,
 						    (skb->data + bmax * i),
 						    bmax, DMA_TO_DEVICE);
-			priv->tx_skbuff_dma[entry] = desc->des2;
+			if (dma_mapping_error(priv->device, desc->des2))
+				return -1;
+			priv->tx_skbuff_dma[entry].buf = desc->des2;
 			priv->hw->desc->prepare_tx_desc(desc, 0, bmax, csum,
 							STMMAC_CHAIN_MODE);
 			priv->hw->desc->set_tx_owner(desc);
@@ -69,7 +73,9 @@
 			desc->des2 = dma_map_single(priv->device,
 						    (skb->data + bmax * i), len,
 						    DMA_TO_DEVICE);
-			priv->tx_skbuff_dma[entry] = desc->des2;
+			if (dma_mapping_error(priv->device, desc->des2))
+				return -1;
+			priv->tx_skbuff_dma[entry].buf = desc->des2;
 			priv->hw->desc->prepare_tx_desc(desc, 0, len, csum,
 							STMMAC_CHAIN_MODE);
 			priv->hw->desc->set_tx_owner(desc);
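dma_map_single() can fail (IOMMU exhaustion, bounce-buffer shortage), and the handle it returns must be validated with dma_mapping_error() before being written into a descriptor; these stmmac hunks add exactly that check after every TX mapping, and the same pattern repeats in ring_mode.c below. The canonical sequence:

	dma_addr_t buf;

	buf = dma_map_single(priv->device, skb->data, len, DMA_TO_DEVICE);
	if (dma_mapping_error(priv->device, buf))
		return -1;		/* caller drops the skb and counts the error */

	priv->tx_skbuff_dma[entry].buf = buf;
	desc->des2 = buf;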
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index de507c3..593e6c4 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -220,10 +220,10 @@
 	handle_tx = 0x8,
 };
 
-#define	CORE_IRQ_TX_PATH_IN_LPI_MODE	(1 << 1)
-#define	CORE_IRQ_TX_PATH_EXIT_LPI_MODE	(1 << 2)
-#define	CORE_IRQ_RX_PATH_IN_LPI_MODE	(1 << 3)
-#define	CORE_IRQ_RX_PATH_EXIT_LPI_MODE	(1 << 4)
+#define	CORE_IRQ_TX_PATH_IN_LPI_MODE	(1 << 0)
+#define	CORE_IRQ_TX_PATH_EXIT_LPI_MODE	(1 << 1)
+#define	CORE_IRQ_RX_PATH_IN_LPI_MODE	(1 << 2)
+#define	CORE_IRQ_RX_PATH_EXIT_LPI_MODE	(1 << 3)
 
 #define	CORE_PCS_ANE_COMPLETE		(1 << 5)
 #define	CORE_PCS_LINK_STATUS		(1 << 6)
@@ -287,7 +287,7 @@
 
 /* Default LPI timers */
 #define STMMAC_DEFAULT_LIT_LS	0x3E8
-#define STMMAC_DEFAULT_TWT_LS	0x0
+#define STMMAC_DEFAULT_TWT_LS	0x1E
 
 #define STMMAC_CHAIN_MODE	0x1
 #define STMMAC_RING_MODE	0x2
@@ -425,7 +425,7 @@
 	void (*init) (void *des, dma_addr_t phy_addr, unsigned int size,
 		      unsigned int extend_desc);
 	unsigned int (*is_jumbo_frm) (int len, int ehn_desc);
-	unsigned int (*jumbo_frm) (void *priv, struct sk_buff *skb, int csum);
+	int (*jumbo_frm)(void *priv, struct sk_buff *skb, int csum);
 	int (*set_16kib_bfsize)(int mtu);
 	void (*init_desc3)(struct dma_desc *p);
 	void (*refill_desc3) (void *priv, struct dma_desc *p);
@@ -445,6 +445,7 @@
 	int multicast_filter_bins;
 	int unicast_filter_entries;
 	int mcast_bits_log2;
+	unsigned int rx_csum;
 };
 
 struct mac_device_info *dwmac1000_setup(void __iomem *ioaddr, int mcbins,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
index 71b5419..64d8f56 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
@@ -153,7 +153,7 @@
 #define GMAC_CONTROL_RE		0x00000004	/* Receiver Enable */
 
 #define GMAC_CORE_INIT (GMAC_CONTROL_JD | GMAC_CONTROL_PS | GMAC_CONTROL_ACS | \
-			GMAC_CONTROL_BE)
+			GMAC_CONTROL_BE | GMAC_CONTROL_DCRS)
 
 /* GMAC Frame Filter defines */
 #define GMAC_FRAME_FILTER_PR	0x00000001	/* Promiscuous Mode */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
index d8ef187..5efe60e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -58,7 +58,11 @@
 	void __iomem *ioaddr = hw->pcsr;
 	u32 value = readl(ioaddr + GMAC_CONTROL);
 
-	value |= GMAC_CONTROL_IPC;
+	if (hw->rx_csum)
+		value |= GMAC_CONTROL_IPC;
+	else
+		value &= ~GMAC_CONTROL_IPC;
+
 	writel(value, ioaddr + GMAC_CONTROL);
 
 	value = readl(ioaddr + GMAC_CONTROL);
diff --git a/drivers/net/ethernet/stmicro/stmmac/mmc.h b/drivers/net/ethernet/stmicro/stmmac/mmc.h
index 8607488..192c249 100644
--- a/drivers/net/ethernet/stmicro/stmmac/mmc.h
+++ b/drivers/net/ethernet/stmicro/stmmac/mmc.h
@@ -68,7 +68,7 @@
 	unsigned int mmc_rx_octetcount_g;
 	unsigned int mmc_rx_broadcastframe_g;
 	unsigned int mmc_rx_multicastframe_g;
-	unsigned int mmc_rx_crc_errror;
+	unsigned int mmc_rx_crc_error;
 	unsigned int mmc_rx_align_error;
 	unsigned int mmc_rx_run_error;
 	unsigned int mmc_rx_jabber_error;
diff --git a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c
index 50617c5..08c483b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c
@@ -196,7 +196,7 @@
 	mmc->mmc_rx_octetcount_g += readl(ioaddr + MMC_RX_OCTETCOUNT_G);
 	mmc->mmc_rx_broadcastframe_g += readl(ioaddr + MMC_RX_BROADCASTFRAME_G);
 	mmc->mmc_rx_multicastframe_g += readl(ioaddr + MMC_RX_MULTICASTFRAME_G);
-	mmc->mmc_rx_crc_errror += readl(ioaddr + MMC_RX_CRC_ERRROR);
+	mmc->mmc_rx_crc_error += readl(ioaddr + MMC_RX_CRC_ERRROR);
 	mmc->mmc_rx_align_error += readl(ioaddr + MMC_RX_ALIGN_ERROR);
 	mmc->mmc_rx_run_error += readl(ioaddr + MMC_RX_RUN_ERROR);
 	mmc->mmc_rx_jabber_error += readl(ioaddr + MMC_RX_JABBER_ERROR);
diff --git a/drivers/net/ethernet/stmicro/stmmac/ring_mode.c b/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
index 650a4be..5dd50c6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
+++ b/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
@@ -28,7 +28,7 @@
 
 #include "stmmac.h"
 
-static unsigned int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
+static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 {
 	struct stmmac_priv *priv = (struct stmmac_priv *)p;
 	unsigned int txsize = priv->dma_tx_size;
@@ -53,7 +53,10 @@
 
 		desc->des2 = dma_map_single(priv->device, skb->data,
 					    bmax, DMA_TO_DEVICE);
-		priv->tx_skbuff_dma[entry] = desc->des2;
+		if (dma_mapping_error(priv->device, desc->des2))
+			return -1;
+
+		priv->tx_skbuff_dma[entry].buf = desc->des2;
 		desc->des3 = desc->des2 + BUF_SIZE_4KiB;
 		priv->hw->desc->prepare_tx_desc(desc, 1, bmax, csum,
 						STMMAC_RING_MODE);
@@ -68,7 +71,9 @@
 
 		desc->des2 = dma_map_single(priv->device, skb->data + bmax,
 					    len, DMA_TO_DEVICE);
-		priv->tx_skbuff_dma[entry] = desc->des2;
+		if (dma_mapping_error(priv->device, desc->des2))
+			return -1;
+		priv->tx_skbuff_dma[entry].buf = desc->des2;
 		desc->des3 = desc->des2 + BUF_SIZE_4KiB;
 		priv->hw->desc->prepare_tx_desc(desc, 0, len, csum,
 						STMMAC_RING_MODE);
@@ -77,7 +82,9 @@
 	} else {
 		desc->des2 = dma_map_single(priv->device, skb->data,
 					    nopaged_len, DMA_TO_DEVICE);
-		priv->tx_skbuff_dma[entry] = desc->des2;
+		if (dma_mapping_error(priv->device, desc->des2))
+			return -1;
+		priv->tx_skbuff_dma[entry].buf = desc->des2;
 		desc->des3 = desc->des2 + BUF_SIZE_4KiB;
 		priv->hw->desc->prepare_tx_desc(desc, 1, nopaged_len, csum,
 						STMMAC_RING_MODE);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index ca01035..58097c0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -34,6 +34,11 @@
 #include <linux/ptp_clock_kernel.h>
 #include <linux/reset.h>
 
+struct stmmac_tx_info {
+	dma_addr_t buf;
+	bool map_as_page;
+};
+
 struct stmmac_priv {
 	/* Frequently used values are kept adjacent for cache effect */
 	struct dma_extended_desc *dma_etx ____cacheline_aligned_in_smp;
@@ -45,7 +50,7 @@
 	u32 tx_count_frames;
 	u32 tx_coal_frames;
 	u32 tx_coal_timer;
-	dma_addr_t *tx_skbuff_dma;
+	struct stmmac_tx_info *tx_skbuff_dma;
 	dma_addr_t dma_tx_phy;
 	int tx_coalesce;
 	int hwts_tx_en;
@@ -105,6 +110,8 @@
 	struct ptp_clock *ptp_clock;
 	struct ptp_clock_info ptp_clock_ops;
 	unsigned int default_addend;
+	struct clk *clk_ptp_ref;
+	unsigned int clk_ptp_rate;
 	u32 adv_ts;
 	int use_riwt;
 	int irq_wake;
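
The new struct stmmac_tx_info pairs each DMA handle with a map_as_page
flag because linear skb data is mapped with dma_map_single() while
fragments are mapped with skb_frag_dma_map(), a page mapping; the
DMA-API requires the matching unmap primitive. A hedged sketch of the
decision the cleanup paths below make (tx, dev and len stand in for the
driver's tx_skbuff_dma entry, device and descriptor length):

	if (tx->map_as_page)
		dma_unmap_page(dev, tx->buf, len, DMA_TO_DEVICE);
	else
		dma_unmap_single(dev, tx->buf, len, DMA_TO_DEVICE);
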
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 9af50ba..cf4f38d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -175,7 +175,7 @@
 	STMMAC_MMC_STAT(mmc_rx_octetcount_g),
 	STMMAC_MMC_STAT(mmc_rx_broadcastframe_g),
 	STMMAC_MMC_STAT(mmc_rx_multicastframe_g),
-	STMMAC_MMC_STAT(mmc_rx_crc_errror),
+	STMMAC_MMC_STAT(mmc_rx_crc_error),
 	STMMAC_MMC_STAT(mmc_rx_align_error),
 	STMMAC_MMC_STAT(mmc_rx_run_error),
 	STMMAC_MMC_STAT(mmc_rx_jabber_error),
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 08addd6..6e6ee22 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -275,6 +275,7 @@
  */
 bool stmmac_eee_init(struct stmmac_priv *priv)
 {
+	char *phy_bus_name = priv->plat->phy_bus_name;
 	bool ret = false;
 
 	/* Using PCS we cannot dial with the phy registers at this stage
@@ -284,6 +285,10 @@
 	    (priv->pcs == STMMAC_PCS_RTBI))
 		goto out;
 
+	/* Never init EEE if a switch is attached */
+	if (phy_bus_name && (!strcmp(phy_bus_name, "fixed")))
+		goto out;
+
 	/* MAC core supports the EEE feature. */
 	if (priv->dma_cap.eee) {
 		int tx_lpi_timer = priv->tx_lpi_timer;
@@ -316,10 +321,9 @@
 			priv->hw->mac->set_eee_timer(priv->hw,
 						     STMMAC_DEFAULT_LIT_LS,
 						     tx_lpi_timer);
-		} else
-			/* Set HW EEE according to the speed */
-			priv->hw->mac->set_eee_pls(priv->hw,
-						   priv->phydev->link);
+		}
+		/* Set HW EEE according to the speed */
+		priv->hw->mac->set_eee_pls(priv->hw, priv->phydev->link);
 
 		pr_debug("stmmac: Energy-Efficient Ethernet initialized\n");
 
@@ -603,16 +607,16 @@
 		/* calculate default addend value:
 		 * the formula is:
 		 * addend = (2^32)/freq_div_ratio;
-		 * where, freq_div_ratio = STMMAC_SYSCLOCK/50MHz
-		 * hence, addend = ((2^32) * 50MHz)/STMMAC_SYSCLOCK;
-		 * NOTE: STMMAC_SYSCLOCK should be >= 50MHz to
+		 * where, freq_div_ratio = clk_ptp_ref_i/50MHz
+		 * hence, addend = ((2^32) * 50MHz)/clk_ptp_ref_i;
+		 * NOTE: clk_ptp_ref_i should be >= 50MHz to
 		 *       achieve 20ns accuracy.
 		 *
 		 * 2^x * y == (y << x), hence
 		 * 2^32 * 50000000 ==> (50000000 << 32)
 		 */
 		temp = (u64) (50000000ULL << 32);
-		priv->default_addend = div_u64(temp, STMMAC_SYSCLOCK);
+		priv->default_addend = div_u64(temp, priv->clk_ptp_rate);
 		priv->hw->ptp->config_addend(priv->ioaddr,
 					     priv->default_addend);
 
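
A worked instance of the formula above, assuming the 62.5 MHz rate that
the removed STMMAC_SYSCLOCK constant used to hard-code (values are
illustrative only):

	u64 temp = 50000000ULL << 32;
	u32 addend = div_u64(temp, 62500000);	/* 0.8 * 2^32 = 0xCCCCCCCC */
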
@@ -638,6 +642,16 @@
 	if (!(priv->dma_cap.time_stamp || priv->dma_cap.atime_stamp))
 		return -EOPNOTSUPP;
 
+	/* Fall back to the main clock if no PTP ref clock is passed */
+	priv->clk_ptp_ref = devm_clk_get(priv->device, "clk_ptp_ref");
+	if (IS_ERR(priv->clk_ptp_ref)) {
+		priv->clk_ptp_rate = clk_get_rate(priv->stmmac_clk);
+		priv->clk_ptp_ref = NULL;
+	} else {
+		clk_prepare_enable(priv->clk_ptp_ref);
+		priv->clk_ptp_rate = clk_get_rate(priv->clk_ptp_ref);
+	}
+
 	priv->adv_ts = 0;
 	if (priv->dma_cap.atime_stamp && priv->extend_desc)
 		priv->adv_ts = 1;
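
The fall-back above is the usual optional-clock idiom: try the dedicated
PTP reference first, otherwise derive the rate from the main clock. A
hedged sketch with illustrative names (example_ptp_rate is not in the
driver):

	static unsigned long example_ptp_rate(struct device *dev,
					      struct clk *sysclk)
	{
		struct clk *ptp = devm_clk_get(dev, "clk_ptp_ref");

		if (IS_ERR(ptp))
			return clk_get_rate(sysclk);	/* optional clock absent */
		clk_prepare_enable(ptp);
		return clk_get_rate(ptp);
	}
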
@@ -657,6 +671,8 @@
 
 static void stmmac_release_ptp(struct stmmac_priv *priv)
 {
+	if (priv->clk_ptp_ref)
+		clk_disable_unprepare(priv->clk_ptp_ref);
 	stmmac_ptp_unregister(priv);
 }
 
@@ -1061,7 +1077,8 @@
 		else
 			p = priv->dma_tx + i;
 		p->des2 = 0;
-		priv->tx_skbuff_dma[i] = 0;
+		priv->tx_skbuff_dma[i].buf = 0;
+		priv->tx_skbuff_dma[i].map_as_page = false;
 		priv->tx_skbuff[i] = NULL;
 	}
 
@@ -1100,17 +1117,24 @@
 		else
 			p = priv->dma_tx + i;
 
-		if (priv->tx_skbuff_dma[i]) {
-			dma_unmap_single(priv->device,
-					 priv->tx_skbuff_dma[i],
-					 priv->hw->desc->get_tx_len(p),
-					 DMA_TO_DEVICE);
-			priv->tx_skbuff_dma[i] = 0;
+		if (priv->tx_skbuff_dma[i].buf) {
+			if (priv->tx_skbuff_dma[i].map_as_page)
+				dma_unmap_page(priv->device,
+					       priv->tx_skbuff_dma[i].buf,
+					       priv->hw->desc->get_tx_len(p),
+					       DMA_TO_DEVICE);
+			else
+				dma_unmap_single(priv->device,
+						 priv->tx_skbuff_dma[i].buf,
+						 priv->hw->desc->get_tx_len(p),
+						 DMA_TO_DEVICE);
 		}
 
 		if (priv->tx_skbuff[i] != NULL) {
 			dev_kfree_skb_any(priv->tx_skbuff[i]);
 			priv->tx_skbuff[i] = NULL;
+			priv->tx_skbuff_dma[i].buf = 0;
+			priv->tx_skbuff_dma[i].map_as_page = false;
 		}
 	}
 }
@@ -1131,7 +1155,8 @@
 	if (!priv->rx_skbuff)
 		goto err_rx_skbuff;
 
-	priv->tx_skbuff_dma = kmalloc_array(txsize, sizeof(dma_addr_t),
+	priv->tx_skbuff_dma = kmalloc_array(txsize,
+					    sizeof(*priv->tx_skbuff_dma),
 					    GFP_KERNEL);
 	if (!priv->tx_skbuff_dma)
 		goto err_tx_skbuff_dma;
@@ -1293,12 +1318,19 @@
 			pr_debug("%s: curr %d, dirty %d\n", __func__,
 				 priv->cur_tx, priv->dirty_tx);
 
-		if (likely(priv->tx_skbuff_dma[entry])) {
-			dma_unmap_single(priv->device,
-					 priv->tx_skbuff_dma[entry],
-					 priv->hw->desc->get_tx_len(p),
-					 DMA_TO_DEVICE);
-			priv->tx_skbuff_dma[entry] = 0;
+		if (likely(priv->tx_skbuff_dma[entry].buf)) {
+			if (priv->tx_skbuff_dma[entry].map_as_page)
+				dma_unmap_page(priv->device,
+					       priv->tx_skbuff_dma[entry].buf,
+					       priv->hw->desc->get_tx_len(p),
+					       DMA_TO_DEVICE);
+			else
+				dma_unmap_single(priv->device,
+						 priv->tx_skbuff_dma[entry].buf,
+						 priv->hw->desc->get_tx_len(p),
+						 DMA_TO_DEVICE);
+			priv->tx_skbuff_dma[entry].buf = 0;
+			priv->tx_skbuff_dma[entry].map_as_page = false;
 		}
 		priv->hw->mode->clean_desc3(priv, p);
 
@@ -1637,6 +1669,13 @@
 	/* Initialize the MAC Core */
 	priv->hw->mac->core_init(priv->hw, dev->mtu);
 
+	ret = priv->hw->mac->rx_ipc(priv->hw);
+	if (!ret) {
+		pr_warn(" RX IPC Checksum Offload disabled\n");
+		priv->plat->rx_coe = STMMAC_RX_COE_NONE;
+		priv->hw->rx_csum = 0;
+	}
+
 	/* Enable the MAC Rx/Tx */
 	stmmac_set_mac(priv->ioaddr, true);
 
@@ -1887,12 +1926,16 @@
 	if (likely(!is_jumbo)) {
 		desc->des2 = dma_map_single(priv->device, skb->data,
 					    nopaged_len, DMA_TO_DEVICE);
-		priv->tx_skbuff_dma[entry] = desc->des2;
+		if (dma_mapping_error(priv->device, desc->des2))
+			goto dma_map_err;
+		priv->tx_skbuff_dma[entry].buf = desc->des2;
 		priv->hw->desc->prepare_tx_desc(desc, 1, nopaged_len,
 						csum_insertion, priv->mode);
 	} else {
 		desc = first;
 		entry = priv->hw->mode->jumbo_frm(priv, skb, csum_insertion);
+		if (unlikely(entry < 0))
+			goto dma_map_err;
 	}
 
 	for (i = 0; i < nfrags; i++) {
@@ -1908,7 +1951,11 @@
 
 		desc->des2 = skb_frag_dma_map(priv->device, frag, 0, len,
 					      DMA_TO_DEVICE);
-		priv->tx_skbuff_dma[entry] = desc->des2;
+		if (dma_mapping_error(priv->device, desc->des2))
+			goto dma_map_err; /* should reuse desc w/o issues */
+
+		priv->tx_skbuff_dma[entry].buf = desc->des2;
+		priv->tx_skbuff_dma[entry].map_as_page = true;
 		priv->hw->desc->prepare_tx_desc(desc, 0, len, csum_insertion,
 						priv->mode);
 		wmb();
@@ -1975,7 +2022,12 @@
 	priv->hw->dma->enable_dma_transmission(priv->ioaddr);
 
 	spin_unlock(&priv->tx_lock);
+	return NETDEV_TX_OK;
 
+dma_map_err:
+	dev_err(priv->device, "Tx dma map failed\n");
+	dev_kfree_skb(skb);
+	priv->dev->stats.tx_dropped++;
 	return NETDEV_TX_OK;
 }
 
@@ -2028,7 +2080,12 @@
 			priv->rx_skbuff_dma[entry] =
 			    dma_map_single(priv->device, skb->data, bfsize,
 					   DMA_FROM_DEVICE);
-
+			if (dma_mapping_error(priv->device,
+					      priv->rx_skbuff_dma[entry])) {
+				dev_err(priv->device, "Rx dma map failed\n");
+				dev_kfree_skb(skb);
+				break;
+			}
 			p->des2 = priv->rx_skbuff_dma[entry];
 
 			priv->hw->mode->refill_desc3(priv, p);
@@ -2055,7 +2112,7 @@
 	unsigned int entry = priv->cur_rx % rxsize;
 	unsigned int next_entry;
 	unsigned int count = 0;
-	int coe = priv->plat->rx_coe;
+	int coe = priv->hw->rx_csum;
 
 	if (netif_msg_rx_status(priv)) {
 		pr_debug("%s: descriptor ring:\n", __func__);
@@ -2276,8 +2333,7 @@
 
 	if (priv->plat->rx_coe == STMMAC_RX_COE_NONE)
 		features &= ~NETIF_F_RXCSUM;
-	else if (priv->plat->rx_coe == STMMAC_RX_COE_TYPE1)
-		features &= ~NETIF_F_IPV6_CSUM;
+
 	if (!priv->plat->tx_coe)
 		features &= ~NETIF_F_ALL_CSUM;
 
@@ -2292,6 +2348,24 @@
 	return features;
 }
 
+static int stmmac_set_features(struct net_device *netdev,
+			       netdev_features_t features)
+{
+	struct stmmac_priv *priv = netdev_priv(netdev);
+
+	/* Keep the COE Type if checksum offload is supported */
+	if (features & NETIF_F_RXCSUM)
+		priv->hw->rx_csum = priv->plat->rx_coe;
+	else
+		priv->hw->rx_csum = 0;
+	/* No check needed: rx_coe has already been set, and rx_ipc() will
+	 * correct it if a problem is found.
+	 */
+	priv->hw->mac->rx_ipc(priv->hw);
+
+	return 0;
+}
+
 /**
  *  stmmac_interrupt - main ISR
  *  @irq: interrupt number.
@@ -2572,6 +2646,7 @@
 	.ndo_stop = stmmac_release,
 	.ndo_change_mtu = stmmac_change_mtu,
 	.ndo_fix_features = stmmac_fix_features,
+	.ndo_set_features = stmmac_set_features,
 	.ndo_set_rx_mode = stmmac_set_rx_mode,
 	.ndo_tx_timeout = stmmac_tx_timeout,
 	.ndo_do_ioctl = stmmac_ioctl,
@@ -2592,7 +2667,6 @@
  */
 static int stmmac_hw_init(struct stmmac_priv *priv)
 {
-	int ret;
 	struct mac_device_info *mac;
 
 	/* Identify the MAC HW device */
@@ -2649,15 +2723,11 @@
 	/* To use alternate (extended) or normal descriptor structures */
 	stmmac_selec_desc_mode(priv);
 
-	ret = priv->hw->mac->rx_ipc(priv->hw);
-	if (!ret) {
-		pr_warn(" RX IPC Checksum Offload not configured.\n");
-		priv->plat->rx_coe = STMMAC_RX_COE_NONE;
-	}
-
-	if (priv->plat->rx_coe)
+	if (priv->plat->rx_coe) {
+		priv->hw->rx_csum = priv->plat->rx_coe;
 		pr_info(" RX Checksum Offload Engine supported (type %d)\n",
 			priv->plat->rx_coe);
+	}
 	if (priv->plat->tx_coe)
 		pr_info(" TX Checksum insertion supported\n");
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index b7ad356..c5ee79d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -206,6 +206,7 @@
 {
 	if (priv->ptp_clock) {
 		ptp_clock_unregister(priv->ptp_clock);
+		priv->ptp_clock = NULL;
 		pr_debug("Removed PTP HW clock successfully on %s\n",
 			 priv->dev->name);
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
index 3dbc047..4535df3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
@@ -25,8 +25,6 @@
 #ifndef __STMMAC_PTP_H__
 #define __STMMAC_PTP_H__
 
-#define STMMAC_SYSCLOCK 62500000
-
 /* IEEE 1588 PTP register offsets */
 #define PTP_TCR		0x0700	/* Timestamp Control Reg */
 #define PTP_SSIR	0x0704	/* Sub-Second Increment Reg */
diff --git a/drivers/net/fddi/skfp/h/skfbi.h b/drivers/net/fddi/skfp/h/skfbi.h
index c1ba26c..3de2f0d 100644
--- a/drivers/net/fddi/skfp/h/skfbi.h
+++ b/drivers/net/fddi/skfp/h/skfbi.h
@@ -147,11 +147,6 @@
 #define	PCI_MEM64BIT	(2<<1)	     /* Base addr anywhere in 64 Bit range */
 #define	PCI_MEMSPACE	0x00000001L  /* Bit 0:	Memory Space Indic. */
 
-/*	PCI_BASE_2ND	32 bit	2nd Base address */
-#define	PCI_IOBASE	0xffffff00L  /* Bit 31..8:  I/O Base address */
-#define	PCI_IOSIZE	0x000000fcL  /* Bit 7..2:   I/O Size Requirements */
-#define	PCI_IOSPACE	0x00000001L  /* Bit 0:	    I/O Space Indicator */
-
 /*	PCI_SUB_VID	16 bit	Subsystem Vendor ID */
 /*	PCI_SUB_ID	16 bit	Subsystem ID */
 
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index c94e2a2..a854d38 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1036,31 +1036,31 @@
 		/* First check if the EEE ability is supported */
 		eee_cap = phy_read_mmd_indirect(phydev, MDIO_PCS_EEE_ABLE,
 						MDIO_MMD_PCS, phydev->addr);
-		if (eee_cap < 0)
-			return eee_cap;
+		if (eee_cap <= 0)
+			goto eee_exit_err;
 
 		cap = mmd_eee_cap_to_ethtool_sup_t(eee_cap);
 		if (!cap)
-			return -EPROTONOSUPPORT;
+			goto eee_exit_err;
 
 		/* Check which link settings negotiated and verify it in
 		 * the EEE advertising registers.
 		 */
 		eee_lp = phy_read_mmd_indirect(phydev, MDIO_AN_EEE_LPABLE,
 					       MDIO_MMD_AN, phydev->addr);
-		if (eee_lp < 0)
-			return eee_lp;
+		if (eee_lp <= 0)
+			goto eee_exit_err;
 
 		eee_adv = phy_read_mmd_indirect(phydev, MDIO_AN_EEE_ADV,
 						MDIO_MMD_AN, phydev->addr);
-		if (eee_adv < 0)
-			return eee_adv;
+		if (eee_adv <= 0)
+			goto eee_exit_err;
 
 		adv = mmd_eee_adv_to_ethtool_adv_t(eee_adv);
 		lp = mmd_eee_adv_to_ethtool_adv_t(eee_lp);
 		idx = phy_find_setting(phydev->speed, phydev->duplex);
 		if (!(lp & adv & settings[idx].setting))
-			return -EPROTONOSUPPORT;
+			goto eee_exit_err;
 
 		if (clk_stop_enable) {
 			/* Configure the PHY to stop receiving xMII
@@ -1080,7 +1080,7 @@
 
 		return 0; /* EEE supported */
 	}
-
+eee_exit_err:
 	return -EPROTONOSUPPORT;
 }
 EXPORT_SYMBOL(phy_init_eee);
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index d6e90c7..6dfcbf5 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -2056,7 +2056,6 @@
 		if (!netdev_mc_empty(netdev)) {
 			new_table = vmxnet3_copy_mc(netdev);
 			if (new_table) {
-				new_mode |= VMXNET3_RXM_MCAST;
 				rxConf->mfTableLen = cpu_to_le16(
 					netdev_mc_count(netdev) * ETH_ALEN);
 				new_table_pa = dma_map_single(
@@ -2064,15 +2063,18 @@
 							new_table,
 							rxConf->mfTableLen,
 							PCI_DMA_TODEVICE);
+			}
+
+			if (new_table_pa) {
+				new_mode |= VMXNET3_RXM_MCAST;
 				rxConf->mfTablePA = cpu_to_le64(new_table_pa);
 			} else {
-				netdev_info(netdev, "failed to copy mcast list"
-					    ", setting ALL_MULTI\n");
+				netdev_info(netdev,
+					    "failed to copy mcast list, setting ALL_MULTI\n");
 				new_mode |= VMXNET3_RXM_ALL_MULTI;
 			}
 		}
 
-
 	if (!(new_mode & VMXNET3_RXM_MCAST)) {
 		rxConf->mfTableLen = 0;
 		rxConf->mfTablePA = 0;
@@ -2091,11 +2093,10 @@
 			       VMXNET3_CMD_UPDATE_MAC_FILTERS);
 	spin_unlock_irqrestore(&adapter->cmd_lock, flags);
 
-	if (new_table) {
+	if (new_table_pa)
 		dma_unmap_single(&adapter->pdev->dev, new_table_pa,
 				 rxConf->mfTableLen, PCI_DMA_TODEVICE);
-		kfree(new_table);
-	}
+	kfree(new_table);
 }
 
 void
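
The rework above keys both the VMXNET3_RXM_MCAST flag and the final
dma_unmap_single() on new_table_pa rather than on new_table, since the
copy can succeed while the mapping fails (the driver treats a zero
handle as failure); and because kfree(NULL) is a no-op, the table can be
freed unconditionally. In sketch form:

	new_table_pa = dma_map_single(&pdev->dev, new_table, len,
				      PCI_DMA_TODEVICE);
	if (new_table_pa)
		new_mode |= VMXNET3_RXM_MCAST;		/* use the table */
	else
		new_mode |= VMXNET3_RXM_ALL_MULTI;	/* fall back */
	...
	if (new_table_pa)
		dma_unmap_single(&pdev->dev, new_table_pa, len,
				 PCI_DMA_TODEVICE);
	kfree(new_table);	/* safe even when new_table is NULL */
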
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index 29ee77f2..3759479 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -69,10 +69,10 @@
 /*
  * Version numbers
  */
-#define VMXNET3_DRIVER_VERSION_STRING   "1.2.0.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING   "1.2.1.0-k"
 
 /* a 32-bit int, each byte encodes a version number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM      0x01020000
+#define VMXNET3_DRIVER_VERSION_NUM      0x01020100
 
 #if defined(CONFIG_PCI_MSI)
 	/* RSS only makes sense if MSI-X is supported. */
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 1fb7b37..beb377b 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1327,7 +1327,7 @@
 	} else if (vxlan->flags & VXLAN_F_L3MISS) {
 		union vxlan_addr ipa = {
 			.sin.sin_addr.s_addr = tip,
-			.sa.sa_family = AF_INET,
+			.sin.sin_family = AF_INET,
 		};
 
 		vxlan_ip_miss(dev, &ipa);
@@ -1488,7 +1488,7 @@
 	} else if (vxlan->flags & VXLAN_F_L3MISS) {
 		union vxlan_addr ipa = {
 			.sin6.sin6_addr = msg->target,
-			.sa.sa_family = AF_INET6,
+			.sin6.sin6_family = AF_INET6,
 		};
 
 		vxlan_ip_miss(dev, &ipa);
@@ -1521,7 +1521,7 @@
 		if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
 			union vxlan_addr ipa = {
 				.sin.sin_addr.s_addr = pip->daddr,
-				.sa.sa_family = AF_INET,
+				.sin.sin_family = AF_INET,
 			};
 
 			vxlan_ip_miss(dev, &ipa);
@@ -1542,7 +1542,7 @@
 		if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
 			union vxlan_addr ipa = {
 				.sin6.sin6_addr = pip6->daddr,
-				.sa.sa_family = AF_INET6,
+				.sin6.sin6_family = AF_INET6,
 			};
 
 			vxlan_ip_miss(dev, &ipa);
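
The four .sa.sa_family -> .sin.sin_family / .sin6.sin6_family changes
fix a C initializer subtlety: when designated initializers name members
of different alternatives of a union, the later designator re-initializes
the whole union, so the address stored through the first one is
discarded; routing both fields through the same member keeps them. A
small illustration (layout and names are hypothetical):

	union addr {
		struct { short family; int ip; } v4;
		struct { short family; } raw;
	};

	union addr bad  = { .v4.ip = 7, .raw.family = 2 };	/* .v4.ip lost */
	union addr good = { .v4.ip = 7, .v4.family = 2 };	/* both kept */
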
diff --git a/drivers/net/wireless/at76c50x-usb.c b/drivers/net/wireless/at76c50x-usb.c
index 334c2ec..da92bfa 100644
--- a/drivers/net/wireless/at76c50x-usb.c
+++ b/drivers/net/wireless/at76c50x-usb.c
@@ -2423,8 +2423,6 @@
 
 	kfree_skb(priv->rx_skb);
 
-	usb_put_dev(priv->udev);
-
 	at76_dbg(DBG_PROC_ENTRY, "%s: before freeing priv/ieee80211_hw",
 		 __func__);
 	ieee80211_free_hw(priv->hw);
@@ -2558,6 +2556,7 @@
 
 	wiphy_info(priv->hw->wiphy, "disconnecting\n");
 	at76_delete_device(priv);
+	usb_put_dev(priv->udev);
 	dev_info(&interface->dev, "disconnected\n");
 }
 
diff --git a/drivers/net/wireless/ath/ath9k/spectral.c b/drivers/net/wireless/ath/ath9k/spectral.c
index 5fe29b9..8f68426 100644
--- a/drivers/net/wireless/ath/ath9k/spectral.c
+++ b/drivers/net/wireless/ath/ath9k/spectral.c
@@ -253,7 +253,7 @@
 
 	if (strncmp("trigger", buf, 7) == 0) {
 		ath9k_spectral_scan_trigger(sc->hw);
-	} else if (strncmp("background", buf, 9) == 0) {
+	} else if (strncmp("background", buf, 10) == 0) {
 		ath9k_spectral_scan_config(sc->hw, SPECTRAL_BACKGROUND);
 		ath_dbg(common, CONFIG, "spectral scan: background mode enabled\n");
 	} else if (strncmp("chanscan", buf, 8) == 0) {
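
The length change matters because strncmp() compares at most n bytes:
with n = 9 any input sharing the "backgroun" prefix selected background
mode. For example:

	strncmp("background", "backgrounX", 9);		/* == 0, false match */
	strncmp("background", "backgrounX", 10);	/* != 0 */
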
diff --git a/drivers/net/wireless/iwlwifi/Kconfig b/drivers/net/wireless/iwlwifi/Kconfig
index 6451d2b..824f5e2 100644
--- a/drivers/net/wireless/iwlwifi/Kconfig
+++ b/drivers/net/wireless/iwlwifi/Kconfig
@@ -51,7 +51,6 @@
 
 config IWLDVM
 	tristate "Intel Wireless WiFi DVM Firmware support"
-	depends on m
 	default IWLWIFI
 	help
 	  This is the driver that supports the DVM firmware which is
@@ -60,7 +59,6 @@
 
 config IWLMVM
 	tristate "Intel Wireless WiFi MVM Firmware support"
-	depends on m
 	help
 	  This is the driver that supports the MVM firmware which is
 	  currently only available for 7260 and 3160 devices.
diff --git a/drivers/net/wireless/iwlwifi/dvm/rxon.c b/drivers/net/wireless/iwlwifi/dvm/rxon.c
index 6dc5dd3..ed50de6 100644
--- a/drivers/net/wireless/iwlwifi/dvm/rxon.c
+++ b/drivers/net/wireless/iwlwifi/dvm/rxon.c
@@ -1068,6 +1068,13 @@
 	/* recalculate basic rates */
 	iwl_calc_basic_rates(priv, ctx);
 
+	/*
+	 * force CTS-to-self frame protection if RTS-CTS is not the
+	 * preferred aggregation protection method
+	 */
+	if (!priv->hw_params.use_rts_for_aggregation)
+		ctx->staging.flags |= RXON_FLG_SELF_CTS_EN;
+
 	if ((ctx->vif && ctx->vif->bss_conf.use_short_slot) ||
 	    !(ctx->staging.flags & RXON_FLG_BAND_24G_MSK))
 		ctx->staging.flags |= RXON_FLG_SHORT_SLOT_MSK;
@@ -1473,6 +1480,11 @@
 	else
 		ctx->staging.flags &= ~RXON_FLG_TGG_PROTECT_MSK;
 
+	if (bss_conf->use_cts_prot)
+		ctx->staging.flags |= RXON_FLG_SELF_CTS_EN;
+	else
+		ctx->staging.flags &= ~RXON_FLG_SELF_CTS_EN;
+
 	memcpy(ctx->staging.bssid_addr, bss_conf->bssid, ETH_ALEN);
 
 	if (vif->type == NL80211_IFTYPE_AP ||
diff --git a/drivers/net/wireless/iwlwifi/iwl-7000.c b/drivers/net/wireless/iwlwifi/iwl-7000.c
index 4873006..d67a37a 100644
--- a/drivers/net/wireless/iwlwifi/iwl-7000.c
+++ b/drivers/net/wireless/iwlwifi/iwl-7000.c
@@ -67,8 +67,8 @@
 #include "iwl-agn-hw.h"
 
 /* Highest firmware API version supported */
-#define IWL7260_UCODE_API_MAX	9
-#define IWL3160_UCODE_API_MAX	9
+#define IWL7260_UCODE_API_MAX	10
+#define IWL3160_UCODE_API_MAX	10
 
 /* Oldest version we won't warn about */
 #define IWL7260_UCODE_API_OK	9
diff --git a/drivers/net/wireless/iwlwifi/iwl-8000.c b/drivers/net/wireless/iwlwifi/iwl-8000.c
index 44b19e0..e93c697 100644
--- a/drivers/net/wireless/iwlwifi/iwl-8000.c
+++ b/drivers/net/wireless/iwlwifi/iwl-8000.c
@@ -67,7 +67,7 @@
 #include "iwl-agn-hw.h"
 
 /* Highest firmware API version supported */
-#define IWL8000_UCODE_API_MAX	9
+#define IWL8000_UCODE_API_MAX	10
 
 /* Oldest version we won't warn about */
 #define IWL8000_UCODE_API_OK	8
diff --git a/drivers/net/wireless/rtlwifi/btcoexist/halbtcoutsrc.c b/drivers/net/wireless/rtlwifi/btcoexist/halbtcoutsrc.c
index 33da3df..d4bd550 100644
--- a/drivers/net/wireless/rtlwifi/btcoexist/halbtcoutsrc.c
+++ b/drivers/net/wireless/rtlwifi/btcoexist/halbtcoutsrc.c
@@ -101,7 +101,7 @@
 
 	bool is_legacy = false;
 
-	if ((mac->mode == WIRELESS_MODE_B) || (mac->mode == WIRELESS_MODE_B))
+	if ((mac->mode == WIRELESS_MODE_B) || (mac->mode == WIRELESS_MODE_G))
 		is_legacy = true;
 
 	return is_legacy;
diff --git a/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c b/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c
index 361435f..1ac6383 100644
--- a/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c
+++ b/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c
@@ -317,6 +317,7 @@
 	{RTL_USB_DEVICE(0x0bda, 0x5088, rtl92cu_hal_cfg)}, /*Thinkware-CC&C*/
 	{RTL_USB_DEVICE(0x0df6, 0x0052, rtl92cu_hal_cfg)}, /*Sitecom - Edimax*/
 	{RTL_USB_DEVICE(0x0df6, 0x005c, rtl92cu_hal_cfg)}, /*Sitecom - Edimax*/
+	{RTL_USB_DEVICE(0x0df6, 0x0070, rtl92cu_hal_cfg)}, /*Sitecom - 150N */
 	{RTL_USB_DEVICE(0x0df6, 0x0077, rtl92cu_hal_cfg)}, /*Sitecom-WLA2100V2*/
 	{RTL_USB_DEVICE(0x0eb0, 0x9071, rtl92cu_hal_cfg)}, /*NO Brand - Etop*/
 	{RTL_USB_DEVICE(0x4856, 0x0091, rtl92cu_hal_cfg)}, /*NetweeN - Feixun*/
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index e29e15d..f379689 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -576,6 +576,9 @@
 	init_waitqueue_head(&queue->dealloc_wq);
 	atomic_set(&queue->inflight_packets, 0);
 
+	netif_napi_add(queue->vif->dev, &queue->napi, xenvif_poll,
+			XENVIF_NAPI_WEIGHT);
+
 	if (tx_evtchn == rx_evtchn) {
 		/* feature-split-event-channels == 0 */
 		err = bind_interdomain_evtchn_to_irqhandler(
@@ -629,9 +632,6 @@
 	wake_up_process(queue->task);
 	wake_up_process(queue->dealloc_task);
 
-	netif_napi_add(queue->vif->dev, &queue->napi, xenvif_poll,
-			XENVIF_NAPI_WEIGHT);
-
 	return 0;
 
 err_rx_unbind:
diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
index 8922c37..90f5cca 100644
--- a/drivers/pci/host/Kconfig
+++ b/drivers/pci/host/Kconfig
@@ -56,7 +56,7 @@
 	  controller, such as the one emulated by kvmtool.
 
 config PCIE_SPEAR13XX
-	tristate "STMicroelectronics SPEAr PCIe controller"
+	bool "STMicroelectronics SPEAr PCIe controller"
 	depends on ARCH_SPEAR13XX
 	select PCIEPORTBUS
 	select PCIE_DW
diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c
index fc468a3..02152de 100644
--- a/drivers/platform/x86/ideapad-laptop.c
+++ b/drivers/platform/x86/ideapad-laptop.c
@@ -88,7 +88,6 @@
 	struct dentry *debug;
 	unsigned long cfg;
 	bool has_hw_rfkill_switch;
-	bool has_touchpad_control;
 };
 
 static bool no_bt_rfkill;
@@ -456,7 +455,7 @@
 	int type;
 };
 
-const const struct ideapad_rfk_data ideapad_rfk_data[] = {
+static const struct ideapad_rfk_data ideapad_rfk_data[] = {
 	{ "ideapad_wlan",    CFG_WIFI_BIT, VPCCMD_W_WIFI, RFKILL_TYPE_WLAN },
 	{ "ideapad_bluetooth", CFG_BT_BIT, VPCCMD_W_BT, RFKILL_TYPE_BLUETOOTH },
 	{ "ideapad_3g",        CFG_3G_BIT, VPCCMD_W_3G, RFKILL_TYPE_WWAN },
@@ -767,9 +766,6 @@
 {
 	unsigned long value;
 
-	if (!priv->has_touchpad_control)
-		return;
-
 	/* Without reading from EC touchpad LED doesn't switch state */
 	if (!read_ec_data(priv->adev->handle, VPCCMD_R_TOUCHPAD, &value)) {
 		/* Some IdeaPads don't really turn off touchpad - they only
@@ -833,29 +829,7 @@
  * always results in 0 on these models, causing ideapad_laptop to wrongly
  * report all radios as hardware-blocked.
  */
-static struct dmi_system_id no_hw_rfkill_list[] = {
-	{
-		.ident = "Lenovo Yoga 2 11 / 13 / Pro",
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-			DMI_MATCH(DMI_PRODUCT_VERSION, "Lenovo Yoga 2"),
-		},
-	},
-	{}
-};
-
-/*
- * Some models don't offer touchpad ctrl through the ideapad interface, causing
- * ideapad_sync_touchpad_state to send wrong touchpad enable/disable events.
- */
-static struct dmi_system_id no_touchpad_ctrl_list[] = {
-	{
-		.ident = "Lenovo Yoga 1 series",
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-			DMI_MATCH(DMI_PRODUCT_VERSION, "Lenovo IdeaPad Yoga"),
-		},
-	},
+static const struct dmi_system_id no_hw_rfkill_list[] = {
 	{
 		.ident = "Lenovo Yoga 2 11 / 13 / Pro",
 		.matches = {
@@ -889,7 +863,6 @@
 	priv->adev = adev;
 	priv->platform_device = pdev;
 	priv->has_hw_rfkill_switch = !dmi_check_system(no_hw_rfkill_list);
-	priv->has_touchpad_control = !dmi_check_system(no_touchpad_ctrl_list);
 
 	ret = ideapad_sysfs_init(priv);
 	if (ret)
diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c
index b062d3d..d0dce73 100644
--- a/drivers/platform/x86/toshiba_acpi.c
+++ b/drivers/platform/x86/toshiba_acpi.c
@@ -1255,10 +1255,15 @@
 					 const char *buf, size_t count)
 {
 	struct toshiba_acpi_dev *toshiba = dev_get_drvdata(dev);
-	int mode = -1;
-	int time = -1;
+	int mode;
+	int time;
+	int ret;
 
-	if (sscanf(buf, "%i", &mode) != 1 && (mode != 2 || mode != 1))
+
+	ret = kstrtoint(buf, 0, &mode);
+	if (ret)
+		return ret;
+	if (mode != SCI_KBD_MODE_FNZ && mode != SCI_KBD_MODE_AUTO)
 		return -EINVAL;
 
 	/* Set the Keyboard Backlight Mode where:
@@ -1266,11 +1271,12 @@
 	 *	Auto - KBD backlight turns off automatically in given time
 	 *	FN-Z - KBD backlight "toggles" when hotkey pressed
 	 */
-	if (mode != -1 && toshiba->kbd_mode != mode) {
+	if (toshiba->kbd_mode != mode) {
 		time = toshiba->kbd_time << HCI_MISC_SHIFT;
 		time = time + toshiba->kbd_mode;
-		if (toshiba_kbd_illum_status_set(toshiba, time) < 0)
-			return -EIO;
+		ret = toshiba_kbd_illum_status_set(toshiba, time);
+		if (ret)
+			return ret;
 		toshiba->kbd_mode = mode;
 	}
 
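
The replaced guard `sscanf(buf, "%i", &mode) != 1 && (mode != 2 || mode != 1)`
could never reject a parsed value, because `mode != 2 || mode != 1` is
true for every integer. kstrtoint() plus an explicit whitelist fixes
both parsing and validation; a minimal sketch, with literals standing in
for the SCI_KBD_MODE_* constants:

	int mode, ret;

	ret = kstrtoint(buf, 0, &mode);	/* 0 on success, -errno otherwise */
	if (ret)
		return ret;
	if (mode != 1 && mode != 2)	/* explicit whitelist */
		return -EINVAL;
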
@@ -1857,9 +1863,16 @@
 {
 	struct toshiba_acpi_dev *dev = acpi_driver_data(to_acpi_device(device));
 	u32 result;
+	acpi_status status;
 
-	if (dev->hotkey_dev)
+	if (dev->hotkey_dev) {
+		status = acpi_evaluate_object(dev->acpi_dev->handle, "ENAB",
+				NULL, NULL);
+		if (ACPI_FAILURE(status))
+			pr_info("Unable to re-enable hotkeys\n");
+
 		hci_write1(dev, HCI_HOTKEY_EVENT, HCI_HOTKEY_ENABLE, &result);
+	}
 
 	return 0;
 }
diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index b1cda6f..45e05b3 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -953,6 +953,7 @@
 	{ X86_VENDOR_INTEL, 6, 0x3a},/* Ivy Bridge */
 	{ X86_VENDOR_INTEL, 6, 0x3c},/* Haswell */
 	{ X86_VENDOR_INTEL, 6, 0x3d},/* Broadwell */
+	{ X86_VENDOR_INTEL, 6, 0x3f},/* Haswell */
 	{ X86_VENDOR_INTEL, 6, 0x45},/* Haswell ULT */
 	/* TODO: Add more CPU IDs after testing */
 	{}
@@ -1166,11 +1167,10 @@
 
 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
 		/* use physical package id to read counters */
-		if (!rapl_check_domain(cpu, i))
+		if (!rapl_check_domain(cpu, i)) {
 			rp->domain_map |= 1 << i;
-		else
-			pr_warn("RAPL domain %s detection failed\n",
-				rapl_domain_names[i]);
+			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
+		}
 	}
 	rp->nr_domains = bitmap_weight(&rp->domain_map,	RAPL_DOMAIN_MAX);
 	if (!rp->nr_domains) {
diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c
index 2ead7e7..14ba80b 100644
--- a/drivers/s390/block/dasd_devmap.c
+++ b/drivers/s390/block/dasd_devmap.c
@@ -77,7 +77,7 @@
  * strings when running as a module.
  */
 static char *dasd[256];
-module_param_array(dasd, charp, NULL, 0);
+module_param_array(dasd, charp, NULL, S_IRUGO);
 
 /*
  * Single spinlock to protect devmap and servermap structures and lists.
diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 97ef37b..e7646ce 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -889,6 +889,7 @@
 extern const struct attribute_group *qeth_osn_attr_groups[];
 extern struct workqueue_struct *qeth_wq;
 
+int qeth_card_hw_is_reachable(struct qeth_card *);
 const char *qeth_get_cardname_short(struct qeth_card *);
 int qeth_realloc_buffer_pool(struct qeth_card *, int);
 int qeth_core_load_discipline(struct qeth_card *, enum qeth_discipline_id);
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index c0d6ba8..fd22c81 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -73,6 +73,13 @@
 struct workqueue_struct *qeth_wq;
 EXPORT_SYMBOL_GPL(qeth_wq);
 
+int qeth_card_hw_is_reachable(struct qeth_card *card)
+{
+	return (card->state == CARD_STATE_SOFTSETUP) ||
+		(card->state == CARD_STATE_UP);
+}
+EXPORT_SYMBOL_GPL(qeth_card_hw_is_reachable);
+
 static void qeth_close_dev_handler(struct work_struct *work)
 {
 	struct qeth_card *card;
@@ -5790,6 +5797,7 @@
 	struct qeth_card *card = netdev->ml_priv;
 	enum qeth_link_types link_type;
 	struct carrier_info carrier_info;
+	int rc;
 	u32 speed;
 
 	if ((card->info.type == QETH_CARD_TYPE_IQD) || (card->info.guestlan))
@@ -5832,8 +5840,14 @@
 	/* Check if we can obtain more accurate information.	 */
 	/* If QUERY_CARD_INFO command is not supported or fails, */
 	/* just return the heuristics that were filled above.	 */
-	if (qeth_query_card_info(card, &carrier_info) != 0)
+	if (!qeth_card_hw_is_reachable(card))
+		return -ENODEV;
+	rc = qeth_query_card_info(card, &carrier_info);
+	if (rc == -EOPNOTSUPP) /* for old hardware, return heuristic */
 		return 0;
+	if (rc) /* report error from the hardware operation */
+		return rc;
+	/* on success, fill in the information obtained from the hardware */
 
 	netdev_dbg(netdev,
 	"card info: card_type=0x%02x, port_mode=0x%04x, port_speed=0x%08x\n",
diff --git a/drivers/s390/net/qeth_l2_sys.c b/drivers/s390/net/qeth_l2_sys.c
index ae1bc04..59e3aa5 100644
--- a/drivers/s390/net/qeth_l2_sys.c
+++ b/drivers/s390/net/qeth_l2_sys.c
@@ -5,17 +5,12 @@
 
 #include <linux/slab.h>
 #include <asm/ebcdic.h>
+#include "qeth_core.h"
 #include "qeth_l2.h"
 
 #define QETH_DEVICE_ATTR(_id, _name, _mode, _show, _store) \
 struct device_attribute dev_attr_##_id = __ATTR(_name, _mode, _show, _store)
 
-static int qeth_card_hw_is_reachable(struct qeth_card *card)
-{
-	return (card->state == CARD_STATE_SOFTSETUP) ||
-		(card->state == CARD_STATE_UP);
-}
-
 static ssize_t qeth_bridge_port_role_state_show(struct device *dev,
 				struct device_attribute *attr, char *buf,
 				int show_state)
diff --git a/drivers/ssb/b43_pci_bridge.c b/drivers/ssb/b43_pci_bridge.c
index 19396dc..bed2fede 100644
--- a/drivers/ssb/b43_pci_bridge.c
+++ b/drivers/ssb/b43_pci_bridge.c
@@ -38,6 +38,7 @@
 	{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x432b) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x432c) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4350) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4351) },
 	{ 0, },
 };
 MODULE_DEVICE_TABLE(pci, b43_pci_bridge_tbl);
diff --git a/fs/aio.c b/fs/aio.c
index 97bc62c..7337500 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -793,6 +793,8 @@
 
 	for (i = 0; i < table->nr; ++i) {
 		struct kioctx *ctx = table->table[i];
+		struct completion requests_done =
+			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 
 		if (!ctx)
 			continue;
@@ -804,7 +806,10 @@
 		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
-		kill_ioctx(mm, ctx, NULL);
+		kill_ioctx(mm, ctx, &requests_done);
+
+		/* Wait until all IO for the context is done. */
+		wait_for_completion(&requests_done);
 	}
 
 	RCU_INIT_POINTER(mm->ioctx_table, NULL);
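
exit_aio() now hands each kill_ioctx() an on-stack completion and blocks
until that context's requests have drained, the standard completion
idiom (start_teardown is an illustrative callee that eventually calls
complete(&done)):

	#include <linux/completion.h>

	struct completion done = COMPLETION_INITIALIZER_ONSTACK(done);

	start_teardown(&done);
	wait_for_completion(&done);

DECLARE_COMPLETION_ONSTACK(done) is the equivalent one-line form.
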
@@ -1111,6 +1116,12 @@
 	tail = ring->tail;
 	kunmap_atomic(ring);
 
+	/*
+	 * Ensure that once we've read the current tail pointer, we also
+	 * see the events that were stored up to the tail.
+	 */
+	smp_rmb();
+
 	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
 
 	if (head == tail)
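
The new smp_rmb() pairs with the corresponding write-side ordering on
the producer path, which stores the events before advancing ring->tail;
without the read barrier, a weakly ordered CPU could observe the new
tail while still reading stale event slots. Conceptually:

	/* producer */				/* consumer (this hunk) */
	store events[...];			tail = ring->tail;
	smp_wmb();				smp_rmb();
	ring->tail = new_tail;			load events up to tail;
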
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 90a3cdc..603e4eb 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3240,6 +3240,7 @@
 				 &new.de, &new.inlined);
 	if (IS_ERR(new.bh)) {
 		retval = PTR_ERR(new.bh);
+		new.bh = NULL;
 		goto end_rename;
 	}
 	if (new.bh) {
@@ -3386,6 +3387,7 @@
 				 &new.de, &new.inlined);
 	if (IS_ERR(new.bh)) {
 		retval = PTR_ERR(new.bh);
+		new.bh = NULL;
 		goto end_rename;
 	}
 
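
Both hunks reset bh to NULL after harvesting PTR_ERR(): the variable
otherwise still holds the ERR_PTR-encoded value, and the shared
end_rename cleanup path would hand that fake pointer to brelse(). The
general shape (lookup_helper is an illustrative name):

	bh = lookup_helper(...);
	if (IS_ERR(bh)) {
		retval = PTR_ERR(bh);
		bh = NULL;		/* cleanup does brelse(bh) */
		goto cleanup;
	}
	...
	cleanup:
		brelse(bh);		/* brelse(NULL) is a no-op */
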
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bb0e80f..1e43b90 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -575,6 +575,7 @@
 		bh = bclean(handle, sb, block);
 		if (IS_ERR(bh)) {
 			err = PTR_ERR(bh);
+			bh = NULL;
 			goto out;
 		}
 		overhead = ext4_group_overhead_blocks(sb, group);
@@ -603,6 +604,7 @@
 		bh = bclean(handle, sb, block);
 		if (IS_ERR(bh)) {
 			err = PTR_ERR(bh);
+			bh = NULL;
 			goto out;
 		}
 
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 214fe10..736a348 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -23,7 +23,7 @@
 	  mounted as f2fs. Each file shows the whole f2fs information.
 
 	  /sys/kernel/debug/f2fs/status includes:
-	    - major file system information managed by f2fs currently
+	    - major filesystem information managed by f2fs currently
 	    - average SIT information about whole segments
 	    - current memory footprint consumed by f2fs.
 
@@ -68,6 +68,6 @@
 	bool "F2FS consistency checking feature"
 	depends on F2FS_FS
 	help
-	  Enables BUG_ONs which check the file system consistency in runtime.
+	  Enables BUG_ONs which check the filesystem consistency at runtime.
 
 	  If you want to improve the performance, say N.
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 6aeed5b..ec3b7a5 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -160,14 +160,11 @@
 		goto redirty_out;
 	if (wbc->for_reclaim)
 		goto redirty_out;
-
-	/* Should not write any meta pages, if any IO error was occurred */
-	if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
-		goto no_write;
+	if (unlikely(f2fs_cp_error(sbi)))
+		goto redirty_out;
 
 	f2fs_wait_on_page_writeback(page, META);
 	write_meta_page(sbi, page);
-no_write:
 	dec_page_count(sbi, F2FS_DIRTY_META);
 	unlock_page(page);
 	return 0;
@@ -348,7 +345,7 @@
 	return e ? true : false;
 }
 
-static void release_dirty_inode(struct f2fs_sb_info *sbi)
+void release_dirty_inode(struct f2fs_sb_info *sbi)
 {
 	struct ino_entry *e, *tmp;
 	int i;
@@ -446,8 +443,8 @@
 	struct f2fs_orphan_block *orphan_blk = NULL;
 	unsigned int nentries = 0;
 	unsigned short index;
-	unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans +
-		(F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
+	unsigned short orphan_blocks =
+			(unsigned short)GET_ORPHAN_BLOCKS(sbi->n_orphans);
 	struct page *page = NULL;
 	struct ino_entry *orphan = NULL;
 
@@ -737,7 +734,7 @@
 /*
  * Freeze all the FS-operations for checkpoint.
  */
-static void block_operations(struct f2fs_sb_info *sbi)
+static int block_operations(struct f2fs_sb_info *sbi)
 {
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_ALL,
@@ -745,6 +742,7 @@
 		.for_reclaim = 0,
 	};
 	struct blk_plug plug;
+	int err = 0;
 
 	blk_start_plug(&plug);
 
@@ -754,11 +752,15 @@
 	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
 		f2fs_unlock_all(sbi);
 		sync_dirty_dir_inodes(sbi);
+		if (unlikely(f2fs_cp_error(sbi))) {
+			err = -EIO;
+			goto out;
+		}
 		goto retry_flush_dents;
 	}
 
 	/*
-	 * POR: we should ensure that there is no dirty node pages
+	 * POR: we should ensure that there are no dirty node pages
 	 * until finishing nat/sit flush.
 	 */
 retry_flush_nodes:
@@ -767,9 +769,16 @@
 	if (get_pages(sbi, F2FS_DIRTY_NODES)) {
 		up_write(&sbi->node_write);
 		sync_node_pages(sbi, 0, &wbc);
+		if (unlikely(f2fs_cp_error(sbi))) {
+			f2fs_unlock_all(sbi);
+			err = -EIO;
+			goto out;
+		}
 		goto retry_flush_nodes;
 	}
+out:
 	blk_finish_plug(&plug);
+	return err;
 }
 
 static void unblock_operations(struct f2fs_sb_info *sbi)
@@ -813,8 +822,11 @@
 	discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));
 
 	/* Flush all the NAT/SIT pages */
-	while (get_pages(sbi, F2FS_DIRTY_META))
+	while (get_pages(sbi, F2FS_DIRTY_META)) {
 		sync_meta_pages(sbi, META, LONG_MAX);
+		if (unlikely(f2fs_cp_error(sbi)))
+			return;
+	}
 
 	next_free_nid(sbi, &last_nid);
 
@@ -825,7 +837,7 @@
 	ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
 	ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
 	ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
-	for (i = 0; i < 3; i++) {
+	for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
 		ckpt->cur_node_segno[i] =
 			cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
 		ckpt->cur_node_blkoff[i] =
@@ -833,7 +845,7 @@
 		ckpt->alloc_type[i + CURSEG_HOT_NODE] =
 				curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
 	}
-	for (i = 0; i < 3; i++) {
+	for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
 		ckpt->cur_data_segno[i] =
 			cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
 		ckpt->cur_data_blkoff[i] =
@@ -848,24 +860,23 @@
 
 	/* 2 cp  + n data seg summary + orphan inode blocks */
 	data_sum_blocks = npages_for_summary_flush(sbi);
-	if (data_sum_blocks < 3)
+	if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
 		set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
 	else
 		clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
 
-	orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1)
-					/ F2FS_ORPHANS_PER_BLOCK;
+	orphan_blocks = GET_ORPHAN_BLOCKS(sbi->n_orphans);
 	ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
 			orphan_blocks);
 
 	if (is_umount) {
 		set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
-		ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
 				cp_payload_blks + data_sum_blocks +
 				orphan_blocks + NR_CURSEG_NODE_TYPE);
 	} else {
 		clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
-		ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
 				cp_payload_blks + data_sum_blocks +
 				orphan_blocks);
 	}
@@ -924,6 +935,9 @@
 	/* wait for previous submitted node/meta pages writeback */
 	wait_on_all_pages_writeback(sbi);
 
+	if (unlikely(f2fs_cp_error(sbi)))
+		return;
+
 	filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
 	filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
 
@@ -934,15 +948,17 @@
 	/* Here, we only have one bio having CP pack */
 	sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
 
-	if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
-		clear_prefree_segments(sbi);
-		release_dirty_inode(sbi);
-		F2FS_RESET_SB_DIRT(sbi);
-	}
+	release_dirty_inode(sbi);
+
+	if (unlikely(f2fs_cp_error(sbi)))
+		return;
+
+	clear_prefree_segments(sbi);
+	F2FS_RESET_SB_DIRT(sbi);
 }
 
 /*
- * We guarantee that this checkpoint procedure should not fail.
+ * We guarantee that this checkpoint procedure will not fail.
  */
 void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 {
@@ -952,7 +968,13 @@
 	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops");
 
 	mutex_lock(&sbi->cp_mutex);
-	block_operations(sbi);
+
+	if (!sbi->s_dirty)
+		goto out;
+	if (unlikely(f2fs_cp_error(sbi)))
+		goto out;
+	if (block_operations(sbi))
+		goto out;
 
 	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
 
@@ -976,9 +998,9 @@
 	do_checkpoint(sbi, is_umount);
 
 	unblock_operations(sbi);
-	mutex_unlock(&sbi->cp_mutex);
-
 	stat_inc_cp_count(sbi->stat_info);
+out:
+	mutex_unlock(&sbi->cp_mutex);
 	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
 }
 
@@ -999,8 +1021,8 @@
 	 * for cp pack we can have max 1020*504 orphan entries
 	 */
 	sbi->n_orphans = 0;
-	sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE)
-				* F2FS_ORPHANS_PER_BLOCK;
+	sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
+			NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK;
 }
 
 int __init create_checkpoint_caches(void)
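
GET_ORPHAN_BLOCKS() replaces the open-coded ceiling division
(n + F2FS_ORPHANS_PER_BLOCK - 1) / F2FS_ORPHANS_PER_BLOCK seen in the
removed lines. For positive integers ceil(n / d) == (n + d - 1) / d;
with the 504 orphan entries per block mentioned above, 505 orphans need
(505 + 503) / 504 = 2 blocks.
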
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 03313099..76de83e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -53,7 +53,7 @@
 		struct page *page = bvec->bv_page;
 
 		if (unlikely(err)) {
-			SetPageError(page);
+			set_page_dirty(page);
 			set_bit(AS_EIO, &page->mapping->flags);
 			f2fs_stop_checkpoint(sbi);
 		}
@@ -691,7 +691,7 @@
 			allocated = true;
 			blkaddr = dn.data_blkaddr;
 		}
-		/* Give more consecutive addresses for the read ahead */
+		/* Give more consecutive addresses for the readahead */
 		if (blkaddr == (bh_result->b_blocknr + ofs)) {
 			ofs++;
 			dn.ofs_in_node++;
@@ -739,7 +739,7 @@
 
 	trace_f2fs_readpage(page, DATA);
 
-	/* If the file has inline data, try to read it directlly */
+	/* If the file has inline data, try to read it directly */
 	if (f2fs_has_inline_data(inode))
 		ret = f2fs_read_inline_data(inode, page);
 	else
@@ -836,10 +836,19 @@
 
 	/* Dentry blocks are controlled by checkpoint */
 	if (S_ISDIR(inode->i_mode)) {
+		if (unlikely(f2fs_cp_error(sbi)))
+			goto redirty_out;
 		err = do_write_data_page(page, &fio);
 		goto done;
 	}
 
+	/* we should bypass data pages to let the kworker jobs proceed */
+	if (unlikely(f2fs_cp_error(sbi))) {
+		SetPageError(page);
+		unlock_page(page);
+		return 0;
+	}
+
 	if (!wbc->for_reclaim)
 		need_balance_fs = true;
 	else if (has_not_enough_free_secs(sbi, 0))
@@ -927,7 +936,7 @@
 
 	if (to > inode->i_size) {
 		truncate_pagecache(inode, inode->i_size);
-		truncate_blocks(inode, inode->i_size);
+		truncate_blocks(inode, inode->i_size, true);
 	}
 }
 
@@ -946,7 +955,7 @@
 
 	f2fs_balance_fs(sbi);
 repeat:
-	err = f2fs_convert_inline_data(inode, pos + len);
+	err = f2fs_convert_inline_data(inode, pos + len, NULL);
 	if (err)
 		goto fail;
 
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a441ba3..fecebdb 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -32,7 +32,7 @@
 	struct f2fs_stat_info *si = F2FS_STAT(sbi);
 	int i;
 
-	/* valid check of the segment numbers */
+	/* validation check of the segment numbers */
 	si->hit_ext = sbi->read_hit_ext;
 	si->total_ext = sbi->total_hit_ext;
 	si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
@@ -152,7 +152,7 @@
 	si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
 	si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
 
-	/* buld nm */
+	/* build nm */
 	si->base_mem += sizeof(struct f2fs_nm_info);
 	si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
 
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index bcf893c..155fb05 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -124,7 +124,7 @@
 
 		/*
 		 * For the most part, it should be a bug when name_len is zero.
-		 * We stop here for figuring out where the bugs are occurred.
+		 * We stop here to figure out where the bug has occurred.
 		 */
 		f2fs_bug_on(!de->name_len);
 
@@ -391,7 +391,7 @@
 error:
 	/* once the failed inode becomes a bad inode, i_mode is S_IFREG */
 	truncate_inode_pages(&inode->i_data, 0);
-	truncate_blocks(inode, 0);
+	truncate_blocks(inode, 0, false);
 	remove_dirty_dir_inode(inode);
 	remove_inode_page(inode);
 	return ERR_PTR(err);
@@ -563,7 +563,7 @@
 }
 
 /*
- * It only removes the dentry from the dentry page,corresponding name
+ * It only removes the dentry from the dentry page; the corresponding name
  * entry in name page does not need to be touched during deletion.
  */
 void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4dab533..e921242 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -24,7 +24,7 @@
 #define f2fs_bug_on(condition)	BUG_ON(condition)
 #define f2fs_down_write(x, y)	down_write_nest_lock(x, y)
 #else
-#define f2fs_bug_on(condition)
+#define f2fs_bug_on(condition)	WARN_ON(condition)
 #define f2fs_down_write(x, y)	down_write(x)
 #endif
 
@@ -395,7 +395,7 @@
 };
 
 /*
- * The below are the page types of bios used in submti_bio().
+ * The below are the page types of bios used in submit_bio().
  * The available types are:
  * DATA			User data pages. It operates as async mode.
  * NODE			Node pages. It operates as async mode.
@@ -470,7 +470,7 @@
 	struct list_head dir_inode_list;	/* dir inode list */
 	spinlock_t dir_inode_lock;		/* for dir inode list lock */
 
-	/* basic file system units */
+	/* basic filesystem units */
 	unsigned int log_sectors_per_block;	/* log2 sectors per block */
 	unsigned int log_blocksize;		/* log2 block size */
 	unsigned int blocksize;			/* block size */
@@ -799,7 +799,7 @@
 
 	/*
 	 * odd numbered checkpoint should at cp segment 0
-	 * and even segent must be at cp segment 1
+	 * and even segment must be at cp segment 1
 	 */
 	if (!(ckpt_version & 1))
 		start_addr += sbi->blocks_per_seg;
@@ -1096,6 +1096,11 @@
 	return sb->s_flags & MS_RDONLY;
 }
 
+static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
+{
+	return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+}
+
 static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
 {
 	set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
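
f2fs_cp_error() wraps the CP_ERROR_FLAG test so that the writeback, GC
and checkpoint paths in this series can bail out uniformly once the
filesystem has been stopped:

	if (unlikely(f2fs_cp_error(sbi)))
		goto redirty_out;	/* checkpoint already failed */
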
@@ -1117,7 +1122,7 @@
  */
 int f2fs_sync_file(struct file *, loff_t, loff_t, int);
 void truncate_data_blocks(struct dnode_of_data *);
-int truncate_blocks(struct inode *, u64);
+int truncate_blocks(struct inode *, u64, bool);
 void f2fs_truncate(struct inode *);
 int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 int f2fs_setattr(struct dentry *, struct iattr *);
@@ -1202,10 +1207,8 @@
 bool alloc_nid(struct f2fs_sb_info *, nid_t *);
 void alloc_nid_done(struct f2fs_sb_info *, nid_t);
 void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
-void recover_node_page(struct f2fs_sb_info *, struct page *,
-		struct f2fs_summary *, struct node_info *, block_t);
 void recover_inline_xattr(struct inode *, struct page *);
-bool recover_xattr_data(struct inode *, struct page *, block_t);
+void recover_xattr_data(struct inode *, struct page *, block_t);
 int recover_inode_page(struct f2fs_sb_info *, struct page *);
 int restore_node_summary(struct f2fs_sb_info *, unsigned int,
 				struct f2fs_summary_block *);
@@ -1238,8 +1241,6 @@
 void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
 void recover_data_page(struct f2fs_sb_info *, struct page *,
 				struct f2fs_summary *, block_t, block_t);
-void rewrite_node_page(struct f2fs_sb_info *, struct page *,
-				struct f2fs_summary *, block_t, block_t);
 void allocate_data_block(struct f2fs_sb_info *, struct page *,
 		block_t, block_t *, struct f2fs_summary *, int);
 void f2fs_wait_on_page_writeback(struct page *, enum page_type);
@@ -1262,6 +1263,7 @@
 long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
 void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
 void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
+void release_dirty_inode(struct f2fs_sb_info *);
 bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
 int acquire_orphan_inode(struct f2fs_sb_info *);
 void release_orphan_inode(struct f2fs_sb_info *);
@@ -1439,8 +1441,8 @@
  */
 bool f2fs_may_inline(struct inode *);
 int f2fs_read_inline_data(struct inode *, struct page *);
-int f2fs_convert_inline_data(struct inode *, pgoff_t);
+int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *);
 int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
 void truncate_inline_data(struct inode *, u64);
-int recover_inline_data(struct inode *, struct page *);
+bool recover_inline_data(struct inode *, struct page *);
 #endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 208f1a9..060aee6 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -41,6 +41,11 @@
 
 	sb_start_pagefault(inode->i_sb);
 
+	/* force conversion to normal data indices */
+	err = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, page);
+	if (err)
+		goto out;
+
 	/* block allocation */
 	f2fs_lock_op(sbi);
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -110,6 +115,25 @@
 	return 1;
 }
 
+static inline bool need_do_checkpoint(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	bool need_cp = false;
+
+	if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
+		need_cp = true;
+	else if (file_wrong_pino(inode))
+		need_cp = true;
+	else if (!space_for_roll_forward(sbi))
+		need_cp = true;
+	else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
+		need_cp = true;
+	else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
+		need_cp = true;
+
+	return need_cp;
+}
+
 int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
@@ -154,23 +178,12 @@
 	/* guarantee free sections for fsync */
 	f2fs_balance_fs(sbi);
 
-	down_read(&fi->i_sem);
-
 	/*
 	 * Both of fdatasync() and fsync() are able to be recovered from
 	 * sudden-power-off.
 	 */
-	if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
-		need_cp = true;
-	else if (file_wrong_pino(inode))
-		need_cp = true;
-	else if (!space_for_roll_forward(sbi))
-		need_cp = true;
-	else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
-		need_cp = true;
-	else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
-		need_cp = true;
-
+	down_read(&fi->i_sem);
+	need_cp = need_do_checkpoint(inode);
 	up_read(&fi->i_sem);
 
 	if (need_cp) {
@@ -288,7 +301,7 @@
 		if (err && err != -ENOENT) {
 			goto fail;
 		} else if (err == -ENOENT) {
-			/* direct node is not exist */
+			/* direct node does not exist */
 			if (whence == SEEK_DATA) {
 				pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
 							F2FS_I(inode));
@@ -417,7 +430,7 @@
 	f2fs_put_page(page, 1);
 }
 
-int truncate_blocks(struct inode *inode, u64 from)
+int truncate_blocks(struct inode *inode, u64 from, bool lock)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	unsigned int blocksize = inode->i_sb->s_blocksize;
@@ -433,14 +446,16 @@
 	free_from = (pgoff_t)
 			((from + blocksize - 1) >> (sbi->log_blocksize));
 
-	f2fs_lock_op(sbi);
+	if (lock)
+		f2fs_lock_op(sbi);
 
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
 	err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
 	if (err) {
 		if (err == -ENOENT)
 			goto free_next;
-		f2fs_unlock_op(sbi);
+		if (lock)
+			f2fs_unlock_op(sbi);
 		trace_f2fs_truncate_blocks_exit(inode, err);
 		return err;
 	}
@@ -458,7 +473,8 @@
 	f2fs_put_dnode(&dn);
 free_next:
 	err = truncate_inode_blocks(inode, free_from);
-	f2fs_unlock_op(sbi);
+	if (lock)
+		f2fs_unlock_op(sbi);
 done:
 	/* lastly zero out the first data page */
 	truncate_partial_data_page(inode, from);
@@ -475,7 +491,7 @@
 
 	trace_f2fs_truncate(inode);
 
-	if (!truncate_blocks(inode, i_size_read(inode))) {
+	if (!truncate_blocks(inode, i_size_read(inode), true)) {
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		mark_inode_dirty(inode);
 	}
@@ -533,7 +549,7 @@
 
 	if ((attr->ia_valid & ATTR_SIZE) &&
 			attr->ia_size != i_size_read(inode)) {
-		err = f2fs_convert_inline_data(inode, attr->ia_size);
+		err = f2fs_convert_inline_data(inode, attr->ia_size, NULL);
 		if (err)
 			return err;
 
@@ -622,7 +638,7 @@
 	loff_t off_start, off_end;
 	int ret = 0;
 
-	ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1);
+	ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
 	if (ret)
 		return ret;
 
@@ -678,7 +694,7 @@
 	if (ret)
 		return ret;
 
-	ret = f2fs_convert_inline_data(inode, offset + len);
+	ret = f2fs_convert_inline_data(inode, offset + len, NULL);
 	if (ret)
 		return ret;
 
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index d7947d9..943a31d 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -58,7 +58,7 @@
 		 * 3. IO subsystem is idle by checking the # of requests in
 		 *    bdev's request list.
 		 *
-		 * Note) We have to avoid triggering GCs too much frequently.
+		 * Note) We have to avoid triggering GCs too frequently.
 		 * Because it is possible that some segments can be
 		 * invalidated soon after by user update or deletion.
 		 * So, I'd like to wait some time to collect dirty segments.
@@ -222,7 +222,7 @@
 
 	u = (vblocks * 100) >> sbi->log_blocks_per_seg;
 
-	/* Handle if the system time is changed by user */
+	/* Handle the case where the system time has been changed by the user */
 	if (mtime < sit_i->min_mtime)
 		sit_i->min_mtime = mtime;
 	if (mtime > sit_i->max_mtime)
@@ -593,7 +593,7 @@
 
 		if (phase == 2) {
 			inode = f2fs_iget(sb, dni.ino);
-			if (IS_ERR(inode))
+			if (IS_ERR(inode) || is_bad_inode(inode))
 				continue;
 
 			start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
@@ -693,7 +693,7 @@
 gc_more:
 	if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
 		goto stop;
-	if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+	if (unlikely(f2fs_cp_error(sbi)))
 		goto stop;
 
 	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 5d5eb60..16f0b2b 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -91,7 +91,7 @@
 	block_t invalid_user_blocks = sbi->user_block_count -
 					written_block_count(sbi);
 	/*
-	 * Background GC is triggered with the following condition.
+	 * Background GC is triggered under the following conditions:
 	 * 1. There are a number of invalid blocks.
 	 * 2. There is not enough free space.
 	 */
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 948d17bf..a844fcf 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -42,7 +42,8 @@
 	buf[1] += b1;
 }
 
-static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num)
+static void str2hashbuf(const unsigned char *msg, size_t len,
+				unsigned int *buf, int num)
 {
 	unsigned pad, val;
 	int i;
@@ -73,9 +74,9 @@
 {
 	__u32 hash;
 	f2fs_hash_t f2fs_hash;
-	const char *p;
+	const unsigned char *p;
 	__u32 in[8], buf[4];
-	const char *name = name_info->name;
+	const unsigned char *name = name_info->name;
 	size_t len = name_info->len;
 
 	if ((len <= 2) && (name[0] == '.') &&
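
The str2hashbuf() change above is a signedness fix: name bytes at or above 0x80 must not sign-extend when they are widened into the hash words. A runnable userspace illustration, assuming char is signed on the target (as on x86):

#include <stdio.h>

static unsigned int widen_signed(const char *p)
{
	return (unsigned int)p[0];	/* 0xE9 becomes 0xffffffe9 */
}

static unsigned int widen_unsigned(const unsigned char *p)
{
	return (unsigned int)p[0];	/* 0xE9 stays 0xe9 */
}

int main(void)
{
	char name[] = { (char)0xE9, 0 };	/* e.g. a non-ASCII name byte */

	printf("signed:   %#x\n", widen_signed(name));
	printf("unsigned: %#x\n", widen_unsigned((const unsigned char *)name));
	return 0;
}
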
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 5beecce..3e8ecdf 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -68,7 +68,7 @@
 
 static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
 {
-	int err;
+	int err = 0;
 	struct page *ipage;
 	struct dnode_of_data dn;
 	void *src_addr, *dst_addr;
@@ -86,6 +86,10 @@
 		goto out;
 	}
 
+	/* someone else converted inline_data already */
+	if (!f2fs_has_inline_data(inode))
+		goto out;
+
 	/*
 	 * i_addr[0] is not used for inline data,
 	 * so reserving new block will not destroy inline data
@@ -124,9 +128,10 @@
 	return err;
 }
 
-int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
+int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size,
+						struct page *page)
 {
-	struct page *page;
+	struct page *new_page = page;
 	int err;
 
 	if (!f2fs_has_inline_data(inode))
@@ -134,17 +139,20 @@
 	else if (to_size <= MAX_INLINE_DATA)
 		return 0;
 
-	page = grab_cache_page(inode->i_mapping, 0);
-	if (!page)
-		return -ENOMEM;
+	if (!page || page->index != 0) {
+		new_page = grab_cache_page(inode->i_mapping, 0);
+		if (!new_page)
+			return -ENOMEM;
+	}
 
-	err = __f2fs_convert_inline_data(inode, page);
-	f2fs_put_page(page, 1);
+	err = __f2fs_convert_inline_data(inode, new_page);
+	if (!page || page->index != 0)
+		f2fs_put_page(new_page, 1);
 	return err;
 }
 
 int f2fs_write_inline_data(struct inode *inode,
-			   struct page *page, unsigned size)
+				struct page *page, unsigned size)
 {
 	void *src_addr, *dst_addr;
 	struct page *ipage;
@@ -199,7 +207,7 @@
 	f2fs_put_page(ipage, 1);
 }
 
-int recover_inline_data(struct inode *inode, struct page *npage)
+bool recover_inline_data(struct inode *inode, struct page *npage)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct f2fs_inode *ri = NULL;
@@ -218,7 +226,7 @@
 		ri = F2FS_INODE(npage);
 
 	if (f2fs_has_inline_data(inode) &&
-			ri && ri->i_inline & F2FS_INLINE_DATA) {
+			ri && (ri->i_inline & F2FS_INLINE_DATA)) {
 process_inline:
 		ipage = get_node_page(sbi, inode->i_ino);
 		f2fs_bug_on(IS_ERR(ipage));
@@ -230,7 +238,7 @@
 		memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
 		update_inode(inode, ipage);
 		f2fs_put_page(ipage, 1);
-		return -1;
+		return true;
 	}
 
 	if (f2fs_has_inline_data(inode)) {
@@ -242,10 +250,10 @@
 		clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
 		update_inode(inode, ipage);
 		f2fs_put_page(ipage, 1);
-	} else if (ri && ri->i_inline & F2FS_INLINE_DATA) {
-		truncate_blocks(inode, 0);
+	} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
+		truncate_blocks(inode, 0, false);
 		set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
 		goto process_inline;
 	}
-	return 0;
+	return false;
 }
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 27b0377..ee103fd 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -134,9 +134,7 @@
 	return 0;
 out:
 	clear_nlink(inode);
-	unlock_new_inode(inode);
-	make_bad_inode(inode);
-	iput(inode);
+	iget_failed(inode);
 	alloc_nid_failed(sbi, ino);
 	return err;
 }
@@ -229,7 +227,7 @@
 	f2fs_delete_entry(de, page, inode);
 	f2fs_unlock_op(sbi);
 
-	/* In order to evict this inode,  we set it dirty */
+	/* In order to evict this inode, we set it dirty */
 	mark_inode_dirty(inode);
 fail:
 	trace_f2fs_unlink_exit(inode, err);
@@ -267,9 +265,7 @@
 	return err;
 out:
 	clear_nlink(inode);
-	unlock_new_inode(inode);
-	make_bad_inode(inode);
-	iput(inode);
+	iget_failed(inode);
 	alloc_nid_failed(sbi, inode->i_ino);
 	return err;
 }
@@ -308,9 +304,7 @@
 out_fail:
 	clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
 	clear_nlink(inode);
-	unlock_new_inode(inode);
-	make_bad_inode(inode);
-	iput(inode);
+	iget_failed(inode);
 	alloc_nid_failed(sbi, inode->i_ino);
 	return err;
 }
@@ -354,9 +348,7 @@
 	return 0;
 out:
 	clear_nlink(inode);
-	unlock_new_inode(inode);
-	make_bad_inode(inode);
-	iput(inode);
+	iget_failed(inode);
 	alloc_nid_failed(sbi, inode->i_ino);
 	return err;
 }
@@ -688,9 +680,7 @@
 out:
 	f2fs_unlock_op(sbi);
 	clear_nlink(inode);
-	unlock_new_inode(inode);
-	make_bad_inode(inode);
-	iput(inode);
+	iget_failed(inode);
 	alloc_nid_failed(sbi, inode->i_ino);
 	return err;
 }
@@ -704,7 +694,6 @@
 	.mkdir		= f2fs_mkdir,
 	.rmdir		= f2fs_rmdir,
 	.mknod		= f2fs_mknod,
-	.rename		= f2fs_rename,
 	.rename2	= f2fs_rename2,
 	.tmpfile	= f2fs_tmpfile,
 	.getattr	= f2fs_getattr,
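
The repeated three-call error paths above collapse into iget_failed(), which in fs/inode.c of this era is essentially the same sequence, with the inode marked bad before it is unlocked and dropped:

/* for reference -- roughly what the replaced open-coded paths did */
void iget_failed(struct inode *inode)
{
	make_bad_inode(inode);
	unlock_new_inode(inode);
	iput(inode);
}
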
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index d3d90d2..4537819 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -237,7 +237,7 @@
 			nat_get_blkaddr(e) != NULL_ADDR &&
 			new_blkaddr == NEW_ADDR);
 
-	/* increament version no as node is removed */
+	/* increment version no as node is removed */
 	if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
 		unsigned char version = nat_get_version(e);
 		nat_set_version(e, inc_node_version(version));
@@ -274,7 +274,7 @@
 }
 
 /*
- * This function returns always success
+ * This function always returns success
  */
 void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
 {
@@ -650,7 +650,7 @@
 
 	/* get indirect nodes in the path */
 	for (i = 0; i < idx + 1; i++) {
-		/* refernece count'll be increased */
+		/* reference count'll be increased */
 		pages[i] = get_node_page(sbi, nid[i]);
 		if (IS_ERR(pages[i])) {
 			err = PTR_ERR(pages[i]);
@@ -823,22 +823,26 @@
  */
 void remove_inode_page(struct inode *inode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	struct page *page;
-	nid_t ino = inode->i_ino;
 	struct dnode_of_data dn;
 
-	page = get_node_page(sbi, ino);
-	if (IS_ERR(page))
+	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
+	if (get_dnode_of_data(&dn, 0, LOOKUP_NODE))
 		return;
 
-	if (truncate_xattr_node(inode, page)) {
-		f2fs_put_page(page, 1);
+	if (truncate_xattr_node(inode, dn.inode_page)) {
+		f2fs_put_dnode(&dn);
 		return;
 	}
-	/* 0 is possible, after f2fs_new_inode() is failed */
+
+	/* remove potential inline_data blocks */
+	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+				S_ISLNK(inode->i_mode))
+		truncate_data_blocks_range(&dn, 1);
+
+	/* 0 is possible after f2fs_new_inode() has failed */
 	f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1);
-	set_new_dnode(&dn, inode, page, page, ino);
+
+	/* will put inode & node pages */
 	truncate_node(&dn);
 }
 
@@ -1129,8 +1133,11 @@
 				set_fsync_mark(page, 0);
 				set_dentry_mark(page, 0);
 			}
-			NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
-			wrote++;
+
+			if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
+				unlock_page(page);
+			else
+				wrote++;
 
 			if (--wbc->nr_to_write == 0)
 				break;
@@ -1212,6 +1219,8 @@
 
 	if (unlikely(sbi->por_doing))
 		goto redirty_out;
+	if (unlikely(f2fs_cp_error(sbi)))
+		goto redirty_out;
 
 	f2fs_wait_on_page_writeback(page, NODE);
 
@@ -1540,15 +1549,6 @@
 		kmem_cache_free(free_nid_slab, i);
 }
 
-void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
-		struct f2fs_summary *sum, struct node_info *ni,
-		block_t new_blkaddr)
-{
-	rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
-	set_node_addr(sbi, ni, new_blkaddr, false);
-	clear_node_page_dirty(page);
-}
-
 void recover_inline_xattr(struct inode *inode, struct page *page)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -1557,40 +1557,33 @@
 	struct page *ipage;
 	struct f2fs_inode *ri;
 
-	if (!f2fs_has_inline_xattr(inode))
-		return;
-
-	if (!IS_INODE(page))
-		return;
-
-	ri = F2FS_INODE(page);
-	if (!(ri->i_inline & F2FS_INLINE_XATTR))
-		return;
-
 	ipage = get_node_page(sbi, inode->i_ino);
 	f2fs_bug_on(IS_ERR(ipage));
 
+	ri = F2FS_INODE(page);
+	if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
+		clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR);
+		goto update_inode;
+	}
+
 	dst_addr = inline_xattr_addr(ipage);
 	src_addr = inline_xattr_addr(page);
 	inline_size = inline_xattr_size(inode);
 
 	f2fs_wait_on_page_writeback(ipage, NODE);
 	memcpy(dst_addr, src_addr, inline_size);
-
+update_inode:
 	update_inode(inode, ipage);
 	f2fs_put_page(ipage, 1);
 }
 
-bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
+void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
 	nid_t new_xnid = nid_of_node(page);
 	struct node_info ni;
 
-	if (!f2fs_has_xattr_block(ofs_of_node(page)))
-		return false;
-
 	/* 1: invalidate the previous xattr nid */
 	if (!prev_xnid)
 		goto recover_xnid;
@@ -1618,7 +1611,6 @@
 	set_node_addr(sbi, &ni, blkaddr, false);
 
 	update_inode_page(inode);
-	return true;
 }
 
 int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
@@ -1637,7 +1629,7 @@
 	if (!ipage)
 		return -ENOMEM;
 
-	/* Should not use this inode  from free nid list */
+	/* Should not use this inode from free nid list */
 	remove_free_nid(NM_I(sbi), ino);
 
 	SetPageUptodate(ipage);
@@ -1651,6 +1643,7 @@
 	dst->i_blocks = cpu_to_le64(1);
 	dst->i_links = cpu_to_le32(1);
 	dst->i_xattr_nid = 0;
+	dst->i_inline = src->i_inline & F2FS_INLINE_XATTR;
 
 	new_ni = old_ni;
 	new_ni.ino = ino;
@@ -1659,13 +1652,14 @@
 		WARN_ON(1);
 	set_node_addr(sbi, &new_ni, NEW_ADDR, false);
 	inc_valid_inode_count(sbi);
+	set_page_dirty(ipage);
 	f2fs_put_page(ipage, 1);
 	return 0;
 }
 
 /*
  * ra_sum_pages() merges contiguous pages into one bio and submits it.
- * these pre-readed pages are alloced in bd_inode's mapping tree.
+ * these pre-read pages are allocated in bd_inode's mapping tree.
  */
 static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
 				int start, int nrpages)
@@ -1709,7 +1703,7 @@
 	for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
 		nrpages = min(last_offset - i, bio_blocks);
 
-		/* read ahead node pages */
+		/* readahead node pages */
 		nrpages = ra_sum_pages(sbi, pages, addr, nrpages);
 		if (!nrpages)
 			return -ENOMEM;
@@ -1967,7 +1961,7 @@
 	nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
 
 	/* not used nids: 0, node, meta, (and root counted as valid node) */
-	nm_i->available_nids = nm_i->max_nid - 3;
+	nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM;
 	nm_i->fcnt = 0;
 	nm_i->nat_cnt = 0;
 	nm_i->ram_thresh = DEF_RAM_THRESHOLD;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index fe1c6d9..756c41c 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -62,8 +62,10 @@
 	}
 retry:
 	de = f2fs_find_entry(dir, &name, &page);
-	if (de && inode->i_ino == le32_to_cpu(de->ino))
+	if (de && inode->i_ino == le32_to_cpu(de->ino)) {
+		clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
 		goto out_unmap_put;
+	}
 	if (de) {
 		einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
 		if (IS_ERR(einode)) {
@@ -300,14 +302,19 @@
 	struct node_info ni;
 	int err = 0, recovered = 0;
 
-	recover_inline_xattr(inode, page);
+	/* step 1: recover xattr */
+	if (IS_INODE(page)) {
+		recover_inline_xattr(inode, page);
+	} else if (f2fs_has_xattr_block(ofs_of_node(page))) {
+		recover_xattr_data(inode, page, blkaddr);
+		goto out;
+	}
 
+	/* step 2: recover inline data */
 	if (recover_inline_data(inode, page))
 		goto out;
 
-	if (recover_xattr_data(inode, page, blkaddr))
-		goto out;
-
+	/* step 3: recover data indices */
 	start = start_bidx_of_node(ofs_of_node(page), fi);
 	end = start + ADDRS_PER_PAGE(page, fi);
 
@@ -364,8 +371,6 @@
 	fill_node_footer(dn.node_page, dn.nid, ni.ino,
 					ofs_of_node(page), false);
 	set_page_dirty(dn.node_page);
-
-	recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
 err:
 	f2fs_put_dnode(&dn);
 	f2fs_unlock_op(sbi);
@@ -452,6 +457,9 @@
 	/* step #1: find fsynced inode numbers */
 	sbi->por_doing = true;
 
+	/* prevent checkpoint */
+	mutex_lock(&sbi->cp_mutex);
+
 	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
 
 	err = find_fsync_dnodes(sbi, &inode_list);
@@ -465,7 +473,8 @@
 
 	/* step #2: recover data */
 	err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
-	f2fs_bug_on(!list_empty(&inode_list));
+	if (!err)
+		f2fs_bug_on(!list_empty(&inode_list));
 out:
 	destroy_fsync_dnodes(&inode_list);
 	kmem_cache_destroy(fsync_entry_slab);
@@ -482,8 +491,13 @@
 		/* Flush all the NAT/SIT pages */
 		while (get_pages(sbi, F2FS_DIRTY_META))
 			sync_meta_pages(sbi, META, LONG_MAX);
+		set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+		mutex_unlock(&sbi->cp_mutex);
 	} else if (need_writecp) {
+		mutex_unlock(&sbi->cp_mutex);
 		write_checkpoint(sbi, false);
+	} else {
+		mutex_unlock(&sbi->cp_mutex);
 	}
 	return err;
 }
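
Recovery now holds cp_mutex for its whole run to keep checkpoints out, which is why the exit paths above branch three ways: the mutex must be released on every path, and before write_checkpoint() on the success path since the checkpoint code takes cp_mutex itself. A condensed sketch, with do_recovery() as a hypothetical stand-in for steps #1 and #2:

mutex_lock(&sbi->cp_mutex);		/* block checkpoints during recovery */
err = do_recovery(sbi);			/* hypothetical: find + recover dnodes */
if (err) {
	set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
	mutex_unlock(&sbi->cp_mutex);
} else if (need_writecp) {
	mutex_unlock(&sbi->cp_mutex);	/* write_checkpoint() retakes it */
	write_checkpoint(sbi, false);
} else {
	mutex_unlock(&sbi->cp_mutex);
}
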
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0dfeeba..0aa337c 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -62,7 +62,7 @@
 }
 
 /*
- * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue
+ * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
  * f2fs_set_bit makes MSB and LSB reversed in a byte.
  * Example:
  *                             LSB <--> MSB
@@ -808,7 +808,7 @@
 }
 
 /*
- * This function always allocates a used segment (from dirty seglist) by SSR
+ * This function always allocates a used segment(from dirty seglist) by SSR
  * manner, so it should recover the existing segment information of valid blocks
  */
 static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
@@ -1103,55 +1103,6 @@
 	mutex_unlock(&curseg->curseg_mutex);
 }
 
-void rewrite_node_page(struct f2fs_sb_info *sbi,
-			struct page *page, struct f2fs_summary *sum,
-			block_t old_blkaddr, block_t new_blkaddr)
-{
-	struct sit_info *sit_i = SIT_I(sbi);
-	int type = CURSEG_WARM_NODE;
-	struct curseg_info *curseg;
-	unsigned int segno, old_cursegno;
-	block_t next_blkaddr = next_blkaddr_of_node(page);
-	unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
-	struct f2fs_io_info fio = {
-		.type = NODE,
-		.rw = WRITE_SYNC,
-	};
-
-	curseg = CURSEG_I(sbi, type);
-
-	mutex_lock(&curseg->curseg_mutex);
-	mutex_lock(&sit_i->sentry_lock);
-
-	segno = GET_SEGNO(sbi, new_blkaddr);
-	old_cursegno = curseg->segno;
-
-	/* change the current segment */
-	if (segno != curseg->segno) {
-		curseg->next_segno = segno;
-		change_curseg(sbi, type, true);
-	}
-	curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
-	__add_sum_entry(sbi, type, sum);
-
-	/* change the current log to the next block addr in advance */
-	if (next_segno != segno) {
-		curseg->next_segno = next_segno;
-		change_curseg(sbi, type, true);
-	}
-	curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr);
-
-	/* rewrite node page */
-	set_page_writeback(page);
-	f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
-	f2fs_submit_merged_bio(sbi, NODE, WRITE);
-	refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
-	locate_dirty_segment(sbi, old_cursegno);
-
-	mutex_unlock(&sit_i->sentry_lock);
-	mutex_unlock(&curseg->curseg_mutex);
-}
-
 static inline bool is_merged_page(struct f2fs_sb_info *sbi,
 					struct page *page, enum page_type type)
 {
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 55973f7..ff48325 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -549,7 +549,7 @@
 }
 
 /*
- * Summary block is always treated as invalid block
+ * Summary block is always treated as an invalid block
  */
 static inline void check_block_count(struct f2fs_sb_info *sbi,
 		int segno, struct f2fs_sit_entry *raw_sit)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 657582f..41bdf51 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -432,9 +432,15 @@
 	stop_gc_thread(sbi);
 
 	/* We don't need to do checkpoint when it's clean */
-	if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES))
+	if (sbi->s_dirty)
 		write_checkpoint(sbi, true);
 
+	/*
+	 * Normally the superblock is clean, so we release dirty inodes here.
+	 * In addition, an EIO skips the checkpoint, so we need this as well.
+	 */
+	release_dirty_inode(sbi);
+
 	iput(sbi->node_inode);
 	iput(sbi->meta_inode);
 
@@ -457,9 +463,6 @@
 
 	trace_f2fs_sync_fs(sb, sync);
 
-	if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
-		return 0;
-
 	if (sync) {
 		mutex_lock(&sbi->gc_mutex);
 		write_checkpoint(sbi, false);
@@ -505,8 +508,8 @@
 	buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
 	buf->f_bavail = user_block_count - valid_user_blocks(sbi);
 
-	buf->f_files = sbi->total_node_count;
-	buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi);
+	buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
+	buf->f_ffree = buf->f_files - valid_inode_count(sbi);
 
 	buf->f_namelen = F2FS_NAME_LEN;
 	buf->f_fsid.val[0] = (u32)id;
@@ -663,7 +666,7 @@
 	if (need_restart_gc) {
 		if (start_gc_thread(sbi))
 			f2fs_msg(sbi->sb, KERN_WARNING,
-				"background gc thread is stop");
+				"background gc thread has stopped");
 	} else if (need_stop_gc) {
 		stop_gc_thread(sbi);
 	}
@@ -812,7 +815,7 @@
 	if (unlikely(fsmeta >= total))
 		return 1;
 
-	if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
+	if (unlikely(f2fs_cp_error(sbi))) {
 		f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
 		return 1;
 	}
@@ -899,8 +902,10 @@
 	struct buffer_head *raw_super_buf;
 	struct inode *root;
 	long err = -EINVAL;
+	bool retry = true;
 	int i;
 
+try_onemore:
 	/* allocate memory for f2fs-specific super block info */
 	sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
 	if (!sbi)
@@ -1080,9 +1085,11 @@
 	/* recover fsynced data */
 	if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
 		err = recover_fsync_data(sbi);
-		if (err)
+		if (err) {
 			f2fs_msg(sb, KERN_ERR,
 				"Cannot recover all fsync data errno=%ld", err);
+			goto free_kobj;
+		}
 	}
 
 	/*
@@ -1123,6 +1130,13 @@
 	brelse(raw_super_buf);
 free_sbi:
 	kfree(sbi);
+
+	/* give only one more chance */
+	if (retry) {
+		retry = false;
+		shrink_dcache_sb(sb);
+		goto try_onemore;
+	}
 	return err;
 }
 
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 8bea941..728a5dc 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -528,7 +528,7 @@
 		int free;
 		/*
 		 * If value is NULL, it is remove operation.
-		 * In case of update operation, we caculate free.
+		 * In case of update operation, we calculate free.
 		 */
 		free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr);
 		if (found)
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 8f27c93..ec9e082f 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -253,13 +253,11 @@
 
 	error = make_socks(serv, net);
 	if (error < 0)
-		goto err_socks;
+		goto err_bind;
 	set_grace_period(net);
 	dprintk("lockd_up_net: per-net data created; net=%p\n", net);
 	return 0;
 
-err_socks:
-	svc_rpcb_cleanup(serv, net);
 err_bind:
 	ln->nlmsvc_users--;
 	return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index a01c773..ef42d9b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1217,6 +1217,11 @@
 	head.first->pprev = &head.first;
 	INIT_HLIST_HEAD(&unmounted);
 
+	/* undo decrements we'd done in umount_tree() */
+	hlist_for_each_entry(mnt, &head, mnt_hash)
+		if (mnt->mnt_ex_mountpoint.mnt)
+			mntget(mnt->mnt_ex_mountpoint.mnt);
+
 	up_write(&namespace_sem);
 
 	synchronize_rcu();
@@ -1253,6 +1258,9 @@
 		hlist_add_head(&p->mnt_hash, &tmp_list);
 	}
 
+	hlist_for_each_entry(p, &tmp_list, mnt_hash)
+		list_del_init(&p->mnt_child);
+
 	if (how)
 		propagate_umount(&tmp_list);
 
@@ -1263,9 +1271,9 @@
 		p->mnt_ns = NULL;
 		if (how < 2)
 			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
-		list_del_init(&p->mnt_child);
 		if (mnt_has_parent(p)) {
 			put_mountpoint(p->mnt_mp);
+			mnt_add_count(p->mnt_parent, -1);
 			/* move the reference to mountpoint into ->mnt_ex_mountpoint */
 			p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
 			p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f9821ce..e94457c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2657,6 +2657,7 @@
 	struct xdr_stream *xdr = cd->xdr;
 	int start_offset = xdr->buf->len;
 	int cookie_offset;
+	u32 name_and_cookie;
 	int entry_bytes;
 	__be32 nfserr = nfserr_toosmall;
 	__be64 wire_offset;
@@ -2718,7 +2719,14 @@
 	cd->rd_maxcount -= entry_bytes;
 	if (!cd->rd_dircount)
 		goto fail;
-	cd->rd_dircount--;
+	/*
+	 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
+	 * let's always let through the first entry, at least:
+	 */
+	name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8;
+	if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
+		goto fail;
+	cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
 	cd->cookie_offset = cookie_offset;
 skip_entry:
 	cd->common.err = nfs_ok;
@@ -3321,6 +3329,10 @@
 	}
 	maxcount = min_t(int, maxcount-16, bytes_left);
 
+	/* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
+	if (!readdir->rd_dircount)
+		readdir->rd_dircount = INT_MAX;
+
 	readdir->xdr = xdr;
 	readdir->rd_maxcount = maxcount;
 	readdir->common.err = 0;
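
The per-entry charge above is the XDR-encoded size of just the name and cookie: XDR_QUADLEN() rounds the name length up to 4-byte XDR words, and the 8 covers the 64-bit cookie. A quick userspace check, with XDR_QUADLEN() as defined in include/linux/sunrpc/xdr.h:

#include <stdio.h>

#define XDR_QUADLEN(l)	(((l) + 3) >> 2)

int main(void)
{
	unsigned int namlen;

	for (namlen = 1; namlen <= 9; namlen += 4)
		printf("namlen %u -> dircount charge %u bytes\n",
		       namlen, 4 * XDR_QUADLEN(namlen) + 8);
	/* prints 12, 16 and 20 for names of 1, 5 and 9 bytes */
	return 0;
}
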
diff --git a/fs/pnode.c b/fs/pnode.c
index 302bf22..aae331a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -381,6 +381,7 @@
 		 * other children
 		 */
 		if (child && list_empty(&child->mnt_mounts)) {
+			list_del_init(&child->mnt_child);
 			hlist_del_init_rcu(&child->mnt_hash);
 			hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
 		}
diff --git a/fs/sync.c b/fs/sync.c
index b28d1dd..bdc729d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -65,7 +65,7 @@
 		return ret;
 	return __sync_filesystem(sb, 1);
 }
-EXPORT_SYMBOL_GPL(sync_filesystem);
+EXPORT_SYMBOL(sync_filesystem);
 
 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
 {
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 80c3502..b46ffa9 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -333,8 +333,7 @@
 		spin_lock_irq(&ctx->wqh.lock);
 		if (!timerfd_canceled(ctx)) {
 			ctx->ticks = ticks;
-			if (ticks)
-				wake_up_locked(&ctx->wqh);
+			wake_up_locked(&ctx->wqh);
 		} else
 			ret = -ECANCELED;
 		spin_unlock_irq(&ctx->wqh.lock);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7c580c9..be7d42c 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -902,9 +902,6 @@
 	invalidate_inode_buffers(inode);
 	clear_inode(inode);
 
-	if (want_delete) {
-		lock_ufs(inode->i_sb);
-		ufs_free_inode (inode);
-		unlock_ufs(inode->i_sb);
-	}
+	if (want_delete)
+		ufs_free_inode(inode);
 }
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 90d74b8..2df62a7 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -126,12 +126,12 @@
 	if (l > sb->s_blocksize)
 		goto out_notlocked;
 
-	lock_ufs(dir->i_sb);
 	inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
-		goto out;
+		goto out_notlocked;
 
+	lock_ufs(dir->i_sb);
 	if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
 		/* slow symlink */
 		inode->i_op = &ufs_symlink_inode_operations;
@@ -181,13 +181,9 @@
 	struct inode * inode;
 	int err;
 
-	lock_ufs(dir->i_sb);
-	inode_inc_link_count(dir);
-
 	inode = ufs_new_inode(dir, S_IFDIR|mode);
-	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
-		goto out_dir;
+		return PTR_ERR(inode);
 
 	inode->i_op = &ufs_dir_inode_operations;
 	inode->i_fop = &ufs_dir_operations;
@@ -195,6 +191,9 @@
 
 	inode_inc_link_count(inode);
 
+	lock_ufs(dir->i_sb);
+	inode_inc_link_count(dir);
+
 	err = ufs_make_empty(inode, dir);
 	if (err)
 		goto out_fail;
@@ -212,7 +211,6 @@
 	inode_dec_link_count(inode);
 	inode_dec_link_count(inode);
 	iput (inode);
-out_dir:
 	inode_dec_link_count(dir);
 	unlock_ufs(dir->i_sb);
 	goto out;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index de2d26d..86df952 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5424,7 +5424,7 @@
 	struct xfs_bmap_free	*flist,
 	int			num_exts)
 {
-	struct xfs_btree_cur		*cur;
+	struct xfs_btree_cur		*cur = NULL;
 	struct xfs_bmbt_rec_host	*gotp;
 	struct xfs_bmbt_irec            got;
 	struct xfs_bmbt_irec		left;
@@ -5435,7 +5435,7 @@
 	int				error = 0;
 	int				i;
 	int				whichfork = XFS_DATA_FORK;
-	int				logflags;
+	int				logflags = 0;
 	xfs_filblks_t			blockcount = 0;
 	int				total_extents;
 
@@ -5478,16 +5478,11 @@
 		}
 	}
 
-	/* We are going to change core inode */
-	logflags = XFS_ILOG_CORE;
 	if (ifp->if_flags & XFS_IFBROOT) {
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 		cur->bc_private.b.firstblock = *firstblock;
 		cur->bc_private.b.flist = flist;
 		cur->bc_private.b.flags = 0;
-	} else {
-		cur = NULL;
-		logflags |= XFS_ILOG_DEXT;
 	}
 
 	/*
@@ -5545,11 +5540,14 @@
 			blockcount = left.br_blockcount +
 				got.br_blockcount;
 			xfs_iext_remove(ip, *current_ext, 1, 0);
+			logflags |= XFS_ILOG_CORE;
 			if (cur) {
 				error = xfs_btree_delete(cur, &i);
 				if (error)
 					goto del_cursor;
 				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+			} else {
+				logflags |= XFS_ILOG_DEXT;
 			}
 			XFS_IFORK_NEXT_SET(ip, whichfork,
 				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
@@ -5575,6 +5573,7 @@
 			got.br_startoff = startoff;
 		}
 
+		logflags |= XFS_ILOG_CORE;
 		if (cur) {
 			error = xfs_bmbt_update(cur, got.br_startoff,
 						got.br_startblock,
@@ -5582,6 +5581,8 @@
 						got.br_state);
 			if (error)
 				goto del_cursor;
+		} else {
+			logflags |= XFS_ILOG_DEXT;
 		}
 
 		(*current_ext)++;
@@ -5597,6 +5598,7 @@
 		xfs_btree_del_cursor(cur,
 			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
 
-	xfs_trans_log_inode(tp, ip, logflags);
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
 	return error;
 }
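
The collapse-extents rework above defers all logging decisions: logflags starts at zero, XFS_ILOG_CORE and (for the no-btree case) XFS_ILOG_DEXT are OR-ed in only when an extent record actually changes, and the inode is logged only if anything accumulated. The shape, reduced to a sketch with a hypothetical changed_extent condition:

int logflags = 0;

if (changed_extent) {		/* hypothetical: a record was updated */
	logflags |= XFS_ILOG_CORE;
	if (!cur)		/* extents held inline, not in a btree */
		logflags |= XFS_ILOG_DEXT;
}
if (logflags)
	xfs_trans_log_inode(tp, ip, logflags);
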
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 11e9b4c..b984647 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1753,11 +1753,72 @@
 	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
 }
 
+/*
+ * This is basically a copy of __set_page_dirty_buffers() with one
+ * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
+ * dirty, we'll never be able to clean them because we don't write buffers
+ * beyond EOF, and that means we can't invalidate pages that span EOF
+ * that have been marked dirty. Further, the dirty state can leak into
+ * the file interior if the file is extended, resulting in all sorts of
+ * bad things happening as the state does not match the underlying data.
+ *
+ * XXX: this really indicates that bufferheads in XFS need to die. Warts like
+ * this only exist because of bufferheads and how the generic code manages them.
+ */
+STATIC int
+xfs_vm_set_page_dirty(
+	struct page		*page)
+{
+	struct address_space	*mapping = page->mapping;
+	struct inode		*inode = mapping->host;
+	loff_t			end_offset;
+	loff_t			offset;
+	int			newly_dirty;
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	end_offset = i_size_read(inode);
+	offset = page_offset(page);
+
+	spin_lock(&mapping->private_lock);
+	if (page_has_buffers(page)) {
+		struct buffer_head *head = page_buffers(page);
+		struct buffer_head *bh = head;
+
+		do {
+			if (offset < end_offset)
+				set_buffer_dirty(bh);
+			bh = bh->b_this_page;
+			offset += 1 << inode->i_blkbits;
+		} while (bh != head);
+	}
+	newly_dirty = !TestSetPageDirty(page);
+	spin_unlock(&mapping->private_lock);
+
+	if (newly_dirty) {
+		/* sigh - __set_page_dirty() is static, so copy it here, too */
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		if (page->mapping) {	/* Race with truncate? */
+			WARN_ON_ONCE(!PageUptodate(page));
+			account_page_dirtied(page, mapping);
+			radix_tree_tag_set(&mapping->page_tree,
+					page_index(page), PAGECACHE_TAG_DIRTY);
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	}
+	return newly_dirty;
+}
+
 const struct address_space_operations xfs_address_space_operations = {
 	.readpage		= xfs_vm_readpage,
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
 	.writepages		= xfs_vm_writepages,
+	.set_page_dirty		= xfs_vm_set_page_dirty,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
 	.write_begin		= xfs_vm_write_begin,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 2f1e30d..1707980 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1470,6 +1470,26 @@
 	start_fsb = XFS_B_TO_FSB(mp, offset + len);
 	shift_fsb = XFS_B_TO_FSB(mp, len);
 
+	/*
+	 * Write back the entire file and forcibly remove any post-EOF blocks. The
+	 * writeback prevents changes to the extent list via concurrent
+	 * writeback and the eofblocks trim prevents the extent shift algorithm
+	 * from running into a post-eof delalloc extent.
+	 *
+	 * XXX: This is a temporary fix until the extent shift loop below is
+	 * converted to use offsets and lookups within the ILOCK rather than
+	 * carrying around the index into the extent list for the next
+	 * iteration.
+	 */
+	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+	if (error)
+		return error;
+	if (xfs_can_free_eofblocks(ip, true)) {
+		error = xfs_free_eofblocks(mp, ip, false);
+		if (error)
+			return error;
+	}
+
 	error = xfs_free_file_space(ip, offset, len);
 	if (error)
 		return error;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 076b170..de5368c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -291,12 +291,22 @@
 		if (inode->i_mapping->nrpages) {
 			ret = filemap_write_and_wait_range(
 							VFS_I(ip)->i_mapping,
-							pos, -1);
+							pos, pos + size - 1);
 			if (ret) {
 				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
 				return ret;
 			}
-			truncate_pagecache_range(VFS_I(ip), pos, -1);
+
+			/*
+			 * Invalidate whole pages. This can return an error if
+			 * we fail to invalidate a page, but this should never
+			 * happen on XFS. Warn if it does fail.
+			 */
+			ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					(pos + size - 1) >> PAGE_CACHE_SHIFT);
+			WARN_ON_ONCE(ret);
+			ret = 0;
 		}
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 	}
@@ -632,10 +642,19 @@
 
 	if (mapping->nrpages) {
 		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-						    pos, -1);
+						    pos, pos + count - 1);
 		if (ret)
 			goto out;
-		truncate_pagecache_range(VFS_I(ip), pos, -1);
+		/*
+		 * Invalidate whole pages. This can return an error if
+		 * we fail to invalidate a page, but this should never
+		 * happen on XFS. Warn if it does fail.
+		 */
+		ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					(pos + count - 1) >> PAGE_CACHE_SHIFT);
+		WARN_ON_ONCE(ret);
+		ret = 0;
 	}
 
 	/*
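
Both direct-I/O paths above stop truncating the page cache from pos to EOF and instead invalidate exactly the pages covering the written byte range. The byte-to-page-index conversion, checked in userspace with 4 KiB pages assumed:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12	/* 4096-byte pages assumed */

int main(void)
{
	unsigned long long pos = 5000, count = 10000;

	printf("invalidate pages %llu..%llu\n",
	       pos >> PAGE_CACHE_SHIFT,
	       (pos + count - 1) >> PAGE_CACHE_SHIFT);	/* 1..3 inclusive */
	return 0;
}
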
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index bcfd808..c1c9de1 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -246,7 +246,6 @@
 	acpi_device_name device_name;	/* Driver-determined */
 	acpi_device_class device_class;	/*        "          */
 	union acpi_object *str_obj;	/* unicode string for _STR method */
-	unsigned long sun;		/* _SUN */
 };
 
 #define acpi_device_bid(d)	((d)->pnp.bus_id)
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index 9c79e76..56d4d36 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -18,14 +18,100 @@
 #include <asm/cmpxchg.h>
 #include <asm/barrier.h>
 
+/*
+ * atomic_$op() - $op integer to atomic variable
+ * @i: integer value to $op
+ * @v: pointer to the atomic variable
+ *
+ * Atomically $ops @i to @v. Does not strictly guarantee a memory barrier; use
+ * smp_mb__{before,after}_atomic().
+ */
+
+/*
+ * atomic_$op_return() - $op integer to atomic variable and returns the result
+ * @i: integer value to $op
+ * @v: pointer to the atomic variable
+ *
+ * Atomically $ops @i to @v. Does imply a full memory barrier.
+ */
+
 #ifdef CONFIG_SMP
-/* Force people to define core atomics */
-# if !defined(atomic_add_return) || !defined(atomic_sub_return) || \
-     !defined(atomic_clear_mask) || !defined(atomic_set_mask)
-#  error "SMP requires a little arch-specific magic"
-# endif
+
+/* we can build all atomic primitives from cmpxchg */
+
+#define ATOMIC_OP(op, c_op)						\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	int c, old;							\
+									\
+	c = v->counter;							\
+	while ((old = cmpxchg(&v->counter, c, c c_op i)) != c)		\
+		c = old;						\
+}
+
+#define ATOMIC_OP_RETURN(op, c_op)					\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	int c, old;							\
+									\
+	c = v->counter;							\
+	while ((old = cmpxchg(&v->counter, c, c c_op i)) != c)		\
+		c = old;						\
+									\
+	return c c_op i;						\
+}
+
+#else
+
+#include <linux/irqflags.h>
+
+#define ATOMIC_OP(op, c_op)						\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	unsigned long flags;						\
+									\
+	raw_local_irq_save(flags);					\
+	v->counter = v->counter c_op i;					\
+	raw_local_irq_restore(flags);					\
+}
+
+#define ATOMIC_OP_RETURN(op, c_op)					\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	unsigned long flags;						\
+	int ret;							\
+									\
+	raw_local_irq_save(flags);					\
+	ret = (v->counter = v->counter c_op i);				\
+	raw_local_irq_restore(flags);					\
+									\
+	return ret;							\
+}
+
+#endif /* CONFIG_SMP */
+
+#ifndef atomic_add_return
+ATOMIC_OP_RETURN(add, +)
 #endif
 
+#ifndef atomic_sub_return
+ATOMIC_OP_RETURN(sub, -)
+#endif
+
+#ifndef atomic_clear_mask
+ATOMIC_OP(and, &)
+#define atomic_clear_mask(i, v) atomic_and(~(i), (v))
+#endif
+
+#ifndef atomic_set_mask
+#define CONFIG_ARCH_HAS_ATOMIC_OR
+ATOMIC_OP(or, |)
+#define atomic_set_mask(i, v)	atomic_or((i), (v))
+#endif
+
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
 /*
  * Atomic operations that C can't guarantee us.  Useful for
  * resource counting etc..
@@ -33,8 +119,6 @@
 
 #define ATOMIC_INIT(i)	{ (i) }
 
-#ifdef __KERNEL__
-
 /**
  * atomic_read - read atomic variable
  * @v: pointer of type atomic_t
@@ -56,52 +140,6 @@
 
 #include <linux/irqflags.h>
 
-/**
- * atomic_add_return - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v and returns the result
- */
-#ifndef atomic_add_return
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	unsigned long flags;
-	int temp;
-
-	raw_local_irq_save(flags); /* Don't trace it in an irqsoff handler */
-	temp = v->counter;
-	temp += i;
-	v->counter = temp;
-	raw_local_irq_restore(flags);
-
-	return temp;
-}
-#endif
-
-/**
- * atomic_sub_return - subtract integer from atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v and returns the result
- */
-#ifndef atomic_sub_return
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	unsigned long flags;
-	int temp;
-
-	raw_local_irq_save(flags); /* Don't trace it in an irqsoff handler */
-	temp = v->counter;
-	temp -= i;
-	v->counter = temp;
-	raw_local_irq_restore(flags);
-
-	return temp;
-}
-#endif
-
 static inline int atomic_add_negative(int i, atomic_t *v)
 {
 	return atomic_add_return(i, v) < 0;
@@ -139,49 +177,11 @@
 
 static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
-  int c, old;
-  c = atomic_read(v);
-  while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c)
-    c = old;
-  return c;
+	int c, old;
+	c = atomic_read(v);
+	while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c)
+		c = old;
+	return c;
 }
 
-/**
- * atomic_clear_mask - Atomically clear bits in atomic variable
- * @mask: Mask of the bits to be cleared
- * @v: pointer of type atomic_t
- *
- * Atomically clears the bits set in @mask from @v
- */
-#ifndef atomic_clear_mask
-static inline void atomic_clear_mask(unsigned long mask, atomic_t *v)
-{
-	unsigned long flags;
-
-	mask = ~mask;
-	raw_local_irq_save(flags); /* Don't trace it in a irqsoff handler */
-	v->counter &= mask;
-	raw_local_irq_restore(flags);
-}
-#endif
-
-/**
- * atomic_set_mask - Atomically set bits in atomic variable
- * @mask: Mask of the bits to be set
- * @v: pointer of type atomic_t
- *
- * Atomically sets the bits set in @mask in @v
- */
-#ifndef atomic_set_mask
-static inline void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
-	unsigned long flags;
-
-	raw_local_irq_save(flags); /* Don't trace it in a irqsoff handler */
-	v->counter |= mask;
-	raw_local_irq_restore(flags);
-}
-#endif
-
-#endif /* __KERNEL__ */
 #endif /* __ASM_GENERIC_ATOMIC_H */
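
All of the generated SMP variants above share one shape: read the counter, compute the new value, cmpxchg it in, and retry if another CPU raced. A runnable userspace model of what ATOMIC_OP_RETURN(add, +) expands to, with GCC's __sync_val_compare_and_swap() standing in for the kernel's cmpxchg():

#include <stdio.h>

typedef struct { int counter; } atomic_t;

static int atomic_add_return(int i, atomic_t *v)
{
	int c, old;

	c = v->counter;
	/* retry until our snapshot c is still the current value */
	while ((old = __sync_val_compare_and_swap(&v->counter, c, c + i)) != c)
		c = old;

	return c + i;
}

int main(void)
{
	atomic_t v = { 40 };

	printf("%d\n", atomic_add_return(2, &v));	/* 42 */
	return 0;
}
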
diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
index b18ce4f..30ad9c8 100644
--- a/include/asm-generic/atomic64.h
+++ b/include/asm-generic/atomic64.h
@@ -20,10 +20,22 @@
 
 extern long long atomic64_read(const atomic64_t *v);
 extern void	 atomic64_set(atomic64_t *v, long long i);
-extern void	 atomic64_add(long long a, atomic64_t *v);
-extern long long atomic64_add_return(long long a, atomic64_t *v);
-extern void	 atomic64_sub(long long a, atomic64_t *v);
-extern long long atomic64_sub_return(long long a, atomic64_t *v);
+
+#define ATOMIC64_OP(op)							\
+extern void	 atomic64_##op(long long a, atomic64_t *v);
+
+#define ATOMIC64_OP_RETURN(op)						\
+extern long long atomic64_##op##_return(long long a, atomic64_t *v);
+
+#define ATOMIC64_OPS(op)	ATOMIC64_OP(op) ATOMIC64_OP_RETURN(op)
+
+ATOMIC64_OPS(add)
+ATOMIC64_OPS(sub)
+
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
+
 extern long long atomic64_dec_if_positive(atomic64_t *v);
 extern long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n);
 extern long long atomic64_xchg(atomic64_t *v, long long new);
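
For reference, ATOMIC64_OPS(add) above expands to exactly the pair of declarations it replaces (the helper macros are then #undef'd so they cannot leak):

extern void	 atomic64_add(long long a, atomic64_t *v);
extern long long atomic64_add_return(long long a, atomic64_t *v);
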
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index fef3a80..5b08a85 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -3,42 +3,6 @@
 #define _LINUX_ATOMIC_H
 #include <asm/atomic.h>
 
-/*
- * Provide __deprecated wrappers for the new interface, avoid flag day changes.
- * We need the ugly external functions to break header recursion hell.
- */
-#ifndef smp_mb__before_atomic_inc
-static inline void __deprecated smp_mb__before_atomic_inc(void)
-{
-	extern void __smp_mb__before_atomic(void);
-	__smp_mb__before_atomic();
-}
-#endif
-
-#ifndef smp_mb__after_atomic_inc
-static inline void __deprecated smp_mb__after_atomic_inc(void)
-{
-	extern void __smp_mb__after_atomic(void);
-	__smp_mb__after_atomic();
-}
-#endif
-
-#ifndef smp_mb__before_atomic_dec
-static inline void __deprecated smp_mb__before_atomic_dec(void)
-{
-	extern void __smp_mb__before_atomic(void);
-	__smp_mb__before_atomic();
-}
-#endif
-
-#ifndef smp_mb__after_atomic_dec
-static inline void __deprecated smp_mb__after_atomic_dec(void)
-{
-	extern void __smp_mb__after_atomic(void);
-	__smp_mb__after_atomic();
-}
-#endif
-
 /**
  * atomic_add_unless - add unless the number is already a given value
  * @v: pointer of type atomic_t
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cbc5833..be5fd38 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -32,26 +32,6 @@
  */
 #include <asm/bitops.h>
 
-/*
- * Provide __deprecated wrappers for the new interface, avoid flag day changes.
- * We need the ugly external functions to break header recursion hell.
- */
-#ifndef smp_mb__before_clear_bit
-static inline void __deprecated smp_mb__before_clear_bit(void)
-{
-	extern void __smp_mb__before_atomic(void);
-	__smp_mb__before_atomic();
-}
-#endif
-
-#ifndef smp_mb__after_clear_bit
-static inline void __deprecated smp_mb__after_clear_bit(void)
-{
-	extern void __smp_mb__after_atomic(void);
-	__smp_mb__after_atomic();
-}
-#endif
-
 #define for_each_set_bit(bit, addr, size) \
 	for ((bit) = find_first_bit((addr), (size));		\
 	     (bit) < (size);					\
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 6ff0b0b..08ed2b0 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -24,6 +24,9 @@
 #define NULL_ADDR		((block_t)0)	/* used as block_t addresses */
 #define NEW_ADDR		((block_t)-1)	/* used as block_t addresses */
 
+/* 0, 1 (node nid), and 2 (meta nid) are reserved node ids */
+#define F2FS_RESERVED_NODE_NUM		3
+
 #define F2FS_ROOT_INO(sbi)	(sbi->root_ino_num)
 #define F2FS_NODE_INO(sbi)	(sbi->node_ino_num)
 #define F2FS_META_INO(sbi)	(sbi->meta_ino_num)
@@ -87,6 +90,8 @@
 #define CP_ORPHAN_PRESENT_FLAG	0x00000002
 #define CP_UMOUNT_FLAG		0x00000001
 
+#define F2FS_CP_PACKS		2	/* # of checkpoint packs */
+
 struct f2fs_checkpoint {
 	__le64 checkpoint_ver;		/* checkpoint block version number */
 	__le64 user_block_count;	/* # of user blocks */
@@ -123,6 +128,9 @@
  */
 #define F2FS_ORPHANS_PER_BLOCK	1020
 
+#define GET_ORPHAN_BLOCKS(n)	(((n) + F2FS_ORPHANS_PER_BLOCK - 1) / \
+					F2FS_ORPHANS_PER_BLOCK)
+
 struct f2fs_orphan_block {
 	__le32 ino[F2FS_ORPHANS_PER_BLOCK];	/* inode numbers */
 	__le32 reserved;	/* reserved */
@@ -144,6 +152,7 @@
 #define F2FS_NAME_LEN		255
 #define F2FS_INLINE_XATTR_ADDRS	50	/* 200 bytes for inline xattrs */
 #define DEF_ADDRS_PER_INODE	923	/* Address Pointers in an Inode */
+#define DEF_NIDS_PER_INODE	5	/* Node IDs in an Inode */
 #define ADDRS_PER_INODE(fi)	addrs_per_inode(fi)
 #define ADDRS_PER_BLOCK		1018	/* Address Pointers in a Direct Block */
 #define NIDS_PER_BLOCK		1018	/* Node IDs in an Indirect Block */
@@ -163,8 +172,9 @@
 #define MAX_INLINE_DATA		(sizeof(__le32) * (DEF_ADDRS_PER_INODE - \
 						F2FS_INLINE_XATTR_ADDRS - 1))
 
-#define INLINE_DATA_OFFSET	(PAGE_CACHE_SIZE - sizeof(struct node_footer) \
-			- sizeof(__le32) * (DEF_ADDRS_PER_INODE + 5 - 1))
+#define INLINE_DATA_OFFSET	(PAGE_CACHE_SIZE - sizeof(struct node_footer) -\
+				sizeof(__le32) * (DEF_ADDRS_PER_INODE + \
+				DEF_NIDS_PER_INODE - 1))
 
 struct f2fs_inode {
 	__le16 i_mode;			/* file mode */
@@ -194,7 +204,7 @@
 
 	__le32 i_addr[DEF_ADDRS_PER_INODE];	/* Pointers to data blocks */
 
-	__le32 i_nid[5];		/* direct(2), indirect(2),
+	__le32 i_nid[DEF_NIDS_PER_INODE];	/* direct(2), indirect(2),
 						double_indirect(1) node id */
 } __packed;
 
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index c7e17de..12f146f 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -38,60 +38,32 @@
 struct gpio_desc *__must_check __gpiod_get(struct device *dev,
 					 const char *con_id,
 					 enum gpiod_flags flags);
-#define __gpiod_get(dev, con_id, flags, ...) __gpiod_get(dev, con_id, flags)
-#define gpiod_get(varargs...) __gpiod_get(varargs, 0)
 struct gpio_desc *__must_check __gpiod_get_index(struct device *dev,
 					       const char *con_id,
 					       unsigned int idx,
 					       enum gpiod_flags flags);
-#define __gpiod_get_index(dev, con_id, index, flags, ...)		\
-	__gpiod_get_index(dev, con_id, index, flags)
-#define gpiod_get_index(varargs...) __gpiod_get_index(varargs, 0)
 struct gpio_desc *__must_check __gpiod_get_optional(struct device *dev,
 						  const char *con_id,
 						  enum gpiod_flags flags);
-#define __gpiod_get_optional(dev, con_id, flags, ...)			\
-	__gpiod_get_optional(dev, con_id, flags)
-#define gpiod_get_optional(varargs...) __gpiod_get_optional(varargs, 0)
 struct gpio_desc *__must_check __gpiod_get_index_optional(struct device *dev,
 							const char *con_id,
 							unsigned int index,
 							enum gpiod_flags flags);
-#define __gpiod_get_index_optional(dev, con_id, index, flags, ...)	\
-	__gpiod_get_index_optional(dev, con_id, index, flags)
-#define gpiod_get_index_optional(varargs...)				\
-	__gpiod_get_index_optional(varargs, 0)
-
 void gpiod_put(struct gpio_desc *desc);
 
 struct gpio_desc *__must_check __devm_gpiod_get(struct device *dev,
 					      const char *con_id,
 					      enum gpiod_flags flags);
-#define __devm_gpiod_get(dev, con_id, flags, ...)			\
-	__devm_gpiod_get(dev, con_id, flags)
-#define devm_gpiod_get(varargs...) __devm_gpiod_get(varargs, 0)
 struct gpio_desc *__must_check __devm_gpiod_get_index(struct device *dev,
 						    const char *con_id,
 						    unsigned int idx,
 						    enum gpiod_flags flags);
-#define __devm_gpiod_get_index(dev, con_id, index, flags, ...)		\
-	__devm_gpiod_get_index(dev, con_id, index, flags)
-#define devm_gpiod_get_index(varargs...) __devm_gpiod_get_index(varargs, 0)
 struct gpio_desc *__must_check __devm_gpiod_get_optional(struct device *dev,
 						       const char *con_id,
 						       enum gpiod_flags flags);
-#define __devm_gpiod_get_optional(dev, con_id, flags, ...)		\
-	__devm_gpiod_get_optional(dev, con_id, flags)
-#define devm_gpiod_get_optional(varargs...)				\
-	__devm_gpiod_get_optional(varargs, 0)
 struct gpio_desc *__must_check
 __devm_gpiod_get_index_optional(struct device *dev, const char *con_id,
 			      unsigned int index, enum gpiod_flags flags);
-#define __devm_gpiod_get_index_optional(dev, con_id, index, flags, ...)	\
-	__devm_gpiod_get_index_optional(dev, con_id, index, flags)
-#define devm_gpiod_get_index_optional(varargs...)			\
-	__devm_gpiod_get_index_optional(varargs, 0)
-
 void devm_gpiod_put(struct device *dev, struct gpio_desc *desc);
 
 int gpiod_get_direction(const struct gpio_desc *desc);
@@ -124,27 +96,31 @@
 
 #else /* CONFIG_GPIOLIB */
 
-static inline struct gpio_desc *__must_check gpiod_get(struct device *dev,
-						       const char *con_id)
+static inline struct gpio_desc *__must_check __gpiod_get(struct device *dev,
+						const char *con_id,
+						enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
-static inline struct gpio_desc *__must_check gpiod_get_index(struct device *dev,
-							     const char *con_id,
-							     unsigned int idx)
+static inline struct gpio_desc *__must_check
+__gpiod_get_index(struct device *dev,
+		  const char *con_id,
+		  unsigned int idx,
+		  enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
 
 static inline struct gpio_desc *__must_check
-gpiod_get_optional(struct device *dev, const char *con_id)
+__gpiod_get_optional(struct device *dev, const char *con_id,
+		     enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
 
 static inline struct gpio_desc *__must_check
-gpiod_get_index_optional(struct device *dev, const char *con_id,
-			 unsigned int index)
+__gpiod_get_index_optional(struct device *dev, const char *con_id,
+			   unsigned int index, enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
@@ -157,28 +133,33 @@
 	WARN_ON(1);
 }
 
-static inline struct gpio_desc *__must_check devm_gpiod_get(struct device *dev,
-							    const char *con_id)
+static inline struct gpio_desc *__must_check
+__devm_gpiod_get(struct device *dev,
+		 const char *con_id,
+		 enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
 static inline
-struct gpio_desc *__must_check devm_gpiod_get_index(struct device *dev,
-						    const char *con_id,
-						    unsigned int idx)
+struct gpio_desc *__must_check
+__devm_gpiod_get_index(struct device *dev,
+		       const char *con_id,
+		       unsigned int idx,
+		       enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
 
 static inline struct gpio_desc *__must_check
-devm_gpiod_get_optional(struct device *dev, const char *con_id)
+__devm_gpiod_get_optional(struct device *dev, const char *con_id,
+			  enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
 
 static inline struct gpio_desc *__must_check
-devm_gpiod_get_index_optional(struct device *dev, const char *con_id,
-			      unsigned int index)
+__devm_gpiod_get_index_optional(struct device *dev, const char *con_id,
+				unsigned int index, enum gpiod_flags flags)
 {
 	return ERR_PTR(-ENOSYS);
 }
@@ -303,9 +284,43 @@
 	return -EINVAL;
 }
 
-
 #endif /* CONFIG_GPIOLIB */
 
+/*
+ * Vararg hacks! These exist to transition the kernel to always passing
+ * the optional flags argument to the functions below. During the
+ * transition phase these vararg macros make both old- and new-style
+ * code compile, but once all calls to the older API are removed, they
+ * should go away and the __gpiod_get() etc. functions above should be
+ * renamed to just gpiod_get() etc.
+ */
+#define __gpiod_get(dev, con_id, flags, ...) __gpiod_get(dev, con_id, flags)
+#define gpiod_get(varargs...) __gpiod_get(varargs, 0)
+#define __gpiod_get_index(dev, con_id, index, flags, ...)		\
+	__gpiod_get_index(dev, con_id, index, flags)
+#define gpiod_get_index(varargs...) __gpiod_get_index(varargs, 0)
+#define __gpiod_get_optional(dev, con_id, flags, ...)			\
+	__gpiod_get_optional(dev, con_id, flags)
+#define gpiod_get_optional(varargs...) __gpiod_get_optional(varargs, 0)
+#define __gpiod_get_index_optional(dev, con_id, index, flags, ...)	\
+	__gpiod_get_index_optional(dev, con_id, index, flags)
+#define gpiod_get_index_optional(varargs...)				\
+	__gpiod_get_index_optional(varargs, 0)
+#define __devm_gpiod_get(dev, con_id, flags, ...)			\
+	__devm_gpiod_get(dev, con_id, flags)
+#define devm_gpiod_get(varargs...) __devm_gpiod_get(varargs, 0)
+#define __devm_gpiod_get_index(dev, con_id, index, flags, ...)		\
+	__devm_gpiod_get_index(dev, con_id, index, flags)
+#define devm_gpiod_get_index(varargs...) __devm_gpiod_get_index(varargs, 0)
+#define __devm_gpiod_get_optional(dev, con_id, flags, ...)		\
+	__devm_gpiod_get_optional(dev, con_id, flags)
+#define devm_gpiod_get_optional(varargs...)				\
+	__devm_gpiod_get_optional(varargs, 0)
+#define __devm_gpiod_get_index_optional(dev, con_id, index, flags, ...)	\
+	__devm_gpiod_get_index_optional(dev, con_id, index, flags)
+#define devm_gpiod_get_index_optional(varargs...)			\
+	__devm_gpiod_get_index_optional(varargs, 0)
+
 #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_GPIO_SYSFS)
 
 int gpiod_export(struct gpio_desc *desc, bool direction_may_change);
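
The macro block above is the classic C substitute for a default argument: the outer variadic macro appends a trailing 0, and the inner function-like macro (whose expansion is not re-scanned for its own name) silently drops everything past the flags slot. A runnable miniature with illustrative names, using the same GCC named-varargs extension the kernel relies on:

#include <stdio.h>

static void __get(const char *id, int flags)
{
	printf("%s: flags=%d\n", id, flags);
}

/* inner macro truncates extras; outer macro supplies the default 0 */
#define __get(id, flags, ...)	__get(id, flags)
#define get(varargs...)		__get(varargs, 0)

int main(void)
{
	get("old-style");	/* expands to __get("old-style", 0) */
	get("new-style", 7);	/* explicit flags win */
	return 0;
}
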
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 784304b..98f923b6 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -8,28 +8,28 @@
  * Copyright (C) 2011-2012 Peter Zijlstra <pzijlstr@redhat.com>
  *
  * Jump labels provide an interface to generate dynamic branches using
- * self-modifying code. Assuming toolchain and architecture support the result
- * of a "if (static_key_false(&key))" statement is a unconditional branch (which
+ * self-modifying code. Assuming toolchain and architecture support, the result
+ * of an "if (static_key_false(&key))" statement is an unconditional branch (which
  * defaults to false - and the true block is placed out of line).
  *
  * However at runtime we can change the branch target using
  * static_key_slow_{inc,dec}(). These function as a 'reference' count on the key
- * object and for as long as there are references all branches referring to
+ * object, and for as long as there are references all branches referring to
  * that particular key will point to the (out of line) true block.
  *
- * Since this relies on modifying code the static_key_slow_{inc,dec}() functions
+ * Since this relies on modifying code, the static_key_slow_{inc,dec}() functions
  * must be considered absolute slow paths (machine wide synchronization etc.).
- * OTOH, since the affected branches are unconditional their runtime overhead
+ * OTOH, since the affected branches are unconditional, their runtime overhead
  * will be absolutely minimal, esp. in the default (off) case where the total
  * effect is a single NOP of appropriate size. The on case will patch in a jump
  * to the out-of-line block.
  *
- * When the control is directly exposed to userspace it is prudent to delay the
+ * When the control is directly exposed to userspace, it is prudent to delay the
  * decrement to avoid high frequency code modifications which can (and do)
  * cause significant performance degradation. Struct static_key_deferred and
  * static_key_slow_dec_deferred() provide for this.
  *
- * Lacking toolchain and or architecture support, it falls back to a simple
+ * Lacking toolchain and/or architecture support, jump labels fall back to a simple
  * conditional branch.
  *
  * struct static_key my_key = STATIC_KEY_INIT_TRUE;
@@ -43,8 +43,7 @@
  *
  * Not initializing the key (static data is initialized to 0s anyway) is the
  * same as using STATIC_KEY_INIT_FALSE.
- *
-*/
+ */
 
 #include <linux/types.h>
 #include <linux/compiler.h>
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 6a599dc..e436864 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -15,6 +15,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/rwsem.h>
+#include <linux/timer.h>
 #include <linux/workqueue.h>
 
 struct device;
@@ -68,7 +69,7 @@
 	const char		*default_trigger;	/* Trigger to use */
 
 	unsigned long		 blink_delay_on, blink_delay_off;
-	struct delayed_work	 blink_work;
+	struct timer_list	 blink_timer;
 	int			 blink_brightness;
 
 	struct work_struct	set_brightness_work;
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 008388f9..b5a84b62 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -4,7 +4,7 @@
  *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
- * see Documentation/lockdep-design.txt for more details.
+ * see Documentation/locking/lockdep-design.txt for more details.
  */
 #ifndef __LINUX_LOCKDEP_H
 #define __LINUX_LOCKDEP_H
@@ -478,16 +478,24 @@
  * on the per lock-class debug mode:
  */
 
+/*
+ * Read states in the 2-bit held_lock:read field:
+ *  0: Exclusive lock
+ *  1: Shareable lock, cannot be recursively called
+ *  2: Shareable lock, can be recursively called
+ *  3: Shareable lock, cannot be recursively called except in interrupt context
+ */
 #define lock_acquire_exclusive(l, s, t, n, i)		lock_acquire(l, s, t, 0, 1, n, i)
 #define lock_acquire_shared(l, s, t, n, i)		lock_acquire(l, s, t, 1, 1, n, i)
 #define lock_acquire_shared_recursive(l, s, t, n, i)	lock_acquire(l, s, t, 2, 1, n, i)
+#define lock_acquire_shared_irecursive(l, s, t, n, i)	lock_acquire(l, s, t, 3, 1, n, i)
 
 #define spin_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
 #define spin_acquire_nest(l, s, t, n, i)	lock_acquire_exclusive(l, s, t, n, i)
 #define spin_release(l, n, i)			lock_release(l, n, i)
 
 #define rwlock_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
-#define rwlock_acquire_read(l, s, t, i)		lock_acquire_shared_recursive(l, s, t, NULL, i)
+#define rwlock_acquire_read(l, s, t, i)		lock_acquire_shared_irecursive(l, s, t, NULL, i)
 #define rwlock_release(l, n, i)			lock_release(l, n, i)
 
 #define seqcount_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 071f6b2..511c6e0 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1196,6 +1196,9 @@
 				  enum mlx4_net_trans_rule_id id);
 int mlx4_hw_rule_sz(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id);
 
+int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr,
+			  int port, int qpn, u16 prio, u64 *reg_id);
+
 void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port,
 			  int i, int val);
 
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 3083c53..c300db3 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -949,7 +949,7 @@
 		: 0;
 }
 
-/**
+/*
  * struct nand_sdr_timings - SDR NAND chip timings
  *
  * This struct defines the timing requirements of a SDR NAND chip.
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 8d5535c..cc31498 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -52,7 +52,7 @@
 	atomic_t		count;
 	spinlock_t		wait_lock;
 	struct list_head	wait_list;
-#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
+#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
 	struct task_struct	*owner;
 #endif
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
@@ -133,7 +133,7 @@
 
 /*
  * See kernel/locking/mutex.c for detailed documentation of these APIs.
- * Also see Documentation/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.txt.
  */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3837739..c8e388e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3176,7 +3176,7 @@
 }
 
 /**
- *  __dev_uc_unsync - Remove synchonized addresses from device
+ *  __dev_uc_unsync - Remove synchronized addresses from device
  *  @dev:  device to sync
  *  @unsync: function to call if address should be removed
  *
@@ -3220,7 +3220,7 @@
 }
 
 /**
- *  __dev_mc_unsync - Remove synchonized addresses from device
+ *  __dev_mc_unsync - Remove synchronized addresses from device
  *  @dev:  device to sync
  *  @unsync: function to call if address should be removed
  *
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 2077489..2517ece 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -9,6 +9,7 @@
 #include <linux/in6.h>
 #include <linux/wait.h>
 #include <linux/list.h>
+#include <linux/static_key.h>
 #include <uapi/linux/netfilter.h>
 #ifdef CONFIG_NETFILTER
 static inline int NF_DROP_GETERR(int verdict)
@@ -99,9 +100,9 @@
 
 extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 
-#if defined(CONFIG_JUMP_LABEL)
-#include <linux/static_key.h>
+#ifdef HAVE_JUMP_LABEL
 extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
+
 static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook)
 {
 	if (__builtin_constant_p(pf) &&
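
HAVE_JUMP_LABEL (from <linux/jump_label.h>) is the right guard here: CONFIG_JUMP_LABEL only says the option is enabled, while HAVE_JUMP_LABEL additionally means the toolchain supports asm goto, which is what the static-key fast path needs. A hedged sketch of the general pattern these keys enable (the my_feature_* names are illustrative):

#include <linux/static_key.h>

static struct static_key my_feature_key = STATIC_KEY_INIT_FALSE;

/* Fast path: with HAVE_JUMP_LABEL this compiles to a patchable no-op,
 * falling back to an atomic read otherwise. */
static inline bool my_feature_enabled(void)
{
	return static_key_false(&my_feature_key);
}

/* Slow path: flipped when a hook is (un)registered, just as
 * nf_hooks_needed[][] is bumped when netfilter hooks come and go. */
void my_feature_register(void)   { static_key_slow_inc(&my_feature_key); }
void my_feature_unregister(void) { static_key_slow_dec(&my_feature_key); }
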
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 707617a..893a0d0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -52,6 +52,7 @@
 #include <linux/atomic.h>
 #include <linux/sysfs.h>
 #include <linux/perf_regs.h>
+#include <linux/workqueue.h>
 #include <asm/local.h>
 
 struct perf_callchain_entry {
@@ -268,6 +269,7 @@
  * enum perf_event_active_state - the states of a event
  */
 enum perf_event_active_state {
+	PERF_EVENT_STATE_EXIT		= -3,
 	PERF_EVENT_STATE_ERROR		= -2,
 	PERF_EVENT_STATE_OFF		= -1,
 	PERF_EVENT_STATE_INACTIVE	=  0,
@@ -507,6 +509,9 @@
 	int				nr_cgroups;	 /* cgroup evts */
 	int				nr_branch_stack; /* branch_stack evt */
 	struct rcu_head			rcu_head;
+
+	struct delayed_work		orphans_remove;
+	bool				orphans_remove_sched;
 };
 
 /*
@@ -604,6 +609,13 @@
 	u64				txn;
 };
 
+/* default value for data source */
+#define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
+		    PERF_MEM_S(LVL, NA)   |\
+		    PERF_MEM_S(SNOOP, NA) |\
+		    PERF_MEM_S(LOCK, NA)  |\
+		    PERF_MEM_S(TLB, NA))
+
 static inline void perf_sample_data_init(struct perf_sample_data *data,
 					 u64 addr, u64 period)
 {
@@ -616,7 +628,7 @@
 	data->regs_user.regs = NULL;
 	data->stack_user_size = 0;
 	data->weight = 0;
-	data->data_src.val = 0;
+	data->data_src.val = PERF_MEM_NA;
 	data->txn = 0;
 }
 
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 7c1d252..ebc4c76 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -60,7 +60,7 @@
 	struct mutex lock;
 	struct dev_power_governor *gov;
 	struct work_struct power_off_work;
-	char *name;
+	const char *name;
 	unsigned int in_progress;	/* Number of devices being suspended now */
 	atomic_t sd_count;	/* Number of subdomains with power "on" */
 	enum gpd_status status;	/* Current state of the domain */
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index bbe03a1..4efa1ed 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -218,6 +218,8 @@
  * @linear_min_sel: Minimal selector for starting linear mapping
  * @fixed_uV: Fixed voltage of rails.
  * @ramp_delay: Time to settle down after voltage change (unit: uV/us)
+ * @linear_ranges: A constant table of possible voltage ranges.
+ * @n_linear_ranges: Number of entries in the @linear_ranges table.
  * @volt_table: Voltage mapping table (if table based mapping)
  *
  * @vsel_reg: Register for selector when using regulator_regmap_X_voltage_
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 730e638..0b08d05 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -85,6 +85,7 @@
  *           bootloader then it will be enabled when the constraints are
  *           applied.
  * @apply_uV: Apply the voltage constraint when initialising.
+ * @ramp_disable: Disable ramp delay when initialising or when setting voltage.
  *
  * @input_uV: Input voltage for regulator when supplied by another regulator.
  *
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 035d3c5..8f498cd 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -149,7 +149,7 @@
  * static then another method for expressing nested locking is
  * the explicit definition of lock class keys and the use of
  * lockdep_set_class() at lock initialization time.
- * See Documentation/lockdep-design.txt for more details.)
+ * See Documentation/locking/lockdep-design.txt for more details.)
  */
 extern void down_read_nested(struct rw_semaphore *sem, int subclass);
 extern void down_write_nested(struct rw_semaphore *sem, int subclass);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c2c885..dd9eb48 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -645,6 +645,7 @@
 	 * Live threads maintain their own counters and add to these
 	 * in __exit_signal, except for the group leader.
 	 */
+	seqlock_t stats_lock;
 	cputime_t utime, stime, cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 5d586a4..a19ddac 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -27,19 +27,23 @@
 	struct seccomp_filter *filter;
 };
 
-extern int __secure_computing(int);
-static inline int secure_computing(int this_syscall)
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+extern int __secure_computing(void);
+static inline int secure_computing(void)
 {
 	if (unlikely(test_thread_flag(TIF_SECCOMP)))
-		return  __secure_computing(this_syscall);
+		return  __secure_computing();
 	return 0;
 }
 
-/* A wrapper for architectures supporting only SECCOMP_MODE_STRICT. */
-static inline void secure_computing_strict(int this_syscall)
-{
-	BUG_ON(secure_computing(this_syscall) != 0);
-}
+#define SECCOMP_PHASE1_OK	0
+#define SECCOMP_PHASE1_SKIP	1
+
+extern u32 seccomp_phase1(struct seccomp_data *sd);
+int seccomp_phase2(u32 phase1_result);
+#else
+extern void secure_computing_strict(int this_syscall);
+#endif
 
 extern long prctl_get_seccomp(void);
 extern long prctl_set_seccomp(unsigned long, char __user *);
@@ -56,8 +60,11 @@
 struct seccomp { };
 struct seccomp_filter { };
 
-static inline int secure_computing(int this_syscall) { return 0; }
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+static inline int secure_computing(void) { return 0; }
+#else
 static inline void secure_computing_strict(int this_syscall) { return; }
+#endif
 
 static inline long prctl_get_seccomp(void)
 {
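
The point of the split is that seccomp_phase1() covers the common verdicts (allow, or skip with errno already set) cheaply, and only the remaining cases fall through to the heavyweight seccomp_phase2(). A hedged sketch of how an arch syscall-entry path might consume the new API; arch_populate_sd() is a stand-in for however the architecture fills in seccomp_data:

static long syscall_entry_seccomp(struct pt_regs *regs)
{
	struct seccomp_data sd;
	u32 phase1;

	arch_populate_sd(&sd, regs);	/* nr, args, instruction pointer */

	phase1 = seccomp_phase1(&sd);
	if (phase1 == SECCOMP_PHASE1_OK)
		return 0;		/* run the syscall normally */
	if (phase1 == SECCOMP_PHASE1_SKIP)
		return -1;		/* skip it; regs already hold errno */

	/* Everything else (e.g. a trace verdict) takes the slow path. */
	return seccomp_phase2(phase1);
}
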
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 3f2867f..262ba4e 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -197,7 +197,13 @@
 		 _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map);	\
 	 } while (0)
 #else
-# define raw_spin_lock_nested(lock, subclass)		_raw_spin_lock(lock)
+/*
+ * Always evaluate the 'subclass' argument, otherwise the compiler
+ * warns about set-but-not-used variables when building with
+ * CONFIG_DEBUG_LOCK_ALLOC=n and W=1.
+ */
+# define raw_spin_lock_nested(lock, subclass)		\
+	_raw_spin_lock(((void)(subclass), (lock)))
 # define raw_spin_lock_nest_lock(lock, nest_lock)	_raw_spin_lock(lock)
 #endif
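
The trick above is the C comma operator: ((void)(subclass), (lock)) evaluates subclass, discards the value, and yields lock, so a caller's otherwise-unused subclass variable counts as used even in !CONFIG_DEBUG_LOCK_ALLOC builds. A standalone illustration (use_and_pass() is a made-up stand-in for the macro shape):

#include <stdio.h>

/* Same shape as the raw_spin_lock_nested() fix: evaluate 'tag',
 * throw the value away, hand 'ptr' through unchanged. */
#define use_and_pass(tag, ptr)	(((void)(tag)), (ptr))

int main(void)
{
	int data = 42;
	int tag = 7;	/* with lockdep compiled out, a variable like this
			 * would otherwise trip -Wunused-but-set-variable */
	int *p = use_and_pass(tag, &data);

	printf("%d\n", *p);	/* prints 42 */
	return 0;
}
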
 
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 0590523..9a82c7d 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -183,13 +183,8 @@
 
 extern void tick_nohz_init(void);
 extern void __tick_nohz_full_check(void);
+extern void tick_nohz_full_kick(void);
 extern void tick_nohz_full_kick_cpu(int cpu);
-
-static inline void tick_nohz_full_kick(void)
-{
-	tick_nohz_full_kick_cpu(smp_processor_id());
-}
-
 extern void tick_nohz_full_kick_all(void);
 extern void __tick_nohz_task_switch(struct task_struct *tsk);
 #else
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 6fb1ba5..034f6fc 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -280,9 +280,11 @@
  * wake_up() has to be called after changing any variable that could
  * change the result of the wait condition.
  *
- * The function returns 0 if the @timeout elapsed, or the remaining
- * jiffies (at least 1) if the @condition evaluated to %true before
- * the @timeout elapsed.
+ * Returns:
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * or the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed.
  */
 #define wait_event_timeout(wq, condition, timeout)			\
 ({									\
@@ -363,9 +365,11 @@
  * change the result of the wait condition.
  *
  * Returns:
- * 0 if the @timeout elapsed, -%ERESTARTSYS if it was interrupted by
- * a signal, or the remaining jiffies (at least 1) if the @condition
- * evaluated to %true before the @timeout elapsed.
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
+ * interrupted by a signal.
  */
 #define wait_event_interruptible_timeout(wq, condition, timeout)	\
 ({									\
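
The distinction the reworded kernel-doc draws: a return of 1 means the condition turned true essentially at the moment the timeout expired, so callers should branch on zero vs. non-zero rather than comparing against the original timeout. A hedged caller sketch (my_wq and my_flag are illustrative):

#include <linux/wait.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static bool my_flag;

static int wait_for_flag(void)
{
	long ret = wait_event_interruptible_timeout(my_wq, my_flag, HZ);

	if (ret == -ERESTARTSYS)
		return ret;		/* interrupted by a signal */
	if (ret == 0)
		return -ETIMEDOUT;	/* timed out, condition still false */

	/* ret >= 1: condition is true (1 == "true right at the timeout"). */
	return 0;
}
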
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index b5d5af3..6f884e6 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -464,6 +464,8 @@
 		HCI_AUTO_CONN_ALWAYS,
 		HCI_AUTO_CONN_LINK_LOSS,
 	} auto_connect;
+
+	struct hci_conn *conn;
 };
 
 extern struct list_head hci_dev_list;
diff --git a/include/net/netns/ieee802154_6lowpan.h b/include/net/netns/ieee802154_6lowpan.h
index e207096..8170f8d 100644
--- a/include/net/netns/ieee802154_6lowpan.h
+++ b/include/net/netns/ieee802154_6lowpan.h
@@ -16,7 +16,6 @@
 struct netns_ieee802154_lowpan {
 	struct netns_sysctl_lowpan sysctl;
 	struct netns_frags	frags;
-	int			max_dsize;
 };
 
 #endif
diff --git a/include/net/regulatory.h b/include/net/regulatory.h
index 2599924..dad7ab2 100644
--- a/include/net/regulatory.h
+++ b/include/net/regulatory.h
@@ -167,7 +167,7 @@
 struct ieee80211_regdomain {
 	struct rcu_head rcu_head;
 	u32 n_reg_rules;
-	char alpha2[2];
+	char alpha2[3];
 	enum nl80211_dfs_regions dfs_region;
 	struct ieee80211_reg_rule reg_rules[];
 };
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index f6e7397..9fbd856 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -320,6 +320,19 @@
 	return asoc ? asoc->assoc_id : 0;
 }
 
+static inline enum sctp_sstat_state
+sctp_assoc_to_state(const struct sctp_association *asoc)
+{
+	/* SCTP's uapi always had SCTP_EMPTY(=0) as a dummy state, but we
+	 * got rid of it in kernel space. Therefore SCTP_CLOSED et al.
+	 * start at 1 in user space but at 0 in kernel space.
+	 * Since we cannot break user space, and SCTP_EMPTY is exposed
+	 * there, we need to fix it up with an ugly offset so as not to
+	 * break applications. :(
+	 */
+	return asoc->state + 1;
+}
+
 /* Look up the association by its id.  */
 struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id);
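
Spelled out with illustrative enums (the names stand in for the real constants in net/sctp/constants.h and the uapi header; treat the exact spellings as assumptions):

/* kernel-internal (0-based)            uapi (keeps SCTP_EMPTY at 0) */
enum k_state { K_CLOSED = 0, K_COOKIE_WAIT, K_COOKIE_ECHOED };
enum u_state { U_EMPTY = 0, U_CLOSED, U_COOKIE_WAIT, U_COOKIE_ECHOED };

/* sctp_assoc_to_state() in effect computes: */
static inline enum u_state to_uapi(enum k_state s)
{
	return (enum u_state)(s + 1);	/* K_CLOSED -> U_CLOSED, ... */
}
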
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 7f2ab72..b9a5bd0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2165,9 +2165,7 @@
 	 */
 	if (sock_flag(sk, SOCK_RCVTSTAMP) ||
 	    (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
-	    (kt.tv64 &&
-	     (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE ||
-	      skb_shinfo(skb)->tx_flags & SKBTX_ANY_SW_TSTAMP)) ||
+	    (kt.tv64 && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
 	    (hwtstamps->hwtstamp.tv64 &&
 	     (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
 		__sock_recv_timestamp(msg, sk, skb);
diff --git a/include/net/wimax.h b/include/net/wimax.h
index e52ef53..c52b685 100644
--- a/include/net/wimax.h
+++ b/include/net/wimax.h
@@ -290,7 +290,7 @@
  *     This operation has to be synchronous, and return only when the
  *     reset is complete. In case of having had to resort to bus/cold
  *     reset implying a device disconnection, the call is allowed to
- *     return inmediately.
+ *     return immediately.
  *     NOTE: wimax_dev->mutex is NOT locked when this op is being
  *     called; however, wimax_dev->mutex_reset IS locked to ensure
  *     serialization of calls to wimax_reset().
diff --git a/include/sound/soc.h b/include/sound/soc.h
index be6ecae..c83a334 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -277,7 +277,7 @@
 	.access = SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE | \
 		  SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK, \
 	.tlv.c = (snd_soc_bytes_tlv_callback), \
-	.info = snd_soc_info_bytes_ext, \
+	.info = snd_soc_bytes_info_ext, \
 	.private_value = (unsigned long)&(struct soc_bytes_ext) \
 		{.max = xcount, .get = xhandler_get, .put = xhandler_put, } }
 #define SOC_SINGLE_XR_SX(xname, xregbase, xregcount, xnbits, \
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 1c09820..3608beb 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -107,7 +107,7 @@
  * @vec_nr:  softirq vector number
  *
  * When used in combination with the softirq_exit tracepoint
- * we can determine the softirq handler runtine.
+ * we can determine the softirq handler routine.
  */
 DEFINE_EVENT(softirq, softirq_entry,
 
@@ -121,7 +121,7 @@
  * @vec_nr:  softirq vector number
  *
  * When used in combination with the softirq_entry tracepoint
- * we can determine the softirq handler runtine.
+ * we can determine the softirq handler routine.
  */
 DEFINE_EVENT(softirq, softirq_exit,
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7dc8788..940aced 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1035,6 +1035,11 @@
 	css_get(&cgrp->self);
 }
 
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+	return css_tryget(&cgrp->self);
+}
+
 static void cgroup_put(struct cgroup *cgrp)
 {
 	css_put(&cgrp->self);
@@ -1147,7 +1152,8 @@
 	 * protection against removal.  Ensure @cgrp stays accessible and
 	 * break the active_ref protection.
 	 */
-	cgroup_get(cgrp);
+	if (!cgroup_tryget(cgrp))
+		return NULL;
 	kernfs_break_active_protection(kn);
 
 	mutex_lock(&cgroup_mutex);
@@ -3271,8 +3277,17 @@
 {
 	struct cftype *cft;
 
-	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
-		cft->flags |= __CFTYPE_NOT_ON_DFL;
+	/*
+	 * If legacy_files_on_dfl is set, we want to show the legacy files on the
+	 * dfl hierarchy but iff the target subsystem hasn't been updated
+	 * for the dfl hierarchy yet.
+	 */
+	if (!cgroup_legacy_files_on_dfl ||
+	    ss->dfl_cftypes != ss->legacy_cftypes) {
+		for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+			cft->flags |= __CFTYPE_NOT_ON_DFL;
+	}
+
 	return cgroup_add_cftypes(ss, cfts);
 }
 
@@ -4387,6 +4402,15 @@
 		/* cgroup release path */
 		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
 		cgrp->id = -1;
+
+		/*
+		 * There are two control paths which try to determine
+		 * cgroup from dentry without going through kernfs -
+		 * cgroupstats_build() and css_tryget_online_from_dir().
+		 * Both are supported by RCU-protected clearing of the
+		 * cgrp->kn->priv backpointer.
+		 */
+		RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
 	}
 
 	mutex_unlock(&cgroup_mutex);
@@ -4543,6 +4567,11 @@
 	struct cftype *base_files;
 	int ssid, ret;
 
+	/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. */
+	if (strchr(name, '\n'))
+		return -EINVAL;
+
 	parent = cgroup_kn_lock_live(parent_kn);
 	if (!parent)
 		return -ENODEV;
@@ -4820,16 +4849,6 @@
 
 	cgroup_kn_unlock(kn);
 
-	/*
-	 * There are two control paths which try to determine cgroup from
-	 * dentry without going through kernfs - cgroupstats_build() and
-	 * css_tryget_online_from_dir().  Those are supported by RCU
-	 * protecting clearing of cgrp->kn->priv backpointer, which should
-	 * happen after all files under it have been removed.
-	 */
-	if (!ret)
-		RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
-
 	cgroup_put(cgrp);
 	return ret;
 }
@@ -5416,7 +5435,7 @@
 	/*
 	 * This path doesn't originate from kernfs and @kn could already
 	 * have been or be removed at any point.  @kn->priv is RCU
-	 * protected for this access.  See cgroup_rmdir() for details.
+	 * protected for this access.  See css_release_work_fn() for details.
 	 */
 	cgrp = rcu_dereference(kn->priv);
 	if (cgrp)
diff --git a/kernel/compat.c b/kernel/compat.c
index 633394f..ebb3c36 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -226,7 +226,7 @@
 	ret = hrtimer_nanosleep_restart(restart);
 	set_fs(oldfs);
 
-	if (ret) {
+	if (ret == -ERESTART_RESTARTBLOCK) {
 		rmtp = restart->nanosleep.compat_rmtp;
 
 		if (rmtp && compat_put_timespec(&rmt, rmtp))
@@ -256,7 +256,26 @@
 				HRTIMER_MODE_REL, CLOCK_MONOTONIC);
 	set_fs(oldfs);
 
-	if (ret) {
+	/*
+	 * hrtimer_nanosleep() can only return 0 or
+	 * -ERESTART_RESTARTBLOCK here because:
+	 *
+	 * - we call it with HRTIMER_MODE_REL and therefore exclude the
+	 *   -ERESTARTNOHAND return path.
+	 *
+	 * - we supply the rmtp argument from the task stack (due to
+	 *   the necessary compat conversion), so the update cannot
+	 *   fail, which excludes the -EFAULT return path as well. If
+	 *   it fails nevertheless, we have a bigger problem and won't
+	 *   reach this place anymore.
+	 *
+	 * - if the return value is 0, we do not have to update rmtp
+	 *   because there is no remaining time.
+	 *
+	 * We check for -ERESTART_RESTARTBLOCK nevertheless, in case the
+	 * core implementation ever decides to return random nonsense.
+	 */
+	if (ret == -ERESTART_RESTARTBLOCK) {
 		struct restart_block *restart
 			= &current_thread_info()->restart_block;
 
@@ -266,7 +285,6 @@
 		if (rmtp && compat_put_timespec(&rmt, rmtp))
 			return -EFAULT;
 	}
-
 	return ret;
 }
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 97b67df..f2a88de 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -52,7 +52,7 @@
 	struct callchain_cpus_entries *entries;
 
 	entries = callchain_cpus_entries;
-	rcu_assign_pointer(callchain_cpus_entries, NULL);
+	RCU_INIT_POINTER(callchain_cpus_entries, NULL);
 	call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f9c1ed0..1212cc4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -47,6 +47,8 @@
 
 #include <asm/irq_regs.h>
 
+static struct workqueue_struct *perf_wq;
+
 struct remote_function_call {
 	struct task_struct	*p;
 	int			(*func)(void *info);
@@ -120,6 +122,13 @@
 	return data.ret;
 }
 
+#define EVENT_OWNER_KERNEL ((void *) -1)
+
+static bool is_kernel_event(struct perf_event *event)
+{
+	return event->owner == EVENT_OWNER_KERNEL;
+}
+
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 		       PERF_FLAG_FD_OUTPUT  |\
 		       PERF_FLAG_PID_CGROUP |\
@@ -1375,6 +1384,45 @@
 		perf_event__header_size(tmp);
 }
 
+/*
+ * A user event whose owner task is gone.
+ */
+static bool is_orphaned_event(struct perf_event *event)
+{
+	return event && !is_kernel_event(event) && !event->owner;
+}
+
+/*
+ * Event has a parent, but the parent's task has finished and it's
+ * alive only because of children holding a reference.
+ */
+static bool is_orphaned_child(struct perf_event *event)
+{
+	return is_orphaned_event(event->parent);
+}
+
+static void orphans_remove_work(struct work_struct *work);
+
+static void schedule_orphans_remove(struct perf_event_context *ctx)
+{
+	if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
+		return;
+
+	if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
+		get_ctx(ctx);
+		ctx->orphans_remove_sched = true;
+	}
+}
+
+static int __init perf_workqueue_init(void)
+{
+	perf_wq = create_singlethread_workqueue("perf");
+	WARN(!perf_wq, "failed to create perf workqueue\n");
+	return perf_wq ? 0 : -1;
+}
+
+core_initcall(perf_workqueue_init);
+
 static inline int
 event_filter_match(struct perf_event *event)
 {
@@ -1424,6 +1472,9 @@
 	if (event->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
 
+	if (is_orphaned_child(event))
+		schedule_orphans_remove(ctx);
+
 	perf_pmu_enable(event->pmu);
 }
 
@@ -1524,6 +1575,11 @@
 	 */
 	if (ctx->is_active) {
 		raw_spin_unlock_irq(&ctx->lock);
+		/*
+		 * Reload the task pointer; it might have been changed by
+		 * a concurrent perf_event_context_sched_out().
+		 */
+		task = ctx->task;
 		goto retry;
 	}
 
@@ -1726,6 +1782,9 @@
 	if (event->attr.exclusive)
 		cpuctx->exclusive = 1;
 
+	if (is_orphaned_child(event))
+		schedule_orphans_remove(ctx);
+
 out:
 	perf_pmu_enable(event->pmu);
 
@@ -1967,6 +2026,11 @@
 	 */
 	if (ctx->is_active) {
 		raw_spin_unlock_irq(&ctx->lock);
+		/*
+		 * Reload the task pointer; it might have been changed by
+		 * a concurrent perf_event_context_sched_out().
+		 */
+		task = ctx->task;
 		goto retry;
 	}
 
@@ -3068,6 +3132,7 @@
 	INIT_LIST_HEAD(&ctx->flexible_groups);
 	INIT_LIST_HEAD(&ctx->event_list);
 	atomic_set(&ctx->refcount, 1);
+	INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
 }
 
 static struct perf_event_context *
@@ -3313,16 +3378,12 @@
 }
 
 /*
- * Called when the last reference to the file is gone.
+ * Remove user event from the owner task.
  */
-static void put_event(struct perf_event *event)
+static void perf_remove_from_owner(struct perf_event *event)
 {
-	struct perf_event_context *ctx = event->ctx;
 	struct task_struct *owner;
 
-	if (!atomic_long_dec_and_test(&event->refcount))
-		return;
-
 	rcu_read_lock();
 	owner = ACCESS_ONCE(event->owner);
 	/*
@@ -3355,6 +3416,20 @@
 		mutex_unlock(&owner->perf_event_mutex);
 		put_task_struct(owner);
 	}
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static void put_event(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+
+	if (!atomic_long_dec_and_test(&event->refcount))
+		return;
+
+	if (!is_kernel_event(event))
+		perf_remove_from_owner(event);
 
 	WARN_ON_ONCE(ctx->parent_ctx);
 	/*
@@ -3389,6 +3464,42 @@
 	return 0;
 }
 
+/*
+ * Remove all orphaned events from the context.
+ */
+static void orphans_remove_work(struct work_struct *work)
+{
+	struct perf_event_context *ctx;
+	struct perf_event *event, *tmp;
+
+	ctx = container_of(work, struct perf_event_context,
+			   orphans_remove.work);
+
+	mutex_lock(&ctx->mutex);
+	list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
+		struct perf_event *parent_event = event->parent;
+
+		if (!is_orphaned_child(event))
+			continue;
+
+		perf_remove_from_context(event, true);
+
+		mutex_lock(&parent_event->child_mutex);
+		list_del_init(&event->child_list);
+		mutex_unlock(&parent_event->child_mutex);
+
+		free_event(event);
+		put_event(parent_event);
+	}
+
+	raw_spin_lock_irq(&ctx->lock);
+	ctx->orphans_remove_sched = false;
+	raw_spin_unlock_irq(&ctx->lock);
+	mutex_unlock(&ctx->mutex);
+
+	put_ctx(ctx);
+}
+
 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 {
 	struct perf_event *child;
@@ -3500,7 +3611,8 @@
 	 * error state (i.e. because it was pinned but it couldn't be
 	 * scheduled on to the CPU at some point).
 	 */
-	if (event->state == PERF_EVENT_STATE_ERROR)
+	if ((event->state == PERF_EVENT_STATE_ERROR) ||
+	    (event->state == PERF_EVENT_STATE_EXIT))
 		return 0;
 
 	if (count < event->read_size)
@@ -3527,7 +3639,12 @@
 {
 	struct perf_event *event = file->private_data;
 	struct ring_buffer *rb;
-	unsigned int events = POLL_HUP;
+	unsigned int events = POLLHUP;
+
+	poll_wait(file, &event->waitq, wait);
+
+	if (event->state == PERF_EVENT_STATE_EXIT)
+		return events;
 
 	/*
 	 * Pin the event->rb by taking event->mmap_mutex; otherwise
@@ -3538,9 +3655,6 @@
 	if (rb)
 		events = atomic_xchg(&rb->poll, 0);
 	mutex_unlock(&event->mmap_mutex);
-
-	poll_wait(file, &event->waitq, wait);
-
 	return events;
 }
 
@@ -5804,7 +5918,7 @@
 	if (!hlist)
 		return;
 
-	rcu_assign_pointer(swhash->swevent_hlist, NULL);
+	RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
 	kfree_rcu(hlist, rcu_head);
 }
 
@@ -7387,6 +7501,9 @@
 		goto err;
 	}
 
+	/* Mark owner so we can distinguish it from user events. */
+	event->owner = EVENT_OWNER_KERNEL;
+
 	account_event(event);
 
 	ctx = find_get_context(event->pmu, task, cpu);
@@ -7507,6 +7624,9 @@
 	if (child_event->parent) {
 		sync_child_event(child_event, child);
 		free_event(child_event);
+	} else {
+		child_event->state = PERF_EVENT_STATE_EXIT;
+		perf_event_wakeup(child_event);
 	}
 }
 
@@ -7710,7 +7830,8 @@
 	if (IS_ERR(child_event))
 		return child_event;
 
-	if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+	if (is_orphaned_event(parent_event) ||
+	    !atomic_long_inc_not_zero(&parent_event->refcount)) {
 		free_event(child_event);
 		return NULL;
 	}
diff --git a/kernel/exit.c b/kernel/exit.c
index 32c58f7..fa09b86 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -115,32 +115,33 @@
 
 		if (tsk == sig->curr_target)
 			sig->curr_target = next_thread(tsk);
-		/*
-		 * Accumulate here the counters for all threads but the
-		 * group leader as they die, so they can be added into
-		 * the process-wide totals when those are taken.
-		 * The group leader stays around as a zombie as long
-		 * as there are other threads.  When it gets reaped,
-		 * the exit.c code will add its counts into these totals.
-		 * We won't ever get here for the group leader, since it
-		 * will have been the last reference on the signal_struct.
-		 */
-		task_cputime(tsk, &utime, &stime);
-		sig->utime += utime;
-		sig->stime += stime;
-		sig->gtime += task_gtime(tsk);
-		sig->min_flt += tsk->min_flt;
-		sig->maj_flt += tsk->maj_flt;
-		sig->nvcsw += tsk->nvcsw;
-		sig->nivcsw += tsk->nivcsw;
-		sig->inblock += task_io_get_inblock(tsk);
-		sig->oublock += task_io_get_oublock(tsk);
-		task_io_accounting_add(&sig->ioac, &tsk->ioac);
-		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 	}
 
+	/*
+	 * Accumulate here the counters for all threads but the group leader
+	 * as they die, so they can be added into the process-wide totals
+	 * when those are taken.  The group leader stays around as a zombie as
+	 * long as there are other threads.  When it gets reaped, the exit.c
+	 * code will add its counts into these totals.  We won't ever get here
+	 * for the group leader, since it will have been the last reference on
+	 * the signal_struct.
+	 */
+	task_cputime(tsk, &utime, &stime);
+	write_seqlock(&sig->stats_lock);
+	sig->utime += utime;
+	sig->stime += stime;
+	sig->gtime += task_gtime(tsk);
+	sig->min_flt += tsk->min_flt;
+	sig->maj_flt += tsk->maj_flt;
+	sig->nvcsw += tsk->nvcsw;
+	sig->nivcsw += tsk->nivcsw;
+	sig->inblock += task_io_get_inblock(tsk);
+	sig->oublock += task_io_get_oublock(tsk);
+	task_io_accounting_add(&sig->ioac, &tsk->ioac);
+	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 	sig->nr_threads--;
 	__unhash_process(tsk, group_dead);
+	write_sequnlock(&sig->stats_lock);
 
 	/*
 	 * Do this under ->siglock, we can race with another thread
@@ -1043,6 +1044,7 @@
 		spin_lock_irq(&p->real_parent->sighand->siglock);
 		psig = p->real_parent->signal;
 		sig = p->signal;
+		write_seqlock(&psig->stats_lock);
 		psig->cutime += tgutime + sig->cutime;
 		psig->cstime += tgstime + sig->cstime;
 		psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1065,6 +1067,7 @@
 			psig->cmaxrss = maxrss;
 		task_io_accounting_add(&psig->ioac, &p->ioac);
 		task_io_accounting_add(&psig->ioac, &sig->ioac);
+		write_sequnlock(&psig->stats_lock);
 		spin_unlock_irq(&p->real_parent->sighand->siglock);
 	}
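
The write side above is only half the picture: the reason these counters are now under a seqlock is that readers can take a consistent snapshot of the group totals without grabbing siglock. A hedged sketch of the matching reader pattern (the function is illustrative; the real readers are the thread-group accounting paths):

#include <linux/seqlock.h>
#include <linux/sched.h>

static void read_group_cputime(struct signal_struct *sig,
			       cputime_t *utime, cputime_t *stime)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&sig->stats_lock);
		*utime = sig->utime;
		*stime = sig->stime;
	} while (read_seqretry(&sig->stats_lock, seq));	/* retry on race */
}
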
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 0cf9cdb..9387ae8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1068,6 +1068,7 @@
 	sig->curr_target = tsk;
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
+	seqlock_init(&sig->stats_lock);
 
 	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	sig->real_timer.function = it_real_fn;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a2b28a2..6223fab 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -517,6 +517,7 @@
 		chip->irq_eoi(&desc->irq_data);
 	raw_spin_unlock(&desc->lock);
 }
+EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
 
 /**
  *	handle_edge_irq - edge type IRQ handler
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 88d0d44..420ba68 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3597,6 +3597,12 @@
 	raw_local_irq_save(flags);
 	check_flags(flags);
 
+	/*
+	 * An interrupt recursive read in interrupt context can be considered
+	 * to be the same as a recursive read from checking perspective.
+	 */
+	if ((read == 3) && in_interrupt())
+		read = 2;
 	current->lockdep_recursion = 1;
 	trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
 	__lock_acquire(lock, subclass, trylock, read, check,
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 23e89c5..4d60986 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -56,9 +56,6 @@
  * If the lock has already been acquired, then this will proceed to spin
  * on this node->locked until the previous lock holder sets the node->locked
  * in mcs_spin_unlock().
- *
- * We don't inline mcs_spin_lock() so that perf can correctly account for the
- * time spent in this lock function.
  */
 static inline
 void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index ae712b2..dadbf88 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -15,7 +15,7 @@
  *    by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
  *    and Sven Dietrich.
  *
- * Also see Documentation/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.txt.
  */
 #include <linux/mutex.h>
 #include <linux/ww_mutex.h>
@@ -106,6 +106,92 @@
 EXPORT_SYMBOL(mutex_lock);
 #endif
 
+static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
+						   struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+	/*
+	 * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+	 * but released with a normal mutex_unlock in this call.
+	 *
+	 * This should never happen, always use ww_mutex_unlock.
+	 */
+	DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+	/*
+	 * Not quite done after calling ww_acquire_done() ?
+	 */
+	DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+	if (ww_ctx->contending_lock) {
+		/*
+		 * After -EDEADLK you tried to
+		 * acquire a different ww_mutex? Bad!
+		 */
+		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+		/*
+		 * You called ww_mutex_lock after receiving -EDEADLK,
+		 * but 'forgot' to unlock everything else first?
+		 */
+		DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+		ww_ctx->contending_lock = NULL;
+	}
+
+	/*
+	 * Naughty, using a different class will lead to undefined behavior!
+	 */
+	DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+	ww_ctx->acquired++;
+}
+
+/*
+ * After acquiring the lock with the fastpath, or when we lost out in the
+ * contested slowpath, set ctx and wake up any waiters so they can recheck.
+ *
+ * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
+ * as the fastpath and opportunistic spinning are disabled in that case.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock,
+			       struct ww_acquire_ctx *ctx)
+{
+	unsigned long flags;
+	struct mutex_waiter *cur;
+
+	ww_mutex_lock_acquired(lock, ctx);
+
+	lock->ctx = ctx;
+
+	/*
+	 * The lock->ctx update should be visible on all cores before
+	 * the atomic read is done, otherwise contended waiters might be
+	 * missed. The contended waiters will either see ww_ctx == NULL
+	 * and keep spinning, or they will acquire wait_lock, add themselves
+	 * to the waiter list and sleep.
+	 */
+	smp_mb(); /* ^^^ */
+
+	/*
+	 * Check if the lock is contended; if not, there is nobody to wake up.
+	 */
+	if (likely(atomic_read(&lock->base.count) == 0))
+		return;
+
+	/*
+	 * Uh oh, we raced in fastpath, wake up everyone in this case,
+	 * so they can see the new lock->ctx.
+	 */
+	spin_lock_mutex(&lock->base.wait_lock, flags);
+	list_for_each_entry(cur, &lock->base.wait_list, list) {
+		debug_mutex_wake_waiter(&lock->base, cur);
+		wake_up_process(cur->task);
+	}
+	spin_unlock_mutex(&lock->base.wait_lock, flags);
+}
+
+
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 /*
  * In order to avoid a stampede of mutex spinners from acquiring the mutex
@@ -180,6 +266,129 @@
 	 */
 	return retval;
 }
+
+/*
+ * Atomically try to take the lock when it is available
+ */
+static inline bool mutex_try_to_acquire(struct mutex *lock)
+{
+	return !mutex_is_locked(lock) &&
+		(atomic_cmpxchg(&lock->count, 1, 0) == 1);
+}
+
+/*
+ * Optimistic spinning.
+ *
+ * We try to spin for acquisition when we find that the lock owner
+ * is currently running on a (different) CPU and while we don't
+ * need to reschedule. The rationale is that if the lock owner is
+ * running, it is likely to release the lock soon.
+ *
+ * Since this needs the lock owner, and this mutex implementation
+ * doesn't track the owner atomically in the lock field, we need to
+ * track it non-atomically.
+ *
+ * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
+ * to serialize everything.
+ *
+ * The mutex spinners are queued up using MCS lock so that only one
+ * spinner can compete for the mutex. However, if mutex spinning isn't
+ * going to happen, there is no point in going through the lock/unlock
+ * overhead.
+ *
+ * Returns true when the lock was taken, otherwise false, indicating
+ * that we need to jump to the slowpath and sleep.
+ */
+static bool mutex_optimistic_spin(struct mutex *lock,
+				  struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+{
+	struct task_struct *task = current;
+
+	if (!mutex_can_spin_on_owner(lock))
+		goto done;
+
+	if (!osq_lock(&lock->osq))
+		goto done;
+
+	while (true) {
+		struct task_struct *owner;
+
+		if (use_ww_ctx && ww_ctx->acquired > 0) {
+			struct ww_mutex *ww;
+
+			ww = container_of(lock, struct ww_mutex, base);
+			/*
+			 * If ww->ctx is set the contents are undefined, only
+			 * by acquiring wait_lock there is a guarantee that
+			 * they are not invalid when reading.
+			 *
+			 * As such, when deadlock detection needs to be
+			 * performed the optimistic spinning cannot be done.
+			 */
+			if (ACCESS_ONCE(ww->ctx))
+				break;
+		}
+
+		/*
+		 * If there's an owner, wait for it to either
+		 * release the lock or go to sleep.
+		 */
+		owner = ACCESS_ONCE(lock->owner);
+		if (owner && !mutex_spin_on_owner(lock, owner))
+			break;
+
+		/* Try to acquire the mutex if it is unlocked. */
+		if (mutex_try_to_acquire(lock)) {
+			lock_acquired(&lock->dep_map, ip);
+
+			if (use_ww_ctx) {
+				struct ww_mutex *ww;
+				ww = container_of(lock, struct ww_mutex, base);
+
+				ww_mutex_set_context_fastpath(ww, ww_ctx);
+			}
+
+			mutex_set_owner(lock);
+			osq_unlock(&lock->osq);
+			return true;
+		}
+
+		/*
+		 * When there's no owner, we might have preempted between the
+		 * owner acquiring the lock and setting the owner field. If
+		 * we're an RT task, that will live-lock because we won't let
+		 * the owner complete.
+		 */
+		if (!owner && (need_resched() || rt_task(task)))
+			break;
+
+		/*
+		 * The cpu_relax() call is a compiler barrier which forces
+		 * everything in this loop to be re-loaded. We don't need
+		 * memory barriers as we'll eventually observe the right
+		 * values at the cost of a few extra spins.
+		 */
+		cpu_relax_lowlatency();
+	}
+
+	osq_unlock(&lock->osq);
+done:
+	/*
+	 * If we fell out of the spin path because of need_resched(),
+	 * reschedule now, before we try-lock the mutex. This avoids getting
+	 * scheduled out right after we obtained the mutex.
+	 */
+	if (need_resched())
+		schedule_preempt_disabled();
+
+	return false;
+}
+#else
+static bool mutex_optimistic_spin(struct mutex *lock,
+				  struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+{
+	return false;
+}
 #endif
 
 __visible __used noinline
@@ -277,91 +486,6 @@
 	return 0;
 }
 
-static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
-						   struct ww_acquire_ctx *ww_ctx)
-{
-#ifdef CONFIG_DEBUG_MUTEXES
-	/*
-	 * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
-	 * but released with a normal mutex_unlock in this call.
-	 *
-	 * This should never happen, always use ww_mutex_unlock.
-	 */
-	DEBUG_LOCKS_WARN_ON(ww->ctx);
-
-	/*
-	 * Not quite done after calling ww_acquire_done() ?
-	 */
-	DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
-
-	if (ww_ctx->contending_lock) {
-		/*
-		 * After -EDEADLK you tried to
-		 * acquire a different ww_mutex? Bad!
-		 */
-		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
-
-		/*
-		 * You called ww_mutex_lock after receiving -EDEADLK,
-		 * but 'forgot' to unlock everything else first?
-		 */
-		DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
-		ww_ctx->contending_lock = NULL;
-	}
-
-	/*
-	 * Naughty, using a different class will lead to undefined behavior!
-	 */
-	DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
-#endif
-	ww_ctx->acquired++;
-}
-
-/*
- * after acquiring lock with fastpath or when we lost out in contested
- * slowpath, set ctx and wake up any waiters so they can recheck.
- *
- * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
- * as the fastpath and opportunistic spinning are disabled in that case.
- */
-static __always_inline void
-ww_mutex_set_context_fastpath(struct ww_mutex *lock,
-			       struct ww_acquire_ctx *ctx)
-{
-	unsigned long flags;
-	struct mutex_waiter *cur;
-
-	ww_mutex_lock_acquired(lock, ctx);
-
-	lock->ctx = ctx;
-
-	/*
-	 * The lock->ctx update should be visible on all cores before
-	 * the atomic read is done, otherwise contended waiters might be
-	 * missed. The contended waiters will either see ww_ctx == NULL
-	 * and keep spinning, or it will acquire wait_lock, add itself
-	 * to waiter list and sleep.
-	 */
-	smp_mb(); /* ^^^ */
-
-	/*
-	 * Check if lock is contended, if not there is nobody to wake up
-	 */
-	if (likely(atomic_read(&lock->base.count) == 0))
-		return;
-
-	/*
-	 * Uh oh, we raced in fastpath, wake up everyone in this case,
-	 * so they can see the new lock->ctx.
-	 */
-	spin_lock_mutex(&lock->base.wait_lock, flags);
-	list_for_each_entry(cur, &lock->base.wait_list, list) {
-		debug_mutex_wake_waiter(&lock->base, cur);
-		wake_up_process(cur->task);
-	}
-	spin_unlock_mutex(&lock->base.wait_lock, flags);
-}
-
 /*
  * Lock a mutex (possibly interruptible), slowpath:
  */
@@ -378,104 +502,12 @@
 	preempt_disable();
 	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
 
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-	/*
-	 * Optimistic spinning.
-	 *
-	 * We try to spin for acquisition when we find that the lock owner
-	 * is currently running on a (different) CPU and while we don't
-	 * need to reschedule. The rationale is that if the lock owner is
-	 * running, it is likely to release the lock soon.
-	 *
-	 * Since this needs the lock owner, and this mutex implementation
-	 * doesn't track the owner atomically in the lock field, we need to
-	 * track it non-atomically.
-	 *
-	 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
-	 * to serialize everything.
-	 *
-	 * The mutex spinners are queued up using MCS lock so that only one
-	 * spinner can compete for the mutex. However, if mutex spinning isn't
-	 * going to happen, there is no point in going through the lock/unlock
-	 * overhead.
-	 */
-	if (!mutex_can_spin_on_owner(lock))
-		goto slowpath;
-
-	if (!osq_lock(&lock->osq))
-		goto slowpath;
-
-	for (;;) {
-		struct task_struct *owner;
-
-		if (use_ww_ctx && ww_ctx->acquired > 0) {
-			struct ww_mutex *ww;
-
-			ww = container_of(lock, struct ww_mutex, base);
-			/*
-			 * If ww->ctx is set the contents are undefined, only
-			 * by acquiring wait_lock there is a guarantee that
-			 * they are not invalid when reading.
-			 *
-			 * As such, when deadlock detection needs to be
-			 * performed the optimistic spinning cannot be done.
-			 */
-			if (ACCESS_ONCE(ww->ctx))
-				break;
-		}
-
-		/*
-		 * If there's an owner, wait for it to either
-		 * release the lock or go to sleep.
-		 */
-		owner = ACCESS_ONCE(lock->owner);
-		if (owner && !mutex_spin_on_owner(lock, owner))
-			break;
-
-		/* Try to acquire the mutex if it is unlocked. */
-		if (!mutex_is_locked(lock) &&
-		    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
-			lock_acquired(&lock->dep_map, ip);
-			if (use_ww_ctx) {
-				struct ww_mutex *ww;
-				ww = container_of(lock, struct ww_mutex, base);
-
-				ww_mutex_set_context_fastpath(ww, ww_ctx);
-			}
-
-			mutex_set_owner(lock);
-			osq_unlock(&lock->osq);
-			preempt_enable();
-			return 0;
-		}
-
-		/*
-		 * When there's no owner, we might have preempted between the
-		 * owner acquiring the lock and setting the owner field. If
-		 * we're an RT task that will live-lock because we won't let
-		 * the owner complete.
-		 */
-		if (!owner && (need_resched() || rt_task(task)))
-			break;
-
-		/*
-		 * The cpu_relax() call is a compiler barrier which forces
-		 * everything in this loop to be re-loaded. We don't need
-		 * memory barriers as we'll eventually observe the right
-		 * values at the cost of a few extra spins.
-		 */
-		cpu_relax_lowlatency();
+	if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
+		/* got the lock, yay! */
+		preempt_enable();
+		return 0;
 	}
-	osq_unlock(&lock->osq);
-slowpath:
-	/*
-	 * If we fell out of the spin path because of need_resched(),
-	 * reschedule now, before we try-lock the mutex. This avoids getting
-	 * scheduled out right after we obtained the mutex.
-	 */
-	if (need_resched())
-		schedule_preempt_disabled();
-#endif
+
 	spin_lock_mutex(&lock->wait_lock, flags);
 
 	/*
@@ -679,15 +711,21 @@
  * Release the lock, slowpath:
  */
 static inline void
-__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
+__mutex_unlock_common_slowpath(struct mutex *lock, int nested)
 {
-	struct mutex *lock = container_of(lock_count, struct mutex, count);
 	unsigned long flags;
 
 	/*
-	 * some architectures leave the lock unlocked in the fastpath failure
+	 * As a performance measure, release the lock before doing the other
+	 * wakeup-related duties that follow. This allows other tasks to acquire
+	 * the lock sooner, while still handling cleanups in past unlock calls.
+	 * This can be done as we do not enforce strict equivalence between the
+	 * mutex counter and wait_list.
+	 *
+	 * Some architectures leave the lock unlocked in the fastpath failure
 	 * case, others need to leave it locked. In the latter case we have to
-	 * unlock it here
+	 * unlock it here - as the lock counter is currently 0 or negative.
 	 */
 	if (__mutex_slowpath_needs_to_unlock())
 		atomic_set(&lock->count, 1);
@@ -716,7 +754,9 @@
 __visible void
 __mutex_unlock_slowpath(atomic_t *lock_count)
 {
-	__mutex_unlock_common_slowpath(lock_count, 1);
+	struct mutex *lock = container_of(lock_count, struct mutex, count);
+
+	__mutex_unlock_common_slowpath(lock, 1);
 }
 
 #ifndef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 4115fbf..5cda397 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -16,7 +16,7 @@
 #define mutex_remove_waiter(lock, waiter, ti) \
 		__list_del((waiter)->list.prev, (waiter)->list.next)
 
-#ifdef CONFIG_SMP
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 static inline void mutex_set_owner(struct mutex *lock)
 {
 	lock->owner = current;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a0ea2a1..7c98873 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,7 +8,7 @@
  *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
  *  Copyright (C) 2006 Esben Nielsen
  *
- *  See Documentation/rt-mutex-design.txt for details.
+ *  See Documentation/locking/rt-mutex-design.txt for details.
  */
 #include <linux/spinlock.h>
 #include <linux/export.h>
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 6815171..b8120ab 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -36,7 +36,7 @@
 static noinline void __down(struct semaphore *sem);
 static noinline int __down_interruptible(struct semaphore *sem);
 static noinline int __down_killable(struct semaphore *sem);
-static noinline int __down_timeout(struct semaphore *sem, long jiffies);
+static noinline int __down_timeout(struct semaphore *sem, long timeout);
 static noinline void __up(struct semaphore *sem);
 
 /**
@@ -145,14 +145,14 @@
 /**
  * down_timeout - acquire the semaphore within a specified time
  * @sem: the semaphore to be acquired
- * @jiffies: how long to wait before failing
+ * @timeout: how long to wait before failing
  *
  * Attempts to acquire the semaphore.  If no more tasks are allowed to
  * acquire the semaphore, calling this function will put the task to sleep.
  * If the semaphore is not released within the specified number of jiffies,
  * this function returns -ETIME.  It returns 0 if the semaphore was acquired.
  */
-int down_timeout(struct semaphore *sem, long jiffies)
+int down_timeout(struct semaphore *sem, long timeout)
 {
 	unsigned long flags;
 	int result = 0;
@@ -161,7 +161,7 @@
 	if (likely(sem->count > 0))
 		sem->count--;
 	else
-		result = __down_timeout(sem, jiffies);
+		result = __down_timeout(sem, timeout);
 	raw_spin_unlock_irqrestore(&sem->lock, flags);
 
 	return result;
@@ -248,9 +248,9 @@
 	return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
 }
 
-static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies)
+static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
 {
-	return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies);
+	return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
 }
 
 static noinline void __sched __up(struct semaphore *sem)
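
The parameter rename is cosmetic (a parameter named jiffies reads as if it were the global jiffies counter); the contract is unchanged: the timeout is still expressed in jiffies, and -ETIME still means the semaphore was not acquired. A hedged usage sketch (my_sem and grab_resource() are illustrative):

#include <linux/semaphore.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static DEFINE_SEMAPHORE(my_sem);

static int grab_resource(void)
{
	int ret = down_timeout(&my_sem, msecs_to_jiffies(1000));

	if (ret)		/* -ETIME: not acquired within 1s */
		return ret;

	/* ... use the resource ... */
	up(&my_sem);
	return 0;
}
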
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 5d49dca..2df883a 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -179,6 +179,7 @@
 
 #ifdef CONFIG_SUSPEND
 /* kernel/power/suspend.c */
+extern const char *pm_labels[];
 extern const char *pm_states[];
 
 extern int suspend_devices_and_enter(suspend_state_t state);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6dadb25..18c6219 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,7 +31,7 @@
 
 #include "power.h"
 
-static const char *pm_labels[] = { "mem", "standby", "freeze", };
+const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
 const char *pm_states[PM_SUSPEND_MAX];
 
 static const struct platform_suspend_ops *suspend_ops;
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 2f52492..bd91bc1 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -129,20 +129,20 @@
  * at startup time.  They're normally disabled, for faster boot and because
  * we can't know which states really work on this particular system.
  */
-static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+static const char *test_state_label __initdata;
 
 static char warn_bad_state[] __initdata =
 	KERN_WARNING "PM: can't test '%s' suspend state\n";
 
 static int __init setup_test_suspend(char *value)
 {
-	suspend_state_t i;
+	int i;
 
 	/* "=mem" ==> "mem" */
 	value++;
-	for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
-		if (!strcmp(pm_states[i], value)) {
-			test_state = i;
+	for (i = 0; pm_labels[i]; i++)
+		if (!strcmp(pm_labels[i], value)) {
+			test_state_label = pm_labels[i];
 			return 0;
 		}
 
@@ -158,13 +158,21 @@
 
 	struct rtc_device	*rtc = NULL;
 	struct device		*dev;
+	suspend_state_t test_state;
 
 	/* PM is initialized by now; is that state testable? */
-	if (test_state == PM_SUSPEND_ON)
-		goto done;
-	if (!pm_states[test_state]) {
-		printk(warn_bad_state, pm_states[test_state]);
-		goto done;
+	if (!test_state_label)
+		return 0;
+
+	for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) {
+		const char *state_label = pm_states[test_state];
+
+		if (state_label && !strcmp(test_state_label, state_label))
+			break;
+	}
+	if (test_state == PM_SUSPEND_MAX) {
+		printk(warn_bad_state, test_state_label);
+		return 0;
 	}
 
 	/* RTCs have initialized by now too ... can we use one? */
@@ -173,13 +181,12 @@
 		rtc = rtc_class_open(dev_name(dev));
 	if (!rtc) {
 		printk(warn_no_rtc);
-		goto done;
+		return 0;
 	}
 
 	/* go for it */
 	test_wakealarm(rtc, test_state);
 	rtc_class_close(rtc);
-done:
 	return 0;
 }
 late_initcall(test_suspend);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 71e64c7..6a86eb7 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -358,7 +358,7 @@
 	struct rcu_head **nocb_gp_tail;
 	long nocb_gp_count;
 	long nocb_gp_count_lazy;
-	bool nocb_leader_wake;		/* Is the nocb leader thread awake? */
+	bool nocb_leader_sleep;		/* Is the nocb leader thread asleep? */
 	struct rcu_data *nocb_next_follower;
 					/* Next follower in wakeup chain. */
 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 00dc411..a7997e2 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2074,9 +2074,9 @@
 
 	if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
 		return;
-	if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) {
+	if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
 		/* Prior xchg orders against prior callback enqueue. */
-		ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true;
+		ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
 		wake_up(&rdp_leader->nocb_wq);
 	}
 }
@@ -2253,7 +2253,7 @@
 	if (!rcu_nocb_poll) {
 		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
 		wait_event_interruptible(my_rdp->nocb_wq,
-					 ACCESS_ONCE(my_rdp->nocb_leader_wake));
+				!ACCESS_ONCE(my_rdp->nocb_leader_sleep));
 		/* Memory barrier handled by smp_mb() calls below and repoll. */
 	} else if (firsttime) {
 		firsttime = false; /* Don't drown trace log with "Poll"! */
@@ -2292,12 +2292,12 @@
 		schedule_timeout_interruptible(1);
 
 		/* Rescan in case we were a victim of memory ordering. */
-		my_rdp->nocb_leader_wake = false;
-		smp_mb();  /* Ensure _wake false before scan. */
+		my_rdp->nocb_leader_sleep = true;
+		smp_mb();  /* Ensure _sleep true before scan. */
 		for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
 			if (ACCESS_ONCE(rdp->nocb_head)) {
 				/* Found CB, so short-circuit next wait. */
-				my_rdp->nocb_leader_wake = true;
+				my_rdp->nocb_leader_sleep = false;
 				break;
 			}
 		goto wait_again;
@@ -2307,17 +2307,17 @@
 	rcu_nocb_wait_gp(my_rdp);
 
 	/*
-	 * We left ->nocb_leader_wake set to reduce cache thrashing.
-	 * We clear it now, but recheck for new callbacks while
+	 * We left ->nocb_leader_sleep unset to reduce cache thrashing.
+	 * We set it now, but recheck for new callbacks while
 	 * traversing our follower list.
 	 */
-	my_rdp->nocb_leader_wake = false;
-	smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */
+	my_rdp->nocb_leader_sleep = true;
+	smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
 
 	/* Each pass through the following loop wakes a follower, if needed. */
 	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
 		if (ACCESS_ONCE(rdp->nocb_head))
-			my_rdp->nocb_leader_wake = true; /* No need to wait. */
+			my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
 		if (!rdp->nocb_gp_head)
 			continue; /* No CBs, so no need to wake follower. */
 
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e73efba..8a2e230 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -148,11 +148,8 @@
 	if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
 		goto out;
 
-	t = p;
-	do {
+	for_each_thread(p, t)
 		sched_move_task(t);
-	} while_each_thread(p, t);
-
 out:
 	unlock_task_sighand(p, &flags);
 	autogroup_kref_put(prev);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec1a286..07d67dd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,22 +90,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
-#ifdef smp_mb__before_atomic
-void __smp_mb__before_atomic(void)
-{
-	smp_mb__before_atomic();
-}
-EXPORT_SYMBOL(__smp_mb__before_atomic);
-#endif
-
-#ifdef smp_mb__after_atomic
-void __smp_mb__after_atomic(void)
-{
-	smp_mb__after_atomic();
-}
-EXPORT_SYMBOL(__smp_mb__after_atomic);
-#endif
-
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
 	unsigned long delta;
@@ -333,9 +317,12 @@
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p)))
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
 			return rq;
 		raw_spin_unlock(&rq->lock);
+
+		while (unlikely(task_on_rq_migrating(p)))
+			cpu_relax();
 	}
 }
 
@@ -352,10 +339,13 @@
 		raw_spin_lock_irqsave(&p->pi_lock, *flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p)))
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
 			return rq;
 		raw_spin_unlock(&rq->lock);
 		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+		while (unlikely(task_on_rq_migrating(p)))
+			cpu_relax();
 	}
 }
 
@@ -449,7 +439,15 @@
 void hrtick_start(struct rq *rq, u64 delay)
 {
 	struct hrtimer *timer = &rq->hrtick_timer;
-	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
+	ktime_t time;
+	s64 delta;
+
+	/*
+	 * Don't schedule slices shorter than 10000ns, that just
+	 * Don't schedule slices shorter than 10000ns; that just
+	 * doesn't make sense and can cause a timer DoS.
+	delta = max_t(s64, delay, 10000LL);
+	time = ktime_add_ns(timer->base->get_time(), delta);
 
 	hrtimer_set_expires(timer, time);
 
@@ -1043,7 +1041,7 @@
 	 * A queue event has occurred, and we're going to schedule.  In
 	 * this case, we can save a useless back to back clock update.
 	 */
-	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
+	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
 		rq->skip_clock_update = 1;
 }
 
@@ -1088,7 +1086,7 @@
 
 static void __migrate_swap_task(struct task_struct *p, int cpu)
 {
-	if (p->on_rq) {
+	if (task_on_rq_queued(p)) {
 		struct rq *src_rq, *dst_rq;
 
 		src_rq = task_rq(p);
@@ -1214,7 +1212,7 @@
 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
 	unsigned long flags;
-	int running, on_rq;
+	int running, queued;
 	unsigned long ncsw;
 	struct rq *rq;
 
@@ -1252,7 +1250,7 @@
 		rq = task_rq_lock(p, &flags);
 		trace_sched_wait_task(p);
 		running = task_running(rq, p);
-		on_rq = p->on_rq;
+		queued = task_on_rq_queued(p);
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1284,7 +1282,7 @@
 		 * running right now), it's preempted, and we should
 		 * yield - it could be a while.
 		 */
-		if (unlikely(on_rq)) {
+		if (unlikely(queued)) {
 			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
 
 			set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1478,7 +1476,7 @@
 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
 	activate_task(rq, p, en_flags);
-	p->on_rq = 1;
+	p->on_rq = TASK_ON_RQ_QUEUED;
 
 	/* if a worker is waking up, notify workqueue */
 	if (p->flags & PF_WQ_WORKER)
@@ -1537,7 +1535,7 @@
 	int ret = 0;
 
 	rq = __task_rq_lock(p);
-	if (p->on_rq) {
+	if (task_on_rq_queued(p)) {
 		/* check_preempt_curr() may use rq clock */
 		update_rq_clock(rq);
 		ttwu_do_wakeup(rq, p, wake_flags);
@@ -1742,7 +1740,7 @@
 	if (!(p->state & TASK_NORMAL))
 		goto out;
 
-	if (!p->on_rq)
+	if (!task_on_rq_queued(p))
 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
 	ttwu_do_wakeup(rq, p, 0);
@@ -2095,7 +2093,7 @@
 	init_task_runnable_average(p);
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
-	p->on_rq = 1;
+	p->on_rq = TASK_ON_RQ_QUEUED;
 	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
@@ -2451,7 +2449,7 @@
 	 * project cycles that may never be accounted to this
 	 * thread, breaking clock_gettime().
 	 */
-	if (task_current(rq, p) && p->on_rq) {
+	if (task_current(rq, p) && task_on_rq_queued(p)) {
 		update_rq_clock(rq);
 		ns = rq_clock_task(rq) - p->se.exec_start;
 		if ((s64)ns < 0)
@@ -2497,7 +2495,7 @@
 	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
 	 * been accounted, so we're correct here as well.
 	 */
-	if (!p->on_cpu || !p->on_rq)
+	if (!p->on_cpu || !task_on_rq_queued(p))
 		return p->se.sum_exec_runtime;
 #endif
 
@@ -2801,7 +2799,7 @@
 		switch_count = &prev->nvcsw;
 	}
 
-	if (prev->on_rq || rq->skip_clock_update < 0)
+	if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
 		update_rq_clock(rq);
 
 	next = pick_next_task(rq, prev);
@@ -2966,7 +2964,7 @@
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	int oldprio, on_rq, running, enqueue_flag = 0;
+	int oldprio, queued, running, enqueue_flag = 0;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
@@ -2995,9 +2993,9 @@
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
-	on_rq = p->on_rq;
+	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
-	if (on_rq)
+	if (queued)
 		dequeue_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
@@ -3037,7 +3035,7 @@
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq)
+	if (queued)
 		enqueue_task(rq, p, enqueue_flag);
 
 	check_class_changed(rq, p, prev_class, oldprio);
@@ -3048,7 +3046,7 @@
 
 void set_user_nice(struct task_struct *p, long nice)
 {
-	int old_prio, delta, on_rq;
+	int old_prio, delta, queued;
 	unsigned long flags;
 	struct rq *rq;
 
@@ -3069,8 +3067,8 @@
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
-	on_rq = p->on_rq;
-	if (on_rq)
+	queued = task_on_rq_queued(p);
+	if (queued)
 		dequeue_task(rq, p, 0);
 
 	p->static_prio = NICE_TO_PRIO(nice);
@@ -3079,7 +3077,7 @@
 	p->prio = effective_prio(p);
 	delta = p->prio - old_prio;
 
-	if (on_rq) {
+	if (queued) {
 		enqueue_task(rq, p, 0);
 		/*
 		 * If the task increased its priority or is running and
@@ -3351,7 +3349,7 @@
 {
 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
-	int retval, oldprio, oldpolicy = -1, on_rq, running;
+	int retval, oldprio, oldpolicy = -1, queued, running;
 	int policy = attr->sched_policy;
 	unsigned long flags;
 	const struct sched_class *prev_class;
@@ -3548,9 +3546,9 @@
 		return 0;
 	}
 
-	on_rq = p->on_rq;
+	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
-	if (on_rq)
+	if (queued)
 		dequeue_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
@@ -3560,7 +3558,7 @@
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (queued) {
 		/*
 		 * We enqueue to tail when the priority of a task is
 		 * increased (user space view).
@@ -4512,7 +4510,7 @@
 		"  task                        PC stack   pid father\n");
 #endif
 	rcu_read_lock();
-	do_each_thread(g, p) {
+	for_each_process_thread(g, p) {
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
 		 * console might take a lot of time:
@@ -4520,7 +4518,7 @@
 		touch_nmi_watchdog();
 		if (!state_filter || (p->state & state_filter))
 			sched_show_task(p);
-	} while_each_thread(g, p);
+	}
 
 	touch_all_softlockup_watchdogs();
 
@@ -4575,7 +4573,7 @@
 	rcu_read_unlock();
 
 	rq->curr = rq->idle = idle;
-	idle->on_rq = 1;
+	idle->on_rq = TASK_ON_RQ_QUEUED;
 #if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
@@ -4652,7 +4650,7 @@
 		goto out;
 
 	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-	if (p->on_rq) {
+	if (task_on_rq_queued(p) || p->state == TASK_WAKING) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
 		task_rq_unlock(rq, p, &flags);
@@ -4680,20 +4678,20 @@
  */
 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
-	struct rq *rq_dest, *rq_src;
+	struct rq *rq;
 	int ret = 0;
 
 	if (unlikely(!cpu_active(dest_cpu)))
 		return ret;
 
-	rq_src = cpu_rq(src_cpu);
-	rq_dest = cpu_rq(dest_cpu);
+	rq = cpu_rq(src_cpu);
 
 	raw_spin_lock(&p->pi_lock);
-	double_rq_lock(rq_src, rq_dest);
+	raw_spin_lock(&rq->lock);
 	/* Already moved. */
 	if (task_cpu(p) != src_cpu)
 		goto done;
+
 	/* Affinity changed (again). */
 	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 		goto fail;
@@ -4702,16 +4700,23 @@
 	 * If we're not on a rq, the next wake-up will ensure we're
 	 * placed properly.
 	 */
-	if (p->on_rq) {
-		dequeue_task(rq_src, p, 0);
+	if (task_on_rq_queued(p)) {
+		dequeue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_MIGRATING;
 		set_task_cpu(p, dest_cpu);
-		enqueue_task(rq_dest, p, 0);
-		check_preempt_curr(rq_dest, p, 0);
+		raw_spin_unlock(&rq->lock);
+
+		rq = cpu_rq(dest_cpu);
+		raw_spin_lock(&rq->lock);
+		BUG_ON(task_rq(p) != rq);
+		p->on_rq = TASK_ON_RQ_QUEUED;
+		enqueue_task(rq, p, 0);
+		check_preempt_curr(rq, p, 0);
 	}
 done:
 	ret = 1;
 fail:
-	double_rq_unlock(rq_src, rq_dest);
+	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock(&p->pi_lock);
 	return ret;
 }
@@ -4743,13 +4748,13 @@
 {
 	struct rq *rq;
 	unsigned long flags;
-	bool on_rq, running;
+	bool queued, running;
 
 	rq = task_rq_lock(p, &flags);
-	on_rq = p->on_rq;
+	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 
-	if (on_rq)
+	if (queued)
 		dequeue_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
@@ -4758,7 +4763,7 @@
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq)
+	if (queued)
 		enqueue_task(rq, p, 0);
 	task_rq_unlock(rq, p, &flags);
 }
@@ -4778,6 +4783,12 @@
 	 * be on another cpu but it doesn't matter.
 	 */
 	local_irq_disable();
+	/*
+	 * We need to explicitly wake pending tasks before running
+	 * __migrate_task() such that we will not miss enforcing cpus_allowed
+	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+	 */
+	sched_ttwu_pending();
 	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
 	local_irq_enable();
 	return 0;
@@ -5746,7 +5757,7 @@
 	const struct cpumask *span = sched_domain_span(sd);
 	struct cpumask *covered = sched_domains_tmpmask;
 	struct sd_data *sdd = sd->private;
-	struct sched_domain *child;
+	struct sched_domain *sibling;
 	int i;
 
 	cpumask_clear(covered);
@@ -5757,10 +5768,10 @@
 		if (cpumask_test_cpu(i, covered))
 			continue;
 
-		child = *per_cpu_ptr(sdd->sd, i);
+		sibling = *per_cpu_ptr(sdd->sd, i);
 
 		/* See the comment near build_group_mask(). */
-		if (!cpumask_test_cpu(i, sched_domain_span(child)))
+		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
 			continue;
 
 		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
@@ -5770,10 +5781,9 @@
 			goto fail;
 
 		sg_span = sched_group_cpus(sg);
-		if (child->child) {
-			child = child->child;
-			cpumask_copy(sg_span, sched_domain_span(child));
-		} else
+		if (sibling->child)
+			cpumask_copy(sg_span, sched_domain_span(sibling->child));
+		else
 			cpumask_set_cpu(i, sg_span);
 
 		cpumask_or(covered, covered, sg_span);
@@ -7124,13 +7134,13 @@
 		.sched_policy = SCHED_NORMAL,
 	};
 	int old_prio = p->prio;
-	int on_rq;
+	int queued;
 
-	on_rq = p->on_rq;
-	if (on_rq)
+	queued = task_on_rq_queued(p);
+	if (queued)
 		dequeue_task(rq, p, 0);
 	__setscheduler(rq, p, &attr);
-	if (on_rq) {
+	if (queued) {
 		enqueue_task(rq, p, 0);
 		resched_curr(rq);
 	}
@@ -7145,7 +7155,7 @@
 	struct rq *rq;
 
 	read_lock_irqsave(&tasklist_lock, flags);
-	do_each_thread(g, p) {
+	for_each_process_thread(g, p) {
 		/*
 		 * Only normalize user tasks:
 		 */
@@ -7176,8 +7186,7 @@
 
 		__task_rq_unlock(rq);
 		raw_spin_unlock(&p->pi_lock);
-	} while_each_thread(g, p);
-
+	}
 	read_unlock_irqrestore(&tasklist_lock, flags);
 }
 
@@ -7318,16 +7327,16 @@
 void sched_move_task(struct task_struct *tsk)
 {
 	struct task_group *tg;
-	int on_rq, running;
+	int queued, running;
 	unsigned long flags;
 	struct rq *rq;
 
 	rq = task_rq_lock(tsk, &flags);
 
 	running = task_current(rq, tsk);
-	on_rq = tsk->on_rq;
+	queued = task_on_rq_queued(tsk);
 
-	if (on_rq)
+	if (queued)
 		dequeue_task(rq, tsk, 0);
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
@@ -7340,14 +7349,14 @@
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_move_group)
-		tsk->sched_class->task_move_group(tsk, on_rq);
+		tsk->sched_class->task_move_group(tsk, queued);
 	else
 #endif
 		set_task_rq(tsk, task_cpu(tsk));
 
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
-	if (on_rq)
+	if (queued)
 		enqueue_task(rq, tsk, 0);
 
 	task_rq_unlock(rq, tsk, &flags);
@@ -7365,10 +7374,10 @@
 {
 	struct task_struct *g, *p;
 
-	do_each_thread(g, p) {
+	for_each_process_thread(g, p) {
 		if (rt_task(p) && task_rq(p)->rt.tg == tg)
 			return 1;
-	} while_each_thread(g, p);
+	}
 
 	return 0;
 }
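
The scheduler core changes above replace the boolean p->on_rq with a small state machine (0 / TASK_ON_RQ_QUEUED / TASK_ON_RQ_MIGRATING) read through task_on_rq_queued(), and they repeat one idiom everywhere: remember whether the task was queued, dequeue it, mutate it, then conditionally re-enqueue. A minimal standalone C sketch of that idiom follows; the types and enqueue/dequeue bodies are simplified mocks, not the kernel's.

    /*
     * Illustrative mock, not kernel code: the tri-state on_rq field and
     * the save/dequeue/mutate/re-enqueue idiom that rt_mutex_setprio(),
     * set_user_nice(), sched_move_task() and friends all follow above.
     */
    #include <stdio.h>

    #define TASK_ON_RQ_QUEUED    1
    #define TASK_ON_RQ_MIGRATING 2

    struct task { int on_rq; int prio; };

    static int task_on_rq_queued(const struct task *p)
    {
        return p->on_rq == TASK_ON_RQ_QUEUED;
    }

    static void dequeue_task(struct task *p) { p->on_rq = 0; }
    static void enqueue_task(struct task *p) { p->on_rq = TASK_ON_RQ_QUEUED; }

    /* Re-enqueue only if the task was queued before the change. */
    static void set_prio(struct task *p, int prio)
    {
        int queued = task_on_rq_queued(p);

        if (queued)
            dequeue_task(p);
        p->prio = prio;
        if (queued)
            enqueue_task(p);
    }

    int main(void)
    {
        struct task t = { TASK_ON_RQ_QUEUED, 120 };

        set_prio(&t, 100);
        printf("prio=%d queued=%d\n", t.prio, task_on_rq_queued(&t));
        return 0;
    }
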
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 72fdf06..2b57031 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -288,24 +288,28 @@
 	struct signal_struct *sig = tsk->signal;
 	cputime_t utime, stime;
 	struct task_struct *t;
-
-	times->utime = sig->utime;
-	times->stime = sig->stime;
-	times->sum_exec_runtime = sig->sum_sched_runtime;
+	unsigned int seq, nextseq;
 
 	rcu_read_lock();
-	/* make sure we can trust tsk->thread_group list */
-	if (!likely(pid_alive(tsk)))
-		goto out;
-
-	t = tsk;
+	/* Attempt a lockless read on the first round. */
+	nextseq = 0;
 	do {
-		task_cputime(t, &utime, &stime);
-		times->utime += utime;
-		times->stime += stime;
-		times->sum_exec_runtime += task_sched_runtime(t);
-	} while_each_thread(tsk, t);
-out:
+		seq = nextseq;
+		read_seqbegin_or_lock(&sig->stats_lock, &seq);
+		times->utime = sig->utime;
+		times->stime = sig->stime;
+		times->sum_exec_runtime = sig->sum_sched_runtime;
+
+		for_each_thread(tsk, t) {
+			task_cputime(t, &utime, &stime);
+			times->utime += utime;
+			times->stime += stime;
+			times->sum_exec_runtime += task_sched_runtime(t);
+		}
+		/* If lockless access failed, take the lock. */
+		nextseq = 1;
+	} while (need_seqretry(&sig->stats_lock, seq));
+	done_seqretry(&sig->stats_lock, seq);
 	rcu_read_unlock();
 }
 
@@ -598,9 +602,12 @@
 	 * If the tick based count grows faster than the scheduler one,
 	 * the result of the scaling may go backward.
 	 * Let's enforce monotonicity.
+	 * Atomic exchange protects against concurrent cputime_adjust().
 	 */
-	prev->stime = max(prev->stime, stime);
-	prev->utime = max(prev->utime, utime);
+	while (stime > (rtime = ACCESS_ONCE(prev->stime)))
+		cmpxchg(&prev->stime, rtime, stime);
+	while (utime > (rtime = ACCESS_ONCE(prev->utime)))
+		cmpxchg(&prev->utime, rtime, utime);
 
 out:
 	*ut = prev->utime;
@@ -617,9 +624,6 @@
 	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
 
-/*
- * Must be called with siglock held.
- */
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	struct task_cputime cputime;
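
The cputime hunks combine two lock-avoidance techniques: an optimistic seqlock read via read_seqbegin_or_lock() that retries under the lock only if a writer raced, and a cmpxchg() loop that keeps prev->stime/prev->utime monotonic without the siglock. A compilable sketch of the second technique, rewritten with C11 atomics since the kernel's ACCESS_ONCE()/cmpxchg() spelling does not build standalone:

    /*
     * The lock-free monotonicity guard used on prev->stime/prev->utime
     * above. Concurrent callers can only move the published value
     * forward, never backward.
     */
    #include <stdatomic.h>
    #include <stdio.h>

    static void advance_monotonic(_Atomic unsigned long *prev,
                                  unsigned long val)
    {
        unsigned long old = atomic_load(prev);

        /* Retry while val is still ahead of the published value; a
         * failed CAS reloads old and the condition is re-checked. */
        while (val > old &&
               !atomic_compare_exchange_weak(prev, &old, val))
            ;
    }

    int main(void)
    {
        _Atomic unsigned long stime = 100;

        advance_monotonic(&stime, 150);  /* moves forward to 150 */
        advance_monotonic(&stime, 120);  /* no-op: would go backward */
        printf("%lu\n", atomic_load(&stime));
        return 0;
    }
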
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 255ce13..cc4eb89 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -530,7 +530,7 @@
 	update_rq_clock(rq);
 	dl_se->dl_throttled = 0;
 	dl_se->dl_yielded = 0;
-	if (p->on_rq) {
+	if (task_on_rq_queued(p)) {
 		enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
 		if (task_has_dl_policy(rq->curr))
 			check_preempt_curr_dl(rq, p, 0);
@@ -997,10 +997,7 @@
 #ifdef CONFIG_SCHED_HRTICK
 static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
 {
-	s64 delta = p->dl.dl_runtime - p->dl.runtime;
-
-	if (delta > 10000)
-		hrtick_start(rq, p->dl.runtime);
+	hrtick_start(rq, p->dl.runtime);
 }
 #endif
 
@@ -1030,7 +1027,7 @@
 		 * means a stop task can slip in, in which case we need to
 		 * re-start task selection.
 		 */
-		if (rq->stop && rq->stop->on_rq)
+		if (rq->stop && task_on_rq_queued(rq->stop))
 			return RETRY_TASK;
 	}
 
@@ -1257,7 +1254,8 @@
 			if (unlikely(task_rq(task) != rq ||
 				     !cpumask_test_cpu(later_rq->cpu,
 				                       &task->cpus_allowed) ||
-				     task_running(rq, task) || !task->on_rq)) {
+				     task_running(rq, task) ||
+				     !task_on_rq_queued(task))) {
 				double_unlock_balance(rq, later_rq);
 				later_rq = NULL;
 				break;
@@ -1296,7 +1294,7 @@
 	BUG_ON(task_current(rq, p));
 	BUG_ON(p->nr_cpus_allowed <= 1);
 
-	BUG_ON(!p->on_rq);
+	BUG_ON(!task_on_rq_queued(p));
 	BUG_ON(!dl_task(p));
 
 	return p;
@@ -1443,7 +1441,7 @@
 		     dl_time_before(p->dl.deadline,
 				    this_rq->dl.earliest_dl.curr))) {
 			WARN_ON(p == src_rq->curr);
-			WARN_ON(!p->on_rq);
+			WARN_ON(!task_on_rq_queued(p));
 
 			/*
 			 * Then we pull iff p has actually an earlier
@@ -1596,7 +1594,7 @@
 	if (unlikely(p->dl.dl_throttled))
 		return;
 
-	if (p->on_rq && rq->curr != p) {
+	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
 			/* Only reschedule if pushing failed */
@@ -1614,7 +1612,7 @@
 static void prio_changed_dl(struct rq *rq, struct task_struct *p,
 			    int oldprio)
 {
-	if (p->on_rq || rq->curr == p) {
+	if (task_on_rq_queued(p) || rq->curr == p) {
 #ifdef CONFIG_SMP
 		/*
 		 * This might be too much, but unfortunately
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 627b3c3..c7fe1ea0 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -160,14 +160,12 @@
 	"----------------------------------------------------\n");
 
 	read_lock_irqsave(&tasklist_lock, flags);
-
-	do_each_thread(g, p) {
+	for_each_process_thread(g, p) {
 		if (task_cpu(p) != rq_cpu)
 			continue;
 
 		print_task(m, rq, p);
-	} while_each_thread(g, p);
-
+	}
 	read_unlock_irqrestore(&tasklist_lock, flags);
 }
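
Several files in this series (core.c, debug.c above, cputime.c) convert the paired do_each_thread()/while_each_thread() macros into the single for_each_process_thread() iterator, which expands to one real loop, so break and continue behave as they read and the closing macro cannot be forgotten. A toy stand-in for the single-macro shape; the real iterator additionally walks every thread of every process under RCU or tasklist_lock:

    /* Toy illustration only; the macro and list are mocks. */
    #include <stdio.h>

    struct task { const char *comm; struct task *next; };

    #define for_each_task(head, p) \
        for ((p) = (head); (p) != NULL; (p) = (p)->next)

    int main(void)
    {
        struct task c = { "kworker", NULL };
        struct task b = { "bash", &c };
        struct task a = { "init", &b };
        struct task *p;

        for_each_task(&a, p) {
            if (p->comm[0] == 'b')
                continue;   /* skips only this iteration, as expected */
            printf("%s\n", p->comm);
        }
        return 0;
    }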
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfa3c86..be9e97b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1038,7 +1038,8 @@
  */
 static void update_numa_stats(struct numa_stats *ns, int nid)
 {
-	int cpu, cpus = 0;
+	int smt, cpu, cpus = 0;
+	unsigned long capacity;
 
 	memset(ns, 0, sizeof(*ns));
 	for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1062,8 +1063,12 @@
 	if (!cpus)
 		return;
 
-	ns->task_capacity =
-		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
+	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
+	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
+	capacity = cpus / smt; /* cores */
+
+	ns->task_capacity = min_t(unsigned, capacity,
+		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
 	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
 }
 
@@ -1206,7 +1211,7 @@
 
 	if (!cur) {
 		/* Is there capacity at our destination? */
-		if (env->src_stats.has_free_capacity &&
+		if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
 		    !env->dst_stats.has_free_capacity)
 			goto unlock;
 
@@ -1775,7 +1780,7 @@
 		list_del(&p->numa_entry);
 		grp->nr_tasks--;
 		spin_unlock_irqrestore(&grp->lock, flags);
-		rcu_assign_pointer(p->numa_group, NULL);
+		RCU_INIT_POINTER(p->numa_group, NULL);
 		put_numa_group(grp);
 	}
 
@@ -2377,6 +2382,9 @@
 	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
 	tg_contrib -= cfs_rq->tg_load_contrib;
 
+	if (!tg_contrib)
+		return;
+
 	if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
 		atomic_long_add(tg_contrib, &tg->load_avg);
 		cfs_rq->tg_load_contrib += tg_contrib;
@@ -3892,14 +3900,6 @@
 				resched_curr(rq);
 			return;
 		}
-
-		/*
-		 * Don't schedule slices shorter than 10000ns, that just
-		 * doesn't make sense. Rely on vruntime for fairness.
-		 */
-		if (rq->curr != p)
-			delta = max_t(s64, 10000LL, delta);
-
 		hrtick_start(rq, delta);
 	}
 }
@@ -4704,7 +4704,7 @@
 		return;
 
 	/*
-	 * This is possible from callers such as move_task(), in which we
+	 * This is possible from callers such as attach_tasks(), in which we
 	 * unconditionally check_preempt_curr() after an enqueue (which may have
 	 * lead to a throttle).  This both saves work and prevents false
 	 * next-buddy nomination below.
@@ -5112,27 +5112,18 @@
 	unsigned int		loop_max;
 
 	enum fbq_type		fbq_type;
+	struct list_head	tasks;
 };
 
 /*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
- */
-static void move_task(struct task_struct *p, struct lb_env *env)
-{
-	deactivate_task(env->src_rq, p, 0);
-	set_task_cpu(p, env->dst_cpu);
-	activate_task(env->dst_rq, p, 0);
-	check_preempt_curr(env->dst_rq, p, 0);
-}
-
-/*
  * Is this task likely cache-hot:
  */
 static int task_hot(struct task_struct *p, struct lb_env *env)
 {
 	s64 delta;
 
+	lockdep_assert_held(&env->src_rq->lock);
+
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
@@ -5252,6 +5243,9 @@
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
 	int tsk_cache_hot = 0;
+
+	lockdep_assert_held(&env->src_rq->lock);
+
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
@@ -5336,47 +5330,63 @@
 }
 
 /*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
- * part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
+ * detach_task() -- detach the task for the migration specified in env
  */
-static int move_one_task(struct lb_env *env)
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+	lockdep_assert_held(&env->src_rq->lock);
+
+	deactivate_task(env->src_rq, p, 0);
+	p->on_rq = TASK_ON_RQ_MIGRATING;
+	set_task_cpu(p, env->dst_cpu);
+}
+
+/*
+ * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
+ * part of active balancing operations within "domain".
+ *
+ * Returns a task if successful and NULL otherwise.
+ */
+static struct task_struct *detach_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 
+	lockdep_assert_held(&env->src_rq->lock);
+
 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
 		if (!can_migrate_task(p, env))
 			continue;
 
-		move_task(p, env);
+		detach_task(p, env);
+
 		/*
-		 * Right now, this is only the second place move_task()
-		 * is called, so we can safely collect move_task()
-		 * stats here rather than inside move_task().
+		 * Right now, this is only the second place where
+		 * lb_gained[env->idle] is updated (the other is
+		 * detach_tasks()), so we can safely collect stats
+		 * here rather than inside detach_tasks().
 		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
-		return 1;
+		return p;
 	}
-	return 0;
+	return NULL;
 }
 
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
  *
- * Called with both runqueues locked.
+ * Returns number of detached tasks if successful and 0 otherwise.
  */
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
 	unsigned long load;
-	int pulled = 0;
+	int detached = 0;
+
+	lockdep_assert_held(&env->src_rq->lock);
 
 	if (env->imbalance <= 0)
 		return 0;
@@ -5407,14 +5417,16 @@
 		if ((load / 2) > env->imbalance)
 			goto next;
 
-		move_task(p, env);
-		pulled++;
+		detach_task(p, env);
+		list_add(&p->se.group_node, &env->tasks);
+
+		detached++;
 		env->imbalance -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
 		 * NEWIDLE balancing is a source of latency, so preemptible
-		 * kernels will stop after the first task is pulled to minimize
+		 * kernels will stop after the first task is detached to minimize
 		 * the critical section.
 		 */
 		if (env->idle == CPU_NEWLY_IDLE)
@@ -5434,13 +5446,58 @@
 	}
 
 	/*
-	 * Right now, this is one of only two places move_task() is called,
-	 * so we can safely collect move_task() stats here rather than
-	 * inside move_task().
+	 * Right now, this is one of only two places this stat is
+	 * collected (the other is detach_one_task()), so we can
+	 * safely update it in bulk here rather than per task.
 	 */
-	schedstat_add(env->sd, lb_gained[env->idle], pulled);
+	schedstat_add(env->sd, lb_gained[env->idle], detached);
 
-	return pulled;
+	return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+	lockdep_assert_held(&rq->lock);
+
+	BUG_ON(task_rq(p) != rq);
+	p->on_rq = TASK_ON_RQ_QUEUED;
+	activate_task(rq, p, 0);
+	check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+	raw_spin_lock(&rq->lock);
+	attach_task(rq, p);
+	raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+	struct list_head *tasks = &env->tasks;
+	struct task_struct *p;
+
+	raw_spin_lock(&env->dst_rq->lock);
+
+	while (!list_empty(tasks)) {
+		p = list_first_entry(tasks, struct task_struct, se.group_node);
+		list_del_init(&p->se.group_node);
+
+		attach_task(env->dst_rq, p);
+	}
+
+	raw_spin_unlock(&env->dst_rq->lock);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -5559,6 +5616,13 @@
 #endif
 
 /********** Helpers for find_busiest_group ************************/
+
+enum group_type {
+	group_other = 0,
+	group_imbalanced,
+	group_overloaded,
+};
+
 /*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  */
@@ -5572,7 +5636,7 @@
 	unsigned int group_capacity_factor;
 	unsigned int idle_cpus;
 	unsigned int group_weight;
-	int group_imb; /* Is there an imbalance in the group ? */
+	enum group_type group_type;
 	int group_has_free_capacity;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
@@ -5610,6 +5674,8 @@
 		.total_capacity = 0UL,
 		.busiest_stat = {
 			.avg_load = 0UL,
+			.sum_nr_running = 0,
+			.group_type = group_other,
 		},
 	};
 }
@@ -5891,6 +5957,18 @@
 	return capacity_factor;
 }
 
+static enum group_type
+group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+{
+	if (sgs->sum_nr_running > sgs->group_capacity_factor)
+		return group_overloaded;
+
+	if (sg_imbalanced(group))
+		return group_imbalanced;
+
+	return group_other;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -5942,9 +6020,8 @@
 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
 	sgs->group_weight = group->group_weight;
-
-	sgs->group_imb = sg_imbalanced(group);
 	sgs->group_capacity_factor = sg_capacity_factor(env, group);
+	sgs->group_type = group_classify(group, sgs);
 
 	if (sgs->group_capacity_factor > sgs->sum_nr_running)
 		sgs->group_has_free_capacity = 1;
@@ -5968,13 +6045,19 @@
 				   struct sched_group *sg,
 				   struct sg_lb_stats *sgs)
 {
-	if (sgs->avg_load <= sds->busiest_stat.avg_load)
-		return false;
+	struct sg_lb_stats *busiest = &sds->busiest_stat;
 
-	if (sgs->sum_nr_running > sgs->group_capacity_factor)
+	if (sgs->group_type > busiest->group_type)
 		return true;
 
-	if (sgs->group_imb)
+	if (sgs->group_type < busiest->group_type)
+		return false;
+
+	if (sgs->avg_load <= busiest->avg_load)
+		return false;
+
+	/* This is the busiest node in its class. */
+	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return true;
 
 	/*
@@ -5982,8 +6065,7 @@
 	 * numbered CPUs in the group, therefore mark all groups
 	 * higher than ourself as busy.
 	 */
-	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
-	    env->dst_cpu < group_first_cpu(sg)) {
+	if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
 		if (!sds->busiest)
 			return true;
 
@@ -6228,7 +6310,7 @@
 	local = &sds->local_stat;
 	busiest = &sds->busiest_stat;
 
-	if (busiest->group_imb) {
+	if (busiest->group_type == group_imbalanced) {
 		/*
 		 * In the group_imb case we cannot rely on group-wide averages
 		 * to ensure cpu-load equilibrium, look at wider averages. XXX
@@ -6248,12 +6330,11 @@
 		return fix_small_imbalance(env, sds);
 	}
 
-	if (!busiest->group_imb) {
-		/*
-		 * Don't want to pull so many tasks that a group would go idle.
-		 * Except of course for the group_imb case, since then we might
-		 * have to drop below capacity to reach cpu-load equilibrium.
-		 */
+	/*
+	 * If there aren't any idle cpus, avoid creating some.
+	 */
+	if (busiest->group_type == group_overloaded &&
+	    local->group_type   == group_overloaded) {
 		load_above_capacity =
 			(busiest->sum_nr_running - busiest->group_capacity_factor);
 
@@ -6337,7 +6418,7 @@
 	 * work because they assume all things are equal, which typically
 	 * isn't true due to cpus_allowed constraints and the like.
 	 */
-	if (busiest->group_imb)
+	if (busiest->group_type == group_imbalanced)
 		goto force_balance;
 
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
@@ -6550,6 +6631,7 @@
 		.loop_break	= sched_nr_migrate_break,
 		.cpus		= cpus,
 		.fbq_type	= all,
+		.tasks		= LIST_HEAD_INIT(env.tasks),
 	};
 
 	/*
@@ -6599,16 +6681,29 @@
 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
-		local_irq_save(flags);
-		double_rq_lock(env.dst_rq, busiest);
+		raw_spin_lock_irqsave(&busiest->lock, flags);
 
 		/*
 		 * cur_ld_moved - load moved in current iteration
 		 * ld_moved     - cumulative load moved across iterations
 		 */
-		cur_ld_moved = move_tasks(&env);
-		ld_moved += cur_ld_moved;
-		double_rq_unlock(env.dst_rq, busiest);
+		cur_ld_moved = detach_tasks(&env);
+
+		/*
+		 * We've detached some tasks from busiest_rq. Every
+		 * detached task is marked TASK_ON_RQ_MIGRATING, so we
+		 * can safely drop busiest->lock and still be sure that
+		 * nobody can manipulate those tasks in parallel.
+		 * See the task_rq_lock() family for the details.
+		 */
+
+		raw_spin_unlock(&busiest->lock);
+
+		if (cur_ld_moved) {
+			attach_tasks(&env);
+			ld_moved += cur_ld_moved;
+		}
+
 		local_irq_restore(flags);
 
 		/*
@@ -6744,7 +6839,7 @@
 		 * If we've begun active balancing, start to back off. This
 		 * case may not be covered by the all_pinned logic if there
 		 * is only 1 task on the busy runqueue (because we don't call
-		 * move_tasks).
+		 * detach_tasks).
 		 */
 		if (sd->balance_interval < sd->max_interval)
 			sd->balance_interval *= 2;
@@ -6914,6 +7009,7 @@
 	int target_cpu = busiest_rq->push_cpu;
 	struct rq *target_rq = cpu_rq(target_cpu);
 	struct sched_domain *sd;
+	struct task_struct *p = NULL;
 
 	raw_spin_lock_irq(&busiest_rq->lock);
 
@@ -6933,9 +7029,6 @@
 	 */
 	BUG_ON(busiest_rq == target_rq);
 
-	/* move a task from busiest_rq to target_rq */
-	double_lock_balance(busiest_rq, target_rq);
-
 	/* Search for an sd spanning us and the target CPU. */
 	rcu_read_lock();
 	for_each_domain(target_cpu, sd) {
@@ -6956,16 +7049,22 @@
 
 		schedstat_inc(sd, alb_count);
 
-		if (move_one_task(&env))
+		p = detach_one_task(&env);
+		if (p)
 			schedstat_inc(sd, alb_pushed);
 		else
 			schedstat_inc(sd, alb_failed);
 	}
 	rcu_read_unlock();
-	double_unlock_balance(busiest_rq, target_rq);
 out_unlock:
 	busiest_rq->active_balance = 0;
-	raw_spin_unlock_irq(&busiest_rq->lock);
+	raw_spin_unlock(&busiest_rq->lock);
+
+	if (p)
+		attach_one_task(target_rq, p);
+
+	local_irq_enable();
+
 	return 0;
 }
 
@@ -7465,7 +7564,7 @@
 static void
 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (!p->se.on_rq)
+	if (!task_on_rq_queued(p))
 		return;
 
 	/*
@@ -7490,11 +7589,11 @@
 	 * switched back to the fair class the enqueue_entity(.flags=0) will
 	 * do the right thing.
 	 *
-	 * If it's on_rq, then the dequeue_entity(.flags=0) will already
-	 * have normalized the vruntime, if it's !on_rq, then only when
+	 * If it's queued, then the dequeue_entity(.flags=0) will already
+	 * have normalized the vruntime, if it's !queued, then only when
 	 * the task is sleeping will it still have non-normalized vruntime.
 	 */
-	if (!p->on_rq && p->state != TASK_RUNNING) {
+	if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
 		/*
 		 * Fix up our vruntime so that the current sleep doesn't
 		 * cause 'unlimited' sleep bonus.
@@ -7521,15 +7620,15 @@
  */
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
-	struct sched_entity *se = &p->se;
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_entity *se = &p->se;
 	/*
 	 * Since the real-depth could have been changed (only FAIR
 	 * class maintain depth value), reset depth properly.
 	 */
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif
-	if (!se->on_rq)
+	if (!task_on_rq_queued(p))
 		return;
 
 	/*
@@ -7575,7 +7674,7 @@
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int queued)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq;
@@ -7594,7 +7693,7 @@
 	 * fair sleeper stuff for the first placement, but who cares.
 	 */
 	/*
-	 * When !on_rq, vruntime of the task has usually NOT been normalized.
+	 * When !queued, vruntime of the task has usually NOT been normalized.
 	 * But there are some cases where it has already been normalized:
 	 *
 	 * - Moving a forked child which is waiting for being woken up by
@@ -7605,14 +7704,14 @@
 	 * To prevent boost or penalty in the new cfs_rq caused by delta
 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
 	 */
-	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
-		on_rq = 1;
+	if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
+		queued = 1;
 
-	if (!on_rq)
+	if (!queued)
 		se->vruntime -= cfs_rq_of(se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
-	if (!on_rq) {
+	if (!queued) {
 		cfs_rq = cfs_rq_of(se);
 		se->vruntime += cfs_rq->min_vruntime;
 #ifdef CONFIG_SMP
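
The fair.c rework is the heart of this series: move_tasks() under double_rq_lock() becomes detach_tasks() under only the source rq->lock, with detached tasks parked on env->tasks and marked TASK_ON_RQ_MIGRATING, followed by attach_tasks() under only the destination lock. A userspace sketch of the same split, with pthread mutexes standing in for rq->lock and a trivial intrusive list for env->tasks:

    /*
     * Neither lock is ever held while the other is taken, so no
     * double-lock ordering protocol is needed.
     */
    #include <pthread.h>
    #include <stdio.h>

    struct task { struct task *next; int id; };
    struct rq { pthread_mutex_t lock; struct task *head; };

    /* Phase 1: under the source lock only, unlink onto a private list. */
    static struct task *detach_all(struct rq *src)
    {
        struct task *batch;

        pthread_mutex_lock(&src->lock);
        batch = src->head;          /* conceptually TASK_ON_RQ_MIGRATING */
        src->head = NULL;
        pthread_mutex_unlock(&src->lock);
        return batch;
    }

    /* Phase 2: under the destination lock only, splice the batch in. */
    static void attach_all(struct rq *dst, struct task *batch)
    {
        pthread_mutex_lock(&dst->lock);
        while (batch) {
            struct task *p = batch;

            batch = batch->next;
            p->next = dst->head;
            dst->head = p;
        }
        pthread_mutex_unlock(&dst->lock);
    }

    int main(void)
    {
        struct task t = { NULL, 1 };
        struct rq src = { PTHREAD_MUTEX_INITIALIZER, &t };
        struct rq dst = { PTHREAD_MUTEX_INITIALIZER, NULL };

        attach_all(&dst, detach_all(&src));
        printf("moved task %d\n", dst.head->id);
        return 0;
    }
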
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca..4feac8f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1448,7 +1448,7 @@
 		 * means a dl or stop task can slip in, in which case we need
 		 * to re-start task selection.
 		 */
-		if (unlikely((rq->stop && rq->stop->on_rq) ||
+		if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
 			     rq->dl.dl_nr_running))
 			return RETRY_TASK;
 	}
@@ -1624,7 +1624,7 @@
 				     !cpumask_test_cpu(lowest_rq->cpu,
 						       tsk_cpus_allowed(task)) ||
 				     task_running(rq, task) ||
-				     !task->on_rq)) {
+				     !task_on_rq_queued(task))) {
 
 				double_unlock_balance(rq, lowest_rq);
 				lowest_rq = NULL;
@@ -1658,7 +1658,7 @@
 	BUG_ON(task_current(rq, p));
 	BUG_ON(p->nr_cpus_allowed <= 1);
 
-	BUG_ON(!p->on_rq);
+	BUG_ON(!task_on_rq_queued(p));
 	BUG_ON(!rt_task(p));
 
 	return p;
@@ -1809,7 +1809,7 @@
 		 */
 		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
 			WARN_ON(p == src_rq->curr);
-			WARN_ON(!p->on_rq);
+			WARN_ON(!task_on_rq_queued(p));
 
 			/*
 			 * There's a chance that p is higher in priority
@@ -1870,7 +1870,7 @@
 
 	BUG_ON(!rt_task(p));
 
-	if (!p->on_rq)
+	if (!task_on_rq_queued(p))
 		return;
 
 	weight = cpumask_weight(new_mask);
@@ -1936,7 +1936,7 @@
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (!p->on_rq || rq->rt.rt_nr_running)
+	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
 	if (pull_rt_task(rq))
@@ -1970,7 +1970,7 @@
 	 * If that current running task is also an RT task
 	 * then see if we can move to another run queue.
 	 */
-	if (p->on_rq && rq->curr != p) {
+	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
 		    /* Don't resched if we changed runqueues */
@@ -1989,7 +1989,7 @@
 static void
 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (!p->on_rq)
+	if (!task_on_rq_queued(p))
 		return;
 
 	if (rq->curr == p) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f..aa0f73b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -15,6 +15,10 @@
 
 struct rq;
 
+/* task_struct::on_rq states: */
+#define TASK_ON_RQ_QUEUED	1
+#define TASK_ON_RQ_MIGRATING	2
+
 extern __read_mostly int scheduler_running;
 
 extern unsigned long calc_load_update;
@@ -647,7 +651,7 @@
 #endif
 }
 
-DECLARE_PER_CPU(struct rq, runqueues);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
 #define this_rq()		(&__get_cpu_var(runqueues))
@@ -942,6 +946,15 @@
 #endif
 }
 
+static inline int task_on_rq_queued(struct task_struct *p)
+{
+	return p->on_rq == TASK_ON_RQ_QUEUED;
+}
+
+static inline int task_on_rq_migrating(struct task_struct *p)
+{
+	return p->on_rq == TASK_ON_RQ_MIGRATING;
+}
 
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0eda..67426e5 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@
 {
 	struct task_struct *stop = rq->stop;
 
-	if (!stop || !stop->on_rq)
+	if (!stop || !task_on_rq_queued(stop))
 		return NULL;
 
 	put_prev_task(rq, prev);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 44eb005..1285cb2 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -21,10 +21,11 @@
 #include <linux/slab.h>
 #include <linux/syscalls.h>
 
-/* #define SECCOMP_DEBUG 1 */
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+#include <asm/syscall.h>
+#endif
 
 #ifdef CONFIG_SECCOMP_FILTER
-#include <asm/syscall.h>
 #include <linux/filter.h>
 #include <linux/pid.h>
 #include <linux/ptrace.h>
@@ -172,10 +173,10 @@
  *
  * Returns valid seccomp BPF response codes.
  */
-static u32 seccomp_run_filters(int syscall)
+static u32 seccomp_run_filters(struct seccomp_data *sd)
 {
 	struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
-	struct seccomp_data sd;
+	struct seccomp_data sd_local;
 	u32 ret = SECCOMP_RET_ALLOW;
 
 	/* Ensure unexpected behavior doesn't result in failing open. */
@@ -185,14 +186,17 @@
 	/* Make sure cross-thread synced filter points somewhere sane. */
 	smp_read_barrier_depends();
 
-	populate_seccomp_data(&sd);
+	if (!sd) {
+		populate_seccomp_data(&sd_local);
+		sd = &sd_local;
+	}
 
 	/*
 	 * All filters in the list are evaluated and the lowest BPF return
 	 * value always takes priority (ignoring the DATA).
 	 */
 	for (; f; f = f->prev) {
-		u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd);
+		u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
 
 		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
 			ret = cur_ret;
@@ -564,11 +568,55 @@
 };
 #endif
 
-int __secure_computing(int this_syscall)
+static void __secure_computing_strict(int this_syscall)
 {
-	int exit_sig = 0;
-	int *syscall;
-	u32 ret;
+	int *syscall_whitelist = mode1_syscalls;
+#ifdef CONFIG_COMPAT
+	if (is_compat_task())
+		syscall_whitelist = mode1_syscalls_32;
+#endif
+	do {
+		if (*syscall_whitelist == this_syscall)
+			return;
+	} while (*++syscall_whitelist);
+
+#ifdef SECCOMP_DEBUG
+	dump_stack();
+#endif
+	audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
+	do_exit(SIGKILL);
+}
+
+#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+void secure_computing_strict(int this_syscall)
+{
+	int mode = current->seccomp.mode;
+
+	if (mode == 0)
+		return;
+	else if (mode == SECCOMP_MODE_STRICT)
+		__secure_computing_strict(this_syscall);
+	else
+		BUG();
+}
+#else
+int __secure_computing(void)
+{
+	u32 phase1_result = seccomp_phase1(NULL);
+
+	if (likely(phase1_result == SECCOMP_PHASE1_OK))
+		return 0;
+	else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
+		return -1;
+	else
+		return seccomp_phase2(phase1_result);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
+{
+	u32 filter_ret, action;
+	int data;
 
 	/*
 	 * Make sure that any changes to mode from another thread have
@@ -576,86 +624,128 @@
 	 */
 	rmb();
 
-	switch (current->seccomp.mode) {
-	case SECCOMP_MODE_STRICT:
-		syscall = mode1_syscalls;
-#ifdef CONFIG_COMPAT
-		if (is_compat_task())
-			syscall = mode1_syscalls_32;
-#endif
-		do {
-			if (*syscall == this_syscall)
-				return 0;
-		} while (*++syscall);
-		exit_sig = SIGKILL;
-		ret = SECCOMP_RET_KILL;
-		break;
-#ifdef CONFIG_SECCOMP_FILTER
-	case SECCOMP_MODE_FILTER: {
-		int data;
-		struct pt_regs *regs = task_pt_regs(current);
-		ret = seccomp_run_filters(this_syscall);
-		data = ret & SECCOMP_RET_DATA;
-		ret &= SECCOMP_RET_ACTION;
-		switch (ret) {
-		case SECCOMP_RET_ERRNO:
-			/* Set the low-order 16-bits as a errno. */
-			syscall_set_return_value(current, regs,
-						 -data, 0);
-			goto skip;
-		case SECCOMP_RET_TRAP:
-			/* Show the handler the original registers. */
-			syscall_rollback(current, regs);
-			/* Let the filter pass back 16 bits of data. */
-			seccomp_send_sigsys(this_syscall, data);
-			goto skip;
-		case SECCOMP_RET_TRACE:
-			/* Skip these calls if there is no tracer. */
-			if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
-				syscall_set_return_value(current, regs,
-							 -ENOSYS, 0);
-				goto skip;
-			}
-			/* Allow the BPF to provide the event message */
-			ptrace_event(PTRACE_EVENT_SECCOMP, data);
-			/*
-			 * The delivery of a fatal signal during event
-			 * notification may silently skip tracer notification.
-			 * Terminating the task now avoids executing a system
-			 * call that may not be intended.
-			 */
-			if (fatal_signal_pending(current))
-				break;
-			if (syscall_get_nr(current, regs) < 0)
-				goto skip;  /* Explicit request to skip. */
+	filter_ret = seccomp_run_filters(sd);
+	data = filter_ret & SECCOMP_RET_DATA;
+	action = filter_ret & SECCOMP_RET_ACTION;
 
-			return 0;
-		case SECCOMP_RET_ALLOW:
-			return 0;
-		case SECCOMP_RET_KILL:
-		default:
-			break;
-		}
-		exit_sig = SIGSYS;
-		break;
+	switch (action) {
+	case SECCOMP_RET_ERRNO:
+		/* Set the low-order 16 bits as an errno. */
+		syscall_set_return_value(current, task_pt_regs(current),
+					 -data, 0);
+		goto skip;
+
+	case SECCOMP_RET_TRAP:
+		/* Show the handler the original registers. */
+		syscall_rollback(current, task_pt_regs(current));
+		/* Let the filter pass back 16 bits of data. */
+		seccomp_send_sigsys(this_syscall, data);
+		goto skip;
+
+	case SECCOMP_RET_TRACE:
+		return filter_ret;  /* Save the rest for phase 2. */
+
+	case SECCOMP_RET_ALLOW:
+		return SECCOMP_PHASE1_OK;
+
+	case SECCOMP_RET_KILL:
+	default:
+		audit_seccomp(this_syscall, SIGSYS, action);
+		do_exit(SIGSYS);
 	}
+
+	unreachable();
+
+skip:
+	audit_seccomp(this_syscall, 0, action);
+	return SECCOMP_PHASE1_SKIP;
+}
+#endif
+
+/**
+ * seccomp_phase1() - run fast path seccomp checks on the current syscall
+ * @sd: The seccomp_data or NULL
+ *
+ * This only reads pt_regs via the syscall_xyz helpers.  The only change
+ * it will make to pt_regs is via syscall_set_return_value, and it will
+ * only do that if it returns SECCOMP_PHASE1_SKIP.
+ *
+ * If sd is provided, it will not read pt_regs at all.
+ *
+ * It may also call do_exit or force a signal; these actions must be
+ * safe.
+ *
+ * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
+ * be processed normally.
+ *
+ * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
+ * invoked.  In this case, seccomp_phase1 will have set the return value
+ * using syscall_set_return_value.
+ *
+ * If it returns anything else, then the return value should be passed
+ * to seccomp_phase2 from a context in which ptrace hooks are safe.
+ */
+u32 seccomp_phase1(struct seccomp_data *sd)
+{
+	int mode = current->seccomp.mode;
+	int this_syscall = sd ? sd->nr :
+		syscall_get_nr(current, task_pt_regs(current));
+
+	switch (mode) {
+	case SECCOMP_MODE_STRICT:
+		__secure_computing_strict(this_syscall);  /* may call do_exit */
+		return SECCOMP_PHASE1_OK;
+#ifdef CONFIG_SECCOMP_FILTER
+	case SECCOMP_MODE_FILTER:
+		return __seccomp_phase1_filter(this_syscall, sd);
 #endif
 	default:
 		BUG();
 	}
-
-#ifdef SECCOMP_DEBUG
-	dump_stack();
-#endif
-	audit_seccomp(this_syscall, exit_sig, ret);
-	do_exit(exit_sig);
-#ifdef CONFIG_SECCOMP_FILTER
-skip:
-	audit_seccomp(this_syscall, exit_sig, ret);
-#endif
-	return -1;
 }
 
+/**
+ * seccomp_phase2() - finish slow path seccomp work for the current syscall
+ * @phase1_result: The return value from seccomp_phase1()
+ *
+ * This must be called from a context in which ptrace hooks can be used.
+ *
+ * Returns 0 if the syscall should be processed or -1 to skip the syscall.
+ */
+int seccomp_phase2(u32 phase1_result)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	u32 action = phase1_result & SECCOMP_RET_ACTION;
+	int data = phase1_result & SECCOMP_RET_DATA;
+
+	BUG_ON(action != SECCOMP_RET_TRACE);
+
+	audit_seccomp(syscall_get_nr(current, regs), 0, action);
+
+	/* Skip these calls if there is no tracer. */
+	if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+		syscall_set_return_value(current, regs,
+					 -ENOSYS, 0);
+		return -1;
+	}
+
+	/* Allow the BPF to provide the event message */
+	ptrace_event(PTRACE_EVENT_SECCOMP, data);
+	/*
+	 * The delivery of a fatal signal during event
+	 * notification may silently skip tracer notification.
+	 * Terminating the task now avoids executing a system
+	 * call that may not be intended.
+	 */
+	if (fatal_signal_pending(current))
+		do_exit(SIGSYS);
+	if (syscall_get_nr(current, regs) < 0)
+		return -1;  /* Explicit request to skip. */
+
+	return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
+
 long prctl_get_seccomp(void)
 {
 	return current->seccomp.mode;
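
The seccomp refactor splits __secure_computing() into a fast phase 1 (the strict-mode table walk plus every filter action except TRACE) and a slow phase 2 that handles only SECCOMP_RET_TRACE, where ptrace hooks are safe to call. A standalone sketch of the dispatch shape; both phase bodies here are mocks, and only the control flow mirrors the patch:

    #include <stdio.h>

    #define SECCOMP_PHASE1_OK   0u
    #define SECCOMP_PHASE1_SKIP 1u

    /* Mock: syscall 0 allowed, 1 already handled (errno/trap), 2 traced. */
    static unsigned int seccomp_phase1(int nr)
    {
        if (nr == 0)
            return SECCOMP_PHASE1_OK;
        if (nr == 1)
            return SECCOMP_PHASE1_SKIP;
        return 0x7ff00000u | (unsigned int)nr;  /* "trace" + 16 data bits */
    }

    static int seccomp_phase2(unsigned int phase1_result)
    {
        (void)phase1_result;  /* a real phase 2 notifies the tracer here */
        return 0;             /* 0 = run the syscall, -1 = skip it */
    }

    static int secure_computing(int nr)
    {
        unsigned int r = seccomp_phase1(nr);

        if (r == SECCOMP_PHASE1_OK)
            return 0;
        if (r == SECCOMP_PHASE1_SKIP)
            return -1;
        return seccomp_phase2(r);
    }

    int main(void)
    {
        printf("%d %d %d\n", secure_computing(0), secure_computing(1),
               secure_computing(2));
        return 0;
    }
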
diff --git a/kernel/sys.c b/kernel/sys.c
index ce81291..b663664 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -862,11 +862,9 @@
 {
 	cputime_t tgutime, tgstime, cutime, cstime;
 
-	spin_lock_irq(&current->sighand->siglock);
 	thread_group_cputime_adjusted(current, &tgutime, &tgstime);
 	cutime = current->signal->cutime;
 	cstime = current->signal->cstime;
-	spin_unlock_irq(&current->sighand->siglock);
 	tms->tms_utime = cputime_to_clock_t(tgutime);
 	tms->tms_stime = cputime_to_clock_t(tgstime);
 	tms->tms_cutime = cputime_to_clock_t(cutime);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b89464..492b986 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -272,22 +272,8 @@
 		if (same_thread_group(tsk, current))
 			err = cpu_clock_sample(which_clock, tsk, &rtn);
 	} else {
-		unsigned long flags;
-		struct sighand_struct *sighand;
-
-		/*
-		 * while_each_thread() is not yet entirely RCU safe,
-		 * keep locking the group while sampling process
-		 * clock for now.
-		 */
-		sighand = lock_task_sighand(tsk, &flags);
-		if (!sighand)
-			return err;
-
 		if (tsk == current || thread_group_leader(tsk))
 			err = cpu_clock_sample_group(which_clock, tsk, &rtn);
-
-		unlock_task_sighand(tsk, &flags);
 	}
 
 	if (!err)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99aa6ee..905f3e2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -225,6 +225,20 @@
 };
 
 /*
+ * Kick this CPU if it's full dynticks in order to force it to
+ * re-evaluate its dependency on the tick and restart it if necessary.
+ * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
+ * is NMI safe.
+ */
+void tick_nohz_full_kick(void)
+{
+	if (!tick_nohz_full_cpu(smp_processor_id()))
+		return;
+
+	irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+}
+
+/*
  * Kick the CPU if it's full dynticks in order to force it to
  * re-evaluate its dependency on the tick and restart it if necessary.
  */
@@ -968,6 +982,10 @@
 	tick_sched_do_timer(now);
 	tick_sched_handle(ts, regs);
 
+	/* No need to reprogram if we are running tickless */
+	if (unlikely(ts->tick_stopped))
+		return;
+
 	while (tick_nohz_reprogram(ts, now)) {
 		now = ktime_get();
 		tick_do_update_jiffies64(now);
@@ -1095,6 +1113,10 @@
 	if (regs)
 		tick_sched_handle(ts, regs);
 
+	/* No need to reprogram if we are in idle or full dynticks mode */
+	if (unlikely(ts->tick_stopped))
+		return HRTIMER_NORESTART;
+
 	hrtimer_forward(timer, now, tick_period);
 
 	return HRTIMER_RESTART;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fb4a9c2..ec1791f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -442,11 +442,12 @@
 		tk->ntp_error = 0;
 		ntp_clear();
 	}
-	update_vsyscall(tk);
-	update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
 
 	tk_update_ktime_data(tk);
 
+	update_vsyscall(tk);
+	update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+
 	if (action & TK_MIRROR)
 		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
 		       sizeof(tk_core.timekeeper));
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a8d6914..8759d0b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -15,11 +15,6 @@
 #include <linux/cpu.h>
 #include <linux/nmi.h>
 #include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/lockdep.h>
-#include <linux/notifier.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
 #include <linux/smpboot.h>
@@ -514,7 +509,10 @@
 		/* should be in cleanup, but blocks oprofile */
 		perf_event_release_kernel(event);
 	}
-	return;
+	if (cpu == 0) {
+		/* watchdog_nmi_enable() expects this to be zero initially. */
+		cpu0_err = 0;
+	}
 }
 #else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a285900..ffc4772 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -952,7 +952,7 @@
 	 the proof of observed correctness is also maintained for an
 	 arbitrary combination of these separate locking variants.
 
-	 For more details, see Documentation/lockdep-design.txt.
+	 For more details, see Documentation/locking/lockdep-design.txt.
 
 config LOCKDEP
 	bool
@@ -973,7 +973,7 @@
 	help
 	 This feature enables tracking lock contention points
 
-	 For more details, see Documentation/lockstat.txt
+	 For more details, see Documentation/locking/lockstat.txt
 
 	 This also enables lock events required by "perf lock",
 	 subcommand of perf.
diff --git a/lib/assoc_array.c b/lib/assoc_array.c
index c0b1007..ae146f0 100644
--- a/lib/assoc_array.c
+++ b/lib/assoc_array.c
@@ -1735,7 +1735,7 @@
 gc_complete:
 	edit->set[0].to = new_root;
 	assoc_array_apply_edit(edit);
-	edit->array->nr_leaves_on_tree = nr_leaves_on_tree;
+	array->nr_leaves_on_tree = nr_leaves_on_tree;
 	return 0;
 
 enomem:
diff --git a/lib/atomic64.c b/lib/atomic64.c
index 08a4f06..1298c05e 100644
--- a/lib/atomic64.c
+++ b/lib/atomic64.c
@@ -70,53 +70,42 @@
 }
 EXPORT_SYMBOL(atomic64_set);
 
-void atomic64_add(long long a, atomic64_t *v)
-{
-	unsigned long flags;
-	raw_spinlock_t *lock = lock_addr(v);
+#define ATOMIC64_OP(op, c_op)						\
+void atomic64_##op(long long a, atomic64_t *v)				\
+{									\
+	unsigned long flags;						\
+	raw_spinlock_t *lock = lock_addr(v);				\
+									\
+	raw_spin_lock_irqsave(lock, flags);				\
+	v->counter c_op a;						\
+	raw_spin_unlock_irqrestore(lock, flags);			\
+}									\
+EXPORT_SYMBOL(atomic64_##op);
 
-	raw_spin_lock_irqsave(lock, flags);
-	v->counter += a;
-	raw_spin_unlock_irqrestore(lock, flags);
-}
-EXPORT_SYMBOL(atomic64_add);
+#define ATOMIC64_OP_RETURN(op, c_op)					\
+long long atomic64_##op##_return(long long a, atomic64_t *v)		\
+{									\
+	unsigned long flags;						\
+	raw_spinlock_t *lock = lock_addr(v);				\
+	long long val;							\
+									\
+	raw_spin_lock_irqsave(lock, flags);				\
+	val = (v->counter c_op a);					\
+	raw_spin_unlock_irqrestore(lock, flags);			\
+	return val;							\
+}									\
+EXPORT_SYMBOL(atomic64_##op##_return);
 
-long long atomic64_add_return(long long a, atomic64_t *v)
-{
-	unsigned long flags;
-	raw_spinlock_t *lock = lock_addr(v);
-	long long val;
+#define ATOMIC64_OPS(op, c_op)						\
+	ATOMIC64_OP(op, c_op)						\
+	ATOMIC64_OP_RETURN(op, c_op)
 
-	raw_spin_lock_irqsave(lock, flags);
-	val = v->counter += a;
-	raw_spin_unlock_irqrestore(lock, flags);
-	return val;
-}
-EXPORT_SYMBOL(atomic64_add_return);
+ATOMIC64_OPS(add, +=)
+ATOMIC64_OPS(sub, -=)
 
-void atomic64_sub(long long a, atomic64_t *v)
-{
-	unsigned long flags;
-	raw_spinlock_t *lock = lock_addr(v);
-
-	raw_spin_lock_irqsave(lock, flags);
-	v->counter -= a;
-	raw_spin_unlock_irqrestore(lock, flags);
-}
-EXPORT_SYMBOL(atomic64_sub);
-
-long long atomic64_sub_return(long long a, atomic64_t *v)
-{
-	unsigned long flags;
-	raw_spinlock_t *lock = lock_addr(v);
-	long long val;
-
-	raw_spin_lock_irqsave(lock, flags);
-	val = v->counter -= a;
-	raw_spin_unlock_irqrestore(lock, flags);
-	return val;
-}
-EXPORT_SYMBOL(atomic64_sub_return);
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
 
 long long atomic64_dec_if_positive(atomic64_t *v)
 {
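
lib/atomic64.c collapses four near-identical functions into ATOMIC64_OP()/ATOMIC64_OP_RETURN() templates instantiated once per operator and then #undef'd. The idiom in isolation, as a compilable mock in which one global mutex stands in for the kernel's hashed spinlock array; only the code-generation shape is the point:

    #include <pthread.h>
    #include <stdio.h>

    typedef struct { long long counter; } atomic64_t;

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    #define ATOMIC64_OP(op, c_op)                          \
    static void atomic64_##op(long long a, atomic64_t *v)  \
    {                                                      \
        pthread_mutex_lock(&lock);                         \
        v->counter c_op a;                                 \
        pthread_mutex_unlock(&lock);                       \
    }

    ATOMIC64_OP(add, +=)
    ATOMIC64_OP(sub, -=)

    #undef ATOMIC64_OP

    int main(void)
    {
        atomic64_t v = { 10 };

        atomic64_add(5, &v);
        atomic64_sub(3, &v);
        printf("%lld\n", v.counter);  /* 12 */
        return 0;
    }
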
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 872a15a..62af709 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -267,19 +267,46 @@
 #undef E
 
 /*
- * Special-case for read-locking, they are
- * allowed to recurse on the same lock class:
+ * Special case for read-locking: recursion on the same lock
+ * class is not allowed except in interrupt context:
  */
 static void rlock_AA1(void)
 {
 	RL(X1);
-	RL(X1); // this one should NOT fail
+	RL(X1); // this one should fail
 }
 
 static void rlock_AA1B(void)
 {
 	RL(X1);
-	RL(X2); // this one should NOT fail
+	RL(X2); // this one should fail
+}
+
+static void rlock_AHA1(void)
+{
+	RL(X1);
+	HARDIRQ_ENTER();
+	RL(X1);	// this one should NOT fail
+	HARDIRQ_EXIT();
+}
+
+static void rlock_AHA1B(void)
+{
+	RL(X1);
+	HARDIRQ_ENTER();
+	RL(X2);	// this one should NOT fail
+	HARDIRQ_EXIT();
+}
+
+static void rlock_ASAHA1(void)
+{
+	RL(X1);
+	SOFTIRQ_ENTER();
+	RL(X1);	// this one should NOT fail
+	HARDIRQ_ENTER();
+	RL(X1);	// this one should NOT fail
+	HARDIRQ_EXIT();
+	SOFTIRQ_EXIT();
 }
 
 static void rsem_AA1(void)
@@ -1069,7 +1096,7 @@
 	print_testname(desc);					\
 	dotest(name##_spin, FAILURE, LOCKTYPE_SPIN);		\
 	dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK);		\
-	dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK);		\
+	dotest(name##_rlock, FAILURE, LOCKTYPE_RWLOCK);		\
 	dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX);		\
 	dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM);		\
 	dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM);		\
@@ -1830,14 +1857,14 @@
 	printk("  --------------------------------------------------------------------------\n");
 	print_testname("recursive read-lock");
 	printk("             |");
-	dotest(rlock_AA1, SUCCESS, LOCKTYPE_RWLOCK);
+	dotest(rlock_AA1, FAILURE, LOCKTYPE_RWLOCK);
 	printk("             |");
 	dotest(rsem_AA1, FAILURE, LOCKTYPE_RWSEM);
 	printk("\n");
 
 	print_testname("recursive read-lock #2");
 	printk("             |");
-	dotest(rlock_AA1B, SUCCESS, LOCKTYPE_RWLOCK);
+	dotest(rlock_AA1B, FAILURE, LOCKTYPE_RWLOCK);
 	printk("             |");
 	dotest(rsem_AA1B, FAILURE, LOCKTYPE_RWSEM);
 	printk("\n");
@@ -1856,6 +1883,21 @@
 	dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM);
 	printk("\n");
 
+	print_testname("recursive rlock with interrupt");
+	printk("             |");
+	dotest(rlock_AHA1, SUCCESS, LOCKTYPE_RWLOCK);
+	printk("\n");
+
+	print_testname("recursive rlock with interrupt #2");
+	printk("             |");
+	dotest(rlock_AHA1B, SUCCESS, LOCKTYPE_RWLOCK);
+	printk("\n");
+
+	print_testname("recursive rlock with interrupt #3");
+	printk("             |");
+	dotest(rlock_ASAHA1, SUCCESS, LOCKTYPE_RWLOCK);
+	printk("\n");
+
 	printk("  --------------------------------------------------------------------------\n");
 
 	/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ec4dcf1..085dc6d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2534,6 +2534,8 @@
 	unsigned long long size;
 	int ret = 0;
 
+	if (mem_cgroup_is_root(memcg))
+		goto done;
 retry:
 	if (consume_stock(memcg, nr_pages))
 		goto done;
@@ -2611,9 +2613,7 @@
 	if (!(gfp_mask & __GFP_NOFAIL))
 		return -ENOMEM;
 bypass:
-	memcg = root_mem_cgroup;
-	ret = -EINTR;
-	goto retry;
+	return -EINTR;
 
 done_restock:
 	if (batch > nr_pages)
@@ -2626,6 +2626,9 @@
 {
 	unsigned long bytes = nr_pages * PAGE_SIZE;
 
+	if (mem_cgroup_is_root(memcg))
+		return;
+
 	res_counter_uncharge(&memcg->res, bytes);
 	if (do_swap_account)
 		res_counter_uncharge(&memcg->memsw, bytes);
@@ -2640,6 +2643,9 @@
 {
 	unsigned long bytes = nr_pages * PAGE_SIZE;
 
+	if (mem_cgroup_is_root(memcg))
+		return;
+
 	res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
 	if (do_swap_account)
 		res_counter_uncharge_until(&memcg->memsw,
@@ -4093,6 +4099,46 @@
 	return retval;
 }
 
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
+					       enum mem_cgroup_stat_index idx)
+{
+	struct mem_cgroup *iter;
+	long val = 0;
+
+	/* Per-cpu values can be negative, use a signed accumulator */
+	for_each_mem_cgroup_tree(iter, memcg)
+		val += mem_cgroup_read_stat(iter, idx);
+
+	if (val < 0) /* race? */
+		val = 0;
+	return val;
+}
+
+static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+{
+	u64 val;
+
+	if (!mem_cgroup_is_root(memcg)) {
+		if (!swap)
+			return res_counter_read_u64(&memcg->res, RES_USAGE);
+		else
+			return res_counter_read_u64(&memcg->memsw, RES_USAGE);
+	}
+
+	/*
+	 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
+	 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
+	 */
+	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+
+	if (swap)
+		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
+
+	return val << PAGE_SHIFT;
+}
+
+
 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 			       struct cftype *cft)
 {
@@ -4102,8 +4148,12 @@
 
 	switch (type) {
 	case _MEM:
+		if (name == RES_USAGE)
+			return mem_cgroup_usage(memcg, false);
 		return res_counter_read_u64(&memcg->res, name);
 	case _MEMSWAP:
+		if (name == RES_USAGE)
+			return mem_cgroup_usage(memcg, true);
 		return res_counter_read_u64(&memcg->memsw, name);
 	case _KMEM:
 		return res_counter_read_u64(&memcg->kmem, name);
@@ -4572,10 +4622,7 @@
 	if (!t)
 		goto unlock;
 
-	if (!swap)
-		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
-	else
-		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+	usage = mem_cgroup_usage(memcg, swap);
 
 	/*
 	 * current_threshold points to threshold just below or equal to usage.
@@ -4673,10 +4720,10 @@
 
 	if (type == _MEM) {
 		thresholds = &memcg->thresholds;
-		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+		usage = mem_cgroup_usage(memcg, false);
 	} else if (type == _MEMSWAP) {
 		thresholds = &memcg->memsw_thresholds;
-		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+		usage = mem_cgroup_usage(memcg, true);
 	} else
 		BUG();
 
@@ -4762,10 +4809,10 @@
 
 	if (type == _MEM) {
 		thresholds = &memcg->thresholds;
-		usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+		usage = mem_cgroup_usage(memcg, false);
 	} else if (type == _MEMSWAP) {
 		thresholds = &memcg->memsw_thresholds;
-		usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+		usage = mem_cgroup_usage(memcg, true);
 	} else
 		BUG();
 
@@ -5525,9 +5572,9 @@
 		 * core guarantees its existence.
 		 */
 	} else {
-		res_counter_init(&memcg->res, &root_mem_cgroup->res);
-		res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw);
-		res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
+		res_counter_init(&memcg->res, NULL);
+		res_counter_init(&memcg->memsw, NULL);
+		res_counter_init(&memcg->kmem, NULL);
 		/*
 		 * Deeper hierarchy with use_hierarchy == false doesn't make
 		 * much sense so let cgroup subsystem know about this
@@ -5969,8 +6016,9 @@
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
 		/* uncharge swap account from the old cgroup */
-		res_counter_uncharge(&mc.from->memsw,
-				     PAGE_SIZE * mc.moved_swap);
+		if (!mem_cgroup_is_root(mc.from))
+			res_counter_uncharge(&mc.from->memsw,
+					     PAGE_SIZE * mc.moved_swap);
 
 		for (i = 0; i < mc.moved_swap; i++)
 			css_put(&mc.from->css);
@@ -5979,8 +6027,9 @@
 		 * we charged both to->res and to->memsw, so we should
 		 * uncharge to->res.
 		 */
-		res_counter_uncharge(&mc.to->res,
-				     PAGE_SIZE * mc.moved_swap);
+		if (!mem_cgroup_is_root(mc.to))
+			res_counter_uncharge(&mc.to->res,
+					     PAGE_SIZE * mc.moved_swap);
 		/* we've already done css_get(mc.to) */
 		mc.moved_swap = 0;
 	}
@@ -6345,7 +6394,8 @@
 	rcu_read_lock();
 	memcg = mem_cgroup_lookup(id);
 	if (memcg) {
-		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		if (!mem_cgroup_is_root(memcg))
+			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 		mem_cgroup_swap_statistics(memcg, false);
 		css_put(&memcg->css);
 	}
@@ -6509,12 +6559,15 @@
 {
 	unsigned long flags;
 
-	if (nr_mem)
-		res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE);
-	if (nr_memsw)
-		res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE);
-
-	memcg_oom_recover(memcg);
+	if (!mem_cgroup_is_root(memcg)) {
+		if (nr_mem)
+			res_counter_uncharge(&memcg->res,
+					     nr_mem * PAGE_SIZE);
+		if (nr_memsw)
+			res_counter_uncharge(&memcg->memsw,
+					     nr_memsw * PAGE_SIZE);
+		memcg_oom_recover(memcg);
+	}
 
 	local_irq_save(flags);
 	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
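
The memcontrol.c changes stop charging and uncharging the root cgroup entirely; root usage is instead reconstructed by summing signed per-cpu statistics across the hierarchy, clamped at zero because a racing reader can observe a transiently negative sum. A sketch of that accumulate-and-clamp step, with the for_each_mem_cgroup_tree() walk mocked as a flat array:

    #include <stdio.h>

    static unsigned long recursive_stat(const long *percpu, int n)
    {
        long val = 0;
        int i;

        for (i = 0; i < n; i++)
            val += percpu[i];

        if (val < 0)    /* race: charge seen on one cpu, not yet another */
            val = 0;
        return (unsigned long)val;
    }

    int main(void)
    {
        long deltas[] = { 7, -3, -5 };  /* mid-update snapshot */

        printf("%lu\n", recursive_stat(deltas, 3));  /* prints 0 */
        return 0;
    }
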
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 3707c71..5110816 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -108,7 +108,7 @@
 			    int page_start, int page_end)
 {
 	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
-	unsigned int cpu;
+	unsigned int cpu, tcpu;
 	int i;
 
 	for_each_possible_cpu(cpu) {
@@ -116,14 +116,23 @@
 			struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
 
 			*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
-			if (!*pagep) {
-				pcpu_free_pages(chunk, pages, populated,
-						page_start, page_end);
-				return -ENOMEM;
-			}
+			if (!*pagep)
+				goto err;
 		}
 	}
 	return 0;
+
+err:
+	while (--i >= page_start)
+		__free_page(pages[pcpu_page_idx(cpu, i)]);
+
+	for_each_possible_cpu(tcpu) {
+		if (tcpu == cpu)
+			break;
+		for (i = page_start; i < page_end; i++)
+			__free_page(pages[pcpu_page_idx(tcpu, i)]);
+	}
+	return -ENOMEM;
 }
 
 /**
@@ -263,6 +272,7 @@
 		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
 				   page_end - page_start);
 	}
+	pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
 	return err;
 }
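The pcpu_alloc_pages() error path above unwinds in two stages: first the
partially filled row for the failing CPU, then every fully populated row
before it. A self-contained sketch of the same rollback shape, with
hypothetical sizes and malloc() standing in for alloc_pages_node():

    #include <stdlib.h>

    #define NCPUS  4
    #define NPAGES 8

    /* Returns 0 on success; on failure nothing stays allocated. */
    static int alloc_all(void *slots[NCPUS][NPAGES])
    {
        int cpu, i;

        for (cpu = 0; cpu < NCPUS; cpu++)
            for (i = 0; i < NPAGES; i++) {
                slots[cpu][i] = malloc(4096);
                if (!slots[cpu][i])
                    goto err;
            }
        return 0;

    err:
        while (--i >= 0)                 /* partial row for the failing cpu */
            free(slots[cpu][i]);
        while (--cpu >= 0)               /* fully populated earlier rows */
            for (i = 0; i < NPAGES; i++)
                free(slots[cpu][i]);
        return -1;
    }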
 
diff --git a/mm/percpu.c b/mm/percpu.c
index 2139e30..da997f9 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1932,6 +1932,8 @@
 
 	if (pcpu_setup_first_chunk(ai, fc) < 0)
 		panic("Failed to initialize percpu areas.");
+
+	pcpu_free_alloc_info(ai);
 }
 
 #endif	/* CONFIG_SMP */
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index b50dabb..faff624 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -589,6 +589,14 @@
 void hci_le_conn_failed(struct hci_conn *conn, u8 status)
 {
 	struct hci_dev *hdev = conn->hdev;
+	struct hci_conn_params *params;
+
+	params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst,
+					   conn->dst_type);
+	if (params && params->conn) {
+		hci_conn_drop(params->conn);
+		params->conn = NULL;
+	}
 
 	conn->state = BT_CLOSED;
 
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index c32d361..1d9c29a 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2536,8 +2536,13 @@
 {
 	struct hci_conn_params *p;
 
-	list_for_each_entry(p, &hdev->le_conn_params, list)
+	list_for_each_entry(p, &hdev->le_conn_params, list) {
+		if (p->conn) {
+			hci_conn_drop(p->conn);
+			p->conn = NULL;
+		}
 		list_del_init(&p->action);
+	}
 
 	BT_DBG("All LE pending actions cleared");
 }
@@ -2578,8 +2583,8 @@
 
 	hci_dev_lock(hdev);
 	hci_inquiry_cache_flush(hdev);
-	hci_conn_hash_flush(hdev);
 	hci_pend_le_actions_clear(hdev);
+	hci_conn_hash_flush(hdev);
 	hci_dev_unlock(hdev);
 
 	hci_notify(hdev, HCI_DEV_DOWN);
@@ -3727,6 +3732,9 @@
 	if (!params)
 		return;
 
+	if (params->conn)
+		hci_conn_drop(params->conn);
+
 	list_del(&params->action);
 	list_del(&params->list);
 	kfree(params);
@@ -3757,6 +3765,8 @@
 	struct hci_conn_params *params, *tmp;
 
 	list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) {
+		if (params->conn)
+			hci_conn_drop(params->conn);
 		list_del(&params->action);
 		list_del(&params->list);
 		kfree(params);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index be35598..a600082 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4221,8 +4221,13 @@
 	hci_proto_connect_cfm(conn, ev->status);
 
 	params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
-	if (params)
+	if (params) {
 		list_del_init(&params->action);
+		if (params->conn) {
+			hci_conn_drop(params->conn);
+			params->conn = NULL;
+		}
+	}
 
 unlock:
 	hci_update_background_scan(hdev);
@@ -4304,8 +4309,16 @@
 
 	conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW,
 			      HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER);
-	if (!IS_ERR(conn))
+	if (!IS_ERR(conn)) {
+		/* Store the pointer since we don't really have any
+		 * other owner of the object besides the params that
+		 * triggered it. This way we can abort the connection if
+		 * the parameters get removed and keep the reference
+		 * count consistent once the connection is established.
+		 */
+		params->conn = conn;
 		return;
+	}
 
 	switch (PTR_ERR(conn)) {
 	case -EBUSY:
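The comment added above makes params->conn a co-owner of the connection until
it is established or aborted, and the surrounding hunks drop that hold exactly
once on each exit path. A minimal sketch of the pattern, with hypothetical
stand-ins for hci_conn and hci_conn_params:

    #include <stdlib.h>

    struct conn { int refcnt; };            /* hypothetical */
    struct params { struct conn *conn; };   /* hypothetical */

    static void conn_drop(struct conn *c)
    {
        if (--c->refcnt == 0)
            free(c);
    }

    /* When the params are removed, or the connection completes or fails,
     * the params give up their hold exactly once. */
    static void params_release_conn(struct params *p)
    {
        if (p->conn) {
            conn_drop(p->conn);
            p->conn = NULL;
        }
    }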
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 488dd1a..fdbc9a8 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -775,7 +775,7 @@
 EXPORT_SYMBOL(__skb_checksum_complete);
 
 /**
- *	skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec.
+ *	skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
  *	@skb: skbuff
  *	@hlen: hardware length
  *	@iov: io vector
diff --git a/net/core/dev.c b/net/core/dev.c
index b65a505..ab9a165 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2587,13 +2587,19 @@
 		return harmonize_features(skb, features);
 	}
 
-	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
-					       NETIF_F_HW_VLAN_STAG_TX);
+	features = netdev_intersect_features(features,
+					     skb->dev->vlan_features |
+					     NETIF_F_HW_VLAN_CTAG_TX |
+					     NETIF_F_HW_VLAN_STAG_TX);
 
 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
-		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
-				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
-				NETIF_F_HW_VLAN_STAG_TX;
+		features = netdev_intersect_features(features,
+						     NETIF_F_SG |
+						     NETIF_F_HIGHDMA |
+						     NETIF_F_FRAGLIST |
+						     NETIF_F_GEN_CSUM |
+						     NETIF_F_HW_VLAN_CTAG_TX |
+						     NETIF_F_HW_VLAN_STAG_TX);
 
 	return harmonize_features(skb, features);
 }
@@ -4889,7 +4895,8 @@
 	if (adj->master)
 		sysfs_remove_link(&(dev->dev.kobj), "master");
 
-	if (netdev_adjacent_is_neigh_list(dev, dev_list))
+	if (netdev_adjacent_is_neigh_list(dev, dev_list) &&
+	    net_eq(dev_net(dev),dev_net(adj_dev)))
 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 
 	list_del_rcu(&adj->list);
@@ -5159,11 +5166,65 @@
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
+void netdev_adjacent_add_links(struct net_device *dev)
+{
+	struct netdev_adjacent *iter;
+
+	struct net *net = dev_net(dev);
+
+	list_for_each_entry(iter, &dev->adj_list.upper, list) {
+		if (!net_eq(net,dev_net(iter->dev)))
+			continue;
+		netdev_adjacent_sysfs_add(iter->dev, dev,
+					  &iter->dev->adj_list.lower);
+		netdev_adjacent_sysfs_add(dev, iter->dev,
+					  &dev->adj_list.upper);
+	}
+
+	list_for_each_entry(iter, &dev->adj_list.lower, list) {
+		if (!net_eq(net,dev_net(iter->dev)))
+			continue;
+		netdev_adjacent_sysfs_add(iter->dev, dev,
+					  &iter->dev->adj_list.upper);
+		netdev_adjacent_sysfs_add(dev, iter->dev,
+					  &dev->adj_list.lower);
+	}
+}
+
+void netdev_adjacent_del_links(struct net_device *dev)
+{
+	struct netdev_adjacent *iter;
+
+	struct net *net = dev_net(dev);
+
+	list_for_each_entry(iter, &dev->adj_list.upper, list) {
+		if (!net_eq(net,dev_net(iter->dev)))
+			continue;
+		netdev_adjacent_sysfs_del(iter->dev, dev->name,
+					  &iter->dev->adj_list.lower);
+		netdev_adjacent_sysfs_del(dev, iter->dev->name,
+					  &dev->adj_list.upper);
+	}
+
+	list_for_each_entry(iter, &dev->adj_list.lower, list) {
+		if (!net_eq(net,dev_net(iter->dev)))
+			continue;
+		netdev_adjacent_sysfs_del(iter->dev, dev->name,
+					  &iter->dev->adj_list.upper);
+		netdev_adjacent_sysfs_del(dev, iter->dev->name,
+					  &dev->adj_list.lower);
+	}
+}
+
 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 {
 	struct netdev_adjacent *iter;
 
+	struct net *net = dev_net(dev);
+
 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
+		if (!net_eq(net,dev_net(iter->dev)))
+			continue;
 		netdev_adjacent_sysfs_del(iter->dev, oldname,
 					  &iter->dev->adj_list.lower);
 		netdev_adjacent_sysfs_add(iter->dev, dev,
@@ -5171,6 +5232,8 @@
 	}
 
 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
+		if (!net_eq(net,dev_net(iter->dev)))
+			continue;
 		netdev_adjacent_sysfs_del(iter->dev, oldname,
 					  &iter->dev->adj_list.upper);
 		netdev_adjacent_sysfs_add(iter->dev, dev,
@@ -6773,6 +6836,7 @@
 
 	/* Send a netdev-removed uevent to the old namespace */
 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
+	netdev_adjacent_del_links(dev);
 
 	/* Actually switch the network namespace */
 	dev_net_set(dev, net);
@@ -6787,6 +6851,7 @@
 
 	/* Send a netdev-add uevent to the new namespace */
 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
+	netdev_adjacent_add_links(dev);
 
 	/* Fixup kobjects */
 	err = device_rename(&dev->dev, dev->name);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 6b5b6e7..9d33dff 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -197,7 +197,7 @@
  * as destination. A new timer with the interval specified in the
  * configuration TLV is created. Upon each interval, the latest statistics
  * will be read from &bstats and the estimated rate will be stored in
- * &rate_est with the statistics lock grabed during this period.
+ * &rate_est with the statistics lock grabbed during this period.
  *
  * Returns 0 on success or a negative error code.
  *
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 9d3d9e7..2ddbce4 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -206,7 +206,7 @@
  * @st: application specific statistics data
  * @len: length of data
  *
- * Appends the application sepecific statistics to the top level TLV created by
+ * Appends the application specific statistics to the top level TLV created by
  * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping
  * handle is in backward compatibility mode.
  *
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 163b673..da1378a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2647,7 +2647,7 @@
  * skb_seq_read() will return the remaining part of the block.
  *
  * Note 1: The size of each block of data returned can be arbitrary,
- *       this limitation is the cost for zerocopy seqeuental
+ *       this limitation is the cost for zerocopy sequential
  *       reads of potentially non linear data.
  *
  * Note 2: Fragment lists within fragments are not implemented
@@ -2781,7 +2781,7 @@
 /**
  * skb_append_datato_frags - append the user data to a skb
  * @sk: sock  structure
- * @skb: skb structure to be appened with user data.
+ * @skb: skb structure to be appended with user data.
  * @getfrag: call back function to be used for getting the user data
  * @from: pointer to user message iov
  * @length: length of the iov message
diff --git a/net/core/sock.c b/net/core/sock.c
index 2714811..d372b4b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -166,7 +166,7 @@
 /**
  * sk_capable - Socket global capability test
  * @sk: Socket to use a capability on or through
- * @cap: The global capbility to use
+ * @cap: The global capability to use
  *
  * Test to see if the opener of the socket had when the socket was
  * created and the current process has the capability @cap in all user
@@ -183,7 +183,7 @@
  * @sk: Socket to use a capability on or through
  * @cap: The capability to use
  *
- * Test to see if the opener of the socket had when the socke was created
+ * Test to see if the opener of the socket had when the socket was created
  * and the current process has the capability @cap over the network namespace
  * the socket is a member of.
  */
@@ -1822,6 +1822,9 @@
 							   order);
 					if (page)
 						goto fill_page;
+					/* Do not retry other high order allocations */
+					order = 1;
+					max_page_order = 0;
 				}
 				order--;
 			}
@@ -1869,10 +1872,8 @@
  * no guarantee that allocations succeed. Therefore, @sz MUST be
  * less or equal than PAGE_SIZE.
  */
-bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
+bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 {
-	int order;
-
 	if (pfrag->page) {
 		if (atomic_read(&pfrag->page->_count) == 1) {
 			pfrag->offset = 0;
@@ -1883,20 +1884,21 @@
 		put_page(pfrag->page);
 	}
 
-	order = SKB_FRAG_PAGE_ORDER;
-	do {
-		gfp_t gfp = prio;
-
-		if (order)
-			gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
-		pfrag->page = alloc_pages(gfp, order);
+	pfrag->offset = 0;
+	if (SKB_FRAG_PAGE_ORDER) {
+		pfrag->page = alloc_pages(gfp | __GFP_COMP |
+					  __GFP_NOWARN | __GFP_NORETRY,
+					  SKB_FRAG_PAGE_ORDER);
 		if (likely(pfrag->page)) {
-			pfrag->offset = 0;
-			pfrag->size = PAGE_SIZE << order;
+			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
 			return true;
 		}
-	} while (--order >= 0);
-
+	}
+	pfrag->page = alloc_page(gfp);
+	if (likely(pfrag->page)) {
+		pfrag->size = PAGE_SIZE;
+		return true;
+	}
 	return false;
 }
 EXPORT_SYMBOL(skb_page_frag_refill);
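The skb_page_frag_refill() rewrite above drops the descending-order retry loop
in favour of one high-order attempt plus an order-0 fallback. A minimal sketch
of that allocation strategy, with malloc() sizes standing in for page orders:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdlib.h>

    #define HIGH_ORDER_SZ (32 * 1024)   /* stand-in for an order-3 block */
    #define PAGE_SZ       (4 * 1024)

    static bool frag_refill(void **buf, size_t *size)
    {
        *buf = malloc(HIGH_ORDER_SZ);   /* one attempt, no intermediate orders */
        if (*buf) {
            *size = HIGH_ORDER_SZ;
            return true;
        }
        *buf = malloc(PAGE_SZ);         /* fall back straight to a single page */
        if (*buf) {
            *size = PAGE_SZ;
            return true;
        }
        return false;
    }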
diff --git a/net/ieee802154/6lowpan_rtnl.c b/net/ieee802154/6lowpan_rtnl.c
index 016b77e..6591d27 100644
--- a/net/ieee802154/6lowpan_rtnl.c
+++ b/net/ieee802154/6lowpan_rtnl.c
@@ -246,7 +246,7 @@
 			return ERR_PTR(-rc);
 		}
 	} else {
-		frag = ERR_PTR(ENOMEM);
+		frag = ERR_PTR(-ENOMEM);
 	}
 
 	return frag;
@@ -437,7 +437,7 @@
 	/* Frame Control + Sequence Number + Address fields + Security Header */
 	dev->hard_header_len	= 2 + 1 + 20 + 14;
 	dev->needed_tailroom	= 2; /* FCS */
-	dev->mtu		= 1281;
+	dev->mtu		= IPV6_MIN_MTU;
 	dev->tx_queue_len	= 0;
 	dev->flags		= IFF_BROADCAST | IFF_MULTICAST;
 	dev->watchdog_timeo	= 0;
diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c
index ffec6ce..32755cb 100644
--- a/net/ieee802154/reassembly.c
+++ b/net/ieee802154/reassembly.c
@@ -355,8 +355,6 @@
 	struct net *net = dev_net(skb->dev);
 	struct lowpan_frag_info *frag_info = lowpan_cb(skb);
 	struct ieee802154_addr source, dest;
-	struct netns_ieee802154_lowpan *ieee802154_lowpan =
-		net_ieee802154_lowpan(net);
 	int err;
 
 	source = mac_cb(skb)->source;
@@ -366,8 +364,10 @@
 	if (err < 0)
 		goto err;
 
-	if (frag_info->d_size > ieee802154_lowpan->max_dsize)
+	if (frag_info->d_size > IPV6_MIN_MTU) {
+		net_warn_ratelimited("lowpan_frag_rcv: datagram size exceeds MTU\n");
 		goto err;
+	}
 
 	fq = fq_find(net, frag_info, &source, &dest);
 	if (fq != NULL) {
@@ -415,13 +415,6 @@
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
-	{
-		.procname	= "6lowpanfrag_max_datagram_size",
-		.data		= &init_net.ieee802154_lowpan.max_dsize,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
-	},
 	{ }
 };
 
@@ -458,7 +451,6 @@
 		table[1].data = &ieee802154_lowpan->frags.low_thresh;
 		table[1].extra2 = &ieee802154_lowpan->frags.high_thresh;
 		table[2].data = &ieee802154_lowpan->frags.timeout;
-		table[3].data = &ieee802154_lowpan->max_dsize;
 
 		/* Don't export sysctls to unprivileged users */
 		if (net->user_ns != &init_user_ns)
@@ -533,7 +525,6 @@
 	ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
-	ieee802154_lowpan->max_dsize = 0xFFFF;
 
 	inet_frags_init_net(&ieee802154_lowpan->frags);
 
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index fb17312..7cbcaf4 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -82,6 +82,52 @@
 	help
 	  This option enables the ARP support for nf_tables.
 
+config NF_NAT_IPV4
+	tristate "IPv4 NAT"
+	depends on NF_CONNTRACK_IPV4
+	default m if NETFILTER_ADVANCED=n
+	select NF_NAT
+	help
+	  The IPv4 NAT option allows masquerading, port forwarding and other
+	  forms of full Network Address Port Translation. This can be
+	  controlled by iptables or nft.
+
+if NF_NAT_IPV4
+
+config NF_NAT_SNMP_BASIC
+	tristate "Basic SNMP-ALG support"
+	depends on NF_CONNTRACK_SNMP
+	depends on NETFILTER_ADVANCED
+	default NF_NAT && NF_CONNTRACK_SNMP
+	---help---
+
+	  This module implements an Application Layer Gateway (ALG) for
+	  SNMP payloads.  In conjunction with NAT, it allows a network
+	  management system to access multiple private networks with
+	  conflicting addresses.  It works by modifying IP addresses
+	  inside SNMP payloads to match IP-layer NAT mapping.
+
+	  This is the "basic" form of SNMP-ALG, as described in RFC 2962
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config NF_NAT_PROTO_GRE
+	tristate
+	depends on NF_CT_PROTO_GRE
+
+config NF_NAT_PPTP
+	tristate
+	depends on NF_CONNTRACK
+	default NF_CONNTRACK_PPTP
+	select NF_NAT_PROTO_GRE
+
+config NF_NAT_H323
+	tristate
+	depends on NF_CONNTRACK
+	default NF_CONNTRACK_H323
+
+endif # NF_NAT_IPV4
+
 config IP_NF_IPTABLES
 	tristate "IP tables support (required for filtering/masq/NAT)"
 	default m if NETFILTER_ADVANCED=n
@@ -170,19 +216,21 @@
 	  To compile it as a module, choose M here. If unsure, say N.
 
 # NAT + specific targets: nf_conntrack
-config NF_NAT_IPV4
-	tristate "IPv4 NAT"
+config IP_NF_NAT
+	tristate "iptables NAT support"
 	depends on NF_CONNTRACK_IPV4
 	default m if NETFILTER_ADVANCED=n
 	select NF_NAT
+	select NF_NAT_IPV4
+	select NETFILTER_XT_NAT
 	help
-	  The IPv4 NAT option allows masquerading, port forwarding and other
-	  forms of full Network Address Port Translation.  It is controlled by
-	  the `nat' table in iptables: see the man page for iptables(8).
+	  This enables the `nat' table in iptables. This allows masquerading,
+	  port forwarding and other forms of full Network Address Port
+	  Translation.
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-if NF_NAT_IPV4
+if IP_NF_NAT
 
 config IP_NF_TARGET_MASQUERADE
 	tristate "MASQUERADE target support"
@@ -214,47 +262,7 @@
 	(e.g. when running oldconfig). It selects
 	CONFIG_NETFILTER_XT_TARGET_REDIRECT.
 
-endif
-
-config NF_NAT_SNMP_BASIC
-	tristate "Basic SNMP-ALG support"
-	depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4
-	depends on NETFILTER_ADVANCED
-	default NF_NAT && NF_CONNTRACK_SNMP
-	---help---
-
-	  This module implements an Application Layer Gateway (ALG) for
-	  SNMP payloads.  In conjunction with NAT, it allows a network
-	  management system to access multiple private networks with
-	  conflicting addresses.  It works by modifying IP addresses
-	  inside SNMP payloads to match IP-layer NAT mapping.
-
-	  This is the "basic" form of SNMP-ALG, as described in RFC 2962
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
-# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y),
-# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker.
-# From kconfig-language.txt:
-#
-#           <expr> '&&' <expr>                   (6)
-#
-# (6) Returns the result of min(/expr/, /expr/).
-
-config NF_NAT_PROTO_GRE
-	tristate
-	depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE
-
-config NF_NAT_PPTP
-	tristate
-	depends on NF_CONNTRACK && NF_NAT_IPV4
-	default NF_NAT_IPV4 && NF_CONNTRACK_PPTP
-	select NF_NAT_PROTO_GRE
-
-config NF_NAT_H323
-	tristate
-	depends on NF_CONNTRACK && NF_NAT_IPV4
-	default NF_NAT_IPV4 && NF_CONNTRACK_H323
+endif # IP_NF_NAT
 
 # mangle + specific targets
 config IP_NF_MANGLE
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 3300162..edf4af3 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -43,7 +43,7 @@
 # the three instances of ip_tables
 obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
 obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
-obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
 obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
 obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 0b239fc..fc1fac2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1690,14 +1690,12 @@
 	addrconf_mod_dad_work(ifp, 0);
 }
 
-/* Join to solicited addr multicast group. */
-
+/* Join to solicited addr multicast group.
+ * Caller must hold RTNL. */
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr)
 {
 	struct in6_addr maddr;
 
-	ASSERT_RTNL();
-
 	if (dev->flags&(IFF_LOOPBACK|IFF_NOARP))
 		return;
 
@@ -1705,12 +1703,11 @@
 	ipv6_dev_mc_inc(dev, &maddr);
 }
 
+/* Caller must hold RTNL. */
 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr)
 {
 	struct in6_addr maddr;
 
-	ASSERT_RTNL();
-
 	if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP))
 		return;
 
@@ -1718,12 +1715,11 @@
 	__ipv6_dev_mc_dec(idev, &maddr);
 }
 
+/* Caller must hold RTNL. */
 static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
 {
 	struct in6_addr addr;
 
-	ASSERT_RTNL();
-
 	if (ifp->prefix_len >= 127) /* RFC 6164 */
 		return;
 	ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
@@ -1732,12 +1728,11 @@
 	ipv6_dev_ac_inc(ifp->idev->dev, &addr);
 }
 
+/* Caller must hold RTNL. */
 static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
 {
 	struct in6_addr addr;
 
-	ASSERT_RTNL();
-
 	if (ifp->prefix_len >= 127) /* RFC 6164 */
 		return;
 	ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
@@ -4773,15 +4768,11 @@
 		addrconf_leave_solict(ifp->idev, &ifp->addr);
 		if (!ipv6_addr_any(&ifp->peer_addr)) {
 			struct rt6_info *rt;
-			struct net_device *dev = ifp->idev->dev;
 
-			rt = rt6_lookup(dev_net(dev), &ifp->peer_addr, NULL,
-					dev->ifindex, 1);
-			if (rt) {
-				dst_hold(&rt->dst);
-				if (ip6_del_rt(rt))
-					dst_free(&rt->dst);
-			}
+			rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
+						       ifp->idev->dev, 0, 0);
+			if (rt && ip6_del_rt(rt))
+				dst_free(&rt->dst);
 		}
 		dst_hold(&ifp->rt->dst);
 
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 2101832..ff2de7d 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -77,6 +77,7 @@
 	pac->acl_next = NULL;
 	pac->acl_addr = *addr;
 
+	rtnl_lock();
 	rcu_read_lock();
 	if (ifindex == 0) {
 		struct rt6_info *rt;
@@ -137,6 +138,7 @@
 
 error:
 	rcu_read_unlock();
+	rtnl_unlock();
 	if (pac)
 		sock_kfree_s(sk, pac, sizeof(*pac));
 	return err;
@@ -171,11 +173,13 @@
 
 	spin_unlock_bh(&ipv6_sk_ac_lock);
 
+	rtnl_lock();
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(net, pac->acl_ifindex);
 	if (dev)
 		ipv6_dev_ac_dec(dev, &pac->acl_addr);
 	rcu_read_unlock();
+	rtnl_unlock();
 
 	sock_kfree_s(sk, pac, sizeof(*pac));
 	return 0;
@@ -198,6 +202,7 @@
 	spin_unlock_bh(&ipv6_sk_ac_lock);
 
 	prev_index = 0;
+	rtnl_lock();
 	rcu_read_lock();
 	while (pac) {
 		struct ipv6_ac_socklist *next = pac->acl_next;
@@ -212,6 +217,7 @@
 		pac = next;
 	}
 	rcu_read_unlock();
+	rtnl_unlock();
 }
 
 static void aca_put(struct ifacaddr6 *ac)
@@ -233,6 +239,8 @@
 	struct rt6_info *rt;
 	int err;
 
+	ASSERT_RTNL();
+
 	idev = in6_dev_get(dev);
 
 	if (idev == NULL)
@@ -302,6 +310,8 @@
 {
 	struct ifacaddr6 *aca, *prev_aca;
 
+	ASSERT_RTNL();
+
 	write_lock_bh(&idev->lock);
 	prev_aca = NULL;
 	for (aca = idev->ac_list; aca; aca = aca->aca_next) {
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 617f095..a23b655 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -172,6 +172,7 @@
 	mc_lst->next = NULL;
 	mc_lst->addr = *addr;
 
+	rtnl_lock();
 	rcu_read_lock();
 	if (ifindex == 0) {
 		struct rt6_info *rt;
@@ -185,6 +186,7 @@
 
 	if (dev == NULL) {
 		rcu_read_unlock();
+		rtnl_unlock();
 		sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
 		return -ENODEV;
 	}
@@ -202,6 +204,7 @@
 
 	if (err) {
 		rcu_read_unlock();
+		rtnl_unlock();
 		sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
 		return err;
 	}
@@ -212,6 +215,7 @@
 	spin_unlock(&ipv6_sk_mc_lock);
 
 	rcu_read_unlock();
+	rtnl_unlock();
 
 	return 0;
 }
@@ -229,6 +233,7 @@
 	if (!ipv6_addr_is_multicast(addr))
 		return -EINVAL;
 
+	rtnl_lock();
 	spin_lock(&ipv6_sk_mc_lock);
 	for (lnk = &np->ipv6_mc_list;
 	     (mc_lst = rcu_dereference_protected(*lnk,
@@ -252,12 +257,15 @@
 			} else
 				(void) ip6_mc_leave_src(sk, mc_lst, NULL);
 			rcu_read_unlock();
+			rtnl_unlock();
+
 			atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
 			kfree_rcu(mc_lst, rcu);
 			return 0;
 		}
 	}
 	spin_unlock(&ipv6_sk_mc_lock);
+	rtnl_unlock();
 
 	return -EADDRNOTAVAIL;
 }
@@ -302,6 +310,7 @@
 	if (!rcu_access_pointer(np->ipv6_mc_list))
 		return;
 
+	rtnl_lock();
 	spin_lock(&ipv6_sk_mc_lock);
 	while ((mc_lst = rcu_dereference_protected(np->ipv6_mc_list,
 				lockdep_is_held(&ipv6_sk_mc_lock))) != NULL) {
@@ -328,6 +337,7 @@
 		spin_lock(&ipv6_sk_mc_lock);
 	}
 	spin_unlock(&ipv6_sk_mc_lock);
+	rtnl_unlock();
 }
 
 int ip6_mc_source(int add, int omode, struct sock *sk,
@@ -845,6 +855,8 @@
 	struct ifmcaddr6 *mc;
 	struct inet6_dev *idev;
 
+	ASSERT_RTNL();
+
 	/* we need to take a reference on idev */
 	idev = in6_dev_get(dev);
 
@@ -916,6 +928,8 @@
 {
 	struct ifmcaddr6 *ma, **map;
 
+	ASSERT_RTNL();
+
 	write_lock_bh(&idev->lock);
 	for (map = &idev->mc_list; (ma=*map) != NULL; map = &ma->next) {
 		if (ipv6_addr_equal(&ma->mca_addr, addr)) {
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index ac93df1..2812816 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -57,9 +57,19 @@
 
 config NF_LOG_IPV6
 	tristate "IPv6 packet logging"
-	depends on NETFILTER_ADVANCED
+	default m if NETFILTER_ADVANCED=n
 	select NF_LOG_COMMON
 
+config NF_NAT_IPV6
+	tristate "IPv6 NAT"
+	depends on NF_CONNTRACK_IPV6
+	depends on NETFILTER_ADVANCED
+	select NF_NAT
+	help
+	  The IPv6 NAT option allows masquerading, port forwarding and other
+	  forms of full Network Address Port Translation. This can be
+	  controlled by iptables or nft.
+
 config IP6_NF_IPTABLES
 	tristate "IP6 tables support (required for filtering)"
 	depends on INET && IPV6
@@ -232,19 +242,21 @@
 
          If unsure, say N.
 
-config NF_NAT_IPV6
-	tristate "IPv6 NAT"
+config IP6_NF_NAT
+	tristate "ip6tables NAT support"
 	depends on NF_CONNTRACK_IPV6
 	depends on NETFILTER_ADVANCED
 	select NF_NAT
+	select NF_NAT_IPV6
+	select NETFILTER_XT_NAT
 	help
-	  The IPv6 NAT option allows masquerading, port forwarding and other
-	  forms of full Network Address Port Translation. It is controlled by
-	  the `nat' table in ip6tables, see the man page for ip6tables(8).
+	  This enables the `nat' table in ip6tables. This allows masquerading,
+	  port forwarding and other forms of full Network Address Port
+	  Translation.
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-if NF_NAT_IPV6
+if IP6_NF_NAT
 
 config IP6_NF_TARGET_MASQUERADE
 	tristate "MASQUERADE target support"
@@ -265,7 +277,7 @@
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-endif # NF_NAT_IPV6
+endif # IP6_NF_NAT
 
 endif # IP6_NF_IPTABLES
 
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index c0b2631..c3d3286 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -8,7 +8,7 @@
 obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
 obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
 obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
-obj-$(CONFIG_NF_NAT_IPV6) += ip6table_nat.o
+obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o
 
 # objects for l3 independent conntrack
 nf_conntrack_ipv6-y  :=  nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 13752d9..b704a93 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -755,7 +755,8 @@
 	/* If PMTU discovery was enabled, use the MTU that was discovered */
 	dst = sk_dst_get(tunnel->sock);
 	if (dst != NULL) {
-		u32 pmtu = dst_mtu(__sk_dst_get(tunnel->sock));
+		u32 pmtu = dst_mtu(dst);
+
 		if (pmtu != 0)
 			session->mtu = session->mru = pmtu -
 				PPPOL2TP_HEADER_OVERHEAD;
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 0375009..399ad82 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -541,6 +541,8 @@
 			continue;
 		if (rcu_access_pointer(sdata->vif.chanctx_conf) != conf)
 			continue;
+		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+			continue;
 
 		if (!compat)
 			compat = &sdata->vif.bss_conf.chandef;
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 3db9664..86173c0 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -167,7 +167,7 @@
 	p += scnprintf(p, sizeof(buf) + buf - p, "next dialog_token: %#02x\n",
 			sta->ampdu_mlme.dialog_token_allocator + 1);
 	p += scnprintf(p, sizeof(buf) + buf - p,
-		       "TID\t\tRX active\tDTKN\tSSN\t\tTX\tDTKN\tpending\n");
+		       "TID\t\tRX\tDTKN\tSSN\t\tTX\tDTKN\tpending\n");
 
 	for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
 		tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[i]);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 01eede7..f75e5f1 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1175,8 +1175,8 @@
 			if (sta) {
 				u16 last_seq;
 
-				last_seq = le16_to_cpu(
-					sta->last_seq_ctrl[rx_agg->tid]);
+				last_seq = IEEE80211_SEQ_TO_SN(le16_to_cpu(
+					sta->last_seq_ctrl[rx_agg->tid]));
 
 				__ieee80211_start_rx_ba_session(sta,
 						0, 0,
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 63b8741..c47194d 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -959,7 +959,8 @@
 		if (!matches_local)
 			event = CNF_RJCT;
 		if (!mesh_plink_free_count(sdata) ||
-		    (sta->llid != llid || sta->plid != plid))
+		    sta->llid != llid ||
+		    (sta->plid && sta->plid != plid))
 			event = CNF_IGNR;
 		else
 			event = CNF_ACPT;
@@ -1080,6 +1081,10 @@
 		goto unlock_rcu;
 	}
 
+	/* 802.11-2012 13.3.7.2 - update plid on CNF if not set */
+	if (!sta->plid && event == CNF_ACPT)
+		sta->plid = plid;
+
 	changed |= mesh_plink_fsm(sdata, sta, event);
 
 unlock_rcu:
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 31a8afa..b82a12a 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -4376,8 +4376,7 @@
 	rcu_read_unlock();
 
 	if (bss->wmm_used && bss->uapsd_supported &&
-	    (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD) &&
-	    sdata->wmm_acm != 0xff) {
+	    (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD)) {
 		assoc_data->uapsd = true;
 		ifmgd->flags |= IEEE80211_STA_UAPSD_ENABLED;
 	} else {
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index c6ee213..441875f 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1094,8 +1094,11 @@
 	unsigned long flags;
 	struct ps_data *ps;
 
-	if (sdata->vif.type == NL80211_IFTYPE_AP ||
-	    sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss, struct ieee80211_sub_if_data,
+				     u.ap);
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP)
 		ps = &sdata->bss->ps;
 	else if (ieee80211_vif_is_mesh(&sdata->vif))
 		ps = &sdata->u.mesh.ps;
diff --git a/net/mac802154/wpan.c b/net/mac802154/wpan.c
index 3c3069f..5478388 100644
--- a/net/mac802154/wpan.c
+++ b/net/mac802154/wpan.c
@@ -462,7 +462,10 @@
 			skb->pkt_type = PACKET_OTHERHOST;
 		break;
 	default:
-		break;
+		spin_unlock_bh(&sdata->mib_lock);
+		pr_debug("invalid dest mode\n");
+		kfree_skb(skb);
+		return NET_RX_DROP;
 	}
 
 	spin_unlock_bh(&sdata->mib_lock);
@@ -573,6 +576,7 @@
 	ret = mac802154_parse_frame_start(skb, &hdr);
 	if (ret) {
 		pr_debug("got invalid frame\n");
+		kfree_skb(skb);
 		return;
 	}
 
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index ad751fe..b5c1d3a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -499,7 +499,7 @@
 config NFT_NAT
 	depends on NF_TABLES
 	depends on NF_CONNTRACK
-	depends on NF_NAT
+	select NF_NAT
 	tristate "Netfilter nf_tables nat module"
 	help
 	  This option adds the "nat" expression that you can use to perform
@@ -747,7 +747,9 @@
 
 config NETFILTER_XT_TARGET_LOG
 	tristate "LOG target support"
-	depends on NF_LOG_IPV4 && NF_LOG_IPV6
+	select NF_LOG_COMMON
+	select NF_LOG_IPV4
+	select NF_LOG_IPV6 if IPV6
 	default m if NETFILTER_ADVANCED=n
 	help
 	  This option adds a `LOG' target, which allows you to create rules in
@@ -764,6 +766,14 @@
 	(e.g. when running oldconfig). It selects
 	CONFIG_NETFILTER_XT_MARK (combined mark/MARK module).
 
+config NETFILTER_XT_NAT
+	tristate '"SNAT and DNAT" targets support'
+	depends on NF_NAT
+	---help---
+	This option enables the SNAT and DNAT targets.
+
+	To compile it as a module, choose M here. If unsure, say N.
+
 config NETFILTER_XT_TARGET_NETMAP
 	tristate '"NETMAP" target support'
 	depends on NF_NAT
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 8308624..fad5fdb 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -95,7 +95,7 @@
 obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
 obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
 obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o
-obj-$(CONFIG_NF_NAT) += xt_nat.o
+obj-$(CONFIG_NETFILTER_XT_NAT) += xt_nat.o
 
 # targets
 obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index a93c97f..024a2e2 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -54,7 +54,7 @@
 struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
 EXPORT_SYMBOL(nf_hooks);
 
-#if defined(CONFIG_JUMP_LABEL)
+#ifdef HAVE_JUMP_LABEL
 struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 EXPORT_SYMBOL(nf_hooks_needed);
 #endif
@@ -72,7 +72,7 @@
 	}
 	list_add_rcu(&reg->list, elem->list.prev);
 	mutex_unlock(&nf_hook_mutex);
-#if defined(CONFIG_JUMP_LABEL)
+#ifdef HAVE_JUMP_LABEL
 	static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
 #endif
 	return 0;
@@ -84,7 +84,7 @@
 	mutex_lock(&nf_hook_mutex);
 	list_del_rcu(&reg->list);
 	mutex_unlock(&nf_hook_mutex);
-#if defined(CONFIG_JUMP_LABEL)
+#ifdef HAVE_JUMP_LABEL
 	static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
 #endif
 	synchronize_net();
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index e683675..5c34e8d 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1906,7 +1906,7 @@
 	{
 		.hook		= ip_vs_local_reply6,
 		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
+		.pf		= NFPROTO_IPV6,
 		.hooknum	= NF_INET_LOCAL_OUT,
 		.priority	= NF_IP6_PRI_NAT_DST + 1,
 	},
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 6f70bdd..56896a4 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -38,6 +38,7 @@
 #include <net/route.h>                  /* for ip_route_output */
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/ip_tunnels.h>
 #include <net/addrconf.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
@@ -862,11 +863,15 @@
 		old_iph = ip_hdr(skb);
 	}
 
-	skb->transport_header = skb->network_header;
-
 	/* fix old IP header checksum */
 	ip_send_check(old_iph);
 
+	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+	if (IS_ERR(skb))
+		goto tx_error;
+
+	skb->transport_header = skb->network_header;
+
 	skb_push(skb, sizeof(struct iphdr));
 	skb_reset_network_header(skb);
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -900,7 +905,8 @@
 	return NF_STOLEN;
 
   tx_error:
-	kfree_skb(skb);
+	if (!IS_ERR(skb))
+		kfree_skb(skb);
 	rcu_read_unlock();
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -953,6 +959,11 @@
 		old_iph = ipv6_hdr(skb);
 	}
 
+	/* GSO: we need to provide proper SKB_GSO_ value for IPv6 */
+	skb = iptunnel_handle_offloads(skb, false, 0); /* SKB_GSO_SIT/IPV6 */
+	if (IS_ERR(skb))
+		goto tx_error;
+
 	skb->transport_header = skb->network_header;
 
 	skb_push(skb, sizeof(struct ipv6hdr));
@@ -988,7 +999,8 @@
 	return NF_STOLEN;
 
 tx_error:
-	kfree_skb(skb);
+	if (!IS_ERR(skb))
+		kfree_skb(skb);
 	rcu_read_unlock();
 	LeaveFunction(10);
 	return NF_STOLEN;
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index f4e8330..7198d66 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -31,7 +31,7 @@
 	if (info->invert & ~1)
 		return -EINVAL;
 
-	return info->id ? 0 : -EINVAL;
+	return 0;
 }
 
 static bool
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 7228ec3..91d66b7 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -265,8 +265,11 @@
 		upcall.key = &key;
 		upcall.userdata = NULL;
 		upcall.portid = ovs_vport_find_upcall_portid(p, skb);
-		ovs_dp_upcall(dp, skb, &upcall);
-		consume_skb(skb);
+		error = ovs_dp_upcall(dp, skb, &upcall);
+		if (unlikely(error))
+			kfree_skb(skb);
+		else
+			consume_skb(skb);
 		stats_counter = &stats->n_missed;
 		goto out;
 	}
@@ -404,7 +407,7 @@
 {
 	struct ovs_header *upcall;
 	struct sk_buff *nskb = NULL;
-	struct sk_buff *user_skb; /* to be queued to userspace */
+	struct sk_buff *user_skb = NULL; /* to be queued to userspace */
 	struct nlattr *nla;
 	struct genl_info info = {
 		.dst_sk = ovs_dp_get_net(dp)->genl_sock,
@@ -494,9 +497,11 @@
 	((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
 
 	err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
+	user_skb = NULL;
 out:
 	if (err)
 		skb_tx_error(skb);
+	kfree_skb(user_skb);
 	kfree_skb(nskb);
 	return err;
 }
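The datapath.c fix above frees the skb with kfree_skb() when the upcall fails
and consume_skb() when it succeeds, which keeps drop accounting accurate. A
minimal sketch of that split, with hypothetical stand-ins for the skb and the
upcall:

    #include <stdlib.h>

    struct sk_buff { char *data; };                             /* hypothetical */

    static void kfree_skb(struct sk_buff *skb)   { free(skb); } /* a drop    */
    static void consume_skb(struct sk_buff *skb) { free(skb); } /* a success */

    static int queue_upcall(struct sk_buff *skb) { return skb ? 0 : -1; }

    static void deliver(struct sk_buff *skb)
    {
        if (queue_upcall(skb))
            kfree_skb(skb);     /* failure path: counted as a drop */
        else
            consume_skb(skb);   /* normal end of the packet's life */
    }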
diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c
index 14c98e4..02a86a2 100644
--- a/net/rfkill/rfkill-gpio.c
+++ b/net/rfkill/rfkill-gpio.c
@@ -158,6 +158,7 @@
 	{ "BCM2E1A", RFKILL_TYPE_BLUETOOTH },
 	{ "BCM2E39", RFKILL_TYPE_BLUETOOTH },
 	{ "BCM2E3D", RFKILL_TYPE_BLUETOOTH },
+	{ "BCM2E64", RFKILL_TYPE_BLUETOOTH },
 	{ "BCM4752", RFKILL_TYPE_GPS },
 	{ "LNV4752", RFKILL_TYPE_GPS },
 	{ },
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index eb71d49..634a2ab 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4243,7 +4243,7 @@
 	transport = asoc->peer.primary_path;
 
 	status.sstat_assoc_id = sctp_assoc2id(asoc);
-	status.sstat_state = asoc->state;
+	status.sstat_state = sctp_assoc_to_state(asoc);
 	status.sstat_rwnd =  asoc->peer.rwnd;
 	status.sstat_unackdata = asoc->unack_data;
 
diff --git a/net/socket.c b/net/socket.c
index 95ee7d8..2e2586e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -734,8 +734,7 @@
 	}
 
 	memset(&tss, 0, sizeof(tss));
-	if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE ||
-	     skb_shinfo(skb)->tx_flags & SKBTX_ANY_SW_TSTAMP) &&
+	if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
 	    ktime_to_timespec_cond(skb->tstamp, tss.ts + 0))
 		empty = 0;
 	if (shhwtstamps &&
@@ -2602,7 +2601,7 @@
  *
  *	This function is called by a protocol handler that wants to
  *	advertise its address family, and have it linked into the
- *	socket interface. The value ops->family coresponds to the
+ *	socket interface. The value ops->family corresponds to the
  *	socket system call protocol family.
  */
 int sock_register(const struct net_proto_family *ops)
diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile
index 9c4d241..a7e0862 100644
--- a/scripts/kconfig/Makefile
+++ b/scripts/kconfig/Makefile
@@ -3,7 +3,7 @@
 # These targets are used from top-level makefile
 
 PHONY += oldconfig xconfig gconfig menuconfig config silentoldconfig update-po-config \
-	localmodconfig localyesconfig
+	localmodconfig localyesconfig kvmconfig
 
 ifdef KBUILD_KCONFIG
 Kconfig := $(KBUILD_KCONFIG)
@@ -36,6 +36,11 @@
 	$(Q)mkdir -p include/config include/generated
 	$< --$@ $(Kconfig)
 
+kvmconfig:
+	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/config -e KVMTOOL_TEST_ENABLE
+	$(Q)yes "" | make oldconfig > /dev/null
+	@echo 'Kernel configuration modified to run as KVM guest.'
+
 localyesconfig localmodconfig: $(obj)/streamline_config.pl $(obj)/conf
 	$(Q)mkdir -p include/config include/generated
 	$(Q)perl $< --$@ $(srctree) $(Kconfig) > .tmp.config
diff --git a/security/keys/key.c b/security/keys/key.c
index b90a68c..6d0cad1 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -27,8 +27,8 @@
 struct rb_root	key_user_tree; /* tree of quota records indexed by UID */
 DEFINE_SPINLOCK(key_user_lock);
 
-unsigned int key_quota_root_maxkeys = 200;	/* root's key count quota */
-unsigned int key_quota_root_maxbytes = 20000;	/* root's key space quota */
+unsigned int key_quota_root_maxkeys = 1000000;	/* root's key count quota */
+unsigned int key_quota_root_maxbytes = 25000000; /* root's key space quota */
 unsigned int key_quota_maxkeys = 200;		/* general key count quota */
 unsigned int key_quota_maxbytes = 20000;	/* general key space quota */
 
diff --git a/sound/firewire/amdtp.c b/sound/firewire/amdtp.c
index f96bf4c..95fc2eaf 100644
--- a/sound/firewire/amdtp.c
+++ b/sound/firewire/amdtp.c
@@ -507,7 +507,16 @@
 static void update_pcm_pointers(struct amdtp_stream *s,
 				struct snd_pcm_substream *pcm,
 				unsigned int frames)
-{	unsigned int ptr;
+{
+	unsigned int ptr;
+
+	/*
+	 * In IEC 61883-6, one data block represents one event. In ALSA, one
+	 * event equals one PCM frame. But Dice has a quirk of transferring
+	 * two PCM frames in one data block.
+	 */
+	if (s->double_pcm_frames)
+		frames *= 2;
 
 	ptr = s->pcm_buffer_pointer + frames;
 	if (ptr >= pcm->runtime->buffer_size)
diff --git a/sound/firewire/amdtp.h b/sound/firewire/amdtp.h
index d8ee7b0..4823c08 100644
--- a/sound/firewire/amdtp.h
+++ b/sound/firewire/amdtp.h
@@ -125,6 +125,7 @@
 	unsigned int pcm_buffer_pointer;
 	unsigned int pcm_period_pointer;
 	bool pointer_flush;
+	bool double_pcm_frames;
 
 	struct snd_rawmidi_substream *midi[AMDTP_MAX_CHANNELS_FOR_MIDI * 8];
 
diff --git a/sound/firewire/dice.c b/sound/firewire/dice.c
index a9a30c0..e3a04d6 100644
--- a/sound/firewire/dice.c
+++ b/sound/firewire/dice.c
@@ -567,10 +567,14 @@
 		return err;
 
 	/*
-	 * At rates above 96 kHz, pretend that the stream runs at half the
-	 * actual sample rate with twice the number of channels; two samples
-	 * of a channel are stored consecutively in the packet. Requires
-	 * blocking mode and PCM buffer size should be aligned to SYT_INTERVAL.
+	 * At 176.4/192.0 kHz, Dice has a quirk of transferring two PCM frames
+	 * in one data block of an AMDTP packet. Thus the sampling transfer
+	 * frequency is half the PCM sampling frequency, i.e. PCM frames at
+	 * 192.0 kHz are transferred in AMDTP packets at 96 kHz. Two successive
+	 * samples of a channel are stored consecutively in the packet. This
+	 * quirk is known as 'Dual Wire'.
+	 * For this quirk, blocking mode is required and the PCM buffer size
+	 * should be aligned to SYT_INTERVAL.
 	 */
 	channels = params_channels(hw_params);
 	if (rate_index > 4) {
@@ -579,18 +583,25 @@
 			return err;
 		}
 
-		for (i = 0; i < channels; i++) {
-			dice->stream.pcm_positions[i * 2] = i;
-			dice->stream.pcm_positions[i * 2 + 1] = i + channels;
-		}
-
 		rate /= 2;
 		channels *= 2;
+		dice->stream.double_pcm_frames = true;
+	} else {
+		dice->stream.double_pcm_frames = false;
 	}
 
 	mode = rate_index_to_mode(rate_index);
 	amdtp_stream_set_parameters(&dice->stream, rate, channels,
 				    dice->rx_midi_ports[mode]);
+	if (rate_index > 4) {
+		channels /= 2;
+
+		for (i = 0; i < channels; i++) {
+			dice->stream.pcm_positions[i] = i * 2;
+			dice->stream.pcm_positions[i + channels] = i * 2 + 1;
+		}
+	}
+
 	amdtp_stream_set_pcm_format(&dice->stream,
 				    params_format(hw_params));
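The dual-wire hunk above rebuilds pcm_positions[] after
amdtp_stream_set_parameters() so that channel i's two frames occupy slots
2*i and 2*i + 1 of each data block. A minimal sketch of that index map,
with a hypothetical channel count:

    #include <stdio.h>

    #define CHANNELS 4          /* hypothetical channel count */

    int main(void)
    {
        unsigned int positions[CHANNELS * 2];
        unsigned int i;

        /* First frame of channel i -> slot 2*i, second frame -> 2*i + 1. */
        for (i = 0; i < CHANNELS; i++) {
            positions[i]            = i * 2;
            positions[i + CHANNELS] = i * 2 + 1;
        }

        for (i = 0; i < CHANNELS * 2; i++)
            printf("pcm stream %u -> data block slot %u\n", i, positions[i]);
        return 0;
    }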
 
diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
index 6f2fa83..6e5d0cb 100644
--- a/sound/pci/hda/patch_conexant.c
+++ b/sound/pci/hda/patch_conexant.c
@@ -217,6 +217,7 @@
 	CXT_FIXUP_HEADPHONE_MIC_PIN,
 	CXT_FIXUP_HEADPHONE_MIC,
 	CXT_FIXUP_GPIO1,
+	CXT_FIXUP_ASPIRE_DMIC,
 	CXT_FIXUP_THINKPAD_ACPI,
 	CXT_FIXUP_OLPC_XO,
 	CXT_FIXUP_CAP_MIX_AMP,
@@ -664,6 +665,12 @@
 			{ }
 		},
 	},
+	[CXT_FIXUP_ASPIRE_DMIC] = {
+		.type = HDA_FIXUP_FUNC,
+		.v.func = cxt_fixup_stereo_dmic,
+		.chained = true,
+		.chain_id = CXT_FIXUP_GPIO1,
+	},
 	[CXT_FIXUP_THINKPAD_ACPI] = {
 		.type = HDA_FIXUP_FUNC,
 		.v.func = hda_fixup_thinkpad_acpi,
@@ -744,7 +751,7 @@
 
 static const struct snd_pci_quirk cxt5066_fixups[] = {
 	SND_PCI_QUIRK(0x1025, 0x0543, "Acer Aspire One 522", CXT_FIXUP_STEREO_DMIC),
-	SND_PCI_QUIRK(0x1025, 0x054c, "Acer Aspire 3830TG", CXT_FIXUP_GPIO1),
+	SND_PCI_QUIRK(0x1025, 0x054c, "Acer Aspire 3830TG", CXT_FIXUP_ASPIRE_DMIC),
 	SND_PCI_QUIRK(0x1043, 0x138d, "Asus", CXT_FIXUP_HEADPHONE_MIC_PIN),
 	SND_PCI_QUIRK(0x152d, 0x0833, "OLPC XO-1.5", CXT_FIXUP_OLPC_XO),
 	SND_PCI_QUIRK(0x17aa, 0x20f2, "Lenovo T400", CXT_PINCFG_LENOVO_TP410),
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index d446ac3..1ba22fb 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -328,6 +328,7 @@
 		case 0x10ec0885:
 		case 0x10ec0887:
 		/*case 0x10ec0889:*/ /* this causes an SPDIF problem */
+		case 0x10ec0900:
 			alc889_coef_init(codec);
 			break;
 		case 0x10ec0888:
@@ -2350,6 +2351,7 @@
 	switch (codec->vendor_id) {
 	case 0x10ec0882:
 	case 0x10ec0885:
+	case 0x10ec0900:
 		break;
 	default:
 		/* ALC883 and variants */
diff --git a/sound/soc/codecs/cs4265.c b/sound/soc/codecs/cs4265.c
index a20b30c..9852320 100644
--- a/sound/soc/codecs/cs4265.c
+++ b/sound/soc/codecs/cs4265.c
@@ -282,10 +282,10 @@
 
 	/*64k*/
 	{8192000, 64000, 1, 0},
-	{1228800, 64000, 1, 1},
-	{1693440, 64000, 1, 2},
-	{2457600, 64000, 1, 3},
-	{3276800, 64000, 1, 4},
+	{12288000, 64000, 1, 1},
+	{16934400, 64000, 1, 2},
+	{24576000, 64000, 1, 3},
+	{32768000, 64000, 1, 4},
 
 	/* 88.2k */
 	{11289600, 88200, 1, 0},
@@ -435,10 +435,10 @@
 	index = cs4265_get_clk_index(cs4265->sysclk, params_rate(params));
 	if (index >= 0) {
 		snd_soc_update_bits(codec, CS4265_ADC_CTL,
-			CS4265_ADC_FM, clk_map_table[index].fm_mode);
+			CS4265_ADC_FM, clk_map_table[index].fm_mode << 6);
 		snd_soc_update_bits(codec, CS4265_MCLK_FREQ,
 			CS4265_MCLK_FREQ_MASK,
-			clk_map_table[index].mclkdiv);
+			clk_map_table[index].mclkdiv << 4);
 
 	} else {
 		dev_err(codec->dev, "can't get correct mclk\n");
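The cs4265 fix above shifts the raw table values into their register fields
before the masked update. A minimal sketch of the read-modify-write idea,
assuming (for illustration only) a two-bit FM field at bits 7:6:

    #include <stdint.h>

    #define FM_MASK  0xC0u      /* assumed field position, bits 7:6 */
    #define FM_SHIFT 6

    static uint8_t update_bits(uint8_t reg, uint8_t mask, uint8_t val)
    {
        return (uint8_t)((reg & ~mask) | (val & mask));
    }

    /* fm_mode is a raw 0..3 table value; shift it into the field first. */
    static uint8_t set_fm_mode(uint8_t reg, uint8_t fm_mode)
    {
        return update_bits(reg, FM_MASK, (uint8_t)(fm_mode << FM_SHIFT));
    }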
diff --git a/sound/soc/codecs/da732x.h b/sound/soc/codecs/da732x.h
index 1dceafe..f586cbd 100644
--- a/sound/soc/codecs/da732x.h
+++ b/sound/soc/codecs/da732x.h
@@ -11,7 +11,7 @@
  */
 
 #ifndef __DA732X_H_
-#define __DA732X_H
+#define __DA732X_H_
 
 #include <sound/soc.h>
 
diff --git a/sound/soc/codecs/rt5640.c b/sound/soc/codecs/rt5640.c
index 6bc6efd..f1ec6e6 100644
--- a/sound/soc/codecs/rt5640.c
+++ b/sound/soc/codecs/rt5640.c
@@ -2059,6 +2059,7 @@
 static const struct regmap_config rt5640_regmap = {
 	.reg_bits = 8,
 	.val_bits = 16,
+	.use_single_rw = true,
 
 	.max_register = RT5640_VENDOR_ID2 + 1 + (ARRAY_SIZE(rt5640_ranges) *
 					       RT5640_PR_SPACING),
diff --git a/sound/soc/codecs/rt5677.c b/sound/soc/codecs/rt5677.c
index 67f1455..5337c44 100644
--- a/sound/soc/codecs/rt5677.c
+++ b/sound/soc/codecs/rt5677.c
@@ -2135,10 +2135,10 @@
 	{ "BST2", NULL, "IN2P" },
 	{ "BST2", NULL, "IN2N" },
 
-	{ "IN1P", NULL, "micbias1" },
-	{ "IN1N", NULL, "micbias1" },
-	{ "IN2P", NULL, "micbias1" },
-	{ "IN2N", NULL, "micbias1" },
+	{ "IN1P", NULL, "MICBIAS1" },
+	{ "IN1N", NULL, "MICBIAS1" },
+	{ "IN2P", NULL, "MICBIAS1" },
+	{ "IN2N", NULL, "MICBIAS1" },
 
 	{ "ADC 1", NULL, "BST1" },
 	{ "ADC 1", NULL, "ADC 1 power" },
diff --git a/sound/soc/generic/simple-card.c b/sound/soc/generic/simple-card.c
index 159e517f..cef7776 100644
--- a/sound/soc/generic/simple-card.c
+++ b/sound/soc/generic/simple-card.c
@@ -481,12 +481,19 @@
 	snd_soc_card_set_drvdata(&priv->snd_card, priv);
 
 	ret = devm_snd_soc_register_card(&pdev->dev, &priv->snd_card);
+	if (ret >= 0)
+		return ret;
 
 err:
 	asoc_simple_card_unref(pdev);
 	return ret;
 }
 
+static int asoc_simple_card_remove(struct platform_device *pdev)
+{
+	return asoc_simple_card_unref(pdev);
+}
+
 static const struct of_device_id asoc_simple_of_match[] = {
 	{ .compatible = "simple-audio-card", },
 	{},
@@ -500,6 +507,7 @@
 		.of_match_table = asoc_simple_of_match,
 	},
 	.probe = asoc_simple_card_probe,
+	.remove = asoc_simple_card_remove,
 };
 
 module_platform_driver(asoc_simple_card);
diff --git a/sound/soc/omap/omap-twl4030.c b/sound/soc/omap/omap-twl4030.c
index f8a6adc..4336d18 100644
--- a/sound/soc/omap/omap-twl4030.c
+++ b/sound/soc/omap/omap-twl4030.c
@@ -260,7 +260,7 @@
 		.stream_name = "TWL4030 Voice",
 		.cpu_dai_name = "omap-mcbsp.3",
 		.codec_dai_name = "twl4030-voice",
-		.platform_name = "omap-mcbsp.2",
+		.platform_name = "omap-mcbsp.3",
 		.codec_name = "twl4030-codec",
 		.dai_fmt = SND_SOC_DAIFMT_DSP_A | SND_SOC_DAIFMT_IB_NF |
 			   SND_SOC_DAIFMT_CBM_CFM,
diff --git a/sound/soc/sh/rcar/gen.c b/sound/soc/sh/rcar/gen.c
index 3fdf3be..f95e7ab 100644
--- a/sound/soc/sh/rcar/gen.c
+++ b/sound/soc/sh/rcar/gen.c
@@ -247,7 +247,7 @@
 	};
 
 	/* it shouldn't happen */
-	if (use_dvc & !use_src)
+	if (use_dvc && !use_src)
 		dev_err(dev, "DVC is selected without SRC\n");
 
 	/* use SSIU or SSI ? */
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index d4bfd4a..889f4e3 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -1325,7 +1325,7 @@
 	device_initialize(rtd->dev);
 	rtd->dev->parent = rtd->card->dev;
 	rtd->dev->release = rtd_release;
-	rtd->dev->init_name = name;
+	dev_set_name(rtd->dev, "%s", name);
 	dev_set_drvdata(rtd->dev, rtd);
 	mutex_init(&rtd->pcm_mutex);
 	INIT_LIST_HEAD(&rtd->dpcm[SNDRV_PCM_STREAM_PLAYBACK].be_clients);
diff --git a/sound/soc/tegra/tegra_asoc_utils.h b/sound/soc/tegra/tegra_asoc_utils.h
index 9577121..ca80376 100644
--- a/sound/soc/tegra/tegra_asoc_utils.h
+++ b/sound/soc/tegra/tegra_asoc_utils.h
@@ -21,7 +21,7 @@
  */
 
 #ifndef __TEGRA_ASOC_UTILS_H__
-#define __TEGRA_ASOC_UTILS_H_
+#define __TEGRA_ASOC_UTILS_H__
 
 struct clk;
 struct device;
diff --git a/tools/kvm/.gitignore b/tools/kvm/.gitignore
new file mode 100644
index 0000000..60dd6db
--- /dev/null
+++ b/tools/kvm/.gitignore
@@ -0,0 +1,12 @@
+/lkvm
+/vm
+*.o
+*.d
+.cscope
+tags
+include/common-cmds.h
+tests/boot/boot_test.iso
+tests/boot/rootfs/
+guest/init
+guest/init_stage2
+KVMTOOLS-VERSION-FILE
diff --git a/tools/kvm/CREDITS-Git b/tools/kvm/CREDITS-Git
new file mode 100644
index 0000000..c2ddcb3
--- /dev/null
+++ b/tools/kvm/CREDITS-Git
@@ -0,0 +1,30 @@
+Most of the infrastructure that kvm tools uses here has been reused
+from the Git project, as of version:
+
+    66996ec: Sync with 1.6.2.4
+
+Here is an (incomplete!) list of main contributors to those files
+in util/* and elsewhere:
+
+ Alex Riesen
+ Christian Couder
+ Dmitry Potapov
+ Jeff King
+ Johannes Schindelin
+ Johannes Sixt
+ Junio C Hamano
+ Linus Torvalds
+ Matthias Kestenholz
+ Michal Ostrowski
+ Miklos Vajna
+ Petr Baudis
+ Pierre Habouzit
+ René Scharfe
+ Samuel Tardieu
+ Shawn O. Pearce
+ Steffen Prohaska
+ Steve Haslam
+
+Thanks guys!
+
+The full history of the files can be found in the upstream Git commits.
diff --git a/tools/kvm/Documentation/kernel-debugging.txt b/tools/kvm/Documentation/kernel-debugging.txt
new file mode 100644
index 0000000..98b9438
--- /dev/null
+++ b/tools/kvm/Documentation/kernel-debugging.txt
@@ -0,0 +1,15 @@
+This document explains how to debug a guest's kernel using KGDB.
+
+1. Run the guest:
+        'lkvm run -k [vmlinuz] -p "kgdboc=ttyS1 kgdbwait" --tty 1'
+
+And see which PTY got assigned to ttyS1 (you'll see:
+'  Info: Assigned terminal 1 to pty /dev/pts/X').
+
+2. Run GDB on the host:
+        'gdb [vmlinuz]'
+
+3. Connect to the guest (from within GDB):
+        'target remote /dev/pts/X'
+
+4. Start debugging! (enter 'continue' to continue boot).
diff --git a/tools/kvm/Documentation/kvm-balloon.txt b/tools/kvm/Documentation/kvm-balloon.txt
new file mode 100644
index 0000000..efc0a87
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-balloon.txt
@@ -0,0 +1,24 @@
+lkvm-balloon(1)
+================
+
+NAME
+----
+lkvm-balloon - Inflate or deflate the virtio balloon
+
+SYNOPSIS
+--------
+[verse]
+'lkvm balloon [command] [size] [instance]'
+
+DESCRIPTION
+-----------
+The command inflates or deflates the virtio balloon located in the
+specified instance.
+For a list of running instances see 'lkvm list'.
+
+Command can be either 'inflate' or 'deflate'. Inflate increases the
+size of the balloon, thus decreasing the amount of virtual RAM available
+for the guest. Deflation returns previously inflated memory back to the
+guest.
+
+The size is specified in MB.
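+
+For example, assuming a running instance named 'foo', 'lkvm balloon
+inflate 100 foo' grows the balloon by 100 MB, and 'lkvm balloon deflate
+100 foo' returns that memory to the guest (the name and size here are
+illustrative).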
diff --git a/tools/kvm/Documentation/kvm-debug.txt b/tools/kvm/Documentation/kvm-debug.txt
new file mode 100644
index 0000000..a8eb2c0
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-debug.txt
@@ -0,0 +1,16 @@
+lkvm-debug(1)
+================
+
+NAME
+----
+lkvm-debug - Print debug information from a running instance
+
+SYNOPSIS
+--------
+[verse]
+'lkvm debug [instance]'
+
+DESCRIPTION
+-----------
+The command prints debug information from a running instance.
+For a list of running instances see 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-list.txt b/tools/kvm/Documentation/kvm-list.txt
new file mode 100644
index 0000000..a245607
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-list.txt
@@ -0,0 +1,16 @@
+lkvm-list(1)
+================
+
+NAME
+----
+lkvm-list - Print a list of running instances on the host.
+
+SYNOPSIS
+--------
+[verse]
+'lkvm list'
+
+DESCRIPTION
+-----------
+This command prints a list of running instances on the host that
+belong to the user running 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-pause.txt b/tools/kvm/Documentation/kvm-pause.txt
new file mode 100644
index 0000000..1ea2a23
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-pause.txt
@@ -0,0 +1,16 @@
+lkvm-pause(1)
+================
+
+NAME
+----
+lkvm-pause - Pause the virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'lkvm pause [instance]'
+
+DESCRIPTION
+-----------
+The command pauses a virtual machine.
+For a list of running instances see 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-resume.txt b/tools/kvm/Documentation/kvm-resume.txt
new file mode 100644
index 0000000..a36c4df
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-resume.txt
@@ -0,0 +1,16 @@
+lkvm-resume(1)
+================
+
+NAME
+----
+lkvm-resume - Resume the virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'lkvm resume [instance]'
+
+DESCRIPTION
+-----------
+The command resumes a virtual machine.
+For a list of running instances see 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-run.txt b/tools/kvm/Documentation/kvm-run.txt
new file mode 100644
index 0000000..8ddf470
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-run.txt
@@ -0,0 +1,62 @@
+lkvm-run(1)
+================
+
+NAME
+----
+lkvm-run - Start the virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'lkvm run' [-k <kernel image> | --kernel <kernel image>]
+
+DESCRIPTION
+-----------
+The command starts a virtual machine.
+
+OPTIONS
+-------
+-m::
+--mem=::
+	Virtual machine memory size in MiB.
+
+-p::
+--params::
+	Additional kernel command line arguments.
+
+-r::
+--initrd=::
+	Initial RAM disk image.
+
+-k::
+--kernel=::
+	The virtual machine kernel.
+
+--dev=::
+	KVM device file.
+
+-i::
+--image=::
+	A disk image file.
+
+-s::
+--single-step::
+	Enable single stepping.
+
+-g::
+--ioport-debug::
+	Enable ioport debugging.
+
+-c::
+--enable-virtio-console::
+	Enable the virtual IO console.
+
+--cpus::
+	The number of virtual CPUs to run.
+
+--debug::
+	Enable debug messages.
+
+SEE ALSO
+--------
+linkkvm:
diff --git a/tools/kvm/Documentation/kvm-sandbox.txt b/tools/kvm/Documentation/kvm-sandbox.txt
new file mode 100644
index 0000000..2d7f558
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-sandbox.txt
@@ -0,0 +1,16 @@
+lkvm-sandbox(1)
+================
+
+NAME
+----
+lkvm-sandbox - Run a command in a sandboxed guest
+
+SYNOPSIS
+--------
+[verse]
+'lkvm sandbox ['lkvm run' arguments] -- [sandboxed command]'
+
+DESCRIPTION
+-----------
+The sandboxed command will run in a guest as part of its init
+command.
diff --git a/tools/kvm/Documentation/kvm-setup.txt b/tools/kvm/Documentation/kvm-setup.txt
new file mode 100644
index 0000000..4b6e331
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-setup.txt
@@ -0,0 +1,15 @@
+lkvm-setup(1)
+================
+
+NAME
+----
+lkvm-setup - Setup a new virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'lkvm setup <name>'
+
+DESCRIPTION
+-----------
+The command sets up a virtual machine.
diff --git a/tools/kvm/Documentation/kvm-stat.txt b/tools/kvm/Documentation/kvm-stat.txt
new file mode 100644
index 0000000..101ce7a
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-stat.txt
@@ -0,0 +1,19 @@
+lkvm-stat(1)
+================
+
+NAME
+----
+lkvm-stat - Print statistics about a running instance
+
+SYNOPSIS
+--------
+[verse]
+'lkvm stat [command] [-n instance] [-p instance pid] [--all]'
+
+DESCRIPTION
+-----------
+The command prints statistics about a running instance.
+For a list of running instances see 'lkvm list'.
+
+Commands:
+ --memory, -m	Display memory statistics
diff --git a/tools/kvm/Documentation/kvm-stop.txt b/tools/kvm/Documentation/kvm-stop.txt
new file mode 100644
index 0000000..6e4bc83
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-stop.txt
@@ -0,0 +1,16 @@
+lkvm-stop(1)
+================
+
+NAME
+----
+lkvm-stop - Stop a running instance
+
+SYNOPSIS
+--------
+[verse]
+'lkvm stop [instance]'
+
+DESCRIPTION
+-----------
+The command stops a running instance.
+For a list of running instances see 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-version.txt b/tools/kvm/Documentation/kvm-version.txt
new file mode 100644
index 0000000..41003d2
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-version.txt
@@ -0,0 +1,21 @@
+lkvm-version(1)
+================
+
+NAME
+----
+lkvm-version - Print the version of the kernel tree kvm tools
+was built from.
+
+SYNOPSIS
+--------
+[verse]
+'lkvm version'
+
+DESCRIPTION
+-----------
+The command prints the version of the kernel that was used to build
+kvm tools.
+
+Note that the version is not the version of the kernel which is currently
+running on the host, but is the version of the kernel tree from which kvm
+tools was built.
diff --git a/tools/kvm/Documentation/virtio-console.txt b/tools/kvm/Documentation/virtio-console.txt
new file mode 100644
index 0000000..4a58d56
--- /dev/null
+++ b/tools/kvm/Documentation/virtio-console.txt
@@ -0,0 +1,41 @@
+General
+--------
+
+virtio-console, as the name implies, is a console over the virtio transport.
+Here is a simple head-to-head comparison of virtio-console vs the regular
+8250 console:
+
+8250 serial console:
+
+ - Requires CONFIG_SERIAL_8250=y and CONFIG_SERIAL_8250_CONSOLE=y kernel configs,
+which are enabled almost everywhere.
+ - Doesn't require guest-side changes.
+ - Compatible with older guests.
+
+virtio-console:
+
+ - Requires CONFIG_VIRTIO_CONSOLE=y (along with all other virtio dependencies),
+which is enabled only in recent kernels (and not in all of them).
+ - Much faster.
+ - Consumes less processing resources.
+ - Requires guest-side changes.
+
+Enabling virtio-console
+------------------------
+
+First, make sure the guest kernel is built with CONFIG_VIRTIO_CONSOLE=y. Once
+this is done, the following has to be done inside the guest image:
+
+ - Add the following line to /etc/inittab:
+	'hvc0:2345:respawn:/sbin/agetty -L 9600 hvc0'
+ - Add 'hvc0' to /etc/securetty (so you can actually log in)
+ - Start the guest with '--console virtio'
+
+Common errors
+--------------
+
+Q: I don't see anything on the screen!
+A: Make sure CONFIG_VIRTIO_CONSOLE=y is enabled in the *guest* kernel, and
+make sure you've updated /etc/inittab.
+
+Q: It won't accept my username/password, but I enter them correctly!
+A: You didn't add 'hvc0' to /etc/securetty
diff --git a/tools/kvm/Makefile b/tools/kvm/Makefile
new file mode 100644
index 0000000..fba60f1
--- /dev/null
+++ b/tools/kvm/Makefile
@@ -0,0 +1,520 @@
+#
+# Define WERROR=0 to disable -Werror.
+#
+
+ifeq ($(strip $(V)),)
+	E = @echo
+	Q = @
+else
+	E = @\#
+	Q =
+endif
+ifneq ($(I), )
+	KINCL_PATH=$(I)
+else
+	KINCL_PATH=../..
+endif
+export E Q KINCL_PATH
+
+include config/utilities.mak
+include config/feature-tests.mak
+
+CC	:= $(CROSS_COMPILE)gcc
+LD	:= $(CROSS_COMPILE)ld
+
+FIND	:= find
+CSCOPE	:= cscope
+TAGS	:= ctags
+INSTALL := install
+
+prefix = $(HOME)
+bindir_relative = bin
+bindir = $(prefix)/$(bindir_relative)
+
+DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
+bindir_SQ = $(subst ','\'',$(bindir))
+
+PROGRAM	:= lkvm
+PROGRAM_ALIAS := vm
+
+GUEST_INIT := guest/init
+
+OBJS	+= builtin-balloon.o
+OBJS	+= builtin-debug.o
+OBJS	+= builtin-help.o
+OBJS	+= builtin-list.o
+OBJS	+= builtin-stat.o
+OBJS	+= builtin-pause.o
+OBJS	+= builtin-resume.o
+OBJS	+= builtin-run.o
+OBJS	+= builtin-setup.o
+OBJS	+= builtin-stop.o
+OBJS	+= builtin-version.o
+OBJS	+= devices.o
+OBJS	+= disk/core.o
+OBJS	+= framebuffer.o
+OBJS	+= guest_compat.o
+OBJS	+= hw/rtc.o
+OBJS	+= hw/serial.o
+OBJS	+= ioport.o
+OBJS	+= irq.o
+OBJS	+= kvm-cpu.o
+OBJS	+= kvm.o
+OBJS	+= main.o
+OBJS	+= mmio.o
+OBJS	+= pci.o
+OBJS	+= term.o
+OBJS	+= virtio/blk.o
+OBJS	+= virtio/scsi.o
+OBJS	+= virtio/console.o
+OBJS	+= virtio/core.o
+OBJS	+= virtio/net.o
+OBJS	+= virtio/rng.o
+OBJS    += virtio/balloon.o
+OBJS	+= virtio/pci.o
+OBJS	+= disk/blk.o
+OBJS	+= disk/qcow.o
+OBJS	+= disk/raw.o
+OBJS	+= ioeventfd.o
+OBJS	+= net/uip/core.o
+OBJS	+= net/uip/arp.o
+OBJS	+= net/uip/icmp.o
+OBJS	+= net/uip/ipv4.o
+OBJS	+= net/uip/tcp.o
+OBJS	+= net/uip/udp.o
+OBJS	+= net/uip/buf.o
+OBJS	+= net/uip/csum.o
+OBJS	+= net/uip/dhcp.o
+OBJS	+= kvm-cmd.o
+OBJS	+= util/init.o
+OBJS    += util/iovec.o
+OBJS	+= util/rbtree.o
+OBJS	+= util/threadpool.o
+OBJS	+= util/parse-options.o
+OBJS	+= util/rbtree-interval.o
+OBJS	+= util/strbuf.o
+OBJS	+= util/read-write.o
+OBJS	+= util/util.o
+OBJS	+= virtio/9p.o
+OBJS	+= virtio/9p-pdu.o
+OBJS	+= hw/vesa.o
+OBJS	+= hw/pci-shmem.o
+OBJS	+= kvm-ipc.o
+OBJS	+= builtin-sandbox.o
+OBJS	+= virtio/mmio.o
+OBJS	+= hw/i8042.o
+
+# Translate uname -m into ARCH string
+ARCH ?= $(shell uname -m | sed -e s/i.86/i386/ -e s/ppc.*/powerpc/ \
+	  -e s/armv7.*/arm/ -e s/aarch64.*/arm64/ -e s/mips64/mips/)
+
+ifeq ($(ARCH),i386)
+	ARCH         := x86
+	DEFINES      += -DCONFIG_X86_32
+endif
+ifeq ($(ARCH),x86_64)
+	ARCH         := x86
+	DEFINES      += -DCONFIG_X86_64
+endif
+
+### Arch-specific stuff
+
+#x86
+ifeq ($(ARCH),x86)
+	DEFINES += -DCONFIG_X86
+	OBJS	+= x86/boot.o
+	OBJS	+= x86/cpuid.o
+	OBJS	+= x86/interrupt.o
+	OBJS	+= x86/ioport.o
+	OBJS	+= x86/irq.o
+	OBJS	+= x86/kvm.o
+	OBJS	+= x86/kvm-cpu.o
+	OBJS	+= x86/mptable.o
+# Exclude BIOS object files from header dependencies.
+	OTHEROBJS	+= x86/bios.o
+	OTHEROBJS	+= x86/bios/bios-rom.o
+	ARCH_INCLUDE := x86/include
+endif
+# POWER/ppc:  Actually only supports ppc64 currently.
+ifeq ($(ARCH), powerpc)
+	DEFINES += -DCONFIG_PPC
+	OBJS	+= powerpc/boot.o
+	OBJS	+= powerpc/ioport.o
+	OBJS	+= powerpc/irq.o
+	OBJS	+= powerpc/kvm.o
+	OBJS	+= powerpc/cpu_info.o
+	OBJS	+= powerpc/kvm-cpu.o
+	OBJS	+= powerpc/spapr_hcall.o
+	OBJS	+= powerpc/spapr_rtas.o
+	OBJS	+= powerpc/spapr_hvcons.o
+	OBJS	+= powerpc/spapr_pci.o
+	OBJS	+= powerpc/xics.o
+	ARCH_INCLUDE := powerpc/include
+	CFLAGS 	+= -m64
+	LDFLAGS += -m elf64ppc
+
+	ARCH_WANT_LIBFDT := y
+endif
+
+# ARM
+OBJS_ARM_COMMON		:= arm/fdt.o arm/gic.o arm/ioport.o arm/irq.o \
+			   arm/kvm.o arm/kvm-cpu.o arm/pci.o arm/timer.o
+HDRS_ARM_COMMON		:= arm/include
+ifeq ($(ARCH), arm)
+	DEFINES		+= -DCONFIG_ARM
+	OBJS		+= $(OBJS_ARM_COMMON)
+	OBJS		+= arm/aarch32/arm-cpu.o
+	OBJS		+= arm/aarch32/kvm-cpu.o
+	ARCH_INCLUDE	:= $(HDRS_ARM_COMMON)
+	ARCH_INCLUDE	+= -Iarm/aarch32/include
+	CFLAGS		+= -march=armv7-a
+
+	ARCH_WANT_LIBFDT := y
+endif
+
+# ARM64
+ifeq ($(ARCH), arm64)
+	DEFINES		+= -DCONFIG_ARM64
+	OBJS		+= $(OBJS_ARM_COMMON)
+	OBJS		+= arm/aarch64/arm-cpu.o
+	OBJS		+= arm/aarch64/kvm-cpu.o
+	ARCH_INCLUDE	:= $(HDRS_ARM_COMMON)
+	ARCH_INCLUDE	+= -Iarm/aarch64/include
+
+	ARCH_WANT_LIBFDT := y
+endif
+
+ifeq ($(ARCH),mips)
+	DEFINES		+= -DCONFIG_MIPS
+	ARCH_INCLUDE	:= mips/include
+	OBJS		+= mips/kvm.o
+	OBJS		+= mips/kvm-cpu.o
+	OBJS		+= mips/irq.o
+endif
+###
+
+ifeq (,$(ARCH_INCLUDE))
+	UNSUPP_ERR = @echo "This architecture is not supported in kvmtool." && exit 1
+else
+	UNSUPP_ERR =
+endif
+
+###
+
+# libfdt support
+
+LIBFDT_SRC = fdt.o fdt_ro.o fdt_wip.o fdt_sw.o fdt_rw.o fdt_strerror.o
+LIBFDT_OBJS = $(patsubst %,../../scripts/dtc/libfdt/%,$(LIBFDT_SRC))
+
+ifeq (y,$(ARCH_WANT_LIBFDT))
+	DEFINES		+= -DCONFIG_HAS_LIBFDT
+	OTHEROBJS	+= $(LIBFDT_OBJS)
+endif
+
+###
+
+# Detect optional features.
+# On a given system, some libs may link statically, some may not; so, check
+# both and only build those that link!
+
+FLAGS_BFD := $(CFLAGS) -lbfd
+ifeq ($(call try-cc,$(SOURCE_BFD),$(FLAGS_BFD) -static),y)
+	CFLAGS_STATOPT	+= -DCONFIG_HAS_BFD
+	OBJS_STATOPT	+= symbol.o
+	LIBS_STATOPT	+= -lbfd
+endif
+
+FLAGS_GTK3 := $(CFLAGS) $(shell pkg-config --libs --cflags gtk+-3.0 2>/dev/null)
+ifeq ($(call try-cc,$(SOURCE_GTK3),$(FLAGS_GTK3)),y)
+	OBJS_DYNOPT	+= ui/gtk3.o
+	CFLAGS_DYNOPT	+= -DCONFIG_HAS_GTK3 $(shell pkg-config --cflags gtk+-3.0 2>/dev/null)
+	LIBS_DYNOPT	+= $(shell pkg-config --libs gtk+-3.0 2>/dev/null)
+else
+    $(warning GTK3 not found, disabling GTK3 support. Please install gtk3-devel or libgtk3.0-dev)
+endif
+
+FLAGS_VNCSERVER := $(CFLAGS) -lvncserver
+ifeq ($(call try-cc,$(SOURCE_VNCSERVER),$(FLAGS_VNCSERVER)),y)
+	OBJS_DYNOPT	+= ui/vnc.o
+	CFLAGS_DYNOPT	+= -DCONFIG_HAS_VNCSERVER
+	LIBS_DYNOPT	+= -lvncserver
+endif
+ifeq ($(call try-cc,$(SOURCE_VNCSERVER),$(FLAGS_VNCSERVER) -static),y)
+	OBJS_STATOPT	+= ui/vnc.o
+	CFLAGS_STATOPT	+= -DCONFIG_HAS_VNCSERVER
+	LIBS_STATOPT	+= -lvncserver
+endif
+
+FLAGS_SDL := $(CFLAGS) -lSDL
+ifeq ($(call try-cc,$(SOURCE_SDL),$(FLAGS_SDL)),y)
+	OBJS_DYNOPT	+= ui/sdl.o
+	CFLAGS_DYNOPT	+= -DCONFIG_HAS_SDL
+	LIBS_DYNOPT	+= -lSDL
+endif
+ifeq ($(call try-cc,$(SOURCE_SDL),$(FLAGS_SDL) -static), y)
+	OBJS_STATOPT	+= ui/sdl.o
+	CFLAGS_STATOPT	+= -DCONFIG_HAS_SDL
+	LIBS_STATOPT	+= -lSDL
+endif
+
+FLAGS_ZLIB := $(CFLAGS) -lz
+ifeq ($(call try-cc,$(SOURCE_ZLIB),$(FLAGS_ZLIB)),y)
+	CFLAGS_DYNOPT	+= -DCONFIG_HAS_ZLIB
+	LIBS_DYNOPT	+= -lz
+endif
+ifeq ($(call try-cc,$(SOURCE_ZLIB),$(FLAGS_ZLIB) -static),y)
+	CFLAGS_STATOPT	+= -DCONFIG_HAS_ZLIB
+	LIBS_STATOPT	+= -lz
+endif
+
+FLAGS_AIO := $(CFLAGS) -laio
+ifeq ($(call try-cc,$(SOURCE_AIO),$(FLAGS_AIO)),y)
+	CFLAGS_DYNOPT	+= -DCONFIG_HAS_AIO
+	LIBS_DYNOPT	+= -laio
+endif
+ifeq ($(call try-cc,$(SOURCE_AIO),$(FLAGS_AIO) -static),y)
+	CFLAGS_STATOPT	+= -DCONFIG_HAS_AIO
+	LIBS_STATOPT	+= -laio
+endif
+
+ifeq ($(LTO),1)
+	FLAGS_LTO := -flto
+	ifeq ($(call try-cc,$(SOURCE_HELLO),$(FLAGS_LTO)),y)
+		CFLAGS		+= $(FLAGS_LTO)
+	endif
+endif
+
+ifneq ($(call try-build,$(SOURCE_STATIC),-static,),y)
+$(error No static libc found. Please install glibc-static package.)
+endif
+###
+
+LIBS	+= -lrt
+LIBS	+= -lpthread
+LIBS	+= -lutil
+
+
+DEPS	:= $(patsubst %.o,%.d,$(OBJS))
+
+DEFINES	+= -D_FILE_OFFSET_BITS=64
+DEFINES	+= -D_GNU_SOURCE
+DEFINES	+= -DKVMTOOLS_VERSION='"$(KVMTOOLS_VERSION)"'
+DEFINES	+= -DBUILD_ARCH='"$(ARCH)"'
+
+KVM_INCLUDE := include
+CFLAGS	+= $(CPPFLAGS) $(DEFINES) -I$(KVM_INCLUDE) -I$(ARCH_INCLUDE) -I$(KINCL_PATH)/include/uapi -I$(KINCL_PATH)/include -I$(KINCL_PATH)/arch/$(ARCH)/include/uapi -I$(KINCL_PATH)/arch/$(ARCH)/include/ -I$(KINCL_PATH)/scripts/dtc/libfdt -O2 -fno-strict-aliasing -g
+
+WARNINGS += -Wall
+WARNINGS += -Wformat=2
+WARNINGS += -Winit-self
+WARNINGS += -Wmissing-declarations
+WARNINGS += -Wmissing-prototypes
+WARNINGS += -Wnested-externs
+WARNINGS += -Wno-system-headers
+WARNINGS += -Wold-style-definition
+WARNINGS += -Wredundant-decls
+WARNINGS += -Wsign-compare
+WARNINGS += -Wstrict-prototypes
+WARNINGS += -Wundef
+WARNINGS += -Wvolatile-register-var
+WARNINGS += -Wwrite-strings
+
+CFLAGS	+= $(WARNINGS)
+
+# Some targets may use 'external' sources that don't build totally cleanly.
+CFLAGS_EASYGOING := $(CFLAGS)
+
+ifneq ($(WERROR),0)
+	CFLAGS += -Werror
+endif
+
+all: arch_support_check $(PROGRAM) $(PROGRAM_ALIAS) $(GUEST_INIT)
+
+arch_support_check:
+	$(UNSUPP_ERR)
+
+# When building -static all objects are built with appropriate flags, which
+# may differ between static & dynamic .o.  The objects are separated into
+# .o and .static.o.  See the %.o: %.c rules below.
+#
+# $(OTHEROBJS) are things that do not get substituted like this.
+#
+STATIC_OBJS = $(patsubst %.o,%.static.o,$(OBJS) $(OBJS_STATOPT))
+GUEST_OBJS = guest/guest_init.o
+
+$(PROGRAM)-static:  $(DEPS) $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_INIT)
+	$(E) "  LINK    " $@
+	$(Q) $(CC) -static $(CFLAGS) $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_OBJS) $(LIBS) $(LIBS_STATOPT) -o $@
+
+$(PROGRAM): $(DEPS) $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_INIT)
+	$(E) "  LINK    " $@
+	$(Q) $(CC) $(CFLAGS) $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_OBJS) $(LIBS) $(LIBS_DYNOPT) -o $@
+
+$(PROGRAM_ALIAS): $(PROGRAM)
+	$(E) "  LN      " $@
+	$(Q) ln -f $(PROGRAM) $@
+
+$(GUEST_INIT): guest/init.c
+	$(E) "  LINK    " $@
+	$(Q) $(CC) -static guest/init.c -o $@
+	$(Q) $(LD) $(LDFLAGS) -r -b binary -o guest/guest_init.o $(GUEST_INIT)
+
+$(DEPS):
+
+util/rbtree.d: ../../lib/rbtree.c
+	$(Q) $(CC) -M -MT util/rbtree.o $(CFLAGS) $< -o $@
+
+%.d: %.c
+	$(Q) $(CC) -M -MT $(patsubst %.d,%.o,$@) $(CFLAGS) $< -o $@
+
+%.s: %.c
+	$(Q) $(CC) -o $@ -S $(CFLAGS) -fverbose-asm $<
+
+# The header file common-cmds.h is needed for compilation of builtin-help.c.
+builtin-help.d: $(KVM_INCLUDE)/common-cmds.h
+
+$(OBJS):
+
+# This rule relaxes the -Werror on libfdt, since for now it still has
+# a bunch of warnings. :(
+../../scripts/dtc/libfdt/%.o: ../../scripts/dtc/libfdt/%.c
+ifeq ($(C),1)
+	$(E) "  CHECK   " $@
+	$(Q) $(CHECK) -c $(CFLAGS_EASYGOING) $< -o $@
+endif
+	$(E) "  CC      " $@
+	$(Q) $(CC) -c $(CFLAGS_EASYGOING) $< -o $@
+
+util/rbtree.static.o util/rbtree.o: ../../lib/rbtree.c
+ifeq ($(C),1)
+	$(E) "  CHECK   " $@
+	$(Q) $(CHECK) -c $(CFLAGS) $< -o $@
+endif
+	$(E) "  CC      " $@
+	$(Q) $(CC) -c $(CFLAGS) $< -o $@
+
+%.static.o: %.c
+ifeq ($(C),1)
+	$(E) "  CHECK   " $@
+	$(Q) $(CHECK) -c $(CFLAGS) $(CFLAGS_STATOPT) $< -o $@
+endif
+	$(E) "  CC      " $@
+	$(Q) $(CC) -c $(CFLAGS) $(CFLAGS_STATOPT)  $< -o $@
+
+%.o: %.c
+ifeq ($(C),1)
+	$(E) "  CHECK   " $@
+	$(Q) $(CHECK) -c $(CFLAGS) $(CFLAGS_DYNOPT) $< -o $@
+endif
+	$(E) "  CC      " $@
+	$(Q) $(CC) -c $(CFLAGS) $(CFLAGS_DYNOPT) $< -o $@
+
+
+$(KVM_INCLUDE)/common-cmds.h: util/generate-cmdlist.sh command-list.txt
+
+$(KVM_INCLUDE)/common-cmds.h: $(wildcard Documentation/kvm-*.txt)
+	$(E) "  GEN     " $@
+	$(Q) util/generate-cmdlist.sh > $@+ && mv $@+ $@
+
+#
+# BIOS assembly weirdness
+#
+BIOS_CFLAGS += -m32
+BIOS_CFLAGS += -march=i386
+BIOS_CFLAGS += -mregparm=3
+
+BIOS_CFLAGS += -fno-stack-protector
+BIOS_CFLAGS += -I../../arch/$(ARCH)
+
+x86/bios.o: x86/bios/bios.bin x86/bios/bios-rom.h
+
+x86/bios/bios.bin.elf: x86/bios/entry.S x86/bios/memcpy.c x86/bios/e820.c x86/bios/int10.c x86/bios/int15.c x86/bios/rom.ld.S
+	$(E) "  CC       x86/bios/memcpy.o"
+	$(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/memcpy.c -o x86/bios/memcpy.o
+	$(E) "  CC       x86/bios/e820.o"
+	$(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/e820.c -o x86/bios/e820.o
+	$(E) "  CC       x86/bios/int10.o"
+	$(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/int10.c -o x86/bios/int10.o
+	$(E) "  CC       x86/bios/int15.o"
+	$(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/int15.c -o x86/bios/int15.o
+	$(E) "  CC       x86/bios/entry.o"
+	$(Q) $(CC) $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/entry.S -o x86/bios/entry.o
+	$(E) "  LD      " $@
+	$(Q) $(LD) -T x86/bios/rom.ld.S -o x86/bios/bios.bin.elf x86/bios/memcpy.o x86/bios/entry.o x86/bios/e820.o x86/bios/int10.o x86/bios/int15.o
+
+x86/bios/bios.bin: x86/bios/bios.bin.elf
+	$(E) "  OBJCOPY " $@
+	$(Q) objcopy -O binary -j .text x86/bios/bios.bin.elf x86/bios/bios.bin
+
+x86/bios/bios-rom.o: x86/bios/bios-rom.S x86/bios/bios.bin x86/bios/bios-rom.h
+	$(E) "  CC      " $@
+	$(Q) $(CC) -c $(CFLAGS) x86/bios/bios-rom.S -o x86/bios/bios-rom.o
+
+x86/bios/bios-rom.h: x86/bios/bios.bin.elf
+	$(E) "  NM      " $@
+	$(Q) cd x86/bios && sh gen-offsets.sh > bios-rom.h && cd ..
+
+check: all
+	$(MAKE) -C tests
+	./$(PROGRAM) run tests/pit/tick.bin
+	./$(PROGRAM) run -d tests/boot/boot_test.iso -p "init=init"
+.PHONY: check
+
+install: all
+	$(E) "  INSTALL"
+	$(Q) $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
+	$(Q) $(INSTALL) $(PROGRAM) '$(DESTDIR_SQ)$(bindir_SQ)'
+.PHONY: install
+
+clean:
+	$(E) "  CLEAN"
+	$(Q) rm -f x86/bios/*.bin
+	$(Q) rm -f x86/bios/*.elf
+	$(Q) rm -f x86/bios/*.o
+	$(Q) rm -f x86/bios/bios-rom.h
+	$(Q) rm -f tests/boot/boot_test.iso
+	$(Q) rm -rf tests/boot/rootfs/
+	$(Q) rm -f $(DEPS) $(OBJS) $(OTHEROBJS) $(OBJS_DYNOPT) $(STATIC_OBJS) $(PROGRAM) $(PROGRAM_ALIAS) $(PROGRAM)-static $(GUEST_INIT) $(GUEST_OBJS)
+	$(Q) rm -f cscope.*
+	$(Q) rm -f tags
+	$(Q) rm -f TAGS
+	$(Q) rm -f $(KVM_INCLUDE)/common-cmds.h
+	$(Q) rm -f KVMTOOLS-VERSION-FILE
+.PHONY: clean
+
+KVM_DEV	?= /dev/kvm
+
+$(KVM_DEV):
+	$(E) "  MKNOD " $@
+	$(Q) mknod $@ char 10 232
+
+devices: $(KVM_DEV)
+.PHONY: devices
+
+TAGS:
+	$(E) "  GEN" $@
+	$(Q) $(RM) -f TAGS
+	$(Q) $(FIND) . -name '*.[hcS]' -print | xargs etags -a
+.PHONY: TAGS
+
+tags:
+	$(E) "  GEN" $@
+	$(Q) $(RM) -f tags
+	$(Q) $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
+.PHONY: tags
+
+cscope:
+	$(E) "  GEN" $@
+	$(Q) $(FIND) . -name '*.[hcS]' -print > cscope.files
+	$(Q) $(CSCOPE) -bkqu
+.PHONY: cscope
+
+#
+# Avoid redundant work when cleaning up
+ifneq ($(MAKECMDGOALS),clean)
+-include $(DEPS)
+
+KVMTOOLS-VERSION-FILE:
+	@$(SHELL_PATH) util/KVMTOOLS-VERSION-GEN $(OUTPUT)
+-include $(OUTPUT)KVMTOOLS-VERSION-FILE
+endif
diff --git a/tools/kvm/README b/tools/kvm/README
new file mode 100644
index 0000000..97747d6
--- /dev/null
+++ b/tools/kvm/README
@@ -0,0 +1,112 @@
+Native Linux KVM tool
+=====================
+The goal of this tool is to provide a clean, from-scratch, lightweight
+KVM host tool implementation that can boot Linux guest images (just a
+hobby, won't be big and professional like QEMU) with no BIOS
+dependencies and with only the minimal amount of legacy device
+emulation.
+
+It's great as a learning tool if you want to get your feet wet in
+virtualization land: it's only 5 KLOC of clean C code that can already
+boot a guest Linux image.
+
+Right now it can boot a Linux image and provide you with output via a
+serial console over the host terminal, i.e. you can use it to boot a
+guest Linux image in a terminal or over ssh and log into the guest
+without much guest- or host-side setup work.
+
+1. To try out the tool, clone the git repository:
+
+  git clone git://github.com/penberg/linux-kvm.git
+
+or alternatively, if you already have a kernel source tree:
+
+  git remote add kvm-tool git://github.com/penberg/linux-kvm.git
+  git remote update
+  git checkout kvm-tool/master -b kvm-tool
+
+2. Compile the tool:
+
+  cd tools/kvm && make
+
+3. Download a raw userspace image:
+
+  wget http://wiki.qemu.org/download/linux-0.2.img.bz2 && bunzip2
+linux-0.2.img.bz2
+
+4. The guest kernel has to be built with the following configuration:
+
+ - For the default console output:
+	CONFIG_SERIAL_8250=y
+	CONFIG_SERIAL_8250_CONSOLE=y
+
+ - For running 32bit images on 64bit hosts:
+	CONFIG_IA32_EMULATION=y
+
+ - Proper FS options according to image FS (e.g. CONFIG_EXT2_FS, CONFIG_EXT4_FS).
+
+ - For all virtio devices listed below:
+	CONFIG_VIRTIO=y
+	CONFIG_VIRTIO_RING=y
+	CONFIG_VIRTIO_PCI=y
+
+ - For virtio-blk devices (--disk, -d):
+	CONFIG_VIRTIO_BLK=y
+
+ - For virtio-net devices ([--network, -n] virtio):
+	CONFIG_VIRTIO_NET=y
+
+ - For virtio-9p devices (--virtio-9p):
+	CONFIG_NET_9P=y
+	CONFIG_NET_9P_VIRTIO=y
+	CONFIG_9P_FS=y
+
+ - For virtio-balloon device (--balloon):
+	CONFIG_VIRTIO_BALLOON=y
+
+ - For virtio-console device (--console virtio):
+	CONFIG_VIRTIO_CONSOLE=y
+
+ - For virtio-rng device (--rng):
+	CONFIG_HW_RANDOM_VIRTIO=y
+
+ - For vesa device (--sdl or --vnc):
+	CONFIG_FB_VESA=y
+
+
+5. And finally, launch the hypervisor:
+
+  ./lkvm run --disk linux-0.2.img \
+	    --kernel ../../arch/x86/boot/bzImage
+or
+
+  sudo ./lkvm run --disk linux-0.2.img \
+		 --kernel ../../arch/x86/boot/bzImage \
+		 --network virtio
+
+The tool has been written by Pekka Enberg, Cyrill Gorcunov, Asias He,
+Sasha Levin and Prasad Joshi. Special thanks to Avi Kivity for his help
+on KVM internals and Ingo Molnar for all-around support and encouragement!
+
+See the following thread for original discussion for motivation of this
+project:
+
+http://thread.gmane.org/gmane.linux.kernel/962051/focus=962620
+
+Build dependencies
+=====================
+For deb based systems:
+32-bit:
+sudo apt-get install build-essential
+64-bit:
+sudo apt-get install build-essential libc6-dev-i386
+
+For rpm based systems:
+32-bit:
+yum install glibc-devel
+64-bit:
+yum install glibc-devel glibc-static
+
+On 64-bit Arch Linux make sure the multilib repository is enabled in your
+/etc/pacman.conf and run
+pacman -Sy lib32-glibc
diff --git a/tools/kvm/arm/aarch32/arm-cpu.c b/tools/kvm/arm/aarch32/arm-cpu.c
new file mode 100644
index 0000000..71b98fe
--- /dev/null
+++ b/tools/kvm/arm/aarch32/arm-cpu.c
@@ -0,0 +1,42 @@
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+
+#include "arm-common/gic.h"
+#include "arm-common/timer.h"
+
+#include <linux/byteorder.h>
+#include <linux/types.h>
+
+static void generate_fdt_nodes(void *fdt, struct kvm *kvm, u32 gic_phandle)
+{
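+	/* PPIs used by the arch timer: secure, non-secure, virtual, hypervisor. */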
+	int timer_interrupts[4] = {13, 14, 11, 10};
+
+	gic__generate_fdt_nodes(fdt, gic_phandle);
+	timer__generate_fdt_nodes(fdt, kvm, timer_interrupts);
+}
+
+static int arm_cpu__vcpu_init(struct kvm_cpu *vcpu)
+{
+	vcpu->generate_fdt_nodes = generate_fdt_nodes;
+	return 0;
+}
+
+static struct kvm_arm_target target_cortex_a15 = {
+	.id		= KVM_ARM_TARGET_CORTEX_A15,
+	.compatible	= "arm,cortex-a15",
+	.init		= arm_cpu__vcpu_init,
+};
+
+static struct kvm_arm_target target_cortex_a7 = {
+	.id		= KVM_ARM_TARGET_CORTEX_A7,
+	.compatible	= "arm,cortex-a7",
+	.init		= arm_cpu__vcpu_init,
+};
+
+static int arm_cpu__core_init(struct kvm *kvm)
+{
+	return (kvm_cpu__register_kvm_arm_target(&target_cortex_a15) ||
+		kvm_cpu__register_kvm_arm_target(&target_cortex_a7));
+}
+core_init(arm_cpu__core_init);
diff --git a/tools/kvm/arm/aarch32/include/kvm/barrier.h b/tools/kvm/arm/aarch32/include/kvm/barrier.h
new file mode 100644
index 0000000..94913a9
--- /dev/null
+++ b/tools/kvm/arm/aarch32/include/kvm/barrier.h
@@ -0,0 +1,10 @@
+#ifndef KVM__KVM_BARRIER_H
+#define KVM__KVM_BARRIER_H
+
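+/* ARMv7: one full data memory barrier serves for all three flavours. */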
+#define dmb()	asm volatile ("dmb" : : : "memory")
+
+#define mb()	dmb()
+#define rmb()	dmb()
+#define wmb()	dmb()
+
+#endif /* KVM__KVM_BARRIER_H */
diff --git a/tools/kvm/arm/aarch32/include/kvm/kvm-arch.h b/tools/kvm/arm/aarch32/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..1632e3c
--- /dev/null
+++ b/tools/kvm/arm/aarch32/include/kvm/kvm-arch.h
@@ -0,0 +1,13 @@
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#define ARM_GIC_DIST_SIZE	0x1000
+#define ARM_GIC_CPUI_SIZE	0x2000
+
+#define ARM_KERN_OFFSET(...)	0x8000
+
+#define ARM_MAX_MEMORY(...)	ARM_LOMAP_MAX_MEMORY
+
+#include "arm-common/kvm-arch.h"
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/tools/kvm/arm/aarch32/include/kvm/kvm-config-arch.h b/tools/kvm/arm/aarch32/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..acf0d23
--- /dev/null
+++ b/tools/kvm/arm/aarch32/include/kvm/kvm-config-arch.h
@@ -0,0 +1,8 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+#define ARM_OPT_ARCH_RUN(...)
+
+#include "arm-common/kvm-config-arch.h"
+
+#endif /* KVM__KVM_CONFIG_ARCH_H */
diff --git a/tools/kvm/arm/aarch32/include/kvm/kvm-cpu-arch.h b/tools/kvm/arm/aarch32/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..d28ea67
--- /dev/null
+++ b/tools/kvm/arm/aarch32/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,16 @@
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+#include "kvm/kvm.h"
+
+#include "arm-common/kvm-cpu-arch.h"
+
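+/* CPU0 boots running; secondary cores start powered off awaiting PSCI. */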
+#define ARM_VCPU_FEATURE_FLAGS(kvm, cpuid)	{			\
+	[0] = (!!(cpuid) << KVM_ARM_VCPU_POWER_OFF),			\
+}
+
+#define ARM_MPIDR_HWID_BITMASK	0xFFFFFF
+#define ARM_CPU_ID		0, 0, 0
+#define ARM_CPU_ID_MPIDR	5
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/tools/kvm/arm/aarch32/kvm-cpu.c b/tools/kvm/arm/aarch32/kvm-cpu.c
new file mode 100644
index 0000000..464b473
--- /dev/null
+++ b/tools/kvm/arm/aarch32/kvm-cpu.c
@@ -0,0 +1,145 @@
+#include "kvm/kvm-cpu.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio.h"
+
+#include <asm/ptrace.h>
+
+#define ARM_CORE_REG(x)	(KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | \
+			 KVM_REG_ARM_CORE_REG(x))
+
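+/* Encode a 32-bit CP15 register (op1, CRn, CRm, op2) as a KVM ONE_REG id. */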
+#define ARM_CP15_REG_SHIFT_MASK(x,n)		\
+	(((x) << KVM_REG_ARM_ ## n ## _SHIFT) & KVM_REG_ARM_ ## n ## _MASK)
+
+#define __ARM_CP15_REG(op1,crn,crm,op2)			\
+	(KVM_REG_ARM | KVM_REG_SIZE_U32		|	\
+	 (15 << KVM_REG_ARM_COPROC_SHIFT)	|	\
+	 ARM_CP15_REG_SHIFT_MASK(op1, OPC1)	|	\
+	 ARM_CP15_REG_SHIFT_MASK(crn, 32_CRN)	|	\
+	 ARM_CP15_REG_SHIFT_MASK(crm, CRM)	|	\
+	 ARM_CP15_REG_SHIFT_MASK(op2, 32_OPC2))
+
+#define ARM_CP15_REG(...)	__ARM_CP15_REG(__VA_ARGS__)
+
+unsigned long kvm_cpu__get_vcpu_mpidr(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u32 mpidr;
+
+	reg.id = ARM_CP15_REG(ARM_CPU_ID, ARM_CPU_ID_MPIDR);
+	reg.addr = (u64)(unsigned long)&mpidr;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (get_mpidr vcpu%ld)", vcpu->cpu_id);
+
+	return mpidr;
+}
+
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+	struct kvm *kvm	= vcpu->kvm;
+	struct kvm_one_reg reg;
+	u32 data;
+
+	/* Who said future-proofing was a good idea? */
+	reg.addr = (u64)(unsigned long)&data;
+
+	/* cpsr = IRQs/FIQs masked */
+	data	= PSR_I_BIT | PSR_F_BIT | SVC_MODE;
+	reg.id	= ARM_CORE_REG(usr_regs.ARM_cpsr);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (cpsr)");
+
+	/* Secondary cores are stopped awaiting PSCI wakeup */
+	if (vcpu->cpu_id != 0)
+		return;
+
+	/* r0 = 0 */
+	data	= 0;
+	reg.id	= ARM_CORE_REG(usr_regs.ARM_r0);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r0)");
+
+	/* r1 = machine type (-1) */
+	data	= -1;
+	reg.id	= ARM_CORE_REG(usr_regs.ARM_r1);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r1)");
+
+	/* r2 = physical address of the device tree blob */
+	data	= kvm->arch.dtb_guest_start;
+	reg.id	= ARM_CORE_REG(usr_regs.ARM_r2);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r2)");
+
+	/* pc = start of kernel image */
+	data	= kvm->arch.kern_guest_start;
+	reg.id	= ARM_CORE_REG(usr_regs.ARM_pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (pc)");
+}
+
+int kvm_cpu__get_endianness(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u32 data;
+
+	reg.id = ARM_CORE_REG(usr_regs.ARM_cpsr);
+	reg.addr = (u64)(unsigned long)&data;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (cpsr)");
+
+	return (data & PSR_E_BIT) ? VIRTIO_ENDIAN_BE : VIRTIO_ENDIAN_LE;
+}
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u32 data;
+	int debug_fd = kvm_cpu__get_debug_fd();
+
+	reg.addr = (u64)(unsigned long)&data;
+
+	dprintf(debug_fd, "\n*pc:\n");
+	reg.id = ARM_CORE_REG(usr_regs.ARM_pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (show_code @ PC)");
+
+	kvm__dump_mem(vcpu->kvm, data, 32, debug_fd);
+
+	dprintf(debug_fd, "\n*lr (svc):\n");
+	reg.id = ARM_CORE_REG(svc_regs[1]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (show_code @ LR_svc)");
+	data &= ~0x1;
+
+	kvm__dump_mem(vcpu->kvm, data, 32, debug_fd);
+}
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u32 data;
+	int debug_fd = kvm_cpu__get_debug_fd();
+
+	reg.addr	= (u64)(unsigned long)&data;
+	dprintf(debug_fd, "\n Registers:\n");
+
+	reg.id		= ARM_CORE_REG(usr_regs.ARM_pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (pc)");
+	dprintf(debug_fd, " PC:    0x%x\n", data);
+
+	reg.id		= ARM_CORE_REG(usr_regs.ARM_cpsr);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (cpsr)");
+	dprintf(debug_fd, " CPSR:  0x%x\n", data);
+
+	reg.id		= ARM_CORE_REG(svc_regs[0]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (SP_svc)");
+	dprintf(debug_fd, " SP_svc:  0x%x\n", data);
+
+	reg.id		= ARM_CORE_REG(svc_regs[1]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (LR_svc)");
+	dprintf(debug_fd, " LR_svc:  0x%x\n", data);
+}
diff --git a/tools/kvm/arm/aarch64/arm-cpu.c b/tools/kvm/arm/aarch64/arm-cpu.c
new file mode 100644
index 0000000..ce5ea2f
--- /dev/null
+++ b/tools/kvm/arm/aarch64/arm-cpu.c
@@ -0,0 +1,50 @@
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+
+#include "arm-common/gic.h"
+#include "arm-common/timer.h"
+
+#include <linux/byteorder.h>
+#include <linux/types.h>
+
+static void generate_fdt_nodes(void *fdt, struct kvm *kvm, u32 gic_phandle)
+{
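+	/* PPIs used by the arch timer: secure, non-secure, virtual, hypervisor. */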
+	int timer_interrupts[4] = {13, 14, 11, 10};
+	gic__generate_fdt_nodes(fdt, gic_phandle);
+	timer__generate_fdt_nodes(fdt, kvm, timer_interrupts);
+}
+
+
+static int arm_cpu__vcpu_init(struct kvm_cpu *vcpu)
+{
+	vcpu->generate_fdt_nodes = generate_fdt_nodes;
+	return 0;
+}
+
+static struct kvm_arm_target target_aem_v8 = {
+	.id		= KVM_ARM_TARGET_AEM_V8,
+	.compatible	= "arm,arm-v8",
+	.init		= arm_cpu__vcpu_init,
+};
+
+static struct kvm_arm_target target_foundation_v8 = {
+	.id		= KVM_ARM_TARGET_FOUNDATION_V8,
+	.compatible	= "arm,arm-v8",
+	.init		= arm_cpu__vcpu_init,
+};
+
+static struct kvm_arm_target target_cortex_a57 = {
+	.id		= KVM_ARM_TARGET_CORTEX_A57,
+	.compatible	= "arm,cortex-a57",
+	.init		= arm_cpu__vcpu_init,
+};
+
+static int arm_cpu__core_init(struct kvm *kvm)
+{
+	return (kvm_cpu__register_kvm_arm_target(&target_aem_v8) ||
+		kvm_cpu__register_kvm_arm_target(&target_foundation_v8) ||
+		kvm_cpu__register_kvm_arm_target(&target_cortex_a57));
+}
+core_init(arm_cpu__core_init);
diff --git a/tools/kvm/arm/aarch64/include/kvm/barrier.h b/tools/kvm/arm/aarch64/include/kvm/barrier.h
new file mode 100644
index 0000000..97ab252
--- /dev/null
+++ b/tools/kvm/arm/aarch64/include/kvm/barrier.h
@@ -0,0 +1,8 @@
+#ifndef KVM__KVM_BARRIER_H
+#define KVM__KVM_BARRIER_H
+
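+/* AArch64: inner-shareable barriers, with weaker load-only/store-only forms. */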
+#define mb()	asm volatile ("dmb ish"		: : : "memory")
+#define rmb()	asm volatile ("dmb ishld"	: : : "memory")
+#define wmb()	asm volatile ("dmb ishst"	: : : "memory")
+
+#endif /* KVM__KVM_BARRIER_H */
diff --git a/tools/kvm/arm/aarch64/include/kvm/kvm-arch.h b/tools/kvm/arm/aarch64/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..2f08a26
--- /dev/null
+++ b/tools/kvm/arm/aarch64/include/kvm/kvm-arch.h
@@ -0,0 +1,17 @@
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#define ARM_GIC_DIST_SIZE	0x10000
+#define ARM_GIC_CPUI_SIZE	0x10000
+
+#define ARM_KERN_OFFSET(kvm)	((kvm)->cfg.arch.aarch32_guest	?	\
+				0x8000				:	\
+				0x80000)
+
+#define ARM_MAX_MEMORY(kvm)	((kvm)->cfg.arch.aarch32_guest	?	\
+				ARM_LOMAP_MAX_MEMORY		:	\
+				ARM_HIMAP_MAX_MEMORY)
+
+#include "arm-common/kvm-arch.h"
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/tools/kvm/arm/aarch64/include/kvm/kvm-config-arch.h b/tools/kvm/arm/aarch64/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..89860ae
--- /dev/null
+++ b/tools/kvm/arm/aarch64/include/kvm/kvm-config-arch.h
@@ -0,0 +1,10 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+#define ARM_OPT_ARCH_RUN(cfg)						\
+	OPT_BOOLEAN('\0', "aarch32", &(cfg)->aarch32_guest,		\
+			"Run AArch32 guest"),
+
+#include "arm-common/kvm-config-arch.h"
+
+#endif /* KVM__KVM_CONFIG_ARCH_H */
diff --git a/tools/kvm/arm/aarch64/include/kvm/kvm-cpu-arch.h b/tools/kvm/arm/aarch64/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..2ffa7ad
--- /dev/null
+++ b/tools/kvm/arm/aarch64/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,19 @@
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+#include "kvm/kvm.h"
+
+#include "arm-common/kvm-cpu-arch.h"
+
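+/* Secondaries start powered off; EL1_32BIT makes the guest run AArch32. */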
+#define ARM_VCPU_FEATURE_FLAGS(kvm, cpuid)	{				\
+	[0] = ((!!(cpuid) << KVM_ARM_VCPU_POWER_OFF) |				\
+	       (!!(kvm)->cfg.arch.aarch32_guest << KVM_ARM_VCPU_EL1_32BIT))	\
+}
+
+#define ARM_MPIDR_HWID_BITMASK	0xFF00FFFFFFUL
+#define ARM_CPU_ID		3, 0, 0, 0
+#define ARM_CPU_ID_MPIDR	5
+#define ARM_CPU_CTRL		3, 0, 1, 0
+#define ARM_CPU_CTRL_SCTLR_EL1	0
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/tools/kvm/arm/aarch64/kvm-cpu.c b/tools/kvm/arm/aarch64/kvm-cpu.c
new file mode 100644
index 0000000..71a2a3a
--- /dev/null
+++ b/tools/kvm/arm/aarch64/kvm-cpu.c
@@ -0,0 +1,230 @@
+#include "kvm/kvm-cpu.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio.h"
+
+#include <asm/ptrace.h>
+
+#define COMPAT_PSR_F_BIT	0x00000040
+#define COMPAT_PSR_I_BIT	0x00000080
+#define COMPAT_PSR_E_BIT	0x00000200
+#define COMPAT_PSR_MODE_SVC	0x00000013
+
+#define SCTLR_EL1_E0E_MASK	(1 << 24)
+#define SCTLR_EL1_EE_MASK	(1 << 25)
+
+#define ARM64_CORE_REG(x)	(KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
+				 KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
+
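+/* Encode an AArch64 system register (op0, op1, CRn, CRm, op2) as a ONE_REG id. */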
+#define ARM64_SYS_REG_SHIFT_MASK(x,n)				\
+	(((x) << KVM_REG_ARM64_SYSREG_ ## n ## _SHIFT) &	\
+	 KVM_REG_ARM64_SYSREG_ ## n ## _MASK)
+
+#define __ARM64_SYS_REG(op0,op1,crn,crm,op2)			\
+	(KVM_REG_ARM64 | KVM_REG_SIZE_U64		|	\
+	 KVM_REG_ARM64_SYSREG				|	\
+	 ARM64_SYS_REG_SHIFT_MASK(op0, OP0)		|	\
+	 ARM64_SYS_REG_SHIFT_MASK(op1, OP1)		|	\
+	 ARM64_SYS_REG_SHIFT_MASK(crn, CRN)		|	\
+	 ARM64_SYS_REG_SHIFT_MASK(crm, CRM)		|	\
+	 ARM64_SYS_REG_SHIFT_MASK(op2, OP2))
+
+#define ARM64_SYS_REG(...)	__ARM64_SYS_REG(__VA_ARGS__)
+
+unsigned long kvm_cpu__get_vcpu_mpidr(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u64 mpidr;
+
+	reg.id = ARM64_SYS_REG(ARM_CPU_ID, ARM_CPU_ID_MPIDR);
+	reg.addr = (u64)&mpidr;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (get_mpidr vcpu%ld)", vcpu->cpu_id);
+
+	return mpidr;
+}
+
+static void reset_vcpu_aarch32(struct kvm_cpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_one_reg reg;
+	u64 data;
+
+	reg.addr = (u64)&data;
+
+	/* pstate = all interrupts masked */
+	data	= COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT | COMPAT_PSR_MODE_SVC;
+	reg.id	= ARM64_CORE_REG(regs.pstate);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (spsr[EL1])");
+
+	/* Secondary cores are stopped awaiting PSCI wakeup */
+	if (vcpu->cpu_id != 0)
+		return;
+
+	/* r0 = 0 */
+	data	= 0;
+	reg.id	= ARM64_CORE_REG(regs.regs[0]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r0)");
+
+	/* r1 = machine type (-1) */
+	data	= -1;
+	reg.id	= ARM64_CORE_REG(regs.regs[1]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r1)");
+
+	/* r2 = physical address of the device tree blob */
+	data	= kvm->arch.dtb_guest_start;
+	reg.id	= ARM64_CORE_REG(regs.regs[2]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r2)");
+
+	/* pc = start of kernel image */
+	data	= kvm->arch.kern_guest_start;
+	reg.id	= ARM64_CORE_REG(regs.pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (pc)");
+}
+
+static void reset_vcpu_aarch64(struct kvm_cpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_one_reg reg;
+	u64 data;
+
+	reg.addr = (u64)&data;
+
+	/* pstate = all interrupts masked */
+	data	= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL1h;
+	reg.id	= ARM64_CORE_REG(regs.pstate);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (spsr[EL1])");
+
+	/* x1...x3 = 0 */
+	data	= 0;
+	reg.id	= ARM64_CORE_REG(regs.regs[1]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (x1)");
+
+	reg.id	= ARM64_CORE_REG(regs.regs[2]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (x2)");
+
+	reg.id	= ARM64_CORE_REG(regs.regs[3]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (x3)");
+
+	/* Secondary cores are stopped awaiting PSCI wakeup */
+	if (vcpu->cpu_id == 0) {
+		/* x0 = physical address of the device tree blob */
+		data	= kvm->arch.dtb_guest_start;
+		reg.id	= ARM64_CORE_REG(regs.regs[0]);
+		if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+			die_perror("KVM_SET_ONE_REG failed (x0)");
+
+		/* pc = start of kernel image */
+		data	= kvm->arch.kern_guest_start;
+		reg.id	= ARM64_CORE_REG(regs.pc);
+		if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+			die_perror("KVM_SET_ONE_REG failed (pc)");
+	}
+}
+
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+	if (vcpu->kvm->cfg.arch.aarch32_guest)
+		return reset_vcpu_aarch32(vcpu);
+	else
+		return reset_vcpu_aarch64(vcpu);
+}
+
+int kvm_cpu__get_endianness(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u64 psr;
+	u64 sctlr;
+
+	/*
+	 * Quoting the definition given by Peter Maydell:
+	 *
+	 * "Endianness of the CPU which does the virtio reset at the
+	 * point when it does that reset"
+	 *
+	 * We first check for an AArch32 guest: its endianness can
+	 * change when using SETEND, which affects the CPSR.E bit.
+	 *
+	 * If we're AArch64, use SCTLR_EL1.E0E if access comes from
+	 * EL0, and SCTLR_EL1.EE if access comes from EL1.
+	 */
+	reg.id = ARM64_CORE_REG(regs.pstate);
+	reg.addr = (u64)&psr;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (spsr[EL1])");
+
+	if (psr & PSR_MODE32_BIT)
+		return (psr & COMPAT_PSR_E_BIT) ? VIRTIO_ENDIAN_BE : VIRTIO_ENDIAN_LE;
+
+	reg.id = ARM64_SYS_REG(ARM_CPU_CTRL, ARM_CPU_CTRL_SCTLR_EL1);
+	reg.addr = (u64)&sctlr;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (SCTLR_EL1)");
+
+	if ((psr & PSR_MODE_MASK) == PSR_MODE_EL0t)
+		sctlr &= SCTLR_EL1_E0E_MASK;
+	else
+		sctlr &= SCTLR_EL1_EE_MASK;
+	return sctlr ? VIRTIO_ENDIAN_BE : VIRTIO_ENDIAN_LE;
+}
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	unsigned long data;
+	int debug_fd = kvm_cpu__get_debug_fd();
+
+	reg.addr = (u64)&data;
+
+	dprintf(debug_fd, "\n*pc:\n");
+	reg.id = ARM64_CORE_REG(regs.pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (show_code @ PC)");
+
+	kvm__dump_mem(vcpu->kvm, data, 32, debug_fd);
+
+	dprintf(debug_fd, "\n*lr:\n");
+	reg.id = ARM64_CORE_REG(regs.regs[30]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (show_code @ LR)");
+
+	kvm__dump_mem(vcpu->kvm, data, 32, debug_fd);
+}
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	unsigned long data;
+	int debug_fd = kvm_cpu__get_debug_fd();
+
+	reg.addr = (u64)&data;
+	dprintf(debug_fd, "\n Registers:\n");
+
+	reg.id		= ARM64_CORE_REG(regs.pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (pc)");
+	dprintf(debug_fd, " PC:    0x%lx\n", data);
+
+	reg.id		= ARM64_CORE_REG(regs.pstate);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (pstate)");
+	dprintf(debug_fd, " PSTATE:    0x%lx\n", data);
+
+	reg.id		= ARM64_CORE_REG(sp_el1);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (sp_el1)");
+	dprintf(debug_fd, " SP_EL1:    0x%lx\n", data);
+
+	reg.id		= ARM64_CORE_REG(regs.regs[30]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (lr)");
+	dprintf(debug_fd, " LR:    0x%lx\n", data);
+}
diff --git a/tools/kvm/arm/fdt.c b/tools/kvm/arm/fdt.c
new file mode 100644
index 0000000..186a718
--- /dev/null
+++ b/tools/kvm/arm/fdt.c
@@ -0,0 +1,278 @@
+#include "kvm/devices.h"
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/virtio-mmio.h"
+
+#include "arm-common/gic.h"
+#include "arm-common/pci.h"
+
+#include <stdbool.h>
+
+#include <asm/setup.h>
+#include <linux/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+
+static char kern_cmdline[COMMAND_LINE_SIZE];
+
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+	return false;
+}
+
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+	return 0;
+}
+
+static void dump_fdt(const char *dtb_file, void *fdt)
+{
+	int count, fd;
+
+	fd = open(dtb_file, O_CREAT | O_TRUNC | O_RDWR, 0666);
+	if (fd < 0)
+		die("Failed to write dtb to %s", dtb_file);
+
+	count = write(fd, fdt, FDT_MAX_SIZE);
+	if (count < 0)
+		die_perror("Failed to dump dtb");
+
+	pr_info("Wrote %d bytes to dtb %s\n", count, dtb_file);
+	close(fd);
+}
+
+#define CPU_NAME_MAX_LEN 8
+static void generate_cpu_nodes(void *fdt, struct kvm *kvm)
+{
+	int cpu;
+
+	_FDT(fdt_begin_node(fdt, "cpus"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x1));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x0));
+
+	for (cpu = 0; cpu < kvm->nrcpus; ++cpu) {
+		char cpu_name[CPU_NAME_MAX_LEN];
+		struct kvm_cpu *vcpu = kvm->cpus[cpu];
+		unsigned long mpidr = kvm_cpu__get_vcpu_mpidr(vcpu);
+
+		mpidr &= ARM_MPIDR_HWID_BITMASK;
+		snprintf(cpu_name, CPU_NAME_MAX_LEN, "cpu@%lx", mpidr);
+
+		_FDT(fdt_begin_node(fdt, cpu_name));
+		_FDT(fdt_property_string(fdt, "device_type", "cpu"));
+		_FDT(fdt_property_string(fdt, "compatible", vcpu->cpu_compatible));
+
+		if (kvm->nrcpus > 1)
+			_FDT(fdt_property_string(fdt, "enable-method", "psci"));
+
+		_FDT(fdt_property_cell(fdt, "reg", mpidr));
+		_FDT(fdt_end_node(fdt));
+	}
+
+	_FDT(fdt_end_node(fdt));
+}
+
+static void generate_irq_prop(void *fdt, u8 irq)
+{
+	u32 irq_prop[] = {
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_SPI),
+		cpu_to_fdt32(irq - GIC_SPI_IRQ_BASE),
+		cpu_to_fdt32(GIC_FDT_IRQ_FLAGS_EDGE_LO_HI),
+	};
+
+	_FDT(fdt_property(fdt, "interrupts", irq_prop, sizeof(irq_prop)));
+}
+
+static int setup_fdt(struct kvm *kvm)
+{
+	struct device_header *dev_hdr;
+	u8 staging_fdt[FDT_MAX_SIZE];
+	u32 gic_phandle		= fdt__alloc_phandle();
+	u64 mem_reg_prop[]	= {
+		cpu_to_fdt64(kvm->arch.memory_guest_start),
+		cpu_to_fdt64(kvm->ram_size),
+	};
+	void *fdt		= staging_fdt;
+	void *fdt_dest		= guest_flat_to_host(kvm,
+						     kvm->arch.dtb_guest_start);
+	void (*generate_mmio_fdt_nodes)(void *, struct device_header *,
+					void (*)(void *, u8));
+	void (*generate_cpu_peripheral_fdt_nodes)(void *, struct kvm *, u32)
+					= kvm->cpus[0]->generate_fdt_nodes;
+
+	/* Create new tree without a reserve map */
+	_FDT(fdt_create(fdt, FDT_MAX_SIZE));
+	_FDT(fdt_finish_reservemap(fdt));
+
+	/* Header */
+	_FDT(fdt_begin_node(fdt, ""));
+	_FDT(fdt_property_cell(fdt, "interrupt-parent", gic_phandle));
+	_FDT(fdt_property_string(fdt, "compatible", "linux,dummy-virt"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x2));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
+
+	/* /chosen */
+	_FDT(fdt_begin_node(fdt, "chosen"));
+	_FDT(fdt_property_cell(fdt, "linux,pci-probe-only", 1));
+	_FDT(fdt_property_string(fdt, "bootargs", kern_cmdline));
+
+	/* Initrd */
+	if (kvm->arch.initrd_size != 0) {
+		u64 ird_st_prop = cpu_to_fdt64(kvm->arch.initrd_guest_start);
+		u64 ird_end_prop = cpu_to_fdt64(kvm->arch.initrd_guest_start +
+					       kvm->arch.initrd_size);
+
+		_FDT(fdt_property(fdt, "linux,initrd-start",
+				   &ird_st_prop, sizeof(ird_st_prop)));
+		_FDT(fdt_property(fdt, "linux,initrd-end",
+				   &ird_end_prop, sizeof(ird_end_prop)));
+	}
+	_FDT(fdt_end_node(fdt));
+
+	/* Memory */
+	_FDT(fdt_begin_node(fdt, "memory"));
+	_FDT(fdt_property_string(fdt, "device_type", "memory"));
+	_FDT(fdt_property(fdt, "reg", mem_reg_prop, sizeof(mem_reg_prop)));
+	_FDT(fdt_end_node(fdt));
+
+	/* CPU and peripherals (interrupt controller, timers, etc) */
+	generate_cpu_nodes(fdt, kvm);
+	if (generate_cpu_peripheral_fdt_nodes)
+		generate_cpu_peripheral_fdt_nodes(fdt, kvm, gic_phandle);
+
+	/* Virtio MMIO devices */
+	dev_hdr = device__first_dev(DEVICE_BUS_MMIO);
+	while (dev_hdr) {
+		generate_mmio_fdt_nodes = dev_hdr->data;
+		generate_mmio_fdt_nodes(fdt, dev_hdr, generate_irq_prop);
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	/* IOPORT devices (!) */
+	dev_hdr = device__first_dev(DEVICE_BUS_IOPORT);
+	while (dev_hdr) {
+		generate_mmio_fdt_nodes = dev_hdr->data;
+		generate_mmio_fdt_nodes(fdt, dev_hdr, generate_irq_prop);
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	/* PCI host controller */
+	pci__generate_fdt_nodes(fdt, gic_phandle);
+
+	/* PSCI firmware */
+	_FDT(fdt_begin_node(fdt, "psci"));
+	_FDT(fdt_property_string(fdt, "compatible", "arm,psci"));
+	_FDT(fdt_property_string(fdt, "method", "hvc"));
+	_FDT(fdt_property_cell(fdt, "cpu_suspend", KVM_PSCI_FN_CPU_SUSPEND));
+	_FDT(fdt_property_cell(fdt, "cpu_off", KVM_PSCI_FN_CPU_OFF));
+	_FDT(fdt_property_cell(fdt, "cpu_on", KVM_PSCI_FN_CPU_ON));
+	_FDT(fdt_property_cell(fdt, "migrate", KVM_PSCI_FN_MIGRATE));
+	_FDT(fdt_end_node(fdt));
+
+	/* Finalise. */
+	_FDT(fdt_end_node(fdt));
+	_FDT(fdt_finish(fdt));
+
+	_FDT(fdt_open_into(fdt, fdt_dest, FDT_MAX_SIZE));
+	_FDT(fdt_pack(fdt_dest));
+
+	if (kvm->cfg.arch.dump_dtb_filename)
+		dump_fdt(kvm->cfg.arch.dump_dtb_filename, fdt_dest);
+	return 0;
+}
+late_init(setup_fdt);
+
+static int read_image(int fd, void **pos, void *limit)
+{
+	int count;
+
+	while (((count = xread(fd, *pos, SZ_64K)) > 0) && *pos <= limit)
+		*pos += count;
+
+	if (count < 0)
+		die_perror("xread");
+
+	return *pos < limit ? 0 : -ENOMEM;
+}
+
+#define FDT_ALIGN	SZ_2M
+#define INITRD_ALIGN	4
+int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd,
+		     const char *kernel_cmdline)
+{
+	void *pos, *kernel_end, *limit;
+	unsigned long guest_addr;
+
+	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	/*
+	 * Linux requires the initrd and dtb to be mapped inside lowmem,
+	 * so we can't just place them at the top of memory.
+	 */
+	limit = kvm->ram_start + min(kvm->ram_size, (u64)SZ_256M) - 1;
+
+	pos = kvm->ram_start + ARM_KERN_OFFSET(kvm);
+	kvm->arch.kern_guest_start = host_to_guest_flat(kvm, pos);
+	if (read_image(fd_kernel, &pos, limit) == -ENOMEM)
+		die("kernel image too big to fit in guest memory.");
+
+	kernel_end = pos;
+	pr_info("Loaded kernel to 0x%llx (%llu bytes)",
+		kvm->arch.kern_guest_start,
+		host_to_guest_flat(kvm, pos) - kvm->arch.kern_guest_start);
+
+	/*
+	 * Now load backwards from the end of memory so the kernel
+	 * decompressor has plenty of space to work with. First up is
+	 * the device tree blob...
+	 */
+	pos = limit;
+	pos -= (FDT_MAX_SIZE + FDT_ALIGN);
+	guest_addr = ALIGN(host_to_guest_flat(kvm, pos), FDT_ALIGN);
+	pos = guest_flat_to_host(kvm, guest_addr);
+	if (pos < kernel_end)
+		die("fdt overlaps with kernel image.");
+
+	kvm->arch.dtb_guest_start = guest_addr;
+	pr_info("Placing fdt at 0x%llx - 0x%llx",
+		kvm->arch.dtb_guest_start,
+		host_to_guest_flat(kvm, limit));
+	limit = pos;
+
+	/* ... and finally the initrd, if we have one. */
+	if (fd_initrd != -1) {
+		struct stat sb;
+		unsigned long initrd_start;
+
+		if (lseek(fd_initrd, 0, SEEK_SET) < 0)
+			die_perror("lseek");
+
+		if (fstat(fd_initrd, &sb))
+			die_perror("fstat");
+
+		pos -= (sb.st_size + INITRD_ALIGN);
+		guest_addr = ALIGN(host_to_guest_flat(kvm, pos), INITRD_ALIGN);
+		pos = guest_flat_to_host(kvm, guest_addr);
+		if (pos < kernel_end)
+			die("initrd overlaps with kernel image.");
+
+		initrd_start = guest_addr;
+		if (read_image(fd_initrd, &pos, limit) == -ENOMEM)
+			die("initrd too big to fit in guest memory.");
+
+		kvm->arch.initrd_guest_start = initrd_start;
+		kvm->arch.initrd_size = host_to_guest_flat(kvm, pos) - initrd_start;
+		pr_info("Loaded initrd to 0x%llx (%llu bytes)",
+			kvm->arch.initrd_guest_start,
+			kvm->arch.initrd_size);
+	} else {
+		kvm->arch.initrd_size = 0;
+	}
+
+	strncpy(kern_cmdline, kernel_cmdline, COMMAND_LINE_SIZE);
+	kern_cmdline[COMMAND_LINE_SIZE - 1] = '\0';
+
+	return true;
+}
diff --git a/tools/kvm/arm/gic.c b/tools/kvm/arm/gic.c
new file mode 100644
index 0000000..5d8cbe6
--- /dev/null
+++ b/tools/kvm/arm/gic.c
@@ -0,0 +1,80 @@
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio.h"
+
+#include "arm-common/gic.h"
+
+#include <linux/byteorder.h>
+#include <linux/kvm.h>
+
+int gic__init_irqchip(struct kvm *kvm)
+{
+	int err;
+	struct kvm_arm_device_addr gic_addr[] = {
+		[0] = {
+			.id = KVM_VGIC_V2_ADDR_TYPE_DIST |
+			(KVM_ARM_DEVICE_VGIC_V2 << KVM_ARM_DEVICE_ID_SHIFT),
+			.addr = ARM_GIC_DIST_BASE,
+		},
+		[1] = {
+			.id = KVM_VGIC_V2_ADDR_TYPE_CPU |
+			(KVM_ARM_DEVICE_VGIC_V2 << KVM_ARM_DEVICE_ID_SHIFT),
+			.addr = ARM_GIC_CPUI_BASE,
+		}
+	};
+
+	if (kvm->nrcpus > GIC_MAX_CPUS) {
+		pr_warning("%d CPUS greater than maximum of %d -- truncating\n",
+				kvm->nrcpus, GIC_MAX_CPUS);
+		kvm->nrcpus = GIC_MAX_CPUS;
+	}
+
+	err = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
+	if (err)
+		return err;
+
+	err = ioctl(kvm->vm_fd, KVM_ARM_SET_DEVICE_ADDR, &gic_addr[0]);
+	if (err)
+		return err;
+
+	err = ioctl(kvm->vm_fd, KVM_ARM_SET_DEVICE_ADDR, &gic_addr[1]);
+	return err;
+}
+
+void gic__generate_fdt_nodes(void *fdt, u32 phandle)
+{
+	u64 reg_prop[] = {
+		cpu_to_fdt64(ARM_GIC_DIST_BASE), cpu_to_fdt64(ARM_GIC_DIST_SIZE),
+		cpu_to_fdt64(ARM_GIC_CPUI_BASE), cpu_to_fdt64(ARM_GIC_CPUI_SIZE),
+	};
+
+	_FDT(fdt_begin_node(fdt, "intc"));
+	_FDT(fdt_property_string(fdt, "compatible", "arm,cortex-a15-gic"));
+	_FDT(fdt_property_cell(fdt, "#interrupt-cells", GIC_FDT_IRQ_NUM_CELLS));
+	_FDT(fdt_property(fdt, "interrupt-controller", NULL, 0));
+	_FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop)));
+	_FDT(fdt_property_cell(fdt, "phandle", phandle));
+	_FDT(fdt_end_node(fdt));
+}
+
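+/* Build the KVM_IRQ_LINE 'irq' field for a shared peripheral interrupt (SPI). */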
+#define KVM_IRQCHIP_IRQ(x) (KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT) |\
+			   ((x) & KVM_ARM_IRQ_NUM_MASK)
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+	struct kvm_irq_level irq_level = {
+		.irq	= KVM_IRQCHIP_IRQ(irq),
+		.level	= !!level,
+	};
+
+	if (irq < GIC_SPI_IRQ_BASE || irq > GIC_MAX_IRQ)
+		pr_warning("Ignoring invalid GIC IRQ %d", irq);
+	else if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0)
+		pr_warning("Could not KVM_IRQ_LINE for irq %d", irq);
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+	kvm__irq_line(kvm, irq, VIRTIO_IRQ_HIGH);
+	kvm__irq_line(kvm, irq, VIRTIO_IRQ_LOW);
+}
diff --git a/tools/kvm/arm/include/arm-common/gic.h b/tools/kvm/arm/include/arm-common/gic.h
new file mode 100644
index 0000000..850edc7
--- /dev/null
+++ b/tools/kvm/arm/include/arm-common/gic.h
@@ -0,0 +1,35 @@
+#ifndef ARM_COMMON__GIC_H
+#define ARM_COMMON__GIC_H
+
+#define GIC_SGI_IRQ_BASE		0
+#define GIC_PPI_IRQ_BASE		16
+#define GIC_SPI_IRQ_BASE		32
+
+#define GIC_FDT_IRQ_NUM_CELLS		3
+
+#define GIC_FDT_IRQ_TYPE_SPI		0
+#define GIC_FDT_IRQ_TYPE_PPI		1
+
+#define GIC_FDT_IRQ_FLAGS_EDGE_LO_HI	1
+#define GIC_FDT_IRQ_FLAGS_EDGE_HI_LO	2
+#define GIC_FDT_IRQ_FLAGS_LEVEL_HI	4
+#define GIC_FDT_IRQ_FLAGS_LEVEL_LO	8
+
+#define GIC_FDT_IRQ_PPI_CPU_SHIFT	8
+#define GIC_FDT_IRQ_PPI_CPU_MASK	(0xff << GIC_FDT_IRQ_PPI_CPU_SHIFT)
+
+#define GIC_CPUI_CTLR_EN		(1 << 0)
+#define GIC_CPUI_PMR_MIN_PRIO		0xff
+
+#define GIC_CPUI_OFF_PMR		4
+
+#define GIC_MAX_CPUS			8
+#define GIC_MAX_IRQ			255
+
+struct kvm;
+
+int gic__alloc_irqnum(void);
+int gic__init_irqchip(struct kvm *kvm);
+void gic__generate_fdt_nodes(void *fdt, u32 phandle);
+
+#endif /* ARM_COMMON__GIC_H */
diff --git a/tools/kvm/arm/include/arm-common/kvm-arch.h b/tools/kvm/arm/include/arm-common/kvm-arch.h
new file mode 100644
index 0000000..082131d
--- /dev/null
+++ b/tools/kvm/arm/include/arm-common/kvm-arch.h
@@ -0,0 +1,78 @@
+#ifndef ARM_COMMON__KVM_ARCH_H
+#define ARM_COMMON__KVM_ARCH_H
+
+#include <stdbool.h>
+#include <linux/const.h>
+#include <linux/types.h>
+
+#include "arm-common/gic.h"
+
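+/*
+ * Guest physical layout: I/O ports at 0, virtio-MMIO above them, the
+ * GIC just below the AXI bus at 1GB, PCI from 1GB and RAM from 2GB up.
+ */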
+#define ARM_IOPORT_AREA		_AC(0x0000000000000000, UL)
+#define ARM_MMIO_AREA		_AC(0x0000000000010000, UL)
+#define ARM_AXI_AREA		_AC(0x0000000040000000, UL)
+#define ARM_MEMORY_AREA		_AC(0x0000000080000000, UL)
+
+#define ARM_LOMAP_MAX_MEMORY	((1ULL << 32) - ARM_MEMORY_AREA)
+#define ARM_HIMAP_MAX_MEMORY	((1ULL << 40) - ARM_MEMORY_AREA)
+
+#define ARM_GIC_DIST_BASE	(ARM_AXI_AREA - ARM_GIC_DIST_SIZE)
+#define ARM_GIC_CPUI_BASE	(ARM_GIC_DIST_BASE - ARM_GIC_CPUI_SIZE)
+#define ARM_GIC_SIZE		(ARM_GIC_DIST_SIZE + ARM_GIC_CPUI_SIZE)
+
+#define ARM_IOPORT_SIZE		(ARM_MMIO_AREA - ARM_IOPORT_AREA)
+#define ARM_VIRTIO_MMIO_SIZE	(ARM_AXI_AREA - (ARM_MMIO_AREA + ARM_GIC_SIZE))
+#define ARM_PCI_CFG_SIZE	(1ULL << 24)
+#define ARM_PCI_MMIO_SIZE	(ARM_MEMORY_AREA - \
+				(ARM_AXI_AREA + ARM_PCI_CFG_SIZE))
+
+#define KVM_IOPORT_AREA		ARM_IOPORT_AREA
+#define KVM_PCI_CFG_AREA	ARM_AXI_AREA
+#define KVM_PCI_MMIO_AREA	(KVM_PCI_CFG_AREA + ARM_PCI_CFG_SIZE)
+#define KVM_VIRTIO_MMIO_AREA	ARM_MMIO_AREA
+
+#define KVM_IRQ_OFFSET		GIC_SPI_IRQ_BASE
+
+#define KVM_VM_TYPE		0
+
+#define VIRTIO_DEFAULT_TRANS(kvm)	\
+	((kvm)->cfg.arch.virtio_trans_pci ? VIRTIO_PCI : VIRTIO_MMIO)
+
+#define VIRTIO_RING_ENDIAN	(VIRTIO_ENDIAN_LE | VIRTIO_ENDIAN_BE)
+
+static inline bool arm_addr_in_ioport_region(u64 phys_addr)
+{
+	u64 limit = KVM_IOPORT_AREA + ARM_IOPORT_SIZE;
+	return phys_addr >= KVM_IOPORT_AREA && phys_addr < limit;
+}
+
+static inline bool arm_addr_in_virtio_mmio_region(u64 phys_addr)
+{
+	u64 limit = KVM_VIRTIO_MMIO_AREA + ARM_VIRTIO_MMIO_SIZE;
+	return phys_addr >= KVM_VIRTIO_MMIO_AREA && phys_addr < limit;
+}
+
+static inline bool arm_addr_in_pci_region(u64 phys_addr)
+{
+	u64 limit = KVM_PCI_CFG_AREA + ARM_PCI_CFG_SIZE + ARM_PCI_MMIO_SIZE;
+	return phys_addr >= KVM_PCI_CFG_AREA && phys_addr < limit;
+}
+
+struct kvm_arch {
+	/*
+	 * We may have to align the guest memory for virtio, so keep the
+	 * original pointers here for munmap.
+	 */
+	void	*ram_alloc_start;
+	u64	ram_alloc_size;
+
+	/*
+	 * Guest addresses for memory layout.
+	 */
+	u64	memory_guest_start;
+	u64	kern_guest_start;
+	u64	initrd_guest_start;
+	u64	initrd_size;
+	u64	dtb_guest_start;
+};
+
+#endif /* ARM_COMMON__KVM_ARCH_H */
diff --git a/tools/kvm/arm/include/arm-common/kvm-config-arch.h b/tools/kvm/arm/include/arm-common/kvm-config-arch.h
new file mode 100644
index 0000000..a8ebd94
--- /dev/null
+++ b/tools/kvm/arm/include/arm-common/kvm-config-arch.h
@@ -0,0 +1,26 @@
+#ifndef ARM_COMMON__KVM_CONFIG_ARCH_H
+#define ARM_COMMON__KVM_CONFIG_ARCH_H
+
+#include "kvm/parse-options.h"
+
+struct kvm_config_arch {
+	const char	*dump_dtb_filename;
+	unsigned int	force_cntfrq;
+	bool		virtio_trans_pci;
+	bool		aarch32_guest;
+};
+
+#define OPT_ARCH_RUN(pfx, cfg)							\
+	pfx,									\
+	ARM_OPT_ARCH_RUN(cfg)							\
+	OPT_STRING('\0', "dump-dtb", &(cfg)->dump_dtb_filename,			\
+		   ".dtb file", "Dump generated .dtb to specified file"),	\
+	OPT_UINTEGER('\0', "override-bad-firmware-cntfrq", &(cfg)->force_cntfrq,\
+		     "Specify Generic Timer frequency in guest DT to "		\
+		     "work around buggy secure firmware *Firmware should be "	\
+		     "updated to program CNTFRQ correctly*"),			\
+	OPT_BOOLEAN('\0', "force-pci", &(cfg)->virtio_trans_pci,		\
+		    "Force virtio devices to use PCI as their default "		\
+		    "transport"),
+
+#endif /* ARM_COMMON__KVM_CONFIG_ARCH_H */
diff --git a/tools/kvm/arm/include/arm-common/kvm-cpu-arch.h b/tools/kvm/arm/include/arm-common/kvm-cpu-arch.h
new file mode 100644
index 0000000..83cd8b8
--- /dev/null
+++ b/tools/kvm/arm/include/arm-common/kvm-cpu-arch.h
@@ -0,0 +1,50 @@
+#ifndef ARM_COMMON__KVM_CPU_ARCH_H
+#define ARM_COMMON__KVM_CPU_ARCH_H
+
+#include <linux/kvm.h>
+#include <pthread.h>
+#include <stdbool.h>
+
+struct kvm;
+
+struct kvm_cpu {
+	pthread_t	thread;
+
+	unsigned long	cpu_id;
+	unsigned long	cpu_type;
+	const char	*cpu_compatible;
+
+	struct kvm	*kvm;
+	int		vcpu_fd;
+	struct kvm_run	*kvm_run;
+
+	u8		is_running;
+	u8		paused;
+	u8		needs_nmi;
+
+	struct kvm_coalesced_mmio_ring	*ring;
+
+	void		(*generate_fdt_nodes)(void *fdt, struct kvm* kvm,
+					      u32 gic_phandle);
+};
+
+struct kvm_arm_target {
+	u32		id;
+	const char 	*compatible;
+	int		(*init)(struct kvm_cpu *vcpu);
+};
+
+int kvm_cpu__register_kvm_arm_target(struct kvm_arm_target *target);
+
+static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data,
+				       int direction, int size, u32 count)
+{
+	return false;
+}
+
+bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data,
+			   u32 len, u8 is_write);
+
+unsigned long kvm_cpu__get_vcpu_mpidr(struct kvm_cpu *vcpu);
+
+#endif /* ARM_COMMON__KVM_CPU_ARCH_H */
diff --git a/tools/kvm/arm/include/arm-common/pci.h b/tools/kvm/arm/include/arm-common/pci.h
new file mode 100644
index 0000000..ee87725
--- /dev/null
+++ b/tools/kvm/arm/include/arm-common/pci.h
@@ -0,0 +1,6 @@
+#ifndef ARM_COMMON__PCI_H
+#define ARM_COMMON__PCI_H
+
+void pci__generate_fdt_nodes(void *fdt, u32 gic_phandle);
+
+#endif /* ARM_COMMON__PCI_H */
diff --git a/tools/kvm/arm/include/arm-common/timer.h b/tools/kvm/arm/include/arm-common/timer.h
new file mode 100644
index 0000000..928e9ea
--- /dev/null
+++ b/tools/kvm/arm/include/arm-common/timer.h
@@ -0,0 +1,6 @@
+#ifndef ARM_COMMON__TIMER_H
+#define ARM_COMMON__TIMER_H
+
+void timer__generate_fdt_nodes(void *fdt, struct kvm *kvm, int *irqs);
+
+#endif /* ARM_COMMON__TIMER_H */
diff --git a/tools/kvm/arm/ioport.c b/tools/kvm/arm/ioport.c
new file mode 100644
index 0000000..bdd30b6
--- /dev/null
+++ b/tools/kvm/arm/ioport.c
@@ -0,0 +1,11 @@
+#include "kvm/ioport.h"
+#include "kvm/irq.h"
+
+void ioport__setup_arch(struct kvm *kvm)
+{
+}
+
+void ioport__map_irq(u8 *irq)
+{
+	*irq = irq__alloc_line();
+}
diff --git a/tools/kvm/arm/irq.c b/tools/kvm/arm/irq.c
new file mode 100644
index 0000000..d8f44df
--- /dev/null
+++ b/tools/kvm/arm/irq.c
@@ -0,0 +1,9 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg)
+{
+	die(__FUNCTION__);
+	return 0;
+}
diff --git a/tools/kvm/arm/kvm-cpu.c b/tools/kvm/arm/kvm-cpu.c
new file mode 100644
index 0000000..aeaa4cf
--- /dev/null
+++ b/tools/kvm/arm/kvm-cpu.c
@@ -0,0 +1,119 @@
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+	debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+	return debug_fd;
+}
+
+static struct kvm_arm_target *kvm_arm_targets[KVM_ARM_NUM_TARGETS];
+int kvm_cpu__register_kvm_arm_target(struct kvm_arm_target *target)
+{
+	unsigned int i = 0;
+
+	for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) {
+		if (!kvm_arm_targets[i]) {
+			kvm_arm_targets[i] = target;
+			return 0;
+		}
+	}
+
+	return -ENOSPC;
+}
+
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id)
+{
+	struct kvm_arm_target *target;
+	struct kvm_cpu *vcpu;
+	int coalesced_offset, mmap_size, err = -1;
+	unsigned int i;
+	struct kvm_vcpu_init vcpu_init = {
+		.features = ARM_VCPU_FEATURE_FLAGS(kvm, cpu_id)
+	};
+
+	vcpu = calloc(1, sizeof(struct kvm_cpu));
+	if (!vcpu)
+		return NULL;
+
+	vcpu->vcpu_fd = ioctl(kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+	if (vcpu->vcpu_fd < 0)
+		die_perror("KVM_CREATE_VCPU ioctl");
+
+	mmap_size = ioctl(kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+	if (mmap_size < 0)
+		die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+	vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED,
+			     vcpu->vcpu_fd, 0);
+	if (vcpu->kvm_run == MAP_FAILED)
+		die("unable to mmap vcpu fd");
+
+	/* Find an appropriate target CPU type. */
+	for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) {
+		if (!kvm_arm_targets[i])
+			continue;
+		target = kvm_arm_targets[i];
+		vcpu_init.target = target->id;
+		err = ioctl(vcpu->vcpu_fd, KVM_ARM_VCPU_INIT, &vcpu_init);
+		if (!err)
+			break;
+	}
+
+	if (err || target->init(vcpu))
+		die("Unable to initialise ARM vcpu");
+
+	coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION,
+				 KVM_CAP_COALESCED_MMIO);
+	if (coalesced_offset > 0)
+		vcpu->ring = (void *)vcpu->kvm_run +
+			     (coalesced_offset * PAGE_SIZE);
+
+	/* Populate the vcpu structure. */
+	vcpu->kvm		= kvm;
+	vcpu->cpu_id		= cpu_id;
+	vcpu->cpu_type		= target->id;
+	vcpu->cpu_compatible	= target->compatible;
+	vcpu->is_running	= true;
+	return vcpu;
+}
+
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+	free(vcpu);
+}
+
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+	return false;
+}
+
+bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data,
+			   u32 len, u8 is_write)
+{
+	if (arm_addr_in_virtio_mmio_region(phys_addr)) {
+		return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
+	} else if (arm_addr_in_ioport_region(phys_addr)) {
+		int direction = is_write ? KVM_EXIT_IO_OUT : KVM_EXIT_IO_IN;
+		u16 port = (phys_addr - KVM_IOPORT_AREA) & USHRT_MAX;
+		return kvm__emulate_io(vcpu, port, data, direction, len, 1);
+	} else if (arm_addr_in_pci_region(phys_addr)) {
+		return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
+	}
+
+	return false;
+}
+
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+}
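
kvm_cpu__emulate_mmio() folds legacy port I/O into the MMIO path by subtracting the base of the ioport window and truncating to 16 bits. The translation in isolation; DEMO_IOPORT_AREA is a placeholder, the real base comes from the arch headers:

#include <assert.h>
#include <limits.h>
#include <stdint.h>

#define DEMO_IOPORT_AREA 0x1000000ULL	/* placeholder window base */

static uint16_t demo_addr_to_port(uint64_t phys_addr)
{
	/* The offset within the window becomes the port number;
	 * masking with USHRT_MAX keeps only the low 16 bits. */
	return (phys_addr - DEMO_IOPORT_AREA) & USHRT_MAX;
}

int main(void)
{
	/* A serial port mapped at window base + 0x3f8. */
	assert(demo_addr_to_port(DEMO_IOPORT_AREA + 0x3f8) == 0x3f8);
	return 0;
}
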
diff --git a/tools/kvm/arm/kvm.c b/tools/kvm/arm/kvm.c
new file mode 100644
index 0000000..58ad9fa
--- /dev/null
+++ b/tools/kvm/arm/kvm.c
@@ -0,0 +1,87 @@
+#include "kvm/kvm.h"
+#include "kvm/term.h"
+#include "kvm/util.h"
+#include "kvm/8250-serial.h"
+#include "kvm/virtio-console.h"
+
+#include "arm-common/gic.h"
+
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/sizes.h>
+
+struct kvm_ext kvm_req_ext[] = {
+	{ DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) },
+	{ DEFINE_KVM_EXT(KVM_CAP_ONE_REG) },
+	{ DEFINE_KVM_EXT(KVM_CAP_ARM_PSCI) },
+	{ 0, 0 },
+};
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+	/* The KVM capability check is enough. */
+	return true;
+}
+
+void kvm__init_ram(struct kvm *kvm)
+{
+	int err;
+	u64 phys_start, phys_size;
+	void *host_mem;
+
+	phys_start	= ARM_MEMORY_AREA;
+	phys_size	= kvm->ram_size;
+	host_mem	= kvm->ram_start;
+
+	err = kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+	if (err)
+		die("Failed to register %lld bytes of memory at physical "
+		    "address 0x%llx [err %d]", phys_size, phys_start, err);
+
+	kvm->arch.memory_guest_start = phys_start;
+}
+
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+	munmap(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size);
+}
+
+void kvm__arch_read_term(struct kvm *kvm)
+{
+	if (term_readable(0)) {
+		serial8250__update_consoles(kvm);
+		virtio_console__inject_interrupt(kvm);
+	}
+}
+
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+}
+
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+	/*
+	 * Allocate guest memory. We must align our buffer to 64K to
+	 * match the maximum guest page size for virtio-mmio.
+	 * If using THP, then our minimal alignment becomes 2M.
+	 * 2M trumps 64K, so let's go with that.
+	 */
+	kvm->ram_size = min(ram_size, (u64)ARM_MAX_MEMORY(kvm));
+	kvm->arch.ram_alloc_size = kvm->ram_size + SZ_2M;
+	kvm->arch.ram_alloc_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path,
+						kvm->arch.ram_alloc_size);
+
+	if (kvm->arch.ram_alloc_start == MAP_FAILED)
+		die("Failed to map %lld bytes for guest memory (%d)",
+		    kvm->arch.ram_alloc_size, errno);
+
+	kvm->ram_start = (void *)ALIGN((unsigned long)kvm->arch.ram_alloc_start,
+					SZ_2M);
+
+	madvise(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size,
+		MADV_MERGEABLE | MADV_HUGEPAGE);
+
+	/* Initialise the virtual GIC. */
+	if (gic__init_irqchip(kvm))
+		die("Failed to initialise virtual GIC");
+}
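
The over-allocation above works because mapping ram_size + 2M guarantees that a 2M-aligned address exists within the first 2M of the mapping, leaving ram_size usable bytes behind it. The arithmetic, checked standalone with local copies of SZ_2M and ALIGN():

#include <assert.h>
#include <stdint.h>

#define DEMO_SZ_2M		(2ULL << 20)
#define DEMO_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* Pretend mmap() returned this page-aligned but not
	 * 2M-aligned start for a ram_size + 2M allocation. */
	uint64_t alloc_start = 0x7f0000001000ULL;
	uint64_t ram_start = DEMO_ALIGN(alloc_start, DEMO_SZ_2M);

	/* The aligned start is less than 2M into the mapping, so
	 * ram_size bytes still fit behind it. */
	assert(ram_start >= alloc_start);
	assert(ram_start - alloc_start < DEMO_SZ_2M);
	return 0;
}
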
diff --git a/tools/kvm/arm/pci.c b/tools/kvm/arm/pci.c
new file mode 100644
index 0000000..9f4dabc
--- /dev/null
+++ b/tools/kvm/arm/pci.c
@@ -0,0 +1,117 @@
+#include "kvm/devices.h"
+#include "kvm/fdt.h"
+#include "kvm/of_pci.h"
+#include "kvm/pci.h"
+#include "kvm/util.h"
+
+#include "arm-common/pci.h"
+
+/*
+ * An entry in the interrupt-map table looks like:
+ * <pci unit address> <pci interrupt pin> <gic phandle> <gic interrupt>
+ */
+
+struct of_gic_irq {
+	u32 type, num, flags;
+} __attribute__((packed));
+
+struct of_interrupt_map_entry {
+	struct of_pci_irq_mask		pci_irq_mask;
+	u32				gic_phandle;
+	struct of_gic_irq		gic_irq;
+} __attribute__((packed));
+
+void pci__generate_fdt_nodes(void *fdt, u32 gic_phandle)
+{
+	struct device_header *dev_hdr;
+	struct of_interrupt_map_entry irq_map[OF_PCI_IRQ_MAP_MAX];
+	unsigned nentries = 0;
+	/* Bus range */
+	u32 bus_range[] = { cpu_to_fdt32(0), cpu_to_fdt32(1), };
+	/* Configuration Space */
+	u64 cfg_reg_prop[] = { cpu_to_fdt64(KVM_PCI_CFG_AREA),
+			       cpu_to_fdt64(ARM_PCI_CFG_SIZE), };
+	/* Describe the memory ranges */
+	struct of_pci_ranges_entry ranges[] = {
+		{
+			.pci_addr = {
+				.hi	= cpu_to_fdt32(of_pci_b_ss(OF_PCI_SS_IO)),
+				.mid	= 0,
+				.lo	= 0,
+			},
+			.cpu_addr	= cpu_to_fdt64(KVM_IOPORT_AREA),
+			.length		= cpu_to_fdt64(ARM_IOPORT_SIZE),
+		},
+		{
+			.pci_addr = {
+				.hi	= cpu_to_fdt32(of_pci_b_ss(OF_PCI_SS_M32)),
+				.mid	= cpu_to_fdt32(KVM_PCI_MMIO_AREA >> 32),
+				.lo	= cpu_to_fdt32(KVM_PCI_MMIO_AREA),
+			},
+			.cpu_addr	= cpu_to_fdt64(KVM_PCI_MMIO_AREA),
+			.length		= cpu_to_fdt64(ARM_PCI_MMIO_SIZE),
+		},
+	};
+
+	/* Boilerplate PCI properties */
+	_FDT(fdt_begin_node(fdt, "pci"));
+	_FDT(fdt_property_string(fdt, "device_type", "pci"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x3));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
+	_FDT(fdt_property_cell(fdt, "#interrupt-cells", 0x1));
+	_FDT(fdt_property_string(fdt, "compatible", "pci-host-cam-generic"));
+
+	_FDT(fdt_property(fdt, "bus-range", bus_range, sizeof(bus_range)));
+	_FDT(fdt_property(fdt, "reg", &cfg_reg_prop, sizeof(cfg_reg_prop)));
+	_FDT(fdt_property(fdt, "ranges", ranges, sizeof(ranges)));
+
+	/* Generate the interrupt map ... */
+	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
+	while (dev_hdr && nentries < ARRAY_SIZE(irq_map)) {
+		struct of_interrupt_map_entry *entry = &irq_map[nentries];
+		struct pci_device_header *pci_hdr = dev_hdr->data;
+		u8 dev_num = dev_hdr->dev_num;
+		u8 pin = pci_hdr->irq_pin;
+		u8 irq = pci_hdr->irq_line;
+
+		*entry = (struct of_interrupt_map_entry) {
+			.pci_irq_mask = {
+				.pci_addr = {
+					.hi	= cpu_to_fdt32(of_pci_b_ddddd(dev_num)),
+					.mid	= 0,
+					.lo	= 0,
+				},
+				.pci_pin	= cpu_to_fdt32(pin),
+			},
+			.gic_phandle	= cpu_to_fdt32(gic_phandle),
+			.gic_irq = {
+				.type	= cpu_to_fdt32(GIC_FDT_IRQ_TYPE_SPI),
+				.num	= cpu_to_fdt32(irq - GIC_SPI_IRQ_BASE),
+				.flags	= cpu_to_fdt32(GIC_FDT_IRQ_FLAGS_EDGE_LO_HI),
+			},
+		};
+
+		nentries++;
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	_FDT(fdt_property(fdt, "interrupt-map", irq_map,
+			  sizeof(struct of_interrupt_map_entry) * nentries));
+
+	/* ... and the corresponding mask. */
+	if (nentries) {
+		struct of_pci_irq_mask irq_mask = {
+			.pci_addr = {
+				.hi	= cpu_to_fdt32(of_pci_b_ddddd(-1)),
+				.mid	= 0,
+				.lo	= 0,
+			},
+			.pci_pin	= cpu_to_fdt32(7),
+		};
+
+		_FDT(fdt_property(fdt, "interrupt-map-mask", &irq_mask,
+				  sizeof(irq_mask)));
+	}
+
+	_FDT(fdt_end_node(fdt));
+}
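
Reading the loop above is easier with one interrupt-map row laid out flat: the PCI side contributes a unit address (device number encoded in phys.hi) plus a pin, the GIC side a three-cell specifier. A simplified host-endian sketch; the real code byte-swaps every cell with cpu_to_fdt32(), and the constants here are placeholders:

#include <stdint.h>
#include <stdio.h>

/* Device number in bits 15:11 of the unit address, as in the real
 * of_pci_b_ddddd() encoding; simplified, no fdt byte swapping. */
#define DEMO_B_DDDDD(dev)	(((uint32_t)(dev) & 0x1f) << 11)

struct demo_irq_map_row {
	uint32_t pci_hi, pci_mid, pci_lo;	/* PCI unit address */
	uint32_t pci_pin;			/* INTA..INTD = 1..4 */
	uint32_t gic_phandle;
	uint32_t gic_type, gic_num, gic_flags;	/* GIC specifier */
};

int main(void)
{
	struct demo_irq_map_row row = {
		.pci_hi		= DEMO_B_DDDDD(2),	/* device 2, fn 0 */
		.pci_pin	= 1,			/* INTA */
		.gic_phandle	= 1,			/* placeholder */
		.gic_type	= 0,			/* SPI */
		.gic_num	= 5,			/* irq - SPI base */
		.gic_flags	= 1,			/* edge, lo -> hi */
	};

	printf("dev %u pin %u -> SPI %u\n",
	       (row.pci_hi >> 11) & 0x1f, row.pci_pin, row.gic_num);
	return 0;
}
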
diff --git a/tools/kvm/arm/timer.c b/tools/kvm/arm/timer.c
new file mode 100644
index 0000000..209251e
--- /dev/null
+++ b/tools/kvm/arm/timer.c
@@ -0,0 +1,41 @@
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+
+#include "arm-common/gic.h"
+#include "arm-common/timer.h"
+
+void timer__generate_fdt_nodes(void *fdt, struct kvm *kvm, int *irqs)
+{
+	const char compatible[] = "arm,armv8-timer\0arm,armv7-timer";
+
+	u32 cpu_mask = (((1 << kvm->nrcpus) - 1) << GIC_FDT_IRQ_PPI_CPU_SHIFT)
+		       & GIC_FDT_IRQ_PPI_CPU_MASK;
+	u32 irq_prop[] = {
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI),
+		cpu_to_fdt32(irqs[0]),
+		cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI),
+
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI),
+		cpu_to_fdt32(irqs[1]),
+		cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI),
+
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI),
+		cpu_to_fdt32(irqs[2]),
+		cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI),
+
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI),
+		cpu_to_fdt32(irqs[3]),
+		cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI),
+	};
+
+	_FDT(fdt_begin_node(fdt, "timer"));
+	_FDT(fdt_property(fdt, "compatible", compatible, sizeof(compatible)));
+	_FDT(fdt_property(fdt, "interrupts", irq_prop, sizeof(irq_prop)));
+	_FDT(fdt_property(fdt, "always-on", NULL, 0));
+	if (kvm->cfg.arch.force_cntfrq > 0)
+		_FDT(fdt_property_cell(fdt, "clock-frequency", kvm->cfg.arch.force_cntfrq));
+	_FDT(fdt_end_node(fdt));
+}
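
The cpu_mask computation packs one bit per vCPU into the PPI affinity field of each interrupt specifier, so the timer interrupt targets every CPU. Worked in isolation, with placeholders standing in for the GIC_FDT_IRQ_PPI_CPU_* constants:

#include <assert.h>
#include <stdint.h>

#define DEMO_PPI_CPU_SHIFT	8
#define DEMO_PPI_CPU_MASK	(0xffu << DEMO_PPI_CPU_SHIFT)

int main(void)
{
	int nrcpus = 4;
	uint32_t mask = (((1u << nrcpus) - 1) << DEMO_PPI_CPU_SHIFT)
			& DEMO_PPI_CPU_MASK;

	/* Four vCPUs set bits 8..11: the PPI is delivered to all. */
	assert(mask == 0xf00);
	return 0;
}
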
diff --git a/tools/kvm/builtin-balloon.c b/tools/kvm/builtin-balloon.c
new file mode 100644
index 0000000..d158ace
--- /dev/null
+++ b/tools/kvm/builtin-balloon.c
@@ -0,0 +1,80 @@
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-balloon.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm.h>
+#include <kvm/kvm-ipc.h>
+
+static const char *instance_name;
+static u64 inflate;
+static u64 deflate;
+
+static const char * const balloon_usage[] = {
+	"lkvm balloon [-n name] [-p pid] [-i amount] [-d amount]",
+	NULL
+};
+
+static const struct option balloon_options[] = {
+	OPT_GROUP("Instance options:"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_GROUP("Balloon options:"),
+	OPT_U64('i', "inflate", &inflate, "Amount to inflate (in MB)"),
+	OPT_U64('d', "deflate", &deflate, "Amount to deflate (in MB)"),
+	OPT_END(),
+};
+
+void kvm_balloon_help(void)
+{
+	usage_with_options(balloon_usage, balloon_options);
+}
+
+static void parse_balloon_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, balloon_options, balloon_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_balloon_help();
+	}
+}
+
+int kvm_cmd_balloon(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+	int amount;
+
+	parse_balloon_options(argc, argv);
+
+	if (inflate == 0 && deflate == 0)
+		kvm_balloon_help();
+
+	if (instance_name == NULL)
+		kvm_balloon_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	if (inflate)
+		amount = inflate;
+	else if (deflate)
+		amount = -deflate;
+	else
+		kvm_balloon_help();
+
+	r = kvm_ipc__send_msg(instance, KVM_IPC_BALLOON,
+			sizeof(amount), (u8 *)&amount);
+
+	close(instance);
+
+	if (r < 0)
+		return -1;
+
+	return 0;
+}
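
The balloon payload is a single signed int: a positive megabyte count inflates, a negative one deflates, which is why -i and -d fold into one amount. The encoding on its own:

#include <assert.h>

/* One int on the wire: sign encodes direction, magnitude is MiB. */
static int demo_balloon_amount(int inflate_mb, int deflate_mb)
{
	return inflate_mb ? inflate_mb : -deflate_mb;
}

int main(void)
{
	assert(demo_balloon_amount(128, 0) == 128);	/* inflate 128M */
	assert(demo_balloon_amount(0, 64) == -64);	/* deflate 64M */
	return 0;
}

From the shell these correspond to 'lkvm balloon -n <name> -i 128' and 'lkvm balloon -n <name> -d 64'.
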
diff --git a/tools/kvm/builtin-debug.c b/tools/kvm/builtin-debug.c
new file mode 100644
index 0000000..4ae51d2
--- /dev/null
+++ b/tools/kvm/builtin-debug.c
@@ -0,0 +1,110 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-debug.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+#include <kvm/read-write.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+#define BUFFER_SIZE 100
+
+static bool all;
+static int nmi = -1;
+static bool dump;
+static const char *instance_name;
+static const char *sysrq;
+
+static const char * const debug_usage[] = {
+	"lkvm debug [--all] [-n name] [-d] [-m vcpu]",
+	NULL
+};
+
+static const struct option debug_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('d', "dump", &dump, "Generate a debug dump from guest"),
+	OPT_INTEGER('m', "nmi", &nmi, "Generate NMI on VCPU"),
+	OPT_STRING('s', "sysrq", &sysrq, "sysrq", "Inject a sysrq"),
+	OPT_GROUP("Instance options:"),
+	OPT_BOOLEAN('a', "all", &all, "Debug all instances"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_END()
+};
+
+static void parse_debug_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, debug_options, debug_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_debug_help();
+	}
+}
+
+void kvm_debug_help(void)
+{
+	usage_with_options(debug_usage, debug_options);
+}
+
+static int do_debug(const char *name, int sock)
+{
+	char buff[BUFFER_SIZE];
+	struct debug_cmd_params cmd = {.dbg_type = 0};
+	int r;
+
+	if (dump)
+		cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_DUMP;
+
+	if (nmi != -1) {
+		cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_NMI;
+		cmd.cpu = nmi;
+	}
+
+	if (sysrq) {
+		cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_SYSRQ;
+		cmd.sysrq = sysrq[0];
+	}
+
+	r = kvm_ipc__send_msg(sock, KVM_IPC_DEBUG, sizeof(cmd), (u8 *)&cmd);
+	if (r < 0)
+		return r;
+
+	if (!dump)
+		return 0;
+
+	do {
+		r = xread(sock, buff, BUFFER_SIZE);
+		if (r < 0)
+			return 0;
+		printf("%.*s", r, buff);
+	} while (r > 0);
+
+	return 0;
+}
+
+int kvm_cmd_debug(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+
+	parse_debug_options(argc, argv);
+
+	if (all)
+		return kvm__enumerate_instances(do_debug);
+
+	if (instance_name == NULL)
+		kvm_debug_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	r = do_debug(instance_name, instance);
+
+	close(instance);
+
+	return r;
+}
diff --git a/tools/kvm/builtin-help.c b/tools/kvm/builtin-help.c
new file mode 100644
index 0000000..5970fb7
--- /dev/null
+++ b/tools/kvm/builtin-help.c
@@ -0,0 +1,63 @@
+#include <stdio.h>
+#include <string.h>
+
+/* user defined headers */
+#include <common-cmds.h>
+
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-help.h>
+#include <kvm/kvm.h>
+
+
+const char kvm_usage_string[] =
+	"lkvm COMMAND [ARGS]";
+
+const char kvm_more_info_string[] =
+	"See 'lkvm help COMMAND' for more information on a specific command.";
+
+
+static void list_common_cmds_help(void)
+{
+	unsigned int i, longest = 0;
+
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		if (longest < strlen(common_cmds[i].name))
+			longest = strlen(common_cmds[i].name);
+	}
+
+	puts(" The most commonly used lkvm commands are:");
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		printf("   %-*s   ", longest, common_cmds[i].name);
+		puts(common_cmds[i].help);
+	}
+}
+
+static void kvm_help(void)
+{
+	printf("\n To start a simple non-privileged shell run '%s run'\n\n"
+		"usage: %s\n\n", KVM_BINARY_NAME, kvm_usage_string);
+	list_common_cmds_help();
+	printf("\n %s\n\n", kvm_more_info_string);
+}
+
+
+static void help_cmd(const char *cmd)
+{
+	struct cmd_struct *p;
+	p = kvm_get_command(kvm_commands, cmd);
+	if (!p)
+		kvm_help();
+	else if (p->help)
+		p->help();
+}
+
+int kvm_cmd_help(int argc, const char **argv, const char *prefix)
+{
+	if (!argv || !*argv) {
+		kvm_help();
+		return 0;
+	}
+	help_cmd(argv[0]);
+	return 0;
+}
diff --git a/tools/kvm/builtin-list.c b/tools/kvm/builtin-list.c
new file mode 100644
index 0000000..c35be93
--- /dev/null
+++ b/tools/kvm/builtin-list.c
@@ -0,0 +1,155 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-list.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <dirent.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <fcntl.h>
+
+static bool run;
+static bool rootfs;
+
+static const char * const list_usage[] = {
+	"lkvm list",
+	NULL
+};
+
+static const struct option list_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('i', "run", &run, "List running instances"),
+	OPT_BOOLEAN('r', "rootfs", &rootfs, "List rootfs instances"),
+	OPT_END()
+};
+
+#define KVM_INSTANCE_RUNNING	"running"
+#define KVM_INSTANCE_PAUSED	"paused"
+#define KVM_INSTANCE_SHUTOFF	"shut off"
+
+void kvm_list_help(void)
+{
+	usage_with_options(list_usage, list_options);
+}
+
+static pid_t get_pid(int sock)
+{
+	pid_t pid;
+	int r;
+
+	r = kvm_ipc__send(sock, KVM_IPC_PID);
+	if (r < 0)
+		return r;
+
+	r = read(sock, &pid, sizeof(pid));
+	if (r < 0)
+		return r;
+
+	return pid;
+}
+
+int get_vmstate(int sock)
+{
+	int vmstate;
+	int r;
+
+	r = kvm_ipc__send(sock, KVM_IPC_VMSTATE);
+	if (r < 0)
+		return r;
+
+	r = read(sock, &vmstate, sizeof(vmstate));
+	if (r < 0)
+		return r;
+
+	return vmstate;
+
+}
+
+static int print_guest(const char *name, int sock)
+{
+	pid_t pid;
+	int vmstate;
+
+	pid = get_pid(sock);
+	vmstate = get_vmstate(sock);
+
+	if ((int)pid < 0 || vmstate < 0)
+		return -1;
+
+	if (vmstate == KVM_VMSTATE_PAUSED)
+		printf("%5d %-20s %s\n", pid, name, KVM_INSTANCE_PAUSED);
+	else
+		printf("%5d %-20s %s\n", pid, name, KVM_INSTANCE_RUNNING);
+
+	return 0;
+}
+
+static int kvm_list_running_instances(void)
+{
+	return kvm__enumerate_instances(print_guest);
+}
+
+static int kvm_list_rootfs(void)
+{
+	DIR *dir;
+	struct dirent *dirent;
+
+	dir = opendir(kvm__get_dir());
+	if (dir == NULL)
+		return -1;
+
+	while ((dirent = readdir(dir))) {
+		if (dirent->d_type == DT_DIR &&
+			strcmp(dirent->d_name, ".") &&
+			strcmp(dirent->d_name, ".."))
+			printf("%5s %-20s %s\n", "", dirent->d_name, KVM_INSTANCE_SHUTOFF);
+	}
+
+	closedir(dir);
+
+	return 0;
+}
+
+static void parse_setup_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, list_options, list_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_list_help();
+	}
+}
+
+int kvm_cmd_list(int argc, const char **argv, const char *prefix)
+{
+	int status, r;
+
+	parse_setup_options(argc, argv);
+
+	if (!run && !rootfs)
+		run = rootfs = true;
+
+	printf("%6s %-20s %s\n", "PID", "NAME", "STATE");
+	printf("------------------------------------\n");
+
+	status = 0;
+
+	if (run) {
+		r = kvm_list_running_instances();
+		if (r < 0)
+			perror("Error listing instances");
+
+		status |= r;
+	}
+
+	if (rootfs) {
+		r = kvm_list_rootfs();
+		if (r < 0)
+			perror("Error listing rootfs");
+
+		status |= r;
+	}
+
+	return status;
+}
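
get_pid() and get_vmstate() share one request/response shape: write a command word to the instance socket, then read a fixed-size reply from it. A self-contained sketch of that round trip over a socketpair; demo_query() stands in for the kvm_ipc__send()-plus-read() sequence, and the command value is hypothetical:

#include <assert.h>
#include <stdint.h>
#include <sys/socket.h>
#include <unistd.h>

static int demo_query(int sock, uint32_t cmd, int32_t *reply)
{
	if (write(sock, &cmd, sizeof(cmd)) != sizeof(cmd))
		return -1;
	if (read(sock, reply, sizeof(*reply)) != sizeof(*reply))
		return -1;
	return 0;
}

int main(void)
{
	int sv[2];
	uint32_t cmd = 0;
	int32_t state = 1, reply = 0;	/* pretend "paused" */

	assert(socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0);

	/* Pre-seed the "server" reply so this single-threaded demo
	 * never blocks; a real server reads the command first. */
	assert(write(sv[1], &state, sizeof(state)) == sizeof(state));

	assert(demo_query(sv[0], 42 /* hypothetical command */, &reply) == 0);
	assert(reply == 1);

	/* The command word arrived on the server side. */
	assert(read(sv[1], &cmd, sizeof(cmd)) == sizeof(cmd) && cmd == 42);
	return 0;
}
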
diff --git a/tools/kvm/builtin-pause.c b/tools/kvm/builtin-pause.c
new file mode 100644
index 0000000..c08595a
--- /dev/null
+++ b/tools/kvm/builtin-pause.c
@@ -0,0 +1,88 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-pause.h>
+#include <kvm/builtin-list.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static const char *instance_name;
+
+static const char * const pause_usage[] = {
+	"lkvm pause [--all] [-n name]",
+	NULL
+};
+
+static const struct option pause_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('a', "all", &all, "Pause all instances"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_END()
+};
+
+static void parse_pause_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, pause_options, pause_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_pause_help();
+	}
+}
+
+void kvm_pause_help(void)
+{
+	usage_with_options(pause_usage, pause_options);
+}
+
+static int do_pause(const char *name, int sock)
+{
+	int r;
+	int vmstate;
+
+	vmstate = get_vmstate(sock);
+	if (vmstate < 0)
+		return vmstate;
+	if (vmstate == KVM_VMSTATE_PAUSED) {
+		printf("Guest %s is already paused.\n", name);
+		return 0;
+	}
+
+	r = kvm_ipc__send(sock, KVM_IPC_PAUSE);
+	if (r)
+		return r;
+
+	printf("Guest %s paused\n", name);
+
+	return 0;
+}
+
+int kvm_cmd_pause(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+
+	parse_pause_options(argc, argv);
+
+	if (all)
+		return kvm__enumerate_instances(do_pause);
+
+	if (instance_name == NULL)
+		kvm_pause_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	r = do_pause(instance_name, instance);
+
+	close(instance);
+
+	return r;
+}
diff --git a/tools/kvm/builtin-resume.c b/tools/kvm/builtin-resume.c
new file mode 100644
index 0000000..0e954b4
--- /dev/null
+++ b/tools/kvm/builtin-resume.c
@@ -0,0 +1,88 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-resume.h>
+#include <kvm/builtin-list.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static const char *instance_name;
+
+static const char * const resume_usage[] = {
+	"lkvm resume [--all] [-n name]",
+	NULL
+};
+
+static const struct option resume_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('a', "all", &all, "Resume all instances"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_END()
+};
+
+static void parse_resume_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, resume_options, resume_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_resume_help();
+	}
+}
+
+void kvm_resume_help(void)
+{
+	usage_with_options(resume_usage, resume_options);
+}
+
+static int do_resume(const char *name, int sock)
+{
+	int r;
+	int vmstate;
+
+	vmstate = get_vmstate(sock);
+	if (vmstate < 0)
+		return vmstate;
+	if (vmstate == KVM_VMSTATE_RUNNING) {
+		printf("Guest %s is still running.\n", name);
+		return 0;
+	}
+
+	r = kvm_ipc__send(sock, KVM_IPC_RESUME);
+	if (r)
+		return r;
+
+	printf("Guest %s resumed\n", name);
+
+	return 0;
+}
+
+int kvm_cmd_resume(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+
+	parse_resume_options(argc, argv);
+
+	if (all)
+		return kvm__enumerate_instances(do_resume);
+
+	if (instance_name == NULL)
+		kvm_resume_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	r = do_resume(instance_name, instance);
+
+	close(instance);
+
+	return r;
+}
diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c
new file mode 100644
index 0000000..1ee75ad
--- /dev/null
+++ b/tools/kvm/builtin-run.c
@@ -0,0 +1,691 @@
+#include "kvm/builtin-run.h"
+
+#include "kvm/builtin-setup.h"
+#include "kvm/virtio-balloon.h"
+#include "kvm/virtio-console.h"
+#include "kvm/parse-options.h"
+#include "kvm/8250-serial.h"
+#include "kvm/framebuffer.h"
+#include "kvm/disk-image.h"
+#include "kvm/threadpool.h"
+#include "kvm/virtio-scsi.h"
+#include "kvm/virtio-blk.h"
+#include "kvm/virtio-net.h"
+#include "kvm/virtio-rng.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/virtio-9p.h"
+#include "kvm/barrier.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/ioport.h"
+#include "kvm/symbol.h"
+#include "kvm/i8042.h"
+#include "kvm/mutex.h"
+#include "kvm/term.h"
+#include "kvm/util.h"
+#include "kvm/strbuf.h"
+#include "kvm/vesa.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/rtc.h"
+#include "kvm/sdl.h"
+#include "kvm/vnc.h"
+#include "kvm/guest_compat.h"
+#include "kvm/pci-shmem.h"
+#include "kvm/kvm-ipc.h"
+#include "kvm/builtin-debug.h"
+
+#include <linux/types.h>
+#include <linux/err.h>
+
+#include <sys/utsname.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <termios.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <stdio.h>
+
+#define MB_SHIFT		(20)
+#define KB_SHIFT		(10)
+#define GB_SHIFT		(30)
+
+__thread struct kvm_cpu *current_kvm_cpu;
+
+static int kvm_run_wrapper;
+
+bool do_debug_print = false;
+
+extern char _binary_guest_init_start;
+extern char _binary_guest_init_size;
+
+static const char * const run_usage[] = {
+	"lkvm run [<options>] [<kernel image>]",
+	NULL
+};
+
+enum {
+	KVM_RUN_DEFAULT,
+	KVM_RUN_SANDBOX,
+};
+
+static int img_name_parser(const struct option *opt, const char *arg, int unset)
+{
+	char path[PATH_MAX];
+	struct stat st;
+
+	snprintf(path, PATH_MAX, "%s%s", kvm__get_dir(), arg);
+
+	if ((stat(arg, &st) == 0 && S_ISDIR(st.st_mode)) ||
+	   (stat(path, &st) == 0 && S_ISDIR(st.st_mode)))
+		return virtio_9p_img_name_parser(opt, arg, unset);
+	return disk_img_name_parser(opt, arg, unset);
+}
+
+void kvm_run_set_wrapper_sandbox(void)
+{
+	kvm_run_wrapper = KVM_RUN_SANDBOX;
+}
+
+#ifndef OPT_ARCH_RUN
+#define OPT_ARCH_RUN(...)
+#endif
+
+#define BUILD_OPTIONS(name, cfg, kvm)					\
+	struct option name[] = {					\
+	OPT_GROUP("Basic options:"),					\
+	OPT_STRING('\0', "name", &(cfg)->guest_name, "guest name",	\
+			"A name for the guest"),			\
+	OPT_INTEGER('c', "cpus", &(cfg)->nrcpus, "Number of CPUs"),	\
+	OPT_U64('m', "mem", &(cfg)->ram_size, "Virtual machine memory"	\
+		" size in MiB."),					\
+	OPT_CALLBACK('\0', "shmem", NULL,				\
+		     "[pci:]<addr>:<size>[:handle=<handle>][:create]",	\
+		     "Share host shmem with guest via pci device",	\
+		     shmem_parser, NULL),				\
+	OPT_CALLBACK('d', "disk", kvm, "image or rootfs_dir", "Disk "	\
+			" image or rootfs directory", img_name_parser,	\
+			kvm),						\
+	OPT_BOOLEAN('\0', "balloon", &(cfg)->balloon, "Enable virtio"	\
+			" balloon"),					\
+	OPT_BOOLEAN('\0', "vnc", &(cfg)->vnc, "Enable VNC framebuffer"),\
+	OPT_BOOLEAN('\0', "gtk", &(cfg)->gtk, "Enable GTK framebuffer"),\
+	OPT_BOOLEAN('\0', "sdl", &(cfg)->sdl, "Enable SDL framebuffer"),\
+	OPT_BOOLEAN('\0', "rng", &(cfg)->virtio_rng, "Enable virtio"	\
+			" Random Number Generator"),			\
+	OPT_CALLBACK('\0', "9p", NULL, "dir_to_share,tag_name",		\
+		     "Enable virtio 9p to share files between host and"	\
+		     " guest", virtio_9p_rootdir_parser, kvm),		\
+	OPT_STRING('\0', "console", &(cfg)->console, "serial, virtio or"\
+			" hv", "Console to use"),			\
+	OPT_STRING('\0', "dev", &(cfg)->dev, "device_file",		\
+			"KVM device file"),				\
+	OPT_CALLBACK('\0', "tty", NULL, "tty id",			\
+		     "Remap guest TTY into a pty on the host",		\
+		     tty_parser, NULL),					\
+	OPT_STRING('\0', "sandbox", &(cfg)->sandbox, "script",		\
+			"Run this script when booting into custom"	\
+			" rootfs"),					\
+	OPT_STRING('\0', "hugetlbfs", &(cfg)->hugetlbfs_path, "path",	\
+			"Hugetlbfs path"),				\
+									\
+	OPT_GROUP("Kernel options:"),					\
+	OPT_STRING('k', "kernel", &(cfg)->kernel_filename, "kernel",	\
+			"Kernel to boot in virtual machine"),		\
+	OPT_STRING('i', "initrd", &(cfg)->initrd_filename, "initrd",	\
+			"Initial RAM disk image"),			\
+	OPT_STRING('p', "params", &(cfg)->kernel_cmdline, "params",	\
+			"Kernel command line arguments"),		\
+	OPT_STRING('f', "firmware", &(cfg)->firmware_filename, "firmware",\
+			"Firmware image to boot in virtual machine"),	\
+									\
+	OPT_GROUP("Networking options:"),				\
+	OPT_CALLBACK_DEFAULT('n', "network", NULL, "network params",	\
+		     "Create a new guest NIC",				\
+		     netdev_parser, NULL, kvm),				\
+	OPT_BOOLEAN('\0', "no-dhcp", &(cfg)->no_dhcp, "Disable kernel"	\
+			" DHCP in rootfs mode"),			\
+									\
+	OPT_GROUP("Debug options:"),					\
+	OPT_BOOLEAN('\0', "debug", &do_debug_print,			\
+			"Enable debug messages"),			\
+	OPT_BOOLEAN('\0', "debug-single-step", &(cfg)->single_step,	\
+			"Enable single stepping"),			\
+	OPT_BOOLEAN('\0', "debug-ioport", &(cfg)->ioport_debug,		\
+			"Enable ioport debugging"),			\
+	OPT_BOOLEAN('\0', "debug-mmio", &(cfg)->mmio_debug,		\
+			"Enable MMIO debugging"),			\
+	OPT_INTEGER('\0', "debug-iodelay", &(cfg)->debug_iodelay,	\
+			"Delay IO by millisecond"),			\
+									\
+	OPT_ARCH(RUN, cfg)						\
+	OPT_END()							\
+	};
+
+static void *kvm_cpu_thread(void *arg)
+{
+	char name[16];
+
+	current_kvm_cpu = arg;
+
+	sprintf(name, "kvm-vcpu-%lu", current_kvm_cpu->cpu_id);
+	kvm__set_thread_name(name);
+
+	if (kvm_cpu__start(current_kvm_cpu))
+		goto panic_kvm;
+
+	return (void *) (intptr_t) 0;
+
+panic_kvm:
+	fprintf(stderr, "KVM exit reason: %u (\"%s\")\n",
+		current_kvm_cpu->kvm_run->exit_reason,
+		kvm_exit_reasons[current_kvm_cpu->kvm_run->exit_reason]);
+	if (current_kvm_cpu->kvm_run->exit_reason == KVM_EXIT_UNKNOWN)
+		fprintf(stderr, "KVM exit code: 0x%llu\n",
+			(unsigned long long)current_kvm_cpu->kvm_run->hw.hardware_exit_reason);
+
+	kvm_cpu__set_debug_fd(STDOUT_FILENO);
+	kvm_cpu__show_registers(current_kvm_cpu);
+	kvm_cpu__show_code(current_kvm_cpu);
+	kvm_cpu__show_page_tables(current_kvm_cpu);
+
+	return (void *) (intptr_t) 1;
+}
+
+static char kernel[PATH_MAX];
+
+static const char *host_kernels[] = {
+	"/boot/vmlinuz",
+	"/boot/bzImage",
+	NULL
+};
+
+static const char *default_kernels[] = {
+	"./bzImage",
+	"arch/" BUILD_ARCH "/boot/bzImage",
+	"../../arch/" BUILD_ARCH "/boot/bzImage",
+	NULL
+};
+
+static const char *default_vmlinux[] = {
+	"vmlinux",
+	"../../../vmlinux",
+	"../../vmlinux",
+	NULL
+};
+
+static void kernel_usage_with_options(void)
+{
+	const char **k;
+	struct utsname uts;
+
+	fprintf(stderr, "Fatal: could not find default kernel image in:\n");
+	k = &default_kernels[0];
+	while (*k) {
+		fprintf(stderr, "\t%s\n", *k);
+		k++;
+	}
+
+	if (uname(&uts) < 0)
+		return;
+
+	k = &host_kernels[0];
+	while (*k) {
+		if (snprintf(kernel, PATH_MAX, "%s-%s", *k, uts.release) < 0)
+			return;
+		fprintf(stderr, "\t%s\n", kernel);
+		k++;
+	}
+	fprintf(stderr, "\nPlease see '%s run --help' for more options.\n\n",
+		KVM_BINARY_NAME);
+}
+
+static u64 host_ram_size(void)
+{
+	long page_size;
+	long nr_pages;
+
+	nr_pages	= sysconf(_SC_PHYS_PAGES);
+	if (nr_pages < 0) {
+		pr_warning("sysconf(_SC_PHYS_PAGES) failed");
+		return 0;
+	}
+
+	page_size	= sysconf(_SC_PAGE_SIZE);
+	if (page_size < 0) {
+		pr_warning("sysconf(_SC_PAGE_SIZE) failed");
+		return 0;
+	}
+
+	return (nr_pages * page_size) >> MB_SHIFT;
+}
+
+/*
+ * If the user didn't specify how much memory to allocate for the guest,
+ * avoid filling the whole host RAM.
+ */
+#define RAM_SIZE_RATIO		0.8
+
+static u64 get_ram_size(int nr_cpus)
+{
+	u64 available;
+	u64 ram_size;
+
+	ram_size	= 64 * (nr_cpus + 3);
+
+	available	= host_ram_size() * RAM_SIZE_RATIO;
+	if (!available)
+		available = MIN_RAM_SIZE_MB;
+
+	if (ram_size > available)
+		ram_size	= available;
+
+	return ram_size;
+}
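
In other words, the default is 64 MiB per vCPU plus a 192 MiB base, capped at roughly 80% of host RAM. An integer approximation of the heuristic, checked standalone (the real code uses a 0.8 double for the ratio):

#include <assert.h>

static unsigned long demo_ram_size(int nr_cpus, unsigned long host_mb)
{
	unsigned long ram = 64UL * (nr_cpus + 3);
	unsigned long avail = host_mb * 8 / 10;

	return ram > avail ? avail : ram;
}

int main(void)
{
	assert(demo_ram_size(4, 16384) == 448);		/* 64 * (4 + 3) */
	assert(demo_ram_size(64, 2048) == 1638);	/* capped at ~80% */
	return 0;
}
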
+
+static const char *find_kernel(void)
+{
+	const char **k;
+	struct stat st;
+	struct utsname uts;
+
+	k = &default_kernels[0];
+	while (*k) {
+		if (stat(*k, &st) < 0 || !S_ISREG(st.st_mode)) {
+			k++;
+			continue;
+		}
+		strncpy(kernel, *k, PATH_MAX);
+		return kernel;
+	}
+
+	if (uname(&uts) < 0)
+		return NULL;
+
+	k = &host_kernels[0];
+	while (*k) {
+		if (snprintf(kernel, PATH_MAX, "%s-%s", *k, uts.release) < 0)
+			return NULL;
+
+		if (stat(kernel, &st) < 0 || !S_ISREG(st.st_mode)) {
+			k++;
+			continue;
+		}
+		return kernel;
+
+	}
+	return NULL;
+}
+
+static const char *find_vmlinux(void)
+{
+	const char **vmlinux;
+
+	vmlinux = &default_vmlinux[0];
+	while (*vmlinux) {
+		struct stat st;
+
+		if (stat(*vmlinux, &st) < 0 || !S_ISREG(st.st_mode)) {
+			vmlinux++;
+			continue;
+		}
+		return *vmlinux;
+	}
+	return NULL;
+}
+
+void kvm_run_help(void)
+{
+	struct kvm *kvm = NULL;
+
+	BUILD_OPTIONS(options, &kvm->cfg, kvm);
+	usage_with_options(run_usage, options);
+}
+
+static int kvm_setup_guest_init(struct kvm *kvm)
+{
+	const char *rootfs = kvm->cfg.custom_rootfs_name;
+	char tmp[PATH_MAX];
+	size_t size;
+	int fd, ret;
+	char *data;
+
+	/* Setup /virt/init */
+	size = (size_t)&_binary_guest_init_size;
+	data = (char *)&_binary_guest_init_start;
+	snprintf(tmp, PATH_MAX, "%s%s/virt/init", kvm__get_dir(), rootfs);
+	remove(tmp);
+	fd = open(tmp, O_CREAT | O_WRONLY, 0755);
+	if (fd < 0)
+		die("Fail to setup %s", tmp);
+	ret = xwrite(fd, data, size);
+	if (ret < 0)
+		die("Fail to setup %s", tmp);
+	close(fd);
+
+	return 0;
+}
+
+static int kvm_run_set_sandbox(struct kvm *kvm)
+{
+	const char *guestfs_name = kvm->cfg.custom_rootfs_name;
+	char path[PATH_MAX], script[PATH_MAX], *tmp;
+
+	snprintf(path, PATH_MAX, "%s%s/virt/sandbox.sh", kvm__get_dir(), guestfs_name);
+
+	remove(path);
+
+	if (kvm->cfg.sandbox == NULL)
+		return 0;
+
+	tmp = realpath(kvm->cfg.sandbox, NULL);
+	if (tmp == NULL)
+		return -ENOMEM;
+
+	snprintf(script, PATH_MAX, "/host%s", tmp);
+	free(tmp);
+
+	return symlink(script, path);
+}
+
+static void kvm_write_sandbox_cmd_exactly(int fd, const char *arg)
+{
+	const char *single_quote;
+
+	if (!*arg) { /* zero length string */
+		if (write(fd, "''", 2) <= 0)
+			die("Failed writing sandbox script");
+		return;
+	}
+
+	while (*arg) {
+		single_quote = strchrnul(arg, '\'');
+
+		/* write non-single-quote string as #('string') */
+		if (arg != single_quote) {
+			if (write(fd, "'", 1) <= 0 ||
+			    write(fd, arg, single_quote - arg) <= 0 ||
+			    write(fd, "'", 1) <= 0)
+				die("Failed writing sandbox script");
+		}
+
+		/* write single quote as #("'") */
+		if (*single_quote) {
+			if (write(fd, "\"'\"", 3) <= 0)
+				die("Failed writing sandbox script");
+		} else
+			break;
+
+		arg = single_quote + 1;
+	}
+}
+
+static void resolve_program(const char *src, char *dst, size_t len)
+{
+	struct stat st;
+	int err;
+
+	err = stat(src, &st);
+
+	if (!err && S_ISREG(st.st_mode)) {
+		char resolved_path[PATH_MAX];
+
+		if (!realpath(src, resolved_path))
+			die("Unable to resolve program %s: %s\n", src, strerror(errno));
+
+		snprintf(dst, len, "/host%s", resolved_path);
+	} else
+		strncpy(dst, src, len);
+}
+
+static void kvm_run_write_sandbox_cmd(struct kvm *kvm, const char **argv, int argc)
+{
+	const char script_hdr[] = "#! /bin/bash\n\n";
+	char program[PATH_MAX];
+	int fd;
+
+	remove(kvm->cfg.sandbox);
+
+	fd = open(kvm->cfg.sandbox, O_RDWR | O_CREAT, 0777);
+	if (fd < 0)
+		die("Failed creating sandbox script");
+
+	if (write(fd, script_hdr, sizeof(script_hdr) - 1) <= 0)
+		die("Failed writing sandbox script");
+
+	resolve_program(argv[0], program, PATH_MAX);
+	kvm_write_sandbox_cmd_exactly(fd, program);
+
+	argv++;
+	argc--;
+
+	while (argc) {
+		if (write(fd, " ", 1) <= 0)
+			die("Failed writing sandbox script");
+
+		kvm_write_sandbox_cmd_exactly(fd, argv[0]);
+		argv++;
+		argc--;
+	}
+	if (write(fd, "\n", 1) <= 0)
+		die("Failed writing sandbox script");
+
+	close(fd);
+}
+
+static struct kvm *kvm_cmd_run_init(int argc, const char **argv)
+{
+	static char real_cmdline[2048], default_name[20];
+	unsigned int nr_online_cpus;
+	struct kvm *kvm = kvm__new();
+
+	if (IS_ERR(kvm))
+		return kvm;
+
+	nr_online_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	kvm->cfg.custom_rootfs_name = "default";
+
+	while (argc != 0) {
+		BUILD_OPTIONS(options, &kvm->cfg, kvm);
+		argc = parse_options(argc, argv, options, run_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION |
+				PARSE_OPT_KEEP_DASHDASH);
+		if (argc != 0) {
+			/* Custom options, should have been handled elsewhere */
+			if (strcmp(argv[0], "--") == 0) {
+				if (kvm_run_wrapper == KVM_RUN_SANDBOX) {
+					kvm->cfg.sandbox = DEFAULT_SANDBOX_FILENAME;
+					kvm_run_write_sandbox_cmd(kvm, argv+1, argc-1);
+					break;
+				}
+			}
+
+			if ((kvm_run_wrapper == KVM_RUN_DEFAULT && kvm->cfg.kernel_filename) ||
+				(kvm_run_wrapper == KVM_RUN_SANDBOX && kvm->cfg.sandbox)) {
+				fprintf(stderr, "Cannot handle parameter: "
+						"%s\n", argv[0]);
+				usage_with_options(run_usage, options);
+				free(kvm);
+				return ERR_PTR(-EINVAL);
+			}
+			if (kvm_run_wrapper == KVM_RUN_SANDBOX) {
+				/*
+				 * first unhandled parameter is treated as
+				 * sandbox command
+				 */
+				kvm->cfg.sandbox = DEFAULT_SANDBOX_FILENAME;
+				kvm_run_write_sandbox_cmd(kvm, argv, argc);
+			} else {
+				/*
+				 * first unhandled parameter is treated as a kernel
+				 * image
+				 */
+				kvm->cfg.kernel_filename = argv[0];
+			}
+			argv++;
+			argc--;
+		}
+
+	}
+
+	kvm->nr_disks = kvm->cfg.image_count;
+
+	if (!kvm->cfg.kernel_filename)
+		kvm->cfg.kernel_filename = find_kernel();
+
+	if (!kvm->cfg.kernel_filename) {
+		kernel_usage_with_options();
+		return ERR_PTR(-EINVAL);
+	}
+
+	kvm->cfg.vmlinux_filename = find_vmlinux();
+	kvm->vmlinux = kvm->cfg.vmlinux_filename;
+
+	if (kvm->cfg.nrcpus == 0)
+		kvm->cfg.nrcpus = nr_online_cpus;
+
+	if (!kvm->cfg.ram_size)
+		kvm->cfg.ram_size = get_ram_size(kvm->cfg.nrcpus);
+
+	if (kvm->cfg.ram_size > host_ram_size())
+		pr_warning("Guest memory size %lluMB exceeds host physical RAM size %lluMB",
+			(unsigned long long)kvm->cfg.ram_size,
+			(unsigned long long)host_ram_size());
+
+	kvm->cfg.ram_size <<= MB_SHIFT;
+
+	if (!kvm->cfg.dev)
+		kvm->cfg.dev = DEFAULT_KVM_DEV;
+
+	if (!kvm->cfg.console)
+		kvm->cfg.console = DEFAULT_CONSOLE;
+
+	if (!strncmp(kvm->cfg.console, "virtio", 6))
+		kvm->cfg.active_console = CONSOLE_VIRTIO;
+	else if (!strncmp(kvm->cfg.console, "serial", 6))
+		kvm->cfg.active_console = CONSOLE_8250;
+	else if (!strncmp(kvm->cfg.console, "hv", 2))
+		kvm->cfg.active_console = CONSOLE_HV;
+	else
+		pr_warning("No console!");
+
+	if (!kvm->cfg.host_ip)
+		kvm->cfg.host_ip = DEFAULT_HOST_ADDR;
+
+	if (!kvm->cfg.guest_ip)
+		kvm->cfg.guest_ip = DEFAULT_GUEST_ADDR;
+
+	if (!kvm->cfg.guest_mac)
+		kvm->cfg.guest_mac = DEFAULT_GUEST_MAC;
+
+	if (!kvm->cfg.host_mac)
+		kvm->cfg.host_mac = DEFAULT_HOST_MAC;
+
+	if (!kvm->cfg.script)
+		kvm->cfg.script = DEFAULT_SCRIPT;
+
+	if (!kvm->cfg.network)
+		kvm->cfg.network = DEFAULT_NETWORK;
+
+	memset(real_cmdline, 0, sizeof(real_cmdline));
+	kvm__arch_set_cmdline(real_cmdline, kvm->cfg.vnc || kvm->cfg.sdl || kvm->cfg.gtk);
+
+	if (strlen(real_cmdline) > 0)
+		strcat(real_cmdline, " ");
+
+	if (kvm->cfg.kernel_cmdline)
+		strlcat(real_cmdline, kvm->cfg.kernel_cmdline, sizeof(real_cmdline));
+
+	if (!kvm->cfg.guest_name) {
+		if (kvm->cfg.custom_rootfs) {
+			kvm->cfg.guest_name = kvm->cfg.custom_rootfs_name;
+		} else {
+			sprintf(default_name, "guest-%u", getpid());
+			kvm->cfg.guest_name = default_name;
+		}
+	}
+
+	if (!kvm->cfg.using_rootfs && !kvm->cfg.disk_image[0].filename && !kvm->cfg.initrd_filename) {
+		char tmp[PATH_MAX];
+
+		kvm_setup_create_new(kvm->cfg.custom_rootfs_name);
+		kvm_setup_resolv(kvm->cfg.custom_rootfs_name);
+
+		snprintf(tmp, PATH_MAX, "%s%s", kvm__get_dir(), "default");
+		if (virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+			die("Unable to initialize virtio 9p");
+		if (virtio_9p__register(kvm, "/", "hostfs") < 0)
+			die("Unable to initialize virtio 9p");
+		kvm->cfg.using_rootfs = kvm->cfg.custom_rootfs = 1;
+	}
+
+	if (kvm->cfg.using_rootfs) {
+		strcat(real_cmdline, " root=/dev/root rw rootflags=rw,trans=virtio,version=9p2000.L rootfstype=9p");
+		if (kvm->cfg.custom_rootfs) {
+			kvm_run_set_sandbox(kvm);
+
+			strcat(real_cmdline, " init=/virt/init");
+
+			if (!kvm->cfg.no_dhcp)
+				strcat(real_cmdline, " ip=dhcp");
+			if (kvm_setup_guest_init(kvm))
+				die("Failed to setup init for guest.");
+		}
+	} else if (!strstr(real_cmdline, "root=")) {
+		strlcat(real_cmdline, " root=/dev/vda rw ", sizeof(real_cmdline));
+	}
+
+	kvm->cfg.real_cmdline = real_cmdline;
+
+	printf("  # %s run -k %s -m %Lu -c %d --name %s\n", KVM_BINARY_NAME,
+		kvm->cfg.kernel_filename,
+		(unsigned long long)kvm->cfg.ram_size / 1024 / 1024,
+		kvm->cfg.nrcpus, kvm->cfg.guest_name);
+
+	if (init_list__init(kvm) < 0)
+		die ("Initialisation failed");
+
+	return kvm;
+}
+
+static int kvm_cmd_run_work(struct kvm *kvm)
+{
+	int i;
+	void *ret = NULL;
+
+	for (i = 0; i < kvm->nrcpus; i++) {
+		if (pthread_create(&kvm->cpus[i]->thread, NULL, kvm_cpu_thread, kvm->cpus[i]) != 0)
+			die("unable to create KVM VCPU thread");
+	}
+
+	/* Only VCPU #0 is going to exit by itself when shutting down */
+	return pthread_join(kvm->cpus[0]->thread, &ret);
+}
+
+static void kvm_cmd_run_exit(struct kvm *kvm, int guest_ret)
+{
+	compat__print_all_messages();
+
+	init_list__exit(kvm);
+
+	if (guest_ret == 0)
+		printf("\n  # KVM session ended normally.\n");
+}
+
+int kvm_cmd_run(int argc, const char **argv, const char *prefix)
+{
+	int ret = -EFAULT;
+	struct kvm *kvm;
+
+	kvm = kvm_cmd_run_init(argc, argv);
+	if (IS_ERR(kvm))
+		return PTR_ERR(kvm);
+
+	ret = kvm_cmd_run_work(kvm);
+	kvm_cmd_run_exit(kvm, ret);
+
+	return ret;
+}
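
One function above worth spelling out is kvm_write_sandbox_cmd_exactly(): it makes arbitrary argv strings safe in the generated shell script by single-quoting every quote-free run and emitting each literal single quote as "'", so an argument like it's becomes 'it'"'"'s'. The same transformation as a standalone program, mirroring (not replacing) the code above:

#define _GNU_SOURCE	/* for strchrnul(), as the original uses too */
#include <stdio.h>
#include <string.h>

static void demo_shell_quote(FILE *out, const char *arg)
{
	const char *q;

	if (!*arg) {			/* empty arg still needs '' */
		fputs("''", out);
		return;
	}

	while (*arg) {
		q = strchrnul(arg, '\'');
		if (arg != q) {		/* quote-free run: 'run' */
			fputc('\'', out);
			fwrite(arg, 1, q - arg, out);
			fputc('\'', out);
		}
		if (!*q)
			break;
		fputs("\"'\"", out);	/* a literal ' becomes "'" */
		arg = q + 1;
	}
}

int main(void)
{
	demo_shell_quote(stdout, "it's");	/* prints 'it'"'"'s' */
	fputc('\n', stdout);
	return 0;
}
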
diff --git a/tools/kvm/builtin-sandbox.c b/tools/kvm/builtin-sandbox.c
new file mode 100644
index 0000000..433f536
--- /dev/null
+++ b/tools/kvm/builtin-sandbox.c
@@ -0,0 +1,9 @@
+#include "kvm/builtin-sandbox.h"
+#include "kvm/builtin-run.h"
+
+int kvm_cmd_sandbox(int argc, const char **argv, const char *prefix)
+{
+	kvm_run_set_wrapper_sandbox();
+
+	return kvm_cmd_run(argc, argv, prefix);
+}
diff --git a/tools/kvm/builtin-setup.c b/tools/kvm/builtin-setup.c
new file mode 100644
index 0000000..8b45c56
--- /dev/null
+++ b/tools/kvm/builtin-setup.c
@@ -0,0 +1,258 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-setup.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/read-write.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+extern char _binary_guest_init_start;
+extern char _binary_guest_init_size;
+
+static const char *instance_name;
+
+static const char * const setup_usage[] = {
+	"lkvm setup [name]",
+	NULL
+};
+
+static const struct option setup_options[] = {
+	OPT_END()
+};
+
+static void parse_setup_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, setup_options, setup_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0) {
+			if (instance_name)
+				kvm_setup_help();
+			else
+				instance_name = argv[0];
+			argv++;
+			argc--;
+		}
+	}
+}
+
+void kvm_setup_help(void)
+{
+	printf("\n%s setup creates a new rootfs under %s.\n"
+		"This can be used later by the '-d' parameter of '%s run'.\n",
+		KVM_BINARY_NAME, kvm__get_dir(), KVM_BINARY_NAME);
+	usage_with_options(setup_usage, setup_options);
+}
+
+static int copy_file(const char *from, const char *to)
+{
+	int in_fd, out_fd;
+	void *src, *dst;
+	struct stat st;
+	int err = -1;
+
+	in_fd = open(from, O_RDONLY);
+	if (in_fd < 0)
+		return err;
+
+	if (fstat(in_fd, &st) < 0)
+		goto error_close_in;
+
+	out_fd = open(to, O_RDWR | O_CREAT | O_TRUNC, st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO));
+	if (out_fd < 0)
+		goto error_close_in;
+
+	src = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, in_fd, 0);
+	if (src == MAP_FAILED)
+		goto error_close_out;
+
+	if (ftruncate(out_fd, st.st_size) < 0)
+		goto error_munmap_src;
+
+	dst = mmap(NULL, st.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, out_fd, 0);
+	if (dst == MAP_FAILED)
+		goto error_munmap_src;
+
+	memcpy(dst, src, st.st_size);
+
+	if (fsync(out_fd) < 0)
+		goto error_munmap_dst;
+
+	err = 0;
+
+error_munmap_dst:
+	munmap(dst, st.st_size);
+error_munmap_src:
+	munmap(src, st.st_size);
+error_close_out:
+	close(out_fd);
+error_close_in:
+	close(in_fd);
+
+	return err;
+}
+
+static const char *guestfs_dirs[] = {
+	"/dev",
+	"/etc",
+	"/home",
+	"/host",
+	"/proc",
+	"/root",
+	"/sys",
+	"/tmp",
+	"/var",
+	"/var/lib",
+	"/virt",
+	"/virt/home",
+};
+
+static const char *guestfs_symlinks[] = {
+	"/bin",
+	"/lib",
+	"/lib64",
+	"/sbin",
+	"/usr",
+	"/etc/ld.so.conf",
+};
+
+static int copy_init(const char *guestfs_name)
+{
+	char path[PATH_MAX];
+	size_t size;
+	int fd, ret;
+	char *data;
+
+	size = (size_t)&_binary_guest_init_size;
+	data = (char *)&_binary_guest_init_start;
+	snprintf(path, PATH_MAX, "%s%s/virt/init", kvm__get_dir(), guestfs_name);
+	remove(path);
+	fd = open(path, O_CREAT | O_WRONLY, 0755);
+	if (fd < 0)
+		die("Fail to setup %s", path);
+	ret = xwrite(fd, data, size);
+	if (ret < 0)
+		die("Fail to setup %s", path);
+	close(fd);
+
+	return 0;
+}
+
+static int copy_passwd(const char *guestfs_name)
+{
+	char path[PATH_MAX];
+	FILE *file;
+	int ret;
+
+	snprintf(path, PATH_MAX, "%s%s/etc/passwd", kvm__get_dir(), guestfs_name);
+
+	file = fopen(path, "w");
+	if (!file)
+		return -1;
+
+	ret = fprintf(file, "root:x:0:0:root:/root:/bin/sh\n");
+	if (ret > 0)
+		ret = 0;
+
+	fclose(file);
+
+	return ret;
+}
+
+static int make_guestfs_symlink(const char *guestfs_name, const char *path)
+{
+	char target[PATH_MAX];
+	char name[PATH_MAX];
+
+	snprintf(name, PATH_MAX, "%s%s%s", kvm__get_dir(), guestfs_name, path);
+
+	snprintf(target, PATH_MAX, "/host%s", path);
+
+	return symlink(target, name);
+}
+
+static int make_dir(const char *dir)
+{
+	char name[PATH_MAX];
+
+	snprintf(name, PATH_MAX, "%s%s", kvm__get_dir(), dir);
+
+	return mkdir(name, 0777);
+}
+
+static void make_guestfs_dir(const char *guestfs_name, const char *dir)
+{
+	char name[PATH_MAX];
+
+	snprintf(name, PATH_MAX, "%s%s", guestfs_name, dir);
+
+	make_dir(name);
+}
+
+void kvm_setup_resolv(const char *guestfs_name)
+{
+	char path[PATH_MAX];
+
+	snprintf(path, PATH_MAX, "%s%s/etc/resolv.conf", kvm__get_dir(), guestfs_name);
+
+	copy_file("/etc/resolv.conf", path);
+}
+
+static int do_setup(const char *guestfs_name)
+{
+	unsigned int i;
+	int ret;
+
+	ret = make_dir(guestfs_name);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < ARRAY_SIZE(guestfs_dirs); i++)
+		make_guestfs_dir(guestfs_name, guestfs_dirs[i]);
+
+	for (i = 0; i < ARRAY_SIZE(guestfs_symlinks); i++) {
+		make_guestfs_symlink(guestfs_name, guestfs_symlinks[i]);
+	}
+
+	ret = copy_init(guestfs_name);
+	if (ret < 0)
+		return ret;
+
+	return copy_passwd(guestfs_name);
+}
+
+int kvm_setup_create_new(const char *guestfs_name)
+{
+	return do_setup(guestfs_name);
+}
+
+int kvm_cmd_setup(int argc, const char **argv, const char *prefix)
+{
+	int r;
+
+	parse_setup_options(argc, argv);
+
+	if (instance_name == NULL)
+		kvm_setup_help();
+
+	r = do_setup(instance_name);
+	if (r == 0)
+		printf("A new rootfs '%s' has been created in '%s%s'.\n\n"
+			"You can now start it by running the following command:\n\n"
+			"  %s run -d %s\n",
+			instance_name, kvm__get_dir(), instance_name,
+			KVM_BINARY_NAME, instance_name);
+	else
+		printf("Unable to create rootfs in %s%s: %s\n",
+			kvm__get_dir(), instance_name, strerror(errno));
+
+	return r;
+}
diff --git a/tools/kvm/builtin-stat.c b/tools/kvm/builtin-stat.c
new file mode 100644
index 0000000..5d6407e
--- /dev/null
+++ b/tools/kvm/builtin-stat.c
@@ -0,0 +1,127 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-stat.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <sys/select.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+#include <linux/virtio_balloon.h>
+
+static bool mem;
+static bool all;
+static const char *instance_name;
+
+static const char * const stat_usage[] = {
+	"lkvm stat [command] [--all] [-n name]",
+	NULL
+};
+
+static const struct option stat_options[] = {
+	OPT_GROUP("Commands options:"),
+	OPT_BOOLEAN('m', "memory", &mem, "Display memory statistics"),
+	OPT_GROUP("Instance options:"),
+	OPT_BOOLEAN('a', "all", &all, "All instances"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_END()
+};
+
+static void parse_stat_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, stat_options, stat_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_stat_help();
+	}
+}
+
+void kvm_stat_help(void)
+{
+	usage_with_options(stat_usage, stat_options);
+}
+
+static int do_memstat(const char *name, int sock)
+{
+	struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
+	fd_set fdset;
+	struct timeval t = { .tv_sec = 1 };
+	int r;
+	u8 i;
+
+	FD_ZERO(&fdset);
+	FD_SET(sock, &fdset);
+	r = kvm_ipc__send(sock, KVM_IPC_STAT);
+	if (r < 0)
+		return r;
+
+	r = select(sock + 1, &fdset, NULL, NULL, &t);
+	if (r < 0) {
+		pr_err("Could not retrieve mem stats from %s", name);
+		return r;
+	}
+	r = read(sock, &stats, sizeof(stats));
+	if (r < 0)
+		return r;
+
+	printf("\n\n\t*** Guest memory statistics ***\n\n");
+	for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) {
+		switch (stats[i].tag) {
+		case VIRTIO_BALLOON_S_SWAP_IN:
+			printf("The amount of memory that has been swapped in (in bytes):");
+			break;
+		case VIRTIO_BALLOON_S_SWAP_OUT:
+			printf("The amount of memory that has been swapped out to disk (in bytes):");
+			break;
+		case VIRTIO_BALLOON_S_MAJFLT:
+			printf("The number of major page faults that have occurred:");
+			break;
+		case VIRTIO_BALLOON_S_MINFLT:
+			printf("The number of minor page faults that have occurred:");
+			break;
+		case VIRTIO_BALLOON_S_MEMFREE:
+			printf("The amount of memory not being used for any purpose (in bytes):");
+			break;
+		case VIRTIO_BALLOON_S_MEMTOT:
+			printf("The total amount of memory available (in bytes):");
+			break;
+		}
+		printf("%llu\n", (unsigned long long)stats[i].val);
+	}
+	printf("\n");
+
+	return 0;
+}
+
+int kvm_cmd_stat(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r = 0;
+
+	parse_stat_options(argc, argv);
+
+	if (!mem)
+		usage_with_options(stat_usage, stat_options);
+
+	if (mem && all)
+		return kvm__enumerate_instances(do_memstat);
+
+	if (instance_name == NULL)
+		kvm_stat_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	if (mem)
+		r = do_memstat(instance_name, instance);
+
+	close(instance);
+
+	return r;
+}
diff --git a/tools/kvm/builtin-stop.c b/tools/kvm/builtin-stop.c
new file mode 100644
index 0000000..6067630
--- /dev/null
+++ b/tools/kvm/builtin-stop.c
@@ -0,0 +1,70 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-stop.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static const char *instance_name;
+
+static const char * const stop_usage[] = {
+	"lkvm stop [--all] [-n name]",
+	NULL
+};
+
+static const struct option stop_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('a', "all", &all, "Stop all instances"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_END()
+};
+
+static void parse_stop_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, stop_options, stop_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_stop_help();
+	}
+}
+
+void kvm_stop_help(void)
+{
+	usage_with_options(stop_usage, stop_options);
+}
+
+static int do_stop(const char *name, int sock)
+{
+	return kvm_ipc__send(sock, KVM_IPC_STOP);
+}
+
+int kvm_cmd_stop(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+
+	parse_stop_options(argc, argv);
+
+	if (all)
+		return kvm__enumerate_instances(do_stop);
+
+	if (instance_name == NULL)
+		kvm_stop_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	r = do_stop(instance_name, instance);
+
+	close(instance);
+
+	return r;
+}
diff --git a/tools/kvm/builtin-version.c b/tools/kvm/builtin-version.c
new file mode 100644
index 0000000..b8bb859
--- /dev/null
+++ b/tools/kvm/builtin-version.c
@@ -0,0 +1,15 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-version.h>
+#include <kvm/kvm.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+int kvm_cmd_version(int argc, const char **argv, const char *prefix)
+{
+	printf("kvm tool %s\n", KVMTOOLS_VERSION);
+
+	return 0;
+}
diff --git a/tools/kvm/code16gcc.h b/tools/kvm/code16gcc.h
new file mode 100644
index 0000000..d93e480
--- /dev/null
+++ b/tools/kvm/code16gcc.h
@@ -0,0 +1,15 @@
+/*
+ * code16gcc.h
+ *
+ * This file is -include'd when compiling 16-bit C code.
+ * Note: this asm() needs to be emitted before gcc emits any code.
+ * Depending on gcc version, this requires -fno-unit-at-a-time or
+ * -fno-toplevel-reorder.
+ *
+ * Hopefully gcc will eventually have a real -m16 option so we can
+ * drop this hack long term.
+ */
+
+#ifndef __ASSEMBLY__
+asm(".code16gcc");
+#endif
diff --git a/tools/kvm/command-list.txt b/tools/kvm/command-list.txt
new file mode 100644
index 0000000..d93597d
--- /dev/null
+++ b/tools/kvm/command-list.txt
@@ -0,0 +1,15 @@
+#
+# List of known lkvm commands.
+# command name			category [deprecated] [common]
+#
+lkvm-run			mainporcelain common
+lkvm-setup			mainporcelain common
+lkvm-pause			common
+lkvm-resume			common
+lkvm-version			common
+lkvm-list			common
+lkvm-debug			common
+lkvm-balloon			common
+lkvm-stop			common
+lkvm-stat			common
+lkvm-sandbox			common
diff --git a/tools/kvm/config/feature-tests.mak b/tools/kvm/config/feature-tests.mak
new file mode 100644
index 0000000..927ac54
--- /dev/null
+++ b/tools/kvm/config/feature-tests.mak
@@ -0,0 +1,188 @@
+define SOURCE_HELLO
+#include <stdio.h>
+int main(void)
+{
+	return puts(\"hi\");
+}
+endef
+
+ifndef NO_DWARF
+define SOURCE_DWARF
+#include <dwarf.h>
+#include <elfutils/libdw.h>
+#include <elfutils/version.h>
+#ifndef _ELFUTILS_PREREQ
+#error
+#endif
+
+int main(void)
+{
+	Dwarf *dbg = dwarf_begin(0, DWARF_C_READ);
+	return (long)dbg;
+}
+endef
+endif
+
+define SOURCE_LIBELF
+#include <libelf.h>
+
+int main(void)
+{
+	Elf *elf = elf_begin(0, ELF_C_READ, 0);
+	return (long)elf;
+}
+endef
+
+define SOURCE_GLIBC
+#include <gnu/libc-version.h>
+
+int main(void)
+{
+	const char *version = gnu_get_libc_version();
+	return (long)version;
+}
+endef
+
+define SOURCE_ELF_MMAP
+#include <libelf.h>
+int main(void)
+{
+	Elf *elf = elf_begin(0, ELF_C_READ_MMAP, 0);
+	return (long)elf;
+}
+endef
+
+ifndef NO_NEWT
+define SOURCE_NEWT
+#include <newt.h>
+
+int main(void)
+{
+	newtInit();
+	newtCls();
+	return newtFinished();
+}
+endef
+endif
+
+ifndef NO_LIBPERL
+define SOURCE_PERL_EMBED
+#include <EXTERN.h>
+#include <perl.h>
+
+int main(void)
+{
+perl_alloc();
+return 0;
+}
+endef
+endif
+
+ifndef NO_LIBPYTHON
+define SOURCE_PYTHON_VERSION
+#include <Python.h>
+#if PY_VERSION_HEX >= 0x03000000
+	#error
+#endif
+int main(void){}
+endef
+define SOURCE_PYTHON_EMBED
+#include <Python.h>
+int main(void)
+{
+	Py_Initialize();
+	return 0;
+}
+endef
+endif
+
+define SOURCE_BFD
+#include <bfd.h>
+
+int main(void)
+{
+	bfd_demangle(0, 0, 0);
+	return 0;
+}
+endef
+
+define SOURCE_CPLUS_DEMANGLE
+extern char *cplus_demangle(const char *, int);
+
+int main(void)
+{
+	cplus_demangle(0, 0);
+	return 0;
+}
+endef
+
+define SOURCE_STRLCPY
+#include <stdlib.h>
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+int main(void)
+{
+	strlcpy(NULL, NULL, 0);
+	return 0;
+}
+endef
+
+define SOURCE_VNCSERVER
+#include <rfb/rfb.h>
+
+int main(void)
+{
+	rfbIsActive((void *)0);
+	return 0;
+}
+endef
+
+define SOURCE_SDL
+#include <SDL/SDL.h>
+
+int main(void)
+{
+	SDL_Init(SDL_INIT_VIDEO);
+	return 0;
+}
+endef
+
+define SOURCE_ZLIB
+#include <zlib.h>
+
+int main(void)
+{
+	inflateInit2(NULL, 0);
+	return 0;
+}
+endef
+
+define SOURCE_AIO
+#include <libaio.h>
+
+int main(void)
+{
+	io_setup(0, NULL);
+	return 0;
+}
+endef
+
+define SOURCE_STATIC
+#include <stdlib.h>
+
+int main(void)
+{
+	return 0;
+}
+endef
+
+define SOURCE_GTK3
+#include <gtk/gtk.h>
+
+int main(void)
+{
+	gtk_main();
+
+	return 0;
+}
+endef
diff --git a/tools/kvm/config/utilities.mak b/tools/kvm/config/utilities.mak
new file mode 100644
index 0000000..a70963b
--- /dev/null
+++ b/tools/kvm/config/utilities.mak
@@ -0,0 +1,196 @@
+# This allows us to work with the newline character:
+define newline
+
+
+endef
+newline := $(newline)
+
+# nl-escape
+#
+# Usage: escape = $(call nl-escape[,escape])
+#
+# This is used as the common way to specify
+# what should replace a newline when escaping
+# newlines; the default is a bizarre string.
+#
+nl-escape = $(or $(1),m822df3020w6a44id34bt574ctac44eb9f4n)
+
+# escape-nl
+#
+# Usage: escaped-text = $(call escape-nl,text[,escape])
+#
+# GNU make's $(shell ...) function converts to a
+# single space each newline character in the output
+# produced during the expansion; this may not be
+# desirable.
+#
+# The only solution is to change each newline into
+# something that won't be converted, so that the
+# information can be recovered later with
+# $(call unescape-nl...)
+#
+escape-nl = $(subst $(newline),$(call nl-escape,$(2)),$(1))
+
+# unescape-nl
+#
+# Usage: text = $(call unescape-nl,escaped-text[,escape])
+#
+# See escape-nl.
+#
+unescape-nl = $(subst $(call nl-escape,$(2)),$(newline),$(1))
+
+# shell-escape-nl
+#
+# Usage: $(shell some-command | $(call shell-escape-nl[,escape]))
+#
+# Use this to escape newlines from within a shell call;
+# the default escape is a bizarre string.
+#
+# NOTE: The escape is used directly as a string constant
+#       in an `awk' program that is delimited by shell
+#       single-quotes, so be wary of the characters
+#       that are chosen.
+#
+define shell-escape-nl
+awk 'NR==1 {t=$$0} NR>1 {t=t "$(nl-escape)" $$0} END {printf t}'
+endef
+
+# shell-unescape-nl
+#
+# Usage: $(shell some-command | $(call shell-unescape-nl[,escape]))
+#
+# Use this to unescape newlines from within a shell call;
+# the default escape is a bizarre string.
+#
+# NOTE: The escape is used directly as an extended regular
+#       expression constant in an `awk' program that is
+#       delimited by shell single-quotes, so be wary
+#       of the characters that are chosen.
+#
+# (The bash shell has a bug where `{gsub(...),...}' is
+#  misinterpreted as a brace expansion; this can be
+#  overcome by putting a space between `{' and `gsub').
+#
+define shell-unescape-nl
+awk 'NR==1 {t=$$0} NR>1 {t=t "\n" $$0} END { gsub(/$(nl-escape)/,"\n",t); printf t }'
+endef
+
+# escape-for-shell-sq
+#
+# Usage: embeddable-text = $(call escape-for-shell-sq,text)
+#
+# This function produces text that is suitable for
+# embedding in a shell string that is delimited by
+# single-quotes.
+#
+escape-for-shell-sq =  $(subst ','\'',$(1))
+
+# shell-sq
+#
+# Usage: single-quoted-and-escaped-text = $(call shell-sq,text)
+#
+shell-sq = '$(escape-for-shell-sq)'
+
+# shell-wordify
+#
+# Usage: wordified-text = $(call shell-wordify,text)
+#
+# For instance:
+#
+#  |define text
+#  |hello
+#  |world
+#  |endef
+#  |
+#  |target:
+#  |	echo $(call shell-wordify,$(text))
+#
+# At least GNU make gets confused by expanding a newline
+# within the context of a command line of a makefile rule
+# (this is in contrast to a `$(shell ...)' function call,
+# which can handle it just fine).
+#
+# This function avoids the problem by producing a string
+# that works as a shell word, regardless of whether or
+# not it contains a newline.
+#
+# If the text to be wordified contains a newline, then
+# an intricate shell command substitution is constructed
+# to render the text as a single line; when the shell
+# processes the resulting escaped text, it transforms
+# it into the original unescaped text.
+#
+# If the text does not contain a newline, then this function
+# produces the same results as the `$(shell-sq)' function.
+#
+shell-wordify = $(if $(findstring $(newline),$(1)),$(_sw-esc-nl),$(shell-sq))
+define _sw-esc-nl
+"$$(echo $(call escape-nl,$(shell-sq),$(2)) | $(call shell-unescape-nl,$(2)))"
+endef
+
+# is-absolute
+#
+# Usage: bool-value = $(call is-absolute,path)
+#
+is-absolute = $(shell echo $(shell-sq) | grep ^/ -q && echo y)
+
+# lookup
+#
+# Usage: absolute-executable-path-or-empty = $(call lookup,path)
+#
+# (It's necessary to use `sh -c' because GNU make messes up by
+#  trying too hard and getting things wrong).
+#
+lookup = $(call unescape-nl,$(shell sh -c $(_l-sh)))
+_l-sh = $(call shell-sq,command -v $(shell-sq) | $(call shell-escape-nl,))
+
+# is-executable
+#
+# Usage: bool-value = $(call is-executable,path)
+#
+# (It's necessary to use `sh -c' because GNU make messes up by
+#  trying too hard and getting things wrong).
+#
+is-executable = $(call _is-executable-helper,$(shell-sq))
+_is-executable-helper = $(shell sh -c $(_is-executable-sh))
+_is-executable-sh = $(call shell-sq,test -f $(1) -a -x $(1) && echo y)
+
+# get-executable
+#
+# Usage: absolute-executable-path-or-empty = $(call get-executable,path)
+#
+# The goal is to get an absolute path for an executable;
+# the `command -v' is defined by POSIX, but it's not
+# necessarily very portable, so it's only used if
+# relative path resolution is requested, as determined
+# by the absence of a leading `/'.
+#
+get-executable = $(if $(1),$(if $(is-absolute),$(_ge-abspath),$(lookup)))
+_ge-abspath = $(if $(is-executable),$(1))
+
+# get-supplied-or-default-executable
+#
+# Usage: absolute-executable-path-or-empty = $(call get-executable-or-default,variable,default)
+#
+define get-executable-or-default
+$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2)))
+endef
+_ge_attempt = $(or $(get-executable),$(_gea_warn),$(call _gea_err,$(2)))
+_gea_warn = $(warning The path '$(1)' is not executable.)
+_gea_err  = $(if $(1),$(error Please set '$(1)' appropriately))
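+
+# For instance (an illustrative sketch):
+#
+#  |CC := $(call get-executable-or-default,CC,cc)
+#
+# expands to the absolute path of $(CC) when CC is set and executable,
+# and otherwise falls back to looking up plain `cc' in the PATH.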
+
+# try-cc
+# Usage: option = $(call try-cc, source-to-build, cc-options)
+try-cc = $(shell sh -c						  \
+	'TMP="$(OUTPUT)$(TMPOUT).$$$$";				  \
+	 echo "$(1)" |						  \
+	 $(CC) -x c - $(2) -o "$$TMP" > /dev/null 2>&1 && echo y; \
+	 rm -f "$$TMP"')
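+
+# For instance, a probe from feature-tests.mak can be consumed like this
+# (an illustrative sketch; the CONFIG_HAS_LIBELF macro name is
+# hypothetical):
+#
+#  |ifeq ($(call try-cc,$(SOURCE_LIBELF),-lelf),y)
+#  |	CFLAGS += -DCONFIG_HAS_LIBELF
+#  |endif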
+
+# try-build
+# Usage: option = $(call try-build, source-to-build, cc-options, link-options)
+try-build = $(shell sh -c							\
+	'TMP="$(OUTPUT)$(TMPOUT).$$$$";						\
+	echo "$(1)" |								\
+	$(CC) -x c - $(2) $(3) -o "$$TMP" > /dev/null 2>&1 && echo y;		\
+	rm -f "$$TMP"')
diff --git a/tools/kvm/devices.c b/tools/kvm/devices.c
new file mode 100644
index 0000000..b560a59
--- /dev/null
+++ b/tools/kvm/devices.c
@@ -0,0 +1,105 @@
+#include "kvm/devices.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/virtio-mmio.h"
+
+#include <linux/err.h>
+#include <linux/rbtree.h>
+
+struct device_bus {
+	struct rb_root	root;
+	int		dev_num;
+};
+
+static struct device_bus device_trees[DEVICE_BUS_MAX] = {
+	[0 ... (DEVICE_BUS_MAX - 1)] = { RB_ROOT, 0 },
+};
+
+int device__register(struct device_header *dev)
+{
+	struct device_bus *bus;
+	struct rb_node **node, *parent = NULL;
+
+	if (dev->bus_type >= DEVICE_BUS_MAX) {
+		pr_warning("Ignoring device registration on unknown bus %d\n",
+			   dev->bus_type);
+		return -EINVAL;
+	}
+
+	bus = &device_trees[dev->bus_type];
+	dev->dev_num = bus->dev_num++;
+
+	switch (dev->bus_type) {
+	case DEVICE_BUS_PCI:
+		pci__assign_irq(dev);
+		break;
+	case DEVICE_BUS_MMIO:
+		virtio_mmio_assign_irq(dev);
+		break;
+	default:
+		break;
+	}
+
+	node = &bus->root.rb_node;
+	while (*node) {
+		int num = rb_entry(*node, struct device_header, node)->dev_num;
+		int result = dev->dev_num - num;
+
+		if (result < 0)
+			node = &((*node)->rb_left);
+		else if (result > 0)
+			node = &((*node)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&dev->node, parent, node);
+	rb_insert_color(&dev->node, &bus->root);
+	return 0;
+}
+
+void device__unregister(struct device_header *dev)
+{
+	struct device_bus *bus = &device_trees[dev->bus_type];
+	rb_erase(&dev->node, &bus->root);
+}
+
+struct device_header *device__find_dev(enum device_bus_type bus_type, u8 dev_num)
+{
+	struct rb_node *node;
+
+	if (bus_type >= DEVICE_BUS_MAX)
+		return ERR_PTR(-EINVAL);
+
+	node = device_trees[bus_type].root.rb_node;
+	while (node) {
+		struct device_header *dev = rb_entry(node, struct device_header,
+						     node);
+		if (dev_num < dev->dev_num) {
+			node = node->rb_left;
+		} else if (dev_num > dev->dev_num) {
+			node = node->rb_right;
+		} else {
+			return dev;
+		}
+	}
+
+	return NULL;
+}
+
+struct device_header *device__first_dev(enum device_bus_type bus_type)
+{
+	struct rb_node *node;
+
+	if (bus_type >= DEVICE_BUS_MAX)
+		return NULL;
+
+	node = rb_first(&device_trees[bus_type].root);
+	return node ? rb_entry(node, struct device_header, node) : NULL;
+}
+
+struct device_header *device__next_dev(struct device_header *dev)
+{
+	struct rb_node *node = rb_next(&dev->node);
+	return node ? rb_entry(node, struct device_header, node) : NULL;
+}
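+
+/*
+ * Illustrative usage (not part of this file): the iterator pair above
+ * supports an in-order walk over every device on one bus, where
+ * handle() stands in for arbitrary caller code:
+ *
+ *	struct device_header *dev;
+ *
+ *	for (dev = device__first_dev(DEVICE_BUS_PCI); dev;
+ *	     dev = device__next_dev(dev))
+ *		handle(dev);
+ */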
diff --git a/tools/kvm/disk/blk.c b/tools/kvm/disk/blk.c
new file mode 100644
index 0000000..37581d3
--- /dev/null
+++ b/tools/kvm/disk/blk.c
@@ -0,0 +1,76 @@
+#include "kvm/disk-image.h"
+
+#include <linux/err.h>
+#include <mntent.h>
+
+/*
+ * raw image and blk dev are similar, so reuse raw image ops.
+ */
+static struct disk_image_operations blk_dev_ops = {
+	.read	= raw_image__read,
+	.write	= raw_image__write,
+};
+
+static bool is_mounted(struct stat *st)
+{
+	struct stat st_buf;
+	struct mntent *mnt;
+	FILE *f;
+
+	f = setmntent("/proc/mounts", "r");
+	if (!f)
+		return false;
+
+	while ((mnt = getmntent(f)) != NULL) {
+		if (stat(mnt->mnt_fsname, &st_buf) == 0 &&
+		    S_ISBLK(st_buf.st_mode) && st->st_rdev == st_buf.st_rdev) {
+			endmntent(f);
+			return true;
+		}
+	}
+
+	endmntent(f);
+	return false;
+}
+
+struct disk_image *blkdev__probe(const char *filename, int flags, struct stat *st)
+{
+	struct disk_image *disk;
+	int fd, r;
+	u64 size;
+
+	if (!S_ISBLK(st->st_mode))
+		return ERR_PTR(-EINVAL);
+
+	if (is_mounted(st)) {
+		pr_err("Block device %s is already mounted! Unmount before use.",
+		       filename);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/*
+	 * Be careful! We are opening a host block device!
+	 * The caller's flags decide whether it is opened read-only or
+	 * read-write; read-only is the safe choice, since we do not want
+	 * to break the user's data on disk.
+	 */
+	fd = open(filename, flags);
+	if (fd < 0)
+		return ERR_PTR(fd);
+
+	if (ioctl(fd, BLKGETSIZE64, &size) < 0) {
+		r = -errno;
+		close(fd);
+		return ERR_PTR(r);
+	}
+
+	/*
+	 * FIXME: This will not work on a 32-bit host because we cannot
+	 * mmap a large disk; there is not enough virtual address space.
+	 * It does work on a 64-bit host.
+	 */
+	disk = disk_image__new(fd, size, &blk_dev_ops, DISK_IMAGE_REGULAR);
+#ifdef CONFIG_HAS_AIO
+	if (!IS_ERR_OR_NULL(disk))
+		disk->async = 1;
+#endif
+	return disk;
+}
diff --git a/tools/kvm/disk/core.c b/tools/kvm/disk/core.c
new file mode 100644
index 0000000..309e16c
--- /dev/null
+++ b/tools/kvm/disk/core.c
@@ -0,0 +1,358 @@
+#include "kvm/disk-image.h"
+#include "kvm/qcow.h"
+#include "kvm/virtio-blk.h"
+#include "kvm/kvm.h"
+
+#include <linux/err.h>
+#include <sys/eventfd.h>
+#include <sys/poll.h>
+
+#define AIO_MAX 256
+
+int debug_iodelay;
+
+static int disk_image__close(struct disk_image *disk);
+
+int disk_img_name_parser(const struct option *opt, const char *arg, int unset)
+{
+	const char *cur;
+	char *sep;
+	struct kvm *kvm = opt->ptr;
+
+	if (kvm->cfg.image_count >= MAX_DISK_IMAGES)
+		die("Currently only 4 images are supported");
+
+	kvm->cfg.disk_image[kvm->cfg.image_count].filename = arg;
+	cur = arg;
+
+	if (strncmp(arg, "scsi:", 5) == 0) {
+		sep = strstr(arg, ":");
+		if (sep)
+			kvm->cfg.disk_image[kvm->cfg.image_count].wwpn = sep + 1;
+		sep = strstr(sep + 1, ":");
+		if (sep) {
+			*sep = 0;
+			kvm->cfg.disk_image[kvm->cfg.image_count].tpgt = sep + 1;
+		}
+		cur = sep + 1;
+	}
+
+	do {
+		sep = strstr(cur, ",");
+		if (sep) {
+			if (strncmp(sep + 1, "ro", 2) == 0)
+				kvm->cfg.disk_image[kvm->cfg.image_count].readonly = true;
+			else if (strncmp(sep + 1, "direct", 6) == 0)
+				kvm->cfg.disk_image[kvm->cfg.image_count].direct = true;
+			*sep = 0;
+			cur = sep + 1;
+		}
+	} while (sep);
+
+	kvm->cfg.image_count++;
+
+	return 0;
+}
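+
+/*
+ * Illustrative forms accepted by the parser above (the option spelling
+ * used by the surrounding tool is assumed):
+ *
+ *	--disk image.qcow2
+ *	--disk image.raw,ro,direct
+ *	--disk scsi:<wwpn>:<tpgt>
+ */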
+
+#ifdef CONFIG_HAS_AIO
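+/*
+ * Completion thread for asynchronous I/O: disk->evt is an eventfd that
+ * is signalled as submitted requests finish, so each wakeup below
+ * drains up to AIO_MAX completion events and hands every result to the
+ * registered disk_req_cb callback.
+ */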
+static void *disk_image__thread(void *param)
+{
+	struct disk_image *disk = param;
+	struct io_event event[AIO_MAX];
+	struct timespec notime = {0};
+	int nr, i;
+	u64 dummy;
+
+	kvm__set_thread_name("disk-image-io");
+
+	while (read(disk->evt, &dummy, sizeof(dummy)) > 0) {
+		nr = io_getevents(disk->ctx, 1, ARRAY_SIZE(event), event, &notime);
+		for (i = 0; i < nr; i++)
+			disk->disk_req_cb(event[i].data, event[i].res);
+	}
+
+	return NULL;
+}
+#endif
+
+struct disk_image *disk_image__new(int fd, u64 size,
+				   struct disk_image_operations *ops,
+				   int use_mmap)
+{
+	struct disk_image *disk;
+	int r;
+
+	disk = malloc(sizeof *disk);
+	if (!disk)
+		return ERR_PTR(-ENOMEM);
+
+	*disk = (struct disk_image) {
+		.fd	= fd,
+		.size	= size,
+		.ops	= ops,
+	};
+
+	if (use_mmap == DISK_IMAGE_MMAP) {
+		/*
+		 * Writes to the disk image are discarded: MAP_PRIVATE
+		 * keeps them out of the backing file.
+		 */
+		disk->priv = mmap(NULL, size, PROT_RW, MAP_PRIVATE | MAP_NORESERVE, fd, 0);
+		if (disk->priv == MAP_FAILED) {
+			r = -errno;
+			free(disk);
+			return ERR_PTR(r);
+		}
+	}
+
+#ifdef CONFIG_HAS_AIO
+	{
+		pthread_t thread;
+
+		disk->evt = eventfd(0, 0);
+		io_setup(AIO_MAX, &disk->ctx);
+		r = pthread_create(&thread, NULL, disk_image__thread, disk);
+		if (r) {
+			r = -r; /* pthread_create() returns the error number directly */
+			free(disk);
+			return ERR_PTR(r);
+		}
+	}
+#endif
+	return disk;
+}
+
+static struct disk_image *disk_image__open(const char *filename, bool readonly, bool direct)
+{
+	struct disk_image *disk;
+	struct stat st;
+	int fd, flags;
+
+	if (readonly)
+		flags = O_RDONLY;
+	else
+		flags = O_RDWR;
+	if (direct)
+		flags |= O_DIRECT;
+
+	if (stat(filename, &st) < 0)
+		return ERR_PTR(-errno);
+
+	/* blk device? */
+	disk = blkdev__probe(filename, flags, &st);
+	if (!IS_ERR_OR_NULL(disk))
+		return disk;
+
+	fd = open(filename, flags);
+	if (fd < 0)
+		return ERR_PTR(fd);
+
+	/* qcow image? */
+	disk = qcow_probe(fd, true);
+	if (!IS_ERR_OR_NULL(disk)) {
+		pr_warning("Forcing read-only support for QCOW");
+		return disk;
+	}
+
+	/* raw image? */
+	disk = raw_image__probe(fd, &st, readonly);
+	if (!IS_ERR_OR_NULL(disk))
+		return disk;
+
+	if (close(fd) < 0)
+		pr_warning("close() failed");
+
+	return ERR_PTR(-ENOSYS);
+}
+
+static struct disk_image **disk_image__open_all(struct kvm *kvm)
+{
+	struct disk_image **disks;
+	const char *filename;
+	const char *wwpn;
+	const char *tpgt;
+	bool readonly;
+	bool direct;
+	void *err;
+	int i;
+	struct disk_image_params *params = (struct disk_image_params *)&kvm->cfg.disk_image;
+	int count = kvm->cfg.image_count;
+
+	if (!count)
+		return ERR_PTR(-EINVAL);
+	if (count > MAX_DISK_IMAGES)
+		return ERR_PTR(-ENOSPC);
+
+	disks = calloc(count, sizeof(*disks));
+	if (!disks)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < count; i++) {
+		filename = params[i].filename;
+		readonly = params[i].readonly;
+		direct = params[i].direct;
+		wwpn = params[i].wwpn;
+		tpgt = params[i].tpgt;
+
+		if (wwpn) {
+			disks[i] = malloc(sizeof(struct disk_image));
+			if (!disks[i]) {
+				err = ERR_PTR(-ENOMEM);
+				goto error;
+			}
+			disks[i]->wwpn = wwpn;
+			disks[i]->tpgt = tpgt;
+			continue;
+		}
+
+		if (!filename)
+			continue;
+
+		disks[i] = disk_image__open(filename, readonly, direct);
+		if (IS_ERR_OR_NULL(disks[i])) {
+			pr_err("Loading disk image '%s' failed", filename);
+			err = disks[i];
+			goto error;
+		}
+		disks[i]->debug_iodelay = kvm->cfg.debug_iodelay;
+	}
+
+	return disks;
+error:
+	for (i = 0; i < count; i++)
+		if (!IS_ERR_OR_NULL(disks[i]))
+			disk_image__close(disks[i]);
+
+	free(disks);
+	return err;
+}
+
+int disk_image__flush(struct disk_image *disk)
+{
+	if (disk->ops->flush)
+		return disk->ops->flush(disk);
+
+	return fsync(disk->fd);
+}
+
+static int disk_image__close(struct disk_image *disk)
+{
+	/* If there was no disk image then there's nothing to do: */
+	if (!disk)
+		return 0;
+
+	if (disk->ops->close)
+		return disk->ops->close(disk);
+
+	if (close(disk->fd) < 0)
+		pr_warning("close() failed");
+
+	free(disk);
+
+	return 0;
+}
+
+static int disk_image__close_all(struct disk_image **disks, int count)
+{
+	while (count)
+		disk_image__close(disks[--count]);
+
+	free(disks);
+
+	return 0;
+}
+
+/*
+ * Fill iov with disk data, starting from sector 'sector'.
+ * Return the number of bytes read.
+ */
+ssize_t disk_image__read(struct disk_image *disk, u64 sector,
+			 const struct iovec *iov, int iovcount, void *param)
+{
+	ssize_t total = 0;
+
+	if (debug_iodelay)
+		msleep(debug_iodelay);
+
+	if (disk->ops->read) {
+		total = disk->ops->read(disk, sector, iov, iovcount, param);
+		if (total < 0) {
+			pr_info("disk_image__read error: total=%ld\n", (long)total);
+			return total;
+		}
+	}
+
+	if (!disk->async && disk->disk_req_cb)
+		disk->disk_req_cb(param, total);
+
+	return total;
+}
+
+/*
+ * Write iov to disk, starting from sector 'sector'.
+ * Return the number of bytes written.
+ */
+ssize_t disk_image__write(struct disk_image *disk, u64 sector,
+			  const struct iovec *iov, int iovcount, void *param)
+{
+	ssize_t total = 0;
+
+	if (debug_iodelay)
+		msleep(debug_iodelay);
+
+	if (disk->ops->write) {
+		/* Try the writev-based operation first */
+		total = disk->ops->write(disk, sector, iov, iovcount, param);
+		if (total < 0) {
+			pr_info("disk_image__write error: total=%ld\n", (long)total);
+			return total;
+		}
+	}
+
+	if (!disk->async && disk->disk_req_cb)
+		disk->disk_req_cb(param, total);
+
+	return total;
+}
+
+ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len)
+{
+	struct stat st;
+	int r;
+
+	r = fstat(disk->fd, &st);
+	if (r)
+		return r;
+
+	*len = snprintf(buffer, *len, "%llu%llu%llu",
+			(unsigned long long)st.st_dev,
+			(unsigned long long)st.st_rdev,
+			(unsigned long long)st.st_ino);
+	return *len;
+}
+
+void disk_image__set_callback(struct disk_image *disk,
+			      void (*disk_req_cb)(void *param, long len))
+{
+	disk->disk_req_cb = disk_req_cb;
+}
+
+int disk_image__init(struct kvm *kvm)
+{
+	if (kvm->cfg.image_count) {
+		kvm->disks = disk_image__open_all(kvm);
+		if (IS_ERR(kvm->disks))
+			return PTR_ERR(kvm->disks);
+	}
+
+	return 0;
+}
+dev_base_init(disk_image__init);
+
+int disk_image__exit(struct kvm *kvm)
+{
+	return disk_image__close_all(kvm->disks, kvm->nr_disks);
+}
+dev_base_exit(disk_image__exit);
diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
new file mode 100644
index 0000000..64a2550
--- /dev/null
+++ b/tools/kvm/disk/qcow.c
@@ -0,0 +1,1527 @@
+#include "kvm/qcow.h"
+
+#include "kvm/disk-image.h"
+#include "kvm/read-write.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#ifdef CONFIG_HAS_ZLIB
+#include <zlib.h>
+#endif
+
+#include <linux/err.h>
+#include <linux/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append);
+static int qcow_write_refcount_table(struct qcow *q);
+static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref);
+static void  qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size);
+
+static inline int qcow_pwrite_sync(int fd,
+	void *buf, size_t count, off_t offset)
+{
+	if (pwrite_in_full(fd, buf, count, offset) < 0)
+		return -1;
+
+	return fdatasync(fd);
+}
+
+static int l2_table_insert(struct rb_root *root, struct qcow_l2_table *new)
+{
+	struct rb_node **link = &(root->rb_node), *parent = NULL;
+	u64 offset = new->offset;
+
+	/* search the tree */
+	while (*link) {
+		struct qcow_l2_table *t;
+
+		t = rb_entry(*link, struct qcow_l2_table, node);
+		if (!t)
+			goto error;
+
+		parent = *link;
+
+		if (t->offset > offset)
+			link = &(*link)->rb_left;
+		else if (t->offset < offset)
+			link = &(*link)->rb_right;
+		else
+			goto out;
+	}
+
+	/* add new node */
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, root);
+out:
+	return 0;
+error:
+	return -1;
+}
+
+static struct qcow_l2_table *l2_table_lookup(struct rb_root *root, u64 offset)
+{
+	struct rb_node *link = root->rb_node;
+
+	while (link) {
+		struct qcow_l2_table *t;
+
+		t = rb_entry(link, struct qcow_l2_table, node);
+		if (!t)
+			goto out;
+
+		if (t->offset > offset)
+			link = link->rb_left;
+		else if (t->offset < offset)
+			link = link->rb_right;
+		else
+			return t;
+	}
+out:
+	return NULL;
+}
+
+static void l1_table_free_cache(struct qcow_l1_table *l1t)
+{
+	struct rb_root *r = &l1t->root;
+	struct list_head *pos, *n;
+	struct qcow_l2_table *t;
+
+	list_for_each_safe(pos, n, &l1t->lru_list) {
+		/* Remove cache table from the list and RB tree */
+		list_del(pos);
+		t = list_entry(pos, struct qcow_l2_table, list);
+		rb_erase(&t->node, r);
+
+		/* Free the cached node */
+		free(t);
+	}
+}
+
+static int qcow_l2_cache_write(struct qcow *q, struct qcow_l2_table *c)
+{
+	struct qcow_header *header = q->header;
+	u64 size;
+
+	if (!c->dirty)
+		return 0;
+
+	size = 1 << header->l2_bits;
+
+	if (qcow_pwrite_sync(q->fd, c->table,
+		size * sizeof(u64), c->offset) < 0)
+		return -1;
+
+	c->dirty = 0;
+
+	return 0;
+}
+
+static int cache_table(struct qcow *q, struct qcow_l2_table *c)
+{
+	struct qcow_l1_table *l1t = &q->table;
+	struct rb_root *r = &l1t->root;
+	struct qcow_l2_table *lru;
+
+	if (l1t->nr_cached == MAX_CACHE_NODES) {
+		/*
+		 * The node at the head of the list is the least recently
+		 * used node. Remove it from the list; it will be replaced
+		 * by the new node.
+		 */
+		lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list);
+
+		/* Remove the node from the cache */
+		rb_erase(&lru->node, r);
+		list_del_init(&lru->list);
+		l1t->nr_cached--;
+
+		/* Free the evicted node */
+		free(lru);
+	}
+
+	/* Add new node in RB Tree: Helps in searching faster */
+	if (l2_table_insert(r, c) < 0)
+		goto error;
+
+	/* Add in LRU replacement list */
+	list_add_tail(&c->list, &l1t->lru_list);
+	l1t->nr_cached++;
+
+	return 0;
+error:
+	return -1;
+}
+
+static struct qcow_l2_table *l2_table_search(struct qcow *q, u64 offset)
+{
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_l2_table *l2t;
+
+	l2t = l2_table_lookup(&l1t->root, offset);
+	if (!l2t)
+		return NULL;
+
+	/* Update the LRU state by moving the found node to the list tail */
+	list_move_tail(&l2t->list, &l1t->lru_list);
+
+	return l2t;
+}
+
+/* Allocates a new node for caching L2 table */
+static struct qcow_l2_table *new_cache_table(struct qcow *q, u64 offset)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l2_table *c;
+	u64 l2t_sz;
+	u64 size;
+
+	l2t_sz = 1 << header->l2_bits;
+	size   = sizeof(*c) + l2t_sz * sizeof(u64);
+	c      = calloc(1, size);
+	if (!c)
+		goto out;
+
+	c->offset = offset;
+	RB_CLEAR_NODE(&c->node);
+	INIT_LIST_HEAD(&c->list);
+out:
+	return c;
+}
+
+static inline u64 get_l1_index(struct qcow *q, u64 offset)
+{
+	struct qcow_header *header = q->header;
+
+	return offset >> (header->l2_bits + header->cluster_bits);
+}
+
+static inline u64 get_l2_index(struct qcow *q, u64 offset)
+{
+	struct qcow_header *header = q->header;
+
+	return (offset >> (header->cluster_bits)) & ((1 << header->l2_bits)-1);
+}
+
+static inline u64 get_cluster_offset(struct qcow *q, u64 offset)
+{
+	struct qcow_header *header = q->header;
+
+	return offset & ((1 << header->cluster_bits)-1);
+}
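+
+/*
+ * A worked example for the three helpers above, assuming illustrative
+ * values cluster_bits = 12 and l2_bits = 9, so a guest offset splits as
+ * [ l1 index | l2 index | in-cluster offset ]. For offset 0x12345678:
+ *
+ *	get_cluster_offset():	0x12345678 & 0xfff         = 0x678
+ *	get_l2_index():		(0x12345678 >> 12) & 0x1ff = 0x145
+ *	get_l1_index():		0x12345678 >> 21           = 0x91
+ */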
+
+static struct qcow_l2_table *qcow_read_l2_table(struct qcow *q, u64 offset)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l2_table *l2t;
+	u64 size;
+
+	size = 1 << header->l2_bits;
+
+	/* search an entry for offset in cache */
+	l2t = l2_table_search(q, offset);
+	if (l2t)
+		return l2t;
+
+	/* allocate new node for caching l2 table */
+	l2t = new_cache_table(q, offset);
+	if (!l2t)
+		goto error;
+
+	/* table not cached: read from the disk */
+	if (pread_in_full(q->fd, l2t->table, size * sizeof(u64), offset) < 0)
+		goto error;
+
+	/* cache the table */
+	if (cache_table(q, l2t) < 0)
+		goto error;
+
+	return l2t;
+error:
+	free(l2t);
+	return NULL;
+}
+
+static int qcow_decompress_buffer(u8 *out_buf, int out_buf_size,
+	const u8 *buf, int buf_size)
+{
+#ifdef CONFIG_HAS_ZLIB
+	z_stream strm1, *strm = &strm1;
+	int ret, out_len;
+
+	memset(strm, 0, sizeof(*strm));
+
+	strm->next_in	= (u8 *)buf;
+	strm->avail_in	= buf_size;
+	strm->next_out	= out_buf;
+	strm->avail_out	= out_buf_size;
+
+	ret = inflateInit2(strm, -12);
+	if (ret != Z_OK)
+		return -1;
+
+	ret = inflate(strm, Z_FINISH);
+	out_len = strm->next_out - out_buf;
+	if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
+		out_len != out_buf_size) {
+		inflateEnd(strm);
+		return -1;
+	}
+
+	inflateEnd(strm);
+	return 0;
+#else
+	return -1;
+#endif
+}
+
+static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset,
+	void *dst, u32 dst_len)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_l2_table *l2t;
+	u64 clust_offset;
+	u64 clust_start;
+	u64 l2t_offset;
+	size_t length;
+	u64 l2t_size;
+	u64 l1_idx;
+	u64 l2_idx;
+	int coffset;
+	int csize;
+
+	l1_idx = get_l1_index(q, offset);
+	if (l1_idx >= l1t->table_size)
+		return -1;
+
+	clust_offset = get_cluster_offset(q, offset);
+	if (clust_offset >= q->cluster_size)
+		return -1;
+
+	length = q->cluster_size - clust_offset;
+	if (length > dst_len)
+		length = dst_len;
+
+	mutex_lock(&q->mutex);
+
+	l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]);
+	if (!l2t_offset)
+		goto zero_cluster;
+
+	l2t_size = 1 << header->l2_bits;
+
+	/* read and cache level 2 table */
+	l2t = qcow_read_l2_table(q, l2t_offset);
+	if (!l2t)
+		goto out_error;
+
+	l2_idx = get_l2_index(q, offset);
+	if (l2_idx >= l2t_size)
+		goto out_error;
+
+	clust_start = be64_to_cpu(l2t->table[l2_idx]);
+	if (clust_start & QCOW1_OFLAG_COMPRESSED) {
+		coffset	= clust_start & q->cluster_offset_mask;
+		csize	= clust_start >> (63 - q->header->cluster_bits);
+		csize	&= (q->cluster_size - 1);
+
+		if (pread_in_full(q->fd, q->cluster_data, csize,
+				  coffset) < 0)
+			goto out_error;
+
+		if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size,
+					q->cluster_data, csize) < 0)
+			goto out_error;
+
+		memcpy(dst, q->cluster_cache + clust_offset, length);
+		mutex_unlock(&q->mutex);
+	} else {
+		if (!clust_start)
+			goto zero_cluster;
+
+		mutex_unlock(&q->mutex);
+
+		if (pread_in_full(q->fd, dst, length,
+				  clust_start + clust_offset) < 0)
+			return -1;
+	}
+
+	return length;
+
+zero_cluster:
+	mutex_unlock(&q->mutex);
+	memset(dst, 0, length);
+	return length;
+
+out_error:
+	mutex_unlock(&q->mutex);
+	return -1;
+}
+
+static ssize_t qcow2_read_cluster(struct qcow *q, u64 offset,
+	void *dst, u32 dst_len)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_l2_table *l2t;
+	u64 clust_offset;
+	u64 clust_start;
+	u64 l2t_offset;
+	size_t length;
+	u64 l2t_size;
+	u64 l1_idx;
+	u64 l2_idx;
+	int coffset;
+	int sector_offset;
+	int nb_csectors;
+	int csize;
+
+	l1_idx = get_l1_index(q, offset);
+	if (l1_idx >= l1t->table_size)
+		return -1;
+
+	clust_offset = get_cluster_offset(q, offset);
+	if (clust_offset >= q->cluster_size)
+		return -1;
+
+	length = q->cluster_size - clust_offset;
+	if (length > dst_len)
+		length = dst_len;
+
+	mutex_lock(&q->mutex);
+
+	l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]);
+
+	l2t_offset &= ~QCOW2_OFLAG_COPIED;
+	if (!l2t_offset)
+		goto zero_cluster;
+
+	l2t_size = 1 << header->l2_bits;
+
+	/* read and cache level 2 table */
+	l2t = qcow_read_l2_table(q, l2t_offset);
+	if (!l2t)
+		goto out_error;
+
+	l2_idx = get_l2_index(q, offset);
+	if (l2_idx >= l2t_size)
+		goto out_error;
+
+	clust_start = be64_to_cpu(l2t->table[l2_idx]);
+	if (clust_start & QCOW2_OFLAG_COMPRESSED) {
+		coffset = clust_start & q->cluster_offset_mask;
+		nb_csectors = ((clust_start >> q->csize_shift)
+			& q->csize_mask) + 1;
+		sector_offset = coffset & (SECTOR_SIZE - 1);
+		csize = nb_csectors * SECTOR_SIZE - sector_offset;
+
+		if (pread_in_full(q->fd, q->cluster_data,
+				  nb_csectors * SECTOR_SIZE,
+				  coffset & ~(SECTOR_SIZE - 1)) < 0) {
+			goto out_error;
+		}
+
+		if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size,
+					q->cluster_data + sector_offset,
+					csize) < 0) {
+			goto out_error;
+		}
+
+		memcpy(dst, q->cluster_cache + clust_offset, length);
+		mutex_unlock(&q->mutex);
+	} else {
+		clust_start &= QCOW2_OFFSET_MASK;
+		if (!clust_start)
+			goto zero_cluster;
+
+		mutex_unlock(&q->mutex);
+
+		if (pread_in_full(q->fd, dst, length,
+				  clust_start + clust_offset) < 0)
+			return -1;
+	}
+
+	return length;
+
+zero_cluster:
+	mutex_unlock(&q->mutex);
+	memset(dst, 0, length);
+	return length;
+
+out_error:
+	mutex_unlock(&q->mutex);
+	return -1;
+}
+
+static ssize_t qcow_read_sector_single(struct disk_image *disk, u64 sector,
+	void *dst, u32 dst_len)
+{
+	struct qcow *q = disk->priv;
+	struct qcow_header *header = q->header;
+	u32 nr_read;
+	u64 offset;
+	char *buf;
+	ssize_t nr;
+
+	buf = dst;
+	nr_read = 0;
+
+	while (nr_read < dst_len) {
+		offset = sector << SECTOR_SHIFT;
+		if (offset >= header->size)
+			return -1;
+
+		if (q->version == QCOW1_VERSION)
+			nr = qcow1_read_cluster(q, offset, buf,
+				dst_len - nr_read);
+		else
+			nr = qcow2_read_cluster(q, offset, buf,
+				dst_len - nr_read);
+
+		if (nr <= 0)
+			return -1;
+
+		nr_read	+= nr;
+		buf	+= nr;
+		sector	+= (nr >> SECTOR_SHIFT);
+	}
+
+	return dst_len;
+}
+
+static ssize_t qcow_read_sector(struct disk_image *disk, u64 sector,
+				const struct iovec *iov, int iovcount, void *param)
+{
+	ssize_t nr, total = 0;
+
+	while (iovcount--) {
+		nr = qcow_read_sector_single(disk, sector, iov->iov_base, iov->iov_len);
+		if (nr != (ssize_t)iov->iov_len) {
+			pr_info("qcow_read_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len);
+			return -1;
+		}
+
+		sector += iov->iov_len >> SECTOR_SHIFT;
+		total += nr;
+		iov++;
+	}
+
+	return total;
+}
+
+static void refcount_table_free_cache(struct qcow_refcount_table *rft)
+{
+	struct rb_root *r = &rft->root;
+	struct list_head *pos, *n;
+	struct qcow_refcount_block *t;
+
+	list_for_each_safe(pos, n, &rft->lru_list) {
+		list_del(pos);
+		t = list_entry(pos, struct qcow_refcount_block, list);
+		rb_erase(&t->node, r);
+
+		free(t);
+	}
+}
+
+static int refcount_block_insert(struct rb_root *root, struct qcow_refcount_block *new)
+{
+	struct rb_node **link = &(root->rb_node), *parent = NULL;
+	u64 offset = new->offset;
+
+	/* search the tree */
+	while (*link) {
+		struct qcow_refcount_block *t;
+
+		t = rb_entry(*link, struct qcow_refcount_block, node);
+		if (!t)
+			goto error;
+
+		parent = *link;
+
+		if (t->offset > offset)
+			link = &(*link)->rb_left;
+		else if (t->offset < offset)
+			link = &(*link)->rb_right;
+		else
+			goto out;
+	}
+
+	/* add new node */
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, root);
+out:
+	return 0;
+error:
+	return -1;
+}
+
+static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb)
+{
+	if (!rfb->dirty)
+		return 0;
+
+	if (qcow_pwrite_sync(q->fd, rfb->entries,
+		rfb->size * sizeof(u16), rfb->offset) < 0)
+		return -1;
+
+	rfb->dirty = 0;
+
+	return 0;
+}
+
+static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
+{
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct rb_root *r = &rft->root;
+	struct qcow_refcount_block *lru;
+
+	if (rft->nr_cached == MAX_CACHE_NODES) {
+		lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
+
+		rb_erase(&lru->node, r);
+		list_del_init(&lru->list);
+		rft->nr_cached--;
+
+		free(lru);
+	}
+
+	if (refcount_block_insert(r, c) < 0)
+		goto error;
+
+	list_add_tail(&c->list, &rft->lru_list);
+	rft->nr_cached++;
+
+	return 0;
+error:
+	return -1;
+}
+
+static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 rfb_offset)
+{
+	struct qcow_refcount_block *rfb;
+
+	rfb = malloc(sizeof *rfb + q->cluster_size);
+	if (!rfb)
+		return NULL;
+
+	rfb->offset = rfb_offset;
+	rfb->size = q->cluster_size / sizeof(u16);
+	RB_CLEAR_NODE(&rfb->node);
+	INIT_LIST_HEAD(&rfb->list);
+
+	return rfb;
+}
+
+static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, u64 offset)
+{
+	struct rb_node *link = root->rb_node;
+
+	while (link) {
+		struct qcow_refcount_block *t;
+
+		t = rb_entry(link, struct qcow_refcount_block, node);
+		if (!t)
+			goto out;
+
+		if (t->offset > offset)
+			link = link->rb_left;
+		else if (t->offset < offset)
+			link = link->rb_right;
+		else
+			return t;
+	}
+out:
+	return NULL;
+}
+
+static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 offset)
+{
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+
+	rfb = refcount_block_lookup(&rft->root, offset);
+	if (!rfb)
+		return NULL;
+
+	/* Update the LRU state by moving the found node to the list tail */
+	list_move_tail(&rfb->list, &rft->lru_list);
+
+	return rfb;
+}
+
+static struct qcow_refcount_block *qcow_grow_refcount_block(struct qcow *q,
+	u64 clust_idx)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+	u64 new_block_offset;
+	u64 rft_idx;
+
+	rft_idx = clust_idx >> (header->cluster_bits -
+		QCOW_REFCOUNT_BLOCK_SHIFT);
+
+	if (rft_idx >= rft->rf_size) {
+		pr_warning("Don't support grow refcount block table");
+		return NULL;
+	}
+
+	new_block_offset = qcow_alloc_clusters(q, q->cluster_size, 0);
+	/* qcow_alloc_clusters() returns (u64)-1 on failure */
+	if (new_block_offset == (u64)-1)
+		return NULL;
+
+	rfb = new_refcount_block(q, new_block_offset);
+	if (!rfb)
+		return NULL;
+
+	memset(rfb->entries, 0x00, q->cluster_size);
+	rfb->dirty = 1;
+
+	/* write refcount block */
+	if (write_refcount_block(q, rfb) < 0)
+		goto free_rfb;
+
+	if (cache_refcount_block(q, rfb) < 0)
+		goto free_rfb;
+
+	rft->rf_table[rft_idx] = cpu_to_be64(new_block_offset);
+	if (update_cluster_refcount(q, new_block_offset >>
+		    header->cluster_bits, 1) < 0)
+		goto recover_rft;
+
+	if (qcow_write_refcount_table(q) < 0)
+		goto recover_rft;
+
+	return rfb;
+
+recover_rft:
+	rft->rf_table[rft_idx] = 0;
+free_rfb:
+	free(rfb);
+	return NULL;
+}
+
+static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64 clust_idx)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+	u64 rfb_offset;
+	u64 rft_idx;
+
+	rft_idx = clust_idx >> (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT);
+	if (rft_idx >= rft->rf_size)
+		return ERR_PTR(-ENOSPC);
+
+	rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
+	if (!rfb_offset)
+		return ERR_PTR(-ENOSPC);
+
+	rfb = refcount_block_search(q, rfb_offset);
+	if (rfb)
+		return rfb;
+
+	rfb = new_refcount_block(q, rfb_offset);
+	if (!rfb)
+		return NULL;
+
+	if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb_offset) < 0)
+		goto error_free_rfb;
+
+	if (cache_refcount_block(q, rfb) < 0)
+		goto error_free_rfb;
+
+	return rfb;
+
+error_free_rfb:
+	free(rfb);
+
+	return NULL;
+}
+
+static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx)
+{
+	struct qcow_refcount_block *rfb = NULL;
+	struct qcow_header *header = q->header;
+	u64 rfb_idx;
+
+	rfb = qcow_read_refcount_block(q, clust_idx);
+	if (PTR_ERR(rfb) == -ENOSPC)
+		return 0;
+	else if (IS_ERR_OR_NULL(rfb)) {
+		pr_warning("Error while reading refcount table");
+		return -1;
+	}
+
+	rfb_idx = clust_idx & (((1ULL <<
+		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+
+	if (rfb_idx >= rfb->size) {
+		pr_warning("L1: refcount block index out of bounds");
+		return -1;
+	}
+
+	return be16_to_cpu(rfb->entries[rfb_idx]);
+}
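+
+/*
+ * Sketch of the index arithmetic above, with illustrative values: for
+ * u16 refcount entries (QCOW_REFCOUNT_BLOCK_SHIFT == 1) and
+ * cluster_bits = 12, one refcount block holds 4096 / 2 = 2048 entries,
+ * so cluster 5000 lives in refcount-table slot 5000 >> 11 = 2, at
+ * entry 5000 & 2047 = 904 within that block.
+ */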
+
+static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append)
+{
+	struct qcow_refcount_block *rfb = NULL;
+	struct qcow_header *header = q->header;
+	u16 refcount;
+	u64 rfb_idx;
+
+	rfb = qcow_read_refcount_block(q, clust_idx);
+	if (PTR_ERR(rfb) == -ENOSPC) {
+		rfb = qcow_grow_refcount_block(q, clust_idx);
+		if (!rfb) {
+			pr_warning("error while growing refcount table");
+			return -1;
+		}
+	} else if (IS_ERR_OR_NULL(rfb)) {
+		pr_warning("error while reading refcount table");
+		return -1;
+	}
+
+	rfb_idx = clust_idx & (((1ULL <<
+		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+	if (rfb_idx >= rfb->size) {
+		pr_warning("refcount block index out of bounds");
+		return -1;
+	}
+
+	refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append;
+	rfb->entries[rfb_idx] = cpu_to_be16(refcount);
+	rfb->dirty = 1;
+
+	/* write refcount block */
+	if (write_refcount_block(q, rfb) < 0) {
+		pr_warning("refcount block index out of bounds");
+		return -1;
+	}
+
+	/* update free_clust_idx since refcount becomes zero */
+	if (!refcount && clust_idx < q->free_clust_idx)
+		q->free_clust_idx = clust_idx;
+
+	return 0;
+}
+
+static void  qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size)
+{
+	struct qcow_header *header = q->header;
+	u64 start, end, offset;
+
+	start = clust_start & ~(q->cluster_size - 1);
+	end = (clust_start + size - 1) & ~(q->cluster_size - 1);
+	for (offset = start; offset <= end; offset += q->cluster_size)
+		update_cluster_refcount(q, offset >> header->cluster_bits, -1);
+}
+
+/*
+ * Allocate enough clusters to cover 'size' bytes. The search starts at
+ * free_clust_idx (initially zero), which records the last known free
+ * position so repeated allocations do not rescan from the beginning.
+ */
+static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref)
+{
+	struct qcow_header *header = q->header;
+	u16 clust_refcount;
+	u32 clust_idx = 0, i;
+	u64 clust_num;
+
+	clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;
+
+again:
+	for (i = 0; i < clust_num; i++) {
+		clust_idx = q->free_clust_idx++;
+		clust_refcount = qcow_get_refcount(q, clust_idx);
+		/* qcow_get_refcount() hands back (u16)-1 on failure */
+		if (clust_refcount == (u16)-1)
+			return -1;
+		else if (clust_refcount > 0)
+			goto again;
+	}
+
+	clust_idx++;
+
+	if (update_ref)
+		for (i = 0; i < clust_num; i++)
+			if (update_cluster_refcount(q,
+				clust_idx - clust_num + i, 1))
+				return -1;
+
+	return (clust_idx - clust_num) << header->cluster_bits;
+}
+
+static int qcow_write_l1_table(struct qcow *q)
+{
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_header *header = q->header;
+
+	if (qcow_pwrite_sync(q->fd, l1t->l1_table,
+		l1t->table_size * sizeof(u64),
+		header->l1_table_offset) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Get the L2 table for 'offset'. If the table carries the COPIED flag
+ * it can be used in place. Otherwise a new cluster is allocated, the
+ * old table (if any) is copied into it, and the L1 entry is updated to
+ * point at the copy.
+ */
+static int get_cluster_table(struct qcow *q, u64 offset,
+	struct qcow_l2_table **result_l2t, u64 *result_l2_index)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_l2_table *l2t;
+	u64 l1t_idx;
+	u64 l2t_offset;
+	u64 l2t_idx;
+	u64 l2t_size;
+	u64 l2t_new_offset;
+
+	l2t_size = 1 << header->l2_bits;
+
+	l1t_idx = get_l1_index(q, offset);
+	if (l1t_idx >= l1t->table_size)
+		return -1;
+
+	l2t_idx = get_l2_index(q, offset);
+	if (l2t_idx >= l2t_size)
+		return -1;
+
+	l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]);
+	if (l2t_offset & QCOW2_OFLAG_COPIED) {
+		l2t_offset &= ~QCOW2_OFLAG_COPIED;
+		l2t = qcow_read_l2_table(q, l2t_offset);
+		if (!l2t)
+			goto error;
+	} else {
+		l2t_new_offset = qcow_alloc_clusters(q,
+			l2t_size*sizeof(u64), 1);
+
+		if (l2t_new_offset == (u64)-1)
+			goto error;
+
+		l2t = new_cache_table(q, l2t_new_offset);
+		if (!l2t)
+			goto free_cluster;
+
+		if (l2t_offset) {
+			l2t = qcow_read_l2_table(q, l2t_offset);
+			if (!l2t)
+				goto free_cache;
+		} else
+			memset(l2t->table, 0x00, l2t_size * sizeof(u64));
+
+		/* write l2 table */
+		l2t->dirty = 1;
+		if (qcow_l2_cache_write(q, l2t) < 0)
+			goto free_cache;
+
+		/* cache l2 table */
+		if (cache_table(q, l2t))
+			goto free_cache;
+
+		/* update the l1 table */
+		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset
+			| QCOW2_OFLAG_COPIED);
+		if (qcow_write_l1_table(q)) {
+			pr_warning("Update l1 table error");
+			goto free_cache;
+		}
+
+		/* free old cluster */
+		qcow_free_clusters(q, l2t_offset, q->cluster_size);
+	}
+
+	*result_l2t = l2t;
+	*result_l2_index = l2t_idx;
+
+	return 0;
+
+free_cache:
+	free(l2t);
+
+free_cluster:
+	qcow_free_clusters(q, l2t_new_offset, q->cluster_size);
+
+error:
+	return -1;
+}
+
+/*
+ * If the cluster carries the COPIED flag, write the data in place.
+ * Otherwise read the original cluster, merge in the new data, and
+ * write the result to a freshly allocated cluster.
+ */
+static ssize_t qcow_write_cluster(struct qcow *q, u64 offset,
+		void *buf, u32 src_len)
+{
+	struct qcow_l2_table *l2t;
+	u64 clust_new_start;
+	u64 clust_start;
+	u64 clust_flags;
+	u64 clust_off;
+	u64 l2t_idx;
+	u64 len;
+
+	l2t = NULL;
+
+	clust_off = get_cluster_offset(q, offset);
+	if (clust_off >= q->cluster_size)
+		return -1;
+
+	len = q->cluster_size - clust_off;
+	if (len > src_len)
+		len = src_len;
+
+	mutex_lock(&q->mutex);
+
+	if (get_cluster_table(q, offset, &l2t, &l2t_idx)) {
+		pr_warning("Get l2 table error");
+		goto error;
+	}
+
+	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+	clust_flags = clust_start & QCOW2_OFLAGS_MASK;
+
+	clust_start &= QCOW2_OFFSET_MASK;
+	if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
+		clust_new_start	= qcow_alloc_clusters(q, q->cluster_size, 1);
+		if (clust_new_start == (u64)-1) {
+			pr_warning("Cluster alloc error");
+			goto error;
+		}
+
+		offset &= ~(q->cluster_size - 1);
+
+		/* if clust_start is not zero, read the original data */
+		if (clust_start) {
+			mutex_unlock(&q->mutex);
+			if (qcow2_read_cluster(q, offset, q->copy_buff,
+				q->cluster_size) < 0) {
+				pr_warning("Read copy cluster error");
+				qcow_free_clusters(q, clust_new_start,
+					q->cluster_size);
+				return -1;
+			}
+			mutex_lock(&q->mutex);
+		} else
+			memset(q->copy_buff, 0x00, q->cluster_size);
+
+		memcpy(q->copy_buff + clust_off, buf, len);
+
+		/* Write actual data */
+		if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size,
+			clust_new_start) < 0)
+			goto free_cluster;
+
+		/* update l2 table */
+		l2t->table[l2t_idx] = cpu_to_be64(clust_new_start
+			| QCOW2_OFLAG_COPIED);
+		l2t->dirty = 1;
+
+		if (qcow_l2_cache_write(q, l2t))
+			goto free_cluster;
+
+		/* free old cluster */
+		if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
+			int size;
+			size = ((clust_start >> q->csize_shift) &
+				q->csize_mask) + 1;
+			size *= SECTOR_SIZE;
+			clust_start &= q->cluster_offset_mask;
+			clust_start &= ~(u64)(SECTOR_SIZE - 1);
+
+			qcow_free_clusters(q, clust_start, size);
+		} else if (clust_start)
+			qcow_free_clusters(q, clust_start, q->cluster_size);
+
+	} else {
+		/* Write actual data */
+		if (pwrite_in_full(q->fd, buf, len,
+			clust_start + clust_off) < 0)
+			goto error;
+	}
+	mutex_unlock(&q->mutex);
+	return len;
+
+free_cluster:
+	qcow_free_clusters(q, clust_new_start, q->cluster_size);
+
+error:
+	mutex_unlock(&q->mutex);
+	return -1;
+}
+
+static ssize_t qcow_write_sector_single(struct disk_image *disk, u64 sector, void *src, u32 src_len)
+{
+	struct qcow *q = disk->priv;
+	struct qcow_header *header = q->header;
+	u32 nr_written;
+	char *buf;
+	u64 offset;
+	ssize_t nr;
+
+	buf		= src;
+	nr_written	= 0;
+	offset		= sector << SECTOR_SHIFT;
+
+	while (nr_written < src_len) {
+		if (offset >= header->size)
+			return -1;
+
+		nr = qcow_write_cluster(q, offset, buf, src_len - nr_written);
+		if (nr < 0)
+			return -1;
+
+		nr_written	+= nr;
+		buf		+= nr;
+		offset		+= nr;
+	}
+
+	return nr_written;
+}
+
+static ssize_t qcow_write_sector(struct disk_image *disk, u64 sector,
+				const struct iovec *iov, int iovcount, void *param)
+{
+	ssize_t nr, total = 0;
+
+	while (iovcount--) {
+		nr = qcow_write_sector_single(disk, sector, iov->iov_base, iov->iov_len);
+		if (nr != (ssize_t)iov->iov_len) {
+			pr_info("qcow_write_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len);
+			return -1;
+		}
+
+		sector	+= iov->iov_len >> SECTOR_SHIFT;
+		iov++;
+		total	+= nr;
+	}
+
+	return total;
+}
+
+static int qcow_disk_flush(struct disk_image *disk)
+{
+	struct qcow *q = disk->priv;
+	struct qcow_refcount_table *rft;
+	struct list_head *pos, *n;
+	struct qcow_l1_table *l1t;
+
+	l1t = &q->table;
+	rft = &q->refcount_table;
+
+	mutex_lock(&q->mutex);
+
+	list_for_each_safe(pos, n, &rft->lru_list) {
+		struct qcow_refcount_block *c = list_entry(pos, struct qcow_refcount_block, list);
+
+		if (write_refcount_block(q, c) < 0)
+			goto error_unlock;
+	}
+
+	list_for_each_safe(pos, n, &l1t->lru_list) {
+		struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list);
+
+		if (qcow_l2_cache_write(q, c) < 0)
+			goto error_unlock;
+	}
+
+	if (qcow_write_l1_table(q) < 0)
+		goto error_unlock;
+
+	mutex_unlock(&q->mutex);
+
+	return fsync(disk->fd);
+
+error_unlock:
+	mutex_unlock(&q->mutex);
+	return -1;
+}
+
+static int qcow_disk_close(struct disk_image *disk)
+{
+	struct qcow *q;
+
+	if (!disk)
+		return 0;
+
+	q = disk->priv;
+
+	refcount_table_free_cache(&q->refcount_table);
+	l1_table_free_cache(&q->table);
+	free(q->copy_buff);
+	free(q->cluster_data);
+	free(q->cluster_cache);
+	free(q->refcount_table.rf_table);
+	free(q->table.l1_table);
+	free(q->header);
+	free(q);
+
+	return 0;
+}
+
+static struct disk_image_operations qcow_disk_readonly_ops = {
+	.read	= qcow_read_sector,
+	.close	= qcow_disk_close,
+};
+
+static struct disk_image_operations qcow_disk_ops = {
+	.read	= qcow_read_sector,
+	.write	= qcow_write_sector,
+	.flush	= qcow_disk_flush,
+	.close	= qcow_disk_close,
+};
+
+static int qcow_read_refcount_table(struct qcow *q)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+
+	rft->rf_size = (header->refcount_table_size * q->cluster_size)
+		/ sizeof(u64);
+
+	rft->rf_table = calloc(rft->rf_size, sizeof(u64));
+	if (!rft->rf_table)
+		return -1;
+
+	rft->root = RB_ROOT;
+	INIT_LIST_HEAD(&rft->lru_list);
+
+	return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, header->refcount_table_offset);
+}
+
+static int qcow_write_refcount_table(struct qcow *q)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+
+	return qcow_pwrite_sync(q->fd, rft->rf_table,
+		rft->rf_size * sizeof(u64), header->refcount_table_offset);
+}
+
+static int qcow_read_l1_table(struct qcow *q)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l1_table *table = &q->table;
+
+	table->table_size = header->l1_size;
+
+	table->l1_table	= calloc(table->table_size, sizeof(u64));
+	if (!table->l1_table)
+		return -1;
+
+	return pread_in_full(q->fd, table->l1_table, sizeof(u64) * table->table_size, header->l1_table_offset);
+}
+
+static void *qcow2_read_header(int fd)
+{
+	struct qcow2_header_disk f_header;
+	struct qcow_header *header;
+
+	header = malloc(sizeof(struct qcow_header));
+	if (!header)
+		return NULL;
+
+	if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0) {
+		free(header);
+		return NULL;
+	}
+
+	be32_to_cpus(&f_header.magic);
+	be32_to_cpus(&f_header.version);
+	be64_to_cpus(&f_header.backing_file_offset);
+	be32_to_cpus(&f_header.backing_file_size);
+	be32_to_cpus(&f_header.cluster_bits);
+	be64_to_cpus(&f_header.size);
+	be32_to_cpus(&f_header.crypt_method);
+	be32_to_cpus(&f_header.l1_size);
+	be64_to_cpus(&f_header.l1_table_offset);
+	be64_to_cpus(&f_header.refcount_table_offset);
+	be32_to_cpus(&f_header.refcount_table_clusters);
+	be32_to_cpus(&f_header.nb_snapshots);
+	be64_to_cpus(&f_header.snapshots_offset);
+
+	*header		= (struct qcow_header) {
+		.size			= f_header.size,
+		.l1_table_offset	= f_header.l1_table_offset,
+		.l1_size		= f_header.l1_size,
+		.cluster_bits		= f_header.cluster_bits,
+		.l2_bits		= f_header.cluster_bits - 3,
+		.refcount_table_offset	= f_header.refcount_table_offset,
+		.refcount_table_size	= f_header.refcount_table_clusters,
+	};
+
+	return header;
+}
+
+static struct disk_image *qcow2_probe(int fd, bool readonly)
+{
+	struct disk_image *disk_image;
+	struct qcow_l1_table *l1t;
+	struct qcow_header *h;
+	struct qcow *q;
+
+	q = calloc(1, sizeof(struct qcow));
+	if (!q)
+		return NULL;
+
+	mutex_init(&q->mutex);
+	q->fd = fd;
+
+	l1t = &q->table;
+
+	l1t->root = RB_ROOT;
+	INIT_LIST_HEAD(&l1t->lru_list);
+
+	h = q->header = qcow2_read_header(fd);
+	if (!h)
+		goto free_qcow;
+
+	q->version = QCOW2_VERSION;
+	q->csize_shift = (62 - (q->header->cluster_bits - 8));
+	q->csize_mask = (1 << (q->header->cluster_bits - 8)) - 1;
+	q->cluster_offset_mask = (1LL << q->csize_shift) - 1;
+	q->cluster_size = 1 << q->header->cluster_bits;
+
+	q->copy_buff = malloc(q->cluster_size);
+	if (!q->copy_buff) {
+		pr_warning("copy buff malloc error");
+		goto free_header;
+	}
+
+	q->cluster_data = malloc(q->cluster_size);
+	if (!q->cluster_data) {
+		pr_warning("cluster data malloc error");
+		goto free_copy_buff;
+	}
+
+	q->cluster_cache = malloc(q->cluster_size);
+	if (!q->cluster_cache) {
+		pr_warning("cluster cache malloc error");
+		goto free_cluster_data;
+	}
+
+	if (qcow_read_l1_table(q) < 0)
+		goto free_cluster_cache;
+
+	if (qcow_read_refcount_table(q) < 0)
+		goto free_l1_table;
+
+	/*
+	 * Do not use mmap; use read/write instead.
+	 */
+	if (readonly)
+		disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR);
+	else
+		disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR);
+
+	if (IS_ERR_OR_NULL(disk_image))
+		goto free_refcount_table;
+
+	disk_image->async = 0;
+	disk_image->priv = q;
+
+	return disk_image;
+
+free_refcount_table:
+	if (q->refcount_table.rf_table)
+		free(q->refcount_table.rf_table);
+free_l1_table:
+	if (q->table.l1_table)
+		free(q->table.l1_table);
+free_cluster_cache:
+	if (q->cluster_cache)
+		free(q->cluster_cache);
+free_cluster_data:
+	if (q->cluster_data)
+		free(q->cluster_data);
+free_copy_buff:
+	if (q->copy_buff)
+		free(q->copy_buff);
+free_header:
+	if (q->header)
+		free(q->header);
+free_qcow:
+	free(q);
+
+	return NULL;
+}
+
+static bool qcow2_check_image(int fd)
+{
+	struct qcow2_header_disk f_header;
+
+	if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0)
+		return false;
+
+	be32_to_cpus(&f_header.magic);
+	be32_to_cpus(&f_header.version);
+
+	if (f_header.magic != QCOW_MAGIC)
+		return false;
+
+	if (f_header.version != QCOW2_VERSION)
+		return false;
+
+	return true;
+}
+
+static void *qcow1_read_header(int fd)
+{
+	struct qcow1_header_disk f_header;
+	struct qcow_header *header;
+
+	header = malloc(sizeof(struct qcow_header));
+	if (!header)
+		return NULL;
+
+	if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) {
+		free(header);
+		return NULL;
+	}
+
+	be32_to_cpus(&f_header.magic);
+	be32_to_cpus(&f_header.version);
+	be64_to_cpus(&f_header.backing_file_offset);
+	be32_to_cpus(&f_header.backing_file_size);
+	be32_to_cpus(&f_header.mtime);
+	be64_to_cpus(&f_header.size);
+	be32_to_cpus(&f_header.crypt_method);
+	be64_to_cpus(&f_header.l1_table_offset);
+
+	*header		= (struct qcow_header) {
+		.size			= f_header.size,
+		.l1_table_offset	= f_header.l1_table_offset,
+		.l1_size		= f_header.size / ((1 << f_header.l2_bits) * (1 << f_header.cluster_bits)),
+		.cluster_bits		= f_header.cluster_bits,
+		.l2_bits		= f_header.l2_bits,
+	};
+
+	return header;
+}
+
+static struct disk_image *qcow1_probe(int fd, bool readonly)
+{
+	struct disk_image *disk_image;
+	struct qcow_l1_table *l1t;
+	struct qcow_header *h;
+	struct qcow *q;
+
+	q = calloc(1, sizeof(struct qcow));
+	if (!q)
+		return NULL;
+
+	mutex_init(&q->mutex);
+	q->fd = fd;
+
+	l1t = &q->table;
+
+	l1t->root = RB_ROOT;
+	INIT_LIST_HEAD(&l1t->lru_list);
+
+	h = q->header = qcow1_read_header(fd);
+	if (!h)
+		goto free_qcow;
+
+	q->version = QCOW1_VERSION;
+	q->cluster_size = 1 << q->header->cluster_bits;
+	q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1;
+	q->free_clust_idx = 0;
+
+	q->cluster_data = malloc(q->cluster_size);
+	if (!q->cluster_data) {
+		pr_warning("cluster data malloc error");
+		goto free_header;
+	}
+
+	q->cluster_cache = malloc(q->cluster_size);
+	if (!q->cluster_cache) {
+		pr_warning("cluster cache malloc error");
+		goto free_cluster_data;
+	}
+
+	if (qcow_read_l1_table(q) < 0)
+		goto free_cluster_cache;
+
+	/*
+	 * Do not use mmap; use read/write instead.
+	 */
+	if (readonly)
+		disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR);
+	else
+		disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR);
+
+	if (!disk_image)
+		goto free_l1_table;
+
+	/* qcow I/O goes through synchronous pread/pwrite, as for qcow2 above */
+	disk_image->async = 0;
+	disk_image->priv = q;
+
+	return disk_image;
+
+free_l1_table:
+	if (q->table.l1_table)
+		free(q->table.l1_table);
+free_cluster_cache:
+	if (q->cluster_cache)
+		free(q->cluster_cache);
+free_cluster_data:
+	if (q->cluster_data)
+		free(q->cluster_data);
+free_header:
+	if (q->header)
+		free(q->header);
+free_qcow:
+	free(q);
+
+	return NULL;
+}
+
+static bool qcow1_check_image(int fd)
+{
+	struct qcow1_header_disk f_header;
+
+	if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0)
+		return false;
+
+	be32_to_cpus(&f_header.magic);
+	be32_to_cpus(&f_header.version);
+
+	if (f_header.magic != QCOW_MAGIC)
+		return false;
+
+	if (f_header.version != QCOW1_VERSION)
+		return false;
+
+	return true;
+}
+
+struct disk_image *qcow_probe(int fd, bool readonly)
+{
+	if (qcow1_check_image(fd))
+		return qcow1_probe(fd, readonly);
+
+	if (qcow2_check_image(fd))
+		return qcow2_probe(fd, readonly);
+
+	return NULL;
+}
diff --git a/tools/kvm/disk/raw.c b/tools/kvm/disk/raw.c
new file mode 100644
index 0000000..93b2b4e
--- /dev/null
+++ b/tools/kvm/disk/raw.c
@@ -0,0 +1,141 @@
+#include "kvm/disk-image.h"
+
+#include <linux/err.h>
+
+#ifdef CONFIG_HAS_AIO
+#include <libaio.h>
+#endif
+
+ssize_t raw_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param)
+{
+	u64 offset = sector << SECTOR_SHIFT;
+
+#ifdef CONFIG_HAS_AIO
+	struct iocb iocb;
+
+	return aio_preadv(disk->ctx, &iocb, disk->fd, iov, iovcount, offset,
+				disk->evt, param);
+#else
+	return preadv_in_full(disk->fd, iov, iovcount, offset);
+#endif
+}
+
+ssize_t raw_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param)
+{
+	u64 offset = sector << SECTOR_SHIFT;
+
+#ifdef CONFIG_HAS_AIO
+	struct iocb iocb;
+
+	return aio_pwritev(disk->ctx, &iocb, disk->fd, iov, iovcount, offset,
+				disk->evt, param);
+#else
+	return pwritev_in_full(disk->fd, iov, iovcount, offset);
+#endif
+}
+
+ssize_t raw_image__read_mmap(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param)
+{
+	u64 offset = sector << SECTOR_SHIFT;
+	ssize_t total = 0;
+
+	while (iovcount--) {
+		memcpy(iov->iov_base, disk->priv + offset, iov->iov_len);
+
+		sector	+= iov->iov_len >> SECTOR_SHIFT;
+		offset	+= iov->iov_len;
+		total	+= iov->iov_len;
+		iov++;
+	}
+
+	return total;
+}
+
+ssize_t raw_image__write_mmap(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param)
+{
+	u64 offset = sector << SECTOR_SHIFT;
+	ssize_t total = 0;
+
+	while (iovcount--) {
+		memcpy(disk->priv + offset, iov->iov_base, iov->iov_len);
+
+		sector	+= iov->iov_len >> SECTOR_SHIFT;
+		offset	+= iov->iov_len;
+		total	+= iov->iov_len;
+		iov++;
+	}
+
+	return total;
+}
+
+int raw_image__close(struct disk_image *disk)
+{
+	int ret = 0;
+
+	if (disk->priv != MAP_FAILED)
+		ret = munmap(disk->priv, disk->size);
+
+	close(disk->evt);
+
+#ifdef CONFIG_HAS_AIO
+	io_destroy(disk->ctx);
+#endif
+
+	return ret;
+}
+
+/*
+ * multiple buffer based disk image operations
+ */
+static struct disk_image_operations raw_image_regular_ops = {
+	.read	= raw_image__read,
+	.write	= raw_image__write,
+};
+
+struct disk_image_operations ro_ops = {
+	.read	= raw_image__read_mmap,
+	.write	= raw_image__write_mmap,
+	.close	= raw_image__close,
+};
+
+struct disk_image_operations ro_ops_nowrite = {
+	.read	= raw_image__read,
+};
+
+struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly)
+{
+	struct disk_image *disk;
+
+	if (readonly) {
+		/*
+		 * Use mmap's MAP_PRIVATE to implement non-persistent write
+		 * FIXME: This does not work on 32-bit host.
+		 */
+		disk = disk_image__new(fd, st->st_size, &ro_ops, DISK_IMAGE_MMAP);
+		if (IS_ERR_OR_NULL(disk)) {
+			disk = disk_image__new(fd, st->st_size, &ro_ops_nowrite, DISK_IMAGE_REGULAR);
+#ifdef CONFIG_HAS_AIO
+			if (!IS_ERR_OR_NULL(disk))
+				disk->async = 1;
+#endif
+		}
+
+		return disk;
+	} else {
+		/*
+		 * Use read/write instead of mmap
+		 */
+		disk = disk_image__new(fd, st->st_size, &raw_image_regular_ops, DISK_IMAGE_REGULAR);
+#ifdef CONFIG_HAS_AIO
+		if (!IS_ERR_OR_NULL(disk))
+			disk->async = 1;
+#endif
+		return disk;
+	}
+}
diff --git a/tools/kvm/framebuffer.c b/tools/kvm/framebuffer.c
new file mode 100644
index 0000000..fb8f51d
--- /dev/null
+++ b/tools/kvm/framebuffer.c
@@ -0,0 +1,80 @@
+#include "kvm/framebuffer.h"
+#include "kvm/kvm.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <errno.h>
+
+static LIST_HEAD(framebuffers);
+
+struct framebuffer *fb__register(struct framebuffer *fb)
+{
+	INIT_LIST_HEAD(&fb->node);
+	list_add(&fb->node, &framebuffers);
+
+	return fb;
+}
+
+int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops)
+{
+	if (fb->nr_targets >= FB_MAX_TARGETS)
+		return -ENOSPC;
+
+	fb->targets[fb->nr_targets++] = ops;
+
+	return 0;
+}
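+
+/*
+ * Illustrative flow (not part of this file): a display provider first
+ * registers its framebuffer, then output backends attach, roughly as
+ *
+ *	struct framebuffer *fb = fb__register(&vesa_fb);
+ *
+ *	if (fb)
+ *		fb__attach(fb, &sdl_ops);
+ *
+ * where vesa_fb and sdl_ops stand in for hypothetical provider and
+ * target objects.
+ */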
+
+static int start_targets(struct framebuffer *fb)
+{
+	unsigned long i;
+
+	for (i = 0; i < fb->nr_targets; i++) {
+		struct fb_target_operations *ops = fb->targets[i];
+		int err = 0;
+
+		if (ops->start)
+			err = ops->start(fb);
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+int fb__init(struct kvm *kvm)
+{
+	struct framebuffer *fb;
+
+	list_for_each_entry(fb, &framebuffers, node) {
+		int err;
+
+		err = start_targets(fb);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+firmware_init(fb__init);
+
+int fb__exit(struct kvm *kvm)
+{
+	struct framebuffer *fb;
+
+	list_for_each_entry(fb, &framebuffers, node) {
+		u32 i;
+
+		for (i = 0; i < fb->nr_targets; i++)
+			if (fb->targets[i]->stop)
+				fb->targets[i]->stop(fb);
+
+		munmap(fb->mem, fb->mem_size);
+	}
+
+	return 0;
+}
+firmware_exit(fb__exit);
diff --git a/tools/kvm/guest/init.c b/tools/kvm/guest/init.c
new file mode 100644
index 0000000..8c49a03
--- /dev/null
+++ b/tools/kvm/guest/init.c
@@ -0,0 +1,76 @@
+/*
+ * This is a simple init for shared rootfs guests. This part should be limited
+ * to doing mounts and running stage 2 of the init process.
+ */
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <sys/reboot.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include <linux/reboot.h>
+
+static int run_process(char *filename)
+{
+	char *new_argv[] = { filename, NULL };
+	char *new_env[] = { "TERM=linux", "DISPLAY=192.168.33.1:0",
+				"HOME=/virt/home", NULL };
+
+	return execve(filename, new_argv, new_env);
+}
+
+static int run_process_sandbox(char *filename)
+{
+	char *new_argv[] = { filename, "/virt/sandbox.sh", NULL };
+	char *new_env[] = { "TERM=linux", "HOME=/virt/home", NULL };
+
+	return execve(filename, new_argv, new_env);
+}
+
+static void do_mounts(void)
+{
+	mount("hostfs", "/host", "9p", MS_RDONLY, "trans=virtio,version=9p2000.L");
+	mount("", "/sys", "sysfs", 0, NULL);
+	mount("proc", "/proc", "proc", 0, NULL);
+	mount("devtmpfs", "/dev", "devtmpfs", 0, NULL);
+	mkdir("/dev/pts", 0755);
+	mount("devpts", "/dev/pts", "devpts", 0, NULL);
+}
+
+int main(int argc, char *argv[])
+{
+	pid_t child;
+	int status;
+
+	puts("Mounting...");
+
+	do_mounts();
+
+	/* become the session leader */
+	setsid();
+
+	/* set controlling terminal */
+	ioctl(0, TIOCSCTTY, 1);
+
+	child = fork();
+	if (child < 0) {
+		printf("Fatal: fork() failed with %d\n", child);
+		return 0;
+	} else if (child == 0) {
+		if (access("/virt/sandbox.sh", R_OK) == 0)
+			run_process_sandbox("/bin/sh");
+		else
+			run_process("/bin/sh");
+	} else {
+		pid_t corpse;
+
+		do {
+			corpse = waitpid(-1, &status, 0);
+		} while (corpse != child);
+	}
+
+	reboot(LINUX_REBOOT_CMD_RESTART);
+
+	printf("Init failed: %s\n", strerror(errno));
+
+	return 0;
+}
diff --git a/tools/kvm/guest_compat.c b/tools/kvm/guest_compat.c
new file mode 100644
index 0000000..fd4704b
--- /dev/null
+++ b/tools/kvm/guest_compat.c
@@ -0,0 +1,99 @@
+#include "kvm/guest_compat.h"
+
+#include "kvm/mutex.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct compat_message {
+	int id;
+	char *title;
+	char *desc;
+
+	struct list_head list;
+};
+
+static int id;
+static DEFINE_MUTEX(compat_mtx);
+static LIST_HEAD(messages);
+
+static void compat__free(struct compat_message *msg)
+{
+	free(msg->title);
+	free(msg->desc);
+	free(msg);
+}
+
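+/*
+ * Queue a compatibility warning for the user. Returns a non-negative
+ * message id on success (which can later be passed to
+ * compat__remove_message()), or -ENOMEM if allocation fails.
+ */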
+int compat__add_message(const char *title, const char *desc)
+{
+	struct compat_message *msg;
+	int msg_id;
+
+	msg = malloc(sizeof(*msg));
+	if (msg == NULL)
+		goto cleanup;
+
+	msg->title = strdup(title);
+	msg->desc = strdup(desc);
+
+	if (msg->title == NULL || msg->desc == NULL)
+		goto cleanup;
+
+	mutex_lock(&compat_mtx);
+
+	msg->id = msg_id = id++;
+	list_add_tail(&msg->list, &messages);
+
+	mutex_unlock(&compat_mtx);
+
+	return msg_id;
+
+cleanup:
+	if (msg)
+		compat__free(msg);
+
+	return -ENOMEM;
+}
+
+int compat__remove_message(int id)
+{
+	struct compat_message *pos, *n;
+
+	mutex_lock(&compat_mtx);
+
+	list_for_each_entry_safe(pos, n, &messages, list) {
+		if (pos->id == id) {
+			list_del(&pos->list);
+			compat__free(pos);
+
+			mutex_unlock(&compat_mtx);
+
+			return 0;
+		}
+	}
+
+	mutex_unlock(&compat_mtx);
+
+	return -ENOENT;
+}
+
+int compat__print_all_messages(void)
+{
+	mutex_lock(&compat_mtx);
+
+	while (!list_empty(&messages)) {
+		struct compat_message *msg;
+
+		msg = list_first_entry(&messages, struct compat_message, list);
+
+		printf("\n  # KVM compatibility warning.\n\t%s\n\t%s\n",
+			msg->title, msg->desc);
+
+		list_del(&msg->list);
+		compat__free(msg);
+	}
+
+	mutex_unlock(&compat_mtx);
+
+	return 0;
+}
diff --git a/tools/kvm/hw/i8042.c b/tools/kvm/hw/i8042.c
new file mode 100644
index 0000000..3801e20
--- /dev/null
+++ b/tools/kvm/hw/i8042.c
@@ -0,0 +1,363 @@
+#include "kvm/read-write.h"
+#include "kvm/ioport.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+#include "kvm/i8042.h"
+#include "kvm/kvm-cpu.h"
+
+#include <stdint.h>
+
+/*
+ * IRQs
+ */
+#define KBD_IRQ			1
+#define AUX_IRQ			12
+
+/*
+ * Registers
+ */
+#define I8042_DATA_REG		0x60
+#define I8042_PORT_B_REG	0x61
+#define I8042_COMMAND_REG	0x64
+
+/*
+ * Commands
+ */
+#define I8042_CMD_CTL_RCTR	0x20
+#define I8042_CMD_CTL_WCTR	0x60
+#define I8042_CMD_AUX_LOOP	0xD3
+#define I8042_CMD_AUX_SEND	0xD4
+#define I8042_CMD_AUX_TEST	0xA9
+#define I8042_CMD_AUX_DISABLE	0xA7
+#define I8042_CMD_AUX_ENABLE	0xA8
+#define I8042_CMD_SYSTEM_RESET	0xFE
+
+#define RESPONSE_ACK		0xFA
+
+#define MODE_DISABLE_AUX	0x20
+
+#define AUX_ENABLE_REPORTING	0x20
+#define AUX_SCALING_FLAG	0x10
+#define AUX_DEFAULT_RESOLUTION	0x2
+#define AUX_DEFAULT_SAMPLE	100
+
+/*
+ * Status register bits
+ */
+#define I8042_STR_AUXDATA	0x20
+#define I8042_STR_KEYLOCK	0x10
+#define I8042_STR_CMDDAT	0x08
+#define I8042_STR_MUXERR	0x04
+#define I8042_STR_OBF		0x01
+
+#define KBD_MODE_KBD_INT	0x01
+#define KBD_MODE_SYS		0x02
+
+#define QUEUE_SIZE		128
+
+/*
+ * This represents the current state of the PS/2 keyboard system,
+ * including the AUX device (the mouse)
+ */
+struct kbd_state {
+	struct kvm		*kvm;
+
+	char			kq[QUEUE_SIZE];	/* Keyboard queue */
+	int			kread, kwrite;	/* Indexes into the queue */
+	int			kcount;		/* number of elements in queue */
+
+	char			mq[QUEUE_SIZE];
+	int			mread, mwrite;
+	int			mcount;
+
+	u8			mstatus;	/* Mouse status byte */
+	u8			mres;		/* Current mouse resolution */
+	u8			msample;	/* Current mouse samples/second */
+
+	u8			mode;		/* i8042 mode register */
+	u8			status;		/* i8042 status register */
+	/*
+	 * Some commands (on port 0x64) have arguments;
+	 * we store the command here while we wait for the argument
+	 */
+	u32			write_cmd;
+};
+
+static struct kbd_state		state;
+
+/*
+ * If there are packets to be read, set the appropriate IRQs high
+ */
+static void kbd_update_irq(void)
+{
+	u8 klevel = 0;
+	u8 mlevel = 0;
+
+	/* First, clear the kbd and aux output buffer full bits */
+	state.status &= ~(I8042_STR_OBF | I8042_STR_AUXDATA);
+
+	if (state.kcount > 0) {
+		state.status |= I8042_STR_OBF;
+		klevel = 1;
+	}
+
+	/* Keyboard has higher priority than mouse */
+	if (klevel == 0 && state.mcount != 0) {
+		state.status |= I8042_STR_OBF | I8042_STR_AUXDATA;
+		mlevel = 1;
+	}
+
+	kvm__irq_line(state.kvm, KBD_IRQ, klevel);
+	kvm__irq_line(state.kvm, AUX_IRQ, mlevel);
+}
+
+/*
+ * Add a byte to the mouse queue, then set IRQs
+ */
+void mouse_queue(u8 c)
+{
+	if (state.mcount >= QUEUE_SIZE)
+		return;
+
+	state.mq[state.mwrite++ % QUEUE_SIZE] = c;
+
+	state.mcount++;
+	kbd_update_irq();
+}
+
+/*
+ * Add a byte to the keyboard queue, then set IRQs
+ */
+void kbd_queue(u8 c)
+{
+	if (state.kcount >= QUEUE_SIZE)
+		return;
+
+	state.kq[state.kwrite++ % QUEUE_SIZE] = c;
+
+	state.kcount++;
+	kbd_update_irq();
+}
+
+static void kbd_write_command(struct kvm *kvm, u8 val)
+{
+	switch (val) {
+	case I8042_CMD_CTL_RCTR:
+		kbd_queue(state.mode);
+		break;
+	case I8042_CMD_CTL_WCTR:
+	case I8042_CMD_AUX_SEND:
+	case I8042_CMD_AUX_LOOP:
+		state.write_cmd = val;
+		break;
+	case I8042_CMD_AUX_TEST:
+		/* 0 means we're a normal PS/2 mouse */
+		mouse_queue(0);
+		break;
+	case I8042_CMD_AUX_DISABLE:
+		state.mode |= MODE_DISABLE_AUX;
+		break;
+	case I8042_CMD_AUX_ENABLE:
+		state.mode &= ~MODE_DISABLE_AUX;
+		break;
+	case I8042_CMD_SYSTEM_RESET:
+		kvm_cpu__reboot(kvm);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Called when the OS reads from port 0x60 (PS/2 data)
+ */
+static u32 kbd_read_data(void)
+{
+	u32 ret;
+	int i;
+
+	if (state.kcount != 0) {
+		/* Keyboard data gets read first */
+		ret = state.kq[state.kread++ % QUEUE_SIZE];
+		state.kcount--;
+		kvm__irq_line(state.kvm, KBD_IRQ, 0);
+		kbd_update_irq();
+	} else if (state.mcount > 0) {
+		/* Followed by the mouse */
+		ret = state.mq[state.mread++ % QUEUE_SIZE];
+		state.mcount--;
+		kvm__irq_line(state.kvm, AUX_IRQ, 0);
+		kbd_update_irq();
+	} else {
+		/* Queue is empty: replay the most recently read byte */
+		i = (state.kread - 1) % QUEUE_SIZE;
+		if (i < 0)
+			i = QUEUE_SIZE - 1;
+		ret = state.kq[i];
+	}
+	return ret;
+}
+
+/*
+ * Called when the OS read from port 0x64, the command port
+ */
+static u32 kbd_read_status(void)
+{
+	return (u32)state.status;
+}
+
+/*
+ * Called when the OS writes to port 0x60 (data port)
+ * Things written here are generally arguments to commands previously
+ * written to port 0x64 and stored in state.write_cmd
+ */
+static void kbd_write_data(u32 val)
+{
+	switch (state.write_cmd) {
+	case I8042_CMD_CTL_WCTR:
+		state.mode = val;
+		kbd_update_irq();
+		break;
+	case I8042_CMD_AUX_LOOP:
+		mouse_queue(val);
+		mouse_queue(RESPONSE_ACK);
+		break;
+	case I8042_CMD_AUX_SEND:
+		/* The OS wants to send a command to the mouse */
+		mouse_queue(RESPONSE_ACK);
+		switch (val) {
+		case 0xe6:
+			/* set scaling = 1:1 */
+			state.mstatus &= ~AUX_SCALING_FLAG;
+			break;
+		case 0xe8:
+			/* set resolution */
+			state.mres = val;
+			break;
+		case 0xe9:
+			/* Report mouse status/config */
+			mouse_queue(state.mstatus);
+			mouse_queue(state.mres);
+			mouse_queue(state.msample);
+			break;
+		case 0xf2:
+			/* send ID */
+			mouse_queue(0); /* normal mouse */
+			break;
+		case 0xf3:
+			/* set sample rate */
+			state.msample = val;
+			break;
+		case 0xf4:
+			/* enable reporting */
+			state.mstatus |= AUX_ENABLE_REPORTING;
+			break;
+		case 0xf5:
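+			/* disable reporting */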
+			state.mstatus &= ~AUX_ENABLE_REPORTING;
+			break;
+		case 0xf6:
+			/* set defaults, just fall through to reset */
+		case 0xff:
+			/* reset */
+			state.mstatus = 0x0;
+			state.mres = AUX_DEFAULT_RESOLUTION;
+			state.msample = AUX_DEFAULT_SAMPLE;
+			break;
+		default:
+			break;
+		}
+		break;
+	case 0:
+		/* Just send the ID */
+		kbd_queue(RESPONSE_ACK);
+		kbd_queue(0xab);
+		kbd_queue(0x41);
+		kbd_update_irq();
+		break;
+	default:
+		/* Yeah whatever */
+		break;
+	}
+	state.write_cmd = 0;
+}
+
+static void kbd_reset(void)
+{
+	state = (struct kbd_state) {
+		.status		= I8042_STR_MUXERR | I8042_STR_CMDDAT | I8042_STR_KEYLOCK, /* 0x1c */
+		.mode		= KBD_MODE_KBD_INT | KBD_MODE_SYS, /* 0x3 */
+		.mres		= AUX_DEFAULT_RESOLUTION,
+		.msample	= AUX_DEFAULT_SAMPLE,
+	};
+}
+
+/*
+ * Called when the OS has written to one of the keyboard's ports (0x60 or 0x64)
+ */
+static bool kbd_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	switch (port) {
+	case I8042_COMMAND_REG: {
+		u8 value = kbd_read_status();
+		ioport__write8(data, value);
+		break;
+	}
+	case I8042_DATA_REG: {
+		u32 value = kbd_read_data();
+		ioport__write32(data, value);
+		break;
+	}
+	case I8042_PORT_B_REG: {
+		ioport__write8(data, 0x20);
+		break;
+	}
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static bool kbd_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	switch (port) {
+	case I8042_COMMAND_REG: {
+		u8 value = ioport__read8(data);
+		kbd_write_command(vcpu->kvm, value);
+		break;
+	}
+	case I8042_DATA_REG: {
+		u32 value = ioport__read32(data);
+		kbd_write_data(value);
+		break;
+	}
+	case I8042_PORT_B_REG: {
+		break;
+	}
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static struct ioport_operations kbd_ops = {
+	.io_in		= kbd_in,
+	.io_out		= kbd_out,
+};
+
+int kbd__init(struct kvm *kvm)
+{
+#ifndef CONFIG_X86
+	return 0;
+#endif
+
+	kbd_reset();
+	state.kvm = kvm;
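+	/*
+	 * Each registration spans two ports: 0x60/0x61 cover the data
+	 * and port B registers, 0x64/0x65 the command/status register.
+	 */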
+	ioport__register(kvm, I8042_DATA_REG, &kbd_ops, 2, NULL);
+	ioport__register(kvm, I8042_COMMAND_REG, &kbd_ops, 2, NULL);
+
+	return 0;
+}
+dev_init(kbd__init);
diff --git a/tools/kvm/hw/pci-shmem.c b/tools/kvm/hw/pci-shmem.c
new file mode 100644
index 0000000..a1c5ab7
--- /dev/null
+++ b/tools/kvm/hw/pci-shmem.c
@@ -0,0 +1,403 @@
+#include "kvm/devices.h"
+#include "kvm/pci-shmem.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/util.h"
+#include "kvm/ioport.h"
+#include "kvm/ioeventfd.h"
+
+#include <linux/kvm.h>
+#include <linux/byteorder.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+#define MB_SHIFT (20)
+#define KB_SHIFT (10)
+#define GB_SHIFT (30)
+
+static struct pci_device_header pci_shmem_pci_device = {
+	.vendor_id	= cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET),
+	.device_id	= cpu_to_le16(0x1110),
+	.header_type	= PCI_HEADER_TYPE_NORMAL,
+	.class[2]	= 0xFF,	/* misc pci device */
+	.status		= cpu_to_le16(PCI_STATUS_CAP_LIST),
+	.capabilities	= (void *)&pci_shmem_pci_device.msix - (void *)&pci_shmem_pci_device,
+	.msix.cap	= PCI_CAP_ID_MSIX,
+	.msix.ctrl	= cpu_to_le16(1),
+	.msix.table_offset = cpu_to_le32(1),		/* Use BAR 1 */
+	.msix.pba_offset = cpu_to_le32(0x1001),		/* Use BAR 1 */
+};
+
+static struct device_header pci_shmem_device = {
+	.bus_type	= DEVICE_BUS_PCI,
+	.data		= &pci_shmem_pci_device,
+};
+
+/* registers for the Inter-VM shared memory device */
+enum ivshmem_registers {
+	INTRMASK = 0,
+	INTRSTATUS = 4,
+	IVPOSITION = 8,
+	DOORBELL = 12,
+};
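+/*
+ * Each register is a 32-bit slot in the BAR 0 I/O window; the
+ * shmem_pci__io_in/out handlers below dispatch on the port offset.
+ */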
+
+static struct shmem_info *shmem_region;
+static u16 ivshmem_registers;
+static int local_fd;
+static u32 local_id;
+static u64 msix_block;
+static u64 msix_pba;
+static struct msix_table msix_table[2];
+
+int pci_shmem__register_mem(struct shmem_info *si)
+{
+	if (!shmem_region) {
+		shmem_region = si;
+	} else {
+		pr_warning("only single shmem currently avail. ignoring.\n");
+		free(si);
+	}
+	return 0;
+}
+
+static bool shmem_pci__io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	u16 offset = port - ivshmem_registers;
+
+	switch (offset) {
+	case INTRMASK:
+		break;
+	case INTRSTATUS:
+		break;
+	case IVPOSITION:
+		ioport__write32(data, local_id);
+		break;
+	case DOORBELL:
+		break;
+	};
+
+	return true;
+}
+
+static bool shmem_pci__io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	u16 offset = port - ivshmem_registers;
+
+	switch (offset) {
+	case INTRMASK:
+		break;
+	case INTRSTATUS:
+		break;
+	case IVPOSITION:
+		break;
+	case DOORBELL:
+		break;
+	};
+
+	return true;
+}
+
+static struct ioport_operations shmem_pci__io_ops = {
+	.io_in	= shmem_pci__io_in,
+	.io_out	= shmem_pci__io_out,
+};
+
+static void callback_mmio_msix(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr)
+{
+	void *mem;
+
+	u64 offset = addr - msix_block;
+
+	if (offset < 0x1000) {
+		mem = (void *)&msix_table + offset;
+	} else {
+		/* the PBA lives at offset 0x1000 of the MSI-X BAR */
+		mem = (void *)&msix_pba + (offset - 0x1000);
+	}
+
+	if (is_write)
+		memcpy(mem, data, len);
+	else
+		memcpy(data, mem, len);
+}
+
+/*
+ * Return an irqfd which can be used by other guests to signal this guest
+ * whenever they need to poke it
+ */
+int pci_shmem__get_local_irqfd(struct kvm *kvm)
+{
+	int fd, gsi, r;
+	struct kvm_irqfd irqfd;
+
+	if (local_fd == 0) {
+		fd = eventfd(0, 0);
+		if (fd < 0)
+			return fd;
+
+		if (pci_shmem_pci_device.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_ENABLE)) {
+			gsi = irq__add_msix_route(kvm, &msix_table[0].msg);
+		} else {
+			gsi = pci_shmem_pci_device.irq_line;
+		}
+
+		irqfd = (struct kvm_irqfd) {
+			.fd = fd,
+			.gsi = gsi,
+		};
+
+		r = ioctl(kvm->vm_fd, KVM_IRQFD, &irqfd);
+		if (r < 0)
+			return r;
+
+		local_fd = fd;
+	}
+
+	return local_fd;
+}
+
+/*
+ * Connect a new client to ivshmem by adding the appropriate datamatch
+ * to the DOORBELL
+ */
+int pci_shmem__add_client(struct kvm *kvm, u32 id, int fd)
+{
+	struct kvm_ioeventfd ioevent;
+
+	ioevent = (struct kvm_ioeventfd) {
+		.addr		= ivshmem_registers + DOORBELL,
+		.len		= sizeof(u32),
+		.datamatch	= id,
+		.fd		= fd,
+		.flags		= KVM_IOEVENTFD_FLAG_PIO | KVM_IOEVENTFD_FLAG_DATAMATCH,
+	};
+
+	return ioctl(kvm->vm_fd, KVM_IOEVENTFD, &ioevent);
+}
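+
+/*
+ * Usage sketch: once pci_shmem__add_client(kvm, id, fd) has installed
+ * the datamatch, a guest write of the 32-bit value <id> to the DOORBELL
+ * register signals the peer's eventfd <fd> directly in the kernel,
+ * without a userspace exit.
+ */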
+
+/*
+ * Remove a client connected to ivshmem by removing the appropriate datamatch
+ * from the DOORBELL
+ */
+int pci_shmem__remove_client(struct kvm *kvm, u32 id)
+{
+	struct kvm_ioeventfd ioevent;
+
+	ioevent = (struct kvm_ioeventfd) {
+		.addr		= ivshmem_registers + DOORBELL,
+		.len		= sizeof(u32),
+		.datamatch	= id,
+		.flags		= KVM_IOEVENTFD_FLAG_PIO
+				| KVM_IOEVENTFD_FLAG_DATAMATCH
+				| KVM_IOEVENTFD_FLAG_DEASSIGN,
+	};
+
+	return ioctl(kvm->vm_fd, KVM_IOEVENTFD, &ioevent);
+}
+
+static void *setup_shmem(const char *key, size_t len, int creating)
+{
+	int fd;
+	int rtn;
+	void *mem;
+	int flag = O_RDWR;
+
+	if (creating)
+		flag |= O_CREAT;
+
+	fd = shm_open(key, flag, S_IRUSR | S_IWUSR);
+	if (fd < 0) {
+		pr_warning("Failed to open shared memory file %s\n", key);
+		return NULL;
+	}
+
+	if (creating) {
+		rtn = ftruncate(fd, (off_t) len);
+		if (rtn < 0)
+			pr_warning("Can't ftruncate(fd,%zu)\n", len);
+	}
+	mem = mmap(NULL, len,
+		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, fd, 0);
+	if (mem == MAP_FAILED) {
+		pr_warning("Failed to mmap shared memory file");
+		mem = NULL;
+	}
+	close(fd);
+
+	return mem;
+}
+
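+/*
+ * Parse an ivshmem option string. As implemented below, the accepted
+ * form is roughly
+ *
+ *	pci:<phys_addr>:<size>[KMG][B][:handle=<name>][:create]
+ *
+ * e.g. "pci:0x8f000000:16M:handle=/shm:create"; omitted fields fall
+ * back to the SHMEM_DEFAULT_* values.
+ */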
+int shmem_parser(const struct option *opt, const char *arg, int unset)
+{
+	const u64 default_size = SHMEM_DEFAULT_SIZE;
+	const u64 default_phys_addr = SHMEM_DEFAULT_ADDR;
+	const char *default_handle = SHMEM_DEFAULT_HANDLE;
+	struct shmem_info *si = malloc(sizeof(struct shmem_info));
+	u64 phys_addr;
+	u64 size;
+	char *handle = NULL;
+	int create = 0;
+	const char *p = arg;
+	char *next;
+	int base = 10;
+	int verbose = 0;
+
+	const int skip_pci = strlen("pci:");
+	if (verbose)
+		pr_info("shmem_parser(%p,%s,%d)", opt, arg, unset);
+	/* parse out optional addr family */
+	if (strcasestr(p, "pci:")) {
+		p += skip_pci;
+	} else if (strcasestr(p, "mem:")) {
+		die("I can't add to E820 map yet.\n");
+	}
+	/* parse out physical addr */
+	base = 10;
+	if (strcasestr(p, "0x"))
+		base = 16;
+	phys_addr = strtoll(p, &next, base);
+	if (next == p && phys_addr == 0) {
+		pr_info("shmem: no physical addr specified, using default.");
+		phys_addr = default_phys_addr;
+	}
+	if (*next != ':' && *next != '\0')
+		die("shmem: unexpected chars after phys addr.\n");
+	if (*next == '\0')
+		p = next;
+	else
+		p = next + 1;
+	/* parse out size */
+	base = 10;
+	if (strcasestr(p, "0x"))
+		base = 16;
+	size = strtoll(p, &next, base);
+	if (next == p && size == 0) {
+		pr_info("shmem: no size specified, using default.");
+		size = default_size;
+	}
+	/* look for [KMGkmg][Bb]*  uses base 2. */
+	int skip_B = 0;
+	if (strspn(next, "KMGkmg")) {	/* might have a prefix */
+		if (*(next + 1) == 'B' || *(next + 1) == 'b')
+			skip_B = 1;
+		switch (*next) {
+		case 'K':
+		case 'k':
+			size = size << KB_SHIFT;
+			break;
+		case 'M':
+		case 'm':
+			size = size << MB_SHIFT;
+			break;
+		case 'G':
+		case 'g':
+			size = size << GB_SHIFT;
+			break;
+		default:
+			die("shmem: bug in detecting size prefix.");
+			break;
+		}
+		next += 1 + skip_B;
+	}
+	if (*next != ':' && *next != '\0') {
+		die("shmem: unexpected chars after phys size. <%c><%c>\n",
+		    *next, *p);
+	}
+	if (*next == '\0')
+		p = next;
+	else
+		p = next + 1;
+	/* parse out optional shmem handle */
+	const int skip_handle = strlen("handle=");
+	next = strcasestr(p, "handle=");
+	if (*p && next) {
+		if (p != next)
+			die("unexpected chars before handle\n");
+		p += skip_handle;
+		next = strchrnul(p, ':');
+		if (next - p) {
+			handle = malloc(next - p + 1);
+			strncpy(handle, p, next - p);
+			handle[next - p] = '\0';	/* just in case. */
+		}
+		if (*next == '\0')
+			p = next;
+		else
+			p = next + 1;
+	}
+	/* parse optional create flag to see if we should create shm seg. */
+	if (*p && strcasestr(p, "create")) {
+		create = 1;
+		p += strlen("create");
+	}
+	if (*p != '\0')
+		die("shmem: unexpected trailing chars\n");
+	if (handle == NULL) {
+		handle = malloc(strlen(default_handle) + 1);
+		strcpy(handle, default_handle);
+	}
+	if (verbose) {
+		pr_info("shmem: phys_addr = %llx",
+			(unsigned long long)phys_addr);
+		pr_info("shmem: size      = %llx", (unsigned long long)size);
+		pr_info("shmem: handle    = %s", handle);
+		pr_info("shmem: create    = %d", create);
+	}
+
+	si->phys_addr = phys_addr;
+	si->size = size;
+	si->handle = handle;
+	si->create = create;
+	pci_shmem__register_mem(si);	/* ownership of si, etc. passed on. */
+	return 0;
+}
+
+int pci_shmem__init(struct kvm *kvm)
+{
+	char *mem;
+	int r;
+
+	if (shmem_region == NULL)
+		return 0;
+
+	/* Register the I/O port window for the ivshmem registers (BAR 0) */
+	r = ioport__register(kvm, IOPORT_EMPTY, &shmem_pci__io_ops, IOPORT_SIZE, NULL);
+	if (r < 0)
+		return r;
+	ivshmem_registers = (u16)r;
+
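+	/* Register MMIO space for the MSI-X table and PBA */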
+	msix_block = pci_get_io_space_block(0x1010);
+	kvm__register_mmio(kvm, msix_block, 0x1010, false, callback_mmio_msix, NULL);
+
+	/*
+	 * This registers 3 BARs:
+	 *
+	 * 0 - ivshmem registers
+	 * 1 - MSI-X MMIO space
+	 * 2 - Shared memory block
+	 */
+	pci_shmem_pci_device.bar[0] = cpu_to_le32(ivshmem_registers | PCI_BASE_ADDRESS_SPACE_IO);
+	pci_shmem_pci_device.bar_size[0] = IOPORT_SIZE;
+	pci_shmem_pci_device.bar[1] = cpu_to_le32(msix_block | PCI_BASE_ADDRESS_SPACE_MEMORY);
+	pci_shmem_pci_device.bar_size[1] = 0x1010;
+	pci_shmem_pci_device.bar[2] = cpu_to_le32(shmem_region->phys_addr | PCI_BASE_ADDRESS_SPACE_MEMORY);
+	pci_shmem_pci_device.bar_size[2] = shmem_region->size;
+
+	device__register(&pci_shmem_device);
+
+	/* Open shared memory and plug it into the guest */
+	mem = setup_shmem(shmem_region->handle, shmem_region->size,
+				shmem_region->create);
+	if (mem == NULL)
+		return -EINVAL;
+
+	kvm__register_mem(kvm, shmem_region->phys_addr, shmem_region->size,
+			  mem);
+	return 0;
+}
+dev_init(pci_shmem__init);
+
+int pci_shmem__exit(struct kvm *kvm)
+{
+	return 0;
+}
+dev_exit(pci_shmem__exit);
diff --git a/tools/kvm/hw/rtc.c b/tools/kvm/hw/rtc.c
new file mode 100644
index 0000000..0649b5d
--- /dev/null
+++ b/tools/kvm/hw/rtc.c
@@ -0,0 +1,155 @@
+#include "kvm/rtc.h"
+
+#include "kvm/ioport.h"
+#include "kvm/kvm.h"
+
+#include <time.h>
+
+/*
+ * MC146818 RTC registers
+ */
+#define RTC_SECONDS			0x00
+#define RTC_SECONDS_ALARM		0x01
+#define RTC_MINUTES			0x02
+#define RTC_MINUTES_ALARM		0x03
+#define RTC_HOURS			0x04
+#define RTC_HOURS_ALARM			0x05
+#define RTC_DAY_OF_WEEK			0x06
+#define RTC_DAY_OF_MONTH		0x07
+#define RTC_MONTH			0x08
+#define RTC_YEAR			0x09
+#define RTC_CENTURY			0x32
+
+#define RTC_REG_A			0x0A
+#define RTC_REG_B			0x0B
+#define RTC_REG_C			0x0C
+#define RTC_REG_D			0x0D
+
+struct rtc_device {
+	u8			cmos_idx;
+	u8			cmos_data[128];
+};
+
+static struct rtc_device	rtc;
+
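+/* Pack a binary value into BCD form as CMOS expects: bin2bcd(59) == 0x59. */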
+static inline unsigned char bin2bcd(unsigned val)
+{
+	return ((val / 10) << 4) + val % 10;
+}
+
+static bool cmos_ram_data_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	struct tm *tm;
+	time_t ti;
+
+	time(&ti);
+
+	tm = gmtime(&ti);
+
+	switch (rtc.cmos_idx) {
+	case RTC_SECONDS:
+		ioport__write8(data, bin2bcd(tm->tm_sec));
+		break;
+	case RTC_MINUTES:
+		ioport__write8(data, bin2bcd(tm->tm_min));
+		break;
+	case RTC_HOURS:
+		ioport__write8(data, bin2bcd(tm->tm_hour));
+		break;
+	case RTC_DAY_OF_WEEK:
+		ioport__write8(data, bin2bcd(tm->tm_wday + 1));
+		break;
+	case RTC_DAY_OF_MONTH:
+		ioport__write8(data, bin2bcd(tm->tm_mday));
+		break;
+	case RTC_MONTH:
+		ioport__write8(data, bin2bcd(tm->tm_mon + 1));
+		break;
+	case RTC_YEAR: {
+		int year;
+
+		year = tm->tm_year + 1900;
+
+		ioport__write8(data, bin2bcd(year % 100));
+
+		break;
+	}
+	case RTC_CENTURY: {
+		int year;
+
+		year = tm->tm_year + 1900;
+
+		ioport__write8(data, bin2bcd(year / 100));
+
+		break;
+	}
+	default:
+		ioport__write8(data, rtc.cmos_data[rtc.cmos_idx]);
+		break;
+	}
+
+	return true;
+}
+
+static bool cmos_ram_data_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	switch (rtc.cmos_idx) {
+	case RTC_REG_C:
+	case RTC_REG_D:
+		/* Read-only */
+		break;
+	default:
+		rtc.cmos_data[rtc.cmos_idx] = ioport__read8(data);
+		break;
+	}
+
+	return true;
+}
+
+static struct ioport_operations cmos_ram_data_ioport_ops = {
+	.io_out		= cmos_ram_data_out,
+	.io_in		= cmos_ram_data_in,
+};
+
+static bool cmos_ram_index_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	u8 value = ioport__read8(data);
+
+	vcpu->kvm->nmi_disabled	= value & (1UL << 7);
+	rtc.cmos_idx		= value & ~(1UL << 7);
+
+	return true;
+}
+
+static struct ioport_operations cmos_ram_index_ioport_ops = {
+	.io_out		= cmos_ram_index_out,
+};
+
+int rtc__init(struct kvm *kvm)
+{
+	int r = 0;
+
+	/* PORT 0070-007F - CMOS RAM/RTC (REAL TIME CLOCK) */
+	r = ioport__register(kvm, 0x0070, &cmos_ram_index_ioport_ops, 1, NULL);
+	if (r < 0)
+		return r;
+
+	r = ioport__register(kvm, 0x0071, &cmos_ram_data_ioport_ops, 1, NULL);
+	if (r < 0) {
+		ioport__unregister(kvm, 0x0070);
+		return r;
+	}
+
+	return r;
+}
+dev_init(rtc__init);
+
+int rtc__exit(struct kvm *kvm)
+{
+	/* PORT 0070-007F - CMOS RAM/RTC (REAL TIME CLOCK) */
+	ioport__unregister(kvm, 0x0070);
+	ioport__unregister(kvm, 0x0071);
+
+	return 0;
+}
+dev_exit(rtc__exit);
diff --git a/tools/kvm/hw/serial.c b/tools/kvm/hw/serial.c
new file mode 100644
index 0000000..60147de
--- /dev/null
+++ b/tools/kvm/hw/serial.c
@@ -0,0 +1,452 @@
+#include "kvm/8250-serial.h"
+
+#include "kvm/read-write.h"
+#include "kvm/ioport.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+#include "kvm/fdt.h"
+
+#include <linux/types.h>
+#include <linux/serial_reg.h>
+
+#include <pthread.h>
+
+/*
+ * This fakes a U6_16550A. The FIFO length needs to be 64, as the
+ * kernel relies on that for autodetection.
+ */
+#define FIFO_LEN		64
+#define FIFO_MASK		(FIFO_LEN - 1)
+
+#define UART_IIR_TYPE_BITS	0xc0
+
+struct serial8250_device {
+	struct mutex		mutex;
+	u8			id;
+
+	u16			iobase;
+	u8			irq;
+	u8			irq_state;
+	int			txcnt;
+	int			rxcnt;
+	int			rxdone;
+	char			txbuf[FIFO_LEN];
+	char			rxbuf[FIFO_LEN];
+
+	u8			dll;
+	u8			dlm;
+	u8			iir;
+	u8			ier;
+	u8			fcr;
+	u8			lcr;
+	u8			mcr;
+	u8			lsr;
+	u8			msr;
+	u8			scr;
+};
+
+#define SERIAL_REGS_SETTING \
+	.iir			= UART_IIR_NO_INT, \
+	.lsr			= UART_LSR_TEMT | UART_LSR_THRE, \
+	.msr			= UART_MSR_DCD | UART_MSR_DSR | UART_MSR_CTS, \
+	.mcr			= UART_MCR_OUT2,
+
+static struct serial8250_device devices[] = {
+	/* ttyS0 */
+	[0]	= {
+		.mutex			= MUTEX_INITIALIZER,
+
+		.id			= 0,
+		.iobase			= 0x3f8,
+		.irq			= 4,
+
+		SERIAL_REGS_SETTING
+	},
+	/* ttyS1 */
+	[1]	= {
+		.mutex			= MUTEX_INITIALIZER,
+
+		.id			= 1,
+		.iobase			= 0x2f8,
+		.irq			= 3,
+
+		SERIAL_REGS_SETTING
+	},
+	/* ttyS2 */
+	[2]	= {
+		.mutex			= MUTEX_INITIALIZER,
+
+		.id			= 2,
+		.iobase			= 0x3e8,
+		.irq			= 4,
+
+		SERIAL_REGS_SETTING
+	},
+	/* ttyS3 */
+	[3]	= {
+		.mutex			= MUTEX_INITIALIZER,
+
+		.id			= 3,
+		.iobase			= 0x2e8,
+		.irq			= 3,
+
+		SERIAL_REGS_SETTING
+	},
+};
+
+static void serial8250_flush_tx(struct kvm *kvm, struct serial8250_device *dev)
+{
+	dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE;
+
+	if (dev->txcnt) {
+		term_putc(dev->txbuf, dev->txcnt, dev->id);
+		dev->txcnt = 0;
+	}
+}
+
+static void serial8250_update_irq(struct kvm *kvm, struct serial8250_device *dev)
+{
+	u8 iir = 0;
+
+	/* Handle clear rx (the FCR clear bits are self-clearing) */
+	if (dev->fcr & UART_FCR_CLEAR_RCVR) {
+		dev->fcr &= ~UART_FCR_CLEAR_RCVR;
+		dev->rxcnt = dev->rxdone = 0;
+		dev->lsr &= ~UART_LSR_DR;
+	}
+
+	/* Handle clear tx */
+	if (dev->fcr & UART_FCR_CLEAR_XMIT) {
+		dev->fcr &= ~UART_FCR_CLEAR_XMIT;
+		dev->txcnt = 0;
+		dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE;
+	}
+
+	/* Data ready and rcv interrupt enabled ? */
+	if ((dev->ier & UART_IER_RDI) && (dev->lsr & UART_LSR_DR))
+		iir |= UART_IIR_RDI;
+
+	/* Transmitter empty and interrupt enabled ? */
+	if ((dev->ier & UART_IER_THRI) && (dev->lsr & UART_LSR_TEMT))
+		iir |= UART_IIR_THRI;
+
+	/* Now update the irq line, if necessary */
+	if (!iir) {
+		dev->iir = UART_IIR_NO_INT;
+		if (dev->irq_state)
+			kvm__irq_line(kvm, dev->irq, 0);
+	} else {
+		dev->iir = iir;
+		if (!dev->irq_state)
+			kvm__irq_line(kvm, dev->irq, 1);
+	}
+	dev->irq_state = iir;
+
+	/*
+	 * If the kernel disabled the tx interrupt, we know that there
+	 * is nothing more to transmit, so we can reset our tx logic
+	 * here.
+	 */
+	if (!(dev->ier & UART_IER_THRI))
+		serial8250_flush_tx(kvm, dev);
+}
+
+#define SYSRQ_PENDING_NONE		0
+
+static int sysrq_pending;
+
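+/*
+ * A pending sysrq is delivered by raising the break indicator
+ * (UART_LSR_BI) together with the sysrq character; the guest's serial
+ * driver sees a break followed by that character, which triggers the
+ * magic sysrq handler.
+ */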
+static void serial8250__sysrq(struct kvm *kvm, struct serial8250_device *dev)
+{
+	dev->lsr |= UART_LSR_DR | UART_LSR_BI;
+	dev->rxbuf[dev->rxcnt++] = sysrq_pending;
+	sysrq_pending	= SYSRQ_PENDING_NONE;
+}
+
+static void serial8250__receive(struct kvm *kvm, struct serial8250_device *dev,
+				bool handle_sysrq)
+{
+	int c;
+
+	if (dev->mcr & UART_MCR_LOOP)
+		return;
+
+	if ((dev->lsr & UART_LSR_DR) || dev->rxcnt)
+		return;
+
+	if (handle_sysrq && sysrq_pending) {
+		serial8250__sysrq(kvm, dev);
+		return;
+	}
+
+	if (kvm->cfg.active_console != CONSOLE_8250)
+		return;
+
+	while (term_readable(dev->id) &&
+	       dev->rxcnt < FIFO_LEN) {
+
+		c = term_getc(kvm, dev->id);
+
+		if (c < 0)
+			break;
+		dev->rxbuf[dev->rxcnt++] = c;
+		dev->lsr |= UART_LSR_DR;
+	}
+}
+
+void serial8250__update_consoles(struct kvm *kvm)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(devices); i++) {
+		struct serial8250_device *dev = &devices[i];
+
+		mutex_lock(&dev->mutex);
+
+		/* Restrict sysrq injection to the first port */
+		serial8250__receive(kvm, dev, i == 0);
+
+		serial8250_update_irq(kvm, dev);
+
+		mutex_unlock(&dev->mutex);
+	}
+}
+
+void serial8250__inject_sysrq(struct kvm *kvm, char sysrq)
+{
+	sysrq_pending = sysrq;
+}
+
+static bool serial8250_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port,
+			   void *data, int size)
+{
+	struct serial8250_device *dev = ioport->priv;
+	u16 offset;
+	bool ret = true;
+	char *addr = data;
+
+	mutex_lock(&dev->mutex);
+
+	offset = port - dev->iobase;
+
+	switch (offset) {
+	case UART_TX:
+		if (dev->lcr & UART_LCR_DLAB) {
+			dev->dll = ioport__read8(data);
+			break;
+		}
+
+		/* Loopback mode */
+		if (dev->mcr & UART_MCR_LOOP) {
+			if (dev->rxcnt < FIFO_LEN) {
+				dev->rxbuf[dev->rxcnt++] = *addr;
+				dev->lsr |= UART_LSR_DR;
+			}
+			break;
+		}
+
+		if (dev->txcnt < FIFO_LEN) {
+			dev->txbuf[dev->txcnt++] = *addr;
+			dev->lsr &= ~UART_LSR_TEMT;
+			if (dev->txcnt == FIFO_LEN / 2)
+				dev->lsr &= ~UART_LSR_THRE;
+			serial8250_flush_tx(vcpu->kvm, dev);
+		} else {
+			/* Should never happen */
+			dev->lsr &= ~(UART_LSR_TEMT | UART_LSR_THRE);
+		}
+		break;
+	case UART_IER:
+		if (!(dev->lcr & UART_LCR_DLAB))
+			dev->ier = ioport__read8(data) & 0x0f;
+		else
+			dev->dlm = ioport__read8(data);
+		break;
+	case UART_FCR:
+		dev->fcr = ioport__read8(data);
+		break;
+	case UART_LCR:
+		dev->lcr = ioport__read8(data);
+		break;
+	case UART_MCR:
+		dev->mcr = ioport__read8(data);
+		break;
+	case UART_LSR:
+		/* Factory test */
+		break;
+	case UART_MSR:
+		/* Not used */
+		break;
+	case UART_SCR:
+		dev->scr = ioport__read8(data);
+		break;
+	default:
+		ret = false;
+		break;
+	}
+
+	serial8250_update_irq(vcpu->kvm, dev);
+
+	mutex_unlock(&dev->mutex);
+
+	return ret;
+}
+
+static void serial8250_rx(struct serial8250_device *dev, void *data)
+{
+	if (dev->rxdone == dev->rxcnt)
+		return;
+
+	/* Break issued ? */
+	if (dev->lsr & UART_LSR_BI) {
+		dev->lsr &= ~UART_LSR_BI;
+		ioport__write8(data, 0);
+		return;
+	}
+
+	ioport__write8(data, dev->rxbuf[dev->rxdone++]);
+	if (dev->rxcnt == dev->rxdone) {
+		dev->lsr &= ~UART_LSR_DR;
+		dev->rxcnt = dev->rxdone = 0;
+	}
+}
+
+static bool serial8250_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	struct serial8250_device *dev = ioport->priv;
+	u16 offset;
+	bool ret = true;
+
+	mutex_lock(&dev->mutex);
+
+	offset = port - dev->iobase;
+
+	switch (offset) {
+	case UART_RX:
+		if (dev->lcr & UART_LCR_DLAB)
+			ioport__write8(data, dev->dll);
+		else
+			serial8250_rx(dev, data);
+		break;
+	case UART_IER:
+		if (dev->lcr & UART_LCR_DLAB)
+			ioport__write8(data, dev->dlm);
+		else
+			ioport__write8(data, dev->ier);
+		break;
+	case UART_IIR:
+		ioport__write8(data, dev->iir | UART_IIR_TYPE_BITS);
+		break;
+	case UART_LCR:
+		ioport__write8(data, dev->lcr);
+		break;
+	case UART_MCR:
+		ioport__write8(data, dev->mcr);
+		break;
+	case UART_LSR:
+		ioport__write8(data, dev->lsr);
+		break;
+	case UART_MSR:
+		ioport__write8(data, dev->msr);
+		break;
+	case UART_SCR:
+		ioport__write8(data, dev->scr);
+		break;
+	default:
+		ret = false;
+		break;
+	}
+
+	serial8250_update_irq(vcpu->kvm, dev);
+
+	mutex_unlock(&dev->mutex);
+
+	return ret;
+}
+
+#ifdef CONFIG_HAS_LIBFDT
+#define DEVICE_NAME_MAX_LEN 32
+static void serial8250_generate_fdt_node(struct ioport *ioport, void *fdt,
+					 void (*generate_irq_prop)(void *fdt,
+								   u8 irq))
+{
+	char dev_name[DEVICE_NAME_MAX_LEN];
+	struct serial8250_device *dev = ioport->priv;
+	u64 addr = KVM_IOPORT_AREA + dev->iobase;
+	u64 reg_prop[] = {
+		cpu_to_fdt64(addr),
+		cpu_to_fdt64(8),
+	};
+
+	snprintf(dev_name, DEVICE_NAME_MAX_LEN, "U6_16550A@%llx", addr);
+
+	_FDT(fdt_begin_node(fdt, dev_name));
+	_FDT(fdt_property_string(fdt, "compatible", "ns16550a"));
+	_FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop)));
+	generate_irq_prop(fdt, dev->irq);
+	_FDT(fdt_property_cell(fdt, "clock-frequency", 1843200));
+	_FDT(fdt_end_node(fdt));
+}
+#else
+#define serial8250_generate_fdt_node	NULL
+#endif
+
+static struct ioport_operations serial8250_ops = {
+	.io_in			= serial8250_in,
+	.io_out			= serial8250_out,
+	.generate_fdt_node	= serial8250_generate_fdt_node,
+};
+
+static int serial8250__device_init(struct kvm *kvm, struct serial8250_device *dev)
+{
+	int r;
+
+	ioport__map_irq(&dev->irq);
+	r = ioport__register(kvm, dev->iobase, &serial8250_ops, 8, dev);
+	kvm__irq_line(kvm, dev->irq, 0);
+
+	return r;
+}
+
+int serial8250__init(struct kvm *kvm)
+{
+	unsigned int i, j;
+	int r = 0;
+
+	for (i = 0; i < ARRAY_SIZE(devices); i++) {
+		struct serial8250_device *dev = &devices[i];
+
+		r = serial8250__device_init(kvm, dev);
+		if (r < 0)
+			goto cleanup;
+	}
+
+	return r;
+cleanup:
+	for (j = 0; j < i; j++) {
+		struct serial8250_device *dev = &devices[j];
+
+		ioport__unregister(kvm, dev->iobase);
+	}
+
+	return r;
+}
+dev_init(serial8250__init);
+
+int serial8250__exit(struct kvm *kvm)
+{
+	unsigned int i;
+	int r;
+
+	for (i = 0; i < ARRAY_SIZE(devices); i++) {
+		struct serial8250_device *dev = &devices[i];
+
+		r = ioport__unregister(kvm, dev->iobase);
+		if (r < 0)
+			return r;
+	}
+
+	return 0;
+}
+dev_exit(serial8250__exit);
diff --git a/tools/kvm/hw/vesa.c b/tools/kvm/hw/vesa.c
new file mode 100644
index 0000000..a9a1d3e
--- /dev/null
+++ b/tools/kvm/hw/vesa.c
@@ -0,0 +1,88 @@
+#include "kvm/vesa.h"
+
+#include "kvm/devices.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/framebuffer.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+
+#include <linux/byteorder.h>
+#include <sys/mman.h>
+#include <linux/err.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <errno.h>
+
+static bool vesa_pci_io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	return true;
+}
+
+static bool vesa_pci_io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	return true;
+}
+
+static struct ioport_operations vesa_io_ops = {
+	.io_in			= vesa_pci_io_in,
+	.io_out			= vesa_pci_io_out,
+};
+
+static struct pci_device_header vesa_pci_device = {
+	.vendor_id		= cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET),
+	.device_id		= cpu_to_le16(PCI_DEVICE_ID_VESA),
+	.header_type		= PCI_HEADER_TYPE_NORMAL,
+	.revision_id		= 0,
+	.class[2]		= 0x03,
+	.subsys_vendor_id	= cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET),
+	.subsys_id		= cpu_to_le16(PCI_SUBSYSTEM_ID_VESA),
+	.bar[1]			= cpu_to_le32(VESA_MEM_ADDR | PCI_BASE_ADDRESS_SPACE_MEMORY),
+	.bar_size[1]		= VESA_MEM_SIZE,
+};
+
+static struct device_header vesa_device = {
+	.bus_type	= DEVICE_BUS_PCI,
+	.data		= &vesa_pci_device,
+};
+
+static struct framebuffer vesafb;
+
+struct framebuffer *vesa__init(struct kvm *kvm)
+{
+	u16 vesa_base_addr;
+	char *mem;
+	int r;
+
+	if (!kvm->cfg.vnc && !kvm->cfg.sdl && !kvm->cfg.gtk)
+		return NULL;
+
+	r = ioport__register(kvm, IOPORT_EMPTY, &vesa_io_ops, IOPORT_SIZE, NULL);
+	if (r < 0)
+		return ERR_PTR(r);
+
+	vesa_base_addr			= (u16)r;
+	vesa_pci_device.bar[0]		= cpu_to_le32(vesa_base_addr | PCI_BASE_ADDRESS_SPACE_IO);
+	device__register(&vesa_device);
+
+	mem = mmap(NULL, VESA_MEM_SIZE, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
+	if (mem == MAP_FAILED)
+		return ERR_PTR(-errno);
+
+	kvm__register_mem(kvm, VESA_MEM_ADDR, VESA_MEM_SIZE, mem);
+
+	vesafb = (struct framebuffer) {
+		.width			= VESA_WIDTH,
+		.height			= VESA_HEIGHT,
+		.depth			= VESA_BPP,
+		.mem			= mem,
+		.mem_addr		= VESA_MEM_ADDR,
+		.mem_size		= VESA_MEM_SIZE,
+		.kvm			= kvm,
+	};
+	return fb__register(&vesafb);
+}
diff --git a/tools/kvm/include/asm/hweight.h b/tools/kvm/include/asm/hweight.h
new file mode 100644
index 0000000..1a43977
--- /dev/null
+++ b/tools/kvm/include/asm/hweight.h
@@ -0,0 +1,8 @@
+#ifndef _KVM_ASM_HWEIGHT_H_
+#define _KVM_ASM_HWEIGHT_H_
+
+#include <linux/types.h>
+unsigned int hweight32(unsigned int w);
+unsigned long hweight64(__u64 w);
+
+#endif /* _KVM_ASM_HWEIGHT_H_ */
diff --git a/tools/kvm/include/bios/memcpy.h b/tools/kvm/include/bios/memcpy.h
new file mode 100644
index 0000000..e021044
--- /dev/null
+++ b/tools/kvm/include/bios/memcpy.h
@@ -0,0 +1,9 @@
+#ifndef KVM_BIOS_MEMCPY_H
+#define KVM_BIOS_MEMCPY_H
+
+#include <linux/types.h>
+#include <stddef.h>
+
+void memcpy16(u16 dst_seg, void *dst, u16 src_seg, const void *src, size_t len);
+
+#endif /* KVM_BIOS_MEMCPY_H */
diff --git a/tools/kvm/include/kvm/8250-serial.h b/tools/kvm/include/kvm/8250-serial.h
new file mode 100644
index 0000000..e954551
--- /dev/null
+++ b/tools/kvm/include/kvm/8250-serial.h
@@ -0,0 +1,11 @@
+#ifndef KVM__8250_SERIAL_H
+#define KVM__8250_SERIAL_H
+
+struct kvm;
+
+int serial8250__init(struct kvm *kvm);
+int serial8250__exit(struct kvm *kvm);
+void serial8250__update_consoles(struct kvm *kvm);
+void serial8250__inject_sysrq(struct kvm *kvm, char sysrq);
+
+#endif /* KVM__8250_SERIAL_H */
diff --git a/tools/kvm/include/kvm/apic.h b/tools/kvm/include/kvm/apic.h
new file mode 100644
index 0000000..2129997
--- /dev/null
+++ b/tools/kvm/include/kvm/apic.h
@@ -0,0 +1,17 @@
+#ifndef KVM_APIC_H_
+#define KVM_APIC_H_
+
+#include <asm/apicdef.h>
+
+/*
+ * APIC, IOAPIC stuff
+ */
+#define APIC_BASE_ADDR_STEP	0x00400000
+#define IOAPIC_BASE_ADDR_STEP	0x00100000
+
+#define APIC_ADDR(apic)		(APIC_DEFAULT_PHYS_BASE + apic * APIC_BASE_ADDR_STEP)
+#define IOAPIC_ADDR(ioapic)	(IO_APIC_DEFAULT_PHYS_BASE + ioapic * IOAPIC_BASE_ADDR_STEP)
+
+#define KVM_APIC_VERSION	0x14 /* xAPIC */
+
+#endif /* KVM_APIC_H_ */
diff --git a/tools/kvm/include/kvm/brlock.h b/tools/kvm/include/kvm/brlock.h
new file mode 100644
index 0000000..29f72e0
--- /dev/null
+++ b/tools/kvm/include/kvm/brlock.h
@@ -0,0 +1,41 @@
+#ifndef KVM__BRLOCK_H
+#define KVM__BRLOCK_H
+
+#include "kvm/kvm.h"
+#include "kvm/barrier.h"
+
+/*
+ * brlock is a lock which is very cheap for reads, but very expensive
+ * for writes.
+ * This lock will be used when updates are very rare and reads are common.
+ * This lock is currently implemented by stopping the guest while
+ * performing the updates. We assume that the only threads which read from
+ * the locked data are VCPU threads, and the only writer isn't a VCPU thread.
+ */
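+
+/*
+ * Illustrative usage (a sketch, not lifted from a caller): a VCPU
+ * thread reading shared state would do
+ *
+ *	br_read_lock(kvm);
+ *	... read the protected data ...
+ *	br_read_unlock(kvm);
+ *
+ * while the single non-VCPU updater brackets its modification with
+ * br_write_lock(kvm)/br_write_unlock(kvm), which in the non-debug
+ * build pauses and resumes the guest around the update.
+ */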
+
+#ifndef barrier
+#define barrier()		__asm__ __volatile__("": : :"memory")
+#endif
+
+#ifdef KVM_BRLOCK_DEBUG
+
+#include "kvm/rwsem.h"
+
+DECLARE_RWSEM(brlock_sem);
+
+#define br_read_lock(kvm)	down_read(&brlock_sem)
+#define br_read_unlock(kvm)	up_read(&brlock_sem)
+
+#define br_write_lock(kvm)	down_write(&brlock_sem)
+#define br_write_unlock(kvm)	up_write(&brlock_sem)
+
+#else
+
+#define br_read_lock(kvm)	barrier()
+#define br_read_unlock(kvm)	barrier()
+
+#define br_write_lock(kvm)	kvm__pause(kvm)
+#define br_write_unlock(kvm)	kvm__continue(kvm)
+#endif
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-balloon.h b/tools/kvm/include/kvm/builtin-balloon.h
new file mode 100644
index 0000000..77ee656
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-balloon.h
@@ -0,0 +1,9 @@
+#ifndef KVM__BALLOON_H
+#define KVM__BALLOON_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_balloon(int argc, const char **argv, const char *prefix);
+void kvm_balloon_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-debug.h b/tools/kvm/include/kvm/builtin-debug.h
new file mode 100644
index 0000000..efa0268
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-debug.h
@@ -0,0 +1,20 @@
+#ifndef KVM__DEBUG_H
+#define KVM__DEBUG_H
+
+#include <kvm/util.h>
+#include <linux/types.h>
+
+#define KVM_DEBUG_CMD_TYPE_DUMP	(1 << 0)
+#define KVM_DEBUG_CMD_TYPE_NMI	(1 << 1)
+#define KVM_DEBUG_CMD_TYPE_SYSRQ (1 << 2)
+
+struct debug_cmd_params {
+	u32 dbg_type;
+	u32 cpu;
+	char sysrq;
+};
+
+int kvm_cmd_debug(int argc, const char **argv, const char *prefix);
+void kvm_debug_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-help.h b/tools/kvm/include/kvm/builtin-help.h
new file mode 100644
index 0000000..2946743
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-help.h
@@ -0,0 +1,6 @@
+#ifndef __KVM_HELP_H__
+#define __KVM_HELP_H__
+
+int kvm_cmd_help(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-list.h b/tools/kvm/include/kvm/builtin-list.h
new file mode 100644
index 0000000..47029ca
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-list.h
@@ -0,0 +1,10 @@
+#ifndef KVM__LIST_H
+#define KVM__LIST_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_list(int argc, const char **argv, const char *prefix);
+void kvm_list_help(void) NORETURN;
+int get_vmstate(int sock);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-pause.h b/tools/kvm/include/kvm/builtin-pause.h
new file mode 100644
index 0000000..84aaee3
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-pause.h
@@ -0,0 +1,9 @@
+#ifndef KVM__PAUSE_H
+#define KVM__PAUSE_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_pause(int argc, const char **argv, const char *prefix);
+void kvm_pause_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-resume.h b/tools/kvm/include/kvm/builtin-resume.h
new file mode 100644
index 0000000..7de999b
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-resume.h
@@ -0,0 +1,9 @@
+#ifndef KVM__RESUME_H
+#define KVM__RESUME_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_resume(int argc, const char **argv, const char *prefix);
+void kvm_resume_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-run.h b/tools/kvm/include/kvm/builtin-run.h
new file mode 100644
index 0000000..91521a58
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-run.h
@@ -0,0 +1,11 @@
+#ifndef __KVM_RUN_H__
+#define __KVM_RUN_H__
+
+#include <kvm/util.h>
+
+int kvm_cmd_run(int argc, const char **argv, const char *prefix);
+void kvm_run_help(void) NORETURN;
+
+void kvm_run_set_wrapper_sandbox(void);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-sandbox.h b/tools/kvm/include/kvm/builtin-sandbox.h
new file mode 100644
index 0000000..98cd6be
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-sandbox.h
@@ -0,0 +1,6 @@
+#ifndef KVM__SANDBOX_H
+#define KVM__SANDBOX_H
+
+int kvm_cmd_sandbox(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-setup.h b/tools/kvm/include/kvm/builtin-setup.h
new file mode 100644
index 0000000..4a8d7ee
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-setup.h
@@ -0,0 +1,11 @@
+#ifndef KVM__SETUP_H
+#define KVM__SETUP_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_setup(int argc, const char **argv, const char *prefix);
+void kvm_setup_help(void) NORETURN;
+int kvm_setup_create_new(const char *guestfs_name);
+void kvm_setup_resolv(const char *guestfs_name);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-stat.h b/tools/kvm/include/kvm/builtin-stat.h
new file mode 100644
index 0000000..4fecb37
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-stat.h
@@ -0,0 +1,9 @@
+#ifndef KVM__STAT_H
+#define KVM__STAT_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_stat(int argc, const char **argv, const char *prefix);
+void kvm_stat_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-stop.h b/tools/kvm/include/kvm/builtin-stop.h
new file mode 100644
index 0000000..b26b275
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-stop.h
@@ -0,0 +1,9 @@
+#ifndef KVM__STOP_H
+#define KVM__STOP_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_stop(int argc, const char **argv, const char *prefix);
+void kvm_stop_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-version.h b/tools/kvm/include/kvm/builtin-version.h
new file mode 100644
index 0000000..83cac4d
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-version.h
@@ -0,0 +1,6 @@
+#ifndef KVM__VERSION_H
+#define KVM__VERSION_H
+
+int kvm_cmd_version(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/tools/kvm/include/kvm/compiler.h b/tools/kvm/include/kvm/compiler.h
new file mode 100644
index 0000000..2013a83
--- /dev/null
+++ b/tools/kvm/include/kvm/compiler.h
@@ -0,0 +1,10 @@
+#ifndef KVM_COMPILER_H_
+#define KVM_COMPILER_H_
+
+#ifndef __compiletime_error
+# define __compiletime_error(message)
+#endif
+
+#define notrace __attribute__((no_instrument_function))
+
+#endif /* KVM_COMPILER_H_ */
diff --git a/tools/kvm/include/kvm/devices.h b/tools/kvm/include/kvm/devices.h
new file mode 100644
index 0000000..405f195
--- /dev/null
+++ b/tools/kvm/include/kvm/devices.h
@@ -0,0 +1,29 @@
+#ifndef KVM__DEVICES_H
+#define KVM__DEVICES_H
+
+#include <linux/rbtree.h>
+#include <linux/types.h>
+
+enum device_bus_type {
+	DEVICE_BUS_PCI,
+	DEVICE_BUS_MMIO,
+	DEVICE_BUS_IOPORT,
+	DEVICE_BUS_MAX,
+};
+
+struct device_header {
+	enum device_bus_type	bus_type;
+	void			*data;
+	int			dev_num;
+	struct rb_node		node;
+};
+
+int device__register(struct device_header *dev);
+void device__unregister(struct device_header *dev);
+struct device_header *device__find_dev(enum device_bus_type bus_type,
+				       u8 dev_num);
+
+struct device_header *device__first_dev(enum device_bus_type bus_type);
+struct device_header *device__next_dev(struct device_header *dev);
+
+#endif /* KVM__DEVICES_H */
diff --git a/tools/kvm/include/kvm/disk-image.h b/tools/kvm/include/kvm/disk-image.h
new file mode 100644
index 0000000..b728052
--- /dev/null
+++ b/tools/kvm/include/kvm/disk-image.h
@@ -0,0 +1,96 @@
+#ifndef KVM__DISK_IMAGE_H
+#define KVM__DISK_IMAGE_H
+
+#include "kvm/read-write.h"
+#include "kvm/util.h"
+#include "kvm/parse-options.h"
+
+#include <linux/types.h>
+#include <linux/fs.h>	/* for BLKGETSIZE64 */
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <sys/uio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#ifdef CONFIG_HAS_AIO
+#include <libaio.h>
+#endif
+
+#define SECTOR_SHIFT		9
+#define SECTOR_SIZE		(1UL << SECTOR_SHIFT)
+
+enum {
+	DISK_IMAGE_REGULAR,
+	DISK_IMAGE_MMAP,
+};
+
+#define MAX_DISK_IMAGES         4
+
+struct disk_image;
+
+struct disk_image_operations {
+	ssize_t (*read)(struct disk_image *disk, u64 sector, const struct iovec *iov,
+			int iovcount, void *param);
+	ssize_t (*write)(struct disk_image *disk, u64 sector, const struct iovec *iov,
+			int iovcount, void *param);
+	int (*flush)(struct disk_image *disk);
+	int (*close)(struct disk_image *disk);
+};
+
+struct disk_image_params {
+	const char *filename;
+	/*
+	 * wwpn == World Wide Port Number
+	 * tpgt == Target Portal Group Tag
+	 */
+	const char *wwpn;
+	const char *tpgt;
+	bool readonly;
+	bool direct;
+};
+
+struct disk_image {
+	int				fd;
+	u64				size;
+	struct disk_image_operations	*ops;
+	void				*priv;
+	void				*disk_req_cb_param;
+	void				(*disk_req_cb)(void *param, long len);
+	bool				async;
+	int				evt;
+#ifdef CONFIG_HAS_AIO
+	io_context_t			ctx;
+#endif
+	const char			*wwpn;
+	const char			*tpgt;
+	int				debug_iodelay;
+};
+
+int disk_img_name_parser(const struct option *opt, const char *arg, int unset);
+int disk_image__init(struct kvm *kvm);
+int disk_image__exit(struct kvm *kvm);
+struct disk_image *disk_image__new(int fd, u64 size, struct disk_image_operations *ops, int mmap);
+int disk_image__flush(struct disk_image *disk);
+ssize_t disk_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param);
+ssize_t disk_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param);
+ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len);
+
+struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly);
+struct disk_image *blkdev__probe(const char *filename, int flags, struct stat *st);
+
+ssize_t raw_image__read(struct disk_image *disk, u64 sector,
+				const struct iovec *iov, int iovcount, void *param);
+ssize_t raw_image__write(struct disk_image *disk, u64 sector,
+				const struct iovec *iov, int iovcount, void *param);
+ssize_t raw_image__read_mmap(struct disk_image *disk, u64 sector,
+				const struct iovec *iov, int iovcount, void *param);
+ssize_t raw_image__write_mmap(struct disk_image *disk, u64 sector,
+				const struct iovec *iov, int iovcount, void *param);
+int raw_image__close(struct disk_image *disk);
+void disk_image__set_callback(struct disk_image *disk, void (*disk_req_cb)(void *param, long len));
+#endif /* KVM__DISK_IMAGE_H */
diff --git a/tools/kvm/include/kvm/e820.h b/tools/kvm/include/kvm/e820.h
new file mode 100644
index 0000000..15f62cc
--- /dev/null
+++ b/tools/kvm/include/kvm/e820.h
@@ -0,0 +1,13 @@
+#ifndef KVM_E820_H
+#define KVM_E820_H
+
+#include <linux/types.h>
+#include <kvm/bios.h>
+
+#define SMAP    0x534d4150      /* ASCII "SMAP" */
+
+struct biosregs;
+
+extern bioscall void e820_query_map(struct biosregs *regs);
+
+#endif /* KVM_E820_H */
diff --git a/tools/kvm/include/kvm/fdt.h b/tools/kvm/include/kvm/fdt.h
new file mode 100644
index 0000000..19f95ac
--- /dev/null
+++ b/tools/kvm/include/kvm/fdt.h
@@ -0,0 +1,26 @@
+#ifndef KVM__FDT_H
+#define KVM__FDT_H
+
+#include "libfdt.h"
+
+#include <linux/types.h>
+
+#define FDT_MAX_SIZE	0x10000
+
+/* Helper for the various bits of code that generate FDT nodes */
+#define _FDT(exp)							\
+	do {								\
+		int ret = (exp);					\
+		if (ret < 0) {						\
+			die("Error creating device tree: %s: %s\n",	\
+			    #exp, fdt_strerror(ret));			\
+		}							\
+	} while (0)
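+
+/*
+ * For example (as used by hw/serial.c):
+ *
+ *	_FDT(fdt_property_string(fdt, "compatible", "ns16550a"));
+ */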
+
+static inline u32 fdt__alloc_phandle(void)
+{
+	static u32 phandle = 0;
+	return ++phandle;
+}
+
+#endif /* KVM__FDT_H */
diff --git a/tools/kvm/include/kvm/framebuffer.h b/tools/kvm/include/kvm/framebuffer.h
new file mode 100644
index 0000000..e3200e5
--- /dev/null
+++ b/tools/kvm/include/kvm/framebuffer.h
@@ -0,0 +1,36 @@
+#ifndef KVM__FRAMEBUFFER_H
+#define KVM__FRAMEBUFFER_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+struct framebuffer;
+
+struct fb_target_operations {
+	int (*start)(struct framebuffer *fb);
+	int (*stop)(struct framebuffer *fb);
+};
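+
+/*
+ * start() is invoked for each attached target from fb__init(); stop()
+ * is invoked from fb__exit() before the framebuffer memory is unmapped.
+ */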
+
+#define FB_MAX_TARGETS			2
+
+struct framebuffer {
+	struct list_head		node;
+
+	u32				width;
+	u32				height;
+	u8				depth;
+	char				*mem;
+	u64				mem_addr;
+	u64				mem_size;
+	struct kvm			*kvm;
+
+	unsigned long			nr_targets;
+	struct fb_target_operations	*targets[FB_MAX_TARGETS];
+};
+
+struct framebuffer *fb__register(struct framebuffer *fb);
+int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops);
+int fb__init(struct kvm *kvm);
+int fb__exit(struct kvm *kvm);
+
+#endif /* KVM__FRAMEBUFFER_H */
diff --git a/tools/kvm/include/kvm/gtk3.h b/tools/kvm/include/kvm/gtk3.h
new file mode 100644
index 0000000..b02dc13
--- /dev/null
+++ b/tools/kvm/include/kvm/gtk3.h
@@ -0,0 +1,28 @@
+#ifndef KVM__GTK3_H
+#define KVM__GTK3_H
+
+#include "kvm/util.h"
+
+struct framebuffer;
+
+#ifdef CONFIG_HAS_GTK3
+int kvm_gtk_init(struct kvm *kvm);
+int kvm_gtk_exit(struct kvm *kvm);
+#else
+static inline int kvm_gtk_init(struct kvm *kvm)
+{
+	if (kvm->cfg.gtk)
+		die("GTK3 support not compiled in. (install the gtk3-devel or libgtk3.0-dev package)");
+
+	return 0;
+}
+static inline int kvm_gtk_exit(struct kvm *kvm)
+{
+	if (kvm->cfg.gtk)
+		die("GTK3 support not compiled in. (install the gtk3-devel or libgtk3.0-dev package)");
+
+	return 0;
+}
+#endif
+
+#endif /* KVM__GTK3_H */
diff --git a/tools/kvm/include/kvm/guest_compat.h b/tools/kvm/include/kvm/guest_compat.h
new file mode 100644
index 0000000..ae7abbd
--- /dev/null
+++ b/tools/kvm/include/kvm/guest_compat.h
@@ -0,0 +1,9 @@
+#ifndef KVM__GUEST_COMPAT_H
+#define KVM__GUEST_COMPAT_H
+
+int compat__print_all_messages(void);
+int compat__remove_message(int id);
+int compat__add_message(const char *title, const char *description);
+
+#endif
\ No newline at end of file
diff --git a/tools/kvm/include/kvm/i8042.h b/tools/kvm/include/kvm/i8042.h
new file mode 100644
index 0000000..3b4ab68
--- /dev/null
+++ b/tools/kvm/include/kvm/i8042.h
@@ -0,0 +1,12 @@
+#ifndef KVM__PCKBD_H
+#define KVM__PCKBD_H
+
+#include <linux/types.h>
+
+struct kvm;
+
+void mouse_queue(u8 c);
+void kbd_queue(u8 c);
+int kbd__init(struct kvm *kvm);
+
+#endif
diff --git a/tools/kvm/include/kvm/ioeventfd.h b/tools/kvm/include/kvm/ioeventfd.h
new file mode 100644
index 0000000..bb1f78d
--- /dev/null
+++ b/tools/kvm/include/kvm/ioeventfd.h
@@ -0,0 +1,31 @@
+#ifndef KVM__IOEVENTFD_H
+#define KVM__IOEVENTFD_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <sys/eventfd.h>
+#include "kvm/util.h"
+
+struct kvm;
+
+struct ioevent {
+	u64			io_addr;
+	u8			io_len;
+	void			(*fn)(struct kvm *kvm, void *ptr);
+	struct kvm		*fn_kvm;
+	void			*fn_ptr;
+	int			fd;
+	u64			datamatch;
+
+	struct list_head	list;
+};
+
+#define IOEVENTFD_FLAG_PIO		(1 << 0)
+#define IOEVENTFD_FLAG_USER_POLL	(1 << 1)
+
+int ioeventfd__init(struct kvm *kvm);
+int ioeventfd__exit(struct kvm *kvm);
+int ioeventfd__add_event(struct ioevent *ioevent, int flags);
+int ioeventfd__del_event(u64 addr, u64 datamatch);
+
+#endif
diff --git a/tools/kvm/include/kvm/ioport.h b/tools/kvm/include/kvm/ioport.h
new file mode 100644
index 0000000..18da47c
--- /dev/null
+++ b/tools/kvm/include/kvm/ioport.h
@@ -0,0 +1,76 @@
+#ifndef KVM__IOPORT_H
+#define KVM__IOPORT_H
+
+#include "kvm/devices.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/rbtree-interval.h"
+
+#include <stdbool.h>
+#include <limits.h>
+#include <asm/types.h>
+#include <linux/types.h>
+#include <linux/byteorder.h>
+
+/* some ports we reserve for own use */
+#define IOPORT_DBG			0xe0
+#define IOPORT_START			0x6200
+#define IOPORT_SIZE			0x400
+
+#define IOPORT_EMPTY			USHRT_MAX
+
+struct kvm;
+
+struct ioport {
+	struct rb_int_node		node;
+	struct ioport_operations	*ops;
+	void				*priv;
+	struct device_header		dev_hdr;
+};
+
+struct ioport_operations {
+	bool (*io_in)(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size);
+	bool (*io_out)(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size);
+	void (*generate_fdt_node)(struct ioport *ioport, void *fdt,
+				  void (*generate_irq_prop)(void *fdt, u8 irq));
+};
+
+void ioport__setup_arch(struct kvm *kvm);
+void ioport__map_irq(u8 *irq);
+
+int ioport__register(struct kvm *kvm, u16 port, struct ioport_operations *ops,
+			int count, void *param);
+int ioport__unregister(struct kvm *kvm, u16 port);
+int ioport__init(struct kvm *kvm);
+int ioport__exit(struct kvm *kvm);
+
+static inline u8 ioport__read8(u8 *data)
+{
+	return *data;
+}
+/* On BE platforms, PCI I/O is byteswapped, i.e. LE, so swap back. */
+static inline u16 ioport__read16(u16 *data)
+{
+	return le16_to_cpu(*data);
+}
+
+static inline u32 ioport__read32(u32 *data)
+{
+	return le32_to_cpu(*data);
+}
+
+static inline void ioport__write8(u8 *data, u8 value)
+{
+	*data		 = value;
+}
+
+static inline void ioport__write16(u16 *data, u16 value)
+{
+	*data		 = cpu_to_le16(value);
+}
+
+static inline void ioport__write32(u32 *data, u32 value)
+{
+	*data		 = cpu_to_le32(value);
+}
+
+#endif /* KVM__IOPORT_H */
diff --git a/tools/kvm/include/kvm/iovec.h b/tools/kvm/include/kvm/iovec.h
new file mode 100644
index 0000000..fe79dd4
--- /dev/null
+++ b/tools/kvm/include/kvm/iovec.h
@@ -0,0 +1,21 @@
+#ifndef KVM_UTIL_IOVEC_H_
+#define KVM_UTIL_IOVEC_H_
+
+#include <sys/uio.h>
+
+extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len);
+extern int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
+				size_t offset, int len);
+extern int memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len);
+extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata,
+				size_t offset, int len);
+
+static inline size_t iov_size(const struct iovec *iovec, size_t len)
+{
+	size_t size = 0, i;
+
+	for (i = 0; i < len; i++)
+		size += iovec[i].iov_len;
+
+	return size;
+}
+
+#endif
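
[Reviewer note] iov_size() above just sums iov_len over the first len entries; a quick sketch (buffer names are arbitrary):

	#include <sys/uio.h>
	#include "kvm/iovec.h"

	static size_t frame_len(void)
	{
		static char hdr[16], payload[4096];
		struct iovec iov[] = {
			{ .iov_base = hdr,	.iov_len = sizeof(hdr) },
			{ .iov_base = payload,	.iov_len = sizeof(payload) },
		};

		return iov_size(iov, 2);	/* 16 + 4096 = 4112 */
	}
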
diff --git a/tools/kvm/include/kvm/irq.h b/tools/kvm/include/kvm/irq.h
new file mode 100644
index 0000000..4cec6f0
--- /dev/null
+++ b/tools/kvm/include/kvm/irq.h
@@ -0,0 +1,19 @@
+#ifndef KVM__IRQ_H
+#define KVM__IRQ_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/kvm.h>
+
+#include "kvm/msi.h"
+
+struct kvm;
+
+int irq__alloc_line(void);
+
+int irq__init(struct kvm *kvm);
+int irq__exit(struct kvm *kvm);
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg);
+
+#endif
diff --git a/tools/kvm/include/kvm/kvm-cmd.h b/tools/kvm/include/kvm/kvm-cmd.h
new file mode 100644
index 0000000..0a73bce
--- /dev/null
+++ b/tools/kvm/include/kvm/kvm-cmd.h
@@ -0,0 +1,17 @@
+#ifndef __KVM_CMD_H__
+#define __KVM_CMD_H__
+
+struct cmd_struct {
+	const char *cmd;
+	int (*fn)(int, const char **, const char *);
+	void (*help)(void);
+	int option;
+};
+
+extern struct cmd_struct kvm_commands[];
+struct cmd_struct *kvm_get_command(struct cmd_struct *command,
+                const char *cmd);
+
+int handle_command(struct cmd_struct *command, int argc, const char **argv);
+
+#endif
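
[Reviewer note] A sketch of the command-table convention this header implies; the "run" entry is a hypothetical stand-in for the real builtin commands, and the NULL sentinel is an assumption about how kvm_get_command() terminates its scan.

	#include "kvm/kvm-cmd.h"

	static int cmd_run(int argc, const char **argv, const char *prefix)
	{
		return 0;
	}

	struct cmd_struct kvm_commands[] = {
		{ "run", cmd_run, NULL, 0 },
		{ NULL,  NULL,    NULL, 0 },	/* assumed sentinel */
	};
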
diff --git a/tools/kvm/include/kvm/kvm-config.h b/tools/kvm/include/kvm/kvm-config.h
new file mode 100644
index 0000000..386fa8c
--- /dev/null
+++ b/tools/kvm/include/kvm/kvm-config.h
@@ -0,0 +1,62 @@
+#ifndef KVM_CONFIG_H_
+#define KVM_CONFIG_H_
+
+#include "kvm/disk-image.h"
+#include "kvm/kvm-config-arch.h"
+
+#define DEFAULT_KVM_DEV		"/dev/kvm"
+#define DEFAULT_CONSOLE		"serial"
+#define DEFAULT_NETWORK		"user"
+#define DEFAULT_HOST_ADDR	"192.168.33.1"
+#define DEFAULT_GUEST_ADDR	"192.168.33.15"
+#define DEFAULT_GUEST_MAC	"02:15:15:15:15:15"
+#define DEFAULT_HOST_MAC	"02:01:01:01:01:01"
+#define DEFAULT_SCRIPT		"none"
+#define DEFAULT_SANDBOX_FILENAME "guest/sandbox.sh"
+
+#define MIN_RAM_SIZE_MB		(64ULL)
+#define MIN_RAM_SIZE_BYTE	(MIN_RAM_SIZE_MB << MB_SHIFT)
+
+struct kvm_config {
+	struct kvm_config_arch arch;
+	struct disk_image_params disk_image[MAX_DISK_IMAGES];
+	u64 ram_size;
+	u8  image_count;
+	u8 num_net_devices;
+	bool virtio_rng;
+	int active_console;
+	int debug_iodelay;
+	int nrcpus;
+	const char *kernel_cmdline;
+	const char *kernel_filename;
+	const char *vmlinux_filename;
+	const char *initrd_filename;
+	const char *firmware_filename;
+	const char *console;
+	const char *dev;
+	const char *network;
+	const char *host_ip;
+	const char *guest_ip;
+	const char *guest_mac;
+	const char *host_mac;
+	const char *script;
+	const char *guest_name;
+	const char *sandbox;
+	const char *hugetlbfs_path;
+	const char *custom_rootfs_name;
+	const char *real_cmdline;
+	struct virtio_net_params *net_params;
+	bool single_step;
+	bool vnc;
+	bool gtk;
+	bool sdl;
+	bool balloon;
+	bool using_rootfs;
+	bool custom_rootfs;
+	bool no_net;
+	bool no_dhcp;
+	bool ioport_debug;
+	bool mmio_debug;
+};
+
+#endif
diff --git a/tools/kvm/include/kvm/kvm-cpu.h b/tools/kvm/include/kvm/kvm-cpu.h
new file mode 100644
index 0000000..aa0cb54
--- /dev/null
+++ b/tools/kvm/include/kvm/kvm-cpu.h
@@ -0,0 +1,27 @@
+#ifndef KVM__KVM_CPU_H
+#define KVM__KVM_CPU_H
+
+#include "kvm/kvm-cpu-arch.h"
+#include <stdbool.h>
+
+int kvm_cpu__init(struct kvm *kvm);
+int kvm_cpu__exit(struct kvm *kvm);
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id);
+void kvm_cpu__delete(struct kvm_cpu *vcpu);
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu);
+void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu);
+void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu);
+void kvm_cpu__run(struct kvm_cpu *vcpu);
+void kvm_cpu__reboot(struct kvm *kvm);
+int kvm_cpu__start(struct kvm_cpu *cpu);
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu);
+int kvm_cpu__get_endianness(struct kvm_cpu *vcpu);
+
+int kvm_cpu__get_debug_fd(void);
+void kvm_cpu__set_debug_fd(int fd);
+void kvm_cpu__show_code(struct kvm_cpu *vcpu);
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu);
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu);
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu);
+
+#endif /* KVM__KVM_CPU_H */
diff --git a/tools/kvm/include/kvm/kvm-ipc.h b/tools/kvm/include/kvm/kvm-ipc.h
new file mode 100644
index 0000000..5494da4
--- /dev/null
+++ b/tools/kvm/include/kvm/kvm-ipc.h
@@ -0,0 +1,26 @@
+#ifndef KVM__IPC_H_
+#define KVM__IPC_H_
+
+#include <linux/types.h>
+#include "kvm/kvm.h"
+
+enum {
+	KVM_IPC_BALLOON	= 1,
+	KVM_IPC_DEBUG	= 2,
+	KVM_IPC_STAT	= 3,
+	KVM_IPC_PAUSE	= 4,
+	KVM_IPC_RESUME	= 5,
+	KVM_IPC_STOP	= 6,
+	KVM_IPC_PID	= 7,
+	KVM_IPC_VMSTATE	= 8,
+};
+
+int kvm_ipc__register_handler(u32 type, void (*cb)(struct kvm *kvm,
+				int fd, u32 type, u32 len, u8 *msg));
+int kvm_ipc__init(struct kvm *kvm);
+int kvm_ipc__exit(struct kvm *kvm);
+
+int kvm_ipc__send(int fd, u32 type);
+int kvm_ipc__send_msg(int fd, u32 type, u32 len, u8 *msg);
+
+#endif
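
[Reviewer note] The callback shape for kvm_ipc__register_handler() above, sketched with a pause handler; the ack via kvm_ipc__send() is an assumption about protocol usage, not something the header mandates.

	#include "kvm/kvm-ipc.h"

	static void handle_pause(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
	{
		kvm__pause(kvm);
		kvm_ipc__send(fd, KVM_IPC_PAUSE);	/* ack back to the client */
	}

	static int ipc_pause__init(struct kvm *kvm)
	{
		return kvm_ipc__register_handler(KVM_IPC_PAUSE, handle_pause);
	}
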
diff --git a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h
new file mode 100644
index 0000000..754e029
--- /dev/null
+++ b/tools/kvm/include/kvm/kvm.h
@@ -0,0 +1,134 @@
+#ifndef KVM__KVM_H
+#define KVM__KVM_H
+
+#include "kvm/kvm-arch.h"
+#include "kvm/kvm-config.h"
+#include "kvm/util-init.h"
+#include "kvm/kvm.h"
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <time.h>
+#include <signal.h>
+#include <sys/prctl.h>
+
+#define SIGKVMEXIT		(SIGRTMIN + 0)
+#define SIGKVMPAUSE		(SIGRTMIN + 1)
+
+#define KVM_PID_FILE_PATH	"/.lkvm/"
+#define HOME_DIR		getenv("HOME")
+#define KVM_BINARY_NAME		"lkvm"
+
+#define PAGE_SIZE (sysconf(_SC_PAGE_SIZE))
+
+#define DEFINE_KVM_EXT(ext)		\
+	.name = #ext,			\
+	.code = ext
+
+enum {
+	KVM_VMSTATE_RUNNING,
+	KVM_VMSTATE_PAUSED,
+};
+
+struct kvm_ext {
+	const char *name;
+	int code;
+};
+
+struct kvm_mem_bank {
+	struct list_head	list;
+	u64			guest_phys_addr;
+	void			*host_addr;
+	u64			size;
+};
+
+struct kvm {
+	struct kvm_arch		arch;
+	struct kvm_config	cfg;
+	int			sys_fd;		/* For system ioctls(), i.e. /dev/kvm */
+	int			vm_fd;		/* For VM ioctls() */
+	timer_t			timerid;	/* Posix timer for interrupts */
+
+	int			nrcpus;		/* Number of cpus to run */
+	struct kvm_cpu		**cpus;
+
+	u32			mem_slots;	/* for KVM_SET_USER_MEMORY_REGION */
+	u64			ram_size;
+	void			*ram_start;
+	u64			ram_pagesize;
+	struct list_head	mem_banks;
+
+	bool			nmi_disabled;
+
+	const char		*vmlinux;
+	struct disk_image       **disks;
+	int                     nr_disks;
+
+	int			vm_state;
+};
+
+void kvm__set_dir(const char *fmt, ...);
+const char *kvm__get_dir(void);
+
+int kvm__init(struct kvm *kvm);
+struct kvm *kvm__new(void);
+int kvm__recommended_cpus(struct kvm *kvm);
+int kvm__max_cpus(struct kvm *kvm);
+void kvm__init_ram(struct kvm *kvm);
+int kvm__exit(struct kvm *kvm);
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename);
+bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename,
+			const char *initrd_filename, const char *kernel_cmdline);
+int kvm_timer__init(struct kvm *kvm);
+int kvm_timer__exit(struct kvm *kvm);
+void kvm__irq_line(struct kvm *kvm, int irq, int level);
+void kvm__irq_trigger(struct kvm *kvm, int irq);
+bool kvm__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count);
+bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write);
+int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr);
+int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce,
+		       void (*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr),
+			void *ptr);
+bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr);
+void kvm__pause(struct kvm *kvm);
+void kvm__continue(struct kvm *kvm);
+void kvm__notify_paused(void);
+int kvm__get_sock_by_instance(const char *name);
+int kvm__enumerate_instances(int (*callback)(const char *name, int pid));
+void kvm__remove_socket(const char *name);
+
+void kvm__arch_set_cmdline(char *cmdline, bool video);
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size);
+void kvm__arch_delete_ram(struct kvm *kvm);
+int kvm__arch_setup_firmware(struct kvm *kvm);
+int kvm__arch_free_firmware(struct kvm *kvm);
+bool kvm__arch_cpu_supports_vm(void);
+void kvm__arch_read_term(struct kvm *kvm);
+
+void *guest_flat_to_host(struct kvm *kvm, u64 offset);
+u64 host_to_guest_flat(struct kvm *kvm, void *ptr);
+
+int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline);
+int load_elf_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline);
+bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline);
+
+/*
+ * Debugging
+ */
+void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size, int debug_fd);
+
+extern const char *kvm_exit_reasons[];
+
+static inline bool host_ptr_in_ram(struct kvm *kvm, void *p)
+{
+	return kvm->ram_start <= p && p < (kvm->ram_start + kvm->ram_size);
+}
+
+bool kvm__supports_extension(struct kvm *kvm, unsigned int extension);
+
+static inline void kvm__set_thread_name(const char *name)
+{
+	prctl(PR_SET_NAME, name);
+}
+
+#endif /* KVM__KVM_H */
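
[Reviewer note] guest_flat_to_host() plus host_ptr_in_ram() above give bounds-checked guest-physical access; a sketch. The wrapper name is invented, and this leans on the contiguous-RAM assumption that host_ptr_in_ram() itself encodes.

	#include "kvm/kvm.h"

	static int poke_guest_byte(struct kvm *kvm, u64 gpa, u8 value)
	{
		void *p = guest_flat_to_host(kvm, gpa);

		if (!p || !host_ptr_in_ram(kvm, p))
			return -1;	/* outside registered guest RAM */

		*(u8 *)p = value;
		return 0;
	}
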
diff --git a/tools/kvm/include/kvm/msi.h b/tools/kvm/include/kvm/msi.h
new file mode 100644
index 0000000..885eb5b
--- /dev/null
+++ b/tools/kvm/include/kvm/msi.h
@@ -0,0 +1,10 @@
+#ifndef LKVM_MSI_H
+#define LKVM_MSI_H
+
+struct msi_msg {
+	u32	address_lo;	/* low 32 bits of msi message address */
+	u32	address_hi;	/* high 32 bits of msi message address */
+	u32	data;		/* 16 bits of msi message data */
+};
+
+#endif /* LKVM_MSI_H */
diff --git a/tools/kvm/include/kvm/mutex.h b/tools/kvm/include/kvm/mutex.h
new file mode 100644
index 0000000..a90584b
--- /dev/null
+++ b/tools/kvm/include/kvm/mutex.h
@@ -0,0 +1,39 @@
+#ifndef KVM__MUTEX_H
+#define KVM__MUTEX_H
+
+#include <pthread.h>
+
+#include "kvm/util.h"
+
+/*
+ * Kernel-like mutex API - to make it easier for kernel developers
+ * to write user-space code! :-)
+ */
+
+struct mutex {
+	pthread_mutex_t mutex;
+};
+#define MUTEX_INITIALIZER (struct mutex) { .mutex = PTHREAD_MUTEX_INITIALIZER }
+
+#define DEFINE_MUTEX(mtx) struct mutex mtx = MUTEX_INITIALIZER
+
+static inline void mutex_init(struct mutex *lock)
+{
+	if (pthread_mutex_init(&lock->mutex, NULL) != 0)
+		die("unexpected pthread_mutex_init() failure!");
+}
+
+static inline void mutex_lock(struct mutex *lock)
+{
+	if (pthread_mutex_lock(&lock->mutex) != 0)
+		die("unexpected pthread_mutex_lock() failure!");
+}
+
+static inline void mutex_unlock(struct mutex *lock)
+{
+	if (pthread_mutex_unlock(&lock->mutex) != 0)
+		die("unexpected pthread_mutex_unlock() failure!");
+}
+
+#endif /* KVM__MUTEX_H */
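
[Reviewer note] Usage of the wrapper API above; the lock and counter names are illustrative.

	#include "kvm/mutex.h"

	static DEFINE_MUTEX(counter_lock);
	static int counter;

	static void counter__inc(void)
	{
		mutex_lock(&counter_lock);
		counter++;
		mutex_unlock(&counter_lock);
	}
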
diff --git a/tools/kvm/include/kvm/of_pci.h b/tools/kvm/include/kvm/of_pci.h
new file mode 100644
index 0000000..c8187ab
--- /dev/null
+++ b/tools/kvm/include/kvm/of_pci.h
@@ -0,0 +1,44 @@
+#ifndef KVM__OF_PCI_H
+#define KVM__OF_PCI_H
+
+#include <linux/types.h>
+
+/*
+ * Definitions for implementing parts of the OpenFirmware PCI Bus Binding
+ * Specification (IEEE Std 1275-1994).
+ */
+
+struct of_pci_unit_address {
+	u32 hi, mid, lo;
+} __attribute__((packed));
+
+struct of_pci_irq_mask {
+	struct of_pci_unit_address	pci_addr;
+	u32				pci_pin;
+} __attribute__((packed));
+
+struct of_pci_ranges_entry {
+	struct of_pci_unit_address	pci_addr;
+	u64				cpu_addr;
+	u64				length;
+} __attribute__((packed));
+
+/* Macros to operate with address in OF binding to PCI */
+#define __b_x(x, p, l)		(((x) & ((1<<(l))-1)) << (p))
+#define of_pci_b_n(x)		__b_x((x), 31, 1)	/* 0 if relocatable */
+#define of_pci_b_p(x)		__b_x((x), 30, 1)	/* 1 if prefetchable */
+#define of_pci_b_t(x)		__b_x((x), 29, 1)	/* 1 if the address is aliased */
+#define of_pci_b_ss(x)		__b_x((x), 24, 2)	/* the space code */
+#define of_pci_b_bbbbbbbb(x)	__b_x((x), 16, 8)	/* bus number */
+#define of_pci_b_ddddd(x)	__b_x((x), 11, 5)	/* device number */
+#define of_pci_b_fff(x)		__b_x((x), 8, 3)	/* function number */
+#define of_pci_b_rrrrrrrr(x)	__b_x((x), 0, 8)	/* register number */
+
+#define OF_PCI_SS_CONFIG	0
+#define OF_PCI_SS_IO		1
+#define OF_PCI_SS_M32		2
+#define OF_PCI_SS_M64		3
+
+#define OF_PCI_IRQ_MAP_MAX	256	/* 5 bit device + 3 bit pin */
+
+#endif /* KVM__OF_PCI_H */
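
[Reviewer note] How the of_pci_b_*() field builders above combine into the phys.hi cell of an OF PCI address; bus/device/function numbers are arbitrary examples.

	#include "kvm/of_pci.h"

	/* phys.hi for a non-prefetchable 32-bit memory BAR of 00:03.0 */
	static u32 example_phys_hi(void)
	{
		return of_pci_b_ss(OF_PCI_SS_M32) |	/* space code: 32-bit mem */
		       of_pci_b_bbbbbbbb(0) |		/* bus 0 */
		       of_pci_b_ddddd(3) |		/* device 3 */
		       of_pci_b_fff(0);			/* function 0 */
	}
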
diff --git a/tools/kvm/include/kvm/parse-options.h b/tools/kvm/include/kvm/parse-options.h
new file mode 100644
index 0000000..b03f151
--- /dev/null
+++ b/tools/kvm/include/kvm/parse-options.h
@@ -0,0 +1,230 @@
+#ifndef __PARSE_OPTIONS_H__
+#define __PARSE_OPTIONS_H__
+
+#include <inttypes.h>
+#include <kvm/util.h>
+
+enum parse_opt_type {
+	/* special types */
+	OPTION_END,
+	OPTION_ARGUMENT,
+	OPTION_GROUP,
+	/* options with no arguments */
+	OPTION_BIT,
+	OPTION_BOOLEAN,
+	OPTION_INCR,
+	OPTION_SET_UINT,
+	OPTION_SET_PTR,
+	/* options with arguments (usually) */
+	OPTION_STRING,
+	OPTION_INTEGER,
+	OPTION_LONG,
+	OPTION_CALLBACK,
+	OPTION_U64,
+	OPTION_UINTEGER,
+};
+
+enum parse_opt_flags {
+	PARSE_OPT_KEEP_DASHDASH = 1,
+	PARSE_OPT_STOP_AT_NON_OPTION = 2,
+	PARSE_OPT_KEEP_ARGV0 = 4,
+	PARSE_OPT_KEEP_UNKNOWN = 8,
+	PARSE_OPT_NO_INTERNAL_HELP = 16,
+};
+
+enum parse_opt_option_flags {
+	PARSE_OPT_OPTARG  = 1,
+	PARSE_OPT_NOARG   = 2,
+	PARSE_OPT_NONEG   = 4,
+	PARSE_OPT_HIDDEN  = 8,
+	PARSE_OPT_LASTARG_DEFAULT = 16,
+};
+
+struct option;
+typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
+/*
+ * `type`::
+ *   holds the type of the option; you must have an OPTION_END last in your
+ *   array.
+ *
+ * `short_name`::
+ *   the character to use as a short option name, '\0' if none.
+ *
+ * `long_name`::
+ *   the long option name, without the leading dashes, NULL if none.
+ *
+ * `value`::
+ *   stores pointers to the values to be filled.
+ *
+ * `argh`::
+ *   token to explain the kind of argument this option wants. Keep it
+ *   homogeneous across the repository.
+ *
+ * `help`::
+ *   the short help associated to what the option does.
+ *   Must never be NULL (except for OPTION_END).
+ *   OPTION_GROUP uses this pointer to store the group header.
+ *
+ * `flags`::
+ *   mask of parse_opt_option_flags.
+ *   PARSE_OPT_OPTARG: says that the argument is optional (not for BOOLEANs)
+ *   PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
+ *   PARSE_OPT_NONEG: says that this option cannot be negated
+ *   PARSE_OPT_HIDDEN: this option is skipped in the default usage, and shown
+ *                     in the long one.
+ *
+ * `callback`::
+ *   pointer to the callback to use for OPTION_CALLBACK.
+ *
+ * `defval`::
+ *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
+ *   OPTION_{BIT,SET_UINT,SET_PTR} store the {mask,integer,pointer} to put in
+ *   the value when the option is met.
+ *   Callbacks can use it as they want.
+ */
+struct option {
+	enum parse_opt_type type;
+	int short_name;
+	const char *long_name;
+	void *value;
+	const char *argh;
+	const char *help;
+	void *ptr;
+
+	int flags;
+	parse_opt_cb *callback;
+	intptr_t defval;
+};
+
+#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+#define check_vtype(v, type) \
+	(BUILD_BUG_ON_ZERO(!__builtin_types_compatible_p(typeof(v), type)) + v)
+
+#define OPT_INTEGER(s, l, v, h)             \
+{                                           \
+	.type = OPTION_INTEGER,             \
+	.short_name = (s),                  \
+	.long_name = (l),                   \
+	.value = check_vtype(v, int *),     \
+	.help = (h)                         \
+}
+
+#define OPT_UINTEGER(s, l, v, h)            \
+{                                           \
+	.type = OPTION_UINTEGER,            \
+	.short_name = (s),                  \
+	.long_name = (l),                   \
+	.value = check_vtype(v, unsigned int *), \
+	.help = (h)                         \
+}
+
+#define OPT_U64(s, l, v, h)                 \
+{                                           \
+	.type = OPTION_U64,                 \
+	.short_name = (s),                  \
+	.long_name = (l),                   \
+	.value = check_vtype(v, u64 *),     \
+	.help = (h)                         \
+}
+
+#define OPT_STRING(s, l, v, a, h)           \
+{                                           \
+	.type = OPTION_STRING,              \
+	.short_name = (s),                  \
+	.long_name = (l),                   \
+	.value = check_vtype(v, const char **), (a), \
+	.help = (h)                         \
+}
+
+#define OPT_BOOLEAN(s, l, v, h)             \
+{                                           \
+	.type = OPTION_BOOLEAN,             \
+	.short_name = (s),                  \
+	.long_name = (l),                   \
+	.value = check_vtype(v, bool *),    \
+	.help = (h)                         \
+}
+
+#define OPT_INCR(s, l, v, h)                \
+{                                           \
+	.type = OPTION_INCR,	            \
+	.short_name = (s),                  \
+	.long_name = (l),                   \
+	.value = check_vtype(v, int *),     \
+	.help = (h)                         \
+}
+
+#define OPT_GROUP(h)                        \
+{                                           \
+	.type = OPTION_GROUP,               \
+	.help = (h)                         \
+}
+
+#define OPT_CALLBACK(s, l, v, a, h, f, p)   \
+{					    \
+	.type = OPTION_CALLBACK,	    \
+	.short_name = (s),		    \
+	.long_name = (l),		    \
+	.value = (v),			    \
+	(a),				    \
+	.help = (h),			    \
+	.callback = (f),		    \
+	.ptr = (p),			    \
+}
+
+#define OPT_CALLBACK_NOOPT(s, l, v, a, h, f, p) \
+{					    \
+	.type = OPTION_CALLBACK,	    \
+	.short_name = (s),		    \
+	.long_name = (l),		    \
+	.value = (v),			    \
+	(a),				    \
+	.help = (h),			    \
+	.callback = (f),		    \
+	.flags = PARSE_OPT_NOARG,	    \
+	.ptr = (p),			    \
+}
+
+#define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d, p) \
+{					    \
+	.type = OPTION_CALLBACK,	    \
+	.short_name = (s),		    \
+	.long_name = (l),		    \
+	.value = (v), (a),		    \
+	.help = (h),			    \
+	.callback = (f),		    \
+	.defval = (intptr_t)d,		    \
+	.flags = PARSE_OPT_LASTARG_DEFAULT, \
+	.ptr = (p)			    \
+}
+
+#define OPT_END() { .type = OPTION_END }
+
+#define OPT_ARCH(cmd, cfg)		    \
+	OPT_ARCH_##cmd(OPT_GROUP("Arch-specific options:"), &(cfg)->arch)
+
+enum {
+	PARSE_OPT_HELP = -1,
+	PARSE_OPT_DONE,
+	PARSE_OPT_UNKNOWN,
+};
+
+/*
+ * It's okay for the caller to consume argv/argc in the usual way.
+ * Other fields of that structure are private to parse-options and should not
+ * be modified in any way.
+ */
+struct parse_opt_ctx_t {
+	const char **argv;
+	const char **out;
+	int argc, cpidx;
+	const char *opt;
+	int flags;
+};
+
+/* global functions */
+void usage_with_options(const char * const *usagestr,
+		const struct option *opts) NORETURN;
+int parse_options(int argc, const char **argv, const struct option *options,
+		const char * const usagestr[], int flags);
+#endif
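
[Reviewer note] A sketch of the option-table convention documented above; the variables and usage strings are invented for illustration, and the commented-out call shows where parse_options() would consume argv.

	#include "kvm/parse-options.h"

	static int nrcpus;
	static const char *kernel_filename;

	static const char * const run_usage[] = {
		"lkvm run [options]",
		NULL
	};

	static const struct option run_options[] = {
		OPT_INTEGER('c', "cpus", &nrcpus, "Number of CPUs"),
		OPT_STRING('k', "kernel", &kernel_filename, "FILE", "Kernel image"),
		OPT_END()
	};

	/* argc = parse_options(argc, argv, run_options, run_usage, 0); */
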
diff --git a/tools/kvm/include/kvm/pci-shmem.h b/tools/kvm/include/kvm/pci-shmem.h
new file mode 100644
index 0000000..6cff2b8
--- /dev/null
+++ b/tools/kvm/include/kvm/pci-shmem.h
@@ -0,0 +1,32 @@
+#ifndef KVM__PCI_SHMEM_H
+#define KVM__PCI_SHMEM_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+#include "kvm/parse-options.h"
+
+#define SHMEM_DEFAULT_SIZE (16 << MB_SHIFT)
+#define SHMEM_DEFAULT_ADDR (0xc8000000)
+#define SHMEM_DEFAULT_HANDLE "/kvm_shmem"
+
+struct kvm;
+struct shmem_info;
+
+struct shmem_info {
+	u64 phys_addr;
+	u64 size;
+	char *handle;
+	int create;
+};
+
+int pci_shmem__init(struct kvm *kvm);
+int pci_shmem__exit(struct kvm *kvm);
+int pci_shmem__register_mem(struct shmem_info *si);
+int shmem_parser(const struct option *opt, const char *arg, int unset);
+
+int pci_shmem__get_local_irqfd(struct kvm *kvm);
+int pci_shmem__add_client(struct kvm *kvm, u32 id, int fd);
+int pci_shmem__remove_client(struct kvm *kvm, u32 id);
+
+#endif
diff --git a/tools/kvm/include/kvm/pci.h b/tools/kvm/include/kvm/pci.h
new file mode 100644
index 0000000..b0c28a1
--- /dev/null
+++ b/tools/kvm/include/kvm/pci.h
@@ -0,0 +1,96 @@
+#ifndef KVM__PCI_H
+#define KVM__PCI_H
+
+#include <linux/types.h>
+#include <linux/kvm.h>
+#include <linux/pci_regs.h>
+#include <endian.h>
+
+#include "kvm/devices.h"
+#include "kvm/kvm.h"
+#include "kvm/msi.h"
+
+/*
+ * PCI Configuration Mechanism #1 I/O ports. See Section 3.7.4.1.
+ * ("Configuration Mechanism #1") of the PCI Local Bus Specification 2.1 for
+ * details.
+ */
+#define PCI_CONFIG_ADDRESS	0xcf8
+#define PCI_CONFIG_DATA		0xcfc
+#define PCI_CONFIG_BUS_FORWARD	0xcfa
+#define PCI_IO_SIZE		0x100
+#define PCI_CFG_SIZE		(1ULL << 24)
+
+union pci_config_address {
+	struct {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+		unsigned	reg_offset	: 2;		/* 1  .. 0  */
+		unsigned	register_number	: 6;		/* 7  .. 2  */
+		unsigned	function_number	: 3;		/* 10 .. 8  */
+		unsigned	device_number	: 5;		/* 15 .. 11 */
+		unsigned	bus_number	: 8;		/* 23 .. 16 */
+		unsigned	reserved	: 7;		/* 30 .. 24 */
+		unsigned	enable_bit	: 1;		/* 31       */
+#else
+		unsigned	enable_bit	: 1;		/* 31       */
+		unsigned	reserved	: 7;		/* 30 .. 24 */
+		unsigned	bus_number	: 8;		/* 23 .. 16 */
+		unsigned	device_number	: 5;		/* 15 .. 11 */
+		unsigned	function_number	: 3;		/* 10 .. 8  */
+		unsigned	register_number	: 6;		/* 7  .. 2  */
+		unsigned	reg_offset	: 2;		/* 1  .. 0  */
+#endif
+	};
+	u32 w;
+};
+
+struct msix_table {
+	struct msi_msg msg;
+	u32 ctrl;
+};
+
+struct msix_cap {
+	u8 cap;
+	u8 next;
+	u16 ctrl;
+	u32 table_offset;
+	u32 pba_offset;
+};
+
+struct pci_device_header {
+	u16		vendor_id;
+	u16		device_id;
+	u16		command;
+	u16		status;
+	u8		revision_id;
+	u8		class[3];
+	u8		cacheline_size;
+	u8		latency_timer;
+	u8		header_type;
+	u8		bist;
+	u32		bar[6];
+	u32		card_bus;
+	u16		subsys_vendor_id;
+	u16		subsys_id;
+	u32		exp_rom_bar;
+	u8		capabilities;
+	u8		reserved1[3];
+	u32		reserved2;
+	u8		irq_line;
+	u8		irq_pin;
+	u8		min_gnt;
+	u8		max_lat;
+	struct msix_cap msix;
+	u8		empty[136]; /* Rest of PCI config space */
+	u32		bar_size[6];
+} __attribute__((packed));
+
+int pci__init(struct kvm *kvm);
+int pci__exit(struct kvm *kvm);
+struct pci_device_header *pci__find_dev(u8 dev_num);
+u32 pci_get_io_space_block(u32 size);
+void pci__assign_irq(struct device_header *dev_hdr);
+void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size);
+void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size);
+
+#endif /* KVM__PCI_H */
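
[Reviewer note] Decoding a write to PCI_CONFIG_ADDRESS (0xcf8) with the bitfield union above; the function is a sketch, and the union's two layouts make the field access endian-safe.

	#include <stdio.h>
	#include "kvm/pci.h"

	static void decode_cf8(u32 raw)
	{
		union pci_config_address addr = { .w = raw };

		if (!addr.enable_bit)
			return;		/* config cycle not enabled */

		printf("bus %u dev %u fn %u reg %u\n",
		       addr.bus_number, addr.device_number,
		       addr.function_number, addr.register_number);
	}
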
diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
new file mode 100644
index 0000000..f849246
--- /dev/null
+++ b/tools/kvm/include/kvm/qcow.h
@@ -0,0 +1,133 @@
+#ifndef KVM__QCOW_H
+#define KVM__QCOW_H
+
+#include "kvm/mutex.h"
+
+#include <linux/types.h>
+#include <stdbool.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+
+#define QCOW_MAGIC		(('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+
+#define QCOW1_VERSION		1
+#define QCOW2_VERSION		2
+
+#define QCOW1_OFLAG_COMPRESSED	(1ULL << 63)
+
+#define QCOW2_OFLAG_COPIED	(1ULL << 63)
+#define QCOW2_OFLAG_COMPRESSED	(1ULL << 62)
+
+#define QCOW2_OFLAGS_MASK	(QCOW2_OFLAG_COPIED|QCOW2_OFLAG_COMPRESSED)
+
+#define QCOW2_OFFSET_MASK	(~QCOW2_OFLAGS_MASK)
+
+#define MAX_CACHE_NODES         32
+
+struct qcow_l2_table {
+	u64				offset;
+	struct rb_node			node;
+	struct list_head		list;
+	u8				dirty;
+	u64				table[];
+};
+
+struct qcow_l1_table {
+	u32				table_size;
+	u64				*l1_table;
+
+	/* Level2 caching data structures */
+	struct rb_root			root;
+	struct list_head		lru_list;
+	int				nr_cached;
+};
+
+#define QCOW_REFCOUNT_BLOCK_SHIFT	1
+
+struct qcow_refcount_block {
+	u64				offset;
+	struct rb_node			node;
+	struct list_head		list;
+	u64				size;
+	u8				dirty;
+	u16				entries[];
+};
+
+struct qcow_refcount_table {
+	u32				rf_size;
+	u64				*rf_table;
+
+	/* Refcount block caching data structures */
+	struct rb_root			root;
+	struct list_head		lru_list;
+	int				nr_cached;
+};
+
+struct qcow_header {
+	u64				size;	/* in bytes */
+	u64				l1_table_offset;
+	u32				l1_size;
+	u8				cluster_bits;
+	u8				l2_bits;
+	u64				refcount_table_offset;
+	u32				refcount_table_size;
+};
+
+struct qcow {
+	struct mutex			mutex;
+	struct qcow_header		*header;
+	struct qcow_l1_table		table;
+	struct qcow_refcount_table	refcount_table;
+	int				fd;
+	int				csize_shift;
+	int				csize_mask;
+	u32				version;
+	u64				cluster_size;
+	u64				cluster_offset_mask;
+	u64				free_clust_idx;
+	void				*cluster_cache;
+	void				*cluster_data;
+	void				*copy_buff;
+};
+
+struct qcow1_header_disk {
+	u32				magic;
+	u32				version;
+
+	u64				backing_file_offset;
+	u32 				backing_file_size;
+	u32				mtime;
+
+	u64				size;	/* in bytes */
+
+	u8				cluster_bits;
+	u8				l2_bits;
+	u32				crypt_method;
+
+	u64				l1_table_offset;
+};
+
+struct qcow2_header_disk {
+	u32				magic;
+	u32				version;
+
+	u64				backing_file_offset;
+	u32				backing_file_size;
+
+	u32				cluster_bits;
+	u64				size;	/* in bytes */
+	u32				crypt_method;
+
+	u32				l1_size;
+	u64				l1_table_offset;
+
+	u64				refcount_table_offset;
+	u32				refcount_table_clusters;
+
+	u32				nb_snapshots;
+	u64				snapshots_offset;
+};
+
+struct disk_image *qcow_probe(int fd, bool readonly);
+
+#endif /* KVM__QCOW_H */
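
[Reviewer note] The on-disk header fields are big-endian while QCOW_MAGIC above is built in host order, so a probe has to byte-swap before comparing; a sketch under that assumption.

	#include <endian.h>
	#include "kvm/qcow.h"

	static bool looks_like_qcow2(const struct qcow2_header_disk *hdr)
	{
		return be32toh(hdr->magic) == QCOW_MAGIC &&
		       be32toh(hdr->version) == QCOW2_VERSION;
	}
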
diff --git a/tools/kvm/include/kvm/rbtree-interval.h b/tools/kvm/include/kvm/rbtree-interval.h
new file mode 100644
index 0000000..730eb5e
--- /dev/null
+++ b/tools/kvm/include/kvm/rbtree-interval.h
@@ -0,0 +1,30 @@
+#ifndef KVM__INTERVAL_RBTREE_H
+#define KVM__INTERVAL_RBTREE_H
+
+#include <linux/rbtree.h>
+#include <linux/types.h>
+
+#define RB_INT_INIT(l, h) \
+	(struct rb_int_node){.low = l, .high = h}
+#define rb_int(n) rb_entry(n, struct rb_int_node, node)
+
+struct rb_int_node {
+	struct rb_node	node;
+	u64		low;
+	u64		high;
+};
+
+/* Return the rb_int_node interval in which 'point' is located. */
+struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point);
+
+/* Return the rb_int_node in which start:len is located. */
+struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high);
+
+int rb_int_insert(struct rb_root *root, struct rb_int_node *data);
+
+static inline void rb_int_erase(struct rb_root *root, struct rb_int_node *node)
+{
+	rb_erase(&node->node, root);
+}
+
+#endif
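
[Reviewer note] Insert-then-lookup with the interval tree above; treating `high` as inclusive is an assumption taken from the "interval in which 'point' is located" comment.

	#include "kvm/rbtree-interval.h"

	static struct rb_root port_tree = RB_ROOT;
	static struct rb_int_node range;

	static struct rb_int_node *demo(void)
	{
		range = RB_INT_INIT(0x6200, 0x65ff);
		rb_int_insert(&port_tree, &range);

		/* returns &range: 0x6204 lies within [0x6200, 0x65ff] */
		return rb_int_search_single(&port_tree, 0x6204);
	}
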
diff --git a/tools/kvm/include/kvm/read-write.h b/tools/kvm/include/kvm/read-write.h
new file mode 100644
index 0000000..67571f9
--- /dev/null
+++ b/tools/kvm/include/kvm/read-write.h
@@ -0,0 +1,43 @@
+#ifndef KVM_READ_WRITE_H
+#define KVM_READ_WRITE_H
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#ifdef CONFIG_HAS_AIO
+#include <libaio.h>
+#endif
+
+ssize_t xread(int fd, void *buf, size_t count);
+ssize_t xwrite(int fd, const void *buf, size_t count);
+
+ssize_t read_in_full(int fd, void *buf, size_t count);
+ssize_t write_in_full(int fd, const void *buf, size_t count);
+
+ssize_t xpread(int fd, void *buf, size_t count, off_t offset);
+ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset);
+
+ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset);
+ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset);
+
+ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt);
+ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt);
+
+ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt);
+ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt);
+
+ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+
+ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+
+#ifdef CONFIG_HAS_AIO
+int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt,
+		off_t offset, int ev, void *param);
+int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt,
+		off_t offset, int ev, void *param);
+#endif
+
+#endif /* KVM_READ_WRITE_H */
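
[Reviewer note] The _in_full variants above appear to follow the perf-tool convention of retrying short reads and writes; a sketch relying on that assumed semantic.

	#include "kvm/read-write.h"

	/* Succeeds only if all `count` bytes arrive; a short read means EOF or error. */
	static int slurp(int fd, void *buf, size_t count)
	{
		return read_in_full(fd, buf, count) == (ssize_t)count ? 0 : -1;
	}
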
diff --git a/tools/kvm/include/kvm/rtc.h b/tools/kvm/include/kvm/rtc.h
new file mode 100644
index 0000000..6aa9299
--- /dev/null
+++ b/tools/kvm/include/kvm/rtc.h
@@ -0,0 +1,9 @@
+#ifndef KVM__RTC_H
+#define KVM__RTC_H
+
+struct kvm;
+
+int rtc__init(struct kvm *kvm);
+int rtc__exit(struct kvm *kvm);
+
+#endif /* KVM__RTC_H */
diff --git a/tools/kvm/include/kvm/rwsem.h b/tools/kvm/include/kvm/rwsem.h
new file mode 100644
index 0000000..75a22f8
--- /dev/null
+++ b/tools/kvm/include/kvm/rwsem.h
@@ -0,0 +1,39 @@
+#ifndef KVM__RWSEM_H
+#define KVM__RWSEM_H
+
+#include <pthread.h>
+
+#include "kvm/util.h"
+
+/*
+ * Kernel-like rwsem API - to make it easier for kernel developers
+ * to write user-space code! :-)
+ */
+
+#define DECLARE_RWSEM(sem) pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER
+
+static inline void down_read(pthread_rwlock_t *rwsem)
+{
+	if (pthread_rwlock_rdlock(rwsem) != 0)
+		die("unexpected pthread_rwlock_rdlock() failure!");
+}
+
+static inline void down_write(pthread_rwlock_t *rwsem)
+{
+	if (pthread_rwlock_wrlock(rwsem) != 0)
+		die("unexpected pthread_rwlock_wrlock() failure!");
+}
+
+static inline void up_read(pthread_rwlock_t *rwsem)
+{
+	if (pthread_rwlock_unlock(rwsem) != 0)
+		die("unexpected pthread_rwlock_unlock() failure!");
+}
+
+static inline void up_write(pthread_rwlock_t *rwsem)
+{
+	if (pthread_rwlock_unlock(rwsem) != 0)
+		die("unexpected pthread_rwlock_unlock() failure!");
+}
+
+#endif /* KVM__RWSEM_H */
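
[Reviewer note] Reader/writer usage of the wrappers above; the lock and function names are illustrative.

	#include "kvm/rwsem.h"

	static DECLARE_RWSEM(mmio_lock);

	static void mmio_lookup(void)
	{
		down_read(&mmio_lock);
		/* ... walk the MMIO tree, many readers allowed ... */
		up_read(&mmio_lock);
	}

	static void mmio_update(void)
	{
		down_write(&mmio_lock);
		/* ... modify the MMIO tree, exclusive ... */
		up_write(&mmio_lock);
	}
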
diff --git a/tools/kvm/include/kvm/sdl.h b/tools/kvm/include/kvm/sdl.h
new file mode 100644
index 0000000..2f0c213
--- /dev/null
+++ b/tools/kvm/include/kvm/sdl.h
@@ -0,0 +1,28 @@
+#ifndef KVM__SDL_H
+#define KVM__SDL_H
+
+#include "kvm/util.h"
+
+struct framebuffer;
+
+#ifdef CONFIG_HAS_SDL
+int sdl__init(struct kvm *kvm);
+int sdl__exit(struct kvm *kvm);
+#else
+static inline int sdl__init(struct kvm *kvm)
+{
+	if (kvm->cfg.sdl)
+		die("SDL support not compiled in. (install the SDL-dev[el] package)");
+
+	return 0;
+}
+static inline int sdl__exit(struct kvm *kvm)
+{
+	if (kvm->cfg.sdl)
+		die("SDL support not compiled in. (install the SDL-dev[el] package)");
+
+	return 0;
+}
+#endif
+
+#endif /* KVM__SDL_H */
diff --git a/tools/kvm/include/kvm/segment.h b/tools/kvm/include/kvm/segment.h
new file mode 100644
index 0000000..9387a82
--- /dev/null
+++ b/tools/kvm/include/kvm/segment.h
@@ -0,0 +1,21 @@
+#ifndef KVM_SEGMENT_H
+#define KVM_SEGMENT_H
+
+#include <linux/types.h>
+
+static inline u32 segment_to_flat(u16 selector, u16 offset)
+{
+	return ((u32)selector << 4) + (u32) offset;
+}
+
+static inline u16 flat_to_seg16(u32 address)
+{
+	return address >> 4;
+}
+
+static inline u16 flat_to_off16(u32 address, u32 segment)
+{
+	return address - (segment << 4);
+}
+
+#endif /* KVM_SEGMENT_H */
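
[Reviewer note] A worked real-mode example for the helpers above; note that a flat address has many seg:off decompositions, so flat_to_seg16()/flat_to_off16() only pick one of them.

	#include "kvm/segment.h"

	static void segment_demo(void)
	{
		u32 flat = segment_to_flat(0x1000, 0x0200);	/* 0x10200 */
		u16 seg  = flat_to_seg16(flat);			/* 0x1020 */
		u16 off  = flat_to_off16(flat, 0x1000);		/* 0x0200 */

		(void)seg, (void)off;
	}
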
diff --git a/tools/kvm/include/kvm/strbuf.h b/tools/kvm/include/kvm/strbuf.h
new file mode 100644
index 0000000..2beefbc
--- /dev/null
+++ b/tools/kvm/include/kvm/strbuf.h
@@ -0,0 +1,20 @@
+#ifndef __STRBUF_H__
+#define __STRBUF_H__
+
+#include <sys/types.h>
+#include <string.h>
+
+int prefixcmp(const char *str, const char *prefix);
+
+extern size_t strlcat(char *dest, const char *src, size_t count);
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+/* some inline functions */
+
+static inline const char *skip_prefix(const char *str, const char *prefix)
+{
+	size_t len = strlen(prefix);
+	return strncmp(str, prefix, len) ? NULL : str + len;
+}
+
+#endif
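
[Reviewer note] skip_prefix() above returns the tail after the prefix, or NULL when it does not match; a one-liner sketch with an invented wrapper name.

	#include "kvm/strbuf.h"

	/* "virtio/net" -> "net"; returns the name unchanged if no prefix */
	static const char *strip_virtio(const char *name)
	{
		const char *rest = skip_prefix(name, "virtio/");

		return rest ? rest : name;
	}
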
diff --git a/tools/kvm/include/kvm/symbol.h b/tools/kvm/include/kvm/symbol.h
new file mode 100644
index 0000000..725bbaf
--- /dev/null
+++ b/tools/kvm/include/kvm/symbol.h
@@ -0,0 +1,30 @@
+#ifndef KVM__SYMBOL_H
+#define KVM__SYMBOL_H
+
+#include <stddef.h>
+#include <string.h>
+
+struct kvm;
+
+#define SYMBOL_DEFAULT_UNKNOWN "<unknown>"
+
+#ifdef CONFIG_HAS_BFD
+
+int symbol_init(struct kvm *kvm);
+int symbol_exit(struct kvm *kvm);
+char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size);
+
+#else
+
+static inline int symbol_init(struct kvm *kvm) { return 0; }
+static inline char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size)
+{
+	char *s = strncpy(sym, SYMBOL_DEFAULT_UNKNOWN, size);
+	sym[size - 1] = '\0';
+	return s;
+}
+static inline int symbol_exit(struct kvm *kvm) { return 0; }
+
+#endif
+
+#endif /* KVM__SYMBOL_H */
diff --git a/tools/kvm/include/kvm/term.h b/tools/kvm/include/kvm/term.h
new file mode 100644
index 0000000..dc9882e
--- /dev/null
+++ b/tools/kvm/include/kvm/term.h
@@ -0,0 +1,26 @@
+#ifndef KVM__TERM_H
+#define KVM__TERM_H
+
+#include "kvm/kvm.h"
+
+#include <sys/uio.h>
+#include <stdbool.h>
+
+#define CONSOLE_8250	1
+#define CONSOLE_VIRTIO	2
+#define CONSOLE_HV	3
+
+#define TERM_MAX_DEVS	4
+
+int term_putc_iov(struct iovec *iov, int iovcnt, int term);
+int term_getc_iov(struct kvm *kvm, struct iovec *iov, int iovcnt, int term);
+int term_putc(char *addr, int cnt, int term);
+int term_getc(struct kvm *kvm, int term);
+
+bool term_readable(int term);
+void term_set_tty(int term);
+int term_init(struct kvm *kvm);
+int term_exit(struct kvm *kvm);
+int tty_parser(const struct option *opt, const char *arg, int unset);
+
+#endif /* KVM__TERM_H */
diff --git a/tools/kvm/include/kvm/threadpool.h b/tools/kvm/include/kvm/threadpool.h
new file mode 100644
index 0000000..bacb243
--- /dev/null
+++ b/tools/kvm/include/kvm/threadpool.h
@@ -0,0 +1,38 @@
+#ifndef KVM__THREADPOOL_H
+#define KVM__THREADPOOL_H
+
+#include "kvm/mutex.h"
+
+#include <linux/list.h>
+
+struct kvm;
+
+typedef void (*kvm_thread_callback_fn_t)(struct kvm *kvm, void *data);
+
+struct thread_pool__job {
+	kvm_thread_callback_fn_t	callback;
+	struct kvm			*kvm;
+	void				*data;
+
+	int				signalcount;
+	struct mutex			mutex;
+
+	struct list_head		queue;
+};
+
+static inline void thread_pool__init_job(struct thread_pool__job *job, struct kvm *kvm, kvm_thread_callback_fn_t callback, void *data)
+{
+	*job = (struct thread_pool__job) {
+		.kvm		= kvm,
+		.callback	= callback,
+		.data		= data,
+		.mutex		= MUTEX_INITIALIZER,
+	};
+}
+
+int thread_pool__init(struct kvm *kvm);
+int thread_pool__exit(struct kvm *kvm);
+
+void thread_pool__do_job(struct thread_pool__job *job);
+
+#endif
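
[Reviewer note] The job lifecycle implied above, sketched with invented names: bind a callback once with thread_pool__init_job(), then signal work with thread_pool__do_job().

	#include "kvm/threadpool.h"

	static struct thread_pool__job rx_job;

	static void process_rx(struct kvm *kvm, void *data)
	{
		/* runs on a pool thread, once per do_job() signal */
	}

	static int rx__init(struct kvm *kvm)
	{
		thread_pool__init_job(&rx_job, kvm, process_rx, NULL);
		return 0;
	}

	static void rx__kick(void)
	{
		thread_pool__do_job(&rx_job);
	}
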
diff --git a/tools/kvm/include/kvm/types.h b/tools/kvm/include/kvm/types.h
new file mode 100644
index 0000000..0cbc5fb
--- /dev/null
+++ b/tools/kvm/include/kvm/types.h
@@ -0,0 +1,7 @@
+#ifndef KVM_TYPES_H
+#define KVM_TYPES_H
+
+/* FIXME: include/linux/if_tun.h and include/linux/if_ether.h complain */
+#define __be16 u16
+
+#endif /* KVM_TYPES_H */
diff --git a/tools/kvm/include/kvm/uip.h b/tools/kvm/include/kvm/uip.h
new file mode 100644
index 0000000..a9a8d85
--- /dev/null
+++ b/tools/kvm/include/kvm/uip.h
@@ -0,0 +1,362 @@
+#ifndef KVM__UIP_H
+#define KVM__UIP_H
+
+#include "linux/types.h"
+#include "kvm/mutex.h"
+
+#include <netinet/in.h>
+#include <sys/uio.h>
+
+#define UIP_BUF_STATUS_FREE	0
+#define UIP_BUF_STATUS_INUSE	1
+#define UIP_BUF_STATUS_USED	2
+
+#define UIP_ETH_P_IP		0x0800
+#define UIP_ETH_P_ARP		0x0806
+
+#define UIP_IP_VER_4		0x40
+#define UIP_IP_HDR_LEN		0x05
+#define UIP_IP_TTL		0x40
+#define UIP_IP_P_UDP		0x11
+#define UIP_IP_P_TCP		0x06
+#define UIP_IP_P_ICMP		0x01
+
+#define UIP_TCP_HDR_LEN		0x50
+#define UIP_TCP_WIN_SIZE	14600
+#define UIP_TCP_FLAG_FIN	1
+#define UIP_TCP_FLAG_SYN	2
+#define UIP_TCP_FLAG_RST	4
+#define UIP_TCP_FLAG_PSH	8
+#define UIP_TCP_FLAG_ACK	16
+#define UIP_TCP_FLAG_URG	32
+
+#define UIP_BOOTP_VENDOR_SPECIFIC_LEN	64
+#define UIP_BOOTP_MAX_PAYLOAD_LEN	300
+#define UIP_DHCP_VENDOR_SPECIFIC_LEN	312
+#define UIP_DHCP_PORT_SERVER		67
+#define UIP_DHCP_PORT_CLIENT		68
+#define UIP_DHCP_MACPAD_LEN		10
+#define UIP_DHCP_HOSTNAME_LEN		64
+#define UIP_DHCP_FILENAME_LEN		128
+#define UIP_DHCP_MAGIC_COOKIE		0x63825363
+#define UIP_DHCP_MAGIC_COOKIE_LEN	4
+#define UIP_DHCP_LEASE_TIME		0x00003840
+#define UIP_DHCP_MAX_PAYLOAD_LEN	(UIP_BOOTP_MAX_PAYLOAD_LEN - UIP_BOOTP_VENDOR_SPECIFIC_LEN +  UIP_DHCP_VENDOR_SPECIFIC_LEN)
+#define UIP_DHCP_OPTION_LEN		(UIP_DHCP_VENDOR_SPECIFIC_LEN - UIP_DHCP_MAGIC_COOKIE_LEN)
+#define UIP_DHCP_DISCOVER		1
+#define UIP_DHCP_OFFER			2
+#define UIP_DHCP_REQUEST		3
+#define UIP_DHCP_ACK			5
+#define UIP_DHCP_MAX_DNS_SERVER_NR	3
+#define UIP_DHCP_MAX_DOMAIN_NAME_LEN	256
+#define UIP_DHCP_TAG_MSG_TYPE		53
+#define UIP_DHCP_TAG_MSG_TYPE_LEN	1
+#define UIP_DHCP_TAG_SERVER_ID		54
+#define UIP_DHCP_TAG_SERVER_ID_LEN	4
+#define UIP_DHCP_TAG_LEASE_TIME		51
+#define UIP_DHCP_TAG_LEASE_TIME_LEN	4
+#define UIP_DHCP_TAG_SUBMASK		1
+#define UIP_DHCP_TAG_SUBMASK_LEN	4
+#define UIP_DHCP_TAG_ROUTER		3
+#define UIP_DHCP_TAG_ROUTER_LEN		4
+#define UIP_DHCP_TAG_ROOT		17
+#define UIP_DHCP_TAG_ROOT_LEN		4
+#define UIP_DHCP_TAG_DNS_SERVER		6
+#define UIP_DHCP_TAG_DNS_SERVER_LEN	4
+#define UIP_DHCP_TAG_DOMAIN_NAME	15
+#define UIP_DHCP_TAG_END		255
+
+/*
+ * IP packet maximum len == 65535 bytes (64 KiB - 1)
+ * IP header == 20 bytes
+ * TCP header == 20 bytes
+ * UDP header == 8 bytes
+ */
+#define UIP_MAX_TCP_PAYLOAD	(64*1024 - 20 - 20 - 1)
+#define UIP_MAX_UDP_PAYLOAD	(64*1024 - 20 -  8 - 1)
+
+struct uip_eth_addr {
+	u8 addr[6];
+};
+
+struct uip_eth {
+	struct uip_eth_addr dst;
+	struct uip_eth_addr src;
+	u16 type;
+} __attribute__((packed));
+
+struct uip_arp {
+	struct uip_eth eth;
+	u16 hwtype;
+	u16 proto;
+	u8 hwlen;
+	u8 protolen;
+	u16 op;
+	struct uip_eth_addr smac;
+	u32 sip;
+	struct uip_eth_addr dmac;
+	u32 dip;
+} __attribute__((packed));
+
+struct uip_ip {
+	struct uip_eth eth;
+	u8 vhl;
+	u8 tos;
+	/*
+	 * len = IP hdr +  IP payload
+	 */
+	u16 len;
+	u16 id;
+	u16 flgfrag;
+	u8 ttl;
+	u8 proto;
+	u16 csum;
+	u32 sip;
+	u32 dip;
+} __attribute__((packed));
+
+struct uip_icmp {
+	struct uip_ip ip;
+	u8 type;
+	u8 code;
+	u16 csum;
+	u16 id;
+	u16 seq;
+} __attribute__((packed));
+
+struct uip_udp {
+	/*
+	 * FIXME: IP Options (IP hdr len > 20 bytes) are not supported
+	 */
+	struct uip_ip ip;
+	u16 sport;
+	u16 dport;
+	/*
+	 * len = UDP hdr +  UDP payload
+	 */
+	u16 len;
+	u16 csum;
+	u8 payload[0];
+} __attribute__((packed));
+
+struct uip_tcp {
+	/*
+	 * FIXME: IP Options (IP hdr len > 20 bytes) are not supported
+	 */
+	struct uip_ip ip;
+	u16 sport;
+	u16 dport;
+	u32 seq;
+	u32 ack;
+	u8  off;
+	u8  flg;
+	u16 win;
+	u16 csum;
+	u16 urgent;
+} __attribute__((packed));
+
+struct uip_pseudo_hdr {
+	u32 sip;
+	u32 dip;
+	u8 zero;
+	u8 proto;
+	u16 len;
+} __attribute__((packed));
+
+struct uip_dhcp {
+	struct uip_udp udp;
+	u8 msg_type;
+	u8 hardware_type;
+	u8 hardware_len;
+	u8 hops;
+	u32 id;
+	u16 time;
+	u16 flg;
+	u32 client_ip;
+	u32 your_ip;
+	u32 server_ip;
+	u32 agent_ip;
+	struct uip_eth_addr client_mac;
+	u8 pad[UIP_DHCP_MACPAD_LEN];
+	u8 server_hostname[UIP_DHCP_HOSTNAME_LEN];
+	u8 boot_filename[UIP_DHCP_FILENAME_LEN];
+	u32 magic_cookie;
+	u8 option[UIP_DHCP_OPTION_LEN];
+} __attribute__((packed));
+
+struct uip_info {
+	struct list_head udp_socket_head;
+	struct list_head tcp_socket_head;
+	struct mutex udp_socket_lock;
+	struct mutex tcp_socket_lock;
+	struct uip_eth_addr guest_mac;
+	struct uip_eth_addr host_mac;
+	pthread_cond_t buf_free_cond;
+	pthread_cond_t buf_used_cond;
+	struct list_head buf_head;
+	struct mutex buf_lock;
+	pthread_t udp_thread;
+	int udp_epollfd;
+	int buf_free_nr;
+	int buf_used_nr;
+	u32 guest_ip;
+	u32 guest_netmask;
+	u32 host_ip;
+	u32 dns_ip[UIP_DHCP_MAX_DNS_SERVER_NR];
+	char *domain_name;
+	u32 buf_nr;
+	u32 vnet_hdr_len;
+};
+
+struct uip_buf {
+	struct list_head list;
+	struct uip_info *info;
+	int vnet_len;
+	int eth_len;
+	int status;
+	unsigned char *vnet;
+	unsigned char *eth;
+	int id;
+};
+
+struct uip_udp_socket {
+	struct sockaddr_in addr;
+	struct list_head list;
+	struct mutex *lock;
+	u32 dport, sport;
+	u32 dip, sip;
+	int fd;
+};
+
+struct uip_tcp_socket {
+	struct sockaddr_in addr;
+	struct list_head list;
+	struct uip_info *info;
+	pthread_cond_t	cond;
+	struct mutex *lock;
+	pthread_t thread;
+	u32 dport, sport;
+	u32 guest_acked;
+	u16 window_size;
+	/*
+	 * Initial Sequence Number
+	 */
+	u32 isn_server;
+	u32 isn_guest;
+	u32 ack_server;
+	u32 seq_server;
+	int write_done;
+	int read_done;
+	u32 dip, sip;
+	u8 *payload;
+	int fd;
+};
+
+struct uip_tx_arg {
+	void *vnet;
+	struct uip_info *info;
+	struct uip_eth *eth;
+	int vnet_len;
+	int eth_len;
+};
+
+static inline u16 uip_ip_hdrlen(struct uip_ip *ip)
+{
+	return (ip->vhl & 0x0f) * 4;
+}
+
+static inline u16 uip_ip_len(struct uip_ip *ip)
+{
+	return ntohs(ip->len);
+}
+
+static inline u16 uip_udp_hdrlen(struct uip_udp *udp)
+{
+	return 8;
+}
+
+static inline u16 uip_udp_len(struct uip_udp *udp)
+{
+	return ntohs(udp->len);
+}
+
+static inline u16 uip_tcp_hdrlen(struct uip_tcp *tcp)
+{
+	return (tcp->off >> 4) * 4;
+}
+
+static inline u16 uip_tcp_len(struct uip_tcp *tcp)
+{
+	struct uip_ip *ip;
+
+	ip = &tcp->ip;
+
+	return uip_ip_len(ip) - uip_ip_hdrlen(ip);
+}
+
+static inline u16 uip_tcp_payloadlen(struct uip_tcp *tcp)
+{
+	return uip_tcp_len(tcp) - uip_tcp_hdrlen(tcp);
+}
+
+static inline u8 *uip_tcp_payload(struct uip_tcp *tcp)
+{
+	return (u8 *)&tcp->sport + uip_tcp_hdrlen(tcp);
+}
+
+static inline bool uip_tcp_is_syn(struct uip_tcp *tcp)
+{
+	return (tcp->flg & UIP_TCP_FLAG_SYN) != 0;
+}
+
+static inline bool uip_tcp_is_fin(struct uip_tcp *tcp)
+{
+	return (tcp->flg & UIP_TCP_FLAG_FIN) != 0;
+}
+
+static inline u32 uip_tcp_isn(struct uip_tcp *tcp)
+{
+	return ntohl(tcp->seq);
+}
+
+static inline u32 uip_tcp_isn_alloc(void)
+{
+	/*
+	 * FIXME: should increase every 4ms
+	 */
+	return 10000000;
+}
+
+static inline u16 uip_eth_hdrlen(struct uip_eth *eth)
+{
+	return sizeof(*eth);
+}
+
+int uip_tx(struct iovec *iov, u16 out, struct uip_info *info);
+int uip_rx(struct iovec *iov, u16 in, struct uip_info *info);
+void uip_static_init(struct uip_info *info);
+int uip_init(struct uip_info *info);
+
+int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4(struct uip_tx_arg *arg);
+int uip_tx_do_arp(struct uip_tx_arg *arg);
+
+u16 uip_csum_icmp(struct uip_icmp *icmp);
+u16 uip_csum_udp(struct uip_udp *udp);
+u16 uip_csum_tcp(struct uip_tcp *tcp);
+u16 uip_csum_ip(struct uip_ip *ip);
+
+struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf);
+struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf);
+struct uip_buf *uip_buf_get_used(struct uip_info *info);
+struct uip_buf *uip_buf_get_free(struct uip_info *info);
+struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg);
+
+int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 *payload, int payload_len);
+bool uip_udp_is_dhcp(struct uip_udp *udp);
+
+int uip_dhcp_get_dns(struct uip_info *info);
+#endif /* KVM__UIP_H */
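
[Reviewer note] The inline accessors above let TX handlers walk a raw guest frame in place; a sketch. The cast works because struct uip_tcp embeds struct uip_ip, which embeds struct uip_eth; dump_tcp is an invented name.

	#include <stdio.h>
	#include "kvm/uip.h"

	static void dump_tcp(struct uip_eth *eth)
	{
		struct uip_tcp *tcp = (struct uip_tcp *)eth;

		if (uip_tcp_is_syn(tcp))
			printf("SYN, isn=%u\n", uip_tcp_isn(tcp));
		else
			printf("%u payload bytes\n", uip_tcp_payloadlen(tcp));
	}
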
diff --git a/tools/kvm/include/kvm/util-init.h b/tools/kvm/include/kvm/util-init.h
new file mode 100644
index 0000000..13d4f04
--- /dev/null
+++ b/tools/kvm/include/kvm/util-init.h
@@ -0,0 +1,51 @@
+#ifndef KVM__UTIL_INIT_H
+#define KVM__UTIL_INIT_H
+
+struct kvm;
+
+struct init_item {
+	struct hlist_node n;
+	const char *fn_name;
+	int (*init)(struct kvm *);
+};
+
+int init_list__init(struct kvm *kvm);
+int init_list__exit(struct kvm *kvm);
+
+int init_list_add(struct init_item *t, int (*init)(struct kvm *),
+			int priority, const char *name);
+int exit_list_add(struct init_item *t, int (*init)(struct kvm *),
+			int priority, const char *name);
+
+#define __init_list_add(cb, l)						\
+static void __attribute__ ((constructor)) __init__##cb(void)		\
+{									\
+	static char name[] = #cb;					\
+	static struct init_item t;					\
+	init_list_add(&t, cb, l, name);					\
+}
+
+#define __exit_list_add(cb, l)						\
+static void __attribute__ ((constructor)) __exit__##cb(void)		\
+{									\
+	static char name[] = #cb;					\
+	static struct init_item t;					\
+	exit_list_add(&t, cb, l, name);					\
+}
+
+#define core_init(cb) __init_list_add(cb, 0)
+#define base_init(cb) __init_list_add(cb, 2)
+#define dev_base_init(cb)  __init_list_add(cb, 4)
+#define dev_init(cb) __init_list_add(cb, 5)
+#define virtio_dev_init(cb) __init_list_add(cb, 6)
+#define firmware_init(cb) __init_list_add(cb, 7)
+#define late_init(cb) __init_list_add(cb, 9)
+
+#define core_exit(cb) __exit_list_add(cb, 0)
+#define base_exit(cb) __exit_list_add(cb, 2)
+#define dev_base_exit(cb) __exit_list_add(cb, 4)
+#define dev_exit(cb) __exit_list_add(cb, 5)
+#define virtio_dev_exit(cb) __exit_list_add(cb, 6)
+#define firmware_exit(cb) __exit_list_add(cb, 7)
+#define late_exit(cb) __exit_list_add(cb, 9)
+
+#endif
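
[Reviewer note] Registering with the priority-ordered init/exit lists above; foo is a placeholder module. The constructors only queue the callbacks; init_list__init() and init_list__exit() actually run them in priority order.

	#include "kvm/kvm.h"

	static int foo__init(struct kvm *kvm)
	{
		return 0;	/* allocate/attach module state here */
	}
	dev_init(foo__init);	/* priority 5: after base and dev_base */

	static int foo__exit(struct kvm *kvm)
	{
		return 0;
	}
	dev_exit(foo__exit);
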
diff --git a/tools/kvm/include/kvm/util.h b/tools/kvm/include/kvm/util.h
new file mode 100644
index 0000000..0df9f0d
--- /dev/null
+++ b/tools/kvm/include/kvm/util.h
@@ -0,0 +1,97 @@
+#include <linux/stringify.h>
+
+#ifndef KVM__UTIL_H
+#define KVM__UTIL_H
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/*
+ * Some bits are stolen from perf tool :)
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <linux/types.h>
+
+#ifdef __GNUC__
+#define NORETURN __attribute__((__noreturn__))
+#else
+#define NORETURN
+#ifndef __attribute__
+#define __attribute__(x)
+#endif
+#endif
+
+extern bool do_debug_print;
+
+#define PROT_RW (PROT_READ|PROT_WRITE)
+#define MAP_ANON_NORESERVE (MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE)
+
+extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
+extern void die_perror(const char *s) NORETURN;
+extern int pr_err(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void pr_warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void pr_info(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
+
+#define pr_debug(fmt, ...)						\
+	do {								\
+		if (do_debug_print)					\
+			pr_info("(%s) %s:%d: " fmt, __FILE__,		\
+				__func__, __LINE__, ##__VA_ARGS__);	\
+	} while (0)
+
+
+#define BUILD_BUG_ON(condition)	((void)sizeof(char[1 - 2*!!(condition)]))
+
+#ifndef BUG_ON_HANDLER
+# define BUG_ON_HANDLER(condition)					\
+	do {								\
+		if ((condition)) {					\
+			pr_err("BUG at %s:%d", __FILE__, __LINE__);	\
+			raise(SIGABRT);					\
+		}							\
+	} while (0)
+#endif
+
+#define BUG_ON(condition)	BUG_ON_HANDLER((condition))
+
+#define DIE_IF(cnd)						\
+do {								\
+	if (cnd)						\
+	die(" at (" __FILE__ ":" __stringify(__LINE__) "): "	\
+		__stringify(cnd) "\n");				\
+} while (0)
+
+#define WARN_ON(condition) ({					\
+	int __ret_warn_on = !!(condition);			\
+	if (__ret_warn_on)					\
+		pr_warning("(%s) %s:%d: failed condition: %s",	\
+				__FILE__, __func__, __LINE__,	\
+				__stringify(condition));	\
+	__ret_warn_on;						\
+})
+
+#define MSECS_TO_USECS(s) ((s) * 1000)
+
+/* Millisecond sleep */
+static inline void msleep(unsigned int msecs)
+{
+	usleep(MSECS_TO_USECS(msecs));
+}
+
+struct kvm;
+void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size);
+void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size);
+
+#endif /* KVM__UTIL_H */
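
[Reviewer note] Typical use of the error helpers above; the wrapper is illustrative. pr_debug() only prints when do_debug_print is set, presumably by a --debug style flag.

	#include <fcntl.h>
	#include "kvm/util.h"

	static int open_or_die(const char *path)
	{
		int fd = open(path, O_RDONLY);

		if (fd < 0)
			die_perror("open");	/* perror-style message, then exit */

		pr_debug("opened %s as fd %d", path, fd);
		return fd;
	}
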
diff --git a/tools/kvm/include/kvm/vesa.h b/tools/kvm/include/kvm/vesa.h
new file mode 100644
index 0000000..ac041d9
--- /dev/null
+++ b/tools/kvm/include/kvm/vesa.h
@@ -0,0 +1,18 @@
+#ifndef KVM__VESA_H
+#define KVM__VESA_H
+
+#include <linux/types.h>
+
+#define VESA_WIDTH	640
+#define VESA_HEIGHT	480
+
+#define VESA_MEM_ADDR	0xd0000000
+#define VESA_MEM_SIZE	(4*VESA_WIDTH*VESA_HEIGHT)
+#define VESA_BPP	32
+
+struct kvm;
+struct biosregs;
+
+struct framebuffer *vesa__init(struct kvm *self);
+
+#endif
diff --git a/tools/kvm/include/kvm/virtio-9p.h b/tools/kvm/include/kvm/virtio-9p.h
new file mode 100644
index 0000000..19ffe50
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-9p.h
@@ -0,0 +1,76 @@
+#ifndef KVM__VIRTIO_9P_H
+#define KVM__VIRTIO_9P_H
+#include "kvm/virtio.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/parse-options.h"
+
+#include <sys/types.h>
+#include <dirent.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+#define NUM_VIRT_QUEUES		1
+#define VIRTQUEUE_NUM		128
+#define	VIRTIO_9P_DEFAULT_TAG	"kvm_9p"
+#define VIRTIO_9P_HDR_LEN	(sizeof(u32)+sizeof(u8)+sizeof(u16))
+#define VIRTIO_9P_VERSION_DOTL	"9P2000.L"
+#define MAX_TAG_LEN		32
+
+struct p9_msg {
+	u32			size;
+	u8			cmd;
+	u16			tag;
+	u8			msg[0];
+} __attribute__((packed));
+
+struct p9_fid {
+	u32			fid;
+	u32			uid;
+	char			abs_path[PATH_MAX];
+	char			*path;
+	DIR			*dir;
+	int			fd;
+	struct rb_node		node;
+};
+
+struct p9_dev_job {
+	struct virt_queue	*vq;
+	struct p9_dev		*p9dev;
+	struct thread_pool__job job_id;
+};
+
+struct p9_dev {
+	struct list_head	list;
+	struct virtio_device	vdev;
+	struct rb_root		fids;
+
+	struct virtio_9p_config	*config;
+	u32			features;
+
+	/* virtio queue */
+	struct virt_queue	vqs[NUM_VIRT_QUEUES];
+	struct p9_dev_job	jobs[NUM_VIRT_QUEUES];
+	char			root_dir[PATH_MAX];
+};
+
+struct p9_pdu {
+	u32			queue_head;
+	size_t			read_offset;
+	size_t			write_offset;
+	u16			out_iov_cnt;
+	u16			in_iov_cnt;
+	struct iovec		in_iov[VIRTQUEUE_NUM];
+	struct iovec		out_iov[VIRTQUEUE_NUM];
+};
+
+struct kvm;
+
+int virtio_9p_rootdir_parser(const struct option *opt, const char *arg, int unset);
+int virtio_9p_img_name_parser(const struct option *opt, const char *arg, int unset);
+int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name);
+int virtio_9p__init(struct kvm *kvm);
+int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...);
+int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...);
+
+#endif
diff --git a/tools/kvm/include/kvm/virtio-balloon.h b/tools/kvm/include/kvm/virtio-balloon.h
new file mode 100644
index 0000000..844a1ba
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-balloon.h
@@ -0,0 +1,9 @@
+#ifndef KVM__BLN_VIRTIO_H
+#define KVM__BLN_VIRTIO_H
+
+struct kvm;
+
+int virtio_bln__init(struct kvm *kvm);
+int virtio_bln__exit(struct kvm *kvm);
+
+#endif /* KVM__BLN_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-blk.h b/tools/kvm/include/kvm/virtio-blk.h
new file mode 100644
index 0000000..12e59b6
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-blk.h
@@ -0,0 +1,12 @@
+#ifndef KVM__BLK_VIRTIO_H
+#define KVM__BLK_VIRTIO_H
+
+#include "kvm/disk-image.h"
+
+struct kvm;
+
+int virtio_blk__init(struct kvm *kvm);
+int virtio_blk__exit(struct kvm *kvm);
+void virtio_blk_complete(void *param, long len);
+
+#endif /* KVM__BLK_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-console.h b/tools/kvm/include/kvm/virtio-console.h
new file mode 100644
index 0000000..8980920
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-console.h
@@ -0,0 +1,10 @@
+#ifndef KVM__CONSOLE_VIRTIO_H
+#define KVM__CONSOLE_VIRTIO_H
+
+struct kvm;
+
+int virtio_console__init(struct kvm *kvm);
+void virtio_console__inject_interrupt(struct kvm *kvm);
+int virtio_console__exit(struct kvm *kvm);
+
+#endif /* KVM__CONSOLE_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-mmio.h b/tools/kvm/include/kvm/virtio-mmio.h
new file mode 100644
index 0000000..835f421
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-mmio.h
@@ -0,0 +1,60 @@
+#ifndef KVM__VIRTIO_MMIO_H
+#define KVM__VIRTIO_MMIO_H
+
+#include <linux/types.h>
+#include <linux/virtio_mmio.h>
+
+#define VIRTIO_MMIO_MAX_VQ	32
+#define VIRTIO_MMIO_MAX_CONFIG	1
+#define VIRTIO_MMIO_IO_SIZE	0x200
+
+struct kvm;
+
+struct virtio_mmio_ioevent_param {
+	struct virtio_device	*vdev;
+	u32			vq;
+};
+
+struct virtio_mmio_hdr {
+	char	magic[4];
+	u32	version;
+	u32	device_id;
+	u32	vendor_id;
+	u32	host_features;
+	u32	host_features_sel;
+	u32	reserved_1[2];
+	u32	guest_features;
+	u32	guest_features_sel;
+	u32	guest_page_size;
+	u32	reserved_2;
+	u32	queue_sel;
+	u32	queue_num_max;
+	u32	queue_num;
+	u32	queue_align;
+	u32	queue_pfn;
+	u32	reserved_3[3];
+	u32	queue_notify;
+	u32	reserved_4[3];
+	u32	interrupt_state;
+	u32	interrupt_ack;
+	u32	reserved_5[2];
+	u32	status;
+} __attribute__((packed));
+
+struct virtio_mmio {
+	u32			addr;
+	void			*dev;
+	struct kvm		*kvm;
+	u8			irq;
+	struct virtio_mmio_hdr	hdr;
+	struct device_header	dev_hdr;
+	struct virtio_mmio_ioevent_param ioeventfds[VIRTIO_MMIO_MAX_VQ];
+};
+
+int virtio_mmio_signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq);
+int virtio_mmio_signal_config(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_mmio_exit(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_mmio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		      int device_id, int subsys_id, int class);
+void virtio_mmio_assign_irq(struct device_header *dev_hdr);
+#endif
diff --git a/tools/kvm/include/kvm/virtio-net.h b/tools/kvm/include/kvm/virtio-net.h
new file mode 100644
index 0000000..f435cc3
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-net.h
@@ -0,0 +1,32 @@
+#ifndef KVM__VIRTIO_NET_H
+#define KVM__VIRTIO_NET_H
+
+#include "kvm/parse-options.h"
+
+struct kvm;
+
+struct virtio_net_params {
+	const char *guest_ip;
+	const char *host_ip;
+	const char *script;
+	const char *trans;
+	const char *tapif;
+	char guest_mac[6];
+	char host_mac[6];
+	struct kvm *kvm;
+	int mode;
+	int vhost;
+	int fd;
+	int mq;
+};
+
+int virtio_net__init(struct kvm *kvm);
+int virtio_net__exit(struct kvm *kvm);
+int netdev_parser(const struct option *opt, const char *arg, int unset);
+
+enum {
+	NET_MODE_USER,
+	NET_MODE_TAP
+};
+
+#endif /* KVM__VIRTIO_NET_H */
diff --git a/tools/kvm/include/kvm/virtio-pci-dev.h b/tools/kvm/include/kvm/virtio-pci-dev.h
new file mode 100644
index 0000000..48ae018
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-pci-dev.h
@@ -0,0 +1,38 @@
+#ifndef VIRTIO_PCI_DEV_H_
+#define VIRTIO_PCI_DEV_H_
+
+#include <linux/virtio_ids.h>
+
+/*
+ * Virtio PCI device constants and resources
+ * they do use (such as irqs and pins).
+ */
+
+#define PCI_DEVICE_ID_VIRTIO_NET		0x1000
+#define PCI_DEVICE_ID_VIRTIO_BLK		0x1001
+#define PCI_DEVICE_ID_VIRTIO_CONSOLE		0x1003
+#define PCI_DEVICE_ID_VIRTIO_RNG		0x1004
+#define PCI_DEVICE_ID_VIRTIO_BLN		0x1005
+#define PCI_DEVICE_ID_VIRTIO_SCSI		0x1008
+#define PCI_DEVICE_ID_VIRTIO_9P			0x1009
+#define PCI_DEVICE_ID_VESA			0x2000
+#define PCI_DEVICE_ID_PCI_SHMEM			0x0001
+
+#define PCI_VENDOR_ID_REDHAT_QUMRANET		0x1af4
+#define PCI_VENDOR_ID_PCI_SHMEM			0x0001
+#define PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET	0x1af4
+
+#define PCI_SUBSYSTEM_ID_VESA			0x0004
+#define PCI_SUBSYSTEM_ID_PCI_SHMEM		0x0001
+
+#define PCI_CLASS_BLK				0x018000
+#define PCI_CLASS_NET				0x020000
+#define PCI_CLASS_CONSOLE			0x078000
+/*
+ * Class 0xff: the device does not fit in any defined class
+ */
+#define PCI_CLASS_RNG				0xff0000
+#define PCI_CLASS_BLN				0xff0000
+#define PCI_CLASS_9P				0xff0000
+
+#endif /* VIRTIO_PCI_DEV_H_ */
diff --git a/tools/kvm/include/kvm/virtio-pci.h b/tools/kvm/include/kvm/virtio-pci.h
new file mode 100644
index 0000000..c795ce7
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-pci.h
@@ -0,0 +1,53 @@
+#ifndef KVM__VIRTIO_PCI_H
+#define KVM__VIRTIO_PCI_H
+
+#include "kvm/devices.h"
+#include "kvm/pci.h"
+
+#include <linux/types.h>
+
+#define VIRTIO_PCI_MAX_VQ	32
+#define VIRTIO_PCI_MAX_CONFIG	1
+
+struct kvm;
+
+struct virtio_pci_ioevent_param {
+	struct virtio_device	*vdev;
+	u32			vq;
+};
+
+#define VIRTIO_PCI_F_SIGNAL_MSI (1 << 0)
+
+struct virtio_pci {
+	struct pci_device_header pci_hdr;
+	struct device_header	dev_hdr;
+	void			*dev;
+	struct kvm		*kvm;
+
+	u16			port_addr;
+	u32			mmio_addr;
+	u8			status;
+	u8			isr;
+	u32			features;
+
+	/* MSI-X */
+	u16			config_vector;
+	u32			config_gsi;
+	u32			vq_vector[VIRTIO_PCI_MAX_VQ];
+	u32			gsis[VIRTIO_PCI_MAX_VQ];
+	u32			msix_io_block;
+	u64			msix_pba;
+	struct msix_table	msix_table[VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG];
+
+	/* virtio queue */
+	u16			queue_selector;
+	struct virtio_pci_ioevent_param ioeventfds[VIRTIO_PCI_MAX_VQ];
+};
+
+int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq);
+int virtio_pci__signal_config(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_pci__exit(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		     int device_id, int subsys_id, int class);
+
+#endif
diff --git a/tools/kvm/include/kvm/virtio-rng.h b/tools/kvm/include/kvm/virtio-rng.h
new file mode 100644
index 0000000..b585b37
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-rng.h
@@ -0,0 +1,9 @@
+#ifndef KVM__RNG_VIRTIO_H
+#define KVM__RNG_VIRTIO_H
+
+struct kvm;
+
+int virtio_rng__init(struct kvm *kvm);
+int virtio_rng__exit(struct kvm *kvm);
+
+#endif /* KVM__RNG_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-scsi.h b/tools/kvm/include/kvm/virtio-scsi.h
new file mode 100644
index 0000000..d64aa7e
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-scsi.h
@@ -0,0 +1,11 @@
+#ifndef KVM__SCSI_VIRTIO_H
+#define KVM__SCSI_VIRTIO_H
+
+#include "kvm/disk-image.h"
+
+struct kvm;
+
+int virtio_scsi_init(struct kvm *kvm);
+int virtio_scsi_exit(struct kvm *kvm);
+
+#endif /* KVM__SCSI_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio.h b/tools/kvm/include/kvm/virtio.h
new file mode 100644
index 0000000..8a9eab5
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio.h
@@ -0,0 +1,175 @@
+#ifndef KVM__VIRTIO_H
+#define KVM__VIRTIO_H
+
+#include <endian.h>
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+
+#include <linux/types.h>
+#include <sys/uio.h>
+
+#include "kvm/kvm.h"
+
+#define VIRTIO_IRQ_LOW		0
+#define VIRTIO_IRQ_HIGH		1
+
+#define VIRTIO_PCI_O_CONFIG	0
+#define VIRTIO_PCI_O_MSIX	1
+
+#define VIRTIO_ENDIAN_HOST	0
+#define VIRTIO_ENDIAN_LE	(1 << 0)
+#define VIRTIO_ENDIAN_BE	(1 << 1)
+
+struct virt_queue {
+	struct vring	vring;
+	u32		pfn;
+	/* The last_avail_idx field is an index into ->ring of struct vring_avail;
+	   it points at where we expect the next request index to be.  */
+	u16		last_avail_idx;
+	u16		last_used_signalled;
+	u16		endian;
+};
+
+/*
+ * The default policy is not to convert for the guest endianness. This
+ * also avoids breaking architectures that do not care about supporting
+ * such a configuration.
+ */
+#ifndef VIRTIO_RING_ENDIAN
+#define VIRTIO_RING_ENDIAN VIRTIO_ENDIAN_HOST
+#endif
+
+#if (VIRTIO_RING_ENDIAN & (VIRTIO_ENDIAN_LE | VIRTIO_ENDIAN_BE))
+
+static inline __u16 __virtio_g2h_u16(u16 endian, __u16 val)
+{
+	return (endian == VIRTIO_ENDIAN_LE) ? le16toh(val) : be16toh(val);
+}
+
+static inline __u16 __virtio_h2g_u16(u16 endian, __u16 val)
+{
+	return (endian == VIRTIO_ENDIAN_LE) ? htole16(val) : htobe16(val);
+}
+
+static inline __u32 __virtio_g2h_u32(u16 endian, __u32 val)
+{
+	return (endian == VIRTIO_ENDIAN_LE) ? le32toh(val) : be32toh(val);
+}
+
+static inline __u32 __virtio_h2g_u32(u16 endian, __u32 val)
+{
+	return (endian == VIRTIO_ENDIAN_LE) ? htole32(val) : htobe32(val);
+}
+
+static inline __u64 __virtio_g2h_u64(u16 endian, __u64 val)
+{
+	return (endian == VIRTIO_ENDIAN_LE) ? le64toh(val) : be64toh(val);
+}
+
+static inline __u64 __virtio_h2g_u64(u16 endian, __u64 val)
+{
+	return (endian == VIRTIO_ENDIAN_LE) ? htole64(val) : htobe64(val);
+}
+
+#define virtio_guest_to_host_u16(x, v)	__virtio_g2h_u16((x)->endian, (v))
+#define virtio_host_to_guest_u16(x, v)	__virtio_h2g_u16((x)->endian, (v))
+#define virtio_guest_to_host_u32(x, v)	__virtio_g2h_u32((x)->endian, (v))
+#define virtio_host_to_guest_u32(x, v)	__virtio_h2g_u32((x)->endian, (v))
+#define virtio_guest_to_host_u64(x, v)	__virtio_g2h_u64((x)->endian, (v))
+#define virtio_host_to_guest_u64(x, v)	__virtio_h2g_u64((x)->endian, (v))
+
+#else
+
+#define virtio_guest_to_host_u16(x, v)	(v)
+#define virtio_host_to_guest_u16(x, v)	(v)
+#define virtio_guest_to_host_u32(x, v)	(v)
+#define virtio_host_to_guest_u32(x, v)	(v)
+#define virtio_guest_to_host_u64(x, v)	(v)
+#define virtio_host_to_guest_u64(x, v)	(v)
+
+#endif
+
+static inline u16 virt_queue__pop(struct virt_queue *queue)
+{
+	__u16 guest_idx;
+
+	guest_idx = queue->vring.avail->ring[queue->last_avail_idx++ % queue->vring.num];
+	return virtio_guest_to_host_u16(queue, guest_idx);
+}
+
+static inline struct vring_desc *virt_queue__get_desc(struct virt_queue *queue, u16 desc_ndx)
+{
+	return &queue->vring.desc[desc_ndx];
+}
+
+static inline bool virt_queue__available(struct virt_queue *vq)
+{
+	if (!vq->vring.avail)
+		return false;
+
+	vring_avail_event(&vq->vring) = virtio_host_to_guest_u16(vq, vq->last_avail_idx);
+	return virtio_guest_to_host_u16(vq, vq->vring.avail->idx) != vq->last_avail_idx;
+}
+
+struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len);
+
+bool virtio_queue__should_signal(struct virt_queue *vq);
+u16 virt_queue__get_iov(struct virt_queue *vq, struct iovec iov[],
+			u16 *out, u16 *in, struct kvm *kvm);
+u16 virt_queue__get_head_iov(struct virt_queue *vq, struct iovec iov[],
+			     u16 *out, u16 *in, u16 head, struct kvm *kvm);
+u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue,
+			      struct iovec in_iov[], struct iovec out_iov[],
+			      u16 *in, u16 *out);
+int virtio__get_dev_specific_field(int offset, bool msix, u32 *config_off);
+
+enum virtio_trans {
+	VIRTIO_PCI,
+	VIRTIO_MMIO,
+};
+
+struct virtio_device {
+	bool			use_vhost;
+	void			*virtio;
+	struct virtio_ops	*ops;
+	u16			endian;
+};
+
+struct virtio_ops {
+	u8 *(*get_config)(struct kvm *kvm, void *dev);
+	u32 (*get_host_features)(struct kvm *kvm, void *dev);
+	void (*set_guest_features)(struct kvm *kvm, void *dev, u32 features);
+	int (*init_vq)(struct kvm *kvm, void *dev, u32 vq, u32 page_size,
+		       u32 align, u32 pfn);
+	int (*notify_vq)(struct kvm *kvm, void *dev, u32 vq);
+	int (*get_pfn_vq)(struct kvm *kvm, void *dev, u32 vq);
+	int (*get_size_vq)(struct kvm *kvm, void *dev, u32 vq);
+	int (*set_size_vq)(struct kvm *kvm, void *dev, u32 vq, int size);
+	void (*notify_vq_gsi)(struct kvm *kvm, void *dev, u32 vq, u32 gsi);
+	void (*notify_vq_eventfd)(struct kvm *kvm, void *dev, u32 vq, u32 efd);
+	int (*signal_vq)(struct kvm *kvm, struct virtio_device *vdev, u32 queueid);
+	int (*signal_config)(struct kvm *kvm, struct virtio_device *vdev);
+	void (*notify_status)(struct kvm *kvm, void *dev, u8 status);
+	int (*init)(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		    int device_id, int subsys_id, int class);
+	int (*exit)(struct kvm *kvm, struct virtio_device *vdev);
+};
+
+int virtio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		struct virtio_ops *ops, enum virtio_trans trans,
+		int device_id, int subsys_id, int class);
+int virtio_compat_add_message(const char *device, const char *config);
+
+static inline void *virtio_get_vq(struct kvm *kvm, u32 pfn, u32 page_size)
+{
+	return guest_flat_to_host(kvm, (u64)pfn * page_size);
+}
+
+static inline void virtio_init_device_vq(struct virtio_device *vdev,
+					 struct virt_queue *vq)
+{
+	vq->endian = vdev->endian;
+}
+
+#endif /* KVM__VIRTIO_H */
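
The conversion helpers above compile down to identity macros unless the tool is built with VIRTIO_RING_ENDIAN set to a cross-endian value. A minimal sketch of the consume loop a device might build from these declarations follows; the chain bound and the handle_chain() callback are hypothetical, not part of this header:

	#define SKETCH_IOV_MAX	64	/* hypothetical bound on chain length */

	/* stub standing in for a device-specific buffer handler */
	static u32 handle_chain(struct iovec *iov, u16 out, u16 in)
	{
		return 0;	/* a real device would return bytes written */
	}

	static void drain_queue(struct kvm *kvm, struct virt_queue *vq)
	{
		struct iovec iov[SKETCH_IOV_MAX];
		u16 out = 0, in = 0, head;

		while (virt_queue__available(vq)) {
			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
			virt_queue__set_used_elem(vq, head,
						  handle_chain(iov, out, in));
		}
	}
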
diff --git a/tools/kvm/include/kvm/vnc.h b/tools/kvm/include/kvm/vnc.h
new file mode 100644
index 0000000..c2934a4
--- /dev/null
+++ b/tools/kvm/include/kvm/vnc.h
@@ -0,0 +1,22 @@
+#ifndef KVM__VNC_H
+#define KVM__VNC_H
+
+#include "kvm/kvm.h"
+
+struct framebuffer;
+
+#ifdef CONFIG_HAS_VNCSERVER
+int vnc__init(struct kvm *kvm);
+int vnc__exit(struct kvm *kvm);
+#else
+static inline int vnc__init(struct kvm *kvm)
+{
+	return 0;
+}
+static inline int vnc__exit(struct kvm *kvm)
+{
+	return 0;
+}
+#endif
+
+#endif /* KVM__VNC_H */
diff --git a/tools/kvm/include/linux/bitops.h b/tools/kvm/include/linux/bitops.h
new file mode 100644
index 0000000..56448b7
--- /dev/null
+++ b/tools/kvm/include/linux/bitops.h
@@ -0,0 +1,33 @@
+#ifndef _KVM_LINUX_BITOPS_H_
+#define _KVM_LINUX_BITOPS_H_
+
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <asm/hweight.h>
+
+#define BITS_PER_LONG __WORDSIZE
+#define BITS_PER_BYTE           8
+#define BITS_TO_LONGS(nr)       DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
+
+static inline void set_bit(int nr, unsigned long *addr)
+{
+	addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
+}
+
+static inline void clear_bit(int nr, unsigned long *addr)
+{
+	addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
+}
+
+static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
+{
+	return ((1UL << (nr % BITS_PER_LONG)) &
+		(((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+}
+
+static inline unsigned long hweight_long(unsigned long w)
+{
+	return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
+}
+
+#endif
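
These helpers operate on a bare array of unsigned long, mirroring the in-kernel API. A small standalone usage sketch, assuming it is compiled with this tools/kvm include path so that <linux/bitops.h> resolves to the header above:

	#include <stdio.h>
	#include <linux/bitops.h>

	int main(void)
	{
		unsigned long map[BITS_TO_LONGS(128)] = { 0 };

		set_bit(3, map);
		set_bit(64, map);	/* lands in the second long on 64-bit hosts */
		clear_bit(3, map);

		printf("bit 3: %d, bit 64: %d\n",
		       test_bit(3, map), test_bit(64, map));
		return 0;
	}
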
diff --git a/tools/kvm/include/linux/byteorder.h b/tools/kvm/include/linux/byteorder.h
new file mode 100644
index 0000000..c490de8
--- /dev/null
+++ b/tools/kvm/include/linux/byteorder.h
@@ -0,0 +1,7 @@
+#ifndef __BYTE_ORDER_H__
+#define __BYTE_ORDER_H__
+
+#include <asm/byteorder.h>
+#include <linux/byteorder/generic.h>
+
+#endif
diff --git a/tools/kvm/include/linux/compiler.h b/tools/kvm/include/linux/compiler.h
new file mode 100644
index 0000000..898420b
--- /dev/null
+++ b/tools/kvm/include/linux/compiler.h
@@ -0,0 +1,20 @@
+#ifndef _PERF_LINUX_COMPILER_H_
+#define _PERF_LINUX_COMPILER_H_
+
+#ifndef __always_inline
+#define __always_inline	inline
+#endif
+#define __user
+
+#ifndef __attribute_const__
+#define __attribute_const__
+#endif
+
+#define __used		__attribute__((__unused__))
+#define __packed	__attribute__((packed))
+#define __iomem
+#define __force
+#define __must_check
+#define unlikely
+
+#endif
diff --git a/tools/kvm/include/linux/kernel.h b/tools/kvm/include/linux/kernel.h
new file mode 100644
index 0000000..f2bff5f
--- /dev/null
+++ b/tools/kvm/include/linux/kernel.h
@@ -0,0 +1,51 @@
+
+#ifndef KVM__LINUX_KERNEL_H_
+#define KVM__LINUX_KERNEL_H_
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
+#define ALIGN(x,a)		__ALIGN_MASK(x,(typeof(x))(a)-1)
+#define __ALIGN_MASK(x,mask)	(((x)+(mask))&~(mask))
+
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+#ifndef container_of
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:	the pointer to the member.
+ * @type:	the type of the container struct this is embedded in.
+ * @member:	the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({			\
+	const typeof(((type *)0)->member) * __mptr = (ptr);	\
+	(type *)((char *)__mptr - offsetof(type, member)); })
+#endif
+
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+#define max(x, y) ({				\
+	typeof(x) _max1 = (x);			\
+	typeof(y) _max2 = (y);			\
+	(void) (&_max1 == &_max2);		\
+	_max1 > _max2 ? _max1 : _max2; })
+
+#define min_t(type, x, y) ({                    \
+	type __min1 = (x);                      \
+	type __min2 = (y);                      \
+	__min1 < __min2 ? __min1: __min2; })
+
+#define max_t(type, x, y) ({                    \
+	type __max1 = (x);                      \
+	type __max2 = (y);                      \
+	__max1 > __max2 ? __max1: __max2; })
+
+#define true 1
+
+#endif
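
container_of() is the pattern the rest of the tool relies on to recover an enclosing object from an embedded member (see ioport_node() in ioport.c below). A minimal sketch, assuming the macros in this header and struct list_head from the types header above:

	struct ioevent_like {
		int			fd;
		struct list_head	list;	/* embedded member */
	};

	/* recover the enclosing struct from a pointer to its list member */
	static struct ioevent_like *from_list(struct list_head *node)
	{
		return container_of(node, struct ioevent_like, list);
	}
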
diff --git a/tools/kvm/include/linux/module.h b/tools/kvm/include/linux/module.h
new file mode 100644
index 0000000..0e4c6a3
--- /dev/null
+++ b/tools/kvm/include/linux/module.h
@@ -0,0 +1,6 @@
+#ifndef KVM__LINUX_MODULE_H
+#define KVM__LINUX_MODULE_H
+
+#define EXPORT_SYMBOL(name)
+
+#endif
diff --git a/tools/kvm/include/linux/prefetch.h b/tools/kvm/include/linux/prefetch.h
new file mode 100644
index 0000000..62f6788
--- /dev/null
+++ b/tools/kvm/include/linux/prefetch.h
@@ -0,0 +1,6 @@
+#ifndef KVM__LINUX_PREFETCH_H
+#define KVM__LINUX_PREFETCH_H
+
+static inline void prefetch(void *a __attribute__((unused))) { }
+
+#endif
diff --git a/tools/kvm/include/linux/stddef.h b/tools/kvm/include/linux/stddef.h
new file mode 100644
index 0000000..39da808
--- /dev/null
+++ b/tools/kvm/include/linux/stddef.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_STDDEF_H
+#define _LINUX_STDDEF_H
+
+#undef NULL
+#define NULL ((void *)0)
+
+#undef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+
+#endif
diff --git a/tools/kvm/include/linux/types.h b/tools/kvm/include/linux/types.h
new file mode 100644
index 0000000..5e20f10
--- /dev/null
+++ b/tools/kvm/include/linux/types.h
@@ -0,0 +1,51 @@
+#ifndef LINUX_TYPES_H
+#define LINUX_TYPES_H
+
+#include <kvm/compiler.h>
+#define __SANE_USERSPACE_TYPES__	/* For PPC64, to get LL64 types */
+#include <asm/types.h>
+
+typedef __u64 u64;
+typedef __s64 s64;
+
+typedef __u32 u32;
+typedef __s32 s32;
+
+typedef __u16 u16;
+typedef __s16 s16;
+
+typedef __u8  u8;
+typedef __s8  s8;
+
+#ifdef __CHECKER__
+#define __bitwise__ __attribute__((bitwise))
+#else
+#define __bitwise__
+#endif
+#ifdef __CHECK_ENDIAN__
+#define __bitwise __bitwise__
+#else
+#define __bitwise
+#endif
+
+
+typedef __u16 __bitwise __le16;
+typedef __u16 __bitwise __be16;
+typedef __u32 __bitwise __le32;
+typedef __u32 __bitwise __be32;
+typedef __u64 __bitwise __le64;
+typedef __u64 __bitwise __be64;
+
+struct list_head {
+	struct list_head *next, *prev;
+};
+
+struct hlist_head {
+	struct hlist_node *first;
+};
+
+struct hlist_node {
+	struct hlist_node *next, **pprev;
+};
+
+#endif /* LINUX_TYPES_H */
diff --git a/tools/kvm/ioeventfd.c b/tools/kvm/ioeventfd.c
new file mode 100644
index 0000000..bce6861
--- /dev/null
+++ b/tools/kvm/ioeventfd.c
@@ -0,0 +1,218 @@
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <signal.h>
+
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/types.h>
+
+#include "kvm/ioeventfd.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#define IOEVENTFD_MAX_EVENTS	20
+
+static struct	epoll_event events[IOEVENTFD_MAX_EVENTS];
+static int	epoll_fd, epoll_stop_fd;
+static LIST_HEAD(used_ioevents);
+static bool	ioeventfd_avail;
+
+static void *ioeventfd__thread(void *param)
+{
+	u64 tmp = 1;
+
+	kvm__set_thread_name("ioeventfd-worker");
+
+	for (;;) {
+		int nfds, i;
+
+		nfds = epoll_wait(epoll_fd, events, IOEVENTFD_MAX_EVENTS, -1);
+		for (i = 0; i < nfds; i++) {
+			struct ioevent *ioevent;
+
+			if (events[i].data.fd == epoll_stop_fd)
+				goto done;
+
+			ioevent = events[i].data.ptr;
+
+			if (read(ioevent->fd, &tmp, sizeof(tmp)) < 0)
+				die("Failed reading event");
+
+			ioevent->fn(ioevent->fn_kvm, ioevent->fn_ptr);
+		}
+	}
+
+done:
+	tmp = write(epoll_stop_fd, &tmp, sizeof(tmp));
+
+	return NULL;
+}
+
+static int ioeventfd__start(void)
+{
+	pthread_t thread;
+
+	if (!ioeventfd_avail)
+		return -ENOSYS;
+
+	return pthread_create(&thread, NULL, ioeventfd__thread, NULL);
+}
+
+int ioeventfd__init(struct kvm *kvm)
+{
+	struct epoll_event epoll_event = {.events = EPOLLIN};
+	int r;
+
+	ioeventfd_avail = kvm__supports_extension(kvm, KVM_CAP_IOEVENTFD);
+	if (!ioeventfd_avail)
+		return 1; /* Not fatal, but let caller determine no-go. */
+
+	epoll_fd = epoll_create(IOEVENTFD_MAX_EVENTS);
+	if (epoll_fd < 0)
+		return -errno;
+
+	epoll_stop_fd = eventfd(0, 0);
+	epoll_event.data.fd = epoll_stop_fd;
+
+	r = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, epoll_stop_fd, &epoll_event);
+	if (r < 0)
+		goto cleanup;
+
+	r = ioeventfd__start();
+	if (r < 0)
+		goto cleanup;
+
+	r = 0;
+
+	return r;
+
+cleanup:
+	close(epoll_stop_fd);
+	close(epoll_fd);
+
+	return r;
+}
+base_init(ioeventfd__init);
+
+int ioeventfd__exit(struct kvm *kvm)
+{
+	u64 tmp = 1;
+	int r;
+
+	if (!ioeventfd_avail)
+		return 0;
+
+	r = write(epoll_stop_fd, &tmp, sizeof(tmp));
+	if (r < 0)
+		return r;
+
+	r = read(epoll_stop_fd, &tmp, sizeof(tmp));
+	if (r < 0)
+		return r;
+
+	close(epoll_fd);
+	close(epoll_stop_fd);
+
+	return 0;
+}
+base_exit(ioeventfd__exit);
+
+int ioeventfd__add_event(struct ioevent *ioevent, int flags)
+{
+	struct kvm_ioeventfd kvm_ioevent;
+	struct epoll_event epoll_event;
+	struct ioevent *new_ioevent;
+	int event, r;
+
+	if (!ioeventfd_avail)
+		return -ENOSYS;
+
+	new_ioevent = malloc(sizeof(*new_ioevent));
+	if (new_ioevent == NULL)
+		return -ENOMEM;
+
+	*new_ioevent = *ioevent;
+	event = new_ioevent->fd;
+
+	kvm_ioevent = (struct kvm_ioeventfd) {
+		.addr		= ioevent->io_addr,
+		.len		= ioevent->io_len,
+		.datamatch	= ioevent->datamatch,
+		.fd		= event,
+		.flags		= KVM_IOEVENTFD_FLAG_DATAMATCH,
+	};
+
+	if (flags & IOEVENTFD_FLAG_PIO)
+		kvm_ioevent.flags |= KVM_IOEVENTFD_FLAG_PIO;
+
+	r = ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent);
+	if (r) {
+		r = -errno;
+		goto cleanup;
+	}
+
+	if (!(flags & IOEVENTFD_FLAG_USER_POLL))
+		return 0;
+
+	epoll_event = (struct epoll_event) {
+		.events		= EPOLLIN,
+		.data.ptr	= new_ioevent,
+	};
+
+	r = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, event, &epoll_event);
+	if (r) {
+		r = -errno;
+		goto cleanup;
+	}
+
+	list_add_tail(&new_ioevent->list, &used_ioevents);
+
+	return 0;
+
+cleanup:
+	free(new_ioevent);
+	return r;
+}
+
+int ioeventfd__del_event(u64 addr, u64 datamatch)
+{
+	struct kvm_ioeventfd kvm_ioevent;
+	struct ioevent *ioevent;
+	u8 found = 0;
+
+	if (!ioeventfd_avail)
+		return -ENOSYS;
+
+	list_for_each_entry(ioevent, &used_ioevents, list) {
+		if (ioevent->io_addr == addr) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found)
+		return -ENOENT;
+
+	kvm_ioevent = (struct kvm_ioeventfd) {
+		.addr			= ioevent->io_addr,
+		.len			= ioevent->io_len,
+		.datamatch		= ioevent->datamatch,
+		.flags			= KVM_IOEVENTFD_FLAG_PIO
+					| KVM_IOEVENTFD_FLAG_DEASSIGN
+					| KVM_IOEVENTFD_FLAG_DATAMATCH,
+	};
+
+	ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent);
+
+	epoll_ctl(epoll_fd, EPOLL_CTL_DEL, ioevent->fd, NULL);
+
+	list_del(&ioevent->list);
+
+	close(ioevent->fd);
+	free(ioevent);
+
+	return 0;
+}
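
A device arms a doorbell by filling in a struct ioevent and calling ioeventfd__add_event(). The sketch below is inferred from the field uses above and from kvm/ioeventfd.h (not shown in this hunk), so treat the exact layout as an assumption; it also assumes <sys/eventfd.h> and <errno.h>:

	/* hypothetical per-queue kick handler; runs on the ioeventfd worker */
	static void queue_notify(struct kvm *kvm, void *ptr)
	{
		/* a real device would wake its queue-processing thread here */
	}

	static int add_doorbell(struct kvm *kvm, u16 port, u32 vq)
	{
		struct ioevent ioevent;
		int fd;

		fd = eventfd(0, 0);
		if (fd < 0)
			return -errno;

		ioevent = (struct ioevent) {
			.io_addr	= port,
			.io_len		= sizeof(u16),
			.fn		= queue_notify,
			.fn_kvm		= kvm,
			.fn_ptr		= NULL,	/* per-queue cookie, unused here */
			.datamatch	= vq,
			.fd		= fd,
		};

		return ioeventfd__add_event(&ioevent,
					    IOEVENTFD_FLAG_PIO | IOEVENTFD_FLAG_USER_POLL);
	}
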
diff --git a/tools/kvm/ioport.c b/tools/kvm/ioport.c
new file mode 100644
index 0000000..5bfb7e2
--- /dev/null
+++ b/tools/kvm/ioport.c
@@ -0,0 +1,229 @@
+#include "kvm/ioport.h"
+
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+#include "kvm/brlock.h"
+#include "kvm/rbtree-interval.h"
+#include "kvm/mutex.h"
+
+#include <linux/kvm.h>	/* for KVM_EXIT_* */
+#include <linux/types.h>
+
+#include <stdbool.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define ioport_node(n) rb_entry(n, struct ioport, node)
+
+DEFINE_MUTEX(ioport_mutex);
+
+static u16			free_io_port_idx; /* protected by ioport_mutex */
+
+static struct rb_root		ioport_tree = RB_ROOT;
+
+static u16 ioport__find_free_port(void)
+{
+	u16 free_port;
+
+	mutex_lock(&ioport_mutex);
+	free_port = IOPORT_START + free_io_port_idx * IOPORT_SIZE;
+	free_io_port_idx++;
+	mutex_unlock(&ioport_mutex);
+
+	return free_port;
+}
+
+static struct ioport *ioport_search(struct rb_root *root, u64 addr)
+{
+	struct rb_int_node *node;
+
+	node = rb_int_search_single(root, addr);
+	if (node == NULL)
+		return NULL;
+
+	return ioport_node(node);
+}
+
+static int ioport_insert(struct rb_root *root, struct ioport *data)
+{
+	return rb_int_insert(root, &data->node);
+}
+
+static void ioport_remove(struct rb_root *root, struct ioport *data)
+{
+	rb_int_erase(root, &data->node);
+}
+
+#ifdef CONFIG_HAS_LIBFDT
+static void generate_ioport_fdt_node(void *fdt,
+				     struct device_header *dev_hdr,
+				     void (*generate_irq_prop)(void *fdt,
+							       u8 irq))
+{
+	struct ioport *ioport = container_of(dev_hdr, struct ioport, dev_hdr);
+	struct ioport_operations *ops = ioport->ops;
+
+	if (ops->generate_fdt_node)
+		ops->generate_fdt_node(ioport, fdt, generate_irq_prop);
+}
+#else
+static void generate_ioport_fdt_node(void *fdt,
+				     struct device_header *dev_hdr,
+				     void (*generate_irq_prop)(void *fdt,
+							       u8 irq))
+{
+	die("Unable to generate device tree nodes without libfdt\n");
+}
+#endif
+
+int ioport__register(struct kvm *kvm, u16 port, struct ioport_operations *ops, int count, void *param)
+{
+	struct ioport *entry;
+	int r;
+
+	br_write_lock(kvm);
+	if (port == IOPORT_EMPTY)
+		port = ioport__find_free_port();
+
+	entry = ioport_search(&ioport_tree, port);
+	if (entry) {
+		pr_warning("ioport re-registered: %x", port);
+		rb_int_erase(&ioport_tree, &entry->node);
+	}
+
+	entry = malloc(sizeof(*entry));
+	if (entry == NULL) {
+		br_write_unlock(kvm);
+		return -ENOMEM;
+	}
+
+	*entry = (struct ioport) {
+		.node		= RB_INT_INIT(port, port + count),
+		.ops		= ops,
+		.priv		= param,
+		.dev_hdr	= (struct device_header) {
+			.bus_type	= DEVICE_BUS_IOPORT,
+			.data		= generate_ioport_fdt_node,
+		},
+	};
+
+	r = ioport_insert(&ioport_tree, entry);
+	if (r < 0) {
+		free(entry);
+		br_write_unlock(kvm);
+		return r;
+	}
+
+	device__register(&entry->dev_hdr);
+	br_write_unlock(kvm);
+
+	return port;
+}
+
+int ioport__unregister(struct kvm *kvm, u16 port)
+{
+	struct ioport *entry;
+	int r;
+
+	br_write_lock(kvm);
+
+	r = -ENOENT;
+	entry = ioport_search(&ioport_tree, port);
+	if (!entry)
+		goto done;
+
+	ioport_remove(&ioport_tree, entry);
+
+	free(entry);
+
+	r = 0;
+
+done:
+	br_write_unlock(kvm);
+
+	return r;
+}
+
+static void ioport__unregister_all(void)
+{
+	struct ioport *entry;
+	struct rb_node *rb;
+	struct rb_int_node *rb_node;
+
+	rb = rb_first(&ioport_tree);
+	while (rb) {
+		rb_node = rb_int(rb);
+		entry = ioport_node(rb_node);
+		ioport_remove(&ioport_tree, entry);
+		free(entry);
+		rb = rb_first(&ioport_tree);
+	}
+}
+
+static const char *to_direction(int direction)
+{
+	if (direction == KVM_EXIT_IO_IN)
+		return "IN";
+	else
+		return "OUT";
+}
+
+static void ioport_error(u16 port, void *data, int direction, int size, u32 count)
+{
+	fprintf(stderr, "IO error: %s port=%x, size=%d, count=%u\n", to_direction(direction), port, size, count);
+}
+
+bool kvm__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count)
+{
+	struct ioport_operations *ops;
+	bool ret = false;
+	struct ioport *entry;
+	void *ptr = data;
+	struct kvm *kvm = vcpu->kvm;
+
+	br_read_lock();
+	entry = ioport_search(&ioport_tree, port);
+	if (!entry)
+		goto error;
+
+	ops	= entry->ops;
+
+	while (count--) {
+		if (direction == KVM_EXIT_IO_IN && ops->io_in)
+			ret = ops->io_in(entry, vcpu, port, ptr, size);
+		else if (ops->io_out)
+			ret = ops->io_out(entry, vcpu, port, ptr, size);
+
+		ptr += size;
+	}
+
+	br_read_unlock();
+
+	if (!ret)
+		goto error;
+
+	return true;
+error:
+	br_read_unlock();
+
+	if (kvm->cfg.ioport_debug)
+		ioport_error(port, data, direction, size, count);
+
+	return !kvm->cfg.ioport_debug;
+}
+
+int ioport__init(struct kvm *kvm)
+{
+	ioport__setup_arch(kvm);
+
+	return 0;
+}
+dev_base_init(ioport__init);
+
+int ioport__exit(struct kvm *kvm)
+{
+	ioport__unregister_all();
+	return 0;
+}
+dev_base_exit(ioport__exit);
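
The dispatch loop in kvm__emulate_io() fixes the callback signatures a device must provide. A minimal sketch of claiming one port, assuming the struct ioport_operations declaration in kvm/ioport.h and <string.h> for memset():

	/* a trivial device that answers every read with all-ones */
	static bool dummy_io_in(struct ioport *ioport, struct kvm_cpu *vcpu,
				u16 port, void *data, int size)
	{
		memset(data, 0xff, size);
		return true;
	}

	static struct ioport_operations dummy_ops = {
		.io_in	= dummy_io_in,
	};

	static int dummy_device_init(struct kvm *kvm)
	{
		/* IOPORT_EMPTY asks ioport__register() to pick a free port */
		return ioport__register(kvm, IOPORT_EMPTY, &dummy_ops, 1, NULL);
	}
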
diff --git a/tools/kvm/irq.c b/tools/kvm/irq.c
new file mode 100644
index 0000000..33ea8d2
--- /dev/null
+++ b/tools/kvm/irq.c
@@ -0,0 +1,9 @@
+#include "kvm/irq.h"
+#include "kvm/kvm-arch.h"
+
+static u8 next_line = KVM_IRQ_OFFSET;
+
+int irq__alloc_line(void)
+{
+	return next_line++;
+}
diff --git a/tools/kvm/kvm-cmd.c b/tools/kvm/kvm-cmd.c
new file mode 100644
index 0000000..2520b08
--- /dev/null
+++ b/tools/kvm/kvm-cmd.c
@@ -0,0 +1,91 @@
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+/* user defined header files */
+#include "kvm/builtin-debug.h"
+#include "kvm/builtin-pause.h"
+#include "kvm/builtin-resume.h"
+#include "kvm/builtin-balloon.h"
+#include "kvm/builtin-list.h"
+#include "kvm/builtin-version.h"
+#include "kvm/builtin-setup.h"
+#include "kvm/builtin-stop.h"
+#include "kvm/builtin-stat.h"
+#include "kvm/builtin-help.h"
+#include "kvm/builtin-sandbox.h"
+#include "kvm/kvm-cmd.h"
+#include "kvm/builtin-run.h"
+#include "kvm/util.h"
+
+struct cmd_struct kvm_commands[] = {
+	{ "pause",	kvm_cmd_pause,		kvm_pause_help,		0 },
+	{ "resume",	kvm_cmd_resume,		kvm_resume_help,	0 },
+	{ "debug",	kvm_cmd_debug,		kvm_debug_help,		0 },
+	{ "balloon",	kvm_cmd_balloon,	kvm_balloon_help,	0 },
+	{ "list",	kvm_cmd_list,		kvm_list_help,		0 },
+	{ "version",	kvm_cmd_version,	NULL,			0 },
+	{ "--version",	kvm_cmd_version,	NULL,			0 },
+	{ "stop",	kvm_cmd_stop,		kvm_stop_help,		0 },
+	{ "stat",	kvm_cmd_stat,		kvm_stat_help,		0 },
+	{ "help",	kvm_cmd_help,		NULL,			0 },
+	{ "setup",	kvm_cmd_setup,		kvm_setup_help,		0 },
+	{ "run",	kvm_cmd_run,		kvm_run_help,		0 },
+	{ "sandbox",	kvm_cmd_sandbox,	kvm_run_help,		0 },
+	{ NULL,		NULL,			NULL,			0 },
+};
+
+/*
+ * kvm_get_command: Searches for the command in an array of commands and
+ * returns a pointer to its cmd_struct if a match is found.
+ *
+ * Input parameters:
+ * command: Array of possible commands. The last entry in the array must be
+ *          NULL.
+ * cmd: The command string to search for in the array
+ *
+ * Return Value:
+ * NULL: If cmd does not match any command in the command array
+ * p: Pointer to the cmd_struct of the matching command
+ */
+struct cmd_struct *kvm_get_command(struct cmd_struct *command,
+		const char *cmd)
+{
+	struct cmd_struct *p = command;
+
+	while (p->cmd) {
+		if (!strcmp(p->cmd, cmd))
+			return p;
+		p++;
+	}
+	return NULL;
+}
+
+int handle_command(struct cmd_struct *command, int argc, const char **argv)
+{
+	struct cmd_struct *p;
+	const char *prefix = NULL;
+	int ret = 0;
+
+	if (!argv || !*argv) {
+		p = kvm_get_command(command, "help");
+		BUG_ON(!p);
+		return p->fn(argc, argv, prefix);
+	}
+
+	p = kvm_get_command(command, argv[0]);
+	if (!p) {
+		p = kvm_get_command(command, "help");
+		BUG_ON(!p);
+		p->fn(0, NULL, prefix);
+		return EINVAL;
+	}
+
+	ret = p->fn(argc - 1, &argv[1], prefix);
+	if (ret < 0) {
+		if (errno == EPERM)
+			die("Permission error - are you root?");
+	}
+
+	return ret;
+}
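
Since kvm_get_command() is exported alongside the table, a front end can probe for sub-commands before dispatching; a one-line sketch:

	/* sketch: does a sub-command exist in the table above? */
	static bool command_exists(const char *name)
	{
		return kvm_get_command(kvm_commands, name) != NULL;
	}
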
diff --git a/tools/kvm/kvm-cpu.c b/tools/kvm/kvm-cpu.c
new file mode 100644
index 0000000..ee0a8ec
--- /dev/null
+++ b/tools/kvm/kvm-cpu.c
@@ -0,0 +1,248 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/symbol.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio.h"
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+
+extern __thread struct kvm_cpu *current_kvm_cpu;
+
+int __attribute__((weak)) kvm_cpu__get_endianness(struct kvm_cpu *vcpu)
+{
+	return VIRTIO_ENDIAN_HOST;
+}
+
+void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu)
+{
+	struct kvm_guest_debug debug = {
+		.control	= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP,
+	};
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_GUEST_DEBUG, &debug) < 0)
+		pr_warning("KVM_SET_GUEST_DEBUG failed");
+}
+
+void kvm_cpu__run(struct kvm_cpu *vcpu)
+{
+	int err;
+
+	if (!vcpu->is_running)
+		return;
+
+	err = ioctl(vcpu->vcpu_fd, KVM_RUN, 0);
+	if (err < 0 && (errno != EINTR && errno != EAGAIN))
+		die_perror("KVM_RUN failed");
+}
+
+static void kvm_cpu_signal_handler(int signum)
+{
+	if (signum == SIGKVMEXIT) {
+		if (current_kvm_cpu && current_kvm_cpu->is_running) {
+			current_kvm_cpu->is_running = false;
+			kvm__continue(current_kvm_cpu->kvm);
+		}
+	} else if (signum == SIGKVMPAUSE) {
+		current_kvm_cpu->paused = 1;
+	}
+}
+
+static void kvm_cpu__handle_coalesced_mmio(struct kvm_cpu *cpu)
+{
+	if (cpu->ring) {
+		while (cpu->ring->first != cpu->ring->last) {
+			struct kvm_coalesced_mmio *m;
+			m = &cpu->ring->coalesced_mmio[cpu->ring->first];
+			kvm_cpu__emulate_mmio(cpu,
+					      m->phys_addr,
+					      m->data,
+					      m->len,
+					      1);
+			cpu->ring->first = (cpu->ring->first + 1) % KVM_COALESCED_MMIO_MAX;
+		}
+	}
+}
+
+void kvm_cpu__reboot(struct kvm *kvm)
+{
+	int i;
+
+	/* The kvm->cpus array contains a null pointer in the last location */
+	for (i = 0; ; i++) {
+		if (kvm->cpus[i])
+			pthread_kill(kvm->cpus[i]->thread, SIGKVMEXIT);
+		else
+			break;
+	}
+}
+
+int kvm_cpu__start(struct kvm_cpu *cpu)
+{
+	sigset_t sigset;
+
+	sigemptyset(&sigset);
+	sigaddset(&sigset, SIGALRM);
+
+	pthread_sigmask(SIG_BLOCK, &sigset, NULL);
+
+	signal(SIGKVMEXIT, kvm_cpu_signal_handler);
+	signal(SIGKVMPAUSE, kvm_cpu_signal_handler);
+
+	kvm_cpu__reset_vcpu(cpu);
+
+	if (cpu->kvm->cfg.single_step)
+		kvm_cpu__enable_singlestep(cpu);
+
+	while (cpu->is_running) {
+		if (cpu->paused) {
+			kvm__notify_paused();
+			cpu->paused = 0;
+		}
+
+		if (cpu->needs_nmi) {
+			kvm_cpu__arch_nmi(cpu);
+			cpu->needs_nmi = 0;
+		}
+
+		kvm_cpu__run(cpu);
+
+		switch (cpu->kvm_run->exit_reason) {
+		case KVM_EXIT_UNKNOWN:
+			break;
+		case KVM_EXIT_DEBUG:
+			kvm_cpu__show_registers(cpu);
+			kvm_cpu__show_code(cpu);
+			break;
+		case KVM_EXIT_IO: {
+			bool ret;
+
+			ret = kvm_cpu__emulate_io(cpu,
+						  cpu->kvm_run->io.port,
+						  (u8 *)cpu->kvm_run +
+						  cpu->kvm_run->io.data_offset,
+						  cpu->kvm_run->io.direction,
+						  cpu->kvm_run->io.size,
+						  cpu->kvm_run->io.count);
+
+			if (!ret)
+				goto panic_kvm;
+			break;
+		}
+		case KVM_EXIT_MMIO: {
+			bool ret;
+
+			/*
+			 * On an MMIO exit, the coalesced ring must be
+			 * processed *before* the exit itself
+			 */
+			kvm_cpu__handle_coalesced_mmio(cpu);
+
+			ret = kvm_cpu__emulate_mmio(cpu,
+						    cpu->kvm_run->mmio.phys_addr,
+						    cpu->kvm_run->mmio.data,
+						    cpu->kvm_run->mmio.len,
+						    cpu->kvm_run->mmio.is_write);
+
+			if (!ret)
+				goto panic_kvm;
+			break;
+		}
+		case KVM_EXIT_INTR:
+			if (cpu->is_running)
+				break;
+			goto exit_kvm;
+		case KVM_EXIT_SHUTDOWN:
+			goto exit_kvm;
+		default: {
+			bool ret;
+
+			ret = kvm_cpu__handle_exit(cpu);
+			if (!ret)
+				goto panic_kvm;
+			break;
+		}
+		}
+		kvm_cpu__handle_coalesced_mmio(cpu);
+	}
+
+exit_kvm:
+	return 0;
+
+panic_kvm:
+	return 1;
+}
+
+int kvm_cpu__init(struct kvm *kvm)
+{
+	int max_cpus, recommended_cpus, i;
+
+	max_cpus = kvm__max_cpus(kvm);
+	recommended_cpus = kvm__recommended_cpus(kvm);
+
+	if (kvm->cfg.nrcpus > max_cpus) {
+		printf("  # Limit the number of CPUs to %d\n", max_cpus);
+		kvm->cfg.nrcpus = max_cpus;
+	} else if (kvm->cfg.nrcpus > recommended_cpus) {
+		printf("  # Warning: The maximum recommended amount of VCPUs"
+			" is %d\n", recommended_cpus);
+	}
+
+	kvm->nrcpus = kvm->cfg.nrcpus;
+
+	/* Alloc one pointer too many, so array ends up 0-terminated */
+	kvm->cpus = calloc(kvm->nrcpus + 1, sizeof(void *));
+	if (!kvm->cpus) {
+		pr_warning("Couldn't allocate array for %d CPUs", kvm->nrcpus);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < kvm->nrcpus; i++) {
+		kvm->cpus[i] = kvm_cpu__arch_init(kvm, i);
+		if (!kvm->cpus[i]) {
+			pr_warning("unable to initialize KVM VCPU");
+			goto fail_alloc;
+		}
+	}
+
+	return 0;
+
+fail_alloc:
+	for (i = 0; i < kvm->nrcpus; i++)
+		free(kvm->cpus[i]);
+	return -ENOMEM;
+}
+base_init(kvm_cpu__init);
+
+int kvm_cpu__exit(struct kvm *kvm)
+{
+	int i, r = 0;
+	void *ret = NULL;
+
+	kvm_cpu__delete(kvm->cpus[0]);
+	kvm->cpus[0] = NULL;
+
+	for (i = 1; i < kvm->nrcpus; i++) {
+		if (kvm->cpus[i]->is_running) {
+			pthread_kill(kvm->cpus[i]->thread, SIGKVMEXIT);
+			if (pthread_join(kvm->cpus[i]->thread, &ret) != 0)
+				die("pthread_join");
+			kvm_cpu__delete(kvm->cpus[i]);
+		}
+		if (ret == NULL)
+			r = 0;
+	}
+
+	free(kvm->cpus);
+
+	kvm->nrcpus = 0;
+
+	return r;
+}
+late_exit(kvm_cpu__exit);
diff --git a/tools/kvm/kvm-ipc.c b/tools/kvm/kvm-ipc.c
new file mode 100644
index 0000000..b1c43dd
--- /dev/null
+++ b/tools/kvm/kvm-ipc.c
@@ -0,0 +1,533 @@
+#include <sys/epoll.h>
+#include <sys/un.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/eventfd.h>
+#include <dirent.h>
+
+#include "kvm/kvm-ipc.h"
+#include "kvm/rwsem.h"
+#include "kvm/read-write.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/builtin-debug.h"
+#include "kvm/strbuf.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/8250-serial.h"
+
+struct kvm_ipc_head {
+	u32 type;
+	u32 len;
+};
+
+#define KVM_IPC_MAX_MSGS 16
+
+#define KVM_SOCK_SUFFIX		".sock"
+#define KVM_SOCK_SUFFIX_LEN	((ssize_t)sizeof(KVM_SOCK_SUFFIX) - 1)
+
+extern __thread struct kvm_cpu *current_kvm_cpu;
+static void (*msgs[KVM_IPC_MAX_MSGS])(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg);
+static DECLARE_RWSEM(msgs_rwlock);
+static int epoll_fd, server_fd, stop_fd;
+static pthread_t thread;
+
+static int kvm__create_socket(struct kvm *kvm)
+{
+	char full_name[PATH_MAX];
+	int s;
+	struct sockaddr_un local;
+	int len, r;
+
+	/* This is usually 108 bytes long */
+	BUILD_BUG_ON(sizeof(local.sun_path) < 32);
+
+	snprintf(full_name, sizeof(full_name), "%s/%s%s",
+		 kvm__get_dir(), kvm->cfg.guest_name, KVM_SOCK_SUFFIX);
+	if (access(full_name, F_OK) == 0) {
+		pr_err("Socket file %s already exists", full_name);
+		return -EEXIST;
+	}
+
+	s = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (s < 0) {
+		perror("socket");
+		return s;
+	}
+
+	local.sun_family = AF_UNIX;
+	strlcpy(local.sun_path, full_name, sizeof(local.sun_path));
+	len = strlen(local.sun_path) + sizeof(local.sun_family);
+	r = bind(s, (struct sockaddr *)&local, len);
+	if (r < 0) {
+		perror("bind");
+		goto fail;
+	}
+
+	r = listen(s, 5);
+	if (r < 0) {
+		perror("listen");
+		goto fail;
+	}
+
+	return s;
+
+fail:
+	close(s);
+	return r;
+}
+
+void kvm__remove_socket(const char *name)
+{
+	char full_name[PATH_MAX];
+
+	snprintf(full_name, sizeof(full_name), "%s/%s%s",
+		 kvm__get_dir(), name, KVM_SOCK_SUFFIX);
+	unlink(full_name);
+}
+
+int kvm__get_sock_by_instance(const char *name)
+{
+	int s, len, r;
+	char sock_file[PATH_MAX];
+	struct sockaddr_un local;
+
+	snprintf(sock_file, sizeof(sock_file), "%s/%s%s",
+		 kvm__get_dir(), name, KVM_SOCK_SUFFIX);
+	s = socket(AF_UNIX, SOCK_STREAM, 0);
+
+	local.sun_family = AF_UNIX;
+	strlcpy(local.sun_path, sock_file, sizeof(local.sun_path));
+	len = strlen(local.sun_path) + sizeof(local.sun_family);
+
+	r = connect(s, (struct sockaddr *)&local, len);
+	if (r < 0 && errno == ECONNREFUSED) {
+		/* Tell the user to clean up the ghost socket file */
+		pr_err("\"%s\" could be a ghost socket file, please remove it",
+				sock_file);
+		return r;
+	} else if (r < 0) {
+		return r;
+	}
+
+	return s;
+}
+
+static bool is_socket(const char *base_path, const struct dirent *dent)
+{
+	switch (dent->d_type) {
+	case DT_SOCK:
+		return true;
+
+	case DT_UNKNOWN: {
+		char path[PATH_MAX];
+		struct stat st;
+
+		sprintf(path, "%s/%s", base_path, dent->d_name);
+		if (stat(path, &st))
+			return false;
+
+		return S_ISSOCK(st.st_mode);
+	}
+	default:
+		return false;
+	}
+}
+
+int kvm__enumerate_instances(int (*callback)(const char *name, int fd))
+{
+	int sock;
+	DIR *dir;
+	struct dirent entry, *result;
+	int ret = 0;
+	const char *path;
+
+	path = kvm__get_dir();
+
+	dir = opendir(path);
+	if (!dir)
+		return -errno;
+
+	for (;;) {
+		readdir_r(dir, &entry, &result);
+		if (result == NULL)
+			break;
+		if (is_socket(path, &entry)) {
+			ssize_t name_len = strlen(entry.d_name);
+			char *p;
+
+			if (name_len <= KVM_SOCK_SUFFIX_LEN)
+				continue;
+
+			p = &entry.d_name[name_len - KVM_SOCK_SUFFIX_LEN];
+			if (memcmp(KVM_SOCK_SUFFIX, p, KVM_SOCK_SUFFIX_LEN))
+				continue;
+
+			*p = 0;
+			sock = kvm__get_sock_by_instance(entry.d_name);
+			if (sock < 0)
+				continue;
+			ret = callback(entry.d_name, sock);
+			close(sock);
+			if (ret < 0)
+				break;
+		}
+	}
+
+	closedir(dir);
+
+	return ret;
+}
+
+int kvm_ipc__register_handler(u32 type, void (*cb)(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg))
+{
+	if (type >= KVM_IPC_MAX_MSGS)
+		return -ENOSPC;
+
+	down_write(&msgs_rwlock);
+	msgs[type] = cb;
+	up_write(&msgs_rwlock);
+
+	return 0;
+}
+
+int kvm_ipc__send(int fd, u32 type)
+{
+	struct kvm_ipc_head head = {.type = type, .len = 0,};
+
+	if (write_in_full(fd, &head, sizeof(head)) < 0)
+		return -1;
+
+	return 0;
+}
+
+int kvm_ipc__send_msg(int fd, u32 type, u32 len, u8 *msg)
+{
+	struct kvm_ipc_head head = {.type = type, .len = len,};
+
+	if (write_in_full(fd, &head, sizeof(head)) < 0)
+		return -1;
+
+	if (write_in_full(fd, msg, len) < 0)
+		return -1;
+
+	return 0;
+}
+
+static int kvm_ipc__handle(struct kvm *kvm, int fd, u32 type, u32 len, u8 *data)
+{
+	void (*cb)(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg);
+
+	if (type >= KVM_IPC_MAX_MSGS)
+		return -ENOSPC;
+
+	down_read(&msgs_rwlock);
+	cb = msgs[type];
+	up_read(&msgs_rwlock);
+
+	if (cb == NULL) {
+		pr_warning("No device handles type %u\n", type);
+		return -ENODEV;
+	}
+
+	cb(kvm, fd, type, len, data);
+
+	return 0;
+}
+
+static int kvm_ipc__new_conn(int fd)
+{
+	int client;
+	struct epoll_event ev;
+
+	client = accept(fd, NULL, NULL);
+	if (client < 0)
+		return -1;
+
+	ev.events = EPOLLIN | EPOLLRDHUP;
+	ev.data.fd = client;
+	if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, client, &ev) < 0) {
+		close(client);
+		return -1;
+	}
+
+	return client;
+}
+
+static void kvm_ipc__close_conn(int fd)
+{
+	epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL);
+	close(fd);
+}
+
+static int kvm_ipc__receive(struct kvm *kvm, int fd)
+{
+	struct kvm_ipc_head head;
+	u8 *msg = NULL;
+	u32 n;
+
+	n = read(fd, &head, sizeof(head));
+	if (n != sizeof(head))
+		goto done;
+
+	msg = malloc(head.len);
+	if (msg == NULL)
+		goto done;
+
+	n = read_in_full(fd, msg, head.len);
+	if (n != head.len)
+		goto done;
+
+	kvm_ipc__handle(kvm, fd, head.type, head.len, msg);
+
+	return 0;
+
+done:
+	free(msg);
+	return -1;
+}
+
+static void *kvm_ipc__thread(void *param)
+{
+	struct epoll_event event;
+	struct kvm *kvm = param;
+
+	kvm__set_thread_name("kvm-ipc");
+
+	for (;;) {
+		int nfds;
+
+		nfds = epoll_wait(epoll_fd, &event, 1, -1);
+		if (nfds > 0) {
+			int fd = event.data.fd;
+
+			if (fd == stop_fd && event.events & EPOLLIN) {
+				break;
+			} else if (fd == server_fd) {
+				int client, r;
+
+				client = kvm_ipc__new_conn(fd);
+				/*
+				 * Handle multiple IPC commands at a time
+				 */
+				do {
+					r = kvm_ipc__receive(kvm, client);
+				} while	(r == 0);
+
+			} else if (event.events & (EPOLLERR | EPOLLRDHUP | EPOLLHUP)) {
+				kvm_ipc__close_conn(fd);
+			} else {
+				kvm_ipc__receive(kvm, fd);
+			}
+		}
+	}
+
+	return NULL;
+}
+
+static void kvm__pid(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	pid_t pid = getpid();
+	int r = 0;
+
+	if (type == KVM_IPC_PID)
+		r = write(fd, &pid, sizeof(pid));
+
+	if (r < 0)
+		pr_warning("Failed sending PID");
+}
+
+static void handle_stop(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	if (WARN_ON(type != KVM_IPC_STOP || len))
+		return;
+
+	kvm_cpu__reboot(kvm);
+}
+
+/* Pause/resume the guest using SIGUSR2 */
+static int is_paused;
+
+static void handle_pause(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	if (WARN_ON(len))
+		return;
+
+	if (type == KVM_IPC_RESUME && is_paused) {
+		kvm->vm_state = KVM_VMSTATE_RUNNING;
+		kvm__continue(kvm);
+	} else if (type == KVM_IPC_PAUSE && !is_paused) {
+		kvm->vm_state = KVM_VMSTATE_PAUSED;
+		ioctl(kvm->vm_fd, KVM_KVMCLOCK_CTRL);
+		kvm__pause(kvm);
+	} else {
+		return;
+	}
+
+	is_paused = !is_paused;
+}
+
+static void handle_vmstate(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	int r = 0;
+
+	if (type == KVM_IPC_VMSTATE)
+		r = write(fd, &kvm->vm_state, sizeof(kvm->vm_state));
+
+	if (r < 0)
+		pr_warning("Failed sending VMSTATE");
+}
+
+/*
+ * Serialize debug printout so that the output of multiple vcpus does not
+ * get mixed up:
+ */
+static int printout_done;
+
+static void handle_sigusr1(int sig)
+{
+	struct kvm_cpu *cpu = current_kvm_cpu;
+	int fd = kvm_cpu__get_debug_fd();
+
+	if (!cpu || cpu->needs_nmi)
+		return;
+
+	dprintf(fd, "\n #\n # vCPU #%ld's dump:\n #\n", cpu->cpu_id);
+	kvm_cpu__show_registers(cpu);
+	kvm_cpu__show_code(cpu);
+	kvm_cpu__show_page_tables(cpu);
+	fflush(stdout);
+	printout_done = 1;
+}
+
+static void handle_debug(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	int i;
+	struct debug_cmd_params *params;
+	u32 dbg_type;
+	u32 vcpu;
+
+	if (WARN_ON(type != KVM_IPC_DEBUG || len != sizeof(*params)))
+		return;
+
+	params = (void *)msg;
+	dbg_type = params->dbg_type;
+	vcpu = params->cpu;
+
+	if (dbg_type & KVM_DEBUG_CMD_TYPE_SYSRQ)
+		serial8250__inject_sysrq(kvm, params->sysrq);
+
+	if (dbg_type & KVM_DEBUG_CMD_TYPE_NMI) {
+		if ((int)vcpu >= kvm->nrcpus)
+			return;
+
+		kvm->cpus[vcpu]->needs_nmi = 1;
+		pthread_kill(kvm->cpus[vcpu]->thread, SIGUSR1);
+	}
+
+	if (!(dbg_type & KVM_DEBUG_CMD_TYPE_DUMP))
+		return;
+
+	for (i = 0; i < kvm->nrcpus; i++) {
+		struct kvm_cpu *cpu = kvm->cpus[i];
+
+		if (!cpu)
+			continue;
+
+		printout_done = 0;
+
+		kvm_cpu__set_debug_fd(fd);
+		pthread_kill(cpu->thread, SIGUSR1);
+		/*
+		 * Wait for the vCPU to dump state before signalling
+		 * the next thread. Since this is debug code it does
+		 * not matter that we are burning CPU time a bit:
+		 */
+		while (!printout_done)
+			sleep(0);
+	}
+
+	close(fd);
+
+	serial8250__inject_sysrq(kvm, 'p');
+}
+
+int kvm_ipc__init(struct kvm *kvm)
+{
+	int ret;
+	int sock = kvm__create_socket(kvm);
+	struct epoll_event ev = {0};
+
+	server_fd = sock;
+
+	epoll_fd = epoll_create(KVM_IPC_MAX_MSGS);
+	if (epoll_fd < 0) {
+		perror("epoll_create");
+		ret = epoll_fd;
+		goto err;
+	}
+
+	ev.events = EPOLLIN | EPOLLET;
+	ev.data.fd = sock;
+	if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, sock, &ev) < 0) {
+		pr_err("Failed adding socket to epoll");
+		ret = -EFAULT;
+		goto err_epoll;
+	}
+
+	stop_fd = eventfd(0, 0);
+	if (stop_fd < 0) {
+		perror("eventfd");
+		ret = stop_fd;
+		goto err_epoll;
+	}
+
+	ev.events = EPOLLIN | EPOLLET;
+	ev.data.fd = stop_fd;
+	if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, stop_fd, &ev) < 0) {
+		pr_err("Failed adding stop event to epoll");
+		ret = -EFAULT;
+		goto err_stop;
+	}
+
+	if (pthread_create(&thread, NULL, kvm_ipc__thread, kvm) != 0) {
+		pr_err("Failed starting IPC thread");
+		ret = -EFAULT;
+		goto err_stop;
+	}
+
+	kvm_ipc__register_handler(KVM_IPC_PID, kvm__pid);
+	kvm_ipc__register_handler(KVM_IPC_DEBUG, handle_debug);
+	kvm_ipc__register_handler(KVM_IPC_PAUSE, handle_pause);
+	kvm_ipc__register_handler(KVM_IPC_RESUME, handle_pause);
+	kvm_ipc__register_handler(KVM_IPC_STOP, handle_stop);
+	kvm_ipc__register_handler(KVM_IPC_VMSTATE, handle_vmstate);
+	signal(SIGUSR1, handle_sigusr1);
+
+	return 0;
+
+err_stop:
+	close(stop_fd);
+err_epoll:
+	close(epoll_fd);
+err:
+	return ret;
+}
+base_init(kvm_ipc__init);
+
+int kvm_ipc__exit(struct kvm *kvm)
+{
+	u64 val = 1;
+	int ret;
+
+	ret = write(stop_fd, &val, sizeof(val));
+	if (ret < 0)
+		return ret;
+
+	close(server_fd);
+	close(epoll_fd);
+
+	kvm__remove_socket(kvm->cfg.guest_name);
+
+	return ret;
+}
+base_exit(kvm_ipc__exit);
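
The client side of this protocol writes a kvm_ipc_head and reads back the raw reply. A sketch of querying the guest PID over the instance socket, assuming KVM_IPC_PID from kvm/kvm-ipc.h and <unistd.h> for read()/close():

	/* sketch: client side of the PID query handled by kvm__pid() above */
	static int query_pid(const char *instance, pid_t *pid)
	{
		int fd, r;

		fd = kvm__get_sock_by_instance(instance);
		if (fd < 0)
			return fd;

		r = kvm_ipc__send(fd, KVM_IPC_PID);
		if (r == 0 && read(fd, pid, sizeof(*pid)) != sizeof(*pid))
			r = -1;

		close(fd);
		return r;
	}
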
diff --git a/tools/kvm/kvm.c b/tools/kvm/kvm.c
new file mode 100644
index 0000000..e1b9f6c
--- /dev/null
+++ b/tools/kvm/kvm.c
@@ -0,0 +1,481 @@
+#include "kvm/kvm.h"
+#include "kvm/read-write.h"
+#include "kvm/util.h"
+#include "kvm/strbuf.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/kvm-ipc.h"
+
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/list.h>
+#include <linux/err.h>
+
+#include <sys/un.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/eventfd.h>
+#include <asm/unistd.h>
+#include <dirent.h>
+
+#define DEFINE_KVM_EXIT_REASON(reason) [reason] = #reason
+
+const char *kvm_exit_reasons[] = {
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_UNKNOWN),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_EXCEPTION),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_IO),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_HYPERCALL),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_DEBUG),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_HLT),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_MMIO),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_IRQ_WINDOW_OPEN),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_SHUTDOWN),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_FAIL_ENTRY),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTR),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_SET_TPR),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_TPR_ACCESS),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_SIEIC),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_RESET),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_DCR),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_NMI),
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTERNAL_ERROR),
+#ifdef CONFIG_PPC64
+	DEFINE_KVM_EXIT_REASON(KVM_EXIT_PAPR_HCALL),
+#endif
+};
+
+static int pause_event;
+static DEFINE_MUTEX(pause_lock);
+extern struct kvm_ext kvm_req_ext[];
+
+static char kvm_dir[PATH_MAX];
+
+static int set_dir(const char *fmt, va_list args)
+{
+	char tmp[PATH_MAX];
+
+	vsnprintf(tmp, sizeof(tmp), fmt, args);
+
+	mkdir(tmp, 0777);
+
+	if (!realpath(tmp, kvm_dir))
+		return -errno;
+
+	strcat(kvm_dir, "/");
+
+	return 0;
+}
+
+void kvm__set_dir(const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	set_dir(fmt, args);
+	va_end(args);
+}
+
+const char *kvm__get_dir(void)
+{
+	return kvm_dir;
+}
+
+bool kvm__supports_extension(struct kvm *kvm, unsigned int extension)
+{
+	int ret;
+
+	ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, extension);
+	if (ret < 0)
+		return false;
+
+	return ret;
+}
+
+static int kvm__check_extensions(struct kvm *kvm)
+{
+	int i;
+
+	for (i = 0; ; i++) {
+		if (!kvm_req_ext[i].name)
+			break;
+		if (!kvm__supports_extension(kvm, kvm_req_ext[i].code)) {
+			pr_err("Unsupported KVM extension detected: %s",
+				kvm_req_ext[i].name);
+			return -i;
+		}
+	}
+
+	return 0;
+}
+
+struct kvm *kvm__new(void)
+{
+	struct kvm *kvm = calloc(1, sizeof(*kvm));
+	if (!kvm)
+		return ERR_PTR(-ENOMEM);
+
+	kvm->sys_fd = -1;
+	kvm->vm_fd = -1;
+
+	return kvm;
+}
+
+int kvm__exit(struct kvm *kvm)
+{
+	struct kvm_mem_bank *bank, *tmp;
+
+	kvm__arch_delete_ram(kvm);
+
+	list_for_each_entry_safe(bank, tmp, &kvm->mem_banks, list) {
+		list_del(&bank->list);
+		free(bank);
+	}
+
+	free(kvm);
+	return 0;
+}
+core_exit(kvm__exit);
+
+/*
+ * Note: KVM_SET_USER_MEMORY_REGION assumes that we don't pass overlapping
+ * memory regions to it. Therefore, be careful if you use this function for
+ * registering memory regions for emulating hardware.
+ */
+int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr)
+{
+	struct kvm_userspace_memory_region mem;
+	struct kvm_mem_bank *bank;
+	int ret;
+
+	bank = malloc(sizeof(*bank));
+	if (!bank)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&bank->list);
+	bank->guest_phys_addr		= guest_phys;
+	bank->host_addr			= userspace_addr;
+	bank->size			= size;
+
+	mem = (struct kvm_userspace_memory_region) {
+		.slot			= kvm->mem_slots++,
+		.guest_phys_addr	= guest_phys,
+		.memory_size		= size,
+		.userspace_addr		= (unsigned long)userspace_addr,
+	};
+
+	ret = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
+	if (ret < 0)
+		return -errno;
+
+	list_add(&bank->list, &kvm->mem_banks);
+	return 0;
+}
+
+void *guest_flat_to_host(struct kvm *kvm, u64 offset)
+{
+	struct kvm_mem_bank *bank;
+
+	list_for_each_entry(bank, &kvm->mem_banks, list) {
+		u64 bank_start = bank->guest_phys_addr;
+		u64 bank_end = bank_start + bank->size;
+
+		if (offset >= bank_start && offset < bank_end)
+			return bank->host_addr + (offset - bank_start);
+	}
+
+	pr_warning("unable to translate guest address 0x%llx to host",
+			(unsigned long long)offset);
+	return NULL;
+}
+
+u64 host_to_guest_flat(struct kvm *kvm, void *ptr)
+{
+	struct kvm_mem_bank *bank;
+
+	list_for_each_entry(bank, &kvm->mem_banks, list) {
+		void *bank_start = bank->host_addr;
+		void *bank_end = bank_start + bank->size;
+
+		if (ptr >= bank_start && ptr < bank_end)
+			return bank->guest_phys_addr + (ptr - bank_start);
+	}
+
+	pr_warning("unable to translate host address %p to guest", ptr);
+	return 0;
+}
+
+int kvm__recommended_cpus(struct kvm *kvm)
+{
+	int ret;
+
+	ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
+	if (ret <= 0)
+		/*
+		 * api.txt states that if KVM_CAP_NR_VCPUS does not exist,
+		 * assume 4.
+		 */
+		return 4;
+
+	return ret;
+}
+
+/*
+ * The following hack should be removed once 'x86: Raise the hard
+ * VCPU count limit' makes its way into mainline.
+ */
+#ifndef KVM_CAP_MAX_VCPUS
+#define KVM_CAP_MAX_VCPUS 66
+#endif
+
+int kvm__max_cpus(struct kvm *kvm)
+{
+	int ret;
+
+	ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
+	if (ret <= 0)
+		ret = kvm__recommended_cpus(kvm);
+
+	return ret;
+}
+
+int kvm__init(struct kvm *kvm)
+{
+	int ret;
+
+	if (!kvm__arch_cpu_supports_vm()) {
+		pr_err("Your CPU does not support hardware virtualization");
+		ret = -ENOSYS;
+		goto err;
+	}
+
+	kvm->sys_fd = open(kvm->cfg.dev, O_RDWR);
+	if (kvm->sys_fd < 0) {
+		if (errno == ENOENT)
+			pr_err("'%s' not found. Please make sure your kernel has CONFIG_KVM "
+			       "enabled and that the KVM modules are loaded.", kvm->cfg.dev);
+		else if (errno == ENODEV)
+			pr_err("'%s' KVM driver not available.\n  # (If the KVM "
+			       "module is loaded then 'dmesg' may offer further clues "
+			       "about the failure.)", kvm->cfg.dev);
+		else
+			pr_err("Could not open %s: ", kvm->cfg.dev);
+
+		ret = -errno;
+		goto err_free;
+	}
+
+	ret = ioctl(kvm->sys_fd, KVM_GET_API_VERSION, 0);
+	if (ret != KVM_API_VERSION) {
+		pr_err("KVM_API_VERSION ioctl");
+		ret = -errno;
+		goto err_sys_fd;
+	}
+
+	kvm->vm_fd = ioctl(kvm->sys_fd, KVM_CREATE_VM, KVM_VM_TYPE);
+	if (kvm->vm_fd < 0) {
+		pr_err("KVM_CREATE_VM ioctl");
+		ret = kvm->vm_fd;
+		goto err_sys_fd;
+	}
+
+	if (kvm__check_extensions(kvm)) {
+		pr_err("A required KVM extension is not supported by OS");
+		ret = -ENOSYS;
+		goto err_vm_fd;
+	}
+
+	kvm__arch_init(kvm, kvm->cfg.hugetlbfs_path, kvm->cfg.ram_size);
+
+	INIT_LIST_HEAD(&kvm->mem_banks);
+	kvm__init_ram(kvm);
+
+	if (!kvm->cfg.firmware_filename) {
+		if (!kvm__load_kernel(kvm, kvm->cfg.kernel_filename,
+				kvm->cfg.initrd_filename, kvm->cfg.real_cmdline))
+			die("unable to load kernel %s", kvm->cfg.kernel_filename);
+	}
+
+	if (kvm->cfg.firmware_filename) {
+		if (!kvm__load_firmware(kvm, kvm->cfg.firmware_filename))
+			die("unable to load firmware image %s: %s", kvm->cfg.firmware_filename, strerror(errno));
+	} else {
+		ret = kvm__arch_setup_firmware(kvm);
+		if (ret < 0)
+			die("kvm__arch_setup_firmware() failed with error %d\n", ret);
+	}
+
+	return 0;
+
+err_vm_fd:
+	close(kvm->vm_fd);
+err_sys_fd:
+	close(kvm->sys_fd);
+err_free:
+	free(kvm);
+err:
+	return ret;
+}
+core_init(kvm__init);
+
+/* RFC 1952 */
+#define GZIP_ID1		0x1f
+#define GZIP_ID2		0x8b
+#define CPIO_MAGIC		"0707"
+/* initrd may be gzipped, or a plain cpio */
+static bool initrd_check(int fd)
+{
+	unsigned char id[4];
+
+	if (read_in_full(fd, id, ARRAY_SIZE(id)) < 0)
+		return false;
+
+	if (lseek(fd, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	return (id[0] == GZIP_ID1 && id[1] == GZIP_ID2) ||
+		!memcmp(id, CPIO_MAGIC, 4);
+}
+
+int __attribute__((__weak__)) load_elf_binary(struct kvm *kvm, int fd_kernel,
+				int fd_initrd, const char *kernel_cmdline)
+{
+	return false;
+}
+
+bool __attribute__((__weak__)) load_bzimage(struct kvm *kvm, int fd_kernel,
+				int fd_initrd, const char *kernel_cmdline)
+{
+	return false;
+}
+
+bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename,
+		const char *initrd_filename, const char *kernel_cmdline)
+{
+	bool ret;
+	int fd_kernel = -1, fd_initrd = -1;
+
+	fd_kernel = open(kernel_filename, O_RDONLY);
+	if (fd_kernel < 0)
+		die("Unable to open kernel %s", kernel_filename);
+
+	if (initrd_filename) {
+		fd_initrd = open(initrd_filename, O_RDONLY);
+		if (fd_initrd < 0)
+			die("Unable to open initrd %s", initrd_filename);
+
+		if (!initrd_check(fd_initrd))
+			die("%s is not an initrd", initrd_filename);
+	}
+
+	ret = load_bzimage(kvm, fd_kernel, fd_initrd, kernel_cmdline);
+
+	if (ret)
+		goto found_kernel;
+
+	pr_warning("%s is not a bzImage. Trying to load it as an ELF or flat binary...", kernel_filename);
+
+	ret = load_elf_binary(kvm, fd_kernel, fd_initrd, kernel_cmdline);
+
+	if (ret)
+		goto found_kernel;
+
+	ret = load_flat_binary(kvm, fd_kernel, fd_initrd, kernel_cmdline);
+
+	if (ret)
+		goto found_kernel;
+
+	if (initrd_filename)
+		close(fd_initrd);
+	close(fd_kernel);
+
+	die("%s is not a valid bzImage, ELF, or flat binary", kernel_filename);
+
+found_kernel:
+	if (initrd_filename)
+		close(fd_initrd);
+	close(fd_kernel);
+
+	return ret;
+}
+
+
+void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size, int debug_fd)
+{
+	unsigned char *p;
+	unsigned long n;
+
+	size &= ~7; /* mod 8 */
+	if (!size)
+		return;
+
+	p = guest_flat_to_host(kvm, addr);
+
+	for (n = 0; n < size; n += 8) {
+		if (!host_ptr_in_ram(kvm, p + n)) {
+			dprintf(debug_fd, " 0x%08lx: <unknown>\n", addr + n);
+			continue;
+		}
+		dprintf(debug_fd, " 0x%08lx: %02x %02x %02x %02x  %02x %02x %02x %02x\n",
+			addr + n, p[n + 0], p[n + 1], p[n + 2], p[n + 3],
+				  p[n + 4], p[n + 5], p[n + 6], p[n + 7]);
+	}
+}
+
+void kvm__pause(struct kvm *kvm)
+{
+	int i, paused_vcpus = 0;
+
+	/* Check if the guest is running */
+	if (!kvm->cpus[0] || kvm->cpus[0]->thread == 0)
+		return;
+
+	mutex_lock(&pause_lock);
+
+	pause_event = eventfd(0, 0);
+	if (pause_event < 0)
+		die("Failed creating pause notification event");
+	for (i = 0; i < kvm->nrcpus; i++)
+		pthread_kill(kvm->cpus[i]->thread, SIGKVMPAUSE);
+
+	while (paused_vcpus < kvm->nrcpus) {
+		u64 cur_read;
+
+		if (read(pause_event, &cur_read, sizeof(cur_read)) < 0)
+			die("Failed reading pause event");
+		paused_vcpus += cur_read;
+	}
+	close(pause_event);
+}
+
+void kvm__continue(struct kvm *kvm)
+{
+	/* Check if the guest is running */
+	if (!kvm->cpus[0] || kvm->cpus[0]->thread == 0)
+		return;
+
+	mutex_unlock(&pause_lock);
+}
+
+void kvm__notify_paused(void)
+{
+	u64 p = 1;
+
+	if (write(pause_event, &p, sizeof(p)) < 0)
+		die("Failed notifying of paused VCPU.");
+
+	mutex_lock(&pause_lock);
+	mutex_unlock(&pause_lock);
+}
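
Guest-to-host translation is a linear walk over the registered banks, and host_to_guest_flat() is its inverse within a bank. A round-trip sketch using only the functions above (it assumes kvm.c's existing includes; the bank placement at guest physical 0 is illustrative):

	/* sketch: one anonymous RAM bank, then a round-trip translation check */
	static int map_ram_check(struct kvm *kvm, u64 size)
	{
		void *ram, *host;
		int r;

		ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
		if (ram == MAP_FAILED)
			return -errno;

		r = kvm__register_mem(kvm, 0, size, ram);
		if (r < 0)
			return r;

		/* host_to_guest_flat() inverts guest_flat_to_host() within a bank */
		host = guest_flat_to_host(kvm, 0x1000);
		return (host && host_to_guest_flat(kvm, host) == 0x1000) ? 0 : -EINVAL;
	}
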
diff --git a/tools/kvm/main.c b/tools/kvm/main.c
new file mode 100644
index 0000000..05bc82c
--- /dev/null
+++ b/tools/kvm/main.c
@@ -0,0 +1,19 @@
+#include "kvm/kvm.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+/* user defined header files */
+#include <kvm/kvm-cmd.h>
+
+static int handle_kvm_command(int argc, char **argv)
+{
+	return handle_command(kvm_commands, argc, (const char **) &argv[0]);
+}
+
+int main(int argc, char *argv[])
+{
+	kvm__set_dir("%s/%s", HOME_DIR, KVM_PID_FILE_PATH);
+
+	return handle_kvm_command(argc - 1, &argv[1]);
+}
diff --git a/tools/kvm/mips/include/kvm/barrier.h b/tools/kvm/mips/include/kvm/barrier.h
new file mode 100644
index 0000000..45bfa72
--- /dev/null
+++ b/tools/kvm/mips/include/kvm/barrier.h
@@ -0,0 +1,20 @@
+#ifndef _KVM_BARRIER_H_
+#define _KVM_BARRIER_H_
+
+#define barrier() asm volatile("": : :"memory")
+
+#define mb()	asm volatile (".set push\n\t.set mips2\n\tsync\n\t.set pop": : :"memory")
+#define rmb() mb()
+#define wmb() mb()
+
+#ifdef CONFIG_SMP
+#define smp_mb()	mb()
+#define smp_rmb()	rmb()
+#define smp_wmb()	wmb()
+#else
+#define smp_mb()	barrier()
+#define smp_rmb()	barrier()
+#define smp_wmb()	barrier()
+#endif
+
+#endif /* _KVM_BARRIER_H_ */
diff --git a/tools/kvm/mips/include/kvm/kvm-arch.h b/tools/kvm/mips/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..7eadbf4
--- /dev/null
+++ b/tools/kvm/mips/include/kvm/kvm-arch.h
@@ -0,0 +1,38 @@
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#define KVM_MMIO_START		0x10000000
+#define KVM_PCI_CFG_AREA	KVM_MMIO_START
+#define KVM_PCI_MMIO_AREA	(KVM_MMIO_START + 0x1000000)
+#define KVM_VIRTIO_MMIO_AREA	(KVM_MMIO_START + 0x2000000)
+
+/*
+ * Just for reference. This and the above correspond to what's used
+ * in mipsvz_page_fault() in kvm_mipsvz.c of the host kernel.
+ */
+#define KVM_MIPS_IOPORT_AREA	0x1e000000
+#define KVM_MIPS_IOPORT_SIZE	0x00010000
+#define KVM_MIPS_IRQCHIP_AREA	0x1e010000
+#define KVM_MIPS_IRQCHIP_SIZE	0x00010000
+
+#define KVM_IRQ_OFFSET		1
+
+/*
+ * MIPS-VZ (trap and emulate is 0)
+ */
+#define KVM_VM_TYPE		1
+
+#define VIRTIO_DEFAULT_TRANS(kvm)	VIRTIO_PCI
+
+#include <stdbool.h>
+
+#include "linux/types.h"
+
+struct kvm_arch {
+	u64 entry_point;
+	u64 argc;
+	u64 argv;
+	bool is64bit;
+};
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/tools/kvm/mips/include/kvm/kvm-config-arch.h b/tools/kvm/mips/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..8a28f9d
--- /dev/null
+++ b/tools/kvm/mips/include/kvm/kvm-config-arch.h
@@ -0,0 +1,7 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+struct kvm_config_arch {
+};
+
+#endif /* KVM__MIPS_KVM_CONFIG_ARCH_H */
diff --git a/tools/kvm/mips/include/kvm/kvm-cpu-arch.h b/tools/kvm/mips/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..3db7f2b
--- /dev/null
+++ b/tools/kvm/mips/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,42 @@
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+#include <linux/kvm.h>	/* for struct kvm_regs */
+#include "kvm/kvm.h"	/* for kvm__emulate_{mm}io() */
+#include <pthread.h>
+
+struct kvm;
+
+struct kvm_cpu {
+	pthread_t	thread;		/* VCPU thread */
+
+	unsigned long	cpu_id;
+
+	struct kvm	*kvm;		/* parent KVM */
+	int		vcpu_fd;	/* For VCPU ioctls() */
+	struct kvm_run	*kvm_run;
+
+	struct kvm_regs	regs;
+
+	u8		is_running;
+	u8		paused;
+	u8		needs_nmi;
+
+	struct kvm_coalesced_mmio_ring *ring;
+};
+
+/*
+ * As these are such simple wrappers, let's have them in the header so they'll
+ * be cheaper to call:
+ */
+static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count)
+{
+	return kvm__emulate_io(vcpu, port, data, direction, size, count);
+}
+
+static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+	return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
+}
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/tools/kvm/mips/irq.c b/tools/kvm/mips/irq.c
new file mode 100644
index 0000000..c1ff6bb
--- /dev/null
+++ b/tools/kvm/mips/irq.c
@@ -0,0 +1,10 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+
+#include <stdlib.h>
+
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg)
+{
+	pr_warning("irq__add_msix_route");
+	return 1;
+}
diff --git a/tools/kvm/mips/kvm-cpu.c b/tools/kvm/mips/kvm-cpu.c
new file mode 100644
index 0000000..30a3de1
--- /dev/null
+++ b/tools/kvm/mips/kvm-cpu.c
@@ -0,0 +1,219 @@
+#include "kvm/kvm-cpu.h"
+#include "kvm/term.h"
+
+#include <stdlib.h>
+
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+	debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+	return debug_fd;
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+	free(vcpu);
+}
+
+static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm)
+{
+	struct kvm_cpu *vcpu;
+
+	vcpu = calloc(1, sizeof(*vcpu));
+	if (!vcpu)
+		return NULL;
+
+	vcpu->kvm = kvm;
+
+	return vcpu;
+}
+
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id)
+{
+	struct kvm_cpu *vcpu;
+	int mmap_size;
+	int coalesced_offset;
+
+	vcpu = kvm_cpu__new(kvm);
+	if (!vcpu)
+		return NULL;
+
+	vcpu->cpu_id = cpu_id;
+
+	vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+	if (vcpu->vcpu_fd < 0)
+		die_perror("KVM_CREATE_VCPU ioctl");
+
+	mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+	if (mmap_size < 0)
+		die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+	vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0);
+	if (vcpu->kvm_run == MAP_FAILED)
+		die("unable to mmap vcpu fd");
+
+	vcpu->is_running = true;
+
+	coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
+	if (coalesced_offset)
+		vcpu->ring = (void *)vcpu->kvm_run + (coalesced_offset * PAGE_SIZE);
+
+	return vcpu;
+}
+
+static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu)
+{
+	uint32_t v;
+	struct kvm_one_reg one_reg;
+
+	memset(&vcpu->regs, 0, sizeof(vcpu->regs));
+	vcpu->regs.pc = vcpu->kvm->arch.entry_point;
+	vcpu->regs.gpr[4] = vcpu->kvm->arch.argc;
+	vcpu->regs.gpr[5] = vcpu->kvm->arch.argv;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0)
+		die_perror("KVM_SET_REGS failed");
+
+
+	one_reg.id = KVM_REG_MIPS | KVM_REG_SIZE_U32 | (0x10000 + 8 * 12 + 0); /* Status */
+	one_reg.addr = (unsigned long)(uint32_t *)&v;
+	v = 6;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &one_reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed");
+}
+
+/**
+ * kvm_cpu__reset_vcpu - reset virtual CPU to a known state
+ */
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+	kvm_cpu__setup_regs(vcpu);
+}
+
+static bool kvm_cpu__hypercall_write_cons(struct kvm_cpu *vcpu)
+{
+	int term = (int)vcpu->kvm_run->hypercall.args[0];
+	u64 addr = vcpu->kvm_run->hypercall.args[1];
+	int len = (int)vcpu->kvm_run->hypercall.args[2];
+	char *host_addr;
+
+	if (term < 0 || term >= TERM_MAX_DEVS) {
+		pr_warning("hypercall_write_cons term out of range <%d>", term);
+		return false;
+	}
+
+	if ((addr & 0xffffffffc0000000ull) == 0xffffffff80000000ull)
+		addr &= 0x1ffffffful; /* Convert KSEG{0,1} to physical. */
+	if ((addr & 0xc000000000000000ull) == 0x8000000000000000ull)
+		addr &= 0x07ffffffffffffffull; /* Convert XKPHYS to physical. */
+
+	host_addr = guest_flat_to_host(vcpu->kvm, addr);
+	if (!host_addr) {
+		pr_warning("hypercall_write_cons unmapped physaddr %llx", (unsigned long long)addr);
+		return false;
+	}
+
+	if ((len <= 0) || !host_ptr_in_ram(vcpu->kvm, host_addr + len)) {
+		pr_warning("hypercall_write_cons len out of range <%d>", len);
+		return false;
+	}
+
+	term_putc(host_addr, len, term);
+
+	return true;
+}
+
+#define KVM_HC_MIPS_CONSOLE_OUTPUT 8
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+	switch (vcpu->kvm_run->exit_reason) {
+	case KVM_EXIT_HYPERCALL:
+		if (vcpu->kvm_run->hypercall.nr == KVM_HC_MIPS_CONSOLE_OUTPUT) {
+			return kvm_cpu__hypercall_write_cons(vcpu);
+		} else {
+			pr_warning("KVM_EXIT_HYPERCALL unrecognized call %llu",
+				   (unsigned long long)vcpu->kvm_run->hypercall.nr);
+			return false;
+		}
+	case KVM_EXIT_EXCEPTION:
+	case KVM_EXIT_INTERNAL_ERROR:
+		return false;
+	default:
+		break;
+	}
+	return false;
+}
+
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+}
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+	struct kvm_regs regs;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &regs) < 0)
+		die("KVM_GET_REGS failed");
+	dprintf(debug_fd, "\n Registers:\n");
+	dprintf(debug_fd,   " ----------\n");
+	dprintf(debug_fd, "$0   : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[0],
+		(unsigned long long)regs.gpr[1],
+		(unsigned long long)regs.gpr[2],
+		(unsigned long long)regs.gpr[3]);
+	dprintf(debug_fd, "$4   : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[4],
+		(unsigned long long)regs.gpr[5],
+		(unsigned long long)regs.gpr[6],
+		(unsigned long long)regs.gpr[7]);
+	dprintf(debug_fd, "$8   : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[8],
+		(unsigned long long)regs.gpr[9],
+		(unsigned long long)regs.gpr[10],
+		(unsigned long long)regs.gpr[11]);
+	dprintf(debug_fd, "$12  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[12],
+		(unsigned long long)regs.gpr[13],
+		(unsigned long long)regs.gpr[14],
+		(unsigned long long)regs.gpr[15]);
+	dprintf(debug_fd, "$16  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[16],
+		(unsigned long long)regs.gpr[17],
+		(unsigned long long)regs.gpr[18],
+		(unsigned long long)regs.gpr[19]);
+	dprintf(debug_fd, "$20  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[20],
+		(unsigned long long)regs.gpr[21],
+		(unsigned long long)regs.gpr[22],
+		(unsigned long long)regs.gpr[23]);
+	dprintf(debug_fd, "$24  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[24],
+		(unsigned long long)regs.gpr[25],
+		(unsigned long long)regs.gpr[26],
+		(unsigned long long)regs.gpr[27]);
+	dprintf(debug_fd, "$28  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[28],
+		(unsigned long long)regs.gpr[29],
+		(unsigned long long)regs.gpr[30],
+		(unsigned long long)regs.gpr[31]);
+
+	dprintf(debug_fd, "hi   : %016llx\n", (unsigned long long)regs.hi);
+	dprintf(debug_fd, "lo   : %016llx\n", (unsigned long long)regs.lo);
+	dprintf(debug_fd, "epc  : %016llx\n", (unsigned long long)regs.pc);
+
+	dprintf(debug_fd, "\n");
+}
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+}
+
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+}
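The address masks in kvm_cpu__hypercall_write_cons() implement the standard MIPS virtual-to-physical translation for the unmapped segments. A self-contained sketch of the same logic, not part of the patch:

	#include <stdint.h>

	/* Translate a KSEG0/KSEG1 (sign-extended 32-bit) or XKPHYS (64-bit)
	 * virtual address to a physical address; other addresses are
	 * returned unchanged. */
	static uint64_t mips_unmapped_to_phys(uint64_t addr)
	{
		if ((addr & 0xffffffffc0000000ull) == 0xffffffff80000000ull)
			return addr & 0x1ffffffful;		/* KSEG0/1: low 512MB */
		if ((addr & 0xc000000000000000ull) == 0x8000000000000000ull)
			return addr & 0x07ffffffffffffffull;	/* XKPHYS: drop top bits */
		return addr;
	}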
diff --git a/tools/kvm/mips/kvm.c b/tools/kvm/mips/kvm.c
new file mode 100644
index 0000000..fc0428b
--- /dev/null
+++ b/tools/kvm/mips/kvm.c
@@ -0,0 +1,328 @@
+#include "kvm/kvm.h"
+#include "kvm/ioport.h"
+#include "kvm/virtio-console.h"
+
+#include <linux/kvm.h>
+
+#include <ctype.h>
+#include <unistd.h>
+#include <elf.h>
+
+struct kvm_ext kvm_req_ext[] = {
+	{ 0, 0 }
+};
+
+void kvm__arch_read_term(struct kvm *kvm)
+{
+	virtio_console__inject_interrupt(kvm);
+}
+
+void kvm__init_ram(struct kvm *kvm)
+{
+	u64	phys_start, phys_size;
+	void	*host_mem;
+
+	phys_start = 0;
+	phys_size  = kvm->ram_size;
+	host_mem   = kvm->ram_start;
+
+	kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+}
+
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+	munmap(kvm->ram_start, kvm->ram_size);
+}
+
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+
+}
+
+/* Architecture-specific KVM init */
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+	int ret;
+
+	kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size);
+	kvm->ram_size = ram_size;
+
+	if (kvm->ram_start == MAP_FAILED)
+		die("out of memory");
+
+	madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);
+
+	ret = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
+	if (ret < 0)
+		die_perror("KVM_CREATE_IRQCHIP ioctl");
+}
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+	struct kvm_irq_level irq_level;
+	int ret;
+
+	irq_level.irq = irq;
+	irq_level.level = level ? 1 : 0;
+
+	ret = ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level);
+	if (ret < 0)
+		die_perror("KVM_IRQ_LINE ioctl");
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+	struct kvm_irq_level irq_level;
+	int ret;
+
+	irq_level.irq = irq;
+	irq_level.level = 1;
+
+	ret = ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level);
+	if (ret < 0)
+		die_perror("KVM_IRQ_LINE ioctl");
+}
+
+void ioport__setup_arch(struct kvm *kvm)
+{
+}
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+	return true;
+}
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+	return false;
+}
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+	return 0;
+}
+
+static void kvm__mips_install_cmdline(struct kvm *kvm)
+{
+	char *p = kvm->ram_start;
+	u64 cmdline_offset = 0x2000;
+	u64 argv_start = 0x3000;
+	u64 argv_offset = argv_start;
+	u64 argc = 0;
+
+	sprintf(p + cmdline_offset, "mem=0x%llx@0 ",
+		 (unsigned long long)kvm->ram_size);
+
+	strcat(p + cmdline_offset, kvm->cfg.real_cmdline); /* maximum size is 2K */
+
+	while (p[cmdline_offset]) {
+		if (!isspace(p[cmdline_offset])) {
+			if (kvm->arch.is64bit) {
+				*(u64 *)(p + argv_offset) = 0xffffffff80000000ull + cmdline_offset;
+				argv_offset += sizeof(u64);
+			} else {
+				*(u32 *)(p + argv_offset) = 0x80000000u + cmdline_offset;
+				argv_offset += sizeof(u32);
+			}
+			argc++;
+			while (p[cmdline_offset] && !isspace(p[cmdline_offset]))
+				cmdline_offset++;
+			continue;
+		}
+		/* Must be a space character; skip over these. */
+		while (p[cmdline_offset] && isspace(p[cmdline_offset])) {
+			p[cmdline_offset] = 0;
+			cmdline_offset++;
+		}
+	}
+	kvm->arch.argc = argc;
+	kvm->arch.argv = 0xffffffff80000000ull + argv_start;
+}
+
+/* Load at the 16M point. */
+#define KERNEL_LOAD_ADDR 0x1000000
+int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline)
+{
+	void *p;
+	void *k_start;
+	int nr;
+
+	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	p = k_start = guest_flat_to_host(kvm, KERNEL_LOAD_ADDR);
+
+	while ((nr = read(fd_kernel, p, 65536)) > 0)
+		p += nr;
+
+	kvm->arch.is64bit = true;
+	kvm->arch.entry_point = 0xffffffff81000000ull;
+
+	pr_info("Loaded kernel to 0x%x (%ld bytes)", KERNEL_LOAD_ADDR, (long int)(p - k_start));
+
+	return true;
+}
+
+struct kvm__arch_elf_info {
+	u64 load_addr;
+	u64 entry_point;
+	size_t len;
+	size_t offset;
+};
+
+static bool kvm__arch_get_elf_64_info(Elf64_Ehdr *ehdr, int fd_kernel,
+				      struct kvm__arch_elf_info *ei)
+{
+	int i;
+	size_t nr;
+	Elf64_Phdr phdr;
+
+	if (ehdr->e_phentsize != sizeof(phdr)) {
+		pr_info("Incompatible ELF PHENTSIZE %d", ehdr->e_phentsize);
+		return false;
+	}
+
+	ei->entry_point = ehdr->e_entry;
+
+	if (lseek(fd_kernel, ehdr->e_phoff, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	phdr.p_type = PT_NULL;
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		nr = read(fd_kernel, &phdr, sizeof(phdr));
+		if (nr != sizeof(phdr)) {
+			pr_info("Couldn't read %d bytes for ELF PHDR.", (int)sizeof(phdr));
+			return false;
+		}
+		if (phdr.p_type == PT_LOAD)
+			break;
+	}
+	if (phdr.p_type != PT_LOAD) {
+		pr_info("No PT_LOAD Program Header found.");
+		return false;
+	}
+
+	ei->load_addr = phdr.p_paddr;
+
+	if ((ei->load_addr & 0xffffffffc0000000ull) == 0xffffffff80000000ull)
+		ei->load_addr &= 0x1ffffffful; /* Convert KSEG{0,1} to physical. */
+	if ((ei->load_addr & 0xc000000000000000ull) == 0x8000000000000000ull)
+		ei->load_addr &= 0x07ffffffffffffffull; /* Convert XKPHYS to physical. */
+
+
+	ei->len = phdr.p_filesz;
+	ei->offset = phdr.p_offset;
+
+	return true;
+}
+
+static bool kvm__arch_get_elf_32_info(Elf32_Ehdr *ehdr, int fd_kernel,
+				      struct kvm__arch_elf_info *ei)
+{
+	int i;
+	size_t nr;
+	Elf32_Phdr phdr;
+
+	if (ehdr->e_phentsize != sizeof(phdr)) {
+		pr_info("Incompatible ELF PHENTSIZE %d", ehdr->e_phentsize);
+		return false;
+	}
+
+	ei->entry_point = (s64)((s32)ehdr->e_entry);
+
+	if (lseek(fd_kernel, ehdr->e_phoff, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	phdr.p_type = PT_NULL;
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		nr = read(fd_kernel, &phdr, sizeof(phdr));
+		if (nr != sizeof(phdr)) {
+			pr_info("Couldn't read %d bytes for ELF PHDR.", (int)sizeof(phdr));
+			return false;
+		}
+		if (phdr.p_type == PT_LOAD)
+			break;
+	}
+	if (phdr.p_type != PT_LOAD) {
+		pr_info("No PT_LOAD Program Header found.");
+		return false;
+	}
+
+	ei->load_addr = (s64)((s32)phdr.p_paddr);
+
+	if ((ei->load_addr & 0xffffffffc0000000ull) == 0xffffffff80000000ull)
+		ei->load_addr &= 0x1fffffffull; /* Convert KSEG{0,1} to physical. */
+
+	ei->len = phdr.p_filesz;
+	ei->offset = phdr.p_offset;
+
+	return true;
+}
+
+int load_elf_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline)
+{
+	union {
+		Elf64_Ehdr ehdr;
+		Elf32_Ehdr ehdr32;
+	} eh;
+
+	ssize_t nr;	/* signed: read() may return -1 */
+	char *p;
+	struct kvm__arch_elf_info ei;
+
+	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	nr = read(fd_kernel, &eh, sizeof(eh));
+	if (nr != sizeof(eh)) {
+		pr_info("Couldn't read %d bytes for ELF header.", (int)sizeof(eh));
+		return false;
+	}
+
+	if (eh.ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
+	    eh.ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
+	    eh.ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
+	    eh.ehdr.e_ident[EI_MAG3] != ELFMAG3 ||
+	    (eh.ehdr.e_ident[EI_CLASS] != ELFCLASS64 && eh.ehdr.e_ident[EI_CLASS] != ELFCLASS32) ||
+	    eh.ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
+		pr_info("Incompatible ELF header.");
+		return false;
+	}
+	if (eh.ehdr.e_type != ET_EXEC || eh.ehdr.e_machine != EM_MIPS) {
+		pr_info("Incompatible ELF not MIPS EXEC.");
+		return false;
+	}
+
+	if (eh.ehdr.e_ident[EI_CLASS] == ELFCLASS64) {
+		if (!kvm__arch_get_elf_64_info(&eh.ehdr, fd_kernel, &ei))
+			return false;
+		kvm->arch.is64bit = true;
+	} else {
+		if (!kvm__arch_get_elf_32_info(&eh.ehdr32, fd_kernel, &ei))
+			return false;
+		kvm->arch.is64bit = false;
+	}
+
+	kvm->arch.entry_point = ei.entry_point;
+
+	if (lseek(fd_kernel, ei.offset, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	p = guest_flat_to_host(kvm, ei.load_addr);
+
+	pr_info("ELF Loading 0x%lx bytes from 0x%llx to 0x%llx",
+		(unsigned long)ei.len, (unsigned long long)ei.offset, (unsigned long long)ei.load_addr);
+	do {
+		nr = read(fd_kernel, p, ei.len);
+		if (nr < 0)
+			die_perror("read");
+		p += nr;
+		ei.len -= nr;
+	} while (ei.len);
+
+	kvm__mips_install_cmdline(kvm);
+
+	return true;
+}
+
+void ioport__map_irq(u8 *irq)
+{
+}
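kvm__mips_install_cmdline() builds a Linux-style argc/argv in guest RAM: the command line string sits at offset 0x2000, the argv pointer array at 0x3000, and each pointer is a KSEG0 virtual address. For a hypothetical 16MB guest with an assumed user command line of "console=ttyS0", the resulting layout would be:

	0x2000: "mem=0x1000000@0\0console=ttyS0\0"	(words NUL-split in place)
	0x3000: u64[0] = 0xffffffff80002000		-> "mem=0x1000000@0"
	        u64[1] = 0xffffffff80002010		-> "console=ttyS0"

	kvm->arch.argc = 2
	kvm->arch.argv = 0xffffffff80003000		(passed in $a1; argc in $a0)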
diff --git a/tools/kvm/mmio.c b/tools/kvm/mmio.c
new file mode 100644
index 0000000..c648bec
--- /dev/null
+++ b/tools/kvm/mmio.c
@@ -0,0 +1,141 @@
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/rbtree-interval.h"
+#include "kvm/brlock.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/err.h>
+#include <errno.h>
+
+#define mmio_node(n) rb_entry(n, struct mmio_mapping, node)
+
+struct mmio_mapping {
+	struct rb_int_node	node;
+	void			(*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr);
+	void			*ptr;
+};
+
+static struct rb_root mmio_tree = RB_ROOT;
+
+static struct mmio_mapping *mmio_search(struct rb_root *root, u64 addr, u64 len)
+{
+	struct rb_int_node *node;
+
+	node = rb_int_search_range(root, addr, addr + len);
+	if (node == NULL)
+		return NULL;
+
+	return mmio_node(node);
+}
+
+/* Find the lowest match; check for overlap. */
+static struct mmio_mapping *mmio_search_single(struct rb_root *root, u64 addr)
+{
+	struct rb_int_node *node;
+
+	node = rb_int_search_single(root, addr);
+	if (node == NULL)
+		return NULL;
+
+	return mmio_node(node);
+}
+
+static int mmio_insert(struct rb_root *root, struct mmio_mapping *data)
+{
+	return rb_int_insert(root, &data->node);
+}
+
+static const char *to_direction(u8 is_write)
+{
+	if (is_write)
+		return "write";
+
+	return "read";
+}
+
+int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce,
+		       void (*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr),
+			void *ptr)
+{
+	struct mmio_mapping *mmio;
+	struct kvm_coalesced_mmio_zone zone;
+	int ret;
+
+	mmio = malloc(sizeof(*mmio));
+	if (mmio == NULL)
+		return -ENOMEM;
+
+	*mmio = (struct mmio_mapping) {
+		.node = RB_INT_INIT(phys_addr, phys_addr + phys_addr_len),
+		.mmio_fn = mmio_fn,
+		.ptr	= ptr,
+	};
+
+	if (coalesce) {
+		zone = (struct kvm_coalesced_mmio_zone) {
+			.addr	= phys_addr,
+			.size	= phys_addr_len,
+		};
+		ret = ioctl(kvm->vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone);
+		if (ret < 0) {
+			free(mmio);
+			return -errno;
+		}
+	}
+	br_write_lock(kvm);
+	ret = mmio_insert(&mmio_tree, mmio);
+	br_write_unlock(kvm);
+
+	return ret;
+}
+
+bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr)
+{
+	struct mmio_mapping *mmio;
+	struct kvm_coalesced_mmio_zone zone;
+
+	br_write_lock(kvm);
+	mmio = mmio_search_single(&mmio_tree, phys_addr);
+	if (mmio == NULL) {
+		br_write_unlock(kvm);
+		return false;
+	}
+
+	zone = (struct kvm_coalesced_mmio_zone) {
+		.addr	= phys_addr,
+		.size	= 1,
+	};
+	ioctl(kvm->vm_fd, KVM_UNREGISTER_COALESCED_MMIO, &zone);
+
+	rb_int_erase(&mmio_tree, &mmio->node);
+	br_write_unlock(kvm);
+
+	free(mmio);
+	return true;
+}
+
+bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+	struct mmio_mapping *mmio;
+
+	br_read_lock();
+	mmio = mmio_search(&mmio_tree, phys_addr, len);
+
+	if (mmio)
+		mmio->mmio_fn(vcpu, phys_addr, data, len, is_write, mmio->ptr);
+	else {
+		if (vcpu->kvm->cfg.mmio_debug)
+			fprintf(stderr,	"Warning: Ignoring MMIO %s at %016llx (length %u)\n",
+				to_direction(is_write),
+				(unsigned long long)phys_addr, len);
+	}
+	br_read_unlock();
+
+	return true;
+}
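Callers reach the interval tree through kvm__register_mmio(). A hypothetical device registration, where foo_mmio, FOO_BASE and struct foo_device are illustrative names only and not part of this patch:

	static void foo_mmio(struct kvm_cpu *vcpu, u64 addr, u8 *data,
			     u32 len, u8 is_write, void *ptr)
	{
		struct foo_device *foo = ptr;

		/* decode (addr - FOO_BASE) and service the access */
	}

	/* during device init: claim a 4K region, no coalescing */
	r = kvm__register_mmio(kvm, FOO_BASE, 0x1000, false, foo_mmio, foo);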
diff --git a/tools/kvm/net/uip/arp.c b/tools/kvm/net/uip/arp.c
new file mode 100644
index 0000000..98423da
--- /dev/null
+++ b/tools/kvm/net/uip/arp.c
@@ -0,0 +1,30 @@
+#include "kvm/uip.h"
+
+int uip_tx_do_arp(struct uip_tx_arg *arg)
+{
+	struct uip_arp *arp, *arp2;
+	struct uip_info *info;
+	struct uip_buf *buf;
+
+	info = arg->info;
+	buf = uip_buf_clone(arg);
+
+	arp	 = (struct uip_arp *)(arg->eth);
+	arp2	 = (struct uip_arp *)(buf->eth);
+
+	/*
+	 * ARP reply opcode: 2
+	 */
+	arp2->op   = htons(0x2);
+	arp2->dmac = arp->smac;
+	arp2->dip  = arp->sip;
+
+	if (arp->dip == htonl(info->host_ip)) {
+		arp2->smac = info->host_mac;
+		arp2->sip = htonl(info->host_ip);
+
+		uip_buf_set_used(info, buf);
+	}
+
+	return 0;
+}
diff --git a/tools/kvm/net/uip/buf.c b/tools/kvm/net/uip/buf.c
new file mode 100644
index 0000000..f29ad41
--- /dev/null
+++ b/tools/kvm/net/uip/buf.c
@@ -0,0 +1,114 @@
+#include "kvm/uip.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+struct uip_buf *uip_buf_get_used(struct uip_info *info)
+{
+	struct uip_buf *buf;
+	bool found = false;
+
+	mutex_lock(&info->buf_lock);
+
+	while (!(info->buf_used_nr > 0))
+		pthread_cond_wait(&info->buf_used_cond, &info->buf_lock.mutex);
+
+	list_for_each_entry(buf, &info->buf_head, list) {
+		if (buf->status == UIP_BUF_STATUS_USED) {
+			/*
+			 * Set status to INUSE immediately to prevent
+			 * someone else from taking this buf until we are done with it
+			 */
+			buf->status = UIP_BUF_STATUS_INUSE;
+			info->buf_used_nr--;
+			found = true;
+			break;
+		}
+	}
+
+	mutex_unlock(&info->buf_lock);
+
+	return found ? buf : NULL;
+}
+
+struct uip_buf *uip_buf_get_free(struct uip_info *info)
+{
+	struct uip_buf *buf;
+	bool found = false;
+
+	mutex_lock(&info->buf_lock);
+
+	while (!(info->buf_free_nr > 0))
+		pthread_cond_wait(&info->buf_free_cond, &info->buf_lock.mutex);
+
+	list_for_each_entry(buf, &info->buf_head, list) {
+		if (buf->status == UIP_BUF_STATUS_FREE) {
+			/*
+			 * Set status to INUSE immediately to prevent
+			 * someone else from taking this buf until we are done with it
+			 */
+			buf->status = UIP_BUF_STATUS_INUSE;
+			info->buf_free_nr--;
+			found = true;
+			break;
+		}
+	}
+
+	mutex_unlock(&info->buf_lock);
+
+	return found ? buf : NULL;
+}
+
+struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf)
+{
+	mutex_lock(&info->buf_lock);
+
+	buf->status = UIP_BUF_STATUS_USED;
+	info->buf_used_nr++;
+	pthread_cond_signal(&info->buf_used_cond);
+
+	mutex_unlock(&info->buf_lock);
+
+	return buf;
+}
+
+struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf)
+{
+	mutex_lock(&info->buf_lock);
+
+	buf->status = UIP_BUF_STATUS_FREE;
+	info->buf_free_nr++;
+	pthread_cond_signal(&info->buf_free_cond);
+
+	mutex_unlock(&info->buf_lock);
+
+	return buf;
+}
+
+struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg)
+{
+	struct uip_buf *buf;
+	struct uip_eth *eth2;
+	struct uip_info *info;
+
+	info = arg->info;
+
+	/*
+	 * Get buffer from device to guest
+	 */
+	buf = uip_buf_get_free(info);
+
+	/*
+	 * Clone buffer
+	 */
+	memcpy(buf->vnet, arg->vnet, arg->vnet_len);
+	memcpy(buf->eth, arg->eth, arg->eth_len);
+	buf->vnet_len	= arg->vnet_len;
+	buf->eth_len	= arg->eth_len;
+
+	eth2		= (struct uip_eth *)buf->eth;
+	eth2->src	= info->host_mac;
+	eth2->dst	= arg->eth->src;
+
+	return buf;
+}
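The three buffer states form a simple hand-off protocol between the socket threads and uip_rx(). A sketch of one device-to-guest round trip using the functions above:

	producer (socket thread)			consumer (uip_rx)
	------------------------			-----------------
	buf = uip_buf_get_free(info);	FREE->INUSE
	fill buf->vnet, buf->eth
	uip_buf_set_used(info, buf);	INUSE->USED	(signals buf_used_cond)
						buf = uip_buf_get_used(info);	USED->INUSE
						copy into guest iovec
						uip_buf_set_free(info, buf);	INUSE->FREE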
diff --git a/tools/kvm/net/uip/core.c b/tools/kvm/net/uip/core.c
new file mode 100644
index 0000000..e860f3a
--- /dev/null
+++ b/tools/kvm/net/uip/core.c
@@ -0,0 +1,156 @@
+#include "kvm/mutex.h"
+#include "kvm/uip.h"
+
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <kvm/iovec.h>
+
+int uip_tx(struct iovec *iov, u16 out, struct uip_info *info)
+{
+	void *vnet;
+	struct uip_tx_arg arg;
+	int eth_len, vnet_len;
+	struct uip_eth *eth;
+	u8 *buf = NULL;
+	u16 proto;
+	int i;
+
+	/*
+	 * Buffer from guest to device
+	 */
+	vnet_len = iov[0].iov_len;
+	vnet	 = iov[0].iov_base;
+
+	eth_len	 = iov[1].iov_len;
+	eth	 = iov[1].iov_base;
+
+	/*
+	 * In case the Ethernet frame spans more than one iov entry,
+	 * copy the iov buffers into one linear buffer.
+	 */
+	if (out > 2) {
+		eth_len = 0;
+		for (i = 1; i < out; i++)
+			eth_len += iov[i].iov_len;
+
+		buf = malloc(eth_len);
+		if (!buf)
+			return -ENOMEM;
+
+		eth = (struct uip_eth *)buf;
+		for (i = 1; i < out; i++) {
+			memcpy(buf, iov[i].iov_base, iov[i].iov_len);
+			buf += iov[i].iov_len;
+		}
+	}
+
+	memset(&arg, 0, sizeof(arg));
+
+	arg.vnet_len = vnet_len;
+	arg.eth_len = eth_len;
+	arg.info = info;
+	arg.vnet = vnet;
+	arg.eth = eth;
+
+	/*
+	 * Check packet type
+	 */
+	proto = ntohs(eth->type);
+
+	switch (proto) {
+	case UIP_ETH_P_ARP:
+		uip_tx_do_arp(&arg);
+		break;
+	case UIP_ETH_P_IP:
+		uip_tx_do_ipv4(&arg);
+		break;
+	default:
+		break;
+	}
+
+	if (out > 2 && buf)
+		free(eth);	/* eth aliases the start of the linear buffer */
+
+	return vnet_len + eth_len;
+}
+
+int uip_rx(struct iovec *iov, u16 in, struct uip_info *info)
+{
+	struct uip_buf *buf;
+	int len;
+
+	/*
+	 * Sleep until there is a buffer for guest
+	 */
+	buf = uip_buf_get_used(info);
+
+	memcpy_toiovecend(iov, buf->vnet, 0, buf->vnet_len);
+	memcpy_toiovecend(iov, buf->eth, buf->vnet_len, buf->eth_len);
+
+	len = buf->vnet_len + buf->eth_len;
+
+	uip_buf_set_free(info, buf);
+	return len;
+}
+
+void uip_static_init(struct uip_info *info)
+{
+	struct list_head *udp_socket_head;
+	struct list_head *tcp_socket_head;
+	struct list_head *buf_head;
+
+	udp_socket_head	= &info->udp_socket_head;
+	tcp_socket_head	= &info->tcp_socket_head;
+	buf_head	= &info->buf_head;
+
+	INIT_LIST_HEAD(udp_socket_head);
+	INIT_LIST_HEAD(tcp_socket_head);
+	INIT_LIST_HEAD(buf_head);
+
+	mutex_init(&info->udp_socket_lock);
+	mutex_init(&info->tcp_socket_lock);
+	mutex_init(&info->buf_lock);
+
+	pthread_cond_init(&info->buf_used_cond, NULL);
+	pthread_cond_init(&info->buf_free_cond, NULL);
+
+	info->buf_used_nr = 0;
+}
+
+int uip_init(struct uip_info *info)
+{
+	struct list_head *buf_head;
+	struct uip_buf *buf;
+	int buf_nr;
+	int i;
+
+	buf_head	= &info->buf_head;
+	buf_nr		= info->buf_nr;
+
+	for (i = 0; i < buf_nr; i++) {
+		buf = malloc(sizeof(*buf));
+		memset(buf, 0, sizeof(*buf));
+
+		buf->status	= UIP_BUF_STATUS_FREE;
+		buf->info	= info;
+		buf->id		= i;
+		list_add_tail(&buf->list, buf_head);
+	}
+
+	list_for_each_entry(buf, buf_head, list) {
+		buf->vnet_len   = info->vnet_hdr_len;
+		buf->vnet	= malloc(buf->vnet_len);
+		buf->eth_len    = 1024*64 + sizeof(struct uip_pseudo_hdr);
+		buf->eth	= malloc(buf->eth_len);
+
+		memset(buf->vnet, 0, buf->vnet_len);
+		memset(buf->eth, 0, buf->eth_len);
+	}
+
+	info->buf_free_nr = buf_nr;
+
+	uip_dhcp_get_dns(info);
+
+	return 0;
+}
diff --git a/tools/kvm/net/uip/csum.c b/tools/kvm/net/uip/csum.c
new file mode 100644
index 0000000..7ca8bad
--- /dev/null
+++ b/tools/kvm/net/uip/csum.c
@@ -0,0 +1,92 @@
+#include "kvm/uip.h"
+
+static u16 uip_csum(u16 csum, u8 *addr, u16 count)
+{
+	long sum = csum;
+
+	while (count > 1) {
+		sum	+= *(u16 *)addr;
+		addr	+= 2;
+		count	-= 2;
+	}
+
+	if (count > 0)
+		sum += *(unsigned char *)addr;
+
+	while (sum>>16)
+		sum = (sum & 0xffff) + (sum >> 16);
+
+	return ~sum;
+}
+
+u16 uip_csum_ip(struct uip_ip *ip)
+{
+	return uip_csum(0, &ip->vhl, uip_ip_hdrlen(ip));
+}
+
+u16 uip_csum_icmp(struct uip_icmp *icmp)
+{
+	struct uip_ip *ip;
+
+	ip = &icmp->ip;
+	return icmp->csum = uip_csum(0, &icmp->type, htons(ip->len) - uip_ip_hdrlen(ip) - 8); /* icmp header len = 8 */
+}
+
+u16 uip_csum_udp(struct uip_udp *udp)
+{
+	struct uip_pseudo_hdr hdr;
+	struct uip_ip *ip;
+	int udp_len;
+	u8 *pad;
+
+	ip	  = &udp->ip;
+
+	hdr.sip   = ip->sip;
+	hdr.dip	  = ip->dip;
+	hdr.zero  = 0;
+	hdr.proto = ip->proto;
+	hdr.len   = udp->len;
+
+	udp_len	  = uip_udp_len(udp);
+
+	if (udp_len % 2) {
+		pad = (u8 *)&udp->sport + udp_len;
+		*pad = 0;
+		memcpy((u8 *)&udp->sport + udp_len + 1, &hdr, sizeof(hdr));
+		return uip_csum(0, (u8 *)&udp->sport, udp_len + 1 + sizeof(hdr));
+	} else {
+		memcpy((u8 *)&udp->sport + udp_len, &hdr, sizeof(hdr));
+		return uip_csum(0, (u8 *)&udp->sport, udp_len + sizeof(hdr));
+	}
+
+}
+
+u16 uip_csum_tcp(struct uip_tcp *tcp)
+{
+	struct uip_pseudo_hdr hdr;
+	struct uip_ip *ip;
+	u16 tcp_len;
+	u8 *pad;
+
+	ip	  = &tcp->ip;
+	tcp_len   = ntohs(ip->len) - uip_ip_hdrlen(ip);
+
+	hdr.sip   = ip->sip;
+	hdr.dip	  = ip->dip;
+	hdr.zero  = 0;
+	hdr.proto = ip->proto;
+	hdr.len   = htons(tcp_len);
+
+	if (tcp_len > UIP_MAX_TCP_PAYLOAD + 20)
+		pr_warning("tcp_len(%d) is too large", tcp_len);
+
+	if (tcp_len % 2) {
+		pad = (u8 *)&tcp->sport + tcp_len;
+		*pad = 0;
+		memcpy((u8 *)&tcp->sport + tcp_len + 1, &hdr, sizeof(hdr));
+		return uip_csum(0, (u8 *)&tcp->sport, tcp_len + 1 + sizeof(hdr));
+	} else {
+		memcpy((u8 *)&tcp->sport + tcp_len, &hdr, sizeof(hdr));
+		return uip_csum(0, (u8 *)&tcp->sport, tcp_len + sizeof(hdr));
+	}
+}
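uip_csum() is the standard RFC 1071 ones'-complement sum. A standalone, compilable sketch with the RFC's own example data; note the printed value is byte-order dependent (on a little-endian host it is the byte-swapped form of the big-endian result 0x220d):

	#include <stdint.h>
	#include <stdio.h>

	static uint16_t csum16(const uint8_t *p, int count)
	{
		uint32_t sum = 0;

		while (count > 1) {		/* sum 16-bit words */
			sum += *(const uint16_t *)p;
			p += 2;
			count -= 2;
		}
		if (count > 0)			/* odd trailing byte */
			sum += *p;
		while (sum >> 16)		/* fold carries back in */
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)~sum;
	}

	int main(void)
	{
		/* RFC 1071 example bytes: 00 01 f2 03 f4 f5 f6 f7 */
		uint8_t d[] = { 0x00, 0x01, 0xf2, 0x03, 0xf4, 0xf5, 0xf6, 0xf7 };

		printf("0x%04x\n", csum16(d, sizeof(d)));
		return 0;
	}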
diff --git a/tools/kvm/net/uip/dhcp.c b/tools/kvm/net/uip/dhcp.c
new file mode 100644
index 0000000..b17d352
--- /dev/null
+++ b/tools/kvm/net/uip/dhcp.c
@@ -0,0 +1,202 @@
+#include "kvm/uip.h"
+
+#include <arpa/inet.h>
+
+#define EMPTY_ADDR "0.0.0.0"
+
+static inline bool uip_dhcp_is_discovery(struct uip_dhcp *dhcp)
+{
+	return (dhcp->option[2] == UIP_DHCP_DISCOVER &&
+		dhcp->option[1] == UIP_DHCP_TAG_MSG_TYPE_LEN &&
+		dhcp->option[0] == UIP_DHCP_TAG_MSG_TYPE);
+}
+
+static inline bool uip_dhcp_is_request(struct uip_dhcp *dhcp)
+{
+	return (dhcp->option[2] == UIP_DHCP_REQUEST &&
+		dhcp->option[1] == UIP_DHCP_TAG_MSG_TYPE_LEN &&
+		dhcp->option[0] == UIP_DHCP_TAG_MSG_TYPE);
+}
+
+bool uip_udp_is_dhcp(struct uip_udp *udp)
+{
+	struct uip_dhcp *dhcp;
+
+	if (ntohs(udp->sport) != UIP_DHCP_PORT_CLIENT ||
+	    ntohs(udp->dport) != UIP_DHCP_PORT_SERVER)
+		return false;
+
+	dhcp = (struct uip_dhcp *)udp;
+
+	if (ntohl(dhcp->magic_cookie) != UIP_DHCP_MAGIC_COOKIE)
+		return false;
+
+	return true;
+}
+
+int uip_dhcp_get_dns(struct uip_info *info)
+{
+	char key[256], val[256];
+	struct in_addr addr;
+	int ret = -1;
+	int n = 0;
+	FILE *fp;
+	u32 ip;
+
+	fp = fopen("/etc/resolv.conf", "r");
+	if (!fp)
+		return ret;
+
+	while (!feof(fp)) {
+		if (fscanf(fp, "%s %s\n", key, val) != 2)
+			continue;
+		if (strncmp("domain", key, 6) == 0)
+			info->domain_name = strndup(val, UIP_DHCP_MAX_DOMAIN_NAME_LEN);
+		else if (strncmp("nameserver", key, 10) == 0) {
+			if (!inet_aton(val, &addr))
+				continue;
+			ip = ntohl(addr.s_addr);
+			if (n < UIP_DHCP_MAX_DNS_SERVER_NR)
+				info->dns_ip[n++] = ip;
+			ret = 0;
+		}
+	}
+
+	fclose(fp);
+	return ret;
+}
+
+static int uip_dhcp_fill_option_name_and_server(struct uip_info *info, u8 *opt, int i)
+{
+	u8 domain_name_len;
+	u32 *addr;
+	int n;
+
+	if (info->domain_name) {
+		domain_name_len	= strlen(info->domain_name);
+		opt[i++]	= UIP_DHCP_TAG_DOMAIN_NAME;
+		opt[i++]	= domain_name_len;
+		memcpy(&opt[i], info->domain_name, domain_name_len);
+		i		+= domain_name_len;
+	}
+
+	for (n = 0; n < UIP_DHCP_MAX_DNS_SERVER_NR; n++) {
+		if (info->dns_ip[n] == 0)
+			continue;
+		opt[i++]	= UIP_DHCP_TAG_DNS_SERVER;
+		opt[i++]	= UIP_DHCP_TAG_DNS_SERVER_LEN;
+		addr		= (u32 *)&opt[i];
+		*addr		= htonl(info->dns_ip[n]);
+		i		+= UIP_DHCP_TAG_DNS_SERVER_LEN;
+	}
+
+	return i;
+}
+static int uip_dhcp_fill_option(struct uip_info *info, struct uip_dhcp *dhcp, int reply_msg_type)
+{
+	int i = 0;
+	u32 *addr;
+	u8 *opt;
+
+	opt		= dhcp->option;
+
+	opt[i++]	= UIP_DHCP_TAG_MSG_TYPE;
+	opt[i++]	= UIP_DHCP_TAG_MSG_TYPE_LEN;
+	opt[i++]	= reply_msg_type;
+
+	opt[i++]	= UIP_DHCP_TAG_SERVER_ID;
+	opt[i++]	= UIP_DHCP_TAG_SERVER_ID_LEN;
+	addr		= (u32 *)&opt[i];
+	*addr		= htonl(info->host_ip);
+	i		+= UIP_DHCP_TAG_SERVER_ID_LEN;
+
+	opt[i++]	= UIP_DHCP_TAG_LEASE_TIME;
+	opt[i++]	= UIP_DHCP_TAG_LEASE_TIME_LEN;
+	addr		= (u32 *)&opt[i];
+	*addr		= htonl(UIP_DHCP_LEASE_TIME);
+	i		+= UIP_DHCP_TAG_LEASE_TIME_LEN;
+
+	opt[i++]	= UIP_DHCP_TAG_SUBMASK;
+	opt[i++]	= UIP_DHCP_TAG_SUBMASK_LEN;
+	addr		= (u32 *)&opt[i];
+	*addr		= htonl(info->guest_netmask);
+	i		+= UIP_DHCP_TAG_SUBMASK_LEN;
+
+	opt[i++]	= UIP_DHCP_TAG_ROUTER;
+	opt[i++]	= UIP_DHCP_TAG_ROUTER_LEN;
+	addr		= (u32 *)&opt[i];
+	*addr		= htonl(info->host_ip);
+	i		+= UIP_DHCP_TAG_ROUTER_LEN;
+
+	opt[i++]	= UIP_DHCP_TAG_ROOT;
+	opt[i++]	= strlen(EMPTY_ADDR);
+	addr		= (u32 *)&opt[i];
+	strncpy((void *) addr, EMPTY_ADDR, strlen(EMPTY_ADDR));
+	i		+= strlen(EMPTY_ADDR);
+
+	i 		= uip_dhcp_fill_option_name_and_server(info, opt, i);
+
+	opt[i++]	= UIP_DHCP_TAG_END;
+
+	return 0;
+}
+
+static int uip_dhcp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 reply_msg_type)
+{
+	struct uip_dhcp *dhcp;
+
+	dhcp		= (struct uip_dhcp *)buf->eth;
+
+	dhcp->msg_type	= 2;
+	dhcp->client_ip	= 0;
+	dhcp->your_ip	= htonl(info->guest_ip);
+	dhcp->server_ip	= htonl(info->host_ip);
+	dhcp->agent_ip	= 0;
+
+	uip_dhcp_fill_option(info, dhcp, reply_msg_type);
+
+	sk->sip		= htonl(info->guest_ip);
+	sk->dip		= htonl(info->host_ip);
+	sk->sport	= htons(UIP_DHCP_PORT_CLIENT);
+	sk->dport	= htons(UIP_DHCP_PORT_SERVER);
+
+	return 0;
+}
+
+int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg)
+{
+	struct uip_udp_socket sk;
+	struct uip_dhcp *dhcp;
+	struct uip_info *info;
+	struct uip_buf *buf;
+	u8 reply_msg_type;
+
+	dhcp = (struct uip_dhcp *)arg->eth;
+
+	if (uip_dhcp_is_discovery(dhcp))
+		reply_msg_type = UIP_DHCP_OFFER;
+	else if (uip_dhcp_is_request(dhcp))
+		reply_msg_type = UIP_DHCP_ACK;
+	else
+		return -1;
+
+	buf = uip_buf_clone(arg);
+	info = arg->info;
+
+	/*
+	 * Cook DHCP pkg
+	 */
+	uip_dhcp_make_pkg(info, &sk, buf, reply_msg_type);
+
+	/*
+	 * Cook UDP pkg
+	 */
+	uip_udp_make_pkg(info, &sk, buf, NULL, UIP_DHCP_MAX_PAYLOAD_LEN);
+
+	/*
+	 * Send data received from socket to guest
+	 */
+	uip_buf_set_used(info, buf);
+
+	return 0;
+}
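uip_dhcp_fill_option() emits standard DHCP type/length/value triples terminated by an END tag. Assuming the UIP_DHCP_TAG_* constants carry the usual RFC 2132 values and a hypothetical host IP of 192.168.33.1, the start of the options area of an OFFER would look like:

	option:	53 01 02		message type = DHCPOFFER
		54 04 c0 a8 21 01	server identifier = 192.168.33.1
		51 04 xx xx xx xx	lease time (UIP_DHCP_LEASE_TIME)
		01 04 xx xx xx xx	subnet mask
		03 04 c0 a8 21 01	router = 192.168.33.1
		...
		ff			END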
diff --git a/tools/kvm/net/uip/icmp.c b/tools/kvm/net/uip/icmp.c
new file mode 100644
index 0000000..233297c
--- /dev/null
+++ b/tools/kvm/net/uip/icmp.c
@@ -0,0 +1,29 @@
+#include "kvm/uip.h"
+
+int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg)
+{
+	struct uip_ip *ip, *ip2;
+	struct uip_icmp *icmp2;
+	struct uip_buf *buf;
+
+	buf		= uip_buf_clone(arg);
+
+	icmp2		= (struct uip_icmp *)(buf->eth);
+	ip2		= (struct uip_ip *)(buf->eth);
+	ip		= (struct uip_ip *)(arg->eth);
+
+	ip2->sip	= ip->dip;
+	ip2->dip	= ip->sip;
+	ip2->csum	= 0;
+	/*
+	 * ICMP reply: 0
+	 */
+	icmp2->type	= 0;
+	icmp2->csum	= 0;
+	ip2->csum	= uip_csum_ip(ip2);
+	icmp2->csum	= uip_csum_icmp(icmp2);
+
+	uip_buf_set_used(arg->info, buf);
+
+	return 0;
+}
diff --git a/tools/kvm/net/uip/ipv4.c b/tools/kvm/net/uip/ipv4.c
new file mode 100644
index 0000000..58373fd
--- /dev/null
+++ b/tools/kvm/net/uip/ipv4.c
@@ -0,0 +1,29 @@
+#include "kvm/uip.h"
+
+int uip_tx_do_ipv4(struct uip_tx_arg *arg)
+{
+	struct uip_ip *ip;
+
+	ip = (struct uip_ip *)(arg->eth);
+
+	if (uip_ip_hdrlen(ip) != 20) {
+		pr_warning("IP header length is not 20 bytes");
+		return -1;
+	}
+
+	switch (ip->proto) {
+	case UIP_IP_P_ICMP:
+		uip_tx_do_ipv4_icmp(arg);
+		break;
+	case UIP_IP_P_TCP:
+		uip_tx_do_ipv4_tcp(arg);
+		break;
+	case UIP_IP_P_UDP:
+		uip_tx_do_ipv4_udp(arg);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
diff --git a/tools/kvm/net/uip/tcp.c b/tools/kvm/net/uip/tcp.c
new file mode 100644
index 0000000..3c30ade
--- /dev/null
+++ b/tools/kvm/net/uip/tcp.c
@@ -0,0 +1,348 @@
+#include "kvm/uip.h"
+
+#include <kvm/kvm.h>
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <arpa/inet.h>
+
+static int uip_tcp_socket_close(struct uip_tcp_socket *sk, int how)
+{
+	shutdown(sk->fd, how);
+
+	if (sk->write_done && sk->read_done) {
+		shutdown(sk->fd, SHUT_RDWR);
+		close(sk->fd);
+
+		mutex_lock(sk->lock);
+		list_del(&sk->list);
+		mutex_unlock(sk->lock);
+
+		free(sk);
+	}
+
+	return 0;
+}
+
+static struct uip_tcp_socket *uip_tcp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+	struct list_head *sk_head;
+	struct mutex *sk_lock;
+	struct uip_tcp_socket *sk;
+
+	sk_head = &arg->info->tcp_socket_head;
+	sk_lock = &arg->info->tcp_socket_lock;
+
+	mutex_lock(sk_lock);
+	list_for_each_entry(sk, sk_head, list) {
+		if (sk->sip == sip && sk->dip == dip && sk->sport == sport && sk->dport == dport) {
+			mutex_unlock(sk_lock);
+			return sk;
+		}
+	}
+	mutex_unlock(sk_lock);
+
+	return NULL;
+}
+
+static struct uip_tcp_socket *uip_tcp_socket_alloc(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+	struct list_head *sk_head;
+	struct uip_tcp_socket *sk;
+	struct mutex *sk_lock;
+	struct uip_tcp *tcp;
+	struct uip_ip *ip;
+	int ret;
+
+	tcp = (struct uip_tcp *)arg->eth;
+	ip = (struct uip_ip *)arg->eth;
+
+	sk_head = &arg->info->tcp_socket_head;
+	sk_lock = &arg->info->tcp_socket_lock;
+
+	sk = malloc(sizeof(*sk));
+	memset(sk, 0, sizeof(*sk));
+
+	sk->lock			= sk_lock;
+	sk->info			= arg->info;
+
+	sk->fd				= socket(AF_INET, SOCK_STREAM, 0);
+	sk->addr.sin_family		= AF_INET;
+	sk->addr.sin_port		= dport;
+	sk->addr.sin_addr.s_addr	= dip;
+
+	pthread_cond_init(&sk->cond, NULL);
+
+	if (ntohl(dip) == arg->info->host_ip)
+		sk->addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+
+	ret = connect(sk->fd, (struct sockaddr *)&sk->addr, sizeof(sk->addr));
+	if (ret) {
+		free(sk);
+		return NULL;
+	}
+
+	sk->sip		= ip->sip;
+	sk->dip		= ip->dip;
+	sk->sport	= tcp->sport;
+	sk->dport	= tcp->dport;
+
+	mutex_lock(sk_lock);
+	list_add_tail(&sk->list, sk_head);
+	mutex_unlock(sk_lock);
+
+	return sk;
+}
+
+static int uip_tcp_payload_send(struct uip_tcp_socket *sk, u8 flag, u16 payload_len)
+{
+	struct uip_info *info;
+	struct uip_eth *eth2;
+	struct uip_tcp *tcp2;
+	struct uip_buf *buf;
+	struct uip_ip *ip2;
+
+	info		= sk->info;
+
+	/*
+	 * Get free buffer to send data to guest
+	 */
+	buf		= uip_buf_get_free(info);
+
+	/*
+	 * Cook an Ethernet frame
+	 */
+	tcp2		= (struct uip_tcp *)buf->eth;
+	eth2		= (struct uip_eth *)buf->eth;
+	ip2		= (struct uip_ip *)buf->eth;
+
+	eth2->src	= info->host_mac;
+	eth2->dst	= info->guest_mac;
+	eth2->type	= htons(UIP_ETH_P_IP);
+
+	ip2->vhl	= UIP_IP_VER_4 | UIP_IP_HDR_LEN;
+	ip2->tos	= 0;
+	ip2->id		= 0;
+	ip2->flgfrag	= 0;
+	ip2->ttl	= UIP_IP_TTL;
+	ip2->proto	= UIP_IP_P_TCP;
+	ip2->csum	= 0;
+	ip2->sip	= sk->dip;
+	ip2->dip	= sk->sip;
+
+	tcp2->sport	= sk->dport;
+	tcp2->dport	= sk->sport;
+	tcp2->seq	= htonl(sk->seq_server);
+	tcp2->ack	= htonl(sk->ack_server);
+	/*
+	 * Disable TCP options; TCP header length equals 20 bytes
+	 */
+	tcp2->off	= UIP_TCP_HDR_LEN;
+	tcp2->flg	= flag;
+	tcp2->win	= htons(UIP_TCP_WIN_SIZE);
+	tcp2->csum	= 0;
+	tcp2->urgent	= 0;
+
+	if (payload_len > 0)
+		memcpy(uip_tcp_payload(tcp2), sk->payload, payload_len);
+
+	ip2->len	= htons(uip_tcp_hdrlen(tcp2) + payload_len + uip_ip_hdrlen(ip2));
+	ip2->csum	= uip_csum_ip(ip2);
+	tcp2->csum	= uip_csum_tcp(tcp2);
+
+	/*
+	 * virtio_net_hdr
+	 */
+	buf->vnet_len	= info->vnet_hdr_len;
+	memset(buf->vnet, 0, buf->vnet_len);
+
+	buf->eth_len	= ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
+
+	/*
+	 * Increase server seq
+	 */
+	sk->seq_server  += payload_len;
+
+	/*
+	 * Send data received from socket to guest
+	 */
+	uip_buf_set_used(info, buf);
+
+	return 0;
+}
+
+static void *uip_tcp_socket_thread(void *p)
+{
+	struct uip_tcp_socket *sk;
+	int len, left, ret;
+	u8 *payload, *pos;
+
+	kvm__set_thread_name("uip-tcp");
+
+	sk = p;
+
+	payload = malloc(UIP_MAX_TCP_PAYLOAD);
+	if (!payload)
+		goto out;
+
+	while (1) {
+		pos = payload;
+
+		ret = read(sk->fd, payload, UIP_MAX_TCP_PAYLOAD);
+
+		if (ret <= 0 || ret > UIP_MAX_TCP_PAYLOAD)
+			goto out;
+
+		left = ret;
+
+		while (left > 0) {
+			mutex_lock(sk->lock);
+			while ((len = sk->guest_acked + sk->window_size - sk->seq_server) <= 0)
+				pthread_cond_wait(&sk->cond, &sk->lock->mutex);
+			mutex_unlock(sk->lock);
+
+			sk->payload = pos;
+			if (len > left)
+				len = left;
+			if (len > UIP_MAX_TCP_PAYLOAD)
+				len = UIP_MAX_TCP_PAYLOAD;
+			left -= len;
+			pos += len;
+
+			uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, len);
+		}
+	}
+
+out:
+	/*
+	 * Close server to guest TCP connection
+	 */
+	uip_tcp_socket_close(sk, SHUT_RD);
+
+	uip_tcp_payload_send(sk, UIP_TCP_FLAG_FIN | UIP_TCP_FLAG_ACK, 0);
+	sk->seq_server += 1;
+
+	sk->read_done = 1;
+
+	free(payload);
+	pthread_exit(NULL);
+
+	return NULL;
+}
+
+static int uip_tcp_socket_receive(struct uip_tcp_socket *sk)
+{
+	if (sk->thread == 0)
+		return pthread_create(&sk->thread, NULL, uip_tcp_socket_thread, (void *)sk);
+
+	return 0;
+}
+
+static int uip_tcp_socket_send(struct uip_tcp_socket *sk, struct uip_tcp *tcp)
+{
+	int len;
+	int ret;
+	u8 *payload;
+
+	if (sk->write_done)
+		return 0;
+
+	payload = uip_tcp_payload(tcp);
+	len = uip_tcp_payloadlen(tcp);
+
+	ret = write(sk->fd, payload, len);
+	if (ret != len)
+		pr_warning("tcp send error");
+
+	return ret;
+}
+
+int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg)
+{
+	struct uip_tcp_socket *sk;
+	struct uip_tcp *tcp;
+	struct uip_ip *ip;
+	int ret;
+
+	tcp = (struct uip_tcp *)arg->eth;
+	ip = (struct uip_ip *)arg->eth;
+
+	/*
+	 * Guest is trying to start a TCP session, let's fake SYN-ACK to guest
+	 */
+	if (uip_tcp_is_syn(tcp)) {
+		sk = uip_tcp_socket_alloc(arg, ip->sip, ip->dip, tcp->sport, tcp->dport);
+		if (!sk)
+			return -1;
+
+		sk->window_size = ntohs(tcp->win);
+
+		/*
+		 * Setup ISN number
+		 */
+		sk->isn_guest  = uip_tcp_isn(tcp);
+		sk->isn_server = uip_tcp_isn_alloc();
+
+		sk->seq_server = sk->isn_server;
+		sk->ack_server = sk->isn_guest + 1;
+		uip_tcp_payload_send(sk, UIP_TCP_FLAG_SYN | UIP_TCP_FLAG_ACK, 0);
+		sk->seq_server += 1;
+
+		/*
+		 * Start receive thread for data from remote to guest
+		 */
+		uip_tcp_socket_receive(sk);
+
+		goto out;
+	}
+
+	/*
+	 * Find socket we have allocated
+	 */
+	sk = uip_tcp_socket_find(arg, ip->sip, ip->dip, tcp->sport, tcp->dport);
+	if (!sk)
+		return -1;
+
+	mutex_lock(sk->lock);
+	sk->window_size = ntohs(tcp->win);
+	sk->guest_acked = ntohl(tcp->ack);
+	pthread_cond_signal(&sk->cond);
+	mutex_unlock(sk->lock);
+
+	if (uip_tcp_is_fin(tcp)) {
+		if (sk->write_done)
+			goto out;
+
+		sk->write_done = 1;
+		sk->ack_server += 1;
+		uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0);
+
+		/*
+		 * Close guest to server TCP connection
+		 */
+		uip_tcp_socket_close(sk, SHUT_WR);
+
+		goto out;
+	}
+
+	/*
+	 * Ignore guest to server frames with zero tcp payload
+	 */
+	if (uip_tcp_payloadlen(tcp) == 0)
+		goto out;
+
+	/*
+	 * Send out TCP data to remote host
+	 */
+	ret = uip_tcp_socket_send(sk, tcp);
+	if (ret < 0)
+		return -1;
+	/*
+	 * Send ACK to guest immediately
+	 */
+	sk->ack_server += ret;
+	uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0);
+
+out:
+	return 0;
+}
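The flow control in uip_tcp_socket_thread() boils down to len = guest_acked + window_size - seq_server. A worked example with made-up numbers: if the guest last ACKed sequence 1000 and advertises a 4096-byte window while seq_server has advanced to 4000, the thread may still send 1000 + 4096 - 4000 = 1096 bytes; once seq_server reaches 5096 the expression drops to zero and the thread blocks on sk->cond until uip_tx_do_ipv4_tcp() records a newer ACK and signals it.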
diff --git a/tools/kvm/net/uip/udp.c b/tools/kvm/net/uip/udp.c
new file mode 100644
index 0000000..dd288c5
--- /dev/null
+++ b/tools/kvm/net/uip/udp.c
@@ -0,0 +1,239 @@
+#include "kvm/uip.h"
+
+#include <kvm/kvm.h>
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <fcntl.h>
+
+#define UIP_UDP_MAX_EVENTS 1000
+
+static struct uip_udp_socket *uip_udp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+	struct list_head *sk_head;
+	struct uip_udp_socket *sk;
+	struct mutex *sk_lock;
+	struct epoll_event ev;
+	int flags;
+	int ret;
+
+	sk_head = &arg->info->udp_socket_head;
+	sk_lock = &arg->info->udp_socket_lock;
+
+	/*
+	 * Find existing sk
+	 */
+	mutex_lock(sk_lock);
+	list_for_each_entry(sk, sk_head, list) {
+		if (sk->sip == sip && sk->dip == dip && sk->sport == sport && sk->dport == dport) {
+			mutex_unlock(sk_lock);
+			return sk;
+		}
+	}
+	mutex_unlock(sk_lock);
+
+	/*
+	 * Allocate new one
+	 */
+	sk = malloc(sizeof(*sk));
+	memset(sk, 0, sizeof(*sk));
+
+	sk->lock = sk_lock;
+
+	sk->fd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (sk->fd < 0)
+		goto out;
+
+	/*
+	 * Set non-blocking
+	 */
+	flags = fcntl(sk->fd, F_GETFL, 0);
+	flags |= O_NONBLOCK;
+	fcntl(sk->fd, F_SETFL, flags);
+
+	/*
+	 * Add sk->fd to epoll_wait
+	 */
+	ev.events	= EPOLLIN;
+	ev.data.fd	= sk->fd;
+	ev.data.ptr	= sk;
+	if (arg->info->udp_epollfd <= 0)
+		arg->info->udp_epollfd = epoll_create(UIP_UDP_MAX_EVENTS);
+	ret = epoll_ctl(arg->info->udp_epollfd, EPOLL_CTL_ADD, sk->fd, &ev);
+	if (ret == -1)
+		pr_warning("epoll_ctl error");
+
+	sk->addr.sin_family	 = AF_INET;
+	sk->addr.sin_addr.s_addr = dip;
+	sk->addr.sin_port	 = dport;
+
+	sk->sip			 = sip;
+	sk->dip			 = dip;
+	sk->sport		 = sport;
+	sk->dport		 = dport;
+
+	mutex_lock(sk_lock);
+	list_add_tail(&sk->list, sk_head);
+	mutex_unlock(sk_lock);
+
+	return sk;
+
+out:
+	free(sk);
+	return NULL;
+}
+
+static int uip_udp_socket_send(struct uip_udp_socket *sk, struct uip_udp *udp)
+{
+	int len;
+	int ret;
+
+	len = ntohs(udp->len) - uip_udp_hdrlen(udp);
+
+	ret = sendto(sk->fd, udp->payload, len, 0, (struct sockaddr *)&sk->addr, sizeof(sk->addr));
+	if (ret != len)
+		return -1;
+
+	return 0;
+}
+
+int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8* payload, int payload_len)
+{
+	struct uip_eth *eth2;
+	struct uip_udp *udp2;
+	struct uip_ip *ip2;
+
+	/*
+	 * Cook an Ethernet frame
+	 */
+	udp2		= (struct uip_udp *)(buf->eth);
+	eth2		= (struct uip_eth *)buf->eth;
+	ip2		= (struct uip_ip *)(buf->eth);
+
+	eth2->src	= info->host_mac;
+	eth2->dst	= info->guest_mac;
+	eth2->type	= htons(UIP_ETH_P_IP);
+
+	ip2->vhl	= UIP_IP_VER_4 | UIP_IP_HDR_LEN;
+	ip2->tos	= 0;
+	ip2->id		= 0;
+	ip2->flgfrag	= 0;
+	ip2->ttl	= UIP_IP_TTL;
+	ip2->proto	= UIP_IP_P_UDP;
+	ip2->csum	= 0;
+
+	ip2->sip	= sk->dip;
+	ip2->dip	= sk->sip;
+	udp2->sport	= sk->dport;
+	udp2->dport	= sk->sport;
+
+	udp2->len	= htons(payload_len + uip_udp_hdrlen(udp2));
+	udp2->csum	= 0;
+
+	if (payload)
+		memcpy(udp2->payload, payload, payload_len);
+
+	ip2->len	= udp2->len + htons(uip_ip_hdrlen(ip2));
+	ip2->csum	= uip_csum_ip(ip2);
+	udp2->csum	= uip_csum_udp(udp2);
+
+	/*
+	 * virtio_net_hdr
+	 */
+	buf->vnet_len	= info->vnet_hdr_len;
+	memset(buf->vnet, 0, buf->vnet_len);
+
+	buf->eth_len	= ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
+
+	return 0;
+}
+
+static void *uip_udp_socket_thread(void *p)
+{
+	struct epoll_event events[UIP_UDP_MAX_EVENTS];
+	struct uip_udp_socket *sk;
+	struct uip_info *info;
+	struct uip_buf *buf;
+	int payload_len;
+	u8 *payload;
+	int nfds;
+	int i;
+
+	kvm__set_thread_name("uip-udp");
+
+	info = p;
+
+	do {
+		payload = malloc(UIP_MAX_UDP_PAYLOAD);
+	} while (!payload);
+
+	while (1) {
+		nfds = epoll_wait(info->udp_epollfd, events, UIP_UDP_MAX_EVENTS, -1);
+
+		if (nfds == -1)
+			continue;
+
+		for (i = 0; i < nfds; i++) {
+
+			sk = events[i].data.ptr;
+			payload_len = recvfrom(sk->fd, payload, UIP_MAX_UDP_PAYLOAD, 0, NULL, NULL);
+			if (payload_len < 0)
+				continue;
+
+			/*
+			 * Get free buffer to send data to guest
+			 */
+			buf = uip_buf_get_free(info);
+
+			uip_udp_make_pkg(info, sk, buf, payload, payload_len);
+
+			/*
+			 * Send data received from socket to guest
+			 */
+			uip_buf_set_used(info, buf);
+		}
+	}
+
+	free(payload);
+	pthread_exit(NULL);
+	return NULL;
+}
+
+int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg)
+{
+	struct uip_udp_socket *sk;
+	struct uip_info *info;
+	struct uip_udp *udp;
+	struct uip_ip *ip;
+	int ret;
+
+	udp	= (struct uip_udp *)(arg->eth);
+	ip	= (struct uip_ip *)(arg->eth);
+	info	= arg->info;
+
+	if (uip_udp_is_dhcp(udp)) {
+		uip_tx_do_ipv4_udp_dhcp(arg);
+		return 0;
+	}
+
+	/*
+	 * Find socket we have allocated before, otherwise allocate one
+	 */
+	sk = uip_udp_socket_find(arg, ip->sip, ip->dip, udp->sport, udp->dport);
+	if (!sk)
+		return -1;
+
+	/*
+	 * Send out UDP data to remote host
+	 */
+	ret = uip_udp_socket_send(sk, udp);
+	if (ret)
+		return -1;
+
+	if (!info->udp_thread)
+		pthread_create(&info->udp_thread, NULL, uip_udp_socket_thread, (void *)info);
+
+	return 0;
+}
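The socket setup in uip_udp_socket_find() condenses to: create a datagram socket, make it non-blocking, and register it with the (lazily created) epoll instance. A self-contained sketch of that sequence; the epollfd and cookie are assumed to come from the caller:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/epoll.h>
	#include <sys/socket.h>

	static int udp_sock_watch(int epollfd, void *cookie)
	{
		struct epoll_event ev = {
			.events   = EPOLLIN,
			.data.ptr = cookie,	/* handed back by epoll_wait() */
		};
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		if (fd < 0)
			return -1;
		fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);
		if (epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev) < 0) {
			close(fd);
			return -1;
		}
		return fd;
	}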
diff --git a/tools/kvm/pci.c b/tools/kvm/pci.c
new file mode 100644
index 0000000..3a6696c
--- /dev/null
+++ b/tools/kvm/pci.c
@@ -0,0 +1,251 @@
+#include "kvm/devices.h"
+#include "kvm/pci.h"
+#include "kvm/ioport.h"
+#include "kvm/irq.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <linux/err.h>
+#include <assert.h>
+
+#define PCI_BAR_OFFSET(b)		(offsetof(struct pci_device_header, bar[b]))
+
+static u32 pci_config_address_bits;
+
+/* This is within our PCI gap - in an unused area.
+ * Note this is a PCI *bus address*; it is used to assign BARs etc.!
+ * (That's why it can still be 32 bit even with 64 bit guests -- 64 bit
+ * PCI isn't currently supported.)
+ */
+static u32 io_space_blocks		= KVM_PCI_MMIO_AREA;
+
+/*
+ * BARs must be naturally aligned, so enforce this in the allocator.
+ */
+u32 pci_get_io_space_block(u32 size)
+{
+	u32 block = ALIGN(io_space_blocks, size);
+	io_space_blocks = block + size;
+	return block;
+}
+
+void pci__assign_irq(struct device_header *dev_hdr)
+{
+	struct pci_device_header *pci_hdr = dev_hdr->data;
+
+	/*
+	 * PCI supports only INTA#,B#,C#,D# per device.
+	 *
+	 * A#,B#,C#,D# are allowed for multifunctional devices so stick
+	 * with A# for our single function devices.
+	 */
+	pci_hdr->irq_pin	= 1;
+	pci_hdr->irq_line	= irq__alloc_line();
+}
+
+static void *pci_config_address_ptr(u16 port)
+{
+	unsigned long offset;
+	void *base;
+
+	offset	= port - PCI_CONFIG_ADDRESS;
+	base	= &pci_config_address_bits;
+
+	return base + offset;
+}
+
+static bool pci_config_address_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	void *p = pci_config_address_ptr(port);
+
+	memcpy(p, data, size);
+
+	return true;
+}
+
+static bool pci_config_address_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	void *p = pci_config_address_ptr(port);
+
+	memcpy(data, p, size);
+
+	return true;
+}
+
+static struct ioport_operations pci_config_address_ops = {
+	.io_in	= pci_config_address_in,
+	.io_out	= pci_config_address_out,
+};
+
+static bool pci_device_exists(u8 bus_number, u8 device_number, u8 function_number)
+{
+	union pci_config_address pci_config_address;
+
+	pci_config_address.w = ioport__read32(&pci_config_address_bits);
+
+	if (pci_config_address.bus_number != bus_number)
+		return false;
+
+	if (pci_config_address.function_number != function_number)
+		return false;
+
+	return !IS_ERR_OR_NULL(device__find_dev(DEVICE_BUS_PCI, device_number));
+}
+
+static bool pci_config_data_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	union pci_config_address pci_config_address;
+
+	pci_config_address.w = ioport__read32(&pci_config_address_bits);
+	/*
+	 * Accesses to PCI configuration space offsets that are not 4-byte
+	 * aligned arrive on the corresponding PCI_CONFIG_DATA + n port.
+	 */
+	pci_config_address.reg_offset = port - PCI_CONFIG_DATA;
+
+	pci__config_wr(vcpu->kvm, pci_config_address, data, size);
+
+	return true;
+}
+
+static bool pci_config_data_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	union pci_config_address pci_config_address;
+
+	pci_config_address.w = ioport__read32(&pci_config_address_bits);
+	/*
+	 * Accesses to PCI configuration space offsets that are not 4-byte
+	 * aligned arrive on the corresponding PCI_CONFIG_DATA + n port.
+	 */
+	pci_config_address.reg_offset = port - PCI_CONFIG_DATA;
+
+	pci__config_rd(vcpu->kvm, pci_config_address, data, size);
+
+	return true;
+}
+
+static struct ioport_operations pci_config_data_ops = {
+	.io_in	= pci_config_data_in,
+	.io_out	= pci_config_data_out,
+};
+
+void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size)
+{
+	u8 dev_num;
+
+	dev_num	= addr.device_number;
+
+	if (pci_device_exists(0, dev_num, 0)) {
+		unsigned long offset;
+
+		offset = addr.w & 0xff;
+		if (offset < sizeof(struct pci_device_header)) {
+			void *p = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
+			struct pci_device_header *hdr = p;
+			u8 bar = (offset - PCI_BAR_OFFSET(0)) / (sizeof(u32));
+			u32 sz = cpu_to_le32(PCI_IO_SIZE);
+
+			if (bar < 6 && hdr->bar_size[bar])
+				sz = hdr->bar_size[bar];
+
+			/*
+			 * If the kernel masks the BAR, it expects to find the
+			 * size of the BAR there the next time it reads from it.
+			 * Once the kernel has read the size, it writes the
+			 * address back.
+			 */
+			if (*(u32 *)(p + offset)) {
+				/* See if kernel tries to mask one of the BARs */
+				if ((offset >= PCI_BAR_OFFSET(0)) &&
+				    (offset <= PCI_BAR_OFFSET(6)) &&
+				    (ioport__read32(data) == 0xFFFFFFFF))
+					memcpy(p + offset, &sz, sizeof(sz));
+				else
+					memcpy(p + offset, data, size);
+			}
+		}
+	}
+}
+
+void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size)
+{
+	u8 dev_num;
+
+	dev_num	= addr.device_number;
+
+	if (pci_device_exists(0, dev_num, 0)) {
+		unsigned long offset;
+
+		offset = addr.w & 0xff;
+		if (offset < sizeof(struct pci_device_header)) {
+			void *p = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
+
+			memcpy(data, p + offset, size);
+		} else {
+			memset(data, 0x00, size);
+		}
+	} else {
+		memset(data, 0xff, size);
+	}
+}
+
+static void pci_config_mmio_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
+				   u32 len, u8 is_write, void *kvm)
+{
+	union pci_config_address cfg_addr;
+
+	addr			-= KVM_PCI_CFG_AREA;
+	cfg_addr.w		= (u32)addr;
+	cfg_addr.enable_bit	= 1;
+
+	if (is_write)
+		pci__config_wr(kvm, cfg_addr, data, len);
+	else
+		pci__config_rd(kvm, cfg_addr, data, len);
+}
+
+struct pci_device_header *pci__find_dev(u8 dev_num)
+{
+	struct device_header *hdr = device__find_dev(DEVICE_BUS_PCI, dev_num);
+
+	if (IS_ERR_OR_NULL(hdr))
+		return NULL;
+
+	return hdr->data;
+}
+
+int pci__init(struct kvm *kvm)
+{
+	int r;
+
+	r = ioport__register(kvm, PCI_CONFIG_DATA + 0, &pci_config_data_ops, 4, NULL);
+	if (r < 0)
+		return r;
+
+	r = ioport__register(kvm, PCI_CONFIG_ADDRESS + 0, &pci_config_address_ops, 4, NULL);
+	if (r < 0)
+		goto err_unregister_data;
+
+	r = kvm__register_mmio(kvm, KVM_PCI_CFG_AREA, PCI_CFG_SIZE, false,
+			       pci_config_mmio_access, kvm);
+	if (r < 0)
+		goto err_unregister_addr;
+
+	return 0;
+
+err_unregister_addr:
+	ioport__unregister(kvm, PCI_CONFIG_ADDRESS);
+err_unregister_data:
+	ioport__unregister(kvm, PCI_CONFIG_DATA);
+	return r;
+}
+dev_base_init(pci__init);
+
+int pci__exit(struct kvm *kvm)
+{
+	ioport__unregister(kvm, PCI_CONFIG_DATA);
+	ioport__unregister(kvm, PCI_CONFIG_ADDRESS);
+
+	return 0;
+}
+dev_base_exit(pci__exit);
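pci__config_wr() implements the classic BAR sizing handshake, simplified to hand back the raw size stored in hdr->bar_size[] rather than the ~(size-1) mask real hardware returns. The sequence, from the guest kernel's point of view:

	1. write 0xFFFFFFFF to BAR n	-> kvmtool substitutes hdr->bar_size[n]
	2. read  BAR n			-> returns the stored size
	3. write the chosen bus address	-> kvmtool stores it as the BAR value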
diff --git a/tools/kvm/powerpc/boot.c b/tools/kvm/powerpc/boot.c
new file mode 100644
index 0000000..2557fc0
--- /dev/null
+++ b/tools/kvm/powerpc/boot.c
@@ -0,0 +1,8 @@
+#include "kvm/kvm.h"
+
+#include <stdbool.h>
+
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+	return false;
+}
diff --git a/tools/kvm/powerpc/cpu_info.c b/tools/kvm/powerpc/cpu_info.c
new file mode 100644
index 0000000..a9dfe39
--- /dev/null
+++ b/tools/kvm/powerpc/cpu_info.c
@@ -0,0 +1,210 @@
+/*
+ * PPC CPU identification
+ *
+ * This is a very simple "host CPU info" struct to get us going.
+ * For the little host information we need, I don't want to grub about
+ * parsing stuff in /proc/device-tree so just match host PVR to differentiate
+ * PPC970 and POWER7 (which is all that's currently supported).
+ *
+ * Qemu does something similar but this is MUCH simpler!
+ *
+ * Copyright 2012 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <kvm/kvm.h>
+#include <sys/ioctl.h>
+
+#include "cpu_info.h"
+#include "kvm/util.h"
+
+/* POWER7 */
+
+static struct cpu_info cpu_power7_info = {
+	.name = "POWER7",
+	.tb_freq = 512000000,
+	.d_bsize = 128,
+	.i_bsize = 128,
+	.flags = CPUINFO_FLAG_DFP | CPUINFO_FLAG_VSX | CPUINFO_FLAG_VMX,
+	.mmu_info = {
+		.flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS,
+		.slb_size = 32,
+	},
+};
+
+/* POWER8 */
+
+static struct cpu_info cpu_power8_info = {
+	.name = "POWER8",
+	.tb_freq = 512000000,
+	.d_bsize = 128,
+	.i_bsize = 128,
+	.flags = CPUINFO_FLAG_DFP | CPUINFO_FLAG_VSX | CPUINFO_FLAG_VMX,
+	.mmu_info = {
+		.flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS,
+		.slb_size = 32,
+	},
+};
+
+/* PPC970/G5 */
+
+static struct cpu_info cpu_970_info = {
+	.name = "G5",
+	.tb_freq = 33333333,
+	.d_bsize = 128,
+	.i_bsize = 128,
+	.flags = CPUINFO_FLAG_VMX,
+};
+
+/* This is a default catchall for 'no match' on PVR: */
+static struct cpu_info cpu_dummy_info = { .name = "unknown" };
+
+static struct pvr_info host_pvr_info[] = {
+	{ 0xffffffff, 0x0f000003, &cpu_power7_info },
+	{ 0xffff0000, 0x003f0000, &cpu_power7_info },
+	{ 0xffff0000, 0x004a0000, &cpu_power7_info },
+	{ 0xffff0000, 0x004b0000, &cpu_power8_info },
+	{ 0xffff0000, 0x00390000, &cpu_970_info },
+	{ 0xffff0000, 0x003c0000, &cpu_970_info },
+	{ 0xffff0000, 0x00440000, &cpu_970_info },
+	{ 0xffff0000, 0x00450000, &cpu_970_info },
+};
+
+/* If we can't query the kernel for supported page sizes assume 4K and 16M */
+static struct kvm_ppc_one_seg_page_size fallback_sps[] = {
+	[0] = {
+		.page_shift = 12,
+		.slb_enc    = 0,
+		.enc =  {
+			[0] = {
+				.page_shift = 12,
+				.pte_enc    = 0,
+			},
+		},
+	},
+	[1] = {
+		.page_shift = 24,
+		.slb_enc    = 0x100,
+		.enc =  {
+			[0] = {
+				.page_shift = 24,
+				.pte_enc    = 0,
+			},
+		},
+	},
+};
+
+
+static void setup_mmu_info(struct kvm *kvm, struct cpu_info *cpu_info)
+{
+	static struct kvm_ppc_smmu_info *mmu_info;
+	struct kvm_ppc_one_seg_page_size *sps;
+	int i, j, k, valid;
+
+	if (!kvm__supports_extension(kvm, KVM_CAP_PPC_GET_SMMU_INFO)) {
+		memcpy(&cpu_info->mmu_info.sps, fallback_sps, sizeof(fallback_sps));
+	} else if (ioctl(kvm->vm_fd, KVM_PPC_GET_SMMU_INFO, &cpu_info->mmu_info) < 0) {
+			die_perror("KVM_PPC_GET_SMMU_INFO failed");
+	}
+
+	mmu_info = &cpu_info->mmu_info;
+
+	if (!(mmu_info->flags & KVM_PPC_PAGE_SIZES_REAL))
+		/* Guest pages are not restricted by the backing page size */
+		return;
+
+	/* Filter based on backing page size */
+
+	for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+		sps = &mmu_info->sps[i];
+
+		if (!sps->page_shift)
+			break;
+
+		if (kvm->ram_pagesize < (1ul << sps->page_shift)) {
+			/* Mark the whole segment size invalid */
+			sps->page_shift = 0;
+			continue;
+		}
+
+		/* Check each page size for the segment */
+		for (j = 0, valid = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+			if (!sps->enc[j].page_shift)
+				break;
+
+			if (kvm->ram_pagesize < (1ul << sps->enc[j].page_shift))
+				sps->enc[j].page_shift = 0;
+			else
+				valid++;
+		}
+
+		if (!valid) {
+			/* Mark the whole segment size invalid */
+			sps->page_shift = 0;
+			continue;
+		}
+
+		/* Mark any trailing entries invalid if we broke out early */
+		for (k = j; k < KVM_PPC_PAGE_SIZES_MAX_SZ; k++)
+			sps->enc[k].page_shift = 0;
+
+		/* Collapse holes */
+		for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+			if (sps->enc[j].page_shift)
+				continue;
+
+			for (k = j + 1; k < KVM_PPC_PAGE_SIZES_MAX_SZ; k++) {
+				if (sps->enc[k].page_shift) {
+					sps->enc[j] = sps->enc[k];
+					sps->enc[k].page_shift = 0;
+					break;
+				}
+			}
+		}
+	}
+
+	/* Mark any trailing entries invalid if we broke out early */
+	for (j = i; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++)
+		mmu_info->sps[j].page_shift = 0;
+
+	/* Collapse holes */
+	for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+		if (mmu_info->sps[i].page_shift)
+			continue;
+
+		for (j = i + 1; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+			if (mmu_info->sps[j].page_shift) {
+				mmu_info->sps[i] = mmu_info->sps[j];
+				mmu_info->sps[j].page_shift = 0;
+				break;
+			}
+		}
+	}
+}
+
+struct cpu_info *find_cpu_info(struct kvm *kvm)
+{
+	struct cpu_info *info;
+	unsigned int i;
+	u32 pvr = kvm->arch.pvr;
+
+	for (info = NULL, i = 0; i < ARRAY_SIZE(host_pvr_info); i++) {
+		if ((pvr & host_pvr_info[i].pvr_mask) == host_pvr_info[i].pvr) {
+			info = host_pvr_info[i].cpu_info;
+			break;
+		}
+	}
+
+	/* Didn't find anything? Rut-ro. */
+	if (!info) {
+		pr_warning("Host CPU unsupported by kvmtool\n");
+		info = &cpu_dummy_info;
+	}
+
+	setup_mmu_info(kvm, info);
+
+	return info;
+}
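
A note on the host_pvr_info[] table above: each entry masks the host PVR
before comparing, so one row can match either an exact revision (mask
0xffffffff) or a whole processor family (mask 0xffff0000). A minimal
standalone sketch of the same matching scheme; the values mirror the table
in cpu_info.c and the main() is illustrative only:

#include <stdio.h>
#include <stdint.h>

struct pvr_match {
	uint32_t mask;
	uint32_t pvr;
	const char *name;
};

/* Values mirror the host_pvr_info[] table in cpu_info.c above. */
static const struct pvr_match table[] = {
	{ 0xffffffff, 0x0f000003, "POWER7" },	/* architected mode */
	{ 0xffff0000, 0x003f0000, "POWER7" },
	{ 0xffff0000, 0x004b0000, "POWER8" },
	{ 0xffff0000, 0x00390000, "PPC970" },
};

static const char *match_pvr(uint32_t pvr)
{
	unsigned int i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if ((pvr & table[i].mask) == table[i].pvr)
			return table[i].name;
	return "unknown";
}

int main(void)
{
	/* A hypothetical POWER7 PVR: family 0x003f, revision 0x0201 */
	printf("0x003f0201 -> %s\n", match_pvr(0x003f0201));
	return 0;
}
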
diff --git a/tools/kvm/powerpc/cpu_info.h b/tools/kvm/powerpc/cpu_info.h
new file mode 100644
index 0000000..f61707a
--- /dev/null
+++ b/tools/kvm/powerpc/cpu_info.h
@@ -0,0 +1,42 @@
+/*
+ * PPC CPU identification
+ *
+ * Copyright 2012 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef CPU_INFO_H
+#define CPU_INFO_H
+
+#include <kvm/kvm.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+
+struct cpu_info {
+	const char	*name;
+	u32		tb_freq; /* timebase frequency */
+	u32		d_bsize; /* d-cache block size */
+	u32		i_bsize; /* i-cache block size */
+	u32		flags;
+	struct kvm_ppc_smmu_info mmu_info;
+};
+
+struct pvr_info {
+	u32		pvr_mask;
+	u32		pvr;
+	struct cpu_info *cpu_info;
+};
+
+/* Misc capabilities/CPU properties */
+#define CPUINFO_FLAG_DFP	0x00000001
+#define CPUINFO_FLAG_VMX	0x00000002
+#define CPUINFO_FLAG_VSX	0x00000004
+
+struct cpu_info *find_cpu_info(struct kvm *kvm);
+
+#endif
diff --git a/tools/kvm/powerpc/include/kvm/barrier.h b/tools/kvm/powerpc/include/kvm/barrier.h
new file mode 100644
index 0000000..dd5115a
--- /dev/null
+++ b/tools/kvm/powerpc/include/kvm/barrier.h
@@ -0,0 +1,6 @@
+#ifndef _KVM_BARRIER_H_
+#define _KVM_BARRIER_H_
+
+#include <asm/barrier.h>
+
+#endif /* _KVM_BARRIER_H_ */
diff --git a/tools/kvm/powerpc/include/kvm/kvm-arch.h b/tools/kvm/powerpc/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..fdd518f
--- /dev/null
+++ b/tools/kvm/powerpc/include/kvm/kvm-arch.h
@@ -0,0 +1,65 @@
+/*
+ * PPC64 architecture-specific definitions
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <time.h>
+
+/*
+ * MMIO lives after RAM, but it'd be nice if it didn't constantly move.
+ * Choose a suitably high address, e.g. 63T...  This limits RAM size.
+ */
+#define PPC_MMIO_START			0x3F0000000000UL
+#define PPC_MMIO_SIZE			0x010000000000UL
+
+#define KERNEL_LOAD_ADDR        	0x0000000000000000
+#define KERNEL_START_ADDR       	0x0000000000000000
+#define KERNEL_SECONDARY_START_ADDR     0x0000000000000060
+#define INITRD_LOAD_ADDR        	0x0000000002800000
+
+#define RTAS_MAX_SIZE           	0x10000
+
+#define TIMEBASE_FREQ           	512000000ULL
+
+#define KVM_MMIO_START			PPC_MMIO_START
+
+/*
+ * This is the address that pci_get_io_space_block() starts allocating
+ * from.  Note that this is a PCI bus address.
+ */
+#define KVM_IOPORT_AREA			0x0
+#define KVM_PCI_CFG_AREA		0x1000000
+#define KVM_PCI_MMIO_AREA		0x2000000
+#define KVM_VIRTIO_MMIO_AREA		0x3000000
+
+#define KVM_IRQ_OFFSET			16
+
+#define KVM_VM_TYPE			0
+
+#define VIRTIO_DEFAULT_TRANS(kvm)	VIRTIO_PCI
+
+struct spapr_phb;
+
+struct kvm_arch {
+	u64			sdr1;
+	u32			pvr;
+	unsigned long		rtas_gra;
+	unsigned long		rtas_size;
+	unsigned long		fdt_gra;
+	unsigned long		initrd_gra;
+	unsigned long		initrd_size;
+	struct icp_state	*icp;
+	struct spapr_phb	*phb;
+};
+
+#endif /* KVM__KVM_ARCH_H */
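
The "63T" figure in the MMIO comment above follows directly from
PPC_MMIO_START; a quick sketch of the arithmetic, using the two constants
from kvm-arch.h:

#include <stdio.h>

int main(void)
{
	/* PPC_MMIO_START from kvm-arch.h: 0x3F << 40, i.e. 63 TiB */
	unsigned long long mmio_start = 0x3F0000000000ULL;
	unsigned long long mmio_size  = 0x010000000000ULL; /* 1 TiB window */

	printf("MMIO window: %llu TiB at %llu TiB\n",
	       mmio_size >> 40, mmio_start >> 40);
	return 0;
}
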
diff --git a/tools/kvm/powerpc/include/kvm/kvm-config-arch.h b/tools/kvm/powerpc/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..60f61de
--- /dev/null
+++ b/tools/kvm/powerpc/include/kvm/kvm-config-arch.h
@@ -0,0 +1,7 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+struct kvm_config_arch {
+};
+
+#endif /* KVM__KVM_CONFIG_ARCH_H */
diff --git a/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h b/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..e256f5d
--- /dev/null
+++ b/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,76 @@
+/*
+ * PPC64 cpu-specific definitions
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+/* Architecture-specific kvm_cpu definitions. */
+
+#include <linux/kvm.h>	/* for struct kvm_regs */
+#include <stdbool.h>
+#include <pthread.h>
+
+#define MSR_SF		(1UL<<63)
+#define MSR_HV		(1UL<<60)
+#define MSR_VEC		(1UL<<25)
+#define MSR_VSX		(1UL<<23)
+#define MSR_POW		(1UL<<18)
+#define MSR_EE		(1UL<<15)
+#define MSR_PR		(1UL<<14)
+#define MSR_FP		(1UL<<13)
+#define MSR_ME		(1UL<<12)
+#define MSR_FE0		(1UL<<11)
+#define MSR_SE		(1UL<<10)
+#define MSR_BE		(1UL<<9)
+#define MSR_FE1		(1UL<<8)
+#define MSR_IR		(1UL<<5)
+#define MSR_DR		(1UL<<4)
+#define MSR_PMM		(1UL<<2)
+#define MSR_RI		(1UL<<1)
+#define MSR_LE		(1UL<<0)
+
+#define POWER7_EXT_IRQ	0
+
+struct kvm;
+
+struct kvm_cpu {
+	pthread_t		thread;		/* VCPU thread */
+
+	unsigned long		cpu_id;
+
+	struct kvm		*kvm;		/* parent KVM */
+	int			vcpu_fd;	/* For VCPU ioctls() */
+	struct kvm_run		*kvm_run;
+
+	struct kvm_regs		regs;
+	struct kvm_sregs	sregs;
+	struct kvm_fpu		fpu;
+
+	u8			is_running;
+	u8			paused;
+	u8			needs_nmi;
+	/*
+	 * Although PPC KVM doesn't yet support coalesced MMIO, generic code
+	 * needs this in our kvm_cpu:
+	 */
+	struct kvm_coalesced_mmio_ring  *ring;
+};
+
+void kvm_cpu__irq(struct kvm_cpu *vcpu, int pin, int level);
+
+/* This is never actually called on PPC. */
+static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count)
+{
+	return false;
+}
+
+bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write);
+
+#endif /* KVM__KVM_CPU_ARCH_H */
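
The MSR bit definitions above pair with the boot MSR value that kvm-cpu.c
(below) programs, 0x8000000000001000UL; decoding it shows why that code
comments "64bit, non-HV, ME". A small sketch, with a few of the #defines
repeated so it compiles standalone:

#include <stdio.h>
#include <stdint.h>

#define MSR_SF (1ULL << 63)	/* 64-bit mode */
#define MSR_HV (1ULL << 60)	/* hypervisor state */
#define MSR_EE (1ULL << 15)	/* external interrupts enabled */
#define MSR_ME (1ULL << 12)	/* machine check enabled */

int main(void)
{
	uint64_t msr = 0x8000000000001000ULL;	/* value used at vcpu reset */

	printf("SF=%d HV=%d EE=%d ME=%d\n",
	       !!(msr & MSR_SF), !!(msr & MSR_HV),
	       !!(msr & MSR_EE), !!(msr & MSR_ME));
	/* Prints SF=1 HV=0 EE=0 ME=1: 64-bit, non-HV, machine checks on. */
	return 0;
}
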
diff --git a/tools/kvm/powerpc/ioport.c b/tools/kvm/powerpc/ioport.c
new file mode 100644
index 0000000..58dc625
--- /dev/null
+++ b/tools/kvm/powerpc/ioport.c
@@ -0,0 +1,22 @@
+/*
+ * PPC64 ioport platform setup.  There isn't any! :-)
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/ioport.h"
+
+#include <stdlib.h>
+
+void ioport__setup_arch(struct kvm *kvm)
+{
+	/* PPC has no legacy ioports to set up */
+}
+
+void ioport__map_irq(u8 *irq)
+{
+}
diff --git a/tools/kvm/powerpc/irq.c b/tools/kvm/powerpc/irq.c
new file mode 100644
index 0000000..03f2fe7
--- /dev/null
+++ b/tools/kvm/powerpc/irq.c
@@ -0,0 +1,31 @@
+/*
+ * PPC64 IRQ routines
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/devices.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/kvm.h>
+#include <sys/ioctl.h>
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "kvm/pci.h"
+
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg)
+{
+	die(__FUNCTION__);
+	return 0;
+}
diff --git a/tools/kvm/powerpc/kvm-cpu.c b/tools/kvm/powerpc/kvm-cpu.c
new file mode 100644
index 0000000..2f5cfc6
--- /dev/null
+++ b/tools/kvm/powerpc/kvm-cpu.c
@@ -0,0 +1,290 @@
+/*
+ * PPC64 processor support
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/symbol.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include "spapr.h"
+#include "spapr_pci.h"
+#include "xics.h"
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <assert.h>
+
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+	debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+	return debug_fd;
+}
+
+static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm)
+{
+	struct kvm_cpu *vcpu;
+
+	vcpu		= calloc(1, sizeof *vcpu);
+	if (!vcpu)
+		return NULL;
+
+	vcpu->kvm	= kvm;
+
+	return vcpu;
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+	free(vcpu);
+}
+
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id)
+{
+	struct kvm_cpu *vcpu;
+	int mmap_size;
+	struct kvm_enable_cap papr_cap = { .cap = KVM_CAP_PPC_PAPR };
+
+	vcpu		= kvm_cpu__new(kvm);
+	if (!vcpu)
+		return NULL;
+
+	vcpu->cpu_id	= cpu_id;
+
+	vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+	if (vcpu->vcpu_fd < 0)
+		die_perror("KVM_CREATE_VCPU ioctl");
+
+	mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+	if (mmap_size < 0)
+		die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+	vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0);
+	if (vcpu->kvm_run == MAP_FAILED)
+		die("unable to mmap vcpu fd");
+
+	if (ioctl(vcpu->vcpu_fd, KVM_ENABLE_CAP, &papr_cap) < 0)
+		die("unable to enable PAPR capability");
+
+	/*
+	 * We start all CPUs, directing non-primary threads into the kernel's
+	 * secondary start point.  When we come to support SLOF, we will start
+	 * only one and SLOF will make an RTAS call asking us to start the
+	 * others.  (FIXME: make more generic & interface with whichever
+	 * firmware a platform may be using.)
+	 */
+	vcpu->is_running = true;
+
+	return vcpu;
+}
+
+static void kvm_cpu__setup_fpu(struct kvm_cpu *vcpu)
+{
+	/* Don't have to do anything, there's no expected FPU state. */
+}
+
+static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu)
+{
+	/*
+	 * FIXME: This assumes PPC64 and Linux guest.  It doesn't use the
+	 * OpenFirmware entry method, but instead the "embedded" entry which
+	 * passes the FDT address directly.
+	 */
+	struct kvm_regs *r = &vcpu->regs;
+
+	if (vcpu->cpu_id == 0) {
+		r->pc = KERNEL_START_ADDR;
+		r->gpr[3] = vcpu->kvm->arch.fdt_gra;
+		r->gpr[5] = 0;
+	} else {
+		r->pc = KERNEL_SECONDARY_START_ADDR;
+		r->gpr[3] = vcpu->cpu_id;
+	}
+	r->msr = 0x8000000000001000UL; /* 64bit, non-HV, ME */
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0)
+		die_perror("KVM_SET_REGS failed");
+}
+
+static void kvm_cpu__setup_sregs(struct kvm_cpu *vcpu)
+{
+	/*
+	 * Some sregs setup to initialise SDR1/PVR/HIOR on PPC64 SPAPR
+	 * platforms using PR KVM.  (Technically, this is all ignored on
+	 * SPAPR HV KVM.)  Different setup is required for non-PV non-SPAPR
+	 * platforms!  (FIXME.)
+	 */
+	struct kvm_sregs sregs;
+	struct kvm_one_reg reg = {};
+	u64 value;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
+		die("KVM_GET_SREGS failed");
+
+	sregs.u.s.sdr1 = vcpu->kvm->arch.sdr1;
+	sregs.pvr = vcpu->kvm->arch.pvr;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &sregs) < 0)
+		die("KVM_SET_SREGS failed");
+
+	reg.id = KVM_REG_PPC_HIOR;
+	value = 0;
+	reg.addr = (u64)&value;
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die("KVM_SET_ONE_REG failed");
+}
+
+/**
+ * kvm_cpu__reset_vcpu - reset virtual CPU to a known state
+ */
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+	kvm_cpu__setup_regs(vcpu);
+	kvm_cpu__setup_sregs(vcpu);
+	kvm_cpu__setup_fpu(vcpu);
+}
+
+/* kvm_cpu__irq - set KVM's IRQ flag on this vcpu */
+void kvm_cpu__irq(struct kvm_cpu *vcpu, int pin, int level)
+{
+	unsigned int virq = level ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET;
+
+	/* FIXME: POWER-specific */
+	if (pin != POWER7_EXT_IRQ)
+		return;
+	if (ioctl(vcpu->vcpu_fd, KVM_INTERRUPT, &virq) < 0)
+		pr_warning("Could not KVM_INTERRUPT.");
+}
+
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+}
+
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+	bool ret = true;
+	struct kvm_run *run = vcpu->kvm_run;
+	switch (run->exit_reason) {
+	case KVM_EXIT_PAPR_HCALL:
+		run->papr_hcall.ret = spapr_hypercall(vcpu, run->papr_hcall.nr,
+						      (target_ulong*)run->papr_hcall.args);
+		break;
+	default:
+		ret = false;
+	}
+	return ret;
+}
+
+bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+	/*
+	 * FIXME: This function will need to be split in order to support
+	 * various PowerPC platforms/PHB types, etc.  It currently assumes SPAPR
+	 * PPC64 guest.
+	 */
+	bool ret = false;
+
+	if ((phys_addr >= SPAPR_PCI_WIN_START) &&
+	    (phys_addr < SPAPR_PCI_WIN_END)) {
+		ret = spapr_phb_mmio(vcpu, phys_addr, data, len, is_write);
+	} else {
+		pr_warning("MMIO %s unknown address %llx (size %d)!\n",
+			   is_write ? "write to" : "read from",
+			   phys_addr, len);
+	}
+	return ret;
+}
+
+#define CONDSTR_BIT(m, b) (((m) & MSR_##b) ? #b" " : "")
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+	struct kvm_regs regs;
+	struct kvm_sregs sregs;
+	int r;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &regs) < 0)
+		die("KVM_GET_REGS failed");
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
+		die("KVM_GET_SREGS failed");
+
+	dprintf(debug_fd, "\n Registers:\n");
+	dprintf(debug_fd, " NIP:   %016llx  MSR:   %016llx "
+		"( %s%s%s%s%s%s%s%s%s%s%s%s)\n",
+		regs.pc, regs.msr,
+		CONDSTR_BIT(regs.msr, SF),
+		CONDSTR_BIT(regs.msr, HV), /* ! */
+		CONDSTR_BIT(regs.msr, VEC),
+		CONDSTR_BIT(regs.msr, VSX),
+		CONDSTR_BIT(regs.msr, EE),
+		CONDSTR_BIT(regs.msr, PR),
+		CONDSTR_BIT(regs.msr, FP),
+		CONDSTR_BIT(regs.msr, ME),
+		CONDSTR_BIT(regs.msr, IR),
+		CONDSTR_BIT(regs.msr, DR),
+		CONDSTR_BIT(regs.msr, RI),
+		CONDSTR_BIT(regs.msr, LE));
+	dprintf(debug_fd, " CTR:   %016llx  LR:    %016llx  CR:   %08llx\n",
+		regs.ctr, regs.lr, regs.cr);
+	dprintf(debug_fd, " SRR0:  %016llx  SRR1:  %016llx  XER:  %016llx\n",
+		regs.srr0, regs.srr1, regs.xer);
+	dprintf(debug_fd, " SPRG0: %016llx  SPRG1: %016llx\n",
+		regs.sprg0, regs.sprg1);
+	dprintf(debug_fd, " SPRG2: %016llx  SPRG3: %016llx\n",
+		regs.sprg2, regs.sprg3);
+	dprintf(debug_fd, " SPRG4: %016llx  SPRG5: %016llx\n",
+		regs.sprg4, regs.sprg5);
+	dprintf(debug_fd, " SPRG6: %016llx  SPRG7: %016llx\n",
+		regs.sprg6, regs.sprg7);
+	dprintf(debug_fd, " GPRs:\n ");
+	for (r = 0; r < 32; r++) {
+		dprintf(debug_fd, "%016llx  ", regs.gpr[r]);
+		if ((r & 3) == 3)
+			dprintf(debug_fd, "\n ");
+	}
+	dprintf(debug_fd, "\n");
+
+	/* FIXME: Assumes SLB-based (book3s) guest */
+	for (r = 0; r < 32; r++) {
+		dprintf(debug_fd, " SLB%02d  %016llx %016llx\n", r,
+			sregs.u.s.ppc64.slb[r].slbe,
+			sregs.u.s.ppc64.slb[r].slbv);
+	}
+	dprintf(debug_fd, "----------\n");
+}
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &vcpu->regs) < 0)
+		die("KVM_GET_REGS failed");
+
+	/* FIXME: Dump/disassemble some code...! */
+
+	dprintf(debug_fd, "\n Stack:\n");
+	dprintf(debug_fd,   " ------\n");
+	/* Only works in real mode: */
+	kvm__dump_mem(vcpu->kvm, vcpu->regs.gpr[1], 32, debug_fd);
+}
+
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+	/* Does nothing yet */
+}
diff --git a/tools/kvm/powerpc/kvm.c b/tools/kvm/powerpc/kvm.c
new file mode 100644
index 0000000..2b03a12
--- /dev/null
+++ b/tools/kvm/powerpc/kvm.c
@@ -0,0 +1,524 @@
+/*
+ * PPC64 (SPAPR) platform support
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * Portions of FDT setup borrowed from QEMU, copyright 2010 David Gibson, IBM
+ * Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+#include "cpu_info.h"
+
+#include "spapr.h"
+#include "spapr_hvcons.h"
+#include "spapr_pci.h"
+
+#include <linux/kvm.h>
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <asm/unistd.h>
+#include <errno.h>
+
+#include <linux/byteorder.h>
+
+#define HPT_ORDER 24
+
+#define HUGETLBFS_PATH "/var/lib/hugetlbfs/global/pagesize-16MB/"
+
+#define PHANDLE_XICP		0x00001111
+
+static char kern_cmdline[2048];
+
+struct kvm_ext kvm_req_ext[] = {
+	{ DEFINE_KVM_EXT(KVM_CAP_PPC_UNSET_IRQ) },
+	{ DEFINE_KVM_EXT(KVM_CAP_PPC_IRQ_LEVEL) },
+	{ 0, 0 }
+};
+
+static uint32_t mfpvr(void)
+{
+	uint32_t r;
+	asm volatile ("mfpvr %0" : "=r"(r));
+	return r;
+}
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+	return true;
+}
+
+void kvm__init_ram(struct kvm *kvm)
+{
+	u64	phys_start, phys_size;
+	void	*host_mem;
+
+	phys_start = 0;
+	phys_size  = kvm->ram_size;
+	host_mem   = kvm->ram_start;
+
+	/*
+	 * We put MMIO at PPC_MMIO_START, high up.  Make sure that this doesn't
+	 * crash into the end of RAM -- on PPC64 at least, this is so high
+	 * (63TB!) that this is unlikely.
+	 */
+	if (phys_size >= PPC_MMIO_START)
+		die("Too much memory (%lld, what a nice problem): "
+		    "overlaps MMIO!\n",
+		    phys_size);
+
+	kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+}
+
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+	/* We don't need anything unusual in here. */
+}
+
+/* Architecture-specific KVM init */
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+	int cap_ppc_rma;
+	unsigned long hpt;
+
+	kvm->ram_size		= ram_size;
+
+	/* Map "default" hugetblfs path to the standard 16M mount point */
+	if (hugetlbfs_path && !strcmp(hugetlbfs_path, "default"))
+		hugetlbfs_path = HUGETLBFS_PATH;
+
+	kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, kvm->ram_size);
+
+	if (kvm->ram_start == MAP_FAILED)
+		die("Couldn't map %lld bytes for RAM (%d)\n",
+		    kvm->ram_size, errno);
+
+	/* FDT goes at top of memory, RTAS just below */
+	kvm->arch.fdt_gra = kvm->ram_size - FDT_MAX_SIZE;
+	/* FIXME: Not all PPC systems have RTAS */
+	kvm->arch.rtas_gra = kvm->arch.fdt_gra - RTAS_MAX_SIZE;
+	madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);
+
+	/* FIXME:  SPAPR-PR specific; allocate a guest HPT. */
+	if (posix_memalign((void **)&hpt, (1<<HPT_ORDER), (1<<HPT_ORDER)))
+		die("Can't allocate %d bytes for HPT\n", (1<<HPT_ORDER));
+
+	kvm->arch.sdr1 = ((hpt + 0x3ffffULL) & ~0x3ffffULL) | (HPT_ORDER-18);
+
+	kvm->arch.pvr = mfpvr();
+
+	/* FIXME: This is book3s-specific */
+	cap_ppc_rma = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_RMA);
+	if (cap_ppc_rma == 2)
+		die("Need contiguous RMA allocation on this hardware, "
+		    "which is not yet supported.");
+
+	/* Do these before FDT setup, IRQ setup, etc. */
+	/* FIXME: SPAPR-specific */
+	hypercall_init();
+	register_core_rtas();
+	/* Now that hypercalls are initialised, register a couple for the console: */
+	spapr_hvcons_init();
+	spapr_create_phb(kvm, "pci", SPAPR_PCI_BUID,
+			 SPAPR_PCI_MEM_WIN_ADDR,
+			 SPAPR_PCI_MEM_WIN_SIZE,
+			 SPAPR_PCI_IO_WIN_ADDR,
+			 SPAPR_PCI_IO_WIN_SIZE);
+}
+
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+	munmap(kvm->ram_start, kvm->ram_size);
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+	kvm__irq_line(kvm, irq, 1);
+	kvm__irq_line(kvm, irq, 0);
+}
+
+void kvm__arch_read_term(struct kvm *kvm)
+{
+	/* FIXME: Should register callbacks to platform-specific polls */
+	spapr_hvcons_poll(kvm);
+}
+
+int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline)
+{
+	void *p;
+	void *k_start;
+	void *i_start;
+	int nr;
+
+	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	p = k_start = guest_flat_to_host(kvm, KERNEL_LOAD_ADDR);
+
+	while ((nr = read(fd_kernel, p, 65536)) > 0)
+		p += nr;
+
+	pr_info("Loaded kernel to 0x%x (%ld bytes)", KERNEL_LOAD_ADDR, p-k_start);
+
+	if (fd_initrd != -1) {
+		if (lseek(fd_initrd, 0, SEEK_SET) < 0)
+			die_perror("lseek");
+
+		if (p-k_start > INITRD_LOAD_ADDR)
+			die("Kernel overlaps initrd!");
+
+		/*
+		 * Load the initrd at its fixed address; the overlap check
+		 * above ensures the kernel ends below it.
+		 */
+		i_start = p = guest_flat_to_host(kvm, INITRD_LOAD_ADDR);
+
+		while (((nr = read(fd_initrd, p, 65536)) > 0) &&
+		       p < (kvm->ram_start + kvm->ram_size))
+			p += nr;
+
+		if (p >= (kvm->ram_start + kvm->ram_size))
+			die("initrd too big to contain in guest RAM.\n");
+
+		pr_info("Loaded initrd to 0x%x (%ld bytes)",
+			INITRD_LOAD_ADDR, p-i_start);
+		kvm->arch.initrd_gra = INITRD_LOAD_ADDR;
+		kvm->arch.initrd_size = p-i_start;
+	} else {
+		kvm->arch.initrd_size = 0;
+	}
+	strncpy(kern_cmdline, kernel_cmdline, 2048);
+	kern_cmdline[2047] = '\0';
+
+	return true;
+}
+
+struct fdt_prop {
+	void *value;
+	int size;
+};
+
+static void generate_segment_page_sizes(struct kvm_ppc_smmu_info *info, struct fdt_prop *prop)
+{
+	struct kvm_ppc_one_seg_page_size *sps;
+	int i, j, size;
+	u32 *p;
+
+	for (size = 0, i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+		sps = &info->sps[i];
+
+		if (sps->page_shift == 0)
+			break;
+
+		/* page shift, slb enc & count */
+		size += 3;
+
+		for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+			if (info->sps[i].enc[j].page_shift == 0)
+				break;
+
+			/* page shift & pte enc */
+			size += 2;
+		}
+	}
+
+	if (!size) {
+		prop->value = NULL;
+		prop->size = 0;
+		return;
+	}
+
+	/* Convert size to bytes */
+	prop->size = size * sizeof(u32);
+
+	prop->value = malloc(prop->size);
+	if (!prop->value)
+		die_perror("malloc failed");
+
+	p = (u32 *)prop->value;
+	for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+		sps = &info->sps[i];
+
+		if (sps->page_shift == 0)
+			break;
+
+		*p++ = sps->page_shift;
+		*p++ = sps->slb_enc;
+
+		for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++)
+			if (!info->sps[i].enc[j].page_shift)
+				break;
+
+		*p++ = j;	/* count of enc */
+
+		for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+			if (!info->sps[i].enc[j].page_shift)
+				break;
+
+			*p++ = info->sps[i].enc[j].page_shift;
+			*p++ = info->sps[i].enc[j].pte_enc;
+		}
+	}
+}
+
+#define SMT_THREADS 4
+
+/*
+ * Set up the FDT for the kernel: this function is currently fairly
+ * SPAPR-heavy.  Most PPC targets will need the CPU/memory nodes, but the
+ * SPAPR-specific parts such as RTAS should eventually be split out.
+ */
+static int setup_fdt(struct kvm *kvm)
+{
+	uint64_t 	mem_reg_property[] = { 0, cpu_to_be64(kvm->ram_size) };
+	int 		smp_cpus = kvm->nrcpus;
+	uint32_t	int_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
+	char 		hypertas_prop_kvm[] = "hcall-pft\0hcall-term\0"
+		"hcall-dabr\0hcall-interrupt\0hcall-tce\0hcall-vio\0"
+		"hcall-splpar\0hcall-bulk";
+	int 		i, j;
+	char 		cpu_name[30];
+	u8		staging_fdt[FDT_MAX_SIZE];
+	struct cpu_info *cpu_info = find_cpu_info(kvm);
+	struct fdt_prop segment_page_sizes;
+	u32 segment_sizes_1T[] = {0x1c, 0x28, 0xffffffff, 0xffffffff};
+
+	/* Generate an appropriate DT at kvm->arch.fdt_gra */
+	void *fdt_dest = guest_flat_to_host(kvm, kvm->arch.fdt_gra);
+	void *fdt = staging_fdt;
+
+	_FDT(fdt_create(fdt, FDT_MAX_SIZE));
+	_FDT(fdt_finish_reservemap(fdt));
+
+	_FDT(fdt_begin_node(fdt, ""));
+
+	_FDT(fdt_property_string(fdt, "device_type", "chrp"));
+	_FDT(fdt_property_string(fdt, "model", "IBM pSeries (kvmtool)"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x2));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
+
+	/* RTAS */
+	_FDT(fdt_begin_node(fdt, "rtas"));
+	/* This is the property the kernel checks to decide it's an LPAR: */
+	_FDT(fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop_kvm,
+			  sizeof(hypertas_prop_kvm)));
+	_FDT(fdt_property_cell(fdt, "linux,rtas-base", kvm->arch.rtas_gra));
+	_FDT(fdt_property_cell(fdt, "linux,rtas-entry", kvm->arch.rtas_gra));
+	_FDT(fdt_property_cell(fdt, "rtas-size", kvm->arch.rtas_size));
+	/* Now add properties for all RTAS tokens: */
+	if (spapr_rtas_fdt_setup(kvm, fdt))
+		die("Couldn't create RTAS FDT properties\n");
+
+	_FDT(fdt_end_node(fdt));
+
+	/* /chosen */
+	_FDT(fdt_begin_node(fdt, "chosen"));
+	/* cmdline */
+	_FDT(fdt_property_string(fdt, "bootargs", kern_cmdline));
+	/* Initrd */
+	if (kvm->arch.initrd_size != 0) {
+		uint32_t ird_st_prop = cpu_to_be32(kvm->arch.initrd_gra);
+		uint32_t ird_end_prop = cpu_to_be32(kvm->arch.initrd_gra +
+						    kvm->arch.initrd_size);
+		_FDT(fdt_property(fdt, "linux,initrd-start",
+				   &ird_st_prop, sizeof(ird_st_prop)));
+		_FDT(fdt_property(fdt, "linux,initrd-end",
+				   &ird_end_prop, sizeof(ird_end_prop)));
+	}
+
+	/*
+	 * stdout-path: This is assuming we're using the HV console.  Also, the
+	 * address is hardwired until we do a VIO bus.
+	 */
+	_FDT(fdt_property_string(fdt, "linux,stdout-path",
+				 "/vdevice/vty@30000000"));
+	_FDT(fdt_end_node(fdt));
+
+	/*
+	 * Memory: we don't allocate a separate RMA yet.  If we ever need to
+	 * (CAP_PPC_RMA == 2), have one memory node for 0->RMAsize and
+	 * another for RMAsize->endOfMem.
+	 */
+	_FDT(fdt_begin_node(fdt, "memory@0"));
+	_FDT(fdt_property_string(fdt, "device_type", "memory"));
+	_FDT(fdt_property(fdt, "reg", mem_reg_property,
+			  sizeof(mem_reg_property)));
+	_FDT(fdt_end_node(fdt));
+
+	generate_segment_page_sizes(&cpu_info->mmu_info, &segment_page_sizes);
+
+	/* CPUs */
+	_FDT(fdt_begin_node(fdt, "cpus"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x1));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x0));
+
+	for (i = 0; i < smp_cpus; i += SMT_THREADS) {
+		int32_t pft_size_prop[] = { 0, HPT_ORDER };
+		uint32_t servers_prop[SMT_THREADS];
+		uint32_t gservers_prop[SMT_THREADS * 2];
+		int threads = (smp_cpus - i) >= SMT_THREADS ? SMT_THREADS :
+			smp_cpus - i;
+
+		sprintf(cpu_name, "PowerPC,%s@%d", cpu_info->name, i);
+		_FDT(fdt_begin_node(fdt, cpu_name));
+		sprintf(cpu_name, "PowerPC,%s", cpu_info->name);
+		_FDT(fdt_property_string(fdt, "name", cpu_name));
+		_FDT(fdt_property_string(fdt, "device_type", "cpu"));
+
+		_FDT(fdt_property_cell(fdt, "reg", i));
+		_FDT(fdt_property_cell(fdt, "cpu-version", kvm->arch.pvr));
+
+		_FDT(fdt_property_cell(fdt, "dcache-block-size", cpu_info->d_bsize));
+		_FDT(fdt_property_cell(fdt, "icache-block-size", cpu_info->i_bsize));
+
+		if (cpu_info->tb_freq)
+			_FDT(fdt_property_cell(fdt, "timebase-frequency", cpu_info->tb_freq));
+
+		/* Lies, but safeish lies! */
+		_FDT(fdt_property_cell(fdt, "clock-frequency", 0xddbab200));
+
+		if (cpu_info->mmu_info.slb_size)
+			_FDT(fdt_property_cell(fdt, "ibm,slb-size", cpu_info->mmu_info.slb_size));
+
+		/*
+		 * HPT size is hardwired; KVM currently fixes it at 16MB but the
+		 * moment that changes we'll need to read it out of the kernel.
+		 */
+		_FDT(fdt_property(fdt, "ibm,pft-size", pft_size_prop,
+				  sizeof(pft_size_prop)));
+
+		_FDT(fdt_property_string(fdt, "status", "okay"));
+		_FDT(fdt_property(fdt, "64-bit", NULL, 0));
+		/* A server for each thread in this core */
+		for (j = 0; j < SMT_THREADS; j++) {
+			servers_prop[j] = cpu_to_be32(i+j);
+			/*
+			 * Hack borrowed from QEMU, direct the group queues back
+			 * to cpu 0:
+			 */
+			gservers_prop[j*2] = cpu_to_be32(i+j);
+			gservers_prop[j*2 + 1] = 0;
+		}
+		_FDT(fdt_property(fdt, "ibm,ppc-interrupt-server#s",
+				   servers_prop, threads * sizeof(uint32_t)));
+		_FDT(fdt_property(fdt, "ibm,ppc-interrupt-gserver#s",
+				  gservers_prop,
+				  threads * 2 * sizeof(uint32_t)));
+
+		if (segment_page_sizes.value)
+			_FDT(fdt_property(fdt, "ibm,segment-page-sizes",
+					  segment_page_sizes.value,
+					  segment_page_sizes.size));
+
+		if (cpu_info->mmu_info.flags & KVM_PPC_1T_SEGMENTS)
+			_FDT(fdt_property(fdt, "ibm,processor-segment-sizes",
+					  segment_sizes_1T, sizeof(segment_sizes_1T)));
+
+		/* VSX / DFP options: */
+		if (cpu_info->flags & CPUINFO_FLAG_VMX)
+			_FDT(fdt_property_cell(fdt, "ibm,vmx",
+					       (cpu_info->flags &
+						CPUINFO_FLAG_VSX) ? 2 : 1));
+		if (cpu_info->flags & CPUINFO_FLAG_DFP)
+			_FDT(fdt_property_cell(fdt, "ibm,dfp", 0x1));
+		_FDT(fdt_end_node(fdt));
+	}
+	_FDT(fdt_end_node(fdt));
+
+	/* IRQ controller */
+	_FDT(fdt_begin_node(fdt, "interrupt-controller@0"));
+
+	_FDT(fdt_property_string(fdt, "device_type",
+				 "PowerPC-External-Interrupt-Presentation"));
+	_FDT(fdt_property_string(fdt, "compatible", "IBM,ppc-xicp"));
+	_FDT(fdt_property_cell(fdt, "reg", 0));
+	_FDT(fdt_property(fdt, "interrupt-controller", NULL, 0));
+	_FDT(fdt_property(fdt, "ibm,interrupt-server-ranges",
+			   int_server_ranges_prop,
+			   sizeof(int_server_ranges_prop)));
+	_FDT(fdt_property_cell(fdt, "#interrupt-cells", 2));
+	_FDT(fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP));
+	_FDT(fdt_property_cell(fdt, "phandle", PHANDLE_XICP));
+	_FDT(fdt_end_node(fdt));
+
+	/*
+	 * VIO: See comment in linux,stdout-path; we don't yet represent a VIO
+	 * bus/address allocation so addresses are hardwired here.
+	 */
+	_FDT(fdt_begin_node(fdt, "vdevice"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x1));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x0));
+	_FDT(fdt_property_string(fdt, "device_type", "vdevice"));
+	_FDT(fdt_property_string(fdt, "compatible", "IBM,vdevice"));
+	_FDT(fdt_begin_node(fdt, "vty@30000000"));
+	_FDT(fdt_property_string(fdt, "name", "vty"));
+	_FDT(fdt_property_string(fdt, "device_type", "serial"));
+	_FDT(fdt_property_string(fdt, "compatible", "hvterm1"));
+	_FDT(fdt_property_cell(fdt, "reg", 0x30000000));
+	_FDT(fdt_end_node(fdt));
+	_FDT(fdt_end_node(fdt));
+
+	/* Finalise: */
+	_FDT(fdt_end_node(fdt)); /* Root node */
+	_FDT(fdt_finish(fdt));
+
+	_FDT(fdt_open_into(fdt, fdt_dest, FDT_MAX_SIZE));
+
+	/* PCI */
+	if (spapr_populate_pci_devices(kvm, PHANDLE_XICP, fdt_dest))
+		die("Fail populating PCI device nodes");
+
+	_FDT(fdt_add_mem_rsv(fdt_dest, kvm->arch.rtas_gra, kvm->arch.rtas_size));
+	_FDT(fdt_pack(fdt_dest));
+
+	free(segment_page_sizes.value);
+
+	return 0;
+}
+firmware_init(setup_fdt);
+
+/**
+ * kvm__arch_setup_firmware
+ */
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+	/*
+	 * Set up RTAS stub.  All it is is a single hypercall:
+	 *  0:   7c 64 1b 78     mr      r4,r3
+	 *  4:   3c 60 00 00     lis     r3,0
+	 *  8:   60 63 f0 00     ori     r3,r3,61440
+	 *  c:   44 00 00 22     sc      1
+	 * 10:   4e 80 00 20     blr
+	 */
+	uint32_t *rtas = guest_flat_to_host(kvm, kvm->arch.rtas_gra);
+
+	rtas[0] = 0x7c641b78;
+	rtas[1] = 0x3c600000;
+	rtas[2] = 0x6063f000;
+	rtas[3] = 0x44000022;
+	rtas[4] = 0x4e800020;
+	kvm->arch.rtas_size = 20;
+
+	pr_info("Set up %ld bytes of RTAS at 0x%lx\n",
+		kvm->arch.rtas_size, kvm->arch.rtas_gra);
+
+	/* Load SLOF */
+
+	return 0;
+}
+
+int kvm__arch_free_firmware(struct kvm *kvm)
+{
+	return 0;
+}
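
As a sanity check on the RTAS stub in kvm__arch_setup_firmware() above: the
lis/ori pair builds the private hypercall number in r3, and reassembling the
immediates from the two instruction words recovers KVMPPC_H_RTAS (0xf000, per
spapr.h below). A throwaway sketch of that decode:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t lis = 0x3c600000;	/* lis r3,IMM: IMM in low 16 bits */
	uint32_t ori = 0x6063f000;	/* ori r3,r3,IMM */
	uint32_t r3 = ((lis & 0xffff) << 16) | (ori & 0xffff);

	printf("RTAS stub invokes hcall 0x%x\n", r3); /* 0xf000: KVMPPC_H_RTAS */
	return 0;
}
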
diff --git a/tools/kvm/powerpc/spapr.h b/tools/kvm/powerpc/spapr.h
new file mode 100644
index 0000000..0537f88
--- /dev/null
+++ b/tools/kvm/powerpc/spapr.h
@@ -0,0 +1,93 @@
+/*
+ * SPAPR definitions and declarations
+ *
+ * Borrowed heavily from QEMU's spapr.h,
+ * Copyright (c) 2010 David Gibson, IBM Corporation.
+ *
+ * Modifications by Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#if !defined(__HW_SPAPR_H__)
+#define __HW_SPAPR_H__
+
+#include <inttypes.h>
+
+/* We need some of the H_ hcall defs, but they're __KERNEL__ only. */
+#define __KERNEL__
+#include <asm/hvcall.h>
+#undef __KERNEL__
+
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+typedef unsigned long target_ulong;
+typedef uintptr_t target_phys_addr_t;
+
+/*
+ * The H_ hcalls pulled in above from asm/hvcall.h are standardized in
+ * PAPR and implemented by pHyp as well.
+ *
+ * We also need some hcalls which are specific to qemu / KVM-on-POWER.
+ * So far we just need one for H_RTAS, but in future we'll need more
+ * for extensions like virtio.  We put those into the 0xf000-0xfffc
+ * range which is reserved by PAPR for "platform-specific" hcalls.
+ */
+#define KVMPPC_HCALL_BASE       0xf000
+#define KVMPPC_H_RTAS           (KVMPPC_HCALL_BASE + 0x0)
+#define KVMPPC_HCALL_MAX        KVMPPC_H_RTAS
+
+#define DEBUG_SPAPR_HCALLS
+
+#ifdef DEBUG_SPAPR_HCALLS
+#define hcall_dprintf(fmt, ...) \
+    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define hcall_dprintf(fmt, ...) \
+    do { } while (0)
+#endif
+
+typedef target_ulong (*spapr_hcall_fn)(struct kvm_cpu *vcpu,
+				       target_ulong opcode,
+				       target_ulong *args);
+
+void hypercall_init(void);
+void register_core_rtas(void);
+
+void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn);
+target_ulong spapr_hypercall(struct kvm_cpu *vcpu, target_ulong opcode,
+                             target_ulong *args);
+
+int spapr_rtas_fdt_setup(struct kvm *kvm, void *fdt);
+
+static inline uint32_t rtas_ld(struct kvm *kvm, target_ulong phys, int n)
+{
+	return *((uint32_t *)guest_flat_to_host(kvm, phys + 4*n));
+}
+
+static inline void rtas_st(struct kvm *kvm, target_ulong phys, int n, uint32_t val)
+{
+	*((uint32_t *)guest_flat_to_host(kvm, phys + 4*n)) = val;
+}
+
+typedef void (*spapr_rtas_fn)(struct kvm_cpu *vcpu, uint32_t token,
+                              uint32_t nargs, target_ulong args,
+                              uint32_t nret, target_ulong rets);
+void spapr_rtas_register(const char *name, spapr_rtas_fn fn);
+target_ulong spapr_rtas_call(struct kvm_cpu *vcpu,
+                             uint32_t token, uint32_t nargs, target_ulong args,
+                             uint32_t nret, target_ulong rets);
+
+#define SPAPR_PCI_BUID          0x800000020000001ULL
+#define SPAPR_PCI_MEM_WIN_ADDR  (KVM_MMIO_START + 0xA0000000)
+#define SPAPR_PCI_MEM_WIN_SIZE  0x20000000
+#define SPAPR_PCI_IO_WIN_ADDR   (SPAPR_PCI_MEM_WIN_ADDR + SPAPR_PCI_MEM_WIN_SIZE)
+#define SPAPR_PCI_IO_WIN_SIZE	0x2000000
+
+#define SPAPR_PCI_WIN_START	SPAPR_PCI_MEM_WIN_ADDR
+#define SPAPR_PCI_WIN_END	(SPAPR_PCI_IO_WIN_ADDR + SPAPR_PCI_IO_WIN_SIZE)
+
+#endif /* !defined (__HW_SPAPR_H__) */
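
For reference, the rtas_ld()/rtas_st() accessors above walk the standard RTAS
argument buffer: 32-bit words holding token, nargs and nret, followed by
nargs argument words and nret return words. That layout is where the
rtas_r3 + 12 + 4*nargs arithmetic in spapr_hcall.c comes from. A sketch over
a local array (the token value here is made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* token, nargs, nret, then the args, then space for the rets */
	uint32_t buf[6] = { 0x2003, 1, 2, 42, 0, 0 };
	uint32_t nargs = buf[1];
	uint32_t args_off = 12;			/* bytes, past the 3-word header */
	uint32_t rets_off = args_off + 4 * nargs;

	printf("token=0x%x arg0=%u rets at byte offset %u\n",
	       buf[0], buf[args_off / 4], rets_off);
	return 0;
}
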
diff --git a/tools/kvm/powerpc/spapr_hcall.c b/tools/kvm/powerpc/spapr_hcall.c
new file mode 100644
index 0000000..ff1d63a
--- /dev/null
+++ b/tools/kvm/powerpc/spapr_hcall.c
@@ -0,0 +1,134 @@
+/*
+ * SPAPR hypercalls
+ *
+ * Borrowed heavily from QEMU's spapr_hcall.c,
+ * Copyright (c) 2010 David Gibson, IBM Corporation.
+ *
+ * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "spapr.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1];
+static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX -
+					     KVMPPC_HCALL_BASE + 1];
+
+static target_ulong h_set_dabr(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+	/* FIXME:  Implement this for -PR.  (-HV does this in kernel.) */
+	return H_HARDWARE;
+}
+
+static target_ulong h_rtas(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+	target_ulong rtas_r3 = args[0];
+	/*
+	 * Pointer read from phys mem; these ptrs cannot be MMIO (!) so just
+	 * reference guest RAM directly.
+	 */
+	uint32_t token, nargs, nret;
+
+	token = rtas_ld(vcpu->kvm, rtas_r3, 0);
+	nargs = rtas_ld(vcpu->kvm, rtas_r3, 1);
+	nret  = rtas_ld(vcpu->kvm, rtas_r3, 2);
+
+	return spapr_rtas_call(vcpu, token, nargs, rtas_r3 + 12,
+			       nret, rtas_r3 + 12 + 4*nargs);
+}
+
+static target_ulong h_logical_load(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+	/* SLOF will require these, though the kernel doesn't. */
+	die(__PRETTY_FUNCTION__);
+	return H_PARAMETER;
+}
+
+static target_ulong h_logical_store(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+	/* SLOF will require these, though the kernel doesn't. */
+	die(__PRETTY_FUNCTION__);
+	return H_PARAMETER;
+}
+
+static target_ulong h_logical_icbi(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+	/* KVM will trap this in the kernel.  Die if it misses. */
+	die(__PRETTY_FUNCTION__);
+	return H_SUCCESS;
+}
+
+static target_ulong h_logical_dcbf(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+	/* KVM will trap this in the kernel.  Die if it misses. */
+	die(__PRETTY_FUNCTION__);
+	return H_SUCCESS;
+}
+
+void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn)
+{
+	spapr_hcall_fn *slot;
+
+	if (opcode <= MAX_HCALL_OPCODE) {
+		assert((opcode & 0x3) == 0);
+
+		slot = &papr_hypercall_table[opcode / 4];
+	} else {
+		assert((opcode >= KVMPPC_HCALL_BASE) &&
+		       (opcode <= KVMPPC_HCALL_MAX));
+
+		slot = &kvmppc_hypercall_table[opcode - KVMPPC_HCALL_BASE];
+	}
+
+	assert(!(*slot) || (fn == *slot));
+	*slot = fn;
+}
+
+target_ulong spapr_hypercall(struct kvm_cpu *vcpu, target_ulong opcode,
+			     target_ulong *args)
+{
+	if ((opcode <= MAX_HCALL_OPCODE)
+	    && ((opcode & 0x3) == 0)) {
+		spapr_hcall_fn fn = papr_hypercall_table[opcode / 4];
+
+		if (fn) {
+			return fn(vcpu, opcode, args);
+		}
+	} else if ((opcode >= KVMPPC_HCALL_BASE) &&
+		   (opcode <= KVMPPC_HCALL_MAX)) {
+		spapr_hcall_fn fn = kvmppc_hypercall_table[opcode -
+							   KVMPPC_HCALL_BASE];
+
+		if (fn) {
+			return fn(vcpu, opcode, args);
+		}
+	}
+
+	hcall_dprintf("Unimplemented hcall 0x%lx\n", opcode);
+	return H_FUNCTION;
+}
+
+void hypercall_init(void)
+{
+	/* hcall-dabr */
+	spapr_register_hypercall(H_SET_DABR, h_set_dabr);
+
+	spapr_register_hypercall(H_LOGICAL_CI_LOAD, h_logical_load);
+	spapr_register_hypercall(H_LOGICAL_CI_STORE, h_logical_store);
+	spapr_register_hypercall(H_LOGICAL_CACHE_LOAD, h_logical_load);
+	spapr_register_hypercall(H_LOGICAL_CACHE_STORE, h_logical_store);
+	spapr_register_hypercall(H_LOGICAL_ICBI, h_logical_icbi);
+	spapr_register_hypercall(H_LOGICAL_DCBF, h_logical_dcbf);
+
+	/* KVM-PPC specific hcalls */
+	spapr_register_hypercall(KVMPPC_H_RTAS, h_rtas);
+}
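
The dispatch in spapr_hypercall() above relies on standard PAPR opcodes being
multiples of 4, so opcode/4 gives a dense index into one table, while the
kvmtool-private range indexes a second table from KVMPPC_HCALL_BASE. A
minimal sketch of just the index calculation; MAX_HCALL_OPCODE here is an
illustrative stand-in for the real value from asm/hvcall.h:

#include <stdio.h>

#define MAX_HCALL_OPCODE	0x2a4	/* illustrative stand-in */
#define KVMPPC_HCALL_BASE	0xf000
#define KVMPPC_HCALL_MAX	0xf000	/* KVMPPC_H_RTAS is the only one so far */

static int hcall_index(unsigned long opcode)
{
	if (opcode <= MAX_HCALL_OPCODE && (opcode & 0x3) == 0)
		return opcode / 4;			/* standard PAPR table */
	if (opcode >= KVMPPC_HCALL_BASE && opcode <= KVMPPC_HCALL_MAX)
		return opcode - KVMPPC_HCALL_BASE;	/* private table */
	return -1;
}

int main(void)
{
	printf("0x108 -> standard slot %d\n", hcall_index(0x108));
	printf("0xf000 -> private slot %d\n", hcall_index(0xf000));
	printf("0x106 -> %d (not 4-aligned, rejected)\n", hcall_index(0x106));
	return 0;
}
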
diff --git a/tools/kvm/powerpc/spapr_hvcons.c b/tools/kvm/powerpc/spapr_hvcons.c
new file mode 100644
index 0000000..605367b
--- /dev/null
+++ b/tools/kvm/powerpc/spapr_hvcons.c
@@ -0,0 +1,105 @@
+/*
+ * SPAPR HV console
+ *
+ * Borrowed lightly from QEMU's spapr_vty.c, Copyright (c) 2010 David Gibson,
+ * IBM Corporation.
+ *
+ * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+#include "spapr.h"
+#include "spapr_hvcons.h"
+
+#include <stdio.h>
+#include <sys/uio.h>
+#include <errno.h>
+
+#include <linux/byteorder.h>
+
+union hv_chario {
+	struct {
+		uint64_t char0_7;
+		uint64_t char8_15;
+	} a;
+	uint8_t buf[16];
+};
+
+static unsigned long h_put_term_char(struct kvm_cpu *vcpu, unsigned long opcode, unsigned long *args)
+{
+	/* To do: Read register from args[0], and check it. */
+	unsigned long len = args[1];
+	union hv_chario data;
+	struct iovec iov;
+
+	if (len > 16) {
+		return H_PARAMETER;
+	}
+	data.a.char0_7 = cpu_to_be64(args[2]);
+	data.a.char8_15 = cpu_to_be64(args[3]);
+
+	iov.iov_base = data.buf;
+	iov.iov_len = len;
+	do {
+		int ret;
+
+		ret = term_putc_iov(&iov, 1, 0);
+		if (ret < 0) {
+			die("term_putc_iov error %d!\n", errno);
+		}
+		iov.iov_base += ret;
+		iov.iov_len -= ret;
+	} while (iov.iov_len > 0);
+
+	return H_SUCCESS;
+}
+
+
+static unsigned long h_get_term_char(struct kvm_cpu *vcpu, unsigned long opcode, unsigned long *args)
+{
+	/* To do: Read register from args[0], and check it. */
+	unsigned long *len = args + 0;
+	unsigned long *char0_7 = args + 1;
+	unsigned long *char8_15 = args + 2;
+	union hv_chario data;
+	struct iovec iov;
+
+	if (vcpu->kvm->cfg.active_console != CONSOLE_HV)
+		return H_SUCCESS;
+
+	if (term_readable(0)) {
+		iov.iov_base = data.buf;
+		iov.iov_len = 16;
+
+		*len = term_getc_iov(vcpu->kvm, &iov, 1, 0);
+		*char0_7 = be64_to_cpu(data.a.char0_7);
+		*char8_15 = be64_to_cpu(data.a.char8_15);
+	} else {
+		*len = 0;
+	}
+
+	return H_SUCCESS;
+}
+
+void spapr_hvcons_poll(struct kvm *kvm)
+{
+	if (term_readable(0)) {
+		/*
+		 * We could inject an IRQ into the guest here if we wanted
+		 * to, but the guest happily polls, so it isn't required.
+		 */
+	}
+}
+
+void spapr_hvcons_init(void)
+{
+	spapr_register_hypercall(H_PUT_TERM_CHAR, h_put_term_char);
+	spapr_register_hypercall(H_GET_TERM_CHAR, h_get_term_char);
+}
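
The hv_chario union in spapr_hvcons.c reflects how H_PUT_TERM_CHAR carries up
to 16 console bytes in two registers, most significant byte first, which is
why the handler byte-swaps on little-endian hosts. A standalone sketch of the
packing for the first register:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

static uint64_t pack_be64(const char *s, size_t len)
{
	uint64_t v = 0;
	size_t i;

	/* First byte of the string ends up in the most significant byte. */
	for (i = 0; i < 8; i++)
		v = (v << 8) | (i < len ? (uint8_t)s[i] : 0);
	return v;
}

int main(void)
{
	const char *msg = "hello";

	/* 0x68656c6c6f000000: 'h' 'e' 'l' 'l' 'o' then zero padding */
	printf("char0_7 = 0x%016llx\n",
	       (unsigned long long)pack_be64(msg, strlen(msg)));
	return 0;
}
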
diff --git a/tools/kvm/powerpc/spapr_hvcons.h b/tools/kvm/powerpc/spapr_hvcons.h
new file mode 100644
index 0000000..d3e4414
--- /dev/null
+++ b/tools/kvm/powerpc/spapr_hvcons.h
@@ -0,0 +1,19 @@
+/*
+ * SPAPR HV console
+ *
+ * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef SPAPR_HVCONS_H
+#define SPAPR_HVCONS_H
+
+#include "kvm/kvm.h"
+
+void spapr_hvcons_init(void);
+void spapr_hvcons_poll(struct kvm *kvm);
+
+#endif
diff --git a/tools/kvm/powerpc/spapr_pci.c b/tools/kvm/powerpc/spapr_pci.c
new file mode 100644
index 0000000..768e3f2
--- /dev/null
+++ b/tools/kvm/powerpc/spapr_pci.c
@@ -0,0 +1,409 @@
+/*
+ * SPAPR PHB emulation, RTAS interface to PCI config space, device tree nodes
+ * for enumerated devices.
+ *
+ * Borrowed heavily from QEMU's spapr_pci.c,
+ * Copyright (c) 2011 Alexey Kardashevskiy, IBM Corporation.
+ * Copyright (c) 2011 David Gibson, IBM Corporation.
+ *
+ * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "spapr.h"
+#include "spapr_pci.h"
+#include "kvm/devices.h"
+#include "kvm/fdt.h"
+#include "kvm/util.h"
+#include "kvm/of_pci.h"
+#include "kvm/pci.h"
+
+#include <linux/pci_regs.h>
+#include <linux/byteorder.h>
+
+
+/* #define DEBUG_PHB yes */
+#ifdef DEBUG_PHB
+#define phb_dprintf(fmt, ...)					\
+	do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define phb_dprintf(fmt, ...)			\
+	do { } while (0)
+#endif
+
+static const uint32_t bars[] = {
+	PCI_BASE_ADDRESS_0, PCI_BASE_ADDRESS_1,
+	PCI_BASE_ADDRESS_2, PCI_BASE_ADDRESS_3,
+	PCI_BASE_ADDRESS_4, PCI_BASE_ADDRESS_5
+	/*, PCI_ROM_ADDRESS*/
+};
+
+#define PCI_NUM_REGIONS		7
+
+static struct spapr_phb phb;
+
+static void rtas_ibm_read_pci_config(struct kvm_cpu *vcpu,
+				     uint32_t token, uint32_t nargs,
+				     target_ulong args,
+				     uint32_t nret, target_ulong rets)
+{
+	uint32_t val = 0;
+	uint64_t buid = ((uint64_t)rtas_ld(vcpu->kvm, args, 1) << 32) | rtas_ld(vcpu->kvm, args, 2);
+	union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+	struct pci_device_header *dev = pci__find_dev(addr.device_number);
+	uint32_t size = rtas_ld(vcpu->kvm, args, 3);
+
+	if (buid != phb.buid || !dev || (size > 4)) {
+		phb_dprintf("- cfgRd buid 0x%lx cfg addr 0x%x size %d not found\n",
+			    buid, addr.w, size);
+
+		rtas_st(vcpu->kvm, rets, 0, -1);
+		return;
+	}
+	pci__config_rd(vcpu->kvm, addr, &val, size);
+	/* It appears this wants a byteswapped result... */
+	switch (size) {
+	case 4:
+		val = le32_to_cpu(val);
+		break;
+	case 2:
+		val = le16_to_cpu(val>>16);
+		break;
+	case 1:
+		val = val >> 24;
+		break;
+	}
+	phb_dprintf("- cfgRd buid 0x%lx addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n",
+		    buid, addr.w, size, addr.bus_number, addr.device_number, addr.function_number,
+		    addr.register_number, val);
+
+	rtas_st(vcpu->kvm, rets, 0, 0);
+	rtas_st(vcpu->kvm, rets, 1, val);
+}
+
+static void rtas_read_pci_config(struct kvm_cpu *vcpu,
+				 uint32_t token, uint32_t nargs,
+				 target_ulong args,
+				 uint32_t nret, target_ulong rets)
+{
+	uint32_t val;
+	union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+	struct pci_device_header *dev = pci__find_dev(addr.device_number);
+	uint32_t size = rtas_ld(vcpu->kvm, args, 1);
+
+	if (!dev || (size > 4)) {
+		rtas_st(vcpu->kvm, rets, 0, -1);
+		return;
+	}
+	pci__config_rd(vcpu->kvm, addr, &val, size);
+	switch (size) {
+	case 4:
+		val = le32_to_cpu(val);
+		break;
+	case 2:
+		val = le16_to_cpu(val>>16); /* We're yuck-endian. */
+		break;
+	case 1:
+		val = val >> 24;
+		break;
+	}
+	phb_dprintf("- cfgRd addr 0x%x size %d, val 0x%x\n", addr.w, size, val);
+	rtas_st(vcpu->kvm, rets, 0, 0);
+	rtas_st(vcpu->kvm, rets, 1, val);
+}
+
+static void rtas_ibm_write_pci_config(struct kvm_cpu *vcpu,
+				      uint32_t token, uint32_t nargs,
+				      target_ulong args,
+				      uint32_t nret, target_ulong rets)
+{
+	uint64_t buid = ((uint64_t)rtas_ld(vcpu->kvm, args, 1) << 32) | rtas_ld(vcpu->kvm, args, 2);
+	union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+	struct pci_device_header *dev = pci__find_dev(addr.device_number);
+	uint32_t size = rtas_ld(vcpu->kvm, args, 3);
+	uint32_t val = rtas_ld(vcpu->kvm, args, 4);
+
+	if (buid != phb.buid || !dev || (size > 4)) {
+		phb_dprintf("- cfgWr buid 0x%lx cfg addr 0x%x/%d error (val 0x%x)\n",
+			    buid, addr.w, size, val);
+
+		rtas_st(vcpu->kvm, rets, 0, -1);
+		return;
+	}
+	phb_dprintf("- cfgWr buid 0x%lx addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n",
+		    buid, addr.w, size, addr.bus_number, addr.device_number, addr.function_number,
+		    addr.register_number, val);
+	switch (size) {
+	case 4:
+		val = le32_to_cpu(val);
+		break;
+	case 2:
+		val = le16_to_cpu(val) << 16;
+		break;
+	case 1:
+		val = val >> 24;
+		break;
+	}
+	pci__config_wr(vcpu->kvm, addr, &val, size);
+	rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+static void rtas_write_pci_config(struct kvm_cpu *vcpu,
+				  uint32_t token, uint32_t nargs,
+				  target_ulong args,
+				  uint32_t nret, target_ulong rets)
+{
+	union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+	struct pci_device_header *dev = pci__find_dev(addr.device_number);
+	uint32_t size = rtas_ld(vcpu->kvm, args, 1);
+	uint32_t val = rtas_ld(vcpu->kvm, args, 2);
+
+	if (!dev || (size > 4)) {
+		rtas_st(vcpu->kvm, rets, 0, -1);
+		return;
+	}
+
+	phb_dprintf("- cfgWr addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n",
+		    addr.w, size, addr.bus_number, addr.device_number, addr.function_number,
+		    addr.register_number, val);
+	switch (size) {
+	case 4:
+		val = le32_to_cpu(val);
+		break;
+	case 2:
+		val = le16_to_cpu(val) << 16;
+		break;
+	case 1:
+		val = val >> 24;
+		break;
+	}
+	pci__config_wr(vcpu->kvm, addr, &val, size);
+	rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+void spapr_create_phb(struct kvm *kvm,
+		      const char *busname, uint64_t buid,
+		      uint64_t mem_win_addr, uint64_t mem_win_size,
+		      uint64_t io_win_addr, uint64_t io_win_size)
+{
+	/*
+	 * Since kvmtool doesn't really have any concept of buses etc.,
+	 * there's nothing to register here.  Just register RTAS.
+	 */
+	spapr_rtas_register("read-pci-config", rtas_read_pci_config);
+	spapr_rtas_register("write-pci-config", rtas_write_pci_config);
+	spapr_rtas_register("ibm,read-pci-config", rtas_ibm_read_pci_config);
+	spapr_rtas_register("ibm,write-pci-config", rtas_ibm_write_pci_config);
+
+	phb.buid = buid;
+	phb.mem_addr = mem_win_addr;
+	phb.mem_size = mem_win_size;
+	phb.io_addr  = io_win_addr;
+	phb.io_size  = io_win_size;
+
+	kvm->arch.phb = &phb;
+}
+
+static uint32_t bar_to_ss(unsigned long bar)
+{
+	if ((bar & PCI_BASE_ADDRESS_SPACE) ==
+	    PCI_BASE_ADDRESS_SPACE_IO)
+		return OF_PCI_SS_IO;
+	else if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64)
+		return OF_PCI_SS_M64;
+	else
+		return OF_PCI_SS_M32;
+}
+
+static unsigned long bar_to_addr(unsigned long bar)
+{
+	if ((bar & PCI_BASE_ADDRESS_SPACE) ==
+	    PCI_BASE_ADDRESS_SPACE_IO)
+		return bar & PCI_BASE_ADDRESS_IO_MASK;
+	else
+		return bar & PCI_BASE_ADDRESS_MEM_MASK;
+}
+
+int spapr_populate_pci_devices(struct kvm *kvm,
+			       uint32_t xics_phandle,
+			       void *fdt)
+{
+	int bus_off, node_off = 0, devid, fn, i, n, devices;
+	struct device_header *dev_hdr;
+	char nodename[256];
+	struct of_pci_unit_address reg[PCI_NUM_REGIONS + 1],
+				   assigned_addresses[PCI_NUM_REGIONS];
+	uint32_t bus_range[] = { cpu_to_be32(0), cpu_to_be32(0xff) };
+	struct of_pci_ranges_entry ranges[] = {
+		{
+			{
+				cpu_to_be32(of_pci_b_ss(1)),
+				cpu_to_be32(0),
+				cpu_to_be32(0),
+			},
+			cpu_to_be64(phb.io_addr),
+			cpu_to_be64(phb.io_size),
+		},
+		{
+			{
+				cpu_to_be32(of_pci_b_ss(2)),
+				cpu_to_be32(0),
+				cpu_to_be32(0),
+			},
+			cpu_to_be64(phb.mem_addr),
+			cpu_to_be64(phb.mem_size),
+		},
+	};
+	uint64_t bus_reg[] = { cpu_to_be64(phb.buid), 0 };
+	uint32_t interrupt_map_mask[] = {
+		cpu_to_be32(of_pci_b_ddddd(-1)|of_pci_b_fff(-1)), 0x0, 0x0, 0x0};
+	uint32_t interrupt_map[SPAPR_PCI_NUM_LSI][7];
+
+	/* Start populating the FDT */
+	sprintf(nodename, "pci@%" PRIx64, phb.buid);
+	bus_off = fdt_add_subnode(fdt, 0, nodename);
+	if (bus_off < 0) {
+		die("error making bus subnode, %s\n", fdt_strerror(bus_off));
+		return bus_off;
+	}
+
+	/* Write PHB properties */
+	_FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci"));
+	_FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB"));
+	_FDT(fdt_setprop_cell(fdt, bus_off, "#address-cells", 0x3));
+	_FDT(fdt_setprop_cell(fdt, bus_off, "#size-cells", 0x2));
+	_FDT(fdt_setprop_cell(fdt, bus_off, "#interrupt-cells", 0x1));
+	_FDT(fdt_setprop(fdt, bus_off, "used-by-rtas", NULL, 0));
+	_FDT(fdt_setprop(fdt, bus_off, "bus-range", &bus_range, sizeof(bus_range)));
+	_FDT(fdt_setprop(fdt, bus_off, "ranges", &ranges, sizeof(ranges)));
+	_FDT(fdt_setprop(fdt, bus_off, "reg", &bus_reg, sizeof(bus_reg)));
+	_FDT(fdt_setprop(fdt, bus_off, "interrupt-map-mask",
+			 &interrupt_map_mask, sizeof(interrupt_map_mask)));
+
+	/* Populate PCI devices and allocate IRQs */
+	devices = 0;
+	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
+	while (dev_hdr) {
+		uint32_t *irqmap = interrupt_map[devices];
+		struct pci_device_header *hdr = dev_hdr->data;
+
+		if (!hdr) {
+			dev_hdr = device__next_dev(dev_hdr);
+			continue;
+		}
+
+		devid = dev_hdr->dev_num;
+		fn = 0; /* kvmtool doesn't yet do multifunction devices */
+
+		sprintf(nodename, "pci@%u,%u", devid, fn);
+
+		/* Allocate interrupt from the map */
+		if (devid > SPAPR_PCI_NUM_LSI) {
+			die("Unexpected behaviour in spapr_populate_pci_devices, "
+			    "wrong devid %u\n", devid);
+		}
+		irqmap[0] = cpu_to_be32(of_pci_b_ddddd(devid)|of_pci_b_fff(fn));
+		irqmap[1] = 0;
+		irqmap[2] = 0;
+		irqmap[3] = 0;
+		irqmap[4] = cpu_to_be32(xics_phandle);
+		/*
+		 * This is nasty; the PCI devs are set up such that their own
+		 * header's irq_line indicates the direct XICS IRQ number to
+		 * use.  There REALLY needs to be a hierarchical system in place
+		 * to 'raise' an IRQ on the bridge which indexes/looks up which
+		 * XICS IRQ to fire.
+		 */
+		irqmap[5] = cpu_to_be32(hdr->irq_line);
+		irqmap[6] = cpu_to_be32(0x8);
+
+		/* Add node to FDT */
+		node_off = fdt_add_subnode(fdt, bus_off, nodename);
+		if (node_off < 0) {
+			die("error making node subnode, %s\n", fdt_strerror(bus_off));
+			return node_off;
+		}
+
+		_FDT(fdt_setprop_cell(fdt, node_off, "vendor-id",
+				      le16_to_cpu(hdr->vendor_id)));
+		_FDT(fdt_setprop_cell(fdt, node_off, "device-id",
+				      le16_to_cpu(hdr->device_id)));
+		_FDT(fdt_setprop_cell(fdt, node_off, "revision-id",
+				      hdr->revision_id));
+		_FDT(fdt_setprop_cell(fdt, node_off, "class-code",
+				      hdr->class[0] | (hdr->class[1] << 8) | (hdr->class[2] << 16)));
+		_FDT(fdt_setprop_cell(fdt, node_off, "subsystem-id",
+				      le16_to_cpu(hdr->subsys_id)));
+		_FDT(fdt_setprop_cell(fdt, node_off, "subsystem-vendor-id",
+				      le16_to_cpu(hdr->subsys_vendor_id)));
+
+		/* Config space region comes first */
+		reg[0].hi = cpu_to_be32(
+			of_pci_b_n(0) |
+			of_pci_b_p(0) |
+			of_pci_b_t(0) |
+			of_pci_b_ss(OF_PCI_SS_CONFIG) |
+			of_pci_b_bbbbbbbb(0) |
+			of_pci_b_ddddd(devid) |
+			of_pci_b_fff(fn));
+		reg[0].mid = 0;
+		reg[0].lo = 0;
+
+		n = 0;
+		/* Six BARs, no ROM supported, addresses are 32bit */
+		for (i = 0; i < 6; ++i) {
+			if (0 == hdr->bar[i]) {
+				continue;
+			}
+
+			reg[n+1].hi = cpu_to_be32(
+				of_pci_b_n(0) |
+				of_pci_b_p(0) |
+				of_pci_b_t(0) |
+				of_pci_b_ss(bar_to_ss(le32_to_cpu(hdr->bar[i]))) |
+				of_pci_b_bbbbbbbb(0) |
+				of_pci_b_ddddd(devid) |
+				of_pci_b_fff(fn) |
+				of_pci_b_rrrrrrrr(bars[i]));
+			reg[n+1].mid = 0;
+			reg[n+1].lo = cpu_to_be64(hdr->bar_size[i]);
+
+			assigned_addresses[n].hi = cpu_to_be32(
+				of_pci_b_n(1) |
+				of_pci_b_p(0) |
+				of_pci_b_t(0) |
+				of_pci_b_ss(bar_to_ss(le32_to_cpu(hdr->bar[i]))) |
+				of_pci_b_bbbbbbbb(0) |
+				of_pci_b_ddddd(devid) |
+				of_pci_b_fff(fn) |
+				of_pci_b_rrrrrrrr(bars[i]));
+
+			/*
+			 * Writing zeroes to assigned_addresses causes the guest kernel to
+			 * reassign BARs
+			 */
+			assigned_addresses[n].mid = cpu_to_be64(bar_to_addr(le32_to_cpu(hdr->bar[i])));
+			assigned_addresses[n].lo = reg[n+1].lo;
+
+			++n;
+		}
+		_FDT(fdt_setprop(fdt, node_off, "reg", reg, sizeof(reg[0])*(n+1)));
+		_FDT(fdt_setprop(fdt, node_off, "assigned-addresses",
+				 assigned_addresses,
+				 sizeof(assigned_addresses[0])*(n)));
+		_FDT(fdt_setprop_cell(fdt, node_off, "interrupts",
+				      hdr->irq_pin));
+
+		/* We don't set ibm,dma-window property as we don't have an IOMMU. */
+
+		++devices;
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	/* Write interrupt map */
+	_FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
+			 devices * sizeof(interrupt_map[0])));
+
+	return 0;
+}
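
The reg and assigned-addresses cells built above follow the Open Firmware PCI
bus binding, whose phys.hi word packs the flags, space code, bus, device,
function and register fields as npt000ss bbbbbbbb dddddfff rrrrrrrr. A
hand-rolled encoder equivalent in spirit to the of_pci_b_*() helpers (whose
exact definitions live in kvm/of_pci.h and aren't shown in this patch):

#include <stdio.h>
#include <stdint.h>

static uint32_t of_pci_phys_hi(int n, int ss, int bus, int dev, int fn, int reg)
{
	return ((uint32_t)n << 31) |	/* n: absolute ("assigned") address */
	       ((uint32_t)ss << 24) |	/* ss: 0 config, 1 I/O, 2 mem32, 3 mem64 */
	       ((uint32_t)bus << 16) |
	       ((uint32_t)dev << 11) |
	       ((uint32_t)fn << 8) |
	       (uint32_t)reg;		/* config-space offset of the BAR */
}

int main(void)
{
	/* 32-bit memory BAR0 (offset 0x10) of device 3, function 0, bus 0 */
	printf("phys.hi = 0x%08x\n", of_pci_phys_hi(1, 2, 0, 3, 0, 0x10));
	return 0;
}
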
diff --git a/tools/kvm/powerpc/spapr_pci.h b/tools/kvm/powerpc/spapr_pci.h
new file mode 100644
index 0000000..f659eda
--- /dev/null
+++ b/tools/kvm/powerpc/spapr_pci.h
@@ -0,0 +1,57 @@
+/*
+ * SPAPR PHB definitions
+ *
+ * Modifications by Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef SPAPR_PCI_H
+#define SPAPR_PCI_H
+
+#include "kvm/kvm.h"
+#include "spapr.h"
+#include <inttypes.h>
+
+/* With XICS, we can easily accommodate 1 IRQ per PCI device. */
+
+#define SPAPR_PCI_NUM_LSI 256
+
+struct spapr_phb {
+	uint64_t buid;
+	uint64_t mem_addr;
+	uint64_t mem_size;
+	uint64_t io_addr;
+	uint64_t io_size;
+};
+
+void spapr_create_phb(struct kvm *kvm,
+                      const char *busname, uint64_t buid,
+                      uint64_t mem_win_addr, uint64_t mem_win_size,
+                      uint64_t io_win_addr, uint64_t io_win_size);
+
+int spapr_populate_pci_devices(struct kvm *kvm,
+                               uint32_t xics_phandle,
+                               void *fdt);
+
+static inline bool spapr_phb_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+	if ((phys_addr >= SPAPR_PCI_IO_WIN_ADDR) &&
+	    (phys_addr < SPAPR_PCI_IO_WIN_ADDR +
+	     SPAPR_PCI_IO_WIN_SIZE)) {
+		return kvm__emulate_io(vcpu, phys_addr - SPAPR_PCI_IO_WIN_ADDR,
+				       data, is_write ? KVM_EXIT_IO_OUT :
+				       KVM_EXIT_IO_IN,
+				       len, 1);
+	} else if ((phys_addr >= SPAPR_PCI_MEM_WIN_ADDR) &&
+		   (phys_addr < SPAPR_PCI_MEM_WIN_ADDR +
+		    SPAPR_PCI_MEM_WIN_SIZE)) {
+		return kvm__emulate_mmio(vcpu, phys_addr - SPAPR_PCI_MEM_WIN_ADDR,
+					 data, len, is_write);
+	}
+	return false;
+}
+
+#endif
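spapr_phb_mmio() is intended to sit on the generic MMIO exit path and
claim accesses that fall inside either PHB window. A sketch of a caller,
assuming a dispatch function of this shape exists (the function name is
illustrative):

    /* Returns true if the access was handled. */
    static bool example_handle_mmio(struct kvm_cpu *vcpu, u64 addr,
                                    u8 *data, u32 len, u8 is_write)
    {
            /* PHB windows first; anything else is ordinary MMIO. */
            if (spapr_phb_mmio(vcpu, addr, data, len, is_write))
                    return true;
            return kvm__emulate_mmio(vcpu, addr, data, len, is_write);
    }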
diff --git a/tools/kvm/powerpc/spapr_rtas.c b/tools/kvm/powerpc/spapr_rtas.c
new file mode 100644
index 0000000..43e5310
--- /dev/null
+++ b/tools/kvm/powerpc/spapr_rtas.c
@@ -0,0 +1,246 @@
+/*
+ * SPAPR base RTAS calls
+ *
+ * Borrowed heavily from QEMU's spapr_rtas.c
+ * Copyright (c) 2010-2011 David Gibson, IBM Corporation.
+ *
+ * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "libfdt.h"
+
+#include "spapr.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+#define TOKEN_BASE      0x2000
+#define TOKEN_MAX       0x100
+
+#define RTAS_CONSOLE
+
+static struct rtas_call {
+	const char *name;
+	spapr_rtas_fn fn;
+} rtas_table[TOKEN_MAX];
+
+static struct rtas_call *rtas_next = rtas_table;
+
+
+static void rtas_display_character(struct kvm_cpu *vcpu,
+                                   uint32_t token, uint32_t nargs,
+                                   target_ulong args,
+                                   uint32_t nret, target_ulong rets)
+{
+	char c = rtas_ld(vcpu->kvm, args, 0);
+	term_putc(&c, 1, 0);
+	rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+#ifdef RTAS_CONSOLE
+static void rtas_put_term_char(struct kvm_cpu *vcpu,
+			       uint32_t token, uint32_t nargs,
+			       target_ulong args,
+			       uint32_t nret, target_ulong rets)
+{
+	char c = rtas_ld(vcpu->kvm, args, 0);
+
+	term_putc(&c, 1, 0);
+
+	rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+static void rtas_get_term_char(struct kvm_cpu *vcpu,
+			       uint32_t token, uint32_t nargs,
+			       target_ulong args,
+			       uint32_t nret, target_ulong rets)
+{
+	int c;
+
+	if (vcpu->kvm->cfg.active_console == CONSOLE_HV && term_readable(0) &&
+	    (c = term_getc(vcpu->kvm, 0)) >= 0) {
+		rtas_st(vcpu->kvm, rets, 0, 0);
+		rtas_st(vcpu->kvm, rets, 1, c);
+	} else {
+		rtas_st(vcpu->kvm, rets, 0, -2);
+	}
+}
+#endif
+
+static void rtas_get_time_of_day(struct kvm_cpu *vcpu,
+                                 uint32_t token, uint32_t nargs,
+                                 target_ulong args,
+                                 uint32_t nret, target_ulong rets)
+{
+	struct tm tm;
+	time_t tnow;
+
+	if (nret != 8) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	tnow = time(NULL);
+	/* Guest time is currently not offset in any way. */
+	gmtime_r(&tnow, &tm);
+
+	rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+	rtas_st(vcpu->kvm, rets, 1, tm.tm_year + 1900);
+	rtas_st(vcpu->kvm, rets, 2, tm.tm_mon + 1);
+	rtas_st(vcpu->kvm, rets, 3, tm.tm_mday);
+	rtas_st(vcpu->kvm, rets, 4, tm.tm_hour);
+	rtas_st(vcpu->kvm, rets, 5, tm.tm_min);
+	rtas_st(vcpu->kvm, rets, 6, tm.tm_sec);
+	rtas_st(vcpu->kvm, rets, 7, 0);
+}
+
+static void rtas_set_time_of_day(struct kvm_cpu *vcpu,
+                                 uint32_t token, uint32_t nargs,
+                                 target_ulong args,
+                                 uint32_t nret, target_ulong rets)
+{
+	pr_warning("%s called; TOD set ignored.\n", __FUNCTION__);
+}
+
+static void rtas_power_off(struct kvm_cpu *vcpu,
+                           uint32_t token, uint32_t nargs, target_ulong args,
+                           uint32_t nret, target_ulong rets)
+{
+	if (nargs != 2 || nret != 1) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+	kvm_cpu__reboot(vcpu->kvm);
+}
+
+static void rtas_system_reboot(struct kvm_cpu *vcpu,
+                           uint32_t token, uint32_t nargs, target_ulong args,
+                           uint32_t nret, target_ulong rets)
+{
+	if (nargs != 0 || nret != 1) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	/* NB this actually halts the VM */
+	kvm_cpu__reboot(vcpu->kvm);
+}
+
+static void rtas_query_cpu_stopped_state(struct kvm_cpu *vcpu,
+                                         uint32_t token, uint32_t nargs,
+                                         target_ulong args,
+                                         uint32_t nret, target_ulong rets)
+{
+	if (nargs != 1 || nret != 2) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	/*
+	 * Can read id = rtas_ld(vcpu->kvm, args, 0), but
+	 * we currently start all CPUs.  So just return true.
+	 */
+	rtas_st(vcpu->kvm, rets, 0, 0);
+	rtas_st(vcpu->kvm, rets, 1, 2);
+}
+
+static void rtas_start_cpu(struct kvm_cpu *vcpu,
+                           uint32_t token, uint32_t nargs,
+                           target_ulong args,
+                           uint32_t nret, target_ulong rets)
+{
+	die(__FUNCTION__);
+}
+
+target_ulong spapr_rtas_call(struct kvm_cpu *vcpu,
+                             uint32_t token, uint32_t nargs, target_ulong args,
+                             uint32_t nret, target_ulong rets)
+{
+	if ((token >= TOKEN_BASE)
+	    && ((token - TOKEN_BASE) < TOKEN_MAX)) {
+		struct rtas_call *call = rtas_table + (token - TOKEN_BASE);
+
+		if (call->fn) {
+			call->fn(vcpu, token, nargs, args, nret, rets);
+			return H_SUCCESS;
+		}
+	}
+
+	/*
+	 * HACK: Some Linux early debug code uses RTAS display-character,
+	 * but assumes the token value is 0xa (which it is on some real
+	 * machines) without looking it up in the device tree.  This
+	 * special case makes this work
+	 * special case makes this work.
+	if (token == 0xa) {
+		rtas_display_character(vcpu, 0xa, nargs, args, nret, rets);
+		return H_SUCCESS;
+	}
+
+	hcall_dprintf("Unknown RTAS token 0x%x\n", token);
+	rtas_st(vcpu->kvm, rets, 0, -3);
+	return H_PARAMETER;
+}
+
+void spapr_rtas_register(const char *name, spapr_rtas_fn fn)
+{
+	assert(rtas_next < (rtas_table + TOKEN_MAX));
+
+	rtas_next->name = name;
+	rtas_next->fn = fn;
+
+	rtas_next++;
+}
+
+/*
+ * This is called from the context of an open /rtas node, in order to add
+ * properties for the rtas call tokens.
+ */
+int spapr_rtas_fdt_setup(struct kvm *kvm, void *fdt)
+{
+	int ret;
+	int i;
+
+	for (i = 0; i < TOKEN_MAX; i++) {
+		struct rtas_call *call = &rtas_table[i];
+
+		if (!call->fn) {
+			continue;
+		}
+
+		ret = fdt_property_cell(fdt, call->name, i + TOKEN_BASE);
+
+		if (ret < 0) {
+			pr_warning("Couldn't add rtas token for %s: %s\n",
+				   call->name, fdt_strerror(ret));
+			return ret;
+		}
+
+	}
+	return 0;
+}
+
+void register_core_rtas(void)
+{
+	spapr_rtas_register("display-character", rtas_display_character);
+	spapr_rtas_register("get-time-of-day", rtas_get_time_of_day);
+	spapr_rtas_register("set-time-of-day", rtas_set_time_of_day);
+	spapr_rtas_register("power-off", rtas_power_off);
+	spapr_rtas_register("system-reboot", rtas_system_reboot);
+	spapr_rtas_register("query-cpu-stopped-state",
+			    rtas_query_cpu_stopped_state);
+	spapr_rtas_register("start-cpu", rtas_start_cpu);
+#ifdef RTAS_CONSOLE
+	/* These are unused: We do console I/O via hcalls, not rtas. */
+	spapr_rtas_register("put-term-char", rtas_put_term_char);
+	spapr_rtas_register("get-term-char", rtas_get_term_char);
+#endif
+}
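Additional RTAS services follow the same pattern: register a handler and
spapr_rtas_fdt_setup() exports its token when the /rtas node is built. A
sketch with a hypothetical service name and handler:

    /* Hypothetical call: no arguments, returns status plus one word. */
    static void rtas_example(struct kvm_cpu *vcpu, uint32_t token,
                             uint32_t nargs, target_ulong args,
                             uint32_t nret, target_ulong rets)
    {
            if (nargs != 0 || nret != 2) {
                    rtas_st(vcpu->kvm, rets, 0, -3); /* parameter error */
                    return;
            }
            rtas_st(vcpu->kvm, rets, 0, 0);  /* status: success */
            rtas_st(vcpu->kvm, rets, 1, 42); /* result word */
    }

    /* Registered once at startup, alongside register_core_rtas():
     *   spapr_rtas_register("example-call", rtas_example);
     */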
diff --git a/tools/kvm/powerpc/xics.c b/tools/kvm/powerpc/xics.c
new file mode 100644
index 0000000..071b928
--- /dev/null
+++ b/tools/kvm/powerpc/xics.c
@@ -0,0 +1,495 @@
+/*
+ * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics
+ *
+ * Borrowed heavily from QEMU's xics.c,
+ * Copyright (c) 2010,2011 David Gibson, IBM Corporation.
+ *
+ * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "spapr.h"
+#include "xics.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <stdio.h>
+#include <malloc.h>
+
+#define XICS_NUM_IRQS	1024
+
+
+/* #define DEBUG_XICS yes */
+#ifdef DEBUG_XICS
+#define xics_dprintf(fmt, ...)					\
+	do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define xics_dprintf(fmt, ...)			\
+	do { } while (0)
+#endif
+
+/*
+ * ICP: Presentation layer
+ */
+
+struct icp_server_state {
+	uint32_t xirr;
+	uint8_t pending_priority;
+	uint8_t mfrr;
+	struct kvm_cpu *cpu;
+};
+
+#define XICS_IRQ_OFFSET KVM_IRQ_OFFSET
+#define XISR_MASK	0x00ffffff
+#define CPPR_MASK	0xff000000
+
+#define XISR(ss)   (((ss)->xirr) & XISR_MASK)
+#define CPPR(ss)   (((ss)->xirr) >> 24)
+
+struct ics_state;
+
+struct icp_state {
+	unsigned long nr_servers;
+	struct icp_server_state *ss;
+	struct ics_state *ics;
+};
+
+static void ics_reject(struct ics_state *ics, int nr);
+static void ics_resend(struct ics_state *ics);
+static void ics_eoi(struct ics_state *ics, int nr);
+
+static inline void cpu_irq_raise(struct kvm_cpu *vcpu)
+{
+	xics_dprintf("INT1[%p]\n", vcpu);
+	kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 1);
+}
+
+static inline void cpu_irq_lower(struct kvm_cpu *vcpu)
+{
+	xics_dprintf("INT0[%p]\n", vcpu);
+	kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 0);
+}
+
+static void icp_check_ipi(struct icp_state *icp, int server)
+{
+	struct icp_server_state *ss = icp->ss + server;
+
+	if (XISR(ss) && (ss->pending_priority <= ss->mfrr)) {
+		return;
+	}
+
+	if (XISR(ss)) {
+		ics_reject(icp->ics, XISR(ss));
+	}
+
+	ss->xirr = (ss->xirr & ~XISR_MASK) | XICS_IPI;
+	ss->pending_priority = ss->mfrr;
+	cpu_irq_raise(ss->cpu);
+}
+
+static void icp_resend(struct icp_state *icp, int server)
+{
+	struct icp_server_state *ss = icp->ss + server;
+
+	if (ss->mfrr < CPPR(ss)) {
+		icp_check_ipi(icp, server);
+	}
+	ics_resend(icp->ics);
+}
+
+static void icp_set_cppr(struct icp_state *icp, int server, uint8_t cppr)
+{
+	struct icp_server_state *ss = icp->ss + server;
+	uint8_t old_cppr;
+	uint32_t old_xisr;
+
+	old_cppr = CPPR(ss);
+	ss->xirr = (ss->xirr & ~CPPR_MASK) | (cppr << 24);
+
+	if (cppr < old_cppr) {
+		if (XISR(ss) && (cppr <= ss->pending_priority)) {
+			old_xisr = XISR(ss);
+			ss->xirr &= ~XISR_MASK; /* Clear XISR */
+			cpu_irq_lower(ss->cpu);
+			ics_reject(icp->ics, old_xisr);
+		}
+	} else {
+		if (!XISR(ss)) {
+			icp_resend(icp, server);
+		}
+	}
+}
+
+static void icp_set_mfrr(struct icp_state *icp, int nr, uint8_t mfrr)
+{
+	struct icp_server_state *ss = icp->ss + nr;
+
+	ss->mfrr = mfrr;
+	if (mfrr < CPPR(ss)) {
+		icp_check_ipi(icp, nr);
+	}
+}
+
+static uint32_t icp_accept(struct icp_server_state *ss)
+{
+	uint32_t xirr;
+
+	cpu_irq_lower(ss->cpu);
+	xirr = ss->xirr;
+	ss->xirr = ss->pending_priority << 24;
+	return xirr;
+}
+
+static void icp_eoi(struct icp_state *icp, int server, uint32_t xirr)
+{
+	struct icp_server_state *ss = icp->ss + server;
+
+	ics_eoi(icp->ics, xirr & XISR_MASK);
+	/* Send EOI -> ICS */
+	ss->xirr = (ss->xirr & ~CPPR_MASK) | (xirr & CPPR_MASK);
+	if (!XISR(ss)) {
+		icp_resend(icp, server);
+	}
+}
+
+static void icp_irq(struct icp_state *icp, int server, int nr, uint8_t priority)
+{
+	struct icp_server_state *ss = icp->ss + server;
+	xics_dprintf("icp_irq(nr %d, server %d, prio 0x%x)\n", nr, server, priority);
+	if ((priority >= CPPR(ss))
+	    || (XISR(ss) && (ss->pending_priority <= priority))) {
+		xics_dprintf("reject %d, CPPR 0x%x, XISR 0x%x, pprio 0x%x, prio 0x%x\n",
+			     nr, CPPR(ss), XISR(ss), ss->pending_priority, priority);
+		ics_reject(icp->ics, nr);
+	} else {
+		if (XISR(ss)) {
+			xics_dprintf("reject %d, CPPR 0x%x, XISR 0x%x, pprio 0x%x, prio 0x%x\n",
+				     nr, CPPR(ss), XISR(ss), ss->pending_priority, priority);
+			ics_reject(icp->ics, XISR(ss));
+		}
+		ss->xirr = (ss->xirr & ~XISR_MASK) | (nr & XISR_MASK);
+		ss->pending_priority = priority;
+		cpu_irq_raise(ss->cpu);
+	}
+}
+
+/*
+ * ICS: Source layer
+ */
+
+struct ics_irq_state {
+	int server;
+	uint8_t priority;
+	uint8_t saved_priority;
+	unsigned int rejected:1;
+	unsigned int masked_pending:1;
+};
+
+struct ics_state {
+	unsigned int nr_irqs;
+	unsigned int offset;
+	struct ics_irq_state *irqs;
+	struct icp_state *icp;
+};
+
+static int ics_valid_irq(struct ics_state *ics, uint32_t nr)
+{
+	return (nr >= ics->offset)
+		&& (nr < (ics->offset + ics->nr_irqs));
+}
+
+static void ics_set_irq_msi(struct ics_state *ics, int srcno, int val)
+{
+	struct ics_irq_state *irq = ics->irqs + srcno;
+
+	if (val) {
+		if (irq->priority == 0xff) {
+			xics_dprintf(" irq pri ff, masked pending\n");
+			irq->masked_pending = 1;
+		} else	{
+			icp_irq(ics->icp, irq->server, srcno + ics->offset, irq->priority);
+		}
+	}
+}
+
+static void ics_reject_msi(struct ics_state *ics, int nr)
+{
+	struct ics_irq_state *irq = ics->irqs + nr - ics->offset;
+
+	irq->rejected = 1;
+}
+
+static void ics_resend_msi(struct ics_state *ics)
+{
+	unsigned int i;
+
+	for (i = 0; i < ics->nr_irqs; i++) {
+		struct ics_irq_state *irq = ics->irqs + i;
+
+		/* FIXME: filter by server#? */
+		if (irq->rejected) {
+			irq->rejected = 0;
+			if (irq->priority != 0xff) {
+				icp_irq(ics->icp, irq->server, i + ics->offset, irq->priority);
+			}
+		}
+	}
+}
+
+static void ics_write_xive_msi(struct ics_state *ics, int nr, int server,
+			       uint8_t priority)
+{
+	struct ics_irq_state *irq = ics->irqs + nr - ics->offset;
+
+	irq->server = server;
+	irq->priority = priority;
+	xics_dprintf("ics_write_xive_msi(nr %d, server %d, pri 0x%x)\n", nr, server, priority);
+
+	if (!irq->masked_pending || (priority == 0xff)) {
+		return;
+	}
+
+	irq->masked_pending = 0;
+	icp_irq(ics->icp, server, nr, priority);
+}
+
+static void ics_reject(struct ics_state *ics, int nr)
+{
+	ics_reject_msi(ics, nr);
+}
+
+static void ics_resend(struct ics_state *ics)
+{
+	ics_resend_msi(ics);
+}
+
+static void ics_eoi(struct ics_state *ics, int nr)
+{
+}
+
+/*
+ * Exported functions
+ */
+
+static target_ulong h_cppr(struct kvm_cpu *vcpu,
+			   target_ulong opcode, target_ulong *args)
+{
+	target_ulong cppr = args[0];
+
+	xics_dprintf("h_cppr(%lx)\n", cppr);
+	icp_set_cppr(vcpu->kvm->arch.icp, vcpu->cpu_id, cppr);
+	return H_SUCCESS;
+}
+
+static target_ulong h_ipi(struct kvm_cpu *vcpu,
+			  target_ulong opcode, target_ulong *args)
+{
+	target_ulong server = args[0];
+	target_ulong mfrr = args[1];
+
+	xics_dprintf("h_ipi(%lx, %lx)\n", server, mfrr);
+	if (server >= vcpu->kvm->arch.icp->nr_servers) {
+		return H_PARAMETER;
+	}
+
+	icp_set_mfrr(vcpu->kvm->arch.icp, server, mfrr);
+	return H_SUCCESS;
+}
+
+static target_ulong h_xirr(struct kvm_cpu *vcpu,
+			   target_ulong opcode, target_ulong *args)
+{
+	uint32_t xirr = icp_accept(vcpu->kvm->arch.icp->ss + vcpu->cpu_id);
+
+	xics_dprintf("h_xirr() = %x\n", xirr);
+	args[0] = xirr;
+	return H_SUCCESS;
+}
+
+static target_ulong h_eoi(struct kvm_cpu *vcpu,
+			  target_ulong opcode, target_ulong *args)
+{
+	target_ulong xirr = args[0];
+
+	xics_dprintf("h_eoi(%lx)\n", xirr);
+	icp_eoi(vcpu->kvm->arch.icp, vcpu->cpu_id, xirr);
+	return H_SUCCESS;
+}
+
+static void rtas_set_xive(struct kvm_cpu *vcpu, uint32_t token,
+			  uint32_t nargs, target_ulong args,
+			  uint32_t nret, target_ulong rets)
+{
+	struct ics_state *ics = vcpu->kvm->arch.icp->ics;
+	uint32_t nr, server, priority;
+
+	if ((nargs != 3) || (nret != 1)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	nr = rtas_ld(vcpu->kvm, args, 0);
+	server = rtas_ld(vcpu->kvm, args, 1);
+	priority = rtas_ld(vcpu->kvm, args, 2);
+
+	xics_dprintf("rtas_set_xive(%x,%x,%x)\n", nr, server, priority);
+	if (!ics_valid_irq(ics, nr) || (server >= ics->icp->nr_servers)
+	    || (priority > 0xff)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	ics_write_xive_msi(ics, nr, server, priority);
+
+	rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+}
+
+static void rtas_get_xive(struct kvm_cpu *vcpu, uint32_t token,
+			  uint32_t nargs, target_ulong args,
+			  uint32_t nret, target_ulong rets)
+{
+	struct ics_state *ics = vcpu->kvm->arch.icp->ics;
+	uint32_t nr;
+
+	if ((nargs != 1) || (nret != 3)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	nr = rtas_ld(vcpu->kvm, args, 0);
+
+	if (!ics_valid_irq(ics, nr)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+	rtas_st(vcpu->kvm, rets, 1, ics->irqs[nr - ics->offset].server);
+	rtas_st(vcpu->kvm, rets, 2, ics->irqs[nr - ics->offset].priority);
+}
+
+static void rtas_int_off(struct kvm_cpu *vcpu, uint32_t token,
+			 uint32_t nargs, target_ulong args,
+			 uint32_t nret, target_ulong rets)
+{
+	struct ics_state *ics = vcpu->kvm->arch.icp->ics;
+	uint32_t nr;
+
+	if ((nargs != 1) || (nret != 1)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	nr = rtas_ld(vcpu->kvm, args, 0);
+
+	if (!ics_valid_irq(ics, nr)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	/* ME: QEMU wrote xive_msi here, in #if 0.  Deleted. */
+
+	rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+}
+
+static void rtas_int_on(struct kvm_cpu *vcpu, uint32_t token,
+			uint32_t nargs, target_ulong args,
+			uint32_t nret, target_ulong rets)
+{
+	struct ics_state *ics = vcpu->kvm->arch.icp->ics;
+	uint32_t nr;
+
+	if ((nargs != 1) || (nret != 1)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	nr = rtas_ld(vcpu->kvm, args, 0);
+
+	if (!ics_valid_irq(ics, nr)) {
+		rtas_st(vcpu->kvm, rets, 0, -3);
+		return;
+	}
+
+	/* ME: QEMU wrote xive_msi here, in #if 0.  Deleted. */
+
+	rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+}
+
+static int xics_init(struct kvm *kvm)
+{
+	unsigned int i;
+	struct icp_state *icp;
+	struct ics_state *ics;
+	int j;
+
+	icp = malloc(sizeof(*icp));
+	icp->nr_servers = kvm->nrcpus;
+	icp->ss = malloc(icp->nr_servers * sizeof(struct icp_server_state));
+
+	for (i = 0; i < icp->nr_servers; i++) {
+		icp->ss[i].xirr = 0;
+		icp->ss[i].pending_priority = 0;
+		icp->ss[i].cpu = 0;
+		icp->ss[i].mfrr = 0xff;
+	}
+
+	/*
+	 * icp->ss[cpu_id].cpu is filled in below, once each vcpu has been
+	 * matched to its interrupt server.
+	 */
+
+	ics = malloc(sizeof(*ics));
+	ics->nr_irqs = XICS_NUM_IRQS;
+	ics->offset = XICS_IRQ_OFFSET;
+	ics->irqs = malloc(ics->nr_irqs * sizeof(struct ics_irq_state));
+
+	icp->ics = ics;
+	ics->icp = icp;
+
+	for (i = 0; i < ics->nr_irqs; i++) {
+		ics->irqs[i].server = 0;
+		ics->irqs[i].priority = 0xff;
+		ics->irqs[i].saved_priority = 0xff;
+		ics->irqs[i].rejected = 0;
+		ics->irqs[i].masked_pending = 0;
+	}
+
+	spapr_register_hypercall(H_CPPR, h_cppr);
+	spapr_register_hypercall(H_IPI, h_ipi);
+	spapr_register_hypercall(H_XIRR, h_xirr);
+	spapr_register_hypercall(H_EOI, h_eoi);
+
+	spapr_rtas_register("ibm,set-xive", rtas_set_xive);
+	spapr_rtas_register("ibm,get-xive", rtas_get_xive);
+	spapr_rtas_register("ibm,int-off", rtas_int_off);
+	spapr_rtas_register("ibm,int-on", rtas_int_on);
+
+	for (j = 0; j < kvm->nrcpus; j++) {
+		struct kvm_cpu *vcpu = kvm->cpus[j];
+
+		if (vcpu->cpu_id >= icp->nr_servers)
+			die("Invalid server number for cpuid %ld\n", vcpu->cpu_id);
+
+		icp->ss[vcpu->cpu_id].cpu = vcpu;
+	}
+
+	kvm->arch.icp = icp;
+
+	return 0;
+}
+dev_base_init(xics_init);
+
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+	/*
+	 * Route event to ICS, which routes to ICP, which eventually does a
+	 * kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 1)
+	 */
+	xics_dprintf("Raising IRQ %d -> %d\n", irq, level);
+	ics_set_irq_msi(kvm->arch.icp->ics, irq - kvm->arch.icp->ics->offset, level);
+}
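Putting the two layers together: a device model asserts a source number
at or above XICS_IRQ_OFFSET and the event is routed ICS -> ICP -> vcpu.
A minimal sketch of an MSI-style edge pulse (the function below is
illustrative, not part of this patch):

    static void example_pulse_irq(struct kvm *kvm, int irq)
    {
            kvm__irq_line(kvm, irq, 1);     /* assert: queued or delivered */
            kvm__irq_line(kvm, irq, 0);     /* deassert: ignored for MSI */
    }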
diff --git a/tools/kvm/powerpc/xics.h b/tools/kvm/powerpc/xics.h
new file mode 100644
index 0000000..d5bc6f9
--- /dev/null
+++ b/tools/kvm/powerpc/xics.h
@@ -0,0 +1,18 @@
+/*
+ * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef XICS_H
+#define XICS_H
+
+#define XICS_IPI        0x2
+
+int xics_alloc_irqnum(void);
+
+#endif
diff --git a/tools/kvm/symbol.c b/tools/kvm/symbol.c
new file mode 100644
index 0000000..07dd9d5
--- /dev/null
+++ b/tools/kvm/symbol.c
@@ -0,0 +1,133 @@
+#include "kvm/symbol.h"
+
+#include "kvm/kvm.h"
+
+#include <linux/err.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <bfd.h>
+
+static bfd *abfd;
+
+int symbol_init(struct kvm *kvm)
+{
+	int ret = 0;
+
+	if (!kvm->vmlinux)
+		return 0;
+
+	bfd_init();
+
+	abfd = bfd_openr(kvm->vmlinux, NULL);
+	if (abfd == NULL) {
+		bfd_error_type err = bfd_get_error();
+
+		switch (err) {
+		case bfd_error_no_memory:
+			ret = -ENOMEM;
+			break;
+		case bfd_error_invalid_target:
+			ret = -EINVAL;
+			break;
+		default:
+			ret = -EFAULT;
+			break;
+		}
+	}
+
+	return ret;
+}
+late_init(symbol_init);
+
+static asymbol *lookup(asymbol **symbols, int nr_symbols, const char *symbol_name)
+{
+	int i, ret;
+
+	ret = -ENOENT;
+
+	for (i = 0; i < nr_symbols; i++) {
+		asymbol *symbol = symbols[i];
+
+		if (!strcmp(bfd_asymbol_name(symbol), symbol_name))
+			return symbol;
+	}
+
+	return ERR_PTR(ret);
+}
+
+char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size)
+{
+	const char *filename;
+	bfd_vma sym_offset;
+	bfd_vma sym_start;
+	asection *section;
+	unsigned int line;
+	const char *func;
+	long symtab_size;
+	asymbol *symbol;
+	asymbol **syms;
+	int nr_syms, ret;
+
+	ret = -ENOENT;
+	if (!abfd)
+		goto not_found;
+
+	if (!bfd_check_format(abfd, bfd_object))
+		goto not_found;
+
+	symtab_size = bfd_get_symtab_upper_bound(abfd);
+	if (symtab_size <= 0)
+		goto not_found;
+
+	ret = -ENOMEM;
+	syms = malloc(symtab_size);
+	if (!syms)
+		goto not_found;
+
+	nr_syms = bfd_canonicalize_symtab(abfd, syms);
+
+	ret = -ENOENT;
+	section = bfd_get_section_by_name(abfd, ".debug_aranges");
+	if (!section)
+		goto not_found;
+
+	if (!bfd_find_nearest_line(abfd, section, NULL, addr, &filename, &func, &line))
+		goto not_found;
+
+	if (!func)
+		goto not_found;
+
+	symbol = lookup(syms, nr_syms, func);
+	if (IS_ERR(symbol))
+		goto not_found;
+
+	sym_start = bfd_asymbol_value(symbol);
+
+	sym_offset = addr - sym_start;
+
+	snprintf(sym, size, "%s+%llx (%s:%i)", func, (long long) sym_offset, filename, line);
+
+	sym[size - 1] = '\0';
+
+	free(syms);
+
+	return sym;
+
+not_found:
+	return ERR_PTR(ret);
+}
+
+int symbol_exit(struct kvm *kvm)
+{
+	bfd_boolean ret = TRUE;
+
+	if (abfd)
+		ret = bfd_close(abfd);
+
+	if (ret == TRUE)
+		return 0;
+
+	return -EFAULT;
+}
+late_exit(symbol_exit);
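A typical consumer resolves a guest address into "symbol+offset
(file:line)" for diagnostics. A sketch, with an illustrative caller:

    /* Print a guest instruction pointer symbolically if possible. */
    static void example_print_ip(struct kvm *kvm, unsigned long ip)
    {
            char sym[256];
            char *s = symbol_lookup(kvm, ip, sym, sizeof(sym));

            if (IS_ERR(s))
                    printf("ip: [<%08lx>] <unknown>\n", ip);
            else
                    printf("ip: [<%08lx>] %s\n", ip, s);
    }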
diff --git a/tools/kvm/term.c b/tools/kvm/term.c
new file mode 100644
index 0000000..1b8131a
--- /dev/null
+++ b/tools/kvm/term.c
@@ -0,0 +1,208 @@
+#include <poll.h>
+#include <stdbool.h>
+#include <termios.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <signal.h>
+#include <pty.h>
+#include <utmp.h>
+
+#include "kvm/read-write.h"
+#include "kvm/term.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+#define TERM_FD_IN      0
+#define TERM_FD_OUT     1
+
+static struct termios	orig_term;
+
+int term_escape_char	= 0x01; /* ctrl-a is used for escape */
+bool term_got_escape	= false;
+
+int term_fds[TERM_MAX_DEVS][2];
+
+static pthread_t term_poll_thread;
+
+int term_getc(struct kvm *kvm, int term)
+{
+	unsigned char c;
+
+	if (read_in_full(term_fds[term][TERM_FD_IN], &c, 1) < 0)
+		return -1;
+
+	if (term_got_escape) {
+		term_got_escape = false;
+		if (c == 'x')
+			kvm_cpu__reboot(kvm);
+		if (c == term_escape_char)
+			return c;
+	}
+
+	if (c == term_escape_char) {
+		term_got_escape = true;
+		return -1;
+	}
+
+	return c;
+}
+
+int term_putc(char *addr, int cnt, int term)
+{
+	int ret;
+	int num_remaining = cnt;
+
+	while (num_remaining) {
+		ret = write(term_fds[term][TERM_FD_OUT], addr, num_remaining);
+		if (ret < 0)
+			return cnt - num_remaining;
+		num_remaining -= ret;
+		addr += ret;
+	}
+
+	return cnt;
+}
+
+int term_getc_iov(struct kvm *kvm, struct iovec *iov, int iovcnt, int term)
+{
+	int c;
+
+	c = term_getc(kvm, term);
+
+	if (c < 0)
+		return 0;
+
+	*((char *)iov[TERM_FD_IN].iov_base)	= (char)c;
+
+	return sizeof(char);
+}
+
+int term_putc_iov(struct iovec *iov, int iovcnt, int term)
+{
+	return writev(term_fds[term][TERM_FD_OUT], iov, iovcnt);
+}
+
+bool term_readable(int term)
+{
+	struct pollfd pollfd = (struct pollfd) {
+		.fd	= term_fds[term][TERM_FD_IN],
+		.events	= POLLIN,
+		.revents = 0,
+	};
+	int err;
+
+	err = poll(&pollfd, 1, 0);
+	return (err > 0 && (pollfd.revents & POLLIN));
+}
+
+static void *term_poll_thread_loop(void *param)
+{
+	struct pollfd fds[TERM_MAX_DEVS];
+	struct kvm *kvm = (struct kvm *) param;
+
+	int i;
+
+	for (i = 0; i < TERM_MAX_DEVS; i++) {
+		fds[i].fd = term_fds[i][TERM_FD_IN];
+		fds[i].events = POLLIN;
+		fds[i].revents = 0;
+	}
+
+	while (1) {
+		/* Poll with infinite timeout */
+		if (poll(fds, TERM_MAX_DEVS, -1) < 1)
+			break;
+		kvm__arch_read_term(kvm);
+	}
+
+	die("term_poll_thread_loop: error polling device fds %d\n", errno);
+	return NULL;
+}
+
+static void term_cleanup(void)
+{
+	int i;
+
+	for (i = 0; i < TERM_MAX_DEVS; i++)
+		tcsetattr(term_fds[i][TERM_FD_IN], TCSANOW, &orig_term);
+}
+
+static void term_sig_cleanup(int sig)
+{
+	term_cleanup();
+	signal(sig, SIG_DFL);
+	raise(sig);
+}
+
+void term_set_tty(int term)
+{
+	struct termios tios;
+	int master, slave;
+	char new_pty[PATH_MAX];
+
+	if (tcgetattr(STDIN_FILENO, &tios) < 0)
+		die("unable to save initial standard input settings");
+
+	tios.c_lflag &= ~(ICANON | ECHO | ISIG);
+
+	if (openpty(&master, &slave, new_pty, &tios, NULL) < 0)
+		return;
+
+	close(slave);
+
+	pr_info("Assigned terminal %d to pty %s\n", term, new_pty);
+
+	term_fds[term][TERM_FD_IN] = term_fds[term][TERM_FD_OUT] = master;
+}
+
+int tty_parser(const struct option *opt, const char *arg, int unset)
+{
+	int tty = atoi(arg);
+
+	term_set_tty(tty);
+
+	return 0;
+}
+
+int term_init(struct kvm *kvm)
+{
+	struct termios term;
+	int i, r;
+
+	for (i = 0; i < TERM_MAX_DEVS; i++)
+		if (term_fds[i][TERM_FD_IN] == 0) {
+			term_fds[i][TERM_FD_IN] = STDIN_FILENO;
+			term_fds[i][TERM_FD_OUT] = STDOUT_FILENO;
+		}
+
+	if (!isatty(STDIN_FILENO) || !isatty(STDOUT_FILENO))
+		return 0;
+
+	r = tcgetattr(STDIN_FILENO, &orig_term);
+	if (r < 0) {
+		pr_warning("unable to save initial standard input settings");
+		return r;
+	}
+
+	term = orig_term;
+	term.c_lflag &= ~(ICANON | ECHO | ISIG);
+	tcsetattr(STDIN_FILENO, TCSANOW, &term);
+
+	/* Use our own blocking thread to read stdin, don't require a tick */
+	if (pthread_create(&term_poll_thread, NULL, term_poll_thread_loop, kvm))
+		die("Unable to create console input poll thread\n");
+
+	signal(SIGTERM, term_sig_cleanup);
+	atexit(term_cleanup);
+
+	return 0;
+}
+dev_init(term_init);
+
+int term_exit(struct kvm *kvm)
+{
+	return 0;
+}
+dev_exit(term_exit);
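The intended consumption pattern is a poll-then-read loop, as in the
arch-specific kvm__arch_read_term() that term_poll_thread_loop() calls.
A trivial loopback sketch (illustrative only):

    static void example_echo_term(struct kvm *kvm, int term)
    {
            while (term_readable(term)) {
                    int c = term_getc(kvm, term);
                    char ch;

                    if (c < 0)
                            break;          /* nothing read, or escape */
                    ch = c;
                    term_putc(&ch, 1, term);
            }
    }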
diff --git a/tools/kvm/tests/Makefile b/tools/kvm/tests/Makefile
new file mode 100644
index 0000000..cad14ec
--- /dev/null
+++ b/tools/kvm/tests/Makefile
@@ -0,0 +1,19 @@
+all: kernel pit boot
+
+kernel:
+	$(MAKE) -C kernel
+.PHONY: kernel
+
+pit:
+	$(MAKE) -C pit
+.PHONY: pit
+
+boot:
+	$(MAKE) -C boot
+.PHONY: boot
+
+clean:
+	$(MAKE) -C kernel clean
+	$(MAKE) -C pit clean
+	$(MAKE) -C boot clean
+.PHONY: clean
diff --git a/tools/kvm/tests/boot/Makefile b/tools/kvm/tests/boot/Makefile
new file mode 100644
index 0000000..40cba68
--- /dev/null
+++ b/tools/kvm/tests/boot/Makefile
@@ -0,0 +1,13 @@
+NAME	:= init
+
+OBJ	:= $(NAME).o
+
+all:
+	rm -rf rootfs
+	mkdir rootfs
+	gcc -static init.c -o rootfs/init
+	mkisofs rootfs > boot_test.iso
+
+clean:
+	rm -rf rootfs boot_test.iso
+.PHONY: clean
diff --git a/tools/kvm/tests/boot/init.c b/tools/kvm/tests/boot/init.c
new file mode 100644
index 0000000..094f8ba
--- /dev/null
+++ b/tools/kvm/tests/boot/init.c
@@ -0,0 +1,13 @@
+#include <linux/reboot.h>
+#include <sys/reboot.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int main(int argc, char *argv[])
+{
+	puts("hello, KVM guest!\r");
+
+	reboot(LINUX_REBOOT_CMD_RESTART);
+
+	return 0;
+}
diff --git a/tools/kvm/tests/kernel/.gitignore b/tools/kvm/tests/kernel/.gitignore
new file mode 100644
index 0000000..d0cd209
--- /dev/null
+++ b/tools/kvm/tests/kernel/.gitignore
@@ -0,0 +1,2 @@
+kernel.bin
+kernel.elf
diff --git a/tools/kvm/tests/kernel/Makefile b/tools/kvm/tests/kernel/Makefile
new file mode 100644
index 0000000..c7dd8da
--- /dev/null
+++ b/tools/kvm/tests/kernel/Makefile
@@ -0,0 +1,20 @@
+NAME	:= kernel
+
+BIN	:= $(NAME).bin
+ELF	:= $(NAME).elf
+OBJ	:= $(NAME).o
+
+all: $(BIN)
+
+$(BIN): $(ELF)
+	objcopy -O binary $< $@
+
+$(ELF): $(OBJ)
+	ld -Ttext=0x00 -nostdlib -static $< -o $@
+
+%.o: %.S
+	gcc -nostdinc -c $< -o $@
+
+clean:
+	rm -f $(BIN) $(ELF) $(OBJ)
+.PHONY: clean
diff --git a/tools/kvm/tests/kernel/README b/tools/kvm/tests/kernel/README
new file mode 100644
index 0000000..2923777
--- /dev/null
+++ b/tools/kvm/tests/kernel/README
@@ -0,0 +1,16 @@
+Compiling
+---------
+
+You can simply type:
+
+  $ make
+
+to build a 16-bit binary that uses the i8086 instruction set.
+
+Disassembling
+-------------
+
+Use the "-m i8086" command line option with objdump to make sure it knows we're
+dealing with the i8086 instruction set:
+
+  $ objdump -d -m i8086 kernel.elf
diff --git a/tools/kvm/tests/kernel/kernel.S b/tools/kvm/tests/kernel/kernel.S
new file mode 100644
index 0000000..2824b64
--- /dev/null
+++ b/tools/kvm/tests/kernel/kernel.S
@@ -0,0 +1,8 @@
+	.code16gcc
+	.text
+	.globl  _start
+	.type   _start, @function
+_start:
+	# "This is probably the largest possible kernel that is bug free." -- Avi Kivity
+	1:
+	jmp 1b
diff --git a/tools/kvm/tests/pit/.gitignore b/tools/kvm/tests/pit/.gitignore
new file mode 100644
index 0000000..43f0aa8
--- /dev/null
+++ b/tools/kvm/tests/pit/.gitignore
@@ -0,0 +1,2 @@
+*.bin
+*.elf
diff --git a/tools/kvm/tests/pit/Makefile b/tools/kvm/tests/pit/Makefile
new file mode 100644
index 0000000..2fae9b2
--- /dev/null
+++ b/tools/kvm/tests/pit/Makefile
@@ -0,0 +1,20 @@
+NAME	:= tick
+
+BIN	:= $(NAME).bin
+ELF	:= $(NAME).elf
+OBJ	:= $(NAME).o
+
+all: $(BIN)
+
+$(BIN): $(ELF)
+	objcopy -O binary $< $@
+
+$(ELF): $(OBJ)
+	ld -Ttext=0x00 -nostdlib -static $< -o $@
+
+%.o: %.S
+	gcc -nostdinc -c $< -o $@
+
+clean:
+	rm -f $(BIN) $(ELF) $(OBJ)
+.PHONY: clean
diff --git a/tools/kvm/tests/pit/README b/tools/kvm/tests/pit/README
new file mode 100644
index 0000000..2923777
--- /dev/null
+++ b/tools/kvm/tests/pit/README
@@ -0,0 +1,16 @@
+Compiling
+---------
+
+You can simply type:
+
+  $ make
+
+to build a 16-bit binary that uses the i8086 instruction set.
+
+Disassembling
+-------------
+
+Use the "-m i8086" command line option with objdump to make sure it knows we're
+dealing with the i8086 instruction set:
+
+  $ objdump -d -m i8086 tick.elf
diff --git a/tools/kvm/tests/pit/tick.S b/tools/kvm/tests/pit/tick.S
new file mode 100644
index 0000000..635dc8d
--- /dev/null
+++ b/tools/kvm/tests/pit/tick.S
@@ -0,0 +1,101 @@
+#define IO_PIC		0x20
+#define IRQ_OFFSET	32
+#define IO_PIT		0x40
+#define TIMER_FREQ	1193182
+#define TIMER_DIV(x)	((TIMER_FREQ+(x)/2)/(x))
+
+#define TEST_COUNT	0x0200
+
+	.code16gcc
+	.text
+	.globl	_start
+	.type	_start, @function
+_start:
+/*
+ * fill up noop handlers
+ */
+	xorw	%ax, %ax
+	xorw	%di, %di
+	movw	%ax, %es
+	movw	$256, %cx
+fill_noop_idt:
+	movw	$noop_handler, %es:(%di)
+	movw	%cs, %es:2(%di)
+	add	$4, %di
+	loop	fill_noop_idt
+
+set_idt:
+	movw	$timer_isr, %es:(IRQ_OFFSET*4)
+	movw	%cs, %es:(IRQ_OFFSET*4+2)
+
+set_pic:
+	# ICW1
+	mov	$0x11, %al
+	mov	$(IO_PIC), %dx
+	out	%al,%dx
+	# ICW2
+	mov	$(IRQ_OFFSET), %al
+	mov	$(IO_PIC+1), %dx
+	out	%al, %dx
+	# ICW3
+	mov	$0x00, %al
+	mov	$(IO_PIC+1), %dx
+	out	%al, %dx
+	# ICW4
+	mov	$0x3, %al
+	mov	$(IO_PIC+1), %dx
+	out	%al, %dx
+
+set_pit:
+	# set 8254 mode
+	mov	$(IO_PIT+3), %dx
+	mov	$0x34, %al
+	outb	%al, %dx
+	# set 8254 freq 1KHz
+	mov	$(IO_PIT), %dx
+	movb	$(TIMER_DIV(1000) % 256), %al
+	outb	%al, %dx
+	movb	$(TIMER_DIV(1000) / 256), %al
+	outb	%al, %dx
+
+enable_irq0:
+	mov	$0xfe, %al
+	mov	$(IO_PIC+1), %dx
+	out	%al, %dx
+	sti
+loop:
+	1:
+	jmp	1b
+
+test_ok:
+	mov	$0x3f8,%dx
+	cs lea	msg2, %si
+	mov	$(msg2_end-msg2), %cx
+	cs rep/outsb
+
+	/* Reboot by using the i8042 reboot line */
+	mov	$0xfe, %al
+	outb	%al, $0x64
+
+timer_isr:
+	cli
+	pushaw
+	pushfw
+	mov	$0x3f8,%dx
+	mov	$0x2e, %al	# .
+	out	%al,%dx
+	decw	count
+	jz	test_ok
+	popfw
+	popaw
+	iretw
+
+noop_handler:
+	iretw
+
+count:
+	.word	TEST_COUNT
+
+msg2:
+	.asciz "\nTest OK\n"
+msg2_end:
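Note that TIMER_DIV() rounds to the nearest divisor instead of
truncating. For the 1 kHz rate programmed above:

    /* TIMER_DIV(1000) = (1193182 + 500) / 1000 = 1193 = 0x04a9,
     * so set_pit writes 0xa9 (low byte) then 0x04 (high byte)
     * to the channel-0 data port. */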
diff --git a/tools/kvm/ui/gtk3.c b/tools/kvm/ui/gtk3.c
new file mode 100644
index 0000000..b2335bc
--- /dev/null
+++ b/tools/kvm/ui/gtk3.c
@@ -0,0 +1,325 @@
+#include "kvm/gtk3.h"
+
+#include "kvm/framebuffer.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/i8042.h"
+#include "kvm/vesa.h"
+#include "kvm/kvm.h"
+
+#include <gtk/gtk.h>
+#include <pthread.h>
+#include <linux/err.h>
+
+#define FRAME_RATE			25
+
+#define SCANCODE_UNKNOWN		0
+#define SCANCODE_NORMAL			1
+#define SCANCODE_ESCAPED		2
+#define SCANCODE_KEY_PAUSE		3
+#define SCANCODE_KEY_PRNTSCRN		4
+
+struct set2_scancode {
+	u8 code;
+	u8 type;
+};
+
+#define DEFINE_SC(_code) {		\
+	.code = _code,			\
+	.type = SCANCODE_NORMAL,	\
+}
+
+/* escaped scancodes */
+#define DEFINE_ESC(_code) {		\
+	.code = _code,			\
+	.type = SCANCODE_ESCAPED,	\
+}
+
+static const struct set2_scancode keymap[256] = {
+	[9]	= DEFINE_SC(0x76),	/* <esc> */
+	[10]	= DEFINE_SC(0x16),	/* 1 */
+	[11]	= DEFINE_SC(0x1e),	/* 2 */
+	[12]	= DEFINE_SC(0x26),	/* 3 */
+	[13]	= DEFINE_SC(0x25),	/* 4 */
+	[14]	= DEFINE_SC(0x2e),	/* 5 */
+	[15]	= DEFINE_SC(0x36),	/* 6 */
+	[16]	= DEFINE_SC(0x3d),	/* 7 */
+	[17]	= DEFINE_SC(0x3e),	/* 8 */
+	[18]	= DEFINE_SC(0x46),	/* 9 */
+	[19]	= DEFINE_SC(0x45),	/* 0 */
+	[20]	= DEFINE_SC(0x4e),	/* - */
+	[21]	= DEFINE_SC(0x55),	/* + */
+	[22]	= DEFINE_SC(0x66),	/* <backspace> */
+	[23]	= DEFINE_SC(0x0d),	/* <tab> */
+	[24]	= DEFINE_SC(0x15),	/* q */
+	[25]	= DEFINE_SC(0x1d),	/* w */
+	[26]	= DEFINE_SC(0x24),	/* e */
+	[27]	= DEFINE_SC(0x2d),	/* r */
+	[28]	= DEFINE_SC(0x2c),	/* t */
+	[29]	= DEFINE_SC(0x35),	/* y */
+	[30]	= DEFINE_SC(0x3c),	/* u */
+	[31]	= DEFINE_SC(0x43),	/* i */
+	[32]	= DEFINE_SC(0x44),	/* o */
+	[33]	= DEFINE_SC(0x4d),	/* p */
+	[34]	= DEFINE_SC(0x54),	/* [ */
+	[35]	= DEFINE_SC(0x5b),	/* ] */
+	[36]	= DEFINE_SC(0x5a),	/* <enter> */
+	[37]	= DEFINE_SC(0x14),	/* <left ctrl> */
+	[38]	= DEFINE_SC(0x1c),	/* a */
+	[39]	= DEFINE_SC(0x1b),	/* s */
+	[40]	= DEFINE_SC(0x23),	/* d */
+	[41]	= DEFINE_SC(0x2b),	/* f */
+	[42]	= DEFINE_SC(0x34),	/* g */
+	[43]	= DEFINE_SC(0x33),	/* h */
+	[44]	= DEFINE_SC(0x3b),	/* j */
+	[45]	= DEFINE_SC(0x42),	/* k */
+	[46]	= DEFINE_SC(0x4b),	/* l */
+	[47]	= DEFINE_SC(0x4c),	/* ; */
+	[48]	= DEFINE_SC(0x52),	/* ' */
+	[49]	= DEFINE_SC(0x0e),	/* ` */
+	[50]	= DEFINE_SC(0x12),	/* <left shift> */
+	[51]	= DEFINE_SC(0x5d),	/* \ */
+	[52]	= DEFINE_SC(0x1a),	/* z */
+	[53]	= DEFINE_SC(0x22),	/* x */
+	[54]	= DEFINE_SC(0x21),	/* c */
+	[55]	= DEFINE_SC(0x2a),	/* v */
+	[56]	= DEFINE_SC(0x32),	/* b */
+	[57]	= DEFINE_SC(0x31),	/* n */
+	[58]	= DEFINE_SC(0x3a),	/* m */
+	[59]	= DEFINE_SC(0x41),	/* < */
+	[60]	= DEFINE_SC(0x49),	/* > */
+	[61]	= DEFINE_SC(0x4a),	/* / */
+	[62]	= DEFINE_SC(0x59),	/* <right shift> */
+	[63]	= DEFINE_SC(0x7c),	/* keypad * */
+	[64]	= DEFINE_SC(0x11),	/* <left alt> */
+	[65]	= DEFINE_SC(0x29),	/* <space> */
+
+	[67]	= DEFINE_SC(0x05),	/* <F1> */
+	[68]	= DEFINE_SC(0x06),	/* <F2> */
+	[69]	= DEFINE_SC(0x04),	/* <F3> */
+	[70]	= DEFINE_SC(0x0c),	/* <F4> */
+	[71]	= DEFINE_SC(0x03),	/* <F5> */
+	[72]	= DEFINE_SC(0x0b),	/* <F6> */
+	[73]	= DEFINE_SC(0x83),	/* <F7> */
+	[74]	= DEFINE_SC(0x0a),	/* <F8> */
+	[75]	= DEFINE_SC(0x01),	/* <F9> */
+	[76]	= DEFINE_SC(0x09),	/* <F10> */
+
+	[79]	= DEFINE_SC(0x6c),	/* keypad 7 */
+	[80]	= DEFINE_SC(0x75),	/* keypad 8 */
+	[81]	= DEFINE_SC(0x7d),	/* keypad 9 */
+	[82]	= DEFINE_SC(0x7b),	/* keypad - */
+	[83]	= DEFINE_SC(0x6b),	/* keypad 4 */
+	[84]	= DEFINE_SC(0x73),	/* keypad 5 */
+	[85]	= DEFINE_SC(0x74),	/* keypad 6 */
+	[86]	= DEFINE_SC(0x79),	/* keypad + */
+	[87]	= DEFINE_SC(0x69),	/* keypad 1 */
+	[88]	= DEFINE_SC(0x72),	/* keypad 2 */
+	[89]	= DEFINE_SC(0x7a),	/* keypad 3 */
+	[90]	= DEFINE_SC(0x70),	/* keypad 0 */
+	[91]	= DEFINE_SC(0x71),	/* keypad . */
+
+	[94]	= DEFINE_SC(0x61),	/* <INT 1> */
+	[95]	= DEFINE_SC(0x78),	/* <F11> */
+	[96]	= DEFINE_SC(0x07),	/* <F12> */
+
+	[104]	= DEFINE_ESC(0x5a),	/* keypad <enter> */
+	[105]	= DEFINE_ESC(0x14),	/* <right ctrl> */
+	[106]	= DEFINE_ESC(0x4a),	/* keypad / */
+	[108]	= DEFINE_ESC(0x11),	/* <right alt> */
+	[110]	= DEFINE_ESC(0x6c),	/* <home> */
+	[111]	= DEFINE_ESC(0x75),	/* <up> */
+	[112]	= DEFINE_ESC(0x7d),	/* <pag up> */
+	[113]	= DEFINE_ESC(0x6b),	/* <left> */
+	[114]	= DEFINE_ESC(0x74),	/* <right> */
+	[115]	= DEFINE_ESC(0x69),	/* <end> */
+	[116]	= DEFINE_ESC(0x72),	/* <down> */
+	[117]	= DEFINE_ESC(0x7a),	/* <pag down> */
+	[118]	= DEFINE_ESC(0x70),	/* <ins> */
+	[119]	= DEFINE_ESC(0x71),	/* <delete> */
+};
+
+static cairo_surface_t	*surface;
+static bool		done;
+
+static const struct set2_scancode *to_code(u8 scancode)
+{
+	return &keymap[scancode];
+}
+
+static gboolean
+kvm_gtk_configure_event(GtkWidget * widget, GdkEventConfigure * event, gpointer data)
+{
+	struct framebuffer *fb = data;
+	int stride;
+
+	if (surface)
+		cairo_surface_destroy(surface);
+
+	stride = cairo_format_stride_for_width(CAIRO_FORMAT_RGB24, fb->width);
+
+	surface =
+	    cairo_image_surface_create_for_data((void *) fb->mem,
+						CAIRO_FORMAT_RGB24,
+						fb->width,
+						fb->height,
+						stride);
+
+	return TRUE;
+}
+
+static gboolean kvm_gtk_draw(GtkWidget *widget, cairo_t *cr, gpointer data)
+{
+	cairo_set_source_surface(cr, surface, 0, 0);
+
+	cairo_paint(cr);
+
+	return FALSE;
+}
+
+static void kvm_gtk_destroy(void)
+{
+	if (surface)
+		cairo_surface_destroy(surface);
+
+	gtk_main_quit();
+}
+
+static gboolean kvm_gtk_redraw(GtkWidget * window)
+{
+	gtk_widget_queue_draw(window);
+
+	return TRUE;
+}
+
+static gboolean
+kvm_gtk_key_press(GtkWidget *widget, GdkEventKey *event, gpointer user_data)
+{
+	const struct set2_scancode *sc = to_code(event->hardware_keycode);
+
+	switch (sc->type) {
+	case SCANCODE_ESCAPED:
+		kbd_queue(0xe0);
+		/* fallthrough */
+	case SCANCODE_NORMAL:
+		kbd_queue(sc->code);
+		break;
+	case SCANCODE_KEY_PAUSE:
+		kbd_queue(0xe1);
+		kbd_queue(0x14);
+		kbd_queue(0x77);
+		kbd_queue(0xe1);
+		kbd_queue(0xf0);
+		kbd_queue(0x14);
+		kbd_queue(0x77);
+		break;
+	case SCANCODE_KEY_PRNTSCRN:
+		kbd_queue(0xe0);
+		kbd_queue(0x12);
+		kbd_queue(0xe0);
+		kbd_queue(0x7c);
+		break;
+	}
+
+	return TRUE;
+}
+
+static void *kvm_gtk_thread(void *p)
+{
+	struct framebuffer *fb = p;
+	GtkWidget *window;
+	GtkWidget *frame;
+	GtkWidget *da;
+
+	gtk_init(NULL, NULL);
+
+	window = gtk_window_new(GTK_WINDOW_TOPLEVEL);
+
+	gtk_window_set_title(GTK_WINDOW(window), "VM");
+
+	g_signal_connect(window, "destroy", G_CALLBACK(kvm_gtk_destroy), NULL);
+
+	gtk_container_set_border_width(GTK_CONTAINER(window), 8);
+
+	frame = gtk_frame_new(NULL);
+
+	gtk_frame_set_shadow_type(GTK_FRAME(frame), GTK_SHADOW_IN);
+	gtk_container_add(GTK_CONTAINER(window), frame);
+
+	da = gtk_drawing_area_new();
+
+	gtk_widget_set_size_request(da, 100, 100);
+
+	gtk_container_add(GTK_CONTAINER(frame), da);
+
+	g_signal_connect(da, "draw", G_CALLBACK(kvm_gtk_draw), NULL);
+	g_signal_connect(da, "configure-event",
+			 G_CALLBACK(kvm_gtk_configure_event), fb);
+	g_signal_connect(G_OBJECT(window), "key_press_event", G_CALLBACK(kvm_gtk_key_press), NULL);
+
+	gtk_widget_set_events(da, gtk_widget_get_events(da)
+			      | GDK_BUTTON_PRESS_MASK
+			      | GDK_POINTER_MOTION_MASK
+			      | GDK_POINTER_MOTION_HINT_MASK);
+
+	gtk_widget_show_all(window);
+
+	g_timeout_add(1000 / FRAME_RATE, (GSourceFunc) kvm_gtk_redraw, window);
+
+	gtk_main();
+
+	done = true;
+
+	return NULL;
+}
+
+static int kvm_gtk_start(struct framebuffer *fb)
+{
+	pthread_t thread;
+
+	if (pthread_create(&thread, NULL, kvm_gtk_thread, fb) != 0)
+		return -1;
+
+	return 0;
+}
+
+static int kvm_gtk_stop(struct framebuffer *fb)
+{
+	gtk_main_quit();
+
+	while (!done)
+		sleep(0);
+
+	return 0;
+}
+
+static struct fb_target_operations kvm_gtk_ops = {
+	.start		= kvm_gtk_start,
+	.stop		= kvm_gtk_stop,
+};
+
+int kvm_gtk_init(struct kvm *kvm)
+{
+	struct framebuffer *fb;
+
+	if (!kvm->cfg.gtk)
+		return 0;
+
+	fb = vesa__init(kvm);
+	if (IS_ERR(fb)) {
+		pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb));
+		return PTR_ERR(fb);
+	}
+
+	return fb__attach(fb, &kvm_gtk_ops);
+}
+
+int kvm_gtk_exit(struct kvm *kvm)
+{
+	if (kvm->cfg.gtk)
+		return kvm_gtk_stop(NULL);
+
+	return 0;
+}
+
+dev_init(kvm_gtk_init);
+dev_exit(kvm_gtk_exit);
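This backend only hooks key_press_event, so the guest never sees set-2
break codes. A release handler mirroring key_release() in ui/sdl.c below
would be a natural extension; a sketch (not wired up by this patch):

    static gboolean
    kvm_gtk_key_release(GtkWidget *widget, GdkEventKey *event, gpointer data)
    {
            const struct set2_scancode *sc = to_code(event->hardware_keycode);

            switch (sc->type) {
            case SCANCODE_ESCAPED:
                    kbd_queue(0xe0);
                    /* fallthrough */
            case SCANCODE_NORMAL:
                    kbd_queue(0xf0);        /* set-2 break prefix */
                    kbd_queue(sc->code);
                    break;
            }
            return TRUE;
    }

    /* hooked up with:
     *   g_signal_connect(window, "key_release_event",
     *                    G_CALLBACK(kvm_gtk_key_release), NULL);
     */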
diff --git a/tools/kvm/ui/sdl.c b/tools/kvm/ui/sdl.c
new file mode 100644
index 0000000..a260002
--- /dev/null
+++ b/tools/kvm/ui/sdl.c
@@ -0,0 +1,324 @@
+#include "kvm/sdl.h"
+
+#include "kvm/framebuffer.h"
+#include "kvm/i8042.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/vesa.h"
+
+#include <SDL/SDL.h>
+#include <pthread.h>
+#include <signal.h>
+#include <linux/err.h>
+
+#define FRAME_RATE		25
+
+#define SCANCODE_UNKNOWN      0
+#define SCANCODE_NORMAL       1
+#define SCANCODE_ESCAPED      2
+#define SCANCODE_KEY_PAUSE    3
+#define SCANCODE_KEY_PRNTSCRN 4
+
+struct set2_scancode {
+	u8 code;
+	u8 type;
+};
+
+#define DEFINE_SC(_code) {\
+	.code = _code,\
+	.type = SCANCODE_NORMAL,\
+}
+
+/* escaped scancodes */
+#define DEFINE_ESC(_code) {\
+	.code = _code,\
+	.type = SCANCODE_ESCAPED,\
+}
+
+static const struct set2_scancode keymap[256] = {
+	[9]	= DEFINE_SC(0x76),	/* <esc> */
+	[10]	= DEFINE_SC(0x16),	/* 1 */
+	[11]	= DEFINE_SC(0x1e),	/* 2 */
+	[12]	= DEFINE_SC(0x26),	/* 3 */
+	[13]	= DEFINE_SC(0x25),	/* 4 */
+	[14]	= DEFINE_SC(0x2e),	/* 5 */
+	[15]	= DEFINE_SC(0x36),	/* 6 */
+	[16]	= DEFINE_SC(0x3d),	/* 7 */
+	[17]	= DEFINE_SC(0x3e),	/* 8 */
+	[18]	= DEFINE_SC(0x46),	/* 9 */
+	[19]	= DEFINE_SC(0x45),	/* 0 */
+	[20]	= DEFINE_SC(0x4e),	/* - */
+	[21]	= DEFINE_SC(0x55),	/* + */
+	[22]	= DEFINE_SC(0x66),	/* <backspace> */
+	[23]	= DEFINE_SC(0x0d),	/* <tab> */
+	[24]	= DEFINE_SC(0x15),	/* q */
+	[25]	= DEFINE_SC(0x1d),	/* w */
+	[26]	= DEFINE_SC(0x24),	/* e */
+	[27]	= DEFINE_SC(0x2d),	/* r */
+	[28]	= DEFINE_SC(0x2c),	/* t */
+	[29]	= DEFINE_SC(0x35),	/* y */
+	[30]	= DEFINE_SC(0x3c),	/* u */
+	[31]	= DEFINE_SC(0x43),	/* i */
+	[32]	= DEFINE_SC(0x44),	/* o */
+	[33]	= DEFINE_SC(0x4d),	/* p */
+	[34]	= DEFINE_SC(0x54),	/* [ */
+	[35]	= DEFINE_SC(0x5b),	/* ] */
+	[36]	= DEFINE_SC(0x5a),	/* <enter> */
+	[37]	= DEFINE_SC(0x14),	/* <left ctrl> */
+	[38]	= DEFINE_SC(0x1c),	/* a */
+	[39]	= DEFINE_SC(0x1b),	/* s */
+	[40]	= DEFINE_SC(0x23),	/* d */
+	[41]	= DEFINE_SC(0x2b),	/* f */
+	[42]	= DEFINE_SC(0x34),	/* g */
+	[43]	= DEFINE_SC(0x33),	/* h */
+	[44]	= DEFINE_SC(0x3b),	/* j */
+	[45]	= DEFINE_SC(0x42),	/* k */
+	[46]	= DEFINE_SC(0x4b),	/* l */
+	[47]	= DEFINE_SC(0x4c),	/* ; */
+	[48]	= DEFINE_SC(0x52),	/* ' */
+	[49]	= DEFINE_SC(0x0e),	/* ` */
+	[50]	= DEFINE_SC(0x12),	/* <left shift> */
+	[51]	= DEFINE_SC(0x5d),	/* \ */
+	[52]	= DEFINE_SC(0x1a),	/* z */
+	[53]	= DEFINE_SC(0x22),	/* x */
+	[54]	= DEFINE_SC(0x21),	/* c */
+	[55]	= DEFINE_SC(0x2a),	/* v */
+	[56]	= DEFINE_SC(0x32),	/* b */
+	[57]	= DEFINE_SC(0x31),	/* n */
+	[58]	= DEFINE_SC(0x3a),	/* m */
+	[59]	= DEFINE_SC(0x41),	/* < */
+	[60]	= DEFINE_SC(0x49),	/* > */
+	[61]	= DEFINE_SC(0x4a),	/* / */
+	[62]	= DEFINE_SC(0x59),	/* <right shift> */
+	[63]	= DEFINE_SC(0x7c),	/* keypad * */
+	[64]	= DEFINE_SC(0x11),	/* <left alt> */
+	[65]	= DEFINE_SC(0x29),	/* <space> */
+
+	[67]	= DEFINE_SC(0x05),	/* <F1> */
+	[68]	= DEFINE_SC(0x06),	/* <F2> */
+	[69]	= DEFINE_SC(0x04),	/* <F3> */
+	[70]	= DEFINE_SC(0x0c),	/* <F4> */
+	[71]	= DEFINE_SC(0x03),	/* <F5> */
+	[72]	= DEFINE_SC(0x0b),	/* <F6> */
+	[73]	= DEFINE_SC(0x83),	/* <F7> */
+	[74]	= DEFINE_SC(0x0a),	/* <F8> */
+	[75]	= DEFINE_SC(0x01),	/* <F9> */
+	[76]	= DEFINE_SC(0x09),	/* <F10> */
+
+	[79]	= DEFINE_SC(0x6c),	/* keypad 7 */
+	[80]	= DEFINE_SC(0x75),	/* keypad 8 */
+	[81]	= DEFINE_SC(0x7d),	/* keypad 9 */
+	[82]	= DEFINE_SC(0x7b),	/* keypad - */
+	[83]	= DEFINE_SC(0x6b),	/* keypad 4 */
+	[84]	= DEFINE_SC(0x73),	/* keypad 5 */
+	[85]	= DEFINE_SC(0x74),	/* keypad 6 */
+	[86]	= DEFINE_SC(0x79),	/* keypad + */
+	[87]	= DEFINE_SC(0x69),	/* keypad 1 */
+	[88]	= DEFINE_SC(0x72),	/* keypad 2 */
+	[89]	= DEFINE_SC(0x7a),	/* keypad 3 */
+	[90]	= DEFINE_SC(0x70),	/* keypad 0 */
+	[91]	= DEFINE_SC(0x71),	/* keypad . */
+
+	[94]	= DEFINE_SC(0x61),	/* <INT 1> */
+	[95]	= DEFINE_SC(0x78),	/* <F11> */
+	[96]	= DEFINE_SC(0x07),	/* <F12> */
+
+	[104]	= DEFINE_ESC(0x5a),	/* keypad <enter> */
+	[105]	= DEFINE_ESC(0x14),	/* <right ctrl> */
+	[106]	= DEFINE_ESC(0x4a),	/* keypad / */
+	[108]	= DEFINE_ESC(0x11),	/* <right alt> */
+	[110]	= DEFINE_ESC(0x6c),	/* <home> */
+	[111]	= DEFINE_ESC(0x75),	/* <up> */
+	[112]	= DEFINE_ESC(0x7d),	/* <pag up> */
+	[113]	= DEFINE_ESC(0x6b),	/* <left> */
+	[114]	= DEFINE_ESC(0x74),	/* <right> */
+	[115]	= DEFINE_ESC(0x69),	/* <end> */
+	[116]	= DEFINE_ESC(0x72),	/* <down> */
+	[117]	= DEFINE_ESC(0x7a),	/* <pag down> */
+	[118]	= DEFINE_ESC(0x70),	/* <ins> */
+	[119]	= DEFINE_ESC(0x71),	/* <delete> */
+};
+static bool running, done;
+
+static const struct set2_scancode *to_code(u8 scancode)
+{
+	return &keymap[scancode];
+}
+
+static void key_press(const struct set2_scancode *sc)
+{
+	switch (sc->type) {
+	case SCANCODE_ESCAPED:
+		kbd_queue(0xe0);
+		/* fallthrough */
+	case SCANCODE_NORMAL:
+		kbd_queue(sc->code);
+		break;
+	case SCANCODE_KEY_PAUSE:
+		kbd_queue(0xe1);
+		kbd_queue(0x14);
+		kbd_queue(0x77);
+		kbd_queue(0xe1);
+		kbd_queue(0xf0);
+		kbd_queue(0x14);
+		kbd_queue(0x77);
+		break;
+	case SCANCODE_KEY_PRNTSCRN:
+		kbd_queue(0xe0);
+		kbd_queue(0x12);
+		kbd_queue(0xe0);
+		kbd_queue(0x7c);
+		break;
+	}
+}
+
+static void key_release(const struct set2_scancode *sc)
+{
+	switch (sc->type) {
+	case SCANCODE_ESCAPED:
+		kbd_queue(0xe0);
+		/* fallthrough */
+	case SCANCODE_NORMAL:
+		kbd_queue(0xf0);
+		kbd_queue(sc->code);
+		break;
+	case SCANCODE_KEY_PAUSE:
+		/* nothing to do */
+		break;
+	case SCANCODE_KEY_PRNTSCRN:
+		kbd_queue(0xe0);
+		kbd_queue(0xf0);
+		kbd_queue(0x7c);
+		kbd_queue(0xe0);
+		kbd_queue(0xf0);
+		kbd_queue(0x12);
+		break;
+	}
+}
+
+static void *sdl__thread(void *p)
+{
+	Uint32 rmask, gmask, bmask, amask;
+	struct framebuffer *fb = p;
+	SDL_Surface *guest_screen;
+	SDL_Surface *screen;
+	SDL_Event ev;
+	Uint32 flags;
+
+	kvm__set_thread_name("kvm-sdl-worker");
+
+	if (SDL_Init(SDL_INIT_VIDEO) != 0)
+		die("Unable to initialize SDL");
+
+	rmask = 0x000000ff;
+	gmask = 0x0000ff00;
+	bmask = 0x00ff0000;
+	amask = 0x00000000;
+
+	guest_screen = SDL_CreateRGBSurfaceFrom(fb->mem, fb->width, fb->height, fb->depth, fb->width * fb->depth / 8, rmask, gmask, bmask, amask);
+	if (!guest_screen)
+		die("Unable to create SDL RGB surface");
+
+	flags = SDL_HWSURFACE | SDL_ASYNCBLIT | SDL_HWACCEL | SDL_DOUBLEBUF;
+
+	SDL_WM_SetCaption("KVM tool", "KVM tool");
+
+	screen = SDL_SetVideoMode(fb->width, fb->height, fb->depth, flags);
+	if (!screen)
+		die("Unable to set SDL video mode");
+
+	SDL_EnableKeyRepeat(200, 50);
+
+	while (running) {
+		SDL_BlitSurface(guest_screen, NULL, screen, NULL);
+		SDL_Flip(screen);
+
+		while (SDL_PollEvent(&ev)) {
+			switch (ev.type) {
+			case SDL_KEYDOWN: {
+				const struct set2_scancode *sc = to_code(ev.key.keysym.scancode);
+				if (sc->type == SCANCODE_UNKNOWN) {
+					pr_warning("key '%d' not found in keymap", ev.key.keysym.scancode);
+					break;
+				}
+				key_press(sc);
+				break;
+			}
+			case SDL_KEYUP: {
+				const struct set2_scancode *sc = to_code(ev.key.keysym.scancode);
+				if (sc->type == SCANCODE_UNKNOWN)
+					break;
+				key_release(sc);
+				break;
+			}
+			case SDL_QUIT:
+				goto exit;
+			}
+		}
+
+		SDL_Delay(1000 / FRAME_RATE);
+	}
+
+	if (!running && !done) {
+		done = true;
+		return NULL;
+	}
+exit:
+	done = true;
+	kvm_cpu__reboot(fb->kvm);
+
+	return NULL;
+}
+
+static int sdl__start(struct framebuffer *fb)
+{
+	pthread_t thread;
+
+	running = true;
+
+	if (pthread_create(&thread, NULL, sdl__thread, fb) != 0)
+		return -1;
+
+	return 0;
+}
+
+static int sdl__stop(struct framebuffer *fb)
+{
+	running = false;
+	while (!done)
+		sleep(0);
+
+	return 0;
+}
+
+static struct fb_target_operations sdl_ops = {
+	.start	= sdl__start,
+	.stop	= sdl__stop,
+};
+
+int sdl__init(struct kvm *kvm)
+{
+	struct framebuffer *fb;
+
+	if (!kvm->cfg.sdl)
+		return 0;
+
+	fb = vesa__init(kvm);
+	if (IS_ERR(fb)) {
+		pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb));
+		return PTR_ERR(fb);
+	}
+
+	return fb__attach(fb, &sdl_ops);
+}
+dev_init(sdl__init);
+
+int sdl__exit(struct kvm *kvm)
+{
+	if (kvm->cfg.sdl)
+		return sdl__stop(NULL);
+
+	return 0;
+}
+dev_exit(sdl__exit);
diff --git a/tools/kvm/ui/vnc.c b/tools/kvm/ui/vnc.c
new file mode 100644
index 0000000..12e4bd5
--- /dev/null
+++ b/tools/kvm/ui/vnc.c
@@ -0,0 +1,250 @@
+#include "kvm/vnc.h"
+
+#include "kvm/framebuffer.h"
+#include "kvm/i8042.h"
+#include "kvm/vesa.h"
+
+#include <linux/types.h>
+#include <rfb/keysym.h>
+#include <rfb/rfb.h>
+#include <pthread.h>
+#include <linux/err.h>
+
+#define VESA_QUEUE_SIZE		128
+#define VESA_IRQ		14
+
+/*
+ * This "6000" value is pretty much the result of experimentation
+ * It seems that around this value, things update pretty smoothly
+ */
+#define VESA_UPDATE_TIME	6000
+
+/*
+ * We can map the letters and numbers without a fuss,
+ * but the other characters not so much.
+ */
+static char letters[26] = {
+	0x1c, 0x32, 0x21, 0x23, 0x24, /* a-e */
+	0x2b, 0x34, 0x33, 0x43, 0x3b, /* f-j */
+	0x42, 0x4b, 0x3a, 0x31, 0x44, /* k-o */
+	0x4d, 0x15, 0x2d, 0x1b, 0x2c, /* p-t */
+	0x3c, 0x2a, 0x1d, 0x22, 0x35, /* u-y */
+	0x1a,
+};
+
+static rfbScreenInfoPtr server;
+static char num[10] = {
+	0x45, 0x16, 0x1e, 0x26, 0x2e, 0x23, 0x36, 0x3d, 0x3e, 0x46,
+};
+
+/*
+ * This is called when the VNC server receives a key event.
+ * The reason this function is such a beast is that we have
+ * to convert from ASCII characters (which is what VNC gets)
+ * to PC keyboard scancodes, which is what Linux expects to
+ * get from its keyboard. ASCII and the scancode set don't
+ * really seem to mesh in any good way beyond some basics with
+ * the letters and numbers.
+ */
+static void kbd_handle_key(rfbBool down, rfbKeySym key, rfbClientPtr cl)
+{
+	char tosend = 0;
+
+	if (key >= 0x41 && key <= 0x5a)
+		key += 0x20; /* convert to lowercase */
+
+	if (key >= 0x61 && key <= 0x7a) /* a-z */
+		tosend = letters[key - 0x61];
+
+	if (key >= 0x30 && key <= 0x39)
+		tosend = num[key - 0x30];
+
+	switch (key) {
+	case XK_Insert:		kbd_queue(0xe0);	tosend = 0x70;	break;
+	case XK_Delete:		kbd_queue(0xe0);	tosend = 0x71;	break;
+	case XK_Up:		kbd_queue(0xe0);	tosend = 0x75;	break;
+	case XK_Down:		kbd_queue(0xe0);	tosend = 0x72;	break;
+	case XK_Left:		kbd_queue(0xe0);	tosend = 0x6b;	break;
+	case XK_Right:		kbd_queue(0xe0);	tosend = 0x74;	break;
+	case XK_Page_Up:	kbd_queue(0xe0);	tosend = 0x7d;	break;
+	case XK_Page_Down:	kbd_queue(0xe0);	tosend = 0x7a;	break;
+	case XK_Home:		kbd_queue(0xe0);	tosend = 0x6c;	break;
+	case XK_BackSpace:	tosend = 0x66;		break;
+	case XK_Tab:		tosend = 0x0d;		break;
+	case XK_Return:		tosend = 0x5a;		break;
+	case XK_Escape:		tosend = 0x76;		break;
+	case XK_End:		tosend = 0x69;		break;
+	case XK_Shift_L:	tosend = 0x12;		break;
+	case XK_Shift_R:	tosend = 0x59;		break;
+	case XK_Control_R:	kbd_queue(0xe0);
+	case XK_Control_L:	tosend = 0x14;		break;
+	case XK_Alt_R:		kbd_queue(0xe0);
+	case XK_Alt_L:		tosend = 0x11;		break;
+	case XK_quoteleft:	tosend = 0x0e;		break;
+	case XK_minus:		tosend = 0x4e;		break;
+	case XK_equal:		tosend = 0x55;		break;
+	case XK_bracketleft:	tosend = 0x54;		break;
+	case XK_bracketright:	tosend = 0x5b;		break;
+	case XK_backslash:	tosend = 0x5d;		break;
+	case XK_Caps_Lock:	tosend = 0x58;		break;
+	case XK_semicolon:	tosend = 0x4c;		break;
+	case XK_quoteright:	tosend = 0x52;		break;
+	case XK_comma:		tosend = 0x41;		break;
+	case XK_period:		tosend = 0x49;		break;
+	case XK_slash:		tosend = 0x4a;		break;
+	case XK_space:		tosend = 0x29;		break;
+
+	/*
+	 * This is where I handle the shifted characters.
+	 * They don't really map nicely the way A-Z maps to a-z,
+	 * so I'm doing it manually
+	 */
+	case XK_exclam:		tosend = 0x16;		break;
+	case XK_quotedbl:	tosend = 0x52;		break;
+	case XK_numbersign:	tosend = 0x26;		break;
+	case XK_dollar:		tosend = 0x25;		break;
+	case XK_percent:	tosend = 0x2e;		break;
+	case XK_ampersand:	tosend = 0x3d;		break;
+	case XK_parenleft:	tosend = 0x46;		break;
+	case XK_parenright:	tosend = 0x45;		break;
+	case XK_asterisk:	tosend = 0x3e;		break;
+	case XK_plus:		tosend = 0x55;		break;
+	case XK_colon:		tosend = 0x4c;		break;
+	case XK_less:		tosend = 0x41;		break;
+	case XK_greater:	tosend = 0x49;		break;
+	case XK_question:	tosend = 0x4a;		break;
+	case XK_at:		tosend = 0x1e;		break;
+	case XK_asciicircum:	tosend = 0x36;		break;
+	case XK_underscore:	tosend = 0x4e;		break;
+	case XK_braceleft:	tosend = 0x54;		break;
+	case XK_braceright:	tosend = 0x5b;		break;
+	case XK_bar:		tosend = 0x5d;		break;
+	case XK_asciitilde:	tosend = 0x0e;		break;
+	default:		break;
+	}
+
+	/*
+	 * If this is a "key up" event (the user has released the key), we
+	 * need to send 0xf0 first.
+	 */
+	if (!down && tosend != 0x0)
+		kbd_queue(0xf0);
+
+	if (tosend)
+		kbd_queue(tosend);
+}
+
+/* The previous X and Y coordinates of the mouse */
+static int xlast = -1, ylast = -1;
+
+/*
+ * This function is called by the VNC server whenever a mouse event occurs.
+ */
+static void kbd_handle_ptr(int buttonMask, int x, int y, rfbClientPtr cl)
+{
+	int dx, dy;
+	char b1 = 0x8;
+
+	/* The VNC mask and the PS/2 button encoding are the same */
+	b1 |= buttonMask;
+
+	if (xlast >= 0 && ylast >= 0) {
+		/* The PS/2 mouse sends deltas, not absolutes */
+		dx = x - xlast;
+		dy = ylast - y;
+
+		/* Set overflow bits if needed */
+		if (dy > 255)
+			b1 |= 0x80;
+		if (dx > 255)
+			b1 |= 0x40;
+
+		/* Set negative bits if needed */
+		if (dy < 0)
+			b1 |= 0x20;
+		if (dx < 0)
+			b1 |= 0x10;
+
+		mouse_queue(b1);
+		mouse_queue(dx);
+		mouse_queue(dy);
+	}
+
+	xlast = x;
+	ylast = y;
+	rfbDefaultPtrAddEvent(buttonMask, x, y, cl);
+}
+
+static void *vnc__thread(void *p)
+{
+	struct framebuffer *fb = p;
+	/*
+	 * Make a fake argc and argv because the getscreen function
+	 * seems to want it.
+	 */
+	char argv[1][1] = {{0}};
+	int argc = 1;
+
+	kvm__set_thread_name("kvm-vnc-worker");
+
+	server = rfbGetScreen(&argc, (char **) argv, fb->width, fb->height, 8, 3, 4);
+	server->frameBuffer		= fb->mem;
+	server->alwaysShared		= TRUE;
+	server->kbdAddEvent		= kbd_handle_key;
+	server->ptrAddEvent		= kbd_handle_ptr;
+	rfbInitServer(server);
+
+	while (rfbIsActive(server)) {
+		rfbMarkRectAsModified(server, 0, 0, fb->width, fb->height);
+		rfbProcessEvents(server, server->deferUpdateTime * VESA_UPDATE_TIME);
+	}
+	return NULL;
+}
+
+static int vnc__start(struct framebuffer *fb)
+{
+	pthread_t thread;
+
+	if (pthread_create(&thread, NULL, vnc__thread, fb) != 0)
+		return -1;
+
+	return 0;
+}
+
+static int vnc__stop(struct framebuffer *fb)
+{
+	rfbShutdownServer(server, TRUE);
+
+	return 0;
+}
+
+static struct fb_target_operations vnc_ops = {
+	.start	= vnc__start,
+	.stop	= vnc__stop,
+};
+
+int vnc__init(struct kvm *kvm)
+{
+	struct framebuffer *fb;
+
+	if (!kvm->cfg.vnc)
+		return 0;
+
+	fb = vesa__init(kvm);
+	if (IS_ERR(fb)) {
+		pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb));
+		return PTR_ERR(fb);
+	}
+
+	return fb__attach(fb, &vnc_ops);
+}
+dev_init(vnc__init);
+
+int vnc__exit(struct kvm *kvm)
+{
+	if (kvm->cfg.vnc)
+		return vnc__stop(NULL);
+
+	return 0;
+}
+dev_exit(vnc__exit);
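For reference, the mouse handler above emits the standard three-byte PS/2 relative-movement packet: a flags byte (bit 3 always set, the low bits mirroring the VNC button mask, plus sign and overflow bits), followed by the low eight bits of each delta. A minimal standalone sketch of that encoding; the function name is illustrative and not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Pack one PS/2 relative-movement packet, as kbd_handle_ptr() does. */
static void ps2_packet(int buttons, int dx, int dy, uint8_t out[3])
{
	uint8_t flags = 0x08 | (buttons & 0x07);	/* bit 3 is always set */

	if (dx < 0)
		flags |= 0x10;		/* X sign */
	if (dy < 0)
		flags |= 0x20;		/* Y sign */
	if (dx > 255 || dx < -256)
		flags |= 0x40;		/* X overflow */
	if (dy > 255 || dy < -256)
		flags |= 0x80;		/* Y overflow */

	out[0] = flags;
	out[1] = (uint8_t)dx;		/* low 8 bits of each delta */
	out[2] = (uint8_t)dy;
}

int main(void)
{
	uint8_t pkt[3];

	ps2_packet(0x1, -3, 7, pkt);	/* left button held */
	printf("%02x %02x %02x\n", pkt[0], pkt[1], pkt[2]);	/* 19 fd 07 */
	return 0;
}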
diff --git a/tools/kvm/util/KVMTOOLS-VERSION-GEN b/tools/kvm/util/KVMTOOLS-VERSION-GEN
new file mode 100755
index 0000000..1af9d6c
--- /dev/null
+++ b/tools/kvm/util/KVMTOOLS-VERSION-GEN
@@ -0,0 +1,40 @@
+#!/bin/sh
+
+if [ $# -eq 1 ]  ; then
+	OUTPUT=$1
+fi
+
+GVF=${OUTPUT}KVMTOOLS-VERSION-FILE
+
+LF='
+'
+
+# First check if there is a .git directory to get the version from
+# 'git describe'; otherwise fall back to the kernel Makefile version.
+if test -d ../../.git -o -f ../../.git &&
+	VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
+	case "$VN" in
+	*$LF*) (exit 1) ;;
+	v[0-9]*)
+		git update-index -q --refresh
+		test -z "$(git diff-index --name-only HEAD --)" ||
+		VN="$VN-dirty" ;;
+	esac
+then
+	VN=$(echo "$VN" | sed -e 's/-/./g');
+else
+	VN=$(MAKEFLAGS= make -sC ../.. kernelversion)
+fi
+
+VN=$(expr "$VN" : v*'\(.*\)')
+
+if test -r $GVF
+then
+	VC=$(sed -e 's/^KVMTOOLS_VERSION = //' <$GVF)
+else
+	VC=unset
+fi
+test "$VN" = "$VC" || {
+	echo >&2 "KVMTOOLS_VERSION = $VN"
+	echo "KVMTOOLS_VERSION = $VN" >$GVF
+}
diff --git a/tools/kvm/util/generate-cmdlist.sh b/tools/kvm/util/generate-cmdlist.sh
new file mode 100755
index 0000000..c8be0bd
--- /dev/null
+++ b/tools/kvm/util/generate-cmdlist.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+echo "/* Automatically generated by $0 */
+struct cmdname_help
+{
+    char name[16];
+    char help[80];
+};
+
+static struct cmdname_help common_cmds[] = {"
+
+sed -n 's/^lkvm-\([^ \t]*\).*common/\1/p' command-list.txt |
+while read cmd
+do
+	# TODO: the following sed command should be fixed
+	sed -n '/^NAME/,/^lkvm-'"$cmd"'/ {
+		/NAME/d
+		/--/d
+		s/.*kvm-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
+		p
+	}' "Documentation/kvm-$cmd.txt"
+done
+echo "};"
diff --git a/tools/kvm/util/init.c b/tools/kvm/util/init.c
new file mode 100644
index 0000000..d4ce144
--- /dev/null
+++ b/tools/kvm/util/init.c
@@ -0,0 +1,67 @@
+#include <linux/list.h>
+#include <linux/kernel.h>
+
+#include "kvm/kvm.h"
+#include "kvm/util-init.h"
+
+#define PRIORITY_LISTS 10
+
+static struct hlist_head init_lists[PRIORITY_LISTS];
+static struct hlist_head exit_lists[PRIORITY_LISTS];
+
+int init_list_add(struct init_item *t, int (*init)(struct kvm *),
+			int priority, const char *name)
+{
+	t->init = init;
+	t->fn_name = name;
+	hlist_add_head(&t->n, &init_lists[priority]);
+
+	return 0;
+}
+
+int exit_list_add(struct init_item *t, int (*init)(struct kvm *),
+			int priority, const char *name)
+{
+	t->init = init;
+	t->fn_name = name;
+	hlist_add_head(&t->n, &exit_lists[priority]);
+
+	return 0;
+}
+
+int init_list__init(struct kvm *kvm)
+{
+	unsigned int i;
+	int r = 0;
+	struct init_item *t;
+
+	for (i = 0; i < ARRAY_SIZE(init_lists); i++)
+		hlist_for_each_entry(t, &init_lists[i], n) {
+			r = t->init(kvm);
+			if (r < 0) {
+				pr_warning("Failed init: %s\n", t->fn_name);
+				goto fail;
+			}
+		}
+
+fail:
+	return r;
+}
+
+int init_list__exit(struct kvm *kvm)
+{
+	int i;
+	int r = 0;
+	struct init_item *t;
+
+	for (i = ARRAY_SIZE(exit_lists) - 1; i >= 0; i--)
+		hlist_for_each_entry(t, &exit_lists[i], n) {
+			r = t->init(kvm);
+			if (r < 0) {
+				pr_warning("%s failed.\n", t->fn_name);
+				goto fail;
+			}
+		}
+fail:
+	return r;
+}
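The two lists implement a simple priority-ordered init/exit scheme: init_list__init() walks priorities 0..9 in ascending order, and init_list__exit() walks them descending. Registration is normally wrapped by macros such as dev_init()/late_init() used elsewhere in this series; a hypothetical manual registration, assuming only the declarations above, could look like:

#include "kvm/kvm.h"
#include "kvm/util-init.h"

static int hello__init(struct kvm *kvm)
{
	pr_info("hello from the init list");
	return 0;
}

static struct init_item hello_item;

static void hello__register(void)
{
	/* priority must be < PRIORITY_LISTS (10); lower runs earlier */
	init_list_add(&hello_item, hello__init, 5, "hello__init");
}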
diff --git a/tools/kvm/util/iovec.c b/tools/kvm/util/iovec.c
new file mode 100644
index 0000000..0c8b9cf
--- /dev/null
+++ b/tools/kvm/util/iovec.c
@@ -0,0 +1,126 @@
+/*
+ *	iovec manipulation routines.
+ *
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	Fixes:
+ *		Andrew Lunn	:	Errors in iovec copying.
+ *		Pedro Roque	:	Added memcpy_fromiovecend and
+ *					csum_..._fromiovecend.
+ *		Andi Kleen	:	fixed error handling for 2.1
+ *		Alexey Kuznetsov:	2.1 optimisations
+ *		Andi Kleen	:	Fix csum*fromiovecend for IPv6.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <sys/uio.h>
+#include <kvm/iovec.h>
+#include <string.h>
+
+/*
+ *	Copy a buffer to an iovec. Always returns 0 in this userspace
+ *	port; the kernel original returns -EFAULT on a faulting access.
+ *
+ *	Note: this modifies the original iovec.
+ */
+
+int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
+{
+	while (len > 0) {
+		if (iov->iov_len) {
+			int copy = min_t(unsigned int, iov->iov_len, len);
+			memcpy(iov->iov_base, kdata, copy);
+			kdata += copy;
+			len -= copy;
+			iov->iov_len -= copy;
+			iov->iov_base += copy;
+		}
+		iov++;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(memcpy_toiovec);
+
+/*
+ *	Copy a buffer to an iovec, starting at byte 'offset' into the
+ *	iovec. Always returns 0 in this userspace port.
+ */
+
+int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
+		      size_t offset, int len)
+{
+	int copy;
+	for (; len > 0; ++iov) {
+		/* Skip over the finished iovecs */
+		if (unlikely(offset >= iov->iov_len)) {
+			offset -= iov->iov_len;
+			continue;
+		}
+		copy = min_t(unsigned int, iov->iov_len - offset, len);
+		memcpy(iov->iov_base + offset, kdata, copy);
+		offset = 0;
+		kdata += copy;
+		len -= copy;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(memcpy_toiovecend);
+
+/*
+ *	Copy an iovec to a buffer. Always returns 0 in this userspace
+ *	port.
+ *
+ *	Note: this modifies the original iovec.
+ */
+
+int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
+{
+	while (len > 0) {
+		if (iov->iov_len) {
+			int copy = min_t(unsigned int, len, iov->iov_len);
+			memcpy(kdata, iov->iov_base, copy);
+			len -= copy;
+			kdata += copy;
+			iov->iov_base += copy;
+			iov->iov_len -= copy;
+		}
+		iov++;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(memcpy_fromiovec);
+
+/*
+ *	Copy an iovec to a buffer, starting at byte 'offset' into the
+ *	iovec. Always returns 0 in this userspace port.
+ */
+
+int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
+			size_t offset, int len)
+{
+	/* Skip over the finished iovecs */
+	while (offset >= iov->iov_len) {
+		offset -= iov->iov_len;
+		iov++;
+	}
+
+	while (len > 0) {
+		char *base = iov->iov_base + offset;
+		int copy = min_t(unsigned int, len, iov->iov_len - offset);
+
+		offset = 0;
+		memcpy(kdata, base, copy);
+		len -= copy;
+		kdata += copy;
+		iov++;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(memcpy_fromiovecend);
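As a usage sketch, the *end variants take an explicit byte offset into the scattered buffer instead of consuming the iovec destructively (assuming <kvm/iovec.h> declares the prototypes above):

#include <kvm/iovec.h>
#include <stdio.h>
#include <sys/uio.h>

int main(void)
{
	char a[] = "hello ", b[] = "world";
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = 6 },
		{ .iov_base = b, .iov_len = 5 },
	};
	char out[12] = { 0 };

	/* Copy 8 bytes starting 3 bytes in: skips "hel", yields "lo world" */
	memcpy_fromiovecend((unsigned char *)out, iov, 3, 8);
	puts(out);
	return 0;
}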
diff --git a/tools/kvm/util/kvm-ifup-vbr0 b/tools/kvm/util/kvm-ifup-vbr0
new file mode 100755
index 0000000..a91c37f
--- /dev/null
+++ b/tools/kvm/util/kvm-ifup-vbr0
@@ -0,0 +1,6 @@
+#!/bin/sh
+switch=vbr0
+/sbin/ifconfig $1 0.0.0.0 up
+/usr/sbin/brctl addif ${switch} $1
+/usr/sbin/brctl setfd ${switch} 0
+/usr/sbin/brctl stp ${switch} off
diff --git a/tools/kvm/util/parse-options.c b/tools/kvm/util/parse-options.c
new file mode 100644
index 0000000..9a1bbee
--- /dev/null
+++ b/tools/kvm/util/parse-options.c
@@ -0,0 +1,577 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <stdbool.h>
+
+/* user defined includes */
+#include <linux/types.h>
+#include <kvm/util.h>
+#include <kvm/parse-options.h>
+#include <kvm/strbuf.h>
+
+#define OPT_SHORT 1
+#define OPT_UNSET 2
+
+static int opterror(const struct option *opt, const char *reason, int flags)
+{
+	if (flags & OPT_SHORT)
+		return pr_err("switch `%c' %s", opt->short_name, reason);
+	if (flags & OPT_UNSET)
+		return pr_err("option `no-%s' %s", opt->long_name, reason);
+	return pr_err("option `%s' %s", opt->long_name, reason);
+}
+
+static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
+		int flags, const char **arg)
+{
+	if (p->opt) {
+		*arg = p->opt;
+		p->opt = NULL;
+	} else if ((opt->flags & PARSE_OPT_LASTARG_DEFAULT) && (p->argc == 1 ||
+				**(p->argv + 1) == '-')) {
+		*arg = (const char *)opt->defval;
+	} else if (p->argc > 1) {
+		p->argc--;
+		*arg = *++p->argv;
+	} else
+		return opterror(opt, "requires a value", flags);
+	return 0;
+}
+
+static int readnum(const struct option *opt, int flags,
+		   const char *str, char **end)
+{
+	switch (opt->type) {
+	case OPTION_INTEGER:
+		*(int *)opt->value = strtol(str, end, 0);
+		break;
+	case OPTION_UINTEGER:
+		*(unsigned int *)opt->value = strtol(str, end, 0);
+		break;
+	case OPTION_LONG:
+		*(long *)opt->value = strtol(str, end, 0);
+		break;
+	case OPTION_U64:
+		*(u64 *)opt->value = strtoull(str, end, 0);
+		break;
+	default:
+		return opterror(opt, "invalid numeric conversion", flags);
+	}
+
+	return 0;
+}
+
+static int get_value(struct parse_opt_ctx_t *p,
+		const struct option *opt, int flags)
+{
+	const char *s, *arg = NULL;
+	const int unset = flags & OPT_UNSET;
+
+	if (unset && p->opt)
+		return opterror(opt, "takes no value", flags);
+	if (unset && (opt->flags & PARSE_OPT_NONEG))
+		return opterror(opt, "isn't available", flags);
+
+	if (!(flags & OPT_SHORT) && p->opt) {
+		switch (opt->type) {
+		case OPTION_CALLBACK:
+			if (!(opt->flags & PARSE_OPT_NOARG))
+				break;
+		/* FALLTHROUGH */
+		case OPTION_BOOLEAN:
+		case OPTION_INCR:
+		case OPTION_BIT:
+		case OPTION_SET_UINT:
+		case OPTION_SET_PTR:
+			return opterror(opt, "takes no value", flags);
+		case OPTION_END:
+		case OPTION_ARGUMENT:
+		case OPTION_GROUP:
+		case OPTION_STRING:
+		case OPTION_INTEGER:
+		case OPTION_UINTEGER:
+		case OPTION_LONG:
+		case OPTION_U64:
+		default:
+			break;
+		}
+	}
+
+	switch (opt->type) {
+	case OPTION_BIT:
+		if (unset)
+			*(int *)opt->value &= ~opt->defval;
+		else
+			*(int *)opt->value |= opt->defval;
+		return 0;
+
+	case OPTION_BOOLEAN:
+		*(bool *)opt->value = unset ? false : true;
+		return 0;
+
+	case OPTION_INCR:
+		*(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
+		return 0;
+
+	case OPTION_SET_UINT:
+		*(unsigned int *)opt->value = unset ? 0 : opt->defval;
+		return 0;
+
+	case OPTION_SET_PTR:
+		*(void **)opt->value = unset ? NULL : (void *)opt->defval;
+		return 0;
+
+	case OPTION_STRING:
+		if (unset)
+			*(const char **)opt->value = NULL;
+		else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+			*(const char **)opt->value = (const char *)opt->defval;
+		else
+			return get_arg(p, opt, flags,
+					(const char **)opt->value);
+		return 0;
+
+	case OPTION_CALLBACK:
+		if (unset)
+			return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
+		if (opt->flags & PARSE_OPT_NOARG)
+			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
+
+	case OPTION_INTEGER:
+		if (unset) {
+			*(int *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(int *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return readnum(opt, flags, arg, (char **)&s);
+
+	case OPTION_UINTEGER:
+		if (unset) {
+			*(unsigned int *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(unsigned int *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return readnum(opt, flags, arg, (char **)&s);
+
+	case OPTION_LONG:
+		if (unset) {
+			*(long *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(long *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return readnum(opt, flags, arg, (char **)&s);
+
+	case OPTION_U64:
+		if (unset) {
+			*(u64 *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(u64 *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return readnum(opt, flags, arg, (char **)&s);
+
+	case OPTION_END:
+	case OPTION_ARGUMENT:
+	case OPTION_GROUP:
+	default:
+		die("should not happen, someone must be hit on the forehead");
+	}
+}
+
+#define USAGE_OPTS_WIDTH 24
+#define USAGE_GAP         2
+
+static int usage_with_options_internal(const char * const *usagestr,
+		const struct option *opts, int full)
+{
+	if (!usagestr)
+		return PARSE_OPT_HELP;
+
+	fprintf(stderr, "\n usage: %s\n", *usagestr++);
+	while (*usagestr && **usagestr)
+		fprintf(stderr, "    or: %s\n", *usagestr++);
+	while (*usagestr) {
+		fprintf(stderr, "%s%s\n",
+				**usagestr ? "    " : "",
+				*usagestr);
+		usagestr++;
+	}
+
+	if (opts->type != OPTION_GROUP)
+		fputc('\n', stderr);
+
+	for (; opts->type != OPTION_END; opts++) {
+		size_t pos;
+		int pad;
+
+		if (opts->type == OPTION_GROUP) {
+			fputc('\n', stderr);
+			if (*opts->help)
+				fprintf(stderr, "%s\n", opts->help);
+			continue;
+		}
+		if (!full && (opts->flags & PARSE_OPT_HIDDEN))
+			continue;
+
+		pos = fprintf(stderr, "    ");
+		if (opts->short_name)
+			pos += fprintf(stderr, "-%c", opts->short_name);
+		else
+			pos += fprintf(stderr, "    ");
+
+		if (opts->long_name && opts->short_name)
+			pos += fprintf(stderr, ", ");
+		if (opts->long_name)
+			pos += fprintf(stderr, "--%s", opts->long_name);
+
+		switch (opts->type) {
+		case OPTION_ARGUMENT:
+			break;
+		case OPTION_LONG:
+		case OPTION_U64:
+		case OPTION_INTEGER:
+		case OPTION_UINTEGER:
+			if (opts->flags & PARSE_OPT_OPTARG) {
+				if (opts->long_name)
+					pos += fprintf(stderr, "[=<n>]");
+				else
+					pos += fprintf(stderr, "[<n>]");
+			} else {
+				pos += fprintf(stderr, " <n>");
+			}
+			break;
+		case OPTION_CALLBACK:
+			if (opts->flags & PARSE_OPT_NOARG)
+				break;
+		/* FALLTHROUGH */
+		case OPTION_STRING:
+			if (opts->argh) {
+				if (opts->flags & PARSE_OPT_OPTARG) {
+					if (opts->long_name)
+						pos += fprintf(stderr, "[=<%s>]", opts->argh);
+					else
+						pos += fprintf(stderr, "[<%s>]", opts->argh);
+				} else {
+					pos += fprintf(stderr, " <%s>", opts->argh);
+				}
+			} else {
+				if (opts->flags & PARSE_OPT_OPTARG) {
+					if (opts->long_name)
+						pos += fprintf(stderr, "[=...]");
+					else
+						pos += fprintf(stderr, "[...]");
+				} else {
+					pos += fprintf(stderr, " ...");
+				}
+			}
+			break;
+		default: /* OPTION_{BIT,BOOLEAN,SET_UINT,SET_PTR} */
+		case OPTION_END:
+		case OPTION_GROUP:
+		case OPTION_BIT:
+		case OPTION_BOOLEAN:
+		case OPTION_INCR:
+		case OPTION_SET_UINT:
+		case OPTION_SET_PTR:
+			break;
+		}
+		if (pos <= USAGE_OPTS_WIDTH)
+			pad = USAGE_OPTS_WIDTH - pos;
+		else {
+			fputc('\n', stderr);
+			pad = USAGE_OPTS_WIDTH;
+		}
+		fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
+	}
+	fputc('\n', stderr);
+
+	return PARSE_OPT_HELP;
+}
+
+void usage_with_options(const char * const *usagestr,
+		const struct option *opts)
+{
+	usage_with_options_internal(usagestr, opts, 0);
+	exit(129);
+}
+
+static void check_typos(const char *arg, const struct option *options)
+{
+	if (strlen(arg) < 3)
+		return;
+
+	if (!prefixcmp(arg, "no-")) {
+		pr_err("did you mean `--%s` (with two dashes ?)", arg);
+		exit(129);
+	}
+
+	for (; options->type != OPTION_END; options++) {
+		if (!options->long_name)
+			continue;
+		if (!prefixcmp(options->long_name, arg)) {
+			pr_err("did you mean `--%s` (with two dashes ?)", arg);
+			exit(129);
+		}
+	}
+}
+
+static int parse_options_usage(const char * const *usagestr,
+		const struct option *opts)
+{
+	return usage_with_options_internal(usagestr, opts, 0);
+}
+
+static int parse_short_opt(struct parse_opt_ctx_t *p,
+        const struct option *options)
+{
+	for (; options->type != OPTION_END; options++) {
+		if (options->short_name == *p->opt) {
+			p->opt = p->opt[1] ? p->opt + 1 : NULL;
+			return get_value(p, options, OPT_SHORT);
+		}
+	}
+	return -2;
+}
+
+static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
+		const struct option *options)
+{
+	const char *arg_end = strchr(arg, '=');
+	const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
+	int abbrev_flags = 0, ambiguous_flags = 0;
+
+	if (!arg_end)
+		arg_end = arg + strlen(arg);
+
+	for (; options->type != OPTION_END; options++) {
+		const char *rest;
+		int flags = 0;
+
+		if (!options->long_name)
+			continue;
+
+		rest = skip_prefix(arg, options->long_name);
+		if (options->type == OPTION_ARGUMENT) {
+			if (!rest)
+				continue;
+			if (*rest == '=')
+				return opterror(options, "takes no value",
+						flags);
+			if (*rest)
+				continue;
+			p->out[p->cpidx++] = arg - 2;
+			return 0;
+		}
+		if (!rest) {
+			/* abbreviated? */
+			if (!strncmp(options->long_name, arg, arg_end - arg)) {
+is_abbreviated:
+				if (abbrev_option) {
+					/*
+					 * If this is abbreviated, it is
+					 * ambiguous. So when there is no
+					 * exact match later, we need to
+					 * error out.
+					 */
+					ambiguous_option = abbrev_option;
+					ambiguous_flags = abbrev_flags;
+				}
+				if (!(flags & OPT_UNSET) && *arg_end)
+					p->opt = arg_end + 1;
+				abbrev_option = options;
+				abbrev_flags = flags;
+				continue;
+			}
+			/* negated and abbreviated very much? */
+			if (!prefixcmp("no-", arg)) {
+				flags |= OPT_UNSET;
+				goto is_abbreviated;
+			}
+			/* negated? */
+			if (strncmp(arg, "no-", 3))
+				continue;
+			flags |= OPT_UNSET;
+			rest = skip_prefix(arg + 3, options->long_name);
+			/* abbreviated and negated? */
+			if (!rest && !prefixcmp(options->long_name, arg + 3))
+				goto is_abbreviated;
+			if (!rest)
+				continue;
+		}
+		if (*rest) {
+			if (*rest != '=')
+				continue;
+			p->opt = rest + 1;
+		}
+		return get_value(p, options, flags);
+	}
+
+	if (ambiguous_option)
+		return pr_err("Ambiguous option: %s "
+				"(could be --%s%s or --%s%s)",
+				arg,
+				(ambiguous_flags & OPT_UNSET) ?  "no-" : "",
+				ambiguous_option->long_name,
+				(abbrev_flags & OPT_UNSET) ?  "no-" : "",
+				abbrev_option->long_name);
+	if (abbrev_option)
+		return get_value(p, abbrev_option, abbrev_flags);
+	return -2;
+}
+
+
+static void parse_options_start(struct parse_opt_ctx_t *ctx, int argc,
+		const char **argv, int flags)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->argc = argc;
+	ctx->argv = argv;
+	ctx->out  = argv;
+	ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
+	ctx->flags = flags;
+	if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
+			(flags & PARSE_OPT_STOP_AT_NON_OPTION))
+		die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
+}
+
+static int parse_options_end(struct parse_opt_ctx_t *ctx)
+{
+	memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
+	ctx->out[ctx->cpidx + ctx->argc] = NULL;
+	return ctx->cpidx + ctx->argc;
+}
+
+
+static int parse_options_step(struct parse_opt_ctx_t *ctx,
+		const struct option *options, const char * const usagestr[])
+{
+	int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
+
+	/* we must reset ->opt; an unknown short option leaves it dangling */
+	ctx->opt = NULL;
+
+	for (; ctx->argc; ctx->argc--, ctx->argv++) {
+		const char *arg = ctx->argv[0];
+
+		if (*arg != '-' || !arg[1]) {
+			if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
+				break;
+			ctx->out[ctx->cpidx++] = ctx->argv[0];
+			continue;
+		}
+
+		if (arg[1] != '-') {
+			ctx->opt = arg + 1;
+			if (internal_help && *ctx->opt == 'h')
+				return parse_options_usage(usagestr, options);
+			switch (parse_short_opt(ctx, options)) {
+			case -1:
+				return parse_options_usage(usagestr, options);
+			case -2:
+				goto unknown;
+			default:
+				break;
+			}
+			if (ctx->opt)
+				check_typos(arg + 1, options);
+			while (ctx->opt) {
+				if (internal_help && *ctx->opt == 'h')
+					return parse_options_usage(usagestr,
+							options);
+				switch (parse_short_opt(ctx, options)) {
+				case -1:
+					return parse_options_usage(usagestr,
+							options);
+				case -2:
+					/* fake a short option thing to hide
+					 * the fact that we may have
+					 * started to parse aggregated stuff
+					 *
+					 * This is leaky, too bad.
+					 */
+					ctx->argv[0] = strdup(ctx->opt - 1);
+					*(char *)ctx->argv[0] = '-';
+					goto unknown;
+				default:
+					break;
+				}
+			}
+			continue;
+		}
+
+		if (!arg[2]) { /* "--" */
+			if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
+				ctx->argc--;
+				ctx->argv++;
+			}
+			break;
+		}
+
+		if (internal_help && !strcmp(arg + 2, "help-all"))
+			return usage_with_options_internal(usagestr, options,
+					1);
+		if (internal_help && !strcmp(arg + 2, "help"))
+			return parse_options_usage(usagestr, options);
+		switch (parse_long_opt(ctx, arg + 2, options)) {
+		case -1:
+			return parse_options_usage(usagestr, options);
+		case -2:
+			goto unknown;
+		default:
+			break;
+		}
+		continue;
+unknown:
+		if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
+			return PARSE_OPT_UNKNOWN;
+		ctx->out[ctx->cpidx++] = ctx->argv[0];
+		ctx->opt = NULL;
+	}
+	return PARSE_OPT_DONE;
+}
+
+int parse_options(int argc, const char **argv, const struct option *options,
+		const char * const usagestr[], int flags)
+{
+	struct parse_opt_ctx_t ctx;
+
+	parse_options_start(&ctx, argc, argv, flags);
+	switch (parse_options_step(&ctx, options, usagestr)) {
+	case PARSE_OPT_HELP:
+		exit(129);
+	case PARSE_OPT_DONE:
+		break;
+	default: /* PARSE_OPT_UNKNOWN */
+		if (ctx.argv[0][1] == '-') {
+			pr_err("unknown option `%s'", ctx.argv[0] + 2);
+		} else {
+			pr_err("unknown switch `%c'", *ctx.opt);
+		}
+		usage_with_options(usagestr, options);
+	}
+
+	return parse_options_end(&ctx);
+}
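Using the parser means building an OPTION_END-terminated table and handing it to parse_options(), which returns the number of leftover non-option arguments compacted to the front of argv. A sketch with designated initializers; the accompanying parse-options.h presumably also offers OPT_*() helper macros for this:

#include <kvm/parse-options.h>
#include <stdbool.h>

static bool verbose;
static const char *name;
static int count;

static const char * const demo_usage[] = {
	"demo [options]",
	NULL
};

static const struct option demo_options[] = {
	{ .type = OPTION_BOOLEAN, .short_name = 'v', .long_name = "verbose",
	  .value = &verbose, .help = "be chatty" },
	{ .type = OPTION_STRING, .short_name = 'n', .long_name = "name",
	  .value = &name, .argh = "str", .help = "a name" },
	{ .type = OPTION_INTEGER, .short_name = 'c', .long_name = "count",
	  .value = &count, .argh = "n", .help = "how many" },
	{ .type = OPTION_END },
};

int main(int argc, const char **argv)
{
	argc = parse_options(argc, argv, demo_options, demo_usage, 0);
	return 0;	/* argv[0..argc-1] now holds the non-option args */
}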
diff --git a/tools/kvm/util/rbtree-interval.c b/tools/kvm/util/rbtree-interval.c
new file mode 100644
index 0000000..3630a6d
--- /dev/null
+++ b/tools/kvm/util/rbtree-interval.c
@@ -0,0 +1,58 @@
+#include <kvm/rbtree-interval.h>
+#include <stddef.h>
+#include <errno.h>
+
+struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		struct rb_int_node *cur = rb_int(node);
+
+		if (point < cur->low)
+			node = node->rb_left;
+		else if (cur->high <= point)
+			node = node->rb_right;
+		else
+			return cur;
+	}
+
+	return NULL;
+}
+
+struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high)
+{
+	struct rb_int_node *range;
+
+	range = rb_int_search_single(root, low);
+	if (range == NULL)
+		return NULL;
+
+	/* We simply verify that 'high' is smaller than the end of the range where 'low' is located */
+	if (range->high < high)
+		return NULL;
+
+	return range;
+}
+
+int rb_int_insert(struct rb_root *root, struct rb_int_node *i_node)
+{
+	struct rb_node **node = &root->rb_node, *parent = NULL;
+
+	while (*node) {
+		struct rb_int_node *cur = rb_int(*node);
+
+		parent = *node;
+		if (i_node->high <= cur->low)
+			node = &cur->node.rb_left;
+		else if (cur->high <= i_node->low)
+			node = &cur->node.rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&i_node->node, parent, node);
+	rb_insert_color(&i_node->node, root);
+
+	return 0;
+}
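The tree stores half-open [low, high) ranges and rejects overlapping inserts with -EEXIST, which suits address-range lookups. A sketch, assuming kvm/rbtree-interval.h provides struct rb_int_node with the low/high fields used above and the kernel-style RB_ROOT initializer:

#include <kvm/rbtree-interval.h>
#include <stdio.h>

int main(void)
{
	struct rb_root root = RB_ROOT;
	struct rb_int_node mmio = { .low = 0x100, .high = 0x200 };
	struct rb_int_node pio  = { .low = 0x300, .high = 0x400 };

	rb_int_insert(&root, &mmio);
	rb_int_insert(&root, &pio);

	/* Ranges are half-open: 0x1ff hits the first node, 0x200 misses */
	printf("%sfound\n", rb_int_search_single(&root, 0x1ff) ? "" : "not ");
	printf("%sfound\n", rb_int_search_single(&root, 0x200) ? "" : "not ");
	return 0;
}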
diff --git a/tools/kvm/util/read-write.c b/tools/kvm/util/read-write.c
new file mode 100644
index 0000000..44709df
--- /dev/null
+++ b/tools/kvm/util/read-write.c
@@ -0,0 +1,354 @@
+#include "kvm/read-write.h"
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+/* Same as read(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xread(int fd, void *buf, size_t count)
+{
+	ssize_t nr;
+
+restart:
+	nr = read(fd, buf, count);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+/* Same as write(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xwrite(int fd, const void *buf, size_t count)
+{
+	ssize_t nr;
+
+restart:
+	nr = write(fd, buf, count);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+ssize_t read_in_full(int fd, void *buf, size_t count)
+{
+	ssize_t total = 0;
+	char *p = buf;
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xread(fd, p, count);
+		if (nr <= 0) {
+			if (total > 0)
+				return total;
+
+			return -1;
+		}
+
+		count -= nr;
+		total += nr;
+		p += nr;
+	}
+
+	return total;
+}
+
+ssize_t write_in_full(int fd, const void *buf, size_t count)
+{
+	const char *p = buf;
+	ssize_t total = 0;
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xwrite(fd, p, count);
+		if (nr < 0)
+			return -1;
+		if (nr == 0) {
+			errno = ENOSPC;
+			return -1;
+		}
+		count -= nr;
+		total += nr;
+		p += nr;
+	}
+
+	return total;
+}
+
+/* Same as pread(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpread(int fd, void *buf, size_t count, off_t offset)
+{
+	ssize_t nr;
+
+restart:
+	nr = pread(fd, buf, count, offset);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+/* Same as pwrite(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+	ssize_t nr;
+
+restart:
+	nr = pwrite(fd, buf, count, offset);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset)
+{
+	ssize_t total = 0;
+	char *p = buf;
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xpread(fd, p, count, offset);
+		if (nr <= 0) {
+			if (total > 0)
+				return total;
+
+			return -1;
+		}
+
+		count -= nr;
+		total += nr;
+		p += nr;
+		offset += nr;
+	}
+
+	return total;
+}
+
+ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset)
+{
+	const char *p = buf;
+	ssize_t total = 0;
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xpwrite(fd, p, count, offset);
+		if (nr < 0)
+			return -1;
+		if (nr == 0) {
+			errno = ENOSPC;
+			return -1;
+		}
+		count -= nr;
+		total += nr;
+		p += nr;
+		offset += nr;
+	}
+
+	return total;
+}
+
+/* Same as readv(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt)
+{
+	ssize_t nr;
+
+restart:
+	nr = readv(fd, iov, iovcnt);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+/* Same as writev(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt)
+{
+	ssize_t nr;
+
+restart:
+	nr = writev(fd, iov, iovcnt);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+static inline ssize_t get_iov_size(const struct iovec *iov, int iovcnt)
+{
+	size_t size = 0;
+	while (iovcnt--)
+		size += (iov++)->iov_len;
+
+	return size;
+}
+
+static inline void shift_iovec(const struct iovec **iov, int *iovcnt,
+				size_t nr, ssize_t *total, size_t *count, off_t *offset)
+{
+	while (nr >= (*iov)->iov_len) {
+		nr -= (*iov)->iov_len;
+		*total += (*iov)->iov_len;
+		*count -= (*iov)->iov_len;
+		if (offset)
+			*offset += (*iov)->iov_len;
+		(*iovcnt)--;
+		(*iov)++;
+	}
+}
+
+ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt)
+{
+	ssize_t total = 0;
+	size_t count = get_iov_size(iov, iovcnt);
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xreadv(fd, iov, iovcnt);
+		if (nr <= 0) {
+			if (total > 0)
+				return total;
+
+			return -1;
+		}
+
+		shift_iovec(&iov, &iovcnt, nr, &total, &count, NULL);
+	}
+
+	return total;
+}
+
+ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt)
+{
+	ssize_t total = 0;
+	size_t count = get_iov_size(iov, iovcnt);
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xwritev(fd, iov, iovcnt);
+		if (nr < 0)
+			return -1;
+		if (nr == 0) {
+			errno = ENOSPC;
+			return -1;
+		}
+
+		shift_iovec(&iov, &iovcnt, nr, &total, &count, NULL);
+	}
+
+	return total;
+}
+
+/* Same as preadv(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+	ssize_t nr;
+
+restart:
+	nr = preadv(fd, iov, iovcnt, offset);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+/* Same as pwritev(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+	ssize_t nr;
+
+restart:
+	nr = pwritev(fd, iov, iovcnt, offset);
+	if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+		goto restart;
+
+	return nr;
+}
+
+ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+	ssize_t total = 0;
+	size_t count = get_iov_size(iov, iovcnt);
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xpreadv(fd, iov, iovcnt, offset);
+		if (nr <= 0) {
+			if (total > 0)
+				return total;
+
+			return -1;
+		}
+
+		shift_iovec(&iov, &iovcnt, nr, &total, &count, &offset);
+	}
+
+	return total;
+}
+
+ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+	ssize_t total = 0;
+	size_t count = get_iov_size(iov, iovcnt);
+
+	while (count > 0) {
+		ssize_t nr;
+
+		nr = xpwritev(fd, iov, iovcnt, offset);
+		if (nr < 0)
+			return -1;
+		if (nr == 0) {
+			errno = ENOSPC;
+			return -1;
+		}
+
+		shift_iovec(&iov, &iovcnt, nr, &total, &count, &offset);
+	}
+
+	return total;
+}
+
+#ifdef CONFIG_HAS_AIO
+int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt,
+		off_t offset, int ev, void *param)
+{
+	struct iocb *ios[1] = { iocb };
+	int ret;
+
+	io_prep_pwritev(iocb, fd, iov, iovcnt, offset);
+	io_set_eventfd(iocb, ev);
+	iocb->data = param;
+
+restart:
+	ret = io_submit(ctx, 1, ios);
+	if (ret == -EAGAIN)
+		goto restart;
+	return ret;
+}
+
+int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt,
+		off_t offset, int ev, void *param)
+{
+	struct iocb *ios[1] = { iocb };
+	int ret;
+
+	io_prep_preadv(iocb, fd, iov, iovcnt, offset);
+	io_set_eventfd(iocb, ev);
+	iocb->data = param;
+
+restart:
+	ret = io_submit(ctx, 1, ios);
+	if (ret == -EAGAIN)
+		goto restart;
+	return ret;
+}
+#endif
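The three layers differ as follows: the x*() wrappers only retry on EINTR/EAGAIN, while the *_in_full() variants additionally loop over short transfers until the whole count is done (mapping a zero-byte write to ENOSPC). A usage sketch:

#include "kvm/read-write.h"

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[512];
	ssize_t n;
	int fd = open("/etc/hostname", O_RDONLY);	/* illustrative path */

	if (fd < 0)
		return 1;

	/* keeps reading until 512 bytes, EOF, or a real error */
	n = read_in_full(fd, buf, sizeof(buf));
	printf("read %zd bytes\n", n);
	close(fd);
	return 0;
}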
diff --git a/tools/kvm/util/set_private_br.sh b/tools/kvm/util/set_private_br.sh
new file mode 100755
index 0000000..49867dd
--- /dev/null
+++ b/tools/kvm/util/set_private_br.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Author: Amos Kong <kongjianjun@gmail.com>
+# Date: Apr 14, 2011
+# Description: this script creates/deletes a private bridge and
+# launches a DHCP server on it via dnsmasq.
+#
+# @ ./set_private_br.sh $bridge_name $subnet_prefix
+# @ ./set_private_br.sh vbr0 192.168.33
+
+brname='vbr0'
+subnet='192.168.33'
+
+add_br()
+{
+    echo "add new private bridge: $brname"
+    /usr/sbin/brctl addbr $brname
+    echo 1 > /proc/sys/net/ipv6/conf/$brname/disable_ipv6
+    echo 1 > /proc/sys/net/ipv4/ip_forward
+    /usr/sbin/brctl stp $brname on
+    /usr/sbin/brctl setfd $brname 0
+    ifconfig $brname $subnet.1
+    ifconfig $brname up
+    # Add a forwarding rule so the guest can access the public network
+    iptables -t nat -A POSTROUTING -s $subnet.254/24 ! -d $subnet.254/24 -j MASQUERADE
+    /etc/init.d/dnsmasq stop
+    /etc/init.d/tftpd-hpa stop 2>/dev/null
+    dnsmasq --strict-order --bind-interfaces --listen-address $subnet.1 --dhcp-range $subnet.1,$subnet.254 $tftp_cmd
+}
+
+del_br()
+{
+    echo "cleanup bridge setup"
+    kill -9 `pgrep dnsmasq|tail -1`
+    ifconfig $brname down
+    /usr/sbin/brctl delbr $brname
+    iptables -t nat -D POSTROUTING -s $subnet.254/24 ! -d $subnet.254/24 -j MASQUERADE
+}
+
+
+if [ $# = 0 ]; then
+    del_br 2>/dev/null
+    exit
+fi
+# inside [ ], ">" is a redirection; use -ge for a numeric comparison
+if [ $# -ge 1 ]; then
+    brname="$1"
+fi
+if [ $# = 2 ]; then
+    subnet="$2"
+fi
+add_br
diff --git a/tools/kvm/util/strbuf.c b/tools/kvm/util/strbuf.c
new file mode 100644
index 0000000..99d6b0c
--- /dev/null
+++ b/tools/kvm/util/strbuf.c
@@ -0,0 +1,62 @@
+#include <string.h>	/* strlen()/memcpy(), in case kvm/util.h lacks it */
+
+/* user defined headers */
+#include <kvm/util.h>
+#include <kvm/strbuf.h>
+
+int prefixcmp(const char *str, const char *prefix)
+{
+	for (; ; str++, prefix++) {
+		if (!*prefix)
+			return 0;
+		else if (*str != *prefix)
+			return (unsigned char)*prefix - (unsigned char)*str;
+	}
+}
+
+/**
+ * strlcat - Append a length-limited, %NUL-terminated string to another
+ * @dest: The string to be appended to
+ * @src: The string to append to it
+ * @count: The size of the destination buffer.
+ */
+size_t strlcat(char *dest, const char *src, size_t count)
+{
+	size_t dsize = strlen(dest);
+	size_t len = strlen(src);
+	size_t res = dsize + len;
+
+	DIE_IF(dsize >= count);
+
+	dest += dsize;
+	count -= dsize;
+	if (len >= count)
+		len = count - 1;
+
+	memcpy(dest, src, len);
+	dest[len] = 0;
+
+	return res;
+}
+
+/**
+ * strlcpy - Copy a %NUL terminated string into a sized buffer
+ * @dest: Where to copy the string to
+ * @src: Where to copy the string from
+ * @size: size of destination buffer
+ *
+ * Compatible with *BSD: the result is always a valid
+ * NUL-terminated string that fits in the buffer (unless,
+ * of course, the buffer size is zero). It does not pad
+ * out the result like strncpy() does.
+ */
+size_t strlcpy(char *dest, const char *src, size_t size)
+{
+	size_t ret = strlen(src);
+
+	if (size) {
+		size_t len = (ret >= size) ? size - 1 : ret;
+		memcpy(dest, src, len);
+		dest[len] = '\0';
+	}
+	return ret;
+}
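Because strlcpy() returns the length of the source rather than the number of bytes stored, truncation is detectable from the return value alone. A sketch:

#include <kvm/strbuf.h>
#include <stdio.h>

int main(void)
{
	char buf[8];
	size_t n = strlcpy(buf, "this is a test", sizeof(buf));

	if (n >= sizeof(buf))	/* 14 >= 8: buf holds "this is" */
		printf("truncated to \"%s\", needed %zu bytes\n", buf, n + 1);
	return 0;
}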
diff --git a/tools/kvm/util/threadpool.c b/tools/kvm/util/threadpool.c
new file mode 100644
index 0000000..e64aa26
--- /dev/null
+++ b/tools/kvm/util/threadpool.c
@@ -0,0 +1,175 @@
+#include "kvm/threadpool.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdlib.h>	/* realloc() */
+#include <unistd.h>	/* sysconf() */
+
+static DEFINE_MUTEX(job_mutex);
+static DEFINE_MUTEX(thread_mutex);
+static pthread_cond_t job_cond = PTHREAD_COND_INITIALIZER;
+
+static LIST_HEAD(head);
+
+static pthread_t	*threads;
+static long		threadcount;
+static bool		running;
+
+static struct thread_pool__job *thread_pool__job_pop_locked(void)
+{
+	struct thread_pool__job *job;
+
+	if (list_empty(&head))
+		return NULL;
+
+	job = list_first_entry(&head, struct thread_pool__job, queue);
+	list_del(&job->queue);
+
+	return job;
+}
+
+static void thread_pool__job_push_locked(struct thread_pool__job *job)
+{
+	list_add_tail(&job->queue, &head);
+}
+
+static struct thread_pool__job *thread_pool__job_pop(void)
+{
+	struct thread_pool__job *job;
+
+	mutex_lock(&job_mutex);
+	job = thread_pool__job_pop_locked();
+	mutex_unlock(&job_mutex);
+	return job;
+}
+
+static void thread_pool__job_push(struct thread_pool__job *job)
+{
+	mutex_lock(&job_mutex);
+	thread_pool__job_push_locked(job);
+	mutex_unlock(&job_mutex);
+}
+
+static void thread_pool__handle_job(struct thread_pool__job *job)
+{
+	while (job) {
+		job->callback(job->kvm, job->data);
+
+		mutex_lock(&job->mutex);
+
+		if (--job->signalcount > 0)
+			/* If the job was signaled again while we were working */
+			thread_pool__job_push(job);
+
+		mutex_unlock(&job->mutex);
+
+		job = thread_pool__job_pop();
+	}
+}
+
+static void thread_pool__threadfunc_cleanup(void *param)
+{
+	mutex_unlock(&job_mutex);
+}
+
+static void *thread_pool__threadfunc(void *param)
+{
+	pthread_cleanup_push(thread_pool__threadfunc_cleanup, NULL);
+
+	kvm__set_thread_name("threadpool-worker");
+
+	while (running) {
+		struct thread_pool__job *curjob = NULL;
+
+		mutex_lock(&job_mutex);
+		while (running && (curjob = thread_pool__job_pop_locked()) == NULL)
+			pthread_cond_wait(&job_cond, &job_mutex.mutex);
+		mutex_unlock(&job_mutex);
+
+		if (running)
+			thread_pool__handle_job(curjob);
+	}
+
+	pthread_cleanup_pop(0);
+
+	return NULL;
+}
+
+static int thread_pool__addthread(void)
+{
+	int res;
+	void *newthreads;
+
+	mutex_lock(&thread_mutex);
+	newthreads = realloc(threads, (threadcount + 1) * sizeof(pthread_t));
+	if (newthreads == NULL) {
+		mutex_unlock(&thread_mutex);
+		return -1;
+	}
+
+	threads = newthreads;
+
+	res = pthread_create(threads + threadcount, NULL,
+			     thread_pool__threadfunc, NULL);
+
+	if (res == 0)
+		threadcount++;
+	mutex_unlock(&thread_mutex);
+
+	return res;
+}
+
+int thread_pool__init(struct kvm *kvm)
+{
+	unsigned long i;
+	unsigned int thread_count = sysconf(_SC_NPROCESSORS_ONLN);
+
+	running = true;
+
+	for (i = 0; i < thread_count; i++)
+		if (thread_pool__addthread() < 0)
+			return i;
+
+	return i;
+}
+late_init(thread_pool__init);
+
+int thread_pool__exit(struct kvm *kvm)
+{
+	int i;
+
+	running = false;
+
+	for (i = 0; i < threadcount; i++) {
+		mutex_lock(&job_mutex);
+		pthread_cond_signal(&job_cond);
+		mutex_unlock(&job_mutex);
+	}
+
+	for (i = 0; i < threadcount; i++) {
+		pthread_join(threads[i], NULL);
+	}
+
+	return 0;
+}
+late_exit(thread_pool__exit);
+
+void thread_pool__do_job(struct thread_pool__job *job)
+{
+	struct thread_pool__job *jobinfo = job;
+
+	if (jobinfo == NULL || jobinfo->callback == NULL)
+		return;
+
+	mutex_lock(&jobinfo->mutex);
+	if (jobinfo->signalcount++ == 0)
+		thread_pool__job_push(job);
+	mutex_unlock(&jobinfo->mutex);
+
+	mutex_lock(&job_mutex);
+	pthread_cond_signal(&job_cond);
+	mutex_unlock(&job_mutex);
+}
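A consumer embeds a struct thread_pool__job whose callback runs on a worker thread; thread_pool__do_job() may be called from any context, and signalcount coalesces signals that arrive while the callback is already running. A hypothetical sketch; the job fields are assumed to be set up by an initializer in threadpool.h:

#include "kvm/threadpool.h"
#include "kvm/kvm.h"

static void flush_cb(struct kvm *kvm, void *data)
{
	/* runs on a "threadpool-worker" thread */
}

/* assume threadpool.h offers an init helper that fills in callback,
 * kvm, data and the per-job mutex */
static struct thread_pool__job flush_job;

static void kick_flush(void)
{
	/* safe to call repeatedly; a signal during execution re-queues once */
	thread_pool__do_job(&flush_job);
}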
diff --git a/tools/kvm/util/util.c b/tools/kvm/util/util.c
new file mode 100644
index 0000000..1877105
--- /dev/null
+++ b/tools/kvm/util/util.c
@@ -0,0 +1,133 @@
+/*
+ * Taken from perf which in turn take it from GIT
+ */
+
+#include "kvm/util.h"
+
+#include <kvm/kvm.h>
+#include <linux/magic.h>	/* For HUGETLBFS_MAGIC */
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+
+static void report(const char *prefix, const char *err, va_list params)
+{
+	char msg[1024];
+	vsnprintf(msg, sizeof(msg), err, params);
+	fprintf(stderr, " %s%s\n", prefix, msg);
+}
+
+static NORETURN void die_builtin(const char *err, va_list params)
+{
+	report(" Fatal: ", err, params);
+	exit(128);
+}
+
+static void error_builtin(const char *err, va_list params)
+{
+	report(" Error: ", err, params);
+}
+
+static void warn_builtin(const char *warn, va_list params)
+{
+	report(" Warning: ", warn, params);
+}
+
+static void info_builtin(const char *info, va_list params)
+{
+	report(" Info: ", info, params);
+}
+
+void die(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	die_builtin(err, params);
+	va_end(params);
+}
+
+int pr_err(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	error_builtin(err, params);
+	va_end(params);
+	return -1;
+}
+
+void pr_warning(const char *warn, ...)
+{
+	va_list params;
+
+	va_start(params, warn);
+	warn_builtin(warn, params);
+	va_end(params);
+}
+
+void pr_info(const char *info, ...)
+{
+	va_list params;
+
+	va_start(params, info);
+	info_builtin(info, params);
+	va_end(params);
+}
+
+void die_perror(const char *s)
+{
+	perror(s);
+	exit(1);
+}
+
+void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size)
+{
+	char mpath[PATH_MAX];
+	int fd;
+	struct statfs sfs;
+	void *addr;
+	unsigned long blk_size;
+
+	if (statfs(htlbfs_path, &sfs) < 0)
+		die("Can't stat %s\n", htlbfs_path);
+
+	if ((unsigned int)sfs.f_type != HUGETLBFS_MAGIC)
+		die("%s is not hugetlbfs!\n", htlbfs_path);
+
+	blk_size = (unsigned long)sfs.f_bsize;
+	if (sfs.f_bsize == 0 || blk_size > size) {
+		die("Can't use hugetlbfs pagesize %ld for mem size %lld\n",
+			blk_size, (unsigned long long)size);
+	}
+
+	kvm->ram_pagesize = blk_size;
+
+	snprintf(mpath, PATH_MAX, "%s/kvmtoolXXXXXX", htlbfs_path);
+	fd = mkstemp(mpath);
+	if (fd < 0)
+		die("Can't open %s for hugetlbfs map\n", mpath);
+	unlink(mpath);
+	if (ftruncate(fd, size) < 0)
+		die("Can't ftruncate for mem mapping size %llu\n",
+			(unsigned long long)size);
+	addr = mmap(NULL, size, PROT_RW, MAP_PRIVATE, fd, 0);
+	close(fd);
+
+	return addr;
+}
+
+/* Wraps the choice between a hugetlbfs-backed map (if requested) and a plain anonymous mmap */
+void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size)
+{
+	if (hugetlbfs_path)
+		/*
+		 * We don't /need/ to map guest RAM from hugetlbfs, but we do so
+		 * if the user specifies a hugetlbfs path.
+		 */
+		return mmap_hugetlbfs(kvm, hugetlbfs_path, size);
+	else {
+		kvm->ram_pagesize = getpagesize();
+		return mmap(NULL, size, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
+	}
+}
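Callers only supply the optional hugetlbfs mount path; the wrapper picks the backing store and records the resulting page size in kvm->ram_pagesize. A sketch:

#include "kvm/kvm.h"
#include "kvm/util.h"

/* Reserve guest RAM: hugetlbfs-backed when the user passed a mount path
 * (e.g. "/dev/hugepages"), plain anonymous memory otherwise. */
static void *alloc_guest_ram(struct kvm *kvm, const char *hugetlbfs_path)
{
	u64 size = 256ULL << 20;	/* 256 MiB, illustrative */

	return mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, size);
}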
diff --git a/tools/kvm/virtio/9p-pdu.c b/tools/kvm/virtio/9p-pdu.c
new file mode 100644
index 0000000..cc2e871
--- /dev/null
+++ b/tools/kvm/virtio/9p-pdu.c
@@ -0,0 +1,290 @@
+#include "kvm/util.h"
+#include "kvm/virtio-9p.h"
+
+#include <endian.h>
+#include <stdint.h>
+
+#include <linux/compiler.h>
+#include <linux/uidgid.h>
+#include <net/9p/9p.h>
+
+static void virtio_p9_pdu_read(struct p9_pdu *pdu, void *data, size_t size)
+{
+	size_t len;
+	int i, copied = 0;
+	u16 iov_cnt = pdu->out_iov_cnt;
+	size_t offset = pdu->read_offset;
+	struct iovec *iov = pdu->out_iov;
+
+	for (i = 0; i < iov_cnt && size; i++) {
+		if (offset >= iov[i].iov_len) {
+			offset -= iov[i].iov_len;
+			continue;
+		} else {
+			len = MIN(iov[i].iov_len - offset, size);
+			memcpy(data, iov[i].iov_base + offset, len);
+			size -= len;
+			data += len;
+			offset = 0;
+			copied += len;
+		}
+	}
+	pdu->read_offset += copied;
+}
+
+static void virtio_p9_pdu_write(struct p9_pdu *pdu,
+				const void *data, size_t size)
+{
+	size_t len;
+	int i, copied = 0;
+	u16 iov_cnt = pdu->in_iov_cnt;
+	size_t offset = pdu->write_offset;
+	struct iovec *iov = pdu->in_iov;
+
+	for (i = 0; i < iov_cnt && size; i++) {
+		if (offset >= iov[i].iov_len) {
+			offset -= iov[i].iov_len;
+			continue;
+		} else {
+			len = MIN(iov[i].iov_len - offset, size);
+			memcpy(iov[i].iov_base + offset, data, len);
+			size -= len;
+			data += len;
+			offset = 0;
+			copied += len;
+		}
+	}
+	pdu->write_offset += copied;
+}
+
+static void virtio_p9_wstat_free(struct p9_wstat *stbuf)
+{
+	free(stbuf->name);
+	free(stbuf->uid);
+	free(stbuf->gid);
+	free(stbuf->muid);
+}
+
+static int virtio_p9_decode(struct p9_pdu *pdu, const char *fmt, va_list ap)
+{
+	int retval = 0;
+	const char *ptr;
+
+	for (ptr = fmt; *ptr; ptr++) {
+		switch (*ptr) {
+		case 'b':
+		{
+			int8_t *val = va_arg(ap, int8_t *);
+			virtio_p9_pdu_read(pdu, val, sizeof(*val));
+		}
+		break;
+		case 'w':
+		{
+			int16_t le_val;
+			int16_t *val = va_arg(ap, int16_t *);
+			virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+			*val = le16toh(le_val);
+		}
+		break;
+		case 'd':
+		{
+			int32_t le_val;
+			int32_t *val = va_arg(ap, int32_t *);
+			virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+			*val = le32toh(le_val);
+		}
+		break;
+		case 'q':
+		{
+			int64_t le_val;
+			int64_t *val = va_arg(ap, int64_t *);
+			virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+			*val = le64toh(le_val);
+		}
+		break;
+		case 's':
+		{
+			int16_t len;
+			char **str = va_arg(ap, char **);
+
+			virtio_p9_pdu_readf(pdu, "w", &len);
+			*str = malloc(len + 1);
+			if (*str == NULL) {
+				retval = ENOMEM;
+				break;
+			}
+			virtio_p9_pdu_read(pdu, *str, len);
+			(*str)[len] = 0;
+		}
+		break;
+		case 'Q':
+		{
+			struct p9_qid *qid = va_arg(ap, struct p9_qid *);
+			retval = virtio_p9_pdu_readf(pdu, "bdq",
+						     &qid->type, &qid->version,
+						     &qid->path);
+		}
+		break;
+		case 'S':
+		{
+			struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *);
+			memset(stbuf, 0, sizeof(struct p9_wstat));
+			stbuf->n_uid = KUIDT_INIT(-1);
+			stbuf->n_gid = KGIDT_INIT(-1);
+			stbuf->n_muid = KUIDT_INIT(-1);
+			retval = virtio_p9_pdu_readf(pdu, "wwdQdddqssss",
+						&stbuf->size, &stbuf->type,
+						&stbuf->dev, &stbuf->qid,
+						&stbuf->mode, &stbuf->atime,
+						&stbuf->mtime, &stbuf->length,
+						&stbuf->name, &stbuf->uid,
+						&stbuf->gid, &stbuf->muid);
+			if (retval)
+				virtio_p9_wstat_free(stbuf);
+		}
+		break;
+		case 'I':
+		{
+			struct p9_iattr_dotl *p9attr = va_arg(ap,
+						       struct p9_iattr_dotl *);
+
+			retval = virtio_p9_pdu_readf(pdu, "ddddqqqqq",
+						     &p9attr->valid,
+						     &p9attr->mode,
+						     &p9attr->uid,
+						     &p9attr->gid,
+						     &p9attr->size,
+						     &p9attr->atime_sec,
+						     &p9attr->atime_nsec,
+						     &p9attr->mtime_sec,
+						     &p9attr->mtime_nsec);
+		}
+		break;
+		default:
+			retval = EINVAL;
+			break;
+		}
+	}
+	return retval;
+}
+
+static int virtio_p9_pdu_encode(struct p9_pdu *pdu, const char *fmt, va_list ap)
+{
+	int retval = 0;
+	const char *ptr;
+
+	for (ptr = fmt; *ptr; ptr++) {
+		switch (*ptr) {
+		case 'b':
+		{
+			int8_t val = va_arg(ap, int);
+			virtio_p9_pdu_write(pdu, &val, sizeof(val));
+		}
+		break;
+		case 'w':
+		{
+			int16_t val = htole16(va_arg(ap, int));
+			virtio_p9_pdu_write(pdu, &val, sizeof(val));
+		}
+		break;
+		case 'd':
+		{
+			int32_t val = htole32(va_arg(ap, int32_t));
+			virtio_p9_pdu_write(pdu, &val, sizeof(val));
+		}
+		break;
+		case 'q':
+		{
+			int64_t val = htole64(va_arg(ap, int64_t));
+			virtio_p9_pdu_write(pdu, &val, sizeof(val));
+		}
+		break;
+		case 's':
+		{
+			uint16_t len = 0;
+			const char *s = va_arg(ap, char *);
+			if (s)
+				len = MIN(strlen(s), USHRT_MAX);
+			virtio_p9_pdu_writef(pdu, "w", len);
+			virtio_p9_pdu_write(pdu, s, len);
+		}
+		break;
+		case 'Q':
+		{
+			struct p9_qid *qid = va_arg(ap, struct p9_qid *);
+			retval = virtio_p9_pdu_writef(pdu, "bdq",
+						      qid->type, qid->version,
+						      qid->path);
+		}
+		break;
+		case 'S':
+		{
+			struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *);
+			retval = virtio_p9_pdu_writef(pdu, "wwdQdddqssss",
+						stbuf->size, stbuf->type,
+						stbuf->dev, &stbuf->qid,
+						stbuf->mode, stbuf->atime,
+						stbuf->mtime, stbuf->length,
+						stbuf->name, stbuf->uid,
+						stbuf->gid, stbuf->muid);
+		}
+		break;
+		case 'A':
+		{
+			struct p9_stat_dotl *stbuf = va_arg(ap,
+						      struct p9_stat_dotl *);
+			retval  = virtio_p9_pdu_writef(pdu,
+						       "qQdddqqqqqqqqqqqqqqq",
+						       stbuf->st_result_mask,
+						       &stbuf->qid,
+						       stbuf->st_mode,
+						       stbuf->st_uid,
+						       stbuf->st_gid,
+						       stbuf->st_nlink,
+						       stbuf->st_rdev,
+						       stbuf->st_size,
+						       stbuf->st_blksize,
+						       stbuf->st_blocks,
+						       stbuf->st_atime_sec,
+						       stbuf->st_atime_nsec,
+						       stbuf->st_mtime_sec,
+						       stbuf->st_mtime_nsec,
+						       stbuf->st_ctime_sec,
+						       stbuf->st_ctime_nsec,
+						       stbuf->st_btime_sec,
+						       stbuf->st_btime_nsec,
+						       stbuf->st_gen,
+						       stbuf->st_data_version);
+		}
+		break;
+		default:
+			retval = EINVAL;
+			break;
+		}
+	}
+	return retval;
+}
+
+int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...)
+{
+	int ret;
+	va_list ap;
+
+	va_start(ap, fmt);
+	ret = virtio_p9_decode(pdu, fmt, ap);
+	va_end(ap);
+
+	return ret;
+}
+
+int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...)
+{
+	int ret;
+	va_list ap;
+
+	va_start(ap, fmt);
+	ret = virtio_p9_pdu_encode(pdu, fmt, ap);
+	va_end(ap);
+
+	return ret;
+}
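The readf/writef pair implement a printf-style mini-language: b/w/d/q are 8/16/32/64-bit little-endian integers, s is a 16-bit-length-prefixed string (readf allocates it), Q a qid, and S a full wstat. A sketch of how the 9p server code below drives it:

#include "kvm/virtio-9p.h"

#include <stdlib.h>

/* decode the body of a request and start encoding a reply, using the
 * same conventions as the handlers in 9p.c */
static void demo(struct p9_pdu *pdu)
{
	u32 fid;
	char *name;	/* readf allocates; the caller must free */

	virtio_p9_pdu_readf(pdu, "ds", &fid, &name);
	free(name);

	pdu->write_offset = VIRTIO_9P_HDR_LEN;
	virtio_p9_pdu_writef(pdu, "d", 0);
}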
diff --git a/tools/kvm/virtio/9p.c b/tools/kvm/virtio/9p.c
new file mode 100644
index 0000000..9073a1e
--- /dev/null
+++ b/tools/kvm/virtio/9p.c
@@ -0,0 +1,1446 @@
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/threadpool.h"
+#include "kvm/irq.h"
+#include "kvm/virtio-9p.h"
+#include "kvm/guest_compat.h"
+#include "kvm/builtin-setup.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/vfs.h>
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_9p.h>
+#include <linux/uidgid.h>
+#include <net/9p/9p.h>
+
+static LIST_HEAD(devs);
+static int compat_id = -1;
+
+static int insert_new_fid(struct p9_dev *dev, struct p9_fid *fid);
+static struct p9_fid *find_or_create_fid(struct p9_dev *dev, u32 fid)
+{
+	struct rb_node *node = dev->fids.rb_node;
+	struct p9_fid *pfid = NULL;
+
+	while (node) {
+		struct p9_fid *cur = rb_entry(node, struct p9_fid, node);
+
+		if (fid < cur->fid) {
+			node = node->rb_left;
+		} else if (fid > cur->fid) {
+			node = node->rb_right;
+		} else {
+			return cur;
+		}
+	}
+
+	pfid = calloc(1, sizeof(*pfid));
+	if (!pfid)
+		return NULL;
+
+	pfid->fid = fid;
+	strcpy(pfid->abs_path, dev->root_dir);
+	pfid->path = pfid->abs_path + strlen(dev->root_dir);
+
+	insert_new_fid(dev, pfid);
+
+	return pfid;
+}
+
+static int insert_new_fid(struct p9_dev *dev, struct p9_fid *fid)
+{
+	struct rb_node **node = &(dev->fids.rb_node), *parent = NULL;
+
+	while (*node) {
+		int result = fid->fid - rb_entry(*node, struct p9_fid, node)->fid;
+
+		parent = *node;
+		if (result < 0)
+			node    = &((*node)->rb_left);
+		else if (result > 0)
+			node    = &((*node)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&fid->node, parent, node);
+	rb_insert_color(&fid->node, &dev->fids);
+	return 0;
+}
+
+static struct p9_fid *get_fid(struct p9_dev *p9dev, int fid)
+{
+	struct p9_fid *new;
+
+	new = find_or_create_fid(p9dev, fid);
+
+	return new;
+}
+
+/* Warning: the value returned points into the caller's abs_path buffer; use it immediately */
+static const char *rel_to_abs(struct p9_dev *p9dev,
+			      const char *path, char *abs_path)
+{
+	sprintf(abs_path, "%s/%s", p9dev->root_dir, path);
+
+	return abs_path;
+}
+
+static void stat2qid(struct stat *st, struct p9_qid *qid)
+{
+	*qid = (struct p9_qid) {
+		.path		= st->st_ino,
+		.version	= st->st_mtime,
+	};
+
+	if (S_ISDIR(st->st_mode))
+		qid->type	|= P9_QTDIR;
+}
+
+static void close_fid(struct p9_dev *p9dev, u32 fid)
+{
+	struct p9_fid *pfid = get_fid(p9dev, fid);
+
+	if (pfid->fd > 0)
+		close(pfid->fd);
+
+	if (pfid->dir)
+		closedir(pfid->dir);
+
+	rb_erase(&pfid->node, &p9dev->fids);
+	free(pfid);
+}
+
+static void virtio_p9_set_reply_header(struct p9_pdu *pdu, u32 size)
+{
+	u8 cmd;
+	u16 tag;
+
+	pdu->read_offset = sizeof(u32);
+	virtio_p9_pdu_readf(pdu, "bw", &cmd, &tag);
+	pdu->write_offset = 0;
+	/* cmd + 1 is the reply message */
+	virtio_p9_pdu_writef(pdu, "dbw", size, cmd + 1, tag);
+}
+
+static u16 virtio_p9_update_iov_cnt(struct iovec iov[], u32 count, int iov_cnt)
+{
+	int i;
+	u32 total = 0;
+	for (i = 0; (i < iov_cnt) && (total < count); i++) {
+		if (total + iov[i].iov_len > count) {
+			/* we don't need this iov fully */
+			iov[i].iov_len -= ((total + iov[i].iov_len) - count);
+			i++;
+			break;
+		}
+		total += iov[i].iov_len;
+	}
+	return i;
+}
+
+static void virtio_p9_error_reply(struct p9_dev *p9dev,
+				  struct p9_pdu *pdu, int err, u32 *outlen)
+{
+	u16 tag;
+
+	pdu->write_offset = VIRTIO_9P_HDR_LEN;
+	virtio_p9_pdu_writef(pdu, "d", err);
+	*outlen = pdu->write_offset;
+
+	/* read the tag from input */
+	pdu->read_offset = sizeof(u32) + sizeof(u8);
+	virtio_p9_pdu_readf(pdu, "w", &tag);
+
+	/* Update the header */
+	pdu->write_offset = 0;
+	virtio_p9_pdu_writef(pdu, "dbw", *outlen, P9_RLERROR, tag);
+}
+
+static void virtio_p9_version(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 msize;
+	char *version;
+	virtio_p9_pdu_readf(pdu, "ds", &msize, &version);
+	/*
+	 * reply with the same msize the client sent us
+	 * Error out if the request is not for 9P2000.L
+	 */
+	if (!strcmp(version, VIRTIO_9P_VERSION_DOTL))
+		virtio_p9_pdu_writef(pdu, "ds", msize, version);
+	else
+		virtio_p9_pdu_writef(pdu, "ds", msize, "unknown");
+
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(version);
+	return;
+}
+
+static void virtio_p9_clunk(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid;
+
+	virtio_p9_pdu_readf(pdu, "d", &fid);
+	close_fid(p9dev, fid);
+
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+}
+
+/*
+ * FIXME!! Need to map to a protocol-independent value. Upstream
+ * 9p has the same bug.
+ */
+static int virtio_p9_openflags(int flags)
+{
+	flags &= ~(O_NOCTTY | O_ASYNC | O_CREAT | O_DIRECT);
+	flags |= O_NOFOLLOW;
+	return flags;
+}
+
+static bool is_dir(struct p9_fid *fid)
+{
+	struct stat st;
+
+	stat(fid->abs_path, &st);
+
+	return S_ISDIR(st.st_mode);
+}
+
+static void virtio_p9_open(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid, flags;
+	struct stat st;
+	struct p9_qid qid;
+	struct p9_fid *new_fid;
+
+
+	virtio_p9_pdu_readf(pdu, "dd", &fid, &flags);
+	new_fid = get_fid(p9dev, fid);
+
+	if (lstat(new_fid->abs_path, &st) < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+
+	if (is_dir(new_fid)) {
+		new_fid->dir = opendir(new_fid->abs_path);
+		if (!new_fid->dir)
+			goto err_out;
+	} else {
+		new_fid->fd  = open(new_fid->abs_path,
+				    virtio_p9_openflags(flags));
+		if (new_fid->fd < 0)
+			goto err_out;
+	}
+	/* FIXME!! need to send a proper iounit */
+	virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_create(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	int fd, ret;
+	char *name;
+	struct stat st;
+	struct p9_qid qid;
+	struct p9_fid *dfid;
+	char full_path[PATH_MAX];
+	u32 dfid_val, flags, mode, gid;
+
+	virtio_p9_pdu_readf(pdu, "dsddd", &dfid_val,
+			    &name, &flags, &mode, &gid);
+	dfid = get_fid(p9dev, dfid_val);
+
+	flags = virtio_p9_openflags(flags);
+
+	sprintf(full_path, "%s/%s", dfid->abs_path, name);
+	fd = open(full_path, flags | O_CREAT, mode);
+	if (fd < 0)
+		goto err_out;
+	dfid->fd = fd;
+
+	if (lstat(full_path, &st) < 0)
+		goto err_out;
+
+	ret = chmod(full_path, mode & 0777);
+	if (ret < 0)
+		goto err_out;
+
+	/* append in place; sprintf() with the destination as a source is UB */
+	strcat(dfid->path, "/");
+	strcat(dfid->path, name);
+	stat2qid(&st, &qid);
+	virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(name);
+	return;
+err_out:
+	free(name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_mkdir(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	char *name;
+	struct stat st;
+	struct p9_qid qid;
+	struct p9_fid *dfid;
+	char full_path[PATH_MAX];
+	u32 dfid_val, mode, gid;
+
+	virtio_p9_pdu_readf(pdu, "dsdd", &dfid_val,
+			    &name, &mode, &gid);
+	dfid = get_fid(p9dev, dfid_val);
+
+	sprintf(full_path, "%s/%s", dfid->abs_path, name);
+	ret = mkdir(full_path, mode);
+	if (ret < 0)
+		goto err_out;
+
+	if (lstat(full_path, &st) < 0)
+		goto err_out;
+
+	ret = chmod(full_path, mode & 0777);
+	if (ret < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+	virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(name);
+	return;
+err_out:
+	free(name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_walk(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	u8 i;
+	u16 nwqid;
+	u16 nwname;
+	struct p9_qid wqid;
+	struct p9_fid *new_fid, *old_fid;
+	u32 fid_val, newfid_val;
+
+
+	virtio_p9_pdu_readf(pdu, "ddw", &fid_val, &newfid_val, &nwname);
+	new_fid	= get_fid(p9dev, newfid_val);
+
+	nwqid = 0;
+	if (nwname) {
+		struct p9_fid *fid = get_fid(p9dev, fid_val);
+
+		strcpy(new_fid->path, fid->path);
+		/* skip the space for count */
+		pdu->write_offset += sizeof(u16);
+		for (i = 0; i < nwname; i++) {
+			struct stat st;
+			char tmp[PATH_MAX] = {0};
+			char full_path[PATH_MAX];
+			char *str;
+
+			virtio_p9_pdu_readf(pdu, "s", &str);
+
+			/* Format the new path we're 'walk'ing into */
+			sprintf(tmp, "%s/%s", new_fid->path, str);
+
+			free(str);
+
+			if (lstat(rel_to_abs(p9dev, tmp, full_path), &st) < 0)
+				goto err_out;
+
+			stat2qid(&st, &wqid);
+			strcpy(new_fid->path, tmp);
+			new_fid->uid = fid->uid;
+			nwqid++;
+			virtio_p9_pdu_writef(pdu, "Q", &wqid);
+		}
+	} else {
+		/*
+		 * update write_offset so our outlen gets the correct value
+		 */
+		pdu->write_offset += sizeof(u16);
+		old_fid = get_fid(p9dev, fid_val);
+		strcpy(new_fid->path, old_fid->path);
+		new_fid->uid    = old_fid->uid;
+	}
+	*outlen = pdu->write_offset;
+	pdu->write_offset = VIRTIO_9P_HDR_LEN;
+	/* nwqid is 16 bits on the wire; only sizeof(u16) was reserved above */
+	virtio_p9_pdu_writef(pdu, "w", nwqid);
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_attach(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	char *uname;
+	char *aname;
+	struct stat st;
+	struct p9_qid qid;
+	struct p9_fid *fid;
+	u32 fid_val, afid, uid;
+
+	virtio_p9_pdu_readf(pdu, "ddssd", &fid_val, &afid,
+			    &uname, &aname, &uid);
+
+	free(uname);
+	free(aname);
+
+	if (lstat(p9dev->root_dir, &st) < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+
+	fid = get_fid(p9dev, fid_val);
+	fid->uid = uid;
+	strcpy(fid->path, "/");
+
+	virtio_p9_pdu_writef(pdu, "Q", &qid);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_fill_stat(struct p9_dev *p9dev,
+				struct stat *st, struct p9_stat_dotl *statl)
+{
+	memset(statl, 0, sizeof(*statl));
+	statl->st_mode		= st->st_mode;
+	statl->st_nlink		= st->st_nlink;
+	statl->st_uid		= KUIDT_INIT(st->st_uid);
+	statl->st_gid		= KGIDT_INIT(st->st_gid);
+	statl->st_rdev		= st->st_rdev;
+	statl->st_size		= st->st_size;
+	statl->st_blksize	= st->st_blksize;
+	statl->st_blocks	= st->st_blocks;
+	statl->st_atime_sec	= st->st_atime;
+	statl->st_atime_nsec	= st->st_atim.tv_nsec;
+	statl->st_mtime_sec	= st->st_mtime;
+	statl->st_mtime_nsec	= st->st_mtim.tv_nsec;
+	statl->st_ctime_sec	= st->st_ctime;
+	statl->st_ctime_nsec	= st->st_ctim.tv_nsec;
+	/* Currently we only support BASIC fields in stat */
+	statl->st_result_mask	= P9_STATS_BASIC;
+	stat2qid(st, &statl->qid);
+}
+
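+/*
+ * Tread: instead of bouncing data through a temporary buffer, in_iov is
+ * temporarily advanced past the reply header and count field so that
+ * preadv() can place the file data directly into the guest's buffers.
+ */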
+static void virtio_p9_read(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	u64 offset;
+	u32 fid_val;
+	u16 iov_cnt;
+	void *iov_base;
+	size_t iov_len;
+	u32 count, rcount;
+	ssize_t res;
+	struct p9_fid *fid;
+
+	rcount = 0;
+	virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+	fid = get_fid(p9dev, fid_val);
+
+	iov_base = pdu->in_iov[0].iov_base;
+	iov_len  = pdu->in_iov[0].iov_len;
+	iov_cnt  = pdu->in_iov_cnt;
+	pdu->in_iov[0].iov_base += VIRTIO_9P_HDR_LEN + sizeof(u32);
+	pdu->in_iov[0].iov_len -= VIRTIO_9P_HDR_LEN + sizeof(u32);
+	pdu->in_iov_cnt = virtio_p9_update_iov_cnt(pdu->in_iov,
+						   count,
+						   pdu->in_iov_cnt);
+	res = preadv(fid->fd, pdu->in_iov,
+			pdu->in_iov_cnt, offset);
+	/* Treat a failed read as zero bytes; never report more than requested */
+	rcount = res > 0 ? (u32)res : 0;
+	if (rcount > count)
+		rcount = count;
+	/*
+	 * Update the iov_base back, so that rest of
+	 * pdu_writef works correctly.
+	 */
+	pdu->in_iov[0].iov_base = iov_base;
+	pdu->in_iov[0].iov_len  = iov_len;
+	pdu->in_iov_cnt         = iov_cnt;
+
+	pdu->write_offset = VIRTIO_9P_HDR_LEN;
+	virtio_p9_pdu_writef(pdu, "d", rcount);
+	*outlen = pdu->write_offset + rcount;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+}
+
+static int virtio_p9_dentry_size(struct dirent *dent)
+{
+	/*
+	 * Size of each dirent:
+	 * qid(13) + offset(8) + type(1) + name_len(2) + name
+	 */
+	return 24 + strlen(dent->d_name);
+}
+
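+/*
+ * Treaddir: pack as many directory entries as fit into the guest's
+ * 'count' byte budget; on overflow, seek back to the previous entry so
+ * that the next request resumes at the right offset.
+ */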
+static void virtio_p9_readdir(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid_val;
+	u32 count, rcount;
+	struct stat st;
+	struct p9_fid *fid;
+	struct dirent *dent;
+	char full_path[PATH_MAX];
+	u64 offset, old_offset;
+
+	rcount = 0;
+	virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+	fid = get_fid(p9dev, fid_val);
+
+	if (!is_dir(fid)) {
+		errno = EINVAL;
+		goto err_out;
+	}
+
+	/* Seek to the specified offset */
+	seekdir(fid->dir, offset);
+
+	old_offset = offset;
+	/* If reading a dir, fill the buffer with p9_stat entries */
+	dent = readdir(fid->dir);
+
+	/* Skip the space for writing count */
+	pdu->write_offset += sizeof(u32);
+	while (dent) {
+		u32 read;
+		struct p9_qid qid;
+
+		if ((rcount + virtio_p9_dentry_size(dent)) > count) {
+			/* seek to the previous offset and return */
+			seekdir(fid->dir, old_offset);
+			break;
+		}
+		old_offset = dent->d_off;
+		lstat(rel_to_abs(p9dev, dent->d_name, full_path), &st);
+		stat2qid(&st, &qid);
+		read = pdu->write_offset;
+		virtio_p9_pdu_writef(pdu, "Qqbs", &qid, dent->d_off,
+				     dent->d_type, dent->d_name);
+		rcount += pdu->write_offset - read;
+		dent = readdir(fid->dir);
+	}
+
+	pdu->write_offset = VIRTIO_9P_HDR_LEN;
+	virtio_p9_pdu_writef(pdu, "d", rcount);
+	*outlen = pdu->write_offset + rcount;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_getattr(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid_val;
+	struct stat st;
+	u64 request_mask;
+	struct p9_fid *fid;
+	struct p9_stat_dotl statl;
+
+	virtio_p9_pdu_readf(pdu, "dq", &fid_val, &request_mask);
+	fid = get_fid(p9dev, fid_val);
+	if (lstat(fid->abs_path, &st) < 0)
+		goto err_out;
+
+	virtio_p9_fill_stat(p9dev, &st, &statl);
+	virtio_p9_pdu_writef(pdu, "A", &statl);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+/* FIXME!! from linux/fs.h */
+/*
+ * Attribute flags.  These should be or-ed together to figure out what
+ * has been changed!
+ */
+#define ATTR_MODE	(1 << 0)
+#define ATTR_UID	(1 << 1)
+#define ATTR_GID	(1 << 2)
+#define ATTR_SIZE	(1 << 3)
+#define ATTR_ATIME	(1 << 4)
+#define ATTR_MTIME	(1 << 5)
+#define ATTR_CTIME	(1 << 6)
+#define ATTR_ATIME_SET	(1 << 7)
+#define ATTR_MTIME_SET	(1 << 8)
+#define ATTR_FORCE	(1 << 9) /* Not a change, but force the change */
+#define ATTR_ATTR_FLAG	(1 << 10)
+#define ATTR_KILL_SUID	(1 << 11)
+#define ATTR_KILL_SGID	(1 << 12)
+#define ATTR_FILE	(1 << 13)
+#define ATTR_KILL_PRIV	(1 << 14)
+#define ATTR_OPEN	(1 << 15) /* Truncating from open(O_TRUNC) */
+#define ATTR_TIMES_SET	(1 << 16)
+
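+/* Mask of the low seven bits, i.e. ATTR_MODE through ATTR_CTIME */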
+#define ATTR_MASK    127
+
+static void virtio_p9_setattr(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret = 0;
+	u32 fid_val;
+	struct p9_fid *fid;
+	struct p9_iattr_dotl p9attr;
+
+	virtio_p9_pdu_readf(pdu, "dI", &fid_val, &p9attr);
+	fid = get_fid(p9dev, fid_val);
+
+	if (p9attr.valid & ATTR_MODE) {
+		ret = chmod(fid->abs_path, p9attr.mode);
+		if (ret < 0)
+			goto err_out;
+	}
+	if (p9attr.valid & (ATTR_ATIME | ATTR_MTIME)) {
+		struct timespec times[2];
+		if (p9attr.valid & ATTR_ATIME) {
+			if (p9attr.valid & ATTR_ATIME_SET) {
+				times[0].tv_sec = p9attr.atime_sec;
+				times[0].tv_nsec = p9attr.atime_nsec;
+			} else {
+				times[0].tv_nsec = UTIME_NOW;
+			}
+		} else {
+			times[0].tv_nsec = UTIME_OMIT;
+		}
+		if (p9attr.valid & ATTR_MTIME) {
+			if (p9attr.valid & ATTR_MTIME_SET) {
+				times[1].tv_sec = p9attr.mtime_sec;
+				times[1].tv_nsec = p9attr.mtime_nsec;
+			} else {
+				times[1].tv_nsec = UTIME_NOW;
+			}
+		} else
+			times[1].tv_nsec = UTIME_OMIT;
+
+		ret = utimensat(-1, fid->abs_path, times, AT_SYMLINK_NOFOLLOW);
+		if (ret < 0)
+			goto err_out;
+	}
+	/*
+	 * If the only valid entry in iattr is ctime we can call
+	 * chown(-1,-1) to update the ctime of the file
+	 */
+	if ((p9attr.valid & (ATTR_UID | ATTR_GID)) ||
+	    ((p9attr.valid & ATTR_CTIME)
+	     && !((p9attr.valid & ATTR_MASK) & ~ATTR_CTIME))) {
+		if (!(p9attr.valid & ATTR_UID))
+			p9attr.uid = KUIDT_INIT(-1);
+
+		if (!(p9attr.valid & ATTR_GID))
+			p9attr.gid = KGIDT_INIT(-1);
+
+		ret = lchown(fid->abs_path, __kuid_val(p9attr.uid),
+				__kgid_val(p9attr.gid));
+		if (ret < 0)
+			goto err_out;
+	}
+	if (p9attr.valid & (ATTR_SIZE)) {
+		ret = truncate(fid->abs_path, p9attr.size);
+		if (ret < 0)
+			goto err_out;
+	}
+	*outlen = VIRTIO_9P_HDR_LEN;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
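+/*
+ * Twrite: the mirror of Tread; out_iov is advanced past the message
+ * header and the fid/offset/count fields so that pwritev() can take the
+ * payload straight from the guest's buffers.
+ */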
+static void virtio_p9_write(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+	u64 offset;
+	u32 fid_val;
+	u32 count;
+	ssize_t res;
+	u16 iov_cnt;
+	void *iov_base;
+	size_t iov_len;
+	struct p9_fid *fid;
+	/* u32 fid + u64 offset + u32 count */
+	int twrite_size = sizeof(u32) + sizeof(u64) + sizeof(u32);
+
+	virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+	fid = get_fid(p9dev, fid_val);
+
+	iov_base = pdu->out_iov[0].iov_base;
+	iov_len  = pdu->out_iov[0].iov_len;
+	iov_cnt  = pdu->out_iov_cnt;
+
+	/* Adjust the iovec to skip the header and meta data */
+	pdu->out_iov[0].iov_base += (sizeof(struct p9_msg) + twrite_size);
+	pdu->out_iov[0].iov_len -=  (sizeof(struct p9_msg) + twrite_size);
+	pdu->out_iov_cnt = virtio_p9_update_iov_cnt(pdu->out_iov, count,
+						    pdu->out_iov_cnt);
+	res = pwritev(fid->fd, pdu->out_iov, pdu->out_iov_cnt, offset);
+	/*
+	 * Update the iov_base back, so that rest of
+	 * pdu_readf works correctly.
+	 */
+	pdu->out_iov[0].iov_base = iov_base;
+	pdu->out_iov[0].iov_len  = iov_len;
+	pdu->out_iov_cnt         = iov_cnt;
+
+	if (res < 0)
+		goto err_out;
+	virtio_p9_pdu_writef(pdu, "d", res);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_remove(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	u32 fid_val;
+	struct p9_fid *fid;
+
+	virtio_p9_pdu_readf(pdu, "d", &fid_val);
+	fid = get_fid(p9dev, fid_val);
+
+	ret = remove(fid->abs_path);
+	if (ret < 0)
+		goto err_out;
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_rename(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	u32 fid_val, new_fid_val;
+	struct p9_fid *fid, *new_fid;
+	char full_path[PATH_MAX], *new_name;
+
+	virtio_p9_pdu_readf(pdu, "dds", &fid_val, &new_fid_val, &new_name);
+	fid = get_fid(p9dev, fid_val);
+	new_fid = get_fid(p9dev, new_fid_val);
+
+	sprintf(full_path, "%s/%s", new_fid->abs_path, new_name);
+	ret = rename(fid->abs_path, full_path);
+	if (ret < 0)
+		goto err_out;
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_readlink(struct p9_dev *p9dev,
+			       struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	u32 fid_val;
+	struct p9_fid *fid;
+	char target_path[PATH_MAX];
+
+	virtio_p9_pdu_readf(pdu, "d", &fid_val);
+	fid = get_fid(p9dev, fid_val);
+
+	memset(target_path, 0, PATH_MAX);
+	ret = readlink(fid->abs_path, target_path, PATH_MAX - 1);
+	if (ret < 0)
+		goto err_out;
+
+	virtio_p9_pdu_writef(pdu, "s", target_path);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_statfs(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	u64 fsid;
+	u32 fid_val;
+	struct p9_fid *fid;
+	struct statfs stat_buf;
+
+	virtio_p9_pdu_readf(pdu, "d", &fid_val);
+	fid = get_fid(p9dev, fid_val);
+
+	ret = statfs(fid->abs_path, &stat_buf);
+	if (ret < 0)
+		goto err_out;
+	/* FIXME!! f_blocks needs update based on client msize */
+	fsid = (unsigned int) stat_buf.f_fsid.__val[0] |
+		(unsigned long long)stat_buf.f_fsid.__val[1] << 32;
+	virtio_p9_pdu_writef(pdu, "ddqqqqqqd", stat_buf.f_type,
+			     stat_buf.f_bsize, stat_buf.f_blocks,
+			     stat_buf.f_bfree, stat_buf.f_bavail,
+			     stat_buf.f_files, stat_buf.f_ffree,
+			     fsid, stat_buf.f_namelen);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_mknod(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	char *name;
+	struct stat st;
+	struct p9_fid *dfid;
+	struct p9_qid qid;
+	char full_path[PATH_MAX];
+	u32 fid_val, mode, major, minor, gid;
+
+	virtio_p9_pdu_readf(pdu, "dsdddd", &fid_val, &name, &mode,
+			    &major, &minor, &gid);
+
+	dfid = get_fid(p9dev, fid_val);
+	sprintf(full_path, "%s/%s", dfid->abs_path, name);
+	ret = mknod(full_path, mode, makedev(major, minor));
+	if (ret < 0)
+		goto err_out;
+
+	if (lstat(full_path, &st) < 0)
+		goto err_out;
+
+	ret = chmod(full_path, mode & 0777);
+	if (ret < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+	virtio_p9_pdu_writef(pdu, "Q", &qid);
+	free(name);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	free(name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_fsync(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	struct p9_fid *fid;
+	u32 fid_val, datasync;
+
+	virtio_p9_pdu_readf(pdu, "dd", &fid_val, &datasync);
+	fid = get_fid(p9dev, fid_val);
+
+	if (datasync)
+		ret = fdatasync(fid->fd);
+	else
+		ret = fsync(fid->fd);
+	if (ret < 0)
+		goto err_out;
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_symlink(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	struct stat st;
+	u32 fid_val, gid;
+	struct p9_qid qid;
+	struct p9_fid *dfid;
+	char new_name[PATH_MAX];
+	char *old_path, *name;
+
+	virtio_p9_pdu_readf(pdu, "dssd", &fid_val, &name, &old_path, &gid);
+
+	dfid = get_fid(p9dev, fid_val);
+	sprintf(new_name, "%s/%s", dfid->abs_path, name);
+	ret = symlink(old_path, new_name);
+	if (ret < 0)
+		goto err_out;
+
+	if (lstat(new_name, &st) < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+	virtio_p9_pdu_writef(pdu, "Q", &qid);
+	free(name);
+	free(old_path);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	free(name);
+	free(old_path);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_link(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	char *name;
+	u32 fid_val, dfid_val;
+	struct p9_fid *dfid, *fid;
+	char full_path[PATH_MAX];
+
+	virtio_p9_pdu_readf(pdu, "dds", &dfid_val, &fid_val, &name);
+
+	dfid = get_fid(p9dev, dfid_val);
+	fid =  get_fid(p9dev, fid_val);
+	sprintf(full_path, "%s/%s", dfid->abs_path, name);
+	ret = link(fid->abs_path, full_path);
+	if (ret < 0)
+		goto err_out;
+	free(name);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	free(name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+
+}
+
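+/*
+ * Tlock and Tgetlock are stubs: no lock is taken on the host side, the
+ * server simply acknowledges every request, so byte-range locks are not
+ * enforced across clients.
+ */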
+static void virtio_p9_lock(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	u8 ret;
+	u32 fid_val;
+	struct p9_flock flock;
+
+	virtio_p9_pdu_readf(pdu, "dbdqqds", &fid_val, &flock.type,
+			    &flock.flags, &flock.start, &flock.length,
+			    &flock.proc_id, &flock.client_id);
+
+	/* Just return success */
+	ret = P9_LOCK_SUCCESS;
+	virtio_p9_pdu_writef(pdu, "d", ret);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(flock.client_id);
+	return;
+}
+
+static void virtio_p9_getlock(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid_val;
+	struct p9_getlock glock;
+	virtio_p9_pdu_readf(pdu, "dbqqds", &fid_val, &glock.type,
+			    &glock.start, &glock.length, &glock.proc_id,
+			    &glock.client_id);
+
+	/* Just return success */
+	glock.type = F_UNLCK;
+	virtio_p9_pdu_writef(pdu, "bqqds", glock.type,
+			     glock.start, glock.length, glock.proc_id,
+			     glock.client_id);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(glock.client_id);
+	return;
+}
+
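+/*
+ * Return 1 if 'ancestor' names the path itself or one of its parent
+ * directories, i.e. it matches on a path component boundary rather than
+ * merely as a string prefix.
+ */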
+static int virtio_p9_ancestor(char *path, char *ancestor)
+{
+	int size = strlen(ancestor);
+	if (!strncmp(path, ancestor, size)) {
+		/*
+		 * Now check whether ancestor is the full name or a
+		 * directory component, and not just part of a name.
+		 */
+		if (path[size] == '\0' || path[size] == '/')
+			return 1;
+	}
+	return 0;
+}
+
+static void virtio_p9_fix_path(char *fid_path, char *old_name, char *new_name)
+{
+	char tmp_name[PATH_MAX];
+	size_t rp_sz = strlen(old_name);
+
+	if (rp_sz == strlen(fid_path)) {
+		/* replace the full name */
+		strcpy(fid_path, new_name);
+		return;
+	}
+	/* save the trailing path details */
+	strcpy(tmp_name, fid_path + rp_sz);
+	sprintf(fid_path, "%s%s", new_name, tmp_name);
+	return;
+}
+
+static void rename_fids(struct p9_dev *p9dev, char *old_name, char *new_name)
+{
+	struct rb_node *node = rb_first(&p9dev->fids);
+
+	while (node) {
+		struct p9_fid *fid = rb_entry(node, struct p9_fid, node);
+
+		if (fid->fid != P9_NOFID &&
+		    virtio_p9_ancestor(fid->path, old_name))
+			virtio_p9_fix_path(fid->path, old_name, new_name);
+		node = rb_next(node);
+	}
+}
+
+static void virtio_p9_renameat(struct p9_dev *p9dev,
+			       struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	char *old_name, *new_name;
+	u32 old_dfid_val, new_dfid_val;
+	struct p9_fid *old_dfid, *new_dfid;
+	char old_full_path[PATH_MAX], new_full_path[PATH_MAX];
+
+	virtio_p9_pdu_readf(pdu, "dsds", &old_dfid_val, &old_name,
+			    &new_dfid_val, &new_name);
+
+	old_dfid = get_fid(p9dev, old_dfid_val);
+	new_dfid = get_fid(p9dev, new_dfid_val);
+
+	sprintf(old_full_path, "%s/%s", old_dfid->abs_path, old_name);
+	sprintf(new_full_path, "%s/%s", new_dfid->abs_path, new_name);
+	ret = rename(old_full_path, new_full_path);
+	if (ret < 0)
+		goto err_out;
+	/*
+	 * Now fix path in other fids, if the renamed path is part of
+	 * that.
+	 */
+	rename_fids(p9dev, old_name, new_name);
+	free(old_name);
+	free(new_name);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	free(old_name);
+	free(new_name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_unlinkat(struct p9_dev *p9dev,
+			       struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	char *name;
+	u32 fid_val, flags;
+	struct p9_fid *fid;
+	char full_path[PATH_MAX];
+
+	virtio_p9_pdu_readf(pdu, "dsd", &fid_val, &name, &flags);
+	fid = get_fid(p9dev, fid_val);
+
+	sprintf(full_path, "%s/%s", fid->abs_path, name);
+	ret = remove(full_path);
+	if (ret < 0)
+		goto err_out;
+	free(name);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	free(name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_flush(struct p9_dev *p9dev,
+				struct p9_pdu *pdu, u32 *outlen)
+{
+	u16 tag, oldtag;
+
+	virtio_p9_pdu_readf(pdu, "ww", &tag, &oldtag);
+	virtio_p9_pdu_writef(pdu, "w", tag);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+
+	return;
+}
+
+static void virtio_p9_eopnotsupp(struct p9_dev *p9dev,
+				 struct p9_pdu *pdu, u32 *outlen)
+{
+	return virtio_p9_error_reply(p9dev, pdu, EOPNOTSUPP, outlen);
+}
+
+typedef void p9_handler(struct p9_dev *p9dev,
+			struct p9_pdu *pdu, u32 *outlen);
+
+/* FIXME should be removed when merging with latest linus tree */
+#define P9_TRENAMEAT 74
+#define P9_TUNLINKAT 76
+
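+/* Dispatch table indexed by the 9P2000.L T-message command number */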
+static p9_handler *virtio_9p_dotl_handler[] = {
+	[P9_TREADDIR]     = virtio_p9_readdir,
+	[P9_TSTATFS]      = virtio_p9_statfs,
+	[P9_TGETATTR]     = virtio_p9_getattr,
+	[P9_TSETATTR]     = virtio_p9_setattr,
+	[P9_TXATTRWALK]   = virtio_p9_eopnotsupp,
+	[P9_TXATTRCREATE] = virtio_p9_eopnotsupp,
+	[P9_TMKNOD]       = virtio_p9_mknod,
+	[P9_TLOCK]        = virtio_p9_lock,
+	[P9_TGETLOCK]     = virtio_p9_getlock,
+	[P9_TRENAMEAT]    = virtio_p9_renameat,
+	[P9_TREADLINK]    = virtio_p9_readlink,
+	[P9_TUNLINKAT]    = virtio_p9_unlinkat,
+	[P9_TMKDIR]       = virtio_p9_mkdir,
+	[P9_TVERSION]     = virtio_p9_version,
+	[P9_TLOPEN]       = virtio_p9_open,
+	[P9_TATTACH]      = virtio_p9_attach,
+	[P9_TWALK]        = virtio_p9_walk,
+	[P9_TCLUNK]       = virtio_p9_clunk,
+	[P9_TFSYNC]       = virtio_p9_fsync,
+	[P9_TREAD]        = virtio_p9_read,
+	[P9_TFLUSH]       = virtio_p9_flush,
+	[P9_TLINK]        = virtio_p9_link,
+	[P9_TSYMLINK]     = virtio_p9_symlink,
+	[P9_TLCREATE]     = virtio_p9_create,
+	[P9_TWRITE]       = virtio_p9_write,
+	[P9_TREMOVE]      = virtio_p9_remove,
+	[P9_TRENAME]      = virtio_p9_rename,
+};
+
+static struct p9_pdu *virtio_p9_pdu_init(struct kvm *kvm, struct virt_queue *vq)
+{
+	struct p9_pdu *pdu = calloc(1, sizeof(*pdu));
+	if (!pdu)
+		return NULL;
+
+	/* skip the pdu header p9_msg */
+	pdu->read_offset	= VIRTIO_9P_HDR_LEN;
+	pdu->write_offset	= VIRTIO_9P_HDR_LEN;
+	pdu->queue_head		= virt_queue__get_inout_iov(kvm, vq, pdu->in_iov,
+					pdu->out_iov, &pdu->in_iov_cnt, &pdu->out_iov_cnt);
+	return pdu;
+}
+
+static u8 virtio_p9_get_cmd(struct p9_pdu *pdu)
+{
+	struct p9_msg *msg;
+	/*
+	 * We can peek directly into the pdu for a u8 value;
+	 * host endianness is not an issue for a single byte.
+	 */
+	msg = pdu->out_iov[0].iov_base;
+	return msg->cmd;
+}
+
+static bool virtio_p9_do_io_request(struct kvm *kvm, struct p9_dev_job *job)
+{
+	u8 cmd;
+	u32 len = 0;
+	p9_handler *handler;
+	struct p9_dev *p9dev;
+	struct virt_queue *vq;
+	struct p9_pdu *p9pdu;
+
+	vq = job->vq;
+	p9dev = job->p9dev;
+
+	p9pdu = virtio_p9_pdu_init(kvm, vq);
+	cmd = virtio_p9_get_cmd(p9pdu);
+
+	if ((cmd >= ARRAY_SIZE(virtio_9p_dotl_handler)) ||
+	    !virtio_9p_dotl_handler[cmd])
+		handler = virtio_p9_eopnotsupp;
+	else
+		handler = virtio_9p_dotl_handler[cmd];
+
+	handler(p9dev, p9pdu, &len);
+	virt_queue__set_used_elem(vq, p9pdu->queue_head, len);
+	free(p9pdu);
+	return true;
+}
+
+static void virtio_p9_do_io(struct kvm *kvm, void *param)
+{
+	struct p9_dev_job *job = (struct p9_dev_job *)param;
+	struct p9_dev *p9dev   = job->p9dev;
+	struct virt_queue *vq  = job->vq;
+
+	while (virt_queue__available(vq)) {
+		virtio_p9_do_io_request(kvm, job);
+		p9dev->vdev.ops->signal_vq(kvm, &p9dev->vdev, vq - p9dev->vqs);
+	}
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	struct p9_dev *p9dev = dev;
+
+	return ((u8 *)(p9dev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	return 1 << VIRTIO_9P_MOUNT_TAG;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	struct p9_dev *p9dev = dev;
+	struct virtio_9p_config *conf = p9dev->config;
+
+	p9dev->features = features;
+	conf->tag_len = virtio_host_to_guest_u16(&p9dev->vdev, conf->tag_len);
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct p9_dev *p9dev = dev;
+	struct p9_dev_job *job;
+	struct virt_queue *queue;
+	void *p;
+
+	compat__remove_message(compat_id);
+
+	queue		= &p9dev->vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+	job		= &p9dev->jobs[vq];
+
+	vring_init(&queue->vring, VIRTQUEUE_NUM, p, align);
+	virtio_init_device_vq(&p9dev->vdev, queue);
+
+	*job		= (struct p9_dev_job) {
+		.vq		= queue,
+		.p9dev		= p9dev,
+	};
+	thread_pool__init_job(&job->job_id, kvm, virtio_p9_do_io, job);
+
+	return 0;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct p9_dev *p9dev = dev;
+
+	thread_pool__do_job(&p9dev->jobs[vq].job_id);
+
+	return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct p9_dev *p9dev = dev;
+
+	return p9dev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	return VIRTQUEUE_NUM;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	/* FIXME: dynamic */
+	return size;
+}
+
+struct virtio_ops p9_dev_virtio_ops = (struct virtio_ops) {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.init_vq		= init_vq,
+	.notify_vq		= notify_vq,
+	.get_pfn_vq		= get_pfn_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq		= set_size_vq,
+};
+
+int virtio_9p_rootdir_parser(const struct option *opt, const char *arg, int unset)
+{
+	char *tag_name;
+	char tmp[PATH_MAX];
+	struct kvm *kvm = opt->ptr;
+
+	/*
+	 * A 9p dir can be of the form dirname,tag_name or
+	 * just dirname. In the latter case we use the
+	 * default tag name.
+	 */
+	tag_name = strstr(arg, ",");
+	if (tag_name) {
+		*tag_name = '\0';
+		tag_name++;
+	}
+	if (realpath(arg, tmp)) {
+		if (virtio_9p__register(kvm, tmp, tag_name) < 0)
+			die("Unable to initialize virtio 9p");
+	} else
+		die("Failed resolving 9p path");
+	return 0;
+}
+
+int virtio_9p_img_name_parser(const struct option *opt, const char *arg, int unset)
+{
+	char path[PATH_MAX];
+	struct stat st;
+	struct kvm *kvm = opt->ptr;
+
+	if (stat(arg, &st) == 0 &&
+	    S_ISDIR(st.st_mode)) {
+		char tmp[PATH_MAX];
+
+		if (kvm->cfg.using_rootfs)
+			die("Please use only one rootfs directory atmost");
+
+		if (realpath(arg, tmp) == 0 ||
+		    virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+			die("Unable to initialize virtio 9p");
+		kvm->cfg.using_rootfs = 1;
+		return 0;
+	}
+
+	snprintf(path, PATH_MAX, "%s%s", kvm__get_dir(), arg);
+
+	if (stat(path, &st) == 0 &&
+	    S_ISDIR(st.st_mode)) {
+		char tmp[PATH_MAX];
+
+		if (kvm->cfg.using_rootfs)
+			die("Please use only one rootfs directory atmost");
+
+		if (realpath(path, tmp) == 0 ||
+		    virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+			die("Unable to initialize virtio 9p");
+		if (virtio_9p__register(kvm, "/", "hostfs") < 0)
+			die("Unable to initialize virtio 9p");
+		kvm_setup_resolv(arg);
+		kvm->cfg.using_rootfs = kvm->cfg.custom_rootfs = 1;
+		kvm->cfg.custom_rootfs_name = arg;
+		return 0;
+	}
+
+	return -1;
+}
+
+int virtio_9p__init(struct kvm *kvm)
+{
+	struct p9_dev *p9dev;
+
+	list_for_each_entry(p9dev, &devs, list) {
+		virtio_init(kvm, p9dev, &p9dev->vdev, &p9_dev_virtio_ops,
+			    VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_9P,
+			    VIRTIO_ID_9P, PCI_CLASS_9P);
+	}
+
+	return 0;
+}
+virtio_dev_init(virtio_9p__init);
+
+int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name)
+{
+	struct p9_dev *p9dev;
+	int err = 0;
+
+	p9dev = calloc(1, sizeof(*p9dev));
+	if (!p9dev)
+		return -ENOMEM;
+
+	if (!tag_name)
+		tag_name = VIRTIO_9P_DEFAULT_TAG;
+
+	p9dev->config = calloc(1, sizeof(*p9dev->config) + strlen(tag_name) + 1);
+	if (p9dev->config == NULL) {
+		err = -ENOMEM;
+		goto free_p9dev;
+	}
+
+	strcpy(p9dev->root_dir, root);
+	p9dev->config->tag_len = strlen(tag_name);
+	if (p9dev->config->tag_len > MAX_TAG_LEN) {
+		err = -EINVAL;
+		goto free_p9dev_config;
+	}
+
+	memcpy(&p9dev->config->tag, tag_name, strlen(tag_name));
+
+	list_add(&p9dev->list, &devs);
+
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-9p", "CONFIG_NET_9P_VIRTIO");
+
+	return err;
+
+free_p9dev_config:
+	free(p9dev->config);
+free_p9dev:
+	free(p9dev);
+	return err;
+}
diff --git a/tools/kvm/virtio/balloon.c b/tools/kvm/virtio/balloon.c
new file mode 100644
index 0000000..84c4bb0
--- /dev/null
+++ b/tools/kvm/virtio/balloon.c
@@ -0,0 +1,279 @@
+#include "kvm/virtio-balloon.h"
+
+#include "kvm/virtio-pci-dev.h"
+
+#include "kvm/virtio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/guest_compat.h"
+#include "kvm/kvm-ipc.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_balloon.h>
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <sys/eventfd.h>
+
+#define NUM_VIRT_QUEUES		3
+#define VIRTIO_BLN_QUEUE_SIZE	128
+#define VIRTIO_BLN_INFLATE	0
+#define VIRTIO_BLN_DEFLATE	1
+#define VIRTIO_BLN_STATS	2
+
+struct bln_dev {
+	struct list_head	list;
+	struct virtio_device	vdev;
+
+	u32			features;
+
+	/* virtio queue */
+	struct virt_queue	vqs[NUM_VIRT_QUEUES];
+	struct thread_pool__job	jobs[NUM_VIRT_QUEUES];
+
+	struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
+	struct virtio_balloon_stat *cur_stat;
+	u32			cur_stat_head;
+	u16			stat_count;
+	int			stat_waitfd;
+
+	struct virtio_balloon_config config;
+};
+
+static struct bln_dev bdev;
+static int compat_id = -1;
+
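+/*
+ * The inflate and deflate queues carry arrays of guest page frame
+ * numbers. Inflating releases the backing host memory with
+ * MADV_DONTNEED; deflating only adjusts the accounting, the guest
+ * repopulates the pages on demand.
+ */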
+static bool virtio_bln_do_io_request(struct kvm *kvm, struct bln_dev *bdev, struct virt_queue *queue)
+{
+	struct iovec iov[VIRTIO_BLN_QUEUE_SIZE];
+	unsigned int len = 0;
+	u16 out, in, head;
+	u32 *ptrs, i;
+
+	head	= virt_queue__get_iov(queue, iov, &out, &in, kvm);
+	ptrs	= iov[0].iov_base;
+	len	= iov[0].iov_len / sizeof(u32);
+
+	for (i = 0; i < len; i++) {
+		void *guest_ptr;
+
+		guest_ptr = guest_flat_to_host(kvm, (u64)ptrs[i] << VIRTIO_BALLOON_PFN_SHIFT);
+		if (queue == &bdev->vqs[VIRTIO_BLN_INFLATE]) {
+			madvise(guest_ptr, 1 << VIRTIO_BALLOON_PFN_SHIFT, MADV_DONTNEED);
+			bdev->config.actual++;
+		} else if (queue == &bdev->vqs[VIRTIO_BLN_DEFLATE]) {
+			bdev->config.actual--;
+		}
+	}
+
+	virt_queue__set_used_elem(queue, head, len);
+
+	return true;
+}
+
+static bool virtio_bln_do_stat_request(struct kvm *kvm, struct bln_dev *bdev, struct virt_queue *queue)
+{
+	struct iovec iov[VIRTIO_BLN_QUEUE_SIZE];
+	u16 out, in, head;
+	struct virtio_balloon_stat *stat;
+	u64 wait_val = 1;
+
+	head = virt_queue__get_iov(queue, iov, &out, &in, kvm);
+	stat = iov[0].iov_base;
+
+	/* Initial empty stat buffer */
+	if (bdev->cur_stat == NULL) {
+		bdev->cur_stat = stat;
+		bdev->cur_stat_head = head;
+
+		return true;
+	}
+
+	memcpy(bdev->stats, stat, iov[0].iov_len);
+
+	bdev->stat_count = iov[0].iov_len / sizeof(struct virtio_balloon_stat);
+	bdev->cur_stat = stat;
+	bdev->cur_stat_head = head;
+
+	if (write(bdev->stat_waitfd, &wait_val, sizeof(wait_val)) <= 0)
+		return false;
+
+	return true;
+}
+
+static void virtio_bln_do_io(struct kvm *kvm, void *param)
+{
+	struct virt_queue *vq = param;
+
+	if (vq == &bdev.vqs[VIRTIO_BLN_STATS]) {
+		virtio_bln_do_stat_request(kvm, &bdev, vq);
+		bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, VIRTIO_BLN_STATS);
+		return;
+	}
+
+	while (virt_queue__available(vq)) {
+		virtio_bln_do_io_request(kvm, &bdev, vq);
+		bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, vq - bdev.vqs);
+	}
+}
+
+static int virtio_bln__collect_stats(struct kvm *kvm)
+{
+	u64 tmp;
+
+	virt_queue__set_used_elem(&bdev.vqs[VIRTIO_BLN_STATS], bdev.cur_stat_head,
+				  sizeof(struct virtio_balloon_stat));
+	bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, VIRTIO_BLN_STATS);
+
+	if (read(bdev.stat_waitfd, &tmp, sizeof(tmp)) <= 0)
+		return -EFAULT;
+
+	return 0;
+}
+
+static void virtio_bln__print_stats(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	int r;
+
+	if (WARN_ON(type != KVM_IPC_STAT || len))
+		return;
+
+	if (virtio_bln__collect_stats(kvm) < 0)
+		return;
+
+	r = write(fd, bdev.stats, sizeof(bdev.stats));
+	if (r < 0)
+		pr_warning("Failed sending memory stats");
+}
+
+static void handle_mem(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	int mem;
+
+	if (WARN_ON(type != KVM_IPC_BALLOON || len != sizeof(int)))
+		return;
+
+	mem = *(int *)msg;
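+	/* Each unit is 256 balloon pages, i.e. 1MiB with 4KiB pages */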
+	if (mem > 0) {
+		bdev.config.num_pages += 256 * mem;
+	} else if (mem < 0) {
+		if (bdev.config.num_pages < (u32)(256 * (-mem)))
+			return;
+
+		bdev.config.num_pages += 256 * mem;
+	}
+
+	/* Notify that the configuration space has changed */
+	bdev.vdev.ops->signal_config(kvm, &bdev.vdev);
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	struct bln_dev *bdev = dev;
+
+	return ((u8 *)(&bdev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	return 1 << VIRTIO_BALLOON_F_STATS_VQ;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	struct bln_dev *bdev = dev;
+
+	bdev->features = features;
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct bln_dev *bdev = dev;
+	struct virt_queue *queue;
+	void *p;
+
+	compat__remove_message(compat_id);
+
+	queue		= &bdev->vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+
+	thread_pool__init_job(&bdev->jobs[vq], kvm, virtio_bln_do_io, queue);
+	vring_init(&queue->vring, VIRTIO_BLN_QUEUE_SIZE, p, align);
+
+	return 0;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct bln_dev *bdev = dev;
+
+	thread_pool__do_job(&bdev->jobs[vq]);
+
+	return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct bln_dev *bdev = dev;
+
+	return bdev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	return VIRTIO_BLN_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	/* FIXME: dynamic */
+	return size;
+}
+
+struct virtio_ops bln_dev_virtio_ops = (struct virtio_ops) {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.init_vq		= init_vq,
+	.notify_vq		= notify_vq,
+	.get_pfn_vq		= get_pfn_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq            = set_size_vq,
+};
+
+int virtio_bln__init(struct kvm *kvm)
+{
+	if (!kvm->cfg.balloon)
+		return 0;
+
+	kvm_ipc__register_handler(KVM_IPC_BALLOON, handle_mem);
+	kvm_ipc__register_handler(KVM_IPC_STAT, virtio_bln__print_stats);
+
+	bdev.stat_waitfd	= eventfd(0, 0);
+	memset(&bdev.config, 0, sizeof(struct virtio_balloon_config));
+
+	virtio_init(kvm, &bdev, &bdev.vdev, &bln_dev_virtio_ops,
+		    VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_BLN,
+		    VIRTIO_ID_BALLOON, PCI_CLASS_BLN);
+
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-balloon", "CONFIG_VIRTIO_BALLOON");
+
+	return 0;
+}
+virtio_dev_init(virtio_bln__init);
+
+int virtio_bln__exit(struct kvm *kvm)
+{
+	return 0;
+}
+virtio_dev_exit(virtio_bln__exit);
diff --git a/tools/kvm/virtio/blk.c b/tools/kvm/virtio/blk.c
new file mode 100644
index 0000000..edfa8e6
--- /dev/null
+++ b/tools/kvm/virtio/blk.c
@@ -0,0 +1,338 @@
+#include "kvm/virtio-blk.h"
+
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/guest_compat.h"
+#include "kvm/virtio-pci.h"
+#include "kvm/virtio.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_blk.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <pthread.h>
+
+#define VIRTIO_BLK_MAX_DEV		4
+
+/*
+ * the request header and the status byte consume two entries
+ */
+#define DISK_SEG_MAX			(VIRTIO_BLK_QUEUE_SIZE - 2)
+#define VIRTIO_BLK_QUEUE_SIZE		256
+#define NUM_VIRT_QUEUES			1
+
+struct blk_dev_req {
+	struct virt_queue		*vq;
+	struct blk_dev			*bdev;
+	struct iovec			iov[VIRTIO_BLK_QUEUE_SIZE];
+	u16				out, in, head;
+	struct kvm			*kvm;
+};
+
+struct blk_dev {
+	struct mutex			mutex;
+
+	struct list_head		list;
+
+	struct virtio_device		vdev;
+	struct virtio_blk_config	blk_config;
+	struct disk_image		*disk;
+	u32				features;
+
+	struct virt_queue		vqs[NUM_VIRT_QUEUES];
+	struct blk_dev_req		reqs[VIRTIO_BLK_QUEUE_SIZE];
+
+	pthread_t			io_thread;
+	int				io_efd;
+
+	struct kvm			*kvm;
+};
+
+static LIST_HEAD(bdevs);
+static int compat_id = -1;
+
+void virtio_blk_complete(void *param, long len)
+{
+	struct blk_dev_req *req = param;
+	struct blk_dev *bdev = req->bdev;
+	int queueid = req->vq - bdev->vqs;
+	u8 *status;
+
+	/* status */
+	status	= req->iov[req->out + req->in - 1].iov_base;
+	*status	= (len < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+
+	mutex_lock(&bdev->mutex);
+	virt_queue__set_used_elem(req->vq, req->head, len);
+	mutex_unlock(&bdev->mutex);
+
+	if (virtio_queue__should_signal(&bdev->vqs[queueid]))
+		bdev->vdev.ops->signal_vq(req->kvm, &bdev->vdev, queueid);
+}
+
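+/*
+ * iov[0] holds the virtio_blk_outhdr and the last element the status
+ * byte, so the data payload spans the 'in + out - 2' elements between
+ * them.
+ */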
+static void virtio_blk_do_io_request(struct kvm *kvm, struct virt_queue *vq, struct blk_dev_req *req)
+{
+	struct virtio_blk_outhdr *req_hdr;
+	ssize_t block_cnt;
+	struct blk_dev *bdev;
+	struct iovec *iov;
+	u16 out, in;
+	u32 type;
+	u64 sector;
+
+	block_cnt	= -1;
+	bdev		= req->bdev;
+	iov		= req->iov;
+	out		= req->out;
+	in		= req->in;
+	req_hdr		= iov[0].iov_base;
+
+	type = virtio_guest_to_host_u32(vq, req_hdr->type);
+	sector = virtio_guest_to_host_u64(vq, req_hdr->sector);
+
+	switch (type) {
+	case VIRTIO_BLK_T_IN:
+		block_cnt = disk_image__read(bdev->disk, sector,
+				iov + 1, in + out - 2, req);
+		break;
+	case VIRTIO_BLK_T_OUT:
+		block_cnt = disk_image__write(bdev->disk, sector,
+				iov + 1, in + out - 2, req);
+		break;
+	case VIRTIO_BLK_T_FLUSH:
+		block_cnt = disk_image__flush(bdev->disk);
+		virtio_blk_complete(req, block_cnt);
+		break;
+	case VIRTIO_BLK_T_GET_ID:
+		block_cnt = VIRTIO_BLK_ID_BYTES;
+		disk_image__get_serial(bdev->disk,
+				(iov + 1)->iov_base, &block_cnt);
+		virtio_blk_complete(req, block_cnt);
+		break;
+	default:
+		pr_warning("request type %d", type);
+		block_cnt	= -1;
+		break;
+	}
+}
+
+static void virtio_blk_do_io(struct kvm *kvm, struct virt_queue *vq, struct blk_dev *bdev)
+{
+	struct blk_dev_req *req;
+	u16 head;
+
+	while (virt_queue__available(vq)) {
+		head		= virt_queue__pop(vq);
+		req		= &bdev->reqs[head];
+		req->head	= virt_queue__get_head_iov(vq, req->iov, &req->out,
+					&req->in, head, kvm);
+		req->vq		= vq;
+
+		virtio_blk_do_io_request(kvm, vq, req);
+	}
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	struct blk_dev *bdev = dev;
+
+	return ((u8 *)(&bdev->blk_config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	return	1UL << VIRTIO_BLK_F_SEG_MAX
+		| 1UL << VIRTIO_BLK_F_FLUSH
+		| 1UL << VIRTIO_RING_F_EVENT_IDX
+		| 1UL << VIRTIO_RING_F_INDIRECT_DESC;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	struct blk_dev *bdev = dev;
+	struct virtio_blk_config *conf = &bdev->blk_config;
+	struct virtio_blk_geometry *geo = &conf->geometry;
+
+	bdev->features = features;
+
+	conf->capacity = virtio_host_to_guest_u64(&bdev->vdev, conf->capacity);
+	conf->size_max = virtio_host_to_guest_u32(&bdev->vdev, conf->size_max);
+	conf->seg_max = virtio_host_to_guest_u32(&bdev->vdev, conf->seg_max);
+
+	/* Geometry */
+	geo->cylinders = virtio_host_to_guest_u16(&bdev->vdev, geo->cylinders);
+
+	conf->blk_size = virtio_host_to_guest_u32(&bdev->vdev, conf->blk_size);
+	conf->min_io_size = virtio_host_to_guest_u16(&bdev->vdev, conf->min_io_size);
+	conf->opt_io_size = virtio_host_to_guest_u32(&bdev->vdev, conf->opt_io_size);
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct blk_dev *bdev = dev;
+	struct virt_queue *queue;
+	void *p;
+
+	compat__remove_message(compat_id);
+
+	queue		= &bdev->vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+
+	vring_init(&queue->vring, VIRTIO_BLK_QUEUE_SIZE, p, align);
+	virtio_init_device_vq(&bdev->vdev, queue);
+
+	return 0;
+}
+
+static void *virtio_blk_thread(void *dev)
+{
+	struct blk_dev *bdev = dev;
+	u64 data;
+	int r;
+
+	kvm__set_thread_name("virtio-blk-io");
+
+	while (1) {
+		r = read(bdev->io_efd, &data, sizeof(u64));
+		if (r < 0)
+			continue;
+		virtio_blk_do_io(bdev->kvm, &bdev->vqs[0], bdev);
+	}
+
+	pthread_exit(NULL);
+	return NULL;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct blk_dev *bdev = dev;
+	u64 data = 1;
+	int r;
+
+	r = write(bdev->io_efd, &data, sizeof(data));
+	if (r < 0)
+		return r;
+
+	return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct blk_dev *bdev = dev;
+
+	return bdev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	/* FIXME: dynamic */
+	return VIRTIO_BLK_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	/* FIXME: dynamic */
+	return size;
+}
+
+static struct virtio_ops blk_dev_virtio_ops = (struct virtio_ops) {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.init_vq		= init_vq,
+	.notify_vq		= notify_vq,
+	.get_pfn_vq		= get_pfn_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq		= set_size_vq,
+};
+
+static int virtio_blk__init_one(struct kvm *kvm, struct disk_image *disk)
+{
+	struct blk_dev *bdev;
+	unsigned int i;
+
+	if (!disk)
+		return -EINVAL;
+
+	bdev = calloc(1, sizeof(struct blk_dev));
+	if (bdev == NULL)
+		return -ENOMEM;
+
+	*bdev = (struct blk_dev) {
+		.mutex			= MUTEX_INITIALIZER,
+		.disk			= disk,
+		.blk_config		= (struct virtio_blk_config) {
+			.capacity	= disk->size / SECTOR_SIZE,
+			.seg_max	= DISK_SEG_MAX,
+		},
+		.io_efd			= eventfd(0, 0),
+		.kvm			= kvm,
+	};
+
+	virtio_init(kvm, bdev, &bdev->vdev, &blk_dev_virtio_ops,
+		    VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_BLK,
+		    VIRTIO_ID_BLOCK, PCI_CLASS_BLK);
+
+	list_add_tail(&bdev->list, &bdevs);
+
+	for (i = 0; i < ARRAY_SIZE(bdev->reqs); i++) {
+		bdev->reqs[i].bdev = bdev;
+		bdev->reqs[i].kvm = kvm;
+	}
+
+	disk_image__set_callback(bdev->disk, virtio_blk_complete);
+
+	pthread_create(&bdev->io_thread, NULL, virtio_blk_thread, bdev);
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-blk", "CONFIG_VIRTIO_BLK");
+
+	return 0;
+}
+
+static int virtio_blk__exit_one(struct kvm *kvm, struct blk_dev *bdev)
+{
+	list_del(&bdev->list);
+	free(bdev);
+
+	return 0;
+}
+
+int virtio_blk__init(struct kvm *kvm)
+{
+	int i, r = 0;
+
+	for (i = 0; i < kvm->nr_disks; i++) {
+		if (kvm->disks[i]->wwpn)
+			continue;
+		r = virtio_blk__init_one(kvm, kvm->disks[i]);
+		if (r < 0)
+			goto cleanup;
+	}
+
+	return 0;
+cleanup:
+	return virtio_blk__exit(kvm);
+}
+virtio_dev_init(virtio_blk__init);
+
+int virtio_blk__exit(struct kvm *kvm)
+{
+	while (!list_empty(&bdevs)) {
+		struct blk_dev *bdev;
+
+		bdev = list_first_entry(&bdevs, struct blk_dev, list);
+		virtio_blk__exit_one(kvm, bdev);
+	}
+
+	return 0;
+}
+virtio_dev_exit(virtio_blk__exit);
diff --git a/tools/kvm/virtio/console.c b/tools/kvm/virtio/console.c
new file mode 100644
index 0000000..384eac1
--- /dev/null
+++ b/tools/kvm/virtio/console.c
@@ -0,0 +1,232 @@
+#include "kvm/virtio-console.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/virtio.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/irq.h"
+#include "kvm/guest_compat.h"
+
+#include <linux/virtio_console.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_blk.h>
+
+#include <sys/uio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <termios.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define VIRTIO_CONSOLE_QUEUE_SIZE	128
+#define VIRTIO_CONSOLE_NUM_QUEUES	2
+#define VIRTIO_CONSOLE_RX_QUEUE		0
+#define VIRTIO_CONSOLE_TX_QUEUE		1
+
+struct con_dev {
+	struct mutex			mutex;
+
+	struct virtio_device		vdev;
+	struct virt_queue		vqs[VIRTIO_CONSOLE_NUM_QUEUES];
+	struct virtio_console_config	config;
+	u32				features;
+
+	pthread_cond_t			poll_cond;
+	int				vq_ready;
+
+	struct thread_pool__job		jobs[VIRTIO_CONSOLE_NUM_QUEUES];
+};
+
+static struct con_dev cdev = {
+	.mutex				= MUTEX_INITIALIZER,
+
+	.vq_ready			= 0,
+
+	.config = {
+		.cols			= 80,
+		.rows			= 24,
+		.max_nr_ports		= 1,
+	},
+};
+
+static int compat_id = -1;
+
+/*
+ * Interrupts are injected for hvc0 only.
+ */
+static void virtio_console__inject_interrupt_callback(struct kvm *kvm, void *param)
+{
+	struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE];
+	struct virt_queue *vq;
+	u16 out, in;
+	u16 head;
+	int len;
+
+	if (kvm->cfg.active_console != CONSOLE_VIRTIO)
+		return;
+
+	mutex_lock(&cdev.mutex);
+
+	vq = param;
+
+	while (!cdev.vq_ready)
+		pthread_cond_wait(&cdev.poll_cond, &cdev.mutex.mutex);
+
+	if (term_readable(0) && virt_queue__available(vq)) {
+		head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+		len = term_getc_iov(kvm, iov, in, 0);
+		virt_queue__set_used_elem(vq, head, len);
+		cdev.vdev.ops->signal_vq(kvm, &cdev.vdev, vq - cdev.vqs);
+	}
+
+	mutex_unlock(&cdev.mutex);
+}
+
+void virtio_console__inject_interrupt(struct kvm *kvm)
+{
+	virtio_console__inject_interrupt_callback(kvm,
+					&cdev.vqs[VIRTIO_CONSOLE_RX_QUEUE]);
+}
+
+static void virtio_console_handle_callback(struct kvm *kvm, void *param)
+{
+	struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE];
+	struct virt_queue *vq;
+	u16 out, in;
+	u16 head;
+	u32 len;
+
+	vq = param;
+
+	/*
+	 * The current Linux implementation polls for the buffer
+	 * to be used, rather than waiting for an interrupt.
+	 * So there is no need to inject an interrupt for the tx path.
+	 */
+
+	while (virt_queue__available(vq)) {
+		head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+		len = term_putc_iov(iov, out, 0);
+		virt_queue__set_used_elem(vq, head, len);
+	}
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	struct con_dev *cdev = dev;
+
+	return ((u8 *)(&cdev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	return 0;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	struct con_dev *cdev = dev;
+	struct virtio_console_config *conf = &cdev->config;
+
+	conf->cols = virtio_host_to_guest_u16(&cdev->vdev, conf->cols);
+	conf->rows = virtio_host_to_guest_u16(&cdev->vdev, conf->rows);
+	conf->max_nr_ports = virtio_host_to_guest_u32(&cdev->vdev, conf->max_nr_ports);
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct virt_queue *queue;
+	void *p;
+
+	BUG_ON(vq >= VIRTIO_CONSOLE_NUM_QUEUES);
+
+	compat__remove_message(compat_id);
+
+	queue		= &cdev.vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+
+	vring_init(&queue->vring, VIRTIO_CONSOLE_QUEUE_SIZE, p, align);
+	virtio_init_device_vq(&cdev.vdev, queue);
+
+	if (vq == VIRTIO_CONSOLE_TX_QUEUE) {
+		thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console_handle_callback, queue);
+	} else if (vq == VIRTIO_CONSOLE_RX_QUEUE) {
+		thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console__inject_interrupt_callback, queue);
+		/* Tell the waiting poll thread that we're ready to go */
+		mutex_lock(&cdev.mutex);
+		cdev.vq_ready = 1;
+		pthread_cond_signal(&cdev.poll_cond);
+		mutex_unlock(&cdev.mutex);
+	}
+
+	return 0;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct con_dev *cdev = dev;
+
+	thread_pool__do_job(&cdev->jobs[vq]);
+
+	return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct con_dev *cdev = dev;
+
+	return cdev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	return VIRTIO_CONSOLE_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	/* FIXME: dynamic */
+	return size;
+}
+
+static struct virtio_ops con_dev_virtio_ops = (struct virtio_ops) {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.init_vq		= init_vq,
+	.notify_vq		= notify_vq,
+	.get_pfn_vq		= get_pfn_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq		= set_size_vq,
+};
+
+int virtio_console__init(struct kvm *kvm)
+{
+	if (kvm->cfg.active_console != CONSOLE_VIRTIO)
+		return 0;
+
+	pthread_cond_init(&cdev.poll_cond, NULL);
+
+	virtio_init(kvm, &cdev, &cdev.vdev, &con_dev_virtio_ops,
+		    VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_CONSOLE,
+		    VIRTIO_ID_CONSOLE, PCI_CLASS_CONSOLE);
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-console", "CONFIG_VIRTIO_CONSOLE");
+
+	return 0;
+}
+virtio_dev_init(virtio_console__init);
+
+int virtio_console__exit(struct kvm *kvm)
+{
+	return 0;
+}
+virtio_dev_exit(virtio_console__exit);
diff --git a/tools/kvm/virtio/core.c b/tools/kvm/virtio/core.c
new file mode 100644
index 0000000..9ae7887
--- /dev/null
+++ b/tools/kvm/virtio/core.c
@@ -0,0 +1,242 @@
+#include <linux/virtio_ring.h>
+#include <linux/types.h>
+#include <sys/uio.h>
+#include <stdlib.h>
+
+#include "kvm/guest_compat.h"
+#include "kvm/barrier.h"
+#include "kvm/virtio.h"
+#include "kvm/virtio-pci.h"
+#include "kvm/virtio-mmio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+
+struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len)
+{
+	struct vring_used_elem *used_elem;
+	u16 idx = virtio_guest_to_host_u16(queue, queue->vring.used->idx);
+
+	used_elem	= &queue->vring.used->ring[idx % queue->vring.num];
+	used_elem->id	= virtio_host_to_guest_u32(queue, head);
+	used_elem->len	= virtio_host_to_guest_u32(queue, len);
+
+	/*
+	 * Use wmb to ensure that the used element was updated with head and len.
+	 * We need a wmb here since we can't advance idx unless we're ready
+	 * to pass the used element to the guest.
+	 */
+	wmb();
+	idx++;
+	queue->vring.used->idx = virtio_host_to_guest_u16(queue, idx);
+
+	/*
+	 * Use wmb to ensure the used idx has been increased before we signal the guest.
+	 * Without a wmb here the guest may ignore the queue since it won't see
+	 * an updated idx.
+	 */
+	wmb();
+
+	return used_elem;
+}
+
+static inline bool virt_desc__test_flag(struct virt_queue *vq,
+					struct vring_desc *desc, u16 flag)
+{
+	return !!(virtio_guest_to_host_u16(vq, desc->flags) & flag);
+}
+
+/*
+ * Each buffer in the virtqueues is actually a chain of descriptors.  This
+ * function returns the next descriptor in the chain, or vq->vring.num if we're
+ * at the end.
+ */
+static unsigned next_desc(struct virt_queue *vq, struct vring_desc *desc,
+			  unsigned int i, unsigned int max)
+{
+	unsigned int next;
+
+	/* If this descriptor says it doesn't chain, we're done. */
+	if (!virt_desc__test_flag(vq, &desc[i], VRING_DESC_F_NEXT))
+		return max;
+
+	/* Check they're not leading us off the end of the descriptor table. */
+	next = virtio_guest_to_host_u16(vq, desc[i].next);
+	/* Make sure compiler knows to grab that: we don't want it changing! */
+	wmb();
+
+	return next;
+}
+
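+/*
+ * An INDIRECT descriptor points at a separate table of descriptors in
+ * guest memory; when one is seen, the walk switches to that table and
+ * its length becomes the new chain limit.
+ */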
+u16 virt_queue__get_head_iov(struct virt_queue *vq, struct iovec iov[], u16 *out, u16 *in, u16 head, struct kvm *kvm)
+{
+	struct vring_desc *desc;
+	u16 idx;
+	u16 max;
+
+	idx = head;
+	*out = *in = 0;
+	max = vq->vring.num;
+	desc = vq->vring.desc;
+
+	if (virt_desc__test_flag(vq, &desc[idx], VRING_DESC_F_INDIRECT)) {
+		max = virtio_guest_to_host_u32(vq, desc[idx].len) / sizeof(struct vring_desc);
+		desc = guest_flat_to_host(kvm, virtio_guest_to_host_u64(vq, desc[idx].addr));
+		idx = 0;
+	}
+
+	do {
+		/* Grab the first descriptor, and check it's OK. */
+		iov[*out + *in].iov_len = virtio_guest_to_host_u32(vq, desc[idx].len);
+		iov[*out + *in].iov_base = guest_flat_to_host(kvm,
+							      virtio_guest_to_host_u64(vq, desc[idx].addr));
+		/* If this is an input descriptor, increment that count. */
+		if (virt_desc__test_flag(vq, &desc[idx], VRING_DESC_F_WRITE))
+			(*in)++;
+		else
+			(*out)++;
+	} while ((idx = next_desc(vq, desc, idx, max)) != max);
+
+	return head;
+}
+
+u16 virt_queue__get_iov(struct virt_queue *vq, struct iovec iov[], u16 *out, u16 *in, struct kvm *kvm)
+{
+	u16 head;
+
+	head = virt_queue__pop(vq);
+
+	return virt_queue__get_head_iov(vq, iov, out, in, head, kvm);
+}
+
+/* in and out are relative to guest */
+u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue,
+			      struct iovec in_iov[], struct iovec out_iov[],
+			      u16 *in, u16 *out)
+{
+	struct vring_desc *desc;
+	u16 head, idx;
+
+	idx = head = virt_queue__pop(queue);
+	*out = *in = 0;
+	do {
+		u64 addr;
+		desc = virt_queue__get_desc(queue, idx);
+		addr = virtio_guest_to_host_u64(queue, desc->addr);
+		if (virt_desc__test_flag(queue, desc, VRING_DESC_F_WRITE)) {
+			in_iov[*in].iov_base = guest_flat_to_host(kvm, addr);
+			in_iov[*in].iov_len = virtio_guest_to_host_u32(queue, desc->len);
+			(*in)++;
+		} else {
+			out_iov[*out].iov_base = guest_flat_to_host(kvm, addr);
+			out_iov[*out].iov_len = virtio_guest_to_host_u32(queue, desc->len);
+			(*out)++;
+		}
+		if (virt_desc__test_flag(queue, desc, VRING_DESC_F_NEXT))
+			idx = virtio_guest_to_host_u16(queue, desc->next);
+		else
+			break;
+	} while (1);
+
+	return head;
+}
+
+int virtio__get_dev_specific_field(int offset, bool msix, u32 *config_off)
+{
+	if (msix) {
+		if (offset < 4)
+			return VIRTIO_PCI_O_MSIX;
+		else
+			offset -= 4;
+	}
+
+	*config_off = offset;
+
+	return VIRTIO_PCI_O_CONFIG;
+}
+
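+/*
+ * With VIRTIO_RING_F_EVENT_IDX the guest publishes the used index it
+ * wants to be notified at; only signal once the new used index has
+ * moved past that threshold.
+ */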
+bool virtio_queue__should_signal(struct virt_queue *vq)
+{
+	u16 old_idx, new_idx, event_idx;
+
+	old_idx		= vq->last_used_signalled;
+	new_idx		= virtio_guest_to_host_u16(vq, vq->vring.used->idx);
+	event_idx	= virtio_guest_to_host_u16(vq, vring_used_event(&vq->vring));
+
+	if (vring_need_event(event_idx, new_idx, old_idx)) {
+		vq->last_used_signalled = new_idx;
+		return true;
+	}
+
+	return false;
+}
+
+int virtio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		struct virtio_ops *ops, enum virtio_trans trans,
+		int device_id, int subsys_id, int class)
+{
+	void *virtio;
+
+	switch (trans) {
+	case VIRTIO_PCI:
+		virtio = calloc(sizeof(struct virtio_pci), 1);
+		if (!virtio)
+			return -ENOMEM;
+		vdev->virtio			= virtio;
+		vdev->ops			= ops;
+		vdev->ops->signal_vq		= virtio_pci__signal_vq;
+		vdev->ops->signal_config	= virtio_pci__signal_config;
+		vdev->ops->init			= virtio_pci__init;
+		vdev->ops->exit			= virtio_pci__exit;
+		vdev->ops->init(kvm, dev, vdev, device_id, subsys_id, class);
+		break;
+	case VIRTIO_MMIO:
+		virtio = calloc(sizeof(struct virtio_mmio), 1);
+		if (!virtio)
+			return -ENOMEM;
+		vdev->virtio			= virtio;
+		vdev->ops			= ops;
+		vdev->ops->signal_vq		= virtio_mmio_signal_vq;
+		vdev->ops->signal_config	= virtio_mmio_signal_config;
+		vdev->ops->init			= virtio_mmio_init;
+		vdev->ops->exit			= virtio_mmio_exit;
+		vdev->ops->init(kvm, dev, vdev, device_id, subsys_id, class);
+		break;
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+int virtio_compat_add_message(const char *device, const char *config)
+{
+	int len = 1024;
+	int compat_id;
+	char *title;
+	char *desc;
+
+	title = malloc(len);
+	if (!title)
+		return -ENOMEM;
+
+	desc = malloc(len);
+	if (!desc) {
+		free(title);
+		return -ENOMEM;
+	}
+
+	snprintf(title, len, "%s device was not detected.", device);
+	snprintf(desc,  len, "While you have requested a %s device, "
+			     "the guest kernel did not initialize it.\n"
+			     "\tPlease make sure that the guest kernel was "
+			     "compiled with %s=y enabled in .config.",
+			     device, config);
+
+	compat_id = compat__add_message(title, desc);
+
+	free(desc);
+	free(title);
+
+	return compat_id;
+}
diff --git a/tools/kvm/virtio/mmio.c b/tools/kvm/virtio/mmio.c
new file mode 100644
index 0000000..3a2bd62
--- /dev/null
+++ b/tools/kvm/virtio/mmio.c
@@ -0,0 +1,323 @@
+#include "kvm/devices.h"
+#include "kvm/virtio-mmio.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/ioport.h"
+#include "kvm/virtio.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/irq.h"
+#include "kvm/fdt.h"
+
+#include <linux/virtio_mmio.h>
+#include <string.h>
+
+static u32 virtio_mmio_io_space_blocks = KVM_VIRTIO_MMIO_AREA;
+
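+/* Carve MMIO windows for devices sequentially out of the virtio-mmio area */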
+static u32 virtio_mmio_get_io_space_block(u32 size)
+{
+	u32 block = virtio_mmio_io_space_blocks;
+	virtio_mmio_io_space_blocks += size;
+
+	return block;
+}
+
+static void virtio_mmio_ioevent_callback(struct kvm *kvm, void *param)
+{
+	struct virtio_mmio_ioevent_param *ioeventfd = param;
+	struct virtio_mmio *vmmio = ioeventfd->vdev->virtio;
+
+	ioeventfd->vdev->ops->notify_vq(kvm, vmmio->dev, ioeventfd->vq);
+}
+
+static int virtio_mmio_init_ioeventfd(struct kvm *kvm,
+				      struct virtio_device *vdev, u32 vq)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	struct ioevent ioevent;
+	int err;
+
+	vmmio->ioeventfds[vq] = (struct virtio_mmio_ioevent_param) {
+		.vdev		= vdev,
+		.vq		= vq,
+	};
+
+	ioevent = (struct ioevent) {
+		.io_addr	= vmmio->addr + VIRTIO_MMIO_QUEUE_NOTIFY,
+		.io_len		= sizeof(u32),
+		.fn		= virtio_mmio_ioevent_callback,
+		.fn_ptr		= &vmmio->ioeventfds[vq],
+		.datamatch	= vq,
+		.fn_kvm		= kvm,
+		.fd		= eventfd(0, 0),
+	};
+
+	if (vdev->use_vhost)
+		/*
+		 * Vhost will poll the eventfd in host kernel side,
+		 * no need to poll in userspace.
+		 */
+		err = ioeventfd__add_event(&ioevent, 0);
+	else
+		/* Need to poll in userspace. */
+		err = ioeventfd__add_event(&ioevent, IOEVENTFD_FLAG_USER_POLL);
+	if (err)
+		return err;
+
+	if (vdev->ops->notify_vq_eventfd)
+		vdev->ops->notify_vq_eventfd(kvm, vmmio->dev, vq, ioevent.fd);
+
+	return 0;
+}
+
+int virtio_mmio_signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+
+	vmmio->hdr.interrupt_state |= VIRTIO_MMIO_INT_VRING;
+	kvm__irq_trigger(vmmio->kvm, vmmio->irq);
+
+	return 0;
+}
+
+int virtio_mmio_signal_config(struct kvm *kvm, struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+
+	vmmio->hdr.interrupt_state |= VIRTIO_MMIO_INT_CONFIG;
+	kvm__irq_trigger(vmmio->kvm, vmmio->irq);
+
+	return 0;
+}
+
+static void virtio_mmio_device_specific(struct kvm_cpu *vcpu,
+					u64 addr, u8 *data, u32 len,
+					u8 is_write, struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	u32 i;
+
+	for (i = 0; i < len; i++) {
+		if (is_write)
+			vdev->ops->get_config(vmmio->kvm, vmmio->dev)[addr + i] =
+					      data[i];
+		else
+			data[i] = vdev->ops->get_config(vmmio->kvm,
+							vmmio->dev)[addr + i];
+	}
+}
+
+static void virtio_mmio_config_in(struct kvm_cpu *vcpu,
+				  u64 addr, void *data, u32 len,
+				  struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	u32 val = 0;
+
+	switch (addr) {
+	case VIRTIO_MMIO_MAGIC_VALUE:
+	case VIRTIO_MMIO_VERSION:
+	case VIRTIO_MMIO_DEVICE_ID:
+	case VIRTIO_MMIO_VENDOR_ID:
+	case VIRTIO_MMIO_STATUS:
+	case VIRTIO_MMIO_INTERRUPT_STATUS:
+		ioport__write32(data, *(u32 *)(((void *)&vmmio->hdr) + addr));
+		break;
+	case VIRTIO_MMIO_HOST_FEATURES:
+		if (vmmio->hdr.host_features_sel == 0)
+			val = vdev->ops->get_host_features(vmmio->kvm,
+							   vmmio->dev);
+		ioport__write32(data, val);
+		break;
+	case VIRTIO_MMIO_QUEUE_PFN:
+		val = vdev->ops->get_pfn_vq(vmmio->kvm, vmmio->dev,
+					    vmmio->hdr.queue_sel);
+		ioport__write32(data, val);
+		break;
+	case VIRTIO_MMIO_QUEUE_NUM_MAX:
+		val = vdev->ops->get_size_vq(vmmio->kvm, vmmio->dev,
+					     vmmio->hdr.queue_sel);
+		ioport__write32(data, val);
+		break;
+	default:
+		break;
+	}
+}
+
+static void virtio_mmio_config_out(struct kvm_cpu *vcpu,
+				   u64 addr, void *data, u32 len,
+				   struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	struct kvm *kvm = vmmio->kvm;
+	u32 val = 0;
+
+	switch (addr) {
+	case VIRTIO_MMIO_HOST_FEATURES_SEL:
+	case VIRTIO_MMIO_GUEST_FEATURES_SEL:
+	case VIRTIO_MMIO_QUEUE_SEL:
+		val = ioport__read32(data);
+		*(u32 *)(((void *)&vmmio->hdr) + addr) = val;
+		break;
+	case VIRTIO_MMIO_STATUS:
+		vmmio->hdr.status = ioport__read32(data);
+		if (!vmmio->hdr.status) /* Sample endianness on reset */
+			vdev->endian = kvm_cpu__get_endianness(vcpu);
+		if (vdev->ops->notify_status)
+			vdev->ops->notify_status(kvm, vmmio->dev, vmmio->hdr.status);
+		break;
+	case VIRTIO_MMIO_GUEST_FEATURES:
+		if (vmmio->hdr.guest_features_sel == 0) {
+			val = ioport__read32(data);
+			vdev->ops->set_guest_features(vmmio->kvm,
+						      vmmio->dev, val);
+		}
+		break;
+	case VIRTIO_MMIO_GUEST_PAGE_SIZE:
+		val = ioport__read32(data);
+		vmmio->hdr.guest_page_size = val;
+		break;
+	case VIRTIO_MMIO_QUEUE_NUM:
+		val = ioport__read32(data);
+		vmmio->hdr.queue_num = val;
+		vdev->ops->set_size_vq(vmmio->kvm, vmmio->dev,
+				       vmmio->hdr.queue_sel, val);
+		break;
+	case VIRTIO_MMIO_QUEUE_ALIGN:
+		val = ioport__read32(data);
+		vmmio->hdr.queue_align = val;
+		break;
+	case VIRTIO_MMIO_QUEUE_PFN:
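+		/*
+		 * In legacy (version 1) virtio-mmio the guest writes the
+		 * queue PFN last, so this write doubles as the "queue is
+		 * ready" signal: hook up the ioeventfd and set up the vring.
+		 */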
+		val = ioport__read32(data);
+		virtio_mmio_init_ioeventfd(vmmio->kvm, vdev, vmmio->hdr.queue_sel);
+		vdev->ops->init_vq(vmmio->kvm, vmmio->dev,
+				   vmmio->hdr.queue_sel,
+				   vmmio->hdr.guest_page_size,
+				   vmmio->hdr.queue_align,
+				   val);
+		break;
+	case VIRTIO_MMIO_QUEUE_NOTIFY:
+		val = ioport__read32(data);
+		vdev->ops->notify_vq(vmmio->kvm, vmmio->dev, val);
+		break;
+	case VIRTIO_MMIO_INTERRUPT_ACK:
+		val = ioport__read32(data);
+		vmmio->hdr.interrupt_state &= ~val;
+		break;
+	default:
+		break;
+	}
+}
+
+static void virtio_mmio_mmio_callback(struct kvm_cpu *vcpu,
+				      u64 addr, u8 *data, u32 len,
+				      u8 is_write, void *ptr)
+{
+	struct virtio_device *vdev = ptr;
+	struct virtio_mmio *vmmio = vdev->virtio;
+	u32 offset = addr - vmmio->addr;
+
+	if (offset >= VIRTIO_MMIO_CONFIG) {
+		offset -= VIRTIO_MMIO_CONFIG;
+		virtio_mmio_device_specific(vcpu, offset, data, len, is_write, ptr);
+		return;
+	}
+
+	if (is_write)
+		virtio_mmio_config_out(vcpu, offset, data, len, ptr);
+	else
+		virtio_mmio_config_in(vcpu, offset, data, len, ptr);
+}
+
+#ifdef CONFIG_HAS_LIBFDT
+#define DEVICE_NAME_MAX_LEN 32
+static void generate_virtio_mmio_fdt_node(void *fdt,
+					  struct device_header *dev_hdr,
+					  void (*generate_irq_prop)(void *fdt,
+								    u8 irq))
+{
+	char dev_name[DEVICE_NAME_MAX_LEN];
+	struct virtio_mmio *vmmio = container_of(dev_hdr,
+						 struct virtio_mmio,
+						 dev_hdr);
+	u64 addr = vmmio->addr;
+	u64 reg_prop[] = {
+		cpu_to_fdt64(addr),
+		cpu_to_fdt64(VIRTIO_MMIO_IO_SIZE),
+	};
+
+	snprintf(dev_name, DEVICE_NAME_MAX_LEN, "virtio@%llx", addr);
+
+	_FDT(fdt_begin_node(fdt, dev_name));
+	_FDT(fdt_property_string(fdt, "compatible", "virtio,mmio"));
+	_FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop)));
+	generate_irq_prop(fdt, vmmio->irq);
+	_FDT(fdt_end_node(fdt));
+}
+#else
+static void generate_virtio_mmio_fdt_node(void *fdt,
+					  struct device_header *dev_hdr,
+					  void (*generate_irq_prop)(void *fdt,
+								    u8 irq))
+{
+	die("Unable to generate device tree nodes without libfdt\n");
+}
+#endif
+
+void virtio_mmio_assign_irq(struct device_header *dev_hdr)
+{
+	struct virtio_mmio *vmmio = container_of(dev_hdr,
+						 struct virtio_mmio,
+						 dev_hdr);
+	vmmio->irq = irq__alloc_line();
+}
+
+int virtio_mmio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		     int device_id, int subsys_id, int class)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+
+	vmmio->addr	= virtio_mmio_get_io_space_block(VIRTIO_MMIO_IO_SIZE);
+	vmmio->kvm	= kvm;
+	vmmio->dev	= dev;
+
+	kvm__register_mmio(kvm, vmmio->addr, VIRTIO_MMIO_IO_SIZE,
+			   false, virtio_mmio_mmio_callback, vdev);
+
+	vmmio->hdr = (struct virtio_mmio_hdr) {
+		.magic		= {'v', 'i', 'r', 't'},
+		.version	= 1,
+		.device_id	= subsys_id,
+		.vendor_id	= 0x4d564b4c, /* 'LKVM' */
+		.queue_num_max	= 256,
+	};
+
+	vmmio->dev_hdr = (struct device_header) {
+		.bus_type	= DEVICE_BUS_MMIO,
+		.data		= generate_virtio_mmio_fdt_node,
+	};
+
+	device__register(&vmmio->dev_hdr);
+
+	/*
+	 * Instantiate guest virtio-mmio devices using kernel command line
+	 * (or module) parameter, e.g.:
+	 *
+	 * virtio_mmio.devices=0x200@0xd2000000:5,0x200@0xd2000200:6
+	 */
+	pr_info("virtio-mmio.devices=0x%x@0x%x:%d\n", VIRTIO_MMIO_IO_SIZE, vmmio->addr, vmmio->irq);
+
+	return 0;
+}
+
+int virtio_mmio_exit(struct kvm *kvm, struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	int i;
+
+	kvm__deregister_mmio(kvm, vmmio->addr);
+
+	for (i = 0; i < VIRTIO_MMIO_MAX_VQ; i++)
+		ioeventfd__del_event(vmmio->addr + VIRTIO_MMIO_QUEUE_NOTIFY, i);
+
+	return 0;
+}
diff --git a/tools/kvm/virtio/net.c b/tools/kvm/virtio/net.c
new file mode 100644
index 0000000..c8af385
--- /dev/null
+++ b/tools/kvm/virtio/net.c
@@ -0,0 +1,855 @@
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/virtio-net.h"
+#include "kvm/virtio.h"
+#include "kvm/types.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/irq.h"
+#include "kvm/uip.h"
+#include "kvm/guest_compat.h"
+#include "kvm/iovec.h"
+
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/if_tun.h>
+#include <linux/types.h>
+
+#include <arpa/inet.h>
+#include <net/if.h>
+
+#include <unistd.h>
+#include <fcntl.h>
+
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/eventfd.h>
+
+#define VIRTIO_NET_QUEUE_SIZE		256
+#define VIRTIO_NET_NUM_QUEUES		8
+
+struct net_dev;
+
+struct net_dev_operations {
+	int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
+	int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev);
+};
+
+struct net_dev {
+	struct mutex			mutex;
+	struct virtio_device		vdev;
+	struct list_head		list;
+
+	struct virt_queue		vqs[VIRTIO_NET_NUM_QUEUES * 2 + 1];
+	struct virtio_net_config	config;
+	u32				features, rx_vqs, tx_vqs, queue_pairs;
+
+	pthread_t			io_thread[VIRTIO_NET_NUM_QUEUES * 2 + 1];
+	struct mutex			io_lock[VIRTIO_NET_NUM_QUEUES * 2 + 1];
+	pthread_cond_t			io_cond[VIRTIO_NET_NUM_QUEUES * 2 + 1];
+
+	int				vhost_fd;
+	int				tap_fd;
+	char				tap_name[IFNAMSIZ];
+
+	int				mode;
+
+	struct uip_info			info;
+	struct net_dev_operations	*ops;
+	struct kvm			*kvm;
+
+	struct virtio_net_params	*params;
+};
+
+static LIST_HEAD(ndevs);
+static int compat_id = -1;
+
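+/* Presumably 64KiB of payload plus a 14-byte Ethernet header (65536 + 14). */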
+#define MAX_PACKET_SIZE 65550
+
+static bool has_virtio_feature(struct net_dev *ndev, u32 feature)
+{
+	return ndev->features & (1 << feature);
+}
+
+static void virtio_net_fix_tx_hdr(struct virtio_net_hdr *hdr, struct net_dev *ndev)
+{
+	hdr->hdr_len		= virtio_guest_to_host_u16(&ndev->vdev, hdr->hdr_len);
+	hdr->gso_size		= virtio_guest_to_host_u16(&ndev->vdev, hdr->gso_size);
+	hdr->csum_start		= virtio_guest_to_host_u16(&ndev->vdev, hdr->csum_start);
+	hdr->csum_offset	= virtio_guest_to_host_u16(&ndev->vdev, hdr->csum_offset);
+}
+
+static void virtio_net_fix_rx_hdr(struct virtio_net_hdr_mrg_rxbuf *hdr, struct net_dev *ndev)
+{
+	hdr->hdr.hdr_len	= virtio_host_to_guest_u16(&ndev->vdev, hdr->hdr.hdr_len);
+	hdr->hdr.gso_size	= virtio_host_to_guest_u16(&ndev->vdev, hdr->hdr.gso_size);
+	hdr->hdr.csum_start	= virtio_host_to_guest_u16(&ndev->vdev, hdr->hdr.csum_start);
+	hdr->hdr.csum_offset	= virtio_host_to_guest_u16(&ndev->vdev, hdr->hdr.csum_offset);
+	if (has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF))
+		hdr->num_buffers	= virtio_host_to_guest_u16(&ndev->vdev, hdr->num_buffers);
+}
+
+static void *virtio_net_rx_thread(void *p)
+{
+	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
+	struct virt_queue *vq;
+	struct kvm *kvm;
+	struct net_dev *ndev = p;
+	u16 out, in;
+	u16 head;
+	int len, copied;
+	u32 id;
+
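+	/*
+	 * Virtqueues come in RX/TX pairs: even indices receive and odd
+	 * ones transmit; with multiqueue a final control queue follows.
+	 */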
+	mutex_lock(&ndev->mutex);
+	id = ndev->rx_vqs++ * 2;
+	mutex_unlock(&ndev->mutex);
+
+	kvm__set_thread_name("virtio-net-rx");
+
+	kvm = ndev->kvm;
+	vq = &ndev->vqs[id];
+
+	while (1) {
+		mutex_lock(&ndev->io_lock[id]);
+		if (!virt_queue__available(vq))
+			pthread_cond_wait(&ndev->io_cond[id], &ndev->io_lock[id].mutex);
+		mutex_unlock(&ndev->io_lock[id]);
+
+		while (virt_queue__available(vq)) {
+			unsigned char buffer[MAX_PACKET_SIZE + sizeof(struct virtio_net_hdr_mrg_rxbuf)];
+			struct iovec dummy_iov = {
+				.iov_base = buffer,
+				.iov_len  = sizeof(buffer),
+			};
+			struct virtio_net_hdr_mrg_rxbuf *hdr;
+			int i;
+
+			len = ndev->ops->rx(&dummy_iov, 1, ndev);
+			if (len < 0) {
+				pr_warning("%s: rx on vq %u failed (%d), exiting thread\n",
+						__func__, id, len);
+				goto out_err;
+			}
+
+			copied = i = 0;
+			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+			hdr = iov[0].iov_base;
+			while (copied < len) {
+				size_t iovsize = min_t(size_t, len - copied, iov_size(iov, in));
+
+				memcpy_toiovec(iov, buffer + copied, iovsize);
+				copied += iovsize;
+				if (i++ == 0)
+					virtio_net_fix_rx_hdr(hdr, ndev);
+				if (has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF)) {
+					u16 num_buffers = virtio_guest_to_host_u16(vq, hdr->num_buffers);
+					hdr->num_buffers = virtio_host_to_guest_u16(vq, num_buffers + 1);
+				}
+				virt_queue__set_used_elem(vq, head, iovsize);
+				if (copied == len)
+					break;
+				while (!virt_queue__available(vq))
+					sleep(0);
+				head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+			}
+			/* We should interrupt the guest right now, otherwise latency is huge. */
+			if (virtio_queue__should_signal(vq))
+				ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, id);
+		}
+	}
+
+out_err:
+	pthread_exit(NULL);
+	return NULL;
+
+}
+
+static void *virtio_net_tx_thread(void *p)
+{
+	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
+	struct virt_queue *vq;
+	struct kvm *kvm;
+	struct net_dev *ndev = p;
+	u16 out, in;
+	u16 head;
+	int len;
+	u32 id;
+
+	mutex_lock(&ndev->mutex);
+	id = ndev->tx_vqs++ * 2 + 1;
+	mutex_unlock(&ndev->mutex);
+
+	kvm__set_thread_name("virtio-net-tx");
+
+	kvm = ndev->kvm;
+	vq = &ndev->vqs[id];
+
+	while (1) {
+		mutex_lock(&ndev->io_lock[id]);
+		if (!virt_queue__available(vq))
+			pthread_cond_wait(&ndev->io_cond[id], &ndev->io_lock[id].mutex);
+		mutex_unlock(&ndev->io_lock[id]);
+
+		while (virt_queue__available(vq)) {
+			struct virtio_net_hdr *hdr;
+			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+			hdr = iov[0].iov_base;
+			virtio_net_fix_tx_hdr(hdr, ndev);
+			len = ndev->ops->tx(iov, out, ndev);
+			if (len < 0) {
+				pr_warning("%s: tx on vq %u failed (%d)\n",
+						__func__, id, errno);
+				goto out_err;
+			}
+
+			virt_queue__set_used_elem(vq, head, len);
+		}
+
+		if (virtio_queue__should_signal(vq))
+			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, id);
+	}
+
+out_err:
+	pthread_exit(NULL);
+	return NULL;
+}
+
+static virtio_net_ctrl_ack virtio_net_handle_mq(struct kvm* kvm, struct net_dev *ndev, struct virtio_net_ctrl_hdr *ctrl)
+{
+	/* Not much to do here */
+	return VIRTIO_NET_OK;
+}
+
+static void *virtio_net_ctrl_thread(void *p)
+{
+	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
+	u16 out, in, head;
+	struct net_dev *ndev = p;
+	struct kvm *kvm = ndev->kvm;
+	u32 id = ndev->queue_pairs * 2;
+	struct virt_queue *vq = &ndev->vqs[id];
+	struct virtio_net_ctrl_hdr *ctrl;
+	virtio_net_ctrl_ack *ack;
+
+	while (1) {
+		mutex_lock(&ndev->io_lock[id]);
+		if (!virt_queue__available(vq))
+			pthread_cond_wait(&ndev->io_cond[id], &ndev->io_lock[id].mutex);
+		mutex_unlock(&ndev->io_lock[id]);
+
+		while (virt_queue__available(vq)) {
+			head = virt_queue__get_iov(&ndev->vqs[id], iov, &out, &in, kvm);
+			ctrl = iov[0].iov_base;
+			ack = iov[out].iov_base;
+
+			switch (ctrl->class) {
+			case VIRTIO_NET_CTRL_MQ:
+				*ack = virtio_net_handle_mq(kvm, ndev, ctrl);
+				break;
+			default:
+				*ack = VIRTIO_NET_ERR;
+				break;
+			}
+			virt_queue__set_used_elem(&ndev->vqs[id], head, iov[out].iov_len);
+		}
+
+		if (virtio_queue__should_signal(&ndev->vqs[id]))
+			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, id);
+	}
+
+	pthread_exit(NULL);
+
+	return NULL;
+}
+
+static void virtio_net_handle_callback(struct kvm *kvm, struct net_dev *ndev, int queue)
+{
+	if ((u32)queue >= (ndev->queue_pairs * 2 + 1)) {
+		pr_warning("Unknown queue index %u", queue);
+		return;
+	}
+
+	mutex_lock(&ndev->io_lock[queue]);
+	pthread_cond_signal(&ndev->io_cond[queue]);
+	mutex_unlock(&ndev->io_lock[queue]);
+}
+
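+/*
+ * Open and configure the host tap device: create (or attach to) the
+ * interface, negotiate the vnet header size and offloads, then either run
+ * the user-supplied setup script or assign the host IP address ourselves.
+ */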
+static bool virtio_net__tap_init(struct net_dev *ndev)
+{
+	int sock = socket(AF_INET, SOCK_STREAM, 0);
+	int pid, status, offload, hdr_len;
+	struct sockaddr_in sin = {0};
+	struct ifreq ifr;
+	const struct virtio_net_params *params = ndev->params;
+	bool skipconf = !!params->tapif;
+
+	/* Did the user already give us an FD? */
+	if (params->fd) {
+		ndev->tap_fd = params->fd;
+		return true;
+	}
+
+	ndev->tap_fd = open("/dev/net/tun", O_RDWR);
+	if (ndev->tap_fd < 0) {
+		pr_warning("Unable to open /dev/net/tun");
+		goto fail;
+	}
+
+	memset(&ifr, 0, sizeof(ifr));
+	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+	if (params->tapif)
+		strncpy(ifr.ifr_name, params->tapif, sizeof(ifr.ifr_name));
+	if (ioctl(ndev->tap_fd, TUNSETIFF, &ifr) < 0) {
+		pr_warning("Config tap device error. Are you root?");
+		goto fail;
+	}
+
+	strncpy(ndev->tap_name, ifr.ifr_name, sizeof(ndev->tap_name));
+
+	if (ioctl(ndev->tap_fd, TUNSETNOCSUM, 1) < 0) {
+		pr_warning("Config tap device TUNSETNOCSUM error");
+		goto fail;
+	}
+
+	hdr_len = has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ?
+			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
+			sizeof(struct virtio_net_hdr);
+	if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0)
+		pr_warning("Config tap device TUNSETVNETHDRSZ error");
+
+	offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO;
+	if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) {
+		pr_warning("Config tap device TUNSETOFFLOAD error");
+		goto fail;
+	}
+
+	if (strcmp(params->script, "none")) {
+		pid = fork();
+		if (pid == 0) {
+			execl(params->script, params->script, ndev->tap_name, NULL);
+			_exit(1);
+		} else {
+			waitpid(pid, &status, 0);
+			if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
+				pr_warning("Fail to setup tap by %s", params->script);
+				goto fail;
+			}
+		}
+	} else if (!skipconf) {
+		memset(&ifr, 0, sizeof(ifr));
+		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ndev->tap_name));
+		sin.sin_addr.s_addr = inet_addr(params->host_ip);
+		memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr));
+		ifr.ifr_addr.sa_family = AF_INET;
+		if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) {
+			pr_warning("Could not set ip address on tap device");
+			goto fail;
+		}
+	}
+
+	if (!skipconf) {
+		memset(&ifr, 0, sizeof(ifr));
+		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ndev->tap_name));
+		ioctl(sock, SIOCGIFFLAGS, &ifr);
+		ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
+		if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0)
+			pr_warning("Could not bring tap device up");
+	}
+
+	close(sock);
+
+	return true;
+
+fail:
+	if (sock >= 0)
+		close(sock);
+	if (ndev->tap_fd >= 0)
+		close(ndev->tap_fd);
+
+	return false;
+}
+
+static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
+{
+	return writev(ndev->tap_fd, iov, out);
+}
+
+static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
+{
+	return readv(ndev->tap_fd, iov, in);
+}
+
+static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
+{
+	return uip_tx(iov, out, &ndev->info);
+}
+
+static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
+{
+	return uip_rx(iov, in, &ndev->info);
+}
+
+static struct net_dev_operations tap_ops = {
+	.rx	= tap_ops_rx,
+	.tx	= tap_ops_tx,
+};
+
+static struct net_dev_operations uip_ops = {
+	.rx	= uip_ops_rx,
+	.tx	= uip_ops_tx,
+};
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	struct net_dev *ndev = dev;
+
+	return ((u8 *)(&ndev->config));
+}
+
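+/*
+ * Note the final term below: with a single queue pair the conditional
+ * degenerates to 1UL << 0 (VIRTIO_NET_F_CSUM, which is already set), so
+ * VIRTIO_NET_F_MQ is only advertised for multiqueue configurations.
+ */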
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	struct net_dev *ndev = dev;
+
+	return 1UL << VIRTIO_NET_F_MAC
+		| 1UL << VIRTIO_NET_F_CSUM
+		| 1UL << VIRTIO_NET_F_HOST_UFO
+		| 1UL << VIRTIO_NET_F_HOST_TSO4
+		| 1UL << VIRTIO_NET_F_HOST_TSO6
+		| 1UL << VIRTIO_NET_F_GUEST_UFO
+		| 1UL << VIRTIO_NET_F_GUEST_TSO4
+		| 1UL << VIRTIO_NET_F_GUEST_TSO6
+		| 1UL << VIRTIO_RING_F_EVENT_IDX
+		| 1UL << VIRTIO_RING_F_INDIRECT_DESC
+		| 1UL << VIRTIO_NET_F_CTRL_VQ
+		| 1UL << VIRTIO_NET_F_MRG_RXBUF
+		| 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0);
+}
+
+static int virtio_net__vhost_set_features(struct net_dev *ndev)
+{
+	u64 features = 1UL << VIRTIO_RING_F_EVENT_IDX;
+	u64 vhost_features;
+
+	if (ioctl(ndev->vhost_fd, VHOST_GET_FEATURES, &vhost_features) != 0)
+		die_perror("VHOST_GET_FEATURES failed");
+
+	/* make sure both sides support mergeable rx buffers */
+	if (vhost_features & 1UL << VIRTIO_NET_F_MRG_RXBUF &&
+			has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF))
+		features |= 1UL << VIRTIO_NET_F_MRG_RXBUF;
+
+	return ioctl(ndev->vhost_fd, VHOST_SET_FEATURES, &features);
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	struct net_dev *ndev = dev;
+	struct virtio_net_config *conf = &ndev->config;
+
+	ndev->features = features;
+
+	conf->status = virtio_host_to_guest_u16(&ndev->vdev, conf->status);
+	conf->max_virtqueue_pairs = virtio_host_to_guest_u16(&ndev->vdev,
+							     conf->max_virtqueue_pairs);
+
+	if (ndev->mode == NET_MODE_TAP) {
+		if (!virtio_net__tap_init(ndev))
+			die_perror("You have requested a TAP device, but creation of one has failed because");
+		if (ndev->vhost_fd &&
+				virtio_net__vhost_set_features(ndev) != 0)
+			die_perror("VHOST_SET_FEATURES failed");
+	} else {
+		ndev->info.vnet_hdr_len = has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ?
+						sizeof(struct virtio_net_hdr_mrg_rxbuf) :
+						sizeof(struct virtio_net_hdr);
+		uip_init(&ndev->info);
+	}
+}
+
+static bool is_ctrl_vq(struct net_dev *ndev, u32 vq)
+{
+	return vq == (u32)(ndev->queue_pairs * 2);
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct vhost_vring_state state = { .index = vq };
+	struct vhost_vring_addr addr;
+	struct net_dev *ndev = dev;
+	struct virt_queue *queue;
+	void *p;
+	int r;
+
+	compat__remove_message(compat_id);
+
+	queue		= &ndev->vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+
+	vring_init(&queue->vring, VIRTIO_NET_QUEUE_SIZE, p, align);
+	virtio_init_device_vq(&ndev->vdev, queue);
+
+	mutex_init(&ndev->io_lock[vq]);
+	pthread_cond_init(&ndev->io_cond[vq], NULL);
+	if (is_ctrl_vq(ndev, vq)) {
+		pthread_create(&ndev->io_thread[vq], NULL, virtio_net_ctrl_thread, ndev);
+
+		return 0;
+	} else if (ndev->vhost_fd == 0) {
+		if (vq & 1)
+			pthread_create(&ndev->io_thread[vq], NULL, virtio_net_tx_thread, ndev);
+		else
+			pthread_create(&ndev->io_thread[vq], NULL, virtio_net_rx_thread, ndev);
+
+		return 0;
+	}
+
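+	/*
+	 * From here on the queue is driven by vhost-net: tell the kernel
+	 * backend the ring size, reset its base index, and hand over the
+	 * userspace addresses of the descriptor, avail and used rings.
+	 */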
+	if (queue->endian != VIRTIO_ENDIAN_HOST)
+		die_perror("VHOST requires VIRTIO_ENDIAN_HOST");
+
+	state.num = queue->vring.num;
+	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_NUM, &state);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_NUM failed");
+	state.num = 0;
+	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_BASE, &state);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_BASE failed");
+
+	addr = (struct vhost_vring_addr) {
+		.index = vq,
+		.desc_user_addr = (u64)(unsigned long)queue->vring.desc,
+		.avail_user_addr = (u64)(unsigned long)queue->vring.avail,
+		.used_user_addr = (u64)(unsigned long)queue->vring.used,
+	};
+
+	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_ADDR, &addr);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_ADDR failed");
+
+	return 0;
+}
+
+static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi)
+{
+	struct net_dev *ndev = dev;
+	struct kvm_irqfd irq;
+	struct vhost_vring_file file;
+	int r;
+
+	if (ndev->vhost_fd == 0)
+		return;
+
+	irq = (struct kvm_irqfd) {
+		.gsi	= gsi,
+		.fd	= eventfd(0, 0),
+	};
+	file = (struct vhost_vring_file) {
+		.index	= vq,
+		.fd	= irq.fd,
+	};
+
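+	/*
+	 * Wire up the interrupt path so it bypasses userspace: the same
+	 * eventfd is registered as a KVM irqfd (signalling it raises the
+	 * GSI) and as the vring call fd that vhost signals on completion.
+	 */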
+	r = ioctl(kvm->vm_fd, KVM_IRQFD, &irq);
+	if (r < 0)
+		die_perror("KVM_IRQFD failed");
+
+	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_CALL, &file);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_CALL failed");
+	file.fd = ndev->tap_fd;
+	r = ioctl(ndev->vhost_fd, VHOST_NET_SET_BACKEND, &file);
+	if (r != 0)
+		die("VHOST_NET_SET_BACKEND failed %d", errno);
+
+}
+
+static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd)
+{
+	struct net_dev *ndev = dev;
+	struct vhost_vring_file file = {
+		.index	= vq,
+		.fd	= efd,
+	};
+	int r;
+
+	if (ndev->vhost_fd == 0 || is_ctrl_vq(ndev, vq))
+		return;
+
+	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_KICK, &file);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_KICK failed");
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct net_dev *ndev = dev;
+
+	virtio_net_handle_callback(kvm, ndev, vq);
+
+	return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct net_dev *ndev = dev;
+
+	return ndev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	/* FIXME: dynamic */
+	return VIRTIO_NET_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	/* FIXME: dynamic */
+	return size;
+}
+
+static struct virtio_ops net_dev_virtio_ops = (struct virtio_ops) {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.init_vq		= init_vq,
+	.get_pfn_vq		= get_pfn_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq		= set_size_vq,
+	.notify_vq		= notify_vq,
+	.notify_vq_gsi		= notify_vq_gsi,
+	.notify_vq_eventfd	= notify_vq_eventfd,
+};
+
+static void virtio_net__vhost_init(struct kvm *kvm, struct net_dev *ndev)
+{
+	struct vhost_memory *mem;
+	int r;
+
+	ndev->vhost_fd = open("/dev/vhost-net", O_RDWR);
+	if (ndev->vhost_fd < 0)
+		die_perror("Failed openning vhost-net device");
+
+	mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region));
+	if (mem == NULL)
+		die("Failed allocating memory for vhost memory map");
+
+	mem->nregions = 1;
+	mem->regions[0] = (struct vhost_memory_region) {
+		.guest_phys_addr	= 0,
+		.memory_size		= kvm->ram_size,
+		.userspace_addr		= (unsigned long)kvm->ram_start,
+	};
+
+	r = ioctl(ndev->vhost_fd, VHOST_SET_OWNER);
+	if (r != 0)
+		die_perror("VHOST_SET_OWNER failed");
+
+	r = ioctl(ndev->vhost_fd, VHOST_SET_MEM_TABLE, mem);
+	if (r != 0)
+		die_perror("VHOST_SET_MEM_TABLE failed");
+
+	ndev->vdev.use_vhost = true;
+
+	free(mem);
+}
+
+static inline void str_to_mac(const char *str, char *mac)
+{
+	sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+		mac, mac+1, mac+2, mac+3, mac+4, mac+5);
+}
+static int set_net_param(struct kvm *kvm, struct virtio_net_params *p,
+			const char *param, const char *val)
+{
+	if (strcmp(param, "guest_mac") == 0) {
+		str_to_mac(val, p->guest_mac);
+	} else if (strcmp(param, "mode") == 0) {
+		if (!strncmp(val, "user", 4)) {
+			int i;
+
+			for (i = 0; i < kvm->cfg.num_net_devices; i++)
+				if (kvm->cfg.net_params[i].mode == NET_MODE_USER)
+					die("Only one usermode network device allowed at a time");
+			p->mode = NET_MODE_USER;
+		} else if (!strncmp(val, "tap", 3)) {
+			p->mode = NET_MODE_TAP;
+		} else if (!strncmp(val, "none", 4)) {
+			kvm->cfg.no_net = 1;
+			return -1;
+		} else
+			die("Unknown network mode %s, please use user, tap or none", kvm->cfg.network);
+	} else if (strcmp(param, "script") == 0) {
+		p->script = strdup(val);
+	} else if (strcmp(param, "guest_ip") == 0) {
+		p->guest_ip = strdup(val);
+	} else if (strcmp(param, "host_ip") == 0) {
+		p->host_ip = strdup(val);
+	} else if (strcmp(param, "trans") == 0) {
+		p->trans = strdup(val);
+	} else if (strcmp(param, "tapif") == 0) {
+		p->tapif = strdup(val);
+	} else if (strcmp(param, "vhost") == 0) {
+		p->vhost = atoi(val);
+	} else if (strcmp(param, "fd") == 0) {
+		p->fd = atoi(val);
+	} else if (strcmp(param, "mq") == 0) {
+		p->mq = atoi(val);
+	} else
+		die("Unknown network parameter %s", param);
+
+	return 0;
+}
+
+int netdev_parser(const struct option *opt, const char *arg, int unset)
+{
+	struct virtio_net_params p;
+	char *buf = NULL, *cmd = NULL, *cur = NULL;
+	bool on_cmd = true;
+	struct kvm *kvm = opt->ptr;
+
+	if (arg) {
+		buf = strdup(arg);
+		if (buf == NULL)
+			die("Failed allocating new net buffer");
+		cur = strtok(buf, ",=");
+	}
+
+	p = (struct virtio_net_params) {
+		.guest_ip	= DEFAULT_GUEST_ADDR,
+		.host_ip	= DEFAULT_HOST_ADDR,
+		.script		= DEFAULT_SCRIPT,
+		.mode		= NET_MODE_TAP,
+	};
+
+	str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac);
+	p.guest_mac[5] += kvm->cfg.num_net_devices;
+
+	while (cur) {
+		if (on_cmd) {
+			cmd = cur;
+		} else {
+			if (set_net_param(kvm, &p, cmd, cur) < 0)
+				goto done;
+		}
+		on_cmd = !on_cmd;
+
+		cur = strtok(NULL, ",=");
+	}
+
+	kvm->cfg.num_net_devices++;
+
+	kvm->cfg.net_params = realloc(kvm->cfg.net_params, kvm->cfg.num_net_devices * sizeof(*kvm->cfg.net_params));
+	if (kvm->cfg.net_params == NULL)
+		die("Failed adding new network device");
+
+	kvm->cfg.net_params[kvm->cfg.num_net_devices - 1] = p;
+
+done:
+	free(buf);
+	return 0;
+}
+
+static int virtio_net__init_one(struct virtio_net_params *params)
+{
+	int i, err;
+	struct net_dev *ndev;
+	struct virtio_ops *ops;
+
+	ndev = calloc(1, sizeof(struct net_dev));
+	if (ndev == NULL)
+		return -ENOMEM;
+
+	ops = malloc(sizeof(*ops));
+	if (ops == NULL) {
+		err = -ENOMEM;
+		goto err_free_ndev;
+	}
+
+	list_add_tail(&ndev->list, &ndevs);
+
+	ndev->kvm = params->kvm;
+	ndev->params = params;
+
+	mutex_init(&ndev->mutex);
+	ndev->queue_pairs = max(1, min(VIRTIO_NET_NUM_QUEUES, params->mq));
+	ndev->config.status = VIRTIO_NET_S_LINK_UP;
+	if (ndev->queue_pairs > 1)
+		ndev->config.max_virtqueue_pairs = ndev->queue_pairs;
+
+	for (i = 0 ; i < 6 ; i++) {
+		ndev->config.mac[i]		= params->guest_mac[i];
+		ndev->info.guest_mac.addr[i]	= params->guest_mac[i];
+		ndev->info.host_mac.addr[i]	= params->host_mac[i];
+	}
+
+	ndev->mode = params->mode;
+	if (ndev->mode == NET_MODE_TAP) {
+		ndev->ops = &tap_ops;
+	} else {
+		ndev->info.host_ip		= ntohl(inet_addr(params->host_ip));
+		ndev->info.guest_ip		= ntohl(inet_addr(params->guest_ip));
+		ndev->info.guest_netmask	= ntohl(inet_addr("255.255.255.0"));
+		ndev->info.buf_nr		= 20;
+		ndev->ops = &uip_ops;
+		uip_static_init(&ndev->info);
+	}
+
+	*ops = net_dev_virtio_ops;
+	if (params->trans && strcmp(params->trans, "mmio") == 0)
+		virtio_init(params->kvm, ndev, &ndev->vdev, ops, VIRTIO_MMIO,
+			    PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET);
+	else
+		virtio_init(params->kvm, ndev, &ndev->vdev, ops, VIRTIO_PCI,
+			    PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET);
+
+	if (params->vhost)
+		virtio_net__vhost_init(params->kvm, ndev);
+
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-net", "CONFIG_VIRTIO_NET");
+
+	return 0;
+
+err_free_ndev:
+	free(ndev);
+	return err;
+}
+
+int virtio_net__init(struct kvm *kvm)
+{
+	int i;
+
+	for (i = 0; i < kvm->cfg.num_net_devices; i++) {
+		kvm->cfg.net_params[i].kvm = kvm;
+		virtio_net__init_one(&kvm->cfg.net_params[i]);
+	}
+
+	if (kvm->cfg.num_net_devices == 0 && kvm->cfg.no_net == 0) {
+		static struct virtio_net_params net_params;
+
+		net_params = (struct virtio_net_params) {
+			.guest_ip	= kvm->cfg.guest_ip,
+			.host_ip	= kvm->cfg.host_ip,
+			.kvm		= kvm,
+			.script		= kvm->cfg.script,
+			.mode		= NET_MODE_USER,
+		};
+		str_to_mac(kvm->cfg.guest_mac, net_params.guest_mac);
+		str_to_mac(kvm->cfg.host_mac, net_params.host_mac);
+
+		virtio_net__init_one(&net_params);
+	}
+
+	return 0;
+}
+virtio_dev_init(virtio_net__init);
+
+int virtio_net__exit(struct kvm *kvm)
+{
+	return 0;
+}
+virtio_dev_exit(virtio_net__exit);
diff --git a/tools/kvm/virtio/pci.c b/tools/kvm/virtio/pci.c
new file mode 100644
index 0000000..7556239
--- /dev/null
+++ b/tools/kvm/virtio/pci.c
@@ -0,0 +1,451 @@
+#include "kvm/virtio-pci.h"
+
+#include "kvm/ioport.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/irq.h"
+#include "kvm/virtio.h"
+#include "kvm/ioeventfd.h"
+
+#include <sys/ioctl.h>
+#include <linux/virtio_pci.h>
+#include <linux/byteorder.h>
+#include <string.h>
+
+static void virtio_pci__ioevent_callback(struct kvm *kvm, void *param)
+{
+	struct virtio_pci_ioevent_param *ioeventfd = param;
+	struct virtio_pci *vpci = ioeventfd->vdev->virtio;
+
+	ioeventfd->vdev->ops->notify_vq(kvm, vpci->dev, ioeventfd->vq);
+}
+
+static int virtio_pci__init_ioeventfd(struct kvm *kvm, struct virtio_device *vdev, u32 vq)
+{
+	struct ioevent ioevent;
+	struct virtio_pci *vpci = vdev->virtio;
+	int i, r, flags = IOEVENTFD_FLAG_PIO;
+	int fds[2];
+
+	vpci->ioeventfds[vq] = (struct virtio_pci_ioevent_param) {
+		.vdev		= vdev,
+		.vq		= vq,
+	};
+
+	ioevent = (struct ioevent) {
+		.fn		= virtio_pci__ioevent_callback,
+		.fn_ptr		= &vpci->ioeventfds[vq],
+		.datamatch	= vq,
+		.fn_kvm		= kvm,
+	};
+
+	/*
+	 * Vhost will poll the eventfd on the host kernel side; otherwise we
+	 * need to poll in userspace.
+	 */
+	if (!vdev->use_vhost)
+		flags |= IOEVENTFD_FLAG_USER_POLL;
+
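+	/*
+	 * The guest may kick a queue through either the legacy I/O port
+	 * BAR or its MMIO mirror, so register a notification eventfd with
+	 * the same datamatch at both addresses.
+	 */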
+	/* ioport */
+	ioevent.io_addr	= vpci->port_addr + VIRTIO_PCI_QUEUE_NOTIFY;
+	ioevent.io_len	= sizeof(u16);
+	ioevent.fd	= fds[0] = eventfd(0, 0);
+	r = ioeventfd__add_event(&ioevent, flags);
+	if (r)
+		return r;
+
+	/* mmio */
+	ioevent.io_addr	= vpci->mmio_addr + VIRTIO_PCI_QUEUE_NOTIFY;
+	ioevent.io_len	= sizeof(u32);
+	ioevent.fd	= fds[1] = eventfd(0, 0);
+	r = ioeventfd__add_event(&ioevent, flags);
+	if (r)
+		goto free_ioport_evt;
+
+	if (vdev->ops->notify_vq_eventfd)
+		for (i = 0; i < 2; ++i)
+			vdev->ops->notify_vq_eventfd(kvm, vpci->dev, vq,
+						     fds[i]);
+	return 0;
+
+free_ioport_evt:
+	ioeventfd__del_event(vpci->port_addr + VIRTIO_PCI_QUEUE_NOTIFY, vq);
+	return r;
+}
+
+static inline bool virtio_pci__msix_enabled(struct virtio_pci *vpci)
+{
+	return vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_ENABLE);
+}
+
+static bool virtio_pci__specific_io_in(struct kvm *kvm, struct virtio_device *vdev, u16 port,
+					void *data, int size, int offset)
+{
+	u32 config_offset;
+	struct virtio_pci *vpci = vdev->virtio;
+	int type = virtio__get_dev_specific_field(offset - 20,
+							virtio_pci__msix_enabled(vpci),
+							&config_offset);
+	if (type == VIRTIO_PCI_O_MSIX) {
+		switch (offset) {
+		case VIRTIO_MSI_CONFIG_VECTOR:
+			ioport__write16(data, vpci->config_vector);
+			break;
+		case VIRTIO_MSI_QUEUE_VECTOR:
+			ioport__write16(data, vpci->vq_vector[vpci->queue_selector]);
+			break;
+		}
+
+		return true;
+	} else if (type == VIRTIO_PCI_O_CONFIG) {
+		u8 cfg;
+
+		cfg = vdev->ops->get_config(kvm, vpci->dev)[config_offset];
+		ioport__write8(data, cfg);
+		return true;
+	}
+
+	return false;
+}
+
+static bool virtio_pci__io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	unsigned long offset;
+	bool ret = true;
+	struct virtio_device *vdev;
+	struct virtio_pci *vpci;
+	struct kvm *kvm;
+	u32 val;
+
+	kvm = vcpu->kvm;
+	vdev = ioport->priv;
+	vpci = vdev->virtio;
+	offset = port - vpci->port_addr;
+
+	switch (offset) {
+	case VIRTIO_PCI_HOST_FEATURES:
+		val = vdev->ops->get_host_features(kvm, vpci->dev);
+		ioport__write32(data, val);
+		break;
+	case VIRTIO_PCI_QUEUE_PFN:
+		val = vdev->ops->get_pfn_vq(kvm, vpci->dev, vpci->queue_selector);
+		ioport__write32(data, val);
+		break;
+	case VIRTIO_PCI_QUEUE_NUM:
+		val = vdev->ops->get_size_vq(kvm, vpci->dev, vpci->queue_selector);
+		ioport__write16(data, val);
+		break;
+	case VIRTIO_PCI_STATUS:
+		ioport__write8(data, vpci->status);
+		break;
+	case VIRTIO_PCI_ISR:
+		ioport__write8(data, vpci->isr);
+		kvm__irq_line(kvm, vpci->pci_hdr.irq_line, VIRTIO_IRQ_LOW);
+		vpci->isr = VIRTIO_IRQ_LOW;
+		break;
+	default:
+		ret = virtio_pci__specific_io_in(kvm, vdev, port, data, size, offset);
+		break;
+	}
+
+	return ret;
+}
+
+static bool virtio_pci__specific_io_out(struct kvm *kvm, struct virtio_device *vdev, u16 port,
+					void *data, int size, int offset)
+{
+	struct virtio_pci *vpci = vdev->virtio;
+	u32 config_offset, gsi, vec;
+	int type = virtio__get_dev_specific_field(offset - 20, virtio_pci__msix_enabled(vpci),
+							&config_offset);
+	if (type == VIRTIO_PCI_O_MSIX) {
+		switch (offset) {
+		case VIRTIO_MSI_CONFIG_VECTOR:
+			vec = vpci->config_vector = ioport__read16(data);
+			if (vec == VIRTIO_MSI_NO_VECTOR)
+				break;
+
+			gsi = irq__add_msix_route(kvm, &vpci->msix_table[vec].msg);
+
+			vpci->config_gsi = gsi;
+			break;
+		case VIRTIO_MSI_QUEUE_VECTOR:
+			vec = vpci->vq_vector[vpci->queue_selector] = ioport__read16(data);
+
+			if (vec == VIRTIO_MSI_NO_VECTOR)
+				break;
+
+			gsi = irq__add_msix_route(kvm, &vpci->msix_table[vec].msg);
+			vpci->gsis[vpci->queue_selector] = gsi;
+			if (vdev->ops->notify_vq_gsi)
+				vdev->ops->notify_vq_gsi(kvm, vpci->dev,
+							vpci->queue_selector, gsi);
+			break;
+		}
+
+		return true;
+	} else if (type == VIRTIO_PCI_O_CONFIG) {
+		vdev->ops->get_config(kvm, vpci->dev)[config_offset] = *(u8 *)data;
+
+		return true;
+	}
+
+	return false;
+}
+
+static bool virtio_pci__io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	unsigned long offset;
+	bool ret = true;
+	struct virtio_device *vdev;
+	struct virtio_pci *vpci;
+	struct kvm *kvm;
+	u32 val;
+
+	kvm = vcpu->kvm;
+	vdev = ioport->priv;
+	vpci = vdev->virtio;
+	offset = port - vpci->port_addr;
+
+	switch (offset) {
+	case VIRTIO_PCI_GUEST_FEATURES:
+		val = ioport__read32(data);
+		vdev->ops->set_guest_features(kvm, vpci->dev, val);
+		break;
+	case VIRTIO_PCI_QUEUE_PFN:
+		val = ioport__read32(data);
+		virtio_pci__init_ioeventfd(kvm, vdev, vpci->queue_selector);
+		vdev->ops->init_vq(kvm, vpci->dev, vpci->queue_selector,
+				   1 << VIRTIO_PCI_QUEUE_ADDR_SHIFT,
+				   VIRTIO_PCI_VRING_ALIGN, val);
+		break;
+	case VIRTIO_PCI_QUEUE_SEL:
+		vpci->queue_selector = ioport__read16(data);
+		break;
+	case VIRTIO_PCI_QUEUE_NOTIFY:
+		val = ioport__read16(data);
+		vdev->ops->notify_vq(kvm, vpci->dev, val);
+		break;
+	case VIRTIO_PCI_STATUS:
+		vpci->status = ioport__read8(data);
+		if (!vpci->status) /* Sample endianness on reset */
+			vdev->endian = kvm_cpu__get_endianness(vcpu);
+		if (vdev->ops->notify_status)
+			vdev->ops->notify_status(kvm, vpci->dev, vpci->status);
+		break;
+	default:
+		ret = virtio_pci__specific_io_out(kvm, vdev, port, data, size, offset);
+		break;
+	}
+
+	return ret;
+}
+
+static struct ioport_operations virtio_pci__io_ops = {
+	.io_in	= virtio_pci__io_in,
+	.io_out	= virtio_pci__io_out,
+};
+
+static void virtio_pci__msix_mmio_callback(struct kvm_cpu *vcpu,
+					   u64 addr, u8 *data, u32 len,
+					   u8 is_write, void *ptr)
+{
+	struct virtio_pci *vpci = ptr;
+	void *table;
+	u32 offset;
+
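+	/*
+	 * BAR 2 is split in half: the first PCI_IO_SIZE bytes hold the
+	 * MSI-X table, the second half the pending bit array (PBA).
+	 */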
+	if (addr > vpci->msix_io_block + PCI_IO_SIZE) {
+		table	= &vpci->msix_pba;
+		offset	= vpci->msix_io_block + PCI_IO_SIZE;
+	} else {
+		table	= &vpci->msix_table;
+		offset	= vpci->msix_io_block;
+	}
+
+	if (is_write)
+		memcpy(table + addr - offset, data, len);
+	else
+		memcpy(data, table + addr - offset, len);
+}
+
+static void virtio_pci__signal_msi(struct kvm *kvm, struct virtio_pci *vpci, int vec)
+{
+	struct kvm_msi msi = {
+		.address_lo = vpci->msix_table[vec].msg.address_lo,
+		.address_hi = vpci->msix_table[vec].msg.address_hi,
+		.data = vpci->msix_table[vec].msg.data,
+	};
+
+	ioctl(kvm->vm_fd, KVM_SIGNAL_MSI, &msi);
+}
+
+int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq)
+{
+	struct virtio_pci *vpci = vdev->virtio;
+	int tbl = vpci->vq_vector[vq];
+
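+	/*
+	 * If the vector (or the whole function) is masked, MSI-X rules
+	 * say the interrupt must not be sent; latch it in the pending
+	 * bit array (PBA) instead.
+	 */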
+	if (virtio_pci__msix_enabled(vpci) && tbl != VIRTIO_MSI_NO_VECTOR) {
+		if (vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_MASKALL) ||
+		    vpci->msix_table[tbl].ctrl & cpu_to_le16(PCI_MSIX_ENTRY_CTRL_MASKBIT)) {
+
+			vpci->msix_pba |= 1 << tbl;
+			return 0;
+		}
+
+		if (vpci->features & VIRTIO_PCI_F_SIGNAL_MSI)
+			virtio_pci__signal_msi(kvm, vpci, vpci->vq_vector[vq]);
+		else
+			kvm__irq_trigger(kvm, vpci->gsis[vq]);
+	} else {
+		vpci->isr = VIRTIO_IRQ_HIGH;
+		kvm__irq_trigger(kvm, vpci->pci_hdr.irq_line);
+	}
+	return 0;
+}
+
+int virtio_pci__signal_config(struct kvm *kvm, struct virtio_device *vdev)
+{
+	struct virtio_pci *vpci = vdev->virtio;
+	int tbl = vpci->config_vector;
+
+	if (virtio_pci__msix_enabled(vpci) && tbl != VIRTIO_MSI_NO_VECTOR) {
+		if (vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_MASKALL) ||
+		    vpci->msix_table[tbl].ctrl & cpu_to_le16(PCI_MSIX_ENTRY_CTRL_MASKBIT)) {
+
+			vpci->msix_pba |= 1 << tbl;
+			return 0;
+		}
+
+		if (vpci->features & VIRTIO_PCI_F_SIGNAL_MSI)
+			virtio_pci__signal_msi(kvm, vpci, tbl);
+		else
+			kvm__irq_trigger(kvm, vpci->config_gsi);
+	} else {
+		vpci->isr = VIRTIO_PCI_ISR_CONFIG;
+		kvm__irq_trigger(kvm, vpci->pci_hdr.irq_line);
+	}
+
+	return 0;
+}
+
+static void virtio_pci__io_mmio_callback(struct kvm_cpu *vcpu,
+					 u64 addr, u8 *data, u32 len,
+					 u8 is_write, void *ptr)
+{
+	struct virtio_pci *vpci = ptr;
+	int direction = is_write ? KVM_EXIT_IO_OUT : KVM_EXIT_IO_IN;
+	u16 port = vpci->port_addr + (addr & (IOPORT_SIZE - 1));
+
+	kvm__emulate_io(vcpu, port, data, direction, len, 1);
+}
+
+int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		     int device_id, int subsys_id, int class)
+{
+	struct virtio_pci *vpci = vdev->virtio;
+	int r;
+
+	vpci->kvm = kvm;
+	vpci->dev = dev;
+
+	r = ioport__register(kvm, IOPORT_EMPTY, &virtio_pci__io_ops, IOPORT_SIZE, vdev);
+	if (r < 0)
+		return r;
+	vpci->port_addr = (u16)r;
+
+	vpci->mmio_addr = pci_get_io_space_block(IOPORT_SIZE);
+	r = kvm__register_mmio(kvm, vpci->mmio_addr, IOPORT_SIZE, false,
+			       virtio_pci__io_mmio_callback, vpci);
+	if (r < 0)
+		goto free_ioport;
+
+	vpci->msix_io_block = pci_get_io_space_block(PCI_IO_SIZE * 2);
+	r = kvm__register_mmio(kvm, vpci->msix_io_block, PCI_IO_SIZE * 2, false,
+			       virtio_pci__msix_mmio_callback, vpci);
+	if (r < 0)
+		goto free_mmio;
+
+	vpci->pci_hdr = (struct pci_device_header) {
+		.vendor_id		= cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET),
+		.device_id		= cpu_to_le16(device_id),
+		.command		= PCI_COMMAND_IO | PCI_COMMAND_MEMORY,
+		.header_type		= PCI_HEADER_TYPE_NORMAL,
+		.revision_id		= 0,
+		.class[0]		= class & 0xff,
+		.class[1]		= (class >> 8) & 0xff,
+		.class[2]		= (class >> 16) & 0xff,
+		.subsys_vendor_id	= cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET),
+		.subsys_id		= cpu_to_le16(subsys_id),
+		.bar[0]			= cpu_to_le32(vpci->mmio_addr
+							| PCI_BASE_ADDRESS_SPACE_MEMORY),
+		.bar[1]			= cpu_to_le32(vpci->port_addr
+							| PCI_BASE_ADDRESS_SPACE_IO),
+		.bar[2]			= cpu_to_le32(vpci->msix_io_block
+							| PCI_BASE_ADDRESS_SPACE_MEMORY),
+		.status			= cpu_to_le16(PCI_STATUS_CAP_LIST),
+		.capabilities		= (void *)&vpci->pci_hdr.msix - (void *)&vpci->pci_hdr,
+		.bar_size[0]		= cpu_to_le32(IOPORT_SIZE),
+		.bar_size[1]		= cpu_to_le32(IOPORT_SIZE),
+		.bar_size[2]		= cpu_to_le32(PCI_IO_SIZE*2),
+	};
+
+	vpci->dev_hdr = (struct device_header) {
+		.bus_type		= DEVICE_BUS_PCI,
+		.data			= &vpci->pci_hdr,
+	};
+
+	vpci->pci_hdr.msix.cap = PCI_CAP_ID_MSIX;
+	vpci->pci_hdr.msix.next = 0;
+	/*
+	 * We have at most VIRTIO_PCI_MAX_VQ entries for virt queues,
+	 * plus VIRTIO_PCI_MAX_CONFIG entries for config.
+	 *
+	 * To quote the PCI spec:
+	 *
+	 * System software reads this field to determine the
+	 * MSI-X Table Size N, which is encoded as N-1.
+	 * For example, a returned value of "00000000011"
+	 * indicates a table size of 4.
+	 */
+	vpci->pci_hdr.msix.ctrl = cpu_to_le16(VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG - 1);
+
+	/* Both table and PBA are mapped to the same BAR (2) */
+	vpci->pci_hdr.msix.table_offset = cpu_to_le32(2);
+	vpci->pci_hdr.msix.pba_offset = cpu_to_le32(2 | PCI_IO_SIZE);
+	vpci->config_vector = 0;
+
+	if (kvm__supports_extension(kvm, KVM_CAP_SIGNAL_MSI))
+		vpci->features |= VIRTIO_PCI_F_SIGNAL_MSI;
+
+	r = device__register(&vpci->dev_hdr);
+	if (r < 0)
+		goto free_msix_mmio;
+
+	return 0;
+
+free_msix_mmio:
+	kvm__deregister_mmio(kvm, vpci->msix_io_block);
+free_mmio:
+	kvm__deregister_mmio(kvm, vpci->mmio_addr);
+free_ioport:
+	ioport__unregister(kvm, vpci->port_addr);
+	return r;
+}
+
+int virtio_pci__exit(struct kvm *kvm, struct virtio_device *vdev)
+{
+	struct virtio_pci *vpci = vdev->virtio;
+	int i;
+
+	kvm__deregister_mmio(kvm, vpci->mmio_addr);
+	kvm__deregister_mmio(kvm, vpci->msix_io_block);
+	ioport__unregister(kvm, vpci->port_addr);
+
+	for (i = 0; i < VIRTIO_PCI_MAX_VQ; i++) {
+		ioeventfd__del_event(vpci->port_addr + VIRTIO_PCI_QUEUE_NOTIFY, i);
+		ioeventfd__del_event(vpci->mmio_addr + VIRTIO_PCI_QUEUE_NOTIFY, i);
+	}
+
+	return 0;
+}
diff --git a/tools/kvm/virtio/rng.c b/tools/kvm/virtio/rng.c
new file mode 100644
index 0000000..8031368
--- /dev/null
+++ b/tools/kvm/virtio/rng.c
@@ -0,0 +1,204 @@
+#include "kvm/virtio-rng.h"
+
+#include "kvm/virtio-pci-dev.h"
+
+#include "kvm/virtio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/threadpool.h"
+#include "kvm/guest_compat.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_rng.h>
+
+#include <linux/list.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <pthread.h>
+#include <linux/kernel.h>
+
+#define NUM_VIRT_QUEUES		1
+#define VIRTIO_RNG_QUEUE_SIZE	128
+
+struct rng_dev_job {
+	struct virt_queue	*vq;
+	struct rng_dev		*rdev;
+	struct thread_pool__job	job_id;
+};
+
+struct rng_dev {
+	struct list_head	list;
+	struct virtio_device	vdev;
+
+	int			fd;
+
+	/* virtio queue */
+	struct virt_queue	vqs[NUM_VIRT_QUEUES];
+	struct rng_dev_job	jobs[NUM_VIRT_QUEUES];
+};
+
+static LIST_HEAD(rdevs);
+static int compat_id = -1;
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	/* Unused */
+	return 0;
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	/* Unused */
+	return 0;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	/* Unused */
+}
+
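+/*
+ * Fill one guest buffer from /dev/random. The fd is non-blocking, so when
+ * the entropy pool is empty the request completes with zero bytes written
+ * instead of stalling the guest.
+ */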
+static bool virtio_rng_do_io_request(struct kvm *kvm, struct rng_dev *rdev, struct virt_queue *queue)
+{
+	struct iovec iov[VIRTIO_RNG_QUEUE_SIZE];
+	ssize_t len = 0;
+	u16 out, in, head;
+
+	head	= virt_queue__get_iov(queue, iov, &out, &in, kvm);
+	len	= readv(rdev->fd, iov, in);
+	if (len < 0 && errno == EAGAIN)
+		len = 0;
+
+	virt_queue__set_used_elem(queue, head, len);
+
+	return true;
+}
+
+static void virtio_rng_do_io(struct kvm *kvm, void *param)
+{
+	struct rng_dev_job *job	= param;
+	struct virt_queue *vq	= job->vq;
+	struct rng_dev *rdev	= job->rdev;
+
+	while (virt_queue__available(vq))
+		virtio_rng_do_io_request(kvm, rdev, vq);
+
+	rdev->vdev.ops->signal_vq(kvm, &rdev->vdev, vq - rdev->vqs);
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct rng_dev *rdev = dev;
+	struct virt_queue *queue;
+	struct rng_dev_job *job;
+	void *p;
+
+	compat__remove_message(compat_id);
+
+	queue		= &rdev->vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+
+	job = &rdev->jobs[vq];
+
+	vring_init(&queue->vring, VIRTIO_RNG_QUEUE_SIZE, p, align);
+
+	*job = (struct rng_dev_job) {
+		.vq	= queue,
+		.rdev	= rdev,
+	};
+
+	thread_pool__init_job(&job->job_id, kvm, virtio_rng_do_io, job);
+
+	return 0;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct rng_dev *rdev = dev;
+
+	thread_pool__do_job(&rdev->jobs[vq].job_id);
+
+	return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct rng_dev *rdev = dev;
+
+	return rdev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	return VIRTIO_RNG_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	/* FIXME: dynamic */
+	return size;
+}
+
+static struct virtio_ops rng_dev_virtio_ops = (struct virtio_ops) {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.init_vq		= init_vq,
+	.notify_vq		= notify_vq,
+	.get_pfn_vq		= get_pfn_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq		= set_size_vq,
+};
+
+int virtio_rng__init(struct kvm *kvm)
+{
+	struct rng_dev *rdev;
+	int r;
+
+	if (!kvm->cfg.virtio_rng)
+		return 0;
+
+	rdev = malloc(sizeof(*rdev));
+	if (rdev == NULL)
+		return -ENOMEM;
+
+	rdev->fd = open("/dev/random", O_RDONLY | O_NONBLOCK);
+	if (rdev->fd < 0) {
+		r = rdev->fd;
+		goto cleanup;
+	}
+
+	r = virtio_init(kvm, rdev, &rdev->vdev, &rng_dev_virtio_ops,
+			VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_RNG,
+			VIRTIO_ID_RNG, PCI_CLASS_RNG);
+	if (r < 0)
+		goto cleanup;
+
+	list_add_tail(&rdev->list, &rdevs);
+
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-rng", "CONFIG_HW_RANDOM_VIRTIO");
+	return 0;
+cleanup:
+	close(rdev->fd);
+	free(rdev);
+
+	return r;
+}
+virtio_dev_init(virtio_rng__init);
+
+int virtio_rng__exit(struct kvm *kvm)
+{
+	struct rng_dev *rdev, *tmp;
+
+	list_for_each_entry_safe(rdev, tmp, &rdevs, list) {
+		list_del(&rdev->list);
+		rdev->vdev.ops->exit(kvm, &rdev->vdev);
+		free(rdev);
+	}
+
+	return 0;
+}
+virtio_dev_exit(virtio_rng__exit);
diff --git a/tools/kvm/virtio/scsi.c b/tools/kvm/virtio/scsi.c
new file mode 100644
index 0000000..be254f3
--- /dev/null
+++ b/tools/kvm/virtio/scsi.c
@@ -0,0 +1,311 @@
+#include "kvm/virtio-scsi.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/guest_compat.h"
+#include "kvm/virtio-pci.h"
+#include "kvm/virtio.h"
+
+#include <linux/kernel.h>
+#include <linux/virtio_scsi.h>
+#include <linux/vhost.h>
+
+#define VIRTIO_SCSI_QUEUE_SIZE		128
+#define NUM_VIRT_QUEUES			3
+
+static LIST_HEAD(sdevs);
+static int compat_id = -1;
+
+struct scsi_dev {
+	struct virt_queue		vqs[NUM_VIRT_QUEUES];
+	struct virtio_scsi_config	config;
+	struct vhost_scsi_target	target;
+	u32				features;
+	int				vhost_fd;
+	struct virtio_device		vdev;
+	struct list_head		list;
+	struct kvm			*kvm;
+};
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	struct scsi_dev *sdev = dev;
+
+	return ((u8 *)(&sdev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	return	1UL << VIRTIO_RING_F_EVENT_IDX |
+		1UL << VIRTIO_RING_F_INDIRECT_DESC;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	struct scsi_dev *sdev = dev;
+
+	sdev->features = features;
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct vhost_vring_state state = { .index = vq };
+	struct vhost_vring_addr addr;
+	struct scsi_dev *sdev = dev;
+	struct virt_queue *queue;
+	void *p;
+	int r;
+
+	compat__remove_message(compat_id);
+
+	queue		= &sdev->vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+
+	vring_init(&queue->vring, VIRTIO_SCSI_QUEUE_SIZE, p, align);
+
+	if (sdev->vhost_fd == 0)
+		return 0;
+
+	state.num = queue->vring.num;
+	r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_NUM, &state);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_NUM failed");
+	state.num = 0;
+	r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_BASE, &state);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_BASE failed");
+
+	addr = (struct vhost_vring_addr) {
+		.index = vq,
+		.desc_user_addr = (u64)(unsigned long)queue->vring.desc,
+		.avail_user_addr = (u64)(unsigned long)queue->vring.avail,
+		.used_user_addr = (u64)(unsigned long)queue->vring.used,
+	};
+
+	r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_ADDR, &addr);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_ADDR failed");
+
+	return 0;
+}
+
+static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi)
+{
+	struct vhost_vring_file file;
+	struct scsi_dev *sdev = dev;
+	struct kvm_irqfd irq;
+	int r;
+
+	if (sdev->vhost_fd == 0)
+		return;
+
+	irq = (struct kvm_irqfd) {
+		.gsi	= gsi,
+		.fd	= eventfd(0, 0),
+	};
+	file = (struct vhost_vring_file) {
+		.index	= vq,
+		.fd	= irq.fd,
+	};
+
+	r = ioctl(kvm->vm_fd, KVM_IRQFD, &irq);
+	if (r < 0)
+		die_perror("KVM_IRQFD failed");
+
+	r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_CALL, &file);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_CALL failed");
+
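+	/*
+	 * The vhost-scsi endpoint only needs to be set once; do it while
+	 * wiring up the first queue and skip it for the others.
+	 */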
+	if (vq > 0)
+		return;
+
+	r = ioctl(sdev->vhost_fd, VHOST_SCSI_SET_ENDPOINT, &sdev->target);
+	if (r != 0)
+		die("VHOST_SCSI_SET_ENDPOINT failed %d", errno);
+}
+
+static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd)
+{
+	struct scsi_dev *sdev = dev;
+	struct vhost_vring_file file = {
+		.index	= vq,
+		.fd	= efd,
+	};
+	int r;
+
+	if (sdev->vhost_fd == 0)
+		return;
+
+	r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_KICK, &file);
+	if (r < 0)
+		die_perror("VHOST_SET_VRING_KICK failed");
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct scsi_dev *sdev = dev;
+
+	return sdev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	return VIRTIO_SCSI_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	return size;
+}
+
+static struct virtio_ops scsi_dev_virtio_ops = (struct virtio_ops) {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.init_vq		= init_vq,
+	.get_pfn_vq		= get_pfn_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq		= set_size_vq,
+	.notify_vq		= notify_vq,
+	.notify_vq_gsi		= notify_vq_gsi,
+	.notify_vq_eventfd	= notify_vq_eventfd,
+};
+
+static void virtio_scsi_vhost_init(struct kvm *kvm, struct scsi_dev *sdev)
+{
+	struct vhost_memory *mem;
+	u64 features;
+	int r;
+
+	sdev->vhost_fd = open("/dev/vhost-scsi", O_RDWR);
+	if (sdev->vhost_fd < 0)
+		die_perror("Failed openning vhost-scsi device");
+
+	mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region));
+	if (mem == NULL)
+		die("Failed allocating memory for vhost memory map");
+
+	mem->nregions = 1;
+	mem->regions[0] = (struct vhost_memory_region) {
+		.guest_phys_addr	= 0,
+		.memory_size		= kvm->ram_size,
+		.userspace_addr		= (unsigned long)kvm->ram_start,
+	};
+
+	r = ioctl(sdev->vhost_fd, VHOST_SET_OWNER);
+	if (r != 0)
+		die_perror("VHOST_SET_OWNER failed");
+
+	r = ioctl(sdev->vhost_fd, VHOST_GET_FEATURES, &features);
+	if (r != 0)
+		die_perror("VHOST_GET_FEATURES failed");
+
+	r = ioctl(sdev->vhost_fd, VHOST_SET_FEATURES, &features);
+	if (r != 0)
+		die_perror("VHOST_SET_FEATURES failed");
+	r = ioctl(sdev->vhost_fd, VHOST_SET_MEM_TABLE, mem);
+	if (r != 0)
+		die_perror("VHOST_SET_MEM_TABLE failed");
+
+	sdev->vdev.use_vhost = true;
+
+	free(mem);
+}
+
+
+static int virtio_scsi_init_one(struct kvm *kvm, struct disk_image *disk)
+{
+	struct scsi_dev *sdev;
+
+	if (!disk)
+		return -EINVAL;
+
+	sdev = calloc(1, sizeof(struct scsi_dev));
+	if (sdev == NULL)
+		return -ENOMEM;
+
+	*sdev = (struct scsi_dev) {
+		.config	= (struct virtio_scsi_config) {
+			.num_queues	= NUM_VIRT_QUEUES - 2,
+			.seg_max	= VIRTIO_SCSI_CDB_SIZE - 2,
+			.max_sectors	= 65535,
+			.cmd_per_lun	= 128,
+			.sense_size	= VIRTIO_SCSI_SENSE_SIZE,
+			.cdb_size	= VIRTIO_SCSI_CDB_SIZE,
+			.max_channel	= 0,
+			.max_target	= 0,
+			.max_lun	= 16383,
+			.event_info_size = sizeof(struct virtio_scsi_event),
+		},
+		.kvm			= kvm,
+	};
+	strncpy((char *)&sdev->target.vhost_wwpn, disk->wwpn, sizeof(sdev->target.vhost_wwpn));
+	sdev->target.vhost_tpgt = strtol(disk->tpgt, NULL, 0);
+
+	virtio_init(kvm, sdev, &sdev->vdev, &scsi_dev_virtio_ops,
+		    VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_SCSI,
+		    VIRTIO_ID_SCSI, PCI_CLASS_BLK);
+
+	list_add_tail(&sdev->list, &sdevs);
+
+	virtio_scsi_vhost_init(kvm, sdev);
+
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-scsi", "CONFIG_VIRTIO_SCSI");
+
+	return 0;
+}
+
+static int virtio_scsi_exit_one(struct kvm *kvm, struct scsi_dev *sdev)
+{
+	int r;
+
+	r = ioctl(sdev->vhost_fd, VHOST_SCSI_CLEAR_ENDPOINT, &sdev->target);
+	if (r != 0)
+		die("VHOST_SCSI_CLEAR_ENDPOINT failed %d", errno);
+
+	list_del(&sdev->list);
+	free(sdev);
+
+	return 0;
+}
+
+int virtio_scsi_init(struct kvm *kvm)
+{
+	int i, r = 0;
+
+	for (i = 0; i < kvm->nr_disks; i++) {
+		if (!kvm->disks[i]->wwpn)
+			continue;
+		r = virtio_scsi_init_one(kvm, kvm->disks[i]);
+		if (r < 0)
+			goto cleanup;
+	}
+
+	return 0;
+cleanup:
+	return virtio_scsi_exit(kvm);
+}
+virtio_dev_init(virtio_scsi_init);
+
+int virtio_scsi_exit(struct kvm *kvm)
+{
+	while (!list_empty(&sdevs)) {
+		struct scsi_dev *sdev;
+
+		sdev = list_first_entry(&sdevs, struct scsi_dev, list);
+		virtio_scsi_exit_one(kvm, sdev);
+	}
+
+	return 0;
+}
+virtio_dev_exit(virtio_scsi_exit);
diff --git a/tools/kvm/x86/bios.c b/tools/kvm/x86/bios.c
new file mode 100644
index 0000000..f05cc02
--- /dev/null
+++ b/tools/kvm/x86/bios.c
@@ -0,0 +1,174 @@
+#include "kvm/kvm.h"
+#include "kvm/boot-protocol.h"
+#include "kvm/e820.h"
+#include "kvm/interrupt.h"
+#include "kvm/util.h"
+
+#include <string.h>
+#include <asm/e820.h>
+
+#include "bios/bios-rom.h"
+
+struct irq_handler {
+	unsigned long		address;
+	unsigned int		irq;
+	void			*handler;
+	size_t			size;
+};
+
+#define BIOS_IRQ_PA_ADDR(name)	(MB_BIOS_BEGIN + BIOS_OFFSET__##name)
+#define BIOS_IRQ_FUNC(name)	((char *)&bios_rom[BIOS_OFFSET__##name])
+#define BIOS_IRQ_SIZE(name)	(BIOS_ENTRY_SIZE(BIOS_OFFSET__##name))
+
+#define DEFINE_BIOS_IRQ_HANDLER(_irq, _handler)			\
+	{							\
+		.irq		= _irq,				\
+		.address	= BIOS_IRQ_PA_ADDR(_handler),	\
+		.handler	= BIOS_IRQ_FUNC(_handler),	\
+		.size		= BIOS_IRQ_SIZE(_handler),	\
+	}
+
+static struct irq_handler bios_irq_handlers[] = {
+	DEFINE_BIOS_IRQ_HANDLER(0x10, bios_int10),
+	DEFINE_BIOS_IRQ_HANDLER(0x15, bios_int15),
+};
+
+static void setup_irq_handler(struct kvm *kvm, struct irq_handler *handler)
+{
+	struct real_intr_desc intr_desc;
+	void *p;
+
+	p = guest_flat_to_host(kvm, handler->address);
+	memcpy(p, handler->handler, handler->size);
+
+	intr_desc = (struct real_intr_desc) {
+		.segment	= REAL_SEGMENT(MB_BIOS_BEGIN),
+		.offset		= handler->address - MB_BIOS_BEGIN,
+	};
+
+	DIE_IF((handler->address - MB_BIOS_BEGIN) > 0xffffUL);
+
+	interrupt_table__set(&kvm->arch.interrupt_table, &intr_desc, handler->irq);
+}
+
+/**
+ * e820_setup - setup some simple E820 memory map
+ * @kvm - guest system descriptor
+ */
+static void e820_setup(struct kvm *kvm)
+{
+	struct e820map *e820;
+	struct e820entry *mem_map;
+	unsigned int i = 0;
+
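+	/*
+	 * Minimal layout: usable RAM below the EBDA, reserved entries for
+	 * the EBDA and BIOS areas, then usable RAM from the kernel load
+	 * address upwards, split around the 32-bit PCI hole when the guest
+	 * has more RAM than fits below it.
+	 */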
+	e820		= guest_flat_to_host(kvm, E820_MAP_START);
+	mem_map		= e820->map;
+
+	mem_map[i++]	= (struct e820entry) {
+		.addr		= REAL_MODE_IVT_BEGIN,
+		.size		= EBDA_START - REAL_MODE_IVT_BEGIN,
+		.type		= E820_RAM,
+	};
+	mem_map[i++]	= (struct e820entry) {
+		.addr		= EBDA_START,
+		.size		= VGA_RAM_BEGIN - EBDA_START,
+		.type		= E820_RESERVED,
+	};
+	mem_map[i++]	= (struct e820entry) {
+		.addr		= MB_BIOS_BEGIN,
+		.size		= MB_BIOS_END - MB_BIOS_BEGIN,
+		.type		= E820_RESERVED,
+	};
+	if (kvm->ram_size < KVM_32BIT_GAP_START) {
+		mem_map[i++]	= (struct e820entry) {
+			.addr		= BZ_KERNEL_START,
+			.size		= kvm->ram_size - BZ_KERNEL_START,
+			.type		= E820_RAM,
+		};
+	} else {
+		mem_map[i++]	= (struct e820entry) {
+			.addr		= BZ_KERNEL_START,
+			.size		= KVM_32BIT_GAP_START - BZ_KERNEL_START,
+			.type		= E820_RAM,
+		};
+		mem_map[i++]	= (struct e820entry) {
+			.addr		= KVM_32BIT_MAX_MEM_SIZE,
+			.size		= kvm->ram_size - KVM_32BIT_MAX_MEM_SIZE,
+			.type		= E820_RAM,
+		};
+	}
+
+	BUG_ON(i > E820_X_MAX);
+
+	e820->nr_map = i;
+}
+
+static void setup_vga_rom(struct kvm *kvm)
+{
+	u16 *mode;
+	void *p;
+
+	p = guest_flat_to_host(kvm, VGA_ROM_OEM_STRING);
+	memset(p, 0, VGA_ROM_OEM_STRING_SIZE);
+	strncpy(p, "KVM VESA", VGA_ROM_OEM_STRING_SIZE);
+
+	mode = guest_flat_to_host(kvm, VGA_ROM_MODES);
+	mode[0]	= 0x0112;
+	mode[1] = 0xffff;
+}
+
+/**
+ * setup_bios - inject BIOS into guest memory
+ * @kvm - guest system descriptor
+ */
+void setup_bios(struct kvm *kvm)
+{
+	unsigned long address = MB_BIOS_BEGIN;
+	struct real_intr_desc intr_desc;
+	unsigned int i;
+	void *p;
+
+	/*
+	 * Before anything else, clear some known areas;
+	 * we definitely don't want any stale data there.
+	 */
+	p = guest_flat_to_host(kvm, BDA_START);
+	memset(p, 0, BDA_END - BDA_START);
+
+	p = guest_flat_to_host(kvm, EBDA_START);
+	memset(p, 0, EBDA_END - EBDA_START);
+
+	p = guest_flat_to_host(kvm, MB_BIOS_BEGIN);
+	memset(p, 0, MB_BIOS_END - MB_BIOS_BEGIN);
+
+	p = guest_flat_to_host(kvm, VGA_ROM_BEGIN);
+	memset(p, 0, VGA_ROM_END - VGA_ROM_BEGIN);
+
+	/* Just copy the BIOS ROM into place */
+	p = guest_flat_to_host(kvm, MB_BIOS_BEGIN);
+	memcpy(p, bios_rom, bios_rom_size);
+
+	/* E820 memory map must be present */
+	e820_setup(kvm);
+
+	/* VESA needs its own tricks */
+	setup_vga_rom(kvm);
+
+	/*
+	 * Set up a *fake* real-mode vector table; it has only
+	 * one real handler, which simply does an iret.
+	 */
+	address = BIOS_IRQ_PA_ADDR(bios_intfake);
+	intr_desc = (struct real_intr_desc) {
+		.segment	= REAL_SEGMENT(MB_BIOS_BEGIN),
+		.offset		= address - MB_BIOS_BEGIN,
+	};
+	interrupt_table__setup(&kvm->arch.interrupt_table, &intr_desc);
+
+	for (i = 0; i < ARRAY_SIZE(bios_irq_handlers); i++)
+		setup_irq_handler(kvm, &bios_irq_handlers[i]);
+
+	/* We're almost done */
+	p = guest_flat_to_host(kvm, 0);
+	interrupt_table__copy(&kvm->arch.interrupt_table, p, REAL_INTR_SIZE);
+}
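As an aside, the address arithmetic setup_irq_handler() performs is easy to check in isolation. Here is a minimal standalone sketch; the BIOS_OFFSET__bios_int10 value is hypothetical, the real one comes from the generated bios-rom.h:

	#include <stdio.h>

	#define MB_BIOS_BEGIN		0x000f0000
	#define REAL_SEGMENT(addr)	((addr) >> 4)

	/* Hypothetical value -- the real one is generated into bios-rom.h */
	#define BIOS_OFFSET__bios_int10	0x0040

	int main(void)
	{
		unsigned long pa = MB_BIOS_BEGIN + BIOS_OFFSET__bios_int10;

		/* An IVT entry stores segment:offset, not a flat address */
		unsigned int seg = REAL_SEGMENT(MB_BIOS_BEGIN);	/* 0xf000 */
		unsigned int off = pa - MB_BIOS_BEGIN;		/* 0x0040 */

		printf("int 0x10 -> %04x:%04x (flat 0x%05lx)\n", seg, off, pa);
		return 0;
	}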
diff --git a/tools/kvm/x86/bios/.gitignore b/tools/kvm/x86/bios/.gitignore
new file mode 100644
index 0000000..1f0080b
--- /dev/null
+++ b/tools/kvm/x86/bios/.gitignore
@@ -0,0 +1,3 @@
+bios-rom.bin
+bios-rom.bin.elf
+bios-rom.h
diff --git a/tools/kvm/x86/bios/bios-rom.S b/tools/kvm/x86/bios/bios-rom.S
new file mode 100644
index 0000000..3269ce9
--- /dev/null
+++ b/tools/kvm/x86/bios/bios-rom.S
@@ -0,0 +1,12 @@
+#include <kvm/assembly.h>
+
+	.org 0
+#ifdef CONFIG_X86_64
+	.code64
+#else
+	.code32
+#endif
+
+GLOBAL(bios_rom)
+	.incbin "x86/bios/bios.bin"
+END(bios_rom)
diff --git a/tools/kvm/x86/bios/e820.c b/tools/kvm/x86/bios/e820.c
new file mode 100644
index 0000000..a9bca29
--- /dev/null
+++ b/tools/kvm/x86/bios/e820.c
@@ -0,0 +1,72 @@
+#include "kvm/e820.h"
+
+#include "kvm/segment.h"
+#include "kvm/bios.h"
+
+#include <asm/processor-flags.h>
+#include <asm/e820.h>
+
+static inline void set_fs(u16 seg)
+{
+	asm volatile("movw %0,%%fs" : : "rm" (seg));
+}
+
+static inline u8 rdfs8(unsigned long addr)
+{
+	u8 v;
+
+	asm volatile("addr32 movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
+
+	return v;
+}
+
+static inline u32 rdfs32(unsigned long addr)
+{
+	u32 v;
+
+	asm volatile("addr32 movl %%fs:%1,%0" : "=q" (v) : "m" (*(u32 *)addr));
+
+	return v;
+}
+
+bioscall void e820_query_map(struct biosregs *regs)
+{
+	struct e820map *e820;
+	u32 map_size;
+	u16 fs_seg;
+	u32 ndx;
+
+	e820		= (struct e820map *)E820_MAP_START;
+	fs_seg		= flat_to_seg16(E820_MAP_START);
+	set_fs(fs_seg);
+
+	ndx		= regs->ebx;
+
+	map_size	= rdfs32(flat_to_off16((u32)&e820->nr_map, fs_seg));
+
+	if (ndx < map_size) {
+		u32 start;
+		unsigned int i;
+		u8 *p;
+
+		fs_seg	= flat_to_seg16(E820_MAP_START);
+		set_fs(fs_seg);
+
+		start	= (u32)&e820->map[ndx];
+
+		p	= (void *) regs->edi;
+
+		for (i = 0; i < sizeof(struct e820entry); i++)
+			*p++	= rdfs8(flat_to_off16(start + i, fs_seg));
+	}
+
+	regs->eax	= SMAP;
+	regs->ecx	= sizeof(struct e820entry);
+	regs->ebx	= ++ndx;
+
+	/* Clear CF to indicate success.  */
+	regs->eflags	&= ~X86_EFLAGS_CF;
+
+	if (ndx >= map_size)
+		regs->ebx	= 0;	/* end of map */
+}
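To see the other side of this protocol, a minimal sketch that walks an e820 map the way a guest consuming INT 15h/EAX=0xE820 would, with ebx acting as the continuation index. The structures are simplified local copies and the sample entries are made up:

	#include <stdio.h>

	/* Simplified local copies of the structures, for illustration only */
	struct e820entry {
		unsigned long long	addr;
		unsigned long long	size;
		unsigned int		type;	/* 1 = RAM, 2 = reserved */
	};

	struct e820map {
		unsigned int		nr_map;
		struct e820entry	map[4];
	};

	int main(void)
	{
		/* Made-up sample map; the real one is built by e820_setup() */
		struct e820map e820 = {
			.nr_map	= 2,
			.map	= {
				{ 0x00000000,  0x9fc00, 1 },
				{ 0x00100000, 0x7f00000, 1 },
			},
		};
		unsigned int ndx;

		/*
		 * In the real protocol, ebx carries this continuation index
		 * in, and ebx == 0 coming back out means the map is done.
		 */
		for (ndx = 0; ndx < e820.nr_map; ndx++)
			printf("%016llx-%016llx type %u\n",
			       e820.map[ndx].addr,
			       e820.map[ndx].addr + e820.map[ndx].size,
			       e820.map[ndx].type);
		return 0;
	}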
diff --git a/tools/kvm/x86/bios/entry.S b/tools/kvm/x86/bios/entry.S
new file mode 100644
index 0000000..85056e9
--- /dev/null
+++ b/tools/kvm/x86/bios/entry.S
@@ -0,0 +1,92 @@
+/*
+ * Our pretty trivial BIOS emulation
+ */
+
+#include <kvm/bios.h>
+#include <kvm/assembly.h>
+
+	.org 0
+	.code16gcc
+
+#define EFLAGS_CF	(1 << 0)
+
+#include "macro.S"
+
+/* If you change these macros, remember to update 'struct biosregs' */
+.macro SAVE_BIOSREGS
+	pushl	%fs
+	pushl	%es
+	pushl	%ds
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebp
+	pushl	%esp
+	pushl	%edx
+	pushl	%ecx
+	pushl	%ebx
+	pushl	%eax
+.endm
+
+.macro RESTORE_BIOSREGS
+	popl	%eax
+	popl	%ebx
+	popl	%ecx
+	popl	%edx
+	popl	%esp
+	popl	%ebp
+	popl	%esi
+	popl	%edi
+	popl	%ds
+	popl	%es
+	popl	%fs
+.endm
+
+/*
+ * Fake interrupt handler -- nothing could ever be faster.
+ */
+ENTRY(bios_intfake)
+	/*
+	 * Set CF to indicate failure. We don't want callers to think that the
+	 * interrupt handler succeeded and then treat the return values in
+	 * registers as valid data.
+	 */
+	orl	$EFLAGS_CF, 0x4(%esp)
+
+	IRET
+ENTRY_END(bios_intfake)
+
+/*
+ * int 10 - video - service
+ */
+ENTRY(bios_int10)
+	SAVE_BIOSREGS
+
+	movl		%esp, %eax
+	/*
+	 * This is way easier than doing it in assembly: just push
+	 * all the regs and call into a C handler.
+	 */
+	call	int10_handler
+
+	RESTORE_BIOSREGS
+
+	/* Clear CF to indicate success.  */
+	andl	$~EFLAGS_CF, 0x4(%esp)
+
+	IRET
+ENTRY_END(bios_int10)
+
+ENTRY(bios_int15)
+	SAVE_BIOSREGS
+
+	movl	%esp, %eax
+	call	int15_handler
+
+	RESTORE_BIOSREGS
+
+	IRET
+ENTRY_END(bios_int15)
+
+GLOBAL(__locals)
+
+#include "local.S"
+
+END(__locals)
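The 'movl %esp, %eax; call int10_handler' sequence works because bioscall is regparm(3), so the first argument travels in %eax, and SAVE_BIOSREGS pushes the registers in reverse member order of struct biosregs. A small sketch (with a local copy of the struct, defined for real in kvm/bios.h later in this series) that prints the resulting offsets:

	#include <stddef.h>
	#include <stdio.h>

	typedef unsigned int u32;

	/* Local copy; the real definition is in kvm/bios.h */
	struct biosregs {
		u32 eax, ebx, ecx, edx, esp, ebp, esi, edi, ds, es, fs, eip, eflags;
	};

	int main(void)
	{
		/*
		 * SAVE_BIOSREGS pushes %fs first and %eax last, so the stack
		 * grows down into exactly this layout: %eax lands at offset 0
		 * and %fs at offset 40. eip/eflags beyond that come from the
		 * interrupt frame itself.
		 */
		printf("eax at %zu, fs at %zu, total %zu bytes\n",
		       offsetof(struct biosregs, eax),
		       offsetof(struct biosregs, fs),
		       sizeof(struct biosregs));
		return 0;
	}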
diff --git a/tools/kvm/x86/bios/gen-offsets.sh b/tools/kvm/x86/bios/gen-offsets.sh
new file mode 100644
index 0000000..8771bbe
--- /dev/null
+++ b/tools/kvm/x86/bios/gen-offsets.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+echo "/* Autogenerated file, don't edit */"
+echo "#ifndef BIOS_OFFSETS_H"
+echo "#define BIOS_OFFSETS_H"
+
+echo ""
+echo "#define BIOS_ENTRY_SIZE(name) (name##_end - name)"
+echo ""
+
+nm bios.bin.elf | grep ' [Tt] ' | awk '{ print "#define BIOS_OFFSET__" $3 " 0x" $1; }'
+
+echo ""
+echo "#endif"
diff --git a/tools/kvm/x86/bios/int10.c b/tools/kvm/x86/bios/int10.c
new file mode 100644
index 0000000..7cc0b3f
--- /dev/null
+++ b/tools/kvm/x86/bios/int10.c
@@ -0,0 +1,110 @@
+#include "kvm/segment.h"
+#include "kvm/bios.h"
+#include "kvm/vesa.h"
+
+#include "bios/memcpy.h"
+
+#include <boot/vesa.h>
+
+static far_ptr gen_far_ptr(unsigned int pa)
+{
+	far_ptr ptr;
+
+	ptr.seg = (pa >> 4);
+	ptr.off = pa - (ptr.seg << 4);
+
+	return ptr;
+}
+
+static inline void outb(unsigned short port, unsigned char val)
+{
+	asm volatile("outb %0, %1" : : "a"(val), "Nd"(port));
+}
+
+/*
+ * It's probably much more useful to make this print to the serial
+ * line rather than to non-displayed VGA memory.
+ */
+static inline void int10_putchar(struct biosregs *args)
+{
+	u8 al = args->eax & 0xFF;
+
+	outb(0x3f8, al);
+}
+
+static void vbe_get_mode(struct biosregs *args)
+{
+	struct vesa_mode_info *info = (struct vesa_mode_info *) args->edi;
+
+	*info = (struct vesa_mode_info) {
+		.mode_attr		= 0xd9, /* 11011001 */
+		.logical_scan		= VESA_WIDTH*4,
+		.h_res			= VESA_WIDTH,
+		.v_res			= VESA_HEIGHT,
+		.bpp			= VESA_BPP,
+		.memory_layout		= 6,
+		.memory_planes		= 1,
+		.lfb_ptr		= VESA_MEM_ADDR,
+		.rmask			= 8,
+		.gmask			= 8,
+		.bmask			= 8,
+		.resv_mask		= 8,
+		.resv_pos		= 24,
+		.bpos			= 16,
+		.gpos			= 8,
+	};
+}
+
+static void vbe_get_info(struct biosregs *args)
+{
+	struct vesa_general_info *infop = (struct vesa_general_info *) args->edi;
+	struct vesa_general_info info;
+
+	info = (struct vesa_general_info) {
+		.signature		= VESA_MAGIC,
+		.version		= 0x102,
+		.vendor_string		= gen_far_ptr(VGA_ROM_BEGIN),
+		.capabilities		= 0x10,
+		.video_mode_ptr		= gen_far_ptr(VGA_ROM_MODES),
+		.total_memory		= (4 * VESA_WIDTH * VESA_HEIGHT) / 0x10000,
+	};
+
+	memcpy16(args->es, infop, args->ds, &info, sizeof(info));
+}
+
+#define VBE_STATUS_OK		0x004F
+
+static void int10_vesa(struct biosregs *args)
+{
+	u8 al;
+
+	al = args->eax & 0xff;
+
+	switch (al) {
+	case 0x00:
+		vbe_get_info(args);
+		break;
+	case 0x01:
+		vbe_get_mode(args);
+		break;
+	}
+
+	args->eax = VBE_STATUS_OK;
+}
+
+bioscall void int10_handler(struct biosregs *args)
+{
+	u8 ah;
+
+	ah = (args->eax & 0xff00) >> 8;
+
+	switch (ah) {
+	case 0x0e:
+		int10_putchar(args);
+		break;
+	case 0x4f:
+		int10_vesa(args);
+		break;
+	}
+
+}
diff --git a/tools/kvm/x86/bios/int15.c b/tools/kvm/x86/bios/int15.c
new file mode 100644
index 0000000..faf5343
--- /dev/null
+++ b/tools/kvm/x86/bios/int15.c
@@ -0,0 +1,18 @@
+#include "kvm/bios.h"
+
+#include "kvm/e820.h"
+
+#include <asm/processor-flags.h>
+
+bioscall void int15_handler(struct biosregs *regs)
+{
+	switch (regs->eax) {
+	case 0xe820:
+		e820_query_map(regs);
+		break;
+	default:
+		/* Set CF to indicate failure.  */
+		regs->eflags	|= X86_EFLAGS_CF;
+		break;
+	}
+}
diff --git a/tools/kvm/x86/bios/local.S b/tools/kvm/x86/bios/local.S
new file mode 100644
index 0000000..f2cdbf4
--- /dev/null
+++ b/tools/kvm/x86/bios/local.S
@@ -0,0 +1,7 @@
+/*
+ * Local variables for almost every BIOS IRQ handler.
+ * They must be placed somewhere inside the IRQ handler body.
+ */
+__CALLER_SS:		.int  0
+__CALLER_SP:		.long 0
+__CALLER_CLOBBER:	.long 0
diff --git a/tools/kvm/x86/bios/macro.S b/tools/kvm/x86/bios/macro.S
new file mode 100644
index 0000000..0d5e567e
--- /dev/null
+++ b/tools/kvm/x86/bios/macro.S
@@ -0,0 +1,25 @@
+/*
+ * handy BIOS macros
+ */
+
+/*
+ * switch to BIOS stack
+ */
+.macro stack_swap
+	movw %ss, %cs:(__CALLER_SS)
+	movl %esp, %cs:(__CALLER_SP)
+	movl %edx, %cs:(__CALLER_CLOBBER)
+	movw $MB_BIOS_SS, %dx
+	movw %dx, %ss
+	movw $MB_BIOS_SP, %sp
+	movl %cs:(__CALLER_CLOBBER), %edx
+.endm
+
+/*
+ * restore the original stack
+ */
+.macro stack_restore
+	movl %cs:(__CALLER_SP), %esp
+	movw %cs:(__CALLER_SS), %ss
+.endm
+
diff --git a/tools/kvm/x86/bios/memcpy.c b/tools/kvm/x86/bios/memcpy.c
new file mode 100644
index 0000000..40b9b65
--- /dev/null
+++ b/tools/kvm/x86/bios/memcpy.c
@@ -0,0 +1,23 @@
+#include "bios/memcpy.h"
+
+/*
+ *  Copy memory area in 16-bit real mode.
+ */
+void memcpy16(u16 dst_seg, void *dst, u16 src_seg, const void *src, size_t len)
+{
+	__asm__ __volatile__ (
+		"pushw	%%ds				\n"
+		"pushw	%%es				\n"
+		"movw	%[src_seg], %%ds		\n"
+		"movw	%[dst_seg], %%es		\n"
+		"rep movsb %%ds:(%%si), %%es:(%%di)	\n"
+		"popw	%%es				\n"
+		"popw	%%ds				\n"
+		:
+		: "S"(src),
+		  "D"(dst),
+		  "c"(len),
+		  [src_seg] "r"(src_seg),
+		  [dst_seg] "r"(dst_seg)
+		: "cc", "memory");
+}
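memcpy16() trades in 16-bit segment:offset pairs, which callers derive from flat addresses with the flat_to_seg16()/flat_to_off16() helpers seen in e820.c above. Their definitions live elsewhere in the tree, so the versions below are assumptions about their behavior, shown as a standalone sketch:

	#include <stdio.h>

	typedef unsigned short u16;
	typedef unsigned int u32;

	/* Assumed behavior of the helpers used by e820_query_map() */
	static u16 flat_to_seg16(u32 flat)
	{
		return flat >> 4;
	}

	static u16 flat_to_off16(u32 flat, u16 seg)
	{
		return flat - ((u32)seg << 4);
	}

	int main(void)
	{
		u32 flat = 0x9fc00;	/* EBDA_START, where the e820 map lives */
		u16 seg	 = flat_to_seg16(flat);

		printf("0x%05x -> %04x:%04x\n", flat, seg, flat_to_off16(flat, seg));
		return 0;
	}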
diff --git a/tools/kvm/x86/bios/rom.ld.S b/tools/kvm/x86/bios/rom.ld.S
new file mode 100644
index 0000000..f4f1835
--- /dev/null
+++ b/tools/kvm/x86/bios/rom.ld.S
@@ -0,0 +1,16 @@
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+
+SECTIONS {
+	.text 0 : {
+		*(.text)
+	}
+
+	/DISCARD/ : {
+		*(.debug*)
+		*(.data)
+		*(.bss)
+		*(.eh_frame*)
+	}
+}
+
diff --git a/tools/kvm/x86/boot.c b/tools/kvm/x86/boot.c
new file mode 100644
index 0000000..61535eb
--- /dev/null
+++ b/tools/kvm/x86/boot.c
@@ -0,0 +1,41 @@
+#include "kvm/kvm.h"
+
+#include "kvm/util.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <fcntl.h>
+
+#define BIOS_SELECTOR	0xf000
+#define BIOS_IP		0xfff0
+#define BIOS_SP		0x8000
+
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+	struct stat st;
+	void *p;
+	int fd;
+	int nr;
+
+	fd = open(firmware_filename, O_RDONLY);
+	if (fd < 0)
+		return false;
+
+	if (fstat(fd, &st))
+		return false;
+
+	if (st.st_size > MB_FIRMWARE_BIOS_SIZE)
+		die("firmware image %s is too big to fit in memory (%Lu KB).\n", firmware_filename, (u64)(st.st_size / 1024));
+
+	p = guest_flat_to_host(kvm, MB_FIRMWARE_BIOS_BEGIN);
+
+	while ((nr = read(fd, p, st.st_size)) > 0)
+		p += nr;
+
+	kvm->arch.boot_selector	= BIOS_SELECTOR;
+	kvm->arch.boot_ip	= BIOS_IP;
+	kvm->arch.boot_sp	= BIOS_SP;
+
+	return true;
+}
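The BIOS_SELECTOR:BIOS_IP pair above is the classic x86 reset vector: f000:fff0, 16 bytes below the 1 MB boundary. A tiny sketch of the arithmetic (segment_to_flat() here is a local stand-in for the helper used elsewhere):

	#include <stdio.h>

	#define BIOS_SELECTOR	0xf000
	#define BIOS_IP		0xfff0

	/* Local stand-in for the segment arithmetic helper used elsewhere */
	static unsigned long segment_to_flat(unsigned short sel, unsigned short off)
	{
		return ((unsigned long)sel << 4) + off;
	}

	int main(void)
	{
		/* f000:fff0 = 0xffff0 -- the classic x86 reset vector */
		printf("firmware entry: %04x:%04x -> flat 0x%05lx\n",
		       BIOS_SELECTOR, BIOS_IP,
		       segment_to_flat(BIOS_SELECTOR, BIOS_IP));
		return 0;
	}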
diff --git a/tools/kvm/x86/cpuid.c b/tools/kvm/x86/cpuid.c
new file mode 100644
index 0000000..c3b67d9
--- /dev/null
+++ b/tools/kvm/x86/cpuid.c
@@ -0,0 +1,89 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include <sys/ioctl.h>
+#include <stdlib.h>
+
+#define	MAX_KVM_CPUID_ENTRIES		100
+
+static void filter_cpuid(struct kvm_cpuid2 *kvm_cpuid)
+{
+	unsigned int signature[3];
+	unsigned int i;
+
+	/*
+	 * Filter CPUID functions that are not supported by the hypervisor.
+	 */
+	for (i = 0; i < kvm_cpuid->nent; i++) {
+		struct kvm_cpuid_entry2 *entry = &kvm_cpuid->entries[i];
+
+		switch (entry->function) {
+		case 0:
+			/* Vendor name */
+			memcpy(signature, "LKVMLKVMLKVM", 12);
+			entry->ebx = signature[0];
+			entry->ecx = signature[1];
+			entry->edx = signature[2];
+			break;
+		case 1:
+			/* Set X86_FEATURE_HYPERVISOR */
+			if (entry->index == 0)
+				entry->ecx |= (1 << 31);
+			break;
+		case 6:
+			/* Clear X86_FEATURE_EPB */
+			entry->ecx = entry->ecx & ~(1 << 3);
+			break;
+		case 10: { /* Architectural Performance Monitoring */
+			union cpuid10_eax {
+				struct {
+					unsigned int version_id		:8;
+					unsigned int num_counters	:8;
+					unsigned int bit_width		:8;
+					unsigned int mask_length	:8;
+				} split;
+				unsigned int full;
+			} eax;
+
+			/*
+			 * If the host has the perf system running but no
+			 * architectural events are available through the
+			 * kvm pmu, disable perf support so the guest won't
+			 * even try to access the msr registers.
+			 */
+			if (entry->eax) {
+				eax.full = entry->eax;
+				if (eax.split.version_id != 2 ||
+				    !eax.split.num_counters)
+					entry->eax = 0;
+			}
+			break;
+		}
+		default:
+			/* Keep the CPUID function as-is */
+			break;
+		}
+	}
+}
+
+void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu)
+{
+	struct kvm_cpuid2 *kvm_cpuid;
+
+	kvm_cpuid = calloc(1, sizeof(*kvm_cpuid) +
+				MAX_KVM_CPUID_ENTRIES * sizeof(*kvm_cpuid->entries));
+
+	kvm_cpuid->nent = MAX_KVM_CPUID_ENTRIES;
+	if (ioctl(vcpu->kvm->sys_fd, KVM_GET_SUPPORTED_CPUID, kvm_cpuid) < 0)
+		die_perror("KVM_GET_SUPPORTED_CPUID failed");
+
+	filter_cpuid(kvm_cpuid);
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_CPUID2, kvm_cpuid) < 0)
+		die_perror("KVM_SET_CPUID2 failed");
+
+	free(kvm_cpuid);
+}
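From inside the guest, the effect of filter_cpuid() is observable with plain CPUID: leaf 0 carries "LKVMLKVMLKVM" in ebx/ecx/edx (note the ordering differs from the usual ebx/edx/ecx vendor-string convention) and leaf 1 has the hypervisor bit set in ecx bit 31. A guest-side sketch, only meaningful when actually run under this VMM:

	#include <stdio.h>
	#include <string.h>

	static void cpuid(unsigned int leaf, unsigned int *a, unsigned int *b,
			  unsigned int *c, unsigned int *d)
	{
		asm volatile("cpuid"
			     : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
			     : "0" (leaf), "2" (0));
	}

	int main(void)
	{
		unsigned int a, b, c, d;
		char sig[13];

		/* filter_cpuid() stores the signature in ebx, ecx, edx order */
		cpuid(0, &a, &b, &c, &d);
		memcpy(sig + 0, &b, 4);
		memcpy(sig + 4, &c, 4);
		memcpy(sig + 8, &d, 4);
		sig[12] = '\0';

		/* ... and sets the hypervisor bit in leaf 1 */
		cpuid(1, &a, &b, &c, &d);

		printf("vendor: %s, hypervisor bit: %s\n",
		       sig, (c & (1u << 31)) ? "set" : "clear");
		return 0;
	}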
diff --git a/tools/kvm/x86/include/kvm/assembly.h b/tools/kvm/x86/include/kvm/assembly.h
new file mode 100644
index 0000000..e70baab
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/assembly.h
@@ -0,0 +1,24 @@
+#ifndef ASSEMBLY_H_
+#define ASSEMBLY_H_
+
+#define __ALIGN	.p2align 4, 0x90
+#define ENTRY(name)	\
+	__ALIGN;	\
+	.globl name;	\
+	name:
+
+#define GLOBAL(name)	\
+	.globl name;	\
+	name:
+
+#define ENTRY_END(name)	GLOBAL(name##_end)
+#define END(name)	GLOBAL(name##_end)
+
+/*
+ * gas produces a size override prefix that we are
+ * unhappy with, so let's hardcode the opcode for
+ * 16-bit mode
+ */
+#define IRET	.byte 0xcf
+
+#endif /* ASSEMBLY_H_ */
diff --git a/tools/kvm/x86/include/kvm/barrier.h b/tools/kvm/x86/include/kvm/barrier.h
new file mode 100644
index 0000000..46d14f6
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/barrier.h
@@ -0,0 +1,20 @@
+#ifndef _KVM_BARRIER_H_
+#define _KVM_BARRIER_H_
+
+#define barrier() asm volatile("": : :"memory")
+
+#define mb()	asm volatile ("mfence": : :"memory")
+#define rmb()	asm volatile ("lfence": : :"memory")
+#define wmb()	asm volatile ("sfence": : :"memory")
+
+#ifdef CONFIG_SMP
+#define smp_mb()	mb()
+#define smp_rmb()	rmb()
+#define smp_wmb()	wmb()
+#else
+#define smp_mb()	barrier()
+#define smp_rmb()	barrier()
+#define smp_wmb()	barrier()
+#endif
+
+#endif /* _KVM_BARRIER_H_ */
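The canonical use of these barriers is the publish/consume pairing: the writer orders its data store before the flag store with smp_wmb(), and the reader orders the flag load before the data load with smp_rmb(). A single-threaded sketch that just shows the pairing (the x86 fence definitions are inlined so it compiles standalone):

	#include <stdio.h>

	#define barrier()	asm volatile("" : : : "memory")
	#define smp_wmb()	asm volatile("sfence" : : : "memory")
	#define smp_rmb()	asm volatile("lfence" : : : "memory")

	static int data;
	static volatile int ready;

	static void produce(int v)
	{
		data = v;
		smp_wmb();	/* order the data store before the flag store */
		ready = 1;
	}

	static int consume(void)
	{
		while (!ready)
			barrier();
		smp_rmb();	/* order the flag load before the data load */
		return data;
	}

	int main(void)
	{
		/* Single-threaded here; the ordering matters across CPUs */
		produce(42);
		printf("%d\n", consume());
		return 0;
	}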
diff --git a/tools/kvm/x86/include/kvm/bios-export.h b/tools/kvm/x86/include/kvm/bios-export.h
new file mode 100644
index 0000000..23825aa
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/bios-export.h
@@ -0,0 +1,13 @@
+#ifndef BIOS_EXPORT_H_
+#define BIOS_EXPORT_H_
+
+struct kvm;
+
+extern char bios_rom[0];
+extern char bios_rom_end[0];
+
+#define bios_rom_size		(bios_rom_end - bios_rom)
+
+extern void setup_bios(struct kvm *kvm);
+
+#endif /* BIOS_EXPORT_H_ */
diff --git a/tools/kvm/x86/include/kvm/bios.h b/tools/kvm/x86/include/kvm/bios.h
new file mode 100644
index 0000000..ec7ed71
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/bios.h
@@ -0,0 +1,93 @@
+#ifndef BIOS_H_
+#define BIOS_H_
+
+/*
+ * X86-32 Memory Map (typical)
+ *					start      end
+ * Real Mode Interrupt Vector Table	0x00000000 0x000003FF
+ * BDA area				0x00000400 0x000004FF
+ * Conventional Low Memory		0x00000500 0x0009FBFF
+ * EBDA area				0x0009FC00 0x0009FFFF
+ * VIDEO RAM				0x000A0000 0x000BFFFF
+ * VIDEO ROM (BIOS)			0x000C0000 0x000C7FFF
+ * ROMs & unus. space (mapped hw & misc)0x000C8000 0x000EFFFF 160 KiB (typically)
+ * Motherboard BIOS			0x000F0000 0x000FFFFF
+ * Extended Memory			0x00100000 0xFEBFFFFF
+ * Reserved (configs, ACPI, PnP, etc)	0xFEC00000 0xFFFFFFFF
+ */
+
+#define REAL_MODE_IVT_BEGIN		0x00000000
+#define REAL_MODE_IVT_END		0x000003ff
+
+#define BDA_START			0x00000400
+#define BDA_END				0x000004ff
+
+#define EBDA_START			0x0009fc00
+#define EBDA_END			0x0009ffff
+
+#define E820_MAP_START			EBDA_START
+
+#define MB_BIOS_BEGIN			0x000f0000
+#define MB_FIRMWARE_BIOS_BEGIN		0x000e0000
+#define MB_BIOS_END			0x000fffff
+
+#define MB_BIOS_SIZE			(MB_BIOS_END - MB_BIOS_BEGIN + 1)
+#define MB_FIRMWARE_BIOS_SIZE		(MB_BIOS_END - MB_FIRMWARE_BIOS_BEGIN + 1)
+
+#define VGA_RAM_BEGIN			0x000a0000
+#define VGA_RAM_END			0x000bffff
+
+#define VGA_ROM_BEGIN			0x000c0000
+#define VGA_ROM_OEM_STRING		VGA_ROM_BEGIN
+#define VGA_ROM_OEM_STRING_SIZE		16
+#define VGA_ROM_MODES			(VGA_ROM_OEM_STRING + VGA_ROM_OEM_STRING_SIZE)
+#define VGA_ROM_MODES_SIZE		32
+#define VGA_ROM_END			0x000c7fff
+
+/* we handle one page only */
+#define VGA_RAM_SEG			(VGA_RAM_BEGIN >> 4)
+#define VGA_PAGE_SIZE			0x007d0 /* 80x25 */
+
+/* real mode interrupt vector table */
+#define REAL_INTR_BASE			REAL_MODE_IVT_BEGIN
+#define REAL_INTR_VECTORS		256
+
+/*
+ * The BIOS stack must be at an absolute, predefined memory address;
+ * we reserve 64 bytes for it.
+ */
+#define MB_BIOS_SS			0xfff7
+#define MB_BIOS_SP			0x40
+
+/*
+ * When interfacing with assembler code we need to be sure how
+ * arguments are passed in real mode.
+ */
+#define bioscall __attribute__((regparm(3)))
+
+#ifndef __ASSEMBLER__
+
+#include <linux/types.h>
+
+struct biosregs {
+	u32			eax;
+	u32			ebx;
+	u32			ecx;
+	u32			edx;
+	u32			esp;
+	u32			ebp;
+	u32			esi;
+	u32			edi;
+	u32			ds;
+	u32			es;
+	u32			fs;
+	u32			eip;
+	u32			eflags;
+};
+
+extern bioscall void int10_handler(struct biosregs *regs);
+extern bioscall void int15_handler(struct biosregs *regs);
+
+#endif
+
+#endif /* BIOS_H_ */
diff --git a/tools/kvm/x86/include/kvm/boot-protocol.h b/tools/kvm/x86/include/kvm/boot-protocol.h
new file mode 100644
index 0000000..85b637f
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/boot-protocol.h
@@ -0,0 +1,16 @@
+/*
+ * Linux boot protocol specifics
+ */
+
+#ifndef BOOT_PROTOCOL_H_
+#define BOOT_PROTOCOL_H_
+
+/*
+ * The protected mode kernel part of a modern bzImage is loaded
+ * at 1 MB by default.
+ */
+#define BZ_DEFAULT_SETUP_SECTS		4
+#define BZ_KERNEL_START			0x100000UL
+#define INITRD_START			0x1000000UL
+
+#endif /* BOOT_PROTOCOL_H_ */
diff --git a/tools/kvm/x86/include/kvm/cpufeature.h b/tools/kvm/x86/include/kvm/cpufeature.h
new file mode 100644
index 0000000..bc4abbb
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/cpufeature.h
@@ -0,0 +1,41 @@
+#ifndef KVM__CPUFEATURE_H
+#define KVM__CPUFEATURE_H
+
+#define CPUID_VENDOR_INTEL_1 0x756e6547 /* "Genu" */
+#define CPUID_VENDOR_INTEL_2 0x49656e69 /* "ineI" */
+#define CPUID_VENDOR_INTEL_3 0x6c65746e /* "ntel" */
+
+#define CPUID_VENDOR_AMD_1   0x68747541 /* "Auth" */
+#define CPUID_VENDOR_AMD_2   0x69746e65 /* "enti" */
+#define CPUID_VENDOR_AMD_3   0x444d4163 /* "cAMD" */
+
+/*
+ * CPUID flags we need to deal with
+ */
+#define KVM__X86_FEATURE_VMX		5	/* Hardware virtualization */
+#define KVM__X86_FEATURE_SVM		2	/* Secure virtual machine */
+#define KVM__X86_FEATURE_XSAVE		26	/* XSAVE/XRSTOR/XSETBV/XGETBV */
+
+#define cpu_feature_disable(reg, feature)	\
+	((reg) & ~(1 << (feature)))
+#define cpu_feature_enable(reg, feature)	\
+	((reg) |  (1 << (feature)))
+
+struct cpuid_regs {
+	u32	eax;
+	u32	ebx;
+	u32	ecx;
+	u32	edx;
+};
+
+static inline void host_cpuid(struct cpuid_regs *regs)
+{
+	asm volatile("cpuid"
+		: "=a" (regs->eax),
+		  "=b" (regs->ebx),
+		  "=c" (regs->ecx),
+		  "=d" (regs->edx)
+		: "0" (regs->eax), "2" (regs->ecx));
+}
+
+#endif /* KVM__CPUFEATURE_H */
diff --git a/tools/kvm/x86/include/kvm/interrupt.h b/tools/kvm/x86/include/kvm/interrupt.h
new file mode 100644
index 0000000..00c7ed7
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/interrupt.h
@@ -0,0 +1,26 @@
+#ifndef KVM__INTERRUPT_H
+#define KVM__INTERRUPT_H
+
+#include <linux/types.h>
+#include "kvm/bios.h"
+#include "kvm/bios-export.h"
+
+struct real_intr_desc {
+	u16 offset;
+	u16 segment;
+} __attribute__((packed));
+
+#define REAL_SEGMENT_SHIFT	4
+#define REAL_SEGMENT(addr)	((addr) >> REAL_SEGMENT_SHIFT)
+#define REAL_OFFSET(addr)	((addr) & ((1 << REAL_SEGMENT_SHIFT) - 1))
+#define REAL_INTR_SIZE		(REAL_INTR_VECTORS * sizeof(struct real_intr_desc))
+
+struct interrupt_table {
+	struct real_intr_desc entries[REAL_INTR_VECTORS];
+};
+
+void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size);
+void interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry);
+void interrupt_table__set(struct interrupt_table *itable, struct real_intr_desc *entry, unsigned int num);
+
+#endif /* KVM__INTERRUPT_H */
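REAL_SEGMENT()/REAL_OFFSET() give the canonical decomposition of a flat address into a real-mode segment:offset pair with a 0-15 offset; setup_bios() instead keeps the segment fixed at REAL_SEGMENT(MB_BIOS_BEGIN) and uses larger offsets, which addresses the same bytes. A quick sketch:

	#include <stdio.h>

	#define REAL_SEGMENT_SHIFT	4
	#define REAL_SEGMENT(addr)	((addr) >> REAL_SEGMENT_SHIFT)
	#define REAL_OFFSET(addr)	((addr) & ((1 << REAL_SEGMENT_SHIFT) - 1))

	int main(void)
	{
		unsigned long addr = 0xf0040;

		/* 256 vectors x 4 bytes (u16 offset + u16 segment) = 1 KiB IVT */
		printf("IVT size: %d bytes\n", 256 * 4);
		printf("0x%05lx -> %04lx:%04lx\n",
		       addr, REAL_SEGMENT(addr), REAL_OFFSET(addr));
		return 0;
	}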
diff --git a/tools/kvm/x86/include/kvm/kvm-arch.h b/tools/kvm/x86/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..673bdf1
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/kvm-arch.h
@@ -0,0 +1,42 @@
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#include "kvm/interrupt.h"
+#include "kvm/segment.h"
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <time.h>
+
+/*
+ * The hole includes VESA framebuffer and PCI memory.
+ */
+#define KVM_32BIT_MAX_MEM_SIZE  (1ULL << 32)
+#define KVM_32BIT_GAP_SIZE	(768 << 20)
+#define KVM_32BIT_GAP_START	(KVM_32BIT_MAX_MEM_SIZE - KVM_32BIT_GAP_SIZE)
+
+#define KVM_MMIO_START		KVM_32BIT_GAP_START
+
+/* This is the address that pci_get_io_space_block() starts allocating
+ * from.  Note that this is a PCI bus address (though same on x86).
+ */
+#define KVM_IOPORT_AREA		0x0
+#define KVM_PCI_CFG_AREA	(KVM_MMIO_START + 0x1000000)
+#define KVM_PCI_MMIO_AREA	(KVM_MMIO_START + 0x2000000)
+#define KVM_VIRTIO_MMIO_AREA	(KVM_MMIO_START + 0x3000000)
+
+#define KVM_IRQ_OFFSET		5
+
+#define KVM_VM_TYPE		0
+
+#define VIRTIO_DEFAULT_TRANS(kvm)	VIRTIO_PCI
+
+struct kvm_arch {
+	u16			boot_selector;
+	u16			boot_ip;
+	u16			boot_sp;
+
+	struct interrupt_table	interrupt_table;
+};
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/tools/kvm/x86/include/kvm/kvm-config-arch.h b/tools/kvm/x86/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..3eae8db
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/kvm-config-arch.h
@@ -0,0 +1,15 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+#include "kvm/parse-options.h"
+
+struct kvm_config_arch {
+	int vidmode;
+};
+
+#define OPT_ARCH_RUN(pfx, cfg)						\
+	pfx,								\
+	OPT_GROUP("BIOS options:"),					\
+	OPT_INTEGER('\0', "vidmode", &(cfg)->vidmode, "Video mode"),
+
+#endif /* KVM__KVM_CONFIG_ARCH_H */
diff --git a/tools/kvm/x86/include/kvm/kvm-cpu-arch.h b/tools/kvm/x86/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..89c8059
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,49 @@
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+/* Architecture-specific kvm_cpu definitions. */
+
+#include <linux/kvm.h>	/* for struct kvm_regs */
+#include "kvm/kvm.h"	/* for kvm__emulate_{mm}io() */
+#include <stdbool.h>
+#include <pthread.h>
+
+struct kvm;
+
+struct kvm_cpu {
+	pthread_t		thread;		/* VCPU thread */
+
+	unsigned long		cpu_id;
+
+	struct kvm		*kvm;		/* parent KVM */
+	int			vcpu_fd;	/* For VCPU ioctls() */
+	struct kvm_run		*kvm_run;
+
+	struct kvm_regs		regs;
+	struct kvm_sregs	sregs;
+	struct kvm_fpu		fpu;
+
+	struct kvm_msrs		*msrs;		/* dynamically allocated */
+
+	u8			is_running;
+	u8			paused;
+	u8			needs_nmi;
+
+	struct kvm_coalesced_mmio_ring	*ring;
+};
+
+/*
+ * As these are such simple wrappers, let's have them in the header so they'll
+ * be cheaper to call:
+ */
+static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count)
+{
+	return kvm__emulate_io(vcpu, port, data, direction, size, count);
+}
+
+static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+	return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
+}
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/tools/kvm/x86/include/kvm/mptable.h b/tools/kvm/x86/include/kvm/mptable.h
new file mode 100644
index 0000000..9e3cfa6
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/mptable.h
@@ -0,0 +1,9 @@
+#ifndef KVM_MPTABLE_H_
+#define KVM_MPTABLE_H_
+
+struct kvm;
+
+int mptable__init(struct kvm *kvm);
+int mptable__exit(struct kvm *kvm);
+
+#endif /* KVM_MPTABLE_H_ */
diff --git a/tools/kvm/x86/interrupt.c b/tools/kvm/x86/interrupt.c
new file mode 100644
index 0000000..7d47869
--- /dev/null
+++ b/tools/kvm/x86/interrupt.c
@@ -0,0 +1,28 @@
+#include "kvm/interrupt.h"
+
+#include "kvm/util.h"
+
+#include <string.h>
+
+void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size)
+{
+	if (size < sizeof(itable->entries))
+		die("An attempt to overwrite host memory");
+
+	memcpy(dst, itable->entries, sizeof(itable->entries));
+}
+
+void interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry)
+{
+	unsigned int i;
+
+	for (i = 0; i < REAL_INTR_VECTORS; i++)
+		itable->entries[i] = *entry;
+}
+
+void interrupt_table__set(struct interrupt_table *itable,
+				struct real_intr_desc *entry, unsigned int num)
+{
+	if (num < REAL_INTR_VECTORS)
+		itable->entries[num] = *entry;
+}
diff --git a/tools/kvm/x86/ioport.c b/tools/kvm/x86/ioport.c
new file mode 100644
index 0000000..8572c758
--- /dev/null
+++ b/tools/kvm/x86/ioport.c
@@ -0,0 +1,118 @@
+#include "kvm/ioport.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+static bool debug_io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	return 0;
+}
+
+static struct ioport_operations debug_ops = {
+	.io_out		= debug_io_out,
+};
+
+static bool seabios_debug_io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	char ch;
+
+	ch = ioport__read8(data);
+
+	putchar(ch);
+
+	return true;
+}
+
+static struct ioport_operations seabios_debug_ops = {
+	.io_out		= seabios_debug_io_out,
+};
+
+static bool dummy_io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	return true;
+}
+
+static bool dummy_io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	return true;
+}
+
+static struct ioport_operations dummy_read_write_ioport_ops = {
+	.io_in		= dummy_io_in,
+	.io_out		= dummy_io_out,
+};
+
+static struct ioport_operations dummy_write_only_ioport_ops = {
+	.io_out		= dummy_io_out,
+};
+
+/*
+ * The "fast A20 gate"
+ */
+
+static bool ps2_control_a_io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	/*
+	 * A20 is always enabled.
+	 */
+	ioport__write8(data, 0x02);
+
+	return true;
+}
+
+static struct ioport_operations ps2_control_a_ops = {
+	.io_in		= ps2_control_a_io_in,
+	.io_out		= dummy_io_out,
+};
+
+void ioport__map_irq(u8 *irq)
+{
+}
+
+void ioport__setup_arch(struct kvm *kvm)
+{
+	/* Legacy ioport setup */
+
+	/* 0000 - 001F - DMA1 controller */
+	ioport__register(kvm, 0x0000, &dummy_read_write_ioport_ops, 32, NULL);
+
+	/* 0x0020 - 0x003F - 8259A PIC 1 */
+	ioport__register(kvm, 0x0020, &dummy_read_write_ioport_ops, 2, NULL);
+
+	/* PORT 0040-005F - PIT - PROGRAMMABLE INTERVAL TIMER (8253, 8254) */
+	ioport__register(kvm, 0x0040, &dummy_read_write_ioport_ops, 4, NULL);
+
+	/* 0092 - PS/2 system control port A */
+	ioport__register(kvm, 0x0092, &ps2_control_a_ops, 1, NULL);
+
+	/* 0x00A0 - 0x00AF - 8259A PIC 2 */
+	ioport__register(kvm, 0x00A0, &dummy_read_write_ioport_ops, 2, NULL);
+
+	/* 00C0 - 00DF - DMA2 controller */
+	ioport__register(kvm, 0x00C0, &dummy_read_write_ioport_ops, 32, NULL);
+
+	/* PORT 00E0-00EF are 'motherboard specific' so we use them for our
+	   internal debugging purposes.  */
+	ioport__register(kvm, IOPORT_DBG, &debug_ops, 1, NULL);
+
+	/* PORT 00ED - DUMMY PORT FOR DELAY??? */
+	ioport__register(kvm, 0x00ED, &dummy_write_only_ioport_ops, 1, NULL);
+
+	/* 0x00F0 - 0x00FF - Math co-processor */
+	ioport__register(kvm, 0x00F0, &dummy_write_only_ioport_ops, 2, NULL);
+
+	/* PORT 0278-027A - PARALLEL PRINTER PORT (usually LPT1, sometimes LPT2) */
+	ioport__register(kvm, 0x0278, &dummy_read_write_ioport_ops, 3, NULL);
+
+	/* PORT 0378-037A - PARALLEL PRINTER PORT (usually LPT2, sometimes LPT3) */
+	ioport__register(kvm, 0x0378, &dummy_read_write_ioport_ops, 3, NULL);
+
+	/* PORT 03D4-03D5 - COLOR VIDEO - CRT CONTROL REGISTERS */
+	ioport__register(kvm, 0x03D4, &dummy_read_write_ioport_ops, 1, NULL);
+	ioport__register(kvm, 0x03D5, &dummy_write_only_ioport_ops, 1, NULL);
+
+	ioport__register(kvm, 0x402, &seabios_debug_ops, 1, NULL);
+
+	/* 0510 - QEMU BIOS configuration register */
+	ioport__register(kvm, 0x510, &dummy_read_write_ioport_ops, 2, NULL);
+}
diff --git a/tools/kvm/x86/irq.c b/tools/kvm/x86/irq.c
new file mode 100644
index 0000000..72177e7
--- /dev/null
+++ b/tools/kvm/x86/irq.c
@@ -0,0 +1,101 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/kvm.h>
+#include <sys/ioctl.h>
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#define IRQ_MAX_GSI			64
+#define IRQCHIP_MASTER			0
+#define IRQCHIP_SLAVE			1
+#define IRQCHIP_IOAPIC			2
+
+/* The first 24 GSIs are wired to the IRQCHIP (PIC) and IOAPIC pins */
+static u32 gsi = 24;
+
+struct kvm_irq_routing *irq_routing;
+
+static int irq__add_routing(u32 gsi, u32 type, u32 irqchip, u32 pin)
+{
+	if (gsi >= IRQ_MAX_GSI)
+		return -ENOSPC;
+
+	irq_routing->entries[irq_routing->nr++] =
+		(struct kvm_irq_routing_entry) {
+			.gsi = gsi,
+			.type = type,
+			.u.irqchip.irqchip = irqchip,
+			.u.irqchip.pin = pin,
+		};
+
+	return 0;
+}
+
+int irq__init(struct kvm *kvm)
+{
+	int i, r;
+
+	irq_routing = calloc(sizeof(struct kvm_irq_routing) +
+			IRQ_MAX_GSI * sizeof(struct kvm_irq_routing_entry), 1);
+	if (irq_routing == NULL)
+		return -ENOMEM;
+
+	/* Hook first 8 GSIs to master IRQCHIP */
+	for (i = 0; i < 8; i++)
+		if (i != 2)
+			irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_MASTER, i);
+
+	/* Hook next 8 GSIs to slave IRQCHIP */
+	for (i = 8; i < 16; i++)
+		irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_SLAVE, i - 8);
+
+	/* Last but not least, IOAPIC */
+	for (i = 0; i < 24; i++) {
+		if (i == 0)
+			irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, 2);
+		else if (i != 2)
+			irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, i);
+	}
+
+	r = ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing);
+	if (r) {
+		free(irq_routing);
+		return errno;
+	}
+
+	return 0;
+}
+dev_base_init(irq__init);
+
+int irq__exit(struct kvm *kvm)
+{
+	free(irq_routing);
+	return 0;
+}
+dev_base_exit(irq__exit);
+
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg)
+{
+	int r;
+
+	irq_routing->entries[irq_routing->nr++] =
+		(struct kvm_irq_routing_entry) {
+			.gsi = gsi,
+			.type = KVM_IRQ_ROUTING_MSI,
+			.u.msi.address_hi = msg->address_hi,
+			.u.msi.address_lo = msg->address_lo,
+			.u.msi.data = msg->data,
+		};
+
+	r = ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing);
+	if (r)
+		return r;
+
+	return gsi++;
+}
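Spelled out, the routing that irq__init() installs looks like the table below; this sketch recomputes the same (irqchip, pin) mapping and prints it:

	#include <stdio.h>

	/*
	 * GSI 0-7 go to the master PIC (pin == GSI, pin 2 skipped for the
	 * cascade), GSI 8-15 to the slave PIC (pin = GSI - 8), and GSI 0-23
	 * to the IOAPIC, with GSI 0 remapped to IOAPIC pin 2.
	 */
	int main(void)
	{
		int gsi;

		for (gsi = 0; gsi < 24; gsi++) {
			if (gsi < 8 && gsi != 2)
				printf("GSI %2d -> master PIC pin %d\n", gsi, gsi);
			else if (gsi >= 8 && gsi < 16)
				printf("GSI %2d -> slave PIC pin %d\n", gsi, gsi - 8);

			if (gsi == 0)
				printf("GSI %2d -> IOAPIC pin 2\n", gsi);
			else if (gsi != 2)
				printf("GSI %2d -> IOAPIC pin %d\n", gsi, gsi);
		}
		return 0;
	}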
diff --git a/tools/kvm/x86/kvm-cpu.c b/tools/kvm/x86/kvm-cpu.c
new file mode 100644
index 0000000..5cc4e1e
--- /dev/null
+++ b/tools/kvm/x86/kvm-cpu.c
@@ -0,0 +1,431 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/symbol.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <asm/msr-index.h>
+#include <asm/apicdef.h>
+#include <linux/err.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+	debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+	return debug_fd;
+}
+
+static inline bool is_in_protected_mode(struct kvm_cpu *vcpu)
+{
+	return vcpu->sregs.cr0 & 0x01;
+}
+
+static inline u64 ip_to_flat(struct kvm_cpu *vcpu, u64 ip)
+{
+	u64 cs;
+
+	/*
+	 * NOTE! We should take code segment base address into account here.
+	 * Luckily it's usually zero because Linux uses a flat memory model.
+	 */
+	if (is_in_protected_mode(vcpu))
+		return ip;
+
+	cs = vcpu->sregs.cs.selector;
+
+	return ip + (cs << 4);
+}
+
+static inline u32 selector_to_base(u16 selector)
+{
+	/*
+	 * KVM on Intel requires 'base' to be 'selector * 16' in real mode.
+	 */
+	return (u32)selector << 4;
+}
+
+static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm)
+{
+	struct kvm_cpu *vcpu;
+
+	vcpu = calloc(1, sizeof(*vcpu));
+	if (!vcpu)
+		return NULL;
+
+	vcpu->kvm = kvm;
+
+	return vcpu;
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+	if (vcpu->msrs)
+		free(vcpu->msrs);
+
+	free(vcpu);
+}
+
+static int kvm_cpu__set_lint(struct kvm_cpu *vcpu)
+{
+	struct local_apic lapic;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_LAPIC, &lapic))
+		return -1;
+
+	lapic.lvt_lint0.delivery_mode = APIC_MODE_EXTINT;
+	lapic.lvt_lint1.delivery_mode = APIC_MODE_NMI;
+
+	return ioctl(vcpu->vcpu_fd, KVM_SET_LAPIC, &lapic);
+}
+
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id)
+{
+	struct kvm_cpu *vcpu;
+	int mmap_size;
+	int coalesced_offset;
+
+	vcpu = kvm_cpu__new(kvm);
+	if (!vcpu)
+		return NULL;
+
+	vcpu->cpu_id = cpu_id;
+
+	vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+	if (vcpu->vcpu_fd < 0)
+		die_perror("KVM_CREATE_VCPU ioctl");
+
+	mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+	if (mmap_size < 0)
+		die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+	vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0);
+	if (vcpu->kvm_run == MAP_FAILED)
+		die("unable to mmap vcpu fd");
+
+	coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
+	if (coalesced_offset)
+		vcpu->ring = (void *)vcpu->kvm_run + (coalesced_offset * PAGE_SIZE);
+
+	if (kvm_cpu__set_lint(vcpu))
+		die_perror("KVM_SET_LAPIC failed");
+
+	vcpu->is_running = true;
+
+	return vcpu;
+}
+
+static struct kvm_msrs *kvm_msrs__new(size_t nmsrs)
+{
+	struct kvm_msrs *msrs = calloc(1, sizeof(*msrs) + (sizeof(struct kvm_msr_entry) * nmsrs));
+
+	if (!msrs)
+		die("out of memory");
+
+	return msrs;
+}
+
+#define KVM_MSR_ENTRY(_index, _data)	\
+	(struct kvm_msr_entry) { .index = _index, .data = _data }
+
+static void kvm_cpu__setup_msrs(struct kvm_cpu *vcpu)
+{
+	unsigned long ndx = 0;
+
+	vcpu->msrs = kvm_msrs__new(100);
+
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_CS,	0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_ESP,	0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_EIP,	0x0);
+#ifdef CONFIG_X86_64
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_STAR,			0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_CSTAR,			0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_KERNEL_GS_BASE,		0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_SYSCALL_MASK,		0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_LSTAR,			0x0);
+#endif
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_TSC,		0x0);
+	vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_MISC_ENABLE,
+						MSR_IA32_MISC_ENABLE_FAST_STRING);
+
+	vcpu->msrs->nmsrs = ndx;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_MSRS, vcpu->msrs) < 0)
+		die_perror("KVM_SET_MSRS failed");
+}
+
+static void kvm_cpu__setup_fpu(struct kvm_cpu *vcpu)
+{
+	vcpu->fpu = (struct kvm_fpu) {
+		.fcw	= 0x37f,
+		.mxcsr	= 0x1f80,
+	};
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_FPU, &vcpu->fpu) < 0)
+		die_perror("KVM_SET_FPU failed");
+}
+
+static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu)
+{
+	vcpu->regs = (struct kvm_regs) {
+		/* We start the guest in 16-bit real mode  */
+		.rflags	= 0x0000000000000002ULL,
+
+		.rip	= vcpu->kvm->arch.boot_ip,
+		.rsp	= vcpu->kvm->arch.boot_sp,
+		.rbp	= vcpu->kvm->arch.boot_sp,
+	};
+
+	if (vcpu->regs.rip > USHRT_MAX)
+		die("ip 0x%llx is too high for real mode", (u64)vcpu->regs.rip);
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0)
+		die_perror("KVM_SET_REGS failed");
+}
+
+static void kvm_cpu__setup_sregs(struct kvm_cpu *vcpu)
+{
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+		die_perror("KVM_GET_SREGS failed");
+
+	vcpu->sregs.cs.selector	= vcpu->kvm->arch.boot_selector;
+	vcpu->sregs.cs.base	= selector_to_base(vcpu->kvm->arch.boot_selector);
+	vcpu->sregs.ss.selector	= vcpu->kvm->arch.boot_selector;
+	vcpu->sregs.ss.base	= selector_to_base(vcpu->kvm->arch.boot_selector);
+	vcpu->sregs.ds.selector	= vcpu->kvm->arch.boot_selector;
+	vcpu->sregs.ds.base	= selector_to_base(vcpu->kvm->arch.boot_selector);
+	vcpu->sregs.es.selector	= vcpu->kvm->arch.boot_selector;
+	vcpu->sregs.es.base	= selector_to_base(vcpu->kvm->arch.boot_selector);
+	vcpu->sregs.fs.selector	= vcpu->kvm->arch.boot_selector;
+	vcpu->sregs.fs.base	= selector_to_base(vcpu->kvm->arch.boot_selector);
+	vcpu->sregs.gs.selector	= vcpu->kvm->arch.boot_selector;
+	vcpu->sregs.gs.base	= selector_to_base(vcpu->kvm->arch.boot_selector);
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &vcpu->sregs) < 0)
+		die_perror("KVM_SET_SREGS failed");
+}
+
+/**
+ * kvm_cpu__reset_vcpu - reset virtual CPU to a known state
+ */
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+	kvm_cpu__setup_cpuid(vcpu);
+	kvm_cpu__setup_sregs(vcpu);
+	kvm_cpu__setup_regs(vcpu);
+	kvm_cpu__setup_fpu(vcpu);
+	kvm_cpu__setup_msrs(vcpu);
+}
+
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+	return false;
+}
+
+static void print_dtable(const char *name, struct kvm_dtable *dtable)
+{
+	dprintf(debug_fd, " %s                 %016llx  %08hx\n",
+		name, (u64) dtable->base, (u16) dtable->limit);
+}
+
+static void print_segment(const char *name, struct kvm_segment *seg)
+{
+	dprintf(debug_fd, " %s       %04hx      %016llx  %08x  %02hhx    %x %x   %x  %x %x %x %x\n",
+		name, (u16) seg->selector, (u64) seg->base, (u32) seg->limit,
+		(u8) seg->type, seg->present, seg->dpl, seg->db, seg->s, seg->l, seg->g, seg->avl);
+}
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+	unsigned long cr0, cr2, cr3;
+	unsigned long cr4, cr8;
+	unsigned long rax, rbx, rcx;
+	unsigned long rdx, rsi, rdi;
+	unsigned long rbp,  r8,  r9;
+	unsigned long r10, r11, r12;
+	unsigned long r13, r14, r15;
+	unsigned long rip, rsp;
+	struct kvm_sregs sregs;
+	unsigned long rflags;
+	struct kvm_regs regs;
+	int i;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &regs) < 0)
+		die("KVM_GET_REGS failed");
+
+	rflags = regs.rflags;
+
+	rip = regs.rip; rsp = regs.rsp;
+	rax = regs.rax; rbx = regs.rbx; rcx = regs.rcx;
+	rdx = regs.rdx; rsi = regs.rsi; rdi = regs.rdi;
+	rbp = regs.rbp; r8  = regs.r8;  r9  = regs.r9;
+	r10 = regs.r10; r11 = regs.r11; r12 = regs.r12;
+	r13 = regs.r13; r14 = regs.r14; r15 = regs.r15;
+
+	dprintf(debug_fd, "\n Registers:\n");
+	dprintf(debug_fd,   " ----------\n");
+	dprintf(debug_fd, " rip: %016lx   rsp: %016lx flags: %016lx\n", rip, rsp, rflags);
+	dprintf(debug_fd, " rax: %016lx   rbx: %016lx   rcx: %016lx\n", rax, rbx, rcx);
+	dprintf(debug_fd, " rdx: %016lx   rsi: %016lx   rdi: %016lx\n", rdx, rsi, rdi);
+	dprintf(debug_fd, " rbp: %016lx    r8: %016lx    r9: %016lx\n", rbp, r8,  r9);
+	dprintf(debug_fd, " r10: %016lx   r11: %016lx   r12: %016lx\n", r10, r11, r12);
+	dprintf(debug_fd, " r13: %016lx   r14: %016lx   r15: %016lx\n", r13, r14, r15);
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
+		die("KVM_GET_REGS failed");
+
+	cr0 = sregs.cr0; cr2 = sregs.cr2; cr3 = sregs.cr3;
+	cr4 = sregs.cr4; cr8 = sregs.cr8;
+
+	dprintf(debug_fd, " cr0: %016lx   cr2: %016lx   cr3: %016lx\n", cr0, cr2, cr3);
+	dprintf(debug_fd, " cr4: %016lx   cr8: %016lx\n", cr4, cr8);
+	dprintf(debug_fd, "\n Segment registers:\n");
+	dprintf(debug_fd,   " ------------------\n");
+	dprintf(debug_fd, " register  selector  base              limit     type  p dpl db s l g avl\n");
+	print_segment("cs ", &sregs.cs);
+	print_segment("ss ", &sregs.ss);
+	print_segment("ds ", &sregs.ds);
+	print_segment("es ", &sregs.es);
+	print_segment("fs ", &sregs.fs);
+	print_segment("gs ", &sregs.gs);
+	print_segment("tr ", &sregs.tr);
+	print_segment("ldt", &sregs.ldt);
+	print_dtable("gdt", &sregs.gdt);
+	print_dtable("idt", &sregs.idt);
+
+	dprintf(debug_fd, "\n APIC:\n");
+	dprintf(debug_fd,   " -----\n");
+	dprintf(debug_fd, " efer: %016llx  apic base: %016llx  nmi: %s\n",
+		(u64) sregs.efer, (u64) sregs.apic_base,
+		(vcpu->kvm->nmi_disabled ? "disabled" : "enabled"));
+
+	dprintf(debug_fd, "\n Interrupt bitmap:\n");
+	dprintf(debug_fd,   " -----------------\n");
+	for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++)
+		dprintf(debug_fd, " %016llx", (u64) sregs.interrupt_bitmap[i]);
+	dprintf(debug_fd, "\n");
+}
+
+#define MAX_SYM_LEN 128
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+	unsigned int code_bytes = 64;
+	unsigned int code_prologue = 43;
+	unsigned int code_len = code_bytes;
+	char sym[MAX_SYM_LEN] = SYMBOL_DEFAULT_UNKNOWN, *psym;
+	unsigned char c;
+	unsigned int i;
+	u8 *ip;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &vcpu->regs) < 0)
+		die("KVM_GET_REGS failed");
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+		die("KVM_GET_SREGS failed");
+
+	ip = guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip) - code_prologue);
+
+	dprintf(debug_fd, "\n Code:\n");
+	dprintf(debug_fd,   " -----\n");
+
+	psym = symbol_lookup(vcpu->kvm, vcpu->regs.rip, sym, MAX_SYM_LEN);
+	if (IS_ERR(psym))
+		dprintf(debug_fd,
+			"Warning: symbol_lookup() failed to find symbol "
+			"with error: %ld\n", PTR_ERR(psym));
+
+	dprintf(debug_fd, " rip: [<%016lx>] %s\n\n", (unsigned long) vcpu->regs.rip, sym);
+
+	for (i = 0; i < code_len; i++, ip++) {
+		if (!host_ptr_in_ram(vcpu->kvm, ip))
+			break;
+
+		c = *ip;
+
+		if (ip == guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip)))
+			dprintf(debug_fd, " <%02x>", c);
+		else
+			dprintf(debug_fd, " %02x", c);
+	}
+
+	dprintf(debug_fd, "\n");
+
+	dprintf(debug_fd, "\n Stack:\n");
+	dprintf(debug_fd,   " ------\n");
+	dprintf(debug_fd, " rsp: [<%016lx>] \n", (unsigned long) vcpu->regs.rsp);
+	kvm__dump_mem(vcpu->kvm, vcpu->regs.rsp, 32, debug_fd);
+}
+
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+	u64 *pte1;
+	u64 *pte2;
+	u64 *pte3;
+	u64 *pte4;
+
+	if (!is_in_protected_mode(vcpu)) {
+		dprintf(debug_fd, "\n Page Tables:\n");
+		dprintf(debug_fd, " ------\n");
+		dprintf(debug_fd, " Not in protected mode\n");
+		return;
+	}
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+		die("KVM_GET_SREGS failed");
+
+	pte4 = guest_flat_to_host(vcpu->kvm, vcpu->sregs.cr3);
+	if (!host_ptr_in_ram(vcpu->kvm, pte4))
+		return;
+
+	pte3 = guest_flat_to_host(vcpu->kvm, (*pte4 & ~0xfff));
+	if (!host_ptr_in_ram(vcpu->kvm, pte3))
+		return;
+
+	pte2 = guest_flat_to_host(vcpu->kvm, (*pte3 & ~0xfff));
+	if (!host_ptr_in_ram(vcpu->kvm, pte2))
+		return;
+
+	pte1 = guest_flat_to_host(vcpu->kvm, (*pte2 & ~0xfff));
+	if (!host_ptr_in_ram(vcpu->kvm, pte1))
+		return;
+
+	dprintf(debug_fd, "\n Page Tables:\n");
+	dprintf(debug_fd, " ------\n");
+	if (*pte2 & (1 << 7))
+		dprintf(debug_fd, " pte4: %016llx   pte3: %016llx"
+			"   pte2: %016llx\n",
+			*pte4, *pte3, *pte2);
+	else
+		dprintf(debug_fd, " pte4: %016llx  pte3: %016llx   pte2: %016"
+			"llx   pte1: %016llx\n",
+			*pte4, *pte3, *pte2, *pte1);
+}
+
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+	struct kvm_lapic_state klapic;
+	struct local_apic *lapic = (void *)&klapic;
+
+	if (ioctl(cpu->vcpu_fd, KVM_GET_LAPIC, &klapic) != 0)
+		return;
+
+	if (lapic->lvt_lint1.mask)
+		return;
+
+	if (lapic->lvt_lint1.delivery_mode != APIC_MODE_NMI)
+		return;
+
+	ioctl(cpu->vcpu_fd, KVM_NMI);
+}
diff --git a/tools/kvm/x86/kvm.c b/tools/kvm/x86/kvm.c
new file mode 100644
index 0000000..d285d1d
--- /dev/null
+++ b/tools/kvm/x86/kvm.c
@@ -0,0 +1,381 @@
+#include "kvm/kvm.h"
+#include "kvm/boot-protocol.h"
+#include "kvm/cpufeature.h"
+#include "kvm/interrupt.h"
+#include "kvm/mptable.h"
+#include "kvm/util.h"
+#include "kvm/8250-serial.h"
+#include "kvm/virtio-console.h"
+
+#include <asm/bootparam.h>
+#include <linux/kvm.h>
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+
+struct kvm_ext kvm_req_ext[] = {
+	{ DEFINE_KVM_EXT(KVM_CAP_COALESCED_MMIO) },
+	{ DEFINE_KVM_EXT(KVM_CAP_SET_TSS_ADDR) },
+	{ DEFINE_KVM_EXT(KVM_CAP_PIT2) },
+	{ DEFINE_KVM_EXT(KVM_CAP_USER_MEMORY) },
+	{ DEFINE_KVM_EXT(KVM_CAP_IRQ_ROUTING) },
+	{ DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) },
+	{ DEFINE_KVM_EXT(KVM_CAP_HLT) },
+	{ DEFINE_KVM_EXT(KVM_CAP_IRQ_INJECT_STATUS) },
+	{ DEFINE_KVM_EXT(KVM_CAP_EXT_CPUID) },
+	{ 0, 0 }
+};
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+	struct cpuid_regs regs;
+	u32 eax_base;
+	int feature;
+
+	regs	= (struct cpuid_regs) {
+		.eax		= 0x00,
+	};
+	host_cpuid(&regs);
+
+	switch (regs.ebx) {
+	case CPUID_VENDOR_INTEL_1:
+		eax_base	= 0x00;
+		feature		= KVM__X86_FEATURE_VMX;
+		break;
+
+	case CPUID_VENDOR_AMD_1:
+		eax_base	= 0x80000000;
+		feature		= KVM__X86_FEATURE_SVM;
+		break;
+
+	default:
+		return false;
+	}
+
+	regs	= (struct cpuid_regs) {
+		.eax		= eax_base,
+	};
+	host_cpuid(&regs);
+
+	if (regs.eax < eax_base + 0x01)
+		return false;
+
+	regs	= (struct cpuid_regs) {
+		.eax		= eax_base + 0x01
+	};
+	host_cpuid(&regs);
+
+	return regs.ecx & (1 << feature);
+}
+
+/*
+ * Allocating RAM size bigger than 4GB requires us to leave a gap
+ * in the RAM which is used for PCI MMIO, hotplug, and unconfigured
+ * devices (see documentation of e820_setup_gap() for details).
+ *
+ * If we're required to initialize RAM bigger than 4GB, we will create
+ * a gap between 0xd0000000 and 0x100000000 in the guest physical mem space
+ * (KVM_32BIT_GAP_START is 4GB minus the 768MB gap size).
+ */
+
+void kvm__init_ram(struct kvm *kvm)
+{
+	u64	phys_start, phys_size;
+	void	*host_mem;
+
+	if (kvm->ram_size < KVM_32BIT_GAP_START) {
+		/* Use a single block of RAM for 32bit RAM */
+
+		phys_start = 0;
+		phys_size  = kvm->ram_size;
+		host_mem   = kvm->ram_start;
+
+		kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+	} else {
+		/* First RAM range from zero to the PCI gap: */
+
+		phys_start = 0;
+		phys_size  = KVM_32BIT_GAP_START;
+		host_mem   = kvm->ram_start;
+
+		kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+
+		/* Second RAM range from 4GB to the end of RAM: */
+
+		phys_start = KVM_32BIT_MAX_MEM_SIZE;
+		phys_size  = kvm->ram_size - phys_start;
+		host_mem   = kvm->ram_start + phys_start;
+
+		kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+	}
+}
+
+/* Arch-specific commandline setup */
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+	strcpy(cmdline, "noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 "
+				"i8042.dumbkbd=1 i8042.nopnp=1");
+	if (video)
+		strcat(cmdline, " video=vesafb console=tty0");
+	else
+		strcat(cmdline, " console=ttyS0 earlyprintk=serial i8042.noaux=1");
+}
+
+/* Architecture-specific KVM init */
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+	struct kvm_pit_config pit_config = { .flags = 0, };
+	int ret;
+
+	ret = ioctl(kvm->vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000);
+	if (ret < 0)
+		die_perror("KVM_SET_TSS_ADDR ioctl");
+
+	ret = ioctl(kvm->vm_fd, KVM_CREATE_PIT2, &pit_config);
+	if (ret < 0)
+		die_perror("KVM_CREATE_PIT2 ioctl");
+
+	if (ram_size < KVM_32BIT_GAP_START) {
+		kvm->ram_size = ram_size;
+		kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size);
+	} else {
+		kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size + KVM_32BIT_GAP_SIZE);
+		kvm->ram_size = ram_size + KVM_32BIT_GAP_SIZE;
+		if (kvm->ram_start != MAP_FAILED)
+			/*
+			 * We mprotect the gap (see kvm__init_ram() for details) PROT_NONE so that
+			 * if we accidentally write to it, we will know.
+			 */
+			mprotect(kvm->ram_start + KVM_32BIT_GAP_START, KVM_32BIT_GAP_SIZE, PROT_NONE);
+	}
+	if (kvm->ram_start == MAP_FAILED)
+		die("out of memory");
+
+	madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);
+
+	ret = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
+	if (ret < 0)
+		die_perror("KVM_CREATE_IRQCHIP ioctl");
+}
+
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+	munmap(kvm->ram_start, kvm->ram_size);
+}
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+	struct kvm_irq_level irq_level;
+
+	irq_level	= (struct kvm_irq_level) {
+		{
+			.irq		= irq,
+		},
+		.level		= level,
+	};
+
+	if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0)
+		die_perror("KVM_IRQ_LINE failed");
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+	kvm__irq_line(kvm, irq, 1);
+	kvm__irq_line(kvm, irq, 0);
+}
+
+#define BOOT_LOADER_SELECTOR	0x1000
+#define BOOT_LOADER_IP		0x0000
+#define BOOT_LOADER_SP		0x8000
+#define BOOT_CMDLINE_OFFSET	0x20000
+
+#define BOOT_PROTOCOL_REQUIRED	0x206
+#define LOAD_HIGH		0x01
+
+static inline void *guest_real_to_host(struct kvm *kvm, u16 selector, u16 offset)
+{
+	unsigned long flat = segment_to_flat(selector, offset);
+
+	return guest_flat_to_host(kvm, flat);
+}
+
+int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline)
+{
+	void *p;
+	int nr;
+
+	/*
+	 * Some architectures may support loading an initrd alongside the flat kernel,
+	 * but we do not.
+	 */
+	if (fd_initrd != -1)
+		pr_warning("Loading initrd with flat binary not supported.");
+
+	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);
+
+	while ((nr = read(fd_kernel, p, 65536)) > 0)
+		p += nr;
+
+	kvm->arch.boot_selector	= BOOT_LOADER_SELECTOR;
+	kvm->arch.boot_ip	= BOOT_LOADER_IP;
+	kvm->arch.boot_sp	= BOOT_LOADER_SP;
+
+	return true;
+}
+
+static const char *BZIMAGE_MAGIC = "HdrS";
+
+bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd,
+		  const char *kernel_cmdline)
+{
+	struct boot_params *kern_boot;
+	unsigned long setup_sects;
+	struct boot_params boot;
+	size_t cmdline_size;
+	ssize_t setup_size;
+	void *p;
+	int nr;
+	u16 vidmode;
+
+	/*
+	 * See Documentation/x86/boot.txt for details on the bzImage on-disk and
+	 * memory layout.
+	 */
+
+	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	if (read(fd_kernel, &boot, sizeof(boot)) != sizeof(boot))
+		return false;
+
+	if (memcmp(&boot.hdr.header, BZIMAGE_MAGIC, strlen(BZIMAGE_MAGIC)))
+		return false;
+
+	if (boot.hdr.version < BOOT_PROTOCOL_REQUIRED)
+		die("Too old kernel");
+
+	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	if (!boot.hdr.setup_sects)
+		boot.hdr.setup_sects = BZ_DEFAULT_SETUP_SECTS;
+	setup_sects = boot.hdr.setup_sects + 1;
+
+	setup_size = setup_sects << 9;
+	p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);
+
+	/* Copy setup.bin into memory */
+	if (read(fd_kernel, p, setup_size) != setup_size)
+		die_perror("read");
+
+	/* Copy vmlinux.bin to BZ_KERNEL_START */
+	p = guest_flat_to_host(kvm, BZ_KERNEL_START);
+
+	while ((nr = read(fd_kernel, p, 65536)) > 0)
+		p += nr;
+
+	p = guest_flat_to_host(kvm, BOOT_CMDLINE_OFFSET);
+	if (kernel_cmdline) {
+		cmdline_size = strlen(kernel_cmdline) + 1;
+		if (cmdline_size > boot.hdr.cmdline_size)
+			cmdline_size = boot.hdr.cmdline_size;
+
+		memset(p, 0, boot.hdr.cmdline_size);
+		memcpy(p, kernel_cmdline, cmdline_size - 1);
+	}
+
+	/* vidmode should be either specified or set by default */
+	if (kvm->cfg.vnc || kvm->cfg.sdl || kvm->cfg.gtk) {
+		if (!kvm->cfg.arch.vidmode)
+			vidmode = 0x312;
+		else
+			vidmode = kvm->cfg.arch.vidmode;
+	} else {
+		vidmode = 0;
+	}
+
+	kern_boot	= guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, 0x00);
+
+	kern_boot->hdr.cmd_line_ptr	= BOOT_CMDLINE_OFFSET;
+	kern_boot->hdr.type_of_loader	= 0xff;
+	kern_boot->hdr.heap_end_ptr	= 0xfe00;
+	kern_boot->hdr.loadflags	|= CAN_USE_HEAP;
+	kern_boot->hdr.vid_mode		= vidmode;
+
+	/*
+	 * Read initrd image into guest memory
+	 */
+	if (fd_initrd >= 0) {
+		struct stat initrd_stat;
+		unsigned long addr;
+
+		if (fstat(fd_initrd, &initrd_stat))
+			die_perror("fstat");
+
+		addr = boot.hdr.initrd_addr_max & ~0xfffff;
+		for (;;) {
+			if (addr < BZ_KERNEL_START)
+				die("Not enough memory for initrd");
+			else if (addr < (kvm->ram_size - initrd_stat.st_size))
+				break;
+			addr -= 0x100000;
+		}
+
+		p = guest_flat_to_host(kvm, addr);
+		nr = read(fd_initrd, p, initrd_stat.st_size);
+		if (nr != initrd_stat.st_size)
+			die("Failed to read initrd");
+
+		kern_boot->hdr.ramdisk_image	= addr;
+		kern_boot->hdr.ramdisk_size	= initrd_stat.st_size;
+	}
+
+	kvm->arch.boot_selector = BOOT_LOADER_SELECTOR;
+	/*
+	 * The real-mode setup code starts at offset 0x200 of a bzImage. See
+	 * Documentation/x86/boot.txt for details.
+	 */
+	kvm->arch.boot_ip = BOOT_LOADER_IP + 0x200;
+	kvm->arch.boot_sp = BOOT_LOADER_SP;
+
+	return true;
+}
+
+/**
+ * kvm__arch_setup_firmware - inject BIOS into guest system memory
+ * @kvm - guest system descriptor
+ *
+ * This is the main routine where we poke guest memory
+ * and install the BIOS there.
+ */
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+	/* standard minimal configuration */
+	setup_bios(kvm);
+
+	/* FIXME: SMP, ACPI and friends here */
+
+	return 0;
+}
+
+int kvm__arch_free_firmware(struct kvm *kvm)
+{
+	return 0;
+}
+
+void kvm__arch_read_term(struct kvm *kvm)
+{
+	serial8250__update_consoles(kvm);
+	virtio_console__inject_interrupt(kvm);
+}
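The setup_sects arithmetic in load_bzimage() is worth spelling out: the header counts setup sectors excluding the boot sector, zero means "use the default of 4", and sectors are 512 bytes, which also explains why the real-mode entry point sits at offset 0x200. A sketch:

	#include <stdio.h>

	#define BZ_DEFAULT_SETUP_SECTS	4

	int main(void)
	{
		unsigned char setup_sects = 0;	/* as read from the bzImage header */
		unsigned long setup_size;

		/* Zero means "use the default"; the boot sector itself adds one */
		if (!setup_sects)
			setup_sects = BZ_DEFAULT_SETUP_SECTS;
		setup_size = (setup_sects + 1) << 9;	/* 512-byte sectors */

		printf("real-mode setup code: %lu bytes, entry at offset 0x200\n",
		       setup_size);
		return 0;
	}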
diff --git a/tools/kvm/x86/mptable.c b/tools/kvm/x86/mptable.c
new file mode 100644
index 0000000..a984de9
--- /dev/null
+++ b/tools/kvm/x86/mptable.c
@@ -0,0 +1,280 @@
+#include "kvm/kvm.h"
+#include "kvm/bios.h"
+#include "kvm/apic.h"
+#include "kvm/mptable.h"
+#include "kvm/util.h"
+#include "kvm/devices.h"
+#include "kvm/pci.h"
+
+#include <linux/kernel.h>
+#include <string.h>
+
+#include <asm/mpspec_def.h>
+#include <linux/types.h>
+
+/*
+ * FIXME: please make sure the addresses borrowed
+ * for apic/ioapic never overlap! We need a global
+ * tracker of system resources (including io, mmio,
+ * and friends).
+ */
+
+static unsigned int mpf_checksum(unsigned char *mp, int len)
+{
+	unsigned int sum = 0;
+
+	while (len--)
+		sum += *mp++;
+
+	return sum & 0xFF;
+}
+
+static unsigned int gen_cpu_flag(unsigned int cpu, unsigned int ncpu)
+{
+	/* sets enabled/disabled | BSP/AP processor */
+	return ( (cpu < ncpu) ? CPU_ENABLED       : 0) |
+		((cpu == 0)   ? CPU_BOOTPROCESSOR : 0x00);
+}
+
+#define MPTABLE_SIG_FLOATING	"_MP_"
+#define MPTABLE_OEM		"KVMCPU00"
+#define MPTABLE_PRODUCTID	"0.1         "
+#define MPTABLE_PCIBUSTYPE	"PCI   "
+#define MPTABLE_ISABUSTYPE	"ISA   "
+
+#define MPTABLE_STRNCPY(d, s)	memcpy(d, s, sizeof(d))
+
+/* It should be more than enough */
+#define MPTABLE_MAX_SIZE	(32 << 20)
+
+/*
+ * Too many cpus would require x2apic mode
+ * and proper ACPI support, so we limit it
+ * here for now.
+ */
+#define MPTABLE_MAX_CPUS	255
+
+static void mptable_add_irq_src(struct mpc_intsrc *mpc_intsrc,
+				u16 srcbusid,	u16 srcbusirq,
+				u16 dstapic,	u16 dstirq)
+{
+	*mpc_intsrc = (struct mpc_intsrc) {
+		.type		= MP_INTSRC,
+		.irqtype	= mp_INT,
+		.irqflag	= MP_IRQDIR_DEFAULT,
+		.srcbus		= srcbusid,
+		.srcbusirq	= srcbusirq,
+		.dstapic	= dstapic,
+		.dstirq		= dstirq
+	};
+}
+
+/**
+ * mptable__init - create the MP table and fill guest memory with it
+ */
+int mptable__init(struct kvm *kvm)
+{
+	unsigned long real_mpc_table, real_mpf_intel, size;
+	struct mpf_intel *mpf_intel;
+	struct mpc_table *mpc_table;
+	struct mpc_cpu *mpc_cpu;
+	struct mpc_bus *mpc_bus;
+	struct mpc_ioapic *mpc_ioapic;
+	struct mpc_intsrc *mpc_intsrc;
+	struct device_header *dev_hdr;
+
+	const int pcibusid = 0;
+	const int isabusid = 1;
+
+	unsigned int i, nentries = 0, ncpus = kvm->nrcpus;
+	unsigned int ioapicid;
+	void *last_addr;
+
+	/* That is where the MP table will be in guest memory */
+	real_mpc_table = ALIGN(MB_BIOS_BEGIN + bios_rom_size, 16);
+
+	if (ncpus > MPTABLE_MAX_CPUS) {
+		pr_warning("Too many cpus: %d limited to %d",
+			ncpus, MPTABLE_MAX_CPUS);
+		ncpus = MPTABLE_MAX_CPUS;
+	}
+
+	mpc_table = calloc(1, MPTABLE_MAX_SIZE);
+	if (!mpc_table)
+		return -ENOMEM;
+
+	MPTABLE_STRNCPY(mpc_table->signature,	MPC_SIGNATURE);
+	MPTABLE_STRNCPY(mpc_table->oem,		MPTABLE_OEM);
+	MPTABLE_STRNCPY(mpc_table->productid,	MPTABLE_PRODUCTID);
+
+	mpc_table->spec		= 4;
+	mpc_table->lapic	= APIC_ADDR(0);
+	mpc_table->oemcount	= ncpus; /* will be updated again at end */
+
+	/*
+	 * CPU enumeration. Technically speaking we should
+	 * ask either the host or the hypervisor for the supported
+	 * APIC version, but for now we simply put a fixed
+	 * value here.
+	 */
+	mpc_cpu = (void *)&mpc_table[1];
+	for (i = 0; i < ncpus; i++) {
+		mpc_cpu->type		= MP_PROCESSOR;
+		mpc_cpu->apicid		= i;
+		mpc_cpu->apicver	= KVM_APIC_VERSION;
+		mpc_cpu->cpuflag	= gen_cpu_flag(i, ncpus);
+		mpc_cpu->cpufeature	= 0x600; /* some default value */
+		mpc_cpu->featureflag	= 0x201; /* some default value */
+		mpc_cpu++;
+	}
+
+	last_addr = (void *)mpc_cpu;
+	nentries += ncpus;
+
+	/*
+	 * PCI buses.
+	 * FIXME: Some callback here to obtain real number
+	 * of PCI buses present in system.
+	 */
+	mpc_bus		= last_addr;
+	mpc_bus->type	= MP_BUS;
+	mpc_bus->busid	= pcibusid;
+	MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_PCIBUSTYPE);
+
+	last_addr = (void *)&mpc_bus[1];
+	nentries++;
+
+	/*
+	 * ISA bus.
+	 * FIXME: Same issue as for PCI bus.
+	 */
+	mpc_bus		= last_addr;
+	mpc_bus->type	= MP_BUS;
+	mpc_bus->busid	= isabusid;
+	MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_ISABUSTYPE);
+
+	last_addr = (void *)&mpc_bus[1];
+	nentries++;
+
+	/*
+	 * IO-APIC chip.
+	 */
+	ioapicid		= ncpus + 1;
+	mpc_ioapic		= last_addr;
+	mpc_ioapic->type	= MP_IOAPIC;
+	mpc_ioapic->apicid	= ioapicid;
+	mpc_ioapic->apicver	= KVM_APIC_VERSION;
+	mpc_ioapic->flags	= MPC_APIC_USABLE;
+	mpc_ioapic->apicaddr	= IOAPIC_ADDR(0);
+
+	last_addr = (void *)&mpc_ioapic[1];
+	nentries++;
+
+	/*
+	 * IRQ sources.
+	 * Also note we only wire up PCI IRQs here, not the ISA bus yet.
+	 */
+
+	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
+	while (dev_hdr) {
+		unsigned char srcbusirq;
+		struct pci_device_header *pci_hdr = dev_hdr->data;
+
+		srcbusirq = (pci_hdr->subsys_id << 2) | (pci_hdr->irq_pin - 1);
+		mpc_intsrc = last_addr;
+		mptable_add_irq_src(mpc_intsrc, pcibusid, srcbusirq, ioapicid, pci_hdr->irq_line);
+
+		last_addr = (void *)&mpc_intsrc[dev_hdr->dev_num];
+		nentries++;
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	/*
+	 * Local IRQs assignment (LINT0, LINT1)
+	 */
+	mpc_intsrc		= last_addr;
+	mpc_intsrc->type	= MP_LINTSRC;
+	/* LINT0 is conventionally routed as ExtINT */
+	mpc_intsrc->irqtype	= mp_ExtINT;
+	mpc_intsrc->irqflag	= MP_IRQDIR_DEFAULT;
+	mpc_intsrc->srcbus	= isabusid;
+	mpc_intsrc->srcbusirq	= 0;
+	mpc_intsrc->dstapic	= 0; /* FIXME: BSP apic */
+	mpc_intsrc->dstirq	= 0; /* LINT0 */
+
+	last_addr = (void *)&mpc_intsrc[1];
+	nentries++;
+
+	mpc_intsrc		= last_addr;
+	mpc_intsrc->type	= MP_LINTSRC;
+	mpc_intsrc->irqtype	= mp_NMI;
+	mpc_intsrc->irqflag	= MP_IRQDIR_DEFAULT;
+	mpc_intsrc->srcbus	= isabusid;
+	mpc_intsrc->srcbusirq	= 0;
+	mpc_intsrc->dstapic	= 0; /* FIXME: BSP apic */
+	mpc_intsrc->dstirq	= 1; /* LINT1 */
+
+	last_addr = (void *)&mpc_intsrc[1];
+	nentries++;
+
+	/*
+	 * Floating MP table finally.
+	 */
+	real_mpf_intel	= ALIGN((unsigned long)last_addr - (unsigned long)mpc_table, 16);
+	mpf_intel	= (void *)((unsigned long)mpc_table + real_mpf_intel);
+
+	MPTABLE_STRNCPY(mpf_intel->signature, MPTABLE_SIG_FLOATING);
+	mpf_intel->length	= 1;
+	mpf_intel->specification = 4;
+	mpf_intel->physptr	= (unsigned int)real_mpc_table;
+	mpf_intel->checksum	= -mpf_checksum((unsigned char *)mpf_intel, sizeof(*mpf_intel));
+
+	/*
+	 * No last_addr increment here, please; we need the
+	 * last active position to compute the table size.
+	 */
+
+	/*
+	 * Don't forget to update the header in the fixed table.
+	 */
+	mpc_table->oemcount	= nentries;
+	mpc_table->length	= last_addr - (void *)mpc_table;
+	mpc_table->checksum	= -mpf_checksum((unsigned char *)mpc_table, mpc_table->length);
+
+	/*
+	 * We will copy the whole table, no need to separate
+	 * floating structure and table itself.
+	 */
+	size = (unsigned long)mpf_intel + sizeof(*mpf_intel) - (unsigned long)mpc_table;
+
+	/*
+	 * The final check -- never run past the system BIOS
+	 * area. Let's also check for allocated memory overrun;
+	 * in reality it's too late by then, but still useful.
+	 */
+
+	if (size > (unsigned long)(MB_BIOS_END - bios_rom_size) ||
+	    size > MPTABLE_MAX_SIZE) {
+		free(mpc_table);
+		pr_err("MP table is too big");
+
+		return -E2BIG;
+	}
+
+	/*
+	 * OK, it is time to move it to guest memory.
+	 */
+	memcpy(guest_flat_to_host(kvm, real_mpc_table), mpc_table, size);
+
+	free(mpc_table);
+
+	return 0;
+}
+firmware_init(mptable__init);
+
+int mptable__exit(struct kvm *kvm)
+{
+	return 0;
+}
+firmware_exit(mptable__exit);
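A note on the checksum convention used above: mpf_checksum() returns the low byte of a plain byte-wise sum, and both the floating structure and the fixed table store its negation, so a consumer that sums every byte of a structure (checksum included) must end up with 0 modulo 256. A minimal standalone sketch of that invariant, using a hypothetical 16-byte buffer instead of the real structures:

	#include <stdio.h>
	#include <string.h>

	/* Same byte-wise sum as mpf_checksum() above. */
	static unsigned int checksum(unsigned char *p, int len)
	{
		unsigned int sum = 0;

		while (len--)
			sum += *p++;

		return sum & 0xFF;
	}

	int main(void)
	{
		unsigned char buf[16];

		memset(buf, 0xAB, sizeof(buf));
		buf[15] = 0;				/* checksum slot */
		buf[15] = -checksum(buf, sizeof(buf));	/* as done above */

		/* A conforming consumer expects 0 here. */
		printf("sum = %u\n", checksum(buf, sizeof(buf)));
		return 0;
	}
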
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index d2b59af..d561e02 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -147,7 +147,7 @@
 -w::
 --column-widths=<width[,width...]>::
 	Force each column width to the provided list, for large terminal
-	readability.
+	readability.  0 means no limit (default behavior).
 
 -t::
 --field-separator=::
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 180ae02..28fdee3 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -193,6 +193,12 @@
 	sum of shown entries will be always 100%. "absolute" means it retains
 	the original value before and after the filter is applied.
 
+-w::
+--column-widths=<width[,width...]>::
+	Force each column width to the provided list, for large terminal
+	readability.  0 means no limit (default behavior).
+
 INTERACTIVE PROMPTING KEYS
 --------------------------
 
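Taken together with the perf-report change above, the option accepts a comma-separated list applied to the output columns left to right, with 0 leaving a column self-sized. Going by the documented semantics, pinning the first column to 30 characters while leaving the rest alone would look like:

	perf report -w 30,0
	perf top --column-widths=30,0
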
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 2240974..95e832b 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -263,6 +263,7 @@
 LIB_H += util/header.h
 LIB_H += util/help.h
 LIB_H += util/session.h
+LIB_H += util/ordered-events.h
 LIB_H += util/strbuf.h
 LIB_H += util/strlist.h
 LIB_H += util/strfilter.h
@@ -347,6 +348,7 @@
 LIB_OBJS += $(OUTPUT)util/map.o
 LIB_OBJS += $(OUTPUT)util/pstack.o
 LIB_OBJS += $(OUTPUT)util/session.o
+LIB_OBJS += $(OUTPUT)util/ordered-events.o
 LIB_OBJS += $(OUTPUT)util/comm.o
 LIB_OBJS += $(OUTPUT)util/thread.o
 LIB_OBJS += $(OUTPUT)util/thread_map.o
@@ -423,6 +425,7 @@
 endif
 LIB_OBJS += $(OUTPUT)tests/mmap-thread-lookup.o
 LIB_OBJS += $(OUTPUT)tests/thread-mg-share.o
+LIB_OBJS += $(OUTPUT)tests/switch-tracking.o
 
 BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o
 BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
diff --git a/tools/perf/arch/arm64/include/perf_regs.h b/tools/perf/arch/arm64/include/perf_regs.h
index e9441b9..1d3f39c 100644
--- a/tools/perf/arch/arm64/include/perf_regs.h
+++ b/tools/perf/arch/arm64/include/perf_regs.h
@@ -6,6 +6,8 @@
 #include <asm/perf_regs.h>
 
 #define PERF_REGS_MASK	((1ULL << PERF_REG_ARM64_MAX) - 1)
+#define PERF_REGS_MAX	PERF_REG_ARM64_MAX
+
 #define PERF_REG_IP	PERF_REG_ARM64_PC
 #define PERF_REG_SP	PERF_REG_ARM64_SP
 
diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c
index 42faf36..49776f1 100644
--- a/tools/perf/arch/common.c
+++ b/tools/perf/arch/common.c
@@ -12,6 +12,11 @@
 	NULL
 };
 
+const char *const arm64_triplets[] = {
+	"aarch64-linux-android-",
+	NULL
+};
+
 const char *const powerpc_triplets[] = {
 	"powerpc-unknown-linux-gnu-",
 	"powerpc64-unknown-linux-gnu-",
@@ -105,6 +110,8 @@
 		return "x86";
 	if (!strcmp(arch, "sun4u") || !strncmp(arch, "sparc", 5))
 		return "sparc";
+	if (!strcmp(arch, "aarch64") || !strcmp(arch, "arm64"))
+		return "arm64";
 	if (!strncmp(arch, "arm", 3) || !strcmp(arch, "sa110"))
 		return "arm";
 	if (!strncmp(arch, "s390", 4))
@@ -159,6 +166,8 @@
 
 	if (!strcmp(arch, "arm"))
 		path_list = arm_triplets;
+	else if (!strcmp(arch, "arm64"))
+		path_list = arm64_triplets;
 	else if (!strcmp(arch, "powerpc"))
 		path_list = powerpc_triplets;
 	else if (!strcmp(arch, "sh"))
diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
index a7c23a4..d73ef8b 100644
--- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c
+++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
@@ -15,6 +15,7 @@
 
 #include "util/thread.h"
 #include "util/callchain.h"
+#include "util/debug.h"
 
 /*
  * When saving the callchain on Power, the kernel conservatively saves
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 1ec429f..d4da692 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -36,7 +36,8 @@
 
 struct perf_annotate {
 	struct perf_tool tool;
-	bool	   force, use_tui, use_stdio, use_gtk;
+	struct perf_session *session;
+	bool	   use_tui, use_stdio, use_gtk;
 	bool	   full_paths;
 	bool	   print_line;
 	bool	   skip_missing;
@@ -188,18 +189,9 @@
 static int __cmd_annotate(struct perf_annotate *ann)
 {
 	int ret;
-	struct perf_session *session;
+	struct perf_session *session = ann->session;
 	struct perf_evsel *pos;
 	u64 total_nr_samples;
-	struct perf_data_file file = {
-		.path  = input_name,
-		.mode  = PERF_DATA_MODE_READ,
-		.force = ann->force,
-	};
-
-	session = perf_session__new(&file, false, &ann->tool);
-	if (session == NULL)
-		return -ENOMEM;
 
 	machines__set_symbol_filter(&session->machines, symbol__annotate_init);
 
@@ -207,22 +199,22 @@
 		ret = perf_session__cpu_bitmap(session, ann->cpu_list,
 					       ann->cpu_bitmap);
 		if (ret)
-			goto out_delete;
+			goto out;
 	}
 
 	if (!objdump_path) {
 		ret = perf_session_env__lookup_objdump(&session->header.env);
 		if (ret)
-			goto out_delete;
+			goto out;
 	}
 
 	ret = perf_session__process_events(session, &ann->tool);
 	if (ret)
-		goto out_delete;
+		goto out;
 
 	if (dump_trace) {
 		perf_session__fprintf_nr_events(session, stdout);
-		goto out_delete;
+		goto out;
 	}
 
 	if (verbose > 3)
@@ -250,8 +242,8 @@
 	}
 
 	if (total_nr_samples == 0) {
-		ui__error("The %s file has no samples!\n", file.path);
-		goto out_delete;
+		ui__error("The %s file has no samples!\n", session->file->path);
+		goto out;
 	}
 
 	if (use_browser == 2) {
@@ -261,24 +253,12 @@
 					 "perf_gtk__show_annotations");
 		if (show_annotations == NULL) {
 			ui__error("GTK browser not found!\n");
-			goto out_delete;
+			goto out;
 		}
 		show_annotations();
 	}
 
-out_delete:
-	/*
-	 * Speed up the exit process, for large files this can
-	 * take quite a while.
-	 *
-	 * XXX Enable this when using valgrind or if we ever
-	 * librarize this command.
-	 *
-	 * Also experiment with obstacks to see how much speed
-	 * up we'll get here.
-	 *
-	 * perf_session__delete(session);
-	 */
+out:
 	return ret;
 }
 
@@ -297,10 +277,14 @@
 			.comm	= perf_event__process_comm,
 			.exit	= perf_event__process_exit,
 			.fork	= perf_event__process_fork,
-			.ordered_samples = true,
+			.ordered_events = true,
 			.ordering_requires_timestamps = true,
 		},
 	};
+	struct perf_data_file file = {
+		.path  = input_name,
+		.mode  = PERF_DATA_MODE_READ,
+	};
 	const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
@@ -308,7 +292,7 @@
 		   "only consider symbols in these dsos"),
 	OPT_STRING('s', "symbol", &annotate.sym_hist_filter, "symbol",
 		    "symbol to annotate"),
-	OPT_BOOLEAN('f', "force", &annotate.force, "don't complain, do it"),
+	OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
@@ -341,6 +325,7 @@
 		    "Show event group information together"),
 	OPT_END()
 	};
+	int ret;
 
 	argc = parse_options(argc, argv, options, annotate_usage, 0);
 
@@ -353,11 +338,16 @@
 
 	setup_browser(true);
 
+	annotate.session = perf_session__new(&file, false, &annotate.tool);
+	if (annotate.session == NULL)
+		return -ENOMEM;
+
 	symbol_conf.priv_size = sizeof(struct annotation);
 	symbol_conf.try_vmlinux_path = true;
 
-	if (symbol__init() < 0)
-		return -1;
+	ret = symbol__init(&annotate.session->header.env);
+	if (ret < 0)
+		goto out_delete;
 
 	if (setup_sorting() < 0)
 		usage_with_options(annotate_usage, options);
@@ -373,5 +363,20 @@
 		annotate.sym_hist_filter = argv[0];
 	}
 
-	return __cmd_annotate(&annotate);
+	ret = __cmd_annotate(&annotate);
+
+out_delete:
+	/*
+	 * Speed up the exit process, for large files this can
+	 * take quite a while.
+	 *
+	 * XXX Enable this when using valgrind or if we ever
+	 * librarize this command.
+	 *
+	 * Also experiment with obstacks to see how much speed
+	 * up we'll get here.
+	 *
+	 * perf_session__delete(session);
+	 */
+	return ret;
 }
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c
index 2a2c78f..7038575 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -246,20 +246,9 @@
 	return true;
 }
 
-static int build_id_cache__fprintf_missing(const char *filename, bool force, FILE *fp)
+static int build_id_cache__fprintf_missing(struct perf_session *session, FILE *fp)
 {
-	struct perf_data_file file = {
-		.path  = filename,
-		.mode  = PERF_DATA_MODE_READ,
-		.force = force,
-	};
-	struct perf_session *session = perf_session__new(&file, false, NULL);
-	if (session == NULL)
-		return -1;
-
 	perf_session__fprintf_dsos_buildid(session, fp, dso__missing_buildid_cache, 0);
-	perf_session__delete(session);
-
 	return 0;
 }
 
@@ -302,6 +291,12 @@
 		   *missing_filename = NULL,
 		   *update_name_list_str = NULL,
 		   *kcore_filename;
+	char sbuf[STRERR_BUFSIZE];
+
+	struct perf_data_file file = {
+		.mode  = PERF_DATA_MODE_READ,
+	};
+	struct perf_session *session = NULL;
 
 	const struct option buildid_cache_options[] = {
 	OPT_STRING('a', "add", &add_name_list_str,
@@ -326,8 +321,17 @@
 	argc = parse_options(argc, argv, buildid_cache_options,
 			     buildid_cache_usage, 0);
 
-	if (symbol__init() < 0)
-		return -1;
+	if (missing_filename) {
+		file.path = missing_filename;
+		file.force = force;
+
+		session = perf_session__new(&file, false, NULL);
+		if (session == NULL)
+			return -1;
+	}
+
+	if (symbol__init(session ? &session->header.env : NULL) < 0)
+		goto out;
 
 	setup_pager();
 
@@ -344,7 +348,7 @@
 						continue;
 					}
 					pr_warning("Couldn't add %s: %s\n",
-						   pos->s, strerror(errno));
+						   pos->s, strerror_r(errno, sbuf, sizeof(sbuf)));
 				}
 
 			strlist__delete(list);
@@ -362,7 +366,7 @@
 						continue;
 					}
 					pr_warning("Couldn't remove %s: %s\n",
-						   pos->s, strerror(errno));
+						   pos->s, strerror_r(errno, sbuf, sizeof(sbuf)));
 				}
 
 			strlist__delete(list);
@@ -370,7 +374,7 @@
 	}
 
 	if (missing_filename)
-		ret = build_id_cache__fprintf_missing(missing_filename, force, stdout);
+		ret = build_id_cache__fprintf_missing(session, stdout);
 
 	if (update_name_list_str) {
 		list = strlist__new(true, update_name_list_str);
@@ -383,7 +387,7 @@
 						continue;
 					}
 					pr_warning("Couldn't update %s: %s\n",
-						   pos->s, strerror(errno));
+						   pos->s, strerror_r(errno, sbuf, sizeof(sbuf)));
 				}
 
 			strlist__delete(list);
@@ -394,5 +398,9 @@
 	    build_id_cache__add_kcore(kcore_filename, debugdir, force))
 		pr_warning("Couldn't add %s\n", kcore_filename);
 
+out:
+	if (session)
+		perf_session__delete(session);
+
 	return ret;
 }
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 9a5a035..190d0b6 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -360,7 +360,7 @@
 	.exit	= perf_event__process_exit,
 	.fork	= perf_event__process_fork,
 	.lost	= perf_event__process_lost,
-	.ordered_samples = true,
+	.ordered_events = true,
 	.ordering_requires_timestamps = true,
 };
 
@@ -1143,7 +1143,7 @@
 
 	argc = parse_options(argc, argv, options, diff_usage, 0);
 
-	if (symbol__init() < 0)
+	if (symbol__init(NULL) < 0)
 		return -1;
 
 	if (data_init(argc, argv) < 0)
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index 0384d93..25d2062 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -103,6 +103,8 @@
 
 static void exec_woman_emacs(const char *path, const char *page)
 {
+	char sbuf[STRERR_BUFSIZE];
+
 	if (!check_emacsclient_version()) {
 		/* This works only with emacsclient version >= 22. */
 		struct strbuf man_page = STRBUF_INIT;
@@ -111,16 +113,19 @@
 			path = "emacsclient";
 		strbuf_addf(&man_page, "(woman \"%s\")", page);
 		execlp(path, "emacsclient", "-e", man_page.buf, NULL);
-		warning("failed to exec '%s': %s", path, strerror(errno));
+		warning("failed to exec '%s': %s", path,
+			strerror_r(errno, sbuf, sizeof(sbuf)));
 	}
 }
 
 static void exec_man_konqueror(const char *path, const char *page)
 {
 	const char *display = getenv("DISPLAY");
+
 	if (display && *display) {
 		struct strbuf man_page = STRBUF_INIT;
 		const char *filename = "kfmclient";
+		char sbuf[STRERR_BUFSIZE];
 
 		/* It's simpler to launch konqueror using kfmclient. */
 		if (path) {
@@ -139,24 +144,31 @@
 			path = "kfmclient";
 		strbuf_addf(&man_page, "man:%s(1)", page);
 		execlp(path, filename, "newTab", man_page.buf, NULL);
-		warning("failed to exec '%s': %s", path, strerror(errno));
+		warning("failed to exec '%s': %s", path,
+			strerror_r(errno, sbuf, sizeof(sbuf)));
 	}
 }
 
 static void exec_man_man(const char *path, const char *page)
 {
+	char sbuf[STRERR_BUFSIZE];
+
 	if (!path)
 		path = "man";
 	execlp(path, "man", page, NULL);
-	warning("failed to exec '%s': %s", path, strerror(errno));
+	warning("failed to exec '%s': %s", path,
+		strerror_r(errno, sbuf, sizeof(sbuf)));
 }
 
 static void exec_man_cmd(const char *cmd, const char *page)
 {
 	struct strbuf shell_cmd = STRBUF_INIT;
+	char sbuf[STRERR_BUFSIZE];
+
 	strbuf_addf(&shell_cmd, "%s %s", cmd, page);
 	execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL);
-	warning("failed to exec '%s': %s", cmd, strerror(errno));
+	warning("failed to exec '%s': %s", cmd,
+		strerror_r(errno, sbuf, sizeof(sbuf)));
 }
 
 static void add_man_viewer(const char *name)
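The strerror(3)-to-strerror_r(3) conversions here, and the many that follow in this series, all use the same idiom: a caller-owned sbuf of STRERR_BUFSIZE bytes makes the message thread-safe instead of relying on strerror()'s shared static buffer. A minimal standalone sketch of the idiom; the buffer size is an assumption for the example (perf defines its own STRERR_BUFSIZE), and _GNU_SOURCE selects the GNU strerror_r() that returns a string, which is what the calls above rely on:

	#define _GNU_SOURCE
	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	#define STRERR_BUFSIZE 128	/* assumed value, for the sketch only */

	int main(void)
	{
		char sbuf[STRERR_BUFSIZE];

		errno = ENOENT;
		/* GNU strerror_r() may return a pointer into sbuf. */
		fprintf(stderr, "error: %s\n",
			strerror_r(errno, sbuf, sizeof(sbuf)));
		return 0;
	}
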
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 9a02807..3a62b6b 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -23,6 +23,7 @@
 
 struct perf_inject {
 	struct perf_tool	tool;
+	struct perf_session	*session;
 	bool			build_ids;
 	bool			sched_stat;
 	const char		*input_name;
@@ -340,12 +341,8 @@
 
 static int __cmd_inject(struct perf_inject *inject)
 {
-	struct perf_session *session;
 	int ret = -EINVAL;
-	struct perf_data_file file = {
-		.path = inject->input_name,
-		.mode = PERF_DATA_MODE_READ,
-	};
+	struct perf_session *session = inject->session;
 	struct perf_data_file *file_out = &inject->output;
 
 	signal(SIGINT, sig_handler);
@@ -357,16 +354,12 @@
 		inject->tool.tracing_data = perf_event__repipe_tracing_data;
 	}
 
-	session = perf_session__new(&file, true, &inject->tool);
-	if (session == NULL)
-		return -ENOMEM;
-
 	if (inject->build_ids) {
 		inject->tool.sample = perf_event__inject_buildid;
 	} else if (inject->sched_stat) {
 		struct perf_evsel *evsel;
 
-		inject->tool.ordered_samples = true;
+		inject->tool.ordered_events = true;
 
 		evlist__for_each(session->evlist, evsel) {
 			const char *name = perf_evsel__name(evsel);
@@ -396,8 +389,6 @@
 		perf_session__write_header(session, session->evlist, file_out->fd, true);
 	}
 
-	perf_session__delete(session);
-
 	return ret;
 }
 
@@ -427,6 +418,11 @@
 			.mode = PERF_DATA_MODE_WRITE,
 		},
 	};
+	struct perf_data_file file = {
+		.mode = PERF_DATA_MODE_READ,
+	};
+	int ret;
+
 	const struct option options[] = {
 		OPT_BOOLEAN('b', "build-ids", &inject.build_ids,
 			    "Inject build-ids into the output stream"),
@@ -461,8 +457,17 @@
 		return -1;
 	}
 
-	if (symbol__init() < 0)
+	file.path = inject.input_name;
+	inject.session = perf_session__new(&file, true, &inject.tool);
+	if (inject.session == NULL)
+		return -ENOMEM;
+
+	if (symbol__init(&inject.session->header.env) < 0)
 		return -1;
 
-	return __cmd_inject(&inject);
+	ret = __cmd_inject(&inject);
+
+	perf_session__delete(inject.session);
+
+	return ret;
 }
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index bef3376..2376218 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -256,7 +256,9 @@
 static struct perf_tool perf_kmem = {
 	.sample		 = process_sample_event,
 	.comm		 = perf_event__process_comm,
-	.ordered_samples = true,
+	.mmap		 = perf_event__process_mmap,
+	.mmap2		 = perf_event__process_mmap2,
+	.ordered_events	 = true,
 };
 
 static double fragmentation(unsigned long n_req, unsigned long n_alloc)
@@ -403,10 +405,9 @@
 	__sort_result(&root_caller_stat, &root_caller_sorted, &caller_sort);
 }
 
-static int __cmd_kmem(void)
+static int __cmd_kmem(struct perf_session *session)
 {
 	int err = -EINVAL;
-	struct perf_session *session;
 	const struct perf_evsel_str_handler kmem_tracepoints[] = {
 		{ "kmem:kmalloc",		perf_evsel__process_alloc_event, },
     		{ "kmem:kmem_cache_alloc",	perf_evsel__process_alloc_event, },
@@ -415,34 +416,22 @@
 		{ "kmem:kfree",			perf_evsel__process_free_event, },
     		{ "kmem:kmem_cache_free",	perf_evsel__process_free_event, },
 	};
-	struct perf_data_file file = {
-		.path = input_name,
-		.mode = PERF_DATA_MODE_READ,
-	};
-
-	session = perf_session__new(&file, false, &perf_kmem);
-	if (session == NULL)
-		return -ENOMEM;
-
-	if (perf_session__create_kernel_maps(session) < 0)
-		goto out_delete;
 
 	if (!perf_session__has_traces(session, "kmem record"))
-		goto out_delete;
+		goto out;
 
 	if (perf_session__set_tracepoints_handlers(session, kmem_tracepoints)) {
 		pr_err("Initializing perf session tracepoint handlers failed\n");
-		return -1;
+		goto out;
 	}
 
 	setup_pager();
 	err = perf_session__process_events(session, &perf_kmem);
 	if (err != 0)
-		goto out_delete;
+		goto out;
 	sort_result();
 	print_result(session);
-out_delete:
-	perf_session__delete(session);
+out:
 	return err;
 }
 
@@ -689,29 +678,46 @@
 		NULL,
 		NULL
 	};
+	struct perf_session *session;
+	struct perf_data_file file = {
+		.path = input_name,
+		.mode = PERF_DATA_MODE_READ,
+	};
+	int ret = -1;
+
 	argc = parse_options_subcommand(argc, argv, kmem_options,
 					kmem_subcommands, kmem_usage, 0);
 
 	if (!argc)
 		usage_with_options(kmem_usage, kmem_options);
 
-	symbol__init();
-
 	if (!strncmp(argv[0], "rec", 3)) {
+		symbol__init(NULL);
 		return __cmd_record(argc, argv);
-	} else if (!strcmp(argv[0], "stat")) {
+	}
+
+	session = perf_session__new(&file, false, &perf_kmem);
+	if (session == NULL)
+		return -ENOMEM;
+
+	symbol__init(&session->header.env);
+
+	if (!strcmp(argv[0], "stat")) {
 		if (cpu__setup_cpunode_map())
-			return -1;
+			goto out_delete;
 
 		if (list_empty(&caller_sort))
 			setup_sorting(&caller_sort, default_sort_order);
 		if (list_empty(&alloc_sort))
 			setup_sorting(&alloc_sort, default_sort_order);
 
-		return __cmd_kmem();
+		ret = __cmd_kmem(session);
 	} else
 		usage_with_options(kmem_usage, kmem_options);
 
-	return 0;
+out_delete:
+	perf_session__delete(session);
+
+	return ret;
 }
 
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 43367eb..1a4ef9c 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -592,8 +592,8 @@
 	pr_info("%9s ", "Samples%");
 
 	pr_info("%9s ", "Time%");
-	pr_info("%10s ", "Min Time");
-	pr_info("%10s ", "Max Time");
+	pr_info("%11s ", "Min Time");
+	pr_info("%11s ", "Max Time");
 	pr_info("%16s ", "Avg time");
 	pr_info("\n\n");
 
@@ -610,8 +610,8 @@
 		pr_info("%10llu ", (unsigned long long)ecount);
 		pr_info("%8.2f%% ", (double)ecount / kvm->total_count * 100);
 		pr_info("%8.2f%% ", (double)etime / kvm->total_time * 100);
-		pr_info("%8" PRIu64 "us ", min / 1000);
-		pr_info("%8" PRIu64 "us ", max / 1000);
+		pr_info("%9.2fus ", (double)min / 1e3);
+		pr_info("%9.2fus ", (double)max / 1e3);
 		pr_info("%9.2fus ( +-%7.2f%% )", (double)etime / ecount/1e3,
 			kvm_event_rel_stddev(vcpu, event));
 		pr_info("\n");
@@ -732,7 +732,7 @@
 			return -1;
 		}
 
-		err = perf_session_queue_event(kvm->session, event, &sample, 0);
+		err = perf_session_queue_event(kvm->session, event, &kvm->tool, &sample, 0);
 		/*
 		 * FIXME: Here we can't consume the event, as perf_session_queue_event will
 		 *        point to it, and it'll get possibly overwritten by the kernel.
@@ -785,7 +785,7 @@
 
 	/* flush queue after each round in which we processed events */
 	if (ntotal) {
-		kvm->session->ordered_samples.next_flush = flush_time;
+		kvm->session->ordered_events.next_flush = flush_time;
 		err = kvm->tool.finished_round(&kvm->tool, NULL, kvm->session);
 		if (err) {
 			if (kvm->lost_events)
@@ -885,15 +885,11 @@
 	return 0;
 }
 
-static
-int perf_kvm__handle_stdin(struct termios *tc_now, struct termios *tc_save)
+static int perf_kvm__handle_stdin(void)
 {
 	int c;
 
-	tcsetattr(0, TCSANOW, tc_now);
 	c = getc(stdin);
-	tcsetattr(0, TCSAFLUSH, tc_save);
-
 	if (c == 'q')
 		return 1;
 
@@ -904,7 +900,7 @@
 {
 	struct pollfd *pollfds = NULL;
 	int nr_fds, nr_stdin, ret, err = -EINVAL;
-	struct termios tc, save;
+	struct termios save;
 
 	/* live flag must be set first */
 	kvm->live = true;
@@ -919,14 +915,9 @@
 		goto out;
 	}
 
+	set_term_quiet_input(&save);
 	init_kvm_event_record(kvm);
 
-	tcgetattr(0, &save);
-	tc = save;
-	tc.c_lflag &= ~(ICANON | ECHO);
-	tc.c_cc[VMIN] = 0;
-	tc.c_cc[VTIME] = 0;
-
 	signal(SIGINT, sig_handler);
 	signal(SIGTERM, sig_handler);
 
@@ -972,7 +963,7 @@
 			goto out;
 
 		if (pollfds[nr_stdin].revents & POLLIN)
-			done = perf_kvm__handle_stdin(&tc, &save);
+			done = perf_kvm__handle_stdin();
 
 		if (!rc && !done)
 			err = poll(pollfds, nr_fds, 100);
@@ -989,6 +980,7 @@
 	if (kvm->timerfd >= 0)
 		close(kvm->timerfd);
 
+	tcsetattr(0, TCSAFLUSH, &save);
 	free(pollfds);
 	return err;
 }
@@ -998,6 +990,7 @@
 	int err, rc = -1;
 	struct perf_evsel *pos;
 	struct perf_evlist *evlist = kvm->evlist;
+	char sbuf[STRERR_BUFSIZE];
 
 	perf_evlist__config(evlist, &kvm->opts);
 
@@ -1034,12 +1027,14 @@
 
 	err = perf_evlist__open(evlist);
 	if (err < 0) {
-		printf("Couldn't create the events: %s\n", strerror(errno));
+		printf("Couldn't create the events: %s\n",
+		       strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out;
 	}
 
 	if (perf_evlist__mmap(evlist, kvm->opts.mmap_pages, false) < 0) {
-		ui__error("Failed to mmap the events: %s\n", strerror(errno));
+		ui__error("Failed to mmap the events: %s\n",
+			  strerror_r(errno, sbuf, sizeof(sbuf)));
 		perf_evlist__close(evlist);
 		goto out;
 	}
@@ -1058,7 +1053,7 @@
 	struct perf_tool eops = {
 		.sample			= process_sample_event,
 		.comm			= perf_event__process_comm,
-		.ordered_samples	= true,
+		.ordered_events		= true,
 	};
 	struct perf_data_file file = {
 		.path = kvm->file_name,
@@ -1072,6 +1067,8 @@
 		return -EINVAL;
 	}
 
+	symbol__init(&kvm->session->header.env);
+
 	if (!perf_session__has_traces(kvm->session, "kvm record"))
 		return -EINVAL;
 
@@ -1201,8 +1198,6 @@
 		NULL
 	};
 
-	symbol__init();
-
 	if (argc) {
 		argc = parse_options(argc, argv,
 				     kvm_events_report_options,
@@ -1311,7 +1306,7 @@
 	kvm->tool.exit   = perf_event__process_exit;
 	kvm->tool.fork   = perf_event__process_fork;
 	kvm->tool.lost   = process_lost_event;
-	kvm->tool.ordered_samples = true;
+	kvm->tool.ordered_events = true;
 	perf_tool__fill_defaults(&kvm->tool);
 
 	/* set defaults */
@@ -1322,7 +1317,7 @@
 	kvm->opts.target.uid_str = NULL;
 	kvm->opts.target.uid = UINT_MAX;
 
-	symbol__init();
+	symbol__init(NULL);
 	disable_buildid_cache();
 
 	use_browser = 0;
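Both this file and builtin-top.c below replace the open-coded termios setup with a set_term_quiet_input() helper whose body is not part of these hunks. Judging from the code it replaces, the helper presumably amounts to the following sketch:

	#include <termios.h>

	/*
	 * Reconstructed from the removed sequence: save the current
	 * terminal settings, then disable canonical mode and echo and
	 * make reads non-blocking.
	 */
	static void set_term_quiet_input(struct termios *old)
	{
		struct termios tc;

		tcgetattr(0, old);
		tc = *old;
		tc.c_lflag &= ~(ICANON | ECHO);
		tc.c_cc[VMIN] = 0;
		tc.c_cc[VTIME] = 0;
		tcsetattr(0, TCSANOW, &tc);
	}

Callers restore the saved state afterwards with tcsetattr(0, TCSAFLUSH, &save), as the hunks above show.
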
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 6148afc..92790ed 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -852,7 +852,7 @@
 	struct perf_tool eops = {
 		.sample		 = process_sample_event,
 		.comm		 = perf_event__process_comm,
-		.ordered_samples = true,
+		.ordered_events	 = true,
 	};
 	struct perf_data_file file = {
 		.path = input_name,
@@ -865,6 +865,8 @@
 		return -ENOMEM;
 	}
 
+	symbol__init(&session->header.env);
+
 	if (!perf_session__has_traces(session, "lock record"))
 		goto out_delete;
 
@@ -974,7 +976,6 @@
 	unsigned int i;
 	int rc = 0;
 
-	symbol__init();
 	for (i = 0; i < LOCKHASH_SIZE; i++)
 		INIT_LIST_HEAD(lockhash_table + i);
 
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 4a1a6c9..8b4a87f 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -133,7 +133,7 @@
 			goto out_delete;
 	}
 
-	if (symbol__init() < 0)
+	if (symbol__init(&session->header.env) < 0)
 		return -1;
 
 	printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
@@ -194,7 +194,7 @@
 			.lost		= perf_event__process_lost,
 			.fork		= perf_event__process_fork,
 			.build_id	= perf_event__process_build_id,
-			.ordered_samples = true,
+			.ordered_events	= true,
 		},
 		.input_name		 = "perf.data",
 	};
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index c63fa29..347729e 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -290,8 +290,11 @@
 
 static void pr_err_with_code(const char *msg, int err)
 {
+	char sbuf[STRERR_BUFSIZE];
+
 	pr_err("%s", msg);
-	pr_debug(" Reason: %s (Code: %d)", strerror(-err), err);
+	pr_debug(" Reason: %s (Code: %d)",
+		 strerror_r(-err, sbuf, sizeof(sbuf)), err);
 	pr_err("\n");
 }
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 4869050..87e28a4 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -161,7 +161,7 @@
 
 	if (perf_evlist__apply_filters(evlist)) {
 		error("failed to set filter with %d (%s)\n", errno,
-			strerror(errno));
+			strerror_r(errno, msg, sizeof(msg)));
 		rc = -1;
 		goto out;
 	}
@@ -175,7 +175,8 @@
 			       "(current value: %u)\n", opts->mmap_pages);
 			rc = -errno;
 		} else {
-			pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
+			pr_err("failed to mmap with %d (%s)\n", errno,
+				strerror_r(errno, msg, sizeof(msg)));
 			rc = -errno;
 		}
 		goto out;
@@ -480,7 +481,7 @@
 	}
 
 	if (forks && workload_exec_errno) {
-		char msg[512];
+		char msg[STRERR_BUFSIZE];
 		const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
 		pr_err("Workload failed: %s\n", emsg);
 		err = -1;
@@ -781,6 +782,7 @@
  */
 static struct record record = {
 	.opts = {
+		.sample_time	     = true,
 		.mmap_pages	     = UINT_MAX,
 		.user_freq	     = UINT_MAX,
 		.user_interval	     = ULLONG_MAX,
@@ -907,7 +909,7 @@
 		usage_with_options(record_usage, record_options);
 	}
 
-	symbol__init();
+	symbol__init(NULL);
 
 	if (symbol_conf.kptr_restrict)
 		pr_warning(
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 21d830b..3da59a8 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -58,17 +58,19 @@
 	const char		*symbol_filter_str;
 	float			min_percent;
 	u64			nr_entries;
+	u64			queue_size;
 	DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 };
 
 static int report__config(const char *var, const char *value, void *cb)
 {
+	struct report *rep = cb;
+
 	if (!strcmp(var, "report.group")) {
 		symbol_conf.event_group = perf_config_bool(var, value);
 		return 0;
 	}
 	if (!strcmp(var, "report.percent-limit")) {
-		struct report *rep = cb;
 		rep->min_percent = strtof(value, NULL);
 		return 0;
 	}
@@ -76,6 +78,10 @@
 		symbol_conf.cumulate_callchain = perf_config_bool(var, value);
 		return 0;
 	}
+	if (!strcmp(var, "report.queue-size")) {
+		rep->queue_size = perf_config_u64(var, value);
+		return 0;
+	}
 
 	return perf_default_config(var, value, cb);
 }
@@ -578,7 +584,7 @@
 			.attr		 = perf_event__process_attr,
 			.tracing_data	 = perf_event__process_tracing_data,
 			.build_id	 = perf_event__process_build_id,
-			.ordered_samples = true,
+			.ordered_events	 = true,
 			.ordering_requires_timestamps = true,
 		},
 		.max_stack		 = PERF_MAX_STACK_DEPTH,
@@ -714,12 +720,17 @@
 	if (session == NULL)
 		return -ENOMEM;
 
+	if (report.queue_size) {
+		ordered_events__set_alloc_size(&session->ordered_events,
+					       report.queue_size);
+	}
+
 	report.session = session;
 
 	has_br_stack = perf_header__has_feat(&session->header,
 					     HEADER_BRANCH_STACK);
 
-	if (branch_mode == -1 && has_br_stack) {
+	if ((branch_mode == -1 && has_br_stack) || branch_mode == 1) {
 		sort__mode = SORT_MODE__BRANCH;
 		symbol_conf.cumulate_callchain = false;
 	}
@@ -787,7 +798,7 @@
 		}
 	}
 
-	if (symbol__init() < 0)
+	if (symbol__init(&session->header.env) < 0)
 		goto error;
 
 	if (argc) {
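The new report.queue-size knob is parsed with perf_config_u64(), so it should take the usual perf size suffixes; assuming that holds, capping the ordered-events queue at 100 MB from ~/.perfconfig would look like:

	[report]
		queue-size = 100M
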
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index f83c08c..9c9287f 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -428,6 +428,7 @@
 static int self_open_counters(void)
 {
 	struct perf_event_attr attr;
+	char sbuf[STRERR_BUFSIZE];
 	int fd;
 
 	memset(&attr, 0, sizeof(attr));
@@ -440,7 +441,8 @@
 
 	if (fd < 0)
 		pr_err("Error: sys_perf_event_open() syscall returned "
-		       "with %d (%s)\n", fd, strerror(errno));
+		       "with %d (%s)\n", fd,
+		       strerror_r(errno, sbuf, sizeof(sbuf)));
 	return fd;
 }
 
@@ -1462,6 +1464,8 @@
 		return -1;
 	}
 
+	symbol__init(&session->header.env);
+
 	if (perf_session__set_tracepoints_handlers(session, handlers))
 		goto out_delete;
 
@@ -1662,7 +1666,7 @@
 			.comm		 = perf_event__process_comm,
 			.lost		 = perf_event__process_lost,
 			.fork		 = perf_sched__process_fork_event,
-			.ordered_samples = true,
+			.ordered_events = true,
 		},
 		.cmp_pid	      = LIST_HEAD_INIT(sched.cmp_pid),
 		.sort_list	      = LIST_HEAD_INIT(sched.sort_list),
@@ -1747,7 +1751,6 @@
 	if (!strcmp(argv[0], "script"))
 		return cmd_script(argc, argv, prefix);
 
-	symbol__init();
 	if (!strncmp(argv[0], "rec", 3)) {
 		return __cmd_record(argc, argv);
 	} else if (!strncmp(argv[0], "lat", 3)) {
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index f57035b..02dce92 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -184,10 +184,6 @@
 		if (perf_evsel__check_stype(evsel, PERF_SAMPLE_IP, "IP",
 					    PERF_OUTPUT_IP))
 			return -EINVAL;
-
-		if (!no_callchain &&
-		    !(attr->sample_type & PERF_SAMPLE_CALLCHAIN))
-			symbol_conf.use_callchain = false;
 	}
 
 	if (PRINT_FIELD(ADDR) &&
@@ -290,6 +286,19 @@
 		set_print_ip_opts(&evsel->attr);
 	}
 
+	if (!no_callchain) {
+		bool use_callchain = false;
+
+		evlist__for_each(session->evlist, evsel) {
+			if (evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
+				use_callchain = true;
+				break;
+			}
+		}
+		if (!use_callchain)
+			symbol_conf.use_callchain = false;
+	}
+
 	/*
 	 * set default for tracepoints to print symbols only
 	 * if callchains are present
@@ -476,6 +485,11 @@
 	return 0;
 }
 
+static int default_flush_script(void)
+{
+	return 0;
+}
+
 static int default_stop_script(void)
 {
 	return 0;
@@ -489,6 +503,7 @@
 
 static struct scripting_ops default_scripting_ops = {
 	.start_script		= default_start_script,
+	.flush_script		= default_flush_script,
 	.stop_script		= default_stop_script,
 	.process_event		= process_event,
 	.generate_script	= default_generate_script,
@@ -504,6 +519,11 @@
 	scripting_ops = &default_scripting_ops;
 }
 
+static int flush_scripting(void)
+{
+	return scripting_ops->flush_script();
+}
+
 static int cleanup_scripting(void)
 {
 	pr_debug("\nperf script stopped\n");
@@ -1471,12 +1491,13 @@
 	bool show_full_info = false;
 	bool header = false;
 	bool header_only = false;
+	bool script_started = false;
 	char *rec_script_path = NULL;
 	char *rep_script_path = NULL;
 	struct perf_session *session;
 	char *script_path = NULL;
 	const char **__argv;
-	int i, j, err;
+	int i, j, err = 0;
 	struct perf_script script = {
 		.tool = {
 			.sample		 = process_sample_event,
@@ -1488,7 +1509,7 @@
 			.attr		 = process_attr,
 			.tracing_data	 = perf_event__process_tracing_data,
 			.build_id	 = perf_event__process_build_id,
-			.ordered_samples = true,
+			.ordered_events	 = true,
 			.ordering_requires_timestamps = true,
 		},
 	};
@@ -1718,8 +1739,6 @@
 		exit(-1);
 	}
 
-	if (symbol__init() < 0)
-		return -1;
 	if (!script_name)
 		setup_pager();
 
@@ -1730,14 +1749,18 @@
 	if (header || header_only) {
 		perf_session__fprintf_info(session, stdout, show_full_info);
 		if (header_only)
-			return 0;
+			goto out_delete;
 	}
 
+	if (symbol__init(&session->header.env) < 0)
+		goto out_delete;
+
 	script.session = session;
 
 	if (cpu_list) {
-		if (perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap))
-			return -1;
+		err = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap);
+		if (err < 0)
+			goto out_delete;
 	}
 
 	if (!no_callchain)
@@ -1752,53 +1775,62 @@
 		if (output_set_by_user()) {
 			fprintf(stderr,
 				"custom fields not supported for generated scripts");
-			return -1;
+			err = -EINVAL;
+			goto out_delete;
 		}
 
 		input = open(file.path, O_RDONLY);	/* input_name */
 		if (input < 0) {
+			err = -errno;
 			perror("failed to open file");
-			return -1;
+			goto out_delete;
 		}
 
 		err = fstat(input, &perf_stat);
 		if (err < 0) {
 			perror("failed to stat file");
-			return -1;
+			goto out_delete;
 		}
 
 		if (!perf_stat.st_size) {
 			fprintf(stderr, "zero-sized file, nothing to do!\n");
-			return 0;
+			goto out_delete;
 		}
 
 		scripting_ops = script_spec__lookup(generate_script_lang);
 		if (!scripting_ops) {
 			fprintf(stderr, "invalid language specifier");
-			return -1;
+			err = -ENOENT;
+			goto out_delete;
 		}
 
 		err = scripting_ops->generate_script(session->tevent.pevent,
 						     "perf-script");
-		goto out;
+		goto out_delete;
 	}
 
 	if (script_name) {
 		err = scripting_ops->start_script(script_name, argc, argv);
 		if (err)
-			goto out;
+			goto out_delete;
 		pr_debug("perf script started with script %s\n\n", script_name);
+		script_started = true;
 	}
 
 
 	err = perf_session__check_output_opt(session);
 	if (err < 0)
-		goto out;
+		goto out_delete;
 
 	err = __cmd_script(&script);
 
+	flush_scripting();
+
+out_delete:
 	perf_session__delete(session);
-	cleanup_scripting();
+
+	if (script_started)
+		cleanup_scripting();
 out:
 	return err;
 }
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 3e80aa1..5fe0edb 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -593,7 +593,7 @@
 
 	if (perf_evlist__apply_filters(evsel_list)) {
 		error("failed to set filter with %d (%s)\n", errno,
-			strerror(errno));
+			strerror_r(errno, msg, sizeof(msg)));
 		return -1;
 	}
 
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 2f1a522..48eea6c 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -1607,6 +1607,8 @@
 	if (session == NULL)
 		return -ENOMEM;
 
+	symbol__init(&session->header.env);
+
 	(void)perf_header__process_sections(&session->header,
 					    perf_data_file__fd(session->file),
 					    tchart,
@@ -1920,7 +1922,7 @@
 			.fork		 = process_fork_event,
 			.exit		 = process_exit_event,
 			.sample		 = process_sample_event,
-			.ordered_samples = true,
+			.ordered_events	 = true,
 		},
 		.proc_num = 15,
 		.min_time = 1000000,
@@ -1982,8 +1984,6 @@
 		return -1;
 	}
 
-	symbol__init();
-
 	if (argc && !strncmp(argv[0], "rec", 3)) {
 		argc = parse_options(argc, argv, record_options, record_usage,
 				     PARSE_OPT_STOP_AT_NON_OPTION);
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 377971d..9848e27 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -276,11 +276,17 @@
 		return;
 	}
 
+	if (top->zero) {
+		hists__delete_entries(&top->sym_evsel->hists);
+	} else {
+		hists__decay_entries(&top->sym_evsel->hists,
+				     top->hide_user_symbols,
+				     top->hide_kernel_symbols);
+	}
+
 	hists__collapse_resort(&top->sym_evsel->hists, NULL);
 	hists__output_resort(&top->sym_evsel->hists);
-	hists__decay_entries(&top->sym_evsel->hists,
-			     top->hide_user_symbols,
-			     top->hide_kernel_symbols);
+
 	hists__output_recalc_col_len(&top->sym_evsel->hists,
 				     top->print_entries - printed);
 	putchar('\n');
@@ -427,18 +433,13 @@
 
 	if (!perf_top__key_mapped(top, c)) {
 		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
-		struct termios tc, save;
+		struct termios save;
 
 		perf_top__print_mapped_keys(top);
 		fprintf(stdout, "\nEnter selection, or unmapped key to continue: ");
 		fflush(stdout);
 
-		tcgetattr(0, &save);
-		tc = save;
-		tc.c_lflag &= ~(ICANON | ECHO);
-		tc.c_cc[VMIN] = 0;
-		tc.c_cc[VTIME] = 0;
-		tcsetattr(0, TCSANOW, &tc);
+		set_term_quiet_input(&save);
 
 		poll(&stdin_poll, 1, -1);
 		c = getc(stdin);
@@ -542,11 +543,16 @@
 	if (t->evlist->selected != NULL)
 		t->sym_evsel = t->evlist->selected;
 
+	if (t->zero) {
+		hists__delete_entries(&t->sym_evsel->hists);
+	} else {
+		hists__decay_entries(&t->sym_evsel->hists,
+				     t->hide_user_symbols,
+				     t->hide_kernel_symbols);
+	}
+
 	hists__collapse_resort(&t->sym_evsel->hists, NULL);
 	hists__output_resort(&t->sym_evsel->hists);
-	hists__decay_entries(&t->sym_evsel->hists,
-			     t->hide_user_symbols,
-			     t->hide_kernel_symbols);
 }
 
 static void *display_thread_tui(void *arg)
@@ -577,23 +583,32 @@
 	return NULL;
 }
 
+static void display_sig(int sig __maybe_unused)
+{
+	done = 1;
+}
+
+static void display_setup_sig(void)
+{
+	signal(SIGSEGV, display_sig);
+	signal(SIGFPE,  display_sig);
+	signal(SIGINT,  display_sig);
+	signal(SIGQUIT, display_sig);
+	signal(SIGTERM, display_sig);
+}
+
 static void *display_thread(void *arg)
 {
 	struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
-	struct termios tc, save;
+	struct termios save;
 	struct perf_top *top = arg;
 	int delay_msecs, c;
 
-	tcgetattr(0, &save);
-	tc = save;
-	tc.c_lflag &= ~(ICANON | ECHO);
-	tc.c_cc[VMIN] = 0;
-	tc.c_cc[VTIME] = 0;
-
+	display_setup_sig();
 	pthread__unblock_sigwinch();
 repeat:
 	delay_msecs = top->delay_secs * 1000;
-	tcsetattr(0, TCSANOW, &tc);
+	set_term_quiet_input(&save);
 	/* trash return*/
 	getc(stdin);
 
@@ -620,13 +635,16 @@
 		}
 	}
 
+	tcsetattr(0, TCSAFLUSH, &save);
 	return NULL;
 }
 
-static int symbol_filter(struct map *map __maybe_unused, struct symbol *sym)
+static int symbol_filter(struct map *map, struct symbol *sym)
 {
 	const char *name = sym->name;
 
+	if (!map->dso->kernel)
+		return 0;
 	/*
 	 * ppc64 uses function descriptors and appends a '.' to the
 	 * start of every instruction address. Remove it.
@@ -876,7 +894,7 @@
 
 	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
 		ui__error("Failed to mmap with %d (%s)\n",
-			    errno, strerror(errno));
+			    errno, strerror_r(errno, msg, sizeof(msg)));
 		goto out_err;
 	}
 
@@ -963,7 +981,7 @@
 		param.sched_priority = top->realtime_prio;
 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
 			ui__error("Could not set realtime priority.\n");
-			goto out_delete;
+			goto out_join;
 		}
 	}
 
@@ -977,6 +995,8 @@
 	}
 
 	ret = 0;
+out_join:
+	pthread_join(thread, NULL);
 out_delete:
 	perf_session__delete(top->session);
 	top->session = NULL;
@@ -1131,6 +1151,9 @@
 		     "Don't show entries under that percent", parse_percent_limit),
 	OPT_CALLBACK(0, "percentage", NULL, "relative|absolute",
 		     "How to display percentage of filtered entries", parse_filter_percentage),
+	OPT_STRING('w', "column-widths", &symbol_conf.col_width_list_str,
+		   "width[,width...]",
+		   "don't try to adjust column width, use these fixed values"),
 	OPT_END()
 	};
 	const char * const top_usage[] = {
@@ -1217,7 +1240,7 @@
 	symbol_conf.priv_size = sizeof(struct annotation);
 
 	symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
-	if (symbol__init() < 0)
+	if (symbol__init(NULL) < 0)
 		return -1;
 
 	sort__setup_elide(stdout);
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index a6c3752..a9e96ff 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -402,6 +402,31 @@
 
 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
 
+static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
+						  struct syscall_arg *arg)
+{
+	int printed = 0, flags = arg->val;
+
+#define P_MREMAP_FLAG(n) \
+	if (flags & MREMAP_##n) { \
+		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+		flags &= ~MREMAP_##n; \
+	}
+
+	P_MREMAP_FLAG(MAYMOVE);
+#ifdef MREMAP_FIXED
+	P_MREMAP_FLAG(FIXED);
+#endif
+#undef P_MREMAP_FLAG
+
+	if (flags)
+		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+	return printed;
+}
+
+#define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
+
 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
 						      struct syscall_arg *arg)
 {
@@ -1004,6 +1029,7 @@
 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
 	{ .name	    = "mremap",	    .hexret = true,
 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
+			     [3] = SCA_MREMAP_FLAGS, /* flags */
 			     [4] = SCA_HEX, /* new_addr */ }, },
 	{ .name	    = "munlock",    .errmsg = true,
 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
@@ -1385,7 +1411,7 @@
 
 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
 {
-	int err = symbol__init();
+	int err = symbol__init(NULL);
 
 	if (err)
 		return err;
@@ -1724,7 +1750,7 @@
 signed_print:
 		fprintf(trace->output, ") = %d", ret);
 	} else if (ret < 0 && sc->fmt->errmsg) {
-		char bf[256];
+		char bf[STRERR_BUFSIZE];
 		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
 			   *e = audit_errno_to_name(-ret);
 
@@ -2018,6 +2044,7 @@
 	int err = -1, i;
 	unsigned long before;
 	const bool forks = argc > 0;
+	char sbuf[STRERR_BUFSIZE];
 
 	trace->live = true;
 
@@ -2079,7 +2106,8 @@
 
 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
 	if (err < 0) {
-		fprintf(trace->output, "Couldn't mmap the events: %s\n", strerror(errno));
+		fprintf(trace->output, "Couldn't mmap the events: %s\n",
+			strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
 
@@ -2209,19 +2237,19 @@
 	trace->tool.tracing_data = perf_event__process_tracing_data;
 	trace->tool.build_id	  = perf_event__process_build_id;
 
-	trace->tool.ordered_samples = true;
+	trace->tool.ordered_events = true;
 	trace->tool.ordering_requires_timestamps = true;
 
 	/* add tid to output */
 	trace->multiple_threads = true;
 
-	if (symbol__init() < 0)
-		return -1;
-
 	session = perf_session__new(&file, false, &trace->tool);
 	if (session == NULL)
 		return -ENOMEM;
 
+	if (symbol__init(&session->header.env) < 0)
+		goto out;
+
 	trace->host = &session->machines.host;
 
 	err = perf_session__set_tracepoints_handlers(session, handlers);
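The new mremap flag decoder uses the same pattern as the existing mmap one: strip each known bit while appending its name, then dump any leftover bits in hex. A standalone sketch of the technique, with made-up flag values standing in for the MREMAP_* constants:

	#include <stdio.h>

	#define FLAG_MAYMOVE 1	/* stands in for MREMAP_MAYMOVE */
	#define FLAG_FIXED   2	/* stands in for MREMAP_FIXED */

	static int print_flags(char *bf, size_t size, int flags)
	{
		int printed = 0;

	#define P_FLAG(n) \
		if (flags & FLAG_##n) { \
			printed += snprintf(bf + printed, size - printed, \
					    "%s%s", printed ? "|" : "", #n); \
			flags &= ~FLAG_##n; \
		}
		P_FLAG(MAYMOVE);
		P_FLAG(FIXED);
	#undef P_FLAG
		/* Unknown leftover bits go out in hex, as in the patch. */
		if (flags)
			printed += snprintf(bf + printed, size - printed,
					    "%s%#x", printed ? "|" : "", flags);
		return printed;
	}

	int main(void)
	{
		char bf[64];

		print_flags(bf, sizeof(bf), FLAG_MAYMOVE | FLAG_FIXED | 8);
		puts(bf);	/* prints: MAYMOVE|FIXED|0x8 */
		return 0;
	}
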
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index 1f67aa0..75d4c23 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -120,6 +120,29 @@
   CFLAGS             += -DPARSER_DEBUG
 endif
 
+ifndef NO_LIBPYTHON
+  # Try different combinations to accommodate systems that may have
+  # only some of python[2][-config] installed, while always preferring
+  # python2 and python2-config as per pep-0394. If we catch a
+  # python[-config] in version 3, the version check will kill it.
+  PYTHON2 := $(if $(call get-executable,python2),python2,python)
+  override PYTHON := $(call get-executable-or-default,PYTHON,$(PYTHON2))
+  PYTHON2_CONFIG := \
+    $(if $(call get-executable,$(PYTHON)-config),$(PYTHON)-config,python-config)
+  override PYTHON_CONFIG := \
+    $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON2_CONFIG))
+
+  PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
+
+  PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
+  PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+
+  FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
+  FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
+  FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS)
+  FEATURE_CHECK_LDFLAGS-libpython-version := $(PYTHON_EMBED_LDOPTS)
+endif
+
 CFLAGS += -fno-omit-frame-pointer
 CFLAGS += -ggdb3
 CFLAGS += -funwind-tables
@@ -482,21 +505,14 @@
   NO_LIBPYTHON := 1
 endef
 
-override PYTHON := \
-  $(call get-executable-or-default,PYTHON,python)
-
-ifndef PYTHON
-  $(call disable-python,python interpreter)
+ifdef NO_LIBPYTHON
+  $(call disable-python)
 else
 
-  PYTHON_WORD := $(call shell-wordify,$(PYTHON))
-
-  ifdef NO_LIBPYTHON
-    $(call disable-python)
+  ifndef PYTHON
+    $(call disable-python,python interpreter)
   else
-
-    override PYTHON_CONFIG := \
-      $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON)-config)
+    PYTHON_WORD := $(call shell-wordify,$(PYTHON))
 
     ifndef PYTHON_CONFIG
       $(call disable-python,python-config tool)
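Because both PYTHON and PYTHON_CONFIG are resolved through get-executable-or-default, the PEP 394 probing above is only a fallback; either binary can still be pinned explicitly on the make command line, for example:

	make -C tools/perf PYTHON=/usr/bin/python2 PYTHON_CONFIG=/usr/bin/python2-config
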
diff --git a/tools/perf/config/feature-checks/Makefile b/tools/perf/config/feature-checks/Makefile
index 6088f8d..72ab298 100644
--- a/tools/perf/config/feature-checks/Makefile
+++ b/tools/perf/config/feature-checks/Makefile
@@ -101,25 +101,11 @@
 test-libperl.bin:
 	$(BUILD) $(FLAGS_PERL_EMBED)
 
-override PYTHON := python
-override PYTHON_CONFIG := python-config
-
-escape-for-shell-sq =  $(subst ','\'',$(1))
-shell-sq = '$(escape-for-shell-sq)'
-
-PYTHON_CONFIG_SQ = $(call shell-sq,$(PYTHON_CONFIG))
-
-PYTHON_EMBED_LDOPTS = $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
-PYTHON_EMBED_LDFLAGS = $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
-PYTHON_EMBED_LIBADD = $(call grep-libs,$(PYTHON_EMBED_LDOPTS))
-PYTHON_EMBED_CCOPTS = $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
-FLAGS_PYTHON_EMBED = $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
-
 test-libpython.bin:
-	$(BUILD) $(FLAGS_PYTHON_EMBED)
+	$(BUILD)
 
 test-libpython-version.bin:
-	$(BUILD) $(FLAGS_PYTHON_EMBED)
+	$(BUILD)
 
 test-libbfd.bin:
 	$(BUILD) -DPACKAGE='"perf"' -lbfd -lz -liberty -ldl
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 2282d41..452a847 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -313,6 +313,7 @@
 	int status;
 	struct stat st;
 	const char *prefix;
+	char sbuf[STRERR_BUFSIZE];
 
 	prefix = NULL;
 	if (p->option & RUN_SETUP)
@@ -343,7 +344,8 @@
 	status = 1;
 	/* Check for ENOSPC and EIO errors.. */
 	if (fflush(stdout)) {
-		fprintf(stderr, "write failure on standard output: %s", strerror(errno));
+		fprintf(stderr, "write failure on standard output: %s",
+			strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out;
 	}
 	if (ferror(stdout)) {
@@ -351,7 +353,8 @@
 		goto out;
 	}
 	if (fclose(stdout)) {
-		fprintf(stderr, "close failed on standard output: %s", strerror(errno));
+		fprintf(stderr, "close failed on standard output: %s",
+			strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out;
 	}
 	status = 0;
@@ -466,6 +469,7 @@
 int main(int argc, const char **argv)
 {
 	const char *cmd;
+	char sbuf[STRERR_BUFSIZE];
 
 	/* The page_size is placed in util object. */
 	page_size = sysconf(_SC_PAGE_SIZE);
@@ -561,7 +565,7 @@
 	}
 
 	fprintf(stderr, "Failed to run command '%s': %s\n",
-		cmd, strerror(errno));
+		cmd, strerror_r(errno, sbuf, sizeof(sbuf)));
 out:
 	return 1;
 }
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 6f8b01b..6a4145e 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -154,6 +154,10 @@
 		.func = test__hists_cumulate,
 	},
 	{
+		.desc = "Test tracking with sched_switch",
+		.func = test__switch_tracking,
+	},
+	{
 		.func = NULL,
 	},
 };
@@ -185,9 +189,11 @@
 static int run_test(struct test *test)
 {
 	int status, err = -1, child = fork();
+	char sbuf[STRERR_BUFSIZE];
 
 	if (child < 0) {
-		pr_err("failed to fork test: %s\n", strerror(errno));
+		pr_err("failed to fork test: %s\n",
+			strerror_r(errno, sbuf, sizeof(sbuf)));
 		return -1;
 	}
 
@@ -297,7 +303,7 @@
 	symbol_conf.sort_by_name = true;
 	symbol_conf.try_vmlinux_path = true;
 
-	if (symbol__init() < 0)
+	if (symbol__init(NULL) < 0)
 		return -1;
 
 	if (skip != NULL)
diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c
index 1422634..9b9622a 100644
--- a/tools/perf/tests/mmap-basic.c
+++ b/tools/perf/tests/mmap-basic.c
@@ -31,6 +31,7 @@
 	unsigned int nr_events[nsyscalls],
 		     expected_nr_events[nsyscalls], i, j;
 	struct perf_evsel *evsels[nsyscalls], *evsel;
+	char sbuf[STRERR_BUFSIZE];
 
 	threads = thread_map__new(-1, getpid(), UINT_MAX);
 	if (threads == NULL) {
@@ -49,7 +50,7 @@
 	sched_setaffinity(0, sizeof(cpu_set), &cpu_set);
 	if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
 		pr_debug("sched_setaffinity() failed on CPU %d: %s ",
-			 cpus->map[0], strerror(errno));
+			 cpus->map[0], strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_free_cpus;
 	}
 
@@ -79,7 +80,7 @@
 		if (perf_evsel__open(evsels[i], cpus, threads) < 0) {
 			pr_debug("failed to open counter: %s, "
 				 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
-				 strerror(errno));
+				 strerror_r(errno, sbuf, sizeof(sbuf)));
 			goto out_delete_evlist;
 		}
 
@@ -89,7 +90,7 @@
 
 	if (perf_evlist__mmap(evlist, 128, true) < 0) {
 		pr_debug("failed to mmap events: %d (%s)\n", errno,
-			 strerror(errno));
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
 
diff --git a/tools/perf/tests/open-syscall-all-cpus.c b/tools/perf/tests/open-syscall-all-cpus.c
index 5fecdbd..8fa82d1 100644
--- a/tools/perf/tests/open-syscall-all-cpus.c
+++ b/tools/perf/tests/open-syscall-all-cpus.c
@@ -12,6 +12,7 @@
 	unsigned int nr_open_calls = 111, i;
 	cpu_set_t cpu_set;
 	struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX);
+	char sbuf[STRERR_BUFSIZE];
 
 	if (threads == NULL) {
 		pr_debug("thread_map__new\n");
@@ -35,7 +36,7 @@
 	if (perf_evsel__open(evsel, cpus, threads) < 0) {
 		pr_debug("failed to open counter: %s, "
 			 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
-			 strerror(errno));
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_evsel_delete;
 	}
 
@@ -56,7 +57,7 @@
 		if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
 			pr_debug("sched_setaffinity() failed on CPU %d: %s ",
 				 cpus->map[cpu],
-				 strerror(errno));
+				 strerror_r(errno, sbuf, sizeof(sbuf)));
 			goto out_close_fd;
 		}
 		for (i = 0; i < ncalls; ++i) {
diff --git a/tools/perf/tests/open-syscall-tp-fields.c b/tools/perf/tests/open-syscall-tp-fields.c
index 0785b64..922bdb6 100644
--- a/tools/perf/tests/open-syscall-tp-fields.c
+++ b/tools/perf/tests/open-syscall-tp-fields.c
@@ -22,6 +22,7 @@
 	struct perf_evlist *evlist = perf_evlist__new();
 	struct perf_evsel *evsel;
 	int err = -1, i, nr_events = 0, nr_polls = 0;
+	char sbuf[STRERR_BUFSIZE];
 
 	if (evlist == NULL) {
 		pr_debug("%s: perf_evlist__new\n", __func__);
@@ -48,13 +49,15 @@
 
 	err = perf_evlist__open(evlist);
 	if (err < 0) {
-		pr_debug("perf_evlist__open: %s\n", strerror(errno));
+		pr_debug("perf_evlist__open: %s\n",
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
 
 	err = perf_evlist__mmap(evlist, UINT_MAX, false);
 	if (err < 0) {
-		pr_debug("perf_evlist__mmap: %s\n", strerror(errno));
+		pr_debug("perf_evlist__mmap: %s\n",
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
 
diff --git a/tools/perf/tests/open-syscall.c b/tools/perf/tests/open-syscall.c
index c1dc7d2..a33b2da 100644
--- a/tools/perf/tests/open-syscall.c
+++ b/tools/perf/tests/open-syscall.c
@@ -9,6 +9,7 @@
 	struct perf_evsel *evsel;
 	unsigned int nr_open_calls = 111, i;
 	struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX);
+	char sbuf[STRERR_BUFSIZE];
 
 	if (threads == NULL) {
 		pr_debug("thread_map__new\n");
@@ -24,7 +25,7 @@
 	if (perf_evsel__open_per_thread(evsel, threads) < 0) {
 		pr_debug("failed to open counter: %s, "
 			 "tweak /proc/sys/kernel/perf_event_paranoid?\n",
-			 strerror(errno));
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_evsel_delete;
 	}
 
diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c
index aca1a83..2ce753c 100644
--- a/tools/perf/tests/perf-record.c
+++ b/tools/perf/tests/perf-record.c
@@ -59,6 +59,7 @@
 	int err = -1, errs = 0, i, wakeups = 0;
 	u32 cpu;
 	int total_events = 0, nr_events[PERF_RECORD_MAX] = { 0, };
+	char sbuf[STRERR_BUFSIZE];
 
 	if (evlist == NULL || argv == NULL) {
 		pr_debug("Not enough memory to create evlist\n");
@@ -100,7 +101,8 @@
 
 	err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask);
 	if (err < 0) {
-		pr_debug("sched__get_first_possible_cpu: %s\n", strerror(errno));
+		pr_debug("sched__get_first_possible_cpu: %s\n",
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
 
@@ -110,7 +112,8 @@
 	 * So that we can check perf_sample.cpu on all the samples.
 	 */
 	if (sched_setaffinity(evlist->workload.pid, cpu_mask_size, &cpu_mask) < 0) {
-		pr_debug("sched_setaffinity: %s\n", strerror(errno));
+		pr_debug("sched_setaffinity: %s\n",
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
 
@@ -120,7 +123,8 @@
 	 */
 	err = perf_evlist__open(evlist);
 	if (err < 0) {
-		pr_debug("perf_evlist__open: %s\n", strerror(errno));
+		pr_debug("perf_evlist__open: %s\n",
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
 
@@ -131,7 +135,8 @@
 	 */
 	err = perf_evlist__mmap(evlist, opts.mmap_pages, false);
 	if (err < 0) {
-		pr_debug("perf_evlist__mmap: %s\n", strerror(errno));
+		pr_debug("perf_evlist__mmap: %s\n",
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
 
diff --git a/tools/perf/tests/rdpmc.c b/tools/perf/tests/rdpmc.c
index c04d1f2..d31f2c4 100644
--- a/tools/perf/tests/rdpmc.c
+++ b/tools/perf/tests/rdpmc.c
@@ -100,6 +100,7 @@
 	};
 	u64 delta_sum = 0;
         struct sigaction sa;
+	char sbuf[STRERR_BUFSIZE];
 
 	sigfillset(&sa.sa_mask);
 	sa.sa_sigaction = segfault_handler;
@@ -109,14 +110,15 @@
 				 perf_event_open_cloexec_flag());
 	if (fd < 0) {
 		pr_err("Error: sys_perf_event_open() syscall returned "
-		       "with %d (%s)\n", fd, strerror(errno));
+		       "with %d (%s)\n", fd,
+		       strerror_r(errno, sbuf, sizeof(sbuf)));
 		return -1;
 	}
 
 	addr = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
 	if (addr == (void *)(-1)) {
 		pr_err("Error: mmap() syscall returned with (%s)\n",
-		       strerror(errno));
+		       strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_close;
 	}
 
diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c
index 983d6b8..1aa21c9 100644
--- a/tools/perf/tests/sw-clock.c
+++ b/tools/perf/tests/sw-clock.c
@@ -22,6 +22,7 @@
 	volatile int tmp = 0;
 	u64 total_periods = 0;
 	int nr_samples = 0;
+	char sbuf[STRERR_BUFSIZE];
 	union perf_event *event;
 	struct perf_evsel *evsel;
 	struct perf_evlist *evlist;
@@ -62,14 +63,15 @@
 
 		err = -errno;
 		pr_debug("Couldn't open evlist: %s\nHint: check %s, using %" PRIu64 " in this test.\n",
-			 strerror(errno), knob, (u64)attr.sample_freq);
+			 strerror_r(errno, sbuf, sizeof(sbuf)),
+			 knob, (u64)attr.sample_freq);
 		goto out_delete_evlist;
 	}
 
 	err = perf_evlist__mmap(evlist, 128, true);
 	if (err < 0) {
 		pr_debug("failed to mmap event: %d (%s)\n", errno,
-			 strerror(errno));
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
 
diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c
new file mode 100644
index 0000000..cc68648
--- /dev/null
+++ b/tools/perf/tests/switch-tracking.c
@@ -0,0 +1,572 @@
+#include <sys/time.h>
+#include <sys/prctl.h>
+#include <time.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "parse-events.h"
+#include "evlist.h"
+#include "evsel.h"
+#include "thread_map.h"
+#include "cpumap.h"
+#include "tests.h"
+
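+/*
+ * Busy-wait for ~50ms and then sleep for 50ms, so the workload both
+ * consumes cycles and forces at least one context switch.
+ */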
+static int spin_sleep(void)
+{
+	struct timeval start, now, diff, maxtime;
+	struct timespec ts;
+	int err, i;
+
+	maxtime.tv_sec = 0;
+	maxtime.tv_usec = 50000;
+
+	err = gettimeofday(&start, NULL);
+	if (err)
+		return err;
+
+	/* Spin for 50ms */
+	while (1) {
+		for (i = 0; i < 1000; i++)
+			barrier();
+
+		err = gettimeofday(&now, NULL);
+		if (err)
+			return err;
+
+		timersub(&now, &start, &diff);
+		if (timercmp(&diff, &maxtime, > /* For checkpatch */))
+			break;
+	}
+
+	ts.tv_nsec = 50 * 1000 * 1000;
+	ts.tv_sec = 0;
+
+	/* Sleep for 50ms */
+	err = nanosleep(&ts, NULL);
+	/* nanosleep() reports EINTR via errno, not its return value */
+	if (err == -1 && errno == EINTR)
+		err = 0;
+
+	return err;
+}
+
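+/*
+ * State accumulated while replaying the recorded events: the last tid
+ * seen per cpu, which of the four comm events have been seen, and
+ * whether cycles samples appeared in each phase of the test.
+ */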
+struct switch_tracking {
+	struct perf_evsel *switch_evsel;
+	struct perf_evsel *cycles_evsel;
+	pid_t *tids;
+	int nr_tids;
+	int comm_seen[4];
+	int cycles_before_comm_1;
+	int cycles_between_comm_2_and_comm_3;
+	int cycles_after_comm_4;
+};
+
+static int check_comm(struct switch_tracking *switch_tracking,
+		      union perf_event *event, const char *comm, int nr)
+{
+	if (event->header.type == PERF_RECORD_COMM &&
+	    (pid_t)event->comm.pid == getpid() &&
+	    (pid_t)event->comm.tid == getpid() &&
+	    strcmp(event->comm.comm, comm) == 0) {
+		if (switch_tracking->comm_seen[nr]) {
+			pr_debug("Duplicate comm event\n");
+			return -1;
+		}
+		switch_tracking->comm_seen[nr] = 1;
+		pr_debug3("comm event: %s nr: %d\n", event->comm.comm, nr);
+		return 1;
+	}
+	return 0;
+}
+
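+/*
+ * Grow the per-cpu array of last-seen tids on demand; entries start at
+ * -1, meaning no sched_switch has been seen on that cpu yet.
+ */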
+static int check_cpu(struct switch_tracking *switch_tracking, int cpu)
+{
+	int i, nr = cpu + 1;
+
+	if (cpu < 0)
+		return -1;
+
+	if (!switch_tracking->tids) {
+		switch_tracking->tids = calloc(nr, sizeof(pid_t));
+		if (!switch_tracking->tids)
+			return -1;
+		for (i = 0; i < nr; i++)
+			switch_tracking->tids[i] = -1;
+		switch_tracking->nr_tids = nr;
+		return 0;
+	}
+
+	if (cpu >= switch_tracking->nr_tids) {
+		void *addr;
+
+		addr = realloc(switch_tracking->tids, nr * sizeof(pid_t));
+		if (!addr)
+			return -1;
+		switch_tracking->tids = addr;
+		for (i = switch_tracking->nr_tids; i < nr; i++)
+			switch_tracking->tids[i] = -1;
+		switch_tracking->nr_tids = nr;
+		return 0;
+	}
+
+	return 0;
+}
+
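+/*
+ * For sched_switch samples, verify that consecutive switches on a cpu
+ * chain together (prev_tid matches the last next_tid); for cycles
+ * samples, record in which phase of the test they occurred.
+ */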
+static int process_sample_event(struct perf_evlist *evlist,
+				union perf_event *event,
+				struct switch_tracking *switch_tracking)
+{
+	struct perf_sample sample;
+	struct perf_evsel *evsel;
+	pid_t next_tid, prev_tid;
+	int cpu, err;
+
+	if (perf_evlist__parse_sample(evlist, event, &sample)) {
+		pr_debug("perf_evlist__parse_sample failed\n");
+		return -1;
+	}
+
+	evsel = perf_evlist__id2evsel(evlist, sample.id);
+	if (evsel == switch_tracking->switch_evsel) {
+		next_tid = perf_evsel__intval(evsel, &sample, "next_pid");
+		prev_tid = perf_evsel__intval(evsel, &sample, "prev_pid");
+		cpu = sample.cpu;
+		pr_debug3("sched_switch: cpu: %d prev_tid %d next_tid %d\n",
+			  cpu, prev_tid, next_tid);
+		err = check_cpu(switch_tracking, cpu);
+		if (err)
+			return err;
+		/*
+		 * Check for no missing sched_switch events i.e. that the
+		 * evsel->system_wide flag has worked.
+		 */
+		if (switch_tracking->tids[cpu] != -1 &&
+		    switch_tracking->tids[cpu] != prev_tid) {
+			pr_debug("Missing sched_switch events\n");
+			return -1;
+		}
+		switch_tracking->tids[cpu] = next_tid;
+	}
+
+	if (evsel == switch_tracking->cycles_evsel) {
+		pr_debug3("cycles event\n");
+		if (!switch_tracking->comm_seen[0])
+			switch_tracking->cycles_before_comm_1 = 1;
+		if (switch_tracking->comm_seen[1] &&
+		    !switch_tracking->comm_seen[2])
+			switch_tracking->cycles_between_comm_2_and_comm_3 = 1;
+		if (switch_tracking->comm_seen[3])
+			switch_tracking->cycles_after_comm_4 = 1;
+	}
+
+	return 0;
+}
+
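+/* Dispatch one event: samples go to process_sample_event(), comm events are matched against the four test names */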
+static int process_event(struct perf_evlist *evlist, union perf_event *event,
+			 struct switch_tracking *switch_tracking)
+{
+	if (event->header.type == PERF_RECORD_SAMPLE)
+		return process_sample_event(evlist, event, switch_tracking);
+
+	if (event->header.type == PERF_RECORD_COMM) {
+		int err, done = 0;
+
+		err = check_comm(switch_tracking, event, "Test COMM 1", 0);
+		if (err < 0)
+			return -1;
+		done += err;
+		err = check_comm(switch_tracking, event, "Test COMM 2", 1);
+		if (err < 0)
+			return -1;
+		done += err;
+		err = check_comm(switch_tracking, event, "Test COMM 3", 2);
+		if (err < 0)
+			return -1;
+		done += err;
+		err = check_comm(switch_tracking, event, "Test COMM 4", 3);
+		if (err < 0)
+			return -1;
+		done += err;
+		if (done != 1) {
+			pr_debug("Unexpected comm event\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
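+/* Events are collected off the mmap rings and sorted by time before replay */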
+struct event_node {
+	struct list_head list;
+	union perf_event *event;
+	u64 event_time;
+};
+
+static int add_event(struct perf_evlist *evlist, struct list_head *events,
+		     union perf_event *event)
+{
+	struct perf_sample sample;
+	struct event_node *node;
+
+	node = malloc(sizeof(struct event_node));
+	if (!node) {
+		pr_debug("malloc failed\n");
+		return -1;
+	}
+	node->event = event;
+	list_add(&node->list, events);
+
+	if (perf_evlist__parse_sample(evlist, event, &sample)) {
+		pr_debug("perf_evlist__parse_sample failed\n");
+		return -1;
+	}
+
+	if (!sample.time) {
+		pr_debug("event with no time\n");
+		return -1;
+	}
+
+	node->event_time = sample.time;
+
+	return 0;
+}
+
+static void free_event_nodes(struct list_head *events)
+{
+	struct event_node *node;
+
+	while (!list_empty(events)) {
+		node = list_entry(events->next, struct event_node, list);
+		list_del(&node->list);
+		free(node);
+	}
+}
+
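+/* qsort() comparator: order events by their sample time */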
+static int compar(const void *a, const void *b)
+{
+	const struct event_node *nodea = a;
+	const struct event_node *nodeb = b;
+	s64 cmp = nodea->event_time - nodeb->event_time;
+
+	/* Clamp to int so a 64-bit difference is not truncated */
+	return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+static int process_events(struct perf_evlist *evlist,
+			  struct switch_tracking *switch_tracking)
+{
+	union perf_event *event;
+	unsigned pos, cnt = 0;
+	LIST_HEAD(events);
+	struct event_node *events_array, *node;
+	int i, ret;
+
+	for (i = 0; i < evlist->nr_mmaps; i++) {
+		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
+			cnt += 1;
+			ret = add_event(evlist, &events, event);
+			perf_evlist__mmap_consume(evlist, i);
+			if (ret < 0)
+				goto out_free_nodes;
+		}
+	}
+
+	events_array = calloc(cnt, sizeof(struct event_node));
+	if (!events_array) {
+		pr_debug("calloc failed\n");
+		ret = -1;
+		goto out_free_nodes;
+	}
+
+	pos = 0;
+	list_for_each_entry(node, &events, list)
+		events_array[pos++] = *node;
+
+	qsort(events_array, cnt, sizeof(struct event_node), compar);
+
+	for (pos = 0; pos < cnt; pos++) {
+		ret = process_event(evlist, events_array[pos].event,
+				    switch_tracking);
+		if (ret < 0)
+			goto out_free;
+	}
+
+	ret = 0;
+out_free:
+	pr_debug("%u events recorded\n", cnt);
+	free(events_array);
+out_free_nodes:
+	free_event_nodes(&events);
+	return ret;
+}
+
+/**
+ * test__switch_tracking - test using sched_switch and tracking events.
+ *
+ * This function implements a test that checks that sched_switch events and
+ * tracking events can be recorded for a workload (current process) using the
+ * evsel->system_wide and evsel->tracking flags (respectively) with other events
+ * sometimes enabled or disabled.
+ */
+int test__switch_tracking(void)
+{
+	const char *sched_switch = "sched:sched_switch";
+	struct switch_tracking switch_tracking = { .tids = NULL, };
+	struct record_opts opts = {
+		.mmap_pages	     = UINT_MAX,
+		.user_freq	     = UINT_MAX,
+		.user_interval	     = ULLONG_MAX,
+		.freq		     = 4000,
+		.target		     = {
+			.uses_mmap   = true,
+		},
+	};
+	struct thread_map *threads = NULL;
+	struct cpu_map *cpus = NULL;
+	struct perf_evlist *evlist = NULL;
+	struct perf_evsel *evsel, *cpu_clocks_evsel, *cycles_evsel;
+	struct perf_evsel *switch_evsel, *tracking_evsel;
+	const char *comm;
+	int err = -1;
+
+	threads = thread_map__new(-1, getpid(), UINT_MAX);
+	if (!threads) {
+		pr_debug("thread_map__new failed!\n");
+		goto out_err;
+	}
+
+	cpus = cpu_map__new(NULL);
+	if (!cpus) {
+		pr_debug("cpu_map__new failed!\n");
+		goto out_err;
+	}
+
+	evlist = perf_evlist__new();
+	if (!evlist) {
+		pr_debug("perf_evlist__new failed!\n");
+		goto out_err;
+	}
+
+	perf_evlist__set_maps(evlist, cpus, threads);
+
+	/* First event */
+	err = parse_events(evlist, "cpu-clock:u");
+	if (err) {
+		pr_debug("Failed to parse event cpu-clock:u\n");
+		goto out_err;
+	}
+
+	cpu_clocks_evsel = perf_evlist__last(evlist);
+
+	/* Second event */
+	err = parse_events(evlist, "cycles:u");
+	if (err) {
+		pr_debug("Failed to parse event cycles:u\n");
+		goto out_err;
+	}
+
+	cycles_evsel = perf_evlist__last(evlist);
+
+	/* Third event */
+	if (!perf_evlist__can_select_event(evlist, sched_switch)) {
+		fprintf(stderr, " (no sched_switch)");
+		err = 0;
+		goto out;
+	}
+
+	err = parse_events(evlist, sched_switch);
+	if (err) {
+		pr_debug("Failed to parse event %s\n", sched_switch);
+		goto out_err;
+	}
+
+	switch_evsel = perf_evlist__last(evlist);
+
+	perf_evsel__set_sample_bit(switch_evsel, CPU);
+	perf_evsel__set_sample_bit(switch_evsel, TIME);
+
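+	/*
+	 * Trace sched_switch system-wide so no switches are missed, and
+	 * enable it immediately, before the workload starts.
+	 */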
+	switch_evsel->system_wide = true;
+	switch_evsel->no_aux_samples = true;
+	switch_evsel->immediate = true;
+
+	/* Test moving an event to the front */
+	if (cycles_evsel == perf_evlist__first(evlist)) {
+		pr_debug("cycles event already at front\n");
+		goto out_err;
+	}
+	perf_evlist__to_front(evlist, cycles_evsel);
+	if (cycles_evsel != perf_evlist__first(evlist)) {
+		pr_debug("Failed to move cycles event to front\n");
+		goto out_err;
+	}
+
+	perf_evsel__set_sample_bit(cycles_evsel, CPU);
+	perf_evsel__set_sample_bit(cycles_evsel, TIME);
+
+	/* Fourth event */
+	err = parse_events(evlist, "dummy:u");
+	if (err) {
+		pr_debug("Failed to parse event dummy:u\n");
+		goto out_err;
+	}
+
+	tracking_evsel = perf_evlist__last(evlist);
+
+	perf_evlist__set_tracking_event(evlist, tracking_evsel);
+
+	tracking_evsel->attr.freq = 0;
+	tracking_evsel->attr.sample_period = 1;
+
+	perf_evsel__set_sample_bit(tracking_evsel, TIME);
+
+	/* Config events */
+	perf_evlist__config(evlist, &opts);
+
+	/* Check moved event is still at the front */
+	if (cycles_evsel != perf_evlist__first(evlist)) {
+		pr_debug("Front event no longer at front\n");
+		goto out_err;
+	}
+
+	/* Check tracking event is tracking */
+	if (!tracking_evsel->attr.mmap || !tracking_evsel->attr.comm) {
+		pr_debug("Tracking event not tracking\n");
+		goto out_err;
+	}
+
+	/* Check non-tracking events are not tracking */
+	evlist__for_each(evlist, evsel) {
+		if (evsel != tracking_evsel) {
+			if (evsel->attr.mmap || evsel->attr.comm) {
+				pr_debug("Non-tracking event is tracking\n");
+				goto out_err;
+			}
+		}
+	}
+
+	if (perf_evlist__open(evlist) < 0) {
+		fprintf(stderr, " (not supported)");
+		err = 0;
+		goto out;
+	}
+
+	err = perf_evlist__mmap(evlist, UINT_MAX, false);
+	if (err) {
+		pr_debug("perf_evlist__mmap failed!\n");
+		goto out_err;
+	}
+
+	perf_evlist__enable(evlist);
+
+	err = perf_evlist__disable_event(evlist, cpu_clocks_evsel);
+	if (err) {
+		pr_debug("perf_evlist__disable_event failed!\n");
+		goto out_err;
+	}
+
+	err = spin_sleep();
+	if (err) {
+		pr_debug("spin_sleep failed!\n");
+		goto out_err;
+	}
+
+	comm = "Test COMM 1";
+	err = prctl(PR_SET_NAME, (unsigned long)comm, 0, 0, 0);
+	if (err) {
+		pr_debug("PR_SET_NAME failed!\n");
+		goto out_err;
+	}
+
+	err = perf_evlist__disable_event(evlist, cycles_evsel);
+	if (err) {
+		pr_debug("perf_evlist__disable_event failed!\n");
+		goto out_err;
+	}
+
+	comm = "Test COMM 2";
+	err = prctl(PR_SET_NAME, (unsigned long)comm, 0, 0, 0);
+	if (err) {
+		pr_debug("PR_SET_NAME failed!\n");
+		goto out_err;
+	}
+
+	err = spin_sleep();
+	if (err) {
+		pr_debug("spin_sleep failed!\n");
+		goto out_err;
+	}
+
+	comm = "Test COMM 3";
+	err = prctl(PR_SET_NAME, (unsigned long)comm, 0, 0, 0);
+	if (err) {
+		pr_debug("PR_SET_NAME failed!\n");
+		goto out_err;
+	}
+
+	err = perf_evlist__enable_event(evlist, cycles_evsel);
+	if (err) {
+		pr_debug("perf_evlist__enable_event failed!\n");
+		goto out_err;
+	}
+
+	comm = "Test COMM 4";
+	err = prctl(PR_SET_NAME, (unsigned long)comm, 0, 0, 0);
+	if (err) {
+		pr_debug("PR_SET_NAME failed!\n");
+		goto out_err;
+	}
+
+	err = spin_sleep();
+	if (err) {
+		pr_debug("spin_sleep failed!\n");
+		goto out_err;
+	}
+
+	perf_evlist__disable(evlist);
+
+	switch_tracking.switch_evsel = switch_evsel;
+	switch_tracking.cycles_evsel = cycles_evsel;
+
+	err = process_events(evlist, &switch_tracking);
+
+	zfree(&switch_tracking.tids);
+
+	if (err)
+		goto out_err;
+
+	/* Check all 4 comm events were seen i.e. that evsel->tracking works */
+	if (!switch_tracking.comm_seen[0] || !switch_tracking.comm_seen[1] ||
+	    !switch_tracking.comm_seen[2] || !switch_tracking.comm_seen[3]) {
+		pr_debug("Missing comm events\n");
+		goto out_err;
+	}
+
+	/* Check cycles event got enabled */
+	if (!switch_tracking.cycles_before_comm_1) {
+		pr_debug("Missing cycles events\n");
+		goto out_err;
+	}
+
+	/* Check cycles event got disabled */
+	if (switch_tracking.cycles_between_comm_2_and_comm_3) {
+		pr_debug("cycles events seen even though the event was disabled\n");
+		goto out_err;
+	}
+
+	/* Check cycles event got enabled again */
+	if (!switch_tracking.cycles_after_comm_4) {
+		pr_debug("Missing cycles events\n");
+		goto out_err;
+	}
+out:
+	if (evlist) {
+		perf_evlist__disable(evlist);
+		perf_evlist__delete(evlist);
+	} else {
+		cpu_map__delete(cpus);
+		thread_map__delete(threads);
+	}
+
+	return err;
+
+out_err:
+	err = -1;
+	goto out;
+}
diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c
index 5ff3db3..87522f0 100644
--- a/tools/perf/tests/task-exit.c
+++ b/tools/perf/tests/task-exit.c
@@ -42,6 +42,7 @@
 		.uses_mmap	= true,
 	};
 	const char *argv[] = { "true", NULL };
+	char sbuf[STRERR_BUFSIZE];
 
 	signal(SIGCHLD, sig_handler);
 
@@ -82,13 +83,14 @@
 
 	err = perf_evlist__open(evlist);
 	if (err < 0) {
-		pr_debug("Couldn't open the evlist: %s\n", strerror(-err));
+		pr_debug("Couldn't open the evlist: %s\n",
+			 strerror_r(-err, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
 
 	if (perf_evlist__mmap(evlist, 128, true) < 0) {
 		pr_debug("failed to mmap events: %d (%s)\n", errno,
-			 strerror(errno));
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
 
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index ed64790..be8be10 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -48,6 +48,7 @@
 int test__thread_mg_share(void);
 int test__hists_output(void);
 int test__hists_cumulate(void);
+int test__switch_tracking(void);
 
 #if defined(__x86_64__) || defined(__i386__) || defined(__arm__)
 #ifdef HAVE_DWARF_UNWIND_SUPPORT
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index a94b11fc..d4cef68 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -10,6 +10,7 @@
 #include "../../util/pstack.h"
 #include "../../util/sort.h"
 #include "../../util/util.h"
+#include "../../util/top.h"
 #include "../../arch/common.h"
 
 #include "../browser.h"
@@ -228,8 +229,10 @@
 {
 	struct callchain_list *chain;
 
-	list_for_each_entry(chain, &node->val, list)
+	if (!list_empty(&node->val)) {
+		chain = list_entry(node->val.prev, struct callchain_list, list);
 		chain->ms.has_children = !RB_EMPTY_ROOT(&node->rb_root);
+	}
 
 	callchain_node__init_have_children_rb_tree(node);
 }
@@ -474,26 +477,87 @@
 	return bf;
 }
 
+struct callchain_print_arg {
+	/* for hists browser */
+	off_t	row_offset;
+	bool	is_current_entry;
+
+	/* for file dump */
+	FILE	*fp;
+	int	printed;
+};
+
+typedef void (*print_callchain_entry_fn)(struct hist_browser *browser,
+					 struct callchain_list *chain,
+					 const char *str, int offset,
+					 unsigned short row,
+					 struct callchain_print_arg *arg);
+
+static void hist_browser__show_callchain_entry(struct hist_browser *browser,
+					       struct callchain_list *chain,
+					       const char *str, int offset,
+					       unsigned short row,
+					       struct callchain_print_arg *arg)
+{
+	int color, width;
+	char folded_sign = callchain_list__folded(chain);
+
+	color = HE_COLORSET_NORMAL;
+	width = browser->b.width - (offset + 2);
+	if (ui_browser__is_current_entry(&browser->b, row)) {
+		browser->selection = &chain->ms;
+		color = HE_COLORSET_SELECTED;
+		arg->is_current_entry = true;
+	}
+
+	ui_browser__set_color(&browser->b, color);
+	hist_browser__gotorc(browser, row, 0);
+	slsmg_write_nstring(" ", offset);
+	slsmg_printf("%c ", folded_sign);
+	slsmg_write_nstring(str, width);
+}
+
+static void hist_browser__fprintf_callchain_entry(struct hist_browser *b __maybe_unused,
+						  struct callchain_list *chain,
+						  const char *str, int offset,
+						  unsigned short row __maybe_unused,
+						  struct callchain_print_arg *arg)
+{
+	char folded_sign = callchain_list__folded(chain);
+
+	arg->printed += fprintf(arg->fp, "%*s%c %s\n", offset, " ",
+				folded_sign, str);
+}
+
+typedef bool (*check_output_full_fn)(struct hist_browser *browser,
+				     unsigned short row);
+
+static bool hist_browser__check_output_full(struct hist_browser *browser,
+					    unsigned short row)
+{
+	return browser->b.rows == row;
+}
+
+static bool hist_browser__check_dump_full(struct hist_browser *browser __maybe_unused,
+					  unsigned short row __maybe_unused)
+{
+	return false;
+}
+
 #define LEVEL_OFFSET_STEP 3
 
-static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *browser,
-						     struct callchain_node *chain_node,
-						     u64 total, int level,
-						     unsigned short row,
-						     off_t *row_offset,
-						     bool *is_current_entry)
+static int hist_browser__show_callchain(struct hist_browser *browser,
+					struct rb_root *root, int level,
+					unsigned short row, u64 total,
+					print_callchain_entry_fn print,
+					struct callchain_print_arg *arg,
+					check_output_full_fn is_output_full)
 {
 	struct rb_node *node;
-	int first_row = row, width, offset = level * LEVEL_OFFSET_STEP;
-	u64 new_total, remaining;
+	int first_row = row, offset = level * LEVEL_OFFSET_STEP;
+	u64 new_total;
 
-	if (callchain_param.mode == CHAIN_GRAPH_REL)
-		new_total = chain_node->children_hit;
-	else
-		new_total = total;
-
-	remaining = new_total;
-	node = rb_first(&chain_node->rb_root);
+	node = rb_first(root);
 	while (node) {
 		struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
 		struct rb_node *next = rb_next(node);
@@ -503,30 +567,28 @@
 		int first = true;
 		int extra_offset = 0;
 
-		remaining -= cumul;
-
 		list_for_each_entry(chain, &child->val, list) {
 			char bf[1024], *alloc_str;
 			const char *str;
-			int color;
 			bool was_first = first;
 
 			if (first)
 				first = false;
-			else
+			else if (level > 1)
 				extra_offset = LEVEL_OFFSET_STEP;
 
 			folded_sign = callchain_list__folded(chain);
-			if (*row_offset != 0) {
-				--*row_offset;
+			if (arg->row_offset != 0) {
+				arg->row_offset--;
 				goto do_next;
 			}
 
 			alloc_str = NULL;
 			str = callchain_list__sym_name(chain, bf, sizeof(bf),
 						       browser->show_dso);
-			if (was_first) {
-				double percent = cumul * 100.0 / new_total;
+
+			if (was_first && level > 1) {
+				double percent = cumul * 100.0 / total;
 
 				if (asprintf(&alloc_str, "%2.2f%% %s", percent, str) < 0)
 					str = "Not enough memory!";
@@ -534,22 +596,11 @@
 					str = alloc_str;
 			}
 
-			color = HE_COLORSET_NORMAL;
-			width = browser->b.width - (offset + extra_offset + 2);
-			if (ui_browser__is_current_entry(&browser->b, row)) {
-				browser->selection = &chain->ms;
-				color = HE_COLORSET_SELECTED;
-				*is_current_entry = true;
-			}
+			print(browser, chain, str, offset + extra_offset, row, arg);
 
-			ui_browser__set_color(&browser->b, color);
-			hist_browser__gotorc(browser, row, 0);
-			slsmg_write_nstring(" ", offset + extra_offset);
-			slsmg_printf("%c ", folded_sign);
-			slsmg_write_nstring(str, width);
 			free(alloc_str);
 
-			if (++row == browser->b.rows)
+			if (is_output_full(browser, ++row))
 				goto out;
 do_next:
 			if (folded_sign == '+')
@@ -558,92 +609,24 @@
 
 		if (folded_sign == '-') {
 			const int new_level = level + (extra_offset ? 2 : 1);
-			row += hist_browser__show_callchain_node_rb_tree(browser, child, new_total,
-									 new_level, row, row_offset,
-									 is_current_entry);
+
+			if (callchain_param.mode == CHAIN_GRAPH_REL)
+				new_total = child->children_hit;
+			else
+				new_total = total;
+
+			row += hist_browser__show_callchain(browser, &child->rb_root,
+							    new_level, row, new_total,
+							    print, arg, is_output_full);
 		}
-		if (row == browser->b.rows)
-			goto out;
+		if (is_output_full(browser, row))
+			break;
 		node = next;
 	}
 out:
 	return row - first_row;
 }
 
-static int hist_browser__show_callchain_node(struct hist_browser *browser,
-					     struct callchain_node *node,
-					     int level, unsigned short row,
-					     off_t *row_offset,
-					     bool *is_current_entry)
-{
-	struct callchain_list *chain;
-	int first_row = row,
-	     offset = level * LEVEL_OFFSET_STEP,
-	     width = browser->b.width - offset;
-	char folded_sign = ' ';
-
-	list_for_each_entry(chain, &node->val, list) {
-		char bf[1024], *s;
-		int color;
-
-		folded_sign = callchain_list__folded(chain);
-
-		if (*row_offset != 0) {
-			--*row_offset;
-			continue;
-		}
-
-		color = HE_COLORSET_NORMAL;
-		if (ui_browser__is_current_entry(&browser->b, row)) {
-			browser->selection = &chain->ms;
-			color = HE_COLORSET_SELECTED;
-			*is_current_entry = true;
-		}
-
-		s = callchain_list__sym_name(chain, bf, sizeof(bf),
-					     browser->show_dso);
-		hist_browser__gotorc(browser, row, 0);
-		ui_browser__set_color(&browser->b, color);
-		slsmg_write_nstring(" ", offset);
-		slsmg_printf("%c ", folded_sign);
-		slsmg_write_nstring(s, width - 2);
-
-		if (++row == browser->b.rows)
-			goto out;
-	}
-
-	if (folded_sign == '-')
-		row += hist_browser__show_callchain_node_rb_tree(browser, node,
-								 browser->hists->stats.total_period,
-								 level + 1, row,
-								 row_offset,
-								 is_current_entry);
-out:
-	return row - first_row;
-}
-
-static int hist_browser__show_callchain(struct hist_browser *browser,
-					struct rb_root *chain,
-					int level, unsigned short row,
-					off_t *row_offset,
-					bool *is_current_entry)
-{
-	struct rb_node *nd;
-	int first_row = row;
-
-	for (nd = rb_first(chain); nd; nd = rb_next(nd)) {
-		struct callchain_node *node = rb_entry(nd, struct callchain_node, rb_node);
-
-		row += hist_browser__show_callchain_node(browser, node, level,
-							 row, row_offset,
-							 is_current_entry);
-		if (row == browser->b.rows)
-			break;
-	}
-
-	return row - first_row;
-}
-
 struct hpp_arg {
 	struct ui_browser *b;
 	char folded_sign;
@@ -653,17 +636,18 @@
 static int __hpp__slsmg_color_printf(struct perf_hpp *hpp, const char *fmt, ...)
 {
 	struct hpp_arg *arg = hpp->ptr;
-	int ret;
+	int ret, len;
 	va_list args;
 	double percent;
 
 	va_start(args, fmt);
+	len = va_arg(args, int);
 	percent = va_arg(args, double);
 	va_end(args);
 
 	ui_browser__set_percent_color(arg->b, percent, arg->current_entry);
 
-	ret = scnprintf(hpp->buf, hpp->size, fmt, percent);
+	ret = scnprintf(hpp->buf, hpp->size, fmt, len, percent);
 	slsmg_printf("%s", hpp->buf);
 
 	advance_hpp(hpp, ret);
@@ -677,12 +661,12 @@
 }									\
 									\
 static int								\
-hist_browser__hpp_color_##_type(struct perf_hpp_fmt *fmt __maybe_unused,\
+hist_browser__hpp_color_##_type(struct perf_hpp_fmt *fmt,		\
 				struct perf_hpp *hpp,			\
 				struct hist_entry *he)			\
 {									\
-	return __hpp__fmt(hpp, he, __hpp_get_##_field, " %6.2f%%",	\
-			  __hpp__slsmg_color_printf, true);		\
+	return hpp__fmt(fmt, hpp, he, __hpp_get_##_field, " %*.2f%%",	\
+			__hpp__slsmg_color_printf, true);		\
 }
 
 #define __HPP_COLOR_ACC_PERCENT_FN(_type, _field)			\
@@ -692,18 +676,20 @@
 }									\
 									\
 static int								\
-hist_browser__hpp_color_##_type(struct perf_hpp_fmt *fmt __maybe_unused,\
+hist_browser__hpp_color_##_type(struct perf_hpp_fmt *fmt,		\
 				struct perf_hpp *hpp,			\
 				struct hist_entry *he)			\
 {									\
 	if (!symbol_conf.cumulate_callchain) {				\
-		int ret = scnprintf(hpp->buf, hpp->size, "%8s", "N/A");	\
+		int len = fmt->user_len ?: fmt->len;			\
+		int ret = scnprintf(hpp->buf, hpp->size,		\
+				    "%*s", len, "N/A");			\
 		slsmg_printf("%s", hpp->buf);				\
 									\
 		return ret;						\
 	}								\
-	return __hpp__fmt(hpp, he, __hpp_get_acc_##_field, " %6.2f%%",	\
-			  __hpp__slsmg_color_printf, true);		\
+	return hpp__fmt(fmt, hpp, he, __hpp_get_acc_##_field,		\
+			" %*.2f%%", __hpp__slsmg_color_printf, true);	\
 }
 
 __HPP_COLOR_PERCENT_FN(overhead, period)
@@ -812,10 +798,21 @@
 		--row_offset;
 
 	if (folded_sign == '-' && row != browser->b.rows) {
-		printed += hist_browser__show_callchain(browser, &entry->sorted_chain,
-							1, row, &row_offset,
-							&current_entry);
-		if (current_entry)
+		u64 total = hists__total_period(entry->hists);
+		struct callchain_print_arg arg = {
+			.row_offset = row_offset,
+			.is_current_entry = current_entry,
+		};
+
+		if (symbol_conf.cumulate_callchain)
+			total = entry->stat_acc->period;
+
+		printed += hist_browser__show_callchain(browser,
+					&entry->sorted_chain, 1, row, total,
+					hist_browser__show_callchain_entry, &arg,
+					hist_browser__check_output_full);
+
+		if (arg.is_current_entry)
 			browser->he_selection = entry;
 	}
 
@@ -847,9 +844,6 @@
 		if (perf_hpp__should_skip(fmt))
 			continue;
 
-		/* We need to add the length of the columns header. */
-		perf_hpp__reset_width(fmt, hists);
-
 		ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists));
 		if (advance_hpp_check(&dummy_hpp, ret))
 			break;
@@ -1074,113 +1068,21 @@
 	}
 }
 
-static int hist_browser__fprintf_callchain_node_rb_tree(struct hist_browser *browser,
-							struct callchain_node *chain_node,
-							u64 total, int level,
-							FILE *fp)
-{
-	struct rb_node *node;
-	int offset = level * LEVEL_OFFSET_STEP;
-	u64 new_total, remaining;
-	int printed = 0;
-
-	if (callchain_param.mode == CHAIN_GRAPH_REL)
-		new_total = chain_node->children_hit;
-	else
-		new_total = total;
-
-	remaining = new_total;
-	node = rb_first(&chain_node->rb_root);
-	while (node) {
-		struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
-		struct rb_node *next = rb_next(node);
-		u64 cumul = callchain_cumul_hits(child);
-		struct callchain_list *chain;
-		char folded_sign = ' ';
-		int first = true;
-		int extra_offset = 0;
-
-		remaining -= cumul;
-
-		list_for_each_entry(chain, &child->val, list) {
-			char bf[1024], *alloc_str;
-			const char *str;
-			bool was_first = first;
-
-			if (first)
-				first = false;
-			else
-				extra_offset = LEVEL_OFFSET_STEP;
-
-			folded_sign = callchain_list__folded(chain);
-
-			alloc_str = NULL;
-			str = callchain_list__sym_name(chain, bf, sizeof(bf),
-						       browser->show_dso);
-			if (was_first) {
-				double percent = cumul * 100.0 / new_total;
-
-				if (asprintf(&alloc_str, "%2.2f%% %s", percent, str) < 0)
-					str = "Not enough memory!";
-				else
-					str = alloc_str;
-			}
-
-			printed += fprintf(fp, "%*s%c %s\n", offset + extra_offset, " ", folded_sign, str);
-			free(alloc_str);
-			if (folded_sign == '+')
-				break;
-		}
-
-		if (folded_sign == '-') {
-			const int new_level = level + (extra_offset ? 2 : 1);
-			printed += hist_browser__fprintf_callchain_node_rb_tree(browser, child, new_total,
-										new_level, fp);
-		}
-
-		node = next;
-	}
-
-	return printed;
-}
-
-static int hist_browser__fprintf_callchain_node(struct hist_browser *browser,
-						struct callchain_node *node,
-						int level, FILE *fp)
-{
-	struct callchain_list *chain;
-	int offset = level * LEVEL_OFFSET_STEP;
-	char folded_sign = ' ';
-	int printed = 0;
-
-	list_for_each_entry(chain, &node->val, list) {
-		char bf[1024], *s;
-
-		folded_sign = callchain_list__folded(chain);
-		s = callchain_list__sym_name(chain, bf, sizeof(bf), browser->show_dso);
-		printed += fprintf(fp, "%*s%c %s\n", offset, " ", folded_sign, s);
-	}
-
-	if (folded_sign == '-')
-		printed += hist_browser__fprintf_callchain_node_rb_tree(browser, node,
-									browser->hists->stats.total_period,
-									level + 1,  fp);
-	return printed;
-}
-
 static int hist_browser__fprintf_callchain(struct hist_browser *browser,
-					   struct rb_root *chain, int level, FILE *fp)
+					   struct hist_entry *he, FILE *fp)
 {
-	struct rb_node *nd;
-	int printed = 0;
+	u64 total = hists__total_period(he->hists);
+	struct callchain_print_arg arg  = {
+		.fp = fp,
+	};
 
-	for (nd = rb_first(chain); nd; nd = rb_next(nd)) {
-		struct callchain_node *node = rb_entry(nd, struct callchain_node, rb_node);
+	if (symbol_conf.cumulate_callchain)
+		total = he->stat_acc->period;
 
-		printed += hist_browser__fprintf_callchain_node(browser, node, level, fp);
-	}
-
-	return printed;
+	hist_browser__show_callchain(browser, &he->sorted_chain, 1, 0, total,
+				     hist_browser__fprintf_callchain_entry, &arg,
+				     hist_browser__check_dump_full);
+	return arg.printed;
 }
 
 static int hist_browser__fprintf_entry(struct hist_browser *browser,
@@ -1219,7 +1121,7 @@
 	printed += fprintf(fp, "%s\n", rtrim(s));
 
 	if (folded_sign == '-')
-		printed += hist_browser__fprintf_callchain(browser, &he->sorted_chain, 1, fp);
+		printed += hist_browser__fprintf_callchain(browser, he, fp);
 
 	return printed;
 }
@@ -1498,6 +1400,7 @@
 	char buf[64];
 	char script_opt[64];
 	int delay_secs = hbt ? hbt->refresh : 0;
+	struct perf_hpp_fmt *fmt;
 
 #define HIST_BROWSER_HELP_COMMON					\
 	"h/?/F1        Show this window\n"				\
@@ -1529,6 +1432,7 @@
 	"P             Print histograms to perf.hist.N\n"
 	"t             Zoom into current Thread\n"
 	"V             Verbose (DSO names in callchains, etc)\n"
+	"z             Toggle zeroing of samples\n"
 	"/             Filter symbol by name";
 
 	if (browser == NULL)
@@ -1547,6 +1451,12 @@
 
 	memset(options, 0, sizeof(options));
 
+	perf_hpp__for_each_format(fmt)
+		perf_hpp__reset_width(fmt, hists);
+
+	if (symbol_conf.col_width_list_str)
+		perf_hpp__set_user_width(symbol_conf.col_width_list_str);
+
 	while (1) {
 		const struct thread *thread = NULL;
 		const struct dso *dso = NULL;
@@ -1623,6 +1533,13 @@
 		case 'F':
 			symbol_conf.filter_relative ^= 1;
 			continue;
+		case 'z':
+			if (!is_report_browser(hbt)) {
+				struct perf_top *top = hbt->arg;
+
+				top->zero = !top->zero;
+			}
+			continue;
 		case K_F1:
 		case 'h':
 		case '?':
diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c
index 6ca60e4..f3fa425 100644
--- a/tools/perf/ui/gtk/hists.c
+++ b/tools/perf/ui/gtk/hists.c
@@ -11,6 +11,7 @@
 static int __percent_color_snprintf(struct perf_hpp *hpp, const char *fmt, ...)
 {
 	int ret = 0;
+	int len;
 	va_list args;
 	double percent;
 	const char *markup;
@@ -18,6 +19,7 @@
 	size_t size = hpp->size;
 
 	va_start(args, fmt);
+	len = va_arg(args, int);
 	percent = va_arg(args, double);
 	va_end(args);
 
@@ -25,7 +27,7 @@
 	if (markup)
 		ret += scnprintf(buf, size, markup);
 
-	ret += scnprintf(buf + ret, size - ret, fmt, percent);
+	ret += scnprintf(buf + ret, size - ret, fmt, len, percent);
 
 	if (markup)
 		ret += scnprintf(buf + ret, size - ret, "</span>");
@@ -39,12 +41,12 @@
 	return he->stat._field;							\
 }										\
 										\
-static int perf_gtk__hpp_color_##_type(struct perf_hpp_fmt *fmt __maybe_unused,	\
+static int perf_gtk__hpp_color_##_type(struct perf_hpp_fmt *fmt,		\
 				       struct perf_hpp *hpp,			\
 				       struct hist_entry *he)			\
 {										\
-	return __hpp__fmt(hpp, he, he_get_##_field, " %6.2f%%",			\
-			  __percent_color_snprintf, true);			\
+	return hpp__fmt(fmt, hpp, he, he_get_##_field, " %*.2f%%",		\
+			__percent_color_snprintf, true);			\
 }
 
 #define __HPP_COLOR_ACC_PERCENT_FN(_type, _field)				\
@@ -57,8 +59,8 @@
 				       struct perf_hpp *hpp,			\
 				       struct hist_entry *he)			\
 {										\
-	return __hpp__fmt_acc(hpp, he, he_get_acc_##_field, " %6.2f%%",		\
-			      __percent_color_snprintf, true);			\
+	return hpp__fmt_acc(fmt, hpp, he, he_get_acc_##_field, " %*.2f%%", 	\
+			    __percent_color_snprintf, true);			\
 }
 
 __HPP_COLOR_PERCENT_FN(overhead, period)
@@ -205,10 +207,8 @@
 		if (perf_hpp__is_sort_entry(fmt))
 			sym_col = col_idx;
 
-		fmt->header(fmt, &hpp, hists_to_evsel(hists));
-
 		gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view),
-							    -1, ltrim(s),
+							    -1, fmt->name,
 							    renderer, "markup",
 							    col_idx++, NULL);
 	}
diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c
index 498adb2..2af1837 100644
--- a/tools/perf/ui/hist.c
+++ b/tools/perf/ui/hist.c
@@ -15,9 +15,9 @@
 	__ret;							\
 })
 
-int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he,
-	       hpp_field_fn get_field, const char *fmt,
-	       hpp_snprint_fn print_fn, bool fmt_percent)
+static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he,
+		      hpp_field_fn get_field, const char *fmt, int len,
+		      hpp_snprint_fn print_fn, bool fmt_percent)
 {
 	int ret;
 	struct hists *hists = he->hists;
@@ -32,9 +32,9 @@
 		if (total)
 			percent = 100.0 * get_field(he) / total;
 
-		ret = hpp__call_print_fn(hpp, print_fn, fmt, percent);
+		ret = hpp__call_print_fn(hpp, print_fn, fmt, len, percent);
 	} else
-		ret = hpp__call_print_fn(hpp, print_fn, fmt, get_field(he));
+		ret = hpp__call_print_fn(hpp, print_fn, fmt, len, get_field(he));
 
 	if (perf_evsel__is_group_event(evsel)) {
 		int prev_idx, idx_delta;
@@ -60,19 +60,19 @@
 				 */
 				if (fmt_percent) {
 					ret += hpp__call_print_fn(hpp, print_fn,
-								  fmt, 0.0);
+								  fmt, len, 0.0);
 				} else {
 					ret += hpp__call_print_fn(hpp, print_fn,
-								  fmt, 0ULL);
+								  fmt, len, 0ULL);
 				}
 			}
 
 			if (fmt_percent) {
-				ret += hpp__call_print_fn(hpp, print_fn, fmt,
+				ret += hpp__call_print_fn(hpp, print_fn, fmt, len,
 							  100.0 * period / total);
 			} else {
 				ret += hpp__call_print_fn(hpp, print_fn, fmt,
-							  period);
+							  len, period);
 			}
 
 			prev_idx = perf_evsel__group_idx(evsel);
@@ -86,10 +86,10 @@
 			 */
 			if (fmt_percent) {
 				ret += hpp__call_print_fn(hpp, print_fn,
-							  fmt, 0.0);
+							  fmt, len, 0.0);
 			} else {
 				ret += hpp__call_print_fn(hpp, print_fn,
-							  fmt, 0ULL);
+							  fmt, len, 0ULL);
 			}
 		}
 	}
@@ -104,16 +104,35 @@
 	return ret;
 }
 
-int __hpp__fmt_acc(struct perf_hpp *hpp, struct hist_entry *he,
-		   hpp_field_fn get_field, const char *fmt,
-		   hpp_snprint_fn print_fn, bool fmt_percent)
+int hpp__fmt(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
+	     struct hist_entry *he, hpp_field_fn get_field,
+	     const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent)
 {
-	if (!symbol_conf.cumulate_callchain) {
-		return snprintf(hpp->buf, hpp->size, "%*s",
-				fmt_percent ? 8 : 12, "N/A");
+	int len = fmt->user_len ?: fmt->len;
+
+	if (symbol_conf.field_sep) {
+		return __hpp__fmt(hpp, he, get_field, fmtstr, 1,
+				  print_fn, fmt_percent);
 	}
 
-	return __hpp__fmt(hpp, he, get_field, fmt, print_fn, fmt_percent);
+	if (fmt_percent)
+		len -= 2; /* 2 for a space and a % sign */
+	else
+		len -= 1;
+
+	return  __hpp__fmt(hpp, he, get_field, fmtstr, len, print_fn, fmt_percent);
+}
+
+int hpp__fmt_acc(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
+		 struct hist_entry *he, hpp_field_fn get_field,
+		 const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent)
+{
+	if (!symbol_conf.cumulate_callchain) {
+		int len = fmt->user_len ?: fmt->len;
+		return snprintf(hpp->buf, hpp->size, " %*s", len - 1, "N/A");
+	}
+
+	return hpp__fmt(fmt, hpp, he, get_field, fmtstr, print_fn, fmt_percent);
 }
 
 static int field_cmp(u64 field_a, u64 field_b)
@@ -190,30 +209,26 @@
 	return ret;
 }
 
-#define __HPP_HEADER_FN(_type, _str, _min_width, _unit_width) 		\
-static int hpp__header_##_type(struct perf_hpp_fmt *fmt __maybe_unused,	\
-			       struct perf_hpp *hpp,			\
-			       struct perf_evsel *evsel)		\
-{									\
-	int len = _min_width;						\
-									\
-	if (symbol_conf.event_group)					\
-		len = max(len, evsel->nr_members * _unit_width);	\
-									\
-	return scnprintf(hpp->buf, hpp->size, "%*s", len, _str);	\
+static int hpp__width_fn(struct perf_hpp_fmt *fmt,
+			 struct perf_hpp *hpp __maybe_unused,
+			 struct perf_evsel *evsel)
+{
+	int len = fmt->user_len ?: fmt->len;
+
+	if (symbol_conf.event_group)
+		len = max(len, evsel->nr_members * fmt->len);
+
+	if (len < (int)strlen(fmt->name))
+		len = strlen(fmt->name);
+
+	return len;
 }
 
-#define __HPP_WIDTH_FN(_type, _min_width, _unit_width) 			\
-static int hpp__width_##_type(struct perf_hpp_fmt *fmt __maybe_unused,	\
-			      struct perf_hpp *hpp __maybe_unused,	\
-			      struct perf_evsel *evsel)			\
-{									\
-	int len = _min_width;						\
-									\
-	if (symbol_conf.event_group)					\
-		len = max(len, evsel->nr_members * _unit_width);	\
-									\
-	return len;							\
+static int hpp__header_fn(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
+			  struct perf_evsel *evsel)
+{
+	int len = hpp__width_fn(fmt, hpp, evsel);
+	return scnprintf(hpp->buf, hpp->size, "%*s", len, fmt->name);
 }
 
 static int hpp_color_scnprintf(struct perf_hpp *hpp, const char *fmt, ...)
@@ -221,11 +236,12 @@
 	va_list args;
 	ssize_t ssize = hpp->size;
 	double percent;
-	int ret;
+	int ret, len;
 
 	va_start(args, fmt);
+	len = va_arg(args, int);
 	percent = va_arg(args, double);
-	ret = value_color_snprintf(hpp->buf, hpp->size, fmt, percent);
+	ret = percent_color_len_snprintf(hpp->buf, hpp->size, fmt, len, percent);
 	va_end(args);
 
 	return (ret >= ssize) ? (ssize - 1) : ret;
@@ -250,20 +266,19 @@
 	return he->stat._field;							\
 }										\
 										\
-static int hpp__color_##_type(struct perf_hpp_fmt *fmt __maybe_unused,		\
+static int hpp__color_##_type(struct perf_hpp_fmt *fmt,				\
 			      struct perf_hpp *hpp, struct hist_entry *he) 	\
 {										\
-	return __hpp__fmt(hpp, he, he_get_##_field, " %6.2f%%",			\
-			  hpp_color_scnprintf, true);				\
+	return hpp__fmt(fmt, hpp, he, he_get_##_field, " %*.2f%%",		\
+			hpp_color_scnprintf, true);				\
 }
 
 #define __HPP_ENTRY_PERCENT_FN(_type, _field)					\
-static int hpp__entry_##_type(struct perf_hpp_fmt *_fmt __maybe_unused,		\
+static int hpp__entry_##_type(struct perf_hpp_fmt *fmt,				\
 			      struct perf_hpp *hpp, struct hist_entry *he) 	\
 {										\
-	const char *fmt = symbol_conf.field_sep ? " %.2f" : " %6.2f%%";		\
-	return __hpp__fmt(hpp, he, he_get_##_field, fmt,			\
-			  hpp_entry_scnprintf, true);				\
+	return hpp__fmt(fmt, hpp, he, he_get_##_field, " %*.2f%%",		\
+			hpp_entry_scnprintf, true);				\
 }
 
 #define __HPP_SORT_FN(_type, _field)						\
@@ -278,20 +293,19 @@
 	return he->stat_acc->_field;						\
 }										\
 										\
-static int hpp__color_##_type(struct perf_hpp_fmt *fmt __maybe_unused,		\
+static int hpp__color_##_type(struct perf_hpp_fmt *fmt,				\
 			      struct perf_hpp *hpp, struct hist_entry *he) 	\
 {										\
-	return __hpp__fmt_acc(hpp, he, he_get_acc_##_field, " %6.2f%%",		\
-			      hpp_color_scnprintf, true);			\
+	return hpp__fmt_acc(fmt, hpp, he, he_get_acc_##_field, " %*.2f%%", 	\
+			    hpp_color_scnprintf, true);				\
 }
 
 #define __HPP_ENTRY_ACC_PERCENT_FN(_type, _field)				\
-static int hpp__entry_##_type(struct perf_hpp_fmt *_fmt __maybe_unused,		\
+static int hpp__entry_##_type(struct perf_hpp_fmt *fmt,				\
 			      struct perf_hpp *hpp, struct hist_entry *he) 	\
 {										\
-	const char *fmt = symbol_conf.field_sep ? " %.2f" : " %6.2f%%";		\
-	return __hpp__fmt_acc(hpp, he, he_get_acc_##_field, fmt,		\
-			      hpp_entry_scnprintf, true);			\
+	return hpp__fmt_acc(fmt, hpp, he, he_get_acc_##_field, " %*.2f%%",	\
+			    hpp_entry_scnprintf, true);				\
 }
 
 #define __HPP_SORT_ACC_FN(_type, _field)					\
@@ -306,12 +320,11 @@
 	return he->stat._field;							\
 }										\
 										\
-static int hpp__entry_##_type(struct perf_hpp_fmt *_fmt __maybe_unused,		\
+static int hpp__entry_##_type(struct perf_hpp_fmt *fmt,				\
 			      struct perf_hpp *hpp, struct hist_entry *he) 	\
 {										\
-	const char *fmt = symbol_conf.field_sep ? " %"PRIu64 : " %11"PRIu64;	\
-	return __hpp__fmt(hpp, he, he_get_raw_##_field, fmt,			\
-			  hpp_entry_scnprintf, false);				\
+	return hpp__fmt(fmt, hpp, he, he_get_raw_##_field, " %*"PRIu64, 	\
+			hpp_entry_scnprintf, false);				\
 }
 
 #define __HPP_SORT_RAW_FN(_type, _field)					\
@@ -321,37 +334,29 @@
 }
 
 
-#define HPP_PERCENT_FNS(_type, _str, _field, _min_width, _unit_width)	\
-__HPP_HEADER_FN(_type, _str, _min_width, _unit_width)			\
-__HPP_WIDTH_FN(_type, _min_width, _unit_width)				\
+#define HPP_PERCENT_FNS(_type, _field)					\
 __HPP_COLOR_PERCENT_FN(_type, _field)					\
 __HPP_ENTRY_PERCENT_FN(_type, _field)					\
 __HPP_SORT_FN(_type, _field)
 
-#define HPP_PERCENT_ACC_FNS(_type, _str, _field, _min_width, _unit_width)\
-__HPP_HEADER_FN(_type, _str, _min_width, _unit_width)			\
-__HPP_WIDTH_FN(_type, _min_width, _unit_width)				\
+#define HPP_PERCENT_ACC_FNS(_type, _field)				\
 __HPP_COLOR_ACC_PERCENT_FN(_type, _field)				\
 __HPP_ENTRY_ACC_PERCENT_FN(_type, _field)				\
 __HPP_SORT_ACC_FN(_type, _field)
 
-#define HPP_RAW_FNS(_type, _str, _field, _min_width, _unit_width)	\
-__HPP_HEADER_FN(_type, _str, _min_width, _unit_width)			\
-__HPP_WIDTH_FN(_type, _min_width, _unit_width)				\
+#define HPP_RAW_FNS(_type, _field)					\
 __HPP_ENTRY_RAW_FN(_type, _field)					\
 __HPP_SORT_RAW_FN(_type, _field)
 
-__HPP_HEADER_FN(overhead_self, "Self", 8, 8)
+HPP_PERCENT_FNS(overhead, period)
+HPP_PERCENT_FNS(overhead_sys, period_sys)
+HPP_PERCENT_FNS(overhead_us, period_us)
+HPP_PERCENT_FNS(overhead_guest_sys, period_guest_sys)
+HPP_PERCENT_FNS(overhead_guest_us, period_guest_us)
+HPP_PERCENT_ACC_FNS(overhead_acc, period)
 
-HPP_PERCENT_FNS(overhead, "Overhead", period, 8, 8)
-HPP_PERCENT_FNS(overhead_sys, "sys", period_sys, 8, 8)
-HPP_PERCENT_FNS(overhead_us, "usr", period_us, 8, 8)
-HPP_PERCENT_FNS(overhead_guest_sys, "guest sys", period_guest_sys, 9, 8)
-HPP_PERCENT_FNS(overhead_guest_us, "guest usr", period_guest_us, 9, 8)
-HPP_PERCENT_ACC_FNS(overhead_acc, "Children", period, 8, 8)
-
-HPP_RAW_FNS(samples, "Samples", nr_events, 12, 12)
-HPP_RAW_FNS(period, "Period", period, 12, 12)
+HPP_RAW_FNS(samples, nr_events)
+HPP_RAW_FNS(period, period)
 
 static int64_t hpp__nop_cmp(struct hist_entry *a __maybe_unused,
 			    struct hist_entry *b __maybe_unused)
@@ -359,47 +364,50 @@
 	return 0;
 }
 
-#define HPP__COLOR_PRINT_FNS(_name)			\
+#define HPP__COLOR_PRINT_FNS(_name, _fn)		\
 	{						\
-		.header	= hpp__header_ ## _name,	\
-		.width	= hpp__width_ ## _name,		\
-		.color	= hpp__color_ ## _name,		\
-		.entry	= hpp__entry_ ## _name,		\
+		.name   = _name,			\
+		.header	= hpp__header_fn,		\
+		.width	= hpp__width_fn,		\
+		.color	= hpp__color_ ## _fn,		\
+		.entry	= hpp__entry_ ## _fn,		\
 		.cmp	= hpp__nop_cmp,			\
 		.collapse = hpp__nop_cmp,		\
-		.sort	= hpp__sort_ ## _name,		\
+		.sort	= hpp__sort_ ## _fn,		\
 	}
 
-#define HPP__COLOR_ACC_PRINT_FNS(_name)			\
+#define HPP__COLOR_ACC_PRINT_FNS(_name, _fn)		\
 	{						\
-		.header	= hpp__header_ ## _name,	\
-		.width	= hpp__width_ ## _name,		\
-		.color	= hpp__color_ ## _name,		\
-		.entry	= hpp__entry_ ## _name,		\
+		.name   = _name,			\
+		.header	= hpp__header_fn,		\
+		.width	= hpp__width_fn,		\
+		.color	= hpp__color_ ## _fn,		\
+		.entry	= hpp__entry_ ## _fn,		\
 		.cmp	= hpp__nop_cmp,			\
 		.collapse = hpp__nop_cmp,		\
-		.sort	= hpp__sort_ ## _name,		\
+		.sort	= hpp__sort_ ## _fn,		\
 	}
 
-#define HPP__PRINT_FNS(_name)				\
+#define HPP__PRINT_FNS(_name, _fn)			\
 	{						\
-		.header	= hpp__header_ ## _name,	\
-		.width	= hpp__width_ ## _name,		\
-		.entry	= hpp__entry_ ## _name,		\
+		.name   = _name,			\
+		.header	= hpp__header_fn,		\
+		.width	= hpp__width_fn,		\
+		.entry	= hpp__entry_ ## _fn,		\
 		.cmp	= hpp__nop_cmp,			\
 		.collapse = hpp__nop_cmp,		\
-		.sort	= hpp__sort_ ## _name,		\
+		.sort	= hpp__sort_ ## _fn,		\
 	}
 
 struct perf_hpp_fmt perf_hpp__format[] = {
-	HPP__COLOR_PRINT_FNS(overhead),
-	HPP__COLOR_PRINT_FNS(overhead_sys),
-	HPP__COLOR_PRINT_FNS(overhead_us),
-	HPP__COLOR_PRINT_FNS(overhead_guest_sys),
-	HPP__COLOR_PRINT_FNS(overhead_guest_us),
-	HPP__COLOR_ACC_PRINT_FNS(overhead_acc),
-	HPP__PRINT_FNS(samples),
-	HPP__PRINT_FNS(period)
+	HPP__COLOR_PRINT_FNS("Overhead", overhead),
+	HPP__COLOR_PRINT_FNS("sys", overhead_sys),
+	HPP__COLOR_PRINT_FNS("usr", overhead_us),
+	HPP__COLOR_PRINT_FNS("guest sys", overhead_guest_sys),
+	HPP__COLOR_PRINT_FNS("guest usr", overhead_guest_us),
+	HPP__COLOR_ACC_PRINT_FNS("Children", overhead_acc),
+	HPP__PRINT_FNS("Samples", samples),
+	HPP__PRINT_FNS("Period", period)
 };
 
 LIST_HEAD(perf_hpp__list);
@@ -444,14 +452,12 @@
 	/*
 	 * If user specified field order, no need to setup default fields.
 	 */
-	if (field_order)
+	if (is_strict_order(field_order))
 		return;
 
 	if (symbol_conf.cumulate_callchain) {
 		perf_hpp__column_enable(PERF_HPP__OVERHEAD_ACC);
-
-		perf_hpp__format[PERF_HPP__OVERHEAD].header =
-						hpp__header_overhead_self;
+		perf_hpp__format[PERF_HPP__OVERHEAD].name = "Self";
 	}
 
 	perf_hpp__column_enable(PERF_HPP__OVERHEAD);
@@ -513,11 +519,11 @@
 
 void perf_hpp__cancel_cumulate(void)
 {
-	if (field_order)
+	if (is_strict_order(field_order))
 		return;
 
 	perf_hpp__column_disable(PERF_HPP__OVERHEAD_ACC);
-	perf_hpp__format[PERF_HPP__OVERHEAD].header = hpp__header_overhead;
+	perf_hpp__format[PERF_HPP__OVERHEAD].name = "Overhead";
 }
 
 void perf_hpp__setup_output_field(void)
@@ -622,3 +628,59 @@
 
 	return ret;
 }
+
+void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists)
+{
+	int idx;
+
+	if (perf_hpp__is_sort_entry(fmt))
+		return perf_hpp__reset_sort_width(fmt, hists);
+
+	for (idx = 0; idx < PERF_HPP__MAX_INDEX; idx++) {
+		if (fmt == &perf_hpp__format[idx])
+			break;
+	}
+
+	if (idx == PERF_HPP__MAX_INDEX)
+		return;
+
+	switch (idx) {
+	case PERF_HPP__OVERHEAD:
+	case PERF_HPP__OVERHEAD_SYS:
+	case PERF_HPP__OVERHEAD_US:
+	case PERF_HPP__OVERHEAD_ACC:
+		fmt->len = 8;
+		break;
+
+	case PERF_HPP__OVERHEAD_GUEST_SYS:
+	case PERF_HPP__OVERHEAD_GUEST_US:
+		fmt->len = 9;
+		break;
+
+	case PERF_HPP__SAMPLES:
+	case PERF_HPP__PERIOD:
+		fmt->len = 12;
+		break;
+
+	default:
+		break;
+	}
+}
+
+void perf_hpp__set_user_width(const char *width_list_str)
+{
+	struct perf_hpp_fmt *fmt;
+	const char *ptr = width_list_str;
+
+	perf_hpp__for_each_format(fmt) {
+		char *p;
+
+		int len = strtol(ptr, &p, 10);
+		fmt->user_len = len;
+
+		if (*p == ',')
+			ptr = p + 1;
+		else
+			break;
+	}
+}
diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c
index 40af0ac..15b451a 100644
--- a/tools/perf/ui/stdio/hist.c
+++ b/tools/perf/ui/stdio/hist.c
@@ -395,10 +395,12 @@
 
 	init_rem_hits();
 
-
 	perf_hpp__for_each_format(fmt)
 		perf_hpp__reset_width(fmt, hists);
 
+	if (symbol_conf.col_width_list_str)
+		perf_hpp__set_user_width(symbol_conf.col_width_list_str);
+
 	if (!show_header)
 		goto print_entries;
 
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 809b4c5..3643752 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -232,9 +232,16 @@
 		return -1;
 
 	target = ++s;
+	comment = strchr(s, '#');
 
-	while (s[0] != '\0' && !isspace(s[0]))
-		++s;
+	if (comment != NULL)
+		s = comment - 1;
+	else
+		s = strchr(s, '\0') - 1;
+
+	while (s > target && isspace(s[0]))
+		--s;
+	s++;
 	prev = *s;
 	*s = '\0';
 
@@ -244,7 +251,6 @@
 	if (ops->target.raw == NULL)
 		goto out_free_source;
 
-	comment = strchr(s, '#');
 	if (comment == NULL)
 		return 0;
 
@@ -899,10 +905,8 @@
 	struct kcore_extract kce;
 	bool delete_extract = false;
 
-	if (filename) {
-		snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
-			 symbol_conf.symfs, filename);
-	}
+	if (filename)
+		symbol__join_symfs(symfs_filename, filename);
 
 	if (filename == NULL) {
 		if (dso->has_build_id) {
@@ -922,8 +926,7 @@
 		 * DSO is the same as when 'perf record' ran.
 		 */
 		filename = (char *)dso->long_name;
-		snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
-			 symbol_conf.symfs, filename);
+		symbol__join_symfs(symfs_filename, filename);
 		free_filename = false;
 	}
 
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index 7b176dd..5cf9e1b 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -22,6 +22,7 @@
 extern int perf_default_config(const char *, const char *, void *);
 extern int perf_config(config_fn_t fn, void *);
 extern int perf_config_int(const char *, const char *);
+extern u64 perf_config_u64(const char *, const char *);
 extern int perf_config_bool(const char *, const char *);
 extern int config_error_nonbool(const char *);
 extern const char *perf_config_dirname(const char *, const char *);
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 437ee09..08f0fbf 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -28,74 +28,55 @@
 int
 parse_callchain_report_opt(const char *arg)
 {
-	char *tok, *tok2;
+	char *tok;
 	char *endptr;
+	bool minpcnt_set = false;
 
 	symbol_conf.use_callchain = true;
 
 	if (!arg)
 		return 0;
 
-	tok = strtok((char *)arg, ",");
-	if (!tok)
-		return -1;
+	while ((tok = strtok((char *)arg, ",")) != NULL) {
+		if (!strncmp(tok, "none", strlen(tok))) {
+			callchain_param.mode = CHAIN_NONE;
+			symbol_conf.use_callchain = false;
+			return 0;
+		}
 
-	/* get the output mode */
-	if (!strncmp(tok, "graph", strlen(arg))) {
-		callchain_param.mode = CHAIN_GRAPH_ABS;
+		/* try to get the output mode */
+		if (!strncmp(tok, "graph", strlen(tok)))
+			callchain_param.mode = CHAIN_GRAPH_ABS;
+		else if (!strncmp(tok, "flat", strlen(tok)))
+			callchain_param.mode = CHAIN_FLAT;
+		else if (!strncmp(tok, "fractal", strlen(tok)))
+			callchain_param.mode = CHAIN_GRAPH_REL;
+		/* try to get the call chain order */
+		else if (!strncmp(tok, "caller", strlen(tok)))
+			callchain_param.order = ORDER_CALLER;
+		else if (!strncmp(tok, "callee", strlen(tok)))
+			callchain_param.order = ORDER_CALLEE;
+		/* try to get the sort key */
+		else if (!strncmp(tok, "function", strlen(tok)))
+			callchain_param.key = CCKEY_FUNCTION;
+		else if (!strncmp(tok, "address", strlen(tok)))
+			callchain_param.key = CCKEY_ADDRESS;
+		/* try to get the min percent */
+		else if (!minpcnt_set) {
+			callchain_param.min_percent = strtod(tok, &endptr);
+			if (tok == endptr)
+				return -1;
+			minpcnt_set = true;
+		} else {
+			/* try print limit at last */
+			callchain_param.print_limit = strtoul(tok, &endptr, 0);
+			if (tok == endptr)
+				return -1;
+		}
 
-	} else if (!strncmp(tok, "flat", strlen(arg))) {
-		callchain_param.mode = CHAIN_FLAT;
-	} else if (!strncmp(tok, "fractal", strlen(arg))) {
-		callchain_param.mode = CHAIN_GRAPH_REL;
-	} else if (!strncmp(tok, "none", strlen(arg))) {
-		callchain_param.mode = CHAIN_NONE;
-		symbol_conf.use_callchain = false;
-		return 0;
-	} else {
-		return -1;
+		arg = NULL;
 	}
 
-	/* get the min percentage */
-	tok = strtok(NULL, ",");
-	if (!tok)
-		goto setup;
-
-	callchain_param.min_percent = strtod(tok, &endptr);
-	if (tok == endptr)
-		return -1;
-
-	/* get the print limit */
-	tok2 = strtok(NULL, ",");
-	if (!tok2)
-		goto setup;
-
-	if (tok2[0] != 'c') {
-		callchain_param.print_limit = strtoul(tok2, &endptr, 0);
-		tok2 = strtok(NULL, ",");
-		if (!tok2)
-			goto setup;
-	}
-
-	/* get the call chain order */
-	if (!strncmp(tok2, "caller", strlen("caller")))
-		callchain_param.order = ORDER_CALLER;
-	else if (!strncmp(tok2, "callee", strlen("callee")))
-		callchain_param.order = ORDER_CALLEE;
-	else
-		return -1;
-
-	/* Get the sort key */
-	tok2 = strtok(NULL, ",");
-	if (!tok2)
-		goto setup;
-	if (!strncmp(tok2, "function", strlen("function")))
-		callchain_param.key = CCKEY_FUNCTION;
-	else if (!strncmp(tok2, "address", strlen("address")))
-		callchain_param.key = CCKEY_ADDRESS;
-	else
-		return -1;
-setup:
 	if (callchain_register_param(&callchain_param) < 0) {
 		pr_err("Can't register callchain params\n");
 		return -1;
diff --git a/tools/perf/util/cloexec.c b/tools/perf/util/cloexec.c
index c5d05ec..47b78b3 100644
--- a/tools/perf/util/cloexec.c
+++ b/tools/perf/util/cloexec.c
@@ -1,7 +1,9 @@
+#include <sched.h>
 #include "util.h"
 #include "../perf.h"
 #include "cloexec.h"
 #include "asm/bug.h"
+#include "debug.h"
 
 static unsigned long flag = PERF_FLAG_FD_CLOEXEC;
 
@@ -9,15 +11,30 @@
 {
 	/* use 'safest' configuration as used in perf_evsel__fallback() */
 	struct perf_event_attr attr = {
-		.type = PERF_COUNT_SW_CPU_CLOCK,
+		.type = PERF_TYPE_SOFTWARE,
 		.config = PERF_COUNT_SW_CPU_CLOCK,
+		.exclude_kernel = 1,
 	};
 	int fd;
 	int err;
+	int cpu;
+	pid_t pid = -1;
+	char sbuf[STRERR_BUFSIZE];
 
-	/* check cloexec flag */
-	fd = sys_perf_event_open(&attr, 0, -1, -1,
-				 PERF_FLAG_FD_CLOEXEC);
+	cpu = sched_getcpu();
+	if (cpu < 0)
+		cpu = 0;
+
+	while (1) {
+		/* check cloexec flag */
+		fd = sys_perf_event_open(&attr, pid, cpu, -1,
+					 PERF_FLAG_FD_CLOEXEC);
+		if (fd < 0 && pid == -1 && errno == EACCES) {
+			pid = 0;
+			continue;
+		}
+		break;
+	}
 	err = errno;
 
 	if (fd >= 0) {
@@ -25,17 +42,17 @@
 		return 1;
 	}
 
-	WARN_ONCE(err != EINVAL,
+	WARN_ONCE(err != EINVAL && err != EBUSY,
 		  "perf_event_open(..., PERF_FLAG_FD_CLOEXEC) failed with unexpected error %d (%s)\n",
-		  err, strerror(err));
+		  err, strerror_r(err, sbuf, sizeof(sbuf)));
 
 	/* not supported, confirm error related to PERF_FLAG_FD_CLOEXEC */
-	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+	fd = sys_perf_event_open(&attr, pid, cpu, -1, 0);
 	err = errno;
 
-	if (WARN_ONCE(fd < 0,
+	if (WARN_ONCE(fd < 0 && err != EBUSY,
 		      "perf_event_open(..., 0) failed unexpectedly with error %d (%s)\n",
-		      err, strerror(err)))
+		      err, strerror_r(err, sbuf, sizeof(sbuf))))
 		return -1;
 
 	close(fd);
diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c
index 87b8672..f465418 100644
--- a/tools/perf/util/color.c
+++ b/tools/perf/util/color.c
@@ -335,3 +335,19 @@
 	va_end(args);
 	return value_color_snprintf(bf, size, fmt, percent);
 }
+
+int percent_color_len_snprintf(char *bf, size_t size, const char *fmt, ...)
+{
+	va_list args;
+	int len;
+	double percent;
+	const char *color;
+
+	va_start(args, fmt);
+	len = va_arg(args, int);
+	percent = va_arg(args, double);
+	va_end(args);
+
+	color = get_percent_color(percent);
+	return color_snprintf(bf, size, color, fmt, len, percent);
+}
diff --git a/tools/perf/util/color.h b/tools/perf/util/color.h
index 7ff30a6..0a594b8 100644
--- a/tools/perf/util/color.h
+++ b/tools/perf/util/color.h
@@ -41,6 +41,7 @@
 int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf);
 int value_color_snprintf(char *bf, size_t size, const char *fmt, double value);
 int percent_color_snprintf(char *bf, size_t size, const char *fmt, ...);
+int percent_color_len_snprintf(char *bf, size_t size, const char *fmt, ...);
 int percent_color_fprintf(FILE *fp, const char *fmt, double percent);
 const char *get_percent_color(double percent);
 
diff --git a/tools/perf/util/comm.c b/tools/perf/util/comm.c
index f9e7776..b2bb59d 100644
--- a/tools/perf/util/comm.c
+++ b/tools/perf/util/comm.c
@@ -74,7 +74,7 @@
 	return new;
 }
 
-struct comm *comm__new(const char *str, u64 timestamp)
+struct comm *comm__new(const char *str, u64 timestamp, bool exec)
 {
 	struct comm *comm = zalloc(sizeof(*comm));
 
@@ -82,6 +82,7 @@
 		return NULL;
 
 	comm->start = timestamp;
+	comm->exec = exec;
 
 	comm->comm_str = comm_str__findnew(str, &comm_str_root);
 	if (!comm->comm_str) {
@@ -94,7 +95,7 @@
 	return comm;
 }
 
-int comm__override(struct comm *comm, const char *str, u64 timestamp)
+int comm__override(struct comm *comm, const char *str, u64 timestamp, bool exec)
 {
 	struct comm_str *new, *old = comm->comm_str;
 
@@ -106,6 +107,8 @@
 	comm_str__put(old);
 	comm->comm_str = new;
 	comm->start = timestamp;
+	if (exec)
+		comm->exec = true;
 
 	return 0;
 }
diff --git a/tools/perf/util/comm.h b/tools/perf/util/comm.h
index fac5bd5..51c10ab 100644
--- a/tools/perf/util/comm.h
+++ b/tools/perf/util/comm.h
@@ -11,11 +11,13 @@
 	struct comm_str *comm_str;
 	u64 start;
 	struct list_head list;
+	bool exec;
 };
 
 void comm__free(struct comm *comm);
-struct comm *comm__new(const char *str, u64 timestamp);
+struct comm *comm__new(const char *str, u64 timestamp, bool exec);
 const char *comm__str(const struct comm *comm);
-int comm__override(struct comm *comm, const char *str, u64 timestamp);
+int comm__override(struct comm *comm, const char *str, u64 timestamp,
+		   bool exec);
 
 #endif  /* __PERF_COMM_H */
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 1e5e2e5..9970b8b 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -286,6 +286,21 @@
 	return 0;
 }
 
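+/*
+ * Parse a long long config value, honouring unit suffixes (e.g. "16m")
+ * via parse_unit_factor().  Returns 1 on success, 0 on failure.
+ */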
+static int perf_parse_llong(const char *value, long long *ret)
+{
+	if (value && *value) {
+		char *end;
+		long long val = strtoll(value, &end, 0);
+		unsigned long factor = 1;
+
+		if (!parse_unit_factor(end, &factor))
+			return 0;
+		*ret = val * factor;
+		return 1;
+	}
+	return 0;
+}
+
 static int perf_parse_long(const char *value, long *ret)
 {
 	if (value && *value) {
@@ -307,6 +322,15 @@
 	die("bad config value for '%s'", name);
 }
 
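+/* Parse a u64 config value (unit suffixes like "16m" allowed); dies on bad input. */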
+u64 perf_config_u64(const char *name, const char *value)
+{
+	long long ret = 0;
+
+	if (!perf_parse_llong(value, &ret))
+		die_bad_config(name);
+	return (u64) ret;
+}
+
 int perf_config_int(const char *name, const char *value)
 {
 	long ret = 0;
diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
index 29d720c..1921942 100644
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@@ -50,12 +50,14 @@
 {
 	struct stat st;
 	int fd;
+	char sbuf[STRERR_BUFSIZE];
 
 	fd = open(file->path, O_RDONLY);
 	if (fd < 0) {
 		int err = errno;
 
-		pr_err("failed to open %s: %s", file->path, strerror(err));
+		pr_err("failed to open %s: %s", file->path,
+			strerror_r(err, sbuf, sizeof(sbuf)));
 		if (err == ENOENT && !strcmp(file->path, "perf.data"))
 			pr_err("  (try 'perf record' first)");
 		pr_err("\n");
@@ -88,6 +90,7 @@
 static int open_file_write(struct perf_data_file *file)
 {
 	int fd;
+	char sbuf[STRERR_BUFSIZE];
 
 	if (check_backup(file))
 		return -1;
@@ -95,7 +98,8 @@
 	fd = open(file->path, O_CREAT|O_RDWR|O_TRUNC, S_IRUSR|S_IWUSR);
 
 	if (fd < 0)
-		pr_err("failed to open %s : %s\n", file->path, strerror(errno));
+		pr_err("failed to open %s : %s\n", file->path,
+			strerror_r(errno, sbuf, sizeof(sbuf)));
 
 	return fd;
 }
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index 71d4193..ba357f3 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -13,8 +13,12 @@
 #include "util.h"
 #include "target.h"
 
+#define NSECS_PER_SEC  1000000000ULL
+#define NSECS_PER_USEC 1000ULL
+
 int verbose;
 bool dump_trace = false, quiet = false;
+int debug_ordered_events;
 
 static int _eprintf(int level, int var, const char *fmt, va_list args)
 {
@@ -42,6 +46,35 @@
 	return ret;
 }
 
+static int __eprintf_time(u64 t, const char *fmt, va_list args)
+{
+	int ret = 0;
+	u64 secs, usecs, nsecs = t;
+
+	secs   = nsecs / NSECS_PER_SEC;
+	nsecs -= secs  * NSECS_PER_SEC;
+	usecs  = nsecs / NSECS_PER_USEC;
+
+	ret = fprintf(stderr, "[%13" PRIu64 ".%06" PRIu64 "] ",
+		      secs, usecs);
+	ret += vfprintf(stderr, fmt, args);
+	return ret;
+}
+
+int eprintf_time(int level, int var, u64 t, const char *fmt, ...)
+{
+	int ret = 0;
+	va_list args;
+
+	if (var >= level) {
+		va_start(args, fmt);
+		ret = __eprintf_time(t, fmt, args);
+		va_end(args);
+	}
+
+	return ret;
+}
+
 /*
  * Overloading libtraceevent standard info print
  * function, display with -v in perf.
@@ -110,7 +143,8 @@
 	const char *name;
 	int *ptr;
 } debug_variables[] = {
-	{ .name = "verbose", .ptr = &verbose },
+	{ .name = "verbose",		.ptr = &verbose },
+	{ .name = "ordered-events",	.ptr = &debug_ordered_events },
 	{ .name = NULL, }
 };
 
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index 89fb6b0..be264d6 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -3,6 +3,7 @@
 #define __PERF_DEBUG_H
 
 #include <stdbool.h>
+#include <string.h>
 #include "event.h"
 #include "../ui/helpline.h"
 #include "../ui/progress.h"
@@ -10,6 +11,7 @@
 
 extern int verbose;
 extern bool quiet, dump_trace;
+extern int debug_ordered_events;
 
 #ifndef pr_fmt
 #define pr_fmt(fmt) fmt
@@ -29,6 +31,14 @@
 #define pr_debug3(fmt, ...) pr_debugN(3, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_debug4(fmt, ...) pr_debugN(4, pr_fmt(fmt), ##__VA_ARGS__)
 
+#define pr_time_N(n, var, t, fmt, ...) \
+	eprintf_time(n, var, t, fmt, ##__VA_ARGS__)
+
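+/*
+ * pr_oe_time*() prefix the message with the event timestamp as
+ * "[ seconds.microseconds] ", gated on the "ordered-events" debug
+ * variable (see perf_debug_option()).
+ */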
+#define pr_oe_time(t, fmt, ...)  pr_time_N(1, debug_ordered_events, t, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_oe_time2(t, fmt, ...) pr_time_N(2, debug_ordered_events, t, pr_fmt(fmt), ##__VA_ARGS__)
+
+#define STRERR_BUFSIZE	128	/* For the buffer size of strerror_r */
+
 int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
 void trace_event(union perf_event *event);
 
@@ -38,6 +48,7 @@
 void pr_stat(const char *fmt, ...);
 
 int eprintf(int level, int var, const char *fmt, ...) __attribute__((format(printf, 3, 4)));
+int eprintf_time(int level, int var, u64 t, const char *fmt, ...) __attribute__((format(printf, 4, 5)));
 
 int perf_debug_option(const char *str);
 
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 90d02c66..55e39dc 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -37,6 +37,7 @@
 {
 	char build_id_hex[BUILD_ID_SIZE * 2 + 1];
 	int ret = 0;
+	size_t len;
 
 	switch (type) {
 	case DSO_BINARY_TYPE__DEBUGLINK: {
@@ -60,26 +61,25 @@
 		break;
 
 	case DSO_BINARY_TYPE__FEDORA_DEBUGINFO:
-		snprintf(filename, size, "%s/usr/lib/debug%s.debug",
-			 symbol_conf.symfs, dso->long_name);
+		len = __symbol__join_symfs(filename, size, "/usr/lib/debug");
+		snprintf(filename + len, size - len, "%s.debug", dso->long_name);
 		break;
 
 	case DSO_BINARY_TYPE__UBUNTU_DEBUGINFO:
-		snprintf(filename, size, "%s/usr/lib/debug%s",
-			 symbol_conf.symfs, dso->long_name);
+		len = __symbol__join_symfs(filename, size, "/usr/lib/debug");
+		snprintf(filename + len, size - len, "%s", dso->long_name);
 		break;
 
 	case DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO:
 	{
 		const char *last_slash;
-		size_t len;
 		size_t dir_size;
 
 		last_slash = dso->long_name + dso->long_name_len;
 		while (last_slash != dso->long_name && *last_slash != '/')
 			last_slash--;
 
-		len = scnprintf(filename, size, "%s", symbol_conf.symfs);
+		len = __symbol__join_symfs(filename, size, "");
 		dir_size = last_slash - dso->long_name + 2;
 		if (dir_size > (size - len)) {
 			ret = -1;
@@ -100,26 +100,24 @@
 		build_id__sprintf(dso->build_id,
 				  sizeof(dso->build_id),
 				  build_id_hex);
-		snprintf(filename, size,
-			 "%s/usr/lib/debug/.build-id/%.2s/%s.debug",
-			 symbol_conf.symfs, build_id_hex, build_id_hex + 2);
+		len = __symbol__join_symfs(filename, size, "/usr/lib/debug/.build-id/");
+		snprintf(filename + len, size - len, "%.2s/%s.debug",
+			 build_id_hex, build_id_hex + 2);
 		break;
 
 	case DSO_BINARY_TYPE__VMLINUX:
 	case DSO_BINARY_TYPE__GUEST_VMLINUX:
 	case DSO_BINARY_TYPE__SYSTEM_PATH_DSO:
-		snprintf(filename, size, "%s%s",
-			 symbol_conf.symfs, dso->long_name);
+		__symbol__join_symfs(filename, size, dso->long_name);
 		break;
 
 	case DSO_BINARY_TYPE__GUEST_KMODULE:
-		snprintf(filename, size, "%s%s%s", symbol_conf.symfs,
-			 root_dir, dso->long_name);
+		path__join3(filename, size, symbol_conf.symfs,
+			    root_dir, dso->long_name);
 		break;
 
 	case DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE:
-		snprintf(filename, size, "%s%s", symbol_conf.symfs,
-			 dso->long_name);
+		__symbol__join_symfs(filename, size, dso->long_name);
 		break;
 
 	case DSO_BINARY_TYPE__KCORE:
@@ -164,13 +162,15 @@
 static int do_open(char *name)
 {
 	int fd;
+	char sbuf[STRERR_BUFSIZE];
 
 	do {
 		fd = open(name, O_RDONLY);
 		if (fd >= 0)
 			return fd;
 
-		pr_debug("dso open failed, mmap: %s\n", strerror(errno));
+		pr_debug("dso open failed, mmap: %s\n",
+			 strerror_r(errno, sbuf, sizeof(sbuf)));
 		if (!dso__data_open_cnt || errno != EMFILE)
 			break;
 
@@ -532,10 +532,12 @@
 static int data_file_size(struct dso *dso)
 {
 	struct stat st;
+	char sbuf[STRERR_BUFSIZE];
 
 	if (!dso->data.file_size) {
 		if (fstat(dso->data.fd, &st)) {
-			pr_err("dso mmap failed, fstat: %s\n", strerror(errno));
+			pr_err("dso mmap failed, fstat: %s\n",
+				strerror_r(errno, sbuf, sizeof(sbuf)));
 			return -1;
 		}
 		dso->data.file_size = st.st_size;
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 1398c83..ed55819 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -784,9 +784,9 @@
 		 * "[vdso]" dso, but for now lets use the old trick of looking
 		 * in the whole kernel symbol list.
 		 */
-		if ((long long)al->addr < 0 &&
-		    cpumode == PERF_RECORD_MISC_USER &&
-		    machine && mg != &machine->kmaps) {
+		if (cpumode == PERF_RECORD_MISC_USER && machine &&
+		    mg != &machine->kmaps &&
+		    machine__kernel_ip(machine, al->addr)) {
 			mg = &machine->kmaps;
 			load_map = true;
 			goto try_again;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 94d6976..7eb7107 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -156,6 +156,8 @@
 	u32 cpu;
 	u32 raw_size;
 	u64 data_src;
+	u32 flags;
+	u16 insn_len;
 	void *raw_data;
 	struct ip_callchain *callchain;
 	struct branch_stack *branch_stack;
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 814e954..a3e28b4 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -122,6 +122,7 @@
 {
 	list_add_tail(&entry->node, &evlist->entries);
 	entry->idx = evlist->nr_entries;
+	entry->tracking = !entry->idx;
 
 	if (!evlist->nr_entries++)
 		perf_evlist__set_id_pos(evlist);
@@ -265,17 +266,27 @@
 	return 0;
 }
 
+static int perf_evlist__nr_threads(struct perf_evlist *evlist,
+				   struct perf_evsel *evsel)
+{
+	if (evsel->system_wide)
+		return 1;
+	else
+		return thread_map__nr(evlist->threads);
+}
+
 void perf_evlist__disable(struct perf_evlist *evlist)
 {
 	int cpu, thread;
 	struct perf_evsel *pos;
 	int nr_cpus = cpu_map__nr(evlist->cpus);
-	int nr_threads = thread_map__nr(evlist->threads);
+	int nr_threads;
 
 	for (cpu = 0; cpu < nr_cpus; cpu++) {
 		evlist__for_each(evlist, pos) {
 			if (!perf_evsel__is_group_leader(pos) || !pos->fd)
 				continue;
+			nr_threads = perf_evlist__nr_threads(evlist, pos);
 			for (thread = 0; thread < nr_threads; thread++)
 				ioctl(FD(pos, cpu, thread),
 				      PERF_EVENT_IOC_DISABLE, 0);
@@ -288,12 +299,13 @@
 	int cpu, thread;
 	struct perf_evsel *pos;
 	int nr_cpus = cpu_map__nr(evlist->cpus);
-	int nr_threads = thread_map__nr(evlist->threads);
+	int nr_threads;
 
 	for (cpu = 0; cpu < nr_cpus; cpu++) {
 		evlist__for_each(evlist, pos) {
 			if (!perf_evsel__is_group_leader(pos) || !pos->fd)
 				continue;
+			nr_threads = perf_evlist__nr_threads(evlist, pos);
 			for (thread = 0; thread < nr_threads; thread++)
 				ioctl(FD(pos, cpu, thread),
 				      PERF_EVENT_IOC_ENABLE, 0);
@@ -305,12 +317,14 @@
 			       struct perf_evsel *evsel)
 {
 	int cpu, thread, err;
+	int nr_cpus = cpu_map__nr(evlist->cpus);
+	int nr_threads = perf_evlist__nr_threads(evlist, evsel);
 
 	if (!evsel->fd)
 		return 0;
 
-	for (cpu = 0; cpu < evlist->cpus->nr; cpu++) {
-		for (thread = 0; thread < evlist->threads->nr; thread++) {
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		for (thread = 0; thread < nr_threads; thread++) {
 			err = ioctl(FD(evsel, cpu, thread),
 				    PERF_EVENT_IOC_DISABLE, 0);
 			if (err)
@@ -324,12 +338,14 @@
 			      struct perf_evsel *evsel)
 {
 	int cpu, thread, err;
+	int nr_cpus = cpu_map__nr(evlist->cpus);
+	int nr_threads = perf_evlist__nr_threads(evlist, evsel);
 
 	if (!evsel->fd)
 		return -EINVAL;
 
-	for (cpu = 0; cpu < evlist->cpus->nr; cpu++) {
-		for (thread = 0; thread < evlist->threads->nr; thread++) {
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		for (thread = 0; thread < nr_threads; thread++) {
 			err = ioctl(FD(evsel, cpu, thread),
 				    PERF_EVENT_IOC_ENABLE, 0);
 			if (err)
@@ -339,11 +355,67 @@
 	return 0;
 }
 
+static int perf_evlist__enable_event_cpu(struct perf_evlist *evlist,
+					 struct perf_evsel *evsel, int cpu)
+{
+	int thread, err;
+	int nr_threads = perf_evlist__nr_threads(evlist, evsel);
+
+	if (!evsel->fd)
+		return -EINVAL;
+
+	for (thread = 0; thread < nr_threads; thread++) {
+		err = ioctl(FD(evsel, cpu, thread),
+			    PERF_EVENT_IOC_ENABLE, 0);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int perf_evlist__enable_event_thread(struct perf_evlist *evlist,
+					    struct perf_evsel *evsel,
+					    int thread)
+{
+	int cpu, err;
+	int nr_cpus = cpu_map__nr(evlist->cpus);
+
+	if (!evsel->fd)
+		return -EINVAL;
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		err = ioctl(FD(evsel, cpu, thread), PERF_EVENT_IOC_ENABLE, 0);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
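+/*
+ * idx addresses the mmap: it is a cpu index when the evlist has per-cpu
+ * mmaps, otherwise a thread index.
+ */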
+int perf_evlist__enable_event_idx(struct perf_evlist *evlist,
+				  struct perf_evsel *evsel, int idx)
+{
+	bool per_cpu_mmaps = !cpu_map__empty(evlist->cpus);
+
+	if (per_cpu_mmaps)
+		return perf_evlist__enable_event_cpu(evlist, evsel, idx);
+	else
+		return perf_evlist__enable_event_thread(evlist, evsel, idx);
+}
+
 static int perf_evlist__alloc_pollfd(struct perf_evlist *evlist)
 {
 	int nr_cpus = cpu_map__nr(evlist->cpus);
 	int nr_threads = thread_map__nr(evlist->threads);
-	int nfds = nr_cpus * nr_threads * evlist->nr_entries;
+	int nfds = 0;
+	struct perf_evsel *evsel;
+
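+	/* System-wide events get one fd per cpu, others one per cpu per thread. */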
+	list_for_each_entry(evsel, &evlist->entries, node) {
+		if (evsel->system_wide)
+			nfds += nr_cpus;
+		else
+			nfds += nr_cpus * nr_threads;
+	}
+
 	evlist->pollfd = malloc(sizeof(struct pollfd) * nfds);
 	return evlist->pollfd != NULL ? 0 : -ENOMEM;
 }
@@ -636,7 +708,12 @@
 	struct perf_evsel *evsel;
 
 	evlist__for_each(evlist, evsel) {
-		int fd = FD(evsel, cpu, thread);
+		int fd;
+
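+		/* System-wide events only have an fd on thread 0. */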
+		if (evsel->system_wide && thread)
+			continue;
+
+		fd = FD(evsel, cpu, thread);
 
 		if (*output == -1) {
 			*output = fd;
@@ -1061,6 +1138,8 @@
 	}
 
 	if (!evlist->workload.pid) {
+		int ret;
+
 		if (pipe_output)
 			dup2(2, 1);
 
@@ -1078,8 +1157,22 @@
 		/*
 		 * Wait until the parent tells us to go.
 		 */
-		if (read(go_pipe[0], &bf, 1) == -1)
-			perror("unable to read pipe");
+		ret = read(go_pipe[0], &bf, 1);
+		/*
+		 * The parent will ask for the execvp() to be performed by
+		 * writing exactly one byte to workload.cork_fd, usually via
+		 * perf_evlist__start_workload().
+		 *
+		 * For cancelling the workload without actually running it,
+		 * the parent will just close workload.cork_fd without writing
+		 * anything, i.e. read() will return zero and we just exit()
+		 * here.
+		 */
+		if (ret != 1) {
+			if (ret == -1)
+				perror("unable to read pipe");
+			exit(ret);
+		}
 
 		execvp(argv[0], (char **)argv);
 
@@ -1202,7 +1295,7 @@
 			       int err, char *buf, size_t size)
 {
 	int printed, value;
-	char sbuf[128], *emsg = strerror_r(err, sbuf, sizeof(sbuf));
+	char sbuf[STRERR_BUFSIZE], *emsg = strerror_r(err, sbuf, sizeof(sbuf));
 
 	switch (err) {
 	case EACCES:
@@ -1250,3 +1343,19 @@
 
 	list_splice(&move, &evlist->entries);
 }
+
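+/*
+ * Make tracking_evsel the only event that emits side-band tracking
+ * records (mmap, comm, task); the tracking flag is cleared on every
+ * other event in the list.
+ */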
+void perf_evlist__set_tracking_event(struct perf_evlist *evlist,
+				     struct perf_evsel *tracking_evsel)
+{
+	struct perf_evsel *evsel;
+
+	if (tracking_evsel->tracking)
+		return;
+
+	evlist__for_each(evlist, evsel) {
+		if (evsel != tracking_evsel)
+			evsel->tracking = false;
+	}
+
+	tracking_evsel->tracking = true;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index f5173cd..106de53 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -122,6 +122,8 @@
 			       struct perf_evsel *evsel);
 int perf_evlist__enable_event(struct perf_evlist *evlist,
 			      struct perf_evsel *evsel);
+int perf_evlist__enable_event_idx(struct perf_evlist *evlist,
+				  struct perf_evsel *evsel, int idx);
 
 void perf_evlist__set_selected(struct perf_evlist *evlist,
 			       struct perf_evsel *evsel);
@@ -262,4 +264,7 @@
 #define evlist__for_each_safe(evlist, tmp, evsel) \
 	__evlist__for_each_safe(&(evlist)->entries, tmp, evsel)
 
+void perf_evlist__set_tracking_event(struct perf_evlist *evlist,
+				     struct perf_evsel *tracking_evsel);
+
 #endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 21a373e..b38de58 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -162,6 +162,7 @@
 		      struct perf_event_attr *attr, int idx)
 {
 	evsel->idx	   = idx;
+	evsel->tracking	   = !idx;
 	evsel->attr	   = *attr;
 	evsel->leader	   = evsel;
 	evsel->unit	   = "";
@@ -561,7 +562,7 @@
 {
 	struct perf_evsel *leader = evsel->leader;
 	struct perf_event_attr *attr = &evsel->attr;
-	int track = !evsel->idx; /* only the first counter needs these */
+	int track = evsel->tracking;
 	bool per_cpu = opts->target.default_per_cpu && !opts->target.per_thread;
 
 	attr->sample_id_all = perf_missing_features.sample_id_all ? 0 : 1;
@@ -633,9 +634,12 @@
 	if (opts->period)
 		perf_evsel__set_sample_bit(evsel, PERIOD);
 
-	if (!perf_missing_features.sample_id_all &&
-	    (opts->sample_time || !opts->no_inherit ||
-	     target__has_cpu(&opts->target) || per_cpu))
+	/*
+	 * When the user explicitly disabled time, don't force it here.
+	 */
+	if (opts->sample_time &&
+	    (!perf_missing_features.sample_id_all &&
+	    (!opts->no_inherit || target__has_cpu(&opts->target) || per_cpu)))
 		perf_evsel__set_sample_bit(evsel, TIME);
 
 	if (opts->raw_samples && !evsel->no_aux_samples) {
@@ -692,6 +696,10 @@
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 {
 	int cpu, thread;
+
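+	/* System-wide events are opened once per cpu, not per thread. */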
+	if (evsel->system_wide)
+		nthreads = 1;
+
 	evsel->fd = xyarray__new(ncpus, nthreads, sizeof(int));
 
 	if (evsel->fd) {
@@ -710,6 +718,9 @@
 {
 	int cpu, thread;
 
+	if (evsel->system_wide)
+		nthreads = 1;
+
 	for (cpu = 0; cpu < ncpus; cpu++) {
 		for (thread = 0; thread < nthreads; thread++) {
 			int fd = FD(evsel, cpu, thread),
@@ -740,6 +751,9 @@
 
 int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads)
 {
+	if (evsel->system_wide)
+		nthreads = 1;
+
 	evsel->sample_id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id));
 	if (evsel->sample_id == NULL)
 		return -ENOMEM;
@@ -784,6 +798,9 @@
 {
 	int cpu, thread;
 
+	if (evsel->system_wide)
+		nthreads = 1;
+
 	for (cpu = 0; cpu < ncpus; cpu++)
 		for (thread = 0; thread < nthreads; ++thread) {
 			close(FD(evsel, cpu, thread));
@@ -872,6 +889,9 @@
 	int cpu, thread;
 	struct perf_counts_values *aggr = &evsel->counts->aggr, count;
 
+	if (evsel->system_wide)
+		nthreads = 1;
+
 	aggr->val = aggr->ena = aggr->run = 0;
 
 	for (cpu = 0; cpu < ncpus; cpu++) {
@@ -994,13 +1014,18 @@
 static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 			      struct thread_map *threads)
 {
-	int cpu, thread;
+	int cpu, thread, nthreads;
 	unsigned long flags = PERF_FLAG_FD_CLOEXEC;
 	int pid = -1, err;
 	enum { NO_CHANGE, SET_TO_MAX, INCREASED_MAX } set_rlimit = NO_CHANGE;
 
+	if (evsel->system_wide)
+		nthreads = 1;
+	else
+		nthreads = threads->nr;
+
 	if (evsel->fd == NULL &&
-	    perf_evsel__alloc_fd(evsel, cpus->nr, threads->nr) < 0)
+	    perf_evsel__alloc_fd(evsel, cpus->nr, nthreads) < 0)
 		return -ENOMEM;
 
 	if (evsel->cgrp) {
@@ -1024,10 +1049,10 @@
 
 	for (cpu = 0; cpu < cpus->nr; cpu++) {
 
-		for (thread = 0; thread < threads->nr; thread++) {
+		for (thread = 0; thread < nthreads; thread++) {
 			int group_fd;
 
-			if (!evsel->cgrp)
+			if (!evsel->cgrp && !evsel->system_wide)
 				pid = threads->map[thread];
 
 			group_fd = get_group_fd(evsel, cpu, thread);
@@ -1100,7 +1125,7 @@
 			close(FD(evsel, cpu, thread));
 			FD(evsel, cpu, thread) = -1;
 		}
-		thread = threads->nr;
+		thread = nthreads;
 	} while (--cpu >= 0);
 	return err;
 }
@@ -2002,6 +2027,8 @@
 int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
 			      int err, char *msg, size_t size)
 {
+	char sbuf[STRERR_BUFSIZE];
+
 	switch (err) {
 	case EPERM:
 	case EACCES:
@@ -2036,13 +2063,20 @@
 	"No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it.");
 #endif
 		break;
+	case EBUSY:
+		if (find_process("oprofiled"))
+			return scnprintf(msg, size,
+	"The PMU counters are busy/taken by another profiler.\n"
+	"We found oprofile daemon running, please stop it and try again.");
+		break;
 	default:
 		break;
 	}
 
 	return scnprintf(msg, size,
-	"The sys_perf_event_open() syscall returned with %d (%s) for event (%s).  \n"
+	"The sys_perf_event_open() syscall returned with %d (%s) for event (%s).\n"
 	"/bin/dmesg may provide additional information.\n"
 	"No CONFIG_PERF_EVENTS=y kernel support configured?\n",
-			 err, strerror(err), perf_evsel__name(evsel));
+			 err, strerror_r(err, sbuf, sizeof(sbuf)),
+			 perf_evsel__name(evsel));
 }
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index d7f93ce..7bc314b 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -85,6 +85,8 @@
 	bool 			needs_swap;
 	bool			no_aux_samples;
 	bool			immediate;
+	bool			system_wide;
+	bool			tracking;
 	/* parse modifier helper */
 	int			exclude_GH;
 	int			nr_members;
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 30df618..86569fa 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -277,6 +277,28 @@
 	}
 }
 
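+/*
+ * Remove and free every entry; unlike hists__decay_entries(), nothing
+ * survives, and entries are also unlinked from the collapsed tree when
+ * collapsing is in use.
+ */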
+void hists__delete_entries(struct hists *hists)
+{
+	struct rb_node *next = rb_first(&hists->entries);
+	struct hist_entry *n;
+
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, &hists->entries);
+
+		if (sort__need_collapse)
+			rb_erase(&n->rb_node_in, &hists->entries_collapsed);
+
+		--hists->nr_entries;
+		if (!n->filtered)
+			--hists->nr_non_filtered_entries;
+
+		hist_entry__free(n);
+	}
+}
+
 /*
  * histogram, sorted on item, collects periods
  */
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 742f49a..8c9c70e 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -152,6 +152,7 @@
 void hists__collapse_resort(struct hists *hists, struct ui_progress *prog);
 
 void hists__decay_entries(struct hists *hists, bool zap_user, bool zap_kernel);
+void hists__delete_entries(struct hists *hists);
 void hists__output_recalc_col_len(struct hists *hists, int max_rows);
 
 u64 hists__total_period(struct hists *hists);
@@ -192,6 +193,7 @@
 };
 
 struct perf_hpp_fmt {
+	const char *name;
 	int (*header)(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
 		      struct perf_evsel *evsel);
 	int (*width)(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
@@ -207,6 +209,8 @@
 	struct list_head list;
 	struct list_head sort_list;
 	bool elide;
+	int len;
+	int user_len;
 };
 
 extern struct list_head perf_hpp__list;
@@ -261,17 +265,19 @@
 }
 
 void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists);
+void perf_hpp__reset_sort_width(struct perf_hpp_fmt *fmt, struct hists *hists);
+void perf_hpp__set_user_width(const char *width_list_str);
 
 typedef u64 (*hpp_field_fn)(struct hist_entry *he);
 typedef int (*hpp_callback_fn)(struct perf_hpp *hpp, bool front);
 typedef int (*hpp_snprint_fn)(struct perf_hpp *hpp, const char *fmt, ...);
 
-int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he,
-	       hpp_field_fn get_field, const char *fmt,
-	       hpp_snprint_fn print_fn, bool fmt_percent);
-int __hpp__fmt_acc(struct perf_hpp *hpp, struct hist_entry *he,
-		   hpp_field_fn get_field, const char *fmt,
-		   hpp_snprint_fn print_fn, bool fmt_percent);
+int hpp__fmt(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
+	     struct hist_entry *he, hpp_field_fn get_field,
+	     const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent);
+int hpp__fmt_acc(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
+		 struct hist_entry *he, hpp_field_fn get_field,
+		 const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent);
 
 static inline void advance_hpp(struct perf_hpp *hpp, int inc)
 {
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 16bba9f..b2ec38b 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -31,6 +31,8 @@
 
 	machine->symbol_filter = NULL;
 	machine->id_hdr_size = 0;
+	machine->comm_exec = false;
+	machine->kernel_start = 0;
 
 	machine->root_dir = strdup(root_dir);
 	if (machine->root_dir == NULL)
@@ -179,6 +181,19 @@
 	}
 }
 
+void machines__set_comm_exec(struct machines *machines, bool comm_exec)
+{
+	struct rb_node *nd;
+
+	machines->host.comm_exec = comm_exec;
+
+	for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) {
+		struct machine *machine = rb_entry(nd, struct machine, rb_node);
+
+		machine->comm_exec = comm_exec;
+	}
+}
+
 struct machine *machines__find(struct machines *machines, pid_t pid)
 {
 	struct rb_node **p = &machines->guests.rb_node;
@@ -398,17 +413,31 @@
 	return __machine__findnew_thread(machine, pid, tid, false);
 }
 
+struct comm *machine__thread_exec_comm(struct machine *machine,
+				       struct thread *thread)
+{
+	if (machine->comm_exec)
+		return thread__exec_comm(thread);
+	else
+		return thread__comm(thread);
+}
+
 int machine__process_comm_event(struct machine *machine, union perf_event *event,
 				struct perf_sample *sample)
 {
 	struct thread *thread = machine__findnew_thread(machine,
 							event->comm.pid,
 							event->comm.tid);
+	bool exec = event->header.misc & PERF_RECORD_MISC_COMM_EXEC;
+
+	if (exec)
+		machine->comm_exec = true;
 
 	if (dump_trace)
 		perf_event__fprintf_comm(event, stdout);
 
-	if (thread == NULL || thread__set_comm(thread, event->comm.comm, sample->time)) {
+	if (thread == NULL ||
+	    __thread__set_comm(thread, event->comm.comm, sample->time, exec)) {
 		dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
 		return -1;
 	}
@@ -565,8 +594,8 @@
  * Returns the name of the start symbol in *symbol_name. Pass in NULL as
  * symbol_name if it's not that important.
  */
-static u64 machine__get_kernel_start_addr(struct machine *machine,
-					  const char **symbol_name)
+static u64 machine__get_running_kernel_start(struct machine *machine,
+					     const char **symbol_name)
 {
 	char filename[PATH_MAX];
 	int i;
@@ -593,7 +622,7 @@
 int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel)
 {
 	enum map_type type;
-	u64 start = machine__get_kernel_start_addr(machine, NULL);
+	u64 start = machine__get_running_kernel_start(machine, NULL);
 
 	for (type = 0; type < MAP__NR_TYPES; ++type) {
 		struct kmap *kmap;
@@ -912,7 +941,7 @@
 {
 	struct dso *kernel = machine__get_kernel(machine);
 	const char *name;
-	u64 addr = machine__get_kernel_start_addr(machine, &name);
+	u64 addr = machine__get_running_kernel_start(machine, &name);
 	if (!addr)
 		return -1;
 
@@ -1285,6 +1314,16 @@
 
 	thread__find_addr_location(thread, machine, m, MAP__VARIABLE, addr,
 				   &al);
+	if (al.map == NULL) {
+		/*
+		 * Some shared data regions have the execute bit set,
+		 * which puts their mapping in the MAP__FUNCTION type
+		 * array.  Check there as a fallback option before
+		 * dropping the sample.
+		 */
+		thread__find_addr_location(thread, machine, m, MAP__FUNCTION, addr,
+					   &al);
+	}
+
 	ams->addr = addr;
 	ams->al_addr = al.addr;
 	ams->sym = al.sym;
@@ -1531,3 +1570,25 @@
 
 	return 0;
 }
+
+int machine__get_kernel_start(struct machine *machine)
+{
+	struct map *map = machine__kernel_map(machine, MAP__FUNCTION);
+	int err = 0;
+
+	/*
+	 * The only addresses above 2^63 are kernel addresses of a 64-bit
+	 * kernel.  Note that addresses are unsigned so that on a 32-bit system
+	 * all addresses including kernel addresses are less than 2^32.  In
+	 * that case (32-bit system), if the kernel mapping is unknown, all
+	 * addresses will be assumed to be in user space - see
+	 * machine__kernel_ip().
+	 */
+	machine->kernel_start = 1ULL << 63;
+	if (map) {
+		err = map__load(map, machine->symbol_filter);
+		if (map->start)
+			machine->kernel_start = map->start;
+	}
+	return err;
+}
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index b972824..6a6bcc1 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -26,6 +26,7 @@
 	struct rb_node	  rb_node;
 	pid_t		  pid;
 	u16		  id_hdr_size;
+	bool		  comm_exec;
 	char		  *root_dir;
 	struct rb_root	  threads;
 	struct list_head  dead_threads;
@@ -35,6 +36,7 @@
 	struct list_head  kernel_dsos;
 	struct map_groups kmaps;
 	struct map	  *vmlinux_maps[MAP__NR_TYPES];
+	u64		  kernel_start;
 	symbol_filter_t	  symbol_filter;
 	pid_t		  *current_tid;
 };
@@ -45,8 +47,26 @@
 	return machine->vmlinux_maps[type];
 }
 
+int machine__get_kernel_start(struct machine *machine);
+
+static inline u64 machine__kernel_start(struct machine *machine)
+{
+	if (!machine->kernel_start)
+		machine__get_kernel_start(machine);
+	return machine->kernel_start;
+}
+
+static inline bool machine__kernel_ip(struct machine *machine, u64 ip)
+{
+	u64 kernel_start = machine__kernel_start(machine);
+
+	return ip >= kernel_start;
+}
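+
+/*
+ * kernel_start is computed lazily: before the kernel map is loaded it
+ * defaults to 1ULL << 63, so on a 32-bit system every address
+ * classifies as user space (see machine__get_kernel_start()).
+ */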
+
 struct thread *machine__find_thread(struct machine *machine, pid_t pid,
 				    pid_t tid);
+struct comm *machine__thread_exec_comm(struct machine *machine,
+				       struct thread *thread);
 
 int machine__process_comm_event(struct machine *machine, union perf_event *event,
 				struct perf_sample *sample);
@@ -88,6 +108,7 @@
 
 void machines__set_symbol_filter(struct machines *machines,
 				 symbol_filter_t symbol_filter);
+void machines__set_comm_exec(struct machines *machines, bool comm_exec);
 
 struct machine *machine__new_host(void);
 int machine__init(struct machine *machine, const char *root_dir, pid_t pid);
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 31b8905..b709059 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -31,6 +31,7 @@
 static inline int is_no_dso_memory(const char *filename)
 {
 	return !strncmp(filename, "[stack", 6) ||
+	       !strncmp(filename, "/SYSV", 5)  ||
 	       !strcmp(filename, "[heap]");
 }
 
diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c
new file mode 100644
index 0000000..706ce1a
--- /dev/null
+++ b/tools/perf/util/ordered-events.c
@@ -0,0 +1,245 @@
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include "ordered-events.h"
+#include "evlist.h"
+#include "session.h"
+#include "asm/bug.h"
+#include "debug.h"
+
+#define pr_N(n, fmt, ...) \
+	eprintf(n, debug_ordered_events, fmt, ##__VA_ARGS__)
+
+#define pr(fmt, ...) pr_N(1, pr_fmt(fmt), ##__VA_ARGS__)
+
+static void queue_event(struct ordered_events *oe, struct ordered_event *new)
+{
+	struct ordered_event *last = oe->last;
+	u64 timestamp = new->timestamp;
+	struct list_head *p;
+
+	++oe->nr_events;
+	oe->last = new;
+
+	pr_oe_time2(timestamp, "queue_event nr_events %u\n", oe->nr_events);
+
+	if (!last) {
+		list_add(&new->list, &oe->events);
+		oe->max_timestamp = timestamp;
+		return;
+	}
+
+	/*
+	 * 'last' points at the most recently queued event, which may sit
+	 * anywhere in the list.  New events usually carry a nearby
+	 * timestamp, so start the search from there.
+	 */
+	if (last->timestamp <= timestamp) {
+		while (last->timestamp <= timestamp) {
+			p = last->list.next;
+			if (p == &oe->events) {
+				list_add_tail(&new->list, &oe->events);
+				oe->max_timestamp = timestamp;
+				return;
+			}
+			last = list_entry(p, struct ordered_event, list);
+		}
+		list_add_tail(&new->list, &last->list);
+	} else {
+		while (last->timestamp > timestamp) {
+			p = last->list.prev;
+			if (p == &oe->events) {
+				list_add(&new->list, &oe->events);
+				return;
+			}
+			last = list_entry(p, struct ordered_event, list);
+		}
+		list_add(&new->list, &last->list);
+	}
+}
+
+#define MAX_SAMPLE_BUFFER	(64 * 1024 / sizeof(struct ordered_event))
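+/*
+ * Reuse an event from the cache of flushed entries if possible, else
+ * carve the next slot from the current buffer, else malloc a fresh
+ * buffer until max_alloc_size would be exceeded.
+ */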
+static struct ordered_event *alloc_event(struct ordered_events *oe)
+{
+	struct list_head *cache = &oe->cache;
+	struct ordered_event *new = NULL;
+
+	if (!list_empty(cache)) {
+		new = list_entry(cache->next, struct ordered_event, list);
+		list_del(&new->list);
+	} else if (oe->buffer) {
+		new = oe->buffer + oe->buffer_idx;
+		if (++oe->buffer_idx == MAX_SAMPLE_BUFFER)
+			oe->buffer = NULL;
+	} else if (oe->cur_alloc_size < oe->max_alloc_size) {
+		size_t size = MAX_SAMPLE_BUFFER * sizeof(*new);
+
+		oe->buffer = malloc(size);
+		if (!oe->buffer)
+			return NULL;
+
+		pr("alloc size %" PRIu64 "B (+%zu), max %" PRIu64 "B\n",
+		   oe->cur_alloc_size, size, oe->max_alloc_size);
+
+		oe->cur_alloc_size += size;
+		list_add(&oe->buffer->list, &oe->to_free);
+
+		/* First entry is abused to maintain the to_free list. */
+		oe->buffer_idx = 2;
+		new = oe->buffer + 1;
+	} else {
+		pr("allocation limit reached %" PRIu64 "B\n", oe->max_alloc_size);
+	}
+
+	return new;
+}
+
+struct ordered_event *
+ordered_events__new(struct ordered_events *oe, u64 timestamp)
+{
+	struct ordered_event *new;
+
+	new = alloc_event(oe);
+	if (new) {
+		new->timestamp = timestamp;
+		queue_event(oe, new);
+	}
+
+	return new;
+}
+
+void ordered_events__delete(struct ordered_events *oe, struct ordered_event *event)
+{
+	list_move(&event->list, &oe->cache);
+	oe->nr_events--;
+}
+
+static int __ordered_events__flush(struct perf_session *s,
+				   struct perf_tool *tool)
+{
+	struct ordered_events *oe = &s->ordered_events;
+	struct list_head *head = &oe->events;
+	struct ordered_event *tmp, *iter;
+	struct perf_sample sample;
+	u64 limit = oe->next_flush;
+	u64 last_ts = oe->last ? oe->last->timestamp : 0ULL;
+	bool show_progress = limit == ULLONG_MAX;
+	struct ui_progress prog;
+	int ret;
+
+	if (!tool->ordered_events || !limit)
+		return 0;
+
+	if (show_progress)
+		ui_progress__init(&prog, oe->nr_events, "Processing time ordered events...");
+
+	list_for_each_entry_safe(iter, tmp, head, list) {
+		if (session_done())
+			return 0;
+
+		if (iter->timestamp > limit)
+			break;
+
+		ret = perf_evlist__parse_sample(s->evlist, iter->event, &sample);
+		if (ret)
+			pr_err("Can't parse sample, err = %d\n", ret);
+		else {
+			ret = perf_session__deliver_event(s, iter->event, &sample, tool,
+							  iter->file_offset);
+			if (ret)
+				return ret;
+		}
+
+		ordered_events__delete(oe, iter);
+		oe->last_flush = iter->timestamp;
+
+		if (show_progress)
+			ui_progress__update(&prog, 1);
+	}
+
+	if (list_empty(head))
+		oe->last = NULL;
+	else if (last_ts <= limit)
+		oe->last = list_entry(head->prev, struct ordered_event, list);
+
+	return 0;
+}
+
+int ordered_events__flush(struct perf_session *s, struct perf_tool *tool,
+			  enum oe_flush how)
+{
+	struct ordered_events *oe = &s->ordered_events;
+	static const char * const str[] = {
+		"NONE",
+		"FINAL",
+		"ROUND",
+		"HALF ",
+	};
+	int err;
+
+	switch (how) {
+	case OE_FLUSH__FINAL:
+		oe->next_flush = ULLONG_MAX;
+		break;
+
+	case OE_FLUSH__HALF:
+	{
+		struct ordered_event *first, *last;
+		struct list_head *head = &oe->events;
+
+		first = list_entry(head->next, struct ordered_event, list);
+		last = oe->last;
+
+		/* Warn if we are called before any event got allocated. */
+		if (WARN_ONCE(!last || list_empty(head), "empty queue"))
+			return 0;
+
+		oe->next_flush  = first->timestamp;
+		oe->next_flush += (last->timestamp - first->timestamp) / 2;
+		break;
+	}
+
+	case OE_FLUSH__ROUND:
+	case OE_FLUSH__NONE:
+	default:
+		break;
+	}
+
+	pr_oe_time(oe->next_flush, "next_flush - ordered_events__flush PRE  %s, nr_events %u\n",
+		   str[how], oe->nr_events);
+	pr_oe_time(oe->max_timestamp, "max_timestamp\n");
+
+	err = __ordered_events__flush(s, tool);
+
+	if (!err) {
+		if (how == OE_FLUSH__ROUND)
+			oe->next_flush = oe->max_timestamp;
+
+		oe->last_flush_type = how;
+	}
+
+	pr_oe_time(oe->next_flush, "next_flush - ordered_events__flush POST %s, nr_events %u\n",
+		   str[how], oe->nr_events);
+	pr_oe_time(oe->last_flush, "last_flush\n");
+
+	return err;
+}
+
+void ordered_events__init(struct ordered_events *oe)
+{
+	INIT_LIST_HEAD(&oe->events);
+	INIT_LIST_HEAD(&oe->cache);
+	INIT_LIST_HEAD(&oe->to_free);
+	oe->max_alloc_size = (u64) -1;
+	oe->cur_alloc_size = 0;
+}
+
+void ordered_events__free(struct ordered_events *oe)
+{
+	while (!list_empty(&oe->to_free)) {
+		struct ordered_event *event;
+
+		event = list_entry(oe->to_free.next, struct ordered_event, list);
+		list_del(&event->list);
+		free(event);
+	}
+}
diff --git a/tools/perf/util/ordered-events.h b/tools/perf/util/ordered-events.h
new file mode 100644
index 0000000..3b2f205
--- /dev/null
+++ b/tools/perf/util/ordered-events.h
@@ -0,0 +1,51 @@
+#ifndef __ORDERED_EVENTS_H
+#define __ORDERED_EVENTS_H
+
+#include <linux/types.h>
+#include "tool.h"
+
+struct perf_session;
+
+struct ordered_event {
+	u64			timestamp;
+	u64			file_offset;
+	union perf_event	*event;
+	struct list_head	list;
+};
+
+enum oe_flush {
+	OE_FLUSH__NONE,
+	OE_FLUSH__FINAL,
+	OE_FLUSH__ROUND,
+	OE_FLUSH__HALF,
+};
+
+struct ordered_events {
+	u64			last_flush;
+	u64			next_flush;
+	u64			max_timestamp;
+	u64			max_alloc_size;
+	u64			cur_alloc_size;
+	struct list_head	events;
+	struct list_head	cache;
+	struct list_head	to_free;
+	struct ordered_event	*buffer;
+	struct ordered_event	*last;
+	int			buffer_idx;
+	unsigned int		nr_events;
+	enum oe_flush		last_flush_type;
+};
+
+struct ordered_event *ordered_events__new(struct ordered_events *oe, u64 timestamp);
+void ordered_events__delete(struct ordered_events *oe, struct ordered_event *event);
+int ordered_events__flush(struct perf_session *s, struct perf_tool *tool,
+			  enum oe_flush how);
+void ordered_events__init(struct ordered_events *oe);
+void ordered_events__free(struct ordered_events *oe);
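+
+/*
+ * Typical lifecycle: ordered_events__init(), ordered_events__new() for
+ * each incoming event, ordered_events__flush() with OE_FLUSH__ROUND at
+ * round boundaries and OE_FLUSH__FINAL at end of session, then
+ * ordered_events__free().
+ */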
+
+static inline
+void ordered_events__set_alloc_size(struct ordered_events *oe, u64 size)
+{
+	oe->max_alloc_size = size;
+}
+#endif /* __ORDERED_EVENTS_H */
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 1e15df1..e34c81a 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -10,6 +10,7 @@
 #include "symbol.h"
 #include "cache.h"
 #include "header.h"
+#include "debug.h"
 #include <api/fs/debugfs.h>
 #include "parse-events-bison.h"
 #define YY_EXTRA_TYPE int
@@ -1006,9 +1007,11 @@
 	struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
 	char evt_path[MAXPATHLEN];
 	char dir_path[MAXPATHLEN];
+	char sbuf[STRERR_BUFSIZE];
 
 	if (debugfs_valid_mountpoint(tracing_events_path)) {
-		printf("  [ Tracepoints not available: %s ]\n", strerror(errno));
+		printf("  [ Tracepoints not available: %s ]\n",
+			strerror_r(errno, sbuf, sizeof(sbuf)));
 		return;
 	}
 
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 7a811eb..9bf5827 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -14,8 +14,8 @@
 
 struct perf_pmu_alias {
 	char *name;
-	struct list_head terms;
-	struct list_head list;
+	struct list_head terms; /* HEAD struct parse_events_term -> list */
+	struct list_head list;  /* ELEM */
 	char unit[UNIT_MAX_LEN+1];
 	double scale;
 };
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index c14a543..1c1e2ee 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -17,9 +17,9 @@
 	char *name;
 	__u32 type;
 	struct cpu_map *cpus;
-	struct list_head format;
-	struct list_head aliases;
-	struct list_head list;
+	struct list_head format;  /* HEAD struct perf_pmu_format -> list */
+	struct list_head aliases; /* HEAD struct perf_pmu_alias -> list */
+	struct list_head list;    /* ELEM */
 };
 
 struct perf_pmu *perf_pmu__find(const char *name);
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 9a0a183..f73595f 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -79,7 +79,7 @@
 	int ret;
 
 	symbol_conf.sort_by_name = true;
-	ret = symbol__init();
+	ret = symbol__init(NULL);
 	if (ret < 0) {
 		pr_debug("Failed to init symbol map.\n");
 		goto out;
@@ -258,21 +258,33 @@
 #ifdef HAVE_DWARF_SUPPORT
 
 /* Open new debuginfo of given module */
-static struct debuginfo *open_debuginfo(const char *module)
+static struct debuginfo *open_debuginfo(const char *module, bool silent)
 {
 	const char *path = module;
+	struct debuginfo *ret;
 
 	if (!module || !strchr(module, '/')) {
 		path = kernel_get_module_path(module);
 		if (!path) {
-			pr_err("Failed to find path of %s module.\n",
-			       module ?: "kernel");
+			if (!silent)
+				pr_err("Failed to find path of %s module.\n",
+				       module ?: "kernel");
 			return NULL;
 		}
 	}
-	return debuginfo__new(path);
+	ret = debuginfo__new(path);
+	if (!ret && !silent) {
+		pr_warning("The %s file has no debug information.\n", path);
+		if (!module || !strtailcmp(path, ".ko"))
+			pr_warning("Rebuild with CONFIG_DEBUG_INFO=y, ");
+		else
+			pr_warning("Rebuild with -g, ");
+		pr_warning("or install an appropriate debuginfo package.\n");
+	}
+	return ret;
 }
 
 static int get_text_start_address(const char *exec, unsigned long *address)
 {
 	Elf *elf;
@@ -333,15 +345,13 @@
 	pr_debug("try to find information at %" PRIx64 " in %s\n", addr,
 		 tp->module ? : "kernel");
 
-	dinfo = open_debuginfo(tp->module);
+	dinfo = open_debuginfo(tp->module, verbose == 0);
 	if (dinfo) {
 		ret = debuginfo__find_probe_point(dinfo,
 						 (unsigned long)addr, pp);
 		debuginfo__delete(dinfo);
-	} else {
-		pr_debug("Failed to open debuginfo at 0x%" PRIx64 "\n", addr);
+	} else
 		ret = -ENOENT;
-	}
 
 	if (ret > 0) {
 		pp->retprobe = tp->retprobe;
@@ -457,13 +467,11 @@
 	struct debuginfo *dinfo;
 	int ntevs, ret = 0;
 
-	dinfo = open_debuginfo(target);
+	dinfo = open_debuginfo(target, !need_dwarf);
 
 	if (!dinfo) {
-		if (need_dwarf) {
-			pr_warning("Failed to open debuginfo file.\n");
+		if (need_dwarf)
 			return -ENOENT;
-		}
 		pr_debug("Could not open debuginfo. Try to use symbols.\n");
 		return 0;
 	}
@@ -565,7 +573,7 @@
 
 static int __show_one_line(FILE *fp, int l, bool skip, bool show_num)
 {
-	char buf[LINEBUF_SIZE];
+	char buf[LINEBUF_SIZE], sbuf[STRERR_BUFSIZE];
 	const char *color = show_num ? "" : PERF_COLOR_BLUE;
 	const char *prefix = NULL;
 
@@ -585,7 +593,8 @@
 	return 1;
 error:
 	if (ferror(fp)) {
-		pr_warning("File read error: %s\n", strerror(errno));
+		pr_warning("File read error: %s\n",
+			   strerror_r(errno, sbuf, sizeof(sbuf)));
 		return -1;
 	}
 	return 0;
@@ -618,13 +627,12 @@
 	FILE *fp;
 	int ret;
 	char *tmp;
+	char sbuf[STRERR_BUFSIZE];
 
 	/* Search a line range */
-	dinfo = open_debuginfo(module);
-	if (!dinfo) {
-		pr_warning("Failed to open debuginfo file.\n");
+	dinfo = open_debuginfo(module, false);
+	if (!dinfo)
 		return -ENOENT;
-	}
 
 	ret = debuginfo__find_line_range(dinfo, lr);
 	debuginfo__delete(dinfo);
@@ -656,7 +664,7 @@
 	fp = fopen(lr->path, "r");
 	if (fp == NULL) {
 		pr_warning("Failed to open %s: %s\n", lr->path,
-			   strerror(errno));
+			   strerror_r(errno, sbuf, sizeof(sbuf)));
 		return -errno;
 	}
 	/* Skip to starting line number */
@@ -772,9 +780,8 @@
 	if (ret < 0)
 		return ret;
 
-	dinfo = open_debuginfo(module);
+	dinfo = open_debuginfo(module, false);
 	if (!dinfo) {
-		pr_warning("Failed to open debuginfo file.\n");
 		ret = -ENOENT;
 		goto out;
 	}
@@ -1405,8 +1412,7 @@
 
 	return tmp - buf;
 error:
-	pr_debug("Failed to synthesize perf probe argument: %s\n",
-		 strerror(-ret));
+	pr_debug("Failed to synthesize perf probe argument: %d\n", ret);
 	return ret;
 }
 
@@ -1455,8 +1461,7 @@
 
 	return buf;
 error:
-	pr_debug("Failed to synthesize perf probe point: %s\n",
-		 strerror(-ret));
+	pr_debug("Failed to synthesize perf probe point: %d\n", ret);
 	free(buf);
 	return NULL;
 }
@@ -1780,10 +1785,11 @@
 	memset(tev, 0, sizeof(*tev));
 }
 
-static void print_warn_msg(const char *file, bool is_kprobe)
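+/* err is a negative errno as returned by open_probe_events(). */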
+static void print_open_warning(int err, bool is_kprobe)
 {
+	char sbuf[STRERR_BUFSIZE];
 
-	if (errno == ENOENT) {
+	if (err == -ENOENT) {
 		const char *config;
 
 		if (!is_kprobe)
@@ -1791,25 +1797,43 @@
 		else
 			config = "CONFIG_KPROBE_EVENTS";
 
-		pr_warning("%s file does not exist - please rebuild kernel"
-				" with %s.\n", file, config);
-	} else
-		pr_warning("Failed to open %s file: %s\n", file,
-				strerror(errno));
+		pr_warning("%cprobe_events file does not exist"
+			   " - please rebuild kernel with %s.\n",
+			   is_kprobe ? 'k' : 'u', config);
+	} else if (err == -ENOTSUP)
+		pr_warning("Debugfs is not mounted.\n");
+	else
+		pr_warning("Failed to open %cprobe_events: %s\n",
+			   is_kprobe ? 'k' : 'u',
+			   strerror_r(-err, sbuf, sizeof(sbuf)));
 }
 
-static int open_probe_events(const char *trace_file, bool readwrite,
-				bool is_kprobe)
+static void print_both_open_warning(int kerr, int uerr)
+{
+	/* Both kprobes and uprobes are disabled, warn the user. */
+	if (kerr == -ENOTSUP && uerr == -ENOTSUP)
+		pr_warning("Debugfs is not mounted.\n");
+	else if (kerr == -ENOENT && uerr == -ENOENT)
+		pr_warning("Please rebuild kernel with CONFIG_KPROBE_EVENTS "
+			   "and/or CONFIG_UPROBE_EVENTS.\n");
+	else {
+		char sbuf[STRERR_BUFSIZE];
+		pr_warning("Failed to open kprobe events: %s.\n",
+			   strerror_r(-kerr, sbuf, sizeof(sbuf)));
+		pr_warning("Failed to open uprobe events: %s.\n",
+			   strerror_r(-uerr, sbuf, sizeof(sbuf)));
+	}
+}
+
+static int open_probe_events(const char *trace_file, bool readwrite)
 {
 	char buf[PATH_MAX];
 	const char *__debugfs;
 	int ret;
 
 	__debugfs = debugfs_find_mountpoint();
-	if (__debugfs == NULL) {
-		pr_warning("Debugfs is not mounted.\n");
-		return -ENOENT;
-	}
+	if (__debugfs == NULL)
+		return -ENOTSUP;
 
 	ret = e_snprintf(buf, PATH_MAX, "%s/%s", __debugfs, trace_file);
 	if (ret >= 0) {
@@ -1820,19 +1844,19 @@
 			ret = open(buf, O_RDONLY, 0);
 
 		if (ret < 0)
-			print_warn_msg(buf, is_kprobe);
+			ret = -errno;
 	}
 	return ret;
 }
 
 static int open_kprobe_events(bool readwrite)
 {
-	return open_probe_events("tracing/kprobe_events", readwrite, true);
+	return open_probe_events("tracing/kprobe_events", readwrite);
 }
 
 static int open_uprobe_events(bool readwrite)
 {
-	return open_probe_events("tracing/uprobe_events", readwrite, false);
+	return open_probe_events("tracing/uprobe_events", readwrite);
 }
 
 /* Get raw string list of current kprobe_events  or uprobe_events */
@@ -1857,7 +1881,7 @@
 			p[idx] = '\0';
 		ret = strlist__add(sl, buf);
 		if (ret < 0) {
-			pr_debug("strlist__add failed: %s\n", strerror(-ret));
+			pr_debug("strlist__add failed (%d)\n", ret);
 			strlist__delete(sl);
 			return NULL;
 		}
@@ -1916,7 +1940,7 @@
 
 	rawlist = get_probe_trace_command_rawlist(fd);
 	if (!rawlist)
-		return -ENOENT;
+		return -ENOMEM;
 
 	strlist__for_each(ent, rawlist) {
 		ret = parse_probe_trace_command(ent->s, &tev);
@@ -1940,27 +1964,34 @@
 /* List up current perf-probe events */
 int show_perf_probe_events(void)
 {
-	int fd, ret;
+	int kp_fd, up_fd, ret;
 
 	setup_pager();
-	fd = open_kprobe_events(false);
-
-	if (fd < 0)
-		return fd;
 
 	ret = init_symbol_maps(false);
 	if (ret < 0)
 		return ret;
 
-	ret = __show_perf_probe_events(fd, true);
-	close(fd);
-
-	fd = open_uprobe_events(false);
-	if (fd >= 0) {
-		ret = __show_perf_probe_events(fd, false);
-		close(fd);
+	kp_fd = open_kprobe_events(false);
+	if (kp_fd >= 0) {
+		ret = __show_perf_probe_events(kp_fd, true);
+		close(kp_fd);
+		if (ret < 0)
+			goto out;
 	}
 
+	up_fd = open_uprobe_events(false);
+	if (kp_fd < 0 && up_fd < 0) {
+		print_both_open_warning(kp_fd, up_fd);
+		ret = kp_fd;
+		goto out;
+	}
+
+	if (up_fd >= 0) {
+		ret = __show_perf_probe_events(up_fd, false);
+		close(up_fd);
+	}
+out:
 	exit_symbol_maps();
 	return ret;
 }
@@ -1976,6 +2007,8 @@
 
 	memset(&tev, 0, sizeof(tev));
 	rawlist = get_probe_trace_command_rawlist(fd);
+	if (!rawlist)
+		return NULL;
 	sl = strlist__new(true, NULL);
 	strlist__for_each(ent, rawlist) {
 		ret = parse_probe_trace_command(ent->s, &tev);
@@ -2005,6 +2038,7 @@
 {
 	int ret = 0;
 	char *buf = synthesize_probe_trace_command(tev);
+	char sbuf[STRERR_BUFSIZE];
 
 	if (!buf) {
 		pr_debug("Failed to synthesize probe trace event.\n");
@@ -2016,7 +2050,7 @@
 		ret = write(fd, buf, strlen(buf));
 		if (ret <= 0)
 			pr_warning("Failed to write event: %s\n",
-				   strerror(errno));
+				   strerror_r(errno, sbuf, sizeof(sbuf)));
 	}
 	free(buf);
 	return ret;
@@ -2030,7 +2064,7 @@
 	/* Try no suffix */
 	ret = e_snprintf(buf, len, "%s", base);
 	if (ret < 0) {
-		pr_debug("snprintf() failed: %s\n", strerror(-ret));
+		pr_debug("snprintf() failed: %d\n", ret);
 		return ret;
 	}
 	if (!strlist__has_entry(namelist, buf))
@@ -2046,7 +2080,7 @@
 	for (i = 1; i < MAX_EVENT_INDEX; i++) {
 		ret = e_snprintf(buf, len, "%s_%d", base, i);
 		if (ret < 0) {
-			pr_debug("snprintf() failed: %s\n", strerror(-ret));
+			pr_debug("snprintf() failed: %d\n", ret);
 			return ret;
 		}
 		if (!strlist__has_entry(namelist, buf))
@@ -2075,8 +2109,11 @@
 	else
 		fd = open_kprobe_events(true);
 
-	if (fd < 0)
+	if (fd < 0) {
+		print_open_warning(fd, !pev->uprobes);
 		return fd;
+	}
+
 	/* Get current event names */
 	namelist = get_probe_trace_event_names(fd, false);
 	if (!namelist) {
@@ -2408,7 +2445,8 @@
 	printf("Removed event: %s\n", ent->s);
 	return 0;
 error:
-	pr_warning("Failed to delete event: %s\n", strerror(-ret));
+	pr_warning("Failed to delete event: %s\n",
+		   strerror_r(-ret, buf, sizeof(buf)));
 	return ret;
 }
 
@@ -2449,15 +2487,18 @@
 
 	/* Get current event names */
 	kfd = open_kprobe_events(true);
-	if (kfd < 0)
-		return kfd;
+	if (kfd >= 0)
+		namelist = get_probe_trace_event_names(kfd, true);
 
-	namelist = get_probe_trace_event_names(kfd, true);
 	ufd = open_uprobe_events(true);
-
 	if (ufd >= 0)
 		unamelist = get_probe_trace_event_names(ufd, true);
 
+	if (kfd < 0 && ufd < 0) {
+		print_both_open_warning(kfd, ufd);
+		goto error;
+	}
+
 	if (namelist == NULL && unamelist == NULL)
 		goto error;
 
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index dca9145..9c59356 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -281,6 +281,7 @@
 	struct probe_trace_arg_ref **ref_ptr = &tvar->ref;
 	Dwarf_Die type;
 	char buf[16];
+	char sbuf[STRERR_BUFSIZE];
 	int bsize, boffs, total;
 	int ret;
 
@@ -367,7 +368,7 @@
 		if (ret >= 16)
 			ret = -E2BIG;
 		pr_warning("Failed to convert variable type: %s\n",
-			   strerror(-ret));
+			   strerror_r(-ret, sbuf, sizeof(sbuf)));
 		return ret;
 	}
 	tvar->type = strdup(buf);
@@ -779,10 +780,12 @@
 	size_t line_len;
 	ssize_t len;
 	int count = 0, linenum = 1;
+	char sbuf[STRERR_BUFSIZE];
 
 	fp = fopen(fname, "r");
 	if (!fp) {
-		pr_warning("Failed to open %s: %s\n", fname, strerror(errno));
+		pr_warning("Failed to open %s: %s\n", fname,
+			   strerror_r(errno, sbuf, sizeof(sbuf)));
 		return -errno;
 	}
 
diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c
index fe8079e..cf69325 100644
--- a/tools/perf/util/record.c
+++ b/tools/perf/util/record.c
@@ -14,6 +14,7 @@
 	struct perf_evsel *evsel;
 	unsigned long flags = perf_event_open_cloexec_flag();
 	int err = -EAGAIN, fd;
+	static pid_t pid = -1;
 
 	evlist = perf_evlist__new();
 	if (!evlist)
@@ -24,14 +25,22 @@
 
 	evsel = perf_evlist__first(evlist);
 
-	fd = sys_perf_event_open(&evsel->attr, -1, cpu, -1, flags);
-	if (fd < 0)
-		goto out_delete;
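+	/* Fall back from a cpu-wide event (pid == -1) to a per-process one on EACCES. */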
+	while (1) {
+		fd = sys_perf_event_open(&evsel->attr, pid, cpu, -1, flags);
+		if (fd < 0) {
+			if (pid == -1 && errno == EACCES) {
+				pid = 0;
+				continue;
+			}
+			goto out_delete;
+		}
+		break;
+	}
 	close(fd);
 
 	fn(evsel);
 
-	fd = sys_perf_event_open(&evsel->attr, -1, cpu, -1, flags);
+	fd = sys_perf_event_open(&evsel->attr, pid, cpu, -1, flags);
 	if (fd < 0) {
 		if (errno == EINVAL)
 			err = -EINVAL;
@@ -47,7 +56,7 @@
 
 static bool perf_probe_api(setup_probe_fn_t fn)
 {
-	const char *try[] = {"cycles:u", "instructions:u", "cpu-clock", NULL};
+	const char *try[] = {"cycles:u", "instructions:u", "cpu-clock:u", NULL};
 	struct cpu_map *cpus;
 	int cpu, ret, i = 0;
 
@@ -106,7 +115,7 @@
 
 	evlist__for_each(evlist, evsel) {
 		perf_evsel__config(evsel, opts);
-		if (!evsel->idx && use_comm_exec)
+		if (evsel->tracking && use_comm_exec)
 			evsel->attr.comm_exec = 1;
 	}
 
@@ -201,6 +210,7 @@
 	struct perf_evsel *evsel;
 	int err, fd, cpu;
 	bool ret = false;
+	pid_t pid = -1;
 
 	temp_evlist = perf_evlist__new();
 	if (!temp_evlist)
@@ -221,12 +231,20 @@
 		cpu = evlist->cpus->map[0];
 	}
 
-	fd = sys_perf_event_open(&evsel->attr, -1, cpu, -1,
-				 perf_event_open_cloexec_flag());
-	if (fd >= 0) {
-		close(fd);
-		ret = true;
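+	/* As above: retry with pid == 0 if a cpu-wide open is not permitted. */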
+	while (1) {
+		fd = sys_perf_event_open(&evsel->attr, pid, cpu, -1,
+					 perf_event_open_cloexec_flag());
+		if (fd < 0) {
+			if (pid == -1 && errno == EACCES) {
+				pid = 0;
+				continue;
+			}
+			goto out_delete;
+		}
+		break;
 	}
+	close(fd);
+	ret = true;
 
 out_delete:
 	perf_evlist__delete(temp_evlist);
diff --git a/tools/perf/util/run-command.c b/tools/perf/util/run-command.c
index da8e9b2..34622b5 100644
--- a/tools/perf/util/run-command.c
+++ b/tools/perf/util/run-command.c
@@ -1,6 +1,7 @@
 #include "cache.h"
 #include "run-command.h"
 #include "exec_cmd.h"
+#include "debug.h"
 
 static inline void close_pair(int fd[2])
 {
@@ -19,6 +20,7 @@
 {
 	int need_in, need_out, need_err;
 	int fdin[2], fdout[2], fderr[2];
+	char sbuf[STRERR_BUFSIZE];
 
 	/*
 	 * In case of errors we must keep the promise to close FDs
@@ -99,7 +101,7 @@
 
 		if (cmd->dir && chdir(cmd->dir))
 			die("exec %s: cd to %s failed (%s)", cmd->argv[0],
-			    cmd->dir, strerror(errno));
+			    cmd->dir, strerror_r(errno, sbuf, sizeof(sbuf)));
 		if (cmd->env) {
 			for (; *cmd->env; cmd->env++) {
 				if (strchr(*cmd->env, '='))
@@ -153,6 +155,8 @@
 
 static int wait_or_whine(pid_t pid)
 {
+	char sbuf[STRERR_BUFSIZE];
+
 	for (;;) {
 		int status, code;
 		pid_t waiting = waitpid(pid, &status, 0);
@@ -160,7 +164,8 @@
 		if (waiting < 0) {
 			if (errno == EINTR)
 				continue;
-			error("waitpid failed (%s)", strerror(errno));
+			error("waitpid failed (%s)",
+			      strerror_r(errno, sbuf, sizeof(sbuf)));
 			return -ERR_RUN_COMMAND_WAITPID;
 		}
 		if (waiting != pid)
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index b2dba9c..0a01bac 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -432,6 +432,11 @@
 	return err;
 }
 
+static int perl_flush_script(void)
+{
+	return 0;
+}
+
 /*
  * Stop trace script
  */
@@ -633,6 +638,7 @@
 struct scripting_ops perl_scripting_ops = {
 	.name = "Perl",
 	.start_script = perl_start_script,
+	.flush_script = perl_flush_script,
 	.stop_script = perl_stop_script,
 	.process_event = perl_process_event,
 	.generate_script = perl_generate_script,
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index cbce254..56ba07c 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -73,6 +73,35 @@
 	Py_DECREF(val);
 }
 
+static PyObject *get_handler(const char *handler_name)
+{
+	PyObject *handler;
+
+	handler = PyDict_GetItemString(main_dict, handler_name);
+	if (handler && !PyCallable_Check(handler))
+		return NULL;
+	return handler;
+}
+
+static void call_object(PyObject *handler, PyObject *args, const char *die_msg)
+{
+	PyObject *retval;
+
+	retval = PyObject_CallObject(handler, args);
+	if (retval == NULL)
+		handler_call_die(die_msg);
+	Py_DECREF(retval);
+}
+
+static void try_call_object(const char *handler_name, PyObject *args)
+{
+	PyObject *handler;
+
+	handler = get_handler(handler_name);
+	if (handler)
+		call_object(handler, args, handler_name);
+}
+
 static void define_value(enum print_arg_type field_type,
 			 const char *ev_name,
 			 const char *field_name,
@@ -80,7 +109,7 @@
 			 const char *field_str)
 {
 	const char *handler_name = "define_flag_value";
-	PyObject *handler, *t, *retval;
+	PyObject *t;
 	unsigned long long value;
 	unsigned n = 0;
 
@@ -98,13 +127,7 @@
 	PyTuple_SetItem(t, n++, PyInt_FromLong(value));
 	PyTuple_SetItem(t, n++, PyString_FromString(field_str));
 
-	handler = PyDict_GetItemString(main_dict, handler_name);
-	if (handler && PyCallable_Check(handler)) {
-		retval = PyObject_CallObject(handler, t);
-		if (retval == NULL)
-			handler_call_die(handler_name);
-		Py_DECREF(retval);
-	}
+	try_call_object(handler_name, t);
 
 	Py_DECREF(t);
 }
@@ -127,7 +150,7 @@
 			 const char *delim)
 {
 	const char *handler_name = "define_flag_field";
-	PyObject *handler, *t, *retval;
+	PyObject *t;
 	unsigned n = 0;
 
 	if (field_type == PRINT_SYMBOL)
@@ -145,13 +168,7 @@
 	if (field_type == PRINT_FLAGS)
 		PyTuple_SetItem(t, n++, PyString_FromString(delim));
 
-	handler = PyDict_GetItemString(main_dict, handler_name);
-	if (handler && PyCallable_Check(handler)) {
-		retval = PyObject_CallObject(handler, t);
-		if (retval == NULL)
-			handler_call_die(handler_name);
-		Py_DECREF(retval);
-	}
+	try_call_object(handler_name, t);
 
 	Py_DECREF(t);
 }
@@ -362,7 +379,7 @@
 				      struct thread *thread,
 				      struct addr_location *al)
 {
-	PyObject *handler, *retval, *context, *t, *obj, *callchain;
+	PyObject *handler, *context, *t, *obj, *callchain;
 	PyObject *dict = NULL;
 	static char handler_name[256];
 	struct format_field *field;
@@ -387,9 +404,7 @@
 
 	sprintf(handler_name, "%s__%s", event->system, event->name);
 
-	handler = PyDict_GetItemString(main_dict, handler_name);
-	if (handler && !PyCallable_Check(handler))
-		handler = NULL;
+	handler = get_handler(handler_name);
 	if (!handler) {
 		dict = PyDict_New();
 		if (!dict)
@@ -450,19 +465,9 @@
 		Py_FatalError("error resizing Python tuple");
 
 	if (handler) {
-		retval = PyObject_CallObject(handler, t);
-		if (retval == NULL)
-			handler_call_die(handler_name);
-		Py_DECREF(retval);
+		call_object(handler, t, handler_name);
 	} else {
-		handler = PyDict_GetItemString(main_dict, "trace_unhandled");
-		if (handler && PyCallable_Check(handler)) {
-
-			retval = PyObject_CallObject(handler, t);
-			if (retval == NULL)
-				handler_call_die("trace_unhandled");
-			Py_DECREF(retval);
-		}
+		try_call_object("trace_unhandled", t);
 		Py_DECREF(dict);
 	}
 
@@ -474,7 +479,7 @@
 					 struct thread *thread,
 					 struct addr_location *al)
 {
-	PyObject *handler, *retval, *t, *dict, *callchain, *dict_sample;
+	PyObject *handler, *t, *dict, *callchain, *dict_sample;
 	static char handler_name[64];
 	unsigned n = 0;
 
@@ -496,8 +501,8 @@
 
 	snprintf(handler_name, sizeof(handler_name), "%s", "process_event");
 
-	handler = PyDict_GetItemString(main_dict, handler_name);
-	if (!handler || !PyCallable_Check(handler))
+	handler = get_handler(handler_name);
+	if (!handler)
 		goto exit;
 
 	pydict_set_item_string_decref(dict, "ev_name", PyString_FromString(perf_evsel__name(evsel)));
@@ -539,10 +544,7 @@
 	if (_PyTuple_Resize(&t, n) == -1)
 		Py_FatalError("error resizing Python tuple");
 
-	retval = PyObject_CallObject(handler, t);
-	if (retval == NULL)
-		handler_call_die(handler_name);
-	Py_DECREF(retval);
+	call_object(handler, t, handler_name);
 exit:
 	Py_DECREF(dict);
 	Py_DECREF(t);
@@ -566,36 +568,24 @@
 
 static int run_start_sub(void)
 {
-	PyObject *handler, *retval;
-	int err = 0;
-
 	main_module = PyImport_AddModule("__main__");
 	if (main_module == NULL)
 		return -1;
 	Py_INCREF(main_module);
 
 	main_dict = PyModule_GetDict(main_module);
-	if (main_dict == NULL) {
-		err = -1;
+	if (main_dict == NULL)
 		goto error;
-	}
 	Py_INCREF(main_dict);
 
-	handler = PyDict_GetItemString(main_dict, "trace_begin");
-	if (handler == NULL || !PyCallable_Check(handler))
-		goto out;
+	try_call_object("trace_begin", NULL);
 
-	retval = PyObject_CallObject(handler, NULL);
-	if (retval == NULL)
-		handler_call_die("trace_begin");
+	return 0;
 
-	Py_DECREF(retval);
-	return err;
 error:
 	Py_XDECREF(main_dict);
 	Py_XDECREF(main_module);
-out:
-	return err;
+	return -1;
 }
 
 /*
@@ -649,28 +639,23 @@
 	return err;
 }
 
+static int python_flush_script(void)
+{
+	return 0;
+}
+
 /*
  * Stop trace script
  */
 static int python_stop_script(void)
 {
-	PyObject *handler, *retval;
-	int err = 0;
+	try_call_object("trace_end", NULL);
 
-	handler = PyDict_GetItemString(main_dict, "trace_end");
-	if (handler == NULL || !PyCallable_Check(handler))
-		goto out;
-
-	retval = PyObject_CallObject(handler, NULL);
-	if (retval == NULL)
-		handler_call_die("trace_end");
-	Py_DECREF(retval);
-out:
 	Py_XDECREF(main_dict);
 	Py_XDECREF(main_module);
 	Py_Finalize();
 
-	return err;
+	return 0;
 }
 
 static int python_generate_script(struct pevent *pevent, const char *outfile)
@@ -843,6 +828,7 @@
 struct scripting_ops python_scripting_ops = {
 	.name = "Python",
 	.start_script = python_start_script,
+	.flush_script = python_flush_script,
 	.stop_script = python_stop_script,
 	.process_event = python_process_event,
 	.generate_script = python_generate_script,
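
The three helpers introduced above (get_handler(), call_object(), try_call_object()) concentrate the repeated look-up/check/call/DECREF dance in one place. A compilable sketch of the same helpers against the Python 2 C API this engine targets; Py_FatalError() stands in for perf's handler_call_die():

#include <Python.h>

static PyObject *main_dict;

static PyObject *get_handler(const char *name)
{
	/* Borrowed reference: the caller must not Py_DECREF it. */
	PyObject *handler = PyDict_GetItemString(main_dict, name);

	if (handler && !PyCallable_Check(handler))
		return NULL;
	return handler;
}

static void call_object(PyObject *handler, PyObject *args, const char *name)
{
	PyObject *retval = PyObject_CallObject(handler, args);

	if (retval == NULL)
		Py_FatalError(name);	/* stand-in for handler_call_die() */
	Py_DECREF(retval);
}

static void try_call_object(const char *name, PyObject *args)
{
	PyObject *handler = get_handler(name);

	if (handler)	/* handlers the script omits are silently skipped */
		call_object(handler, args, name);
}

int main(void)
{
	Py_Initialize();
	PyRun_SimpleString("def trace_begin():\n"
			   "    print 'begin'\n");
	main_dict = PyModule_GetDict(PyImport_AddModule("__main__"));

	try_call_object("trace_begin", NULL);	/* defined: called  */
	try_call_object("trace_end", NULL);	/* undefined: no-op */

	Py_Finalize();
	return 0;
}
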
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 88dfef7..6d2d50d 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -14,6 +14,7 @@
 #include "util.h"
 #include "cpumap.h"
 #include "perf_regs.h"
+#include "asm/bug.h"
 
 static int perf_session__open(struct perf_session *session)
 {
@@ -66,6 +67,25 @@
 	machines__destroy_kernel_maps(&session->machines);
 }
 
+static bool perf_session__has_comm_exec(struct perf_session *session)
+{
+	struct perf_evsel *evsel;
+
+	evlist__for_each(session->evlist, evsel) {
+		if (evsel->attr.comm_exec)
+			return true;
+	}
+
+	return false;
+}
+
+static void perf_session__set_comm_exec(struct perf_session *session)
+{
+	bool comm_exec = perf_session__has_comm_exec(session);
+
+	machines__set_comm_exec(&session->machines, comm_exec);
+}
+
 struct perf_session *perf_session__new(struct perf_data_file *file,
 				       bool repipe, struct perf_tool *tool)
 {
@@ -75,9 +95,7 @@
 		goto out;
 
 	session->repipe = repipe;
-	INIT_LIST_HEAD(&session->ordered_samples.samples);
-	INIT_LIST_HEAD(&session->ordered_samples.sample_cache);
-	INIT_LIST_HEAD(&session->ordered_samples.to_free);
+	ordered_events__init(&session->ordered_events);
 	machines__init(&session->machines);
 
 	if (file) {
@@ -91,6 +109,7 @@
 				goto out_close;
 
 			perf_session__set_id_hdr_size(session);
+			perf_session__set_comm_exec(session);
 		}
 	}
 
@@ -104,9 +123,9 @@
 	}
 
 	if (tool && tool->ordering_requires_timestamps &&
-	    tool->ordered_samples && !perf_evlist__sample_id_all(session->evlist)) {
+	    tool->ordered_events && !perf_evlist__sample_id_all(session->evlist)) {
 		dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n");
-		tool->ordered_samples = false;
+		tool->ordered_events = false;
 	}
 
 	return session;
@@ -238,7 +257,7 @@
 	if (tool->build_id == NULL)
 		tool->build_id = process_finished_round_stub;
 	if (tool->finished_round == NULL) {
-		if (tool->ordered_samples)
+		if (tool->ordered_events)
 			tool->finished_round = process_finished_round;
 		else
 			tool->finished_round = process_finished_round_stub;
@@ -444,87 +463,6 @@
 	[PERF_RECORD_HEADER_MAX]	  = NULL,
 };
 
-struct sample_queue {
-	u64			timestamp;
-	u64			file_offset;
-	union perf_event	*event;
-	struct list_head	list;
-};
-
-static void perf_session_free_sample_buffers(struct perf_session *session)
-{
-	struct ordered_samples *os = &session->ordered_samples;
-
-	while (!list_empty(&os->to_free)) {
-		struct sample_queue *sq;
-
-		sq = list_entry(os->to_free.next, struct sample_queue, list);
-		list_del(&sq->list);
-		free(sq);
-	}
-}
-
-static int perf_session_deliver_event(struct perf_session *session,
-				      union perf_event *event,
-				      struct perf_sample *sample,
-				      struct perf_tool *tool,
-				      u64 file_offset);
-
-static int flush_sample_queue(struct perf_session *s,
-		       struct perf_tool *tool)
-{
-	struct ordered_samples *os = &s->ordered_samples;
-	struct list_head *head = &os->samples;
-	struct sample_queue *tmp, *iter;
-	struct perf_sample sample;
-	u64 limit = os->next_flush;
-	u64 last_ts = os->last_sample ? os->last_sample->timestamp : 0ULL;
-	bool show_progress = limit == ULLONG_MAX;
-	struct ui_progress prog;
-	int ret;
-
-	if (!tool->ordered_samples || !limit)
-		return 0;
-
-	if (show_progress)
-		ui_progress__init(&prog, os->nr_samples, "Processing time ordered events...");
-
-	list_for_each_entry_safe(iter, tmp, head, list) {
-		if (session_done())
-			return 0;
-
-		if (iter->timestamp > limit)
-			break;
-
-		ret = perf_evlist__parse_sample(s->evlist, iter->event, &sample);
-		if (ret)
-			pr_err("Can't parse sample, err = %d\n", ret);
-		else {
-			ret = perf_session_deliver_event(s, iter->event, &sample, tool,
-							 iter->file_offset);
-			if (ret)
-				return ret;
-		}
-
-		os->last_flush = iter->timestamp;
-		list_del(&iter->list);
-		list_add(&iter->list, &os->sample_cache);
-		os->nr_samples--;
-
-		if (show_progress)
-			ui_progress__update(&prog, 1);
-	}
-
-	if (list_empty(head)) {
-		os->last_sample = NULL;
-	} else if (last_ts <= limit) {
-		os->last_sample =
-			list_entry(head->prev, struct sample_queue, list);
-	}
-
-	return 0;
-}
-
 /*
 * When perf record finishes a pass over all buffers, it records this pseudo
  * event.
@@ -568,99 +506,43 @@
 				  union perf_event *event __maybe_unused,
 				  struct perf_session *session)
 {
-	int ret = flush_sample_queue(session, tool);
-	if (!ret)
-		session->ordered_samples.next_flush = session->ordered_samples.max_timestamp;
-
-	return ret;
+	return ordered_events__flush(session, tool, OE_FLUSH__ROUND);
 }
 
-/* The queue is ordered by time */
-static void __queue_event(struct sample_queue *new, struct perf_session *s)
-{
-	struct ordered_samples *os = &s->ordered_samples;
-	struct sample_queue *sample = os->last_sample;
-	u64 timestamp = new->timestamp;
-	struct list_head *p;
-
-	++os->nr_samples;
-	os->last_sample = new;
-
-	if (!sample) {
-		list_add(&new->list, &os->samples);
-		os->max_timestamp = timestamp;
-		return;
-	}
-
-	/*
-	 * last_sample might point to some random place in the list as it's
-	 * the last queued event. We expect that the new event is close to
-	 * this.
-	 */
-	if (sample->timestamp <= timestamp) {
-		while (sample->timestamp <= timestamp) {
-			p = sample->list.next;
-			if (p == &os->samples) {
-				list_add_tail(&new->list, &os->samples);
-				os->max_timestamp = timestamp;
-				return;
-			}
-			sample = list_entry(p, struct sample_queue, list);
-		}
-		list_add_tail(&new->list, &sample->list);
-	} else {
-		while (sample->timestamp > timestamp) {
-			p = sample->list.prev;
-			if (p == &os->samples) {
-				list_add(&new->list, &os->samples);
-				return;
-			}
-			sample = list_entry(p, struct sample_queue, list);
-		}
-		list_add(&new->list, &sample->list);
-	}
-}
-
-#define MAX_SAMPLE_BUFFER	(64 * 1024 / sizeof(struct sample_queue))
-
 int perf_session_queue_event(struct perf_session *s, union perf_event *event,
-				    struct perf_sample *sample, u64 file_offset)
+			     struct perf_tool *tool, struct perf_sample *sample,
+			     u64 file_offset)
 {
-	struct ordered_samples *os = &s->ordered_samples;
-	struct list_head *sc = &os->sample_cache;
+	struct ordered_events *oe = &s->ordered_events;
 	u64 timestamp = sample->time;
-	struct sample_queue *new;
+	struct ordered_event *new;
 
 	if (!timestamp || timestamp == ~0ULL)
 		return -ETIME;
 
-	if (timestamp < s->ordered_samples.last_flush) {
-		printf("Warning: Timestamp below last timeslice flush\n");
-		return -EINVAL;
+	if (timestamp < oe->last_flush) {
+		WARN_ONCE(1, "Timestamp below last timeslice flush\n");
+
+		pr_oe_time(timestamp,      "out of order event");
+		pr_oe_time(oe->last_flush, "last flush, last_flush_type %d\n",
+			   oe->last_flush_type);
+
+		/* We could get out-of-order events after a forced (half) flush. */
+		if (oe->last_flush_type != OE_FLUSH__HALF)
+			return -EINVAL;
 	}
 
-	if (!list_empty(sc)) {
-		new = list_entry(sc->next, struct sample_queue, list);
-		list_del(&new->list);
-	} else if (os->sample_buffer) {
-		new = os->sample_buffer + os->sample_buffer_idx;
-		if (++os->sample_buffer_idx == MAX_SAMPLE_BUFFER)
-			os->sample_buffer = NULL;
-	} else {
-		os->sample_buffer = malloc(MAX_SAMPLE_BUFFER * sizeof(*new));
-		if (!os->sample_buffer)
-			return -ENOMEM;
-		list_add(&os->sample_buffer->list, &os->to_free);
-		os->sample_buffer_idx = 2;
-		new = os->sample_buffer + 1;
+	new = ordered_events__new(oe, timestamp);
+	if (!new) {
+		ordered_events__flush(s, tool, OE_FLUSH__HALF);
+		new = ordered_events__new(oe, timestamp);
 	}
 
-	new->timestamp = timestamp;
+	if (!new)
+		return -ENOMEM;
+
 	new->file_offset = file_offset;
 	new->event = event;
-
-	__queue_event(new, s);
-
 	return 0;
 }
 
@@ -920,11 +802,10 @@
 					    &sample->read.one, machine);
 }
 
-static int perf_session_deliver_event(struct perf_session *session,
-				      union perf_event *event,
-				      struct perf_sample *sample,
-				      struct perf_tool *tool,
-				      u64 file_offset)
+int perf_session__deliver_event(struct perf_session *session,
+				union perf_event *event,
+				struct perf_sample *sample,
+				struct perf_tool *tool, u64 file_offset)
 {
 	struct perf_evsel *evsel;
 	struct machine *machine;
@@ -1005,8 +886,10 @@
 	switch (event->header.type) {
 	case PERF_RECORD_HEADER_ATTR:
 		err = tool->attr(tool, event, &session->evlist);
-		if (err == 0)
+		if (err == 0) {
 			perf_session__set_id_hdr_size(session);
+			perf_session__set_comm_exec(session);
+		}
 		return err;
 	case PERF_RECORD_HEADER_EVENT_TYPE:
 		/*
@@ -1036,6 +919,61 @@
 		swap(event, sample_id_all);
 }
 
+int perf_session__peek_event(struct perf_session *session, off_t file_offset,
+			     void *buf, size_t buf_sz,
+			     union perf_event **event_ptr,
+			     struct perf_sample *sample)
+{
+	union perf_event *event;
+	size_t hdr_sz, rest;
+	int fd;
+
+	if (session->one_mmap && !session->header.needs_swap) {
+		event = file_offset - session->one_mmap_offset +
+			session->one_mmap_addr;
+		goto out_parse_sample;
+	}
+
+	if (perf_data_file__is_pipe(session->file))
+		return -1;
+
+	fd = perf_data_file__fd(session->file);
+	hdr_sz = sizeof(struct perf_event_header);
+
+	if (buf_sz < hdr_sz)
+		return -1;
+
+	if (lseek(fd, file_offset, SEEK_SET) == (off_t)-1 ||
+	    readn(fd, buf, hdr_sz) != (ssize_t)hdr_sz)
+		return -1;
+
+	event = (union perf_event *)buf;
+
+	if (session->header.needs_swap)
+		perf_event_header__bswap(&event->header);
+
+	if (event->header.size < hdr_sz || event->header.size > buf_sz)
+		return -1;
+
+	rest = event->header.size - hdr_sz;
+
+	if (readn(fd, buf + hdr_sz, rest) != (ssize_t)rest)
+		return -1;
+
+	if (session->header.needs_swap)
+		event_swap(event, perf_evlist__sample_id_all(session->evlist));
+
+out_parse_sample:
+
+	if (sample && event->header.type < PERF_RECORD_USER_TYPE_START &&
+	    perf_evlist__parse_sample(session->evlist, event, sample))
+		return -1;
+
+	*event_ptr = event;
+
+	return 0;
+}
+
 static s64 perf_session__process_event(struct perf_session *session,
 				       union perf_event *event,
 				       struct perf_tool *tool,
@@ -1062,15 +1000,15 @@
 	if (ret)
 		return ret;
 
-	if (tool->ordered_samples) {
-		ret = perf_session_queue_event(session, event, &sample,
+	if (tool->ordered_events) {
+		ret = perf_session_queue_event(session, event, tool, &sample,
 					       file_offset);
 		if (ret != -ETIME)
 			return ret;
 	}
 
-	return perf_session_deliver_event(session, event, &sample, tool,
-					  file_offset);
+	return perf_session__deliver_event(session, event, &sample, tool,
+					   file_offset);
 }
 
 void perf_event_header__bswap(struct perf_event_header *hdr)
@@ -1222,12 +1160,11 @@
 		goto more;
 done:
 	/* do the final flush for ordered samples */
-	session->ordered_samples.next_flush = ULLONG_MAX;
-	err = flush_sample_queue(session, tool);
+	err = ordered_events__flush(session, tool, OE_FLUSH__FINAL);
 out_err:
 	free(buf);
 	perf_session__warn_about_errors(session, tool);
-	perf_session_free_sample_buffers(session);
+	ordered_events__free(&session->ordered_events);
 	return err;
 }
 
@@ -1368,12 +1305,11 @@
 
 out:
 	/* do the final flush for ordered samples */
-	session->ordered_samples.next_flush = ULLONG_MAX;
-	err = flush_sample_queue(session, tool);
+	err = ordered_events__flush(session, tool, OE_FLUSH__FINAL);
 out_err:
 	ui_progress__finish();
 	perf_session__warn_about_errors(session, tool);
-	perf_session_free_sample_buffers(session);
+	ordered_events__free(&session->ordered_events);
 	session->one_mmap = false;
 	return err;
 }
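
The queueing logic removed here moves to tools/perf/util/ordered-events.c largely intact, and its core idea is worth keeping in mind when reading the new code: events arrive almost sorted, so insertion starts from the last queued event rather than the list head, making the common case O(1). It also motivates the OE_FLUSH__HALF fallback above: when allocation fails, half the queue is flushed to reclaim buffers, after which timestamps below last_flush are expected and tolerated. A simplified sketch of the hinted sorted insert (stand-in types, not the real ordered_event layout):

#include <stdint.h>
#include <stdio.h>

struct oevent {
	uint64_t	ts;
	struct oevent	*prev, *next;
};

struct oqueue {
	struct oevent	*head, *tail;	/* sorted by ts, oldest first  */
	struct oevent	*last;		/* most recently queued (hint) */
};

static void oqueue_insert(struct oqueue *q, struct oevent *new)
{
	struct oevent *pos = q->last;

	/* Walk from the hint to the insertion point; usually 0-1 steps. */
	while (pos && pos->ts > new->ts)
		pos = pos->prev;
	while (pos && pos->next && pos->next->ts <= new->ts)
		pos = pos->next;

	new->prev = pos;
	new->next = pos ? pos->next : q->head;
	if (new->next)
		new->next->prev = new;
	else
		q->tail = new;
	if (pos)
		pos->next = new;
	else
		q->head = new;
	q->last = new;
}

int main(void)
{
	struct oevent e[] = { { .ts = 20 }, { .ts = 10 }, { .ts = 30 } };
	struct oqueue q = { NULL, NULL, NULL };

	for (int i = 0; i < 3; i++)
		oqueue_insert(&q, &e[i]);

	for (struct oevent *p = q.head; p; p = p->next)	/* 10 20 30 */
		printf("%llu\n", (unsigned long long)p->ts);
	return 0;
}
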
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 0321013..8dd41ca 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -9,26 +9,13 @@
 #include "symbol.h"
 #include "thread.h"
 #include "data.h"
+#include "ordered-events.h"
 #include <linux/rbtree.h>
 #include <linux/perf_event.h>
 
-struct sample_queue;
 struct ip_callchain;
 struct thread;
 
-struct ordered_samples {
-	u64			last_flush;
-	u64			next_flush;
-	u64			max_timestamp;
-	struct list_head	samples;
-	struct list_head	sample_cache;
-	struct list_head	to_free;
-	struct sample_queue	*sample_buffer;
-	struct sample_queue	*last_sample;
-	int			sample_buffer_idx;
-	unsigned int		nr_samples;
-};
-
 struct perf_session {
 	struct perf_header	header;
 	struct machines		machines;
@@ -39,7 +26,7 @@
 	bool			one_mmap;
 	void			*one_mmap_addr;
 	u64			one_mmap_offset;
-	struct ordered_samples	ordered_samples;
+	struct ordered_events	ordered_events;
 	struct perf_data_file	*file;
 };
 
@@ -58,6 +45,11 @@
 
 void perf_event_header__bswap(struct perf_event_header *hdr);
 
+int perf_session__peek_event(struct perf_session *session, off_t file_offset,
+			     void *buf, size_t buf_sz,
+			     union perf_event **event_ptr,
+			     struct perf_sample *sample);
+
 int __perf_session__process_events(struct perf_session *session,
 				   u64 data_offset, u64 data_size, u64 size,
 				   struct perf_tool *tool);
@@ -65,10 +57,16 @@
 				 struct perf_tool *tool);
 
 int perf_session_queue_event(struct perf_session *s, union perf_event *event,
-			     struct perf_sample *sample, u64 file_offset);
+			     struct perf_tool *tool, struct perf_sample *sample,
+			     u64 file_offset);
 
 void perf_tool__fill_defaults(struct perf_tool *tool);
 
+int perf_session__deliver_event(struct perf_session *session,
+				union perf_event *event,
+				struct perf_sample *sample,
+				struct perf_tool *tool, u64 file_offset);
+
 int perf_session__resolve_callchain(struct perf_session *session,
 				    struct perf_evsel *evsel,
 				    struct thread *thread,
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 14e5a03..1958637 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -70,12 +70,14 @@
 				       size_t size, unsigned int width)
 {
 	const char *comm = thread__comm_str(he->thread);
-	return repsep_snprintf(bf, size, "%*s:%5d", width - 6,
-			       comm ?: "", he->thread->tid);
+
+	width = max(7U, width) - 6;
+	return repsep_snprintf(bf, size, "%5d:%-*.*s", he->thread->tid,
+			       width, width, comm ?: "");
 }
 
 struct sort_entry sort_thread = {
-	.se_header	= "Command:  Pid",
+	.se_header	= "  Pid:Command",
 	.se_cmp		= sort__thread_cmp,
 	.se_snprintf	= hist_entry__thread_snprintf,
 	.se_width_idx	= HISTC_THREAD,
@@ -106,7 +108,7 @@
 static int hist_entry__comm_snprintf(struct hist_entry *he, char *bf,
 				     size_t size, unsigned int width)
 {
-	return repsep_snprintf(bf, size, "%*s", width, comm__str(he->comm));
+	return repsep_snprintf(bf, size, "%-*.*s", width, width, comm__str(he->comm));
 }
 
 struct sort_entry sort_comm = {
@@ -152,10 +154,10 @@
 	if (map && map->dso) {
 		const char *dso_name = !verbose ? map->dso->short_name :
 			map->dso->long_name;
-		return repsep_snprintf(bf, size, "%-*s", width, dso_name);
+		return repsep_snprintf(bf, size, "%-*.*s", width, width, dso_name);
 	}
 
-	return repsep_snprintf(bf, size, "%-*s", width, "[unknown]");
+	return repsep_snprintf(bf, size, "%-*.*s", width, width, "[unknown]");
 }
 
 static int hist_entry__dso_snprintf(struct hist_entry *he, char *bf,
@@ -257,7 +259,10 @@
 				       width - ret, "");
 	}
 
-	return ret;
+	if (ret > width)
+		bf[width] = '\0';
+
+	return width;
 }
 
 static int hist_entry__sym_snprintf(struct hist_entry *he, char *bf,
@@ -302,10 +307,9 @@
 }
 
 static int hist_entry__srcline_snprintf(struct hist_entry *he, char *bf,
-					size_t size,
-					unsigned int width __maybe_unused)
+					size_t size, unsigned int width)
 {
-	return repsep_snprintf(bf, size, "%s", he->srcline);
+	return repsep_snprintf(bf, size, "%*.*-s", width, width, he->srcline);
 }
 
 struct sort_entry sort_srcline = {
@@ -332,7 +336,7 @@
 static int hist_entry__parent_snprintf(struct hist_entry *he, char *bf,
 				       size_t size, unsigned int width)
 {
-	return repsep_snprintf(bf, size, "%-*s", width,
+	return repsep_snprintf(bf, size, "%-*.*s", width, width,
 			      he->parent ? he->parent->name : "[other]");
 }
 
@@ -354,7 +358,7 @@
 static int hist_entry__cpu_snprintf(struct hist_entry *he, char *bf,
 				    size_t size, unsigned int width)
 {
-	return repsep_snprintf(bf, size, "%*d", width, he->cpu);
+	return repsep_snprintf(bf, size, "%*.*d", width, width, he->cpu);
 }
 
 struct sort_entry sort_cpu = {
@@ -484,7 +488,7 @@
 	else if (he->branch_info->flags.mispred)
 		out = "Y";
 
-	return repsep_snprintf(bf, size, "%-*s", width, out);
+	return repsep_snprintf(bf, size, "%-*.*s", width, width, out);
 }
 
 /* --sort daddr_sym */
@@ -1194,7 +1198,7 @@
 	return hse_a->se == hse_b->se;
 }
 
-void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists)
+void perf_hpp__reset_sort_width(struct perf_hpp_fmt *fmt, struct hists *hists)
 {
 	struct hpp_sort_entry *hse;
 
@@ -1202,20 +1206,21 @@
 		return;
 
 	hse = container_of(fmt, struct hpp_sort_entry, hpp);
-	hists__new_col_len(hists, hse->se->se_width_idx,
-			   strlen(hse->se->se_header));
+	hists__new_col_len(hists, hse->se->se_width_idx, strlen(fmt->name));
 }
 
 static int __sort__hpp_header(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
 			      struct perf_evsel *evsel)
 {
 	struct hpp_sort_entry *hse;
-	size_t len;
+	size_t len = fmt->user_len;
 
 	hse = container_of(fmt, struct hpp_sort_entry, hpp);
-	len = hists__col_len(&evsel->hists, hse->se->se_width_idx);
 
-	return scnprintf(hpp->buf, hpp->size, "%-*s", len, hse->se->se_header);
+	if (!len)
+		len = hists__col_len(&evsel->hists, hse->se->se_width_idx);
+
+	return scnprintf(hpp->buf, hpp->size, "%-*.*s", len, len, fmt->name);
 }
 
 static int __sort__hpp_width(struct perf_hpp_fmt *fmt,
@@ -1223,20 +1228,26 @@
 			     struct perf_evsel *evsel)
 {
 	struct hpp_sort_entry *hse;
+	size_t len = fmt->user_len;
 
 	hse = container_of(fmt, struct hpp_sort_entry, hpp);
 
-	return hists__col_len(&evsel->hists, hse->se->se_width_idx);
+	if (!len)
+		len = hists__col_len(&evsel->hists, hse->se->se_width_idx);
+
+	return len;
 }
 
 static int __sort__hpp_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
 			     struct hist_entry *he)
 {
 	struct hpp_sort_entry *hse;
-	size_t len;
+	size_t len = fmt->user_len;
 
 	hse = container_of(fmt, struct hpp_sort_entry, hpp);
-	len = hists__col_len(he->hists, hse->se->se_width_idx);
+
+	if (!len)
+		len = hists__col_len(he->hists, hse->se->se_width_idx);
 
 	return hse->se->se_snprintf(he, hpp->buf, hpp->size, len);
 }
@@ -1253,6 +1264,7 @@
 	}
 
 	hse->se = sd->entry;
+	hse->hpp.name = sd->entry->se_header;
 	hse->hpp.header = __sort__hpp_header;
 	hse->hpp.width = __sort__hpp_width;
 	hse->hpp.entry = __sort__hpp_entry;
@@ -1265,6 +1277,8 @@
 	INIT_LIST_HEAD(&hse->hpp.list);
 	INIT_LIST_HEAD(&hse->hpp.sort_list);
 	hse->hpp.elide = false;
+	hse->hpp.len = 0;
+	hse->hpp.user_len = 0;
 
 	return hse;
 }
@@ -1439,7 +1453,7 @@
 	int ret = 0;
 
 	if (sort_keys == NULL) {
-		if (field_order) {
+		if (is_strict_order(field_order)) {
 			/*
 			 * If user specified field order but no sort order,
 			 * we'll honor it and not add default sort orders.
@@ -1625,23 +1639,36 @@
 		memory_sort_dimensions[i].taken = 0;
 }
 
+bool is_strict_order(const char *order)
+{
+	return order && (*order != '+');
+}
+
 static int __setup_output_field(void)
 {
-	char *tmp, *tok, *str;
-	int ret = 0;
+	char *tmp, *tok, *str, *strp;
+	int ret = -EINVAL;
 
 	if (field_order == NULL)
 		return 0;
 
 	reset_dimensions();
 
-	str = strdup(field_order);
+	strp = str = strdup(field_order);
 	if (str == NULL) {
 		error("Not enough memory to setup output fields");
 		return -ENOMEM;
 	}
 
-	for (tok = strtok_r(str, ", ", &tmp);
+	if (!is_strict_order(field_order))
+		strp++;
+
+	if (!strlen(strp)) {
+		error("Invalid --fields key: `+'");
+		goto out;
+	}
+
+	for (tok = strtok_r(strp, ", ", &tmp);
 			tok; tok = strtok_r(NULL, ", ", &tmp)) {
 		ret = output_field_add(tok);
 		if (ret == -EINVAL) {
@@ -1653,6 +1680,7 @@
 		}
 	}
 
+out:
 	free(str);
 	return ret;
 }
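
The recurring change in this file is switching "%*s"-style conversions to "%-*.*s": the '*' width pads short strings out to the column, the '.*' precision truncates long ones, and '-' left-aligns. (The flag must precede the width, which is why the srcline conversion above reads "%-*.*s" and not "%*.*-s".) A two-line demonstration:

#include <stdio.h>

int main(void)
{
	int width = 8;

	printf("[%-*.*s]\n", width, width, "perf");		/* [perf    ] */
	printf("[%-*.*s]\n", width, width, "perf_session");	/* [perf_ses] */
	return 0;
}
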
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 041f0c9..c03e4ff 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -218,4 +218,5 @@
 
 int report_parse_ignore_callees_opt(const struct option *opt, const char *arg, int unset);
 
+bool is_strict_order(const char *order);
 #endif	/* __PERF_SORT_H */
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index d753499..9fb5e9e 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -736,7 +736,7 @@
 	if (symstrs == NULL)
 		goto out_elf_end;
 
-	sec_strndx = elf_getscn(elf, ehdr.e_shstrndx);
+	sec_strndx = elf_getscn(runtime_ss->elf, runtime_ss->ehdr.e_shstrndx);
 	if (sec_strndx == NULL)
 		goto out_elf_end;
 
@@ -939,8 +939,11 @@
 		 * to it...
 		 */
 		if (symbol_conf.demangle) {
-			demangled = bfd_demangle(NULL, elf_name,
-						 DMGL_PARAMS | DMGL_ANSI);
+			int demangle_flags = DMGL_NO_OPTS;
+			if (verbose)
+				demangle_flags = DMGL_PARAMS | DMGL_ANSI;
+
+			demangled = bfd_demangle(NULL, elf_name, demangle_flags);
 			if (demangled != NULL)
 				elf_name = demangled;
 		}
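
The change above asks the demangler for parameter and qualifier output (DMGL_PARAMS | DMGL_ANSI) only in verbose mode, since bare symbol names are cheaper and usually sufficient. The effect can be seen with libiberty's cplus_demangle(), the engine behind bfd_demangle() (link with -liberty; the mangled name below is just an example):

#include <stdio.h>
#include <stdlib.h>
#include <demangle.h>		/* libiberty */

int main(void)
{
	const char *sym = "_ZN4perf7sessionERKi";	/* example name */
	char *terse = cplus_demangle(sym, DMGL_NO_OPTS);
	char *full  = cplus_demangle(sym, DMGL_PARAMS | DMGL_ANSI);

	printf("terse:   %s\n", terse);	/* perf::session */
	printf("verbose: %s\n", full);	/* perf::session(int const&) */
	free(terse);
	free(full);
	return 0;
}
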
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index eb06746..ac098a3 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -15,6 +15,7 @@
 #include "machine.h"
 #include "symbol.h"
 #include "strlist.h"
+#include "header.h"
 
 #include <elf.h>
 #include <limits.h>
@@ -523,10 +524,15 @@
 	struct dso *dso;
 };
 
+/*
+ * These are symbols in the kernel image, so make sure that
+ * sym is from a kernel DSO.
+ */
 bool symbol__is_idle(struct symbol *sym)
 {
 	const char * const idle_symbols[] = {
 		"cpu_idle",
+		"cpu_startup_entry",
 		"intel_idle",
 		"default_idle",
 		"native_safe_halt",
@@ -1468,8 +1474,7 @@
 	if (vmlinux[0] == '/')
 		snprintf(symfs_vmlinux, sizeof(symfs_vmlinux), "%s", vmlinux);
 	else
-		snprintf(symfs_vmlinux, sizeof(symfs_vmlinux), "%s%s",
-			 symbol_conf.symfs, vmlinux);
+		symbol__join_symfs(symfs_vmlinux, vmlinux);
 
 	if (dso->kernel == DSO_TYPE_GUEST_KERNEL)
 		symtab_type = DSO_BINARY_TYPE__GUEST_VMLINUX;
@@ -1745,10 +1750,11 @@
 	zfree(&vmlinux_path);
 }
 
-static int vmlinux_path__init(void)
+static int vmlinux_path__init(struct perf_session_env *env)
 {
 	struct utsname uts;
 	char bf[PATH_MAX];
+	char *kernel_version;
 
 	vmlinux_path = malloc(sizeof(char *) * 5);
 	if (vmlinux_path == NULL)
@@ -1763,25 +1769,31 @@
 		goto out_fail;
 	++vmlinux_path__nr_entries;
 
-	/* only try running kernel version if no symfs was given */
+	/* only try kernel version if no symfs was given */
 	if (symbol_conf.symfs[0] != 0)
 		return 0;
 
-	if (uname(&uts) < 0)
-		return -1;
+	if (env) {
+		kernel_version = env->os_release;
+	} else {
+		if (uname(&uts) < 0)
+			goto out_fail;
 
-	snprintf(bf, sizeof(bf), "/boot/vmlinux-%s", uts.release);
+		kernel_version = uts.release;
+	}
+
+	snprintf(bf, sizeof(bf), "/boot/vmlinux-%s", kernel_version);
 	vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
 	if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
 		goto out_fail;
 	++vmlinux_path__nr_entries;
-	snprintf(bf, sizeof(bf), "/lib/modules/%s/build/vmlinux", uts.release);
+	snprintf(bf, sizeof(bf), "/lib/modules/%s/build/vmlinux", kernel_version);
 	vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
 	if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
 		goto out_fail;
 	++vmlinux_path__nr_entries;
 	snprintf(bf, sizeof(bf), "/usr/lib/debug/lib/modules/%s/vmlinux",
-		 uts.release);
+		 kernel_version);
 	vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
 	if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
 		goto out_fail;
@@ -1827,7 +1839,7 @@
 	return value;
 }
 
-int symbol__init(void)
+int symbol__init(struct perf_session_env *env)
 {
 	const char *symfs;
 
@@ -1842,7 +1854,7 @@
 		symbol_conf.priv_size += (sizeof(struct symbol_name_rb_node) -
 					  sizeof(struct symbol));
 
-	if (symbol_conf.try_vmlinux_path && vmlinux_path__init() < 0)
+	if (symbol_conf.try_vmlinux_path && vmlinux_path__init(env) < 0)
 		return -1;
 
 	if (symbol_conf.field_sep && *symbol_conf.field_sep == '.') {
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index e7295e9..3f95ea0 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -13,6 +13,7 @@
 #include <libgen.h>
 #include "build-id.h"
 #include "event.h"
+#include "util.h"
 
 #ifdef HAVE_LIBELF_SUPPORT
 #include <libelf.h>
@@ -59,6 +60,7 @@
 #endif
 
 #ifndef DMGL_PARAMS
+#define DMGL_NO_OPTS     0              /* For readability... */
 #define DMGL_PARAMS      (1 << 0)       /* Include function args */
 #define DMGL_ANSI        (1 << 1)       /* Include const, volatile, etc */
 #endif
@@ -143,6 +145,14 @@
 };
 
 extern struct symbol_conf symbol_conf;
+
+static inline int __symbol__join_symfs(char *bf, size_t size, const char *path)
+{
+	return path__join(bf, size, symbol_conf.symfs, path);
+}
+
+#define symbol__join_symfs(bf, path) __symbol__join_symfs(bf, sizeof(bf), path)
+
 extern int vmlinux_path__nr_entries;
 extern char **vmlinux_path;
 
@@ -253,7 +263,8 @@
 int filename__read_debuglink(const char *filename, char *debuglink,
 			     size_t size);
 
-int symbol__init(void);
+struct perf_session_env;
+int symbol__init(struct perf_session_env *env);
 void symbol__exit(void);
 void symbol__elf_init(void);
 struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name);
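
symbol__join_symfs() above is a macro on purpose: it forwards sizeof(bf), so it is only safe when bf is a real char array in the caller's scope (with a char * it would silently pass the pointer size). A standalone sketch of the same shape, with symfs_join as a hypothetical stand-in for path__join()/symbol__join_symfs():

#include <stdio.h>

/* Hypothetical stand-in mirroring path__join()/symbol__join_symfs(). */
#define symfs_join(bf, symfs, path) \
	snprintf(bf, sizeof(bf), "%s%s%s", symfs, (symfs)[0] ? "/" : "", path)

int main(void)
{
	char bf[4096];

	symfs_join(bf, "/tmp/symfs", "boot/vmlinux");
	printf("%s\n", bf);	/* /tmp/symfs/boot/vmlinux */

	symfs_join(bf, "", "boot/vmlinux");
	printf("%s\n", bf);	/* boot/vmlinux: empty symfs adds no '/' */
	return 0;
}
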
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 12c7a25..a9df7f2 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -42,7 +42,7 @@
 			goto err_thread;
 
 		snprintf(comm_str, 32, ":%d", tid);
-		comm = comm__new(comm_str, 0);
+		comm = comm__new(comm_str, 0, false);
 		free(comm_str);
 		if (!comm)
 			goto err_thread;
@@ -81,19 +81,33 @@
 	return list_first_entry(&thread->comm_list, struct comm, list);
 }
 
+struct comm *thread__exec_comm(const struct thread *thread)
+{
+	struct comm *comm, *last = NULL;
+
+	list_for_each_entry(comm, &thread->comm_list, list) {
+		if (comm->exec)
+			return comm;
+		last = comm;
+	}
+
+	return last;
+}
+
 /* CHECKME: time should always be 0 if events aren't ordered */
-int thread__set_comm(struct thread *thread, const char *str, u64 timestamp)
+int __thread__set_comm(struct thread *thread, const char *str, u64 timestamp,
+		       bool exec)
 {
 	struct comm *new, *curr = thread__comm(thread);
 	int err;
 
 	/* Override latest entry if it had no specific time coverage */
-	if (!curr->start) {
-		err = comm__override(curr, str, timestamp);
+	if (!curr->start && !curr->exec) {
+		err = comm__override(curr, str, timestamp, exec);
 		if (err)
 			return err;
 	} else {
-		new = comm__new(str, timestamp);
+		new = comm__new(str, timestamp, exec);
 		if (!new)
 			return -ENOMEM;
 		list_add(&new->list, &thread->comm_list);
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 716b772..8c75fa7 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -38,9 +38,17 @@
 	thread->dead = true;
 }
 
-int thread__set_comm(struct thread *thread, const char *comm, u64 timestamp);
+int __thread__set_comm(struct thread *thread, const char *comm, u64 timestamp,
+		       bool exec);
+static inline int thread__set_comm(struct thread *thread, const char *comm,
+				   u64 timestamp)
+{
+	return __thread__set_comm(thread, comm, timestamp, false);
+}
+
 int thread__comm_len(struct thread *thread);
 struct comm *thread__comm(const struct thread *thread);
+struct comm *thread__exec_comm(const struct thread *thread);
 const char *thread__comm_str(const struct thread *thread);
 void thread__insert_map(struct thread *thread, struct map *map);
 int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp);
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index 4385816..f116369 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -40,7 +40,7 @@
 	event_op2	tracing_data;
 	event_op2	finished_round,
 			build_id;
-	bool		ordered_samples;
+	bool		ordered_events;
 	bool		ordering_requires_timestamps;
 };
 
diff --git a/tools/perf/util/trace-event-scripting.c b/tools/perf/util/trace-event-scripting.c
index 57aaccc..5c9bdd1 100644
--- a/tools/perf/util/trace-event-scripting.c
+++ b/tools/perf/util/trace-event-scripting.c
@@ -30,6 +30,11 @@
 
 struct scripting_context *scripting_context;
 
+static int flush_script_unsupported(void)
+{
+	return 0;
+}
+
 static int stop_script_unsupported(void)
 {
 	return 0;
@@ -74,6 +79,7 @@
 struct scripting_ops python_scripting_unsupported_ops = {
 	.name = "Python",
 	.start_script = python_start_script_unsupported,
+	.flush_script = flush_script_unsupported,
 	.stop_script = stop_script_unsupported,
 	.process_event = process_event_unsupported,
 	.generate_script = python_generate_script_unsupported,
@@ -137,6 +143,7 @@
 struct scripting_ops perl_scripting_unsupported_ops = {
 	.name = "Perl",
 	.start_script = perl_start_script_unsupported,
+	.flush_script = flush_script_unsupported,
 	.stop_script = stop_script_unsupported,
 	.process_event = process_event_unsupported,
 	.generate_script = perl_generate_script_unsupported,
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 7b6d686..52aaa19 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -64,6 +64,7 @@
 struct scripting_ops {
 	const char *name;
 	int (*start_script) (const char *script, int argc, const char **argv);
+	int (*flush_script) (void);
 	int (*stop_script) (void);
 	void (*process_event) (union perf_event *event,
 			       struct perf_sample *sample,
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index e52e746..24e8d87 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -13,6 +13,7 @@
 #include <limits.h>
 #include <byteswap.h>
 #include <linux/kernel.h>
+#include <unistd.h>
 
 /*
  * XXX We need to find a better place for these things...
@@ -282,6 +283,18 @@
 	ws->ws_col = 80;
 }
 
+void set_term_quiet_input(struct termios *old)
+{
+	struct termios tc;
+
+	tcgetattr(0, old);
+	tc = *old;
+	tc.c_lflag &= ~(ICANON | ECHO);
+	tc.c_cc[VMIN] = 0;
+	tc.c_cc[VTIME] = 0;
+	tcsetattr(0, TCSANOW, &tc);
+}
+
 static void set_tracing_events_path(const char *mountpoint)
 {
 	snprintf(tracing_events_path, sizeof(tracing_events_path), "%s/%s",
@@ -443,6 +456,7 @@
 	size_t size = 0, alloc_size = 0;
 	void *bf = NULL, *nbf;
 	int fd, n, err = 0;
+	char sbuf[STRERR_BUFSIZE];
 
 	fd = open(filename, O_RDONLY);
 	if (fd < 0)
@@ -463,8 +477,8 @@
 		n = read(fd, bf + size, alloc_size - size);
 		if (n < 0) {
 			if (size) {
-				pr_warning("read failed %d: %s\n",
-					   errno, strerror(errno));
+				pr_warning("read failed %d: %s\n", errno,
+					 strerror_r(errno, sbuf, sizeof(sbuf)));
 				err = 0;
 			} else
 				err = -errno;
@@ -536,3 +550,39 @@
 		++m;
 	}
 }
+
+bool find_process(const char *name)
+{
+	size_t len = strlen(name);
+	DIR *dir;
+	struct dirent *d;
+	int ret = -1;
+
+	dir = opendir(procfs__mountpoint());
+	if (!dir)
+		return false;
+
+	/* Walk through the directory. */
+	while (ret && (d = readdir(dir)) != NULL) {
+		char path[PATH_MAX];
+		char *data;
+		size_t size;
+
+		if ((d->d_type != DT_DIR) ||
+		     !strcmp(".", d->d_name) ||
+		     !strcmp("..", d->d_name))
+			continue;
+
+		scnprintf(path, sizeof(path), "%s/%s/comm",
+			  procfs__mountpoint(), d->d_name);
+
+		if (filename__read_str(path, &data, &size))
+			continue;
+
+		ret = strncmp(name, data, len);
+		free(data);
+	}
+
+	closedir(dir);
+	return ret ? false : true;
+}
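
Usage sketch for set_term_quiet_input() added above: it disables canonical mode and echo so single keystrokes can be polled without blocking, leaving it to the caller to restore the saved state on exit. A standalone version using only termios(3):

#include <stdio.h>
#include <termios.h>
#include <unistd.h>

int main(void)
{
	struct termios saved, quiet;
	char c;

	tcgetattr(0, &saved);
	quiet = saved;
	quiet.c_lflag &= ~(ICANON | ECHO);	/* no line buffering, no echo */
	quiet.c_cc[VMIN] = 0;			/* read() may return 0 ...    */
	quiet.c_cc[VTIME] = 0;			/* ... immediately            */
	tcsetattr(0, TCSANOW, &quiet);

	if (read(0, &c, 1) == 1)
		printf("got '%c'\n", c);
	else
		printf("no key pending\n");

	tcsetattr(0, TCSANOW, &saved);		/* always restore */
	return 0;
}
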
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 6686436..d6a79b1 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -68,12 +68,13 @@
 #include <sys/socket.h>
 #include <sys/ioctl.h>
 #include <inttypes.h>
+#include <linux/kernel.h>
 #include <linux/magic.h>
 #include <linux/types.h>
 #include <sys/ttydefaults.h>
 #include <api/fs/debugfs.h>
 #include <termios.h>
 #include <linux/bitops.h>
 
 extern const char *graph_line;
 extern const char *graph_dotted_line;
@@ -307,6 +309,7 @@
 extern int cacheline_size;
 
 void get_term_dimensions(struct winsize *ws);
+void set_term_quiet_input(struct termios *old);
 
 struct parse_tag {
 	char tag;
@@ -317,6 +320,21 @@
 
 #define SRCLINE_UNKNOWN  ((char *) "??:0")
 
+static inline int path__join(char *bf, size_t size,
+			     const char *path1, const char *path2)
+{
+	return scnprintf(bf, size, "%s%s%s", path1, path1[0] ? "/" : "", path2);
+}
+
+static inline int path__join3(char *bf, size_t size,
+			      const char *path1, const char *path2,
+			      const char *path3)
+{
+	return scnprintf(bf, size, "%s%s%s%s%s",
+			 path1, path1[0] ? "/" : "",
+			 path2, path2[0] ? "/" : "", path3);
+}
+
 struct dso;
 
 char *get_srcline(struct dso *dso, unsigned long addr);
@@ -330,4 +348,5 @@
 void mem_bswap_32(void *src, int byte_size);
 
 const char *get_filename_for_perf_kvm(void);
+bool find_process(const char *name);
 #endif /* GIT_COMPAT_UTIL_H */
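
A note on find_process() above: it matches against /proc/<pid>/comm, which holds the thread's 15-character command name plus a trailing newline; comparing with strncmp() over the target's own length is what tolerates that newline (at the cost of also matching longer names with the same prefix). Reading one's own entry shows the format:

#include <stdio.h>

int main(void)
{
	char comm[64];
	FILE *f = fopen("/proc/self/comm", "r");

	if (f && fgets(comm, sizeof(comm), f))
		printf("comm: %s", comm);	/* includes the '\n' */
	if (f)
		fclose(f);
	return 0;
}
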