Merge tag 'm68k-for-v4.9-tag2' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k

Pull m68k fixes from Geert Uytterhoeven:

 - build fix for drivers calling ndelay() in a conditional block without
   curly braces

 - defconfig updates

* tag 'm68k-for-v4.9-tag2' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k:
  m68k: Fix ndelay() macro
  m68k/defconfig: Update defconfigs for v4.9-rc1
diff --git a/CREDITS b/CREDITS
index 513aaa3..d7ebdfb 100644
--- a/CREDITS
+++ b/CREDITS
@@ -9,7 +9,7 @@
 			Linus
 ----------
 
-M: Matt Mackal
+N: Matt Mackal
 E: mpm@selenic.com
 D: SLOB slab allocator
 
@@ -1864,10 +1864,11 @@
 
 N: Martin Kepplinger
 E: martink@posteo.de
-E: martin.kepplinger@theobroma-systems.com
+E: martin.kepplinger@ginzinger.com
 W: http://www.martinkepplinger.com
 D: mma8452 accelerators iio driver
-D: Kernel cleanups
+D: pegasus_notetaker input driver
+D: Kernel fixes and cleanups
 S: Garnisonstraße 26
 S: 4020 Linz
 S: Austria
@@ -1909,7 +1910,7 @@
 
 N: Andi Kleen
 E: andi@firstfloor.org
-U: http://www.halobates.de
+W: http://www.halobates.de
 D: network, x86, NUMA, various hacks
 S: Schwalbenstr. 96
 S: 85551 Ottobrunn
@@ -2088,8 +2089,8 @@
 D: Synopsys Designware PCI host bridge driver
 
 N: Gabor Kuti
-M: seasons@falcon.sch.bme.hu
-M: seasons@makosteszta.sote.hu
+E: seasons@falcon.sch.bme.hu
+E: seasons@makosteszta.sote.hu
 D: Original author of software suspend
 
 N: Jaroslav Kysela
diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl
index 4ba0a2a..640f65e 100644
--- a/Documentation/ABI/testing/sysfs-class-cxl
+++ b/Documentation/ABI/testing/sysfs-class-cxl
@@ -220,8 +220,11 @@
 Date:           October 2014
 Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    write only
-                Writing 1 will issue a PERST to card which may cause the card
-                to reload the FPGA depending on load_image_on_perst.
+                Writing 1 will issue a PERST to card provided there are no
+                contexts active on any one of the card AFUs. This may cause
+                the card to reload the FPGA depending on load_image_on_perst.
+                Writing -1 will do a force PERST irrespective of any active
+                contexts on the card AFUs.
 Users:		https://github.com/ibm-capi/libcxl
 
 What:		/sys/class/cxl/<card>/perst_reloads_same_image (not in a guest)
diff --git a/Documentation/ABI/testing/sysfs-devices-system-ibm-rtl b/Documentation/ABI/testing/sysfs-devices-system-ibm-rtl
index b82deea..470def0 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-ibm-rtl
+++ b/Documentation/ABI/testing/sysfs-devices-system-ibm-rtl
@@ -1,4 +1,4 @@
-What:           state
+What:           /sys/devices/system/ibm_rtl/state
 Date:           Sep 2010
 KernelVersion:  2.6.37
 Contact:        Vernon Mauery <vernux@us.ibm.com>
@@ -10,7 +10,7 @@
 Users:          The ibm-prtm userspace daemon uses this interface.
 
 
-What:           version
+What:           /sys/devices/system/ibm_rtl/version
 Date:           Sep 2010
 KernelVersion:  2.6.37
 Contact:        Vernon Mauery <vernux@us.ibm.com>
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index e5b6497..c75b64a 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -309,3 +309,4 @@
 	with a reshape in progress.
 1.9.0   Add support for RAID level takeover/reshape/region size
 	and set size reduction.
+1.9.1   Fix activation of existing RAID 4/10 mapped devices
diff --git a/Documentation/devicetree/bindings/clock/uniphier-clock.txt b/Documentation/devicetree/bindings/clock/uniphier-clock.txt
index c7179d3..8121630 100644
--- a/Documentation/devicetree/bindings/clock/uniphier-clock.txt
+++ b/Documentation/devicetree/bindings/clock/uniphier-clock.txt
@@ -24,7 +24,7 @@
 		reg = <0x61840000 0x4000>;
 
 		clock {
-			compatible = "socionext,uniphier-ld20-clock";
+			compatible = "socionext,uniphier-ld11-clock";
 			#clock-cells = <1>;
 		};
 
@@ -43,8 +43,8 @@
 21: USB3 ch1 PHY1
 
 
-Media I/O (MIO) clock
----------------------
+Media I/O (MIO) clock, SD clock
+-------------------------------
 
 Required properties:
 - compatible: should be one of the following:
@@ -52,10 +52,10 @@
     "socionext,uniphier-ld4-mio-clock"  - for LD4 SoC.
     "socionext,uniphier-pro4-mio-clock" - for Pro4 SoC.
     "socionext,uniphier-sld8-mio-clock" - for sLD8 SoC.
-    "socionext,uniphier-pro5-mio-clock" - for Pro5 SoC.
-    "socionext,uniphier-pxs2-mio-clock" - for PXs2/LD6b SoC.
+    "socionext,uniphier-pro5-sd-clock"  - for Pro5 SoC.
+    "socionext,uniphier-pxs2-sd-clock"  - for PXs2/LD6b SoC.
     "socionext,uniphier-ld11-mio-clock" - for LD11 SoC.
-    "socionext,uniphier-ld20-mio-clock" - for LD20 SoC.
+    "socionext,uniphier-ld20-sd-clock"  - for LD20 SoC.
 - #clock-cells: should be 1.
 
 Example:
@@ -66,7 +66,7 @@
 		reg = <0x59810000 0x800>;
 
 		clock {
-			compatible = "socionext,uniphier-ld20-mio-clock";
+			compatible = "socionext,uniphier-ld11-mio-clock";
 			#clock-cells = <1>;
 		};
 
@@ -112,7 +112,7 @@
 		reg = <0x59820000 0x200>;
 
 		clock {
-			compatible = "socionext,uniphier-ld20-peri-clock";
+			compatible = "socionext,uniphier-ld11-peri-clock";
 			#clock-cells = <1>;
 		};
 
diff --git a/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt b/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt
new file mode 100644
index 0000000..6f28969
--- /dev/null
+++ b/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt
@@ -0,0 +1,23 @@
+* Aspeed BT (Block Transfer) IPMI interface
+
+The Aspeed SOCs (AST2400 and AST2500) are commonly used as BMCs
+(BaseBoard Management Controllers) and the BT interface can be used to
+perform in-band IPMI communication with their host.
+
+Required properties:
+
+- compatible : should be "aspeed,ast2400-ibt-bmc"
+- reg: physical address and size of the registers
+
+Optional properties:
+
+- interrupts: interrupt generated by the BT interface. without an
+  interrupt, the driver will operate in poll mode.
+
+Example:
+
+	ibt@1e789140 {
+		compatible = "aspeed,ast2400-ibt-bmc";
+		reg = <0x1e789140 0x18>;
+		interrupts = <8>;
+	};
diff --git a/Documentation/devicetree/bindings/ipmi.txt b/Documentation/devicetree/bindings/ipmi/ipmi-smic.txt
similarity index 100%
rename from Documentation/devicetree/bindings/ipmi.txt
rename to Documentation/devicetree/bindings/ipmi/ipmi-smic.txt
diff --git a/Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt b/Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt
index 4e00e85..bfa461a 100644
--- a/Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt
+++ b/Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.txt
@@ -43,6 +43,9 @@
   reset signal present internally in some host controller IC designs.
   See Documentation/devicetree/bindings/reset/reset.txt for details.
 
+* reset-names: request name for using "resets" property. Must be "reset".
+	(It will be used together with "resets" property.)
+
 * clocks: from common clock binding: handle to biu and ciu clocks for the
   bus interface unit clock and the card interface unit clock.
 
@@ -103,6 +106,8 @@
 		interrupts = <0 75 0>;
 		#address-cells = <1>;
 		#size-cells = <0>;
+		resets = <&rst 20>;
+		reset-names = "reset";
 	};
 
 [board specific internal DMA resources]
diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt
index e1d7681..05150957 100644
--- a/Documentation/devicetree/bindings/net/ethernet.txt
+++ b/Documentation/devicetree/bindings/net/ethernet.txt
@@ -9,10 +9,26 @@
 - max-speed: number, specifies maximum speed in Mbit/s supported by the device;
 - max-frame-size: number, maximum transfer unit (IEEE defined MTU), rather than
   the maximum frame size (there's contradiction in ePAPR).
-- phy-mode: string, operation mode of the PHY interface; supported values are
-  "mii", "gmii", "sgmii", "qsgmii", "tbi", "rev-mii", "rmii", "rgmii", "rgmii-id",
-  "rgmii-rxid", "rgmii-txid", "rtbi", "smii", "xgmii", "trgmii"; this is now a
-  de-facto standard property;
+- phy-mode: string, operation mode of the PHY interface. This is now a de-facto
+  standard property; supported values are:
+  * "mii"
+  * "gmii"
+  * "sgmii"
+  * "qsgmii"
+  * "tbi"
+  * "rev-mii"
+  * "rmii"
+  * "rgmii" (RX and TX delays are added by the MAC when required)
+  * "rgmii-id" (RGMII with internal RX and TX delays provided by the PHY, the
+     MAC should not add the RX or TX delays in this case)
+  * "rgmii-rxid" (RGMII with internal RX delay provided by the PHY, the MAC
+     should not add an RX delay in this case)
+  * "rgmii-txid" (RGMII with internal TX delay provided by the PHY, the MAC
+     should not add an TX delay in this case)
+  * "rtbi"
+  * "smii"
+  * "xgmii"
+  * "trgmii"
 - phy-connection-type: the same as "phy-mode" property but described in ePAPR;
 - phy-handle: phandle, specifies a reference to a node representing a PHY
   device; this property is described in ePAPR and so preferred;
diff --git a/Documentation/devicetree/bindings/net/marvell-orion-net.txt b/Documentation/devicetree/bindings/net/marvell-orion-net.txt
index bce52b2..6fd988c 100644
--- a/Documentation/devicetree/bindings/net/marvell-orion-net.txt
+++ b/Documentation/devicetree/bindings/net/marvell-orion-net.txt
@@ -49,6 +49,7 @@
 and
 
  - phy-handle: See ethernet.txt file in the same directory.
+ - phy-mode: See ethernet.txt file in the same directory.
 
 or
 
diff --git a/Documentation/devicetree/bindings/pci/rockchip-pcie.txt b/Documentation/devicetree/bindings/pci/rockchip-pcie.txt
index ba67b39..71aeda1 100644
--- a/Documentation/devicetree/bindings/pci/rockchip-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/rockchip-pcie.txt
@@ -26,13 +26,16 @@
 	- "sys"
 	- "legacy"
 	- "client"
-- resets: Must contain five entries for each entry in reset-names.
+- resets: Must contain seven entries for each entry in reset-names.
 	   See ../reset/reset.txt for details.
 - reset-names: Must include the following names
 	- "core"
 	- "mgmt"
 	- "mgmt-sticky"
 	- "pipe"
+	- "pm"
+	- "aclk"
+	- "pclk"
 - pinctrl-names : The pin control state names
 - pinctrl-0: The "default" pinctrl state
 - #interrupt-cells: specifies the number of cells needed to encode an
@@ -86,8 +89,10 @@
 	reg = <0x0 0xf8000000 0x0 0x2000000>, <0x0 0xfd000000 0x0 0x1000000>;
 	reg-names = "axi-base", "apb-base";
 	resets = <&cru SRST_PCIE_CORE>, <&cru SRST_PCIE_MGMT>,
-		 <&cru SRST_PCIE_MGMT_STICKY>, <&cru SRST_PCIE_PIPE>;
-	reset-names = "core", "mgmt", "mgmt-sticky", "pipe";
+		 <&cru SRST_PCIE_MGMT_STICKY>, <&cru SRST_PCIE_PIPE> ,
+		 <&cru SRST_PCIE_PM>, <&cru SRST_P_PCIE>, <&cru SRST_A_PCIE>;
+	reset-names = "core", "mgmt", "mgmt-sticky", "pipe",
+		      "pm", "pclk", "aclk";
 	phys = <&pcie_phy>;
 	phy-names = "pcie-phy";
 	pinctrl-names = "default";
diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt
index 5e60ad1..2ad18c4 100644
--- a/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt
+++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt
@@ -43,7 +43,9 @@
 
 GPID0 GPID2 GPIE0 I2C10 I2C11 I2C12 I2C13 I2C14 I2C3 I2C4 I2C5 I2C6 I2C7 I2C8
 I2C9 MAC1LINK MDIO1 MDIO2 OSCCLK PEWAKE PWM0 PWM1 PWM2 PWM3 PWM4 PWM5 PWM6 PWM7
-RGMII1 RGMII2 RMII1 RMII2 SD1 SPI1 TIMER4 TIMER5 TIMER6 TIMER7 TIMER8
+RGMII1 RGMII2 RMII1 RMII2 SD1 SPI1 SPI1DEBUG SPI1PASSTHRU TIMER4 TIMER5 TIMER6
+TIMER7 TIMER8 VGABIOSROM
+
 
 Examples:
 
diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt
index f9753c4..b24583a 100644
--- a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt
@@ -14,11 +14,6 @@
  - #size-cells	: The value of this property must be 1
  - ranges	: defines mapping between pin controller node (parent) to
    gpio-bank node (children).
- - interrupt-parent: phandle of the interrupt parent to which the external
-   GPIO interrupts are forwarded to.
- - st,syscfg: Should be phandle/offset pair. The phandle to the syscon node
-   which includes IRQ mux selection register, and the offset of the IRQ mux
-   selection register.
  - pins-are-numbered: Specify the subnodes are using numbered pinmux to
    specify pins.
 
@@ -37,6 +32,11 @@
 
 Optional properties:
  - reset:	  : Reference to the reset controller
+ - interrupt-parent: phandle of the interrupt parent to which the external
+   GPIO interrupts are forwarded to.
+ - st,syscfg: Should be phandle/offset pair. The phandle to the syscon node
+   which includes IRQ mux selection register, and the offset of the IRQ mux
+   selection register.
 
 Example:
 #include <dt-bindings/pinctrl/stm32f429-pinfunc.h>
diff --git a/Documentation/devicetree/bindings/reset/uniphier-reset.txt b/Documentation/devicetree/bindings/reset/uniphier-reset.txt
index e6bbfcc..5020524 100644
--- a/Documentation/devicetree/bindings/reset/uniphier-reset.txt
+++ b/Documentation/devicetree/bindings/reset/uniphier-reset.txt
@@ -6,25 +6,25 @@
 
 Required properties:
 - compatible: should be one of the following:
-    "socionext,uniphier-sld3-reset" - for PH1-sLD3 SoC.
-    "socionext,uniphier-ld4-reset"  - for PH1-LD4 SoC.
-    "socionext,uniphier-pro4-reset" - for PH1-Pro4 SoC.
-    "socionext,uniphier-sld8-reset" - for PH1-sLD8 SoC.
-    "socionext,uniphier-pro5-reset" - for PH1-Pro5 SoC.
-    "socionext,uniphier-pxs2-reset" - for ProXstream2/PH1-LD6b SoC.
-    "socionext,uniphier-ld11-reset" - for PH1-LD11 SoC.
-    "socionext,uniphier-ld20-reset" - for PH1-LD20 SoC.
+    "socionext,uniphier-sld3-reset" - for sLD3 SoC.
+    "socionext,uniphier-ld4-reset"  - for LD4 SoC.
+    "socionext,uniphier-pro4-reset" - for Pro4 SoC.
+    "socionext,uniphier-sld8-reset" - for sLD8 SoC.
+    "socionext,uniphier-pro5-reset" - for Pro5 SoC.
+    "socionext,uniphier-pxs2-reset" - for PXs2/LD6b SoC.
+    "socionext,uniphier-ld11-reset" - for LD11 SoC.
+    "socionext,uniphier-ld20-reset" - for LD20 SoC.
 - #reset-cells: should be 1.
 
 Example:
 
 	sysctrl@61840000 {
-		compatible = "socionext,uniphier-ld20-sysctrl",
+		compatible = "socionext,uniphier-ld11-sysctrl",
 			     "simple-mfd", "syscon";
 		reg = <0x61840000 0x4000>;
 
 		reset {
-			compatible = "socionext,uniphier-ld20-reset";
+			compatible = "socionext,uniphier-ld11-reset";
 			#reset-cells = <1>;
 		};
 
@@ -32,30 +32,30 @@
 	};
 
 
-Media I/O (MIO) reset
----------------------
+Media I/O (MIO) reset, SD reset
+-------------------------------
 
 Required properties:
 - compatible: should be one of the following:
-    "socionext,uniphier-sld3-mio-reset" - for PH1-sLD3 SoC.
-    "socionext,uniphier-ld4-mio-reset"  - for PH1-LD4 SoC.
-    "socionext,uniphier-pro4-mio-reset" - for PH1-Pro4 SoC.
-    "socionext,uniphier-sld8-mio-reset" - for PH1-sLD8 SoC.
-    "socionext,uniphier-pro5-mio-reset" - for PH1-Pro5 SoC.
-    "socionext,uniphier-pxs2-mio-reset" - for ProXstream2/PH1-LD6b SoC.
-    "socionext,uniphier-ld11-mio-reset" - for PH1-LD11 SoC.
-    "socionext,uniphier-ld20-mio-reset" - for PH1-LD20 SoC.
+    "socionext,uniphier-sld3-mio-reset" - for sLD3 SoC.
+    "socionext,uniphier-ld4-mio-reset"  - for LD4 SoC.
+    "socionext,uniphier-pro4-mio-reset" - for Pro4 SoC.
+    "socionext,uniphier-sld8-mio-reset" - for sLD8 SoC.
+    "socionext,uniphier-pro5-sd-reset"  - for Pro5 SoC.
+    "socionext,uniphier-pxs2-sd-reset"  - for PXs2/LD6b SoC.
+    "socionext,uniphier-ld11-mio-reset" - for LD11 SoC.
+    "socionext,uniphier-ld20-sd-reset"  - for LD20 SoC.
 - #reset-cells: should be 1.
 
 Example:
 
 	mioctrl@59810000 {
-		compatible = "socionext,uniphier-ld20-mioctrl",
+		compatible = "socionext,uniphier-ld11-mioctrl",
 			     "simple-mfd", "syscon";
 		reg = <0x59810000 0x800>;
 
 		reset {
-			compatible = "socionext,uniphier-ld20-mio-reset";
+			compatible = "socionext,uniphier-ld11-mio-reset";
 			#reset-cells = <1>;
 		};
 
@@ -68,24 +68,24 @@
 
 Required properties:
 - compatible: should be one of the following:
-    "socionext,uniphier-ld4-peri-reset"  - for PH1-LD4 SoC.
-    "socionext,uniphier-pro4-peri-reset" - for PH1-Pro4 SoC.
-    "socionext,uniphier-sld8-peri-reset" - for PH1-sLD8 SoC.
-    "socionext,uniphier-pro5-peri-reset" - for PH1-Pro5 SoC.
-    "socionext,uniphier-pxs2-peri-reset" - for ProXstream2/PH1-LD6b SoC.
-    "socionext,uniphier-ld11-peri-reset" - for PH1-LD11 SoC.
-    "socionext,uniphier-ld20-peri-reset" - for PH1-LD20 SoC.
+    "socionext,uniphier-ld4-peri-reset"  - for LD4 SoC.
+    "socionext,uniphier-pro4-peri-reset" - for Pro4 SoC.
+    "socionext,uniphier-sld8-peri-reset" - for sLD8 SoC.
+    "socionext,uniphier-pro5-peri-reset" - for Pro5 SoC.
+    "socionext,uniphier-pxs2-peri-reset" - for PXs2/LD6b SoC.
+    "socionext,uniphier-ld11-peri-reset" - for LD11 SoC.
+    "socionext,uniphier-ld20-peri-reset" - for LD20 SoC.
 - #reset-cells: should be 1.
 
 Example:
 
 	perictrl@59820000 {
-		compatible = "socionext,uniphier-ld20-perictrl",
+		compatible = "socionext,uniphier-ld11-perictrl",
 			     "simple-mfd", "syscon";
 		reg = <0x59820000 0x200>;
 
 		reset {
-			compatible = "socionext,uniphier-ld20-peri-reset";
+			compatible = "socionext,uniphier-ld11-peri-reset";
 			#reset-cells = <1>;
 		};
 
diff --git a/Documentation/devicetree/bindings/serial/cdns,uart.txt b/Documentation/devicetree/bindings/serial/cdns,uart.txt
index a3eb154..227bb77 100644
--- a/Documentation/devicetree/bindings/serial/cdns,uart.txt
+++ b/Documentation/devicetree/bindings/serial/cdns,uart.txt
@@ -1,7 +1,9 @@
 Binding for Cadence UART Controller
 
 Required properties:
-- compatible : should be "cdns,uart-r1p8", or "xlnx,xuartps"
+- compatible :
+  Use "xlnx,xuartps","cdns,uart-r1p8" for Zynq-7xxx SoC.
+  Use "xlnx,zynqmp-uart","cdns,uart-r1p12" for Zynq Ultrascale+ MPSoC.
 - reg: Should contain UART controller registers location and length.
 - interrupts: Should contain UART controller interrupts.
 - clocks: Must contain phandles to the UART clocks
diff --git a/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt b/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt
index 1e4000d..8d27d1a 100644
--- a/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt
+++ b/Documentation/devicetree/bindings/serial/renesas,sci-serial.txt
@@ -9,6 +9,14 @@
     - "renesas,scifb-r8a73a4" for R8A73A4 (R-Mobile APE6) SCIFB compatible UART.
     - "renesas,scifa-r8a7740" for R8A7740 (R-Mobile A1) SCIFA compatible UART.
     - "renesas,scifb-r8a7740" for R8A7740 (R-Mobile A1) SCIFB compatible UART.
+    - "renesas,scif-r8a7743" for R8A7743 (RZ/G1M) SCIF compatible UART.
+    - "renesas,scifa-r8a7743" for R8A7743 (RZ/G1M) SCIFA compatible UART.
+    - "renesas,scifb-r8a7743" for R8A7743 (RZ/G1M) SCIFB compatible UART.
+    - "renesas,hscif-r8a7743" for R8A7743 (RZ/G1M) HSCIF compatible UART.
+    - "renesas,scif-r8a7745" for R8A7745 (RZ/G1E) SCIF compatible UART.
+    - "renesas,scifa-r8a7745" for R8A7745 (RZ/G1E) SCIFA compatible UART.
+    - "renesas,scifb-r8a7745" for R8A7745 (RZ/G1E) SCIFB compatible UART.
+    - "renesas,hscif-r8a7745" for R8A7745 (RZ/G1E) HSCIF compatible UART.
     - "renesas,scif-r8a7778" for R8A7778 (R-Car M1) SCIF compatible UART.
     - "renesas,scif-r8a7779" for R8A7779 (R-Car H1) SCIF compatible UART.
     - "renesas,scif-r8a7790" for R8A7790 (R-Car H2) SCIF compatible UART.
diff --git a/Documentation/devicetree/bindings/sound/omap-abe-twl6040.txt b/Documentation/devicetree/bindings/sound/omap-abe-twl6040.txt
index fd40c85..462b04e8 100644
--- a/Documentation/devicetree/bindings/sound/omap-abe-twl6040.txt
+++ b/Documentation/devicetree/bindings/sound/omap-abe-twl6040.txt
@@ -12,7 +12,7 @@
 
 Optional properties:
 - ti,dmic: phandle for the OMAP dmic node if the machine have it connected
-- ti,jack_detection: Need to be present if the board capable to detect jack
+- ti,jack-detection: Need to be present if the board capable to detect jack
   insertion, removal.
 
 Available audio endpoints for the audio-routing table:
diff --git a/Documentation/devicetree/bindings/timer/jcore,pit.txt b/Documentation/devicetree/bindings/timer/jcore,pit.txt
new file mode 100644
index 0000000..af5dd35
--- /dev/null
+++ b/Documentation/devicetree/bindings/timer/jcore,pit.txt
@@ -0,0 +1,24 @@
+J-Core Programmable Interval Timer and Clocksource
+
+Required properties:
+
+- compatible: Must be "jcore,pit".
+
+- reg: Memory region(s) for timer/clocksource registers. For SMP,
+  there should be one region per cpu, indexed by the sequential,
+  zero-based hardware cpu number.
+
+- interrupts: An interrupt to assign for the timer. The actual pit
+  core is integrated with the aic and allows the timer interrupt
+  assignment to be programmed by software, but this property is
+  required in order to reserve an interrupt number that doesn't
+  conflict with other devices.
+
+
+Example:
+
+timer@200 {
+	compatible = "jcore,pit";
+	reg = < 0x200 0x30 0x500 0x30 >;
+	interrupts = < 0x48 >;
+};
diff --git a/Documentation/devicetree/bindings/usb/dwc2.txt b/Documentation/devicetree/bindings/usb/dwc2.txt
index 455f2c3..2c30a54 100644
--- a/Documentation/devicetree/bindings/usb/dwc2.txt
+++ b/Documentation/devicetree/bindings/usb/dwc2.txt
@@ -28,10 +28,7 @@
 - g-use-dma: enable dma usage in gadget driver.
 - g-rx-fifo-size: size of rx fifo size in gadget mode.
 - g-np-tx-fifo-size: size of non-periodic tx fifo size in gadget mode.
-
-Deprecated properties:
-- g-tx-fifo-size: size of periodic tx fifo per endpoint (except ep0)
-  in gadget mode.
+- g-tx-fifo-size: size of periodic tx fifo per endpoint (except ep0) in gadget mode.
 
 Example:
 
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 14cdc10..1b5f156 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -447,7 +447,6 @@
 	int (*flush) (struct file *);
 	int (*release) (struct inode *, struct file *);
 	int (*fsync) (struct file *, loff_t start, loff_t end, int datasync);
-	int (*aio_fsync) (struct kiocb *, int datasync);
 	int (*fasync) (int, struct file *, int);
 	int (*lock) (struct file *, int, struct file_lock *);
 	ssize_t (*readv) (struct file *, const struct iovec *, unsigned long,
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 219ffd4..74329fd 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -395,32 +395,6 @@
 
  or if empty, the mapping is anonymous.
 
-The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint
-of the individual tasks of a process. In this file you will see a mapping marked
-as [stack] if that task sees it as a stack. Hence, for the example above, the
-task-level map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
-
-08048000-08049000 r-xp 00000000 03:00 8312       /opt/test
-08049000-0804a000 rw-p 00001000 03:00 8312       /opt/test
-0804a000-0806b000 rw-p 00000000 00:00 0          [heap]
-a7cb1000-a7cb2000 ---p 00000000 00:00 0
-a7cb2000-a7eb2000 rw-p 00000000 00:00 0
-a7eb2000-a7eb3000 ---p 00000000 00:00 0
-a7eb3000-a7ed5000 rw-p 00000000 00:00 0          [stack]
-a7ed5000-a8008000 r-xp 00000000 03:00 4222       /lib/libc.so.6
-a8008000-a800a000 r--p 00133000 03:00 4222       /lib/libc.so.6
-a800a000-a800b000 rw-p 00135000 03:00 4222       /lib/libc.so.6
-a800b000-a800e000 rw-p 00000000 00:00 0
-a800e000-a8022000 r-xp 00000000 03:00 14462      /lib/libpthread.so.0
-a8022000-a8023000 r--p 00013000 03:00 14462      /lib/libpthread.so.0
-a8023000-a8024000 rw-p 00014000 03:00 14462      /lib/libpthread.so.0
-a8024000-a8027000 rw-p 00000000 00:00 0
-a8027000-a8043000 r-xp 00000000 03:00 8317       /lib/ld-linux.so.2
-a8043000-a8044000 r--p 0001b000 03:00 8317       /lib/ld-linux.so.2
-a8044000-a8045000 rw-p 0001c000 03:00 8317       /lib/ld-linux.so.2
-aff35000-aff4a000 rw-p 00000000 00:00 0
-ffffe000-fffff000 r-xp 00000000 00:00 0          [vdso]
-
 The /proc/PID/smaps is an extension based on maps, showing the memory
 consumption for each of the process's mappings. For each of mappings there
 is a series of lines such as the following:
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index d619c8d..b5039a0 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -828,7 +828,6 @@
 	int (*flush) (struct file *, fl_owner_t id);
 	int (*release) (struct inode *, struct file *);
 	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
-	int (*aio_fsync) (struct kiocb *, int datasync);
 	int (*fasync) (int, struct file *, int);
 	int (*lock) (struct file *, int, struct file_lock *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
diff --git a/Documentation/gpio/board.txt b/Documentation/gpio/board.txt
index 40884c4..a0f6189 100644
--- a/Documentation/gpio/board.txt
+++ b/Documentation/gpio/board.txt
@@ -6,7 +6,7 @@
 description of the deprecated integer-based GPIO interface please refer to
 gpio-legacy.txt (actually, there is no real mapping possible with the old
 interface; you just fetch an integer from somewhere and request the
-corresponding GPIO.
+corresponding GPIO).
 
 All platforms can enable the GPIO library, but if the platform strictly
 requires GPIO functionality to be present, it needs to select GPIOLIB from its
@@ -162,6 +162,9 @@
 
 Since the "led" GPIOs are mapped as active-high, this example will switch their
 signals to 1, i.e. enabling the LEDs. And for the "power" GPIO, which is mapped
-as active-low, its actual signal will be 0 after this code. Contrary to the legacy
-integer GPIO interface, the active-low property is handled during mapping and is
-thus transparent to GPIO consumers.
+as active-low, its actual signal will be 0 after this code. Contrary to the
+legacy integer GPIO interface, the active-low property is handled during
+mapping and is thus transparent to GPIO consumers.
+
+A set of functions such as gpiod_set_value() is available to work with
+the new descriptor-oriented interface.
diff --git a/Documentation/i2c/i2c-topology b/Documentation/i2c/i2c-topology
index e0aefee..1a014fe 100644
--- a/Documentation/i2c/i2c-topology
+++ b/Documentation/i2c/i2c-topology
@@ -326,7 +326,7 @@
 
 This is a good topology.
 
-                                   .--------.
+                                    .--------.
                    .----------.  .--| dev D1 |
                    |  parent- |--'  '--------'
                 .--|  locked  |     .--------.
@@ -350,7 +350,7 @@
 
 This is a good topology.
 
-                                   .--------.
+                                    .--------.
                    .----------.  .--| dev D1 |
                    |   mux-   |--'  '--------'
                 .--|  locked  |     .--------.
diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
index 6d6c07c..63912ef3 100644
--- a/Documentation/networking/dsa/dsa.txt
+++ b/Documentation/networking/dsa/dsa.txt
@@ -67,13 +67,14 @@
 Switch tagging protocols
 ------------------------
 
-DSA currently supports 4 different tagging protocols, and a tag-less mode as
+DSA currently supports 5 different tagging protocols, and a tag-less mode as
 well. The different protocols are implemented in:
 
 net/dsa/tag_trailer.c: Marvell's 4 trailer tag mode (legacy)
 net/dsa/tag_dsa.c: Marvell's original DSA tag
 net/dsa/tag_edsa.c: Marvell's enhanced DSA tag
 net/dsa/tag_brcm.c: Broadcom's 4 bytes tag
+net/dsa/tag_qca.c: Qualcomm's 2 bytes tag
 
 The exact format of the tag protocol is vendor specific, but in general, they
 all contain something which:
diff --git a/Documentation/networking/netdev-FAQ.txt b/Documentation/networking/netdev-FAQ.txt
index 0fe1c6e..a20b2fa 100644
--- a/Documentation/networking/netdev-FAQ.txt
+++ b/Documentation/networking/netdev-FAQ.txt
@@ -29,8 +29,8 @@
    Linus, and net-next is where the new code goes for the future release.
    You can find the trees here:
 
-	http://git.kernel.org/?p=linux/kernel/git/davem/net.git
-	http://git.kernel.org/?p=linux/kernel/git/davem/net-next.git
+        https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git
+        https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git
 
 Q: How often do changes from these trees make it to the mainline Linus tree?
 
@@ -76,7 +76,7 @@
 
 A: Load the mainline (Linus) page here:
 
-	http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git
+	https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
 
    and note the top of the "tags" section.  If it is rc1, it is early
    in the dev cycle.  If it was tagged rc7 a week ago, then a release
@@ -123,7 +123,7 @@
 
    It contains the patches which Dave has selected, but not yet handed
    off to Greg.  If Greg already has the patch, then it will be here:
-	http://git.kernel.org/cgit/linux/kernel/git/stable/stable-queue.git
+	https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git
 
    A quick way to find whether the patch is in this stable-queue is
    to simply clone the repo, and then git grep the mainline commit ID, e.g.
diff --git a/Documentation/networking/nf_conntrack-sysctl.txt b/Documentation/networking/nf_conntrack-sysctl.txt
index 4fb51d3..433b672 100644
--- a/Documentation/networking/nf_conntrack-sysctl.txt
+++ b/Documentation/networking/nf_conntrack-sysctl.txt
@@ -33,24 +33,6 @@
 	If this option is enabled, the connection tracking code will
 	provide userspace with connection tracking events via ctnetlink.
 
-nf_conntrack_events_retry_timeout - INTEGER (seconds)
-	default 15
-
-	This option is only relevant when "reliable connection tracking
-	events" are used.  Normally, ctnetlink is "lossy", that is,
-	events are normally dropped when userspace listeners can't keep up.
-
-	Userspace can request "reliable event mode".  When this mode is
-	active, the conntrack will only be destroyed after the event was
-	delivered.  If event delivery fails, the kernel periodically
-	re-tries to send the event to userspace.
-
-	This is the maximum interval the kernel should use when re-trying
-	to deliver the destroy event.
-
-	A higher number means there will be fewer delivery retries and it
-	will take longer for a backlog to be processed.
-
 nf_conntrack_expect_max - INTEGER
 	Maximum size of expectation table.  Default value is
 	nf_conntrack_buckets / 256. Minimum is 1.
@@ -80,10 +62,13 @@
 	protocols.
 
 nf_conntrack_helper - BOOLEAN
-	0 - disabled
-	not 0 - enabled (default)
+	0 - disabled (default)
+	not 0 - enabled
 
 	Enable automatic conntrack helper assignment.
+	If disabled it is required to set up iptables rules to assign
+	helpers to connections.  See the CT target description in the
+	iptables-extensions(8) man page for further information.
 
 nf_conntrack_icmp_timeout - INTEGER (seconds)
 	default 30
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 739db9a..6bbceb9 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -777,6 +777,17 @@
 conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity on scenarios
 such as migration.
 
+When KVM_CAP_ADJUST_CLOCK is passed to KVM_CHECK_EXTENSION, it returns the
+set of bits that KVM can return in struct kvm_clock_data's flag member.
+
+The only flag defined now is KVM_CLOCK_TSC_STABLE.  If set, the returned
+value is the exact kvmclock value seen by all VCPUs at the instant
+when KVM_GET_CLOCK was called.  If clear, the returned value is simply
+CLOCK_MONOTONIC plus a constant offset; the offset can be modified
+with KVM_SET_CLOCK.  KVM will try to make all VCPUs follow this clock,
+but the exact value read by each VCPU could differ, because the host
+TSC is not stable.
+
 struct kvm_clock_data {
 	__u64 clock;  /* kvmclock current value */
 	__u32 flags;
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index f2491a8..e5dd9f4 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -4,7 +4,17 @@
 1. Acquisition Orders
 ---------------------
 
-(to be written)
+The acquisition orders for mutexes are as follows:
+
+- kvm->lock is taken outside vcpu->mutex
+
+- kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock
+
+- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
+  them together is quite rare.
+
+For spinlocks, kvm_lock is taken outside kvm->mmu_lock.  Everything
+else is a leaf: no other lock is taken inside the critical sections.
 
 2: Exception
 ------------
diff --git a/MAINTAINERS b/MAINTAINERS
index 1cd38a7..63cefa6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -77,6 +77,7 @@
 	Q: Patchwork web based patch tracking system site
 	T: SCM tree type and location.
 	   Type is one of: git, hg, quilt, stgit, topgit
+	B: Bug tracking system location.
 	S: Status, one of the following:
 	   Supported:	Someone is actually paid to look after this.
 	   Maintained:	Someone actually looks after it.
@@ -281,6 +282,7 @@
 W:	https://01.org/linux-acpi
 Q:	https://patchwork.kernel.org/project/linux-acpi/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
+B:	https://bugzilla.kernel.org
 S:	Supported
 F:	drivers/acpi/
 F:	drivers/pnp/pnpacpi/
@@ -304,6 +306,8 @@
 W:	https://github.com/acpica/acpica/
 Q:	https://patchwork.kernel.org/project/linux-acpi/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
+B:	https://bugzilla.kernel.org
+B:	https://bugs.acpica.org
 S:	Supported
 F:	drivers/acpi/acpica/
 F:	include/acpi/
@@ -313,6 +317,7 @@
 M:	Zhang Rui <rui.zhang@intel.com>
 L:	linux-acpi@vger.kernel.org
 W:	https://01.org/linux-acpi
+B:	https://bugzilla.kernel.org
 S:	Supported
 F:	drivers/acpi/fan.c
 
@@ -328,6 +333,7 @@
 M:	Zhang Rui <rui.zhang@intel.com>
 L:	linux-acpi@vger.kernel.org
 W:	https://01.org/linux-acpi
+B:	https://bugzilla.kernel.org
 S:	Supported
 F:	drivers/acpi/*thermal*
 
@@ -335,6 +341,7 @@
 M:	Zhang Rui <rui.zhang@intel.com>
 L:	linux-acpi@vger.kernel.org
 W:	https://01.org/linux-acpi
+B:	https://bugzilla.kernel.org
 S:	Supported
 F:	drivers/acpi/acpi_video.c
 
@@ -1442,6 +1449,7 @@
 F:	arch/arm/configs/mvebu_*_defconfig
 
 ARM/Marvell Berlin SoC support
+M:	Jisheng Zhang <jszhang@marvell.com>
 M:	Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
@@ -2551,15 +2559,18 @@
 F:	drivers/net/ethernet/broadcom/genet/
 
 BROADCOM BNX2 GIGABIT ETHERNET DRIVER
-M:	Sony Chacko <sony.chacko@qlogic.com>
-M:	Dept-HSGLinuxNICDev@qlogic.com
+M:	Rasesh Mody <rasesh.mody@cavium.com>
+M:	Harish Patil <harish.patil@cavium.com>
+M:	Dept-GELinuxNICDev@cavium.com
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/broadcom/bnx2.*
 F:	drivers/net/ethernet/broadcom/bnx2_*
 
 BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER
-M:	Ariel Elior <ariel.elior@qlogic.com>
+M:	Yuval Mintz <Yuval.Mintz@cavium.com>
+M:	Ariel Elior <ariel.elior@cavium.com>
+M:	everest-linux-l2@cavium.com
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/broadcom/bnx2x/
@@ -2766,7 +2777,9 @@
 F:	drivers/scsi/bfa/
 
 BROCADE BNA 10 GIGABIT ETHERNET DRIVER
-M:	Rasesh Mody <rasesh.mody@qlogic.com>
+M:	Rasesh Mody <rasesh.mody@cavium.com>
+M:	Sudarsana Kalluru <sudarsana.kalluru@cavium.com>
+M:	Dept-GELinuxNICDev@cavium.com
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/brocade/bna/
@@ -4620,8 +4633,9 @@
 
 EXTENSIBLE FIRMWARE INTERFACE (EFI)
 M:	Matt Fleming <matt@codeblueprint.co.uk>
+M:	Ard Biesheuvel <ard.biesheuvel@linaro.org>
 L:	linux-efi@vger.kernel.org
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mfleming/efi.git
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git
 S:	Maintained
 F:	Documentation/efi-stub.txt
 F:	arch/ia64/kernel/efi.c
@@ -5286,6 +5300,12 @@
 S:	Maintained
 F:	scripts/get_maintainer.pl
 
+GENWQE (IBM Generic Workqueue Card)
+M:	Frank Haverkamp <haver@linux.vnet.ibm.com>
+M:	Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
+S:	Supported
+F:	drivers/misc/genwqe/
+
 GFS2 FILE SYSTEM
 M:	Steven Whitehouse <swhiteho@redhat.com>
 M:	Bob Peterson <rpeterso@redhat.com>
@@ -5650,6 +5670,7 @@
 M:	"Rafael J. Wysocki" <rjw@rjwysocki.net>
 M:	Pavel Machek <pavel@ucw.cz>
 L:	linux-pm@vger.kernel.org
+B:	https://bugzilla.kernel.org
 S:	Supported
 F:	arch/x86/power/
 F:	drivers/base/power/
@@ -7071,6 +7092,7 @@
 LED SUBSYSTEM
 M:	Richard Purdie <rpurdie@rpsys.net>
 M:	Jacek Anaszewski <j.anaszewski@samsung.com>
+M:	Pavel Machek <pavel@ucw.cz>
 L:	linux-leds@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/j.anaszewski/linux-leds.git
 S:	Maintained
@@ -7912,6 +7934,10 @@
 MEMORY TECHNOLOGY DEVICES (MTD)
 M:	David Woodhouse <dwmw2@infradead.org>
 M:	Brian Norris <computersforpeace@gmail.com>
+M:	Boris Brezillon <boris.brezillon@free-electrons.com>
+M:	Marek Vasut <marek.vasut@gmail.com>
+M:	Richard Weinberger <richard@nod.at>
+M:	Cyrille Pitchen <cyrille.pitchen@atmel.com>
 L:	linux-mtd@lists.infradead.org
 W:	http://www.linux-mtd.infradead.org/
 Q:	http://patchwork.ozlabs.org/project/linux-mtd/list/
@@ -8040,6 +8066,7 @@
 F:	include/linux/mlx4/
 
 MELLANOX MLX5 core VPI driver
+M:	Saeed Mahameed <saeedm@mellanox.com>
 M:	Matan Barak <matanb@mellanox.com>
 M:	Leon Romanovsky <leonro@mellanox.com>
 L:	netdev@vger.kernel.org
@@ -8099,6 +8126,7 @@
 F:	drivers/media/dvb-frontends/mn88473*
 
 MODULE SUPPORT
+M:	Jessica Yu <jeyu@redhat.com>
 M:	Rusty Russell <rusty@rustcorp.com.au>
 S:	Maintained
 F:	include/linux/module.h
@@ -8212,7 +8240,7 @@
 MULTIMEDIA CARD (MMC), SECURE DIGITAL (SD) AND SDIO SUBSYSTEM
 M:	Ulf Hansson <ulf.hansson@linaro.org>
 L:	linux-mmc@vger.kernel.org
-T:	git git://git.linaro.org/people/ulf.hansson/mmc.git
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git
 S:	Maintained
 F:	Documentation/devicetree/bindings/mmc/
 F:	drivers/mmc/
@@ -8508,11 +8536,10 @@
 F:	drivers/net/wireless/
 
 NETXEN (1/10) GbE SUPPORT
-M:	Manish Chopra <manish.chopra@qlogic.com>
-M:	Sony Chacko <sony.chacko@qlogic.com>
-M:	Rajesh Borundia <rajesh.borundia@qlogic.com>
+M:	Manish Chopra <manish.chopra@cavium.com>
+M:	Rahul Verma <rahul.verma@cavium.com>
+M:	Dept-GELinuxNICDev@cavium.com
 L:	netdev@vger.kernel.org
-W:	http://www.qlogic.com
 S:	Supported
 F:	drivers/net/ethernet/qlogic/netxen/
 
@@ -9230,11 +9257,12 @@
 F:	drivers/pci/host/*layerscape*
 
 PCI DRIVER FOR IMX6
-M:	Richard Zhu <Richard.Zhu@freescale.com>
+M:	Richard Zhu <hongxing.zhu@nxp.com>
 M:	Lucas Stach <l.stach@pengutronix.de>
 L:	linux-pci@vger.kernel.org
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
+F:	Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt
 F:	drivers/pci/host/*imx6*
 
 PCI DRIVER FOR TI KEYSTONE
@@ -9293,17 +9321,11 @@
 
 PCI DRIVER FOR SYNOPSIS DESIGNWARE
 M:	Jingoo Han <jingoohan1@gmail.com>
-M:	Pratyush Anand <pratyush.anand@gmail.com>
-L:	linux-pci@vger.kernel.org
-S:	Maintained
-F:	drivers/pci/host/*designware*
-
-PCI DRIVER FOR SYNOPSYS PROTOTYPING DEVICE
-M:	Joao Pinto <jpinto@synopsys.com>
+M:	Joao Pinto <Joao.Pinto@synopsys.com>
 L:	linux-pci@vger.kernel.org
 S:	Maintained
 F:	Documentation/devicetree/bindings/pci/designware-pcie.txt
-F:	drivers/pci/host/pcie-designware-plat.c
+F:	drivers/pci/host/*designware*
 
 PCI DRIVER FOR GENERIC OF HOSTS
 M:	Will Deacon <will.deacon@arm.com>
@@ -9318,7 +9340,7 @@
 M:	Keith Busch <keith.busch@intel.com>
 L:	linux-pci@vger.kernel.org
 S:	Supported
-F:	arch/x86/pci/vmd.c
+F:	drivers/pci/host/vmd.c
 
 PCIE DRIVER FOR ST SPEAR13XX
 M:	Pratyush Anand <pratyush.anand@gmail.com>
@@ -9605,6 +9627,7 @@
 M:	"Rafael J. Wysocki" <rjw@rjwysocki.net>
 L:	linux-pm@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
+B:	https://bugzilla.kernel.org
 S:	Supported
 F:	drivers/base/power/
 F:	include/linux/pm.h
@@ -9888,33 +9911,32 @@
 F:	drivers/scsi/qla4xxx/
 
 QLOGIC QLA3XXX NETWORK DRIVER
-M:	Jitendra Kalsaria <jitendra.kalsaria@qlogic.com>
-M:	Ron Mercer <ron.mercer@qlogic.com>
-M:	linux-driver@qlogic.com
+M:	Dept-GELinuxNICDev@cavium.com
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	Documentation/networking/LICENSE.qla3xxx
 F:	drivers/net/ethernet/qlogic/qla3xxx.*
 
 QLOGIC QLCNIC (1/10)Gb ETHERNET DRIVER
-M:	Dept-GELinuxNICDev@qlogic.com
+M:	Harish Patil <harish.patil@cavium.com>
+M:	Manish Chopra <manish.chopra@cavium.com>
+M:	Dept-GELinuxNICDev@cavium.com
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/qlogic/qlcnic/
 
 QLOGIC QLGE 10Gb ETHERNET DRIVER
-M:	Harish Patil <harish.patil@qlogic.com>
-M:	Sudarsana Kalluru <sudarsana.kalluru@qlogic.com>
-M:	Dept-GELinuxNICDev@qlogic.com
-M:	linux-driver@qlogic.com
+M:	Harish Patil <harish.patil@cavium.com>
+M:	Manish Chopra <manish.chopra@cavium.com>
+M:	Dept-GELinuxNICDev@cavium.com
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/qlogic/qlge/
 
 QLOGIC QL4xxx ETHERNET DRIVER
-M:	Yuval Mintz <Yuval.Mintz@qlogic.com>
-M:	Ariel Elior <Ariel.Elior@qlogic.com>
-M:	everest-linux-l2@qlogic.com
+M:	Yuval Mintz <Yuval.Mintz@cavium.com>
+M:	Ariel Elior <Ariel.Elior@cavium.com>
+M:	everest-linux-l2@cavium.com
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/qlogic/qed/
@@ -11392,6 +11414,17 @@
 S:	Maintained
 F:	drivers/clk/spear/
 
+SPI NOR SUBSYSTEM
+M:	Cyrille Pitchen <cyrille.pitchen@atmel.com>
+M:	Marek Vasut <marek.vasut@gmail.com>
+L:	linux-mtd@lists.infradead.org
+W:	http://www.linux-mtd.infradead.org/
+Q:	http://patchwork.ozlabs.org/project/linux-mtd/list/
+T:	git git://github.com/spi-nor/linux.git
+S:	Maintained
+F:	drivers/mtd/spi-nor/
+F:	include/linux/mtd/spi-nor.h
+
 SPI SUBSYSTEM
 M:	Mark Brown <broonie@kernel.org>
 L:	linux-spi@vger.kernel.org
@@ -11584,6 +11617,7 @@
 M:	Len Brown <len.brown@intel.com>
 M:	Pavel Machek <pavel@ucw.cz>
 L:	linux-pm@vger.kernel.org
+B:	https://bugzilla.kernel.org
 S:	Supported
 F:	Documentation/power/
 F:	arch/x86/kernel/acpi/
@@ -12771,6 +12805,7 @@
 
 VIRTIO CORE, NET AND BLOCK DRIVERS
 M:	"Michael S. Tsirkin" <mst@redhat.com>
+M:	Jason Wang <jasowang@redhat.com>
 L:	virtualization@lists.linux-foundation.org
 S:	Maintained
 F:	Documentation/devicetree/bindings/virtio/
@@ -12801,6 +12836,7 @@
 
 VIRTIO HOST (VHOST)
 M:	"Michael S. Tsirkin" <mst@redhat.com>
+M:	Jason Wang <jasowang@redhat.com>
 L:	kvm@vger.kernel.org
 L:	virtualization@lists.linux-foundation.org
 L:	netdev@vger.kernel.org
diff --git a/Makefile b/Makefile
index 512e47a..369099d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 4
 PATCHLEVEL = 9
 SUBLEVEL = 0
-EXTRAVERSION = -rc1
+EXTRAVERSION = -rc8
 NAME = Psychotic Stoned Sheep
 
 # *DOCUMENTATION*
@@ -370,7 +370,7 @@
 CFLAGS_KERNEL	=
 AFLAGS_KERNEL	=
 LDFLAGS_vmlinux =
-CFLAGS_GCOV	= -fprofile-arcs -ftest-coverage -fno-tree-loop-im
+CFLAGS_GCOV	= -fprofile-arcs -ftest-coverage -fno-tree-loop-im -Wno-maybe-uninitialized
 CFLAGS_KCOV	:= $(call cc-option,-fsanitize-coverage=trace-pc,)
 
 
@@ -399,11 +399,12 @@
 		   -fno-strict-aliasing -fno-common \
 		   -Werror-implicit-function-declaration \
 		   -Wno-format-security \
-		   -std=gnu89
+		   -std=gnu89 $(call cc-option,-fno-PIE)
+
 
 KBUILD_AFLAGS_KERNEL :=
 KBUILD_CFLAGS_KERNEL :=
-KBUILD_AFLAGS   := -D__ASSEMBLY__
+KBUILD_AFLAGS   := -D__ASSEMBLY__ $(call cc-option,-fno-PIE)
 KBUILD_AFLAGS_MODULE  := -DMODULE
 KBUILD_CFLAGS_MODULE  := -DMODULE
 KBUILD_LDFLAGS_MODULE := -T $(srctree)/scripts/module-common.lds
@@ -606,6 +607,13 @@
 include/config/auto.conf: ;
 endif # $(dot-config)
 
+# For the kernel to actually contain only the needed exported symbols,
+# we have to build modules as well to determine what those symbols are.
+# (this can be evaluated only once include/config/auto.conf has been included)
+ifdef CONFIG_TRIM_UNUSED_KSYMS
+  KBUILD_MODULES := 1
+endif
+
 # The all: target is the default when no target is given on the
 # command line.
 # This allow a user to issue only 'make' to build a kernel including modules
@@ -620,7 +628,6 @@
 include arch/$(SRCARCH)/Makefile
 
 KBUILD_CFLAGS	+= $(call cc-option,-fno-delete-null-pointer-checks,)
-KBUILD_CFLAGS	+= $(call cc-disable-warning,maybe-uninitialized,)
 KBUILD_CFLAGS	+= $(call cc-disable-warning,frame-address,)
 
 ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
@@ -629,15 +636,18 @@
 endif
 
 ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
-KBUILD_CFLAGS	+= -Os
+KBUILD_CFLAGS	+= -Os $(call cc-disable-warning,maybe-uninitialized,)
 else
 ifdef CONFIG_PROFILE_ALL_BRANCHES
-KBUILD_CFLAGS	+= -O2
+KBUILD_CFLAGS	+= -O2 $(call cc-disable-warning,maybe-uninitialized,)
 else
 KBUILD_CFLAGS   += -O2
 endif
 endif
 
+KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0409, \
+			$(call cc-disable-warning,maybe-uninitialized,))
+
 # Tell gcc to never replace conditional load with a non-conditional one
 KBUILD_CFLAGS	+= $(call cc-option,--param=allow-store-data-races=0)
 
@@ -941,7 +951,7 @@
 endif
 ifdef CONFIG_TRIM_UNUSED_KSYMS
 	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/adjust_autoksyms.sh \
-	  "$(MAKE) KBUILD_MODULES=1 -f $(srctree)/Makefile vmlinux_prereq"
+	  "$(MAKE) -f $(srctree)/Makefile vmlinux"
 endif
 
 # standalone target for easier testing
@@ -1016,8 +1026,6 @@
 prepare1: prepare2 $(version_h) include/generated/utsrelease.h \
                    include/config/auto.conf
 	$(cmd_crmodverdir)
-	$(Q)test -e include/generated/autoksyms.h || \
-	    touch   include/generated/autoksyms.h
 
 archprepare: archheaders archscripts prepare1 scripts_basic
 
diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index d9ee817..940dfb4 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -157,14 +157,16 @@
 static inline int
 read_int(struct task_struct *task, unsigned long addr, int * data)
 {
-	int copied = access_process_vm(task, addr, data, sizeof(int), 0);
+	int copied = access_process_vm(task, addr, data, sizeof(int),
+			FOLL_FORCE);
 	return (copied == sizeof(int)) ? 0 : -EIO;
 }
 
 static inline int
 write_int(struct task_struct *task, unsigned long addr, int data)
 {
-	int copied = access_process_vm(task, addr, &data, sizeof(int), 1);
+	int copied = access_process_vm(task, addr, &data, sizeof(int),
+			FOLL_FORCE | FOLL_WRITE);
 	return (copied == sizeof(int)) ? 0 : -EIO;
 }
 
@@ -281,7 +283,8 @@
 	/* When I and D space are separate, these will need to be fixed.  */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
 	case PTRACE_PEEKDATA:
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
+		copied = access_process_vm(child, addr, &tmp, sizeof(tmp),
+				FOLL_FORCE);
 		ret = -EIO;
 		if (copied != sizeof(tmp))
 			break;
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index ecd1237..bd204bf 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -41,6 +41,8 @@
 	select PERF_USE_VMALLOC
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_GENERIC_DMA_COHERENT
+	select HAVE_KERNEL_GZIP
+	select HAVE_KERNEL_LZMA
 
 config MIGHT_HAVE_PCI
 	bool
@@ -186,14 +188,6 @@
 config ARC_HAS_COH_CACHES
 	def_bool n
 
-config ARC_MCIP
-	bool "ARConnect Multicore IP (MCIP) Support "
-	depends on ISA_ARCV2
-	help
-	  This IP block enables SMP in ARC-HS38 cores.
-	  It provides for cross-core interrupts, multi-core debug
-	  hardware semaphores, shared memory,....
-
 config NR_CPUS
 	int "Maximum number of CPUs (2-4096)"
 	range 2 4096
@@ -211,6 +205,15 @@
 
 endif	#SMP
 
+config ARC_MCIP
+	bool "ARConnect Multicore IP (MCIP) Support "
+	depends on ISA_ARCV2
+	default y if SMP
+	help
+	  This IP block enables SMP in ARC-HS38 cores.
+	  It provides for cross-core interrupts, multi-core debug
+	  hardware semaphores, shared memory,....
+
 menuconfig ARC_CACHE
 	bool "Enable Cache Support"
 	default y
@@ -537,14 +540,6 @@
 	bool "Paranoia Checks in Low Level TLB Handlers"
 	default n
 
-config ARC_DBG_TLB_MISS_COUNT
-	bool "Profile TLB Misses"
-	default n
-	select DEBUG_FS
-	help
-	  Counts number of I and D TLB Misses and exports them via Debugfs
-	  The counters can be cleared via Debugfs as well
-
 endif
 
 config ARC_UBOOT_SUPPORT
diff --git a/arch/arc/Makefile b/arch/arc/Makefile
index aa82d13..19cce22 100644
--- a/arch/arc/Makefile
+++ b/arch/arc/Makefile
@@ -71,7 +71,9 @@
 ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE
 # Generic build system uses -O2, we want -O3
 # Note: No need to add to cflags-y as that happens anyways
-ARCH_CFLAGS += -O3
+#
+# Disable the false maybe-uninitialized warings gcc spits out at -O3
+ARCH_CFLAGS += -O3 $(call cc-disable-warning,maybe-uninitialized,)
 endif
 
 # small data is default for elf32 tool-chain. If not usable, disable it
diff --git a/arch/arc/boot/Makefile b/arch/arc/boot/Makefile
index e597cb34..f94cf15 100644
--- a/arch/arc/boot/Makefile
+++ b/arch/arc/boot/Makefile
@@ -14,9 +14,15 @@
 
 suffix-y := bin
 suffix-$(CONFIG_KERNEL_GZIP)	:= gz
+suffix-$(CONFIG_KERNEL_LZMA)	:= lzma
 
-targets += uImage uImage.bin uImage.gz
-extra-y += vmlinux.bin vmlinux.bin.gz
+targets += uImage
+targets += uImage.bin
+targets += uImage.gz
+targets += uImage.lzma
+extra-y += vmlinux.bin
+extra-y += vmlinux.bin.gz
+extra-y += vmlinux.bin.lzma
 
 $(obj)/vmlinux.bin: vmlinux FORCE
 	$(call if_changed,objcopy)
@@ -24,12 +30,18 @@
 $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
 	$(call if_changed,gzip)
 
+$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,lzma)
+
 $(obj)/uImage.bin: $(obj)/vmlinux.bin FORCE
 	$(call if_changed,uimage,none)
 
 $(obj)/uImage.gz: $(obj)/vmlinux.bin.gz FORCE
 	$(call if_changed,uimage,gzip)
 
+$(obj)/uImage.lzma: $(obj)/vmlinux.bin.lzma FORCE
+	$(call if_changed,uimage,lzma)
+
 $(obj)/uImage: $(obj)/uImage.$(suffix-y)
 	@ln -sf $(notdir $<) $@
 	@echo '  Image $@ is ready'
diff --git a/arch/arc/boot/dts/axc001.dtsi b/arch/arc/boot/dts/axc001.dtsi
index 6ae2c47..53ce226 100644
--- a/arch/arc/boot/dts/axc001.dtsi
+++ b/arch/arc/boot/dts/axc001.dtsi
@@ -71,7 +71,7 @@
 			reg-io-width = <4>;
 		};
 
-		arcpmu0: pmu {
+		arcpct0: pct {
 			compatible = "snps,arc700-pct";
 		};
 	};
diff --git a/arch/arc/boot/dts/nsim_700.dts b/arch/arc/boot/dts/nsim_700.dts
index ce0ccd20..5ee96b0 100644
--- a/arch/arc/boot/dts/nsim_700.dts
+++ b/arch/arc/boot/dts/nsim_700.dts
@@ -69,7 +69,7 @@
 			};
 		};
 
-		arcpmu0: pmu {
+		arcpct0: pct {
 			compatible = "snps,arc700-pct";
 		};
 	};
diff --git a/arch/arc/boot/dts/nsimosci.dts b/arch/arc/boot/dts/nsimosci.dts
index bcf6031..3c391ba 100644
--- a/arch/arc/boot/dts/nsimosci.dts
+++ b/arch/arc/boot/dts/nsimosci.dts
@@ -83,5 +83,9 @@
 			reg = <0xf0003000 0x44>;
 			interrupts = <7>;
 		};
+
+		arcpct0: pct {
+			compatible = "snps,arc700-pct";
+		};
 	};
 };
diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig
index 7314f53..b0066a7 100644
--- a/arch/arc/configs/nsim_700_defconfig
+++ b/arch/arc/configs/nsim_700_defconfig
@@ -14,6 +14,7 @@
 CONFIG_INITRAMFS_SOURCE="../arc_initramfs/"
 CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
+CONFIG_PERF_EVENTS=y
 # CONFIG_SLUB_DEBUG is not set
 # CONFIG_COMPAT_BRK is not set
 CONFIG_KPROBES=y
diff --git a/arch/arc/configs/nsim_hs_defconfig b/arch/arc/configs/nsim_hs_defconfig
index 65ab9fb..ebe9ebb 100644
--- a/arch/arc/configs/nsim_hs_defconfig
+++ b/arch/arc/configs/nsim_hs_defconfig
@@ -14,6 +14,7 @@
 CONFIG_INITRAMFS_SOURCE="../../arc_initramfs_hs/"
 CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
+CONFIG_PERF_EVENTS=y
 # CONFIG_SLUB_DEBUG is not set
 # CONFIG_COMPAT_BRK is not set
 CONFIG_KPROBES=y
diff --git a/arch/arc/configs/nsim_hs_smp_defconfig b/arch/arc/configs/nsim_hs_smp_defconfig
index 3b3990c..4bde432 100644
--- a/arch/arc/configs/nsim_hs_smp_defconfig
+++ b/arch/arc/configs/nsim_hs_smp_defconfig
@@ -12,6 +12,7 @@
 CONFIG_INITRAMFS_SOURCE="../arc_initramfs_hs/"
 CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
+CONFIG_PERF_EVENTS=y
 # CONFIG_SLUB_DEBUG is not set
 # CONFIG_COMPAT_BRK is not set
 CONFIG_KPROBES=y
diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig
index 98cf209..f6fb3d26 100644
--- a/arch/arc/configs/nsimosci_defconfig
+++ b/arch/arc/configs/nsimosci_defconfig
@@ -14,6 +14,7 @@
 CONFIG_INITRAMFS_SOURCE="../arc_initramfs/"
 CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
+CONFIG_PERF_EVENTS=y
 # CONFIG_SLUB_DEBUG is not set
 # CONFIG_COMPAT_BRK is not set
 CONFIG_KPROBES=y
diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig
index ddf8b96..b9f0fe0 100644
--- a/arch/arc/configs/nsimosci_hs_defconfig
+++ b/arch/arc/configs/nsimosci_hs_defconfig
@@ -14,6 +14,7 @@
 CONFIG_INITRAMFS_SOURCE="../arc_initramfs_hs/"
 CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
+CONFIG_PERF_EVENTS=y
 # CONFIG_SLUB_DEBUG is not set
 # CONFIG_COMPAT_BRK is not set
 CONFIG_KPROBES=y
diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig
index ceb9074..6da71ba 100644
--- a/arch/arc/configs/nsimosci_hs_smp_defconfig
+++ b/arch/arc/configs/nsimosci_hs_smp_defconfig
@@ -10,6 +10,7 @@
 # CONFIG_PID_NS is not set
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_INITRAMFS_SOURCE="../arc_initramfs_hs/"
+CONFIG_PERF_EVENTS=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_KPROBES=y
 CONFIG_MODULES=y
@@ -34,7 +35,6 @@
 # CONFIG_INET_XFRM_MODE_TRANSPORT is not set
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
 CONFIG_DEVTMPFS=y
@@ -72,7 +72,6 @@
 # CONFIG_HWMON is not set
 CONFIG_DRM=y
 CONFIG_DRM_ARCPGU=y
-CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_LOGO=y
 # CONFIG_HID is not set
 # CONFIG_USB_SUPPORT is not set
diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h
index db25c65..1bd24ec 100644
--- a/arch/arc/include/asm/arcregs.h
+++ b/arch/arc/include/asm/arcregs.h
@@ -43,12 +43,14 @@
 #define STATUS_AE_BIT		5	/* Exception active */
 #define STATUS_DE_BIT		6	/* PC is in delay slot */
 #define STATUS_U_BIT		7	/* User/Kernel mode */
+#define STATUS_Z_BIT            11
 #define STATUS_L_BIT		12	/* Loop inhibit */
 
 /* These masks correspond to the status word(STATUS_32) bits */
 #define STATUS_AE_MASK		(1<<STATUS_AE_BIT)
 #define STATUS_DE_MASK		(1<<STATUS_DE_BIT)
 #define STATUS_U_MASK		(1<<STATUS_U_BIT)
+#define STATUS_Z_MASK		(1<<STATUS_Z_BIT)
 #define STATUS_L_MASK		(1<<STATUS_L_BIT)
 
 /*
@@ -349,10 +351,11 @@
 	struct cpuinfo_arc_bpu bpu;
 	struct bcr_identity core;
 	struct bcr_isa isa;
+	const char *details, *name;
 	unsigned int vec_base;
 	struct cpuinfo_arc_ccm iccm, dccm;
 	struct {
-		unsigned int swap:1, norm:1, minmax:1, barrel:1, crc:1, pad1:3,
+		unsigned int swap:1, norm:1, minmax:1, barrel:1, crc:1, swape:1, pad1:2,
 			     fpu_sp:1, fpu_dp:1, pad2:6,
 			     debug:1, ap:1, smart:1, rtt:1, pad3:4,
 			     timer0:1, timer1:1, rtc:1, gfrc:1, pad4:4;
diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h
index fb781e3..b3410ff 100644
--- a/arch/arc/include/asm/cache.h
+++ b/arch/arc/include/asm/cache.h
@@ -53,7 +53,7 @@
 extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len);
 extern void read_decode_cache_bcr(void);
 
-extern int ioc_exists;
+extern int ioc_enable;
 extern unsigned long perip_base, perip_end;
 
 #endif	/* !__ASSEMBLY__ */
diff --git a/arch/arc/include/asm/delay.h b/arch/arc/include/asm/delay.h
index 08e7e2a..a36e860 100644
--- a/arch/arc/include/asm/delay.h
+++ b/arch/arc/include/asm/delay.h
@@ -22,10 +22,11 @@
 static inline void __delay(unsigned long loops)
 {
 	__asm__ __volatile__(
-	"	lp  1f	\n"
-	"	nop	\n"
-	"1:		\n"
-	: "+l"(loops));
+	"	mov lp_count, %0	\n"
+	"	lp  1f			\n"
+	"	nop			\n"
+	"1:				\n"
+	: : "r"(loops));
 }
 
 extern void __bad_udelay(void);
diff --git a/arch/arc/include/asm/elf.h b/arch/arc/include/asm/elf.h
index 7096f97..aa2d6da 100644
--- a/arch/arc/include/asm/elf.h
+++ b/arch/arc/include/asm/elf.h
@@ -54,7 +54,7 @@
  * the loader.  We need to make sure that it is out of the way of the program
  * that it will "exec", and that there is sufficient room for the brk.
  */
-#define ELF_ET_DYN_BASE		(2 * TASK_SIZE / 3)
+#define ELF_ET_DYN_BASE		(2UL * TASK_SIZE / 3)
 
 /*
  * When the program starts, a1 contains a pointer to a function to be
diff --git a/arch/arc/include/asm/mcip.h b/arch/arc/include/asm/mcip.h
index 847e3bb..c8fbe41 100644
--- a/arch/arc/include/asm/mcip.h
+++ b/arch/arc/include/asm/mcip.h
@@ -55,6 +55,22 @@
 #define IDU_M_DISTRI_DEST		0x2
 };
 
+struct mcip_bcr {
+#ifdef CONFIG_CPU_BIG_ENDIAN
+		unsigned int pad3:8,
+			     idu:1, llm:1, num_cores:6,
+			     iocoh:1,  gfrc:1, dbg:1, pad2:1,
+			     msg:1, sem:1, ipi:1, pad:1,
+			     ver:8;
+#else
+		unsigned int ver:8,
+			     pad:1, ipi:1, sem:1, msg:1,
+			     pad2:1, dbg:1, gfrc:1, iocoh:1,
+			     num_cores:6, llm:1, idu:1,
+			     pad3:8;
+#endif
+};
+
 /*
  * MCIP programming model
  *
diff --git a/arch/arc/include/asm/module.h b/arch/arc/include/asm/module.h
index 518222b..6e91d8b 100644
--- a/arch/arc/include/asm/module.h
+++ b/arch/arc/include/asm/module.h
@@ -18,6 +18,7 @@
 struct mod_arch_specific {
 	void *unw_info;
 	int unw_sec_idx;
+	const char *secstr;
 };
 #endif
 
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 89eeb37..e94ca72 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -280,7 +280,7 @@
 
 #define pte_page(pte)		pfn_to_page(pte_pfn(pte))
 #define mk_pte(page, prot)	pfn_pte(page_to_pfn(page), prot)
-#define pfn_pte(pfn, prot)	__pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
+#define pfn_pte(pfn, prot)	__pte(__pfn_to_phys(pfn) | pgprot_val(prot))
 
 /* Don't use virt_to_pfn for macros below: could cause truncations for PAE40*/
 #define pte_pfn(pte)		(pte_val(pte) >> PAGE_SHIFT)
diff --git a/arch/arc/include/asm/setup.h b/arch/arc/include/asm/setup.h
index 48b37c6..cb954cd 100644
--- a/arch/arc/include/asm/setup.h
+++ b/arch/arc/include/asm/setup.h
@@ -27,11 +27,6 @@
 	const char *str;
 };
 
-struct cpuinfo_data {
-	struct id_to_str info;
-	int up_range;
-};
-
 extern int root_mountflags, end_mem;
 
 void setup_processor(void);
@@ -43,5 +38,6 @@
 #define IS_USED_RUN(v)		((v) ? "" : "(not used) ")
 #define IS_USED_CFG(cfg)	IS_USED_RUN(IS_ENABLED(cfg))
 #define IS_AVAIL2(v, s, cfg)	IS_AVAIL1(v, s), IS_AVAIL1(v, IS_USED_CFG(cfg))
+#define IS_AVAIL3(v, v2, s)	IS_AVAIL1(v, s), IS_AVAIL1(v, IS_DISABLED_RUN(v2))
 
 #endif /* __ASMARC_SETUP_H */
diff --git a/arch/arc/include/asm/smp.h b/arch/arc/include/asm/smp.h
index 89fdd1b..0861007 100644
--- a/arch/arc/include/asm/smp.h
+++ b/arch/arc/include/asm/smp.h
@@ -37,9 +37,9 @@
  * API expected BY platform smp code (FROM arch smp code)
  *
  * smp_ipi_irq_setup:
- *	Takes @cpu and @irq to which the arch-common ISR is hooked up
+ *	Takes @cpu and @hwirq to which the arch-common ISR is hooked up
  */
-extern int smp_ipi_irq_setup(int cpu, int irq);
+extern int smp_ipi_irq_setup(int cpu, irq_hw_number_t hwirq);
 
 /*
  * struct plat_smp_ops	- SMP callbacks provided by platform to ARC SMP
diff --git a/arch/arc/include/asm/syscalls.h b/arch/arc/include/asm/syscalls.h
index e56f9fc..772b67c 100644
--- a/arch/arc/include/asm/syscalls.h
+++ b/arch/arc/include/asm/syscalls.h
@@ -17,6 +17,7 @@
 int sys_cacheflush(uint32_t, uint32_t uint32_t);
 int sys_arc_settls(void *);
 int sys_arc_gettls(void);
+int sys_arc_usr_cmpxchg(int *, int, int);
 
 #include <asm-generic/syscalls.h>
 
diff --git a/arch/arc/include/uapi/asm/unistd.h b/arch/arc/include/uapi/asm/unistd.h
index 41fa2ec..9a34136 100644
--- a/arch/arc/include/uapi/asm/unistd.h
+++ b/arch/arc/include/uapi/asm/unistd.h
@@ -27,18 +27,19 @@
 
 #define NR_syscalls	__NR_syscalls
 
+/* Generic syscall (fs/filesystems.c - lost in asm-generic/unistd.h */
+#define __NR_sysfs		(__NR_arch_specific_syscall + 3)
+
 /* ARC specific syscall */
 #define __NR_cacheflush		(__NR_arch_specific_syscall + 0)
 #define __NR_arc_settls		(__NR_arch_specific_syscall + 1)
 #define __NR_arc_gettls		(__NR_arch_specific_syscall + 2)
+#define __NR_arc_usr_cmpxchg	(__NR_arch_specific_syscall + 4)
 
 __SYSCALL(__NR_cacheflush, sys_cacheflush)
 __SYSCALL(__NR_arc_settls, sys_arc_settls)
 __SYSCALL(__NR_arc_gettls, sys_arc_gettls)
-
-
-/* Generic syscall (fs/filesystems.c - lost in asm-generic/unistd.h */
-#define __NR_sysfs		(__NR_arch_specific_syscall + 3)
+__SYSCALL(__NR_arc_usr_cmpxchg, sys_arc_usr_cmpxchg)
 __SYSCALL(__NR_sysfs, sys_sysfs)
 
 #undef __SYSCALL
diff --git a/arch/arc/kernel/devtree.c b/arch/arc/kernel/devtree.c
index f1e07c2..3b67f53 100644
--- a/arch/arc/kernel/devtree.c
+++ b/arch/arc/kernel/devtree.c
@@ -31,6 +31,8 @@
 		arc_base_baud = 166666666;	/* Fixed 166.6MHz clk (TB10x) */
 	else if (of_flat_dt_is_compatible(dt_root, "snps,arc-sdp"))
 		arc_base_baud = 33333333;	/* Fixed 33MHz clk (AXS10x) */
+	else if (of_flat_dt_is_compatible(dt_root, "ezchip,arc-nps"))
+		arc_base_baud = 800000000;      /* Fixed 800MHz clk (NPS) */
 	else
 		arc_base_baud = 50000000;	/* Fixed default 50MHz */
 }
diff --git a/arch/arc/kernel/mcip.c b/arch/arc/kernel/mcip.c
index 72f9179..f39142a 100644
--- a/arch/arc/kernel/mcip.c
+++ b/arch/arc/kernel/mcip.c
@@ -15,11 +15,12 @@
 #include <asm/mcip.h>
 #include <asm/setup.h>
 
-static char smp_cpuinfo_buf[128];
-static int idu_detected;
-
 static DEFINE_RAW_SPINLOCK(mcip_lock);
 
+#ifdef CONFIG_SMP
+
+static char smp_cpuinfo_buf[128];
+
 static void mcip_setup_per_cpu(int cpu)
 {
 	smp_ipi_irq_setup(cpu, IPI_IRQ);
@@ -86,21 +87,7 @@
 
 static void mcip_probe_n_setup(void)
 {
-	struct mcip_bcr {
-#ifdef CONFIG_CPU_BIG_ENDIAN
-		unsigned int pad3:8,
-			     idu:1, llm:1, num_cores:6,
-			     iocoh:1,  gfrc:1, dbg:1, pad2:1,
-			     msg:1, sem:1, ipi:1, pad:1,
-			     ver:8;
-#else
-		unsigned int ver:8,
-			     pad:1, ipi:1, sem:1, msg:1,
-			     pad2:1, dbg:1, gfrc:1, iocoh:1,
-			     num_cores:6, llm:1, idu:1,
-			     pad3:8;
-#endif
-	} mp;
+	struct mcip_bcr mp;
 
 	READ_BCR(ARC_REG_MCIP_BCR, mp);
 
@@ -114,7 +101,6 @@
 		IS_AVAIL1(mp.gfrc, "GFRC"));
 
 	cpuinfo_arc700[0].extn.gfrc = mp.gfrc;
-	idu_detected = mp.idu;
 
 	if (mp.dbg) {
 		__mcip_cmd_data(CMD_DEBUG_SET_SELECT, 0, 0xf);
@@ -130,6 +116,8 @@
 	.ipi_clear	= mcip_ipi_clear,
 };
 
+#endif
+
 /***************************************************************************
  * ARCv2 Interrupt Distribution Unit (IDU)
  *
@@ -193,6 +181,8 @@
 {
 	unsigned long flags;
 	cpumask_t online;
+	unsigned int destination_bits;
+	unsigned int distribution_mode;
 
 	/* errout if no online cpu per @cpumask */
 	if (!cpumask_and(&online, cpumask, cpu_online_mask))
@@ -200,8 +190,15 @@
 
 	raw_spin_lock_irqsave(&mcip_lock, flags);
 
-	idu_set_dest(data->hwirq, cpumask_bits(&online)[0]);
-	idu_set_mode(data->hwirq, IDU_M_TRIG_LEVEL, IDU_M_DISTRI_RR);
+	destination_bits = cpumask_bits(&online)[0];
+	idu_set_dest(data->hwirq, destination_bits);
+
+	if (ffs(destination_bits) == fls(destination_bits))
+		distribution_mode = IDU_M_DISTRI_DEST;
+	else
+		distribution_mode = IDU_M_DISTRI_RR;
+
+	idu_set_mode(data->hwirq, IDU_M_TRIG_LEVEL, distribution_mode);
 
 	raw_spin_unlock_irqrestore(&mcip_lock, flags);
 
@@ -219,16 +216,15 @@
 
 };
 
-static int idu_first_irq;
+static irq_hw_number_t idu_first_hwirq;
 
 static void idu_cascade_isr(struct irq_desc *desc)
 {
-	struct irq_domain *domain = irq_desc_get_handler_data(desc);
-	unsigned int core_irq = irq_desc_get_irq(desc);
-	unsigned int idu_irq;
+	struct irq_domain *idu_domain = irq_desc_get_handler_data(desc);
+	irq_hw_number_t core_hwirq = irqd_to_hwirq(irq_desc_get_irq_data(desc));
+	irq_hw_number_t idu_hwirq = core_hwirq - idu_first_hwirq;
 
-	idu_irq = core_irq - idu_first_irq;
-	generic_handle_irq(irq_find_mapping(domain, idu_irq));
+	generic_handle_irq(irq_find_mapping(idu_domain, idu_hwirq));
 }
 
 static int idu_irq_map(struct irq_domain *d, unsigned int virq, irq_hw_number_t hwirq)
@@ -294,9 +290,12 @@
 	struct irq_domain *domain;
 	/* Read IDU BCR to confirm nr_irqs */
 	int nr_irqs = of_irq_count(intc);
-	int i, irq;
+	int i, virq;
+	struct mcip_bcr mp;
 
-	if (!idu_detected)
+	READ_BCR(ARC_REG_MCIP_BCR, mp);
+
+	if (!mp.idu)
 		panic("IDU not detected, but DeviceTree using it");
 
 	pr_info("MCIP: IDU referenced from Devicetree %d irqs\n", nr_irqs);
@@ -312,11 +311,11 @@
 		 * however we need it to get the parent virq and set IDU handler
 		 * as first level isr
 		 */
-		irq = irq_of_parse_and_map(intc, i);
+		virq = irq_of_parse_and_map(intc, i);
 		if (!i)
-			idu_first_irq = irq;
+			idu_first_hwirq = irqd_to_hwirq(irq_get_irq_data(virq));
 
-		irq_set_chained_handler_and_data(irq, idu_cascade_isr, domain);
+		irq_set_chained_handler_and_data(virq, idu_cascade_isr, domain);
 	}
 
 	__mcip_cmd(CMD_IDU_ENABLE, 0);
diff --git a/arch/arc/kernel/module.c b/arch/arc/kernel/module.c
index 9a28497..42e964d 100644
--- a/arch/arc/kernel/module.c
+++ b/arch/arc/kernel/module.c
@@ -30,17 +30,9 @@
 			      char *secstr, struct module *mod)
 {
 #ifdef CONFIG_ARC_DW2_UNWIND
-	int i;
-
 	mod->arch.unw_sec_idx = 0;
 	mod->arch.unw_info = NULL;
-
-	for (i = 1; i < hdr->e_shnum; i++) {
-		if (strcmp(secstr+sechdrs[i].sh_name, ".eh_frame") == 0) {
-			mod->arch.unw_sec_idx = i;
-			break;
-		}
-	}
+	mod->arch.secstr = secstr;
 #endif
 	return 0;
 }
@@ -59,29 +51,33 @@
 		       unsigned int relsec,	/* sec index for relo sec */
 		       struct module *module)
 {
-	int i, n;
+	int i, n, relo_type;
 	Elf32_Rela *rel_entry = (void *)sechdrs[relsec].sh_addr;
 	Elf32_Sym *sym_entry, *sym_sec;
-	Elf32_Addr relocation;
-	Elf32_Addr location;
-	Elf32_Addr sec_to_patch;
-	int relo_type;
+	Elf32_Addr relocation, location, tgt_addr;
+	unsigned int tgtsec;
 
-	sec_to_patch = sechdrs[sechdrs[relsec].sh_info].sh_addr;
+	/*
+	 * @relsec has relocations e.g. .rela.init.text
+	 * @tgtsec is section to patch e.g. .init.text
+	 */
+	tgtsec = sechdrs[relsec].sh_info;
+	tgt_addr = sechdrs[tgtsec].sh_addr;
 	sym_sec = (Elf32_Sym *) sechdrs[symindex].sh_addr;
 	n = sechdrs[relsec].sh_size / sizeof(*rel_entry);
 
-	pr_debug("\n========== Module Sym reloc ===========================\n");
-	pr_debug("Section to fixup %x\n", sec_to_patch);
+	pr_debug("\nSection to fixup %s @%x\n",
+		 module->arch.secstr + sechdrs[tgtsec].sh_name, tgt_addr);
 	pr_debug("=========================================================\n");
-	pr_debug("rela->r_off | rela->addend | sym->st_value | ADDR | VALUE\n");
+	pr_debug("r_off\tr_add\tst_value ADDRESS  VALUE\n");
 	pr_debug("=========================================================\n");
 
 	/* Loop thru entries in relocation section */
 	for (i = 0; i < n; i++) {
+		const char *s;
 
 		/* This is where to make the change */
-		location = sec_to_patch + rel_entry[i].r_offset;
+		location = tgt_addr + rel_entry[i].r_offset;
 
 		/* This is the symbol it is referring to.  Note that all
 		   undefined symbols have been resolved.  */
@@ -89,10 +85,15 @@
 
 		relocation = sym_entry->st_value + rel_entry[i].r_addend;
 
-		pr_debug("\t%x\t\t%x\t\t%x  %x %x [%s]\n",
-			rel_entry[i].r_offset, rel_entry[i].r_addend,
-			sym_entry->st_value, location, relocation,
-			strtab + sym_entry->st_name);
+		if (sym_entry->st_name == 0 && ELF_ST_TYPE (sym_entry->st_info) == STT_SECTION) {
+			s = module->arch.secstr + sechdrs[sym_entry->st_shndx].sh_name;
+		} else {
+			s = strtab + sym_entry->st_name;
+		}
+
+		pr_debug("   %x\t%x\t%x %x %x [%s]\n",
+			 rel_entry[i].r_offset, rel_entry[i].r_addend,
+			 sym_entry->st_value, location, relocation, s);
 
 		/* This assumes modules are built with -mlong-calls
 		 * so any branches/jumps are absolute 32 bit jmps
@@ -111,6 +112,10 @@
 			goto relo_err;
 
 	}
+
+	if (strcmp(module->arch.secstr+sechdrs[tgtsec].sh_name, ".eh_frame") == 0)
+		module->arch.unw_sec_idx = tgtsec;
+
 	return 0;
 
 relo_err:
diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index be1972b..a41a79a 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -41,6 +41,41 @@
 	return task_thread_info(current)->thr_ptr;
 }
 
+SYSCALL_DEFINE3(arc_usr_cmpxchg, int *, uaddr, int, expected, int, new)
+{
+	struct pt_regs *regs = current_pt_regs();
+	int uval = -EFAULT;
+
+	/*
+	 * This is only for old cores lacking LLOCK/SCOND, which by defintion
+	 * can't possibly be SMP. Thus doesn't need to be SMP safe.
+	 * And this also helps reduce the overhead for serializing in
+	 * the UP case
+	 */
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_SMP));
+
+	/* Z indicates to userspace if operation succeded */
+	regs->status32 &= ~STATUS_Z_MASK;
+
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
+		return -EFAULT;
+
+	preempt_disable();
+
+	if (__get_user(uval, uaddr))
+		goto done;
+
+	if (uval == expected) {
+		if (!__put_user(new, uaddr))
+			regs->status32 |= STATUS_Z_MASK;
+	}
+
+done:
+	preempt_enable();
+
+	return uval;
+}
+
 void arch_cpu_idle(void)
 {
 	/* sleep, but enable all interrupts before committing */
diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c
index 3df7f9c..0385df7 100644
--- a/arch/arc/kernel/setup.c
+++ b/arch/arc/kernel/setup.c
@@ -40,6 +40,29 @@
 
 struct cpuinfo_arc cpuinfo_arc700[NR_CPUS];
 
+static const struct id_to_str arc_cpu_rel[] = {
+#ifdef CONFIG_ISA_ARCOMPACT
+	{ 0x34, "R4.10"},
+	{ 0x35, "R4.11"},
+#else
+	{ 0x51, "R2.0" },
+	{ 0x52, "R2.1" },
+	{ 0x53, "R3.0" },
+#endif
+	{ 0x00, NULL   }
+};
+
+static const struct id_to_str arc_cpu_nm[] = {
+#ifdef CONFIG_ISA_ARCOMPACT
+	{ 0x20, "ARC 600"   },
+	{ 0x30, "ARC 770"   },  /* 750 identified seperately */
+#else
+	{ 0x40, "ARC EM"  },
+	{ 0x50, "ARC HS38"  },
+#endif
+	{ 0x00, "Unknown"   }
+};
+
 static void read_decode_ccm_bcr(struct cpuinfo_arc *cpu)
 {
 	if (is_isa_arcompact()) {
@@ -92,11 +115,26 @@
 	struct bcr_timer timer;
 	struct bcr_generic bcr;
 	struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()];
+	const struct id_to_str *tbl;
+
 	FIX_PTR(cpu);
 
 	READ_BCR(AUX_IDENTITY, cpu->core);
 	READ_BCR(ARC_REG_ISA_CFG_BCR, cpu->isa);
 
+	for (tbl = &arc_cpu_rel[0]; tbl->id != 0; tbl++) {
+		if (cpu->core.family == tbl->id) {
+			cpu->details = tbl->str;
+			break;
+		}
+	}
+
+	for (tbl = &arc_cpu_nm[0]; tbl->id != 0; tbl++) {
+		if ((cpu->core.family & 0xF0) == tbl->id)
+			break;
+	}
+	cpu->name = tbl->str;
+
 	READ_BCR(ARC_REG_TIMERS_BCR, timer);
 	cpu->extn.timer0 = timer.t0;
 	cpu->extn.timer1 = timer.t1;
@@ -111,6 +149,9 @@
 	cpu->extn.swap = read_aux_reg(ARC_REG_SWAP_BCR) ? 1 : 0;        /* 1,3 */
 	cpu->extn.crc = read_aux_reg(ARC_REG_CRC_BCR) ? 1 : 0;
 	cpu->extn.minmax = read_aux_reg(ARC_REG_MIXMAX_BCR) > 1 ? 1 : 0; /* 2 */
+	cpu->extn.swape = (cpu->core.family >= 0x34) ? 1 :
+				IS_ENABLED(CONFIG_ARC_HAS_SWAPE);
+
 	READ_BCR(ARC_REG_XY_MEM_BCR, cpu->extn_xymem);
 
 	/* Read CCM BCRs for boot reporting even if not enabled in Kconfig */
@@ -160,64 +201,38 @@
 	cpu->extn.rtt = bcr.ver ? 1 : 0;
 
 	cpu->extn.debug = cpu->extn.ap | cpu->extn.smart | cpu->extn.rtt;
+
+	/* some hacks for lack of feature BCR info in old ARC700 cores */
+	if (is_isa_arcompact()) {
+		if (!cpu->isa.ver)	/* ISA BCR absent, use Kconfig info */
+			cpu->isa.atomic = IS_ENABLED(CONFIG_ARC_HAS_LLSC);
+		else
+			cpu->isa.atomic = cpu->isa.atomic1;
+
+		cpu->isa.be = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN);
+
+		 /* there's no direct way to distinguish 750 vs. 770 */
+		if (unlikely(cpu->core.family < 0x34 || cpu->mmu.ver < 3))
+			cpu->name = "ARC750";
+	}
 }
 
-static const struct cpuinfo_data arc_cpu_tbl[] = {
-#ifdef CONFIG_ISA_ARCOMPACT
-	{ {0x20, "ARC 600"      }, 0x2F},
-	{ {0x30, "ARC 700"      }, 0x33},
-	{ {0x34, "ARC 700 R4.10"}, 0x34},
-	{ {0x35, "ARC 700 R4.11"}, 0x35},
-#else
-	{ {0x50, "ARC HS38 R2.0"}, 0x51},
-	{ {0x52, "ARC HS38 R2.1"}, 0x52},
-	{ {0x53, "ARC HS38 R3.0"}, 0x53},
-#endif
-	{ {0x00, NULL		} }
-};
-
-
 static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len)
 {
 	struct cpuinfo_arc *cpu = &cpuinfo_arc700[cpu_id];
 	struct bcr_identity *core = &cpu->core;
-	const struct cpuinfo_data *tbl;
-	char *isa_nm;
-	int i, be, atomic;
-	int n = 0;
+	int i, n = 0;
 
 	FIX_PTR(cpu);
 
-	if (is_isa_arcompact()) {
-		isa_nm = "ARCompact";
-		be = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN);
-
-		atomic = cpu->isa.atomic1;
-		if (!cpu->isa.ver)	/* ISA BCR absent, use Kconfig info */
-			atomic = IS_ENABLED(CONFIG_ARC_HAS_LLSC);
-	} else {
-		isa_nm = "ARCv2";
-		be = cpu->isa.be;
-		atomic = cpu->isa.atomic;
-	}
-
 	n += scnprintf(buf + n, len - n,
 		       "\nIDENTITY\t: ARCVER [%#02x] ARCNUM [%#02x] CHIPID [%#4x]\n",
 		       core->family, core->cpu_id, core->chip_id);
 
-	for (tbl = &arc_cpu_tbl[0]; tbl->info.id != 0; tbl++) {
-		if ((core->family >= tbl->info.id) &&
-		    (core->family <= tbl->up_range)) {
-			n += scnprintf(buf + n, len - n,
-				       "processor [%d]\t: %s (%s ISA) %s\n",
-				       cpu_id, tbl->info.str, isa_nm,
-				       IS_AVAIL1(be, "[Big-Endian]"));
-			break;
-		}
-	}
-
-	if (tbl->info.id == 0)
-		n += scnprintf(buf + n, len - n, "UNKNOWN ARC Processor\n");
+	n += scnprintf(buf + n, len - n, "processor [%d]\t: %s %s (%s ISA) %s\n",
+		       cpu_id, cpu->name, cpu->details,
+		       is_isa_arcompact() ? "ARCompact" : "ARCv2",
+		       IS_AVAIL1(cpu->isa.be, "[Big-Endian]"));
 
 	n += scnprintf(buf + n, len - n, "Timers\t\t: %s%s%s%s\nISA Extn\t: ",
 		       IS_AVAIL1(cpu->extn.timer0, "Timer0 "),
@@ -226,7 +241,7 @@
 				 CONFIG_ARC_HAS_RTC));
 
 	n += i = scnprintf(buf + n, len - n, "%s%s%s%s%s",
-			   IS_AVAIL2(atomic, "atomic ", CONFIG_ARC_HAS_LLSC),
+			   IS_AVAIL2(cpu->isa.atomic, "atomic ", CONFIG_ARC_HAS_LLSC),
 			   IS_AVAIL2(cpu->isa.ldd, "ll64 ", CONFIG_ARC_HAS_LL64),
 			   IS_AVAIL1(cpu->isa.unalign, "unalign (not used)"));
 
@@ -253,7 +268,7 @@
 		       IS_AVAIL1(cpu->extn.swap, "swap "),
 		       IS_AVAIL1(cpu->extn.minmax, "minmax "),
 		       IS_AVAIL1(cpu->extn.crc, "crc "),
-		       IS_AVAIL2(1, "swape", CONFIG_ARC_HAS_SWAPE));
+		       IS_AVAIL2(cpu->extn.swape, "swape", CONFIG_ARC_HAS_SWAPE));
 
 	if (cpu->bpu.ver)
 		n += scnprintf(buf + n, len - n,
@@ -272,9 +287,7 @@
 
 	FIX_PTR(cpu);
 
-	n += scnprintf(buf + n, len - n,
-		       "Vector Table\t: %#x\nPeripherals\t: %#lx:%#lx\n",
-		       cpu->vec_base, perip_base, perip_end);
+	n += scnprintf(buf + n, len - n, "Vector Table\t: %#x\n", cpu->vec_base);
 
 	if (cpu->extn.fpu_sp || cpu->extn.fpu_dp)
 		n += scnprintf(buf + n, len - n, "FPU\t\t: %s%s\n",
@@ -507,7 +520,7 @@
 	 * way to pass it w/o having to kmalloc/free a 2 byte string.
 	 * Encode cpu-id as 0xFFcccc, which is decoded by show routine.
 	 */
-	return *pos < num_possible_cpus() ? cpu_to_ptr(*pos) : NULL;
+	return *pos < nr_cpu_ids ? cpu_to_ptr(*pos) : NULL;
 }
 
 static void *c_next(struct seq_file *m, void *v, loff_t *pos)
diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index f183cc6..88674d9 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -22,6 +22,7 @@
 #include <linux/atomic.h>
 #include <linux/cpumask.h>
 #include <linux/reboot.h>
+#include <linux/irqdomain.h>
 #include <asm/processor.h>
 #include <asm/setup.h>
 #include <asm/mach_desc.h>
@@ -67,11 +68,13 @@
 	int i;
 
 	/*
-	 * Initialise the present map, which describes the set of CPUs
-	 * actually populated at the present time.
+	 * if platform didn't set the present map already, do it now
+	 * boot cpu is set to present already by init/main.c
 	 */
-	for (i = 0; i < max_cpus; i++)
-		set_cpu_present(i, true);
+	if (num_present_cpus() <= 1) {
+		for (i = 0; i < max_cpus; i++)
+			set_cpu_present(i, true);
+	}
 }
 
 void __init smp_cpus_done(unsigned int max_cpus)
@@ -351,20 +354,24 @@
  */
 static DEFINE_PER_CPU(int, ipi_dev);
 
-int smp_ipi_irq_setup(int cpu, int irq)
+int smp_ipi_irq_setup(int cpu, irq_hw_number_t hwirq)
 {
 	int *dev = per_cpu_ptr(&ipi_dev, cpu);
+	unsigned int virq = irq_find_mapping(NULL, hwirq);
+
+	if (!virq)
+		panic("Cannot find virq for root domain and hwirq=%lu", hwirq);
 
 	/* Boot cpu calls request, all call enable */
 	if (!cpu) {
 		int rc;
 
-		rc = request_percpu_irq(irq, do_IPI, "IPI Interrupt", dev);
+		rc = request_percpu_irq(virq, do_IPI, "IPI Interrupt", dev);
 		if (rc)
-			panic("Percpu IRQ request failed for %d\n", irq);
+			panic("Percpu IRQ request failed for %u\n", virq);
 	}
 
-	enable_percpu_irq(irq, 0);
+	enable_percpu_irq(virq, 0);
 
 	return 0;
 }
diff --git a/arch/arc/kernel/time.c b/arch/arc/kernel/time.c
index f927b8d..c10390d 100644
--- a/arch/arc/kernel/time.c
+++ b/arch/arc/kernel/time.c
@@ -152,14 +152,17 @@
 		cycle_t  full;
 	} stamp;
 
-
-	__asm__ __volatile(
-	"1:						\n"
-	"	lr		%0, [AUX_RTC_LOW]	\n"
-	"	lr		%1, [AUX_RTC_HIGH]	\n"
-	"	lr		%2, [AUX_RTC_CTRL]	\n"
-	"	bbit0.nt	%2, 31, 1b		\n"
-	: "=r" (stamp.low), "=r" (stamp.high), "=r" (status));
+	/*
+	 * hardware has an internal state machine which tracks readout of
+	 * low/high and updates the CTRL.status if
+	 *  - interrupt/exception taken between the two reads
+	 *  - high increments after low has been read
+	 */
+	do {
+		stamp.low = read_aux_reg(AUX_RTC_LOW);
+		stamp.high = read_aux_reg(AUX_RTC_HIGH);
+		status = read_aux_reg(AUX_RTC_CTRL);
+	} while (!(status & _BITUL(31)));
 
 	return stamp.full;
 }
diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index 934150e..82f9bc8 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -237,113 +237,3 @@
 	if (!user_mode(regs))
 		show_stacktrace(current, regs);
 }
-
-#ifdef CONFIG_DEBUG_FS
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/namei.h>
-#include <linux/debugfs.h>
-
-static struct dentry *test_dentry;
-static struct dentry *test_dir;
-static struct dentry *test_u32_dentry;
-
-static u32 clr_on_read = 1;
-
-#ifdef CONFIG_ARC_DBG_TLB_MISS_COUNT
-u32 numitlb, numdtlb, num_pte_not_present;
-
-static int fill_display_data(char *kbuf)
-{
-	size_t num = 0;
-	num += sprintf(kbuf + num, "I-TLB Miss %x\n", numitlb);
-	num += sprintf(kbuf + num, "D-TLB Miss %x\n", numdtlb);
-	num += sprintf(kbuf + num, "PTE not present %x\n", num_pte_not_present);
-
-	if (clr_on_read)
-		numitlb = numdtlb = num_pte_not_present = 0;
-
-	return num;
-}
-
-static int tlb_stats_open(struct inode *inode, struct file *file)
-{
-	file->private_data = (void *)__get_free_page(GFP_KERNEL);
-	return 0;
-}
-
-/* called on user read(): display the counters */
-static ssize_t tlb_stats_output(struct file *file,	/* file descriptor */
-				char __user *user_buf,	/* user buffer */
-				size_t len,		/* length of buffer */
-				loff_t *offset)		/* offset in the file */
-{
-	size_t num;
-	char *kbuf = (char *)file->private_data;
-
-	/* All of the data can he shoved in one iteration */
-	if (*offset != 0)
-		return 0;
-
-	num = fill_display_data(kbuf);
-
-	/* simple_read_from_buffer() is helper for copy to user space
-	   It copies up to @2 (num) bytes from kernel buffer @4 (kbuf) at offset
-	   @3 (offset) into the user space address starting at @1 (user_buf).
-	   @5 (len) is max size of user buffer
-	 */
-	return simple_read_from_buffer(user_buf, num, offset, kbuf, len);
-}
-
-/* called on user write : clears the counters */
-static ssize_t tlb_stats_clear(struct file *file, const char __user *user_buf,
-			       size_t length, loff_t *offset)
-{
-	numitlb = numdtlb = num_pte_not_present = 0;
-	return length;
-}
-
-static int tlb_stats_close(struct inode *inode, struct file *file)
-{
-	free_page((unsigned long)(file->private_data));
-	return 0;
-}
-
-static const struct file_operations tlb_stats_file_ops = {
-	.read = tlb_stats_output,
-	.write = tlb_stats_clear,
-	.open = tlb_stats_open,
-	.release = tlb_stats_close
-};
-#endif
-
-static int __init arc_debugfs_init(void)
-{
-	test_dir = debugfs_create_dir("arc", NULL);
-
-#ifdef CONFIG_ARC_DBG_TLB_MISS_COUNT
-	test_dentry = debugfs_create_file("tlb_stats", 0444, test_dir, NULL,
-					  &tlb_stats_file_ops);
-#endif
-
-	test_u32_dentry =
-	    debugfs_create_u32("clr_on_read", 0444, test_dir, &clr_on_read);
-
-	return 0;
-}
-
-module_init(arc_debugfs_init);
-
-static void __exit arc_debugfs_exit(void)
-{
-	debugfs_remove(test_u32_dentry);
-	debugfs_remove(test_dentry);
-	debugfs_remove(test_dir);
-}
-module_exit(arc_debugfs_exit);
-
-#endif
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index 97dddbe..50d7169 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -22,8 +22,8 @@
 #include <asm/setup.h>
 
 static int l2_line_sz;
-int ioc_exists;
-volatile int slc_enable = 1, ioc_enable = 1;
+static int ioc_exists;
+int slc_enable = 1, ioc_enable = 0;
 unsigned long perip_base = ARC_UNCACHED_ADDR_SPACE; /* legacy value for boot */
 unsigned long perip_end = 0xFFFFFFFF; /* legacy value */
 
@@ -53,18 +53,15 @@
 	PR_CACHE(&cpuinfo_arc700[c].icache, CONFIG_ARC_HAS_ICACHE, "I-Cache");
 	PR_CACHE(&cpuinfo_arc700[c].dcache, CONFIG_ARC_HAS_DCACHE, "D-Cache");
 
-	if (!is_isa_arcv2())
-                return buf;
-
 	p = &cpuinfo_arc700[c].slc;
 	if (p->ver)
 		n += scnprintf(buf + n, len - n,
 			       "SLC\t\t: %uK, %uB Line%s\n",
 			       p->sz_k, p->line_len, IS_USED_RUN(slc_enable));
 
-	if (ioc_exists)
-		n += scnprintf(buf + n, len - n, "IOC\t\t:%s\n",
-				IS_DISABLED_RUN(ioc_enable));
+	n += scnprintf(buf + n, len - n, "Peripherals\t: %#lx%s%s\n",
+		       perip_base,
+		       IS_AVAIL3(ioc_exists, ioc_enable, ", IO-Coherency "));
 
 	return buf;
 }
@@ -113,8 +110,10 @@
 	}
 
 	READ_BCR(ARC_REG_CLUSTER_BCR, cbcr);
-	if (cbcr.c && ioc_enable)
+	if (cbcr.c)
 		ioc_exists = 1;
+	else
+		ioc_enable = 0;
 
 	/* HS 2.0 didn't have AUX_VOL */
 	if (cpuinfo_arc700[cpu].core.family > 0x51) {
@@ -1002,7 +1001,7 @@
 			read_aux_reg(ARC_REG_SLC_CTRL) | SLC_CTRL_DISABLE);
 	}
 
-	if (is_isa_arcv2() && ioc_exists) {
+	if (is_isa_arcv2() && ioc_enable) {
 		/* IO coherency base - 0x8z */
 		write_aux_reg(ARC_REG_IO_COH_AP0_BASE, 0x80000);
 		/* IO coherency aperture size - 512Mb: 0x8z-0xAz */
diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index 20afc65..cd8aad8 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -45,7 +45,7 @@
 	 *   -For coherent data, Read/Write to buffers terminate early in cache
 	 *   (vs. always going to memory - thus are faster)
 	 */
-	if ((is_isa_arcv2() && ioc_exists) ||
+	if ((is_isa_arcv2() && ioc_enable) ||
 	    (attrs & DMA_ATTR_NON_CONSISTENT))
 		need_coh = 0;
 
@@ -97,7 +97,7 @@
 	int is_non_coh = 1;
 
 	is_non_coh = (attrs & DMA_ATTR_NON_CONSISTENT) ||
-			(is_isa_arcv2() && ioc_exists);
+			(is_isa_arcv2() && ioc_enable);
 
 	if (PageHighMem(page) || !is_non_coh)
 		iounmap((void __force __iomem *)vaddr);
@@ -105,6 +105,31 @@
 	__free_pages(page, get_order(size));
 }
 
+static int arc_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+			void *cpu_addr, dma_addr_t dma_addr, size_t size,
+			unsigned long attrs)
+{
+	unsigned long user_count = vma_pages(vma);
+	unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	unsigned long pfn = __phys_to_pfn(plat_dma_to_phys(dev, dma_addr));
+	unsigned long off = vma->vm_pgoff;
+	int ret = -ENXIO;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
+		return ret;
+
+	if (off < count && user_count <= (count - off)) {
+		ret = remap_pfn_range(vma, vma->vm_start,
+				      pfn + off,
+				      user_count << PAGE_SHIFT,
+				      vma->vm_page_prot);
+	}
+
+	return ret;
+}
+
 /*
  * streaming DMA Mapping API...
  * CPU accesses page via normal paddr, thus needs to explicitly made
@@ -193,6 +218,7 @@
 struct dma_map_ops arc_dma_ops = {
 	.alloc			= arc_dma_alloc,
 	.free			= arc_dma_free,
+	.mmap			= arc_dma_mmap,
 	.map_page		= arc_dma_map_page,
 	.map_sg			= arc_dma_map_sg,
 	.sync_single_for_device	= arc_dma_sync_single_for_device,
diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c
index ec868a9..bdb295e 100644
--- a/arch/arc/mm/tlb.c
+++ b/arch/arc/mm/tlb.c
@@ -793,16 +793,16 @@
 	char super_pg[64] = "";
 
 	if (p_mmu->s_pg_sz_m)
-		scnprintf(super_pg, 64, "%dM Super Page%s, ",
+		scnprintf(super_pg, 64, "%dM Super Page %s",
 			  p_mmu->s_pg_sz_m,
 			  IS_USED_CFG(CONFIG_TRANSPARENT_HUGEPAGE));
 
 	n += scnprintf(buf + n, len - n,
-		      "MMU [v%x]\t: %dk PAGE, %sJTLB %d (%dx%d), uDTLB %d, uITLB %d %s%s\n",
+		      "MMU [v%x]\t: %dk PAGE, %sJTLB %d (%dx%d), uDTLB %d, uITLB %d%s%s\n",
 		       p_mmu->ver, p_mmu->pg_sz_k, super_pg,
 		       p_mmu->sets * p_mmu->ways, p_mmu->sets, p_mmu->ways,
 		       p_mmu->u_dtlb, p_mmu->u_itlb,
-		       IS_AVAIL2(p_mmu->pae, "PAE40 ", CONFIG_ARC_HAS_PAE40));
+		       IS_AVAIL2(p_mmu->pae, ", PAE40 ", CONFIG_ARC_HAS_PAE40));
 
 	return buf;
 }
diff --git a/arch/arc/mm/tlbex.S b/arch/arc/mm/tlbex.S
index f1967ee..b30e4e3 100644
--- a/arch/arc/mm/tlbex.S
+++ b/arch/arc/mm/tlbex.S
@@ -237,15 +237,6 @@
 
 2:
 
-#ifdef CONFIG_ARC_DBG_TLB_MISS_COUNT
-	and.f 0, r0, _PAGE_PRESENT
-	bz   1f
-	ld   r3, [num_pte_not_present]
-	add  r3, r3, 1
-	st   r3, [num_pte_not_present]
-1:
-#endif
-
 .endm
 
 ;-----------------------------------------------------------------
@@ -309,12 +300,6 @@
 
 	TLBMISS_FREEUP_REGS
 
-#ifdef CONFIG_ARC_DBG_TLB_MISS_COUNT
-	ld  r0, [@numitlb]
-	add r0, r0, 1
-	st  r0, [@numitlb]
-#endif
-
 	;----------------------------------------------------------------
 	; Get the PTE corresponding to V-addr accessed, r2 is setup with EFA
 	LOAD_FAULT_PTE
@@ -349,12 +334,6 @@
 
 	TLBMISS_FREEUP_REGS
 
-#ifdef CONFIG_ARC_DBG_TLB_MISS_COUNT
-	ld  r0, [@numdtlb]
-	add r0, r0, 1
-	st  r0, [@numdtlb]
-#endif
-
 	;----------------------------------------------------------------
 	; Get the PTE corresponding to V-addr accessed
 	; If PTE exists, it will setup, r0 = PTE, r1 = Ptr to PTE, r2 = EFA
diff --git a/arch/arc/plat-eznps/smp.c b/arch/arc/plat-eznps/smp.c
index 5e901f8..56a4c85 100644
--- a/arch/arc/plat-eznps/smp.c
+++ b/arch/arc/plat-eznps/smp.c
@@ -140,16 +140,10 @@
 	mtm_enable_core(cpu);
 }
 
-static void eznps_ipi_clear(int irq)
-{
-	write_aux_reg(CTOP_AUX_IACK, 1 << irq);
-}
-
 struct plat_smp_ops plat_smp_ops = {
 	.info		= smp_cpuinfo_buf,
 	.init_early_smp	= eznps_init_cpumasks,
 	.cpu_kick	= eznps_smp_wakeup_cpu,
 	.ipi_send	= eznps_ipi_send,
 	.init_per_cpu	= eznps_init_per_cpu,
-	.ipi_clear	= eznps_ipi_clear,
 };
diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index befcd26..c558ba7 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -745,7 +745,6 @@
 	sun4i-a10-pcduino2.dtb \
 	sun4i-a10-pov-protab2-ips9.dtb
 dtb-$(CONFIG_MACH_SUN5I) += \
-	ntc-gr8-evb.dtb \
 	sun5i-a10s-auxtek-t003.dtb \
 	sun5i-a10s-auxtek-t004.dtb \
 	sun5i-a10s-mk802.dtb \
@@ -761,6 +760,7 @@
 	sun5i-a13-olinuxino-micro.dtb \
 	sun5i-a13-q8-tablet.dtb \
 	sun5i-a13-utoo-p66.dtb \
+	sun5i-gr8-evb.dtb \
 	sun5i-r8-chip.dtb
 dtb-$(CONFIG_MACH_SUN6I) += \
 	sun6i-a31-app4-evb1.dtb \
diff --git a/arch/arm/boot/dts/imx53-qsb.dts b/arch/arm/boot/dts/imx53-qsb.dts
index dec4b07..3799396 100644
--- a/arch/arm/boot/dts/imx53-qsb.dts
+++ b/arch/arm/boot/dts/imx53-qsb.dts
@@ -64,8 +64,8 @@
 			};
 
 			ldo3_reg: ldo3 {
-				regulator-min-microvolt = <600000>;
-				regulator-max-microvolt = <1800000>;
+				regulator-min-microvolt = <1725000>;
+				regulator-max-microvolt = <3300000>;
 				regulator-always-on;
 			};
 
@@ -76,8 +76,8 @@
 			};
 
 			ldo5_reg: ldo5 {
-				regulator-min-microvolt = <1725000>;
-				regulator-max-microvolt = <3300000>;
+				regulator-min-microvolt = <1200000>;
+				regulator-max-microvolt = <3600000>;
 				regulator-always-on;
 			};
 
@@ -100,14 +100,14 @@
 			};
 
 			ldo9_reg: ldo9 {
-				regulator-min-microvolt = <1200000>;
+				regulator-min-microvolt = <1250000>;
 				regulator-max-microvolt = <3600000>;
 				regulator-always-on;
 			};
 
 			ldo10_reg: ldo10 {
-				regulator-min-microvolt = <1250000>;
-				regulator-max-microvolt = <3650000>;
+				regulator-min-microvolt = <1200000>;
+				regulator-max-microvolt = <3600000>;
 				regulator-always-on;
 			};
 		};
diff --git a/arch/arm/boot/dts/logicpd-som-lv.dtsi b/arch/arm/boot/dts/logicpd-som-lv.dtsi
index 0ff1c2d..26cce4d 100644
--- a/arch/arm/boot/dts/logicpd-som-lv.dtsi
+++ b/arch/arm/boot/dts/logicpd-som-lv.dtsi
@@ -13,6 +13,11 @@
 		};
 	};
 
+	memory@80000000 {
+		device_type = "memory";
+		reg = <0x80000000 0>;
+	};
+
 	wl12xx_vmmc: wl12xx_vmmc {
 		compatible = "regulator-fixed";
 		regulator-name = "vwl1271";
diff --git a/arch/arm/boot/dts/logicpd-torpedo-som.dtsi b/arch/arm/boot/dts/logicpd-torpedo-som.dtsi
index 731ec37..8f9a69c 100644
--- a/arch/arm/boot/dts/logicpd-torpedo-som.dtsi
+++ b/arch/arm/boot/dts/logicpd-torpedo-som.dtsi
@@ -13,9 +13,9 @@
 		};
 	};
 
-	memory@0 {
+	memory@80000000 {
 		device_type = "memory";
-		reg = <0 0>;
+		reg = <0x80000000 0>;
 	};
 
 	leds {
diff --git a/arch/arm/boot/dts/omap5-board-common.dtsi b/arch/arm/boot/dts/omap5-board-common.dtsi
index 6365635..4caadb2 100644
--- a/arch/arm/boot/dts/omap5-board-common.dtsi
+++ b/arch/arm/boot/dts/omap5-board-common.dtsi
@@ -124,6 +124,7 @@
 		compatible = "ti,abe-twl6040";
 		ti,model = "omap5-uevm";
 
+		ti,jack-detection;
 		ti,mclk-freq = <19200000>;
 
 		ti,mcpdm = <&mcpdm>;
@@ -415,7 +416,7 @@
 			ti,backup-battery-charge-high-current;
 		};
 
-		gpadc {
+		gpadc: gpadc {
 			compatible = "ti,palmas-gpadc";
 			interrupts = <18 0
 				      16 0
@@ -475,8 +476,8 @@
 				smps6_reg: smps6 {
 					/* VDD_DDR3 - over VDD_SMPS6 */
 					regulator-name = "smps6";
-					regulator-min-microvolt = <1200000>;
-					regulator-max-microvolt = <1200000>;
+					regulator-min-microvolt = <1350000>;
+					regulator-max-microvolt = <1350000>;
 					regulator-always-on;
 					regulator-boot-on;
 				};
diff --git a/arch/arm/boot/dts/ste-snowball.dts b/arch/arm/boot/dts/ste-snowball.dts
index b3df1c6..386eee6 100644
--- a/arch/arm/boot/dts/ste-snowball.dts
+++ b/arch/arm/boot/dts/ste-snowball.dts
@@ -239,14 +239,25 @@
 			arm,primecell-periphid = <0x10480180>;
 			max-frequency = <100000000>;
 			bus-width = <4>;
+			cap-sd-highspeed;
 			cap-mmc-highspeed;
+			sd-uhs-sdr12;
+			sd-uhs-sdr25;
+			/* All direction control is used */
+			st,sig-dir-cmd;
+			st,sig-dir-dat0;
+			st,sig-dir-dat2;
+			st,sig-dir-dat31;
+			st,sig-pin-fbclk;
+			full-pwr-cycle;
 			vmmc-supply = <&ab8500_ldo_aux3_reg>;
 			vqmmc-supply = <&vmmci>;
 			pinctrl-names = "default", "sleep";
 			pinctrl-0 = <&sdi0_default_mode>;
 			pinctrl-1 = <&sdi0_sleep_mode>;
 
-			cd-gpios  = <&gpio6 26 GPIO_ACTIVE_LOW>; // 218
+			/* GPIO218 MMC_CD */
+			cd-gpios  = <&gpio6 26 GPIO_ACTIVE_LOW>;
 
 			status = "okay";
 		};
@@ -549,7 +560,7 @@
 					/* VMMCI level-shifter enable */
 					snowball_cfg3 {
 						pins = "GPIO217_AH12";
-						ste,config = <&gpio_out_lo>;
+						ste,config = <&gpio_out_hi>;
 					};
 					/* VMMCI level-shifter voltage select */
 					snowball_cfg4 {
diff --git a/arch/arm/boot/dts/stih407-family.dtsi b/arch/arm/boot/dts/stih407-family.dtsi
index 91096a4..8f79b41 100644
--- a/arch/arm/boot/dts/stih407-family.dtsi
+++ b/arch/arm/boot/dts/stih407-family.dtsi
@@ -283,6 +283,8 @@
 			clock-frequency = <400000>;
 			pinctrl-names = "default";
 			pinctrl-0 = <&pinctrl_i2c0_default>;
+			#address-cells = <1>;
+			#size-cells = <0>;
 
 			status = "disabled";
 		};
@@ -296,6 +298,8 @@
 			clock-frequency = <400000>;
 			pinctrl-names = "default";
 			pinctrl-0 = <&pinctrl_i2c1_default>;
+			#address-cells = <1>;
+			#size-cells = <0>;
 
 			status = "disabled";
 		};
@@ -309,6 +313,8 @@
 			clock-frequency = <400000>;
 			pinctrl-names = "default";
 			pinctrl-0 = <&pinctrl_i2c2_default>;
+			#address-cells = <1>;
+			#size-cells = <0>;
 
 			status = "disabled";
 		};
@@ -322,6 +328,8 @@
 			clock-frequency = <400000>;
 			pinctrl-names = "default";
 			pinctrl-0 = <&pinctrl_i2c3_default>;
+			#address-cells = <1>;
+			#size-cells = <0>;
 
 			status = "disabled";
 		};
@@ -335,6 +343,8 @@
 			clock-frequency = <400000>;
 			pinctrl-names = "default";
 			pinctrl-0 = <&pinctrl_i2c4_default>;
+			#address-cells = <1>;
+			#size-cells = <0>;
 
 			status = "disabled";
 		};
@@ -348,6 +358,8 @@
 			clock-frequency = <400000>;
 			pinctrl-names = "default";
 			pinctrl-0 = <&pinctrl_i2c5_default>;
+			#address-cells = <1>;
+			#size-cells = <0>;
 
 			status = "disabled";
 		};
@@ -363,6 +375,8 @@
 			clock-frequency = <400000>;
 			pinctrl-names = "default";
 			pinctrl-0 = <&pinctrl_i2c10_default>;
+			#address-cells = <1>;
+			#size-cells = <0>;
 
 			status = "disabled";
 		};
@@ -376,6 +390,8 @@
 			clock-frequency = <400000>;
 			pinctrl-names = "default";
 			pinctrl-0 = <&pinctrl_i2c11_default>;
+			#address-cells = <1>;
+			#size-cells = <0>;
 
 			status = "disabled";
 		};
diff --git a/arch/arm/boot/dts/stih410-b2260.dts b/arch/arm/boot/dts/stih410-b2260.dts
index ef2ff2f..7fb507f 100644
--- a/arch/arm/boot/dts/stih410-b2260.dts
+++ b/arch/arm/boot/dts/stih410-b2260.dts
@@ -74,7 +74,7 @@
 		/* Low speed expansion connector */
 		spi0: spi@9844000 {
 			label = "LS-SPI0";
-			cs-gpio = <&pio30 3 0>;
+			cs-gpios = <&pio30 3 0>;
 			status = "okay";
 		};
 
diff --git a/arch/arm/boot/dts/ntc-gr8-evb.dts b/arch/arm/boot/dts/sun5i-gr8-evb.dts
similarity index 99%
rename from arch/arm/boot/dts/ntc-gr8-evb.dts
rename to arch/arm/boot/dts/sun5i-gr8-evb.dts
index 4b622f3b..714381f 100644
--- a/arch/arm/boot/dts/ntc-gr8-evb.dts
+++ b/arch/arm/boot/dts/sun5i-gr8-evb.dts
@@ -44,7 +44,7 @@
  */
 
 /dts-v1/;
-#include "ntc-gr8.dtsi"
+#include "sun5i-gr8.dtsi"
 #include "sunxi-common-regulators.dtsi"
 
 #include <dt-bindings/gpio/gpio.h>
diff --git a/arch/arm/boot/dts/ntc-gr8.dtsi b/arch/arm/boot/dts/sun5i-gr8.dtsi
similarity index 100%
rename from arch/arm/boot/dts/ntc-gr8.dtsi
rename to arch/arm/boot/dts/sun5i-gr8.dtsi
diff --git a/arch/arm/boot/dts/sun8i-a23-a33.dtsi b/arch/arm/boot/dts/sun8i-a23-a33.dtsi
index 48fc24f..300a1bd 100644
--- a/arch/arm/boot/dts/sun8i-a23-a33.dtsi
+++ b/arch/arm/boot/dts/sun8i-a23-a33.dtsi
@@ -282,11 +282,15 @@
 			uart1_pins_a: uart1@0 {
 				allwinner,pins = "PG6", "PG7";
 				allwinner,function = "uart1";
+				allwinner,drive = <SUN4I_PINCTRL_10_MA>;
+				allwinner,pull = <SUN4I_PINCTRL_NO_PULL>;
 			};
 
 			uart1_pins_cts_rts_a: uart1-cts-rts@0 {
 				allwinner,pins = "PG8", "PG9";
 				allwinner,function = "uart1";
+				allwinner,drive = <SUN4I_PINCTRL_10_MA>;
+				allwinner,pull = <SUN4I_PINCTRL_NO_PULL>;
 			};
 
 			mmc0_pins_a: mmc0@0 {
diff --git a/arch/arm/boot/dts/uniphier-pro5.dtsi b/arch/arm/boot/dts/uniphier-pro5.dtsi
index 2c49c36..5357ea9 100644
--- a/arch/arm/boot/dts/uniphier-pro5.dtsi
+++ b/arch/arm/boot/dts/uniphier-pro5.dtsi
@@ -184,11 +184,11 @@
 };
 
 &mio_clk {
-	compatible = "socionext,uniphier-pro5-mio-clock";
+	compatible = "socionext,uniphier-pro5-sd-clock";
 };
 
 &mio_rst {
-	compatible = "socionext,uniphier-pro5-mio-reset";
+	compatible = "socionext,uniphier-pro5-sd-reset";
 };
 
 &peri_clk {
diff --git a/arch/arm/boot/dts/uniphier-pxs2.dtsi b/arch/arm/boot/dts/uniphier-pxs2.dtsi
index 8789cd5..950f07b 100644
--- a/arch/arm/boot/dts/uniphier-pxs2.dtsi
+++ b/arch/arm/boot/dts/uniphier-pxs2.dtsi
@@ -197,11 +197,11 @@
 };
 
 &mio_clk {
-	compatible = "socionext,uniphier-pxs2-mio-clock";
+	compatible = "socionext,uniphier-pxs2-sd-clock";
 };
 
 &mio_rst {
-	compatible = "socionext,uniphier-pxs2-mio-reset";
+	compatible = "socionext,uniphier-pxs2-sd-reset";
 };
 
 &peri_clk {
diff --git a/arch/arm/boot/dts/vf500.dtsi b/arch/arm/boot/dts/vf500.dtsi
index a3824e6..d7fdb2a 100644
--- a/arch/arm/boot/dts/vf500.dtsi
+++ b/arch/arm/boot/dts/vf500.dtsi
@@ -70,7 +70,7 @@
 			global_timer: timer@40002200 {
 				compatible = "arm,cortex-a9-global-timer";
 				reg = <0x40002200 0x20>;
-				interrupts = <GIC_PPI 11 IRQ_TYPE_LEVEL_HIGH>;
+				interrupts = <GIC_PPI 11 IRQ_TYPE_EDGE_RISING>;
 				interrupt-parent = <&intc>;
 				clocks = <&clks VF610_CLK_PLATFORM_BUS>;
 			};
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index 437d074..11f37ed 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -850,6 +850,7 @@
 CONFIG_PWM_TEGRA=y
 CONFIG_PWM_VT8500=y
 CONFIG_PHY_HIX5HD2_SATA=y
+CONFIG_E1000E=y
 CONFIG_PWM_STI=y
 CONFIG_PWM_BCM2835=y
 CONFIG_PWM_BRCMSTB=m
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 0745538..55e0e3e 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -8,7 +8,6 @@
 generic-y += emergency-restart.h
 generic-y += errno.h
 generic-y += exec.h
-generic-y += export.h
 generic-y += ioctl.h
 generic-y += ipcbuf.h
 generic-y += irq_regs.h
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index d7ea6bc..8ef0538 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -66,6 +66,7 @@
 extern void __kvm_flush_vm_context(void);
 extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
 extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
+extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu);
 
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 2d19e02..d5423ab 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -57,6 +57,9 @@
 	/* VTTBR value associated with below pgd and vmid */
 	u64    vttbr;
 
+	/* The last vcpu id that ran on each physical CPU */
+	int __percpu *last_vcpu_ran;
+
 	/* Timer */
 	struct arch_timer_kvm	timer;
 
diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index 343135e..5850890 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -71,6 +71,7 @@
 #define ICIALLUIS	__ACCESS_CP15(c7, 0, c1, 0)
 #define ATS1CPR		__ACCESS_CP15(c7, 0, c8, 0)
 #define TLBIALLIS	__ACCESS_CP15(c8, 0, c3, 0)
+#define TLBIALL		__ACCESS_CP15(c8, 0, c7, 0)
 #define TLBIALLNSNHIS	__ACCESS_CP15(c8, 4, c3, 4)
 #define PRRR		__ACCESS_CP15(c10, 0, c2, 0)
 #define NMRR		__ACCESS_CP15(c10, 0, c2, 1)
diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 194b699..ada0d29 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -19,7 +19,7 @@
  * This may need to be greater than __NR_last_syscall+1 in order to
  * account for the padding in the syscall table
  */
-#define __NR_syscalls  (396)
+#define __NR_syscalls  (400)
 
 #define __ARCH_WANT_STAT64
 #define __ARCH_WANT_SYS_GETHOSTNAME
diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index 2cb9dc7..314100a 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -420,6 +420,9 @@
 #define __NR_copy_file_range		(__NR_SYSCALL_BASE+391)
 #define __NR_preadv2			(__NR_SYSCALL_BASE+392)
 #define __NR_pwritev2			(__NR_SYSCALL_BASE+393)
+#define __NR_pkey_mprotect		(__NR_SYSCALL_BASE+394)
+#define __NR_pkey_alloc			(__NR_SYSCALL_BASE+395)
+#define __NR_pkey_free			(__NR_SYSCALL_BASE+396)
 
 /*
  * The following SWIs are ARM private.
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 68c2c09..ad325a8 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -33,7 +33,7 @@
 obj-$(CONFIG_CPU_IDLE)		+= cpuidle.o
 obj-$(CONFIG_ISA_DMA_API)	+= dma.o
 obj-$(CONFIG_FIQ)		+= fiq.o fiqasm.o
-obj-$(CONFIG_MODULES)		+= module.o
+obj-$(CONFIG_MODULES)		+= armksyms.o module.o
 obj-$(CONFIG_ARM_MODULE_PLTS)	+= module-plts.o
 obj-$(CONFIG_ISA_DMA)		+= dma-isa.o
 obj-$(CONFIG_PCI)		+= bios32.o isa.o
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
new file mode 100644
index 0000000..7e45f69
--- /dev/null
+++ b/arch/arm/kernel/armksyms.c
@@ -0,0 +1,183 @@
+/*
+ *  linux/arch/arm/kernel/armksyms.c
+ *
+ *  Copyright (C) 2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/cryptohash.h>
+#include <linux/delay.h>
+#include <linux/in6.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/arm-smccc.h>
+
+#include <asm/checksum.h>
+#include <asm/ftrace.h>
+
+/*
+ * libgcc functions - functions that are used internally by the
+ * compiler...  (prototypes are not correct though, but that
+ * doesn't really matter since they're not versioned).
+ */
+extern void __ashldi3(void);
+extern void __ashrdi3(void);
+extern void __divsi3(void);
+extern void __lshrdi3(void);
+extern void __modsi3(void);
+extern void __muldi3(void);
+extern void __ucmpdi2(void);
+extern void __udivsi3(void);
+extern void __umodsi3(void);
+extern void __do_div64(void);
+extern void __bswapsi2(void);
+extern void __bswapdi2(void);
+
+extern void __aeabi_idiv(void);
+extern void __aeabi_idivmod(void);
+extern void __aeabi_lasr(void);
+extern void __aeabi_llsl(void);
+extern void __aeabi_llsr(void);
+extern void __aeabi_lmul(void);
+extern void __aeabi_uidiv(void);
+extern void __aeabi_uidivmod(void);
+extern void __aeabi_ulcmp(void);
+
+extern void fpundefinstr(void);
+
+void mmioset(void *, unsigned int, size_t);
+void mmiocpy(void *, const void *, size_t);
+
+	/* platform dependent support */
+EXPORT_SYMBOL(arm_delay_ops);
+
+	/* networking */
+EXPORT_SYMBOL(csum_partial);
+EXPORT_SYMBOL(csum_partial_copy_from_user);
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
+EXPORT_SYMBOL(__csum_ipv6_magic);
+
+	/* io */
+#ifndef __raw_readsb
+EXPORT_SYMBOL(__raw_readsb);
+#endif
+#ifndef __raw_readsw
+EXPORT_SYMBOL(__raw_readsw);
+#endif
+#ifndef __raw_readsl
+EXPORT_SYMBOL(__raw_readsl);
+#endif
+#ifndef __raw_writesb
+EXPORT_SYMBOL(__raw_writesb);
+#endif
+#ifndef __raw_writesw
+EXPORT_SYMBOL(__raw_writesw);
+#endif
+#ifndef __raw_writesl
+EXPORT_SYMBOL(__raw_writesl);
+#endif
+
+	/* string / mem functions */
+EXPORT_SYMBOL(strchr);
+EXPORT_SYMBOL(strrchr);
+EXPORT_SYMBOL(memset);
+EXPORT_SYMBOL(memcpy);
+EXPORT_SYMBOL(memmove);
+EXPORT_SYMBOL(memchr);
+EXPORT_SYMBOL(__memzero);
+
+EXPORT_SYMBOL(mmioset);
+EXPORT_SYMBOL(mmiocpy);
+
+#ifdef CONFIG_MMU
+EXPORT_SYMBOL(copy_page);
+
+EXPORT_SYMBOL(arm_copy_from_user);
+EXPORT_SYMBOL(arm_copy_to_user);
+EXPORT_SYMBOL(arm_clear_user);
+
+EXPORT_SYMBOL(__get_user_1);
+EXPORT_SYMBOL(__get_user_2);
+EXPORT_SYMBOL(__get_user_4);
+EXPORT_SYMBOL(__get_user_8);
+
+#ifdef __ARMEB__
+EXPORT_SYMBOL(__get_user_64t_1);
+EXPORT_SYMBOL(__get_user_64t_2);
+EXPORT_SYMBOL(__get_user_64t_4);
+EXPORT_SYMBOL(__get_user_32t_8);
+#endif
+
+EXPORT_SYMBOL(__put_user_1);
+EXPORT_SYMBOL(__put_user_2);
+EXPORT_SYMBOL(__put_user_4);
+EXPORT_SYMBOL(__put_user_8);
+#endif
+
+	/* gcc lib functions */
+EXPORT_SYMBOL(__ashldi3);
+EXPORT_SYMBOL(__ashrdi3);
+EXPORT_SYMBOL(__divsi3);
+EXPORT_SYMBOL(__lshrdi3);
+EXPORT_SYMBOL(__modsi3);
+EXPORT_SYMBOL(__muldi3);
+EXPORT_SYMBOL(__ucmpdi2);
+EXPORT_SYMBOL(__udivsi3);
+EXPORT_SYMBOL(__umodsi3);
+EXPORT_SYMBOL(__do_div64);
+EXPORT_SYMBOL(__bswapsi2);
+EXPORT_SYMBOL(__bswapdi2);
+
+#ifdef CONFIG_AEABI
+EXPORT_SYMBOL(__aeabi_idiv);
+EXPORT_SYMBOL(__aeabi_idivmod);
+EXPORT_SYMBOL(__aeabi_lasr);
+EXPORT_SYMBOL(__aeabi_llsl);
+EXPORT_SYMBOL(__aeabi_llsr);
+EXPORT_SYMBOL(__aeabi_lmul);
+EXPORT_SYMBOL(__aeabi_uidiv);
+EXPORT_SYMBOL(__aeabi_uidivmod);
+EXPORT_SYMBOL(__aeabi_ulcmp);
+#endif
+
+	/* bitops */
+EXPORT_SYMBOL(_set_bit);
+EXPORT_SYMBOL(_test_and_set_bit);
+EXPORT_SYMBOL(_clear_bit);
+EXPORT_SYMBOL(_test_and_clear_bit);
+EXPORT_SYMBOL(_change_bit);
+EXPORT_SYMBOL(_test_and_change_bit);
+EXPORT_SYMBOL(_find_first_zero_bit_le);
+EXPORT_SYMBOL(_find_next_zero_bit_le);
+EXPORT_SYMBOL(_find_first_bit_le);
+EXPORT_SYMBOL(_find_next_bit_le);
+
+#ifdef __ARMEB__
+EXPORT_SYMBOL(_find_first_zero_bit_be);
+EXPORT_SYMBOL(_find_next_zero_bit_be);
+EXPORT_SYMBOL(_find_first_bit_be);
+EXPORT_SYMBOL(_find_next_bit_be);
+#endif
+
+#ifdef CONFIG_FUNCTION_TRACER
+#ifdef CONFIG_OLD_MCOUNT
+EXPORT_SYMBOL(mcount);
+#endif
+EXPORT_SYMBOL(__gnu_mcount_nc);
+#endif
+
+#ifdef CONFIG_ARM_PATCH_PHYS_VIRT
+EXPORT_SYMBOL(__pv_phys_pfn_offset);
+EXPORT_SYMBOL(__pv_offset);
+#endif
+
+#ifdef CONFIG_HAVE_ARM_SMCCC
+EXPORT_SYMBOL(arm_smccc_smc);
+EXPORT_SYMBOL(arm_smccc_hvc);
+#endif
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index 703fa0f..08030b1 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -403,6 +403,9 @@
 		CALL(sys_copy_file_range)
 		CALL(sys_preadv2)
 		CALL(sys_pwritev2)
+		CALL(sys_pkey_mprotect)
+/* 395 */	CALL(sys_pkey_alloc)
+		CALL(sys_pkey_free)
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
 #define syscalls_counted
diff --git a/arch/arm/kernel/entry-ftrace.S b/arch/arm/kernel/entry-ftrace.S
index b629d3f..c73c403 100644
--- a/arch/arm/kernel/entry-ftrace.S
+++ b/arch/arm/kernel/entry-ftrace.S
@@ -7,7 +7,6 @@
 #include <asm/assembler.h>
 #include <asm/ftrace.h>
 #include <asm/unwind.h>
-#include <asm/export.h>
 
 #include "entry-header.S"
 
@@ -154,7 +153,6 @@
 	__mcount _old
 #endif
 ENDPROC(mcount)
-EXPORT_SYMBOL(mcount)
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 ENTRY(ftrace_caller_old)
@@ -207,7 +205,6 @@
 #endif
 UNWIND(.fnend)
 ENDPROC(__gnu_mcount_nc)
-EXPORT_SYMBOL(__gnu_mcount_nc)
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 ENTRY(ftrace_caller)
diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
index f41cee4..04286fd 100644
--- a/arch/arm/kernel/head.S
+++ b/arch/arm/kernel/head.S
@@ -22,7 +22,6 @@
 #include <asm/memory.h>
 #include <asm/thread_info.h>
 #include <asm/pgtable.h>
-#include <asm/export.h>
 
 #if defined(CONFIG_DEBUG_LL) && !defined(CONFIG_DEBUG_SEMIHOSTING)
 #include CONFIG_DEBUG_LL_INCLUDE
@@ -728,8 +727,6 @@
 __pv_offset:
 	.quad	0
 	.size	__pv_offset, . -__pv_offset
-EXPORT_SYMBOL(__pv_phys_pfn_offset)
-EXPORT_SYMBOL(__pv_offset)
 #endif
 
 #include "head-common.S"
diff --git a/arch/arm/kernel/smccc-call.S b/arch/arm/kernel/smccc-call.S
index 37669e7..2e48b67 100644
--- a/arch/arm/kernel/smccc-call.S
+++ b/arch/arm/kernel/smccc-call.S
@@ -16,7 +16,6 @@
 #include <asm/opcodes-sec.h>
 #include <asm/opcodes-virt.h>
 #include <asm/unwind.h>
-#include <asm/export.h>
 
 	/*
 	 * Wrap c macros in asm macros to delay expansion until after the
@@ -52,7 +51,6 @@
 ENTRY(arm_smccc_smc)
 	SMCCC SMCCC_SMC
 ENDPROC(arm_smccc_smc)
-EXPORT_SYMBOL(arm_smccc_smc)
 
 /*
  * void smccc_hvc(unsigned long a0, unsigned long a1, unsigned long a2,
@@ -62,4 +60,3 @@
 ENTRY(arm_smccc_hvc)
 	SMCCC SMCCC_HVC
 ENDPROC(arm_smccc_hvc)
-EXPORT_SYMBOL(arm_smccc_hvc)
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index bc69838..9688ec0 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -74,6 +74,26 @@
 		dump_mem("", "Exception stack", frame + 4, frame + 4 + sizeof(struct pt_regs));
 }
 
+void dump_backtrace_stm(u32 *stack, u32 instruction)
+{
+	char str[80], *p;
+	unsigned int x;
+	int reg;
+
+	for (reg = 10, x = 0, p = str; reg >= 0; reg--) {
+		if (instruction & BIT(reg)) {
+			p += sprintf(p, " r%d:%08x", reg, *stack--);
+			if (++x == 6) {
+				x = 0;
+				p = str;
+				printk("%s\n", str);
+			}
+		}
+	}
+	if (p != str)
+		printk("%s\n", str);
+}
+
 #ifndef CONFIG_ARM_UNWIND
 /*
  * Stack pointers should always be within the kernels view of
diff --git a/arch/arm/kernel/vmlinux-xip.lds.S b/arch/arm/kernel/vmlinux-xip.lds.S
index 7fa487e..37b2a11 100644
--- a/arch/arm/kernel/vmlinux-xip.lds.S
+++ b/arch/arm/kernel/vmlinux-xip.lds.S
@@ -3,6 +3,9 @@
  * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
  */
 
+/* No __ro_after_init data in the .rodata section - which will always be ro */
+#define RO_AFTER_INIT_DATA
+
 #include <asm-generic/vmlinux.lds.h>
 #include <asm/cache.h>
 #include <asm/thread_info.h>
@@ -223,6 +226,8 @@
 		. = ALIGN(PAGE_SIZE);
 		__init_end = .;
 
+		*(.data..ro_after_init)
+
 		NOSAVE_DATA
 		CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
 		READ_MOSTLY_DATA(L1_CACHE_BYTES)
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 03e9273..19b5f5c 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -114,11 +114,18 @@
  */
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
-	int ret = 0;
+	int ret, cpu;
 
 	if (type)
 		return -EINVAL;
 
+	kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran));
+	if (!kvm->arch.last_vcpu_ran)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu)
+		*per_cpu_ptr(kvm->arch.last_vcpu_ran, cpu) = -1;
+
 	ret = kvm_alloc_stage2_pgd(kvm);
 	if (ret)
 		goto out_fail_alloc;
@@ -141,6 +148,8 @@
 out_free_stage2_pgd:
 	kvm_free_stage2_pgd(kvm);
 out_fail_alloc:
+	free_percpu(kvm->arch.last_vcpu_ran);
+	kvm->arch.last_vcpu_ran = NULL;
 	return ret;
 }
 
@@ -168,6 +177,9 @@
 {
 	int i;
 
+	free_percpu(kvm->arch.last_vcpu_ran);
+	kvm->arch.last_vcpu_ran = NULL;
+
 	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 		if (kvm->vcpus[i]) {
 			kvm_arch_vcpu_free(kvm->vcpus[i]);
@@ -312,6 +324,19 @@
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	int *last_ran;
+
+	last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran);
+
+	/*
+	 * We might get preempted before the vCPU actually runs, but
+	 * over-invalidation doesn't affect correctness.
+	 */
+	if (*last_ran != vcpu->vcpu_id) {
+		kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu);
+		*last_ran = vcpu->vcpu_id;
+	}
+
 	vcpu->cpu = cpu;
 	vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
 
@@ -1312,6 +1337,13 @@
 		goto out_err;
 	}
 
+	err = create_hyp_mappings(kvm_ksym_ref(__bss_start),
+				  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
+	if (err) {
+		kvm_err("Cannot map bss section\n");
+		goto out_err;
+	}
+
 	/*
 	 * Map the Hyp stack pages
 	 */
diff --git a/arch/arm/kvm/hyp/tlb.c b/arch/arm/kvm/hyp/tlb.c
index 7296528..6d810af 100644
--- a/arch/arm/kvm/hyp/tlb.c
+++ b/arch/arm/kvm/hyp/tlb.c
@@ -55,6 +55,21 @@
 	__kvm_tlb_flush_vmid(kvm);
 }
 
+void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = kern_hyp_va(kern_hyp_va(vcpu)->kvm);
+
+	/* Switch to requested VMID */
+	write_sysreg(kvm->arch.vttbr, VTTBR);
+	isb();
+
+	write_sysreg(0, TLBIALL);
+	dsb(nsh);
+	isb();
+
+	write_sysreg(0, VTTBR);
+}
+
 void __hyp_text __kvm_flush_vm_context(void)
 {
 	write_sysreg(0, TLBIALLNSNHIS);
diff --git a/arch/arm/lib/ashldi3.S b/arch/arm/lib/ashldi3.S
index a7e7de8..b05e958 100644
--- a/arch/arm/lib/ashldi3.S
+++ b/arch/arm/lib/ashldi3.S
@@ -28,7 +28,6 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 #ifdef __ARMEB__
 #define al r1
@@ -53,5 +52,3 @@
 
 ENDPROC(__ashldi3)
 ENDPROC(__aeabi_llsl)
-EXPORT_SYMBOL(__ashldi3)
-EXPORT_SYMBOL(__aeabi_llsl)
diff --git a/arch/arm/lib/ashrdi3.S b/arch/arm/lib/ashrdi3.S
index 490336e..275d7d2 100644
--- a/arch/arm/lib/ashrdi3.S
+++ b/arch/arm/lib/ashrdi3.S
@@ -28,7 +28,6 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 #ifdef __ARMEB__
 #define al r1
@@ -53,5 +52,3 @@
 
 ENDPROC(__ashrdi3)
 ENDPROC(__aeabi_lasr)
-EXPORT_SYMBOL(__ashrdi3)
-EXPORT_SYMBOL(__aeabi_lasr)
diff --git a/arch/arm/lib/backtrace.S b/arch/arm/lib/backtrace.S
index fab5a50..7d7952e 100644
--- a/arch/arm/lib/backtrace.S
+++ b/arch/arm/lib/backtrace.S
@@ -10,6 +10,7 @@
  * 27/03/03 Ian Molton Clean up CONFIG_CPU
  *
  */
+#include <linux/kern_levels.h>
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 		.text
@@ -83,13 +84,13 @@
 		teq	r3, r1, lsr #11
 		ldreq	r0, [frame, #-8]	@ get sp
 		subeq	r0, r0, #4		@ point at the last arg
-		bleq	.Ldumpstm		@ dump saved registers
+		bleq	dump_backtrace_stm	@ dump saved registers
 
 1004:		ldr	r1, [sv_pc, #0]		@ if stmfd sp!, {..., fp, ip, lr, pc}
 		ldr	r3, .Ldsi		@ instruction exists,
 		teq	r3, r1, lsr #11
 		subeq	r0, frame, #16
-		bleq	.Ldumpstm		@ dump saved registers
+		bleq	dump_backtrace_stm	@ dump saved registers
 
 		teq	sv_fp, #0		@ zero saved fp means
 		beq	no_frame		@ no further frames
@@ -112,38 +113,6 @@
 		.long	1004b, 1006b
 		.popsection
 
-#define instr r4
-#define reg   r5
-#define stack r6
-
-.Ldumpstm:	stmfd	sp!, {instr, reg, stack, r7, lr}
-		mov	stack, r0
-		mov	instr, r1
-		mov	reg, #10
-		mov	r7, #0
-1:		mov	r3, #1
- ARM(		tst	instr, r3, lsl reg	)
- THUMB(		lsl	r3, reg			)
- THUMB(		tst	instr, r3		)
-		beq	2f
-		add	r7, r7, #1
-		teq	r7, #6
-		moveq	r7, #0
-		adr	r3, .Lcr
-		addne	r3, r3, #1		@ skip newline
-		ldr	r2, [stack], #-4
-		mov	r1, reg
-		adr	r0, .Lfp
-		bl	printk
-2:		subs	reg, reg, #1
-		bpl	1b
-		teq	r7, #0
-		adrne	r0, .Lcr
-		blne	printk
-		ldmfd	sp!, {instr, reg, stack, r7, pc}
-
-.Lfp:		.asciz	" r%d:%08x%s"
-.Lcr:		.asciz	"\n"
 .Lbad:		.asciz	"Backtrace aborted due to bad frame pointer <%p>\n"
 		.align
 .Ldsi:		.word	0xe92dd800 >> 11	@ stmfd sp!, {... fp, ip, lr, pc}
diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h
index df06638..7d807cf 100644
--- a/arch/arm/lib/bitops.h
+++ b/arch/arm/lib/bitops.h
@@ -1,6 +1,5 @@
 #include <asm/assembler.h>
 #include <asm/unwind.h>
-#include <asm/export.h>
 
 #if __LINUX_ARM_ARCH__ >= 6
 	.macro	bitop, name, instr
@@ -26,7 +25,6 @@
 	bx	lr
 UNWIND(	.fnend		)
 ENDPROC(\name		)
-EXPORT_SYMBOL(\name	)
 	.endm
 
 	.macro	testop, name, instr, store
@@ -57,7 +55,6 @@
 2:	bx	lr
 UNWIND(	.fnend		)
 ENDPROC(\name		)
-EXPORT_SYMBOL(\name	)
 	.endm
 #else
 	.macro	bitop, name, instr
@@ -77,7 +74,6 @@
 	ret	lr
 UNWIND(	.fnend		)
 ENDPROC(\name		)
-EXPORT_SYMBOL(\name	)
 	.endm
 
 /**
@@ -106,6 +102,5 @@
 	ret	lr
 UNWIND(	.fnend		)
 ENDPROC(\name		)
-EXPORT_SYMBOL(\name	)
 	.endm
 #endif
diff --git a/arch/arm/lib/bswapsdi2.S b/arch/arm/lib/bswapsdi2.S
index f05f782..07cda73 100644
--- a/arch/arm/lib/bswapsdi2.S
+++ b/arch/arm/lib/bswapsdi2.S
@@ -1,6 +1,5 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 #if __LINUX_ARM_ARCH__ >= 6
 ENTRY(__bswapsi2)
@@ -36,5 +35,3 @@
 	ret lr
 ENDPROC(__bswapdi2)
 #endif
-EXPORT_SYMBOL(__bswapsi2)
-EXPORT_SYMBOL(__bswapdi2)
diff --git a/arch/arm/lib/clear_user.S b/arch/arm/lib/clear_user.S
index b566154..e936352 100644
--- a/arch/arm/lib/clear_user.S
+++ b/arch/arm/lib/clear_user.S
@@ -10,7 +10,6 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/unwind.h>
-#include <asm/export.h>
 
 		.text
 
@@ -51,9 +50,6 @@
 UNWIND(.fnend)
 ENDPROC(arm_clear_user)
 ENDPROC(__clear_user_std)
-#ifndef CONFIG_UACCESS_WITH_MEMCPY
-EXPORT_SYMBOL(arm_clear_user)
-#endif
 
 		.pushsection .text.fixup,"ax"
 		.align	0
diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
index 63e4c1e..7a4b060 100644
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -13,7 +13,6 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/unwind.h>
-#include <asm/export.h>
 
 /*
  * Prototype:
@@ -95,7 +94,6 @@
 #include "copy_template.S"
 
 ENDPROC(arm_copy_from_user)
-EXPORT_SYMBOL(arm_copy_from_user)
 
 	.pushsection .fixup,"ax"
 	.align 0
diff --git a/arch/arm/lib/copy_page.S b/arch/arm/lib/copy_page.S
index d97851d..6ee2f67 100644
--- a/arch/arm/lib/copy_page.S
+++ b/arch/arm/lib/copy_page.S
@@ -13,7 +13,6 @@
 #include <asm/assembler.h>
 #include <asm/asm-offsets.h>
 #include <asm/cache.h>
-#include <asm/export.h>
 
 #define COPY_COUNT (PAGE_SZ / (2 * L1_CACHE_BYTES) PLD( -1 ))
 
@@ -46,4 +45,3 @@
 	PLD(	beq	2b			)
 		ldmfd	sp!, {r4, pc}			@	3
 ENDPROC(copy_page)
-EXPORT_SYMBOL(copy_page)
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index 592c179..caf5019 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -13,7 +13,6 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/unwind.h>
-#include <asm/export.h>
 
 /*
  * Prototype:
@@ -100,9 +99,6 @@
 
 ENDPROC(arm_copy_to_user)
 ENDPROC(__copy_to_user_std)
-#ifndef CONFIG_UACCESS_WITH_MEMCPY
-EXPORT_SYMBOL(arm_copy_to_user)
-#endif
 
 	.pushsection .text.fixup,"ax"
 	.align 0
diff --git a/arch/arm/lib/csumipv6.S b/arch/arm/lib/csumipv6.S
index 68603b5..3ac6ef0 100644
--- a/arch/arm/lib/csumipv6.S
+++ b/arch/arm/lib/csumipv6.S
@@ -9,7 +9,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 		.text
 
@@ -31,4 +30,4 @@
 		adcs	r0, r0, #0
 		ldmfd	sp!, {pc}
 ENDPROC(__csum_ipv6_magic)
-EXPORT_SYMBOL(__csum_ipv6_magic)
+
diff --git a/arch/arm/lib/csumpartial.S b/arch/arm/lib/csumpartial.S
index 830b20e..984e0f2 100644
--- a/arch/arm/lib/csumpartial.S
+++ b/arch/arm/lib/csumpartial.S
@@ -9,7 +9,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 		.text
 
@@ -141,4 +140,3 @@
 		bne	4b
 		b	.Lless4
 ENDPROC(csum_partial)
-EXPORT_SYMBOL(csum_partial)
diff --git a/arch/arm/lib/csumpartialcopy.S b/arch/arm/lib/csumpartialcopy.S
index 9c3383f..d03fc71 100644
--- a/arch/arm/lib/csumpartialcopy.S
+++ b/arch/arm/lib/csumpartialcopy.S
@@ -49,6 +49,5 @@
 
 #define FN_ENTRY	ENTRY(csum_partial_copy_nocheck)
 #define FN_EXIT		ENDPROC(csum_partial_copy_nocheck)
-#define FN_EXPORT	EXPORT_SYMBOL(csum_partial_copy_nocheck)
 
 #include "csumpartialcopygeneric.S"
diff --git a/arch/arm/lib/csumpartialcopygeneric.S b/arch/arm/lib/csumpartialcopygeneric.S
index 8b94d20..10b4590 100644
--- a/arch/arm/lib/csumpartialcopygeneric.S
+++ b/arch/arm/lib/csumpartialcopygeneric.S
@@ -8,7 +8,6 @@
  * published by the Free Software Foundation.
  */
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 /*
  * unsigned int
@@ -332,4 +331,3 @@
 		mov	r5, r4, get_byte_1
 		b	.Lexit
 FN_EXIT
-FN_EXPORT
diff --git a/arch/arm/lib/csumpartialcopyuser.S b/arch/arm/lib/csumpartialcopyuser.S
index 5d495ed..1712f13 100644
--- a/arch/arm/lib/csumpartialcopyuser.S
+++ b/arch/arm/lib/csumpartialcopyuser.S
@@ -73,7 +73,6 @@
 
 #define FN_ENTRY	ENTRY(csum_partial_copy_from_user)
 #define FN_EXIT		ENDPROC(csum_partial_copy_from_user)
-#define FN_EXPORT	EXPORT_SYMBOL(csum_partial_copy_from_user)
 
 #include "csumpartialcopygeneric.S"
 
diff --git a/arch/arm/lib/delay.c b/arch/arm/lib/delay.c
index 69aad80..2cef118 100644
--- a/arch/arm/lib/delay.c
+++ b/arch/arm/lib/delay.c
@@ -24,7 +24,6 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/export.h>
 #include <linux/timex.h>
 
 /*
@@ -35,7 +34,6 @@
 	.const_udelay	= __loop_const_udelay,
 	.udelay		= __loop_udelay,
 };
-EXPORT_SYMBOL(arm_delay_ops);
 
 static const struct delay_timer *delay_timer;
 static bool delay_calibrated;
diff --git a/arch/arm/lib/div64.S b/arch/arm/lib/div64.S
index 0c9e1c1..a9eafe4 100644
--- a/arch/arm/lib/div64.S
+++ b/arch/arm/lib/div64.S
@@ -15,7 +15,6 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/unwind.h>
-#include <asm/export.h>
 
 #ifdef __ARMEB__
 #define xh r0
@@ -211,4 +210,3 @@
 
 UNWIND(.fnend)
 ENDPROC(__do_div64)
-EXPORT_SYMBOL(__do_div64)
diff --git a/arch/arm/lib/findbit.S b/arch/arm/lib/findbit.S
index 26302b8..7848780 100644
--- a/arch/arm/lib/findbit.S
+++ b/arch/arm/lib/findbit.S
@@ -15,7 +15,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
                 .text
 
 /*
@@ -38,7 +37,6 @@
 3:		mov	r0, r1			@ no free bits
 		ret	lr
 ENDPROC(_find_first_zero_bit_le)
-EXPORT_SYMBOL(_find_first_zero_bit_le)
 
 /*
  * Purpose  : Find next 'zero' bit
@@ -59,7 +57,6 @@
 		add	r2, r2, #1		@ align bit pointer
 		b	2b			@ loop for next bit
 ENDPROC(_find_next_zero_bit_le)
-EXPORT_SYMBOL(_find_next_zero_bit_le)
 
 /*
  * Purpose  : Find a 'one' bit
@@ -81,7 +78,6 @@
 3:		mov	r0, r1			@ no free bits
 		ret	lr
 ENDPROC(_find_first_bit_le)
-EXPORT_SYMBOL(_find_first_bit_le)
 
 /*
  * Purpose  : Find next 'one' bit
@@ -101,7 +97,6 @@
 		add	r2, r2, #1		@ align bit pointer
 		b	2b			@ loop for next bit
 ENDPROC(_find_next_bit_le)
-EXPORT_SYMBOL(_find_next_bit_le)
 
 #ifdef __ARMEB__
 
@@ -121,7 +116,6 @@
 3:		mov	r0, r1			@ no free bits
 		ret	lr
 ENDPROC(_find_first_zero_bit_be)
-EXPORT_SYMBOL(_find_first_zero_bit_be)
 
 ENTRY(_find_next_zero_bit_be)
 		teq	r1, #0
@@ -139,7 +133,6 @@
 		add	r2, r2, #1		@ align bit pointer
 		b	2b			@ loop for next bit
 ENDPROC(_find_next_zero_bit_be)
-EXPORT_SYMBOL(_find_next_zero_bit_be)
 
 ENTRY(_find_first_bit_be)
 		teq	r1, #0
@@ -157,7 +150,6 @@
 3:		mov	r0, r1			@ no free bits
 		ret	lr
 ENDPROC(_find_first_bit_be)
-EXPORT_SYMBOL(_find_first_bit_be)
 
 ENTRY(_find_next_bit_be)
 		teq	r1, #0
@@ -174,7 +166,6 @@
 		add	r2, r2, #1		@ align bit pointer
 		b	2b			@ loop for next bit
 ENDPROC(_find_next_bit_be)
-EXPORT_SYMBOL(_find_next_bit_be)
 
 #endif
 
diff --git a/arch/arm/lib/getuser.S b/arch/arm/lib/getuser.S
index 9d09a38..8ecfd15 100644
--- a/arch/arm/lib/getuser.S
+++ b/arch/arm/lib/getuser.S
@@ -31,7 +31,6 @@
 #include <asm/assembler.h>
 #include <asm/errno.h>
 #include <asm/domain.h>
-#include <asm/export.h>
 
 ENTRY(__get_user_1)
 	check_uaccess r0, 1, r1, r2, __get_user_bad
@@ -39,7 +38,6 @@
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_1)
-EXPORT_SYMBOL(__get_user_1)
 
 ENTRY(__get_user_2)
 	check_uaccess r0, 2, r1, r2, __get_user_bad
@@ -60,7 +58,6 @@
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_2)
-EXPORT_SYMBOL(__get_user_2)
 
 ENTRY(__get_user_4)
 	check_uaccess r0, 4, r1, r2, __get_user_bad
@@ -68,7 +65,6 @@
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_4)
-EXPORT_SYMBOL(__get_user_4)
 
 ENTRY(__get_user_8)
 	check_uaccess r0, 8, r1, r2, __get_user_bad
@@ -82,7 +78,6 @@
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_8)
-EXPORT_SYMBOL(__get_user_8)
 
 #ifdef __ARMEB__
 ENTRY(__get_user_32t_8)
@@ -96,7 +91,6 @@
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_32t_8)
-EXPORT_SYMBOL(__get_user_32t_8)
 
 ENTRY(__get_user_64t_1)
 	check_uaccess r0, 1, r1, r2, __get_user_bad8
@@ -104,7 +98,6 @@
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_64t_1)
-EXPORT_SYMBOL(__get_user_64t_1)
 
 ENTRY(__get_user_64t_2)
 	check_uaccess r0, 2, r1, r2, __get_user_bad8
@@ -121,7 +114,6 @@
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_64t_2)
-EXPORT_SYMBOL(__get_user_64t_2)
 
 ENTRY(__get_user_64t_4)
 	check_uaccess r0, 4, r1, r2, __get_user_bad8
@@ -129,7 +121,6 @@
 	mov	r0, #0
 	ret	lr
 ENDPROC(__get_user_64t_4)
-EXPORT_SYMBOL(__get_user_64t_4)
 #endif
 
 __get_user_bad8:
diff --git a/arch/arm/lib/io-readsb.S b/arch/arm/lib/io-readsb.S
index 3dff7a3..c31b2f3 100644
--- a/arch/arm/lib/io-readsb.S
+++ b/arch/arm/lib/io-readsb.S
@@ -9,7 +9,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 .Linsb_align:	rsb	ip, ip, #4
 		cmp	ip, r2
@@ -122,4 +121,3 @@
 
 		ldmfd	sp!, {r4 - r6, pc}
 ENDPROC(__raw_readsb)
-EXPORT_SYMBOL(__raw_readsb)
diff --git a/arch/arm/lib/io-readsl.S b/arch/arm/lib/io-readsl.S
index bfd3968..2ed86fa 100644
--- a/arch/arm/lib/io-readsl.S
+++ b/arch/arm/lib/io-readsl.S
@@ -9,7 +9,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 ENTRY(__raw_readsl)
 		teq	r2, #0		@ do we have to check for the zero len?
@@ -78,4 +77,3 @@
 		strb	r3, [r1, #0]
 		ret	lr
 ENDPROC(__raw_readsl)
-EXPORT_SYMBOL(__raw_readsl)
diff --git a/arch/arm/lib/io-readsw-armv3.S b/arch/arm/lib/io-readsw-armv3.S
index b3af3db..413da99 100644
--- a/arch/arm/lib/io-readsw-armv3.S
+++ b/arch/arm/lib/io-readsw-armv3.S
@@ -9,7 +9,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 .Linsw_bad_alignment:
 		adr	r0, .Linsw_bad_align_msg
@@ -104,4 +103,4 @@
 
 		ldmfd	sp!, {r4, r5, r6, pc}
 
-EXPORT_SYMBOL(__raw_readsw)
+
diff --git a/arch/arm/lib/io-readsw-armv4.S b/arch/arm/lib/io-readsw-armv4.S
index 3c7a7a40..d9a45e9 100644
--- a/arch/arm/lib/io-readsw-armv4.S
+++ b/arch/arm/lib/io-readsw-armv4.S
@@ -9,7 +9,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 		.macro	pack, rd, hw1, hw2
 #ifndef __ARMEB__
@@ -130,4 +129,3 @@
 		strneb	ip, [r1]
 		ldmfd	sp!, {r4, pc}
 ENDPROC(__raw_readsw)
-EXPORT_SYMBOL(__raw_readsw)
diff --git a/arch/arm/lib/io-writesb.S b/arch/arm/lib/io-writesb.S
index fa36335..a46bbc9 100644
--- a/arch/arm/lib/io-writesb.S
+++ b/arch/arm/lib/io-writesb.S
@@ -9,7 +9,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 		.macro	outword, rd
 #ifndef __ARMEB__
@@ -93,4 +92,3 @@
 
 		ldmfd	sp!, {r4, r5, pc}
 ENDPROC(__raw_writesb)
-EXPORT_SYMBOL(__raw_writesb)
diff --git a/arch/arm/lib/io-writesl.S b/arch/arm/lib/io-writesl.S
index 98ed6ae..4ea2435 100644
--- a/arch/arm/lib/io-writesl.S
+++ b/arch/arm/lib/io-writesl.S
@@ -9,7 +9,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 ENTRY(__raw_writesl)
 		teq	r2, #0		@ do we have to check for the zero len?
@@ -66,4 +65,3 @@
 		bne	6b
 		ret	lr
 ENDPROC(__raw_writesl)
-EXPORT_SYMBOL(__raw_writesl)
diff --git a/arch/arm/lib/io-writesw-armv3.S b/arch/arm/lib/io-writesw-armv3.S
index 577184c..121789e 100644
--- a/arch/arm/lib/io-writesw-armv3.S
+++ b/arch/arm/lib/io-writesw-armv3.S
@@ -9,7 +9,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 .Loutsw_bad_alignment:
 		adr	r0, .Loutsw_bad_align_msg
@@ -125,4 +124,3 @@
 		strne	ip, [r0]
 
 		ldmfd	sp!, {r4, r5, r6, pc}
-EXPORT_SYMBOL(__raw_writesw)
diff --git a/arch/arm/lib/io-writesw-armv4.S b/arch/arm/lib/io-writesw-armv4.S
index e335f48..269f90c 100644
--- a/arch/arm/lib/io-writesw-armv4.S
+++ b/arch/arm/lib/io-writesw-armv4.S
@@ -9,7 +9,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 		.macro	outword, rd
 #ifndef __ARMEB__
@@ -99,4 +98,3 @@
 		strneh	ip, [r0]
 		ret	lr
 ENDPROC(__raw_writesw)
-EXPORT_SYMBOL(__raw_writesw)
diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
index f541bc0..9397b2e 100644
--- a/arch/arm/lib/lib1funcs.S
+++ b/arch/arm/lib/lib1funcs.S
@@ -36,7 +36,6 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/unwind.h>
-#include <asm/export.h>
 
 .macro ARM_DIV_BODY dividend, divisor, result, curbit
 
@@ -239,8 +238,6 @@
 UNWIND(.fnend)
 ENDPROC(__udivsi3)
 ENDPROC(__aeabi_uidiv)
-EXPORT_SYMBOL(__udivsi3)
-EXPORT_SYMBOL(__aeabi_uidiv)
 
 ENTRY(__umodsi3)
 UNWIND(.fnstart)
@@ -259,7 +256,6 @@
 
 UNWIND(.fnend)
 ENDPROC(__umodsi3)
-EXPORT_SYMBOL(__umodsi3)
 
 #ifdef CONFIG_ARM_PATCH_IDIV
 	.align 3
@@ -307,8 +303,6 @@
 UNWIND(.fnend)
 ENDPROC(__divsi3)
 ENDPROC(__aeabi_idiv)
-EXPORT_SYMBOL(__divsi3)
-EXPORT_SYMBOL(__aeabi_idiv)
 
 ENTRY(__modsi3)
 UNWIND(.fnstart)
@@ -333,7 +327,6 @@
 
 UNWIND(.fnend)
 ENDPROC(__modsi3)
-EXPORT_SYMBOL(__modsi3)
 
 #ifdef CONFIG_AEABI
 
@@ -350,7 +343,6 @@
 
 UNWIND(.fnend)
 ENDPROC(__aeabi_uidivmod)
-EXPORT_SYMBOL(__aeabi_uidivmod)
 
 ENTRY(__aeabi_idivmod)
 UNWIND(.fnstart)
@@ -364,7 +356,6 @@
 
 UNWIND(.fnend)
 ENDPROC(__aeabi_idivmod)
-EXPORT_SYMBOL(__aeabi_idivmod)
 
 #endif
 
diff --git a/arch/arm/lib/lshrdi3.S b/arch/arm/lib/lshrdi3.S
index e408339..922dcd8 100644
--- a/arch/arm/lib/lshrdi3.S
+++ b/arch/arm/lib/lshrdi3.S
@@ -28,7 +28,6 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 #ifdef __ARMEB__
 #define al r1
@@ -53,5 +52,3 @@
 
 ENDPROC(__lshrdi3)
 ENDPROC(__aeabi_llsr)
-EXPORT_SYMBOL(__lshrdi3)
-EXPORT_SYMBOL(__aeabi_llsr)
diff --git a/arch/arm/lib/memchr.S b/arch/arm/lib/memchr.S
index 44182bf..74a5bed 100644
--- a/arch/arm/lib/memchr.S
+++ b/arch/arm/lib/memchr.S
@@ -11,7 +11,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 	.text
 	.align	5
@@ -25,4 +24,3 @@
 2:	movne	r0, #0
 	ret	lr
 ENDPROC(memchr)
-EXPORT_SYMBOL(memchr)
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
index 1be5b6d..64111bd 100644
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -13,7 +13,6 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/unwind.h>
-#include <asm/export.h>
 
 #define LDR1W_SHIFT	0
 #define STR1W_SHIFT	0
@@ -69,5 +68,3 @@
 
 ENDPROC(memcpy)
 ENDPROC(mmiocpy)
-EXPORT_SYMBOL(memcpy)
-EXPORT_SYMBOL(mmiocpy)
diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S
index 71dcc54..69a9d47 100644
--- a/arch/arm/lib/memmove.S
+++ b/arch/arm/lib/memmove.S
@@ -13,7 +13,6 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/unwind.h>
-#include <asm/export.h>
 
 		.text
 
@@ -226,4 +225,3 @@
 18:		backward_copy_shift	push=24	pull=8
 
 ENDPROC(memmove)
-EXPORT_SYMBOL(memmove)
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
index 7b72044..3c65e3b 100644
--- a/arch/arm/lib/memset.S
+++ b/arch/arm/lib/memset.S
@@ -12,7 +12,6 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/unwind.h>
-#include <asm/export.h>
 
 	.text
 	.align	5
@@ -136,5 +135,3 @@
 UNWIND( .fnend   )
 ENDPROC(memset)
 ENDPROC(mmioset)
-EXPORT_SYMBOL(memset)
-EXPORT_SYMBOL(mmioset)
diff --git a/arch/arm/lib/memzero.S b/arch/arm/lib/memzero.S
index 6dec26e..0eded95 100644
--- a/arch/arm/lib/memzero.S
+++ b/arch/arm/lib/memzero.S
@@ -10,7 +10,6 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/unwind.h>
-#include <asm/export.h>
 
 	.text
 	.align	5
@@ -136,4 +135,3 @@
 	ret	lr			@ 1
 UNWIND(	.fnend				)
 ENDPROC(__memzero)
-EXPORT_SYMBOL(__memzero)
diff --git a/arch/arm/lib/muldi3.S b/arch/arm/lib/muldi3.S
index b8f1238..2043059 100644
--- a/arch/arm/lib/muldi3.S
+++ b/arch/arm/lib/muldi3.S
@@ -12,7 +12,6 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 #ifdef __ARMEB__
 #define xh r0
@@ -47,5 +46,3 @@
 
 ENDPROC(__muldi3)
 ENDPROC(__aeabi_lmul)
-EXPORT_SYMBOL(__muldi3)
-EXPORT_SYMBOL(__aeabi_lmul)
diff --git a/arch/arm/lib/putuser.S b/arch/arm/lib/putuser.S
index 11de126..38d660d 100644
--- a/arch/arm/lib/putuser.S
+++ b/arch/arm/lib/putuser.S
@@ -31,7 +31,6 @@
 #include <asm/assembler.h>
 #include <asm/errno.h>
 #include <asm/domain.h>
-#include <asm/export.h>
 
 ENTRY(__put_user_1)
 	check_uaccess r0, 1, r1, ip, __put_user_bad
@@ -39,7 +38,6 @@
 	mov	r0, #0
 	ret	lr
 ENDPROC(__put_user_1)
-EXPORT_SYMBOL(__put_user_1)
 
 ENTRY(__put_user_2)
 	check_uaccess r0, 2, r1, ip, __put_user_bad
@@ -64,7 +62,6 @@
 	mov	r0, #0
 	ret	lr
 ENDPROC(__put_user_2)
-EXPORT_SYMBOL(__put_user_2)
 
 ENTRY(__put_user_4)
 	check_uaccess r0, 4, r1, ip, __put_user_bad
@@ -72,7 +69,6 @@
 	mov	r0, #0
 	ret	lr
 ENDPROC(__put_user_4)
-EXPORT_SYMBOL(__put_user_4)
 
 ENTRY(__put_user_8)
 	check_uaccess r0, 8, r1, ip, __put_user_bad
@@ -86,7 +82,6 @@
 	mov	r0, #0
 	ret	lr
 ENDPROC(__put_user_8)
-EXPORT_SYMBOL(__put_user_8)
 
 __put_user_bad:
 	mov	r0, #-EFAULT
diff --git a/arch/arm/lib/strchr.S b/arch/arm/lib/strchr.S
index 7301f6e6..013d64c 100644
--- a/arch/arm/lib/strchr.S
+++ b/arch/arm/lib/strchr.S
@@ -11,7 +11,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 		.text
 		.align	5
@@ -26,4 +25,3 @@
 		subeq	r0, r0, #1
 		ret	lr
 ENDPROC(strchr)
-EXPORT_SYMBOL(strchr)
diff --git a/arch/arm/lib/strrchr.S b/arch/arm/lib/strrchr.S
index aaf9fd9..3cec1c7 100644
--- a/arch/arm/lib/strrchr.S
+++ b/arch/arm/lib/strrchr.S
@@ -11,7 +11,6 @@
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 		.text
 		.align	5
@@ -25,4 +24,3 @@
 		mov	r0, r3
 		ret	lr
 ENDPROC(strrchr)
-EXPORT_SYMBOL(strrchr)
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 1626e3a..6bd1089 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -19,7 +19,6 @@
 #include <linux/gfp.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
-#include <linux/export.h>
 #include <asm/current.h>
 #include <asm/page.h>
 
@@ -157,7 +156,6 @@
 	}
 	return n;
 }
-EXPORT_SYMBOL(arm_copy_to_user);
 	
 static unsigned long noinline
 __clear_user_memset(void __user *addr, unsigned long n)
@@ -215,7 +213,6 @@
 	}
 	return n;
 }
-EXPORT_SYMBOL(arm_clear_user);
 
 #if 0
 
diff --git a/arch/arm/lib/ucmpdi2.S b/arch/arm/lib/ucmpdi2.S
index 127a91a..ad4a630 100644
--- a/arch/arm/lib/ucmpdi2.S
+++ b/arch/arm/lib/ucmpdi2.S
@@ -12,7 +12,6 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 #ifdef __ARMEB__
 #define xh r0
@@ -36,7 +35,6 @@
 	ret	lr
 
 ENDPROC(__ucmpdi2)
-EXPORT_SYMBOL(__ucmpdi2)
 
 #ifdef CONFIG_AEABI
 
@@ -50,7 +48,6 @@
 	ret	lr
 
 ENDPROC(__aeabi_ulcmp)
-EXPORT_SYMBOL(__aeabi_ulcmp)
 
 #endif
 
diff --git a/arch/arm/mach-imx/Makefile b/arch/arm/mach-imx/Makefile
index 737450f..cab1289 100644
--- a/arch/arm/mach-imx/Makefile
+++ b/arch/arm/mach-imx/Makefile
@@ -32,6 +32,7 @@
 
 ifdef CONFIG_SND_IMX_SOC
 obj-y += ssi-fiq.o
+obj-y += ssi-fiq-ksym.o
 endif
 
 # i.MX21 based machines
diff --git a/arch/arm/mach-imx/gpc.c b/arch/arm/mach-imx/gpc.c
index 0df062d..b54db47 100644
--- a/arch/arm/mach-imx/gpc.c
+++ b/arch/arm/mach-imx/gpc.c
@@ -408,7 +408,7 @@
 static int imx_gpc_genpd_init(struct device *dev, struct regulator *pu_reg)
 {
 	struct clk *clk;
-	int i;
+	int i, ret;
 
 	imx6q_pu_domain.reg = pu_reg;
 
@@ -430,13 +430,22 @@
 	if (!IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS))
 		return 0;
 
-	pm_genpd_init(&imx6q_pu_domain.base, NULL, false);
-	return of_genpd_add_provider_onecell(dev->of_node,
-					     &imx_gpc_onecell_data);
+	for (i = 0; i < ARRAY_SIZE(imx_gpc_domains); i++)
+		pm_genpd_init(imx_gpc_domains[i], NULL, false);
 
+	ret =  of_genpd_add_provider_onecell(dev->of_node,
+					     &imx_gpc_onecell_data);
+	if (ret)
+		goto power_off;
+
+	return 0;
+
+power_off:
+	imx6q_pm_pu_power_off(&imx6q_pu_domain.base);
 clk_err:
 	while (i--)
 		clk_put(imx6q_pu_domain.clk[i]);
+	imx6q_pu_domain.reg = NULL;
 	return -EINVAL;
 }
 
diff --git a/arch/arm/mach-imx/mach-imx6q.c b/arch/arm/mach-imx/mach-imx6q.c
index 97fd251..45801b2 100644
--- a/arch/arm/mach-imx/mach-imx6q.c
+++ b/arch/arm/mach-imx/mach-imx6q.c
@@ -173,7 +173,7 @@
 				ksz9021rn_phy_fixup);
 		phy_register_fixup_for_uid(PHY_ID_KSZ9031, MICREL_PHY_ID_MASK,
 				ksz9031rn_phy_fixup);
-		phy_register_fixup_for_uid(PHY_ID_AR8031, 0xffffffff,
+		phy_register_fixup_for_uid(PHY_ID_AR8031, 0xffffffef,
 				ar8031_phy_fixup);
 		phy_register_fixup_for_uid(PHY_ID_AR8035, 0xffffffef,
 				ar8035_phy_fixup);
diff --git a/arch/arm/mach-imx/ssi-fiq-ksym.c b/arch/arm/mach-imx/ssi-fiq-ksym.c
new file mode 100644
index 0000000..792090f
--- /dev/null
+++ b/arch/arm/mach-imx/ssi-fiq-ksym.c
@@ -0,0 +1,20 @@
+/*
+ * Exported ksyms for the SSI FIQ handler
+ *
+ * Copyright (C) 2009, Sascha Hauer <s.hauer@pengutronix.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+
+#include <linux/platform_data/asoc-imx-ssi.h>
+
+EXPORT_SYMBOL(imx_ssi_fiq_tx_buffer);
+EXPORT_SYMBOL(imx_ssi_fiq_rx_buffer);
+EXPORT_SYMBOL(imx_ssi_fiq_start);
+EXPORT_SYMBOL(imx_ssi_fiq_end);
+EXPORT_SYMBOL(imx_ssi_fiq_base);
+
diff --git a/arch/arm/mach-imx/ssi-fiq.S b/arch/arm/mach-imx/ssi-fiq.S
index fd7917f..a8b93c5 100644
--- a/arch/arm/mach-imx/ssi-fiq.S
+++ b/arch/arm/mach-imx/ssi-fiq.S
@@ -8,7 +8,6 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/export.h>
 
 /*
  * r8  = bit 0-15: tx offset, bit 16-31: tx buffer size
@@ -145,8 +144,4 @@
 		.word 0x0
 .L_imx_ssi_fiq_end:
 imx_ssi_fiq_end:
-EXPORT_SYMBOL(imx_ssi_fiq_tx_buffer)
-EXPORT_SYMBOL(imx_ssi_fiq_rx_buffer)
-EXPORT_SYMBOL(imx_ssi_fiq_start)
-EXPORT_SYMBOL(imx_ssi_fiq_end)
-EXPORT_SYMBOL(imx_ssi_fiq_base)
+
diff --git a/arch/arm/mach-mvebu/Kconfig b/arch/arm/mach-mvebu/Kconfig
index f9b6bd3..541647f 100644
--- a/arch/arm/mach-mvebu/Kconfig
+++ b/arch/arm/mach-mvebu/Kconfig
@@ -23,6 +23,7 @@
 	select CACHE_L2X0
 	select ARM_CPU_SUSPEND
 	select MACH_MVEBU_ANY
+	select MVEBU_CLK_COREDIV
 
 config MACH_ARMADA_370
 	bool "Marvell Armada 370 boards"
@@ -32,7 +33,6 @@
 	select CPU_PJ4B
 	select MACH_MVEBU_V7
 	select PINCTRL_ARMADA_370
-	select MVEBU_CLK_COREDIV
 	help
 	  Say 'Y' here if you want your kernel to support boards based
 	  on the Marvell Armada 370 SoC with device tree.
@@ -50,7 +50,6 @@
 	select HAVE_SMP
 	select MACH_MVEBU_V7
 	select PINCTRL_ARMADA_375
-	select MVEBU_CLK_COREDIV
 	help
 	  Say 'Y' here if you want your kernel to support boards based
 	  on the Marvell Armada 375 SoC with device tree.
@@ -68,7 +67,6 @@
 	select HAVE_SMP
 	select MACH_MVEBU_V7
 	select PINCTRL_ARMADA_38X
-	select MVEBU_CLK_COREDIV
 	help
 	  Say 'Y' here if you want your kernel to support boards based
 	  on the Marvell Armada 380/385 SoC with device tree.
diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig
index a9afeeb..0465338 100644
--- a/arch/arm/mach-omap2/Kconfig
+++ b/arch/arm/mach-omap2/Kconfig
@@ -71,6 +71,7 @@
 	select HAVE_ARM_TWD
 	select ARM_ERRATA_754322
 	select ARM_ERRATA_775420
+	select OMAP_INTERCONNECT
 
 config SOC_DRA7XX
 	bool "TI DRA7XX"
diff --git a/arch/arm/mach-omap2/id.c b/arch/arm/mach-omap2/id.c
index 2abd53a..cc6d9fa 100644
--- a/arch/arm/mach-omap2/id.c
+++ b/arch/arm/mach-omap2/id.c
@@ -205,11 +205,15 @@
 
 #define OMAP3_SHOW_FEATURE(feat)		\
 	if (omap3_has_ ##feat())		\
-		printk(#feat" ");
+		n += scnprintf(buf + n, sizeof(buf) - n, #feat " ");
 
 static void __init omap3_cpuinfo(void)
 {
 	const char *cpu_name;
+	char buf[64];
+	int n = 0;
+
+	memset(buf, 0, sizeof(buf));
 
 	/*
 	 * OMAP3430 and OMAP3530 are assumed to be same.
@@ -241,10 +245,10 @@
 		cpu_name = "OMAP3503";
 	}
 
-	sprintf(soc_name, "%s", cpu_name);
+	scnprintf(soc_name, sizeof(soc_name), "%s", cpu_name);
 
 	/* Print verbose information */
-	pr_info("%s %s (", soc_name, soc_rev);
+	n += scnprintf(buf, sizeof(buf) - n, "%s %s (", soc_name, soc_rev);
 
 	OMAP3_SHOW_FEATURE(l2cache);
 	OMAP3_SHOW_FEATURE(iva);
@@ -252,8 +256,10 @@
 	OMAP3_SHOW_FEATURE(neon);
 	OMAP3_SHOW_FEATURE(isp);
 	OMAP3_SHOW_FEATURE(192mhz_clk);
-
-	printk(")\n");
+	if (*(buf + n - 1) == ' ')
+		n--;
+	n += scnprintf(buf + n, sizeof(buf) - n, ")\n");
+	pr_info("%s", buf);
 }
 
 #define OMAP3_CHECK_FEATURE(status,feat)				\
diff --git a/arch/arm/mach-omap2/prm3xxx.c b/arch/arm/mach-omap2/prm3xxx.c
index 62680aa..718981b 100644
--- a/arch/arm/mach-omap2/prm3xxx.c
+++ b/arch/arm/mach-omap2/prm3xxx.c
@@ -319,6 +319,9 @@
 	if (has_uart4) {
 		en_uart4_mask = OMAP3630_EN_UART4_MASK;
 		grpsel_uart4_mask = OMAP3630_GRPSEL_UART4_MASK;
+	} else {
+		en_uart4_mask = 0;
+		grpsel_uart4_mask = 0;
 	}
 
 	/* Enable wakeups in PER */
diff --git a/arch/arm/mach-omap2/voltage.c b/arch/arm/mach-omap2/voltage.c
index cba8cad..cd15dbd 100644
--- a/arch/arm/mach-omap2/voltage.c
+++ b/arch/arm/mach-omap2/voltage.c
@@ -87,6 +87,12 @@
 		return -ENODATA;
 	}
 
+	if (!voltdm->volt_data) {
+		pr_err("%s: No voltage data defined for vdd_%s\n",
+			__func__, voltdm->name);
+		return -ENODATA;
+	}
+
 	/* Adjust voltage to the exact voltage from the OPP table */
 	for (i = 0; voltdm->volt_data[i].volt_nominal != 0; i++) {
 		if (voltdm->volt_data[i].volt_nominal >= target_volt) {
diff --git a/arch/arm/mach-uniphier/Kconfig b/arch/arm/mach-uniphier/Kconfig
index 82dddee..3930fbb 100644
--- a/arch/arm/mach-uniphier/Kconfig
+++ b/arch/arm/mach-uniphier/Kconfig
@@ -1,6 +1,7 @@
 config ARCH_UNIPHIER
 	bool "Socionext UniPhier SoCs"
 	depends on ARCH_MULTI_V7
+	select ARCH_HAS_RESET_CONTROLLER
 	select ARM_AMBA
 	select ARM_GLOBAL_TIMER
 	select ARM_GIC
diff --git a/arch/arm/mm/abort-lv4t.S b/arch/arm/mm/abort-lv4t.S
index 6d8e8e3..4cdfab3 100644
--- a/arch/arm/mm/abort-lv4t.S
+++ b/arch/arm/mm/abort-lv4t.S
@@ -7,7 +7,7 @@
  *	   : r4 = aborted context pc
  *	   : r5 = aborted context psr
  *
- * Returns : r4-r5, r10-r11, r13 preserved
+ * Returns : r4-r5, r9-r11, r13 preserved
  *
  * Purpose : obtain information about current aborted instruction.
  * Note: we read user space.  This means we might cause a data
@@ -48,7 +48,10 @@
 /* c */	b	do_DataAbort			@ ldc	rd, [rn], #m	@ Same as ldr	rd, [rn], #m
 /* d */	b	do_DataAbort			@ ldc	rd, [rn, #m]
 /* e */	b	.data_unknown
-/* f */
+/* f */	b	.data_unknown
+
+.data_unknown_r9:
+	ldr	r9, [sp], #4
 .data_unknown:	@ Part of jumptable
 	mov	r0, r4
 	mov	r1, r8
@@ -57,6 +60,7 @@
 .data_arm_ldmstm:
 	tst	r8, #1 << 21			@ check writeback bit
 	beq	do_DataAbort			@ no writeback -> no fixup
+	str	r9, [sp, #-4]!
 	mov	r7, #0x11
 	orr	r7, r7, #0x1100
 	and	r6, r8, r7
@@ -75,12 +79,14 @@
 	subne	r7, r7, r6, lsl #2		@ Undo increment
 	addeq	r7, r7, r6, lsl #2		@ Undo decrement
 	str	r7, [r2, r9, lsr #14]		@ Put register 'Rn'
+	ldr	r9, [sp], #4
 	b	do_DataAbort
 
 .data_arm_lateldrhpre:
 	tst	r8, #1 << 21			@ Check writeback bit
 	beq	do_DataAbort			@ No writeback -> no fixup
 .data_arm_lateldrhpost:
+	str	r9, [sp, #-4]!
 	and	r9, r8, #0x00f			@ get Rm / low nibble of immediate value
 	tst	r8, #1 << 22			@ if (immediate offset)
 	andne	r6, r8, #0xf00			@ { immediate high nibble
@@ -93,6 +99,7 @@
 	subne	r7, r7, r6			@ Undo incrmenet
 	addeq	r7, r7, r6			@ Undo decrement
 	str	r7, [r2, r9, lsr #14]		@ Put register 'Rn'
+	ldr	r9, [sp], #4
 	b	do_DataAbort
 
 .data_arm_lateldrpreconst:
@@ -101,12 +108,14 @@
 .data_arm_lateldrpostconst:
 	movs	r6, r8, lsl #20			@ Get offset
 	beq	do_DataAbort			@ zero -> no fixup
+	str	r9, [sp, #-4]!
 	and	r9, r8, #15 << 16		@ Extract 'n' from instruction
 	ldr	r7, [r2, r9, lsr #14]		@ Get register 'Rn'
 	tst	r8, #1 << 23			@ Check U bit
 	subne	r7, r7, r6, lsr #20		@ Undo increment
 	addeq	r7, r7, r6, lsr #20		@ Undo decrement
 	str	r7, [r2, r9, lsr #14]		@ Put register 'Rn'
+	ldr	r9, [sp], #4
 	b	do_DataAbort
 
 .data_arm_lateldrprereg:
@@ -115,6 +124,7 @@
 .data_arm_lateldrpostreg:
 	and	r7, r8, #15			@ Extract 'm' from instruction
 	ldr	r6, [r2, r7, lsl #2]		@ Get register 'Rm'
+	str	r9, [sp, #-4]!
 	mov	r9, r8, lsr #7			@ get shift count
 	ands	r9, r9, #31
 	and	r7, r8, #0x70			@ get shift type
@@ -126,33 +136,33 @@
 	b	.data_arm_apply_r6_and_rn
 	b	.data_arm_apply_r6_and_rn	@ 1: LSL #0
 	nop
-	b	.data_unknown			@ 2: MUL?
+	b	.data_unknown_r9		@ 2: MUL?
 	nop
-	b	.data_unknown			@ 3: MUL?
+	b	.data_unknown_r9		@ 3: MUL?
 	nop
 	mov	r6, r6, lsr r9			@ 4: LSR #!0
 	b	.data_arm_apply_r6_and_rn
 	mov	r6, r6, lsr #32			@ 5: LSR #32
 	b	.data_arm_apply_r6_and_rn
-	b	.data_unknown			@ 6: MUL?
+	b	.data_unknown_r9		@ 6: MUL?
 	nop
-	b	.data_unknown			@ 7: MUL?
+	b	.data_unknown_r9		@ 7: MUL?
 	nop
 	mov	r6, r6, asr r9			@ 8: ASR #!0
 	b	.data_arm_apply_r6_and_rn
 	mov	r6, r6, asr #32			@ 9: ASR #32
 	b	.data_arm_apply_r6_and_rn
-	b	.data_unknown			@ A: MUL?
+	b	.data_unknown_r9		@ A: MUL?
 	nop
-	b	.data_unknown			@ B: MUL?
+	b	.data_unknown_r9		@ B: MUL?
 	nop
 	mov	r6, r6, ror r9			@ C: ROR #!0
 	b	.data_arm_apply_r6_and_rn
 	mov	r6, r6, rrx			@ D: RRX
 	b	.data_arm_apply_r6_and_rn
-	b	.data_unknown			@ E: MUL?
+	b	.data_unknown_r9		@ E: MUL?
 	nop
-	b	.data_unknown			@ F: MUL?
+	b	.data_unknown_r9		@ F: MUL?
 
 .data_thumb_abort:
 	ldrh	r8, [r4]			@ read instruction
@@ -190,6 +200,7 @@
 .data_thumb_pushpop:
 	tst	r8, #1 << 10
 	beq	.data_unknown
+	str	r9, [sp, #-4]!
 	and	r6, r8, #0x55			@ hweight8(r8) + R bit
 	and	r9, r8, #0xaa
 	add	r6, r6, r9, lsr #1
@@ -204,9 +215,11 @@
 	addeq	r7, r7, r6, lsl #2		@ increment SP if PUSH
 	subne	r7, r7, r6, lsl #2		@ decrement SP if POP
 	str	r7, [r2, #13 << 2]
+	ldr	r9, [sp], #4
 	b	do_DataAbort
 
 .data_thumb_ldmstm:
+	str	r9, [sp, #-4]!
 	and	r6, r8, #0x55			@ hweight8(r8)
 	and	r9, r8, #0xaa
 	add	r6, r6, r9, lsr #1
@@ -219,4 +232,5 @@
 	and	r6, r6, #15			@ number of regs to transfer
 	sub	r7, r7, r6, lsl #2		@ always decrement
 	str	r7, [r2, r9, lsr #6]
+	ldr	r9, [sp], #4
 	b	do_DataAbort
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index ab4f745..ab77100 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1167,7 +1167,7 @@
 	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
 	return 0;
 }
-fs_initcall(dma_debug_do_init);
+core_initcall(dma_debug_do_init);
 
 #ifdef CONFIG_ARM_DMA_USE_IOMMU
 
diff --git a/arch/arm/mm/proc-v7m.S b/arch/arm/mm/proc-v7m.S
index f6d333f..8dea616 100644
--- a/arch/arm/mm/proc-v7m.S
+++ b/arch/arm/mm/proc-v7m.S
@@ -96,7 +96,7 @@
 	ret	lr
 ENDPROC(cpu_cm7_proc_fin)
 
-	.section ".text.init", #alloc, #execinstr
+	.section ".init.text", #alloc, #execinstr
 
 __v7m_cm7_setup:
 	mov	r8, #(V7M_SCB_CCR_DC | V7M_SCB_CCR_IC| V7M_SCB_CCR_BP)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 30398db..969ef88 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -915,7 +915,7 @@
 
 config RANDOMIZE_MODULE_REGION_FULL
 	bool "Randomize the module region independently from the core kernel"
-	depends on RANDOMIZE_BASE
+	depends on RANDOMIZE_BASE && !DYNAMIC_FTRACE
 	default y
 	help
 	  Randomizes the location of the module region without considering the
diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index cfbdf02..101794f 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -190,6 +190,7 @@
 
 config ARCH_UNIPHIER
 	bool "Socionext UniPhier SoC Family"
+	select ARCH_HAS_RESET_CONTROLLER
 	select PINCTRL
 	help
 	  This enables support for Socionext UniPhier SoC family.
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index ab51aed..3635b86 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -15,7 +15,7 @@
 GZFLAGS		:=-9
 
 ifneq ($(CONFIG_RELOCATABLE),)
-LDFLAGS_vmlinux		+= -pie -Bsymbolic
+LDFLAGS_vmlinux		+= -pie -shared -Bsymbolic
 endif
 
 ifeq ($(CONFIG_ARM64_ERRATUM_843419),y)
diff --git a/arch/arm64/boot/dts/arm/juno-base.dtsi b/arch/arm64/boot/dts/arm/juno-base.dtsi
index 334271a..7d3a2ac 100644
--- a/arch/arm64/boot/dts/arm/juno-base.dtsi
+++ b/arch/arm64/boot/dts/arm/juno-base.dtsi
@@ -393,7 +393,7 @@
 		#address-cells = <3>;
 		#size-cells = <2>;
 		dma-coherent;
-		ranges = <0x01000000 0x00 0x5f800000 0x00 0x5f800000 0x0 0x00800000>,
+		ranges = <0x01000000 0x00 0x00000000 0x00 0x5f800000 0x0 0x00800000>,
 			 <0x02000000 0x00 0x50000000 0x00 0x50000000 0x0 0x08000000>,
 			 <0x42000000 0x40 0x00000000 0x40 0x00000000 0x1 0x00000000>;
 		#interrupt-cells = <1>;
diff --git a/arch/arm64/boot/dts/arm/juno-r1.dts b/arch/arm64/boot/dts/arm/juno-r1.dts
index 123a58b..f0b857d 100644
--- a/arch/arm64/boot/dts/arm/juno-r1.dts
+++ b/arch/arm64/boot/dts/arm/juno-r1.dts
@@ -76,7 +76,7 @@
 				compatible = "arm,idle-state";
 				arm,psci-suspend-param = <0x1010000>;
 				local-timer-stop;
-				entry-latency-us = <300>;
+				entry-latency-us = <400>;
 				exit-latency-us = <1200>;
 				min-residency-us = <2500>;
 			};
diff --git a/arch/arm64/boot/dts/arm/juno-r2.dts b/arch/arm64/boot/dts/arm/juno-r2.dts
index 007be82..26aaa6a 100644
--- a/arch/arm64/boot/dts/arm/juno-r2.dts
+++ b/arch/arm64/boot/dts/arm/juno-r2.dts
@@ -76,7 +76,7 @@
 				compatible = "arm,idle-state";
 				arm,psci-suspend-param = <0x1010000>;
 				local-timer-stop;
-				entry-latency-us = <300>;
+				entry-latency-us = <400>;
 				exit-latency-us = <1200>;
 				min-residency-us = <2500>;
 			};
diff --git a/arch/arm64/boot/dts/arm/juno.dts b/arch/arm64/boot/dts/arm/juno.dts
index a7270ef..6e154d9 100644
--- a/arch/arm64/boot/dts/arm/juno.dts
+++ b/arch/arm64/boot/dts/arm/juno.dts
@@ -76,7 +76,7 @@
 				compatible = "arm,idle-state";
 				arm,psci-suspend-param = <0x1010000>;
 				local-timer-stop;
-				entry-latency-us = <300>;
+				entry-latency-us = <400>;
 				exit-latency-us = <1200>;
 				min-residency-us = <2500>;
 			};
diff --git a/arch/arm64/boot/dts/broadcom/ns2-svk.dts b/arch/arm64/boot/dts/broadcom/ns2-svk.dts
index 2d7872a..b09f3bc 100644
--- a/arch/arm64/boot/dts/broadcom/ns2-svk.dts
+++ b/arch/arm64/boot/dts/broadcom/ns2-svk.dts
@@ -164,6 +164,8 @@
 		nand-ecc-mode = "hw";
 		nand-ecc-strength = <8>;
 		nand-ecc-step-size = <512>;
+		nand-bus-width = <16>;
+		brcm,nand-oob-sector-size = <16>;
 		#address-cells = <1>;
 		#size-cells = <1>;
 	};
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
index 220ac70..97d331e 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
@@ -123,6 +123,7 @@
 			     <1 14 0xf08>, /* Physical Non-Secure PPI */
 			     <1 11 0xf08>, /* Virtual PPI */
 			     <1 10 0xf08>; /* Hypervisor PPI */
+		fsl,erratum-a008585;
 	};
 
 	pmu {
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi
index 337da90..7f0dc13 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi
@@ -195,6 +195,7 @@
 			     <1 14 4>, /* Physical Non-Secure PPI, active-low */
 			     <1 11 4>, /* Virtual PPI, active-low */
 			     <1 10 4>; /* Hypervisor PPI, active-low */
+		fsl,erratum-a008585;
 	};
 
 	pmu {
diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
index c476253..e9bd587 100644
--- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
@@ -105,7 +105,7 @@
 				status = "disabled";
 			};
 
-			nb_perih_clk: nb-periph-clk@13000{
+			nb_periph_clk: nb-periph-clk@13000 {
 				compatible = "marvell,armada-3700-periph-clock-nb";
 				reg = <0x13000 0x100>;
 				clocks = <&tbg 0>, <&tbg 1>, <&tbg 2>,
@@ -113,7 +113,7 @@
 				#clock-cells = <1>;
 			};
 
-			sb_perih_clk: sb-periph-clk@18000{
+			sb_periph_clk: sb-periph-clk@18000 {
 				compatible = "marvell,armada-3700-periph-clock-sb";
 				reg = <0x18000 0x100>;
 				clocks = <&tbg 0>, <&tbg 1>, <&tbg 2>,
diff --git a/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi
index e5e3ed6..602e2c2 100644
--- a/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-cp110-master.dtsi
@@ -131,7 +131,7 @@
 				#address-cells = <0x1>;
 				#size-cells = <0x0>;
 				cell-index = <1>;
-				clocks = <&cpm_syscon0 0 3>;
+				clocks = <&cpm_syscon0 1 21>;
 				status = "disabled";
 			};
 
diff --git a/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi b/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi
index 842fb33..6bf9e24 100644
--- a/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-cp110-slave.dtsi
@@ -130,8 +130,8 @@
 				reg = <0x700600 0x50>;
 				#address-cells = <0x1>;
 				#size-cells = <0x0>;
-				cell-index = <1>;
-				clocks = <&cps_syscon0 0 3>;
+				cell-index = <3>;
+				clocks = <&cps_syscon0 1 21>;
 				status = "disabled";
 			};
 
@@ -140,7 +140,7 @@
 				reg = <0x700680 0x50>;
 				#address-cells = <1>;
 				#size-cells = <0>;
-				cell-index = <2>;
+				cell-index = <4>;
 				clocks = <&cps_syscon0 1 21>;
 				status = "disabled";
 			};
diff --git a/arch/arm64/boot/dts/rockchip/rk3368-geekbox.dts b/arch/arm64/boot/dts/rockchip/rk3368-geekbox.dts
index 46cdddf..e5eeca2 100644
--- a/arch/arm64/boot/dts/rockchip/rk3368-geekbox.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3368-geekbox.dts
@@ -116,7 +116,6 @@
 	cap-mmc-highspeed;
 	clock-frequency = <150000000>;
 	disable-wp;
-	keep-power-in-suspend;
 	non-removable;
 	num-slots = <1>;
 	vmmc-supply = <&vcc_io>;
@@ -258,8 +257,6 @@
 			};
 
 			vcc_sd: SWITCH_REG1 {
-				regulator-always-on;
-				regulator-boot-on;
 				regulator-name = "vcc_sd";
 			};
 
diff --git a/arch/arm64/boot/dts/rockchip/rk3368-orion-r68-meta.dts b/arch/arm64/boot/dts/rockchip/rk3368-orion-r68-meta.dts
index 5797933..ea0a8ec 100644
--- a/arch/arm64/boot/dts/rockchip/rk3368-orion-r68-meta.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3368-orion-r68-meta.dts
@@ -152,8 +152,6 @@
 		gpio = <&gpio3 11 GPIO_ACTIVE_LOW>;
 		regulator-min-microvolt = <1800000>;
 		regulator-max-microvolt = <3300000>;
-		regulator-always-on;
-		regulator-boot-on;
 		vin-supply = <&vcc_io>;
 	};
 
@@ -201,7 +199,6 @@
 	bus-width = <8>;
 	cap-mmc-highspeed;
 	disable-wp;
-	keep-power-in-suspend;
 	mmc-pwrseq = <&emmc_pwrseq>;
 	mmc-hs200-1_2v;
 	mmc-hs200-1_8v;
@@ -350,7 +347,6 @@
 	clock-freq-min-max = <400000 50000000>;
 	cap-sd-highspeed;
 	card-detect-delay = <200>;
-	keep-power-in-suspend;
 	num-slots = <1>;
 	pinctrl-names = "default";
 	pinctrl-0 = <&sdmmc_clk &sdmmc_cmd &sdmmc_cd &sdmmc_bus4>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi
index b65c193..7afbfb0 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi
@@ -300,8 +300,11 @@
 		ranges = <0x83000000 0x0 0xfa000000 0x0 0xfa000000 0x0 0x600000
 			  0x81000000 0x0 0xfa600000 0x0 0xfa600000 0x0 0x100000>;
 		resets = <&cru SRST_PCIE_CORE>, <&cru SRST_PCIE_MGMT>,
-			 <&cru SRST_PCIE_MGMT_STICKY>, <&cru SRST_PCIE_PIPE>;
-		reset-names = "core", "mgmt", "mgmt-sticky", "pipe";
+			 <&cru SRST_PCIE_MGMT_STICKY>, <&cru SRST_PCIE_PIPE>,
+			 <&cru SRST_PCIE_PM>, <&cru SRST_P_PCIE>,
+			 <&cru SRST_A_PCIE>;
+		reset-names = "core", "mgmt", "mgmt-sticky", "pipe",
+			      "pm", "pclk", "aclk";
 		status = "disabled";
 
 		pcie0_intc: interrupt-controller {
diff --git a/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi b/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi
index 08fd7cf..56a1b2e 100644
--- a/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi
+++ b/arch/arm64/boot/dts/socionext/uniphier-ld20.dtsi
@@ -257,18 +257,18 @@
 			reg = <0x59801000 0x400>;
 		};
 
-		mioctrl@59810000 {
-			compatible = "socionext,uniphier-mioctrl",
+		sdctrl@59810000 {
+			compatible = "socionext,uniphier-ld20-sdctrl",
 				     "simple-mfd", "syscon";
 			reg = <0x59810000 0x800>;
 
-			mio_clk: clock {
-				compatible = "socionext,uniphier-ld20-mio-clock";
+			sd_clk: clock {
+				compatible = "socionext,uniphier-ld20-sd-clock";
 				#clock-cells = <1>;
 			};
 
-			mio_rst: reset {
-				compatible = "socionext,uniphier-ld20-mio-reset";
+			sd_rst: reset {
+				compatible = "socionext,uniphier-ld20-sd-reset";
 				#reset-cells = <1>;
 			};
 		};
diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 39feb85..6e1cb8c 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -1,7 +1,7 @@
 #ifndef __ASM_ALTERNATIVE_H
 #define __ASM_ALTERNATIVE_H
 
-#include <asm/cpufeature.h>
+#include <asm/cpucaps.h>
 #include <asm/insn.h>
 
 #ifndef __ASSEMBLY__
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
new file mode 100644
index 0000000..87b4465
--- /dev/null
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -0,0 +1,40 @@
+/*
+ * arch/arm64/include/asm/cpucaps.h
+ *
+ * Copyright (C) 2016 ARM Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __ASM_CPUCAPS_H
+#define __ASM_CPUCAPS_H
+
+#define ARM64_WORKAROUND_CLEAN_CACHE		0
+#define ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE	1
+#define ARM64_WORKAROUND_845719			2
+#define ARM64_HAS_SYSREG_GIC_CPUIF		3
+#define ARM64_HAS_PAN				4
+#define ARM64_HAS_LSE_ATOMICS			5
+#define ARM64_WORKAROUND_CAVIUM_23154		6
+#define ARM64_WORKAROUND_834220			7
+#define ARM64_HAS_NO_HW_PREFETCH		8
+#define ARM64_HAS_UAO				9
+#define ARM64_ALT_PAN_NOT_UAO			10
+#define ARM64_HAS_VIRT_HOST_EXTN		11
+#define ARM64_WORKAROUND_CAVIUM_27456		12
+#define ARM64_HAS_32BIT_EL0			13
+#define ARM64_HYP_OFFSET_LOW			14
+#define ARM64_MISMATCHED_CACHE_LINE_SIZE	15
+
+#define ARM64_NCAPS				16
+
+#endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 758d74f..0bc0b1d 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -11,6 +11,7 @@
 
 #include <linux/jump_label.h>
 
+#include <asm/cpucaps.h>
 #include <asm/hwcap.h>
 #include <asm/sysreg.h>
 
@@ -24,25 +25,6 @@
 #define MAX_CPU_FEATURES	(8 * sizeof(elf_hwcap))
 #define cpu_feature(x)		ilog2(HWCAP_ ## x)
 
-#define ARM64_WORKAROUND_CLEAN_CACHE		0
-#define ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE	1
-#define ARM64_WORKAROUND_845719			2
-#define ARM64_HAS_SYSREG_GIC_CPUIF		3
-#define ARM64_HAS_PAN				4
-#define ARM64_HAS_LSE_ATOMICS			5
-#define ARM64_WORKAROUND_CAVIUM_23154		6
-#define ARM64_WORKAROUND_834220			7
-#define ARM64_HAS_NO_HW_PREFETCH		8
-#define ARM64_HAS_UAO				9
-#define ARM64_ALT_PAN_NOT_UAO			10
-#define ARM64_HAS_VIRT_HOST_EXTN		11
-#define ARM64_WORKAROUND_CAVIUM_27456		12
-#define ARM64_HAS_32BIT_EL0			13
-#define ARM64_HYP_OFFSET_LOW			14
-#define ARM64_MISMATCHED_CACHE_LINE_SIZE	15
-
-#define ARM64_NCAPS				16
-
 #ifndef __ASSEMBLY__
 
 #include <linux/kernel.h>
@@ -94,7 +76,7 @@
 	u16 capability;
 	int def_scope;			/* default scope */
 	bool (*matches)(const struct arm64_cpu_capabilities *caps, int scope);
-	void (*enable)(void *);		/* Called on all active CPUs */
+	int (*enable)(void *);		/* Called on all active CPUs */
 	union {
 		struct {	/* To be used for erratum handling only */
 			u32 midr_model;
diff --git a/arch/arm64/include/asm/exec.h b/arch/arm64/include/asm/exec.h
index db0563c..f7865dd 100644
--- a/arch/arm64/include/asm/exec.h
+++ b/arch/arm64/include/asm/exec.h
@@ -18,6 +18,9 @@
 #ifndef __ASM_EXEC_H
 #define __ASM_EXEC_H
 
+#include <linux/sched.h>
+
 extern unsigned long arch_align_stack(unsigned long sp);
+void uao_thread_switch(struct task_struct *next);
 
 #endif	/* __ASM_EXEC_H */
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 18f7465..ec3553eb 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -54,6 +54,7 @@
 extern void __kvm_flush_vm_context(void);
 extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
 extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
+extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu);
 
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index fd9d5fd..f5ea0ba 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -178,11 +178,6 @@
 	return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_ISV);
 }
 
-static inline bool kvm_vcpu_dabt_iswrite(const struct kvm_vcpu *vcpu)
-{
-	return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WNR);
-}
-
 static inline bool kvm_vcpu_dabt_issext(const struct kvm_vcpu *vcpu)
 {
 	return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SSE);
@@ -203,6 +198,12 @@
 	return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_S1PTW);
 }
 
+static inline bool kvm_vcpu_dabt_iswrite(const struct kvm_vcpu *vcpu)
+{
+	return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WNR) ||
+		kvm_vcpu_dabt_iss1tw(vcpu); /* AF/DBM update */
+}
+
 static inline bool kvm_vcpu_dabt_is_cm(const struct kvm_vcpu *vcpu)
 {
 	return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_CM);
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index bd94e67..e505038 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -62,6 +62,9 @@
 	/* VTTBR value associated with above pgd and vmid */
 	u64    vttbr;
 
+	/* The last vcpu id that ran on each physical CPU */
+	int __percpu *last_vcpu_ran;
+
 	/* The maximum number of vCPUs depends on the used GIC model */
 	int max_vcpus;
 
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index a79b969..6f72fe8 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -128,7 +128,7 @@
 	return v;
 }
 
-#define kern_hyp_va(v) 	(typeof(v))(__kern_hyp_va((unsigned long)(v)))
+#define kern_hyp_va(v) 	((typeof(v))(__kern_hyp_va((unsigned long)(v))))
 
 /*
  * We currently only support a 40bit IPA.
diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h
index 23acc00..fc756e2 100644
--- a/arch/arm64/include/asm/lse.h
+++ b/arch/arm64/include/asm/lse.h
@@ -5,7 +5,6 @@
 
 #include <linux/stringify.h>
 #include <asm/alternative.h>
-#include <asm/cpufeature.h>
 
 #ifdef __ASSEMBLER__
 
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index ba62df8..b71086d 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -217,7 +217,7 @@
 #define _virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
 #else
 #define __virt_to_pgoff(kaddr)	(((u64)(kaddr) & ~PAGE_OFFSET) / PAGE_SIZE * sizeof(struct page))
-#define __page_to_voff(kaddr)	(((u64)(page) & ~VMEMMAP_START) * PAGE_SIZE / sizeof(struct page))
+#define __page_to_voff(page)	(((u64)(page) & ~VMEMMAP_START) * PAGE_SIZE / sizeof(struct page))
 
 #define page_to_virt(page)	((void *)((__page_to_voff(page)) | PAGE_OFFSET))
 #define virt_to_page(vaddr)	((struct page *)((__virt_to_pgoff(vaddr)) | VMEMMAP_START))
diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h
index e12af67..06ff7fd 100644
--- a/arch/arm64/include/asm/module.h
+++ b/arch/arm64/include/asm/module.h
@@ -17,6 +17,7 @@
 #define __ASM_MODULE_H
 
 #include <asm-generic/module.h>
+#include <asm/memory.h>
 
 #define MODULE_ARCH_VERMAGIC	"aarch64"
 
@@ -32,6 +33,10 @@
 			  Elf64_Sym *sym);
 
 #ifdef CONFIG_RANDOMIZE_BASE
+#ifdef CONFIG_MODVERSIONS
+#define ARCH_RELOCATES_KCRCTAB
+#define reloc_start 		(kimage_vaddr - KIMAGE_VADDR)
+#endif
 extern u64 module_alloc_base;
 #else
 #define module_alloc_base	((u64)_etext - MODULES_VSIZE)
diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 2fee2f5..5394c84 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -44,48 +44,44 @@
 									\
 	switch (size) {							\
 	case 1:								\
-		do {							\
-			asm ("//__per_cpu_" #op "_1\n"			\
-			"ldxrb	  %w[ret], %[ptr]\n"			\
+		asm ("//__per_cpu_" #op "_1\n"				\
+		"1:	ldxrb	  %w[ret], %[ptr]\n"			\
 			#asm_op " %w[ret], %w[ret], %w[val]\n"		\
-			"stxrb	  %w[loop], %w[ret], %[ptr]\n"		\
-			: [loop] "=&r" (loop), [ret] "=&r" (ret),	\
-			  [ptr] "+Q"(*(u8 *)ptr)			\
-			: [val] "Ir" (val));				\
-		} while (loop);						\
+		"	stxrb	  %w[loop], %w[ret], %[ptr]\n"		\
+		"	cbnz	  %w[loop], 1b"				\
+		: [loop] "=&r" (loop), [ret] "=&r" (ret),		\
+		  [ptr] "+Q"(*(u8 *)ptr)				\
+		: [val] "Ir" (val));					\
 		break;							\
 	case 2:								\
-		do {							\
-			asm ("//__per_cpu_" #op "_2\n"			\
-			"ldxrh	  %w[ret], %[ptr]\n"			\
+		asm ("//__per_cpu_" #op "_2\n"				\
+		"1:	ldxrh	  %w[ret], %[ptr]\n"			\
 			#asm_op " %w[ret], %w[ret], %w[val]\n"		\
-			"stxrh	  %w[loop], %w[ret], %[ptr]\n"		\
-			: [loop] "=&r" (loop), [ret] "=&r" (ret),	\
-			  [ptr]  "+Q"(*(u16 *)ptr)			\
-			: [val] "Ir" (val));				\
-		} while (loop);						\
+		"	stxrh	  %w[loop], %w[ret], %[ptr]\n"		\
+		"	cbnz	  %w[loop], 1b"				\
+		: [loop] "=&r" (loop), [ret] "=&r" (ret),		\
+		  [ptr]  "+Q"(*(u16 *)ptr)				\
+		: [val] "Ir" (val));					\
 		break;							\
 	case 4:								\
-		do {							\
-			asm ("//__per_cpu_" #op "_4\n"			\
-			"ldxr	  %w[ret], %[ptr]\n"			\
+		asm ("//__per_cpu_" #op "_4\n"				\
+		"1:	ldxr	  %w[ret], %[ptr]\n"			\
 			#asm_op " %w[ret], %w[ret], %w[val]\n"		\
-			"stxr	  %w[loop], %w[ret], %[ptr]\n"		\
-			: [loop] "=&r" (loop), [ret] "=&r" (ret),	\
-			  [ptr] "+Q"(*(u32 *)ptr)			\
-			: [val] "Ir" (val));				\
-		} while (loop);						\
+		"	stxr	  %w[loop], %w[ret], %[ptr]\n"		\
+		"	cbnz	  %w[loop], 1b"				\
+		: [loop] "=&r" (loop), [ret] "=&r" (ret),		\
+		  [ptr] "+Q"(*(u32 *)ptr)				\
+		: [val] "Ir" (val));					\
 		break;							\
 	case 8:								\
-		do {							\
-			asm ("//__per_cpu_" #op "_8\n"			\
-			"ldxr	  %[ret], %[ptr]\n"			\
+		asm ("//__per_cpu_" #op "_8\n"				\
+		"1:	ldxr	  %[ret], %[ptr]\n"			\
 			#asm_op " %[ret], %[ret], %[val]\n"		\
-			"stxr	  %w[loop], %[ret], %[ptr]\n"		\
-			: [loop] "=&r" (loop), [ret] "=&r" (ret),	\
-			  [ptr] "+Q"(*(u64 *)ptr)			\
-			: [val] "Ir" (val));				\
-		} while (loop);						\
+		"	stxr	  %w[loop], %[ret], %[ptr]\n"		\
+		"	cbnz	  %w[loop], 1b"				\
+		: [loop] "=&r" (loop), [ret] "=&r" (ret),		\
+		  [ptr] "+Q"(*(u64 *)ptr)				\
+		: [val] "Ir" (val));					\
 		break;							\
 	default:							\
 		BUILD_BUG();						\
@@ -150,44 +146,40 @@
 
 	switch (size) {
 	case 1:
-		do {
-			asm ("//__percpu_xchg_1\n"
-			"ldxrb %w[ret], %[ptr]\n"
-			"stxrb %w[loop], %w[val], %[ptr]\n"
-			: [loop] "=&r"(loop), [ret] "=&r"(ret),
-			  [ptr] "+Q"(*(u8 *)ptr)
-			: [val] "r" (val));
-		} while (loop);
+		asm ("//__percpu_xchg_1\n"
+		"1:	ldxrb	%w[ret], %[ptr]\n"
+		"	stxrb	%w[loop], %w[val], %[ptr]\n"
+		"	cbnz	%w[loop], 1b"
+		: [loop] "=&r"(loop), [ret] "=&r"(ret),
+		  [ptr] "+Q"(*(u8 *)ptr)
+		: [val] "r" (val));
 		break;
 	case 2:
-		do {
-			asm ("//__percpu_xchg_2\n"
-			"ldxrh %w[ret], %[ptr]\n"
-			"stxrh %w[loop], %w[val], %[ptr]\n"
-			: [loop] "=&r"(loop), [ret] "=&r"(ret),
-			  [ptr] "+Q"(*(u16 *)ptr)
-			: [val] "r" (val));
-		} while (loop);
+		asm ("//__percpu_xchg_2\n"
+		"1:	ldxrh	%w[ret], %[ptr]\n"
+		"	stxrh	%w[loop], %w[val], %[ptr]\n"
+		"	cbnz	%w[loop], 1b"
+		: [loop] "=&r"(loop), [ret] "=&r"(ret),
+		  [ptr] "+Q"(*(u16 *)ptr)
+		: [val] "r" (val));
 		break;
 	case 4:
-		do {
-			asm ("//__percpu_xchg_4\n"
-			"ldxr %w[ret], %[ptr]\n"
-			"stxr %w[loop], %w[val], %[ptr]\n"
-			: [loop] "=&r"(loop), [ret] "=&r"(ret),
-			  [ptr] "+Q"(*(u32 *)ptr)
-			: [val] "r" (val));
-		} while (loop);
+		asm ("//__percpu_xchg_4\n"
+		"1:	ldxr	%w[ret], %[ptr]\n"
+		"	stxr	%w[loop], %w[val], %[ptr]\n"
+		"	cbnz	%w[loop], 1b"
+		: [loop] "=&r"(loop), [ret] "=&r"(ret),
+		  [ptr] "+Q"(*(u32 *)ptr)
+		: [val] "r" (val));
 		break;
 	case 8:
-		do {
-			asm ("//__percpu_xchg_8\n"
-			"ldxr %[ret], %[ptr]\n"
-			"stxr %w[loop], %[val], %[ptr]\n"
-			: [loop] "=&r"(loop), [ret] "=&r"(ret),
-			  [ptr] "+Q"(*(u64 *)ptr)
-			: [val] "r" (val));
-		} while (loop);
+		asm ("//__percpu_xchg_8\n"
+		"1:	ldxr	%[ret], %[ptr]\n"
+		"	stxr	%w[loop], %[val], %[ptr]\n"
+		"	cbnz	%w[loop], 1b"
+		: [loop] "=&r"(loop), [ret] "=&r"(ret),
+		  [ptr] "+Q"(*(u64 *)ptr)
+		: [val] "r" (val));
 		break;
 	default:
 		BUILD_BUG();
diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index 2065f46..38b6a2b 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -46,7 +46,15 @@
 #define	ARMV8_PMU_EVTYPE_MASK	0xc800ffff	/* Mask for writable bits */
 #define	ARMV8_PMU_EVTYPE_EVENT	0xffff		/* Mask for EVENT bits */
 
-#define ARMV8_PMU_EVTYPE_EVENT_SW_INCR	0	/* Software increment event */
+/*
+ * PMUv3 event types: required events
+ */
+#define ARMV8_PMUV3_PERFCTR_SW_INCR				0x00
+#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL			0x03
+#define ARMV8_PMUV3_PERFCTR_L1D_CACHE				0x04
+#define ARMV8_PMUV3_PERFCTR_BR_MIS_PRED				0x10
+#define ARMV8_PMUV3_PERFCTR_CPU_CYCLES				0x11
+#define ARMV8_PMUV3_PERFCTR_BR_PRED				0x12
 
 /*
  * Event filters for PMUv3
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index df2e53d..60e3482 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -188,8 +188,8 @@
 
 #endif
 
-void cpu_enable_pan(void *__unused);
-void cpu_enable_uao(void *__unused);
-void cpu_enable_cache_maint_trap(void *__unused);
+int cpu_enable_pan(void *__unused);
+int cpu_enable_uao(void *__unused);
+int cpu_enable_cache_maint_trap(void *__unused);
 
 #endif /* __ASM_PROCESSOR_H */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index e8d46e8..6c80b36 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -286,7 +286,7 @@
 
 #define write_sysreg_s(v, r) do {					\
 	u64 __val = (u64)v;						\
-	asm volatile("msr_s " __stringify(r) ", %0" : : "rZ" (__val));	\
+	asm volatile("msr_s " __stringify(r) ", %x0" : : "rZ" (__val));	\
 } while (0)
 
 static inline void config_sctlr_el1(u32 clear, u32 set)
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index bcaf6fb..55d0adb 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -21,6 +21,7 @@
 /*
  * User space memory access functions
  */
+#include <linux/bitops.h>
 #include <linux/kasan-checks.h>
 #include <linux/string.h>
 #include <linux/thread_info.h>
@@ -102,6 +103,13 @@
 	flag;								\
 })
 
+/*
+ * When dealing with data aborts or instruction traps we may end up with
+ * a tagged userland pointer. Clear the tag to get a sane pointer to pass
+ * on to access_ok(), for instance.
+ */
+#define untagged_addr(addr)		sign_extend64(addr, 55)
+
 #define access_ok(type, addr, size)	__range_ok(addr, size)
 #define user_addr_max			get_fs
 
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index 42ffdb54..b0988bb 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -280,35 +280,43 @@
 /*
  * Error-checking SWP macros implemented using ldxr{b}/stxr{b}
  */
-#define __user_swpX_asm(data, addr, res, temp, B)		\
+
+/* Arbitrary constant to ensure forward-progress of the LL/SC loop */
+#define __SWP_LL_SC_LOOPS	4
+
+#define __user_swpX_asm(data, addr, res, temp, temp2, B)	\
 	__asm__ __volatile__(					\
+	"	mov		%w3, %w7\n"			\
 	ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN,	\
 		    CONFIG_ARM64_PAN)				\
-	"0:	ldxr"B"		%w2, [%3]\n"			\
-	"1:	stxr"B"		%w0, %w1, [%3]\n"		\
+	"0:	ldxr"B"		%w2, [%4]\n"			\
+	"1:	stxr"B"		%w0, %w1, [%4]\n"		\
 	"	cbz		%w0, 2f\n"			\
-	"	mov		%w0, %w4\n"			\
+	"	sub		%w3, %w3, #1\n"			\
+	"	cbnz		%w3, 0b\n"			\
+	"	mov		%w0, %w5\n"			\
 	"	b		3f\n"				\
 	"2:\n"							\
 	"	mov		%w1, %w2\n"			\
 	"3:\n"							\
 	"	.pushsection	 .fixup,\"ax\"\n"		\
 	"	.align		2\n"				\
-	"4:	mov		%w0, %w5\n"			\
+	"4:	mov		%w0, %w6\n"			\
 	"	b		3b\n"				\
 	"	.popsection"					\
 	_ASM_EXTABLE(0b, 4b)					\
 	_ASM_EXTABLE(1b, 4b)					\
 	ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN,	\
 		CONFIG_ARM64_PAN)				\
-	: "=&r" (res), "+r" (data), "=&r" (temp)		\
-	: "r" (addr), "i" (-EAGAIN), "i" (-EFAULT)		\
+	: "=&r" (res), "+r" (data), "=&r" (temp), "=&r" (temp2)	\
+	: "r" (addr), "i" (-EAGAIN), "i" (-EFAULT),		\
+	  "i" (__SWP_LL_SC_LOOPS)				\
 	: "memory")
 
-#define __user_swp_asm(data, addr, res, temp) \
-	__user_swpX_asm(data, addr, res, temp, "")
-#define __user_swpb_asm(data, addr, res, temp) \
-	__user_swpX_asm(data, addr, res, temp, "b")
+#define __user_swp_asm(data, addr, res, temp, temp2) \
+	__user_swpX_asm(data, addr, res, temp, temp2, "")
+#define __user_swpb_asm(data, addr, res, temp, temp2) \
+	__user_swpX_asm(data, addr, res, temp, temp2, "b")
 
 /*
  * Bit 22 of the instruction encoding distinguishes between
@@ -328,12 +336,12 @@
 	}
 
 	while (1) {
-		unsigned long temp;
+		unsigned long temp, temp2;
 
 		if (type == TYPE_SWPB)
-			__user_swpb_asm(*data, address, res, temp);
+			__user_swpb_asm(*data, address, res, temp, temp2);
 		else
-			__user_swp_asm(*data, address, res, temp);
+			__user_swp_asm(*data, address, res, temp, temp2);
 
 		if (likely(res != -EAGAIN) || signal_pending(current))
 			break;
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 0150394..b75e917 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -39,10 +39,11 @@
 		(arm64_ftr_reg_ctrel0.sys_val & arm64_ftr_reg_ctrel0.strict_mask);
 }
 
-static void cpu_enable_trap_ctr_access(void *__unused)
+static int cpu_enable_trap_ctr_access(void *__unused)
 {
 	/* Clear SCTLR_EL1.UCT */
 	config_sctlr_el1(SCTLR_EL1_UCT, 0);
+	return 0;
 }
 
 #define MIDR_RANGE(model, min, max) \
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index d577f26..c02504e 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -19,7 +19,9 @@
 #define pr_fmt(fmt) "CPU features: " fmt
 
 #include <linux/bsearch.h>
+#include <linux/cpumask.h>
 #include <linux/sort.h>
+#include <linux/stop_machine.h>
 #include <linux/types.h>
 #include <asm/cpu.h>
 #include <asm/cpufeature.h>
@@ -941,7 +943,13 @@
 {
 	for (; caps->matches; caps++)
 		if (caps->enable && cpus_have_cap(caps->capability))
-			on_each_cpu(caps->enable, NULL, true);
+			/*
+			 * Use stop_machine() as it schedules the work allowing
+			 * us to modify PSTATE, instead of on_each_cpu() which
+			 * uses an IPI, giving us a PSTATE that disappears when
+			 * we return.
+			 */
+			stop_machine(caps->enable, NULL, cpu_online_mask);
 }
 
 /*
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 427f6d3..332e331 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -586,8 +586,9 @@
 	b.lt	4f				// Skip if no PMU present
 	mrs	x0, pmcr_el0			// Disable debug access traps
 	ubfx	x0, x0, #11, #5			// to EL2 and allow access to
-	msr	mdcr_el2, x0			// all PMU counters from EL1
 4:
+	csel	x0, xzr, x0, lt			// all PMU counters from EL1
+	msr	mdcr_el2, x0			// (if they exist)
 
 	/* Stage-2 translation */
 	msr	vttbr_el2, xzr
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index a9310a6..57ae9d9 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -31,17 +31,9 @@
 
 /*
  * ARMv8 PMUv3 Performance Events handling code.
- * Common event types.
+ * Common event types (some are defined in asm/perf_event.h).
  */
 
-/* Required events. */
-#define ARMV8_PMUV3_PERFCTR_SW_INCR				0x00
-#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL			0x03
-#define ARMV8_PMUV3_PERFCTR_L1D_CACHE				0x04
-#define ARMV8_PMUV3_PERFCTR_BR_MIS_PRED				0x10
-#define ARMV8_PMUV3_PERFCTR_CPU_CYCLES				0x11
-#define ARMV8_PMUV3_PERFCTR_BR_PRED				0x12
-
 /* At least one of the following is required. */
 #define ARMV8_PMUV3_PERFCTR_INST_RETIRED			0x08
 #define ARMV8_PMUV3_PERFCTR_INST_SPEC				0x1B
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 27b2f138..01753cd 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -49,6 +49,7 @@
 #include <asm/alternative.h>
 #include <asm/compat.h>
 #include <asm/cacheflush.h>
+#include <asm/exec.h>
 #include <asm/fpsimd.h>
 #include <asm/mmu_context.h>
 #include <asm/processor.h>
@@ -186,10 +187,19 @@
 	printk("pc : [<%016llx>] lr : [<%016llx>] pstate: %08llx\n",
 	       regs->pc, lr, regs->pstate);
 	printk("sp : %016llx\n", sp);
-	for (i = top_reg; i >= 0; i--) {
+
+	i = top_reg;
+
+	while (i >= 0) {
 		printk("x%-2d: %016llx ", i, regs->regs[i]);
-		if (i % 2 == 0)
-			printk("\n");
+		i--;
+
+		if (i % 2 == 0) {
+			pr_cont("x%-2d: %016llx ", i, regs->regs[i]);
+			i--;
+		}
+
+		pr_cont("\n");
 	}
 	printk("\n");
 }
@@ -301,7 +311,7 @@
 }
 
 /* Restore the UAO state depending on next's addr_limit */
-static void uao_thread_switch(struct task_struct *next)
+void uao_thread_switch(struct task_struct *next)
 {
 	if (IS_ENABLED(CONFIG_ARM64_UAO)) {
 		if (task_thread_info(next)->addr_limit == KERNEL_DS)
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index b8799e7..1bec41b 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -135,7 +135,7 @@
 
 #ifdef CONFIG_KASAN
 	mov	x0, sp
-	bl	kasan_unpoison_remaining_stack
+	bl	kasan_unpoison_task_stack_below
 #endif
 
 	ldp	x19, x20, [x29, #16]
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index d3f151c..8507703 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -544,6 +544,7 @@
 			return;
 		}
 		bootcpu_valid = true;
+		early_map_cpu_to_node(0, acpi_numa_get_nid(0, hwid));
 		return;
 	}
 
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index ad73414..bb0cd78 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -1,8 +1,11 @@
 #include <linux/ftrace.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
+#include <asm/alternative.h>
 #include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
 #include <asm/debug-monitors.h>
+#include <asm/exec.h>
 #include <asm/pgtable.h>
 #include <asm/memory.h>
 #include <asm/mmu_context.h>
@@ -50,6 +53,14 @@
 	set_my_cpu_offset(per_cpu_offset(cpu));
 
 	/*
+	 * PSTATE was not saved over suspend/resume, re-enable any detected
+	 * features that might not have been set correctly.
+	 */
+	asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN,
+			CONFIG_ARM64_PAN));
+	uao_thread_switch(current);
+
+	/*
 	 * Restore HW breakpoint registers to sane values
 	 * before debug exceptions are possibly reenabled
 	 * through local_dbg_restore.
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 5ff020f..c9986b3 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -428,24 +428,28 @@
 	force_signal_inject(SIGILL, ILL_ILLOPC, regs, 0);
 }
 
-void cpu_enable_cache_maint_trap(void *__unused)
+int cpu_enable_cache_maint_trap(void *__unused)
 {
 	config_sctlr_el1(SCTLR_EL1_UCI, 0);
+	return 0;
 }
 
 #define __user_cache_maint(insn, address, res)			\
-	asm volatile (						\
-		"1:	" insn ", %1\n"				\
-		"	mov	%w0, #0\n"			\
-		"2:\n"						\
-		"	.pushsection .fixup,\"ax\"\n"		\
-		"	.align	2\n"				\
-		"3:	mov	%w0, %w2\n"			\
-		"	b	2b\n"				\
-		"	.popsection\n"				\
-		_ASM_EXTABLE(1b, 3b)				\
-		: "=r" (res)					\
-		: "r" (address), "i" (-EFAULT) )
+	if (untagged_addr(address) >= user_addr_max())		\
+		res = -EFAULT;					\
+	else							\
+		asm volatile (					\
+			"1:	" insn ", %1\n"			\
+			"	mov	%w0, #0\n"		\
+			"2:\n"					\
+			"	.pushsection .fixup,\"ax\"\n"	\
+			"	.align	2\n"			\
+			"3:	mov	%w0, %w2\n"		\
+			"	b	2b\n"			\
+			"	.popsection\n"			\
+			_ASM_EXTABLE(1b, 3b)			\
+			: "=r" (res)				\
+			: "r" (address), "i" (-EFAULT) )
 
 static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs)
 {
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index 9cc0ea7..88e2f2b 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -64,6 +64,21 @@
 	write_sysreg(0, vttbr_el2);
 }
 
+void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = kern_hyp_va(kern_hyp_va(vcpu)->kvm);
+
+	/* Switch to requested VMID */
+	write_sysreg(kvm->arch.vttbr, vttbr_el2);
+	isb();
+
+	asm volatile("tlbi vmalle1" : : );
+	dsb(nsh);
+	isb();
+
+	write_sysreg(0, vttbr_el2);
+}
+
 void __hyp_text __kvm_flush_vm_context(void)
 {
 	dsb(ishst);
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index f302fdb..87e7e66 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -597,8 +597,14 @@
 
 			idx = ARMV8_PMU_CYCLE_IDX;
 		} else {
-			BUG();
+			return false;
 		}
+	} else if (r->CRn == 0 && r->CRm == 9) {
+		/* PMCCNTR */
+		if (pmu_access_event_counter_el0_disabled(vcpu))
+			return false;
+
+		idx = ARMV8_PMU_CYCLE_IDX;
 	} else if (r->CRn == 14 && (r->CRm & 12) == 8) {
 		/* PMEVCNTRn_EL0 */
 		if (pmu_access_event_counter_el0_disabled(vcpu))
@@ -606,7 +612,7 @@
 
 		idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
 	} else {
-		BUG();
+		return false;
 	}
 
 	if (!pmu_counter_idx_valid(vcpu, idx))
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 53d9159..0f87883 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -29,7 +29,9 @@
 #include <linux/sched.h>
 #include <linux/highmem.h>
 #include <linux/perf_event.h>
+#include <linux/preempt.h>
 
+#include <asm/bug.h>
 #include <asm/cpufeature.h>
 #include <asm/exception.h>
 #include <asm/debug-monitors.h>
@@ -670,9 +672,17 @@
 NOKPROBE_SYMBOL(do_debug_exception);
 
 #ifdef CONFIG_ARM64_PAN
-void cpu_enable_pan(void *__unused)
+int cpu_enable_pan(void *__unused)
 {
+	/*
+	 * We modify PSTATE. This won't work from irq context as the PSTATE
+	 * is discarded once we return from the exception.
+	 */
+	WARN_ON_ONCE(in_interrupt());
+
 	config_sctlr_el1(SCTLR_EL1_SPAN, 0);
+	asm(SET_PSTATE_PAN(1));
+	return 0;
 }
 #endif /* CONFIG_ARM64_PAN */
 
@@ -683,8 +693,9 @@
  * We need to enable the feature at runtime (instead of adding it to
  * PSR_MODE_EL1h) as the feature may not be implemented by the cpu.
  */
-void cpu_enable_uao(void *__unused)
+int cpu_enable_uao(void *__unused)
 {
 	asm(SET_PSTATE_UAO(1));
+	return 0;
 }
 #endif /* CONFIG_ARM64_UAO */
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 21c489b..212c4d1 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -421,35 +421,35 @@
 
 	pr_notice("Virtual kernel memory layout:\n");
 #ifdef CONFIG_KASAN
-	pr_cont("    kasan   : 0x%16lx - 0x%16lx   (%6ld GB)\n",
+	pr_notice("    kasan   : 0x%16lx - 0x%16lx   (%6ld GB)\n",
 		MLG(KASAN_SHADOW_START, KASAN_SHADOW_END));
 #endif
-	pr_cont("    modules : 0x%16lx - 0x%16lx   (%6ld MB)\n",
+	pr_notice("    modules : 0x%16lx - 0x%16lx   (%6ld MB)\n",
 		MLM(MODULES_VADDR, MODULES_END));
-	pr_cont("    vmalloc : 0x%16lx - 0x%16lx   (%6ld GB)\n",
+	pr_notice("    vmalloc : 0x%16lx - 0x%16lx   (%6ld GB)\n",
 		MLG(VMALLOC_START, VMALLOC_END));
-	pr_cont("      .text : 0x%p" " - 0x%p" "   (%6ld KB)\n",
+	pr_notice("      .text : 0x%p" " - 0x%p" "   (%6ld KB)\n",
 		MLK_ROUNDUP(_text, _etext));
-	pr_cont("    .rodata : 0x%p" " - 0x%p" "   (%6ld KB)\n",
+	pr_notice("    .rodata : 0x%p" " - 0x%p" "   (%6ld KB)\n",
 		MLK_ROUNDUP(__start_rodata, __init_begin));
-	pr_cont("      .init : 0x%p" " - 0x%p" "   (%6ld KB)\n",
+	pr_notice("      .init : 0x%p" " - 0x%p" "   (%6ld KB)\n",
 		MLK_ROUNDUP(__init_begin, __init_end));
-	pr_cont("      .data : 0x%p" " - 0x%p" "   (%6ld KB)\n",
+	pr_notice("      .data : 0x%p" " - 0x%p" "   (%6ld KB)\n",
 		MLK_ROUNDUP(_sdata, _edata));
-	pr_cont("       .bss : 0x%p" " - 0x%p" "   (%6ld KB)\n",
+	pr_notice("       .bss : 0x%p" " - 0x%p" "   (%6ld KB)\n",
 		MLK_ROUNDUP(__bss_start, __bss_stop));
-	pr_cont("    fixed   : 0x%16lx - 0x%16lx   (%6ld KB)\n",
+	pr_notice("    fixed   : 0x%16lx - 0x%16lx   (%6ld KB)\n",
 		MLK(FIXADDR_START, FIXADDR_TOP));
-	pr_cont("    PCI I/O : 0x%16lx - 0x%16lx   (%6ld MB)\n",
+	pr_notice("    PCI I/O : 0x%16lx - 0x%16lx   (%6ld MB)\n",
 		MLM(PCI_IO_START, PCI_IO_END));
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-	pr_cont("    vmemmap : 0x%16lx - 0x%16lx   (%6ld GB maximum)\n",
+	pr_notice("    vmemmap : 0x%16lx - 0x%16lx   (%6ld GB maximum)\n",
 		MLG(VMEMMAP_START, VMEMMAP_START + VMEMMAP_SIZE));
-	pr_cont("              0x%16lx - 0x%16lx   (%6ld MB actual)\n",
+	pr_notice("              0x%16lx - 0x%16lx   (%6ld MB actual)\n",
 		MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()),
 		    (unsigned long)virt_to_page(high_memory)));
 #endif
-	pr_cont("    memory  : 0x%16lx - 0x%16lx   (%6ld MB)\n",
+	pr_notice("    memory  : 0x%16lx - 0x%16lx   (%6ld MB)\n",
 		MLM(__phys_to_virt(memblock_start_of_DRAM()),
 		    (unsigned long)high_memory));
 
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 778a985..4b32168 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -147,7 +147,7 @@
 
 static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
 {
-	return node_distance(from, to);
+	return node_distance(early_cpu_to_node(from), early_cpu_to_node(to));
 }
 
 static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size,
@@ -223,8 +223,11 @@
 	void *nd;
 	int tnid;
 
-	pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
-		nid, start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
+	if (start_pfn < end_pfn)
+		pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
+			start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
+	else
+		pr_info("Initmem setup node %d [<memory-less node>]\n", nid);
 
 	nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
 	nd = __va(nd_pa);
diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c
index 8b8fe67..8d79286 100644
--- a/arch/blackfin/kernel/ptrace.c
+++ b/arch/blackfin/kernel/ptrace.c
@@ -271,7 +271,7 @@
 			case BFIN_MEM_ACCESS_CORE:
 			case BFIN_MEM_ACCESS_CORE_ONLY:
 				copied = access_process_vm(child, addr, &tmp,
-				                           to_copy, 0);
+							   to_copy, FOLL_FORCE);
 				if (copied)
 					break;
 
@@ -324,7 +324,8 @@
 			case BFIN_MEM_ACCESS_CORE:
 			case BFIN_MEM_ACCESS_CORE_ONLY:
 				copied = access_process_vm(child, addr, &data,
-				                           to_copy, 1);
+				                           to_copy,
+							   FOLL_FORCE | FOLL_WRITE);
 				break;
 			case BFIN_MEM_ACCESS_DMA:
 				if (safe_dma_memcpy(paddr, &data, to_copy))
diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c
index b5698c8..0068fd4 100644
--- a/arch/cris/arch-v32/drivers/cryptocop.c
+++ b/arch/cris/arch-v32/drivers/cryptocop.c
@@ -2722,7 +2722,6 @@
 	err = get_user_pages((unsigned long int)(oper.indata + prev_ix),
 			     noinpages,
 			     0,  /* read access only for in data */
-			     0, /* no force */
 			     inpages,
 			     NULL);
 
@@ -2736,8 +2735,7 @@
 	if (oper.do_cipher){
 		err = get_user_pages((unsigned long int)oper.cipher_outdata,
 				     nooutpages,
-				     1, /* write access for out data */
-				     0, /* no force */
+				     FOLL_WRITE, /* write access for out data */
 				     outpages,
 				     NULL);
 		up_read(&current->mm->mmap_sem);
@@ -3151,7 +3149,7 @@
 	printk("print_dma_descriptors start\n");
 
 	printk("iop:\n");
-	printk("\tsid: 0x%lld\n", iop->sid);
+	printk("\tsid: 0x%llx\n", iop->sid);
 
 	printk("\tcdesc_out: 0x%p\n", iop->cdesc_out);
 	printk("\tcdesc_in: 0x%p\n", iop->cdesc_in);
diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c
index f085229..f0df654 100644
--- a/arch/cris/arch-v32/kernel/ptrace.c
+++ b/arch/cris/arch-v32/kernel/ptrace.c
@@ -147,7 +147,7 @@
 				/* The trampoline page is globally mapped, no page table to traverse.*/
 				tmp = *(unsigned long*)addr;
 			} else {
-				copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
+				copied = access_process_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE);
 
 				if (copied != sizeof(tmp))
 					break;
@@ -279,7 +279,7 @@
   int opsize = 0;
 
   /* Read the opcode at pc (do what PTRACE_PEEKTEXT would do). */
-  copied = access_process_vm(child, pc, &opcode, sizeof(opcode), 0);
+  copied = access_process_vm(child, pc, &opcode, sizeof(opcode), FOLL_FORCE);
   if (copied != sizeof(opcode))
     return 0;
 
diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h
index b408fe6..3cef068 100644
--- a/arch/h8300/include/asm/thread_info.h
+++ b/arch/h8300/include/asm/thread_info.h
@@ -31,7 +31,6 @@
 	int		   cpu;			/* cpu we're on */
 	int		   preempt_count;	/* 0 => preemptable, <0 => BUG */
 	mm_segment_t		addr_limit;
-	struct restart_block restart_block;
 };
 
 /*
@@ -44,9 +43,6 @@
 	.cpu =		0,			\
 	.preempt_count = INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
-	.restart_block	= {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index ad1f81f..7138303 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -79,7 +79,7 @@
 	unsigned int er0;
 
 	/* Always make any pending restarted system calls return -EINTR */
-	current_thread_info()->restart_block.fn = do_no_restart_syscall;
+	current->restart_block.fn = do_no_restart_syscall;
 
 	/* restore passed registers */
 #define COPY(r)  do { err |= get_user(regs->r, &usc->sc_##r); } while (0)
diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
index 09f8457..5ed0ea9 100644
--- a/arch/ia64/kernel/err_inject.c
+++ b/arch/ia64/kernel/err_inject.c
@@ -142,7 +142,7 @@
 	u64 virt_addr=simple_strtoull(buf, NULL, 16);
 	int ret;
 
-	ret = get_user_pages(virt_addr, 1, VM_READ, 0, NULL, NULL);
+	ret = get_user_pages(virt_addr, 1, FOLL_WRITE, NULL, NULL);
 	if (ret<=0) {
 #ifdef ERR_INJ_DEBUG
 		printk("Virtual address %lx is not existing.\n",virt_addr);
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 6f54d51..31aa8c0 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -453,7 +453,7 @@
 			return 0;
 		}
 	}
-	copied = access_process_vm(child, addr, &ret, sizeof(ret), 0);
+	copied = access_process_vm(child, addr, &ret, sizeof(ret), FOLL_FORCE);
 	if (copied != sizeof(ret))
 		return -EIO;
 	*val = ret;
@@ -489,7 +489,8 @@
 				*ia64_rse_skip_regs(krbs, regnum) = val;
 			}
 		}
-	} else if (access_process_vm(child, addr, &val, sizeof(val), 1)
+	} else if (access_process_vm(child, addr, &val, sizeof(val),
+				FOLL_FORCE | FOLL_WRITE)
 		   != sizeof(val))
 		return -EIO;
 	return 0;
@@ -543,7 +544,8 @@
 		ret = ia64_peek(child, sw, user_rbs_end, addr, &val);
 		if (ret < 0)
 			return ret;
-		if (access_process_vm(child, addr, &val, sizeof(val), 1)
+		if (access_process_vm(child, addr, &val, sizeof(val),
+				FOLL_FORCE | FOLL_WRITE)
 		    != sizeof(val))
 			return -EIO;
 	}
@@ -559,7 +561,8 @@
 
 	/* now copy word for word from user rbs to kernel rbs: */
 	for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) {
-		if (access_process_vm(child, addr, &val, sizeof(val), 0)
+		if (access_process_vm(child, addr, &val, sizeof(val),
+				FOLL_FORCE)
 				!= sizeof(val))
 			return -EIO;
 
@@ -1156,7 +1159,8 @@
 	case PTRACE_PEEKTEXT:
 	case PTRACE_PEEKDATA:
 		/* read word at location addr */
-		if (access_process_vm(child, addr, &data, sizeof(data), 0)
+		if (access_process_vm(child, addr, &data, sizeof(data),
+				FOLL_FORCE)
 		    != sizeof(data))
 			return -EIO;
 		/* ensure return value is not mistaken for error code */
diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c
index 51f5e9a..c145605 100644
--- a/arch/m32r/kernel/ptrace.c
+++ b/arch/m32r/kernel/ptrace.c
@@ -493,7 +493,8 @@
 	int i;
 
 	for (i = 0; i < p->nr_trap; i++)
-		access_process_vm(child, p->addr[i], &p->insn[i], sizeof(p->insn[i]), 1);
+		access_process_vm(child, p->addr[i], &p->insn[i], sizeof(p->insn[i]),
+				FOLL_FORCE | FOLL_WRITE);
 	p->nr_trap = 0;
 }
 
@@ -537,7 +538,8 @@
 	unsigned long next_insn, code;
 	unsigned long addr = next_pc & ~3;
 
-	if (access_process_vm(child, addr, &next_insn, sizeof(next_insn), 0)
+	if (access_process_vm(child, addr, &next_insn, sizeof(next_insn),
+			FOLL_FORCE)
 	    != sizeof(next_insn)) {
 		return -1; /* error */
 	}
@@ -546,7 +548,8 @@
 	if (register_debug_trap(child, next_pc, next_insn, &code)) {
 		return -1; /* error */
 	}
-	if (access_process_vm(child, addr, &code, sizeof(code), 1)
+	if (access_process_vm(child, addr, &code, sizeof(code),
+			FOLL_FORCE | FOLL_WRITE)
 	    != sizeof(code)) {
 		return -1; /* error */
 	}
@@ -562,7 +565,8 @@
  	addr = (regs->bpc - 2) & ~3;
 	regs->bpc -= 2;
 	if (unregister_debug_trap(current, addr, &code)) {
-	    access_process_vm(current, addr, &code, sizeof(code), 1);
+	    access_process_vm(current, addr, &code, sizeof(code),
+		    FOLL_FORCE | FOLL_WRITE);
 	    invalidate_cache();
 	}
 }
@@ -589,7 +593,8 @@
 	/* Compute next pc.  */
 	pc = get_stack_long(child, PT_BPC);
 
-	if (access_process_vm(child, pc&~3, &insn, sizeof(insn), 0)
+	if (access_process_vm(child, pc&~3, &insn, sizeof(insn),
+			FOLL_FORCE)
 	    != sizeof(insn))
 		return;
 
diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index fbf40d3..1a6bac7 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -263,7 +263,7 @@
 
 bootvars-y	= VMLINUX_LOAD_ADDRESS=$(load-y) \
 		  VMLINUX_ENTRY_ADDRESS=$(entry-y) \
-		  PLATFORM=$(platform-y)
+		  PLATFORM="$(platform-y)"
 ifdef CONFIG_32BIT
 bootvars-y	+= ADDR_BITS=32
 endif
diff --git a/arch/mips/boot/dts/mti/malta.dts b/arch/mips/boot/dts/mti/malta.dts
index f604a27..ffe3a15 100644
--- a/arch/mips/boot/dts/mti/malta.dts
+++ b/arch/mips/boot/dts/mti/malta.dts
@@ -84,12 +84,13 @@
 	fpga_regs: system-controller@1f000000 {
 		compatible = "mti,malta-fpga", "syscon", "simple-mfd";
 		reg = <0x1f000000 0x1000>;
+		native-endian;
 
 		reboot {
 			compatible = "syscon-reboot";
 			regmap = <&fpga_regs>;
 			offset = <0x500>;
-			mask = <0x4d>;
+			mask = <0x42>;
 		};
 	};
 
diff --git a/arch/mips/generic/init.c b/arch/mips/generic/init.c
index 0ea73e8..d493ccb 100644
--- a/arch/mips/generic/init.c
+++ b/arch/mips/generic/init.c
@@ -30,9 +30,19 @@
 
 void __init prom_init(void)
 {
+	plat_get_fdt();
+	BUG_ON(!fdt);
+}
+
+void __init *plat_get_fdt(void)
+{
 	const struct mips_machine *check_mach;
 	const struct of_device_id *match;
 
+	if (fdt)
+		/* Already set up */
+		return (void *)fdt;
+
 	if ((fw_arg0 == -2) && !fdt_check_header((void *)fw_arg1)) {
 		/*
 		 * We booted using the UHI boot protocol, so we have been
@@ -75,12 +85,6 @@
 		/* Retrieve the machine's FDT */
 		fdt = mach->fdt;
 	}
-
-	BUG_ON(!fdt);
-}
-
-void __init *plat_get_fdt(void)
-{
 	return (void *)fdt;
 }
 
diff --git a/arch/mips/include/asm/fpu_emulator.h b/arch/mips/include/asm/fpu_emulator.h
index 355dc25..c05369e 100644
--- a/arch/mips/include/asm/fpu_emulator.h
+++ b/arch/mips/include/asm/fpu_emulator.h
@@ -63,6 +63,8 @@
 extern int fpu_emulator_cop1Handler(struct pt_regs *xcp,
 				    struct mips_fpu_struct *ctx, int has_fpu,
 				    void *__user *fault_addr);
+void force_fcr31_sig(unsigned long fcr31, void __user *fault_addr,
+		     struct task_struct *tsk);
 int process_fpemu_return(int sig, void __user *fault_addr,
 			 unsigned long fcr31);
 int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
@@ -81,4 +83,15 @@
 		set_fpr64(&t->thread.fpu.fpr[i], 0, SIGNALLING_NAN);
 }
 
+/*
+ * Mask the FCSR Cause bits according to the Enable bits, observing
+ * that Unimplemented is always enabled.
+ */
+static inline unsigned long mask_fcr31_x(unsigned long fcr31)
+{
+	return fcr31 & (FPU_CSR_UNI_X |
+			((fcr31 & FPU_CSR_ALL_E) <<
+			 (ffs(FPU_CSR_ALL_X) - ffs(FPU_CSR_ALL_E))));
+}
+
 #endif /* _ASM_FPU_EMULATOR_H */
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 07f58cf..bebec37 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -293,7 +293,10 @@
 	/* Host KSEG0 address of the EI/DI offset */
 	void *kseg0_commpage;
 
-	u32 io_gpr;		/* GPR used as IO source/target */
+	/* Resume PC after MMIO completion */
+	unsigned long io_pc;
+	/* GPR used as IO source/target */
+	u32 io_gpr;
 
 	struct hrtimer comparecount_timer;
 	/* Count timer control KVM register */
@@ -315,8 +318,6 @@
 	/* Bitmask of pending exceptions to be cleared */
 	unsigned long pending_exceptions_clr;
 
-	u32 pending_load_cause;
-
 	/* Save/Restore the entryhi register when are are preempted/scheduled back in */
 	unsigned long preempt_entryhi;
 
diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h
index 7dd2dd4..df78b2c 100644
--- a/arch/mips/include/asm/mipsregs.h
+++ b/arch/mips/include/asm/mipsregs.h
@@ -215,6 +215,12 @@
 #endif
 
 /*
+ * Wired register bits
+ */
+#define MIPSR6_WIRED_LIMIT	(_ULCAST_(0xffff) << 16)
+#define MIPSR6_WIRED_WIRED	(_ULCAST_(0xffff) << 0)
+
+/*
  * Values used for computation of new tlb entries
  */
 #define PL_4K		12
diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h
index ebb5c0f..c0ae279 100644
--- a/arch/mips/include/asm/switch_to.h
+++ b/arch/mips/include/asm/switch_to.h
@@ -76,6 +76,22 @@
 } while (0)
 
 /*
+ * Check FCSR for any unmasked exceptions pending set with `ptrace',
+ * clear them and send a signal.
+ */
+#define __sanitize_fcr31(next)						\
+do {									\
+	unsigned long fcr31 = mask_fcr31_x(next->thread.fpu.fcr31);	\
+	void __user *pc;						\
+									\
+	if (unlikely(fcr31)) {						\
+		pc = (void __user *)task_pt_regs(next)->cp0_epc;	\
+		next->thread.fpu.fcr31 &= ~fcr31;			\
+		force_fcr31_sig(fcr31, pc, next);			\
+	}								\
+} while (0)
+
+/*
  * For newly created kernel threads switch_to() will return to
  * ret_from_kernel_thread, newly created user threads to ret_from_fork.
  * That is, everything following resume() will be skipped for new threads.
@@ -85,6 +101,8 @@
 do {									\
 	__mips_mt_fpaff_switch_to(prev);				\
 	lose_fpu_inatomic(1, prev);					\
+	if (tsk_used_math(next))					\
+		__sanitize_fcr31(next);					\
 	if (cpu_has_dsp) {						\
 		__save_dsp(prev);					\
 		__restore_dsp(next);					\
diff --git a/arch/mips/include/asm/tlb.h b/arch/mips/include/asm/tlb.h
index 4a23493..dd179fd 100644
--- a/arch/mips/include/asm/tlb.h
+++ b/arch/mips/include/asm/tlb.h
@@ -1,6 +1,9 @@
 #ifndef __ASM_TLB_H
 #define __ASM_TLB_H
 
+#include <asm/cpu-features.h>
+#include <asm/mipsregs.h>
+
 /*
  * MIPS doesn't need any special per-pte or per-vma handling, except
  * we need to flush cache for area to be unmapped.
@@ -22,6 +25,16 @@
 		((CKSEG0 + ((idx) << (PAGE_SHIFT + 1))) |		\
 		 (cpu_has_tlbinv ? MIPS_ENTRYHI_EHINV : 0))
 
+static inline unsigned int num_wired_entries(void)
+{
+	unsigned int wired = read_c0_wired();
+
+	if (cpu_has_mips_r6)
+		wired &= MIPSR6_WIRED_WIRED;
+
+	return wired;
+}
+
 #include <asm-generic/tlb.h>
 
 #endif /* __ASM_TLB_H */
diff --git a/arch/mips/kernel/mips-cpc.c b/arch/mips/kernel/mips-cpc.c
index 2a45867..a4964c3 100644
--- a/arch/mips/kernel/mips-cpc.c
+++ b/arch/mips/kernel/mips-cpc.c
@@ -21,6 +21,11 @@
 
 static DEFINE_PER_CPU_ALIGNED(unsigned long, cpc_core_lock_flags);
 
+phys_addr_t __weak mips_cpc_default_phys_base(void)
+{
+	return 0;
+}
+
 /**
  * mips_cpc_phys_base - retrieve the physical base address of the CPC
  *
@@ -43,8 +48,12 @@
 	if (cpc_base & CM_GCR_CPC_BASE_CPCEN_MSK)
 		return cpc_base & CM_GCR_CPC_BASE_CPCBASE_MSK;
 
-	/* Otherwise, give it the default address & enable it */
+	/* Otherwise, use the default address */
 	cpc_base = mips_cpc_default_phys_base();
+	if (!cpc_base)
+		return cpc_base;
+
+	/* Enable the CPC, mapped at the default address */
 	write_gcr_cpc_base(cpc_base | CM_GCR_CPC_BASE_CPCEN_MSK);
 	return cpc_base;
 }
diff --git a/arch/mips/kernel/mips-r2-to-r6-emul.c b/arch/mips/kernel/mips-r2-to-r6-emul.c
index 22dedd6..bd09853 100644
--- a/arch/mips/kernel/mips-r2-to-r6-emul.c
+++ b/arch/mips/kernel/mips-r2-to-r6-emul.c
@@ -899,7 +899,7 @@
  * mipsr2_decoder: Decode and emulate a MIPS R2 instruction
  * @regs: Process register set
  * @inst: Instruction to decode and emulate
- * @fcr31: Floating Point Control and Status Register returned
+ * @fcr31: Floating Point Control and Status Register Cause bits returned
  */
 int mipsr2_decoder(struct pt_regs *regs, u32 inst, unsigned long *fcr31)
 {
@@ -1172,13 +1172,13 @@
 
 		err = fpu_emulator_cop1Handler(regs, &current->thread.fpu, 0,
 					       &fault_addr);
-		*fcr31 = current->thread.fpu.fcr31;
 
 		/*
-		 * We can't allow the emulated instruction to leave any of
-		 * the cause bits set in $fcr31.
+		 * We can't allow the emulated instruction to leave any
+		 * enabled Cause bits set in $fcr31.
 		 */
-		current->thread.fpu.fcr31 &= ~FPU_CSR_ALL_X;
+		*fcr31 = res = mask_fcr31_x(current->thread.fpu.fcr31);
+		current->thread.fpu.fcr31 &= ~res;
 
 		/*
 		 * this is a tricky issue - lose_fpu() uses LL/SC atomics
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 6103b24..a92994d 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -79,16 +79,15 @@
 }
 
 /*
- * Poke at FCSR according to its mask.  Don't set the cause bits as
- * this is currently not handled correctly in FP context restoration
- * and will cause an oops if a corresponding enable bit is set.
+ * Poke at FCSR according to its mask.  Set the Cause bits even
+ * if a corresponding Enable bit is set.  This will be noticed at
+ * the time the thread is switched to and SIGFPE thrown accordingly.
  */
 static void ptrace_setfcr31(struct task_struct *child, u32 value)
 {
 	u32 fcr31;
 	u32 mask;
 
-	value &= ~FPU_CSR_ALL_X;
 	fcr31 = child->thread.fpu.fcr31;
 	mask = boot_cpu_data.fpu_msk31;
 	child->thread.fpu.fcr31 = (value & ~mask) | (fcr31 & mask);
@@ -817,6 +816,7 @@
 			break;
 #endif
 		case FPC_CSR:
+			init_fp_ctx(child);
 			ptrace_setfcr31(child, data);
 			break;
 		case DSP_BASE ... DSP_BASE + 5: {
diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c
index 283b5a1..7e71a4e 100644
--- a/arch/mips/kernel/ptrace32.c
+++ b/arch/mips/kernel/ptrace32.c
@@ -70,7 +70,7 @@
 			break;
 
 		copied = access_process_vm(child, (u64)addrOthers, &tmp,
-				sizeof(tmp), 0);
+				sizeof(tmp), FOLL_FORCE);
 		if (copied != sizeof(tmp))
 			break;
 		ret = put_user(tmp, (u32 __user *) (unsigned long) data);
@@ -179,7 +179,8 @@
 			break;
 		ret = 0;
 		if (access_process_vm(child, (u64)addrOthers, &data,
-					sizeof(data), 1) == sizeof(data))
+					sizeof(data),
+					FOLL_FORCE | FOLL_WRITE) == sizeof(data))
 			break;
 		ret = -EIO;
 		break;
diff --git a/arch/mips/kernel/r2300_fpu.S b/arch/mips/kernel/r2300_fpu.S
index b4ac637..918f2f6 100644
--- a/arch/mips/kernel/r2300_fpu.S
+++ b/arch/mips/kernel/r2300_fpu.S
@@ -21,106 +21,84 @@
 #define EX(a,b)							\
 9:	a,##b;							\
 	.section __ex_table,"a";				\
+	PTR	9b,fault;					\
+	.previous
+
+#define EX2(a,b)						\
+9:	a,##b;							\
+	.section __ex_table,"a";				\
 	PTR	9b,bad_stack;					\
+	PTR	9b+4,bad_stack;					\
 	.previous
 
 	.set	noreorder
 	.set	mips1
-	/* Save floating point context */
+
+/**
+ * _save_fp_context() - save FP context from the FPU
+ * @a0 - pointer to fpregs field of sigcontext
+ * @a1 - pointer to fpc_csr field of sigcontext
+ *
+ * Save FP context, including the 32 FP data registers and the FP
+ * control & status register, from the FPU to signal context.
+ */
 LEAF(_save_fp_context)
 	.set	push
 	SET_HARDFLOAT
 	li	v0, 0					# assume success
-	cfc1	t1,fcr31
-	EX(swc1 $f0,(SC_FPREGS+0)(a0))
-	EX(swc1 $f1,(SC_FPREGS+8)(a0))
-	EX(swc1 $f2,(SC_FPREGS+16)(a0))
-	EX(swc1 $f3,(SC_FPREGS+24)(a0))
-	EX(swc1 $f4,(SC_FPREGS+32)(a0))
-	EX(swc1 $f5,(SC_FPREGS+40)(a0))
-	EX(swc1 $f6,(SC_FPREGS+48)(a0))
-	EX(swc1 $f7,(SC_FPREGS+56)(a0))
-	EX(swc1 $f8,(SC_FPREGS+64)(a0))
-	EX(swc1 $f9,(SC_FPREGS+72)(a0))
-	EX(swc1 $f10,(SC_FPREGS+80)(a0))
-	EX(swc1 $f11,(SC_FPREGS+88)(a0))
-	EX(swc1 $f12,(SC_FPREGS+96)(a0))
-	EX(swc1 $f13,(SC_FPREGS+104)(a0))
-	EX(swc1 $f14,(SC_FPREGS+112)(a0))
-	EX(swc1 $f15,(SC_FPREGS+120)(a0))
-	EX(swc1 $f16,(SC_FPREGS+128)(a0))
-	EX(swc1 $f17,(SC_FPREGS+136)(a0))
-	EX(swc1 $f18,(SC_FPREGS+144)(a0))
-	EX(swc1 $f19,(SC_FPREGS+152)(a0))
-	EX(swc1 $f20,(SC_FPREGS+160)(a0))
-	EX(swc1 $f21,(SC_FPREGS+168)(a0))
-	EX(swc1 $f22,(SC_FPREGS+176)(a0))
-	EX(swc1 $f23,(SC_FPREGS+184)(a0))
-	EX(swc1 $f24,(SC_FPREGS+192)(a0))
-	EX(swc1 $f25,(SC_FPREGS+200)(a0))
-	EX(swc1 $f26,(SC_FPREGS+208)(a0))
-	EX(swc1 $f27,(SC_FPREGS+216)(a0))
-	EX(swc1 $f28,(SC_FPREGS+224)(a0))
-	EX(swc1 $f29,(SC_FPREGS+232)(a0))
-	EX(swc1 $f30,(SC_FPREGS+240)(a0))
-	EX(swc1 $f31,(SC_FPREGS+248)(a0))
-	EX(sw	t1,(SC_FPC_CSR)(a0))
-	cfc1	t0,$0				# implementation/version
+	cfc1	t1, fcr31
+	EX2(s.d $f0, 0(a0))
+	EX2(s.d $f2, 16(a0))
+	EX2(s.d $f4, 32(a0))
+	EX2(s.d $f6, 48(a0))
+	EX2(s.d $f8, 64(a0))
+	EX2(s.d $f10, 80(a0))
+	EX2(s.d $f12, 96(a0))
+	EX2(s.d $f14, 112(a0))
+	EX2(s.d $f16, 128(a0))
+	EX2(s.d $f18, 144(a0))
+	EX2(s.d $f20, 160(a0))
+	EX2(s.d $f22, 176(a0))
+	EX2(s.d $f24, 192(a0))
+	EX2(s.d $f26, 208(a0))
+	EX2(s.d $f28, 224(a0))
+	EX2(s.d $f30, 240(a0))
 	jr	ra
+	 EX(sw	t1, (a1))
 	.set	pop
-	.set	nomacro
-	 EX(sw	t0,(SC_FPC_EIR)(a0))
-	.set	macro
 	END(_save_fp_context)
 
-/*
- * Restore FPU state:
- *  - fp gp registers
- *  - cp1 status/control register
+/**
+ * _restore_fp_context() - restore FP context to the FPU
+ * @a0 - pointer to fpregs field of sigcontext
+ * @a1 - pointer to fpc_csr field of sigcontext
  *
- * We base the decision which registers to restore from the signal stack
- * frame on the current content of c0_status, not on the content of the
- * stack frame which might have been changed by the user.
+ * Restore FP context, including the 32 FP data registers and the FP
+ * control & status register, from signal context to the FPU.
  */
 LEAF(_restore_fp_context)
 	.set	push
 	SET_HARDFLOAT
 	li	v0, 0					# assume success
-	EX(lw t0,(SC_FPC_CSR)(a0))
-	EX(lwc1 $f0,(SC_FPREGS+0)(a0))
-	EX(lwc1 $f1,(SC_FPREGS+8)(a0))
-	EX(lwc1 $f2,(SC_FPREGS+16)(a0))
-	EX(lwc1 $f3,(SC_FPREGS+24)(a0))
-	EX(lwc1 $f4,(SC_FPREGS+32)(a0))
-	EX(lwc1 $f5,(SC_FPREGS+40)(a0))
-	EX(lwc1 $f6,(SC_FPREGS+48)(a0))
-	EX(lwc1 $f7,(SC_FPREGS+56)(a0))
-	EX(lwc1 $f8,(SC_FPREGS+64)(a0))
-	EX(lwc1 $f9,(SC_FPREGS+72)(a0))
-	EX(lwc1 $f10,(SC_FPREGS+80)(a0))
-	EX(lwc1 $f11,(SC_FPREGS+88)(a0))
-	EX(lwc1 $f12,(SC_FPREGS+96)(a0))
-	EX(lwc1 $f13,(SC_FPREGS+104)(a0))
-	EX(lwc1 $f14,(SC_FPREGS+112)(a0))
-	EX(lwc1 $f15,(SC_FPREGS+120)(a0))
-	EX(lwc1 $f16,(SC_FPREGS+128)(a0))
-	EX(lwc1 $f17,(SC_FPREGS+136)(a0))
-	EX(lwc1 $f18,(SC_FPREGS+144)(a0))
-	EX(lwc1 $f19,(SC_FPREGS+152)(a0))
-	EX(lwc1 $f20,(SC_FPREGS+160)(a0))
-	EX(lwc1 $f21,(SC_FPREGS+168)(a0))
-	EX(lwc1 $f22,(SC_FPREGS+176)(a0))
-	EX(lwc1 $f23,(SC_FPREGS+184)(a0))
-	EX(lwc1 $f24,(SC_FPREGS+192)(a0))
-	EX(lwc1 $f25,(SC_FPREGS+200)(a0))
-	EX(lwc1 $f26,(SC_FPREGS+208)(a0))
-	EX(lwc1 $f27,(SC_FPREGS+216)(a0))
-	EX(lwc1 $f28,(SC_FPREGS+224)(a0))
-	EX(lwc1 $f29,(SC_FPREGS+232)(a0))
-	EX(lwc1 $f30,(SC_FPREGS+240)(a0))
-	EX(lwc1 $f31,(SC_FPREGS+248)(a0))
+	EX(lw t0, (a1))
+	EX2(l.d $f0, 0(a0))
+	EX2(l.d $f2, 16(a0))
+	EX2(l.d $f4, 32(a0))
+	EX2(l.d $f6, 48(a0))
+	EX2(l.d $f8, 64(a0))
+	EX2(l.d $f10, 80(a0))
+	EX2(l.d $f12, 96(a0))
+	EX2(l.d $f14, 112(a0))
+	EX2(l.d $f16, 128(a0))
+	EX2(l.d $f18, 144(a0))
+	EX2(l.d $f20, 160(a0))
+	EX2(l.d $f22, 176(a0))
+	EX2(l.d $f24, 192(a0))
+	EX2(l.d $f26, 208(a0))
+	EX2(l.d $f28, 224(a0))
+	EX2(l.d $f30, 240(a0))
 	jr	ra
-	 ctc1	t0,fcr31
+	 ctc1	t0, fcr31
 	.set	pop
 	END(_restore_fp_context)
 	.set	reorder
diff --git a/arch/mips/kernel/r6000_fpu.S b/arch/mips/kernel/r6000_fpu.S
index 4707738..9cc7bfa 100644
--- a/arch/mips/kernel/r6000_fpu.S
+++ b/arch/mips/kernel/r6000_fpu.S
@@ -21,7 +21,14 @@
 	.set	push
 	SET_HARDFLOAT
 
-	/* Save floating point context */
+/**
+ * _save_fp_context() - save FP context from the FPU
+ * @a0 - pointer to fpregs field of sigcontext
+ * @a1 - pointer to fpc_csr field of sigcontext
+ *
+ * Save FP context, including the 32 FP data registers and the FP
+ * control & status register, from the FPU to signal context.
+ */
 	LEAF(_save_fp_context)
 	mfc0	t0,CP0_STATUS
 	sll	t0,t0,2
@@ -30,59 +37,59 @@
 
 	cfc1	t1,fcr31
 	/* Store the 16 double precision registers */
-	sdc1	$f0,(SC_FPREGS+0)(a0)
-	sdc1	$f2,(SC_FPREGS+16)(a0)
-	sdc1	$f4,(SC_FPREGS+32)(a0)
-	sdc1	$f6,(SC_FPREGS+48)(a0)
-	sdc1	$f8,(SC_FPREGS+64)(a0)
-	sdc1	$f10,(SC_FPREGS+80)(a0)
-	sdc1	$f12,(SC_FPREGS+96)(a0)
-	sdc1	$f14,(SC_FPREGS+112)(a0)
-	sdc1	$f16,(SC_FPREGS+128)(a0)
-	sdc1	$f18,(SC_FPREGS+144)(a0)
-	sdc1	$f20,(SC_FPREGS+160)(a0)
-	sdc1	$f22,(SC_FPREGS+176)(a0)
-	sdc1	$f24,(SC_FPREGS+192)(a0)
-	sdc1	$f26,(SC_FPREGS+208)(a0)
-	sdc1	$f28,(SC_FPREGS+224)(a0)
-	sdc1	$f30,(SC_FPREGS+240)(a0)
+	sdc1	$f0,0(a0)
+	sdc1	$f2,16(a0)
+	sdc1	$f4,32(a0)
+	sdc1	$f6,48(a0)
+	sdc1	$f8,64(a0)
+	sdc1	$f10,80(a0)
+	sdc1	$f12,96(a0)
+	sdc1	$f14,112(a0)
+	sdc1	$f16,128(a0)
+	sdc1	$f18,144(a0)
+	sdc1	$f20,160(a0)
+	sdc1	$f22,176(a0)
+	sdc1	$f24,192(a0)
+	sdc1	$f26,208(a0)
+	sdc1	$f28,224(a0)
+	sdc1	$f30,240(a0)
 	jr	ra
-	 sw	t0,SC_FPC_CSR(a0)
+	 sw	t0,(a1)
 1:	jr	ra
 	 nop
 	END(_save_fp_context)
 
-/* Restore FPU state:
- *  - fp gp registers
- *  - cp1 status/control register
+/**
+ * _restore_fp_context() - restore FP context to the FPU
+ * @a0 - pointer to fpregs field of sigcontext
+ * @a1 - pointer to fpc_csr field of sigcontext
  *
- * We base the decision which registers to restore from the signal stack
- * frame on the current content of c0_status, not on the content of the
- * stack frame which might have been changed by the user.
+ * Restore FP context, including the 32 FP data registers and the FP
+ * control & status register, from signal context to the FPU.
  */
 	LEAF(_restore_fp_context)
 	mfc0	t0,CP0_STATUS
 	sll	t0,t0,2
 
 	bgez	t0,1f
-	 lw	t0,SC_FPC_CSR(a0)
+	 lw	t0,(a1)
 	/* Restore the 16 double precision registers */
-	ldc1	$f0,(SC_FPREGS+0)(a0)
-	ldc1	$f2,(SC_FPREGS+16)(a0)
-	ldc1	$f4,(SC_FPREGS+32)(a0)
-	ldc1	$f6,(SC_FPREGS+48)(a0)
-	ldc1	$f8,(SC_FPREGS+64)(a0)
-	ldc1	$f10,(SC_FPREGS+80)(a0)
-	ldc1	$f12,(SC_FPREGS+96)(a0)
-	ldc1	$f14,(SC_FPREGS+112)(a0)
-	ldc1	$f16,(SC_FPREGS+128)(a0)
-	ldc1	$f18,(SC_FPREGS+144)(a0)
-	ldc1	$f20,(SC_FPREGS+160)(a0)
-	ldc1	$f22,(SC_FPREGS+176)(a0)
-	ldc1	$f24,(SC_FPREGS+192)(a0)
-	ldc1	$f26,(SC_FPREGS+208)(a0)
-	ldc1	$f28,(SC_FPREGS+224)(a0)
-	ldc1	$f30,(SC_FPREGS+240)(a0)
+	ldc1	$f0,0(a0)
+	ldc1	$f2,16(a0)
+	ldc1	$f4,32(a0)
+	ldc1	$f6,48(a0)
+	ldc1	$f8,64(a0)
+	ldc1	$f10,80(a0)
+	ldc1	$f12,96(a0)
+	ldc1	$f14,112(a0)
+	ldc1	$f16,128(a0)
+	ldc1	$f18,144(a0)
+	ldc1	$f20,160(a0)
+	ldc1	$f22,176(a0)
+	ldc1	$f24,192(a0)
+	ldc1	$f26,208(a0)
+	ldc1	$f28,224(a0)
+	ldc1	$f30,240(a0)
 	jr	ra
 	 ctc1	t0,fcr31
 1:	jr	ra
diff --git a/arch/mips/kernel/relocate.c b/arch/mips/kernel/relocate.c
index ca1cc30..1958910 100644
--- a/arch/mips/kernel/relocate.c
+++ b/arch/mips/kernel/relocate.c
@@ -200,7 +200,7 @@
 
 #if defined(CONFIG_USE_OF)
 	/* Get any additional entropy passed in device tree */
-	{
+	if (initial_boot_params) {
 		int node, len;
 		u64 *prop;
 
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 0d57909..f66e5ce 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -368,6 +368,19 @@
 		end = PFN_DOWN(boot_mem_map.map[i].addr
 				+ boot_mem_map.map[i].size);
 
+#ifndef CONFIG_HIGHMEM
+		/*
+		 * Skip highmem here so we get an accurate max_low_pfn if low
+		 * memory stops short of high memory.
+		 * If the region overlaps HIGHMEM_START, end is clipped so
+		 * max_pfn excludes the highmem portion.
+		 */
+		if (start >= PFN_DOWN(HIGHMEM_START))
+			continue;
+		if (end > PFN_DOWN(HIGHMEM_START))
+			end = PFN_DOWN(HIGHMEM_START);
+#endif
+
 		if (end > max_low_pfn)
 			max_low_pfn = end;
 		if (start < min_low_pfn)
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 1f5fdee..3905003 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -156,7 +156,7 @@
 		print_ip_sym(pc);
 		pc = unwind_stack(task, &sp, pc, &ra);
 	} while (pc);
-	printk("\n");
+	pr_cont("\n");
 }
 
 /*
@@ -174,22 +174,24 @@
 	printk("Stack :");
 	i = 0;
 	while ((unsigned long) sp & (PAGE_SIZE - 1)) {
-		if (i && ((i % (64 / field)) == 0))
-			printk("\n	 ");
+		if (i && ((i % (64 / field)) == 0)) {
+			pr_cont("\n");
+			printk("       ");
+		}
 		if (i > 39) {
-			printk(" ...");
+			pr_cont(" ...");
 			break;
 		}
 
 		if (__get_user(stackdata, sp++)) {
-			printk(" (Bad stack address)");
+			pr_cont(" (Bad stack address)");
 			break;
 		}
 
-		printk(" %0*lx", field, stackdata);
+		pr_cont(" %0*lx", field, stackdata);
 		i++;
 	}
-	printk("\n");
+	pr_cont("\n");
 	show_backtrace(task, regs);
 }
 
@@ -229,18 +231,19 @@
 	long i;
 	unsigned short __user *pc16 = NULL;
 
-	printk("\nCode:");
+	printk("Code:");
 
 	if ((unsigned long)pc & 1)
 		pc16 = (unsigned short __user *)((unsigned long)pc & ~1);
 	for(i = -3 ; i < 6 ; i++) {
 		unsigned int insn;
 		if (pc16 ? __get_user(insn, pc16 + i) : __get_user(insn, pc + i)) {
-			printk(" (Bad address in epc)\n");
+			pr_cont(" (Bad address in epc)\n");
 			break;
 		}
-		printk("%c%0*x%c", (i?' ':'<'), pc16 ? 4 : 8, insn, (i?' ':'>'));
+		pr_cont("%c%0*x%c", (i?' ':'<'), pc16 ? 4 : 8, insn, (i?' ':'>'));
 	}
+	pr_cont("\n");
 }
 
 static void __show_regs(const struct pt_regs *regs)
@@ -259,15 +262,15 @@
 		if ((i % 4) == 0)
 			printk("$%2d   :", i);
 		if (i == 0)
-			printk(" %0*lx", field, 0UL);
+			pr_cont(" %0*lx", field, 0UL);
 		else if (i == 26 || i == 27)
-			printk(" %*s", field, "");
+			pr_cont(" %*s", field, "");
 		else
-			printk(" %0*lx", field, regs->regs[i]);
+			pr_cont(" %0*lx", field, regs->regs[i]);
 
 		i++;
 		if ((i % 4) == 0)
-			printk("\n");
+			pr_cont("\n");
 	}
 
 #ifdef CONFIG_CPU_HAS_SMARTMIPS
@@ -288,46 +291,46 @@
 
 	if (cpu_has_3kex) {
 		if (regs->cp0_status & ST0_KUO)
-			printk("KUo ");
+			pr_cont("KUo ");
 		if (regs->cp0_status & ST0_IEO)
-			printk("IEo ");
+			pr_cont("IEo ");
 		if (regs->cp0_status & ST0_KUP)
-			printk("KUp ");
+			pr_cont("KUp ");
 		if (regs->cp0_status & ST0_IEP)
-			printk("IEp ");
+			pr_cont("IEp ");
 		if (regs->cp0_status & ST0_KUC)
-			printk("KUc ");
+			pr_cont("KUc ");
 		if (regs->cp0_status & ST0_IEC)
-			printk("IEc ");
+			pr_cont("IEc ");
 	} else if (cpu_has_4kex) {
 		if (regs->cp0_status & ST0_KX)
-			printk("KX ");
+			pr_cont("KX ");
 		if (regs->cp0_status & ST0_SX)
-			printk("SX ");
+			pr_cont("SX ");
 		if (regs->cp0_status & ST0_UX)
-			printk("UX ");
+			pr_cont("UX ");
 		switch (regs->cp0_status & ST0_KSU) {
 		case KSU_USER:
-			printk("USER ");
+			pr_cont("USER ");
 			break;
 		case KSU_SUPERVISOR:
-			printk("SUPERVISOR ");
+			pr_cont("SUPERVISOR ");
 			break;
 		case KSU_KERNEL:
-			printk("KERNEL ");
+			pr_cont("KERNEL ");
 			break;
 		default:
-			printk("BAD_MODE ");
+			pr_cont("BAD_MODE ");
 			break;
 		}
 		if (regs->cp0_status & ST0_ERL)
-			printk("ERL ");
+			pr_cont("ERL ");
 		if (regs->cp0_status & ST0_EXL)
-			printk("EXL ");
+			pr_cont("EXL ");
 		if (regs->cp0_status & ST0_IE)
-			printk("IE ");
+			pr_cont("IE ");
 	}
-	printk("\n");
+	pr_cont("\n");
 
 	exccode = (cause & CAUSEF_EXCCODE) >> CAUSEB_EXCCODE;
 	printk("Cause : %08x (ExcCode %02x)\n", cause, exccode);
@@ -705,6 +708,32 @@
 	exception_exit(prev_state);
 }
 
+/*
+ * Send SIGFPE according to FCSR Cause bits, which must have already
+ * been masked against Enable bits.  This is impotant as Inexact can
+ * happen together with Overflow or Underflow, and `ptrace' can set
+ * any bits.
+ */
+void force_fcr31_sig(unsigned long fcr31, void __user *fault_addr,
+		     struct task_struct *tsk)
+{
+	struct siginfo si = { .si_addr = fault_addr, .si_signo = SIGFPE };
+
+	if (fcr31 & FPU_CSR_INV_X)
+		si.si_code = FPE_FLTINV;
+	else if (fcr31 & FPU_CSR_DIV_X)
+		si.si_code = FPE_FLTDIV;
+	else if (fcr31 & FPU_CSR_OVF_X)
+		si.si_code = FPE_FLTOVF;
+	else if (fcr31 & FPU_CSR_UDF_X)
+		si.si_code = FPE_FLTUND;
+	else if (fcr31 & FPU_CSR_INE_X)
+		si.si_code = FPE_FLTRES;
+	else
+		si.si_code = __SI_FAULT;
+	force_sig_info(SIGFPE, &si, tsk);
+}
+
 int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31)
 {
 	struct siginfo si = { 0 };
@@ -715,27 +744,7 @@
 		return 0;
 
 	case SIGFPE:
-		si.si_addr = fault_addr;
-		si.si_signo = sig;
-		/*
-		 * Inexact can happen together with Overflow or Underflow.
-		 * Respect the mask to deliver the correct exception.
-		 */
-		fcr31 &= (fcr31 & FPU_CSR_ALL_E) <<
-			 (ffs(FPU_CSR_ALL_X) - ffs(FPU_CSR_ALL_E));
-		if (fcr31 & FPU_CSR_INV_X)
-			si.si_code = FPE_FLTINV;
-		else if (fcr31 & FPU_CSR_DIV_X)
-			si.si_code = FPE_FLTDIV;
-		else if (fcr31 & FPU_CSR_OVF_X)
-			si.si_code = FPE_FLTOVF;
-		else if (fcr31 & FPU_CSR_UDF_X)
-			si.si_code = FPE_FLTUND;
-		else if (fcr31 & FPU_CSR_INE_X)
-			si.si_code = FPE_FLTRES;
-		else
-			si.si_code = __SI_FAULT;
-		force_sig_info(sig, &si, current);
+		force_fcr31_sig(fcr31, fault_addr, current);
 		return 1;
 
 	case SIGBUS:
@@ -799,13 +808,13 @@
 	/* Run the emulator */
 	sig = fpu_emulator_cop1Handler(regs, &current->thread.fpu, 1,
 				       &fault_addr);
-	fcr31 = current->thread.fpu.fcr31;
 
 	/*
-	 * We can't allow the emulated instruction to leave any of
-	 * the cause bits set in $fcr31.
+	 * We can't allow the emulated instruction to leave any
+	 * enabled Cause bits set in $fcr31.
 	 */
-	current->thread.fpu.fcr31 &= ~FPU_CSR_ALL_X;
+	fcr31 = mask_fcr31_x(current->thread.fpu.fcr31);
+	current->thread.fpu.fcr31 &= ~fcr31;
 
 	/* Restore the hardware register state */
 	own_fpu(1);
@@ -831,7 +840,7 @@
 		goto out;
 
 	/* Clear FCSR.Cause before enabling interrupts */
-	write_32bit_cp1_register(CP1_STATUS, fcr31 & ~FPU_CSR_ALL_X);
+	write_32bit_cp1_register(CP1_STATUS, fcr31 & ~mask_fcr31_x(fcr31));
 	local_irq_enable();
 
 	die_if_kernel("FP exception in kernel code", regs);
@@ -853,13 +862,13 @@
 		/* Run the emulator */
 		sig = fpu_emulator_cop1Handler(regs, &current->thread.fpu, 1,
 					       &fault_addr);
-		fcr31 = current->thread.fpu.fcr31;
 
 		/*
-		 * We can't allow the emulated instruction to leave any of
-		 * the cause bits set in $fcr31.
+		 * We can't allow the emulated instruction to leave any
+		 * enabled Cause bits set in $fcr31.
 		 */
-		current->thread.fpu.fcr31 &= ~FPU_CSR_ALL_X;
+		fcr31 = mask_fcr31_x(current->thread.fpu.fcr31);
+		current->thread.fpu.fcr31 &= ~fcr31;
 
 		/* Restore the hardware register state */
 		own_fpu(1);	/* Using the FPU again.	 */
@@ -1424,13 +1433,13 @@
 
 		sig = fpu_emulator_cop1Handler(regs, &current->thread.fpu, 0,
 					       &fault_addr);
-		fcr31 = current->thread.fpu.fcr31;
 
 		/*
 		 * We can't allow the emulated instruction to leave
-		 * any of the cause bits set in $fcr31.
+		 * any enabled Cause bits set in $fcr31.
 		 */
-		current->thread.fpu.fcr31 &= ~FPU_CSR_ALL_X;
+		fcr31 = mask_fcr31_x(current->thread.fpu.fcr31);
+		current->thread.fpu.fcr31 &= ~fcr31;
 
 		/* Send a signal if required.  */
 		if (!process_fpemu_return(sig, fault_addr, fcr31) && !err)
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 8770f32..aa09374 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -790,15 +790,15 @@
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
 	enum emulation_result er = EMULATE_DONE;
 
-	if (kvm_read_c0_guest_status(cop0) & ST0_EXL) {
+	if (kvm_read_c0_guest_status(cop0) & ST0_ERL) {
+		kvm_clear_c0_guest_status(cop0, ST0_ERL);
+		vcpu->arch.pc = kvm_read_c0_guest_errorepc(cop0);
+	} else if (kvm_read_c0_guest_status(cop0) & ST0_EXL) {
 		kvm_debug("[%#lx] ERET to %#lx\n", vcpu->arch.pc,
 			  kvm_read_c0_guest_epc(cop0));
 		kvm_clear_c0_guest_status(cop0, ST0_EXL);
 		vcpu->arch.pc = kvm_read_c0_guest_epc(cop0);
 
-	} else if (kvm_read_c0_guest_status(cop0) & ST0_ERL) {
-		kvm_clear_c0_guest_status(cop0, ST0_ERL);
-		vcpu->arch.pc = kvm_read_c0_guest_errorepc(cop0);
 	} else {
 		kvm_err("[%#lx] ERET when MIPS_SR_EXL|MIPS_SR_ERL == 0\n",
 			vcpu->arch.pc);
@@ -1528,13 +1528,25 @@
 					    struct kvm_vcpu *vcpu)
 {
 	enum emulation_result er = EMULATE_DO_MMIO;
+	unsigned long curr_pc;
 	u32 op, rt;
 	u32 bytes;
 
 	rt = inst.i_format.rt;
 	op = inst.i_format.opcode;
 
-	vcpu->arch.pending_load_cause = cause;
+	/*
+	 * Find the resume PC now while we have safe and easy access to the
+	 * prior branch instruction, and save it for
+	 * kvm_mips_complete_mmio_load() to restore later.
+	 */
+	curr_pc = vcpu->arch.pc;
+	er = update_pc(vcpu, cause);
+	if (er == EMULATE_FAIL)
+		return er;
+	vcpu->arch.io_pc = vcpu->arch.pc;
+	vcpu->arch.pc = curr_pc;
+
 	vcpu->arch.io_gpr = rt;
 
 	switch (op) {
@@ -2494,9 +2506,8 @@
 		goto done;
 	}
 
-	er = update_pc(vcpu, vcpu->arch.pending_load_cause);
-	if (er == EMULATE_FAIL)
-		return er;
+	/* Restore saved resume PC */
+	vcpu->arch.pc = vcpu->arch.io_pc;
 
 	switch (run->mmio.len) {
 	case 4:
@@ -2518,11 +2529,6 @@
 		break;
 	}
 
-	if (vcpu->arch.pending_load_cause & CAUSEF_BD)
-		kvm_debug("[%#lx] Completing %d byte BD Load to gpr %d (0x%08lx) type %d\n",
-			  vcpu->arch.pc, run->mmio.len, vcpu->arch.io_gpr, *gpr,
-			  vcpu->mmio_needed);
-
 done:
 	return er;
 }
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index ce96149..06a60b1 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -14,6 +14,7 @@
 #include <linux/err.h>
 #include <linux/kdebug.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/fs.h>
 #include <linux/bootmem.h>
@@ -425,7 +426,7 @@
 static void kvm_mips_check_asids(struct kvm_vcpu *vcpu)
 {
 	struct mips_coproc *cop0 = vcpu->arch.cop0;
-	int cpu = smp_processor_id();
+	int i, cpu = smp_processor_id();
 	unsigned int gasid;
 
 	/*
@@ -441,6 +442,9 @@
 						vcpu);
 			vcpu->arch.guest_user_asid[cpu] =
 				vcpu->arch.guest_user_mm.context.asid[cpu];
+			for_each_possible_cpu(i)
+				if (i != cpu)
+					vcpu->arch.guest_user_asid[cpu] = 0;
 			vcpu->arch.last_user_gasid = gasid;
 		}
 	}
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 03883ba..3b677c8 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -260,13 +260,9 @@
 
 	if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) &
 						asid_version_mask(cpu)) {
-		u32 gasid = kvm_read_c0_guest_entryhi(vcpu->arch.cop0) &
-				KVM_ENTRYHI_ASID;
-
 		kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
 		vcpu->arch.guest_user_asid[cpu] =
 		    vcpu->arch.guest_user_mm.context.asid[cpu];
-		vcpu->arch.last_user_gasid = gasid;
 		newasid++;
 
 		kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
diff --git a/arch/mips/lib/dump_tlb.c b/arch/mips/lib/dump_tlb.c
index 0f80b93..6eb50a7 100644
--- a/arch/mips/lib/dump_tlb.c
+++ b/arch/mips/lib/dump_tlb.c
@@ -135,42 +135,42 @@
 		c0 = (entrylo0 & ENTRYLO_C) >> ENTRYLO_C_SHIFT;
 		c1 = (entrylo1 & ENTRYLO_C) >> ENTRYLO_C_SHIFT;
 
-		printk("va=%0*lx asid=%0*lx",
-		       vwidth, (entryhi & ~0x1fffUL),
-		       asidwidth, entryhi & asidmask);
+		pr_cont("va=%0*lx asid=%0*lx",
+			vwidth, (entryhi & ~0x1fffUL),
+			asidwidth, entryhi & asidmask);
 		if (cpu_has_guestid)
-			printk(" gid=%02lx",
-			       (guestctl1 & MIPS_GCTL1_RID)
+			pr_cont(" gid=%02lx",
+				(guestctl1 & MIPS_GCTL1_RID)
 					>> MIPS_GCTL1_RID_SHIFT);
 		/* RI/XI are in awkward places, so mask them off separately */
 		pa = entrylo0 & ~(MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI);
 		if (xpa)
 			pa |= (unsigned long long)readx_c0_entrylo0() << 30;
 		pa = (pa << 6) & PAGE_MASK;
-		printk("\n\t[");
+		pr_cont("\n\t[");
 		if (cpu_has_rixi)
-			printk("ri=%d xi=%d ",
-			       (entrylo0 & MIPS_ENTRYLO_RI) ? 1 : 0,
-			       (entrylo0 & MIPS_ENTRYLO_XI) ? 1 : 0);
-		printk("pa=%0*llx c=%d d=%d v=%d g=%d] [",
-		       pwidth, pa, c0,
-		       (entrylo0 & ENTRYLO_D) ? 1 : 0,
-		       (entrylo0 & ENTRYLO_V) ? 1 : 0,
-		       (entrylo0 & ENTRYLO_G) ? 1 : 0);
+			pr_cont("ri=%d xi=%d ",
+				(entrylo0 & MIPS_ENTRYLO_RI) ? 1 : 0,
+				(entrylo0 & MIPS_ENTRYLO_XI) ? 1 : 0);
+		pr_cont("pa=%0*llx c=%d d=%d v=%d g=%d] [",
+			pwidth, pa, c0,
+			(entrylo0 & ENTRYLO_D) ? 1 : 0,
+			(entrylo0 & ENTRYLO_V) ? 1 : 0,
+			(entrylo0 & ENTRYLO_G) ? 1 : 0);
 		/* RI/XI are in awkward places, so mask them off separately */
 		pa = entrylo1 & ~(MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI);
 		if (xpa)
 			pa |= (unsigned long long)readx_c0_entrylo1() << 30;
 		pa = (pa << 6) & PAGE_MASK;
 		if (cpu_has_rixi)
-			printk("ri=%d xi=%d ",
-			       (entrylo1 & MIPS_ENTRYLO_RI) ? 1 : 0,
-			       (entrylo1 & MIPS_ENTRYLO_XI) ? 1 : 0);
-		printk("pa=%0*llx c=%d d=%d v=%d g=%d]\n",
-		       pwidth, pa, c1,
-		       (entrylo1 & ENTRYLO_D) ? 1 : 0,
-		       (entrylo1 & ENTRYLO_V) ? 1 : 0,
-		       (entrylo1 & ENTRYLO_G) ? 1 : 0);
+			pr_cont("ri=%d xi=%d ",
+				(entrylo1 & MIPS_ENTRYLO_RI) ? 1 : 0,
+				(entrylo1 & MIPS_ENTRYLO_XI) ? 1 : 0);
+		pr_cont("pa=%0*llx c=%d d=%d v=%d g=%d]\n",
+			pwidth, pa, c1,
+			(entrylo1 & ENTRYLO_D) ? 1 : 0,
+			(entrylo1 & ENTRYLO_V) ? 1 : 0,
+			(entrylo1 & ENTRYLO_G) ? 1 : 0);
 	}
 	printk("\n");
 
diff --git a/arch/mips/lib/r3k_dump_tlb.c b/arch/mips/lib/r3k_dump_tlb.c
index 744f4a7..85b4086 100644
--- a/arch/mips/lib/r3k_dump_tlb.c
+++ b/arch/mips/lib/r3k_dump_tlb.c
@@ -53,15 +53,15 @@
 			 */
 			printk("Index: %2d ", i);
 
-			printk("va=%08lx asid=%08lx"
-			       "  [pa=%06lx n=%d d=%d v=%d g=%d]",
-			       entryhi & PAGE_MASK,
-			       entryhi & asid_mask,
-			       entrylo0 & PAGE_MASK,
-			       (entrylo0 & R3K_ENTRYLO_N) ? 1 : 0,
-			       (entrylo0 & R3K_ENTRYLO_D) ? 1 : 0,
-			       (entrylo0 & R3K_ENTRYLO_V) ? 1 : 0,
-			       (entrylo0 & R3K_ENTRYLO_G) ? 1 : 0);
+			pr_cont("va=%08lx asid=%08lx"
+				"  [pa=%06lx n=%d d=%d v=%d g=%d]",
+				entryhi & PAGE_MASK,
+				entryhi & asid_mask,
+				entrylo0 & PAGE_MASK,
+				(entrylo0 & R3K_ENTRYLO_N) ? 1 : 0,
+				(entrylo0 & R3K_ENTRYLO_D) ? 1 : 0,
+				(entrylo0 & R3K_ENTRYLO_V) ? 1 : 0,
+				(entrylo0 & R3K_ENTRYLO_G) ? 1 : 0);
 		}
 	}
 	printk("\n");
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index d56a855..3bef306 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -209,17 +209,18 @@
 		if (show_unhandled_signals &&
 		    unhandled_signal(tsk, SIGSEGV) &&
 		    __ratelimit(&ratelimit_state)) {
-			pr_info("\ndo_page_fault(): sending SIGSEGV to %s for invalid %s %0*lx",
+			pr_info("do_page_fault(): sending SIGSEGV to %s for invalid %s %0*lx\n",
 				tsk->comm,
 				write ? "write access to" : "read access from",
 				field, address);
 			pr_info("epc = %0*lx in", field,
 				(unsigned long) regs->cp0_epc);
-			print_vma_addr(" ", regs->cp0_epc);
+			print_vma_addr(KERN_CONT " ", regs->cp0_epc);
+			pr_cont("\n");
 			pr_info("ra  = %0*lx in", field,
 				(unsigned long) regs->regs[31]);
-			print_vma_addr(" ", regs->regs[31]);
-			pr_info("\n");
+			print_vma_addr(KERN_CONT " ", regs->regs[31]);
+			pr_cont("\n");
 		}
 		current->thread.trap_nr = (regs->cp0_cause >> 2) & 0x1f;
 		info.si_signo = SIGSEGV;
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 42d124f..d8c3c15 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -287,7 +287,7 @@
 	pages += nr;
 
 	ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT,
-				      write, 0, pages);
+				      pages, write ? FOLL_WRITE : 0);
 
 	/* Have to be a bit careful with return values */
 	if (nr > 0) {
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 3a6edec..e86ebcf 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -118,7 +118,7 @@
 		writex_c0_entrylo1(entrylo);
 	}
 #endif
-	tlbidx = read_c0_wired();
+	tlbidx = num_wired_entries();
 	write_c0_wired(tlbidx + 1);
 	write_c0_index(tlbidx);
 	mtc0_tlbw_hazard();
@@ -147,7 +147,7 @@
 
 	local_irq_save(flags);
 	old_ctx = read_c0_entryhi();
-	wired = read_c0_wired() - 1;
+	wired = num_wired_entries() - 1;
 	write_c0_wired(wired);
 	write_c0_index(wired);
 	write_c0_entryhi(UNIQUE_ENTRYHI(wired));
diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
index bba9c14..0596505 100644
--- a/arch/mips/mm/tlb-r4k.c
+++ b/arch/mips/mm/tlb-r4k.c
@@ -65,7 +65,7 @@
 	write_c0_entrylo0(0);
 	write_c0_entrylo1(0);
 
-	entry = read_c0_wired();
+	entry = num_wired_entries();
 
 	/*
 	 * Blast 'em all away.
@@ -385,7 +385,7 @@
 	old_ctx = read_c0_entryhi();
 	htw_stop();
 	old_pagemask = read_c0_pagemask();
-	wired = read_c0_wired();
+	wired = num_wired_entries();
 	write_c0_wired(wired + 1);
 	write_c0_index(wired);
 	tlbw_use_hazard();	/* What is the hazard here? */
@@ -449,7 +449,7 @@
 	htw_stop();
 	old_ctx = read_c0_entryhi();
 	old_pagemask = read_c0_pagemask();
-	wired = read_c0_wired();
+	wired = num_wired_entries();
 	if (--temp_tlb_entry < wired) {
 		printk(KERN_WARNING
 		       "No TLB space left for add_temporary_entry\n");
diff --git a/arch/nios2/kernel/time.c b/arch/nios2/kernel/time.c
index d9563dd..746bf5c 100644
--- a/arch/nios2/kernel/time.c
+++ b/arch/nios2/kernel/time.c
@@ -324,6 +324,7 @@
 		ret = nios2_clocksource_init(timer);
 		break;
 	default:
+		ret = 0;
 		break;
 	}
 
diff --git a/arch/openrisc/include/asm/cache.h b/arch/openrisc/include/asm/cache.h
index 4ce7a01..5f55da9 100644
--- a/arch/openrisc/include/asm/cache.h
+++ b/arch/openrisc/include/asm/cache.h
@@ -23,6 +23,8 @@
  * they shouldn't be hard-coded!
  */
 
+#define __ro_after_init __read_mostly
+
 #define L1_CACHE_BYTES 16
 #define L1_CACHE_SHIFT 4
 
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 71c4a3a..a14b865 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -34,7 +34,9 @@
 	select HAVE_ARCH_HASH
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
-	select HAVE_UNSTABLE_SCHED_CLOCK if (SMP || !64BIT)
+	select GENERIC_SCHED_CLOCK
+	select HAVE_UNSTABLE_SCHED_CLOCK if SMP
+	select GENERIC_CLOCKEVENTS
 	select ARCH_NO_COHERENT_DMA_MMAP
 	select CPU_NO_EFFICIENT_FFS
 
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index c2c43f7..3a4ed9f 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -65,9 +65,9 @@
 		unsigned long flags;				\
 		spin_lock_irqsave(&pa_tlb_lock, flags);		\
 		old_pte = *ptep;				\
-		set_pte(ptep, pteval);				\
 		if (pte_inserted(old_pte))			\
 			purge_tlb_entries(mm, addr);		\
+		set_pte(ptep, pteval);				\
 		spin_unlock_irqrestore(&pa_tlb_lock, flags);	\
 	} while (0)
 
@@ -478,8 +478,8 @@
 		spin_unlock_irqrestore(&pa_tlb_lock, flags);
 		return 0;
 	}
-	set_pte(ptep, pte_mkold(pte));
 	purge_tlb_entries(vma->vm_mm, addr);
+	set_pte(ptep, pte_mkold(pte));
 	spin_unlock_irqrestore(&pa_tlb_lock, flags);
 	return 1;
 }
@@ -492,9 +492,9 @@
 
 	spin_lock_irqsave(&pa_tlb_lock, flags);
 	old_pte = *ptep;
-	set_pte(ptep, __pte(0));
 	if (pte_inserted(old_pte))
 		purge_tlb_entries(mm, addr);
+	set_pte(ptep, __pte(0));
 	spin_unlock_irqrestore(&pa_tlb_lock, flags);
 
 	return old_pte;
@@ -504,8 +504,8 @@
 {
 	unsigned long flags;
 	spin_lock_irqsave(&pa_tlb_lock, flags);
-	set_pte(ptep, pte_wrprotect(*ptep));
 	purge_tlb_entries(mm, addr);
+	set_pte(ptep, pte_wrprotect(*ptep));
 	spin_unlock_irqrestore(&pa_tlb_lock, flags);
 }
 
diff --git a/arch/parisc/include/uapi/asm/unistd.h b/arch/parisc/include/uapi/asm/unistd.h
index a9b9407..6b0741e 100644
--- a/arch/parisc/include/uapi/asm/unistd.h
+++ b/arch/parisc/include/uapi/asm/unistd.h
@@ -368,7 +368,9 @@
 
 #define __IGNORE_select		/* newselect */
 #define __IGNORE_fadvise64	/* fadvise64_64 */
-
+#define __IGNORE_pkey_mprotect
+#define __IGNORE_pkey_alloc
+#define __IGNORE_pkey_free
 
 #define LINUX_GATEWAY_ADDR      0x100
 
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index 629eb46..977f0a4f 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -369,6 +369,7 @@
 {
 	unsigned long rangetime, alltime;
 	unsigned long size, start;
+	unsigned long threshold;
 
 	alltime = mfctl(16);
 	flush_data_cache();
@@ -382,26 +383,30 @@
 	printk(KERN_DEBUG "Whole cache flush %lu cycles, flushing %lu bytes %lu cycles\n",
 		alltime, size, rangetime);
 
-	/* Racy, but if we see an intermediate value, it's ok too... */
-	parisc_cache_flush_threshold = size * alltime / rangetime;
-
-	parisc_cache_flush_threshold = L1_CACHE_ALIGN(parisc_cache_flush_threshold);
-	if (!parisc_cache_flush_threshold)
-		parisc_cache_flush_threshold = FLUSH_THRESHOLD;
-
-	if (parisc_cache_flush_threshold > cache_info.dc_size)
-		parisc_cache_flush_threshold = cache_info.dc_size;
-
-	printk(KERN_INFO "Setting cache flush threshold to %lu kB\n",
+	threshold = L1_CACHE_ALIGN(size * alltime / rangetime);
+	if (threshold > cache_info.dc_size)
+		threshold = cache_info.dc_size;
+	if (threshold)
+		parisc_cache_flush_threshold = threshold;
+	printk(KERN_INFO "Cache flush threshold set to %lu KiB\n",
 		parisc_cache_flush_threshold/1024);
 
 	/* calculate TLB flush threshold */
 
+	/* On SMP machines, skip the TLB measure of kernel text which
+	 * has been mapped as huge pages. */
+	if (num_online_cpus() > 1 && !parisc_requires_coherency()) {
+		threshold = max(cache_info.it_size, cache_info.dt_size);
+		threshold *= PAGE_SIZE;
+		threshold /= num_online_cpus();
+		goto set_tlb_threshold;
+	}
+
 	alltime = mfctl(16);
 	flush_tlb_all();
 	alltime = mfctl(16) - alltime;
 
-	size = PAGE_SIZE;
+	size = 0;
 	start = (unsigned long) _text;
 	rangetime = mfctl(16);
 	while (start < (unsigned long) _end) {
@@ -414,13 +419,12 @@
 	printk(KERN_DEBUG "Whole TLB flush %lu cycles, flushing %lu bytes %lu cycles\n",
 		alltime, size, rangetime);
 
-	parisc_tlb_flush_threshold = size * alltime / rangetime;
-	parisc_tlb_flush_threshold *= num_online_cpus();
-	parisc_tlb_flush_threshold = PAGE_ALIGN(parisc_tlb_flush_threshold);
-	if (!parisc_tlb_flush_threshold)
-		parisc_tlb_flush_threshold = FLUSH_TLB_THRESHOLD;
+	threshold = PAGE_ALIGN(num_online_cpus() * size * alltime / rangetime);
 
-	printk(KERN_INFO "Setting TLB flush threshold to %lu kB\n",
+set_tlb_threshold:
+	if (threshold)
+		parisc_tlb_flush_threshold = threshold;
+	printk(KERN_INFO "TLB flush threshold set to %lu KiB\n",
 		parisc_tlb_flush_threshold/1024);
 }
 
diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c
index f815066..700e2d2 100644
--- a/arch/parisc/kernel/drivers.c
+++ b/arch/parisc/kernel/drivers.c
@@ -873,11 +873,11 @@
 
 	if (dev->num_addrs) {
 		int k;
-		printk(", additional addresses: ");
+		pr_cont(", additional addresses: ");
 		for (k = 0; k < dev->num_addrs; k++)
-			printk("0x%lx ", dev->addr[k]);
+			pr_cont("0x%lx ", dev->addr[k]);
 	}
-	printk("\n");
+	pr_cont("\n");
 }
 
 /**
diff --git a/arch/parisc/kernel/inventory.c b/arch/parisc/kernel/inventory.c
index 545f9d2..c05d187 100644
--- a/arch/parisc/kernel/inventory.c
+++ b/arch/parisc/kernel/inventory.c
@@ -58,7 +58,7 @@
 	status = pdc_system_map_find_mods(&module_result, &module_path, 0);
 	if (status == PDC_OK) {
 		pdc_type = PDC_TYPE_SYSTEM_MAP;
-		printk("System Map.\n");
+		pr_cont("System Map.\n");
 		return;
 	}
 
@@ -77,7 +77,7 @@
 	status = pdc_pat_cell_get_number(&cell_info);
 	if (status == PDC_OK) {
 		pdc_type = PDC_TYPE_PAT;
-		printk("64 bit PAT.\n");
+		pr_cont("64 bit PAT.\n");
 		return;
 	}
 #endif
@@ -97,12 +97,12 @@
 	case 0xC:		/* 715/64, at least */
 
 		pdc_type = PDC_TYPE_SNAKE;
-		printk("Snake.\n");
+		pr_cont("Snake.\n");
 		return;
 
 	default:		/* Everything else */
 
-		printk("Unsupported.\n");
+		pr_cont("Unsupported.\n");
 		panic("If this is a 64-bit machine, please try a 64-bit kernel.\n");
 	}
 }
diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S
index 985e06d..adf7187 100644
--- a/arch/parisc/kernel/pacache.S
+++ b/arch/parisc/kernel/pacache.S
@@ -96,7 +96,7 @@
 
 fitmanymiddle:					/* Loop if LOOP >= 2 */
 	addib,COND(>)		-1, %r31, fitmanymiddle	/* Adjusted inner loop decr */
-	pitlbe		0(%sr1, %r28)
+	pitlbe		%r0(%sr1, %r28)
 	pitlbe,m	%arg1(%sr1, %r28)	/* Last pitlbe and addr adjust */
 	addib,COND(>)		-1, %r29, fitmanymiddle	/* Middle loop decr */
 	copy		%arg3, %r31		/* Re-init inner loop count */
@@ -139,7 +139,7 @@
 
 fdtmanymiddle:					/* Loop if LOOP >= 2 */
 	addib,COND(>)		-1, %r31, fdtmanymiddle	/* Adjusted inner loop decr */
-	pdtlbe		0(%sr1, %r28)
+	pdtlbe		%r0(%sr1, %r28)
 	pdtlbe,m	%arg1(%sr1, %r28)	/* Last pdtlbe and addr adjust */
 	addib,COND(>)		-1, %r29, fdtmanymiddle	/* Middle loop decr */
 	copy		%arg3, %r31		/* Re-init inner loop count */
@@ -626,12 +626,12 @@
 	/* Purge any old translations */
 
 #ifdef CONFIG_PA20
-	pdtlb,l		0(%r28)
-	pdtlb,l		0(%r29)
+	pdtlb,l		%r0(%r28)
+	pdtlb,l		%r0(%r29)
 #else
 	tlb_lock	%r20,%r21,%r22
-	pdtlb		0(%r28)
-	pdtlb		0(%r29)
+	pdtlb		%r0(%r28)
+	pdtlb		%r0(%r29)
 	tlb_unlock	%r20,%r21,%r22
 #endif
 
@@ -774,10 +774,10 @@
 	/* Purge any old translation */
 
 #ifdef CONFIG_PA20
-	pdtlb,l		0(%r28)
+	pdtlb,l		%r0(%r28)
 #else
 	tlb_lock	%r20,%r21,%r22
-	pdtlb		0(%r28)
+	pdtlb		%r0(%r28)
 	tlb_unlock	%r20,%r21,%r22
 #endif
 
@@ -858,10 +858,10 @@
 	/* Purge any old translation */
 
 #ifdef CONFIG_PA20
-	pdtlb,l		0(%r28)
+	pdtlb,l		%r0(%r28)
 #else
 	tlb_lock	%r20,%r21,%r22
-	pdtlb		0(%r28)
+	pdtlb		%r0(%r28)
 	tlb_unlock	%r20,%r21,%r22
 #endif
 
@@ -892,19 +892,10 @@
 	fdc,m		r31(%r28)
 	fdc,m		r31(%r28)
 	fdc,m		r31(%r28)
-	cmpb,COND(<<)		%r28, %r25,1b
+	cmpb,COND(<<)	%r28, %r25,1b
 	fdc,m		r31(%r28)
 
 	sync
-
-#ifdef CONFIG_PA20
-	pdtlb,l		0(%r25)
-#else
-	tlb_lock	%r20,%r21,%r22
-	pdtlb		0(%r25)
-	tlb_unlock	%r20,%r21,%r22
-#endif
-
 	bv		%r0(%r2)
 	nop
 	.exit
@@ -931,13 +922,18 @@
 	depwi		0, 31,PAGE_SHIFT, %r28	/* Clear any offset bits */
 #endif
 
-	/* Purge any old translation */
+	/* Purge any old translation.  Note that the FIC instruction
+	 * may use either the instruction or data TLB.  Given that we
+	 * have a flat address space, it's not clear which TLB will be
+	 * used.  So, we purge both entries.  */
 
 #ifdef CONFIG_PA20
+	pdtlb,l		%r0(%r28)
 	pitlb,l         %r0(%sr4,%r28)
 #else
 	tlb_lock        %r20,%r21,%r22
-	pitlb           (%sr4,%r28)
+	pdtlb		%r0(%r28)
+	pitlb           %r0(%sr4,%r28)
 	tlb_unlock      %r20,%r21,%r22
 #endif
 
@@ -974,15 +970,6 @@
 	fic,m		%r31(%sr4,%r28)
 
 	sync
-
-#ifdef CONFIG_PA20
-	pitlb,l         %r0(%sr4,%r25)
-#else
-	tlb_lock        %r20,%r21,%r22
-	pitlb           (%sr4,%r25)
-	tlb_unlock      %r20,%r21,%r22
-#endif
-
 	bv		%r0(%r2)
 	nop
 	.exit
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index 02d9ed0..494ff6e 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -95,8 +95,8 @@
 
 		if (!pte_none(*pte))
 			printk(KERN_ERR "map_pte_uncached: page already exists\n");
-		set_pte(pte, __mk_pte(*paddr_ptr, PAGE_KERNEL_UNC));
 		purge_tlb_start(flags);
+		set_pte(pte, __mk_pte(*paddr_ptr, PAGE_KERNEL_UNC));
 		pdtlb_kernel(orig_vaddr);
 		purge_tlb_end(flags);
 		vaddr += PAGE_SIZE;
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index 81d6f63..2e66a88 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -334,6 +334,10 @@
 	/* tell PDC we're Linux. Nevermind failure. */
 	pdc_stable_write(0x40, &osid, sizeof(osid));
 	
+	/* start with known state */
+	flush_cache_all_local();
+	flush_tlb_all_local(NULL);
+
 	processor_init();
 #ifdef CONFIG_SMP
 	pr_info("CPU(s): %d out of %d %s at %d.%06d MHz online\n",
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index d03422e..23de307 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -100,14 +100,12 @@
 	.endr
 
 /* This address must remain fixed at 0x100 for glibc's syscalls to work */
-	.align 256
+	.align LINUX_GATEWAY_ADDR
 linux_gateway_entry:
 	gate	.+8, %r0			/* become privileged */
 	mtsp	%r0,%sr4			/* get kernel space into sr4 */
 	mtsp	%r0,%sr5			/* get kernel space into sr5 */
 	mtsp	%r0,%sr6			/* get kernel space into sr6 */
-	mfsp    %sr7,%r1                        /* save user sr7 */
-	mtsp    %r1,%sr3                        /* and store it in sr3 */
 
 #ifdef CONFIG_64BIT
 	/* for now we can *always* set the W bit on entry to the syscall
@@ -133,6 +131,14 @@
 	depdi	0, 31, 32, %r21
 1:	
 #endif
+
+	/* We use a rsm/ssm pair to prevent sr3 from being clobbered
+	 * by external interrupts.
+	 */
+	mfsp    %sr7,%r1                        /* save user sr7 */
+	rsm	PSW_SM_I, %r0			/* disable interrupts */
+	mtsp    %r1,%sr3                        /* and store it in sr3 */
+
 	mfctl   %cr30,%r1
 	xor     %r1,%r30,%r30                   /* ye olde xor trick */
 	xor     %r1,%r30,%r1
@@ -147,6 +153,7 @@
 	 */
 
 	mtsp	%r0,%sr7			/* get kernel space into sr7 */
+	ssm	PSW_SM_I, %r0			/* enable interrupts */
 	STREGM	%r1,FRAME_SIZE(%r30)		/* save r1 (usp) here for now */
 	mfctl	%cr30,%r1			/* get task ptr in %r1 */
 	LDREG	TI_TASK(%r1),%r1
@@ -474,11 +481,6 @@
 	comiclr,>>	__NR_lws_entries, %r20, %r0
 	b,n	lws_exit_nosys
 
-	/* WARNING: Trashing sr2 and sr3 */
-	mfsp	%sr7,%r1			/* get userspace into sr3 */
-	mtsp	%r1,%sr3
-	mtsp	%r0,%sr2			/* get kernel space into sr2 */
-
 	/* Load table start */
 	ldil	L%lws_table, %r1
 	ldo	R%lws_table(%r1), %r28	/* Scratch use of r28 */
@@ -627,9 +629,9 @@
 	stw	%r1, 4(%sr2,%r20)
 #endif
 	/* The load and store could fail */
-1:	ldw,ma	0(%sr3,%r26), %r28
+1:	ldw,ma	0(%r26), %r28
 	sub,<>	%r28, %r25, %r0
-2:	stw,ma	%r24, 0(%sr3,%r26)
+2:	stw,ma	%r24, 0(%r26)
 	/* Free lock */
 	stw,ma	%r20, 0(%sr2,%r20)
 #if ENABLE_LWS_DEBUG
@@ -706,9 +708,9 @@
 	nop
 
 	/* 8bit load */
-4:	ldb	0(%sr3,%r25), %r25
+4:	ldb	0(%r25), %r25
 	b	cas2_lock_start
-5:	ldb	0(%sr3,%r24), %r24
+5:	ldb	0(%r24), %r24
 	nop
 	nop
 	nop
@@ -716,9 +718,9 @@
 	nop
 
 	/* 16bit load */
-6:	ldh	0(%sr3,%r25), %r25
+6:	ldh	0(%r25), %r25
 	b	cas2_lock_start
-7:	ldh	0(%sr3,%r24), %r24
+7:	ldh	0(%r24), %r24
 	nop
 	nop
 	nop
@@ -726,9 +728,9 @@
 	nop
 
 	/* 32bit load */
-8:	ldw	0(%sr3,%r25), %r25
+8:	ldw	0(%r25), %r25
 	b	cas2_lock_start
-9:	ldw	0(%sr3,%r24), %r24
+9:	ldw	0(%r24), %r24
 	nop
 	nop
 	nop
@@ -737,14 +739,14 @@
 
 	/* 64bit load */
 #ifdef CONFIG_64BIT
-10:	ldd	0(%sr3,%r25), %r25
-11:	ldd	0(%sr3,%r24), %r24
+10:	ldd	0(%r25), %r25
+11:	ldd	0(%r24), %r24
 #else
 	/* Load new value into r22/r23 - high/low */
-10:	ldw	0(%sr3,%r25), %r22
-11:	ldw	4(%sr3,%r25), %r23
+10:	ldw	0(%r25), %r22
+11:	ldw	4(%r25), %r23
 	/* Load new value into fr4 for atomic store later */
-12:	flddx	0(%sr3,%r24), %fr4
+12:	flddx	0(%r24), %fr4
 #endif
 
 cas2_lock_start:
@@ -794,30 +796,30 @@
 	ldo	1(%r0),%r28
 
 	/* 8bit CAS */
-13:	ldb,ma	0(%sr3,%r26), %r29
+13:	ldb,ma	0(%r26), %r29
 	sub,=	%r29, %r25, %r0
 	b,n	cas2_end
-14:	stb,ma	%r24, 0(%sr3,%r26)
+14:	stb,ma	%r24, 0(%r26)
 	b	cas2_end
 	copy	%r0, %r28
 	nop
 	nop
 
 	/* 16bit CAS */
-15:	ldh,ma	0(%sr3,%r26), %r29
+15:	ldh,ma	0(%r26), %r29
 	sub,=	%r29, %r25, %r0
 	b,n	cas2_end
-16:	sth,ma	%r24, 0(%sr3,%r26)
+16:	sth,ma	%r24, 0(%r26)
 	b	cas2_end
 	copy	%r0, %r28
 	nop
 	nop
 
 	/* 32bit CAS */
-17:	ldw,ma	0(%sr3,%r26), %r29
+17:	ldw,ma	0(%r26), %r29
 	sub,=	%r29, %r25, %r0
 	b,n	cas2_end
-18:	stw,ma	%r24, 0(%sr3,%r26)
+18:	stw,ma	%r24, 0(%r26)
 	b	cas2_end
 	copy	%r0, %r28
 	nop
@@ -825,22 +827,22 @@
 
 	/* 64bit CAS */
 #ifdef CONFIG_64BIT
-19:	ldd,ma	0(%sr3,%r26), %r29
+19:	ldd,ma	0(%r26), %r29
 	sub,*=	%r29, %r25, %r0
 	b,n	cas2_end
-20:	std,ma	%r24, 0(%sr3,%r26)
+20:	std,ma	%r24, 0(%r26)
 	copy	%r0, %r28
 #else
 	/* Compare first word */
-19:	ldw,ma	0(%sr3,%r26), %r29
+19:	ldw,ma	0(%r26), %r29
 	sub,=	%r29, %r22, %r0
 	b,n	cas2_end
 	/* Compare second word */
-20:	ldw,ma	4(%sr3,%r26), %r29
+20:	ldw,ma	4(%r26), %r29
 	sub,=	%r29, %r23, %r0
 	b,n	cas2_end
 	/* Perform the store */
-21:	fstdx	%fr4, 0(%sr3,%r26)
+21:	fstdx	%fr4, 0(%r26)
 	copy	%r0, %r28
 #endif
 
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 9b63b87..325f30d 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/rtc.h>
 #include <linux/sched.h>
+#include <linux/sched_clock.h>
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/string.h>
@@ -39,18 +40,6 @@
 
 static unsigned long clocktick __read_mostly;	/* timer cycles per tick */
 
-#ifndef CONFIG_64BIT
-/*
- * The processor-internal cycle counter (Control Register 16) is used as time
- * source for the sched_clock() function.  This register is 64bit wide on a
- * 64-bit kernel and 32bit on a 32-bit kernel. Since sched_clock() always
- * requires a 64bit counter we emulate on the 32-bit kernel the higher 32bits
- * with a per-cpu variable which we increase every time the counter
- * wraps-around (which happens every ~4 secounds).
- */
-static DEFINE_PER_CPU(unsigned long, cr16_high_32_bits);
-#endif
-
 /*
  * We keep time on PA-RISC Linux by using the Interval Timer which is
  * a pair of registers; one is read-only and one is write-only; both
@@ -121,12 +110,6 @@
 	 */
 	mtctl(next_tick, 16);
 
-#if !defined(CONFIG_64BIT)
-	/* check for overflow on a 32bit kernel (every ~4 seconds). */
-	if (unlikely(next_tick < now))
-		this_cpu_inc(cr16_high_32_bits);
-#endif
-
 	/* Skip one clocktick on purpose if we missed next_tick.
 	 * The new CR16 must be "later" than current CR16 otherwise
 	 * itimer would not fire until CR16 wrapped - e.g 4 seconds
@@ -208,7 +191,7 @@
 
 /* clock source code */
 
-static cycle_t read_cr16(struct clocksource *cs)
+static cycle_t notrace read_cr16(struct clocksource *cs)
 {
 	return get_cycles();
 }
@@ -287,26 +270,9 @@
 }
 
 
-/*
- * sched_clock() framework
- */
-
-static u32 cyc2ns_mul __read_mostly;
-static u32 cyc2ns_shift __read_mostly;
-
-u64 sched_clock(void)
+static u64 notrace read_cr16_sched_clock(void)
 {
-	u64 now;
-
-	/* Get current cycle counter (Control Register 16). */
-#ifdef CONFIG_64BIT
-	now = mfctl(16);
-#else
-	now = mfctl(16) + (((u64) this_cpu_read(cr16_high_32_bits)) << 32);
-#endif
-
-	/* return the value in ns (cycles_2_ns) */
-	return mul_u64_u32_shr(now, cyc2ns_mul, cyc2ns_shift);
+	return get_cycles();
 }
 
 
@@ -316,17 +282,16 @@
 
 void __init time_init(void)
 {
-	unsigned long current_cr16_khz;
+	unsigned long cr16_hz;
 
-	current_cr16_khz = PAGE0->mem_10msec/10;  /* kHz */
 	clocktick = (100 * PAGE0->mem_10msec) / HZ;
-
-	/* calculate mult/shift values for cr16 */
-	clocks_calc_mult_shift(&cyc2ns_mul, &cyc2ns_shift, current_cr16_khz,
-				NSEC_PER_MSEC, 0);
-
 	start_cpu_itimer();	/* get CPU 0 started */
 
+	cr16_hz = 100 * PAGE0->mem_10msec;  /* Hz */
+
 	/* register at clocksource framework */
-	clocksource_register_khz(&clocksource_cr16, current_cr16_khz);
+	clocksource_register_hz(&clocksource_cr16, cr16_hz);
+
+	/* register as sched_clock source */
+	sched_clock_register(read_cr16_sched_clock, BITS_PER_LONG, cr16_hz);
 }
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index eae2dc8..9d47f2e 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -100,7 +100,8 @@
 		ns16550.c serial.c simple_alloc.c div64.S util.S \
 		elf_util.c $(zlib-y) devtree.c stdlib.c \
 		oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
-		uartlite.c mpc52xx-psc.c opal.c opal-calls.S
+		uartlite.c mpc52xx-psc.c opal.c
+src-wlib-$(CONFIG_PPC64_BOOT_WRAPPER) +=  opal-calls.S
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
 src-wlib-$(CONFIG_44x) += 4xx.c ebony.c bamboo.c
 src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c fsl-soc.c
diff --git a/arch/powerpc/boot/main.c b/arch/powerpc/boot/main.c
index f7a184b..78aaf4f 100644
--- a/arch/powerpc/boot/main.c
+++ b/arch/powerpc/boot/main.c
@@ -32,9 +32,16 @@
 	void *addr = 0;
 	struct elf_info ei;
 	long len;
+	int uncompressed_image = 0;
 
-	partial_decompress(vmlinuz_addr, vmlinuz_size,
+	len = partial_decompress(vmlinuz_addr, vmlinuz_size,
 		elfheader, sizeof(elfheader), 0);
+	/* assume uncompressed data if -1 is returned */
+	if (len == -1) {
+		uncompressed_image = 1;
+		memcpy(elfheader, vmlinuz_addr, sizeof(elfheader));
+		printf("No valid compressed data found, assume uncompressed data\n\r");
+	}
 
 	if (!parse_elf64(elfheader, &ei) && !parse_elf32(elfheader, &ei))
 		fatal("Error: not a valid PPC32 or PPC64 ELF file!\n\r");
@@ -67,6 +74,13 @@
 					"device tree\n\r");
 	}
 
+	if (uncompressed_image) {
+		memcpy(addr, vmlinuz_addr + ei.elfoffset, ei.loadsize);
+		printf("0x%lx bytes of uncompressed data copied\n\r",
+		       ei.loadsize);
+		goto out;
+	}
+
 	/* Finally, decompress the kernel */
 	printf("Decompressing (0x%p <- 0x%p:0x%p)...\n\r", addr,
 	       vmlinuz_addr, vmlinuz_addr+vmlinuz_size);
@@ -82,7 +96,7 @@
 			 len, ei.loadsize);
 
 	printf("Done! Decompressed 0x%lx bytes\n\r", len);
-
+out:
 	flush_cache(addr, ei.loadsize);
 
 	return (struct addr_range){addr, ei.memsize};
@@ -218,8 +232,12 @@
 		console_ops.close();
 
 	kentry = (kernel_entry_t) vmlinux.addr;
-	if (ft_addr)
-		kentry(ft_addr, 0, NULL);
+	if (ft_addr) {
+		if(platform_ops.kentry)
+			platform_ops.kentry(ft_addr, vmlinux.addr);
+		else
+			kentry(ft_addr, 0, NULL);
+	}
 	else
 		kentry((unsigned long)initrd.addr, initrd.size,
 		       loader_info.promptr);
diff --git a/arch/powerpc/boot/opal-calls.S b/arch/powerpc/boot/opal-calls.S
index ff2f1b9..2a99fc9 100644
--- a/arch/powerpc/boot/opal-calls.S
+++ b/arch/powerpc/boot/opal-calls.S
@@ -12,6 +12,19 @@
 
 	.text
 
+	.globl opal_kentry
+opal_kentry:
+	/* r3 is the fdt ptr */
+	mtctr r4
+	li	r4, 0
+	li	r5, 0
+	li	r6, 0
+	li	r7, 0
+	ld	r11,opal@got(r2)
+	ld	r8,0(r11)
+	ld	r9,8(r11)
+	bctr
+
 #define OPAL_CALL(name, token)				\
 	.globl name;					\
 name:							\
diff --git a/arch/powerpc/boot/opal.c b/arch/powerpc/boot/opal.c
index 1f37e1c..0272570 100644
--- a/arch/powerpc/boot/opal.c
+++ b/arch/powerpc/boot/opal.c
@@ -13,7 +13,7 @@
 #include <libfdt.h>
 #include "../include/asm/opal-api.h"
 
-#ifdef __powerpc64__
+#ifdef CONFIG_PPC64_BOOT_WRAPPER
 
 /* Global OPAL struct used by opal-call.S */
 struct opal {
@@ -23,14 +23,25 @@
 
 static u32 opal_con_id;
 
+/* see opal-wrappers.S */
 int64_t opal_console_write(int64_t term_number, u64 *length, const u8 *buffer);
 int64_t opal_console_read(int64_t term_number, uint64_t *length, u8 *buffer);
 int64_t opal_console_write_buffer_space(uint64_t term_number, uint64_t *length);
 int64_t opal_console_flush(uint64_t term_number);
 int64_t opal_poll_events(uint64_t *outstanding_event_mask);
 
+void opal_kentry(unsigned long fdt_addr, void *vmlinux_addr);
+
 static int opal_con_open(void)
 {
+	/*
+	 * When OPAL loads the boot kernel it stashes the OPAL base and entry
+	 * address in r8 and r9 so the kernel can use the OPAL console
+	 * before unflattening the devicetree. While executing the wrapper will
+	 * probably trash r8 and r9 so this kentry hook restores them before
+	 * entering the decompressed kernel.
+	 */
+	platform_ops.kentry = opal_kentry;
 	return 0;
 }
 
diff --git a/arch/powerpc/boot/ops.h b/arch/powerpc/boot/ops.h
index 309d1b1..fad1862 100644
--- a/arch/powerpc/boot/ops.h
+++ b/arch/powerpc/boot/ops.h
@@ -30,6 +30,7 @@
 	void *	(*realloc)(void *ptr, unsigned long size);
 	void	(*exit)(void);
 	void *	(*vmlinux_alloc)(unsigned long size);
+	void  	(*kentry)(unsigned long fdt_addr, void *vmlinux_addr);
 };
 extern struct platform_ops platform_ops;
 
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index d149273..e0baba1 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -14,6 +14,10 @@
 
 #include <linux/threads.h>
 #include <linux/kprobes.h>
+#include <asm/cacheflush.h>
+#include <asm/checksum.h>
+#include <asm/uaccess.h>
+#include <asm/epapr_hcalls.h>
 
 #include <uapi/asm/ucontext.h>
 
@@ -109,4 +113,12 @@
 /* time */
 void accumulate_stolen_time(void);
 
+/* misc runtime */
+extern u64 __bswapdi2(u64);
+extern s64 __lshrdi3(s64, int);
+extern s64 __ashldi3(s64, int);
+extern s64 __ashrdi3(s64, int);
+extern int __cmpdi2(s64, s64);
+extern int __ucmpdi2(u64, u64);
+
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index ee655ed..1e8fceb 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -53,10 +53,8 @@
 	return (__force __sum16)(~((__force u32)sum + tmp) >> 16);
 }
 
-static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-                                     unsigned short len,
-                                     unsigned short proto,
-                                     __wsum sum)
+static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+					__u8 proto, __wsum sum)
 {
 #ifdef __powerpc64__
 	unsigned long s = (__force u32)sum;
@@ -83,10 +81,8 @@
  * computes the checksum of the TCP/UDP pseudo-header
  * returns a 16-bit checksum, already complemented
  */
-static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-					unsigned short len,
-					unsigned short proto,
-					__wsum sum)
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
+					__u8 proto, __wsum sum)
 {
 	return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
 }
diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
index 01b8a13..3919332 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -26,7 +26,7 @@
 	std	r0,0(r1);					\
 	ptesync;						\
 	ld	r0,0(r1);					\
-1:	cmp	cr0,r0,r0;					\
+1:	cmpd	cr0,r0,r0;					\
 	bne	1b;						\
 	IDLE_INST;						\
 	b	.
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 2e4e7d8..9a3eee6 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -91,7 +91,11 @@
  */
 #define LOAD_HANDLER(reg, label)					\
 	ld	reg,PACAKBASE(r13);	/* get high part of &label */	\
-	ori	reg,reg,(FIXED_SYMBOL_ABS_ADDR(label))@l;
+	ori	reg,reg,FIXED_SYMBOL_ABS_ADDR(label);
+
+#define __LOAD_HANDLER(reg, label)					\
+	ld	reg,PACAKBASE(r13);					\
+	ori	reg,reg,(ABS_ADDR(label))@l;
 
 /* Exception register prefixes */
 #define EXC_HV	H
@@ -154,14 +158,17 @@
 	std	ra,offset(r13);						\
 END_FTR_SECTION_NESTED(ftr,ftr,943)
 
-#define EXCEPTION_PROLOG_0(area)					\
-	GET_PACA(r13);							\
+#define EXCEPTION_PROLOG_0_PACA(area)					\
 	std	r9,area+EX_R9(r13);	/* save r9 */			\
 	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR);			\
 	HMT_MEDIUM;							\
 	std	r10,area+EX_R10(r13);	/* save r10 - r12 */		\
 	OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
 
+#define EXCEPTION_PROLOG_0(area)					\
+	GET_PACA(r13);							\
+	EXCEPTION_PROLOG_0_PACA(area)
+
 #define __EXCEPTION_PROLOG_1(area, extra, vec)				\
 	OPT_SAVE_REG_TO_PACA(area+EX_PPR, r9, CPU_FTR_HAS_PPR);		\
 	OPT_SAVE_REG_TO_PACA(area+EX_CFAR, r10, CPU_FTR_CFAR);		\
@@ -192,6 +199,12 @@
 	EXCEPTION_PROLOG_1(area, extra, vec);				\
 	EXCEPTION_PROLOG_PSERIES_1(label, h);
 
+/* Have the PACA in r13 already */
+#define EXCEPTION_PROLOG_PSERIES_PACA(area, label, h, extra, vec)	\
+	EXCEPTION_PROLOG_0_PACA(area);					\
+	EXCEPTION_PROLOG_1(area, extra, vec);				\
+	EXCEPTION_PROLOG_PSERIES_1(label, h);
+
 #define __KVMTEST(h, n)							\
 	lbz	r10,HSTATE_IN_GUEST(r13);				\
 	cmpwi	r10,0;							\
@@ -208,6 +221,18 @@
 #define kvmppc_interrupt kvmppc_interrupt_pr
 #endif
 
+#ifdef CONFIG_RELOCATABLE
+#define BRANCH_TO_COMMON(reg, label)					\
+	__LOAD_HANDLER(reg, label);					\
+	mtctr	reg;							\
+	bctr
+
+#else
+#define BRANCH_TO_COMMON(reg, label)					\
+	b	label
+
+#endif
+
 #define __KVM_HANDLER_PROLOG(area, n)					\
 	BEGIN_FTR_SECTION_NESTED(947)					\
 	ld	r10,area+EX_CFAR(r13);					\
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index e883683..e311c25 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -29,6 +29,12 @@
  */
 
 /*
+ * Kernel read only support.
+ * We added the ppp value 0b110 in ISA 2.04.
+ */
+#define MMU_FTR_KERNEL_RO		ASM_CONST(0x00004000)
+
+/*
  * We need to clear top 16bits of va (from the remaining 64 bits )in
  * tlbie* instructions
  */
@@ -103,10 +109,10 @@
 #define MMU_FTRS_POWER4		MMU_FTRS_DEFAULT_HPTE_ARCH_V2
 #define MMU_FTRS_PPC970		MMU_FTRS_POWER4 | MMU_FTR_TLBIE_CROP_VA
 #define MMU_FTRS_POWER5		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
-#define MMU_FTRS_POWER6		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
-#define MMU_FTRS_POWER7		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
-#define MMU_FTRS_POWER8		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
-#define MMU_FTRS_POWER9		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
+#define MMU_FTRS_POWER6		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
+#define MMU_FTRS_POWER7		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
+#define MMU_FTRS_POWER8		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
+#define MMU_FTRS_POWER9		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
 #define MMU_FTRS_CELL		MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
 				MMU_FTR_CI_LARGE_PAGE
 #define MMU_FTRS_PA6T		MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 0132831..c56ea8c 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -460,5 +460,6 @@
 
 #define PPC_SLBIA(IH)	stringify_in_c(.long PPC_INST_SLBIA | \
 				       ((IH & 0x7) << 21))
+#define PPC_INVALIDATE_ERAT	PPC_SLBIA(7)
 
 #endif /* _ASM_POWERPC_PPC_OPCODE_H */
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 9cd4e8c..9e1499f 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -355,6 +355,7 @@
 #define     LPCR_PECE0		ASM_CONST(0x0000000000004000)	/* ext. exceptions can cause exit */
 #define     LPCR_PECE1		ASM_CONST(0x0000000000002000)	/* decrementer can cause exit */
 #define     LPCR_PECE2		ASM_CONST(0x0000000000001000)	/* machine check etc can cause exit */
+#define     LPCR_PECE_HVEE	ASM_CONST(0x0000400000000000)	/* P9 Wakeup on HV interrupts */
 #define   LPCR_MER		ASM_CONST(0x0000000000000800)	/* Mediated External Exception */
 #define   LPCR_MER_SH		11
 #define   LPCR_TC		ASM_CONST(0x0000000000000200)	/* Translation control */
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index f6f68f7..99e1397 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -52,11 +52,23 @@
 	return cpumask_subset(mm_cpumask(mm),
 			      topology_sibling_cpumask(smp_processor_id()));
 }
+
+static inline int mm_is_thread_local(struct mm_struct *mm)
+{
+	return cpumask_equal(mm_cpumask(mm),
+			      cpumask_of(smp_processor_id()));
+}
+
 #else
 static inline int mm_is_core_local(struct mm_struct *mm)
 {
 	return 1;
 }
+
+static inline int mm_is_thread_local(struct mm_struct *mm)
+{
+	return 1;
+}
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index cf12c58..e8cdfec 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -16,6 +16,10 @@
 
 #define __NR__exit __NR_exit
 
+#define __IGNORE_pkey_mprotect
+#define __IGNORE_pkey_alloc
+#define __IGNORE_pkey_free
+
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 52ff3f0..37c027c 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -98,8 +98,8 @@
 	li	r0,0
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
-	ori	r3, r3, LPCR_PECEDH
-	ori	r3, r3, LPCR_HVICE
+	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE)
+	or	r3, r3, r4
 	bl	__init_LPCR
 	bl	__init_HFSCR
 	bl	__init_tlb_power9
@@ -118,8 +118,8 @@
 	li	r0,0
 	mtspr	SPRN_LPID,r0
 	mfspr   r3,SPRN_LPCR
-	ori	r3, r3, LPCR_PECEDH
-	ori	r3, r3, LPCR_HVICE
+	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE)
+	or	r3, r3, r4
 	bl	__init_LPCR
 	bl	__init_HFSCR
 	bl	__init_tlb_power9
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index a62be72..5c31369 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -671,8 +671,10 @@
 
 	/* Clear frozen state */
 	rc = eeh_clear_pe_frozen_state(pe, false);
-	if (rc)
+	if (rc) {
+		pci_unlock_rescan_remove();
 		return rc;
+	}
 
 	/* Give the system 5 seconds to finish running the user-space
 	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes,
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index f129408..1ba82ea 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -95,19 +95,40 @@
 /* No virt vectors corresponding with 0x0..0x100 */
 EXC_VIRT_NONE(0x4000, 0x4100)
 
+
+#ifdef CONFIG_PPC_P7_NAP
+	/*
+	 * If running native on arch 2.06 or later, check if we are waking up
+	 * from nap/sleep/winkle, and branch to idle handler.
+	 */
+#define IDLETEST(n)							\
+	BEGIN_FTR_SECTION ;						\
+	mfspr	r10,SPRN_SRR1 ;						\
+	rlwinm.	r10,r10,47-31,30,31 ;					\
+	beq-	1f ;							\
+	cmpwi	cr3,r10,2 ;						\
+	BRANCH_TO_COMMON(r10, system_reset_idle_common) ;		\
+1:									\
+	END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+#else
+#define IDLETEST NOTEST
+#endif
+
 EXC_REAL_BEGIN(system_reset, 0x100, 0x200)
 	SET_SCRATCH0(r13)
-#ifdef CONFIG_PPC_P7_NAP
-BEGIN_FTR_SECTION
-	/* Running native on arch 2.06 or later, check if we are
-	 * waking up from nap/sleep/winkle.
-	 */
-	mfspr	r13,SPRN_SRR1
-	rlwinm.	r13,r13,47-31,30,31
-	beq	9f
-
-	cmpwi	cr3,r13,2
 	GET_PACA(r13)
+	clrrdi	r13,r13,1 /* Last bit of HSPRG0 is set if waking from winkle */
+	EXCEPTION_PROLOG_PSERIES_PACA(PACA_EXGEN, system_reset_common, EXC_STD,
+				 IDLETEST, 0x100)
+
+EXC_REAL_END(system_reset, 0x100, 0x200)
+EXC_VIRT_NONE(0x4100, 0x4200)
+
+#ifdef CONFIG_PPC_P7_NAP
+EXC_COMMON_BEGIN(system_reset_idle_common)
+BEGIN_FTR_SECTION
+	GET_PACA(r13) /* Restore HSPRG0 to get the winkle bit in r13 */
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	bl	pnv_restore_hyp_resource
 
 	li	r0,PNV_THREAD_RUNNING
@@ -130,14 +151,8 @@
 	blt	cr3,2f
 	b	pnv_wakeup_loss
 2:	b	pnv_wakeup_noloss
+#endif
 
-9:
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-#endif /* CONFIG_PPC_P7_NAP */
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
-				 NOTEST, 0x100)
-EXC_REAL_END(system_reset, 0x100, 0x200)
-EXC_VIRT_NONE(0x4100, 0x4200)
 EXC_COMMON(system_reset_common, 0x100, system_reset_exception)
 
 #ifdef CONFIG_PPC_PSERIES
@@ -159,7 +174,7 @@
 	SET_SCRATCH0(r13)		/* save r13 */
 	/*
 	 * Running native on arch 2.06 or later, we may wakeup from winkle
-	 * inside machine check. If yes, then last bit of HSPGR0 would be set
+	 * inside machine check. If yes, then last bit of HSPRG0 would be set
 	 * to 1. Hence clear it unconditionally.
 	 */
 	GET_PACA(r13)
@@ -378,7 +393,7 @@
 	/*
 	 * Go back to winkle. Please note that this thread was woken up in
 	 * machine check from winkle and have not restored the per-subcore
-	 * state. Hence before going back to winkle, set last bit of HSPGR0
+	 * state. Hence before going back to winkle, set last bit of HSPRG0
 	 * to 1. This will make sure that if this thread gets woken up
 	 * again at reset vector 0x100 then it will get chance to restore
 	 * the subcore state.
@@ -817,10 +832,8 @@
 TRAMP_KVM(PACA_EXGEN, 0xb00)
 EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
 
-
-#define LOAD_SYSCALL_HANDLER(reg)				\
-	ld	reg,PACAKBASE(r13);				\
-	ori	reg,reg,(ABS_ADDR(system_call_common))@l;
+#define LOAD_SYSCALL_HANDLER(reg)					\
+	__LOAD_HANDLER(reg, system_call_common)
 
 /* Syscall routine is used twice, in reloc-off and reloc-on paths */
 #define SYSCALL_PSERIES_1 					\
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 9781c69..03d089b 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -275,7 +275,7 @@
 	if (!stepped) {
 		WARN(1, "Unable to handle hardware breakpoint. Breakpoint at "
 			"0x%lx will be disabled.", info->address);
-		perf_event_disable(bp);
+		perf_event_disable_inatomic(bp);
 		goto out;
 	}
 	/*
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index bd739fe..72dac0b 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -90,6 +90,7 @@
  * Threads will spin in HMT_LOW until the lock bit is cleared.
  * r14 - pointer to core_idle_state
  * r15 - used to load contents of core_idle_state
+ * r9  - used as a temporary variable
  */
 
 core_idle_lock_held:
@@ -99,6 +100,8 @@
 	bne	3b
 	HMT_MEDIUM
 	lwarx	r15,0,r14
+	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
+	bne	core_idle_lock_held
 	blr
 
 /*
@@ -163,12 +166,6 @@
 	std	r9,_MSR(r1)
 	std	r1,PACAR1(r13)
 
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	/* Tell KVM we're entering idle */
-	li	r4,KVM_HWTHREAD_IN_IDLE
-	stb	r4,HSTATE_HWTHREAD_STATE(r13)
-#endif
-
 	/*
 	 * Go to real mode to do the nap, as required by the architecture.
 	 * Also, we need to be in real mode before setting hwthread_state,
@@ -185,6 +182,26 @@
 
 	.globl pnv_enter_arch207_idle_mode
 pnv_enter_arch207_idle_mode:
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	/* Tell KVM we're entering idle */
+	li	r4,KVM_HWTHREAD_IN_IDLE
+	/******************************************************/
+	/*  N O T E   W E L L    ! ! !    N O T E   W E L L   */
+	/* The following store to HSTATE_HWTHREAD_STATE(r13)  */
+	/* MUST occur in real mode, i.e. with the MMU off,    */
+	/* and the MMU must stay off until we clear this flag */
+	/* and test HSTATE_HWTHREAD_REQ(r13) in the system    */
+	/* reset interrupt vector in exceptions-64s.S.        */
+	/* The reason is that another thread can switch the   */
+	/* MMU to a guest context whenever this flag is set   */
+	/* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on,    */
+	/* that would potentially cause this thread to start  */
+	/* executing instructions from guest memory in        */
+	/* hypervisor mode, leading to a host crash or data   */
+	/* corruption, or worse.                              */
+	/******************************************************/
+	stb	r4,HSTATE_HWTHREAD_STATE(r13)
+#endif
 	stb	r3,PACA_THREAD_IDLE_STATE(r13)
 	cmpwi	cr3,r3,PNV_THREAD_SLEEP
 	bge	cr3,2f
@@ -250,6 +267,12 @@
  * r3 - requested stop state
  */
 power_enter_stop:
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	/* Tell KVM we're entering idle */
+	li	r4,KVM_HWTHREAD_IN_IDLE
+	/* DO THIS IN REAL MODE!  See comment above. */
+	stb	r4,HSTATE_HWTHREAD_STATE(r13)
+#endif
 /*
  * Check if the requested state is a deep idle state.
  */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 9e7c10f..49a680d 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1012,7 +1012,7 @@
 	/* Ensure that restore_math() will restore */
 	if (msr_diff & MSR_FP)
 		current->thread.load_fp = 1;
-#ifdef CONFIG_ALIVEC
+#ifdef CONFIG_ALTIVEC
 	if (cpu_has_feature(CPU_FTR_ALTIVEC) && msr_diff & MSR_VEC)
 		current->thread.load_vec = 1;
 #endif
@@ -1215,7 +1215,7 @@
 		int instr;
 
 		if (!(i % 8))
-			printk("\n");
+			pr_cont("\n");
 
 #if !defined(CONFIG_BOOKE)
 		/* If executing with the IMMU off, adjust pc rather
@@ -1227,18 +1227,18 @@
 
 		if (!__kernel_text_address(pc) ||
 		     probe_kernel_address((unsigned int __user *)pc, instr)) {
-			printk(KERN_CONT "XXXXXXXX ");
+			pr_cont("XXXXXXXX ");
 		} else {
 			if (regs->nip == pc)
-				printk(KERN_CONT "<%08x> ", instr);
+				pr_cont("<%08x> ", instr);
 			else
-				printk(KERN_CONT "%08x ", instr);
+				pr_cont("%08x ", instr);
 		}
 
 		pc += sizeof(int);
 	}
 
-	printk("\n");
+	pr_cont("\n");
 }
 
 struct regbit {
@@ -1282,7 +1282,7 @@
 
 	for (; bits->bit; ++bits)
 		if (val & bits->bit) {
-			printk("%s%s", s, bits->name);
+			pr_cont("%s%s", s, bits->name);
 			s = sep;
 		}
 }
@@ -1305,9 +1305,9 @@
  *   T: Transactional	(bit 34)
  */
 	if (val & (MSR_TM | MSR_TS_S | MSR_TS_T)) {
-		printk(",TM[");
+		pr_cont(",TM[");
 		print_bits(val, msr_tm_bits, "");
-		printk("]");
+		pr_cont("]");
 	}
 }
 #else
@@ -1316,10 +1316,10 @@
 
 static void print_msr_bits(unsigned long val)
 {
-	printk("<");
+	pr_cont("<");
 	print_bits(val, msr_bits, ",");
 	print_tm_bits(val);
-	printk(">");
+	pr_cont(">");
 }
 
 #ifdef CONFIG_PPC64
@@ -1347,29 +1347,29 @@
 	printk("  CR: %08lx  XER: %08lx\n", regs->ccr, regs->xer);
 	trap = TRAP(regs);
 	if ((regs->trap != 0xc00) && cpu_has_feature(CPU_FTR_CFAR))
-		printk("CFAR: "REG" ", regs->orig_gpr3);
+		pr_cont("CFAR: "REG" ", regs->orig_gpr3);
 	if (trap == 0x200 || trap == 0x300 || trap == 0x600)
 #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
-		printk("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr);
+		pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr);
 #else
-		printk("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr);
+		pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr);
 #endif
 #ifdef CONFIG_PPC64
-	printk("SOFTE: %ld ", regs->softe);
+	pr_cont("SOFTE: %ld ", regs->softe);
 #endif
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	if (MSR_TM_ACTIVE(regs->msr))
-		printk("\nPACATMSCRATCH: %016llx ", get_paca()->tm_scratch);
+		pr_cont("\nPACATMSCRATCH: %016llx ", get_paca()->tm_scratch);
 #endif
 
 	for (i = 0;  i < 32;  i++) {
 		if ((i % REGS_PER_LINE) == 0)
-			printk("\nGPR%02d: ", i);
-		printk(REG " ", regs->gpr[i]);
+			pr_cont("\nGPR%02d: ", i);
+		pr_cont(REG " ", regs->gpr[i]);
 		if (i == LAST_VOLATILE && !FULL_REGS(regs))
 			break;
 	}
-	printk("\n");
+	pr_cont("\n");
 #ifdef CONFIG_KALLSYMS
 	/*
 	 * Lookup NIP late so we have the best change of getting the
@@ -1900,14 +1900,14 @@
 			printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip);
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 			if ((ip == rth) && curr_frame >= 0) {
-				printk(" (%pS)",
+				pr_cont(" (%pS)",
 				       (void *)current->ret_stack[curr_frame].ret);
 				curr_frame--;
 			}
 #endif
 			if (firstframe)
-				printk(" (unreliable)");
-			printk("\n");
+				pr_cont(" (unreliable)");
+			pr_cont("\n");
 		}
 		firstframe = 0;
 
diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c
index f52b7db3..010b7b3 100644
--- a/arch/powerpc/kernel/ptrace32.c
+++ b/arch/powerpc/kernel/ptrace32.c
@@ -74,7 +74,7 @@
 			break;
 
 		copied = access_process_vm(child, (u64)addrOthers, &tmp,
-				sizeof(tmp), 0);
+				sizeof(tmp), FOLL_FORCE);
 		if (copied != sizeof(tmp))
 			break;
 		ret = put_user(tmp, (u32 __user *)data);
@@ -179,7 +179,8 @@
 			break;
 		ret = 0;
 		if (access_process_vm(child, (u64)addrOthers, &tmp,
-					sizeof(tmp), 1) == sizeof(tmp))
+					sizeof(tmp),
+					FOLL_FORCE | FOLL_WRITE) == sizeof(tmp))
 			break;
 		ret = -EIO;
 		break;
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 7ac8e6e..8d586cf 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -226,17 +226,25 @@
 		if (firmware_has_feature(FW_FEATURE_OPAL))
 			opal_configure_cores();
 
-		/* Enable AIL if supported, and we are in hypervisor mode */
-		if (early_cpu_has_feature(CPU_FTR_HVMODE) &&
-		    early_cpu_has_feature(CPU_FTR_ARCH_207S)) {
-			unsigned long lpcr = mfspr(SPRN_LPCR);
-			mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
-		}
+		/* AIL on native is done in cpu_ready_for_interrupts() */
 	}
 }
 
 static void cpu_ready_for_interrupts(void)
 {
+	/*
+	 * Enable AIL if supported, and we are in hypervisor mode. This
+	 * is called once for every processor.
+	 *
+	 * If we are not in hypervisor mode the job is done once for
+	 * the whole partition in configure_exceptions().
+	 */
+	if (early_cpu_has_feature(CPU_FTR_HVMODE) &&
+	    early_cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		unsigned long lpcr = mfspr(SPRN_LPCR);
+		mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
+	}
+
 	/* Set IR and DR in PACA MSR */
 	get_paca()->kernel_msr = MSR_KERNEL;
 }
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 8295f51..7394b77 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -94,8 +94,17 @@
 	 * detected, and will result in a crash at boot due to offsets being
 	 * wrong.
 	 */
+#ifdef CONFIG_PPC64
+	/*
+	 * BLOCK(0) overrides the default output section alignment because
+	 * this needs to start right after .head.text in order for fixed
+	 * section placement to work.
+	 */
+	.text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) {
+#else
 	.text : AT(ADDR(.text) - LOAD_OFFSET) {
 		ALIGN_FUNCTION();
+#endif
 		/* careful! __ftr_alt_* sections need to be close to .text */
 		*(.text .fixup __ftr_alt_* .ref.text)
 		SCHED_TEXT
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 82ff5de..a0ea63a 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -23,6 +23,7 @@
 #include <asm/ppc-opcode.h>
 #include <asm/pnv-pci.h>
 #include <asm/opal.h>
+#include <asm/smp.h>
 
 #include "book3s_xics.h"
 
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index bb03542..362954f 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -106,6 +106,8 @@
 	switch (REGION_ID(ea)) {
 	case USER_REGION_ID:
 		pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea);
+		if (mm == NULL)
+			return 1;
 		psize = get_slice_psize(mm, ea);
 		ssize = user_segment_size(ea);
 		vsid = get_vsid(mm->context.id, ea, ssize);
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
index 42c702b..6fa450c 100644
--- a/arch/powerpc/mm/hash64_4k.c
+++ b/arch/powerpc/mm/hash64_4k.c
@@ -55,7 +55,7 @@
 	 */
 	rflags = htab_convert_pte_flags(new_pte);
 
-	if (!cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+	if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
 	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
 
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index 3bbbea0..1a68cb1 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -87,7 +87,7 @@
 	subpg_pte = new_pte & ~subpg_prot;
 	rflags = htab_convert_pte_flags(subpg_pte);
 
-	if (!cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+	if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
 	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
 
 		/*
@@ -258,7 +258,7 @@
 
 	rflags = htab_convert_pte_flags(new_pte);
 
-	if (!cpu_has_feature(CPU_FTR_NOEXECUTE) &&
+	if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
 	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
 
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 44d3c3a..78dabf06 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -193,8 +193,12 @@
 		/*
 		 * Kernel read only mapped with ppp bits 0b110
 		 */
-		if (!(pteflags & _PAGE_WRITE))
-			rflags |= (HPTE_R_PP0 | 0x2);
+		if (!(pteflags & _PAGE_WRITE)) {
+			if (mmu_has_feature(MMU_FTR_KERNEL_RO))
+				rflags |= (HPTE_R_PP0 | 0x2);
+			else
+				rflags |= 0x3;
+		}
 	} else {
 		if (pteflags & _PAGE_RWX)
 			rflags |= 0x2;
@@ -1029,6 +1033,10 @@
 {
 	/* Initialize hash table for that CPU */
 	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+
+		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+			update_hid_for_hash();
+
 		if (!cpu_has_feature(CPU_FTR_ARCH_300))
 			mtspr(SPRN_SDR1, _SDR1);
 		else
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 75b9cd6..a51c188 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -845,7 +845,7 @@
 		return;
 
 	for_each_online_node(node) {
-		printk(KERN_DEBUG "Node %d CPUs:", node);
+		pr_info("Node %d CPUs:", node);
 
 		count = 0;
 		/*
@@ -856,52 +856,18 @@
 			if (cpumask_test_cpu(cpu,
 					node_to_cpumask_map[node])) {
 				if (count == 0)
-					printk(" %u", cpu);
+					pr_cont(" %u", cpu);
 				++count;
 			} else {
 				if (count > 1)
-					printk("-%u", cpu - 1);
+					pr_cont("-%u", cpu - 1);
 				count = 0;
 			}
 		}
 
 		if (count > 1)
-			printk("-%u", nr_cpu_ids - 1);
-		printk("\n");
-	}
-}
-
-static void __init dump_numa_memory_topology(void)
-{
-	unsigned int node;
-	unsigned int count;
-
-	if (min_common_depth == -1 || !numa_enabled)
-		return;
-
-	for_each_online_node(node) {
-		unsigned long i;
-
-		printk(KERN_DEBUG "Node %d Memory:", node);
-
-		count = 0;
-
-		for (i = 0; i < memblock_end_of_DRAM();
-		     i += (1 << SECTION_SIZE_BITS)) {
-			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
-				if (count == 0)
-					printk(" 0x%lx", i);
-				++count;
-			} else {
-				if (count > 0)
-					printk("-0x%lx", i);
-				count = 0;
-			}
-		}
-
-		if (count > 0)
-			printk("-0x%lx", i);
-		printk("\n");
+			pr_cont("-%u", nr_cpu_ids - 1);
+		pr_cont("\n");
 	}
 }
 
@@ -947,8 +913,6 @@
 
 	if (parse_numa_properties())
 		setup_nonnuma();
-	else
-		dump_numa_memory_topology();
 
 	memblock_dump_all();
 
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index ed7bddc..688b545 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -388,6 +388,10 @@
 	 * update partition table control register and UPRT
 	 */
 	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+
+		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+			update_hid_for_radix();
+
 		lpcr = mfspr(SPRN_LPCR);
 		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 0e49ec5..3493cf4 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -50,6 +50,8 @@
 	for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
 		__tlbiel_pid(pid, set, ric);
 	}
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+		asm volatile(PPC_INVALIDATE_ERAT : : :"memory");
 	return;
 }
 
@@ -83,6 +85,8 @@
 	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
 		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
 	asm volatile("ptesync": : :"memory");
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+		asm volatile(PPC_INVALIDATE_ERAT : : :"memory");
 }
 
 static inline void _tlbie_va(unsigned long va, unsigned long pid,
@@ -175,7 +179,7 @@
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto no_context;
 
-	if (!mm_is_core_local(mm)) {
+	if (!mm_is_thread_local(mm)) {
 		int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
 
 		if (lock_tlbie)
@@ -201,7 +205,7 @@
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto no_context;
 
-	if (!mm_is_core_local(mm)) {
+	if (!mm_is_thread_local(mm)) {
 		int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
 
 		if (lock_tlbie)
@@ -226,7 +230,7 @@
 	pid = mm ? mm->context.id : 0;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto bail;
-	if (!mm_is_core_local(mm)) {
+	if (!mm_is_thread_local(mm)) {
 		int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
 
 		if (lock_tlbie)
@@ -321,7 +325,7 @@
 {
 	unsigned long pid;
 	unsigned long addr;
-	int local = mm_is_core_local(mm);
+	int local = mm_is_thread_local(mm);
 	unsigned long ap = mmu_get_ap(psize);
 	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
 	unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index 28f03ca..794bebb 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -363,11 +363,11 @@
 static int diag224_get_name_table(void)
 {
 	/* memory must be below 2GB */
-	diag224_cpu_names = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA);
+	diag224_cpu_names = (char *) __get_free_page(GFP_KERNEL | GFP_DMA);
 	if (!diag224_cpu_names)
 		return -ENOMEM;
 	if (diag224(diag224_cpu_names)) {
-		kfree(diag224_cpu_names);
+		free_page((unsigned long) diag224_cpu_names);
 		return -EOPNOTSUPP;
 	}
 	EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16);
@@ -376,7 +376,7 @@
 
 static void diag224_delete_name_table(void)
 {
-	kfree(diag224_cpu_names);
+	free_page((unsigned long) diag224_cpu_names);
 }
 
 static int diag224_idx2name(int index, char *name)
diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h
index 64053d9..836c562 100644
--- a/arch/s390/include/asm/ftrace.h
+++ b/arch/s390/include/asm/ftrace.h
@@ -12,9 +12,7 @@
 
 #ifndef __ASSEMBLY__
 
-unsigned long return_address(int depth);
-
-#define ftrace_return_address(n) return_address(n)
+#define ftrace_return_address(n) __builtin_return_address(n)
 
 void _mcount(void);
 void ftrace_caller(void);
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 0332317..602af69 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -192,7 +192,7 @@
 struct mm_struct;
 struct seq_file;
 
-typedef int (*dump_trace_func_t)(void *data, unsigned long address);
+typedef int (*dump_trace_func_t)(void *data, unsigned long address, int reliable);
 void dump_trace(dump_trace_func_t func, void *data,
 		struct task_struct *task, unsigned long sp);
 
diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h
index 02613ba..3066031 100644
--- a/arch/s390/include/asm/unistd.h
+++ b/arch/s390/include/asm/unistd.h
@@ -9,6 +9,9 @@
 #include <uapi/asm/unistd.h>
 
 #define __IGNORE_time
+#define __IGNORE_pkey_mprotect
+#define __IGNORE_pkey_alloc
+#define __IGNORE_pkey_free
 
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_SYS_ALARM
diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c
index 43446fa..c74c592 100644
--- a/arch/s390/kernel/dis.c
+++ b/arch/s390/kernel/dis.c
@@ -2014,12 +2014,12 @@
 			*ptr++ = '\t';
 		ptr += print_insn(ptr, code + start, addr);
 		start += opsize;
-		printk("%s", buffer);
+		pr_cont("%s", buffer);
 		ptr = buffer;
 		ptr += sprintf(ptr, "\n          ");
 		hops++;
 	}
-	printk("\n");
+	pr_cont("\n");
 }
 
 void print_fn_code(unsigned char *code, unsigned long len)
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index 6693383..55d4fe1 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -38,10 +38,10 @@
 		if (sp < low || sp > high - sizeof(*sf))
 			return sp;
 		sf = (struct stack_frame *) sp;
+		if (func(data, sf->gprs[8], 0))
+			return sp;
 		/* Follow the backchain. */
 		while (1) {
-			if (func(data, sf->gprs[8]))
-				return sp;
 			low = sp;
 			sp = sf->back_chain;
 			if (!sp)
@@ -49,6 +49,8 @@
 			if (sp <= low || sp > high - sizeof(*sf))
 				return sp;
 			sf = (struct stack_frame *) sp;
+			if (func(data, sf->gprs[8], 1))
+				return sp;
 		}
 		/* Zero backchain detected, check for interrupt frame. */
 		sp = (unsigned long) (sf + 1);
@@ -56,7 +58,7 @@
 			return sp;
 		regs = (struct pt_regs *) sp;
 		if (!user_mode(regs)) {
-			if (func(data, regs->psw.addr))
+			if (func(data, regs->psw.addr, 1))
 				return sp;
 		}
 		low = sp;
@@ -85,33 +87,12 @@
 }
 EXPORT_SYMBOL_GPL(dump_trace);
 
-struct return_address_data {
-	unsigned long address;
-	int depth;
-};
-
-static int __return_address(void *data, unsigned long address)
+static int show_address(void *data, unsigned long address, int reliable)
 {
-	struct return_address_data *rd = data;
-
-	if (rd->depth--)
-		return 0;
-	rd->address = address;
-	return 1;
-}
-
-unsigned long return_address(int depth)
-{
-	struct return_address_data rd = { .depth = depth + 2 };
-
-	dump_trace(__return_address, &rd, NULL, current_stack_pointer());
-	return rd.address;
-}
-EXPORT_SYMBOL_GPL(return_address);
-
-static int show_address(void *data, unsigned long address)
-{
-	printk("([<%016lx>] %pSR)\n", address, (void *)address);
+	if (reliable)
+		printk(" [<%016lx>] %pSR \n", address, (void *)address);
+	else
+		printk("([<%016lx>] %pSR)\n", address, (void *)address);
 	return 0;
 }
 
@@ -138,14 +119,14 @@
 		else
 			stack = (unsigned long *)task->thread.ksp;
 	}
+	printk(KERN_DEFAULT "Stack:\n");
 	for (i = 0; i < 20; i++) {
 		if (((addr_t) stack & (THREAD_SIZE-1)) == 0)
 			break;
-		if ((i * sizeof(long) % 32) == 0)
-			printk("%s       ", i == 0 ? "" : "\n");
-		printk("%016lx ", *stack++);
+		if (i % 4 == 0)
+			printk(KERN_DEFAULT "       ");
+		pr_cont("%016lx%c", *stack++, i % 4 == 3 ? '\n' : ' ');
 	}
-	printk("\n");
 	show_trace(task, (unsigned long)sp);
 }
 
@@ -163,13 +144,13 @@
 	mode = user_mode(regs) ? "User" : "Krnl";
 	printk("%s PSW : %p %p", mode, (void *)regs->psw.mask, (void *)regs->psw.addr);
 	if (!user_mode(regs))
-		printk(" (%pSR)", (void *)regs->psw.addr);
-	printk("\n");
+		pr_cont(" (%pSR)", (void *)regs->psw.addr);
+	pr_cont("\n");
 	printk("           R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x "
 	       "P:%x AS:%x CC:%x PM:%x", psw->r, psw->t, psw->i, psw->e,
 	       psw->key, psw->m, psw->w, psw->p, psw->as, psw->cc, psw->pm);
-	printk(" RI:%x EA:%x", psw->ri, psw->eaba);
-	printk("\n%s GPRS: %016lx %016lx %016lx %016lx\n", mode,
+	pr_cont(" RI:%x EA:%x\n", psw->ri, psw->eaba);
+	printk("%s GPRS: %016lx %016lx %016lx %016lx\n", mode,
 	       regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]);
 	printk("           %016lx %016lx %016lx %016lx\n",
 	       regs->gprs[4], regs->gprs[5], regs->gprs[6], regs->gprs[7]);
@@ -205,14 +186,14 @@
 	printk("%s: %04x ilc:%d [#%d] ", str, regs->int_code & 0xffff,
 	       regs->int_code >> 17, ++die_counter);
 #ifdef CONFIG_PREEMPT
-	printk("PREEMPT ");
+	pr_cont("PREEMPT ");
 #endif
 #ifdef CONFIG_SMP
-	printk("SMP ");
+	pr_cont("SMP ");
 #endif
 	if (debug_pagealloc_enabled())
-		printk("DEBUG_PAGEALLOC");
-	printk("\n");
+		pr_cont("DEBUG_PAGEALLOC");
+	pr_cont("\n");
 	notify_die(DIE_OOPS, str, regs, 0, regs->int_code & 0xffff, SIGSEGV);
 	print_modules();
 	show_regs(regs);
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index 17431f6..955a7b6 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -222,7 +222,7 @@
 }
 arch_initcall(service_level_perf_register);
 
-static int __perf_callchain_kernel(void *data, unsigned long address)
+static int __perf_callchain_kernel(void *data, unsigned long address, int reliable)
 {
 	struct perf_callchain_entry_ctx *entry = data;
 
diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c
index 44f84b2..355db9d 100644
--- a/arch/s390/kernel/stacktrace.c
+++ b/arch/s390/kernel/stacktrace.c
@@ -27,12 +27,12 @@
 	return 1;
 }
 
-static int save_address(void *data, unsigned long address)
+static int save_address(void *data, unsigned long address, int reliable)
 {
 	return __save_address(data, address, 0);
 }
 
-static int save_address_nosched(void *data, unsigned long address)
+static int save_address_nosched(void *data, unsigned long address, int reliable)
 {
 	return __save_address(data, address, 1);
 }
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 000e6e91..3667d20 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -62,9 +62,11 @@
 
 	. = ALIGN(PAGE_SIZE);
 	__start_ro_after_init = .;
+	__start_data_ro_after_init = .;
 	.data..ro_after_init : {
 		 *(.data..ro_after_init)
 	}
+	__end_data_ro_after_init = .;
 	EXCEPTION_TABLE(16)
 	. = ALIGN(PAGE_SIZE);
 	__end_ro_after_init = .;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 1cab8a1..7a27eeb 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -119,8 +119,13 @@
 
 	vcpu->stat.exit_validity++;
 	trace_kvm_s390_intercept_validity(vcpu, viwhy);
-	WARN_ONCE(true, "kvm: unhandled validity intercept 0x%x\n", viwhy);
-	return -EOPNOTSUPP;
+	KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%pK)", viwhy,
+		  current->pid, vcpu->kvm);
+
+	/* do not warn on invalid runtime instrumentation mode */
+	WARN_ONCE(viwhy != 0x44, "kvm: unhandled validity intercept 0x%x\n",
+		  viwhy);
+	return -EINVAL;
 }
 
 static int handle_instruction(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c
index bd98b7d..05c98bb 100644
--- a/arch/s390/kvm/sthyi.c
+++ b/arch/s390/kvm/sthyi.c
@@ -315,7 +315,7 @@
 	if (r < 0)
 		goto out;
 
-	diag224_buf = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA);
+	diag224_buf = (void *)__get_free_page(GFP_KERNEL | GFP_DMA);
 	if (!diag224_buf || diag224(diag224_buf))
 		goto out;
 
@@ -378,7 +378,7 @@
 	sctns->par.infpval1 |= PAR_WGHT_VLD;
 
 out:
-	kfree(diag224_buf);
+	free_page((unsigned long)diag224_buf);
 	vfree(diag204_buf);
 }
 
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index adb0c34..18d4107e 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -266,7 +266,8 @@
 	/* Try to get the remaining pages with get_user_pages */
 	start += nr << PAGE_SHIFT;
 	pages += nr;
-	ret = get_user_pages_unlocked(start, nr_pages - nr, write, 0, pages);
+	ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
+				      write ? FOLL_WRITE : 0);
 	/* Have to be a bit careful with return values */
 	if (nr > 0)
 		ret = (ret < 0) ? nr : ret + nr;
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index cd404aa..4a0c5bc 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -217,6 +217,7 @@
 	} else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) {
 		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
 	} else {
+		hugetlb_bad_size();
 		pr_err("hugepagesz= specifies an unsupported page size %s\n",
 			string);
 		return 0;
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index f56a39b..b3e9d18 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -151,36 +151,40 @@
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
-	unsigned long normal_end_pfn = PFN_DOWN(memblock_end_of_DRAM());
-	unsigned long dma_end_pfn = PFN_DOWN(MAX_DMA_ADDRESS);
+	unsigned long zone_start_pfn, zone_end_pfn, nr_pages;
 	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long size_pages = PFN_DOWN(size);
-	unsigned long nr_pages;
-	int rc, zone_enum;
+	pg_data_t *pgdat = NODE_DATA(nid);
+	struct zone *zone;
+	int rc, i;
 
 	rc = vmem_add_mapping(start, size);
 	if (rc)
 		return rc;
 
-	while (size_pages > 0) {
-		if (start_pfn < dma_end_pfn) {
-			nr_pages = (start_pfn + size_pages > dma_end_pfn) ?
-				   dma_end_pfn - start_pfn : size_pages;
-			zone_enum = ZONE_DMA;
-		} else if (start_pfn < normal_end_pfn) {
-			nr_pages = (start_pfn + size_pages > normal_end_pfn) ?
-				   normal_end_pfn - start_pfn : size_pages;
-			zone_enum = ZONE_NORMAL;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zone = pgdat->node_zones + i;
+		if (zone_idx(zone) != ZONE_MOVABLE) {
+			/* Add range within existing zone limits, if possible */
+			zone_start_pfn = zone->zone_start_pfn;
+			zone_end_pfn = zone->zone_start_pfn +
+				       zone->spanned_pages;
 		} else {
-			nr_pages = size_pages;
-			zone_enum = ZONE_MOVABLE;
+			/* Add remaining range to ZONE_MOVABLE */
+			zone_start_pfn = start_pfn;
+			zone_end_pfn = start_pfn + size_pages;
 		}
-		rc = __add_pages(nid, NODE_DATA(nid)->node_zones + zone_enum,
-				 start_pfn, size_pages);
+		if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn)
+			continue;
+		nr_pages = (start_pfn + size_pages > zone_end_pfn) ?
+			   zone_end_pfn - start_pfn : size_pages;
+		rc = __add_pages(nid, zone, start_pfn, nr_pages);
 		if (rc)
 			break;
 		start_pfn += nr_pages;
 		size_pages -= nr_pages;
+		if (!size_pages)
+			break;
 	}
 	if (rc)
 		vmem_remove_mapping(start, size);
diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c
index 16f4c39..9a4de45 100644
--- a/arch/s390/oprofile/init.c
+++ b/arch/s390/oprofile/init.c
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <asm/processor.h>
 
-static int __s390_backtrace(void *data, unsigned long address)
+static int __s390_backtrace(void *data, unsigned long address, int reliable)
 {
 	unsigned int *depth = data;
 
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 7350c8b..6b2f72f 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -423,7 +423,7 @@
 	dma_addr_t dma_addr_base, dma_addr;
 	int flags = ZPCI_PTE_VALID;
 	struct scatterlist *s;
-	unsigned long pa;
+	unsigned long pa = 0;
 	int ret;
 
 	size = PAGE_ALIGN(size);
diff --git a/arch/score/kernel/ptrace.c b/arch/score/kernel/ptrace.c
index 5583618..4f7314d 100644
--- a/arch/score/kernel/ptrace.c
+++ b/arch/score/kernel/ptrace.c
@@ -131,7 +131,7 @@
 {
 	int copied;
 
-	copied = access_process_vm(child, addr, res, sizeof(*res), 0);
+	copied = access_process_vm(child, addr, res, sizeof(*res), FOLL_FORCE);
 
 	return copied != sizeof(*res) ? -EIO : 0;
 }
@@ -142,7 +142,7 @@
 {
 	int copied;
 
-	copied = access_process_vm(child, addr, res, sizeof(*res), 0);
+	copied = access_process_vm(child, addr, res, sizeof(*res), FOLL_FORCE);
 
 	return copied != sizeof(*res) ? -EIO : 0;
 }
@@ -153,7 +153,8 @@
 {
 	int copied;
 
-	copied = access_process_vm(child, addr, &val, sizeof(val), 1);
+	copied = access_process_vm(child, addr, &val, sizeof(val),
+			FOLL_FORCE | FOLL_WRITE);
 
 	return copied != sizeof(val) ? -EIO : 0;
 }
@@ -164,7 +165,8 @@
 {
 	int copied;
 
-	copied = access_process_vm(child, addr, &val, sizeof(val), 1);
+	copied = access_process_vm(child, addr, &val, sizeof(val),
+			FOLL_FORCE | FOLL_WRITE);
 
 	return copied != sizeof(val) ? -EIO : 0;
 }
diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index 0047666..336f33a 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -31,7 +31,7 @@
 endif
 
 cflags-$(CONFIG_CPU_SH2)		:= $(call cc-option,-m2,)
-cflags-$(CONFIG_CPU_J2)			:= $(call cc-option,-mj2,)
+cflags-$(CONFIG_CPU_J2)			+= $(call cc-option,-mj2,)
 cflags-$(CONFIG_CPU_SH2A)		+= $(call cc-option,-m2a,) \
 					   $(call cc-option,-m2a-nofpu,) \
 					   $(call cc-option,-m4-nofpu,)
diff --git a/arch/sh/boards/Kconfig b/arch/sh/boards/Kconfig
index e9c2c42..4e21949 100644
--- a/arch/sh/boards/Kconfig
+++ b/arch/sh/boards/Kconfig
@@ -22,6 +22,16 @@
 	  have sufficient driver coverage to use this option; do not
 	  select it if you are using original SuperH hardware.
 
+config SH_JCORE_SOC
+	bool "J-Core SoC"
+	depends on SH_DEVICE_TREE && (CPU_SH2 || CPU_J2)
+	select CLKSRC_JCORE_PIT
+	select JCORE_AIC
+	default y if CPU_J2
+	help
+	  Select this option to include drivers core components of the
+	  J-Core SoC, including interrupt controllers and timers.
+
 config SH_SOLUTION_ENGINE
 	bool "SolutionEngine"
 	select SOLUTION_ENGINE
diff --git a/arch/sh/configs/j2_defconfig b/arch/sh/configs/j2_defconfig
index 94d1eca..2eb81ebe 100644
--- a/arch/sh/configs/j2_defconfig
+++ b/arch/sh/configs/j2_defconfig
@@ -8,6 +8,7 @@
 CONFIG_MEMORY_SIZE=0x04000000
 CONFIG_CPU_BIG_ENDIAN=y
 CONFIG_SH_DEVICE_TREE=y
+CONFIG_SH_JCORE_SOC=y
 CONFIG_HZ_100=y
 CONFIG_CMDLINE_OVERWRITE=y
 CONFIG_CMDLINE="console=ttyUL0 earlycon"
@@ -20,6 +21,7 @@
 CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
 CONFIG_NETDEVICES=y
+CONFIG_SERIAL_EARLYCON=y
 CONFIG_SERIAL_UARTLITE=y
 CONFIG_SERIAL_UARTLITE_CONSOLE=y
 CONFIG_I2C=y
diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c
index 40fa6c8..063c298 100644
--- a/arch/sh/mm/gup.c
+++ b/arch/sh/mm/gup.c
@@ -258,7 +258,8 @@
 		pages += nr;
 
 		ret = get_user_pages_unlocked(start,
-			(end - start) >> PAGE_SHIFT, write, 0, pages);
+			(end - start) >> PAGE_SHIFT, pages,
+			write ? FOLL_WRITE : 0);
 
 		/* Have to be a bit careful with return values */
 		if (nr > 0) {
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index b23c76b..165ecdd 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -43,6 +43,7 @@
 	select ARCH_HAS_SG_CHAIN
 	select CPU_NO_EFFICIENT_FFS
 	select HAVE_ARCH_HARDENED_USERCOPY
+	select PROVE_LOCKING_SMALL if PROVE_LOCKING
 
 config SPARC32
 	def_bool !64BIT
@@ -89,6 +90,14 @@
 config ARCH_PROC_KCORE_TEXT
 	def_bool y
 
+config ARCH_ATU
+	bool
+	default y if SPARC64
+
+config ARCH_DMA_ADDR_T_64BIT
+	bool
+	default y if ARCH_ATU
+
 config IOMMU_HELPER
 	bool
 	default y if SPARC64
@@ -304,6 +313,20 @@
 config ARCH_SPARSEMEM_DEFAULT
 	def_bool y if SPARC64
 
+config FORCE_MAX_ZONEORDER
+	int "Maximum zone order"
+	default "13"
+	help
+	  The kernel memory allocator divides physically contiguous memory
+	  blocks into "zones", where each zone is a power of two number of
+	  pages.  This option selects the largest power of two that the kernel
+	  keeps in the memory allocator.  If you need to allocate very large
+	  blocks of physically contiguous memory, then you may need to
+	  increase this value.
+
+	  This config option is actually maximum order plus one. For example,
+	  a value of 13 means that the largest free memory block is 2^12 pages.
+
 source "mm/Kconfig"
 
 if SPARC64
diff --git a/arch/sparc/include/asm/cpudata_64.h b/arch/sparc/include/asm/cpudata_64.h
index a6cfdab..5b0ed48 100644
--- a/arch/sparc/include/asm/cpudata_64.h
+++ b/arch/sparc/include/asm/cpudata_64.h
@@ -24,9 +24,10 @@
 	unsigned int	icache_line_size;
 	unsigned int	ecache_size;
 	unsigned int	ecache_line_size;
-	unsigned short	sock_id;
+	unsigned short	sock_id;	/* physical package */
 	unsigned short	core_id;
-	int		proc_id;
+	unsigned short  max_cache_id;	/* groupings of highest shared cache */
+	unsigned short	proc_id;	/* strand (aka HW thread) id */
 } cpuinfo_sparc;
 
 DECLARE_PER_CPU(cpuinfo_sparc, __cpu_data);
diff --git a/arch/sparc/include/asm/hypervisor.h b/arch/sparc/include/asm/hypervisor.h
index 666d5ba..73cb897 100644
--- a/arch/sparc/include/asm/hypervisor.h
+++ b/arch/sparc/include/asm/hypervisor.h
@@ -2335,6 +2335,348 @@
  */
 #define HV_FAST_PCI_MSG_SETVALID	0xd3
 
+/* PCI IOMMU v2 definitions and services
+ *
+ * While the PCI IO definitions above is valid IOMMU v2 adds new PCI IO
+ * definitions and services.
+ *
+ *	CTE		Clump Table Entry. First level table entry in the ATU.
+ *
+ *	pci_device_list
+ *			A 32-bit aligned list of pci_devices.
+ *
+ *	pci_device_listp
+ *			real address of a pci_device_list. 32-bit aligned.
+ *
+ *	iotte		IOMMU translation table entry.
+ *
+ *	iotte_attributes
+ *			IO Attributes for IOMMU v2 mappings. In addition to
+ *			read, write IOMMU v2 supports relax ordering
+ *
+ *	io_page_list	A 64-bit aligned list of real addresses. Each real
+ *			address in an io_page_list must be properly aligned
+ *			to the pagesize of the given IOTSB.
+ *
+ *	io_page_list_p	Real address of an io_page_list, 64-bit aligned.
+ *
+ *	IOTSB		IO Translation Storage Buffer. An aligned table of
+ *			IOTTEs. Each IOTSB has a pagesize, table size, and
+ *			virtual address associated with it that must match
+ *			a pagesize and table size supported by the un-derlying
+ *			hardware implementation. The alignment requirements
+ *			for an IOTSB depend on the pagesize used for that IOTSB.
+ *			Each IOTTE in an IOTSB maps one pagesize-sized page.
+ *			The size of the IOTSB dictates how large of a virtual
+ *			address space the IOTSB is capable of mapping.
+ *
+ *	iotsb_handle	An opaque identifier for an IOTSB. A devhandle plus
+ *			iotsb_handle represents a binding of an IOTSB to a
+ *			PCI root complex.
+ *
+ *	iotsb_index	Zero-based IOTTE number within an IOTSB.
+ */
+
+/* The index_count argument consists of two fields:
+ * bits 63:48 #iottes and bits 47:0 iotsb_index
+ */
+#define HV_PCI_IOTSB_INDEX_COUNT(__iottes, __iotsb_index) \
+	(((u64)(__iottes) << 48UL) | ((u64)(__iotsb_index)))
+
+/* pci_iotsb_conf()
+ * TRAP:	HV_FAST_TRAP
+ * FUNCTION:	HV_FAST_PCI_IOTSB_CONF
+ * ARG0:	devhandle
+ * ARG1:	r_addr
+ * ARG2:	size
+ * ARG3:	pagesize
+ * ARG4:	iova
+ * RET0:	status
+ * RET1:	iotsb_handle
+ * ERRORS:	EINVAL		Invalid devhandle, size, iova, or pagesize
+ *		EBADALIGN	r_addr is not properly aligned
+ *		ENORADDR	r_addr is not a valid real address
+ *		ETOOMANY	No further IOTSBs may be configured
+ *		EBUSY		Duplicate devhandle, raddir, iova combination
+ *
+ * Create an IOTSB suitable for the PCI root complex identified by devhandle,
+ * for the DMA virtual address defined by the argument iova.
+ *
+ * r_addr is the properly aligned base address of the IOTSB and size is the
+ * IOTSB (table) size in bytes.The IOTSB is required to be zeroed prior to
+ * being configured. If it contains any values other than zeros then the
+ * behavior is undefined.
+ *
+ * pagesize is the size of each page in the IOTSB. Note that the combination of
+ * size (table size) and pagesize must be valid.
+ *
+ * virt is the DMA virtual address this IOTSB will map.
+ *
+ * If successful, the opaque 64-bit handle iotsb_handle is returned in ret1.
+ * Once configured, privileged access to the IOTSB memory is prohibited and
+ * creates undefined behavior. The only permitted access is indirect via these
+ * services.
+ */
+#define HV_FAST_PCI_IOTSB_CONF		0x190
+
+/* pci_iotsb_info()
+ * TRAP:	HV_FAST_TRAP
+ * FUNCTION:	HV_FAST_PCI_IOTSB_INFO
+ * ARG0:	devhandle
+ * ARG1:	iotsb_handle
+ * RET0:	status
+ * RET1:	r_addr
+ * RET2:	size
+ * RET3:	pagesize
+ * RET4:	iova
+ * RET5:	#bound
+ * ERRORS:	EINVAL	Invalid devhandle or iotsb_handle
+ *
+ * This service returns configuration information about an IOTSB previously
+ * created with pci_iotsb_conf.
+ *
+ * iotsb_handle value 0 may be used with this service to inquire about the
+ * legacy IOTSB that may or may not exist. If the service succeeds, the return
+ * values describe the legacy IOTSB and I/O virtual addresses mapped by that
+ * table. However, the table base address r_addr may contain the value -1 which
+ * indicates a memory range that cannot be accessed or be reclaimed.
+ *
+ * The return value #bound contains the number of PCI devices that iotsb_handle
+ * is currently bound to.
+ */
+#define HV_FAST_PCI_IOTSB_INFO		0x191
+
+/* pci_iotsb_unconf()
+ * TRAP:	HV_FAST_TRAP
+ * FUNCTION:	HV_FAST_PCI_IOTSB_UNCONF
+ * ARG0:	devhandle
+ * ARG1:	iotsb_handle
+ * RET0:	status
+ * ERRORS:	EINVAL	Invalid devhandle or iotsb_handle
+ *		EBUSY	The IOTSB is bound and may not be unconfigured
+ *
+ * This service unconfigures the IOTSB identified by the devhandle and
+ * iotsb_handle arguments, previously created with pci_iotsb_conf.
+ * The IOTSB must not be currently bound to any device or the service will fail
+ *
+ * If the call succeeds, iotsb_handle is no longer valid.
+ */
+#define HV_FAST_PCI_IOTSB_UNCONF	0x192
+
+/* pci_iotsb_bind()
+ * TRAP:	HV_FAST_TRAP
+ * FUNCTION:	HV_FAST_PCI_IOTSB_BIND
+ * ARG0:	devhandle
+ * ARG1:	iotsb_handle
+ * ARG2:	pci_device
+ * RET0:	status
+ * ERRORS:	EINVAL	Invalid devhandle, iotsb_handle, or pci_device
+ *		EBUSY	A PCI function is already bound to an IOTSB at the same
+ *			address range as specified by devhandle, iotsb_handle.
+ *
+ * This service binds the PCI function specified by the argument pci_device to
+ * the IOTSB specified by the arguments devhandle and iotsb_handle.
+ *
+ * The PCI device function is bound to the specified IOTSB with the IOVA range
+ * specified when the IOTSB was configured via pci_iotsb_conf. If the function
+ * is already bound then it is unbound first.
+ */
+#define HV_FAST_PCI_IOTSB_BIND		0x193
+
+/* pci_iotsb_unbind()
+ * TRAP:	HV_FAST_TRAP
+ * FUNCTION:	HV_FAST_PCI_IOTSB_UNBIND
+ * ARG0:	devhandle
+ * ARG1:	iotsb_handle
+ * ARG2:	pci_device
+ * RET0:	status
+ * ERRORS:	EINVAL	Invalid devhandle, iotsb_handle, or pci_device
+ *		ENOMAP	The PCI function was not bound to the specified IOTSB
+ *
+ * This service unbinds the PCI device specified by the argument pci_device
+ * from the IOTSB identified  * by the arguments devhandle and iotsb_handle.
+ *
+ * If the PCI device is not bound to the specified IOTSB then this service will
+ * fail with status ENOMAP
+ */
+#define HV_FAST_PCI_IOTSB_UNBIND	0x194
+
+/* pci_iotsb_get_binding()
+ * TRAP:	HV_FAST_TRAP
+ * FUNCTION:	HV_FAST_PCI_IOTSB_GET_BINDING
+ * ARG0:	devhandle
+ * ARG1:	iotsb_handle
+ * ARG2:	iova
+ * RET0:	status
+ * RET1:	iotsb_handle
+ * ERRORS:	EINVAL	Invalid devhandle, pci_device, or iova
+ *		ENOMAP	The PCI function is not bound to an IOTSB at iova
+ *
+ * This service returns the IOTSB binding, iotsb_handle, for a given pci_device
+ * and DMA virtual address, iova.
+ *
+ * iova must be the base address of a DMA virtual address range as defined by
+ * the iommu-address-ranges property in the root complex device node defined
+ * by the argument devhandle.
+ */
+#define HV_FAST_PCI_IOTSB_GET_BINDING	0x195
+
+/* pci_iotsb_map()
+ * TRAP:	HV_FAST_TRAP
+ * FUNCTION:	HV_FAST_PCI_IOTSB_MAP
+ * ARG0:	devhandle
+ * ARG1:	iotsb_handle
+ * ARG2:	index_count
+ * ARG3:	iotte_attributes
+ * ARG4:	io_page_list_p
+ * RET0:	status
+ * RET1:	#mapped
+ * ERRORS:	EINVAL		Invalid devhandle, iotsb_handle, #iottes,
+ *				iotsb_index or iotte_attributes
+ *		EBADALIGN	Improperly aligned io_page_list_p or I/O page
+ *				address in the I/O page list.
+ *		ENORADDR	Invalid io_page_list_p or I/O page address in
+ *				the I/O page list.
+ *
+ * This service creates and flushes mappings in the IOTSB defined by the
+ * arguments devhandle, iotsb.
+ *
+ * The index_count argument consists of two fields. Bits 63:48 contain #iotte
+ * and bits 47:0 contain iotsb_index
+ *
+ * The first mapping is created in the IOTSB index specified by iotsb_index.
+ * Subsequent mappings are  created at iotsb_index+1 and so on.
+ *
+ * The attributes of each mapping are defined by the argument iotte_attributes.
+ *
+ * The io_page_list_p specifies the real address of the 64-bit-aligned list of
+ * #iottes I/O page addresses. Each page address must be a properly aligned
+ * real address of a page to be mapped in the IOTSB. The first entry in the I/O
+ * page list contains the real address of the first page, the 2nd entry for the
+ * 2nd page, and so on.
+ *
+ * #iottes must be greater than zero.
+ *
+ * The return value #mapped is the actual number of mappings created, which may
+ * be less than or equal to the argument #iottes. If the function returns
+ * successfully with a #mapped value less than the requested #iottes then the
+ * caller should continue to invoke the service with updated iotsb_index,
+ * #iottes, and io_page_list_p arguments until all pages are mapped.
+ *
+ * This service must not be used to demap a mapping. In other words, all
+ * mappings must be valid and have  one or both of the RW attribute bits set.
+ *
+ * Note:
+ * It is implementation-defined whether I/O page real address validity checking
+ * is done at time mappings are established or deferred until they are
+ * accessed.
+ */
+#define HV_FAST_PCI_IOTSB_MAP		0x196
+
+/* pci_iotsb_map_one()
+ * TRAP:	HV_FAST_TRAP
+ * FUNCTION:	HV_FAST_PCI_IOTSB_MAP_ONE
+ * ARG0:	devhandle
+ * ARG1:	iotsb_handle
+ * ARG2:	iotsb_index
+ * ARG3:	iotte_attributes
+ * ARG4:	r_addr
+ * RET0:	status
+ * ERRORS:	EINVAL		Invalid devhandle,iotsb_handle, iotsb_index
+ *				or iotte_attributes
+ *		EBADALIGN	Improperly aligned r_addr
+ *		ENORADDR	Invalid r_addr
+ *
+ * This service creates and flushes a single mapping in the IOTSB defined by the
+ * arguments devhandle, iotsb.
+ *
+ * The mapping for the page at r_addr is created at the IOTSB index specified by
+ * iotsb_index with  the attributes iotte_attributes.
+ *
+ * This service must not be used to demap a mapping. In other words, the mapping
+ * must be valid and have one or both of the RW attribute bits set.
+ *
+ * Note:
+ * It is implementation-defined whether I/O page real address validity checking
+ * is done at time mappings are established or deferred until they are
+ * accessed.
+ */
+#define HV_FAST_PCI_IOTSB_MAP_ONE	0x197
+
+/* pci_iotsb_demap()
+ * TRAP:	HV_FAST_TRAP
+ * FUNCTION:	HV_FAST_PCI_IOTSB_DEMAP
+ * ARG0:	devhandle
+ * ARG1:	iotsb_handle
+ * ARG2:	iotsb_index
+ * ARG3:	#iottes
+ * RET0:	status
+ * RET1:	#unmapped
+ * ERRORS:	EINVAL	Invalid devhandle, iotsb_handle, iotsb_index or #iottes
+ *
+ * This service unmaps and flushes up to #iottes mappings starting at index
+ * iotsb_index from the IOTSB defined by the arguments devhandle, iotsb.
+ *
+ * #iottes must be greater than zero.
+ *
+ * The actual number of IOTTEs unmapped is returned in #unmapped and may be less
+ * than or equal to the requested number of IOTTEs, #iottes.
+ *
+ * If #unmapped is less than #iottes, the caller should continue to invoke this
+ * service with updated iotsb_index and #iottes arguments until all pages are
+ * demapped.
+ */
+#define HV_FAST_PCI_IOTSB_DEMAP		0x198
+
+/* pci_iotsb_getmap()
+ * TRAP:	HV_FAST_TRAP
+ * FUNCTION:	HV_FAST_PCI_IOTSB_GETMAP
+ * ARG0:	devhandle
+ * ARG1:	iotsb_handle
+ * ARG2:	iotsb_index
+ * RET0:	status
+ * RET1:	r_addr
+ * RET2:	iotte_attributes
+ * ERRORS:	EINVAL	Invalid devhandle, iotsb_handle, or iotsb_index
+ *		ENOMAP	No mapping was found
+ *
+ * This service returns the mapping specified by index iotsb_index from the
+ * IOTSB defined by the arguments devhandle, iotsb.
+ *
+ * Upon success, the real address of the mapping shall be returned in
+ * r_addr and thethe IOTTE mapping attributes shall be returned in
+ * iotte_attributes.
+ *
+ * The return value iotte_attributes may not include optional features used in
+ * the call to create the  mapping.
+ */
+#define HV_FAST_PCI_IOTSB_GETMAP	0x199
+
+/* pci_iotsb_sync_mappings()
+ * TRAP:	HV_FAST_TRAP
+ * FUNCTION:	HV_FAST_PCI_IOTSB_SYNC_MAPPINGS
+ * ARG0:	devhandle
+ * ARG1:	iotsb_handle
+ * ARG2:	iotsb_index
+ * ARG3:	#iottes
+ * RET0:	status
+ * RET1:	#synced
+ * ERROS:	EINVAL	Invalid devhandle, iotsb_handle, iotsb_index, or #iottes
+ *
+ * This service synchronizes #iottes mappings starting at index iotsb_index in
+ * the IOTSB defined by the arguments devhandle, iotsb.
+ *
+ * #iottes must be greater than zero.
+ *
+ * The actual number of IOTTEs synchronized is returned in #synced, which may
+ * be less than or equal to the requested number, #iottes.
+ *
+ * Upon a successful return, #synced is less than #iottes, the caller should
+ * continue to invoke this service with updated iotsb_index and #iottes
+ * arguments until all pages are synchronized.
+ */
+#define HV_FAST_PCI_IOTSB_SYNC_MAPPINGS	0x19a
+
 /* Logical Domain Channel services.  */
 
 #define LDC_CHANNEL_DOWN		0
@@ -2993,6 +3335,7 @@
 #define HV_GRP_SDIO			0x0108
 #define HV_GRP_SDIO_ERR			0x0109
 #define HV_GRP_REBOOT_DATA		0x0110
+#define HV_GRP_ATU			0x0111
 #define HV_GRP_M7_PERF			0x0114
 #define HV_GRP_NIAG_PERF		0x0200
 #define HV_GRP_FIRE_PERF		0x0201
diff --git a/arch/sparc/include/asm/iommu_64.h b/arch/sparc/include/asm/iommu_64.h
index cd0d69f..f24f356 100644
--- a/arch/sparc/include/asm/iommu_64.h
+++ b/arch/sparc/include/asm/iommu_64.h
@@ -24,8 +24,36 @@
 	unsigned int	limit;
 };
 
+#define ATU_64_SPACE_SIZE 0x800000000 /* 32G */
+
+/* Data structures for SPARC ATU architecture */
+struct atu_iotsb {
+	void	*table;		/* IOTSB table base virtual addr*/
+	u64	ra;		/* IOTSB table real addr */
+	u64	dvma_size;	/* ranges[3].size or OS slected 32G size */
+	u64	dvma_base;	/* ranges[3].base */
+	u64	table_size;	/* IOTSB table size */
+	u64	page_size;	/* IO PAGE size for IOTSB */
+	u32	iotsb_num;	/* tsbnum is same as iotsb_handle */
+};
+
+struct atu_ranges {
+	u64	base;
+	u64	size;
+};
+
+struct atu {
+	struct	atu_ranges	*ranges;
+	struct	atu_iotsb	*iotsb;
+	struct	iommu_map_table	tbl;
+	u64			base;
+	u64			size;
+	u64			dma_addr_mask;
+};
+
 struct iommu {
 	struct iommu_map_table	tbl;
+	struct atu		*atu;
 	spinlock_t		lock;
 	u32			dma_addr_mask;
 	iopte_t			*page_table;
diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h
index d9c5876..8011e79 100644
--- a/arch/sparc/include/asm/spinlock_32.h
+++ b/arch/sparc/include/asm/spinlock_32.h
@@ -134,7 +134,7 @@
 	*(volatile __u32 *)&lp->lock = ~0U;
 }
 
-static void inline arch_write_unlock(arch_rwlock_t *lock)
+static inline void arch_write_unlock(arch_rwlock_t *lock)
 {
 	__asm__ __volatile__(
 "	st		%%g0, [%0]"
diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h
index 87990b7..07c9f2e 100644
--- a/arch/sparc/include/asm/spinlock_64.h
+++ b/arch/sparc/include/asm/spinlock_64.h
@@ -96,7 +96,7 @@
 
 /* Multi-reader locks, these are much saner than the 32-bit Sparc ones... */
 
-static void inline arch_read_lock(arch_rwlock_t *lock)
+static inline void arch_read_lock(arch_rwlock_t *lock)
 {
 	unsigned long tmp1, tmp2;
 
@@ -119,7 +119,7 @@
 	: "memory");
 }
 
-static int inline arch_read_trylock(arch_rwlock_t *lock)
+static inline int arch_read_trylock(arch_rwlock_t *lock)
 {
 	int tmp1, tmp2;
 
@@ -140,7 +140,7 @@
 	return tmp1;
 }
 
-static void inline arch_read_unlock(arch_rwlock_t *lock)
+static inline void arch_read_unlock(arch_rwlock_t *lock)
 {
 	unsigned long tmp1, tmp2;
 
@@ -156,7 +156,7 @@
 	: "memory");
 }
 
-static void inline arch_write_lock(arch_rwlock_t *lock)
+static inline void arch_write_lock(arch_rwlock_t *lock)
 {
 	unsigned long mask, tmp1, tmp2;
 
@@ -181,7 +181,7 @@
 	: "memory");
 }
 
-static void inline arch_write_unlock(arch_rwlock_t *lock)
+static inline void arch_write_unlock(arch_rwlock_t *lock)
 {
 	__asm__ __volatile__(
 "	stw		%%g0, [%0]"
@@ -190,7 +190,7 @@
 	: "memory");
 }
 
-static int inline arch_write_trylock(arch_rwlock_t *lock)
+static inline int arch_write_trylock(arch_rwlock_t *lock)
 {
 	unsigned long mask, tmp1, tmp2, result;
 
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index bec481a..7b4898a 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -44,14 +44,20 @@
 #define topology_physical_package_id(cpu)	(cpu_data(cpu).proc_id)
 #define topology_core_id(cpu)			(cpu_data(cpu).core_id)
 #define topology_core_cpumask(cpu)		(&cpu_core_sib_map[cpu])
+#define topology_core_cache_cpumask(cpu)	(&cpu_core_sib_cache_map[cpu])
 #define topology_sibling_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
 #endif /* CONFIG_SMP */
 
 extern cpumask_t cpu_core_map[NR_CPUS];
 extern cpumask_t cpu_core_sib_map[NR_CPUS];
+extern cpumask_t cpu_core_sib_cache_map[NR_CPUS];
+
+/**
+ * Return cores that shares the last level cache.
+ */
 static inline const struct cpumask *cpu_coregroup_mask(int cpu)
 {
-        return &cpu_core_map[cpu];
+	return &cpu_core_sib_cache_map[cpu];
 }
 
 #endif /* _ASM_SPARC64_TOPOLOGY_H */
diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h
index b68acc5..5373136 100644
--- a/arch/sparc/include/asm/uaccess_64.h
+++ b/arch/sparc/include/asm/uaccess_64.h
@@ -82,7 +82,6 @@
 	return 1;
 }
 
-void __ret_efault(void);
 void __retl_efault(void);
 
 /* Uh, these should become the main single-value transfer routines..
@@ -189,55 +188,34 @@
 unsigned long __must_check ___copy_from_user(void *to,
 					     const void __user *from,
 					     unsigned long size);
-unsigned long copy_from_user_fixup(void *to, const void __user *from,
-				   unsigned long size);
 static inline unsigned long __must_check
 copy_from_user(void *to, const void __user *from, unsigned long size)
 {
-	unsigned long ret;
-
 	check_object_size(to, size, false);
 
-	ret = ___copy_from_user(to, from, size);
-	if (unlikely(ret))
-		ret = copy_from_user_fixup(to, from, size);
-
-	return ret;
+	return ___copy_from_user(to, from, size);
 }
 #define __copy_from_user copy_from_user
 
 unsigned long __must_check ___copy_to_user(void __user *to,
 					   const void *from,
 					   unsigned long size);
-unsigned long copy_to_user_fixup(void __user *to, const void *from,
-				 unsigned long size);
 static inline unsigned long __must_check
 copy_to_user(void __user *to, const void *from, unsigned long size)
 {
-	unsigned long ret;
-
 	check_object_size(from, size, true);
 
-	ret = ___copy_to_user(to, from, size);
-	if (unlikely(ret))
-		ret = copy_to_user_fixup(to, from, size);
-	return ret;
+	return ___copy_to_user(to, from, size);
 }
 #define __copy_to_user copy_to_user
 
 unsigned long __must_check ___copy_in_user(void __user *to,
 					   const void __user *from,
 					   unsigned long size);
-unsigned long copy_in_user_fixup(void __user *to, void __user *from,
-				 unsigned long size);
 static inline unsigned long __must_check
 copy_in_user(void __user *to, void __user *from, unsigned long size)
 {
-	unsigned long ret = ___copy_in_user(to, from, size);
-
-	if (unlikely(ret))
-		ret = copy_in_user_fixup(to, from, size);
-	return ret;
+	return ___copy_in_user(to, from, size);
 }
 #define __copy_in_user copy_in_user
 
diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S
index beba6c1..6aa3da1 100644
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -926,48 +926,11 @@
 EXPORT_SYMBOL(tlb_type)
 	.section	".fixup",#alloc,#execinstr
 
-	.globl	__ret_efault, __retl_efault, __ret_one, __retl_one
-ENTRY(__ret_efault)
-	ret
-	 restore %g0, -EFAULT, %o0
-ENDPROC(__ret_efault)
-EXPORT_SYMBOL(__ret_efault)
-
 ENTRY(__retl_efault)
 	retl
 	 mov	-EFAULT, %o0
 ENDPROC(__retl_efault)
 
-ENTRY(__retl_one)
-	retl
-	 mov	1, %o0
-ENDPROC(__retl_one)
-
-ENTRY(__retl_one_fp)
-	VISExitHalf
-	retl
-	 mov	1, %o0
-ENDPROC(__retl_one_fp)
-
-ENTRY(__ret_one_asi)
-	wr	%g0, ASI_AIUS, %asi
-	ret
-	 restore %g0, 1, %o0
-ENDPROC(__ret_one_asi)
-
-ENTRY(__retl_one_asi)
-	wr	%g0, ASI_AIUS, %asi
-	retl
-	 mov	1, %o0
-ENDPROC(__retl_one_asi)
-
-ENTRY(__retl_one_asi_fp)
-	wr	%g0, ASI_AIUS, %asi
-	VISExitHalf
-	retl
-	 mov	1, %o0
-ENDPROC(__retl_one_asi_fp)
-
 ENTRY(__retl_o1)
 	retl
 	 mov	%o1, %o0
diff --git a/arch/sparc/kernel/hvapi.c b/arch/sparc/kernel/hvapi.c
index 662500f..2677312 100644
--- a/arch/sparc/kernel/hvapi.c
+++ b/arch/sparc/kernel/hvapi.c
@@ -39,6 +39,7 @@
 	{ .group = HV_GRP_SDIO,					},
 	{ .group = HV_GRP_SDIO_ERR,				},
 	{ .group = HV_GRP_REBOOT_DATA,				},
+	{ .group = HV_GRP_ATU,		.flags = FLAG_PRE_API	},
 	{ .group = HV_GRP_NIAG_PERF,	.flags = FLAG_PRE_API	},
 	{ .group = HV_GRP_FIRE_PERF,				},
 	{ .group = HV_GRP_N2_CPU,				},
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 5c615ab..852a329 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -760,8 +760,12 @@
 	struct iommu *iommu = dev->archdata.iommu;
 	u64 dma_addr_mask = iommu->dma_addr_mask;
 
-	if (device_mask >= (1UL << 32UL))
-		return 0;
+	if (device_mask > DMA_BIT_MASK(32)) {
+		if (iommu->atu)
+			dma_addr_mask = iommu->atu->dma_addr_mask;
+		else
+			return 0;
+	}
 
 	if ((device_mask & dma_addr_mask) == dma_addr_mask)
 		return 1;
diff --git a/arch/sparc/kernel/iommu_common.h b/arch/sparc/kernel/iommu_common.h
index b40cec2..8284933 100644
--- a/arch/sparc/kernel/iommu_common.h
+++ b/arch/sparc/kernel/iommu_common.h
@@ -13,7 +13,6 @@
 #include <linux/scatterlist.h>
 #include <linux/device.h>
 #include <linux/iommu-helper.h>
-#include <linux/scatterlist.h>
 
 #include <asm/iommu.h>
 
diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c
index 59bbeff..07933b9 100644
--- a/arch/sparc/kernel/jump_label.c
+++ b/arch/sparc/kernel/jump_label.c
@@ -13,19 +13,30 @@
 void arch_jump_label_transform(struct jump_entry *entry,
 			       enum jump_label_type type)
 {
-	u32 val;
 	u32 *insn = (u32 *) (unsigned long) entry->code;
+	u32 val;
 
 	if (type == JUMP_LABEL_JMP) {
 		s32 off = (s32)entry->target - (s32)entry->code;
+		bool use_v9_branch = false;
+
+		BUG_ON(off & 3);
 
 #ifdef CONFIG_SPARC64
-		/* ba,pt %xcc, . + (off << 2) */
-		val = 0x10680000 | ((u32) off >> 2);
-#else
-		/* ba . + (off << 2) */
-		val = 0x10800000 | ((u32) off >> 2);
+		if (off <= 0xfffff && off >= -0x100000)
+			use_v9_branch = true;
 #endif
+		if (use_v9_branch) {
+			/* WDISP19 - target is . + immed << 2 */
+			/* ba,pt %xcc, . + off */
+			val = 0x10680000 | (((u32) off >> 2) & 0x7ffff);
+		} else {
+			/* WDISP22 - target is . + immed << 2 */
+			BUG_ON(off > 0x7fffff);
+			BUG_ON(off < -0x800000);
+			/* ba . + off */
+			val = 0x10800000 | (((u32) off >> 2) & 0x3fffff);
+		}
 	} else {
 		val = 0x01000000;
 	}
diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index 1122886..8a6982d 100644
--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -645,13 +645,20 @@
 		cpu_data(*id).core_id = core_id;
 }
 
-static void __mark_sock_id(struct mdesc_handle *hp, u64 node,
-			   int sock_id)
+static void __mark_max_cache_id(struct mdesc_handle *hp, u64 node,
+				int max_cache_id)
 {
 	const u64 *id = mdesc_get_property(hp, node, "id", NULL);
 
-	if (*id < num_possible_cpus())
-		cpu_data(*id).sock_id = sock_id;
+	if (*id < num_possible_cpus()) {
+		cpu_data(*id).max_cache_id = max_cache_id;
+
+		/**
+		 * On systems without explicit socket descriptions socket
+		 * is max_cache_id
+		 */
+		cpu_data(*id).sock_id = max_cache_id;
+	}
 }
 
 static void mark_core_ids(struct mdesc_handle *hp, u64 mp,
@@ -660,10 +667,11 @@
 	find_back_node_value(hp, mp, "cpu", __mark_core_id, core_id, 10);
 }
 
-static void mark_sock_ids(struct mdesc_handle *hp, u64 mp,
-			  int sock_id)
+static void mark_max_cache_ids(struct mdesc_handle *hp, u64 mp,
+			       int max_cache_id)
 {
-	find_back_node_value(hp, mp, "cpu", __mark_sock_id, sock_id, 10);
+	find_back_node_value(hp, mp, "cpu", __mark_max_cache_id,
+			     max_cache_id, 10);
 }
 
 static void set_core_ids(struct mdesc_handle *hp)
@@ -694,14 +702,15 @@
 	}
 }
 
-static int set_sock_ids_by_cache(struct mdesc_handle *hp, int level)
+static int set_max_cache_ids_by_cache(struct mdesc_handle *hp, int level)
 {
 	u64 mp;
 	int idx = 1;
 	int fnd = 0;
 
-	/* Identify unique sockets by looking for cpus backpointed to by
-	 * shared level n caches.
+	/**
+	 * Identify unique highest level of shared cache by looking for cpus
+	 * backpointed to by shared level N caches.
 	 */
 	mdesc_for_each_node_by_name(hp, mp, "cache") {
 		const u64 *cur_lvl;
@@ -709,8 +718,7 @@
 		cur_lvl = mdesc_get_property(hp, mp, "level", NULL);
 		if (*cur_lvl != level)
 			continue;
-
-		mark_sock_ids(hp, mp, idx);
+		mark_max_cache_ids(hp, mp, idx);
 		idx++;
 		fnd = 1;
 	}
@@ -745,15 +753,17 @@
 {
 	u64 mp;
 
-	/* If machine description exposes sockets data use it.
-	 * Otherwise fallback to use shared L3 or L2 caches.
+	/**
+	 * Find the highest level of shared cache which pre-T7 is also
+	 * the socket.
 	 */
+	if (!set_max_cache_ids_by_cache(hp, 3))
+		set_max_cache_ids_by_cache(hp, 2);
+
+	/* If machine description exposes sockets data use it.*/
 	mp = mdesc_node_by_name(hp, MDESC_NODE_NULL, "sockets");
 	if (mp != MDESC_NODE_NULL)
-		return set_sock_ids_by_socket(hp, mp);
-
-	if (!set_sock_ids_by_cache(hp, 3))
-		set_sock_ids_by_cache(hp, 2);
+		set_sock_ids_by_socket(hp, mp);
 }
 
 static void mark_proc_ids(struct mdesc_handle *hp, u64 mp, int proc_id)
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index db57d8a..06981cc 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -44,6 +44,9 @@
 	{ .major = 1, .minor = 1 },
 };
 
+static unsigned long vatu_major = 1;
+static unsigned long vatu_minor = 1;
+
 #define PGLIST_NENTS	(PAGE_SIZE / sizeof(u64))
 
 struct iommu_batch {
@@ -69,34 +72,57 @@
 }
 
 /* Interrupts must be disabled.  */
-static long iommu_batch_flush(struct iommu_batch *p)
+static long iommu_batch_flush(struct iommu_batch *p, u64 mask)
 {
 	struct pci_pbm_info *pbm = p->dev->archdata.host_controller;
+	u64 *pglist = p->pglist;
+	u64 index_count;
 	unsigned long devhandle = pbm->devhandle;
 	unsigned long prot = p->prot;
 	unsigned long entry = p->entry;
-	u64 *pglist = p->pglist;
 	unsigned long npages = p->npages;
+	unsigned long iotsb_num;
+	unsigned long ret;
+	long num;
 
 	/* VPCI maj=1, min=[0,1] only supports read and write */
 	if (vpci_major < 2)
 		prot &= (HV_PCI_MAP_ATTR_READ | HV_PCI_MAP_ATTR_WRITE);
 
 	while (npages != 0) {
-		long num;
-
-		num = pci_sun4v_iommu_map(devhandle, HV_PCI_TSBID(0, entry),
-					  npages, prot, __pa(pglist));
-		if (unlikely(num < 0)) {
-			if (printk_ratelimit())
-				printk("iommu_batch_flush: IOMMU map of "
-				       "[%08lx:%08llx:%lx:%lx:%lx] failed with "
-				       "status %ld\n",
-				       devhandle, HV_PCI_TSBID(0, entry),
-				       npages, prot, __pa(pglist), num);
-			return -1;
+		if (mask <= DMA_BIT_MASK(32)) {
+			num = pci_sun4v_iommu_map(devhandle,
+						  HV_PCI_TSBID(0, entry),
+						  npages,
+						  prot,
+						  __pa(pglist));
+			if (unlikely(num < 0)) {
+				pr_err_ratelimited("%s: IOMMU map of [%08lx:%08llx:%lx:%lx:%lx] failed with status %ld\n",
+						   __func__,
+						   devhandle,
+						   HV_PCI_TSBID(0, entry),
+						   npages, prot, __pa(pglist),
+						   num);
+				return -1;
+			}
+		} else {
+			index_count = HV_PCI_IOTSB_INDEX_COUNT(npages, entry),
+			iotsb_num = pbm->iommu->atu->iotsb->iotsb_num;
+			ret = pci_sun4v_iotsb_map(devhandle,
+						  iotsb_num,
+						  index_count,
+						  prot,
+						  __pa(pglist),
+						  &num);
+			if (unlikely(ret != HV_EOK)) {
+				pr_err_ratelimited("%s: ATU map of [%08lx:%lx:%llx:%lx:%lx] failed with status %ld\n",
+						   __func__,
+						   devhandle, iotsb_num,
+						   index_count, prot,
+						   __pa(pglist), ret);
+				return -1;
+			}
 		}
-
 		entry += num;
 		npages -= num;
 		pglist += num;
@@ -108,19 +134,19 @@
 	return 0;
 }
 
-static inline void iommu_batch_new_entry(unsigned long entry)
+static inline void iommu_batch_new_entry(unsigned long entry, u64 mask)
 {
 	struct iommu_batch *p = this_cpu_ptr(&iommu_batch);
 
 	if (p->entry + p->npages == entry)
 		return;
 	if (p->entry != ~0UL)
-		iommu_batch_flush(p);
+		iommu_batch_flush(p, mask);
 	p->entry = entry;
 }
 
 /* Interrupts must be disabled.  */
-static inline long iommu_batch_add(u64 phys_page)
+static inline long iommu_batch_add(u64 phys_page, u64 mask)
 {
 	struct iommu_batch *p = this_cpu_ptr(&iommu_batch);
 
@@ -128,28 +154,31 @@
 
 	p->pglist[p->npages++] = phys_page;
 	if (p->npages == PGLIST_NENTS)
-		return iommu_batch_flush(p);
+		return iommu_batch_flush(p, mask);
 
 	return 0;
 }
 
 /* Interrupts must be disabled.  */
-static inline long iommu_batch_end(void)
+static inline long iommu_batch_end(u64 mask)
 {
 	struct iommu_batch *p = this_cpu_ptr(&iommu_batch);
 
 	BUG_ON(p->npages >= PGLIST_NENTS);
 
-	return iommu_batch_flush(p);
+	return iommu_batch_flush(p, mask);
 }
 
 static void *dma_4v_alloc_coherent(struct device *dev, size_t size,
 				   dma_addr_t *dma_addrp, gfp_t gfp,
 				   unsigned long attrs)
 {
+	u64 mask;
 	unsigned long flags, order, first_page, npages, n;
 	unsigned long prot = 0;
 	struct iommu *iommu;
+	struct atu *atu;
+	struct iommu_map_table *tbl;
 	struct page *page;
 	void *ret;
 	long entry;
@@ -174,14 +203,21 @@
 	memset((char *)first_page, 0, PAGE_SIZE << order);
 
 	iommu = dev->archdata.iommu;
+	atu = iommu->atu;
 
-	entry = iommu_tbl_range_alloc(dev, &iommu->tbl, npages, NULL,
+	mask = dev->coherent_dma_mask;
+	if (mask <= DMA_BIT_MASK(32))
+		tbl = &iommu->tbl;
+	else
+		tbl = &atu->tbl;
+
+	entry = iommu_tbl_range_alloc(dev, tbl, npages, NULL,
 				      (unsigned long)(-1), 0);
 
 	if (unlikely(entry == IOMMU_ERROR_CODE))
 		goto range_alloc_fail;
 
-	*dma_addrp = (iommu->tbl.table_map_base + (entry << IO_PAGE_SHIFT));
+	*dma_addrp = (tbl->table_map_base + (entry << IO_PAGE_SHIFT));
 	ret = (void *) first_page;
 	first_page = __pa(first_page);
 
@@ -193,12 +229,12 @@
 			  entry);
 
 	for (n = 0; n < npages; n++) {
-		long err = iommu_batch_add(first_page + (n * PAGE_SIZE));
+		long err = iommu_batch_add(first_page + (n * PAGE_SIZE), mask);
 		if (unlikely(err < 0L))
 			goto iommu_map_fail;
 	}
 
-	if (unlikely(iommu_batch_end() < 0L))
+	if (unlikely(iommu_batch_end(mask) < 0L))
 		goto iommu_map_fail;
 
 	local_irq_restore(flags);
@@ -206,25 +242,71 @@
 	return ret;
 
 iommu_map_fail:
-	iommu_tbl_range_free(&iommu->tbl, *dma_addrp, npages, IOMMU_ERROR_CODE);
+	iommu_tbl_range_free(tbl, *dma_addrp, npages, IOMMU_ERROR_CODE);
 
 range_alloc_fail:
 	free_pages(first_page, order);
 	return NULL;
 }
 
-static void dma_4v_iommu_demap(void *demap_arg, unsigned long entry,
-			       unsigned long npages)
+unsigned long dma_4v_iotsb_bind(unsigned long devhandle,
+				unsigned long iotsb_num,
+				struct pci_bus *bus_dev)
 {
-	u32 devhandle = *(u32 *)demap_arg;
+	struct pci_dev *pdev;
+	unsigned long err;
+	unsigned int bus;
+	unsigned int device;
+	unsigned int fun;
+
+	list_for_each_entry(pdev, &bus_dev->devices, bus_list) {
+		if (pdev->subordinate) {
+			/* No need to bind pci bridge */
+			dma_4v_iotsb_bind(devhandle, iotsb_num,
+					  pdev->subordinate);
+		} else {
+			bus = bus_dev->number;
+			device = PCI_SLOT(pdev->devfn);
+			fun = PCI_FUNC(pdev->devfn);
+			err = pci_sun4v_iotsb_bind(devhandle, iotsb_num,
+						   HV_PCI_DEVICE_BUILD(bus,
+								       device,
+								       fun));
+
+			/* If bind fails for one device it is going to fail
+			 * for rest of the devices because we are sharing
+			 * IOTSB. So in case of failure simply return with
+			 * error.
+			 */
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+static void dma_4v_iommu_demap(struct device *dev, unsigned long devhandle,
+			       dma_addr_t dvma, unsigned long iotsb_num,
+			       unsigned long entry, unsigned long npages)
+{
 	unsigned long num, flags;
+	unsigned long ret;
 
 	local_irq_save(flags);
 	do {
-		num = pci_sun4v_iommu_demap(devhandle,
-					    HV_PCI_TSBID(0, entry),
-					    npages);
-
+		if (dvma <= DMA_BIT_MASK(32)) {
+			num = pci_sun4v_iommu_demap(devhandle,
+						    HV_PCI_TSBID(0, entry),
+						    npages);
+		} else {
+			ret = pci_sun4v_iotsb_demap(devhandle, iotsb_num,
+						    entry, npages, &num);
+			if (unlikely(ret != HV_EOK)) {
+				pr_err_ratelimited("pci_iotsb_demap() failed with error: %ld\n",
+						   ret);
+			}
+		}
 		entry += num;
 		npages -= num;
 	} while (npages != 0);
@@ -236,16 +318,28 @@
 {
 	struct pci_pbm_info *pbm;
 	struct iommu *iommu;
+	struct atu *atu;
+	struct iommu_map_table *tbl;
 	unsigned long order, npages, entry;
+	unsigned long iotsb_num;
 	u32 devhandle;
 
 	npages = IO_PAGE_ALIGN(size) >> IO_PAGE_SHIFT;
 	iommu = dev->archdata.iommu;
 	pbm = dev->archdata.host_controller;
+	atu = iommu->atu;
 	devhandle = pbm->devhandle;
-	entry = ((dvma - iommu->tbl.table_map_base) >> IO_PAGE_SHIFT);
-	dma_4v_iommu_demap(&devhandle, entry, npages);
-	iommu_tbl_range_free(&iommu->tbl, dvma, npages, IOMMU_ERROR_CODE);
+
+	if (dvma <= DMA_BIT_MASK(32)) {
+		tbl = &iommu->tbl;
+		iotsb_num = 0; /* we don't care for legacy iommu */
+	} else {
+		tbl = &atu->tbl;
+		iotsb_num = atu->iotsb->iotsb_num;
+	}
+	entry = ((dvma - tbl->table_map_base) >> IO_PAGE_SHIFT);
+	dma_4v_iommu_demap(dev, devhandle, dvma, iotsb_num, entry, npages);
+	iommu_tbl_range_free(tbl, dvma, npages, IOMMU_ERROR_CODE);
 	order = get_order(size);
 	if (order < 10)
 		free_pages((unsigned long)cpu, order);
@@ -257,13 +351,17 @@
 				  unsigned long attrs)
 {
 	struct iommu *iommu;
+	struct atu *atu;
+	struct iommu_map_table *tbl;
+	u64 mask;
 	unsigned long flags, npages, oaddr;
 	unsigned long i, base_paddr;
-	u32 bus_addr, ret;
 	unsigned long prot;
+	dma_addr_t bus_addr, ret;
 	long entry;
 
 	iommu = dev->archdata.iommu;
+	atu = iommu->atu;
 
 	if (unlikely(direction == DMA_NONE))
 		goto bad;
@@ -272,13 +370,19 @@
 	npages = IO_PAGE_ALIGN(oaddr + sz) - (oaddr & IO_PAGE_MASK);
 	npages >>= IO_PAGE_SHIFT;
 
-	entry = iommu_tbl_range_alloc(dev, &iommu->tbl, npages, NULL,
+	mask = *dev->dma_mask;
+	if (mask <= DMA_BIT_MASK(32))
+		tbl = &iommu->tbl;
+	else
+		tbl = &atu->tbl;
+
+	entry = iommu_tbl_range_alloc(dev, tbl, npages, NULL,
 				      (unsigned long)(-1), 0);
 
 	if (unlikely(entry == IOMMU_ERROR_CODE))
 		goto bad;
 
-	bus_addr = (iommu->tbl.table_map_base + (entry << IO_PAGE_SHIFT));
+	bus_addr = (tbl->table_map_base + (entry << IO_PAGE_SHIFT));
 	ret = bus_addr | (oaddr & ~IO_PAGE_MASK);
 	base_paddr = __pa(oaddr & IO_PAGE_MASK);
 	prot = HV_PCI_MAP_ATTR_READ;
@@ -293,11 +397,11 @@
 	iommu_batch_start(dev, prot, entry);
 
 	for (i = 0; i < npages; i++, base_paddr += IO_PAGE_SIZE) {
-		long err = iommu_batch_add(base_paddr);
+		long err = iommu_batch_add(base_paddr, mask);
 		if (unlikely(err < 0L))
 			goto iommu_map_fail;
 	}
-	if (unlikely(iommu_batch_end() < 0L))
+	if (unlikely(iommu_batch_end(mask) < 0L))
 		goto iommu_map_fail;
 
 	local_irq_restore(flags);
@@ -310,7 +414,7 @@
 	return DMA_ERROR_CODE;
 
 iommu_map_fail:
-	iommu_tbl_range_free(&iommu->tbl, bus_addr, npages, IOMMU_ERROR_CODE);
+	iommu_tbl_range_free(tbl, bus_addr, npages, IOMMU_ERROR_CODE);
 	return DMA_ERROR_CODE;
 }
 
@@ -320,7 +424,10 @@
 {
 	struct pci_pbm_info *pbm;
 	struct iommu *iommu;
+	struct atu *atu;
+	struct iommu_map_table *tbl;
 	unsigned long npages;
+	unsigned long iotsb_num;
 	long entry;
 	u32 devhandle;
 
@@ -332,14 +439,23 @@
 
 	iommu = dev->archdata.iommu;
 	pbm = dev->archdata.host_controller;
+	atu = iommu->atu;
 	devhandle = pbm->devhandle;
 
 	npages = IO_PAGE_ALIGN(bus_addr + sz) - (bus_addr & IO_PAGE_MASK);
 	npages >>= IO_PAGE_SHIFT;
 	bus_addr &= IO_PAGE_MASK;
-	entry = (bus_addr - iommu->tbl.table_map_base) >> IO_PAGE_SHIFT;
-	dma_4v_iommu_demap(&devhandle, entry, npages);
-	iommu_tbl_range_free(&iommu->tbl, bus_addr, npages, IOMMU_ERROR_CODE);
+
+	if (bus_addr <= DMA_BIT_MASK(32)) {
+		iotsb_num = 0; /* we don't care for legacy iommu */
+		tbl = &iommu->tbl;
+	} else {
+		iotsb_num = atu->iotsb->iotsb_num;
+		tbl = &atu->tbl;
+	}
+	entry = (bus_addr - tbl->table_map_base) >> IO_PAGE_SHIFT;
+	dma_4v_iommu_demap(dev, devhandle, bus_addr, iotsb_num, entry, npages);
+	iommu_tbl_range_free(tbl, bus_addr, npages, IOMMU_ERROR_CODE);
 }
 
 static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -353,12 +469,17 @@
 	unsigned long seg_boundary_size;
 	int outcount, incount, i;
 	struct iommu *iommu;
+	struct atu *atu;
+	struct iommu_map_table *tbl;
+	u64 mask;
 	unsigned long base_shift;
 	long err;
 
 	BUG_ON(direction == DMA_NONE);
 
 	iommu = dev->archdata.iommu;
+	atu = iommu->atu;
+
 	if (nelems == 0 || !iommu)
 		return 0;
 	
@@ -384,7 +505,15 @@
 	max_seg_size = dma_get_max_seg_size(dev);
 	seg_boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
 				  IO_PAGE_SIZE) >> IO_PAGE_SHIFT;
-	base_shift = iommu->tbl.table_map_base >> IO_PAGE_SHIFT;
+
+	mask = *dev->dma_mask;
+	if (mask <= DMA_BIT_MASK(32))
+		tbl = &iommu->tbl;
+	else
+		tbl = &atu->tbl;
+
+	base_shift = tbl->table_map_base >> IO_PAGE_SHIFT;
+
 	for_each_sg(sglist, s, nelems, i) {
 		unsigned long paddr, npages, entry, out_entry = 0, slen;
 
@@ -397,27 +526,26 @@
 		/* Allocate iommu entries for that segment */
 		paddr = (unsigned long) SG_ENT_PHYS_ADDRESS(s);
 		npages = iommu_num_pages(paddr, slen, IO_PAGE_SIZE);
-		entry = iommu_tbl_range_alloc(dev, &iommu->tbl, npages,
+		entry = iommu_tbl_range_alloc(dev, tbl, npages,
 					      &handle, (unsigned long)(-1), 0);
 
 		/* Handle failure */
 		if (unlikely(entry == IOMMU_ERROR_CODE)) {
-			if (printk_ratelimit())
-				printk(KERN_INFO "iommu_alloc failed, iommu %p paddr %lx"
-				       " npages %lx\n", iommu, paddr, npages);
+			pr_err_ratelimited("iommu_alloc failed, iommu %p paddr %lx npages %lx\n",
+					   tbl, paddr, npages);
 			goto iommu_map_failed;
 		}
 
-		iommu_batch_new_entry(entry);
+		iommu_batch_new_entry(entry, mask);
 
 		/* Convert entry to a dma_addr_t */
-		dma_addr = iommu->tbl.table_map_base + (entry << IO_PAGE_SHIFT);
+		dma_addr = tbl->table_map_base + (entry << IO_PAGE_SHIFT);
 		dma_addr |= (s->offset & ~IO_PAGE_MASK);
 
 		/* Insert into HW table */
 		paddr &= IO_PAGE_MASK;
 		while (npages--) {
-			err = iommu_batch_add(paddr);
+			err = iommu_batch_add(paddr, mask);
 			if (unlikely(err < 0L))
 				goto iommu_map_failed;
 			paddr += IO_PAGE_SIZE;
@@ -452,7 +580,7 @@
 		dma_next = dma_addr + slen;
 	}
 
-	err = iommu_batch_end();
+	err = iommu_batch_end(mask);
 
 	if (unlikely(err < 0L))
 		goto iommu_map_failed;
@@ -475,7 +603,7 @@
 			vaddr = s->dma_address & IO_PAGE_MASK;
 			npages = iommu_num_pages(s->dma_address, s->dma_length,
 						 IO_PAGE_SIZE);
-			iommu_tbl_range_free(&iommu->tbl, vaddr, npages,
+			iommu_tbl_range_free(tbl, vaddr, npages,
 					     IOMMU_ERROR_CODE);
 			/* XXX demap? XXX */
 			s->dma_address = DMA_ERROR_CODE;
@@ -496,13 +624,16 @@
 	struct pci_pbm_info *pbm;
 	struct scatterlist *sg;
 	struct iommu *iommu;
+	struct atu *atu;
 	unsigned long flags, entry;
+	unsigned long iotsb_num;
 	u32 devhandle;
 
 	BUG_ON(direction == DMA_NONE);
 
 	iommu = dev->archdata.iommu;
 	pbm = dev->archdata.host_controller;
+	atu = iommu->atu;
 	devhandle = pbm->devhandle;
 	
 	local_irq_save(flags);
@@ -512,15 +643,24 @@
 		dma_addr_t dma_handle = sg->dma_address;
 		unsigned int len = sg->dma_length;
 		unsigned long npages;
-		struct iommu_map_table *tbl = &iommu->tbl;
+		struct iommu_map_table *tbl;
 		unsigned long shift = IO_PAGE_SHIFT;
 
 		if (!len)
 			break;
 		npages = iommu_num_pages(dma_handle, len, IO_PAGE_SIZE);
+
+		if (dma_handle <= DMA_BIT_MASK(32)) {
+			iotsb_num = 0; /* we don't care for legacy iommu */
+			tbl = &iommu->tbl;
+		} else {
+			iotsb_num = atu->iotsb->iotsb_num;
+			tbl = &atu->tbl;
+		}
 		entry = ((dma_handle - tbl->table_map_base) >> shift);
-		dma_4v_iommu_demap(&devhandle, entry, npages);
-		iommu_tbl_range_free(&iommu->tbl, dma_handle, npages,
+		dma_4v_iommu_demap(dev, devhandle, dma_handle, iotsb_num,
+				   entry, npages);
+		iommu_tbl_range_free(tbl, dma_handle, npages,
 				     IOMMU_ERROR_CODE);
 		sg = sg_next(sg);
 	}
@@ -581,6 +721,132 @@
 	return cnt;
 }
 
+static int pci_sun4v_atu_alloc_iotsb(struct pci_pbm_info *pbm)
+{
+	struct atu *atu = pbm->iommu->atu;
+	struct atu_iotsb *iotsb;
+	void *table;
+	u64 table_size;
+	u64 iotsb_num;
+	unsigned long order;
+	unsigned long err;
+
+	iotsb = kzalloc(sizeof(*iotsb), GFP_KERNEL);
+	if (!iotsb) {
+		err = -ENOMEM;
+		goto out_err;
+	}
+	atu->iotsb = iotsb;
+
+	/* calculate size of IOTSB */
+	table_size = (atu->size / IO_PAGE_SIZE) * 8;
+	order = get_order(table_size);
+	table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!table) {
+		err = -ENOMEM;
+		goto table_failed;
+	}
+	iotsb->table = table;
+	iotsb->ra = __pa(table);
+	iotsb->dvma_size = atu->size;
+	iotsb->dvma_base = atu->base;
+	iotsb->table_size = table_size;
+	iotsb->page_size = IO_PAGE_SIZE;
+
+	/* configure and register IOTSB with HV */
+	err = pci_sun4v_iotsb_conf(pbm->devhandle,
+				   iotsb->ra,
+				   iotsb->table_size,
+				   iotsb->page_size,
+				   iotsb->dvma_base,
+				   &iotsb_num);
+	if (err) {
+		pr_err(PFX "pci_iotsb_conf failed error: %ld\n", err);
+		goto iotsb_conf_failed;
+	}
+	iotsb->iotsb_num = iotsb_num;
+
+	err = dma_4v_iotsb_bind(pbm->devhandle, iotsb_num, pbm->pci_bus);
+	if (err) {
+		pr_err(PFX "pci_iotsb_bind failed error: %ld\n", err);
+		goto iotsb_conf_failed;
+	}
+
+	return 0;
+
+iotsb_conf_failed:
+	free_pages((unsigned long)table, order);
+table_failed:
+	kfree(iotsb);
+out_err:
+	return err;
+}
+
+static int pci_sun4v_atu_init(struct pci_pbm_info *pbm)
+{
+	struct atu *atu = pbm->iommu->atu;
+	unsigned long err;
+	const u64 *ranges;
+	u64 map_size, num_iotte;
+	u64 dma_mask;
+	const u32 *page_size;
+	int len;
+
+	ranges = of_get_property(pbm->op->dev.of_node, "iommu-address-ranges",
+				 &len);
+	if (!ranges) {
+		pr_err(PFX "No iommu-address-ranges\n");
+		return -EINVAL;
+	}
+
+	page_size = of_get_property(pbm->op->dev.of_node, "iommu-pagesizes",
+				    NULL);
+	if (!page_size) {
+		pr_err(PFX "No iommu-pagesizes\n");
+		return -EINVAL;
+	}
+
+	/* There are 4 iommu-address-ranges supported. Each range is pair of
+	 * {base, size}. The ranges[0] and ranges[1] are 32bit address space
+	 * while ranges[2] and ranges[3] are 64bit space.  We want to use 64bit
+	 * address ranges to support 64bit addressing. Because 'size' for
+	 * address ranges[2] and ranges[3] are same we can select either of
+	 * ranges[2] or ranges[3] for mapping. However due to 'size' is too
+	 * large for OS to allocate IOTSB we are using fix size 32G
+	 * (ATU_64_SPACE_SIZE) which is more than enough for all PCIe devices
+	 * to share.
+	 */
+	atu->ranges = (struct atu_ranges *)ranges;
+	atu->base = atu->ranges[3].base;
+	atu->size = ATU_64_SPACE_SIZE;
+
+	/* Create IOTSB */
+	err = pci_sun4v_atu_alloc_iotsb(pbm);
+	if (err) {
+		pr_err(PFX "Error creating ATU IOTSB\n");
+		return err;
+	}
+
+	/* Create ATU iommu map.
+	 * One bit represents one iotte in IOTSB table.
+	 */
+	dma_mask = (roundup_pow_of_two(atu->size) - 1UL);
+	num_iotte = atu->size / IO_PAGE_SIZE;
+	map_size = num_iotte / 8;
+	atu->tbl.table_map_base = atu->base;
+	atu->dma_addr_mask = dma_mask;
+	atu->tbl.map = kzalloc(map_size, GFP_KERNEL);
+	if (!atu->tbl.map)
+		return -ENOMEM;
+
+	iommu_tbl_pool_init(&atu->tbl, num_iotte, IO_PAGE_SHIFT,
+			    NULL, false /* no large_pool */,
+			    0 /* default npools */,
+			    false /* want span boundary checking */);
+
+	return 0;
+}
+
 static int pci_sun4v_iommu_init(struct pci_pbm_info *pbm)
 {
 	static const u32 vdma_default[] = { 0x80000000, 0x80000000 };
@@ -918,6 +1184,18 @@
 
 	pci_sun4v_scan_bus(pbm, &op->dev);
 
+	/* if atu_init fails its not complete failure.
+	 * we can still continue using legacy iommu.
+	 */
+	if (pbm->iommu->atu) {
+		err = pci_sun4v_atu_init(pbm);
+		if (err) {
+			kfree(pbm->iommu->atu);
+			pbm->iommu->atu = NULL;
+			pr_err(PFX "ATU init failed, err=%d\n", err);
+		}
+	}
+
 	pbm->next = pci_pbm_root;
 	pci_pbm_root = pbm;
 
@@ -931,8 +1209,10 @@
 	struct pci_pbm_info *pbm;
 	struct device_node *dp;
 	struct iommu *iommu;
+	struct atu *atu;
 	u32 devhandle;
 	int i, err = -ENODEV;
+	static bool hv_atu = true;
 
 	dp = op->dev.of_node;
 
@@ -954,6 +1234,19 @@
 		pr_info(PFX "Registered hvapi major[%lu] minor[%lu]\n",
 			vpci_major, vpci_minor);
 
+		err = sun4v_hvapi_register(HV_GRP_ATU, vatu_major, &vatu_minor);
+		if (err) {
+			/* don't return an error if we fail to register the
+			 * ATU group, but ATU hcalls won't be available.
+			 */
+			hv_atu = false;
+			pr_err(PFX "Could not register hvapi ATU err=%d\n",
+			       err);
+		} else {
+			pr_info(PFX "Registered hvapi ATU major[%lu] minor[%lu]\n",
+				vatu_major, vatu_minor);
+		}
+
 		dma_ops = &sun4v_dma_ops;
 	}
 
@@ -991,6 +1284,14 @@
 	}
 
 	pbm->iommu = iommu;
+	iommu->atu = NULL;
+	if (hv_atu) {
+		atu = kzalloc(sizeof(*atu), GFP_KERNEL);
+		if (!atu)
+			pr_err(PFX "Could not allocate atu\n");
+		else
+			iommu->atu = atu;
+	}
 
 	err = pci_sun4v_pbm_init(pbm, op, devhandle);
 	if (err)
@@ -1001,6 +1302,7 @@
 	return 0;
 
 out_free_iommu:
+	kfree(iommu->atu);
 	kfree(pbm->iommu);
 
 out_free_controller:
diff --git a/arch/sparc/kernel/pci_sun4v.h b/arch/sparc/kernel/pci_sun4v.h
index 5642212..22603a4 100644
--- a/arch/sparc/kernel/pci_sun4v.h
+++ b/arch/sparc/kernel/pci_sun4v.h
@@ -89,4 +89,25 @@
 				     unsigned long msinum,
 				     unsigned long valid);
 
+/* Sun4v HV IOMMU v2 APIs */
+unsigned long pci_sun4v_iotsb_conf(unsigned long devhandle,
+				   unsigned long ra,
+				   unsigned long table_size,
+				   unsigned long page_size,
+				   unsigned long dvma_base,
+				   u64 *iotsb_num);
+unsigned long pci_sun4v_iotsb_bind(unsigned long devhandle,
+				   unsigned long iotsb_num,
+				   unsigned int pci_device);
+unsigned long pci_sun4v_iotsb_map(unsigned long devhandle,
+				  unsigned long iotsb_num,
+				  unsigned long iotsb_index_iottes,
+				  unsigned long io_attributes,
+				  unsigned long io_page_list_pa,
+				  long *mapped);
+unsigned long pci_sun4v_iotsb_demap(unsigned long devhandle,
+				    unsigned long iotsb_num,
+				    unsigned long iotsb_index,
+				    unsigned long iottes,
+				    unsigned long *demapped);
 #endif /* !(_PCI_SUN4V_H) */
diff --git a/arch/sparc/kernel/pci_sun4v_asm.S b/arch/sparc/kernel/pci_sun4v_asm.S
index e606d46..578f096 100644
--- a/arch/sparc/kernel/pci_sun4v_asm.S
+++ b/arch/sparc/kernel/pci_sun4v_asm.S
@@ -360,3 +360,71 @@
 	 mov	%o0, %o0
 ENDPROC(pci_sun4v_msg_setvalid)
 
+	/*
+	 * %o0:	devhandle
+	 * %o1:	r_addr
+	 * %o2:	size
+	 * %o3:	pagesize
+	 * %o4:	virt
+	 * %o5: &iotsb_num/&iotsb_handle
+	 *
+	 * returns %o0:	status
+	 *         %o1:	iotsb_num/iotsb_handle
+	 */
+ENTRY(pci_sun4v_iotsb_conf)
+	mov	%o5, %g1
+	mov	HV_FAST_PCI_IOTSB_CONF, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 stx	%o1, [%g1]
+ENDPROC(pci_sun4v_iotsb_conf)
+
+	/*
+	 * %o0:	devhandle
+	 * %o1:	iotsb_num/iotsb_handle
+	 * %o2:	pci_device
+	 *
+	 * returns %o0:	status
+	 */
+ENTRY(pci_sun4v_iotsb_bind)
+	mov	HV_FAST_PCI_IOTSB_BIND, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+ENDPROC(pci_sun4v_iotsb_bind)
+
+	/*
+	 * %o0:	devhandle
+	 * %o1:	iotsb_num/iotsb_handle
+	 * %o2:	index_count
+	 * %o3:	iotte_attributes
+	 * %o4:	io_page_list_p
+	 * %o5: &mapped
+	 *
+	 * returns %o0:	status
+	 *         %o1:	#mapped
+	 */
+ENTRY(pci_sun4v_iotsb_map)
+	mov	%o5, %g1
+	mov	HV_FAST_PCI_IOTSB_MAP, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 stx	%o1, [%g1]
+ENDPROC(pci_sun4v_iotsb_map)
+
+	/*
+	 * %o0:	devhandle
+	 * %o1:	iotsb_num/iotsb_handle
+	 * %o2:	iotsb_index
+	 * %o3:	#iottes
+	 * %o4: &demapped
+	 *
+	 * returns %o0:	status
+	 *         %o1:	#demapped
+	 */
+ENTRY(pci_sun4v_iotsb_demap)
+	mov	HV_FAST_PCI_IOTSB_DEMAP, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 stx	%o1, [%o4]
+ENDPROC(pci_sun4v_iotsb_demap)
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index 9ddc492..ac082dd 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -127,7 +127,8 @@
 		if (copy_from_user(kbuf, (void __user *) uaddr, len))
 			return -EFAULT;
 	} else {
-		int len2 = access_process_vm(target, uaddr, kbuf, len, 0);
+		int len2 = access_process_vm(target, uaddr, kbuf, len,
+				FOLL_FORCE);
 		if (len2 != len)
 			return -EFAULT;
 	}
@@ -141,7 +142,8 @@
 		if (copy_to_user((void __user *) uaddr, kbuf, len))
 			return -EFAULT;
 	} else {
-		int len2 = access_process_vm(target, uaddr, kbuf, len, 1);
+		int len2 = access_process_vm(target, uaddr, kbuf, len,
+				FOLL_FORCE | FOLL_WRITE);
 		if (len2 != len)
 			return -EFAULT;
 	}
@@ -505,7 +507,8 @@
 				if (access_process_vm(target,
 						      (unsigned long)
 						      &reg_window[pos],
-						      k, sizeof(*k), 0)
+						      k, sizeof(*k),
+						      FOLL_FORCE)
 				    != sizeof(*k))
 					return -EFAULT;
 				k++;
@@ -531,12 +534,14 @@
 				if (access_process_vm(target,
 						      (unsigned long)
 						      &reg_window[pos],
-						      &reg, sizeof(reg), 0)
+						      &reg, sizeof(reg),
+						      FOLL_FORCE)
 				    != sizeof(reg))
 					return -EFAULT;
 				if (access_process_vm(target,
 						      (unsigned long) u,
-						      &reg, sizeof(reg), 1)
+						      &reg, sizeof(reg),
+						      FOLL_FORCE | FOLL_WRITE)
 				    != sizeof(reg))
 					return -EFAULT;
 				pos++;
@@ -615,7 +620,8 @@
 						      (unsigned long)
 						      &reg_window[pos],
 						      (void *) k,
-						      sizeof(*k), 1)
+						      sizeof(*k),
+						      FOLL_FORCE | FOLL_WRITE)
 				    != sizeof(*k))
 					return -EFAULT;
 				k++;
@@ -642,13 +648,15 @@
 				if (access_process_vm(target,
 						      (unsigned long)
 						      u,
-						      &reg, sizeof(reg), 0)
+						      &reg, sizeof(reg),
+						      FOLL_FORCE)
 				    != sizeof(reg))
 					return -EFAULT;
 				if (access_process_vm(target,
 						      (unsigned long)
 						      &reg_window[pos],
-						      &reg, sizeof(reg), 1)
+						      &reg, sizeof(reg),
+						      FOLL_FORCE | FOLL_WRITE)
 				    != sizeof(reg))
 					return -EFAULT;
 				pos++;
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index c3c12ef..9c0c8fd 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -89,7 +89,7 @@
 	sf = (struct signal_frame __user *) regs->u_regs[UREG_FP];
 
 	/* 1. Make sure we are not getting garbage from the user */
-	if (!invalid_frame_pointer(sf, sizeof(*sf)))
+	if (invalid_frame_pointer(sf, sizeof(*sf)))
 		goto segv_and_exit;
 
 	if (get_user(ufp, &sf->info.si_regs.u_regs[UREG_FP]))
@@ -150,7 +150,7 @@
 
 	synchronize_user_stack();
 	sf = (struct rt_signal_frame __user *) regs->u_regs[UREG_FP];
-	if (!invalid_frame_pointer(sf, sizeof(*sf)))
+	if (invalid_frame_pointer(sf, sizeof(*sf)))
 		goto segv;
 
 	if (get_user(ufp, &sf->regs.u_regs[UREG_FP]))
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index d3035ba..8182f7c 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -63,9 +63,13 @@
 cpumask_t cpu_core_sib_map[NR_CPUS] __read_mostly = {
 	[0 ... NR_CPUS-1] = CPU_MASK_NONE };
 
+cpumask_t cpu_core_sib_cache_map[NR_CPUS] __read_mostly = {
+	[0 ... NR_CPUS - 1] = CPU_MASK_NONE };
+
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 EXPORT_SYMBOL(cpu_core_map);
 EXPORT_SYMBOL(cpu_core_sib_map);
+EXPORT_SYMBOL(cpu_core_sib_cache_map);
 
 static cpumask_t smp_commenced_mask;
 
@@ -1265,6 +1269,10 @@
 		unsigned int j;
 
 		for_each_present_cpu(j)  {
+			if (cpu_data(i).max_cache_id ==
+			    cpu_data(j).max_cache_id)
+				cpumask_set_cpu(j, &cpu_core_sib_cache_map[i]);
+
 			if (cpu_data(i).sock_id == cpu_data(j).sock_id)
 				cpumask_set_cpu(j, &cpu_core_sib_map[i]);
 		}
diff --git a/arch/sparc/lib/GENcopy_from_user.S b/arch/sparc/lib/GENcopy_from_user.S
index b7d0bd6..69a439f 100644
--- a/arch/sparc/lib/GENcopy_from_user.S
+++ b/arch/sparc/lib/GENcopy_from_user.S
@@ -3,11 +3,11 @@
  * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
  */
 
-#define EX_LD(x)		\
+#define EX_LD(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one;	\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
diff --git a/arch/sparc/lib/GENcopy_to_user.S b/arch/sparc/lib/GENcopy_to_user.S
index 780550e..9947427 100644
--- a/arch/sparc/lib/GENcopy_to_user.S
+++ b/arch/sparc/lib/GENcopy_to_user.S
@@ -3,11 +3,11 @@
  * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
  */
 
-#define EX_ST(x)		\
+#define EX_ST(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one;	\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
diff --git a/arch/sparc/lib/GENmemcpy.S b/arch/sparc/lib/GENmemcpy.S
index 89358ee..059ea24 100644
--- a/arch/sparc/lib/GENmemcpy.S
+++ b/arch/sparc/lib/GENmemcpy.S
@@ -4,21 +4,18 @@
  */
 
 #ifdef __KERNEL__
+#include <linux/linkage.h>
 #define GLOBAL_SPARE	%g7
 #else
 #define GLOBAL_SPARE	%g5
 #endif
 
 #ifndef EX_LD
-#define EX_LD(x)	x
+#define EX_LD(x,y)	x
 #endif
 
 #ifndef EX_ST
-#define EX_ST(x)	x
-#endif
-
-#ifndef EX_RETVAL
-#define EX_RETVAL(x)	x
+#define EX_ST(x,y)	x
 #endif
 
 #ifndef LOAD
@@ -45,6 +42,29 @@
 	.register	%g3,#scratch
 
 	.text
+
+#ifndef EX_RETVAL
+#define EX_RETVAL(x)	x
+ENTRY(GEN_retl_o4_1)
+	add	%o4, %o2, %o4
+	retl
+	 add	%o4, 1, %o0
+ENDPROC(GEN_retl_o4_1)
+ENTRY(GEN_retl_g1_8)
+	add	%g1, %o2, %g1
+	retl
+	 add	%g1, 8, %o0
+ENDPROC(GEN_retl_g1_8)
+ENTRY(GEN_retl_o2_4)
+	retl
+	 add	%o2, 4, %o0
+ENDPROC(GEN_retl_o2_4)
+ENTRY(GEN_retl_o2_1)
+	retl
+	 add	%o2, 1, %o0
+ENDPROC(GEN_retl_o2_1)
+#endif
+
 	.align		64
 
 	.globl	FUNC_NAME
@@ -73,8 +93,8 @@
 	sub		%g0, %o4, %o4
 	sub		%o2, %o4, %o2
 1:	subcc		%o4, 1, %o4
-	EX_LD(LOAD(ldub, %o1, %g1))
-	EX_ST(STORE(stb, %g1, %o0))
+	EX_LD(LOAD(ldub, %o1, %g1),GEN_retl_o4_1)
+	EX_ST(STORE(stb, %g1, %o0),GEN_retl_o4_1)
 	add		%o1, 1, %o1
 	bne,pt		%XCC, 1b
 	add		%o0, 1, %o0
@@ -82,8 +102,8 @@
 	andn		%o2, 0x7, %g1
 	sub		%o2, %g1, %o2
 1:	subcc		%g1, 0x8, %g1
-	EX_LD(LOAD(ldx, %o1, %g2))
-	EX_ST(STORE(stx, %g2, %o0))
+	EX_LD(LOAD(ldx, %o1, %g2),GEN_retl_g1_8)
+	EX_ST(STORE(stx, %g2, %o0),GEN_retl_g1_8)
 	add		%o1, 0x8, %o1
 	bne,pt		%XCC, 1b
 	 add		%o0, 0x8, %o0
@@ -100,8 +120,8 @@
 
 1:
 	subcc		%o2, 4, %o2
-	EX_LD(LOAD(lduw, %o1, %g1))
-	EX_ST(STORE(stw, %g1, %o1 + %o3))
+	EX_LD(LOAD(lduw, %o1, %g1),GEN_retl_o2_4)
+	EX_ST(STORE(stw, %g1, %o1 + %o3),GEN_retl_o2_4)
 	bgu,pt		%XCC, 1b
 	 add		%o1, 4, %o1
 
@@ -111,8 +131,8 @@
 	.align		32
 90:
 	subcc		%o2, 1, %o2
-	EX_LD(LOAD(ldub, %o1, %g1))
-	EX_ST(STORE(stb, %g1, %o1 + %o3))
+	EX_LD(LOAD(ldub, %o1, %g1),GEN_retl_o2_1)
+	EX_ST(STORE(stb, %g1, %o1 + %o3),GEN_retl_o2_1)
 	bgu,pt		%XCC, 90b
 	 add		%o1, 1, %o1
 	retl
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile
index 885f00e..69912d2 100644
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -38,7 +38,7 @@
 lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o
 lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o
 
-lib-$(CONFIG_SPARC64) += copy_in_user.o user_fixup.o memmove.o
+lib-$(CONFIG_SPARC64) += copy_in_user.o memmove.o
 lib-$(CONFIG_SPARC64) += mcount.o ipcsum.o xor.o hweight.o ffs.o
 
 obj-$(CONFIG_SPARC64) += iomap.o
diff --git a/arch/sparc/lib/NG2copy_from_user.S b/arch/sparc/lib/NG2copy_from_user.S
index d5242b8..b79a699 100644
--- a/arch/sparc/lib/NG2copy_from_user.S
+++ b/arch/sparc/lib/NG2copy_from_user.S
@@ -3,19 +3,19 @@
  * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
  */
 
-#define EX_LD(x)		\
+#define EX_LD(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one_asi;\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
-#define EX_LD_FP(x)		\
+#define EX_LD_FP(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one_asi_fp;\
+	.word 98b, y##_fp;	\
 	.text;			\
 	.align 4;
 
diff --git a/arch/sparc/lib/NG2copy_to_user.S b/arch/sparc/lib/NG2copy_to_user.S
index 4e962d9..dcec55f 100644
--- a/arch/sparc/lib/NG2copy_to_user.S
+++ b/arch/sparc/lib/NG2copy_to_user.S
@@ -3,19 +3,19 @@
  * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
  */
 
-#define EX_ST(x)		\
+#define EX_ST(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one_asi;\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
-#define EX_ST_FP(x)		\
+#define EX_ST_FP(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one_asi_fp;\
+	.word 98b, y##_fp;	\
 	.text;			\
 	.align 4;
 
diff --git a/arch/sparc/lib/NG2memcpy.S b/arch/sparc/lib/NG2memcpy.S
index d5f585d..c629dbd 100644
--- a/arch/sparc/lib/NG2memcpy.S
+++ b/arch/sparc/lib/NG2memcpy.S
@@ -4,6 +4,7 @@
  */
 
 #ifdef __KERNEL__
+#include <linux/linkage.h>
 #include <asm/visasm.h>
 #include <asm/asi.h>
 #define GLOBAL_SPARE	%g7
@@ -32,21 +33,17 @@
 #endif
 
 #ifndef EX_LD
-#define EX_LD(x)	x
+#define EX_LD(x,y)	x
 #endif
 #ifndef EX_LD_FP
-#define EX_LD_FP(x)	x
+#define EX_LD_FP(x,y)	x
 #endif
 
 #ifndef EX_ST
-#define EX_ST(x)	x
+#define EX_ST(x,y)	x
 #endif
 #ifndef EX_ST_FP
-#define EX_ST_FP(x)	x
-#endif
-
-#ifndef EX_RETVAL
-#define EX_RETVAL(x)	x
+#define EX_ST_FP(x,y)	x
 #endif
 
 #ifndef LOAD
@@ -140,45 +137,110 @@
 	fsrc2		%x6, %f12; \
 	fsrc2		%x7, %f14;
 #define FREG_LOAD_1(base, x0) \
-	EX_LD_FP(LOAD(ldd, base + 0x00, %x0))
+	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1)
 #define FREG_LOAD_2(base, x0, x1) \
-	EX_LD_FP(LOAD(ldd, base + 0x00, %x0)); \
-	EX_LD_FP(LOAD(ldd, base + 0x08, %x1));
+	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1);
 #define FREG_LOAD_3(base, x0, x1, x2) \
-	EX_LD_FP(LOAD(ldd, base + 0x00, %x0)); \
-	EX_LD_FP(LOAD(ldd, base + 0x08, %x1)); \
-	EX_LD_FP(LOAD(ldd, base + 0x10, %x2));
+	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1);
 #define FREG_LOAD_4(base, x0, x1, x2, x3) \
-	EX_LD_FP(LOAD(ldd, base + 0x00, %x0)); \
-	EX_LD_FP(LOAD(ldd, base + 0x08, %x1)); \
-	EX_LD_FP(LOAD(ldd, base + 0x10, %x2)); \
-	EX_LD_FP(LOAD(ldd, base + 0x18, %x3));
+	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1);
 #define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
-	EX_LD_FP(LOAD(ldd, base + 0x00, %x0)); \
-	EX_LD_FP(LOAD(ldd, base + 0x08, %x1)); \
-	EX_LD_FP(LOAD(ldd, base + 0x10, %x2)); \
-	EX_LD_FP(LOAD(ldd, base + 0x18, %x3)); \
-	EX_LD_FP(LOAD(ldd, base + 0x20, %x4));
+	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1);
 #define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
-	EX_LD_FP(LOAD(ldd, base + 0x00, %x0)); \
-	EX_LD_FP(LOAD(ldd, base + 0x08, %x1)); \
-	EX_LD_FP(LOAD(ldd, base + 0x10, %x2)); \
-	EX_LD_FP(LOAD(ldd, base + 0x18, %x3)); \
-	EX_LD_FP(LOAD(ldd, base + 0x20, %x4)); \
-	EX_LD_FP(LOAD(ldd, base + 0x28, %x5));
+	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x28, %x5), NG2_retl_o2_plus_g1);
 #define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
-	EX_LD_FP(LOAD(ldd, base + 0x00, %x0)); \
-	EX_LD_FP(LOAD(ldd, base + 0x08, %x1)); \
-	EX_LD_FP(LOAD(ldd, base + 0x10, %x2)); \
-	EX_LD_FP(LOAD(ldd, base + 0x18, %x3)); \
-	EX_LD_FP(LOAD(ldd, base + 0x20, %x4)); \
-	EX_LD_FP(LOAD(ldd, base + 0x28, %x5)); \
-	EX_LD_FP(LOAD(ldd, base + 0x30, %x6));
+	EX_LD_FP(LOAD(ldd, base + 0x00, %x0), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x08, %x1), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x10, %x2), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x18, %x3), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x20, %x4), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x28, %x5), NG2_retl_o2_plus_g1); \
+	EX_LD_FP(LOAD(ldd, base + 0x30, %x6), NG2_retl_o2_plus_g1);
 
 	.register	%g2,#scratch
 	.register	%g3,#scratch
 
 	.text
+#ifndef EX_RETVAL
+#define EX_RETVAL(x)	x
+__restore_fp:
+	VISExitHalf
+__restore_asi:
+	retl
+	 wr	%g0, ASI_AIUS, %asi
+ENTRY(NG2_retl_o2)
+	ba,pt	%xcc, __restore_asi
+	 mov	%o2, %o0
+ENDPROC(NG2_retl_o2)
+ENTRY(NG2_retl_o2_plus_1)
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, 1, %o0
+ENDPROC(NG2_retl_o2_plus_1)
+ENTRY(NG2_retl_o2_plus_4)
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, 4, %o0
+ENDPROC(NG2_retl_o2_plus_4)
+ENTRY(NG2_retl_o2_plus_8)
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, 8, %o0
+ENDPROC(NG2_retl_o2_plus_8)
+ENTRY(NG2_retl_o2_plus_o4_plus_1)
+	add	%o4, 1, %o4
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG2_retl_o2_plus_o4_plus_1)
+ENTRY(NG2_retl_o2_plus_o4_plus_8)
+	add	%o4, 8, %o4
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG2_retl_o2_plus_o4_plus_8)
+ENTRY(NG2_retl_o2_plus_o4_plus_16)
+	add	%o4, 16, %o4
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG2_retl_o2_plus_o4_plus_16)
+ENTRY(NG2_retl_o2_plus_g1_fp)
+	ba,pt	%xcc, __restore_fp
+	 add	%o2, %g1, %o0
+ENDPROC(NG2_retl_o2_plus_g1_fp)
+ENTRY(NG2_retl_o2_plus_g1_plus_64_fp)
+	add	%g1, 64, %g1
+	ba,pt	%xcc, __restore_fp
+	 add	%o2, %g1, %o0
+ENDPROC(NG2_retl_o2_plus_g1_plus_64_fp)
+ENTRY(NG2_retl_o2_plus_g1_plus_1)
+	add	%g1, 1, %g1
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %g1, %o0
+ENDPROC(NG2_retl_o2_plus_g1_plus_1)
+ENTRY(NG2_retl_o2_and_7_plus_o4)
+	and	%o2, 7, %o2
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG2_retl_o2_and_7_plus_o4)
+ENTRY(NG2_retl_o2_and_7_plus_o4_plus_8)
+	and	%o2, 7, %o2
+	add	%o4, 8, %o4
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG2_retl_o2_and_7_plus_o4_plus_8)
+#endif
+
 	.align		64
 
 	.globl	FUNC_NAME
@@ -230,8 +292,8 @@
 	sub		%g0, %o4, %o4	! bytes to align dst
 	sub		%o2, %o4, %o2
 1:	subcc		%o4, 1, %o4
-	EX_LD(LOAD(ldub, %o1, %g1))
-	EX_ST(STORE(stb, %g1, %o0))
+	EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_o4_plus_1)
+	EX_ST(STORE(stb, %g1, %o0), NG2_retl_o2_plus_o4_plus_1)
 	add		%o1, 1, %o1
 	bne,pt		%XCC, 1b
 	add		%o0, 1, %o0
@@ -281,11 +343,11 @@
 	 nop
 	/* fall through for 0 < low bits < 8 */
 110:	sub		%o4, 64, %g2
-	EX_LD_FP(LOAD_BLK(%g2, %f0))
-1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
-	EX_LD_FP(LOAD_BLK(%o4, %f16))
+	EX_LD_FP(LOAD_BLK(%g2, %f0), NG2_retl_o2_plus_g1)
+1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
+	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
 	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
-	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
+	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
 	FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
 	subcc		%g1, 64, %g1
 	add		%o4, 64, %o4
@@ -296,10 +358,10 @@
 
 120:	sub		%o4, 56, %g2
 	FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
-1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
-	EX_LD_FP(LOAD_BLK(%o4, %f16))
+1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
+	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
 	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
-	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
+	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
 	FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
 	subcc		%g1, 64, %g1
 	add		%o4, 64, %o4
@@ -310,10 +372,10 @@
 
 130:	sub		%o4, 48, %g2
 	FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
-1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
-	EX_LD_FP(LOAD_BLK(%o4, %f16))
+1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
+	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
 	FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
-	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
+	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
 	FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
 	subcc		%g1, 64, %g1
 	add		%o4, 64, %o4
@@ -324,10 +386,10 @@
 
 140:	sub		%o4, 40, %g2
 	FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
-1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
-	EX_LD_FP(LOAD_BLK(%o4, %f16))
+1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
+	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
 	FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
-	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
+	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
 	FREG_MOVE_5(f22, f24, f26, f28, f30)
 	subcc		%g1, 64, %g1
 	add		%o4, 64, %o4
@@ -338,10 +400,10 @@
 
 150:	sub		%o4, 32, %g2
 	FREG_LOAD_4(%g2, f0, f2, f4, f6)
-1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
-	EX_LD_FP(LOAD_BLK(%o4, %f16))
+1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
+	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
 	FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
-	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
+	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
 	FREG_MOVE_4(f24, f26, f28, f30)
 	subcc		%g1, 64, %g1
 	add		%o4, 64, %o4
@@ -352,10 +414,10 @@
 
 160:	sub		%o4, 24, %g2
 	FREG_LOAD_3(%g2, f0, f2, f4)
-1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
-	EX_LD_FP(LOAD_BLK(%o4, %f16))
+1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
+	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
 	FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
-	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
+	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
 	FREG_MOVE_3(f26, f28, f30)
 	subcc		%g1, 64, %g1
 	add		%o4, 64, %o4
@@ -366,10 +428,10 @@
 
 170:	sub		%o4, 16, %g2
 	FREG_LOAD_2(%g2, f0, f2)
-1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
-	EX_LD_FP(LOAD_BLK(%o4, %f16))
+1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
+	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
 	FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
-	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
+	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
 	FREG_MOVE_2(f28, f30)
 	subcc		%g1, 64, %g1
 	add		%o4, 64, %o4
@@ -380,10 +442,10 @@
 
 180:	sub		%o4, 8, %g2
 	FREG_LOAD_1(%g2, f0)
-1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
-	EX_LD_FP(LOAD_BLK(%o4, %f16))
+1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
+	EX_LD_FP(LOAD_BLK(%o4, %f16), NG2_retl_o2_plus_g1)
 	FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
-	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
+	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1)
 	FREG_MOVE_1(f30)
 	subcc		%g1, 64, %g1
 	add		%o4, 64, %o4
@@ -393,10 +455,10 @@
 	 nop
 
 190:
-1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
+1:	EX_ST_FP(STORE_INIT(%g0, %o4 + %g3), NG2_retl_o2_plus_g1)
 	subcc		%g1, 64, %g1
-	EX_LD_FP(LOAD_BLK(%o4, %f0))
-	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
+	EX_LD_FP(LOAD_BLK(%o4, %f0), NG2_retl_o2_plus_g1_plus_64)
+	EX_ST_FP(STORE_BLK(%f0, %o4 + %g3), NG2_retl_o2_plus_g1_plus_64)
 	add		%o4, 64, %o4
 	bne,pt		%xcc, 1b
 	 LOAD(prefetch, %o4 + 64, #one_read)
@@ -423,28 +485,28 @@
 	andn		%o2, 0xf, %o4
 	and		%o2, 0xf, %o2
 1:	subcc		%o4, 0x10, %o4
-	EX_LD(LOAD(ldx, %o1, %o5))
+	EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_o4_plus_16)
 	add		%o1, 0x08, %o1
-	EX_LD(LOAD(ldx, %o1, %g1))
+	EX_LD(LOAD(ldx, %o1, %g1), NG2_retl_o2_plus_o4_plus_16)
 	sub		%o1, 0x08, %o1
-	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE))
+	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_16)
 	add		%o1, 0x8, %o1
-	EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE))
+	EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_o4_plus_8)
 	bgu,pt		%XCC, 1b
 	 add		%o1, 0x8, %o1
 73:	andcc		%o2, 0x8, %g0
 	be,pt		%XCC, 1f
 	 nop
 	sub		%o2, 0x8, %o2
-	EX_LD(LOAD(ldx, %o1, %o5))
-	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE))
+	EX_LD(LOAD(ldx, %o1, %o5), NG2_retl_o2_plus_8)
+	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_8)
 	add		%o1, 0x8, %o1
 1:	andcc		%o2, 0x4, %g0
 	be,pt		%XCC, 1f
 	 nop
 	sub		%o2, 0x4, %o2
-	EX_LD(LOAD(lduw, %o1, %o5))
-	EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE))
+	EX_LD(LOAD(lduw, %o1, %o5), NG2_retl_o2_plus_4)
+	EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
 	add		%o1, 0x4, %o1
 1:	cmp		%o2, 0
 	be,pt		%XCC, 85f
@@ -460,8 +522,8 @@
 	sub		%o2, %g1, %o2
 
 1:	subcc		%g1, 1, %g1
-	EX_LD(LOAD(ldub, %o1, %o5))
-	EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE))
+	EX_LD(LOAD(ldub, %o1, %o5), NG2_retl_o2_plus_g1_plus_1)
+	EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_g1_plus_1)
 	bgu,pt		%icc, 1b
 	 add		%o1, 1, %o1
 
@@ -477,16 +539,16 @@
 
 8:	mov		64, GLOBAL_SPARE
 	andn		%o1, 0x7, %o1
-	EX_LD(LOAD(ldx, %o1, %g2))
+	EX_LD(LOAD(ldx, %o1, %g2), NG2_retl_o2)
 	sub		GLOBAL_SPARE, %g1, GLOBAL_SPARE
 	andn		%o2, 0x7, %o4
 	sllx		%g2, %g1, %g2
 1:	add		%o1, 0x8, %o1
-	EX_LD(LOAD(ldx, %o1, %g3))
+	EX_LD(LOAD(ldx, %o1, %g3), NG2_retl_o2_and_7_plus_o4)
 	subcc		%o4, 0x8, %o4
 	srlx		%g3, GLOBAL_SPARE, %o5
 	or		%o5, %g2, %o5
-	EX_ST(STORE(stx, %o5, %o0))
+	EX_ST(STORE(stx, %o5, %o0), NG2_retl_o2_and_7_plus_o4_plus_8)
 	add		%o0, 0x8, %o0
 	bgu,pt		%icc, 1b
 	 sllx		%g3, %g1, %g2
@@ -506,8 +568,8 @@
 
 1:
 	subcc		%o2, 4, %o2
-	EX_LD(LOAD(lduw, %o1, %g1))
-	EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE))
+	EX_LD(LOAD(lduw, %o1, %g1), NG2_retl_o2_plus_4)
+	EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_4)
 	bgu,pt		%XCC, 1b
 	 add		%o1, 4, %o1
 
@@ -517,8 +579,8 @@
 	.align		32
 90:
 	subcc		%o2, 1, %o2
-	EX_LD(LOAD(ldub, %o1, %g1))
-	EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE))
+	EX_LD(LOAD(ldub, %o1, %g1), NG2_retl_o2_plus_1)
+	EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE), NG2_retl_o2_plus_1)
 	bgu,pt		%XCC, 90b
 	 add		%o1, 1, %o1
 	retl
diff --git a/arch/sparc/lib/NG4copy_from_user.S b/arch/sparc/lib/NG4copy_from_user.S
index 2e8ee7a..16a286c 100644
--- a/arch/sparc/lib/NG4copy_from_user.S
+++ b/arch/sparc/lib/NG4copy_from_user.S
@@ -3,19 +3,19 @@
  * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
  */
 
-#define EX_LD(x)		\
+#define EX_LD(x, y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one_asi;\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
-#define EX_LD_FP(x)		\
+#define EX_LD_FP(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one_asi_fp;\
+	.word 98b, y##_fp;	\
 	.text;			\
 	.align 4;
 
diff --git a/arch/sparc/lib/NG4copy_to_user.S b/arch/sparc/lib/NG4copy_to_user.S
index be0bf45..6b0276f 100644
--- a/arch/sparc/lib/NG4copy_to_user.S
+++ b/arch/sparc/lib/NG4copy_to_user.S
@@ -3,19 +3,19 @@
  * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
  */
 
-#define EX_ST(x)		\
+#define EX_ST(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one_asi;\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
-#define EX_ST_FP(x)		\
+#define EX_ST_FP(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one_asi_fp;\
+	.word 98b, y##_fp;	\
 	.text;			\
 	.align 4;
 
diff --git a/arch/sparc/lib/NG4memcpy.S b/arch/sparc/lib/NG4memcpy.S
index 8e13ee1..75bb93b 100644
--- a/arch/sparc/lib/NG4memcpy.S
+++ b/arch/sparc/lib/NG4memcpy.S
@@ -4,6 +4,7 @@
  */
 
 #ifdef __KERNEL__
+#include <linux/linkage.h>
 #include <asm/visasm.h>
 #include <asm/asi.h>
 #define GLOBAL_SPARE	%g7
@@ -46,22 +47,19 @@
 #endif
 
 #ifndef EX_LD
-#define EX_LD(x)	x
+#define EX_LD(x,y)	x
 #endif
 #ifndef EX_LD_FP
-#define EX_LD_FP(x)	x
+#define EX_LD_FP(x,y)	x
 #endif
 
 #ifndef EX_ST
-#define EX_ST(x)	x
+#define EX_ST(x,y)	x
 #endif
 #ifndef EX_ST_FP
-#define EX_ST_FP(x)	x
+#define EX_ST_FP(x,y)	x
 #endif
 
-#ifndef EX_RETVAL
-#define EX_RETVAL(x)	x
-#endif
 
 #ifndef LOAD
 #define LOAD(type,addr,dest)	type [addr], dest
@@ -94,6 +92,158 @@
 	.register	%g3,#scratch
 
 	.text
+#ifndef EX_RETVAL
+#define EX_RETVAL(x)	x
+__restore_asi_fp:
+	VISExitHalf
+__restore_asi:
+	retl
+	 wr	%g0, ASI_AIUS, %asi
+
+ENTRY(NG4_retl_o2)
+	ba,pt	%xcc, __restore_asi
+	 mov	%o2, %o0
+ENDPROC(NG4_retl_o2)
+ENTRY(NG4_retl_o2_plus_1)
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, 1, %o0
+ENDPROC(NG4_retl_o2_plus_1)
+ENTRY(NG4_retl_o2_plus_4)
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, 4, %o0
+ENDPROC(NG4_retl_o2_plus_4)
+ENTRY(NG4_retl_o2_plus_o5)
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o5, %o0
+ENDPROC(NG4_retl_o2_plus_o5)
+ENTRY(NG4_retl_o2_plus_o5_plus_4)
+	add	%o5, 4, %o5
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o5, %o0
+ENDPROC(NG4_retl_o2_plus_o5_plus_4)
+ENTRY(NG4_retl_o2_plus_o5_plus_8)
+	add	%o5, 8, %o5
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o5, %o0
+ENDPROC(NG4_retl_o2_plus_o5_plus_8)
+ENTRY(NG4_retl_o2_plus_o5_plus_16)
+	add	%o5, 16, %o5
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o5, %o0
+ENDPROC(NG4_retl_o2_plus_o5_plus_16)
+ENTRY(NG4_retl_o2_plus_o5_plus_24)
+	add	%o5, 24, %o5
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o5, %o0
+ENDPROC(NG4_retl_o2_plus_o5_plus_24)
+ENTRY(NG4_retl_o2_plus_o5_plus_32)
+	add	%o5, 32, %o5
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o5, %o0
+ENDPROC(NG4_retl_o2_plus_o5_plus_32)
+ENTRY(NG4_retl_o2_plus_g1)
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %g1, %o0
+ENDPROC(NG4_retl_o2_plus_g1)
+ENTRY(NG4_retl_o2_plus_g1_plus_1)
+	add	%g1, 1, %g1
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %g1, %o0
+ENDPROC(NG4_retl_o2_plus_g1_plus_1)
+ENTRY(NG4_retl_o2_plus_g1_plus_8)
+	add	%g1, 8, %g1
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %g1, %o0
+ENDPROC(NG4_retl_o2_plus_g1_plus_8)
+ENTRY(NG4_retl_o2_plus_o4)
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4)
+ENTRY(NG4_retl_o2_plus_o4_plus_8)
+	add	%o4, 8, %o4
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_8)
+ENTRY(NG4_retl_o2_plus_o4_plus_16)
+	add	%o4, 16, %o4
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_16)
+ENTRY(NG4_retl_o2_plus_o4_plus_24)
+	add	%o4, 24, %o4
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_24)
+ENTRY(NG4_retl_o2_plus_o4_plus_32)
+	add	%o4, 32, %o4
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_32)
+ENTRY(NG4_retl_o2_plus_o4_plus_40)
+	add	%o4, 40, %o4
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_40)
+ENTRY(NG4_retl_o2_plus_o4_plus_48)
+	add	%o4, 48, %o4
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_48)
+ENTRY(NG4_retl_o2_plus_o4_plus_56)
+	add	%o4, 56, %o4
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_56)
+ENTRY(NG4_retl_o2_plus_o4_plus_64)
+	add	%o4, 64, %o4
+	ba,pt	%xcc, __restore_asi
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_64)
+ENTRY(NG4_retl_o2_plus_o4_fp)
+	ba,pt	%xcc, __restore_asi_fp
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_fp)
+ENTRY(NG4_retl_o2_plus_o4_plus_8_fp)
+	add	%o4, 8, %o4
+	ba,pt	%xcc, __restore_asi_fp
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_8_fp)
+ENTRY(NG4_retl_o2_plus_o4_plus_16_fp)
+	add	%o4, 16, %o4
+	ba,pt	%xcc, __restore_asi_fp
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_16_fp)
+ENTRY(NG4_retl_o2_plus_o4_plus_24_fp)
+	add	%o4, 24, %o4
+	ba,pt	%xcc, __restore_asi_fp
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_24_fp)
+ENTRY(NG4_retl_o2_plus_o4_plus_32_fp)
+	add	%o4, 32, %o4
+	ba,pt	%xcc, __restore_asi_fp
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_32_fp)
+ENTRY(NG4_retl_o2_plus_o4_plus_40_fp)
+	add	%o4, 40, %o4
+	ba,pt	%xcc, __restore_asi_fp
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_40_fp)
+ENTRY(NG4_retl_o2_plus_o4_plus_48_fp)
+	add	%o4, 48, %o4
+	ba,pt	%xcc, __restore_asi_fp
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_48_fp)
+ENTRY(NG4_retl_o2_plus_o4_plus_56_fp)
+	add	%o4, 56, %o4
+	ba,pt	%xcc, __restore_asi_fp
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_56_fp)
+ENTRY(NG4_retl_o2_plus_o4_plus_64_fp)
+	add	%o4, 64, %o4
+	ba,pt	%xcc, __restore_asi_fp
+	 add	%o2, %o4, %o0
+ENDPROC(NG4_retl_o2_plus_o4_plus_64_fp)
+#endif
 	.align		64
 
 	.globl	FUNC_NAME
@@ -124,12 +274,13 @@
 	brz,pt		%g1, 51f
 	 sub		%o2, %g1, %o2
 
-1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
+
+1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1)
 	add		%o1, 1, %o1
 	subcc		%g1, 1, %g1
 	add		%o0, 1, %o0
 	bne,pt		%icc, 1b
-	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
+	 EX_ST(STORE(stb, %g2, %o0 - 0x01), NG4_retl_o2_plus_g1_plus_1)
 
 51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
 	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
@@ -154,43 +305,43 @@
 	brz,pt		%g1, .Llarge_aligned
 	 sub		%o2, %g1, %o2
 
-1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1)
 	add		%o1, 8, %o1
 	subcc		%g1, 8, %g1
 	add		%o0, 8, %o0
 	bne,pt		%icc, 1b
-	 EX_ST(STORE(stx, %g2, %o0 - 0x08))
+	 EX_ST(STORE(stx, %g2, %o0 - 0x08), NG4_retl_o2_plus_g1_plus_8)
 
 .Llarge_aligned:
 	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
 	andn		%o2, 0x3f, %o4
 	sub		%o2, %o4, %o2
 
-1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o4)
 	add		%o1, 0x40, %o1
-	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
+	EX_LD(LOAD(ldx, %o1 - 0x38, %g2), NG4_retl_o2_plus_o4)
 	subcc		%o4, 0x40, %o4
-	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
-	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
-	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
-	EX_ST(STORE_INIT(%g1, %o0))
+	EX_LD(LOAD(ldx, %o1 - 0x30, %g3), NG4_retl_o2_plus_o4_plus_64)
+	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE), NG4_retl_o2_plus_o4_plus_64)
+	EX_LD(LOAD(ldx, %o1 - 0x20, %o5), NG4_retl_o2_plus_o4_plus_64)
+	EX_ST(STORE_INIT(%g1, %o0), NG4_retl_o2_plus_o4_plus_64)
 	add		%o0, 0x08, %o0
-	EX_ST(STORE_INIT(%g2, %o0))
+	EX_ST(STORE_INIT(%g2, %o0), NG4_retl_o2_plus_o4_plus_56)
 	add		%o0, 0x08, %o0
-	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
-	EX_ST(STORE_INIT(%g3, %o0))
+	EX_LD(LOAD(ldx, %o1 - 0x18, %g2), NG4_retl_o2_plus_o4_plus_48)
+	EX_ST(STORE_INIT(%g3, %o0), NG4_retl_o2_plus_o4_plus_48)
 	add		%o0, 0x08, %o0
-	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
-	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
+	EX_LD(LOAD(ldx, %o1 - 0x10, %g3), NG4_retl_o2_plus_o4_plus_40)
+	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), NG4_retl_o2_plus_o4_plus_40)
 	add		%o0, 0x08, %o0
-	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
-	EX_ST(STORE_INIT(%o5, %o0))
+	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE), NG4_retl_o2_plus_o4_plus_32)
+	EX_ST(STORE_INIT(%o5, %o0), NG4_retl_o2_plus_o4_plus_32)
 	add		%o0, 0x08, %o0
-	EX_ST(STORE_INIT(%g2, %o0))
+	EX_ST(STORE_INIT(%g2, %o0), NG4_retl_o2_plus_o4_plus_24)
 	add		%o0, 0x08, %o0
-	EX_ST(STORE_INIT(%g3, %o0))
+	EX_ST(STORE_INIT(%g3, %o0), NG4_retl_o2_plus_o4_plus_16)
 	add		%o0, 0x08, %o0
-	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
+	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), NG4_retl_o2_plus_o4_plus_8)
 	add		%o0, 0x08, %o0
 	bne,pt		%icc, 1b
 	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
@@ -216,17 +367,17 @@
 	sub		%o2, %o4, %o2
 	alignaddr	%o1, %g0, %g1
 	add		%o1, %o4, %o1
-	EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0))
-1:	EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2))
+	EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0), NG4_retl_o2_plus_o4)
+1:	EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2), NG4_retl_o2_plus_o4)
 	subcc		%o4, 0x40, %o4
-	EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4))
-	EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6))
-	EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8))
-	EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10))
-	EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12))
-	EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14))
+	EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4), NG4_retl_o2_plus_o4_plus_64)
+	EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6), NG4_retl_o2_plus_o4_plus_64)
+	EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8), NG4_retl_o2_plus_o4_plus_64)
+	EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10), NG4_retl_o2_plus_o4_plus_64)
+	EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12), NG4_retl_o2_plus_o4_plus_64)
+	EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14), NG4_retl_o2_plus_o4_plus_64)
 	faligndata	%f0, %f2, %f16
-	EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0))
+	EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0), NG4_retl_o2_plus_o4_plus_64)
 	faligndata	%f2, %f4, %f18
 	add		%g1, 0x40, %g1
 	faligndata	%f4, %f6, %f20
@@ -235,14 +386,14 @@
 	faligndata	%f10, %f12, %f26
 	faligndata	%f12, %f14, %f28
 	faligndata	%f14, %f0, %f30
-	EX_ST_FP(STORE(std, %f16, %o0 + 0x00))
-	EX_ST_FP(STORE(std, %f18, %o0 + 0x08))
-	EX_ST_FP(STORE(std, %f20, %o0 + 0x10))
-	EX_ST_FP(STORE(std, %f22, %o0 + 0x18))
-	EX_ST_FP(STORE(std, %f24, %o0 + 0x20))
-	EX_ST_FP(STORE(std, %f26, %o0 + 0x28))
-	EX_ST_FP(STORE(std, %f28, %o0 + 0x30))
-	EX_ST_FP(STORE(std, %f30, %o0 + 0x38))
+	EX_ST_FP(STORE(std, %f16, %o0 + 0x00), NG4_retl_o2_plus_o4_plus_64)
+	EX_ST_FP(STORE(std, %f18, %o0 + 0x08), NG4_retl_o2_plus_o4_plus_56)
+	EX_ST_FP(STORE(std, %f20, %o0 + 0x10), NG4_retl_o2_plus_o4_plus_48)
+	EX_ST_FP(STORE(std, %f22, %o0 + 0x18), NG4_retl_o2_plus_o4_plus_40)
+	EX_ST_FP(STORE(std, %f24, %o0 + 0x20), NG4_retl_o2_plus_o4_plus_32)
+	EX_ST_FP(STORE(std, %f26, %o0 + 0x28), NG4_retl_o2_plus_o4_plus_24)
+	EX_ST_FP(STORE(std, %f28, %o0 + 0x30), NG4_retl_o2_plus_o4_plus_16)
+	EX_ST_FP(STORE(std, %f30, %o0 + 0x38), NG4_retl_o2_plus_o4_plus_8)
 	add		%o0, 0x40, %o0
 	bne,pt		%icc, 1b
 	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
@@ -270,37 +421,38 @@
 	andncc		%o2, 0x20 - 1, %o5
 	be,pn		%icc, 2f
 	 sub		%o2, %o5, %o2
-1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
-	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
-	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
-	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5)
+	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), NG4_retl_o2_plus_o5)
+	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE), NG4_retl_o2_plus_o5)
+	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), NG4_retl_o2_plus_o5)
 	add		%o1, 0x20, %o1
 	subcc		%o5, 0x20, %o5
-	EX_ST(STORE(stx, %g1, %o0 + 0x00))
-	EX_ST(STORE(stx, %g2, %o0 + 0x08))
-	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
-	EX_ST(STORE(stx, %o4, %o0 + 0x18))
+	EX_ST(STORE(stx, %g1, %o0 + 0x00), NG4_retl_o2_plus_o5_plus_32)
+	EX_ST(STORE(stx, %g2, %o0 + 0x08), NG4_retl_o2_plus_o5_plus_24)
+	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10), NG4_retl_o2_plus_o5_plus_24)
+	EX_ST(STORE(stx, %o4, %o0 + 0x18), NG4_retl_o2_plus_o5_plus_8)
 	bne,pt		%icc, 1b
 	 add		%o0, 0x20, %o0
 2:	andcc		%o2, 0x18, %o5
 	be,pt		%icc, 3f
 	 sub		%o2, %o5, %o2
-1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5)
 	add		%o1, 0x08, %o1
 	add		%o0, 0x08, %o0
 	subcc		%o5, 0x08, %o5
 	bne,pt		%icc, 1b
-	 EX_ST(STORE(stx, %g1, %o0 - 0x08))
+	 EX_ST(STORE(stx, %g1, %o0 - 0x08), NG4_retl_o2_plus_o5_plus_8)
 3:	brz,pt		%o2, .Lexit
 	 cmp		%o2, 0x04
 	bl,pn		%icc, .Ltiny
 	 nop
-	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
+	EX_LD(LOAD(lduw, %o1 + 0x00, %g1), NG4_retl_o2)
 	add		%o1, 0x04, %o1
 	add		%o0, 0x04, %o0
 	subcc		%o2, 0x04, %o2
 	bne,pn		%icc, .Ltiny
-	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
+	 EX_ST(STORE(stw, %g1, %o0 - 0x04), NG4_retl_o2_plus_4)
 	ba,a,pt		%icc, .Lexit
 .Lmedium_unaligned:
 	/* First get dest 8 byte aligned.  */
@@ -309,12 +461,12 @@
 	brz,pt		%g1, 2f
 	 sub		%o2, %g1, %o2
 
-1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
+1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), NG4_retl_o2_plus_g1)
 	add		%o1, 1, %o1
 	subcc		%g1, 1, %g1
 	add		%o0, 1, %o0
 	bne,pt		%icc, 1b
-	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
+	 EX_ST(STORE(stb, %g2, %o0 - 0x01), NG4_retl_o2_plus_g1_plus_1)
 2:
 	and		%o1, 0x7, %g1
 	brz,pn		%g1, .Lmedium_noprefetch
@@ -322,16 +474,16 @@
 	mov		64, %g2
 	sub		%g2, %g1, %g2
 	andn		%o1, 0x7, %o1
-	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
+	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), NG4_retl_o2)
 	sllx		%o4, %g1, %o4
 	andn		%o2, 0x08 - 1, %o5
 	sub		%o2, %o5, %o2
-1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
+1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), NG4_retl_o2_plus_o5)
 	add		%o1, 0x08, %o1
 	subcc		%o5, 0x08, %o5
 	srlx		%g3, %g2, GLOBAL_SPARE
 	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
-	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
+	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00), NG4_retl_o2_plus_o5_plus_8)
 	add		%o0, 0x08, %o0
 	bne,pt		%icc, 1b
 	 sllx		%g3, %g1, %o4
@@ -342,17 +494,17 @@
 	ba,pt		%icc, .Lsmall_unaligned
 
 .Ltiny:
-	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
+	EX_LD(LOAD(ldub, %o1 + 0x00, %g1), NG4_retl_o2)
 	subcc		%o2, 1, %o2
 	be,pn		%icc, .Lexit
-	 EX_ST(STORE(stb, %g1, %o0 + 0x00))
-	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
+	 EX_ST(STORE(stb, %g1, %o0 + 0x00), NG4_retl_o2_plus_1)
+	EX_LD(LOAD(ldub, %o1 + 0x01, %g1), NG4_retl_o2)
 	subcc		%o2, 1, %o2
 	be,pn		%icc, .Lexit
-	 EX_ST(STORE(stb, %g1, %o0 + 0x01))
-	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
+	 EX_ST(STORE(stb, %g1, %o0 + 0x01), NG4_retl_o2_plus_1)
+	EX_LD(LOAD(ldub, %o1 + 0x02, %g1), NG4_retl_o2)
 	ba,pt		%icc, .Lexit
-	 EX_ST(STORE(stb, %g1, %o0 + 0x02))
+	 EX_ST(STORE(stb, %g1, %o0 + 0x02), NG4_retl_o2)
 
 .Lsmall:
 	andcc		%g2, 0x3, %g0
@@ -360,22 +512,22 @@
 	 andn		%o2, 0x4 - 1, %o5
 	sub		%o2, %o5, %o2
 1:
-	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
+	EX_LD(LOAD(lduw, %o1 + 0x00, %g1), NG4_retl_o2_plus_o5)
 	add		%o1, 0x04, %o1
 	subcc		%o5, 0x04, %o5
 	add		%o0, 0x04, %o0
 	bne,pt		%icc, 1b
-	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
+	 EX_ST(STORE(stw, %g1, %o0 - 0x04), NG4_retl_o2_plus_o5_plus_4)
 	brz,pt		%o2, .Lexit
 	 nop
 	ba,a,pt		%icc, .Ltiny
 
 .Lsmall_unaligned:
-1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
+1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1), NG4_retl_o2)
 	add		%o1, 1, %o1
 	add		%o0, 1, %o0
 	subcc		%o2, 1, %o2
 	bne,pt		%icc, 1b
-	 EX_ST(STORE(stb, %g1, %o0 - 0x01))
+	 EX_ST(STORE(stb, %g1, %o0 - 0x01), NG4_retl_o2_plus_1)
 	ba,a,pt		%icc, .Lexit
 	.size		FUNC_NAME, .-FUNC_NAME
diff --git a/arch/sparc/lib/NGcopy_from_user.S b/arch/sparc/lib/NGcopy_from_user.S
index 5d1e4d1..9cd42fc 100644
--- a/arch/sparc/lib/NGcopy_from_user.S
+++ b/arch/sparc/lib/NGcopy_from_user.S
@@ -3,11 +3,11 @@
  * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
  */
 
-#define EX_LD(x)		\
+#define EX_LD(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __ret_one_asi;\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
diff --git a/arch/sparc/lib/NGcopy_to_user.S b/arch/sparc/lib/NGcopy_to_user.S
index ff630dc..5c358af 100644
--- a/arch/sparc/lib/NGcopy_to_user.S
+++ b/arch/sparc/lib/NGcopy_to_user.S
@@ -3,11 +3,11 @@
  * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
  */
 
-#define EX_ST(x)		\
+#define EX_ST(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __ret_one_asi;\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
diff --git a/arch/sparc/lib/NGmemcpy.S b/arch/sparc/lib/NGmemcpy.S
index 96a14ca..d88c4ed 100644
--- a/arch/sparc/lib/NGmemcpy.S
+++ b/arch/sparc/lib/NGmemcpy.S
@@ -4,6 +4,7 @@
  */
 
 #ifdef __KERNEL__
+#include <linux/linkage.h>
 #include <asm/asi.h>
 #include <asm/thread_info.h>
 #define GLOBAL_SPARE	%g7
@@ -27,15 +28,11 @@
 #endif
 
 #ifndef EX_LD
-#define EX_LD(x)	x
+#define EX_LD(x,y)	x
 #endif
 
 #ifndef EX_ST
-#define EX_ST(x)	x
-#endif
-
-#ifndef EX_RETVAL
-#define EX_RETVAL(x)	x
+#define EX_ST(x,y)	x
 #endif
 
 #ifndef LOAD
@@ -79,6 +76,92 @@
 	.register	%g3,#scratch
 
 	.text
+#ifndef EX_RETVAL
+#define EX_RETVAL(x)	x
+__restore_asi:
+	ret
+	wr	%g0, ASI_AIUS, %asi
+	 restore
+ENTRY(NG_ret_i2_plus_i4_plus_1)
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, %i5, %i0
+ENDPROC(NG_ret_i2_plus_i4_plus_1)
+ENTRY(NG_ret_i2_plus_g1)
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, %g1, %i0
+ENDPROC(NG_ret_i2_plus_g1)
+ENTRY(NG_ret_i2_plus_g1_minus_8)
+	sub	%g1, 8, %g1
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, %g1, %i0
+ENDPROC(NG_ret_i2_plus_g1_minus_8)
+ENTRY(NG_ret_i2_plus_g1_minus_16)
+	sub	%g1, 16, %g1
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, %g1, %i0
+ENDPROC(NG_ret_i2_plus_g1_minus_16)
+ENTRY(NG_ret_i2_plus_g1_minus_24)
+	sub	%g1, 24, %g1
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, %g1, %i0
+ENDPROC(NG_ret_i2_plus_g1_minus_24)
+ENTRY(NG_ret_i2_plus_g1_minus_32)
+	sub	%g1, 32, %g1
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, %g1, %i0
+ENDPROC(NG_ret_i2_plus_g1_minus_32)
+ENTRY(NG_ret_i2_plus_g1_minus_40)
+	sub	%g1, 40, %g1
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, %g1, %i0
+ENDPROC(NG_ret_i2_plus_g1_minus_40)
+ENTRY(NG_ret_i2_plus_g1_minus_48)
+	sub	%g1, 48, %g1
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, %g1, %i0
+ENDPROC(NG_ret_i2_plus_g1_minus_48)
+ENTRY(NG_ret_i2_plus_g1_minus_56)
+	sub	%g1, 56, %g1
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, %g1, %i0
+ENDPROC(NG_ret_i2_plus_g1_minus_56)
+ENTRY(NG_ret_i2_plus_i4)
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, %i4, %i0
+ENDPROC(NG_ret_i2_plus_i4)
+ENTRY(NG_ret_i2_plus_i4_minus_8)
+	sub	%i4, 8, %i4
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, %i4, %i0
+ENDPROC(NG_ret_i2_plus_i4_minus_8)
+ENTRY(NG_ret_i2_plus_8)
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, 8, %i0
+ENDPROC(NG_ret_i2_plus_8)
+ENTRY(NG_ret_i2_plus_4)
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, 4, %i0
+ENDPROC(NG_ret_i2_plus_4)
+ENTRY(NG_ret_i2_plus_1)
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, 1, %i0
+ENDPROC(NG_ret_i2_plus_1)
+ENTRY(NG_ret_i2_plus_g1_plus_1)
+	add	%g1, 1, %g1
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, %g1, %i0
+ENDPROC(NG_ret_i2_plus_g1_plus_1)
+ENTRY(NG_ret_i2)
+	ba,pt	%xcc, __restore_asi
+	 mov	%i2, %i0
+ENDPROC(NG_ret_i2)
+ENTRY(NG_ret_i2_and_7_plus_i4)
+	and	%i2, 7, %i2
+	ba,pt	%xcc, __restore_asi
+	 add	%i2, %i4, %i0
+ENDPROC(NG_ret_i2_and_7_plus_i4)
+#endif
+
 	.align		64
 
 	.globl	FUNC_NAME
@@ -126,8 +209,8 @@
 	sub		%g0, %i4, %i4	! bytes to align dst
 	sub		%i2, %i4, %i2
 1:	subcc		%i4, 1, %i4
-	EX_LD(LOAD(ldub, %i1, %g1))
-	EX_ST(STORE(stb, %g1, %o0))
+	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
+	EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
 	add		%i1, 1, %i1
 	bne,pt		%XCC, 1b
 	add		%o0, 1, %o0
@@ -160,7 +243,7 @@
 	and		%i4, 0x7, GLOBAL_SPARE
 	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE
 	mov		64, %i5
-	EX_LD(LOAD_TWIN(%i1, %g2, %g3))
+	EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
 	sub		%i5, GLOBAL_SPARE, %i5
 	mov		16, %o4
 	mov		32, %o5
@@ -178,31 +261,31 @@
 	srlx		WORD3, PRE_SHIFT, TMP; \
 	or		WORD2, TMP, WORD2;
 
-8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
+8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
 	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
 	LOAD(prefetch, %i1 + %i3, #one_read)
 
-	EX_ST(STORE_INIT(%g2, %o0 + 0x00))
-	EX_ST(STORE_INIT(%g3, %o0 + 0x08))
+	EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
+	EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
 
-	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
+	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
 	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
 
-	EX_ST(STORE_INIT(%o2, %o0 + 0x10))
-	EX_ST(STORE_INIT(%o3, %o0 + 0x18))
+	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
+	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
 
-	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
+	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
 	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
 
-	EX_ST(STORE_INIT(%g2, %o0 + 0x20))
-	EX_ST(STORE_INIT(%g3, %o0 + 0x28))
+	EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
+	EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
 
-	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
+	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
 	add		%i1, 64, %i1
 	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
 
-	EX_ST(STORE_INIT(%o2, %o0 + 0x30))
-	EX_ST(STORE_INIT(%o3, %o0 + 0x38))
+	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
+	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
 
 	subcc		%g1, 64, %g1
 	bne,pt		%XCC, 8b
@@ -211,31 +294,31 @@
 	ba,pt		%XCC, 60f
 	 add		%i1, %i4, %i1
 
-9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
+9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
 	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
 	LOAD(prefetch, %i1 + %i3, #one_read)
 
-	EX_ST(STORE_INIT(%g3, %o0 + 0x00))
-	EX_ST(STORE_INIT(%o2, %o0 + 0x08))
+	EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
+	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
 
-	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
+	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
 	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
 
-	EX_ST(STORE_INIT(%o3, %o0 + 0x10))
-	EX_ST(STORE_INIT(%g2, %o0 + 0x18))
+	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
+	EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
 
-	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
+	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
 	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
 
-	EX_ST(STORE_INIT(%g3, %o0 + 0x20))
-	EX_ST(STORE_INIT(%o2, %o0 + 0x28))
+	EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
+	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
 
-	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
+	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
 	add		%i1, 64, %i1
 	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
 
-	EX_ST(STORE_INIT(%o3, %o0 + 0x30))
-	EX_ST(STORE_INIT(%g2, %o0 + 0x38))
+	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
+	EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
 
 	subcc		%g1, 64, %g1
 	bne,pt		%XCC, 9b
@@ -249,25 +332,25 @@
 	 * one twin load ahead, then add 8 back into source when
 	 * we finish the loop.
 	 */
-	EX_LD(LOAD_TWIN(%i1, %o4, %o5))
+	EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
 	mov	16, %o7
 	mov	32, %g2
 	mov	48, %g3
 	mov	64, %o1
-1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
+1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
 	LOAD(prefetch, %i1 + %o1, #one_read)
-	EX_ST(STORE_INIT(%o5, %o0 + 0x00))	! initializes cache line
-	EX_ST(STORE_INIT(%o2, %o0 + 0x08))
-	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
-	EX_ST(STORE_INIT(%o3, %o0 + 0x10))
-	EX_ST(STORE_INIT(%o4, %o0 + 0x18))
-	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
-	EX_ST(STORE_INIT(%o5, %o0 + 0x20))
-	EX_ST(STORE_INIT(%o2, %o0 + 0x28))
-	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5))
+	EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
+	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
+	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
+	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
+	EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
+	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
+	EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
+	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
+	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
 	add		%i1, 64, %i1
-	EX_ST(STORE_INIT(%o3, %o0 + 0x30))
-	EX_ST(STORE_INIT(%o4, %o0 + 0x38))
+	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
+	EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
 	subcc		%g1, 64, %g1
 	bne,pt		%XCC, 1b
 	 add		%o0, 64, %o0
@@ -282,20 +365,20 @@
 	mov	32, %g2
 	mov	48, %g3
 	mov	64, %o1
-1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5))
-	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
+1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
+	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
 	LOAD(prefetch, %i1 + %o1, #one_read)
-	EX_ST(STORE_INIT(%o4, %o0 + 0x00))	! initializes cache line
-	EX_ST(STORE_INIT(%o5, %o0 + 0x08))
-	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
-	EX_ST(STORE_INIT(%o2, %o0 + 0x10))
-	EX_ST(STORE_INIT(%o3, %o0 + 0x18))
-	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
+	EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
+	EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
+	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
+	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
+	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
+	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
 	add	%i1, 64, %i1
-	EX_ST(STORE_INIT(%o4, %o0 + 0x20))
-	EX_ST(STORE_INIT(%o5, %o0 + 0x28))
-	EX_ST(STORE_INIT(%o2, %o0 + 0x30))
-	EX_ST(STORE_INIT(%o3, %o0 + 0x38))
+	EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
+	EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
+	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
+	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
 	subcc	%g1, 64, %g1
 	bne,pt	%XCC, 1b
 	 add	%o0, 64, %o0
@@ -321,28 +404,28 @@
 	andn		%i2, 0xf, %i4
 	and		%i2, 0xf, %i2
 1:	subcc		%i4, 0x10, %i4
-	EX_LD(LOAD(ldx, %i1, %o4))
+	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4)
 	add		%i1, 0x08, %i1
-	EX_LD(LOAD(ldx, %i1, %g1))
+	EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4)
 	sub		%i1, 0x08, %i1
-	EX_ST(STORE(stx, %o4, %i1 + %i3))
+	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4)
 	add		%i1, 0x8, %i1
-	EX_ST(STORE(stx, %g1, %i1 + %i3))
+	EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8)
 	bgu,pt		%XCC, 1b
 	 add		%i1, 0x8, %i1
 73:	andcc		%i2, 0x8, %g0
 	be,pt		%XCC, 1f
 	 nop
 	sub		%i2, 0x8, %i2
-	EX_LD(LOAD(ldx, %i1, %o4))
-	EX_ST(STORE(stx, %o4, %i1 + %i3))
+	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
+	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
 	add		%i1, 0x8, %i1
 1:	andcc		%i2, 0x4, %g0
 	be,pt		%XCC, 1f
 	 nop
 	sub		%i2, 0x4, %i2
-	EX_LD(LOAD(lduw, %i1, %i5))
-	EX_ST(STORE(stw, %i5, %i1 + %i3))
+	EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
+	EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
 	add		%i1, 0x4, %i1
 1:	cmp		%i2, 0
 	be,pt		%XCC, 85f
@@ -358,8 +441,8 @@
 	sub		%i2, %g1, %i2
 
 1:	subcc		%g1, 1, %g1
-	EX_LD(LOAD(ldub, %i1, %i5))
-	EX_ST(STORE(stb, %i5, %i1 + %i3))
+	EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
+	EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
 	bgu,pt		%icc, 1b
 	 add		%i1, 1, %i1
 
@@ -375,16 +458,16 @@
 
 8:	mov		64, %i3
 	andn		%i1, 0x7, %i1
-	EX_LD(LOAD(ldx, %i1, %g2))
+	EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
 	sub		%i3, %g1, %i3
 	andn		%i2, 0x7, %i4
 	sllx		%g2, %g1, %g2
 1:	add		%i1, 0x8, %i1
-	EX_LD(LOAD(ldx, %i1, %g3))
+	EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
 	subcc		%i4, 0x8, %i4
 	srlx		%g3, %i3, %i5
 	or		%i5, %g2, %i5
-	EX_ST(STORE(stx, %i5, %o0))
+	EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4)
 	add		%o0, 0x8, %o0
 	bgu,pt		%icc, 1b
 	 sllx		%g3, %g1, %g2
@@ -404,8 +487,8 @@
 
 1:
 	subcc		%i2, 4, %i2
-	EX_LD(LOAD(lduw, %i1, %g1))
-	EX_ST(STORE(stw, %g1, %i1 + %i3))
+	EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
+	EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
 	bgu,pt		%XCC, 1b
 	 add		%i1, 4, %i1
 
@@ -415,8 +498,8 @@
 	.align		32
 90:
 	subcc		%i2, 1, %i2
-	EX_LD(LOAD(ldub, %i1, %g1))
-	EX_ST(STORE(stb, %g1, %i1 + %i3))
+	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
+	EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
 	bgu,pt		%XCC, 90b
 	 add		%i1, 1, %i1
 	ret
diff --git a/arch/sparc/lib/U1copy_from_user.S b/arch/sparc/lib/U1copy_from_user.S
index ecc5692..bb6ff73 100644
--- a/arch/sparc/lib/U1copy_from_user.S
+++ b/arch/sparc/lib/U1copy_from_user.S
@@ -3,19 +3,19 @@
  * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
  */
 
-#define EX_LD(x)		\
+#define EX_LD(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one;	\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
-#define EX_LD_FP(x)		\
+#define EX_LD_FP(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one_fp;\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
diff --git a/arch/sparc/lib/U1copy_to_user.S b/arch/sparc/lib/U1copy_to_user.S
index 9eea392..ed92ce73 100644
--- a/arch/sparc/lib/U1copy_to_user.S
+++ b/arch/sparc/lib/U1copy_to_user.S
@@ -3,19 +3,19 @@
  * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
  */
 
-#define EX_ST(x)		\
+#define EX_ST(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one;	\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
-#define EX_ST_FP(x)		\
+#define EX_ST_FP(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one_fp;\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
diff --git a/arch/sparc/lib/U1memcpy.S b/arch/sparc/lib/U1memcpy.S
index 97e1b21..4f0d50b 100644
--- a/arch/sparc/lib/U1memcpy.S
+++ b/arch/sparc/lib/U1memcpy.S
@@ -5,6 +5,7 @@
  */
 
 #ifdef __KERNEL__
+#include <linux/linkage.h>
 #include <asm/visasm.h>
 #include <asm/asi.h>
 #include <asm/export.h>
@@ -24,21 +25,17 @@
 #endif
 
 #ifndef EX_LD
-#define EX_LD(x)	x
+#define EX_LD(x,y)	x
 #endif
 #ifndef EX_LD_FP
-#define EX_LD_FP(x)	x
+#define EX_LD_FP(x,y)	x
 #endif
 
 #ifndef EX_ST
-#define EX_ST(x)	x
+#define EX_ST(x,y)	x
 #endif
 #ifndef EX_ST_FP
-#define EX_ST_FP(x)	x
-#endif
-
-#ifndef EX_RETVAL
-#define EX_RETVAL(x)	x
+#define EX_ST_FP(x,y)	x
 #endif
 
 #ifndef LOAD
@@ -79,53 +76,169 @@
 	faligndata		%f7, %f8, %f60;			\
 	faligndata		%f8, %f9, %f62;
 
-#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt)	\
-	EX_LD_FP(LOAD_BLK(%src, %fdest));				\
-	EX_ST_FP(STORE_BLK(%fsrc, %dest));				\
-	add			%src, 0x40, %src;		\
-	subcc			%len, 0x40, %len;		\
-	be,pn			%xcc, jmptgt;			\
-	 add			%dest, 0x40, %dest;		\
+#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, jmptgt)			\
+	EX_LD_FP(LOAD_BLK(%src, %fdest), U1_gs_80_fp);			\
+	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_80_fp);			\
+	add			%src, 0x40, %src;			\
+	subcc			%GLOBAL_SPARE, 0x40, %GLOBAL_SPARE;	\
+	be,pn			%xcc, jmptgt;				\
+	 add			%dest, 0x40, %dest;			\
 
-#define LOOP_CHUNK1(src, dest, len, branch_dest)		\
-	MAIN_LOOP_CHUNK(src, dest, f0,  f48, len, branch_dest)
-#define LOOP_CHUNK2(src, dest, len, branch_dest)		\
-	MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest)
-#define LOOP_CHUNK3(src, dest, len, branch_dest)		\
-	MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)
+#define LOOP_CHUNK1(src, dest, branch_dest)		\
+	MAIN_LOOP_CHUNK(src, dest, f0,  f48, branch_dest)
+#define LOOP_CHUNK2(src, dest, branch_dest)		\
+	MAIN_LOOP_CHUNK(src, dest, f16, f48, branch_dest)
+#define LOOP_CHUNK3(src, dest, branch_dest)		\
+	MAIN_LOOP_CHUNK(src, dest, f32, f48, branch_dest)
 
 #define DO_SYNC			membar	#Sync;
 #define STORE_SYNC(dest, fsrc)				\
-	EX_ST_FP(STORE_BLK(%fsrc, %dest));			\
+	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_80_fp);	\
 	add			%dest, 0x40, %dest;	\
 	DO_SYNC
 
 #define STORE_JUMP(dest, fsrc, target)			\
-	EX_ST_FP(STORE_BLK(%fsrc, %dest));			\
+	EX_ST_FP(STORE_BLK(%fsrc, %dest), U1_gs_40_fp);	\
 	add			%dest, 0x40, %dest;	\
 	ba,pt			%xcc, target;		\
 	 nop;
 
-#define FINISH_VISCHUNK(dest, f0, f1, left)	\
-	subcc			%left, 8, %left;\
-	bl,pn			%xcc, 95f;	\
-	 faligndata		%f0, %f1, %f48;	\
-	EX_ST_FP(STORE(std, %f48, %dest));		\
+#define FINISH_VISCHUNK(dest, f0, f1)			\
+	subcc			%g3, 8, %g3;		\
+	bl,pn			%xcc, 95f;		\
+	 faligndata		%f0, %f1, %f48;		\
+	EX_ST_FP(STORE(std, %f48, %dest), U1_g3_8_fp);	\
 	add			%dest, 8, %dest;
 
-#define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left)	\
-	subcc			%left, 8, %left;	\
-	bl,pn			%xcc, 95f;		\
+#define UNEVEN_VISCHUNK_LAST(dest, f0, f1)	\
+	subcc			%g3, 8, %g3;	\
+	bl,pn			%xcc, 95f;	\
 	 fsrc2			%f0, %f1;
 
-#define UNEVEN_VISCHUNK(dest, f0, f1, left)		\
-	UNEVEN_VISCHUNK_LAST(dest, f0, f1, left)	\
+#define UNEVEN_VISCHUNK(dest, f0, f1)		\
+	UNEVEN_VISCHUNK_LAST(dest, f0, f1)	\
 	ba,a,pt			%xcc, 93f;
 
 	.register	%g2,#scratch
 	.register	%g3,#scratch
 
 	.text
+#ifndef EX_RETVAL
+#define EX_RETVAL(x)	x
+ENTRY(U1_g1_1_fp)
+	VISExitHalf
+	add		%g1, 1, %g1
+	add		%g1, %g2, %g1
+	retl
+	 add		%g1, %o2, %o0
+ENDPROC(U1_g1_1_fp)
+ENTRY(U1_g2_0_fp)
+	VISExitHalf
+	retl
+	 add		%g2, %o2, %o0
+ENDPROC(U1_g2_0_fp)
+ENTRY(U1_g2_8_fp)
+	VISExitHalf
+	add		%g2, 8, %g2
+	retl
+	 add		%g2, %o2, %o0
+ENDPROC(U1_g2_8_fp)
+ENTRY(U1_gs_0_fp)
+	VISExitHalf
+	add		%GLOBAL_SPARE, %g3, %o0
+	retl
+	 add		%o0, %o2, %o0
+ENDPROC(U1_gs_0_fp)
+ENTRY(U1_gs_80_fp)
+	VISExitHalf
+	add		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
+	add		%GLOBAL_SPARE, %g3, %o0
+	retl
+	 add		%o0, %o2, %o0
+ENDPROC(U1_gs_80_fp)
+ENTRY(U1_gs_40_fp)
+	VISExitHalf
+	add		%GLOBAL_SPARE, 0x40, %GLOBAL_SPARE
+	add		%GLOBAL_SPARE, %g3, %o0
+	retl
+	 add		%o0, %o2, %o0
+ENDPROC(U1_gs_40_fp)
+ENTRY(U1_g3_0_fp)
+	VISExitHalf
+	retl
+	 add		%g3, %o2, %o0
+ENDPROC(U1_g3_0_fp)
+ENTRY(U1_g3_8_fp)
+	VISExitHalf
+	add		%g3, 8, %g3
+	retl
+	 add		%g3, %o2, %o0
+ENDPROC(U1_g3_8_fp)
+ENTRY(U1_o2_0_fp)
+	VISExitHalf
+	retl
+	 mov		%o2, %o0
+ENDPROC(U1_o2_0_fp)
+ENTRY(U1_o2_1_fp)
+	VISExitHalf
+	retl
+	 add		%o2, 1, %o0
+ENDPROC(U1_o2_1_fp)
+ENTRY(U1_gs_0)
+	VISExitHalf
+	retl
+	 add		%GLOBAL_SPARE, %o2, %o0
+ENDPROC(U1_gs_0)
+ENTRY(U1_gs_8)
+	VISExitHalf
+	add		%GLOBAL_SPARE, %o2, %GLOBAL_SPARE
+	retl
+	 add		%GLOBAL_SPARE, 0x8, %o0
+ENDPROC(U1_gs_8)
+ENTRY(U1_gs_10)
+	VISExitHalf
+	add		%GLOBAL_SPARE, %o2, %GLOBAL_SPARE
+	retl
+	 add		%GLOBAL_SPARE, 0x10, %o0
+ENDPROC(U1_gs_10)
+ENTRY(U1_o2_0)
+	retl
+	 mov		%o2, %o0
+ENDPROC(U1_o2_0)
+ENTRY(U1_o2_8)
+	retl
+	 add		%o2, 8, %o0
+ENDPROC(U1_o2_8)
+ENTRY(U1_o2_4)
+	retl
+	 add		%o2, 4, %o0
+ENDPROC(U1_o2_4)
+ENTRY(U1_o2_1)
+	retl
+	 add		%o2, 1, %o0
+ENDPROC(U1_o2_1)
+ENTRY(U1_g1_0)
+	retl
+	 add		%g1, %o2, %o0
+ENDPROC(U1_g1_0)
+ENTRY(U1_g1_1)
+	add		%g1, 1, %g1
+	retl
+	 add		%g1, %o2, %o0
+ENDPROC(U1_g1_1)
+ENTRY(U1_gs_0_o2_adj)
+	and		%o2, 7, %o2
+	retl
+	 add		%GLOBAL_SPARE, %o2, %o0
+ENDPROC(U1_gs_0_o2_adj)
+ENTRY(U1_gs_8_o2_adj)
+	and		%o2, 7, %o2
+	add		%GLOBAL_SPARE, 8, %GLOBAL_SPARE
+	retl
+	 add		%GLOBAL_SPARE, %o2, %o0
+ENDPROC(U1_gs_8_o2_adj)
+#endif
+
 	.align		64
 
 	.globl		FUNC_NAME
@@ -167,8 +280,8 @@
 	 and		%g2, 0x38, %g2
 
 1:	subcc		%g1, 0x1, %g1
-	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3))
-	EX_ST_FP(STORE(stb, %o3, %o1 + %GLOBAL_SPARE))
+	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U1_g1_1_fp)
+	EX_ST_FP(STORE(stb, %o3, %o1 + %GLOBAL_SPARE), U1_g1_1_fp)
 	bgu,pt		%XCC, 1b
 	 add		%o1, 0x1, %o1
 
@@ -179,20 +292,20 @@
 	be,pt		%icc, 3f
 	 alignaddr	%o1, %g0, %o1
 
-	EX_LD_FP(LOAD(ldd, %o1, %f4))
-1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6))
+	EX_LD_FP(LOAD(ldd, %o1, %f4), U1_g2_0_fp)
+1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U1_g2_0_fp)
 	add		%o1, 0x8, %o1
 	subcc		%g2, 0x8, %g2
 	faligndata	%f4, %f6, %f0
-	EX_ST_FP(STORE(std, %f0, %o0))
+	EX_ST_FP(STORE(std, %f0, %o0), U1_g2_8_fp)
 	be,pn		%icc, 3f
 	 add		%o0, 0x8, %o0
 
-	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U1_g2_0_fp)
 	add		%o1, 0x8, %o1
 	subcc		%g2, 0x8, %g2
 	faligndata	%f6, %f4, %f0
-	EX_ST_FP(STORE(std, %f0, %o0))
+	EX_ST_FP(STORE(std, %f0, %o0), U1_g2_8_fp)
 	bne,pt		%icc, 1b
 	 add		%o0, 0x8, %o0
 
@@ -215,13 +328,13 @@
 	add		%g1, %GLOBAL_SPARE, %g1
 	subcc		%o2, %g3, %o2
 
-	EX_LD_FP(LOAD_BLK(%o1, %f0))
+	EX_LD_FP(LOAD_BLK(%o1, %f0), U1_gs_0_fp)
 	add		%o1, 0x40, %o1
 	add		%g1, %g3, %g1
-	EX_LD_FP(LOAD_BLK(%o1, %f16))
+	EX_LD_FP(LOAD_BLK(%o1, %f16), U1_gs_0_fp)
 	add		%o1, 0x40, %o1
 	sub		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
-	EX_LD_FP(LOAD_BLK(%o1, %f32))
+	EX_LD_FP(LOAD_BLK(%o1, %f32), U1_gs_80_fp)
 	add		%o1, 0x40, %o1
 
 	/* There are 8 instances of the unrolled loop,
@@ -241,11 +354,11 @@
 
 	.align		64
 1:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
-	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+	LOOP_CHUNK1(o1, o0, 1f)
 	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
-	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+	LOOP_CHUNK2(o1, o0, 2f)
 	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
-	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+	LOOP_CHUNK3(o1, o0, 3f)
 	ba,pt		%xcc, 1b+4
 	 faligndata	%f0, %f2, %f48
 1:	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
@@ -262,11 +375,11 @@
 	STORE_JUMP(o0, f48, 56f)
 
 1:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
-	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+	LOOP_CHUNK1(o1, o0, 1f)
 	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
-	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+	LOOP_CHUNK2(o1, o0, 2f)
 	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
-	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+	LOOP_CHUNK3(o1, o0, 3f)
 	ba,pt		%xcc, 1b+4
 	 faligndata	%f2, %f4, %f48
 1:	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
@@ -283,11 +396,11 @@
 	STORE_JUMP(o0, f48, 57f)
 
 1:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
-	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+	LOOP_CHUNK1(o1, o0, 1f)
 	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
-	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+	LOOP_CHUNK2(o1, o0, 2f)
 	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
-	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+	LOOP_CHUNK3(o1, o0, 3f)
 	ba,pt		%xcc, 1b+4
 	 faligndata	%f4, %f6, %f48
 1:	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
@@ -304,11 +417,11 @@
 	STORE_JUMP(o0, f48, 58f)
 
 1:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
-	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+	LOOP_CHUNK1(o1, o0, 1f)
 	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
-	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+	LOOP_CHUNK2(o1, o0, 2f)
 	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) 
-	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+	LOOP_CHUNK3(o1, o0, 3f)
 	ba,pt		%xcc, 1b+4
 	 faligndata	%f6, %f8, %f48
 1:	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
@@ -325,11 +438,11 @@
 	STORE_JUMP(o0, f48, 59f)
 
 1:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
-	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+	LOOP_CHUNK1(o1, o0, 1f)
 	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
-	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+	LOOP_CHUNK2(o1, o0, 2f)
 	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
-	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+	LOOP_CHUNK3(o1, o0, 3f)
 	ba,pt		%xcc, 1b+4
 	 faligndata	%f8, %f10, %f48
 1:	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
@@ -346,11 +459,11 @@
 	STORE_JUMP(o0, f48, 60f)
 
 1:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
-	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+	LOOP_CHUNK1(o1, o0, 1f)
 	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
-	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+	LOOP_CHUNK2(o1, o0, 2f)
 	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
-	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+	LOOP_CHUNK3(o1, o0, 3f)
 	ba,pt		%xcc, 1b+4
 	 faligndata	%f10, %f12, %f48
 1:	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
@@ -367,11 +480,11 @@
 	STORE_JUMP(o0, f48, 61f)
 
 1:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
-	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+	LOOP_CHUNK1(o1, o0, 1f)
 	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
-	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+	LOOP_CHUNK2(o1, o0, 2f)
 	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
-	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+	LOOP_CHUNK3(o1, o0, 3f)
 	ba,pt		%xcc, 1b+4
 	 faligndata	%f12, %f14, %f48
 1:	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
@@ -388,11 +501,11 @@
 	STORE_JUMP(o0, f48, 62f)
 
 1:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
-	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
+	LOOP_CHUNK1(o1, o0, 1f)
 	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
-	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
+	LOOP_CHUNK2(o1, o0, 2f)
 	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
-	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
+	LOOP_CHUNK3(o1, o0, 3f)
 	ba,pt		%xcc, 1b+4
 	 faligndata	%f14, %f16, %f48
 1:	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
@@ -408,53 +521,53 @@
 	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
 	STORE_JUMP(o0, f48, 63f)
 
-40:	FINISH_VISCHUNK(o0, f0,  f2,  g3)
-41:	FINISH_VISCHUNK(o0, f2,  f4,  g3)
-42:	FINISH_VISCHUNK(o0, f4,  f6,  g3)
-43:	FINISH_VISCHUNK(o0, f6,  f8,  g3)
-44:	FINISH_VISCHUNK(o0, f8,  f10, g3)
-45:	FINISH_VISCHUNK(o0, f10, f12, g3)
-46:	FINISH_VISCHUNK(o0, f12, f14, g3)
-47:	UNEVEN_VISCHUNK(o0, f14, f0,  g3)
-48:	FINISH_VISCHUNK(o0, f16, f18, g3)
-49:	FINISH_VISCHUNK(o0, f18, f20, g3)
-50:	FINISH_VISCHUNK(o0, f20, f22, g3)
-51:	FINISH_VISCHUNK(o0, f22, f24, g3)
-52:	FINISH_VISCHUNK(o0, f24, f26, g3)
-53:	FINISH_VISCHUNK(o0, f26, f28, g3)
-54:	FINISH_VISCHUNK(o0, f28, f30, g3)
-55:	UNEVEN_VISCHUNK(o0, f30, f0,  g3)
-56:	FINISH_VISCHUNK(o0, f32, f34, g3)
-57:	FINISH_VISCHUNK(o0, f34, f36, g3)
-58:	FINISH_VISCHUNK(o0, f36, f38, g3)
-59:	FINISH_VISCHUNK(o0, f38, f40, g3)
-60:	FINISH_VISCHUNK(o0, f40, f42, g3)
-61:	FINISH_VISCHUNK(o0, f42, f44, g3)
-62:	FINISH_VISCHUNK(o0, f44, f46, g3)
-63:	UNEVEN_VISCHUNK_LAST(o0, f46, f0,  g3)
+40:	FINISH_VISCHUNK(o0, f0,  f2)
+41:	FINISH_VISCHUNK(o0, f2,  f4)
+42:	FINISH_VISCHUNK(o0, f4,  f6)
+43:	FINISH_VISCHUNK(o0, f6,  f8)
+44:	FINISH_VISCHUNK(o0, f8,  f10)
+45:	FINISH_VISCHUNK(o0, f10, f12)
+46:	FINISH_VISCHUNK(o0, f12, f14)
+47:	UNEVEN_VISCHUNK(o0, f14, f0)
+48:	FINISH_VISCHUNK(o0, f16, f18)
+49:	FINISH_VISCHUNK(o0, f18, f20)
+50:	FINISH_VISCHUNK(o0, f20, f22)
+51:	FINISH_VISCHUNK(o0, f22, f24)
+52:	FINISH_VISCHUNK(o0, f24, f26)
+53:	FINISH_VISCHUNK(o0, f26, f28)
+54:	FINISH_VISCHUNK(o0, f28, f30)
+55:	UNEVEN_VISCHUNK(o0, f30, f0)
+56:	FINISH_VISCHUNK(o0, f32, f34)
+57:	FINISH_VISCHUNK(o0, f34, f36)
+58:	FINISH_VISCHUNK(o0, f36, f38)
+59:	FINISH_VISCHUNK(o0, f38, f40)
+60:	FINISH_VISCHUNK(o0, f40, f42)
+61:	FINISH_VISCHUNK(o0, f42, f44)
+62:	FINISH_VISCHUNK(o0, f44, f46)
+63:	UNEVEN_VISCHUNK_LAST(o0, f46, f0)
 
-93:	EX_LD_FP(LOAD(ldd, %o1, %f2))
+93:	EX_LD_FP(LOAD(ldd, %o1, %f2), U1_g3_0_fp)
 	add		%o1, 8, %o1
 	subcc		%g3, 8, %g3
 	faligndata	%f0, %f2, %f8
-	EX_ST_FP(STORE(std, %f8, %o0))
+	EX_ST_FP(STORE(std, %f8, %o0), U1_g3_8_fp)
 	bl,pn		%xcc, 95f
 	 add		%o0, 8, %o0
-	EX_LD_FP(LOAD(ldd, %o1, %f0))
+	EX_LD_FP(LOAD(ldd, %o1, %f0), U1_g3_0_fp)
 	add		%o1, 8, %o1
 	subcc		%g3, 8, %g3
 	faligndata	%f2, %f0, %f8
-	EX_ST_FP(STORE(std, %f8, %o0))
+	EX_ST_FP(STORE(std, %f8, %o0), U1_g3_8_fp)
 	bge,pt		%xcc, 93b
 	 add		%o0, 8, %o0
 
 95:	brz,pt		%o2, 2f
 	 mov		%g1, %o1
 
-1:	EX_LD_FP(LOAD(ldub, %o1, %o3))
+1:	EX_LD_FP(LOAD(ldub, %o1, %o3), U1_o2_0_fp)
 	add		%o1, 1, %o1
 	subcc		%o2, 1, %o2
-	EX_ST_FP(STORE(stb, %o3, %o0))
+	EX_ST_FP(STORE(stb, %o3, %o0), U1_o2_1_fp)
 	bne,pt		%xcc, 1b
 	 add		%o0, 1, %o0
 
@@ -470,27 +583,27 @@
 
 72:	andn		%o2, 0xf, %GLOBAL_SPARE
 	and		%o2, 0xf, %o2
-1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
-	EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U1_gs_0)
+	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U1_gs_0)
 	subcc		%GLOBAL_SPARE, 0x10, %GLOBAL_SPARE
-	EX_ST(STORE(stx, %o5, %o1 + %o3))
+	EX_ST(STORE(stx, %o5, %o1 + %o3), U1_gs_10)
 	add		%o1, 0x8, %o1
-	EX_ST(STORE(stx, %g1, %o1 + %o3))
+	EX_ST(STORE(stx, %g1, %o1 + %o3), U1_gs_8)
 	bgu,pt		%XCC, 1b
 	 add		%o1, 0x8, %o1
 73:	andcc		%o2, 0x8, %g0
 	be,pt		%XCC, 1f
 	 nop
-	EX_LD(LOAD(ldx, %o1, %o5))
+	EX_LD(LOAD(ldx, %o1, %o5), U1_o2_0)
 	sub		%o2, 0x8, %o2
-	EX_ST(STORE(stx, %o5, %o1 + %o3))
+	EX_ST(STORE(stx, %o5, %o1 + %o3), U1_o2_8)
 	add		%o1, 0x8, %o1
 1:	andcc		%o2, 0x4, %g0
 	be,pt		%XCC, 1f
 	 nop
-	EX_LD(LOAD(lduw, %o1, %o5))
+	EX_LD(LOAD(lduw, %o1, %o5), U1_o2_0)
 	sub		%o2, 0x4, %o2
-	EX_ST(STORE(stw, %o5, %o1 + %o3))
+	EX_ST(STORE(stw, %o5, %o1 + %o3), U1_o2_4)
 	add		%o1, 0x4, %o1
 1:	cmp		%o2, 0
 	be,pt		%XCC, 85f
@@ -504,9 +617,9 @@
 	 sub		%g0, %g1, %g1
 	sub		%o2, %g1, %o2
 
-1:	EX_LD(LOAD(ldub, %o1, %o5))
+1:	EX_LD(LOAD(ldub, %o1, %o5), U1_g1_0)
 	subcc		%g1, 1, %g1
-	EX_ST(STORE(stb, %o5, %o1 + %o3))
+	EX_ST(STORE(stb, %o5, %o1 + %o3), U1_g1_1)
 	bgu,pt		%icc, 1b
 	 add		%o1, 1, %o1
 
@@ -522,16 +635,16 @@
 
 8:	mov		64, %o3
 	andn		%o1, 0x7, %o1
-	EX_LD(LOAD(ldx, %o1, %g2))
+	EX_LD(LOAD(ldx, %o1, %g2), U1_o2_0)
 	sub		%o3, %g1, %o3
 	andn		%o2, 0x7, %GLOBAL_SPARE
 	sllx		%g2, %g1, %g2
-1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
+1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U1_gs_0_o2_adj)
 	subcc		%GLOBAL_SPARE, 0x8, %GLOBAL_SPARE
 	add		%o1, 0x8, %o1
 	srlx		%g3, %o3, %o5
 	or		%o5, %g2, %o5
-	EX_ST(STORE(stx, %o5, %o0))
+	EX_ST(STORE(stx, %o5, %o0), U1_gs_8_o2_adj)
 	add		%o0, 0x8, %o0
 	bgu,pt		%icc, 1b
 	 sllx		%g3, %g1, %g2
@@ -549,9 +662,9 @@
 	bne,pn		%XCC, 90f
 	 sub		%o0, %o1, %o3
 
-1:	EX_LD(LOAD(lduw, %o1, %g1))
+1:	EX_LD(LOAD(lduw, %o1, %g1), U1_o2_0)
 	subcc		%o2, 4, %o2
-	EX_ST(STORE(stw, %g1, %o1 + %o3))
+	EX_ST(STORE(stw, %g1, %o1 + %o3), U1_o2_4)
 	bgu,pt		%XCC, 1b
 	 add		%o1, 4, %o1
 
@@ -559,9 +672,9 @@
 	 mov		EX_RETVAL(%o4), %o0
 
 	.align		32
-90:	EX_LD(LOAD(ldub, %o1, %g1))
+90:	EX_LD(LOAD(ldub, %o1, %g1), U1_o2_0)
 	subcc		%o2, 1, %o2
-	EX_ST(STORE(stb, %g1, %o1 + %o3))
+	EX_ST(STORE(stb, %g1, %o1 + %o3), U1_o2_1)
 	bgu,pt		%XCC, 90b
 	 add		%o1, 1, %o1
 	retl
diff --git a/arch/sparc/lib/U3copy_from_user.S b/arch/sparc/lib/U3copy_from_user.S
index 88ad73d..db73010 100644
--- a/arch/sparc/lib/U3copy_from_user.S
+++ b/arch/sparc/lib/U3copy_from_user.S
@@ -3,19 +3,19 @@
  * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
  */
 
-#define EX_LD(x)		\
+#define EX_LD(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one;	\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
-#define EX_LD_FP(x)		\
+#define EX_LD_FP(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one_fp;\
+	.word 98b, y##_fp;	\
 	.text;			\
 	.align 4;
 
diff --git a/arch/sparc/lib/U3copy_to_user.S b/arch/sparc/lib/U3copy_to_user.S
index 845139d..c4ee858 100644
--- a/arch/sparc/lib/U3copy_to_user.S
+++ b/arch/sparc/lib/U3copy_to_user.S
@@ -3,19 +3,19 @@
  * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
  */
 
-#define EX_ST(x)		\
+#define EX_ST(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one;	\
+	.word 98b, y;		\
 	.text;			\
 	.align 4;
 
-#define EX_ST_FP(x)		\
+#define EX_ST_FP(x,y)		\
 98:	x;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one_fp;\
+	.word 98b, y##_fp;	\
 	.text;			\
 	.align 4;
 
diff --git a/arch/sparc/lib/U3memcpy.S b/arch/sparc/lib/U3memcpy.S
index 491ee69..54f9870 100644
--- a/arch/sparc/lib/U3memcpy.S
+++ b/arch/sparc/lib/U3memcpy.S
@@ -4,6 +4,7 @@
  */
 
 #ifdef __KERNEL__
+#include <linux/linkage.h>
 #include <asm/visasm.h>
 #include <asm/asi.h>
 #define GLOBAL_SPARE	%g7
@@ -22,21 +23,17 @@
 #endif
 
 #ifndef EX_LD
-#define EX_LD(x)	x
+#define EX_LD(x,y)	x
 #endif
 #ifndef EX_LD_FP
-#define EX_LD_FP(x)	x
+#define EX_LD_FP(x,y)	x
 #endif
 
 #ifndef EX_ST
-#define EX_ST(x)	x
+#define EX_ST(x,y)	x
 #endif
 #ifndef EX_ST_FP
-#define EX_ST_FP(x)	x
-#endif
-
-#ifndef EX_RETVAL
-#define EX_RETVAL(x)	x
+#define EX_ST_FP(x,y)	x
 #endif
 
 #ifndef LOAD
@@ -77,6 +74,87 @@
 	 */
 
 	.text
+#ifndef EX_RETVAL
+#define EX_RETVAL(x)	x
+__restore_fp:
+	VISExitHalf
+	retl
+	 nop
+ENTRY(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
+	add	%g1, 1, %g1
+	add	%g2, %g1, %g2
+	ba,pt	%xcc, __restore_fp
+	 add	%o2, %g2, %o0
+ENDPROC(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
+ENTRY(U3_retl_o2_plus_g2_fp)
+	ba,pt	%xcc, __restore_fp
+	 add	%o2, %g2, %o0
+ENDPROC(U3_retl_o2_plus_g2_fp)
+ENTRY(U3_retl_o2_plus_g2_plus_8_fp)
+	add	%g2, 8, %g2
+	ba,pt	%xcc, __restore_fp
+	 add	%o2, %g2, %o0
+ENDPROC(U3_retl_o2_plus_g2_plus_8_fp)
+ENTRY(U3_retl_o2)
+	retl
+	 mov	%o2, %o0
+ENDPROC(U3_retl_o2)
+ENTRY(U3_retl_o2_plus_1)
+	retl
+	 add	%o2, 1, %o0
+ENDPROC(U3_retl_o2_plus_1)
+ENTRY(U3_retl_o2_plus_4)
+	retl
+	 add	%o2, 4, %o0
+ENDPROC(U3_retl_o2_plus_4)
+ENTRY(U3_retl_o2_plus_8)
+	retl
+	 add	%o2, 8, %o0
+ENDPROC(U3_retl_o2_plus_8)
+ENTRY(U3_retl_o2_plus_g1_plus_1)
+	add	%g1, 1, %g1
+	retl
+	 add	%o2, %g1, %o0
+ENDPROC(U3_retl_o2_plus_g1_plus_1)
+ENTRY(U3_retl_o2_fp)
+	ba,pt	%xcc, __restore_fp
+	 mov	%o2, %o0
+ENDPROC(U3_retl_o2_fp)
+ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
+	sll	%o3, 6, %o3
+	add	%o3, 0x80, %o3
+	ba,pt	%xcc, __restore_fp
+	 add	%o2, %o3, %o0
+ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
+ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
+	sll	%o3, 6, %o3
+	add	%o3, 0x40, %o3
+	ba,pt	%xcc, __restore_fp
+	 add	%o2, %o3, %o0
+ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
+ENTRY(U3_retl_o2_plus_GS_plus_0x10)
+	add	GLOBAL_SPARE, 0x10, GLOBAL_SPARE
+	retl
+	 add	%o2, GLOBAL_SPARE, %o0
+ENDPROC(U3_retl_o2_plus_GS_plus_0x10)
+ENTRY(U3_retl_o2_plus_GS_plus_0x08)
+	add	GLOBAL_SPARE, 0x08, GLOBAL_SPARE
+	retl
+	 add	%o2, GLOBAL_SPARE, %o0
+ENDPROC(U3_retl_o2_plus_GS_plus_0x08)
+ENTRY(U3_retl_o2_and_7_plus_GS)
+	and	%o2, 7, %o2
+	retl
+	 add	%o2, GLOBAL_SPARE, %o2
+ENDPROC(U3_retl_o2_and_7_plus_GS)
+ENTRY(U3_retl_o2_and_7_plus_GS_plus_8)
+	add	GLOBAL_SPARE, 8, GLOBAL_SPARE
+	and	%o2, 7, %o2
+	retl
+	 add	%o2, GLOBAL_SPARE, %o2
+ENDPROC(U3_retl_o2_and_7_plus_GS_plus_8)
+#endif
+
 	.align		64
 
 	/* The cheetah's flexible spine, oversized liver, enlarged heart,
@@ -126,8 +204,8 @@
 	 and		%g2, 0x38, %g2
 
 1:	subcc		%g1, 0x1, %g1
-	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3))
-	EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE))
+	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U3_retl_o2_plus_g2_plus_g1_plus_1)
+	EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE), U3_retl_o2_plus_g2_plus_g1_plus_1)
 	bgu,pt		%XCC, 1b
 	 add		%o1, 0x1, %o1
 
@@ -138,20 +216,20 @@
 	be,pt		%icc, 3f
 	 alignaddr	%o1, %g0, %o1
 
-	EX_LD_FP(LOAD(ldd, %o1, %f4))
-1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6))
+	EX_LD_FP(LOAD(ldd, %o1, %f4), U3_retl_o2_plus_g2)
+1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U3_retl_o2_plus_g2)
 	add		%o1, 0x8, %o1
 	subcc		%g2, 0x8, %g2
 	faligndata	%f4, %f6, %f0
-	EX_ST_FP(STORE(std, %f0, %o0))
+	EX_ST_FP(STORE(std, %f0, %o0), U3_retl_o2_plus_g2_plus_8)
 	be,pn		%icc, 3f
 	 add		%o0, 0x8, %o0
 
-	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U3_retl_o2_plus_g2)
 	add		%o1, 0x8, %o1
 	subcc		%g2, 0x8, %g2
 	faligndata	%f6, %f4, %f2
-	EX_ST_FP(STORE(std, %f2, %o0))
+	EX_ST_FP(STORE(std, %f2, %o0), U3_retl_o2_plus_g2_plus_8)
 	bne,pt		%icc, 1b
 	 add		%o0, 0x8, %o0
 
@@ -161,25 +239,25 @@
 	LOAD(prefetch, %o1 + 0x080, #one_read)
 	LOAD(prefetch, %o1 + 0x0c0, #one_read)
 	LOAD(prefetch, %o1 + 0x100, #one_read)
-	EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0), U3_retl_o2)
 	LOAD(prefetch, %o1 + 0x140, #one_read)
-	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2)
 	LOAD(prefetch, %o1 + 0x180, #one_read)
-	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2)
 	LOAD(prefetch, %o1 + 0x1c0, #one_read)
 	faligndata	%f0, %f2, %f16
-	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2)
 	faligndata	%f2, %f4, %f18
-	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2)
 	faligndata	%f4, %f6, %f20
-	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2)
 	faligndata	%f6, %f8, %f22
 
-	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2)
 	faligndata	%f8, %f10, %f24
-	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2)
 	faligndata	%f10, %f12, %f26
-	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2)
 
 	subcc		GLOBAL_SPARE, 0x80, GLOBAL_SPARE
 	add		%o1, 0x40, %o1
@@ -190,26 +268,26 @@
 
 	.align		64
 1:
-	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
 	faligndata	%f12, %f14, %f28
-	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
 	faligndata	%f14, %f0, %f30
-	EX_ST_FP(STORE_BLK(%f16, %o0))
-	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6))
+	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
+	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
 	faligndata	%f0, %f2, %f16
 	add		%o0, 0x40, %o0
 
-	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
 	faligndata	%f2, %f4, %f18
-	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
 	faligndata	%f4, %f6, %f20
-	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
 	subcc		%o3, 0x01, %o3
 	faligndata	%f6, %f8, %f22
-	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x80)
 
 	faligndata	%f8, %f10, %f24
-	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
 	LOAD(prefetch, %o1 + 0x1c0, #one_read)
 	faligndata	%f10, %f12, %f26
 	bg,pt		%XCC, 1b
@@ -217,29 +295,29 @@
 
 	/* Finally we copy the last full 64-byte block. */
 2:
-	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
 	faligndata	%f12, %f14, %f28
-	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
 	faligndata	%f14, %f0, %f30
-	EX_ST_FP(STORE_BLK(%f16, %o0))
-	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6))
+	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
+	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
 	faligndata	%f0, %f2, %f16
-	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
 	faligndata	%f2, %f4, %f18
-	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
 	faligndata	%f4, %f6, %f20
-	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
 	faligndata	%f6, %f8, %f22
-	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x40)
 	faligndata	%f8, %f10, %f24
 	cmp		%g1, 0
 	be,pt		%XCC, 1f
 	 add		%o0, 0x40, %o0
-	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
 1:	faligndata	%f10, %f12, %f26
 	faligndata	%f12, %f14, %f28
 	faligndata	%f14, %f0, %f30
-	EX_ST_FP(STORE_BLK(%f16, %o0))
+	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
 	add		%o0, 0x40, %o0
 	add		%o1, 0x40, %o1
 	membar		#Sync
@@ -259,20 +337,20 @@
 
 	sub		%o2, %g2, %o2
 	be,a,pt		%XCC, 1f
-	 EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0))
+	 EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0), U3_retl_o2_plus_g2)
 
-1:	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2))
+1:	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2), U3_retl_o2_plus_g2)
 	add		%o1, 0x8, %o1
 	subcc		%g2, 0x8, %g2
 	faligndata	%f0, %f2, %f8
-	EX_ST_FP(STORE(std, %f8, %o0))
+	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
 	be,pn		%XCC, 2f
 	 add		%o0, 0x8, %o0
-	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0))
+	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0), U3_retl_o2_plus_g2)
 	add		%o1, 0x8, %o1
 	subcc		%g2, 0x8, %g2
 	faligndata	%f2, %f0, %f8
-	EX_ST_FP(STORE(std, %f8, %o0))
+	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
 	bne,pn		%XCC, 1b
 	 add		%o0, 0x8, %o0
 
@@ -292,30 +370,33 @@
 	 andcc		%o2, 0x8, %g0
 	be,pt		%icc, 1f
 	 nop
-	EX_LD(LOAD(ldx, %o1, %o5))
-	EX_ST(STORE(stx, %o5, %o1 + %o3))
+	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2)
+	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2)
 	add		%o1, 0x8, %o1
+	sub		%o2, 8, %o2
 
 1:	andcc		%o2, 0x4, %g0
 	be,pt		%icc, 1f
 	 nop
-	EX_LD(LOAD(lduw, %o1, %o5))
-	EX_ST(STORE(stw, %o5, %o1 + %o3))
+	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2)
+	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2)
 	add		%o1, 0x4, %o1
+	sub		%o2, 4, %o2
 
 1:	andcc		%o2, 0x2, %g0
 	be,pt		%icc, 1f
 	 nop
-	EX_LD(LOAD(lduh, %o1, %o5))
-	EX_ST(STORE(sth, %o5, %o1 + %o3))
+	EX_LD(LOAD(lduh, %o1, %o5), U3_retl_o2)
+	EX_ST(STORE(sth, %o5, %o1 + %o3), U3_retl_o2)
 	add		%o1, 0x2, %o1
+	sub		%o2, 2, %o2
 
 1:	andcc		%o2, 0x1, %g0
 	be,pt		%icc, 85f
 	 nop
-	EX_LD(LOAD(ldub, %o1, %o5))
+	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2)
 	ba,pt		%xcc, 85f
-	 EX_ST(STORE(stb, %o5, %o1 + %o3))
+	 EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2)
 
 	.align		64
 70: /* 16 < len <= 64 */
@@ -326,26 +407,26 @@
 	andn		%o2, 0xf, GLOBAL_SPARE
 	and		%o2, 0xf, %o2
 1:	subcc		GLOBAL_SPARE, 0x10, GLOBAL_SPARE
-	EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
-	EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
-	EX_ST(STORE(stx, %o5, %o1 + %o3))
+	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U3_retl_o2_plus_GS_plus_0x10)
+	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U3_retl_o2_plus_GS_plus_0x10)
+	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x10)
 	add		%o1, 0x8, %o1
-	EX_ST(STORE(stx, %g1, %o1 + %o3))
+	EX_ST(STORE(stx, %g1, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x08)
 	bgu,pt		%XCC, 1b
 	 add		%o1, 0x8, %o1
 73:	andcc		%o2, 0x8, %g0
 	be,pt		%XCC, 1f
 	 nop
 	sub		%o2, 0x8, %o2
-	EX_LD(LOAD(ldx, %o1, %o5))
-	EX_ST(STORE(stx, %o5, %o1 + %o3))
+	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2_plus_8)
+	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_8)
 	add		%o1, 0x8, %o1
 1:	andcc		%o2, 0x4, %g0
 	be,pt		%XCC, 1f
 	 nop
 	sub		%o2, 0x4, %o2
-	EX_LD(LOAD(lduw, %o1, %o5))
-	EX_ST(STORE(stw, %o5, %o1 + %o3))
+	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2_plus_4)
+	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2_plus_4)
 	add		%o1, 0x4, %o1
 1:	cmp		%o2, 0
 	be,pt		%XCC, 85f
@@ -361,8 +442,8 @@
 	sub		%o2, %g1, %o2
 
 1:	subcc		%g1, 1, %g1
-	EX_LD(LOAD(ldub, %o1, %o5))
-	EX_ST(STORE(stb, %o5, %o1 + %o3))
+	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2_plus_g1_plus_1)
+	EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2_plus_g1_plus_1)
 	bgu,pt		%icc, 1b
 	 add		%o1, 1, %o1
 
@@ -378,16 +459,16 @@
 
 8:	mov		64, %o3
 	andn		%o1, 0x7, %o1
-	EX_LD(LOAD(ldx, %o1, %g2))
+	EX_LD(LOAD(ldx, %o1, %g2), U3_retl_o2)
 	sub		%o3, %g1, %o3
 	andn		%o2, 0x7, GLOBAL_SPARE
 	sllx		%g2, %g1, %g2
-1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
+1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U3_retl_o2_and_7_plus_GS)
 	subcc		GLOBAL_SPARE, 0x8, GLOBAL_SPARE
 	add		%o1, 0x8, %o1
 	srlx		%g3, %o3, %o5
 	or		%o5, %g2, %o5
-	EX_ST(STORE(stx, %o5, %o0))
+	EX_ST(STORE(stx, %o5, %o0), U3_retl_o2_and_7_plus_GS_plus_8)
 	add		%o0, 0x8, %o0
 	bgu,pt		%icc, 1b
 	 sllx		%g3, %g1, %g2
@@ -407,8 +488,8 @@
 
 1:
 	subcc		%o2, 4, %o2
-	EX_LD(LOAD(lduw, %o1, %g1))
-	EX_ST(STORE(stw, %g1, %o1 + %o3))
+	EX_LD(LOAD(lduw, %o1, %g1), U3_retl_o2_plus_4)
+	EX_ST(STORE(stw, %g1, %o1 + %o3), U3_retl_o2_plus_4)
 	bgu,pt		%XCC, 1b
 	 add		%o1, 4, %o1
 
@@ -418,8 +499,8 @@
 	.align		32
 90:
 	subcc		%o2, 1, %o2
-	EX_LD(LOAD(ldub, %o1, %g1))
-	EX_ST(STORE(stb, %g1, %o1 + %o3))
+	EX_LD(LOAD(ldub, %o1, %g1), U3_retl_o2_plus_1)
+	EX_ST(STORE(stb, %g1, %o1 + %o3), U3_retl_o2_plus_1)
 	bgu,pt		%XCC, 90b
 	 add		%o1, 1, %o1
 	retl
diff --git a/arch/sparc/lib/copy_in_user.S b/arch/sparc/lib/copy_in_user.S
index 482de09..0252b21 100644
--- a/arch/sparc/lib/copy_in_user.S
+++ b/arch/sparc/lib/copy_in_user.S
@@ -9,18 +9,33 @@
 
 #define XCC xcc
 
-#define EX(x,y)			\
+#define EX(x,y,z)		\
 98:	x,y;			\
 	.section __ex_table,"a";\
 	.align 4;		\
-	.word 98b, __retl_one;	\
+	.word 98b, z;		\
 	.text;			\
 	.align 4;
 
+#define EX_O4(x,y) EX(x,y,__retl_o4_plus_8)
+#define EX_O2_4(x,y) EX(x,y,__retl_o2_plus_4)
+#define EX_O2_1(x,y) EX(x,y,__retl_o2_plus_1)
+
 	.register	%g2,#scratch
 	.register	%g3,#scratch
 
 	.text
+__retl_o4_plus_8:
+	add	%o4, %o2, %o4
+	retl
+	 add	%o4, 8, %o0
+__retl_o2_plus_4:
+	retl
+	 add	%o2, 4, %o0
+__retl_o2_plus_1:
+	retl
+	 add	%o2, 1, %o0
+
 	.align	32
 
 	/* Don't try to get too fancy here, just nice and
@@ -45,8 +60,8 @@
 	andn		%o2, 0x7, %o4
 	and		%o2, 0x7, %o2
 1:	subcc		%o4, 0x8, %o4
-	EX(ldxa [%o1] %asi, %o5)
-	EX(stxa %o5, [%o0] %asi)
+	EX_O4(ldxa [%o1] %asi, %o5)
+	EX_O4(stxa %o5, [%o0] %asi)
 	add		%o1, 0x8, %o1
 	bgu,pt		%XCC, 1b
 	 add		%o0, 0x8, %o0
@@ -54,8 +69,8 @@
 	be,pt		%XCC, 1f
 	 nop
 	sub		%o2, 0x4, %o2
-	EX(lduwa [%o1] %asi, %o5)
-	EX(stwa %o5, [%o0] %asi)
+	EX_O2_4(lduwa [%o1] %asi, %o5)
+	EX_O2_4(stwa %o5, [%o0] %asi)
 	add		%o1, 0x4, %o1
 	add		%o0, 0x4, %o0
 1:	cmp		%o2, 0
@@ -71,8 +86,8 @@
 
 82:
 	subcc		%o2, 4, %o2
-	EX(lduwa [%o1] %asi, %g1)
-	EX(stwa %g1, [%o0] %asi)
+	EX_O2_4(lduwa [%o1] %asi, %g1)
+	EX_O2_4(stwa %g1, [%o0] %asi)
 	add		%o1, 4, %o1
 	bgu,pt		%XCC, 82b
 	 add		%o0, 4, %o0
@@ -83,8 +98,8 @@
 	.align	32
 90:
 	subcc		%o2, 1, %o2
-	EX(lduba [%o1] %asi, %g1)
-	EX(stba %g1, [%o0] %asi)
+	EX_O2_1(lduba [%o1] %asi, %g1)
+	EX_O2_1(stba %g1, [%o0] %asi)
 	add		%o1, 1, %o1
 	bgu,pt		%XCC, 90b
 	 add		%o0, 1, %o0
diff --git a/arch/sparc/lib/user_fixup.c b/arch/sparc/lib/user_fixup.c
deleted file mode 100644
index ac96ae2..0000000
--- a/arch/sparc/lib/user_fixup.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/* user_fixup.c: Fix up user copy faults.
- *
- * Copyright (C) 2004 David S. Miller <davem@redhat.com>
- */
-
-#include <linux/compiler.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/module.h>
-
-#include <asm/uaccess.h>
-
-/* Calculating the exact fault address when using
- * block loads and stores can be very complicated.
- *
- * Instead of trying to be clever and handling all
- * of the cases, just fix things up simply here.
- */
-
-static unsigned long compute_size(unsigned long start, unsigned long size, unsigned long *offset)
-{
-	unsigned long fault_addr = current_thread_info()->fault_address;
-	unsigned long end = start + size;
-
-	if (fault_addr < start || fault_addr >= end) {
-		*offset = 0;
-	} else {
-		*offset = fault_addr - start;
-		size = end - fault_addr;
-	}
-	return size;
-}
-
-unsigned long copy_from_user_fixup(void *to, const void __user *from, unsigned long size)
-{
-	unsigned long offset;
-
-	size = compute_size((unsigned long) from, size, &offset);
-	if (likely(size))
-		memset(to + offset, 0, size);
-
-	return size;
-}
-EXPORT_SYMBOL(copy_from_user_fixup);
-
-unsigned long copy_to_user_fixup(void __user *to, const void *from, unsigned long size)
-{
-	unsigned long offset;
-
-	return compute_size((unsigned long) to, size, &offset);
-}
-EXPORT_SYMBOL(copy_to_user_fixup);
-
-unsigned long copy_in_user_fixup(void __user *to, void __user *from, unsigned long size)
-{
-	unsigned long fault_addr = current_thread_info()->fault_address;
-	unsigned long start = (unsigned long) to;
-	unsigned long end = start + size;
-
-	if (fault_addr >= start && fault_addr < end)
-		return end - fault_addr;
-
-	start = (unsigned long) from;
-	end = start + size;
-	if (fault_addr >= start && fault_addr < end)
-		return end - fault_addr;
-
-	return size;
-}
-EXPORT_SYMBOL(copy_in_user_fixup);
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index 4e06750..cd0e32b 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -238,7 +238,8 @@
 		pages += nr;
 
 		ret = get_user_pages_unlocked(start,
-			(end - start) >> PAGE_SHIFT, write, 0, pages);
+			(end - start) >> PAGE_SHIFT, pages,
+			write ? FOLL_WRITE : 0);
 
 		/* Have to be a bit careful with return values */
 		if (nr > 0) {
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 439784b..37aa537 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -802,8 +802,10 @@
 };
 static struct mdesc_mblock *mblocks;
 static int num_mblocks;
+static int find_numa_node_for_addr(unsigned long pa,
+				   struct node_mem_mask *pnode_mask);
 
-static unsigned long ra_to_pa(unsigned long addr)
+static unsigned long __init ra_to_pa(unsigned long addr)
 {
 	int i;
 
@@ -819,8 +821,11 @@
 	return addr;
 }
 
-static int find_node(unsigned long addr)
+static int __init find_node(unsigned long addr)
 {
+	static bool search_mdesc = true;
+	static struct node_mem_mask last_mem_mask = { ~0UL, ~0UL };
+	static int last_index;
 	int i;
 
 	addr = ra_to_pa(addr);
@@ -830,13 +835,30 @@
 		if ((addr & p->mask) == p->val)
 			return i;
 	}
-	/* The following condition has been observed on LDOM guests.*/
-	WARN_ONCE(1, "find_node: A physical address doesn't match a NUMA node"
-		" rule. Some physical memory will be owned by node 0.");
-	return 0;
+	/* The following condition has been observed on LDOM guests because
+	 * node_masks only contains the best latency mask and value.
+	 * LDOM guest's mdesc can contain a single latency group to
+	 * cover multiple address range. Print warning message only if the
+	 * address cannot be found in node_masks nor mdesc.
+	 */
+	if ((search_mdesc) &&
+	    ((addr & last_mem_mask.mask) != last_mem_mask.val)) {
+		/* find the available node in the mdesc */
+		last_index = find_numa_node_for_addr(addr, &last_mem_mask);
+		numadbg("find_node: latency group for address 0x%lx is %d\n",
+			addr, last_index);
+		if ((last_index < 0) || (last_index >= num_node_masks)) {
+			/* WARN_ONCE() and use default group 0 */
+			WARN_ONCE(1, "find_node: A physical address doesn't match a NUMA node rule. Some physical memory will be owned by node 0.");
+			search_mdesc = false;
+			last_index = 0;
+		}
+	}
+
+	return last_index;
 }
 
-static u64 memblock_nid_range(u64 start, u64 end, int *nid)
+static u64 __init memblock_nid_range(u64 start, u64 end, int *nid)
 {
 	*nid = find_node(start);
 	start += PAGE_SIZE;
@@ -1160,6 +1182,41 @@
 	return numa_latency[from][to];
 }
 
+static int find_numa_node_for_addr(unsigned long pa,
+				   struct node_mem_mask *pnode_mask)
+{
+	struct mdesc_handle *md = mdesc_grab();
+	u64 node, arc;
+	int i = 0;
+
+	node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups");
+	if (node == MDESC_NODE_NULL)
+		goto out;
+
+	mdesc_for_each_node_by_name(md, node, "group") {
+		mdesc_for_each_arc(arc, md, node, MDESC_ARC_TYPE_FWD) {
+			u64 target = mdesc_arc_target(md, arc);
+			struct mdesc_mlgroup *m = find_mlgroup(target);
+
+			if (!m)
+				continue;
+			if ((pa & m->mask) == m->match) {
+				if (pnode_mask) {
+					pnode_mask->mask = m->mask;
+					pnode_mask->val = m->match;
+				}
+				mdesc_release(md);
+				return i;
+			}
+		}
+		i++;
+	}
+
+out:
+	mdesc_release(md);
+	return -1;
+}
+
 static int __init find_best_numa_node_for_mlgroup(struct mdesc_mlgroup *grp)
 {
 	int i;
diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c
index f2b7711..e20fbba 100644
--- a/arch/sparc/mm/tsb.c
+++ b/arch/sparc/mm/tsb.c
@@ -27,6 +27,20 @@
 	return (tag == (vaddr >> 22));
 }
 
+static void flush_tsb_kernel_range_scan(unsigned long start, unsigned long end)
+{
+	unsigned long idx;
+
+	for (idx = 0; idx < KERNEL_TSB_NENTRIES; idx++) {
+		struct tsb *ent = &swapper_tsb[idx];
+		unsigned long match = idx << 13;
+
+		match |= (ent->tag << 22);
+		if (match >= start && match < end)
+			ent->tag = (1UL << TSB_TAG_INVALID_BIT);
+	}
+}
+
 /* TSB flushes need only occur on the processor initiating the address
  * space modification, not on each cpu the address space has run on.
  * Only the TLB flush needs that treatment.
@@ -36,6 +50,9 @@
 {
 	unsigned long v;
 
+	if ((end - start) >> PAGE_SHIFT >= 2 * KERNEL_TSB_NENTRIES)
+		return flush_tsb_kernel_range_scan(start, end);
+
 	for (v = start; v < end; v += PAGE_SIZE) {
 		unsigned long hash = tsb_hash(v, PAGE_SHIFT,
 					      KERNEL_TSB_NENTRIES);
diff --git a/arch/sparc/mm/ultra.S b/arch/sparc/mm/ultra.S
index b4f4733..5d2fd6c 100644
--- a/arch/sparc/mm/ultra.S
+++ b/arch/sparc/mm/ultra.S
@@ -30,7 +30,7 @@
 	.text
 	.align		32
 	.globl		__flush_tlb_mm
-__flush_tlb_mm:		/* 18 insns */
+__flush_tlb_mm:		/* 19 insns */
 	/* %o0=(ctx & TAG_CONTEXT_BITS), %o1=SECONDARY_CONTEXT */
 	ldxa		[%o1] ASI_DMMU, %g2
 	cmp		%g2, %o0
@@ -81,7 +81,7 @@
 
 	.align		32
 	.globl		__flush_tlb_pending
-__flush_tlb_pending:	/* 26 insns */
+__flush_tlb_pending:	/* 27 insns */
 	/* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
 	rdpr		%pstate, %g7
 	sllx		%o1, 3, %o1
@@ -113,12 +113,14 @@
 
 	.align		32
 	.globl		__flush_tlb_kernel_range
-__flush_tlb_kernel_range:	/* 16 insns */
+__flush_tlb_kernel_range:	/* 31 insns */
 	/* %o0=start, %o1=end */
 	cmp		%o0, %o1
 	be,pn		%xcc, 2f
+	 sub		%o1, %o0, %o3
+	srlx		%o3, 18, %o4
+	brnz,pn		%o4, __spitfire_flush_tlb_kernel_range_slow
 	 sethi		%hi(PAGE_SIZE), %o4
-	sub		%o1, %o0, %o3
 	sub		%o3, %o4, %o3
 	or		%o0, 0x20, %o0		! Nucleus
 1:	stxa		%g0, [%o0 + %o3] ASI_DMMU_DEMAP
@@ -131,6 +133,41 @@
 	retl
 	 nop
 	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+
+__spitfire_flush_tlb_kernel_range_slow:
+	mov		63 * 8, %o4
+1:	ldxa		[%o4] ASI_ITLB_DATA_ACCESS, %o3
+	andcc		%o3, 0x40, %g0			/* _PAGE_L_4U */
+	bne,pn		%xcc, 2f
+	 mov		TLB_TAG_ACCESS, %o3
+	stxa		%g0, [%o3] ASI_IMMU
+	stxa		%g0, [%o4] ASI_ITLB_DATA_ACCESS
+	membar		#Sync
+2:	ldxa		[%o4] ASI_DTLB_DATA_ACCESS, %o3
+	andcc		%o3, 0x40, %g0
+	bne,pn		%xcc, 2f
+	 mov		TLB_TAG_ACCESS, %o3
+	stxa		%g0, [%o3] ASI_DMMU
+	stxa		%g0, [%o4] ASI_DTLB_DATA_ACCESS
+	membar		#Sync
+2:	sub		%o4, 8, %o4
+	brgez,pt	%o4, 1b
+	 nop
+	retl
+	 nop
 
 __spitfire_flush_tlb_mm_slow:
 	rdpr		%pstate, %g1
@@ -285,6 +322,40 @@
 	retl
 	 wrpr		%g7, 0x0, %pstate
 
+__cheetah_flush_tlb_kernel_range:	/* 31 insns */
+	/* %o0=start, %o1=end */
+	cmp		%o0, %o1
+	be,pn		%xcc, 2f
+	 sub		%o1, %o0, %o3
+	srlx		%o3, 18, %o4
+	brnz,pn		%o4, 3f
+	 sethi		%hi(PAGE_SIZE), %o4
+	sub		%o3, %o4, %o3
+	or		%o0, 0x20, %o0		! Nucleus
+1:	stxa		%g0, [%o0 + %o3] ASI_DMMU_DEMAP
+	stxa		%g0, [%o0 + %o3] ASI_IMMU_DEMAP
+	membar		#Sync
+	brnz,pt		%o3, 1b
+	 sub		%o3, %o4, %o3
+2:	sethi		%hi(KERNBASE), %o3
+	flush		%o3
+	retl
+	 nop
+3:	mov		0x80, %o4
+	stxa		%g0, [%o4] ASI_DMMU_DEMAP
+	membar		#Sync
+	stxa		%g0, [%o4] ASI_IMMU_DEMAP
+	membar		#Sync
+	retl
+	 nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+
 #ifdef DCACHE_ALIASING_POSSIBLE
 __cheetah_flush_dcache_page: /* 11 insns */
 	sethi		%hi(PAGE_OFFSET), %g1
@@ -309,19 +380,28 @@
 	ret
 	 restore
 
-__hypervisor_flush_tlb_mm: /* 10 insns */
+__hypervisor_flush_tlb_mm: /* 19 insns */
 	mov		%o0, %o2	/* ARG2: mmu context */
 	mov		0, %o0		/* ARG0: CPU lists unimplemented */
 	mov		0, %o1		/* ARG1: CPU lists unimplemented */
 	mov		HV_MMU_ALL, %o3	/* ARG3: flags */
 	mov		HV_FAST_MMU_DEMAP_CTX, %o5
 	ta		HV_FAST_TRAP
-	brnz,pn		%o0, __hypervisor_tlb_tl0_error
+	brnz,pn		%o0, 1f
 	 mov		HV_FAST_MMU_DEMAP_CTX, %o1
 	retl
 	 nop
+1:	sethi		%hi(__hypervisor_tlb_tl0_error), %o5
+	jmpl		%o5 + %lo(__hypervisor_tlb_tl0_error), %g0
+	 nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
 
-__hypervisor_flush_tlb_page: /* 11 insns */
+__hypervisor_flush_tlb_page: /* 22 insns */
 	/* %o0 = context, %o1 = vaddr */
 	mov		%o0, %g2
 	mov		%o1, %o0              /* ARG0: vaddr + IMMU-bit */
@@ -330,12 +410,23 @@
 	srlx		%o0, PAGE_SHIFT, %o0
 	sllx		%o0, PAGE_SHIFT, %o0
 	ta		HV_MMU_UNMAP_ADDR_TRAP
-	brnz,pn		%o0, __hypervisor_tlb_tl0_error
+	brnz,pn		%o0, 1f
 	 mov		HV_MMU_UNMAP_ADDR_TRAP, %o1
 	retl
 	 nop
+1:	sethi		%hi(__hypervisor_tlb_tl0_error), %o2
+	jmpl		%o2 + %lo(__hypervisor_tlb_tl0_error), %g0
+	 nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
 
-__hypervisor_flush_tlb_pending: /* 16 insns */
+__hypervisor_flush_tlb_pending: /* 27 insns */
 	/* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
 	sllx		%o1, 3, %g1
 	mov		%o2, %g2
@@ -347,31 +438,57 @@
 	srlx		%o0, PAGE_SHIFT, %o0
 	sllx		%o0, PAGE_SHIFT, %o0
 	ta		HV_MMU_UNMAP_ADDR_TRAP
-	brnz,pn		%o0, __hypervisor_tlb_tl0_error
+	brnz,pn		%o0, 1f
 	 mov		HV_MMU_UNMAP_ADDR_TRAP, %o1
 	brnz,pt		%g1, 1b
 	 nop
 	retl
 	 nop
+1:	sethi		%hi(__hypervisor_tlb_tl0_error), %o2
+	jmpl		%o2 + %lo(__hypervisor_tlb_tl0_error), %g0
+	 nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
 
-__hypervisor_flush_tlb_kernel_range: /* 16 insns */
+__hypervisor_flush_tlb_kernel_range: /* 31 insns */
 	/* %o0=start, %o1=end */
 	cmp		%o0, %o1
 	be,pn		%xcc, 2f
-	 sethi		%hi(PAGE_SIZE), %g3
-	mov		%o0, %g1
-	sub		%o1, %g1, %g2
+	 sub		%o1, %o0, %g2
+	srlx		%g2, 18, %g3
+	brnz,pn		%g3, 4f
+	 mov		%o0, %g1
+	sethi		%hi(PAGE_SIZE), %g3
 	sub		%g2, %g3, %g2
 1:	add		%g1, %g2, %o0	/* ARG0: virtual address */
 	mov		0, %o1		/* ARG1: mmu context */
 	mov		HV_MMU_ALL, %o2	/* ARG2: flags */
 	ta		HV_MMU_UNMAP_ADDR_TRAP
-	brnz,pn		%o0, __hypervisor_tlb_tl0_error
+	brnz,pn		%o0, 3f
 	 mov		HV_MMU_UNMAP_ADDR_TRAP, %o1
 	brnz,pt		%g2, 1b
 	 sub		%g2, %g3, %g2
 2:	retl
 	 nop
+3:	sethi		%hi(__hypervisor_tlb_tl0_error), %o2
+	jmpl		%o2 + %lo(__hypervisor_tlb_tl0_error), %g0
+	 nop
+4:	mov		0, %o0		/* ARG0: CPU lists unimplemented */
+	mov		0, %o1		/* ARG1: CPU lists unimplemented */
+	mov		0, %o2		/* ARG2: mmu context == nucleus */
+	mov		HV_MMU_ALL, %o3	/* ARG3: flags */
+	mov		HV_FAST_MMU_DEMAP_CTX, %o5
+	ta		HV_FAST_TRAP
+	brnz,pn		%o0, 3b
+	 mov		HV_FAST_MMU_DEMAP_CTX, %o1
+	retl
+	 nop
 
 #ifdef DCACHE_ALIASING_POSSIBLE
 	/* XXX Niagara and friends have an 8K cache, so no aliasing is
@@ -394,43 +511,6 @@
 	retl
 	 nop
 
-	.globl		cheetah_patch_cachetlbops
-cheetah_patch_cachetlbops:
-	save		%sp, -128, %sp
-
-	sethi		%hi(__flush_tlb_mm), %o0
-	or		%o0, %lo(__flush_tlb_mm), %o0
-	sethi		%hi(__cheetah_flush_tlb_mm), %o1
-	or		%o1, %lo(__cheetah_flush_tlb_mm), %o1
-	call		tlb_patch_one
-	 mov		19, %o2
-
-	sethi		%hi(__flush_tlb_page), %o0
-	or		%o0, %lo(__flush_tlb_page), %o0
-	sethi		%hi(__cheetah_flush_tlb_page), %o1
-	or		%o1, %lo(__cheetah_flush_tlb_page), %o1
-	call		tlb_patch_one
-	 mov		22, %o2
-
-	sethi		%hi(__flush_tlb_pending), %o0
-	or		%o0, %lo(__flush_tlb_pending), %o0
-	sethi		%hi(__cheetah_flush_tlb_pending), %o1
-	or		%o1, %lo(__cheetah_flush_tlb_pending), %o1
-	call		tlb_patch_one
-	 mov		27, %o2
-
-#ifdef DCACHE_ALIASING_POSSIBLE
-	sethi		%hi(__flush_dcache_page), %o0
-	or		%o0, %lo(__flush_dcache_page), %o0
-	sethi		%hi(__cheetah_flush_dcache_page), %o1
-	or		%o1, %lo(__cheetah_flush_dcache_page), %o1
-	call		tlb_patch_one
-	 mov		11, %o2
-#endif /* DCACHE_ALIASING_POSSIBLE */
-
-	ret
-	 restore
-
 #ifdef CONFIG_SMP
 	/* These are all called by the slaves of a cross call, at
 	 * trap level 1, with interrupts fully disabled.
@@ -447,7 +527,7 @@
 	 */
 	.align		32
 	.globl		xcall_flush_tlb_mm
-xcall_flush_tlb_mm:	/* 21 insns */
+xcall_flush_tlb_mm:	/* 24 insns */
 	mov		PRIMARY_CONTEXT, %g2
 	ldxa		[%g2] ASI_DMMU, %g3
 	srlx		%g3, CTX_PGSZ1_NUC_SHIFT, %g4
@@ -469,9 +549,12 @@
 	nop
 	nop
 	nop
+	nop
+	nop
+	nop
 
 	.globl		xcall_flush_tlb_page
-xcall_flush_tlb_page:	/* 17 insns */
+xcall_flush_tlb_page:	/* 20 insns */
 	/* %g5=context, %g1=vaddr */
 	mov		PRIMARY_CONTEXT, %g4
 	ldxa		[%g4] ASI_DMMU, %g2
@@ -490,15 +573,20 @@
 	retry
 	nop
 	nop
+	nop
+	nop
+	nop
 
 	.globl		xcall_flush_tlb_kernel_range
-xcall_flush_tlb_kernel_range:	/* 25 insns */
+xcall_flush_tlb_kernel_range:	/* 44 insns */
 	sethi		%hi(PAGE_SIZE - 1), %g2
 	or		%g2, %lo(PAGE_SIZE - 1), %g2
 	andn		%g1, %g2, %g1
 	andn		%g7, %g2, %g7
 	sub		%g7, %g1, %g3
-	add		%g2, 1, %g2
+	srlx		%g3, 18, %g2
+	brnz,pn		%g2, 2f
+	 add		%g2, 1, %g2
 	sub		%g3, %g2, %g3
 	or		%g1, 0x20, %g1		! Nucleus
 1:	stxa		%g0, [%g1 + %g3] ASI_DMMU_DEMAP
@@ -507,8 +595,25 @@
 	brnz,pt		%g3, 1b
 	 sub		%g3, %g2, %g3
 	retry
-	nop
-	nop
+2:	mov		63 * 8, %g1
+1:	ldxa		[%g1] ASI_ITLB_DATA_ACCESS, %g2
+	andcc		%g2, 0x40, %g0			/* _PAGE_L_4U */
+	bne,pn		%xcc, 2f
+	 mov		TLB_TAG_ACCESS, %g2
+	stxa		%g0, [%g2] ASI_IMMU
+	stxa		%g0, [%g1] ASI_ITLB_DATA_ACCESS
+	membar		#Sync
+2:	ldxa		[%g1] ASI_DTLB_DATA_ACCESS, %g2
+	andcc		%g2, 0x40, %g0
+	bne,pn		%xcc, 2f
+	 mov		TLB_TAG_ACCESS, %g2
+	stxa		%g0, [%g2] ASI_DMMU
+	stxa		%g0, [%g1] ASI_DTLB_DATA_ACCESS