Merge tag 'mmc-v5.4-2' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc

Pull more MMC updates from Ulf Hansson:
 "A couple more updates/fixes for MMC:

   - sdhci-pci: Add Genesys Logic GL975x support

   - sdhci-tegra: Recover loss in throughput for DMA

   - sdhci-of-esdhc: Fix DMA bug"

* tag 'mmc-v5.4-2' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc:
  mmc: host: sdhci-pci: Add Genesys Logic GL975x support
  mmc: tegra: Implement ->set_dma_mask()
  mmc: sdhci: Let drivers define their DMA mask
  mmc: sdhci-of-esdhc: set DMA snooping based on DMA coherence
  mmc: sdhci: improve ADMA error reporting
diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy
index fc376a3..29ebe9a 100644
--- a/Documentation/ABI/testing/ima_policy
+++ b/Documentation/ABI/testing/ima_policy
@@ -37,7 +37,7 @@
 			euid:= decimal value
 			fowner:= decimal value
 		lsm:  	are LSM specific
-		option:	appraise_type:= [imasig]
+		option:	appraise_type:= [imasig] [imasig|modsig]
 			template:= name of a defined IMA template type
 			(eg, ima-ng). Only valid when action is "measure".
 			pcr:= decimal value
@@ -105,3 +105,7 @@
 
 			measure func=KEXEC_KERNEL_CHECK pcr=4
 			measure func=KEXEC_INITRAMFS_CHECK pcr=5
+
+		Example of appraise rule allowing modsig appended signatures:
+
+			appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig
diff --git a/Documentation/ABI/testing/sysfs-class-backlight b/Documentation/ABI/testing/sysfs-class-backlight
new file mode 100644
index 0000000..3ab175a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-backlight
@@ -0,0 +1,26 @@
+What:		/sys/class/backlight/<backlight>/scale
+Date:		July 2019
+KernelVersion:	5.4
+Contact:	Daniel Thompson <daniel.thompson@linaro.org>
+Description:
+		Description of the scale of the brightness curve.
+
+		The human eye senses brightness approximately logarithmically,
+		hence linear changes in brightness are perceived as being
+		non-linear. To achieve a linear perception of brightness changes,
+		controls like sliders need to apply a logarithmic mapping for
+		backlights with a linear brightness curve.
+
+		Possible values of the attribute are:
+
+		unknown
+		  The scale of the brightness curve is unknown.
+
+		linear
+		  The brightness changes linearly with each step. Brightness
+		  controls should apply a logarithmic mapping for a linear
+		  perception.
+
+		non-linear
+		  The brightness changes non-linearly with each step. Brightness
+		  controls should use a linear mapping for a linear perception.
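+
+		For example, a userspace brightness control can read this
+		attribute to decide which mapping to apply (the output shown
+		is for an illustrative backlight with a linear curve):
+
+		  $ cat /sys/class/backlight/<backlight>/scale
+		  linear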
diff --git a/Documentation/ABI/testing/sysfs-class-watchdog b/Documentation/ABI/testing/sysfs-class-watchdog
index 6317ade..675f9b5 100644
--- a/Documentation/ABI/testing/sysfs-class-watchdog
+++ b/Documentation/ABI/testing/sysfs-class-watchdog
@@ -72,3 +72,37 @@
 		It is a read/write file. When read, the currently assigned
 		pretimeout governor is returned.  When written, it sets
 		the pretimeout governor.
+
+What:		/sys/class/watchdog/watchdog1/access_cs0
+Date:		August 2019
+Contact:	Ivan Mikhaylov <i.mikhaylov@yadro.com>,
+		Alexander Amelkin <a.amelkin@yadro.com>
+Description:
+		It is a read/write file. This attribute exists only if the
+		system has booted from the alternate flash chip due to
+		expiration of a watchdog timer of AST2400/AST2500 when
+		alternate boot function was enabled with 'aspeed,alt-boot'
+		devicetree option for that watchdog or with an appropriate
+		h/w strapping (for WDT2 only).
+
+		When booted from the alternate flash, the 'access_cs0'
+		sysfs node provides:
+			ast2400: a way to get access to the primary SPI flash
+				chip at CS0 after booting from the alternate
+				chip at CS1.
+			ast2500: a way to restore the normal address mapping
+				from (CS0->CS1, CS1->CS0) to (CS0->CS0,
+				CS1->CS1).
+
+		Clearing the boot code selection and timeout counter also
+		resets the chip select line mapping to its initial state. When
+		the SoC is in the normal mapping state (i.e. booted from CS0),
+		clearing those bits has no effect on either version of the SoC.
+		For alternate boot mode (booted from CS1 due to wdt2
+		expiration) the behavior differs as described above.
+
+		This option can be used with wdt2 (watchdog1) only.
+
+		When read, the current status of the boot code selection is
+		shown. When written with any non-zero value, it clears
+		the boot code selection and the timeout counter, which results
+		in chipselect reset for AST2400/AST2500.
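+
+		For example, the boot code selection and timeout counter can
+		be cleared from userspace with:
+
+		  echo 1 > /sys/class/watchdog/watchdog1/access_cs0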
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index 29601d9..ed35833 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -429,10 +429,15 @@
 Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
 		Christoph Lameter <cl@linux-foundation.org>
 Description:
-		The shrink file is written when memory should be reclaimed from
-		a cache.  Empty partial slabs are freed and the partial list is
-		sorted so the slabs with the fewest available objects are used
-		first.
+		The shrink file is used to reclaim unused slab cache
+		memory from a cache.  Empty per-cpu or partial slabs
+		are freed and the partial list is sorted so the slabs
+		with the fewest available objects are used first.
+		It accepts only a value of "1" on write for shrinking
+		the cache; other input values are considered invalid.
+		Shrinking slab caches might be expensive and can
+		adversely impact other running applications, so it
+		should be used with care.
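+		For example, assuming a cache named "dentry" exists under
+		/sys/kernel/slab, it can be shrunk with:
+
+		  echo 1 > /sys/kernel/slab/dentry/shrink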
 
 What:		/sys/kernel/slab/cache/slab_size
 Date:		May 2007
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 41bdc03..0ae4f56 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -85,8 +85,10 @@
  memory.oom_control		     set/show oom controls.
  memory.numa_stat		     show the number of memory usage per numa
 				     node
-
  memory.kmem.limit_in_bytes          set/show hard limit for kernel memory
+                                     This knob is deprecated and shouldn't be
+                                     used. It is planned to be removed in the
+                                     foreseeable future.
  memory.kmem.usage_in_bytes          show current kernel memory allocation
  memory.kmem.failcnt                 show the number of kernel memory usage
 				     hits limits
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index d381478..c7ac2f3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -809,6 +809,8 @@
 			enables the feature at boot time. By default, it is
 			disabled and the system will work mostly the same as a
 			kernel built without CONFIG_DEBUG_PAGEALLOC.
+			Note: to get the most out of debug_pagealloc error
+			reports, it is useful to also enable the page_owner
+			functionality.
 			on: enable the feature
 
 	debugpat	[X86] Enable PAT debugging
@@ -2274,6 +2276,15 @@
 	lockd.nlm_udpport=M	[NFS] Assign UDP port.
 			Format: <integer>
 
+	lockdown=	[SECURITY]
+			{ integrity | confidentiality }
+			Enable the kernel lockdown feature. If set to
+			integrity, kernel features that allow userland to
+			modify the running kernel are disabled. If set to
+			confidentiality, kernel features that allow userland
+			to extract confidential information from the kernel
+			are also disabled.
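+			For example, booting with lockdown=integrity blocks
+			interfaces such as access to /dev/mem and the loading
+			of unsigned kernel modules, which could otherwise be
+			used to modify the running kernel.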
+
 	locktorture.nreaders_stress= [KNL]
 			Set the number of locking read-acquisition kthreads.
 			Defaults to being automatically set based on the
@@ -3465,12 +3476,13 @@
 				specify the device is described above.
 				If <order of align> is not specified,
 				PAGE_SIZE is used as alignment.
-				PCI-PCI bridge can be specified, if resource
+				A PCI-PCI bridge can be specified if resource
 				windows need to be expanded.
 				To specify the alignment for several
 				instances of a device, the PCI vendor,
 				device, subvendor, and subdevice may be
-				specified, e.g., 4096@pci:8086:9c22:103c:198f
+				specified, e.g., 12@pci:8086:9c22:103c:198f
+				for 4096-byte alignment.
 		ecrc=		Enable/disable PCIe ECRC (transaction layer
 				end-to-end CRC checking).
 				bios: Use BIOS/firmware settings. This is the
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 08af5ca..f77de49 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -42,6 +42,9 @@
 .. kernel-doc:: lib/string.c
    :export:
 
+.. kernel-doc:: include/linux/string.h
+   :internal:
+
 .. kernel-doc:: mm/util.c
    :functions: kstrdup kstrdup_const kstrndup kmemdup kmemdup_nul memdup_user
                vmemdup_user strndup_user memdup_user_nul
diff --git a/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt b/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt
index e9de375..c9a6587 100644
--- a/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt
+++ b/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt
@@ -1,7 +1,9 @@
 Broadcom BCM2835 I2C controller
 
 Required properties:
-- compatible : Should be "brcm,bcm2835-i2c".
+- compatible : Should be one of:
+	"brcm,bcm2711-i2c"
+	"brcm,bcm2835-i2c"
 - reg: Should contain register location and length.
 - interrupts: Should contain interrupt.
 - clocks : The clock feeding the I2C controller.
diff --git a/Documentation/devicetree/bindings/i2c/i2c-rcar.txt b/Documentation/devicetree/bindings/i2c/renesas,i2c.txt
similarity index 100%
rename from Documentation/devicetree/bindings/i2c/i2c-rcar.txt
rename to Documentation/devicetree/bindings/i2c/renesas,i2c.txt
diff --git a/Documentation/devicetree/bindings/i2c/i2c-emev2.txt b/Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt
similarity index 100%
rename from Documentation/devicetree/bindings/i2c/i2c-emev2.txt
rename to Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt
diff --git a/Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt b/Documentation/devicetree/bindings/i2c/renesas,iic.txt
similarity index 100%
rename from Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt
rename to Documentation/devicetree/bindings/i2c/renesas,iic.txt
diff --git a/Documentation/devicetree/bindings/i2c/i2c-riic.txt b/Documentation/devicetree/bindings/i2c/renesas,riic.txt
similarity index 100%
rename from Documentation/devicetree/bindings/i2c/i2c-riic.txt
rename to Documentation/devicetree/bindings/i2c/renesas,riic.txt
diff --git a/Documentation/devicetree/bindings/mfd/mt6397.txt b/Documentation/devicetree/bindings/mfd/mt6397.txt
index 0ebd08a..a9b105a 100644
--- a/Documentation/devicetree/bindings/mfd/mt6397.txt
+++ b/Documentation/devicetree/bindings/mfd/mt6397.txt
@@ -8,11 +8,12 @@
 - Clock
 - LED
 - Keys
+- Power controller
 
 It is interfaced to host controller using SPI interface by a proprietary hardware
 called PMIC wrapper or pwrap. MT6397/MT6323 MFD is a child device of pwrap.
 See the following for pwarp node definitions:
-Documentation/devicetree/bindings/soc/mediatek/pwrap.txt
+../soc/mediatek/pwrap.txt
 
 This document describes the binding for MFD device and its sub module.
 
@@ -22,14 +23,16 @@
 Optional subnodes:
 
 - rtc
-	Required properties:
+	Required properties: Should be one of the following
+		- compatible: "mediatek,mt6323-rtc"
 		- compatible: "mediatek,mt6397-rtc"
+	For details, see ../rtc/rtc-mt6397.txt
 - regulators
 	Required properties:
 		- compatible: "mediatek,mt6397-regulator"
-	see Documentation/devicetree/bindings/regulator/mt6397-regulator.txt
+	see ../regulator/mt6397-regulator.txt
 		- compatible: "mediatek,mt6323-regulator"
-	see Documentation/devicetree/bindings/regulator/mt6323-regulator.txt
+	see ../regulator/mt6323-regulator.txt
 - codec
 	Required properties:
 		- compatible: "mediatek,mt6397-codec"
@@ -39,12 +42,17 @@
 - led
 	Required properties:
 		- compatible: "mediatek,mt6323-led"
-	see Documentation/devicetree/bindings/leds/leds-mt6323.txt
+	see ../leds/leds-mt6323.txt
 
 - keys
 	Required properties:
 		- compatible: "mediatek,mt6397-keys" or "mediatek,mt6323-keys"
-	see Documentation/devicetree/bindings/input/mtk-pmic-keys.txt
+	see ../input/mtk-pmic-keys.txt
+
+- power-controller
+	Required properties:
+		- compatible: "mediatek,mt6323-pwrc"
+	For details, see ../power/reset/mt6323-poweroff.txt
 
 Example:
 	pwrap: pwrap@1000f000 {
diff --git a/Documentation/devicetree/bindings/mfd/rn5t618.txt b/Documentation/devicetree/bindings/mfd/rn5t618.txt
index 65c2326..b74e5e9 100644
--- a/Documentation/devicetree/bindings/mfd/rn5t618.txt
+++ b/Documentation/devicetree/bindings/mfd/rn5t618.txt
@@ -14,6 +14,10 @@
 		"ricoh,rc5t619"
  - reg: the I2C slave address of the device
 
+Optional properties:
+ - system-power-controller:
+   See Documentation/devicetree/bindings/power/power-controller.txt
+
 Sub-nodes:
  - regulators: the node is required if the regulator functionality is
    needed. The valid regulator names are: DCDC1, DCDC2, DCDC3, DCDC4
@@ -28,6 +32,7 @@
 	pmic@32 {
 		compatible = "ricoh,rn5t618";
 		reg = <0x32>;
+		system-power-controller;
 
 		regulators {
 			DCDC1 {
diff --git a/Documentation/devicetree/bindings/net/adi,adin.yaml b/Documentation/devicetree/bindings/net/adi,adin.yaml
index 69375cb2..d95cc69 100644
--- a/Documentation/devicetree/bindings/net/adi,adin.yaml
+++ b/Documentation/devicetree/bindings/net/adi,adin.yaml
@@ -36,12 +36,6 @@
     enum: [ 4, 8, 12, 16, 20, 24 ]
     default: 8
 
-  adi,disable-energy-detect:
-    description: |
-      Disables Energy Detect Powerdown Mode (default disabled, i.e energy detect
-      is enabled if this property is unspecified)
-    type: boolean
-
 examples:
   - |
     ethernet {
@@ -68,6 +62,5 @@
             reg = <1>;
 
             adi,fifo-depth-bits = <16>;
-            adi,disable-energy-detect;
         };
     };
diff --git a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt
index 5100358..b921731 100644
--- a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt
+++ b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt
@@ -12,8 +12,36 @@
 KSZ9021:
 
   All skew control options are specified in picoseconds. The minimum
-  value is 0, the maximum value is 3000, and it is incremented by 200ps
-  steps.
+  value is 0, the maximum value is 3000, and it can be specified in 200ps
+  steps, *but* these values are not in fact what you get because this chip's
+  skew values actually increase in 120ps steps, starting from -840ps. The
+  incorrect values came from an error in the original KSZ9021 datasheet
+  before it was corrected in revision 1.2 (Feb 2014), but it is too late to
+  change the driver now because of the many existing device trees that have
+  been created using values that go up in increments of 200.
+
+  The following table shows the actual skew delay you will get for each of the
+  possible devicetree values, and the number that will be programmed into the
+  corresponding pad skew register:
+
+  Device Tree Value	Delay	Pad Skew Register Value
+  -----------------------------------------------------
+	0   		-840ps		0000
+	200 		-720ps		0001
+	400 		-600ps		0010
+	600 		-480ps		0011
+	800 		-360ps		0100
+	1000		-240ps		0101
+	1200		-120ps		0110
+	1400		   0ps		0111
+	1600		 120ps		1000
+	1800		 240ps		1001
+	2000		 360ps		1010
+	2200		 480ps		1011
+	2400		 600ps		1100
+	2600		 720ps		1101
+	2800		 840ps		1110
+	3000		 960ps		1111
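+
+  For example (using one of the skew properties listed below), a devicetree
+  entry of rxd0-skew-ps = <1400> programs pad skew register value 0111 and
+  yields a true delay of 0ps.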
 
   Optional properties:
 
diff --git a/Documentation/devicetree/bindings/net/renesas,ravb.txt b/Documentation/devicetree/bindings/net/renesas,ravb.txt
index 7ad3621..5df4aa7 100644
--- a/Documentation/devicetree/bindings/net/renesas,ravb.txt
+++ b/Documentation/devicetree/bindings/net/renesas,ravb.txt
@@ -18,6 +18,7 @@
 		R-Car Gen2 and RZ/G1 devices.
 
       - "renesas,etheravb-r8a774a1" for the R8A774A1 SoC.
+      - "renesas,etheravb-r8a774b1" for the R8A774B1 SoC.
       - "renesas,etheravb-r8a774c0" for the R8A774C0 SoC.
       - "renesas,etheravb-r8a7795" for the R8A7795 SoC.
       - "renesas,etheravb-r8a7796" for the R8A7796 SoC.
diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
index ebe4537..4845e29 100644
--- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml
+++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
@@ -113,7 +113,7 @@
     const: stmmaceth
 
   mac-mode:
-    maxItems: 1
+    $ref: ethernet-controller.yaml#/properties/phy-connection-type
     description:
       The property is identical to 'phy-mode', and assumes that there is mode
       converter in-between the MAC & PHY (e.g. GMII-to-RGMII). This converter
diff --git a/Documentation/devicetree/bindings/pci/designware-pcie.txt b/Documentation/devicetree/bindings/pci/designware-pcie.txt
index 5561a1c..78494c4 100644
--- a/Documentation/devicetree/bindings/pci/designware-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/designware-pcie.txt
@@ -11,7 +11,6 @@
 	     the ATU address space.
     (The old way of getting the configuration address space from "ranges"
     is deprecated and should be avoided.)
-- num-lanes: number of lanes to use
 RC mode:
 - #address-cells: set to <3>
 - #size-cells: set to <2>
@@ -34,6 +33,11 @@
 - clock-names: Must include the following entries:
 	- "pcie"
 	- "pcie_bus"
+- snps,enable-cdm-check: This is a boolean property and if present enables
+   automatic checking of CDM (Configuration Dependent Module) registers
+   for data corruption. CDM registers include standard PCIe configuration
+   space registers, Port Logic registers, DMA and iATU (internal Address
+   Translation Unit) registers.
 RC mode:
 - num-viewport: number of view ports configured in hardware. If a platform
   does not specify it, the driver assumes 2.
diff --git a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt
index a7f5f5a..de4b2ba 100644
--- a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt
@@ -50,7 +50,7 @@
 - power-domains: Must be set to a phandle pointing to PCIE_PHY power domain
 - resets: Must contain phandles to PCIe-related reset lines exposed by SRC
   IP block
-- reset-names: Must contain the following entires:
+- reset-names: Must contain the following entries:
 	       - "pciephy"
 	       - "apps"
 	       - "turnoff"
diff --git a/Documentation/devicetree/bindings/pci/mediatek-pcie.txt b/Documentation/devicetree/bindings/pci/mediatek-pcie.txt
index 92437a3..7468d66 100644
--- a/Documentation/devicetree/bindings/pci/mediatek-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/mediatek-pcie.txt
@@ -6,6 +6,7 @@
 	"mediatek,mt2712-pcie"
 	"mediatek,mt7622-pcie"
 	"mediatek,mt7623-pcie"
+	"mediatek,mt7629-pcie"
 - device_type: Must be "pci"
 - reg: Base addresses and lengths of the PCIe subsys and root ports.
 - reg-names: Names of the above areas to use during resource lookup.
diff --git a/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt
new file mode 100644
index 0000000..b739f92
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt
@@ -0,0 +1,171 @@
+NVIDIA Tegra PCIe controller (Synopsys DesignWare Core based)
+
+This PCIe host controller is based on the Synopsys DesignWare PCIe IP
+and thus inherits all the common properties defined in designware-pcie.txt.
+
+Required properties:
+- compatible: For Tegra19x, must contain "nvidia,tegra194-pcie".
+- device_type: Must be "pci"
+- power-domains: A phandle to the node that controls power to the respective
+  PCIe controller and a specifier name for the PCIe controller. Following are
+  the specifiers for the different PCIe controllers
+    TEGRA194_POWER_DOMAIN_PCIEX8B: C0
+    TEGRA194_POWER_DOMAIN_PCIEX1A: C1
+    TEGRA194_POWER_DOMAIN_PCIEX1A: C2
+    TEGRA194_POWER_DOMAIN_PCIEX1A: C3
+    TEGRA194_POWER_DOMAIN_PCIEX4A: C4
+    TEGRA194_POWER_DOMAIN_PCIEX8A: C5
+  These specifiers are defined in the
+  "include/dt-bindings/power/tegra194-powergate.h" file.
+- reg: A list of physical base address and length pairs for each set of
+  controller registers. Must contain an entry for each entry in the reg-names
+  property.
+- reg-names: Must include the following entries:
+  "appl": Controller's application logic registers
+  "config": As per the definition in designware-pcie.txt
+  "atu_dma": iATU and DMA registers. This is where the iATU (internal Address
+             Translation Unit) registers of the PCIe core are made available
+             for SW access.
+  "dbi": The aperture where root port's own configuration registers are
+         available
+- interrupts: A list of interrupt outputs of the controller. Must contain an
+  entry for each entry in the interrupt-names property.
+- interrupt-names: Must include the following entries:
+  "intr": The Tegra interrupt that is asserted for controller interrupts
+  "msi": The Tegra interrupt that is asserted when an MSI is received
+- bus-range: Range of bus numbers associated with this controller
+- #address-cells: Address representation for root ports (must be 3)
+  - cell 0 specifies the bus and device numbers of the root port:
+    [23:16]: bus number
+    [15:11]: device number
+  - cell 1 denotes the upper 32 address bits and should be 0
+  - cell 2 contains the lower 32 address bits and is used to translate to the
+    CPU address space
+- #size-cells: Size representation for root ports (must be 2)
+- ranges: Describes the translation of addresses for root ports and standard
+  PCI regions. The entries must be 7 cells each, where the first three cells
+  correspond to the address as described for the #address-cells property
+  above, the fourth and fifth cells are for the physical CPU address to
+  translate to and the sixth and seventh cells are as described for the
+  #size-cells property above.
+  - Entries set up the mapping for the standard I/O, memory and
+    prefetchable PCI regions. The first cell determines the type of region
+    that is set up:
+    - 0x81000000: I/O memory region
+    - 0x82000000: non-prefetchable memory region
+    - 0xc2000000: prefetchable memory region
+  Please refer to the standard PCI bus binding document for a more detailed
+  explanation.
+- #interrupt-cells: Size representation for interrupts (must be 1)
+- interrupt-map-mask and interrupt-map: Standard PCI IRQ mapping properties
+  Please refer to the standard PCI bus binding document for a more detailed
+  explanation.
+- clocks: Must contain an entry for each entry in clock-names.
+  See ../clocks/clock-bindings.txt for details.
+- clock-names: Must include the following entries:
+  - core
+- resets: Must contain an entry for each entry in reset-names.
+  See ../reset/reset.txt for details.
+- reset-names: Must include the following entries:
+  - apb
+  - core
+- phys: Must contain a phandle to P2U PHY for each entry in phy-names.
+- phy-names: Must include an entry for each active lane.
+  "p2u-N": where N ranges from 0 to one less than the total number of lanes
+- nvidia,bpmp: Must contain a phandle to the BPMP controller node followed
+  by a controller id. The controller ids for each controller are as follows:
+    0: C0
+    1: C1
+    2: C2
+    3: C3
+    4: C4
+    5: C5
+- vddio-pex-ctl-supply: Regulator supply for PCIe side band signals
+
+Optional properties:
+- pinctrl-names: A list of pinctrl state names.
+  It is mandatory for C5 controller and optional for other controllers.
+  - "default": Configures PCIe I/O for proper operation.
+- pinctrl-0: phandle for the 'default' state of pin configuration.
+  It is mandatory for C5 controller and optional for other controllers.
+- supports-clkreq: Refer to Documentation/devicetree/bindings/pci/pci.txt
+- nvidia,update-fc-fixup: This is a boolean property and needs to be present to
+    improve performance when a platform is designed in such a way that it
+    satisfies at least one of the following conditions, thereby enabling the
+    root port to exchange the optimum number of FC (Flow Control) credits
+    with downstream devices:
+    1. If C0/C4/C5 run at x1/x2 link widths (irrespective of speed and MPS)
+    2. If C0/C1/C2/C3/C4/C5 operate at their respective max link widths and
+       a) speed is Gen-2 and MPS is 256B
+       b) speed is >= Gen-3 with any MPS
+- nvidia,aspm-cmrt-us: Common Mode Restore Time for proper operation of ASPM
+   to be specified in microseconds
+- nvidia,aspm-pwr-on-t-us: Power On time for proper operation of ASPM to be
+   specified in microseconds
+- nvidia,aspm-l0s-entrance-latency-us: ASPM L0s entrance latency to be
+   specified in microseconds
+- vpcie3v3-supply: A phandle to the regulator node that supplies 3.3V to the slot
+  if the platform has one such slot (e.g., the x16 slot owned by the C5
+  controller in the p2972-0000 platform).
+- vpcie12v-supply: A phandle to the regulator node that supplies 12V to the slot
+  if the platform has one such slot (e.g., the x16 slot owned by the C5
+  controller in the p2972-0000 platform).
+
+Examples:
+=========
+
+Tegra194:
+--------
+
+	pcie@14180000 {
+		compatible = "nvidia,tegra194-pcie", "snps,dw-pcie";
+		power-domains = <&bpmp TEGRA194_POWER_DOMAIN_PCIEX8B>;
+		reg = <0x00 0x14180000 0x0 0x00020000   /* appl registers (128K)      */
+		       0x00 0x38000000 0x0 0x00040000   /* configuration space (256K) */
+		       0x00 0x38040000 0x0 0x00040000>; /* iATU_DMA reg space (256K)  */
+		reg-names = "appl", "config", "atu_dma";
+
+		#address-cells = <3>;
+		#size-cells = <2>;
+		device_type = "pci";
+		num-lanes = <8>;
+		linux,pci-domain = <0>;
+
+		pinctrl-names = "default";
+		pinctrl-0 = <&pex_rst_c5_out_state>, <&clkreq_c5_bi_dir_state>;
+
+		clocks = <&bpmp TEGRA194_CLK_PEX0_CORE_0>;
+		clock-names = "core";
+
+		resets = <&bpmp TEGRA194_RESET_PEX0_CORE_0_APB>,
+			 <&bpmp TEGRA194_RESET_PEX0_CORE_0>;
+		reset-names = "apb", "core";
+
+		interrupts = <GIC_SPI 72 IRQ_TYPE_LEVEL_HIGH>,	/* controller interrupt */
+			     <GIC_SPI 73 IRQ_TYPE_LEVEL_HIGH>;	/* MSI interrupt */
+		interrupt-names = "intr", "msi";
+
+		#interrupt-cells = <1>;
+		interrupt-map-mask = <0 0 0 0>;
+		interrupt-map = <0 0 0 0 &gic GIC_SPI 72 IRQ_TYPE_LEVEL_HIGH>;
+
+		nvidia,bpmp = <&bpmp 0>;
+
+		supports-clkreq;
+		nvidia,aspm-cmrt-us = <60>;
+		nvidia,aspm-pwr-on-t-us = <20>;
+		nvidia,aspm-l0s-entrance-latency-us = <3>;
+
+		bus-range = <0x0 0xff>;
+		ranges = <0x81000000 0x0  0x38100000 0x0  0x38100000 0x0 0x00100000    /* downstream I/O (1MB) */
+			  0x82000000 0x0  0x38200000 0x0  0x38200000 0x0 0x01E00000    /* non-prefetchable memory (30MB) */
+			  0xc2000000 0x18 0x00000000 0x18 0x00000000 0x4 0x00000000>;  /* prefetchable memory (16GB) */
+
+		vddio-pex-ctl-supply = <&vdd_1v8ao>;
+		vpcie3v3-supply = <&vdd_3v3_pcie>;
+		vpcie12v-supply = <&vdd_12v_pcie>;
+
+		phys = <&p2u_hsio_2>, <&p2u_hsio_3>, <&p2u_hsio_4>,
+		       <&p2u_hsio_5>;
+		phy-names = "p2u-0", "p2u-1", "p2u-2", "p2u-3";
+	};
diff --git a/Documentation/devicetree/bindings/pci/pci-armada8k.txt b/Documentation/devicetree/bindings/pci/pci-armada8k.txt
index 8324a4e..7a813d0 100644
--- a/Documentation/devicetree/bindings/pci/pci-armada8k.txt
+++ b/Documentation/devicetree/bindings/pci/pci-armada8k.txt
@@ -11,7 +11,7 @@
 - reg-names:
    - "ctrl" for the control register region
    - "config" for the config space region
-- interrupts: Interrupt specifier for the PCIe controler
+- interrupts: Interrupt specifier for the PCIe controller
 - clocks: reference to the PCIe controller clocks
 - clock-names: mandatory if there is a second clock, in this case the
    name must be "core" for the first clock and "reg" for the second
diff --git a/Documentation/devicetree/bindings/pci/pci.txt b/Documentation/devicetree/bindings/pci/pci.txt
index 2a5d910..29bcbd8 100644
--- a/Documentation/devicetree/bindings/pci/pci.txt
+++ b/Documentation/devicetree/bindings/pci/pci.txt
@@ -27,6 +27,11 @@
 - reset-gpios:
    If present this property specifies PERST# GPIO. Host drivers can parse the
    GPIO and apply fundamental reset to endpoints.
+- supports-clkreq:
+   If present this property specifies that CLKREQ signal routing exists from
+   the root port to the downstream device, and host bridge drivers can perform
+   programming that depends on the signal's presence: for example, programming
+   the root port not to advertise ASPM L1 Sub-States support if there is no
+   CLKREQ signal.
 
 PCI-PCI Bridge properties
 -------------------------
diff --git a/Documentation/devicetree/bindings/pci/pcie-al.txt b/Documentation/devicetree/bindings/pci/pcie-al.txt
new file mode 100644
index 0000000..557a508
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/pcie-al.txt
@@ -0,0 +1,46 @@
+* Amazon Annapurna Labs PCIe host bridge
+
+Amazon's Annapurna Labs PCIe Host Controller is based on the Synopsys DesignWare
+PCI core. It inherits common properties defined in
+Documentation/devicetree/bindings/pci/designware-pcie.txt.
+
+Properties of the host controller node that differ from designware-pcie.txt are:
+
+- compatible:
+	Usage: required
+	Value type: <stringlist>
+	Definition: Value should contain
+			- "amazon,al-alpine-v2-pcie" for alpine_v2
+			- "amazon,al-alpine-v3-pcie" for alpine_v3
+
+- reg:
+	Usage: required
+	Value type: <prop-encoded-array>
+	Definition: Register ranges as listed in the reg-names property
+
+- reg-names:
+	Usage: required
+	Value type: <stringlist>
+	Definition: Must include the following entries
+			- "config"	PCIe ECAM space
+			- "controller"	AL proprietary registers
+			- "dbi"		Designware PCIe registers
+
+Example:
+
+	pcie-external0: pcie@fb600000 {
+		compatible = "amazon,al-alpine-v3-pcie";
+		reg = <0x0 0xfb600000 0x0 0x00100000
+		       0x0 0xfd800000 0x0 0x00010000
+		       0x0 0xfd810000 0x0 0x00001000>;
+		reg-names = "config", "controller", "dbi";
+		bus-range = <0 255>;
+		device_type = "pci";
+		#address-cells = <3>;
+		#size-cells = <2>;
+		#interrupt-cells = <1>;
+		interrupts = <GIC_SPI 49 IRQ_TYPE_LEVEL_HIGH>;
+		interrupt-map-mask = <0x00 0 0 7>;
+		interrupt-map = <0x0000 0 0 1 &gic GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>; /* INTa */
+		ranges = <0x02000000 0x0 0xc0010000 0x0 0xc0010000 0x0 0x07ff0000>;
+	};
diff --git a/Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt b/Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt
new file mode 100644
index 0000000..d23ff90
--- /dev/null
+++ b/Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt
@@ -0,0 +1,28 @@
+NVIDIA Tegra194 P2U binding
+
+Tegra194 has two PHY bricks, namely HSIO (High Speed IO) and NVHS (NVIDIA High
+Speed), interfacing with 12 and 8 P2U instances respectively.
+A P2U instance is glue logic between the Synopsys DesignWare Core PCIe IP's
+PIPE interface and the PHY of the HSIO/NVHS bricks. Each P2U instance
+represents one PCIe lane.
+
+Required properties:
+- compatible: For Tegra19x, must contain "nvidia,tegra194-p2u".
+- reg: Should be the physical address space and length of each respective P2U
+       instance.
+- reg-names: Must include the entry "ctl".
+
+Required properties for PHY port node:
+- #phy-cells: Defined by generic PHY bindings.  Must be 0.
+
+Refer to phy/phy-bindings.txt for the generic PHY binding properties.
+
+Example:
+
+p2u_hsio_0: phy@3e10000 {
+	compatible = "nvidia,tegra194-p2u";
+	reg = <0x03e10000 0x10000>;
+	reg-names = "ctl";
+
+	#phy-cells = <0>;
+};
diff --git a/Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt b/Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt
new file mode 100644
index 0000000..933f0c4
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt
@@ -0,0 +1,20 @@
+Device Tree Bindings for Power Controller on MediaTek PMIC
+
+The power controller found on the PMIC is responsible for externally
+powering the remote MediaTek SoC off or on through the BBPU circuit.
+
+Required properties:
+- compatible: Should be one of the following
+       "mediatek,mt6323-pwrc": for MT6323 PMIC
+
+Example:
+
+       pmic {
+               compatible = "mediatek,mt6323";
+
+               ...
+
+               power-controller {
+                       compatible = "mediatek,mt6323-pwrc";
+               };
+       }
diff --git a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
index 991728c..c850153 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
@@ -6,6 +6,8 @@
    - "mediatek,mt7622-pwm": found on mt7622 SoC.
    - "mediatek,mt7623-pwm": found on mt7623 SoC.
    - "mediatek,mt7628-pwm": found on mt7628 SoC.
+   - "mediatek,mt7629-pwm", "mediatek,mt7622-pwm": found on mt7629 SoC.
+   - "mediatek,mt8516-pwm": found on mt8516 SoC.
  - reg: physical base address and length of the controller's registers.
  - #pwm-cells: must be 2. See pwm.txt in this directory for a description of
    the cell format.
diff --git a/Documentation/devicetree/bindings/pwm/pwm-sprd.txt b/Documentation/devicetree/bindings/pwm/pwm-sprd.txt
new file mode 100644
index 0000000..16fa5a0
--- /dev/null
+++ b/Documentation/devicetree/bindings/pwm/pwm-sprd.txt
@@ -0,0 +1,40 @@
+Spreadtrum PWM controller
+
+The Spreadtrum SoC PWM controller provides 4 PWM channels.
+
+Required properties:
+- compatible : Should be "sprd,ums512-pwm".
+- reg: Physical base address and length of the controller's registers.
+- clocks: The phandle and specifier referencing the controller's clocks.
+- clock-names: Should contain the following entries:
+  "pwmn": used to derive the functional clock for PWM channel n (n range: 0 ~ 3).
+  "enablen": for PWM channel n enable clock (n range: 0 ~ 3).
+- #pwm-cells: Should be 2. See pwm.txt in this directory for a description of
+  the cells format.
+
+Optional properties:
+- assigned-clocks: Reference to the PWM clock entries.
+- assigned-clock-parents: The phandles of the parent clocks for the PWM clocks.
+
+Example:
+	pwms: pwm@32260000 {
+		compatible = "sprd,ums512-pwm";
+		reg = <0 0x32260000 0 0x10000>;
+		clock-names = "pwm0", "enable0",
+			"pwm1", "enable1",
+			"pwm2", "enable2",
+			"pwm3", "enable3";
+		clocks = <&aon_clk CLK_PWM0>, <&aonapb_gate CLK_PWM0_EB>,
+		       <&aon_clk CLK_PWM1>, <&aonapb_gate CLK_PWM1_EB>,
+		       <&aon_clk CLK_PWM2>, <&aonapb_gate CLK_PWM2_EB>,
+		       <&aon_clk CLK_PWM3>, <&aonapb_gate CLK_PWM3_EB>;
+		assigned-clocks = <&aon_clk CLK_PWM0>,
+			<&aon_clk CLK_PWM1>,
+			<&aon_clk CLK_PWM2>,
+			<&aon_clk CLK_PWM3>;
+		assigned-clock-parents = <&ext_26m>,
+			<&ext_26m>,
+			<&ext_26m>,
+			<&ext_26m>;
+		#pwm-cells = <2>;
+	};
diff --git a/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt b/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt
index 04cbb90..28f2cba 100644
--- a/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/qoriq-thermal.txt
@@ -23,6 +23,7 @@
 Optional property:
 - little-endian : If present, the TMU registers are little endian. If absent,
 	the default is big endian.
+- clocks : the clock that drives the TMU silicon.
 
 Example:
 
diff --git a/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml
new file mode 100644
index 0000000..3a54f58
--- /dev/null
+++ b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/watchdog/allwinner,sun4i-a10-wdt.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 Watchdog Device Tree Bindings
+
+allOf:
+  - $ref: "watchdog.yaml#"
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+  compatible:
+    oneOf:
+      - const: allwinner,sun4i-a10-wdt
+      - const: allwinner,sun6i-a31-wdt
+      - items:
+          - const: allwinner,sun50i-a64-wdt
+          - const: allwinner,sun6i-a31-wdt
+      - items:
+          - const: allwinner,sun50i-h6-wdt
+          - const: allwinner,sun6i-a31-wdt
+      - items:
+          - const: allwinner,suniv-f1c100s-wdt
+          - const: allwinner,sun4i-a10-wdt
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - interrupts
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    wdt: watchdog@1c20c90 {
+        compatible = "allwinner,sun4i-a10-wdt";
+        reg = <0x01c20c90 0x10>;
+        interrupts = <24>;
+        clocks = <&osc24M>;
+        timeout-sec = <10>;
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/watchdog/aspeed-wdt.txt b/Documentation/devicetree/bindings/watchdog/aspeed-wdt.txt
index c5077a1..d78d4a8f 100644
--- a/Documentation/devicetree/bindings/watchdog/aspeed-wdt.txt
+++ b/Documentation/devicetree/bindings/watchdog/aspeed-wdt.txt
@@ -4,6 +4,7 @@
  - compatible: must be one of:
 	- "aspeed,ast2400-wdt"
 	- "aspeed,ast2500-wdt"
+	- "aspeed,ast2600-wdt"
 
  - reg: physical base address of the controller and length of memory mapped
    region
diff --git a/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.txt b/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.txt
new file mode 100644
index 0000000..f902508
--- /dev/null
+++ b/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.txt
@@ -0,0 +1,22 @@
+* Freescale i.MX7ULP Watchdog Timer (WDT) Controller
+
+Required properties:
+- compatible : Should be "fsl,imx7ulp-wdt"
+- reg : Should contain WDT registers location and length
+- interrupts : Should contain WDT interrupt
+- clocks: Should contain a phandle pointing to the gated peripheral clock.
+
+Optional properties:
+- timeout-sec : Contains the watchdog timeout in seconds
+
+Examples:
+
+wdog1: watchdog@403d0000 {
+	compatible = "fsl,imx7ulp-wdt";
+	reg = <0x403d0000 0x10000>;
+	interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>;
+	clocks = <&pcc2 IMX7ULP_CLK_WDG1>;
+	assigned-clocks = <&pcc2 IMX7ULP_CLK_WDG1>;
+	assigned-clock-parents = <&scg1 IMX7ULP_CLK_FIRC_BUS_CLK>;
+	timeout-sec = <40>;
+};
diff --git a/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt b/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt
deleted file mode 100644
index e65198d..0000000
--- a/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-Allwinner SoCs Watchdog timer
-
-Required properties:
-
-- compatible : should be one of
-	"allwinner,sun4i-a10-wdt"
-	"allwinner,sun6i-a31-wdt"
-	"allwinner,sun50i-a64-wdt","allwinner,sun6i-a31-wdt"
-	"allwinner,sun50i-h6-wdt","allwinner,sun6i-a31-wdt"
-	"allwinner,suniv-f1c100s-wdt", "allwinner,sun4i-a10-wdt"
-- reg : Specifies base physical address and size of the registers.
-
-Optional properties:
-- timeout-sec : Contains the watchdog timeout in seconds
-
-Example:
-
-wdt: watchdog@1c20c90 {
-	compatible = "allwinner,sun4i-a10-wdt";
-	reg = <0x01c20c90 0x10>;
-	timeout-sec = <10>;
-};
diff --git a/Documentation/devicetree/bindings/watchdog/watchdog.yaml b/Documentation/devicetree/bindings/watchdog/watchdog.yaml
new file mode 100644
index 0000000..187bf6c
--- /dev/null
+++ b/Documentation/devicetree/bindings/watchdog/watchdog.yaml
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/watchdog/watchdog.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Watchdog Generic Bindings
+
+maintainers:
+  - Guenter Roeck <linux@roeck-us.net>
+  - Wim Van Sebroeck <wim@linux-watchdog.org>
+
+description: |
+  This document describes generic bindings which can be used to
+  describe watchdog devices in a device tree.
+
+properties:
+  $nodename:
+    pattern: "^watchdog(@.*|-[0-9a-f])?$"
+
+  timeout-sec:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
+      Contains the watchdog timeout in seconds.
+
+...
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
index d2c6a5c..b19b6a0 100644
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@@ -158,6 +158,20 @@
         copies.  Currently, it's only used in copy_file_range, which will revert
         to the default VFS implementation if this option is used.
 
+  recover_session=<no|clean>
+	Set auto reconnect mode in the case where the client is blacklisted. The
+	available modes are "no" and "clean". The default is "no".
+
+	* no: never attempt to reconnect when the client detects that it has
+	been blacklisted. Operations will generally fail after being
+	blacklisted.
+
+	* clean: the client reconnects to the ceph cluster automatically when
+	it detects that it has been blacklisted. During reconnect, the client
+	drops dirty data/metadata and invalidates page caches and writable
+	file handles. After reconnect, file locks become stale because the
+	MDS loses track of them. If an inode contains any stale file locks,
+	read/write on the inode is not allowed until applications release all
+	stale file locks.
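+
+	For example (the monitor address and credentials below are
+	illustrative):
+
+	  mount -t ceph 192.168.0.1:6789:/ /mnt/cephfs \
+		-o name=admin,recover_session=clean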
+
 More Information
 ================
 
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index fd2bcf9..2c3a9f76 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -37,3 +37,13 @@
    journalling
    fscrypt
    fsverity
+
+Filesystems
+===========
+
+Documentation for filesystem implementations.
+
+.. toctree::
+   :maxdepth: 2
+
+   virtiofs
diff --git a/Documentation/filesystems/virtiofs.rst b/Documentation/filesystems/virtiofs.rst
new file mode 100644
index 0000000..4f338e3
--- /dev/null
+++ b/Documentation/filesystems/virtiofs.rst
@@ -0,0 +1,60 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================================
+virtiofs: virtio-fs host<->guest shared file system
+===================================================
+
+- Copyright (C) 2019 Red Hat, Inc.
+
+Introduction
+============
+The virtiofs file system for Linux implements a driver for the paravirtualized
+VIRTIO "virtio-fs" device for guest<->host file system sharing.  It allows a
+guest to mount a directory that has been exported on the host.
+
+Guests often require access to files residing on the host or remote systems.
+Use cases include making files available to new guests during installation,
+booting from a root file system located on the host, persistent storage for
+stateless or ephemeral guests, and sharing a directory between guests.
+
+Although it is possible to use existing network file systems for some of these
+tasks, they require configuration steps that are hard to automate and they
+expose the storage network to the guest.  The virtio-fs device was designed to
+solve these problems by providing file system access without networking.
+
+Furthermore the virtio-fs device takes advantage of the co-location of the
+guest and host to increase performance and provide semantics that are not
+possible with network file systems.
+
+Usage
+=====
+Mount file system with tag ``myfs`` on ``/mnt``:
+
+.. code-block:: sh
+
+  guest# mount -t virtiofs myfs /mnt
+
+Please see https://virtio-fs.gitlab.io/ for details on how to configure QEMU
+and the virtiofsd daemon.
+
+Internals
+=========
+Since the virtio-fs device uses the FUSE protocol for file system requests, the
+virtiofs file system for Linux is integrated closely with the FUSE file system
+client.  The guest acts as the FUSE client while the host acts as the FUSE
+server.  The /dev/fuse interface between the kernel and userspace is replaced
+with the virtio-fs device interface.
+
+FUSE requests are placed into a virtqueue and processed by the host.  The
+response portion of the buffer is filled in by the host and the guest handles
+the request completion.
+
+Mapping /dev/fuse to virtqueues requires solving differences in semantics
+between /dev/fuse and virtqueues.  Each time the /dev/fuse device is read, the
+FUSE client may choose which request to transfer, making it possible to
+prioritize certain requests over others.  Virtqueues have queue semantics and
+it is not possible to change the order of requests that have been enqueued.
+This is especially important if the virtqueue becomes full since it is then
+impossible to add high priority requests.  In order to address this difference,
+the virtio-fs device uses a "hiprio" virtqueue specifically for requests that
+have priority over normal requests.
diff --git a/Documentation/networking/devlink-trap.rst b/Documentation/networking/devlink-trap.rst
index c20c7c4..8e90a85 100644
--- a/Documentation/networking/devlink-trap.rst
+++ b/Documentation/networking/devlink-trap.rst
@@ -143,7 +143,8 @@
    * - ``port_list_is_empty``
      - ``drop``
      - Traps packets that the device decided to drop in case they need to be
-       flooded and the flood list is empty
+       flooded (e.g., unknown unicast, unregistered multicast) and there are
+       no ports the packets should be flooded to
    * - ``port_loopback_filter``
      - ``drop``
      - Traps packets that the device decided to drop in case after layer 2
diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst
index 4026363..a3c3349 100644
--- a/Documentation/process/embargoed-hardware-issues.rst
+++ b/Documentation/process/embargoed-hardware-issues.rst
@@ -143,6 +143,20 @@
 in their role as Linux kernel developers. They will, however, agree to
 adhere to this documented process and the Memorandum of Understanding.
 
+The disclosing party should provide a list of contacts for all other
+entities who have already been, or should be, informed about the issue.
+This serves several purposes:
+
+ - The list of disclosed entities allows communication across the
+   industry, e.g. other OS vendors, HW vendors, etc.
+
+ - The disclosed entities can be contacted to name experts who should
+   participate in the mitigation development.
+
+ - If an expert who is required to handle an issue is employed by a
+   listed entity or is a member of a listed entity, then the response teams
+   can request the disclosure of that expert from that entity. This ensures
+   that the expert is also part of the entity's response team.
 
 Disclosure
 """"""""""
@@ -158,10 +172,7 @@
 """"""""""""""""""""""
 
 The initial response team sets up an encrypted mailing-list or repurposes
-an existing one if appropriate. The disclosing party should provide a list
-of contacts for all other parties who have already been, or should be,
-informed about the issue. The response team contacts these parties so they
-can name experts who should be subscribed to the mailing-list.
+an existing one if appropriate.
 
 Using a mailing-list is close to the normal Linux development process and
 has been successfully used in developing mitigations for various hardware
@@ -175,9 +186,24 @@
 stable kernel versions as necessary.
 
 The initial response team will identify further experts from the Linux
-kernel developer community as needed and inform the disclosing party about
-their participation. Bringing in experts can happen at any time of the
-development process and often needs to be handled in a timely manner.
+kernel developer community as needed. Bringing in experts can happen at any
+time of the development process and needs to be handled in a timely manner.
+
+If an expert is employed by or member of an entity on the disclosure list
+provided by the disclosing party, then participation will be requested from
+the relevant entity.
+
+If not, then the disclosing party will be informed about the expert's
+participation. The experts are covered by the Memorandum of Understanding
+and the disclosing party is requested to acknowledge their participation.
+If the disclosing party has a compelling reason to object, the objection
+has to be raised within five work days and resolved with the incident
+team immediately. If the disclosing party does not react within five
+work days, this is taken as silent acknowledgement.
+
+After acknowledgement or resolution of an objection the expert is disclosed
+by the incident team and brought into the development process.
+
 
 Coordinated release
 """""""""""""""""""
@@ -216,7 +242,7 @@
   ARM
   AMD
   IBM
-  Intel
+  Intel		Tony Luck <tony.luck@intel.com>
   Qualcomm	Trilok Soni <tsoni@codeaurora.org>
 
   Microsoft	Sasha Levin <sashal@kernel.org>
diff --git a/Documentation/security/IMA-templates.rst b/Documentation/security/IMA-templates.rst
index 3d1cca2..c5a8432 100644
--- a/Documentation/security/IMA-templates.rst
+++ b/Documentation/security/IMA-templates.rst
@@ -68,8 +68,10 @@
  - 'd-ng': the digest of the event, calculated with an arbitrary hash
    algorithm (field format: [<hash algo>:]digest, where the digest
    prefix is shown only if the hash algorithm is not SHA1 or MD5);
+ - 'd-modsig': the digest of the event without the appended modsig;
  - 'n-ng': the name of the event, without size limitations;
  - 'sig': the file signature;
+ - 'modsig': the appended file signature;
  - 'buf': the buffer data that was used to generate the hash without size limitations;
 
 
@@ -79,6 +81,7 @@
  - "ima-ng" (default): its format is ``d-ng|n-ng``;
  - "ima-sig": its format is ``d-ng|n-ng|sig``;
  - "ima-buf": its format is ``d-ng|n-ng|buf``;
+ - "ima-modsig": its format is ``d-ng|n-ng|sig|d-modsig|modsig``;
 
 
 Use
diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
index 136f1ee..4833904 100644
--- a/Documentation/virt/kvm/api.txt
+++ b/Documentation/virt/kvm/api.txt
@@ -5309,3 +5309,16 @@
 This capability indicates that KVM supports paravirtualized Hyper-V IPI send
 hypercalls:
 HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
+
+8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH
+
+Architecture: x86
+
+This capability indicates that KVM running on top of the Hyper-V hypervisor
+enables Direct TLB flush for its guests, meaning that TLB flush
+hypercalls are handled by the Level 0 hypervisor (Hyper-V), bypassing KVM.
+Due to the different ABI for hypercall parameters between Hyper-V and
+KVM, enabling this capability effectively disables all hypercall
+handling by KVM (as some KVM hypercalls may be mistakenly treated as TLB
+flush hypercalls by Hyper-V), so userspace should disable KVM identification
+in CPUID and only expose Hyper-V identification. In this case, the guest
+thinks it is running on Hyper-V and uses only Hyper-V hypercalls.
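+
+A minimal sketch of enabling it from userspace (assuming the capability was
+reported as available by KVM_CHECK_EXTENSION):
+
+    struct kvm_enable_cap cap = {
+        .cap = KVM_CAP_HYPERV_DIRECT_TLBFLUSH,
+    };
+
+    ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);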
diff --git a/Documentation/vm/split_page_table_lock.rst b/Documentation/vm/split_page_table_lock.rst
index 889b00b..ff51f4a 100644
--- a/Documentation/vm/split_page_table_lock.rst
+++ b/Documentation/vm/split_page_table_lock.rst
@@ -54,9 +54,9 @@
 Support of split page table lock by an architecture
 ===================================================
 
-There's no need in special enabling of PTE split page table lock:
-everything required is done by pgtable_page_ctor() and pgtable_page_dtor(),
-which must be called on PTE table allocation / freeing.
+There's no need for special enabling of the PTE split page table lock:
+everything required is done by pgtable_pte_page_ctor() and
+pgtable_pte_page_dtor(), which must be called on PTE table allocation /
+freeing.
 
 Make sure the architecture doesn't use slab allocator for page table
 allocation: slab uses page->slab_cache for its pages.
@@ -74,7 +74,7 @@
 
 With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
 
-NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must
+NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must
 be handled properly.
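+
+For example, an architecture's pte_alloc_one() typically handles the
+constructor failure along these lines (a sketch of the generic pattern)::
+
+	pgtable_t pte_alloc_one(struct mm_struct *mm)
+	{
+		struct page *pte = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+		if (!pte)
+			return NULL;
+		if (!pgtable_pte_page_ctor(pte)) {
+			__free_page(pte);
+			return NULL;
+		}
+		return pte;
+	}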
 
 page->ptl
@@ -94,7 +94,7 @@
    split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs
    one more cache line for indirect access;
 
-The spinlock_t allocated in pgtable_page_ctor() for PTE table and in
+The spinlock_t is allocated in pgtable_pte_page_ctor() for the PTE table and in
 pgtable_pmd_page_ctor() for PMD table.
 
 Please, never access page->ptl directly -- use appropriate helper.
diff --git a/Documentation/watchdog/watchdog-parameters.rst b/Documentation/watchdog/watchdog-parameters.rst
index a3985cc..223c993 100644
--- a/Documentation/watchdog/watchdog-parameters.rst
+++ b/Documentation/watchdog/watchdog-parameters.rst
@@ -301,15 +301,6 @@
 
 -------------------------------------------------
 
-ks8695_wdt:
-    wdt_time:
-	Watchdog time in seconds. (default=5)
-    nowayout:
-	Watchdog cannot be stopped once started
-	(default=kernel config parameter)
-
--------------------------------------------------
-
 machzwd:
     nowayout:
 	Watchdog cannot be stopped once started
@@ -375,16 +366,6 @@
 
 -------------------------------------------------
 
-nuc900_wdt:
-    heartbeat:
-	Watchdog heartbeats in seconds.
-	(default = 15)
-    nowayout:
-	Watchdog cannot be stopped once started
-	(default=kernel config parameter)
-
--------------------------------------------------
-
 omap_wdt:
     timer_margin:
 	initial watchdog timeout (in seconds)
diff --git a/MAINTAINERS b/MAINTAINERS
index a400af0..296de2b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -643,6 +643,7 @@
 
 FORCEDETH GIGABIT ETHERNET DRIVER
 M:	Rain River <rain.1986.08.12@gmail.com>
+M:	Zhu Yanjun <yanjun.zhu@oracle.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/ethernet/nvidia/*
@@ -728,7 +729,7 @@
 M:	Thor Thayer <thor.thayer@linux.intel.com>
 S:	Maintained
 F:	drivers/mfd/altera-sysmgr.c
-F:	include/linux/mfd/altera-sysgmr.h
+F:	include/linux/mfd/altera-sysmgr.h
 
 ALTERA SYSTEM RESOURCE DRIVER FOR ARRIA10 DEVKIT
 M:	Thor Thayer <thor.thayer@linux.intel.com>
@@ -2921,6 +2922,8 @@
 F:	include/linux/backlight.h
 F:	include/linux/pwm_backlight.h
 F:	Documentation/devicetree/bindings/leds/backlight
+F:	Documentation/ABI/stable/sysfs-class-backlight
+F:	Documentation/ABI/testing/sysfs-class-backlight
 
 BATMAN ADVANCED
 M:	Marek Lindner <mareklindner@neomailbox.ch>
@@ -4338,6 +4341,12 @@
 F:	Documentation/filesystems/cramfs.txt
 F:	fs/cramfs/
 
+CREATIVE SB0540
+M:	Bastien Nocera <hadess@hadess.net>
+L:	linux-input@vger.kernel.org
+S:	Maintained
+F:	drivers/hid/hid-creative-sb0540.c
+
 CRYPTO API
 M:	Herbert Xu <herbert@gondor.apana.org.au>
 M:	"David S. Miller" <davem@davemloft.net>
@@ -7857,6 +7866,12 @@
 F:	drivers/mfd/lpc_ich.c
 F:	drivers/gpio/gpio-ich.c
 
+ICY I2C DRIVER
+M:	Max Staudt <max@enpas.org>
+L:	linux-i2c@vger.kernel.org
+S:	Maintained
+F:	drivers/i2c/busses/i2c-icy.c
+
 IDE SUBSYSTEM
 M:	"David S. Miller" <davem@davemloft.net>
 L:	linux-ide@vger.kernel.org
@@ -9042,10 +9057,11 @@
 F:	Documentation/security/keys/trusted-encrypted.rst
 F:	include/keys/trusted-type.h
 F:	security/keys/trusted.c
-F:	security/keys/trusted.h
+F:	include/keys/trusted.h
 
 KEYS/KEYRINGS:
 M:	David Howells <dhowells@redhat.com>
+M:	Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
 L:	keyrings@vger.kernel.org
 S:	Maintained
 F:	Documentation/security/keys/core.rst
@@ -12574,16 +12590,18 @@
 
 PCI NATIVE HOST BRIDGE AND ENDPOINT DRIVERS
 M:	Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+R:	Andrew Murray <andrew.murray@arm.com>
 L:	linux-pci@vger.kernel.org
 Q:	http://patchwork.ozlabs.org/project/linux-pci/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git/
 S:	Supported
 F:	drivers/pci/controller/
 
-PCIE DRIVER FOR ANNAPURNA LABS
+PCIE DRIVER FOR AMAZON ANNAPURNA LABS
 M:	Jonathan Chocron <jonnyc@amazon.com>
 L:	linux-pci@vger.kernel.org
 S:	Maintained
+F:	Documentation/devicetree/bindings/pci/pcie-al.txt
 F:	drivers/pci/controller/dwc/pcie-al.c
 
 PCIE DRIVER FOR AMLOGIC MESON
@@ -13229,9 +13247,11 @@
 
 PWM SUBSYSTEM
 M:	Thierry Reding <thierry.reding@gmail.com>
+R:	Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
 L:	linux-pwm@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/thierry.reding/linux-pwm.git
+Q:	https://patchwork.ozlabs.org/project/linux-pwm/list/
 F:	Documentation/driver-api/pwm.rst
 F:	Documentation/devicetree/bindings/pwm/
 F:	include/linux/pwm.h
@@ -13240,6 +13260,7 @@
 F:	include/linux/pwm_backlight.h
 F:	drivers/gpio/gpio-mvebu.c
 F:	Documentation/devicetree/bindings/gpio/gpio-mvebu.txt
+K:	pwm_(config|apply_state|ops)
 
 PXA GPIO DRIVER
 M:	Robert Jarzmik <robert.jarzmik@free.fr>
@@ -13781,7 +13802,7 @@
 RENESAS EMEV2 I2C DRIVER
 M:	Wolfram Sang <wsa+renesas@sang-engineering.com>
 S:	Supported
-F:	Documentation/devicetree/bindings/i2c/i2c-emev2.txt
+F:	Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt
 F:	drivers/i2c/busses/i2c-emev2.c
 
 RENESAS ETHERNET DRIVERS
@@ -13803,15 +13824,15 @@
 RENESAS R-CAR I2C DRIVERS
 M:	Wolfram Sang <wsa+renesas@sang-engineering.com>
 S:	Supported
-F:	Documentation/devicetree/bindings/i2c/i2c-rcar.txt
-F:	Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt
+F:	Documentation/devicetree/bindings/i2c/renesas,i2c.txt
+F:	Documentation/devicetree/bindings/i2c/renesas,iic.txt
 F:	drivers/i2c/busses/i2c-rcar.c
 F:	drivers/i2c/busses/i2c-sh_mobile.c
 
 RENESAS RIIC DRIVER
 M:	Chris Brandt <chris.brandt@renesas.com>
 S:	Supported
-F:	Documentation/devicetree/bindings/i2c/i2c-riic.txt
+F:	Documentation/devicetree/bindings/i2c/renesas,riic.txt
 F:	drivers/i2c/busses/i2c-riic.c
 
 RENESAS USB PHY DRIVER
@@ -16055,6 +16076,7 @@
 M:	Zhang Rui <rui.zhang@intel.com>
 M:	Eduardo Valentin <edubezval@gmail.com>
 R:	Daniel Lezcano <daniel.lezcano@linaro.org>
+R:	Amit Kucheria <amit.kucheria@verdurent.com>
 L:	linux-pm@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/evalenti/linux-soc-thermal.git
@@ -17259,6 +17281,18 @@
 F:	drivers/s390/virtio/
 F:	arch/s390/include/uapi/asm/virtio-ccw.h
 
+VIRTIO FILE SYSTEM
+M:	Vivek Goyal <vgoyal@redhat.com>
+M:	Stefan Hajnoczi <stefanha@redhat.com>
+M:	Miklos Szeredi <miklos@szeredi.hu>
+L:	virtualization@lists.linux-foundation.org
+L:	linux-fsdevel@vger.kernel.org
+W:	https://virtio-fs.gitlab.io/
+S:	Supported
+F:	fs/fuse/virtio_fs.c
+F:	include/uapi/linux/virtio_fs.h
+F:	Documentation/filesystems/virtiofs.rst
+
 VIRTIO GPU DRIVER
 M:	David Airlie <airlied@linux.ie>
 M:	Gerd Hoffmann <kraxel@redhat.com>
diff --git a/arch/Kconfig b/arch/Kconfig
index 0fcf8ec..5f8a5d8 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -706,6 +706,17 @@
 	  and vice-versa 32-bit applications to call 64-bit mmap().
 	  Required for applications doing different bitness syscalls.
 
+# This allows an architecture to use a set of generic functions to
+# determine the mmap base address, giving priority to the top-down
+# scheme unless the process is in legacy mode (compat task, unlimited
+# stack size or sysctl_legacy_va_layout).
+# An architecture that selects this option can provide its own version of:
+# - STACK_RND_MASK
+config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+	bool
+	depends on MMU
+	select ARCH_HAS_ELF_RANDOMIZE
+
 config HAVE_COPY_THREAD_TLS
 	bool
 	help
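[ For reference: with ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT selected, the
  layout selection that the arm/arm64 hunks below delete from arch code is
  provided by a common implementation (mm/util.c in this series).  A
  condensed sketch, mirroring the code removed from arch/arm/mm/mmap.c
  further down:

	void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
	{
		unsigned long random_factor = 0UL;

		if (current->flags & PF_RANDOMIZE)
			random_factor = arch_mmap_rnd();

		if (mmap_is_legacy(rlim_stack)) {	/* compat, unlimited stack, or sysctl */
			mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
			mm->get_unmapped_area = arch_get_unmapped_area;
		} else {
			mm->mmap_base = mmap_base(random_factor, rlim_stack);
			mm->get_unmapped_area = arch_get_unmapped_area_topdown;
		}
	} ]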
diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h
index 71ded3b..eb91f1e 100644
--- a/arch/alpha/include/asm/pgalloc.h
+++ b/arch/alpha/include/asm/pgalloc.h
@@ -53,6 +53,4 @@
 	free_page((unsigned long)pmd);
 }
 
-#define check_pgt_cache()	do { } while (0)
-
 #endif /* _ALPHA_PGALLOC_H */
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index 89c2032..065b57f 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -359,11 +359,6 @@
 
 #include <asm-generic/pgtable.h>
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 /* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT.  */
 #define HAVE_ARCH_UNMAPPED_AREA
 
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index ac23379..a18ec7f 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -68,6 +68,9 @@
 #define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
 #define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
 
+#define MADV_COLD	20		/* deactivate these pages */
+#define MADV_PAGEOUT	21		/* reclaim these pages */
+
 /* compatibility flags */
 #define MAP_FILE	0
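[ Userspace view (a sketch, not part of the patch): the new advice values
  plug into the existing madvise(2) call.  The fallback defines mirror the
  alpha header above and are only needed while libc headers lag behind:

	#include <sys/mman.h>

	#ifndef MADV_COLD
	#define MADV_COLD	20	/* deactivate these pages */
	#endif
	#ifndef MADV_PAGEOUT
	#define MADV_PAGEOUT	21	/* reclaim these pages */
	#endif

	/* Mark a range cold, then ask the kernel to reclaim it. */
	static int shed_range(void *addr, size_t len)
	{
		if (madvise(addr, len, MADV_COLD))
			return -1;
		return madvise(addr, len, MADV_PAGEOUT);
	} ]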
 
diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h
index 9bdb8ed..b747f2e 100644
--- a/arch/arc/include/asm/pgalloc.h
+++ b/arch/arc/include/asm/pgalloc.h
@@ -108,7 +108,7 @@
 		return 0;
 	memzero((void *)pte_pg, PTRS_PER_PTE * sizeof(pte_t));
 	page = virt_to_page(pte_pg);
-	if (!pgtable_page_ctor(page)) {
+	if (!pgtable_pte_page_ctor(page)) {
 		__free_page(page);
 		return 0;
 	}
@@ -123,13 +123,12 @@
 
 static inline void pte_free(struct mm_struct *mm, pgtable_t ptep)
 {
-	pgtable_page_dtor(virt_to_page(ptep));
+	pgtable_pte_page_dtor(virt_to_page(ptep));
 	free_pages((unsigned long)ptep, __get_order_pte());
 }
 
 #define __pte_free_tlb(tlb, pte, addr)  pte_free((tlb)->mm, pte)
 
-#define check_pgt_cache()   do { } while (0)
 #define pmd_pgtable(pmd)	((pgtable_t) pmd_page_vaddr(pmd))
 
 #endif /* _ASM_ARC_PGALLOC_H */
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 1d87c18..7addd03 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -395,11 +395,6 @@
 /* to cope with aliasing VIPT cache */
 #define HAVE_ARCH_UNMAPPED_AREA
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()   do { } while (0)
-
 #endif /* __ASSEMBLY__ */
 
 #endif
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 229f2cdd..8a50efb 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -34,6 +34,7 @@
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
+	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BINFMT_FLAT_ARGVP_ENVP_ON_STACK
 	select BUILDTIME_EXTABLE_SORT if MMU
diff --git a/arch/arm/boot/dts/ls1021a.dtsi b/arch/arm/boot/dts/ls1021a.dtsi
index 464df42..2f6977a 100644
--- a/arch/arm/boot/dts/ls1021a.dtsi
+++ b/arch/arm/boot/dts/ls1021a.dtsi
@@ -874,7 +874,6 @@
 			#address-cells = <3>;
 			#size-cells = <2>;
 			device_type = "pci";
-			num-lanes = <4>;
 			num-viewport = <6>;
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000   /* downstream I/O */
@@ -899,7 +898,6 @@
 			#address-cells = <3>;
 			#size-cells = <2>;
 			device_type = "pci";
-			num-lanes = <4>;
 			num-viewport = <6>;
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000   /* downstream I/O */
diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
index a2a68b7..069da39 100644
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -15,8 +15,6 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-#define check_pgt_cache()		do { } while (0)
-
 #ifdef CONFIG_MMU
 
 #define _PAGE_USER_TABLE	(PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_USER))
diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h
index d0de24f..010fa1a 100644
--- a/arch/arm/include/asm/pgtable-nommu.h
+++ b/arch/arm/include/asm/pgtable-nommu.h
@@ -71,11 +71,6 @@
 extern unsigned int kobjsize(const void *objp);
 
 /*
- * No page table caches to initialise.
- */
-#define pgtable_cache_init()	do { } while (0)
-
-/*
  * All 32bit addresses are effectively valid for vmalloc...
  * Sort of meaningless for non-VM targets.
  */
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index f2e990d..3ae120c 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -368,8 +368,6 @@
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 
-#define pgtable_cache_init() do { } while (0)
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* CONFIG_MMU */
diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h
index 20c2f42..614bf82 100644
--- a/arch/arm/include/asm/processor.h
+++ b/arch/arm/include/asm/processor.h
@@ -140,8 +140,6 @@
 #endif
 #endif
 
-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
 #endif
 
 #endif /* __ASM_ARM_PROCESSOR_H */
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index b75ea15..669474a 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -44,7 +44,7 @@
 static inline void
 __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr)
 {
-	pgtable_page_dtor(pte);
+	pgtable_pte_page_dtor(pte);
 
 #ifndef CONFIG_ARM_LPAE
 	/*
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index f934a67..9485acc 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -319,11 +319,6 @@
 	return 0;
 }
 
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	return randomize_page(mm->brk, 0x02000000);
-}
-
 #ifdef CONFIG_MMU
 #ifdef CONFIG_KUSER_HELPERS
 /*
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 6ecbda8..6d89db7 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -204,18 +204,17 @@
 	 * coherent with the kernels mapping.
 	 */
 	if (!PageHighMem(page)) {
-		size_t page_size = PAGE_SIZE << compound_order(page);
-		__cpuc_flush_dcache_area(page_address(page), page_size);
+		__cpuc_flush_dcache_area(page_address(page), page_size(page));
 	} else {
 		unsigned long i;
 		if (cache_is_vipt_nonaliasing()) {
-			for (i = 0; i < (1 << compound_order(page)); i++) {
+			for (i = 0; i < compound_nr(page); i++) {
 				void *addr = kmap_atomic(page + i);
 				__cpuc_flush_dcache_area(addr, PAGE_SIZE);
 				kunmap_atomic(addr);
 			}
 		} else {
-			for (i = 0; i < (1 << compound_order(page)); i++) {
+			for (i = 0; i < compound_nr(page); i++) {
 				void *addr = kmap_high_get(page + i);
 				if (addr) {
 					__cpuc_flush_dcache_area(addr, PAGE_SIZE);
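[ The conversions above rely on two helpers introduced this cycle in
  include/linux/mm.h; roughly:

	/* Number of bytes mapped by a (possibly compound) page. */
	static inline unsigned long page_size(struct page *page)
	{
		return PAGE_SIZE << compound_order(page);
	}

	/* Number of base pages in a (possibly compound) page. */
	static inline unsigned long compound_nr(struct page *page)
	{
		return 1UL << compound_order(page);
	} ]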
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index f866870..b8d912a 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -17,33 +17,6 @@
 	((((addr)+SHMLBA-1)&~(SHMLBA-1)) +	\
 	 (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1)))
 
-/* gap between mmap and stack */
-#define MIN_GAP (128*1024*1024UL)
-#define MAX_GAP ((TASK_SIZE)/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
-	if (current->personality & ADDR_COMPAT_LAYOUT)
-		return 1;
-
-	if (rlim_stack->rlim_cur == RLIM_INFINITY)
-		return 1;
-
-	return sysctl_legacy_va_layout;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
-	unsigned long gap = rlim_stack->rlim_cur;
-
-	if (gap < MIN_GAP)
-		gap = MIN_GAP;
-	else if (gap > MAX_GAP)
-		gap = MAX_GAP;
-
-	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
-}
-
 /*
  * We need to ensure that shared mappings are correctly aligned to
  * avoid aliasing issues with VIPT caches.  We need to ensure that
@@ -171,31 +144,6 @@
 	return addr;
 }
 
-unsigned long arch_mmap_rnd(void)
-{
-	unsigned long rnd;
-
-	rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-
-	return rnd << PAGE_SHIFT;
-}
-
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
-	unsigned long random_factor = 0UL;
-
-	if (current->flags & PF_RANDOMIZE)
-		random_factor = arch_mmap_rnd();
-
-	if (mmap_is_legacy(rlim_stack)) {
-		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
-		mm->get_unmapped_area = arch_get_unmapped_area;
-	} else {
-		mm->mmap_base = mmap_base(random_factor, rlim_stack);
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-	}
-}
-
 /*
  * You really shouldn't be using read() or write() on /dev/mem.  This
  * might go away in the future.
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 25da9b2..48c2888 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -731,7 +731,7 @@
 {
 	void *ptr = (void *)__get_free_pages(GFP_PGTABLE_KERNEL, get_order(sz));
 
-	if (!ptr || !pgtable_page_ctor(virt_to_page(ptr)))
+	if (!ptr || !pgtable_pte_page_ctor(virt_to_page(ptr)))
 		BUG();
 	return ptr;
 }
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 37c6109..41a9b42 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -15,7 +15,6 @@
 	select ARCH_HAS_DMA_COHERENT_TO_PFN
 	select ARCH_HAS_DMA_PREP_COHERENT
 	select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
-	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
@@ -71,6 +70,7 @@
 	select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
+	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
@@ -982,7 +982,7 @@
 	  for kernel and initramfs as opposed to list of segments as
 	  accepted by previous system call.
 
-config KEXEC_VERIFY_SIG
+config KEXEC_SIG
 	bool "Verify kernel signature during kexec_file_load() syscall"
 	depends on KEXEC_FILE
 	help
@@ -997,13 +997,13 @@
 config KEXEC_IMAGE_VERIFY_SIG
 	bool "Enable Image signature verification support"
 	default y
-	depends on KEXEC_VERIFY_SIG
+	depends on KEXEC_SIG
 	depends on EFI && SIGNED_PE_FILE_VERIFICATION
 	help
 	  Enable Image signature verification support.
 
 comment "Support for PE file signature verification disabled"
-	depends on KEXEC_VERIFY_SIG
+	depends on KEXEC_SIG
 	depends on !EFI || !SIGNED_PE_FILE_VERIFICATION
 
 config CRASH_DUMP
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi
index 124a7e2..3379193 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi
@@ -486,7 +486,6 @@
 			#address-cells = <3>;
 			#size-cells = <2>;
 			device_type = "pci";
-			num-lanes = <4>;
 			num-viewport = <2>;
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000   /* downstream I/O */
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
index 71d9ed9f..c084c7a4 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi
@@ -677,7 +677,6 @@
 			#size-cells = <2>;
 			device_type = "pci";
 			dma-coherent;
-			num-lanes = <4>;
 			num-viewport = <6>;
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000   /* downstream I/O */
@@ -704,7 +703,6 @@
 			#size-cells = <2>;
 			device_type = "pci";
 			dma-coherent;
-			num-lanes = <2>;
 			num-viewport = <6>;
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000   /* downstream I/O */
@@ -731,7 +729,6 @@
 			#size-cells = <2>;
 			device_type = "pci";
 			dma-coherent;
-			num-lanes = <2>;
 			num-viewport = <6>;
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x50 0x00010000 0x0 0x00010000   /* downstream I/O */
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
index b0ef08b..d4c1da3 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
@@ -649,7 +649,6 @@
 			#size-cells = <2>;
 			device_type = "pci";
 			dma-coherent;
-			num-lanes = <4>;
 			num-viewport = <8>;
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000   /* downstream I/O */
@@ -671,7 +670,6 @@
 			reg-names = "regs", "addr_space";
 			num-ib-windows = <6>;
 			num-ob-windows = <8>;
-			num-lanes = <2>;
 			status = "disabled";
 		};
 
@@ -687,7 +685,6 @@
 			#size-cells = <2>;
 			device_type = "pci";
 			dma-coherent;
-			num-lanes = <2>;
 			num-viewport = <8>;
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000   /* downstream I/O */
@@ -709,7 +706,6 @@
 			reg-names = "regs", "addr_space";
 			num-ib-windows = <6>;
 			num-ob-windows = <8>;
-			num-lanes = <2>;
 			status = "disabled";
 		};
 
@@ -725,7 +721,6 @@
 			#size-cells = <2>;
 			device_type = "pci";
 			dma-coherent;
-			num-lanes = <2>;
 			num-viewport = <8>;
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x50 0x00010000 0x0 0x00010000   /* downstream I/O */
@@ -747,7 +742,6 @@
 			reg-names = "regs", "addr_space";
 			num-ib-windows = <6>;
 			num-ob-windows = <8>;
-			num-lanes = <2>;
 			status = "disabled";
 		};
 
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
index d1469b0..c676d07 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
@@ -469,7 +469,6 @@
 			#size-cells = <2>;
 			device_type = "pci";
 			dma-coherent;
-			num-lanes = <4>;
 			num-viewport = <256>;
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x20 0x00010000 0x0 0x00010000   /* downstream I/O */
@@ -495,7 +494,6 @@
 			#size-cells = <2>;
 			device_type = "pci";
 			dma-coherent;
-			num-lanes = <4>;
 			num-viewport = <6>;
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x28 0x00010000 0x0 0x00010000   /* downstream I/O */
@@ -521,7 +519,6 @@
 			#size-cells = <2>;
 			device_type = "pci";
 			dma-coherent;
-			num-lanes = <8>;
 			num-viewport = <6>;
 			bus-range = <0x0 0xff>;
 			ranges = <0x81000000 0x0 0x00000000 0x30 0x00010000 0x0 0x00010000   /* downstream I/O */
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
index 64101c9..7a0be8e 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi
@@ -639,7 +639,6 @@
 			#size-cells = <2>;
 			device_type = "pci";
 			dma-coherent;
-			num-lanes = <4>;
 			num-viewport = <6>;
 			bus-range = <0x0 0xff>;
 			msi-parent = <&its>;
@@ -661,7 +660,6 @@
 			#size-cells = <2>;
 			device_type = "pci";
 			dma-coherent;
-			num-lanes = <4>;
 			num-viewport = <6>;
 			bus-range = <0x0 0xff>;
 			msi-parent = <&its>;
@@ -683,7 +681,6 @@
 			#size-cells = <2>;
 			device_type = "pci";
 			dma-coherent;
-			num-lanes = <8>;
 			num-viewport = <256>;
 			bus-range = <0x0 0xff>;
 			msi-parent = <&its>;
@@ -705,7 +702,6 @@
 			#size-cells = <2>;
 			device_type = "pci";
 			dma-coherent;
-			num-lanes = <4>;
 			num-viewport = <6>;
 			bus-range = <0x0 0xff>;
 			msi-parent = <&its>;
diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi b/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi
index 62e07e11..4c38426 100644
--- a/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi
@@ -289,5 +289,29 @@
 			gpio = <&gpio TEGRA194_MAIN_GPIO(A, 3) GPIO_ACTIVE_HIGH>;
 			enable-active-high;
 		};
+
+		vdd_3v3_pcie: regulator@2 {
+			compatible = "regulator-fixed";
+			reg = <2>;
+
+			regulator-name = "PEX_3V3";
+			regulator-min-microvolt = <3300000>;
+			regulator-max-microvolt = <3300000>;
+			gpio = <&gpio TEGRA194_MAIN_GPIO(Z, 2) GPIO_ACTIVE_HIGH>;
+			regulator-boot-on;
+			enable-active-high;
+		};
+
+		vdd_12v_pcie: regulator@3 {
+			compatible = "regulator-fixed";
+			reg = <3>;
+
+			regulator-name = "VDD_12V";
+			regulator-min-microvolt = <1200000>;
+			regulator-max-microvolt = <1200000>;
+			gpio = <&gpio TEGRA194_MAIN_GPIO(A, 1) GPIO_ACTIVE_LOW>;
+			regulator-boot-on;
+			enable-active-low;
+		};
 	};
 };
diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts b/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts
index 23597d5..d47cd8c 100644
--- a/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts
+++ b/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts
@@ -93,9 +93,11 @@
 	};
 
 	pcie@141a0000 {
-		status = "disabled";
+		status = "okay";
 
 		vddio-pex-ctl-supply = <&vdd_1v8ao>;
+		vpcie3v3-supply = <&vdd_3v3_pcie>;
+		vpcie12v-supply = <&vdd_12v_pcie>;
 
 		phys = <&p2u_nvhs_0>, <&p2u_nvhs_1>, <&p2u_nvhs_2>,
 		       <&p2u_nvhs_3>, <&p2u_nvhs_4>, <&p2u_nvhs_5>,
diff --git a/arch/arm64/boot/dts/nvidia/tegra194.dtsi b/arch/arm64/boot/dts/nvidia/tegra194.dtsi
index adebbbf..3c0cf54 100644
--- a/arch/arm64/boot/dts/nvidia/tegra194.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra194.dtsi
@@ -3,8 +3,9 @@
 #include <dt-bindings/gpio/tegra194-gpio.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 #include <dt-bindings/mailbox/tegra186-hsp.h>
-#include <dt-bindings/reset/tegra194-reset.h>
+#include <dt-bindings/pinctrl/pinctrl-tegra.h>
 #include <dt-bindings/power/tegra194-powergate.h>
+#include <dt-bindings/reset/tegra194-reset.h>
 #include <dt-bindings/thermal/tegra194-bpmp-thermal.h>
 
 / {
@@ -130,6 +131,38 @@
 			};
 		};
 
+		pinmux: pinmux@2430000 {
+			compatible = "nvidia,tegra194-pinmux";
+			reg = <0x2430000 0x17000
+			       0xc300000 0x4000>;
+
+			status = "okay";
+
+			pex_rst_c5_out_state: pex_rst_c5_out {
+				pex_rst {
+					nvidia,pins = "pex_l5_rst_n_pgg1";
+					nvidia,schmitt = <TEGRA_PIN_DISABLE>;
+					nvidia,lpdr = <TEGRA_PIN_ENABLE>;
+					nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+					nvidia,io-high-voltage = <TEGRA_PIN_ENABLE>;
+					nvidia,tristate = <TEGRA_PIN_DISABLE>;
+					nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+				};
+			};
+
+			clkreq_c5_bi_dir_state: clkreq_c5_bi_dir {
+				clkreq {
+					nvidia,pins = "pex_l5_clkreq_n_pgg0";
+					nvidia,schmitt = <TEGRA_PIN_DISABLE>;
+					nvidia,lpdr = <TEGRA_PIN_ENABLE>;
+					nvidia,enable-input = <TEGRA_PIN_ENABLE>;
+					nvidia,io-high-voltage = <TEGRA_PIN_ENABLE>;
+					nvidia,tristate = <TEGRA_PIN_DISABLE>;
+					nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+				};
+			};
+		};
+
 		uarta: serial@3100000 {
 			compatible = "nvidia,tegra194-uart", "nvidia,tegra20-uart";
 			reg = <0x03100000 0x40>;
@@ -1365,6 +1398,9 @@
 		num-viewport = <8>;
 		linux,pci-domain = <5>;
 
+		pinctrl-names = "default";
+		pinctrl-0 = <&pex_rst_c5_out_state>, <&clkreq_c5_bi_dir_state>;
+
 		clocks = <&bpmp TEGRA194_CLK_PEX1_CORE_5>,
 			<&bpmp TEGRA194_CLK_PEX1_CORE_5M>;
 		clock-names = "core", "core_m";
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 14d0bc4..172d76f 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -15,8 +15,6 @@
 
 #include <asm-generic/pgalloc.h>	/* for pte_{alloc,free}_one */
 
-#define check_pgt_cache()		do { } while (0)
-
 #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
 
 #if CONFIG_PGTABLE_LEVELS > 2
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 57427d1..7576df0 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -861,8 +861,6 @@
 
 #include <asm-generic/pgtable.h>
 
-static inline void pgtable_cache_init(void) { }
-
 /*
  * On AArch64, the cache coherency is handled via the set_pte_at() function.
  */
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index c67848c..5623685 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -280,8 +280,6 @@
 		     "nop") : : "p" (ptr));
 }
 
-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
 extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */
 extern void __init minsigstksz_setup(void);
 
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index a95d1fc..b76df82 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -44,7 +44,7 @@
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
 				  unsigned long addr)
 {
-	pgtable_page_dtor(pte);
+	pgtable_pte_page_dtor(pte);
 	tlb_remove_table(tlb, pte);
 }
 
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 03689c0..a47462d 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -557,14 +557,6 @@
 	return sp & ~0xf;
 }
 
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	if (is_compat_task())
-		return randomize_page(mm->brk, SZ_32M);
-	else
-		return randomize_page(mm->brk, SZ_1G);
-}
-
 /*
  * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY.
  */
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index dc19300..ac48516 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -56,8 +56,7 @@
 	struct page *page = pte_page(pte);
 
 	if (!test_and_set_bit(PG_dcache_clean, &page->flags))
-		sync_icache_aliases(page_address(page),
-				    PAGE_SIZE << compound_order(page));
+		sync_icache_aliases(page_address(page), page_size(page));
 }
 EXPORT_SYMBOL_GPL(__sync_icache_dcache);
 
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index b050641..3028bac 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -21,78 +21,6 @@
 #include <asm/cputype.h>
 
 /*
- * Leave enough space between the mmap area and the stack to honour ulimit in
- * the face of randomisation.
- */
-#define MIN_GAP (SZ_128M)
-#define MAX_GAP	(STACK_TOP/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
-	if (current->personality & ADDR_COMPAT_LAYOUT)
-		return 1;
-
-	if (rlim_stack->rlim_cur == RLIM_INFINITY)
-		return 1;
-
-	return sysctl_legacy_va_layout;
-}
-
-unsigned long arch_mmap_rnd(void)
-{
-	unsigned long rnd;
-
-#ifdef CONFIG_COMPAT
-	if (test_thread_flag(TIF_32BIT))
-		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
-	else
-#endif
-		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-	return rnd << PAGE_SHIFT;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
-	unsigned long gap = rlim_stack->rlim_cur;
-	unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap;
-
-	/* Values close to RLIM_INFINITY can overflow. */
-	if (gap + pad > gap)
-		gap += pad;
-
-	if (gap < MIN_GAP)
-		gap = MIN_GAP;
-	else if (gap > MAX_GAP)
-		gap = MAX_GAP;
-
-	return PAGE_ALIGN(STACK_TOP - gap - rnd);
-}
-
-/*
- * This function, called very early during the creation of a new process VM
- * image, sets up which VM layout function to use:
- */
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
-	unsigned long random_factor = 0UL;
-
-	if (current->flags & PF_RANDOMIZE)
-		random_factor = arch_mmap_rnd();
-
-	/*
-	 * Fall back to the standard layout if the personality bit is set, or
-	 * if the expected stack growth is unlimited:
-	 */
-	if (mmap_is_legacy(rlim_stack)) {
-		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
-		mm->get_unmapped_area = arch_get_unmapped_area;
-	} else {
-		mm->mmap_base = mmap_base(random_factor, rlim_stack);
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-	}
-}
-
-/*
  * You really shouldn't be using read() or write() on /dev/mem.  This might go
  * away in the future.
  */
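[ Note on the deleted mmap_base(): the overflow guard it carried survives
  in the generic copy.  gap is unsigned, so near RLIM_INFINITY the addition
  can wrap; the pad is applied only when it does not:

	/* Values close to RLIM_INFINITY can overflow. */
	if (gap + pad > gap)	/* false if gap + pad wrapped around */
		gap += pad;
]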
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 53dc6f2..60c929f 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -384,7 +384,7 @@
 	 * folded, and if so pgtable_pmd_page_ctor() becomes nop.
 	 */
 	if (shift == PAGE_SHIFT)
-		BUG_ON(!pgtable_page_ctor(phys_to_page(pa)));
+		BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa)));
 	else if (shift == PMD_SHIFT)
 		BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa)));
 
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index 7548f9c..4a64089 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -35,7 +35,7 @@
 		kmem_cache_free(pgd_cache, pgd);
 }
 
-void __init pgd_cache_init(void)
+void __init pgtable_cache_init(void)
 {
 	if (PGD_SIZE == PAGE_SIZE)
 		return;
diff --git a/arch/c6x/include/asm/pgtable.h b/arch/c6x/include/asm/pgtable.h
index 0bd8059..0b6919c 100644
--- a/arch/c6x/include/asm/pgtable.h
+++ b/arch/c6x/include/asm/pgtable.h
@@ -60,11 +60,6 @@
 #define swapper_pg_dir ((pgd_t *) 0)
 
 /*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()   do { } while (0)
-
-/*
  * c6x is !MMU, so define the simplest implementation
  */
 #define pgprot_writecombine pgprot_noncached
diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h
index 98c571670..c7c1ed2 100644
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@@ -71,12 +71,10 @@
 
 #define __pte_free_tlb(tlb, pte, address)		\
 do {							\
-	pgtable_page_dtor(pte);				\
+	pgtable_pte_page_dtor(pte);			\
 	tlb_remove_page(tlb, pte);			\
 } while (0)
 
-#define check_pgt_cache()	do {} while (0)
-
 extern void pagetable_init(void);
 extern void pre_mmu_init(void);
 extern void pre_trap_init(void);
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index c429a6f..0040b3a 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -296,11 +296,6 @@
 /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
 #define kern_addr_valid(addr)	(1)
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do {} while (0)
-
 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
 	remap_pfn_range(vma, vaddr, pfn, size, prot)
 
diff --git a/arch/h8300/include/asm/pgtable.h b/arch/h8300/include/asm/pgtable.h
index a99caa4..4d00152 100644
--- a/arch/h8300/include/asm/pgtable.h
+++ b/arch/h8300/include/asm/pgtable.h
@@ -4,7 +4,6 @@
 #define __ARCH_USE_5LEVEL_HACK
 #include <asm-generic/pgtable-nopud.h>
 #include <asm-generic/pgtable.h>
-#define pgtable_cache_init()   do { } while (0)
 extern void paging_init(void);
 #define PAGE_NONE		__pgprot(0)    /* these mean nothing to NO_MM */
 #define PAGE_SHARED		__pgprot(0)    /* these mean nothing to NO_MM */
@@ -35,11 +34,6 @@
 extern int is_in_rom(unsigned long);
 
 /*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()   do { } while (0)
-
-/*
  * All 32bit addresses are effectively valid for vmalloc...
  * Sort of meaningless for non-VM targets.
  */
diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h
index d6544dc..cc9be514 100644
--- a/arch/hexagon/include/asm/pgalloc.h
+++ b/arch/hexagon/include/asm/pgalloc.h
@@ -13,8 +13,6 @@
 
 #include <asm-generic/pgalloc.h>	/* for pte_{alloc,free}_one */
 
-#define check_pgt_cache() do {} while (0)
-
 extern unsigned long long kmap_generation;
 
 /*
@@ -96,7 +94,7 @@
 
 #define __pte_free_tlb(tlb, pte, addr)		\
 do {						\
-	pgtable_page_dtor((pte));		\
+	pgtable_pte_page_dtor((pte));		\
 	tlb_remove_page((tlb), (pte));		\
 } while (0)
 
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h
index a3ff6d2..2fec20a 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h
@@ -431,9 +431,6 @@
 
 #define __pte_offset(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 
-/*  I think this is in case we have page table caches; needed by init/main.c  */
-#define pgtable_cache_init()    do { } while (0)
-
 /*
  * Swap/file PTE definitions.  If _PAGE_PRESENT is zero, the rest of the PTE is
  * interpreted as swap information.  The remaining free bits are interpreted as
diff --git a/arch/hexagon/mm/Makefile b/arch/hexagon/mm/Makefile
index 1894263..8938384 100644
--- a/arch/hexagon/mm/Makefile
+++ b/arch/hexagon/mm/Makefile
@@ -3,5 +3,5 @@
 # Makefile for Hexagon memory management subsystem
 #
 
-obj-y := init.o pgalloc.o ioremap.o uaccess.o vm_fault.o cache.o
+obj-y := init.o ioremap.o uaccess.o vm_fault.o cache.o
 obj-y += copy_to_user.o copy_from_user.o strnlen_user.o vm_tlb.o
diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c
index f1f6ebd..c961773 100644
--- a/arch/hexagon/mm/init.c
+++ b/arch/hexagon/mm/init.c
@@ -71,19 +71,6 @@
 	init_mm.context.ptbase = __pa(init_mm.pgd);
 }
 
-/*
- * free_initrd_mem - frees...  initrd memory.
- * @start - start of init memory
- * @end - end of init memory
- *
- * Apparently has to be passed the address of the initrd memory.
- *
- * Wrapped by #ifdef CONFIG_BLKDEV_INITRD
- */
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-}
-
 void sync_icache_dcache(pte_t pte)
 {
 	unsigned long addr;
diff --git a/arch/hexagon/mm/pgalloc.c b/arch/hexagon/mm/pgalloc.c
deleted file mode 100644
index 4d43161..0000000
--- a/arch/hexagon/mm/pgalloc.c
+++ /dev/null
@@ -1,10 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
- */
-
-#include <linux/init.h>
-
-void __init pgtable_cache_init(void)
-{
-}
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 685a3df..16714477 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -72,10 +72,6 @@
 config ZONE_DMA32
 	def_bool y
 
-config QUICKLIST
-	bool
-	default y
-
 config MMU
 	bool
 	default y
diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h
index c9e4810..f4c4910 100644
--- a/arch/ia64/include/asm/pgalloc.h
+++ b/arch/ia64/include/asm/pgalloc.h
@@ -19,18 +19,19 @@
 #include <linux/mm.h>
 #include <linux/page-flags.h>
 #include <linux/threads.h>
-#include <linux/quicklist.h>
+
+#include <asm-generic/pgalloc.h>
 
 #include <asm/mmu_context.h>
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	return quicklist_alloc(0, GFP_KERNEL, NULL);
+	return (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 }
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	quicklist_free(0, NULL, pgd);
+	free_page((unsigned long)pgd);
 }
 
 #if CONFIG_PGTABLE_LEVELS == 4
@@ -42,12 +43,12 @@
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return quicklist_alloc(0, GFP_KERNEL, NULL);
+	return (pud_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 }
 
 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
-	quicklist_free(0, NULL, pud);
+	free_page((unsigned long)pud);
 }
 #define __pud_free_tlb(tlb, pud, address)	pud_free((tlb)->mm, pud)
 #endif /* CONFIG_PGTABLE_LEVELS == 4 */
@@ -60,12 +61,12 @@
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return quicklist_alloc(0, GFP_KERNEL, NULL);
+	return (pmd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 }
 
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
-	quicklist_free(0, NULL, pmd);
+	free_page((unsigned long)pmd);
 }
 
 #define __pmd_free_tlb(tlb, pmd, address)	pmd_free((tlb)->mm, pmd)
@@ -83,43 +84,6 @@
 	pmd_val(*pmd_entry) = __pa(pte);
 }
 
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
-{
-	struct page *page;
-	void *pg;
-
-	pg = quicklist_alloc(0, GFP_KERNEL, NULL);
-	if (!pg)
-		return NULL;
-	page = virt_to_page(pg);
-	if (!pgtable_page_ctor(page)) {
-		quicklist_free(0, NULL, pg);
-		return NULL;
-	}
-	return page;
-}
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
-	return quicklist_alloc(0, GFP_KERNEL, NULL);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
-	pgtable_page_dtor(pte);
-	quicklist_free_page(0, NULL, pte);
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-	quicklist_free(0, NULL, pte);
-}
-
-static inline void check_pgt_cache(void)
-{
-	quicklist_trim(0, NULL, 25, 16);
-}
-
 #define __pte_free_tlb(tlb, pte, address)	pte_free((tlb)->mm, pte)
 
 #endif				/* _ASM_IA64_PGALLOC_H */
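[ With the quicklist helpers gone, ia64 picks up pte_alloc_one() and
  friends from <asm-generic/pgalloc.h> (included above).  Their shape in
  this series, approximately:

	static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
	{
		return (pte_t *)__get_free_page(GFP_PGTABLE_KERNEL);
	}

	static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
	{
		struct page *pte;

		pte = alloc_page(gfp);
		if (!pte)
			return NULL;
		if (!pgtable_pte_page_ctor(pte)) {
			__free_page(pte);
			return NULL;
		}
		return pte;
	} ]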
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index b1e7468e..d602e7c 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -566,11 +566,6 @@
 #define KERNEL_TR_PAGE_SHIFT	_PAGE_SIZE_64M
 #define KERNEL_TR_PAGE_SIZE	(1 << KERNEL_TR_PAGE_SHIFT)
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 /* These tell get_user_pages() that the first gate page is accessible from user-level.  */
 #define FIXADDR_USER_START	GATE_ADDR
 #ifdef HAVE_BUGGY_SEGREL
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index f102084..8e91c86 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -583,6 +583,7 @@
 static irqreturn_t dummy_handler (int irq, void *dev_id)
 {
 	BUG();
+	return IRQ_NONE;
 }
 
 static struct irqaction ipi_irqaction = {
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index db09a69..5b00dc3 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -108,7 +108,6 @@
 	struct pcpu_group_info *gi;
 	unsigned int cpu;
 	ssize_t static_size, reserved_size, dyn_size;
-	int rc;
 
 	ai = pcpu_alloc_alloc_info(1, num_possible_cpus());
 	if (!ai)
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 219fc64..4f33f6e 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -186,7 +186,7 @@
 	unsigned long base_offset;
 	unsigned int cpu;
 	ssize_t static_size, reserved_size, dyn_size;
-	int node, prev_node, unit, nr_units, rc;
+	int node, prev_node, unit, nr_units;
 
 	ai = pcpu_alloc_alloc_info(MAX_NUMNODES, nr_cpu_ids);
 	if (!ai)
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 678b98a..bf9df26 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -64,7 +64,7 @@
 	if (test_bit(PG_arch_1, &page->flags))
 		return;				/* i-cache is already coherent with d-cache */
 
-	flush_icache_range(addr, addr + (PAGE_SIZE << compound_order(page)));
+	flush_icache_range(addr, addr + page_size(page));
 	set_bit(PG_arch_1, &page->flags);	/* mark page as clean */
 }
 
diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h
index 4399d71..b34d44d 100644
--- a/arch/m68k/include/asm/mcf_pgalloc.h
+++ b/arch/m68k/include/asm/mcf_pgalloc.h
@@ -41,7 +41,7 @@
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page,
 				  unsigned long address)
 {
-	pgtable_page_dtor(page);
+	pgtable_pte_page_dtor(page);
 	__free_page(page);
 }
 
@@ -54,7 +54,7 @@
 
 	if (!page)
 		return NULL;
-	if (!pgtable_page_ctor(page)) {
+	if (!pgtable_pte_page_ctor(page)) {
 		__free_page(page);
 		return NULL;
 	}
@@ -73,7 +73,7 @@
 
 static inline void pte_free(struct mm_struct *mm, struct page *page)
 {
-	pgtable_page_dtor(page);
+	pgtable_pte_page_dtor(page);
 	__free_page(page);
 }
 
diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h
index d04d9ba..acab315 100644
--- a/arch/m68k/include/asm/motorola_pgalloc.h
+++ b/arch/m68k/include/asm/motorola_pgalloc.h
@@ -36,7 +36,7 @@
 	page = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0);
 	if(!page)
 		return NULL;
-	if (!pgtable_page_ctor(page)) {
+	if (!pgtable_pte_page_ctor(page)) {
 		__free_page(page);
 		return NULL;
 	}
@@ -51,7 +51,7 @@
 
 static inline void pte_free(struct mm_struct *mm, pgtable_t page)
 {
-	pgtable_page_dtor(page);
+	pgtable_pte_page_dtor(page);
 	cache_page(kmap(page));
 	kunmap(page);
 	__free_page(page);
@@ -60,7 +60,7 @@
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page,
 				  unsigned long address)
 {
-	pgtable_page_dtor(page);
+	pgtable_pte_page_dtor(page);
 	cache_page(kmap(page));
 	kunmap(page);
 	__free_page(page);
diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h
index fde4534..646c174 100644
--- a/arch/m68k/include/asm/pgtable_mm.h
+++ b/arch/m68k/include/asm/pgtable_mm.h
@@ -176,11 +176,4 @@
 #include <asm-generic/pgtable.h>
 #endif /* !__ASSEMBLY__ */
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
-#define check_pgt_cache()	do { } while (0)
-
 #endif /* _M68K_PGTABLE_H */
diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h
index fc3a96c..c18165b 100644
--- a/arch/m68k/include/asm/pgtable_no.h
+++ b/arch/m68k/include/asm/pgtable_no.h
@@ -45,11 +45,6 @@
 #define ZERO_PAGE(vaddr)	(virt_to_page(0))
 
 /*
- * No page table caches to initialise.
- */
-#define pgtable_cache_init()	do { } while (0)
-
-/*
  * All 32bit addresses are effectively valid for vmalloc...
  * Sort of meaningless for non-VM targets.
  */
@@ -60,6 +55,4 @@
 
 #include <asm-generic/pgtable.h>
 
-#define check_pgt_cache()	do { } while (0)
-
 #endif /* _M68KNOMMU_PGTABLE_H */
diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h
index 1a8ddbd..8561211 100644
--- a/arch/m68k/include/asm/sun3_pgalloc.h
+++ b/arch/m68k/include/asm/sun3_pgalloc.h
@@ -21,7 +21,7 @@
 
 #define __pte_free_tlb(tlb,pte,addr)			\
 do {							\
-	pgtable_page_dtor(pte);				\
+	pgtable_pte_page_dtor(pte);			\
 	tlb_remove_page((tlb), pte);			\
 } while (0)
 
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 632c947..c9c4be8 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -5,15 +5,18 @@
 	select ARCH_NO_SWAP
 	select ARCH_HAS_BINFMT_FLAT if !MMU
 	select ARCH_HAS_DMA_COHERENT_TO_PFN if MMU
+	select ARCH_HAS_DMA_PREP_COHERENT
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_SYNC_DMA_FOR_CPU
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
+	select ARCH_HAS_UNCACHED_SEGMENT if !MMU
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BUILDTIME_EXTABLE_SORT
 	select TIMER_OF
 	select CLONE_BACKWARDS3
 	select COMMON_CLK
+	select DMA_DIRECT_REMAP if MMU
 	select GENERIC_ATOMIC64
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_CPU_DEVICES
diff --git a/arch/microblaze/boot/dts/system.dts b/arch/microblaze/boot/dts/system.dts
index 5a8a9d0..5b23652 100644
--- a/arch/microblaze/boot/dts/system.dts
+++ b/arch/microblaze/boot/dts/system.dts
@@ -18,7 +18,6 @@
 	#address-cells = <1>;
 	#size-cells = <1>;
 	compatible = "xlnx,microblaze";
-	hard-reset-gpios = <&LEDs_8Bit 2 1>;
 	model = "testing";
 	DDR2_SDRAM: memory@90000000 {
 		device_type = "memory";
@@ -281,6 +280,21 @@
 				gpios = <&LEDs_8Bit 7 1>;
 			};
 		} ;
+
+		gpio-restart {
+			compatible = "gpio-restart";
+			/*
+			 * FIXME: is this active low or active high?
+			 * The current flag (1) indicates active low.
+			 * The delay values below are placeholders; adjust
+			 * them per the datasheet or by trial and error on
+			 * real hardware.
+			 */
+			gpios = <&LEDs_8Bit 2 1>;
+			active-delay = <100>;
+			inactive-delay = <10>;
+			wait-delay = <100>;
+		};
+
 		RS232_Uart_1: serial@84000000 {
 			clock-frequency = <125000000>;
 			compatible = "xlnx,xps-uartlite-1.00.a";
diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig
index 92fd4e95..654edfd 100644
--- a/arch/microblaze/configs/mmu_defconfig
+++ b/arch/microblaze/configs/mmu_defconfig
@@ -5,15 +5,10 @@
 CONFIG_IKCONFIG_PROC=y
 CONFIG_SYSFS_DEPRECATED=y
 CONFIG_SYSFS_DEPRECATED_V2=y
-CONFIG_KALLSYMS_ALL=y
 # CONFIG_BASE_FULL is not set
+CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
 CONFIG_SLAB=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-CONFIG_PARTITION_ADVANCED=y
-# CONFIG_EFI_PARTITION is not set
 CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR=1
 CONFIG_XILINX_MICROBLAZE0_USE_PCMP_INSTR=1
 CONFIG_XILINX_MICROBLAZE0_USE_BARREL=1
@@ -25,14 +20,19 @@
 CONFIG_CMDLINE_BOOL=y
 CONFIG_CMDLINE_FORCE=y
 CONFIG_HIGHMEM=y
-CONFIG_PCI=y
 CONFIG_PCI_XILINX=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_BLK_DEV_BSG is not set
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_EFI_PARTITION is not set
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
 # CONFIG_IPV6 is not set
 CONFIG_BRIDGE=m
+CONFIG_PCI=y
 CONFIG_MTD=y
 CONFIG_MTD_CFI=y
 CONFIG_MTD_CFI_INTELEXT=y
@@ -41,6 +41,7 @@
 CONFIG_BLK_DEV_RAM_SIZE=8192
 CONFIG_NETDEVICES=y
 CONFIG_XILINX_EMACLITE=y
+CONFIG_XILINX_AXI_EMAC=y
 CONFIG_XILINX_LL_TEMAC=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
@@ -59,6 +60,8 @@
 CONFIG_GPIOLIB=y
 CONFIG_GPIO_SYSFS=y
 CONFIG_GPIO_XILINX=y
+CONFIG_POWER_RESET=y
+CONFIG_POWER_RESET_GPIO_RESTART=y
 # CONFIG_HWMON is not set
 CONFIG_WATCHDOG=y
 CONFIG_XILINX_WATCHDOG=y
@@ -74,8 +77,8 @@
 CONFIG_ROMFS_FS=y
 CONFIG_NFS_FS=y
 CONFIG_CIFS=y
-CONFIG_CIFS_STATS=y
 CONFIG_CIFS_STATS2=y
+CONFIG_ENCRYPTED_KEYS=y
 CONFIG_DEBUG_INFO=y
 CONFIG_DEBUG_SLAB=y
 CONFIG_DETECT_HUNG_TASK=y
@@ -83,6 +86,3 @@
 CONFIG_KGDB=y
 CONFIG_KGDB_TESTS=y
 CONFIG_KGDB_KDB=y
-CONFIG_EARLY_PRINTK=y
-CONFIG_KEYS=y
-CONFIG_ENCRYPTED_KEYS=y
diff --git a/arch/microblaze/configs/nommu_defconfig b/arch/microblaze/configs/nommu_defconfig
index 06d69a6..377de39c 100644
--- a/arch/microblaze/configs/nommu_defconfig
+++ b/arch/microblaze/configs/nommu_defconfig
@@ -7,15 +7,10 @@
 CONFIG_IKCONFIG_PROC=y
 CONFIG_SYSFS_DEPRECATED=y
 CONFIG_SYSFS_DEPRECATED_V2=y
-CONFIG_KALLSYMS_ALL=y
 # CONFIG_BASE_FULL is not set
+CONFIG_KALLSYMS_ALL=y
 CONFIG_EMBEDDED=y
 CONFIG_SLAB=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-# CONFIG_BLK_DEV_BSG is not set
-CONFIG_PARTITION_ADVANCED=y
-# CONFIG_EFI_PARTITION is not set
 CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR=1
 CONFIG_XILINX_MICROBLAZE0_USE_PCMP_INSTR=1
 CONFIG_XILINX_MICROBLAZE0_USE_BARREL=1
@@ -25,13 +20,18 @@
 CONFIG_HZ_100=y
 CONFIG_CMDLINE_BOOL=y
 CONFIG_CMDLINE_FORCE=y
-CONFIG_PCI=y
 CONFIG_PCI_XILINX=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_BLK_DEV_BSG is not set
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_EFI_PARTITION is not set
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
 # CONFIG_IPV6 is not set
+CONFIG_PCI=y
 CONFIG_MTD=y
 CONFIG_MTD_CMDLINE_PARTS=y
 CONFIG_MTD_BLOCK=y
@@ -62,6 +62,8 @@
 CONFIG_GPIOLIB=y
 CONFIG_GPIO_SYSFS=y
 CONFIG_GPIO_XILINX=y
+CONFIG_POWER_RESET=y
+CONFIG_POWER_RESET_GPIO_RESTART=y
 # CONFIG_HWMON is not set
 CONFIG_WATCHDOG=y
 CONFIG_XILINX_WATCHDOG=y
@@ -75,11 +77,6 @@
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3_ACL=y
 CONFIG_NLS=y
-CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_SLAB=y
-CONFIG_DETECT_HUNG_TASK=y
-CONFIG_DEBUG_SPINLOCK=y
-CONFIG_EARLY_PRINTK=y
 CONFIG_KEYS=y
 CONFIG_ENCRYPTED_KEYS=y
 CONFIG_CRYPTO_ECB=y
@@ -87,3 +84,7 @@
 CONFIG_CRYPTO_MD5=y
 CONFIG_CRYPTO_ARC4=y
 CONFIG_CRYPTO_DES=y
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_SLAB=y
+CONFIG_DETECT_HUNG_TASK=y
+CONFIG_DEBUG_SPINLOCK=y
diff --git a/arch/microblaze/include/asm/io.h b/arch/microblaze/include/asm/io.h
index c796813..86c95b2 100644
--- a/arch/microblaze/include/asm/io.h
+++ b/arch/microblaze/include/asm/io.h
@@ -40,7 +40,6 @@
 
 extern void __iomem *ioremap(phys_addr_t address, unsigned long size);
 #define ioremap_nocache(addr, size)		ioremap((addr), (size))
-#define ioremap_fullcache(addr, size)		ioremap((addr), (size))
 #define ioremap_wc(addr, size)			ioremap((addr), (size))
 #define ioremap_wt(addr, size)			ioremap((addr), (size))
 
diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h
index 21ddba9..7c4dc5d 100644
--- a/arch/microblaze/include/asm/pci.h
+++ b/arch/microblaze/include/asm/pci.h
@@ -66,8 +66,6 @@
 					 unsigned long size,
 					 pgprot_t prot);
 
-#define HAVE_ARCH_PCI_RESOURCE_TO_USER
-
 /* This part of code was originally in xilinx-pci.h */
 #ifdef CONFIG_PCI_XILINX
 extern void __init xilinx_pci_init(void);
diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h
index f4cc9ff..7ecb05b 100644
--- a/arch/microblaze/include/asm/pgalloc.h
+++ b/arch/microblaze/include/asm/pgalloc.h
@@ -21,83 +21,23 @@
 #include <asm/cache.h>
 #include <asm/pgtable.h>
 
-#define PGDIR_ORDER	0
-
-/*
- * This is handled very differently on MicroBlaze since out page tables
- * are all 0's and I want to be able to use these zero'd pages elsewhere
- * as well - it gives us quite a speedup.
- * -- Cort
- */
-extern struct pgtable_cache_struct {
-	unsigned long *pgd_cache;
-	unsigned long *pte_cache;
-	unsigned long pgtable_cache_sz;
-} quicklists;
-
-#define pgd_quicklist		(quicklists.pgd_cache)
-#define pmd_quicklist		((unsigned long *)0)
-#define pte_quicklist		(quicklists.pte_cache)
-#define pgtable_cache_size	(quicklists.pgtable_cache_sz)
-
-extern unsigned long *zero_cache; /* head linked list of pre-zero'd pages */
-extern atomic_t zero_sz; /* # currently pre-zero'd pages */
-extern atomic_t zeropage_hits; /* # zero'd pages request that we've done */
-extern atomic_t zeropage_calls; /* # zero'd pages request that've been made */
-extern atomic_t zerototal; /* # pages zero'd over time */
-
-#define zero_quicklist		(zero_cache)
-#define zero_cache_sz	 	(zero_sz)
-#define zero_cache_calls	(zeropage_calls)
-#define zero_cache_hits		(zeropage_hits)
-#define zero_cache_total	(zerototal)
-
-/*
- * return a pre-zero'd page from the list,
- * return NULL if none available -- Cort
- */
-extern unsigned long get_zero_page_fast(void);
+#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
+#include <asm-generic/pgalloc.h>
 
 extern void __bad_pte(pmd_t *pmd);
 
-static inline pgd_t *get_pgd_slow(void)
+static inline pgd_t *get_pgd(void)
 {
-	pgd_t *ret;
-
-	ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGDIR_ORDER);
-	if (ret != NULL)
-		clear_page(ret);
-	return ret;
+	return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0);
 }
 
-static inline pgd_t *get_pgd_fast(void)
-{
-	unsigned long *ret;
-
-	ret = pgd_quicklist;
-	if (ret != NULL) {
-		pgd_quicklist = (unsigned long *)(*ret);
-		ret[0] = 0;
-		pgtable_cache_size--;
-	} else
-		ret = (unsigned long *)get_pgd_slow();
-	return (pgd_t *)ret;
-}
-
-static inline void free_pgd_fast(pgd_t *pgd)
-{
-	*(unsigned long **)pgd = pgd_quicklist;
-	pgd_quicklist = (unsigned long *) pgd;
-	pgtable_cache_size++;
-}
-
-static inline void free_pgd_slow(pgd_t *pgd)
+static inline void free_pgd(pgd_t *pgd)
 {
 	free_page((unsigned long)pgd);
 }
 
-#define pgd_free(mm, pgd)        free_pgd_fast(pgd)
-#define pgd_alloc(mm)		get_pgd_fast()
+#define pgd_free(mm, pgd)	free_pgd(pgd)
+#define pgd_alloc(mm)		get_pgd()
 
 #define pmd_pgtable(pmd)	pmd_page(pmd)
 
@@ -110,50 +50,6 @@
 
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
 
-static inline struct page *pte_alloc_one(struct mm_struct *mm)
-{
-	struct page *ptepage;
-
-#ifdef CONFIG_HIGHPTE
-	int flags = GFP_KERNEL | __GFP_HIGHMEM;
-#else
-	int flags = GFP_KERNEL;
-#endif
-
-	ptepage = alloc_pages(flags, 0);
-	if (!ptepage)
-		return NULL;
-	clear_highpage(ptepage);
-	if (!pgtable_page_ctor(ptepage)) {
-		__free_page(ptepage);
-		return NULL;
-	}
-	return ptepage;
-}
-
-static inline void pte_free_fast(pte_t *pte)
-{
-	*(unsigned long **)pte = pte_quicklist;
-	pte_quicklist = (unsigned long *) pte;
-	pgtable_cache_size++;
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-	free_page((unsigned long)pte);
-}
-
-static inline void pte_free_slow(struct page *ptepage)
-{
-	__free_page(ptepage);
-}
-
-static inline void pte_free(struct mm_struct *mm, struct page *ptepage)
-{
-	pgtable_page_dtor(ptepage);
-	__free_page(ptepage);
-}
-
 #define __pte_free_tlb(tlb, pte, addr)	pte_free((tlb)->mm, (pte))
 
 #define pmd_populate(mm, pmd, pte) \
@@ -171,10 +67,6 @@
 #define __pmd_free_tlb(tlb, x, addr)	pmd_free((tlb)->mm, x)
 #define pgd_populate(mm, pmd, pte)	BUG()
 
-extern int do_check_pgt_cache(int, int);
-
 #endif /* CONFIG_MMU */
 
-#define check_pgt_cache()		do { } while (0)
-
 #endif /* _ASM_MICROBLAZE_PGALLOC_H */
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index 142d3f0..954b69a 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -46,8 +46,6 @@
 
 #define swapper_pg_dir ((pgd_t *) NULL)
 
-#define pgtable_cache_init()	do {} while (0)
-
 #define arch_enter_lazy_cpu_mode()	do {} while (0)
 
 #define pgprot_noncached_wc(prot)	prot
@@ -526,11 +524,6 @@
 /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
 #define kern_addr_valid(addr)	(1)
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 void do_page_fault(struct pt_regs *regs, unsigned long address,
 		   unsigned long error_code);
 
diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index bff2a71..a1f206b 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -163,44 +163,15 @@
  * Returns zero on success, or -EFAULT on error.
  * On error, the variable @x is set to zero.
  */
-#define get_user(x, ptr)						\
-	__get_user_check((x), (ptr), sizeof(*(ptr)))
-
-#define __get_user_check(x, ptr, size)					\
-({									\
-	unsigned long __gu_val = 0;					\
-	const typeof(*(ptr)) __user *__gu_addr = (ptr);			\
-	int __gu_err = 0;						\
-									\
-	if (access_ok(__gu_addr, size)) {			\
-		switch (size) {						\
-		case 1:							\
-			__get_user_asm("lbu", __gu_addr, __gu_val,	\
-				       __gu_err);			\
-			break;						\
-		case 2:							\
-			__get_user_asm("lhu", __gu_addr, __gu_val,	\
-				       __gu_err);			\
-			break;						\
-		case 4:							\
-			__get_user_asm("lw", __gu_addr, __gu_val,	\
-				       __gu_err);			\
-			break;						\
-		default:						\
-			__gu_err = __user_bad();			\
-			break;						\
-		}							\
-	} else {							\
-		__gu_err = -EFAULT;					\
-	}								\
-	x = (__force typeof(*(ptr)))__gu_val;				\
-	__gu_err;							\
+#define get_user(x, ptr) ({				\
+	const typeof(*(ptr)) __user *__gu_ptr = (ptr);	\
+	access_ok(__gu_ptr, sizeof(*__gu_ptr)) ?	\
+		__get_user(x, __gu_ptr) : -EFAULT;	\
 })
 
 #define __get_user(x, ptr)						\
 ({									\
 	unsigned long __gu_val = 0;					\
-	/*unsigned long __gu_ptr = (unsigned long)(ptr);*/		\
 	long __gu_err;							\
 	switch (sizeof(*(ptr))) {					\
 	case 1:								\
@@ -212,6 +183,11 @@
 	case 4:								\
 		__get_user_asm("lw", (ptr), __gu_val, __gu_err);	\
 		break;							\
+	case 8:								\
+		__gu_err = __copy_from_user(&__gu_val, ptr, 8);		\
+		if (__gu_err)						\
+			__gu_err = -EFAULT;				\
+		break;							\
 	default:							\
 		/* __gu_val = 0; __gu_err = -EINVAL;*/ __gu_err = __user_bad();\
 	}								\
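[ Caller's view of the rewritten accessor (a sketch): get_user() still
  returns 0 or -EFAULT and zeroes the destination on failure, so typical
  users are unchanged:

	static long read_user_word(const u32 __user *uptr, u32 *out)
	{
		u32 val;

		if (get_user(val, uptr))	/* -EFAULT on a bad pointer */
			return -EFAULT;
		*out = val;
		return 0;
	} ]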
diff --git a/arch/microblaze/kernel/reset.c b/arch/microblaze/kernel/reset.c
index fcbe1da..5f47229 100644
--- a/arch/microblaze/kernel/reset.c
+++ b/arch/microblaze/kernel/reset.c
@@ -8,83 +8,9 @@
  */
 
 #include <linux/init.h>
+#include <linux/delay.h>
 #include <linux/of_platform.h>
-
-/* Trigger specific functions */
-#ifdef CONFIG_GPIOLIB
-
-#include <linux/of_gpio.h>
-
-static int handle; /* reset pin handle */
-static unsigned int reset_val;
-
-static int of_platform_reset_gpio_probe(void)
-{
-	int ret;
-	handle = of_get_named_gpio(of_find_node_by_path("/"),
-				   "hard-reset-gpios", 0);
-
-	if (!gpio_is_valid(handle)) {
-		pr_info("Skipping unavailable RESET gpio %d (%s)\n",
-				handle, "reset");
-		return -ENODEV;
-	}
-
-	ret = gpio_request(handle, "reset");
-	if (ret < 0) {
-		pr_info("GPIO pin is already allocated\n");
-		return ret;
-	}
-
-	/* get current setup value */
-	reset_val = gpio_get_value(handle);
-	/* FIXME maybe worth to perform any action */
-	pr_debug("Reset: Gpio output state: 0x%x\n", reset_val);
-
-	/* Setup GPIO as output */
-	ret = gpio_direction_output(handle, 0);
-	if (ret < 0)
-		goto err;
-
-	/* Setup output direction */
-	gpio_set_value(handle, 0);
-
-	pr_info("RESET: Registered gpio device: %d, current val: %d\n",
-							handle, reset_val);
-	return 0;
-err:
-	gpio_free(handle);
-	return ret;
-}
-device_initcall(of_platform_reset_gpio_probe);
-
-
-static void gpio_system_reset(void)
-{
-	if (gpio_is_valid(handle))
-		gpio_set_value(handle, 1 - reset_val);
-	else
-		pr_notice("Reset GPIO unavailable - halting!\n");
-}
-#else
-static void gpio_system_reset(void)
-{
-	pr_notice("No reset GPIO present - halting!\n");
-}
-
-void of_platform_reset_gpio_probe(void)
-{
-	return;
-}
-#endif
-
-void machine_restart(char *cmd)
-{
-	pr_notice("Machine restart...\n");
-	gpio_system_reset();
-	while (1)
-		;
-}
+#include <linux/reboot.h>
 
 void machine_shutdown(void)
 {
@@ -106,3 +32,12 @@
 	while (1)
 		;
 }
+
+void machine_restart(char *cmd)
+{
+	do_kernel_restart(cmd);
+	/* Give the restart hook 1 s to take us down */
+	mdelay(1000);
+	pr_emerg("Reboot failed -- System halted\n");
+	while (1);
+}
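[ do_kernel_restart() walks the restart notifier chain, which is how the
  gpio-restart node added to system.dts above takes effect.  A driver hooks
  in roughly like this (sketch; the gpio-restart driver does the
  equivalent):

	static int my_restart(struct notifier_block *nb, unsigned long action,
			      void *data)
	{
		/* toggle the board's reset line here */
		return NOTIFY_DONE;
	}

	static struct notifier_block my_restart_nb = {
		.notifier_call	= my_restart,
		.priority	= 128,
	};

	err = register_restart_handler(&my_restart_nb);
]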
diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c
index bc70422..8c5f0c3 100644
--- a/arch/microblaze/mm/consistent.c
+++ b/arch/microblaze/mm/consistent.c
@@ -4,217 +4,56 @@
  * Copyright (C) 2010 Michal Simek <monstr@monstr.eu>
  * Copyright (C) 2010 PetaLogix
  * Copyright (C) 2005 John Williams <jwilliams@itee.uq.edu.au>
- *
- * Based on PowerPC version derived from arch/arm/mm/consistent.c
- * Copyright (C) 2001 Dan Malek (dmalek@jlc.net)
- * Copyright (C) 2000 Russell King
  */
 
-#include <linux/export.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
 #include <linux/kernel.h>
-#include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
 #include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/stddef.h>
-#include <linux/vmalloc.h>
 #include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/memblock.h>
-#include <linux/highmem.h>
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include <linux/gfp.h>
 #include <linux/dma-noncoherent.h>
-
-#include <asm/pgalloc.h>
-#include <linux/io.h>
-#include <linux/hardirq.h>
-#include <linux/mmu_context.h>
-#include <asm/mmu.h>
-#include <linux/uaccess.h>
-#include <asm/pgtable.h>
 #include <asm/cpuinfo.h>
-#include <asm/tlbflush.h>
+#include <asm/cacheflush.h>
 
-#ifndef CONFIG_MMU
-/* I have to use dcache values because I can't relate on ram size */
-# define UNCACHED_SHADOW_MASK (cpuinfo.dcache_high - cpuinfo.dcache_base + 1)
-#endif
-
-/*
- * Consistent memory allocators. Used for DMA devices that want to
- * share uncached memory with the processor core.
- * My crufty no-MMU approach is simple. In the HW platform we can optionally
- * mirror the DDR up above the processor cacheable region.  So, memory accessed
- * in this mirror region will not be cached.  It's alloced from the same
- * pool as normal memory, but the handle we return is shifted up into the
- * uncached region.  This will no doubt cause big problems if memory allocated
- * here is not also freed properly. -- JW
- */
-void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		gfp_t gfp, unsigned long attrs)
+void arch_dma_prep_coherent(struct page *page, size_t size)
 {
-	unsigned long order, vaddr;
-	void *ret;
-	unsigned int i, err = 0;
-	struct page *page, *end;
+	phys_addr_t paddr = page_to_phys(page);
 
-#ifdef CONFIG_MMU
-	phys_addr_t pa;
-	struct vm_struct *area;
-	unsigned long va;
-#endif
-
-	if (in_interrupt())
-		BUG();
-
-	/* Only allocate page size areas. */
-	size = PAGE_ALIGN(size);
-	order = get_order(size);
-
-	vaddr = __get_free_pages(gfp | __GFP_ZERO, order);
-	if (!vaddr)
-		return NULL;
-
-	/*
-	 * we need to ensure that there are no cachelines in use,
-	 * or worse dirty in this area.
-	 */
-	flush_dcache_range(virt_to_phys((void *)vaddr),
-					virt_to_phys((void *)vaddr) + size);
+	flush_dcache_range(paddr, paddr + size);
+}
 
 #ifndef CONFIG_MMU
-	ret = (void *)vaddr;
-	/*
-	 * Here's the magic!  Note if the uncached shadow is not implemented,
-	 * it's up to the calling code to also test that condition and make
-	 * other arranegments, such as manually flushing the cache and so on.
-	 */
-# ifdef CONFIG_XILINX_UNCACHED_SHADOW
-	ret = (void *)((unsigned) ret | UNCACHED_SHADOW_MASK);
-# endif
-	if ((unsigned int)ret > cpuinfo.dcache_base &&
-				(unsigned int)ret < cpuinfo.dcache_high)
+/*
+ * Consistent memory allocators. Used for DMA devices that want to share
+ * uncached memory with the processor core.  My crufty no-MMU approach is
+ * simple.  In the HW platform we can optionally mirror the DDR up above the
+ * processor cacheable region.  So, memory accessed in this mirror region will
+ * not be cached.  It's alloced from the same pool as normal memory, but the
+ * handle we return is shifted up into the uncached region.  This will no doubt
+ * cause big problems if memory allocated here is not also freed properly. -- JW
+ *
+ * I have to use dcache values because I can't rely on the ram size:
+ */
+#ifdef CONFIG_XILINX_UNCACHED_SHADOW
+#define UNCACHED_SHADOW_MASK (cpuinfo.dcache_high - cpuinfo.dcache_base + 1)
+#else
+#define UNCACHED_SHADOW_MASK 0
+#endif /* CONFIG_XILINX_UNCACHED_SHADOW */
+
+void *uncached_kernel_address(void *ptr)
+{
+	unsigned long addr = (unsigned long)ptr;
+
+	addr |= UNCACHED_SHADOW_MASK;
+	if (addr > cpuinfo.dcache_base && addr < cpuinfo.dcache_high)
 		pr_warn("ERROR: Your cache coherent area is CACHED!!!\n");
-
-	/* dma_handle is same as physical (shadowed) address */
-	*dma_handle = (dma_addr_t)ret;
-#else
-	/* Allocate some common virtual space to map the new pages. */
-	area = get_vm_area(size, VM_ALLOC);
-	if (!area) {
-		free_pages(vaddr, order);
-		return NULL;
-	}
-	va = (unsigned long) area->addr;
-	ret = (void *)va;
-
-	/* This gives us the real physical address of the first page. */
-	*dma_handle = pa = __virt_to_phys(vaddr);
-#endif
-
-	/*
-	 * free wasted pages.  We skip the first page since we know
-	 * that it will have count = 1 and won't require freeing.
-	 * We also mark the pages in use as reserved so that
-	 * remap_page_range works.
-	 */
-	page = virt_to_page(vaddr);
-	end = page + (1 << order);
-
-	split_page(page, order);
-
-	for (i = 0; i < size && err == 0; i += PAGE_SIZE) {
-#ifdef CONFIG_MMU
-		/* MS: This is the whole magic - use cache inhibit pages */
-		err = map_page(va + i, pa + i, _PAGE_KERNEL | _PAGE_NO_CACHE);
-#endif
-
-		SetPageReserved(page);
-		page++;
-	}
-
-	/* Free the otherwise unused pages. */
-	while (page < end) {
-		__free_page(page);
-		page++;
-	}
-
-	if (err) {
-		free_pages(vaddr, order);
-		return NULL;
-	}
-
-	return ret;
+	return (void *)addr;
 }
 
-#ifdef CONFIG_MMU
-static pte_t *consistent_virt_to_pte(void *vaddr)
+void *cached_kernel_address(void *ptr)
 {
-	unsigned long addr = (unsigned long)vaddr;
+	unsigned long addr = (unsigned long)ptr;
 
-	return pte_offset_kernel(pmd_offset(pgd_offset_k(addr), addr), addr);
+	return (void *)(addr & ~UNCACHED_SHADOW_MASK);
 }
-
-long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr,
-		dma_addr_t dma_addr)
-{
-	pte_t *ptep = consistent_virt_to_pte(vaddr);
-
-	if (pte_none(*ptep) || !pte_present(*ptep))
-		return 0;
-
-	return pte_pfn(*ptep);
-}
-#endif
-
-/*
- * free page(s) as defined by the above mapping.
- */
-void arch_dma_free(struct device *dev, size_t size, void *vaddr,
-		dma_addr_t dma_addr, unsigned long attrs)
-{
-	struct page *page;
-
-	if (in_interrupt())
-		BUG();
-
-	size = PAGE_ALIGN(size);
-
-#ifndef CONFIG_MMU
-	/* Clear SHADOW_MASK bit in address, and free as per usual */
-# ifdef CONFIG_XILINX_UNCACHED_SHADOW
-	vaddr = (void *)((unsigned)vaddr & ~UNCACHED_SHADOW_MASK);
-# endif
-	page = virt_to_page(vaddr);
-
-	do {
-		__free_reserved_page(page);
-		page++;
-	} while (size -= PAGE_SIZE);
-#else
-	do {
-		pte_t *ptep = consistent_virt_to_pte(vaddr);
-		unsigned long pfn;
-
-		if (!pte_none(*ptep) && pte_present(*ptep)) {
-			pfn = pte_pfn(*ptep);
-			pte_clear(&init_mm, (unsigned int)vaddr, ptep);
-			if (pfn_valid(pfn)) {
-				page = pfn_to_page(pfn);
-				__free_reserved_page(page);
-			}
-		}
-		vaddr += PAGE_SIZE;
-	} while (size -= PAGE_SIZE);
-
-	/* flush tlb */
-	flush_tlb_all();
-#endif
-}
+#endif /* CONFIG_MMU */
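
The no-MMU helpers above are pure address arithmetic: ORing in
UNCACHED_SHADOW_MASK yields the uncached alias of a cached address, and
masking it off recovers the original. A standalone sketch of the round trip,
using an illustrative mask value rather than real microblaze cache geometry:

    #include <assert.h>
    #include <stdint.h>

    /* Illustrative only: pretend the DDR mirror sits at +0x80000000. */
    #define UNCACHED_SHADOW_MASK 0x80000000u

    static uintptr_t to_uncached(uintptr_t va)   /* uncached_kernel_address() */
    {
            return va | UNCACHED_SHADOW_MASK;
    }

    static uintptr_t to_cached(uintptr_t va)     /* cached_kernel_address() */
    {
            return va & ~(uintptr_t)UNCACHED_SHADOW_MASK;
    }

    int main(void)
    {
            uintptr_t cached = 0x10000000u;
            uintptr_t shadow = to_uncached(cached);

            assert(shadow == 0x90000000u);
            assert(to_cached(shadow) == cached);    /* lossless round trip */
            return 0;
    }

This only works because the shadow region is a strict mirror of DDR; the
pr_warn() in uncached_kernel_address() fires when the supposedly uncached
alias still lands inside the dcache-covered window.
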
diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c
index 8fe54fd..010bb9c 100644
--- a/arch/microblaze/mm/pgtable.c
+++ b/arch/microblaze/mm/pgtable.c
@@ -44,10 +44,6 @@
 unsigned long ioremap_bot;
 EXPORT_SYMBOL(ioremap_bot);
 
-#ifndef CONFIG_SMP
-struct pgtable_cache_struct quicklists;
-#endif
-
 static void __iomem *__ioremap(phys_addr_t addr, unsigned long size,
 		unsigned long flags)
 {
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index cc8e2b1..a0bd9bd 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -5,7 +5,6 @@
 	select ARCH_32BIT_OFF_T if !64BIT
 	select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT
 	select ARCH_CLOCKSOURCE_DATA
-	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_SUPPORTS_UPROBES
@@ -13,6 +12,7 @@
 	select ARCH_USE_CMPXCHG_LOCKREF if 64BIT
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BUILDTIME_EXTABLE_SORT
 	select CLONE_BACKWARDS
diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h
index 4360998..6f48649 100644
--- a/arch/mips/include/asm/pci.h
+++ b/arch/mips/include/asm/pci.h
@@ -108,7 +108,6 @@
 
 #define HAVE_PCI_MMAP
 #define ARCH_GENERIC_PCI_MMAP_RESOURCE
-#define HAVE_ARCH_PCI_RESOURCE_TO_USER
 
 /*
  * Dynamic DMA mapping stuff.
diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index aa16b85..1668423 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -54,7 +54,7 @@
 
 #define __pte_free_tlb(tlb,pte,address)			\
 do {							\
-	pgtable_page_dtor(pte);				\
+	pgtable_pte_page_dtor(pte);			\
 	tlb_remove_page((tlb), pte);			\
 } while (0)
 
@@ -105,8 +105,6 @@
 
 #endif /* __PAGETABLE_PUD_FOLDED */
 
-#define check_pgt_cache()	do { } while (0)
-
 extern void pagetable_init(void);
 
 #endif /* _ASM_PGALLOC_H */
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 4dca733..f85bd5b 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -661,9 +661,4 @@
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 #endif /* _ASM_PGTABLE_H */
diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
index aca909b..fba18d4 100644
--- a/arch/mips/include/asm/processor.h
+++ b/arch/mips/include/asm/processor.h
@@ -29,11 +29,6 @@
 
 extern unsigned int vced_count, vcei_count;
 
-/*
- * MIPS does have an arch_pick_mmap_layout()
- */
-#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
-
 #ifdef CONFIG_32BIT
 #ifdef CONFIG_KVM_GUEST
 /* User space process size is limited to 1GB in KVM Guest Mode */
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index c2b4096..57dc2ac 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -95,6 +95,9 @@
 #define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
 #define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
 
+#define MADV_COLD	20		/* deactivate these pages */
+#define MADV_PAGEOUT	21		/* reclaim these pages */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
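
MADV_COLD and MADV_PAGEOUT are ordinary madvise(2) hints, so they are easy to
exercise from user space. A small sketch; the fallback defines match the
values added above, and kernels without the feature simply fail the calls
with EINVAL:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    #ifndef MADV_COLD
    #define MADV_COLD       20      /* deactivate these pages */
    #endif
    #ifndef MADV_PAGEOUT
    #define MADV_PAGEOUT    21      /* reclaim these pages */
    #endif

    int main(void)
    {
            size_t len = 1 << 20;
            char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (buf == MAP_FAILED)
                    return 1;
            memset(buf, 0xaa, len);

            if (madvise(buf, len, MADV_COLD))       /* deactivate */
                    perror("MADV_COLD");
            if (madvise(buf, len, MADV_PAGEOUT))    /* reclaim now */
                    perror("MADV_PAGEOUT");

            munmap(buf, len);
            return 0;
    }
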
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index d79f2b4..00fe90c 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -20,33 +20,6 @@
 unsigned long shm_align_mask = PAGE_SIZE - 1;	/* Sane caches */
 EXPORT_SYMBOL(shm_align_mask);
 
-/* gap between mmap and stack */
-#define MIN_GAP (128*1024*1024UL)
-#define MAX_GAP ((TASK_SIZE)/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
-	if (current->personality & ADDR_COMPAT_LAYOUT)
-		return 1;
-
-	if (rlim_stack->rlim_cur == RLIM_INFINITY)
-		return 1;
-
-	return sysctl_legacy_va_layout;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
-	unsigned long gap = rlim_stack->rlim_cur;
-
-	if (gap < MIN_GAP)
-		gap = MIN_GAP;
-	else if (gap > MAX_GAP)
-		gap = MAX_GAP;
-
-	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
-}
-
 #define COLOUR_ALIGN(addr, pgoff)				\
 	((((addr) + shm_align_mask) & ~shm_align_mask) +	\
 	 (((pgoff) << PAGE_SHIFT) & shm_align_mask))
@@ -144,63 +117,6 @@
 			addr0, len, pgoff, flags, DOWN);
 }
 
-unsigned long arch_mmap_rnd(void)
-{
-	unsigned long rnd;
-
-#ifdef CONFIG_COMPAT
-	if (TASK_IS_32BIT_ADDR)
-		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
-	else
-#endif /* CONFIG_COMPAT */
-		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-
-	return rnd << PAGE_SHIFT;
-}
-
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
-	unsigned long random_factor = 0UL;
-
-	if (current->flags & PF_RANDOMIZE)
-		random_factor = arch_mmap_rnd();
-
-	if (mmap_is_legacy(rlim_stack)) {
-		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
-		mm->get_unmapped_area = arch_get_unmapped_area;
-	} else {
-		mm->mmap_base = mmap_base(random_factor, rlim_stack);
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-	}
-}
-
-static inline unsigned long brk_rnd(void)
-{
-	unsigned long rnd = get_random_long();
-
-	rnd = rnd << PAGE_SHIFT;
-	/* 8MB for 32bit, 256MB for 64bit */
-	if (TASK_IS_32BIT_ADDR)
-		rnd = rnd & 0x7ffffful;
-	else
-		rnd = rnd & 0xffffffful;
-
-	return rnd;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	unsigned long base = mm->brk;
-	unsigned long ret;
-
-	ret = PAGE_ALIGN(base + brk_rnd());
-
-	if (ret < mm->brk)
-		return mm->brk;
-
-	return ret;
-}
-
 bool __virt_addr_valid(const volatile void *kaddr)
 {
 	unsigned long vaddr = (unsigned long)kaddr;
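
Nothing above is lost functionally: with ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
selected in the Kconfig hunk earlier, arch_pick_mmap_layout(), arch_mmap_rnd()
and arch_randomize_brk() now come from the generic implementation in
mm/util.c. The core of what the deleted code duplicated is a clamped gap
below TASK_SIZE; a simplified kernel-context sketch (TASK_SIZE and
PAGE_ALIGN() come from the arch headers, and the generic version additionally
accounts for the stack guard gap):

    /* Sketch of the top-down base calculation now provided generically. */
    #define MIN_GAP (128UL * 1024 * 1024)
    #define MAX_GAP ((TASK_SIZE / 6) * 5)

    static unsigned long mmap_base(unsigned long rnd, unsigned long stack_limit)
    {
            unsigned long gap = stack_limit;

            if (gap < MIN_GAP)
                    gap = MIN_GAP;
            else if (gap > MAX_GAP)
                    gap = MAX_GAP;

            return PAGE_ALIGN(TASK_SIZE - gap - rnd);
    }
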
diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h
index e78b43d..37125e6 100644
--- a/arch/nds32/include/asm/pgalloc.h
+++ b/arch/nds32/include/asm/pgalloc.h
@@ -23,8 +23,6 @@
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 extern void pgd_free(struct mm_struct *mm, pgd_t * pgd);
 
-#define check_pgt_cache()		do { } while (0)
-
 static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
 {
 	pgtable_t pte;
diff --git a/arch/nds32/include/asm/pgtable.h b/arch/nds32/include/asm/pgtable.h
index c70cc56..0588ec9 100644
--- a/arch/nds32/include/asm/pgtable.h
+++ b/arch/nds32/include/asm/pgtable.h
@@ -403,8 +403,6 @@
  * into virtual address `from'
  */
 
-#define pgtable_cache_init()       do { } while (0)
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASMNDS32_PGTABLE_H */
diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h
index 4bc8cf7..0b146d7 100644
--- a/arch/nios2/include/asm/pgalloc.h
+++ b/arch/nios2/include/asm/pgalloc.h
@@ -41,10 +41,8 @@
 
 #define __pte_free_tlb(tlb, pte, addr)				\
 	do {							\
-		pgtable_page_dtor(pte);				\
+		pgtable_pte_page_dtor(pte);			\
 		tlb_remove_page((tlb), (pte));			\
 	} while (0)
 
-#define check_pgt_cache()	do { } while (0)
-
 #endif /* _ASM_NIOS2_PGALLOC_H */
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index 95237b7..99985d8 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -291,8 +291,6 @@
 
 #include <asm-generic/pgtable.h>
 
-#define pgtable_cache_init()		do { } while (0)
-
 extern void __init paging_init(void);
 extern void __init mmu_init(void);
 
diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c
index 6bbd4ae..4cf35b0 100644
--- a/arch/nios2/kernel/setup.c
+++ b/arch/nios2/kernel/setup.c
@@ -123,7 +123,7 @@
 		dtb_passed = r6;
 
 		if (r7)
-			strncpy(cmdline_passed, (char *)r7, COMMAND_LINE_SIZE);
+			strlcpy(cmdline_passed, (char *)r7, COMMAND_LINE_SIZE);
 	}
 #endif
 
@@ -131,10 +131,10 @@
 
 #ifndef CONFIG_CMDLINE_FORCE
 	if (cmdline_passed[0])
-		strncpy(boot_command_line, cmdline_passed, COMMAND_LINE_SIZE);
+		strlcpy(boot_command_line, cmdline_passed, COMMAND_LINE_SIZE);
 #ifdef CONFIG_NIOS2_CMDLINE_IGNORE_DTB
 	else
-		strncpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
+		strlcpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
 #endif
 #endif
 
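
The strncpy() to strlcpy() conversion matters because strncpy() does not
NUL-terminate the destination when the source is at least as long as the
buffer, while strlcpy() always terminates, truncating if needed. A user-space
illustration with a local stand-in, since glibc does not provide strlcpy():

    #include <stdio.h>
    #include <string.h>

    /* Local stand-in for the kernel's strlcpy(). */
    static size_t demo_strlcpy(char *dst, const char *src, size_t size)
    {
            size_t len = strlen(src);

            if (size) {
                    size_t n = len >= size ? size - 1 : len;

                    memcpy(dst, src, n);
                    dst[n] = '\0';
            }
            return len;
    }

    int main(void)
    {
            char a[8], b[8];

            strncpy(a, "0123456789", sizeof(a));      /* a is NOT terminated */
            demo_strlcpy(b, "0123456789", sizeof(b)); /* b == "0123456" + NUL */
            printf("%.8s / %s\n", a, b);
            return 0;
    }
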
diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h
index 3d4b397c..da12a4c 100644
--- a/arch/openrisc/include/asm/pgalloc.h
+++ b/arch/openrisc/include/asm/pgalloc.h
@@ -75,7 +75,7 @@
 	if (!pte)
 		return NULL;
 	clear_page(page_address(pte));
-	if (!pgtable_page_ctor(pte)) {
+	if (!pgtable_pte_page_ctor(pte)) {
 		__free_page(pte);
 		return NULL;
 	}
@@ -89,18 +89,16 @@
 
 static inline void pte_free(struct mm_struct *mm, struct page *pte)
 {
-	pgtable_page_dtor(pte);
+	pgtable_pte_page_dtor(pte);
 	__free_page(pte);
 }
 
 #define __pte_free_tlb(tlb, pte, addr)	\
 do {					\
-	pgtable_page_dtor(pte);		\
+	pgtable_pte_page_dtor(pte);	\
 	tlb_remove_page((tlb), (pte));	\
 } while (0)
 
 #define pmd_pgtable(pmd) pmd_page(pmd)
 
-#define check_pgt_cache()          do { } while (0)
-
 #endif
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index 2fe9ff5..248d22d 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -443,11 +443,6 @@
 
 #include <asm-generic/pgtable.h>
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()		do { } while (0)
-
 typedef pte_t *pte_addr_t;
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h
index 4f2059a..d98647c 100644
--- a/arch/parisc/include/asm/pgalloc.h
+++ b/arch/parisc/include/asm/pgalloc.h
@@ -124,6 +124,4 @@
 	pmd_populate_kernel(mm, pmd, page_address(pte_page))
 #define pmd_pgtable(pmd) pmd_page(pmd)
 
-#define check_pgt_cache()	do { } while (0)
-
 #endif
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 6d58c17..4ac374b 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -132,8 +132,6 @@
 #define PTRS_PER_PTE    (1UL << BITS_PER_PTE)
 
 /* Definitions for 2nd level */
-#define pgtable_cache_init()	do { } while (0)
-
 #define PMD_SHIFT       (PLD_SHIFT + BITS_PER_PTE)
 #define PMD_SIZE	(1UL << PMD_SHIFT)
 #define PMD_MASK	(~(PMD_SIZE-1))
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index c98162f..6fd8871 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -48,6 +48,9 @@
 #define MADV_DONTFORK	10		/* don't inherit across fork */
 #define MADV_DOFORK	11		/* do inherit across fork */
 
+#define MADV_COLD	20		/* deactivate these pages */
+#define MADV_PAGEOUT	21		/* reclaim these pages */
+
 #define MADV_MERGEABLE   65		/* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 66		/* KSM may not merge identical pages */
 
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 574eca3..d97db3a 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -254,7 +254,13 @@
 extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
 				      unsigned long addr, pmd_t *pmdp);
-extern int radix__has_transparent_hugepage(void);
+static inline int radix__has_transparent_hugepage(void)
+{
+	/* For radix 2M at PMD level means thp */
+	if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
+		return 1;
+	return 0;
+}
 #endif
 
 extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index a1ebcbc..cf00ff0 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -209,8 +209,9 @@
 #define CPU_FTR_POWER9_DD2_1		LONG_ASM_CONST(0x0000080000000000)
 #define CPU_FTR_P9_TM_HV_ASSIST		LONG_ASM_CONST(0x0000100000000000)
 #define CPU_FTR_P9_TM_XER_SO_BUG	LONG_ASM_CONST(0x0000200000000000)
-#define CPU_FTR_P9_TLBIE_BUG		LONG_ASM_CONST(0x0000400000000000)
+#define CPU_FTR_P9_TLBIE_STQ_BUG	LONG_ASM_CONST(0x0000400000000000)
 #define CPU_FTR_P9_TIDR			LONG_ASM_CONST(0x0000800000000000)
+#define CPU_FTR_P9_TLBIE_ERAT_BUG	LONG_ASM_CONST(0x0001000000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -457,7 +458,7 @@
 	    CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
 	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \
 	    CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \
-	    CPU_FTR_P9_TLBIE_BUG | CPU_FTR_P9_TIDR)
+	    CPU_FTR_P9_TLBIE_STQ_BUG | CPU_FTR_P9_TLBIE_ERAT_BUG | CPU_FTR_P9_TIDR)
 #define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9
 #define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1)
 #define CPU_FTRS_POWER9_DD2_2 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 8e8514e..ee62776 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -452,9 +452,100 @@
 	return xirr;
 }
 
-static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
+/*
+ * To avoid the need to unnecessarily exit fully to the host kernel, an IPI to
+ * a CPU thread that's running/napping inside a guest is by default regarded
+ * as a request to wake the CPU (if needed) and continue execution within the
+ * guest, potentially to process new state like externally-generated
+ * interrupts or IPIs sent from within the guest itself (e.g. H_PROD/H_IPI).
+ *
+ * To force an exit to the host kernel, kvmppc_set_host_ipi() must be called
+ * prior to issuing the IPI to set the corresponding 'host_ipi' flag in the
+ * target CPU's PACA. To avoid unnecessary exits to the host, this flag should
+ * be immediately cleared via kvmppc_clear_host_ipi() by the IPI handler on
+ * the receiving side prior to processing the IPI work.
+ *
+ * NOTE:
+ *
+ * We currently issue an smp_mb() at the beginning of kvmppc_set_host_ipi().
+ * This is to guard against sequences such as the following:
+ *
+ *      CPU
+ *        X: smp_muxed_ipi_set_message():
+ *        X:   smp_mb()
+ *        X:   message[RESCHEDULE] = 1
+ *        X: doorbell_global_ipi(42):
+ *        X:   kvmppc_set_host_ipi(42)
+ *        X:   ppc_msgsnd_sync()/smp_mb()
+ *        X:   ppc_msgsnd() -> 42
+ *       42: doorbell_exception(): // from CPU X
+ *       42:   ppc_msgsync()
+ *      105: smp_muxed_ipi_set_message():
+ *      105:   smp_mb()
+ *           // STORE DEFERRED DUE TO RE-ORDERING
+ *    --105:   message[CALL_FUNCTION] = 1
+ *    | 105: doorbell_global_ipi(42):
+ *    | 105:   kvmppc_set_host_ipi(42)
+ *    |  42:   kvmppc_clear_host_ipi(42)
+ *    |  42: smp_ipi_demux_relaxed()
+ *    |  42: // returns to executing guest
+ *    |      // RE-ORDERED STORE COMPLETES
+ *    ->105:   message[CALL_FUNCTION] = 1
+ *      105:   ppc_msgsnd_sync()/smp_mb()
+ *      105:   ppc_msgsnd() -> 42
+ *       42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored
+ *      105: // hangs waiting on 42 to process messages/call_single_queue
+ *
+ * We also issue an smp_mb() at the end of kvmppc_clear_host_ipi(). This is
+ * to guard against sequences such as the following (as well as to create
+ * a read-side pairing with the barrier in kvmppc_set_host_ipi()):
+ *
+ *      CPU
+ *        X: smp_muxed_ipi_set_message():
+ *        X:   smp_mb()
+ *        X:   message[RESCHEDULE] = 1
+ *        X: doorbell_global_ipi(42):
+ *        X:   kvmppc_set_host_ipi(42)
+ *        X:   ppc_msgsnd_sync()/smp_mb()
+ *        X:   ppc_msgsnd() -> 42
+ *       42: doorbell_exception(): // from CPU X
+ *       42:   ppc_msgsync()
+ *           // STORE DEFERRED DUE TO RE-ORDERING
+ *    -- 42:   kvmppc_clear_host_ipi(42)
+ *    |  42: smp_ipi_demux_relaxed()
+ *    | 105: smp_muxed_ipi_set_message():
+ *    | 105:   smp_mb()
+ *    | 105:   message[CALL_FUNCTION] = 1
+ *    | 105: doorbell_global_ipi(42):
+ *    | 105:   kvmppc_set_host_ipi(42)
+ *    |      // RE-ORDERED STORE COMPLETES
+ *    -> 42:   kvmppc_clear_host_ipi(42)
+ *       42: // returns to executing guest
+ *      105:   ppc_msgsnd_sync()/smp_mb()
+ *      105:   ppc_msgsnd() -> 42
+ *       42: local_paca->kvm_hstate.host_ipi == 0 // IPI ignored
+ *      105: // hangs waiting on 42 to process messages/call_single_queue
+ */
+static inline void kvmppc_set_host_ipi(int cpu)
 {
-	paca_ptrs[cpu]->kvm_hstate.host_ipi = host_ipi;
+	/*
+	 * order stores of IPI messages vs. setting of host_ipi flag
+	 *
+	 * pairs with the barrier in kvmppc_clear_host_ipi()
+	 */
+	smp_mb();
+	paca_ptrs[cpu]->kvm_hstate.host_ipi = 1;
+}
+
+static inline void kvmppc_clear_host_ipi(int cpu)
+{
+	paca_ptrs[cpu]->kvm_hstate.host_ipi = 0;
+	/*
+	 * order clearing of host_ipi flag vs. processing of IPI messages
+	 *
+	 * pairs with the barrier in kvmppc_set_host_ipi()
+	 */
+	smp_mb();
 }
 
 static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
@@ -486,7 +577,10 @@
 	return 0;
 }
 
-static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
+static inline void kvmppc_set_host_ipi(int cpu)
+{}
+
+static inline void kvmppc_clear_host_ipi(int cpu)
 {}
 
 static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
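
Stripped of the KVM specifics, the comment above describes the classic
message/flag publication pattern: the sender must order its message store
before the flag store, and the receiver must order the flag clear before it
scans for messages, the two full barriers pairing with each other. A generic
kernel-context sketch; the ipi_state layout and the kick()/process() steps
are hypothetical, while WRITE_ONCE()/READ_ONCE() and smp_mb() are the real
primitives:

    #include <linux/compiler.h>
    #include <asm/barrier.h>

    struct ipi_state {
            int message;            /* work published by the sender */
            int host_ipi;           /* "an IPI is pending" flag */
    };

    static void sender(struct ipi_state *s)
    {
            WRITE_ONCE(s->message, 1);
            smp_mb();                       /* message before host_ipi */
            WRITE_ONCE(s->host_ipi, 1);
            /* kick(s): fire the doorbell/IPI (hypothetical) */
    }

    static void receiver(struct ipi_state *s)
    {
            WRITE_ONCE(s->host_ipi, 0);
            smp_mb();                       /* host_ipi clear before scan */
            if (READ_ONCE(s->message))
                    ;                       /* process(s) (hypothetical) */
    }

Dropping either barrier re-admits exactly the reorderings traced in the two
CPU diagrams above.
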
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 2372d35..327567b 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -112,8 +112,6 @@
 					 unsigned long size,
 					 pgprot_t prot);
 
-#define HAVE_ARCH_PCI_RESOURCE_TO_USER
-
 extern resource_size_t pcibios_io_space_offset(struct pci_controller *hose);
 extern void pcibios_setup_bus_devices(struct pci_bus *bus);
 extern void pcibios_setup_bus_self(struct pci_bus *bus);
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index 2b2c60a..6dd78a2 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -64,8 +64,6 @@
 extern struct kmem_cache *pgtable_cache[];
 #define PGT_CACHE(shift) pgtable_cache[shift]
 
-static inline void check_pgt_cache(void) { }
-
 #ifdef CONFIG_PPC_BOOK3S
 #include <asm/book3s/pgalloc.h>
 #else
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 8b7865a..4053b2a 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -87,7 +87,6 @@
 unsigned long vmalloc_to_phys(void *vmalloc_addr);
 
 void pgtable_cache_add(unsigned int shift);
-void pgtable_cache_init(void);
 
 #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32)
 void mark_initmem_nx(void);
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index ec3714c..b3cbb11 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -475,9 +475,10 @@
 #define   HMER_DEBUG_TRIG	(1ul << (63 - 17)) /* Debug trigger */
 #define	SPRN_HMEER	0x151	/* Hyp maintenance exception enable reg */
 #define SPRN_PCR	0x152	/* Processor compatibility register */
-#define   PCR_VEC_DIS	(1ul << (63-0))	/* Vec. disable (bit NA since POWER8) */
-#define   PCR_VSX_DIS	(1ul << (63-1))	/* VSX disable (bit NA since POWER8) */
-#define   PCR_TM_DIS	(1ul << (63-2))	/* Trans. memory disable (POWER8) */
+#define   PCR_VEC_DIS	(__MASK(63-0))	/* Vec. disable (bit NA since POWER8) */
+#define   PCR_VSX_DIS	(__MASK(63-1))	/* VSX disable (bit NA since POWER8) */
+#define   PCR_TM_DIS	(__MASK(63-2))	/* Trans. memory disable (POWER8) */
+#define   PCR_HIGH_BITS	(PCR_VEC_DIS | PCR_VSX_DIS | PCR_TM_DIS)
 /*
  * These bits are used in the function kvmppc_set_arch_compat() to specify and
  * determine both the compatibility level which we want to emulate and the
@@ -486,6 +487,8 @@
 #define   PCR_ARCH_207	0x8		/* Architecture 2.07 */
 #define   PCR_ARCH_206	0x4		/* Architecture 2.06 */
 #define   PCR_ARCH_205	0x2		/* Architecture 2.05 */
+#define   PCR_LOW_BITS	(PCR_ARCH_207 | PCR_ARCH_206 | PCR_ARCH_205)
+#define   PCR_MASK	~(PCR_HIGH_BITS | PCR_LOW_BITS)	/* PCR Reserved Bits */
 #define	SPRN_HEIR	0x153	/* Hypervisor Emulated Instruction Register */
 #define SPRN_TLBINDEXR	0x154	/* P7 TLB control register */
 #define SPRN_TLBVPNR	0x155	/* P7 TLB control register */
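
The point of PCR_MASK is that it is the complement of every architected PCR
bit, so ORing it into whatever is written to SPRN_PCR sets all reserved bits
to ones, the convention the rest of this series adopts. A quick numeric check
of the constants defined above (__MASK(n) is 1UL << n):

    #include <assert.h>
    #include <stdint.h>

    #define PCR_HIGH_BITS 0xe000000000000000ULL /* VEC_DIS | VSX_DIS | TM_DIS */
    #define PCR_LOW_BITS  0x000000000000000eULL /* ARCH_207 | 206 | 205 */
    #define PCR_MASK      (~(PCR_HIGH_BITS | PCR_LOW_BITS))

    int main(void)
    {
            /* Every reserved bit set, every architected bit clear. */
            assert(PCR_MASK == 0x1ffffffffffffff1ULL);

            /* "No compat mode" now writes PCR_MASK rather than 0. */
            uint64_t pcr = 0 | PCR_MASK;

            assert((pcr & (PCR_HIGH_BITS | PCR_LOW_BITS)) == 0);
            return 0;
    }
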
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 3239a9f..a460298 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -23,6 +23,7 @@
 	beqlr
 	li	r0,0
 	mtspr	SPRN_LPID,r0
+	LOAD_REG_IMMEDIATE(r0, PCR_MASK)
 	mtspr	SPRN_PCR,r0
 	mfspr	r3,SPRN_LPCR
 	li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
@@ -37,6 +38,7 @@
 	beqlr
 	li	r0,0
 	mtspr	SPRN_LPID,r0
+	LOAD_REG_IMMEDIATE(r0, PCR_MASK)
 	mtspr	SPRN_PCR,r0
 	mfspr	r3,SPRN_LPCR
 	li	r4,(LPCR_LPES1 >> LPCR_LPES_SH)
@@ -54,6 +56,7 @@
 	beqlr
 	li	r0,0
 	mtspr	SPRN_LPID,r0
+	LOAD_REG_IMMEDIATE(r0, PCR_MASK)
 	mtspr	SPRN_PCR,r0
 	mfspr	r3,SPRN_LPCR
 	ori	r3, r3, LPCR_PECEDH
@@ -76,6 +79,7 @@
 	beqlr
 	li	r0,0
 	mtspr	SPRN_LPID,r0
+	LOAD_REG_IMMEDIATE(r0, PCR_MASK)
 	mtspr	SPRN_PCR,r0
 	mfspr   r3,SPRN_LPCR
 	ori	r3, r3, LPCR_PECEDH
@@ -98,6 +102,7 @@
 	mtspr	SPRN_PSSCR,r0
 	mtspr	SPRN_LPID,r0
 	mtspr	SPRN_PID,r0
+	LOAD_REG_IMMEDIATE(r0, PCR_MASK)
 	mtspr	SPRN_PCR,r0
 	mfspr	r3,SPRN_LPCR
 	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE  | LPCR_HEIC)
@@ -123,6 +128,7 @@
 	mtspr	SPRN_PSSCR,r0
 	mtspr	SPRN_LPID,r0
 	mtspr	SPRN_PID,r0
+	LOAD_REG_IMMEDIATE(r0, PCR_MASK)
 	mtspr	SPRN_PCR,r0
 	mfspr   r3,SPRN_LPCR
 	LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC)
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index 804b1a6..f17ff12 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -33,7 +33,7 @@
 {
 	u32 tag = get_hard_smp_processor_id(cpu);
 
-	kvmppc_set_host_ipi(cpu, 1);
+	kvmppc_set_host_ipi(cpu);
 	/* Order previous accesses vs. msgsnd, which is treated as a store */
 	ppc_msgsnd_sync();
 	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
@@ -48,7 +48,7 @@
 {
 	u32 tag = cpu_thread_in_core(cpu);
 
-	kvmppc_set_host_ipi(cpu, 1);
+	kvmppc_set_host_ipi(cpu);
 	/* Order previous accesses vs. msgsnd, which is treated as a store */
 	ppc_msgsnd_sync();
 	ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
@@ -84,7 +84,7 @@
 
 	may_hard_irq_enable();
 
-	kvmppc_set_host_ipi(smp_processor_id(), 0);
+	kvmppc_clear_host_ipi(smp_processor_id());
 	__this_cpu_inc(irq_stat.doorbell_irqs);
 
 	smp_ipi_demux_relaxed(); /* already performed the barrier */
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index bd95318..180b3a5 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -101,7 +101,7 @@
 	if (hv_mode) {
 		mtspr(SPRN_LPID, 0);
 		mtspr(SPRN_HFSCR, system_registers.hfscr);
-		mtspr(SPRN_PCR, 0);
+		mtspr(SPRN_PCR, PCR_MASK);
 	}
 	mtspr(SPRN_FSCR, system_registers.fscr);
 
@@ -144,6 +144,7 @@
 		mtspr(SPRN_HFSCR, 0);
 	}
 	mtspr(SPRN_FSCR, 0);
+	mtspr(SPRN_PCR, PCR_MASK);
 
 	/*
 	 * LPCR does not get cleared, to match behaviour with secondaries
@@ -691,9 +692,37 @@
 	return true;
 }
 
+/*
+ * Handle the POWER9 broadcast tlbie invalidation issue by setting the
+ * appropriate CPU feature flags.
+ */
+static __init void update_tlbie_feature_flag(unsigned long pvr)
+{
+	if (PVR_VER(pvr) == PVR_POWER9) {
+		/*
+		 * Set the tlbie feature flag for anything below
+		 * Nimbus DD 2.3 and Cumulus DD 1.3
+		 */
+		if ((pvr & 0xe000) == 0) {
+			/* Nimbus */
+			if ((pvr & 0xfff) < 0x203)
+				cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG;
+		} else if ((pvr & 0xc000) == 0) {
+			/* Cumulus */
+			if ((pvr & 0xfff) < 0x103)
+				cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG;
+		} else {
+			WARN_ONCE(1, "Unknown PVR");
+			cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG;
+		}
+
+		cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_ERAT_BUG;
+	}
+}
+
 static __init void cpufeatures_cpu_quirks(void)
 {
-	int version = mfspr(SPRN_PVR);
+	unsigned long version = mfspr(SPRN_PVR);
 
 	/*
 	 * Not all quirks can be derived from the cpufeatures device tree.
@@ -712,10 +741,10 @@
 
 	if ((version & 0xffff0000) == 0x004e0000) {
 		cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR);
-		cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG;
 		cur_cpu_spec->cpu_features |= CPU_FTR_P9_TIDR;
 	}
 
+	update_tlbie_feature_flag(version);
 	/*
 	 * PKEY was not in the initial base or feature node
 	 * specification, but it should become optional in the next
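
For reference, the new update_tlbie_feature_flag() above keys off two PVR
fields: the variant bits that distinguish Nimbus from Cumulus chips, and the
low 12 bits carrying the DD (design) level, e.g. 0x203 encodes DD2.3. A
compact standalone restatement of that check, assuming PVR_VER() has already
identified a POWER9:

    #include <assert.h>

    /* True if this POWER9 revision needs the tlbie store-queue workaround. */
    static int needs_tlbie_stq_workaround(unsigned long pvr)
    {
            if ((pvr & 0xe000) == 0)                /* Nimbus */
                    return (pvr & 0xfff) < 0x203;   /* below DD2.3 */
            if ((pvr & 0xc000) == 0)                /* Cumulus */
                    return (pvr & 0xfff) < 0x103;   /* below DD1.3 */
            return 1;                               /* unknown: assume buggy */
    }

    int main(void)
    {
            assert(needs_tlbie_stq_workaround(0x0201));     /* Nimbus DD2.1 */
            assert(!needs_tlbie_stq_workaround(0x0203));    /* Nimbus DD2.3 */
            return 0;
    }
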
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 0a91dee..bc8a551 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1960,7 +1960,7 @@
 	pci_err(pdev, "Going to break: %pR\n", bar);
 
 	if (pdev->is_virtfn) {
-#ifndef CONFIG_IOV
+#ifndef CONFIG_PCI_IOV
 		return -ENXIO;
 #else
 		/*
@@ -1980,7 +1980,7 @@
 		pos  = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
 		pos += PCI_SRIOV_CTRL;
 		bit  = PCI_SRIOV_CTRL_MSE;
-#endif /* !CONFIG_IOV */
+#endif /* !CONFIG_PCI_IOV */
 	} else {
 		bit = PCI_COMMAND_MEMORY;
 		pos = PCI_COMMAND;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index efd8f93..709cf1f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -401,8 +401,11 @@
 
 	spin_lock(&vc->lock);
 	vc->arch_compat = arch_compat;
-	/* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */
-	vc->pcr = host_pcr_bit - guest_pcr_bit;
+	/*
+	 * Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit
+	 * Also set all reserved PCR bits
+	 */
+	vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK;
 	spin_unlock(&vc->lock);
 
 	return 0;
@@ -3410,7 +3413,7 @@
 	}
 
 	if (vc->pcr)
-		mtspr(SPRN_PCR, vc->pcr);
+		mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
 	mtspr(SPRN_DPDES, vc->dpdes);
 	mtspr(SPRN_VTB, vc->vtb);
 
@@ -3490,7 +3493,7 @@
 	vc->vtb = mfspr(SPRN_VTB);
 	mtspr(SPRN_DPDES, 0);
 	if (vc->pcr)
-		mtspr(SPRN_PCR, 0);
+		mtspr(SPRN_PCR, PCR_MASK);
 
 	if (vc->tb_offset_applied) {
 		u64 new_tb = mftb() - vc->tb_offset_applied;
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index fff90f2..cdf30c6 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -29,7 +29,7 @@
 {
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
-	hr->pcr = vc->pcr;
+	hr->pcr = vc->pcr | PCR_MASK;
 	hr->dpdes = vc->dpdes;
 	hr->hfscr = vcpu->arch.hfscr;
 	hr->tb_offset = vc->tb_offset;
@@ -65,7 +65,7 @@
 	hr->lpid = swab32(hr->lpid);
 	hr->vcpu_token = swab32(hr->vcpu_token);
 	hr->lpcr = swab64(hr->lpcr);
-	hr->pcr = swab64(hr->pcr);
+	hr->pcr = swab64(hr->pcr) | PCR_MASK;
 	hr->amor = swab64(hr->amor);
 	hr->dpdes = swab64(hr->dpdes);
 	hr->hfscr = swab64(hr->hfscr);
@@ -148,7 +148,7 @@
 {
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
-	vc->pcr = hr->pcr;
+	vc->pcr = hr->pcr | PCR_MASK;
 	vc->dpdes = hr->dpdes;
 	vcpu->arch.hfscr = hr->hfscr;
 	vcpu->arch.dawr = hr->dawr0;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 7186c65..2203054 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -433,6 +433,37 @@
 		(HPTE_R_KEY_HI | HPTE_R_KEY_LO));
 }
 
+static inline void fixup_tlbie_lpid(unsigned long rb_value, unsigned long lpid)
+{
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		/* Radix flush for a hash guest */
+
+		unsigned long rb, rs, prs, r, ric;
+
+		rb = PPC_BIT(52); /* IS = 2 */
+		rs = 0;  /* lpid = 0 */
+		prs = 0; /* partition scoped */
+		r = 1;   /* radix format */
+		ric = 0; /* RIC_FLUSH_TLB */
+
+		/*
+		 * Need the extra ptesync to make sure we don't
+		 * re-order the tlbie
+		 */
+		asm volatile("ptesync": : :"memory");
+		asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+			     : : "r"(rb), "i"(r), "i"(prs),
+			       "i"(ric), "r"(rs) : "memory");
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : :
+			     "r" (rb_value), "r" (lpid));
+	}
+}
+
 static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
 		      long npages, int global, bool need_sync)
 {
@@ -451,16 +482,7 @@
 				     "r" (rbvalues[i]), "r" (kvm->arch.lpid));
 		}
 
-		if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
-			/*
-			 * Need the extra ptesync to make sure we don't
-			 * re-order the tlbie
-			 */
-			asm volatile("ptesync": : :"memory");
-			asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : :
-				     "r" (rbvalues[0]), "r" (kvm->arch.lpid));
-		}
-
+		fixup_tlbie_lpid(rbvalues[i - 1], kvm->arch.lpid);
 		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
 	} else {
 		if (need_sync)
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 4d2ec77..287d591 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -58,7 +58,7 @@
 	hcpu = hcore << threads_shift;
 	kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
 	smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
-	kvmppc_set_host_ipi(hcpu, 1);
+	kvmppc_set_host_ipi(hcpu);
 	smp_mb();
 	kvmhv_rm_send_ipi(hcpu);
 }
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9a05b0d..74a9cfe 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -644,8 +644,10 @@
 
 	/* Load guest PCR value to select appropriate compat mode */
 37:	ld	r7, VCORE_PCR(r5)
-	cmpdi	r7, 0
+	LOAD_REG_IMMEDIATE(r6, PCR_MASK)
+	cmpld	r7, r6
 	beq	38f
+	or	r7, r7, r6
 	mtspr	SPRN_PCR, r7
 38:
 
@@ -1913,10 +1915,10 @@
 
 	/* Reset PCR */
 	ld	r0, VCORE_PCR(r5)
-	cmpdi	r0, 0
+	LOAD_REG_IMMEDIATE(r6, PCR_MASK)
+	cmpld	r0, r6
 	beq	18f
-	li	r0, 0
-	mtspr	SPRN_PCR, r0
+	mtspr	SPRN_PCR, r6
 18:
 	/* Signal secondary CPUs to continue */
 	stb	r0,VCORE_IN_GUEST(r5)
diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
index 90ab4f3..523e42e 100644
--- a/arch/powerpc/mm/book3s64/hash_native.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -197,9 +197,32 @@
 	return va;
 }
 
-static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize)
+static inline void fixup_tlbie_vpn(unsigned long vpn, int psize,
+				   int apsize, int ssize)
 {
-	if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		/* Radix flush for a hash guest */
+
+		unsigned long rb, rs, prs, r, ric;
+
+		rb = PPC_BIT(52); /* IS = 2 */
+		rs = 0;  /* lpid = 0 */
+		prs = 0; /* partition scoped */
+		r = 1;   /* radix format */
+		ric = 0; /* RIC_FLUSH_TLB */
+
+		/*
+		 * Need the extra ptesync to make sure we don't
+		 * re-order the tlbie
+		 */
+		asm volatile("ptesync": : :"memory");
+		asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+			     : : "r"(rb), "i"(r), "i"(prs),
+			       "i"(ric), "r"(rs) : "memory");
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
 		/* Need the extra ptesync to ensure we don't reorder tlbie*/
 		asm volatile("ptesync": : :"memory");
 		___tlbie(vpn, psize, apsize, ssize);
@@ -283,7 +306,7 @@
 		asm volatile("ptesync": : :"memory");
 	} else {
 		__tlbie(vpn, psize, apsize, ssize);
-		fixup_tlbie(vpn, psize, apsize, ssize);
+		fixup_tlbie_vpn(vpn, psize, apsize, ssize);
 		asm volatile("eieio; tlbsync; ptesync": : :"memory");
 	}
 	if (lock_tlbie && !use_local)
@@ -856,7 +879,7 @@
 		/*
 		 * Just do one more with the last used values.
 		 */
-		fixup_tlbie(vpn, psize, psize, ssize);
+		fixup_tlbie_vpn(vpn, psize, psize, ssize);
 		asm volatile("eieio; tlbsync; ptesync":::"memory");
 
 		if (lock_tlbie)
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
index d1f390a..64733b9 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -406,6 +406,8 @@
 
 	return 1;
 }
+EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 3410ea9..6c12376 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1748,7 +1748,7 @@
 	/*
 	 * IF we try to do a HUGE PTE update after a withdraw is done.
 	 * we will find the below NULL. This happens when we do
-	 * split_huge_page_pmd
+	 * split_huge_pmd
 	 */
 	if (!hpte_slot_array)
 		return;
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index b056cae..56cc845 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -129,11 +129,8 @@
 		 * Allow to use larger than 64k IOMMU pages. Only do that
 		 * if we are backed by hugetlb.
 		 */
-		if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
-			struct page *head = compound_head(page);
-
-			pageshift = compound_order(head) + PAGE_SHIFT;
-		}
+		if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page))
+			pageshift = page_shift(compound_head(page));
 		mem->pageshift = min(mem->pageshift, pageshift);
 		/*
 		 * We don't need struct page reference any more, switch
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
index 2d0cb5b..0ba30b8 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -256,8 +256,21 @@
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 	WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
 #endif
+	/*
+	 * For tasks which were successfully initialized we end up calling
+	 * arch_exit_mmap() which clears the process table entry. And
+	 * arch_exit_mmap() is called before the required fullmm TLB flush
+ * which does a RIC=2 flush. Hence for an initialized task, any cached
+ * process table entries have already been invalidated by the time we
+ * get here.
+	 *
+	 * The condition below handles the error case during task init. We have
+	 * set the process table entry early and if we fail a task
+	 * initialization, we need to ensure the process table entry is zeroed.
+	 * We need not worry about process table entry caches because the task
+	 * never ran with the PID value.
+	 */
 	if (radix_enabled())
-		WARN_ON(process_tb[mm->context.id].prtb0 != 0);
+		process_tb[mm->context.id].prtb0 = 0;
 	else
 		subpage_prot_free(mm);
 	destroy_contexts(&mm->context);
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 3a1fbf9..6ee17d0 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1027,13 +1027,6 @@
 	return old_pmd;
 }
 
-int radix__has_transparent_hugepage(void)
-{
-	/* For radix 2M at PMD level means thp */
-	if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
-		return 1;
-	return 0;
-}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 631be42..67af871 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -196,22 +196,83 @@
 	trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
 }
 
-static inline void fixup_tlbie(void)
+static inline void fixup_tlbie_va(unsigned long va, unsigned long pid,
+				  unsigned long ap)
 {
-	unsigned long pid = 0;
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_va(va, 0, ap, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_va(va, pid, ap, RIC_FLUSH_TLB);
+	}
+}
+
+static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid,
+					unsigned long ap)
+{
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_pid(0, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_va(va, pid, ap, RIC_FLUSH_TLB);
+	}
+}
+
+static inline void fixup_tlbie_pid(unsigned long pid)
+{
+	/*
+	 * We can use any address for the invalidation, pick one which is
+	 * probably unused as an optimisation.
+	 */
 	unsigned long va = ((1UL << 52) - 1);
 
-	if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_pid(0, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
 		asm volatile("ptesync": : :"memory");
 		__tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
 	}
 }
 
+static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
+				       unsigned long ap)
+{
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_lpid_va(va, 0, ap, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_lpid_va(va, lpid, ap, RIC_FLUSH_TLB);
+	}
+}
+
 static inline void fixup_tlbie_lpid(unsigned long lpid)
 {
+	/*
+	 * We can use any address for the invalidation, pick one which is
+	 * probably unused as an optimisation.
+	 */
 	unsigned long va = ((1UL << 52) - 1);
 
-	if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+		asm volatile("ptesync": : :"memory");
+		__tlbie_lpid(0, RIC_FLUSH_TLB);
+	}
+
+	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
 		asm volatile("ptesync": : :"memory");
 		__tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
 	}
@@ -258,6 +319,7 @@
 	switch (ric) {
 	case RIC_FLUSH_TLB:
 		__tlbie_pid(pid, RIC_FLUSH_TLB);
+		fixup_tlbie_pid(pid);
 		break;
 	case RIC_FLUSH_PWC:
 		__tlbie_pid(pid, RIC_FLUSH_PWC);
@@ -265,8 +327,8 @@
 	case RIC_FLUSH_ALL:
 	default:
 		__tlbie_pid(pid, RIC_FLUSH_ALL);
+		fixup_tlbie_pid(pid);
 	}
-	fixup_tlbie();
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
@@ -315,6 +377,7 @@
 	switch (ric) {
 	case RIC_FLUSH_TLB:
 		__tlbie_lpid(lpid, RIC_FLUSH_TLB);
+		fixup_tlbie_lpid(lpid);
 		break;
 	case RIC_FLUSH_PWC:
 		__tlbie_lpid(lpid, RIC_FLUSH_PWC);
@@ -322,8 +385,8 @@
 	case RIC_FLUSH_ALL:
 	default:
 		__tlbie_lpid(lpid, RIC_FLUSH_ALL);
+		fixup_tlbie_lpid(lpid);
 	}
-	fixup_tlbie_lpid(lpid);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
@@ -390,6 +453,8 @@
 
 	for (addr = start; addr < end; addr += page_size)
 		__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+
+	fixup_tlbie_va_range(addr - page_size, pid, ap);
 }
 
 static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
@@ -399,7 +464,7 @@
 
 	asm volatile("ptesync": : :"memory");
 	__tlbie_va(va, pid, ap, ric);
-	fixup_tlbie();
+	fixup_tlbie_va(va, pid, ap);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
@@ -457,7 +522,7 @@
 
 	asm volatile("ptesync": : :"memory");
 	__tlbie_lpid_va(va, lpid, ap, ric);
-	fixup_tlbie_lpid(lpid);
+	fixup_tlbie_lpid_va(va, lpid, ap);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
@@ -469,7 +534,6 @@
 	if (also_pwc)
 		__tlbie_pid(pid, RIC_FLUSH_PWC);
 	__tlbie_va_range(start, end, pid, page_size, psize);
-	fixup_tlbie();
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
@@ -856,7 +920,7 @@
 			if (gflush)
 				__tlbie_va_range(gstart, gend, pid,
 						PUD_SIZE, MMU_PAGE_1G);
-			fixup_tlbie();
+
 			asm volatile("eieio; tlbsync; ptesync": : :"memory");
 		} else {
 			_tlbiel_va_range_multicast(mm,
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a8953f1..73d4873f 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -667,7 +667,7 @@
 
 	BUG_ON(!PageCompound(page));
 
-	for (i = 0; i < (1UL << compound_order(page)); i++) {
+	for (i = 0; i < compound_nr(page); i++) {
 		if (!PageHighMem(page)) {
 			__flush_dcache_icache(page_address(page+i));
 		} else {
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a44f628..4e08246 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -172,6 +172,21 @@
 	vmemmap_list = vmem_back;
 }
 
+static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
+				unsigned long page_size)
+{
+	unsigned long nr_pfn = page_size / sizeof(struct page);
+	unsigned long start_pfn = page_to_pfn((struct page *)start);
+
+	if ((start_pfn + nr_pfn) > altmap->end_pfn)
+		return true;
+
+	if (start_pfn < altmap->base_pfn)
+		return true;
+
+	return false;
+}
+
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap)
 {
@@ -194,7 +209,7 @@
 		 * fail due to alignment issues when using 16MB hugepages, so
 		 * fall back to system memory if the altmap allocation fail.
 		 */
-		if (altmap) {
+		if (altmap && !altmap_cross_boundary(altmap, start, page_size)) {
 			p = altmap_alloc_block_buf(page_size, altmap);
 			if (!p)
 				pr_debug("altmap block allocation failed, falling back to system memory");
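
altmap_cross_boundary() is a pure range check: the struct pages backing the
chunk must fall entirely inside the altmap's [base_pfn, end_pfn) window, or
the allocation falls back to system memory. The same test on precomputed
pfns, with illustrative values:

    #include <assert.h>
    #include <stdbool.h>

    struct demo_altmap { unsigned long base_pfn, end_pfn; };

    static bool crosses(const struct demo_altmap *a,
                        unsigned long start_pfn, unsigned long nr_pfn)
    {
            return start_pfn + nr_pfn > a->end_pfn || start_pfn < a->base_pfn;
    }

    int main(void)
    {
            struct demo_altmap a = { .base_pfn = 0x1000, .end_pfn = 0x2000 };

            assert(!crosses(&a, 0x1000, 0x100));    /* inside: use altmap */
            assert(crosses(&a, 0x1f80, 0x100));     /* spills past end_pfn */
            assert(crosses(&a, 0x0f00, 0x100));     /* starts below base_pfn */
            return 0;
    }
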
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 802387b..0e6ed44 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -12,6 +12,14 @@
 #include <asm/code-patching.h>
 #include <mm/mmu_decl.h>
 
+static pgprot_t kasan_prot_ro(void)
+{
+	if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
+		return PAGE_READONLY;
+
+	return PAGE_KERNEL_RO;
+}
+
 static void kasan_populate_pte(pte_t *ptep, pgprot_t prot)
 {
 	unsigned long va = (unsigned long)kasan_early_shadow_page;
@@ -26,6 +34,7 @@
 {
 	pmd_t *pmd;
 	unsigned long k_cur, k_next;
+	pgprot_t prot = slab_is_available() ? kasan_prot_ro() : PAGE_KERNEL;
 
 	pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start);
 
@@ -43,10 +52,7 @@
 
 		if (!new)
 			return -ENOMEM;
-		if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
-			kasan_populate_pte(new, PAGE_READONLY);
-		else
-			kasan_populate_pte(new, PAGE_KERNEL_RO);
+		kasan_populate_pte(new, prot);
 
 		smp_wmb(); /* See comment in __pte_alloc */
 
@@ -103,11 +109,23 @@
 
 static void __init kasan_remap_early_shadow_ro(void)
 {
-	if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
-		kasan_populate_pte(kasan_early_shadow_pte, PAGE_READONLY);
-	else
-		kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO);
+	pgprot_t prot = kasan_prot_ro();
+	unsigned long k_start = KASAN_SHADOW_START;
+	unsigned long k_end = KASAN_SHADOW_END;
+	unsigned long k_cur;
+	phys_addr_t pa = __pa(kasan_early_shadow_page);
 
+	kasan_populate_pte(kasan_early_shadow_pte, prot);
+
+	for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
+		pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
+		pte_t *ptep = pte_offset_kernel(pmd, k_cur);
+
+		if ((pte_val(*ptep) & PTE_RPN_MASK) != pa)
+			continue;
+
+		__set_pte_at(&init_mm, k_cur, ptep, pfn_pte(PHYS_PFN(pa), prot), 0);
+	}
 	flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END);
 }
 
diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index a7b0521..ee4bd6d 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -25,7 +25,7 @@
 	count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
 	/* We allow PTE_FRAG_NR fragments from a PTE page */
 	if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) {
-		pgtable_page_dtor(page);
+		pgtable_pte_page_dtor(page);
 		__free_page(page);
 	}
 }
@@ -61,7 +61,7 @@
 		page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT);
 		if (!page)
 			return NULL;
-		if (!pgtable_page_ctor(page)) {
+		if (!pgtable_pte_page_ctor(page)) {
 			__free_page(page);
 			return NULL;
 		}
@@ -113,7 +113,7 @@
 	BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
 	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
 		if (!kernel)
-			pgtable_page_dtor(page);
+			pgtable_pte_page_dtor(page);
 		__free_page(page);
 	}
 }
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 065ff14..1d93e55 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -10,6 +10,8 @@
 
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 #include <linux/fsnotify.h>
 #include <linux/backing-dev.h>
 #include <linux/init.h>
@@ -20,7 +22,6 @@
 #include <linux/pagemap.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
-#include <linux/parser.h>
 
 #include <asm/prom.h>
 #include <asm/spu.h>
@@ -30,7 +31,7 @@
 #include "spufs.h"
 
 struct spufs_sb_info {
-	int debug;
+	bool debug;
 };
 
 static struct kmem_cache *spufs_inode_cache;
@@ -574,16 +575,27 @@
 }
 
 /* File system initialization */
-enum {
-	Opt_uid, Opt_gid, Opt_mode, Opt_debug, Opt_err,
+struct spufs_fs_context {
+	kuid_t	uid;
+	kgid_t	gid;
+	umode_t	mode;
 };
 
-static const match_table_t spufs_tokens = {
-	{ Opt_uid,   "uid=%d" },
-	{ Opt_gid,   "gid=%d" },
-	{ Opt_mode,  "mode=%o" },
-	{ Opt_debug, "debug" },
-	{ Opt_err,    NULL  },
+enum {
+	Opt_uid, Opt_gid, Opt_mode, Opt_debug,
+};
+
+static const struct fs_parameter_spec spufs_param_specs[] = {
+	fsparam_u32	("gid",				Opt_gid),
+	fsparam_u32oct	("mode",			Opt_mode),
+	fsparam_u32	("uid",				Opt_uid),
+	fsparam_flag	("debug",			Opt_debug),
+	{}
+};
+
+static const struct fs_parameter_description spufs_fs_parameters = {
+	.name		= "spufs",
+	.specs		= spufs_param_specs,
 };
 
 static int spufs_show_options(struct seq_file *m, struct dentry *root)
@@ -604,47 +616,41 @@
 	return 0;
 }
 
-static int
-spufs_parse_options(struct super_block *sb, char *options, struct inode *root)
+static int spufs_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
+	struct spufs_fs_context *ctx = fc->fs_private;
+	struct spufs_sb_info *sbi = fc->s_fs_info;
+	struct fs_parse_result result;
+	kuid_t uid;
+	kgid_t gid;
+	int opt;
 
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token, option;
+	opt = fs_parse(fc, &spufs_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
 
-		if (!*p)
-			continue;
-
-		token = match_token(p, spufs_tokens, args);
-		switch (token) {
-		case Opt_uid:
-			if (match_int(&args[0], &option))
-				return 0;
-			root->i_uid = make_kuid(current_user_ns(), option);
-			if (!uid_valid(root->i_uid))
-				return 0;
-			break;
-		case Opt_gid:
-			if (match_int(&args[0], &option))
-				return 0;
-			root->i_gid = make_kgid(current_user_ns(), option);
-			if (!gid_valid(root->i_gid))
-				return 0;
-			break;
-		case Opt_mode:
-			if (match_octal(&args[0], &option))
-				return 0;
-			root->i_mode = option | S_IFDIR;
-			break;
-		case Opt_debug:
-			spufs_get_sb_info(sb)->debug = 1;
-			break;
-		default:
-			return 0;
-		}
+	switch (opt) {
+	case Opt_uid:
+		uid = make_kuid(current_user_ns(), result.uint_32);
+		if (!uid_valid(uid))
+			return invalf(fc, "Unknown uid");
+		ctx->uid = uid;
+		break;
+	case Opt_gid:
+		gid = make_kgid(current_user_ns(), result.uint_32);
+		if (!gid_valid(gid))
+			return invalf(fc, "Unknown gid");
+		ctx->gid = gid;
+		break;
+	case Opt_mode:
+		ctx->mode = result.uint_32 & S_IALLUGO;
+		break;
+	case Opt_debug:
+		sbi->debug = true;
+		break;
 	}
-	return 1;
+
+	return 0;
 }
 
 static void spufs_exit_isolated_loader(void)
@@ -678,79 +684,98 @@
 	printk(KERN_INFO "spufs: SPU isolation mode enabled\n");
 }
 
-static int
-spufs_create_root(struct super_block *sb, void *data)
+static int spufs_create_root(struct super_block *sb, struct fs_context *fc)
 {
+	struct spufs_fs_context *ctx = fc->fs_private;
 	struct inode *inode;
-	int ret;
 
-	ret = -ENODEV;
 	if (!spu_management_ops)
-		goto out;
+		return -ENODEV;
 
-	ret = -ENOMEM;
-	inode = spufs_new_inode(sb, S_IFDIR | 0775);
+	inode = spufs_new_inode(sb, S_IFDIR | ctx->mode);
 	if (!inode)
-		goto out;
+		return -ENOMEM;
 
+	inode->i_uid = ctx->uid;
+	inode->i_gid = ctx->gid;
 	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 	SPUFS_I(inode)->i_ctx = NULL;
 	inc_nlink(inode);
 
-	ret = -EINVAL;
-	if (!spufs_parse_options(sb, data, inode))
-		goto out_iput;
-
-	ret = -ENOMEM;
 	sb->s_root = d_make_root(inode);
 	if (!sb->s_root)
-		goto out;
-
+		return -ENOMEM;
 	return 0;
-out_iput:
-	iput(inode);
-out:
-	return ret;
 }
 
-static int
-spufs_fill_super(struct super_block *sb, void *data, int silent)
+static const struct super_operations spufs_ops = {
+	.alloc_inode	= spufs_alloc_inode,
+	.free_inode	= spufs_free_inode,
+	.statfs		= simple_statfs,
+	.evict_inode	= spufs_evict_inode,
+	.show_options	= spufs_show_options,
+};
+
+static int spufs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
-	struct spufs_sb_info *info;
-	static const struct super_operations s_ops = {
-		.alloc_inode = spufs_alloc_inode,
-		.free_inode = spufs_free_inode,
-		.statfs = simple_statfs,
-		.evict_inode = spufs_evict_inode,
-		.show_options = spufs_show_options,
-	};
-
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = SPUFS_MAGIC;
-	sb->s_op = &s_ops;
-	sb->s_fs_info = info;
+	sb->s_op = &spufs_ops;
 
-	return spufs_create_root(sb, data);
+	return spufs_create_root(sb, fc);
 }
 
-static struct dentry *
-spufs_mount(struct file_system_type *fstype, int flags,
-		const char *name, void *data)
+static int spufs_get_tree(struct fs_context *fc)
 {
-	return mount_single(fstype, flags, data, spufs_fill_super);
+	return get_tree_single(fc, spufs_fill_super);
+}
+
+static void spufs_free_fc(struct fs_context *fc)
+{
+	kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations spufs_context_ops = {
+	.free		= spufs_free_fc,
+	.parse_param	= spufs_parse_param,
+	.get_tree	= spufs_get_tree,
+};
+
+static int spufs_init_fs_context(struct fs_context *fc)
+{
+	struct spufs_fs_context *ctx;
+	struct spufs_sb_info *sbi;
+
+	ctx = kzalloc(sizeof(struct spufs_fs_context), GFP_KERNEL);
+	if (!ctx)
+		goto nomem;
+
+	sbi = kzalloc(sizeof(struct spufs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		goto nomem_ctx;
+
+	ctx->uid = current_uid();
+	ctx->gid = current_gid();
+	ctx->mode = 0755;
+
+	fc->s_fs_info = sbi;
+	fc->ops = &spufs_context_ops;
+	return 0;
+
+nomem_ctx:
+	kfree(ctx);
+nomem:
+	return -ENOMEM;
 }
 
 static struct file_system_type spufs_type = {
 	.owner = THIS_MODULE,
 	.name = "spufs",
-	.mount = spufs_mount,
+	.init_fs_context = spufs_init_fs_context,
+	.parameters	= &spufs_fs_parameters,
 	.kill_sb = kill_litter_super,
 };
 MODULE_ALIAS_FS("spufs");
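
With spufs converted to the new mount API, mount(8) usage is unchanged, but
the filesystem can now also be driven through the fsopen(2) syscall family,
and each bad option produces a precise error via invalf() instead of the old
silent parse failure. A sketch of the syscall path, assuming a kernel and
headers new enough to provide fsopen/fsconfig (raw syscall(2) is used since
libc wrappers may be missing):

    #include <linux/mount.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            /* Roughly: mount -t spufs -o uid=100,mode=0750,debug ... */
            long fsfd = syscall(__NR_fsopen, "spufs", 0);

            if (fsfd < 0) {
                    perror("fsopen");
                    return 1;
            }
            syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "uid", "100", 0);
            syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_STRING, "mode", "0750", 0);
            syscall(__NR_fsconfig, fsfd, FSCONFIG_SET_FLAG, "debug", NULL, 0);
            if (syscall(__NR_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) {
                    perror("fsconfig"); /* spufs_parse_param() errors land here */
                    close(fsfd);
                    return 1;
            }
            /* fsmount()/move_mount() would attach the superblock somewhere. */
            close(fsfd);
            return 0;
    }
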
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 94cd96b..fbd6e6b 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -193,7 +193,7 @@
 		 * for coming online, which are handled via
 		 * generic_check_cpu_restart() calls.
 		 */
-		kvmppc_set_host_ipi(cpu, 0);
+		kvmppc_clear_host_ipi(cpu);
 
 		srr1 = pnv_cpu_offline(cpu);
 
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 36b846f..b533592 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -56,6 +56,22 @@
 EXPORT_SYMBOL(plpar_hcall9);
 EXPORT_SYMBOL(plpar_hcall_norets);
 
+/*
+ * H_BLOCK_REMOVE supported block size for this page size, in a segment whose
+ * base page size is that page size.
+ *
+ * The first index is the segment base page size, the second one is the actual
+ * page size.
+ */
+static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init;
+
+/*
+ * Due to the involved complexity, and because the current hypervisor only
+ * returns this value or 0, we limit the supported H_BLOCK_REMOVE block size
+ * to 8.
+ */
+#define HBLKRM_SUPPORTED_BLOCK_SIZE 8
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 static u8 dtl_mask = DTL_LOG_PREEMPT;
 #else
@@ -984,6 +1000,17 @@
 #define HBLKR_CTRL_ERRNOTFOUND	0x8800000000000000UL
 #define HBLKR_CTRL_ERRBUSY	0xa000000000000000UL
 
+/*
+ * Returns true if this block size is supported for the specified segment base
+ * page size and actual page size.
+ *
+ * Currently, only a block size of 8 is supported.
+ */
+static inline bool is_supported_hlbkrm(int bpsize, int psize)
+{
+	return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE);
+}
+
 /**
  * H_BLOCK_REMOVE caller.
  * @idx should point to the latest @param entry set with a PTEX.
@@ -1143,7 +1170,8 @@
 	if (lock_tlbie)
 		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
 
-	if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE))
+	/* Assuming THP size is 16M */
+	if (is_supported_hlbkrm(psize, MMU_PAGE_16M))
 		hugepage_block_invalidate(slot, vpn, count, psize, ssize);
 	else
 		hugepage_bulk_invalidate(slot, vpn, count, psize, ssize);
@@ -1312,6 +1340,137 @@
 }
 
 /*
+ * TLB Block Invalidate Characteristics
+ *
+ * These characteristics define the size of the block the hcall H_BLOCK_REMOVE
+ * is able to process for each couple segment base page size, actual page size.
+ *
+ * The ibm,get-system-parameter RTAS call returns a buffer with the
+ * following layout:
+ *
+ * [ 2 bytes size of the RTAS buffer (excluding these 2 bytes) ]
+ * -----------------
+ * TLB Block Invalidate Specifiers:
+ * [ 1 byte LOG base 2 of the TLB invalidate block size being specified ]
+ * [ 1 byte Number of page sizes (N) that are supported for the specified
+ *          TLB invalidate block size ]
+ * [ 1 byte Encoded segment base page size and actual page size
+ *          MSB=0 means 4k segment base page size and actual page size
+ *          MSB=1 the penc value in mmu_psize_def ]
+ * ...
+ * -----------------
+ * Next TLB Block Invalidate Specifiers...
+ * -----------------
+ * [ 0 ]
+ */
+static inline void set_hblkrm_bloc_size(int bpsize, int psize,
+					unsigned int block_size)
+{
+	if (block_size > hblkrm_size[bpsize][psize])
+		hblkrm_size[bpsize][psize] = block_size;
+}
+
+/*
+ * Decode the Encoded segment base page size and actual page size.
+ * PAPR specifies:
+ *   - bit 7 is the L bit
+ *   - bits 0-5 are the penc value
+ * If the L bit is 0, this means 4K segment base page size and actual page size
+ * otherwise the penc value should be read.
+ */
+#define HBLKRM_L_MASK		0x80
+#define HBLKRM_PENC_MASK	0x3f
+static inline void __init check_lp_set_hblkrm(unsigned int lp,
+					      unsigned int block_size)
+{
+	unsigned int bpsize, psize;
+
+	/* First, check the L bit, if not set, this means 4K */
+	if ((lp & HBLKRM_L_MASK) == 0) {
+		set_hblkrm_bloc_size(MMU_PAGE_4K, MMU_PAGE_4K, block_size);
+		return;
+	}
+
+	lp &= HBLKRM_PENC_MASK;
+	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) {
+		struct mmu_psize_def *def = &mmu_psize_defs[bpsize];
+
+		for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+			if (def->penc[psize] == lp) {
+				set_hblkrm_bloc_size(bpsize, psize, block_size);
+				return;
+			}
+		}
+	}
+}
+
+#define SPLPAR_TLB_BIC_TOKEN		50
+
+/*
+ * The size of the TLB Block Invalidate Characteristics is variable, but at
+ * most it will be 2 * the number of possible page sizes + 10 bytes.
+ * Currently MMU_PAGE_COUNT is 16, which means 42 bytes. Use a cache line size
+ * (128 bytes) for the buffer to get plenty of space.
+ */
+#define SPLPAR_TLB_BIC_MAXLENGTH	128
+
+void __init pseries_lpar_read_hblkrm_characteristics(void)
+{
+	unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH];
+	int call_status, len, idx, bpsize;
+
+	spin_lock(&rtas_data_buf_lock);
+	memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
+	call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+				NULL,
+				SPLPAR_TLB_BIC_TOKEN,
+				__pa(rtas_data_buf),
+				RTAS_DATA_BUF_SIZE);
+	memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH);
+	local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0';
+	spin_unlock(&rtas_data_buf_lock);
+
+	if (call_status != 0) {
+		pr_warn("%s %s Error calling get-system-parameter (0x%x)\n",
+			__FILE__, __func__, call_status);
+		return;
+	}
+
+	/*
+	 * The first two (2) bytes of the data in the buffer are the length of
+	 * the returned data, not counting these first two (2) bytes.
+	 */
+	len = be16_to_cpu(*((u16 *)local_buffer)) + 2;
+	if (len > SPLPAR_TLB_BIC_MAXLENGTH) {
+		pr_warn("%s: returned buffer too large (%d bytes)\n", __func__, len);
+		return;
+	}
+
+	idx = 2;
+	while (idx < len) {
+		u8 block_shift = local_buffer[idx++];
+		u32 block_size;
+		unsigned int npsize;
+
+		if (!block_shift)
+			break;
+
+		block_size = 1 << block_shift;
+
+		for (npsize = local_buffer[idx++];
+		     npsize > 0 && idx < len; npsize--)
+			check_lp_set_hblkrm((unsigned int) local_buffer[idx++],
+					    block_size);
+	}
+
+	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+		for (idx = 0; idx < MMU_PAGE_COUNT; idx++)
+			if (hblkrm_size[bpsize][idx])
+				pr_info("H_BLOCK_REMOVE supports base psize:%d psize:%d block size:%d\n",
+					bpsize, idx, hblkrm_size[bpsize][idx]);
+}
+
+/*
  * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
  * lock.
  */
@@ -1330,7 +1489,7 @@
 	if (lock_tlbie)
 		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
 
-	if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) {
+	if (is_supported_hlbkrm(batch->psize, batch->psize)) {
 		do_block_remove(number, batch, param);
 		goto out;
 	}
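As a worked example of the parsing above, consider this illustrative
buffer (the byte values are made up, not from real firmware):

	00 05   length = 5, excluding these two bytes
	03      block_shift = 3, so block_size = 1 << 3 = 8
	02      two encoded page-size bytes follow
	00      L bit clear: 4K segment base page size, 4K actual page size
	81      L bit set: penc = 1, looked up in mmu_psize_defs[]
	00      terminator

With this buffer, check_lp_set_hblkrm() records a block size of 8 for
the (4K, 4K) pair and for whichever (base, actual) page-size pair has
penc == 1, so is_supported_hlbkrm() then returns true for both.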
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index a5ac371..6188329 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -65,29 +65,21 @@
 		cond_resched();
 	} while (rc == H_BUSY);
 
-	if (rc) {
-		/* H_OVERLAP needs a separate error path */
-		if (rc == H_OVERLAP)
-			return -EBUSY;
-
-		dev_err(&p->pdev->dev, "bind err: %lld\n", rc);
-		return -ENXIO;
-	}
+	if (rc)
+		return rc;
 
 	p->bound_addr = saved;
-
-	dev_dbg(&p->pdev->dev, "bound drc %x to %pR\n", p->drc_index, &p->res);
-
-	return 0;
+	dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res);
+	return rc;
 }
 
-static int drc_pmem_unbind(struct papr_scm_priv *p)
+static void drc_pmem_unbind(struct papr_scm_priv *p)
 {
 	unsigned long ret[PLPAR_HCALL_BUFSIZE];
 	uint64_t token = 0;
 	int64_t rc;
 
-	dev_dbg(&p->pdev->dev, "unbind drc %x\n", p->drc_index);
+	dev_dbg(&p->pdev->dev, "unbind drc 0x%x\n", p->drc_index);
 
 	/* NB: unbind has the same retry requirements as drc_pmem_bind() */
 	do {
@@ -110,12 +102,48 @@
 	if (rc)
 		dev_err(&p->pdev->dev, "unbind error: %lld\n", rc);
 	else
-		dev_dbg(&p->pdev->dev, "unbind drc %x complete\n",
+		dev_dbg(&p->pdev->dev, "unbind drc 0x%x complete\n",
 			p->drc_index);
 
-	return rc == H_SUCCESS ? 0 : -ENXIO;
+	return;
 }
 
+static int drc_pmem_query_n_bind(struct papr_scm_priv *p)
+{
+	unsigned long start_addr;
+	unsigned long end_addr;
+	unsigned long ret[PLPAR_HCALL_BUFSIZE];
+	int64_t rc;
+
+
+	rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret,
+			 p->drc_index, 0);
+	if (rc)
+		goto err_out;
+	start_addr = ret[0];
+
+	/* Make sure the full region is bound. */
+	rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret,
+			 p->drc_index, p->blocks - 1);
+	if (rc)
+		goto err_out;
+	end_addr = ret[0];
+
+	if ((end_addr - start_addr) != ((p->blocks - 1) * p->block_size))
+		goto err_out;
+
+	p->bound_addr = start_addr;
+	dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, &p->res);
+	return rc;
+
+err_out:
+	dev_info(&p->pdev->dev,
+		 "Failed to query, trying an unbind followed by bind");
+	drc_pmem_unbind(p);
+	return drc_pmem_bind(p);
+}
+
+
 static int papr_scm_meta_get(struct papr_scm_priv *p,
 			     struct nd_cmd_get_config_data_hdr *hdr)
 {
@@ -436,14 +464,14 @@
 	rc = drc_pmem_bind(p);
 
 	/* If phyp says drc memory still bound then force unbound and retry */
-	if (rc == -EBUSY) {
-		dev_warn(&pdev->dev, "Retrying bind after unbinding\n");
-		drc_pmem_unbind(p);
-		rc = drc_pmem_bind(p);
-	}
+	if (rc == H_OVERLAP)
+		rc = drc_pmem_query_n_bind(p);
 
-	if (rc)
+	if (rc != H_SUCCESS) {
+		dev_err(&p->pdev->dev, "bind err: %d\n", rc);
+		rc = -ENXIO;
 		goto err;
+	}
 
 	/* setup the resource for the newly bound range */
 	p->res.start = p->bound_addr;
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index a6624d4..13fa370 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -112,5 +112,6 @@
 int dlpar_workqueue_init(void);
 
 void pseries_setup_rfi_flush(void);
+void pseries_lpar_read_hblkrm_characteristics(void);
 
 #endif /* _PSERIES_PSERIES_H */
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index f8adcd0..0a40201 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -744,6 +744,7 @@
 
 	pseries_setup_rfi_flush();
 	setup_stf_barrier();
+	pseries_lpar_read_hblkrm_characteristics();
 
 	/* By default, only probe PCI (can be overridden by rtas_pci) */
 	pci_add_flags(PCI_PROBE_ONLY);
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 485569f..7d13d2e 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -140,7 +140,7 @@
 
 static void icp_native_cause_ipi(int cpu)
 {
-	kvmppc_set_host_ipi(cpu, 1);
+	kvmppc_set_host_ipi(cpu);
 	icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
@@ -179,7 +179,7 @@
 	if (vec == XICS_IPI) {
 		/* Clear pending IPI */
 		int cpu = smp_processor_id();
-		kvmppc_set_host_ipi(cpu, 0);
+		kvmppc_clear_host_ipi(cpu);
 		icp_native_set_qirr(cpu, 0xff);
 	} else {
 		pr_err("XICS: hw interrupt 0x%x to offline cpu, disabling\n",
@@ -200,7 +200,7 @@
 {
 	int cpu = smp_processor_id();
 
-	kvmppc_set_host_ipi(cpu, 0);
+	kvmppc_clear_host_ipi(cpu);
 	icp_native_set_qirr(cpu, 0xff);
 
 	return smp_ipi_demux();
diff --git a/arch/powerpc/sysdev/xics/icp-opal.c b/arch/powerpc/sysdev/xics/icp-opal.c
index 8bb8dd7..68fd254 100644
--- a/arch/powerpc/sysdev/xics/icp-opal.c
+++ b/arch/powerpc/sysdev/xics/icp-opal.c
@@ -126,7 +126,7 @@
 {
 	int hw_cpu = get_hard_smp_processor_id(cpu);
 
-	kvmppc_set_host_ipi(cpu, 1);
+	kvmppc_set_host_ipi(cpu);
 	opal_int_set_mfrr(hw_cpu, IPI_PRIORITY);
 }
 
@@ -134,7 +134,7 @@
 {
 	int cpu = smp_processor_id();
 
-	kvmppc_set_host_ipi(cpu, 0);
+	kvmppc_clear_host_ipi(cpu);
 	opal_int_set_mfrr(get_hard_smp_processor_id(cpu), 0xff);
 
 	return smp_ipi_demux();
@@ -157,7 +157,7 @@
 		if (vec == XICS_IPI) {
 			/* Clear pending IPI */
 			int cpu = smp_processor_id();
-			kvmppc_set_host_ipi(cpu, 0);
+			kvmppc_clear_host_ipi(cpu);
 			opal_int_set_mfrr(get_hard_smp_processor_id(cpu), 0xff);
 		} else {
 			pr_err("XICS: hw interrupt 0x%x to offline cpu, "
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 71d29fb..8eebbc8 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -59,6 +59,18 @@
 	select ARCH_HAS_GIGANTIC_PAGE
 	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
 	select SPARSEMEM_STATIC if 32BIT
+	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
+	select HAVE_ARCH_MMAP_RND_BITS
+
+config ARCH_MMAP_RND_BITS_MIN
+	default 18 if 64BIT
+	default 8
+
+# max bits determined by the following formula:
+#  VA_BITS - PAGE_SHIFT - 3
+config ARCH_MMAP_RND_BITS_MAX
+	default 24 if 64BIT # SV39 based
+	default 17
 
 config MMU
 	def_bool y
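For the record, the SV39 maximum follows directly from the formula in
the comment: VA_BITS - PAGE_SHIFT - 3 = 39 - 12 - 3 = 24. The 32-bit
default of 17 matches a full 32-bit virtual address space the same way:
32 - 12 - 3 = 17.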
diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi
index 42b5ec2..afa43c7 100644
--- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi
+++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi
@@ -13,6 +13,7 @@
 	aliases {
 		serial0 = &uart0;
 		serial1 = &uart1;
+		ethernet0 = &eth0;
 	};
 
 	chosen {
@@ -60,7 +61,6 @@
 			};
 		};
 		cpu2: cpu@2 {
-			clock-frequency = <0>;
 			compatible = "sifive,u54-mc", "sifive,rocket0", "riscv";
 			d-cache-block-size = <64>;
 			d-cache-sets = <64>;
@@ -84,7 +84,6 @@
 			};
 		};
 		cpu3: cpu@3 {
-			clock-frequency = <0>;
 			compatible = "sifive,u54-mc", "sifive,rocket0", "riscv";
 			d-cache-block-size = <64>;
 			d-cache-sets = <64>;
@@ -108,7 +107,6 @@
 			};
 		};
 		cpu4: cpu@4 {
-			clock-frequency = <0>;
 			compatible = "sifive,u54-mc", "sifive,rocket0", "riscv";
 			d-cache-block-size = <64>;
 			d-cache-sets = <64>;
@@ -230,6 +228,24 @@
 			#size-cells = <0>;
 			status = "disabled";
 		};
+		pwm0: pwm@10020000 {
+			compatible = "sifive,fu540-c000-pwm", "sifive,pwm0";
+			reg = <0x0 0x10020000 0x0 0x1000>;
+			interrupt-parent = <&plic0>;
+			interrupts = <42 43 44 45>;
+			clocks = <&prci PRCI_CLK_TLCLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+		pwm1: pwm@10021000 {
+			compatible = "sifive,fu540-c000-pwm", "sifive,pwm0";
+			reg = <0x0 0x10021000 0x0 0x1000>;
+			interrupt-parent = <&plic0>;
+			interrupts = <46 47 48 49>;
+			clocks = <&prci PRCI_CLK_TLCLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
 
 	};
 };
diff --git a/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts
index 93d68cb..104d334 100644
--- a/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts
+++ b/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts
@@ -85,3 +85,11 @@
 		reg = <0>;
 	};
 };
+
+&pwm0 {
+	status = "okay";
+};
+
+&pwm1 {
+	status = "okay";
+};
diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig
index 3efff55..420a0db 100644
--- a/arch/riscv/configs/defconfig
+++ b/arch/riscv/configs/defconfig
@@ -29,6 +29,8 @@
 CONFIG_IP_PNP_BOOTP=y
 CONFIG_IP_PNP_RARP=y
 CONFIG_NETLINK_DIAG=y
+CONFIG_NET_9P=y
+CONFIG_NET_9P_VIRTIO=y
 CONFIG_PCI=y
 CONFIG_PCIEPORTBUS=y
 CONFIG_PCI_HOST_GENERIC=y
@@ -39,6 +41,7 @@
 CONFIG_VIRTIO_BLK=y
 CONFIG_BLK_DEV_SD=y
 CONFIG_BLK_DEV_SR=y
+CONFIG_SCSI_VIRTIO=y
 CONFIG_ATA=y
 CONFIG_SATA_AHCI=y
 CONFIG_SATA_AHCI_PLATFORM=y
@@ -54,6 +57,7 @@
 CONFIG_SERIAL_OF_PLATFORM=y
 CONFIG_SERIAL_EARLYCON_RISCV_SBI=y
 CONFIG_HVC_RISCV_SBI=y
+CONFIG_VIRTIO_CONSOLE=y
 CONFIG_HW_RANDOM=y
 CONFIG_HW_RANDOM_VIRTIO=y
 CONFIG_SPI=y
@@ -61,6 +65,7 @@
 # CONFIG_PTP_1588_CLOCK is not set
 CONFIG_DRM=y
 CONFIG_DRM_RADEON=y
+CONFIG_DRM_VIRTIO_GPU=y
 CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_USB=y
 CONFIG_USB_XHCI_HCD=y
@@ -73,7 +78,12 @@
 CONFIG_USB_UAS=y
 CONFIG_MMC=y
 CONFIG_MMC_SPI=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_BALLOON=y
+CONFIG_VIRTIO_INPUT=y
 CONFIG_VIRTIO_MMIO=y
+CONFIG_RPMSG_CHAR=y
+CONFIG_RPMSG_VIRTIO=y
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
 CONFIG_AUTOFS4_FS=y
@@ -86,6 +96,7 @@
 CONFIG_NFS_V4_1=y
 CONFIG_NFS_V4_2=y
 CONFIG_ROOT_NFS=y
+CONFIG_9P_FS=y
 CONFIG_CRYPTO_USER_API_HASH=y
 CONFIG_CRYPTO_DEV_VIRTIO=y
 CONFIG_PRINTK_TIME=y
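The virtio and 9p options added in this defconfig are what a QEMU
"virt" machine needs for a paravirtualized setup, including a 9p root
filesystem. A hedged example invocation (paths and tags are
illustrative, and exact QEMU flags vary by version):

	qemu-system-riscv64 -M virt -nographic -kernel Image \
		-fsdev local,id=rootfs,path=/srv/riscv-root,security_model=none \
		-device virtio-9p-device,fsdev=rootfs,mount_tag=/dev/root \
		-append "root=/dev/root rw rootfstype=9p rootflags=trans=virtio"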
diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig
index 7da93e4..87ee6e6 100644
--- a/arch/riscv/configs/rv32_defconfig
+++ b/arch/riscv/configs/rv32_defconfig
@@ -29,6 +29,8 @@
 CONFIG_IP_PNP_BOOTP=y
 CONFIG_IP_PNP_RARP=y
 CONFIG_NETLINK_DIAG=y
+CONFIG_NET_9P=y
+CONFIG_NET_9P_VIRTIO=y
 CONFIG_PCI=y
 CONFIG_PCIEPORTBUS=y
 CONFIG_PCI_HOST_GENERIC=y
@@ -39,6 +41,7 @@
 CONFIG_VIRTIO_BLK=y
 CONFIG_BLK_DEV_SD=y
 CONFIG_BLK_DEV_SR=y
+CONFIG_SCSI_VIRTIO=y
 CONFIG_ATA=y
 CONFIG_SATA_AHCI=y
 CONFIG_SATA_AHCI_PLATFORM=y
@@ -54,11 +57,13 @@
 CONFIG_SERIAL_OF_PLATFORM=y
 CONFIG_SERIAL_EARLYCON_RISCV_SBI=y
 CONFIG_HVC_RISCV_SBI=y
+CONFIG_VIRTIO_CONSOLE=y
 CONFIG_HW_RANDOM=y
 CONFIG_HW_RANDOM_VIRTIO=y
 # CONFIG_PTP_1588_CLOCK is not set
 CONFIG_DRM=y
 CONFIG_DRM_RADEON=y
+CONFIG_DRM_VIRTIO_GPU=y
 CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_USB=y
 CONFIG_USB_XHCI_HCD=y
@@ -69,7 +74,12 @@
 CONFIG_USB_OHCI_HCD_PLATFORM=y
 CONFIG_USB_STORAGE=y
 CONFIG_USB_UAS=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_BALLOON=y
+CONFIG_VIRTIO_INPUT=y
 CONFIG_VIRTIO_MMIO=y
+CONFIG_RPMSG_CHAR=y
+CONFIG_RPMSG_VIRTIO=y
 CONFIG_SIFIVE_PLIC=y
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
@@ -83,6 +93,7 @@
 CONFIG_NFS_V4_1=y
 CONFIG_NFS_V4_2=y
 CONFIG_ROOT_NFS=y
+CONFIG_9P_FS=y
 CONFIG_CRYPTO_USER_API_HASH=y
 CONFIG_CRYPTO_DEV_VIRTIO=y
 CONFIG_PRINTK_TIME=y
diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 56a67d6..d59ea92 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -78,12 +78,8 @@
 
 #define __pte_free_tlb(tlb, pte, buf)   \
 do {                                    \
-	pgtable_page_dtor(pte);         \
+	pgtable_pte_page_dtor(pte);     \
 	tlb_remove_page((tlb), pte);    \
 } while (0)
 
-static inline void check_pgt_cache(void)
-{
-}
-
 #endif /* _ASM_RISCV_PGALLOC_H */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 80905b2..7255f2d 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -83,6 +83,18 @@
 #define __S110	PAGE_SHARED_EXEC
 #define __S111	PAGE_SHARED_EXEC
 
+#define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
+#define VMALLOC_END      (PAGE_OFFSET - 1)
+#define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
+
+#define FIXADDR_TOP      VMALLOC_START
+#ifdef CONFIG_64BIT
+#define FIXADDR_SIZE     PMD_SIZE
+#else
+#define FIXADDR_SIZE     PGDIR_SIZE
+#endif
+#define FIXADDR_START    (FIXADDR_TOP - FIXADDR_SIZE)
+
 /*
  * Roughly size the vmemmap space to be large enough to fit enough
  * struct pages to map half the virtual address space. Then
@@ -424,23 +436,6 @@
 extern void setup_bootmem(void);
 extern void paging_init(void);
 
-static inline void pgtable_cache_init(void)
-{
-	/* No page table caches to initialize */
-}
-
-#define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
-#define VMALLOC_END      (PAGE_OFFSET - 1)
-#define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
-
-#define FIXADDR_TOP      VMALLOC_START
-#ifdef CONFIG_64BIT
-#define FIXADDR_SIZE     PMD_SIZE
-#else
-#define FIXADDR_SIZE     PGDIR_SIZE
-#endif
-#define FIXADDR_START    (FIXADDR_TOP - FIXADDR_SIZE)
-
 /*
  * Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32.
  * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 74ccfd4..da7aa88 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -166,9 +166,13 @@
 	move a0, sp /* pt_regs */
 	tail do_IRQ
 1:
-	/* Exceptions run with interrupts enabled */
+	/* Exceptions run with interrupts enabled or disabled,
+	 * depending on the state of sstatus.SR_SPIE */
+	andi t0, s1, SR_SPIE
+	beqz t0, 1f
 	csrs CSR_SSTATUS, SR_SIE
 
+1:
 	/* Handle syscalls */
 	li t0, EXC_SYSCALL
 	beq s4, t0, handle_syscall
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 15a9189..72f89b7 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -63,6 +63,11 @@
 	li t0, SR_FS
 	csrc CSR_SSTATUS, t0
 
+#ifdef CONFIG_SMP
+	li t0, CONFIG_NR_CPUS
+	bgeu a0, t0, .Lsecondary_park
+#endif
+
 	/* Pick one hart to run the main boot sequence */
 	la a3, hart_lottery
 	li a2, 1
@@ -154,9 +159,6 @@
 
 .Lsecondary_start:
 #ifdef CONFIG_SMP
-	li a1, CONFIG_NR_CPUS
-	bgeu a0, a1, .Lsecondary_park
-
 	/* Set trap vector to spin forever to help debug */
 	la a3, .Lsecondary_park
 	csrw CSR_STVEC, a3
diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c
index 3836760..b18cd6c 100644
--- a/arch/riscv/kernel/smp.c
+++ b/arch/riscv/kernel/smp.c
@@ -206,3 +206,4 @@
 {
 	send_ipi_single(cpu, IPI_RESCHEDULE);
 }
+EXPORT_SYMBOL_GPL(smp_send_reschedule);
diff --git a/arch/riscv/kernel/time.c b/arch/riscv/kernel/time.c
index 541a2b8..9dd1f2e 100644
--- a/arch/riscv/kernel/time.c
+++ b/arch/riscv/kernel/time.c
@@ -9,6 +9,7 @@
 #include <asm/sbi.h>
 
 unsigned long riscv_timebase;
+EXPORT_SYMBOL_GPL(riscv_timebase);
 
 void __init time_init(void)
 {
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index f933a47..43a81d0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -554,9 +554,9 @@
 	def_bool y
 	depends on KEXEC_FILE
 
-config KEXEC_VERIFY_SIG
+config KEXEC_SIG
 	bool "Verify kernel signature during kexec_file_load() syscall"
-	depends on KEXEC_FILE && SYSTEM_DATA_VERIFICATION
+	depends on KEXEC_FILE && MODULE_SIG_FORMAT
 	help
 	  This option makes kernel signature verification mandatory for
 	  the kexec_file_load() syscall.
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index a4418fc..70139d0 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -12,17 +12,17 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 #include <linux/namei.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/time.h>
-#include <linux/parser.h>
 #include <linux/sysfs.h>
 #include <linux/init.h>
 #include <linux/kobject.h>
 #include <linux/seq_file.h>
-#include <linux/mount.h>
 #include <linux/uio.h>
 #include <asm/ebcdic.h>
 #include "hypfs.h"
@@ -207,52 +207,44 @@
 	return 0;
 }
 
-enum { opt_uid, opt_gid, opt_err };
+enum { Opt_uid, Opt_gid, };
 
-static const match_table_t hypfs_tokens = {
-	{opt_uid, "uid=%u"},
-	{opt_gid, "gid=%u"},
-	{opt_err, NULL}
+static const struct fs_parameter_spec hypfs_param_specs[] = {
+	fsparam_u32("gid", Opt_gid),
+	fsparam_u32("uid", Opt_uid),
+	{}
 };
 
-static int hypfs_parse_options(char *options, struct super_block *sb)
+static const struct fs_parameter_description hypfs_fs_parameters = {
+	.name		= "hypfs",
+	.specs		= hypfs_param_specs,
+};
+
+static int hypfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	char *str;
-	substring_t args[MAX_OPT_ARGS];
+	struct hypfs_sb_info *hypfs_info = fc->s_fs_info;
+	struct fs_parse_result result;
 	kuid_t uid;
 	kgid_t gid;
+	int opt;
 
-	if (!options)
-		return 0;
-	while ((str = strsep(&options, ",")) != NULL) {
-		int token, option;
-		struct hypfs_sb_info *hypfs_info = sb->s_fs_info;
+	opt = fs_parse(fc, &hypfs_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
 
-		if (!*str)
-			continue;
-		token = match_token(str, hypfs_tokens, args);
-		switch (token) {
-		case opt_uid:
-			if (match_int(&args[0], &option))
-				return -EINVAL;
-			uid = make_kuid(current_user_ns(), option);
-			if (!uid_valid(uid))
-				return -EINVAL;
-			hypfs_info->uid = uid;
-			break;
-		case opt_gid:
-			if (match_int(&args[0], &option))
-				return -EINVAL;
-			gid = make_kgid(current_user_ns(), option);
-			if (!gid_valid(gid))
-				return -EINVAL;
-			hypfs_info->gid = gid;
-			break;
-		case opt_err:
-		default:
-			pr_err("%s is not a valid mount option\n", str);
-			return -EINVAL;
-		}
+	switch (opt) {
+	case Opt_uid:
+		uid = make_kuid(current_user_ns(), result.uint_32);
+		if (!uid_valid(uid))
+			return invalf(fc, "Unknown uid");
+		hypfs_info->uid = uid;
+		break;
+	case Opt_gid:
+		gid = make_kgid(current_user_ns(), result.uint_32);
+		if (!gid_valid(gid))
+			return invalf(fc, "Unknown gid");
+		hypfs_info->gid = gid;
+		break;
 	}
 	return 0;
 }
@@ -266,26 +258,18 @@
 	return 0;
 }
 
-static int hypfs_fill_super(struct super_block *sb, void *data, int silent)
+static int hypfs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
+	struct hypfs_sb_info *sbi = sb->s_fs_info;
 	struct inode *root_inode;
-	struct dentry *root_dentry;
-	int rc = 0;
-	struct hypfs_sb_info *sbi;
+	struct dentry *root_dentry, *update_file;
+	int rc;
 
-	sbi = kzalloc(sizeof(struct hypfs_sb_info), GFP_KERNEL);
-	if (!sbi)
-		return -ENOMEM;
-	mutex_init(&sbi->lock);
-	sbi->uid = current_uid();
-	sbi->gid = current_gid();
-	sb->s_fs_info = sbi;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = HYPFS_MAGIC;
 	sb->s_op = &hypfs_s_ops;
-	if (hypfs_parse_options(data, sb))
-		return -EINVAL;
+
 	root_inode = hypfs_make_inode(sb, S_IFDIR | 0755);
 	if (!root_inode)
 		return -ENOMEM;
@@ -300,18 +284,46 @@
 		rc = hypfs_diag_create_files(root_dentry);
 	if (rc)
 		return rc;
-	sbi->update_file = hypfs_create_update_file(root_dentry);
-	if (IS_ERR(sbi->update_file))
-		return PTR_ERR(sbi->update_file);
+	update_file = hypfs_create_update_file(root_dentry);
+	if (IS_ERR(update_file))
+		return PTR_ERR(update_file);
+	sbi->update_file = update_file;
 	hypfs_update_update(sb);
 	pr_info("Hypervisor filesystem mounted\n");
 	return 0;
 }
 
-static struct dentry *hypfs_mount(struct file_system_type *fst, int flags,
-			const char *devname, void *data)
+static int hypfs_get_tree(struct fs_context *fc)
 {
-	return mount_single(fst, flags, data, hypfs_fill_super);
+	return get_tree_single(fc, hypfs_fill_super);
+}
+
+static void hypfs_free_fc(struct fs_context *fc)
+{
+	kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations hypfs_context_ops = {
+	.free		= hypfs_free_fc,
+	.parse_param	= hypfs_parse_param,
+	.get_tree	= hypfs_get_tree,
+};
+
+static int hypfs_init_fs_context(struct fs_context *fc)
+{
+	struct hypfs_sb_info *sbi;
+
+	sbi = kzalloc(sizeof(struct hypfs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	mutex_init(&sbi->lock);
+	sbi->uid = current_uid();
+	sbi->gid = current_gid();
+
+	fc->s_fs_info = sbi;
+	fc->ops = &hypfs_context_ops;
+	return 0;
 }
 
 static void hypfs_kill_super(struct super_block *sb)
@@ -442,7 +454,8 @@
 static struct file_system_type hypfs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "s390_hypfs",
-	.mount		= hypfs_mount,
+	.init_fs_context = hypfs_init_fs_context,
+	.parameters	= &hypfs_fs_parameters,
 	.kill_sb	= hypfs_kill_super
 };
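With the converted parser, a bad uid/gid is reported through the
fs_context log via invalf() (which also returns -EINVAL) instead of a
bare pr_err(). A usage sketch (the mount point is illustrative):

	# mount -t s390_hypfs -o uid=1000,gid=1000 none /mnt/hypfs

fs_parse() matches "uid=1000" against fsparam_u32("uid", Opt_uid) and
hands the numeric value to hypfs_parse_param() in result.uint_32, which
is then validated with make_kuid()/uid_valid().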
 
diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index ae3e3221..ceeb552 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -70,7 +70,7 @@
 	unsigned long tear;	    /* 24-31: TEAR contents		 */
 	unsigned long dear;	    /* 32-39: DEAR contents		 */
 	unsigned int rsvrd0;	    /* 40-43: reserved			 */
-	unsigned int cpu_speed;     /* 44-47: CPU speed 		 */
+	unsigned int cpu_speed;     /* 44-47: CPU speed			 */
 	unsigned long long rsvrd1;  /* 48-55: reserved			 */
 	unsigned long long rsvrd2;  /* 56-63: reserved			 */
 } __packed;
@@ -89,10 +89,10 @@
 	unsigned long tear;	    /* 16-23: TEAR contents		 */
 	unsigned long dear;	    /* 24-31: DEAR contents		 */
 	/* 32-63:							 */
-	unsigned long rsvrd1;	    /* reserved 			 */
-	unsigned long rsvrd2;	    /* reserved 			 */
-	unsigned long rsvrd3;	    /* reserved 			 */
-	unsigned long rsvrd4;	    /* reserved 			 */
+	unsigned long rsvrd1;	    /* reserved				 */
+	unsigned long rsvrd2;	    /* reserved				 */
+	unsigned long rsvrd3;	    /* reserved				 */
+	unsigned long rsvrd4;	    /* reserved				 */
 } __packed;
 
 struct hws_basic_entry {
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index 560d8f7..4652fff 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -60,6 +60,7 @@
 #define PERF_CPUM_SF_MODE_MASK		(PERF_CPUM_SF_BASIC_MODE| \
 					 PERF_CPUM_SF_DIAG_MODE)
 #define PERF_CPUM_SF_FULL_BLOCKS	0x0004	  /* Process full SDBs only */
+#define PERF_CPUM_SF_FREQ_MODE		0x0008	  /* Sampling with frequency */
 
 #define REG_NONE		0
 #define REG_OVERFLOW		1
@@ -70,5 +71,6 @@
 #define SAMPL_FLAGS(hwc)	((hwc)->config_base)
 #define SAMPL_DIAG_MODE(hwc)	(SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE)
 #define SDB_FULL_BLOCKS(hwc)	(SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FULL_BLOCKS)
+#define SAMPLE_FREQ_MODE(hwc)	(SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE)
 
 #endif /* _ASM_S390_PERF_EVENT_H */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 0c46007..36c578c 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1682,12 +1682,6 @@
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 
-/*
- * No page table caches to initialise
- */
-static inline void pgtable_cache_init(void) { }
-static inline void check_pgt_cache(void) { }
-
 #include <asm-generic/pgtable.h>
 
 #endif /* _S390_PAGE_H */
diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h
index 8c5755f..f9e5e1f 100644
--- a/arch/s390/include/uapi/asm/zcrypt.h
+++ b/arch/s390/include/uapi/asm/zcrypt.h
@@ -4,7 +4,7 @@
  *
  *  zcrypt 2.2.1 (user-visible header)
  *
- *  Copyright IBM Corp. 2001, 2018
+ *  Copyright IBM Corp. 2001, 2019
  *  Author(s): Robert Burroughs
  *	       Eric Rossman (edrossma@us.ibm.com)
  *
@@ -286,7 +286,7 @@
  *	 0x08: CEX3A
  *	 0x0a: CEX4
  *	 0x0b: CEX5
- *	 0x0c: CEX6
+ *	 0x0c: CEX6 and CEX7
  *	 0x0d: device is disabled
  *
  *   ZCRYPT_QDEPTH_MASK
diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c
index 6d0635c..9da6fa3 100644
--- a/arch/s390/kernel/kexec_elf.c
+++ b/arch/s390/kernel/kexec_elf.c
@@ -130,7 +130,7 @@
 const struct kexec_file_ops s390_kexec_elf_ops = {
 	.probe = s390_elf_probe,
 	.load = s390_elf_load,
-#ifdef CONFIG_KEXEC_VERIFY_SIG
+#ifdef CONFIG_KEXEC_SIG
 	.verify_sig = s390_verify_sig,
-#endif /* CONFIG_KEXEC_VERIFY_SIG */
+#endif /* CONFIG_KEXEC_SIG */
 };
diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c
index 58318bf..af23eff 100644
--- a/arch/s390/kernel/kexec_image.c
+++ b/arch/s390/kernel/kexec_image.c
@@ -59,7 +59,7 @@
 const struct kexec_file_ops s390_kexec_image_ops = {
 	.probe = s390_image_probe,
 	.load = s390_image_load,
-#ifdef CONFIG_KEXEC_VERIFY_SIG
+#ifdef CONFIG_KEXEC_SIG
 	.verify_sig = s390_verify_sig,
-#endif /* CONFIG_KEXEC_VERIFY_SIG */
+#endif /* CONFIG_KEXEC_SIG */
 };
diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
index fbdd3ea..8415ae7 100644
--- a/arch/s390/kernel/machine_kexec_file.c
+++ b/arch/s390/kernel/machine_kexec_file.c
@@ -10,7 +10,7 @@
 #include <linux/elf.h>
 #include <linux/errno.h>
 #include <linux/kexec.h>
-#include <linux/module.h>
+#include <linux/module_signature.h>
 #include <linux/verification.h>
 #include <asm/boot_data.h>
 #include <asm/ipl.h>
@@ -22,29 +22,7 @@
 	NULL,
 };
 
-#ifdef CONFIG_KEXEC_VERIFY_SIG
-/*
- * Module signature information block.
- *
- * The constituents of the signature section are, in order:
- *
- *	- Signer's name
- *	- Key identifier
- *	- Signature data
- *	- Information block
- */
-struct module_signature {
-	u8	algo;		/* Public-key crypto algorithm [0] */
-	u8	hash;		/* Digest algorithm [0] */
-	u8	id_type;	/* Key identifier type [PKEY_ID_PKCS7] */
-	u8	signer_len;	/* Length of signer's name [0] */
-	u8	key_id_len;	/* Length of key identifier [0] */
-	u8	__pad[3];
-	__be32	sig_len;	/* Length of signature data */
-};
-
-#define PKEY_ID_PKCS7 2
-
+#ifdef CONFIG_KEXEC_SIG
 int s390_verify_sig(const char *kernel, unsigned long kernel_len)
 {
 	const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1;
@@ -90,7 +68,7 @@
 				      VERIFYING_MODULE_SIGNATURE,
 				      NULL, NULL);
 }
-#endif /* CONFIG_KEXEC_VERIFY_SIG */
+#endif /* CONFIG_KEXEC_SIG */
 
 static int kexec_file_update_purgatory(struct kimage *image,
 				       struct s390_load_data *data)
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 292a452..544a02e 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -673,13 +673,89 @@
 	rcu_read_unlock();
 }
 
+static unsigned long getrate(bool freq, unsigned long sample,
+			     struct hws_qsi_info_block *si)
+{
+	unsigned long rate;
+
+	if (freq) {
+		rate = freq_to_sample_rate(si, sample);
+		rate = hw_limit_rate(si, rate);
+	} else {
+		/* The min/max sampling rates specify the valid range
+		 * of sample periods.  If the specified sample period is
+		 * out of range, limit the period to the range boundary.
+		 */
+		rate = hw_limit_rate(si, sample);
+
+		/* The perf core maintains a maximum sample rate that is
+		 * configurable through the sysctl interface.  Ensure the
+		 * sampling rate does not exceed this value.  This also helps
+		 * to avoid throttling when pushing samples with
+		 * perf_event_overflow().
+		 */
+		if (sample_rate_to_freq(si, rate) >
+		    sysctl_perf_event_sample_rate) {
+			debug_sprintf_event(sfdbg, 1,
+					    "Sampling rate exceeds maximum "
+					    "perf sample rate\n");
+			rate = 0;
+		}
+	}
+	return rate;
+}
+
+/* The sampling information (si) contains information about the
+ * min/max sampling intervals and the CPU speed.  So calculate the
+ * correct sampling interval and avoid the whole period adjust
+ * feedback loop.
+ *
+ * Since the CPU Measurement sampling facility cannot handle frequency
+ * directly, calculate the sampling interval from a specified frequency
+ * using this formula:
+ *	interval := cpu_speed * 1000000 / sample_freq
+ *
+ * Returns errno on bad input and zero on success with parameter interval
+ * set to the correct sampling rate.
+ *
+ * Note: This function turns off the freq bit to avoid calling
+ * perf_adjust_period(), whose frequency adjustment in the common code
+ * otherwise causes tremendous variations in the counter values.
+ */
+static int __hw_perf_event_init_rate(struct perf_event *event,
+				     struct hws_qsi_info_block *si)
+{
+	struct perf_event_attr *attr = &event->attr;
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned long rate;
+
+	if (attr->freq) {
+		if (!attr->sample_freq)
+			return -EINVAL;
+		rate = getrate(attr->freq, attr->sample_freq, si);
+		attr->freq = 0;		/* Don't call perf_adjust_period() */
+		SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FREQ_MODE;
+	} else {
+		rate = getrate(attr->freq, attr->sample_period, si);
+		if (!rate)
+			return -EINVAL;
+	}
+	attr->sample_period = rate;
+	SAMPL_RATE(hwc) = rate;
+	hw_init_period(hwc, SAMPL_RATE(hwc));
+	debug_sprintf_event(sfdbg, 4, "__hw_perf_event_init_rate:"
+			    "cpu:%d period:%llx freq:%d,%#lx\n", event->cpu,
+			    event->attr.sample_period, event->attr.freq,
+			    SAMPLE_FREQ_MODE(hwc));
+	return 0;
+}
+
 static int __hw_perf_event_init(struct perf_event *event)
 {
 	struct cpu_hw_sf *cpuhw;
 	struct hws_qsi_info_block si;
 	struct perf_event_attr *attr = &event->attr;
 	struct hw_perf_event *hwc = &event->hw;
-	unsigned long rate;
 	int cpu, err;
 
 	/* Reserve CPU-measurement sampling facility */
@@ -745,43 +821,9 @@
 	if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS)
 		SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS;
 
-	/* The sampling information (si) contains information about the
-	 * min/max sampling intervals and the CPU speed.  So calculate the
-	 * correct sampling interval and avoid the whole period adjust
-	 * feedback loop.
-	 */
-	rate = 0;
-	if (attr->freq) {
-		if (!attr->sample_freq) {
-			err = -EINVAL;
-			goto out;
-		}
-		rate = freq_to_sample_rate(&si, attr->sample_freq);
-		rate = hw_limit_rate(&si, rate);
-		attr->freq = 0;
-		attr->sample_period = rate;
-	} else {
-		/* The min/max sampling rates specifies the valid range
-		 * of sample periods.  If the specified sample period is
-		 * out of range, limit the period to the range boundary.
-		 */
-		rate = hw_limit_rate(&si, hwc->sample_period);
-
-		/* The perf core maintains a maximum sample rate that is
-		 * configurable through the sysctl interface.  Ensure the
-		 * sampling rate does not exceed this value.  This also helps
-		 * to avoid throttling when pushing samples with
-		 * perf_event_overflow().
-		 */
-		if (sample_rate_to_freq(&si, rate) >
-		      sysctl_perf_event_sample_rate) {
-			err = -EINVAL;
-			debug_sprintf_event(sfdbg, 1, "Sampling rate exceeds maximum perf sample rate\n");
-			goto out;
-		}
-	}
-	SAMPL_RATE(hwc) = rate;
-	hw_init_period(hwc, SAMPL_RATE(hwc));
+	err = __hw_perf_event_init_rate(event, &si);
+	if (err)
+		goto out;
 
 	/* Initialize sample data overflow accounting */
 	hwc->extra_reg.reg = REG_OVERFLOW;
@@ -904,6 +946,8 @@
 			if (sfb_has_pending_allocs(&cpuhw->sfb, hwc))
 				extend_sampling_buffer(&cpuhw->sfb, hwc);
 		}
+		/* Rate may be adjusted with ioctl() */
+		cpuhw->lsctl.interval = SAMPL_RATE(&cpuhw->event->hw);
 	}
 
 	/* (Re)enable the PMU and sampling facility */
@@ -922,8 +966,9 @@
 	lpp(&S390_lowcore.lpp);
 
 	debug_sprintf_event(sfdbg, 6, "pmu_enable: es=%i cs=%i ed=%i cd=%i "
-			    "tear=%p dear=%p\n", cpuhw->lsctl.es,
-			    cpuhw->lsctl.cs, cpuhw->lsctl.ed, cpuhw->lsctl.cd,
+			    "interval:%lx tear=%p dear=%p\n",
+			    cpuhw->lsctl.es, cpuhw->lsctl.cs, cpuhw->lsctl.ed,
+			    cpuhw->lsctl.cd, cpuhw->lsctl.interval,
 			    (void *) cpuhw->lsctl.tear,
 			    (void *) cpuhw->lsctl.dear);
 }
@@ -1717,6 +1762,44 @@
 	/* Nothing to do ... updates are interrupt-driven */
 }
 
+/* Check if the new sampling period/frequency is appropriate.
+ *
+ * Return non-zero on error and zero on passed checks.
+ */
+static int cpumsf_pmu_check_period(struct perf_event *event, u64 value)
+{
+	struct hws_qsi_info_block si;
+	unsigned long rate;
+	bool do_freq;
+
+	memset(&si, 0, sizeof(si));
+	if (event->cpu == -1) {
+		if (qsi(&si))
+			return -ENODEV;
+	} else {
+		/* Event is pinned to a particular CPU, retrieve the per-CPU
+		 * sampling structure for accessing the CPU-specific QSI.
+		 */
+		struct cpu_hw_sf *cpuhw = &per_cpu(cpu_hw_sf, event->cpu);
+
+		si = cpuhw->qsi;
+	}
+
+	do_freq = !!SAMPLE_FREQ_MODE(&event->hw);
+	rate = getrate(do_freq, value, &si);
+	if (!rate)
+		return -EINVAL;
+
+	event->attr.sample_period = rate;
+	SAMPL_RATE(&event->hw) = rate;
+	hw_init_period(&event->hw, SAMPL_RATE(&event->hw));
+	debug_sprintf_event(sfdbg, 4, "cpumsf_pmu_check_period:"
+			    "cpu:%d value:%llx period:%llx freq:%d\n",
+			    event->cpu, value,
+			    event->attr.sample_period, do_freq);
+	return 0;
+}
+
 /* Activate sampling control.
  * Next call of pmu_enable() starts sampling.
  */
@@ -1908,6 +1991,8 @@
 
 	.setup_aux    = aux_buffer_setup,
 	.free_aux     = aux_buffer_free,
+
+	.check_period = cpumsf_pmu_check_period,
 };
 
 static void cpumf_measurement_alert(struct ext_code ext_code,
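Two usage notes on the rate handling above. First, a worked instance of
the interval formula (the cpu_speed value is illustrative; it is the
value reported in the QSI block and consumed by freq_to_sample_rate()):
with cpu_speed = 5000 and sample_freq = 4000 Hz,

	interval = 5000 * 1000000 / 4000 = 1250000

Second, the new ->check_period callback is what lets the period be
changed on a live event; from userspace that corresponds to (a sketch,
assuming an already-open perf event fd):

	__u64 new_period = 40000;

	if (ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &new_period) < 0)
		err(1, "PERF_EVENT_IOC_PERIOD");

cpumsf_pmu_check_period() re-limits the value through getrate(), and
the next pmu_enable() reloads it into lsctl.interval.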
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 2db6fb4..3627953 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -311,7 +311,8 @@
 	on_each_cpu(__arch_update_dedicated_flag, NULL, 0);
 	for_each_online_cpu(cpu) {
 		dev = get_cpu_device(cpu);
-		kobject_uevent(&dev->kobj, KOBJ_CHANGE);
+		if (dev)
+			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
 	}
 	return rc;
 }
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 54fcdf6..3dd253f 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -210,7 +210,7 @@
 	page = alloc_page(GFP_KERNEL);
 	if (!page)
 		return NULL;
-	if (!pgtable_page_ctor(page)) {
+	if (!pgtable_pte_page_ctor(page)) {
 		__free_page(page);
 		return NULL;
 	}
@@ -256,7 +256,7 @@
 		atomic_xor_bits(&page->_refcount, 3U << 24);
 	}
 
-	pgtable_page_dtor(page);
+	pgtable_pte_page_dtor(page);
 	__free_page(page);
 }
 
@@ -308,7 +308,7 @@
 	case 3:		/* 4K page table with pgstes */
 		if (mask & 3)
 			atomic_xor_bits(&page->_refcount, 3 << 24);
-		pgtable_page_dtor(page);
+		pgtable_pte_page_dtor(page);
 		__free_page(page);
 		break;
 	}
diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h
index b56f908..22d968b 100644
--- a/arch/sh/include/asm/pgalloc.h
+++ b/arch/sh/include/asm/pgalloc.h
@@ -2,10 +2,8 @@
 #ifndef __ASM_SH_PGALLOC_H
 #define __ASM_SH_PGALLOC_H
 
-#include <linux/quicklist.h>
 #include <asm/page.h>
-
-#define QUICK_PT 0	/* Other page table pages that are zero on free */
+#include <asm-generic/pgalloc.h>
 
 extern pgd_t *pgd_alloc(struct mm_struct *);
 extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
@@ -29,44 +27,9 @@
 }
 #define pmd_pgtable(pmd) pmd_page(pmd)
 
-/*
- * Allocate and free page tables.
- */
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
-	return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL);
-}
-
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
-{
-	struct page *page;
-	void *pg;
-
-	pg = quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL);
-	if (!pg)
-		return NULL;
-	page = virt_to_page(pg);
-	if (!pgtable_page_ctor(page)) {
-		quicklist_free(QUICK_PT, NULL, pg);
-		return NULL;
-	}
-	return page;
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-	quicklist_free(QUICK_PT, NULL, pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
-	pgtable_page_dtor(pte);
-	quicklist_free_page(QUICK_PT, NULL, pte);
-}
-
 #define __pte_free_tlb(tlb,pte,addr)			\
 do {							\
-	pgtable_page_dtor(pte);				\
+	pgtable_pte_page_dtor(pte);			\
 	tlb_remove_page((tlb), (pte));			\
 } while (0)
 
@@ -79,9 +42,4 @@
 } while (0);
 #endif
 
-static inline void check_pgt_cache(void)
-{
-	quicklist_trim(QUICK_PT, NULL, 25, 16);
-}
-
 #endif /* __ASM_SH_PGALLOC_H */
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index 9085d11..cbd0f3c 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -123,11 +123,6 @@
 
 #define pte_pfn(x)		((unsigned long)(((x).pte_low >> PAGE_SHIFT)))
 
-/*
- * Initialise the page table caches
- */
-extern void pgtable_cache_init(void);
-
 struct vm_area_struct;
 struct mm_struct;
 
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index 02ed2df..5c8a2eb 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -1,9 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 menu "Memory management options"
 
-config QUICKLIST
-	def_bool y
-
 config MMU
         bool "Support for memory management hardware"
 	depends on !CPU_SH2
diff --git a/arch/sh/mm/nommu.c b/arch/sh/mm/nommu.c
index cc779a9..dca946f 100644
--- a/arch/sh/mm/nommu.c
+++ b/arch/sh/mm/nommu.c
@@ -97,7 +97,3 @@
 void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
 {
 }
-
-void pgtable_cache_init(void)
-{
-}
diff --git a/arch/sparc/include/asm/pci.h b/arch/sparc/include/asm/pci.h
index cfec79b..4deddf4 100644
--- a/arch/sparc/include/asm/pci.h
+++ b/arch/sparc/include/asm/pci.h
@@ -38,8 +38,6 @@
 #define arch_can_pci_mmap_io()	1
 #define HAVE_ARCH_PCI_GET_UNMAPPED_AREA
 #define get_pci_unmapped_area get_fb_unmapped_area
-
-#define HAVE_ARCH_PCI_RESOURCE_TO_USER
 #endif /* CONFIG_SPARC64 */
 
 #if defined(CONFIG_SPARC64) || defined(CONFIG_LEON_PCI)
diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h
index 282be50..10538a4 100644
--- a/arch/sparc/include/asm/pgalloc_32.h
+++ b/arch/sparc/include/asm/pgalloc_32.h
@@ -17,8 +17,6 @@
 
 extern struct resource sparc_iomap;
 
-#define check_pgt_cache()	do { } while (0)
-
 pgd_t *get_pgd_fast(void);
 static inline void free_pgd_fast(pgd_t *pgd)
 {
diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h
index 48abccb..9d3e5cc 100644
--- a/arch/sparc/include/asm/pgalloc_64.h
+++ b/arch/sparc/include/asm/pgalloc_64.h
@@ -69,8 +69,6 @@
 #define pmd_populate(MM, PMD, PTE)		pmd_set(MM, PMD, PTE)
 #define pmd_pgtable(PMD)			((pte_t *)__pmd_page(PMD))
 
-#define check_pgt_cache()	do { } while (0)
-
 void pgtable_free(void *table, bool is_page);
 
 #ifdef CONFIG_SMP
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index 4eebed6..31da448 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -445,9 +445,4 @@
 /* We provide our own get_unmapped_area to cope with VA holes for userland */
 #define HAVE_ARCH_UNMAPPED_AREA
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 #endif /* !(_SPARC_PGTABLE_H) */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 1599de7..6ae8016 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -1078,7 +1078,7 @@
 }
 #define io_remap_pfn_range io_remap_pfn_range 
 
-static inline unsigned long untagged_addr(unsigned long start)
+static inline unsigned long __untagged_addr(unsigned long start)
 {
 	if (adi_capable()) {
 		long addr = start;
@@ -1098,7 +1098,8 @@
 
 	return start;
 }
-#define untagged_addr untagged_addr
+#define untagged_addr(addr) \
+	((__typeof__(addr))(__untagged_addr((unsigned long)(addr))))
 
 static inline bool pte_access_permitted(pte_t pte, bool write)
 {
@@ -1135,7 +1136,6 @@
 				   unsigned long);
 #define HAVE_ARCH_FB_UNMAPPED_AREA
 
-void pgtable_cache_init(void);
 void sun4v_register_fault_status(void);
 void sun4v_ktsb_register(void);
 void __init cheetah_ecache_flush_init(void);
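The untagged_addr() change is the usual type-preserving macro trick:
the helper works on unsigned long while the macro casts the result back
to the caller's type, so pointer arguments stay pointers. A standalone
sketch of the pattern (the stub merely clears a top-byte tag and is not
the real ADI logic):

	#include <stdio.h>

	/* Stub standing in for __untagged_addr() above */
	static unsigned long __untagged_addr(unsigned long start)
	{
		return start & ~(0xffUL << 56);
	}

	/* Cast the result back to the argument's own type */
	#define untagged_addr(addr) \
		((__typeof__(addr))(__untagged_addr((unsigned long)(addr))))

	int main(void)
	{
		char *tagged = (char *)0xab000000deadbeefUL;
		char *plain = untagged_addr(tagged);	/* no cast needed */

		printf("%p\n", (void *)plain);
		return 0;
	}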
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index 046ab11..906eda11 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -31,7 +31,6 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/vaddrs.h>
-#include <asm/pgalloc.h>	/* bug in asm-generic/tlb.h: check_pgt_cache */
 #include <asm/setup.h>
 #include <asm/tlb.h>
 #include <asm/prom.h>
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 4b099dd..e6d9181 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2903,7 +2903,7 @@
 	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page)
 		return NULL;
-	if (!pgtable_page_ctor(page)) {
+	if (!pgtable_pte_page_ctor(page)) {
 		free_unref_page(page);
 		return NULL;
 	}
@@ -2919,7 +2919,7 @@
 {
 	struct page *page = virt_to_page(pte);
 
-	pgtable_page_dtor(page);
+	pgtable_pte_page_dtor(page);
 	__free_page(page);
 }
 
diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index aaebbc0..cc3ad64 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -378,7 +378,7 @@
 	if ((pte = (unsigned long)pte_alloc_one_kernel(mm)) == 0)
 		return NULL;
 	page = pfn_to_page(__nocache_pa(pte) >> PAGE_SHIFT);
-	if (!pgtable_page_ctor(page)) {
+	if (!pgtable_pte_page_ctor(page)) {
 		__free_page(page);
 		return NULL;
 	}
@@ -389,7 +389,7 @@
 {
 	unsigned long p;
 
-	pgtable_page_dtor(pte);
+	pgtable_pte_page_dtor(pte);
 	p = (unsigned long)page_address(pte);	/* Cached address (for test) */
 	if (p == 0)
 		BUG();
diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h
index 023599c..881e76d 100644
--- a/arch/um/include/asm/pgalloc.h
+++ b/arch/um/include/asm/pgalloc.h
@@ -29,7 +29,7 @@
 
 #define __pte_free_tlb(tlb,pte, address)		\
 do {							\
-	pgtable_page_dtor(pte);				\
+	pgtable_pte_page_dtor(pte);			\
 	tlb_remove_page((tlb),(pte));			\
 } while (0)
 
@@ -43,7 +43,5 @@
 #define __pmd_free_tlb(tlb,x, address)   tlb_remove_page((tlb),virt_to_page(x))
 #endif
 
-#define check_pgt_cache()	do { } while (0)
-
 #endif
 
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index e4d3ed9..36a44d5 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h
@@ -32,8 +32,6 @@
 /* zero page used for uninitialized stuff */
 extern unsigned long *empty_zero_page;
 
-#define pgtable_cache_init() do ; while (0)
-
 /* Just any arbitrary offset to the start of the vmalloc VM area: the
  * current 8MB value just means that there will be a 8MB "hole" after the
  * physical memory until the kernel virtual memory starts.  That means that
diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h
index 3f0903b..ba1c9a7 100644
--- a/arch/unicore32/include/asm/pgalloc.h
+++ b/arch/unicore32/include/asm/pgalloc.h
@@ -18,8 +18,6 @@
 #define __HAVE_ARCH_PTE_ALLOC_ONE
 #include <asm-generic/pgalloc.h>
 
-#define check_pgt_cache()		do { } while (0)
-
 #define _PAGE_USER_TABLE	(PMD_TYPE_TABLE | PMD_PRESENT)
 #define _PAGE_KERNEL_TABLE	(PMD_TYPE_TABLE | PMD_PRESENT)
 
diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h
index 126e961..c8f7ba1 100644
--- a/arch/unicore32/include/asm/pgtable.h
+++ b/arch/unicore32/include/asm/pgtable.h
@@ -285,8 +285,6 @@
 
 #include <asm-generic/pgtable.h>
 
-#define pgtable_cache_init() do { } while (0)
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __UNICORE_PGTABLE_H__ */
diff --git a/arch/unicore32/include/asm/tlb.h b/arch/unicore32/include/asm/tlb.h
index 10d2356..4663d8c 100644
--- a/arch/unicore32/include/asm/tlb.h
+++ b/arch/unicore32/include/asm/tlb.h
@@ -15,7 +15,7 @@
 
 #define __pte_free_tlb(tlb, pte, addr)				\
 	do {							\
-		pgtable_page_dtor(pte);				\
+		pgtable_pte_page_dtor(pte);			\
 		tlb_remove_page((tlb), (pte));			\
 	} while (0)
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 37ed5f5..d6e1faa 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2031,20 +2031,30 @@
 config ARCH_HAS_KEXEC_PURGATORY
 	def_bool KEXEC_FILE
 
-config KEXEC_VERIFY_SIG
+config KEXEC_SIG
 	bool "Verify kernel signature during kexec_file_load() syscall"
 	depends on KEXEC_FILE
 	---help---
+
+	  This option makes the kexec_file_load() syscall check for a valid
+	  signature of the kernel image.  The image can still be loaded without
+	  a valid signature unless you also enable KEXEC_SIG_FORCE, though if
+	  there's a signature that we can check, then it must be valid.
+
+	  In addition to this option, you need to enable signature
+	  verification for the corresponding kernel image type being
+	  loaded in order for this to work.
+
+config KEXEC_SIG_FORCE
+	bool "Require a valid signature in kexec_file_load() syscall"
+	depends on KEXEC_SIG
+	---help---
 	  This option makes kernel signature verification mandatory for
 	  the kexec_file_load() syscall.
 
-	  In addition to that option, you need to enable signature
-	  verification for the corresponding kernel image type being
-	  loaded in order for this to work.
-
 config KEXEC_BZIMAGE_VERIFY_SIG
 	bool "Enable bzImage signature verification support"
-	depends on KEXEC_VERIFY_SIG
+	depends on KEXEC_SIG
 	depends on SIGNED_PE_FILE_VERIFICATION
 	select SYSTEM_TRUSTED_KEYRING
 	---help---
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index 15255f3..149795c 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -26,7 +26,7 @@
  */
 #define MAX_ADDR_LEN 19
 
-static acpi_physical_address get_acpi_rsdp(void)
+static acpi_physical_address get_cmdline_acpi_rsdp(void)
 {
 	acpi_physical_address addr = 0;
 
@@ -278,10 +278,7 @@
 {
 	acpi_physical_address pa;
 
-	pa = get_acpi_rsdp();
-
-	if (!pa)
-		pa = boot_params->acpi_rsdp_addr;
+	pa = boot_params->acpi_rsdp_addr;
 
 	/*
 	 * Try to get EFI data from setup_data. This can happen when we're a
@@ -311,7 +308,17 @@
 	char arg[10];
 	u8 *entry;
 
-	rsdp = (struct acpi_table_rsdp *)(long)boot_params->acpi_rsdp_addr;
+	/*
+	 * Check whether we were given an RSDP on the command line. We don't
+	 * stash this in boot params because the kernel itself may have
+	 * different ideas about whether to trust a command-line parameter.
+	 */
+	rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp();
+
+	if (!rsdp)
+		rsdp = (struct acpi_table_rsdp *)(long)
+			boot_params->acpi_rsdp_addr;
+
 	if (!rsdp)
 		return 0;
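The practical effect is on boots using the acpi_rsdp= command-line
override (mainly kdump/EFI setups): the override is now consumed right
where the RSDP is needed instead of being stashed in boot_params, e.g.
(address illustrative):

	acpi_rsdp=0x7fe08000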
 
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index aac686e..bc9693c 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -117,6 +117,12 @@
 	return !!acpi_lapic;
 }
 
+#define ACPI_HAVE_ARCH_SET_ROOT_POINTER
+static inline void acpi_arch_set_root_pointer(u64 addr)
+{
+	x86_init.acpi.set_root_pointer(addr);
+}
+
 #define ACPI_HAVE_ARCH_GET_ROOT_POINTER
 static inline u64 acpi_arch_get_root_pointer(void)
 {
@@ -125,6 +131,7 @@
 
 void acpi_generic_reduced_hw_init(void);
 
+void x86_default_set_root_pointer(u64 addr);
 u64 x86_default_get_root_pointer(void);
 
 #else /* !CONFIG_ACPI */
@@ -138,6 +145,8 @@
 
 static inline void acpi_generic_reduced_hw_init(void) { }
 
+static inline void x86_default_set_root_pointer(u64 addr) { }
+
 static inline u64 x86_default_get_root_pointer(void)
 {
 	return 0;
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 7a27056..7741e21 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -180,7 +180,15 @@
 /* Recommend using enlightened VMCS */
 #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED		BIT(14)
 
+/*
+ * Virtual processor will never share a physical core with another virtual
+ * processor, except for virtual processors that are reported as sibling SMT
+ * threads.
+ */
+#define HV_X64_NO_NONARCH_CORESHARING                  BIT(18)
+
 /* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */
+#define HV_X64_NESTED_DIRECT_FLUSH			BIT(17)
 #define HV_X64_NESTED_GUEST_MAPPING_FLUSH		BIT(18)
 #define HV_X64_NESTED_MSR_BITMAP			BIT(19)
 
@@ -524,14 +532,24 @@
 	__u64 delivery_time;	/* When the message was delivered */
 } __packed;
 
+struct hv_nested_enlightenments_control {
+	struct {
+		__u32 directhypercall:1;
+		__u32 reserved:31;
+	} features;
+	struct {
+		__u32 reserved;
+	} hypercallControls;
+} __packed;
+
 /* Define virtual processor assist page structure. */
 struct hv_vp_assist_page {
 	__u32 apic_assist;
-	__u32 reserved;
-	__u64 vtl_control[2];
-	__u64 nested_enlightenments_control[2];
-	__u32 enlighten_vmentry;
-	__u32 padding;
+	__u32 reserved1;
+	__u64 vtl_control[3];
+	struct hv_nested_enlightenments_control nested_control;
+	__u8 enlighten_vmentry;
+	__u8 reserved2[7];
 	__u64 current_nested_vmcs;
 } __packed;
 
@@ -882,4 +900,7 @@
 	u64 gva_list[];
 } __packed;
 
+struct hv_partition_assist_pg {
+	u32 tlb_lock_count;
+};
 #endif
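Since hv_vp_assist_page is a hypervisor ABI structure, layout changes
like this are worth pinning with a static assertion. A hedged sketch
(the 0x30 offset is computed from the packed fields above, not quoted
from the TLFS):

	BUILD_BUG_ON(offsetof(struct hv_vp_assist_page,
			      current_nested_vmcs) != 0x30);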
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a3a3ec7..23edf56 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -320,6 +320,7 @@
 	struct list_head link;
 	struct hlist_node hash_link;
 	bool unsync;
+	u8 mmu_valid_gen;
 	bool mmio_cached;
 
 	/*
@@ -335,7 +336,6 @@
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
-	unsigned long mmu_valid_gen;
 	DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
@@ -844,6 +844,8 @@
 
 	/* How many vCPUs have VP index != vCPU index */
 	atomic_t num_mismatched_vp_indexes;
+
+	struct hv_partition_assist_pg *hv_pa_pg;
 };
 
 enum kvm_irqchip_mode {
@@ -857,12 +859,13 @@
 	unsigned long n_requested_mmu_pages;
 	unsigned long n_max_mmu_pages;
 	unsigned int indirect_shadow_pages;
-	unsigned long mmu_valid_gen;
+	u8 mmu_valid_gen;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.
 	 */
 	struct list_head active_mmu_pages;
+	struct list_head zapped_obsolete_pages;
 	struct kvm_page_track_notifier_node mmu_sp_tracker;
 	struct kvm_page_track_notifier_head track_notifier_head;
 
@@ -1213,6 +1216,7 @@
 	bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
 
 	bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
+	int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
@@ -1312,18 +1316,42 @@
 
 extern u64 kvm_mce_cap_supported;
 
-enum emulation_result {
-	EMULATE_DONE,         /* no further processing */
-	EMULATE_USER_EXIT,    /* kvm_run ready for userspace exit */
-	EMULATE_FAIL,         /* can't emulate this instruction */
-};
-
+/*
+ * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing
+ *			userspace I/O) to indicate that the emulation context
+ *			should be resued as is, i.e. skip initialization of
+ *			emulation context, instruction fetch and decode.
+ *
+ * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware.
+ *		      Indicates that only select instructions (tagged with
+ *		      EmulateOnUD) should be emulated (to minimize the emulator
+ *		      attack surface).  See also EMULTYPE_TRAP_UD_FORCED.
+ *
+ * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to
+ *		   decode the instruction length.  For use *only* by
+ *		   kvm_x86_ops->skip_emulated_instruction() implementations.
+ *
+ * EMULTYPE_ALLOW_RETRY - Set when the emulator should resume the guest to
+ *			  retry native execution under certain conditions.
+ *
+ * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was
+ *			     triggered by KVM's magic "force emulation" prefix,
+ *			     which is opt in via module param (off by default).
+ *			     Bypasses EmulateOnUD restriction despite emulating
+ *			     due to an intercepted #UD (see EMULTYPE_TRAP_UD).
+ *			     Used to test the full emulator from userspace.
+ *
+ * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
+ *			backdoor emulation, which is opt in via module param.
+ *			VMware backdoor emulation handles select instructions
+ *			and reinjects the #GP for all other cases.
+ */
 #define EMULTYPE_NO_DECODE	    (1 << 0)
 #define EMULTYPE_TRAP_UD	    (1 << 1)
 #define EMULTYPE_SKIP		    (1 << 2)
 #define EMULTYPE_ALLOW_RETRY	    (1 << 3)
-#define EMULTYPE_NO_UD_ON_FAIL	    (1 << 4)
-#define EMULTYPE_VMWARE		    (1 << 5)
+#define EMULTYPE_TRAP_UD_FORCED	    (1 << 4)
+#define EMULTYPE_VMWARE_GP	    (1 << 5)
 int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
 int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
 					void *insn, int insn_len);
@@ -1506,7 +1534,7 @@
 #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
 #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
 
-asmlinkage void __noreturn kvm_spurious_fault(void);
+asmlinkage void kvm_spurious_fault(void);
 
 /*
  * Hardware virtualization extension instructions may fault if a
@@ -1514,24 +1542,14 @@
  * Usually after catching the fault we just panic; during reboot
  * instead the instruction is ignored.
  */
-#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn)		\
+#define __kvm_handle_fault_on_reboot(insn)				\
 	"666: \n\t"							\
 	insn "\n\t"							\
 	"jmp	668f \n\t"						\
 	"667: \n\t"							\
 	"call	kvm_spurious_fault \n\t"				\
 	"668: \n\t"							\
-	".pushsection .fixup, \"ax\" \n\t"				\
-	"700: \n\t"							\
-	cleanup_insn "\n\t"						\
-	"cmpb	$0, kvm_rebooting\n\t"					\
-	"je	667b \n\t"						\
-	"jmp	668b \n\t"						\
-	".popsection \n\t"						\
-	_ASM_EXTABLE(666b, 700b)
-
-#define __kvm_handle_fault_on_reboot(insn)		\
-	____kvm_handle_fault_on_reboot(insn, "")
+	_ASM_EXTABLE(666b, 667b)
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
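
The macro rework above is easier to read next to a concrete call site. Below is a
minimal sketch, not kernel code, of how a faultable VT-x instruction is typically
wrapped (vmx/ops.h does this via its __ex() alias; the helper name here is
hypothetical). On a fault, the exception-table entry now lands directly on the
call to kvm_spurious_fault(), which decides for itself whether to BUG() (see the
x86.c hunk near the end of this series).

	/* Hypothetical helper, a sketch only. */
	static inline void example_vmxoff(void)
	{
		/* Faults during reboot fall through to kvm_spurious_fault(). */
		asm volatile(__kvm_handle_fault_on_reboot("vmxoff"));
	}
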
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index c78da8e..0dca7f7 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -29,8 +29,6 @@
 extern pgd_t initial_page_table[1024];
 extern pmd_t initial_pg_pmd[];
 
-static inline void pgtable_cache_init(void) { }
-static inline void check_pgt_cache(void) { }
 void paging_init(void);
 void sync_initial_page_table(void);
 
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 4990d26..0b6c4042 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -241,9 +241,6 @@
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 
-#define pgtable_cache_init()   do { } while (0)
-#define check_pgt_cache()      do { } while (0)
-
 #define PAGE_AGP    PAGE_KERNEL_NOCACHE
 #define HAVE_PAGE_AGP 1
 
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index dec9c1e..6ece856 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -52,6 +52,7 @@
 	INTERCEPT_MWAIT,
 	INTERCEPT_MWAIT_COND,
 	INTERCEPT_XSETBV,
+	INTERCEPT_RDPRU,
 };
 
 
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index b15e646..1835767 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,7 @@
 #define SECONDARY_EXEC_PT_USE_GPA		0x01000000
 #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC	0x00400000
 #define SECONDARY_EXEC_TSC_SCALING              0x02000000
+#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE	0x04000000
 
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
 #define PIN_BASED_NMI_EXITING                   0x00000008
@@ -110,6 +111,7 @@
 #define VMX_MISC_SAVE_EFER_LMA			0x00000020
 #define VMX_MISC_ACTIVITY_HLT			0x00000040
 #define VMX_MISC_ZERO_LEN_INS			0x40000000
+#define VMX_MISC_MSR_LIST_MULTIPLIER		512
 
 /* VMFUNC functions */
 #define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
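
VMX_MISC_MSR_LIST_MULTIPLIER encodes the SDM rule that bits 27:25 of
IA32_VMX_MISC report a value N such that 512 * (N + 1) is the maximum
recommended number of entries in a VM-entry/VM-exit MSR list. A hedged
restatement of the arithmetic used by the nested.c hunks below; with the common
field value N = 0 this works out to (0 + 1) * 512 = 512 entries:

	/* Sketch of the capacity computation, mirroring
	 * nested_vmx_max_atomic_switch_msrs() further down. */
	u32 max_msrs = (vmx_misc_max_msr(vmx_misc) + 1) *
		       VMX_MISC_MSR_LIST_MULTIPLIER;
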
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index ac09341..1943585 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -134,10 +134,12 @@
 
 /**
  * struct x86_init_acpi - x86 ACPI init functions
+ * @set_root_pointer:		set RSDP address
  * @get_root_pointer:		get RSDP address
  * @reduced_hw_early_init:	hardware reduced platform early init
  */
 struct x86_init_acpi {
+	void (*set_root_pointer)(u64 addr);
 	u64 (*get_root_pointer)(void);
 	void (*reduced_hw_early_init)(void);
 };
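
A minimal usage sketch for the new hook, assuming rsdp_phys holds a physical
RSDP address discovered by some boot path; publishing it through the indirection
keeps callers from writing boot_params directly:

	/* rsdp_phys is a hypothetical variable for illustration. */
	x86_init.acpi.set_root_pointer(rsdp_phys);
	/* Consumers keep using the existing getter: */
	u64 rsdp = x86_init.acpi.get_root_pointer();
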
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index a9731f8..2e8a30f 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -75,6 +75,7 @@
 #define SVM_EXIT_MWAIT         0x08b
 #define SVM_EXIT_MWAIT_COND    0x08c
 #define SVM_EXIT_XSETBV        0x08d
+#define SVM_EXIT_RDPRU         0x08e
 #define SVM_EXIT_NPF           0x400
 #define SVM_EXIT_AVIC_INCOMPLETE_IPI		0x401
 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS	0x402
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index f01950a..3eb8411 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -86,6 +86,8 @@
 #define EXIT_REASON_PML_FULL            62
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XRSTORS             64
+#define EXIT_REASON_UMWAIT              67
+#define EXIT_REASON_TPAUSE              68
 
 #define VMX_EXIT_REASONS \
 	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
@@ -144,7 +146,9 @@
 	{ EXIT_REASON_RDSEED,                "RDSEED" }, \
 	{ EXIT_REASON_PML_FULL,              "PML_FULL" }, \
 	{ EXIT_REASON_XSAVES,                "XSAVES" }, \
-	{ EXIT_REASON_XRSTORS,               "XRSTORS" }
+	{ EXIT_REASON_XRSTORS,               "XRSTORS" }, \
+	{ EXIT_REASON_UMWAIT,                "UMWAIT" }, \
+	{ EXIT_REASON_TPAUSE,                "TPAUSE" }
 
 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL       2
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 17b33ef..04205ce 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1760,6 +1760,11 @@
 	e820__update_table_print();
 }
 
+void x86_default_set_root_pointer(u64 addr)
+{
+	boot_params.acpi_rsdp_addr = addr;
+}
+
 u64 x86_default_get_root_pointer(void)
 {
 	return boot_params.acpi_rsdp_addr;
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
index 32b4dc9..c222f28 100644
--- a/arch/x86/kernel/cpu/umwait.c
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -17,6 +17,12 @@
  */
 static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
 
+u32 get_umwait_control_msr(void)
+{
+	return umwait_control_cached;
+}
+EXPORT_SYMBOL_GPL(get_umwait_control_msr);
+
 /*
  * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by
  * hardware or BIOS before kernel boot.
diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c
index 4c40783..4d4f5d9 100644
--- a/arch/x86/kernel/ima_arch.c
+++ b/arch/x86/kernel/ima_arch.c
@@ -74,9 +74,9 @@
 
 /* secureboot arch rules */
 static const char * const sb_arch_rules[] = {
-#if !IS_ENABLED(CONFIG_KEXEC_VERIFY_SIG)
+#if !IS_ENABLED(CONFIG_KEXEC_SIG)
 	"appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig",
-#endif /* CONFIG_KEXEC_VERIFY_SIG */
+#endif /* CONFIG_KEXEC_SIG */
 	"measure func=KEXEC_KERNEL_CHECK",
 #if !IS_ENABLED(CONFIG_MODULE_SIG)
 	"appraise func=MODULE_CHECK appraise_type=imasig",
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 0fe1c87..61a89d3 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -11,6 +11,7 @@
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/ioport.h>
+#include <linux/security.h>
 #include <linux/smp.h>
 #include <linux/stddef.h>
 #include <linux/slab.h>
@@ -31,7 +32,8 @@
 
 	if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
 		return -EINVAL;
-	if (turn_on && !capable(CAP_SYS_RAWIO))
+	if (turn_on && (!capable(CAP_SYS_RAWIO) ||
+			security_locked_down(LOCKDOWN_IOPORT)))
 		return -EPERM;
 
 	/*
@@ -126,7 +128,8 @@
 		return -EINVAL;
 	/* Trying to gain more privileges? */
 	if (level > old) {
-		if (!capable(CAP_SYS_RAWIO))
+		if (!capable(CAP_SYS_RAWIO) ||
+		    security_locked_down(LOCKDOWN_IOPORT))
 			return -EPERM;
 	}
 	regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 5ebcd02..d2f4e70 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -180,6 +180,7 @@
 	if (efi_enabled(EFI_OLD_MEMMAP))
 		return 0;
 
+	params->secure_boot = boot_params.secure_boot;
 	ei->efi_loader_signature = current_ei->efi_loader_signature;
 	ei->efi_systab = current_ei->efi_systab;
 	ei->efi_systab_hi = current_ei->efi_systab_hi;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 3db2252..1547be3 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -34,6 +34,7 @@
 #include <linux/notifier.h>
 #include <linux/uaccess.h>
 #include <linux/gfp.h>
+#include <linux/security.h>
 
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
@@ -79,6 +80,10 @@
 	int err = 0;
 	ssize_t bytes = 0;
 
+	err = security_locked_down(LOCKDOWN_MSR);
+	if (err)
+		return err;
+
 	if (count % 8)
 		return -EINVAL;	/* Invalid chunk size */
 
@@ -130,6 +135,9 @@
 			err = -EFAULT;
 			break;
 		}
+		err = security_locked_down(LOCKDOWN_MSR);
+		if (err)
+			break;
 		err = wrmsr_safe_regs_on_cpu(cpu, regs);
 		if (err)
 			break;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 1bef687..18a799c 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -95,6 +95,7 @@
 	},
 
 	.acpi = {
+		.set_root_pointer	= x86_default_set_root_pointer,
 		.get_root_pointer	= x86_default_get_root_pointer,
 		.reduced_hw_early_init	= acpi_generic_reduced_hw_init,
 	},
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index dd5985e..6331603 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -304,7 +304,13 @@
 	case 7:
 	case 0xb:
 	case 0xd:
+	case 0xf:
+	case 0x10:
+	case 0x12:
 	case 0x14:
+	case 0x17:
+	case 0x18:
+	case 0x1f:
 	case 0x8000001d:
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		break;
@@ -360,7 +366,7 @@
 		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
 		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
 		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
-		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
+		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
 
 	/* cpuid 7.0.edx*/
 	const u32 kvm_cpuid_7_0_edx_x86_features =
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index fff790a..23ff655 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -23,6 +23,7 @@
 #include "ioapic.h"
 #include "hyperv.h"
 
+#include <linux/cpu.h>
 #include <linux/kvm_host.h>
 #include <linux/highmem.h>
 #include <linux/sched/cputime.h>
@@ -645,7 +646,9 @@
 		.vector = stimer->config.apic_vector
 	};
 
-	return !kvm_apic_set_irq(vcpu, &irq, NULL);
+	if (lapic_in_kernel(vcpu))
+		return !kvm_apic_set_irq(vcpu, &irq, NULL);
+	return 0;
 }
 
 static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
@@ -1852,7 +1855,13 @@
 
 			ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
 			ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
-			ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
+
+			/*
+			 * Direct synthetic timers only make sense with an
+			 * in-kernel LAPIC.
+			 */
+			if (lapic_in_kernel(vcpu))
+				ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
 
 			break;
 
@@ -1864,7 +1873,8 @@
 			ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED;
 			if (evmcs_ver)
 				ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
-
+			if (!cpu_smt_possible())
+				ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
 			/*
 			 * Default number of spinlock retry attempts, matches
 			 * HyperV 2016.
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 8675458..3a3a685 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -65,7 +65,9 @@
 #define APIC_BROADCAST			0xFF
 #define X2APIC_BROADCAST		0xFFFFFFFFul
 
-#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
+static bool lapic_timer_advance_dynamic __read_mostly;
+#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100
+#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 5000
 #define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
 /* step-by-step approximation to mitigate fluctuation */
 #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
@@ -1485,26 +1487,25 @@
 	u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
 	u64 ns;
 
+	/* Do not adjust for tiny fluctuations or large random spikes. */
+	if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX ||
+	    abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN)
+		return;
+
 	/* too early */
 	if (advance_expire_delta < 0) {
 		ns = -advance_expire_delta * 1000000ULL;
 		do_div(ns, vcpu->arch.virtual_tsc_khz);
-		timer_advance_ns -= min((u32)ns,
-			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+		timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
 	} else {
 	/* too late */
 		ns = advance_expire_delta * 1000000ULL;
 		do_div(ns, vcpu->arch.virtual_tsc_khz);
-		timer_advance_ns += min((u32)ns,
-			timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+		timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
 	}
 
-	if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
-		apic->lapic_timer.timer_advance_adjust_done = true;
-	if (unlikely(timer_advance_ns > 5000)) {
+	if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_ADJUST_MAX))
 		timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
-		apic->lapic_timer.timer_advance_adjust_done = false;
-	}
 	apic->lapic_timer.timer_advance_ns = timer_advance_ns;
 }
 
@@ -1524,7 +1525,7 @@
 	if (guest_tsc < tsc_deadline)
 		__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
 
-	if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
+	if (lapic_timer_advance_dynamic)
 		adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
 }
 
@@ -2302,13 +2303,12 @@
 	apic->lapic_timer.timer.function = apic_timer_fn;
 	if (timer_advance_ns == -1) {
 		apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
-		apic->lapic_timer.timer_advance_adjust_done = false;
+		lapic_timer_advance_dynamic = true;
 	} else {
 		apic->lapic_timer.timer_advance_ns = timer_advance_ns;
-		apic->lapic_timer.timer_advance_adjust_done = true;
+		lapic_timer_advance_dynamic = false;
 	}
 
-
 	/*
 	 * APIC is created enabled. This will prevent kvm_lapic_set_base from
 	 * thinking that APIC state has changed.
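
To make the new clamping concrete, a worked example with hypothetical numbers
(virtual_tsc_khz = 2,000,000, i.e. a 2 GHz guest TSC); note the min/max filter
operates on the delta in TSC cycles, before the conversion to nanoseconds:

	delta = +1600 cycles        passes the filter (100 < |delta| < 5000)
	ns    = 1600 * 1000000 / 2000000 = 800
	timer_advance_ns += 800 / 8 = +100 ns

	delta = +40 cycles          ignored as jitter  (|delta| < 100)
	delta = +20000000 cycles    ignored as a spike (|delta| > 5000)
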
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 50053d2..2aad7e2 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -35,7 +35,6 @@
 	s64 advance_expire_delta;
 	atomic_t pending;			/* accumulated triggered timers */
 	bool hv_timer_in_use;
-	bool timer_advance_adjust_done;
 };
 
 struct kvm_lapic {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a10af9c..5269aa0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -403,8 +403,6 @@
 	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
 		<< shadow_nonpresent_or_rsvd_mask_len;
 
-	page_header(__pa(sptep))->mmio_cached = true;
-
 	trace_mark_mmio_spte(sptep, gfn, access, gen);
 	mmu_spte_set(sptep, mask);
 }
@@ -2103,6 +2101,7 @@
 	 * depends on valid pages being added to the head of the list.  See
 	 * comments in kvm_zap_obsolete_pages().
 	 */
+	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
 	return sp;
@@ -2252,7 +2251,7 @@
 #define for_each_valid_sp(_kvm, _sp, _gfn)				\
 	hlist_for_each_entry(_sp,					\
 	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
-		if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) {    \
+		if (is_obsolete_sp((_kvm), (_sp))) {			\
 		} else
 
 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
@@ -2311,7 +2310,8 @@
 
 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+	return sp->role.invalid ||
+	       unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
 }
 
 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -2538,7 +2538,6 @@
 		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
 			flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
 	}
-	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
 	clear_page(sp->spt);
 	trace_kvm_mmu_get_page(sp, true);
 
@@ -2753,7 +2752,12 @@
 	} else {
 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 
-		if (!sp->role.invalid)
+		/*
+		 * Obsolete pages cannot be used by any vCPU, see the comment
+		 * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
+		 * treats invalid shadow pages as being obsolete.
+		 */
+		if (!is_obsolete_sp(kvm, sp))
 			kvm_reload_remote_mmus(kvm);
 	}
 
@@ -5383,7 +5387,6 @@
 		       void *insn, int insn_len)
 {
 	int r, emulation_type = 0;
-	enum emulation_result er;
 	bool direct = vcpu->arch.mmu->direct_map;
 
 	/* With shadow page tables, fault_address contains a GVA or nGPA.  */
@@ -5450,19 +5453,8 @@
 			return 1;
 	}
 
-	er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
-
-	switch (er) {
-	case EMULATE_DONE:
-		return 1;
-	case EMULATE_USER_EXIT:
-		++vcpu->stat.mmio_exits;
-		/* fall through */
-	case EMULATE_FAIL:
-		return 0;
-	default:
-		BUG();
-	}
+	return x86_emulate_instruction(vcpu, cr2, emulation_type, insn,
+				       insn_len);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
@@ -5684,12 +5676,11 @@
 	return ret;
 }
 
-
+#define BATCH_ZAP_PAGES	10
 static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
-	LIST_HEAD(invalid_list);
-	int ign;
+	int nr_zapped, batch = 0;
 
 restart:
 	list_for_each_entry_safe_reverse(sp, node,
@@ -5702,46 +5693,39 @@
 			break;
 
 		/*
-		 * Do not repeatedly zap a root page to avoid unnecessary
-		 * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
-		 * progress:
-		 *    vcpu 0                        vcpu 1
-		 *                         call vcpu_enter_guest():
-		 *                            1): handle KVM_REQ_MMU_RELOAD
-		 *                                and require mmu-lock to
-		 *                                load mmu
-		 * repeat:
-		 *    1): zap root page and
-		 *        send KVM_REQ_MMU_RELOAD
-		 *
-		 *    2): if (cond_resched_lock(mmu-lock))
-		 *
-		 *                            2): hold mmu-lock and load mmu
-		 *
-		 *                            3): see KVM_REQ_MMU_RELOAD bit
-		 *                                on vcpu->requests is set
-		 *                                then return 1 to call
-		 *                                vcpu_enter_guest() again.
-		 *            goto repeat;
-		 *
-		 * Since we are reversely walking the list and the invalid
-		 * list will be moved to the head, skip the invalid page
-		 * can help us to avoid the infinity list walking.
+		 * Skip invalid pages with a non-zero root count; zapping such
+		 * pages will never succeed, i.e. the page will get thrown back
+		 * on active_mmu_pages and we'll get stuck in an infinite loop.
 		 */
-		if (sp->role.invalid)
+		if (sp->role.invalid && sp->root_count)
 			continue;
 
-		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
-			kvm_mmu_commit_zap_page(kvm, &invalid_list);
-			cond_resched_lock(&kvm->mmu_lock);
+		/*
+		 * No need to flush the TLB since we're only zapping shadow
+		 * pages with an obsolete generation number and all vCPUs have
+		 * loaded a new root, i.e. the shadow pages being zapped cannot
+		 * be in active use by the guest.
+		 */
+		if (batch >= BATCH_ZAP_PAGES &&
+		    cond_resched_lock(&kvm->mmu_lock)) {
+			batch = 0;
 			goto restart;
 		}
 
-		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
+		if (__kvm_mmu_prepare_zap_page(kvm, sp,
+				&kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
+			batch += nr_zapped;
 			goto restart;
+		}
 	}
 
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	/*
+	 * Trigger a remote TLB flush before freeing the page tables to ensure
+	 * KVM is not in the middle of a lockless shadow page table walk, which
+	 * may reference the pages.
+	 */
+	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
 }
 
 /*
@@ -5755,13 +5739,39 @@
  */
 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 {
+	lockdep_assert_held(&kvm->slots_lock);
+
 	spin_lock(&kvm->mmu_lock);
-	kvm->arch.mmu_valid_gen++;
+	trace_kvm_mmu_zap_all_fast(kvm);
+
+	/*
+	 * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
+	 * held for the entire duration of zapping obsolete pages, it's
+	 * impossible for there to be multiple invalid generations associated
+	 * with *valid* shadow pages at any given time, i.e. there is exactly
+	 * one valid generation and (at most) one invalid generation.
+	 */
+	kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
+
+	/*
+	 * Notify all vcpus to reload their shadow page tables and flush
+	 * their TLBs.  All vcpus will then switch to shadow page tables
+	 * tagged with the new mmu_valid_gen.
+	 *
+	 * Note: this must be done under the protection of mmu_lock,
+	 * otherwise a vcpu could purge a shadow page but miss the TLB flush.
+	 */
+	kvm_reload_remote_mmus(kvm);
 
 	kvm_zap_obsolete_pages(kvm);
 	spin_unlock(&kvm->mmu_lock);
 }
 
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
+
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
 			struct kvm_memory_slot *slot,
 			struct kvm_page_track_notifier_node *node)
@@ -5959,7 +5969,7 @@
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 
-static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
+void kvm_mmu_zap_all(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
 	LIST_HEAD(invalid_list);
@@ -5968,14 +5978,10 @@
 	spin_lock(&kvm->mmu_lock);
 restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-		if (mmio_only && !sp->mmio_cached)
-			continue;
 		if (sp->role.invalid && sp->root_count)
 			continue;
-		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
-			WARN_ON_ONCE(mmio_only);
+		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
 			goto restart;
-		}
 		if (cond_resched_lock(&kvm->mmu_lock))
 			goto restart;
 	}
@@ -5984,11 +5990,6 @@
 	spin_unlock(&kvm->mmu_lock);
 }
 
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
-	return __kvm_mmu_zap_all(kvm, false);
-}
-
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 {
 	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
@@ -6010,7 +6011,7 @@
 	 */
 	if (unlikely(gen == 0)) {
 		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
-		__kvm_mmu_zap_all(kvm, true);
+		kvm_mmu_zap_all_fast(kvm);
 	}
 }
 
@@ -6041,16 +6042,24 @@
 		 * want to shrink a VM that only started to populate its MMU
 		 * anyway.
 		 */
-		if (!kvm->arch.n_used_mmu_pages)
+		if (!kvm->arch.n_used_mmu_pages &&
+		    !kvm_has_zapped_obsolete_pages(kvm))
 			continue;
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
 
+		if (kvm_has_zapped_obsolete_pages(kvm)) {
+			kvm_mmu_commit_zap_page(kvm,
+			      &kvm->arch.zapped_obsolete_pages);
+			goto unlock;
+		}
+
 		if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
 			freed++;
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
+unlock:
 		spin_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
 
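A short walkthrough of the single-bit generation scheme may help; the state
below is hypothetical:

	/* slots_lock is held for the whole fast zap, so at most one valid
	 * and one invalid generation exist at any time:
	 *
	 *   before:    kvm->arch.mmu_valid_gen == 0, every sp gen == 0
	 *   fast zap:  mmu_valid_gen toggles to 1
	 *   old sp:    is_obsolete_sp() sees 0 != 1 -> obsolete, zapped
	 *              lazily in batches of BATCH_ZAP_PAGES under mmu_lock
	 *   new sp:    allocated with sp->mmu_valid_gen = 1 -> valid
	 *
	 * Folding role.invalid into is_obsolete_sp() is what lets
	 * for_each_valid_sp() drop its explicit invalid check. */
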
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index d8001b4..7ca8831 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -8,16 +8,18 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
 
-#define KVM_MMU_PAGE_FIELDS \
-	__field(__u64, gfn) \
-	__field(__u32, role) \
-	__field(__u32, root_count) \
+#define KVM_MMU_PAGE_FIELDS		\
+	__field(__u8, mmu_valid_gen)	\
+	__field(__u64, gfn)		\
+	__field(__u32, role)		\
+	__field(__u32, root_count)	\
 	__field(bool, unsync)
 
-#define KVM_MMU_PAGE_ASSIGN(sp)			     \
-	__entry->gfn = sp->gfn;			     \
-	__entry->role = sp->role.word;		     \
-	__entry->root_count = sp->root_count;        \
+#define KVM_MMU_PAGE_ASSIGN(sp)				\
+	__entry->mmu_valid_gen = sp->mmu_valid_gen;	\
+	__entry->gfn = sp->gfn;				\
+	__entry->role = sp->role.word;			\
+	__entry->root_count = sp->root_count;		\
 	__entry->unsync = sp->unsync;
 
 #define KVM_MMU_PAGE_PRINTK() ({				        \
@@ -29,8 +31,9 @@
 								        \
 	role.word = __entry->role;					\
 									\
-	trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s"	\
+	trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s"	\
 			 " %snxe %sad root %u %s%c",			\
+			 __entry->mmu_valid_gen,			\
 			 __entry->gfn, role.level,			\
 			 role.gpte_is_8_bytes ? 8 : 4,			\
 			 role.quadrant,					\
@@ -280,6 +283,27 @@
 );
 
 TRACE_EVENT(
+	kvm_mmu_zap_all_fast,
+	TP_PROTO(struct kvm *kvm),
+	TP_ARGS(kvm),
+
+	TP_STRUCT__entry(
+		__field(__u8, mmu_valid_gen)
+		__field(unsigned int, mmu_used_pages)
+	),
+
+	TP_fast_assign(
+		__entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+		__entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+	),
+
+	TP_printk("kvm-mmu-valid-gen %u used_pages %x",
+		  __entry->mmu_valid_gen, __entry->mmu_used_pages
+	)
+);
+
+
+TRACE_EVENT(
 	check_mmio_spte,
 	TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
 	TP_ARGS(spte, kvm_gen, spte_gen),
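
With the extra field, a shadow-page trace line now leads with the generation;
roughly like this (values hypothetical, trailing role flags elided):

	sp gen 1 gfn 7f24 l1 8-byte q0 ... root 0 ...
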
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 04fe218..f8ecb6d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -777,17 +777,18 @@
 		svm->next_rip = svm->vmcb->control.next_rip;
 	}
 
-	if (!svm->next_rip)
-		return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP);
-
-	if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
-		printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
-		       __func__, kvm_rip_read(vcpu), svm->next_rip);
-
-	kvm_rip_write(vcpu, svm->next_rip);
+	if (!svm->next_rip) {
+		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+			return 0;
+	} else {
+		if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
+			pr_err("%s: ip 0x%lx next 0x%llx\n",
+			       __func__, kvm_rip_read(vcpu), svm->next_rip);
+		kvm_rip_write(vcpu, svm->next_rip);
+	}
 	svm_set_interrupt_shadow(vcpu, 0);
 
-	return EMULATE_DONE;
+	return 1;
 }
 
 static void svm_queue_exception(struct kvm_vcpu *vcpu)
@@ -1539,6 +1540,7 @@
 	set_intercept(svm, INTERCEPT_SKINIT);
 	set_intercept(svm, INTERCEPT_WBINVD);
 	set_intercept(svm, INTERCEPT_XSETBV);
+	set_intercept(svm, INTERCEPT_RDPRU);
 	set_intercept(svm, INTERCEPT_RSM);
 
 	if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
@@ -2768,17 +2770,18 @@
 {
 	struct kvm_vcpu *vcpu = &svm->vcpu;
 	u32 error_code = svm->vmcb->control.exit_info_1;
-	int er;
 
 	WARN_ON_ONCE(!enable_vmware_backdoor);
 
-	er = kvm_emulate_instruction(vcpu,
-		EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
-	if (er == EMULATE_USER_EXIT)
-		return 0;
-	else if (er != EMULATE_DONE)
+	/*
+	 * VMware backdoor emulation on #GP interception only handles IN{S},
+	 * OUT{S}, and RDPMC, none of which generate a non-zero error code.
+	 */
+	if (error_code) {
 		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
-	return 1;
+		return 1;
+	}
+	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
 }
 
 static bool is_erratum_383(void)
@@ -2876,7 +2879,7 @@
 	string = (io_info & SVM_IOIO_STR_MASK) != 0;
 	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
 	if (string)
-		return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+		return kvm_emulate_instruction(vcpu, 0);
 
 	port = io_info >> 16;
 	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -3830,6 +3833,12 @@
 	return 1;
 }
 
+static int rdpru_interception(struct vcpu_svm *svm)
+{
+	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+	return 1;
+}
+
 static int task_switch_interception(struct vcpu_svm *svm)
 {
 	u16 tss_selector;
@@ -3883,24 +3892,15 @@
 	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
 	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
 	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
-		if (skip_emulated_instruction(&svm->vcpu) != EMULATE_DONE)
-			goto fail;
+		if (!skip_emulated_instruction(&svm->vcpu))
+			return 0;
 	}
 
 	if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
 		int_vec = -1;
 
-	if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
-				has_error_code, error_code) == EMULATE_FAIL)
-		goto fail;
-
-	return 1;
-
-fail:
-	svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-	svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-	svm->vcpu.run->internal.ndata = 0;
-	return 0;
+	return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+			       has_error_code, error_code);
 }
 
 static int cpuid_interception(struct vcpu_svm *svm)
@@ -3921,7 +3921,7 @@
 static int invlpg_interception(struct vcpu_svm *svm)
 {
 	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
-		return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+		return kvm_emulate_instruction(&svm->vcpu, 0);
 
 	kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
 	return kvm_skip_emulated_instruction(&svm->vcpu);
@@ -3929,13 +3929,12 @@
 
 static int emulate_on_interception(struct vcpu_svm *svm)
 {
-	return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+	return kvm_emulate_instruction(&svm->vcpu, 0);
 }
 
 static int rsm_interception(struct vcpu_svm *svm)
 {
-	return kvm_emulate_instruction_from_buffer(&svm->vcpu,
-					rsm_ins_bytes, 2) == EMULATE_DONE;
+	return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
 }
 
 static int rdpmc_interception(struct vcpu_svm *svm)
@@ -4724,7 +4723,7 @@
 		ret = avic_unaccel_trap_write(svm);
 	} else {
 		/* Handling Fault */
-		ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
+		ret = kvm_emulate_instruction(&svm->vcpu, 0);
 	}
 
 	return ret;
@@ -4791,6 +4790,7 @@
 	[SVM_EXIT_MONITOR]			= monitor_interception,
 	[SVM_EXIT_MWAIT]			= mwait_interception,
 	[SVM_EXIT_XSETBV]			= xsetbv_interception,
+	[SVM_EXIT_RDPRU]			= rdpru_interception,
 	[SVM_EXIT_NPF]				= npf_interception,
 	[SVM_EXIT_RSM]                          = rsm_interception,
 	[SVM_EXIT_AVIC_INCOMPLETE_IPI]		= avic_incomplete_ipi_interception,
@@ -7099,13 +7099,6 @@
 	return ret;
 }
 
-static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
-				   uint16_t *vmcs_version)
-{
-	/* Intel-only feature */
-	return -ENODEV;
-}
-
 static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
 {
 	unsigned long cr4 = kvm_read_cr4(vcpu);
@@ -7311,7 +7304,7 @@
 	.mem_enc_reg_region = svm_register_enc_region,
 	.mem_enc_unreg_region = svm_unregister_enc_region,
 
-	.nested_enable_evmcs = nested_enable_evmcs,
+	.nested_enable_evmcs = NULL,
 	.nested_get_evmcs_version = NULL,
 
 	.need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
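
The conversions above all follow the same return contract that replaces enum
emulation_result throughout this series: exit handlers, and now
skip_emulated_instruction(), return 1 to keep running the guest and 0 to exit to
userspace with the KVM_EXIT_* details already filled in. A hypothetical handler,
as a sketch:

	static int example_interception(struct vcpu_svm *svm)	/* hypothetical */
	{
		if (!skip_emulated_instruction(&svm->vcpu))
			return 0;	/* emulation forced a userspace exit */
		return 1;		/* resume the guest */
	}
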
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index d6664ee..7aa6971 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -247,6 +247,12 @@
 		SECONDARY_EXEC_XSAVES;
 }
 
+static inline bool vmx_waitpkg_supported(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+}
+
 static inline bool cpu_has_vmx_tsc_scaling(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 39a24ee..07ebf68 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -178,6 +178,8 @@
 	struct hv_vp_assist_page *vp_ap =
 		hv_get_vp_assist_page(smp_processor_id());
 
+	if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
+		vp_ap->nested_control.features.directhypercall = 1;
 	vp_ap->current_nested_vmcs = phys_addr;
 	vp_ap->enlighten_vmentry = 1;
 }
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1a10cd3..41abc62 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -198,6 +198,16 @@
 	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
 }
 
+static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
+{
+	return fixed_bits_valid(control, low, high);
+}
+
+static inline u64 vmx_control_msr(u32 low, u32 high)
+{
+	return low | ((u64)high << 32);
+}
+
 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
 {
 	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
@@ -866,16 +876,34 @@
 	return 0;
 }
 
+static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
+				       vmx->nested.msrs.misc_high);
+
+	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
+}
+
 /*
  * Load guest's/host's msr at nested entry/exit.
  * return 0 for success, entry index for failure.
+ *
+ * One of the failure modes for MSR load/store is when a list exceeds the
+ * virtual hardware's capacity. To match hardware behavior as closely as
+ * possible, process all valid entries before failing rather than prechecking
+ * for a capacity violation.
  */
 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 {
 	u32 i;
 	struct vmx_msr_entry e;
+	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
 
 	for (i = 0; i < count; i++) {
+		if (unlikely(i >= max_msr_list_size))
+			goto fail;
+
 		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
 					&e, sizeof(e))) {
 			pr_debug_ratelimited(
@@ -906,8 +934,12 @@
 	u64 data;
 	u32 i;
 	struct vmx_msr_entry e;
+	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
 
 	for (i = 0; i < count; i++) {
+		if (unlikely(i >= max_msr_list_size))
+			return -EINVAL;
+
 		if (kvm_vcpu_read_guest(vcpu,
 					gpa + i * sizeof(e),
 					&e, 2 * sizeof(u32))) {
@@ -1013,17 +1045,6 @@
 	return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
 }
 
-
-static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
-{
-	return fixed_bits_valid(control, low, high);
-}
-
-static inline u64 vmx_control_msr(u32 low, u32 high)
-{
-	return low | ((u64)high << 32);
-}
-
 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
 {
 	superset &= mask;
@@ -2089,6 +2110,7 @@
 				  SECONDARY_EXEC_ENABLE_INVPCID |
 				  SECONDARY_EXEC_RDTSCP |
 				  SECONDARY_EXEC_XSAVES |
+				  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
 				  SECONDARY_EXEC_ENABLE_VMFUNC);
@@ -2642,8 +2664,23 @@
 	    CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
 		return -EINVAL;
 
-	ia32e = (vmcs12->vm_exit_controls &
-		 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+#ifdef CONFIG_X86_64
+	ia32e = !!(vcpu->arch.efer & EFER_LMA);
+#else
+	ia32e = false;
+#endif
+
+	if (ia32e) {
+		if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
+		    CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
+			return -EINVAL;
+	} else {
+		if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
+		    CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
+		    CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
+		    CC((vmcs12->host_rip) >> 32))
+			return -EINVAL;
+	}
 
 	if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
 	    CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
@@ -2662,7 +2699,8 @@
 	    CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
 	    CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
 	    CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
-	    CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)))
+	    CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
+	    CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
 		return -EINVAL;
 #endif
 
@@ -5441,6 +5479,10 @@
 	case EXIT_REASON_ENCLS:
 		/* SGX is never exposed to L1 */
 		return false;
+	case EXIT_REASON_UMWAIT:
+	case EXIT_REASON_TPAUSE:
+		return nested_cpu_has2(vmcs12,
+			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
 	default:
 		return true;
 	}
diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h
index 2200fb6..45eaede 100644
--- a/arch/x86/kvm/vmx/ops.h
+++ b/arch/x86/kvm/vmx/ops.h
@@ -11,8 +11,13 @@
 #include "vmcs.h"
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
-#define __ex_clear(x, reg) \
-	____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg)
+
+asmlinkage void vmread_error(unsigned long field, bool fault);
+void vmwrite_error(unsigned long field, unsigned long value);
+void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
+void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
+void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
+void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
 
 static __always_inline void vmcs_check16(unsigned long field)
 {
@@ -62,8 +67,22 @@
 {
 	unsigned long value;
 
-	asm volatile (__ex_clear("vmread %1, %0", "%k0")
-		      : "=r"(value) : "r"(field));
+	asm volatile("1: vmread %2, %1\n\t"
+		     ".byte 0x3e\n\t" /* branch taken hint */
+		     "ja 3f\n\t"
+		     "mov %2, %%" _ASM_ARG1 "\n\t"
+		     "xor %%" _ASM_ARG2 ", %%" _ASM_ARG2 "\n\t"
+		     "2: call vmread_error\n\t"
+		     "xor %k1, %k1\n\t"
+		     "3:\n\t"
+
+		     ".pushsection .fixup, \"ax\"\n\t"
+		     "4: mov %2, %%" _ASM_ARG1 "\n\t"
+		     "mov $1, %%" _ASM_ARG2 "\n\t"
+		     "jmp 2b\n\t"
+		     ".popsection\n\t"
+		     _ASM_EXTABLE(1b, 4b)
+		     : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc");
 	return value;
 }
 
@@ -103,21 +122,39 @@
 	return __vmcs_readl(field);
 }
 
-static noinline void vmwrite_error(unsigned long field, unsigned long value)
-{
-	printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
-	       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
-	dump_stack();
-}
+#define vmx_asm1(insn, op1, error_args...)				\
+do {									\
+	asm_volatile_goto("1: " __stringify(insn) " %0\n\t"		\
+			  ".byte 0x2e\n\t" /* branch not taken hint */	\
+			  "jna %l[error]\n\t"				\
+			  _ASM_EXTABLE(1b, %l[fault])			\
+			  : : op1 : "cc" : error, fault);		\
+	return;								\
+error:									\
+	insn##_error(error_args);					\
+	return;								\
+fault:									\
+	kvm_spurious_fault();						\
+} while (0)
+
+#define vmx_asm2(insn, op1, op2, error_args...)				\
+do {									\
+	asm_volatile_goto("1: "  __stringify(insn) " %1, %0\n\t"	\
+			  ".byte 0x2e\n\t" /* branch not taken hint */	\
+			  "jna %l[error]\n\t"				\
+			  _ASM_EXTABLE(1b, %l[fault])			\
+			  : : op1, op2 : "cc" : error, fault);		\
+	return;								\
+error:									\
+	insn##_error(error_args);					\
+	return;								\
+fault:									\
+	kvm_spurious_fault();						\
+} while (0)
 
 static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
 {
-	bool error;
-
-	asm volatile (__ex("vmwrite %2, %1") CC_SET(na)
-		      : CC_OUT(na) (error) : "r"(field), "rm"(value));
-	if (unlikely(error))
-		vmwrite_error(field, value);
+	vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value);
 }
 
 static __always_inline void vmcs_write16(unsigned long field, u16 value)
@@ -182,28 +219,18 @@
 static inline void vmcs_clear(struct vmcs *vmcs)
 {
 	u64 phys_addr = __pa(vmcs);
-	bool error;
 
-	asm volatile (__ex("vmclear %1") CC_SET(na)
-		      : CC_OUT(na) (error) : "m"(phys_addr));
-	if (unlikely(error))
-		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
-		       vmcs, phys_addr);
+	vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr);
 }
 
 static inline void vmcs_load(struct vmcs *vmcs)
 {
 	u64 phys_addr = __pa(vmcs);
-	bool error;
 
 	if (static_branch_unlikely(&enable_evmcs))
 		return evmcs_load(phys_addr);
 
-	asm volatile (__ex("vmptrld %1") CC_SET(na)
-		      : CC_OUT(na) (error) : "m"(phys_addr));
-	if (unlikely(error))
-		printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
-		       vmcs, phys_addr);
+	vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr);
 }
 
 static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
@@ -213,11 +240,8 @@
 		u64 rsvd : 48;
 		u64 gva;
 	} operand = { vpid, 0, gva };
-	bool error;
 
-	asm volatile (__ex("invvpid %2, %1") CC_SET(na)
-		      : CC_OUT(na) (error) : "r"(ext), "m"(operand));
-	BUG_ON(error);
+	vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva);
 }
 
 static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
@@ -225,11 +249,8 @@
 	struct {
 		u64 eptp, gpa;
 	} operand = {eptp, gpa};
-	bool error;
 
-	asm volatile (__ex("invept %2, %1") CC_SET(na)
-		      : CC_OUT(na) (error) : "r"(ext), "m"(operand));
-	BUG_ON(error);
+	vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
 }
 
 static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
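
For reference, a sketch of approximately what vmcs_clear() expands to after
vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr): "jna" catches a VMfail
(CF or ZF set by the instruction), while the exception-table entry catches a
hardware fault such as #UD or #GP.

	asm_volatile_goto("1: vmclear %0\n\t"
			  ".byte 0x2e\n\t"	/* branch not taken hint */
			  "jna %l[error]\n\t"
			  _ASM_EXTABLE(1b, %l[fault])
			  : : "m"(phys_addr) : "cc" : error, fault);
	return;
error:
	vmclear_error(vmcs, phys_addr);	/* VM-fail: print and continue */
	return;
fault:
	kvm_spurious_fault();		/* hardware fault via the extable */
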
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4a99be1..d4575ff 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -343,6 +343,48 @@
 
 void vmx_vmexit(void);
 
+#define vmx_insn_failed(fmt...)		\
+do {					\
+	WARN_ONCE(1, fmt);		\
+	pr_warn_ratelimited(fmt);	\
+} while (0)
+
+asmlinkage void vmread_error(unsigned long field, bool fault)
+{
+	if (fault)
+		kvm_spurious_fault();
+	else
+		vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
+}
+
+noinline void vmwrite_error(unsigned long field, unsigned long value)
+{
+	vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
+			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
+noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
+{
+	vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
+}
+
+noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
+{
+	vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
+}
+
+noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
+{
+	vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
+			ext, vpid, gva);
+}
+
+noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
+{
+	vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
+			ext, eptp, gpa);
+}
+
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 /*
@@ -486,6 +528,31 @@
 	return hv_remote_flush_tlb_with_range(kvm, NULL);
 }
 
+static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+{
+	struct hv_enlightened_vmcs *evmcs;
+	struct hv_partition_assist_pg **p_hv_pa_pg =
+			&vcpu->kvm->arch.hyperv.hv_pa_pg;
+	/*
+	 * Synthetic VM-Exit is not enabled in the current code, so all
+	 * eVMCSes within a single VM share the same assist page.
+	 */
+	if (!*p_hv_pa_pg)
+		*p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+
+	if (!*p_hv_pa_pg)
+		return -ENOMEM;
+
+	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
+
+	evmcs->partition_assist_page =
+		__pa(*p_hv_pa_pg);
+	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
+	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
+
+	return 0;
+}
+
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 
 /*
@@ -1472,27 +1539,32 @@
 	return 0;
 }
 
-/*
- * Returns an int to be compatible with SVM implementation (which can fail).
- * Do not use directly, use skip_emulated_instruction() instead.
- */
-static int __skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	unsigned long rip;
 
-	rip = kvm_rip_read(vcpu);
-	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-	kvm_rip_write(vcpu, rip);
+	/*
+	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
+	 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
+	 * set when EPT misconfig occurs.  In practice, real hardware updates
+	 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
+	 * (namely Hyper-V) don't set it due to it being undefined behavior,
+	 * i.e. we end up advancing IP with some random value.
+	 */
+	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+	    to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
+		rip = kvm_rip_read(vcpu);
+		rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+		kvm_rip_write(vcpu, rip);
+	} else {
+		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+			return 0;
+	}
 
 	/* skipping an emulated instruction also counts */
 	vmx_set_interrupt_shadow(vcpu, 0);
 
-	return EMULATE_DONE;
-}
-
-static inline void skip_emulated_instruction(struct kvm_vcpu *vcpu)
-{
-	(void)__skip_emulated_instruction(vcpu);
+	return 1;
 }
 
 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
@@ -1527,8 +1599,7 @@
 		int inc_eip = 0;
 		if (kvm_exception_is_soft(nr))
 			inc_eip = vcpu->arch.event_exit_inst_len;
-		if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
-			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+		kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
 		return;
 	}
 
@@ -1700,6 +1771,12 @@
 #endif
 	case MSR_EFER:
 		return kvm_get_msr_common(vcpu, msr_info);
+	case MSR_IA32_UMWAIT_CONTROL:
+		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
+			return 1;
+
+		msr_info->data = vmx->msr_ia32_umwait_control;
+		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
@@ -1873,6 +1950,16 @@
 			return 1;
 		vmcs_write64(GUEST_BNDCFGS, data);
 		break;
+	case MSR_IA32_UMWAIT_CONTROL:
+		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
+			return 1;
+
+		/* Reserved bit 1 and the upper bits [63:32] must be zero */
+		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
+			return 1;
+
+		vmx->msr_ia32_umwait_control = data;
+		break;
 	case MSR_IA32_SPEC_CTRL:
 		if (!msr_info->host_initiated &&
 		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
@@ -2290,6 +2377,7 @@
 			SECONDARY_EXEC_RDRAND_EXITING |
 			SECONDARY_EXEC_ENABLE_PML |
 			SECONDARY_EXEC_TSC_SCALING |
+			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
 			SECONDARY_EXEC_PT_USE_GPA |
 			SECONDARY_EXEC_PT_CONCEAL_VMX |
 			SECONDARY_EXEC_ENABLE_VMFUNC |
@@ -4026,6 +4114,23 @@
 		}
 	}
 
+	if (vmx_waitpkg_supported()) {
+		bool waitpkg_enabled =
+			guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG);
+
+		if (!waitpkg_enabled)
+			exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+
+		if (nested) {
+			if (waitpkg_enabled)
+				vmx->nested.msrs.secondary_ctls_high |=
+					SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+			else
+				vmx->nested.msrs.secondary_ctls_high &=
+					~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+		}
+	}
+
 	vmx->secondary_exec_control = exec_control;
 }
 
@@ -4160,6 +4265,8 @@
 	vmx->rmode.vm86_active = 0;
 	vmx->spec_ctrl = 0;
 
+	vmx->msr_ia32_umwait_control = 0;
+
 	vcpu->arch.microcode_version = 0x100000000ULL;
 	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
 	vmx->hv_deadline_tsc = -1;
@@ -4277,8 +4384,7 @@
 		int inc_eip = 0;
 		if (vcpu->arch.interrupt.soft)
 			inc_eip = vcpu->arch.event_exit_inst_len;
-		if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
-			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+		kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
 		return;
 	}
 	intr = irq | INTR_INFO_VALID_MASK;
@@ -4314,8 +4420,7 @@
 	vmx->loaded_vmcs->nmi_known_unmasked = false;
 
 	if (vmx->rmode.vm86_active) {
-		if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
-			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+		kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
 		return;
 	}
 
@@ -4442,7 +4547,7 @@
 	 * Cause the #SS fault with 0 error code in VM86 mode.
 	 */
 	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
-		if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
+		if (kvm_emulate_instruction(vcpu, 0)) {
 			if (vcpu->arch.halt_request) {
 				vcpu->arch.halt_request = 0;
 				return kvm_vcpu_halt(vcpu);
@@ -4493,7 +4598,6 @@
 	u32 intr_info, ex_no, error_code;
 	unsigned long cr2, rip, dr6;
 	u32 vect_info;
-	enum emulation_result er;
 
 	vect_info = vmx->idt_vectoring_info;
 	intr_info = vmx->exit_intr_info;
@@ -4510,13 +4614,17 @@
 
 	if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
 		WARN_ON_ONCE(!enable_vmware_backdoor);
-		er = kvm_emulate_instruction(vcpu,
-			EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
-		if (er == EMULATE_USER_EXIT)
-			return 0;
-		else if (er != EMULATE_DONE)
+
+		/*
+		 * VMware backdoor emulation on #GP interception only handles
+		 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
+		 * error code on #GP.
+		 */
+		if (error_code) {
 			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
-		return 1;
+			return 1;
+		}
+		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
 	}
 
 	/*
@@ -4558,7 +4666,7 @@
 			vcpu->arch.dr6 &= ~DR_TRAP_BITS;
 			vcpu->arch.dr6 |= dr6 | DR6_RTM;
 			if (is_icebp(intr_info))
-				skip_emulated_instruction(vcpu);
+				WARN_ON(!skip_emulated_instruction(vcpu));
 
 			kvm_queue_exception(vcpu, DB_VECTOR);
 			return 1;
@@ -4613,7 +4721,7 @@
 	++vcpu->stat.io_exits;
 
 	if (string)
-		return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+		return kvm_emulate_instruction(vcpu, 0);
 
 	port = exit_qualification >> 16;
 	size = (exit_qualification & 7) + 1;
@@ -4687,7 +4795,7 @@
 static int handle_desc(struct kvm_vcpu *vcpu)
 {
 	WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
-	return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+	return kvm_emulate_instruction(vcpu, 0);
 }
 
 static int handle_cr(struct kvm_vcpu *vcpu)
@@ -4903,7 +5011,7 @@
 
 static int handle_invd(struct kvm_vcpu *vcpu)
 {
-	return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+	return kvm_emulate_instruction(vcpu, 0);
 }
 
 static int handle_invlpg(struct kvm_vcpu *vcpu)
@@ -4937,20 +5045,6 @@
 	return 1;
 }
 
-static int handle_xsaves(struct kvm_vcpu *vcpu)
-{
-	kvm_skip_emulated_instruction(vcpu);
-	WARN(1, "this should never happen\n");
-	return 1;
-}
-
-static int handle_xrstors(struct kvm_vcpu *vcpu)
-{
-	kvm_skip_emulated_instruction(vcpu);
-	WARN(1, "this should never happen\n");
-	return 1;
-}
-
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
 	if (likely(fasteoi)) {
@@ -4970,7 +5064,7 @@
 			return kvm_skip_emulated_instruction(vcpu);
 		}
 	}
-	return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+	return kvm_emulate_instruction(vcpu, 0);
 }
 
 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
@@ -5039,23 +5133,15 @@
 	if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
 		       type != INTR_TYPE_EXT_INTR &&
 		       type != INTR_TYPE_NMI_INTR))
-		skip_emulated_instruction(vcpu);
-
-	if (kvm_task_switch(vcpu, tss_selector,
-			    type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
-			    has_error_code, error_code) == EMULATE_FAIL) {
-		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-		vcpu->run->internal.ndata = 0;
-		return 0;
-	}
+		WARN_ON(!skip_emulated_instruction(vcpu));
 
 	/*
 	 * TODO: What about debug traps on tss switch?
 	 *       Are we supposed to inject them and update dr6?
 	 */
-
-	return 1;
+	return kvm_task_switch(vcpu, tss_selector,
+			       type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
+			       reason, has_error_code, error_code);
 }
 
 static int handle_ept_violation(struct kvm_vcpu *vcpu)
@@ -5114,21 +5200,7 @@
 	if (!is_guest_mode(vcpu) &&
 	    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
 		trace_kvm_fast_mmio(gpa);
-		/*
-		 * Doing kvm_skip_emulated_instruction() depends on undefined
-		 * behavior: Intel's manual doesn't mandate
-		 * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG
-		 * occurs and while on real hardware it was observed to be set,
-		 * other hypervisors (namely Hyper-V) don't set it, we end up
-		 * advancing IP with some random value. Disable fast mmio when
-		 * running nested and keep it for real hardware in hope that
-		 * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
-		 */
-		if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
-			return kvm_skip_emulated_instruction(vcpu);
-		else
-			return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
-								EMULATE_DONE;
+		return kvm_skip_emulated_instruction(vcpu);
 	}
 
 	return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
@@ -5147,8 +5219,6 @@
 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	enum emulation_result err = EMULATE_DONE;
-	int ret = 1;
 	bool intr_window_requested;
 	unsigned count = 130;
 
@@ -5169,41 +5239,35 @@
 		if (kvm_test_request(KVM_REQ_EVENT, vcpu))
 			return 1;
 
-		err = kvm_emulate_instruction(vcpu, 0);
-
-		if (err == EMULATE_USER_EXIT) {
-			++vcpu->stat.mmio_exits;
-			ret = 0;
-			goto out;
-		}
-
-		if (err != EMULATE_DONE)
-			goto emulation_error;
+		if (!kvm_emulate_instruction(vcpu, 0))
+			return 0;
 
 		if (vmx->emulation_required && !vmx->rmode.vm86_active &&
-		    vcpu->arch.exception.pending)
-			goto emulation_error;
+		    vcpu->arch.exception.pending) {
+			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+			vcpu->run->internal.suberror =
+						KVM_INTERNAL_ERROR_EMULATION;
+			vcpu->run->internal.ndata = 0;
+			return 0;
+		}
 
 		if (vcpu->arch.halt_request) {
 			vcpu->arch.halt_request = 0;
-			ret = kvm_vcpu_halt(vcpu);
-			goto out;
+			return kvm_vcpu_halt(vcpu);
 		}
 
+		/*
+		 * Note, return 1, not 0; vcpu_run() is responsible for
+		 * morphing the pending signal into the proper return code.
+		 */
 		if (signal_pending(current))
-			goto out;
+			return 1;
+
 		if (need_resched())
 			schedule();
 	}
 
-out:
-	return ret;
-
-emulation_error:
-	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-	vcpu->run->internal.ndata = 0;
-	return 0;
+	return 1;
 }
 
 static void grow_ple_window(struct kvm_vcpu *vcpu)
@@ -5474,6 +5538,14 @@
 	return 1;
 }
 
+static int handle_unexpected_vmexit(struct kvm_vcpu *vcpu)
+{
+	kvm_skip_emulated_instruction(vcpu);
+	WARN_ONCE(1, "Unexpected VM-Exit Reason = 0x%x",
+		vmcs_read32(VM_EXIT_REASON));
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -5525,13 +5597,15 @@
 	[EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
 	[EXIT_REASON_RDRAND]                  = handle_invalid_op,
 	[EXIT_REASON_RDSEED]                  = handle_invalid_op,
-	[EXIT_REASON_XSAVES]                  = handle_xsaves,
-	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
+	[EXIT_REASON_XSAVES]                  = handle_unexpected_vmexit,
+	[EXIT_REASON_XRSTORS]                 = handle_unexpected_vmexit,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
 	[EXIT_REASON_INVPCID]                 = handle_invpcid,
 	[EXIT_REASON_VMFUNC]		      = handle_vmx_instruction,
 	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
 	[EXIT_REASON_ENCLS]		      = handle_encls,
+	[EXIT_REASON_UMWAIT]                  = handle_unexpected_vmexit,
+	[EXIT_REASON_TPAUSE]                  = handle_unexpected_vmexit,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -6362,6 +6436,23 @@
 					msrs[i].host, false);
 }
 
+static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx)
+{
+	u32 host_umwait_control;
+
+	if (!vmx_has_waitpkg(vmx))
+		return;
+
+	host_umwait_control = get_umwait_control_msr();
+
+	if (vmx->msr_ia32_umwait_control != host_umwait_control)
+		add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL,
+			vmx->msr_ia32_umwait_control,
+			host_umwait_control, false);
+	else
+		clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL);
+}
+
 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6456,6 +6547,7 @@
 	pt_guest_enter(vmx);
 
 	atomic_switch_perf_msrs(vmx);
+	atomic_switch_umwait_control_msr(vmx);
 
 	if (enable_preemption_timer)
 		vmx_update_hv_timer(vcpu);
@@ -6511,6 +6603,9 @@
 		current_evmcs->hv_clean_fields |=
 			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
 
+	if (static_branch_unlikely(&enable_evmcs))
+		current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
+
 	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
 	if (vmx->host_debugctlmsr)
 		update_debugctlmsr(vmx->host_debugctlmsr);
@@ -6578,6 +6673,7 @@
 
 static void vmx_vm_free(struct kvm *kvm)
 {
+	kfree(kvm->arch.hyperv.hv_pa_pg);
 	vfree(to_kvm_vmx(kvm));
 }
 
@@ -7706,7 +7802,7 @@
 
 	.run = vmx_vcpu_run,
 	.handle_exit = vmx_handle_exit,
-	.skip_emulated_instruction = __skip_emulated_instruction,
+	.skip_emulated_instruction = skip_emulated_instruction,
 	.set_interrupt_shadow = vmx_set_interrupt_shadow,
 	.get_interrupt_shadow = vmx_get_interrupt_shadow,
 	.patch_hypercall = vmx_patch_hypercall,
@@ -7837,6 +7933,7 @@
 			if (!vp_ap)
 				continue;
 
+			vp_ap->nested_control.features.directhypercall = 0;
 			vp_ap->current_nested_vmcs = 0;
 			vp_ap->enlighten_vmentry = 0;
 		}
@@ -7876,6 +7973,11 @@
 			pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
 			static_branch_enable(&enable_evmcs);
 		}
+
+		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+			vmx_x86_ops.enable_direct_tlbflush
+				= hv_enable_direct_tlbflush;
+
 	} else {
 		enlightened_vmcs = false;
 	}
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 64d5a48..bee1668 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -14,6 +14,8 @@
 extern const u32 vmx_msr_index[];
 extern u64 host_efer;
 
+extern u32 get_umwait_control_msr(void);
+
 #define MSR_TYPE_R	1
 #define MSR_TYPE_W	2
 #define MSR_TYPE_RW	3
@@ -211,6 +213,7 @@
 #endif
 
 	u64		      spec_ctrl;
+	u32		      msr_ia32_umwait_control;
 
 	u32 secondary_exec_control;
 
@@ -497,6 +500,12 @@
 	vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
 }
 
+static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
+{
+	return vmx->secondary_exec_control &
+		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
+}
+
 void dump_vmcs(void);
 
 #endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dfd6412..0ed07d8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -360,7 +360,8 @@
 asmlinkage __visible void kvm_spurious_fault(void)
 {
 	/* Fault while not rebooting.  We want the trace. */
-	BUG();
+	if (!kvm_rebooting)
+		BUG();
 }
 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
 
@@ -1145,6 +1146,44 @@
 	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
 	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
 	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
+	MSR_IA32_UMWAIT_CONTROL,
+
+	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
+	MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
+	MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
+	MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
+	MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
+	MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
+	MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
+	MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
+	MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
+	MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
+	MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
+	MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
+	MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
+	MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19,
+	MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21,
+	MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23,
+	MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25,
+	MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27,
+	MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29,
+	MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31,
+	MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29,
+	MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31,
 };
 
 static unsigned num_msrs_to_save;
@@ -3169,7 +3208,6 @@
 	case KVM_CAP_HYPERV_EVENTFD:
 	case KVM_CAP_HYPERV_TLBFLUSH:
 	case KVM_CAP_HYPERV_SEND_IPI:
-	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
 	case KVM_CAP_HYPERV_CPUID:
 	case KVM_CAP_PCI_SEGMENT:
 	case KVM_CAP_DEBUGREGS:
@@ -3246,6 +3284,12 @@
 		r = kvm_x86_ops->get_nested_state ?
 			kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0;
 		break;
+	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+		r = kvm_x86_ops->enable_direct_tlbflush != NULL;
+		break;
+	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+		r = kvm_x86_ops->nested_enable_evmcs != NULL;
+		break;
 	default:
 		break;
 	}
@@ -4019,6 +4063,11 @@
 				r = -EFAULT;
 		}
 		return r;
+	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+		if (!kvm_x86_ops->enable_direct_tlbflush)
+			return -ENOTTY;
+
+		return kvm_x86_ops->enable_direct_tlbflush(vcpu);
 
 	default:
 		return -EINVAL;
@@ -5051,6 +5100,11 @@
 	u32 dummy[2];
 	unsigned i, j;
 
+	BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
+			 "Please update the fixed PMCs in msrs_to_save[]");
+	BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32,
+			 "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]");
+
 	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
 		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
 			continue;
@@ -5389,7 +5443,6 @@
 int handle_ud(struct kvm_vcpu *vcpu)
 {
 	int emul_type = EMULTYPE_TRAP_UD;
-	enum emulation_result er;
 	char sig[5]; /* ud2; .ascii "kvm" */
 	struct x86_exception e;
 
@@ -5398,15 +5451,10 @@
 				sig, sizeof(sig), &e) == 0 &&
 	    memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
 		kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
-		emul_type = 0;
+		emul_type = EMULTYPE_TRAP_UD_FORCED;
 	}
 
-	er = kvm_emulate_instruction(vcpu, emul_type);
-	if (er == EMULATE_USER_EXIT)
-		return 0;
-	if (er != EMULATE_DONE)
-		kvm_queue_exception(vcpu, UD_VECTOR);
-	return 1;
+	return kvm_emulate_instruction(vcpu, emul_type);
 }
 EXPORT_SYMBOL_GPL(handle_ud);
 
@@ -6228,7 +6276,7 @@
 	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
 }
 
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 {
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	int ret;
@@ -6240,37 +6288,43 @@
 	ctxt->_eip = ctxt->eip + inc_eip;
 	ret = emulate_int_real(ctxt, irq);
 
-	if (ret != X86EMUL_CONTINUE)
-		return EMULATE_FAIL;
-
-	ctxt->eip = ctxt->_eip;
-	kvm_rip_write(vcpu, ctxt->eip);
-	kvm_set_rflags(vcpu, ctxt->eflags);
-
-	return EMULATE_DONE;
+	if (ret != X86EMUL_CONTINUE) {
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+	} else {
+		ctxt->eip = ctxt->_eip;
+		kvm_rip_write(vcpu, ctxt->eip);
+		kvm_set_rflags(vcpu, ctxt->eflags);
+	}
 }
 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
 
 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
 {
-	int r = EMULATE_DONE;
-
 	++vcpu->stat.insn_emulation_fail;
 	trace_kvm_emulate_insn_failed(vcpu);
 
-	if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
-		return EMULATE_FAIL;
+	if (emulation_type & EMULTYPE_VMWARE_GP) {
+		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+		return 1;
+	}
+
+	if (emulation_type & EMULTYPE_SKIP) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
+		return 0;
+	}
+
+	kvm_queue_exception(vcpu, UD_VECTOR);
 
 	if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 		vcpu->run->internal.ndata = 0;
-		r = EMULATE_USER_EXIT;
+		return 0;
 	}
 
-	kvm_queue_exception(vcpu, UD_VECTOR);
-
-	return r;
+	return 1;
 }
 
 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
@@ -6425,7 +6479,7 @@
 	return dr6;
 }
 
-static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *kvm_run = vcpu->run;
 
@@ -6434,10 +6488,10 @@
 		kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
 		kvm_run->debug.arch.exception = DB_VECTOR;
 		kvm_run->exit_reason = KVM_EXIT_DEBUG;
-		*r = EMULATE_USER_EXIT;
-	} else {
-		kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
+		return 0;
 	}
+	kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
+	return 1;
 }
 
 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
@@ -6446,7 +6500,7 @@
 	int r;
 
 	r = kvm_x86_ops->skip_emulated_instruction(vcpu);
-	if (unlikely(r != EMULATE_DONE))
+	if (unlikely(!r))
 		return 0;
 
 	/*
@@ -6458,8 +6512,8 @@
 	 * that sets the TF flag".
 	 */
 	if (unlikely(rflags & X86_EFLAGS_TF))
-		kvm_vcpu_do_singlestep(vcpu, &r);
-	return r == EMULATE_DONE;
+		r = kvm_vcpu_do_singlestep(vcpu);
+	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
 
@@ -6478,7 +6532,7 @@
 			kvm_run->debug.arch.pc = eip;
 			kvm_run->debug.arch.exception = DB_VECTOR;
 			kvm_run->exit_reason = KVM_EXIT_DEBUG;
-			*r = EMULATE_USER_EXIT;
+			*r = 0;
 			return true;
 		}
 	}
@@ -6494,7 +6548,7 @@
 			vcpu->arch.dr6 &= ~DR_TRAP_BITS;
 			vcpu->arch.dr6 |= dr6 | DR6_RTM;
 			kvm_queue_exception(vcpu, DB_VECTOR);
-			*r = EMULATE_DONE;
+			*r = 1;
 			return true;
 		}
 	}
@@ -6578,11 +6632,14 @@
 		trace_kvm_emulate_insn_start(vcpu);
 		++vcpu->stat.insn_emulation;
 		if (r != EMULATION_OK)  {
-			if (emulation_type & EMULTYPE_TRAP_UD)
-				return EMULATE_FAIL;
+			if ((emulation_type & EMULTYPE_TRAP_UD) ||
+			    (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
+				kvm_queue_exception(vcpu, UD_VECTOR);
+				return 1;
+			}
 			if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
 						emulation_type))
-				return EMULATE_DONE;
+				return 1;
 			if (ctxt->have_exception) {
 				/*
 				 * #UD should result in just EMULATION_FAILED, and trap-like
@@ -6591,28 +6648,32 @@
 				WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
 					     exception_type(ctxt->exception.vector) == EXCPT_TRAP);
 				inject_emulated_exception(vcpu);
-				return EMULATE_DONE;
+				return 1;
 			}
-			if (emulation_type & EMULTYPE_SKIP)
-				return EMULATE_FAIL;
 			return handle_emulation_failure(vcpu, emulation_type);
 		}
 	}
 
-	if ((emulation_type & EMULTYPE_VMWARE) &&
-	    !is_vmware_backdoor_opcode(ctxt))
-		return EMULATE_FAIL;
+	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
+	    !is_vmware_backdoor_opcode(ctxt)) {
+		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+		return 1;
+	}
 
+	/*
+	 * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
+	 * for kvm_skip_emulated_instruction().  The caller is responsible for
+	 * updating interruptibility state and injecting single-step #DBs.
+	 */
 	if (emulation_type & EMULTYPE_SKIP) {
 		kvm_rip_write(vcpu, ctxt->_eip);
 		if (ctxt->eflags & X86_EFLAGS_RF)
 			kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
-		kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
-		return EMULATE_DONE;
+		return 1;
 	}
 
 	if (retry_instruction(ctxt, cr2, emulation_type))
-		return EMULATE_DONE;
+		return 1;
 
 	/* this is needed for vmware backdoor interface to work since it
 	   changes registers values  during IO operation */
@@ -6628,18 +6689,18 @@
 	r = x86_emulate_insn(ctxt);
 
 	if (r == EMULATION_INTERCEPTED)
-		return EMULATE_DONE;
+		return 1;
 
 	if (r == EMULATION_FAILED) {
 		if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
 					emulation_type))
-			return EMULATE_DONE;
+			return 1;
 
 		return handle_emulation_failure(vcpu, emulation_type);
 	}
 
 	if (ctxt->have_exception) {
-		r = EMULATE_DONE;
+		r = 1;
 		if (inject_emulated_exception(vcpu))
 			return r;
 	} else if (vcpu->arch.pio.count) {
@@ -6650,16 +6711,18 @@
 			writeback = false;
 			vcpu->arch.complete_userspace_io = complete_emulated_pio;
 		}
-		r = EMULATE_USER_EXIT;
+		r = 0;
 	} else if (vcpu->mmio_needed) {
+		++vcpu->stat.mmio_exits;
+
 		if (!vcpu->mmio_is_write)
 			writeback = false;
-		r = EMULATE_USER_EXIT;
+		r = 0;
 		vcpu->arch.complete_userspace_io = complete_emulated_mmio;
 	} else if (r == EMULATION_RESTART)
 		goto restart;
 	else
-		r = EMULATE_DONE;
+		r = 1;
 
 	if (writeback) {
 		unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
@@ -6668,8 +6731,8 @@
 		if (!ctxt->have_exception ||
 		    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
 			kvm_rip_write(vcpu, ctxt->eip);
-			if (r == EMULATE_DONE && ctxt->tf)
-				kvm_vcpu_do_singlestep(vcpu, &r);
+			if (r && ctxt->tf)
+				r = kvm_vcpu_do_singlestep(vcpu);
 			__kvm_set_rflags(vcpu, ctxt->eflags);
 		}
 
@@ -8263,12 +8326,11 @@
 static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
 {
 	int r;
+
 	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 	r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-	if (r != EMULATE_DONE)
-		return 0;
-	return 1;
+	return r;
 }
 
 static int complete_emulated_pio(struct kvm_vcpu *vcpu)
@@ -8636,14 +8698,17 @@
 
 	ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
 				   has_error_code, error_code);
-
-	if (ret)
-		return EMULATE_FAIL;
+	if (ret) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
+		return 0;
+	}
 
 	kvm_rip_write(vcpu, ctxt->eip);
 	kvm_set_rflags(vcpu, ctxt->eflags);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
-	return EMULATE_DONE;
+	return 1;
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
 
@@ -9361,6 +9426,7 @@
 
 	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
@@ -9690,8 +9756,13 @@
 	 * Scan sptes if dirty logging has been stopped, dropping those
 	 * which can be collapsed into a single large-page spte.  Later
 	 * page faults will create the large-page sptes.
+	 *
+	 * There is no need to do this in any of the following cases:
+	 * CREATE:	No dirty mappings can exist yet.
+	 * MOVE/DELETE:	The old mappings will already have been cleaned up by
+	 *		kvm_arch_flush_shadow_memslot().
 	 */
-	if ((change != KVM_MR_DELETE) &&
+	if (change == KVM_MR_FLAGS_ONLY &&
 		(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
 		!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
 		kvm_mmu_zap_collapsible_sptes(kvm, new);
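
Two themes dominate the x86.c changes: the EMULATE_DONE/EMULATE_USER_EXIT/
EMULATE_FAIL tri-state gives way to the plain exit-handler convention (1 = resume
the guest, 0 = exit to userspace, with exceptions queued directly), and
KVM_CAP_HYPERV_DIRECT_TLBFLUSH becomes a per-vCPU capability that userspace opts
into. A sketch of the userspace side of the latter, assuming uapi headers new
enough to define the capability and an already-created vCPU fd:

    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <errno.h>
    #include <stdio.h>

    /* Ask KVM to let Hyper-V issue direct (enlightened) TLB flushes. */
    static int enable_direct_tlbflush(int vcpu_fd)
    {
            struct kvm_enable_cap cap = {
                    .cap = KVM_CAP_HYPERV_DIRECT_TLBFLUSH,
            };

            if (ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap) < 0) {
                    /* ENOTTY: no eVMCS support / not running on Hyper-V. */
                    perror("KVM_ENABLE_CAP(HYPERV_DIRECT_TLBFLUSH)");
                    return -errno;
            }
            return 0;
    }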
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b5274e2..dbf7442 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -261,7 +261,7 @@
 }
 
 void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
 u64 get_kvmclock_ns(struct kvm *kvm);
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index fa16036f..65ebe4b 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -54,23 +54,10 @@
 	return ret;
 }
 
-static u64 compute_subtree_max_end(struct memtype *data)
-{
-	u64 max_end = data->end, child_max_end;
+#define NODE_END(node) ((node)->end)
 
-	child_max_end = get_subtree_max_end(data->rb.rb_right);
-	if (child_max_end > max_end)
-		max_end = child_max_end;
-
-	child_max_end = get_subtree_max_end(data->rb.rb_left);
-	if (child_max_end > max_end)
-		max_end = child_max_end;
-
-	return max_end;
-}
-
-RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb,
-		     u64, subtree_max_end, compute_subtree_max_end)
+RB_DECLARE_CALLBACKS_MAX(static, memtype_rb_augment_cb,
+			 struct memtype, rb, u64, subtree_max_end, NODE_END)
 
 /* Find the first (lowest start addr) overlapping range from rb tree */
 static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
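
RB_DECLARE_CALLBACKS_MAX is a new convenience form for the common augmented-rbtree
case of tracking a per-subtree maximum: given the per-node value expression
(NODE_END here), it generates callbacks that keep subtree_max_end equal to the
largest NODE_END in the node's subtree, which is exactly what the deleted
compute_subtree_max_end() spelled out by hand (drbd_interval.c below gets the same
conversion). A plain-C sketch of the invariant the generated code maintains, using
a hypothetical node layout:

    /* Hypothetical node mirroring struct memtype's augmented fields. */
    struct node {
            unsigned long long end;              /* NODE_END(node)       */
            unsigned long long subtree_max_end;  /* maintained invariant */
            struct node *left, *right;
    };

    /* After every insert/erase/rotation, the generated callbacks ensure
     * n->subtree_max_end == subtree_max(n) for every node n. */
    static unsigned long long subtree_max(const struct node *n)
    {
            unsigned long long max = n->end;

            if (n->left && n->left->subtree_max_end > max)
                    max = n->left->subtree_max_end;
            if (n->right && n->right->subtree_max_end > max)
                    max = n->right->subtree_max_end;
            return max;
    }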
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 44816ff..3e4b903 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -45,7 +45,7 @@
 
 void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 {
-	pgtable_page_dtor(pte);
+	pgtable_pte_page_dtor(pte);
 	paravirt_release_pte(page_to_pfn(pte));
 	paravirt_tlb_remove_table(tlb, pte);
 }
@@ -357,7 +357,7 @@
 
 static struct kmem_cache *pgd_cache;
 
-void __init pgd_cache_init(void)
+void __init pgtable_cache_init(void)
 {
 	/*
 	 * When PAE kernel is running as a Xen domain, it does not use
@@ -402,10 +402,6 @@
 }
 #else
 
-void __init pgd_cache_init(void)
-{
-}
-
 static inline pgd_t *_pgd_alloc(void)
 {
 	return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 0881e1f..a8bd952 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/io.h>
 #include <linux/mmiotrace.h>
+#include <linux/security.h>
 
 static unsigned long mmio_address;
 module_param_hw(mmio_address, ulong, iomem, 0);
@@ -115,6 +116,10 @@
 static int __init init(void)
 {
 	unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
+	int ret = security_locked_down(LOCKDOWN_MMIOTRACE);
+
+	if (ret)
+		return ret;
 
 	if (mmio_address == 0) {
 		pr_err("you have to use the module argument mmio_address.\n");
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 5277490..fb4ee54 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -25,6 +25,7 @@
 
 PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
 PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss
+PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN)
 
 # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
 # in turn leaves some undefined symbols like __fentry__ in purgatory and not
diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h
index dd744aa..1d38f0e 100644
--- a/arch/xtensa/include/asm/pgalloc.h
+++ b/arch/xtensa/include/asm/pgalloc.h
@@ -55,7 +55,7 @@
 	if (!pte)
 		return NULL;
 	page = virt_to_page(pte);
-	if (!pgtable_page_ctor(page)) {
+	if (!pgtable_pte_page_ctor(page)) {
 		__free_page(page);
 		return NULL;
 	}
@@ -69,7 +69,7 @@
 
 static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
 {
-	pgtable_page_dtor(pte);
+	pgtable_pte_page_dtor(pte);
 	__free_page(pte);
 }
 #define pmd_pgtable(pmd) pmd_page(pmd)
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index ce3ff5e..3f7fe5a 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -238,7 +238,6 @@
 # define swapper_pg_dir NULL
 static inline void paging_init(void) { }
 #endif
-static inline void pgtable_cache_init(void) { }
 
 /*
  * The pmd contains the kernel virtual address of the pte page.
diff --git a/arch/xtensa/include/asm/tlbflush.h b/arch/xtensa/include/asm/tlbflush.h
index 06875feb..856e2da 100644
--- a/arch/xtensa/include/asm/tlbflush.h
+++ b/arch/xtensa/include/asm/tlbflush.h
@@ -160,9 +160,6 @@
 		invalidate_dtlb_entry(tlb_entry);
 }
 
-#define check_pgt_cache()	do { } while (0)
-
-
 /*
  * DO NOT USE THESE FUNCTIONS.  These instructions aren't part of the Xtensa
  * ISA and exist only for test purposes..
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index ebbb488..e5e6437 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -103,6 +103,9 @@
 #define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
 #define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
 
+#define MADV_COLD	20		/* deactivate these pages */
+#define MADV_PAGEOUT	21		/* reclaim these pages */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
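MADV_COLD hints that a range will not be needed soon (its pages are deactivated),
while MADV_PAGEOUT additionally asks for the pages to be reclaimed right away;
neither destroys the contents, unlike MADV_DONTNEED. A small userspace sketch,
assuming a kernel that implements the new hints; the fallback values are the
generic ones, which the xtensa header above mirrors:

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <stdio.h>
    #include <string.h>

    #ifndef MADV_COLD
    #define MADV_COLD    20
    #endif
    #ifndef MADV_PAGEOUT
    #define MADV_PAGEOUT 21
    #endif

    int main(void)
    {
            size_t len = 16 << 20;          /* 16 MiB scratch buffer */
            char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (buf == MAP_FAILED)
                    return 1;
            memset(buf, 0xa5, len);         /* fault the pages in */

            /* Weak hint: deactivate, but keep the contents around. */
            if (madvise(buf, len, MADV_COLD))
                    perror("MADV_COLD");

            /* Stronger hint: reclaim (swap out / drop clean pages) now. */
            if (madvise(buf, len, MADV_PAGEOUT))
                    perror("MADV_PAGEOUT");

            munmap(buf, len);
            return 0;
    }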
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index b33be92..0319d63 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2016,7 +2016,7 @@
 		     (bfqq->last_serv_time_ns > 0 &&
 		      bfqd->rqs_injected && bfqd->rq_in_driver > 0)) &&
 		    time_is_before_eq_jiffies(bfqq->decrease_time_jif +
-					      msecs_to_jiffies(100))) {
+					      msecs_to_jiffies(10))) {
 			bfqd->last_empty_occupied_ns = ktime_get_ns();
 			/*
 			 * Start the state machine for measuring the
@@ -2025,7 +2025,21 @@
 			 * be set when rq will be dispatched.
 			 */
 			bfqd->wait_dispatch = true;
-			bfqd->rqs_injected = false;
+			/*
+			 * If there is no I/O in service in the drive,
+			 * then possible injection occurred before the
+			 * arrival of rq will not affect the total
+			 * service time of rq. So the injection limit
+			 * must not be updated as a function of such
+			 * total service time, unless new injection
+			 * occurs before rq is completed. To have the
+			 * injection limit updated only in the latter
+			 * case, reset rqs_injected here (rqs_injected
+			 * will be set in case injection is performed
+			 * on bfqq before rq is completed).
+			 */
+			if (bfqd->rq_in_driver == 0)
+				bfqd->rqs_injected = false;
 		}
 	}
 
@@ -5784,14 +5798,14 @@
 	u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns;
 	unsigned int old_limit = bfqq->inject_limit;
 
-	if (bfqq->last_serv_time_ns > 0) {
+	if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) {
 		u64 threshold = (bfqq->last_serv_time_ns * 3)>>1;
 
 		if (tot_time_ns >= threshold && old_limit > 0) {
 			bfqq->inject_limit--;
 			bfqq->decrease_time_jif = jiffies;
 		} else if (tot_time_ns < threshold &&
-			   old_limit < bfqd->max_rq_in_driver<<1)
+			   old_limit <= bfqd->max_rq_in_driver)
 			bfqq->inject_limit++;
 	}
 
@@ -5809,12 +5823,14 @@
 	 */
 	if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) ||
 	    tot_time_ns < bfqq->last_serv_time_ns) {
+		if (bfqq->last_serv_time_ns == 0) {
+			/*
+			 * Now we certainly have a base value: make sure we
+			 * start trying injection.
+			 */
+			bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
+		}
 		bfqq->last_serv_time_ns = tot_time_ns;
-		/*
-		 * Now we certainly have a base value: make sure we
-		 * start trying injection.
-		 */
-		bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
 	} else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1)
 		/*
 		 * No I/O injected and no request still in service in
@@ -5830,6 +5846,7 @@
 
 	/* update complete, not waiting for any request completion any longer */
 	bfqd->waited_rq = NULL;
+	bfqd->rqs_injected = false;
 }
 
 /*
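
The bfq changes tighten when an injection measurement is trusted (only when
injection actually took place while the sampled request was in flight, i.e.
bfqd->rqs_injected is set) and cap the limit's growth at max_rq_in_driver instead
of twice that. Condensed to its core, the update rule now reads as follows,
sketched with the bfq fields reduced to plain parameters:

    /* Sketch of the post-change injection-limit update rule. */
    static unsigned int update_inject_limit(unsigned int limit,
                                            unsigned long long tot_ns,
                                            unsigned long long base_ns,
                                            int rqs_injected,
                                            unsigned int max_rq_in_driver)
    {
            if (base_ns > 0 && rqs_injected) {
                    unsigned long long threshold = (base_ns * 3) >> 1;

                    if (tot_ns >= threshold && limit > 0)
                            limit--;        /* injection hurt: back off  */
                    else if (tot_ns < threshold && limit <= max_rq_in_driver)
                            limit++;        /* still cheap: probe higher */
            }
            return limit;
    }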
diff --git a/block/blk-core.c b/block/blk-core.c
index 875e8d1..d5e668e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,6 +34,7 @@
 #include <linux/ratelimit.h>
 #include <linux/pm_runtime.h>
 #include <linux/blk-cgroup.h>
+#include <linux/t10-pi.h>
 #include <linux/debugfs.h>
 #include <linux/bpf.h>
 #include <linux/psi.h>
@@ -1436,6 +1437,12 @@
 	if (!req->bio)
 		return false;
 
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
+	    error == BLK_STS_OK)
+		req->q->integrity.profile->complete_fn(req, nr_bytes);
+#endif
+
 	if (unlikely(error && !blk_rq_is_passthrough(req) &&
 		     !(req->rq_flags & RQF_QUIET)))
 		print_req_error(req, error, __func__);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index aedd932..1eec9cb 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -214,6 +214,16 @@
 
 	/* release the tag's ownership to the req cloned from */
 	spin_lock_irqsave(&fq->mq_flush_lock, flags);
+
+	if (!refcount_dec_and_test(&flush_rq->ref)) {
+		fq->rq_status = error;
+		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+		return;
+	}
+
+	if (fq->rq_status != BLK_STS_OK)
+		error = fq->rq_status;
+
 	hctx = flush_rq->mq_hctx;
 	if (!q->elevator) {
 		blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
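
Together with the blk-mq.c timeout hunk below, this turns the flush request's
end-io into a reference-counted hand-off: whichever of the normal completion path
or the timeout path drops the last reference performs the real completion, and an
earlier dropper only records its status in fq->rq_status for the survivor to merge
in. The shape of the pattern as a standalone sketch, with C11 atomics standing in
for refcount_t (in the kernel the status hand-off is additionally serialized by
fq->mq_flush_lock):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct flush_ctx {
            atomic_int ref;        /* stands in for refcount_t  */
            int        rq_status;  /* stashed error, 0 == OK    */
    };

    /* Returns true only for the caller that dropped the last reference;
     * that caller alone may run the real end-io work. */
    static bool flush_put(struct flush_ctx *fq, int *error)
    {
            if (atomic_fetch_sub(&fq->ref, 1) != 1) {
                    fq->rq_status = *error;  /* stash for the survivor */
                    return false;            /* not last: nothing more */
            }
            if (fq->rq_status)
                    *error = fq->rq_status;  /* merge the stashed error */
            return true;                     /* last ref: real end-io */
    }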
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index ca39b46..ff1070e 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -368,10 +368,21 @@
 	return BLK_STS_OK;
 }
 
+static void blk_integrity_nop_prepare(struct request *rq)
+{
+}
+
+static void blk_integrity_nop_complete(struct request *rq,
+		unsigned int nr_bytes)
+{
+}
+
 static const struct blk_integrity_profile nop_profile = {
 	.name = "nop",
 	.generate_fn = blk_integrity_nop_fn,
 	.verify_fn = blk_integrity_nop_fn,
+	.prepare_fn = blk_integrity_nop_prepare,
+	.complete_fn = blk_integrity_nop_complete,
 };
 
 /**
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 3b39deb..2a3db80 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -529,8 +529,8 @@
 static const struct ioc_params autop[] = {
 	[AUTOP_HDD] = {
 		.qos				= {
-			[QOS_RLAT]		=         50000, /* 50ms */
-			[QOS_WLAT]		=         50000,
+			[QOS_RLAT]		=        250000, /* 250ms */
+			[QOS_WLAT]		=        250000,
 			[QOS_MIN]		= VRATE_MIN_PPM,
 			[QOS_MAX]		= VRATE_MAX_PPM,
 		},
@@ -1343,7 +1343,7 @@
 	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
 	u32 missed_ppm[2], rq_wait_pct;
 	u64 period_vtime;
-	int i;
+	int prev_busy_level, i;
 
 	/* how were the latencies during the period? */
 	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
@@ -1407,7 +1407,8 @@
 		 * comparing vdone against period start.  If lagging behind
 		 * IOs from past periods, don't increase vrate.
 		 */
-		if (!atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
+		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
+		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
 		    time_after64(vtime, vdone) &&
 		    time_after64(vtime, now.vnow -
 				 MAX_LAGGING_PERIODS * period_vtime) &&
@@ -1531,26 +1532,29 @@
 	 * and experiencing shortages but not surpluses, we're too stingy
 	 * and should increase vtime rate.
 	 */
+	prev_busy_level = ioc->busy_level;
 	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
 	    missed_ppm[READ] > ppm_rthr ||
 	    missed_ppm[WRITE] > ppm_wthr) {
 		ioc->busy_level = max(ioc->busy_level, 0);
 		ioc->busy_level++;
-	} else if (nr_lagging) {
-		ioc->busy_level = max(ioc->busy_level, 0);
-	} else if (nr_shortages && !nr_surpluses &&
-		   rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
+	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
 		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
 		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
-		ioc->busy_level = min(ioc->busy_level, 0);
-		ioc->busy_level--;
+		/* take action iff there is contention */
+		if (nr_shortages && !nr_lagging) {
+			ioc->busy_level = min(ioc->busy_level, 0);
+			/* redistribute surpluses first */
+			if (!nr_surpluses)
+				ioc->busy_level--;
+		}
 	} else {
 		ioc->busy_level = 0;
 	}
 
 	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
 
-	if (ioc->busy_level) {
+	if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
 		u64 vrate = atomic64_read(&ioc->vtime_rate);
 		u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
 
@@ -1592,6 +1596,10 @@
 		atomic64_set(&ioc->vtime_rate, vrate);
 		ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
 			ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
+	} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
+		trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
+					   &missed_ppm, rq_wait_pct, nr_lagging,
+					   nr_shortages, nr_surpluses);
 	}
 
 	ioc_refresh_params(ioc, false);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c9d183d..ca22afd 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -555,8 +555,6 @@
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
-	lockdep_assert_held(&q->sysfs_lock);
-
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (hctx->sched_tags)
 			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 20a49be..6e3b15f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -30,6 +30,7 @@
 #include <trace/events/block.h>
 
 #include <linux/blk-mq.h>
+#include <linux/t10-pi.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
@@ -700,6 +701,11 @@
 		 */
 		rq->nr_phys_segments++;
 	}
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+	if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
+		q->integrity.profile->prepare_fn(rq);
+#endif
 }
 EXPORT_SYMBOL(blk_mq_start_request);
 
@@ -912,7 +918,10 @@
 	 */
 	if (blk_mq_req_expired(rq, next))
 		blk_mq_rq_timed_out(rq, reserved);
-	if (refcount_dec_and_test(&rq->ref))
+
+	if (is_flush_rq(rq, hctx))
+		rq->end_io(rq, 0);
+	else if (refcount_dec_and_test(&rq->ref))
 		__blk_mq_free_request(rq);
 
 	return true;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b82736c..46f5198 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -482,7 +482,6 @@
 	blk_mq_quiesce_queue(q);
 
 	wbt_set_min_lat(q, val);
-	wbt_update_limits(q);
 
 	blk_mq_unquiesce_queue(q);
 	blk_mq_unfreeze_queue(q);
@@ -989,13 +988,11 @@
 		blk_mq_debugfs_register(q);
 	}
 
-	/*
-	 * The flag of QUEUE_FLAG_REGISTERED isn't set yet, so elevator
-	 * switch won't happen at all.
-	 */
+	mutex_lock(&q->sysfs_lock);
 	if (q->elevator) {
 		ret = elv_register_queue(q, false);
 		if (ret) {
+			mutex_unlock(&q->sysfs_lock);
 			mutex_unlock(&q->sysfs_dir_lock);
 			kobject_del(&q->kobj);
 			blk_trace_remove_sysfs(dev);
@@ -1005,7 +1002,6 @@
 		has_elevator = true;
 	}
 
-	mutex_lock(&q->sysfs_lock);
 	blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
 	wbt_enable_default(q);
 	blk_throtl_register_queue(q);
@@ -1062,12 +1058,10 @@
 	kobject_del(&q->kobj);
 	blk_trace_remove_sysfs(disk_to_dev(disk));
 
-	/*
-	 * q->kobj has been removed, so it is safe to check if elevator
-	 * exists without holding q->sysfs_lock.
-	 */
+	mutex_lock(&q->sysfs_lock);
 	if (q->elevator)
 		elv_unregister_queue(q);
+	mutex_unlock(&q->sysfs_lock);
 	mutex_unlock(&q->sysfs_dir_lock);
 
 	kobject_put(&disk_to_dev(disk)->kobj);
diff --git a/block/blk.h b/block/blk.h
index ed347f7..47fba93 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -19,6 +19,7 @@
 	unsigned int		flush_queue_delayed:1;
 	unsigned int		flush_pending_idx:1;
 	unsigned int		flush_running_idx:1;
+	blk_status_t		rq_status;
 	unsigned long		flush_pending_since;
 	struct list_head	flush_queue[2];
 	struct list_head	flush_data_in_flight;
@@ -47,6 +48,12 @@
 	kobject_get(&q->kobj);
 }
 
+static inline bool
+is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx)
+{
+	return hctx->fq->flush_rq == req;
+}
+
 struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
 		int node, int cmd_size, gfp_t flags);
 void blk_free_flush_queue(struct blk_flush_queue *q);
@@ -194,6 +201,8 @@
 static inline void elevator_exit(struct request_queue *q,
 		struct elevator_queue *e)
 {
+	lockdep_assert_held(&q->sysfs_lock);
+
 	blk_mq_sched_free_requests(q);
 	__elevator_exit(q, e);
 }
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 785dd58..347dda1 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -266,6 +266,7 @@
 	struct request *req = bd->rq;
 	struct bsg_set *bset =
 		container_of(q->tag_set, struct bsg_set, tag_set);
+	int sts = BLK_STS_IOERR;
 	int ret;
 
 	blk_mq_start_request(req);
@@ -274,14 +275,15 @@
 		return BLK_STS_IOERR;
 
 	if (!bsg_prepare_job(dev, req))
-		return BLK_STS_IOERR;
+		goto out;
 
 	ret = bset->job_fn(blk_mq_rq_to_pdu(req));
-	if (ret)
-		return BLK_STS_IOERR;
+	if (!ret)
+		sts = BLK_STS_OK;
 
+out:
 	put_device(dev);
-	return BLK_STS_OK;
+	return sts;
 }
 
 /* called right after the request is allocated for the request_queue */
diff --git a/block/elevator.c b/block/elevator.c
index bba10e8..5437059 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -503,9 +503,7 @@
 		if (uevent)
 			kobject_uevent(&e->kobj, KOBJ_ADD);
 
-		mutex_lock(&q->sysfs_lock);
 		e->registered = 1;
-		mutex_unlock(&q->sysfs_lock);
 	}
 	return error;
 }
@@ -523,11 +521,9 @@
 		kobject_uevent(&e->kobj, KOBJ_REMOVE);
 		kobject_del(&e->kobj);
 
-		mutex_lock(&q->sysfs_lock);
 		e->registered = 0;
 		/* Re-enable throttling in case elevator disabled it */
 		wbt_enable_default(q);
-		mutex_unlock(&q->sysfs_lock);
 	}
 }
 
@@ -590,32 +586,11 @@
 	lockdep_assert_held(&q->sysfs_lock);
 
 	if (q->elevator) {
-		if (q->elevator->registered) {
-			mutex_unlock(&q->sysfs_lock);
-
-			/*
-			 * Concurrent elevator switch can't happen becasue
-			 * sysfs write is always exclusively on same file.
-			 *
-			 * Also the elevator queue won't be freed after
-			 * sysfs_lock is released becasue kobject_del() in
-			 * blk_unregister_queue() waits for completion of
-			 * .store & .show on its attributes.
-			 */
+		if (q->elevator->registered)
 			elv_unregister_queue(q);
 
-			mutex_lock(&q->sysfs_lock);
-		}
 		ioc_clear_queue(q);
 		elevator_exit(q, q->elevator);
-
-		/*
-		 * sysfs_lock may be dropped, so re-check if queue is
-		 * unregistered. If yes, don't switch to new elevator
-		 * any more
-		 */
-		if (!blk_queue_registered(q))
-			return 0;
 	}
 
 	ret = blk_mq_init_sched(q, new_e);
@@ -623,11 +598,7 @@
 		goto out;
 
 	if (new_e) {
-		mutex_unlock(&q->sysfs_lock);
-
 		ret = elv_register_queue(q, true);
-
-		mutex_lock(&q->sysfs_lock);
 		if (ret) {
 			elevator_exit(q, q->elevator);
 			goto out;
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 0c00946..9803c7e 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -27,7 +27,7 @@
  * tag.
  */
 static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
-		csum_fn *fn, unsigned int type)
+		csum_fn *fn, enum t10_dif_type type)
 {
 	unsigned int i;
 
@@ -37,7 +37,7 @@
 		pi->guard_tag = fn(iter->data_buf, iter->interval);
 		pi->app_tag = 0;
 
-		if (type == 1)
+		if (type == T10_PI_TYPE1_PROTECTION)
 			pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed));
 		else
 			pi->ref_tag = 0;
@@ -51,17 +51,18 @@
 }
 
 static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
-		csum_fn *fn, unsigned int type)
+		csum_fn *fn, enum t10_dif_type type)
 {
 	unsigned int i;
 
+	BUG_ON(type == T10_PI_TYPE0_PROTECTION);
+
 	for (i = 0 ; i < iter->data_size ; i += iter->interval) {
 		struct t10_pi_tuple *pi = iter->prot_buf;
 		__be16 csum;
 
-		switch (type) {
-		case 1:
-		case 2:
+		if (type == T10_PI_TYPE1_PROTECTION ||
+		    type == T10_PI_TYPE2_PROTECTION) {
 			if (pi->app_tag == T10_PI_APP_ESCAPE)
 				goto next;
 
@@ -73,12 +74,10 @@
 				       iter->seed, be32_to_cpu(pi->ref_tag));
 				return BLK_STS_PROTECTION;
 			}
-			break;
-		case 3:
+		} else if (type == T10_PI_TYPE3_PROTECTION) {
 			if (pi->app_tag == T10_PI_APP_ESCAPE &&
 			    pi->ref_tag == T10_PI_REF_ESCAPE)
 				goto next;
-			break;
 		}
 
 		csum = fn(iter->data_buf, iter->interval);
@@ -102,94 +101,40 @@
 
 static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter)
 {
-	return t10_pi_generate(iter, t10_pi_crc_fn, 1);
+	return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION);
 }
 
 static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter)
 {
-	return t10_pi_generate(iter, t10_pi_ip_fn, 1);
+	return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION);
 }
 
 static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter)
 {
-	return t10_pi_verify(iter, t10_pi_crc_fn, 1);
+	return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION);
 }
 
 static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
 {
-	return t10_pi_verify(iter, t10_pi_ip_fn, 1);
+	return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION);
 }
 
-static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter)
-{
-	return t10_pi_generate(iter, t10_pi_crc_fn, 3);
-}
-
-static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter)
-{
-	return t10_pi_generate(iter, t10_pi_ip_fn, 3);
-}
-
-static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter)
-{
-	return t10_pi_verify(iter, t10_pi_crc_fn, 3);
-}
-
-static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
-{
-	return t10_pi_verify(iter, t10_pi_ip_fn, 3);
-}
-
-const struct blk_integrity_profile t10_pi_type1_crc = {
-	.name			= "T10-DIF-TYPE1-CRC",
-	.generate_fn		= t10_pi_type1_generate_crc,
-	.verify_fn		= t10_pi_type1_verify_crc,
-};
-EXPORT_SYMBOL(t10_pi_type1_crc);
-
-const struct blk_integrity_profile t10_pi_type1_ip = {
-	.name			= "T10-DIF-TYPE1-IP",
-	.generate_fn		= t10_pi_type1_generate_ip,
-	.verify_fn		= t10_pi_type1_verify_ip,
-};
-EXPORT_SYMBOL(t10_pi_type1_ip);
-
-const struct blk_integrity_profile t10_pi_type3_crc = {
-	.name			= "T10-DIF-TYPE3-CRC",
-	.generate_fn		= t10_pi_type3_generate_crc,
-	.verify_fn		= t10_pi_type3_verify_crc,
-};
-EXPORT_SYMBOL(t10_pi_type3_crc);
-
-const struct blk_integrity_profile t10_pi_type3_ip = {
-	.name			= "T10-DIF-TYPE3-IP",
-	.generate_fn		= t10_pi_type3_generate_ip,
-	.verify_fn		= t10_pi_type3_verify_ip,
-};
-EXPORT_SYMBOL(t10_pi_type3_ip);
-
 /**
- * t10_pi_prepare - prepare PI prior submitting request to device
+ * t10_pi_type1_prepare - prepare PI prior submitting request to device
  * @rq:              request with PI that should be prepared
- * @protection_type: PI type (Type 1/Type 2/Type 3)
  *
  * For Type 1/Type 2, the virtual start sector is the one that was
  * originally submitted by the block layer for the ref_tag usage. Due to
  * partitioning, MD/DM cloning, etc. the actual physical start sector is
  * likely to be different. Remap protection information to match the
  * physical LBA.
- *
- * Type 3 does not have a reference tag so no remapping is required.
  */
-void t10_pi_prepare(struct request *rq, u8 protection_type)
+static void t10_pi_type1_prepare(struct request *rq)
 {
 	const int tuple_sz = rq->q->integrity.tuple_size;
 	u32 ref_tag = t10_pi_ref_tag(rq);
 	struct bio *bio;
 
-	if (protection_type == T10_PI_TYPE3_PROTECTION)
-		return;
-
 	__rq_for_each_bio(bio, rq) {
 		struct bio_integrity_payload *bip = bio_integrity(bio);
 		u32 virt = bip_get_seed(bip) & 0xffffffff;
@@ -222,13 +167,11 @@
 		bip->bip_flags |= BIP_MAPPED_INTEGRITY;
 	}
 }
-EXPORT_SYMBOL(t10_pi_prepare);
 
 /**
- * t10_pi_complete - prepare PI prior returning request to the block layer
+ * t10_pi_type1_complete - prepare PI prior returning request to the blk layer
  * @rq:              request with PI that should be prepared
- * @protection_type: PI type (Type 1/Type 2/Type 3)
- * @intervals:       total elements to prepare
+ * @nr_bytes:        total bytes to prepare
  *
  * For Type 1/Type 2, the virtual start sector is the one that was
  * originally submitted by the block layer for the ref_tag usage. Due to
@@ -236,19 +179,14 @@
  * likely to be different. Since the physical start sector was submitted
  * to the device, we should remap it back to virtual values expected by the
  * block layer.
- *
- * Type 3 does not have a reference tag so no remapping is required.
  */
-void t10_pi_complete(struct request *rq, u8 protection_type,
-		     unsigned int intervals)
+static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
 {
+	unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp;
 	const int tuple_sz = rq->q->integrity.tuple_size;
 	u32 ref_tag = t10_pi_ref_tag(rq);
 	struct bio *bio;
 
-	if (protection_type == T10_PI_TYPE3_PROTECTION)
-		return;
-
 	__rq_for_each_bio(bio, rq) {
 		struct bio_integrity_payload *bip = bio_integrity(bio);
 		u32 virt = bip_get_seed(bip) & 0xffffffff;
@@ -276,4 +214,73 @@
 		}
 	}
 }
-EXPORT_SYMBOL(t10_pi_complete);
+
+static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter)
+{
+	return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION);
+}
+
+static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter)
+{
+	return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION);
+}
+
+static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter)
+{
+	return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION);
+}
+
+static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
+{
+	return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION);
+}
+
+/*
+ * Type 3 does not have a reference tag so no remapping is required.
+ */
+static void t10_pi_type3_prepare(struct request *rq)
+{
+}
+
+/*
+ * Type 3 does not have a reference tag so no remapping is required.
+ */
+static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes)
+{
+}
+
+const struct blk_integrity_profile t10_pi_type1_crc = {
+	.name			= "T10-DIF-TYPE1-CRC",
+	.generate_fn		= t10_pi_type1_generate_crc,
+	.verify_fn		= t10_pi_type1_verify_crc,
+	.prepare_fn		= t10_pi_type1_prepare,
+	.complete_fn		= t10_pi_type1_complete,
+};
+EXPORT_SYMBOL(t10_pi_type1_crc);
+
+const struct blk_integrity_profile t10_pi_type1_ip = {
+	.name			= "T10-DIF-TYPE1-IP",
+	.generate_fn		= t10_pi_type1_generate_ip,
+	.verify_fn		= t10_pi_type1_verify_ip,
+	.prepare_fn		= t10_pi_type1_prepare,
+	.complete_fn		= t10_pi_type1_complete,
+};
+EXPORT_SYMBOL(t10_pi_type1_ip);
+
+const struct blk_integrity_profile t10_pi_type3_crc = {
+	.name			= "T10-DIF-TYPE3-CRC",
+	.generate_fn		= t10_pi_type3_generate_crc,
+	.verify_fn		= t10_pi_type3_verify_crc,
+	.prepare_fn		= t10_pi_type3_prepare,
+	.complete_fn		= t10_pi_type3_complete,
+};
+EXPORT_SYMBOL(t10_pi_type3_crc);
+
+const struct blk_integrity_profile t10_pi_type3_ip = {
+	.name			= "T10-DIF-TYPE3-IP",
+	.generate_fn		= t10_pi_type3_generate_ip,
+	.verify_fn		= t10_pi_type3_verify_ip,
+	.prepare_fn		= t10_pi_type3_prepare,
+	.complete_fn		= t10_pi_type3_complete,
+};
+EXPORT_SYMBOL(t10_pi_type3_ip);
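
The t10-pi.c rework moves ref-tag remapping behind the integrity profile: the
block core now calls ->prepare_fn when a write with PI is started (blk-mq.c hunk
above) and ->complete_fn when a read with PI completes (blk-core.c hunk above), so
drivers no longer invoke t10_pi_prepare()/t10_pi_complete() directly — hence the
dropped exports — and the Type 3 no-ops replace the old runtime protection_type
checks. Any profile must now populate all four hooks; a hypothetical out-of-tree
profile (names illustrative only) would look like:

    /* Hypothetical integrity profile with no reference tag to remap. */
    static blk_status_t my_pi_generate(struct blk_integrity_iter *iter)
    {
            return BLK_STS_OK;              /* fill PI tuples here  */
    }

    static blk_status_t my_pi_verify(struct blk_integrity_iter *iter)
    {
            return BLK_STS_OK;              /* check PI tuples here */
    }

    static void my_pi_prepare(struct request *rq) { }
    static void my_pi_complete(struct request *rq, unsigned int nr_bytes) { }

    static const struct blk_integrity_profile my_pi_profile = {
            .name           = "MY-PI-NOREF",
            .generate_fn    = my_pi_generate,
            .verify_fn      = my_pi_verify,
            .prepare_fn     = my_pi_prepare,
            .complete_fn    = my_pi_complete,
    };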
diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index 1eba08a..7982911 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -190,33 +190,27 @@
 #ifdef CONFIG_SYSTEM_DATA_VERIFICATION
 
 /**
- * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data.
+ * verify_pkcs7_message_sig - Verify a PKCS#7-based signature on system data.
  * @data: The data to be verified (NULL if expecting internal data).
  * @len: Size of @data.
- * @raw_pkcs7: The PKCS#7 message that is the signature.
- * @pkcs7_len: The size of @raw_pkcs7.
+ * @pkcs7: The PKCS#7 message that is the signature.
  * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only,
  *					(void *)1UL for all trusted keys).
  * @usage: The use to which the key is being put.
  * @view_content: Callback to gain access to content.
  * @ctx: Context for callback.
  */
-int verify_pkcs7_signature(const void *data, size_t len,
-			   const void *raw_pkcs7, size_t pkcs7_len,
-			   struct key *trusted_keys,
-			   enum key_being_used_for usage,
-			   int (*view_content)(void *ctx,
-					       const void *data, size_t len,
-					       size_t asn1hdrlen),
-			   void *ctx)
+int verify_pkcs7_message_sig(const void *data, size_t len,
+			     struct pkcs7_message *pkcs7,
+			     struct key *trusted_keys,
+			     enum key_being_used_for usage,
+			     int (*view_content)(void *ctx,
+						 const void *data, size_t len,
+						 size_t asn1hdrlen),
+			     void *ctx)
 {
-	struct pkcs7_message *pkcs7;
 	int ret;
 
-	pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len);
-	if (IS_ERR(pkcs7))
-		return PTR_ERR(pkcs7);
-
 	/* The data should be detached - so we need to supply it. */
 	if (data && pkcs7_supply_detached_data(pkcs7, data, len) < 0) {
 		pr_err("PKCS#7 signature with non-detached data\n");
@@ -269,6 +263,41 @@
 	}
 
 error:
+	pr_devel("<==%s() = %d\n", __func__, ret);
+	return ret;
+}
+
+/**
+ * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data.
+ * @data: The data to be verified (NULL if expecting internal data).
+ * @len: Size of @data.
+ * @raw_pkcs7: The PKCS#7 message that is the signature.
+ * @pkcs7_len: The size of @raw_pkcs7.
+ * @trusted_keys: Trusted keys to use (NULL for builtin trusted keys only,
+ *					(void *)1UL for all trusted keys).
+ * @usage: The use to which the key is being put.
+ * @view_content: Callback to gain access to content.
+ * @ctx: Context for callback.
+ */
+int verify_pkcs7_signature(const void *data, size_t len,
+			   const void *raw_pkcs7, size_t pkcs7_len,
+			   struct key *trusted_keys,
+			   enum key_being_used_for usage,
+			   int (*view_content)(void *ctx,
+					       const void *data, size_t len,
+					       size_t asn1hdrlen),
+			   void *ctx)
+{
+	struct pkcs7_message *pkcs7;
+	int ret;
+
+	pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len);
+	if (IS_ERR(pkcs7))
+		return PTR_ERR(pkcs7);
+
+	ret = verify_pkcs7_message_sig(data, len, pkcs7, trusted_keys, usage,
+				       view_content, ctx);
+
 	pkcs7_free_message(pkcs7);
 	pr_devel("<==%s() = %d\n", __func__, ret);
 	return ret;
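
Splitting out verify_pkcs7_message_sig() lets a caller that already holds a parsed
PKCS#7 message (the appended-modsig verification this series enables is such a
caller) run the signature check without reparsing the raw blob. Usage sketch for a
detached signature over data, with NULL selecting the builtin trusted keyring as
documented above:

    struct pkcs7_message *pkcs7;
    int ret;

    pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len);
    if (IS_ERR(pkcs7))
            return PTR_ERR(pkcs7);

    /* ... reuse the parsed message, e.g. for pkcs7_get_digest() ... */

    ret = verify_pkcs7_message_sig(data, len, pkcs7, NULL,
                                   VERIFYING_MODULE_SIGNATURE,
                                   NULL, NULL);
    pkcs7_free_message(pkcs7);
    return ret;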
diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c
index 11bee67..ce49820 100644
--- a/crypto/asymmetric_keys/pkcs7_verify.c
+++ b/crypto/asymmetric_keys/pkcs7_verify.c
@@ -12,6 +12,7 @@
 #include <linux/err.h>
 #include <linux/asn1.h>
 #include <crypto/hash.h>
+#include <crypto/hash_info.h>
 #include <crypto/public_key.h>
 #include "pkcs7_parser.h"
 
@@ -29,6 +30,10 @@
 
 	kenter(",%u,%s", sinfo->index, sinfo->sig->hash_algo);
 
+	/* The digest was calculated already. */
+	if (sig->digest)
+		return 0;
+
 	if (!sinfo->sig->hash_algo)
 		return -ENOPKG;
 
@@ -117,6 +122,34 @@
 	return ret;
 }
 
+int pkcs7_get_digest(struct pkcs7_message *pkcs7, const u8 **buf, u32 *len,
+		     enum hash_algo *hash_algo)
+{
+	struct pkcs7_signed_info *sinfo = pkcs7->signed_infos;
+	int i, ret;
+
+	/*
+	 * This function doesn't support messages with more than one signature.
+	 */
+	if (sinfo == NULL || sinfo->next != NULL)
+		return -EBADMSG;
+
+	ret = pkcs7_digest(pkcs7, sinfo);
+	if (ret)
+		return ret;
+
+	*buf = sinfo->sig->digest;
+	*len = sinfo->sig->digest_size;
+
+	for (i = 0; i < HASH_ALGO__LAST; i++)
+		if (!strcmp(hash_algo_name[i], sinfo->sig->hash_algo)) {
+			*hash_algo = i;
+			break;
+		}
+
+	return 0;
+}
+
 /*
  * Find the key (X.509 certificate) to use to verify a PKCS#7 message.  PKCS#7
  * uses the issuer's name and the issuing certificate serial number for
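
pkcs7_get_digest() gives IMA access to the message digest of a modsig without
hashing the payload again; the new early return in pkcs7_digest() means a digest
computed here is simply reused by the later signature verification. Caller sketch:

    const u8 *digest;
    u32 digest_len;
    enum hash_algo algo;
    int ret;

    ret = pkcs7_get_digest(pkcs7, &digest, &digest_len, &algo);
    if (ret)
            return ret;     /* -EBADMSG unless exactly one signature */

    /* digest/digest_len point into the parsed message and stay valid
     * until pkcs7_free_message(pkcs7). */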
diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c
index 3b303fe..cc9dbce 100644
--- a/crypto/asymmetric_keys/verify_pefile.c
+++ b/crypto/asymmetric_keys/verify_pefile.c
@@ -96,7 +96,7 @@
 
 	if (!ddir->certs.virtual_address || !ddir->certs.size) {
 		pr_debug("Unsigned PE binary\n");
-		return -EKEYREJECTED;
+		return -ENODATA;
 	}
 
 	chkaddr(ctx->header_size, ddir->certs.virtual_address,
@@ -403,6 +403,8 @@
  *  (*) 0 if at least one signature chain intersects with the keys in the trust
  *	keyring, or:
  *
+ *  (*) -ENODATA if there is no signature present.
+ *
  *  (*) -ENOPKG if a suitable crypto module couldn't be found for a check on a
  *	chain.
  *
diff --git a/drivers/acpi/acpi_apd.c b/drivers/acpi/acpi_apd.c
index 7cd0c9a..71511ae 100644
--- a/drivers/acpi/acpi_apd.c
+++ b/drivers/acpi/acpi_apd.c
@@ -160,11 +160,17 @@
 	.setup = acpi_apd_setup,
 	.fixed_clk_rate = 250000000,
 };
+
 static const struct apd_device_desc thunderx2_i2c_desc = {
 	.setup = acpi_apd_setup,
 	.fixed_clk_rate = 125000000,
 };
 
+static const struct apd_device_desc nxp_i2c_desc = {
+	.setup = acpi_apd_setup,
+	.fixed_clk_rate = 350000000,
+};
+
 static const struct apd_device_desc hip08_spi_desc = {
 	.setup = acpi_apd_setup,
 	.fixed_clk_rate = 250000000,
@@ -238,6 +244,7 @@
 	{ "HISI02A1", APD_ADDR(hip07_i2c_desc) },
 	{ "HISI02A2", APD_ADDR(hip08_i2c_desc) },
 	{ "HISI0173", APD_ADDR(hip08_spi_desc) },
+	{ "NXP0001", APD_ADDR(nxp_i2c_desc) },
 #endif
 	{ }
 };
diff --git a/drivers/acpi/custom_method.c b/drivers/acpi/custom_method.c
index fd66a73..b097ef2 100644
--- a/drivers/acpi/custom_method.c
+++ b/drivers/acpi/custom_method.c
@@ -9,6 +9,7 @@
 #include <linux/uaccess.h>
 #include <linux/debugfs.h>
 #include <linux/acpi.h>
+#include <linux/security.h>
 
 #include "internal.h"
 
@@ -29,6 +30,11 @@
 
 	struct acpi_table_header table;
 	acpi_status status;
+	int ret;
+
+	ret = security_locked_down(LOCKDOWN_ACPI_TABLES);
+	if (ret)
+		return ret;
 
 	if (!(*ppos)) {
 		/* parse the table header to get the table length */
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 2f9d0d2..a2e844a 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -27,6 +27,7 @@
 #include <linux/list.h>
 #include <linux/jiffies.h>
 #include <linux/semaphore.h>
+#include <linux/security.h>
 
 #include <asm/io.h>
 #include <linux/uaccess.h>
@@ -182,8 +183,19 @@
 	acpi_physical_address pa;
 
 #ifdef CONFIG_KEXEC
-	if (acpi_rsdp)
+	/*
+	 * We may have been provided with an RSDP on the command line,
+	 * but if a malicious user has done so they may be pointing us
+	 * at modified ACPI tables that could alter kernel behaviour -
+	 * so, we check the lockdown status before making use of
+	 * it. If we trust it then also stash it in an architecture
+	 * specific location (if appropriate) so it can be carried
+	 * over further kexec()s.
+	 */
+	if (acpi_rsdp && !security_locked_down(LOCKDOWN_ACPI_TABLES)) {
+		acpi_arch_set_root_pointer(acpi_rsdp);
 		return acpi_rsdp;
+	}
 #endif
 	pa = acpi_arch_get_root_pointer();
 	if (pa)
diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index 314a187..d1e666e 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -15,7 +15,6 @@
 #include <linux/pm_runtime.h>
 #include <linux/pci.h>
 #include <linux/pci-acpi.h>
-#include <linux/pci-aspm.h>
 #include <linux/dmar.h>
 #include <linux/acpi.h>
 #include <linux/slab.h>
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index b323277..180ac43 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -20,6 +20,7 @@
 #include <linux/memblock.h>
 #include <linux/earlycpio.h>
 #include <linux/initrd.h>
+#include <linux/security.h>
 #include "internal.h"
 
 #ifdef CONFIG_ACPI_CUSTOM_DSDT
@@ -578,6 +579,11 @@
 	if (table_nr == 0)
 		return;
 
+	if (security_locked_down(LOCKDOWN_ACPI_TABLES)) {
+		pr_notice("kernel is locked down, ignoring table override\n");
+		return;
+	}
+
 	acpi_tables_addr =
 		memblock_find_in_range(0, ACPI_TABLE_UPGRADE_MAX_PHYS,
 				       all_tables_size, PAGE_SIZE);
diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c
index 9e9583a..e742780 100644
--- a/drivers/ata/libahci_platform.c
+++ b/drivers/ata/libahci_platform.c
@@ -497,6 +497,7 @@
 
 			if (of_property_read_u32(child, "reg", &port)) {
 				rc = -EINVAL;
+				of_node_put(child);
 				goto err_out;
 			}
 
@@ -514,14 +515,18 @@
 			if (port_dev) {
 				rc = ahci_platform_get_regulator(hpriv, port,
 								&port_dev->dev);
-				if (rc == -EPROBE_DEFER)
+				if (rc == -EPROBE_DEFER) {
+					of_node_put(child);
 					goto err_out;
+				}
 			}
 #endif
 
 			rc = ahci_platform_get_phy(hpriv, port, dev, child);
-			if (rc)
+			if (rc) {
+				of_node_put(child);
 				goto err_out;
+			}
 
 			enabled_ports++;
 		}
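
The three added of_node_put() calls fix reference leaks on the early-exit paths:
for_each_child_of_node() takes a reference on each child it hands out and drops it
only when advancing to the next iteration, so any break or goto out of the loop
body must drop the current child itself. The general shape of the idiom, with a
hypothetical per-port setup step:

    struct device_node *child;
    int rc;

    for_each_child_of_node(np, child) {
            rc = setup_port(child);         /* hypothetical helper */
            if (rc) {
                    of_node_put(child);     /* the loop won't drop it */
                    goto err_out;
            }
    }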
diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index 70b00ae..8af793f 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -1690,7 +1690,7 @@
 
 		if (RBRQ_HBUF_ERR(he_dev->rbrq_head)) {
 			hprintk("HBUF_ERR!  (cid 0x%x)\n", cid);
-				atomic_inc(&vcc->stats->rx_drop);
+			atomic_inc(&vcc->stats->rx_drop);
 			goto return_host_buffers;
 		}
 
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 20c39d1..6bea4f3 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -100,26 +100,9 @@
 }
 EXPORT_SYMBOL_GPL(memory_block_size_bytes);
 
-static unsigned long get_memory_block_size(void)
-{
-	unsigned long block_sz;
-
-	block_sz = memory_block_size_bytes();
-
-	/* Validate blk_sz is a power of 2 and not less than section size */
-	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
-		WARN_ON(1);
-		block_sz = MIN_MEMORY_BLOCK_SIZE;
-	}
-
-	return block_sz;
-}
-
 /*
- * use this as the physical section index that this memsection
- * uses.
+ * Show the first physical section index (number) of this memory block.
  */
-
 static ssize_t phys_index_show(struct device *dev,
 			       struct device_attribute *attr, char *buf)
 {
@@ -131,7 +114,10 @@
 }
 
 /*
- * Show whether the section of memory is likely to be hot-removable
+ * Show whether the memory block is likely to be offlineable (or is already
+ * offline). Once offline, the memory block could be removed. The return
+ * value does, however, not indicate that there is a way to remove the
+ * memory block.
  */
 static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
 			      char *buf)
@@ -455,12 +441,12 @@
 static DEVICE_ATTR_RO(removable);
 
 /*
- * Block size attribute stuff
+ * Show the memory block size (shared by all memory blocks).
  */
 static ssize_t block_size_bytes_show(struct device *dev,
 				     struct device_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%lx\n", get_memory_block_size());
+	return sprintf(buf, "%lx\n", memory_block_size_bytes());
 }
 
 static DEVICE_ATTR_RO(block_size_bytes);
@@ -670,10 +656,10 @@
 		return -ENOMEM;
 
 	mem->start_section_nr = block_id * sections_per_block;
-	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
 	mem->state = state;
 	start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	mem->phys_device = arch_get_memory_phys_device(start_pfn);
+	mem->nid = NUMA_NO_NODE;
 
 	ret = register_memory(mem);
 
@@ -810,19 +796,22 @@
 /*
  * Initialize the sysfs support for memory devices...
  */
-int __init memory_dev_init(void)
+void __init memory_dev_init(void)
 {
 	int ret;
 	int err;
 	unsigned long block_sz, nr;
 
+	/* Validate the configured memory block size */
+	block_sz = memory_block_size_bytes();
+	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
+		panic("Memory block size not suitable: 0x%lx\n", block_sz);
+	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
+
 	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
 	if (ret)
 		goto out;
 
-	block_sz = get_memory_block_size();
-	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
-
 	/*
 	 * Create entries for memory sections that were found
 	 * during boot and have been initialized
@@ -838,8 +827,7 @@
 
 out:
 	if (ret)
-		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
-	return ret;
+		panic("%s() failed: %d\n", __func__, ret);
 }
 
 /**
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 75b7e6f..296546f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -427,6 +427,8 @@
 		       "Node %d AnonHugePages:  %8lu kB\n"
 		       "Node %d ShmemHugePages: %8lu kB\n"
 		       "Node %d ShmemPmdMapped: %8lu kB\n"
+		       "Node %d FileHugePages: %8lu kB\n"
+		       "Node %d FilePmdMapped: %8lu kB\n"
 #endif
 			,
 		       nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
@@ -452,6 +454,10 @@
 		       nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
 				       HPAGE_PMD_NR),
 		       nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
+				       HPAGE_PMD_NR),
+		       nid, K(node_page_state(pgdat, NR_FILE_THPS) *
+				       HPAGE_PMD_NR),
+		       nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) *
 				       HPAGE_PMD_NR)
 #endif
 		       );
@@ -756,15 +762,13 @@
 static int register_mem_sect_under_node(struct memory_block *mem_blk,
 					 void *arg)
 {
+	unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE;
+	unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
+	unsigned long end_pfn = start_pfn + memory_block_pfns - 1;
 	int ret, nid = *(int *)arg;
-	unsigned long pfn, sect_start_pfn, sect_end_pfn;
+	unsigned long pfn;
 
-	mem_blk->nid = nid;
-
-	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
-	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
-	sect_end_pfn += PAGES_PER_SECTION - 1;
-	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
+	for (pfn = start_pfn; pfn <= end_pfn; pfn++) {
 		int page_nid;
 
 		/*
@@ -789,6 +793,13 @@
 			if (page_nid != nid)
 				continue;
 		}
+
+		/*
+		 * If this memory block spans multiple nodes, we only indicate
+		 * the last processed node.
+		 */
+		mem_blk->nid = nid;
+
 		ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
 					&mem_blk->dev.kobj,
 					kobject_name(&mem_blk->dev.kobj));
@@ -804,32 +815,18 @@
 }
 
 /*
- * Unregister memory block device under all nodes that it spans.
- * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes).
+ * Unregister a memory block device under the node it spans. Memory blocks
+ * with multiple nodes cannot be offlined and therefore also never be removed.
  */
 void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 {
-	unsigned long pfn, sect_start_pfn, sect_end_pfn;
-	static nodemask_t unlinked_nodes;
+	if (mem_blk->nid == NUMA_NO_NODE)
+		return;
 
-	nodes_clear(unlinked_nodes);
-	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
-	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
-	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
-		int nid;
-
-		nid = get_nid_for_pfn(pfn);
-		if (nid < 0)
-			continue;
-		if (!node_online(nid))
-			continue;
-		if (node_test_and_set(nid, unlinked_nodes))
-			continue;
-		sysfs_remove_link(&node_devices[nid]->dev.kobj,
-			 kobject_name(&mem_blk->dev.kobj));
-		sysfs_remove_link(&mem_blk->dev.kobj,
-			 kobject_name(&node_devices[nid]->dev.kobj));
-	}
+	sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
+			  kobject_name(&mem_blk->dev.kobj));
+	sysfs_remove_link(&mem_blk->dev.kobj,
+			  kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
 }
 
 int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c
index c589865..651bd02 100644
--- a/drivers/block/drbd/drbd_interval.c
+++ b/drivers/block/drbd/drbd_interval.c
@@ -13,33 +13,10 @@
 	return this->end;
 }
 
-/**
- * compute_subtree_last  -  compute end of @node
- *
- * The end of an interval is the highest (start + (size >> 9)) value of this
- * node and of its children.  Called for @node and its parents whenever the end
- * may have changed.
- */
-static inline sector_t
-compute_subtree_last(struct drbd_interval *node)
-{
-	sector_t max = node->sector + (node->size >> 9);
+#define NODE_END(node) ((node)->sector + ((node)->size >> 9))
 
-	if (node->rb.rb_left) {
-		sector_t left = interval_end(node->rb.rb_left);
-		if (left > max)
-			max = left;
-	}
-	if (node->rb.rb_right) {
-		sector_t right = interval_end(node->rb.rb_right);
-		if (right > max)
-			max = right;
-	}
-	return max;
-}
-
-RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb,
-		     sector_t, end, compute_subtree_last);
+RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks,
+			 struct drbd_interval, rb, sector_t, end, NODE_END);
 
 /**
  * drbd_insert_interval  -  insert a new interval into a tree
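
Side note on the helper used above: RB_DECLARE_CALLBACKS_MAX, new in
<linux/rbtree_augmented.h> this cycle, generates the propagate/copy/rotate
callbacks for the common "subtree maximum" augmentation from a single
per-node expression, which is what lets drbd drop its open-coded
compute_subtree_last(). A minimal sketch of wiring it up for a generic
interval type (names are illustrative, not drbd's):

	#include <linux/rbtree_augmented.h>

	struct itree_node {
		struct rb_node rb;
		unsigned long start, last;	/* interval endpoints */
		unsigned long subtree_last;	/* maintained by the callbacks */
	};

	#define ITREE_LAST(n) ((n)->last)

	/*
	 * Generates static itree_cb with propagate/copy/rotate callbacks
	 * that keep ->subtree_last equal to the max of ITREE_LAST() over
	 * the node and both of its subtrees.
	 */
	RB_DECLARE_CALLBACKS_MAX(static, itree_cb, struct itree_node, rb,
				 unsigned long, subtree_last, ITREE_LAST)

	/* insertion then uses: rb_insert_augmented(&n->rb, root, &itree_cb); */
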
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index a8e3815..ac07e8c 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -26,6 +26,7 @@
 #include <linux/ioctl.h>
 #include <linux/mutex.h>
 #include <linux/compiler.h>
+#include <linux/completion.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -71,14 +72,17 @@
 	int index;
 };
 
-#define NBD_TIMEDOUT			0
+#define NBD_RT_TIMEDOUT			0
+#define NBD_RT_DISCONNECT_REQUESTED	1
+#define NBD_RT_DISCONNECTED		2
+#define NBD_RT_HAS_PID_FILE		3
+#define NBD_RT_HAS_CONFIG_REF		4
+#define NBD_RT_BOUND			5
+#define NBD_RT_DESTROY_ON_DISCONNECT	6
+#define NBD_RT_DISCONNECT_ON_CLOSE	7
+
+#define NBD_DESTROY_ON_DISCONNECT	0
 #define NBD_DISCONNECT_REQUESTED	1
-#define NBD_DISCONNECTED		2
-#define NBD_HAS_PID_FILE		3
-#define NBD_HAS_CONFIG_REF		4
-#define NBD_BOUND			5
-#define NBD_DESTROY_ON_DISCONNECT	6
-#define NBD_DISCONNECT_ON_CLOSE 	7
 
 struct nbd_config {
 	u32 flags;
@@ -113,6 +117,9 @@
 	struct list_head list;
 	struct task_struct *task_recv;
 	struct task_struct *task_setup;
+
+	struct completion *destroy_complete;
+	unsigned long flags;
 };
 
 #define NBD_CMD_REQUEUED	1
@@ -223,6 +230,16 @@
 		disk->private_data = NULL;
 		put_disk(disk);
 	}
+
+	/*
+	 * Place this at the end, just before the nbd is freed, to
+	 * make sure that the disk and the related kobject are
+	 * totally removed, in order to avoid duplicate creation
+	 * of the same one.
+	 */
+	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete)
+		complete(nbd->destroy_complete);
+
 	kfree(nbd);
 }
 
@@ -238,8 +255,8 @@
 
 static int nbd_disconnected(struct nbd_config *config)
 {
-	return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
-		test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
+	return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) ||
+		test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
 }
 
 static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
@@ -257,9 +274,9 @@
 	if (!nsock->dead) {
 		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
 		if (atomic_dec_return(&nbd->config->live_connections) == 0) {
-			if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED,
+			if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED,
 					       &nbd->config->runtime_flags)) {
-				set_bit(NBD_DISCONNECTED,
+				set_bit(NBD_RT_DISCONNECTED,
 					&nbd->config->runtime_flags);
 				dev_info(nbd_to_dev(nbd),
 					"Disconnected due to user request.\n");
@@ -333,7 +350,7 @@
 
 	if (config->num_connections == 0)
 		return;
-	if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
+	if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 		return;
 
 	for (i = 0; i < config->num_connections; i++) {
@@ -427,7 +444,7 @@
 	}
 
 	dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
-	set_bit(NBD_TIMEDOUT, &config->runtime_flags);
+	set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
 	cmd->status = BLK_STS_IOERR;
 	mutex_unlock(&cmd->lock);
 	sock_shutdown(nbd);
@@ -795,7 +812,7 @@
 	struct nbd_sock *nsock = config->socks[index];
 	int fallback = nsock->fallback_index;
 
-	if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
+	if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 		return new_index;
 
 	if (config->num_connections <= 1) {
@@ -836,7 +853,7 @@
 	struct nbd_config *config = nbd->config;
 	if (!config->dead_conn_timeout)
 		return 0;
-	if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
+	if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 		return 0;
 	return wait_event_timeout(config->conn_wait,
 				  atomic_read(&config->live_connections) > 0,
@@ -969,12 +986,12 @@
 		return err;
 
 	if (!netlink && !nbd->task_setup &&
-	    !test_bit(NBD_BOUND, &config->runtime_flags))
+	    !test_bit(NBD_RT_BOUND, &config->runtime_flags))
 		nbd->task_setup = current;
 
 	if (!netlink &&
 	    (nbd->task_setup != current ||
-	     test_bit(NBD_BOUND, &config->runtime_flags))) {
+	     test_bit(NBD_RT_BOUND, &config->runtime_flags))) {
 		dev_err(disk_to_dev(nbd->disk),
 			"Device being setup by another task");
 		sockfd_put(sock);
@@ -1053,7 +1070,7 @@
 		mutex_unlock(&nsock->tx_lock);
 		sockfd_put(old);
 
-		clear_bit(NBD_DISCONNECTED, &config->runtime_flags);
+		clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
 
 		/* We take the tx_mutex in an error path in the recv_work, so we
 		 * need to queue_work outside of the tx_mutex.
@@ -1124,7 +1141,8 @@
 	struct nbd_config *config = nbd->config;
 
 	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
-	set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
+	set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
+	set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags);
 	send_disconnects(nbd);
 	return 0;
 }
@@ -1143,7 +1161,7 @@
 		struct nbd_config *config = nbd->config;
 		nbd_dev_dbg_close(nbd);
 		nbd_size_clear(nbd);
-		if (test_and_clear_bit(NBD_HAS_PID_FILE,
+		if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
 				       &config->runtime_flags))
 			device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
 		nbd->task_recv = NULL;
@@ -1209,7 +1227,7 @@
 		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
 		return error;
 	}
-	set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
+	set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags);
 
 	nbd_dev_dbg_init(nbd);
 	for (i = 0; i < num_connections; i++) {
@@ -1256,9 +1274,9 @@
 	mutex_lock(&nbd->config_lock);
 	nbd_bdev_reset(bdev);
 	/* user requested, ignore socket errors */
-	if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
+	if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags))
 		ret = 0;
-	if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
+	if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags))
 		ret = -ETIMEDOUT;
 	return ret;
 }
@@ -1269,7 +1287,7 @@
 	sock_shutdown(nbd);
 	__invalidate_device(bdev, true);
 	nbd_bdev_reset(bdev);
-	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
+	if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
 			       &nbd->config->runtime_flags))
 		nbd_config_put(nbd);
 }
@@ -1364,7 +1382,7 @@
 	/* Don't allow ioctl operations on a nbd device that was created with
 	 * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
 	 */
-	if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
+	if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
 	    (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
 		error = __nbd_ioctl(bdev, nbd, cmd, arg);
 	else
@@ -1435,7 +1453,7 @@
 	struct nbd_device *nbd = disk->private_data;
 	struct block_device *bdev = bdget_disk(disk, 0);
 
-	if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
+	if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
 			bdev->bd_openers == 0)
 		nbd_disconnect_and_put(nbd);
 
@@ -1636,6 +1654,7 @@
 	nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
 		BLK_MQ_F_BLOCKING;
 	nbd->tag_set.driver_data = nbd;
+	nbd->destroy_complete = NULL;
 
 	err = blk_mq_alloc_tag_set(&nbd->tag_set);
 	if (err)
@@ -1750,6 +1769,7 @@
 
 static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
 {
+	DECLARE_COMPLETION_ONSTACK(destroy_complete);
 	struct nbd_device *nbd = NULL;
 	struct nbd_config *config;
 	int index = -1;
@@ -1801,6 +1821,17 @@
 		mutex_unlock(&nbd_index_mutex);
 		return -EINVAL;
 	}
+
+	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
+	    test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) {
+		nbd->destroy_complete = &destroy_complete;
+		mutex_unlock(&nbd_index_mutex);
+
+		/* Wait until the nbd device is totally destroyed */
+		wait_for_completion(&destroy_complete);
+		goto again;
+	}
+
 	if (!refcount_inc_not_zero(&nbd->refs)) {
 		mutex_unlock(&nbd_index_mutex);
 		if (index == -1)
@@ -1833,7 +1864,7 @@
 		return -ENOMEM;
 	}
 	refcount_set(&nbd->config_refs, 1);
-	set_bit(NBD_BOUND, &config->runtime_flags);
+	set_bit(NBD_RT_BOUND, &config->runtime_flags);
 
 	ret = nbd_genl_size_set(info, nbd);
 	if (ret)
@@ -1853,12 +1884,15 @@
 	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
 		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
 		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
-			set_bit(NBD_DESTROY_ON_DISCONNECT,
+			set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
 				&config->runtime_flags);
+			set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
 			put_dev = true;
+		} else {
+			clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
 		}
 		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
-			set_bit(NBD_DISCONNECT_ON_CLOSE,
+			set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
 				&config->runtime_flags);
 		}
 	}
@@ -1897,7 +1931,7 @@
 out:
 	mutex_unlock(&nbd->config_lock);
 	if (!ret) {
-		set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags);
+		set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
 		refcount_inc(&nbd->config_refs);
 		nbd_connect_reply(info, nbd->index);
 	}
@@ -1919,7 +1953,7 @@
 	 * queue.
 	 */
 	flush_workqueue(nbd->recv_workq);
-	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
+	if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
 			       &nbd->config->runtime_flags))
 		nbd_config_put(nbd);
 }
@@ -2003,7 +2037,7 @@
 
 	mutex_lock(&nbd->config_lock);
 	config = nbd->config;
-	if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
+	if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
 	    !nbd->task_recv) {
 		dev_err(nbd_to_dev(nbd),
 			"not configured, cannot reconfigure\n");
@@ -2026,20 +2060,22 @@
 	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
 		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
 		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
-			if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
+			if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
 					      &config->runtime_flags))
 				put_dev = true;
+			set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
 		} else {
-			if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
+			if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT,
 					       &config->runtime_flags))
 				refcount_inc(&nbd->refs);
+			clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
 		}
 
 		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
-			set_bit(NBD_DISCONNECT_ON_CLOSE,
+			set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
 					&config->runtime_flags);
 		} else {
-			clear_bit(NBD_DISCONNECT_ON_CLOSE,
+			clear_bit(NBD_RT_DISCONNECT_ON_CLOSE,
 					&config->runtime_flags);
 		}
 	}
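
The flag split above is the core of the nbd change: the NBD_RT_* bits live
in config->runtime_flags and go away with the config, while the two bits in
nbd->flags outlive it, so a racing NBD_CMD_CONNECT can tell that a
DESTROY_ON_DISCONNECT device with the same index is still mid-teardown.
Condensed from the hunks above, the handshake between creator and destroyer
looks like this:

	/* nbd_genl_connect(), sketched: wait out a dying device */
	DECLARE_COMPLETION_ONSTACK(destroy_complete);

	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
	    test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) {
		nbd->destroy_complete = &destroy_complete;
		mutex_unlock(&nbd_index_mutex);

		/* completed by nbd_dev_remove() once disk and kobject are gone */
		wait_for_completion(&destroy_complete);
		goto again;	/* retry the index lookup from scratch */
	}
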
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 0240601..7645700 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2594,7 +2594,6 @@
 	if (ret)
 		return ret;
 	if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
-		WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
 		blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
 		return -EINVAL;
 	}
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c8fb886..7c4350c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1754,8 +1754,6 @@
 	mutex_init(&img_request->state_mutex);
 	kref_init(&img_request->kref);
 
-	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
-	     obj_op_name(op_type), img_request);
 	return img_request;
 }
 
@@ -2944,6 +2942,9 @@
 	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
 	child_img_req->obj_request = obj_req;
 
+	dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
+	     obj_req);
+
 	if (!rbd_img_is_write(img_req)) {
 		switch (img_req->data_type) {
 		case OBJ_REQUEST_BIO:
@@ -4877,6 +4878,9 @@
 	img_request->rq = rq;
 	snapc = NULL; /* img_request consumes a ref */
 
+	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
+	     img_request, obj_op_name(op_type), offset, length);
+
 	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
 		result = rbd_img_fill_nodata(img_request, offset, length);
 	else
@@ -5669,17 +5673,20 @@
 
 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
 {
+	size_t size;
 	void *reply_buf;
 	int ret;
 	void *p;
 
-	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
+	/* Response will be an encoded string, which includes a length */
+	size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
+	reply_buf = kzalloc(size, GFP_KERNEL);
 	if (!reply_buf)
 		return -ENOMEM;
 
 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
 				  &rbd_dev->header_oloc, "get_object_prefix",
-				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
+				  NULL, 0, reply_buf, size);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 	if (ret < 0)
 		goto out;
@@ -6696,7 +6703,6 @@
 	dout("rbd id object name is %s\n", oid.name);
 
 	/* Response will be an encoded string, which includes a length */
-
 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
 	response = kzalloc(size, GFP_NOIO);
 	if (!response) {
@@ -6708,7 +6714,7 @@
 
 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
 				  "get_id", NULL, 0,
-				  response, RBD_IMAGE_ID_LEN_MAX);
+				  response, size);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 	if (ret == -ENOENT) {
 		image_id = kstrdup("", GFP_KERNEL);
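
Both rbd hunks fix the same off-by-a-header sizing: a CEPH-encoded string
is a little-endian 32-bit length followed by the payload, so the reply
buffer needs room for both or a maximum-length string gets truncated. A
sketch of the decode side, using the helper rbd relies on elsewhere:

	/* Sketch: decode a length-prefixed string from reply_buf. */
	void *p = reply_buf;
	void *end = reply_buf + ret;	/* ret = bytes actually returned */
	size_t len;
	char *str;

	str = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(str))
		return PTR_ERR(str);	/* -ERANGE if the buffer was short */
	/* ... use str, then kfree(str) ... */
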
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 9eb564c..43dd089 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -29,8 +29,8 @@
 #include <linux/export.h>
 #include <linux/io.h>
 #include <linux/uio.h>
-
 #include <linux/uaccess.h>
+#include <linux/security.h>
 
 #ifdef CONFIG_IA64
 # include <linux/efi.h>
@@ -807,7 +807,10 @@
 
 static int open_port(struct inode *inode, struct file *filp)
 {
-	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	return security_locked_down(LOCKDOWN_DEV_MEM);
 }
 
 #define zero_lseek	null_lseek
diff --git a/drivers/char/random.c b/drivers/char/random.c
index d3beed0..c2f7de9 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1732,6 +1732,56 @@
 }
 EXPORT_SYMBOL(get_random_bytes);
 
+
+/*
+ * Each time the timer fires, we expect that we got an unpredictable
+ * jump in the cycle counter. Even if the timer is running on another
+ * CPU, the timer activity will be touching the stack of the CPU that is
+ * generating entropy.
+ *
+ * Note that we don't re-arm the timer in the timer itself - we are
+ * happy to be scheduled away, since that just makes the load more
+ * complex, but we do not want the timer to keep ticking unless the
+ * entropy loop is running.
+ *
+ * So the re-arming always happens in the entropy loop itself.
+ */
+static void entropy_timer(struct timer_list *t)
+{
+	credit_entropy_bits(&input_pool, 1);
+}
+
+/*
+ * If we have an actual cycle counter, see if we can
+ * generate enough entropy with timing noise
+ */
+static void try_to_generate_entropy(void)
+{
+	struct {
+		unsigned long now;
+		struct timer_list timer;
+	} stack;
+
+	stack.now = random_get_entropy();
+
+	/* Slow counter - or none. Don't even bother */
+	if (stack.now == random_get_entropy())
+		return;
+
+	timer_setup_on_stack(&stack.timer, entropy_timer, 0);
+	while (!crng_ready()) {
+		if (!timer_pending(&stack.timer))
+			mod_timer(&stack.timer, jiffies+1);
+		mix_pool_bytes(&input_pool, &stack.now, sizeof(stack.now));
+		schedule();
+		stack.now = random_get_entropy();
+	}
+
+	del_timer_sync(&stack.timer);
+	destroy_timer_on_stack(&stack.timer);
+	mix_pool_bytes(&input_pool, &stack.now, sizeof(stack.now));
+}
+
 /*
  * Wait for the urandom pool to be seeded and thus guaranteed to supply
  * cryptographically secure random numbers. This applies to: the /dev/urandom
@@ -1746,7 +1796,17 @@
 {
 	if (likely(crng_ready()))
 		return 0;
-	return wait_event_interruptible(crng_init_wait, crng_ready());
+
+	do {
+		int ret;
+		ret = wait_event_interruptible_timeout(crng_init_wait, crng_ready(), HZ);
+		if (ret)
+			return ret > 0 ? 0 : ret;
+
+		try_to_generate_entropy();
+	} while (!crng_ready());
+
+	return 0;
 }
 EXPORT_SYMBOL(wait_for_random_bytes);
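
The behavioral change: on machines that never accumulate enough interrupt
entropy, wait_for_random_bytes() now wakes up once a second and actively
mixes cycle-counter jitter until the CRNG is ready, instead of blocking
forever. The contract seen by in-kernel callers is unchanged; a typical
consumer still looks like this (sketch):

	u8 key[32];
	int err;

	err = wait_for_random_bytes();	/* may now self-seed via timer jitter */
	if (err)
		return err;		/* interrupted by a signal */
	get_random_bytes(key, sizeof(key));
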
 
diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c
index 1b4f95c..d7a3888 100644
--- a/drivers/char/tpm/tpm-interface.c
+++ b/drivers/char/tpm/tpm-interface.c
@@ -320,18 +320,22 @@
 	if (!chip)
 		return -ENODEV;
 
-	for (i = 0; i < chip->nr_allocated_banks; i++)
-		if (digests[i].alg_id != chip->allocated_banks[i].alg_id)
-			return -EINVAL;
+	for (i = 0; i < chip->nr_allocated_banks; i++) {
+		if (digests[i].alg_id != chip->allocated_banks[i].alg_id) {
+			rc = -EINVAL;
+			goto out;
+		}
+	}
 
 	if (chip->flags & TPM_CHIP_FLAG_TPM2) {
 		rc = tpm2_pcr_extend(chip, pcr_idx, digests);
-		tpm_put_ops(chip);
-		return rc;
+		goto out;
 	}
 
 	rc = tpm1_pcr_extend(chip, pcr_idx, digests[0].digest,
 			     "attempting extend a PCR value");
+
+out:
 	tpm_put_ops(chip);
 	return rc;
 }
@@ -354,14 +358,9 @@
 	if (!chip)
 		return -ENODEV;
 
-	rc = tpm_buf_init(&buf, 0, 0);
-	if (rc)
-		goto out;
-
-	memcpy(buf.data, cmd, buflen);
+	buf.data = cmd;
+	rc = tpm_transmit_cmd(chip, &buf, 0, "attempting to send a command");
-	tpm_buf_destroy(&buf);
-out:
+
 	tpm_put_ops(chip);
 	return rc;
 }
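
tpm_send() can drop the allocate-and-copy because the caller's command
buffer already starts with the standard TPM header, and tpm_transmit_cmd()
reads the total length out of that header; pointing buf.data at cmd is
enough. For reference, the layout this relies on (a sketch matching
struct tpm_header in drivers/char/tpm/tpm.h):

	struct tpm_header {
		__be16 tag;
		__be32 length;	/* total command size, header included */
		__be32 ordinal;
	} __packed;

	/* e.g. the transport derives the size as: */
	u32 len = be32_to_cpu(((struct tpm_header *)cmd)->length);
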
diff --git a/drivers/char/xillybus/xillybus_pcie.c b/drivers/char/xillybus/xillybus_pcie.c
index 02c1595..18b0c39 100644
--- a/drivers/char/xillybus/xillybus_pcie.c
+++ b/drivers/char/xillybus/xillybus_pcie.c
@@ -9,7 +9,6 @@
 
 #include <linux/module.h>
 #include <linux/pci.h>
-#include <linux/pci-aspm.h>
 #include <linux/slab.h>
 #include "xillybus.h"
 
diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
index c70cb5f..0891ab8 100644
--- a/drivers/crypto/chelsio/chtls/chtls_io.c
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -1078,7 +1078,7 @@
 			bool merge;
 
 			if (page)
-				pg_size <<= compound_order(page);
+				pg_size = page_size(page);
 			if (off < pg_size &&
 			    skb_can_coalesce(skb, i, page, off)) {
 				merge = 1;
@@ -1105,8 +1105,7 @@
 							   __GFP_NORETRY,
 							   order);
 					if (page)
-						pg_size <<=
-							compound_order(page);
+						pg_size <<= order;
 				}
 				if (!page) {
 					page = alloc_page(gfp);
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 8f1ab04..8d3e778 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -30,6 +30,7 @@
 #include <linux/acpi.h>
 #include <linux/ucs2_string.h>
 #include <linux/memblock.h>
+#include <linux/security.h>
 
 #include <asm/early_ioremap.h>
 
@@ -221,6 +222,11 @@
 static char efivar_ssdt[EFIVAR_SSDT_NAME_MAX] __initdata;
 static int __init efivar_ssdt_setup(char *str)
 {
+	int ret = security_locked_down(LOCKDOWN_ACPI_TABLES);
+
+	if (ret)
+		return ret;
+
 	if (strlen(str) < sizeof(efivar_ssdt))
 		memcpy(efivar_ssdt, str, strlen(str));
 	else
diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c
index 869d47f..6c06876 100644
--- a/drivers/gpio/gpio-mvebu.c
+++ b/drivers/gpio/gpio-mvebu.c
@@ -694,7 +694,7 @@
 }
 
 static int mvebu_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			   struct pwm_state *state)
+			   const struct pwm_state *state)
 {
 	struct mvebu_pwm *mvpwm = to_mvebu_pwm(chip);
 	struct mvebu_gpio_chip *mvchip = mvpwm->mvchip;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 42b936b..6d021ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1103,7 +1103,7 @@
 		alloc_flags = 0;
 		if (!offset || !*offset)
 			return -EINVAL;
-		user_addr = *offset;
+		user_addr = untagged_addr(*offset);
 	} else if (flags & (ALLOC_MEM_FLAGS_DOORBELL |
 			ALLOC_MEM_FLAGS_MMIO_REMAP)) {
 		domain = AMDGPU_GEM_DOMAIN_GTT;
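
This hunk and the matching one in amdgpu_gem.c below run user-supplied
addresses through untagged_addr() before any validation. With the arm64
tagged-address ABI, the top byte of a userspace pointer may carry a tag
that has to be stripped before alignment checks or find_vma()-style
lookups; on architectures without pointer tags the macro is a no-op. The
pattern, sketched for an ioctl taking a user address:

	unsigned long addr = untagged_addr(args->addr);

	if (offset_in_page(addr | args->size))
		return -EINVAL;	/* both must be page aligned */
	/* addr is now safe for get_user_pages()-style helpers */
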
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
index 61bd103..5803fcb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
@@ -948,6 +948,7 @@
 	case AMD_IP_BLOCK_TYPE_UVD:
 	case AMD_IP_BLOCK_TYPE_VCN:
 	case AMD_IP_BLOCK_TYPE_VCE:
+	case AMD_IP_BLOCK_TYPE_SDMA:
 		if (swsmu)
 			ret = smu_dpm_set_power_gate(&adev->smu, block_type, gate);
 		else
@@ -956,7 +957,6 @@
 		break;
 	case AMD_IP_BLOCK_TYPE_GMC:
 	case AMD_IP_BLOCK_TYPE_ACP:
-	case AMD_IP_BLOCK_TYPE_SDMA:
 		ret = ((adev)->powerplay.pp_funcs->set_powergating_by_smu(
 				(adev)->powerplay.pp_handle, block_type, gate));
 		break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index bdf849d..264677a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1012,11 +1012,16 @@
 	{0x1002, 0x731B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
 	{0x1002, 0x731F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI10},
 	/* Navi14 */
-	{0x1002, 0x7340, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14},
+	{0x1002, 0x7340, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14|AMD_EXP_HW_SUPPORT},
+	{0x1002, 0x7341, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14|AMD_EXP_HW_SUPPORT},
+	{0x1002, 0x7347, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI14|AMD_EXP_HW_SUPPORT},
 
 	/* Renoir */
 	{0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU|AMD_EXP_HW_SUPPORT},
 
+	/* Navi12 */
+	{0x1002, 0x7360, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12|AMD_EXP_HW_SUPPORT},
+
 	{0, 0, 0}
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index b174bd5..8ceb449 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -291,6 +291,8 @@
 	uint32_t handle;
 	int r;
 
+	args->addr = untagged_addr(args->addr);
+
 	if (offset_in_page(args->addr | args->size))
 		return -EINVAL;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 7850084..6065583 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -143,7 +143,8 @@
 	/* ring tests don't use a job */
 	if (job) {
 		vm = job->vm;
-		fence_ctx = job->base.s_fence->scheduled.context;
+		fence_ctx = job->base.s_fence ?
+			job->base.s_fence->scheduled.context : 0;
 	} else {
 		vm = NULL;
 		fence_ctx = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 0e2ec60..f614752 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -677,6 +677,9 @@
 		if (sh_num == AMDGPU_INFO_MMR_SH_INDEX_MASK)
 			sh_num = 0xffffffff;
 
+		if (info->read_mmr_reg.count > 128)
+			return -EINVAL;
+
 		regs = kmalloc_array(info->read_mmr_reg.count, sizeof(*regs), GFP_KERNEL);
 		if (!regs)
 			return -ENOMEM;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index db28823..638c821 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -70,6 +70,11 @@
 MODULE_FIRMWARE("amdgpu/navi10_mec2.bin");
 MODULE_FIRMWARE("amdgpu/navi10_rlc.bin");
 
+MODULE_FIRMWARE("amdgpu/navi14_ce_wks.bin");
+MODULE_FIRMWARE("amdgpu/navi14_pfp_wks.bin");
+MODULE_FIRMWARE("amdgpu/navi14_me_wks.bin");
+MODULE_FIRMWARE("amdgpu/navi14_mec_wks.bin");
+MODULE_FIRMWARE("amdgpu/navi14_mec2_wks.bin");
 MODULE_FIRMWARE("amdgpu/navi14_ce.bin");
 MODULE_FIRMWARE("amdgpu/navi14_pfp.bin");
 MODULE_FIRMWARE("amdgpu/navi14_me.bin");
@@ -594,7 +599,8 @@
 static int gfx_v10_0_init_microcode(struct amdgpu_device *adev)
 {
 	const char *chip_name;
-	char fw_name[30];
+	char fw_name[40];
+	char wks[10];
 	int err;
 	struct amdgpu_firmware_info *info = NULL;
 	const struct common_firmware_header *header = NULL;
@@ -607,12 +613,16 @@
 
 	DRM_DEBUG("\n");
 
+	memset(wks, 0, sizeof(wks));
 	switch (adev->asic_type) {
 	case CHIP_NAVI10:
 		chip_name = "navi10";
 		break;
 	case CHIP_NAVI14:
 		chip_name = "navi14";
+		if (!(adev->pdev->device == 0x7340 &&
+		      adev->pdev->revision != 0x00))
+			snprintf(wks, sizeof(wks), "_wks");
 		break;
 	case CHIP_NAVI12:
 		chip_name = "navi12";
@@ -621,7 +631,7 @@
 		BUG();
 	}
 
-	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_pfp.bin", chip_name);
+	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_pfp%s.bin", chip_name, wks);
 	err = request_firmware(&adev->gfx.pfp_fw, fw_name, adev->dev);
 	if (err)
 		goto out;
@@ -632,7 +642,7 @@
 	adev->gfx.pfp_fw_version = le32_to_cpu(cp_hdr->header.ucode_version);
 	adev->gfx.pfp_feature_version = le32_to_cpu(cp_hdr->ucode_feature_version);
 
-	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_me.bin", chip_name);
+	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_me%s.bin", chip_name, wks);
 	err = request_firmware(&adev->gfx.me_fw, fw_name, adev->dev);
 	if (err)
 		goto out;
@@ -643,7 +653,7 @@
 	adev->gfx.me_fw_version = le32_to_cpu(cp_hdr->header.ucode_version);
 	adev->gfx.me_feature_version = le32_to_cpu(cp_hdr->ucode_feature_version);
 
-	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_ce.bin", chip_name);
+	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_ce%s.bin", chip_name, wks);
 	err = request_firmware(&adev->gfx.ce_fw, fw_name, adev->dev);
 	if (err)
 		goto out;
@@ -708,7 +718,7 @@
 	if (adev->gfx.rlc.is_rlc_v2_1)
 		gfx_v10_0_init_rlc_ext_microcode(adev);
 
-	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec.bin", chip_name);
+	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec%s.bin", chip_name, wks);
 	err = request_firmware(&adev->gfx.mec_fw, fw_name, adev->dev);
 	if (err)
 		goto out;
@@ -719,7 +729,7 @@
 	adev->gfx.mec_fw_version = le32_to_cpu(cp_hdr->header.ucode_version);
 	adev->gfx.mec_feature_version = le32_to_cpu(cp_hdr->ucode_feature_version);
 
-	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec2.bin", chip_name);
+	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec2%s.bin", chip_name, wks);
 	err = request_firmware(&adev->gfx.mec2_fw, fw_name, adev->dev);
 	if (!err) {
 		err = amdgpu_ucode_validate(adev->gfx.mec2_fw);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 83d45f9..dcadc73 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -1650,7 +1650,6 @@
 
 	switch (adev->asic_type) {
 	case CHIP_RAVEN:
-	case CHIP_RENOIR:
 		gfx_v9_0_init_lbpw(adev);
 		break;
 	case CHIP_VEGA20:
@@ -3026,7 +3025,6 @@
 
 	switch (adev->asic_type) {
 	case CHIP_RAVEN:
-	case CHIP_RENOIR:
 		if (amdgpu_lbpw == 0)
 			gfx_v9_0_enable_lbpw(adev, false);
 		else
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index ff18b3a..78452cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1889,8 +1889,9 @@
 	int r;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-	if (adev->asic_type == CHIP_RAVEN && adev->powerplay.pp_funcs &&
-			adev->powerplay.pp_funcs->set_powergating_by_smu)
+	if ((adev->asic_type == CHIP_RAVEN && adev->powerplay.pp_funcs &&
+			adev->powerplay.pp_funcs->set_powergating_by_smu) ||
+			adev->asic_type == CHIP_RENOIR)
 		amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_SDMA, false);
 
 	if (!amdgpu_sriov_vf(adev))
@@ -1917,8 +1918,9 @@
 	sdma_v4_0_ctx_switch_enable(adev, false);
 	sdma_v4_0_enable(adev, false);
 
-	if (adev->asic_type == CHIP_RAVEN && adev->powerplay.pp_funcs
-			&& adev->powerplay.pp_funcs->set_powergating_by_smu)
+	if ((adev->asic_type == CHIP_RAVEN && adev->powerplay.pp_funcs
+			&& adev->powerplay.pp_funcs->set_powergating_by_smu) ||
+			adev->asic_type == CHIP_RENOIR)
 		amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_SDMA, true);
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c b/drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c
index 4a59510..c44723c 100644
--- a/drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c
+++ b/drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c
@@ -493,7 +493,15 @@
 	}
 
 	/* Restore clock gating */
-	smu_v11_0_i2c_set_clock_gating(control, true);
+
+	/*
+	 * TODO: Re-enabling clock gating seems to break subsequent SMU
+	 *      operation on the I2C bus. The SMU likely does not disable
+	 *      clock gating the way we do here before working with the
+	 *      bus, so for now don't restore it; follow up with the SMU
+	 *      team to see if they have this issue and can fix it.
+	 */
+	/* smu_v11_0_i2c_set_clock_gating(control, true); */
 
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index a8cf82d..901fe35 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -694,10 +694,10 @@
 	0x003f8000, 0x8f6f896f,
 	0x88776f77, 0x8a6eff6e,
 	0x023f8000, 0xb9eef807,
-	0xb970f812, 0xb971f813,
-	0x8ff08870, 0xf4051bb8,
+	0xb97af812, 0xb97bf813,
+	0x8ffa887a, 0xf4051bbd,
 	0xfa000000, 0xbf8cc07f,
-	0xf4051c38, 0xfa000008,
+	0xf4051ebd, 0xfa000008,
 	0xbf8cc07f, 0x87ee6e6e,
 	0xbf840001, 0xbe80206e,
 	0xb971f803, 0x8771ff71,
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
index 3598621..cdaa523 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -187,12 +187,12 @@
 	// Read second-level TBA/TMA from first-level TMA and jump if available.
 	// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
 	// ttmp12 holds SQ_WAVE_STATUS
-	s_getreg_b32	ttmp4, hwreg(HW_REG_SHADER_TMA_LO)
-	s_getreg_b32	ttmp5, hwreg(HW_REG_SHADER_TMA_HI)
-	s_lshl_b64	[ttmp4, ttmp5], [ttmp4, ttmp5], 0x8
-	s_load_dwordx2	[ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1		// second-level TBA
+	s_getreg_b32	ttmp14, hwreg(HW_REG_SHADER_TMA_LO)
+	s_getreg_b32	ttmp15, hwreg(HW_REG_SHADER_TMA_HI)
+	s_lshl_b64	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
+	s_load_dwordx2	[ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1		// second-level TBA
 	s_waitcnt	lgkmcnt(0)
-	s_load_dwordx2	[ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1		// second-level TMA
+	s_load_dwordx2	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1		// second-level TMA
 	s_waitcnt	lgkmcnt(0)
 	s_and_b64	[ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
 	s_cbranch_scc0	L_NO_NEXT_TRAP						// second-level trap handler has not been set
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index e1b09bb..8cab6da 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -2113,6 +2113,7 @@
 }
 
 static const struct backlight_ops amdgpu_dm_backlight_ops = {
+	.options = BL_CORE_SUSPENDRESUME,
 	.get_brightness = amdgpu_dm_backlight_get_brightness,
 	.update_status	= amdgpu_dm_backlight_update_status,
 };
@@ -2384,6 +2385,8 @@
 
 	if (adev->asic_type != CHIP_CARRIZO && adev->asic_type != CHIP_STONEY)
 		dm->dc->debug.disable_stutter = amdgpu_pp_feature_mask & PP_STUTTER_MODE ? false : true;
+	if (adev->asic_type == CHIP_RENOIR)
+		dm->dc->debug.disable_stutter = true;
 
 	return 0;
 fail:
@@ -5770,8 +5773,7 @@
 		 * change FB pitch, DCC state, rotation or mirroring.
 		 */
 		bundle->flip_addrs[planes_count].flip_immediate =
-			(crtc->state->pageflip_flags &
-			 DRM_MODE_PAGE_FLIP_ASYNC) != 0 &&
+			crtc->state->async_flip &&
 			acrtc_state->update_type == UPDATE_TYPE_FAST;
 
 		timestamp_ns = ktime_get_ns();
@@ -6348,7 +6350,7 @@
 	amdgpu_dm_enable_crtc_interrupts(dev, state, true);
 
 	for_each_new_crtc_in_state(state, crtc, new_crtc_state, j)
-		if (new_crtc_state->pageflip_flags & DRM_MODE_PAGE_FLIP_ASYNC)
+		if (new_crtc_state->async_flip)
 			wait_for_vblank = false;
 
 	/* update planes when needed per crtc*/
diff --git a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c
index 383f4f8..9b2cb57 100644
--- a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c
+++ b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c
@@ -708,6 +708,10 @@
 
 unsigned int get_highest_allowed_voltage_level(uint32_t hw_internal_rev)
 {
+	/* for dali, the highest voltage level we want is 0 */
+	if (ASICREV_IS_DALI(hw_internal_rev))
+		return 0;
+
 	/* we are ok with all levels */
 	return 4;
 }
diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c
index 5cc3acc..b1e657e1 100644
--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c
+++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dce110/dce110_clk_mgr.c
@@ -98,11 +98,14 @@
 		struct dc_stream_state *stream = context->streams[j];
 		uint32_t vertical_blank_in_pixels = 0;
 		uint32_t vertical_blank_time = 0;
+		uint32_t vertical_total_min = stream->timing.v_total;
+		struct dc_crtc_timing_adjust adjust = stream->adjust;
+		if (adjust.v_total_max != adjust.v_total_min)
+			vertical_total_min = adjust.v_total_min;
 
 		vertical_blank_in_pixels = stream->timing.h_total *
-			(stream->timing.v_total
+			(vertical_total_min
 			 - stream->timing.v_addressable);
-
 		vertical_blank_time = vertical_blank_in_pixels
 			* 10000 / stream->timing.pix_clk_100hz;
 
@@ -171,6 +174,10 @@
 	struct dc_state *context)
 {
 	struct dm_pp_display_configuration *pp_display_cfg = &context->pp_display_cfg;
+	int memory_type_multiplier = MEMORY_TYPE_MULTIPLIER_CZ;
+
+	if (dc->bw_vbios && dc->bw_vbios->memory_type == bw_def_hbm)
+		memory_type_multiplier = MEMORY_TYPE_HBM;
 
 	pp_display_cfg->all_displays_in_sync =
 		context->bw_ctx.bw.dce.all_displays_in_sync;
@@ -183,8 +190,20 @@
 	pp_display_cfg->cpu_pstate_separation_time =
 			context->bw_ctx.bw.dce.blackout_recovery_time_us;
 
-	pp_display_cfg->min_memory_clock_khz = context->bw_ctx.bw.dce.yclk_khz
-		/ MEMORY_TYPE_MULTIPLIER_CZ;
+	/*
+	 * TODO: determine whether the bandwidth has reached the memory's
+	 * limit, then change the minimum memory clock based on the
+	 * real-time bandwidth limitation.
+	 */
+	if (ASICREV_IS_VEGA20_P(dc->ctx->asic_id.hw_internal_rev) && (context->stream_count >= 2)) {
+		pp_display_cfg->min_memory_clock_khz = max(pp_display_cfg->min_memory_clock_khz,
+							   (uint32_t) div64_s64(
+								   div64_s64(dc->bw_vbios->high_yclk.value,
+									     memory_type_multiplier), 10000));
+	} else {
+		pp_display_cfg->min_memory_clock_khz = context->bw_ctx.bw.dce.yclk_khz
+			/ memory_type_multiplier;
+	}
 
 	pp_display_cfg->min_engine_clock_khz = determine_sclk_from_bounding_box(
 			dc,
diff --git a/drivers/gpu/drm/amd/display/dc/dce/dce_mem_input.c b/drivers/gpu/drm/amd/display/dc/dce/dce_mem_input.c
index 1488ffd..31b698b 100644
--- a/drivers/gpu/drm/amd/display/dc/dce/dce_mem_input.c
+++ b/drivers/gpu/drm/amd/display/dc/dce/dce_mem_input.c
@@ -148,7 +148,7 @@
 			pte->min_pte_before_flip_horiz_scan;
 
 	REG_UPDATE(GRPH_PIPE_OUTSTANDING_REQUEST_LIMIT,
-			GRPH_PIPE_OUTSTANDING_REQUEST_LIMIT, 0xff);
+			GRPH_PIPE_OUTSTANDING_REQUEST_LIMIT, 0x7f);
 
 	REG_UPDATE_3(DVMM_PTE_CONTROL,
 			DVMM_PAGE_WIDTH, page_width,
@@ -157,7 +157,7 @@
 
 	REG_UPDATE_2(DVMM_PTE_ARB_CONTROL,
 			DVMM_PTE_REQ_PER_CHUNK, pte->pte_req_per_chunk,
-			DVMM_MAX_PTE_REQ_OUTSTANDING, 0xff);
+			DVMM_MAX_PTE_REQ_OUTSTANDING, 0x7f);
 }
 
 static void program_urgency_watermark(
diff --git a/drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c b/drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c
index afc6105..1787b9b 100644
--- a/drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/dce100/dce100_resource.c
@@ -1091,6 +1091,7 @@
 	if (construct(num_virtual_links, dc, pool))
 		return &pool->base;
 
+	kfree(pool);
 	BREAK_TO_DEBUGGER();
 	return NULL;
 }
diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c
index c66fe17..318e9c2 100644
--- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_resource.c
@@ -1462,6 +1462,7 @@
 	if (construct(num_virtual_links, dc, pool, asic_id))
 		return &pool->base;
 
+	kfree(pool);
 	BREAK_TO_DEBUGGER();
 	return NULL;
 }
diff --git a/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c b/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c
index 3ac4c7e..83e1878 100644
--- a/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/dce112/dce112_resource.c
@@ -987,6 +987,10 @@
 	struct dm_pp_clock_levels_with_latency mem_clks = {0};
 	struct dm_pp_wm_sets_with_clock_ranges clk_ranges = {0};
 	struct dm_pp_clock_levels clks = {0};
+	int memory_type_multiplier = MEMORY_TYPE_MULTIPLIER_CZ;
+
+	if (dc->bw_vbios && dc->bw_vbios->memory_type == bw_def_hbm)
+		memory_type_multiplier = MEMORY_TYPE_HBM;
 
 	/*do system clock  TODO PPLIB: after PPLIB implement,
 	 * then remove old way
@@ -1026,12 +1030,12 @@
 				&clks);
 
 		dc->bw_vbios->low_yclk = bw_frc_to_fixed(
-			clks.clocks_in_khz[0] * MEMORY_TYPE_MULTIPLIER_CZ, 1000);
+			clks.clocks_in_khz[0] * memory_type_multiplier, 1000);
 		dc->bw_vbios->mid_yclk = bw_frc_to_fixed(
-			clks.clocks_in_khz[clks.num_levels>>1] * MEMORY_TYPE_MULTIPLIER_CZ,
+			clks.clocks_in_khz[clks.num_levels>>1] * memory_type_multiplier,
 			1000);
 		dc->bw_vbios->high_yclk = bw_frc_to_fixed(
-			clks.clocks_in_khz[clks.num_levels-1] * MEMORY_TYPE_MULTIPLIER_CZ,
+			clks.clocks_in_khz[clks.num_levels-1] * memory_type_multiplier,
 			1000);
 
 		return;
@@ -1067,12 +1071,12 @@
 	 * YCLK = UMACLK*m_memoryTypeMultiplier
 	 */
 	dc->bw_vbios->low_yclk = bw_frc_to_fixed(
-		mem_clks.data[0].clocks_in_khz * MEMORY_TYPE_MULTIPLIER_CZ, 1000);
+		mem_clks.data[0].clocks_in_khz * memory_type_multiplier, 1000);
 	dc->bw_vbios->mid_yclk = bw_frc_to_fixed(
-		mem_clks.data[mem_clks.num_levels>>1].clocks_in_khz * MEMORY_TYPE_MULTIPLIER_CZ,
+		mem_clks.data[mem_clks.num_levels>>1].clocks_in_khz * memory_type_multiplier,
 		1000);
 	dc->bw_vbios->high_yclk = bw_frc_to_fixed(
-		mem_clks.data[mem_clks.num_levels-1].clocks_in_khz * MEMORY_TYPE_MULTIPLIER_CZ,
+		mem_clks.data[mem_clks.num_levels-1].clocks_in_khz * memory_type_multiplier,
 		1000);
 
 	/* Now notify PPLib/SMU about which Watermarks sets they should select
@@ -1338,6 +1342,7 @@
 	if (construct(num_virtual_links, dc, pool))
 		return &pool->base;
 
+	kfree(pool);
 	BREAK_TO_DEBUGGER();
 	return NULL;
 }
diff --git a/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c b/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c
index 7d08154..8b85e52 100644
--- a/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/dce120/dce120_resource.c
@@ -847,6 +847,8 @@
 	int i;
 	unsigned int clk;
 	unsigned int latency;
+	/* original logic in dal3 */
+	int memory_type_multiplier = MEMORY_TYPE_MULTIPLIER_CZ;
 
 	/*do system clock*/
 	if (!dm_pp_get_clock_levels_by_type_with_latency(
@@ -905,13 +907,16 @@
 	 * ALSO always convert UMA clock (from PPLIB)  to YCLK (HW formula):
 	 * YCLK = UMACLK*m_memoryTypeMultiplier
 	 */
+	if (dc->bw_vbios->memory_type == bw_def_hbm)
+		memory_type_multiplier = MEMORY_TYPE_HBM;
+
 	dc->bw_vbios->low_yclk = bw_frc_to_fixed(
-		mem_clks.data[0].clocks_in_khz * MEMORY_TYPE_MULTIPLIER_CZ, 1000);
+		mem_clks.data[0].clocks_in_khz * memory_type_multiplier, 1000);
 	dc->bw_vbios->mid_yclk = bw_frc_to_fixed(
-		mem_clks.data[mem_clks.num_levels>>1].clocks_in_khz * MEMORY_TYPE_MULTIPLIER_CZ,
+		mem_clks.data[mem_clks.num_levels>>1].clocks_in_khz * memory_type_multiplier,
 		1000);
 	dc->bw_vbios->high_yclk = bw_frc_to_fixed(
-		mem_clks.data[mem_clks.num_levels-1].clocks_in_khz * MEMORY_TYPE_MULTIPLIER_CZ,
+		mem_clks.data[mem_clks.num_levels-1].clocks_in_khz * memory_type_multiplier,
 		1000);
 
 	/* Now notify PPLib/SMU about which Watermarks sets they should select
@@ -1203,6 +1208,7 @@
 	if (construct(num_virtual_links, dc, pool))
 		return &pool->base;
 
+	kfree(pool);
 	BREAK_TO_DEBUGGER();
 	return NULL;
 }
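
Both the dce112 and dce120 hunks apply the formula quoted in the comments,
YCLK = UMACLK * m_memoryTypeMultiplier, but stop hard-coding the Carrizo
multiplier of 4: HBM parts use 2. A quick worked check, with names as in
the hunks and an illustrative umaclk_khz:

	int mult = (bw_vbios->memory_type == bw_def_hbm)
			? MEMORY_TYPE_HBM		/* 2 */
			: MEMORY_TYPE_MULTIPLIER_CZ;	/* 4 */

	/* e.g. 800,000 kHz HBM UMA clock: 800000 * 2 / 1000 = 1600 */
	struct bw_fixed yclk = bw_frc_to_fixed(umaclk_khz * mult, 1000);
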
diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c
index 5a89e46..59305e4 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c
@@ -1570,6 +1570,7 @@
 	if (construct(init_data->num_virtual_links, dc, pool))
 		return &pool->base;
 
+	kfree(pool);
 	BREAK_TO_DEBUGGER();
 	return NULL;
 }
diff --git a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c
index 3ca5139..de18218 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c
@@ -23,6 +23,8 @@
  *
  */
 
+#include <linux/slab.h>
+
 #include "dm_services.h"
 #include "dc.h"
 
diff --git a/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_factory_dcn21.c b/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_factory_dcn21.c
index 34485d9..8572678 100644
--- a/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_factory_dcn21.c
+++ b/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_factory_dcn21.c
@@ -35,12 +35,10 @@
 
 #include "hw_factory_dcn21.h"
 
-
 #include "dcn/dcn_2_1_0_offset.h"
 #include "dcn/dcn_2_1_0_sh_mask.h"
 #include "renoir_ip_offset.h"
 
-
 #include "reg_helper.h"
 #include "../hpd_regs.h"
 /* begin *********************
@@ -136,6 +134,39 @@
 	DDC_MASK_SH_LIST_DCN2(_MASK, 6)
 };
 
+#include "../generic_regs.h"
+
+/* set field name */
+#define SF_GENERIC(reg_name, field_name, post_fix)\
+	.field_name = reg_name ## __ ## field_name ## post_fix
+
+#define generic_regs(id) \
+{\
+	GENERIC_REG_LIST(id)\
+}
+
+static const struct generic_registers generic_regs[] = {
+	generic_regs(A),
+};
+
+static const struct generic_sh_mask generic_shift[] = {
+	GENERIC_MASK_SH_LIST(__SHIFT, A),
+};
+
+static const struct generic_sh_mask generic_mask[] = {
+	GENERIC_MASK_SH_LIST(_MASK, A),
+};
+
+static void define_generic_registers(struct hw_gpio_pin *pin, uint32_t en)
+{
+	struct hw_generic *generic = HW_GENERIC_FROM_BASE(pin);
+
+	generic->regs = &generic_regs[en];
+	generic->shifts = &generic_shift[en];
+	generic->masks = &generic_mask[en];
+	generic->base.regs = &generic_regs[en].gpio;
+}
+
 static void define_ddc_registers(
 		struct hw_gpio_pin *pin,
 		uint32_t en)
@@ -181,7 +212,8 @@
 	.get_hpd_pin = dal_hw_hpd_get_pin,
 	.get_generic_pin = dal_hw_generic_get_pin,
 	.define_hpd_registers = define_hpd_registers,
-	.define_ddc_registers = define_ddc_registers
+	.define_ddc_registers = define_ddc_registers,
+	.define_generic_registers = define_generic_registers
 };
 /*
  * dal_hw_factory_dcn10_init
diff --git a/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_translate_dcn21.c b/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_translate_dcn21.c
index ad7c437..fbb58fb 100644
--- a/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_translate_dcn21.c
+++ b/drivers/gpu/drm/amd/display/dc/gpio/dcn21/hw_translate_dcn21.c
@@ -58,7 +58,6 @@
 #define SF_HPD(reg_name, field_name, post_fix)\
 	.field_name = reg_name ## __ ## field_name ## post_fix
 
-
 /* macros to expand register list macros defined in HW object header file
  * end *********************/
 
@@ -71,7 +70,7 @@
 {
 	switch (offset) {
 	/* GENERIC */
-	case REG(DC_GENERICA):
+	case REG(DC_GPIO_GENERIC_A):
 		*id = GPIO_ID_GENERIC;
 		switch (mask) {
 		case DC_GPIO_GENERIC_A__DC_GPIO_GENERICA_A_MASK:
diff --git a/drivers/gpu/drm/amd/display/dc/inc/resource.h b/drivers/gpu/drm/amd/display/dc/inc/resource.h
index 1cc1c8c..bef224b 100644
--- a/drivers/gpu/drm/amd/display/dc/inc/resource.h
+++ b/drivers/gpu/drm/amd/display/dc/inc/resource.h
@@ -31,6 +31,8 @@
 #include "dm_pp_smu.h"
 
 #define MEMORY_TYPE_MULTIPLIER_CZ 4
+#define MEMORY_TYPE_HBM 2
+
 
 enum dce_version resource_parse_asic_id(
 		struct hw_asic_id asic_id);
diff --git a/drivers/gpu/drm/amd/display/include/dal_asic_id.h b/drivers/gpu/drm/amd/display/include/dal_asic_id.h
index 1f16892..1be6c44 100644
--- a/drivers/gpu/drm/amd/display/include/dal_asic_id.h
+++ b/drivers/gpu/drm/amd/display/include/dal_asic_id.h
@@ -137,10 +137,13 @@
 #define RAVEN1_F0 0xF0
 #define RAVEN_UNKNOWN 0xFF
 
+#define PICASSO_15D8_REV_E3 0xE3
+#define PICASSO_15D8_REV_E4 0xE4
+
 #define ASICREV_IS_RAVEN(eChipRev) ((eChipRev >= RAVEN_A0) && eChipRev < RAVEN_UNKNOWN)
 #define ASICREV_IS_PICASSO(eChipRev) ((eChipRev >= PICASSO_A0) && (eChipRev < RAVEN2_A0))
-#define ASICREV_IS_RAVEN2(eChipRev) ((eChipRev >= RAVEN2_A0) && (eChipRev < 0xF0))
-
+#define ASICREV_IS_RAVEN2(eChipRev) ((eChipRev >= RAVEN2_A0) && (eChipRev < PICASSO_15D8_REV_E3))
+#define ASICREV_IS_DALI(eChipRev) ((eChipRev >= PICASSO_15D8_REV_E3) && (eChipRev < RAVEN1_F0))
 
 #define ASICREV_IS_RV1_F0(eChipRev) ((eChipRev >= RAVEN1_F0) && (eChipRev < RAVEN_UNKNOWN))
 
diff --git a/drivers/gpu/drm/amd/include/renoir_ip_offset.h b/drivers/gpu/drm/amd/include/renoir_ip_offset.h
index 554714c..094648c 100644
--- a/drivers/gpu/drm/amd/include/renoir_ip_offset.h
+++ b/drivers/gpu/drm/amd/include/renoir_ip_offset.h
@@ -155,7 +155,7 @@
                                         { { 0, 0, 0, 0, 0 } },
                                         { { 0, 0, 0, 0, 0 } },
                                         { { 0, 0, 0, 0, 0 } } } };
-static const struct IP_BASE MP1_BASE ={ { { { 0x00016200, 0x02400400, 0x00E80000, 0x00EC0000, 0x00F00000 } },
+static const struct IP_BASE MP1_BASE ={ { { { 0x00016000, 0x02400400, 0x00E80000, 0x00EC0000, 0x00F00000 } },
                                         { { 0, 0, 0, 0, 0 } },
                                         { { 0, 0, 0, 0, 0 } },
                                         { { 0, 0, 0, 0, 0 } },
diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
index fa636cb..fa8ad7d 100644
--- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
@@ -1531,6 +1531,7 @@
 static int pp_smu_i2c_bus_access(void *handle, bool acquire)
 {
 	struct pp_hwmgr *hwmgr = handle;
+	int ret = 0;
 
 	if (!hwmgr || !hwmgr->pm_en)
 		return -EINVAL;
@@ -1540,7 +1541,11 @@
 		return -EINVAL;
 	}
 
-	return hwmgr->hwmgr_func->smu_i2c_bus_access(hwmgr, acquire);
+	mutex_lock(&hwmgr->smu_lock);
+	ret = hwmgr->hwmgr_func->smu_i2c_bus_access(hwmgr, acquire);
+	mutex_unlock(&hwmgr->smu_lock);
+
+	return ret;
 }
 
 static const struct amd_pm_funcs pp_dpm_funcs = {
diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 22f3c60..33960fb 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -354,6 +354,9 @@
 	case AMD_IP_BLOCK_TYPE_GFX:
 		ret = smu_gfx_off_control(smu, gate);
 		break;
+	case AMD_IP_BLOCK_TYPE_SDMA:
+		ret = smu_powergate_sdma(smu, gate);
+		break;
 	default:
 		break;
 	}
diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c
index 2a6da54..e62bfba 100644
--- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c
@@ -177,12 +177,82 @@
 
 }
 
+static int renoir_print_clk_levels(struct smu_context *smu,
+			enum smu_clk_type clk_type, char *buf)
+{
+	int i, size = 0, ret = 0;
+	uint32_t cur_value = 0, value = 0, count = 0, min = 0, max = 0;
+	DpmClocks_t *clk_table = smu->smu_table.clocks_table;
+	SmuMetrics_t metrics = {0};
+
+	if (!clk_table || clk_type >= SMU_CLK_COUNT)
+		return -EINVAL;
+
+	ret = smu_update_table(smu, SMU_TABLE_SMU_METRICS, 0,
+			       (void *)&metrics, false);
+	if (ret)
+		return ret;
+
+	switch (clk_type) {
+	case SMU_GFXCLK:
+	case SMU_SCLK:
+		/* the clock values retrieved from the metrics table are in MHz */
+		cur_value = metrics.ClockFrequency[CLOCK_GFXCLK];
+		ret = smu_get_dpm_freq_range(smu, SMU_GFXCLK, &min, &max);
+		if (!ret) {
+			/* the driver only knows min/max gfx_clk; add level 1 for all other gfx clks */
+			if (cur_value  == max)
+				i = 2;
+			else if (cur_value == min)
+				i = 0;
+			else
+				i = 1;
+
+			size += sprintf(buf + size, "0: %uMhz %s\n", min,
+					i == 0 ? "*" : "");
+			size += sprintf(buf + size, "1: %uMhz %s\n",
+					i == 1 ? cur_value : RENOIR_UMD_PSTATE_GFXCLK,
+					i == 1 ? "*" : "");
+			size += sprintf(buf + size, "2: %uMhz %s\n", max,
+					i == 2 ? "*" : "");
+		}
+		return size;
+	case SMU_SOCCLK:
+		count = NUM_SOCCLK_DPM_LEVELS;
+		cur_value = metrics.ClockFrequency[CLOCK_SOCCLK];
+		break;
+	case SMU_MCLK:
+		count = NUM_MEMCLK_DPM_LEVELS;
+		cur_value = metrics.ClockFrequency[CLOCK_UMCCLK];
+		break;
+	case SMU_DCEFCLK:
+		count = NUM_DCFCLK_DPM_LEVELS;
+		cur_value = metrics.ClockFrequency[CLOCK_DCFCLK];
+		break;
+	case SMU_FCLK:
+		count = NUM_FCLK_DPM_LEVELS;
+		cur_value = metrics.ClockFrequency[CLOCK_FCLK];
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	for (i = 0; i < count; i++) {
+		GET_DPM_CUR_FREQ(clk_table, clk_type, i, value);
+		size += sprintf(buf + size, "%d: %uMhz %s\n", i, value,
+				cur_value == value ? "*" : "");
+	}
+
+	return size;
+}
+
 static const struct pptable_funcs renoir_ppt_funcs = {
 	.get_smu_msg_index = renoir_get_smu_msg_index,
 	.get_smu_table_index = renoir_get_smu_table_index,
 	.tables_init = renoir_tables_init,
 	.set_power_state = NULL,
 	.get_dpm_uclk_limited = renoir_get_dpm_uclk_limited,
+	.print_clk_levels = renoir_print_clk_levels,
 };
 
 void renoir_set_ppt_funcs(struct smu_context *smu)
diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h
index e9b7237..2a390dd 100644
--- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h
+++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h
@@ -25,4 +25,29 @@
 
 extern void renoir_set_ppt_funcs(struct smu_context *smu);
 
+/* UMD PState Renoir Msg Parameters in MHz */
+#define RENOIR_UMD_PSTATE_GFXCLK       700
+#define RENOIR_UMD_PSTATE_SOCCLK       678
+#define RENOIR_UMD_PSTATE_FCLK         800
+
+#define GET_DPM_CUR_FREQ(table, clk_type, dpm_level, freq)		\
+	do {								\
+		switch (clk_type) {					\
+		case SMU_SOCCLK:					\
+			freq = table->SocClocks[dpm_level].Freq;	\
+			break;						\
+		case SMU_MCLK:						\
+			freq = table->MemClocks[dpm_level].Freq;	\
+			break;						\
+		case SMU_DCEFCLK:					\
+			freq = table->DcfClocks[dpm_level].Freq;	\
+			break;						\
+		case SMU_FCLK:						\
+			freq = table->FClocks[dpm_level].Freq;		\
+			break;						\
+		default:						\
+			break;						\
+		}							\
+	} while (0)
+
 #endif
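
renoir_print_clk_levels() is what backs the pp_dpm_* files in sysfs. For
gfxclk the firmware only reports min and max, so the driver fabricates a
three-level table (min, current or the UMD pstate value, max) and stars
the active entry; the other clocks walk the fixed DPM tables through
GET_DPM_CUR_FREQ above. The read side then has the usual shape
(illustrative values):

	$ cat /sys/class/drm/card0/device/pp_dpm_sclk
	0: 200Mhz
	1: 700Mhz *
	2: 1400Mhz
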
diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c
index 98bccac..9e13e46 100644
--- a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c
+++ b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c
@@ -874,6 +874,9 @@
 				 &adv7511_connector_helper_funcs);
 	drm_connector_attach_encoder(&adv->connector, bridge->encoder);
 
+	if (adv->type == ADV7533)
+		ret = adv7533_attach_dsi(adv);
+
 	if (adv->i2c_main->irq)
 		regmap_write(adv->regmap, ADV7511_REG_INT_ENABLE(0),
 			     ADV7511_INT0_HPD);
@@ -978,10 +981,10 @@
 {
 	int ret;
 
-	adv->i2c_cec = i2c_new_secondary_device(adv->i2c_main, "cec",
+	adv->i2c_cec = i2c_new_ancillary_device(adv->i2c_main, "cec",
 						ADV7511_CEC_I2C_ADDR_DEFAULT);
-	if (!adv->i2c_cec)
-		return -EINVAL;
+	if (IS_ERR(adv->i2c_cec))
+		return PTR_ERR(adv->i2c_cec);
 	i2c_set_clientdata(adv->i2c_cec, adv);
 
 	adv->regmap_cec = devm_regmap_init_i2c(adv->i2c_cec,
@@ -1162,20 +1165,20 @@
 
 	adv7511_packet_disable(adv7511, 0xffff);
 
-	adv7511->i2c_edid = i2c_new_secondary_device(i2c, "edid",
+	adv7511->i2c_edid = i2c_new_ancillary_device(i2c, "edid",
 					ADV7511_EDID_I2C_ADDR_DEFAULT);
-	if (!adv7511->i2c_edid) {
-		ret = -EINVAL;
+	if (IS_ERR(adv7511->i2c_edid)) {
+		ret = PTR_ERR(adv7511->i2c_edid);
 		goto uninit_regulators;
 	}
 
 	regmap_write(adv7511->regmap, ADV7511_REG_EDID_I2C_ADDR,
 		     adv7511->i2c_edid->addr << 1);
 
-	adv7511->i2c_packet = i2c_new_secondary_device(i2c, "packet",
+	adv7511->i2c_packet = i2c_new_ancillary_device(i2c, "packet",
 					ADV7511_PACKET_I2C_ADDR_DEFAULT);
-	if (!adv7511->i2c_packet) {
-		ret = -EINVAL;
+	if (IS_ERR(adv7511->i2c_packet)) {
+		ret = PTR_ERR(adv7511->i2c_packet);
 		goto err_i2c_unregister_edid;
 	}
 
@@ -1219,17 +1222,8 @@
 	drm_bridge_add(&adv7511->bridge);
 
 	adv7511_audio_init(dev, adv7511);
-
-	if (adv7511->type == ADV7533) {
-		ret = adv7533_attach_dsi(adv7511);
-		if (ret)
-			goto err_remove_bridge;
-	}
-
 	return 0;
 
-err_remove_bridge:
-	drm_bridge_remove(&adv7511->bridge);
 err_unregister_cec:
 	i2c_unregister_device(adv7511->i2c_cec);
 	if (adv7511->cec_clk)
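
The conversions above track an API difference: i2c_new_ancillary_device()
reports failure as an ERR_PTR()-encoded pointer, where the old
i2c_new_secondary_device() returned NULL, so every call site switches from
NULL tests to IS_ERR()/PTR_ERR(). The consuming pattern, sketched:

	struct i2c_client *cec;

	cec = i2c_new_ancillary_device(adv->i2c_main, "cec",
				       ADV7511_CEC_I2C_ADDR_DEFAULT);
	if (IS_ERR(cec))
		return PTR_ERR(cec);	/* propagate the encoded errno */
	/* ... on teardown: i2c_unregister_device(cec); */
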
diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index aa16ea1..3ef2ac5 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -26,6 +26,7 @@
  */
 
 #include <linux/dma-fence.h>
+#include <linux/ktime.h>
 
 #include <drm/drm_atomic.h>
 #include <drm/drm_atomic_helper.h>
@@ -1580,9 +1581,23 @@
 {
 	struct drm_device *dev = old_state->dev;
 	const struct drm_mode_config_helper_funcs *funcs;
+	ktime_t start;
+	s64 commit_time_ms;
 
 	funcs = dev->mode_config.helper_private;
 
+	/*
+	 * We're measuring the _entire_ commit, so the time will vary depending
+	 * on how many fences and objects are involved. For the purposes of self
+	 * refresh, this is desirable since it'll give us an idea of how
+	 * congested things are. This will inform our decision on how often we
+	 * should enter self refresh after idle.
+	 *
+	 * These times will be averaged out in the self refresh helpers to avoid
+	 * overreacting over one outlier frame
+	 */
+	start = ktime_get();
+
 	drm_atomic_helper_wait_for_fences(dev, old_state, false);
 
 	drm_atomic_helper_wait_for_dependencies(old_state);
@@ -1592,6 +1607,11 @@
 	else
 		drm_atomic_helper_commit_tail(old_state);
 
+	commit_time_ms = ktime_ms_delta(ktime_get(), start);
+	if (commit_time_ms > 0)
+		drm_self_refresh_helper_update_avg_times(old_state,
+						 (unsigned long)commit_time_ms);
+
 	drm_atomic_helper_commit_cleanup_done(old_state);
 
 	drm_atomic_state_put(old_state);
@@ -3275,7 +3295,7 @@
 		return PTR_ERR(crtc_state);
 
 	crtc_state->event = event;
-	crtc_state->pageflip_flags = flags;
+	crtc_state->async_flip = flags & DRM_MODE_PAGE_FLIP_ASYNC;
 
 	plane_state = drm_atomic_get_plane_state(state, plane);
 	if (IS_ERR(plane_state))
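
The commit time measured above feeds the reworked self-refresh helper in
drm_self_refresh_helper.c further down, which replaces the fixed entry
delay with per-CRTC moving averages built on DECLARE_EWMA() from
<linux/average.h>. That macro generates a small fixed-point EWMA type plus
init/add/read helpers; a sketch of the mini-API (illustrative name; the
caller runs ewma_commit_ms_init() once before the first sample):

	#include <linux/average.h>

	/* Generates struct ewma_commit_ms and ewma_commit_ms_{init,add,read}. */
	DECLARE_EWMA(commit_ms, 4, 4)	/* 4 fractional bits, new-sample weight 1/4 */

	static unsigned long update_avg(struct ewma_commit_ms *avg,
					unsigned long ms)
	{
		ewma_commit_ms_add(avg, ms);	/* fold in one measurement */
		return ewma_commit_ms_read(avg);
	}
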
diff --git a/drivers/gpu/drm/drm_atomic_state_helper.c b/drivers/gpu/drm/drm_atomic_state_helper.c
index 46dc264..d0a937f 100644
--- a/drivers/gpu/drm/drm_atomic_state_helper.c
+++ b/drivers/gpu/drm/drm_atomic_state_helper.c
@@ -128,7 +128,7 @@
 	state->zpos_changed = false;
 	state->commit = NULL;
 	state->event = NULL;
-	state->pageflip_flags = 0;
+	state->async_flip = false;
 
 	/* Self refresh should be canceled when a new update is available */
 	state->active = drm_atomic_crtc_effectively_active(state);
diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c
index 5a5b42d..7a26bfb 100644
--- a/drivers/gpu/drm/drm_atomic_uapi.c
+++ b/drivers/gpu/drm/drm_atomic_uapi.c
@@ -1305,8 +1305,7 @@
 	if (arg->reserved)
 		return -EINVAL;
 
-	if ((arg->flags & DRM_MODE_PAGE_FLIP_ASYNC) &&
-			!dev->mode_config.async_page_flip)
+	if (arg->flags & DRM_MODE_PAGE_FLIP_ASYNC)
 		return -EINVAL;
 
 	/* can't test and expect an event at the same time. */
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index c456c3d..769feef 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -976,14 +976,14 @@
 	if (ret)
 		goto err_minors;
 
+	dev->registered = true;
+
 	if (dev->driver->load) {
 		ret = dev->driver->load(dev, flags);
 		if (ret)
 			goto err_minors;
 	}
 
-	dev->registered = true;
-
 	if (drm_core_check_feature(dev, DRIVER_MODESET))
 		drm_modeset_register_all(dev);
 
diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c
index f675a3b..fcd728d 100644
--- a/drivers/gpu/drm/drm_ioctl.c
+++ b/drivers/gpu/drm/drm_ioctl.c
@@ -336,7 +336,12 @@
 	case DRM_CLIENT_CAP_ATOMIC:
 		if (!drm_core_check_feature(dev, DRIVER_ATOMIC))
 			return -EOPNOTSUPP;
-		if (req->value > 1)
+		/* The modesetting DDX has a totally broken idea of atomic. */
+		if (current->comm[0] == 'X' && req->value == 1) {
+			pr_info("broken atomic modeset userspace detected, disabling atomic\n");
+			return -EOPNOTSUPP;
+		}
+		if (req->value > 2)
 			return -EINVAL;
 		file_priv->atomic = req->value;
 		file_priv->universal_planes = req->value;
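With this check, value 1 is refused for clients whose comm starts with 'X', while the new value 2 lets userspace request atomic explicitly. A hedged usage sketch with libdrm's drmSetClientCap(); the device node is only an example and a standard libdrm install is assumed:

    #include <fcntl.h>
    #include <stdio.h>
    #include <xf86drm.h>    /* libdrm: link with -ldrm */

    int main(void)
    {
        int fd = open("/dev/dri/card0", O_RDWR);    /* example node */
        if (fd < 0)
            return 1;

        /* Value 2 requests atomic while sidestepping the comm heuristic. */
        if (drmSetClientCap(fd, DRM_CLIENT_CAP_ATOMIC, 2))
            fprintf(stderr, "atomic not available\n");
        return 0;
    }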
diff --git a/drivers/gpu/drm/drm_mode_object.c b/drivers/gpu/drm/drm_mode_object.c
index c355ba8..6a23e36 100644
--- a/drivers/gpu/drm/drm_mode_object.c
+++ b/drivers/gpu/drm/drm_mode_object.c
@@ -42,7 +42,7 @@
 {
 	int ret;
 
-	WARN_ON(dev->registered && !obj_free_cb);
+	WARN_ON(!dev->driver->load && dev->registered && !obj_free_cb);
 
 	mutex_lock(&dev->mode_config.idr_mutex);
 	ret = idr_alloc(&dev->mode_config.object_idr, register_obj ? obj : NULL,
@@ -104,7 +104,7 @@
 void drm_mode_object_unregister(struct drm_device *dev,
 				struct drm_mode_object *object)
 {
-	WARN_ON(dev->registered && !object->free_cb);
+	WARN_ON(!dev->driver->load && dev->registered && !object->free_cb);
 
 	mutex_lock(&dev->mode_config.idr_mutex);
 	if (object->id) {
diff --git a/drivers/gpu/drm/drm_self_refresh_helper.c b/drivers/gpu/drm/drm_self_refresh_helper.c
index 4b9424a..68f4765 100644
--- a/drivers/gpu/drm/drm_self_refresh_helper.c
+++ b/drivers/gpu/drm/drm_self_refresh_helper.c
@@ -5,6 +5,7 @@
  * Authors:
  * Sean Paul <seanpaul@chromium.org>
  */
+#include <linux/average.h>
 #include <linux/bitops.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
@@ -50,11 +51,17 @@
  * atomic_check when &drm_crtc_state.self_refresh_active is true.
  */
 
+#define SELF_REFRESH_AVG_SEED_MS 200
+
+DECLARE_EWMA(psr_time, 4, 4)
+
 struct drm_self_refresh_data {
 	struct drm_crtc *crtc;
 	struct delayed_work entry_work;
-	struct drm_atomic_state *save_state;
-	unsigned int entry_delay_ms;
+
+	struct mutex avg_mutex;
+	struct ewma_psr_time entry_avg_ms;
+	struct ewma_psr_time exit_avg_ms;
 };
 
 static void drm_self_refresh_helper_entry_work(struct work_struct *work)
@@ -123,6 +130,44 @@
 }
 
 /**
+ * drm_self_refresh_helper_update_avg_times - Updates a crtc's SR time averages
+ * @state: the state which has just been applied to hardware
+ * @commit_time_ms: the amount of time in ms that this commit took to complete
+ *
+ * Called after &drm_mode_config_funcs.atomic_commit_tail, this function will
+ * update the average entry/exit self refresh times on self refresh transitions.
+ * These averages will be used when calculating how long to delay before
+ * entering self refresh mode after activity.
+ */
+void drm_self_refresh_helper_update_avg_times(struct drm_atomic_state *state,
+					      unsigned int commit_time_ms)
+{
+	struct drm_crtc *crtc;
+	struct drm_crtc_state *old_crtc_state, *new_crtc_state;
+	int i;
+
+	for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state,
+				      new_crtc_state, i) {
+		struct drm_self_refresh_data *sr_data = crtc->self_refresh_data;
+		struct ewma_psr_time *time;
+
+		if (old_crtc_state->self_refresh_active ==
+		    new_crtc_state->self_refresh_active)
+			continue;
+
+		if (new_crtc_state->self_refresh_active)
+			time = &sr_data->entry_avg_ms;
+		else
+			time = &sr_data->exit_avg_ms;
+
+		mutex_lock(&sr_data->avg_mutex);
+		ewma_psr_time_add(time, commit_time_ms);
+		mutex_unlock(&sr_data->avg_mutex);
+	}
+}
+EXPORT_SYMBOL(drm_self_refresh_helper_update_avg_times);
+
+/**
  * drm_self_refresh_helper_alter_state - Alters the atomic state for SR exit
  * @state: the state currently being checked
  *
@@ -153,6 +198,7 @@
 
 	for_each_new_crtc_in_state(state, crtc, crtc_state, i) {
 		struct drm_self_refresh_data *sr_data;
+		unsigned int delay;
 
 		/* Don't trigger the entry timer when we're already in SR */
 		if (crtc_state->self_refresh_active)
@@ -162,8 +208,13 @@
 		if (!sr_data)
 			continue;
 
+		mutex_lock(&sr_data->avg_mutex);
+		delay = (ewma_psr_time_read(&sr_data->entry_avg_ms) +
+			 ewma_psr_time_read(&sr_data->exit_avg_ms)) * 2;
+		mutex_unlock(&sr_data->avg_mutex);
+
 		mod_delayed_work(system_wq, &sr_data->entry_work,
-				 msecs_to_jiffies(sr_data->entry_delay_ms));
+				 msecs_to_jiffies(delay));
 	}
 }
 EXPORT_SYMBOL(drm_self_refresh_helper_alter_state);
@@ -171,12 +222,10 @@
 /**
  * drm_self_refresh_helper_init - Initializes self refresh helpers for a crtc
  * @crtc: the crtc which supports self refresh supported displays
- * @entry_delay_ms: amount of inactivity to wait before entering self refresh
  *
  * Returns zero if successful or -errno on failure
  */
-int drm_self_refresh_helper_init(struct drm_crtc *crtc,
-				 unsigned int entry_delay_ms)
+int drm_self_refresh_helper_init(struct drm_crtc *crtc)
 {
 	struct drm_self_refresh_data *sr_data = crtc->self_refresh_data;
 
@@ -190,8 +239,18 @@
 
 	INIT_DELAYED_WORK(&sr_data->entry_work,
 			  drm_self_refresh_helper_entry_work);
-	sr_data->entry_delay_ms = entry_delay_ms;
 	sr_data->crtc = crtc;
+	mutex_init(&sr_data->avg_mutex);
+	ewma_psr_time_init(&sr_data->entry_avg_ms);
+	ewma_psr_time_init(&sr_data->exit_avg_ms);
+
+	/*
+	 * Seed the averages so they're non-zero (and sufficiently large
+	 * for even poorly performing panels). As time goes on, this will be
+	 * averaged out and the values will trend to their true value.
+	 */
+	ewma_psr_time_add(&sr_data->entry_avg_ms, SELF_REFRESH_AVG_SEED_MS);
+	ewma_psr_time_add(&sr_data->exit_avg_ms, SELF_REFRESH_AVG_SEED_MS);
 
 	crtc->self_refresh_data = sr_data;
 	return 0;
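DECLARE_EWMA(psr_time, 4, 4) declares a fixed-point moving average with 4 fractional bits where each new sample contributes 1/4 of the result. A standalone model of that arithmetic, assuming the usual linux/average.h semantics, showing how the 200 ms seed decays toward faster real commit times:

    #include <stdio.h>

    #define PRECISION  4  /* fractional bits, per DECLARE_EWMA(psr_time, 4, 4) */
    #define WEIGHT_RCP 4  /* each sample contributes 1/4 of the new average */

    static unsigned long ewma_add(unsigned long internal, unsigned long val)
    {
        /* avg += (val - avg) / WEIGHT_RCP, in fixed point */
        if (!internal)
            return val << PRECISION;
        return (internal * (WEIGHT_RCP - 1) + (val << PRECISION)) / WEIGHT_RCP;
    }

    int main(void)
    {
        unsigned long avg = 0;
        /* Seed at 200 ms (SELF_REFRESH_AVG_SEED_MS), then feed faster commits. */
        unsigned long samples[] = { 200, 20, 20, 20, 20, 20, 20 };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            avg = ewma_add(avg, samples[i]);
            printf("sample %3lu ms -> avg %lu ms\n", samples[i], avg >> PRECISION);
        }
        /* the entry delay would then be (entry_avg + exit_avg) * 2 */
        return 0;
    }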
diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.c b/drivers/gpu/drm/nouveau/dispnv50/wndw.c
index 2db0293..5193b62 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/wndw.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.c
@@ -267,7 +267,7 @@
 			asyw->image.pitch[0] = fb->base.pitches[0];
 		}
 
-		if (!(asyh->state.pageflip_flags & DRM_MODE_PAGE_FLIP_ASYNC))
+		if (!asyh->state.async_flip)
 			asyw->image.interval = 1;
 		else
 			asyw->image.interval = 0;
@@ -383,7 +383,7 @@
 	}
 
 	/* Can't do an immediate flip while changing the LUT. */
-	asyh->state.pageflip_flags &= ~DRM_MODE_PAGE_FLIP_ASYNC;
+	asyh->state.async_flip = false;
 }
 
 static int
diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
index a1f5fa6..12ff77d 100644
--- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c
+++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
@@ -39,7 +39,7 @@
 	 * If frequency scaling from low to high, adjust voltage first.
 	 * If frequency scaling from high to low, adjust frequency first.
 	 */
-	if (old_clk_rate < target_rate && pfdev->regulator) {
+	if (old_clk_rate < target_rate) {
 		err = regulator_set_voltage(pfdev->regulator, target_volt,
 					    target_volt);
 		if (err) {
@@ -53,14 +53,12 @@
 	if (err) {
 		dev_err(dev, "Cannot set frequency %lu (%d)\n", target_rate,
 			err);
-		if (pfdev->regulator)
-			regulator_set_voltage(pfdev->regulator,
-					      pfdev->devfreq.cur_volt,
-					      pfdev->devfreq.cur_volt);
+		regulator_set_voltage(pfdev->regulator, pfdev->devfreq.cur_volt,
+				      pfdev->devfreq.cur_volt);
 		return err;
 	}
 
-	if (old_clk_rate > target_rate && pfdev->regulator) {
+	if (old_clk_rate > target_rate) {
 		err = regulator_set_voltage(pfdev->regulator, target_volt,
 					    target_volt);
 		if (err)
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c
index 46b0b02..238fb6d 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -89,12 +89,9 @@
 {
 	int ret;
 
-	pfdev->regulator = devm_regulator_get_optional(pfdev->dev, "mali");
+	pfdev->regulator = devm_regulator_get(pfdev->dev, "mali");
 	if (IS_ERR(pfdev->regulator)) {
 		ret = PTR_ERR(pfdev->regulator);
-		pfdev->regulator = NULL;
-		if (ret == -ENODEV)
-			return 0;
 		dev_err(pfdev->dev, "failed to get regulator: %d\n", ret);
 		return ret;
 	}
@@ -110,8 +107,7 @@
 
 static void panfrost_regulator_fini(struct panfrost_device *pfdev)
 {
-	if (pfdev->regulator)
-		regulator_disable(pfdev->regulator);
+	regulator_disable(pfdev->regulator);
 }
 
 int panfrost_device_init(struct panfrost_device *pfdev)
diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c
index 6010f9e..bdd9905 100644
--- a/drivers/gpu/drm/panfrost/panfrost_mmu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c
@@ -394,28 +394,40 @@
 	free_io_pgtable_ops(mmu->pgtbl_ops);
 }
 
-static struct drm_mm_node *addr_to_drm_mm_node(struct panfrost_device *pfdev, int as, u64 addr)
+static struct panfrost_gem_object *
+addr_to_drm_mm_node(struct panfrost_device *pfdev, int as, u64 addr)
 {
-	struct drm_mm_node *node = NULL;
+	struct panfrost_gem_object *bo = NULL;
+	struct panfrost_file_priv *priv;
+	struct drm_mm_node *node;
 	u64 offset = addr >> PAGE_SHIFT;
 	struct panfrost_mmu *mmu;
 
 	spin_lock(&pfdev->as_lock);
 	list_for_each_entry(mmu, &pfdev->as_lru_list, list) {
-		struct panfrost_file_priv *priv;
-		if (as != mmu->as)
-			continue;
+		if (as == mmu->as)
+			break;
+	}
+	if (as != mmu->as)
+		goto out;
 
-		priv = container_of(mmu, struct panfrost_file_priv, mmu);
-		drm_mm_for_each_node(node, &priv->mm) {
-			if (offset >= node->start && offset < (node->start + node->size))
-				goto out;
+	priv = container_of(mmu, struct panfrost_file_priv, mmu);
+
+	spin_lock(&priv->mm_lock);
+
+	drm_mm_for_each_node(node, &priv->mm) {
+		if (offset >= node->start &&
+		    offset < (node->start + node->size)) {
+			bo = drm_mm_node_to_panfrost_bo(node);
+			drm_gem_object_get(&bo->base.base);
+			break;
 		}
 	}
 
+	spin_unlock(&priv->mm_lock);
 out:
 	spin_unlock(&pfdev->as_lock);
-	return node;
+	return bo;
 }
 
 #define NUM_FAULT_PAGES (SZ_2M / PAGE_SIZE)
@@ -423,29 +435,28 @@
 int panfrost_mmu_map_fault_addr(struct panfrost_device *pfdev, int as, u64 addr)
 {
 	int ret, i;
-	struct drm_mm_node *node;
 	struct panfrost_gem_object *bo;
 	struct address_space *mapping;
 	pgoff_t page_offset;
 	struct sg_table *sgt;
 	struct page **pages;
 
-	node = addr_to_drm_mm_node(pfdev, as, addr);
-	if (!node)
+	bo = addr_to_drm_mm_node(pfdev, as, addr);
+	if (!bo)
 		return -ENOENT;
 
-	bo = drm_mm_node_to_panfrost_bo(node);
 	if (!bo->is_heap) {
 		dev_WARN(pfdev->dev, "matching BO is not heap type (GPU VA = %llx)",
-			 node->start << PAGE_SHIFT);
-		return -EINVAL;
+			 bo->node.start << PAGE_SHIFT);
+		ret = -EINVAL;
+		goto err_bo;
 	}
 	WARN_ON(bo->mmu->as != as);
 
 	/* Assume 2MB alignment and size multiple */
 	addr &= ~((u64)SZ_2M - 1);
 	page_offset = addr >> PAGE_SHIFT;
-	page_offset -= node->start;
+	page_offset -= bo->node.start;
 
 	mutex_lock(&bo->base.pages_lock);
 
@@ -454,7 +465,8 @@
 				     sizeof(struct sg_table), GFP_KERNEL | __GFP_ZERO);
 		if (!bo->sgts) {
 			mutex_unlock(&bo->base.pages_lock);
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto err_bo;
 		}
 
 		pages = kvmalloc_array(bo->base.base.size >> PAGE_SHIFT,
@@ -463,7 +475,8 @@
 			kfree(bo->sgts);
 			bo->sgts = NULL;
 			mutex_unlock(&bo->base.pages_lock);
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto err_bo;
 		}
 		bo->base.pages = pages;
 		bo->base.pages_use_count = 1;
@@ -501,12 +514,16 @@
 
 	dev_dbg(pfdev->dev, "mapped page fault @ AS%d %llx", as, addr);
 
+	drm_gem_object_put_unlocked(&bo->base.base);
+
 	return 0;
 
 err_map:
 	sg_free_table(sgt);
 err_pages:
 	drm_gem_shmem_put_pages(&bo->base);
+err_bo:
+	drm_gem_object_put_unlocked(&bo->base.base);
 	return ret;
 }
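The important property of the rewritten lookup is that the reference is taken while the locks are still held, so the returned object cannot be freed before the fault handler drops it. A generic userspace model of that lookup-then-get pattern; the names here are illustrative, not the panfrost API:

    #include <stdatomic.h>
    #include <stdio.h>

    struct obj { atomic_int refcount; int id; };

    /* Under the owner's lock: find the object AND take a reference before
     * the lock drops, so the caller cannot see it freed underneath. */
    static struct obj *lookup_get(struct obj *candidate, int wanted)
    {
        if (!candidate || candidate->id != wanted)
            return NULL;
        atomic_fetch_add(&candidate->refcount, 1); /* drm_gem_object_get() */
        return candidate;
    }

    static void obj_put(struct obj *o)
    {
        if (atomic_fetch_sub(&o->refcount, 1) == 1)
            printf("last ref dropped, object %d freed\n", o->id);
    }

    int main(void)
    {
        struct obj bo = { 1, 42 };  /* creator's reference */
        struct obj *found = lookup_get(&bo, 42);

        if (found) {
            /* ... fault handling runs here with the object pinned ... */
            obj_put(found);         /* drm_gem_object_put_unlocked() */
        }
        obj_put(&bo);
        return 0;
    }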
 
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c
index 431e6b6..d0bc91e 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -324,8 +324,39 @@
 static int radeon_pci_probe(struct pci_dev *pdev,
 			    const struct pci_device_id *ent)
 {
+	unsigned long flags = 0;
 	int ret;
 
+	if (!ent)
+		return -ENODEV; /* Avoid NULL-ptr deref in drm_get_pci_dev */
+
+	flags = ent->driver_data;
+
+	if (!radeon_si_support) {
+		switch (flags & RADEON_FAMILY_MASK) {
+		case CHIP_TAHITI:
+		case CHIP_PITCAIRN:
+		case CHIP_VERDE:
+		case CHIP_OLAND:
+		case CHIP_HAINAN:
+			dev_info(&pdev->dev,
+				 "SI support disabled by module param\n");
+			return -ENODEV;
+		}
+	}
+	if (!radeon_cik_support) {
+		switch (flags & RADEON_FAMILY_MASK) {
+		case CHIP_KAVERI:
+		case CHIP_BONAIRE:
+		case CHIP_HAWAII:
+		case CHIP_KABINI:
+		case CHIP_MULLINS:
+			dev_info(&pdev->dev,
+				 "CIK support disabled by module param\n");
+			return -ENODEV;
+		}
+	}
+
 	if (vga_switcheroo_client_probe_defer(pdev))
 		return -EPROBE_DEFER;
 
diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
index 4cf58db..b2b0766 100644
--- a/drivers/gpu/drm/radeon/radeon_gem.c
+++ b/drivers/gpu/drm/radeon/radeon_gem.c
@@ -296,6 +296,8 @@
 	uint32_t handle;
 	int r;
 
+	args->addr = untagged_addr(args->addr);
+
 	if (offset_in_page(args->addr | args->size))
 		return -EINVAL;
 
diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c
index 07f7ace..e85c554 100644
--- a/drivers/gpu/drm/radeon/radeon_kms.c
+++ b/drivers/gpu/drm/radeon/radeon_kms.c
@@ -100,31 +100,6 @@
 	struct radeon_device *rdev;
 	int r, acpi_status;
 
-	if (!radeon_si_support) {
-		switch (flags & RADEON_FAMILY_MASK) {
-		case CHIP_TAHITI:
-		case CHIP_PITCAIRN:
-		case CHIP_VERDE:
-		case CHIP_OLAND:
-		case CHIP_HAINAN:
-			dev_info(dev->dev,
-				 "SI support disabled by module param\n");
-			return -ENODEV;
-		}
-	}
-	if (!radeon_cik_support) {
-		switch (flags & RADEON_FAMILY_MASK) {
-		case CHIP_KAVERI:
-		case CHIP_BONAIRE:
-		case CHIP_HAWAII:
-		case CHIP_KABINI:
-		case CHIP_MULLINS:
-			dev_info(dev->dev,
-				 "CIK support disabled by module param\n");
-			return -ENODEV;
-		}
-	}
-
 	rdev = kzalloc(sizeof(struct radeon_device), GFP_KERNEL);
 	if (rdev == NULL) {
 		return -ENOMEM;
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c
index 2f821c5..613404f 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c
@@ -39,8 +39,6 @@
 #include "rockchip_drm_vop.h"
 #include "rockchip_rgb.h"
 
-#define VOP_SELF_REFRESH_ENTRY_DELAY_MS 100
-
 #define VOP_WIN_SET(vop, win, name, v) \
 		vop_reg_set(vop, &win->phy->name, win->base, ~0, v, #name)
 #define VOP_SCL_SET(vop, win, name, v) \
@@ -1563,8 +1561,7 @@
 	init_completion(&vop->line_flag_completion);
 	crtc->port = port;
 
-	ret = drm_self_refresh_helper_init(crtc,
-					   VOP_SELF_REFRESH_ENTRY_DELAY_MS);
+	ret = drm_self_refresh_helper_init(crtc);
 	if (ret)
 		DRM_DEV_DEBUG_KMS(vop->dev,
 			"Failed to init %s with SR helpers %d, ignoring\n",
diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c
index feaa538..3db000a 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -174,7 +174,6 @@
 static void
 via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg)
 {
-	struct page *page;
 	int i;
 
 	switch (vsg->state) {
@@ -189,13 +188,8 @@
 		kfree(vsg->desc_pages);
 		/* fall through */
 	case dr_via_pages_locked:
-		for (i = 0; i < vsg->num_pages; ++i) {
-			if (NULL != (page = vsg->pages[i])) {
-				if (!PageReserved(page) && (DMA_FROM_DEVICE == vsg->direction))
-					SetPageDirty(page);
-				put_page(page);
-			}
-		}
+		put_user_pages_dirty_lock(vsg->pages, vsg->num_pages,
+					  (vsg->direction == DMA_FROM_DEVICE));
 		/* fall through */
 	case dr_via_pages_alloc:
 		vfree(vsg->pages);
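The removed loop dirtied every page written by a DMA_FROM_DEVICE transfer before unpinning it; put_user_pages_dirty_lock() centralizes that. A userspace model of the semantics the helper is assumed to preserve, with stand-in page state:

    #include <stdbool.h>
    #include <stdio.h>

    struct page { bool reserved; bool dirty; int pins; };

    /* Model of put_user_pages_dirty_lock(pages, npages, make_dirty): the
     * device wrote into the pages, so mark each one dirty before its
     * get_user_pages() pin is dropped. */
    static void put_user_pages_model(struct page **pages, unsigned long npages,
                                     bool make_dirty)
    {
        for (unsigned long i = 0; i < npages; i++) {
            if (!pages[i])
                continue;
            if (make_dirty && !pages[i]->reserved)
                pages[i]->dirty = true; /* SetPageDirty() in the old loop */
            pages[i]->pins--;           /* put_page() */
        }
    }

    int main(void)
    {
        struct page p = { false, false, 1 };
        struct page *vec[] = { &p };

        put_user_pages_model(vec, 1, true); /* direction == DMA_FROM_DEVICE */
        printf("dirty=%d pins=%d\n", p.dirty, p.pins);
        return 0;
    }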
diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig
index a958b96..1ecb512 100644
--- a/drivers/hid/Kconfig
+++ b/drivers/hid/Kconfig
@@ -273,6 +273,15 @@
 	and gpiochip to expose these functions of the CP2112. The
 	customizable USB descriptor fields are exposed as sysfs attributes.
 
+config HID_CREATIVE_SB0540
+	tristate "Creative SB0540 infrared receiver"
+	depends on USB_HID
+	help
+	Support for Creative infrared SB0540-compatible remote controls, such
+	as the RM-1500 and RM-1800 remotes.
+
+	Say Y here if you want support for the Creative SB0540 infrared receiver.
+
 config HID_CYPRESS
 	tristate "Cypress mouse and barcode readers"
 	depends on HID
diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile
index cc5d827..0c03308 100644
--- a/drivers/hid/Makefile
+++ b/drivers/hid/Makefile
@@ -27,6 +27,7 @@
 obj-$(CONFIG_HID_ACRUX)		+= hid-axff.o
 obj-$(CONFIG_HID_APPLE)		+= hid-apple.o
 obj-$(CONFIG_HID_APPLEIR)	+= hid-appleir.o
+obj-$(CONFIG_HID_CREATIVE_SB0540)	+= hid-creative-sb0540.o
 obj-$(CONFIG_HID_ASUS)		+= hid-asus.o
 obj-$(CONFIG_HID_AUREAL)	+= hid-aureal.o
 obj-$(CONFIG_HID_BELKIN)	+= hid-belkin.o
diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c
index 81df62f..6ac8bec 100644
--- a/drivers/hid/hid-apple.c
+++ b/drivers/hid/hid-apple.c
@@ -54,7 +54,6 @@
 struct apple_sc {
 	unsigned long quirks;
 	unsigned int fn_on;
-	DECLARE_BITMAP(pressed_fn, KEY_CNT);
 	DECLARE_BITMAP(pressed_numlock, KEY_CNT);
 };
 
@@ -181,6 +180,8 @@
 {
 	struct apple_sc *asc = hid_get_drvdata(hid);
 	const struct apple_key_translation *trans, *table;
+	bool do_translate;
+	u16 code = 0;
 
 	if (usage->code == KEY_FN) {
 		asc->fn_on = !!value;
@@ -189,8 +190,6 @@
 	}
 
 	if (fnmode) {
-		int do_translate;
-
 		if (hid->product >= USB_DEVICE_ID_APPLE_WELLSPRING4_ANSI &&
 				hid->product <= USB_DEVICE_ID_APPLE_WELLSPRING4A_JIS)
 			table = macbookair_fn_keys;
@@ -202,25 +201,33 @@
 		trans = apple_find_translation (table, usage->code);
 
 		if (trans) {
-			if (test_bit(usage->code, asc->pressed_fn))
-				do_translate = 1;
-			else if (trans->flags & APPLE_FLAG_FKEY)
-				do_translate = (fnmode == 2 && asc->fn_on) ||
-					(fnmode == 1 && !asc->fn_on);
-			else
-				do_translate = asc->fn_on;
+			if (test_bit(trans->from, input->key))
+				code = trans->from;
+			else if (test_bit(trans->to, input->key))
+				code = trans->to;
 
-			if (do_translate) {
-				if (value)
-					set_bit(usage->code, asc->pressed_fn);
-				else
-					clear_bit(usage->code, asc->pressed_fn);
+			if (!code) {
+				if (trans->flags & APPLE_FLAG_FKEY) {
+					switch (fnmode) {
+					case 1:
+						do_translate = !asc->fn_on;
+						break;
+					case 2:
+						do_translate = asc->fn_on;
+						break;
+					default:
+						/* should never happen */
+						do_translate = false;
+					}
+				} else {
+					do_translate = asc->fn_on;
+				}
 
-				input_event(input, usage->type, trans->to,
-						value);
-
-				return 1;
+				code = do_translate ? trans->to : trans->from;
 			}
+
+			input_event(input, usage->type, code, value);
+			return 1;
 		}
 
 		if (asc->quirks & APPLE_NUMLOCK_EMULATION &&
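The rewritten translation picks the event code from whichever of trans->from/trans->to is currently held in input->key, so a key pressed before the Fn state flips still releases with the code it was pressed with. A standalone model of the selection logic, with the fnmode handling condensed to the F-key case and the state passed in explicitly:

    #include <stdbool.h>
    #include <stdio.h>

    /* Pick which event code to emit for one translated key. */
    static int pick_code(int from, int to, bool from_held, bool to_held,
                         int fnmode, bool fn_on)
    {
        /* A key already held keeps its code so its release matches. */
        if (from_held)
            return from;
        if (to_held)
            return to;
        /* Fresh press: translate according to fnmode and the Fn key. */
        bool translate = (fnmode == 2) ? fn_on : !fn_on; /* F-key row */
        return translate ? to : from;
    }

    int main(void)
    {
        /* fnmode=1: press F1 without Fn -> media code; Fn flips mid-hold,
         * but the held bit still routes the release to the same code. */
        int press   = pick_code(59 /*KEY_F1*/, 224 /*KEY_BRIGHTNESSDOWN*/,
                                false, false, 1, false);
        int release = pick_code(59, 224, false, true, 1, true);

        printf("press emits %d, release emits %d\n", press, release);
        return 0;
    }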
diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 210b81a..3eaee2c 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -1311,8 +1311,8 @@
 			unsigned offset, unsigned n)
 {
 	if (n > 32) {
-		hid_warn(hid, "hid_field_extract() called with n (%d) > 32! (%s)\n",
-			 n, current->comm);
+		hid_warn_once(hid, "%s() called with n (%d) > 32! (%s)\n",
+			      __func__, n, current->comm);
 		n = 32;
 	}
 
diff --git a/drivers/hid/hid-cougar.c b/drivers/hid/hid-cougar.c
index e0bb7b3..4ff3bc1 100644
--- a/drivers/hid/hid-cougar.c
+++ b/drivers/hid/hid-cougar.c
@@ -207,7 +207,7 @@
 	error = hid_parse(hdev);
 	if (error) {
 		hid_err(hdev, "parse failed\n");
-		goto fail;
+		return error;
 	}
 
 	if (hdev->collection->usage == COUGAR_VENDOR_USAGE) {
@@ -219,7 +219,7 @@
 	error = hid_hw_start(hdev, connect_mask);
 	if (error) {
 		hid_err(hdev, "hw start failed\n");
-		goto fail;
+		return error;
 	}
 
 	error = cougar_bind_shared_data(hdev, cougar);
@@ -249,8 +249,6 @@
 
 fail_stop_and_cleanup:
 	hid_hw_stop(hdev);
-fail:
-	hid_set_drvdata(hdev, NULL);
 	return error;
 }
 
diff --git a/drivers/hid/hid-creative-sb0540.c b/drivers/hid/hid-creative-sb0540.c
new file mode 100644
index 0000000..b4c8e7a
--- /dev/null
+++ b/drivers/hid/hid-creative-sb0540.c
@@ -0,0 +1,268 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HID driver for the Creative SB0540 receiver
+ *
+ * Copyright (C) 2019 Red Hat Inc. All Rights Reserved
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/hid.h>
+#include <linux/module.h>
+#include "hid-ids.h"
+
+MODULE_AUTHOR("Bastien Nocera <hadess@hadess.net>");
+MODULE_DESCRIPTION("HID Creative SB0540 receiver");
+MODULE_LICENSE("GPL");
+
+static const unsigned short creative_sb0540_key_table[] = {
+	KEY_POWER,
+	KEY_RESERVED,		/* text: 24bit */
+	KEY_RESERVED,		/* 24bit wheel up */
+	KEY_RESERVED,		/* 24bit wheel down */
+	KEY_RESERVED,		/* text: CMSS */
+	KEY_RESERVED,		/* CMSS wheel Up */
+	KEY_RESERVED,		/* CMSS wheel Down */
+	KEY_RESERVED,		/* text: EAX */
+	KEY_RESERVED,		/* EAX wheel up */
+	KEY_RESERVED,		/* EAX wheel down */
+	KEY_RESERVED,		/* text: 3D Midi */
+	KEY_RESERVED,		/* 3D Midi wheel up */
+	KEY_RESERVED,		/* 3D Midi wheel down */
+	KEY_MUTE,
+	KEY_VOLUMEUP,
+	KEY_VOLUMEDOWN,
+	KEY_UP,
+	KEY_LEFT,
+	KEY_RIGHT,
+	KEY_REWIND,
+	KEY_OK,
+	KEY_FASTFORWARD,
+	KEY_DOWN,
+	KEY_AGAIN,		/* text: Return, symbol: Jump to */
+	KEY_PLAY,		/* text: Start */
+	KEY_ESC,		/* text: Cancel */
+	KEY_RECORD,
+	KEY_OPTION,
+	KEY_MENU,		/* text: Display */
+	KEY_PREVIOUS,
+	KEY_PLAYPAUSE,
+	KEY_NEXT,
+	KEY_SLOW,
+	KEY_STOP,
+	KEY_NUMERIC_1,
+	KEY_NUMERIC_2,
+	KEY_NUMERIC_3,
+	KEY_NUMERIC_4,
+	KEY_NUMERIC_5,
+	KEY_NUMERIC_6,
+	KEY_NUMERIC_7,
+	KEY_NUMERIC_8,
+	KEY_NUMERIC_9,
+	KEY_NUMERIC_0
+};
+
+/*
+ * Codes and keys from lirc's
+ * remotes/creative/lircd.conf.alsa_usb
+ * order and size must match creative_sb0540_key_table[] above
+ */
+static const unsigned short creative_sb0540_codes[] = {
+	0x619E,
+	0x916E,
+	0x926D,
+	0x936C,
+	0x718E,
+	0x946B,
+	0x956A,
+	0x8C73,
+	0x9669,
+	0x9768,
+	0x9867,
+	0x9966,
+	0x9A65,
+	0x6E91,
+	0x629D,
+	0x639C,
+	0x7B84,
+	0x6B94,
+	0x728D,
+	0x8778,
+	0x817E,
+	0x758A,
+	0x8D72,
+	0x8E71,
+	0x8877,
+	0x7C83,
+	0x738C,
+	0x827D,
+	0x7689,
+	0x7F80,
+	0x7986,
+	0x7A85,
+	0x7D82,
+	0x857A,
+	0x8B74,
+	0x8F70,
+	0x906F,
+	0x8A75,
+	0x847B,
+	0x7887,
+	0x8976,
+	0x837C,
+	0x7788,
+	0x807F
+};
+
+struct creative_sb0540 {
+	struct input_dev *input_dev;
+	struct hid_device *hid;
+	unsigned short keymap[ARRAY_SIZE(creative_sb0540_key_table)];
+};
+
+static inline u64 reverse(u64 data, int bits)
+{
+	int i;
+	u64 c;
+
+	c = 0;
+	for (i = 0; i < bits; i++) {
+		c |= (u64) (((data & (((u64) 1) << i)) ? 1 : 0))
+			<< (bits - 1 - i);
+	}
+	return (c);
+}
+
+static int get_key(struct creative_sb0540 *creative_sb0540, u64 keycode)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(creative_sb0540_codes); i++) {
+		if (creative_sb0540_codes[i] == keycode)
+			return creative_sb0540->keymap[i];
+	}
+
+	return 0;
+}
+
+static int creative_sb0540_raw_event(struct hid_device *hid,
+	struct hid_report *report, u8 *data, int len)
+{
+	struct creative_sb0540 *creative_sb0540 = hid_get_drvdata(hid);
+	u64 code, main_code;
+	int key;
+
+	if (len != 6)
+		return 0;
+
+	/* From daemons/hw_hiddev.c sb0540_rec() in lirc */
+	code = reverse(data[5], 8);
+	main_code = (code << 8) + ((~code) & 0xff);
+
+	/*
+	 * Flip to get values in the same format as
+	 * remotes/creative/lircd.conf.alsa_usb in lirc
+	 */
+	main_code = ((main_code & 0xff) << 8) +
+		((main_code & 0xff00) >> 8);
+
+	key = get_key(creative_sb0540, main_code);
+	if (key == 0 || key == KEY_RESERVED) {
+		hid_err(hid, "Could not get a key for main_code %llX\n",
+			main_code);
+		return 0;
+	}
+
+	input_report_key(creative_sb0540->input_dev, key, 1);
+	input_report_key(creative_sb0540->input_dev, key, 0);
+	input_sync(creative_sb0540->input_dev);
+
+	/* let hidraw and hiddev handle the report */
+	return 0;
+}
+
+static int creative_sb0540_input_configured(struct hid_device *hid,
+		struct hid_input *hidinput)
+{
+	struct input_dev *input_dev = hidinput->input;
+	struct creative_sb0540 *creative_sb0540 = hid_get_drvdata(hid);
+	int i;
+
+	creative_sb0540->input_dev = input_dev;
+
+	input_dev->keycode = creative_sb0540->keymap;
+	input_dev->keycodesize = sizeof(unsigned short);
+	input_dev->keycodemax = ARRAY_SIZE(creative_sb0540->keymap);
+
+	input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP);
+
+	memcpy(creative_sb0540->keymap, creative_sb0540_key_table,
+		sizeof(creative_sb0540->keymap));
+	for (i = 0; i < ARRAY_SIZE(creative_sb0540_key_table); i++)
+		set_bit(creative_sb0540->keymap[i], input_dev->keybit);
+	clear_bit(KEY_RESERVED, input_dev->keybit);
+
+	return 0;
+}
+
+static int creative_sb0540_input_mapping(struct hid_device *hid,
+		struct hid_input *hi, struct hid_field *field,
+		struct hid_usage *usage, unsigned long **bit, int *max)
+{
+	/*
+	 * We are remapping the keys ourselves, so ignore the hid-input
+	 * keymap processing.
+	 */
+	return -1;
+}
+
+static int creative_sb0540_probe(struct hid_device *hid,
+		const struct hid_device_id *id)
+{
+	int ret;
+	struct creative_sb0540 *creative_sb0540;
+
+	creative_sb0540 = devm_kzalloc(&hid->dev,
+		sizeof(struct creative_sb0540), GFP_KERNEL);
+
+	if (!creative_sb0540)
+		return -ENOMEM;
+
+	creative_sb0540->hid = hid;
+
+	/* force input as some remotes bypass the input registration */
+	hid->quirks |= HID_QUIRK_HIDINPUT_FORCE;
+
+	hid_set_drvdata(hid, creative_sb0540);
+
+	ret = hid_parse(hid);
+	if (ret) {
+		hid_err(hid, "parse failed\n");
+		return ret;
+	}
+
+	ret = hid_hw_start(hid, HID_CONNECT_DEFAULT);
+	if (ret) {
+		hid_err(hid, "hw start failed\n");
+		return ret;
+	}
+
+	return ret;
+}
+
+static const struct hid_device_id creative_sb0540_devices[] = {
+	{ HID_USB_DEVICE(USB_VENDOR_ID_CREATIVELABS, USB_DEVICE_ID_CREATIVE_SB0540) },
+	{ }
+};
+MODULE_DEVICE_TABLE(hid, creative_sb0540_devices);
+
+static struct hid_driver creative_sb0540_driver = {
+	.name = "creative-sb0540",
+	.id_table = creative_sb0540_devices,
+	.raw_event = creative_sb0540_raw_event,
+	.input_configured = creative_sb0540_input_configured,
+	.probe = creative_sb0540_probe,
+	.input_mapping = creative_sb0540_input_mapping,
+};
+module_hid_driver(creative_sb0540_driver);
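The raw-event arithmetic is easiest to follow with one worked byte. This standalone program reproduces reverse() and the lirc-format byte swap for the first table entry, showing how data[5] == 0x79 decodes to 0x619E, i.e. KEY_POWER:

    #include <stdint.h>
    #include <stdio.h>

    /* Bit-reverse the low `bits` bits, as the driver's reverse() does. */
    static uint64_t reverse(uint64_t data, int bits)
    {
        uint64_t c = 0;
        for (int i = 0; i < bits; i++)
            c |= ((data >> i) & 1) << (bits - 1 - i);
        return c;
    }

    int main(void)
    {
        uint8_t data5 = 0x79;               /* last byte of the 6-byte report */
        uint64_t code = reverse(data5, 8);  /* 0x9E */
        uint64_t main_code = (code << 8) + ((~code) & 0xff); /* 0x9E61 */

        /* Byte-swap into the lircd.conf.alsa_usb format. */
        main_code = ((main_code & 0xff) << 8) + ((main_code & 0xff00) >> 8);

        printf("data[5]=%#x -> main_code=%#llx (0x619E == KEY_POWER)\n",
               data5, (unsigned long long)main_code);
        return 0;
    }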
diff --git a/drivers/hid/hid-gfrm.c b/drivers/hid/hid-gfrm.c
index 86c3173..699186f 100644
--- a/drivers/hid/hid-gfrm.c
+++ b/drivers/hid/hid-gfrm.c
@@ -123,12 +123,6 @@
 	return ret;
 }
 
-static void gfrm_remove(struct hid_device *hdev)
-{
-	hid_hw_stop(hdev);
-	hid_set_drvdata(hdev, NULL);
-}
-
 static const struct hid_device_id gfrm_devices[] = {
 	{ HID_BLUETOOTH_DEVICE(0x58, 0x2000),
 		.driver_data = GFRM100 },
@@ -142,7 +136,6 @@
 	.name = "gfrm",
 	.id_table = gfrm_devices,
 	.probe = gfrm_probe,
-	.remove = gfrm_remove,
 	.input_mapping = gfrm_input_mapping,
 	.raw_event = gfrm_raw_event,
 	.input_configured = gfrm_input_configured,
diff --git a/drivers/hid/hid-hyperv.c b/drivers/hid/hid-hyperv.c
index 7795831..cc5b09b8 100644
--- a/drivers/hid/hid-hyperv.c
+++ b/drivers/hid/hid-hyperv.c
@@ -104,8 +104,8 @@
 
 #pragma pack(pop)
 
-#define INPUTVSC_SEND_RING_BUFFER_SIZE		(10*PAGE_SIZE)
-#define INPUTVSC_RECV_RING_BUFFER_SIZE		(10*PAGE_SIZE)
+#define INPUTVSC_SEND_RING_BUFFER_SIZE		(40 * 1024)
+#define INPUTVSC_RECV_RING_BUFFER_SIZE		(40 * 1024)
 
 
 enum pipe_prot_msg_type {
diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h
index 0a00be1..76969a2 100644
--- a/drivers/hid/hid-ids.h
+++ b/drivers/hid/hid-ids.h
@@ -314,6 +314,7 @@
 #define USB_VENDOR_ID_CREATIVELABS	0x041e
 #define USB_DEVICE_ID_CREATIVE_SB_OMNI_SURROUND_51	0x322c
 #define USB_DEVICE_ID_PRODIKEYS_PCMIDI	0x2801
+#define USB_DEVICE_ID_CREATIVE_SB0540	0x3100
 
 #define USB_VENDOR_ID_CVTOUCH		0x1ff7
 #define USB_DEVICE_ID_CVTOUCH_SCREEN	0x0013
@@ -568,6 +569,7 @@
 #define USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0B4A	0x0b4a
 #define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE		0x134a
 #define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_094A	0x094a
+#define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0941	0x0941
 #define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0641	0x0641
 
 #define USB_VENDOR_ID_HUION		0x256c
@@ -769,7 +771,8 @@
 #define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER		0xc52f
 #define USB_DEVICE_ID_LOGITECH_UNIFYING_RECEIVER_2	0xc532
 #define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_2		0xc534
-#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED	0xc539
+#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1	0xc539
+#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_1	0xc53f
 #define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_POWERPLAY	0xc53a
 #define USB_DEVICE_ID_SPACETRAVELLER	0xc623
 #define USB_DEVICE_ID_SPACENAVIGATOR	0xc626
diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c
index 364bc7f..96fa2a2 100644
--- a/drivers/hid/hid-lenovo.c
+++ b/drivers/hid/hid-lenovo.c
@@ -866,8 +866,6 @@
 
 	led_classdev_unregister(&data_pointer->led_micmute);
 	led_classdev_unregister(&data_pointer->led_mute);
-
-	hid_set_drvdata(hdev, NULL);
 }
 
 static void lenovo_remove_cptkbd(struct hid_device *hdev)
diff --git a/drivers/hid/hid-lg.c b/drivers/hid/hid-lg.c
index 5008a3dc..0dc7cdf 100644
--- a/drivers/hid/hid-lg.c
+++ b/drivers/hid/hid-lg.c
@@ -818,7 +818,7 @@
 
 		if (!buf) {
 			ret = -ENOMEM;
-			goto err_free;
+			goto err_stop;
 		}
 
 		ret = hid_hw_raw_request(hdev, buf[0], buf, sizeof(cbuf),
@@ -850,9 +850,12 @@
 		ret = lg4ff_init(hdev);
 
 	if (ret)
-		goto err_free;
+		goto err_stop;
 
 	return 0;
+
+err_stop:
+	hid_hw_stop(hdev);
 err_free:
 	kfree(drv_data);
 	return ret;
@@ -863,8 +866,7 @@
 	struct lg_drv_data *drv_data = hid_get_drvdata(hdev);
 	if (drv_data->quirks & LG_FF4)
 		lg4ff_deinit(hdev);
-	else
-		hid_hw_stop(hdev);
+	hid_hw_stop(hdev);
 	kfree(drv_data);
 }
 
diff --git a/drivers/hid/hid-lg4ff.c b/drivers/hid/hid-lg4ff.c
index cefba03..03f0220 100644
--- a/drivers/hid/hid-lg4ff.c
+++ b/drivers/hid/hid-lg4ff.c
@@ -1477,7 +1477,6 @@
 		}
 	}
 #endif
-	hid_hw_stop(hid);
 	drv_data->device_props = NULL;
 
 	kfree(entry);
diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c
index cc47f94..bb50d6e 100644
--- a/drivers/hid/hid-logitech-dj.c
+++ b/drivers/hid/hid-logitech-dj.c
@@ -380,9 +380,9 @@
 	0x75, 0x10,		/* REPORT_SIZE (16)                    */
 	0x95, 0x02,		/* REPORT_COUNT (2)                    */
 	0x15, 0x01,		/* LOGICAL_MIN (1)                     */
-	0x26, 0x8C, 0x02,	/* LOGICAL_MAX (652)                   */
+	0x26, 0xFF, 0x02,	/* LOGICAL_MAX (767)                   */
 	0x19, 0x01,		/* USAGE_MIN (1)                       */
-	0x2A, 0x8C, 0x02,	/* USAGE_MAX (652)                     */
+	0x2A, 0xFF, 0x02,	/* USAGE_MAX (767)                     */
 	0x81, 0x00,		/* INPUT (Data Ary Abs)                */
 	0xC0,			/* END_COLLECTION                      */
 };				/*                                     */
@@ -959,6 +959,7 @@
 		break;
 	case 0x07:
 		device_type = "eQUAD step 4 Gaming";
+		logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem);
 		break;
 	case 0x08:
 		device_type = "eQUAD step 4 for gamepads";
@@ -968,7 +969,12 @@
 		logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem);
 		break;
 	case 0x0c:
-		device_type = "eQUAD Lightspeed";
+		device_type = "eQUAD Lightspeed 1";
+		logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem);
+		workitem.reports_supported |= STD_KEYBOARD;
+		break;
+	case 0x0d:
+		device_type = "eQUAD Lightspeed 1_1";
 		logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem);
 		workitem.reports_supported |= STD_KEYBOARD;
 		break;
@@ -1734,14 +1740,14 @@
 		if (retval < 0) {
 			hid_err(hdev, "%s: logi_dj_recv_query_paired_devices error:%d\n",
 				__func__, retval);
-			goto logi_dj_recv_query_paired_devices_failed;
+			/*
+			 * This can happen with a KVM; let the probe succeed,
+			 * as logi_dj_recv_queue_unknown_work will retry later.
+			 */
 		}
 	}
 
-	return retval;
-
-logi_dj_recv_query_paired_devices_failed:
-	hid_hw_close(hdev);
+	return 0;
 
 llopen_failed:
 switch_to_dj_mode_fail:
@@ -1832,9 +1838,17 @@
 	  HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH,
 			 USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_2),
 	 .driver_data = recvr_type_hidpp},
+	{ /* Logitech G700(s) receiver (0xc531) */
+	  HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH,
+		0xc531),
+	 .driver_data = recvr_type_gaming_hidpp},
 	{ /* Logitech lightspeed receiver (0xc539) */
 	  HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH,
-		USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED),
+		USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1),
+	 .driver_data = recvr_type_gaming_hidpp},
+	{ /* Logitech lightspeed receiver (0xc53f) */
+	  HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH,
+		USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_1),
 	 .driver_data = recvr_type_gaming_hidpp},
 	{ /* Logitech 27 MHz HID++ 1.0 receiver (0xc513) */
 	  HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_MX3000_RECEIVER),
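The descriptor edits are HID short items: 0x26 is a 2-byte Logical Maximum and 0x2A a 2-byte Usage Maximum, so the change widens the consumer-usage range from 652 (0x028C) to 767 (0x02FF). Decoding one of the new items by hand, as a sketch of the short-item layout rather than a full parser:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* HID short item: bits 7..4 = bTag, 3..2 = bType, 1..0 = bSize. */
        const uint8_t item[] = { 0x26, 0xFF, 0x02 }; /* from the new descriptor */

        unsigned tag   = item[0] >> 4;        /* 0x2 */
        unsigned type  = (item[0] >> 2) & 3;  /* 1 = Global: Logical Maximum */
        unsigned size  = item[0] & 3;         /* 2 data bytes follow */
        unsigned value = item[1] | (item[2] << 8); /* little endian */

        printf("tag=%u type=%u size=%u value=%u\n", tag, type, size, value);
        /* prints tag=2 type=1 size=2 value=767 */
        return 0;
    }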
diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c
index b603c14..3cfeb16 100644
--- a/drivers/hid/hid-multitouch.c
+++ b/drivers/hid/hid-multitouch.c
@@ -68,6 +68,7 @@
 #define MT_QUIRK_STICKY_FINGERS		BIT(16)
 #define MT_QUIRK_ASUS_CUSTOM_UP		BIT(17)
 #define MT_QUIRK_WIN8_PTP_BUTTONS	BIT(18)
+#define MT_QUIRK_SEPARATE_APP_REPORT	BIT(19)
 
 #define MT_INPUTMODE_TOUCHSCREEN	0x02
 #define MT_INPUTMODE_TOUCHPAD		0x03
@@ -103,6 +104,7 @@
 struct mt_application {
 	struct list_head list;
 	unsigned int application;
+	unsigned int report_id;
 	struct list_head mt_usages;	/* mt usages list */
 
 	__s32 quirks;
@@ -203,6 +205,7 @@
 #define MT_CLS_VTL				0x0110
 #define MT_CLS_GOOGLE				0x0111
 #define MT_CLS_RAZER_BLADE_STEALTH		0x0112
+#define MT_CLS_SMART_TECH			0x0113
 
 #define MT_DEFAULT_MAXCONTACT	10
 #define MT_MAX_MAXCONTACT	250
@@ -263,7 +266,8 @@
 			MT_QUIRK_HOVERING |
 			MT_QUIRK_CONTACT_CNT_ACCURATE |
 			MT_QUIRK_STICKY_FINGERS |
-			MT_QUIRK_WIN8_PTP_BUTTONS },
+			MT_QUIRK_WIN8_PTP_BUTTONS,
+		.export_all_inputs = true },
 	{ .name = MT_CLS_EXPORT_ALL_INPUTS,
 		.quirks = MT_QUIRK_ALWAYS_VALID |
 			MT_QUIRK_CONTACT_CNT_ACCURATE,
@@ -353,6 +357,12 @@
 			MT_QUIRK_CONTACT_CNT_ACCURATE |
 			MT_QUIRK_WIN8_PTP_BUTTONS,
 	},
+	{ .name = MT_CLS_SMART_TECH,
+		.quirks = MT_QUIRK_ALWAYS_VALID |
+			MT_QUIRK_IGNORE_DUPLICATES |
+			MT_QUIRK_CONTACT_CNT_ACCURATE |
+			MT_QUIRK_SEPARATE_APP_REPORT,
+	},
 	{ }
 };
 
@@ -509,8 +519,9 @@
 }
 
 static struct mt_application *mt_allocate_application(struct mt_device *td,
-						      unsigned int application)
+						      struct hid_report *report)
 {
+	unsigned int application = report->application;
 	struct mt_application *mt_application;
 
 	mt_application = devm_kzalloc(&td->hdev->dev, sizeof(*mt_application),
@@ -535,6 +546,7 @@
 	mt_application->scantime = DEFAULT_ZERO;
 	mt_application->raw_cc = DEFAULT_ZERO;
 	mt_application->quirks = td->mtclass.quirks;
+	mt_application->report_id = report->id;
 
 	list_add_tail(&mt_application->list, &td->applications);
 
@@ -542,19 +554,23 @@
 }
 
 static struct mt_application *mt_find_application(struct mt_device *td,
-						  unsigned int application)
+						  struct hid_report *report)
 {
+	unsigned int application = report->application;
 	struct mt_application *tmp, *mt_application = NULL;
 
 	list_for_each_entry(tmp, &td->applications, list) {
 		if (application == tmp->application) {
-			mt_application = tmp;
-			break;
+			if (!(td->mtclass.quirks & MT_QUIRK_SEPARATE_APP_REPORT) ||
+			    tmp->report_id == report->id) {
+				mt_application = tmp;
+				break;
+			}
 		}
 	}
 
 	if (!mt_application)
-		mt_application = mt_allocate_application(td, application);
+		mt_application = mt_allocate_application(td, report);
 
 	return mt_application;
 }
@@ -571,7 +587,7 @@
 		return NULL;
 
 	rdata->report = report;
-	rdata->application = mt_find_application(td, report->application);
+	rdata->application = mt_find_application(td, report);
 
 	if (!rdata->application) {
 		devm_kfree(&td->hdev->dev, rdata);
@@ -1561,6 +1577,9 @@
 	case HID_VD_ASUS_CUSTOM_MEDIA_KEYS:
 		suffix = "Custom Media Keys";
 		break;
+	case HID_DG_PEN:
+		suffix = "Stylus";
+		break;
 	default:
 		suffix = "UNKNOWN";
 		break;
@@ -2022,6 +2041,10 @@
 		HID_DEVICE(BUS_I2C, HID_GROUP_MULTITOUCH_WIN_8,
 			USB_VENDOR_ID_SYNAPTICS, 0x8323) },
 
+	/* Smart Tech panels */
+	{ .driver_data = MT_CLS_SMART_TECH,
+		MT_USB_DEVICE(0x0b8c, 0x0092)},
+
 	/* Stantum panels */
 	{ .driver_data = MT_CLS_CONFIDENCE,
 		MT_USB_DEVICE(USB_VENDOR_ID_STANTUM_STM,
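With MT_QUIRK_SEPARATE_APP_REPORT the application cache is effectively keyed by (application, report_id) rather than application alone, so panels that spread one usage over several reports keep separate state. A small model of the lookup; the list layout and field names are illustrative, and 0x0d0004 stands for the touchscreen application usage:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct app {
        unsigned application;
        unsigned report_id;
        struct app *next;
    };

    static struct app *find_app(struct app *head, unsigned application,
                                unsigned report_id, bool separate_per_report)
    {
        for (struct app *a = head; a; a = a->next) {
            if (a->application != application)
                continue;
            /* Without the quirk, the first matching application wins;
             * with it, the report id must match too. */
            if (!separate_per_report || a->report_id == report_id)
                return a;
        }
        return NULL; /* caller allocates a new entry, like mt_find_application() */
    }

    int main(void)
    {
        struct app a2 = { 0x0d0004, 2, NULL }; /* same usage, report 2 */
        struct app a1 = { 0x0d0004, 1, &a2 };  /* touchscreen usage, report 1 */

        printf("shared: id=%u\n", find_app(&a1, 0x0d0004, 2, false)->report_id);
        printf("split:  id=%u\n", find_app(&a1, 0x0d0004, 2, true)->report_id);
        return 0;
    }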
diff --git a/drivers/hid/hid-picolcd_core.c b/drivers/hid/hid-picolcd_core.c
index 5f7a39a..1b5c632 100644
--- a/drivers/hid/hid-picolcd_core.c
+++ b/drivers/hid/hid-picolcd_core.c
@@ -534,8 +534,7 @@
 	data = kzalloc(sizeof(struct picolcd_data), GFP_KERNEL);
 	if (data == NULL) {
 		hid_err(hdev, "can't allocate space for Minibox PicoLCD device data\n");
-		error = -ENOMEM;
-		goto err_no_cleanup;
+		return -ENOMEM;
 	}
 
 	spin_lock_init(&data->lock);
@@ -597,9 +596,6 @@
 	hid_hw_stop(hdev);
 err_cleanup_data:
 	kfree(data);
-err_no_cleanup:
-	hid_set_drvdata(hdev, NULL);
-
 	return error;
 }
 
@@ -635,7 +631,6 @@
 	picolcd_exit_cir(data);
 	picolcd_exit_keys(data);
 
-	hid_set_drvdata(hdev, NULL);
 	mutex_destroy(&data->mutex);
 	/* Finally, clean up the picolcd data itself */
 	kfree(data);
diff --git a/drivers/hid/hid-prodikeys.c b/drivers/hid/hid-prodikeys.c
index 21544eb..5a3b3d9 100644
--- a/drivers/hid/hid-prodikeys.c
+++ b/drivers/hid/hid-prodikeys.c
@@ -551,10 +551,14 @@
 
 static int pcmidi_set_operational(struct pcmidi_snd *pm)
 {
+	int rc;
+
 	if (pm->ifnum != 1)
 		return 0; /* only set up ONCE for interface 1 */
 
-	pcmidi_get_output_report(pm);
+	rc = pcmidi_get_output_report(pm);
+	if (rc < 0)
+		return rc;
 	pcmidi_submit_output_report(pm, 0xc1);
 	return 0;
 }
@@ -683,7 +687,11 @@
 	spin_lock_init(&pm->rawmidi_in_lock);
 
 	init_sustain_timers(pm);
-	pcmidi_set_operational(pm);
+	err = pcmidi_set_operational(pm);
+	if (err < 0) {
+		pk_error("failed to find output report\n");
+		goto fail_register;
+	}
 
 	/* register it */
 	err = snd_card_register(card);
diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c
index 166f41f..c50bcd9 100644
--- a/drivers/hid/hid-quirks.c
+++ b/drivers/hid/hid-quirks.c
@@ -92,6 +92,7 @@
 	{ HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0B4A), HID_QUIRK_ALWAYS_POLL },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE), HID_QUIRK_ALWAYS_POLL },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_094A), HID_QUIRK_ALWAYS_POLL },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0941), HID_QUIRK_ALWAYS_POLL },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0641), HID_QUIRK_ALWAYS_POLL },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_IDEACOM, USB_DEVICE_ID_IDEACOM_IDC6680), HID_QUIRK_MULTI_INPUT },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_INNOMEDIA, USB_DEVICE_ID_INNEX_GENESIS_ATARI), HID_QUIRK_MULTI_INPUT },
diff --git a/drivers/hid/hid-sensor-hub.c b/drivers/hid/hid-sensor-hub.c
index be92a6f..94c7398 100644
--- a/drivers/hid/hid-sensor-hub.c
+++ b/drivers/hid/hid-sensor-hub.c
@@ -742,7 +742,6 @@
 	}
 	spin_unlock_irqrestore(&data->lock, flags);
 	mfd_remove_devices(&hdev->dev);
-	hid_set_drvdata(hdev, NULL);
 	mutex_destroy(&data->mutex);
 }
 
diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c
index 49dd2d9..73c0f7a 100644
--- a/drivers/hid/hid-sony.c
+++ b/drivers/hid/hid-sony.c
@@ -2811,7 +2811,6 @@
 	sony_cancel_work_sync(sc);
 	sony_remove_dev_list(sc);
 	sony_release_device_id(sc);
-	hid_hw_stop(hdev);
 	return ret;
 }
 
@@ -2876,6 +2875,7 @@
 	 */
 	if (!(hdev->claimed & HID_CLAIMED_INPUT)) {
 		hid_err(hdev, "failed to claim input\n");
+		hid_hw_stop(hdev);
 		return -ENODEV;
 	}
 
diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c
index 006bd6f..bbc6ec1 100644
--- a/drivers/hid/hidraw.c
+++ b/drivers/hid/hidraw.c
@@ -252,7 +252,7 @@
 
 	poll_wait(file, &list->hidraw->wait, wait);
 	if (list->head != list->tail)
-		return EPOLLIN | EPOLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM | EPOLLOUT;
 	if (!list->hidraw->exist)
 		return EPOLLERR | EPOLLHUP;
 	return 0;
@@ -370,7 +370,7 @@
 
 	mutex_lock(&minors_lock);
 	dev = hidraw_table[minor];
-	if (!dev) {
+	if (!dev || !dev->exist) {
 		ret = -ENODEV;
 		goto out;
 	}
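After this hunk a hidraw poller sees EPOLLOUT reported alongside readability. A minimal user of the new mask; the device path is only an example:

    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/dev/hidraw0", O_RDWR); /* example node */
        if (fd < 0) {
            perror("open");
            return 1;
        }

        struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
        if (poll(&pfd, 1, 1000) > 0) {
            if (pfd.revents & POLLIN)
                printf("report ready to read\n");
            if (pfd.revents & POLLOUT) /* now reported with readability */
                printf("writable\n");
        }
        close(fd);
        return 0;
    }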
diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c
index 90164fe..2a7c6e3 100644
--- a/drivers/hid/i2c-hid/i2c-hid-core.c
+++ b/drivers/hid/i2c-hid/i2c-hid-core.c
@@ -169,9 +169,7 @@
 	__u16 idProduct;
 	__u32 quirks;
 } i2c_hid_quirks[] = {
-	{ USB_VENDOR_ID_WEIDA, USB_DEVICE_ID_WEIDA_8752,
-		I2C_HID_QUIRK_SET_PWR_WAKEUP_DEV },
-	{ USB_VENDOR_ID_WEIDA, USB_DEVICE_ID_WEIDA_8755,
+	{ USB_VENDOR_ID_WEIDA, HID_ANY_ID,
 		I2C_HID_QUIRK_SET_PWR_WAKEUP_DEV },
 	{ I2C_VENDOR_ID_HANTICK, I2C_PRODUCT_ID_HANTICK_5288,
 		I2C_HID_QUIRK_NO_IRQ_AFTER_RESET |
diff --git a/drivers/hid/intel-ish-hid/ipc/hw-ish.h b/drivers/hid/intel-ish-hid/ipc/hw-ish.h
index 5792a10..6c1e611 100644
--- a/drivers/hid/intel-ish-hid/ipc/hw-ish.h
+++ b/drivers/hid/intel-ish-hid/ipc/hw-ish.h
@@ -78,5 +78,6 @@
 struct ishtp_device *ish_dev_init(struct pci_dev *pdev);
 int ish_hw_start(struct ishtp_device *dev);
 void ish_device_disable(struct ishtp_device *dev);
+int ish_disable_dma(struct ishtp_device *dev);
 
 #endif /* _ISHTP_HW_ISH_H_ */
diff --git a/drivers/hid/intel-ish-hid/ipc/ipc.c b/drivers/hid/intel-ish-hid/ipc/ipc.c
index 18fe8af..8f8dfdf 100644
--- a/drivers/hid/intel-ish-hid/ipc/ipc.c
+++ b/drivers/hid/intel-ish-hid/ipc/ipc.c
@@ -672,7 +672,7 @@
  *
  * Return: 0 for success else error code.
  */
-static int ish_disable_dma(struct ishtp_device *dev)
+int ish_disable_dma(struct ishtp_device *dev)
 {
 	unsigned int	dma_delay;
 
diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c
index 279567b..784dcc8 100644
--- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c
+++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c
@@ -14,6 +14,7 @@
 #include <linux/types.h>
 #include <linux/pci.h>
 #include <linux/sched.h>
+#include <linux/suspend.h>
 #include <linux/interrupt.h>
 #include <linux/workqueue.h>
 #define CREATE_TRACE_POINTS
@@ -98,6 +99,11 @@
 	{}
 };
 
+static inline bool ish_should_enter_d0i3(struct pci_dev *pdev)
+{
+	return !pm_suspend_via_firmware() || pdev->device == CHV_DEVICE_ID;
+}
+
 /**
  * ish_probe() - PCI driver probe callback
  * @pdev:	pci device
@@ -148,7 +154,6 @@
 	/* mapping IO device memory */
 	hw->mem_addr = pcim_iomap_table(pdev)[0];
 	ishtp->pdev = pdev;
-	pdev->dev_flags |= PCI_DEV_FLAGS_NO_D3;
 
 	/* request and enable interrupt */
 	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
@@ -185,7 +190,6 @@
 	struct ishtp_device *ishtp_dev = pci_get_drvdata(pdev);
 
 	ishtp_bus_remove_all_clients(ishtp_dev, false);
-	pdev->dev_flags &= ~PCI_DEV_FLAGS_NO_D3;
 	ish_device_disable(ishtp_dev);
 }
 
@@ -207,17 +211,13 @@
 {
 	struct pci_dev *pdev = to_pci_dev(ish_resume_device);
 	struct ishtp_device *dev = pci_get_drvdata(pdev);
-	uint32_t fwsts;
 	int ret;
 
-	/* Get ISH FW status */
-	fwsts = IPC_GET_ISH_FWSTS(dev->ops->get_fw_status(dev));
+	/* Check the NO_D3 flag to distinguish the resume paths */
+	if (pdev->dev_flags & PCI_DEV_FLAGS_NO_D3) {
+		pdev->dev_flags &= ~PCI_DEV_FLAGS_NO_D3;
+		disable_irq_wake(pdev->irq);
 
-	/*
-	 * If currently, in ISH FW, sensor app is loaded or beyond that,
-	 * it means ISH isn't powered off, in this case, send a resume message.
-	 */
-	if (fwsts >= FWSTS_SENSOR_APP_LOADED) {
 		ishtp_send_resume(dev);
 
 		/* Waiting to get resume response */
@@ -225,16 +225,20 @@
 			ret = wait_event_interruptible_timeout(dev->resume_wait,
 				!dev->resume_flag,
 				msecs_to_jiffies(WAIT_FOR_RESUME_ACK_MS));
-	}
 
-	/*
-	 * If in ISH FW, sensor app isn't loaded yet, or no resume response.
-	 * That means this platform is not S0ix compatible, or something is
-	 * wrong with ISH FW. So on resume, full reboot of ISH processor will
-	 * happen, so need to go through init sequence again.
-	 */
-	if (dev->resume_flag)
+		/*
+		 * If the flag is not cleared, something is wrong with ISH FW.
+		 * So on resume, we need to go through the init sequence again.
+		 */
+		if (dev->resume_flag)
+			ish_init(dev);
+	} else {
+		/*
+		 * On resume from D3, a full reboot of the ISH processor will
+		 * happen, so we need to go through the init sequence again.
+		 */
 		ish_init(dev);
+	}
 }
 
 /**
@@ -250,23 +254,43 @@
 	struct pci_dev *pdev = to_pci_dev(device);
 	struct ishtp_device *dev = pci_get_drvdata(pdev);
 
-	enable_irq_wake(pdev->irq);
-	/*
-	 * If previous suspend hasn't been asnwered then ISH is likely dead,
-	 * don't attempt nested notification
-	 */
-	if (dev->suspend_flag)
-		return	0;
+	if (ish_should_enter_d0i3(pdev)) {
+		/*
+		 * If the previous suspend hasn't been answered then ISH is
+		 * dead, don't attempt nested notification
+		 */
+		if (dev->suspend_flag)
+			return	0;
 
-	dev->resume_flag = 0;
-	dev->suspend_flag = 1;
-	ishtp_send_suspend(dev);
+		dev->resume_flag = 0;
+		dev->suspend_flag = 1;
+		ishtp_send_suspend(dev);
 
-	/* 25 ms should be enough for live ISH to flush all IPC buf */
-	if (dev->suspend_flag)
-		wait_event_interruptible_timeout(dev->suspend_wait,
-						 !dev->suspend_flag,
-						  msecs_to_jiffies(25));
+		/* 25 ms should be enough for live ISH to flush all IPC buf */
+		if (dev->suspend_flag)
+			wait_event_interruptible_timeout(dev->suspend_wait,
+					!dev->suspend_flag,
+					msecs_to_jiffies(25));
+
+		if (dev->suspend_flag) {
+			/*
+			 * It looks like FW halt, clear the DMA bit, and put
+			 * ISH into D3, and FW would reset on resume.
+			 */
+			ish_disable_dma(dev);
+		} else {
+			/* Set the NO_D3 flag, the ISH would enter D0i3 */
+			pdev->dev_flags |= PCI_DEV_FLAGS_NO_D3;
+
+			enable_irq_wake(pdev->irq);
+		}
+	} else {
+		/*
+		 * Clear the DMA bit before putting ISH into D3,
+		 * or ISH FW would reset automatically.
+		 */
+		ish_disable_dma(dev);
+	}
 
 	return 0;
 }
@@ -288,7 +312,6 @@
 	ish_resume_device = device;
 	dev->resume_flag = 1;
 
-	disable_irq_wake(pdev->irq);
 	schedule_work(&resume_work);
 
 	return 0;
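The suspend path now has three outcomes: D0i3 with irq wakeup when the firmware acknowledges suspend, D3 with DMA disabled when it does not, and plain D3 with DMA disabled on firmware-assisted suspend. A compact model of that decision, mirroring ish_should_enter_d0i3() above:

    #include <stdbool.h>
    #include <stdio.h>

    enum ish_pm_target { ISH_D0I3, ISH_D3_FW_HALTED, ISH_D3 };

    /* Model of the decision in ish_suspend()/ish_should_enter_d0i3(). */
    static enum ish_pm_target pick_target(bool suspend_via_firmware,
                                          bool is_chv, bool fw_acked_suspend)
    {
        bool want_d0i3 = !suspend_via_firmware || is_chv;

        if (!want_d0i3)
            return ISH_D3;             /* disable DMA, FW resets on resume */
        if (!fw_acked_suspend)
            return ISH_D3_FW_HALTED;   /* FW looks dead: disable DMA, D3 */
        return ISH_D0I3;               /* set NO_D3 flag, arm irq wakeup */
    }

    int main(void)
    {
        printf("%d %d %d\n",
               pick_target(false, false, true),  /* s2idle, live FW -> D0i3 */
               pick_target(false, false, false), /* s2idle, dead FW -> halted D3 */
               pick_target(true, false, true));  /* S3 firmware path -> D3 */
        return 0;
    }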
diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c
index 4e11cc6..1f9bc44 100644
--- a/drivers/hid/usbhid/hiddev.c
+++ b/drivers/hid/usbhid/hiddev.c
@@ -428,7 +428,7 @@
 
 	poll_wait(file, &list->hiddev->wait, wait);
 	if (list->head != list->tail)
-		return EPOLLIN | EPOLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM | EPOLLOUT;
 	if (!list->hiddev->exist)
 		return EPOLLERR | EPOLLHUP;
 	return 0;
diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c
index 53bddb5..5ded94b 100644
--- a/drivers/hid/wacom_sys.c
+++ b/drivers/hid/wacom_sys.c
@@ -88,7 +88,7 @@
 }
 
 static int wacom_wac_pen_serial_enforce(struct hid_device *hdev,
-		struct hid_report *report, u8 *raw_data, int size)
+		struct hid_report *report, u8 *raw_data, int report_size)
 {
 	struct wacom *wacom = hid_get_drvdata(hdev);
 	struct wacom_wac *wacom_wac = &wacom->wacom_wac;
@@ -149,7 +149,8 @@
 	if (flush)
 		wacom_wac_queue_flush(hdev, &wacom_wac->pen_fifo);
 	else if (insert)
-		wacom_wac_queue_insert(hdev, &wacom_wac->pen_fifo, raw_data, size);
+		wacom_wac_queue_insert(hdev, &wacom_wac->pen_fifo,
+				       raw_data, report_size);
 
 	return insert && !flush;
 }
@@ -2176,7 +2177,7 @@
 {
 	struct wacom_wac *wacom_wac = &wacom->wacom_wac;
 	struct wacom_features *features = &wacom_wac->features;
-	char name[WACOM_NAME_MAX];
+	char name[WACOM_NAME_MAX - 20]; /* Leave some room for suffixes */
 
 	/* Generic devices name unspecified */
 	if ((features->type == HID_GENERIC) && !strcmp("Wacom HID", features->name)) {
@@ -2718,14 +2719,12 @@
 	wacom_wac->features = *((struct wacom_features *)id->driver_data);
 	features = &wacom_wac->features;
 
-	if (features->check_for_hid_type && features->hid_type != hdev->type) {
-		error = -ENODEV;
-		goto fail;
-	}
+	if (features->check_for_hid_type && features->hid_type != hdev->type)
+		return -ENODEV;
 
 	error = kfifo_alloc(&wacom_wac->pen_fifo, WACOM_PKGLEN_MAX, GFP_KERNEL);
 	if (error)
-		goto fail;
+		return error;
 
 	wacom_wac->hid_data.inputmode = -1;
 	wacom_wac->mode_report = -1;
@@ -2743,12 +2742,12 @@
 	error = hid_parse(hdev);
 	if (error) {
 		hid_err(hdev, "parse failed\n");
-		goto fail;
+		return error;
 	}
 
 	error = wacom_parse_and_register(wacom, false);
 	if (error)
-		goto fail;
+		return error;
 
 	if (hdev->bus == BUS_BLUETOOTH) {
 		error = device_create_file(&hdev->dev, &dev_attr_speed);
@@ -2759,10 +2758,6 @@
 	}
 
 	return 0;
-
-fail:
-	hid_set_drvdata(hdev, NULL);
-	return error;
 }
 
 static void wacom_remove(struct hid_device *hdev)
@@ -2791,8 +2786,6 @@
 		wacom_release_resources(wacom);
 
 	kfifo_free(&wacom_wac->pen_fifo);
-
-	hid_set_drvdata(hdev, NULL);
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c
index 1713235..2b0a5b8 100644
--- a/drivers/hid/wacom_wac.c
+++ b/drivers/hid/wacom_wac.c
@@ -251,7 +251,7 @@
 
 static int wacom_dtus_irq(struct wacom_wac *wacom)
 {
-	char *data = wacom->data;
+	unsigned char *data = wacom->data;
 	struct input_dev *input = wacom->pen_input;
 	unsigned short prox, pressure = 0;
 
@@ -483,6 +483,8 @@
 	int ring1 = 0, ring2 = 0;
 	int strip1 = 0, strip2 = 0;
 	bool prox = false;
+	bool wrench = false, keyboard = false, mute_touch = false, menu = false,
+	     info = false;
 
 	/* pad packets. Works as a second tool and is always in prox */
 	if (!(data[0] == WACOM_REPORT_INTUOSPAD || data[0] == WACOM_REPORT_INTUOS5PAD ||
@@ -512,10 +514,32 @@
 		keys = ((data[3] & 0x1C) ? 1<<2 : 0) |
 		       ((data[4] & 0xE0) ? 1<<1 : 0) |
 		       ((data[4] & 0x07) ? 1<<0 : 0);
+		keyboard = !!(data[4] & 0xE0);
+		info = !!(data[3] & 0x1C);
+
+		if (features->oPid) {
+			mute_touch = !!(data[4] & 0x07);
+			if (mute_touch)
+				wacom->shared->is_touch_on =
+					!wacom->shared->is_touch_on;
+		} else {
+			wrench = !!(data[4] & 0x07);
+		}
 	} else if (features->type == WACOM_27QHD) {
 		nkeys = 3;
 		keys = data[2] & 0x07;
 
+		wrench = !!(data[2] & 0x01);
+		keyboard = !!(data[2] & 0x02);
+
+		if (features->oPid) {
+			mute_touch = !!(data[2] & 0x04);
+			if (mute_touch)
+				wacom->shared->is_touch_on =
+					!wacom->shared->is_touch_on;
+		} else {
+			menu = !!(data[2] & 0x04);
+		}
 		input_report_abs(input, ABS_X, be16_to_cpup((__be16 *)&data[4]));
 		input_report_abs(input, ABS_Y, be16_to_cpup((__be16 *)&data[6]));
 		input_report_abs(input, ABS_Z, be16_to_cpup((__be16 *)&data[8]));
@@ -561,6 +585,9 @@
 			if (features->type == WACOM_22HD) {
 				nkeys = 3;
 				keys = data[9] & 0x07;
+
+				info = !!(data[9] & 0x01);
+				wrench = !!(data[9] & 0x02);
 			}
 		} else {
 			buttons = ((data[6] & 0x10) << 5)  |
@@ -572,7 +599,7 @@
 		strip2 = ((data[3] & 0x1f) << 8) | data[4];
 	}
 
-	prox = (buttons & ~(~0 << nbuttons)) | (keys & ~(~0 << nkeys)) |
+	prox = (buttons & ~(~0U << nbuttons)) | (keys & ~(~0U << nkeys)) |
 	       (ring1 & 0x80) | (ring2 & 0x80) | strip1 | strip2;
 
 	wacom_report_numbered_buttons(input, nbuttons, buttons);
@@ -580,6 +607,18 @@
 	for (i = 0; i < nkeys; i++)
 		input_report_key(input, KEY_PROG1 + i, keys & (1 << i));
 
+	input_report_key(input, KEY_BUTTONCONFIG, wrench);
+	input_report_key(input, KEY_ONSCREEN_KEYBOARD, keyboard);
+	input_report_key(input, KEY_CONTROLPANEL, menu);
+	input_report_key(input, KEY_INFO, info);
+
+	if (wacom->shared && wacom->shared->touch_input) {
+		input_report_switch(wacom->shared->touch_input,
+				    SW_MUTE_DEVICE,
+				    !wacom->shared->is_touch_on);
+		input_sync(wacom->shared->touch_input);
+	}
+
 	input_report_abs(input, ABS_RX, strip1);
 	input_report_abs(input, ABS_RY, strip2);
 
@@ -1483,6 +1522,12 @@
 	int byte_per_packet = WACOM_BYTES_PER_24HDT_PACKET;
 	int y_offset = 2;
 
+	if (wacom->shared->has_mute_touch_switch &&
+	    !wacom->shared->is_touch_on) {
+		if (!wacom->shared->touch_down)
+			return 0;
+	}
+
 	if (wacom->features.type == WACOM_27QHDT) {
 		current_num_contacts = data[63];
 		num_contacts_left = 10;
@@ -2051,14 +2096,14 @@
 		    (hdev->product == 0x34d || hdev->product == 0x34e ||  /* MobileStudio Pro */
 		     hdev->product == 0x357 || hdev->product == 0x358 ||  /* Intuos Pro 2 */
 		     hdev->product == 0x392 ||				  /* Intuos Pro 2 */
-		     hdev->product == 0x399)) {				  /* MobileStudio Pro */
+		     hdev->product == 0x398 || hdev->product == 0x399)) { /* MobileStudio Pro */
 			value = (field->logical_maximum - value);
 
 			if (hdev->product == 0x357 || hdev->product == 0x358 ||
 			    hdev->product == 0x392)
 				value = wacom_offset_rotation(input, usage, value, 3, 16);
 			else if (hdev->product == 0x34d || hdev->product == 0x34e ||
-				 hdev->product == 0x399)
+				 hdev->product == 0x398 || hdev->product == 0x399)
 				value = wacom_offset_rotation(input, usage, value, 1, 2);
 		}
 		else {
@@ -3815,6 +3860,14 @@
 		/* fall through */
 
 	case WACOM_27QHDT:
+		if (wacom_wac->shared->touch->product == 0x32C ||
+		    wacom_wac->shared->touch->product == 0xF6) {
+			input_dev->evbit[0] |= BIT_MASK(EV_SW);
+			__set_bit(SW_MUTE_DEVICE, input_dev->swbit);
+			wacom_wac->shared->has_mute_touch_switch = true;
+		}
+		/* fall through */
+
 	case MTSCREEN:
 	case MTTPC:
 	case MTTPC_B:
@@ -4050,6 +4103,12 @@
 		__set_bit(KEY_PROG2, input_dev->keybit);
 		__set_bit(KEY_PROG3, input_dev->keybit);
 
+		__set_bit(KEY_ONSCREEN_KEYBOARD, input_dev->keybit);
+		__set_bit(KEY_INFO, input_dev->keybit);
+
+		if (!features->oPid)
+			__set_bit(KEY_BUTTONCONFIG, input_dev->keybit);
+
 		input_set_abs_params(input_dev, ABS_WHEEL, 0, 71, 0, 0);
 		input_set_abs_params(input_dev, ABS_THROTTLE, 0, 71, 0, 0);
 		break;
@@ -4058,6 +4117,12 @@
 		__set_bit(KEY_PROG1, input_dev->keybit);
 		__set_bit(KEY_PROG2, input_dev->keybit);
 		__set_bit(KEY_PROG3, input_dev->keybit);
+
+		__set_bit(KEY_ONSCREEN_KEYBOARD, input_dev->keybit);
+		__set_bit(KEY_BUTTONCONFIG, input_dev->keybit);
+
+		if (!features->oPid)
+			__set_bit(KEY_CONTROLPANEL, input_dev->keybit);
 		input_set_abs_params(input_dev, ABS_X, -2048, 2048, 0, 0);
 		input_abs_set_res(input_dev, ABS_X, 1024); /* points/g */
 		input_set_abs_params(input_dev, ABS_Y, -2048, 2048, 0, 0);
@@ -4071,6 +4136,9 @@
 		__set_bit(KEY_PROG1, input_dev->keybit);
 		__set_bit(KEY_PROG2, input_dev->keybit);
 		__set_bit(KEY_PROG3, input_dev->keybit);
+
+		__set_bit(KEY_BUTTONCONFIG, input_dev->keybit);
+		__set_bit(KEY_INFO, input_dev->keybit);
 		/* fall through */
 
 	case WACOM_21UX2:
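On pads paired with a touch interface (features->oPid set) the third button acts as a latch: each press flips the shared is_touch_on state, which is then mirrored to the touch input as SW_MUTE_DEVICE; without a touch pair the bit is exposed as a plain key instead. A small model of that flow; the names and the initial state are illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    static bool is_touch_on = true; /* shared between pen and touch devices */

    /* One pad report: latch on the mute bit and mirror the switch. */
    static void handle_pad(bool mute_pressed, bool has_touch_pair)
    {
        if (!has_touch_pair)
            return; /* reported as KEY_BUTTONCONFIG/KEY_CONTROLPANEL instead */
        if (mute_pressed)
            is_touch_on = !is_touch_on;
        printf("SW_MUTE_DEVICE=%d\n", !is_touch_on);
    }

    int main(void)
    {
        handle_pad(true, true);  /* press: touch muted   -> 1 */
        handle_pad(false, true); /* idle:  state latched -> 1 */
        handle_pad(true, true);  /* press: touch back on -> 0 */
        return 0;
    }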
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index addcef5..8eb1675 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -407,7 +407,15 @@
 		cpumask_clear_cpu(channel->target_cpu,
 				  &primary_channel->alloced_cpus_in_node);
 
-	vmbus_release_relid(channel->offermsg.child_relid);
+	/*
+	 * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and
+	 * the relid is invalidated; after hibernation, when the user-space app
+	 * destroys the channel, the relid is INVALID_RELID, and in this case
+	 * it's unnecessary and unsafe to release the old relid, since the same
+	 * relid can refer to a completely different channel now.
+	 */
+	if (channel->offermsg.child_relid != INVALID_RELID)
+		vmbus_release_relid(channel->offermsg.child_relid);
 
 	free_channel(channel);
 }
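Since relids are reused by the host, releasing a relid that hibernation already invalidated could detach an unrelated live channel; hence the guard above. A minimal model of the idea; INVALID_RELID and the release call are stand-ins for the vmbus internals:

    #include <stdio.h>

    #define INVALID_RELID ((unsigned)-1) /* stand-in for the vmbus constant */

    static void release_relid(unsigned relid)
    {
        printf("released relid %u\n", relid);
    }

    static void free_channel_model(unsigned child_relid)
    {
        /* After hibernation the old relid may now name a different
         * channel, so only release it if it is still valid. */
        if (child_relid != INVALID_RELID)
            release_relid(child_relid);
    }

    int main(void)
    {
        free_channel_model(23);            /* normal teardown */
        free_channel_model(INVALID_RELID); /* post-hibernation teardown */
        return 0;
    }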
@@ -545,6 +553,10 @@
 
 	mutex_lock(&vmbus_connection.channel_mutex);
 
+	/* Remember the channels that should be cleaned up upon suspend. */
+	if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel))
+		atomic_inc(&vmbus_connection.nr_chan_close_on_suspend);
+
 	/*
 	 * Now that we have acquired the channel_mutex,
 	 * we can release the potentially racing rescind thread.
@@ -847,6 +859,67 @@
 		vmbus_wait_for_unload();
 }
 
+static void check_ready_for_resume_event(void)
+{
+	/*
+	 * If all the old primary channels have been fixed up, then it's safe
+	 * to resume.
+	 */
+	if (atomic_dec_and_test(&vmbus_connection.nr_chan_fixup_on_resume))
+		complete(&vmbus_connection.ready_for_resume_event);
+}
+
+static void vmbus_setup_channel_state(struct vmbus_channel *channel,
+				      struct vmbus_channel_offer_channel *offer)
+{
+	/*
+	 * Setup state for signalling the host.
+	 */
+	channel->sig_event = VMBUS_EVENT_CONNECTION_ID;
+
+	if (vmbus_proto_version != VERSION_WS2008) {
+		channel->is_dedicated_interrupt =
+				(offer->is_dedicated_interrupt != 0);
+		channel->sig_event = offer->connection_id;
+	}
+
+	memcpy(&channel->offermsg, offer,
+	       sizeof(struct vmbus_channel_offer_channel));
+	channel->monitor_grp = (u8)offer->monitorid / 32;
+	channel->monitor_bit = (u8)offer->monitorid % 32;
+}
+
+/*
+ * find_primary_channel_by_offer - Get the channel object given the new offer.
+ * This is only used in the resume path of hibernation.
+ */
+static struct vmbus_channel *
+find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer)
+{
+	struct vmbus_channel *channel = NULL, *iter;
+	const guid_t *inst1, *inst2;
+
+	/* Ignore sub-channel offers. */
+	if (offer->offer.sub_channel_index != 0)
+		return NULL;
+
+	mutex_lock(&vmbus_connection.channel_mutex);
+
+	list_for_each_entry(iter, &vmbus_connection.chn_list, listentry) {
+		inst1 = &iter->offermsg.offer.if_instance;
+		inst2 = &offer->offer.if_instance;
+
+		if (guid_equal(inst1, inst2)) {
+			channel = iter;
+			break;
+		}
+	}
+
+	mutex_unlock(&vmbus_connection.channel_mutex);
+
+	return channel;
+}
+
 /*
  * vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
  *
@@ -854,12 +927,58 @@
 static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
 {
 	struct vmbus_channel_offer_channel *offer;
-	struct vmbus_channel *newchannel;
+	struct vmbus_channel *oldchannel, *newchannel;
+	size_t offer_sz;
 
 	offer = (struct vmbus_channel_offer_channel *)hdr;
 
 	trace_vmbus_onoffer(offer);
 
+	oldchannel = find_primary_channel_by_offer(offer);
+
+	if (oldchannel != NULL) {
+		atomic_dec(&vmbus_connection.offer_in_progress);
+
+		/*
+		 * We're resuming from hibernation: all the sub-channel and
+		 * hv_sock channels we had before the hibernation should have
+		 * been cleaned up, and now we must be seeing a re-offered
+		 * primary channel that we had before the hibernation.
+		 */
+
+		WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID);
+		/* Fix up the relid. */
+		oldchannel->offermsg.child_relid = offer->child_relid;
+
+		offer_sz = sizeof(*offer);
+		if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) {
+			check_ready_for_resume_event();
+			return;
+		}
+
+		/*
+		 * This is not an error, since the host can also change the
+		 * other field(s) of the offer, e.g. on WS RS5 (Build 17763),
+		 * the offer->connection_id of the Mellanox VF vmbus device
+		 * can change when the host reoffers the device upon resume.
+		 */
+		pr_debug("vmbus offer changed: relid=%d\n",
+			 offer->child_relid);
+
+		print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET,
+				     16, 4, &oldchannel->offermsg, offer_sz,
+				     false);
+		print_hex_dump_debug("New vmbus offer: ", DUMP_PREFIX_OFFSET,
+				     16, 4, offer, offer_sz, false);
+
+		/* Fix up the old channel. */
+		vmbus_setup_channel_state(oldchannel, offer);
+
+		check_ready_for_resume_event();
+
+		return;
+	}
+
 	/* Allocate the channel object and save this offer. */
 	newchannel = alloc_channel();
 	if (!newchannel) {
@@ -869,25 +988,21 @@
 		return;
 	}
 
-	/*
-	 * Setup state for signalling the host.
-	 */
-	newchannel->sig_event = VMBUS_EVENT_CONNECTION_ID;
-
-	if (vmbus_proto_version != VERSION_WS2008) {
-		newchannel->is_dedicated_interrupt =
-				(offer->is_dedicated_interrupt != 0);
-		newchannel->sig_event = offer->connection_id;
-	}
-
-	memcpy(&newchannel->offermsg, offer,
-	       sizeof(struct vmbus_channel_offer_channel));
-	newchannel->monitor_grp = (u8)offer->monitorid / 32;
-	newchannel->monitor_bit = (u8)offer->monitorid % 32;
+	vmbus_setup_channel_state(newchannel, offer);
 
 	vmbus_process_offer(newchannel);
 }
 
+static void check_ready_for_suspend_event(void)
+{
+	/*
+	 * If all the sub-channels or hv_sock channels have been cleaned up,
+	 * then it's safe to suspend.
+	 */
+	if (atomic_dec_and_test(&vmbus_connection.nr_chan_close_on_suspend))
+		complete(&vmbus_connection.ready_for_suspend_event);
+}
+
 /*
  * vmbus_onoffer_rescind - Rescind offer handler.
  *
@@ -898,6 +1013,7 @@
 	struct vmbus_channel_rescind_offer *rescind;
 	struct vmbus_channel *channel;
 	struct device *dev;
+	bool clean_up_chan_for_suspend;
 
 	rescind = (struct vmbus_channel_rescind_offer *)hdr;
 
@@ -937,6 +1053,8 @@
 		return;
 	}
 
+	clean_up_chan_for_suspend = is_hvsock_channel(channel) ||
+				    is_sub_channel(channel);
 	/*
 	 * Before setting channel->rescind in vmbus_rescind_cleanup(), we
 	 * should make sure the channel callback is not running any more.
@@ -962,6 +1080,10 @@
 	if (channel->device_obj) {
 		if (channel->chn_rescind_callback) {
 			channel->chn_rescind_callback(channel);
+
+			if (clean_up_chan_for_suspend)
+				check_ready_for_suspend_event();
+
 			return;
 		}
 		/*
@@ -994,6 +1116,11 @@
 		}
 		mutex_unlock(&vmbus_connection.channel_mutex);
 	}
+
+	/* The "channel" may have been freed. Do not access it any longer. */
+
+	if (clean_up_chan_for_suspend)
+		check_ready_for_suspend_event();
 }
 
 void vmbus_hvsock_device_unregister(struct vmbus_channel *channel)
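The check_ready_for_suspend_event()/check_ready_for_resume_event() helpers above form a counter/completion rendezvous: each channel that must be waited on bumps an atomic counter up front, each cleanup decrements it, and only the final decrement to zero wakes the waiter. A minimal sketch of the pattern, with illustrative names (pending, all_done) that are not part of the driver:

	static atomic_t pending;
	static DECLARE_COMPLETION(all_done);

	static void track_event(void)
	{
		atomic_inc(&pending);		/* one more event to wait for */
	}

	static void event_finished(void)
	{
		/* Only the final decrement to zero completes, exactly once. */
		if (atomic_dec_and_test(&pending))
			complete(&all_done);
	}

	static void wait_for_all_events(void)
	{
		/* Skip the wait if nothing was ever tracked, since nobody
		 * would ever call complete() in that case. */
		if (atomic_read(&pending) > 0)
			wait_for_completion(&all_done);
	}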
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 09829e1..6e4c015 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -26,6 +26,11 @@
 struct vmbus_connection vmbus_connection = {
 	.conn_state		= DISCONNECTED,
 	.next_gpadl_handle	= ATOMIC_INIT(0xE1E10),
+
+	.ready_for_suspend_event = COMPLETION_INITIALIZER(
+				  vmbus_connection.ready_for_suspend_event),
+	.ready_for_resume_event	= COMPLETION_INITIALIZER(
+				  vmbus_connection.ready_for_resume_event),
 };
 EXPORT_SYMBOL_GPL(vmbus_connection);
 
@@ -59,8 +64,7 @@
 	}
 }
 
-static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo,
-					__u32 version)
+int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
 {
 	int ret = 0;
 	unsigned int cur_cpu;
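Because the new completions live inside the statically defined vmbus_connection, they are initialized with COMPLETION_INITIALIZER() above rather than the usual DECLARE_COMPLETION(). A sketch of the two equivalent idioms, with illustrative names:

	struct conn_state {
		struct completion ready;
	};

	/* Member of a statically initialized struct: use the macro. */
	static struct conn_state cs = {
		.ready = COMPLETION_INITIALIZER(cs.ready),
	};

	/* Standalone static variable: the shorthand declaration. */
	static DECLARE_COMPLETION(standalone_ready);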
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 6188fb7..fcc5279 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -154,7 +154,7 @@
  * retrieve the initialized message and event pages.  Otherwise, we create and
  * initialize the message and event pages.
  */
-int hv_synic_init(unsigned int cpu)
+void hv_synic_enable_regs(unsigned int cpu)
 {
 	struct hv_per_cpu_context *hv_cpu
 		= per_cpu_ptr(hv_context.cpu_context, cpu);
@@ -196,6 +196,11 @@
 	sctrl.enable = 1;
 
 	hv_set_synic_state(sctrl.as_uint64);
+}
+
+int hv_synic_init(unsigned int cpu)
+{
+	hv_synic_enable_regs(cpu);
 
 	hv_stimer_init(cpu);
 
@@ -205,20 +210,45 @@
 /*
  * hv_synic_cleanup - Cleanup routine for hv_synic_init().
  */
-int hv_synic_cleanup(unsigned int cpu)
+void hv_synic_disable_regs(unsigned int cpu)
 {
 	union hv_synic_sint shared_sint;
 	union hv_synic_simp simp;
 	union hv_synic_siefp siefp;
 	union hv_synic_scontrol sctrl;
+
+	hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+
+	shared_sint.masked = 1;
+
+	/* Need to correctly clean up in the case of SMP!!! */
+	/* Disable the interrupt */
+	hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+
+	hv_get_simp(simp.as_uint64);
+	simp.simp_enabled = 0;
+	simp.base_simp_gpa = 0;
+
+	hv_set_simp(simp.as_uint64);
+
+	hv_get_siefp(siefp.as_uint64);
+	siefp.siefp_enabled = 0;
+	siefp.base_siefp_gpa = 0;
+
+	hv_set_siefp(siefp.as_uint64);
+
+	/* Disable the global synic bit */
+	hv_get_synic_state(sctrl.as_uint64);
+	sctrl.enable = 0;
+	hv_set_synic_state(sctrl.as_uint64);
+}
+
+int hv_synic_cleanup(unsigned int cpu)
+{
 	struct vmbus_channel *channel, *sc;
 	bool channel_found = false;
 	unsigned long flags;
 
-	hv_get_synic_state(sctrl.as_uint64);
-	if (sctrl.enable != 1)
-		return -EFAULT;
-
 	/*
 	 * Search for channels which are bound to the CPU we're about to
 	 * cleanup. In case we find one and vmbus is still connected we need to
@@ -249,29 +279,7 @@
 
 	hv_stimer_cleanup(cpu);
 
-	hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
-
-	shared_sint.masked = 1;
-
-	/* Need to correctly cleanup in the case of SMP!!! */
-	/* Disable the interrupt */
-	hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
-
-	hv_get_simp(simp.as_uint64);
-	simp.simp_enabled = 0;
-	simp.base_simp_gpa = 0;
-
-	hv_set_simp(simp.as_uint64);
-
-	hv_get_siefp(siefp.as_uint64);
-	siefp.siefp_enabled = 0;
-	siefp.base_siefp_gpa = 0;
-
-	hv_set_siefp(siefp.as_uint64);
-
-	/* Disable the global synic bit */
-	sctrl.enable = 0;
-	hv_set_synic_state(sctrl.as_uint64);
+	hv_synic_disable_regs(cpu);
 
 	return 0;
 }
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 6fb4ea5..34bd735 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -494,7 +494,7 @@
 
 
 static __u8 recv_buffer[PAGE_SIZE];
-static __u8 *send_buffer;
+static __u8 balloon_up_send_buffer[PAGE_SIZE];
 #define PAGES_IN_2M	512
 #define HA_CHUNK (32 * 1024)
 
@@ -1292,8 +1292,8 @@
 	}
 
 	while (!done) {
-		bl_resp = (struct dm_balloon_response *)send_buffer;
-		memset(send_buffer, 0, PAGE_SIZE);
+		memset(balloon_up_send_buffer, 0, PAGE_SIZE);
+		bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer;
 		bl_resp->hdr.type = DM_BALLOON_RESPONSE;
 		bl_resp->hdr.size = sizeof(struct dm_balloon_response);
 		bl_resp->more_pages = 1;
@@ -1564,58 +1564,18 @@
 
 }
 
-static int balloon_probe(struct hv_device *dev,
-			const struct hv_vmbus_device_id *dev_id)
+static int balloon_connect_vsp(struct hv_device *dev)
 {
-	int ret;
-	unsigned long t;
 	struct dm_version_request version_req;
 	struct dm_capabilities cap_msg;
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-	do_hot_add = hot_add;
-#else
-	do_hot_add = false;
-#endif
-
-	/*
-	 * First allocate a send buffer.
-	 */
-
-	send_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!send_buffer)
-		return -ENOMEM;
+	unsigned long t;
+	int ret;
 
 	ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0,
-			balloon_onchannelcallback, dev);
-
+			 balloon_onchannelcallback, dev);
 	if (ret)
-		goto probe_error0;
+		return ret;
 
-	dm_device.dev = dev;
-	dm_device.state = DM_INITIALIZING;
-	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8;
-	init_completion(&dm_device.host_event);
-	init_completion(&dm_device.config_event);
-	INIT_LIST_HEAD(&dm_device.ha_region_list);
-	spin_lock_init(&dm_device.ha_lock);
-	INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
-	INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
-	dm_device.host_specified_ha_region = false;
-
-	dm_device.thread =
-		 kthread_run(dm_thread_func, &dm_device, "hv_balloon");
-	if (IS_ERR(dm_device.thread)) {
-		ret = PTR_ERR(dm_device.thread);
-		goto probe_error1;
-	}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-	set_online_page_callback(&hv_online_page);
-	register_memory_notifier(&hv_memory_nb);
-#endif
-
-	hv_set_drvdata(dev, &dm_device);
 	/*
 	 * Initiate the hand shake with the host and negotiate
 	 * a version that the host can support. We start with the
@@ -1631,16 +1591,15 @@
 	dm_device.version = version_req.version.version;
 
 	ret = vmbus_sendpacket(dev->channel, &version_req,
-				sizeof(struct dm_version_request),
-				(unsigned long)NULL,
-				VM_PKT_DATA_INBAND, 0);
+			       sizeof(struct dm_version_request),
+			       (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
 	if (ret)
-		goto probe_error2;
+		goto out;
 
 	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
 	if (t == 0) {
 		ret = -ETIMEDOUT;
-		goto probe_error2;
+		goto out;
 	}
 
 	/*
@@ -1648,8 +1607,8 @@
 	 * fail the probe function.
 	 */
 	if (dm_device.state == DM_INIT_ERROR) {
-		ret = -ETIMEDOUT;
-		goto probe_error2;
+		ret = -EPROTO;
+		goto out;
 	}
 
 	pr_info("Using Dynamic Memory protocol version %u.%u\n",
@@ -1682,16 +1641,15 @@
 	cap_msg.max_page_number = -1;
 
 	ret = vmbus_sendpacket(dev->channel, &cap_msg,
-				sizeof(struct dm_capabilities),
-				(unsigned long)NULL,
-				VM_PKT_DATA_INBAND, 0);
+			       sizeof(struct dm_capabilities),
+			       (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
 	if (ret)
-		goto probe_error2;
+		goto out;
 
 	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
 	if (t == 0) {
 		ret = -ETIMEDOUT;
-		goto probe_error2;
+		goto out;
 	}
 
 	/*
@@ -1699,25 +1657,65 @@
 	 * fail the probe function.
 	 */
 	if (dm_device.state == DM_INIT_ERROR) {
-		ret = -ETIMEDOUT;
-		goto probe_error2;
+		ret = -EPROTO;
+		goto out;
 	}
 
+	return 0;
+out:
+	vmbus_close(dev->channel);
+	return ret;
+}
+
+static int balloon_probe(struct hv_device *dev,
+			 const struct hv_vmbus_device_id *dev_id)
+{
+	int ret;
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+	do_hot_add = hot_add;
+#else
+	do_hot_add = false;
+#endif
+	dm_device.dev = dev;
+	dm_device.state = DM_INITIALIZING;
+	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8;
+	init_completion(&dm_device.host_event);
+	init_completion(&dm_device.config_event);
+	INIT_LIST_HEAD(&dm_device.ha_region_list);
+	spin_lock_init(&dm_device.ha_lock);
+	INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
+	INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
+	dm_device.host_specified_ha_region = false;
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+	set_online_page_callback(&hv_online_page);
+	register_memory_notifier(&hv_memory_nb);
+#endif
+
+	hv_set_drvdata(dev, &dm_device);
+
+	ret = balloon_connect_vsp(dev);
+	if (ret != 0)
+		return ret;
+
 	dm_device.state = DM_INITIALIZED;
-	last_post_time = jiffies;
+
+	dm_device.thread =
+		 kthread_run(dm_thread_func, &dm_device, "hv_balloon");
+	if (IS_ERR(dm_device.thread)) {
+		ret = PTR_ERR(dm_device.thread);
+		goto probe_error;
+	}
 
 	return 0;
 
-probe_error2:
+probe_error:
+	vmbus_close(dev->channel);
 #ifdef CONFIG_MEMORY_HOTPLUG
+	unregister_memory_notifier(&hv_memory_nb);
 	restore_online_page_callback(&hv_online_page);
 #endif
-	kthread_stop(dm_device.thread);
-
-probe_error1:
-	vmbus_close(dev->channel);
-probe_error0:
-	kfree(send_buffer);
 	return ret;
 }
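The reworked balloon_probe() above unwinds in the mirror order of its setup steps, and the remove path below follows the same rule (stop the thread before closing the channel, unregister the notifier before restoring the page callback). A generic sketch of the pattern; do_step_a/do_step_b and their undo helpers are placeholders, not driver functions:

	static int sketch_probe(struct hv_device *dev)
	{
		int ret;

		ret = do_step_a(dev);		/* e.g. connect to the VSP */
		if (ret)
			return ret;

		ret = do_step_b(dev);		/* e.g. start the worker thread */
		if (ret)
			goto undo_a;

		return 0;

	undo_a:
		undo_step_a(dev);		/* unwind in reverse order */
		return ret;
	}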
 
@@ -1734,12 +1732,11 @@
 	cancel_work_sync(&dm->balloon_wrk.wrk);
 	cancel_work_sync(&dm->ha_wrk.wrk);
 
-	vmbus_close(dev->channel);
 	kthread_stop(dm->thread);
-	kfree(send_buffer);
+	vmbus_close(dev->channel);
 #ifdef CONFIG_MEMORY_HOTPLUG
-	restore_online_page_callback(&hv_online_page);
 	unregister_memory_notifier(&hv_memory_nb);
+	restore_online_page_callback(&hv_online_page);
 #endif
 	spin_lock_irqsave(&dm_device.ha_lock, flags);
 	list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 50eaa1f..af9379a 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -169,8 +169,10 @@
 
 extern void hv_synic_free(void);
 
+extern void hv_synic_enable_regs(unsigned int cpu);
 extern int hv_synic_init(unsigned int cpu);
 
+extern void hv_synic_disable_regs(unsigned int cpu);
 extern int hv_synic_cleanup(unsigned int cpu);
 
 /* Interface */
@@ -256,6 +258,32 @@
 	struct workqueue_struct *work_queue;
 	struct workqueue_struct *handle_primary_chan_wq;
 	struct workqueue_struct *handle_sub_chan_wq;
+
+	/*
+	 * The number of sub-channels and hv_sock channels that should be
+	 * cleaned up upon suspend: sub-channels will be re-created upon
+	 * resume, and hv_sock channels should not survive suspend.
+	 */
+	atomic_t nr_chan_close_on_suspend;
+	/*
+	 * vmbus_bus_suspend() waits for "nr_chan_close_on_suspend" to
+	 * drop to zero.
+	 */
+	struct completion ready_for_suspend_event;
+
+	/*
+	 * The number of primary channels that should be "fixed up"
+	 * upon resume: these channels are re-offered upon resume, and some
+	 * fields of the channel offers (i.e. child_relid and connection_id)
+	 * can change, so the old offermsg must be fixed up, before the resume
+	 * callbacks of the VSC drivers start to further touch the channels.
+	 */
+	atomic_t nr_chan_fixup_on_resume;
+	/*
+	 * vmbus_bus_resume() waits for "nr_chan_fixup_on_resume" to
+	 * drop to zero.
+	 */
+	struct completion ready_for_resume_event;
 };
 
 
@@ -270,6 +298,8 @@
 
 extern struct vmbus_connection vmbus_connection;
 
+int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version);
+
 static inline void vmbus_send_interrupt(u32 relid)
 {
 	sync_set_bit(relid, vmbus_connection.send_int_page);
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index ebd35fc..391f0b2 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -24,12 +24,14 @@
 #include <linux/sched/task_stack.h>
 
 #include <asm/mshyperv.h>
+#include <linux/delay.h>
 #include <linux/notifier.h>
 #include <linux/ptrace.h>
 #include <linux/screen_info.h>
 #include <linux/kdebug.h>
 #include <linux/efi.h>
 #include <linux/random.h>
+#include <linux/syscore_ops.h>
 #include <clocksource/hyperv_timer.h>
 #include "hyperv_vmbus.h"
 
@@ -910,6 +912,43 @@
 		drv->shutdown(dev);
 }
 
+/*
+ * vmbus_suspend - Suspend a vmbus device
+ */
+static int vmbus_suspend(struct device *child_device)
+{
+	struct hv_driver *drv;
+	struct hv_device *dev = device_to_hv_device(child_device);
+
+	/* The device may not be attached yet */
+	if (!child_device->driver)
+		return 0;
+
+	drv = drv_to_hv_drv(child_device->driver);
+	if (!drv->suspend)
+		return -EOPNOTSUPP;
+
+	return drv->suspend(dev);
+}
+
+/*
+ * vmbus_resume - Resume a vmbus device
+ */
+static int vmbus_resume(struct device *child_device)
+{
+	struct hv_driver *drv;
+	struct hv_device *dev = device_to_hv_device(child_device);
+
+	/* The device may not be attached yet */
+	if (!child_device->driver)
+		return 0;
+
+	drv = drv_to_hv_drv(child_device->driver);
+	if (!drv->resume)
+		return -EOPNOTSUPP;
+
+	return drv->resume(dev);
+}
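vmbus_suspend()/vmbus_resume() only dispatch to per-driver callbacks, so a child (VSC) driver opts in by filling the suspend/resume members that this series adds to struct hv_driver. An illustrative sketch; the hv_sketch_* names are hypothetical:

	static int hv_sketch_suspend(struct hv_device *dev)
	{
		/* Quiesce the device: stop I/O, close the ring buffer, etc. */
		return 0;
	}

	static int hv_sketch_resume(struct hv_device *dev)
	{
		/* Reopen the ring buffer and renegotiate with the host. */
		return 0;
	}

	static struct hv_driver sketch_drv = {
		.name	 = "hv_sketch",
		.suspend = hv_sketch_suspend,
		.resume	 = hv_sketch_resume,
	};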
 
 /*
  * vmbus_device_release - Final callback release of the vmbus child device
@@ -925,6 +964,14 @@
 	kfree(hv_dev);
 }
 
+/*
+ * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than
+ * SET_SYSTEM_SLEEP_PM_OPS: see the comment before vmbus_bus_pm.
+ */
+static const struct dev_pm_ops vmbus_pm = {
+	SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_suspend, vmbus_resume)
+};
+
 /* The one and only one */
 static struct bus_type  hv_bus = {
 	.name =		"vmbus",
@@ -935,6 +982,7 @@
 	.uevent =		vmbus_uevent,
 	.dev_groups =		vmbus_dev_groups,
 	.drv_groups =		vmbus_drv_groups,
+	.pm =			&vmbus_pm,
 };
 
 struct onmessage_work_context {
@@ -1022,6 +1070,41 @@
 	vmbus_signal_eom(msg, message_type);
 }
 
+/*
+ * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for
+ * hibernation, because hv_sock connections cannot persist across hibernation.
+ */
+static void vmbus_force_channel_rescinded(struct vmbus_channel *channel)
+{
+	struct onmessage_work_context *ctx;
+	struct vmbus_channel_rescind_offer *rescind;
+
+	WARN_ON(!is_hvsock_channel(channel));
+
+	/*
+	 * sizeof(*ctx) is small and the allocation should really not fail,
+	 * otherwise the state of the hv_sock connections ends up in limbo.
+	 */
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL);
+
+	/*
+	 * So far, these are not really used by Linux. Just set them to the
+	 * reasonable values conforming to the definitions of the fields.
+	 */
+	ctx->msg.header.message_type = 1;
+	ctx->msg.header.payload_size = sizeof(*rescind);
+
+	/* These values are actually used by Linux. */
+	rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.u.payload;
+	rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER;
+	rescind->child_relid = channel->offermsg.child_relid;
+
+	INIT_WORK(&ctx->work, vmbus_onmessage_work);
+
+	queue_work_on(vmbus_connection.connect_cpu,
+		      vmbus_connection.work_queue,
+		      &ctx->work);
+}
 
 /*
  * Direct callback for channels using other deferred processing
@@ -2042,6 +2125,129 @@
 	return ret_val;
 }
 
+static int vmbus_bus_suspend(struct device *dev)
+{
+	struct vmbus_channel *channel, *sc;
+	unsigned long flags;
+
+	while (atomic_read(&vmbus_connection.offer_in_progress) != 0) {
+		/*
+		 * We wait here until the completion of any channel
+		 * offers that are currently in progress.
+		 */
+		msleep(1);
+	}
+
+	mutex_lock(&vmbus_connection.channel_mutex);
+	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
+		if (!is_hvsock_channel(channel))
+			continue;
+
+		vmbus_force_channel_rescinded(channel);
+	}
+	mutex_unlock(&vmbus_connection.channel_mutex);
+
+	/*
+	 * Wait until all the sub-channels and hv_sock channels have been
+	 * cleaned up. Sub-channels should be destroyed upon suspend, otherwise
+	 * they would conflict with the new sub-channels that will be created
+	 * in the resume path. hv_sock channels should also be destroyed, but
+	 * an hv_sock channel of an established connection cannot really be
+	 * destroyed, since it may still be referenced by the userspace
+	 * application. So we just force the hv_sock channel to be rescinded
+	 * by vmbus_force_channel_rescinded(), and the userspace application
+	 * will destroy the channel thoroughly after hibernation.
+	 *
+	 * Note: the counter nr_chan_close_on_suspend may never go above 0 if
+	 * the VM has no sub-channels or hv_sock channels, e.g. a 1-vCPU VM.
+	 */
+	if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0)
+		wait_for_completion(&vmbus_connection.ready_for_suspend_event);
+
+	WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0);
+
+	mutex_lock(&vmbus_connection.channel_mutex);
+
+	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
+		/*
+		 * Invalidate the field. Upon resume, vmbus_onoffer() will fix
+		 * up the field, and the other fields (if necessary).
+		 */
+		channel->offermsg.child_relid = INVALID_RELID;
+
+		if (is_hvsock_channel(channel)) {
+			if (!channel->rescind) {
+				pr_err("hv_sock channel not rescinded!\n");
+				WARN_ON_ONCE(1);
+			}
+			continue;
+		}
+
+		spin_lock_irqsave(&channel->lock, flags);
+		list_for_each_entry(sc, &channel->sc_list, sc_list) {
+			pr_err("Sub-channel not deleted!\n");
+			WARN_ON_ONCE(1);
+		}
+		spin_unlock_irqrestore(&channel->lock, flags);
+
+		atomic_inc(&vmbus_connection.nr_chan_fixup_on_resume);
+	}
+
+	mutex_unlock(&vmbus_connection.channel_mutex);
+
+	vmbus_initiate_unload(false);
+
+	vmbus_connection.conn_state = DISCONNECTED;
+
+	/* Reset the event for the next resume. */
+	reinit_completion(&vmbus_connection.ready_for_resume_event);
+
+	return 0;
+}
+
+static int vmbus_bus_resume(struct device *dev)
+{
+	struct vmbus_channel_msginfo *msginfo;
+	size_t msgsize;
+	int ret;
+
+	/*
+	 * We only use the 'vmbus_proto_version', which was in use before
+	 * hibernation, to re-negotiate with the host.
+	 */
+	if (vmbus_proto_version == VERSION_INVAL ||
+	    vmbus_proto_version == 0) {
+		pr_err("Invalid proto version = 0x%x\n", vmbus_proto_version);
+		return -EINVAL;
+	}
+
+	msgsize = sizeof(*msginfo) +
+		  sizeof(struct vmbus_channel_initiate_contact);
+
+	msginfo = kzalloc(msgsize, GFP_KERNEL);
+
+	if (msginfo == NULL)
+		return -ENOMEM;
+
+	ret = vmbus_negotiate_version(msginfo, vmbus_proto_version);
+
+	kfree(msginfo);
+
+	if (ret != 0)
+		return ret;
+
+	WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) == 0);
+
+	vmbus_request_offers();
+
+	wait_for_completion(&vmbus_connection.ready_for_resume_event);
+
+	/* Reset the event for the next suspend. */
+	reinit_completion(&vmbus_connection.ready_for_suspend_event);
+
+	return 0;
+}
+
 static const struct acpi_device_id vmbus_acpi_device_ids[] = {
 	{"VMBUS", 0},
 	{"VMBus", 0},
@@ -2049,6 +2255,19 @@
 };
 MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids);
 
+/*
+ * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than
+ * SET_SYSTEM_SLEEP_PM_OPS, otherwise NIC SR-IOV cannot work, because the
+ * "pci_dev_pm_ops" uses the "noirq" callbacks: in the resume path, the
+ * pci "noirq" restore callback runs before "non-noirq" callbacks (see
+ * resume_target_kernel() -> dpm_resume_start(), and hibernation_restore() ->
+ * dpm_resume_end()). This means vmbus_bus_resume() and the pci-hyperv's
+ * resume callback must also run via the "noirq" callbacks.
+ */
+static const struct dev_pm_ops vmbus_bus_pm = {
+	SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_bus_suspend, vmbus_bus_resume)
+};
+
 static struct acpi_driver vmbus_acpi_driver = {
 	.name = "vmbus",
 	.ids = vmbus_acpi_device_ids,
@@ -2056,6 +2275,7 @@
 		.add = vmbus_acpi_add,
 		.remove = vmbus_acpi_remove,
 	},
+	.drv.pm = &vmbus_bus_pm,
 };
 
 static void hv_kexec_handler(void)
@@ -2086,6 +2306,47 @@
 	hyperv_cleanup();
 };
 
+static int hv_synic_suspend(void)
+{
+	/*
+	 * When we reach here, all the non-boot CPUs have been offlined, and
+	 * the stimers on them have been unbound in hv_synic_cleanup() ->
+	 * hv_stimer_cleanup() -> clockevents_unbind_device().
+	 *
+	 * hv_synic_suspend() only runs on CPU0 with interrupts disabled. Here
+	 * we do not unbind the stimer on CPU0 because: 1) it's unnecessary,
+	 * as the interrupts remain disabled between syscore_suspend()
+	 * and syscore_resume(): see create_image() and resume_target_kernel();
+	 * 2) the stimer on CPU0 is automatically disabled later by
+	 * syscore_suspend() -> timekeeping_suspend() -> tick_suspend() -> ...
+	 * -> clockevents_shutdown() -> ... -> hv_ce_shutdown(); 3) a warning
+	 * would be triggered if we call clockevents_unbind_device(), which
+	 * may sleep, in an interrupts-disabled context. So, we intentionally
+	 * don't call hv_stimer_cleanup(0) here.
+	 */
+
+	hv_synic_disable_regs(0);
+
+	return 0;
+}
+
+static void hv_synic_resume(void)
+{
+	hv_synic_enable_regs(0);
+
+	/*
+	 * Note: we don't need to call hv_stimer_init(0), because the timer
+	 * on CPU0 is not unbound in hv_synic_suspend(), and the timer is
+	 * automatically re-enabled in timekeeping_resume().
+	 */
+}
+
+/* The callbacks run only on CPU0, with irqs_disabled. */
+static struct syscore_ops hv_synic_syscore_ops = {
+	.suspend = hv_synic_suspend,
+	.resume = hv_synic_resume,
+};
+
 static int __init hv_acpi_init(void)
 {
 	int ret, t;
@@ -2116,6 +2377,8 @@
 	hv_setup_kexec_handler(hv_kexec_handler);
 	hv_setup_crash_handler(hv_crash_handler);
 
+	register_syscore_ops(&hv_synic_syscore_ops);
+
 	return 0;
 
 cleanup:
@@ -2128,6 +2391,8 @@
 {
 	int cpu;
 
+	unregister_syscore_ops(&hv_synic_syscore_ops);
+
 	hv_remove_kexec_handler();
 	hv_remove_crash_handler();
 	vmbus_connection.conn_state = DISCONNECTED;
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index 5587215..146ce40 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -429,6 +429,7 @@
 	tristate "Axxia I2C controller"
 	depends on ARCH_AXXIA || COMPILE_TEST
 	default ARCH_AXXIA
+	select I2C_SLAVE
 	help
 	  Say yes if you want to support the I2C bus on Axxia platforms.
 
@@ -977,7 +978,7 @@
 	  will be called i2c-sirf.
 
 config I2C_SPRD
-	bool "Spreadtrum I2C interface"
+	tristate "Spreadtrum I2C interface"
 	depends on I2C=y && ARCH_SPRD
 	help
 	  If you say yes to this option, support will be included for the
@@ -1309,6 +1310,20 @@
 	  This support is also available as a module.  If so, the module
 	  will be called i2c-elektor.
 
+config I2C_ICY
+	tristate "ICY Zorro card"
+	depends on ZORRO
+	select I2C_ALGOPCF
+	help
+	  This supports the PCF8584 Zorro bus I2C adapter, known as ICY.
+	  Say Y if you own such an adapter.
+
+	  This support is also available as a module.  If so, the module
+	  will be called i2c-icy.
+
+	  If you have a 2019 edition board with an LTC2990 sensor at address
+	  0x4c, loading the module 'ltc2990' is sufficient to enable it.
+
 config I2C_MLXCPLD
 	tristate "Mellanox I2C driver"
 	depends on X86_64
diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
index 80c2389..3ab8aeb 100644
--- a/drivers/i2c/busses/Makefile
+++ b/drivers/i2c/busses/Makefile
@@ -140,6 +140,7 @@
 obj-$(CONFIG_I2C_BRCMSTB)	+= i2c-brcmstb.o
 obj-$(CONFIG_I2C_CROS_EC_TUNNEL)	+= i2c-cros-ec-tunnel.o
 obj-$(CONFIG_I2C_ELEKTOR)	+= i2c-elektor.o
+obj-$(CONFIG_I2C_ICY)		+= i2c-icy.o
 obj-$(CONFIG_I2C_MLXCPLD)	+= i2c-mlxcpld.o
 obj-$(CONFIG_I2C_OPAL)		+= i2c-opal.o
 obj-$(CONFIG_I2C_PCA_ISA)	+= i2c-pca-isa.o
diff --git a/drivers/i2c/busses/i2c-axxia.c b/drivers/i2c/busses/i2c-axxia.c
index ff3142b..0214daa 100644
--- a/drivers/i2c/busses/i2c-axxia.c
+++ b/drivers/i2c/busses/i2c-axxia.c
@@ -77,6 +77,40 @@
 				 MST_STATUS_IP)
 #define MST_TX_BYTES_XFRD	0x50
 #define MST_RX_BYTES_XFRD	0x54
+#define SLV_ADDR_DEC_CTL	0x58
+#define   SLV_ADDR_DEC_GCE	BIT(0)  /* ACK to General Call Address from own master (loopback) */
+#define   SLV_ADDR_DEC_OGCE	BIT(1)  /* ACK to General Call Address from external masters */
+#define   SLV_ADDR_DEC_SA1E	BIT(2)  /* ACK to addr_1 enabled */
+#define   SLV_ADDR_DEC_SA1M	BIT(3)  /* 10-bit addressing for addr_1 enabled */
+#define   SLV_ADDR_DEC_SA2E	BIT(4)  /* ACK to addr_2 enabled */
+#define   SLV_ADDR_DEC_SA2M	BIT(5)  /* 10-bit addressing for addr_2 enabled */
+#define SLV_ADDR_1		0x5c
+#define SLV_ADDR_2		0x60
+#define SLV_RX_CTL		0x64
+#define   SLV_RX_ACSA1		BIT(0)  /* Generate ACK for writes to addr_1 */
+#define   SLV_RX_ACSA2		BIT(1)  /* Generate ACK for writes to addr_2 */
+#define   SLV_RX_ACGCA		BIT(2)  /* ACK data phase transfers to General Call Address */
+#define SLV_DATA		0x68
+#define SLV_RX_FIFO		0x6c
+#define   SLV_FIFO_DV1		BIT(0)  /* Data Valid for addr_1 */
+#define   SLV_FIFO_DV2		BIT(1)  /* Data Valid for addr_2 */
+#define   SLV_FIFO_AS		BIT(2)  /* (N)ACK Sent */
+#define   SLV_FIFO_TNAK		BIT(3)  /* Timeout NACK */
+#define   SLV_FIFO_STRC		BIT(4)  /* First byte after start condition received */
+#define   SLV_FIFO_RSC		BIT(5)  /* Repeated Start Condition */
+#define   SLV_FIFO_STPC		BIT(6)  /* Stop Condition */
+#define   SLV_FIFO_DV		(SLV_FIFO_DV1 | SLV_FIFO_DV2)
+#define SLV_INT_ENABLE		0x70
+#define SLV_INT_STATUS		0x74
+#define   SLV_STATUS_RFH	BIT(0)  /* FIFO service */
+#define   SLV_STATUS_WTC	BIT(1)  /* Write transfer complete */
+#define   SLV_STATUS_SRS1	BIT(2)  /* Slave read from addr 1 */
+#define   SLV_STATUS_SRRS1	BIT(3)  /* Repeated start from addr 1 */
+#define   SLV_STATUS_SRND1	BIT(4)  /* Read request not following start condition */
+#define   SLV_STATUS_SRC1	BIT(5)  /* Read canceled */
+#define   SLV_STATUS_SRAT1	BIT(6)  /* Slave Read timed out */
+#define   SLV_STATUS_SRDRE1	BIT(7)  /* Data written after timed out */
+#define SLV_READ_DUMMY		0x78
 #define SCL_HIGH_PERIOD		0x80
 #define SCL_LOW_PERIOD		0x84
 #define SPIKE_FLTR_LEN		0x88
@@ -111,6 +145,8 @@
 	struct clk *i2c_clk;
 	u32 bus_clk_rate;
 	bool last;
+	struct i2c_client *slave;
+	int irq;
 };
 
 static void i2c_int_disable(struct axxia_i2c_dev *idev, u32 mask)
@@ -276,13 +312,65 @@
 	return ret;
 }
 
+static void axxia_i2c_slv_fifo_event(struct axxia_i2c_dev *idev)
+{
+	u32 fifo_status = readl(idev->base + SLV_RX_FIFO);
+	u8 val;
+
+	dev_dbg(idev->dev, "slave irq fifo_status=0x%x\n", fifo_status);
+
+	if (fifo_status & SLV_FIFO_DV1) {
+		if (fifo_status & SLV_FIFO_STRC)
+			i2c_slave_event(idev->slave,
+					I2C_SLAVE_WRITE_REQUESTED, &val);
+
+		val = readl(idev->base + SLV_DATA);
+		i2c_slave_event(idev->slave, I2C_SLAVE_WRITE_RECEIVED, &val);
+	}
+	if (fifo_status & SLV_FIFO_STPC) {
+		readl(idev->base + SLV_DATA); /* dummy read */
+		i2c_slave_event(idev->slave, I2C_SLAVE_STOP, &val);
+	}
+	if (fifo_status & SLV_FIFO_RSC)
+		readl(idev->base + SLV_DATA); /* dummy read */
+}
+
+static irqreturn_t axxia_i2c_slv_isr(struct axxia_i2c_dev *idev)
+{
+	u32 status = readl(idev->base + SLV_INT_STATUS);
+	u8 val;
+
+	dev_dbg(idev->dev, "slave irq status=0x%x\n", status);
+
+	if (status & SLV_STATUS_RFH)
+		axxia_i2c_slv_fifo_event(idev);
+	if (status & SLV_STATUS_SRS1) {
+		i2c_slave_event(idev->slave, I2C_SLAVE_READ_REQUESTED, &val);
+		writel(val, idev->base + SLV_DATA);
+	}
+	if (status & SLV_STATUS_SRND1) {
+		i2c_slave_event(idev->slave, I2C_SLAVE_READ_PROCESSED, &val);
+		writel(val, idev->base + SLV_DATA);
+	}
+	if (status & SLV_STATUS_SRC1)
+		i2c_slave_event(idev->slave, I2C_SLAVE_STOP, &val);
+
+	writel(INT_SLV, idev->base + INTERRUPT_STATUS);
+	return IRQ_HANDLED;
+}
+
 static irqreturn_t axxia_i2c_isr(int irq, void *_dev)
 {
 	struct axxia_i2c_dev *idev = _dev;
+	irqreturn_t ret = IRQ_NONE;
 	u32 status;
 
-	if (!(readl(idev->base + INTERRUPT_STATUS) & INT_MST))
-		return IRQ_NONE;
+	status = readl(idev->base + INTERRUPT_STATUS);
+
+	if (status & INT_SLV)
+		ret = axxia_i2c_slv_isr(idev);
+	if (!(status & INT_MST))
+		return ret;
 
 	/* Read interrupt status bits */
 	status = readl(idev->base + MST_INT_STATUS);
@@ -583,9 +671,58 @@
 	return caps;
 }
 
+static int axxia_i2c_reg_slave(struct i2c_client *slave)
+{
+	struct axxia_i2c_dev *idev = i2c_get_adapdata(slave->adapter);
+	u32 slv_int_mask = SLV_STATUS_RFH;
+	u32 dec_ctl;
+
+	if (idev->slave)
+		return -EBUSY;
+
+	idev->slave = slave;
+
+	/* Enable slave mode as well */
+	writel(GLOBAL_MST_EN | GLOBAL_SLV_EN, idev->base + GLOBAL_CONTROL);
+	writel(INT_MST | INT_SLV, idev->base + INTERRUPT_ENABLE);
+
+	/* Set slave address */
+	dec_ctl = SLV_ADDR_DEC_SA1E;
+	if (slave->flags & I2C_CLIENT_TEN)
+		dec_ctl |= SLV_ADDR_DEC_SA1M;
+
+	writel(SLV_RX_ACSA1, idev->base + SLV_RX_CTL);
+	writel(dec_ctl, idev->base + SLV_ADDR_DEC_CTL);
+	writel(slave->addr, idev->base + SLV_ADDR_1);
+
+	/* Enable interrupts */
+	slv_int_mask |= SLV_STATUS_SRS1 | SLV_STATUS_SRRS1 | SLV_STATUS_SRND1;
+	slv_int_mask |= SLV_STATUS_SRC1;
+	writel(slv_int_mask, idev->base + SLV_INT_ENABLE);
+
+	return 0;
+}
+
+static int axxia_i2c_unreg_slave(struct i2c_client *slave)
+{
+	struct axxia_i2c_dev *idev = i2c_get_adapdata(slave->adapter);
+
+	/* Disable slave mode */
+	writel(GLOBAL_MST_EN, idev->base + GLOBAL_CONTROL);
+	writel(INT_MST, idev->base + INTERRUPT_ENABLE);
+
+	synchronize_irq(idev->irq);
+
+	idev->slave = NULL;
+
+	return 0;
+}
+
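With reg_slave/unreg_slave wired into the algorithm below, the adapter can host a slave backend; the ISR above feeds it through i2c_slave_event(). A minimal sketch of such a backend callback, modeled on i2c-slave-eeprom (sketch_slave_cb is hypothetical):

	static int sketch_slave_cb(struct i2c_client *client,
				   enum i2c_slave_event event, u8 *val)
	{
		switch (event) {
		case I2C_SLAVE_READ_REQUESTED:
		case I2C_SLAVE_READ_PROCESSED:
			*val = 0x42;	/* next byte to send to the master */
			break;
		case I2C_SLAVE_WRITE_RECEIVED:
			/* *val holds the byte the master just wrote */
			break;
		default:
			break;
		}
		return 0;
	}

	/* In the backend's probe(): i2c_slave_register(client, sketch_slave_cb); */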
 static const struct i2c_algorithm axxia_i2c_algo = {
 	.master_xfer = axxia_i2c_xfer,
 	.functionality = axxia_i2c_func,
+	.reg_slave = axxia_i2c_reg_slave,
+	.unreg_slave = axxia_i2c_unreg_slave,
 };
 
 static const struct i2c_adapter_quirks axxia_i2c_quirks = {
@@ -599,7 +736,6 @@
 	struct axxia_i2c_dev *idev = NULL;
 	struct resource *res;
 	void __iomem *base;
-	int irq;
 	int ret = 0;
 
 	idev = devm_kzalloc(&pdev->dev, sizeof(*idev), GFP_KERNEL);
@@ -611,10 +747,10 @@
 	if (IS_ERR(base))
 		return PTR_ERR(base);
 
-	irq = platform_get_irq(pdev, 0);
-	if (irq < 0) {
+	idev->irq = platform_get_irq(pdev, 0);
+	if (idev->irq < 0) {
 		dev_err(&pdev->dev, "missing interrupt resource\n");
-		return irq;
+		return idev->irq;
 	}
 
 	idev->i2c_clk = devm_clk_get(&pdev->dev, "i2c");
@@ -643,10 +779,10 @@
 		goto error_disable_clk;
 	}
 
-	ret = devm_request_irq(&pdev->dev, irq, axxia_i2c_isr, 0,
+	ret = devm_request_irq(&pdev->dev, idev->irq, axxia_i2c_isr, 0,
 			       pdev->name, idev);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to claim IRQ%d\n", irq);
+		dev_err(&pdev->dev, "failed to claim IRQ%d\n", idev->irq);
 		goto error_disable_clk;
 	}
 
diff --git a/drivers/i2c/busses/i2c-bcm-iproc.c b/drivers/i2c/busses/i2c-bcm-iproc.c
index 19ef2b0..9ffdffa 100644
--- a/drivers/i2c/busses/i2c-bcm-iproc.c
+++ b/drivers/i2c/busses/i2c-bcm-iproc.c
@@ -808,7 +808,7 @@
 	.unreg_slave = bcm_iproc_i2c_unreg_slave,
 };
 
-static struct i2c_adapter_quirks bcm_iproc_i2c_quirks = {
+static const struct i2c_adapter_quirks bcm_iproc_i2c_quirks = {
 	.max_read_len = M_RX_MAX_READ_LEN,
 };
 
@@ -922,7 +922,9 @@
 
 	adap = &iproc_i2c->adapter;
 	i2c_set_adapdata(adap, iproc_i2c);
-	strlcpy(adap->name, "Broadcom iProc I2C adapter", sizeof(adap->name));
+	snprintf(adap->name, sizeof(adap->name),
+		"Broadcom iProc (%s)",
+		of_node_full_name(iproc_i2c->device->of_node));
 	adap->algo = &bcm_iproc_algo;
 	adap->quirks = &bcm_iproc_i2c_quirks;
 	adap->dev.parent = &pdev->dev;
diff --git a/drivers/i2c/busses/i2c-bcm2835.c b/drivers/i2c/busses/i2c-bcm2835.c
index 67752f7..e01b2b5 100644
--- a/drivers/i2c/busses/i2c-bcm2835.c
+++ b/drivers/i2c/busses/i2c-bcm2835.c
@@ -12,6 +12,7 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/module.h>
+#include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 
@@ -389,7 +390,7 @@
 };
 
 /*
- * This HW was reported to have problems with clock stretching:
+ * The BCM2835 was reported to have problems with clock stretching:
  * http://www.advamation.com/knowhow/raspberrypi/rpi-i2c-bug.html
  * https://www.raspberrypi.org/forums/viewtopic.php?p=146272
  */
@@ -471,11 +472,12 @@
 	i2c_set_adapdata(adap, i2c_dev);
 	adap->owner = THIS_MODULE;
 	adap->class = I2C_CLASS_DEPRECATED;
-	strlcpy(adap->name, "bcm2835 I2C adapter", sizeof(adap->name));
+	snprintf(adap->name, sizeof(adap->name), "bcm2835 (%s)",
+		 of_node_full_name(pdev->dev.of_node));
 	adap->algo = &bcm2835_i2c_algo;
 	adap->dev.parent = &pdev->dev;
 	adap->dev.of_node = pdev->dev.of_node;
-	adap->quirks = &bcm2835_i2c_quirks;
+	adap->quirks = of_device_get_match_data(&pdev->dev);
 
 	bcm2835_i2c_writel(i2c_dev, BCM2835_I2C_C, 0);
 
@@ -501,7 +503,8 @@
 }
 
 static const struct of_device_id bcm2835_i2c_of_match[] = {
-	{ .compatible = "brcm,bcm2835-i2c" },
+	{ .compatible = "brcm,bcm2711-i2c" },
+	{ .compatible = "brcm,bcm2835-i2c", .data = &bcm2835_i2c_quirks },
 	{},
 };
 MODULE_DEVICE_TABLE(of, bcm2835_i2c_of_match);
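The bcm2835 change above is the standard match-data pattern: per-compatible quirks hang off .data and are fetched with of_device_get_match_data(), and the new bcm2711 entry deliberately carries no .data, so adap->quirks ends up NULL (no quirks). A sketch with hypothetical compatibles:

	static const struct i2c_adapter_quirks chip_a_quirks = {
		.flags = I2C_AQ_NO_CLK_STRETCH,
	};

	static const struct of_device_id sketch_of_match[] = {
		{ .compatible = "vendor,chip-a", .data = &chip_a_quirks },
		{ .compatible = "vendor,chip-b" },	/* no quirks */
		{ }
	};

	/* In probe(): adap->quirks = of_device_get_match_data(&pdev->dev); */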
diff --git a/drivers/i2c/busses/i2c-cht-wc.c b/drivers/i2c/busses/i2c-cht-wc.c
index 66af44b..b8fde61 100644
--- a/drivers/i2c/busses/i2c-cht-wc.c
+++ b/drivers/i2c/busses/i2c-cht-wc.c
@@ -178,6 +178,51 @@
 	.smbus_xfer = cht_wc_i2c_adap_smbus_xfer,
 };
 
+/*
+ * We are an i2c-adapter which itself is part of an i2c-client. This means that
+ * transfers done through us take adapter->bus_lock twice, once for our parent
+ * i2c-adapter and once to take our own bus_lock. Lockdep does not like this
+ * nested locking; to make lockdep happy in the case of busses with muxes, the
+ * i2c-core's i2c_adapter_lock_bus function calls:
+ * rt_mutex_lock_nested(&adapter->bus_lock, i2c_adapter_depth(adapter));
+ *
+ * But i2c_adapter_depth only works when the direct parent of the adapter is
+ * another adapter, as it is only meant for muxes. In our case there is an
+ * i2c-client and an MFD-instantiated platform_device in the parent->child chain
+ * between the 2 devices.
+ *
+ * So we override the default i2c_lock_operations and pass a hardcoded
+ * depth of 1 to rt_mutex_lock_nested, to make lockdep happy.
+ *
+ * Note that if there were to be a mux attached to our adapter, this would
+ * break things again since the i2c-mux code expects the root-adapter to have
+ * a locking depth of 0. But we always have only 1 client directly attached
+ * in the form of the Charger IC paired with the CHT Whiskey Cove PMIC.
+ */
+static void cht_wc_i2c_adap_lock_bus(struct i2c_adapter *adapter,
+				 unsigned int flags)
+{
+	rt_mutex_lock_nested(&adapter->bus_lock, 1);
+}
+
+static int cht_wc_i2c_adap_trylock_bus(struct i2c_adapter *adapter,
+				   unsigned int flags)
+{
+	return rt_mutex_trylock(&adapter->bus_lock);
+}
+
+static void cht_wc_i2c_adap_unlock_bus(struct i2c_adapter *adapter,
+				   unsigned int flags)
+{
+	rt_mutex_unlock(&adapter->bus_lock);
+}
+
+static const struct i2c_lock_operations cht_wc_i2c_adap_lock_ops = {
+	.lock_bus =    cht_wc_i2c_adap_lock_bus,
+	.trylock_bus = cht_wc_i2c_adap_trylock_bus,
+	.unlock_bus =  cht_wc_i2c_adap_unlock_bus,
+};
+
 /**** irqchip for the client connected to the extchgr i2c adapter ****/
 static void cht_wc_i2c_irq_lock(struct irq_data *data)
 {
@@ -286,6 +331,7 @@
 	adap->adapter.owner = THIS_MODULE;
 	adap->adapter.class = I2C_CLASS_HWMON;
 	adap->adapter.algo = &cht_wc_i2c_adap_algo;
+	adap->adapter.lock_ops = &cht_wc_i2c_adap_lock_ops;
 	strlcpy(adap->adapter.name, "PMIC I2C Adapter",
 		sizeof(adap->adapter.name));
 	adap->adapter.dev.parent = &pdev->dev;
@@ -363,8 +409,7 @@
 {
 	struct cht_wc_i2c_adap *adap = platform_get_drvdata(pdev);
 
-	if (adap->client)
-		i2c_unregister_device(adap->client);
+	i2c_unregister_device(adap->client);
 	i2c_del_adapter(&adap->adapter);
 	irq_domain_remove(adap->irq_domain);
 
diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c
index d464799..e8b3282 100644
--- a/drivers/i2c/busses/i2c-designware-master.c
+++ b/drivers/i2c/busses/i2c-designware-master.c
@@ -655,15 +655,11 @@
 	struct i2c_bus_recovery_info *rinfo = &dev->rinfo;
 	struct i2c_adapter *adap = &dev->adapter;
 	struct gpio_desc *gpio;
-	int r;
 
-	gpio = devm_gpiod_get(dev->dev, "scl", GPIOD_OUT_HIGH);
-	if (IS_ERR(gpio)) {
-		r = PTR_ERR(gpio);
-		if (r == -ENOENT || r == -ENOSYS)
-			return 0;
-		return r;
-	}
+	gpio = devm_gpiod_get_optional(dev->dev, "scl", GPIOD_OUT_HIGH);
+	if (IS_ERR_OR_NULL(gpio))
+		return PTR_ERR_OR_ZERO(gpio);
+
 	rinfo->scl_gpiod = gpio;
 
 	gpio = devm_gpiod_get_optional(dev->dev, "sda", GPIOD_IN);
diff --git a/drivers/i2c/busses/i2c-designware-pcidrv.c b/drivers/i2c/busses/i2c-designware-pcidrv.c
index 76810de..050adda 100644
--- a/drivers/i2c/busses/i2c-designware-pcidrv.c
+++ b/drivers/i2c/busses/i2c-designware-pcidrv.c
@@ -33,6 +33,7 @@
 	baytrail,
 	cherrytrail,
 	haswell,
+	elkhartlake,
 };
 
 struct dw_scl_sda_cfg {
@@ -168,13 +169,20 @@
 		.flags = MODEL_CHERRYTRAIL,
 		.scl_sda_cfg = &byt_config,
 	},
+	[elkhartlake] = {
+		.bus_num = -1,
+		.bus_cfg = INTEL_MID_STD_CFG | DW_IC_CON_SPEED_FAST,
+		.tx_fifo_depth = 32,
+		.rx_fifo_depth = 32,
+		.functionality = I2C_FUNC_10BIT_ADDR,
+		.clk_khz = 100000,
+	},
 };
 
 #ifdef CONFIG_PM
 static int i2c_dw_pci_suspend(struct device *dev)
 {
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct dw_i2c_dev *i_dev = pci_get_drvdata(pdev);
+	struct dw_i2c_dev *i_dev = dev_get_drvdata(dev);
 
 	i_dev->suspended = true;
 	i_dev->disable(i_dev);
@@ -184,8 +192,7 @@
 
 static int i2c_dw_pci_resume(struct device *dev)
 {
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct dw_i2c_dev *i_dev = pci_get_drvdata(pdev);
+	struct dw_i2c_dev *i_dev = dev_get_drvdata(dev);
 	int ret;
 
 	ret = i_dev->init(i_dev);
@@ -227,6 +234,8 @@
 		return r;
 	}
 
+	pci_set_master(pdev);
+
 	r = pcim_iomap_regions(pdev, 1 << 0, pci_name(pdev));
 	if (r) {
 		dev_err(&pdev->dev, "I/O memory remapping failed\n");
@@ -237,18 +246,24 @@
 	if (!dev)
 		return -ENOMEM;
 
+	r = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
+	if (r < 0)
+		return r;
+
 	dev->clk = NULL;
 	dev->controller = controller;
 	dev->get_clk_rate_khz = i2c_dw_get_clk_rate_khz;
 	dev->base = pcim_iomap_table(pdev)[0];
 	dev->dev = &pdev->dev;
-	dev->irq = pdev->irq;
+	dev->irq = pci_irq_vector(pdev, 0);
 	dev->flags |= controller->flags;
 
 	if (controller->setup) {
 		r = controller->setup(pdev, controller);
-		if (r)
+		if (r) {
+			pci_free_irq_vectors(pdev);
 			return r;
+		}
 	}
 
 	dev->functionality = controller->functionality |
@@ -276,8 +291,10 @@
 	adap->nr = controller->bus_num;
 
 	r = i2c_dw_probe(dev);
-	if (r)
+	if (r) {
+		pci_free_irq_vectors(pdev);
 		return r;
+	}
 
 	pm_runtime_set_autosuspend_delay(&pdev->dev, 1000);
 	pm_runtime_use_autosuspend(&pdev->dev);
@@ -296,6 +313,7 @@
 	pm_runtime_get_noresume(&pdev->dev);
 
 	i2c_del_adapter(&dev->adapter);
+	pci_free_irq_vectors(pdev);
 }
 
 /* work with hotplug and coldplug */
@@ -331,6 +349,15 @@
 	{ PCI_VDEVICE(INTEL, 0x22C5), cherrytrail },
 	{ PCI_VDEVICE(INTEL, 0x22C6), cherrytrail },
 	{ PCI_VDEVICE(INTEL, 0x22C7), cherrytrail },
+	/* Elkhart Lake (PSE I2C) */
+	{ PCI_VDEVICE(INTEL, 0x4bb9), elkhartlake },
+	{ PCI_VDEVICE(INTEL, 0x4bba), elkhartlake },
+	{ PCI_VDEVICE(INTEL, 0x4bbb), elkhartlake },
+	{ PCI_VDEVICE(INTEL, 0x4bbc), elkhartlake },
+	{ PCI_VDEVICE(INTEL, 0x4bbd), elkhartlake },
+	{ PCI_VDEVICE(INTEL, 0x4bbe), elkhartlake },
+	{ PCI_VDEVICE(INTEL, 0x4bbf), elkhartlake },
+	{ PCI_VDEVICE(INTEL, 0x4bc0), elkhartlake },
 	{ 0,}
 };
 MODULE_DEVICE_TABLE(pci, i2_designware_pci_ids);
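The probe now asks the PCI core for exactly one vector of any type (MSI-X/MSI preferred, legacy INTx as fallback) and has to free it on every exit path after allocation, because pci_alloc_irq_vectors() is not devres-managed. A sketch of the idiom; sketch_setup() is a placeholder:

	static int sketch_pci_probe(struct pci_dev *pdev)
	{
		int ret, irq;

		ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
		if (ret < 0)
			return ret;

		irq = pci_irq_vector(pdev, 0);	/* Linux IRQ number of vector 0 */

		ret = sketch_setup(pdev, irq);
		if (ret)
			pci_free_irq_vectors(pdev);	/* manual cleanup required */
		return ret;
	}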
diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c
index ddfb818..16dd338 100644
--- a/drivers/i2c/busses/i2c-designware-platdrv.c
+++ b/drivers/i2c/busses/i2c-designware-platdrv.c
@@ -279,12 +279,10 @@
 	platform_set_drvdata(pdev, dev);
 
 	dev->rst = devm_reset_control_get_optional_exclusive(&pdev->dev, NULL);
-	if (IS_ERR(dev->rst)) {
-		if (PTR_ERR(dev->rst) == -EPROBE_DEFER)
-			return -EPROBE_DEFER;
-	} else {
-		reset_control_deassert(dev->rst);
-	}
+	if (IS_ERR(dev->rst))
+		return PTR_ERR(dev->rst);
+
+	reset_control_deassert(dev->rst);
 
 	t = &dev->timings;
 	if (pdata)
@@ -346,8 +344,10 @@
 
 	/* Optional interface clock */
 	dev->pclk = devm_clk_get_optional(&pdev->dev, "pclk");
-	if (IS_ERR(dev->pclk))
-		return PTR_ERR(dev->pclk);
+	if (IS_ERR(dev->pclk)) {
+		ret = PTR_ERR(dev->pclk);
+		goto exit_reset;
+	}
 
 	dev->clk = devm_clk_get(&pdev->dev, NULL);
 	if (!i2c_dw_prepare_clk(dev, true)) {
@@ -400,8 +400,7 @@
 exit_probe:
 	dw_i2c_plat_pm_cleanup(dev);
 exit_reset:
-	if (!IS_ERR_OR_NULL(dev->rst))
-		reset_control_assert(dev->rst);
+	reset_control_assert(dev->rst);
 	return ret;
 }
 
@@ -419,8 +418,7 @@
 	pm_runtime_put_sync(&pdev->dev);
 	dw_i2c_plat_pm_cleanup(dev);
 
-	if (!IS_ERR_OR_NULL(dev->rst))
-		reset_control_assert(dev->rst);
+	reset_control_assert(dev->rst);
 
 	return 0;
 }
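The IS_ERR_OR_NULL() guards could be dropped above because the reset API is NULL-safe for optional resets: devm_reset_control_get_optional_exclusive() returns NULL when the device has no reset line, and reset_control_assert()/reset_control_deassert() are no-ops on a NULL handle. In sketch form:

	static int sketch_probe(struct device *dev)
	{
		struct reset_control *rst;

		rst = devm_reset_control_get_optional_exclusive(dev, NULL);
		if (IS_ERR(rst))
			return PTR_ERR(rst);	/* real errors, incl. -EPROBE_DEFER */

		reset_control_deassert(rst);	/* no-op when rst == NULL */
		return 0;
	}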
diff --git a/drivers/i2c/busses/i2c-exynos5.c b/drivers/i2c/busses/i2c-exynos5.c
index e4e7932..e7514c1 100644
--- a/drivers/i2c/busses/i2c-exynos5.c
+++ b/drivers/i2c/busses/i2c-exynos5.c
@@ -791,9 +791,7 @@
 	}
 
 	ret = devm_request_irq(&pdev->dev, i2c->irq, exynos5_i2c_irq,
-				IRQF_NO_SUSPEND | IRQF_ONESHOT,
-				dev_name(&pdev->dev), i2c);
-
+			       IRQF_NO_SUSPEND, dev_name(&pdev->dev), i2c);
 	if (ret != 0) {
 		dev_err(&pdev->dev, "cannot request HS-I2C IRQ %d\n", i2c->irq);
 		goto err_clk;
diff --git a/drivers/i2c/busses/i2c-fsi.c b/drivers/i2c/busses/i2c-fsi.c
index da5eb39..e0c2569 100644
--- a/drivers/i2c/busses/i2c-fsi.c
+++ b/drivers/i2c/busses/i2c-fsi.c
@@ -707,8 +707,10 @@
 			continue;
 
 		port = kzalloc(sizeof(*port), GFP_KERNEL);
-		if (!port)
+		if (!port) {
+			of_node_put(np);
 			break;
+		}
 
 		port->master = i2c;
 		port->port = port_no;
diff --git a/drivers/i2c/busses/i2c-hix5hd2.c b/drivers/i2c/busses/i2c-hix5hd2.c
index 4df1434..8497c7a 100644
--- a/drivers/i2c/busses/i2c-hix5hd2.c
+++ b/drivers/i2c/busses/i2c-hix5hd2.c
@@ -445,8 +445,7 @@
 	hix5hd2_i2c_init(priv);
 
 	ret = devm_request_irq(&pdev->dev, irq, hix5hd2_i2c_irq,
-			       IRQF_NO_SUSPEND | IRQF_ONESHOT,
-			       dev_name(&pdev->dev), priv);
+			       IRQF_NO_SUSPEND, dev_name(&pdev->dev), priv);
 	if (ret != 0) {
 		dev_err(&pdev->dev, "cannot request HS-I2C IRQ %d\n", irq);
 		goto err_clk;
diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c
index 36e9559..f1c714a 100644
--- a/drivers/i2c/busses/i2c-i801.c
+++ b/drivers/i2c/busses/i2c-i801.c
@@ -292,7 +292,8 @@
 #define FEATURE_HOST_NOTIFY	BIT(5)
 /* Not really a feature, but it's convenient to handle it as such */
 #define FEATURE_IDF		BIT(15)
-#define FEATURE_TCO		BIT(16)
+#define FEATURE_TCO_SPT		BIT(16)
+#define FEATURE_TCO_CNL		BIT(17)
 
 static const char *i801_feature_names[] = {
 	"SMBus PEC",
@@ -1500,57 +1501,23 @@
 }
 #endif
 
-static const struct itco_wdt_platform_data tco_platform_data = {
+static const struct itco_wdt_platform_data spt_tco_platform_data = {
 	.name = "Intel PCH",
 	.version = 4,
 };
 
 static DEFINE_SPINLOCK(p2sb_spinlock);
 
-static void i801_add_tco(struct i801_priv *priv)
+static struct platform_device *
+i801_add_tco_spt(struct i801_priv *priv, struct pci_dev *pci_dev,
+		 struct resource *tco_res)
 {
-	struct pci_dev *pci_dev = priv->pci_dev;
-	struct resource tco_res[3], *res;
-	struct platform_device *pdev;
+	struct resource *res;
 	unsigned int devfn;
-	u32 tco_base, tco_ctl;
-	u32 base_addr, ctrl_val;
 	u64 base64_addr;
+	u32 base_addr;
 	u8 hidden;
 
-	if (!(priv->features & FEATURE_TCO))
-		return;
-
-	pci_read_config_dword(pci_dev, TCOBASE, &tco_base);
-	pci_read_config_dword(pci_dev, TCOCTL, &tco_ctl);
-	if (!(tco_ctl & TCOCTL_EN))
-		return;
-
-	memset(tco_res, 0, sizeof(tco_res));
-
-	res = &tco_res[ICH_RES_IO_TCO];
-	res->start = tco_base & ~1;
-	res->end = res->start + 32 - 1;
-	res->flags = IORESOURCE_IO;
-
-	/*
-	 * Power Management registers.
-	 */
-	devfn = PCI_DEVFN(PCI_SLOT(pci_dev->devfn), 2);
-	pci_bus_read_config_dword(pci_dev->bus, devfn, ACPIBASE, &base_addr);
-
-	res = &tco_res[ICH_RES_IO_SMI];
-	res->start = (base_addr & ~1) + ACPIBASE_SMI_OFF;
-	res->end = res->start + 3;
-	res->flags = IORESOURCE_IO;
-
-	/*
-	 * Enable the ACPI I/O space.
-	 */
-	pci_bus_read_config_dword(pci_dev->bus, devfn, ACPICTRL, &ctrl_val);
-	ctrl_val |= ACPICTRL_EN;
-	pci_bus_write_config_dword(pci_dev->bus, devfn, ACPICTRL, ctrl_val);
-
 	/*
 	 * We must access the NO_REBOOT bit over the Primary to Sideband
 	 * bridge (P2SB). The BIOS prevents the P2SB device from being
@@ -1586,15 +1553,76 @@
 	res->end = res->start + 3;
 	res->flags = IORESOURCE_MEM;
 
-	pdev = platform_device_register_resndata(&pci_dev->dev, "iTCO_wdt", -1,
-						 tco_res, 3, &tco_platform_data,
-						 sizeof(tco_platform_data));
-	if (IS_ERR(pdev)) {
-		dev_warn(&pci_dev->dev, "failed to create iTCO device\n");
-		return;
-	}
+	return platform_device_register_resndata(&pci_dev->dev, "iTCO_wdt", -1,
+					tco_res, 3, &spt_tco_platform_data,
+					sizeof(spt_tco_platform_data));
+}
 
-	priv->tco_pdev = pdev;
+static const struct itco_wdt_platform_data cnl_tco_platform_data = {
+	.name = "Intel PCH",
+	.version = 6,
+};
+
+static struct platform_device *
+i801_add_tco_cnl(struct i801_priv *priv, struct pci_dev *pci_dev,
+		 struct resource *tco_res)
+{
+	return platform_device_register_resndata(&pci_dev->dev, "iTCO_wdt", -1,
+					tco_res, 2, &cnl_tco_platform_data,
+					sizeof(cnl_tco_platform_data));
+}
+
+static void i801_add_tco(struct i801_priv *priv)
+{
+	u32 base_addr, tco_base, tco_ctl, ctrl_val;
+	struct pci_dev *pci_dev = priv->pci_dev;
+	struct resource tco_res[3], *res;
+	unsigned int devfn;
+
+	/* If we have an ACPI-based watchdog, use that instead */
+	if (acpi_has_watchdog())
+		return;
+
+	if (!(priv->features & (FEATURE_TCO_SPT | FEATURE_TCO_CNL)))
+		return;
+
+	pci_read_config_dword(pci_dev, TCOBASE, &tco_base);
+	pci_read_config_dword(pci_dev, TCOCTL, &tco_ctl);
+	if (!(tco_ctl & TCOCTL_EN))
+		return;
+
+	memset(tco_res, 0, sizeof(tco_res));
+
+	res = &tco_res[ICH_RES_IO_TCO];
+	res->start = tco_base & ~1;
+	res->end = res->start + 32 - 1;
+	res->flags = IORESOURCE_IO;
+
+	/*
+	 * Power Management registers.
+	 */
+	devfn = PCI_DEVFN(PCI_SLOT(pci_dev->devfn), 2);
+	pci_bus_read_config_dword(pci_dev->bus, devfn, ACPIBASE, &base_addr);
+
+	res = &tco_res[ICH_RES_IO_SMI];
+	res->start = (base_addr & ~1) + ACPIBASE_SMI_OFF;
+	res->end = res->start + 3;
+	res->flags = IORESOURCE_IO;
+
+	/*
+	 * Enable the ACPI I/O space.
+	 */
+	pci_bus_read_config_dword(pci_dev->bus, devfn, ACPICTRL, &ctrl_val);
+	ctrl_val |= ACPICTRL_EN;
+	pci_bus_write_config_dword(pci_dev->bus, devfn, ACPICTRL, ctrl_val);
+
+	if (priv->features & FEATURE_TCO_CNL)
+		priv->tco_pdev = i801_add_tco_cnl(priv, pci_dev, tco_res);
+	else
+		priv->tco_pdev = i801_add_tco_spt(priv, pci_dev, tco_res);
+
+	if (IS_ERR(priv->tco_pdev))
+		dev_warn(&pci_dev->dev, "failed to create iTCO device\n");
 }
 
 #ifdef CONFIG_ACPI
@@ -1704,13 +1732,22 @@
 	switch (dev->device) {
 	case PCI_DEVICE_ID_INTEL_SUNRISEPOINT_H_SMBUS:
 	case PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_SMBUS:
-	case PCI_DEVICE_ID_INTEL_CANNONLAKE_H_SMBUS:
-	case PCI_DEVICE_ID_INTEL_CANNONLAKE_LP_SMBUS:
 	case PCI_DEVICE_ID_INTEL_LEWISBURG_SMBUS:
 	case PCI_DEVICE_ID_INTEL_LEWISBURG_SSKU_SMBUS:
-	case PCI_DEVICE_ID_INTEL_CDF_SMBUS:
 	case PCI_DEVICE_ID_INTEL_DNV_SMBUS:
 	case PCI_DEVICE_ID_INTEL_KABYLAKE_PCH_H_SMBUS:
+		priv->features |= FEATURE_BLOCK_PROC;
+		priv->features |= FEATURE_I2C_BLOCK_READ;
+		priv->features |= FEATURE_IRQ;
+		priv->features |= FEATURE_SMBUS_PEC;
+		priv->features |= FEATURE_BLOCK_BUFFER;
+		priv->features |= FEATURE_TCO_SPT;
+		priv->features |= FEATURE_HOST_NOTIFY;
+		break;
+
+	case PCI_DEVICE_ID_INTEL_CANNONLAKE_H_SMBUS:
+	case PCI_DEVICE_ID_INTEL_CANNONLAKE_LP_SMBUS:
+	case PCI_DEVICE_ID_INTEL_CDF_SMBUS:
 	case PCI_DEVICE_ID_INTEL_ICELAKE_LP_SMBUS:
 	case PCI_DEVICE_ID_INTEL_COMETLAKE_SMBUS:
 	case PCI_DEVICE_ID_INTEL_ELKHART_LAKE_SMBUS:
@@ -1720,9 +1757,7 @@
 		priv->features |= FEATURE_IRQ;
 		priv->features |= FEATURE_SMBUS_PEC;
 		priv->features |= FEATURE_BLOCK_BUFFER;
-		/* If we have ACPI based watchdog use that instead */
-		if (!acpi_has_watchdog())
-			priv->features |= FEATURE_TCO;
+		priv->features |= FEATURE_TCO_CNL;
 		priv->features |= FEATURE_HOST_NOTIFY;
 		break;
 
@@ -1921,8 +1956,7 @@
 
 static int i801_resume(struct device *dev)
 {
-	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct i801_priv *priv = pci_get_drvdata(pci_dev);
+	struct i801_priv *priv = dev_get_drvdata(dev);
 
 	i801_enable_host_notify(&priv->adapter);
 
diff --git a/drivers/i2c/busses/i2c-icy.c b/drivers/i2c/busses/i2c-icy.c
new file mode 100644
index 0000000..8382eb6
--- /dev/null
+++ b/drivers/i2c/busses/i2c-icy.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * I2C driver for stand-alone PCF8584 style adapters on Zorro cards
+ *
+ * Original ICY documentation can be found on Aminet:
+ * https://aminet.net/package/docs/hard/icy
+ *
+ * There has been a modern community re-print of this design in 2019:
+ * https://www.a1k.org/forum/index.php?threads/70106/
+ *
+ * The card is basically a Philips PCF8584 connected straight to the
+ * beginning of the AutoConfig'd address space (register S1 on base+2),
+ * with /INT on /INT2 on the Zorro bus.
+ *
+ * Copyright (c) 2019 Max Staudt <max@enpas.org>
+ *
+ * This started as a fork of i2c-elektor.c and has evolved since.
+ * Thanks go to its authors for providing a base to grow on.
+ *
+ *
+ * IRQ support is currently not implemented.
+ *
+ * As it turns out, i2c-algo-pcf is really written with i2c-elektor's
+ * edge-triggered ISA interrupts in mind, while the Amiga's Zorro bus has
+ * level-triggered interrupts. This means that once an interrupt occurs, we
+ * have to tell the PCF8584 to shut up immediately, or it will keep the
+ * interrupt line busy and cause an IRQ storm.
+ *
+ * However, because of the PCF8584's host-side protocol, there is no good
+ * way to just quieten it without side effects. Rather, we have to perform
+ * the next read/write operation straight away, which will reset the /INT
+ * pin. This entails re-designing the core of i2c-algo-pcf in the future.
+ * For now, we never request an IRQ from the PCF8584, and poll it instead.
+ */
+
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <linux/i2c.h>
+#include <linux/i2c-algo-pcf.h>
+
+#include <asm/amigaints.h>
+#include <linux/zorro.h>
+
+#include "../algos/i2c-algo-pcf.h"
+
+struct icy_i2c {
+	struct i2c_adapter adapter;
+
+	void __iomem *reg_s0;
+	void __iomem *reg_s1;
+	struct fwnode_handle *ltc2990_fwnode;
+	struct i2c_client *ltc2990_client;
+};
+
+/*
+ * Functions called by i2c-algo-pcf
+ */
+static void icy_pcf_setpcf(void *data, int ctl, int val)
+{
+	struct icy_i2c *i2c = (struct icy_i2c *)data;
+
+	u8 __iomem *address = ctl ? i2c->reg_s1 : i2c->reg_s0;
+
+	z_writeb(val, address);
+}
+
+static int icy_pcf_getpcf(void *data, int ctl)
+{
+	struct icy_i2c *i2c = (struct icy_i2c *)data;
+
+	u8 __iomem *address = ctl ? i2c->reg_s1 : i2c->reg_s0;
+
+	return z_readb(address);
+}
+
+static int icy_pcf_getown(void *data)
+{
+	return 0x55;
+}
+
+static int icy_pcf_getclock(void *data)
+{
+	return 0x1c;
+}
+
+static void icy_pcf_waitforpin(void *data)
+{
+	usleep_range(50, 150);
+}
+
+/*
+ * Main i2c-icy part
+ */
+static unsigned short const icy_ltc2990_addresses[] = {
+	0x4c, 0x4d, 0x4e, 0x4f, I2C_CLIENT_END
+};
+
+/*
+ * Additional sensors exposed once this property is applied:
+ *
+ * in1 will be the voltage of the 5V rail, divided by 2.
+ * in2 will be the voltage of the 12V rail, divided by 4.
+ * temp3 will be measured using a PCB loop next the chip.
+ */
+static const u32 icy_ltc2990_meas_mode[] = {0, 3};
+
+static const struct property_entry icy_ltc2990_props[] = {
+	PROPERTY_ENTRY_U32_ARRAY("lltc,meas-mode", icy_ltc2990_meas_mode),
+	{ }
+};
+
+static int icy_probe(struct zorro_dev *z,
+		     const struct zorro_device_id *ent)
+{
+	struct icy_i2c *i2c;
+	struct i2c_algo_pcf_data *algo_data;
+	struct fwnode_handle *new_fwnode;
+	struct i2c_board_info ltc2990_info = {
+		.type		= "ltc2990",
+		.addr		= 0x4c,
+	};
+
+	i2c = devm_kzalloc(&z->dev, sizeof(*i2c), GFP_KERNEL);
+	if (!i2c)
+		return -ENOMEM;
+
+	algo_data = devm_kzalloc(&z->dev, sizeof(*algo_data), GFP_KERNEL);
+	if (!algo_data)
+		return -ENOMEM;
+
+	dev_set_drvdata(&z->dev, i2c);
+	i2c->adapter.dev.parent = &z->dev;
+	i2c->adapter.owner = THIS_MODULE;
+	/* i2c->adapter.algo assigned by i2c_pcf_add_bus() */
+	i2c->adapter.algo_data = algo_data;
+	strlcpy(i2c->adapter.name, "ICY I2C Zorro adapter",
+		sizeof(i2c->adapter.name));
+
+	if (!devm_request_mem_region(&z->dev,
+				     z->resource.start,
+				     4, i2c->adapter.name))
+		return -ENXIO;
+
+	/* Driver private data */
+	i2c->reg_s0 = ZTWO_VADDR(z->resource.start);
+	i2c->reg_s1 = ZTWO_VADDR(z->resource.start + 2);
+
+	algo_data->data = i2c;
+	algo_data->setpcf     = icy_pcf_setpcf;
+	algo_data->getpcf     = icy_pcf_getpcf;
+	algo_data->getown     = icy_pcf_getown;
+	algo_data->getclock   = icy_pcf_getclock;
+	algo_data->waitforpin = icy_pcf_waitforpin;
+
+	if (i2c_pcf_add_bus(&i2c->adapter)) {
+		dev_err(&z->dev, "i2c_pcf_add_bus() failed\n");
+		return -ENXIO;
+	}
+
+	dev_info(&z->dev, "ICY I2C controller at %pa, IRQ not implemented\n",
+		 &z->resource.start);
+
+	/*
+	 * The 2019 a1k.org PCBs have an LTC2990 at 0x4c, so start
+	 * it automatically once ltc2990 is modprobed.
+	 *
+	 * in0 is the voltage of the internal 5V power supply.
+	 * temp1 is the temperature inside the chip.
+	 *
+	 * See property_entry above for in1, in2, temp3.
+	 */
+	new_fwnode = fwnode_create_software_node(icy_ltc2990_props, NULL);
+	if (IS_ERR(new_fwnode)) {
+		dev_info(&z->dev, "Failed to create fwnode for LTC2990, error: %ld\n",
+			 PTR_ERR(new_fwnode));
+	} else {
+		/*
+		 * Store the fwnode so we can destroy it on .remove().
+		 * Only store it on success, as fwnode_remove_software_node()
+		 * is NULL safe, but not PTR_ERR safe.
+		 */
+		i2c->ltc2990_fwnode = new_fwnode;
+		ltc2990_info.fwnode = new_fwnode;
+
+		i2c->ltc2990_client =
+			i2c_new_probed_device(&i2c->adapter,
+					      &ltc2990_info,
+					      icy_ltc2990_addresses,
+					      NULL);
+	}
+
+	return 0;
+}
+
+static void icy_remove(struct zorro_dev *z)
+{
+	struct icy_i2c *i2c = dev_get_drvdata(&z->dev);
+
+	i2c_unregister_device(i2c->ltc2990_client);
+	fwnode_remove_software_node(i2c->ltc2990_fwnode);
+
+	i2c_del_adapter(&i2c->adapter);
+}
+
+static const struct zorro_device_id icy_zorro_tbl[] = {
+	{ ZORRO_ID(VMC, 15, 0), },
+	{ 0 }
+};
+
+MODULE_DEVICE_TABLE(zorro, icy_zorro_tbl);
+
+static struct zorro_driver icy_driver = {
+	.name           = "i2c-icy",
+	.id_table       = icy_zorro_tbl,
+	.probe          = icy_probe,
+	.remove         = icy_remove,
+};
+
+module_driver(icy_driver,
+	      zorro_register_driver,
+	      zorro_unregister_driver);
+
+MODULE_AUTHOR("Max Staudt <max@enpas.org>");
+MODULE_DESCRIPTION("I2C bus via PCF8584 on ICY Zorro card");
+MODULE_LICENSE("GPL v2");
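On the consumer side, the software node created in icy_probe() behaves like a devicetree node: once the fwnode is attached to the LTC2990 client, the generic device-property API hands back the "lltc,meas-mode" array. A minimal sketch of that consumer path, assuming a hypothetical client probe function (the real ltc2990 driver's parsing may differ):

	#include <linux/i2c.h>
	#include <linux/property.h>

	static int ltc2990_like_probe(struct i2c_client *client)
	{
		u32 mode[2];
		int ret;

		/* Same call path for DT, ACPI and software nodes. */
		ret = device_property_read_u32_array(&client->dev,
						     "lltc,meas-mode",
						     mode, ARRAY_SIZE(mode));
		if (ret)
			return ret;

		/* With the board data above: mode[0] == 0, mode[1] == 3. */
		return 0;
	}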
diff --git a/drivers/i2c/busses/i2c-imx-lpi2c.c b/drivers/i2c/busses/i2c-imx-lpi2c.c
index dc00fab..c92b564 100644
--- a/drivers/i2c/busses/i2c-imx-lpi2c.c
+++ b/drivers/i2c/busses/i2c-imx-lpi2c.c
@@ -545,7 +545,6 @@
 static int lpi2c_imx_probe(struct platform_device *pdev)
 {
 	struct lpi2c_imx_struct *lpi2c_imx;
-	struct resource *res;
 	unsigned int temp;
 	int irq, ret;
 
@@ -553,8 +552,7 @@
 	if (!lpi2c_imx)
 		return -ENOMEM;
 
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	lpi2c_imx->base = devm_ioremap_resource(&pdev->dev, res);
+	lpi2c_imx->base = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(lpi2c_imx->base))
 		return PTR_ERR(lpi2c_imx->base);
 
diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c
index 15f6cde..a3b6133 100644
--- a/drivers/i2c/busses/i2c-imx.c
+++ b/drivers/i2c/busses/i2c-imx.c
@@ -20,6 +20,7 @@
  *
  */
 
+#include <linux/acpi.h>
 #include <linux/clk.h>
 #include <linux/completion.h>
 #include <linux/delay.h>
@@ -255,6 +256,12 @@
 };
 MODULE_DEVICE_TABLE(of, i2c_imx_dt_ids);
 
+static const struct acpi_device_id i2c_imx_acpi_ids[] = {
+	{"NXP0001", .driver_data = (kernel_ulong_t)&vf610_i2c_hwdata},
+	{ }
+};
+MODULE_DEVICE_TABLE(acpi, i2c_imx_acpi_ids);
+
 static inline int is_imx1_i2c(struct imx_i2c_struct *i2c_imx)
 {
 	return i2c_imx->hwdata->devtype == IMX1_I2C;
@@ -1048,14 +1055,13 @@
 
 static int i2c_imx_probe(struct platform_device *pdev)
 {
-	const struct of_device_id *of_id = of_match_device(i2c_imx_dt_ids,
-							   &pdev->dev);
 	struct imx_i2c_struct *i2c_imx;
 	struct resource *res;
 	struct imxi2c_platform_data *pdata = dev_get_platdata(&pdev->dev);
 	void __iomem *base;
 	int irq, ret;
 	dma_addr_t phy_addr;
+	const struct imx_i2c_hwdata *match;
 
 	dev_dbg(&pdev->dev, "<%s>\n", __func__);
 
@@ -1075,8 +1081,9 @@
 	if (!i2c_imx)
 		return -ENOMEM;
 
-	if (of_id)
-		i2c_imx->hwdata = of_id->data;
+	match = device_get_match_data(&pdev->dev);
+	if (match)
+		i2c_imx->hwdata = match;
 	else
 		i2c_imx->hwdata = (struct imx_i2c_hwdata *)
 				platform_get_device_id(pdev)->driver_data;
@@ -1089,6 +1096,7 @@
 	i2c_imx->adapter.nr		= pdev->id;
 	i2c_imx->adapter.dev.of_node	= pdev->dev.of_node;
 	i2c_imx->base			= base;
+	ACPI_COMPANION_SET(&i2c_imx->adapter.dev, ACPI_COMPANION(&pdev->dev));
 
 	/* Get I2C clock */
 	i2c_imx->clk = devm_clk_get(&pdev->dev, NULL);
@@ -1247,6 +1255,7 @@
 		.name = DRIVER_NAME,
 		.pm = &i2c_imx_pm_ops,
 		.of_match_table = i2c_imx_dt_ids,
+		.acpi_match_table = i2c_imx_acpi_ids,
 	},
 	.id_table = imx_i2c_devtype,
 };
diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c
index 02d23ed..2f95e25 100644
--- a/drivers/i2c/busses/i2c-ismt.c
+++ b/drivers/i2c/busses/i2c-ismt.c
@@ -781,8 +781,6 @@
 	if (!priv->hw)
 		return -ENOMEM;
 
-	memset(priv->hw, 0, (ISMT_DESC_ENTRIES * sizeof(struct ismt_desc)));
-
 	priv->head = 0;
 	init_completion(&priv->cmp);
 
diff --git a/drivers/i2c/busses/i2c-mxs.c b/drivers/i2c/busses/i2c-mxs.c
index 7d79317..8922491 100644
--- a/drivers/i2c/busses/i2c-mxs.c
+++ b/drivers/i2c/busses/i2c-mxs.c
@@ -802,7 +802,6 @@
 	struct device *dev = &pdev->dev;
 	struct mxs_i2c_dev *i2c;
 	struct i2c_adapter *adap;
-	struct resource *res;
 	int err, irq;
 
 	i2c = devm_kzalloc(dev, sizeof(*i2c), GFP_KERNEL);
@@ -814,8 +813,7 @@
 		i2c->dev_type = device_id->driver_data;
 	}
 
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	i2c->regs = devm_ioremap_resource(&pdev->dev, res);
+	i2c->regs = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(i2c->regs))
 		return PTR_ERR(i2c->regs);
 
diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c
index 4117f1a..ca8b3ec 100644
--- a/drivers/i2c/busses/i2c-ocores.c
+++ b/drivers/i2c/busses/i2c-ocores.c
@@ -703,8 +703,9 @@
 	}
 
 	if (ocores_algorithm.master_xfer != ocores_xfer_polling) {
-		ret = devm_request_irq(&pdev->dev, irq, ocores_isr, 0,
-				       pdev->name, i2c);
+		ret = devm_request_any_context_irq(&pdev->dev, irq,
+						   ocores_isr, 0,
+						   pdev->name, i2c);
 		if (ret) {
 			dev_err(&pdev->dev, "Cannot claim IRQ\n");
 			goto err_clk;
diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c
index cba325e..30ded64 100644
--- a/drivers/i2c/busses/i2c-piix4.c
+++ b/drivers/i2c/busses/i2c-piix4.c
@@ -72,7 +72,8 @@
 #define PIIX4_BLOCK_DATA	0x14
 
 /* Multi-port constants */
-#define PIIX4_MAX_ADAPTERS 4
+#define PIIX4_MAX_ADAPTERS	4
+#define HUDSON2_MAIN_PORTS	2 /* HUDSON2 and KERNCZ reserve ports 3, 4 */
 
 /* SB800 constants */
 #define SB800_PIIX4_SMB_IDX		0xcd6
@@ -806,10 +807,12 @@
 
 static struct i2c_adapter *piix4_main_adapters[PIIX4_MAX_ADAPTERS];
 static struct i2c_adapter *piix4_aux_adapter;
+static int piix4_adapter_count;
 
 static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba,
 			     bool sb800_main, u8 port, bool notify_imc,
-			     const char *name, struct i2c_adapter **padap)
+			     u8 hw_port_nr, const char *name,
+			     struct i2c_adapter **padap)
 {
 	struct i2c_adapter *adap;
 	struct i2c_piix4_adapdata *adapdata;
@@ -841,6 +844,12 @@
 	/* set up the sysfs linkage to our parent device */
 	adap->dev.parent = &dev->dev;
 
+	if (has_acpi_companion(&dev->dev)) {
+		acpi_preset_companion(&adap->dev,
+				      ACPI_COMPANION(&dev->dev),
+				      hw_port_nr);
+	}
+
 	snprintf(adap->name, sizeof(adap->name),
 		"SMBus PIIX4 adapter%s at %04x", name, smba);
 
@@ -865,8 +874,19 @@
 	int port;
 	int retval;
 
-	for (port = 0; port < PIIX4_MAX_ADAPTERS; port++) {
+	if (dev->device == PCI_DEVICE_ID_AMD_KERNCZ_SMBUS ||
+	    (dev->device == PCI_DEVICE_ID_AMD_HUDSON2_SMBUS &&
+	     dev->revision >= 0x1F)) {
+		piix4_adapter_count = HUDSON2_MAIN_PORTS;
+	} else {
+		piix4_adapter_count = PIIX4_MAX_ADAPTERS;
+	}
+
+	for (port = 0; port < piix4_adapter_count; port++) {
+		u8 hw_port_nr = port == 0 ? 0 : port + 1;
+
 		retval = piix4_add_adapter(dev, smba, true, port, notify_imc,
+					   hw_port_nr,
 					   piix4_main_port_names_sb800[port],
 					   &piix4_main_adapters[port]);
 		if (retval < 0)
@@ -937,8 +957,8 @@
 			return retval;
 
 		/* Try to register main SMBus adapter, give up if we can't */
-		retval = piix4_add_adapter(dev, retval, false, 0, false, "",
-					   &piix4_main_adapters[0]);
+		retval = piix4_add_adapter(dev, retval, false, 0, false, 0,
+					   "", &piix4_main_adapters[0]);
 		if (retval < 0)
 			return retval;
 	}
@@ -964,7 +984,7 @@
 	if (retval > 0) {
 		/* Try to add the aux adapter if it exists,
 		 * piix4_add_adapter will clean up if this fails */
-		piix4_add_adapter(dev, retval, false, 0, false,
+		piix4_add_adapter(dev, retval, false, 0, false, 1,
 				  is_sb800 ? piix4_aux_port_name_sb800 : "",
 				  &piix4_aux_adapter);
 	}
@@ -987,7 +1007,7 @@
 
 static void piix4_remove(struct pci_dev *dev)
 {
-	int port = PIIX4_MAX_ADAPTERS;
+	int port = piix4_adapter_count;
 
 	while (--port >= 0) {
 		if (piix4_main_adapters[port]) {
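Spelled out, the hw_port_nr handed to acpi_preset_companion() maps adapter indices onto ACPI address values as follows (derived from the two call sites above; the aux adapter claims address 1, so the remaining main ports continue at 2):

	/* main port 0        -> hw_port_nr 0
	 * aux adapter        -> hw_port_nr 1
	 * main ports 1, 2, 3 -> hw_port_nr 2, 3, 4
	 */
	u8 hw_port_nr = port == 0 ? 0 : port + 1;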
diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c
index a89bfce..17abf60c 100644
--- a/drivers/i2c/busses/i2c-qcom-geni.c
+++ b/drivers/i2c/busses/i2c-qcom-geni.c
@@ -355,11 +355,13 @@
 {
 	dma_addr_t rx_dma;
 	unsigned long time_left;
-	void *dma_buf;
+	void *dma_buf = NULL;
 	struct geni_se *se = &gi2c->se;
 	size_t len = msg->len;
 
-	dma_buf = i2c_get_dma_safe_msg_buf(msg, 32);
+	if (!of_machine_is_compatible("lenovo,yoga-c630"))
+		dma_buf = i2c_get_dma_safe_msg_buf(msg, 32);
+
 	if (dma_buf)
 		geni_se_select_mode(se, GENI_SE_DMA);
 	else
@@ -394,11 +396,13 @@
 {
 	dma_addr_t tx_dma;
 	unsigned long time_left;
-	void *dma_buf;
+	void *dma_buf = NULL;
 	struct geni_se *se = &gi2c->se;
 	size_t len = msg->len;
 
-	dma_buf = i2c_get_dma_safe_msg_buf(msg, 32);
+	if (!of_machine_is_compatible("lenovo,yoga-c630"))
+		dma_buf = i2c_get_dma_safe_msg_buf(msg, 32);
+
 	if (dma_buf)
 		geni_se_select_mode(se, GENI_SE_DMA);
 	else
diff --git a/drivers/i2c/busses/i2c-riic.c b/drivers/i2c/busses/i2c-riic.c
index f31413f..8004148 100644
--- a/drivers/i2c/busses/i2c-riic.c
+++ b/drivers/i2c/busses/i2c-riic.c
@@ -202,6 +202,7 @@
 	if (readb(riic->base + RIIC_ICSR2) & ICSR2_NACKF) {
 		/* We got a NACKIE */
 		readb(riic->base + RIIC_ICDRR);	/* dummy read */
+		riic_clear_set_bit(riic, ICSR2_NACKF, 0, RIIC_ICSR2);
 		riic->err = -ENXIO;
 	} else if (riic->bytes_left) {
 		return IRQ_NONE;
diff --git a/drivers/i2c/busses/i2c-sprd.c b/drivers/i2c/busses/i2c-sprd.c
index 9611235..b432e75 100644
--- a/drivers/i2c/busses/i2c-sprd.c
+++ b/drivers/i2c/busses/i2c-sprd.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
@@ -465,9 +466,9 @@
 
 	i2c_dev->clk = devm_clk_get(i2c_dev->dev, "enable");
 	if (IS_ERR(i2c_dev->clk)) {
-		dev_warn(i2c_dev->dev, "i2c%d can't get the enable clock\n",
-			 i2c_dev->adap.nr);
-		i2c_dev->clk = NULL;
+		dev_err(i2c_dev->dev, "i2c%d can't get the enable clock\n",
+			i2c_dev->adap.nr);
+		return PTR_ERR(i2c_dev->clk);
 	}
 
 	return 0;
@@ -477,7 +478,6 @@
 {
 	struct device *dev = &pdev->dev;
 	struct sprd_i2c *i2c_dev;
-	struct resource *res;
 	u32 prop;
 	int ret;
 
@@ -487,8 +487,7 @@
 	if (!i2c_dev)
 		return -ENOMEM;
 
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	i2c_dev->base = devm_ioremap_resource(dev, res);
+	i2c_dev->base = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(i2c_dev->base))
 		return PTR_ERR(i2c_dev->base);
 
@@ -520,7 +519,10 @@
 	if (i2c_dev->bus_freq != 100000 && i2c_dev->bus_freq != 400000)
 		return -EINVAL;
 
-	sprd_i2c_clk_init(i2c_dev);
+	ret = sprd_i2c_clk_init(i2c_dev);
+	if (ret)
+		return ret;
+
 	platform_set_drvdata(pdev, i2c_dev);
 
 	ret = clk_prepare_enable(i2c_dev->clk);
@@ -644,8 +646,7 @@
 	},
 };
 
-static int sprd_i2c_init(void)
-{
-	return platform_driver_register(&sprd_i2c_driver);
-}
-arch_initcall_sync(sprd_i2c_init);
+module_platform_driver(sprd_i2c_driver);
+
+MODULE_DESCRIPTION("Spreadtrum I2C master controller driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c
index 266d1c2..d36cf08 100644
--- a/drivers/i2c/busses/i2c-stm32f7.c
+++ b/drivers/i2c/busses/i2c-stm32f7.c
@@ -1809,7 +1809,7 @@
 		I2C_FUNC_SMBUS_I2C_BLOCK;
 }
 
-static struct i2c_algorithm stm32f7_i2c_algo = {
+static const struct i2c_algorithm stm32f7_i2c_algo = {
 	.master_xfer = stm32f7_i2c_xfer,
 	.smbus_xfer = stm32f7_i2c_smbus_xfer,
 	.functionality = stm32f7_i2c_func,
diff --git a/drivers/i2c/busses/i2c-synquacer.c b/drivers/i2c/busses/i2c-synquacer.c
index f724c8e..39762f0 100644
--- a/drivers/i2c/busses/i2c-synquacer.c
+++ b/drivers/i2c/busses/i2c-synquacer.c
@@ -526,7 +526,7 @@
 	.functionality	= synquacer_i2c_functionality,
 };
 
-static struct i2c_adapter synquacer_i2c_ops = {
+static const struct i2c_adapter synquacer_i2c_ops = {
 	.owner		= THIS_MODULE,
 	.name		= "synquacer_i2c-adapter",
 	.algo		= &synquacer_i2c_algo,
diff --git a/drivers/i2c/busses/i2c-taos-evm.c b/drivers/i2c/busses/i2c-taos-evm.c
index 37347c9..0bff3f3 100644
--- a/drivers/i2c/busses/i2c-taos-evm.c
+++ b/drivers/i2c/busses/i2c-taos-evm.c
@@ -39,7 +39,7 @@
 };
 
 /* TAOS TSL2550 EVM */
-static struct i2c_board_info tsl2550_info = {
+static const struct i2c_board_info tsl2550_info = {
 	I2C_BOARD_INFO("tsl2550", 0x39),
 };
 
diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c
index 9fcb13b..c1683f9 100644
--- a/drivers/i2c/busses/i2c-tegra.c
+++ b/drivers/i2c/busses/i2c-tegra.c
@@ -636,7 +636,7 @@
 	dvc_writel(i2c_dev, val, DVC_CTRL_REG1);
 }
 
-static int tegra_i2c_runtime_resume(struct device *dev)
+static int __maybe_unused tegra_i2c_runtime_resume(struct device *dev)
 {
 	struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev);
 	int ret;
@@ -665,7 +665,7 @@
 	return 0;
 }
 
-static int tegra_i2c_runtime_suspend(struct device *dev)
+static int __maybe_unused tegra_i2c_runtime_suspend(struct device *dev)
 {
 	struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev);
 
@@ -713,12 +713,6 @@
 	u32 tsu_thd;
 	u8 tlow, thigh;
 
-	err = pm_runtime_get_sync(i2c_dev->dev);
-	if (err < 0) {
-		dev_err(i2c_dev->dev, "runtime resume failed %d\n", err);
-		return err;
-	}
-
 	reset_control_assert(i2c_dev->rst);
 	udelay(2);
 	reset_control_deassert(i2c_dev->rst);
@@ -772,7 +766,7 @@
 		if (err) {
 			dev_err(i2c_dev->dev,
 				"failed changing clock rate: %d\n", err);
-			goto err;
+			return err;
 		}
 	}
 
@@ -787,23 +781,21 @@
 
 	err = tegra_i2c_flush_fifos(i2c_dev);
 	if (err)
-		goto err;
+		return err;
 
 	if (i2c_dev->is_multimaster_mode && i2c_dev->hw->has_slcg_override_reg)
 		i2c_writel(i2c_dev, I2C_MST_CORE_CLKEN_OVR, I2C_CLKEN_OVERRIDE);
 
 	err = tegra_i2c_wait_for_config_load(i2c_dev);
 	if (err)
-		goto err;
+		return err;
 
 	if (i2c_dev->irq_disabled) {
 		i2c_dev->irq_disabled = false;
 		enable_irq(i2c_dev->irq);
 	}
 
-err:
-	pm_runtime_put(i2c_dev->dev);
-	return err;
+	return 0;
 }
 
 static int tegra_i2c_disable_packet_mode(struct tegra_i2c_dev *i2c_dev)
@@ -1616,12 +1608,14 @@
 	}
 
 	pm_runtime_enable(&pdev->dev);
-	if (!pm_runtime_enabled(&pdev->dev)) {
+	if (!pm_runtime_enabled(&pdev->dev))
 		ret = tegra_i2c_runtime_resume(&pdev->dev);
-		if (ret < 0) {
-			dev_err(&pdev->dev, "runtime resume failed\n");
-			goto unprepare_div_clk;
-		}
+	else
+		ret = pm_runtime_get_sync(i2c_dev->dev);
+
+	if (ret < 0) {
+		dev_err(&pdev->dev, "runtime resume failed\n");
+		goto unprepare_div_clk;
 	}
 
 	if (i2c_dev->is_multimaster_mode) {
@@ -1666,6 +1660,8 @@
 	if (ret)
 		goto release_dma;
 
+	pm_runtime_put(&pdev->dev);
+
 	return 0;
 
 release_dma:
@@ -1711,8 +1707,7 @@
 	return 0;
 }
 
-#ifdef CONFIG_PM_SLEEP
-static int tegra_i2c_suspend(struct device *dev)
+static int __maybe_unused tegra_i2c_suspend(struct device *dev)
 {
 	struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev);
 
@@ -1721,38 +1716,41 @@
 	return 0;
 }
 
-static int tegra_i2c_resume(struct device *dev)
+static int __maybe_unused tegra_i2c_resume(struct device *dev)
 {
 	struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev);
 	int err;
 
+	err = tegra_i2c_runtime_resume(dev);
+	if (err)
+		return err;
+
 	err = tegra_i2c_init(i2c_dev, false);
 	if (err)
 		return err;
 
+	err = tegra_i2c_runtime_suspend(dev);
+	if (err)
+		return err;
+
 	i2c_mark_adapter_resumed(&i2c_dev->adapter);
 
 	return 0;
 }
 
 static const struct dev_pm_ops tegra_i2c_pm = {
-	SET_SYSTEM_SLEEP_PM_OPS(tegra_i2c_suspend, tegra_i2c_resume)
+	SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(tegra_i2c_suspend, tegra_i2c_resume)
 	SET_RUNTIME_PM_OPS(tegra_i2c_runtime_suspend, tegra_i2c_runtime_resume,
 			   NULL)
 };
 
-#define TEGRA_I2C_PM	(&tegra_i2c_pm)
-#else
-#define TEGRA_I2C_PM	NULL
-#endif
-
 static struct platform_driver tegra_i2c_driver = {
 	.probe   = tegra_i2c_probe,
 	.remove  = tegra_i2c_remove,
 	.driver  = {
 		.name  = "tegra-i2c",
 		.of_match_table = tegra_i2c_of_match,
-		.pm    = TEGRA_I2C_PM,
+		.pm    = &tegra_i2c_pm,
 	},
 };
 
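The net effect on the probe path, sketched under the assumption that runtime PM is enabled: the controller is resumed once for the whole of probe, tegra_i2c_init() runs with the device already resumed (it no longer takes its own runtime PM reference), and the reference is dropped only after the adapter has been registered:

	pm_runtime_enable(&pdev->dev);
	if (!pm_runtime_enabled(&pdev->dev))
		ret = tegra_i2c_runtime_resume(&pdev->dev);
	else
		ret = pm_runtime_get_sync(i2c_dev->dev);
	if (ret < 0)
		goto unprepare_div_clk;

	/* ... clock setup, tegra_i2c_init(), IRQ and adapter registration ... */

	pm_runtime_put(&pdev->dev);	/* idle until the first transfer */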
diff --git a/drivers/i2c/busses/i2c-uniphier-f.c b/drivers/i2c/busses/i2c-uniphier-f.c
index 7acca25..4241aac 100644
--- a/drivers/i2c/busses/i2c-uniphier-f.c
+++ b/drivers/i2c/busses/i2c-uniphier-f.c
@@ -108,7 +108,6 @@
 		if (fifo_space-- <= 0)
 			break;
 
-		dev_dbg(&priv->adap.dev, "write data: %02x\n", *priv->buf);
 		writel(*priv->buf++, priv->membase + UNIPHIER_FI2C_DTTX);
 		priv->len--;
 	}
@@ -124,7 +123,6 @@
 			break;
 
 		*priv->buf++ = readl(priv->membase + UNIPHIER_FI2C_DTRX);
-		dev_dbg(&priv->adap.dev, "read data: %02x\n", priv->buf[-1]);
 		priv->len--;
 	}
 }
@@ -142,8 +140,6 @@
 
 static void uniphier_fi2c_stop(struct uniphier_fi2c_priv *priv)
 {
-	dev_dbg(&priv->adap.dev, "stop condition\n");
-
 	priv->enabled_irqs |= UNIPHIER_FI2C_INT_STOP;
 	uniphier_fi2c_set_irqs(priv);
 	writel(UNIPHIER_FI2C_CR_MST | UNIPHIER_FI2C_CR_STO,
@@ -160,21 +156,15 @@
 	irq_status = readl(priv->membase + UNIPHIER_FI2C_INT);
 	irq_status &= priv->enabled_irqs;
 
-	dev_dbg(&priv->adap.dev,
-		"interrupt: enabled_irqs=%04x, irq_status=%04x\n",
-		priv->enabled_irqs, irq_status);
-
 	if (irq_status & UNIPHIER_FI2C_INT_STOP)
 		goto complete;
 
 	if (unlikely(irq_status & UNIPHIER_FI2C_INT_AL)) {
-		dev_dbg(&priv->adap.dev, "arbitration lost\n");
 		priv->error = -EAGAIN;
 		goto complete;
 	}
 
 	if (unlikely(irq_status & UNIPHIER_FI2C_INT_NA)) {
-		dev_dbg(&priv->adap.dev, "could not get ACK\n");
 		priv->error = -ENXIO;
 		if (priv->flags & UNIPHIER_FI2C_RD) {
 			/*
@@ -215,18 +205,14 @@
 		if (unlikely(priv->flags & UNIPHIER_FI2C_MANUAL_NACK)) {
 			if (priv->len <= UNIPHIER_FI2C_FIFO_SIZE &&
 			    !(priv->flags & UNIPHIER_FI2C_BYTE_WISE)) {
-				dev_dbg(&priv->adap.dev,
-					"enable read byte count IRQ\n");
 				priv->enabled_irqs |= UNIPHIER_FI2C_INT_RB;
 				uniphier_fi2c_set_irqs(priv);
 				priv->flags |= UNIPHIER_FI2C_BYTE_WISE;
 			}
-			if (priv->len <= 1) {
-				dev_dbg(&priv->adap.dev, "set NACK\n");
+			if (priv->len <= 1)
 				writel(UNIPHIER_FI2C_CR_MST |
 				       UNIPHIER_FI2C_CR_NACK,
 				       priv->membase + UNIPHIER_FI2C_CR);
-			}
 		}
 
 		goto handled;
@@ -334,10 +320,6 @@
 	bool is_read = msg->flags & I2C_M_RD;
 	unsigned long time_left, flags;
 
-	dev_dbg(&adap->dev, "%s: addr=0x%02x, len=%d, repeat=%d, stop=%d\n",
-		is_read ? "receive" : "transmit", msg->addr, msg->len,
-		repeat, stop);
-
 	priv->len = msg->len;
 	priv->buf = msg->buf;
 	priv->enabled_irqs = UNIPHIER_FI2C_INT_FAULTS;
@@ -359,7 +341,6 @@
 	else
 		uniphier_fi2c_tx_init(priv, msg->addr, repeat);
 
-	dev_dbg(&adap->dev, "start condition\n");
 	/*
 	 * For a repeated START condition, writing a slave address to the FIFO
 	 * kicks the controller. So, the UNIPHIER_FI2C_CR register should be
@@ -383,7 +364,6 @@
 		uniphier_fi2c_recover(priv);
 		return -ETIMEDOUT;
 	}
-	dev_dbg(&adap->dev, "complete\n");
 
 	if (unlikely(priv->flags & UNIPHIER_FI2C_DEFER_STOP_COMP)) {
 		u32 status;
@@ -538,7 +518,6 @@
 {
 	struct device *dev = &pdev->dev;
 	struct uniphier_fi2c_priv *priv;
-	struct resource *regs;
 	u32 bus_speed;
 	unsigned long clk_rate;
 	int irq, ret;
@@ -547,8 +526,7 @@
 	if (!priv)
 		return -ENOMEM;
 
-	regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	priv->membase = devm_ioremap_resource(dev, regs);
+	priv->membase = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(priv->membase))
 		return PTR_ERR(priv->membase);
 
diff --git a/drivers/i2c/busses/i2c-uniphier.c b/drivers/i2c/busses/i2c-uniphier.c
index 0173840..0270090 100644
--- a/drivers/i2c/busses/i2c-uniphier.c
+++ b/drivers/i2c/busses/i2c-uniphier.c
@@ -71,7 +71,6 @@
 	reinit_completion(&priv->comp);
 
 	txdata |= UNIPHIER_I2C_DTRM_IRQEN;
-	dev_dbg(&adap->dev, "write data: 0x%04x\n", txdata);
 	writel(txdata, priv->membase + UNIPHIER_I2C_DTRM);
 
 	time_left = wait_for_completion_timeout(&priv->comp, adap->timeout);
@@ -81,8 +80,6 @@
 	}
 
 	rxdata = readl(priv->membase + UNIPHIER_I2C_DREC);
-	dev_dbg(&adap->dev, "read data: 0x%04x\n", rxdata);
-
 	if (rxdatap)
 		*rxdatap = rxdata;
 
@@ -98,14 +95,11 @@
 	if (ret)
 		return ret;
 
-	if (unlikely(rxdata & UNIPHIER_I2C_DREC_LAB)) {
-		dev_dbg(&adap->dev, "arbitration lost\n");
+	if (unlikely(rxdata & UNIPHIER_I2C_DREC_LAB))
 		return -EAGAIN;
-	}
-	if (unlikely(rxdata & UNIPHIER_I2C_DREC_LRB)) {
-		dev_dbg(&adap->dev, "could not get ACK\n");
+
+	if (unlikely(rxdata & UNIPHIER_I2C_DREC_LRB))
 		return -ENXIO;
-	}
 
 	return 0;
 }
@@ -115,7 +109,6 @@
 {
 	int ret;
 
-	dev_dbg(&adap->dev, "start condition\n");
 	ret = uniphier_i2c_send_byte(adap, addr << 1 |
 				     UNIPHIER_I2C_DTRM_STA |
 				     UNIPHIER_I2C_DTRM_NACK);
@@ -137,7 +130,6 @@
 {
 	int ret;
 
-	dev_dbg(&adap->dev, "start condition\n");
 	ret = uniphier_i2c_send_byte(adap, addr << 1 |
 				     UNIPHIER_I2C_DTRM_STA |
 				     UNIPHIER_I2C_DTRM_NACK |
@@ -161,7 +153,6 @@
 
 static int uniphier_i2c_stop(struct i2c_adapter *adap)
 {
-	dev_dbg(&adap->dev, "stop condition\n");
 	return uniphier_i2c_send_byte(adap, UNIPHIER_I2C_DTRM_STO |
 				      UNIPHIER_I2C_DTRM_NACK);
 }
@@ -173,9 +164,6 @@
 	bool recovery = false;
 	int ret;
 
-	dev_dbg(&adap->dev, "%s: addr=0x%02x, len=%d, stop=%d\n",
-		is_read ? "receive" : "transmit", msg->addr, msg->len, stop);
-
 	if (is_read)
 		ret = uniphier_i2c_rx(adap, msg->addr, msg->len, msg->buf);
 	else
@@ -326,7 +314,6 @@
 {
 	struct device *dev = &pdev->dev;
 	struct uniphier_i2c_priv *priv;
-	struct resource *regs;
 	u32 bus_speed;
 	unsigned long clk_rate;
 	int irq, ret;
@@ -335,8 +322,7 @@
 	if (!priv)
 		return -ENOMEM;
 
-	regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	priv->membase = devm_ioremap_resource(dev, regs);
+	priv->membase = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(priv->membase))
 		return PTR_ERR(priv->membase);
 
diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index 72b3001..5f6a498 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -350,13 +350,11 @@
 		return -ENODEV;
 
 	if (client->flags & I2C_CLIENT_WAKE) {
-		int wakeirq = -ENOENT;
+		int wakeirq;
 
-		if (dev->of_node) {
-			wakeirq = of_irq_get_byname(dev->of_node, "wakeup");
-			if (wakeirq == -EPROBE_DEFER)
-				return wakeirq;
-		}
+		wakeirq = of_irq_get_byname(dev->of_node, "wakeup");
+		if (wakeirq == -EPROBE_DEFER)
+			return wakeirq;
 
 		device_init_wakeup(&client->dev, true);
 
@@ -966,7 +964,7 @@
 EXPORT_SYMBOL_GPL(devm_i2c_new_dummy_device);
 
 /**
- * i2c_new_secondary_device - Helper to get the instantiated secondary address
+ * i2c_new_ancillary_device - Helper to get the instantiated secondary address
  * and create the associated device
  * @client: Handle to the primary client
  * @name: Handle to specify which secondary address to get
@@ -985,9 +983,9 @@
  * cell whose "reg-names" value matches the slave name.
  *
  * This returns the new i2c client, which should be saved for later use with
- * i2c_unregister_device(); or NULL to indicate an error.
+ * i2c_unregister_device(); or an ERR_PTR to describe the error.
  */
-struct i2c_client *i2c_new_secondary_device(struct i2c_client *client,
+struct i2c_client *i2c_new_ancillary_device(struct i2c_client *client,
 						const char *name,
 						u16 default_addr)
 {
@@ -1002,9 +1000,9 @@
 	}
 
 	dev_dbg(&client->adapter->dev, "Address for %s : 0x%x\n", name, addr);
-	return i2c_new_dummy(client->adapter, addr);
+	return i2c_new_dummy_device(client->adapter, addr);
 }
-EXPORT_SYMBOL_GPL(i2c_new_secondary_device);
+EXPORT_SYMBOL_GPL(i2c_new_ancillary_device);
 
 /* ------------------------------------------------------------------------- */
 
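The rename also changes the error contract: callers that used to NULL-check i2c_new_secondary_device() must now treat the return value as an ERR_PTR. An illustrative conversion, with "state" and the "cec" address name as placeholders:

	struct i2c_client *ancillary;

	ancillary = i2c_new_ancillary_device(client, "cec", 0x3f);
	if (IS_ERR(ancillary))
		return PTR_ERR(ancillary);
	state->cec_client = ancillary;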
diff --git a/drivers/i2c/i2c-slave-eeprom.c b/drivers/i2c/i2c-slave-eeprom.c
index be65d38..db9763c 100644
--- a/drivers/i2c/i2c-slave-eeprom.c
+++ b/drivers/i2c/i2c-slave-eeprom.c
@@ -11,6 +11,13 @@
  * pointer, yet implementation is deferred until the need actually arises.
  */
 
+/*
+ * FIXME: What to do if only 8 bits of a 16 bit address are sent?
+ * The ST-M24C64 sends only 0xff then. Needs verification with other
+ * EEPROMs, though. We currently treat those 8 bits as a valid address.
+ */
+
+#include <linux/bitfield.h>
 #include <linux/i2c.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -21,12 +28,20 @@
 
 struct eeprom_data {
 	struct bin_attribute bin;
-	bool first_write;
 	spinlock_t buffer_lock;
-	u8 buffer_idx;
+	u16 buffer_idx;
+	u16 address_mask;
+	u8 num_address_bytes;
+	u8 idx_write_cnt;
+	bool read_only;
 	u8 buffer[];
 };
 
+#define I2C_SLAVE_BYTELEN GENMASK(15, 0)
+#define I2C_SLAVE_FLAG_ADDR16 BIT(16)
+#define I2C_SLAVE_FLAG_RO BIT(17)
+#define I2C_SLAVE_DEVICE_MAGIC(_len, _flags) ((_flags) | (_len))
+
 static int i2c_slave_eeprom_slave_cb(struct i2c_client *client,
 				     enum i2c_slave_event event, u8 *val)
 {
@@ -34,13 +49,17 @@
 
 	switch (event) {
 	case I2C_SLAVE_WRITE_RECEIVED:
-		if (eeprom->first_write) {
-			eeprom->buffer_idx = *val;
-			eeprom->first_write = false;
+		if (eeprom->idx_write_cnt < eeprom->num_address_bytes) {
+			if (eeprom->idx_write_cnt == 0)
+				eeprom->buffer_idx = 0;
+			eeprom->buffer_idx = *val | (eeprom->buffer_idx << 8);
+			eeprom->idx_write_cnt++;
 		} else {
-			spin_lock(&eeprom->buffer_lock);
-			eeprom->buffer[eeprom->buffer_idx++] = *val;
-			spin_unlock(&eeprom->buffer_lock);
+			if (!eeprom->read_only) {
+				spin_lock(&eeprom->buffer_lock);
+				eeprom->buffer[eeprom->buffer_idx++ & eeprom->address_mask] = *val;
+				spin_unlock(&eeprom->buffer_lock);
+			}
 		}
 		break;
 
@@ -50,7 +69,7 @@
 		/* fallthrough */
 	case I2C_SLAVE_READ_REQUESTED:
 		spin_lock(&eeprom->buffer_lock);
-		*val = eeprom->buffer[eeprom->buffer_idx];
+		*val = eeprom->buffer[eeprom->buffer_idx & eeprom->address_mask];
 		spin_unlock(&eeprom->buffer_lock);
 		/*
 		 * Do not increment buffer_idx here, because we don't know if
@@ -61,7 +80,7 @@
 
 	case I2C_SLAVE_STOP:
 	case I2C_SLAVE_WRITE_REQUESTED:
-		eeprom->first_write = true;
+		eeprom->idx_write_cnt = 0;
 		break;
 
 	default:
@@ -105,13 +124,17 @@
 {
 	struct eeprom_data *eeprom;
 	int ret;
-	unsigned size = id->driver_data;
+	unsigned int size = FIELD_GET(I2C_SLAVE_BYTELEN, id->driver_data);
+	unsigned int flag_addr16 = FIELD_GET(I2C_SLAVE_FLAG_ADDR16, id->driver_data);
 
 	eeprom = devm_kzalloc(&client->dev, sizeof(struct eeprom_data) + size, GFP_KERNEL);
 	if (!eeprom)
 		return -ENOMEM;
 
-	eeprom->first_write = true;
+	eeprom->idx_write_cnt = 0;
+	eeprom->num_address_bytes = flag_addr16 ? 2 : 1;
+	eeprom->address_mask = size - 1;
+	eeprom->read_only = FIELD_GET(I2C_SLAVE_FLAG_RO, id->driver_data);
 	spin_lock_init(&eeprom->buffer_lock);
 	i2c_set_clientdata(client, eeprom);
 
@@ -146,7 +169,12 @@
 }
 
 static const struct i2c_device_id i2c_slave_eeprom_id[] = {
-	{ "slave-24c02", 2048 / 8 },
+	{ "slave-24c02", I2C_SLAVE_DEVICE_MAGIC(2048 / 8,  0) },
+	{ "slave-24c02ro", I2C_SLAVE_DEVICE_MAGIC(2048 / 8,  I2C_SLAVE_FLAG_RO) },
+	{ "slave-24c32", I2C_SLAVE_DEVICE_MAGIC(32768 / 8, I2C_SLAVE_FLAG_ADDR16) },
+	{ "slave-24c32ro", I2C_SLAVE_DEVICE_MAGIC(32768 / 8, I2C_SLAVE_FLAG_ADDR16 | I2C_SLAVE_FLAG_RO) },
+	{ "slave-24c64", I2C_SLAVE_DEVICE_MAGIC(65536 / 8, I2C_SLAVE_FLAG_ADDR16) },
+	{ "slave-24c64ro", I2C_SLAVE_DEVICE_MAGIC(65536 / 8, I2C_SLAVE_FLAG_ADDR16 | I2C_SLAVE_FLAG_RO) },
 	{ }
 };
 MODULE_DEVICE_TABLE(i2c, i2c_slave_eeprom_id);
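A worked decode of one of the new table entries, using the macros introduced above:

	/*
	 * "slave-24c32": I2C_SLAVE_DEVICE_MAGIC(32768 / 8, I2C_SLAVE_FLAG_ADDR16)
	 *              = BIT(16) | 4096
	 *
	 * which probe() unpacks as:
	 *   FIELD_GET(I2C_SLAVE_BYTELEN, ...)     -> 4096 bytes of storage
	 *   FIELD_GET(I2C_SLAVE_FLAG_ADDR16, ...) -> 1, so two address bytes
	 *   FIELD_GET(I2C_SLAVE_FLAG_RO, ...)     -> 0, so writable
	 *
	 * and address_mask = 4096 - 1 = 0xfff, wrapping accesses at the
	 * EEPROM boundary.
	 */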
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 1dd467b..6d7ec37 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -352,7 +352,7 @@
 
 	if (family == AF_INET) {
 		rt = container_of(dst, struct rtable, dst);
-		return rt->rt_gw_family == AF_INET;
+		return rt->rt_uses_gateway;
 	}
 
 	rt6 = container_of(dst, struct rt6_info, dst);
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index dce0610..5337393 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -583,8 +583,10 @@
 		break;
 	}
 
-	/* P2PDMA contexts do not need to be unmapped */
-	if (!is_pci_p2pdma_page(sg_page(sg)))
+	if (is_pci_p2pdma_page(sg_page(sg)))
+		pci_p2pdma_unmap_sg(qp->pd->device->dma_device, sg,
+				    sg_cnt, dir);
+	else
 		ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
 }
 EXPORT_SYMBOL(rdma_rw_ctx_destroy);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 41f9e26..24244a2 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -54,10 +54,7 @@
 
 	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
 		page = sg_page_iter_page(&sg_iter);
-		if (umem->writable && dirty)
-			put_user_pages_dirty_lock(&page, 1);
-		else
-			put_user_page(page);
+		put_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
 	}
 
 	sg_free_table(&umem->sg_head);
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index b89a9b9..469acb9 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -118,10 +118,7 @@
 void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
 			     size_t npages, bool dirty)
 {
-	if (dirty)
-		put_user_pages_dirty_lock(p, npages);
-	else
-		put_user_pages(p, npages);
+	put_user_pages_dirty_lock(p, npages, dirty);
 
 	if (mm) { /* during close after signal, mm can be NULL */
 		atomic64_sub(npages, &mm->pinned_vm);
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 9f53f63..7bff0a1 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -1041,7 +1041,7 @@
 	if (cb)
 		iowait_pio_inc(&priv->s_iowait);
 	pbuf = sc_buffer_alloc(sc, plen, cb, qp);
-	if (unlikely(IS_ERR_OR_NULL(pbuf))) {
+	if (IS_ERR_OR_NULL(pbuf)) {
 		if (cb)
 			verbs_pio_complete(qp, 0);
 		if (IS_ERR(pbuf)) {
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index bfbfbb7..6bf764e 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -40,10 +40,7 @@
 static void __qib_release_user_pages(struct page **p, size_t num_pages,
 				     int dirty)
 {
-	if (dirty)
-		put_user_pages_dirty_lock(p, num_pages);
-	else
-		put_user_pages(p, num_pages);
+	put_user_pages_dirty_lock(p, num_pages, dirty);
 }
 
 /**
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 0b0237d..62e6ffa 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -75,10 +75,7 @@
 		for_each_sg(chunk->page_list, sg, chunk->nents, i) {
 			page = sg_page(sg);
 			pa = sg_phys(sg);
-			if (dirty)
-				put_user_pages_dirty_lock(&page, 1);
-			else
-				put_user_page(page);
+			put_user_pages_dirty_lock(&page, 1, dirty);
 			usnic_dbg("pa: %pa\n", &pa);
 		}
 		kfree(chunk);
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
index 87a5603..e99983f 100644
--- a/drivers/infiniband/sw/siw/siw_mem.c
+++ b/drivers/infiniband/sw/siw/siw_mem.c
@@ -63,15 +63,7 @@
 static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
 			   bool dirty)
 {
-	struct page **p = chunk->plist;
-
-	while (num_pages--) {
-		if (!PageDirty(*p) && dirty)
-			put_user_pages_dirty_lock(p, 1);
-		else
-			put_user_page(*p);
-		p++;
-	}
+	put_user_pages_dirty_lock(chunk->plist, num_pages, dirty);
 }
 
 void siw_umem_release(struct siw_umem *umem, bool dirty)
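All five RDMA drivers in this series converge on the same helper, whose third parameter now carries the dirty decision that each caller previously open-coded. The resulting pattern, assuming pages[] came from a get_user_pages()-style pin:

	/* Releases npages pinned pages, marking them dirty under lock
	 * only when requested. */
	put_user_pages_dirty_lock(pages, npages, dirty);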
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 97975bb..2369b8a 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -70,7 +70,6 @@
  */
 #define AMD_IOMMU_PGSIZES	((~0xFFFUL) & ~(2ULL << 38))
 
-static DEFINE_SPINLOCK(amd_iommu_devtable_lock);
 static DEFINE_SPINLOCK(pd_bitmap_lock);
 
 /* List of all available dev_data structures */
@@ -202,6 +201,7 @@
 	if (!dev_data)
 		return NULL;
 
+	spin_lock_init(&dev_data->lock);
 	dev_data->devid = devid;
 	ratelimit_default_init(&dev_data->rs);
 
@@ -501,6 +501,29 @@
 	 */
 }
 
+/*
+ * Helper function to get the first pte of a large mapping
+ */
+static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
+			 unsigned long *count)
+{
+	unsigned long pte_mask, pg_size, cnt;
+	u64 *fpte;
+
+	pg_size  = PTE_PAGE_SIZE(*pte);
+	cnt      = PAGE_SIZE_PTE_COUNT(pg_size);
+	pte_mask = ~((cnt << 3) - 1);
+	fpte     = (u64 *)(((unsigned long)pte) & pte_mask);
+
+	if (page_size)
+		*page_size = pg_size;
+
+	if (count)
+		*count = cnt;
+
+	return fpte;
+}
+
 /****************************************************************************
  *
  * Interrupt handling functions
@@ -1311,8 +1334,12 @@
 		dma_addr_t iova, size_t size)
 {
 	if (unlikely(amd_iommu_np_cache)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&domain->lock, flags);
 		domain_flush_pages(domain, iova, size);
 		domain_flush_complete(domain);
+		spin_unlock_irqrestore(&domain->lock, flags);
 	}
 }
 
@@ -1425,7 +1452,7 @@
 	BUG_ON(domain->mode < PAGE_MODE_NONE ||
 	       domain->mode > PAGE_MODE_6_LEVEL);
 
-	free_sub_pt(root, domain->mode, freelist);
+	freelist = free_sub_pt(root, domain->mode, freelist);
 
 	free_page_list(freelist);
 }
@@ -1435,10 +1462,11 @@
  * another level increases the size of the address space by 9 bits to a size up
  * to 64 bits.
  */
-static void increase_address_space(struct protection_domain *domain,
+static bool increase_address_space(struct protection_domain *domain,
 				   gfp_t gfp)
 {
 	unsigned long flags;
+	bool ret = false;
 	u64 *pte;
 
 	spin_lock_irqsave(&domain->lock, flags);
@@ -1455,19 +1483,21 @@
 					iommu_virt_to_phys(domain->pt_root));
 	domain->pt_root  = pte;
 	domain->mode    += 1;
-	domain->updated  = true;
+
+	ret = true;
 
 out:
 	spin_unlock_irqrestore(&domain->lock, flags);
 
-	return;
+	return ret;
 }
 
 static u64 *alloc_pte(struct protection_domain *domain,
 		      unsigned long address,
 		      unsigned long page_size,
 		      u64 **pte_page,
-		      gfp_t gfp)
+		      gfp_t gfp,
+		      bool *updated)
 {
 	int level, end_lvl;
 	u64 *pte, *page;
@@ -1475,7 +1505,7 @@
 	BUG_ON(!is_power_of_2(page_size));
 
 	while (address > PM_LEVEL_SIZE(domain->mode))
-		increase_address_space(domain, gfp);
+		*updated = increase_address_space(domain, gfp) || *updated;
 
 	level   = domain->mode - 1;
 	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
@@ -1489,9 +1519,32 @@
 		__pte     = *pte;
 		pte_level = PM_PTE_LEVEL(__pte);
 
-		if (!IOMMU_PTE_PRESENT(__pte) ||
+		/*
+		 * If we replace a series of large PTEs, we need
+		 * to tear down all of them.
+		 */
+		if (IOMMU_PTE_PRESENT(__pte) &&
 		    pte_level == PAGE_MODE_7_LEVEL) {
+			unsigned long count, i;
+			u64 *lpte;
+
+			lpte = first_pte_l7(pte, NULL, &count);
+
+			/*
+			 * Unmap the replicated PTEs that still match the
+			 * original large mapping
+			 */
+			for (i = 0; i < count; ++i)
+				cmpxchg64(&lpte[i], __pte, 0ULL);
+
+			*updated = true;
+			continue;
+		}
+
+		if (!IOMMU_PTE_PRESENT(__pte) ||
+		    pte_level == PAGE_MODE_NONE) {
 			page = (u64 *)get_zeroed_page(gfp);
+
 			if (!page)
 				return NULL;
 
@@ -1500,8 +1553,8 @@
 			/* pte could have been changed somewhere. */
 			if (cmpxchg64(pte, __pte, __npte) != __pte)
 				free_page((unsigned long)page);
-			else if (pte_level == PAGE_MODE_7_LEVEL)
-				domain->updated = true;
+			else if (IOMMU_PTE_PRESENT(__pte))
+				*updated = true;
 
 			continue;
 		}
@@ -1566,17 +1619,12 @@
 		*page_size = PTE_LEVEL_PAGE_SIZE(level);
 	}
 
-	if (PM_PTE_LEVEL(*pte) == 0x07) {
-		unsigned long pte_mask;
-
-		/*
-		 * If we have a series of large PTEs, make
-		 * sure to return a pointer to the first one.
-		 */
-		*page_size = pte_mask = PTE_PAGE_SIZE(*pte);
-		pte_mask   = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
-		pte        = (u64 *)(((unsigned long)pte) & pte_mask);
-	}
+	/*
+	 * If we have a series of large PTEs, make
+	 * sure to return a pointer to the first one.
+	 */
+	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
+		pte = first_pte_l7(pte, page_size, NULL);
 
 	return pte;
 }
@@ -1615,26 +1663,29 @@
 			  gfp_t gfp)
 {
 	struct page *freelist = NULL;
+	bool updated = false;
 	u64 __pte, *pte;
-	int i, count;
+	int ret, i, count;
 
 	BUG_ON(!IS_ALIGNED(bus_addr, page_size));
 	BUG_ON(!IS_ALIGNED(phys_addr, page_size));
 
+	ret = -EINVAL;
 	if (!(prot & IOMMU_PROT_MASK))
-		return -EINVAL;
+		goto out;
 
 	count = PAGE_SIZE_PTE_COUNT(page_size);
-	pte   = alloc_pte(dom, bus_addr, page_size, NULL, gfp);
+	pte   = alloc_pte(dom, bus_addr, page_size, NULL, gfp, &updated);
 
+	ret = -ENOMEM;
 	if (!pte)
-		return -ENOMEM;
+		goto out;
 
 	for (i = 0; i < count; ++i)
 		freelist = free_clear_pte(&pte[i], pte[i], freelist);
 
 	if (freelist != NULL)
-		dom->updated = true;
+		updated = true;
 
 	if (count > 1) {
 		__pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
@@ -1650,12 +1701,21 @@
 	for (i = 0; i < count; ++i)
 		pte[i] = __pte;
 
-	update_domain(dom);
+	ret = 0;
+
+out:
+	if (updated) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&dom->lock, flags);
+		update_domain(dom);
+		spin_unlock_irqrestore(&dom->lock, flags);
+	}
 
 	/* Everything flushed out, free pages now */
 	free_page_list(freelist);
 
-	return 0;
+	return ret;
 }
 
 static unsigned long iommu_unmap_page(struct protection_domain *dom,
@@ -1806,8 +1866,12 @@
 
 static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&dom->domain.lock, flags);
 	domain_flush_tlb(&dom->domain);
 	domain_flush_complete(&dom->domain);
+	spin_unlock_irqrestore(&dom->domain.lock, flags);
 }
 
 static void iova_domain_flush_tlb(struct iova_domain *iovad)
@@ -2022,36 +2086,6 @@
 	domain->dev_cnt                 -= 1;
 }
 
-/*
- * If a device is not yet associated with a domain, this function makes the
- * device visible in the domain
- */
-static int __attach_device(struct iommu_dev_data *dev_data,
-			   struct protection_domain *domain)
-{
-	int ret;
-
-	/* lock domain */
-	spin_lock(&domain->lock);
-
-	ret = -EBUSY;
-	if (dev_data->domain != NULL)
-		goto out_unlock;
-
-	/* Attach alias group root */
-	do_attach(dev_data, domain);
-
-	ret = 0;
-
-out_unlock:
-
-	/* ready */
-	spin_unlock(&domain->lock);
-
-	return ret;
-}
-
-
 static void pdev_iommuv2_disable(struct pci_dev *pdev)
 {
 	pci_disable_ats(pdev);
@@ -2133,19 +2167,28 @@
 	unsigned long flags;
 	int ret;
 
+	spin_lock_irqsave(&domain->lock, flags);
+
 	dev_data = get_dev_data(dev);
 
+	spin_lock(&dev_data->lock);
+
+	ret = -EBUSY;
+	if (dev_data->domain != NULL)
+		goto out;
+
 	if (!dev_is_pci(dev))
 		goto skip_ats_check;
 
 	pdev = to_pci_dev(dev);
 	if (domain->flags & PD_IOMMUV2_MASK) {
+		ret = -EINVAL;
 		if (!dev_data->passthrough)
-			return -EINVAL;
+			goto out;
 
 		if (dev_data->iommu_v2) {
 			if (pdev_iommuv2_enable(pdev) != 0)
-				return -EINVAL;
+				goto out;
 
 			dev_data->ats.enabled = true;
 			dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
@@ -2158,9 +2201,9 @@
 	}
 
 skip_ats_check:
-	spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	ret = __attach_device(dev_data, domain);
-	spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+	ret = 0;
+
+	do_attach(dev_data, domain);
 
 	/*
 	 * We might boot into a crash-kernel here. The crashed kernel
@@ -2169,26 +2212,17 @@
 	 */
 	domain_flush_tlb_pde(domain);
 
+	domain_flush_complete(domain);
+
+out:
+	spin_unlock(&dev_data->lock);
+
+	spin_unlock_irqrestore(&domain->lock, flags);
+
 	return ret;
 }
 
 /*
- * Removes a device from a protection domain (unlocked)
- */
-static void __detach_device(struct iommu_dev_data *dev_data)
-{
-	struct protection_domain *domain;
-
-	domain = dev_data->domain;
-
-	spin_lock(&domain->lock);
-
-	do_detach(dev_data);
-
-	spin_unlock(&domain->lock);
-}
-
-/*
  * Removes a device from a protection domain (with devtable_lock held)
  */
 static void detach_device(struct device *dev)
@@ -2200,6 +2234,10 @@
 	dev_data = get_dev_data(dev);
 	domain   = dev_data->domain;
 
+	spin_lock_irqsave(&domain->lock, flags);
+
+	spin_lock(&dev_data->lock);
+
 	/*
 	 * First check if the device is still attached. It might already
 	 * be detached from its domain because the generic
@@ -2207,15 +2245,12 @@
 	 * our alias handling.
 	 */
 	if (WARN_ON(!dev_data->domain))
-		return;
+		goto out;
 
-	/* lock device table */
-	spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	__detach_device(dev_data);
-	spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+	do_detach(dev_data);
 
 	if (!dev_is_pci(dev))
-		return;
+		goto out;
 
 	if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2)
 		pdev_iommuv2_disable(to_pci_dev(dev));
@@ -2223,6 +2258,11 @@
 		pci_disable_ats(to_pci_dev(dev));
 
 	dev_data->ats.enabled = false;
+
+out:
+	spin_unlock(&dev_data->lock);
+
+	spin_unlock_irqrestore(&domain->lock, flags);
 }
 
 static int amd_iommu_add_device(struct device *dev)
@@ -2354,15 +2394,10 @@
 
 static void update_domain(struct protection_domain *domain)
 {
-	if (!domain->updated)
-		return;
-
 	update_device_table(domain);
 
 	domain_flush_devices(domain);
 	domain_flush_tlb_pde(domain);
-
-	domain->updated = false;
 }
 
 static int dir2prot(enum dma_data_direction direction)
@@ -2392,6 +2427,7 @@
 {
 	dma_addr_t offset = paddr & ~PAGE_MASK;
 	dma_addr_t address, start, ret;
+	unsigned long flags;
 	unsigned int pages;
 	int prot = 0;
 	int i;
@@ -2429,8 +2465,10 @@
 		iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE);
 	}
 
+	spin_lock_irqsave(&dma_dom->domain.lock, flags);
 	domain_flush_tlb(&dma_dom->domain);
 	domain_flush_complete(&dma_dom->domain);
+	spin_unlock_irqrestore(&dma_dom->domain.lock, flags);
 
 	dma_ops_free_iova(dma_dom, address, pages);
 
@@ -2459,8 +2497,12 @@
 	}
 
 	if (amd_iommu_unmap_flush) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&dma_dom->domain.lock, flags);
 		domain_flush_tlb(&dma_dom->domain);
 		domain_flush_complete(&dma_dom->domain);
+		spin_unlock_irqrestore(&dma_dom->domain.lock, flags);
 		dma_ops_free_iova(dma_dom, dma_addr, pages);
 	} else {
 		pages = __roundup_pow_of_two(pages);
@@ -2866,16 +2908,16 @@
 	struct iommu_dev_data *entry;
 	unsigned long flags;
 
-	spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	spin_lock_irqsave(&domain->lock, flags);
 
 	while (!list_empty(&domain->dev_list)) {
 		entry = list_first_entry(&domain->dev_list,
 					 struct iommu_dev_data, list);
 		BUG_ON(!entry->domain);
-		__detach_device(entry);
+		do_detach(entry);
 	}
 
-	spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+	spin_unlock_irqrestore(&domain->lock, flags);
 }
 
 static void protection_domain_free(struct protection_domain *domain)
@@ -3226,9 +3268,12 @@
 static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
 {
 	struct protection_domain *dom = to_pdomain(domain);
+	unsigned long flags;
 
+	spin_lock_irqsave(&dom->lock, flags);
 	domain_flush_tlb_pde(dom);
 	domain_flush_complete(dom);
+	spin_unlock_irqrestore(&dom->lock, flags);
 }
 
 static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
@@ -3290,7 +3335,6 @@
 
 	/* Update data structure */
 	domain->mode    = PAGE_MODE_NONE;
-	domain->updated = true;
 
 	/* Make changes visible to IOMMUs */
 	update_domain(domain);
@@ -3336,7 +3380,6 @@
 
 	domain->glx      = levels;
 	domain->flags   |= PD_IOMMUV2_MASK;
-	domain->updated  = true;
 
 	update_domain(domain);
 
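For reference, the pointer arithmetic in first_pte_l7() works because each PTE is sizeof(u64) == 8 bytes, so cnt << 3 is the byte span of the replicated series, and masking with ~(span - 1) rounds any pointer within the series down to its first entry. A worked example, assuming a 32 KiB mapping:

	/*
	 * pg_size  = 32768
	 * cnt      = PAGE_SIZE_PTE_COUNT(32768) = 8 replicated PTEs
	 * cnt << 3 = 64 bytes of page-table span
	 * pte_mask = ~0x3fUL
	 *
	 * so any of the eight PTE pointers is aligned down to the same
	 * 64-byte boundary: the first PTE of the series.
	 */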
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 9ac229e..c9c1612 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -475,7 +475,6 @@
 	int glx;		/* Number of levels for GCR3 table */
 	u64 *gcr3_tbl;		/* Guest CR3 table */
 	unsigned long flags;	/* flags to find out type of domain */
-	bool updated;		/* complete domain flush required */
 	unsigned dev_cnt;	/* devices assigned to this domain */
 	unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
 };
@@ -634,6 +633,9 @@
  * This struct contains device specific data for the IOMMU
  */
 struct iommu_dev_data {
+	/* Protect against attach/detach races */
+	spinlock_t lock;
+
 	struct list_head list;		  /* For domain->dev_list */
 	struct llist_node dev_data_list;  /* For global dev_data_list */
 	struct protection_domain *domain; /* Domain the device is bound to */
diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index c6ba37d..dff4132 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -754,6 +754,8 @@
 
 	if (sock->type != SOCK_RAW)
 		return -ESOCKTNOSUPPORT;
+	if (!capable(CAP_NET_RAW))
+		return -EPERM;
 
 	sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern);
 	if (!sk)
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 9118ab8..dab4446 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -345,6 +345,14 @@
 #define DEBUG_bytes(bytes, len, msg, ...)	do { } while (0)
 #endif
 
+static void dm_integrity_prepare(struct request *rq)
+{
+}
+
+static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes)
+{
+}
+
 /*
  * DM Integrity profile, protection is performed layer above (dm-crypt)
  */
@@ -352,6 +360,8 @@
 	.name			= "DM-DIF-EXT-TAG",
 	.generate_fn		= NULL,
 	.verify_fn		= NULL,
+	.prepare_fn		= dm_integrity_prepare,
+	.complete_fn		= dm_integrity_complete,
 };
 
 static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
diff --git a/drivers/media/i2c/adv748x/adv748x-core.c b/drivers/media/i2c/adv748x/adv748x-core.c
index 0a47d47..23e02ff 100644
--- a/drivers/media/i2c/adv748x/adv748x-core.c
+++ b/drivers/media/i2c/adv748x/adv748x-core.c
@@ -183,14 +183,14 @@
 	int ret;
 
 	for (i = ADV748X_PAGE_DPLL; i < ADV748X_PAGE_MAX; ++i) {
-		state->i2c_clients[i] = i2c_new_secondary_device(
+		state->i2c_clients[i] = i2c_new_ancillary_device(
 				state->client,
 				adv748x_default_addresses[i].name,
 				adv748x_default_addresses[i].default_addr);
 
-		if (state->i2c_clients[i] == NULL) {
+		if (IS_ERR(state->i2c_clients[i])) {
 			adv_err(state, "failed to create i2c client %u\n", i);
-			return -ENOMEM;
+			return PTR_ERR(state->i2c_clients[i]);
 		}
 
 		ret = adv748x_configure_regmap(state, i);
diff --git a/drivers/media/i2c/adv7604.c b/drivers/media/i2c/adv7604.c
index 28a84bf..2dedd6e 100644
--- a/drivers/media/i2c/adv7604.c
+++ b/drivers/media/i2c/adv7604.c
@@ -2862,10 +2862,8 @@
 {
 	unsigned int i;
 
-	for (i = 1; i < ARRAY_SIZE(state->i2c_clients); ++i) {
-		if (state->i2c_clients[i])
-			i2c_unregister_device(state->i2c_clients[i]);
-	}
+	for (i = 1; i < ARRAY_SIZE(state->i2c_clients); ++i)
+		i2c_unregister_device(state->i2c_clients[i]);
 }
 
 static struct i2c_client *adv76xx_dummy_client(struct v4l2_subdev *sd,
@@ -2878,14 +2876,14 @@
 	struct i2c_client *new_client;
 
 	if (pdata && pdata->i2c_addresses[page])
-		new_client = i2c_new_dummy(client->adapter,
+		new_client = i2c_new_dummy_device(client->adapter,
 					   pdata->i2c_addresses[page]);
 	else
-		new_client = i2c_new_secondary_device(client,
+		new_client = i2c_new_ancillary_device(client,
 				adv76xx_default_addresses[page].name,
 				adv76xx_default_addresses[page].default_addr);
 
-	if (new_client)
+	if (!IS_ERR(new_client))
 		io_write(sd, io_reg, new_client->addr << 1);
 
 	return new_client;
@@ -3516,15 +3514,19 @@
 	}
 
 	for (i = 1; i < ADV76XX_PAGE_MAX; ++i) {
+		struct i2c_client *dummy_client;
+
 		if (!(BIT(i) & state->info->page_mask))
 			continue;
 
-		state->i2c_clients[i] = adv76xx_dummy_client(sd, i);
-		if (!state->i2c_clients[i]) {
-			err = -EINVAL;
+		dummy_client = adv76xx_dummy_client(sd, i);
+		if (IS_ERR(dummy_client)) {
+			err = PTR_ERR(dummy_client);
 			v4l2_err(sd, "failed to create i2c client %u\n", i);
 			goto err_i2c;
 		}
+
+		state->i2c_clients[i] = dummy_client;
 	}
 
 	INIT_DELAYED_WORK(&state->delayed_work_enable_hotplug,
diff --git a/drivers/media/v4l2-core/videobuf-dma-contig.c b/drivers/media/v4l2-core/videobuf-dma-contig.c
index 76b4ac7..aeb2f49 100644
--- a/drivers/media/v4l2-core/videobuf-dma-contig.c
+++ b/drivers/media/v4l2-core/videobuf-dma-contig.c
@@ -157,6 +157,7 @@
 static int videobuf_dma_contig_user_get(struct videobuf_dma_contig_memory *mem,
 					struct videobuf_buffer *vb)
 {
+	unsigned long untagged_baddr = untagged_addr(vb->baddr);
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	unsigned long prev_pfn, this_pfn;
@@ -164,22 +165,22 @@
 	unsigned int offset;
 	int ret;
 
-	offset = vb->baddr & ~PAGE_MASK;
+	offset = untagged_baddr & ~PAGE_MASK;
 	mem->size = PAGE_ALIGN(vb->size + offset);
 	ret = -EINVAL;
 
 	down_read(&mm->mmap_sem);
 
-	vma = find_vma(mm, vb->baddr);
+	vma = find_vma(mm, untagged_baddr);
 	if (!vma)
 		goto out_up;
 
-	if ((vb->baddr + mem->size) > vma->vm_end)
+	if ((untagged_baddr + mem->size) > vma->vm_end)
 		goto out_up;
 
 	pages_done = 0;
 	prev_pfn = 0; /* kill warning */
-	user_address = vb->baddr;
+	user_address = untagged_baddr;
 
 	while (pages_done < (mem->size >> PAGE_SHIFT)) {
 		ret = follow_pfn(vma, user_address, &this_pfn);
diff --git a/drivers/mfd/88pm800.c b/drivers/mfd/88pm800.c
index f2d9fb4..4e8d0d6 100644
--- a/drivers/mfd/88pm800.c
+++ b/drivers/mfd/88pm800.c
@@ -425,10 +425,10 @@
 		return -ENODEV;
 
 	/* PM800 block power page */
-	subchip->power_page = i2c_new_dummy(client->adapter,
+	subchip->power_page = i2c_new_dummy_device(client->adapter,
 					    subchip->power_page_addr);
-	if (subchip->power_page == NULL) {
-		ret = -ENODEV;
+	if (IS_ERR(subchip->power_page)) {
+		ret = PTR_ERR(subchip->power_page);
 		goto out;
 	}
 
@@ -444,10 +444,10 @@
 	i2c_set_clientdata(subchip->power_page, chip);
 
 	/* PM800 block GPADC */
-	subchip->gpadc_page = i2c_new_dummy(client->adapter,
+	subchip->gpadc_page = i2c_new_dummy_device(client->adapter,
 					    subchip->gpadc_page_addr);
-	if (subchip->gpadc_page == NULL) {
-		ret = -ENODEV;
+	if (IS_ERR(subchip->gpadc_page)) {
+		ret = PTR_ERR(subchip->gpadc_page);
 		goto out;
 	}
 
diff --git a/drivers/mfd/88pm860x-core.c b/drivers/mfd/88pm860x-core.c
index 9e0bd13..c9bae71 100644
--- a/drivers/mfd/88pm860x-core.c
+++ b/drivers/mfd/88pm860x-core.c
@@ -1178,12 +1178,12 @@
 	 */
 	if (pdata->companion_addr && (pdata->companion_addr != client->addr)) {
 		chip->companion_addr = pdata->companion_addr;
-		chip->companion = i2c_new_dummy(chip->client->adapter,
+		chip->companion = i2c_new_dummy_device(chip->client->adapter,
 						chip->companion_addr);
-		if (!chip->companion) {
+		if (IS_ERR(chip->companion)) {
 			dev_err(&client->dev,
 				"Failed to allocate I2C companion device\n");
-			return -ENODEV;
+			return PTR_ERR(chip->companion);
 		}
 		chip->regmap_companion = regmap_init_i2c(chip->companion,
 							&pm860x_regmap_config);
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index c9c49da..ae24d3e 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -589,6 +589,17 @@
 	  Select this option for supporting Dollar Cove (TI version) PMIC
 	  device that is found on some Intel Cherry Trail systems.
 
+config INTEL_SOC_PMIC_MRFLD
+	tristate "Support for Intel Merrifield Basin Cove PMIC"
+	depends on GPIOLIB
+	depends on ACPI
+	depends on INTEL_SCU_IPC
+	select MFD_CORE
+	select REGMAP_IRQ
+	help
+	  Select this option for supporting Basin Cove PMIC device
+	  that is found on Intel Merrifield systems.
+
 config MFD_INTEL_LPSS
 	tristate
 	select COMMON_CLK
@@ -641,15 +652,6 @@
 	  host many different types of MODULbus daughterboards, including
 	  CAN and GPIO controllers.
 
-config MFD_JZ4740_ADC
-	bool "Janz JZ4740 ADC core"
-	select MFD_CORE
-	select GENERIC_IRQ_CHIP
-	depends on MACH_JZ4740
-	help
-	  Say yes here if you want support for the ADC unit in the JZ4740 SoC.
-	  This driver is necessary for jz4740-battery and jz4740-hwmon driver.
-
 config MFD_KEMPLD
 	tristate "Kontron module PLD device"
 	select MFD_CORE
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 0c0a848..c1067ea 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -189,7 +189,6 @@
 obj-$(CONFIG_LPC_ICH)		+= lpc_ich.o
 obj-$(CONFIG_MFD_RDC321X)	+= rdc321x-southbridge.o
 obj-$(CONFIG_MFD_JANZ_CMODIO)	+= janz-cmodio.o
-obj-$(CONFIG_MFD_JZ4740_ADC)	+= jz4740-adc.o
 obj-$(CONFIG_MFD_TPS6586X)	+= tps6586x.o
 obj-$(CONFIG_MFD_VX855)		+= vx855.o
 obj-$(CONFIG_MFD_WL1273_CORE)	+= wl1273-core.o
@@ -239,7 +238,9 @@
 obj-$(CONFIG_INTEL_SOC_PMIC_BXTWC)	+= intel_soc_pmic_bxtwc.o
 obj-$(CONFIG_INTEL_SOC_PMIC_CHTWC)	+= intel_soc_pmic_chtwc.o
 obj-$(CONFIG_INTEL_SOC_PMIC_CHTDC_TI)	+= intel_soc_pmic_chtdc_ti.o
-obj-$(CONFIG_MFD_MT6397)	+= mt6397-core.o
+mt6397-objs	:= mt6397-core.o mt6397-irq.o
+obj-$(CONFIG_MFD_MT6397)	+= mt6397.o
+obj-$(CONFIG_INTEL_SOC_PMIC_MRFLD)	+= intel_soc_pmic_mrfld.o
 
 obj-$(CONFIG_MFD_ALTERA_A10SR)	+= altera-a10sr.o
 obj-$(CONFIG_MFD_ALTERA_SYSMGR) += altera-sysmgr.o
diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c
index 9f3dbc3..57723f1 100644
--- a/drivers/mfd/ab3100-core.c
+++ b/drivers/mfd/ab3100-core.c
@@ -865,10 +865,10 @@
 		 &ab3100->chip_name[0]);
 
 	/* Attach a second dummy i2c_client to the test register address */
-	ab3100->testreg_client = i2c_new_dummy(client->adapter,
+	ab3100->testreg_client = i2c_new_dummy_device(client->adapter,
 					       client->addr + 1);
-	if (!ab3100->testreg_client) {
-		err = -ENOMEM;
+	if (IS_ERR(ab3100->testreg_client)) {
+		err = PTR_ERR(ab3100->testreg_client);
 		goto exit_no_testreg_client;
 	}
 
diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c
index 567a34b..f4e26b6 100644
--- a/drivers/mfd/ab8500-debugfs.c
+++ b/drivers/mfd/ab8500-debugfs.c
@@ -2680,16 +2680,12 @@
 	irq_ab8500 = res->start;
 
 	irq_first = platform_get_irq_byname(plf, "IRQ_FIRST");
-	if (irq_first < 0) {
-		dev_err(&plf->dev, "First irq not found, err %d\n", irq_first);
+	if (irq_first < 0)
 		return irq_first;
-	}
 
 	irq_last = platform_get_irq_byname(plf, "IRQ_LAST");
-	if (irq_last < 0) {
-		dev_err(&plf->dev, "Last irq not found, err %d\n", irq_last);
+	if (irq_last < 0)
 		return irq_last;
-	}
 
 	ab8500_dir = debugfs_create_dir(AB8500_NAME_STRING, NULL);
 
diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c
index 83b18c9..a6bd213 100644
--- a/drivers/mfd/asic3.c
+++ b/drivers/mfd/asic3.c
@@ -15,7 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/irq.h>
-#include <linux/gpio.h>
+#include <linux/gpio/driver.h>
 #include <linux/export.h>
 #include <linux/io.h>
 #include <linux/slab.h>
diff --git a/drivers/mfd/bcm590xx.c b/drivers/mfd/bcm590xx.c
index 1aeb5e4..bfac5dc 100644
--- a/drivers/mfd/bcm590xx.c
+++ b/drivers/mfd/bcm590xx.c
@@ -61,11 +61,11 @@
 	}
 
 	/* Secondary I2C slave address is the base address with A(2) asserted */
-	bcm590xx->i2c_sec = i2c_new_dummy(i2c_pri->adapter,
+	bcm590xx->i2c_sec = i2c_new_dummy_device(i2c_pri->adapter,
 					  i2c_pri->addr | BIT(2));
-	if (!bcm590xx->i2c_sec) {
+	if (IS_ERR(bcm590xx->i2c_sec)) {
 		dev_err(&i2c_pri->dev, "failed to add secondary I2C device\n");
-		return -ENODEV;
+		return PTR_ERR(bcm590xx->i2c_sec);
 	}
 	i2c_set_clientdata(bcm590xx->i2c_sec, bcm590xx);
 
diff --git a/drivers/mfd/da9150-core.c b/drivers/mfd/da9150-core.c
index 1303306..7f0aa1e 100644
--- a/drivers/mfd/da9150-core.c
+++ b/drivers/mfd/da9150-core.c
@@ -420,10 +420,10 @@
 	qif_addr = da9150_reg_read(da9150, DA9150_CORE2WIRE_CTRL_A);
 	qif_addr = (qif_addr & DA9150_CORE_BASE_ADDR_MASK) >> 1;
 	qif_addr |= DA9150_QIF_I2C_ADDR_LSB;
-	da9150->core_qif = i2c_new_dummy(client->adapter, qif_addr);
-	if (!da9150->core_qif) {
+	da9150->core_qif = i2c_new_dummy_device(client->adapter, qif_addr);
+	if (IS_ERR(da9150->core_qif)) {
 		dev_err(da9150->dev, "Failed to attach QIF client\n");
-		return -ENODEV;
+		return PTR_ERR(da9150->core_qif);
 	}
 
 	i2c_set_clientdata(da9150->core_qif, da9150);
diff --git a/drivers/mfd/davinci_voicecodec.c b/drivers/mfd/davinci_voicecodec.c
index 13ca720..e5c8bc9 100644
--- a/drivers/mfd/davinci_voicecodec.c
+++ b/drivers/mfd/davinci_voicecodec.c
@@ -19,7 +19,6 @@
 #include <sound/pcm.h>
 
 #include <linux/mfd/davinci_voicecodec.h>
-#include <mach/hardware.h>
 
 static const struct regmap_config davinci_vc_regmap = {
 	.reg_bits = 32,
@@ -31,6 +30,7 @@
 	struct davinci_vc *davinci_vc;
 	struct resource *res;
 	struct mfd_cell *cell = NULL;
+	dma_addr_t fifo_base;
 	int ret;
 
 	davinci_vc = devm_kzalloc(&pdev->dev,
@@ -48,6 +48,7 @@
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 
+	fifo_base = (dma_addr_t)res->start;
 	davinci_vc->base = devm_ioremap_resource(&pdev->dev, res);
 	if (IS_ERR(davinci_vc->base)) {
 		ret = PTR_ERR(davinci_vc->base);
@@ -70,8 +71,7 @@
 	}
 
 	davinci_vc->davinci_vcif.dma_tx_channel = res->start;
-	davinci_vc->davinci_vcif.dma_tx_addr =
-		(dma_addr_t)(io_v2p(davinci_vc->base) + DAVINCI_VC_WFIFO);
+	davinci_vc->davinci_vcif.dma_tx_addr = fifo_base + DAVINCI_VC_WFIFO;
 
 	res = platform_get_resource(pdev, IORESOURCE_DMA, 1);
 	if (!res) {
@@ -81,8 +81,7 @@
 	}
 
 	davinci_vc->davinci_vcif.dma_rx_channel = res->start;
-	davinci_vc->davinci_vcif.dma_rx_addr =
-		(dma_addr_t)(io_v2p(davinci_vc->base) + DAVINCI_VC_RFIFO);
+	davinci_vc->davinci_vcif.dma_rx_addr = fifo_base + DAVINCI_VC_RFIFO;
 
 	davinci_vc->dev = &pdev->dev;
 	davinci_vc->pdev = pdev;
diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c
index 90e0f21..dfac6af 100644
--- a/drivers/mfd/db8500-prcmu.c
+++ b/drivers/mfd/db8500-prcmu.c
@@ -36,7 +36,6 @@
 #include <linux/regulator/db8500-prcmu.h>
 #include <linux/regulator/machine.h>
 #include <linux/platform_data/ux500_wdt.h>
-#include <linux/platform_data/db8500_thermal.h>
 #include "dbx500-prcmu-regs.h"
 
 /* Index of different voltages to be used when accessing AVSData */
@@ -1695,21 +1694,41 @@
 	return rounded_rate;
 }
 
-static const unsigned long armss_freqs[] = {
+static const unsigned long db8500_armss_freqs[] = {
 	200000000,
 	400000000,
 	800000000,
 	998400000
 };
 
+/* The DB8520 has slightly higher ARMSS max frequency */
+static const unsigned long db8520_armss_freqs[] = {
+	200000000,
+	400000000,
+	800000000,
+	1152000000
+};
+
+
+
 static long round_armss_rate(unsigned long rate)
 {
 	unsigned long freq = 0;
+	const unsigned long *freqs;
+	int nfreqs;
 	int i;
 
+	if (fw_info.version.project == PRCMU_FW_PROJECT_U8520) {
+		freqs = db8520_armss_freqs;
+		nfreqs = ARRAY_SIZE(db8520_armss_freqs);
+	} else {
+		freqs = db8500_armss_freqs;
+		nfreqs = ARRAY_SIZE(db8500_armss_freqs);
+	}
+
 	/* Find the corresponding arm opp from the cpufreq table. */
-	for (i = 0; i < ARRAY_SIZE(armss_freqs); i++) {
-		freq = armss_freqs[i];
+	for (i = 0; i < nfreqs; i++) {
+		freq = freqs[i];
 		if (rate <= freq)
 			break;
 	}
@@ -1854,11 +1873,21 @@
 {
 	unsigned long freq;
 	u8 opps[] = { ARM_EXTCLK, ARM_50_OPP, ARM_100_OPP, ARM_MAX_OPP };
+	const unsigned long *freqs;
+	int nfreqs;
 	int i;
 
+	if (fw_info.version.project == PRCMU_FW_PROJECT_U8520) {
+		freqs = db8520_armss_freqs;
+		nfreqs = ARRAY_SIZE(db8520_armss_freqs);
+	} else {
+		freqs = db8500_armss_freqs;
+		nfreqs = ARRAY_SIZE(db8500_armss_freqs);
+	}
+
 	/* Find the corresponding arm opp from the cpufreq table. */
-	for (i = 0; i < ARRAY_SIZE(armss_freqs); i++) {
-		freq = armss_freqs[i];
+	for (i = 0; i < nfreqs; i++) {
+		freq = freqs[i];
 		if (rate == freq)
 			break;
 	}
@@ -2984,53 +3013,6 @@
 	.timeout = 600, /* 10 minutes */
 	.has_28_bits_resolution = true,
 };
-/*
- * Thermal Sensor
- */
-
-static struct resource db8500_thsens_resources[] = {
-	{
-		.name = "IRQ_HOTMON_LOW",
-		.start  = IRQ_PRCMU_HOTMON_LOW,
-		.end    = IRQ_PRCMU_HOTMON_LOW,
-		.flags  = IORESOURCE_IRQ,
-	},
-	{
-		.name = "IRQ_HOTMON_HIGH",
-		.start  = IRQ_PRCMU_HOTMON_HIGH,
-		.end    = IRQ_PRCMU_HOTMON_HIGH,
-		.flags  = IORESOURCE_IRQ,
-	},
-};
-
-static struct db8500_thsens_platform_data db8500_thsens_data = {
-	.trip_points[0] = {
-		.temp = 70000,
-		.type = THERMAL_TRIP_ACTIVE,
-		.cdev_name = {
-			[0] = "thermal-cpufreq-0",
-		},
-	},
-	.trip_points[1] = {
-		.temp = 75000,
-		.type = THERMAL_TRIP_ACTIVE,
-		.cdev_name = {
-			[0] = "thermal-cpufreq-0",
-		},
-	},
-	.trip_points[2] = {
-		.temp = 80000,
-		.type = THERMAL_TRIP_ACTIVE,
-		.cdev_name = {
-			[0] = "thermal-cpufreq-0",
-		},
-	},
-	.trip_points[3] = {
-		.temp = 85000,
-		.type = THERMAL_TRIP_CRITICAL,
-	},
-	.num_trips = 4,
-};
 
 static const struct mfd_cell common_prcmu_devs[] = {
 	{
@@ -3054,10 +3036,7 @@
 	},
 	{
 		.name = "db8500-thermal",
-		.num_resources = ARRAY_SIZE(db8500_thsens_resources),
-		.resources = db8500_thsens_resources,
-		.platform_data = &db8500_thsens_data,
-		.pdata_size = sizeof(db8500_thsens_data),
+		.of_compatible = "stericsson,db8500-thermal",
 	},
 };
 
@@ -3130,10 +3109,8 @@
 	writel(ALL_MBOX_BITS, PRCM_ARM_IT1_CLR);
 
 	irq = platform_get_irq(pdev, 0);
-	if (irq <= 0) {
-		dev_err(&pdev->dev, "no prcmu irq provided\n");
+	if (irq <= 0)
 		return irq;
-	}
 
 	err = request_threaded_irq(irq, prcmu_irq_handler,
 	        prcmu_irq_thread_fn, IRQF_NO_SUSPEND, "prcmu", NULL);
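
The DB8520 support above selects a frequency table at run time and then
rounds a requested rate to a supported step. The rounding logic in
isolation, as a sketch (table walk as in the hunk; the function name is
illustrative):

#include <stddef.h>

/* Sketch: return the first table entry >= rate, or the table maximum. */
static unsigned long example_round_armss_rate(unsigned long rate,
					      const unsigned long *freqs,
					      size_t nfreqs)
{
	size_t i;

	for (i = 0; i < nfreqs - 1; i++) {
		if (rate <= freqs[i])
			break;
	}
	return freqs[i];
}
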
diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c
index f505e3e..70fa18b 100644
--- a/drivers/mfd/ezx-pcap.c
+++ b/drivers/mfd/ezx-pcap.c
@@ -35,7 +35,7 @@
 
 	/* IO */
 	u32 buf;
-	struct mutex io_mutex;
+	spinlock_t io_lock;
 
 	/* IRQ */
 	unsigned int irq_base;
@@ -48,7 +48,7 @@
 	struct pcap_adc_request *adc_queue[PCAP_ADC_MAXQ];
 	u8 adc_head;
 	u8 adc_tail;
-	struct mutex adc_mutex;
+	spinlock_t adc_lock;
 };
 
 /* IO */
@@ -76,14 +76,15 @@
 
 int ezx_pcap_write(struct pcap_chip *pcap, u8 reg_num, u32 value)
 {
+	unsigned long flags;
 	int ret;
 
-	mutex_lock(&pcap->io_mutex);
+	spin_lock_irqsave(&pcap->io_lock, flags);
 	value &= PCAP_REGISTER_VALUE_MASK;
 	value |= PCAP_REGISTER_WRITE_OP_BIT
 		| (reg_num << PCAP_REGISTER_ADDRESS_SHIFT);
 	ret = ezx_pcap_putget(pcap, &value);
-	mutex_unlock(&pcap->io_mutex);
+	spin_unlock_irqrestore(&pcap->io_lock, flags);
 
 	return ret;
 }
@@ -91,14 +92,15 @@
 
 int ezx_pcap_read(struct pcap_chip *pcap, u8 reg_num, u32 *value)
 {
+	unsigned long flags;
 	int ret;
 
-	mutex_lock(&pcap->io_mutex);
+	spin_lock_irqsave(&pcap->io_lock, flags);
 	*value = PCAP_REGISTER_READ_OP_BIT
 		| (reg_num << PCAP_REGISTER_ADDRESS_SHIFT);
 
 	ret = ezx_pcap_putget(pcap, value);
-	mutex_unlock(&pcap->io_mutex);
+	spin_unlock_irqrestore(&pcap->io_lock, flags);
 
 	return ret;
 }
@@ -106,11 +108,12 @@
 
 int ezx_pcap_set_bits(struct pcap_chip *pcap, u8 reg_num, u32 mask, u32 val)
 {
+	unsigned long flags;
 	int ret;
 	u32 tmp = PCAP_REGISTER_READ_OP_BIT |
 		(reg_num << PCAP_REGISTER_ADDRESS_SHIFT);
 
-	mutex_lock(&pcap->io_mutex);
+	spin_lock_irqsave(&pcap->io_lock, flags);
 	ret = ezx_pcap_putget(pcap, &tmp);
 	if (ret)
 		goto out_unlock;
@@ -121,7 +124,7 @@
 
 	ret = ezx_pcap_putget(pcap, &tmp);
 out_unlock:
-	mutex_unlock(&pcap->io_mutex);
+	spin_unlock_irqrestore(&pcap->io_lock, flags);
 
 	return ret;
 }
@@ -212,14 +215,15 @@
 /* ADC */
 void pcap_set_ts_bits(struct pcap_chip *pcap, u32 bits)
 {
+	unsigned long flags;
 	u32 tmp;
 
-	mutex_lock(&pcap->adc_mutex);
+	spin_lock_irqsave(&pcap->adc_lock, flags);
 	ezx_pcap_read(pcap, PCAP_REG_ADC, &tmp);
 	tmp &= ~(PCAP_ADC_TS_M_MASK | PCAP_ADC_TS_REF_LOWPWR);
 	tmp |= bits & (PCAP_ADC_TS_M_MASK | PCAP_ADC_TS_REF_LOWPWR);
 	ezx_pcap_write(pcap, PCAP_REG_ADC, tmp);
-	mutex_unlock(&pcap->adc_mutex);
+	spin_unlock_irqrestore(&pcap->adc_lock, flags);
 }
 EXPORT_SYMBOL_GPL(pcap_set_ts_bits);
 
@@ -234,15 +238,16 @@
 
 static void pcap_adc_trigger(struct pcap_chip *pcap)
 {
+	unsigned long flags;
 	u32 tmp;
 	u8 head;
 
-	mutex_lock(&pcap->adc_mutex);
+	spin_lock_irqsave(&pcap->adc_lock, flags);
 	head = pcap->adc_head;
 	if (!pcap->adc_queue[head]) {
 		/* queue is empty, save power */
 		pcap_disable_adc(pcap);
-		mutex_unlock(&pcap->adc_mutex);
+		spin_unlock_irqrestore(&pcap->adc_lock, flags);
 		return;
 	}
 	/* start conversion on requested bank, save TS_M bits */
@@ -254,7 +259,7 @@
 		tmp |= PCAP_ADC_AD_SEL1;
 
 	ezx_pcap_write(pcap, PCAP_REG_ADC, tmp);
-	mutex_unlock(&pcap->adc_mutex);
+	spin_unlock_irqrestore(&pcap->adc_lock, flags);
 	ezx_pcap_write(pcap, PCAP_REG_ADR, PCAP_ADR_ASC);
 }
 
@@ -265,11 +270,11 @@
 	u16 res[2];
 	u32 tmp;
 
-	mutex_lock(&pcap->adc_mutex);
+	spin_lock(&pcap->adc_lock);
 	req = pcap->adc_queue[pcap->adc_head];
 
 	if (WARN(!req, "adc irq without pending request\n")) {
-		mutex_unlock(&pcap->adc_mutex);
+		spin_unlock(&pcap->adc_lock);
 		return IRQ_HANDLED;
 	}
 
@@ -285,7 +290,7 @@
 
 	pcap->adc_queue[pcap->adc_head] = NULL;
 	pcap->adc_head = (pcap->adc_head + 1) & (PCAP_ADC_MAXQ - 1);
-	mutex_unlock(&pcap->adc_mutex);
+	spin_unlock(&pcap->adc_lock);
 
 	/* pass the results and release memory */
 	req->callback(req->data, res);
@@ -301,6 +306,7 @@
 						void *callback, void *data)
 {
 	struct pcap_adc_request *req;
+	unsigned long irq_flags;
 
 	/* This will be freed after we have a result */
 	req = kmalloc(sizeof(struct pcap_adc_request), GFP_KERNEL);
@@ -314,15 +320,15 @@
 	req->callback = callback;
 	req->data = data;
 
-	mutex_lock(&pcap->adc_mutex);
+	spin_lock_irqsave(&pcap->adc_lock, irq_flags);
 	if (pcap->adc_queue[pcap->adc_tail]) {
-		mutex_unlock(&pcap->adc_mutex);
+		spin_unlock_irqrestore(&pcap->adc_lock, irq_flags);
 		kfree(req);
 		return -EBUSY;
 	}
 	pcap->adc_queue[pcap->adc_tail] = req;
 	pcap->adc_tail = (pcap->adc_tail + 1) & (PCAP_ADC_MAXQ - 1);
-	mutex_unlock(&pcap->adc_mutex);
+	spin_unlock_irqrestore(&pcap->adc_lock, irq_flags);
 
 	/* start conversion */
 	pcap_adc_trigger(pcap);
@@ -389,16 +395,17 @@
 static int ezx_pcap_remove(struct spi_device *spi)
 {
 	struct pcap_chip *pcap = spi_get_drvdata(spi);
+	unsigned long flags;
 	int i;
 
 	/* remove all registered subdevs */
 	device_for_each_child(&spi->dev, NULL, pcap_remove_subdev);
 
 	/* cleanup ADC */
-	mutex_lock(&pcap->adc_mutex);
+	spin_lock_irqsave(&pcap->adc_lock, flags);
 	for (i = 0; i < PCAP_ADC_MAXQ; i++)
 		kfree(pcap->adc_queue[i]);
-	mutex_unlock(&pcap->adc_mutex);
+	spin_unlock_irqrestore(&pcap->adc_lock, flags);
 
 	/* cleanup irqchip */
 	for (i = pcap->irq_base; i < (pcap->irq_base + PCAP_NIRQS); i++)
@@ -426,8 +433,8 @@
 		goto ret;
 	}
 
-	mutex_init(&pcap->io_mutex);
-	mutex_init(&pcap->adc_mutex);
+	spin_lock_init(&pcap->io_lock);
+	spin_lock_init(&pcap->adc_lock);
 	INIT_WORK(&pcap->isr_work, pcap_isr_work);
 	INIT_WORK(&pcap->msr_work, pcap_msr_work);
 	spi_set_drvdata(spi, pcap);
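
The ezx-pcap conversion above swaps mutexes for spinlocks because the
ADC completion path runs in hard IRQ context, where sleeping locks are
forbidden. The general shape of the pattern, as a sketch (the struct
and function names are illustrative; spin_lock_init() would run at
probe time):

#include <linux/spinlock.h>

struct example_chip {
	spinlock_t lock;	/* protects @count; safe in IRQ context */
	unsigned int count;
};

/* Process context: save/restore IRQ flags around the critical section. */
static void example_update(struct example_chip *chip)
{
	unsigned long flags;

	spin_lock_irqsave(&chip->lock, flags);
	chip->count++;
	spin_unlock_irqrestore(&chip->lock, flags);
}

/* Hard IRQ handler: interrupts are already off, a plain lock suffices. */
static void example_update_from_irq(struct example_chip *chip)
{
	spin_lock(&chip->lock);
	chip->count++;
	spin_unlock(&chip->lock);
}
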
diff --git a/drivers/mfd/fsl-imx25-tsadc.c b/drivers/mfd/fsl-imx25-tsadc.c
index 20791ca..a016b39 100644
--- a/drivers/mfd/fsl-imx25-tsadc.c
+++ b/drivers/mfd/fsl-imx25-tsadc.c
@@ -69,10 +69,8 @@
 	int irq;
 
 	irq = platform_get_irq(pdev, 0);
-	if (irq <= 0) {
-		dev_err(dev, "Failed to get irq\n");
+	if (irq <= 0)
 		return irq;
-	}
 
 	tsadc->domain = irq_domain_add_simple(np, 2, 0, &mx25_tsadc_domain_ops,
 					      tsadc);
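
The dropped dev_err() calls here and in the ab8500-debugfs,
intel_soc_pmic_bxtwc, qcom_rpm and sm501 hunks follow the same rule:
platform_get_irq() and platform_get_irq_byname() now log a diagnostic
themselves on failure, so callers only need to propagate the code.
The resulting call-site idiom, as a sketch:

#include <linux/platform_device.h>

static int example_probe(struct platform_device *pdev)
{
	int irq;

	/* The core already prints an error message on failure. */
	irq = platform_get_irq(pdev, 0);
	if (irq < 0)
		return irq;

	/* ... request the IRQ, etc. ... */
	return 0;
}
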
diff --git a/drivers/mfd/htc-i2cpld.c b/drivers/mfd/htc-i2cpld.c
index 370519a..8ad6768 100644
--- a/drivers/mfd/htc-i2cpld.c
+++ b/drivers/mfd/htc-i2cpld.c
@@ -385,8 +385,7 @@
 	htcpld = platform_get_drvdata(pdev);
 	chip = &htcpld->chip[chip_index];
 
-	if (chip->client)
-		i2c_unregister_device(chip->client);
+	i2c_unregister_device(chip->client);
 }
 
 static int htcpld_register_chip_gpio(
diff --git a/drivers/mfd/intel-lpss-acpi.c b/drivers/mfd/intel-lpss-acpi.c
index 61ffb8b..c8fe334 100644
--- a/drivers/mfd/intel-lpss-acpi.c
+++ b/drivers/mfd/intel-lpss-acpi.c
@@ -18,6 +18,10 @@
 
 #include "intel-lpss.h"
 
+static const struct intel_lpss_platform_info spt_info = {
+	.clk_rate = 120000000,
+};
+
 static struct property_entry spt_i2c_properties[] = {
 	PROPERTY_ENTRY_U32("i2c-sda-hold-time-ns", 230),
 	{ },
@@ -28,6 +32,19 @@
 	.properties = spt_i2c_properties,
 };
 
+static struct property_entry uart_properties[] = {
+	PROPERTY_ENTRY_U32("reg-io-width", 4),
+	PROPERTY_ENTRY_U32("reg-shift", 2),
+	PROPERTY_ENTRY_BOOL("snps,uart-16550-compatible"),
+	{ },
+};
+
+static const struct intel_lpss_platform_info spt_uart_info = {
+	.clk_rate = 120000000,
+	.clk_con_id = "baudclk",
+	.properties = uart_properties,
+};
+
 static const struct intel_lpss_platform_info bxt_info = {
 	.clk_rate = 100000000,
 };
@@ -58,8 +75,17 @@
 
 static const struct acpi_device_id intel_lpss_acpi_ids[] = {
 	/* SPT */
+	{ "INT3440", (kernel_ulong_t)&spt_info },
+	{ "INT3441", (kernel_ulong_t)&spt_info },
+	{ "INT3442", (kernel_ulong_t)&spt_i2c_info },
+	{ "INT3443", (kernel_ulong_t)&spt_i2c_info },
+	{ "INT3444", (kernel_ulong_t)&spt_i2c_info },
+	{ "INT3445", (kernel_ulong_t)&spt_i2c_info },
 	{ "INT3446", (kernel_ulong_t)&spt_i2c_info },
 	{ "INT3447", (kernel_ulong_t)&spt_i2c_info },
+	{ "INT3448", (kernel_ulong_t)&spt_uart_info },
+	{ "INT3449", (kernel_ulong_t)&spt_uart_info },
+	{ "INT344A", (kernel_ulong_t)&spt_uart_info },
 	/* BXT */
 	{ "80860AAC", (kernel_ulong_t)&bxt_i2c_info },
 	{ "80860ABC", (kernel_ulong_t)&bxt_info },
diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c
index ade6e1c..9355db2 100644
--- a/drivers/mfd/intel-lpss-pci.c
+++ b/drivers/mfd/intel-lpss-pci.c
@@ -35,6 +35,8 @@
 	info->mem = &pdev->resource[0];
 	info->irq = pdev->irq;
 
+	pdev->d3cold_delay = 0;
+
 	/* Probably it is enough to set this for iDMA capable devices only */
 	pci_set_master(pdev);
 	pci_try_set_mwi(pdev);
@@ -256,6 +258,29 @@
 	{ PCI_VDEVICE(INTEL, 0x9dea), (kernel_ulong_t)&cnl_i2c_info },
 	{ PCI_VDEVICE(INTEL, 0x9deb), (kernel_ulong_t)&cnl_i2c_info },
 	{ PCI_VDEVICE(INTEL, 0x9dfb), (kernel_ulong_t)&spt_info },
+	/* TGL-LP */
+	{ PCI_VDEVICE(INTEL, 0xa0a8), (kernel_ulong_t)&bxt_uart_info },
+	{ PCI_VDEVICE(INTEL, 0xa0a9), (kernel_ulong_t)&bxt_uart_info },
+	{ PCI_VDEVICE(INTEL, 0xa0aa), (kernel_ulong_t)&spt_info },
+	{ PCI_VDEVICE(INTEL, 0xa0ab), (kernel_ulong_t)&spt_info },
+	{ PCI_VDEVICE(INTEL, 0xa0c5), (kernel_ulong_t)&spt_i2c_info },
+	{ PCI_VDEVICE(INTEL, 0xa0c6), (kernel_ulong_t)&spt_i2c_info },
+	{ PCI_VDEVICE(INTEL, 0xa0c7), (kernel_ulong_t)&bxt_uart_info },
+	{ PCI_VDEVICE(INTEL, 0xa0d8), (kernel_ulong_t)&spt_i2c_info },
+	{ PCI_VDEVICE(INTEL, 0xa0d9), (kernel_ulong_t)&spt_i2c_info },
+	{ PCI_VDEVICE(INTEL, 0xa0da), (kernel_ulong_t)&bxt_uart_info },
+	{ PCI_VDEVICE(INTEL, 0xa0db), (kernel_ulong_t)&bxt_uart_info },
+	{ PCI_VDEVICE(INTEL, 0xa0dc), (kernel_ulong_t)&bxt_uart_info },
+	{ PCI_VDEVICE(INTEL, 0xa0dd), (kernel_ulong_t)&bxt_uart_info },
+	{ PCI_VDEVICE(INTEL, 0xa0de), (kernel_ulong_t)&spt_info },
+	{ PCI_VDEVICE(INTEL, 0xa0df), (kernel_ulong_t)&spt_info },
+	{ PCI_VDEVICE(INTEL, 0xa0e8), (kernel_ulong_t)&spt_i2c_info },
+	{ PCI_VDEVICE(INTEL, 0xa0e9), (kernel_ulong_t)&spt_i2c_info },
+	{ PCI_VDEVICE(INTEL, 0xa0ea), (kernel_ulong_t)&spt_i2c_info },
+	{ PCI_VDEVICE(INTEL, 0xa0eb), (kernel_ulong_t)&spt_i2c_info },
+	{ PCI_VDEVICE(INTEL, 0xa0fb), (kernel_ulong_t)&spt_info },
+	{ PCI_VDEVICE(INTEL, 0xa0fd), (kernel_ulong_t)&spt_info },
+	{ PCI_VDEVICE(INTEL, 0xa0fe), (kernel_ulong_t)&spt_info },
 	/* SPT-H */
 	{ PCI_VDEVICE(INTEL, 0xa127), (kernel_ulong_t)&spt_uart_info },
 	{ PCI_VDEVICE(INTEL, 0xa128), (kernel_ulong_t)&spt_uart_info },
diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c
index 277f48f..bfe4ff3 100644
--- a/drivers/mfd/intel-lpss.c
+++ b/drivers/mfd/intel-lpss.c
@@ -47,10 +47,10 @@
 #define LPSS_PRIV_IDLELTR		0x14
 
 #define LPSS_PRIV_LTR_REQ		BIT(15)
-#define LPSS_PRIV_LTR_SCALE_MASK	0xc00
-#define LPSS_PRIV_LTR_SCALE_1US		0x800
-#define LPSS_PRIV_LTR_SCALE_32US	0xc00
-#define LPSS_PRIV_LTR_VALUE_MASK	0x3ff
+#define LPSS_PRIV_LTR_SCALE_MASK	GENMASK(11, 10)
+#define LPSS_PRIV_LTR_SCALE_1US		(2 << 10)
+#define LPSS_PRIV_LTR_SCALE_32US	(3 << 10)
+#define LPSS_PRIV_LTR_VALUE_MASK	GENMASK(9, 0)
 
 #define LPSS_PRIV_SSP_REG		0x20
 #define LPSS_PRIV_SSP_REG_DIS_DMA_FIN	BIT(0)
@@ -59,8 +59,8 @@
 
 #define LPSS_PRIV_CAPS			0xfc
 #define LPSS_PRIV_CAPS_NO_IDMA		BIT(8)
+#define LPSS_PRIV_CAPS_TYPE_MASK	GENMASK(7, 4)
 #define LPSS_PRIV_CAPS_TYPE_SHIFT	4
-#define LPSS_PRIV_CAPS_TYPE_MASK	(0xf << LPSS_PRIV_CAPS_TYPE_SHIFT)
 
 /* This matches the type field in CAPS register */
 enum intel_lpss_dev_type {
@@ -128,17 +128,6 @@
 static DEFINE_IDA(intel_lpss_devid_ida);
 static struct dentry *intel_lpss_debugfs;
 
-static int intel_lpss_request_dma_module(const char *name)
-{
-	static bool intel_lpss_dma_requested;
-
-	if (intel_lpss_dma_requested)
-		return 0;
-
-	intel_lpss_dma_requested = true;
-	return request_module("%s", name);
-}
-
 static void intel_lpss_cache_ltr(struct intel_lpss *lpss)
 {
 	lpss->active_ltr = readl(lpss->priv + LPSS_PRIV_ACTIVELTR);
@@ -429,16 +418,6 @@
 		dev_warn(dev, "Failed to create debugfs entries\n");
 
 	if (intel_lpss_has_idma(lpss)) {
-		/*
-		 * Ensure the DMA driver is loaded before the host
-		 * controller device appears, so that the host controller
-		 * driver can request its DMA channels as early as
-		 * possible.
-		 *
-		 * If the DMA module is not there that's OK as well.
-		 */
-		intel_lpss_request_dma_module(LPSS_IDMA64_DRIVER_NAME);
-
 		ret = mfd_add_devices(dev, lpss->devid, &intel_lpss_idma64_cell,
 				      1, info->mem, info->irq, NULL);
 		if (ret)
@@ -554,3 +533,11 @@
 MODULE_AUTHOR("Jarkko Nikula <jarkko.nikula@linux.intel.com>");
 MODULE_DESCRIPTION("Intel LPSS core driver");
 MODULE_LICENSE("GPL v2");
+/*
+ * Ensure the DMA driver is loaded before the host controller device appears,
+ * so that the host controller driver can request its DMA channels as early
+ * as possible.
+ *
+ * If the DMA module is not there that's OK as well.
+ */
+MODULE_SOFTDEP("pre: platform:" LPSS_IDMA64_DRIVER_NAME);
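
The deleted request_module() bookkeeping is replaced by the declarative
soft dependency above: modprobe reads the "softdep" modinfo tag and
loads the listed modules before (or after) this one, and a missing
module is silently tolerated. Minimal sketch of the idiom (the
"examplehelper" module name is hypothetical):

#include <linux/module.h>

/* Sketch: load idma64 before this module, examplehelper after it. */
MODULE_SOFTDEP("pre: idma64 post: examplehelper");
MODULE_DESCRIPTION("MODULE_SOFTDEP usage sketch");
MODULE_LICENSE("GPL");
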
diff --git a/drivers/mfd/intel_soc_pmic_bxtwc.c b/drivers/mfd/intel_soc_pmic_bxtwc.c
index 6310c3b..739cfb5 100644
--- a/drivers/mfd/intel_soc_pmic_bxtwc.c
+++ b/drivers/mfd/intel_soc_pmic_bxtwc.c
@@ -450,10 +450,8 @@
 		return -ENOMEM;
 
 	ret = platform_get_irq(pdev, 0);
-	if (ret < 0) {
-		dev_err(&pdev->dev, "Invalid IRQ\n");
+	if (ret < 0)
 		return ret;
-	}
 	pmic->irq = ret;
 
 	dev_set_drvdata(&pdev->dev, pmic);
diff --git a/drivers/mfd/intel_soc_pmic_mrfld.c b/drivers/mfd/intel_soc_pmic_mrfld.c
new file mode 100644
index 0000000..26a1551
--- /dev/null
+++ b/drivers/mfd/intel_soc_pmic_mrfld.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device access for Basin Cove PMIC
+ *
+ * Copyright (c) 2019, Intel Corporation.
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ */
+
+#include <linux/acpi.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/intel_soc_pmic.h>
+#include <linux/mfd/intel_soc_pmic_mrfld.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+#include <asm/intel_scu_ipc.h>
+
+/*
+ * Level 2 IRQs
+ *
+ * Firmware on systems with the Basin Cove PMIC services Level 1 IRQs
+ * without assistance. Thus, each Level 1 IRQ is represented
+ * as a separate RTE in the IOAPIC.
+ */
+static struct resource irq_level2_resources[] = {
+	DEFINE_RES_IRQ(0), /* power button */
+	DEFINE_RES_IRQ(0), /* TMU */
+	DEFINE_RES_IRQ(0), /* thermal */
+	DEFINE_RES_IRQ(0), /* BCU */
+	DEFINE_RES_IRQ(0), /* ADC */
+	DEFINE_RES_IRQ(0), /* charger */
+	DEFINE_RES_IRQ(0), /* GPIO */
+};
+
+static const struct mfd_cell bcove_dev[] = {
+	{
+		.name = "mrfld_bcove_pwrbtn",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[0],
+	}, {
+		.name = "mrfld_bcove_tmu",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[1],
+	}, {
+		.name = "mrfld_bcove_thermal",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[2],
+	}, {
+		.name = "mrfld_bcove_bcu",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[3],
+	}, {
+		.name = "mrfld_bcove_adc",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[4],
+	}, {
+		.name = "mrfld_bcove_charger",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[5],
+	}, {
+		.name = "mrfld_bcove_pwrsrc",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[5],
+	}, {
+		.name = "mrfld_bcove_gpio",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[6],
+	},
+	{	.name = "mrfld_bcove_region", },
+};
+
+static int bcove_ipc_byte_reg_read(void *context, unsigned int reg,
+				    unsigned int *val)
+{
+	u8 ipc_out;
+	int ret;
+
+	ret = intel_scu_ipc_ioread8(reg, &ipc_out);
+	if (ret)
+		return ret;
+
+	*val = ipc_out;
+	return 0;
+}
+
+static int bcove_ipc_byte_reg_write(void *context, unsigned int reg,
+				     unsigned int val)
+{
+	u8 ipc_in = val;
+	int ret;
+
+	ret = intel_scu_ipc_iowrite8(reg, ipc_in);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static const struct regmap_config bcove_regmap_config = {
+	.reg_bits = 16,
+	.val_bits = 8,
+	.max_register = 0xff,
+	.reg_write = bcove_ipc_byte_reg_write,
+	.reg_read = bcove_ipc_byte_reg_read,
+};
+
+static int bcove_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct intel_soc_pmic *pmic;
+	unsigned int i;
+	int ret;
+
+	pmic = devm_kzalloc(dev, sizeof(*pmic), GFP_KERNEL);
+	if (!pmic)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, pmic);
+	pmic->dev = &pdev->dev;
+
+	pmic->regmap = devm_regmap_init(dev, NULL, pmic, &bcove_regmap_config);
+	if (IS_ERR(pmic->regmap))
+		return PTR_ERR(pmic->regmap);
+
+	for (i = 0; i < ARRAY_SIZE(irq_level2_resources); i++) {
+		ret = platform_get_irq(pdev, i);
+		if (ret < 0)
+			return ret;
+
+		irq_level2_resources[i].start = ret;
+		irq_level2_resources[i].end = ret;
+	}
+
+	return devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE,
+				    bcove_dev, ARRAY_SIZE(bcove_dev),
+				    NULL, 0, NULL);
+}
+
+static const struct acpi_device_id bcove_acpi_ids[] = {
+	{ "INTC100E" },
+	{}
+};
+MODULE_DEVICE_TABLE(acpi, bcove_acpi_ids);
+
+static struct platform_driver bcove_driver = {
+	.driver = {
+		.name = "intel_soc_pmic_mrfld",
+		.acpi_match_table = bcove_acpi_ids,
+	},
+	.probe = bcove_probe,
+};
+module_platform_driver(bcove_driver);
+
+MODULE_DESCRIPTION("IPC driver for Intel SoC Basin Cove PMIC");
+MODULE_LICENSE("GPL v2");
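
The new Basin Cove driver uses a "no-bus" regmap: devm_regmap_init()
is passed a NULL bus plus per-register reg_read/reg_write callbacks, so
regmap's locking and caching sit on top of the SCU IPC mailbox rather
than a memory-mapped or I2C/SPI bus. A condensed sketch of that wiring
(callback bodies and names are illustrative):

#include <linux/device.h>
#include <linux/err.h>
#include <linux/regmap.h>

static int example_reg_read(void *context, unsigned int reg,
			    unsigned int *val)
{
	*val = 0;	/* a real driver would issue a mailbox read here */
	return 0;
}

static int example_reg_write(void *context, unsigned int reg,
			     unsigned int val)
{
	return 0;	/* a real driver would issue a mailbox write here */
}

static const struct regmap_config example_regmap_config = {
	.reg_bits = 16,
	.val_bits = 8,
	.max_register = 0xff,
	.reg_read = example_reg_read,
	.reg_write = example_reg_write,
};

static int example_setup(struct device *dev, void *priv)
{
	struct regmap *map;

	/* NULL bus: every access is routed through the callbacks above. */
	map = devm_regmap_init(dev, NULL, priv, &example_regmap_config);
	return PTR_ERR_OR_ZERO(map);
}
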
diff --git a/drivers/mfd/jz4740-adc.c b/drivers/mfd/jz4740-adc.c
deleted file mode 100644
index 082f169..0000000
--- a/drivers/mfd/jz4740-adc.c
+++ /dev/null
@@ -1,324 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2009-2010, Lars-Peter Clausen <lars@metafoo.de>
- * JZ4740 SoC ADC driver
- *
- * This driver synchronizes access to the JZ4740 ADC core between the
- * JZ4740 battery and hwmon drivers.
- */
-
-#include <linux/err.h>
-#include <linux/io.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-
-#include <linux/clk.h>
-#include <linux/mfd/core.h>
-
-#include <linux/jz4740-adc.h>
-
-
-#define JZ_REG_ADC_ENABLE	0x00
-#define JZ_REG_ADC_CFG		0x04
-#define JZ_REG_ADC_CTRL		0x08
-#define JZ_REG_ADC_STATUS	0x0c
-
-#define JZ_REG_ADC_TOUCHSCREEN_BASE	0x10
-#define JZ_REG_ADC_BATTERY_BASE	0x1c
-#define JZ_REG_ADC_HWMON_BASE	0x20
-
-#define JZ_ADC_ENABLE_TOUCH	BIT(2)
-#define JZ_ADC_ENABLE_BATTERY	BIT(1)
-#define JZ_ADC_ENABLE_ADCIN	BIT(0)
-
-enum {
-	JZ_ADC_IRQ_ADCIN = 0,
-	JZ_ADC_IRQ_BATTERY,
-	JZ_ADC_IRQ_TOUCH,
-	JZ_ADC_IRQ_PENUP,
-	JZ_ADC_IRQ_PENDOWN,
-};
-
-struct jz4740_adc {
-	struct resource *mem;
-	void __iomem *base;
-
-	int irq;
-	struct irq_chip_generic *gc;
-
-	struct clk *clk;
-	atomic_t clk_ref;
-
-	spinlock_t lock;
-};
-
-static void jz4740_adc_irq_demux(struct irq_desc *desc)
-{
-	struct irq_chip_generic *gc = irq_desc_get_handler_data(desc);
-	uint8_t status;
-	unsigned int i;
-
-	status = readb(gc->reg_base + JZ_REG_ADC_STATUS);
-
-	for (i = 0; i < 5; ++i) {
-		if (status & BIT(i))
-			generic_handle_irq(gc->irq_base + i);
-	}
-}
-
-
-/* Refcounting for the ADC clock is done in here instead of in the clock
- * framework, because it is the only clock which is shared between multiple
- * devices and thus is the only clock which needs refcounting */
-static inline void jz4740_adc_clk_enable(struct jz4740_adc *adc)
-{
-	if (atomic_inc_return(&adc->clk_ref) == 1)
-		clk_prepare_enable(adc->clk);
-}
-
-static inline void jz4740_adc_clk_disable(struct jz4740_adc *adc)
-{
-	if (atomic_dec_return(&adc->clk_ref) == 0)
-		clk_disable_unprepare(adc->clk);
-}
-
-static inline void jz4740_adc_set_enabled(struct jz4740_adc *adc, int engine,
-	bool enabled)
-{
-	unsigned long flags;
-	uint8_t val;
-
-	spin_lock_irqsave(&adc->lock, flags);
-
-	val = readb(adc->base + JZ_REG_ADC_ENABLE);
-	if (enabled)
-		val |= BIT(engine);
-	else
-		val &= ~BIT(engine);
-	writeb(val, adc->base + JZ_REG_ADC_ENABLE);
-
-	spin_unlock_irqrestore(&adc->lock, flags);
-}
-
-static int jz4740_adc_cell_enable(struct platform_device *pdev)
-{
-	struct jz4740_adc *adc = dev_get_drvdata(pdev->dev.parent);
-
-	jz4740_adc_clk_enable(adc);
-	jz4740_adc_set_enabled(adc, pdev->id, true);
-
-	return 0;
-}
-
-static int jz4740_adc_cell_disable(struct platform_device *pdev)
-{
-	struct jz4740_adc *adc = dev_get_drvdata(pdev->dev.parent);
-
-	jz4740_adc_set_enabled(adc, pdev->id, false);
-	jz4740_adc_clk_disable(adc);
-
-	return 0;
-}
-
-int jz4740_adc_set_config(struct device *dev, uint32_t mask, uint32_t val)
-{
-	struct jz4740_adc *adc = dev_get_drvdata(dev);
-	unsigned long flags;
-	uint32_t cfg;
-
-	if (!adc)
-		return -ENODEV;
-
-	spin_lock_irqsave(&adc->lock, flags);
-
-	cfg = readl(adc->base + JZ_REG_ADC_CFG);
-
-	cfg &= ~mask;
-	cfg |= val;
-
-	writel(cfg, adc->base + JZ_REG_ADC_CFG);
-
-	spin_unlock_irqrestore(&adc->lock, flags);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(jz4740_adc_set_config);
-
-static struct resource jz4740_hwmon_resources[] = {
-	{
-		.start = JZ_ADC_IRQ_ADCIN,
-		.flags = IORESOURCE_IRQ,
-	},
-	{
-		.start	= JZ_REG_ADC_HWMON_BASE,
-		.end	= JZ_REG_ADC_HWMON_BASE + 3,
-		.flags	= IORESOURCE_MEM,
-	},
-};
-
-static struct resource jz4740_battery_resources[] = {
-	{
-		.start = JZ_ADC_IRQ_BATTERY,
-		.flags = IORESOURCE_IRQ,
-	},
-	{
-		.start	= JZ_REG_ADC_BATTERY_BASE,
-		.end	= JZ_REG_ADC_BATTERY_BASE + 3,
-		.flags	= IORESOURCE_MEM,
-	},
-};
-
-static const struct mfd_cell jz4740_adc_cells[] = {
-	{
-		.id = 0,
-		.name = "jz4740-hwmon",
-		.num_resources = ARRAY_SIZE(jz4740_hwmon_resources),
-		.resources = jz4740_hwmon_resources,
-
-		.enable = jz4740_adc_cell_enable,
-		.disable = jz4740_adc_cell_disable,
-	},
-	{
-		.id = 1,
-		.name = "jz4740-battery",
-		.num_resources = ARRAY_SIZE(jz4740_battery_resources),
-		.resources = jz4740_battery_resources,
-
-		.enable = jz4740_adc_cell_enable,
-		.disable = jz4740_adc_cell_disable,
-	},
-};
-
-static int jz4740_adc_probe(struct platform_device *pdev)
-{
-	struct irq_chip_generic *gc;
-	struct irq_chip_type *ct;
-	struct jz4740_adc *adc;
-	struct resource *mem_base;
-	int ret;
-	int irq_base;
-
-	adc = devm_kzalloc(&pdev->dev, sizeof(*adc), GFP_KERNEL);
-	if (!adc)
-		return -ENOMEM;
-
-	adc->irq = platform_get_irq(pdev, 0);
-	if (adc->irq < 0) {
-		ret = adc->irq;
-		dev_err(&pdev->dev, "Failed to get platform irq: %d\n", ret);
-		return ret;
-	}
-
-	irq_base = platform_get_irq(pdev, 1);
-	if (irq_base < 0) {
-		dev_err(&pdev->dev, "Failed to get irq base: %d\n", irq_base);
-		return irq_base;
-	}
-
-	mem_base = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!mem_base) {
-		dev_err(&pdev->dev, "Failed to get platform mmio resource\n");
-		return -ENOENT;
-	}
-
-	/* Only request the shared registers for the MFD driver */
-	adc->mem = request_mem_region(mem_base->start, JZ_REG_ADC_STATUS,
-					pdev->name);
-	if (!adc->mem) {
-		dev_err(&pdev->dev, "Failed to request mmio memory region\n");
-		return -EBUSY;
-	}
-
-	adc->base = ioremap_nocache(adc->mem->start, resource_size(adc->mem));
-	if (!adc->base) {
-		ret = -EBUSY;
-		dev_err(&pdev->dev, "Failed to ioremap mmio memory\n");
-		goto err_release_mem_region;
-	}
-
-	adc->clk = clk_get(&pdev->dev, "adc");
-	if (IS_ERR(adc->clk)) {
-		ret = PTR_ERR(adc->clk);
-		dev_err(&pdev->dev, "Failed to get clock: %d\n", ret);
-		goto err_iounmap;
-	}
-
-	spin_lock_init(&adc->lock);
-	atomic_set(&adc->clk_ref, 0);
-
-	platform_set_drvdata(pdev, adc);
-
-	gc = irq_alloc_generic_chip("INTC", 1, irq_base, adc->base,
-		handle_level_irq);
-
-	ct = gc->chip_types;
-	ct->regs.mask = JZ_REG_ADC_CTRL;
-	ct->regs.ack = JZ_REG_ADC_STATUS;
-	ct->chip.irq_mask = irq_gc_mask_set_bit;
-	ct->chip.irq_unmask = irq_gc_mask_clr_bit;
-	ct->chip.irq_ack = irq_gc_ack_set_bit;
-
-	irq_setup_generic_chip(gc, IRQ_MSK(5), IRQ_GC_INIT_MASK_CACHE, 0,
-				IRQ_NOPROBE | IRQ_LEVEL);
-
-	adc->gc = gc;
-
-	irq_set_chained_handler_and_data(adc->irq, jz4740_adc_irq_demux, gc);
-
-	writeb(0x00, adc->base + JZ_REG_ADC_ENABLE);
-	writeb(0xff, adc->base + JZ_REG_ADC_CTRL);
-
-	ret = mfd_add_devices(&pdev->dev, 0, jz4740_adc_cells,
-			      ARRAY_SIZE(jz4740_adc_cells), mem_base,
-			      irq_base, NULL);
-	if (ret < 0)
-		goto err_clk_put;
-
-	return 0;
-
-err_clk_put:
-	clk_put(adc->clk);
-err_iounmap:
-	iounmap(adc->base);
-err_release_mem_region:
-	release_mem_region(adc->mem->start, resource_size(adc->mem));
-	return ret;
-}
-
-static int jz4740_adc_remove(struct platform_device *pdev)
-{
-	struct jz4740_adc *adc = platform_get_drvdata(pdev);
-
-	mfd_remove_devices(&pdev->dev);
-
-	irq_remove_generic_chip(adc->gc, IRQ_MSK(5), IRQ_NOPROBE | IRQ_LEVEL, 0);
-	kfree(adc->gc);
-	irq_set_chained_handler_and_data(adc->irq, NULL, NULL);
-
-	iounmap(adc->base);
-	release_mem_region(adc->mem->start, resource_size(adc->mem));
-
-	clk_put(adc->clk);
-
-	return 0;
-}
-
-static struct platform_driver jz4740_adc_driver = {
-	.probe	= jz4740_adc_probe,
-	.remove = jz4740_adc_remove,
-	.driver = {
-		.name = "jz4740-adc",
-	},
-};
-
-module_platform_driver(jz4740_adc_driver);
-
-MODULE_DESCRIPTION("JZ4740 SoC ADC driver");
-MODULE_AUTHOR("Lars-Peter Clausen <lars@metafoo.de>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:jz4740-adc");
diff --git a/drivers/mfd/max14577.c b/drivers/mfd/max14577.c
index ebb13d5..fd8864c 100644
--- a/drivers/mfd/max14577.c
+++ b/drivers/mfd/max14577.c
@@ -297,11 +297,11 @@
 	int ret;
 	u8 intsrc_mask;
 
-	max14577->i2c_pmic = i2c_new_dummy(max14577->i2c->adapter,
+	max14577->i2c_pmic = i2c_new_dummy_device(max14577->i2c->adapter,
 			I2C_ADDR_PMIC);
-	if (!max14577->i2c_pmic) {
+	if (IS_ERR(max14577->i2c_pmic)) {
 		dev_err(max14577->dev, "Failed to register PMIC I2C device\n");
-		return -ENODEV;
+		return PTR_ERR(max14577->i2c_pmic);
 	}
 	i2c_set_clientdata(max14577->i2c_pmic, max14577);
 
diff --git a/drivers/mfd/max77620.c b/drivers/mfd/max77620.c
index 0c28965..a851ff4 100644
--- a/drivers/mfd/max77620.c
+++ b/drivers/mfd/max77620.c
@@ -416,8 +416,10 @@
 
 	for_each_child_of_node(fps_np, fps_child) {
 		ret = max77620_config_fps(chip, fps_child);
-		if (ret < 0)
+		if (ret < 0) {
+			of_node_put(fps_child);
 			return ret;
+		}
 	}
 
 	config = chip->enable_global_lpm ? MAX77620_ONOFFCNFG2_SLP_LPM_MSK : 0;
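
The max77620 fix addresses a classic for_each_child_of_node() pitfall:
the iterator drops the previous child's reference and takes one on the
next, so leaving the loop early leaks the current child's refcount
unless the caller releases it. The corrected shape, as a sketch
(example_handle_child() is a hypothetical helper):

#include <linux/of.h>

static int example_handle_child(struct device_node *child)
{
	return 0;	/* placeholder for per-child work */
}

/* Sketch: an early exit from a child walk must drop the reference. */
static int example_walk_children(struct device_node *parent)
{
	struct device_node *child;
	int ret;

	for_each_child_of_node(parent, child) {
		ret = example_handle_child(child);
		if (ret < 0) {
			of_node_put(child);	/* balance the iterator's get */
			return ret;
		}
	}
	return 0;
}
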
diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c
index 901d99d..596ed85 100644
--- a/drivers/mfd/max77693.c
+++ b/drivers/mfd/max77693.c
@@ -183,17 +183,17 @@
 	} else
 		dev_info(max77693->dev, "device ID: 0x%x\n", reg_data);
 
-	max77693->i2c_muic = i2c_new_dummy(i2c->adapter, I2C_ADDR_MUIC);
-	if (!max77693->i2c_muic) {
+	max77693->i2c_muic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_MUIC);
+	if (IS_ERR(max77693->i2c_muic)) {
 		dev_err(max77693->dev, "Failed to allocate I2C device for MUIC\n");
-		return -ENODEV;
+		return PTR_ERR(max77693->i2c_muic);
 	}
 	i2c_set_clientdata(max77693->i2c_muic, max77693);
 
-	max77693->i2c_haptic = i2c_new_dummy(i2c->adapter, I2C_ADDR_HAPTIC);
-	if (!max77693->i2c_haptic) {
+	max77693->i2c_haptic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_HAPTIC);
+	if (IS_ERR(max77693->i2c_haptic)) {
 		dev_err(max77693->dev, "Failed to allocate I2C device for Haptic\n");
-		ret = -ENODEV;
+		ret = PTR_ERR(max77693->i2c_haptic);
 		goto err_i2c_haptic;
 	}
 	i2c_set_clientdata(max77693->i2c_haptic, max77693);
diff --git a/drivers/mfd/max77843.c b/drivers/mfd/max77843.c
index 25cbb22..209ee24 100644
--- a/drivers/mfd/max77843.c
+++ b/drivers/mfd/max77843.c
@@ -70,11 +70,11 @@
 {
 	int ret;
 
-	max77843->i2c_chg = i2c_new_dummy(max77843->i2c->adapter, I2C_ADDR_CHG);
-	if (!max77843->i2c_chg) {
+	max77843->i2c_chg = i2c_new_dummy_device(max77843->i2c->adapter, I2C_ADDR_CHG);
+	if (IS_ERR(max77843->i2c_chg)) {
 		dev_err(&max77843->i2c->dev,
 				"Cannot allocate I2C device for Charger\n");
-		return -ENODEV;
+		return PTR_ERR(max77843->i2c_chg);
 	}
 	i2c_set_clientdata(max77843->i2c_chg, max77843);
 
diff --git a/drivers/mfd/max8907.c b/drivers/mfd/max8907.c
index cc01f70..d44baaf 100644
--- a/drivers/mfd/max8907.c
+++ b/drivers/mfd/max8907.c
@@ -214,9 +214,9 @@
 		goto err_regmap_gen;
 	}
 
-	max8907->i2c_rtc = i2c_new_dummy(i2c->adapter, MAX8907_RTC_I2C_ADDR);
-	if (!max8907->i2c_rtc) {
-		ret = -ENOMEM;
+	max8907->i2c_rtc = i2c_new_dummy_device(i2c->adapter, MAX8907_RTC_I2C_ADDR);
+	if (IS_ERR(max8907->i2c_rtc)) {
+		ret = PTR_ERR(max8907->i2c_rtc);
 		goto err_dummy_rtc;
 	}
 	i2c_set_clientdata(max8907->i2c_rtc, max8907);
diff --git a/drivers/mfd/max8925-i2c.c b/drivers/mfd/max8925-i2c.c
index 20bb19b..114e905 100644
--- a/drivers/mfd/max8925-i2c.c
+++ b/drivers/mfd/max8925-i2c.c
@@ -176,18 +176,18 @@
 	dev_set_drvdata(chip->dev, chip);
 	mutex_init(&chip->io_lock);
 
-	chip->rtc = i2c_new_dummy(chip->i2c->adapter, RTC_I2C_ADDR);
-	if (!chip->rtc) {
+	chip->rtc = i2c_new_dummy_device(chip->i2c->adapter, RTC_I2C_ADDR);
+	if (IS_ERR(chip->rtc)) {
 		dev_err(chip->dev, "Failed to allocate I2C device for RTC\n");
-		return -ENODEV;
+		return PTR_ERR(chip->rtc);
 	}
 	i2c_set_clientdata(chip->rtc, chip);
 
-	chip->adc = i2c_new_dummy(chip->i2c->adapter, ADC_I2C_ADDR);
-	if (!chip->adc) {
+	chip->adc = i2c_new_dummy_device(chip->i2c->adapter, ADC_I2C_ADDR);
+	if (IS_ERR(chip->adc)) {
 		dev_err(chip->dev, "Failed to allocate I2C device for ADC\n");
 		i2c_unregister_device(chip->rtc);
-		return -ENODEV;
+		return PTR_ERR(chip->adc);
 	}
 	i2c_set_clientdata(chip->adc, chip);
 
diff --git a/drivers/mfd/max8997.c b/drivers/mfd/max8997.c
index 8c06c09..68d8f2b 100644
--- a/drivers/mfd/max8997.c
+++ b/drivers/mfd/max8997.c
@@ -185,25 +185,25 @@
 
 	mutex_init(&max8997->iolock);
 
-	max8997->rtc = i2c_new_dummy(i2c->adapter, I2C_ADDR_RTC);
-	if (!max8997->rtc) {
+	max8997->rtc = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_RTC);
+	if (IS_ERR(max8997->rtc)) {
 		dev_err(max8997->dev, "Failed to allocate I2C device for RTC\n");
-		return -ENODEV;
+		return PTR_ERR(max8997->rtc);
 	}
 	i2c_set_clientdata(max8997->rtc, max8997);
 
-	max8997->haptic = i2c_new_dummy(i2c->adapter, I2C_ADDR_HAPTIC);
-	if (!max8997->haptic) {
+	max8997->haptic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_HAPTIC);
+	if (IS_ERR(max8997->haptic)) {
 		dev_err(max8997->dev, "Failed to allocate I2C device for Haptic\n");
-		ret = -ENODEV;
+		ret = PTR_ERR(max8997->haptic);
 		goto err_i2c_haptic;
 	}
 	i2c_set_clientdata(max8997->haptic, max8997);
 
-	max8997->muic = i2c_new_dummy(i2c->adapter, I2C_ADDR_MUIC);
-	if (!max8997->muic) {
+	max8997->muic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_MUIC);
+	if (IS_ERR(max8997->muic)) {
 		dev_err(max8997->dev, "Failed to allocate I2C device for MUIC\n");
-		ret = -ENODEV;
+		ret = PTR_ERR(max8997->muic);
 		goto err_i2c_muic;
 	}
 	i2c_set_clientdata(max8997->muic, max8997);
diff --git a/drivers/mfd/max8998.c b/drivers/mfd/max8998.c
index 56409df..785f8e9 100644
--- a/drivers/mfd/max8998.c
+++ b/drivers/mfd/max8998.c
@@ -195,10 +195,10 @@
 	}
 	mutex_init(&max8998->iolock);
 
-	max8998->rtc = i2c_new_dummy(i2c->adapter, RTC_I2C_ADDR);
-	if (!max8998->rtc) {
+	max8998->rtc = i2c_new_dummy_device(i2c->adapter, RTC_I2C_ADDR);
+	if (IS_ERR(max8998->rtc)) {
 		dev_err(&i2c->dev, "Failed to allocate I2C device for RTC\n");
-		return -ENODEV;
+		return PTR_ERR(max8998->rtc);
 	}
 	i2c_set_clientdata(max8998->rtc, max8998);
 
diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c
index 337bccc..310dae2 100644
--- a/drivers/mfd/mt6397-core.c
+++ b/drivers/mfd/mt6397-core.c
@@ -5,34 +5,34 @@
  */
 
 #include <linux/interrupt.h>
+#include <linux/ioport.h>
 #include <linux/module.h>
 #include <linux/of_device.h>
 #include <linux/of_irq.h>
 #include <linux/regmap.h>
 #include <linux/mfd/core.h>
-#include <linux/mfd/mt6397/core.h>
 #include <linux/mfd/mt6323/core.h>
-#include <linux/mfd/mt6397/registers.h>
+#include <linux/mfd/mt6397/core.h>
 #include <linux/mfd/mt6323/registers.h>
+#include <linux/mfd/mt6397/registers.h>
+
+#define MT6323_RTC_BASE		0x8000
+#define MT6323_RTC_SIZE		0x40
 
 #define MT6397_RTC_BASE		0xe000
 #define MT6397_RTC_SIZE		0x3e
 
-#define MT6323_CID_CODE		0x23
-#define MT6391_CID_CODE		0x91
-#define MT6397_CID_CODE		0x97
+#define MT6323_PWRC_BASE	0x8000
+#define MT6323_PWRC_SIZE	0x40
+
+static const struct resource mt6323_rtc_resources[] = {
+	DEFINE_RES_MEM(MT6323_RTC_BASE, MT6323_RTC_SIZE),
+	DEFINE_RES_IRQ(MT6323_IRQ_STATUS_RTC),
+};
 
 static const struct resource mt6397_rtc_resources[] = {
-	{
-		.start = MT6397_RTC_BASE,
-		.end   = MT6397_RTC_BASE + MT6397_RTC_SIZE,
-		.flags = IORESOURCE_MEM,
-	},
-	{
-		.start = MT6397_IRQ_RTC,
-		.end   = MT6397_IRQ_RTC,
-		.flags = IORESOURCE_IRQ,
-	},
+	DEFINE_RES_MEM(MT6397_RTC_BASE, MT6397_RTC_SIZE),
+	DEFINE_RES_IRQ(MT6397_IRQ_RTC),
 };
 
 static const struct resource mt6323_keys_resources[] = {
@@ -45,8 +45,17 @@
 	DEFINE_RES_IRQ(MT6397_IRQ_HOMEKEY),
 };
 
+static const struct resource mt6323_pwrc_resources[] = {
+	DEFINE_RES_MEM(MT6323_PWRC_BASE, MT6323_PWRC_SIZE),
+};
+
 static const struct mfd_cell mt6323_devs[] = {
 	{
+		.name = "mt6323-rtc",
+		.num_resources = ARRAY_SIZE(mt6323_rtc_resources),
+		.resources = mt6323_rtc_resources,
+		.of_compatible = "mediatek,mt6323-rtc",
+	}, {
 		.name = "mt6323-regulator",
 		.of_compatible = "mediatek,mt6323-regulator"
 	}, {
@@ -57,6 +66,11 @@
 		.num_resources = ARRAY_SIZE(mt6323_keys_resources),
 		.resources = mt6323_keys_resources,
 		.of_compatible = "mediatek,mt6323-keys"
+	}, {
+		.name = "mt6323-pwrc",
+		.num_resources = ARRAY_SIZE(mt6323_pwrc_resources),
+		.resources = mt6323_pwrc_resources,
+		.of_compatible = "mediatek,mt6323-pwrc"
 	},
 };
 
@@ -86,148 +100,6 @@
 	}
 };
 
-static void mt6397_irq_lock(struct irq_data *data)
-{
-	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data);
-
-	mutex_lock(&mt6397->irqlock);
-}
-
-static void mt6397_irq_sync_unlock(struct irq_data *data)
-{
-	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data);
-
-	regmap_write(mt6397->regmap, mt6397->int_con[0],
-		     mt6397->irq_masks_cur[0]);
-	regmap_write(mt6397->regmap, mt6397->int_con[1],
-		     mt6397->irq_masks_cur[1]);
-
-	mutex_unlock(&mt6397->irqlock);
-}
-
-static void mt6397_irq_disable(struct irq_data *data)
-{
-	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data);
-	int shift = data->hwirq & 0xf;
-	int reg = data->hwirq >> 4;
-
-	mt6397->irq_masks_cur[reg] &= ~BIT(shift);
-}
-
-static void mt6397_irq_enable(struct irq_data *data)
-{
-	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data);
-	int shift = data->hwirq & 0xf;
-	int reg = data->hwirq >> 4;
-
-	mt6397->irq_masks_cur[reg] |= BIT(shift);
-}
-
-#ifdef CONFIG_PM_SLEEP
-static int mt6397_irq_set_wake(struct irq_data *irq_data, unsigned int on)
-{
-	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(irq_data);
-	int shift = irq_data->hwirq & 0xf;
-	int reg = irq_data->hwirq >> 4;
-
-	if (on)
-		mt6397->wake_mask[reg] |= BIT(shift);
-	else
-		mt6397->wake_mask[reg] &= ~BIT(shift);
-
-	return 0;
-}
-#else
-#define mt6397_irq_set_wake NULL
-#endif
-
-static struct irq_chip mt6397_irq_chip = {
-	.name = "mt6397-irq",
-	.irq_bus_lock = mt6397_irq_lock,
-	.irq_bus_sync_unlock = mt6397_irq_sync_unlock,
-	.irq_enable = mt6397_irq_enable,
-	.irq_disable = mt6397_irq_disable,
-	.irq_set_wake = mt6397_irq_set_wake,
-};
-
-static void mt6397_irq_handle_reg(struct mt6397_chip *mt6397, int reg,
-		int irqbase)
-{
-	unsigned int status;
-	int i, irq, ret;
-
-	ret = regmap_read(mt6397->regmap, reg, &status);
-	if (ret) {
-		dev_err(mt6397->dev, "Failed to read irq status: %d\n", ret);
-		return;
-	}
-
-	for (i = 0; i < 16; i++) {
-		if (status & BIT(i)) {
-			irq = irq_find_mapping(mt6397->irq_domain, irqbase + i);
-			if (irq)
-				handle_nested_irq(irq);
-		}
-	}
-
-	regmap_write(mt6397->regmap, reg, status);
-}
-
-static irqreturn_t mt6397_irq_thread(int irq, void *data)
-{
-	struct mt6397_chip *mt6397 = data;
-
-	mt6397_irq_handle_reg(mt6397, mt6397->int_status[0], 0);
-	mt6397_irq_handle_reg(mt6397, mt6397->int_status[1], 16);
-
-	return IRQ_HANDLED;
-}
-
-static int mt6397_irq_domain_map(struct irq_domain *d, unsigned int irq,
-					irq_hw_number_t hw)
-{
-	struct mt6397_chip *mt6397 = d->host_data;
-
-	irq_set_chip_data(irq, mt6397);
-	irq_set_chip_and_handler(irq, &mt6397_irq_chip, handle_level_irq);
-	irq_set_nested_thread(irq, 1);
-	irq_set_noprobe(irq);
-
-	return 0;
-}
-
-static const struct irq_domain_ops mt6397_irq_domain_ops = {
-	.map = mt6397_irq_domain_map,
-};
-
-static int mt6397_irq_init(struct mt6397_chip *mt6397)
-{
-	int ret;
-
-	mutex_init(&mt6397->irqlock);
-
-	/* Mask all interrupt sources */
-	regmap_write(mt6397->regmap, mt6397->int_con[0], 0x0);
-	regmap_write(mt6397->regmap, mt6397->int_con[1], 0x0);
-
-	mt6397->irq_domain = irq_domain_add_linear(mt6397->dev->of_node,
-		MT6397_IRQ_NR, &mt6397_irq_domain_ops, mt6397);
-	if (!mt6397->irq_domain) {
-		dev_err(mt6397->dev, "could not create irq domain\n");
-		return -ENOMEM;
-	}
-
-	ret = devm_request_threaded_irq(mt6397->dev, mt6397->irq, NULL,
-		mt6397_irq_thread, IRQF_ONESHOT, "mt6397-pmic", mt6397);
-	if (ret) {
-		dev_err(mt6397->dev, "failed to register irq=%d; err: %d\n",
-			mt6397->irq, ret);
-		return ret;
-	}
-
-	return 0;
-}
-
 #ifdef CONFIG_PM_SLEEP
 static int mt6397_irq_suspend(struct device *dev)
 {
@@ -290,7 +162,7 @@
 		return pmic->irq;
 
 	switch (id & 0xff) {
-	case MT6323_CID_CODE:
+	case MT6323_CHIP_ID:
 		pmic->int_con[0] = MT6323_INT_CON0;
 		pmic->int_con[1] = MT6323_INT_CON1;
 		pmic->int_status[0] = MT6323_INT_STATUS0;
@@ -304,8 +176,8 @@
 					   0, pmic->irq_domain);
 		break;
 
-	case MT6397_CID_CODE:
-	case MT6391_CID_CODE:
+	case MT6391_CHIP_ID:
+	case MT6397_CHIP_ID:
 		pmic->int_con[0] = MT6397_INT_CON0;
 		pmic->int_con[1] = MT6397_INT_CON1;
 		pmic->int_status[0] = MT6397_INT_STATUS0;
diff --git a/drivers/mfd/mt6397-irq.c b/drivers/mfd/mt6397-irq.c
new file mode 100644
index 0000000..b2d3ce1
--- /dev/null
+++ b/drivers/mfd/mt6397-irq.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Copyright (c) 2019 MediaTek Inc.
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/mfd/mt6323/core.h>
+#include <linux/mfd/mt6323/registers.h>
+#include <linux/mfd/mt6397/core.h>
+#include <linux/mfd/mt6397/registers.h>
+
+static void mt6397_irq_lock(struct irq_data *data)
+{
+	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data);
+
+	mutex_lock(&mt6397->irqlock);
+}
+
+static void mt6397_irq_sync_unlock(struct irq_data *data)
+{
+	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data);
+
+	regmap_write(mt6397->regmap, mt6397->int_con[0],
+		     mt6397->irq_masks_cur[0]);
+	regmap_write(mt6397->regmap, mt6397->int_con[1],
+		     mt6397->irq_masks_cur[1]);
+
+	mutex_unlock(&mt6397->irqlock);
+}
+
+static void mt6397_irq_disable(struct irq_data *data)
+{
+	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data);
+	int shift = data->hwirq & 0xf;
+	int reg = data->hwirq >> 4;
+
+	mt6397->irq_masks_cur[reg] &= ~BIT(shift);
+}
+
+static void mt6397_irq_enable(struct irq_data *data)
+{
+	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data);
+	int shift = data->hwirq & 0xf;
+	int reg = data->hwirq >> 4;
+
+	mt6397->irq_masks_cur[reg] |= BIT(shift);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int mt6397_irq_set_wake(struct irq_data *irq_data, unsigned int on)
+{
+	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(irq_data);
+	int shift = irq_data->hwirq & 0xf;
+	int reg = irq_data->hwirq >> 4;
+
+	if (on)
+		mt6397->wake_mask[reg] |= BIT(shift);
+	else
+		mt6397->wake_mask[reg] &= ~BIT(shift);
+
+	return 0;
+}
+#else
+#define mt6397_irq_set_wake NULL
+#endif
+
+static struct irq_chip mt6397_irq_chip = {
+	.name = "mt6397-irq",
+	.irq_bus_lock = mt6397_irq_lock,
+	.irq_bus_sync_unlock = mt6397_irq_sync_unlock,
+	.irq_enable = mt6397_irq_enable,
+	.irq_disable = mt6397_irq_disable,
+	.irq_set_wake = mt6397_irq_set_wake,
+};
+
+static void mt6397_irq_handle_reg(struct mt6397_chip *mt6397, int reg,
+				  int irqbase)
+{
+	unsigned int status;
+	int i, irq, ret;
+
+	ret = regmap_read(mt6397->regmap, reg, &status);
+	if (ret) {
+		dev_err(mt6397->dev, "Failed to read irq status: %d\n", ret);
+		return;
+	}
+
+	for (i = 0; i < 16; i++) {
+		if (status & BIT(i)) {
+			irq = irq_find_mapping(mt6397->irq_domain, irqbase + i);
+			if (irq)
+				handle_nested_irq(irq);
+		}
+	}
+
+	regmap_write(mt6397->regmap, reg, status);
+}
+
+static irqreturn_t mt6397_irq_thread(int irq, void *data)
+{
+	struct mt6397_chip *mt6397 = data;
+
+	mt6397_irq_handle_reg(mt6397, mt6397->int_status[0], 0);
+	mt6397_irq_handle_reg(mt6397, mt6397->int_status[1], 16);
+
+	return IRQ_HANDLED;
+}
+
+static int mt6397_irq_domain_map(struct irq_domain *d, unsigned int irq,
+				 irq_hw_number_t hw)
+{
+	struct mt6397_chip *mt6397 = d->host_data;
+
+	irq_set_chip_data(irq, mt6397);
+	irq_set_chip_and_handler(irq, &mt6397_irq_chip, handle_level_irq);
+	irq_set_nested_thread(irq, 1);
+	irq_set_noprobe(irq);
+
+	return 0;
+}
+
+static const struct irq_domain_ops mt6397_irq_domain_ops = {
+	.map = mt6397_irq_domain_map,
+};
+
+int mt6397_irq_init(struct mt6397_chip *chip)
+{
+	int ret;
+
+	mutex_init(&chip->irqlock);
+
+	switch (chip->chip_id) {
+	case MT6323_CHIP_ID:
+		chip->int_con[0] = MT6323_INT_CON0;
+		chip->int_con[1] = MT6323_INT_CON1;
+		chip->int_status[0] = MT6323_INT_STATUS0;
+		chip->int_status[1] = MT6323_INT_STATUS1;
+		break;
+
+	case MT6391_CHIP_ID:
+	case MT6397_CHIP_ID:
+		chip->int_con[0] = MT6397_INT_CON0;
+		chip->int_con[1] = MT6397_INT_CON1;
+		chip->int_status[0] = MT6397_INT_STATUS0;
+		chip->int_status[1] = MT6397_INT_STATUS1;
+		break;
+
+	default:
+		dev_err(chip->dev, "unsupported chip: 0x%x\n", chip->chip_id);
+		return -ENODEV;
+	}
+
+	/* Mask all interrupt sources */
+	regmap_write(chip->regmap, chip->int_con[0], 0x0);
+	regmap_write(chip->regmap, chip->int_con[1], 0x0);
+
+	chip->irq_domain = irq_domain_add_linear(chip->dev->of_node,
+						 MT6397_IRQ_NR,
+						 &mt6397_irq_domain_ops,
+						 chip);
+	if (!chip->irq_domain) {
+		dev_err(chip->dev, "could not create irq domain\n");
+		return -ENOMEM;
+	}
+
+	ret = devm_request_threaded_irq(chip->dev, chip->irq, NULL,
+					mt6397_irq_thread, IRQF_ONESHOT,
+					"mt6397-pmic", chip);
+	if (ret) {
+		dev_err(chip->dev, "failed to register irq=%d; err: %d\n",
+			chip->irq, ret);
+		return ret;
+	}
+
+	return 0;
+}
diff --git a/drivers/mfd/palmas.c b/drivers/mfd/palmas.c
index 6818ff3..f5b3fa9 100644
--- a/drivers/mfd/palmas.c
+++ b/drivers/mfd/palmas.c
@@ -549,12 +549,12 @@
 			palmas->i2c_clients[i] = i2c;
 		else {
 			palmas->i2c_clients[i] =
-					i2c_new_dummy(i2c->adapter,
+					i2c_new_dummy_device(i2c->adapter,
 							i2c->addr + i);
-			if (!palmas->i2c_clients[i]) {
+			if (IS_ERR(palmas->i2c_clients[i])) {
 				dev_err(palmas->dev,
 					"can't attach client %d\n", i);
-				ret = -ENOMEM;
+				ret = PTR_ERR(palmas->i2c_clients[i]);
 				goto err_i2c;
 			}
 			palmas->i2c_clients[i]->dev.of_node = of_node_get(node);
diff --git a/drivers/mfd/qcom_rpm.c b/drivers/mfd/qcom_rpm.c
index 4d7e900..71bc34b7 100644
--- a/drivers/mfd/qcom_rpm.c
+++ b/drivers/mfd/qcom_rpm.c
@@ -561,22 +561,16 @@
 	clk_prepare_enable(rpm->ramclk); /* Accepts NULL */
 
 	irq_ack = platform_get_irq_byname(pdev, "ack");
-	if (irq_ack < 0) {
-		dev_err(&pdev->dev, "required ack interrupt missing\n");
+	if (irq_ack < 0)
 		return irq_ack;
-	}
 
 	irq_err = platform_get_irq_byname(pdev, "err");
-	if (irq_err < 0) {
-		dev_err(&pdev->dev, "required err interrupt missing\n");
+	if (irq_err < 0)
 		return irq_err;
-	}
 
 	irq_wakeup = platform_get_irq_byname(pdev, "wakeup");
-	if (irq_wakeup < 0) {
-		dev_err(&pdev->dev, "required wakeup interrupt missing\n");
+	if (irq_wakeup < 0)
 		return irq_wakeup;
-	}
 
 	match = of_match_device(qcom_rpm_of_match, &pdev->dev);
 	if (!match)
diff --git a/drivers/mfd/sm501.c b/drivers/mfd/sm501.c
index 9b9b06d..154270f 100644
--- a/drivers/mfd/sm501.c
+++ b/drivers/mfd/sm501.c
@@ -17,6 +17,7 @@
 #include <linux/platform_device.h>
 #include <linux/pci.h>
 #include <linux/platform_data/i2c-gpio.h>
+#include <linux/gpio/driver.h>
 #include <linux/gpio/machine.h>
 #include <linux/slab.h>
 
@@ -1394,10 +1395,8 @@
 	sm->platdata = dev_get_platdata(&dev->dev);
 
 	ret = platform_get_irq(dev, 0);
-	if (ret < 0) {
-		dev_err(&dev->dev, "failed to get irq resource\n");
+	if (ret < 0)
 		goto err_res;
-	}
 	sm->irq = ret;
 
 	sm->io_res = platform_get_resource(dev, IORESOURCE_MEM, 1);
diff --git a/drivers/mfd/timberdale.c b/drivers/mfd/timberdale.c
index 60c122e..faecbca 100644
--- a/drivers/mfd/timberdale.c
+++ b/drivers/mfd/timberdale.c
@@ -626,8 +626,7 @@
 static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
 	char *buf)
 {
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct timberdale_device *priv = pci_get_drvdata(pdev);
+	struct timberdale_device *priv = dev_get_drvdata(dev);
 
 	return sprintf(buf, "%d.%d.%d\n", priv->fw.major, priv->fw.minor,
 		priv->fw.config);
diff --git a/drivers/mfd/tps80031.c b/drivers/mfd/tps80031.c
index 865257a..907452b 100644
--- a/drivers/mfd/tps80031.c
+++ b/drivers/mfd/tps80031.c
@@ -437,12 +437,11 @@
 		if (tps80031_slave_address[i] == client->addr)
 			tps80031->clients[i] = client;
 		else
-			tps80031->clients[i] = i2c_new_dummy(client->adapter,
-						tps80031_slave_address[i]);
-		if (!tps80031->clients[i]) {
+			tps80031->clients[i] = devm_i2c_new_dummy_device(&client->dev,
+						client->adapter, tps80031_slave_address[i]);
+		if (IS_ERR(tps80031->clients[i])) {
 			dev_err(&client->dev, "can't attach client %d\n", i);
-			ret = -ENOMEM;
-			goto fail_client_reg;
+			return PTR_ERR(tps80031->clients[i]);
 		}
 
 		i2c_set_clientdata(tps80031->clients[i], tps80031);
@@ -452,7 +451,7 @@
 			ret = PTR_ERR(tps80031->regmap[i]);
 			dev_err(&client->dev,
 				"regmap %d init failed, err %d\n", i, ret);
-			goto fail_client_reg;
+			return ret;
 		}
 	}
 
@@ -461,7 +460,7 @@
 	if (ret < 0) {
 		dev_err(&client->dev,
 			"Silicon version number read failed: %d\n", ret);
-		goto fail_client_reg;
+		return ret;
 	}
 
 	ret = tps80031_read(&client->dev, TPS80031_SLAVE_ID3,
@@ -469,7 +468,7 @@
 	if (ret < 0) {
 		dev_err(&client->dev,
 			"Silicon eeprom version read failed: %d\n", ret);
-		goto fail_client_reg;
+		return ret;
 	}
 
 	dev_info(&client->dev, "ES version 0x%02x and EPROM version 0x%02x\n",
@@ -482,7 +481,7 @@
 	ret = tps80031_irq_init(tps80031, client->irq, pdata->irq_base);
 	if (ret) {
 		dev_err(&client->dev, "IRQ init failed: %d\n", ret);
-		goto fail_client_reg;
+		return ret;
 	}
 
 	tps80031_pupd_init(tps80031, pdata);
@@ -506,12 +505,6 @@
 
 fail_mfd_add:
 	regmap_del_irq_chip(client->irq, tps80031->irq_data);
-
-fail_client_reg:
-	for (i = 0; i < TPS80031_NUM_SLAVES; i++) {
-		if (tps80031->clients[i]  && (tps80031->clients[i] != client))
-			i2c_unregister_device(tps80031->clients[i]);
-	}
 	return ret;
 }
 
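
tps80031 goes a step beyond the plain i2c_new_dummy_device()
conversions: the devm_ variant ties each dummy client's lifetime to the
parent device, which is why the whole fail_client_reg unwind path could
be deleted. Sketch of the managed pattern (helper name illustrative):

#include <linux/err.h>
#include <linux/i2c.h>

static int example_attach_managed(struct i2c_client *client,
				  struct i2c_client **slave, u16 addr)
{
	/* Unregistered automatically when the parent device is unbound. */
	*slave = devm_i2c_new_dummy_device(&client->dev,
					   client->adapter, addr);
	if (IS_ERR(*slave))
		return PTR_ERR(*slave);

	i2c_set_clientdata(*slave, client);
	return 0;
}
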
diff --git a/drivers/mfd/twl-core.c b/drivers/mfd/twl-core.c
index 448d939..20cf8cf 100644
--- a/drivers/mfd/twl-core.c
+++ b/drivers/mfd/twl-core.c
@@ -1141,12 +1141,12 @@
 		if (i == 0) {
 			twl->client = client;
 		} else {
-			twl->client = i2c_new_dummy(client->adapter,
+			twl->client = i2c_new_dummy_device(client->adapter,
 						    client->addr + i);
-			if (!twl->client) {
+			if (IS_ERR(twl->client)) {
 				dev_err(&client->dev,
 					"can't attach client %d\n", i);
-				status = -ENOMEM;
+				status = PTR_ERR(twl->client);
 				goto fail;
 			}
 		}
diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c
index 518945b..2cccd82 100644
--- a/drivers/misc/eeprom/at24.c
+++ b/drivers/misc/eeprom/at24.c
@@ -14,7 +14,6 @@
 #include <linux/delay.h>
 #include <linux/mutex.h>
 #include <linux/mod_devicetable.h>
-#include <linux/log2.h>
 #include <linux/bitops.h>
 #include <linux/jiffies.h>
 #include <linux/property.h>
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 48e209e..df1c798 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -487,7 +487,7 @@
 	depends on ACPI
 	help
 	  This driver provides support for Extended Socket network device
-          on Extended Partitioning of FUJITSU PRIMEQUEST 2000 E2 series.
+	  on Extended Partitioning of FUJITSU PRIMEQUEST 2000 E2 series.
 
 config THUNDERBOLT_NET
 	tristate "Networking over Thunderbolt cable"
diff --git a/drivers/net/arcnet/Kconfig b/drivers/net/arcnet/Kconfig
index faeb4419..27551bf 100644
--- a/drivers/net/arcnet/Kconfig
+++ b/drivers/net/arcnet/Kconfig
@@ -56,19 +56,19 @@
 	tristate "Enable CAP mode packet interface"
 	help
 	  ARCnet "cap mode" packet encapsulation. Used to get the hardware
-          acknowledge back to userspace. After the initial protocol byte every
-          packet is stuffed with an extra 4 byte "cookie" which doesn't
-          actually appear on the network. After transmit the driver will send
-          back a packet with protocol byte 0 containing the status of the
-          transmission:
-             0=no hardware acknowledge
-             1=excessive nak
-             2=transmission accepted by the receiver hardware
+	  acknowledge back to userspace. After the initial protocol byte every
+	  packet is stuffed with an extra 4 byte "cookie" which doesn't
+	  actually appear on the network. After transmit the driver will send
+	  back a packet with protocol byte 0 containing the status of the
+	  transmission:
+	     0=no hardware acknowledge
+	     1=excessive nak
+	     2=transmission accepted by the receiver hardware
 
-          Received packets are also stuffed with the extra 4 bytes but it will
-          be random data.
+	  Received packets are also stuffed with the extra 4 bytes but it will
+	  be random data.
 
-          Cap only listens to protocol 1-8.
+	  Cap only listens to protocol 1-8.
 
 config ARCNET_COM90xx
 	tristate "ARCnet COM90xx (normal) chipset driver"
diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c
index 8459115..553776c 100644
--- a/drivers/net/arcnet/arcnet.c
+++ b/drivers/net/arcnet/arcnet.c
@@ -1063,31 +1063,34 @@
 static void arcnet_rx(struct net_device *dev, int bufnum)
 {
 	struct arcnet_local *lp = netdev_priv(dev);
-	struct archdr pkt;
+	union {
+		struct archdr pkt;
+		char buf[512];
+	} rxdata;
 	struct arc_rfc1201 *soft;
 	int length, ofs;
 
-	soft = &pkt.soft.rfc1201;
+	soft = &rxdata.pkt.soft.rfc1201;
 
-	lp->hw.copy_from_card(dev, bufnum, 0, &pkt, ARC_HDR_SIZE);
-	if (pkt.hard.offset[0]) {
-		ofs = pkt.hard.offset[0];
+	lp->hw.copy_from_card(dev, bufnum, 0, &rxdata.pkt, ARC_HDR_SIZE);
+	if (rxdata.pkt.hard.offset[0]) {
+		ofs = rxdata.pkt.hard.offset[0];
 		length = 256 - ofs;
 	} else {
-		ofs = pkt.hard.offset[1];
+		ofs = rxdata.pkt.hard.offset[1];
 		length = 512 - ofs;
 	}
 
 	/* get the full header, if possible */
-	if (sizeof(pkt.soft) <= length) {
-		lp->hw.copy_from_card(dev, bufnum, ofs, soft, sizeof(pkt.soft));
+	if (sizeof(rxdata.pkt.soft) <= length) {
+		lp->hw.copy_from_card(dev, bufnum, ofs, soft, sizeof(rxdata.pkt.soft));
 	} else {
-		memset(&pkt.soft, 0, sizeof(pkt.soft));
+		memset(&rxdata.pkt.soft, 0, sizeof(rxdata.pkt.soft));
 		lp->hw.copy_from_card(dev, bufnum, ofs, soft, length);
 	}
 
 	arc_printk(D_DURING, dev, "Buffer #%d: received packet from %02Xh to %02Xh (%d+4 bytes)\n",
-		   bufnum, pkt.hard.source, pkt.hard.dest, length);
+		   bufnum, rxdata.pkt.hard.source, rxdata.pkt.hard.dest, length);
 
 	dev->stats.rx_packets++;
 	dev->stats.rx_bytes += length + ARC_HDR_SIZE;
@@ -1096,13 +1099,13 @@
 	if (arc_proto_map[soft->proto]->is_ip) {
 		if (BUGLVL(D_PROTO)) {
 			struct ArcProto
-			*oldp = arc_proto_map[lp->default_proto[pkt.hard.source]],
+			*oldp = arc_proto_map[lp->default_proto[rxdata.pkt.hard.source]],
 			*newp = arc_proto_map[soft->proto];
 
 			if (oldp != newp) {
 				arc_printk(D_PROTO, dev,
 					   "got protocol %02Xh; encap for host %02Xh is now '%c' (was '%c')\n",
-					   soft->proto, pkt.hard.source,
+					   soft->proto, rxdata.pkt.hard.source,
 					   newp->suffix, oldp->suffix);
 			}
 		}
@@ -1111,10 +1114,10 @@
 		lp->default_proto[0] = soft->proto;
 
 		/* in striking contrast, the following isn't a hack. */
-		lp->default_proto[pkt.hard.source] = soft->proto;
+		lp->default_proto[rxdata.pkt.hard.source] = soft->proto;
 	}
 	/* call the protocol-specific receiver. */
-	arc_proto_map[soft->proto]->rx(dev, bufnum, &pkt, length);
+	arc_proto_map[soft->proto]->rx(dev, bufnum, &rxdata.pkt, length);
 }
 
 static void null_rx(struct net_device *dev, int bufnum,
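The arcnet_rx() hunk above fixes a stack overrun: copy_from_card() may transfer up to 512 - ofs bytes, more than struct archdr can hold, so the header is now embedded in a union padded to the full 512-byte ARCnet buffer. A minimal standalone sketch of the same bounded-copy pattern; struct hdr, MAX_XFER and rx_copy() are illustrative names, not kernel APIs.

/* Embed the interpreted header in a union sized to the largest transfer
 * the hardware can perform, so an oversized copy cannot run past the
 * object. */
#include <string.h>

#define MAX_XFER 512	/* largest ARCnet-style buffer transfer */

struct hdr {
	unsigned char offset[2];
	unsigned char soft[4];
};

union rx_space {
	struct hdr pkt;		/* the part the driver interprets */
	char buf[MAX_XFER];	/* padding up to the hardware maximum */
};

static void rx_copy(union rx_space *rx, const char *card, size_t len)
{
	if (len > sizeof(rx->buf))	/* clamp to the union's size */
		len = sizeof(rx->buf);
	memcpy(rx->buf, card, len);	/* can no longer overrun rx->pkt */
}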
diff --git a/drivers/net/can/usb/Kconfig b/drivers/net/can/usb/Kconfig
index 4b3d0dd..b412f7b 100644
--- a/drivers/net/can/usb/Kconfig
+++ b/drivers/net/can/usb/Kconfig
@@ -15,10 +15,10 @@
 	  from EMS Dr. Thomas Wuensche (http://www.ems-wuensche.de).
 
 config CAN_ESD_USB2
-        tristate "ESD USB/2 CAN/USB interface"
-        ---help---
-          This driver supports the CAN-USB/2 interface
-          from esd electronic system design gmbh (http://www.esd.eu).
+	tristate "ESD USB/2 CAN/USB interface"
+	---help---
+	  This driver supports the CAN-USB/2 interface
+	  from esd electronic system design gmbh (http://www.esd.eu).
 
 config CAN_GS_USB
 	tristate "Geschwister Schneider UG interfaces"
diff --git a/drivers/net/dsa/b53/b53_serdes.h b/drivers/net/dsa/b53/b53_serdes.h
index 3bb4f91..55d280f 100644
--- a/drivers/net/dsa/b53/b53_serdes.h
+++ b/drivers/net/dsa/b53/b53_serdes.h
@@ -1,5 +1,5 @@
-/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
- *
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
  * Northstar Plus switch SerDes/SGMII PHY definitions
  *
  * Copyright (C) 2018 Florian Fainelli <f.fainelli@gmail.com>
diff --git a/drivers/net/dsa/lantiq_pce.h b/drivers/net/dsa/lantiq_pce.h
index 1806631..e2be31f 100644
--- a/drivers/net/dsa/lantiq_pce.h
+++ b/drivers/net/dsa/lantiq_pce.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * PCE microcode extracted from UGW 7.1.1 switch api
  *
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index a24d8e6..dd60d08 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -303,7 +303,7 @@
 	{								\
 		.name = #width,						\
 		.val_bits = (width),					\
-		.reg_stride = (width) / 8,				\
+		.reg_stride = 1,					\
 		.reg_bits = (regbits) + (regalign),			\
 		.pad_bits = (regpad),					\
 		.max_register = BIT(regbits) - 1,			\
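The reg_stride change above reflects that regmap's stride describes the spacing of valid register addresses, not the access width: the KSZ switches are byte-addressed even for 16- and 32-bit accesses, so a stride of (width) / 8 would wrongly reject registers at unaligned offsets. A reduced sketch of the distinction, with an illustrative config struct (fake_regmap_config is not the regmap API):

struct fake_regmap_config {
	int val_bits;	/* width of one access, in bits */
	int reg_stride;	/* spacing of valid register addresses */
};

/* 16-bit accesses over a byte-addressed register file: stride stays 1,
 * where (val_bits / 8) == 2 would have rejected odd register offsets. */
static const struct fake_regmap_config ksz_like_cfg = {
	.val_bits   = 16,
	.reg_stride = 1,
};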
diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index 16f15c9..684aa51 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -936,6 +936,9 @@
 {
 	struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv;
 
+	if (!dsa_is_user_port(ds, port))
+		return 0;
+
 	qca8k_port_set_status(priv, port, 1);
 	priv->port_sts[port].enabled = 1;
 
diff --git a/drivers/net/dsa/sja1105/Kconfig b/drivers/net/dsa/sja1105/Kconfig
index 55424f3..f40b248 100644
--- a/drivers/net/dsa/sja1105/Kconfig
+++ b/drivers/net/dsa/sja1105/Kconfig
@@ -27,6 +27,7 @@
 config NET_DSA_SJA1105_TAS
 	bool "Support for the Time-Aware Scheduler on NXP SJA1105"
 	depends on NET_DSA_SJA1105
+	depends on NET_SCH_TAPRIO
 	help
 	  This enables support for the TTEthernet-based egress scheduling
 	  engine in the SJA1105 DSA driver, which is controlled using a
diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig
index 1e2de9d..e8e9c16 100644
--- a/drivers/net/ethernet/Kconfig
+++ b/drivers/net/ethernet/Kconfig
@@ -140,17 +140,6 @@
 source "drivers/net/ethernet/netronome/Kconfig"
 source "drivers/net/ethernet/ni/Kconfig"
 source "drivers/net/ethernet/8390/Kconfig"
-
-config NET_NETX
-	tristate "NetX Ethernet support"
-	select MII
-	depends on ARCH_NETX
-	---help---
-	  This is support for the Hilscher netX builtin Ethernet ports
-
-	  To compile this driver as a module, choose M here. The module
-	  will be called netx-eth.
-
 source "drivers/net/ethernet/nvidia/Kconfig"
 source "drivers/net/ethernet/nxp/Kconfig"
 source "drivers/net/ethernet/oki-semi/Kconfig"
diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile
index 77f9838..05abebc 100644
--- a/drivers/net/ethernet/Makefile
+++ b/drivers/net/ethernet/Makefile
@@ -64,7 +64,6 @@
 obj-$(CONFIG_NET_VENDOR_NETERION) += neterion/
 obj-$(CONFIG_NET_VENDOR_NETRONOME) += netronome/
 obj-$(CONFIG_NET_VENDOR_NI) += ni/
-obj-$(CONFIG_NET_NETX) += netx-eth.o
 obj-$(CONFIG_NET_VENDOR_NVIDIA) += nvidia/
 obj-$(CONFIG_LPC_ENET) += nxp/
 obj-$(CONFIG_NET_VENDOR_OKI) += oki-semi/
diff --git a/drivers/net/ethernet/allwinner/Kconfig b/drivers/net/ethernet/allwinner/Kconfig
index a5e2bcb..264a482 100644
--- a/drivers/net/ethernet/allwinner/Kconfig
+++ b/drivers/net/ethernet/allwinner/Kconfig
@@ -21,17 +21,17 @@
 if NET_VENDOR_ALLWINNER
 
 config SUN4I_EMAC
-        tristate "Allwinner A10 EMAC support"
+	tristate "Allwinner A10 EMAC support"
 	depends on ARCH_SUNXI
 	depends on OF
 	select CRC32
 	select MII
 	select PHYLIB
 	select MDIO_SUN4I
-        ---help---
-          Support for Allwinner A10 EMAC ethernet driver.
+	---help---
+	  Support for Allwinner A10 EMAC ethernet driver.
 
-          To compile this driver as a module, choose M here.  The module
-          will be called sun4i-emac.
+	  To compile this driver as a module, choose M here.  The module
+	  will be called sun4i-emac.
 
 endif # NET_VENDOR_ALLWINNER
diff --git a/drivers/net/ethernet/amazon/Kconfig b/drivers/net/ethernet/amazon/Kconfig
index 69ca99d..cca72a7 100644
--- a/drivers/net/ethernet/amazon/Kconfig
+++ b/drivers/net/ethernet/amazon/Kconfig
@@ -19,6 +19,7 @@
 config ENA_ETHERNET
 	tristate "Elastic Network Adapter (ENA) support"
 	depends on PCI_MSI && !CPU_BIG_ENDIAN
+	select DIMLIB
 	---help---
 	  This driver supports Elastic Network Adapter (ENA)
 
diff --git a/drivers/net/ethernet/amazon/ena/ena_eth_com.c b/drivers/net/ethernet/amazon/ena/ena_eth_com.c
index 38046bf..2845ac27 100644
--- a/drivers/net/ethernet/amazon/ena/ena_eth_com.c
+++ b/drivers/net/ethernet/amazon/ena/ena_eth_com.c
@@ -211,8 +211,8 @@
 
 		pkt_ctrl->curr_bounce_buf =
 			ena_com_get_next_bounce_buffer(&io_sq->bounce_buf_ctrl);
-			memset(io_sq->llq_buf_ctrl.curr_bounce_buf,
-			       0x0, llq_info->desc_list_entry_size);
+		memset(io_sq->llq_buf_ctrl.curr_bounce_buf,
+		       0x0, llq_info->desc_list_entry_size);
 
 		pkt_ctrl->idx = 0;
 		if (unlikely(llq_info->desc_stride_ctrl == ENA_ADMIN_SINGLE_DESC_PER_ENTRY))
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
index 28892b8..a95c263 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c
@@ -306,15 +306,13 @@
 {
 	struct aq_vec_s *self = private;
 	u64 irq_mask = 0U;
-	irqreturn_t err = 0;
+	int err;
 
-	if (!self) {
-		err = -EINVAL;
-		goto err_exit;
-	}
+	if (!self)
+		return IRQ_NONE;
 	err = self->aq_hw_ops->hw_irq_read(self->aq_hw, &irq_mask);
 	if (err < 0)
-		goto err_exit;
+		return IRQ_NONE;
 
 	if (irq_mask) {
 		self->aq_hw_ops->hw_irq_disable(self->aq_hw,
@@ -322,11 +320,10 @@
 		napi_schedule(&self->napi);
 	} else {
 		self->aq_hw_ops->hw_irq_enable(self->aq_hw, 1U);
-		err = IRQ_NONE;
+		return IRQ_NONE;
 	}
 
-err_exit:
-	return err >= 0 ? IRQ_HANDLED : IRQ_NONE;
+	return IRQ_HANDLED;
 }
 
 cpumask_t *aq_vec_get_affinity_mask(struct aq_vec_s *self)
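The aq_vec_isr() rewrite above stops funnelling two unrelated value domains through one variable: an interrupt handler must return IRQ_HANDLED or IRQ_NONE, so storing -EINVAL in an irqreturn_t and translating it at a common exit was fragile. A standalone sketch of the direct-return style; the enum below is a stand-in, not <linux/irqreturn.h>.

enum irq_status { IRQ_NONE_ = 0, IRQ_HANDLED_ = 1 };

static enum irq_status isr(void *self,
			   int (*read_mask)(void *, unsigned long *))
{
	unsigned long mask = 0;

	if (!self)
		return IRQ_NONE_;	/* spurious: no context */
	if (read_mask(self, &mask) < 0)
		return IRQ_NONE_;	/* errno stays internal */
	if (!mask)
		return IRQ_NONE_;	/* not our interrupt */
	/* ... schedule the bottom half ... */
	return IRQ_HANDLED_;
}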
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index 7df887e..a977a45 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -2481,7 +2481,7 @@
 
 	priv->phy_interface = of_get_phy_mode(dn);
 	/* Default to GMII interface mode */
-	if (priv->phy_interface < 0)
+	if ((int)priv->phy_interface < 0)
 		priv->phy_interface = PHY_INTERFACE_MODE_GMII;
 
 	/* In the case of a fixed PHY, the DT node associated
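The (int) cast added here, and in the cxgb4, enetc and hix5hd2 hunks below, shares one cause: of_get_phy_mode() returns a negative errno through an enum type, and an enum whose enumerators are all non-negative may be given an unsigned underlying type, making "value < 0" constant-false. A small runnable illustration; phy_mode is a stand-in for phy_interface_t, and the behaviour shown is the common-ABI case.

#include <stdio.h>

typedef enum { MODE_GMII, MODE_RGMII } phy_mode;	/* all non-negative */

int main(void)
{
	phy_mode m = (phy_mode)-22;	/* pretend -EINVAL leaked in */

	if ((int)m < 0)		/* the cast restores a signed comparison;
				 * "m < 0" alone may be constant-false */
		puts("error detected");
	return 0;
}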
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 35b59b5..8e8d557 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -165,9 +165,8 @@
 #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
 static struct macb_dma_desc_64 *macb_64b_desc(struct macb *bp, struct macb_dma_desc *desc)
 {
-	if (bp->hw_dma_cap & HW_DMA_CAP_64B)
-		return (struct macb_dma_desc_64 *)((void *)desc + sizeof(struct macb_dma_desc));
-	return NULL;
+	return (struct macb_dma_desc_64 *)((void *)desc
+		+ sizeof(struct macb_dma_desc));
 }
 #endif
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 71854a1..3802487 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5701,7 +5701,7 @@
 	whoami = t4_read_reg(adapter, PL_WHOAMI_A);
 	pci_read_config_word(pdev, PCI_DEVICE_ID, &device_id);
 	chip = t4_get_chip_type(adapter, CHELSIO_PCI_ID_VER(device_id));
-	if (chip < 0) {
+	if ((int)chip < 0) {
 		dev_err(&pdev->dev, "Device %d is not supported\n", device_id);
 		err = chip;
 		goto out_free_adapter;
diff --git a/drivers/net/ethernet/emulex/benet/Kconfig b/drivers/net/ethernet/emulex/benet/Kconfig
index e8c7eb8..17d300e 100644
--- a/drivers/net/ethernet/emulex/benet/Kconfig
+++ b/drivers/net/ethernet/emulex/benet/Kconfig
@@ -48,5 +48,5 @@
 	  chipsets. (e.g. OneConnect OCe14xxx)
 
 comment "WARNING: be2net is useless without any enabled chip"
-        depends on BE2NET_BE2=n && BE2NET_BE3=n && BE2NET_LANCER=n && \
+	depends on BE2NET_BE2=n && BE2NET_BE3=n && BE2NET_LANCER=n && \
 	BE2NET_SKYHAWK=n && BE2NET
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
index 7d6513f..b73421c 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
@@ -785,7 +785,7 @@
 	}
 
 	priv->if_mode = of_get_phy_mode(np);
-	if (priv->if_mode < 0) {
+	if ((int)priv->if_mode < 0) {
 		dev_err(priv->dev, "missing phy type\n");
 		of_node_put(priv->phy_node);
 		if (of_phy_is_fixed_link(np))
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index 24bf7f6..51ad864 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -2067,7 +2067,7 @@
 	return 0;
 }
 
-void reset_gfar(struct net_device *ndev)
+static void reset_gfar(struct net_device *ndev)
 {
 	struct gfar_private *priv = netdev_priv(ndev);
 
diff --git a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c
index 95a6b09..c41b19c 100644
--- a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c
+++ b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c
@@ -1194,7 +1194,7 @@
 		goto err_free_mdio;
 
 	priv->phy_mode = of_get_phy_mode(node);
-	if (priv->phy_mode < 0) {
+	if ((int)priv->phy_mode < 0) {
 		netdev_err(ndev, "not find phy-mode\n");
 		ret = -EINVAL;
 		goto err_mdiobus;
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 2e5172f..2b073a3 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1207,7 +1207,7 @@
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 
 	/* ensure that transmissions are stopped if called by do_reset */
-	if (adapter->resetting)
+	if (test_bit(0, &adapter->resetting))
 		netif_tx_disable(netdev);
 	else
 		netif_tx_stop_all_queues(netdev);
@@ -1428,7 +1428,7 @@
 	u8 proto = 0;
 	netdev_tx_t ret = NETDEV_TX_OK;
 
-	if (adapter->resetting) {
+	if (test_bit(0, &adapter->resetting)) {
 		if (!netif_subqueue_stopped(netdev, skb))
 			netif_stop_subqueue(netdev, queue_num);
 		dev_kfree_skb_any(skb);
@@ -1724,6 +1724,86 @@
 }
 
 /**
+ * do_change_param_reset returns zero if we are able to keep processing reset
+ * events, or non-zero if we hit a fatal error and must halt.
+ */
+static int do_change_param_reset(struct ibmvnic_adapter *adapter,
+				 struct ibmvnic_rwi *rwi,
+				 u32 reset_state)
+{
+	struct net_device *netdev = adapter->netdev;
+	int i, rc;
+
+	netdev_dbg(adapter->netdev, "Change param resetting driver (%d)\n",
+		   rwi->reset_reason);
+
+	netif_carrier_off(netdev);
+	adapter->reset_reason = rwi->reset_reason;
+
+	ibmvnic_cleanup(netdev);
+
+	if (reset_state == VNIC_OPEN) {
+		rc = __ibmvnic_close(netdev);
+		if (rc)
+			return rc;
+	}
+
+	release_resources(adapter);
+	release_sub_crqs(adapter, 1);
+	release_crq_queue(adapter);
+
+	adapter->state = VNIC_PROBED;
+
+	rc = init_crq_queue(adapter);
+
+	if (rc) {
+		netdev_err(adapter->netdev,
+			   "Couldn't initialize crq. rc=%d\n", rc);
+		return rc;
+	}
+
+	rc = ibmvnic_reset_init(adapter);
+	if (rc)
+		return IBMVNIC_INIT_FAILED;
+
+	/* If the adapter was in PROBE state prior to the reset,
+	 * exit here.
+	 */
+	if (reset_state == VNIC_PROBED)
+		return 0;
+
+	rc = ibmvnic_login(netdev);
+	if (rc) {
+		adapter->state = reset_state;
+		return rc;
+	}
+
+	rc = init_resources(adapter);
+	if (rc)
+		return rc;
+
+	ibmvnic_disable_irqs(adapter);
+
+	adapter->state = VNIC_CLOSED;
+
+	if (reset_state == VNIC_CLOSED)
+		return 0;
+
+	rc = __ibmvnic_open(netdev);
+	if (rc)
+		return IBMVNIC_OPEN_FAILED;
+
+	/* refresh device's multicast list */
+	ibmvnic_set_multi(netdev);
+
+	/* kick napi */
+	for (i = 0; i < adapter->req_rx_queues; i++)
+		napi_schedule(&adapter->napi[i]);
+
+	return 0;
+}
+
+/**
  * do_reset returns zero if we are able to keep processing reset events, or
  * non-zero if we hit a fatal error and must halt.
  */
@@ -1738,6 +1818,8 @@
 	netdev_dbg(adapter->netdev, "Re-setting driver (%d)\n",
 		   rwi->reset_reason);
 
+	rtnl_lock();
+
 	netif_carrier_off(netdev);
 	adapter->reset_reason = rwi->reset_reason;
 
@@ -1751,16 +1833,25 @@
 	if (reset_state == VNIC_OPEN &&
 	    adapter->reset_reason != VNIC_RESET_MOBILITY &&
 	    adapter->reset_reason != VNIC_RESET_FAILOVER) {
-		rc = __ibmvnic_close(netdev);
-		if (rc)
-			return rc;
-	}
+		adapter->state = VNIC_CLOSING;
 
-	if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM ||
-	    adapter->wait_for_reset) {
-		release_resources(adapter);
-		release_sub_crqs(adapter, 1);
-		release_crq_queue(adapter);
+		/* Release the RTNL lock before link state change and
+		 * re-acquire after the link state change to allow
+		 * linkwatch_event to grab the RTNL lock and run during
+		 * a reset.
+		 */
+		rtnl_unlock();
+		rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN);
+		rtnl_lock();
+		if (rc)
+			goto out;
+
+		if (adapter->state != VNIC_CLOSING) {
+			rc = -1;
+			goto out;
+		}
+
+		adapter->state = VNIC_CLOSED;
 	}
 
 	if (adapter->reset_reason != VNIC_RESET_NON_FATAL) {
@@ -1769,9 +1860,7 @@
 		 */
 		adapter->state = VNIC_PROBED;
 
-		if (adapter->wait_for_reset) {
-			rc = init_crq_queue(adapter);
-		} else if (adapter->reset_reason == VNIC_RESET_MOBILITY) {
+		if (adapter->reset_reason == VNIC_RESET_MOBILITY) {
 			rc = ibmvnic_reenable_crq_queue(adapter);
 			release_sub_crqs(adapter, 1);
 		} else {
@@ -1783,36 +1872,35 @@
 		if (rc) {
 			netdev_err(adapter->netdev,
 				   "Couldn't initialize crq. rc=%d\n", rc);
-			return rc;
+			goto out;
 		}
 
 		rc = ibmvnic_reset_init(adapter);
-		if (rc)
-			return IBMVNIC_INIT_FAILED;
+		if (rc) {
+			rc = IBMVNIC_INIT_FAILED;
+			goto out;
+		}
 
 		/* If the adapter was in PROBE state prior to the reset,
 		 * exit here.
 		 */
-		if (reset_state == VNIC_PROBED)
-			return 0;
+		if (reset_state == VNIC_PROBED) {
+			rc = 0;
+			goto out;
+		}
 
 		rc = ibmvnic_login(netdev);
 		if (rc) {
 			adapter->state = reset_state;
-			return rc;
+			goto out;
 		}
 
-		if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM ||
-		    adapter->wait_for_reset) {
-			rc = init_resources(adapter);
-			if (rc)
-				return rc;
-		} else if (adapter->req_rx_queues != old_num_rx_queues ||
-			   adapter->req_tx_queues != old_num_tx_queues ||
-			   adapter->req_rx_add_entries_per_subcrq !=
-							old_num_rx_slots ||
-			   adapter->req_tx_entries_per_subcrq !=
-							old_num_tx_slots) {
+		if (adapter->req_rx_queues != old_num_rx_queues ||
+		    adapter->req_tx_queues != old_num_tx_queues ||
+		    adapter->req_rx_add_entries_per_subcrq !=
+		    old_num_rx_slots ||
+		    adapter->req_tx_entries_per_subcrq !=
+		    old_num_tx_slots) {
 			release_rx_pools(adapter);
 			release_tx_pools(adapter);
 			release_napi(adapter);
@@ -1820,32 +1908,30 @@
 
 			rc = init_resources(adapter);
 			if (rc)
-				return rc;
+				goto out;
 
 		} else {
 			rc = reset_tx_pools(adapter);
 			if (rc)
-				return rc;
+				goto out;
 
 			rc = reset_rx_pools(adapter);
 			if (rc)
-				return rc;
+				goto out;
 		}
 		ibmvnic_disable_irqs(adapter);
 	}
 	adapter->state = VNIC_CLOSED;
 
-	if (reset_state == VNIC_CLOSED)
-		return 0;
+	if (reset_state == VNIC_CLOSED) {
+		rc = 0;
+		goto out;
+	}
 
 	rc = __ibmvnic_open(netdev);
 	if (rc) {
-		if (list_empty(&adapter->rwi_list))
-			adapter->state = VNIC_CLOSED;
-		else
-			adapter->state = reset_state;
-
-		return 0;
+		rc = IBMVNIC_OPEN_FAILED;
+		goto out;
 	}
 
 	/* refresh device's multicast list */
@@ -1855,11 +1941,15 @@
 	for (i = 0; i < adapter->req_rx_queues; i++)
 		napi_schedule(&adapter->napi[i]);
 
-	if (adapter->reset_reason != VNIC_RESET_FAILOVER &&
-	    adapter->reset_reason != VNIC_RESET_CHANGE_PARAM)
+	if (adapter->reset_reason != VNIC_RESET_FAILOVER)
 		call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, netdev);
 
-	return 0;
+	rc = 0;
+
+out:
+	rtnl_unlock();
+
+	return rc;
 }
 
 static int do_hard_reset(struct ibmvnic_adapter *adapter,
@@ -1919,14 +2009,8 @@
 		return 0;
 
 	rc = __ibmvnic_open(netdev);
-	if (rc) {
-		if (list_empty(&adapter->rwi_list))
-			adapter->state = VNIC_CLOSED;
-		else
-			adapter->state = reset_state;
-
-		return 0;
-	}
+	if (rc)
+		return IBMVNIC_OPEN_FAILED;
 
 	return 0;
 }
@@ -1965,20 +2049,17 @@
 {
 	struct ibmvnic_rwi *rwi;
 	struct ibmvnic_adapter *adapter;
-	bool we_lock_rtnl = false;
 	u32 reset_state;
 	int rc = 0;
 
 	adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset);
 
-	/* netif_set_real_num_xx_queues needs to take rtnl lock here
-	 * unless wait_for_reset is set, in which case the rtnl lock
-	 * has already been taken before initializing the reset
-	 */
-	if (!adapter->wait_for_reset) {
-		rtnl_lock();
-		we_lock_rtnl = true;
+	if (test_and_set_bit_lock(0, &adapter->resetting)) {
+		schedule_delayed_work(&adapter->ibmvnic_delayed_reset,
+				      IBMVNIC_RESET_DELAY);
+		return;
 	}
+
 	reset_state = adapter->state;
 
 	rwi = get_next_rwi(adapter);
@@ -1990,22 +2071,43 @@
 			break;
 		}
 
-		if (adapter->force_reset_recovery) {
-			adapter->force_reset_recovery = false;
-			rc = do_hard_reset(adapter, rwi, reset_state);
+		if (rwi->reset_reason == VNIC_RESET_CHANGE_PARAM) {
+			/* CHANGE_PARAM requestor holds rtnl_lock */
+			rc = do_change_param_reset(adapter, rwi, reset_state);
+		} else if (adapter->force_reset_recovery) {
+			/* Transport event occurred during previous reset */
+			if (adapter->wait_for_reset) {
+				/* Previous was CHANGE_PARAM; caller locked */
+				adapter->force_reset_recovery = false;
+				rc = do_hard_reset(adapter, rwi, reset_state);
+			} else {
+				rtnl_lock();
+				adapter->force_reset_recovery = false;
+				rc = do_hard_reset(adapter, rwi, reset_state);
+				rtnl_unlock();
+			}
 		} else {
 			rc = do_reset(adapter, rwi, reset_state);
 		}
 		kfree(rwi);
-		if (rc && rc != IBMVNIC_INIT_FAILED &&
+		if (rc == IBMVNIC_OPEN_FAILED) {
+			if (list_empty(&adapter->rwi_list))
+				adapter->state = VNIC_CLOSED;
+			else
+				adapter->state = reset_state;
+			rc = 0;
+		} else if (rc && rc != IBMVNIC_INIT_FAILED &&
 		    !adapter->force_reset_recovery)
 			break;
 
 		rwi = get_next_rwi(adapter);
+
+		if (rwi && (rwi->reset_reason == VNIC_RESET_FAILOVER ||
+			    rwi->reset_reason == VNIC_RESET_MOBILITY))
+			adapter->force_reset_recovery = true;
 	}
 
 	if (adapter->wait_for_reset) {
-		adapter->wait_for_reset = false;
 		adapter->reset_done_rc = rc;
 		complete(&adapter->reset_done);
 	}
@@ -2015,9 +2117,16 @@
 		free_all_rwi(adapter);
 	}
 
-	adapter->resetting = false;
-	if (we_lock_rtnl)
-		rtnl_unlock();
+	clear_bit_unlock(0, &adapter->resetting);
+}
+
+static void __ibmvnic_delayed_reset(struct work_struct *work)
+{
+	struct ibmvnic_adapter *adapter;
+
+	adapter = container_of(work, struct ibmvnic_adapter,
+			       ibmvnic_delayed_reset.work);
+	__ibmvnic_reset(&adapter->ibmvnic_reset);
 }
 
 static int ibmvnic_reset(struct ibmvnic_adapter *adapter,
@@ -2072,14 +2181,11 @@
 	rwi->reset_reason = reason;
 	list_add_tail(&rwi->list, &adapter->rwi_list);
 	spin_unlock_irqrestore(&adapter->rwi_lock, flags);
-	adapter->resetting = true;
 	netdev_dbg(adapter->netdev, "Scheduling reset (reason %d)\n", reason);
 	schedule_work(&adapter->ibmvnic_reset);
 
 	return 0;
 err:
-	if (adapter->wait_for_reset)
-		adapter->wait_for_reset = false;
 	return -ret;
 }
 
@@ -2119,7 +2225,7 @@
 		u16 offset;
 		u8 flags = 0;
 
-		if (unlikely(adapter->resetting &&
+		if (unlikely(test_bit(0, &adapter->resetting) &&
 			     adapter->reset_reason != VNIC_RESET_NON_FATAL)) {
 			enable_scrq_irq(adapter, adapter->rx_scrq[scrq_num]);
 			napi_complete_done(napi, frames_processed);
@@ -2770,7 +2876,7 @@
 		return 1;
 	}
 
-	if (adapter->resetting &&
+	if (test_bit(0, &adapter->resetting) &&
 	    adapter->reset_reason == VNIC_RESET_MOBILITY) {
 		u64 val = (0xff000000) | scrq->hw_irq;
 
@@ -3320,7 +3426,7 @@
 	if (rc) {
 		if (rc == H_CLOSED) {
 			dev_warn(dev, "CRQ Queue closed\n");
-			if (adapter->resetting)
+			if (test_bit(0, &adapter->resetting))
 				ibmvnic_reset(adapter, VNIC_RESET_FATAL);
 		}
 
@@ -4312,13 +4418,14 @@
 {
 	struct net_device *netdev = adapter->netdev;
 	int rc;
+	__be32 rspeed = cpu_to_be32(crq->query_phys_parms_rsp.speed);
 
 	rc = crq->query_phys_parms_rsp.rc.code;
 	if (rc) {
 		netdev_err(netdev, "Error %d in QUERY_PHYS_PARMS\n", rc);
 		return rc;
 	}
-	switch (cpu_to_be32(crq->query_phys_parms_rsp.speed)) {
+	switch (rspeed) {
 	case IBMVNIC_10MBPS:
 		adapter->speed = SPEED_10;
 		break;
@@ -4344,8 +4451,8 @@
 		adapter->speed = SPEED_100000;
 		break;
 	default:
-		netdev_warn(netdev, "Unknown speed 0x%08x\n",
-			    cpu_to_be32(crq->query_phys_parms_rsp.speed));
+		if (netif_carrier_ok(netdev))
+			netdev_warn(netdev, "Unknown speed 0x%08x\n", rspeed);
 		adapter->speed = SPEED_UNKNOWN;
 	}
 	if (crq->query_phys_parms_rsp.flags1 & IBMVNIC_FULL_DUPLEX)
@@ -4395,7 +4502,7 @@
 	case IBMVNIC_CRQ_XPORT_EVENT:
 		netif_carrier_off(netdev);
 		adapter->crq.active = false;
-		if (adapter->resetting)
+		if (test_bit(0, &adapter->resetting))
 			adapter->force_reset_recovery = true;
 		if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) {
 			dev_info(dev, "Migrated, re-enabling adapter\n");
@@ -4733,7 +4840,7 @@
 		return -1;
 	}
 
-	if (adapter->resetting && !adapter->wait_for_reset &&
+	if (test_bit(0, &adapter->resetting) && !adapter->wait_for_reset &&
 	    adapter->reset_reason != VNIC_RESET_MOBILITY) {
 		if (adapter->req_rx_queues != old_num_rx_queues ||
 		    adapter->req_tx_queues != old_num_tx_queues) {
@@ -4845,10 +4952,12 @@
 	spin_lock_init(&adapter->stats_lock);
 
 	INIT_WORK(&adapter->ibmvnic_reset, __ibmvnic_reset);
+	INIT_DELAYED_WORK(&adapter->ibmvnic_delayed_reset,
+			  __ibmvnic_delayed_reset);
 	INIT_LIST_HEAD(&adapter->rwi_list);
 	spin_lock_init(&adapter->rwi_lock);
 	init_completion(&adapter->init_done);
-	adapter->resetting = false;
+	clear_bit(0, &adapter->resetting);
 
 	do {
 		rc = init_crq_queue(adapter);
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index 70bd286..ebc3924 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -20,6 +20,7 @@
 #define IBMVNIC_INVALID_MAP	-1
 #define IBMVNIC_STATS_TIMEOUT	1
 #define IBMVNIC_INIT_FAILED	2
+#define IBMVNIC_OPEN_FAILED	3
 
 /* basic structures plus 100 2k buffers */
 #define IBMVNIC_IO_ENTITLEMENT_DEFAULT	610305
@@ -38,6 +39,8 @@
 #define IBMVNIC_MAX_LTB_SIZE ((1 << (MAX_ORDER - 1)) * PAGE_SIZE)
 #define IBMVNIC_BUFFER_HLEN 500
 
+#define IBMVNIC_RESET_DELAY 100
+
 static const char ibmvnic_priv_flags[][ETH_GSTRING_LEN] = {
 #define IBMVNIC_USE_SERVER_MAXES 0x1
 	"use-server-maxes"
@@ -1076,7 +1079,8 @@
 	spinlock_t rwi_lock;
 	struct list_head rwi_list;
 	struct work_struct ibmvnic_reset;
-	bool resetting;
+	struct delayed_work ibmvnic_delayed_reset;
+	unsigned long resetting;
 	bool napi_enabled, from_passive_init;
 
 	bool failover_pending;
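The ibmvnic changes above convert the resetting bool into an atomic bit: __ibmvnic_reset() claims it with test_and_set_bit_lock(), and if another reset already owns it the work is re-queued through the new delayed work instead of running concurrently. A userspace sketch of that single-bit lock, with C11 atomics standing in for the kernel bitops (all names illustrative):

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag resetting = ATOMIC_FLAG_INIT;

static void reset_worker(void)
{
	/* Claim the flag; a true return means another reset owns it. */
	if (atomic_flag_test_and_set_explicit(&resetting,
					      memory_order_acquire)) {
		puts("busy: re-queue the reset work and retry later");
		return;
	}
	puts("performing reset");
	/* ... reset work runs with the flag held ... */
	atomic_flag_clear_explicit(&resetting, memory_order_release);
}

int main(void)
{
	reset_worker();		/* claims the flag and resets */
	reset_worker();		/* flag is free again: resets once more */
	return 0;
}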
diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h
index 34cd679..6c51b1b 100644
--- a/drivers/net/ethernet/intel/e1000e/e1000.h
+++ b/drivers/net/ethernet/intel/e1000e/e1000.h
@@ -13,7 +13,6 @@
 #include <linux/io.h>
 #include <linux/netdevice.h>
 #include <linux/pci.h>
-#include <linux/pci-aspm.h>
 #include <linux/crc32.h>
 #include <linux/if_vlan.h>
 #include <linux/timecounter.h>
diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c
index 6d52cf5..25aa400 100644
--- a/drivers/net/ethernet/jme.c
+++ b/drivers/net/ethernet/jme.c
@@ -14,7 +14,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
-#include <linux/pci-aspm.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c
index 0a2ec38..095f6c7 100644
--- a/drivers/net/ethernet/marvell/skge.c
+++ b/drivers/net/ethernet/marvell/skge.c
@@ -3108,7 +3108,7 @@
 	skb_put(skb, len);
 
 	if (dev->features & NETIF_F_RXCSUM) {
-		skb->csum = csum;
+		skb->csum = le16_to_cpu(csum);
 		skb->ip_summed = CHECKSUM_COMPLETE;
 	}
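The skge fix above matters only on big-endian hosts: the descriptor checksum arrives in little-endian byte order, so it must pass through le16_to_cpu() before being stored in skb->csum. A plain-C equivalent of that conversion, built from raw bytes so it is endian-neutral:

#include <stdint.h>
#include <stdio.h>

/* Read a 16-bit little-endian value: byte 0 is the least-significant
 * byte regardless of the host CPU's endianness. */
static uint16_t le16_to_host(const uint8_t *p)
{
	return (uint16_t)(p[0] | (p[1] << 8));
}

int main(void)
{
	const uint8_t desc_csum[2] = { 0x34, 0x12 };	/* LE for 0x1234 */

	printf("csum = 0x%04x\n", le16_to_host(desc_csum));
	return 0;
}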
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 0dba272..a1f20b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -20,15 +20,15 @@
 	bool
 
 config MLX5_FPGA
-        bool "Mellanox Technologies Innova support"
-        depends on MLX5_CORE
+	bool "Mellanox Technologies Innova support"
+	depends on MLX5_CORE
 	select MLX5_ACCEL
-        ---help---
-          Build support for the Innova family of network cards by Mellanox
-          Technologies. Innova network cards are comprised of a ConnectX chip
-          and an FPGA chip on one board. If you select this option, the
-          mlx5_core driver will include the Innova FPGA core and allow building
-          sandbox-specific client drivers.
+	---help---
+	  Build support for the Innova family of network cards by Mellanox
+	  Technologies. Innova network cards are comprised of a ConnectX chip
+	  and an FPGA chip on one board. If you select this option, the
+	  mlx5_core driver will include the Innova FPGA core and allow building
+	  sandbox-specific client drivers.
 
 config MLX5_CORE_EN
 	bool "Mellanox 5th generation network adapters (ConnectX series) Ethernet support"
@@ -58,14 +58,14 @@
 	  API.
 
 config MLX5_MPFS
-        bool "Mellanox Technologies MLX5 MPFS support"
-        depends on MLX5_CORE_EN
+	bool "Mellanox Technologies MLX5 MPFS support"
+	depends on MLX5_CORE_EN
 	default y
-        ---help---
+	---help---
 	  Mellanox Technologies Ethernet Multi-Physical Function Switch (MPFS)
-          support in ConnectX NIC. MPFs is required for when multi-PF configuration
-          is enabled to allow passing user configured unicast MAC addresses to the
-          requesting PF.
+	  support in ConnectX NIC. MPFS is required when multi-PF configuration
+	  is enabled, to allow passing user-configured unicast MAC addresses to
+	  the requesting PF.
 
 config MLX5_ESWITCH
 	bool "Mellanox Technologies MLX5 SRIOV E-Switch support"
@@ -73,10 +73,10 @@
 	default y
 	---help---
 	  Mellanox Technologies Ethernet SRIOV E-Switch support in ConnectX NIC.
-          E-Switch provides internal SRIOV packet steering and switching for the
-          enabled VFs and PF in two available modes:
-                Legacy SRIOV mode (L2 mac vlan steering based).
-                Switchdev mode (eswitch offloads).
+	  E-Switch provides internal SRIOV packet steering and switching for the
+	  enabled VFs and PF in two available modes:
+	        Legacy SRIOV mode (L2 mac vlan steering based).
+	        Switchdev mode (eswitch offloads).
 
 config MLX5_CORE_EN_DCB
 	bool "Data Center Bridging (DCB) Support"
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index eed7101..acd946f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -399,10 +399,10 @@
 		      struct mlx5_flow_table *ft,
 		      struct ethtool_rx_flow_spec *fs)
 {
+	struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND };
 	struct mlx5_flow_destination *dst = NULL;
-	struct mlx5_flow_act flow_act = {0};
-	struct mlx5_flow_spec *spec;
 	struct mlx5_flow_handle *rule;
+	struct mlx5_flow_spec *spec;
 	int err = 0;
 
 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index da7555f..3e78a72 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1664,46 +1664,63 @@
 		return err;
 	}
 
-	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {
-		struct flow_match_ipv4_addrs match;
+	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL)) {
+		struct flow_match_control match;
+		u16 addr_type;
 
-		flow_rule_match_enc_ipv4_addrs(rule, &match);
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-			 src_ipv4_src_ipv6.ipv4_layout.ipv4,
-			 ntohl(match.mask->src));
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-			 src_ipv4_src_ipv6.ipv4_layout.ipv4,
-			 ntohl(match.key->src));
+		flow_rule_match_enc_control(rule, &match);
+		addr_type = match.key->addr_type;
 
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-			 dst_ipv4_dst_ipv6.ipv4_layout.ipv4,
-			 ntohl(match.mask->dst));
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-			 dst_ipv4_dst_ipv6.ipv4_layout.ipv4,
-			 ntohl(match.key->dst));
+		/* Tunnels use the same addr_type key IDs as the non-tunnel case */
+		if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+			struct flow_match_ipv4_addrs match;
 
-		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype);
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IP);
-	} else if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) {
-		struct flow_match_ipv6_addrs match;
+			flow_rule_match_enc_ipv4_addrs(rule, &match);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 src_ipv4_src_ipv6.ipv4_layout.ipv4,
+				 ntohl(match.mask->src));
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 src_ipv4_src_ipv6.ipv4_layout.ipv4,
+				 ntohl(match.key->src));
 
-		flow_rule_match_enc_ipv6_addrs(rule, &match);
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
-		       &match.mask->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
-		       &match.key->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 dst_ipv4_dst_ipv6.ipv4_layout.ipv4,
+				 ntohl(match.mask->dst));
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 dst_ipv4_dst_ipv6.ipv4_layout.ipv4,
+				 ntohl(match.key->dst));
 
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
-		       &match.mask->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
-		       &match.key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
+			MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c,
+					 ethertype);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype,
+				 ETH_P_IP);
+		} else if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+			struct flow_match_ipv6_addrs match;
 
-		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype);
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IPV6);
+			flow_rule_match_enc_ipv6_addrs(rule, &match);
+			memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+					    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+			       &match.mask->src, MLX5_FLD_SZ_BYTES(ipv6_layout,
+								   ipv6));
+			memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+					    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+			       &match.key->src, MLX5_FLD_SZ_BYTES(ipv6_layout,
+								  ipv6));
+
+			memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+					    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+			       &match.mask->dst, MLX5_FLD_SZ_BYTES(ipv6_layout,
+								   ipv6));
+			memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+					    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+			       &match.key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout,
+								  ipv6));
+
+			MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c,
+					 ethertype);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype,
+				 ETH_P_IPV6);
+		}
 	}
 
 	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_IP)) {
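The restructuring above makes the tunnel-address decode follow the dissector's ENC_CONTROL key: addr_type is read once and then dispatched to the IPv4 or IPv6 match, instead of probing for each address key independently. A standalone sketch of that dispatch shape; the types below are illustrative, not the flow-dissector API.

#include <stdint.h>

enum addr_type { ADDR_IPV4, ADDR_IPV6 };

struct match_ctl { enum addr_type addr_type; };
struct match_v4  { uint32_t src, dst; };
struct match_v6  { uint8_t src[16], dst[16]; };

static void build_tunnel_match(const struct match_ctl *ctl,
			       const struct match_v4 *v4,
			       const struct match_v6 *v6)
{
	switch (ctl->addr_type) {	/* single source of truth */
	case ADDR_IPV4:
		/* ... program v4->src / v4->dst ... */
		break;
	case ADDR_IPV6:
		/* ... program v6->src / v6->dst ... */
		break;
	}
	(void)v4; (void)v6;
}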
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 9648c22..e47dd7c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1568,6 +1568,7 @@
 	{ PCI_VDEVICE(MELLANOX, 0x101e), MLX5_PCI_DEV_IS_VF},	/* ConnectX Family mlx5Gen Virtual Function */
 	{ PCI_VDEVICE(MELLANOX, 0xa2d2) },			/* BlueField integrated ConnectX-5 network controller */
 	{ PCI_VDEVICE(MELLANOX, 0xa2d3), MLX5_PCI_DEV_IS_VF},	/* BlueField integrated ConnectX-5 network controller VF */
+	{ PCI_VDEVICE(MELLANOX, 0xa2d6) },			/* BlueField-2 integrated ConnectX-6 Dx network controller */
 	{ 0, }
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
index 7d81a77..b74b7d0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
@@ -615,7 +615,7 @@
 		 * that recalculates the CS and forwards to the vport.
 		 */
 		ret = mlx5dr_domain_cache_get_recalc_cs_ft_addr(dest_action->vport.dmn,
-								dest_action->vport.num,
+								dest_action->vport.caps->num,
 								final_icm_addr);
 		if (ret) {
 			mlx5dr_err(dmn, "Failed to get FW cs recalc flow table\n");
@@ -744,7 +744,7 @@
 			dest_action = action;
 			if (rx_rule) {
 				/* Loopback on WIRE vport is not supported */
-				if (action->vport.num == WIRE_PORT)
+				if (action->vport.caps->num == WIRE_PORT)
 					goto out_invalid_arg;
 
 				attr.final_icm_addr = action->vport.caps->icm_address_rx;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c
index 01008cd..67dea76 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c
@@ -230,8 +230,7 @@
 		    (dmn->type == MLX5DR_DOMAIN_TYPE_FDB ||
 		     dmn->type == MLX5DR_DOMAIN_TYPE_NIC_RX)) {
 			ret = mlx5dr_ste_build_src_gvmi_qpn(&sb[idx++], &mask,
-							    &dmn->info.caps,
-							    inner, rx);
+							    dmn, inner, rx);
 			if (ret)
 				return ret;
 		}
@@ -458,13 +457,11 @@
 
 	prev_matcher = NULL;
 	if (next_matcher && !first)
-		prev_matcher = list_entry(next_matcher->matcher_list.prev,
-					  struct mlx5dr_matcher,
-					  matcher_list);
+		prev_matcher = list_prev_entry(next_matcher, matcher_list);
 	else if (!first)
-		prev_matcher = list_entry(tbl->matcher_list.prev,
-					  struct mlx5dr_matcher,
-					  matcher_list);
+		prev_matcher = list_last_entry(&tbl->matcher_list,
+					       struct mlx5dr_matcher,
+					       matcher_list);
 
 	if (dmn->type == MLX5DR_DOMAIN_TYPE_FDB ||
 	    dmn->type == MLX5DR_DOMAIN_TYPE_NIC_RX) {
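list_prev_entry() and list_last_entry() above are behaviour-preserving replacements for the open-coded list_entry(ptr->prev, ...) calls; the helper names state whether the code wants the neighbour of a known entry or the tail of a list. A reduced re-implementation of the kernel macros, for illustration only (typeof is the GNU extension the kernel relies on):

#include <stddef.h>

struct list_head { struct list_head *prev, *next; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)

/* Tail of the list: the entry the head's prev link points at. */
#define list_last_entry(head, type, member) \
	list_entry((head)->prev, type, member)

/* Neighbour of a known entry, walking backwards. */
#define list_prev_entry(pos, member) \
	list_entry((pos)->member.prev, typeof(*(pos)), member)

struct matcher {
	int prio;
	struct list_head node;
};
/* usage: struct matcher *prev = list_prev_entry(curr, node); */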
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c
index 3bc3f66..4187f2b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c
@@ -18,7 +18,7 @@
 	struct mlx5dr_ste *last_ste;
 
 	/* The new entry will be inserted after the last */
-	last_ste = list_entry(miss_list->prev, struct mlx5dr_ste, miss_list_node);
+	last_ste = list_last_entry(miss_list, struct mlx5dr_ste, miss_list_node);
 	WARN_ON(!last_ste);
 
 	ste_info_last = kzalloc(sizeof(*ste_info_last), GFP_KERNEL);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c
index 6b0af64..4efe1b0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c
@@ -429,12 +429,9 @@
 	struct mlx5dr_ste *prev_ste;
 	u64 miss_addr;
 
-	prev_ste = list_entry(mlx5dr_ste_get_miss_list(ste)->prev, struct mlx5dr_ste,
-			      miss_list_node);
-	if (!prev_ste) {
-		WARN_ON(true);
+	prev_ste = list_prev_entry(ste, miss_list_node);
+	if (WARN_ON(!prev_ste))
 		return;
-	}
 
 	miss_addr = mlx5dr_ste_get_miss_addr(ste->hw_ste);
 	mlx5dr_ste_set_miss_addr(prev_ste->hw_ste, miss_addr);
@@ -461,8 +458,8 @@
 	struct mlx5dr_ste_htbl *stats_tbl;
 	LIST_HEAD(send_ste_list);
 
-	first_ste = list_entry(mlx5dr_ste_get_miss_list(ste)->next,
-			       struct mlx5dr_ste, miss_list_node);
+	first_ste = list_first_entry(mlx5dr_ste_get_miss_list(ste),
+				     struct mlx5dr_ste, miss_list_node);
 	stats_tbl = first_ste->htbl;
 
 	/* Two options:
@@ -479,8 +476,7 @@
 		if (last_ste == first_ste)
 			next_ste = NULL;
 		else
-			next_ste = list_entry(ste->miss_list_node.next,
-					      struct mlx5dr_ste, miss_list_node);
+			next_ste = list_next_entry(ste, miss_list_node);
 
 		if (!next_ste) {
 			/* One and only entry in the list */
@@ -841,6 +837,8 @@
 	spec->source_sqn = MLX5_GET(fte_match_set_misc, mask, source_sqn);
 
 	spec->source_port = MLX5_GET(fte_match_set_misc, mask, source_port);
+	spec->source_eswitch_owner_vhca_id = MLX5_GET(fte_match_set_misc, mask,
+						      source_eswitch_owner_vhca_id);
 
 	spec->outer_second_prio = MLX5_GET(fte_match_set_misc, mask, outer_second_prio);
 	spec->outer_second_cfi = MLX5_GET(fte_match_set_misc, mask, outer_second_cfi);
@@ -2254,11 +2252,18 @@
 {
 	struct mlx5dr_match_misc *misc_mask = &value->misc;
 
-	if (misc_mask->source_port != 0xffff)
+	/* Partial misc source_port is not supported */
+	if (misc_mask->source_port && misc_mask->source_port != 0xffff)
+		return -EINVAL;
+
+	/* Partial misc source_eswitch_owner_vhca_id is not supported */
+	if (misc_mask->source_eswitch_owner_vhca_id &&
+	    misc_mask->source_eswitch_owner_vhca_id != 0xffff)
 		return -EINVAL;
 
 	DR_STE_SET_MASK(src_gvmi_qp, bit_mask, source_gvmi, misc_mask, source_port);
 	DR_STE_SET_MASK(src_gvmi_qp, bit_mask, source_qp, misc_mask, source_sqn);
+	misc_mask->source_eswitch_owner_vhca_id = 0;
 
 	return 0;
 }
@@ -2270,17 +2275,33 @@
 	struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p;
 	struct mlx5dr_match_misc *misc = &value->misc;
 	struct mlx5dr_cmd_vport_cap *vport_cap;
+	struct mlx5dr_domain *dmn = sb->dmn;
+	struct mlx5dr_cmd_caps *caps;
 	u8 *tag = hw_ste->tag;
 
 	DR_STE_SET_TAG(src_gvmi_qp, tag, source_qp, misc, source_sqn);
 
-	vport_cap = mlx5dr_get_vport_cap(sb->caps, misc->source_port);
+	if (sb->vhca_id_valid) {
+		/* Find port GVMI based on the eswitch_owner_vhca_id */
+		if (misc->source_eswitch_owner_vhca_id == dmn->info.caps.gvmi)
+			caps = &dmn->info.caps;
+		else if (dmn->peer_dmn && (misc->source_eswitch_owner_vhca_id ==
+					   dmn->peer_dmn->info.caps.gvmi))
+			caps = &dmn->peer_dmn->info.caps;
+		else
+			return -EINVAL;
+	} else {
+		caps = &dmn->info.caps;
+	}
+
+	vport_cap = mlx5dr_get_vport_cap(caps, misc->source_port);
 	if (!vport_cap)
 		return -EINVAL;
 
 	if (vport_cap->vport_gvmi)
 		MLX5_SET(ste_src_gvmi_qp, tag, source_gvmi, vport_cap->vport_gvmi);
 
+	misc->source_eswitch_owner_vhca_id = 0;
 	misc->source_port = 0;
 
 	return 0;
@@ -2288,17 +2309,20 @@
 
 int mlx5dr_ste_build_src_gvmi_qpn(struct mlx5dr_ste_build *sb,
 				  struct mlx5dr_match_param *mask,
-				  struct mlx5dr_cmd_caps *caps,
+				  struct mlx5dr_domain *dmn,
 				  bool inner, bool rx)
 {
 	int ret;
 
+	/* Set vhca_id_valid before we reset source_eswitch_owner_vhca_id */
+	sb->vhca_id_valid = mask->misc.source_eswitch_owner_vhca_id;
+
 	ret = dr_ste_build_src_gvmi_qpn_bit_mask(mask, sb->bit_mask);
 	if (ret)
 		return ret;
 
 	sb->rx = rx;
-	sb->caps = caps;
+	sb->dmn = dmn;
 	sb->inner = inner;
 	sb->lu_type = MLX5DR_STE_LU_TYPE_SRC_GVMI_AND_QP;
 	sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask);
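Two rules are enforced in the hunks above: the STE hardware matches source_port and source_eswitch_owner_vhca_id only in full, so any mask other than 0 or 0xffff is rejected up front, and when a vhca id is present the GVMI lookup selects between the local and peer domain caps. A minimal sketch of the all-or-nothing mask check (check_full_mask() is an illustrative name):

#include <stdint.h>

/* Accept only "don't care" (0) or exact-match (0xffff) masks; the STE
 * cannot express a partial match on these fields. */
static int check_full_mask(uint16_t mask)
{
	if (mask && mask != 0xffff)
		return -1;	/* partial mask: unsupported */
	return 0;
}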
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
index a37ee63..1cb3769 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
@@ -180,6 +180,8 @@
 struct mlx5dr_ste_build {
 	u8 inner:1;
 	u8 rx:1;
+	u8 vhca_id_valid:1;
+	struct mlx5dr_domain *dmn;
 	struct mlx5dr_cmd_caps *caps;
 	u8 lu_type;
 	u16 byte_mask;
@@ -331,7 +333,7 @@
 				 bool inner, bool rx);
 int mlx5dr_ste_build_src_gvmi_qpn(struct mlx5dr_ste_build *sb,
 				  struct mlx5dr_match_param *mask,
-				  struct mlx5dr_cmd_caps *caps,
+				  struct mlx5dr_domain *dmn,
 				  bool inner, bool rx);
 void mlx5dr_ste_build_empty_always_hit(struct mlx5dr_ste_build *sb, bool rx);
 
@@ -453,7 +455,7 @@
 	u32 gre_c_present:1;
 	/* Source port; 0xffff determines the wire port */
 	u32 source_port:16;
-	u32 reserved_auto2:16;
+	u32 source_eswitch_owner_vhca_id:16;
 	/* VLAN ID of first VLAN tag the inner header of the incoming packet.
 	 * Valid only when inner_second_cvlan_tag ==1 or inner_second_svlan_tag ==1
 	 */
@@ -745,7 +747,6 @@
 		struct {
 			struct mlx5dr_domain *dmn;
 			struct mlx5dr_cmd_vport_cap *caps;
-			u32 num;
 		} vport;
 		struct {
 			u32 vlan_hdr; /* tpid_pcp_dei_vid */
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index dd234cf..dcf9562 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3771,6 +3771,14 @@
 		goto err_port_qdiscs_init;
 	}
 
+	err = mlxsw_sp_port_vlan_set(mlxsw_sp_port, 0, VLAN_N_VID - 1, false,
+				     false);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to clear VLAN filter\n",
+			mlxsw_sp_port->local_port);
+		goto err_port_vlan_clear;
+	}
+
 	err = mlxsw_sp_port_nve_init(mlxsw_sp_port);
 	if (err) {
 		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize NVE\n",
@@ -3818,6 +3826,7 @@
 err_port_pvid_set:
 	mlxsw_sp_port_nve_fini(mlxsw_sp_port);
 err_port_nve_init:
+err_port_vlan_clear:
 	mlxsw_sp_tc_qdisc_fini(mlxsw_sp_port);
 err_port_qdiscs_init:
 	mlxsw_sp_port_fids_fini(mlxsw_sp_port);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
index 0ad1a24..b607919 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
@@ -21,6 +21,7 @@
 					 struct netlink_ext_ack *extack)
 {
 	const struct flow_action_entry *act;
+	int mirror_act_count = 0;
 	int err, i;
 
 	if (!flow_action_has_entries(flow_action))
@@ -105,6 +106,11 @@
 		case FLOW_ACTION_MIRRED: {
 			struct net_device *out_dev = act->dev;
 
+			if (mirror_act_count++) {
+				NL_SET_ERR_MSG_MOD(extack, "Multiple mirror actions per rule are not supported");
+				return -EOPNOTSUPP;
+			}
+
 			err = mlxsw_sp_acl_rulei_act_mirror(mlxsw_sp, rulei,
 							    block, out_dev,
 							    extack);
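The mirror_act_count guard above encodes a hardware limit: the Spectrum ASIC supports a single mirror action per rule, so the second FLOW_ACTION_MIRRED in one rule is rejected with -EOPNOTSUPP instead of being silently mis-programmed. A standalone sketch of the counting guard (the action encoding here is illustrative):

/* Walk the action list and fail fast on the second mirror action. */
static int validate_actions(const int *acts, int n, int mirror_id)
{
	int mirrors = 0;

	for (int i = 0; i < n; i++)
		if (acts[i] == mirror_id && mirrors++)
			return -1;	/* second mirror: unsupported */
	return 0;
}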
diff --git a/drivers/net/ethernet/netronome/nfp/abm/cls.c b/drivers/net/ethernet/netronome/nfp/abm/cls.c
index 23ebddf..9f8a1f6 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/cls.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/cls.c
@@ -176,8 +176,10 @@
 	u8 mask, val;
 	int err;
 
-	if (!nfp_abm_u32_check_knode(alink->abm, knode, proto, extack))
+	if (!nfp_abm_u32_check_knode(alink->abm, knode, proto, extack)) {
+		err = -EOPNOTSUPP;
 		goto err_delete;
+	}
 
 	tos_off = proto == htons(ETH_P_IP) ? 16 : 20;
 
@@ -198,14 +200,18 @@
 		if ((iter->val & cmask) == (val & cmask) &&
 		    iter->band != knode->res->classid) {
 			NL_SET_ERR_MSG_MOD(extack, "conflict with already offloaded filter");
+			err = -EOPNOTSUPP;
 			goto err_delete;
 		}
 	}
 
 	if (!match) {
 		match = kzalloc(sizeof(*match), GFP_KERNEL);
-		if (!match)
-			return -ENOMEM;
+		if (!match) {
+			err = -ENOMEM;
+			goto err_delete;
+		}
+
 		list_add(&match->list, &alink->dscp_map);
 	}
 	match->handle = knode->handle;
@@ -221,7 +227,7 @@
 
 err_delete:
 	nfp_abm_u32_knode_delete(alink, knode);
-	return -EOPNOTSUPP;
+	return err;
 }
 
 static int nfp_abm_setup_tc_block_cb(enum tc_setup_type type,
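The nfp/abm hunks above fix error-code propagation: previously an allocation failure returned -ENOMEM without undoing the partial offload, while every path through err_delete reported -EOPNOTSUPP regardless of the real cause. Each failure site now sets err before jumping to the shared cleanup. A compact sketch of that goto-based unwinding pattern (offload() is illustrative):

#include <errno.h>
#include <stdlib.h>

static int offload(int supported)
{
	void *match = NULL;
	int err;

	if (!supported) {
		err = -EOPNOTSUPP;	/* precise reason, set at the site */
		goto err_delete;
	}
	match = malloc(32);
	if (!match) {
		err = -ENOMEM;		/* no longer masked as -EOPNOTSUPP */
		goto err_delete;
	}
	free(match);
	return 0;

err_delete:
	/* ... undo the partial offload ... */
	return err;
}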
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 7a20447..d8ad934 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -400,6 +400,7 @@
 		repr_priv = kzalloc(sizeof(*repr_priv), GFP_KERNEL);
 		if (!repr_priv) {
 			err = -ENOMEM;
+			nfp_repr_free(repr);
 			goto err_reprs_clean;
 		}
 
@@ -413,6 +414,7 @@
 		port = nfp_port_alloc(app, port_type, repr);
 		if (IS_ERR(port)) {
 			err = PTR_ERR(port);
+			kfree(repr_priv);
 			nfp_repr_free(repr);
 			goto err_reprs_clean;
 		}
@@ -433,6 +435,7 @@
 		err = nfp_repr_init(app, repr,
 				    port_id, port, priv->nn->dp.netdev);
 		if (err) {
+			kfree(repr_priv);
 			nfp_port_free(port);
 			nfp_repr_free(repr);
 			goto err_reprs_clean;
@@ -515,6 +518,7 @@
 		repr_priv = kzalloc(sizeof(*repr_priv), GFP_KERNEL);
 		if (!repr_priv) {
 			err = -ENOMEM;
+			nfp_repr_free(repr);
 			goto err_reprs_clean;
 		}
 
@@ -525,11 +529,13 @@
 		port = nfp_port_alloc(app, NFP_PORT_PHYS_PORT, repr);
 		if (IS_ERR(port)) {
 			err = PTR_ERR(port);
+			kfree(repr_priv);
 			nfp_repr_free(repr);
 			goto err_reprs_clean;
 		}
 		err = nfp_port_init_phy_port(app->pf, app, port, i);
 		if (err) {
+			kfree(repr_priv);
 			nfp_port_free(port);
 			nfp_repr_free(repr);
 			goto err_reprs_clean;
@@ -542,6 +548,7 @@
 		err = nfp_repr_init(app, repr,
 				    cmsg_port_id, port, priv->nn->dp.netdev);
 		if (err) {
+			kfree(repr_priv);
 			nfp_port_free(port);
 			nfp_repr_free(repr);
 			goto err_reprs_clean;
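The repeated kfree(repr_priv) additions above plug a leak: once repr_priv is allocated, every subsequent failure branch must release it, together with the other partially built objects, in reverse order of acquisition. A standalone sketch of the discipline (spawn_repr() and the allocations are illustrative):

#include <stdlib.h>

static int spawn_repr(void)
{
	void *repr_priv, *port;

	repr_priv = malloc(16);
	if (!repr_priv)
		return -1;		/* nothing to undo yet */

	port = malloc(16);
	if (!port) {
		free(repr_priv);	/* this free was missing before the fix */
		return -1;
	}

	/* ... later init that fails must free port, then repr_priv ... */
	free(port);
	free(repr_priv);
	return 0;
}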
diff --git a/drivers/net/ethernet/netx-eth.c b/drivers/net/ethernet/netx-eth.c
deleted file mode 100644
index cf6e7eb..0000000
--- a/drivers/net/ethernet/netx-eth.c
+++ /dev/null
@@ -1,497 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * drivers/net/ethernet/netx-eth.c
- *
- * Copyright (c) 2005 Sascha Hauer <s.hauer@pengutronix.de>, Pengutronix
- */
-
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-
-#include <linux/netdevice.h>
-#include <linux/platform_device.h>
-#include <linux/etherdevice.h>
-#include <linux/skbuff.h>
-#include <linux/mii.h>
-
-#include <asm/io.h>
-#include <mach/hardware.h>
-#include <mach/netx-regs.h>
-#include <mach/pfifo.h>
-#include <mach/xc.h>
-#include <linux/platform_data/eth-netx.h>
-
-/* XC Fifo Offsets */
-#define EMPTY_PTR_FIFO(xcno)    (0 + ((xcno) << 3))	/* Index of the empty pointer FIFO */
-#define IND_FIFO_PORT_HI(xcno)  (1 + ((xcno) << 3))	/* Index of the FIFO where received */
-							/* Data packages are indicated by XC */
-#define IND_FIFO_PORT_LO(xcno)  (2 + ((xcno) << 3))	/* Index of the FIFO where received */
-							/* Data packages are indicated by XC */
-#define REQ_FIFO_PORT_HI(xcno)  (3 + ((xcno) << 3))	/* Index of the FIFO where Data packages */
-							/* have to be indicated by ARM which */
-							/* shall be sent */
-#define REQ_FIFO_PORT_LO(xcno)  (4 + ((xcno) << 3))	/* Index of the FIFO where Data packages */
-							/* have to be indicated by ARM which shall */
-							/* be sent */
-#define CON_FIFO_PORT_HI(xcno)  (5 + ((xcno) << 3))	/* Index of the FIFO where sent Data packages */
-							/* are confirmed */
-#define CON_FIFO_PORT_LO(xcno)  (6 + ((xcno) << 3))	/* Index of the FIFO where sent Data */
-							/* packages are confirmed */
-#define PFIFO_MASK(xcno)        (0x7f << (xcno*8))
-
-#define FIFO_PTR_FRAMELEN_SHIFT 0
-#define FIFO_PTR_FRAMELEN_MASK  (0x7ff << 0)
-#define FIFO_PTR_FRAMELEN(len)  (((len) << 0) & FIFO_PTR_FRAMELEN_MASK)
-#define FIFO_PTR_TIMETRIG       (1<<11)
-#define FIFO_PTR_MULTI_REQ
-#define FIFO_PTR_ORIGIN         (1<<14)
-#define FIFO_PTR_VLAN           (1<<15)
-#define FIFO_PTR_FRAMENO_SHIFT  16
-#define FIFO_PTR_FRAMENO_MASK   (0x3f << 16)
-#define FIFO_PTR_FRAMENO(no)    (((no) << 16) & FIFO_PTR_FRAMENO_MASK)
-#define FIFO_PTR_SEGMENT_SHIFT  22
-#define FIFO_PTR_SEGMENT_MASK   (0xf << 22)
-#define FIFO_PTR_SEGMENT(seg)   (((seg) & 0xf) << 22)
-#define FIFO_PTR_ERROR_SHIFT    28
-#define FIFO_PTR_ERROR_MASK     (0xf << 28)
-
-#define ISR_LINK_STATUS_CHANGE (1<<4)
-#define ISR_IND_LO             (1<<3)
-#define ISR_CON_LO             (1<<2)
-#define ISR_IND_HI             (1<<1)
-#define ISR_CON_HI             (1<<0)
-
-#define ETH_MAC_LOCAL_CONFIG 0x1560
-#define ETH_MAC_4321         0x1564
-#define ETH_MAC_65           0x1568
-
-#define MAC_TRAFFIC_CLASS_ARRANGEMENT_SHIFT 16
-#define MAC_TRAFFIC_CLASS_ARRANGEMENT_MASK (0xf<<MAC_TRAFFIC_CLASS_ARRANGEMENT_SHIFT)
-#define MAC_TRAFFIC_CLASS_ARRANGEMENT(x) (((x)<<MAC_TRAFFIC_CLASS_ARRANGEMENT_SHIFT) & MAC_TRAFFIC_CLASS_ARRANGEMENT_MASK)
-#define LOCAL_CONFIG_LINK_STATUS_IRQ_EN (1<<24)
-#define LOCAL_CONFIG_CON_LO_IRQ_EN (1<<23)
-#define LOCAL_CONFIG_CON_HI_IRQ_EN (1<<22)
-#define LOCAL_CONFIG_IND_LO_IRQ_EN (1<<21)
-#define LOCAL_CONFIG_IND_HI_IRQ_EN (1<<20)
-
-#define CARDNAME "netx-eth"
-
-/* LSB must be zero */
-#define INTERNAL_PHY_ADR 0x1c
-
-struct netx_eth_priv {
-	void                    __iomem *sram_base, *xpec_base, *xmac_base;
-	int                     id;
-	struct mii_if_info      mii;
-	u32                     msg_enable;
-	struct xc               *xc;
-	spinlock_t              lock;
-};
-
-static void netx_eth_set_multicast_list(struct net_device *ndev)
-{
-	/* implement me */
-}
-
-static int
-netx_eth_hard_start_xmit(struct sk_buff *skb, struct net_device *ndev)
-{
-	struct netx_eth_priv *priv = netdev_priv(ndev);
-	unsigned char *buf = skb->data;
-	unsigned int len = skb->len;
-
-	spin_lock_irq(&priv->lock);
-	memcpy_toio(priv->sram_base + 1560, (void *)buf, len);
-	if (len < 60) {
-		memset_io(priv->sram_base + 1560 + len, 0, 60 - len);
-		len = 60;
-	}
-
-	pfifo_push(REQ_FIFO_PORT_LO(priv->id),
-	           FIFO_PTR_SEGMENT(priv->id) |
-	           FIFO_PTR_FRAMENO(1) |
-	           FIFO_PTR_FRAMELEN(len));
-
-	ndev->stats.tx_packets++;
-	ndev->stats.tx_bytes += skb->len;
-
-	netif_stop_queue(ndev);
-	spin_unlock_irq(&priv->lock);
-	dev_kfree_skb(skb);
-
-	return NETDEV_TX_OK;
-}
-
-static void netx_eth_receive(struct net_device *ndev)
-{
-	struct netx_eth_priv *priv = netdev_priv(ndev);
-	unsigned int val, frameno, seg, len;
-	unsigned char *data;
-	struct sk_buff *skb;
-
-	val = pfifo_pop(IND_FIFO_PORT_LO(priv->id));
-
-	frameno = (val & FIFO_PTR_FRAMENO_MASK) >> FIFO_PTR_FRAMENO_SHIFT;
-	seg = (val & FIFO_PTR_SEGMENT_MASK) >> FIFO_PTR_SEGMENT_SHIFT;
-	len = (val & FIFO_PTR_FRAMELEN_MASK) >> FIFO_PTR_FRAMELEN_SHIFT;
-
-	skb = netdev_alloc_skb(ndev, len);
-	if (unlikely(skb == NULL)) {
-		ndev->stats.rx_dropped++;
-		return;
-	}
-
-	data = skb_put(skb, len);
-
-	memcpy_fromio(data, priv->sram_base + frameno * 1560, len);
-
-	pfifo_push(EMPTY_PTR_FIFO(priv->id),
-		FIFO_PTR_SEGMENT(seg) | FIFO_PTR_FRAMENO(frameno));
-
-	skb->protocol = eth_type_trans(skb, ndev);
-	netif_rx(skb);
-	ndev->stats.rx_packets++;
-	ndev->stats.rx_bytes += len;
-}
-
-static irqreturn_t
-netx_eth_interrupt(int irq, void *dev_id)
-{
-	struct net_device *ndev = dev_id;
-	struct netx_eth_priv *priv = netdev_priv(ndev);
-	int status;
-	unsigned long flags;
-
-	spin_lock_irqsave(&priv->lock, flags);
-
-	status = readl(NETX_PFIFO_XPEC_ISR(priv->id));
-	while (status) {
-		int fill_level;
-		writel(status, NETX_PFIFO_XPEC_ISR(priv->id));
-
-		if ((status & ISR_CON_HI) || (status & ISR_IND_HI))
-			printk("%s: unexpected status: 0x%08x\n",
-			    __func__, status);
-
-		fill_level =
-		    readl(NETX_PFIFO_FILL_LEVEL(IND_FIFO_PORT_LO(priv->id)));
-		while (fill_level--)
-			netx_eth_receive(ndev);
-
-		if (status & ISR_CON_LO)
-			netif_wake_queue(ndev);
-
-		if (status & ISR_LINK_STATUS_CHANGE)
-			mii_check_media(&priv->mii, netif_msg_link(priv), 1);
-
-		status = readl(NETX_PFIFO_XPEC_ISR(priv->id));
-	}
-	spin_unlock_irqrestore(&priv->lock, flags);
-	return IRQ_HANDLED;
-}
-
-static int netx_eth_open(struct net_device *ndev)
-{
-	struct netx_eth_priv *priv = netdev_priv(ndev);
-
-	if (request_irq
-	    (ndev->irq, netx_eth_interrupt, IRQF_SHARED, ndev->name, ndev))
-		return -EAGAIN;
-
-	writel(ndev->dev_addr[0] |
-	       ndev->dev_addr[1]<<8 |
-	       ndev->dev_addr[2]<<16 |
-	       ndev->dev_addr[3]<<24,
-	       priv->xpec_base + NETX_XPEC_RAM_START_OFS + ETH_MAC_4321);
-	writel(ndev->dev_addr[4] |
-	       ndev->dev_addr[5]<<8,
-	       priv->xpec_base + NETX_XPEC_RAM_START_OFS + ETH_MAC_65);
-
-	writel(LOCAL_CONFIG_LINK_STATUS_IRQ_EN |
-		LOCAL_CONFIG_CON_LO_IRQ_EN |
-		LOCAL_CONFIG_CON_HI_IRQ_EN |
-		LOCAL_CONFIG_IND_LO_IRQ_EN |
-		LOCAL_CONFIG_IND_HI_IRQ_EN,
-		priv->xpec_base + NETX_XPEC_RAM_START_OFS +
-		ETH_MAC_LOCAL_CONFIG);
-
-	mii_check_media(&priv->mii, netif_msg_link(priv), 1);
-	netif_start_queue(ndev);
-
-	return 0;
-}
-
-static int netx_eth_close(struct net_device *ndev)
-{
-	struct netx_eth_priv *priv = netdev_priv(ndev);
-
-	netif_stop_queue(ndev);
-
-	writel(0,
-	    priv->xpec_base + NETX_XPEC_RAM_START_OFS + ETH_MAC_LOCAL_CONFIG);
-
-	free_irq(ndev->irq, ndev);
-
-	return 0;
-}
-
-static void netx_eth_timeout(struct net_device *ndev)
-{
-	struct netx_eth_priv *priv = netdev_priv(ndev);
-	int i;
-
-	printk(KERN_ERR "%s: transmit timed out, resetting\n", ndev->name);
-
-	spin_lock_irq(&priv->lock);
-
-	xc_reset(priv->xc);
-	xc_start(priv->xc);
-
-	for (i=2; i<=18; i++)
-		pfifo_push(EMPTY_PTR_FIFO(priv->id),
-			FIFO_PTR_FRAMENO(i) | FIFO_PTR_SEGMENT(priv->id));
-
-	spin_unlock_irq(&priv->lock);
-
-	netif_wake_queue(ndev);
-}
-
-static int
-netx_eth_phy_read(struct net_device *ndev, int phy_id, int reg)
-{
-	unsigned int val;
-
-	val = MIIMU_SNRDY | MIIMU_PREAMBLE | MIIMU_PHYADDR(phy_id) |
-	      MIIMU_REGADDR(reg) | MIIMU_PHY_NRES;
-
-	writel(val, NETX_MIIMU);
-	while (readl(NETX_MIIMU) & MIIMU_SNRDY);
-
-	return readl(NETX_MIIMU) >> 16;
-
-}
-
-static void
-netx_eth_phy_write(struct net_device *ndev, int phy_id, int reg, int value)
-{
-	unsigned int val;
-
-	val = MIIMU_SNRDY | MIIMU_PREAMBLE | MIIMU_PHYADDR(phy_id) |
-	      MIIMU_REGADDR(reg) | MIIMU_PHY_NRES | MIIMU_OPMODE_WRITE |
-	      MIIMU_DATA(value);
-
-	writel(val, NETX_MIIMU);
-	while (readl(NETX_MIIMU) & MIIMU_SNRDY);
-}
-
-static const struct net_device_ops netx_eth_netdev_ops = {
-	.ndo_open		= netx_eth_open,
-	.ndo_stop		= netx_eth_close,
-	.ndo_start_xmit		= netx_eth_hard_start_xmit,
-	.ndo_tx_timeout		= netx_eth_timeout,
-	.ndo_set_rx_mode	= netx_eth_set_multicast_list,
-	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_set_mac_address	= eth_mac_addr,
-};
-
-static int netx_eth_enable(struct net_device *ndev)
-{
-	struct netx_eth_priv *priv = netdev_priv(ndev);
-	unsigned int mac4321, mac65;
-	int running, i, ret;
-	bool inv_mac_addr = false;
-
-	ndev->netdev_ops = &netx_eth_netdev_ops;
-	ndev->watchdog_timeo = msecs_to_jiffies(5000);
-
-	priv->msg_enable       = NETIF_MSG_LINK;
-	priv->mii.phy_id_mask  = 0x1f;
-	priv->mii.reg_num_mask = 0x1f;
-	priv->mii.force_media  = 0;
-	priv->mii.full_duplex  = 0;
-	priv->mii.dev	     = ndev;
-	priv->mii.mdio_read    = netx_eth_phy_read;
-	priv->mii.mdio_write   = netx_eth_phy_write;
-	priv->mii.phy_id = INTERNAL_PHY_ADR + priv->id;
-
-	running = xc_running(priv->xc);
-	xc_stop(priv->xc);
-
-	/* if the xc engine is already running, assume the bootloader has
-	 * loaded the firmware for us
-	 */
-	if (running) {
-		/* get Node Address from hardware */
-		mac4321 = readl(priv->xpec_base +
-			NETX_XPEC_RAM_START_OFS + ETH_MAC_4321);
-		mac65 = readl(priv->xpec_base +
-			NETX_XPEC_RAM_START_OFS + ETH_MAC_65);
-
-		ndev->dev_addr[0] = mac4321 & 0xff;
-		ndev->dev_addr[1] = (mac4321 >> 8) & 0xff;
-		ndev->dev_addr[2] = (mac4321 >> 16) & 0xff;
-		ndev->dev_addr[3] = (mac4321 >> 24) & 0xff;
-		ndev->dev_addr[4] = mac65 & 0xff;
-		ndev->dev_addr[5] = (mac65 >> 8) & 0xff;
-	} else {
-		if (xc_request_firmware(priv->xc)) {
-			printk(CARDNAME ": requesting firmware failed\n");
-			return -ENODEV;
-		}
-	}
-
-	xc_reset(priv->xc);
-	xc_start(priv->xc);
-
-	if (!is_valid_ether_addr(ndev->dev_addr))
-		inv_mac_addr = true;
-
-	for (i=2; i<=18; i++)
-		pfifo_push(EMPTY_PTR_FIFO(priv->id),
-			FIFO_PTR_FRAMENO(i) | FIFO_PTR_SEGMENT(priv->id));
-
-	ret = register_netdev(ndev);
-	if (inv_mac_addr)
-		printk("%s: Invalid ethernet MAC address. Please set using ip\n",
-		       ndev->name);
-
-	return ret;
-}
-
-static int netx_eth_drv_probe(struct platform_device *pdev)
-{
-	struct netx_eth_priv *priv;
-	struct net_device *ndev;
-	struct netxeth_platform_data *pdata;
-	int ret;
-
-	ndev = alloc_etherdev(sizeof (struct netx_eth_priv));
-	if (!ndev) {
-		ret = -ENOMEM;
-		goto exit;
-	}
-	SET_NETDEV_DEV(ndev, &pdev->dev);
-
-	platform_set_drvdata(pdev, ndev);
-
-	priv = netdev_priv(ndev);
-
-	pdata = dev_get_platdata(&pdev->dev);
-	priv->xc = request_xc(pdata->xcno, &pdev->dev);
-	if (!priv->xc) {
-		dev_err(&pdev->dev, "unable to request xc engine\n");
-		ret = -ENODEV;
-		goto exit_free_netdev;
-	}
-
-	ndev->irq = priv->xc->irq;
-	priv->id = pdev->id;
-	priv->xpec_base = priv->xc->xpec_base;
-	priv->xmac_base = priv->xc->xmac_base;
-	priv->sram_base = priv->xc->sram_base;
-
-	spin_lock_init(&priv->lock);
-
-	ret = pfifo_request(PFIFO_MASK(priv->id));
-	if (ret) {
-		printk("unable to request PFIFO\n");
-		goto exit_free_xc;
-	}
-
-	ret = netx_eth_enable(ndev);
-	if (ret)
-		goto exit_free_pfifo;
-
-	return 0;
-exit_free_pfifo:
-	pfifo_free(PFIFO_MASK(priv->id));
-exit_free_xc:
-	free_xc(priv->xc);
-exit_free_netdev:
-	free_netdev(ndev);
-exit:
-	return ret;
-}
-
-static int netx_eth_drv_remove(struct platform_device *pdev)
-{
-	struct net_device *ndev = platform_get_drvdata(pdev);
-	struct netx_eth_priv *priv = netdev_priv(ndev);
-
-	unregister_netdev(ndev);
-	xc_stop(priv->xc);
-	free_xc(priv->xc);
-	free_netdev(ndev);
-	pfifo_free(PFIFO_MASK(priv->id));
-
-	return 0;
-}
-
-static int netx_eth_drv_suspend(struct platform_device *pdev, pm_message_t state)
-{
-	dev_err(&pdev->dev, "suspend not implemented\n");
-	return 0;
-}
-
-static int netx_eth_drv_resume(struct platform_device *pdev)
-{
-	dev_err(&pdev->dev, "resume not implemented\n");
-	return 0;
-}
-
-static struct platform_driver netx_eth_driver = {
-	.probe		= netx_eth_drv_probe,
-	.remove		= netx_eth_drv_remove,
-	.suspend	= netx_eth_drv_suspend,
-	.resume		= netx_eth_drv_resume,
-	.driver		= {
-		.name	= CARDNAME,
-	},
-};
-
-static int __init netx_eth_init(void)
-{
-	unsigned int phy_control, val;
-
-	printk("NetX Ethernet driver\n");
-
-	phy_control = PHY_CONTROL_PHY_ADDRESS(INTERNAL_PHY_ADR>>1) |
-		      PHY_CONTROL_PHY1_MODE(PHY_MODE_ALL) |
-		      PHY_CONTROL_PHY1_AUTOMDIX |
-		      PHY_CONTROL_PHY1_EN |
-		      PHY_CONTROL_PHY0_MODE(PHY_MODE_ALL) |
-		      PHY_CONTROL_PHY0_AUTOMDIX |
-		      PHY_CONTROL_PHY0_EN |
-		      PHY_CONTROL_CLK_XLATIN;
-
-	val = readl(NETX_SYSTEM_IOC_ACCESS_KEY);
-	writel(val, NETX_SYSTEM_IOC_ACCESS_KEY);
-
-	writel(phy_control | PHY_CONTROL_RESET, NETX_SYSTEM_PHY_CONTROL);
-	udelay(100);
-
-	val = readl(NETX_SYSTEM_IOC_ACCESS_KEY);
-	writel(val, NETX_SYSTEM_IOC_ACCESS_KEY);
-
-	writel(phy_control, NETX_SYSTEM_PHY_CONTROL);
-
-	return platform_driver_register(&netx_eth_driver);
-}
-
-static void __exit netx_eth_cleanup(void)
-{
-	platform_driver_unregister(&netx_eth_driver);
-}
-
-module_init(netx_eth_init);
-module_exit(netx_eth_cleanup);
-
-MODULE_AUTHOR("Sascha Hauer, Pengutronix");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:" CARDNAME);
-MODULE_FIRMWARE("xc0.bin");
-MODULE_FIRMWARE("xc1.bin");
-MODULE_FIRMWARE("xc2.bin");
diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c
index 0b384f9..2761f3a 100644
--- a/drivers/net/ethernet/ni/nixge.c
+++ b/drivers/net/ethernet/ni/nixge.c
@@ -1347,7 +1347,7 @@
 	}
 
 	priv->phy_mode = of_get_phy_mode(pdev->dev.of_node);
-	if (priv->phy_mode < 0) {
+	if ((int)priv->phy_mode < 0) {
 		netdev_err(ndev, "not find \"phy-mode\" property\n");
 		err = -EINVAL;
 		goto unregister_mdio;
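
The (int) cast here (repeated below for netsec, sni_ave, ipq806x, meson8b
and axienet) works around of_get_phy_mode() returning a negative errno into
an enum-typed field: if the compiler picks an unsigned underlying type for
the enum, a plain "< 0" test can never fire and the error is silently
ignored. A minimal userspace sketch of the pitfall (the enum and error
value are illustrative stand-ins for the kernel types):

	#include <stdio.h>

	typedef enum { PHY_IF_MII, PHY_IF_RGMII } phy_interface_t;

	static int get_phy_mode(void)
	{
		return -22;	/* like the kernel's -EINVAL */
	}

	int main(void)
	{
		phy_interface_t mode = get_phy_mode();

		/* With an unsigned enum this branch is dead code and the
		 * compiler typically warns "comparison is always false". */
		if (mode < 0)
			printf("error detected without cast\n");

		/* Casting back to int restores the signed comparison. */
		if ((int)mode < 0)
			printf("error detected with (int) cast\n");

		return 0;
	}
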
diff --git a/drivers/net/ethernet/nxp/Kconfig b/drivers/net/ethernet/nxp/Kconfig
index 418afb8..ee83a71 100644
--- a/drivers/net/ethernet/nxp/Kconfig
+++ b/drivers/net/ethernet/nxp/Kconfig
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config LPC_ENET
-        tristate "NXP ethernet MAC on LPC devices"
-        depends on ARCH_LPC32XX || COMPILE_TEST
-        select PHYLIB
-        help
+	tristate "NXP ethernet MAC on LPC devices"
+	depends on ARCH_LPC32XX || COMPILE_TEST
+	select PHYLIB
+	help
 	  Say Y or M here if you want to use the NXP ethernet MAC included on
 	  some NXP LPC devices. You can safely enable this option for LPC32xx
 	  SoC. Also available as a module.
diff --git a/drivers/net/ethernet/pensando/Kconfig b/drivers/net/ethernet/pensando/Kconfig
index 5ea570b..bd0583e 100644
--- a/drivers/net/ethernet/pensando/Kconfig
+++ b/drivers/net/ethernet/pensando/Kconfig
@@ -26,7 +26,7 @@
 	  found in
 	  <file:Documentation/networking/device_drivers/pensando/ionic.rst>.
 
-          To compile this driver as a module, choose M here. The module
-          will be called ionic.
+	  To compile this driver as a module, choose M here. The module
+	  will be called ionic.
 
 endif # NET_VENDOR_PENSANDO
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c b/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c
index 7afc4a3..bc03cec 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c
@@ -57,7 +57,7 @@
 void ionic_debugfs_add_ident(struct ionic *ionic)
 {
 	debugfs_create_file("identity", 0400, ionic->dentry,
-			    ionic, &identity_fops) ? 0 : -EOPNOTSUPP;
+			    ionic, &identity_fops);
 }
 
 void ionic_debugfs_add_sizes(struct ionic *ionic)
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c
index db7c827..72107a0 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c
@@ -1704,6 +1704,7 @@
 					      GFP_KERNEL);
 
 	if (!lif->rss_ind_tbl) {
+		err = -ENOMEM;
 		dev_err(dev, "Failed to allocate rss indirection table, aborting\n");
 		goto err_out_free_qcqs;
 	}
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 0ae28f0..004c0bf 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -779,8 +779,7 @@
 			return NULL;
 
 		skb_reserve(skb, pad);
-		memcpy(skb_put(skb, len),
-		       page_address(bd->data) + offset, len);
+		skb_put_data(skb, page_address(bd->data) + offset, len);
 		qede_reuse_page(rxq, bd);
 		goto out;
 	}
diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 0ef01db..74f81fe 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -28,7 +28,6 @@
 #include <linux/dma-mapping.h>
 #include <linux/pm_runtime.h>
 #include <linux/prefetch.h>
-#include <linux/pci-aspm.h>
 #include <linux/ipv6.h>
 #include <net/ip6_checksum.h>
 
diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index 1502fe8..55db7fb 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -2007,7 +2007,7 @@
 			   NETIF_MSG_LINK | NETIF_MSG_PROBE;
 
 	priv->phy_interface = device_get_phy_mode(&pdev->dev);
-	if (priv->phy_interface < 0) {
+	if ((int)priv->phy_interface < 0) {
 		dev_err(&pdev->dev, "missing required property 'phy-mode'\n");
 		ret = -ENODEV;
 		goto free_ndev;
diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c
index 10d0c3e..6e984d5 100644
--- a/drivers/net/ethernet/socionext/sni_ave.c
+++ b/drivers/net/ethernet/socionext/sni_ave.c
@@ -1566,7 +1566,7 @@
 
 	np = dev->of_node;
 	phy_mode = of_get_phy_mode(np);
-	if (phy_mode < 0) {
+	if ((int)phy_mode < 0) {
 		dev_err(dev, "phy-mode not found\n");
 		return -EINVAL;
 	}
@@ -1662,19 +1662,19 @@
 					       "socionext,syscon-phy-mode",
 					       1, 0, &args);
 	if (ret) {
-		netdev_err(ndev, "can't get syscon-phy-mode property\n");
+		dev_err(dev, "can't get syscon-phy-mode property\n");
 		goto out_free_netdev;
 	}
 	priv->regmap = syscon_node_to_regmap(args.np);
 	of_node_put(args.np);
 	if (IS_ERR(priv->regmap)) {
-		netdev_err(ndev, "can't map syscon-phy-mode\n");
+		dev_err(dev, "can't map syscon-phy-mode\n");
 		ret = PTR_ERR(priv->regmap);
 		goto out_free_netdev;
 	}
 	ret = priv->data->get_pinmode(priv, phy_mode, args.args[0]);
 	if (ret) {
-		netdev_err(ndev, "invalid phy-mode setting\n");
+		dev_err(dev, "invalid phy-mode setting\n");
 		goto out_free_netdev;
 	}
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
index 2c6d7c6..0d21082 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
@@ -191,7 +191,7 @@
 	struct device *dev = &gmac->pdev->dev;
 
 	gmac->phy_mode = of_get_phy_mode(dev->of_node);
-	if (gmac->phy_mode < 0) {
+	if ((int)gmac->phy_mode < 0) {
 		dev_err(dev, "missing phy mode property\n");
 		return -EINVAL;
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
index 9cda29e..306da8f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
@@ -339,7 +339,7 @@
 
 	dwmac->dev = &pdev->dev;
 	dwmac->phy_mode = of_get_phy_mode(pdev->dev.of_node);
-	if (dwmac->phy_mode < 0) {
+	if ((int)dwmac->phy_mode < 0) {
 		dev_err(&pdev->dev, "missing phy-mode property\n");
 		ret = -EINVAL;
 		goto err_remove_config_dt;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
index d5173dd..2b277b2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
@@ -523,19 +523,18 @@
 				  struct stmmac_rss *cfg, u32 num_rxq)
 {
 	void __iomem *ioaddr = hw->pcsr;
-	u32 *key = (u32 *)cfg->key;
 	int i, ret;
 	u32 value;
 
 	value = readl(ioaddr + XGMAC_RSS_CTRL);
-	if (!cfg->enable) {
+	if (!cfg || !cfg->enable) {
 		value &= ~XGMAC_RSSE;
 		writel(value, ioaddr + XGMAC_RSS_CTRL);
 		return 0;
 	}
 
 	for (i = 0; i < (sizeof(cfg->key) / sizeof(u32)); i++) {
-		ret = dwxgmac2_rss_write_reg(ioaddr, true, i, *key++);
+		ret = dwxgmac2_rss_write_reg(ioaddr, true, i, cfg->key[i]);
 		if (ret)
 			return ret;
 	}
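
Treating a NULL cfg as "disable RSS" is what lets the selftests further
down drop their stack-allocated dummy config. A sketch of the calling
convention, with types pared down to illustrative stand-ins:

	#include <stdio.h>

	struct rss_cfg {
		int enable;
		unsigned int key[10];
	};

	/* NULL or .enable == 0 both mean "turn RSS off". */
	static void rss_configure(const struct rss_cfg *cfg)
	{
		if (!cfg || !cfg->enable) {
			printf("RSS disabled\n");
			return;
		}
		printf("RSS enabled, key[0] = %u\n", cfg->key[0]);
	}

	int main(void)
	{
		struct rss_cfg on = { .enable = 1, .key = { 7 } };

		rss_configure(&on);
		rss_configure(NULL);	/* replaces the dummy { .enable = false } */
		return 0;
	}
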
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index a6cb2aa..d323273 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1557,13 +1557,15 @@
 	for (queue = 0; queue < rx_count; queue++) {
 		struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
 		struct page_pool_params pp_params = { 0 };
+		unsigned int num_pages;
 
 		rx_q->queue_index = queue;
 		rx_q->priv_data = priv;
 
 		pp_params.flags = PP_FLAG_DMA_MAP;
 		pp_params.pool_size = DMA_RX_SIZE;
-		pp_params.order = DIV_ROUND_UP(priv->dma_buf_sz, PAGE_SIZE);
+		num_pages = DIV_ROUND_UP(priv->dma_buf_sz, PAGE_SIZE);
+		pp_params.order = ilog2(num_pages);
 		pp_params.nid = dev_to_node(priv->device);
 		pp_params.dev = priv->device;
 		pp_params.dma_dir = DMA_FROM_DEVICE;
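
The fix here is a unit mismatch: page_pool's "order" is a power-of-two
exponent (each pool entry spans 2^order pages), while DIV_ROUND_UP yields a
page count, so the old code over-allocated badly for multi-page buffers. A
quick arithmetic sketch (PAGE_SIZE and the buffer size are illustrative):

	#include <stdio.h>

	#define PAGE_SZ 4096u

	/* integer log2, rounding down: a stand-in for the kernel's ilog2() */
	static unsigned int ilog2_demo(unsigned int v)
	{
		unsigned int r = 0;

		while (v >>= 1)
			r++;
		return r;
	}

	int main(void)
	{
		unsigned int dma_buf_sz = 16384;	/* hypothetical */
		unsigned int num_pages = (dma_buf_sz + PAGE_SZ - 1) / PAGE_SZ;
		unsigned int order = ilog2_demo(num_pages);

		printf("pages needed: %u\n", num_pages);	/* 4 */
		printf("old 'order': %u -> %u pages\n",
		       num_pages, 1u << num_pages);		/* 16 */
		printf("fixed order: %u -> %u pages\n",
		       order, 1u << order);			/* 4 */
		return 0;
	}
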
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
index c56e89e..5f66f61 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
@@ -670,7 +670,7 @@
 	unsigned int pkt_count;
 	int i, ret = 0;
 
-	if (!phydev || !phydev->pause)
+	if (!phydev || (!phydev->pause && !phydev->asym_pause))
 		return -EOPNOTSUPP;
 
 	tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL);
@@ -1233,12 +1233,9 @@
 		return -EOPNOTSUPP;
 	if (!priv->dma_cap.l3l4fnum)
 		return -EOPNOTSUPP;
-	if (priv->rss.enable) {
-		struct stmmac_rss rss = { .enable = false, };
-
-		stmmac_rss_configure(priv, priv->hw, &rss,
+	if (priv->rss.enable)
+		stmmac_rss_configure(priv, priv->hw, NULL,
 				     priv->plat->rx_queues_to_use);
-	}
 
 	dissector = kzalloc(sizeof(*dissector), GFP_KERNEL);
 	if (!dissector) {
@@ -1357,12 +1354,9 @@
 		return -EOPNOTSUPP;
 	if (!priv->dma_cap.l3l4fnum)
 		return -EOPNOTSUPP;
-	if (priv->rss.enable) {
-		struct stmmac_rss rss = { .enable = false, };
-
-		stmmac_rss_configure(priv, priv->hw, &rss,
+	if (priv->rss.enable)
+		stmmac_rss_configure(priv, priv->hw, NULL,
 				     priv->plat->rx_queues_to_use);
-	}
 
 	dissector = kzalloc(sizeof(*dissector), GFP_KERNEL);
 	if (!dissector) {
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index 4fc627f..676006f 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -1762,7 +1762,7 @@
 		}
 	} else {
 		lp->phy_mode = of_get_phy_mode(pdev->dev.of_node);
-		if (lp->phy_mode < 0) {
+		if ((int)lp->phy_mode < 0) {
 			ret = -EINVAL;
 			goto free_netdev;
 		}
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 8f46aa1..cb76373 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -1235,6 +1235,7 @@
 		macsec_rxsa_put(rx_sa);
 	macsec_rxsc_put(rx_sc);
 
+	skb_orphan(skb);
 	ret = gro_cells_receive(&macsec->gro_cells, skb);
 	if (ret == NET_RX_SUCCESS)
 		count_rx(dev, skb->len);
diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 03be30c..fe602648 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -460,9 +460,9 @@
 	  Supports the Renesas PHYs uPD60620 and uPD60620A.
 
 config ROCKCHIP_PHY
-        tristate "Driver for Rockchip Ethernet PHYs"
-        ---help---
-          Currently supports the integrated Ethernet PHY.
+	tristate "Driver for Rockchip Ethernet PHYs"
+	---help---
+	  Currently supports the integrated Ethernet PHY.
 
 config SMSC_PHY
 	tristate "SMSC PHYs"
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 3c8186f..2fea554 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -763,6 +763,8 @@
 	 * Whenever the device's Asymmetric Pause capability is set to 1,
 	 * link-up may fail after a link-up to link-down transition.
 	 *
+	 * The Errata Sheet is for ksz9031, but ksz9021 has the same issue.
+	 *
 	 * Workaround:
 	 * Do not enable the Asymmetric Pause capability bit.
 	 */
@@ -1076,6 +1078,7 @@
 	/* PHY_GBIT_FEATURES */
 	.driver_data	= &ksz9021_type,
 	.probe		= kszphy_probe,
+	.get_features	= ksz9031_get_features,
 	.config_init	= ksz9021_config_init,
 	.ack_interrupt	= kszphy_ack_interrupt,
 	.config_intr	= kszphy_config_intr,
diff --git a/drivers/net/phy/national.c b/drivers/net/phy/national.c
index a221dd5..a5bf087 100644
--- a/drivers/net/phy/national.c
+++ b/drivers/net/phy/national.c
@@ -105,14 +105,17 @@
 
 static void ns_10_base_t_hdx_loopack(struct phy_device *phydev, int disable)
 {
+	u16 lb_dis = BIT(1);
+
 	if (disable)
-		ns_exp_write(phydev, 0x1c0, ns_exp_read(phydev, 0x1c0) | 1);
+		ns_exp_write(phydev, 0x1c0,
+			     ns_exp_read(phydev, 0x1c0) | lb_dis);
 	else
 		ns_exp_write(phydev, 0x1c0,
-			     ns_exp_read(phydev, 0x1c0) & 0xfffe);
+			     ns_exp_read(phydev, 0x1c0) & ~lb_dis);
 
 	pr_debug("10BASE-T HDX loopback %s\n",
-		 (ns_exp_read(phydev, 0x1c0) & 0x0001) ? "off" : "on");
+		 (ns_exp_read(phydev, 0x1c0) & lb_dis) ? "off" : "on");
 }
 
 static int ns_config_init(struct phy_device *phydev)
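
Per this patch, the 10BASE-T HDX loopback disable control sits in bit 1 of
expanded register 0x1c0; the old code set bit 0 ("| 1") and cleared bit 0
("& 0xfffe"), toggling an unrelated bit. A bitmask sketch of the difference
(the register value is illustrative):

	#include <stdio.h>
	#include <stdint.h>

	#define LB_DIS (1u << 1)	/* the documented disable bit */

	int main(void)
	{
		uint16_t reg = 0;

		reg |= 1;		/* old code: touches bit 0 only */
		printf("old write: 0x%04x, loopback disabled: %s\n",
		       reg, (reg & LB_DIS) ? "yes" : "no");

		reg = 0;
		reg |= LB_DIS;		/* fixed: sets the intended bit */
		printf("new write: 0x%04x, loopback disabled: %s\n",
		       reg, (reg & LB_DIS) ? "yes" : "no");
		return 0;
	}
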
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index a30e41a..9a1b006 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -1415,6 +1415,8 @@
 			netif_wake_queue(ppp->dev);
 		else
 			netif_stop_queue(ppp->dev);
+	} else {
+		kfree_skb(skb);
 	}
 	ppp_xmit_unlock(ppp);
 }
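
The added else branch plugs a leak: the transmit path owns the skb once it
is handed in, so a branch that neither queues nor sends it must free it. A
sketch of that consume-or-free ownership rule, with structures reduced to
stand-ins:

	#include <stdio.h>
	#include <stdlib.h>

	struct sk_buff { unsigned int len; };

	/* Takes ownership: the buffer is either consumed or freed. */
	static void xmit_demo(struct sk_buff *skb, int can_send)
	{
		if (can_send) {
			printf("sent %u bytes\n", skb->len);
			free(skb);	/* a real queue frees it after TX */
		} else {
			free(skb);	/* the path the fix adds */
			printf("dropped and freed\n");
		}
	}

	int main(void)
	{
		struct sk_buff *skb = malloc(sizeof(*skb));

		if (!skb)
			return 1;
		skb->len = 128;
		xmit_demo(skb, 0);	/* exercise the failure path */
		return 0;
	}
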
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index dd614c2..3ae70c7 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1200,7 +1200,7 @@
 	kfree_skb(skb);
 err:
 	rcu_read_lock();
-		tap = rcu_dereference(q->tap);
+	tap = rcu_dereference(q->tap);
 	if (tap && tap->count_tx_dropped)
 		tap->count_tx_dropped(tap);
 	rcu_read_unlock();
diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index 50c05d0f..00cab3f 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -681,8 +681,12 @@
 	u8 ep;
 
 	for (ep = 0; ep < intf->cur_altsetting->desc.bNumEndpoints; ep++) {
-
 		e = intf->cur_altsetting->endpoint + ep;
+
+		/* ignore endpoints which cannot transfer data */
+		if (!usb_endpoint_maxp(&e->desc))
+			continue;
+
 		switch (e->desc.bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) {
 		case USB_ENDPOINT_XFER_INT:
 			if (usb_endpoint_dir_in(&e->desc)) {
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 58952a7..dde05e2 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -100,6 +100,11 @@
 			int				intr = 0;
 
 			e = alt->endpoint + ep;
+
+			/* ignore endpoints which cannot transfer data */
+			if (!usb_endpoint_maxp(&e->desc))
+				continue;
+
 			switch (e->desc.bmAttributes) {
 			case USB_ENDPOINT_XFER_INT:
 				if (!usb_endpoint_dir_in(&e->desc))
@@ -339,6 +344,8 @@
 {
 	enum usb_device_speed speed = dev->udev->speed;
 
+	if (!dev->rx_urb_size || !dev->hard_mtu)
+		goto insanity;
 	switch (speed) {
 	case USB_SPEED_HIGH:
 		dev->rx_qlen = MAX_QUEUE_MEMORY / dev->rx_urb_size;
@@ -355,6 +362,7 @@
 		dev->tx_qlen = 5 * MAX_QUEUE_MEMORY / dev->hard_mtu;
 		break;
 	default:
+insanity:
 		dev->rx_qlen = dev->tx_qlen = 4;
 	}
 }
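
Both usbnet hunks harden against malformed devices: endpoints advertising a
zero wMaxPacketSize are skipped, and the queue-length math now refuses to
divide by a zero rx_urb_size or hard_mtu. A sketch of the division guard
(MAX_QUEUE_MEMORY matches the driver's 60 * 1518 constant; the rest is
illustrative):

	#include <stdio.h>

	#define MAX_QUEUE_MEMORY (60u * 1518u)

	static unsigned int rx_qlen(unsigned int rx_urb_size)
	{
		if (!rx_urb_size)	/* the new "insanity" fallback */
			return 4;
		return MAX_QUEUE_MEMORY / rx_urb_size;
	}

	int main(void)
	{
		printf("qlen(1518) = %u\n", rx_qlen(1518));	/* 60 */
		printf("qlen(0)    = %u\n", rx_qlen(0));	/* 4, no divide-by-zero */
		return 0;
	}
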
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 6e84328..a4b38a9 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -1154,7 +1154,8 @@
 	struct sk_buff *skb;
 	int err;
 
-	if (family == AF_INET6 && !ipv6_mod_enabled())
+	if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) &&
+	    !ipv6_mod_enabled())
 		return 0;
 
 	skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL);
diff --git a/drivers/net/wimax/i2400m/tx.c b/drivers/net/wimax/i2400m/tx.c
index ebd64e0..1255302 100644
--- a/drivers/net/wimax/i2400m/tx.c
+++ b/drivers/net/wimax/i2400m/tx.c
@@ -654,8 +654,7 @@
 	padding = aligned_size - tx_msg_moved->size;
 	if (padding > 0) {
 		pad_buf = i2400m_tx_fifo_push(i2400m, padding, 0, 0);
-		if (unlikely(WARN_ON(pad_buf == NULL
-				     || pad_buf == TAIL_FULL))) {
+		if (WARN_ON(pad_buf == NULL || pad_buf == TAIL_FULL)) {
 			/* This should not happen -- append should verify
 			 * there is always space left at least to append
 			 * tx_block_size */
diff --git a/drivers/net/wireless/ath/Kconfig b/drivers/net/wireless/ath/Kconfig
index d98d6ac..56616d9 100644
--- a/drivers/net/wireless/ath/Kconfig
+++ b/drivers/net/wireless/ath/Kconfig
@@ -34,7 +34,7 @@
        depends on ATH_DEBUG
        depends on EVENT_TRACING
        ---help---
-         This option enables tracepoints for atheros wireless drivers.
+	 This option enables tracepoints for Atheros wireless drivers.
 	 Currently, ath9k makes use of this facility.
 
 config ATH_REG_DYNAMIC_USER_REG_HINTS
diff --git a/drivers/net/wireless/ath/ar5523/Kconfig b/drivers/net/wireless/ath/ar5523/Kconfig
index 41d3c9a..65b39c7 100644
--- a/drivers/net/wireless/ath/ar5523/Kconfig
+++ b/drivers/net/wireless/ath/ar5523/Kconfig
@@ -5,5 +5,5 @@
        select ATH_COMMON
        select FW_LOADER
        ---help---
-         This module add support for AR5523 based USB dongles such as D-Link
-         DWL-G132, Netgear WPN111 and many more.
+	 This module adds support for AR5523 based USB dongles such as D-Link
+	 DWL-G132, Netgear WPN111 and many more.
diff --git a/drivers/net/wireless/ath/ath5k/pci.c b/drivers/net/wireless/ath/ath5k/pci.c
index c6156cc..d5ee32c 100644
--- a/drivers/net/wireless/ath/ath5k/pci.c
+++ b/drivers/net/wireless/ath/ath5k/pci.c
@@ -18,7 +18,6 @@
 
 #include <linux/nl80211.h>
 #include <linux/pci.h>
-#include <linux/pci-aspm.h>
 #include <linux/etherdevice.h>
 #include <linux/module.h>
 #include "../ath.h"
diff --git a/drivers/net/wireless/ath/ath6kl/Kconfig b/drivers/net/wireless/ath/ath6kl/Kconfig
index dcf8ca0..62c22fd 100644
--- a/drivers/net/wireless/ath/ath6kl/Kconfig
+++ b/drivers/net/wireless/ath/ath6kl/Kconfig
@@ -2,7 +2,7 @@
 config ATH6KL
 	tristate "Atheros mobile chipsets support"
 	depends on CFG80211
-        ---help---
+	---help---
 	  This module adds core support for wireless adapters based on
 	  Atheros AR6003 and AR6004 chipsets. You still need separate
 	  bus drivers for USB and SDIO to be able to use real devices.
diff --git a/drivers/net/wireless/ath/ath9k/Kconfig b/drivers/net/wireless/ath/ath9k/Kconfig
index 2d1247f..c99f422 100644
--- a/drivers/net/wireless/ath/ath9k/Kconfig
+++ b/drivers/net/wireless/ath/ath9k/Kconfig
@@ -148,7 +148,7 @@
        depends on ATH9K
        default n
        ---help---
-         This option enables channel context support in ath9k, which is needed
+	 This option enables channel context support in ath9k, which is needed
 	 for multi-channel concurrency. Enable this if P2P PowerSave support
 	 is required.
 
diff --git a/drivers/net/wireless/ath/carl9170/Kconfig b/drivers/net/wireless/ath/carl9170/Kconfig
index 757eb76..b1bce7a 100644
--- a/drivers/net/wireless/ath/carl9170/Kconfig
+++ b/drivers/net/wireless/ath/carl9170/Kconfig
@@ -41,9 +41,9 @@
 	default y
 
 config CARL9170_HWRNG
-        bool "Random number generator"
-        depends on CARL9170 && (HW_RANDOM = y || HW_RANDOM = CARL9170)
-        default n
+	bool "Random number generator"
+	depends on CARL9170 && (HW_RANDOM = y || HW_RANDOM = CARL9170)
+	default n
 	help
 	  Provides a hardware random number generator to the kernel.
 
diff --git a/drivers/net/wireless/ath/wil6210/txrx.c b/drivers/net/wireless/ath/wil6210/txrx.c
index cb13652..598c1fb 100644
--- a/drivers/net/wireless/ath/wil6210/txrx.c
+++ b/drivers/net/wireless/ath/wil6210/txrx.c
@@ -1012,11 +1012,11 @@
 	skb_orphan(skb);
 
 	if (security && (wil->txrx_ops.rx_crypto_check(wil, skb) != 0)) {
+		wil_dbg_txrx(wil, "Rx drop %d bytes\n", skb->len);
 		dev_kfree_skb(skb);
 		ndev->stats.rx_dropped++;
 		stats->rx_replay++;
 		stats->rx_dropped++;
-		wil_dbg_txrx(wil, "Rx drop %d bytes\n", skb->len);
 		return;
 	}
 
diff --git a/drivers/net/wireless/atmel/Kconfig b/drivers/net/wireless/atmel/Kconfig
index 809bdf3..4c0556b 100644
--- a/drivers/net/wireless/atmel/Kconfig
+++ b/drivers/net/wireless/atmel/Kconfig
@@ -20,22 +20,22 @@
       select FW_LOADER
       select CRC32
        ---help---
-        A driver 802.11b wireless cards based on the Atmel fast-vnet
-        chips. This driver supports standard Linux wireless extensions.
+	A driver for 802.11b wireless cards based on the Atmel fast-vnet
+	chips. This driver supports standard Linux wireless extensions.
 
-        Many  cards based on this chipset do not have flash memory
-        and need their firmware loaded at start-up. If yours is
-        one of these, you will need to provide a firmware image
-        to be loaded into the card by the driver. The Atmel
-        firmware package can be downloaded from
-        <http://www.thekelleys.org.uk/atmel>
+	Many cards based on this chipset do not have flash memory
+	and need their firmware loaded at start-up. If yours is
+	one of these, you will need to provide a firmware image
+	to be loaded into the card by the driver. The Atmel
+	firmware package can be downloaded from
+	<http://www.thekelleys.org.uk/atmel>
 
 config PCI_ATMEL
       tristate "Atmel at76c506 PCI cards"
       depends on ATMEL && PCI
        ---help---
-        Enable support for PCI and mini-PCI cards containing the
-        Atmel at76c506 chip.
+	Enable support for PCI and mini-PCI cards containing the
+	Atmel at76c506 chip.
 
 config PCMCIA_ATMEL
 	tristate "Atmel at76c502/at76c504 PCMCIA cards"
@@ -48,11 +48,11 @@
 	  Atmel at76c502 and at76c504 chips.
 
 config AT76C50X_USB
-        tristate "Atmel at76c503/at76c505/at76c505a USB cards"
-        depends on MAC80211 && USB
-        select FW_LOADER
-        ---help---
-          Enable support for USB Wireless devices using Atmel at76c503,
-          at76c505 or at76c505a chips.
+	tristate "Atmel at76c503/at76c505/at76c505a USB cards"
+	depends on MAC80211 && USB
+	select FW_LOADER
+	---help---
+	  Enable support for USB Wireless devices using Atmel at76c503,
+	  at76c505 or at76c505a chips.
 
 endif # WLAN_VENDOR_ATMEL
diff --git a/drivers/net/wireless/intel/ipw2x00/Kconfig b/drivers/net/wireless/intel/ipw2x00/Kconfig
index 5d2878a..ab17903 100644
--- a/drivers/net/wireless/intel/ipw2x00/Kconfig
+++ b/drivers/net/wireless/intel/ipw2x00/Kconfig
@@ -13,37 +13,37 @@
 	select LIB80211
 	select LIBIPW
 	---help---
-          A driver for the Intel PRO/Wireless 2100 Network 
+	  A driver for the Intel PRO/Wireless 2100 Network
 	  Connection 802.11b wireless network adapter.
 
-          See <file:Documentation/networking/device_drivers/intel/ipw2100.txt>
+	  See <file:Documentation/networking/device_drivers/intel/ipw2100.txt>
 	  for information on the capabilities currently enabled in this driver
 	  and for tips for debugging issues and problems.
 
 	  In order to use this driver, you will need a firmware image for it.
-          You can obtain the firmware from
-	  <http://ipw2100.sf.net/>.  Once you have the firmware image, you 
+	  You can obtain the firmware from
+	  <http://ipw2100.sf.net/>.  Once you have the firmware image, you
 	  will need to place it in /lib/firmware.
 
-          You will also very likely need the Wireless Tools in order to
-          configure your card:
+	  You will also very likely need the Wireless Tools in order to
+	  configure your card:
 
-          <http://www.hpl.hp.com/personal/Jean_Tourrilhes/Linux/Tools.html>.
+	  <http://www.hpl.hp.com/personal/Jean_Tourrilhes/Linux/Tools.html>.
 
-          It is recommended that you compile this driver as a module (M)
-          rather than built-in (Y). This driver requires firmware at device
-          initialization time, and when built-in this typically happens
-          before the filesystem is accessible (hence firmware will be
-          unavailable and initialization will fail). If you do choose to build
-          this driver into your kernel image, you can avoid this problem by
-          including the firmware and a firmware loader in an initramfs.
- 
+	  It is recommended that you compile this driver as a module (M)
+	  rather than built-in (Y). This driver requires firmware at device
+	  initialization time, and when built-in this typically happens
+	  before the filesystem is accessible (hence firmware will be
+	  unavailable and initialization will fail). If you do choose to build
+	  this driver into your kernel image, you can avoid this problem by
+	  including the firmware and a firmware loader in an initramfs.
+
 config IPW2100_MONITOR
-        bool "Enable promiscuous mode"
-        depends on IPW2100
-        ---help---
+	bool "Enable promiscuous mode"
+	depends on IPW2100
+	---help---
 	  Enables promiscuous/monitor mode support for the ipw2100 driver.
-	  With this feature compiled into the driver, you can switch to 
+	  With this feature compiled into the driver, you can switch to
 	  promiscuous mode via the Wireless Tool's Monitor mode.  While in this
 	  mode, no packets can be sent.
 
@@ -51,17 +51,17 @@
 	bool "Enable full debugging output in IPW2100 module."
 	depends on IPW2100
 	---help---
-	  This option will enable debug tracing output for the IPW2100.  
+	  This option will enable debug tracing output for the IPW2100.
 
-	  This will result in the kernel module being ~60k larger.  You can 
-	  control which debug output is sent to the kernel log by setting the 
-	  value in 
+	  This will result in the kernel module being ~60k larger.  You can
+	  control which debug output is sent to the kernel log by setting the
+	  value in
 
 	  /sys/bus/pci/drivers/ipw2100/debug_level
 
 	  This entry will only exist if this option is enabled.
 
-	  If you are not trying to debug or develop the IPW2100 driver, you 
+	  If you are not trying to debug or develop the IPW2100 driver, you
 	  most likely want to say N here.
 
 config IPW2200
@@ -75,37 +75,37 @@
 	select LIB80211
 	select LIBIPW
 	---help---
-          A driver for the Intel PRO/Wireless 2200BG and 2915ABG Network
-	  Connection adapters. 
+	  A driver for the Intel PRO/Wireless 2200BG and 2915ABG Network
+	  Connection adapters.
 
-          See <file:Documentation/networking/device_drivers/intel/ipw2200.txt>
+	  See <file:Documentation/networking/device_drivers/intel/ipw2200.txt>
 	  for information on the capabilities currently enabled in this
 	  driver and for tips for debugging issues and problems.
 
 	  In order to use this driver, you will need a firmware image for it.
-          You can obtain the firmware from
-	  <http://ipw2200.sf.net/>.  See the above referenced README.ipw2200 
+	  You can obtain the firmware from
+	  <http://ipw2200.sf.net/>.  See the above referenced README.ipw2200
 	  for information on where to install the firmware images.
 
-          You will also very likely need the Wireless Tools in order to
-          configure your card:
+	  You will also very likely need the Wireless Tools in order to
+	  configure your card:
 
-          <http://www.hpl.hp.com/personal/Jean_Tourrilhes/Linux/Tools.html>.
+	  <http://www.hpl.hp.com/personal/Jean_Tourrilhes/Linux/Tools.html>.
 
-          It is recommended that you compile this driver as a module (M)
-          rather than built-in (Y). This driver requires firmware at device
-          initialization time, and when built-in this typically happens
-          before the filesystem is accessible (hence firmware will be
-          unavailable and initialization will fail). If you do choose to build
-          this driver into your kernel image, you can avoid this problem by
-          including the firmware and a firmware loader in an initramfs.
+	  It is recommended that you compile this driver as a module (M)
+	  rather than built-in (Y). This driver requires firmware at device
+	  initialization time, and when built-in this typically happens
+	  before the filesystem is accessible (hence firmware will be
+	  unavailable and initialization will fail). If you do choose to build
+	  this driver into your kernel image, you can avoid this problem by
+	  including the firmware and a firmware loader in an initramfs.
 
 config IPW2200_MONITOR
-        bool "Enable promiscuous mode"
-        depends on IPW2200
-        ---help---
+	bool "Enable promiscuous mode"
+	depends on IPW2200
+	---help---
 	  Enables promiscuous/monitor mode support for the ipw2200 driver.
-	  With this feature compiled into the driver, you can switch to 
+	  With this feature compiled into the driver, you can switch to
 	  promiscuous mode via the Wireless Tool's Monitor mode.  While in this
 	  mode, no packets can be sent.
 
@@ -118,28 +118,28 @@
 	depends on IPW2200_MONITOR
 	select IPW2200_RADIOTAP
 	---help---
-          Enables the creation of a second interface prefixed 'rtap'. 
-          This second interface will provide every received in radiotap
+	  Enables the creation of a second interface prefixed 'rtap'.
+	  This second interface will provide every received frame in radiotap
 	  format.
 
-          This is useful for performing wireless network analysis while
-          maintaining an active association.
+	  This is useful for performing wireless network analysis while
+	  maintaining an active association.
 
-          Example usage:
+	  Example usage:
 
-            % modprobe ipw2200 rtap_iface=1
-            % ifconfig rtap0 up
-            % tethereal -i rtap0
+	    % modprobe ipw2200 rtap_iface=1
+	    % ifconfig rtap0 up
+	    % tethereal -i rtap0
 
-          If you do not specify 'rtap_iface=1' as a module parameter then 
-          the rtap interface will not be created and you will need to turn 
-          it on via sysfs:
-	
-            % echo 1 > /sys/bus/pci/drivers/ipw2200/*/rtap_iface
+	  If you do not specify 'rtap_iface=1' as a module parameter then
+	  the rtap interface will not be created and you will need to turn
+	  it on via sysfs:
+
+	    % echo 1 > /sys/bus/pci/drivers/ipw2200/*/rtap_iface
 
 config IPW2200_QOS
-        bool "Enable QoS support"
-        depends on IPW2200
+	bool "Enable QoS support"
+	depends on IPW2200
 
 config IPW2200_DEBUG
 	bool "Enable full debugging output in IPW2200 module."
diff --git a/drivers/net/wireless/intel/iwlegacy/3945-mac.c b/drivers/net/wireless/intel/iwlegacy/3945-mac.c
index b82da75..4fbcc7f 100644
--- a/drivers/net/wireless/intel/iwlegacy/3945-mac.c
+++ b/drivers/net/wireless/intel/iwlegacy/3945-mac.c
@@ -18,7 +18,6 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/pci.h>
-#include <linux/pci-aspm.h>
 #include <linux/slab.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
index fa2c028..ffb705b 100644
--- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c
+++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
@@ -18,7 +18,6 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/pci.h>
-#include <linux/pci-aspm.h>
 #include <linux/slab.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
diff --git a/drivers/net/wireless/intel/iwlegacy/Kconfig b/drivers/net/wireless/intel/iwlegacy/Kconfig
index e329fd7..100f558 100644
--- a/drivers/net/wireless/intel/iwlegacy/Kconfig
+++ b/drivers/net/wireless/intel/iwlegacy/Kconfig
@@ -91,9 +91,9 @@
 	  any problems you may encounter.
 
 config IWLEGACY_DEBUGFS
-        bool "iwlegacy (iwl 3945/4965) debugfs support"
-        depends on IWLEGACY && MAC80211_DEBUGFS
-        ---help---
+	bool "iwlegacy (iwl 3945/4965) debugfs support"
+	depends on IWLEGACY && MAC80211_DEBUGFS
+	---help---
 	  Enable creation of debugfs files for the iwlegacy drivers. This
 	  is a low-impact option that allows getting insight into the
 	  driver's state at runtime.
diff --git a/drivers/net/wireless/intel/iwlwifi/Kconfig b/drivers/net/wireless/intel/iwlwifi/Kconfig
index 7dbc0d3..091d621 100644
--- a/drivers/net/wireless/intel/iwlwifi/Kconfig
+++ b/drivers/net/wireless/intel/iwlwifi/Kconfig
@@ -119,9 +119,9 @@
 	  any problems you may encounter.
 
 config IWLWIFI_DEBUGFS
-        bool "iwlwifi debugfs support"
-        depends on MAC80211_DEBUGFS
-        ---help---
+	bool "iwlwifi debugfs support"
+	depends on MAC80211_DEBUGFS
+	---help---
 	  Enable creation of debugfs files for the iwlwifi drivers. This
 	  is a low-impact option that allows getting insight into the
 	  driver's state at runtime.
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c
index 014eca6..32a5e4e 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c
@@ -889,11 +889,13 @@
 	 * firmware versions.  Unfortunately, we don't have a TLV API
 	 * flag to rely on, so rely on the major version which is in
 	 * the first byte of ucode_ver.  This was implemented
-	 * initially on version 38 and then backported to 36, 29 and
-	 * 17.
+	 * initially on version 38 and then backported to 29 and 17.
+	 * The intention was to have it in 36 as well, but not all
+	 * 8000 family got this feature enabled.  The 8000 family is
+	 * the only one using version 36, so skip this version
+	 * entirely.
 	 */
 	return IWL_UCODE_SERIAL(mvm->fw->ucode_ver) >= 38 ||
-	       IWL_UCODE_SERIAL(mvm->fw->ucode_ver) == 36 ||
 	       IWL_UCODE_SERIAL(mvm->fw->ucode_ver) == 29 ||
 	       IWL_UCODE_SERIAL(mvm->fw->ucode_ver) == 17;
 }
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c
index 32a7083..f0c539b3 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c
@@ -555,16 +555,19 @@
 	return ((s16)le16_to_cpu(*(__le16 *)a) -
 		(s16)le16_to_cpu(*(__le16 *)b));
 }
+#endif
 
 int iwl_mvm_send_temp_report_ths_cmd(struct iwl_mvm *mvm)
 {
 	struct temp_report_ths_cmd cmd = {0};
-	int ret, i, j, idx = 0;
+	int ret;
+#ifdef CONFIG_THERMAL
+	int i, j, idx = 0;
 
 	lockdep_assert_held(&mvm->mutex);
 
 	if (!mvm->tz_device.tzone)
-		return -EINVAL;
+		goto send;
 
 	/* The driver holds array of temperature trips that are unsorted
 	 * and uncompressed, the FW should get it compressed and sorted
@@ -597,6 +600,7 @@
 	}
 
 send:
+#endif
 	ret = iwl_mvm_send_cmd_pdu(mvm, WIDE_ID(PHY_OPS_GROUP,
 						TEMP_REPORTING_THRESHOLDS_CMD),
 				   0, sizeof(cmd), &cmd);
@@ -607,6 +611,7 @@
 	return ret;
 }
 
+#ifdef CONFIG_THERMAL
 static int iwl_mvm_tzone_get_temp(struct thermal_zone_device *device,
 				  int *temperature)
 {
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
index 5ab87a8..f8a1f98 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
@@ -62,7 +62,6 @@
  *
  *****************************************************************************/
 #include <linux/pci.h>
-#include <linux/pci-aspm.h>
 #include <linux/interrupt.h>
 #include <linux/debugfs.h>
 #include <linux/sched.h>
diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
index 275d5ea..842cd81 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/mcu.c
@@ -333,7 +333,6 @@
 
 static int mt7615_load_patch(struct mt7615_dev *dev)
 {
-	const char *firmware = MT7615_ROM_PATCH;
 	const struct mt7615_patch_hdr *hdr;
 	const struct firmware *fw = NULL;
 	int len, ret, sem;
@@ -349,7 +348,7 @@
 		return -EAGAIN;
 	}
 
-	ret = request_firmware(&fw, firmware, dev->mt76.dev);
+	ret = request_firmware(&fw, MT7615_ROM_PATCH, dev->mt76.dev);
 	if (ret)
 		goto out;
 
@@ -447,13 +446,11 @@
 
 static int mt7615_load_ram(struct mt7615_dev *dev)
 {
-	const struct firmware *fw;
 	const struct mt7615_fw_trailer *hdr;
-	const char *n9_firmware = MT7615_FIRMWARE_N9;
-	const char *cr4_firmware = MT7615_FIRMWARE_CR4;
+	const struct firmware *fw;
 	int ret;
 
-	ret = request_firmware(&fw, n9_firmware, dev->mt76.dev);
+	ret = request_firmware(&fw, MT7615_FIRMWARE_N9, dev->mt76.dev);
 	if (ret)
 		return ret;
 
@@ -482,7 +479,7 @@
 
 	release_firmware(fw);
 
-	ret = request_firmware(&fw, cr4_firmware, dev->mt76.dev);
+	ret = request_firmware(&fw, MT7615_FIRMWARE_CR4, dev->mt76.dev);
 	if (ret)
 		return ret;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h
index cef3fd43..7963e30 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h
@@ -26,9 +26,9 @@
 #define MT7615_RX_RING_SIZE		1024
 #define MT7615_RX_MCU_RING_SIZE		512
 
-#define MT7615_FIRMWARE_CR4		"mt7615_cr4.bin"
-#define MT7615_FIRMWARE_N9		"mt7615_n9.bin"
-#define MT7615_ROM_PATCH		"mt7615_rom_patch.bin"
+#define MT7615_FIRMWARE_CR4		"mediatek/mt7615_cr4.bin"
+#define MT7615_FIRMWARE_N9		"mediatek/mt7615_n9.bin"
+#define MT7615_ROM_PATCH		"mediatek/mt7615_rom_patch.bin"
 
 #define MT7615_EEPROM_SIZE		1024
 #define MT7615_TOKEN_SIZE		4096
diff --git a/drivers/net/wireless/ralink/rt2x00/Kconfig b/drivers/net/wireless/ralink/rt2x00/Kconfig
index 858f8aa..f8a9244 100644
--- a/drivers/net/wireless/ralink/rt2x00/Kconfig
+++ b/drivers/net/wireless/ralink/rt2x00/Kconfig
@@ -98,17 +98,17 @@
        bool "rt2800pci - Include support for rt53xx devices (EXPERIMENTAL)"
        default y
        ---help---
-         This adds support for rt53xx wireless chipset family to the
-         rt2800pci driver.
-         Supported chips: RT5390
+	 This adds support for rt53xx wireless chipset family to the
+	 rt2800pci driver.
+	 Supported chips: RT5390
 
 config RT2800PCI_RT3290
        bool "rt2800pci - Include support for rt3290 devices (EXPERIMENTAL)"
        default y
        ---help---
-         This adds support for rt3290 wireless chipset family to the
-         rt2800pci driver.
-         Supported chips: RT3290
+	 This adds support for rt3290 wireless chipset family to the
+	 rt2800pci driver.
+	 Supported chips: RT3290
 endif
 
 config RT2500USB
@@ -176,16 +176,16 @@
 config RT2800USB_RT53XX
        bool "rt2800usb - Include support for rt53xx devices (EXPERIMENTAL)"
        ---help---
-         This adds support for rt53xx wireless chipset family to the
-         rt2800usb driver.
-         Supported chips: RT5370
+	 This adds support for rt53xx wireless chipset family to the
+	 rt2800usb driver.
+	 Supported chips: RT5370
 
 config RT2800USB_RT55XX
        bool "rt2800usb - Include support for rt55xx devices (EXPERIMENTAL)"
        ---help---
-         This adds support for rt55xx wireless chipset family to the
-         rt2800usb driver.
-         Supported chips: RT5572
+	 This adds support for rt55xx wireless chipset family to the
+	 rt2800usb driver.
+	 Supported chips: RT5572
 
 config RT2800USB_UNKNOWN
 	bool "rt2800usb - Include support for unknown (USB) devices"
diff --git a/drivers/net/wireless/realtek/rtw88/mac.c b/drivers/net/wireless/realtek/rtw88/mac.c
index fc14b37..b61b073 100644
--- a/drivers/net/wireless/realtek/rtw88/mac.c
+++ b/drivers/net/wireless/realtek/rtw88/mac.c
@@ -707,9 +707,6 @@
 	rtwdev->h2c.last_box_num = 0;
 	rtwdev->h2c.seq = 0;
 
-	rtw_fw_send_general_info(rtwdev);
-	rtw_fw_send_phydm_info(rtwdev);
-
 	rtw_flag_set(rtwdev, RTW_FLAG_FW_RUNNING);
 
 	return 0;
diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c
index fc8f6213..6dd4577 100644
--- a/drivers/net/wireless/realtek/rtw88/main.c
+++ b/drivers/net/wireless/realtek/rtw88/main.c
@@ -704,6 +704,10 @@
 		goto err_off;
 	}
 
+	/* send H2C after HCI has started */
+	rtw_fw_send_general_info(rtwdev);
+	rtw_fw_send_phydm_info(rtwdev);
+
 	wifi_only = !rtwdev->efuse.btcoex;
 	rtw_coex_power_on_setting(rtwdev);
 	rtw_coex_init_hw_config(rtwdev, wifi_only);
diff --git a/drivers/net/wireless/realtek/rtw88/pci.c b/drivers/net/wireless/realtek/rtw88/pci.c
index 3fdb52a..d90928b 100644
--- a/drivers/net/wireless/realtek/rtw88/pci.c
+++ b/drivers/net/wireless/realtek/rtw88/pci.c
@@ -90,16 +90,13 @@
 	return tx_ring->r.head + offset;
 }
 
-static void rtw_pci_free_tx_ring(struct rtw_dev *rtwdev,
-				 struct rtw_pci_tx_ring *tx_ring)
+static void rtw_pci_free_tx_ring_skbs(struct rtw_dev *rtwdev,
+				      struct rtw_pci_tx_ring *tx_ring)
 {
 	struct pci_dev *pdev = to_pci_dev(rtwdev->dev);
 	struct rtw_pci_tx_data *tx_data;
 	struct sk_buff *skb, *tmp;
 	dma_addr_t dma;
-	u8 *head = tx_ring->r.head;
-	u32 len = tx_ring->r.len;
-	int ring_sz = len * tx_ring->r.desc_size;
 
 	/* free every skb remained in tx list */
 	skb_queue_walk_safe(&tx_ring->queue, skb, tmp) {
@@ -110,21 +107,30 @@
 		pci_unmap_single(pdev, dma, skb->len, PCI_DMA_TODEVICE);
 		dev_kfree_skb_any(skb);
 	}
+}
+
+static void rtw_pci_free_tx_ring(struct rtw_dev *rtwdev,
+				 struct rtw_pci_tx_ring *tx_ring)
+{
+	struct pci_dev *pdev = to_pci_dev(rtwdev->dev);
+	u8 *head = tx_ring->r.head;
+	u32 len = tx_ring->r.len;
+	int ring_sz = len * tx_ring->r.desc_size;
+
+	rtw_pci_free_tx_ring_skbs(rtwdev, tx_ring);
 
 	/* free the ring itself */
 	pci_free_consistent(pdev, ring_sz, head, tx_ring->r.dma);
 	tx_ring->r.head = NULL;
 }
 
-static void rtw_pci_free_rx_ring(struct rtw_dev *rtwdev,
-				 struct rtw_pci_rx_ring *rx_ring)
+static void rtw_pci_free_rx_ring_skbs(struct rtw_dev *rtwdev,
+				      struct rtw_pci_rx_ring *rx_ring)
 {
 	struct pci_dev *pdev = to_pci_dev(rtwdev->dev);
 	struct sk_buff *skb;
-	dma_addr_t dma;
-	u8 *head = rx_ring->r.head;
 	int buf_sz = RTK_PCI_RX_BUF_SIZE;
-	int ring_sz = rx_ring->r.desc_size * rx_ring->r.len;
+	dma_addr_t dma;
 	int i;
 
 	for (i = 0; i < rx_ring->r.len; i++) {
@@ -137,6 +143,16 @@
 		dev_kfree_skb(skb);
 		rx_ring->buf[i] = NULL;
 	}
+}
+
+static void rtw_pci_free_rx_ring(struct rtw_dev *rtwdev,
+				 struct rtw_pci_rx_ring *rx_ring)
+{
+	struct pci_dev *pdev = to_pci_dev(rtwdev->dev);
+	u8 *head = rx_ring->r.head;
+	int ring_sz = rx_ring->r.desc_size * rx_ring->r.len;
+
+	rtw_pci_free_rx_ring_skbs(rtwdev, rx_ring);
 
 	pci_free_consistent(pdev, ring_sz, head, rx_ring->r.dma);
 }
@@ -484,6 +500,17 @@
 	rtwpci->rx_tag = 0;
 }
 
+static void rtw_pci_dma_release(struct rtw_dev *rtwdev, struct rtw_pci *rtwpci)
+{
+	struct rtw_pci_tx_ring *tx_ring;
+	u8 queue;
+
+	for (queue = 0; queue < RTK_MAX_TX_QUEUE_NUM; queue++) {
+		tx_ring = &rtwpci->tx_rings[queue];
+		rtw_pci_free_tx_ring_skbs(rtwdev, tx_ring);
+	}
+}
+
 static int rtw_pci_start(struct rtw_dev *rtwdev)
 {
 	struct rtw_pci *rtwpci = (struct rtw_pci *)rtwdev->priv;
@@ -505,6 +532,7 @@
 
 	spin_lock_irqsave(&rtwpci->irq_lock, flags);
 	rtw_pci_disable_interrupt(rtwdev, rtwpci);
+	rtw_pci_dma_release(rtwdev, rtwpci);
 	spin_unlock_irqrestore(&rtwpci->irq_lock, flags);
 }
 
diff --git a/drivers/net/wireless/zydas/zd1211rw/zd_usb.c b/drivers/net/wireless/zydas/zd1211rw/zd_usb.c
index 4e44ea8..7b5c2fe 100644
--- a/drivers/net/wireless/zydas/zd1211rw/zd_usb.c
+++ b/drivers/net/wireless/zydas/zd1211rw/zd_usb.c
@@ -1633,7 +1633,7 @@
 	 */
 	if (rr->length < struct_size(regs, regs, count)) {
 		dev_dbg_f(zd_usb_dev(usb),
-			 "error: actual length %d less than expected %ld\n",
+			 "error: actual length %d less than expected %zu\n",
 			 rr->length, struct_size(regs, regs, count));
 		return false;
 	}
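
struct_size() returns a size_t, so %ld only happens to line up on builds
where long and size_t share a width; %zu is the portable length modifier.
In miniature:

	#include <stdio.h>
	#include <stddef.h>

	int main(void)
	{
		size_t n = sizeof(double[3]);

		printf("expected %zu bytes\n", n);	/* correct on every ABI */
		return 0;
	}
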
diff --git a/drivers/nfc/st95hf/core.c b/drivers/nfc/st95hf/core.c
index 7eda62a..9642971 100644
--- a/drivers/nfc/st95hf/core.c
+++ b/drivers/nfc/st95hf/core.c
@@ -661,7 +661,7 @@
 			result = -ETIMEDOUT;
 		else
 			result = -EIO;
-	return  result;
+		return result;
 	}
 
 	/* Check for CRC err only if CRC is present in the tag response */
diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.c b/drivers/ntb/hw/amd/ntb_hw_amd.c
index 2859cc9..156c2a1 100644
--- a/drivers/ntb/hw/amd/ntb_hw_amd.c
+++ b/drivers/ntb/hw/amd/ntb_hw_amd.c
@@ -78,7 +78,7 @@
 	if (idx < 0 || idx > ndev->mw_count)
 		return -EINVAL;
 
-	return 1 << idx;
+	return ndev->dev_data->mw_idx << idx;
 }
 
 static int amd_ntb_mw_count(struct ntb_dev *ntb, int pidx)
@@ -909,7 +909,7 @@
 {
 	void __iomem *mmio = ndev->self_mmio;
 
-	ndev->mw_count = AMD_MW_CNT;
+	ndev->mw_count = ndev->dev_data->mw_count;
 	ndev->spad_count = AMD_SPADS_CNT;
 	ndev->db_count = AMD_DB_CNT;
 
@@ -1069,6 +1069,8 @@
 		goto err_ndev;
 	}
 
+	ndev->dev_data = (struct ntb_dev_data *)id->driver_data;
+
 	ndev_init_struct(ndev, pdev);
 
 	rc = amd_ntb_init_pci(ndev, pdev);
@@ -1123,9 +1125,21 @@
 	.read = ndev_debugfs_read,
 };
 
+static const struct ntb_dev_data dev_data[] = {
+	{ /* for device 145b */
+		.mw_count = 3,
+		.mw_idx = 1,
+	},
+	{ /* for device 148b */
+		.mw_count = 2,
+		.mw_idx = 2,
+	},
+};
+
 static const struct pci_device_id amd_ntb_pci_tbl[] = {
-	{PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_NTB)},
-	{0}
+	{ PCI_VDEVICE(AMD, 0x145b), (kernel_ulong_t)&dev_data[0] },
+	{ PCI_VDEVICE(AMD, 0x148b), (kernel_ulong_t)&dev_data[1] },
+	{ 0, }
 };
 MODULE_DEVICE_TABLE(pci, amd_ntb_pci_tbl);
 
diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.h b/drivers/ntb/hw/amd/ntb_hw_amd.h
index 8f3617a..139a307 100644
--- a/drivers/ntb/hw/amd/ntb_hw_amd.h
+++ b/drivers/ntb/hw/amd/ntb_hw_amd.h
@@ -52,7 +52,6 @@
 #include <linux/ntb.h>
 #include <linux/pci.h>
 
-#define PCI_DEVICE_ID_AMD_NTB	0x145B
 #define AMD_LINK_HB_TIMEOUT	msecs_to_jiffies(1000)
 #define AMD_LINK_STATUS_OFFSET	0x68
 #define NTB_LIN_STA_ACTIVE_BIT	0x00000002
@@ -93,7 +92,6 @@
 
 enum {
 	/* AMD NTB Capability */
-	AMD_MW_CNT		= 3,
 	AMD_DB_CNT		= 16,
 	AMD_MSIX_VECTOR_CNT	= 24,
 	AMD_SPADS_CNT		= 16,
@@ -170,6 +168,11 @@
 	AMD_PEER_OFFSET		= 0x400,
 };
 
+struct ntb_dev_data {
+	const unsigned char mw_count;
+	const unsigned int mw_idx;
+};
+
 struct amd_ntb_dev;
 
 struct amd_ntb_vec {
@@ -185,6 +188,7 @@
 	u32 cntl_sta;
 	u32 peer_sta;
 
+	struct ntb_dev_data *dev_data;
 	unsigned char mw_count;
 	unsigned char spad_count;
 	unsigned char db_count;
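
The two AMD NTB hunks above replace a single hard-coded device ID with the
usual driver_data pattern: each pci_device_id entry carries a pointer to
per-chip constants that probe() recovers with a cast. A userspace sketch of
the lookup, with the PCI types pared down to stand-ins:

	#include <stdio.h>

	struct ntb_dev_data {
		unsigned char mw_count;
		unsigned int mw_idx;
	};

	static const struct ntb_dev_data dev_data[] = {
		{ .mw_count = 3, .mw_idx = 1 },	/* device 145b */
		{ .mw_count = 2, .mw_idx = 2 },	/* device 148b */
	};

	struct pci_id_demo {
		unsigned int device;
		unsigned long driver_data;	/* kernel_ulong_t stand-in */
	};

	static const struct pci_id_demo tbl[] = {
		{ 0x145b, (unsigned long)&dev_data[0] },
		{ 0x148b, (unsigned long)&dev_data[1] },
		{ 0, 0 }
	};

	int main(void)
	{
		const struct pci_id_demo *id;

		for (id = tbl; id->device; id++) {
			const struct ntb_dev_data *d =
				(const struct ntb_dev_data *)id->driver_data;

			printf("%04x: mw_count=%u mw_idx=%u\n",
			       id->device, d->mw_count, d->mw_idx);
		}
		return 0;
	}
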
diff --git a/drivers/ntb/hw/idt/Kconfig b/drivers/ntb/hw/idt/Kconfig
index bfc7cac..c79b54c 100644
--- a/drivers/ntb/hw/idt/Kconfig
+++ b/drivers/ntb/hw/idt/Kconfig
@@ -4,11 +4,11 @@
 	depends on PCI
 	select HWMON
 	help
-	 This driver supports NTB of cappable IDT PCIe-switches.
+	 This driver supports the NTB function of capable IDT PCIe-switches.
 
 	 Some of the pre-initializations must be made before IDT PCIe-switch
-	 exposes it NT-functions correctly. It should be done by either proper
-	 initialisation of EEPROM connected to master smbus of the switch or
+	 exposes its NT-functions correctly. It should be done by either proper
+	 initialization of EEPROM connected to master SMBus of the switch or
 	 by BIOS using slave-SMBus interface changing corresponding registers
 	 value. Evidently it must be done before PCI bus enumeration is
 	 finished in Linux kernel.
diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
index f495945..86ffa71 100644
--- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -306,7 +306,7 @@
 	if (rc)
 		return rc;
 
-	if (addr == 0 || size == 0) {
+	if (size == 0) {
 		if (widx < nr_direct_mw)
 			switchtec_ntb_mw_clr_direct(sndev, widx);
 		else
diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index 40c90ca..00a5d57 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -292,7 +292,7 @@
 static int ntb_transport_bus_probe(struct device *dev)
 {
 	const struct ntb_transport_client *client;
-	int rc = -EINVAL;
+	int rc;
 
 	get_device(dev);
 
diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
index d028331..e9b7c2d 100644
--- a/drivers/ntb/test/ntb_perf.c
+++ b/drivers/ntb/test/ntb_perf.c
@@ -1378,7 +1378,7 @@
 	int ret;
 
 	/* Get outbound MW parameters and map it */
-	ret = ntb_peer_mw_get_addr(perf->ntb, peer->gidx, &phys_addr,
+	ret = ntb_peer_mw_get_addr(perf->ntb, perf->gidx, &phys_addr,
 				   &peer->outbuf_size);
 	if (ret)
 		return ret;
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index a8d5688..3e9f45ae 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -392,9 +392,9 @@
 	arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
 	if (++(arena->freelist[lane].seq) == 4)
 		arena->freelist[lane].seq = 1;
-	if (ent_e_flag(ent->old_map))
+	if (ent_e_flag(le32_to_cpu(ent->old_map)))
 		arena->freelist[lane].has_err = 1;
-	arena->freelist[lane].block = le32_to_cpu(ent_lba(ent->old_map));
+	arena->freelist[lane].block = ent_lba(le32_to_cpu(ent->old_map));
 
 	return ret;
 }
@@ -560,8 +560,8 @@
 		 * FIXME: if error clearing fails during init, we want to make
 		 * the BTT read-only
 		 */
-		if (ent_e_flag(log_new.old_map) &&
-				!ent_normal(log_new.old_map)) {
+		if (ent_e_flag(le32_to_cpu(log_new.old_map)) &&
+		    !ent_normal(le32_to_cpu(log_new.old_map))) {
 			arena->freelist[i].has_err = 1;
 			ret = arena_clear_freelist_error(arena, i);
 			if (ret)
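
Both BTT hunks fix an order of operations: the 32-bit map entry is
little-endian on media, so ent_e_flag()/ent_lba() must be applied to the
CPU-order value. Masking first and swapping afterwards picks out the wrong
bits on big-endian hosts, as this sketch shows (the 30-bit LBA mask mirrors
the BTT layout; the value is illustrative):

	#include <stdio.h>
	#include <stdint.h>

	#define ENT_LBA_MASK 0x3fffffffu	/* low 30 bits hold the LBA */

	static uint32_t bswap32(uint32_t v)
	{
		return (v << 24) | ((v & 0xff00u) << 8) |
		       ((v >> 8) & 0xff00u) | (v >> 24);
	}

	int main(void)
	{
		/* LBA 0x123 with the error flag (bit 30) set, as read raw
		 * on a big-endian CPU where le32_to_cpu() == bswap32() */
		uint32_t raw = bswap32(0x40000123u);

		printf("mask, then swap: 0x%08x\n",
		       bswap32(raw & ENT_LBA_MASK));	/* 0x40000123, flag leaks in */
		printf("swap, then mask: 0x%08x\n",
		       bswap32(raw) & ENT_LBA_MASK);	/* 0x00000123, correct LBA */
		return 0;
	}
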
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 75a58a6..d47412d 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -180,7 +180,7 @@
 	sector_t sector;
 
 	/* make sure device is a region */
-	if (!is_nd_pmem(dev))
+	if (!is_memory(dev))
 		return 0;
 
 	nd_region = to_nd_region(dev);
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 4340132..cca0a3b 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1987,7 +1987,7 @@
 		nd_mapping = &nd_region->mapping[i];
 		label_ent = list_first_entry_or_null(&nd_mapping->labels,
 				typeof(*label_ent), list);
-		label0 = label_ent ? label_ent->label : 0;
+		label0 = label_ent ? label_ent->label : NULL;
 
 		if (!label0) {
 			WARN_ON(1);
@@ -2322,8 +2322,9 @@
 			continue;
 
 		/* skip labels that describe extents outside of the region */
-		if (nd_label->dpa < nd_mapping->start || nd_label->dpa > map_end)
-			continue;
+		if (__le64_to_cpu(nd_label->dpa) < nd_mapping->start ||
+		    __le64_to_cpu(nd_label->dpa) > map_end)
+				continue;
 
 		i = add_namespace_resource(nd_region, nd_label, devs, count);
 		if (i < 0)
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index e89af4b..ee5c040 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -289,11 +289,7 @@
 struct nd_pfn *to_nd_pfn(struct device *dev);
 #if IS_ENABLED(CONFIG_NVDIMM_PFN)
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PFN_DEFAULT_ALIGNMENT HPAGE_PMD_SIZE
-#else
-#define PFN_DEFAULT_ALIGNMENT PAGE_SIZE
-#endif
+#define MAX_NVDIMM_ALIGN	4
 
 int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns);
 bool is_nd_pfn(struct device *dev);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index bb9cc5c..60d81fa 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -103,39 +103,42 @@
 	return sprintf(buf, "%ld\n", nd_pfn->align);
 }
 
-static const unsigned long *nd_pfn_supported_alignments(void)
+static unsigned long *nd_pfn_supported_alignments(unsigned long *alignments)
 {
-	/*
-	 * This needs to be a non-static variable because the *_SIZE
-	 * macros aren't always constants.
-	 */
-	const unsigned long supported_alignments[] = {
-		PAGE_SIZE,
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		HPAGE_PMD_SIZE,
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-		HPAGE_PUD_SIZE,
-#endif
-#endif
-		0,
-	};
-	static unsigned long data[ARRAY_SIZE(supported_alignments)];
 
-	memcpy(data, supported_alignments, sizeof(data));
+	alignments[0] = PAGE_SIZE;
 
-	return data;
+	if (has_transparent_hugepage()) {
+		alignments[1] = HPAGE_PMD_SIZE;
+		if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
+			alignments[2] = HPAGE_PUD_SIZE;
+	}
+
+	return alignments;
+}
+
+/*
+ * Use pmd mapping if supported as default alignment
+ */
+static unsigned long nd_pfn_default_alignment(void)
+{
+
+	if (has_transparent_hugepage())
+		return HPAGE_PMD_SIZE;
+	return PAGE_SIZE;
 }
 
 static ssize_t align_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
 	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+	unsigned long aligns[MAX_NVDIMM_ALIGN] = { [0] = 0, };
 	ssize_t rc;
 
 	nd_device_lock(dev);
 	nvdimm_bus_lock(dev);
 	rc = nd_size_select_store(dev, buf, &nd_pfn->align,
-			nd_pfn_supported_alignments());
+			nd_pfn_supported_alignments(aligns));
 	dev_dbg(dev, "result: %zd wrote: %s%s", rc, buf,
 			buf[len - 1] == '\n' ? "" : "\n");
 	nvdimm_bus_unlock(dev);
@@ -259,7 +262,10 @@
 static ssize_t supported_alignments_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	return nd_size_select_show(0, nd_pfn_supported_alignments(), buf);
+	unsigned long aligns[MAX_NVDIMM_ALIGN] = { [0] = 0, };
+
+	return nd_size_select_show(0,
+			nd_pfn_supported_alignments(aligns), buf);
 }
 static DEVICE_ATTR_RO(supported_alignments);
 
@@ -302,7 +308,7 @@
 		return NULL;
 
 	nd_pfn->mode = PFN_MODE_NONE;
-	nd_pfn->align = PFN_DEFAULT_ALIGNMENT;
+	nd_pfn->align = nd_pfn_default_alignment();
 	dev = &nd_pfn->dev;
 	device_initialize(&nd_pfn->dev);
 	if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) {
@@ -412,6 +418,21 @@
 	return 0;
 }
 
+static bool nd_supported_alignment(unsigned long align)
+{
+	int i;
+	unsigned long supported[MAX_NVDIMM_ALIGN] = { [0] = 0, };
+
+	if (align == 0)
+		return false;
+
+	nd_pfn_supported_alignments(supported);
+	for (i = 0; supported[i]; i++)
+		if (align == supported[i])
+			return true;
+	return false;
+}
+
 /**
  * nd_pfn_validate - read and validate info-block
  * @nd_pfn: fsdax namespace runtime state / properties
@@ -496,6 +517,18 @@
 		return -EOPNOTSUPP;
 	}
 
+	/*
+	 * Check whether we support the alignment. For DAX, if the
+	 * superblock alignment does not match, we won't initialize
+	 * the device.
+	 */
+	if (!nd_supported_alignment(align) &&
+			!memcmp(pfn_sb->signature, DAX_SIG, PFN_SIG_LEN)) {
+		dev_err(&nd_pfn->dev, "init failed, alignment mismatch: "
+				"%ld:%ld\n", nd_pfn->align, align);
+		return -EOPNOTSUPP;
+	}
+
 	if (!nd_pfn->uuid) {
 		/*
 		 * When probing a namepace via nd_pfn_probe() the uuid
@@ -639,9 +672,11 @@
 	struct nd_namespace_common *ndns = nd_pfn->ndns;
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
 	resource_size_t base = nsio->res.start + start_pad;
+	resource_size_t end = nsio->res.end - end_trunc;
 	struct vmem_altmap __altmap = {
 		.base_pfn = init_altmap_base(base),
 		.reserve = init_altmap_reserve(base),
+		.end_pfn = PHYS_PFN(end),
 	};
 
 	memcpy(res, &nsio->res, sizeof(*res));
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 37bf8719..0f6978e 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -34,7 +34,7 @@
 	if (rc)
 		return rc;
 
-	if (is_nd_pmem(&nd_region->dev)) {
+	if (is_memory(&nd_region->dev)) {
 		struct resource ndr_res;
 
 		if (devm_init_badblocks(dev, &nd_region->bb))
@@ -123,7 +123,7 @@
 		struct nd_region *nd_region = to_nd_region(dev);
 		struct resource res;
 
-		if (is_nd_pmem(&nd_region->dev)) {
+		if (is_memory(&nd_region->dev)) {
 			res.start = nd_region->ndr_start;
 			res.end = nd_region->ndr_start +
 				nd_region->ndr_size - 1;
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 3fd6b59..ef423ba1a7 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -632,11 +632,11 @@
 	if (!is_memory(dev) && a == &dev_attr_dax_seed.attr)
 		return 0;
 
-	if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr)
+	if (!is_memory(dev) && a == &dev_attr_badblocks.attr)
 		return 0;
 
 	if (a == &dev_attr_resource.attr) {
-		if (is_nd_pmem(dev))
+		if (is_memory(dev))
 			return 0400;
 		else
 			return 0;
@@ -1168,6 +1168,9 @@
 
 bool is_nvdimm_sync(struct nd_region *nd_region)
 {
+	if (is_nd_volatile(&nd_region->dev))
+		return true;
+
 	return is_nd_pmem(&nd_region->dev) &&
 		!test_bit(ND_REGION_ASYNC, &nd_region->flags);
 }
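
With the hunk above, volatile (ordinary memory) regions always report as synchronous, since plain RAM needs no explicit flushing. A hedged sketch of how a consumer might use the predicate, modeled on the pmem driver's DAX setup (the DAXDEV_F_SYNC use is illustrative):

	unsigned long flags = 0;

	if (is_nvdimm_sync(nd_region))
		flags |= DAXDEV_F_SYNC;	/* advertise synchronous DAX */
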
diff --git a/drivers/nvdimm/security.c b/drivers/nvdimm/security.c
index 9e45b20..89b8597 100644
--- a/drivers/nvdimm/security.c
+++ b/drivers/nvdimm/security.c
@@ -177,6 +177,10 @@
 			|| !nvdimm->sec.flags)
 		return -EIO;
 
+	/* No need to go further if security is disabled */
+	if (test_bit(NVDIMM_SECURITY_DISABLED, &nvdimm->sec.flags))
+		return 0;
+
 	if (test_bit(NDD_SECURITY_OVERWRITE, &nvdimm->flags)) {
 		dev_dbg(dev, "Security operation in progress.\n");
 		return -EBUSY;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1ede176..108f60b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -666,8 +666,6 @@
 			if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
 				return BLK_STS_NOTSUPP;
 			control |= NVME_RW_PRINFO_PRACT;
-		} else if (req_op(req) == REQ_OP_WRITE) {
-			t10_pi_prepare(req, ns->pi_type);
 		}
 
 		switch (ns->pi_type) {
@@ -690,13 +688,6 @@
 
 void nvme_cleanup_cmd(struct request *req)
 {
-	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
-	    nvme_req(req)->status == 0) {
-		struct nvme_ns *ns = req->rq_disk->private_data;
-
-		t10_pi_complete(req, ns->pi_type,
-				blk_rq_bytes(req) >> ns->lba_shift);
-	}
 	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
 		struct nvme_ns *ns = req->rq_disk->private_data;
 		struct page *page = req->special_vec.bv_page;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6b4d7b0..c0808f9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -549,8 +549,10 @@
 
 	WARN_ON_ONCE(!iod->nents);
 
-	/* P2PDMA requests do not need to be unmapped */
-	if (!is_pci_p2pdma_page(sg_page(iod->sg)))
+	if (is_pci_p2pdma_page(sg_page(iod->sg)))
+		pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
+				    rq_dma_dir(req));
+	else
 		dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
 
 
@@ -834,8 +836,8 @@
 		goto out;
 
 	if (is_pci_p2pdma_page(sg_page(iod->sg)))
-		nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents,
-					      rq_dma_dir(req));
+		nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
+				iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
 	else
 		nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
 					     rq_dma_dir(req), DMA_ATTR_NO_WARN);
diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c
index 000b957..bd6129d 100644
--- a/drivers/of/of_mdio.c
+++ b/drivers/of/of_mdio.c
@@ -362,7 +362,7 @@
 	int ret;
 
 	iface = of_get_phy_mode(np);
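+	/*
+	 * iface is an enum (phy_interface_t) whose signedness is
+	 * implementation-defined, so a negative errno returned by
+	 * of_get_phy_mode() must be detected through a cast.
+	 */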
-	if (iface < 0)
+	if ((int)iface < 0)
 		return NULL;
 	if (of_phy_is_fixed_link(np)) {
 		ret = of_phy_register_fixed_link(np);
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index c313de9..a304f5e 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -52,7 +52,7 @@
 	   If you don't know what to do here, say Y.
 
 config PCI_MSI_IRQ_DOMAIN
-	def_bool ARC || ARM || ARM64 || X86
+	def_bool ARC || ARM || ARM64 || X86 || RISCV
 	depends on PCI_MSI
 	select GENERIC_MSI_IRQ_DOMAIN
 
@@ -170,7 +170,7 @@
 
 	  Many PCIe root complexes do not support P2P transactions and
 	  it's hard to tell which support it at all, so at this time,
-	  P2P DMA transations must be between devices behind the same root
+	  P2P DMA transactions must be between devices behind the same root
 	  port.
 
 	  If unsure, say N.
@@ -181,7 +181,7 @@
 
 config PCI_HYPERV
         tristate "Hyper-V PCI Frontend"
-        depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+        depends on X86_64 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && SYSFS
 	select PCI_HYPERV_INTERFACE
         help
           The PCI device frontend driver allows the kernel to import arbitrary
diff --git a/drivers/pci/access.c b/drivers/pci/access.c
index 544922f..2fccb57 100644
--- a/drivers/pci/access.c
+++ b/drivers/pci/access.c
@@ -336,15 +336,6 @@
 	return pcie_caps_reg(dev) & PCI_EXP_FLAGS_VERS;
 }
 
-static bool pcie_downstream_port(const struct pci_dev *dev)
-{
-	int type = pci_pcie_type(dev);
-
-	return type == PCI_EXP_TYPE_ROOT_PORT ||
-	       type == PCI_EXP_TYPE_DOWNSTREAM ||
-	       type == PCI_EXP_TYPE_PCIE_BRIDGE;
-}
-
 bool pcie_cap_has_lnkctl(const struct pci_dev *dev)
 {
 	int type = pci_pcie_type(dev);
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 495059d..8e40b3e 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -417,11 +417,9 @@
 		get_device(&bus->dev);
 	return bus;
 }
-EXPORT_SYMBOL(pci_bus_get);
 
 void pci_bus_put(struct pci_bus *bus)
 {
 	if (bus)
 		put_device(&bus->dev);
 }
-EXPORT_SYMBOL(pci_bus_put);
diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig
index 6ea778a..0ba988b 100644
--- a/drivers/pci/controller/dwc/Kconfig
+++ b/drivers/pci/controller/dwc/Kconfig
@@ -131,13 +131,29 @@
 	  DesignWare core functions to implement the driver.
 
 config PCI_LAYERSCAPE
-	bool "Freescale Layerscape PCIe controller"
+	bool "Freescale Layerscape PCIe controller - Host mode"
 	depends on OF && (ARM || ARCH_LAYERSCAPE || COMPILE_TEST)
 	depends on PCI_MSI_IRQ_DOMAIN
 	select MFD_SYSCON
 	select PCIE_DW_HOST
 	help
-	  Say Y here if you want PCIe controller support on Layerscape SoCs.
+	  Say Y here if you want to enable the PCIe controller on Layerscape
+	  SoCs to work in Host mode.
+	  This controller can work either as EP or RC. The RCW[HOST_AGT_PEX]
+	  determines which PCIe controller works in EP mode and which PCIe
+	  controller works in RC mode.
+
+config PCI_LAYERSCAPE_EP
+	bool "Freescale Layerscape PCIe controller - Endpoint mode"
+	depends on OF && (ARM || ARCH_LAYERSCAPE || COMPILE_TEST)
+	depends on PCI_ENDPOINT
+	select PCIE_DW_EP
+	help
+	  Say Y here if you want to enable the PCIe controller on Layerscape
+	  SoCs to work in Endpoint mode.
+	  This controller can work either as EP or RC. The RCW[HOST_AGT_PEX]
+	  determines which PCIe controller works in EP mode and which PCIe
+	  controller works in RC mode.
 
 config PCI_HISI
 	depends on OF && (ARM64 || COMPILE_TEST)
@@ -220,6 +236,16 @@
 	  and therefore the driver re-uses the DesignWare core functions to
 	  implement the driver.
 
+config PCIE_TEGRA194
+	tristate "NVIDIA Tegra194 (and later) PCIe controller"
+	depends on ARCH_TEGRA_194_SOC || COMPILE_TEST
+	depends on PCI_MSI_IRQ_DOMAIN
+	select PCIE_DW_HOST
+	select PHY_TEGRA194_P2U
+	help
+	  Say Y here if you want support for the DesignWare core based PCIe
+	  host controller found in the NVIDIA Tegra194 SoC.
+
 config PCIE_UNIPHIER
 	bool "Socionext UniPhier PCIe controllers"
 	depends on ARCH_UNIPHIER || COMPILE_TEST
@@ -230,4 +256,16 @@
 	  Say Y here if you want PCIe controller support on UniPhier SoCs.
 	  This driver supports LD20 and PXs3 SoCs.
 
+config PCIE_AL
+	bool "Amazon Annapurna Labs PCIe controller"
+	depends on OF && (ARM64 || COMPILE_TEST)
+	depends on PCI_MSI_IRQ_DOMAIN
+	select PCIE_DW_HOST
+	help
+	  Say Y here to enable support for Amazon's Annapurna Labs PCIe
+	  controller IP on Amazon SoCs. The PCIe controller uses the DesignWare
+	  core plus Annapurna Labs proprietary hardware wrappers. This is
+	  required only for DT-based platforms. ACPI platforms with the
+	  Annapurna Labs PCIe controller don't need to enable this.
+
 endmenu
diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile
index b085dfd..69faff3 100644
--- a/drivers/pci/controller/dwc/Makefile
+++ b/drivers/pci/controller/dwc/Makefile
@@ -8,13 +8,15 @@
 obj-$(CONFIG_PCI_IMX6) += pci-imx6.o
 obj-$(CONFIG_PCIE_SPEAR13XX) += pcie-spear13xx.o
 obj-$(CONFIG_PCI_KEYSTONE) += pci-keystone.o
-obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o pci-layerscape-ep.o
+obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o
+obj-$(CONFIG_PCI_LAYERSCAPE_EP) += pci-layerscape-ep.o
 obj-$(CONFIG_PCIE_QCOM) += pcie-qcom.o
 obj-$(CONFIG_PCIE_ARMADA_8K) += pcie-armada8k.o
 obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o
 obj-$(CONFIG_PCIE_KIRIN) += pcie-kirin.o
 obj-$(CONFIG_PCIE_HISI_STB) += pcie-histb.o
 obj-$(CONFIG_PCI_MESON) += pci-meson.o
+obj-$(CONFIG_PCIE_TEGRA194) += pcie-tegra194.o
 obj-$(CONFIG_PCIE_UNIPHIER) += pcie-uniphier.o
 
 # The following drivers are for devices that use the generic ACPI
diff --git a/drivers/pci/controller/dwc/pci-exynos.c b/drivers/pci/controller/dwc/pci-exynos.c
index cee5f2f..14a6ba4 100644
--- a/drivers/pci/controller/dwc/pci-exynos.c
+++ b/drivers/pci/controller/dwc/pci-exynos.c
@@ -465,7 +465,7 @@
 
 	ep->phy = devm_of_phy_get(dev, np, NULL);
 	if (IS_ERR(ep->phy)) {
-		if (PTR_ERR(ep->phy) == -EPROBE_DEFER)
+		if (PTR_ERR(ep->phy) != -ENODEV)
 			return PTR_ERR(ep->phy);
 
 		ep->phy = NULL;
diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c
index 9b5cb5b..acfbd34 100644
--- a/drivers/pci/controller/dwc/pci-imx6.c
+++ b/drivers/pci/controller/dwc/pci-imx6.c
@@ -57,6 +57,7 @@
 struct imx6_pcie_drvdata {
 	enum imx6_pcie_variants variant;
 	u32 flags;
+	int dbi_length;
 };
 
 struct imx6_pcie {
@@ -1173,8 +1174,8 @@
 
 	imx6_pcie->vpcie = devm_regulator_get_optional(&pdev->dev, "vpcie");
 	if (IS_ERR(imx6_pcie->vpcie)) {
-		if (PTR_ERR(imx6_pcie->vpcie) == -EPROBE_DEFER)
-			return -EPROBE_DEFER;
+		if (PTR_ERR(imx6_pcie->vpcie) != -ENODEV)
+			return PTR_ERR(imx6_pcie->vpcie);
 		imx6_pcie->vpcie = NULL;
 	}
 
@@ -1212,6 +1213,7 @@
 		.variant = IMX6Q,
 		.flags = IMX6_PCIE_FLAG_IMX6_PHY |
 			 IMX6_PCIE_FLAG_IMX6_SPEED_CHANGE,
+		.dbi_length = 0x200,
 	},
 	[IMX6SX] = {
 		.variant = IMX6SX,
@@ -1254,6 +1256,37 @@
 	.shutdown = imx6_pcie_shutdown,
 };
 
+static void imx6_pcie_quirk(struct pci_dev *dev)
+{
+	struct pci_bus *bus = dev->bus;
+	struct pcie_port *pp = bus->sysdata;
+
+	/* The bus parent is the PCI bridge; its parent is this platform driver */
+	if (!bus->dev.parent || !bus->dev.parent->parent)
+		return;
+
+	/* Make sure we only quirk devices associated with this driver */
+	if (bus->dev.parent->parent->driver != &imx6_pcie_driver.driver)
+		return;
+
+	if (bus->number == pp->root_bus_nr) {
+		struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+		struct imx6_pcie *imx6_pcie = to_imx6_pcie(pci);
+
+		/*
+		 * Limit config length to avoid the kernel reading beyond
+		 * the register set and causing an abort on i.MX 6Quad
+		 */
+		if (imx6_pcie->drvdata->dbi_length) {
+			dev->cfg_size = imx6_pcie->drvdata->dbi_length;
+			dev_info(&dev->dev, "Limiting cfg_size to %d\n",
+					dev->cfg_size);
+		}
+	}
+}
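+
+/*
+ * 0xabcd is the default device ID that DesignWare root ports report
+ * before a vendor-specific ID is programmed, hence the extra driver
+ * check above to avoid matching other DesignWare-based bridges.
+ */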
+DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_SYNOPSYS, 0xabcd,
+			PCI_CLASS_BRIDGE_PCI, 8, imx6_pcie_quirk);
+
 static int __init imx6_pcie_init(void)
 {
 #ifdef CONFIG_ARM
diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c
index be61d96..ca9aa45 100644
--- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
+++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
@@ -44,6 +44,7 @@
 	.linkup_notifier = false,
 	.msi_capable = true,
 	.msix_capable = false,
+	.bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4),
 };
 
 static const struct pci_epc_features*
diff --git a/drivers/pci/controller/dwc/pcie-al.c b/drivers/pci/controller/dwc/pcie-al.c
index 3ab58f0..1eeda2f 100644
--- a/drivers/pci/controller/dwc/pcie-al.c
+++ b/drivers/pci/controller/dwc/pcie-al.c
@@ -91,3 +91,368 @@
 };
 
 #endif /* defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) */
+
+#ifdef CONFIG_PCIE_AL
+
+#include <linux/of_pci.h>
+#include "pcie-designware.h"
+
+#define AL_PCIE_REV_ID_2	2
+#define AL_PCIE_REV_ID_3	3
+#define AL_PCIE_REV_ID_4	4
+
+#define AXI_BASE_OFFSET		0x0
+
+#define DEVICE_ID_OFFSET	0x16c
+
+#define DEVICE_REV_ID			0x0
+#define DEVICE_REV_ID_DEV_ID_MASK	GENMASK(31, 16)
+
+#define DEVICE_REV_ID_DEV_ID_X4		0
+#define DEVICE_REV_ID_DEV_ID_X8		2
+#define DEVICE_REV_ID_DEV_ID_X16	4
+
+#define OB_CTRL_REV1_2_OFFSET	0x0040
+#define OB_CTRL_REV3_5_OFFSET	0x0030
+
+#define CFG_TARGET_BUS			0x0
+#define CFG_TARGET_BUS_MASK_MASK	GENMASK(7, 0)
+#define CFG_TARGET_BUS_BUSNUM_MASK	GENMASK(15, 8)
+
+#define CFG_CONTROL			0x4
+#define CFG_CONTROL_SUBBUS_MASK		GENMASK(15, 8)
+#define CFG_CONTROL_SEC_BUS_MASK	GENMASK(23, 16)
+
+struct al_pcie_reg_offsets {
+	unsigned int ob_ctrl;
+};
+
+struct al_pcie_target_bus_cfg {
+	u8 reg_val;
+	u8 reg_mask;
+	u8 ecam_mask;
+};
+
+struct al_pcie {
+	struct dw_pcie *pci;
+	void __iomem *controller_base; /* base of PCIe unit (not DW core) */
+	struct device *dev;
+	resource_size_t ecam_size;
+	unsigned int controller_rev_id;
+	struct al_pcie_reg_offsets reg_offsets;
+	struct al_pcie_target_bus_cfg target_bus_cfg;
+};
+
+#define PCIE_ECAM_DEVFN(x)		(((x) & 0xff) << 12)
+
+#define to_al_pcie(x)		dev_get_drvdata((x)->dev)
+
+static inline u32 al_pcie_controller_readl(struct al_pcie *pcie, u32 offset)
+{
+	return readl_relaxed(pcie->controller_base + offset);
+}
+
+static inline void al_pcie_controller_writel(struct al_pcie *pcie, u32 offset,
+					     u32 val)
+{
+	writel_relaxed(val, pcie->controller_base + offset);
+}
+
+static int al_pcie_rev_id_get(struct al_pcie *pcie, unsigned int *rev_id)
+{
+	u32 dev_rev_id_val;
+	u32 dev_id_val;
+
+	dev_rev_id_val = al_pcie_controller_readl(pcie, AXI_BASE_OFFSET +
+						  DEVICE_ID_OFFSET +
+						  DEVICE_REV_ID);
+	dev_id_val = FIELD_GET(DEVICE_REV_ID_DEV_ID_MASK, dev_rev_id_val);
+
+	switch (dev_id_val) {
+	case DEVICE_REV_ID_DEV_ID_X4:
+		*rev_id = AL_PCIE_REV_ID_2;
+		break;
+	case DEVICE_REV_ID_DEV_ID_X8:
+		*rev_id = AL_PCIE_REV_ID_3;
+		break;
+	case DEVICE_REV_ID_DEV_ID_X16:
+		*rev_id = AL_PCIE_REV_ID_4;
+		break;
+	default:
+		dev_err(pcie->dev, "Unsupported dev_id_val (0x%x)\n",
+			dev_id_val);
+		return -EINVAL;
+	}
+
+	dev_dbg(pcie->dev, "dev_id_val: 0x%x\n", dev_id_val);
+
+	return 0;
+}
+
+static int al_pcie_reg_offsets_set(struct al_pcie *pcie)
+{
+	switch (pcie->controller_rev_id) {
+	case AL_PCIE_REV_ID_2:
+		pcie->reg_offsets.ob_ctrl = OB_CTRL_REV1_2_OFFSET;
+		break;
+	case AL_PCIE_REV_ID_3:
+	case AL_PCIE_REV_ID_4:
+		pcie->reg_offsets.ob_ctrl = OB_CTRL_REV3_5_OFFSET;
+		break;
+	default:
+		dev_err(pcie->dev, "Unsupported controller rev_id: 0x%x\n",
+			pcie->controller_rev_id);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static inline void al_pcie_target_bus_set(struct al_pcie *pcie,
+					  u8 target_bus,
+					  u8 mask_target_bus)
+{
+	u32 reg;
+
+	reg = FIELD_PREP(CFG_TARGET_BUS_MASK_MASK, mask_target_bus) |
+	      FIELD_PREP(CFG_TARGET_BUS_BUSNUM_MASK, target_bus);
+
+	al_pcie_controller_writel(pcie, AXI_BASE_OFFSET +
+				  pcie->reg_offsets.ob_ctrl + CFG_TARGET_BUS,
+				  reg);
+}
+
+static void __iomem *al_pcie_conf_addr_map(struct al_pcie *pcie,
+					   unsigned int busnr,
+					   unsigned int devfn)
+{
+	struct al_pcie_target_bus_cfg *target_bus_cfg = &pcie->target_bus_cfg;
+	unsigned int busnr_ecam = busnr & target_bus_cfg->ecam_mask;
+	unsigned int busnr_reg = busnr & target_bus_cfg->reg_mask;
+	struct pcie_port *pp = &pcie->pci->pp;
+	void __iomem *pci_base_addr;
+
+	pci_base_addr = (void __iomem *)((uintptr_t)pp->va_cfg0_base +
+					 (busnr_ecam << 20) +
+					 PCIE_ECAM_DEVFN(devfn));
+
+	if (busnr_reg != target_bus_cfg->reg_val) {
+		dev_dbg(pcie->pci->dev, "Changing target bus busnum val from 0x%x to 0x%x\n",
+			target_bus_cfg->reg_val, busnr_reg);
+		target_bus_cfg->reg_val = busnr_reg;
+		al_pcie_target_bus_set(pcie,
+				       target_bus_cfg->reg_val,
+				       target_bus_cfg->reg_mask);
+	}
+
+	return pci_base_addr;
+}
+
+static int al_pcie_rd_other_conf(struct pcie_port *pp, struct pci_bus *bus,
+				 unsigned int devfn, int where, int size,
+				 u32 *val)
+{
+	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+	struct al_pcie *pcie = to_al_pcie(pci);
+	unsigned int busnr = bus->number;
+	void __iomem *pci_addr;
+	int rc;
+
+	pci_addr = al_pcie_conf_addr_map(pcie, busnr, devfn);
+
+	rc = dw_pcie_read(pci_addr + where, size, val);
+
+	dev_dbg(pci->dev, "%d-byte config read from %04x:%02x:%02x.%d offset 0x%x (pci_addr: 0x%px) - val:0x%x\n",
+		size, pci_domain_nr(bus), bus->number,
+		PCI_SLOT(devfn), PCI_FUNC(devfn), where,
+		(pci_addr + where), *val);
+
+	return rc;
+}
+
+static int al_pcie_wr_other_conf(struct pcie_port *pp, struct pci_bus *bus,
+				 unsigned int devfn, int where, int size,
+				 u32 val)
+{
+	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+	struct al_pcie *pcie = to_al_pcie(pci);
+	unsigned int busnr = bus->number;
+	void __iomem *pci_addr;
+	int rc;
+
+	pci_addr = al_pcie_conf_addr_map(pcie, busnr, devfn);
+
+	rc = dw_pcie_write(pci_addr + where, size, val);
+
+	dev_dbg(pci->dev, "%d-byte config write to %04x:%02x:%02x.%d offset 0x%x (pci_addr: 0x%px) - val:0x%x\n",
+		size, pci_domain_nr(bus), bus->number,
+		PCI_SLOT(devfn), PCI_FUNC(devfn), where,
+		(pci_addr + where), val);
+
+	return rc;
+}
+
+static void al_pcie_config_prepare(struct al_pcie *pcie)
+{
+	struct al_pcie_target_bus_cfg *target_bus_cfg;
+	struct pcie_port *pp = &pcie->pci->pp;
+	unsigned int ecam_bus_mask;
+	u32 cfg_control_offset;
+	u8 subordinate_bus;
+	u8 secondary_bus;
+	u32 cfg_control;
+	u32 reg;
+
+	target_bus_cfg = &pcie->target_bus_cfg;
+
+	ecam_bus_mask = (pcie->ecam_size >> 20) - 1;
+	if (ecam_bus_mask > 255) {
+		dev_warn(pcie->dev, "ECAM window size is larger than 256MB. Cutting off at 256MB\n");
+		ecam_bus_mask = 255;
+	}
+
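+	/*
+	 * Worked example with a hypothetical 32 MB ECAM window:
+	 * ecam_bus_mask = (SZ_32M >> 20) - 1 = 0x1f, so bus number bits
+	 * [4:0] are decoded from the ECAM address itself while the
+	 * remaining bits [7:5] come from the cfg_target_bus register.
+	 */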
+	/* This portion is taken from the transaction address */
+	target_bus_cfg->ecam_mask = ecam_bus_mask;
+	/* This portion is taken from the cfg_target_bus reg */
+	target_bus_cfg->reg_mask = ~target_bus_cfg->ecam_mask;
+	target_bus_cfg->reg_val = pp->busn->start & target_bus_cfg->reg_mask;
+
+	al_pcie_target_bus_set(pcie, target_bus_cfg->reg_val,
+			       target_bus_cfg->reg_mask);
+
+	secondary_bus = pp->busn->start + 1;
+	subordinate_bus = pp->busn->end;
+
+	/* Set the valid values of secondary and subordinate buses */
+	cfg_control_offset = AXI_BASE_OFFSET + pcie->reg_offsets.ob_ctrl +
+			     CFG_CONTROL;
+
+	cfg_control = al_pcie_controller_readl(pcie, cfg_control_offset);
+
+	reg = cfg_control &
+	      ~(CFG_CONTROL_SEC_BUS_MASK | CFG_CONTROL_SUBBUS_MASK);
+
+	reg |= FIELD_PREP(CFG_CONTROL_SUBBUS_MASK, subordinate_bus) |
+	       FIELD_PREP(CFG_CONTROL_SEC_BUS_MASK, secondary_bus);
+
+	al_pcie_controller_writel(pcie, cfg_control_offset, reg);
+}
+
+static int al_pcie_host_init(struct pcie_port *pp)
+{
+	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+	struct al_pcie *pcie = to_al_pcie(pci);
+	int rc;
+
+	rc = al_pcie_rev_id_get(pcie, &pcie->controller_rev_id);
+	if (rc)
+		return rc;
+
+	rc = al_pcie_reg_offsets_set(pcie);
+	if (rc)
+		return rc;
+
+	al_pcie_config_prepare(pcie);
+
+	return 0;
+}
+
+static const struct dw_pcie_host_ops al_pcie_host_ops = {
+	.rd_other_conf = al_pcie_rd_other_conf,
+	.wr_other_conf = al_pcie_wr_other_conf,
+	.host_init = al_pcie_host_init,
+};
+
+static int al_add_pcie_port(struct pcie_port *pp,
+			    struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	int ret;
+
+	pp->ops = &al_pcie_host_ops;
+
+	ret = dw_pcie_host_init(pp);
+	if (ret) {
+		dev_err(dev, "failed to initialize host\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static const struct dw_pcie_ops dw_pcie_ops = {
+};
+
+static int al_pcie_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct resource *controller_res;
+	struct resource *ecam_res;
+	struct resource *dbi_res;
+	struct al_pcie *al_pcie;
+	struct dw_pcie *pci;
+
+	al_pcie = devm_kzalloc(dev, sizeof(*al_pcie), GFP_KERNEL);
+	if (!al_pcie)
+		return -ENOMEM;
+
+	pci = devm_kzalloc(dev, sizeof(*pci), GFP_KERNEL);
+	if (!pci)
+		return -ENOMEM;
+
+	pci->dev = dev;
+	pci->ops = &dw_pcie_ops;
+
+	al_pcie->pci = pci;
+	al_pcie->dev = dev;
+
+	dbi_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi");
+	pci->dbi_base = devm_pci_remap_cfg_resource(dev, dbi_res);
+	if (IS_ERR(pci->dbi_base)) {
+		dev_err(dev, "couldn't remap dbi base %pR\n", dbi_res);
+		return PTR_ERR(pci->dbi_base);
+	}
+
+	ecam_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "config");
+	if (!ecam_res) {
+		dev_err(dev, "couldn't find 'config' reg in DT\n");
+		return -ENOENT;
+	}
+	al_pcie->ecam_size = resource_size(ecam_res);
+
+	controller_res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+						      "controller");
+	al_pcie->controller_base = devm_ioremap_resource(dev, controller_res);
+	if (IS_ERR(al_pcie->controller_base)) {
+		dev_err(dev, "couldn't remap controller base %pR\n",
+			controller_res);
+		return PTR_ERR(al_pcie->controller_base);
+	}
+
+	dev_dbg(dev, "From DT: dbi_base: %pR, controller_base: %pR\n",
+		dbi_res, controller_res);
+
+	platform_set_drvdata(pdev, al_pcie);
+
+	return al_add_pcie_port(&pci->pp, pdev);
+}
+
+static const struct of_device_id al_pcie_of_match[] = {
+	{ .compatible = "amazon,al-alpine-v2-pcie",
+	},
+	{ .compatible = "amazon,al-alpine-v3-pcie",
+	},
+	{},
+};
+
+static struct platform_driver al_pcie_driver = {
+	.driver = {
+		.name	= "al-pcie",
+		.of_match_table = al_pcie_of_match,
+		.suppress_bind_attrs = true,
+	},
+	.probe = al_pcie_probe,
+};
+builtin_platform_driver(al_pcie_driver);
+
+#endif /* CONFIG_PCIE_AL */
diff --git a/drivers/pci/controller/dwc/pcie-armada8k.c b/drivers/pci/controller/dwc/pcie-armada8k.c
index 3d55dc7..4959654 100644
--- a/drivers/pci/controller/dwc/pcie-armada8k.c
+++ b/drivers/pci/controller/dwc/pcie-armada8k.c
@@ -118,11 +118,10 @@
 
 	for (i = 0; i < ARMADA8K_PCIE_MAX_LANES; i++) {
 		pcie->phy[i] = devm_of_phy_get_by_index(dev, node, i);
-		if (IS_ERR(pcie->phy[i]) &&
-		    (PTR_ERR(pcie->phy[i]) == -EPROBE_DEFER))
-			return PTR_ERR(pcie->phy[i]);
-
 		if (IS_ERR(pcie->phy[i])) {
+			if (PTR_ERR(pcie->phy[i]) != -ENODEV)
+				return PTR_ERR(pcie->phy[i]);
+
 			pcie->phy[i] = NULL;
 			continue;
 		}
diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c
index 2bf5a35..3dd2e26 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -40,39 +40,6 @@
 	__dw_pcie_ep_reset_bar(pci, bar, 0);
 }
 
-static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie *pci, u8 cap_ptr,
-			      u8 cap)
-{
-	u8 cap_id, next_cap_ptr;
-	u16 reg;
-
-	if (!cap_ptr)
-		return 0;
-
-	reg = dw_pcie_readw_dbi(pci, cap_ptr);
-	cap_id = (reg & 0x00ff);
-
-	if (cap_id > PCI_CAP_ID_MAX)
-		return 0;
-
-	if (cap_id == cap)
-		return cap_ptr;
-
-	next_cap_ptr = (reg & 0xff00) >> 8;
-	return __dw_pcie_ep_find_next_cap(pci, next_cap_ptr, cap);
-}
-
-static u8 dw_pcie_ep_find_capability(struct dw_pcie *pci, u8 cap)
-{
-	u8 next_cap_ptr;
-	u16 reg;
-
-	reg = dw_pcie_readw_dbi(pci, PCI_CAPABILITY_LIST);
-	next_cap_ptr = (reg & 0x00ff);
-
-	return __dw_pcie_ep_find_next_cap(pci, next_cap_ptr, cap);
-}
-
 static int dw_pcie_ep_write_header(struct pci_epc *epc, u8 func_no,
 				   struct pci_epf_header *hdr)
 {
@@ -531,6 +498,7 @@
 	int ret;
 	u32 reg;
 	void *addr;
+	u8 hdr_type;
 	unsigned int nbars;
 	unsigned int offset;
 	struct pci_epc *epc;
@@ -595,6 +563,13 @@
 	if (ep->ops->ep_init)
 		ep->ops->ep_init(ep);
 
+	hdr_type = dw_pcie_readb_dbi(pci, PCI_HEADER_TYPE);
+	if (hdr_type != PCI_HEADER_TYPE_NORMAL) {
+		dev_err(pci->dev, "PCIe controller is not set to EP mode (hdr_type:0x%x)!\n",
+			hdr_type);
+		return -EIO;
+	}
+
 	ret = of_property_read_u8(np, "max-functions", &epc->max_functions);
 	if (ret < 0)
 		epc->max_functions = 1;
@@ -612,9 +587,9 @@
 		dev_err(dev, "Failed to reserve memory for MSI/MSI-X\n");
 		return -ENOMEM;
 	}
-	ep->msi_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSI);
+	ep->msi_cap = dw_pcie_find_capability(pci, PCI_CAP_ID_MSI);
 
-	ep->msix_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSIX);
+	ep->msix_cap = dw_pcie_find_capability(pci, PCI_CAP_ID_MSIX);
 
 	offset = dw_pcie_ep_find_ext_capability(pci, PCI_EXT_CAP_ID_REBAR);
 	if (offset) {
diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c
index f93252d..0f36a92 100644
--- a/drivers/pci/controller/dwc/pcie-designware-host.c
+++ b/drivers/pci/controller/dwc/pcie-designware-host.c
@@ -323,6 +323,7 @@
 	struct pci_bus *child;
 	struct pci_host_bridge *bridge;
 	struct resource *cfg_res;
+	u32 hdr_type;
 	int ret;
 
 	raw_spin_lock_init(&pci->pp.lock);
@@ -464,6 +465,21 @@
 			goto err_free_msi;
 	}
 
+	ret = dw_pcie_rd_own_conf(pp, PCI_HEADER_TYPE, 1, &hdr_type);
+	if (ret != PCIBIOS_SUCCESSFUL) {
+		dev_err(pci->dev, "Failed reading PCI_HEADER_TYPE cfg space reg (ret: 0x%x)\n",
+			ret);
+		ret = pcibios_err_to_errno(ret);
+		goto err_free_msi;
+	}
+	if (hdr_type != PCI_HEADER_TYPE_BRIDGE) {
+		dev_err(pci->dev,
+			"PCIe controller is not set to bridge type (hdr_type: 0x%x)!\n",
+			hdr_type);
+		ret = -EIO;
+		goto err_free_msi;
+	}
+
 	pp->root_bus_nr = pp->busn->start;
 
 	bridge->dev.parent = dev;
@@ -628,6 +644,12 @@
 	u32 val, ctrl, num_ctrls;
 	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
 
+	/*
+	 * Enable write access to the DBI read-only registers so the
+	 * configuration below can update them. Write permission gets
+	 * disabled again towards the end of this function.
+	 */
+	dw_pcie_dbi_ro_wr_en(pci);
+
 	dw_pcie_setup(pci);
 
 	if (!pp->ops->msi_host_init) {
@@ -650,12 +672,10 @@
 	dw_pcie_writel_dbi(pci, PCI_BASE_ADDRESS_1, 0x00000000);
 
 	/* Setup interrupt pins */
-	dw_pcie_dbi_ro_wr_en(pci);
 	val = dw_pcie_readl_dbi(pci, PCI_INTERRUPT_LINE);
 	val &= 0xffff00ff;
 	val |= 0x00000100;
 	dw_pcie_writel_dbi(pci, PCI_INTERRUPT_LINE, val);
-	dw_pcie_dbi_ro_wr_dis(pci);
 
 	/* Setup bus numbers */
 	val = dw_pcie_readl_dbi(pci, PCI_PRIMARY_BUS);
@@ -687,15 +707,13 @@
 
 	dw_pcie_wr_own_conf(pp, PCI_BASE_ADDRESS_0, 4, 0);
 
-	/* Enable write permission for the DBI read-only register */
-	dw_pcie_dbi_ro_wr_en(pci);
 	/* Program correct class for RC */
 	dw_pcie_wr_own_conf(pp, PCI_CLASS_DEVICE, 2, PCI_CLASS_BRIDGE_PCI);
-	/* Better disable write permission right after the update */
-	dw_pcie_dbi_ro_wr_dis(pci);
 
 	dw_pcie_rd_own_conf(pp, PCIE_LINK_WIDTH_SPEED_CONTROL, 4, &val);
 	val |= PORT_LOGIC_SPEED_CHANGE;
 	dw_pcie_wr_own_conf(pp, PCIE_LINK_WIDTH_SPEED_CONTROL, 4, val);
+
+	dw_pcie_dbi_ro_wr_dis(pci);
 }
 EXPORT_SYMBOL_GPL(dw_pcie_setup_rc);
diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c
index 7d25102..820488d 100644
--- a/drivers/pci/controller/dwc/pcie-designware.c
+++ b/drivers/pci/controller/dwc/pcie-designware.c
@@ -14,6 +14,86 @@
 
 #include "pcie-designware.h"
 
+/*
+ * These interfaces resemble the pci_find_*capability() interfaces, but these
+ * are for configuring host controllers, which are bridges *to* PCI devices but
+ * are not PCI devices themselves.
+ */
+static u8 __dw_pcie_find_next_cap(struct dw_pcie *pci, u8 cap_ptr,
+				  u8 cap)
+{
+	u8 cap_id, next_cap_ptr;
+	u16 reg;
+
+	if (!cap_ptr)
+		return 0;
+
+	reg = dw_pcie_readw_dbi(pci, cap_ptr);
+	cap_id = (reg & 0x00ff);
+
+	if (cap_id > PCI_CAP_ID_MAX)
+		return 0;
+
+	if (cap_id == cap)
+		return cap_ptr;
+
+	next_cap_ptr = (reg & 0xff00) >> 8;
+	return __dw_pcie_find_next_cap(pci, next_cap_ptr, cap);
+}
+
+u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap)
+{
+	u8 next_cap_ptr;
+	u16 reg;
+
+	reg = dw_pcie_readw_dbi(pci, PCI_CAPABILITY_LIST);
+	next_cap_ptr = (reg & 0x00ff);
+
+	return __dw_pcie_find_next_cap(pci, next_cap_ptr, cap);
+}
+EXPORT_SYMBOL_GPL(dw_pcie_find_capability);
+
+static u16 dw_pcie_find_next_ext_capability(struct dw_pcie *pci, u16 start,
+					    u8 cap)
+{
+	u32 header;
+	int ttl;
+	int pos = PCI_CFG_SPACE_SIZE;
+
+	/* minimum 8 bytes per capability */
+	ttl = (PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) / 8;
+
+	if (start)
+		pos = start;
+
+	header = dw_pcie_readl_dbi(pci, pos);
+	/*
+	 * If we have no capabilities, this is indicated by cap ID,
+	 * cap version and next pointer all being 0.
+	 */
+	if (header == 0)
+		return 0;
+
+	while (ttl-- > 0) {
+		if (PCI_EXT_CAP_ID(header) == cap && pos != start)
+			return pos;
+
+		pos = PCI_EXT_CAP_NEXT(header);
+		if (pos < PCI_CFG_SPACE_SIZE)
+			break;
+
+		header = dw_pcie_readl_dbi(pci, pos);
+	}
+
+	return 0;
+}
+
+u16 dw_pcie_find_ext_capability(struct dw_pcie *pci, u8 cap)
+{
+	return dw_pcie_find_next_ext_capability(pci, 0, cap);
+}
+EXPORT_SYMBOL_GPL(dw_pcie_find_ext_capability);
+
 int dw_pcie_read(void __iomem *addr, int size, u32 *val)
 {
 	if (!IS_ALIGNED((uintptr_t)addr, size)) {
@@ -376,10 +456,11 @@
 		usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX);
 	}
 
-	dev_err(pci->dev, "Phy link never came up\n");
+	dev_info(pci->dev, "Phy link never came up\n");
 
 	return -ETIMEDOUT;
 }
+EXPORT_SYMBOL_GPL(dw_pcie_wait_for_link);
 
 int dw_pcie_link_up(struct dw_pcie *pci)
 {
@@ -423,8 +504,10 @@
 
 
 	ret = of_property_read_u32(np, "num-lanes", &lanes);
-	if (ret)
-		lanes = 0;
+	if (ret) {
+		dev_dbg(pci->dev, "num-lanes property not found\n");
+		return;
+	}
 
 	/* Set the number of lanes */
 	val = dw_pcie_readl_dbi(pci, PCIE_PORT_LINK_CONTROL);
@@ -466,4 +549,11 @@
 		break;
 	}
 	dw_pcie_writel_dbi(pci, PCIE_LINK_WIDTH_SPEED_CONTROL, val);
+
+	if (of_property_read_bool(np, "snps,enable-cdm-check")) {
+		val = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS);
+		val |= PCIE_PL_CHK_REG_CHK_REG_CONTINUOUS |
+		       PCIE_PL_CHK_REG_CHK_REG_START;
+		dw_pcie_writel_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS, val);
+	}
 }
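
Moving the capability walkers into the core (and exporting them) lets host-mode drivers share what used to be endpoint-only code; the new Tegra194 driver below is one such user. A usage sketch mirroring the converted call sites:

	u16 val;
	u8 msi_cap = dw_pcie_find_capability(pci, PCI_CAP_ID_MSI);
	u16 l1ss = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS);

	if (msi_cap)	/* config-space offset of the MSI capability */
		val = dw_pcie_readw_dbi(pci, msi_cap + PCI_MSI_FLAGS);
	/* l1ss can likewise seed ASPM setup, as in the Tegra194 driver */
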
diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h
index ffed084..5a18e94 100644
--- a/drivers/pci/controller/dwc/pcie-designware.h
+++ b/drivers/pci/controller/dwc/pcie-designware.h
@@ -86,6 +86,15 @@
 #define PCIE_MISC_CONTROL_1_OFF		0x8BC
 #define PCIE_DBI_RO_WR_EN		BIT(0)
 
+#define PCIE_PL_CHK_REG_CONTROL_STATUS			0xB20
+#define PCIE_PL_CHK_REG_CHK_REG_START			BIT(0)
+#define PCIE_PL_CHK_REG_CHK_REG_CONTINUOUS		BIT(1)
+#define PCIE_PL_CHK_REG_CHK_REG_COMPARISON_ERROR	BIT(16)
+#define PCIE_PL_CHK_REG_CHK_REG_LOGIC_ERROR		BIT(17)
+#define PCIE_PL_CHK_REG_CHK_REG_COMPLETE		BIT(18)
+
+#define PCIE_PL_CHK_REG_ERR_ADDR			0xB28
+
 /*
  * iATU Unroll-specific register definitions
  * From 4.80 core version the address translation will be made by unroll
@@ -251,6 +260,9 @@
 #define to_dw_pcie_from_ep(endpoint)   \
 		container_of((endpoint), struct dw_pcie, ep)
 
+u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap);
+u16 dw_pcie_find_ext_capability(struct dw_pcie *pci, u8 cap);
+
 int dw_pcie_read(void __iomem *addr, int size, u32 *val);
 int dw_pcie_write(void __iomem *addr, int size, u32 val);
 
diff --git a/drivers/pci/controller/dwc/pcie-histb.c b/drivers/pci/controller/dwc/pcie-histb.c
index 954bc2b..811b5c6 100644
--- a/drivers/pci/controller/dwc/pcie-histb.c
+++ b/drivers/pci/controller/dwc/pcie-histb.c
@@ -340,8 +340,8 @@
 
 	hipcie->vpcie = devm_regulator_get_optional(dev, "vpcie");
 	if (IS_ERR(hipcie->vpcie)) {
-		if (PTR_ERR(hipcie->vpcie) == -EPROBE_DEFER)
-			return -EPROBE_DEFER;
+		if (PTR_ERR(hipcie->vpcie) != -ENODEV)
+			return PTR_ERR(hipcie->vpcie);
 		hipcie->vpcie = NULL;
 	}
 
diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c
index 8df1914..c19617a 100644
--- a/drivers/pci/controller/dwc/pcie-kirin.c
+++ b/drivers/pci/controller/dwc/pcie-kirin.c
@@ -436,7 +436,7 @@
 	return 0;
 }
 
-static struct dw_pcie_ops kirin_dw_pcie_ops = {
+static const struct dw_pcie_ops kirin_dw_pcie_ops = {
 	.read_dbi = kirin_pcie_read_dbi,
 	.write_dbi = kirin_pcie_write_dbi,
 	.link_up = kirin_pcie_link_up,
diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c
new file mode 100644
index 0000000..f89f5ac
--- /dev/null
+++ b/drivers/pci/controller/dwc/pcie-tegra194.c
@@ -0,0 +1,1732 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PCIe host controller driver for Tegra194 SoC
+ *
+ * Copyright (C) 2019 NVIDIA Corporation.
+ *
+ * Author: Vidya Sagar <vidyas@nvidia.com>
+ */
+
+#include <linux/clk.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/gpio.h>
+#include <linux/interrupt.h>
+#include <linux/iopoll.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_gpio.h>
+#include <linux/of_irq.h>
+#include <linux/of_pci.h>
+#include <linux/pci.h>
+#include <linux/phy/phy.h>
+#include <linux/pinctrl/consumer.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/random.h>
+#include <linux/reset.h>
+#include <linux/resource.h>
+#include <linux/types.h>
+#include "pcie-designware.h"
+#include <soc/tegra/bpmp.h>
+#include <soc/tegra/bpmp-abi.h>
+#include "../../pci.h"
+
+#define APPL_PINMUX				0x0
+#define APPL_PINMUX_PEX_RST			BIT(0)
+#define APPL_PINMUX_CLKREQ_OVERRIDE_EN		BIT(2)
+#define APPL_PINMUX_CLKREQ_OVERRIDE		BIT(3)
+#define APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE_EN	BIT(4)
+#define APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE	BIT(5)
+#define APPL_PINMUX_CLKREQ_OUT_OVRD_EN		BIT(9)
+#define APPL_PINMUX_CLKREQ_OUT_OVRD		BIT(10)
+
+#define APPL_CTRL				0x4
+#define APPL_CTRL_SYS_PRE_DET_STATE		BIT(6)
+#define APPL_CTRL_LTSSM_EN			BIT(7)
+#define APPL_CTRL_HW_HOT_RST_EN			BIT(20)
+#define APPL_CTRL_HW_HOT_RST_MODE_MASK		GENMASK(1, 0)
+#define APPL_CTRL_HW_HOT_RST_MODE_SHIFT		22
+#define APPL_CTRL_HW_HOT_RST_MODE_IMDT_RST	0x1
+
+#define APPL_INTR_EN_L0_0			0x8
+#define APPL_INTR_EN_L0_0_LINK_STATE_INT_EN	BIT(0)
+#define APPL_INTR_EN_L0_0_MSI_RCV_INT_EN	BIT(4)
+#define APPL_INTR_EN_L0_0_INT_INT_EN		BIT(8)
+#define APPL_INTR_EN_L0_0_CDM_REG_CHK_INT_EN	BIT(19)
+#define APPL_INTR_EN_L0_0_SYS_INTR_EN		BIT(30)
+#define APPL_INTR_EN_L0_0_SYS_MSI_INTR_EN	BIT(31)
+
+#define APPL_INTR_STATUS_L0			0xC
+#define APPL_INTR_STATUS_L0_LINK_STATE_INT	BIT(0)
+#define APPL_INTR_STATUS_L0_INT_INT		BIT(8)
+#define APPL_INTR_STATUS_L0_CDM_REG_CHK_INT	BIT(18)
+
+#define APPL_INTR_EN_L1_0_0				0x1C
+#define APPL_INTR_EN_L1_0_0_LINK_REQ_RST_NOT_INT_EN	BIT(1)
+
+#define APPL_INTR_STATUS_L1_0_0				0x20
+#define APPL_INTR_STATUS_L1_0_0_LINK_REQ_RST_NOT_CHGED	BIT(1)
+
+#define APPL_INTR_STATUS_L1_1			0x2C
+#define APPL_INTR_STATUS_L1_2			0x30
+#define APPL_INTR_STATUS_L1_3			0x34
+#define APPL_INTR_STATUS_L1_6			0x3C
+#define APPL_INTR_STATUS_L1_7			0x40
+
+#define APPL_INTR_EN_L1_8_0			0x44
+#define APPL_INTR_EN_L1_8_BW_MGT_INT_EN		BIT(2)
+#define APPL_INTR_EN_L1_8_AUTO_BW_INT_EN	BIT(3)
+#define APPL_INTR_EN_L1_8_INTX_EN		BIT(11)
+#define APPL_INTR_EN_L1_8_AER_INT_EN		BIT(15)
+
+#define APPL_INTR_STATUS_L1_8_0			0x4C
+#define APPL_INTR_STATUS_L1_8_0_EDMA_INT_MASK	GENMASK(11, 6)
+#define APPL_INTR_STATUS_L1_8_0_BW_MGT_INT_STS	BIT(2)
+#define APPL_INTR_STATUS_L1_8_0_AUTO_BW_INT_STS	BIT(3)
+
+#define APPL_INTR_STATUS_L1_9			0x54
+#define APPL_INTR_STATUS_L1_10			0x58
+#define APPL_INTR_STATUS_L1_11			0x64
+#define APPL_INTR_STATUS_L1_13			0x74
+#define APPL_INTR_STATUS_L1_14			0x78
+#define APPL_INTR_STATUS_L1_15			0x7C
+#define APPL_INTR_STATUS_L1_17			0x88
+
+#define APPL_INTR_EN_L1_18				0x90
+#define APPL_INTR_EN_L1_18_CDM_REG_CHK_CMPLT		BIT(2)
+#define APPL_INTR_EN_L1_18_CDM_REG_CHK_CMP_ERR		BIT(1)
+#define APPL_INTR_EN_L1_18_CDM_REG_CHK_LOGIC_ERR	BIT(0)
+
+#define APPL_INTR_STATUS_L1_18				0x94
+#define APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMPLT	BIT(2)
+#define APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMP_ERR	BIT(1)
+#define APPL_INTR_STATUS_L1_18_CDM_REG_CHK_LOGIC_ERR	BIT(0)
+
+#define APPL_MSI_CTRL_2				0xB0
+
+#define APPL_LTR_MSG_1				0xC4
+#define LTR_MSG_REQ				BIT(15)
+#define LTR_MST_NO_SNOOP_SHIFT			16
+
+#define APPL_LTR_MSG_2				0xC8
+#define APPL_LTR_MSG_2_LTR_MSG_REQ_STATE	BIT(3)
+
+#define APPL_LINK_STATUS			0xCC
+#define APPL_LINK_STATUS_RDLH_LINK_UP		BIT(0)
+
+#define APPL_DEBUG				0xD0
+#define APPL_DEBUG_PM_LINKST_IN_L2_LAT		BIT(21)
+#define APPL_DEBUG_PM_LINKST_IN_L0		0x11
+#define APPL_DEBUG_LTSSM_STATE_MASK		GENMASK(8, 3)
+#define APPL_DEBUG_LTSSM_STATE_SHIFT		3
+#define LTSSM_STATE_PRE_DETECT			5
+
+#define APPL_RADM_STATUS			0xE4
+#define APPL_PM_XMT_TURNOFF_STATE		BIT(0)
+
+#define APPL_DM_TYPE				0x100
+#define APPL_DM_TYPE_MASK			GENMASK(3, 0)
+#define APPL_DM_TYPE_RP				0x4
+#define APPL_DM_TYPE_EP				0x0
+
+#define APPL_CFG_BASE_ADDR			0x104
+#define APPL_CFG_BASE_ADDR_MASK			GENMASK(31, 12)
+
+#define APPL_CFG_IATU_DMA_BASE_ADDR		0x108
+#define APPL_CFG_IATU_DMA_BASE_ADDR_MASK	GENMASK(31, 18)
+
+#define APPL_CFG_MISC				0x110
+#define APPL_CFG_MISC_SLV_EP_MODE		BIT(14)
+#define APPL_CFG_MISC_ARCACHE_MASK		GENMASK(13, 10)
+#define APPL_CFG_MISC_ARCACHE_SHIFT		10
+#define APPL_CFG_MISC_ARCACHE_VAL		3
+
+#define APPL_CFG_SLCG_OVERRIDE			0x114
+#define APPL_CFG_SLCG_OVERRIDE_SLCG_EN_MASTER	BIT(0)
+
+#define APPL_CAR_RESET_OVRD				0x12C
+#define APPL_CAR_RESET_OVRD_CYA_OVERRIDE_CORE_RST_N	BIT(0)
+
+#define IO_BASE_IO_DECODE				BIT(0)
+#define IO_BASE_IO_DECODE_BIT8				BIT(8)
+
+#define CFG_PREF_MEM_LIMIT_BASE_MEM_DECODE		BIT(0)
+#define CFG_PREF_MEM_LIMIT_BASE_MEM_LIMIT_DECODE	BIT(16)
+
+#define CFG_TIMER_CTRL_MAX_FUNC_NUM_OFF	0x718
+#define CFG_TIMER_CTRL_ACK_NAK_SHIFT	(19)
+
+#define EVENT_COUNTER_ALL_CLEAR		0x3
+#define EVENT_COUNTER_ENABLE_ALL	0x7
+#define EVENT_COUNTER_ENABLE_SHIFT	2
+#define EVENT_COUNTER_EVENT_SEL_MASK	GENMASK(7, 0)
+#define EVENT_COUNTER_EVENT_SEL_SHIFT	16
+#define EVENT_COUNTER_EVENT_Tx_L0S	0x2
+#define EVENT_COUNTER_EVENT_Rx_L0S	0x3
+#define EVENT_COUNTER_EVENT_L1		0x5
+#define EVENT_COUNTER_EVENT_L1_1	0x7
+#define EVENT_COUNTER_EVENT_L1_2	0x8
+#define EVENT_COUNTER_GROUP_SEL_SHIFT	24
+#define EVENT_COUNTER_GROUP_5		0x5
+
+#define PORT_LOGIC_ACK_F_ASPM_CTRL			0x70C
+#define ENTER_ASPM					BIT(30)
+#define L0S_ENTRANCE_LAT_SHIFT				24
+#define L0S_ENTRANCE_LAT_MASK				GENMASK(26, 24)
+#define L1_ENTRANCE_LAT_SHIFT				27
+#define L1_ENTRANCE_LAT_MASK				GENMASK(29, 27)
+#define N_FTS_SHIFT					8
+#define N_FTS_MASK					GENMASK(7, 0)
+#define N_FTS_VAL					52
+
+#define PORT_LOGIC_GEN2_CTRL				0x80C
+#define PORT_LOGIC_GEN2_CTRL_DIRECT_SPEED_CHANGE	BIT(17)
+#define FTS_MASK					GENMASK(7, 0)
+#define FTS_VAL						52
+
+#define PORT_LOGIC_MSI_CTRL_INT_0_EN		0x828
+
+#define GEN3_EQ_CONTROL_OFF			0x8a8
+#define GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_SHIFT	8
+#define GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_MASK	GENMASK(23, 8)
+#define GEN3_EQ_CONTROL_OFF_FB_MODE_MASK	GENMASK(3, 0)
+
+#define GEN3_RELATED_OFF			0x890
+#define GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL	BIT(0)
+#define GEN3_RELATED_OFF_GEN3_EQ_DISABLE	BIT(16)
+#define GEN3_RELATED_OFF_RATE_SHADOW_SEL_SHIFT	24
+#define GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK	GENMASK(25, 24)
+
+#define PORT_LOGIC_AMBA_ERROR_RESPONSE_DEFAULT	0x8D0
+#define AMBA_ERROR_RESPONSE_CRS_SHIFT		3
+#define AMBA_ERROR_RESPONSE_CRS_MASK		GENMASK(1, 0)
+#define AMBA_ERROR_RESPONSE_CRS_OKAY		0
+#define AMBA_ERROR_RESPONSE_CRS_OKAY_FFFFFFFF	1
+#define AMBA_ERROR_RESPONSE_CRS_OKAY_FFFF0001	2
+
+#define PORT_LOGIC_MSIX_DOORBELL			0x948
+
+#define CAP_SPCIE_CAP_OFF			0x154
+#define CAP_SPCIE_CAP_OFF_DSP_TX_PRESET0_MASK	GENMASK(3, 0)
+#define CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_MASK	GENMASK(11, 8)
+#define CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_SHIFT	8
+
+#define PME_ACK_TIMEOUT 10000
+
+#define LTSSM_TIMEOUT 50000	/* 50ms */
+
+#define GEN3_GEN4_EQ_PRESET_INIT	5
+
+#define GEN1_CORE_CLK_FREQ	62500000
+#define GEN2_CORE_CLK_FREQ	125000000
+#define GEN3_CORE_CLK_FREQ	250000000
+#define GEN4_CORE_CLK_FREQ	500000000
+
+static const unsigned int pcie_gen_freq[] = {
+	GEN1_CORE_CLK_FREQ,
+	GEN2_CORE_CLK_FREQ,
+	GEN3_CORE_CLK_FREQ,
+	GEN4_CORE_CLK_FREQ
+};
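+
+/*
+ * The negotiated link speed (PCI_EXP_LNKSTA_CLS, 1..4) indexes this
+ * table as pcie_gen_freq[speed - 1] once the link is up; a link that
+ * trains to Gen-3, for example, runs the core clock at 250 MHz.
+ */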
+
+static const u32 event_cntr_ctrl_offset[] = {
+	0x1d8,
+	0x1a8,
+	0x1a8,
+	0x1a8,
+	0x1c4,
+	0x1d8
+};
+
+static const u32 event_cntr_data_offset[] = {
+	0x1dc,
+	0x1ac,
+	0x1ac,
+	0x1ac,
+	0x1c8,
+	0x1dc
+};
+
+struct tegra_pcie_dw {
+	struct device *dev;
+	struct resource *appl_res;
+	struct resource *dbi_res;
+	struct resource *atu_dma_res;
+	void __iomem *appl_base;
+	struct clk *core_clk;
+	struct reset_control *core_apb_rst;
+	struct reset_control *core_rst;
+	struct dw_pcie pci;
+	struct tegra_bpmp *bpmp;
+
+	bool supports_clkreq;
+	bool enable_cdm_check;
+	bool link_state;
+	bool update_fc_fixup;
+	u8 init_link_width;
+	u32 msi_ctrl_int;
+	u32 num_lanes;
+	u32 max_speed;
+	u32 cid;
+	u32 cfg_link_cap_l1sub;
+	u32 pcie_cap_base;
+	u32 aspm_cmrt;
+	u32 aspm_pwr_on_t;
+	u32 aspm_l0s_enter_lat;
+
+	struct regulator *pex_ctl_supply;
+	struct regulator *slot_ctl_3v3;
+	struct regulator *slot_ctl_12v;
+
+	unsigned int phy_count;
+	struct phy **phys;
+
+	struct dentry *debugfs;
+};
+
+static inline struct tegra_pcie_dw *to_tegra_pcie(struct dw_pcie *pci)
+{
+	return container_of(pci, struct tegra_pcie_dw, pci);
+}
+
+static inline void appl_writel(struct tegra_pcie_dw *pcie, const u32 value,
+			       const u32 reg)
+{
+	writel_relaxed(value, pcie->appl_base + reg);
+}
+
+static inline u32 appl_readl(struct tegra_pcie_dw *pcie, const u32 reg)
+{
+	return readl_relaxed(pcie->appl_base + reg);
+}
+
+struct tegra_pcie_soc {
+	enum dw_pcie_device_mode mode;
+};
+
+static void apply_bad_link_workaround(struct pcie_port *pp)
+{
+	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+	struct tegra_pcie_dw *pcie = to_tegra_pcie(pci);
+	u32 current_link_width;
+	u16 val;
+
+	/*
+	 * NOTE: Since this scenario is uncommon and the link is not stable
+	 * anyway, don't wait to confirm whether the link is really
+	 * transitioning to Gen-2 speed.
+	 */
+	val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKSTA);
+	if (val & PCI_EXP_LNKSTA_LBMS) {
+		current_link_width = (val & PCI_EXP_LNKSTA_NLW) >>
+				     PCI_EXP_LNKSTA_NLW_SHIFT;
+		if (pcie->init_link_width > current_link_width) {
+			dev_warn(pci->dev, "PCIe link is bad, width reduced\n");
+			val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base +
+						PCI_EXP_LNKCTL2);
+			val &= ~PCI_EXP_LNKCTL2_TLS;
+			val |= PCI_EXP_LNKCTL2_TLS_2_5GT;
+			dw_pcie_writew_dbi(pci, pcie->pcie_cap_base +
+					   PCI_EXP_LNKCTL2, val);
+
+			val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base +
+						PCI_EXP_LNKCTL);
+			val |= PCI_EXP_LNKCTL_RL;
+			dw_pcie_writew_dbi(pci, pcie->pcie_cap_base +
+					   PCI_EXP_LNKCTL, val);
+		}
+	}
+}
+
+static irqreturn_t tegra_pcie_rp_irq_handler(struct tegra_pcie_dw *pcie)
+{
+	struct dw_pcie *pci = &pcie->pci;
+	struct pcie_port *pp = &pci->pp;
+	u32 val, tmp;
+	u16 val_w;
+
+	val = appl_readl(pcie, APPL_INTR_STATUS_L0);
+	if (val & APPL_INTR_STATUS_L0_LINK_STATE_INT) {
+		val = appl_readl(pcie, APPL_INTR_STATUS_L1_0_0);
+		if (val & APPL_INTR_STATUS_L1_0_0_LINK_REQ_RST_NOT_CHGED) {
+			appl_writel(pcie, val, APPL_INTR_STATUS_L1_0_0);
+
+			/* SBR & Surprise Link Down WAR */
+			val = appl_readl(pcie, APPL_CAR_RESET_OVRD);
+			val &= ~APPL_CAR_RESET_OVRD_CYA_OVERRIDE_CORE_RST_N;
+			appl_writel(pcie, val, APPL_CAR_RESET_OVRD);
+			udelay(1);
+			val = appl_readl(pcie, APPL_CAR_RESET_OVRD);
+			val |= APPL_CAR_RESET_OVRD_CYA_OVERRIDE_CORE_RST_N;
+			appl_writel(pcie, val, APPL_CAR_RESET_OVRD);
+
+			val = dw_pcie_readl_dbi(pci, PORT_LOGIC_GEN2_CTRL);
+			val |= PORT_LOGIC_GEN2_CTRL_DIRECT_SPEED_CHANGE;
+			dw_pcie_writel_dbi(pci, PORT_LOGIC_GEN2_CTRL, val);
+		}
+	}
+
+	if (val & APPL_INTR_STATUS_L0_INT_INT) {
+		val = appl_readl(pcie, APPL_INTR_STATUS_L1_8_0);
+		if (val & APPL_INTR_STATUS_L1_8_0_AUTO_BW_INT_STS) {
+			appl_writel(pcie,
+				    APPL_INTR_STATUS_L1_8_0_AUTO_BW_INT_STS,
+				    APPL_INTR_STATUS_L1_8_0);
+			apply_bad_link_workaround(pp);
+		}
+		if (val & APPL_INTR_STATUS_L1_8_0_BW_MGT_INT_STS) {
+			appl_writel(pcie,
+				    APPL_INTR_STATUS_L1_8_0_BW_MGT_INT_STS,
+				    APPL_INTR_STATUS_L1_8_0);
+
+			val_w = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base +
+						  PCI_EXP_LNKSTA);
+			dev_dbg(pci->dev, "Link Speed : Gen-%u\n", val_w &
+				PCI_EXP_LNKSTA_CLS);
+		}
+	}
+
+	val = appl_readl(pcie, APPL_INTR_STATUS_L0);
+	if (val & APPL_INTR_STATUS_L0_CDM_REG_CHK_INT) {
+		val = appl_readl(pcie, APPL_INTR_STATUS_L1_18);
+		tmp = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS);
+		if (val & APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMPLT) {
+			dev_info(pci->dev, "CDM check complete\n");
+			tmp |= PCIE_PL_CHK_REG_CHK_REG_COMPLETE;
+		}
+		if (val & APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMP_ERR) {
+			dev_err(pci->dev, "CDM comparison mismatch\n");
+			tmp |= PCIE_PL_CHK_REG_CHK_REG_COMPARISON_ERROR;
+		}
+		if (val & APPL_INTR_STATUS_L1_18_CDM_REG_CHK_LOGIC_ERR) {
+			dev_err(pci->dev, "CDM Logic error\n");
+			tmp |= PCIE_PL_CHK_REG_CHK_REG_LOGIC_ERROR;
+		}
+		dw_pcie_writel_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS, tmp);
+		tmp = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_ERR_ADDR);
+		dev_err(pci->dev, "CDM Error Address Offset = 0x%08X\n", tmp);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t tegra_pcie_irq_handler(int irq, void *arg)
+{
+	struct tegra_pcie_dw *pcie = arg;
+
+	return tegra_pcie_rp_irq_handler(pcie);
+}
+
+static int tegra_pcie_dw_rd_own_conf(struct pcie_port *pp, int where, int size,
+				     u32 *val)
+{
+	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+
+	/*
+	 * This is an endpoint-mode specific register that happens to appear
+	 * even when the controller is operating in root port mode. The
+	 * system hangs when it is accessed with the link in the ASPM-L1
+	 * state, so skip accessing it altogether.
+	 */
+	if (where == PORT_LOGIC_MSIX_DOORBELL) {
+		*val = 0x00000000;
+		return PCIBIOS_SUCCESSFUL;
+	}
+
+	return dw_pcie_read(pci->dbi_base + where, size, val);
+}
+
+static int tegra_pcie_dw_wr_own_conf(struct pcie_port *pp, int where, int size,
+				     u32 val)
+{
+	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+
+	/*
+	 * This is an endpoint-mode specific register that happens to appear
+	 * even when the controller is operating in root port mode. The
+	 * system hangs when it is accessed with the link in the ASPM-L1
+	 * state, so skip accessing it altogether.
+	 */
+	if (where == PORT_LOGIC_MSIX_DOORBELL)
+		return PCIBIOS_SUCCESSFUL;
+
+	return dw_pcie_write(pci->dbi_base + where, size, val);
+}
+
+#if defined(CONFIG_PCIEASPM)
+static void disable_aspm_l11(struct tegra_pcie_dw *pcie)
+{
+	u32 val;
+
+	val = dw_pcie_readl_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub);
+	val &= ~PCI_L1SS_CAP_ASPM_L1_1;
+	dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val);
+}
+
+static void disable_aspm_l12(struct tegra_pcie_dw *pcie)
+{
+	u32 val;
+
+	val = dw_pcie_readl_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub);
+	val &= ~PCI_L1SS_CAP_ASPM_L1_2;
+	dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val);
+}
+
+static inline u32 event_counter_prog(struct tegra_pcie_dw *pcie, u32 event)
+{
+	u32 val;
+
+	val = dw_pcie_readl_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid]);
+	val &= ~(EVENT_COUNTER_EVENT_SEL_MASK << EVENT_COUNTER_EVENT_SEL_SHIFT);
+	val |= EVENT_COUNTER_GROUP_5 << EVENT_COUNTER_GROUP_SEL_SHIFT;
+	val |= event << EVENT_COUNTER_EVENT_SEL_SHIFT;
+	val |= EVENT_COUNTER_ENABLE_ALL << EVENT_COUNTER_ENABLE_SHIFT;
+	dw_pcie_writel_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid], val);
+	val = dw_pcie_readl_dbi(&pcie->pci, event_cntr_data_offset[pcie->cid]);
+
+	return val;
+}
+
+static int aspm_state_cnt(struct seq_file *s, void *data)
+{
+	struct tegra_pcie_dw *pcie = (struct tegra_pcie_dw *)
+				     dev_get_drvdata(s->private);
+	u32 val;
+
+	seq_printf(s, "Tx L0s entry count : %u\n",
+		   event_counter_prog(pcie, EVENT_COUNTER_EVENT_Tx_L0S));
+
+	seq_printf(s, "Rx L0s entry count : %u\n",
+		   event_counter_prog(pcie, EVENT_COUNTER_EVENT_Rx_L0S));
+
+	seq_printf(s, "Link L1 entry count : %u\n",
+		   event_counter_prog(pcie, EVENT_COUNTER_EVENT_L1));
+
+	seq_printf(s, "Link L1.1 entry count : %u\n",
+		   event_counter_prog(pcie, EVENT_COUNTER_EVENT_L1_1));
+
+	seq_printf(s, "Link L1.2 entry count : %u\n",
+		   event_counter_prog(pcie, EVENT_COUNTER_EVENT_L1_2));
+
+	/* Clear all counters */
+	dw_pcie_writel_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid],
+			   EVENT_COUNTER_ALL_CLEAR);
+
+	/* Re-enable counting */
+	val = EVENT_COUNTER_ENABLE_ALL << EVENT_COUNTER_ENABLE_SHIFT;
+	val |= EVENT_COUNTER_GROUP_5 << EVENT_COUNTER_GROUP_SEL_SHIFT;
+	dw_pcie_writel_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid], val);
+
+	return 0;
+}
+
+static void init_host_aspm(struct tegra_pcie_dw *pcie)
+{
+	struct dw_pcie *pci = &pcie->pci;
+	u32 val;
+
+	val = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS);
+	pcie->cfg_link_cap_l1sub = val + PCI_L1SS_CAP;
+
+	/* Enable ASPM counters */
+	val = EVENT_COUNTER_ENABLE_ALL << EVENT_COUNTER_ENABLE_SHIFT;
+	val |= EVENT_COUNTER_GROUP_5 << EVENT_COUNTER_GROUP_SEL_SHIFT;
+	dw_pcie_writel_dbi(pci, event_cntr_ctrl_offset[pcie->cid], val);
+
+	/* Program T_cmrt and T_pwr_on values */
+	val = dw_pcie_readl_dbi(pci, pcie->cfg_link_cap_l1sub);
+	val &= ~(PCI_L1SS_CAP_CM_RESTORE_TIME | PCI_L1SS_CAP_P_PWR_ON_VALUE);
+	val |= (pcie->aspm_cmrt << 8);
+	val |= (pcie->aspm_pwr_on_t << 19);
+	dw_pcie_writel_dbi(pci, pcie->cfg_link_cap_l1sub, val);
+
+	/* Program L0s and L1 entrance latencies */
+	val = dw_pcie_readl_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL);
+	val &= ~L0S_ENTRANCE_LAT_MASK;
+	val |= (pcie->aspm_l0s_enter_lat << L0S_ENTRANCE_LAT_SHIFT);
+	val |= ENTER_ASPM;
+	dw_pcie_writel_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL, val);
+}
+
+static int init_debugfs(struct tegra_pcie_dw *pcie)
+{
+	struct dentry *d;
+
+	d = debugfs_create_devm_seqfile(pcie->dev, "aspm_state_cnt",
+					pcie->debugfs, aspm_state_cnt);
+	if (IS_ERR_OR_NULL(d))
+		dev_err(pcie->dev,
+			"Failed to create debugfs file \"aspm_state_cnt\"\n");
+
+	return 0;
+}
+#else
+static inline void disable_aspm_l12(struct tegra_pcie_dw *pcie) { return; }
+static inline void disable_aspm_l11(struct tegra_pcie_dw *pcie) { return; }
+static inline void init_host_aspm(struct tegra_pcie_dw *pcie) { return; }
+static inline int init_debugfs(struct tegra_pcie_dw *pcie) { return 0; }
+#endif
+
+static void tegra_pcie_enable_system_interrupts(struct pcie_port *pp)
+{
+	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+	struct tegra_pcie_dw *pcie = to_tegra_pcie(pci);
+	u32 val;
+	u16 val_w;
+
+	val = appl_readl(pcie, APPL_INTR_EN_L0_0);
+	val |= APPL_INTR_EN_L0_0_LINK_STATE_INT_EN;
+	appl_writel(pcie, val, APPL_INTR_EN_L0_0);
+
+	val = appl_readl(pcie, APPL_INTR_EN_L1_0_0);
+	val |= APPL_INTR_EN_L1_0_0_LINK_REQ_RST_NOT_INT_EN;
+	appl_writel(pcie, val, APPL_INTR_EN_L1_0_0);
+
+	if (pcie->enable_cdm_check) {
+		val = appl_readl(pcie, APPL_INTR_EN_L0_0);
+		val |= APPL_INTR_EN_L0_0_CDM_REG_CHK_INT_EN;
+		appl_writel(pcie, val, APPL_INTR_EN_L0_0);
+
+		val = appl_readl(pcie, APPL_INTR_EN_L1_18);
+		val |= APPL_INTR_EN_L1_18_CDM_REG_CHK_CMP_ERR;
+		val |= APPL_INTR_EN_L1_18_CDM_REG_CHK_LOGIC_ERR;
+		appl_writel(pcie, val, APPL_INTR_EN_L1_18);
+	}
+
+	val_w = dw_pcie_readw_dbi(&pcie->pci, pcie->pcie_cap_base +
+				  PCI_EXP_LNKSTA);
+	pcie->init_link_width = (val_w & PCI_EXP_LNKSTA_NLW) >>
+				PCI_EXP_LNKSTA_NLW_SHIFT;
+
+	val_w = dw_pcie_readw_dbi(&pcie->pci, pcie->pcie_cap_base +
+				  PCI_EXP_LNKCTL);
+	val_w |= PCI_EXP_LNKCTL_LBMIE;
+	dw_pcie_writew_dbi(&pcie->pci, pcie->pcie_cap_base + PCI_EXP_LNKCTL,
+			   val_w);
+}
+
+static void tegra_pcie_enable_legacy_interrupts(struct pcie_port *pp)
+{
+	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+	struct tegra_pcie_dw *pcie = to_tegra_pcie(pci);
+	u32 val;
+
+	/* Enable legacy interrupt generation */
+	val = appl_readl(pcie, APPL_INTR_EN_L0_0);
+	val |= APPL_INTR_EN_L0_0_SYS_INTR_EN;
+	val |= APPL_INTR_EN_L0_0_INT_INT_EN;
+	appl_writel(pcie, val, APPL_INTR_EN_L0_0);
+
+	val = appl_readl(pcie, APPL_INTR_EN_L1_8_0);
+	val |= APPL_INTR_EN_L1_8_INTX_EN;
+	val |= APPL_INTR_EN_L1_8_AUTO_BW_INT_EN;
+	val |= APPL_INTR_EN_L1_8_BW_MGT_INT_EN;
+	if (IS_ENABLED(CONFIG_PCIEAER))
+		val |= APPL_INTR_EN_L1_8_AER_INT_EN;
+	appl_writel(pcie, val, APPL_INTR_EN_L1_8_0);
+}
+
+static void tegra_pcie_enable_msi_interrupts(struct pcie_port *pp)
+{
+	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+	struct tegra_pcie_dw *pcie = to_tegra_pcie(pci);
+	u32 val;
+
+	dw_pcie_msi_init(pp);
+
+	/* Enable MSI interrupt generation */
+	val = appl_readl(pcie, APPL_INTR_EN_L0_0);
+	val |= APPL_INTR_EN_L0_0_SYS_MSI_INTR_EN;
+	val |= APPL_INTR_EN_L0_0_MSI_RCV_INT_EN;
+	appl_writel(pcie, val, APPL_INTR_EN_L0_0);
+}
+
+static void tegra_pcie_enable_interrupts(struct pcie_port *pp)
+{
+	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+	struct tegra_pcie_dw *pcie = to_tegra_pcie(pci);
+
+	/* Clear interrupt statuses before enabling interrupts */
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L0);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_0_0);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_1);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_2);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_3);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_6);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_7);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_8_0);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_9);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_10);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_11);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_13);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_14);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_15);
+	appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_17);
+
+	tegra_pcie_enable_system_interrupts(pp);
+	tegra_pcie_enable_legacy_interrupts(pp);
+	if (IS_ENABLED(CONFIG_PCI_MSI))
+		tegra_pcie_enable_msi_interrupts(pp);
+}
+
+static void config_gen3_gen4_eq_presets(struct tegra_pcie_dw *pcie)
+{
+	struct dw_pcie *pci = &pcie->pci;
+	u32 val, offset, i;
+
+	/* Program init preset */
+	for (i = 0; i < pcie->num_lanes; i++) {
+		dw_pcie_read(pci->dbi_base + CAP_SPCIE_CAP_OFF
+				 + (i * 2), 2, &val);
+		val &= ~CAP_SPCIE_CAP_OFF_DSP_TX_PRESET0_MASK;
+		val |= GEN3_GEN4_EQ_PRESET_INIT;
+		val &= ~CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_MASK;
+		val |= (GEN3_GEN4_EQ_PRESET_INIT <<
+			   CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_SHIFT);
+		dw_pcie_write(pci->dbi_base + CAP_SPCIE_CAP_OFF
+				 + (i * 2), 2, val);
+
+		offset = dw_pcie_find_ext_capability(pci,
+						     PCI_EXT_CAP_ID_PL_16GT) +
+				PCI_PL_16GT_LE_CTRL;
+		dw_pcie_read(pci->dbi_base + offset + i, 1, &val);
+		val &= ~PCI_PL_16GT_LE_CTRL_DSP_TX_PRESET_MASK;
+		val |= GEN3_GEN4_EQ_PRESET_INIT;
+		val &= ~PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_MASK;
+		val |= (GEN3_GEN4_EQ_PRESET_INIT <<
+			PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_SHIFT);
+		dw_pcie_write(pci->dbi_base + offset + i, 1, val);
+	}
+
+	val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF);
+	val &= ~GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK;
+	dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val);
+
+	val = dw_pcie_readl_dbi(pci, GEN3_EQ_CONTROL_OFF);
+	val &= ~GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_MASK;
+	val |= (0x3ff << GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_SHIFT);
+	val &= ~GEN3_EQ_CONTROL_OFF_FB_MODE_MASK;
+	dw_pcie_writel_dbi(pci, GEN3_EQ_CONTROL_OFF, val);
+
+	val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF);
+	val &= ~GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK;
+	val |= (0x1 << GEN3_RELATED_OFF_RATE_SHADOW_SEL_SHIFT);
+	dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val);
+
+	val = dw_pcie_readl_dbi(pci, GEN3_EQ_CONTROL_OFF);
+	val &= ~GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_MASK;
+	val |= (0x360 << GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_SHIFT);
+	val &= ~GEN3_EQ_CONTROL_OFF_FB_MODE_MASK;
+	dw_pcie_writel_dbi(pci, GEN3_EQ_CONTROL_OFF, val);
+
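+	/* Restore the rate shadow select to the 8.0 GT/s registers */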
+	val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF);
+	val &= ~GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK;
+	dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val);
+}
+
+static void tegra_pcie_prepare_host(struct pcie_port *pp)
+{
+	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+	struct tegra_pcie_dw *pcie = to_tegra_pcie(pci);
+	u32 val;
+
+	val = dw_pcie_readl_dbi(pci, PCI_IO_BASE);
+	val &= ~(IO_BASE_IO_DECODE | IO_BASE_IO_DECODE_BIT8);
+	dw_pcie_writel_dbi(pci, PCI_IO_BASE, val);
+
+	val = dw_pcie_readl_dbi(pci, PCI_PREF_MEMORY_BASE);
+	val |= CFG_PREF_MEM_LIMIT_BASE_MEM_DECODE;
+	val |= CFG_PREF_MEM_LIMIT_BASE_MEM_LIMIT_DECODE;
+	dw_pcie_writel_dbi(pci, PCI_PREF_MEMORY_BASE, val);
+
+	dw_pcie_writel_dbi(pci, PCI_BASE_ADDRESS_0, 0);
+
+	/* Configure FTS */
+	val = dw_pcie_readl_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL);
+	val &= ~(N_FTS_MASK << N_FTS_SHIFT);
+	val |= N_FTS_VAL << N_FTS_SHIFT;
+	dw_pcie_writel_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL, val);
+
+	val = dw_pcie_readl_dbi(pci, PORT_LOGIC_GEN2_CTRL);
+	val &= ~FTS_MASK;
+	val |= FTS_VAL;
+	dw_pcie_writel_dbi(pci, PORT_LOGIC_GEN2_CTRL, val);
+
+	/* Return 0xFFFF0001 in response to CRS (Configuration Retry Status) */
+	val = dw_pcie_readl_dbi(pci, PORT_LOGIC_AMBA_ERROR_RESPONSE_DEFAULT);
+	val &= ~(AMBA_ERROR_RESPONSE_CRS_MASK << AMBA_ERROR_RESPONSE_CRS_SHIFT);
+	val |= (AMBA_ERROR_RESPONSE_CRS_OKAY_FFFF0001 <<
+		AMBA_ERROR_RESPONSE_CRS_SHIFT);
+	dw_pcie_writel_dbi(pci, PORT_LOGIC_AMBA_ERROR_RESPONSE_DEFAULT, val);
+
+	/* Configure Max Speed from DT */
+	if (pcie->max_speed && pcie->max_speed != -EINVAL) {
+		val = dw_pcie_readl_dbi(pci, pcie->pcie_cap_base +
+					PCI_EXP_LNKCAP);
+		val &= ~PCI_EXP_LNKCAP_SLS;
+		val |= pcie->max_speed;
+		dw_pcie_writel_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKCAP,
+				   val);
+	}
+
+	/* Configure Max lane width from DT */
+	val = dw_pcie_readl_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKCAP);
+	val &= ~PCI_EXP_LNKCAP_MLW;
+	val |= (pcie->num_lanes << PCI_EXP_LNKSTA_NLW_SHIFT);
+	dw_pcie_writel_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKCAP, val);
+
+	config_gen3_gen4_eq_presets(pcie);
+
+	init_host_aspm(pcie);
+
+	val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF);
+	val &= ~GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL;
+	dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val);
+
+	if (pcie->update_fc_fixup) {
+		val = dw_pcie_readl_dbi(pci, CFG_TIMER_CTRL_MAX_FUNC_NUM_OFF);
+		val |= 0x1 << CFG_TIMER_CTRL_ACK_NAK_SHIFT;
+		dw_pcie_writel_dbi(pci, CFG_TIMER_CTRL_MAX_FUNC_NUM_OFF, val);
+	}
+
+	dw_pcie_setup_rc(pp);
+
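+	/*
+	 * Start at the Gen4 core clock rate; it is adjusted to match the
+	 * negotiated link speed once the link is up
+	 */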
+	clk_set_rate(pcie->core_clk, GEN4_CORE_CLK_FREQ);
+
+	/* Assert RST */
+	val = appl_readl(pcie, APPL_PINMUX);
+	val &= ~APPL_PINMUX_PEX_RST;
+	appl_writel(pcie, val, APPL_PINMUX);
+
+	usleep_range(100, 200);
+
+	/* Enable LTSSM */
+	val = appl_readl(pcie, APPL_CTRL);
+	val |= APPL_CTRL_LTSSM_EN;
+	appl_writel(pcie, val, APPL_CTRL);
+
+	/* De-assert RST */
+	val = appl_readl(pcie, APPL_PINMUX);
+	val |= APPL_PINMUX_PEX_RST;
+	appl_writel(pcie, val, APPL_PINMUX);
+
+	msleep(100);
+}
+
+static int tegra_pcie_dw_host_init(struct pcie_port *pp)
+{
+	struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
+	struct tegra_pcie_dw *pcie = to_tegra_pcie(pci);
+	u32 val, tmp, offset, speed;
+
+	tegra_pcie_prepare_host(pp);
+
+	if (dw_pcie_wait_for_link(pci)) {
+		/*
+		 * Some endpoints can't establish a link if the root port has
+		 * the Data Link Feature (DLF) enabled. Refer to PCIe spec
+		 * r4.0 v1.0, sec 3.4.2 & 7.7.4 for more information on
+		 * Scaled Flow Control and DLF.
+		 * So confirm that this is indeed the case here and attempt
+		 * link-up once again with DLF disabled.
+		 */
+		val = appl_readl(pcie, APPL_DEBUG);
+		val &= APPL_DEBUG_LTSSM_STATE_MASK;
+		val >>= APPL_DEBUG_LTSSM_STATE_SHIFT;
+		tmp = appl_readl(pcie, APPL_LINK_STATUS);
+		tmp &= APPL_LINK_STATUS_RDLH_LINK_UP;
+		if (!(val == 0x11 && !tmp)) {
+			/* Link is down for reasons other than DLF */
+			return 0;
+		}
+
+		dev_info(pci->dev, "Link is down in DLL\n");
+		dev_info(pci->dev, "Trying again with DLFE disabled\n");
+		/* Disable LTSSM */
+		val = appl_readl(pcie, APPL_CTRL);
+		val &= ~APPL_CTRL_LTSSM_EN;
+		appl_writel(pcie, val, APPL_CTRL);
+
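+		/* Reset the core before disabling DLF and retrying link-up */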
+		reset_control_assert(pcie->core_rst);
+		reset_control_deassert(pcie->core_rst);
+
+		offset = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_DLF);
+		val = dw_pcie_readl_dbi(pci, offset + PCI_DLF_CAP);
+		val &= ~PCI_DLF_EXCHANGE_ENABLE;
+		dw_pcie_writel_dbi(pci, offset + PCI_DLF_CAP, val);
+
+		tegra_pcie_prepare_host(pp);
+
+		if (dw_pcie_wait_for_link(pci))
+			return 0;
+	}
+
+	speed = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKSTA) &
+		PCI_EXP_LNKSTA_CLS;
+	clk_set_rate(pcie->core_clk, pcie_gen_freq[speed - 1]);
+
+	tegra_pcie_enable_interrupts(pp);
+
+	return 0;
+}
+
+static int tegra_pcie_dw_link_up(struct dw_pcie *pci)
+{
+	struct tegra_pcie_dw *pcie = to_tegra_pcie(pci);
+	u32 val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKSTA);
+
+	return !!(val & PCI_EXP_LNKSTA_DLLLA);
+}
+
+static void tegra_pcie_set_msi_vec_num(struct pcie_port *pp)
+{
+	pp->num_vectors = MAX_MSI_IRQS;
+}
+
+static const struct dw_pcie_ops tegra_dw_pcie_ops = {
+	.link_up = tegra_pcie_dw_link_up,
+};
+
+static struct dw_pcie_host_ops tegra_pcie_dw_host_ops = {
+	.rd_own_conf = tegra_pcie_dw_rd_own_conf,
+	.wr_own_conf = tegra_pcie_dw_wr_own_conf,
+	.host_init = tegra_pcie_dw_host_init,
+	.set_num_vectors = tegra_pcie_set_msi_vec_num,
+};
+
+static void tegra_pcie_disable_phy(struct tegra_pcie_dw *pcie)
+{
+	unsigned int phy_count = pcie->phy_count;
+
+	while (phy_count--) {
+		phy_power_off(pcie->phys[phy_count]);
+		phy_exit(pcie->phys[phy_count]);
+	}
+}
+
+static int tegra_pcie_enable_phy(struct tegra_pcie_dw *pcie)
+{
+	unsigned int i;
+	int ret;
+
+	for (i = 0; i < pcie->phy_count; i++) {
+		ret = phy_init(pcie->phys[i]);
+		if (ret < 0)
+			goto phy_power_off;
+
+		ret = phy_power_on(pcie->phys[i]);
+		if (ret < 0)
+			goto phy_exit;
+	}
+
+	return 0;
+
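+	/*
+	 * Unwind in reverse order: entering at phy_exit skips the power-off
+	 * for the PHY whose phy_power_on() failed; the loop then powers off
+	 * and exits every PHY brought up before it.
+	 */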
+phy_power_off:
+	while (i--) {
+		phy_power_off(pcie->phys[i]);
+phy_exit:
+		phy_exit(pcie->phys[i]);
+	}
+
+	return ret;
+}
+
+static int tegra_pcie_dw_parse_dt(struct tegra_pcie_dw *pcie)
+{
+	struct device_node *np = pcie->dev->of_node;
+	int ret;
+
+	ret = of_property_read_u32(np, "nvidia,aspm-cmrt-us", &pcie->aspm_cmrt);
+	if (ret < 0) {
+		dev_info(pcie->dev, "Failed to read ASPM T_cmrt: %d\n", ret);
+		return ret;
+	}
+
+	ret = of_property_read_u32(np, "nvidia,aspm-pwr-on-t-us",
+				   &pcie->aspm_pwr_on_t);
+	if (ret < 0)
+		dev_info(pcie->dev, "Failed to read ASPM Power On time: %d\n",
+			 ret);
+
+	ret = of_property_read_u32(np, "nvidia,aspm-l0s-entrance-latency-us",
+				   &pcie->aspm_l0s_enter_lat);
+	if (ret < 0)
+		dev_info(pcie->dev,
+			 "Failed to read ASPM L0s Entrance latency: %d\n", ret);
+
+	ret = of_property_read_u32(np, "num-lanes", &pcie->num_lanes);
+	if (ret < 0) {
+		dev_err(pcie->dev, "Failed to read num-lanes: %d\n", ret);
+		return ret;
+	}
+
+	pcie->max_speed = of_pci_get_max_link_speed(np);
+
+	ret = of_property_read_u32_index(np, "nvidia,bpmp", 1, &pcie->cid);
+	if (ret) {
+		dev_err(pcie->dev, "Failed to read Controller-ID: %d\n", ret);
+		return ret;
+	}
+
+	ret = of_property_count_strings(np, "phy-names");
+	if (ret < 0) {
+		dev_err(pcie->dev, "Failed to find PHY entries: %d\n",
+			ret);
+		return ret;
+	}
+	pcie->phy_count = ret;
+
+	if (of_property_read_bool(np, "nvidia,update-fc-fixup"))
+		pcie->update_fc_fixup = true;
+
+	pcie->supports_clkreq = of_property_read_bool(np, "supports-clkreq");
+
+	pcie->enable_cdm_check =
+		of_property_read_bool(np, "snps,enable-cdm-check");
+
+	return 0;
+}
+
+static int tegra_pcie_bpmp_set_ctrl_state(struct tegra_pcie_dw *pcie,
+					  bool enable)
+{
+	struct mrq_uphy_response resp;
+	struct tegra_bpmp_message msg;
+	struct mrq_uphy_request req;
+
+	/* Controller-5 doesn't need to have its state set by BPMP-FW */
+	if (pcie->cid == 5)
+		return 0;
+
+	memset(&req, 0, sizeof(req));
+	memset(&resp, 0, sizeof(resp));
+
+	req.cmd = CMD_UPHY_PCIE_CONTROLLER_STATE;
+	req.controller_state.pcie_controller = pcie->cid;
+	req.controller_state.enable = enable;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.mrq = MRQ_UPHY;
+	msg.tx.data = &req;
+	msg.tx.size = sizeof(req);
+	msg.rx.data = &resp;
+	msg.rx.size = sizeof(resp);
+
+	return tegra_bpmp_transfer(pcie->bpmp, &msg);
+}
+
+static void tegra_pcie_downstream_dev_to_D0(struct tegra_pcie_dw *pcie)
+{
+	struct pcie_port *pp = &pcie->pci.pp;
+	struct pci_bus *child, *root_bus = NULL;
+	struct pci_dev *pdev;
+
+	/*
+	 * With some endpoints, the link doesn't go into the L2 state unless
+	 * they are in the D0 state. So make sure that the immediate
+	 * downstream devices are in D0 before sending PME_TurnOff to put the
+	 * link into L2.
+	 * This is per PCI Express Base r4.0 v1.0, September 27, 2017,
+	 * sec 5.2 Link State Power Management (page 428).
+	 */
+
+	list_for_each_entry(child, &pp->root_bus->children, node) {
+		/* Find the bus that is immediately below the root port */
+		if (child->parent == pp->root_bus) {
+			root_bus = child;
+			break;
+		}
+	}
+
+	if (!root_bus) {
+		dev_err(pcie->dev, "Failed to find downstream devices\n");
+		return;
+	}
+
+	list_for_each_entry(pdev, &root_bus->devices, bus_list) {
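+		/* Bring the downstream device at slot 0 to D0 if needed */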
+		if (PCI_SLOT(pdev->devfn) == 0) {
+			if (pci_set_power_state(pdev, PCI_D0))
+				dev_err(pcie->dev,
+					"Failed to transition %s to D0 state\n",
+					dev_name(&pdev->dev));
+		}
+	}
+}
+
+static int tegra_pcie_get_slot_regulators(struct tegra_pcie_dw *pcie)
+{
+	pcie->slot_ctl_3v3 = devm_regulator_get_optional(pcie->dev, "vpcie3v3");
+	if (IS_ERR(pcie->slot_ctl_3v3)) {
+		if (PTR_ERR(pcie->slot_ctl_3v3) != -ENODEV)
+			return PTR_ERR(pcie->slot_ctl_3v3);
+
+		pcie->slot_ctl_3v3 = NULL;
+	}
+
+	pcie->slot_ctl_12v = devm_regulator_get_optional(pcie->dev, "vpcie12v");
+	if (IS_ERR(pcie->slot_ctl_12v)) {
+		if (PTR_ERR(pcie->slot_ctl_12v) != -ENODEV)
+			return PTR_ERR(pcie->slot_ctl_12v);
+
+		pcie->slot_ctl_12v = NULL;
+	}
+
+	return 0;
+}
+
+static int tegra_pcie_enable_slot_regulators(struct tegra_pcie_dw *pcie)
+{
+	int ret;
+
+	if (pcie->slot_ctl_3v3) {
+		ret = regulator_enable(pcie->slot_ctl_3v3);
+		if (ret < 0) {
+			dev_err(pcie->dev,
+				"Failed to enable 3.3V slot supply: %d\n", ret);
+			return ret;
+		}
+	}
+
+	if (pcie->slot_ctl_12v) {
+		ret = regulator_enable(pcie->slot_ctl_12v);
+		if (ret < 0) {
+			dev_err(pcie->dev,
+				"Failed to enable 12V slot supply: %d\n", ret);
+			goto fail_12v_enable;
+		}
+	}
+
+	/*
+	 * According to PCI Express Card Electromechanical Specification
+	 * Revision 1.1, Table-2.4, T_PVPERL (Power stable to PERST# inactive)
+	 * should be a minimum of 100ms.
+	 */
+	if (pcie->slot_ctl_3v3 || pcie->slot_ctl_12v)
+		msleep(100);
+
+	return 0;
+
+fail_12v_enable:
+	if (pcie->slot_ctl_3v3)
+		regulator_disable(pcie->slot_ctl_3v3);
+	return ret;
+}
+
+static void tegra_pcie_disable_slot_regulators(struct tegra_pcie_dw *pcie)
+{
+	if (pcie->slot_ctl_12v)
+		regulator_disable(pcie->slot_ctl_12v);
+	if (pcie->slot_ctl_3v3)
+		regulator_disable(pcie->slot_ctl_3v3);
+}
+
+static int tegra_pcie_config_controller(struct tegra_pcie_dw *pcie,
+					bool en_hw_hot_rst)
+{
+	int ret;
+	u32 val;
+
+	ret = tegra_pcie_bpmp_set_ctrl_state(pcie, true);
+	if (ret) {
+		dev_err(pcie->dev,
+			"Failed to enable controller %u: %d\n", pcie->cid, ret);
+		return ret;
+	}
+
+	ret = tegra_pcie_enable_slot_regulators(pcie);
+	if (ret < 0)
+		goto fail_slot_reg_en;
+
+	ret = regulator_enable(pcie->pex_ctl_supply);
+	if (ret < 0) {
+		dev_err(pcie->dev, "Failed to enable regulator: %d\n", ret);
+		goto fail_reg_en;
+	}
+
+	ret = clk_prepare_enable(pcie->core_clk);
+	if (ret) {
+		dev_err(pcie->dev, "Failed to enable core clock: %d\n", ret);
+		goto fail_core_clk;
+	}
+
+	ret = reset_control_deassert(pcie->core_apb_rst);
+	if (ret) {
+		dev_err(pcie->dev, "Failed to deassert core APB reset: %d\n",
+			ret);
+		goto fail_core_apb_rst;
+	}
+
+	if (en_hw_hot_rst) {
+		/* Enable HW_HOT_RST mode */
+		val = appl_readl(pcie, APPL_CTRL);
+		val &= ~(APPL_CTRL_HW_HOT_RST_MODE_MASK <<
+			 APPL_CTRL_HW_HOT_RST_MODE_SHIFT);
+		val |= APPL_CTRL_HW_HOT_RST_EN;
+		appl_writel(pcie, val, APPL_CTRL);
+	}
+
+	ret = tegra_pcie_enable_phy(pcie);
+	if (ret) {
+		dev_err(pcie->dev, "Failed to enable PHY: %d\n", ret);
+		goto fail_phy;
+	}
+
+	/* Update CFG base address */
+	appl_writel(pcie, pcie->dbi_res->start & APPL_CFG_BASE_ADDR_MASK,
+		    APPL_CFG_BASE_ADDR);
+
+	/* Configure this core for RP mode operation */
+	appl_writel(pcie, APPL_DM_TYPE_RP, APPL_DM_TYPE);
+
+	appl_writel(pcie, 0x0, APPL_CFG_SLCG_OVERRIDE);
+
+	val = appl_readl(pcie, APPL_CTRL);
+	appl_writel(pcie, val | APPL_CTRL_SYS_PRE_DET_STATE, APPL_CTRL);
+
+	val = appl_readl(pcie, APPL_CFG_MISC);
+	val |= (APPL_CFG_MISC_ARCACHE_VAL << APPL_CFG_MISC_ARCACHE_SHIFT);
+	appl_writel(pcie, val, APPL_CFG_MISC);
+
+	if (!pcie->supports_clkreq) {
+		val = appl_readl(pcie, APPL_PINMUX);
+		val |= APPL_PINMUX_CLKREQ_OUT_OVRD_EN;
+		val |= APPL_PINMUX_CLKREQ_OUT_OVRD;
+		appl_writel(pcie, val, APPL_PINMUX);
+	}
+
+	/* Update iATU_DMA base address */
+	appl_writel(pcie,
+		    pcie->atu_dma_res->start & APPL_CFG_IATU_DMA_BASE_ADDR_MASK,
+		    APPL_CFG_IATU_DMA_BASE_ADDR);
+
+	reset_control_deassert(pcie->core_rst);
+
+	pcie->pcie_cap_base = dw_pcie_find_capability(&pcie->pci,
+						      PCI_CAP_ID_EXP);
+
+	/* Disable ASPM-L1SS advertisement as there is no CLKREQ routing */
+	if (!pcie->supports_clkreq) {
+		disable_aspm_l11(pcie);
+		disable_aspm_l12(pcie);
+	}
+
+	return ret;
+
+fail_phy:
+	reset_control_assert(pcie->core_apb_rst);
+fail_core_apb_rst:
+	clk_disable_unprepare(pcie->core_clk);
+fail_core_clk:
+	regulator_disable(pcie->pex_ctl_supply);
+fail_reg_en:
+	tegra_pcie_disable_slot_regulators(pcie);
+fail_slot_reg_en:
+	tegra_pcie_bpmp_set_ctrl_state(pcie, false);
+
+	return ret;
+}
+
+static int __deinit_controller(struct tegra_pcie_dw *pcie)
+{
+	int ret;
+
+	ret = reset_control_assert(pcie->core_rst);
+	if (ret) {
+		dev_err(pcie->dev, "Failed to assert \"core\" reset: %d\n",
+			ret);
+		return ret;
+	}
+
+	tegra_pcie_disable_phy(pcie);
+
+	ret = reset_control_assert(pcie->core_apb_rst);
+	if (ret) {
+		dev_err(pcie->dev, "Failed to assert APB reset: %d\n", ret);
+		return ret;
+	}
+
+	clk_disable_unprepare(pcie->core_clk);
+
+	ret = regulator_disable(pcie->pex_ctl_supply);
+	if (ret) {
+		dev_err(pcie->dev, "Failed to disable regulator: %d\n", ret);
+		return ret;
+	}
+
+	tegra_pcie_disable_slot_regulators(pcie);
+
+	ret = tegra_pcie_bpmp_set_ctrl_state(pcie, false);
+	if (ret) {
+		dev_err(pcie->dev, "Failed to disable controller %u: %d\n",
+			pcie->cid, ret);
+		return ret;
+	}
+
+	return ret;
+}
+
+static int tegra_pcie_init_controller(struct tegra_pcie_dw *pcie)
+{
+	struct dw_pcie *pci = &pcie->pci;
+	struct pcie_port *pp = &pci->pp;
+	int ret;
+
+	ret = tegra_pcie_config_controller(pcie, false);
+	if (ret < 0)
+		return ret;
+
+	pp->ops = &tegra_pcie_dw_host_ops;
+
+	ret = dw_pcie_host_init(pp);
+	if (ret < 0) {
+		dev_err(pcie->dev, "Failed to add PCIe port: %d\n", ret);
+		goto fail_host_init;
+	}
+
+	return 0;
+
+fail_host_init:
+	return __deinit_controller(pcie);
+}
+
+static int tegra_pcie_try_link_l2(struct tegra_pcie_dw *pcie)
+{
+	u32 val;
+
+	if (!tegra_pcie_dw_link_up(&pcie->pci))
+		return 0;
+
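+	/* Broadcast PME_Turn_Off and wait for the link to settle in L2 */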
+	val = appl_readl(pcie, APPL_RADM_STATUS);
+	val |= APPL_PM_XMT_TURNOFF_STATE;
+	appl_writel(pcie, val, APPL_RADM_STATUS);
+
+	return readl_poll_timeout_atomic(pcie->appl_base + APPL_DEBUG, val,
+				 val & APPL_DEBUG_PM_LINKST_IN_L2_LAT,
+				 1, PME_ACK_TIMEOUT);
+}
+
+static void tegra_pcie_dw_pme_turnoff(struct tegra_pcie_dw *pcie)
+{
+	u32 data;
+	int err;
+
+	if (!tegra_pcie_dw_link_up(&pcie->pci)) {
+		dev_dbg(pcie->dev, "PCIe link is not up\n");
+		return;
+	}
+
+	if (tegra_pcie_try_link_l2(pcie)) {
+		dev_info(pcie->dev, "Link didn't transition to L2 state\n");
+		/*
+		 * The TX lane clock frequency resets to Gen1 only when the
+		 * link is in the L2 or detect state.
+		 * So assert PERST# to the endpoint to force the root port
+		 * into the detect state.
+		 */
+		data = appl_readl(pcie, APPL_PINMUX);
+		data &= ~APPL_PINMUX_PEX_RST;
+		appl_writel(pcie, data, APPL_PINMUX);
+
+		err = readl_poll_timeout_atomic(pcie->appl_base + APPL_DEBUG,
+						data,
+						((data &
+						APPL_DEBUG_LTSSM_STATE_MASK) >>
+						APPL_DEBUG_LTSSM_STATE_SHIFT) ==
+						LTSSM_STATE_PRE_DETECT,
+						1, LTSSM_TIMEOUT);
+		if (err) {
+			dev_info(pcie->dev, "Link didn't go to detect state\n");
+		} else {
+			/* Disable LTSSM after link is in detect state */
+			data = appl_readl(pcie, APPL_CTRL);
+			data &= ~APPL_CTRL_LTSSM_EN;
+			appl_writel(pcie, data, APPL_CTRL);
+		}
+	}
+	/*
+	 * DBI registers may not be accessible after this point because PLL-E
+	 * may be powered down, depending on how the endpoint pulls CLKREQ
+	 */
+	data = appl_readl(pcie, APPL_PINMUX);
+	data |= (APPL_PINMUX_CLKREQ_OVERRIDE_EN | APPL_PINMUX_CLKREQ_OVERRIDE);
+	/* Cut REFCLK to slot */
+	data |= APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE_EN;
+	data &= ~APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE;
+	appl_writel(pcie, data, APPL_PINMUX);
+}
+
+static int tegra_pcie_deinit_controller(struct tegra_pcie_dw *pcie)
+{
+	tegra_pcie_downstream_dev_to_D0(pcie);
+	dw_pcie_host_deinit(&pcie->pci.pp);
+	tegra_pcie_dw_pme_turnoff(pcie);
+
+	return __deinit_controller(pcie);
+}
+
+static int tegra_pcie_config_rp(struct tegra_pcie_dw *pcie)
+{
+	struct pcie_port *pp = &pcie->pci.pp;
+	struct device *dev = pcie->dev;
+	char *name;
+	int ret;
+
+	if (IS_ENABLED(CONFIG_PCI_MSI)) {
+		pp->msi_irq = of_irq_get_byname(dev->of_node, "msi");
+		if (pp->msi_irq <= 0) {
+			dev_err(dev, "Failed to get MSI interrupt\n");
+			return pp->msi_irq ? pp->msi_irq : -ENODEV;
+		}
+	}
+
+	pm_runtime_enable(dev);
+
+	ret = pm_runtime_get_sync(dev);
+	if (ret < 0) {
+		dev_err(dev, "Failed to get runtime sync for PCIe dev: %d\n",
+			ret);
+		goto fail_pm_get_sync;
+	}
+
+	ret = pinctrl_pm_select_default_state(dev);
+	if (ret < 0) {
+		dev_err(dev, "Failed to configure sideband pins: %d\n", ret);
+		goto fail_pinctrl;
+	}
+
+	ret = tegra_pcie_init_controller(pcie);
+	if (ret < 0)
+		goto fail_pinctrl;
+
+	pcie->link_state = tegra_pcie_dw_link_up(&pcie->pci);
+	if (!pcie->link_state) {
+		ret = -ENOMEDIUM;
+		goto fail_host_init;
+	}
+
+	name = devm_kasprintf(dev, GFP_KERNEL, "%pOFP", dev->of_node);
+	if (!name) {
+		ret = -ENOMEM;
+		goto fail_host_init;
+	}
+
+	pcie->debugfs = debugfs_create_dir(name, NULL);
+	if (!pcie->debugfs)
+		dev_err(dev, "Failed to create debugfs\n");
+	else
+		init_debugfs(pcie);
+
+	return ret;
+
+fail_host_init:
+	tegra_pcie_deinit_controller(pcie);
+fail_pinctrl:
+	pm_runtime_put_sync(dev);
+fail_pm_get_sync:
+	pm_runtime_disable(dev);
+	return ret;
+}
+
+static int tegra_pcie_dw_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct resource *atu_dma_res;
+	struct tegra_pcie_dw *pcie;
+	struct resource *dbi_res;
+	struct pcie_port *pp;
+	struct dw_pcie *pci;
+	struct phy **phys;
+	char *name;
+	int ret;
+	u32 i;
+
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
+	if (!pcie)
+		return -ENOMEM;
+
+	pci = &pcie->pci;
+	pci->dev = &pdev->dev;
+	pci->ops = &tegra_dw_pcie_ops;
+	pp = &pci->pp;
+	pcie->dev = &pdev->dev;
+
+	ret = tegra_pcie_dw_parse_dt(pcie);
+	if (ret < 0) {
+		dev_err(dev, "Failed to parse device tree: %d\n", ret);
+		return ret;
+	}
+
+	ret = tegra_pcie_get_slot_regulators(pcie);
+	if (ret < 0) {
+		dev_err(dev, "Failed to get slot regulators: %d\n", ret);
+		return ret;
+	}
+
+	pcie->pex_ctl_supply = devm_regulator_get(dev, "vddio-pex-ctl");
+	if (IS_ERR(pcie->pex_ctl_supply)) {
+		ret = PTR_ERR(pcie->pex_ctl_supply);
+		if (ret != -EPROBE_DEFER)
+			dev_err(dev, "Failed to get regulator: %ld\n",
+				PTR_ERR(pcie->pex_ctl_supply));
+		return ret;
+	}
+
+	pcie->core_clk = devm_clk_get(dev, "core");
+	if (IS_ERR(pcie->core_clk)) {
+		dev_err(dev, "Failed to get core clock: %ld\n",
+			PTR_ERR(pcie->core_clk));
+		return PTR_ERR(pcie->core_clk);
+	}
+
+	pcie->appl_res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+						      "appl");
+	if (!pcie->appl_res) {
+		dev_err(dev, "Failed to find \"appl\" region\n");
+		return -ENODEV;
+	}
+
+	pcie->appl_base = devm_ioremap_resource(dev, pcie->appl_res);
+	if (IS_ERR(pcie->appl_base))
+		return PTR_ERR(pcie->appl_base);
+
+	pcie->core_apb_rst = devm_reset_control_get(dev, "apb");
+	if (IS_ERR(pcie->core_apb_rst)) {
+		dev_err(dev, "Failed to get APB reset: %ld\n",
+			PTR_ERR(pcie->core_apb_rst));
+		return PTR_ERR(pcie->core_apb_rst);
+	}
+
+	phys = devm_kcalloc(dev, pcie->phy_count, sizeof(*phys), GFP_KERNEL);
+	if (!phys)
+		return -ENOMEM;
+
+	for (i = 0; i < pcie->phy_count; i++) {
+		name = kasprintf(GFP_KERNEL, "p2u-%u", i);
+		if (!name) {
+			dev_err(dev, "Failed to create P2U string\n");
+			return -ENOMEM;
+		}
+		phys[i] = devm_phy_get(dev, name);
+		kfree(name);
+		if (IS_ERR(phys[i])) {
+			ret = PTR_ERR(phys[i]);
+			if (ret != -EPROBE_DEFER)
+				dev_err(dev, "Failed to get PHY: %d\n", ret);
+			return ret;
+		}
+	}
+
+	pcie->phys = phys;
+
+	dbi_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi");
+	if (!dbi_res) {
+		dev_err(dev, "Failed to find \"dbi\" region\n");
+		return -ENODEV;
+	}
+	pcie->dbi_res = dbi_res;
+
+	pci->dbi_base = devm_ioremap_resource(dev, dbi_res);
+	if (IS_ERR(pci->dbi_base))
+		return PTR_ERR(pci->dbi_base);
+
+	/* Tegra HW locates DBI2 at a fixed offset from DBI */
+	pci->dbi_base2 = pci->dbi_base + 0x1000;
+
+	atu_dma_res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+						   "atu_dma");
+	if (!atu_dma_res) {
+		dev_err(dev, "Failed to find \"atu_dma\" region\n");
+		return -ENODEV;
+	}
+	pcie->atu_dma_res = atu_dma_res;
+
+	pci->atu_base = devm_ioremap_resource(dev, atu_dma_res);
+	if (IS_ERR(pci->atu_base))
+		return PTR_ERR(pci->atu_base);
+
+	pcie->core_rst = devm_reset_control_get(dev, "core");
+	if (IS_ERR(pcie->core_rst)) {
+		dev_err(dev, "Failed to get core reset: %ld\n",
+			PTR_ERR(pcie->core_rst));
+		return PTR_ERR(pcie->core_rst);
+	}
+
+	pp->irq = platform_get_irq_byname(pdev, "intr");
+	if (pp->irq < 0) {
+		dev_err(dev, "Failed to get \"intr\" interrupt\n");
+		return pp->irq;
+	}
+
+	ret = devm_request_irq(dev, pp->irq, tegra_pcie_irq_handler,
+			       IRQF_SHARED, "tegra-pcie-intr", pcie);
+	if (ret) {
+		dev_err(dev, "Failed to request IRQ %d: %d\n", pp->irq, ret);
+		return ret;
+	}
+
+	pcie->bpmp = tegra_bpmp_get(dev);
+	if (IS_ERR(pcie->bpmp))
+		return PTR_ERR(pcie->bpmp);
+
+	platform_set_drvdata(pdev, pcie);
+
+	ret = tegra_pcie_config_rp(pcie);
+	if (ret && ret != -ENOMEDIUM)
+		goto fail;
+	else
+		return 0;
+
+fail:
+	tegra_bpmp_put(pcie->bpmp);
+	return ret;
+}
+
+static int tegra_pcie_dw_remove(struct platform_device *pdev)
+{
+	struct tegra_pcie_dw *pcie = platform_get_drvdata(pdev);
+
+	if (!pcie->link_state)
+		return 0;
+
+	debugfs_remove_recursive(pcie->debugfs);
+	tegra_pcie_deinit_controller(pcie);
+	pm_runtime_put_sync(pcie->dev);
+	pm_runtime_disable(pcie->dev);
+	tegra_bpmp_put(pcie->bpmp);
+
+	return 0;
+}
+
+static int tegra_pcie_dw_suspend_late(struct device *dev)
+{
+	struct tegra_pcie_dw *pcie = dev_get_drvdata(dev);
+	u32 val;
+
+	if (!pcie->link_state)
+		return 0;
+
+	/* Enable HW_HOT_RST mode */
+	val = appl_readl(pcie, APPL_CTRL);
+	val &= ~(APPL_CTRL_HW_HOT_RST_MODE_MASK <<
+		 APPL_CTRL_HW_HOT_RST_MODE_SHIFT);
+	val |= APPL_CTRL_HW_HOT_RST_EN;
+	appl_writel(pcie, val, APPL_CTRL);
+
+	return 0;
+}
+
+static int tegra_pcie_dw_suspend_noirq(struct device *dev)
+{
+	struct tegra_pcie_dw *pcie = dev_get_drvdata(dev);
+
+	if (!pcie->link_state)
+		return 0;
+
+	/* Save MSI interrupt vector */
+	pcie->msi_ctrl_int = dw_pcie_readl_dbi(&pcie->pci,
+					       PORT_LOGIC_MSI_CTRL_INT_0_EN);
+	tegra_pcie_downstream_dev_to_D0(pcie);
+	tegra_pcie_dw_pme_turnoff(pcie);
+
+	return __deinit_controller(pcie);
+}
+
+static int tegra_pcie_dw_resume_noirq(struct device *dev)
+{
+	struct tegra_pcie_dw *pcie = dev_get_drvdata(dev);
+	int ret;
+
+	if (!pcie->link_state)
+		return 0;
+
+	ret = tegra_pcie_config_controller(pcie, true);
+	if (ret < 0)
+		return ret;
+
+	ret = tegra_pcie_dw_host_init(&pcie->pci.pp);
+	if (ret < 0) {
+		dev_err(dev, "Failed to init host: %d\n", ret);
+		goto fail_host_init;
+	}
+
+	/* Restore MSI interrupt vector */
+	dw_pcie_writel_dbi(&pcie->pci, PORT_LOGIC_MSI_CTRL_INT_0_EN,
+			   pcie->msi_ctrl_int);
+
+	return 0;
+
+fail_host_init:
+	return __deinit_controller(pcie);
+}
+
+static int tegra_pcie_dw_resume_early(struct device *dev)
+{
+	struct tegra_pcie_dw *pcie = dev_get_drvdata(dev);
+	u32 val;
+
+	if (!pcie->link_state)
+		return 0;
+
+	/* Disable HW_HOT_RST mode */
+	val = appl_readl(pcie, APPL_CTRL);
+	val &= ~(APPL_CTRL_HW_HOT_RST_MODE_MASK <<
+		 APPL_CTRL_HW_HOT_RST_MODE_SHIFT);
+	val |= APPL_CTRL_HW_HOT_RST_MODE_IMDT_RST <<
+	       APPL_CTRL_HW_HOT_RST_MODE_SHIFT;
+	val &= ~APPL_CTRL_HW_HOT_RST_EN;
+	appl_writel(pcie, val, APPL_CTRL);
+
+	return 0;
+}
+
+static void tegra_pcie_dw_shutdown(struct platform_device *pdev)
+{
+	struct tegra_pcie_dw *pcie = platform_get_drvdata(pdev);
+
+	if (!pcie->link_state)
+		return;
+
+	debugfs_remove_recursive(pcie->debugfs);
+	tegra_pcie_downstream_dev_to_D0(pcie);
+
+	disable_irq(pcie->pci.pp.irq);
+	if (IS_ENABLED(CONFIG_PCI_MSI))
+		disable_irq(pcie->pci.pp.msi_irq);
+
+	tegra_pcie_dw_pme_turnoff(pcie);
+	__deinit_controller(pcie);
+}
+
+static const struct of_device_id tegra_pcie_dw_of_match[] = {
+	{
+		.compatible = "nvidia,tegra194-pcie",
+	},
+	{},
+};
+
+static const struct dev_pm_ops tegra_pcie_dw_pm_ops = {
+	.suspend_late = tegra_pcie_dw_suspend_late,
+	.suspend_noirq = tegra_pcie_dw_suspend_noirq,
+	.resume_noirq = tegra_pcie_dw_resume_noirq,
+	.resume_early = tegra_pcie_dw_resume_early,
+};
+
+static struct platform_driver tegra_pcie_dw_driver = {
+	.probe = tegra_pcie_dw_probe,
+	.remove = tegra_pcie_dw_remove,
+	.shutdown = tegra_pcie_dw_shutdown,
+	.driver = {
+		.name	= "tegra194-pcie",
+		.pm = &tegra_pcie_dw_pm_ops,
+		.of_match_table = tegra_pcie_dw_of_match,
+	},
+};
+module_platform_driver(tegra_pcie_dw_driver);
+
+MODULE_DEVICE_TABLE(of, tegra_pcie_dw_of_match);
+
+MODULE_AUTHOR("Vidya Sagar <vidyas@nvidia.com>");
+MODULE_DESCRIPTION("NVIDIA PCIe host controller driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c
index c742881..c8cb9c5 100644
--- a/drivers/pci/controller/pci-host-common.c
+++ b/drivers/pci/controller/pci-host-common.c
@@ -43,9 +43,8 @@
 		goto err_out;
 	}
 
-	err = devm_add_action(dev, gen_pci_unmap_cfg, cfg);
+	err = devm_add_action_or_reset(dev, gen_pci_unmap_cfg, cfg);
 	if (err) {
-		gen_pci_unmap_cfg(cfg);
 		goto err_out;
 	}
 	return cfg;
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 0ca73c8..f1f3002 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -2809,6 +2809,48 @@
 		complete(&hbus->remove_event);
 }
 
+#define HVPCI_DOM_MAP_SIZE (64 * 1024)
+static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
+
+/*
+ * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
+ * as invalid for passthrough PCI devices of this driver.
+ */
+#define HVPCI_DOM_INVALID 0
+
+/**
+ * hv_get_dom_num() - Get a valid PCI domain number
+ * @dom: Requested domain number
+ *
+ * Check if the PCI domain number is in use, and return another number if
+ * it is in use.
+ *
+ * Return: domain number on success, HVPCI_DOM_INVALID on failure
+ */
+static u16 hv_get_dom_num(u16 dom)
+{
+	unsigned int i;
+
+	if (test_and_set_bit(dom, hvpci_dom_map) == 0)
+		return dom;
+
+	for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
+		if (test_and_set_bit(i, hvpci_dom_map) == 0)
+			return i;
+	}
+
+	return HVPCI_DOM_INVALID;
+}
+
+/**
+ * hv_put_dom_num() - Mark the PCI domain number as free
+ * @dom: Domain number to be freed
+ */
+static void hv_put_dom_num(u16 dom)
+{
+	clear_bit(dom, hvpci_dom_map);
+}
+
 /**
  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
  * @hdev:	VMBus's tracking struct for this root PCI bus
@@ -2820,6 +2862,7 @@
 			const struct hv_vmbus_device_id *dev_id)
 {
 	struct hv_pcibus_device *hbus;
+	u16 dom_req, dom;
 	char *name;
 	int ret;
 
@@ -2835,19 +2878,34 @@
 	hbus->state = hv_pcibus_init;
 
 	/*
-	 * The PCI bus "domain" is what is called "segment" in ACPI and
-	 * other specs.  Pull it from the instance ID, to get something
-	 * unique.  Bytes 8 and 9 are what is used in Windows guests, so
-	 * do the same thing for consistency.  Note that, since this code
-	 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
-	 * that (1) the only domain in use for something that looks like
-	 * a physical PCI bus (which is actually emulated by the
-	 * hypervisor) is domain 0 and (2) there will be no overlap
-	 * between domains derived from these instance IDs in the same
-	 * VM.
+	 * The PCI bus "domain" is what is called "segment" in ACPI and other
+	 * specs. Pull it from the instance ID, to get something usually
+	 * unique. In rare cases of collision, we will find out another number
+	 * not in use.
+	 *
+	 * Note that, since this code only runs in a Hyper-V VM, Hyper-V
+	 * together with this guest driver can guarantee that (1) The only
+	 * domain used by Gen1 VMs for something that looks like a physical
+	 * PCI bus (which is actually emulated by the hypervisor) is domain 0.
+	 * (2) There will be no overlap between domains (after fixing possible
+	 * collisions) in the same VM.
 	 */
-	hbus->sysdata.domain = hdev->dev_instance.b[9] |
-			       hdev->dev_instance.b[8] << 8;
+	dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
+	dom = hv_get_dom_num(dom_req);
+
+	if (dom == HVPCI_DOM_INVALID) {
+		dev_err(&hdev->device,
+			"Unable to use dom# 0x%hx or other numbers\n", dom_req);
+		ret = -EINVAL;
+		goto free_bus;
+	}
+
+	if (dom != dom_req)
+		dev_info(&hdev->device,
+			 "PCI dom# 0x%hx has collision, using 0x%hx\n",
+			 dom_req, dom);
+
+	hbus->sysdata.domain = dom;
 
 	hbus->hdev = hdev;
 	refcount_set(&hbus->remove_lock, 1);
@@ -2862,7 +2920,7 @@
 					   hbus->sysdata.domain);
 	if (!hbus->wq) {
 		ret = -ENOMEM;
-		goto free_bus;
+		goto free_dom;
 	}
 
 	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
@@ -2946,6 +3004,8 @@
 	vmbus_close(hdev->channel);
 destroy_wq:
 	destroy_workqueue(hbus->wq);
+free_dom:
+	hv_put_dom_num(hbus->sysdata.domain);
 free_bus:
 	free_page((unsigned long)hbus);
 	return ret;
@@ -3008,8 +3068,8 @@
 		/* Remove the bus from PCI's point of view. */
 		pci_lock_rescan_remove();
 		pci_stop_root_bus(hbus->pci_bus);
-		pci_remove_root_bus(hbus->pci_bus);
 		hv_pci_remove_slots(hbus);
+		pci_remove_root_bus(hbus->pci_bus);
 		pci_unlock_rescan_remove();
 		hbus->state = hv_pcibus_removed;
 	}
@@ -3027,6 +3087,9 @@
 	put_hvpcibus(hbus);
 	wait_for_completion(&hbus->remove_event);
 	destroy_workqueue(hbus->wq);
+
+	hv_put_dom_num(hbus->sysdata.domain);
+
 	free_page((unsigned long)hbus);
 	return 0;
 }
@@ -3058,6 +3121,9 @@
 
 static int __init init_hv_pci_drv(void)
 {
+	/* Set the invalid domain number's bit, so it will not be used */
+	set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
+
 	/* Initialize PCI block r/w interface */
 	hvpci_block_ops.read_block = hv_read_config_block;
 	hvpci_block_ops.write_block = hv_write_config_block;
diff --git a/drivers/pci/controller/pci-tegra.c b/drivers/pci/controller/pci-tegra.c
index 9a917b2..673a172 100644
--- a/drivers/pci/controller/pci-tegra.c
+++ b/drivers/pci/controller/pci-tegra.c
@@ -2237,14 +2237,15 @@
 		err = of_pci_get_devfn(port);
 		if (err < 0) {
 			dev_err(dev, "failed to parse address: %d\n", err);
-			return err;
+			goto err_node_put;
 		}
 
 		index = PCI_SLOT(err);
 
 		if (index < 1 || index > soc->num_ports) {
 			dev_err(dev, "invalid port number: %d\n", index);
-			return -EINVAL;
+			err = -EINVAL;
+			goto err_node_put;
 		}
 
 		index--;
@@ -2253,12 +2254,13 @@
 		if (err < 0) {
 			dev_err(dev, "failed to parse # of lanes: %d\n",
 				err);
-			return err;
+			goto err_node_put;
 		}
 
 		if (value > 16) {
 			dev_err(dev, "invalid # of lanes: %u\n", value);
-			return -EINVAL;
+			err = -EINVAL;
+			goto err_node_put;
 		}
 
 		lanes |= value << (index << 3);
@@ -2272,13 +2274,15 @@
 		lane += value;
 
 		rp = devm_kzalloc(dev, sizeof(*rp), GFP_KERNEL);
-		if (!rp)
-			return -ENOMEM;
+		if (!rp) {
+			err = -ENOMEM;
+			goto err_node_put;
+		}
 
 		err = of_address_to_resource(port, 0, &rp->regs);
 		if (err < 0) {
 			dev_err(dev, "failed to parse address: %d\n", err);
-			return err;
+			goto err_node_put;
 		}
 
 		INIT_LIST_HEAD(&rp->list);
@@ -2330,6 +2334,10 @@
 		return err;
 
 	return 0;
+
+err_node_put:
+	of_node_put(port);
+	return err;
 }
 
 /*
diff --git a/drivers/pci/controller/pcie-iproc-platform.c b/drivers/pci/controller/pcie-iproc-platform.c
index 5a3550b..9ee6200 100644
--- a/drivers/pci/controller/pcie-iproc-platform.c
+++ b/drivers/pci/controller/pcie-iproc-platform.c
@@ -93,12 +93,9 @@
 	pcie->need_ib_cfg = of_property_read_bool(np, "dma-ranges");
 
 	/* PHY use is optional */
-	pcie->phy = devm_phy_get(dev, "pcie-phy");
-	if (IS_ERR(pcie->phy)) {
-		if (PTR_ERR(pcie->phy) == -EPROBE_DEFER)
-			return -EPROBE_DEFER;
-		pcie->phy = NULL;
-	}
+	pcie->phy = devm_phy_optional_get(dev, "pcie-phy");
+	if (IS_ERR(pcie->phy))
+		return PTR_ERR(pcie->phy);
 
 	ret = devm_of_pci_get_host_bridge_resources(dev, 0, 0xff, &resources,
 						    &iobase);
diff --git a/drivers/pci/controller/pcie-mediatek.c b/drivers/pci/controller/pcie-mediatek.c
index 80601e1..626a7c3 100644
--- a/drivers/pci/controller/pcie-mediatek.c
+++ b/drivers/pci/controller/pcie-mediatek.c
@@ -73,6 +73,7 @@
 #define PCIE_MSI_VECTOR		0x0c0
 
 #define PCIE_CONF_VEND_ID	0x100
+#define PCIE_CONF_DEVICE_ID	0x102
 #define PCIE_CONF_CLASS_ID	0x106
 
 #define PCIE_INT_MASK		0x420
@@ -141,12 +142,16 @@
 /**
  * struct mtk_pcie_soc - differentiate between host generations
  * @need_fix_class_id: whether this host's class ID needed to be fixed or not
+ * @need_fix_device_id: whether this host's device ID needs to be fixed or not
+ * @device_id: the device ID to which this host needs to be fixed
  * @ops: pointer to configuration access functions
  * @startup: pointer to controller setting functions
  * @setup_irq: pointer to initialize IRQ functions
  */
 struct mtk_pcie_soc {
 	bool need_fix_class_id;
+	bool need_fix_device_id;
+	unsigned int device_id;
 	struct pci_ops *ops;
 	int (*startup)(struct mtk_pcie_port *port);
 	int (*setup_irq)(struct mtk_pcie_port *port, struct device_node *node);
@@ -630,8 +635,6 @@
 	}
 
 	chained_irq_exit(irqchip, desc);
-
-	return;
 }
 
 static int mtk_pcie_setup_irq(struct mtk_pcie_port *port,
@@ -696,6 +699,9 @@
 		writew(val, port->base + PCIE_CONF_CLASS_ID);
 	}
 
+	if (soc->need_fix_device_id)
+		writew(soc->device_id, port->base + PCIE_CONF_DEVICE_ID);
+
 	/* 100ms timeout value should be enough for Gen1/2 training */
 	err = readl_poll_timeout(port->base + PCIE_LINK_STATUS_V2, val,
 				 !!(val & PCIE_PORT_LINKUP_V2), 20,
@@ -1216,11 +1222,21 @@
 	.setup_irq = mtk_pcie_setup_irq,
 };
 
+static const struct mtk_pcie_soc mtk_pcie_soc_mt7629 = {
+	.need_fix_class_id = true,
+	.need_fix_device_id = true,
+	.device_id = PCI_DEVICE_ID_MEDIATEK_7629,
+	.ops = &mtk_pcie_ops_v2,
+	.startup = mtk_pcie_startup_port_v2,
+	.setup_irq = mtk_pcie_setup_irq,
+};
+
 static const struct of_device_id mtk_pcie_ids[] = {
 	{ .compatible = "mediatek,mt2701-pcie", .data = &mtk_pcie_soc_v1 },
 	{ .compatible = "mediatek,mt7623-pcie", .data = &mtk_pcie_soc_v1 },
 	{ .compatible = "mediatek,mt2712-pcie", .data = &mtk_pcie_soc_mt2712 },
 	{ .compatible = "mediatek,mt7622-pcie", .data = &mtk_pcie_soc_mt7622 },
+	{ .compatible = "mediatek,mt7629-pcie", .data = &mtk_pcie_soc_mt7629 },
 	{},
 };
 
diff --git a/drivers/pci/controller/pcie-mobiveil.c b/drivers/pci/controller/pcie-mobiveil.c
index 672e633..a45a644 100644
--- a/drivers/pci/controller/pcie-mobiveil.c
+++ b/drivers/pci/controller/pcie-mobiveil.c
@@ -88,6 +88,7 @@
 #define  AMAP_CTRL_TYPE_MASK		3
 
 #define PAB_EXT_PEX_AMAP_SIZEN(win)	PAB_EXT_REG_ADDR(0xbef0, win)
+#define PAB_EXT_PEX_AMAP_AXI_WIN(win)	PAB_EXT_REG_ADDR(0xb4a0, win)
 #define PAB_PEX_AMAP_AXI_WIN(win)	PAB_REG_ADDR(0x4ba4, win)
 #define PAB_PEX_AMAP_PEX_WIN_L(win)	PAB_REG_ADDR(0x4ba8, win)
 #define PAB_PEX_AMAP_PEX_WIN_H(win)	PAB_REG_ADDR(0x4bac, win)
@@ -462,7 +463,7 @@
 }
 
 static void program_ib_windows(struct mobiveil_pcie *pcie, int win_num,
-			       u64 pci_addr, u32 type, u64 size)
+			       u64 cpu_addr, u64 pci_addr, u32 type, u64 size)
 {
 	u32 value;
 	u64 size64 = ~(size - 1);
@@ -482,7 +483,10 @@
 	csr_writel(pcie, upper_32_bits(size64),
 		   PAB_EXT_PEX_AMAP_SIZEN(win_num));
 
-	csr_writel(pcie, pci_addr, PAB_PEX_AMAP_AXI_WIN(win_num));
+	csr_writel(pcie, lower_32_bits(cpu_addr),
+		   PAB_PEX_AMAP_AXI_WIN(win_num));
+	csr_writel(pcie, upper_32_bits(cpu_addr),
+		   PAB_EXT_PEX_AMAP_AXI_WIN(win_num));
 
 	csr_writel(pcie, lower_32_bits(pci_addr),
 		   PAB_PEX_AMAP_PEX_WIN_L(win_num));
@@ -624,7 +628,7 @@
 			   CFG_WINDOW_TYPE, resource_size(pcie->ob_io_res));
 
 	/* memory inbound translation window */
-	program_ib_windows(pcie, WIN_NUM_0, 0, MEM_WINDOW_TYPE, IB_WIN_SIZE);
+	program_ib_windows(pcie, WIN_NUM_0, 0, 0, MEM_WINDOW_TYPE, IB_WIN_SIZE);
 
 	/* Get the I/O and memory ranges from DT */
 	resource_list_for_each_entry(win, &pcie->resources) {
diff --git a/drivers/pci/controller/pcie-rockchip-host.c b/drivers/pci/controller/pcie-rockchip-host.c
index 8d20f17..ef8e677 100644
--- a/drivers/pci/controller/pcie-rockchip-host.c
+++ b/drivers/pci/controller/pcie-rockchip-host.c
@@ -608,29 +608,29 @@
 
 	rockchip->vpcie12v = devm_regulator_get_optional(dev, "vpcie12v");
 	if (IS_ERR(rockchip->vpcie12v)) {
-		if (PTR_ERR(rockchip->vpcie12v) == -EPROBE_DEFER)
-			return -EPROBE_DEFER;
+		if (PTR_ERR(rockchip->vpcie12v) != -ENODEV)
+			return PTR_ERR(rockchip->vpcie12v);
 		dev_info(dev, "no vpcie12v regulator found\n");
 	}
 
 	rockchip->vpcie3v3 = devm_regulator_get_optional(dev, "vpcie3v3");
 	if (IS_ERR(rockchip->vpcie3v3)) {
-		if (PTR_ERR(rockchip->vpcie3v3) == -EPROBE_DEFER)
-			return -EPROBE_DEFER;
+		if (PTR_ERR(rockchip->vpcie3v3) != -ENODEV)
+			return PTR_ERR(rockchip->vpcie3v3);
 		dev_info(dev, "no vpcie3v3 regulator found\n");
 	}
 
 	rockchip->vpcie1v8 = devm_regulator_get_optional(dev, "vpcie1v8");
 	if (IS_ERR(rockchip->vpcie1v8)) {
-		if (PTR_ERR(rockchip->vpcie1v8) == -EPROBE_DEFER)
-			return -EPROBE_DEFER;
+		if (PTR_ERR(rockchip->vpcie1v8) != -ENODEV)
+			return PTR_ERR(rockchip->vpcie1v8);
 		dev_info(dev, "no vpcie1v8 regulator found\n");
 	}
 
 	rockchip->vpcie0v9 = devm_regulator_get_optional(dev, "vpcie0v9");
 	if (IS_ERR(rockchip->vpcie0v9)) {
-		if (PTR_ERR(rockchip->vpcie0v9) == -EPROBE_DEFER)
-			return -EPROBE_DEFER;
+		if (PTR_ERR(rockchip->vpcie0v9) != -ENODEV)
+			return PTR_ERR(rockchip->vpcie0v9);
 		dev_info(dev, "no vpcie0v9 regulator found\n");
 	}
 
diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index 4575e0c..a35d3f3 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -31,6 +31,9 @@
 #define PCI_REG_VMLOCK		0x70
 #define MB2_SHADOW_EN(vmlock)	(vmlock & 0x2)
 
+#define MB2_SHADOW_OFFSET	0x2000
+#define MB2_SHADOW_SIZE		16
+
 enum vmd_features {
 	/*
 	 * Device may contain registers which hint the physical location of the
@@ -94,6 +97,7 @@
 	struct resource		resources[3];
 	struct irq_domain	*irq_domain;
 	struct pci_bus		*bus;
+	u8			busn_start;
 
 	struct dma_map_ops	dma_ops;
 	struct dma_domain	dma_domain;
@@ -440,7 +444,8 @@
 				  unsigned int devfn, int reg, int len)
 {
 	char __iomem *addr = vmd->cfgbar +
-			     (bus->number << 20) + (devfn << 12) + reg;
+			     ((bus->number - vmd->busn_start) << 20) +
+			     (devfn << 12) + reg;
 
 	if ((addr - vmd->cfgbar) + len >=
 	    resource_size(&vmd->dev->resource[VMD_CFGBAR]))
@@ -563,7 +568,7 @@
 	unsigned long flags;
 	LIST_HEAD(resources);
 	resource_size_t offset[2] = {0};
-	resource_size_t membar2_offset = 0x2000, busn_start = 0;
+	resource_size_t membar2_offset = 0x2000;
 	struct pci_bus *child;
 
 	/*
@@ -576,7 +581,7 @@
 		u32 vmlock;
 		int ret;
 
-		membar2_offset = 0x2018;
+		membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE;
 		ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, &vmlock);
 		if (ret || vmlock == ~0)
 			return -ENODEV;
@@ -588,9 +593,9 @@
 			if (!membar2)
 				return -ENOMEM;
 			offset[0] = vmd->dev->resource[VMD_MEMBAR1].start -
-						readq(membar2 + 0x2008);
+					readq(membar2 + MB2_SHADOW_OFFSET);
 			offset[1] = vmd->dev->resource[VMD_MEMBAR2].start -
-						readq(membar2 + 0x2010);
+					readq(membar2 + MB2_SHADOW_OFFSET + 8);
 			pci_iounmap(vmd->dev, membar2);
 		}
 	}
@@ -606,14 +611,14 @@
 		pci_read_config_dword(vmd->dev, PCI_REG_VMCONFIG, &vmconfig);
 		if (BUS_RESTRICT_CAP(vmcap) &&
 		    (BUS_RESTRICT_CFG(vmconfig) == 0x1))
-			busn_start = 128;
+			vmd->busn_start = 128;
 	}
 
 	res = &vmd->dev->resource[VMD_CFGBAR];
 	vmd->resources[0] = (struct resource) {
 		.name  = "VMD CFGBAR",
-		.start = busn_start,
-		.end   = busn_start + (resource_size(res) >> 20) - 1,
+		.start = vmd->busn_start,
+		.end   = vmd->busn_start + (resource_size(res) >> 20) - 1,
 		.flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED,
 	};
 
@@ -681,8 +686,8 @@
 	pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]);
 	pci_add_resource_offset(&resources, &vmd->resources[2], offset[1]);
 
-	vmd->bus = pci_create_root_bus(&vmd->dev->dev, busn_start, &vmd_ops,
-				       sd, &resources);
+	vmd->bus = pci_create_root_bus(&vmd->dev->dev, vmd->busn_start,
+				       &vmd_ops, sd, &resources);
 	if (!vmd->bus) {
 		pci_free_resource_list(&resources);
 		irq_domain_remove(vmd->irq_domain);
diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c
index 603eadf..d0559d2 100644
--- a/drivers/pci/hotplug/cpci_hotplug_core.c
+++ b/drivers/pci/hotplug/cpci_hotplug_core.c
@@ -563,7 +563,6 @@
 	}
 cleanup_null:
 	up_write(&list_rwsem);
-	return;
 }
 
 int
diff --git a/drivers/pci/hotplug/cpqphp_core.c b/drivers/pci/hotplug/cpqphp_core.c
index 16bbb18..b8aacb4 100644
--- a/drivers/pci/hotplug/cpqphp_core.c
+++ b/drivers/pci/hotplug/cpqphp_core.c
@@ -173,7 +173,6 @@
 		dbg("%d %d %d %d\n", tbus, tdevice >> 3, tdevice & 0x7, tslot);
 
 	}
-	return;
 }
 
 
diff --git a/drivers/pci/hotplug/cpqphp_ctrl.c b/drivers/pci/hotplug/cpqphp_ctrl.c
index b7f4e1f..68de958 100644
--- a/drivers/pci/hotplug/cpqphp_ctrl.c
+++ b/drivers/pci/hotplug/cpqphp_ctrl.c
@@ -1872,8 +1872,6 @@
 			}
 		}		/* End of FOR loop */
 	}
-
-	return;
 }
 
 
@@ -1943,8 +1941,6 @@
 
 		p_slot->state = STATIC_STATE;
 	}
-
-	return;
 }
 
 
diff --git a/drivers/pci/hotplug/cpqphp_nvram.h b/drivers/pci/hotplug/cpqphp_nvram.h
index 918ff8d..70e879b 100644
--- a/drivers/pci/hotplug/cpqphp_nvram.h
+++ b/drivers/pci/hotplug/cpqphp_nvram.h
@@ -16,10 +16,7 @@
 
 #ifndef CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM
 
-static inline void compaq_nvram_init(void __iomem *rom_start)
-{
-	return;
-}
+static inline void compaq_nvram_init(void __iomem *rom_start) { }
 
 static inline int compaq_nvram_load(void __iomem *rom_start, struct controller *ctrl)
 {
diff --git a/drivers/pci/hotplug/ibmphp_res.c b/drivers/pci/hotplug/ibmphp_res.c
index 5e8caf7..5c93aa14 100644
--- a/drivers/pci/hotplug/ibmphp_res.c
+++ b/drivers/pci/hotplug/ibmphp_res.c
@@ -1941,6 +1941,7 @@
 						break;
 					case PCI_HEADER_TYPE_BRIDGE:
 						function = 0x8;
+						/* fall through */
 					case PCI_HEADER_TYPE_MULTIBRIDGE:
 						/* We assume here that only 1 bus behind the bridge
 						   TO DO: add functionality for several:
diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h
index 8c51a04..654c972 100644
--- a/drivers/pci/hotplug/pciehp.h
+++ b/drivers/pci/hotplug/pciehp.h
@@ -110,9 +110,9 @@
  *
  * @OFF_STATE: slot is powered off, no subordinate devices are enumerated
  * @BLINKINGON_STATE: slot will be powered on after the 5 second delay,
- *	green led is blinking
+ *	Power Indicator is blinking
  * @BLINKINGOFF_STATE: slot will be powered off after the 5 second delay,
- *	green led is blinking
+ *	Power Indicator is blinking
  * @POWERON_STATE: slot is currently powering on
  * @POWEROFF_STATE: slot is currently powering off
  * @ON_STATE: slot is powered on, subordinate devices have been enumerated
@@ -167,12 +167,11 @@
 void pciehp_power_off_slot(struct controller *ctrl);
 void pciehp_get_power_status(struct controller *ctrl, u8 *status);
 
-void pciehp_set_attention_status(struct controller *ctrl, u8 status);
+#define INDICATOR_NOOP -1	/* Leave indicator unchanged */
+void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn);
+
 void pciehp_get_latch_status(struct controller *ctrl, u8 *status);
 int pciehp_query_power_fault(struct controller *ctrl);
-void pciehp_green_led_on(struct controller *ctrl);
-void pciehp_green_led_off(struct controller *ctrl);
-void pciehp_green_led_blink(struct controller *ctrl);
 bool pciehp_card_present(struct controller *ctrl);
 bool pciehp_card_present_or_link_active(struct controller *ctrl);
 int pciehp_check_link_status(struct controller *ctrl);
diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
index 6ad0d86..b3122c1 100644
--- a/drivers/pci/hotplug/pciehp_core.c
+++ b/drivers/pci/hotplug/pciehp_core.c
@@ -95,15 +95,20 @@
 }
 
 /*
- * set_attention_status - Turns the Amber LED for a slot on, off or blink
+ * set_attention_status - Set the Attention Indicator to on, off, or blinking
  */
 static int set_attention_status(struct hotplug_slot *hotplug_slot, u8 status)
 {
 	struct controller *ctrl = to_ctrl(hotplug_slot);
 	struct pci_dev *pdev = ctrl->pcie->port;
 
+	if (status)
+		status <<= PCI_EXP_SLTCTL_ATTN_IND_SHIFT;
+	else
+		status = PCI_EXP_SLTCTL_ATTN_IND_OFF;
+
 	pci_config_pm_runtime_get(pdev);
-	pciehp_set_attention_status(ctrl, status);
+	pciehp_set_indicators(ctrl, INDICATOR_NOOP, status);
 	pci_config_pm_runtime_put(pdev);
 	return 0;
 }
diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c
index 631ced0..21af7b1 100644
--- a/drivers/pci/hotplug/pciehp_ctrl.c
+++ b/drivers/pci/hotplug/pciehp_ctrl.c
@@ -30,7 +30,10 @@
 
 static void set_slot_off(struct controller *ctrl)
 {
-	/* turn off slot, turn on Amber LED, turn off Green LED if supported*/
+	/*
+	 * Turn off slot, turn on attention indicator, turn off power
+	 * indicator
+	 */
 	if (POWER_CTRL(ctrl)) {
 		pciehp_power_off_slot(ctrl);
 
@@ -42,8 +45,8 @@
 		msleep(1000);
 	}
 
-	pciehp_green_led_off(ctrl);
-	pciehp_set_attention_status(ctrl, 1);
+	pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF,
+			      PCI_EXP_SLTCTL_ATTN_IND_ON);
 }
 
 /**
@@ -65,7 +68,8 @@
 			return retval;
 	}
 
-	pciehp_green_led_blink(ctrl);
+	pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK,
+			      INDICATOR_NOOP);
 
 	/* Check link training status */
 	retval = pciehp_check_link_status(ctrl);
@@ -90,8 +94,8 @@
 		}
 	}
 
-	pciehp_green_led_on(ctrl);
-	pciehp_set_attention_status(ctrl, 0);
+	pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_ON,
+			      PCI_EXP_SLTCTL_ATTN_IND_OFF);
 	return 0;
 
 err_exit:
@@ -100,7 +104,7 @@
 }
 
 /**
- * remove_board - Turns off slot and LEDs
+ * remove_board - Turn off slot and Power Indicator
  * @ctrl: PCIe hotplug controller where board is being removed
  * @safe_removal: whether the board is safely removed (versus surprise removed)
  */
@@ -123,8 +127,8 @@
 			   &ctrl->pending_events);
 	}
 
-	/* turn off Green LED */
-	pciehp_green_led_off(ctrl);
+	pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF,
+			      INDICATOR_NOOP);
 }
 
 static int pciehp_enable_slot(struct controller *ctrl);
@@ -171,9 +175,9 @@
 			ctrl_info(ctrl, "Slot(%s) Powering on due to button press\n",
 				  slot_name(ctrl));
 		}
-		/* blink green LED and turn off amber */
-		pciehp_green_led_blink(ctrl);
-		pciehp_set_attention_status(ctrl, 0);
+		/* blink power indicator and turn off attention */
+		pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK,
+				      PCI_EXP_SLTCTL_ATTN_IND_OFF);
 		schedule_delayed_work(&ctrl->button_work, 5 * HZ);
 		break;
 	case BLINKINGOFF_STATE:
@@ -187,12 +191,13 @@
 		cancel_delayed_work(&ctrl->button_work);
 		if (ctrl->state == BLINKINGOFF_STATE) {
 			ctrl->state = ON_STATE;
-			pciehp_green_led_on(ctrl);
+			pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_ON,
+					      PCI_EXP_SLTCTL_ATTN_IND_OFF);
 		} else {
 			ctrl->state = OFF_STATE;
-			pciehp_green_led_off(ctrl);
+			pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF,
+					      PCI_EXP_SLTCTL_ATTN_IND_OFF);
 		}
-		pciehp_set_attention_status(ctrl, 0);
 		ctrl_info(ctrl, "Slot(%s): Action canceled due to button press\n",
 			  slot_name(ctrl));
 		break;
@@ -310,7 +315,9 @@
 	pm_runtime_get_sync(&ctrl->pcie->port->dev);
 	ret = __pciehp_enable_slot(ctrl);
 	if (ret && ATTN_BUTTN(ctrl))
-		pciehp_green_led_off(ctrl); /* may be blinking */
+		/* may be blinking */
+		pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF,
+				      INDICATOR_NOOP);
 	pm_runtime_put(&ctrl->pcie->port->dev);
 
 	mutex_lock(&ctrl->state_lock);
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index bd990e3..1a522c1 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -418,65 +418,40 @@
 	return 0;
 }
 
-void pciehp_set_attention_status(struct controller *ctrl, u8 value)
+/**
+ * pciehp_set_indicators() - set attention indicator, power indicator, or both
+ * @ctrl: PCIe hotplug controller
+ * @pwr: one of:
+ *	PCI_EXP_SLTCTL_PWR_IND_ON
+ *	PCI_EXP_SLTCTL_PWR_IND_BLINK
+ *	PCI_EXP_SLTCTL_PWR_IND_OFF
+ * @attn: one of:
+ *	PCI_EXP_SLTCTL_ATTN_IND_ON
+ *	PCI_EXP_SLTCTL_ATTN_IND_BLINK
+ *	PCI_EXP_SLTCTL_ATTN_IND_OFF
+ *
+ * Either @pwr or @attn can also be INDICATOR_NOOP to leave that indicator
+ * unchanged.
+ */
+void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn)
 {
-	u16 slot_cmd;
+	u16 cmd = 0, mask = 0;
 
-	if (!ATTN_LED(ctrl))
-		return;
-
-	switch (value) {
-	case 0:		/* turn off */
-		slot_cmd = PCI_EXP_SLTCTL_ATTN_IND_OFF;
-		break;
-	case 1:		/* turn on */
-		slot_cmd = PCI_EXP_SLTCTL_ATTN_IND_ON;
-		break;
-	case 2:		/* turn blink */
-		slot_cmd = PCI_EXP_SLTCTL_ATTN_IND_BLINK;
-		break;
-	default:
-		return;
+	if (PWR_LED(ctrl) && pwr != INDICATOR_NOOP) {
+		cmd |= (pwr & PCI_EXP_SLTCTL_PIC);
+		mask |= PCI_EXP_SLTCTL_PIC;
 	}
-	pcie_write_cmd_nowait(ctrl, slot_cmd, PCI_EXP_SLTCTL_AIC);
-	ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__,
-		 pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, slot_cmd);
-}
 
-void pciehp_green_led_on(struct controller *ctrl)
-{
-	if (!PWR_LED(ctrl))
-		return;
+	if (ATTN_LED(ctrl) && attn != INDICATOR_NOOP) {
+		cmd |= (attn & PCI_EXP_SLTCTL_AIC);
+		mask |= PCI_EXP_SLTCTL_AIC;
+	}
 
-	pcie_write_cmd_nowait(ctrl, PCI_EXP_SLTCTL_PWR_IND_ON,
-			      PCI_EXP_SLTCTL_PIC);
-	ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__,
-		 pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL,
-		 PCI_EXP_SLTCTL_PWR_IND_ON);
-}
-
-void pciehp_green_led_off(struct controller *ctrl)
-{
-	if (!PWR_LED(ctrl))
-		return;
-
-	pcie_write_cmd_nowait(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF,
-			      PCI_EXP_SLTCTL_PIC);
-	ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__,
-		 pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL,
-		 PCI_EXP_SLTCTL_PWR_IND_OFF);
-}
-
-void pciehp_green_led_blink(struct controller *ctrl)
-{
-	if (!PWR_LED(ctrl))
-		return;
-
-	pcie_write_cmd_nowait(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK,
-			      PCI_EXP_SLTCTL_PIC);
-	ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__,
-		 pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL,
-		 PCI_EXP_SLTCTL_PWR_IND_BLINK);
+	if (cmd) {
+		pcie_write_cmd_nowait(ctrl, cmd, mask);
+		ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__,
+			 pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, cmd);
+	}
 }
 
 int pciehp_power_on_slot(struct controller *ctrl)
@@ -638,8 +613,8 @@
 	if ((events & PCI_EXP_SLTSTA_PFD) && !ctrl->power_fault_detected) {
 		ctrl->power_fault_detected = 1;
 		ctrl_err(ctrl, "Slot(%s): Power fault\n", slot_name(ctrl));
-		pciehp_set_attention_status(ctrl, 1);
-		pciehp_green_led_off(ctrl);
+		pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF,
+				      PCI_EXP_SLTCTL_ATTN_IND_ON);
 	}
 
 	/*
diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c
index 182f9e3..977946e 100644
--- a/drivers/pci/hotplug/rpadlpar_core.c
+++ b/drivers/pci/hotplug/rpadlpar_core.c
@@ -473,7 +473,6 @@
 void rpadlpar_io_exit(void)
 {
 	dlpar_sysfs_exit();
-	return;
 }
 
 module_init(rpadlpar_io_init);
diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c
index c3899ee..18627bb 100644
--- a/drivers/pci/hotplug/rpaphp_core.c
+++ b/drivers/pci/hotplug/rpaphp_core.c
@@ -408,7 +408,6 @@
 		pci_hp_deregister(&slot->hotplug_slot);
 		dealloc_slot_struct(slot);
 	}
-	return;
 }
 
 static int __init rpaphp_init(void)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 525fd3f2..b3f972e 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -240,6 +240,173 @@
 	pci_dev_put(dev);
 }
 
+static ssize_t sriov_totalvfs_show(struct device *dev,
+				   struct device_attribute *attr,
+				   char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	return sprintf(buf, "%u\n", pci_sriov_get_totalvfs(pdev));
+}
+
+static ssize_t sriov_numvfs_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	return sprintf(buf, "%u\n", pdev->sriov->num_VFs);
+}
+
+/*
+ * num_vfs > 0: number of VFs to enable
+ * num_vfs = 0: disable all VFs
+ *
+ * Note: the SR-IOV spec does not allow partial VF
+ *	 disable, so it's all or none.
+ */
+static ssize_t sriov_numvfs_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int ret;
+	u16 num_vfs;
+
+	ret = kstrtou16(buf, 0, &num_vfs);
+	if (ret < 0)
+		return ret;
+
+	if (num_vfs > pci_sriov_get_totalvfs(pdev))
+		return -ERANGE;
+
+	device_lock(&pdev->dev);
+
+	if (num_vfs == pdev->sriov->num_VFs)
+		goto exit;
+
+	/* is the PF driver loaded and does it provide sriov_configure()? */
+	if (!pdev->driver || !pdev->driver->sriov_configure) {
+		pci_info(pdev, "Driver does not support SRIOV configuration via sysfs\n");
+		ret = -ENOENT;
+		goto exit;
+	}
+
+	if (num_vfs == 0) {
+		/* disable VFs */
+		ret = pdev->driver->sriov_configure(pdev, 0);
+		goto exit;
+	}
+
+	/* enable VFs */
+	if (pdev->sriov->num_VFs) {
+		pci_warn(pdev, "%d VFs already enabled. Disable before enabling %d VFs\n",
+			 pdev->sriov->num_VFs, num_vfs);
+		ret = -EBUSY;
+		goto exit;
+	}
+
+	ret = pdev->driver->sriov_configure(pdev, num_vfs);
+	if (ret < 0)
+		goto exit;
+
+	if (ret != num_vfs)
+		pci_warn(pdev, "%d VFs requested; only %d enabled\n",
+			 num_vfs, ret);
+
+exit:
+	device_unlock(&pdev->dev);
+
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+static ssize_t sriov_offset_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	return sprintf(buf, "%u\n", pdev->sriov->offset);
+}
+
+static ssize_t sriov_stride_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	return sprintf(buf, "%u\n", pdev->sriov->stride);
+}
+
+static ssize_t sriov_vf_device_show(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	return sprintf(buf, "%x\n", pdev->sriov->vf_device);
+}
+
+static ssize_t sriov_drivers_autoprobe_show(struct device *dev,
+					    struct device_attribute *attr,
+					    char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	return sprintf(buf, "%u\n", pdev->sriov->drivers_autoprobe);
+}
+
+static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
+					     struct device_attribute *attr,
+					     const char *buf, size_t count)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	bool drivers_autoprobe;
+
+	if (kstrtobool(buf, &drivers_autoprobe) < 0)
+		return -EINVAL;
+
+	pdev->sriov->drivers_autoprobe = drivers_autoprobe;
+
+	return count;
+}
+
+static DEVICE_ATTR_RO(sriov_totalvfs);
+static DEVICE_ATTR_RW(sriov_numvfs);
+static DEVICE_ATTR_RO(sriov_offset);
+static DEVICE_ATTR_RO(sriov_stride);
+static DEVICE_ATTR_RO(sriov_vf_device);
+static DEVICE_ATTR_RW(sriov_drivers_autoprobe);
+
+static struct attribute *sriov_dev_attrs[] = {
+	&dev_attr_sriov_totalvfs.attr,
+	&dev_attr_sriov_numvfs.attr,
+	&dev_attr_sriov_offset.attr,
+	&dev_attr_sriov_stride.attr,
+	&dev_attr_sriov_vf_device.attr,
+	&dev_attr_sriov_drivers_autoprobe.attr,
+	NULL,
+};
+
+static umode_t sriov_attrs_are_visible(struct kobject *kobj,
+				       struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+
+	if (!dev_is_pf(dev))
+		return 0;
+
+	return a->mode;
+}
+
+const struct attribute_group sriov_dev_attr_group = {
+	.attrs = sriov_dev_attrs,
+	.is_visible = sriov_attrs_are_visible,
+};
+
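[ Note: the sriov_numvfs store path above only succeeds when the bound PF
  driver implements ->sriov_configure(). A minimal sketch of such a
  callback, with hypothetical foo_* names that are not part of this patch:

	static int foo_sriov_configure(struct pci_dev *pdev, int num_vfs)
	{
		int rc;

		if (num_vfs == 0) {
			/* all or none, per the note above */
			pci_disable_sriov(pdev);
			return 0;
		}

		rc = pci_enable_sriov(pdev, num_vfs);
		return rc ? rc : num_vfs;	/* sysfs expects the count back */
	}

	static struct pci_driver foo_driver = {
		.name			= "foo",
		.sriov_configure	= foo_sriov_configure,
		/* .id_table, .probe, .remove omitted */
	};
]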
 int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 {
 	return 0;
@@ -557,8 +724,8 @@
 	ctrl |= iov->ctrl & PCI_SRIOV_CTRL_ARI;
 	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, ctrl);
 
-	for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++)
-		pci_update_resource(dev, i);
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
 
 	pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
 	pci_iov_set_numvfs(dev, iov->num_VFs);
diff --git a/drivers/pci/of.c b/drivers/pci/of.c
index bc7b27a..36891e7 100644
--- a/drivers/pci/of.c
+++ b/drivers/pci/of.c
@@ -353,7 +353,7 @@
 /**
  * of_irq_parse_pci - Resolve the interrupt for a PCI device
  * @pdev:       the device whose interrupt is to be resolved
- * @out_irq:    structure of_irq filled by this function
+ * @out_irq:    structure of_phandle_args filled by this function
  *
  * This function resolves the PCI interrupt for a given PCI device. If a
  * device-node exists for a given pci_dev, it will use normal OF tree
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 2344762..0608aae 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -18,13 +18,32 @@
 #include <linux/percpu-refcount.h>
 #include <linux/random.h>
 #include <linux/seq_buf.h>
-#include <linux/iommu.h>
+#include <linux/xarray.h>
+
+enum pci_p2pdma_map_type {
+	PCI_P2PDMA_MAP_UNKNOWN = 0,
+	PCI_P2PDMA_MAP_NOT_SUPPORTED,
+	PCI_P2PDMA_MAP_BUS_ADDR,
+	PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
+};
 
 struct pci_p2pdma {
 	struct gen_pool *pool;
 	bool p2pmem_published;
+	struct xarray map_types;
 };
 
+struct pci_p2pdma_pagemap {
+	struct dev_pagemap pgmap;
+	struct pci_dev *provider;
+	u64 bus_offset;
+};
+
+static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap)
+{
+	return container_of(pgmap, struct pci_p2pdma_pagemap, pgmap);
+}
+
 static ssize_t size_show(struct device *dev, struct device_attribute *attr,
 			 char *buf)
 {
@@ -87,6 +106,7 @@
 
 	gen_pool_destroy(p2pdma->pool);
 	sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group);
+	xa_destroy(&p2pdma->map_types);
 }
 
 static int pci_p2pdma_setup(struct pci_dev *pdev)
@@ -98,6 +118,8 @@
 	if (!p2p)
 		return -ENOMEM;
 
+	xa_init(&p2p->map_types);
+
 	p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
 	if (!p2p->pool)
 		goto out;
@@ -135,6 +157,7 @@
 int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 			    u64 offset)
 {
+	struct pci_p2pdma_pagemap *p2p_pgmap;
 	struct dev_pagemap *pgmap;
 	void *addr;
 	int error;
@@ -157,14 +180,18 @@
 			return error;
 	}
 
-	pgmap = devm_kzalloc(&pdev->dev, sizeof(*pgmap), GFP_KERNEL);
-	if (!pgmap)
+	p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL);
+	if (!p2p_pgmap)
 		return -ENOMEM;
+
+	pgmap = &p2p_pgmap->pgmap;
 	pgmap->res.start = pci_resource_start(pdev, bar) + offset;
 	pgmap->res.end = pgmap->res.start + size - 1;
 	pgmap->res.flags = pci_resource_flags(pdev, bar);
 	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
-	pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) -
+
+	p2p_pgmap->provider = pdev;
+	p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) -
 		pci_resource_start(pdev, bar);
 
 	addr = devm_memremap_pages(&pdev->dev, pgmap);
@@ -246,18 +273,31 @@
 	seq_buf_printf(buf, "%s;", pci_name(pdev));
 }
 
-/*
- * If we can't find a common upstream bridge take a look at the root
- * complex and compare it to a whitelist of known good hardware.
- */
-static bool root_complex_whitelist(struct pci_dev *dev)
-{
-	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
-	struct pci_dev *root = pci_get_slot(host->bus, PCI_DEVFN(0, 0));
-	unsigned short vendor, device;
+static const struct pci_p2pdma_whitelist_entry {
+	unsigned short vendor;
+	unsigned short device;
+	enum {
+		REQ_SAME_HOST_BRIDGE	= 1 << 0,
+	} flags;
+} pci_p2pdma_whitelist[] = {
+	/* AMD ZEN */
+	{PCI_VENDOR_ID_AMD,	0x1450,	0},
 
-	if (iommu_present(dev->dev.bus))
-		return false;
+	/* Intel Xeon E5/Core i7 */
+	{PCI_VENDOR_ID_INTEL,	0x3c00, REQ_SAME_HOST_BRIDGE},
+	{PCI_VENDOR_ID_INTEL,	0x3c01, REQ_SAME_HOST_BRIDGE},
+	/* Intel Xeon E7 v3/Xeon E5 v3/Core i7 */
+	{PCI_VENDOR_ID_INTEL,	0x2f00, REQ_SAME_HOST_BRIDGE},
+	{PCI_VENDOR_ID_INTEL,	0x2f01, REQ_SAME_HOST_BRIDGE},
+	{}
+};
+
+static bool __host_bridge_whitelist(struct pci_host_bridge *host,
+				    bool same_host_bridge)
+{
+	struct pci_dev *root = pci_get_slot(host->bus, PCI_DEVFN(0, 0));
+	const struct pci_p2pdma_whitelist_entry *entry;
+	unsigned short vendor, device;
 
 	if (!root)
 		return false;
@@ -266,65 +306,49 @@
 	device = root->device;
 	pci_dev_put(root);
 
-	/* AMD ZEN host bridges can do peer to peer */
-	if (vendor == PCI_VENDOR_ID_AMD && device == 0x1450)
+	for (entry = pci_p2pdma_whitelist; entry->vendor; entry++) {
+		if (vendor != entry->vendor || device != entry->device)
+			continue;
+		if (entry->flags & REQ_SAME_HOST_BRIDGE && !same_host_bridge)
+			return false;
+
 		return true;
+	}
 
 	return false;
 }
 
 /*
- * Find the distance through the nearest common upstream bridge between
- * two PCI devices.
- *
- * If the two devices are the same device then 0 will be returned.
- *
- * If there are two virtual functions of the same device behind the same
- * bridge port then 2 will be returned (one step down to the PCIe switch,
- * then one step back to the same device).
- *
- * In the case where two devices are connected to the same PCIe switch, the
- * value 4 will be returned. This corresponds to the following PCI tree:
- *
- *     -+  Root Port
- *      \+ Switch Upstream Port
- *       +-+ Switch Downstream Port
- *       + \- Device A
- *       \-+ Switch Downstream Port
- *         \- Device B
- *
- * The distance is 4 because we traverse from Device A through the downstream
- * port of the switch, to the common upstream port, back up to the second
- * downstream port and then to Device B.
- *
- * Any two devices that don't have a common upstream bridge will return -1.
- * In this way devices on separate PCIe root ports will be rejected, which
- * is what we want for peer-to-peer seeing each PCIe root port defines a
- * separate hierarchy domain and there's no way to determine whether the root
- * complex supports forwarding between them.
- *
- * In the case where two devices are connected to different PCIe switches,
- * this function will still return a positive distance as long as both
- * switches eventually have a common upstream bridge. Note this covers
- * the case of using multiple PCIe switches to achieve a desired level of
- * fan-out from a root port. The exact distance will be a function of the
- * number of switches between Device A and Device B.
- *
- * If a bridge which has any ACS redirection bits set is in the path
- * then this functions will return -2. This is so we reject any
- * cases where the TLPs are forwarded up into the root complex.
- * In this case, a list of all infringing bridge addresses will be
- * populated in acs_list (assuming it's non-null) for printk purposes.
+ * If we can't find a common upstream bridge take a look at the root
+ * complex and compare it to a whitelist of known good hardware.
  */
-static int upstream_bridge_distance(struct pci_dev *provider,
-				    struct pci_dev *client,
-				    struct seq_buf *acs_list)
+static bool host_bridge_whitelist(struct pci_dev *a, struct pci_dev *b)
+{
+	struct pci_host_bridge *host_a = pci_find_host_bridge(a->bus);
+	struct pci_host_bridge *host_b = pci_find_host_bridge(b->bus);
+
+	if (host_a == host_b)
+		return __host_bridge_whitelist(host_a, true);
+
+	if (__host_bridge_whitelist(host_a, false) &&
+	    __host_bridge_whitelist(host_b, false))
+		return true;
+
+	return false;
+}
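[ Note: extending the table is a one-line change. The entry below is purely
  illustrative (0xabcd is a made-up device ID) and shows the
  REQ_SAME_HOST_BRIDGE semantics: with the flag set, the whitelist only
  passes when both devices sit under that one host bridge; without it (as
  for the AMD entry), two different whitelisted bridges also pass:

	/* Hypothetical entry -- 0xabcd is not a real device ID */
	{PCI_VENDOR_ID_INTEL,	0xabcd,	REQ_SAME_HOST_BRIDGE},
]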
+
+static enum pci_p2pdma_map_type
+__upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client,
+		int *dist, bool *acs_redirects, struct seq_buf *acs_list)
 {
 	struct pci_dev *a = provider, *b = client, *bb;
 	int dist_a = 0;
 	int dist_b = 0;
 	int acs_cnt = 0;
 
+	if (acs_redirects)
+		*acs_redirects = false;
+
 	/*
 	 * Note, we don't need to take references to devices returned by
 	 * pci_upstream_bridge() seeing we hold a reference to a child
@@ -353,15 +377,10 @@
 		dist_a++;
 	}
 
-	/*
-	 * Allow the connection if both devices are on a whitelisted root
-	 * complex, but add an arbitrary large value to the distance.
-	 */
-	if (root_complex_whitelist(provider) &&
-	    root_complex_whitelist(client))
-		return 0x1000 + dist_a + dist_b;
+	if (dist)
+		*dist = dist_a + dist_b;
 
-	return -1;
+	return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE;
 
 check_b_path_acs:
 	bb = b;
@@ -378,33 +397,110 @@
 		bb = pci_upstream_bridge(bb);
 	}
 
-	if (acs_cnt)
-		return -2;
+	if (dist)
+		*dist = dist_a + dist_b;
 
-	return dist_a + dist_b;
+	if (acs_cnt) {
+		if (acs_redirects)
+			*acs_redirects = true;
+
+		return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE;
+	}
+
+	return PCI_P2PDMA_MAP_BUS_ADDR;
 }
 
-static int upstream_bridge_distance_warn(struct pci_dev *provider,
-					 struct pci_dev *client)
+static unsigned long map_types_idx(struct pci_dev *client)
+{
+	return (pci_domain_nr(client->bus) << 16) |
+		(client->bus->number << 8) | client->devfn;
+}
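[ Note: the index packs the client's domain, bus number and devfn into
  disjoint bit fields, so every client maps to a distinct XArray slot.
  Worked example with illustrative values:

	/* domain 0, bus 0x05, devfn 0x10 (slot 2, function 0) */
	unsigned long idx = (0 << 16) | (0x05 << 8) | 0x10;	/* 0x0510 */
]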
+
+/*
+ * Find the distance through the nearest common upstream bridge between
+ * two PCI devices.
+ *
+ * If the two devices are the same device then 0 will be returned.
+ *
+ * If there are two virtual functions of the same device behind the same
+ * bridge port then 2 will be returned (one step down to the PCIe switch,
+ * then one step back to the same device).
+ *
+ * In the case where two devices are connected to the same PCIe switch, the
+ * value 4 will be returned. This corresponds to the following PCI tree:
+ *
+ *     -+  Root Port
+ *      \+ Switch Upstream Port
+ *       +-+ Switch Downstream Port
+ *       + \- Device A
+ *       \-+ Switch Downstream Port
+ *         \- Device B
+ *
+ * The distance is 4 because we traverse from Device A through the downstream
+ * port of the switch, to the common upstream port, back up to the second
+ * downstream port and then to Device B.
+ *
+ * Any two devices that cannot communicate using p2pdma will return
+ * PCI_P2PDMA_MAP_NOT_SUPPORTED.
+ *
+ * For any two devices whose data path goes through the host bridge, the
+ * host bridges involved are checked against a whitelist. If they are on
+ * the whitelist, this function will return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE.
+ *
+ * If either bridge is not on the whitelist this function returns
+ * PCI_P2PDMA_MAP_NOT_SUPPORTED.
+ *
+ * If a bridge which has any ACS redirection bits set is in the path,
+ * acs_redirects will be set to true. In this case, a list of all infringing
+ * bridge addresses will be populated in acs_list (assuming it's non-null)
+ * for printk purposes.
+ */
+static enum pci_p2pdma_map_type
+upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client,
+		int *dist, bool *acs_redirects, struct seq_buf *acs_list)
+{
+	enum pci_p2pdma_map_type map_type;
+
+	map_type = __upstream_bridge_distance(provider, client, dist,
+					      acs_redirects, acs_list);
+
+	if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) {
+		if (!host_bridge_whitelist(provider, client))
+			map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
+	}
+
+	if (provider->p2pdma)
+		xa_store(&provider->p2pdma->map_types, map_types_idx(client),
+			 xa_mk_value(map_type), GFP_KERNEL);
+
+	return map_type;
+}
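[ Note: caching the result with xa_mk_value() keeps this path
  allocation-free: an integer this small is encoded directly into the
  XArray entry and decoded with xa_to_value(), as pci_p2pdma_map_type()
  does below. An empty slot loads as NULL, which xa_to_value() decodes as
  0, i.e. PCI_P2PDMA_MAP_UNKNOWN -- presumably why that enum value is
  zero. Minimal round trip, with xa and idx standing in for the real
  fields:

	xa_store(&xa, idx, xa_mk_value(PCI_P2PDMA_MAP_BUS_ADDR), GFP_KERNEL);
	type = xa_to_value(xa_load(&xa, idx));	/* PCI_P2PDMA_MAP_BUS_ADDR */
]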
+
+static enum pci_p2pdma_map_type
+upstream_bridge_distance_warn(struct pci_dev *provider, struct pci_dev *client,
+			      int *dist)
 {
 	struct seq_buf acs_list;
+	bool acs_redirects;
 	int ret;
 
 	seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
 	if (!acs_list.buffer)
 		return -ENOMEM;
 
-	ret = upstream_bridge_distance(provider, client, &acs_list);
-	if (ret == -2) {
-		pci_warn(client, "cannot be used for peer-to-peer DMA as ACS redirect is set between the client and provider (%s)\n",
+	ret = upstream_bridge_distance(provider, client, dist, &acs_redirects,
+				       &acs_list);
+	if (acs_redirects) {
+		pci_warn(client, "ACS redirect is set between the client and provider (%s)\n",
 			 pci_name(provider));
 		/* Drop final semicolon */
 		acs_list.buffer[acs_list.len-1] = 0;
 		pci_warn(client, "to disable ACS redirect for this path, add the kernel parameter: pci=disable_acs_redir=%s\n",
 			 acs_list.buffer);
+	}
 
-	} else if (ret < 0) {
-		pci_warn(client, "cannot be used for peer-to-peer DMA as the client and provider (%s) do not share an upstream bridge\n",
+	if (ret == PCI_P2PDMA_MAP_NOT_SUPPORTED) {
+		pci_warn(client, "cannot be used for peer-to-peer DMA as the client and provider (%s) do not share an upstream bridge or whitelisted host bridge\n",
 			 pci_name(provider));
 	}
 
@@ -421,22 +517,22 @@
  * @num_clients: number of clients in the array
  * @verbose: if true, print warnings for devices when we return -1
  *
- * Returns -1 if any of the clients are not compatible (behind the same
- * root port as the provider), otherwise returns a positive number where
- * a lower number is the preferable choice. (If there's one client
- * that's the same as the provider it will return 0, which is best choice).
+ * Returns -1 if any of the clients are not compatible, otherwise returns a
+ * positive number where a lower number is the preferable choice. (If there's
+ * one client that's the same as the provider, it will return 0, which is
+ * the best choice).
  *
- * For now, "compatible" means the provider and the clients are all behind
- * the same PCI root port. This cuts out cases that may work but is safest
- * for the user. Future work can expand this to white-list root complexes that
- * can safely forward between each ports.
+ * "compatible" means the provider and the clients are either all behind
+ * the same PCI root port or the host bridges connected to each of the devices
+ * are listed in the 'pci_p2pdma_whitelist'.
  */
 int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients,
 			     int num_clients, bool verbose)
 {
 	bool not_supported = false;
 	struct pci_dev *pci_client;
-	int distance = 0;
+	int total_dist = 0;
+	int distance;
 	int i, ret;
 
 	if (num_clients == 0)
@@ -461,26 +557,26 @@
 
 		if (verbose)
 			ret = upstream_bridge_distance_warn(provider,
-							    pci_client);
+					pci_client, &distance);
 		else
 			ret = upstream_bridge_distance(provider, pci_client,
-						       NULL);
+						       &distance, NULL, NULL);
 
 		pci_dev_put(pci_client);
 
-		if (ret < 0)
+		if (ret == PCI_P2PDMA_MAP_NOT_SUPPORTED)
 			not_supported = true;
 
 		if (not_supported && !verbose)
 			break;
 
-		distance += ret;
+		total_dist += distance;
 	}
 
 	if (not_supported)
 		return -1;
 
-	return distance;
+	return total_dist;
 }
 EXPORT_SYMBOL_GPL(pci_p2pdma_distance_many);
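[ Note: a sketch of how a consumer might use the returned distance to
  choose among several p2pmem providers; pick_provider() and the arrays
  are hypothetical:

	static struct pci_dev *pick_provider(struct pci_dev **providers,
					     int np, struct device **clients,
					     int nc)
	{
		struct pci_dev *best = NULL;
		int best_dist = INT_MAX;
		int i, d;

		for (i = 0; i < np; i++) {
			d = pci_p2pdma_distance_many(providers[i], clients,
						     nc, false);
			if (d >= 0 && d < best_dist) {
				best_dist = d;
				best = providers[i];
			}
		}

		return best;	/* NULL if no compatible provider */
	}
]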
 
@@ -706,21 +802,19 @@
 }
 EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
 
-/**
- * pci_p2pdma_map_sg - map a PCI peer-to-peer scatterlist for DMA
- * @dev: device doing the DMA request
- * @sg: scatter list to map
- * @nents: elements in the scatterlist
- * @dir: DMA direction
- *
- * Scatterlists mapped with this function should not be unmapped in any way.
- *
- * Returns the number of SG entries mapped or 0 on error.
- */
-int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
-		      enum dma_data_direction dir)
+static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct pci_dev *provider,
+						    struct pci_dev *client)
 {
-	struct dev_pagemap *pgmap;
+	if (!provider->p2pdma)
+		return PCI_P2PDMA_MAP_NOT_SUPPORTED;
+
+	return xa_to_value(xa_load(&provider->p2pdma->map_types,
+				   map_types_idx(client)));
+}
+
+static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap,
+		struct device *dev, struct scatterlist *sg, int nents)
+{
 	struct scatterlist *s;
 	phys_addr_t paddr;
 	int i;
@@ -736,16 +830,80 @@
 		return 0;
 
 	for_each_sg(sg, s, nents, i) {
-		pgmap = sg_page(s)->pgmap;
 		paddr = sg_phys(s);
 
-		s->dma_address = paddr - pgmap->pci_p2pdma_bus_offset;
+		s->dma_address = paddr - p2p_pgmap->bus_offset;
 		sg_dma_len(s) = s->length;
 	}
 
 	return nents;
 }
-EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg);
+
+/**
+ * pci_p2pdma_map_sg - map a PCI peer-to-peer scatterlist for DMA
+ * @dev: device doing the DMA request
+ * @sg: scatter list to map
+ * @nents: elements in the scatterlist
+ * @dir: DMA direction
+ * @attrs: DMA attributes passed to dma_map_sg() (if called)
+ *
+ * Scatterlists mapped with this function should be unmapped using
+ * pci_p2pdma_unmap_sg_attrs().
+ *
+ * Returns the number of SG entries mapped or 0 on error.
+ */
+int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+		int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+	struct pci_p2pdma_pagemap *p2p_pgmap =
+		to_p2p_pgmap(sg_page(sg)->pgmap);
+	struct pci_dev *client;
+
+	if (WARN_ON_ONCE(!dev_is_pci(dev)))
+		return 0;
+
+	client = to_pci_dev(dev);
+
+	switch (pci_p2pdma_map_type(p2p_pgmap->provider, client)) {
+	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+		return dma_map_sg_attrs(dev, sg, nents, dir, attrs);
+	case PCI_P2PDMA_MAP_BUS_ADDR:
+		return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents);
+	default:
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs);
+
+/**
+ * pci_p2pdma_unmap_sg - unmap a PCI peer-to-peer scatterlist that was
+ *	mapped with pci_p2pdma_map_sg()
+ * @dev: device doing the DMA request
+ * @sg: scatter list to unmap
+ * @nents: number of elements returned by pci_p2pdma_map_sg()
+ * @dir: DMA direction
+ * @attrs: DMA attributes passed to dma_unmap_sg() (if called)
+ */
+void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
+		int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+	struct pci_p2pdma_pagemap *p2p_pgmap =
+		to_p2p_pgmap(sg_page(sg)->pgmap);
+	enum pci_p2pdma_map_type map_type;
+	struct pci_dev *client;
+
+	if (WARN_ON_ONCE(!dev_is_pci(dev)))
+		return;
+
+	client = to_pci_dev(dev);
+
+	map_type = pci_p2pdma_map_type(p2p_pgmap->provider, client);
+
+	if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
+		dma_unmap_sg_attrs(dev, sg, nents, dir, attrs);
+}
+EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs);
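[ Note: sketch of the expected call pairing from a hypothetical client
  driver; dma_dev, sgl and nents are the caller's names, and the pages
  are assumed to come from pci_alloc_p2pmem() on the provider:

	int mapped = pci_p2pdma_map_sg_attrs(dma_dev, sgl, nents,
					     DMA_TO_DEVICE, 0);
	if (!mapped)
		return -EIO;

	/* ... program the device, run the transfer ... */

	/* per DMA API convention, unmap takes the nents passed to map */
	pci_p2pdma_unmap_sg_attrs(dma_dev, sgl, nents, DMA_TO_DEVICE, 0);
]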
 
 /**
  * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index 45049f5..0c02d50 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -14,7 +14,6 @@
 #include <linux/msi.h>
 #include <linux/pci_hotplug.h>
 #include <linux/module.h>
-#include <linux/pci-aspm.h>
 #include <linux/pci-acpi.h>
 #include <linux/pm_runtime.h>
 #include <linux/pm_qos.h>
@@ -118,8 +117,58 @@
 	return (phys_addr_t)mcfg_addr;
 }
 
+/* _HPX PCI Setting Record (Type 0); same as _HPP */
+struct hpx_type0 {
+	u32 revision;		/* Not present in _HPP */
+	u8  cache_line_size;	/* Not applicable to PCIe */
+	u8  latency_timer;	/* Not applicable to PCIe */
+	u8  enable_serr;
+	u8  enable_perr;
+};
+
+static struct hpx_type0 pci_default_type0 = {
+	.revision = 1,
+	.cache_line_size = 8,
+	.latency_timer = 0x40,
+	.enable_serr = 0,
+	.enable_perr = 0,
+};
+
+static void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx)
+{
+	u16 pci_cmd, pci_bctl;
+
+	if (!hpx)
+		hpx = &pci_default_type0;
+
+	if (hpx->revision > 1) {
+		pci_warn(dev, "PCI settings rev %d not supported; using defaults\n",
+			 hpx->revision);
+		hpx = &pci_default_type0;
+	}
+
+	pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpx->cache_line_size);
+	pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpx->latency_timer);
+	pci_read_config_word(dev, PCI_COMMAND, &pci_cmd);
+	if (hpx->enable_serr)
+		pci_cmd |= PCI_COMMAND_SERR;
+	if (hpx->enable_perr)
+		pci_cmd |= PCI_COMMAND_PARITY;
+	pci_write_config_word(dev, PCI_COMMAND, pci_cmd);
+
+	/* Program bridge control value */
+	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
+		pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER,
+				      hpx->latency_timer);
+		pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl);
+		if (hpx->enable_perr)
+			pci_bctl |= PCI_BRIDGE_CTL_PARITY;
+		pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl);
+	}
+}
+
 static acpi_status decode_type0_hpx_record(union acpi_object *record,
-					   struct hpp_type0 *hpx0)
+					   struct hpx_type0 *hpx0)
 {
 	int i;
 	union acpi_object *fields = record->package.elements;
@@ -146,8 +195,30 @@
 	return AE_OK;
 }
 
+/* _HPX PCI-X Setting Record (Type 1) */
+struct hpx_type1 {
+	u32 revision;
+	u8  max_mem_read;
+	u8  avg_max_split;
+	u16 tot_max_split;
+};
+
+static void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx)
+{
+	int pos;
+
+	if (!hpx)
+		return;
+
+	pos = pci_find_capability(dev, PCI_CAP_ID_PCIX);
+	if (!pos)
+		return;
+
+	pci_warn(dev, "PCI-X settings not supported\n");
+}
+
 static acpi_status decode_type1_hpx_record(union acpi_object *record,
-					   struct hpp_type1 *hpx1)
+					   struct hpx_type1 *hpx1)
 {
 	int i;
 	union acpi_object *fields = record->package.elements;
@@ -173,8 +244,130 @@
 	return AE_OK;
 }
 
+static bool pcie_root_rcb_set(struct pci_dev *dev)
+{
+	struct pci_dev *rp = pcie_find_root_port(dev);
+	u16 lnkctl;
+
+	if (!rp)
+		return false;
+
+	pcie_capability_read_word(rp, PCI_EXP_LNKCTL, &lnkctl);
+	if (lnkctl & PCI_EXP_LNKCTL_RCB)
+		return true;
+
+	return false;
+}
+
+/* _HPX PCI Express Setting Record (Type 2) */
+struct hpx_type2 {
+	u32 revision;
+	u32 unc_err_mask_and;
+	u32 unc_err_mask_or;
+	u32 unc_err_sever_and;
+	u32 unc_err_sever_or;
+	u32 cor_err_mask_and;
+	u32 cor_err_mask_or;
+	u32 adv_err_cap_and;
+	u32 adv_err_cap_or;
+	u16 pci_exp_devctl_and;
+	u16 pci_exp_devctl_or;
+	u16 pci_exp_lnkctl_and;
+	u16 pci_exp_lnkctl_or;
+	u32 sec_unc_err_sever_and;
+	u32 sec_unc_err_sever_or;
+	u32 sec_unc_err_mask_and;
+	u32 sec_unc_err_mask_or;
+};
+
+static void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx)
+{
+	int pos;
+	u32 reg32;
+
+	if (!hpx)
+		return;
+
+	if (!pci_is_pcie(dev))
+		return;
+
+	if (hpx->revision > 1) {
+		pci_warn(dev, "PCIe settings rev %d not supported\n",
+			 hpx->revision);
+		return;
+	}
+
+	/*
+	 * Don't allow _HPX to change MPS or MRRS settings.  We manage
+	 * those to make sure they're consistent with the rest of the
+	 * platform.
+	 */
+	hpx->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD |
+				    PCI_EXP_DEVCTL_READRQ;
+	hpx->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD |
+				    PCI_EXP_DEVCTL_READRQ);
+
+	/* Initialize Device Control Register */
+	pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL,
+			~hpx->pci_exp_devctl_and, hpx->pci_exp_devctl_or);
+
+	/* Initialize Link Control Register */
+	if (pcie_cap_has_lnkctl(dev)) {
+
+		/*
+		 * If the Root Port supports Read Completion Boundary of
+		 * 128, set RCB to 128.  Otherwise, clear it.
+		 */
+		hpx->pci_exp_lnkctl_and |= PCI_EXP_LNKCTL_RCB;
+		hpx->pci_exp_lnkctl_or &= ~PCI_EXP_LNKCTL_RCB;
+		if (pcie_root_rcb_set(dev))
+			hpx->pci_exp_lnkctl_or |= PCI_EXP_LNKCTL_RCB;
+
+		pcie_capability_clear_and_set_word(dev, PCI_EXP_LNKCTL,
+			~hpx->pci_exp_lnkctl_and, hpx->pci_exp_lnkctl_or);
+	}
+
+	/* Find Advanced Error Reporting Enhanced Capability */
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
+	if (!pos)
+		return;
+
+	/* Initialize Uncorrectable Error Mask Register */
+	pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, &reg32);
+	reg32 = (reg32 & hpx->unc_err_mask_and) | hpx->unc_err_mask_or;
+	pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, reg32);
+
+	/* Initialize Uncorrectable Error Severity Register */
+	pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &reg32);
+	reg32 = (reg32 & hpx->unc_err_sever_and) | hpx->unc_err_sever_or;
+	pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, reg32);
+
+	/* Initialize Correctable Error Mask Register */
+	pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, &reg32);
+	reg32 = (reg32 & hpx->cor_err_mask_and) | hpx->cor_err_mask_or;
+	pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg32);
+
+	/* Initialize Advanced Error Capabilities and Control Register */
+	pci_read_config_dword(dev, pos + PCI_ERR_CAP, &reg32);
+	reg32 = (reg32 & hpx->adv_err_cap_and) | hpx->adv_err_cap_or;
+
+	/* Don't enable ECRC generation or checking if unsupported */
+	if (!(reg32 & PCI_ERR_CAP_ECRC_GENC))
+		reg32 &= ~PCI_ERR_CAP_ECRC_GENE;
+	if (!(reg32 & PCI_ERR_CAP_ECRC_CHKC))
+		reg32 &= ~PCI_ERR_CAP_ECRC_CHKE;
+	pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32);
+
+	/*
+	 * FIXME: The following two registers are not supported yet.
+	 *
+	 *   o Secondary Uncorrectable Error Severity Register
+	 *   o Secondary Uncorrectable Error Mask Register
+	 */
+}
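[ Note: every register pair in the type 2 record follows the same
  read-modify-write discipline, new = (old & mask_and) | mask_or, which
  lets firmware clear and set individual bits without knowing the current
  value. Worked example with made-up masks:

	u32 reg = 0x000000f0;
	reg = (reg & 0xffffff0f) | 0x00000003;	/* -> 0x00000003 */
]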
+
 static acpi_status decode_type2_hpx_record(union acpi_object *record,
-					   struct hpp_type2 *hpx2)
+					   struct hpx_type2 *hpx2)
 {
 	int i;
 	union acpi_object *fields = record->package.elements;
@@ -213,6 +406,164 @@
 	return AE_OK;
 }
 
+/* _HPX PCI Express Setting Record (Type 3) */
+struct hpx_type3 {
+	u16 device_type;
+	u16 function_type;
+	u16 config_space_location;
+	u16 pci_exp_cap_id;
+	u16 pci_exp_cap_ver;
+	u16 pci_exp_vendor_id;
+	u16 dvsec_id;
+	u16 dvsec_rev;
+	u16 match_offset;
+	u32 match_mask_and;
+	u32 match_value;
+	u16 reg_offset;
+	u32 reg_mask_and;
+	u32 reg_mask_or;
+};
+
+enum hpx_type3_dev_type {
+	HPX_TYPE_ENDPOINT	= BIT(0),
+	HPX_TYPE_LEG_END	= BIT(1),
+	HPX_TYPE_RC_END		= BIT(2),
+	HPX_TYPE_RC_EC		= BIT(3),
+	HPX_TYPE_ROOT_PORT	= BIT(4),
+	HPX_TYPE_UPSTREAM	= BIT(5),
+	HPX_TYPE_DOWNSTREAM	= BIT(6),
+	HPX_TYPE_PCI_BRIDGE	= BIT(7),
+	HPX_TYPE_PCIE_BRIDGE	= BIT(8),
+};
+
+static u16 hpx3_device_type(struct pci_dev *dev)
+{
+	u16 pcie_type = pci_pcie_type(dev);
+	const int pcie_to_hpx3_type[] = {
+		[PCI_EXP_TYPE_ENDPOINT]    = HPX_TYPE_ENDPOINT,
+		[PCI_EXP_TYPE_LEG_END]     = HPX_TYPE_LEG_END,
+		[PCI_EXP_TYPE_RC_END]      = HPX_TYPE_RC_END,
+		[PCI_EXP_TYPE_RC_EC]       = HPX_TYPE_RC_EC,
+		[PCI_EXP_TYPE_ROOT_PORT]   = HPX_TYPE_ROOT_PORT,
+		[PCI_EXP_TYPE_UPSTREAM]    = HPX_TYPE_UPSTREAM,
+		[PCI_EXP_TYPE_DOWNSTREAM]  = HPX_TYPE_DOWNSTREAM,
+		[PCI_EXP_TYPE_PCI_BRIDGE]  = HPX_TYPE_PCI_BRIDGE,
+		[PCI_EXP_TYPE_PCIE_BRIDGE] = HPX_TYPE_PCIE_BRIDGE,
+	};
+
+	if (pcie_type >= ARRAY_SIZE(pcie_to_hpx3_type))
+		return 0;
+
+	return pcie_to_hpx3_type[pcie_type];
+}
+
+enum hpx_type3_fn_type {
+	HPX_FN_NORMAL		= BIT(0),
+	HPX_FN_SRIOV_PHYS	= BIT(1),
+	HPX_FN_SRIOV_VIRT	= BIT(2),
+};
+
+static u8 hpx3_function_type(struct pci_dev *dev)
+{
+	if (dev->is_virtfn)
+		return HPX_FN_SRIOV_VIRT;
+	else if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV) > 0)
+		return HPX_FN_SRIOV_PHYS;
+	else
+		return HPX_FN_NORMAL;
+}
+
+static bool hpx3_cap_ver_matches(u8 pcie_cap_id, u8 hpx3_cap_id)
+{
+	u8 cap_ver = hpx3_cap_id & 0xf;
+
+	if ((hpx3_cap_id & BIT(4)) && cap_ver >= pcie_cap_id)
+		return true;
+	else if (cap_ver == pcie_cap_id)
+		return true;
+
+	return false;
+}
+
+enum hpx_type3_cfg_loc {
+	HPX_CFG_PCICFG		= 0,
+	HPX_CFG_PCIE_CAP	= 1,
+	HPX_CFG_PCIE_CAP_EXT	= 2,
+	HPX_CFG_VEND_CAP	= 3,
+	HPX_CFG_DVSEC		= 4,
+	HPX_CFG_MAX,
+};
+
+static void program_hpx_type3_register(struct pci_dev *dev,
+				       const struct hpx_type3 *reg)
+{
+	u32 match_reg, write_reg, header, orig_value;
+	u16 pos;
+
+	if (!(hpx3_device_type(dev) & reg->device_type))
+		return;
+
+	if (!(hpx3_function_type(dev) & reg->function_type))
+		return;
+
+	switch (reg->config_space_location) {
+	case HPX_CFG_PCICFG:
+		pos = 0;
+		break;
+	case HPX_CFG_PCIE_CAP:
+		pos = pci_find_capability(dev, reg->pci_exp_cap_id);
+		if (pos == 0)
+			return;
+
+		break;
+	case HPX_CFG_PCIE_CAP_EXT:
+		pos = pci_find_ext_capability(dev, reg->pci_exp_cap_id);
+		if (pos == 0)
+			return;
+
+		pci_read_config_dword(dev, pos, &header);
+		if (!hpx3_cap_ver_matches(PCI_EXT_CAP_VER(header),
+					  reg->pci_exp_cap_ver))
+			return;
+
+		break;
+	case HPX_CFG_VEND_CAP:	/* Fall through */
+	case HPX_CFG_DVSEC:	/* Fall through */
+	default:
+		pci_warn(dev, "Encountered _HPX type 3 with unsupported config space location\n");
+		return;
+	}
+
+	pci_read_config_dword(dev, pos + reg->match_offset, &match_reg);
+
+	if ((match_reg & reg->match_mask_and) != reg->match_value)
+		return;
+
+	pci_read_config_dword(dev, pos + reg->reg_offset, &write_reg);
+	orig_value = write_reg;
+	write_reg &= reg->reg_mask_and;
+	write_reg |= reg->reg_mask_or;
+
+	if (orig_value == write_reg)
+		return;
+
+	pci_write_config_dword(dev, pos + reg->reg_offset, write_reg);
+
+	pci_dbg(dev, "Applied _HPX3 at [0x%x]: 0x%08x -> 0x%08x\n",
+		pos, orig_value, write_reg);
+}
+
+static void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx)
+{
+	if (!hpx)
+		return;
+
+	if (!pci_is_pcie(dev))
+		return;
+
+	program_hpx_type3_register(dev, hpx);
+}
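[ Note: the type 3 record applies in two stages: the register at
  match_offset must equal match_value after masking, and only then is the
  register at reg_offset rewritten with the same and/or discipline as
  above. With hypothetical record values:

	/* match_mask_and = 0x0000ffff, match_value = 0x0002 */
	u32 match_reg = 0x00010002;	/* read at pos + match_offset */
	bool applies = (match_reg & 0x0000ffff) == 0x0002;	/* true */
]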
+
 static void parse_hpx3_register(struct hpx_type3 *hpx3_reg,
 				union acpi_object *reg_fields)
 {
@@ -233,8 +584,7 @@
 }
 
 static acpi_status program_type3_hpx_record(struct pci_dev *dev,
-					   union acpi_object *record,
-					   const struct hotplug_program_ops *hp_ops)
+					   union acpi_object *record)
 {
 	union acpi_object *fields = record->package.elements;
 	u32 desc_count, expected_length, revision;
@@ -258,7 +608,7 @@
 		for (i = 0; i < desc_count; i++) {
 			reg_fields = fields + 3 + i * 14;
 			parse_hpx3_register(&hpx3, reg_fields);
-			hp_ops->program_type3(dev, &hpx3);
+			program_hpx_type3(dev, &hpx3);
 		}
 
 		break;
@@ -271,15 +621,14 @@
 	return AE_OK;
 }
 
-static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle,
-				const struct hotplug_program_ops *hp_ops)
+static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle)
 {
 	acpi_status status;
 	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
 	union acpi_object *package, *record, *fields;
-	struct hpp_type0 hpx0;
-	struct hpp_type1 hpx1;
-	struct hpp_type2 hpx2;
+	struct hpx_type0 hpx0;
+	struct hpx_type1 hpx1;
+	struct hpx_type2 hpx2;
 	u32 type;
 	int i;
 
@@ -314,24 +663,24 @@
 			status = decode_type0_hpx_record(record, &hpx0);
 			if (ACPI_FAILURE(status))
 				goto exit;
-			hp_ops->program_type0(dev, &hpx0);
+			program_hpx_type0(dev, &hpx0);
 			break;
 		case 1:
 			memset(&hpx1, 0, sizeof(hpx1));
 			status = decode_type1_hpx_record(record, &hpx1);
 			if (ACPI_FAILURE(status))
 				goto exit;
-			hp_ops->program_type1(dev, &hpx1);
+			program_hpx_type1(dev, &hpx1);
 			break;
 		case 2:
 			memset(&hpx2, 0, sizeof(hpx2));
 			status = decode_type2_hpx_record(record, &hpx2);
 			if (ACPI_FAILURE(status))
 				goto exit;
-			hp_ops->program_type2(dev, &hpx2);
+			program_hpx_type2(dev, &hpx2);
 			break;
 		case 3:
-			status = program_type3_hpx_record(dev, record, hp_ops);
+			status = program_type3_hpx_record(dev, record);
 			if (ACPI_FAILURE(status))
 				goto exit;
 			break;
@@ -347,16 +696,15 @@
 	return status;
 }
 
-static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle,
-				const struct hotplug_program_ops *hp_ops)
+static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle)
 {
 	acpi_status status;
 	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
 	union acpi_object *package, *fields;
-	struct hpp_type0 hpp0;
+	struct hpx_type0 hpx0;
 	int i;
 
-	memset(&hpp0, 0, sizeof(hpp0));
+	memset(&hpx0, 0, sizeof(hpx0));
 
 	status = acpi_evaluate_object(handle, "_HPP", NULL, &buffer);
 	if (ACPI_FAILURE(status))
@@ -377,26 +725,24 @@
 		}
 	}
 
-	hpp0.revision        = 1;
-	hpp0.cache_line_size = fields[0].integer.value;
-	hpp0.latency_timer   = fields[1].integer.value;
-	hpp0.enable_serr     = fields[2].integer.value;
-	hpp0.enable_perr     = fields[3].integer.value;
+	hpx0.revision        = 1;
+	hpx0.cache_line_size = fields[0].integer.value;
+	hpx0.latency_timer   = fields[1].integer.value;
+	hpx0.enable_serr     = fields[2].integer.value;
+	hpx0.enable_perr     = fields[3].integer.value;
 
-	hp_ops->program_type0(dev, &hpp0);
+	program_hpx_type0(dev, &hpx0);
 
 exit:
 	kfree(buffer.pointer);
 	return status;
 }
 
-/* pci_get_hp_params
+/* pci_acpi_program_hp_params
  *
  * @dev - the pci_dev for which we want parameters
- * @hpp - allocated by the caller
  */
-int pci_acpi_program_hp_params(struct pci_dev *dev,
-			       const struct hotplug_program_ops *hp_ops)
+int pci_acpi_program_hp_params(struct pci_dev *dev)
 {
 	acpi_status status;
 	acpi_handle handle, phandle;
@@ -419,10 +765,10 @@
 	 * this pci dev.
 	 */
 	while (handle) {
-		status = acpi_run_hpx(dev, handle, hp_ops);
+		status = acpi_run_hpx(dev, handle);
 		if (ACPI_SUCCESS(status))
 			return 0;
-		status = acpi_run_hpp(dev, handle, hp_ops);
+		status = acpi_run_hpp(dev, handle);
 		if (ACPI_SUCCESS(status))
 			return 0;
 		if (acpi_is_root_bridge(handle))
diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c
index 06083b8..5fd9010 100644
--- a/drivers/pci/pci-bridge-emul.c
+++ b/drivers/pci/pci-bridge-emul.c
@@ -38,7 +38,7 @@
 	u32 rsvd;
 };
 
-const static struct pci_bridge_reg_behavior pci_regs_behavior[] = {
+static const struct pci_bridge_reg_behavior pci_regs_behavior[] = {
 	[PCI_VENDOR_ID / 4] = { .ro = ~0 },
 	[PCI_COMMAND / 4] = {
 		.rw = (PCI_COMMAND_IO | PCI_COMMAND_MEMORY |
@@ -173,7 +173,7 @@
 	},
 };
 
-const static struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = {
+static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = {
 	[PCI_CAP_LIST_ID / 4] = {
 		/*
 		 * Capability ID, Next Capability Pointer and
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 965c721..7934129 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -464,9 +464,7 @@
 	}
 	return count;
 }
-static struct device_attribute dev_rescan_attr = __ATTR(rescan,
-							(S_IWUSR|S_IWGRP),
-							NULL, dev_rescan_store);
+static DEVICE_ATTR_WO(dev_rescan);
 
 static ssize_t remove_store(struct device *dev, struct device_attribute *attr,
 			    const char *buf, size_t count)
@@ -480,13 +478,12 @@
 		pci_stop_and_remove_bus_device_locked(to_pci_dev(dev));
 	return count;
 }
-static struct device_attribute dev_remove_attr = __ATTR_IGNORE_LOCKDEP(remove,
-							(S_IWUSR|S_IWGRP),
-							NULL, remove_store);
+static DEVICE_ATTR_IGNORE_LOCKDEP(remove, 0220, NULL,
+				  remove_store);
 
-static ssize_t dev_bus_rescan_store(struct device *dev,
-				    struct device_attribute *attr,
-				    const char *buf, size_t count)
+static ssize_t bus_rescan_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
 {
 	unsigned long val;
 	struct pci_bus *bus = to_pci_bus(dev);
@@ -504,7 +501,7 @@
 	}
 	return count;
 }
-static DEVICE_ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, dev_bus_rescan_store);
+static DEVICE_ATTR_WO(bus_rescan);
 
 #if defined(CONFIG_PM) && defined(CONFIG_ACPI)
 static ssize_t d3cold_allowed_store(struct device *dev,
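[ Note: the function renames above are forced by the attribute macros:
  DEVICE_ATTR_WO(bus_rescan) declares dev_attr_bus_rescan with mode 0200
  and wires .store to bus_rescan_store, so the old dev_bus_rescan_store
  name would no longer match. Approximately what the macro expands to:

	static struct device_attribute dev_attr_bus_rescan =
		__ATTR(bus_rescan, 0200, NULL, bus_rescan_store);
]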
@@ -551,154 +548,6 @@
 static DEVICE_ATTR_RO(devspec);
 #endif
 
-#ifdef CONFIG_PCI_IOV
-static ssize_t sriov_totalvfs_show(struct device *dev,
-				   struct device_attribute *attr,
-				   char *buf)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-
-	return sprintf(buf, "%u\n", pci_sriov_get_totalvfs(pdev));
-}
-
-
-static ssize_t sriov_numvfs_show(struct device *dev,
-				 struct device_attribute *attr,
-				 char *buf)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-
-	return sprintf(buf, "%u\n", pdev->sriov->num_VFs);
-}
-
-/*
- * num_vfs > 0; number of VFs to enable
- * num_vfs = 0; disable all VFs
- *
- * Note: SRIOV spec doesn't allow partial VF
- *       disable, so it's all or none.
- */
-static ssize_t sriov_numvfs_store(struct device *dev,
-				  struct device_attribute *attr,
-				  const char *buf, size_t count)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	int ret;
-	u16 num_vfs;
-
-	ret = kstrtou16(buf, 0, &num_vfs);
-	if (ret < 0)
-		return ret;
-
-	if (num_vfs > pci_sriov_get_totalvfs(pdev))
-		return -ERANGE;
-
-	device_lock(&pdev->dev);
-
-	if (num_vfs == pdev->sriov->num_VFs)
-		goto exit;
-
-	/* is PF driver loaded w/callback */
-	if (!pdev->driver || !pdev->driver->sriov_configure) {
-		pci_info(pdev, "Driver doesn't support SRIOV configuration via sysfs\n");
-		ret = -ENOENT;
-		goto exit;
-	}
-
-	if (num_vfs == 0) {
-		/* disable VFs */
-		ret = pdev->driver->sriov_configure(pdev, 0);
-		goto exit;
-	}
-
-	/* enable VFs */
-	if (pdev->sriov->num_VFs) {
-		pci_warn(pdev, "%d VFs already enabled. Disable before enabling %d VFs\n",
-			 pdev->sriov->num_VFs, num_vfs);
-		ret = -EBUSY;
-		goto exit;
-	}
-
-	ret = pdev->driver->sriov_configure(pdev, num_vfs);
-	if (ret < 0)
-		goto exit;
-
-	if (ret != num_vfs)
-		pci_warn(pdev, "%d VFs requested; only %d enabled\n",
-			 num_vfs, ret);
-
-exit:
-	device_unlock(&pdev->dev);
-
-	if (ret < 0)
-		return ret;
-
-	return count;
-}
-
-static ssize_t sriov_offset_show(struct device *dev,
-				 struct device_attribute *attr,
-				 char *buf)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-
-	return sprintf(buf, "%u\n", pdev->sriov->offset);
-}
-
-static ssize_t sriov_stride_show(struct device *dev,
-				 struct device_attribute *attr,
-				 char *buf)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-
-	return sprintf(buf, "%u\n", pdev->sriov->stride);
-}
-
-static ssize_t sriov_vf_device_show(struct device *dev,
-				    struct device_attribute *attr,
-				    char *buf)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-
-	return sprintf(buf, "%x\n", pdev->sriov->vf_device);
-}
-
-static ssize_t sriov_drivers_autoprobe_show(struct device *dev,
-					    struct device_attribute *attr,
-					    char *buf)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-
-	return sprintf(buf, "%u\n", pdev->sriov->drivers_autoprobe);
-}
-
-static ssize_t sriov_drivers_autoprobe_store(struct device *dev,
-					     struct device_attribute *attr,
-					     const char *buf, size_t count)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	bool drivers_autoprobe;
-
-	if (kstrtobool(buf, &drivers_autoprobe) < 0)
-		return -EINVAL;
-
-	pdev->sriov->drivers_autoprobe = drivers_autoprobe;
-
-	return count;
-}
-
-static struct device_attribute sriov_totalvfs_attr = __ATTR_RO(sriov_totalvfs);
-static struct device_attribute sriov_numvfs_attr =
-		__ATTR(sriov_numvfs, (S_IRUGO|S_IWUSR|S_IWGRP),
-		       sriov_numvfs_show, sriov_numvfs_store);
-static struct device_attribute sriov_offset_attr = __ATTR_RO(sriov_offset);
-static struct device_attribute sriov_stride_attr = __ATTR_RO(sriov_stride);
-static struct device_attribute sriov_vf_device_attr = __ATTR_RO(sriov_vf_device);
-static struct device_attribute sriov_drivers_autoprobe_attr =
-		__ATTR(sriov_drivers_autoprobe, (S_IRUGO|S_IWUSR|S_IWGRP),
-		       sriov_drivers_autoprobe_show, sriov_drivers_autoprobe_store);
-#endif /* CONFIG_PCI_IOV */
-
 static ssize_t driver_override_store(struct device *dev,
 				     struct device_attribute *attr,
 				     const char *buf, size_t count)
@@ -792,7 +641,7 @@
 };
 
 static struct attribute *pcibus_attrs[] = {
-	&dev_attr_rescan.attr,
+	&dev_attr_bus_rescan.attr,
 	&dev_attr_cpuaffinity.attr,
 	&dev_attr_cpulistaffinity.attr,
 	NULL,
@@ -820,7 +669,7 @@
 		!!(pdev->resource[PCI_ROM_RESOURCE].flags &
 		   IORESOURCE_ROM_SHADOW));
 }
-static struct device_attribute vga_attr = __ATTR_RO(boot_vga);
+static DEVICE_ATTR_RO(boot_vga);
 
 static ssize_t pci_read_config(struct file *filp, struct kobject *kobj,
 			       struct bin_attribute *bin_attr, char *buf,
@@ -906,6 +755,11 @@
 	unsigned int size = count;
 	loff_t init_off = off;
 	u8 *data = (u8 *) buf;
+	int ret;
+
+	ret = security_locked_down(LOCKDOWN_PCI_ACCESS);
+	if (ret)
+		return ret;
 
 	if (off > dev->cfg_size)
 		return 0;
@@ -1085,7 +939,7 @@
 	sysfs_bin_attr_init(b->legacy_io);
 	b->legacy_io->attr.name = "legacy_io";
 	b->legacy_io->size = 0xffff;
-	b->legacy_io->attr.mode = S_IRUSR | S_IWUSR;
+	b->legacy_io->attr.mode = 0600;
 	b->legacy_io->read = pci_read_legacy_io;
 	b->legacy_io->write = pci_write_legacy_io;
 	b->legacy_io->mmap = pci_mmap_legacy_io;
@@ -1099,7 +953,7 @@
 	sysfs_bin_attr_init(b->legacy_mem);
 	b->legacy_mem->attr.name = "legacy_mem";
 	b->legacy_mem->size = 1024*1024;
-	b->legacy_mem->attr.mode = S_IRUSR | S_IWUSR;
+	b->legacy_mem->attr.mode = 0600;
 	b->legacy_mem->mmap = pci_mmap_legacy_mem;
 	pci_adjust_legacy_attr(b, pci_mmap_mem);
 	error = device_create_bin_file(&b->dev, b->legacy_mem);
@@ -1167,6 +1021,11 @@
 	int bar = (unsigned long)attr->private;
 	enum pci_mmap_state mmap_type;
 	struct resource *res = &pdev->resource[bar];
+	int ret;
+
+	ret = security_locked_down(LOCKDOWN_PCI_ACCESS);
+	if (ret)
+		return ret;
 
 	if (res->flags & IORESOURCE_MEM && iomem_is_exclusive(res->start))
 		return -EINVAL;
@@ -1243,6 +1102,12 @@
 				     struct bin_attribute *attr, char *buf,
 				     loff_t off, size_t count)
 {
+	int ret;
+
+	ret = security_locked_down(LOCKDOWN_PCI_ACCESS);
+	if (ret)
+		return ret;
+
 	return pci_resource_io(filp, kobj, attr, buf, off, count, true);
 }
 
@@ -1306,7 +1171,7 @@
 		}
 	}
 	res_attr->attr.name = res_attr_name;
-	res_attr->attr.mode = S_IRUSR | S_IWUSR;
+	res_attr->attr.mode = 0600;
 	res_attr->size = pci_resource_len(pdev, num);
 	res_attr->private = (void *)(unsigned long)num;
 	retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr);
@@ -1419,7 +1284,7 @@
 static const struct bin_attribute pci_config_attr = {
 	.attr =	{
 		.name = "config",
-		.mode = S_IRUGO | S_IWUSR,
+		.mode = 0644,
 	},
 	.size = PCI_CFG_SPACE_SIZE,
 	.read = pci_read_config,
@@ -1429,7 +1294,7 @@
 static const struct bin_attribute pcie_config_attr = {
 	.attr =	{
 		.name = "config",
-		.mode = S_IRUGO | S_IWUSR,
+		.mode = 0644,
 	},
 	.size = PCI_CFG_SPACE_EXP_SIZE,
 	.read = pci_read_config,
@@ -1458,7 +1323,7 @@
 	return count;
 }
 
-static struct device_attribute reset_attr = __ATTR(reset, 0200, NULL, reset_store);
+static DEVICE_ATTR(reset, 0200, NULL, reset_store);
 
 static int pci_create_capabilities_sysfs(struct pci_dev *dev)
 {
@@ -1468,7 +1333,7 @@
 	pcie_aspm_create_sysfs_dev_files(dev);
 
 	if (dev->reset_fn) {
-		retval = device_create_file(&dev->dev, &reset_attr);
+		retval = device_create_file(&dev->dev, &dev_attr_reset);
 		if (retval)
 			goto error;
 	}
@@ -1511,7 +1376,7 @@
 		sysfs_bin_attr_init(attr);
 		attr->size = rom_size;
 		attr->attr.name = "rom";
-		attr->attr.mode = S_IRUSR | S_IWUSR;
+		attr->attr.mode = 0600;
 		attr->read = pci_read_rom;
 		attr->write = pci_write_rom;
 		retval = sysfs_create_bin_file(&pdev->dev.kobj, attr);
@@ -1553,7 +1418,7 @@
 	pcie_vpd_remove_sysfs_dev_files(dev);
 	pcie_aspm_remove_sysfs_dev_files(dev);
 	if (dev->reset_fn) {
-		device_remove_file(&dev->dev, &reset_attr);
+		device_remove_file(&dev->dev, &dev_attr_reset);
 		dev->reset_fn = 0;
 	}
 }
@@ -1606,7 +1471,7 @@
 late_initcall(pci_sysfs_init);
 
 static struct attribute *pci_dev_dev_attrs[] = {
-	&vga_attr.attr,
+	&dev_attr_boot_vga.attr,
 	NULL,
 };
 
@@ -1616,7 +1481,7 @@
 	struct device *dev = kobj_to_dev(kobj);
 	struct pci_dev *pdev = to_pci_dev(dev);
 
-	if (a == &vga_attr.attr)
+	if (a == &dev_attr_boot_vga.attr)
 		if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
 			return 0;
 
@@ -1624,8 +1489,8 @@
 }
 
 static struct attribute *pci_dev_hp_attrs[] = {
-	&dev_remove_attr.attr,
-	&dev_rescan_attr.attr,
+	&dev_attr_remove.attr,
+	&dev_attr_dev_rescan.attr,
 	NULL,
 };
 
@@ -1697,34 +1562,6 @@
 	.is_visible = pci_dev_hp_attrs_are_visible,
 };
 
-#ifdef CONFIG_PCI_IOV
-static struct attribute *sriov_dev_attrs[] = {
-	&sriov_totalvfs_attr.attr,
-	&sriov_numvfs_attr.attr,
-	&sriov_offset_attr.attr,
-	&sriov_stride_attr.attr,
-	&sriov_vf_device_attr.attr,
-	&sriov_drivers_autoprobe_attr.attr,
-	NULL,
-};
-
-static umode_t sriov_attrs_are_visible(struct kobject *kobj,
-				       struct attribute *a, int n)
-{
-	struct device *dev = kobj_to_dev(kobj);
-
-	if (!dev_is_pf(dev))
-		return 0;
-
-	return a->mode;
-}
-
-static const struct attribute_group sriov_dev_attr_group = {
-	.attrs = sriov_dev_attrs,
-	.is_visible = sriov_attrs_are_visible,
-};
-#endif /* CONFIG_PCI_IOV */
-
 static const struct attribute_group pci_dev_attr_group = {
 	.attrs = pci_dev_dev_attrs,
 	.is_visible = pci_dev_attrs_are_visible,
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 1b27b5a..e7982af 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -890,8 +890,8 @@
 
 	pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
 	dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
-	if (dev->current_state != state && printk_ratelimit())
-		pci_info(dev, "Refused to change power state, currently in D%d\n",
+	if (dev->current_state != state)
+		pci_info_ratelimited(dev, "Refused to change power state, currently in D%d\n",
 			 dev->current_state);
 
 	/*
@@ -1443,7 +1443,7 @@
 		pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl);
 		bar_idx = ctrl & PCI_REBAR_CTRL_BAR_IDX;
 		res = pdev->resource + bar_idx;
-		size = order_base_2((resource_size(res) >> 20) | 1) - 1;
+		size = ilog2(resource_size(res)) - 20;
 		ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE;
 		ctrl |= size << PCI_REBAR_CTRL_BAR_SHIFT;
 		pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl);
@@ -3581,7 +3581,7 @@
 		}
 
 		/* Ensure upstream ports don't block AtomicOps on egress */
-		if (!bridge->has_secondary_link) {
+		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_UPSTREAM) {
 			pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2,
 						   &ctl2);
 			if (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK)
@@ -5923,8 +5923,19 @@
 	return 0;
 }
 
-#define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE
-static char resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0};
+/*
+ * Arches that don't want to expose struct resource to userland as-is in
+ * sysfs and /proc can implement their own pci_resource_to_user().
+ */
+void __weak pci_resource_to_user(const struct pci_dev *dev, int bar,
+				 const struct resource *rsrc,
+				 resource_size_t *start, resource_size_t *end)
+{
+	*start = rsrc->start;
+	*end = rsrc->end;
+}
+
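[ Note: an architecture that exposes something other than raw CPU
  addresses to userspace would supply a non-weak override. The sketch
  below is hypothetical; arch_pci_window_offset() is invented for
  illustration:

	void pci_resource_to_user(const struct pci_dev *dev, int bar,
				  const struct resource *rsrc,
				  resource_size_t *start,
				  resource_size_t *end)
	{
		resource_size_t off = arch_pci_window_offset(dev, bar);

		*start = rsrc->start - off;
		*end = rsrc->end - off;
	}
]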
+static char *resource_alignment_param;
 static DEFINE_SPINLOCK(resource_alignment_lock);
 
 /**
@@ -5945,7 +5956,7 @@
 
 	spin_lock(&resource_alignment_lock);
 	p = resource_alignment_param;
-	if (!*p && !align)
+	if (!p || !*p)
 		goto out;
 	if (pci_has_flag(PCI_PROBE_ONLY)) {
 		align = 0;
@@ -6109,35 +6120,41 @@
 	}
 }
 
-static ssize_t pci_set_resource_alignment_param(const char *buf, size_t count)
-{
-	if (count > RESOURCE_ALIGNMENT_PARAM_SIZE - 1)
-		count = RESOURCE_ALIGNMENT_PARAM_SIZE - 1;
-	spin_lock(&resource_alignment_lock);
-	strncpy(resource_alignment_param, buf, count);
-	resource_alignment_param[count] = '\0';
-	spin_unlock(&resource_alignment_lock);
-	return count;
-}
-
-static ssize_t pci_get_resource_alignment_param(char *buf, size_t size)
-{
-	size_t count;
-	spin_lock(&resource_alignment_lock);
-	count = snprintf(buf, size, "%s", resource_alignment_param);
-	spin_unlock(&resource_alignment_lock);
-	return count;
-}
-
 static ssize_t resource_alignment_show(struct bus_type *bus, char *buf)
 {
-	return pci_get_resource_alignment_param(buf, PAGE_SIZE);
+	size_t count = 0;
+
+	spin_lock(&resource_alignment_lock);
+	if (resource_alignment_param)
+		count = snprintf(buf, PAGE_SIZE, "%s", resource_alignment_param);
+	spin_unlock(&resource_alignment_lock);
+
+	/*
+	 * When set by the command line, resource_alignment_param will not
+	 * have a trailing line feed, which is ugly. So conditionally add
+	 * it here.
+	 */
+	if (count >= 2 && buf[count - 2] != '\n' && count < PAGE_SIZE - 1) {
+		buf[count - 1] = '\n';
+		buf[count++] = 0;
+	}
+
+	return count;
 }
 
 static ssize_t resource_alignment_store(struct bus_type *bus,
 					const char *buf, size_t count)
 {
-	return pci_set_resource_alignment_param(buf, count);
+	char *param = kstrndup(buf, count, GFP_KERNEL);
+
+	if (!param)
+		return -ENOMEM;
+
+	spin_lock(&resource_alignment_lock);
+	kfree(resource_alignment_param);
+	resource_alignment_param = param;
+	spin_unlock(&resource_alignment_lock);
+	return count;
 }
 
 static BUS_ATTR_RW(resource_alignment);
@@ -6266,8 +6283,7 @@
 			} else if (!strncmp(str, "cbmemsize=", 10)) {
 				pci_cardbus_mem_size = memparse(str + 10, &str);
 			} else if (!strncmp(str, "resource_alignment=", 19)) {
-				pci_set_resource_alignment_param(str + 19,
-							strlen(str + 19));
+				resource_alignment_param = str + 19;
 			} else if (!strncmp(str, "ecrc=", 5)) {
 				pcie_ecrc_get_policy(str + 5);
 			} else if (!strncmp(str, "hpiosize=", 9)) {
@@ -6302,15 +6318,18 @@
 early_param("pci", pci_setup);
 
 /*
- * 'disable_acs_redir_param' is initialized in pci_setup(), above, to point
- * to data in the __initdata section which will be freed after the init
- * sequence is complete. We can't allocate memory in pci_setup() because some
- * architectures do not have any memory allocation service available during
- * an early_param() call. So we allocate memory and copy the variable here
- * before the init section is freed.
+ * 'resource_alignment_param' and 'disable_acs_redir_param' are initialized
+ * in pci_setup(), above, to point to data in the __initdata section which
+ * will be freed after the init sequence is complete. We can't allocate memory
+ * in pci_setup() because some architectures do not have any memory allocation
+ * service available during an early_param() call. So we allocate memory and
+ * copy the variable here before the init section is freed.
  */
 static int __init pci_realloc_setup_params(void)
 {
+	resource_alignment_param = kstrdup(resource_alignment_param,
+					   GFP_KERNEL);
 	disable_acs_redir_param = kstrdup(disable_acs_redir_param, GFP_KERNEL);
 
 	return 0;
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index d22d1b8..3f6947e 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -39,6 +39,11 @@
 int pci_bridge_secondary_bus_reset(struct pci_dev *dev);
 int pci_bus_error_reset(struct pci_dev *dev);
 
+#define PCI_PM_D2_DELAY         200
+#define PCI_PM_D3_WAIT          10
+#define PCI_PM_D3COLD_WAIT      100
+#define PCI_PM_BUS_WAIT         50
+
 /**
  * struct pci_platform_pm_ops - Firmware PM callbacks
  *
@@ -84,6 +89,8 @@
 void pci_disable_enabled_device(struct pci_dev *dev);
 int pci_finish_runtime_suspend(struct pci_dev *dev);
 void pcie_clear_root_pme_status(struct pci_dev *dev);
+bool pci_check_pme_status(struct pci_dev *dev);
+void pci_pme_wakeup_bus(struct pci_bus *bus);
 int __pci_pme_wakeup(struct pci_dev *dev, void *ign);
 void pci_pme_restore(struct pci_dev *dev);
 bool pci_dev_need_resume(struct pci_dev *dev);
@@ -118,11 +125,25 @@
 	return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3;
 }
 
+static inline bool pcie_downstream_port(const struct pci_dev *dev)
+{
+	int type = pci_pcie_type(dev);
+
+	return type == PCI_EXP_TYPE_ROOT_PORT ||
+	       type == PCI_EXP_TYPE_DOWNSTREAM ||
+	       type == PCI_EXP_TYPE_PCIE_BRIDGE;
+}
+
 int pci_vpd_init(struct pci_dev *dev);
 void pci_vpd_release(struct pci_dev *dev);
 void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev);
 void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev);
 
+/* PCI Virtual Channel */
+int pci_save_vc_state(struct pci_dev *dev);
+void pci_restore_vc_state(struct pci_dev *dev);
+void pci_allocate_vc_save_buffers(struct pci_dev *dev);
+
 /* PCI /proc functions */
 #ifdef CONFIG_PROC_FS
 int pci_proc_attach_device(struct pci_dev *dev);
@@ -196,6 +217,9 @@
 extern const struct device_type pci_dev_type;
 extern const struct attribute_group *pci_bus_groups[];
 
+extern unsigned long pci_hotplug_io_size;
+extern unsigned long pci_hotplug_mem_size;
+extern unsigned long pci_hotplug_bus_size;
 
 /**
  * pci_match_one_device - Tell if a PCI device structure has a matching
@@ -236,6 +260,9 @@
 	pci_bar_mem64,		/* A 64-bit memory BAR */
 };
 
+struct device *pci_get_host_bridge_device(struct pci_dev *dev);
+void pci_put_host_bridge_device(struct device *dev);
+
 int pci_configure_extended_tags(struct pci_dev *dev, void *ign);
 bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl,
 				int crs_timeout);
@@ -256,6 +283,8 @@
 
 void pci_reassigndev_resource_alignment(struct pci_dev *dev);
 void pci_disable_bridge_window(struct pci_dev *dev);
+struct pci_bus *pci_bus_get(struct pci_bus *bus);
+void pci_bus_put(struct pci_bus *bus);
 
 /* PCIe link information */
 #define PCIE_SPEED2STR(speed) \
@@ -279,6 +308,7 @@
 			   enum pcie_link_width *width);
 void __pcie_print_link_status(struct pci_dev *dev, bool verbose);
 void pcie_report_downtraining(struct pci_dev *dev);
+void pcie_update_link_speed(struct pci_bus *bus, u16 link_status);
 
 /* Single Root I/O Virtualization */
 struct pci_sriov {
@@ -418,11 +448,12 @@
 #endif
 
 #ifdef CONFIG_PCI_ATS
+/* Address Translation Service */
+void pci_ats_init(struct pci_dev *dev);
 void pci_restore_ats_state(struct pci_dev *dev);
 #else
-static inline void pci_restore_ats_state(struct pci_dev *dev)
-{
-}
+static inline void pci_ats_init(struct pci_dev *d) { }
+static inline void pci_restore_ats_state(struct pci_dev *dev) { }
 #endif /* CONFIG_PCI_ATS */
 
 #ifdef CONFIG_PCI_IOV
@@ -433,7 +464,7 @@
 resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno);
 void pci_restore_iov_state(struct pci_dev *dev);
 int pci_iov_bus_range(struct pci_bus *bus);
-
+extern const struct attribute_group sriov_dev_attr_group;
 #else
 static inline int pci_iov_init(struct pci_dev *dev)
 {
@@ -518,10 +549,21 @@
 static inline void pcie_aspm_remove_sysfs_dev_files(struct pci_dev *pdev) { }
 #endif
 
+#ifdef CONFIG_PCIE_ECRC
+void pcie_set_ecrc_checking(struct pci_dev *dev);
+void pcie_ecrc_get_policy(char *str);
+#else
+static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { }
+static inline void pcie_ecrc_get_policy(char *str) { }
+#endif
+
 #ifdef CONFIG_PCIE_PTM
 void pci_ptm_init(struct pci_dev *dev);
+int pci_enable_ptm(struct pci_dev *dev, u8 *granularity);
 #else
 static inline void pci_ptm_init(struct pci_dev *dev) { }
+static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity)
+{ return -EINVAL; }
 #endif
 
 struct pci_dev_reset_methods {
@@ -558,6 +600,10 @@
 int of_pci_parse_bus_range(struct device_node *node, struct resource *res);
 int of_get_pci_domain_nr(struct device_node *node);
 int of_pci_get_max_link_speed(struct device_node *node);
+void pci_set_of_node(struct pci_dev *dev);
+void pci_release_of_node(struct pci_dev *dev);
+void pci_set_bus_of_node(struct pci_bus *bus);
+void pci_release_bus_of_node(struct pci_bus *bus);
 
 #else
 static inline int
@@ -577,6 +623,11 @@
 {
 	return -EINVAL;
 }
+
+static inline void pci_set_of_node(struct pci_dev *dev) { }
+static inline void pci_release_of_node(struct pci_dev *dev) { }
+static inline void pci_set_bus_of_node(struct pci_bus *bus) { }
+static inline void pci_release_bus_of_node(struct pci_bus *bus) { }
 #endif /* CONFIG_OF */
 
 #if defined(CONFIG_OF_ADDRESS)
@@ -607,4 +658,13 @@
 static inline void pci_aer_clear_device_status(struct pci_dev *dev) { }
 #endif
 
+#ifdef CONFIG_ACPI
+int pci_acpi_program_hp_params(struct pci_dev *dev);
+#else
+static inline int pci_acpi_program_hp_params(struct pci_dev *dev)
+{
+	return -ENODEV;
+}
+#endif
+
 #endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 464f8f9..652ef23 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -18,7 +18,6 @@
 #include <linux/slab.h>
 #include <linux/jiffies.h>
 #include <linux/delay.h>
-#include <linux/pci-aspm.h>
 #include "../pci.h"
 
 #ifdef MODULE_PARAM_PREFIX
@@ -913,10 +912,10 @@
 
 	/*
 	 * We allocate pcie_link_state for the component on the upstream
-	 * end of a Link, so there's nothing to do unless this device has a
-	 * Link on its secondary side.
+	 * end of a Link, so there's nothing to do unless this device is a
+	 * downstream port.
 	 */
-	if (!pdev->has_secondary_link)
+	if (!pcie_downstream_port(pdev))
 		return;
 
 	/* VIA has a strange chipset, root port is under a bridge */
@@ -1070,7 +1069,7 @@
 	if (!pci_is_pcie(pdev))
 		return 0;
 
-	if (pdev->has_secondary_link)
+	if (pcie_downstream_port(pdev))
 		parent = pdev;
 	if (!parent || !parent->link_state)
 		return -EINVAL;
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 773197a..b0e6048 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -166,7 +166,7 @@
 	driver = pcie_port_find_service(dev, service);
 	if (driver && driver->reset_link) {
 		status = driver->reset_link(dev);
-	} else if (dev->has_secondary_link) {
+	} else if (pcie_downstream_port(dev)) {
 		status = default_reset_link(dev);
 	} else {
 		pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index dbeeb38..3d5271a 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1426,26 +1426,38 @@
 	pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
 	pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
 
+	parent = pci_upstream_bridge(pdev);
+	if (!parent)
+		return;
+
 	/*
-	 * A Root Port or a PCI-to-PCIe bridge is always the upstream end
-	 * of a Link.  No PCIe component has two Links.  Two Links are
-	 * connected by a Switch that has a Port on each Link and internal
-	 * logic to connect the two Ports.
+	 * Some systems do not identify their upstream/downstream ports
+	 * correctly so detect impossible configurations here and correct
+	 * the port type accordingly.
 	 */
 	type = pci_pcie_type(pdev);
-	if (type == PCI_EXP_TYPE_ROOT_PORT ||
-	    type == PCI_EXP_TYPE_PCIE_BRIDGE)
-		pdev->has_secondary_link = 1;
-	else if (type == PCI_EXP_TYPE_UPSTREAM ||
-		 type == PCI_EXP_TYPE_DOWNSTREAM) {
-		parent = pci_upstream_bridge(pdev);
-
+	if (type == PCI_EXP_TYPE_DOWNSTREAM) {
 		/*
-		 * Usually there's an upstream device (Root Port or Switch
-		 * Downstream Port), but we can't assume one exists.
+		 * If pdev claims to be a downstream port but the parent
+		 * device is also a downstream port, assume pdev is
+		 * actually an upstream port.
 		 */
-		if (parent && !parent->has_secondary_link)
-			pdev->has_secondary_link = 1;
+		if (pcie_downstream_port(parent)) {
+			pci_info(pdev, "claims to be downstream port but is acting as upstream port, correcting type\n");
+			pdev->pcie_flags_reg &= ~PCI_EXP_FLAGS_TYPE;
+			pdev->pcie_flags_reg |= PCI_EXP_TYPE_UPSTREAM;
+		}
+	} else if (type == PCI_EXP_TYPE_UPSTREAM) {
+		/*
+		 * If pdev claims to be an upstream port but the parent
+		 * device is also an upstream port, assume pdev is
+		 * actually a downstream port.
+		 */
+		if (pci_pcie_type(parent) == PCI_EXP_TYPE_UPSTREAM) {
+			pci_info(pdev, "claims to be upstream port but is acting as downstream port, correcting type\n");
+			pdev->pcie_flags_reg &= ~PCI_EXP_FLAGS_TYPE;
+			pdev->pcie_flags_reg |= PCI_EXP_TYPE_DOWNSTREAM;
+		}
 	}
 }
 
@@ -1915,275 +1927,6 @@
 		 p_mps, mps, mpss);
 }
 
-static struct hpp_type0 pci_default_type0 = {
-	.revision = 1,
-	.cache_line_size = 8,
-	.latency_timer = 0x40,
-	.enable_serr = 0,
-	.enable_perr = 0,
-};
-
-static void program_hpp_type0(struct pci_dev *dev, struct hpp_type0 *hpp)
-{
-	u16 pci_cmd, pci_bctl;
-
-	if (!hpp)
-		hpp = &pci_default_type0;
-
-	if (hpp->revision > 1) {
-		pci_warn(dev, "PCI settings rev %d not supported; using defaults\n",
-			 hpp->revision);
-		hpp = &pci_default_type0;
-	}
-
-	pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpp->cache_line_size);
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpp->latency_timer);
-	pci_read_config_word(dev, PCI_COMMAND, &pci_cmd);
-	if (hpp->enable_serr)
-		pci_cmd |= PCI_COMMAND_SERR;
-	if (hpp->enable_perr)
-		pci_cmd |= PCI_COMMAND_PARITY;
-	pci_write_config_word(dev, PCI_COMMAND, pci_cmd);
-
-	/* Program bridge control value */
-	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
-		pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER,
-				      hpp->latency_timer);
-		pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl);
-		if (hpp->enable_perr)
-			pci_bctl |= PCI_BRIDGE_CTL_PARITY;
-		pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl);
-	}
-}
-
-static void program_hpp_type1(struct pci_dev *dev, struct hpp_type1 *hpp)
-{
-	int pos;
-
-	if (!hpp)
-		return;
-
-	pos = pci_find_capability(dev, PCI_CAP_ID_PCIX);
-	if (!pos)
-		return;
-
-	pci_warn(dev, "PCI-X settings not supported\n");
-}
-
-static bool pcie_root_rcb_set(struct pci_dev *dev)
-{
-	struct pci_dev *rp = pcie_find_root_port(dev);
-	u16 lnkctl;
-
-	if (!rp)
-		return false;
-
-	pcie_capability_read_word(rp, PCI_EXP_LNKCTL, &lnkctl);
-	if (lnkctl & PCI_EXP_LNKCTL_RCB)
-		return true;
-
-	return false;
-}
-
-static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp)
-{
-	int pos;
-	u32 reg32;
-
-	if (!hpp)
-		return;
-
-	if (!pci_is_pcie(dev))
-		return;
-
-	if (hpp->revision > 1) {
-		pci_warn(dev, "PCIe settings rev %d not supported\n",
-			 hpp->revision);
-		return;
-	}
-
-	/*
-	 * Don't allow _HPX to change MPS or MRRS settings.  We manage
-	 * those to make sure they're consistent with the rest of the
-	 * platform.
-	 */
-	hpp->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD |
-				    PCI_EXP_DEVCTL_READRQ;
-	hpp->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD |
-				    PCI_EXP_DEVCTL_READRQ);
-
-	/* Initialize Device Control Register */
-	pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL,
-			~hpp->pci_exp_devctl_and, hpp->pci_exp_devctl_or);
-
-	/* Initialize Link Control Register */
-	if (pcie_cap_has_lnkctl(dev)) {
-
-		/*
-		 * If the Root Port supports Read Completion Boundary of
-		 * 128, set RCB to 128.  Otherwise, clear it.
-		 */
-		hpp->pci_exp_lnkctl_and |= PCI_EXP_LNKCTL_RCB;
-		hpp->pci_exp_lnkctl_or &= ~PCI_EXP_LNKCTL_RCB;
-		if (pcie_root_rcb_set(dev))
-			hpp->pci_exp_lnkctl_or |= PCI_EXP_LNKCTL_RCB;
-
-		pcie_capability_clear_and_set_word(dev, PCI_EXP_LNKCTL,
-			~hpp->pci_exp_lnkctl_and, hpp->pci_exp_lnkctl_or);
-	}
-
-	/* Find Advanced Error Reporting Enhanced Capability */
-	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
-	if (!pos)
-		return;
-
-	/* Initialize Uncorrectable Error Mask Register */
-	pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, &reg32);
-	reg32 = (reg32 & hpp->unc_err_mask_and) | hpp->unc_err_mask_or;
-	pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, reg32);
-
-	/* Initialize Uncorrectable Error Severity Register */
-	pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &reg32);
-	reg32 = (reg32 & hpp->unc_err_sever_and) | hpp->unc_err_sever_or;
-	pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, reg32);
-
-	/* Initialize Correctable Error Mask Register */
-	pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, &reg32);
-	reg32 = (reg32 & hpp->cor_err_mask_and) | hpp->cor_err_mask_or;
-	pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg32);
-
-	/* Initialize Advanced Error Capabilities and Control Register */
-	pci_read_config_dword(dev, pos + PCI_ERR_CAP, &reg32);
-	reg32 = (reg32 & hpp->adv_err_cap_and) | hpp->adv_err_cap_or;
-
-	/* Don't enable ECRC generation or checking if unsupported */
-	if (!(reg32 & PCI_ERR_CAP_ECRC_GENC))
-		reg32 &= ~PCI_ERR_CAP_ECRC_GENE;
-	if (!(reg32 & PCI_ERR_CAP_ECRC_CHKC))
-		reg32 &= ~PCI_ERR_CAP_ECRC_CHKE;
-	pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32);
-
-	/*
-	 * FIXME: The following two registers are not supported yet.
-	 *
-	 *   o Secondary Uncorrectable Error Severity Register
-	 *   o Secondary Uncorrectable Error Mask Register
-	 */
-}
-
-static u16 hpx3_device_type(struct pci_dev *dev)
-{
-	u16 pcie_type = pci_pcie_type(dev);
-	const int pcie_to_hpx3_type[] = {
-		[PCI_EXP_TYPE_ENDPOINT]    = HPX_TYPE_ENDPOINT,
-		[PCI_EXP_TYPE_LEG_END]     = HPX_TYPE_LEG_END,
-		[PCI_EXP_TYPE_RC_END]      = HPX_TYPE_RC_END,
-		[PCI_EXP_TYPE_RC_EC]       = HPX_TYPE_RC_EC,
-		[PCI_EXP_TYPE_ROOT_PORT]   = HPX_TYPE_ROOT_PORT,
-		[PCI_EXP_TYPE_UPSTREAM]    = HPX_TYPE_UPSTREAM,
-		[PCI_EXP_TYPE_DOWNSTREAM]  = HPX_TYPE_DOWNSTREAM,
-		[PCI_EXP_TYPE_PCI_BRIDGE]  = HPX_TYPE_PCI_BRIDGE,
-		[PCI_EXP_TYPE_PCIE_BRIDGE] = HPX_TYPE_PCIE_BRIDGE,
-	};
-
-	if (pcie_type >= ARRAY_SIZE(pcie_to_hpx3_type))
-		return 0;
-
-	return pcie_to_hpx3_type[pcie_type];
-}
-
-static u8 hpx3_function_type(struct pci_dev *dev)
-{
-	if (dev->is_virtfn)
-		return HPX_FN_SRIOV_VIRT;
-	else if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV) > 0)
-		return HPX_FN_SRIOV_PHYS;
-	else
-		return HPX_FN_NORMAL;
-}
-
-static bool hpx3_cap_ver_matches(u8 pcie_cap_id, u8 hpx3_cap_id)
-{
-	u8 cap_ver = hpx3_cap_id & 0xf;
-
-	if ((hpx3_cap_id & BIT(4)) && cap_ver >= pcie_cap_id)
-		return true;
-	else if (cap_ver == pcie_cap_id)
-		return true;
-
-	return false;
-}
-
-static void program_hpx_type3_register(struct pci_dev *dev,
-				       const struct hpx_type3 *reg)
-{
-	u32 match_reg, write_reg, header, orig_value;
-	u16 pos;
-
-	if (!(hpx3_device_type(dev) & reg->device_type))
-		return;
-
-	if (!(hpx3_function_type(dev) & reg->function_type))
-		return;
-
-	switch (reg->config_space_location) {
-	case HPX_CFG_PCICFG:
-		pos = 0;
-		break;
-	case HPX_CFG_PCIE_CAP:
-		pos = pci_find_capability(dev, reg->pci_exp_cap_id);
-		if (pos == 0)
-			return;
-
-		break;
-	case HPX_CFG_PCIE_CAP_EXT:
-		pos = pci_find_ext_capability(dev, reg->pci_exp_cap_id);
-		if (pos == 0)
-			return;
-
-		pci_read_config_dword(dev, pos, &header);
-		if (!hpx3_cap_ver_matches(PCI_EXT_CAP_VER(header),
-					  reg->pci_exp_cap_ver))
-			return;
-
-		break;
-	case HPX_CFG_VEND_CAP:	/* Fall through */
-	case HPX_CFG_DVSEC:	/* Fall through */
-	default:
-		pci_warn(dev, "Encountered _HPX type 3 with unsupported config space location");
-		return;
-	}
-
-	pci_read_config_dword(dev, pos + reg->match_offset, &match_reg);
-
-	if ((match_reg & reg->match_mask_and) != reg->match_value)
-		return;
-
-	pci_read_config_dword(dev, pos + reg->reg_offset, &write_reg);
-	orig_value = write_reg;
-	write_reg &= reg->reg_mask_and;
-	write_reg |= reg->reg_mask_or;
-
-	if (orig_value == write_reg)
-		return;
-
-	pci_write_config_dword(dev, pos + reg->reg_offset, write_reg);
-
-	pci_dbg(dev, "Applied _HPX3 at [0x%x]: 0x%08x -> 0x%08x",
-		pos, orig_value, write_reg);
-}
-
-static void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx3)
-{
-	if (!hpx3)
-		return;
-
-	if (!pci_is_pcie(dev))
-		return;
-
-	program_hpx_type3_register(dev, hpx3);
-}
-
 int pci_configure_extended_tags(struct pci_dev *dev, void *ign)
 {
 	struct pci_host_bridge *host;
@@ -2364,13 +2107,6 @@
 
 static void pci_configure_device(struct pci_dev *dev)
 {
-	static const struct hotplug_program_ops hp_ops = {
-		.program_type0 = program_hpp_type0,
-		.program_type1 = program_hpp_type1,
-		.program_type2 = program_hpp_type2,
-		.program_type3 = program_hpx_type3,
-	};
-
 	pci_configure_mps(dev);
 	pci_configure_extended_tags(dev, NULL);
 	pci_configure_relaxed_ordering(dev);
@@ -2378,7 +2114,7 @@
 	pci_configure_eetlp_prefix(dev);
 	pci_configure_serr(dev);
 
-	pci_acpi_program_hp_params(dev, &hp_ops);
+	pci_acpi_program_hp_params(dev);
 }
 
 static void pci_release_capabilities(struct pci_dev *dev)
@@ -2759,12 +2495,8 @@
 	 * A PCIe Downstream Port normally leads to a Link with only Device
 	 * 0 on it (PCIe spec r3.1, sec 7.3.1).  As an optimization, scan
 	 * only for Device 0 in that situation.
-	 *
-	 * Checking has_secondary_link is a hack to identify Downstream
-	 * Ports because sometimes Switches are configured such that the
-	 * PCIe Port Type labels are backwards.
 	 */
-	if (bridge && pci_is_pcie(bridge) && bridge->has_secondary_link)
+	if (bridge && pci_is_pcie(bridge) && pcie_downstream_port(bridge))
 		return 1;
 
 	return 0;
diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index fe7fe67..5495537 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -13,6 +13,7 @@
 #include <linux/seq_file.h>
 #include <linux/capability.h>
 #include <linux/uaccess.h>
+#include <linux/security.h>
 #include <asm/byteorder.h>
 #include "pci.h"
 
@@ -115,7 +116,11 @@
 	struct pci_dev *dev = PDE_DATA(ino);
 	int pos = *ppos;
 	int size = dev->cfg_size;
-	int cnt;
+	int cnt, ret;
+
+	ret = security_locked_down(LOCKDOWN_PCI_ACCESS);
+	if (ret)
+		return ret;
 
 	if (pos >= size)
 		return 0;
@@ -196,6 +201,10 @@
 #endif /* HAVE_PCI_MMAP */
 	int ret = 0;
 
+	ret = security_locked_down(LOCKDOWN_PCI_ACCESS);
+	if (ret)
+		return ret;
+
 	switch (cmd) {
 	case PCIIOC_CONTROLLER:
 		ret = pci_domain_nr(dev->bus);
@@ -238,7 +247,8 @@
 	struct pci_filp_private *fpriv = file->private_data;
 	int i, ret, write_combine = 0, res_bit = IORESOURCE_MEM;
 
-	if (!capable(CAP_SYS_RAWIO))
+	if (!capable(CAP_SYS_RAWIO) ||
+	    security_locked_down(LOCKDOWN_PCI_ACCESS))
 		return -EPERM;
 
 	if (fpriv->mmap_state == pci_mmap_io) {
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 44c4ae1..320255e 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -20,7 +20,6 @@
 #include <linux/delay.h>
 #include <linux/acpi.h>
 #include <linux/dmi.h>
-#include <linux/pci-aspm.h>
 #include <linux/ioport.h>
 #include <linux/sched.h>
 #include <linux/ktime.h>
@@ -2593,6 +2592,59 @@
 			nvenet_msi_disable);
 
 /*
+ * PCIe spec r4.0 sec 7.7.1.2 and sec 7.7.2.2 say that if MSI/MSI-X is enabled,
+ * then the device can't use INTx interrupts. Tegra's PCIe root ports don't
+ * generate MSI interrupts for PME and AER events; instead, only INTx
+ * interrupts are generated. Although Tegra's PCIe root ports can generate
+ * MSI interrupts for other events, the PCIe specification doesn't support
+ * using a mix of INTx and MSI/MSI-X, so MSI must be disabled to keep the
+ * port service drivers from registering their respective ISRs for MSIs.
+ */
+static void pci_quirk_nvidia_tegra_disable_rp_msi(struct pci_dev *dev)
+{
+	dev->no_msi = 1;
+}
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x1ad0,
+			      PCI_CLASS_BRIDGE_PCI, 8,
+			      pci_quirk_nvidia_tegra_disable_rp_msi);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x1ad1,
+			      PCI_CLASS_BRIDGE_PCI, 8,
+			      pci_quirk_nvidia_tegra_disable_rp_msi);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x1ad2,
+			      PCI_CLASS_BRIDGE_PCI, 8,
+			      pci_quirk_nvidia_tegra_disable_rp_msi);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0bf0,
+			      PCI_CLASS_BRIDGE_PCI, 8,
+			      pci_quirk_nvidia_tegra_disable_rp_msi);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0bf1,
+			      PCI_CLASS_BRIDGE_PCI, 8,
+			      pci_quirk_nvidia_tegra_disable_rp_msi);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e1c,
+			      PCI_CLASS_BRIDGE_PCI, 8,
+			      pci_quirk_nvidia_tegra_disable_rp_msi);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e1d,
+			      PCI_CLASS_BRIDGE_PCI, 8,
+			      pci_quirk_nvidia_tegra_disable_rp_msi);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e12,
+			      PCI_CLASS_BRIDGE_PCI, 8,
+			      pci_quirk_nvidia_tegra_disable_rp_msi);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e13,
+			      PCI_CLASS_BRIDGE_PCI, 8,
+			      pci_quirk_nvidia_tegra_disable_rp_msi);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0fae,
+			      PCI_CLASS_BRIDGE_PCI, 8,
+			      pci_quirk_nvidia_tegra_disable_rp_msi);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0faf,
+			      PCI_CLASS_BRIDGE_PCI, 8,
+			      pci_quirk_nvidia_tegra_disable_rp_msi);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x10e5,
+			      PCI_CLASS_BRIDGE_PCI, 8,
+			      pci_quirk_nvidia_tegra_disable_rp_msi);
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x10e6,
+			      PCI_CLASS_BRIDGE_PCI, 8,
+			      pci_quirk_nvidia_tegra_disable_rp_msi);
+
+/*
  * Some versions of the MCP55 bridge from Nvidia have a legacy IRQ routing
  * config register.  This register controls the routing of legacy
  * interrupts from devices that route through the MCP55.  If this register
@@ -2925,6 +2977,24 @@
 			quirk_msi_intx_disable_qca_bug);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, 0xe091,
 			quirk_msi_intx_disable_qca_bug);
+
+/*
+ * Amazon's Annapurna Labs 1c36:0031 Root Ports don't support MSI-X, so
+ * MSI-X should be disabled on platforms where the device (mistakenly)
+ * advertises it.
+ *
+ * Notice that this quirk also disables MSI (which may work, but hasn't been
+ * tested), since currently there is no standard way to disable only MSI-X.
+ *
+ * The 0031 device id is reused for other non-Root Port device types,
+ * therefore the quirk is registered for the PCI_CLASS_BRIDGE_PCI class.
+ */
+static void quirk_al_msi_disable(struct pci_dev *dev)
+{
+	dev->no_msi = 1;
+	pci_warn(dev, "Disabling MSI/MSI-X\n");
+}
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031,
+			      PCI_CLASS_BRIDGE_PCI, 8, quirk_al_msi_disable);
 #endif /* CONFIG_PCI_MSI */
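
For context, these no_msi fixups take effect through the core MSI path; a simplified sketch modeled on pci_msi_supported() in drivers/pci/msi.c (example_msi_supported() is illustrative, and the real check also covers bus flags):

	static bool example_msi_supported(struct pci_dev *dev)
	{
		if (!pci_msi_enabled())
			return false;

		/* Set by the fixups above. */
		if (dev->no_msi)
			return false;

		return true;
	}

With dev->no_msi set, pci_enable_msi() and pci_enable_msix_range() fail and drivers fall back to INTx.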
 
 /*
@@ -4366,6 +4436,24 @@
 	return ret;
 }
 
+static int pci_quirk_al_acs(struct pci_dev *dev, u16 acs_flags)
+{
+	if (pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT)
+		return -ENOTTY;
+
+	/*
+	 * Amazon's Annapurna Labs root ports don't include an ACS capability,
+	 * but do include ACS-like functionality. The hardware doesn't support
+	 * peer-to-peer transactions via the root port and each has a unique
+	 * segment number.
+	 *
+	 * Additionally, the root ports cannot send traffic to each other.
+	 */
+	acs_flags &= ~(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF);
+
+	return acs_flags ? 0 : 1;
+}
+
 /*
  * Sunrise Point PCH root ports implement ACS, but unfortunately as shown in
  * the datasheet (Intel 100 Series Chipset Family PCH Datasheet, Vol. 2,
@@ -4466,6 +4554,19 @@
 	return acs_flags ? 0 : 1;
 }
 
+static int pci_quirk_brcm_acs(struct pci_dev *dev, u16 acs_flags)
+{
+	/*
+	 * iProc PAXB Root Ports don't advertise an ACS capability, but
+	 * they do not allow peer-to-peer transactions between Root Ports.
+	 * Allow each Root Port to be in a separate IOMMU group by masking
+	 * SV/RR/CR/UF bits.
+	 */
+	acs_flags &= ~(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF);
+
+	return acs_flags ? 0 : 1;
+}
+
 static const struct pci_dev_acs_enabled {
 	u16 vendor;
 	u16 device;
@@ -4559,6 +4660,9 @@
 	{ PCI_VENDOR_ID_AMPERE, 0xE00A, pci_quirk_xgene_acs },
 	{ PCI_VENDOR_ID_AMPERE, 0xE00B, pci_quirk_xgene_acs },
 	{ PCI_VENDOR_ID_AMPERE, 0xE00C, pci_quirk_xgene_acs },
+	{ PCI_VENDOR_ID_BROADCOM, 0xD714, pci_quirk_brcm_acs },
+	/* Amazon Annapurna Labs */
+	{ PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, pci_quirk_al_acs },
 	{ 0 }
 };
 
diff --git a/drivers/pci/search.c b/drivers/pci/search.c
index 7f4e658..bade140 100644
--- a/drivers/pci/search.c
+++ b/drivers/pci/search.c
@@ -15,7 +15,6 @@
 #include "pci.h"
 
 DECLARE_RWSEM(pci_bus_sem);
-EXPORT_SYMBOL_GPL(pci_bus_sem);
 
 /*
  * pci_for_each_dma_alias - Iterate over DMA aliases for a device
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 79b1fa6..e7dbe21 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1662,8 +1662,8 @@
 	int i;
 	bool *unassigned = data;
 
-	for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++) {
-		struct resource *r = &dev->resource[i];
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		struct resource *r = &dev->resource[i + PCI_IOV_RESOURCES];
 		struct pci_bus_region region;
 
 		/* Not assigned or rejected by kernel? */
diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c
index d96626c..31e3955 100644
--- a/drivers/pci/syscall.c
+++ b/drivers/pci/syscall.c
@@ -7,6 +7,7 @@
 
 #include <linux/errno.h>
 #include <linux/pci.h>
+#include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
 #include "pci.h"
@@ -90,7 +91,8 @@
 	u32 dword;
 	int err = 0;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_SYS_ADMIN) ||
+	    security_locked_down(LOCKDOWN_PCI_ACCESS))
 		return -EPERM;
 
 	dev = pci_get_domain_bus_and_slot(0, bus, dfn);
diff --git a/drivers/pci/vc.c b/drivers/pci/vc.c
index 5acd9c0..5486f87 100644
--- a/drivers/pci/vc.c
+++ b/drivers/pci/vc.c
@@ -13,6 +13,8 @@
 #include <linux/pci_regs.h>
 #include <linux/types.h>
 
+#include "pci.h"
+
 /**
  * pci_vc_save_restore_dwords - Save or restore a series of dwords
  * @dev: device
@@ -105,7 +107,7 @@
 	struct pci_dev *link = NULL;
 
 	/* Enable VCs from the downstream device */
-	if (!dev->has_secondary_link)
+	if (!pci_is_pcie(dev) || !pcie_downstream_port(dev))
 		return;
 
 	ctrl_pos = pos + PCI_VC_RES_CTRL + (res * PCI_CAP_VC_PER_VC_SIZEOF);
@@ -409,7 +411,6 @@
  * For each type of VC capability, VC/VC9/MFVC, find the capability, size
  * it, and allocate a buffer for save/restore.
  */
-
 void pci_allocate_vc_save_buffers(struct pci_dev *dev)
 {
 	int i;
diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c
index 4963c2e2..7915d10 100644
--- a/drivers/pci/vpd.c
+++ b/drivers/pci/vpd.c
@@ -571,6 +571,12 @@
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, PCI_ANY_ID,
 		quirk_blacklist_vpd);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_QLOGIC, 0x2261, quirk_blacklist_vpd);
+/*
+ * The Amazon Annapurna Labs 0x0031 device id is reused for other non-Root Port
+ * device types, so the quirk is registered for the PCI_CLASS_BRIDGE_PCI class.
+ */
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031,
+			      PCI_CLASS_BRIDGE_PCI, 8, quirk_blacklist_vpd);
 
 /*
  * For Broadcom 5706, 5708, 5709 rev. A nics, any read beyond the
diff --git a/drivers/pcmcia/cistpl.c b/drivers/pcmcia/cistpl.c
index abd0299..629359f 100644
--- a/drivers/pcmcia/cistpl.c
+++ b/drivers/pcmcia/cistpl.c
@@ -21,6 +21,7 @@
 #include <linux/pci.h>
 #include <linux/ioport.h>
 #include <linux/io.h>
+#include <linux/security.h>
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 
@@ -1575,6 +1576,10 @@
 	struct pcmcia_socket *s;
 	int error;
 
+	error = security_locked_down(LOCKDOWN_PCMCIA_CIS);
+	if (error)
+		return error;
+
 	s = to_socket(container_of(kobj, struct device, kobj));
 
 	if (off)
diff --git a/drivers/phy/tegra/Kconfig b/drivers/phy/tegra/Kconfig
index e516967..f9817c3 100644
--- a/drivers/phy/tegra/Kconfig
+++ b/drivers/phy/tegra/Kconfig
@@ -7,3 +7,10 @@
 
 	  To compile this driver as a module, choose M here: the module will
 	  be called phy-tegra-xusb.
+
+config PHY_TEGRA194_P2U
+	tristate "NVIDIA Tegra194 PIPE2UPHY PHY driver"
+	depends on ARCH_TEGRA_194_SOC || COMPILE_TEST
+	select GENERIC_PHY
+	help
+	  Enable this to support the P2U (PIPE to UPHY) that is part of
+	  Tegra 19x SoCs.
diff --git a/drivers/phy/tegra/Makefile b/drivers/phy/tegra/Makefile
index 64ccaea..320dd38 100644
--- a/drivers/phy/tegra/Makefile
+++ b/drivers/phy/tegra/Makefile
@@ -6,3 +6,4 @@
 phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_132_SOC) += xusb-tegra124.o
 phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_210_SOC) += xusb-tegra210.o
 phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_186_SOC) += xusb-tegra186.o
+obj-$(CONFIG_PHY_TEGRA194_P2U) += phy-tegra194-p2u.o
diff --git a/drivers/phy/tegra/phy-tegra194-p2u.c b/drivers/phy/tegra/phy-tegra194-p2u.c
new file mode 100644
index 0000000..7042bed
--- /dev/null
+++ b/drivers/phy/tegra/phy-tegra194-p2u.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * P2U (PIPE to UPHY) driver for Tegra T194 SoC
+ *
+ * Copyright (C) 2019 NVIDIA Corporation.
+ *
+ * Author: Vidya Sagar <vidyas@nvidia.com>
+ */
+
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/phy/phy.h>
+
+#define P2U_PERIODIC_EQ_CTRL_GEN3	0xc0
+#define P2U_PERIODIC_EQ_CTRL_GEN3_PERIODIC_EQ_EN		BIT(0)
+#define P2U_PERIODIC_EQ_CTRL_GEN3_INIT_PRESET_EQ_TRAIN_EN	BIT(1)
+#define P2U_PERIODIC_EQ_CTRL_GEN4	0xc4
+#define P2U_PERIODIC_EQ_CTRL_GEN4_INIT_PRESET_EQ_TRAIN_EN	BIT(1)
+
+#define P2U_RX_DEBOUNCE_TIME				0xa4
+#define P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_MASK	0xffff
+#define P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_VAL		160
+
+struct tegra_p2u {
+	void __iomem *base;
+};
+
+static inline void p2u_writel(struct tegra_p2u *phy, const u32 value,
+			      const u32 reg)
+{
+	writel_relaxed(value, phy->base + reg);
+}
+
+static inline u32 p2u_readl(struct tegra_p2u *phy, const u32 reg)
+{
+	return readl_relaxed(phy->base + reg);
+}
+
+static int tegra_p2u_power_on(struct phy *x)
+{
+	struct tegra_p2u *phy = phy_get_drvdata(x);
+	u32 val;
+
+	val = p2u_readl(phy, P2U_PERIODIC_EQ_CTRL_GEN3);
+	val &= ~P2U_PERIODIC_EQ_CTRL_GEN3_PERIODIC_EQ_EN;
+	val |= P2U_PERIODIC_EQ_CTRL_GEN3_INIT_PRESET_EQ_TRAIN_EN;
+	p2u_writel(phy, val, P2U_PERIODIC_EQ_CTRL_GEN3);
+
+	val = p2u_readl(phy, P2U_PERIODIC_EQ_CTRL_GEN4);
+	val |= P2U_PERIODIC_EQ_CTRL_GEN4_INIT_PRESET_EQ_TRAIN_EN;
+	p2u_writel(phy, val, P2U_PERIODIC_EQ_CTRL_GEN4);
+
+	val = p2u_readl(phy, P2U_RX_DEBOUNCE_TIME);
+	val &= ~P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_MASK;
+	val |= P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_VAL;
+	p2u_writel(phy, val, P2U_RX_DEBOUNCE_TIME);
+
+	return 0;
+}
+
+static const struct phy_ops ops = {
+	.power_on = tegra_p2u_power_on,
+	.owner = THIS_MODULE,
+};
+
+static int tegra_p2u_probe(struct platform_device *pdev)
+{
+	struct phy_provider *phy_provider;
+	struct device *dev = &pdev->dev;
+	struct phy *generic_phy;
+	struct tegra_p2u *phy;
+	struct resource *res;
+
+	phy = devm_kzalloc(dev, sizeof(*phy), GFP_KERNEL);
+	if (!phy)
+		return -ENOMEM;
+
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ctl");
+	phy->base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(phy->base))
+		return PTR_ERR(phy->base);
+
+	platform_set_drvdata(pdev, phy);
+
+	generic_phy = devm_phy_create(dev, NULL, &ops);
+	if (IS_ERR(generic_phy))
+		return PTR_ERR(generic_phy);
+
+	phy_set_drvdata(generic_phy, phy);
+
+	phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate);
+	if (IS_ERR(phy_provider))
+		return PTR_ERR(phy_provider);
+
+	return 0;
+}
+
+static const struct of_device_id tegra_p2u_id_table[] = {
+	{
+		.compatible = "nvidia,tegra194-p2u",
+	},
+	{}
+};
+MODULE_DEVICE_TABLE(of, tegra_p2u_id_table);
+
+static struct platform_driver tegra_p2u_driver = {
+	.probe = tegra_p2u_probe,
+	.driver = {
+		.name = "tegra194-p2u",
+		.of_match_table = tegra_p2u_id_table,
+	},
+};
+module_platform_driver(tegra_p2u_driver);
+
+MODULE_AUTHOR("Vidya Sagar <vidyas@nvidia.com>");
+MODULE_DESCRIPTION("NVIDIA Tegra194 PIPE2UPHY PHY driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 1b67bb5..ae21d08 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -674,6 +674,7 @@
 config ASUS_WMI
 	tristate "ASUS WMI Driver"
 	depends on ACPI_WMI
+	depends on ACPI_BATTERY
 	depends on INPUT
 	depends on HWMON
 	depends on BACKLIGHT_CLASS_DEVICE
diff --git a/drivers/platform/x86/i2c-multi-instantiate.c b/drivers/platform/x86/i2c-multi-instantiate.c
index 61fe341..ea68f6e 100644
--- a/drivers/platform/x86/i2c-multi-instantiate.c
+++ b/drivers/platform/x86/i2c-multi-instantiate.c
@@ -90,7 +90,7 @@
 	for (i = 0; i < multi->num_clients && inst_data[i].type; i++) {
 		memset(&board_info, 0, sizeof(board_info));
 		strlcpy(board_info.type, inst_data[i].type, I2C_NAME_SIZE);
-		snprintf(name, sizeof(name), "%s-%s.%d", match->id,
+		snprintf(name, sizeof(name), "%s-%s.%d", dev_name(dev),
 			 inst_data[i].type, i);
 		board_info.dev_name = name;
 		switch (inst_data[i].flags & IRQ_RESOURCE_TYPE) {
diff --git a/drivers/platform/x86/pmc_atom.c b/drivers/platform/x86/pmc_atom.c
index 9aca5e7..07d1b91 100644
--- a/drivers/platform/x86/pmc_atom.c
+++ b/drivers/platform/x86/pmc_atom.c
@@ -422,6 +422,13 @@
 			DMI_MATCH(DMI_PRODUCT_VERSION, "6ES7647-8B"),
 		},
 	},
+	{
+		.ident = "SIMATIC IPC277E",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "SIEMENS AG"),
+			DMI_MATCH(DMI_PRODUCT_VERSION, "6AV7882-0"),
+		},
+	},
 	{ /*sentinel*/ }
 };
 
diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index 9c18476..67d01998 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -155,7 +155,7 @@
 			err = -EINVAL;
 			break;
 		} else if (cmd == PTP_EXTTS_REQUEST) {
-			req.extts.flags &= ~PTP_EXTTS_VALID_FLAGS;
+			req.extts.flags &= PTP_EXTTS_V1_VALID_FLAGS;
 			req.extts.rsv[0] = 0;
 			req.extts.rsv[1] = 0;
 		}
@@ -184,7 +184,7 @@
 			err = -EINVAL;
 			break;
 		} else if (cmd == PTP_PEROUT_REQUEST) {
-			req.perout.flags &= ~PTP_PEROUT_VALID_FLAGS;
+			req.perout.flags &= PTP_PEROUT_V1_VALID_FLAGS;
 			req.perout.rsv[0] = 0;
 			req.perout.rsv[1] = 0;
 			req.perout.rsv[2] = 0;
diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig
index b0e632b..e3a2518 100644
--- a/drivers/pwm/Kconfig
+++ b/drivers/pwm/Kconfig
@@ -44,7 +44,7 @@
 
 config PWM_ATMEL
 	tristate "Atmel PWM support"
-	depends on ARCH_AT91
+	depends on ARCH_AT91 && OF
 	help
 	  Generic PWM framework driver for Atmel SoC.
 
@@ -423,6 +423,17 @@
 	  To compile this driver as a module, choose M here: the module
 	  will be called pwm-spear.
 
+config PWM_SPRD
+	tristate "Spreadtrum PWM support"
+	depends on ARCH_SPRD || COMPILE_TEST
+	depends on HAS_IOMEM
+	help
+	  Generic PWM framework driver for the PWM controller on
+	  Spreadtrum SoCs.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called pwm-sprd.
+
 config PWM_STI
 	tristate "STiH4xx PWM support"
 	depends on ARCH_STI
diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile
index 76b555b..26326ad 100644
--- a/drivers/pwm/Makefile
+++ b/drivers/pwm/Makefile
@@ -41,6 +41,7 @@
 obj-$(CONFIG_PWM_SAMSUNG)	+= pwm-samsung.o
 obj-$(CONFIG_PWM_SIFIVE)	+= pwm-sifive.o
 obj-$(CONFIG_PWM_SPEAR)		+= pwm-spear.o
+obj-$(CONFIG_PWM_SPRD)		+= pwm-sprd.o
 obj-$(CONFIG_PWM_STI)		+= pwm-sti.o
 obj-$(CONFIG_PWM_STM32)		+= pwm-stm32.o
 obj-$(CONFIG_PWM_STM32_LP)	+= pwm-stm32-lp.o
diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 8edfac1..6ad51aa 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -448,36 +448,44 @@
 /**
  * pwm_apply_state() - atomically apply a new state to a PWM device
  * @pwm: PWM device
- * @state: new state to apply. This can be adjusted by the PWM driver
- *	   if the requested config is not achievable, for example,
- *	   ->duty_cycle and ->period might be approximated.
+ * @state: new state to apply
  */
-int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state)
+int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state)
 {
+	struct pwm_chip *chip;
 	int err;
 
 	if (!pwm || !state || !state->period ||
 	    state->duty_cycle > state->period)
 		return -EINVAL;
 
+	chip = pwm->chip;
+
 	if (state->period == pwm->state.period &&
 	    state->duty_cycle == pwm->state.duty_cycle &&
 	    state->polarity == pwm->state.polarity &&
 	    state->enabled == pwm->state.enabled)
 		return 0;
 
-	if (pwm->chip->ops->apply) {
-		err = pwm->chip->ops->apply(pwm->chip, pwm, state);
+	if (chip->ops->apply) {
+		err = chip->ops->apply(chip, pwm, state);
 		if (err)
 			return err;
 
-		pwm->state = *state;
+		/*
+		 * .apply might have to round some values in *state. If
+		 * possible, read back the values actually implemented in
+		 * hardware.
+		 */
+		if (chip->ops->get_state)
+			chip->ops->get_state(chip, pwm, &pwm->state);
+		else
+			pwm->state = *state;
 	} else {
 		/*
 		 * FIXME: restore the initial state in case of error.
 		 */
 		if (state->polarity != pwm->state.polarity) {
-			if (!pwm->chip->ops->set_polarity)
+			if (!chip->ops->set_polarity)
 				return -ENOTSUPP;
 
 			/*
@@ -486,12 +494,12 @@
 			 * ->apply().
 			 */
 			if (pwm->state.enabled) {
-				pwm->chip->ops->disable(pwm->chip, pwm);
+				chip->ops->disable(chip, pwm);
 				pwm->state.enabled = false;
 			}
 
-			err = pwm->chip->ops->set_polarity(pwm->chip, pwm,
-							   state->polarity);
+			err = chip->ops->set_polarity(chip, pwm,
+						      state->polarity);
 			if (err)
 				return err;
 
@@ -500,9 +508,9 @@
 
 		if (state->period != pwm->state.period ||
 		    state->duty_cycle != pwm->state.duty_cycle) {
-			err = pwm->chip->ops->config(pwm->chip, pwm,
-						     state->duty_cycle,
-						     state->period);
+			err = chip->ops->config(pwm->chip, pwm,
+						state->duty_cycle,
+						state->period);
 			if (err)
 				return err;
 
@@ -512,11 +520,11 @@
 
 		if (state->enabled != pwm->state.enabled) {
 			if (state->enabled) {
-				err = pwm->chip->ops->enable(pwm->chip, pwm);
+				err = chip->ops->enable(chip, pwm);
 				if (err)
 					return err;
 			} else {
-				pwm->chip->ops->disable(pwm->chip, pwm);
+				chip->ops->disable(chip, pwm);
 			}
 
 			pwm->state.enabled = state->enabled;
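
The resulting driver contract: .apply() must treat *state as read-only, and a driver that rounds the request should provide .get_state() so the core can read back what was actually programmed. A minimal sketch with hypothetical foo_* names:

	static int foo_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
				 const struct pwm_state *state)
	{
		/* Program the hardware; never write to *state. */
		return 0;
	}

	static void foo_pwm_get_state(struct pwm_chip *chip,
				      struct pwm_device *pwm,
				      struct pwm_state *state)
	{
		/* Report the rounded period/duty actually in effect. */
	}

	static const struct pwm_ops foo_pwm_ops = {
		.apply = foo_pwm_apply,
		.get_state = foo_pwm_get_state,
		.owner = THIS_MODULE,
	};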
diff --git a/drivers/pwm/pwm-atmel-hlcdc.c b/drivers/pwm/pwm-atmel-hlcdc.c
index d13a83f..dcbc048 100644
--- a/drivers/pwm/pwm-atmel-hlcdc.c
+++ b/drivers/pwm/pwm-atmel-hlcdc.c
@@ -39,7 +39,7 @@
 }
 
 static int atmel_hlcdc_pwm_apply(struct pwm_chip *c, struct pwm_device *pwm,
-				 struct pwm_state *state)
+				 const struct pwm_state *state)
 {
 	struct atmel_hlcdc_pwm *chip = to_atmel_hlcdc_pwm(c);
 	struct atmel_hlcdc *hlcdc = chip->hlcdc;
diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c
index e5e1eaf..9ba7334 100644
--- a/drivers/pwm/pwm-atmel.c
+++ b/drivers/pwm/pwm-atmel.c
@@ -209,7 +209,7 @@
 }
 
 static int atmel_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			   struct pwm_state *state)
+			   const struct pwm_state *state)
 {
 	struct atmel_pwm_chip *atmel_pwm = to_atmel_pwm_chip(chip);
 	struct pwm_state cstate;
@@ -318,19 +318,6 @@
 	},
 };
 
-static const struct platform_device_id atmel_pwm_devtypes[] = {
-	{
-		.name = "at91sam9rl-pwm",
-		.driver_data = (kernel_ulong_t)&atmel_sam9rl_pwm_data,
-	}, {
-		.name = "sama5d3-pwm",
-		.driver_data = (kernel_ulong_t)&atmel_sama5_pwm_data,
-	}, {
-		/* sentinel */
-	},
-};
-MODULE_DEVICE_TABLE(platform, atmel_pwm_devtypes);
-
 static const struct of_device_id atmel_pwm_dt_ids[] = {
 	{
 		.compatible = "atmel,at91sam9rl-pwm",
@@ -350,34 +337,20 @@
 };
 MODULE_DEVICE_TABLE(of, atmel_pwm_dt_ids);
 
-static inline const struct atmel_pwm_data *
-atmel_pwm_get_driver_data(struct platform_device *pdev)
-{
-	const struct platform_device_id *id;
-
-	if (pdev->dev.of_node)
-		return of_device_get_match_data(&pdev->dev);
-
-	id = platform_get_device_id(pdev);
-
-	return (struct atmel_pwm_data *)id->driver_data;
-}
-
 static int atmel_pwm_probe(struct platform_device *pdev)
 {
-	const struct atmel_pwm_data *data;
 	struct atmel_pwm_chip *atmel_pwm;
 	struct resource *res;
 	int ret;
 
-	data = atmel_pwm_get_driver_data(pdev);
-	if (!data)
-		return -ENODEV;
-
 	atmel_pwm = devm_kzalloc(&pdev->dev, sizeof(*atmel_pwm), GFP_KERNEL);
 	if (!atmel_pwm)
 		return -ENOMEM;
 
+	mutex_init(&atmel_pwm->isr_lock);
+	atmel_pwm->data = of_device_get_match_data(&pdev->dev);
+	atmel_pwm->updated_pwms = 0;
+
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	atmel_pwm->base = devm_ioremap_resource(&pdev->dev, res);
 	if (IS_ERR(atmel_pwm->base))
@@ -395,17 +368,10 @@
 
 	atmel_pwm->chip.dev = &pdev->dev;
 	atmel_pwm->chip.ops = &atmel_pwm_ops;
-
-	if (pdev->dev.of_node) {
-		atmel_pwm->chip.of_xlate = of_pwm_xlate_with_flags;
-		atmel_pwm->chip.of_pwm_n_cells = 3;
-	}
-
+	atmel_pwm->chip.of_xlate = of_pwm_xlate_with_flags;
+	atmel_pwm->chip.of_pwm_n_cells = 3;
 	atmel_pwm->chip.base = -1;
 	atmel_pwm->chip.npwm = 4;
-	atmel_pwm->data = data;
-	atmel_pwm->updated_pwms = 0;
-	mutex_init(&atmel_pwm->isr_lock);
 
 	ret = pwmchip_add(&atmel_pwm->chip);
 	if (ret < 0) {
@@ -437,7 +403,6 @@
 		.name = "atmel-pwm",
 		.of_match_table = of_match_ptr(atmel_pwm_dt_ids),
 	},
-	.id_table = atmel_pwm_devtypes,
 	.probe = atmel_pwm_probe,
 	.remove = atmel_pwm_remove,
 };
diff --git a/drivers/pwm/pwm-bcm-iproc.c b/drivers/pwm/pwm-bcm-iproc.c
index d961a82..56c38cf 100644
--- a/drivers/pwm/pwm-bcm-iproc.c
+++ b/drivers/pwm/pwm-bcm-iproc.c
@@ -115,7 +115,7 @@
 }
 
 static int iproc_pwmc_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			    struct pwm_state *state)
+			    const struct pwm_state *state)
 {
 	unsigned long prescale = IPROC_PWM_PRESCALE_MIN;
 	struct iproc_pwmc *ip = to_iproc_pwmc(chip);
diff --git a/drivers/pwm/pwm-bcm2835.c b/drivers/pwm/pwm-bcm2835.c
index f6fe0b9..91e24f0 100644
--- a/drivers/pwm/pwm-bcm2835.c
+++ b/drivers/pwm/pwm-bcm2835.c
@@ -21,7 +21,7 @@
 #define PERIOD(x)		(((x) * 0x10) + 0x10)
 #define DUTY(x)			(((x) * 0x10) + 0x14)
 
-#define MIN_PERIOD		108		/* 9.2 MHz max. PWM clock */
+#define PERIOD_MIN		0x2
 
 struct bcm2835_pwm {
 	struct pwm_chip chip;
@@ -64,6 +64,7 @@
 	struct bcm2835_pwm *pc = to_bcm2835_pwm(chip);
 	unsigned long rate = clk_get_rate(pc->clk);
 	unsigned long scaler;
+	u32 period;
 
 	if (!rate) {
 		dev_err(pc->dev, "failed to get clock rate\n");
@@ -71,17 +72,14 @@
 	}
 
 	scaler = DIV_ROUND_CLOSEST(NSEC_PER_SEC, rate);
+	period = DIV_ROUND_CLOSEST(period_ns, scaler);
 
-	if (period_ns <= MIN_PERIOD) {
-		dev_err(pc->dev, "period %d not supported, minimum %d\n",
-			period_ns, MIN_PERIOD);
+	if (period < PERIOD_MIN)
 		return -EINVAL;
-	}
 
 	writel(DIV_ROUND_CLOSEST(duty_ns, scaler),
 	       pc->base + DUTY(pwm->hwpwm));
-	writel(DIV_ROUND_CLOSEST(period_ns, scaler),
-	       pc->base + PERIOD(pwm->hwpwm));
+	writel(period, pc->base + PERIOD(pwm->hwpwm));
 
 	return 0;
 }
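
Worked example for the tick-based check, assuming a 100 MHz PWM clock (the actual rate comes from clk_get_rate()): scaler = DIV_ROUND_CLOSEST(10^9, 10^8) = 10 ns per tick, so a 15 ns request yields period = DIV_ROUND_CLOSEST(15, 10) = 2 ticks, exactly PERIOD_MIN, and is now accepted, whereas the old fixed 108 ns floor rejected it.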
@@ -155,8 +153,11 @@
 
 	pc->clk = devm_clk_get(&pdev->dev, NULL);
 	if (IS_ERR(pc->clk)) {
-		dev_err(&pdev->dev, "clock not found: %ld\n", PTR_ERR(pc->clk));
-		return PTR_ERR(pc->clk);
+		ret = PTR_ERR(pc->clk);
+		if (ret != -EPROBE_DEFER)
+			dev_err(&pdev->dev, "clock not found: %d\n", ret);
+
+		return ret;
 	}
 
 	ret = clk_prepare_enable(pc->clk);
diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c
index 85bea2d..8949744 100644
--- a/drivers/pwm/pwm-cros-ec.c
+++ b/drivers/pwm/pwm-cros-ec.c
@@ -93,7 +93,7 @@
 }
 
 static int cros_ec_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			     struct pwm_state *state)
+			     const struct pwm_state *state)
 {
 	struct cros_ec_pwm_device *ec_pwm = pwm_to_cros_ec_pwm(chip);
 	int duty_cycle;
diff --git a/drivers/pwm/pwm-fsl-ftm.c b/drivers/pwm/pwm-fsl-ftm.c
index 9d31a21..59272a9 100644
--- a/drivers/pwm/pwm-fsl-ftm.c
+++ b/drivers/pwm/pwm-fsl-ftm.c
@@ -227,7 +227,7 @@
 
 static int fsl_pwm_apply_config(struct fsl_pwm_chip *fpc,
 				struct pwm_device *pwm,
-				struct pwm_state *newstate)
+				const struct pwm_state *newstate)
 {
 	unsigned int duty;
 	u32 reg_polarity;
@@ -292,17 +292,13 @@
 
 	regmap_update_bits(fpc->regmap, FTM_POL, BIT(pwm->hwpwm), reg_polarity);
 
-	newstate->period = fsl_pwm_ticks_to_ns(fpc,
-					       fpc->period.mod_period + 1);
-	newstate->duty_cycle = fsl_pwm_ticks_to_ns(fpc, duty);
-
 	ftm_set_write_protection(fpc);
 
 	return 0;
 }
 
 static int fsl_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			 struct pwm_state *newstate)
+			 const struct pwm_state *newstate)
 {
 	struct fsl_pwm_chip *fpc = to_fsl_chip(chip);
 	struct pwm_state *oldstate = &pwm->state;
diff --git a/drivers/pwm/pwm-hibvt.c b/drivers/pwm/pwm-hibvt.c
index 753bd58..ad205fd 100644
--- a/drivers/pwm/pwm-hibvt.c
+++ b/drivers/pwm/pwm-hibvt.c
@@ -149,7 +149,7 @@
 }
 
 static int hibvt_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-				struct pwm_state *state)
+			   const struct pwm_state *state)
 {
 	struct hibvt_pwm_chip *hi_pwm_chip = to_hibvt_pwm_chip(chip);
 
diff --git a/drivers/pwm/pwm-imx-tpm.c b/drivers/pwm/pwm-imx-tpm.c
index e8385c1..9145f61 100644
--- a/drivers/pwm/pwm-imx-tpm.c
+++ b/drivers/pwm/pwm-imx-tpm.c
@@ -89,7 +89,7 @@
 static int pwm_imx_tpm_round_state(struct pwm_chip *chip,
 				   struct imx_tpm_pwm_param *p,
 				   struct pwm_state *real_state,
-				   struct pwm_state *state)
+				   const struct pwm_state *state)
 {
 	struct imx_tpm_pwm_chip *tpm = to_imx_tpm_pwm_chip(chip);
 	u32 rate, prescale, period_count, clock_unit;
@@ -289,7 +289,7 @@
 
 static int pwm_imx_tpm_apply(struct pwm_chip *chip,
 			     struct pwm_device *pwm,
-			     struct pwm_state *state)
+			     const struct pwm_state *state)
 {
 	struct imx_tpm_pwm_chip *tpm = to_imx_tpm_pwm_chip(chip);
 	struct imx_tpm_pwm_param param;
diff --git a/drivers/pwm/pwm-imx27.c b/drivers/pwm/pwm-imx27.c
index 434a351..ae11d85 100644
--- a/drivers/pwm/pwm-imx27.c
+++ b/drivers/pwm/pwm-imx27.c
@@ -3,6 +3,10 @@
  * simple driver for PWM (Pulse Width Modulator) controller
  *
  * Derived from pxa PWM driver by eric miao <eric.miao@marvell.com>
+ *
+ * Limitations:
+ * - When disabled, the output is driven to 0 independent of the configured
+ *   polarity.
  */
 
 #include <linux/bitfield.h>
@@ -205,7 +209,7 @@
 }
 
 static int pwm_imx27_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			   struct pwm_state *state)
+			   const struct pwm_state *state)
 {
 	unsigned long period_cycles, duty_cycles, prescale;
 	struct pwm_imx27_chip *imx = to_pwm_imx27_chip(chip);
diff --git a/drivers/pwm/pwm-jz4740.c b/drivers/pwm/pwm-jz4740.c
index f901e8a..9d78cc2 100644
--- a/drivers/pwm/pwm-jz4740.c
+++ b/drivers/pwm/pwm-jz4740.c
@@ -2,6 +2,11 @@
 /*
  *  Copyright (C) 2010, Lars-Peter Clausen <lars@metafoo.de>
  *  JZ4740 platform PWM support
+ *
+ * Limitations:
+ * - The .apply callback doesn't complete the currently running period before
+ *   reconfiguring the hardware.
+ * - Each period starts with the inactive part.
  */
 
 #include <linux/clk.h>
@@ -83,7 +88,7 @@
 }
 
 static int jz4740_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			    struct pwm_state *state)
+			    const struct pwm_state *state)
 {
 	struct jz4740_pwm_chip *jz4740 = to_jz4740(pwm->chip);
 	unsigned long long tmp;
diff --git a/drivers/pwm/pwm-lpss.c b/drivers/pwm/pwm-lpss.c
index 4098a46..75bbfe5 100644
--- a/drivers/pwm/pwm-lpss.c
+++ b/drivers/pwm/pwm-lpss.c
@@ -122,7 +122,7 @@
 }
 
 static int pwm_lpss_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			  struct pwm_state *state)
+			  const struct pwm_state *state)
 {
 	struct pwm_lpss_chip *lpwm = to_lpwm(chip);
 	int ret;
diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c
index eb6674c..b94e0d0 100644
--- a/drivers/pwm/pwm-mediatek.c
+++ b/drivers/pwm/pwm-mediatek.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
- * Mediatek Pulse Width Modulator driver
+ * MediaTek Pulse Width Modulator driver
  *
  * Copyright (C) 2015 John Crispin <blogic@openwrt.org>
  * Copyright (C) 2017 Zhi Mao <zhi.mao@mediatek.com>
  *
- * This file is licensed under the terms of the GNU General Public
- * License version 2. This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
  */
 
 #include <linux/err.h>
@@ -35,125 +33,107 @@
 
 #define PWM_CLK_DIV_MAX		7
 
-enum {
-	MTK_CLK_MAIN = 0,
-	MTK_CLK_TOP,
-	MTK_CLK_PWM1,
-	MTK_CLK_PWM2,
-	MTK_CLK_PWM3,
-	MTK_CLK_PWM4,
-	MTK_CLK_PWM5,
-	MTK_CLK_PWM6,
-	MTK_CLK_PWM7,
-	MTK_CLK_PWM8,
-	MTK_CLK_MAX,
-};
-
-static const char * const mtk_pwm_clk_name[MTK_CLK_MAX] = {
-	"main", "top", "pwm1", "pwm2", "pwm3", "pwm4", "pwm5", "pwm6", "pwm7",
-	"pwm8"
-};
-
-struct mtk_pwm_platform_data {
+struct pwm_mediatek_of_data {
 	unsigned int num_pwms;
 	bool pwm45_fixup;
-	bool has_clks;
 };
 
 /**
- * struct mtk_pwm_chip - struct representing PWM chip
+ * struct pwm_mediatek_chip - struct representing PWM chip
  * @chip: linux PWM chip representation
  * @regs: base address of PWM chip
- * @clks: list of clocks
+ * @clk_top: the top clock generator
+ * @clk_main: the clock used by the PWM core
+ * @clk_pwms: the clock used by each PWM channel
  */
-struct mtk_pwm_chip {
+struct pwm_mediatek_chip {
 	struct pwm_chip chip;
 	void __iomem *regs;
-	struct clk *clks[MTK_CLK_MAX];
-	const struct mtk_pwm_platform_data *soc;
+	struct clk *clk_top;
+	struct clk *clk_main;
+	struct clk **clk_pwms;
+	const struct pwm_mediatek_of_data *soc;
 };
 
-static const unsigned int mtk_pwm_reg_offset[] = {
+static const unsigned int pwm_mediatek_reg_offset[] = {
 	0x0010, 0x0050, 0x0090, 0x00d0, 0x0110, 0x0150, 0x0190, 0x0220
 };
 
-static inline struct mtk_pwm_chip *to_mtk_pwm_chip(struct pwm_chip *chip)
+static inline struct pwm_mediatek_chip *
+to_pwm_mediatek_chip(struct pwm_chip *chip)
 {
-	return container_of(chip, struct mtk_pwm_chip, chip);
+	return container_of(chip, struct pwm_mediatek_chip, chip);
 }
 
-static int mtk_pwm_clk_enable(struct pwm_chip *chip, struct pwm_device *pwm)
+static int pwm_mediatek_clk_enable(struct pwm_chip *chip,
+				   struct pwm_device *pwm)
 {
-	struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip);
+	struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip);
 	int ret;
 
-	if (!pc->soc->has_clks)
-		return 0;
-
-	ret = clk_prepare_enable(pc->clks[MTK_CLK_TOP]);
+	ret = clk_prepare_enable(pc->clk_top);
 	if (ret < 0)
 		return ret;
 
-	ret = clk_prepare_enable(pc->clks[MTK_CLK_MAIN]);
+	ret = clk_prepare_enable(pc->clk_main);
 	if (ret < 0)
 		goto disable_clk_top;
 
-	ret = clk_prepare_enable(pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]);
+	ret = clk_prepare_enable(pc->clk_pwms[pwm->hwpwm]);
 	if (ret < 0)
 		goto disable_clk_main;
 
 	return 0;
 
 disable_clk_main:
-	clk_disable_unprepare(pc->clks[MTK_CLK_MAIN]);
+	clk_disable_unprepare(pc->clk_main);
 disable_clk_top:
-	clk_disable_unprepare(pc->clks[MTK_CLK_TOP]);
+	clk_disable_unprepare(pc->clk_top);
 
 	return ret;
 }
 
-static void mtk_pwm_clk_disable(struct pwm_chip *chip, struct pwm_device *pwm)
+static void pwm_mediatek_clk_disable(struct pwm_chip *chip,
+				     struct pwm_device *pwm)
 {
-	struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip);
+	struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip);
 
-	if (!pc->soc->has_clks)
-		return;
-
-	clk_disable_unprepare(pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]);
-	clk_disable_unprepare(pc->clks[MTK_CLK_MAIN]);
-	clk_disable_unprepare(pc->clks[MTK_CLK_TOP]);
+	clk_disable_unprepare(pc->clk_pwms[pwm->hwpwm]);
+	clk_disable_unprepare(pc->clk_main);
+	clk_disable_unprepare(pc->clk_top);
 }
 
-static inline u32 mtk_pwm_readl(struct mtk_pwm_chip *chip, unsigned int num,
-				unsigned int offset)
+static inline u32 pwm_mediatek_readl(struct pwm_mediatek_chip *chip,
+				     unsigned int num, unsigned int offset)
 {
-	return readl(chip->regs + mtk_pwm_reg_offset[num] + offset);
+	return readl(chip->regs + pwm_mediatek_reg_offset[num] + offset);
 }
 
-static inline void mtk_pwm_writel(struct mtk_pwm_chip *chip,
-				  unsigned int num, unsigned int offset,
-				  u32 value)
+static inline void pwm_mediatek_writel(struct pwm_mediatek_chip *chip,
+				       unsigned int num, unsigned int offset,
+				       u32 value)
 {
-	writel(value, chip->regs + mtk_pwm_reg_offset[num] + offset);
+	writel(value, chip->regs + pwm_mediatek_reg_offset[num] + offset);
 }
 
-static int mtk_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
-			  int duty_ns, int period_ns)
+static int pwm_mediatek_config(struct pwm_chip *chip, struct pwm_device *pwm,
+			       int duty_ns, int period_ns)
 {
-	struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip);
-	struct clk *clk = pc->clks[MTK_CLK_PWM1 + pwm->hwpwm];
+	struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip);
 	u32 clkdiv = 0, cnt_period, cnt_duty, reg_width = PWMDWIDTH,
 	    reg_thres = PWMTHRES;
 	u64 resolution;
 	int ret;
 
-	ret = mtk_pwm_clk_enable(chip, pwm);
+	ret = pwm_mediatek_clk_enable(chip, pwm);
 	if (ret < 0)
 		return ret;
 
 	/* Using a resolution in picoseconds yields higher accuracy */
 	resolution = (u64)NSEC_PER_SEC * 1000;
-	do_div(resolution, clk_get_rate(clk));
+	do_div(resolution, clk_get_rate(pc->clk_pwms[pwm->hwpwm]));
 
 	cnt_period = DIV_ROUND_CLOSEST_ULL((u64)period_ns * 1000, resolution);
 	while (cnt_period > 8191) {
@@ -164,7 +144,7 @@
 	}
 
 	if (clkdiv > PWM_CLK_DIV_MAX) {
-		mtk_pwm_clk_disable(chip, pwm);
+		pwm_mediatek_clk_disable(chip, pwm);
 		dev_err(chip->dev, "period %d not supported\n", period_ns);
 		return -EINVAL;
 	}
@@ -179,22 +159,22 @@
 	}
 
 	cnt_duty = DIV_ROUND_CLOSEST_ULL((u64)duty_ns * 1000, resolution);
-	mtk_pwm_writel(pc, pwm->hwpwm, PWMCON, BIT(15) | clkdiv);
-	mtk_pwm_writel(pc, pwm->hwpwm, reg_width, cnt_period);
-	mtk_pwm_writel(pc, pwm->hwpwm, reg_thres, cnt_duty);
+	pwm_mediatek_writel(pc, pwm->hwpwm, PWMCON, BIT(15) | clkdiv);
+	pwm_mediatek_writel(pc, pwm->hwpwm, reg_width, cnt_period);
+	pwm_mediatek_writel(pc, pwm->hwpwm, reg_thres, cnt_duty);
 
-	mtk_pwm_clk_disable(chip, pwm);
+	pwm_mediatek_clk_disable(chip, pwm);
 
 	return 0;
 }
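
Worked example for the computation above, assuming a 25 MHz source clock (the real rate comes from clk_get_rate()): resolution = 10^9 * 1000 / 25,000,000 = 40,000 ps per tick, so a 1 ms period gives cnt_period = 25,000; the clkdiv loop then doubles the tick length until the count fits, 25,000 -> 12,500 -> 6,250 ticks at clkdiv = 2, within the 8191 limit.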
 
-static int mtk_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
+static int pwm_mediatek_enable(struct pwm_chip *chip, struct pwm_device *pwm)
 {
-	struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip);
+	struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip);
 	u32 value;
 	int ret;
 
-	ret = mtk_pwm_clk_enable(chip, pwm);
+	ret = pwm_mediatek_clk_enable(chip, pwm);
 	if (ret < 0)
 		return ret;
 
@@ -205,29 +185,28 @@
 	return 0;
 }
 
-static void mtk_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
+static void pwm_mediatek_disable(struct pwm_chip *chip, struct pwm_device *pwm)
 {
-	struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip);
+	struct pwm_mediatek_chip *pc = to_pwm_mediatek_chip(chip);
 	u32 value;
 
 	value = readl(pc->regs);
 	value &= ~BIT(pwm->hwpwm);
 	writel(value, pc->regs);
 
-	mtk_pwm_clk_disable(chip, pwm);
+	pwm_mediatek_clk_disable(chip, pwm);
 }
 
-static const struct pwm_ops mtk_pwm_ops = {
-	.config = mtk_pwm_config,
-	.enable = mtk_pwm_enable,
-	.disable = mtk_pwm_disable,
+static const struct pwm_ops pwm_mediatek_ops = {
+	.config = pwm_mediatek_config,
+	.enable = pwm_mediatek_enable,
+	.disable = pwm_mediatek_disable,
 	.owner = THIS_MODULE,
 };
 
-static int mtk_pwm_probe(struct platform_device *pdev)
+static int pwm_mediatek_probe(struct platform_device *pdev)
 {
-	const struct mtk_pwm_platform_data *data;
-	struct mtk_pwm_chip *pc;
+	struct pwm_mediatek_chip *pc;
 	struct resource *res;
 	unsigned int i;
 	int ret;
@@ -236,31 +215,51 @@
 	if (!pc)
 		return -ENOMEM;
 
-	data = of_device_get_match_data(&pdev->dev);
-	if (data == NULL)
-		return -EINVAL;
-	pc->soc = data;
+	pc->soc = of_device_get_match_data(&pdev->dev);
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	pc->regs = devm_ioremap_resource(&pdev->dev, res);
 	if (IS_ERR(pc->regs))
 		return PTR_ERR(pc->regs);
 
-	for (i = 0; i < data->num_pwms + 2 && pc->soc->has_clks; i++) {
-		pc->clks[i] = devm_clk_get(&pdev->dev, mtk_pwm_clk_name[i]);
-		if (IS_ERR(pc->clks[i])) {
+	pc->clk_pwms = devm_kcalloc(&pdev->dev, pc->soc->num_pwms,
+				    sizeof(*pc->clk_pwms), GFP_KERNEL);
+	if (!pc->clk_pwms)
+		return -ENOMEM;
+
+	pc->clk_top = devm_clk_get(&pdev->dev, "top");
+	if (IS_ERR(pc->clk_top)) {
+		dev_err(&pdev->dev, "clock: top fail: %ld\n",
+			PTR_ERR(pc->clk_top));
+		return PTR_ERR(pc->clk_top);
+	}
+
+	pc->clk_main = devm_clk_get(&pdev->dev, "main");
+	if (IS_ERR(pc->clk_main)) {
+		dev_err(&pdev->dev, "clock: main fail: %ld\n",
+			PTR_ERR(pc->clk_main));
+		return PTR_ERR(pc->clk_main);
+	}
+
+	for (i = 0; i < pc->soc->num_pwms; i++) {
+		char name[8];
+
+		snprintf(name, sizeof(name), "pwm%d", i + 1);
+
+		pc->clk_pwms[i] = devm_clk_get(&pdev->dev, name);
+		if (IS_ERR(pc->clk_pwms[i])) {
 			dev_err(&pdev->dev, "clock: %s fail: %ld\n",
-				mtk_pwm_clk_name[i], PTR_ERR(pc->clks[i]));
-			return PTR_ERR(pc->clks[i]);
+				name, PTR_ERR(pc->clk_pwms[i]));
+			return PTR_ERR(pc->clk_pwms[i]);
 		}
 	}
 
 	platform_set_drvdata(pdev, pc);
 
 	pc->chip.dev = &pdev->dev;
-	pc->chip.ops = &mtk_pwm_ops;
+	pc->chip.ops = &pwm_mediatek_ops;
 	pc->chip.base = -1;
-	pc->chip.npwm = data->num_pwms;
+	pc->chip.npwm = pc->soc->num_pwms;
 
 	ret = pwmchip_add(&pc->chip);
 	if (ret < 0) {
@@ -271,55 +270,63 @@
 	return 0;
 }
 
-static int mtk_pwm_remove(struct platform_device *pdev)
+static int pwm_mediatek_remove(struct platform_device *pdev)
 {
-	struct mtk_pwm_chip *pc = platform_get_drvdata(pdev);
+	struct pwm_mediatek_chip *pc = platform_get_drvdata(pdev);
 
 	return pwmchip_remove(&pc->chip);
 }
 
-static const struct mtk_pwm_platform_data mt2712_pwm_data = {
+static const struct pwm_mediatek_of_data mt2712_pwm_data = {
 	.num_pwms = 8,
 	.pwm45_fixup = false,
-	.has_clks = true,
 };
 
-static const struct mtk_pwm_platform_data mt7622_pwm_data = {
+static const struct pwm_mediatek_of_data mt7622_pwm_data = {
 	.num_pwms = 6,
 	.pwm45_fixup = false,
-	.has_clks = true,
 };
 
-static const struct mtk_pwm_platform_data mt7623_pwm_data = {
+static const struct pwm_mediatek_of_data mt7623_pwm_data = {
 	.num_pwms = 5,
 	.pwm45_fixup = true,
-	.has_clks = true,
 };
 
-static const struct mtk_pwm_platform_data mt7628_pwm_data = {
+static const struct pwm_mediatek_of_data mt7628_pwm_data = {
 	.num_pwms = 4,
 	.pwm45_fixup = true,
-	.has_clks = false,
 };
 
-static const struct of_device_id mtk_pwm_of_match[] = {
+static const struct pwm_mediatek_of_data mt7629_pwm_data = {
+	.num_pwms = 1,
+	.pwm45_fixup = false,
+};
+
+static const struct pwm_mediatek_of_data mt8516_pwm_data = {
+	.num_pwms = 5,
+	.pwm45_fixup = false,
+};
+
+static const struct of_device_id pwm_mediatek_of_match[] = {
 	{ .compatible = "mediatek,mt2712-pwm", .data = &mt2712_pwm_data },
 	{ .compatible = "mediatek,mt7622-pwm", .data = &mt7622_pwm_data },
 	{ .compatible = "mediatek,mt7623-pwm", .data = &mt7623_pwm_data },
 	{ .compatible = "mediatek,mt7628-pwm", .data = &mt7628_pwm_data },
+	{ .compatible = "mediatek,mt7629-pwm", .data = &mt7629_pwm_data },
+	{ .compatible = "mediatek,mt8516-pwm", .data = &mt8516_pwm_data },
 	{ },
 };
-MODULE_DEVICE_TABLE(of, mtk_pwm_of_match);
+MODULE_DEVICE_TABLE(of, pwm_mediatek_of_match);
 
-static struct platform_driver mtk_pwm_driver = {
+static struct platform_driver pwm_mediatek_driver = {
 	.driver = {
-		.name = "mtk-pwm",
-		.of_match_table = mtk_pwm_of_match,
+		.name = "pwm-mediatek",
+		.of_match_table = pwm_mediatek_of_match,
 	},
-	.probe = mtk_pwm_probe,
-	.remove = mtk_pwm_remove,
+	.probe = pwm_mediatek_probe,
+	.remove = pwm_mediatek_remove,
 };
-module_platform_driver(mtk_pwm_driver);
+module_platform_driver(pwm_mediatek_driver);
 
 MODULE_AUTHOR("John Crispin <blogic@openwrt.org>");
-MODULE_LICENSE("GPL");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/pwm/pwm-meson.c b/drivers/pwm/pwm-meson.c
index 3cbff5c..6245bbd 100644
--- a/drivers/pwm/pwm-meson.c
+++ b/drivers/pwm/pwm-meson.c
@@ -159,7 +159,7 @@
 }
 
 static int meson_pwm_calc(struct meson_pwm *meson, struct pwm_device *pwm,
-			  struct pwm_state *state)
+			  const struct pwm_state *state)
 {
 	struct meson_pwm_channel *channel = pwm_get_chip_data(pwm);
 	unsigned int duty, period, pre_div, cnt, duty_cnt;
@@ -265,7 +265,7 @@
 }
 
 static int meson_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			   struct pwm_state *state)
+			   const struct pwm_state *state)
 {
 	struct meson_pwm_channel *channel = pwm_get_chip_data(pwm);
 	struct meson_pwm *meson = to_meson_pwm(chip);
diff --git a/drivers/pwm/pwm-mxs.c b/drivers/pwm/pwm-mxs.c
index 04c0f6b..b14376b 100644
--- a/drivers/pwm/pwm-mxs.c
+++ b/drivers/pwm/pwm-mxs.c
@@ -126,15 +126,13 @@
 {
 	struct device_node *np = pdev->dev.of_node;
 	struct mxs_pwm_chip *mxs;
-	struct resource *res;
 	int ret;
 
 	mxs = devm_kzalloc(&pdev->dev, sizeof(*mxs), GFP_KERNEL);
 	if (!mxs)
 		return -ENOMEM;
 
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	mxs->base = devm_ioremap_resource(&pdev->dev, res);
+	mxs->base = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(mxs->base))
 		return PTR_ERR(mxs->base);
 
diff --git a/drivers/pwm/pwm-rcar.c b/drivers/pwm/pwm-rcar.c
index 5b2b8ec..852eb23 100644
--- a/drivers/pwm/pwm-rcar.c
+++ b/drivers/pwm/pwm-rcar.c
@@ -158,7 +158,7 @@
 }
 
 static int rcar_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			  struct pwm_state *state)
+			  const struct pwm_state *state)
 {
 	struct rcar_pwm_chip *rp = to_rcar_pwm_chip(chip);
 	struct pwm_state cur_state;
@@ -187,7 +187,7 @@
 	/* The SYNC should be set to 0 even if rcar_pwm_set_counter failed */
 	rcar_pwm_update(rp, RCAR_PWMCR_SYNC, 0, RCAR_PWMCR);
 
-	if (!ret && state->enabled)
+	if (!ret)
 		ret = rcar_pwm_enable(rp);
 
 	return ret;
diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c
index 51b96cb..73352e6 100644
--- a/drivers/pwm/pwm-rockchip.c
+++ b/drivers/pwm/pwm-rockchip.c
@@ -90,16 +90,16 @@
 		state->enabled = ((val & enable_conf) == enable_conf) ?
 				 true : false;
 
-	if (pc->data->supports_polarity) {
-		if (!(val & PWM_DUTY_POSITIVE))
-			state->polarity = PWM_POLARITY_INVERSED;
-	}
+	if (pc->data->supports_polarity && !(val & PWM_DUTY_POSITIVE))
+		state->polarity = PWM_POLARITY_INVERSED;
+	else
+		state->polarity = PWM_POLARITY_NORMAL;
 
 	clk_disable(pc->pclk);
 }
 
 static void rockchip_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
-			       struct pwm_state *state)
+			       const struct pwm_state *state)
 {
 	struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip);
 	unsigned long period, duty;
@@ -183,7 +183,7 @@
 }
 
 static int rockchip_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			      struct pwm_state *state)
+			      const struct pwm_state *state)
 {
 	struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip);
 	struct pwm_state curstate;
@@ -212,12 +212,6 @@
 			goto out;
 	}
 
-	/*
-	 * Update the state with the real hardware, which can differ a bit
-	 * because of period/duty_cycle approximation.
-	 */
-	rockchip_pwm_get_state(chip, pwm, state);
-
 out:
 	clk_disable(pc->pclk);
 
diff --git a/drivers/pwm/pwm-sifive.c b/drivers/pwm/pwm-sifive.c
index a7c107f..cc63f9b 100644
--- a/drivers/pwm/pwm-sifive.c
+++ b/drivers/pwm/pwm-sifive.c
@@ -147,7 +147,7 @@
 }
 
 static int pwm_sifive_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			    struct pwm_state *state)
+			    const struct pwm_state *state)
 {
 	struct pwm_sifive_ddata *ddata = pwm_sifive_chip_to_ddata(chip);
 	struct pwm_state cur_state;
@@ -250,10 +250,8 @@
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	ddata->regs = devm_ioremap_resource(dev, res);
-	if (IS_ERR(ddata->regs)) {
-		dev_err(dev, "Unable to map IO resources\n");
+	if (IS_ERR(ddata->regs))
 		return PTR_ERR(ddata->regs);
-	}
 
 	ddata->clk = devm_clk_get(dev, NULL);
 	if (IS_ERR(ddata->clk)) {
diff --git a/drivers/pwm/pwm-sprd.c b/drivers/pwm/pwm-sprd.c
new file mode 100644
index 0000000..be23942
--- /dev/null
+++ b/drivers/pwm/pwm-sprd.c
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Spreadtrum Communications Inc.
+ */
+
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/pwm.h>
+
+#define SPRD_PWM_PRESCALE	0x0
+#define SPRD_PWM_MOD		0x4
+#define SPRD_PWM_DUTY		0x8
+#define SPRD_PWM_ENABLE		0x18
+
+#define SPRD_PWM_MOD_MAX	GENMASK(7, 0)
+#define SPRD_PWM_DUTY_MSK	GENMASK(15, 0)
+#define SPRD_PWM_PRESCALE_MSK	GENMASK(7, 0)
+#define SPRD_PWM_ENABLE_BIT	BIT(0)
+
+#define SPRD_PWM_CHN_NUM	4
+#define SPRD_PWM_REGS_SHIFT	5
+#define SPRD_PWM_CHN_CLKS_NUM	2
+#define SPRD_PWM_CHN_OUTPUT_CLK	1
+
+struct sprd_pwm_chn {
+	struct clk_bulk_data clks[SPRD_PWM_CHN_CLKS_NUM];
+	u32 clk_rate;
+};
+
+struct sprd_pwm_chip {
+	void __iomem *base;
+	struct device *dev;
+	struct pwm_chip chip;
+	int num_pwms;
+	struct sprd_pwm_chn chn[SPRD_PWM_CHN_NUM];
+};
+
+/*
+ * The list of clocks required by the PWM channels. Each channel has 2
+ * clocks: an enable clock and a pwm clock.
+ */
+static const char * const sprd_pwm_clks[] = {
+	"enable0", "pwm0",
+	"enable1", "pwm1",
+	"enable2", "pwm2",
+	"enable3", "pwm3",
+};
+
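+/*
+ * Each channel has its own bank of registers, 1 << SPRD_PWM_REGS_SHIFT =
+ * 32 bytes apart, so a channel register lives at
+ * base + reg + (hwid << SPRD_PWM_REGS_SHIFT).
+ */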
+static u32 sprd_pwm_read(struct sprd_pwm_chip *spc, u32 hwid, u32 reg)
+{
+	u32 offset = reg + (hwid << SPRD_PWM_REGS_SHIFT);
+
+	return readl_relaxed(spc->base + offset);
+}
+
+static void sprd_pwm_write(struct sprd_pwm_chip *spc, u32 hwid,
+			   u32 reg, u32 val)
+{
+	u32 offset = reg + (hwid << SPRD_PWM_REGS_SHIFT);
+
+	writel_relaxed(val, spc->base + offset);
+}
+
+static void sprd_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
+			       struct pwm_state *state)
+{
+	struct sprd_pwm_chip *spc =
+		container_of(chip, struct sprd_pwm_chip, chip);
+	struct sprd_pwm_chn *chn = &spc->chn[pwm->hwpwm];
+	u32 val, duty, prescale;
+	u64 tmp;
+	int ret;
+
+	/*
+	 * The clocks to the PWM channel have to be enabled first before
+	 * reading the registers.
+	 */
+	ret = clk_bulk_prepare_enable(SPRD_PWM_CHN_CLKS_NUM, chn->clks);
+	if (ret) {
+		dev_err(spc->dev, "failed to enable pwm%u clocks\n",
+			pwm->hwpwm);
+		return;
+	}
+
+	val = sprd_pwm_read(spc, pwm->hwpwm, SPRD_PWM_ENABLE);
+	if (val & SPRD_PWM_ENABLE_BIT)
+		state->enabled = true;
+	else
+		state->enabled = false;
+
+	/*
+	 * The hardware provides a counter that is fed by the source clock.
+	 * The period length is (PRESCALE + 1) * MOD counter steps.
+	 * The duty cycle length is (PRESCALE + 1) * DUTY counter steps.
+	 * Thus the period_ns and duty_ns calculation formula should be:
+	 * period_ns = NSEC_PER_SEC * (prescale + 1) * mod / clk_rate
+	 * duty_ns = NSEC_PER_SEC * (prescale + 1) * duty / clk_rate
+	 */
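+	/*
+	 * Worked example (hypothetical 26 MHz source clock): with
+	 * prescale = 0 and mod = SPRD_PWM_MOD_MAX = 255,
+	 * period_ns = 1e9 * 1 * 255 / 26000000 ~= 9808 ns.
+	 */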
+	val = sprd_pwm_read(spc, pwm->hwpwm, SPRD_PWM_PRESCALE);
+	prescale = val & SPRD_PWM_PRESCALE_MSK;
+	tmp = (prescale + 1) * NSEC_PER_SEC * SPRD_PWM_MOD_MAX;
+	state->period = DIV_ROUND_CLOSEST_ULL(tmp, chn->clk_rate);
+
+	val = sprd_pwm_read(spc, pwm->hwpwm, SPRD_PWM_DUTY);
+	duty = val & SPRD_PWM_DUTY_MSK;
+	tmp = (prescale + 1) * NSEC_PER_SEC * duty;
+	state->duty_cycle = DIV_ROUND_CLOSEST_ULL(tmp, chn->clk_rate);
+
+	/* Disable PWM clocks if the PWM channel is not in enable state. */
+	if (!state->enabled)
+		clk_bulk_disable_unprepare(SPRD_PWM_CHN_CLKS_NUM, chn->clks);
+}
+
+static int sprd_pwm_config(struct sprd_pwm_chip *spc, struct pwm_device *pwm,
+			   int duty_ns, int period_ns)
+{
+	struct sprd_pwm_chn *chn = &spc->chn[pwm->hwpwm];
+	u32 prescale, duty;
+	u64 tmp;
+
+	/*
+	 * The hardware provides a counter that is fed by the source clock.
+	 * The period length is (PRESCALE + 1) * MOD counter steps.
+	 * The duty cycle length is (PRESCALE + 1) * DUTY counter steps.
+	 *
+	 * To keep the maths simple we're always using MOD = SPRD_PWM_MOD_MAX.
+	 * The value for PRESCALE is selected such that the resulting period
+	 * gets the maximal length not bigger than the requested one with the
+	 * given settings (MOD = SPRD_PWM_MOD_MAX and input clock).
+	 */
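+	/*
+	 * Worked example (hypothetical 26 MHz source clock): a requested
+	 * period of 40000 ns gives tmp = 26000000 * 40000 / 1e9 = 1040
+	 * cycles, so prescale = DIV_ROUND_CLOSEST(1040, 255) - 1 = 3 and
+	 * the realized period is 4 * 255 / 26 MHz ~= 39231 ns.
+	 */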
+	duty = duty_ns * SPRD_PWM_MOD_MAX / period_ns;
+
+	tmp = (u64)chn->clk_rate * period_ns;
+	do_div(tmp, NSEC_PER_SEC);
+	prescale = DIV_ROUND_CLOSEST_ULL(tmp, SPRD_PWM_MOD_MAX) - 1;
+	if (prescale > SPRD_PWM_PRESCALE_MSK)
+		prescale = SPRD_PWM_PRESCALE_MSK;
+
+	/*
+	 * Note: Writing DUTY triggers the hardware to actually apply the
+	 * values written to MOD and DUTY to the output, so DUTY must be
+	 * written last.
+	 *
+	 * The hardware ensures that the current running period is completed
+	 * before a new configuration is applied, to avoid mixed settings.
+	 */
+	sprd_pwm_write(spc, pwm->hwpwm, SPRD_PWM_PRESCALE, prescale);
+	sprd_pwm_write(spc, pwm->hwpwm, SPRD_PWM_MOD, SPRD_PWM_MOD_MAX);
+	sprd_pwm_write(spc, pwm->hwpwm, SPRD_PWM_DUTY, duty);
+
+	return 0;
+}
+
+static int sprd_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
+			  const struct pwm_state *state)
+{
+	struct sprd_pwm_chip *spc =
+		container_of(chip, struct sprd_pwm_chip, chip);
+	struct sprd_pwm_chn *chn = &spc->chn[pwm->hwpwm];
+	struct pwm_state *cstate = &pwm->state;
+	int ret;
+
+	if (state->enabled) {
+		if (!cstate->enabled) {
+			/*
+			 * The clocks to the PWM channel have to be enabled
+			 * first before writing to the registers.
+			 */
+			ret = clk_bulk_prepare_enable(SPRD_PWM_CHN_CLKS_NUM,
+						      chn->clks);
+			if (ret) {
+				dev_err(spc->dev,
+					"failed to enable pwm%u clocks\n",
+					pwm->hwpwm);
+				return ret;
+			}
+		}
+
+		if (state->period != cstate->period ||
+		    state->duty_cycle != cstate->duty_cycle) {
+			ret = sprd_pwm_config(spc, pwm, state->duty_cycle,
+					      state->period);
+			if (ret)
+				return ret;
+		}
+
+		sprd_pwm_write(spc, pwm->hwpwm, SPRD_PWM_ENABLE, 1);
+	} else if (cstate->enabled) {
+		/*
+		 * Note: After setting SPRD_PWM_ENABLE to zero, the controller
+		 * will not wait for the current period to be completed;
+		 * instead it will stop the PWM channel immediately.
+		 */
+		sprd_pwm_write(spc, pwm->hwpwm, SPRD_PWM_ENABLE, 0);
+
+		clk_bulk_disable_unprepare(SPRD_PWM_CHN_CLKS_NUM, chn->clks);
+	}
+
+	return 0;
+}
+
+static const struct pwm_ops sprd_pwm_ops = {
+	.apply = sprd_pwm_apply,
+	.get_state = sprd_pwm_get_state,
+	.owner = THIS_MODULE,
+};
+
+static int sprd_pwm_clk_init(struct sprd_pwm_chip *spc)
+{
+	struct clk *clk_pwm;
+	int ret, i;
+
+	for (i = 0; i < SPRD_PWM_CHN_NUM; i++) {
+		struct sprd_pwm_chn *chn = &spc->chn[i];
+		int j;
+
+		for (j = 0; j < SPRD_PWM_CHN_CLKS_NUM; ++j)
+			chn->clks[j].id =
+				sprd_pwm_clks[i * SPRD_PWM_CHN_CLKS_NUM + j];
+
+		ret = devm_clk_bulk_get(spc->dev, SPRD_PWM_CHN_CLKS_NUM,
+					chn->clks);
+		if (ret) {
+			if (ret == -ENOENT)
+				break;
+
+			if (ret != -EPROBE_DEFER)
+				dev_err(spc->dev,
+					"failed to get channel clocks\n");
+
+			return ret;
+		}
+
+		clk_pwm = chn->clks[SPRD_PWM_CHN_OUTPUT_CLK].clk;
+		chn->clk_rate = clk_get_rate(clk_pwm);
+	}
+
+	if (!i) {
+		dev_err(spc->dev, "no available PWM channels\n");
+		return -ENODEV;
+	}
+
+	spc->num_pwms = i;
+
+	return 0;
+}
+
+static int sprd_pwm_probe(struct platform_device *pdev)
+{
+	struct sprd_pwm_chip *spc;
+	int ret;
+
+	spc = devm_kzalloc(&pdev->dev, sizeof(*spc), GFP_KERNEL);
+	if (!spc)
+		return -ENOMEM;
+
+	spc->base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(spc->base))
+		return PTR_ERR(spc->base);
+
+	spc->dev = &pdev->dev;
+	platform_set_drvdata(pdev, spc);
+
+	ret = sprd_pwm_clk_init(spc);
+	if (ret)
+		return ret;
+
+	spc->chip.dev = &pdev->dev;
+	spc->chip.ops = &sprd_pwm_ops;
+	spc->chip.base = -1;
+	spc->chip.npwm = spc->num_pwms;
+
+	ret = pwmchip_add(&spc->chip);
+	if (ret)
+		dev_err(&pdev->dev, "failed to add PWM chip\n");
+
+	return ret;
+}
+
+static int sprd_pwm_remove(struct platform_device *pdev)
+{
+	struct sprd_pwm_chip *spc = platform_get_drvdata(pdev);
+
+	return pwmchip_remove(&spc->chip);
+}
+
+static const struct of_device_id sprd_pwm_of_match[] = {
+	{ .compatible = "sprd,ums512-pwm", },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, sprd_pwm_of_match);
+
+static struct platform_driver sprd_pwm_driver = {
+	.driver = {
+		.name = "sprd-pwm",
+		.of_match_table = sprd_pwm_of_match,
+	},
+	.probe = sprd_pwm_probe,
+	.remove = sprd_pwm_remove,
+};
+
+module_platform_driver(sprd_pwm_driver);
+
+MODULE_DESCRIPTION("Spreadtrum PWM Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c
index 20450e3..1508616 100644
--- a/drivers/pwm/pwm-sti.c
+++ b/drivers/pwm/pwm-sti.c
@@ -564,10 +564,8 @@
 		return PTR_ERR(pc->regmap);
 
 	irq = platform_get_irq(pdev, 0);
-	if (irq < 0) {
-		dev_err(&pdev->dev, "Failed to obtain IRQ\n");
+	if (irq < 0)
 		return irq;
-	}
 
 	ret = devm_request_irq(&pdev->dev, irq, sti_pwm_interrupt, 0,
 			       pdev->name, pc);
diff --git a/drivers/pwm/pwm-stm32-lp.c b/drivers/pwm/pwm-stm32-lp.c
index 2211a64..67fca62 100644
--- a/drivers/pwm/pwm-stm32-lp.c
+++ b/drivers/pwm/pwm-stm32-lp.c
@@ -32,7 +32,7 @@
 #define STM32_LPTIM_MAX_PRESCALER	128
 
 static int stm32_pwm_lp_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			      struct pwm_state *state)
+			      const struct pwm_state *state)
 {
 	struct stm32_pwm_lp *priv = to_stm32_pwm_lp(chip);
 	unsigned long long prd, div, dty;
@@ -59,6 +59,12 @@
 	/* Calculate the period and prescaler value */
 	div = (unsigned long long)clk_get_rate(priv->clk) * state->period;
 	do_div(div, NSEC_PER_SEC);
+	if (!div) {
+		/* Clock is too slow to achieve requested period. */
+		dev_dbg(priv->chip.dev, "Can't reach %u ns\n", state->period);
+		return -EINVAL;
+	}
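+	/*
+	 * E.g. with a (hypothetical) 32768 Hz LSE clock source, any
+	 * period below NSEC_PER_SEC / 32768 ~= 30518 ns makes div zero.
+	 */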
+
 	prd = div;
 	while (div > STM32_LPTIM_MAX_ARR) {
 		presc++;
diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c
index 740e2de..359b085 100644
--- a/drivers/pwm/pwm-stm32.c
+++ b/drivers/pwm/pwm-stm32.c
@@ -440,7 +440,7 @@
 }
 
 static int stm32_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			   struct pwm_state *state)
+			   const struct pwm_state *state)
 {
 	bool enabled;
 	struct stm32_pwm *priv = to_stm32_pwm_dev(chip);
@@ -468,7 +468,7 @@
 }
 
 static int stm32_pwm_apply_locked(struct pwm_chip *chip, struct pwm_device *pwm,
-				  struct pwm_state *state)
+				  const struct pwm_state *state)
 {
 	struct stm32_pwm *priv = to_stm32_pwm_dev(chip);
 	int ret;
diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c
index de78c82..6f5840a 100644
--- a/drivers/pwm/pwm-sun4i.c
+++ b/drivers/pwm/pwm-sun4i.c
@@ -145,7 +145,7 @@
 }
 
 static int sun4i_pwm_calculate(struct sun4i_pwm_chip *sun4i_pwm,
-			       struct pwm_state *state,
+			       const struct pwm_state *state,
 			       u32 *dty, u32 *prd, unsigned int *prsclr)
 {
 	u64 clk_rate, div = 0;
@@ -192,17 +192,11 @@
 	*dty = div;
 	*prsclr = prescaler;
 
-	div = (u64)pval * NSEC_PER_SEC * *prd;
-	state->period = DIV_ROUND_CLOSEST_ULL(div, clk_rate);
-
-	div = (u64)pval * NSEC_PER_SEC * *dty;
-	state->duty_cycle = DIV_ROUND_CLOSEST_ULL(div, clk_rate);
-
 	return 0;
 }
 
 static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			   struct pwm_state *state)
+			   const struct pwm_state *state)
 {
 	struct sun4i_pwm_chip *sun4i_pwm = to_sun4i_pwm_chip(chip);
 	struct pwm_state cstate;
diff --git a/drivers/pwm/pwm-zx.c b/drivers/pwm/pwm-zx.c
index e24f4be..e2c21cc 100644
--- a/drivers/pwm/pwm-zx.c
+++ b/drivers/pwm/pwm-zx.c
@@ -148,7 +148,7 @@
 }
 
 static int zx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
-			struct pwm_state *state)
+			const struct pwm_state *state)
 {
 	struct zx_pwm_chip *zpc = to_zx_pwm_chip(chip);
 	struct pwm_state cstate;
diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c
index 0005ec9..b42a937 100644
--- a/drivers/s390/cio/ccwgroup.c
+++ b/drivers/s390/cio/ccwgroup.c
@@ -372,7 +372,7 @@
 		goto error;
 	}
 	/* Check for trailing stuff. */
-	if (i == num_devices && strlen(buf) > 0) {
+	if (i == num_devices && buf && strlen(buf) > 0) {
 		rc = -EINVAL;
 		goto error;
 	}
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index 22c5581..1fbfb0a 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -1388,6 +1388,8 @@
 
 int sch_is_pseudo_sch(struct subchannel *sch)
 {
+	if (!sch->dev.parent)
+		return 0;
 	return sch == to_css(sch->dev.parent)->pseudo_subchannel;
 }
 
diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c
index d722458..65841af 100644
--- a/drivers/s390/cio/device_ops.c
+++ b/drivers/s390/cio/device_ops.c
@@ -124,9 +124,7 @@
 /**
  * ccw_device_clear() - terminate I/O request processing
  * @cdev: target ccw device
- * @intparm: interruption parameter; value is only used if no I/O is
- *	     outstanding, otherwise the intparm associated with the I/O request
- *	     is returned
+ * @intparm: interruption parameter to be returned upon conclusion of csch
  *
  * ccw_device_clear() calls csch on @cdev's subchannel.
  * Returns:
@@ -179,6 +177,9 @@
  * completed during the time specified by @expires. If a timeout occurs, the
  * channel program is terminated via xsch, hsch or csch, and the device's
  * interrupt handler will be called with an irb containing ERR_PTR(-%ETIMEDOUT).
+ * The interruption handler will echo back the @intparm specified here, unless
+ * another interruption parameter is specified by a subsequent invocation of
+ * ccw_device_halt() or ccw_device_clear().
  * Returns:
  *  %0, if the operation was successful;
  *  -%EBUSY, if the device is busy, or status pending;
@@ -256,6 +257,9 @@
  * Start a S/390 channel program. When the interrupt arrives, the
  * IRQ handler is called, either immediately, delayed (dev-end missing,
  * or sense required) or never (no IRQ handler registered).
+ * The interruption handler will echo back the @intparm specified here, unless
+ * another interruption parameter is specified by a subsequent invocation of
+ * ccw_device_halt() or ccw_device_clear().
  * Returns:
  *  %0, if the operation was successful;
  *  -%EBUSY, if the device is busy, or status pending;
@@ -287,6 +291,9 @@
  * Start a S/390 channel program. When the interrupt arrives, the
  * IRQ handler is called, either immediately, delayed (dev-end missing,
  * or sense required) or never (no IRQ handler registered).
+ * The interruption handler will echo back the @intparm specified here, unless
+ * another interruption parameter is specified by a subsequent invocation of
+ * ccw_device_halt() or ccw_device_clear().
  * Returns:
  *  %0, if the operation was successful;
  *  -%EBUSY, if the device is busy, or status pending;
@@ -322,6 +329,9 @@
  * completed during the time specified by @expires. If a timeout occurs, the
  * channel program is terminated via xsch, hsch or csch, and the device's
  * interrupt handler will be called with an irb containing ERR_PTR(-%ETIMEDOUT).
+ * The interruption handler will echo back the @intparm specified here, unless
+ * another interruption parameter is specified by a subsequent invocation of
+ * ccw_device_halt() or ccw_device_clear().
  * Returns:
  *  %0, if the operation was successful;
  *  -%EBUSY, if the device is busy, or status pending;
@@ -343,11 +353,12 @@
 /**
  * ccw_device_halt() - halt I/O request processing
  * @cdev: target ccw device
- * @intparm: interruption parameter; value is only used if no I/O is
- *	     outstanding, otherwise the intparm associated with the I/O request
- *	     is returned
+ * @intparm: interruption parameter to be returned upon conclusion of hsch
  *
  * ccw_device_halt() calls hsch on @cdev's subchannel.
+ * The interruption handler will echo back the @intparm specified here, unless
+ * another interruption parameter is specified by a subsequent invocation of
+ * ccw_device_clear().
  * Returns:
  *  %0 on success,
  *  -%ENODEV on device not operational,
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index a76b8a8..a191506 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -1322,24 +1322,24 @@
 	/* < CEX2A is not supported */
 	if (rawtype < AP_DEVICE_TYPE_CEX2A)
 		return 0;
-	/* up to CEX6 known and fully supported */
-	if (rawtype <= AP_DEVICE_TYPE_CEX6)
+	/* up to CEX7 known and fully supported */
+	if (rawtype <= AP_DEVICE_TYPE_CEX7)
 		return rawtype;
 	/*
-	 * unknown new type > CEX6, check for compatibility
+	 * unknown new type > CEX7, check for compatibility
 	 * to the highest known and supported type which is
-	 * currently CEX6 with the help of the QACT function.
+	 * currently CEX7 with the help of the QACT function.
 	 */
 	if (ap_qact_available()) {
 		struct ap_queue_status status;
 		union ap_qact_ap_info apinfo = {0};
 
 		apinfo.mode = (func >> 26) & 0x07;
-		apinfo.cat = AP_DEVICE_TYPE_CEX6;
+		apinfo.cat = AP_DEVICE_TYPE_CEX7;
 		status = ap_qact(qid, 0, &apinfo);
 		if (status.response_code == AP_RESPONSE_NORMAL
 		    && apinfo.cat >= AP_DEVICE_TYPE_CEX2A
-		    && apinfo.cat <= AP_DEVICE_TYPE_CEX6)
+		    && apinfo.cat <= AP_DEVICE_TYPE_CEX7)
 			comp_type = apinfo.cat;
 	}
 	if (!comp_type)
diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h
index 6f3cf37..433b7b6 100644
--- a/drivers/s390/crypto/ap_bus.h
+++ b/drivers/s390/crypto/ap_bus.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ */
 /*
- * Copyright IBM Corp. 2006, 2012
+ * Copyright IBM Corp. 2006, 2019
  * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
  *	      Martin Schwidefsky <schwidefsky@de.ibm.com>
  *	      Ralph Wuerthner <rwuerthn@de.ibm.com>
@@ -63,6 +63,7 @@
 #define AP_DEVICE_TYPE_CEX4	10
 #define AP_DEVICE_TYPE_CEX5	11
 #define AP_DEVICE_TYPE_CEX6	12
+#define AP_DEVICE_TYPE_CEX7	13
 
 /*
  * Known function facilities
diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c
index f76a1d0..9de3d46 100644
--- a/drivers/s390/crypto/pkey_api.c
+++ b/drivers/s390/crypto/pkey_api.c
@@ -1363,9 +1363,122 @@
 	.bin_attrs = ccadata_attrs,
 };
 
+#define CCACIPHERTOKENSIZE	(sizeof(struct cipherkeytoken) + 80)
+
+/*
+ * Sysfs attribute read function for all secure key ccacipher binary attributes.
+ * The implementation cannot deal with partial reads, because a new random
+ * secure key blob is generated with each read. In case of partial reads
+ * (i.e. off != 0 or count < key blob size) -EINVAL is returned.
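+ *
+ * Example: a read(2) of at least CCACIPHERTOKENSIZE bytes at offset 0
+ * returns exactly one freshly generated key blob; the xts attributes
+ * need at least 2 * CCACIPHERTOKENSIZE and return two blobs.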
+ */
+static ssize_t pkey_ccacipher_aes_attr_read(enum pkey_key_size keybits,
+					    bool is_xts, char *buf, loff_t off,
+					    size_t count)
+{
+	size_t keysize;
+	int rc;
+
+	if (off != 0 || count < CCACIPHERTOKENSIZE)
+		return -EINVAL;
+	if (is_xts && count < 2 * CCACIPHERTOKENSIZE)
+		return -EINVAL;
+
+	keysize = CCACIPHERTOKENSIZE;
+	rc = cca_gencipherkey(-1, -1, keybits, 0, buf, &keysize);
+	if (rc)
+		return rc;
+	memset(buf + keysize, 0, CCACIPHERTOKENSIZE - keysize);
+
+	if (is_xts) {
+		keysize = CCACIPHERTOKENSIZE;
+		rc = cca_gencipherkey(-1, -1, keybits, 0,
+				      buf + CCACIPHERTOKENSIZE, &keysize);
+		if (rc)
+			return rc;
+		memset(buf + CCACIPHERTOKENSIZE + keysize, 0,
+		       CCACIPHERTOKENSIZE - keysize);
+
+		return 2 * CCACIPHERTOKENSIZE;
+	}
+
+	return CCACIPHERTOKENSIZE;
+}
+
+static ssize_t ccacipher_aes_128_read(struct file *filp,
+				      struct kobject *kobj,
+				      struct bin_attribute *attr,
+				      char *buf, loff_t off,
+				      size_t count)
+{
+	return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_128, false, buf,
+					    off, count);
+}
+
+static ssize_t ccacipher_aes_192_read(struct file *filp,
+				      struct kobject *kobj,
+				      struct bin_attribute *attr,
+				      char *buf, loff_t off,
+				      size_t count)
+{
+	return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_192, false, buf,
+					    off, count);
+}
+
+static ssize_t ccacipher_aes_256_read(struct file *filp,
+				      struct kobject *kobj,
+				      struct bin_attribute *attr,
+				      char *buf, loff_t off,
+				      size_t count)
+{
+	return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_256, false, buf,
+					    off, count);
+}
+
+static ssize_t ccacipher_aes_128_xts_read(struct file *filp,
+					  struct kobject *kobj,
+					  struct bin_attribute *attr,
+					  char *buf, loff_t off,
+					  size_t count)
+{
+	return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_128, true, buf,
+					    off, count);
+}
+
+static ssize_t ccacipher_aes_256_xts_read(struct file *filp,
+					  struct kobject *kobj,
+					  struct bin_attribute *attr,
+					  char *buf, loff_t off,
+					  size_t count)
+{
+	return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_256, true, buf,
+					    off, count);
+}
+
+static BIN_ATTR_RO(ccacipher_aes_128, CCACIPHERTOKENSIZE);
+static BIN_ATTR_RO(ccacipher_aes_192, CCACIPHERTOKENSIZE);
+static BIN_ATTR_RO(ccacipher_aes_256, CCACIPHERTOKENSIZE);
+static BIN_ATTR_RO(ccacipher_aes_128_xts, 2 * CCACIPHERTOKENSIZE);
+static BIN_ATTR_RO(ccacipher_aes_256_xts, 2 * CCACIPHERTOKENSIZE);
+
+static struct bin_attribute *ccacipher_attrs[] = {
+	&bin_attr_ccacipher_aes_128,
+	&bin_attr_ccacipher_aes_192,
+	&bin_attr_ccacipher_aes_256,
+	&bin_attr_ccacipher_aes_128_xts,
+	&bin_attr_ccacipher_aes_256_xts,
+	NULL
+};
+
+static struct attribute_group ccacipher_attr_group = {
+	.name	   = "ccacipher",
+	.bin_attrs = ccacipher_attrs,
+};
+
 static const struct attribute_group *pkey_attr_groups[] = {
 	&protkey_attr_group,
 	&ccadata_attr_group,
+	&ccacipher_attr_group,
 	NULL,
 };
 
diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c
index 003662a..be2520c 100644
--- a/drivers/s390/crypto/vfio_ap_drv.c
+++ b/drivers/s390/crypto/vfio_ap_drv.c
@@ -36,6 +36,8 @@
 	  .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
 	{ .dev_type = AP_DEVICE_TYPE_CEX6,
 	  .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
+	{ .dev_type = AP_DEVICE_TYPE_CEX7,
+	  .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
 	{ /* end of sibling */ },
 };
 
diff --git a/drivers/s390/crypto/zcrypt_api.h b/drivers/s390/crypto/zcrypt_api.h
index 2d3f273..d464618 100644
--- a/drivers/s390/crypto/zcrypt_api.h
+++ b/drivers/s390/crypto/zcrypt_api.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ */
 /*
- *  Copyright IBM Corp. 2001, 2018
+ *  Copyright IBM Corp. 2001, 2019
  *  Author(s): Robert Burroughs
  *	       Eric Rossman (edrossma@us.ibm.com)
  *	       Cornelia Huck <cornelia.huck@de.ibm.com>
@@ -29,6 +29,7 @@
 #define ZCRYPT_CEX4	       10
 #define ZCRYPT_CEX5	       11
 #define ZCRYPT_CEX6	       12
+#define ZCRYPT_CEX7	       13
 
 /**
  * Large random numbers are pulled in 4096 byte chunks from the crypto cards
diff --git a/drivers/s390/crypto/zcrypt_cex4.c b/drivers/s390/crypto/zcrypt_cex4.c
index f58d8de..442e3d6 100644
--- a/drivers/s390/crypto/zcrypt_cex4.c
+++ b/drivers/s390/crypto/zcrypt_cex4.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- *  Copyright IBM Corp. 2012
+ *  Copyright IBM Corp. 2012, 2019
  *  Author(s): Holger Dengler <hd@linux.vnet.ibm.com>
  */
 
@@ -38,8 +38,8 @@
 #define CEX4_CLEANUP_TIME	(900*HZ)
 
 MODULE_AUTHOR("IBM Corporation");
-MODULE_DESCRIPTION("CEX4/CEX5/CEX6 Cryptographic Card device driver, " \
-		   "Copyright IBM Corp. 2018");
+MODULE_DESCRIPTION("CEX4/CEX5/CEX6/CEX7 Cryptographic Card device driver, " \
+		   "Copyright IBM Corp. 2019");
 MODULE_LICENSE("GPL");
 
 static struct ap_device_id zcrypt_cex4_card_ids[] = {
@@ -49,6 +49,8 @@
 	  .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE },
 	{ .dev_type = AP_DEVICE_TYPE_CEX6,
 	  .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE },
+	{ .dev_type = AP_DEVICE_TYPE_CEX7,
+	  .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE },
 	{ /* end of list */ },
 };
 
@@ -61,6 +63,8 @@
 	  .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
 	{ .dev_type = AP_DEVICE_TYPE_CEX6,
 	  .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
+	{ .dev_type = AP_DEVICE_TYPE_CEX7,
+	  .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE },
 	{ /* end of list */ },
 };
 
@@ -146,7 +150,7 @@
 };
 
 /**
- * Probe function for CEX4/CEX5/CEX6 card device. It always
+ * Probe function for CEX4/CEX5/CEX6/CEX7 card device. It always
  * accepts the AP device since the bus_match already checked
  * the hardware type.
  * @ap_dev: pointer to the AP device.
@@ -158,25 +162,31 @@
 	 * MEX_1k, MEX_2k, MEX_4k, CRT_1k, CRT_2k, CRT_4k, RNG, SECKEY
 	 */
 	static const int CEX4A_SPEED_IDX[] = {
-		 14, 19, 249, 42, 228, 1458, 0, 0};
+		 14,  19, 249, 42, 228, 1458, 0, 0};
 	static const int CEX5A_SPEED_IDX[] = {
-		  8,  9,  20, 18,  66,	458, 0, 0};
+		  8,   9,  20, 18,  66,	 458, 0, 0};
 	static const int CEX6A_SPEED_IDX[] = {
-		  6,  9,  20, 17,  65,	438, 0, 0};
+		  6,   9,  20, 17,  65,	 438, 0, 0};
+	static const int CEX7A_SPEED_IDX[] = {
+		  6,   8,  17, 15,  54,	 362, 0, 0};
 
 	static const int CEX4C_SPEED_IDX[] = {
 		 59,  69, 308, 83, 278, 2204, 209, 40};
 	static const int CEX5C_SPEED_IDX[] = {
-		 24,  31,  50, 37,  90,  479,  27, 10};
+		 24,  31,  50, 37,  90,	 479,  27, 10};
 	static const int CEX6C_SPEED_IDX[] = {
-		 16,  20,  32, 27,  77,  455,  23,  9};
+		 16,  20,  32, 27,  77,	 455,  24,  9};
+	static const int CEX7C_SPEED_IDX[] = {
+		 14,  16,  26, 23,  64,	 376,  23,  8};
 
 	static const int CEX4P_SPEED_IDX[] = {
-		224, 313, 3560, 359, 605, 2827, 0, 50};
+		  0,   0,   0,	 0,   0,   0,	0,  50};
 	static const int CEX5P_SPEED_IDX[] = {
-		 63,  84,  156,  83, 142,  533, 0, 10};
+		  0,   0,   0,	 0,   0,   0,	0,  10};
 	static const int CEX6P_SPEED_IDX[] = {
-		 55,  70,  121,  73, 129,  522, 0,  9};
+		  0,   0,   0,	 0,   0,   0,	0,   9};
+	static const int CEX7P_SPEED_IDX[] = {
+		  0,   0,   0,	 0,   0,   0,	0,   8};
 
 	struct ap_card *ac = to_ap_card(&ap_dev->device);
 	struct zcrypt_card *zc;
@@ -198,11 +208,19 @@
 			zc->user_space_type = ZCRYPT_CEX5;
 			memcpy(zc->speed_rating, CEX5A_SPEED_IDX,
 			       sizeof(CEX5A_SPEED_IDX));
-		} else {
+		} else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX6) {
 			zc->type_string = "CEX6A";
 			zc->user_space_type = ZCRYPT_CEX6;
 			memcpy(zc->speed_rating, CEX6A_SPEED_IDX,
 			       sizeof(CEX6A_SPEED_IDX));
+		} else {
+			zc->type_string = "CEX7A";
+			/* wrong user space type, just for compatibility
+			 * with the ZCRYPT_STATUS_MASK ioctl.
+			 */
+			zc->user_space_type = ZCRYPT_CEX6;
+			memcpy(zc->speed_rating, CEX7A_SPEED_IDX,
+			       sizeof(CEX7A_SPEED_IDX));
 		}
 		zc->min_mod_size = CEX4A_MIN_MOD_SIZE;
 		if (ap_test_bit(&ac->functions, AP_FUNC_MEX4K) &&
@@ -232,7 +250,7 @@
 			zc->user_space_type = ZCRYPT_CEX3C;
 			memcpy(zc->speed_rating, CEX5C_SPEED_IDX,
 			       sizeof(CEX5C_SPEED_IDX));
-		} else {
+		} else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX6) {
 			zc->type_string = "CEX6C";
 			/* wrong user space type, must be CEX6
 			 * just keep it for cca compatibility
@@ -240,6 +258,14 @@
 			zc->user_space_type = ZCRYPT_CEX3C;
 			memcpy(zc->speed_rating, CEX6C_SPEED_IDX,
 			       sizeof(CEX6C_SPEED_IDX));
+		} else {
+			zc->type_string = "CEX7C";
+			/* wrong user space type, must be CEX7
+			 * just keep it for cca compatibility
+			 */
+			zc->user_space_type = ZCRYPT_CEX3C;
+			memcpy(zc->speed_rating, CEX7C_SPEED_IDX,
+			       sizeof(CEX7C_SPEED_IDX));
 		}
 		zc->min_mod_size = CEX4C_MIN_MOD_SIZE;
 		zc->max_mod_size = CEX4C_MAX_MOD_SIZE;
@@ -255,11 +281,19 @@
 			zc->user_space_type = ZCRYPT_CEX5;
 			memcpy(zc->speed_rating, CEX5P_SPEED_IDX,
 			       sizeof(CEX5P_SPEED_IDX));
-		} else {
+		} else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX6) {
 			zc->type_string = "CEX6P";
 			zc->user_space_type = ZCRYPT_CEX6;
 			memcpy(zc->speed_rating, CEX6P_SPEED_IDX,
 			       sizeof(CEX6P_SPEED_IDX));
+		} else {
+			zc->type_string = "CEX7P";
+			/* wrong user space type, just for compatibility
+			 * with the ZCRYPT_STATUS_MASK ioctl.
+			 */
+			zc->user_space_type = ZCRYPT_CEX6;
+			memcpy(zc->speed_rating, CEX7P_SPEED_IDX,
+			       sizeof(CEX7P_SPEED_IDX));
 		}
 		zc->min_mod_size = CEX4C_MIN_MOD_SIZE;
 		zc->max_mod_size = CEX4C_MAX_MOD_SIZE;
@@ -289,8 +323,8 @@
 }
 
 /**
- * This is called to remove the CEX4/CEX5/CEX6 card driver information
- * if an AP card device is removed.
+ * This is called to remove the CEX4/CEX5/CEX6/CEX7 card driver
+ * information if an AP card device is removed.
  */
 static void zcrypt_cex4_card_remove(struct ap_device *ap_dev)
 {
@@ -311,7 +345,7 @@
 };
 
 /**
- * Probe function for CEX4/CEX5/CEX6 queue device. It always
+ * Probe function for CEX4/CEX5/CEX6/CEX7 queue device. It always
  * accepts the AP device since the bus_match already checked
  * the hardware type.
  * @ap_dev: pointer to the AP device.
@@ -369,7 +403,7 @@
 }
 
 /**
- * This is called to remove the CEX4/CEX5/CEX6 queue driver
+ * This is called to remove the CEX4/CEX5/CEX6/CEX7 queue driver
  * information if an AP queue device is removed.
  */
 static void zcrypt_cex4_queue_remove(struct ap_device *ap_dev)
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 644f7f5..4a85878 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -27,7 +27,6 @@
 #include <linux/moduleparam.h>
 #include <linux/pci.h>
 #include <linux/aer.h>
-#include <linux/pci-aspm.h>
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 1bb6aad..ac39ed7 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -21,7 +21,6 @@
 #include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/pci.h>
-#include <linux/pci-aspm.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index d0c2f8d..c8e512b 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -51,7 +51,6 @@
 #include <linux/workqueue.h>
 #include <linux/delay.h>
 #include <linux/pci.h>
-#include <linux/pci-aspm.h>
 #include <linux/interrupt.h>
 #include <linux/aer.h>
 #include <linux/raid_class.h>
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 7623196..50928bc 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1211,9 +1211,6 @@
 	dix = scsi_prot_sg_count(cmd);
 	dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type);
 
-	if (write && dix)
-		t10_pi_prepare(cmd->request, sdkp->protection_type);
-
 	if (dif || dix)
 		protect = sd_setup_protect_cmnd(cmd, dix, dif);
 	else
@@ -2055,11 +2052,6 @@
 					   "sd_done: completed %d of %d bytes\n",
 					   good_bytes, scsi_bufflen(SCpnt)));
 
-	if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt) &&
-	    good_bytes)
-		t10_pi_complete(SCpnt->request, sdkp->protection_type,
-				good_bytes / scsi_prot_interval(SCpnt));
-
 	return good_bytes;
 }
 
diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c
index aa8d842..b83a1d1 100644
--- a/drivers/staging/android/ion/ion_system_heap.c
+++ b/drivers/staging/android/ion/ion_system_heap.c
@@ -120,7 +120,7 @@
 		if (!page)
 			goto free_pages;
 		list_add_tail(&page->lru, &pages);
-		size_remaining -= PAGE_SIZE << compound_order(page);
+		size_remaining -= page_size(page);
 		max_order = compound_order(page);
 		i++;
 	}
@@ -133,7 +133,7 @@
 
 	sg = table->sgl;
 	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
-		sg_set_page(sg, page, PAGE_SIZE << compound_order(page), 0);
+		sg_set_page(sg, page, page_size(page), 0);
 		sg = sg_next(sg);
 		list_del(&page->lru);
 	}
diff --git a/drivers/target/tcm_fc/tfc_io.c b/drivers/target/tcm_fc/tfc_io.c
index a254792..1354a15 100644
--- a/drivers/target/tcm_fc/tfc_io.c
+++ b/drivers/target/tcm_fc/tfc_io.c
@@ -136,8 +136,7 @@
 					   page, off_in_page, tlen);
 			fr_len(fp) += tlen;
 			fp_skb(fp)->data_len += tlen;
-			fp_skb(fp)->truesize +=
-					PAGE_SIZE << compound_order(page);
+			fp_skb(fp)->truesize += page_size(page);
 		} else {
 			BUG_ON(!page);
 			from = kmap_atomic(page + (mem_off >> PAGE_SHIFT));
diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 2da026f..09ddcd0 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -254,6 +254,7 @@
 	shm->teedev = teedev;
 	shm->ctx = ctx;
 	shm->id = -1;
+	addr = untagged_addr(addr);
 	start = rounddown(addr, PAGE_SIZE);
 	shm->offset = addr - start;
 	shm->size = length;
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index 9966364..001a21a 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -310,7 +310,7 @@
 
 config DB8500_THERMAL
 	tristate "DB8500 thermal management"
-	depends on MFD_DB8500_PRCMU
+	depends on MFD_DB8500_PRCMU && OF
 	default y
 	help
 	  Adds DB8500 thermal management implementation according to the thermal
diff --git a/drivers/thermal/armada_thermal.c b/drivers/thermal/armada_thermal.c
index 8c07a39..709a22f 100644
--- a/drivers/thermal/armada_thermal.c
+++ b/drivers/thermal/armada_thermal.c
@@ -53,7 +53,6 @@
 #define CONTROL0_TSEN_MODE_EXTERNAL	0x2
 #define CONTROL0_TSEN_MODE_MASK		0x3
 
-#define CONTROL1_TSEN_AVG_SHIFT		0
 #define CONTROL1_TSEN_AVG_MASK		0x7
 #define CONTROL1_EXT_TSEN_SW_RESET	BIT(7)
 #define CONTROL1_EXT_TSEN_HW_RESETn	BIT(8)
@@ -267,8 +266,8 @@
 
 	/* Average the output value over 2^1 = 2 samples */
 	regmap_read(priv->syscon, data->syscon_control1_off, &reg);
-	reg &= ~CONTROL1_TSEN_AVG_MASK << CONTROL1_TSEN_AVG_SHIFT;
-	reg |= 1 << CONTROL1_TSEN_AVG_SHIFT;
+	reg &= ~CONTROL1_TSEN_AVG_MASK;
+	reg |= 1;
 	regmap_write(priv->syscon, data->syscon_control1_off, reg);
 }
 
diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c
index b71a999..372dbba 100644
--- a/drivers/thermal/db8500_thermal.c
+++ b/drivers/thermal/db8500_thermal.c
@@ -3,9 +3,9 @@
  * db8500_thermal.c - DB8500 Thermal Management Implementation
  *
  * Copyright (C) 2012 ST-Ericsson
- * Copyright (C) 2012 Linaro Ltd.
+ * Copyright (C) 2012-2019 Linaro Ltd.
  *
- * Author: Hongbo Zhang <hongbo.zhang@linaro.com>
+ * Authors: Hongbo Zhang, Linus Walleij
  */
 
 #include <linux/cpu_cooling.h>
@@ -13,7 +13,6 @@
 #include <linux/mfd/dbx500-prcmu.h>
 #include <linux/module.h>
 #include <linux/of.h>
-#include <linux/platform_data/db8500_thermal.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/thermal.h>
@@ -21,453 +20,201 @@
 #define PRCMU_DEFAULT_MEASURE_TIME	0xFFF
 #define PRCMU_DEFAULT_LOW_TEMP		0
 
+/**
+ * db8500_thermal_points - the interpolation points that trigger
+ * interrupts
+ */
+static const unsigned long db8500_thermal_points[] = {
+	15000,
+	20000,
+	25000,
+	30000,
+	35000,
+	40000,
+	45000,
+	50000,
+	55000,
+	60000,
+	65000,
+	70000,
+	75000,
+	80000,
+	/*
+	 * This is where things start to get really bad for the
+	 * SoC, so the thermal zones should be set up to trigger
+	 * the critical trip point at 85000 mC to keep us from
+	 * getting above this point.
+	 */
+	85000,
+	90000,
+	95000,
+	100000,
+};
+
 struct db8500_thermal_zone {
-	struct thermal_zone_device *therm_dev;
-	struct mutex th_lock;
-	struct work_struct therm_work;
-	struct db8500_thsens_platform_data *trip_tab;
-	enum thermal_device_mode mode;
+	struct thermal_zone_device *tz;
 	enum thermal_trend trend;
-	unsigned long cur_temp_pseudo;
+	unsigned long interpolated_temp;
 	unsigned int cur_index;
 };
 
-/* Local function to check if thermal zone matches cooling devices */
-static int db8500_thermal_match_cdev(struct thermal_cooling_device *cdev,
-		struct db8500_trip_point *trip_point)
-{
-	int i;
-
-	if (!strlen(cdev->type))
-		return -EINVAL;
-
-	for (i = 0; i < COOLING_DEV_MAX; i++) {
-		if (!strcmp(trip_point->cdev_name[i], cdev->type))
-			return 0;
-	}
-
-	return -ENODEV;
-}
-
-/* Callback to bind cooling device to thermal zone */
-static int db8500_cdev_bind(struct thermal_zone_device *thermal,
-		struct thermal_cooling_device *cdev)
-{
-	struct db8500_thermal_zone *pzone = thermal->devdata;
-	struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
-	unsigned long max_state, upper, lower;
-	int i, ret = -EINVAL;
-
-	cdev->ops->get_max_state(cdev, &max_state);
-
-	for (i = 0; i < ptrips->num_trips; i++) {
-		if (db8500_thermal_match_cdev(cdev, &ptrips->trip_points[i]))
-			continue;
-
-		upper = lower = i > max_state ? max_state : i;
-
-		ret = thermal_zone_bind_cooling_device(thermal, i, cdev,
-			upper, lower, THERMAL_WEIGHT_DEFAULT);
-
-		dev_info(&cdev->device, "%s bind to %d: %d-%s\n", cdev->type,
-			i, ret, ret ? "fail" : "succeed");
-	}
-
-	return ret;
-}
-
-/* Callback to unbind cooling device from thermal zone */
-static int db8500_cdev_unbind(struct thermal_zone_device *thermal,
-		struct thermal_cooling_device *cdev)
-{
-	struct db8500_thermal_zone *pzone = thermal->devdata;
-	struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
-	int i, ret = -EINVAL;
-
-	for (i = 0; i < ptrips->num_trips; i++) {
-		if (db8500_thermal_match_cdev(cdev, &ptrips->trip_points[i]))
-			continue;
-
-		ret = thermal_zone_unbind_cooling_device(thermal, i, cdev);
-
-		dev_info(&cdev->device, "%s unbind from %d: %s\n", cdev->type,
-			i, ret ? "fail" : "succeed");
-	}
-
-	return ret;
-}
-
 /* Callback to get current temperature */
-static int db8500_sys_get_temp(struct thermal_zone_device *thermal, int *temp)
+static int db8500_thermal_get_temp(void *data, int *temp)
 {
-	struct db8500_thermal_zone *pzone = thermal->devdata;
+	struct db8500_thermal_zone *th = data;
 
 	/*
 	 * TODO: There is no PRCMU interface to get temperature data currently,
 	 * so a pseudo temperature is returned; it works for the thermal
 	 * framework and will be fixed when the PRCMU interface is available.
 	 */
-	*temp = pzone->cur_temp_pseudo;
+	*temp = th->interpolated_temp;
 
 	return 0;
 }
 
 /* Callback to get temperature changing trend */
-static int db8500_sys_get_trend(struct thermal_zone_device *thermal,
-		int trip, enum thermal_trend *trend)
+static int db8500_thermal_get_trend(void *data, int trip, enum thermal_trend *trend)
 {
-	struct db8500_thermal_zone *pzone = thermal->devdata;
+	struct db8500_thermal_zone *th = data;
 
-	*trend = pzone->trend;
+	*trend = th->trend;
 
 	return 0;
 }
 
-/* Callback to get thermal zone mode */
-static int db8500_sys_get_mode(struct thermal_zone_device *thermal,
-		enum thermal_device_mode *mode)
-{
-	struct db8500_thermal_zone *pzone = thermal->devdata;
-
-	mutex_lock(&pzone->th_lock);
-	*mode = pzone->mode;
-	mutex_unlock(&pzone->th_lock);
-
-	return 0;
-}
-
-/* Callback to set thermal zone mode */
-static int db8500_sys_set_mode(struct thermal_zone_device *thermal,
-		enum thermal_device_mode mode)
-{
-	struct db8500_thermal_zone *pzone = thermal->devdata;
-
-	mutex_lock(&pzone->th_lock);
-
-	pzone->mode = mode;
-	if (mode == THERMAL_DEVICE_ENABLED)
-		schedule_work(&pzone->therm_work);
-
-	mutex_unlock(&pzone->th_lock);
-
-	return 0;
-}
-
-/* Callback to get trip point type */
-static int db8500_sys_get_trip_type(struct thermal_zone_device *thermal,
-		int trip, enum thermal_trip_type *type)
-{
-	struct db8500_thermal_zone *pzone = thermal->devdata;
-	struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
-
-	if (trip >= ptrips->num_trips)
-		return -EINVAL;
-
-	*type = ptrips->trip_points[trip].type;
-
-	return 0;
-}
-
-/* Callback to get trip point temperature */
-static int db8500_sys_get_trip_temp(struct thermal_zone_device *thermal,
-		int trip, int *temp)
-{
-	struct db8500_thermal_zone *pzone = thermal->devdata;
-	struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
-
-	if (trip >= ptrips->num_trips)
-		return -EINVAL;
-
-	*temp = ptrips->trip_points[trip].temp;
-
-	return 0;
-}
-
-/* Callback to get critical trip point temperature */
-static int db8500_sys_get_crit_temp(struct thermal_zone_device *thermal,
-		int *temp)
-{
-	struct db8500_thermal_zone *pzone = thermal->devdata;
-	struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
-	int i;
-
-	for (i = ptrips->num_trips - 1; i > 0; i--) {
-		if (ptrips->trip_points[i].type == THERMAL_TRIP_CRITICAL) {
-			*temp = ptrips->trip_points[i].temp;
-			return 0;
-		}
-	}
-
-	return -EINVAL;
-}
-
-static struct thermal_zone_device_ops thdev_ops = {
-	.bind = db8500_cdev_bind,
-	.unbind = db8500_cdev_unbind,
-	.get_temp = db8500_sys_get_temp,
-	.get_trend = db8500_sys_get_trend,
-	.get_mode = db8500_sys_get_mode,
-	.set_mode = db8500_sys_set_mode,
-	.get_trip_type = db8500_sys_get_trip_type,
-	.get_trip_temp = db8500_sys_get_trip_temp,
-	.get_crit_temp = db8500_sys_get_crit_temp,
+static struct thermal_zone_of_device_ops thdev_ops = {
+	.get_temp = db8500_thermal_get_temp,
+	.get_trend = db8500_thermal_get_trend,
 };
 
-static void db8500_thermal_update_config(struct db8500_thermal_zone *pzone,
-		unsigned int idx, enum thermal_trend trend,
-		unsigned long next_low, unsigned long next_high)
+static void db8500_thermal_update_config(struct db8500_thermal_zone *th,
+					 unsigned int idx,
+					 enum thermal_trend trend,
+					 unsigned long next_low,
+					 unsigned long next_high)
 {
 	prcmu_stop_temp_sense();
 
-	pzone->cur_index = idx;
-	pzone->cur_temp_pseudo = (next_low + next_high)/2;
-	pzone->trend = trend;
+	th->cur_index = idx;
+	th->interpolated_temp = (next_low + next_high)/2;
+	th->trend = trend;
 
+	/*
+	 * The PRCMU accepts absolute temperatures in Celsius, so divide
+	 * the millicelsius values down by 1000.
+	 */
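+	/*
+	 * Example: for the initial window (PRCMU_DEFAULT_LOW_TEMP = 0,
+	 * db8500_thermal_points[0] = 15000) this programs the PRCMU with
+	 * 0 and 15 C and reports (0 + 15000)/2 = 7500 mC until the next
+	 * hotmon interrupt moves the window.
+	 */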
 	prcmu_config_hotmon((u8)(next_low/1000), (u8)(next_high/1000));
 	prcmu_start_temp_sense(PRCMU_DEFAULT_MEASURE_TIME);
 }
 
 static irqreturn_t prcmu_low_irq_handler(int irq, void *irq_data)
 {
-	struct db8500_thermal_zone *pzone = irq_data;
-	struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
-	unsigned int idx = pzone->cur_index;
+	struct db8500_thermal_zone *th = irq_data;
+	unsigned int idx = th->cur_index;
 	unsigned long next_low, next_high;
 
-	if (unlikely(idx == 0))
+	if (idx == 0)
 		/* Meaningless for thermal management, ignoring it */
 		return IRQ_HANDLED;
 
 	if (idx == 1) {
-		next_high = ptrips->trip_points[0].temp;
+		next_high = db8500_thermal_points[0];
 		next_low = PRCMU_DEFAULT_LOW_TEMP;
 	} else {
-		next_high = ptrips->trip_points[idx-1].temp;
-		next_low = ptrips->trip_points[idx-2].temp;
+		next_high = db8500_thermal_points[idx - 1];
+		next_low = db8500_thermal_points[idx - 2];
 	}
 	idx -= 1;
 
-	db8500_thermal_update_config(pzone, idx, THERMAL_TREND_DROPPING,
-		next_low, next_high);
-
-	dev_dbg(&pzone->therm_dev->device,
+	db8500_thermal_update_config(th, idx, THERMAL_TREND_DROPPING,
+				     next_low, next_high);
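+	/*
+	 * Example: from cur_index = 2 a low interrupt moves the window
+	 * down to (15000, 20000) mC at index 1 with a dropping trend.
+	 */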
+	dev_dbg(&th->tz->device,
 		"PRCMU set max %ld, min %ld\n", next_high, next_low);
 
-	schedule_work(&pzone->therm_work);
+	thermal_zone_device_update(th->tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return IRQ_HANDLED;
 }
 
 static irqreturn_t prcmu_high_irq_handler(int irq, void *irq_data)
 {
-	struct db8500_thermal_zone *pzone = irq_data;
-	struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
-	unsigned int idx = pzone->cur_index;
+	struct db8500_thermal_zone *th = irq_data;
+	unsigned int idx = th->cur_index;
 	unsigned long next_low, next_high;
+	int num_points = ARRAY_SIZE(db8500_thermal_points);
 
-	if (idx < ptrips->num_trips - 1) {
-		next_high = ptrips->trip_points[idx+1].temp;
-		next_low = ptrips->trip_points[idx].temp;
+	if (idx < num_points - 1) {
+		next_high = db8500_thermal_points[idx+1];
+		next_low = db8500_thermal_points[idx];
 		idx += 1;
 
-		db8500_thermal_update_config(pzone, idx, THERMAL_TREND_RAISING,
-			next_low, next_high);
+		db8500_thermal_update_config(th, idx, THERMAL_TREND_RAISING,
+					     next_low, next_high);
 
-		dev_dbg(&pzone->therm_dev->device,
-		"PRCMU set max %ld, min %ld\n", next_high, next_low);
-	} else if (idx == ptrips->num_trips - 1)
-		pzone->cur_temp_pseudo = ptrips->trip_points[idx].temp + 1;
+		dev_info(&th->tz->device,
+			 "PRCMU set max %ld, min %ld\n", next_high, next_low);
+	} else if (idx == num_points - 1)
+		/* Cap the reported temperature 1 degree above the max point */
+		th->interpolated_temp = db8500_thermal_points[idx] + 1;
 
-	schedule_work(&pzone->therm_work);
+	thermal_zone_device_update(th->tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return IRQ_HANDLED;
 }
 
-static void db8500_thermal_work(struct work_struct *work)
-{
-	enum thermal_device_mode cur_mode;
-	struct db8500_thermal_zone *pzone;
-
-	pzone = container_of(work, struct db8500_thermal_zone, therm_work);
-
-	mutex_lock(&pzone->th_lock);
-	cur_mode = pzone->mode;
-	mutex_unlock(&pzone->th_lock);
-
-	if (cur_mode == THERMAL_DEVICE_DISABLED)
-		return;
-
-	thermal_zone_device_update(pzone->therm_dev, THERMAL_EVENT_UNSPECIFIED);
-	dev_dbg(&pzone->therm_dev->device, "thermal work finished.\n");
-}
-
-#ifdef CONFIG_OF
-static struct db8500_thsens_platform_data*
-		db8500_thermal_parse_dt(struct platform_device *pdev)
-{
-	struct db8500_thsens_platform_data *ptrips;
-	struct device_node *np = pdev->dev.of_node;
-	char prop_name[32];
-	const char *tmp_str;
-	u32 tmp_data;
-	int i, j;
-
-	ptrips = devm_kzalloc(&pdev->dev, sizeof(*ptrips), GFP_KERNEL);
-	if (!ptrips)
-		return NULL;
-
-	if (of_property_read_u32(np, "num-trips", &tmp_data))
-		goto err_parse_dt;
-
-	if (tmp_data > THERMAL_MAX_TRIPS)
-		goto err_parse_dt;
-
-	ptrips->num_trips = tmp_data;
-
-	for (i = 0; i < ptrips->num_trips; i++) {
-		sprintf(prop_name, "trip%d-temp", i);
-		if (of_property_read_u32(np, prop_name, &tmp_data))
-			goto err_parse_dt;
-
-		ptrips->trip_points[i].temp = tmp_data;
-
-		sprintf(prop_name, "trip%d-type", i);
-		if (of_property_read_string(np, prop_name, &tmp_str))
-			goto err_parse_dt;
-
-		if (!strcmp(tmp_str, "active"))
-			ptrips->trip_points[i].type = THERMAL_TRIP_ACTIVE;
-		else if (!strcmp(tmp_str, "passive"))
-			ptrips->trip_points[i].type = THERMAL_TRIP_PASSIVE;
-		else if (!strcmp(tmp_str, "hot"))
-			ptrips->trip_points[i].type = THERMAL_TRIP_HOT;
-		else if (!strcmp(tmp_str, "critical"))
-			ptrips->trip_points[i].type = THERMAL_TRIP_CRITICAL;
-		else
-			goto err_parse_dt;
-
-		sprintf(prop_name, "trip%d-cdev-num", i);
-		if (of_property_read_u32(np, prop_name, &tmp_data))
-			goto err_parse_dt;
-
-		if (tmp_data > COOLING_DEV_MAX)
-			goto err_parse_dt;
-
-		for (j = 0; j < tmp_data; j++) {
-			sprintf(prop_name, "trip%d-cdev-name%d", i, j);
-			if (of_property_read_string(np, prop_name, &tmp_str))
-				goto err_parse_dt;
-
-			if (strlen(tmp_str) >= THERMAL_NAME_LENGTH)
-				goto err_parse_dt;
-
-			strcpy(ptrips->trip_points[i].cdev_name[j], tmp_str);
-		}
-	}
-	return ptrips;
-
-err_parse_dt:
-	dev_err(&pdev->dev, "Parsing device tree data error.\n");
-	return NULL;
-}
-#else
-static inline struct db8500_thsens_platform_data*
-		db8500_thermal_parse_dt(struct platform_device *pdev)
-{
-	return NULL;
-}
-#endif
-
 static int db8500_thermal_probe(struct platform_device *pdev)
 {
-	struct db8500_thermal_zone *pzone = NULL;
-	struct db8500_thsens_platform_data *ptrips = NULL;
-	struct device_node *np = pdev->dev.of_node;
+	struct db8500_thermal_zone *th = NULL;
+	struct device *dev = &pdev->dev;
 	int low_irq, high_irq, ret = 0;
-	unsigned long dft_low, dft_high;
 
-	if (np)
-		ptrips = db8500_thermal_parse_dt(pdev);
-	else
-		ptrips = dev_get_platdata(&pdev->dev);
-
-	if (!ptrips)
-		return -EINVAL;
-
-	pzone = devm_kzalloc(&pdev->dev, sizeof(*pzone), GFP_KERNEL);
-	if (!pzone)
+	th = devm_kzalloc(dev, sizeof(*th), GFP_KERNEL);
+	if (!th)
 		return -ENOMEM;
 
-	mutex_init(&pzone->th_lock);
-	mutex_lock(&pzone->th_lock);
-
-	pzone->mode = THERMAL_DEVICE_DISABLED;
-	pzone->trip_tab = ptrips;
-
-	INIT_WORK(&pzone->therm_work, db8500_thermal_work);
-
 	low_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_LOW");
 	if (low_irq < 0) {
-		dev_err(&pdev->dev, "Get IRQ_HOTMON_LOW failed.\n");
-		ret = low_irq;
-		goto out_unlock;
+		dev_err(dev, "Get IRQ_HOTMON_LOW failed\n");
+		return low_irq;
 	}
 
-	ret = devm_request_threaded_irq(&pdev->dev, low_irq, NULL,
+	ret = devm_request_threaded_irq(dev, low_irq, NULL,
 		prcmu_low_irq_handler, IRQF_NO_SUSPEND | IRQF_ONESHOT,
-		"dbx500_temp_low", pzone);
+		"dbx500_temp_low", th);
 	if (ret < 0) {
-		dev_err(&pdev->dev, "Failed to allocate temp low irq.\n");
-		goto out_unlock;
+		dev_err(dev, "failed to allocate temp low irq\n");
+		return ret;
 	}
 
 	high_irq = platform_get_irq_byname(pdev, "IRQ_HOTMON_HIGH");
 	if (high_irq < 0) {
-		dev_err(&pdev->dev, "Get IRQ_HOTMON_HIGH failed.\n");
-		ret = high_irq;
-		goto out_unlock;
+		dev_err(dev, "Get IRQ_HOTMON_HIGH failed\n");
+		return high_irq;
 	}
 
-	ret = devm_request_threaded_irq(&pdev->dev, high_irq, NULL,
+	ret = devm_request_threaded_irq(dev, high_irq, NULL,
 		prcmu_high_irq_handler, IRQF_NO_SUSPEND | IRQF_ONESHOT,
-		"dbx500_temp_high", pzone);
+		"dbx500_temp_high", th);
 	if (ret < 0) {
-		dev_err(&pdev->dev, "Failed to allocate temp high irq.\n");
-		goto out_unlock;
+		dev_err(dev, "failed to allocate temp high irq\n");
+		return ret;
 	}
 
-	pzone->therm_dev = thermal_zone_device_register("db8500_thermal_zone",
-		ptrips->num_trips, 0, pzone, &thdev_ops, NULL, 0, 0);
-
-	if (IS_ERR(pzone->therm_dev)) {
-		dev_err(&pdev->dev, "Register thermal zone device failed.\n");
-		ret = PTR_ERR(pzone->therm_dev);
-		goto out_unlock;
+	/* Register the thermal sensor and get its info from the DT */
+	th->tz = devm_thermal_zone_of_sensor_register(dev, 0, th, &thdev_ops);
+	if (IS_ERR(th->tz)) {
+		dev_err(dev, "register thermal zone sensor failed\n");
+		return PTR_ERR(th->tz);
 	}
-	dev_info(&pdev->dev, "Thermal zone device registered.\n");
+	dev_info(dev, "thermal zone sensor registered\n");
 
-	dft_low = PRCMU_DEFAULT_LOW_TEMP;
-	dft_high = ptrips->trip_points[0].temp;
+	/* Start measuring at the lowest point */
+	db8500_thermal_update_config(th, 0, THERMAL_TREND_STABLE,
+				     PRCMU_DEFAULT_LOW_TEMP,
+				     db8500_thermal_points[0]);
 
-	db8500_thermal_update_config(pzone, 0, THERMAL_TREND_STABLE,
-		dft_low, dft_high);
-
-	platform_set_drvdata(pdev, pzone);
-	pzone->mode = THERMAL_DEVICE_ENABLED;
-
-out_unlock:
-	mutex_unlock(&pzone->th_lock);
-
-	return ret;
-}
-
-static int db8500_thermal_remove(struct platform_device *pdev)
-{
-	struct db8500_thermal_zone *pzone = platform_get_drvdata(pdev);
-
-	thermal_zone_device_unregister(pzone->therm_dev);
-	cancel_work_sync(&pzone->therm_work);
-	mutex_destroy(&pzone->th_lock);
+	platform_set_drvdata(pdev, th);
 
 	return 0;
 }
@@ -475,9 +222,6 @@
 static int db8500_thermal_suspend(struct platform_device *pdev,
 		pm_message_t state)
 {
-	struct db8500_thermal_zone *pzone = platform_get_drvdata(pdev);
-
-	flush_work(&pzone->therm_work);
 	prcmu_stop_temp_sense();
 
 	return 0;
@@ -485,26 +229,21 @@
 
 static int db8500_thermal_resume(struct platform_device *pdev)
 {
-	struct db8500_thermal_zone *pzone = platform_get_drvdata(pdev);
-	struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
-	unsigned long dft_low, dft_high;
+	struct db8500_thermal_zone *th = platform_get_drvdata(pdev);
 
-	dft_low = PRCMU_DEFAULT_LOW_TEMP;
-	dft_high = ptrips->trip_points[0].temp;
-
-	db8500_thermal_update_config(pzone, 0, THERMAL_TREND_STABLE,
-		dft_low, dft_high);
+	/* Resume and start measuring at the lowest point */
+	db8500_thermal_update_config(th, 0, THERMAL_TREND_STABLE,
+				     PRCMU_DEFAULT_LOW_TEMP,
+				     db8500_thermal_points[0]);
 
 	return 0;
 }
 
-#ifdef CONFIG_OF
 static const struct of_device_id db8500_thermal_match[] = {
 	{ .compatible = "stericsson,db8500-thermal" },
 	{},
 };
 MODULE_DEVICE_TABLE(of, db8500_thermal_match);
-#endif
 
 static struct platform_driver db8500_thermal_driver = {
 	.driver = {
@@ -514,7 +253,6 @@
 	.probe = db8500_thermal_probe,
 	.suspend = db8500_thermal_suspend,
 	.resume = db8500_thermal_resume,
-	.remove = db8500_thermal_remove,
 };
 
 module_platform_driver(db8500_thermal_driver);
diff --git a/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.c b/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.c
index 9716bc3..7130e90 100644
--- a/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.c
+++ b/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.c
@@ -77,9 +77,6 @@
 	struct acpi_buffer element = { 0, NULL };
 	struct acpi_buffer trt_format = { sizeof("RRNNNNNN"), "RRNNNNNN" };
 
-	if (!acpi_has_method(handle, "_TRT"))
-		return -ENODEV;
-
 	status = acpi_evaluate_object(handle, "_TRT", NULL, &buffer);
 	if (ACPI_FAILURE(status))
 		return -ENODEV;
@@ -158,9 +155,6 @@
 	struct acpi_buffer art_format =	{
 		sizeof("RRNNNNNNNNNNN"), "RRNNNNNNNNNNN" };
 
-	if (!acpi_has_method(handle, "_ART"))
-		return -ENODEV;
-
 	status = acpi_evaluate_object(handle, "_ART", NULL, &buffer);
 	if (ACPI_FAILURE(status))
 		return -ENODEV;
diff --git a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c
index f5749d4..a7bbd85 100644
--- a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c
+++ b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c
@@ -181,7 +181,7 @@
 
 	p = buf.pointer;
 	if (!p || (p->type != ACPI_TYPE_PACKAGE)) {
-		printk(KERN_WARNING "Invalid PPSS data\n");
+		pr_warn("Invalid PPSS data\n");
 		kfree(buf.pointer);
 		return -EFAULT;
 	}
diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c
index d3446ac..89a0153 100644
--- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c
+++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.c
@@ -39,6 +39,9 @@
 /* GeminiLake thermal reporting device */
 #define PCI_DEVICE_ID_PROC_GLK_THERMAL	0x318C
 
+/* IceLake thermal reporting device */
+#define PCI_DEVICE_ID_PROC_ICL_THERMAL	0x8a03
+
 #define DRV_NAME "proc_thermal"
 
 struct power_config {
@@ -137,6 +140,72 @@
 	.name = "power_limits"
 };
 
+static ssize_t tcc_offset_degree_celsius_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	u64 val;
+	int err;
+
+	err = rdmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, &val);
+	if (err)
+		return err;
+
+	val = (val >> 24) & 0xff;
+	return sprintf(buf, "%d\n", (int)val);
+}
+
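+/*
+ * The TCC activation offset lives in bits 31:24 of
+ * MSR_IA32_TEMPERATURE_TARGET; e.g. a (hypothetical) raw value of
+ * 0x05640000 decodes to an offset of 5 degrees Celsius.
+ */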
+static int tcc_offset_update(int tcc)
+{
+	u64 val;
+	int err;
+
+	if (!tcc)
+		return -EINVAL;
+
+	err = rdmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, &val);
+	if (err)
+		return err;
+
+	val &= ~GENMASK_ULL(31, 24);
+	val |= (tcc & 0xff) << 24;
+
+	err = wrmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, val);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int tcc_offset_save;
+
+static ssize_t tcc_offset_degree_celsius_store(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	u64 val;
+	int tcc, err;
+
+	err = rdmsrl_safe(MSR_PLATFORM_INFO, &val);
+	if (err)
+		return err;
+
+	if (!(val & BIT(30)))
+		return -EACCES;
+
+	if (kstrtoint(buf, 0, &tcc))
+		return -EINVAL;
+
+	err = tcc_offset_update(tcc);
+	if (err)
+		return err;
+
+	tcc_offset_save = tcc;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(tcc_offset_degree_celsius);
+
 static int stored_tjmax; /* since it is fixed, we can have local storage */
 
 static int get_tjmax(void)
@@ -332,6 +401,7 @@
 	acpi_remove_notify_handler(proc_priv->adev->handle,
 				   ACPI_DEVICE_NOTIFY, proc_thermal_notify);
 	int340x_thermal_zone_remove(proc_priv->int340x_zone);
+	sysfs_remove_file(&proc_priv->dev->kobj, &dev_attr_tcc_offset_degree_celsius.attr);
 	sysfs_remove_group(&proc_priv->dev->kobj,
 			   &power_limit_attribute_group);
 }
@@ -355,8 +425,15 @@
 
 	dev_info(&pdev->dev, "Creating sysfs group for PROC_THERMAL_PLATFORM_DEV\n");
 
-	return sysfs_create_group(&pdev->dev.kobj,
-					 &power_limit_attribute_group);
+	ret = sysfs_create_file(&pdev->dev.kobj, &dev_attr_tcc_offset_degree_celsius.attr);
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_group(&pdev->dev.kobj, &power_limit_attribute_group);
+	if (ret)
+		sysfs_remove_file(&pdev->dev.kobj, &dev_attr_tcc_offset_degree_celsius.attr);
+
+	return ret;
 }
 
 static int int3401_remove(struct platform_device *pdev)
@@ -588,8 +665,15 @@
 
 	dev_info(&pdev->dev, "Creating sysfs group for PROC_THERMAL_PCI\n");
 
-	return sysfs_create_group(&pdev->dev.kobj,
-					 &power_limit_attribute_group);
+	ret = sysfs_create_file(&pdev->dev.kobj, &dev_attr_tcc_offset_degree_celsius.attr);
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_group(&pdev->dev.kobj, &power_limit_attribute_group);
+	if (ret)
+		sysfs_remove_file(&pdev->dev.kobj, &dev_attr_tcc_offset_degree_celsius.attr);
+
+	return ret;
 }
 
 static void  proc_thermal_pci_remove(struct pci_dev *pdev)
@@ -615,6 +699,8 @@
 	proc_dev = dev_get_drvdata(dev);
 	proc_thermal_read_ppcc(proc_dev);
 
+	tcc_offset_update(tcc_offset_save);
+
 	return 0;
 }
 #else
@@ -636,6 +722,8 @@
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_CNL_THERMAL)},
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_CFL_THERMAL)},
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_GLK_THERMAL)},
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_ICL_THERMAL),
+		.driver_data = (kernel_ulong_t)&rapl_mmio_hsw, },
 	{ 0, },
 };
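
The new tcc_offset_degree_celsius attribute exposes the thermal control circuit (TCC) activation offset, which this patch treats as bits 31:24 of MSR_IA32_TEMPERATURE_TARGET; the store path first checks bit 30 of MSR_PLATFORM_INFO, which advertises that the offset is programmable. One pedantic note on the read-modify-write: `(tcc & 0xff) << 24` shifts a signed int into the sign bit for offsets of 128 and above, so a defensive variant (a sketch, not the patch's code) widens before shifting:

	u64 val;
	int err = rdmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, &val);

	if (!err) {
		val &= ~GENMASK_ULL(31, 24);		/* clear the old offset */
		val |= ((u64)tcc & 0xff) << 24;		/* widen, then shift */
		err = wrmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, val);
	}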
 
diff --git a/drivers/thermal/intel/intel_pch_thermal.c b/drivers/thermal/intel/intel_pch_thermal.c
index 99f8b25..4f0bb8f 100644
--- a/drivers/thermal/intel/intel_pch_thermal.c
+++ b/drivers/thermal/intel/intel_pch_thermal.c
@@ -371,16 +371,14 @@
 
 static int intel_pch_thermal_suspend(struct device *device)
 {
-	struct pci_dev *pdev = to_pci_dev(device);
-	struct pch_thermal_device *ptd = pci_get_drvdata(pdev);
+	struct pch_thermal_device *ptd = dev_get_drvdata(device);
 
 	return ptd->ops->suspend(ptd);
 }
 
 static int intel_pch_thermal_resume(struct device *device)
 {
-	struct pci_dev *pdev = to_pci_dev(device);
-	struct pch_thermal_device *ptd = pci_get_drvdata(pdev);
+	struct pch_thermal_device *ptd = dev_get_drvdata(device);
 
 	return ptd->ops->resume(ptd);
 }
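
The suspend/resume simplification works because pci_get_drvdata() is only a thin wrapper; a PM callback has no need to round-trip through the pci_dev. Roughly, from include/linux/pci.h:

	static inline void *pci_get_drvdata(struct pci_dev *pdev)
	{
		return dev_get_drvdata(&pdev->dev);
	}
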
diff --git a/drivers/thermal/qcom/tsens-8960.c b/drivers/thermal/qcom/tsens-8960.c
index 8d9b721..e46a4e3 100644
--- a/drivers/thermal/qcom/tsens-8960.c
+++ b/drivers/thermal/qcom/tsens-8960.c
@@ -229,6 +229,8 @@
 	for (i = 0; i < num_read; i++, s++)
 		s->offset = data[i];
 
+	kfree(data);
+
 	return 0;
 }
 
diff --git a/drivers/thermal/qcom/tsens-v0_1.c b/drivers/thermal/qcom/tsens-v0_1.c
index 6f26fad..055647b 100644
--- a/drivers/thermal/qcom/tsens-v0_1.c
+++ b/drivers/thermal/qcom/tsens-v0_1.c
@@ -145,8 +145,10 @@
 		return PTR_ERR(qfprom_cdata);
 
 	qfprom_csel = (u32 *)qfprom_read(priv->dev, "calib_sel");
-	if (IS_ERR(qfprom_csel))
+	if (IS_ERR(qfprom_csel)) {
+		kfree(qfprom_cdata);
 		return PTR_ERR(qfprom_csel);
+	}
 
 	mode = (qfprom_csel[0] & MSM8916_CAL_SEL_MASK) >> MSM8916_CAL_SEL_SHIFT;
 	dev_dbg(priv->dev, "calibration mode is %d\n", mode);
@@ -181,6 +183,8 @@
 	}
 
 	compute_intercept_slope(priv, p1, p2, mode);
+	kfree(qfprom_cdata);
+	kfree(qfprom_csel);
 
 	return 0;
 }
@@ -198,8 +202,10 @@
 		return PTR_ERR(calib);
 
 	bkp = (u32 *)qfprom_read(priv->dev, "calib_backup");
-	if (IS_ERR(bkp))
+	if (IS_ERR(bkp)) {
+		kfree(calib);
 		return PTR_ERR(bkp);
+	}
 
 	calib_redun_sel =  bkp[1] & BKP_REDUN_SEL;
 	calib_redun_sel >>= BKP_REDUN_SHIFT;
@@ -313,6 +319,8 @@
 	}
 
 	compute_intercept_slope(priv, p1, p2, mode);
+	kfree(calib);
+	kfree(bkp);
 
 	return 0;
 }
diff --git a/drivers/thermal/qcom/tsens-v1.c b/drivers/thermal/qcom/tsens-v1.c
index 10b595d..870f502 100644
--- a/drivers/thermal/qcom/tsens-v1.c
+++ b/drivers/thermal/qcom/tsens-v1.c
@@ -138,6 +138,7 @@
 	}
 
 	compute_intercept_slope(priv, p1, p2, mode);
+	kfree(qfprom_cdata);
 
 	return 0;
 }
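
These tsens hunks all fix the same leak: the driver-local qfprom_read() helper hands back a heap buffer (it wraps nvmem_cell_read()) that the caller owns, so every exit path must free it; the companion tsens.h hunk below adds <linux/slab.h> for the kfree() calls. The ownership pattern, sketched (use_calibration() is a hypothetical consumer):

	u32 *blob = (u32 *)qfprom_read(priv->dev, "calib");

	if (IS_ERR(blob))
		return PTR_ERR(blob);

	ret = use_calibration(priv, blob);	/* hypothetical */
	kfree(blob);				/* buffer is owned by the caller */
	return ret;
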
diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h
index 2fd9499..b89083b 100644
--- a/drivers/thermal/qcom/tsens.h
+++ b/drivers/thermal/qcom/tsens.h
@@ -17,6 +17,7 @@
 
 #include <linux/thermal.h>
 #include <linux/regmap.h>
+#include <linux/slab.h>
 
 struct tsens_priv;
 
diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c
index 7b36493..39542c6 100644
--- a/drivers/thermal/qoriq_thermal.c
+++ b/drivers/thermal/qoriq_thermal.c
@@ -2,6 +2,7 @@
 //
 // Copyright 2016 Freescale Semiconductor, Inc.
 
+#include <linux/clk.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/err.h>
@@ -72,6 +73,7 @@
 
 struct qoriq_tmu_data {
 	struct qoriq_tmu_regs __iomem *regs;
+	struct clk *clk;
 	bool little_endian;
 	struct qoriq_sensor	*sensor[SITES_MAX];
 };
@@ -202,32 +204,39 @@
 
 	data->little_endian = of_property_read_bool(np, "little-endian");
 
-	data->regs = of_iomap(np, 0);
-	if (!data->regs) {
+	data->regs = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(data->regs)) {
 		dev_err(&pdev->dev, "Failed to get memory region\n");
-		ret = -ENODEV;
-		goto err_iomap;
+		return PTR_ERR(data->regs);
+	}
+
+	data->clk = devm_clk_get_optional(&pdev->dev, NULL);
+	if (IS_ERR(data->clk))
+		return PTR_ERR(data->clk);
+
+	ret = clk_prepare_enable(data->clk);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to enable clock\n");
+		return ret;
 	}
 
 	qoriq_tmu_init_device(data);	/* TMU initialization */
 
 	ret = qoriq_tmu_calibration(pdev);	/* TMU calibration */
 	if (ret < 0)
-		goto err_tmu;
+		goto err;
 
 	ret = qoriq_tmu_register_tmu_zone(pdev);
 	if (ret < 0) {
 		dev_err(&pdev->dev, "Failed to register sensors\n");
 		ret = -ENODEV;
-		goto err_iomap;
+		goto err;
 	}
 
 	return 0;
 
-err_tmu:
-	iounmap(data->regs);
-
-err_iomap:
+err:
+	clk_disable_unprepare(data->clk);
 	platform_set_drvdata(pdev, NULL);
 
 	return ret;
@@ -240,14 +249,14 @@
 	/* Disable monitoring */
 	tmu_write(data, TMR_DISABLE, &data->regs->tmr);
 
-	iounmap(data->regs);
+	clk_disable_unprepare(data->clk);
+
 	platform_set_drvdata(pdev, NULL);
 
 	return 0;
 }
 
-#ifdef CONFIG_PM_SLEEP
-static int qoriq_tmu_suspend(struct device *dev)
+static int __maybe_unused qoriq_tmu_suspend(struct device *dev)
 {
 	u32 tmr;
 	struct qoriq_tmu_data *data = dev_get_drvdata(dev);
@@ -257,14 +266,21 @@
 	tmr &= ~TMR_ME;
 	tmu_write(data, tmr, &data->regs->tmr);
 
+	clk_disable_unprepare(data->clk);
+
 	return 0;
 }
 
-static int qoriq_tmu_resume(struct device *dev)
+static int __maybe_unused qoriq_tmu_resume(struct device *dev)
 {
 	u32 tmr;
+	int ret;
 	struct qoriq_tmu_data *data = dev_get_drvdata(dev);
 
+	ret = clk_prepare_enable(data->clk);
+	if (ret)
+		return ret;
+
 	/* Enable monitoring */
 	tmr = tmu_read(data, &data->regs->tmr);
 	tmr |= TMR_ME;
@@ -272,7 +288,6 @@
 
 	return 0;
 }
-#endif
 
 static SIMPLE_DEV_PM_OPS(qoriq_tmu_pm_ops,
 			 qoriq_tmu_suspend, qoriq_tmu_resume);
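
Two idioms carry the qoriq conversion. First, devm_platform_ioremap_resource() returns an ERR_PTR rather than NULL on failure, hence the IS_ERR() test replacing the NULL check. Second, devm_clk_get_optional() returns NULL (not an error) when the device has no clock, and clk_prepare_enable()/clk_disable_unprepare() accept a NULL clock as a no-op, so the optional clock needs no special-casing. Finally, marking suspend/resume __maybe_unused and dropping the #ifdef CONFIG_PM_SLEEP guard lets the compiler discard the unused functions while still type-checking both configurations. The optional-clock idiom in isolation:

	clk = devm_clk_get_optional(dev, NULL);
	if (IS_ERR(clk))
		return PTR_ERR(clk);	/* a real error, e.g. -EPROBE_DEFER */

	ret = clk_prepare_enable(clk);	/* no-op when clk == NULL */
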
diff --git a/drivers/thermal/rcar_gen3_thermal.c b/drivers/thermal/rcar_gen3_thermal.c
index a564633..755d2b5 100644
--- a/drivers/thermal/rcar_gen3_thermal.c
+++ b/drivers/thermal/rcar_gen3_thermal.c
@@ -443,9 +443,8 @@
 		if (ret)
 			goto error_unregister;
 
-		ret = devm_add_action(dev, rcar_gen3_hwmon_action, zone);
+		ret = devm_add_action_or_reset(dev, rcar_gen3_hwmon_action, zone);
 		if (ret) {
-			rcar_gen3_hwmon_action(zone);
 			goto error_unregister;
 		}
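
devm_add_action_or_reset() removes the need to invoke the cleanup manually when registration fails, because the helper runs the action itself on failure. Its semantics, essentially as defined in include/linux/device.h:

	static inline int devm_add_action_or_reset(struct device *dev,
						   void (*action)(void *),
						   void *data)
	{
		int ret = devm_add_action(dev, action, data);

		if (ret)
			action(data);	/* undo immediately on failure */
		return ret;
	}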
 
diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c
index 43941eb..5acaad3 100644
--- a/drivers/thermal/tegra/soctherm.c
+++ b/drivers/thermal/tegra/soctherm.c
@@ -202,7 +202,7 @@
 /* get dividend from the depth */
 #define THROT_DEPTH_DIVIDEND(depth)	((256 * (100 - (depth)) / 100) - 1)
 
-/* gk20a nv_therm interface N:3 Mapping. Levels defined in tegra124-sochterm.h
+/* gk20a nv_therm interface N:3 Mapping. Levels defined in tegra124-soctherm.h
  * level	vector
  * NONE		3'b000
  * LOW		3'b001
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 6bab66e..d4481cc 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -304,7 +304,7 @@
 				 &tz->poll_queue,
 				 msecs_to_jiffies(delay));
 	else
-		cancel_delayed_work(&tz->poll_queue);
+		cancel_delayed_work_sync(&tz->poll_queue);
 }
 
 static void monitor_thermal_zone(struct thermal_zone_device *tz)
@@ -985,7 +985,7 @@
 	result = device_register(&cdev->device);
 	if (result) {
 		ida_simple_remove(&thermal_cdev_ida, cdev->id);
-		kfree(cdev);
+		put_device(&cdev->device);
 		return ERR_PTR(result);
 	}
 
@@ -1240,21 +1240,31 @@
 	struct thermal_zone_device *tz;
 	enum thermal_trip_type trip_type;
 	int trip_temp;
+	int id;
 	int result;
 	int count;
 	struct thermal_governor *governor;
 
-	if (!type || strlen(type) == 0)
+	if (!type || strlen(type) == 0) {
+		pr_err("Error: No thermal zone type defined\n");
 		return ERR_PTR(-EINVAL);
+	}
 
-	if (type && strlen(type) >= THERMAL_NAME_LENGTH)
+	if (type && strlen(type) >= THERMAL_NAME_LENGTH) {
+		pr_err("Error: Thermal zone name (%s) too long, should be under %d chars\n",
+		       type, THERMAL_NAME_LENGTH);
 		return ERR_PTR(-EINVAL);
+	}
 
-	if (trips > THERMAL_MAX_TRIPS || trips < 0 || mask >> trips)
+	if (trips > THERMAL_MAX_TRIPS || trips < 0 || mask >> trips) {
+		pr_err("Error: Incorrect number of thermal trips\n");
 		return ERR_PTR(-EINVAL);
+	}
 
-	if (!ops)
+	if (!ops) {
+		pr_err("Error: Thermal zone device ops not defined\n");
 		return ERR_PTR(-EINVAL);
+	}
 
 	if (trips > 0 && (!ops->get_trip_type || !ops->get_trip_temp))
 		return ERR_PTR(-EINVAL);
@@ -1266,11 +1276,13 @@
 	INIT_LIST_HEAD(&tz->thermal_instances);
 	ida_init(&tz->ida);
 	mutex_init(&tz->lock);
-	result = ida_simple_get(&thermal_tz_ida, 0, 0, GFP_KERNEL);
-	if (result < 0)
+	id = ida_simple_get(&thermal_tz_ida, 0, 0, GFP_KERNEL);
+	if (id < 0) {
+		result = id;
 		goto free_tz;
+	}
 
-	tz->id = result;
+	tz->id = id;
 	strlcpy(tz->type, type, sizeof(tz->type));
 	tz->ops = ops;
 	tz->tzp = tzp;
@@ -1292,7 +1304,7 @@
 	dev_set_name(&tz->device, "thermal_zone%d", tz->id);
 	result = device_register(&tz->device);
 	if (result)
-		goto remove_device_groups;
+		goto release_device;
 
 	for (count = 0; count < trips; count++) {
 		if (tz->ops->get_trip_type(tz, count, &trip_type))
@@ -1343,14 +1355,12 @@
 	return tz;
 
 unregister:
-	ida_simple_remove(&thermal_tz_ida, tz->id);
-	device_unregister(&tz->device);
-	return ERR_PTR(result);
-
-remove_device_groups:
-	thermal_zone_destroy_device_groups(tz);
+	device_del(&tz->device);
+release_device:
+	put_device(&tz->device);
+	tz = NULL;
 remove_id:
-	ida_simple_remove(&thermal_tz_ida, tz->id);
+	ida_simple_remove(&thermal_tz_ida, id);
 free_tz:
 	kfree(tz);
 	return ERR_PTR(result);
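
Both thermal_core hunks apply the same driver-model rule: once device_register() has run, the embedded struct device owns the object, so a failed registration must be unwound with put_device() (which invokes the release callback and frees the containing structure), never a direct kfree(). That is also why the zone's error path caches the ida id in a local: after put_device() the tz pointer must be treated as gone. The rule, sketched:

	result = device_register(&tz->device);
	if (result) {
		put_device(&tz->device);		/* release() frees tz */
		ida_simple_remove(&thermal_tz_ida, id);	/* saved id, not tz->id */
		return ERR_PTR(result);
	}
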
diff --git a/drivers/thermal/thermal_hwmon.c b/drivers/thermal/thermal_hwmon.c
index 40c69a5..dd5d8ee 100644
--- a/drivers/thermal/thermal_hwmon.c
+++ b/drivers/thermal/thermal_hwmon.c
@@ -87,13 +87,17 @@
 thermal_hwmon_lookup_by_type(const struct thermal_zone_device *tz)
 {
 	struct thermal_hwmon_device *hwmon;
+	char type[THERMAL_NAME_LENGTH];
 
 	mutex_lock(&thermal_hwmon_list_lock);
-	list_for_each_entry(hwmon, &thermal_hwmon_list, node)
-		if (!strcmp(hwmon->type, tz->type)) {
+	list_for_each_entry(hwmon, &thermal_hwmon_list, node) {
+		strcpy(type, tz->type);
+		strreplace(type, '-', '_');
+		if (!strcmp(hwmon->type, type)) {
 			mutex_unlock(&thermal_hwmon_list_lock);
 			return hwmon;
 		}
+	}
 	mutex_unlock(&thermal_hwmon_list_lock);
 
 	return NULL;
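
hwmon rejects '-' in device names, so zones are registered with '-' replaced by '_'; the lookup above must normalise tz->type the same way, otherwise a second zone of a dashed type (say "cpu-thermal") never matches the existing hwmon device and a duplicate gets created. The normalisation in isolation:

	char type[THERMAL_NAME_LENGTH];

	strcpy(type, tz->type);		/* e.g. "cpu-thermal" */
	strreplace(type, '-', '_');	/* ->   "cpu_thermal" */
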
diff --git a/drivers/thermal/thermal_mmio.c b/drivers/thermal/thermal_mmio.c
index de3ccee..40524fa 100644
--- a/drivers/thermal/thermal_mmio.c
+++ b/drivers/thermal/thermal_mmio.c
@@ -53,13 +53,6 @@
 		return -ENOMEM;
 
 	resource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (IS_ERR(resource)) {
-		dev_err(&pdev->dev,
-			"fail to get platform memory resource (%ld)\n",
-			PTR_ERR(resource));
-		return PTR_ERR(resource);
-	}
-
 	sensor->mmio_base = devm_ioremap_resource(&pdev->dev, resource);
 	if (IS_ERR(sensor->mmio_base)) {
 		dev_err(&pdev->dev, "failed to ioremap memory (%ld)\n",
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 4223cb4..6e713be 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -22,6 +22,7 @@
 #include <linux/serial_core.h>
 #include <linux/delay.h>
 #include <linux/mutex.h>
+#include <linux/security.h>
 
 #include <linux/irq.h>
 #include <linux/uaccess.h>
@@ -862,6 +863,10 @@
 		goto check_and_exit;
 	}
 
+	retval = security_locked_down(LOCKDOWN_TIOCSSERIAL);
+	if (retval && (change_irq || change_port))
+		goto exit;
+
 	/*
 	 * Ask the low level driver to verify the settings.
 	 */
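
The TIOCSSERIAL hook follows the lockdown convention of failing only when a restricted operation is actually attempted: security_locked_down() reports whether the LOCKDOWN_TIOCSSERIAL level is enforced (returning -EPERM if so), but the result is honoured only when the caller is changing the irq or the port address, so benign reconfiguration keeps working on locked-down kernels. The guard in isolation:

	retval = security_locked_down(LOCKDOWN_TIOCSSERIAL);
	if (retval && (change_irq || change_port))
		goto exit;	/* -EPERM only when it matters */
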
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 213ff03..59d9d51 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -17,6 +17,7 @@
 #include <linux/blkdev.h>
 #include <linux/pagemap.h>
 #include <linux/export.h>
+#include <linux/fs_parser.h>
 #include <linux/hid.h>
 #include <linux/mm.h>
 #include <linux/module.h>
@@ -1451,9 +1452,9 @@
 	struct ffs_data *ffs_data;
 };
 
-static int ffs_sb_fill(struct super_block *sb, void *_data, int silent)
+static int ffs_sb_fill(struct super_block *sb, struct fs_context *fc)
 {
-	struct ffs_sb_fill_data *data = _data;
+	struct ffs_sb_fill_data *data = fc->fs_private;
 	struct inode	*inode;
 	struct ffs_data	*ffs = data->ffs_data;
 
@@ -1486,147 +1487,152 @@
 	return 0;
 }
 
-static int ffs_fs_parse_opts(struct ffs_sb_fill_data *data, char *opts)
+enum {
+	Opt_no_disconnect,
+	Opt_rmode,
+	Opt_fmode,
+	Opt_mode,
+	Opt_uid,
+	Opt_gid,
+};
+
+static const struct fs_parameter_spec ffs_fs_param_specs[] = {
+	fsparam_bool	("no_disconnect",	Opt_no_disconnect),
+	fsparam_u32	("rmode",		Opt_rmode),
+	fsparam_u32	("fmode",		Opt_fmode),
+	fsparam_u32	("mode",		Opt_mode),
+	fsparam_u32	("uid",			Opt_uid),
+	fsparam_u32	("gid",			Opt_gid),
+	{}
+};
+
+static const struct fs_parameter_description ffs_fs_fs_parameters = {
+	.name		= "kAFS",
+	.specs		= ffs_fs_param_specs,
+};
+
+static int ffs_fs_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
+	struct ffs_sb_fill_data *data = fc->fs_private;
+	struct fs_parse_result result;
+	int opt;
+
 	ENTER();
 
-	if (!opts || !*opts)
-		return 0;
+	opt = fs_parse(fc, &ffs_fs_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
 
-	for (;;) {
-		unsigned long value;
-		char *eq, *comma;
+	switch (opt) {
+	case Opt_no_disconnect:
+		data->no_disconnect = result.boolean;
+		break;
+	case Opt_rmode:
+		data->root_mode  = (result.uint_32 & 0555) | S_IFDIR;
+		break;
+	case Opt_fmode:
+		data->perms.mode = (result.uint_32 & 0666) | S_IFREG;
+		break;
+	case Opt_mode:
+		data->root_mode  = (result.uint_32 & 0555) | S_IFDIR;
+		data->perms.mode = (result.uint_32 & 0666) | S_IFREG;
+		break;
 
-		/* Option limit */
-		comma = strchr(opts, ',');
-		if (comma)
-			*comma = 0;
+	case Opt_uid:
+		data->perms.uid = make_kuid(current_user_ns(), result.uint_32);
+		if (!uid_valid(data->perms.uid))
+			goto unmapped_value;
+		break;
+	case Opt_gid:
+		data->perms.gid = make_kgid(current_user_ns(), result.uint_32);
+		if (!gid_valid(data->perms.gid))
+			goto unmapped_value;
+		break;
 
-		/* Value limit */
-		eq = strchr(opts, '=');
-		if (unlikely(!eq)) {
-			pr_err("'=' missing in %s\n", opts);
-			return -EINVAL;
-		}
-		*eq = 0;
-
-		/* Parse value */
-		if (kstrtoul(eq + 1, 0, &value)) {
-			pr_err("%s: invalid value: %s\n", opts, eq + 1);
-			return -EINVAL;
-		}
-
-		/* Interpret option */
-		switch (eq - opts) {
-		case 13:
-			if (!memcmp(opts, "no_disconnect", 13))
-				data->no_disconnect = !!value;
-			else
-				goto invalid;
-			break;
-		case 5:
-			if (!memcmp(opts, "rmode", 5))
-				data->root_mode  = (value & 0555) | S_IFDIR;
-			else if (!memcmp(opts, "fmode", 5))
-				data->perms.mode = (value & 0666) | S_IFREG;
-			else
-				goto invalid;
-			break;
-
-		case 4:
-			if (!memcmp(opts, "mode", 4)) {
-				data->root_mode  = (value & 0555) | S_IFDIR;
-				data->perms.mode = (value & 0666) | S_IFREG;
-			} else {
-				goto invalid;
-			}
-			break;
-
-		case 3:
-			if (!memcmp(opts, "uid", 3)) {
-				data->perms.uid = make_kuid(current_user_ns(), value);
-				if (!uid_valid(data->perms.uid)) {
-					pr_err("%s: unmapped value: %lu\n", opts, value);
-					return -EINVAL;
-				}
-			} else if (!memcmp(opts, "gid", 3)) {
-				data->perms.gid = make_kgid(current_user_ns(), value);
-				if (!gid_valid(data->perms.gid)) {
-					pr_err("%s: unmapped value: %lu\n", opts, value);
-					return -EINVAL;
-				}
-			} else {
-				goto invalid;
-			}
-			break;
-
-		default:
-invalid:
-			pr_err("%s: invalid option\n", opts);
-			return -EINVAL;
-		}
-
-		/* Next iteration */
-		if (!comma)
-			break;
-		opts = comma + 1;
+	default:
+		return -ENOPARAM;
 	}
 
 	return 0;
+
+unmapped_value:
+	return invalf(fc, "%s: unmapped value: %u", param->key, result.uint_32);
 }
 
-/* "mount -t functionfs dev_name /dev/function" ends up here */
-
-static struct dentry *
-ffs_fs_mount(struct file_system_type *t, int flags,
-	      const char *dev_name, void *opts)
+/*
+ * Set up the superblock for a mount.
+ */
+static int ffs_fs_get_tree(struct fs_context *fc)
 {
-	struct ffs_sb_fill_data data = {
-		.perms = {
-			.mode = S_IFREG | 0600,
-			.uid = GLOBAL_ROOT_UID,
-			.gid = GLOBAL_ROOT_GID,
-		},
-		.root_mode = S_IFDIR | 0500,
-		.no_disconnect = false,
-	};
-	struct dentry *rv;
-	int ret;
+	struct ffs_sb_fill_data *ctx = fc->fs_private;
 	void *ffs_dev;
 	struct ffs_data	*ffs;
 
 	ENTER();
 
-	ret = ffs_fs_parse_opts(&data, opts);
-	if (unlikely(ret < 0))
-		return ERR_PTR(ret);
+	if (!fc->source)
+		return invalf(fc, "No source specified");
 
-	ffs = ffs_data_new(dev_name);
+	ffs = ffs_data_new(fc->source);
 	if (unlikely(!ffs))
-		return ERR_PTR(-ENOMEM);
-	ffs->file_perms = data.perms;
-	ffs->no_disconnect = data.no_disconnect;
+		return -ENOMEM;
+	ffs->file_perms = ctx->perms;
+	ffs->no_disconnect = ctx->no_disconnect;
 
-	ffs->dev_name = kstrdup(dev_name, GFP_KERNEL);
+	ffs->dev_name = kstrdup(fc->source, GFP_KERNEL);
 	if (unlikely(!ffs->dev_name)) {
 		ffs_data_put(ffs);
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	}
 
-	ffs_dev = ffs_acquire_dev(dev_name);
+	ffs_dev = ffs_acquire_dev(ffs->dev_name);
 	if (IS_ERR(ffs_dev)) {
 		ffs_data_put(ffs);
-		return ERR_CAST(ffs_dev);
+		return PTR_ERR(ffs_dev);
 	}
-	ffs->private_data = ffs_dev;
-	data.ffs_data = ffs;
 
-	rv = mount_nodev(t, flags, &data, ffs_sb_fill);
-	if (IS_ERR(rv) && data.ffs_data) {
-		ffs_release_dev(data.ffs_data);
-		ffs_data_put(data.ffs_data);
+	ffs->private_data = ffs_dev;
+	ctx->ffs_data = ffs;
+	return get_tree_nodev(fc, ffs_sb_fill);
+}
+
+static void ffs_fs_free_fc(struct fs_context *fc)
+{
+	struct ffs_sb_fill_data *ctx = fc->fs_private;
+
+	if (ctx) {
+		if (ctx->ffs_data) {
+			ffs_release_dev(ctx->ffs_data);
+			ffs_data_put(ctx->ffs_data);
+		}
+
+		kfree(ctx);
 	}
-	return rv;
+}
+
+static const struct fs_context_operations ffs_fs_context_ops = {
+	.free		= ffs_fs_free_fc,
+	.parse_param	= ffs_fs_parse_param,
+	.get_tree	= ffs_fs_get_tree,
+};
+
+static int ffs_fs_init_fs_context(struct fs_context *fc)
+{
+	struct ffs_sb_fill_data *ctx;
+
+	ctx = kzalloc(sizeof(struct ffs_sb_fill_data), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->perms.mode = S_IFREG | 0600;
+	ctx->perms.uid = GLOBAL_ROOT_UID;
+	ctx->perms.gid = GLOBAL_ROOT_GID;
+	ctx->root_mode = S_IFDIR | 0500;
+	ctx->no_disconnect = false;
+
+	fc->fs_private = ctx;
+	fc->ops = &ffs_fs_context_ops;
+	return 0;
 }
 
 static void
@@ -1644,7 +1650,8 @@
 static struct file_system_type ffs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "functionfs",
-	.mount		= ffs_fs_mount,
+	.init_fs_context = ffs_fs_init_fs_context,
+	.parameters	= &ffs_fs_fs_parameters,
 	.kill_sb	= ffs_fs_kill_sb,
 };
 MODULE_ALIAS_FS("functionfs");
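
The functionfs conversion is a textbook move from the legacy ->mount() path to the fs_context API: options are declared once in an fs_parameter table, parsed per-parameter in ->parse_param(), the superblock is built by get_tree_nodev(), and context teardown is centralised in ->free(). A stripped-down skeleton of that shape (all demo_* names are hypothetical; fs_parse() takes the fs_parameter_description, as in this kernel):

	static int demo_parse_param(struct fs_context *fc, struct fs_parameter *param)
	{
		struct fs_parse_result result;
		int opt = fs_parse(fc, &demo_parameters, param, &result);

		if (opt < 0)
			return opt;
		/* switch (opt) stores result.uint_32 / result.boolean */
		return 0;
	}

	static int demo_init_fs_context(struct fs_context *fc)
	{
		fc->fs_private = kzalloc(sizeof(struct demo_ctx), GFP_KERNEL);
		if (!fc->fs_private)
			return -ENOMEM;
		fc->ops = &demo_ctx_ops;	/* .parse_param/.get_tree/.free */
		return 0;
	}
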
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 3b18fa4..26cef65 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -176,13 +176,13 @@
 }
 
 static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
-		unsigned int page_shift)
+		unsigned int it_page_shift)
 {
 	struct page *page;
 	unsigned long size = 0;
 
-	if (mm_iommu_is_devmem(mm, hpa, page_shift, &size))
-		return size == (1UL << page_shift);
+	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
+		return size == (1UL << it_page_shift);
 
 	page = pfn_to_page(hpa >> PAGE_SHIFT);
 	/*
@@ -190,7 +190,7 @@
 	 * a page we just found. Otherwise the hardware can get access to
 	 * a bigger memory chunk that it should.
 	 */
-	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
+	return page_shift(compound_head(page)) >= it_page_shift;
 }
 
 static inline bool tce_groups_attached(struct tce_container *container)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 9a50b05..96fddc1d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -375,6 +375,8 @@
 
 	down_read(&mm->mmap_sem);
 
+	vaddr = untagged_addr(vaddr);
+
 	vma = find_vma_intersection(mm, vaddr, vaddr + 1);
 
 	if (vma && vma->vm_flags & VM_PFNMAP) {
diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig
index 8b081d6..40676be 100644
--- a/drivers/video/backlight/Kconfig
+++ b/drivers/video/backlight/Kconfig
@@ -10,7 +10,6 @@
 #
 config LCD_CLASS_DEVICE
         tristate "Lowlevel LCD controls"
-	default m
 	help
 	  This framework adds support for low-level control of LCD.
 	  Some framebuffer devices connect to platform-specific LCD modules
@@ -143,7 +142,6 @@
 #
 config BACKLIGHT_CLASS_DEVICE
         tristate "Lowlevel Backlight controls"
-	default m
 	help
 	  This framework adds support for low-level control of the LCD
           backlight. This includes support for brightness and power.
diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index 5dc0710..cac3e35 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -32,6 +32,12 @@
 	[BACKLIGHT_FIRMWARE] = "firmware",
 };
 
+static const char *const backlight_scale_types[] = {
+	[BACKLIGHT_SCALE_UNKNOWN]	= "unknown",
+	[BACKLIGHT_SCALE_LINEAR]	= "linear",
+	[BACKLIGHT_SCALE_NON_LINEAR]	= "non-linear",
+};
+
 #if defined(CONFIG_FB) || (defined(CONFIG_FB_MODULE) && \
 			   defined(CONFIG_BACKLIGHT_CLASS_DEVICE_MODULE))
 /* This callback gets called when something important happens inside a
@@ -246,6 +252,18 @@
 }
 static DEVICE_ATTR_RO(actual_brightness);
 
+static ssize_t scale_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct backlight_device *bd = to_backlight_device(dev);
+
+	if (WARN_ON(bd->props.scale > BACKLIGHT_SCALE_NON_LINEAR))
+		return sprintf(buf, "unknown\n");
+
+	return sprintf(buf, "%s\n", backlight_scale_types[bd->props.scale]);
+}
+static DEVICE_ATTR_RO(scale);
+
 static struct class *backlight_class;
 
 #ifdef CONFIG_PM_SLEEP
@@ -292,6 +310,7 @@
 	&dev_attr_brightness.attr,
 	&dev_attr_actual_brightness.attr,
 	&dev_attr_max_brightness.attr,
+	&dev_attr_scale.attr,
 	&dev_attr_type.attr,
 	NULL,
 };
diff --git a/drivers/video/backlight/gpio_backlight.c b/drivers/video/backlight/gpio_backlight.c
index e84f308..18e053e 100644
--- a/drivers/video/backlight/gpio_backlight.c
+++ b/drivers/video/backlight/gpio_backlight.c
@@ -59,13 +59,11 @@
 				   struct gpio_backlight *gbl)
 {
 	struct device *dev = &pdev->dev;
-	enum gpiod_flags flags;
 	int ret;
 
 	gbl->def_value = device_property_read_bool(dev, "default-on");
-	flags = gbl->def_value ? GPIOD_OUT_HIGH : GPIOD_OUT_LOW;
 
-	gbl->gpiod = devm_gpiod_get(dev, NULL, flags);
+	gbl->gpiod = devm_gpiod_get(dev, NULL, GPIOD_ASIS);
 	if (IS_ERR(gbl->gpiod)) {
 		ret = PTR_ERR(gbl->gpiod);
 
@@ -79,6 +77,22 @@
 	return 0;
 }
 
+static int gpio_backlight_initial_power_state(struct gpio_backlight *gbl)
+{
+	struct device_node *node = gbl->dev->of_node;
+
+	/* Not booted with device tree or no phandle link to the node */
+	if (!node || !node->phandle)
+		return gbl->def_value ? FB_BLANK_UNBLANK : FB_BLANK_POWERDOWN;
+
+	/* if the enable GPIO is disabled, do not enable the backlight */
+	if (gpiod_get_value_cansleep(gbl->gpiod) == 0)
+		return FB_BLANK_POWERDOWN;
+
+	return FB_BLANK_UNBLANK;
+}
+
 static int gpio_backlight_probe(struct platform_device *pdev)
 {
 	struct gpio_backlight_platform_data *pdata =
@@ -136,7 +150,9 @@
 		return PTR_ERR(bl);
 	}
 
-	bl->props.brightness = gbl->def_value;
+	bl->props.power = gpio_backlight_initial_power_state(gbl);
+	bl->props.brightness = 1;
+
 	backlight_update_status(bl);
 
 	platform_set_drvdata(pdev, bl);
diff --git a/drivers/video/backlight/lm3630a_bl.c b/drivers/video/backlight/lm3630a_bl.c
index b04b35d..2d8e819 100644
--- a/drivers/video/backlight/lm3630a_bl.c
+++ b/drivers/video/backlight/lm3630a_bl.c
@@ -377,8 +377,7 @@
 	u32 sources[LM3630A_NUM_SINKS];
 	int ret, num_sources, i;
 
-	num_sources = fwnode_property_read_u32_array(node, "led-sources", NULL,
-						     0);
+	num_sources = fwnode_property_count_u32(node, "led-sources");
 	if (num_sources < 0)
 		return default_led_sources;
 	else if (num_sources > ARRAY_SIZE(sources))
diff --git a/drivers/video/backlight/lms283gf05.c b/drivers/video/backlight/lms283gf05.c
index 35bc012..0e45685 100644
--- a/drivers/video/backlight/lms283gf05.c
+++ b/drivers/video/backlight/lms283gf05.c
@@ -158,7 +158,7 @@
 		ret = devm_gpio_request_one(&spi->dev, pdata->reset_gpio,
 				GPIOF_DIR_OUT | (!pdata->reset_inverted ?
 				GPIOF_INIT_HIGH : GPIOF_INIT_LOW),
-				"LMS285GF05 RESET");
+				"LMS283GF05 RESET");
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c
index 2201b8c..746eebc 100644
--- a/drivers/video/backlight/pwm_bl.c
+++ b/drivers/video/backlight/pwm_bl.c
@@ -387,6 +387,31 @@
 }
 #endif
 
+static bool pwm_backlight_is_linear(struct platform_pwm_backlight_data *data)
+{
+	unsigned int nlevels = data->max_brightness + 1;
+	unsigned int min_val = data->levels[0];
+	unsigned int max_val = data->levels[nlevels - 1];
+	/*
+	 * Multiplying by 128 means that even in pathological cases such
+	 * as (max_val - min_val) == nlevels the error at max_val is less
+	 * than 1%.
+	 */
+	unsigned int slope = (128 * (max_val - min_val)) / nlevels;
+	unsigned int margin = (max_val - min_val) / 20; /* 5% */
+	int i;
+
+	for (i = 1; i < nlevels; i++) {
+		unsigned int linear_value = min_val + ((i * slope) / 128);
+		unsigned int delta = abs(linear_value - data->levels[i]);
+
+		if (delta > margin)
+			return false;
+	}
+
+	return true;
+}
+
 static int pwm_backlight_initial_power_state(const struct pwm_bl_data *pb)
 {
 	struct device_node *node = pb->dev->of_node;
@@ -536,6 +561,8 @@
 		goto err_alloc;
 	}
 
+	memset(&props, 0, sizeof(struct backlight_properties));
+
 	if (data->levels) {
 		/*
 		 * For the DT case, only when brightness levels is defined
@@ -548,6 +575,11 @@
 
 			pb->levels = data->levels;
 		}
+
+		if (pwm_backlight_is_linear(data))
+			props.scale = BACKLIGHT_SCALE_LINEAR;
+		else
+			props.scale = BACKLIGHT_SCALE_NON_LINEAR;
 	} else if (!data->max_brightness) {
 		/*
 		 * If no brightness levels are provided and max_brightness is
@@ -574,6 +606,8 @@
 
 			pb->levels = data->levels;
 		}
+
+		props.scale = BACKLIGHT_SCALE_NON_LINEAR;
 	} else {
 		/*
 		 * That only happens for the non-DT case, where platform data
@@ -584,7 +618,6 @@
 
 	pb->lth_brightness = data->lth_brightness * (state.period / pb->scale);
 
-	memset(&props, 0, sizeof(struct backlight_properties));
 	props.type = BACKLIGHT_RAW;
 	props.max_brightness = data->max_brightness;
 	bl = backlight_device_register(dev_name(&pdev->dev), &pdev->dev, pb,
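
pwm_backlight_is_linear() fits a straight line through the endpoints of the brightness table in 7-bit fixed point (the x128 scaling keeps the integer-math error under 1%) and accepts the table as linear when every level sits within 5% of the span from that line. A worked pass over a hypothetical 256-entry table running 0..255:

	/*
	 * nlevels = 256, min_val = 0, max_val = 255 (hypothetical table)
	 * slope  = (128 * (255 - 0)) / 256 = 127	(~0.992 per step)
	 * margin = (255 - 0) / 20 = 12			(the 5% band)
	 * at i = 200: linear_value = (200 * 127) / 128 = 198;
	 * a table entry of, say, 205 gives delta = 7 <= 12 -> still linear
	 */
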
diff --git a/drivers/video/backlight/rave-sp-backlight.c b/drivers/video/backlight/rave-sp-backlight.c
index 462f14a..05b5f00 100644
--- a/drivers/video/backlight/rave-sp-backlight.c
+++ b/drivers/video/backlight/rave-sp-backlight.c
@@ -48,14 +48,20 @@
 	struct device *dev = &pdev->dev;
 	struct backlight_device *bd;
 
-	bd = devm_backlight_device_register(dev, pdev->name, dev->parent,
+	bd = devm_backlight_device_register(dev, pdev->name, dev,
 					    dev_get_drvdata(dev->parent),
 					    &rave_sp_backlight_ops,
 					    &rave_sp_backlight_props);
 	if (IS_ERR(bd))
 		return PTR_ERR(bd);
 
-	backlight_update_status(bd);
+	/*
+	 * If there is a phandle pointing to the device node we can
+	 * assume that another device will manage the status changes.
+	 * If not we make sure the backlight is in a consistent state.
+	 */
+	if (!dev->of_node->phandle)
+		backlight_update_status(bd);
 
 	return 0;
 }
diff --git a/drivers/video/backlight/tosa_lcd.c b/drivers/video/backlight/tosa_lcd.c
index 65cb757..29af8e2 100644
--- a/drivers/video/backlight/tosa_lcd.c
+++ b/drivers/video/backlight/tosa_lcd.c
@@ -222,8 +222,7 @@
 {
 	struct tosa_lcd_data *data = spi_get_drvdata(spi);
 
-	if (data->i2c)
-		i2c_unregister_device(data->i2c);
+	i2c_unregister_device(data->i2c);
 
 	tosa_lcd_tg_off(data);
 
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index a45f9e3..58e7c10 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -477,13 +477,6 @@
 
 	  Say N if you are unsure.
 
-config KS8695_WATCHDOG
-	tristate "KS8695 watchdog"
-	depends on ARCH_KS8695
-	help
-	  Watchdog timer embedded into KS8695 processor. This will reboot your
-	  system when the timeout is reached.
-
 config HAVE_S3C2410_WATCHDOG
 	bool
 	help
@@ -662,15 +655,6 @@
 	  To compile this driver as a module, choose M here: the
 	  module will be called stmp3xxx_rtc_wdt.
 
-config NUC900_WATCHDOG
-	tristate "Nuvoton NUC900 watchdog"
-	depends on ARCH_W90X900 || COMPILE_TEST
-	help
-	  Say Y here if to include support for the watchdog timer
-	  for the Nuvoton NUC900 series SoCs.
-	  To compile this driver as a module, choose M here: the
-	  module will be called nuc900_wdt.
-
 config TS4800_WATCHDOG
 	tristate "TS-4800 Watchdog"
 	depends on HAS_IOMEM && OF
@@ -740,6 +724,19 @@
 	  To compile this driver as a module, choose M here: the
 	  module will be called imx_sc_wdt.
 
+config IMX7ULP_WDT
+	tristate "IMX7ULP Watchdog"
+	depends on ARCH_MXC || COMPILE_TEST
+	select WATCHDOG_CORE
+	help
+	  This is the driver for the hardware watchdog on the Freescale
+	  IMX7ULP and later processors. If you have one of these
+	  processors and wish to have watchdog support enabled,
+	  say Y, otherwise say N.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called imx7ulp_wdt.
+
 config UX500_WATCHDOG
 	tristate "ST-Ericsson Ux500 watchdog"
 	depends on MFD_DB8500_PRCMU
@@ -1046,8 +1043,8 @@
 	depends on X86
 	help
 	  This is the driver for the hardware watchdog on the Fintek F71808E,
-	  F71862FG, F71868, F71869, F71882FG, F71889FG, F81865 and F81866
-	  Super I/O controllers.
+	  F71862FG, F71868, F71869, F71882FG, F71889FG, F81803, F81865, and
+	  F81866 Super I/O controllers.
 
 	  You can compile this driver directly into the kernel, or use
 	  it as a module.  The module will be called f71808e_wdt.
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index 7caa920..2ee352b 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -49,7 +49,6 @@
 obj-$(CONFIG_977_WATCHDOG) += wdt977.o
 obj-$(CONFIG_FTWDT010_WATCHDOG) += ftwdt010_wdt.o
 obj-$(CONFIG_IXP4XX_WATCHDOG) += ixp4xx_wdt.o
-obj-$(CONFIG_KS8695_WATCHDOG) += ks8695_wdt.o
 obj-$(CONFIG_S3C2410_WATCHDOG) += s3c2410_wdt.o
 obj-$(CONFIG_SA1100_WATCHDOG) += sa1100_wdt.o
 obj-$(CONFIG_SAMA5D4_WATCHDOG) += sama5d4_wdt.o
@@ -64,11 +63,11 @@
 obj-$(CONFIG_COH901327_WATCHDOG) += coh901327_wdt.o
 obj-$(CONFIG_NPCM7XX_WATCHDOG) += npcm_wdt.o
 obj-$(CONFIG_STMP3XXX_RTC_WATCHDOG) += stmp3xxx_rtc_wdt.o
-obj-$(CONFIG_NUC900_WATCHDOG) += nuc900_wdt.o
 obj-$(CONFIG_TS4800_WATCHDOG) += ts4800_wdt.o
 obj-$(CONFIG_TS72XX_WATCHDOG) += ts72xx_wdt.o
 obj-$(CONFIG_IMX2_WDT) += imx2_wdt.o
 obj-$(CONFIG_IMX_SC_WDT) += imx_sc_wdt.o
+obj-$(CONFIG_IMX7ULP_WDT) += imx7ulp_wdt.o
 obj-$(CONFIG_UX500_WATCHDOG) += ux500_wdt.o
 obj-$(CONFIG_RETU_WATCHDOG) += retu_wdt.o
 obj-$(CONFIG_BCM2835_WDT) += bcm2835_wdt.o
diff --git a/drivers/watchdog/aspeed_wdt.c b/drivers/watchdog/aspeed_wdt.c
index cc71861..4ec0906 100644
--- a/drivers/watchdog/aspeed_wdt.c
+++ b/drivers/watchdog/aspeed_wdt.c
@@ -34,6 +34,7 @@
 static const struct of_device_id aspeed_wdt_of_table[] = {
 	{ .compatible = "aspeed,ast2400-wdt", .data = &ast2400_config },
 	{ .compatible = "aspeed,ast2500-wdt", .data = &ast2500_config },
+	{ .compatible = "aspeed,ast2600-wdt", .data = &ast2500_config },
 	{ },
 };
 MODULE_DEVICE_TABLE(of, aspeed_wdt_of_table);
@@ -53,6 +54,8 @@
 #define   WDT_CTRL_ENABLE		BIT(0)
 #define WDT_TIMEOUT_STATUS	0x10
 #define   WDT_TIMEOUT_STATUS_BOOT_SECONDARY	BIT(1)
+#define WDT_CLEAR_TIMEOUT_STATUS	0x14
+#define   WDT_CLEAR_TIMEOUT_AND_BOOT_CODE_SELECTION	BIT(0)
 
 /*
  * WDT_RESET_WIDTH controls the characteristics of the external pulse (if
@@ -165,6 +168,60 @@
 	return 0;
 }
 
+/* access_cs0 shows if cs0 is accessible, hence the inverted bit */
+static ssize_t access_cs0_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct aspeed_wdt *wdt = dev_get_drvdata(dev);
+	u32 status = readl(wdt->base + WDT_TIMEOUT_STATUS);
+
+	return sprintf(buf, "%u\n",
+		      !(status & WDT_TIMEOUT_STATUS_BOOT_SECONDARY));
+}
+
+static ssize_t access_cs0_store(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t size)
+{
+	struct aspeed_wdt *wdt = dev_get_drvdata(dev);
+	unsigned long val;
+
+	if (kstrtoul(buf, 10, &val))
+		return -EINVAL;
+
+	if (val)
+		writel(WDT_CLEAR_TIMEOUT_AND_BOOT_CODE_SELECTION,
+		       wdt->base + WDT_CLEAR_TIMEOUT_STATUS);
+
+	return size;
+}
+
+/*
+ * This attribute exists only if the system has booted from the alternate
+ * flash with 'alt-boot' option.
+ *
+ * At alternate flash the 'access_cs0' sysfs node provides:
+ *   ast2400: a way to get access to the primary SPI flash chip at CS0
+ *            after booting from the alternate chip at CS1.
+ *   ast2500: a way to restore the normal address mapping from
+ *            (CS0->CS1, CS1->CS0) to (CS0->CS0, CS1->CS1).
+ *
+ * Clearing the boot code selection and timeout counter also resets the
+ * chip select line mapping to its initial state. When the SoC is in normal
+ * mapping state (i.e. booted from CS0), clearing those bits does nothing for
+ * both versions of the SoC. For alternate boot mode (booted from CS1 due to
+ * wdt2 expiration) the behavior differs as described above.
+ *
+ * This option can be used with wdt2 (watchdog1) only.
+ */
+static DEVICE_ATTR_RW(access_cs0);
+
+static struct attribute *bswitch_attrs[] = {
+	&dev_attr_access_cs0.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(bswitch);
+
 static const struct watchdog_ops aspeed_wdt_ops = {
 	.start		= aspeed_wdt_start,
 	.stop		= aspeed_wdt_stop,
@@ -259,7 +316,8 @@
 		set_bit(WDOG_HW_RUNNING, &wdt->wdd.status);
 	}
 
-	if (of_device_is_compatible(np, "aspeed,ast2500-wdt")) {
+	if ((of_device_is_compatible(np, "aspeed,ast2500-wdt")) ||
+		(of_device_is_compatible(np, "aspeed,ast2600-wdt"))) {
 		u32 reg = readl(wdt->base + WDT_RESET_WIDTH);
 
 		reg &= config->ext_pulse_width_mask;
@@ -306,9 +364,16 @@
 	}
 
 	status = readl(wdt->base + WDT_TIMEOUT_STATUS);
-	if (status & WDT_TIMEOUT_STATUS_BOOT_SECONDARY)
+	if (status & WDT_TIMEOUT_STATUS_BOOT_SECONDARY) {
 		wdt->wdd.bootstatus = WDIOF_CARDRESET;
 
+		if (of_device_is_compatible(np, "aspeed,ast2400-wdt") ||
+		    of_device_is_compatible(np, "aspeed,ast2500-wdt"))
+			wdt->wdd.groups = bswitch_groups;
+	}
+
+	dev_set_drvdata(dev, wdt);
+
 	return devm_watchdog_register_device(dev, &wdt->wdd);
 }
 
diff --git a/drivers/watchdog/ath79_wdt.c b/drivers/watchdog/ath79_wdt.c
index 2e09981..75de664 100644
--- a/drivers/watchdog/ath79_wdt.c
+++ b/drivers/watchdog/ath79_wdt.c
@@ -302,7 +302,7 @@
 	return 0;
 }
 
-static void ath97_wdt_shutdown(struct platform_device *pdev)
+static void ath79_wdt_shutdown(struct platform_device *pdev)
 {
 	ath79_wdt_disable();
 }
@@ -318,7 +318,7 @@
 static struct platform_driver ath79_wdt_driver = {
 	.probe		= ath79_wdt_probe,
 	.remove		= ath79_wdt_remove,
-	.shutdown	= ath97_wdt_shutdown,
+	.shutdown	= ath79_wdt_shutdown,
 	.driver		= {
 		.name	= DRIVER_NAME,
 		.of_match_table = of_match_ptr(ath79_wdt_match),
diff --git a/drivers/watchdog/cpwd.c b/drivers/watchdog/cpwd.c
index b973b31..9393be5 100644
--- a/drivers/watchdog/cpwd.c
+++ b/drivers/watchdog/cpwd.c
@@ -473,29 +473,6 @@
 	return 0;
 }
 
-static long cpwd_compat_ioctl(struct file *file, unsigned int cmd,
-			      unsigned long arg)
-{
-	int rval = -ENOIOCTLCMD;
-
-	switch (cmd) {
-	/* solaris ioctls are specific to this driver */
-	case WIOCSTART:
-	case WIOCSTOP:
-	case WIOCGSTAT:
-		mutex_lock(&cpwd_mutex);
-		rval = cpwd_ioctl(file, cmd, arg);
-		mutex_unlock(&cpwd_mutex);
-		break;
-
-	/* everything else is handled by the generic compat layer */
-	default:
-		break;
-	}
-
-	return rval;
-}
-
 static ssize_t cpwd_write(struct file *file, const char __user *buf,
 			  size_t count, loff_t *ppos)
 {
@@ -520,7 +497,7 @@
 static const struct file_operations cpwd_fops = {
 	.owner =		THIS_MODULE,
 	.unlocked_ioctl =	cpwd_ioctl,
-	.compat_ioctl =		cpwd_compat_ioctl,
+	.compat_ioctl =		compat_ptr_ioctl,
 	.open =			cpwd_open,
 	.write =		cpwd_write,
 	.read =			cpwd_read,
diff --git a/drivers/watchdog/diag288_wdt.c b/drivers/watchdog/diag288_wdt.c
index 181440b..aafc8d9 100644
--- a/drivers/watchdog/diag288_wdt.c
+++ b/drivers/watchdog/diag288_wdt.c
@@ -26,13 +26,11 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/slab.h>
-#include <linux/miscdevice.h>
 #include <linux/watchdog.h>
 #include <linux/suspend.h>
 #include <asm/ebcdic.h>
 #include <asm/diag.h>
 #include <linux/io.h>
-#include <linux/uaccess.h>
 
 #define MAX_CMDLEN 240
 #define DEFAULT_CMD "SYSTEM RESTART"
@@ -70,7 +68,6 @@
 module_param_named(nowayout, nowayout_info, bool, 0444);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default = CONFIG_WATCHDOG_NOWAYOUT)");
 
-MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
 MODULE_ALIAS("vmwatchdog");
 
 static int __diag288(unsigned int func, unsigned int timeout,
diff --git a/drivers/watchdog/f71808e_wdt.c b/drivers/watchdog/f71808e_wdt.c
index ff5cf1b..e46104c 100644
--- a/drivers/watchdog/f71808e_wdt.c
+++ b/drivers/watchdog/f71808e_wdt.c
@@ -31,8 +31,10 @@
 #define SIO_REG_DEVID		0x20	/* Device ID (2 bytes) */
 #define SIO_REG_DEVREV		0x22	/* Device revision */
 #define SIO_REG_MANID		0x23	/* Fintek ID (2 bytes) */
+#define SIO_REG_CLOCK_SEL	0x26	/* Clock select */
 #define SIO_REG_ROM_ADDR_SEL	0x27	/* ROM address select */
 #define SIO_F81866_REG_PORT_SEL	0x27	/* F81866 Multi-Function Register */
+#define SIO_REG_TSI_LEVEL_SEL	0x28	/* TSI Level select */
 #define SIO_REG_MFUNCT1		0x29	/* Multi function select 1 */
 #define SIO_REG_MFUNCT2		0x2a	/* Multi function select 2 */
 #define SIO_REG_MFUNCT3		0x2b	/* Multi function select 3 */
@@ -49,6 +51,7 @@
 #define SIO_F71869A_ID		0x1007	/* Chipset ID */
 #define SIO_F71882_ID		0x0541	/* Chipset ID */
 #define SIO_F71889_ID		0x0723	/* Chipset ID */
+#define SIO_F81803_ID		0x1210	/* Chipset ID */
 #define SIO_F81865_ID		0x0704	/* Chipset ID */
 #define SIO_F81866_ID		0x1010	/* Chipset ID */
 
@@ -108,7 +111,7 @@
 	" given initial timeout. Zero (default) disables this feature.");
 
 enum chips { f71808fg, f71858fg, f71862fg, f71868, f71869, f71882fg, f71889fg,
-	     f81865, f81866};
+	     f81803, f81865, f81866};
 
 static const char *f71808e_names[] = {
 	"f71808fg",
@@ -118,6 +121,7 @@
 	"f71869",
 	"f71882fg",
 	"f71889fg",
+	"f81803",
 	"f81865",
 	"f81866",
 };
@@ -370,6 +374,14 @@
 			superio_inb(watchdog.sioaddr, SIO_REG_MFUNCT3) & 0xcf);
 		break;
 
+	case f81803:
+		/* Enable TSI Level register bank */
+		superio_clear_bit(watchdog.sioaddr, SIO_REG_CLOCK_SEL, 3);
+		/* Set pin 27 to WDTRST# */
+		superio_outb(watchdog.sioaddr, SIO_REG_TSI_LEVEL_SEL, 0x5f &
+			superio_inb(watchdog.sioaddr, SIO_REG_TSI_LEVEL_SEL));
+		break;
+
 	case f81865:
 		/* Set pin 70 to WDTRST# */
 		superio_clear_bit(watchdog.sioaddr, SIO_REG_MFUNCT3, 5);
@@ -809,6 +821,9 @@
 		/* Confirmed (by datasheet) not to have a watchdog. */
 		err = -ENODEV;
 		goto exit;
+	case SIO_F81803_ID:
+		watchdog.type = f81803;
+		break;
 	case SIO_F81865_ID:
 		watchdog.type = f81865;
 		break;
diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index c559f70..156360e 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -48,6 +48,7 @@
 
 /* Includes */
 #include <linux/acpi.h>			/* For ACPI support */
+#include <linux/bits.h>			/* For BIT() */
 #include <linux/module.h>		/* For module specific items */
 #include <linux/moduleparam.h>		/* For new moduleparam's */
 #include <linux/types.h>		/* For standard types (like size_t) */
@@ -215,6 +216,23 @@
 	return 0;
 }
 
+static int update_no_reboot_bit_cnt(void *priv, bool set)
+{
+	struct iTCO_wdt_private *p = priv;
+	u16 val, newval;
+
+	val = inw(TCO1_CNT(p));
+	if (set)
+		val |= BIT(0);
+	else
+		val &= ~BIT(0);
+	outw(val, TCO1_CNT(p));
+	newval = inw(TCO1_CNT(p));
+
+	/* make sure the update is successful */
+	return val != newval ? -EIO : 0;
+}
+
 static void iTCO_wdt_no_reboot_bit_setup(struct iTCO_wdt_private *p,
 		struct itco_wdt_platform_data *pdata)
 {
@@ -224,7 +242,9 @@
 		return;
 	}
 
-	if (p->iTCO_version >= 2)
+	if (p->iTCO_version >= 6)
+		p->update_no_reboot_bit = update_no_reboot_bit_cnt;
+	else if (p->iTCO_version >= 2)
 		p->update_no_reboot_bit = update_no_reboot_bit_mem;
 	else if (p->iTCO_version == 1)
 		p->update_no_reboot_bit = update_no_reboot_bit_pci;
@@ -452,7 +472,8 @@
 	 * Get the Memory-Mapped GCS or PMC register, we need it for the
 	 * NO_REBOOT flag (TCO v2 and v3).
 	 */
-	if (p->iTCO_version >= 2 && !pdata->update_no_reboot_bit) {
+	if (p->iTCO_version >= 2 && p->iTCO_version < 6 &&
+	    !pdata->update_no_reboot_bit) {
 		p->gcs_pmc_res = platform_get_resource(pdev,
 						       IORESOURCE_MEM,
 						       ICH_RES_MEM_GCS_PMC);
@@ -502,6 +523,7 @@
 
 	/* Clear out the (probably old) status */
 	switch (p->iTCO_version) {
+	case 6:
 	case 5:
 	case 4:
 		outw(0x0008, TCO1_STS(p)); /* Clear the Time Out Status bit */
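
For the new version-6 path the NO_REBOOT bit lives in bit 0 of the TCO1_CNT I/O register rather than the memory-mapped GCS/PMC space, which is why probe skips the MEM resource for v6. The helper also verifies the write by reading the register back, since on some boards the bit is strapped read-only; a failed read-back lets probe bail out cleanly instead of fielding a watchdog it cannot disarm. The write-then-verify idiom in isolation:

	outw(val, TCO1_CNT(p));		/* try to flip NO_REBOOT */
	if (inw(TCO1_CNT(p)) != val)
		return -EIO;		/* bit is locked; give up */
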
diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c
index 32af397..8d019a9 100644
--- a/drivers/watchdog/imx2_wdt.c
+++ b/drivers/watchdog/imx2_wdt.c
@@ -55,7 +55,7 @@
 
 #define IMX2_WDT_WMCR		0x08		/* Misc Register */
 
-#define IMX2_WDT_MAX_TIME	128
+#define IMX2_WDT_MAX_TIME	128U
 #define IMX2_WDT_DEFAULT_TIME	60		/* in seconds */
 
 #define WDOG_SEC_TO_COUNT(s)	((s * 2 - 1) << 8)
@@ -180,7 +180,7 @@
 {
 	unsigned int actual;
 
-	actual = min(new_timeout, wdog->max_hw_heartbeat_ms * 1000);
+	actual = min(new_timeout, IMX2_WDT_MAX_TIME);
 	__imx2_wdt_set_timeout(wdog, actual);
 	wdog->timeout = new_timeout;
 	return 0;
diff --git a/drivers/watchdog/imx7ulp_wdt.c b/drivers/watchdog/imx7ulp_wdt.c
new file mode 100644
index 0000000..5ce5102
--- /dev/null
+++ b/drivers/watchdog/imx7ulp_wdt.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 NXP.
+ */
+
+#include <linux/clk.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <linux/watchdog.h>
+
+#define WDOG_CS			0x0
+#define WDOG_CS_CMD32EN		BIT(13)
+#define WDOG_CS_ULK		BIT(11)
+#define WDOG_CS_RCS		BIT(10)
+#define WDOG_CS_EN		BIT(7)
+#define WDOG_CS_UPDATE		BIT(5)
+
+#define WDOG_CNT	0x4
+#define WDOG_TOVAL	0x8
+
+#define REFRESH_SEQ0	0xA602
+#define REFRESH_SEQ1	0xB480
+#define REFRESH		((REFRESH_SEQ1 << 16) | REFRESH_SEQ0)
+
+#define UNLOCK_SEQ0	0xC520
+#define UNLOCK_SEQ1	0xD928
+#define UNLOCK		((UNLOCK_SEQ1 << 16) | UNLOCK_SEQ0)
+
+#define DEFAULT_TIMEOUT	60
+#define MAX_TIMEOUT	128
+#define WDOG_CLOCK_RATE	1000
+
+static bool nowayout = WATCHDOG_NOWAYOUT;
+module_param(nowayout, bool, 0000);
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
+		 __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+struct imx7ulp_wdt_device {
+	struct notifier_block restart_handler;
+	struct watchdog_device wdd;
+	void __iomem *base;
+	struct clk *clk;
+};
+
+static inline void imx7ulp_wdt_enable(void __iomem *base, bool enable)
+{
+	u32 val = readl(base + WDOG_CS);
+
+	writel(UNLOCK, base + WDOG_CNT);
+	if (enable)
+		writel(val | WDOG_CS_EN, base + WDOG_CS);
+	else
+		writel(val & ~WDOG_CS_EN, base + WDOG_CS);
+}
+
+static inline bool imx7ulp_wdt_is_enabled(void __iomem *base)
+{
+	u32 val = readl(base + WDOG_CS);
+
+	return val & WDOG_CS_EN;
+}
+
+static int imx7ulp_wdt_ping(struct watchdog_device *wdog)
+{
+	struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog);
+
+	writel(REFRESH, wdt->base + WDOG_CNT);
+
+	return 0;
+}
+
+static int imx7ulp_wdt_start(struct watchdog_device *wdog)
+{
+	struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog);
+
+	imx7ulp_wdt_enable(wdt->base, true);
+
+	return 0;
+}
+
+static int imx7ulp_wdt_stop(struct watchdog_device *wdog)
+{
+	struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog);
+
+	imx7ulp_wdt_enable(wdt->base, false);
+
+	return 0;
+}
+
+static int imx7ulp_wdt_set_timeout(struct watchdog_device *wdog,
+				   unsigned int timeout)
+{
+	struct imx7ulp_wdt_device *wdt = watchdog_get_drvdata(wdog);
+	u32 val = WDOG_CLOCK_RATE * timeout;
+
+	writel(UNLOCK, wdt->base + WDOG_CNT);
+	writel(val, wdt->base + WDOG_TOVAL);
+
+	wdog->timeout = timeout;
+
+	return 0;
+}
+
+static const struct watchdog_ops imx7ulp_wdt_ops = {
+	.owner = THIS_MODULE,
+	.start = imx7ulp_wdt_start,
+	.stop  = imx7ulp_wdt_stop,
+	.ping  = imx7ulp_wdt_ping,
+	.set_timeout = imx7ulp_wdt_set_timeout,
+};
+
+static const struct watchdog_info imx7ulp_wdt_info = {
+	.identity = "i.MX7ULP watchdog timer",
+	.options  = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING |
+		    WDIOF_MAGICCLOSE,
+};
+
+static inline void imx7ulp_wdt_init(void __iomem *base, unsigned int timeout)
+{
+	u32 val;
+
+	/* unlock the wdog for reconfiguration */
+	writel_relaxed(UNLOCK_SEQ0, base + WDOG_CNT);
+	writel_relaxed(UNLOCK_SEQ1, base + WDOG_CNT);
+
+	/* set an initial timeout value in TOVAL */
+	writel(timeout, base + WDOG_TOVAL);
+	/*
+	 * Enable the 32-bit command sequence (CMD32EN), select the 1 kHz
+	 * LPO clock source (CS[CLK] = 01) and allow later reconfiguration
+	 * (UPDATE).
+	 */
+	val = BIT(13) | BIT(8) | BIT(5);
+	writel(val, base + WDOG_CS);
+}
+
+static void imx7ulp_wdt_action(void *data)
+{
+	clk_disable_unprepare(data);
+}
+
+static int imx7ulp_wdt_probe(struct platform_device *pdev)
+{
+	struct imx7ulp_wdt_device *imx7ulp_wdt;
+	struct device *dev = &pdev->dev;
+	struct watchdog_device *wdog;
+	int ret;
+
+	imx7ulp_wdt = devm_kzalloc(dev, sizeof(*imx7ulp_wdt), GFP_KERNEL);
+	if (!imx7ulp_wdt)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, imx7ulp_wdt);
+
+	imx7ulp_wdt->base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(imx7ulp_wdt->base))
+		return PTR_ERR(imx7ulp_wdt->base);
+
+	imx7ulp_wdt->clk = devm_clk_get(dev, NULL);
+	if (IS_ERR(imx7ulp_wdt->clk)) {
+		dev_err(dev, "Failed to get watchdog clock\n");
+		return PTR_ERR(imx7ulp_wdt->clk);
+	}
+
+	ret = clk_prepare_enable(imx7ulp_wdt->clk);
+	if (ret)
+		return ret;
+
+	ret = devm_add_action_or_reset(dev, imx7ulp_wdt_action, imx7ulp_wdt->clk);
+	if (ret)
+		return ret;
+
+	wdog = &imx7ulp_wdt->wdd;
+	wdog->info = &imx7ulp_wdt_info;
+	wdog->ops = &imx7ulp_wdt_ops;
+	wdog->min_timeout = 1;
+	wdog->max_timeout = MAX_TIMEOUT;
+	wdog->parent = dev;
+	wdog->timeout = DEFAULT_TIMEOUT;
+
+	watchdog_init_timeout(wdog, 0, dev);
+	watchdog_stop_on_reboot(wdog);
+	watchdog_stop_on_unregister(wdog);
+	watchdog_set_drvdata(wdog, imx7ulp_wdt);
+	imx7ulp_wdt_init(imx7ulp_wdt->base, wdog->timeout * WDOG_CLOCK_RATE);
+
+	return devm_watchdog_register_device(dev, wdog);
+}
+
+static int __maybe_unused imx7ulp_wdt_suspend(struct device *dev)
+{
+	struct imx7ulp_wdt_device *imx7ulp_wdt = dev_get_drvdata(dev);
+
+	if (watchdog_active(&imx7ulp_wdt->wdd))
+		imx7ulp_wdt_stop(&imx7ulp_wdt->wdd);
+
+	clk_disable_unprepare(imx7ulp_wdt->clk);
+
+	return 0;
+}
+
+static int __maybe_unused imx7ulp_wdt_resume(struct device *dev)
+{
+	struct imx7ulp_wdt_device *imx7ulp_wdt = dev_get_drvdata(dev);
+	u32 timeout = imx7ulp_wdt->wdd.timeout * WDOG_CLOCK_RATE;
+	int ret;
+
+	ret = clk_prepare_enable(imx7ulp_wdt->clk);
+	if (ret)
+		return ret;
+
+	if (imx7ulp_wdt_is_enabled(imx7ulp_wdt->base))
+		imx7ulp_wdt_init(imx7ulp_wdt->base, timeout);
+
+	if (watchdog_active(&imx7ulp_wdt->wdd))
+		imx7ulp_wdt_start(&imx7ulp_wdt->wdd);
+
+	return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(imx7ulp_wdt_pm_ops, imx7ulp_wdt_suspend,
+			 imx7ulp_wdt_resume);
+
+static const struct of_device_id imx7ulp_wdt_dt_ids[] = {
+	{ .compatible = "fsl,imx7ulp-wdt", },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, imx7ulp_wdt_dt_ids);
+
+static struct platform_driver imx7ulp_wdt_driver = {
+	.probe		= imx7ulp_wdt_probe,
+	.driver		= {
+		.name	= "imx7ulp-wdt",
+		.pm	= &imx7ulp_wdt_pm_ops,
+		.of_match_table = imx7ulp_wdt_dt_ids,
+	},
+};
+module_platform_driver(imx7ulp_wdt_driver);
+
+MODULE_AUTHOR("Anson Huang <Anson.Huang@nxp.com>");
+MODULE_DESCRIPTION("Freescale i.MX7ULP watchdog driver");
+MODULE_LICENSE("GPL v2");
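
Two details of the new i.MX7ULP driver are worth spelling out. The unlock and refresh handshakes are 16-bit key pairs; with CS[CMD32EN] set, the controller accepts both halves as a single 32-bit write to WDOG_CNT, which is why UNLOCK and REFRESH are composed from their SEQ1/SEQ0 halves. And the timeout register counts ticks of the watchdog clock, which this driver assumes to be a 1 kHz source (WDOG_CLOCK_RATE), so TOVAL is simply seconds times 1000. A sketch under those assumptions:

	#define UNLOCK	((0xD928u << 16) | 0xC520)	/* one write when CMD32EN=1 */

	u32 toval = wdog->timeout * 1000;	/* 1 kHz ticks, per the driver */

	writel(UNLOCK, base + WDOG_CNT);	/* open the reconfig window */
	writel(toval, base + WDOG_TOVAL);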
diff --git a/drivers/watchdog/imx_sc_wdt.c b/drivers/watchdog/imx_sc_wdt.c
index 78eaaf7..7ea5cf5 100644
--- a/drivers/watchdog/imx_sc_wdt.c
+++ b/drivers/watchdog/imx_sc_wdt.c
@@ -175,12 +175,9 @@
 	watchdog_stop_on_unregister(wdog);
 
 	ret = devm_watchdog_register_device(dev, wdog);
- 
- 	if (ret) {
- 		dev_err(dev, "Failed to register watchdog device\n");
- 		return ret;
- 	}
- 
+	if (ret)
+		return ret;
+
 	ret = imx_scu_irq_group_enable(SC_IRQ_GROUP_WDOG,
 				       SC_IRQ_WDOG,
 				       true);
diff --git a/drivers/watchdog/jz4740_wdt.c b/drivers/watchdog/jz4740_wdt.c
index d4a9091..c6052ae 100644
--- a/drivers/watchdog/jz4740_wdt.c
+++ b/drivers/watchdog/jz4740_wdt.c
@@ -162,7 +162,6 @@
 	struct device *dev = &pdev->dev;
 	struct jz4740_wdt_drvdata *drvdata;
 	struct watchdog_device *jz4740_wdt;
-	int ret;
 
 	drvdata = devm_kzalloc(dev, sizeof(struct jz4740_wdt_drvdata),
 			       GFP_KERNEL);
diff --git a/drivers/watchdog/ks8695_wdt.c b/drivers/watchdog/ks8695_wdt.c
deleted file mode 100644
index 1550ce3..0000000
--- a/drivers/watchdog/ks8695_wdt.c
+++ /dev/null
@@ -1,319 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Watchdog driver for Kendin/Micrel KS8695.
- *
- * (C) 2007 Andrew Victor
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/bitops.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/miscdevice.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/platform_device.h>
-#include <linux/types.h>
-#include <linux/watchdog.h>
-#include <linux/io.h>
-#include <linux/uaccess.h>
-#include <mach/hardware.h>
-
-#define KS8695_TMR_OFFSET	(0xF0000 + 0xE400)
-#define KS8695_TMR_VA		(KS8695_IO_VA + KS8695_TMR_OFFSET)
-
-/*
- * Timer registers
- */
-#define KS8695_TMCON		(0x00)		/* Timer Control Register */
-#define KS8695_T0TC		(0x08)		/* Timer 0 Timeout Count Register */
-#define TMCON_T0EN		(1 << 0)	/* Timer 0 Enable */
-
-/* Timer0 Timeout Counter Register */
-#define T0TC_WATCHDOG		(0xff)		/* Enable watchdog mode */
-
-#define WDT_DEFAULT_TIME	5	/* seconds */
-#define WDT_MAX_TIME		171	/* seconds */
-
-static int wdt_time = WDT_DEFAULT_TIME;
-static bool nowayout = WATCHDOG_NOWAYOUT;
-
-module_param(wdt_time, int, 0);
-MODULE_PARM_DESC(wdt_time, "Watchdog time in seconds. (default="
-					__MODULE_STRING(WDT_DEFAULT_TIME) ")");
-
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-module_param(nowayout, bool, 0);
-MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
-				__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
-#endif
-
-
-static unsigned long ks8695wdt_busy;
-static DEFINE_SPINLOCK(ks8695_lock);
-
-/* ......................................................................... */
-
-/*
- * Disable the watchdog.
- */
-static inline void ks8695_wdt_stop(void)
-{
-	unsigned long tmcon;
-
-	spin_lock(&ks8695_lock);
-	/* disable timer0 */
-	tmcon = __raw_readl(KS8695_TMR_VA + KS8695_TMCON);
-	__raw_writel(tmcon & ~TMCON_T0EN, KS8695_TMR_VA + KS8695_TMCON);
-	spin_unlock(&ks8695_lock);
-}
-
-/*
- * Enable and reset the watchdog.
- */
-static inline void ks8695_wdt_start(void)
-{
-	unsigned long tmcon;
-	unsigned long tval = wdt_time * KS8695_CLOCK_RATE;
-
-	spin_lock(&ks8695_lock);
-	/* disable timer0 */
-	tmcon = __raw_readl(KS8695_TMR_VA + KS8695_TMCON);
-	__raw_writel(tmcon & ~TMCON_T0EN, KS8695_TMR_VA + KS8695_TMCON);
-
-	/* program timer0 */
-	__raw_writel(tval | T0TC_WATCHDOG, KS8695_TMR_VA + KS8695_T0TC);
-
-	/* re-enable timer0 */
-	tmcon = __raw_readl(KS8695_TMR_VA + KS8695_TMCON);
-	__raw_writel(tmcon | TMCON_T0EN, KS8695_TMR_VA + KS8695_TMCON);
-	spin_unlock(&ks8695_lock);
-}
-
-/*
- * Reload the watchdog timer.  (ie, pat the watchdog)
- */
-static inline void ks8695_wdt_reload(void)
-{
-	unsigned long tmcon;
-
-	spin_lock(&ks8695_lock);
-	/* disable, then re-enable timer0 */
-	tmcon = __raw_readl(KS8695_TMR_VA + KS8695_TMCON);
-	__raw_writel(tmcon & ~TMCON_T0EN, KS8695_TMR_VA + KS8695_TMCON);
-	__raw_writel(tmcon | TMCON_T0EN, KS8695_TMR_VA + KS8695_TMCON);
-	spin_unlock(&ks8695_lock);
-}
-
-/*
- * Change the watchdog time interval.
- */
-static int ks8695_wdt_settimeout(int new_time)
-{
-	/*
-	 * All counting occurs at KS8695_CLOCK_RATE / 128 = 0.256 Hz
-	 *
-	 * Since WDV is a 16-bit counter, the maximum period is
-	 * 65536 / 0.256 = 256 seconds.
-	 */
-	if ((new_time <= 0) || (new_time > WDT_MAX_TIME))
-		return -EINVAL;
-
-	/* Set new watchdog time. It will be used when
-	   ks8695_wdt_start() is called. */
-	wdt_time = new_time;
-	return 0;
-}
-
-/* ......................................................................... */
-
-/*
- * Watchdog device is opened, and watchdog starts running.
- */
-static int ks8695_wdt_open(struct inode *inode, struct file *file)
-{
-	if (test_and_set_bit(0, &ks8695wdt_busy))
-		return -EBUSY;
-
-	ks8695_wdt_start();
-	return stream_open(inode, file);
-}
-
-/*
- * Close the watchdog device.
- * If CONFIG_WATCHDOG_NOWAYOUT is NOT defined then the watchdog is also
- *  disabled.
- */
-static int ks8695_wdt_close(struct inode *inode, struct file *file)
-{
-	/* Disable the watchdog when file is closed */
-	if (!nowayout)
-		ks8695_wdt_stop();
-	clear_bit(0, &ks8695wdt_busy);
-	return 0;
-}
-
-static const struct watchdog_info ks8695_wdt_info = {
-	.identity	= "ks8695 watchdog",
-	.options	= WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING,
-};
-
-/*
- * Handle commands from user-space.
- */
-static long ks8695_wdt_ioctl(struct file *file, unsigned int cmd,
-							unsigned long arg)
-{
-	void __user *argp = (void __user *)arg;
-	int __user *p = argp;
-	int new_value;
-
-	switch (cmd) {
-	case WDIOC_GETSUPPORT:
-		return copy_to_user(argp, &ks8695_wdt_info,
-					sizeof(ks8695_wdt_info)) ? -EFAULT : 0;
-	case WDIOC_GETSTATUS:
-	case WDIOC_GETBOOTSTATUS:
-		return put_user(0, p);
-	case WDIOC_SETOPTIONS:
-		if (get_user(new_value, p))
-			return -EFAULT;
-		if (new_value & WDIOS_DISABLECARD)
-			ks8695_wdt_stop();
-		if (new_value & WDIOS_ENABLECARD)
-			ks8695_wdt_start();
-		return 0;
-	case WDIOC_KEEPALIVE:
-		ks8695_wdt_reload();	/* pat the watchdog */
-		return 0;
-	case WDIOC_SETTIMEOUT:
-		if (get_user(new_value, p))
-			return -EFAULT;
-		if (ks8695_wdt_settimeout(new_value))
-			return -EINVAL;
-		/* Enable new time value */
-		ks8695_wdt_start();
-		/* Return current value */
-		return put_user(wdt_time, p);
-	case WDIOC_GETTIMEOUT:
-		return put_user(wdt_time, p);
-	default:
-		return -ENOTTY;
-	}
-}
-
-/*
- * Pat the watchdog whenever device is written to.
- */
-static ssize_t ks8695_wdt_write(struct file *file, const char *data,
-						size_t len, loff_t *ppos)
-{
-	ks8695_wdt_reload();		/* pat the watchdog */
-	return len;
-}
-
-/* ......................................................................... */
-
-static const struct file_operations ks8695wdt_fops = {
-	.owner		= THIS_MODULE,
-	.llseek		= no_llseek,
-	.unlocked_ioctl	= ks8695_wdt_ioctl,
-	.open		= ks8695_wdt_open,
-	.release	= ks8695_wdt_close,
-	.write		= ks8695_wdt_write,
-};
-
-static struct miscdevice ks8695wdt_miscdev = {
-	.minor		= WATCHDOG_MINOR,
-	.name		= "watchdog",
-	.fops		= &ks8695wdt_fops,
-};
-
-static int ks8695wdt_probe(struct platform_device *pdev)
-{
-	int res;
-
-	if (ks8695wdt_miscdev.parent)
-		return -EBUSY;
-	ks8695wdt_miscdev.parent = &pdev->dev;
-
-	res = misc_register(&ks8695wdt_miscdev);
-	if (res)
-		return res;
-
-	pr_info("KS8695 Watchdog Timer enabled (%d seconds%s)\n",
-		wdt_time, nowayout ? ", nowayout" : "");
-	return 0;
-}
-
-static int ks8695wdt_remove(struct platform_device *pdev)
-{
-	misc_deregister(&ks8695wdt_miscdev);
-	ks8695wdt_miscdev.parent = NULL;
-
-	return 0;
-}
-
-static void ks8695wdt_shutdown(struct platform_device *pdev)
-{
-	ks8695_wdt_stop();
-}
-
-#ifdef CONFIG_PM
-
-static int ks8695wdt_suspend(struct platform_device *pdev, pm_message_t message)
-{
-	ks8695_wdt_stop();
-	return 0;
-}
-
-static int ks8695wdt_resume(struct platform_device *pdev)
-{
-	if (ks8695wdt_busy)
-		ks8695_wdt_start();
-	return 0;
-}
-
-#else
-#define ks8695wdt_suspend NULL
-#define ks8695wdt_resume	NULL
-#endif
-
-static struct platform_driver ks8695wdt_driver = {
-	.probe		= ks8695wdt_probe,
-	.remove		= ks8695wdt_remove,
-	.shutdown	= ks8695wdt_shutdown,
-	.suspend	= ks8695wdt_suspend,
-	.resume		= ks8695wdt_resume,
-	.driver		= {
-		.name	= "ks8695_wdt",
-	},
-};
-
-static int __init ks8695_wdt_init(void)
-{
-	/* Check that the heartbeat value is within range;
-	   if not reset to the default */
-	if (ks8695_wdt_settimeout(wdt_time)) {
-		ks8695_wdt_settimeout(WDT_DEFAULT_TIME);
-		pr_info("ks8695_wdt: wdt_time value must be 1 <= wdt_time <= %i"
-					", using %d\n", wdt_time, WDT_MAX_TIME);
-	}
-	return platform_driver_register(&ks8695wdt_driver);
-}
-
-static void __exit ks8695_wdt_exit(void)
-{
-	platform_driver_unregister(&ks8695wdt_driver);
-}
-
-module_init(ks8695_wdt_init);
-module_exit(ks8695_wdt_exit);
-
-MODULE_AUTHOR("Andrew Victor");
-MODULE_DESCRIPTION("Watchdog driver for KS8695");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:ks8695_wdt");
diff --git a/drivers/watchdog/nuc900_wdt.c b/drivers/watchdog/nuc900_wdt.c
deleted file mode 100644
index db124ce..0000000
--- a/drivers/watchdog/nuc900_wdt.c
+++ /dev/null
@@ -1,302 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (c) 2009 Nuvoton technology corporation.
- *
- * Wan ZongShun <mcuos.com@gmail.com>
- */
-
-#include <linux/bitops.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/io.h>
-#include <linux/clk.h>
-#include <linux/kernel.h>
-#include <linux/miscdevice.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/interrupt.h>
-#include <linux/types.h>
-#include <linux/watchdog.h>
-#include <linux/uaccess.h>
-
-#define REG_WTCR		0x1c
-#define WTCLK			(0x01 << 10)
-#define WTE			(0x01 << 7)	/*wdt enable*/
-#define WTIS			(0x03 << 4)
-#define WTIF			(0x01 << 3)
-#define WTRF			(0x01 << 2)
-#define WTRE			(0x01 << 1)
-#define WTR			(0x01 << 0)
-/*
- * The watchdog time interval can be calculated via following formula:
- * WTIS		real time interval (formula)
- * 0x00		((2^ 14 ) * ((external crystal freq) / 256))seconds
- * 0x01		((2^ 16 ) * ((external crystal freq) / 256))seconds
- * 0x02		((2^ 18 ) * ((external crystal freq) / 256))seconds
- * 0x03		((2^ 20 ) * ((external crystal freq) / 256))seconds
- *
- * The external crystal freq is 15Mhz in the nuc900 evaluation board.
- * So 0x00 = +-0.28 seconds, 0x01 = +-1.12 seconds, 0x02 = +-4.48 seconds,
- * 0x03 = +- 16.92 seconds..
- */
-#define WDT_HW_TIMEOUT		0x02
-#define WDT_TIMEOUT		(HZ/2)
-#define WDT_HEARTBEAT		15
-
-static int heartbeat = WDT_HEARTBEAT;
-module_param(heartbeat, int, 0);
-MODULE_PARM_DESC(heartbeat, "Watchdog heartbeats in seconds. "
-	"(default = " __MODULE_STRING(WDT_HEARTBEAT) ")");
-
-static bool nowayout = WATCHDOG_NOWAYOUT;
-module_param(nowayout, bool, 0);
-MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started "
-	"(default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
-
-struct nuc900_wdt {
-	struct clk	 *wdt_clock;
-	struct platform_device *pdev;
-	void __iomem	 *wdt_base;
-	char		 expect_close;
-	struct timer_list timer;
-	spinlock_t       wdt_lock;
-	unsigned long next_heartbeat;
-};
-
-static unsigned long nuc900wdt_busy;
-static struct nuc900_wdt *nuc900_wdt;
-
-static inline void nuc900_wdt_keepalive(void)
-{
-	unsigned int val;
-
-	spin_lock(&nuc900_wdt->wdt_lock);
-
-	val = __raw_readl(nuc900_wdt->wdt_base + REG_WTCR);
-	val |= (WTR | WTIF);
-	__raw_writel(val, nuc900_wdt->wdt_base + REG_WTCR);
-
-	spin_unlock(&nuc900_wdt->wdt_lock);
-}
-
-static inline void nuc900_wdt_start(void)
-{
-	unsigned int val;
-
-	spin_lock(&nuc900_wdt->wdt_lock);
-
-	val = __raw_readl(nuc900_wdt->wdt_base + REG_WTCR);
-	val |= (WTRE | WTE | WTR | WTCLK | WTIF);
-	val &= ~WTIS;
-	val |= (WDT_HW_TIMEOUT << 0x04);
-	__raw_writel(val, nuc900_wdt->wdt_base + REG_WTCR);
-
-	spin_unlock(&nuc900_wdt->wdt_lock);
-
-	nuc900_wdt->next_heartbeat = jiffies + heartbeat * HZ;
-	mod_timer(&nuc900_wdt->timer, jiffies + WDT_TIMEOUT);
-}
-
-static inline void nuc900_wdt_stop(void)
-{
-	unsigned int val;
-
-	del_timer(&nuc900_wdt->timer);
-
-	spin_lock(&nuc900_wdt->wdt_lock);
-
-	val = __raw_readl(nuc900_wdt->wdt_base + REG_WTCR);
-	val &= ~WTE;
-	__raw_writel(val, nuc900_wdt->wdt_base + REG_WTCR);
-
-	spin_unlock(&nuc900_wdt->wdt_lock);
-}
-
-static inline void nuc900_wdt_ping(void)
-{
-	nuc900_wdt->next_heartbeat = jiffies + heartbeat * HZ;
-}
-
-static int nuc900_wdt_open(struct inode *inode, struct file *file)
-{
-
-	if (test_and_set_bit(0, &nuc900wdt_busy))
-		return -EBUSY;
-
-	nuc900_wdt_start();
-
-	return stream_open(inode, file);
-}
-
-static int nuc900_wdt_close(struct inode *inode, struct file *file)
-{
-	if (nuc900_wdt->expect_close == 42)
-		nuc900_wdt_stop();
-	else {
-		dev_crit(&nuc900_wdt->pdev->dev,
-			"Unexpected close, not stopping watchdog!\n");
-		nuc900_wdt_ping();
-	}
-
-	nuc900_wdt->expect_close = 0;
-	clear_bit(0, &nuc900wdt_busy);
-	return 0;
-}
-
-static const struct watchdog_info nuc900_wdt_info = {
-	.identity	= "nuc900 watchdog",
-	.options	= WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING |
-						WDIOF_MAGICCLOSE,
-};
-
-static long nuc900_wdt_ioctl(struct file *file,
-					unsigned int cmd, unsigned long arg)
-{
-	void __user *argp = (void __user *)arg;
-	int __user *p = argp;
-	int new_value;
-
-	switch (cmd) {
-	case WDIOC_GETSUPPORT:
-		return copy_to_user(argp, &nuc900_wdt_info,
-				sizeof(nuc900_wdt_info)) ? -EFAULT : 0;
-	case WDIOC_GETSTATUS:
-	case WDIOC_GETBOOTSTATUS:
-		return put_user(0, p);
-
-	case WDIOC_KEEPALIVE:
-		nuc900_wdt_ping();
-		return 0;
-
-	case WDIOC_SETTIMEOUT:
-		if (get_user(new_value, p))
-			return -EFAULT;
-
-		heartbeat = new_value;
-		nuc900_wdt_ping();
-
-		return put_user(new_value, p);
-	case WDIOC_GETTIMEOUT:
-		return put_user(heartbeat, p);
-	default:
-		return -ENOTTY;
-	}
-}
-
-static ssize_t nuc900_wdt_write(struct file *file, const char __user *data,
-						size_t len, loff_t *ppos)
-{
-	if (!len)
-		return 0;
-
-	/* Scan for magic character */
-	if (!nowayout) {
-		size_t i;
-
-		nuc900_wdt->expect_close = 0;
-
-		for (i = 0; i < len; i++) {
-			char c;
-			if (get_user(c, data + i))
-				return -EFAULT;
-			if (c == 'V') {
-				nuc900_wdt->expect_close = 42;
-				break;
-			}
-		}
-	}
-
-	nuc900_wdt_ping();
-	return len;
-}
-
-static void nuc900_wdt_timer_ping(struct timer_list *unused)
-{
-	if (time_before(jiffies, nuc900_wdt->next_heartbeat)) {
-		nuc900_wdt_keepalive();
-		mod_timer(&nuc900_wdt->timer, jiffies + WDT_TIMEOUT);
-	} else
-		dev_warn(&nuc900_wdt->pdev->dev, "Will reset the machine !\n");
-}
-
-static const struct file_operations nuc900wdt_fops = {
-	.owner		= THIS_MODULE,
-	.llseek		= no_llseek,
-	.unlocked_ioctl	= nuc900_wdt_ioctl,
-	.open		= nuc900_wdt_open,
-	.release	= nuc900_wdt_close,
-	.write		= nuc900_wdt_write,
-};
-
-static struct miscdevice nuc900wdt_miscdev = {
-	.minor		= WATCHDOG_MINOR,
-	.name		= "watchdog",
-	.fops		= &nuc900wdt_fops,
-};
-
-static int nuc900wdt_probe(struct platform_device *pdev)
-{
-	int ret = 0;
-
-	nuc900_wdt = devm_kzalloc(&pdev->dev, sizeof(*nuc900_wdt),
-				GFP_KERNEL);
-	if (!nuc900_wdt)
-		return -ENOMEM;
-
-	nuc900_wdt->pdev = pdev;
-
-	spin_lock_init(&nuc900_wdt->wdt_lock);
-
-	nuc900_wdt->wdt_base = devm_platform_ioremap_resource(pdev, 0);
-	if (IS_ERR(nuc900_wdt->wdt_base))
-		return PTR_ERR(nuc900_wdt->wdt_base);
-
-	nuc900_wdt->wdt_clock = devm_clk_get(&pdev->dev, NULL);
-	if (IS_ERR(nuc900_wdt->wdt_clock)) {
-		dev_err(&pdev->dev, "failed to find watchdog clock source\n");
-		return PTR_ERR(nuc900_wdt->wdt_clock);
-	}
-
-	clk_enable(nuc900_wdt->wdt_clock);
-
-	timer_setup(&nuc900_wdt->timer, nuc900_wdt_timer_ping, 0);
-
-	ret = misc_register(&nuc900wdt_miscdev);
-	if (ret) {
-		dev_err(&pdev->dev, "err register miscdev on minor=%d (%d)\n",
-			WATCHDOG_MINOR, ret);
-		goto err_clk;
-	}
-
-	return 0;
-
-err_clk:
-	clk_disable(nuc900_wdt->wdt_clock);
-	return ret;
-}
-
-static int nuc900wdt_remove(struct platform_device *pdev)
-{
-	misc_deregister(&nuc900wdt_miscdev);
-
-	clk_disable(nuc900_wdt->wdt_clock);
-
-	return 0;
-}
-
-static struct platform_driver nuc900wdt_driver = {
-	.probe		= nuc900wdt_probe,
-	.remove		= nuc900wdt_remove,
-	.driver		= {
-		.name	= "nuc900-wdt",
-	},
-};
-
-module_platform_driver(nuc900wdt_driver);
-
-MODULE_AUTHOR("Wan ZongShun <mcuos.com@gmail.com>");
-MODULE_DESCRIPTION("Watchdog driver for NUC900");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:nuc900-wdt");
diff --git a/drivers/watchdog/orion_wdt.c b/drivers/watchdog/orion_wdt.c
index cdb0d17..1cccf8e 100644
--- a/drivers/watchdog/orion_wdt.c
+++ b/drivers/watchdog/orion_wdt.c
@@ -35,7 +35,15 @@
  * Watchdog timer block registers.
  */
 #define TIMER_CTRL		0x0000
-#define TIMER_A370_STATUS	0x04
+#define TIMER1_FIXED_ENABLE_BIT	BIT(12)
+#define WDT_AXP_FIXED_ENABLE_BIT BIT(10)
+#define TIMER1_ENABLE_BIT	BIT(2)
+
+#define TIMER_A370_STATUS	0x0004
+#define WDT_A370_EXPIRED	BIT(31)
+#define TIMER1_STATUS_BIT	BIT(8)
+
+#define TIMER1_VAL_OFF		0x001c
 
 #define WDT_MAX_CYCLE_COUNT	0xffffffff
 
@@ -43,9 +51,6 @@
 #define WDT_A370_RATIO_SHIFT	5
 #define WDT_A370_RATIO		(1 << WDT_A370_RATIO_SHIFT)
 
-#define WDT_AXP_FIXED_ENABLE_BIT BIT(10)
-#define WDT_A370_EXPIRED	BIT(31)
-
 static bool nowayout = WATCHDOG_NOWAYOUT;
 static int heartbeat = -1;		/* module parameter (seconds) */
 
@@ -158,6 +163,7 @@
 				   struct orion_watchdog *dev)
 {
 	int ret;
+	u32 val;
 
 	dev->clk = of_clk_get_by_name(pdev->dev.of_node, "fixed");
 	if (IS_ERR(dev->clk))
@@ -168,10 +174,9 @@
 		return ret;
 	}
 
-	/* Enable the fixed watchdog clock input */
-	atomic_io_modify(dev->reg + TIMER_CTRL,
-			 WDT_AXP_FIXED_ENABLE_BIT,
-			 WDT_AXP_FIXED_ENABLE_BIT);
+	/* Fix the wdt and timer1 clock frequency to 25 MHz */
+	val = WDT_AXP_FIXED_ENABLE_BIT | TIMER1_FIXED_ENABLE_BIT;
+	atomic_io_modify(dev->reg + TIMER_CTRL, val, val);
 
 	dev->clk_rate = clk_get_rate(dev->clk);
 	return 0;
@@ -183,6 +188,10 @@
 	/* Reload watchdog duration */
 	writel(dev->clk_rate * wdt_dev->timeout,
 	       dev->reg + dev->data->wdt_counter_offset);
+	if (dev->wdt.info->options & WDIOF_PRETIMEOUT)
+		writel(dev->clk_rate * (wdt_dev->timeout - wdt_dev->pretimeout),
+		       dev->reg + TIMER1_VAL_OFF);
+
 	return 0;
 }
 
@@ -194,13 +203,18 @@
 	/* Set watchdog duration */
 	writel(dev->clk_rate * wdt_dev->timeout,
 	       dev->reg + dev->data->wdt_counter_offset);
+	if (dev->wdt.info->options & WDIOF_PRETIMEOUT)
+		writel(dev->clk_rate * (wdt_dev->timeout - wdt_dev->pretimeout),
+		       dev->reg + TIMER1_VAL_OFF);
 
 	/* Clear the watchdog expiration bit */
 	atomic_io_modify(dev->reg + TIMER_A370_STATUS, WDT_A370_EXPIRED, 0);
 
 	/* Enable watchdog timer */
-	atomic_io_modify(dev->reg + TIMER_CTRL, dev->data->wdt_enable_bit,
-						dev->data->wdt_enable_bit);
+	reg = dev->data->wdt_enable_bit;
+	if (dev->wdt.info->options & WDIOF_PRETIMEOUT)
+		reg |= TIMER1_ENABLE_BIT;
+	atomic_io_modify(dev->reg + TIMER_CTRL, reg, reg);
 
 	/* Enable reset on watchdog */
 	reg = readl(dev->rstout);
@@ -277,7 +291,7 @@
 static int armada375_stop(struct watchdog_device *wdt_dev)
 {
 	struct orion_watchdog *dev = watchdog_get_drvdata(wdt_dev);
-	u32 reg;
+	u32 reg, mask;
 
 	/* Disable reset on watchdog */
 	atomic_io_modify(dev->rstout_mask, dev->data->rstout_mask_bit,
@@ -287,7 +301,10 @@
 	writel(reg, dev->rstout);
 
 	/* Disable watchdog timer */
-	atomic_io_modify(dev->reg + TIMER_CTRL, dev->data->wdt_enable_bit, 0);
+	mask = dev->data->wdt_enable_bit;
+	if (wdt_dev->info->options & WDIOF_PRETIMEOUT)
+		mask |= TIMER1_ENABLE_BIT;
+	atomic_io_modify(dev->reg + TIMER_CTRL, mask, 0);
 
 	return 0;
 }
@@ -349,7 +366,7 @@
 	return readl(dev->reg + dev->data->wdt_counter_offset) / dev->clk_rate;
 }
 
-static const struct watchdog_info orion_wdt_info = {
+static struct watchdog_info orion_wdt_info = {
 	.options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | WDIOF_MAGICCLOSE,
 	.identity = "Orion Watchdog",
 };
@@ -368,6 +385,16 @@
 	return IRQ_HANDLED;
 }
 
+static irqreturn_t orion_wdt_pre_irq(int irq, void *devid)
+{
+	struct orion_watchdog *dev = devid;
+
+	atomic_io_modify(dev->reg + TIMER_A370_STATUS,
+			 TIMER1_STATUS_BIT, 0);
+	watchdog_notify_pretimeout(&dev->wdt);
+	return IRQ_HANDLED;
+}
+
 /*
  * The original devicetree binding for this driver specified only
  * one memory resource, so in order to keep DT backwards compatibility
@@ -589,6 +616,18 @@
 		}
 	}
 
+	/* Optional 2nd interrupt for pretimeout */
+	irq = platform_get_irq(pdev, 1);
+	if (irq > 0) {
+		orion_wdt_info.options |= WDIOF_PRETIMEOUT;
+		ret = devm_request_irq(&pdev->dev, irq, orion_wdt_pre_irq,
+				       0, pdev->name, dev);
+		if (ret < 0) {
+			dev_err(&pdev->dev, "failed to request IRQ\n");
+			goto disable_clk;
+		}
+	}
+
 	watchdog_set_nowayout(&dev->wdt, nowayout);
 	ret = watchdog_register_device(&dev->wdt);
 	if (ret)
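
With the hunks above, pretimeout support piggybacks on the SoC's TIMER1: on every start and ping the watchdog counter is reloaded with the full timeout while TIMER1 is reloaded with (timeout - pretimeout), so the TIMER1 interrupt (orion_wdt_pre_irq) fires and calls watchdog_notify_pretimeout() before the reset would hit. A minimal standalone sketch of that arithmetic, with made-up names in place of the driver's MMIO registers:

#include <stdint.h>

/* Hypothetical stand-ins for the two hardware down-counters. */
static void program_counters(uint32_t *wdt_cnt, uint32_t *timer1_cnt,
			     unsigned long clk_rate,
			     unsigned int timeout, unsigned int pretimeout)
{
	/* The watchdog expires (and resets the SoC) after the full timeout. */
	*wdt_cnt = clk_rate * timeout;
	/* TIMER1 raises its IRQ `pretimeout` seconds earlier. */
	*timer1_cnt = clk_rate * (timeout - pretimeout);
}

Both deadlines derive from the same clk_rate, which is why the AXP path above pins TIMER1 to the fixed 25 MHz input (TIMER1_FIXED_ENABLE_BIT) alongside the watchdog clock.
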
diff --git a/drivers/watchdog/qcom-wdt.c b/drivers/watchdog/qcom-wdt.c
index 7be7f87..a494543 100644
--- a/drivers/watchdog/qcom-wdt.c
+++ b/drivers/watchdog/qcom-wdt.c
@@ -1,8 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2014, The Linux Foundation. All rights reserved.
  */
+#include <linux/bits.h>
 #include <linux/clk.h>
 #include <linux/delay.h>
+#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -19,6 +21,9 @@
 	WDT_BITE_TIME,
 };
 
+#define QCOM_WDT_ENABLE		BIT(0)
+#define QCOM_WDT_ENABLE_IRQ	BIT(1)
+
 static const u32 reg_offset_data_apcs_tmr[] = {
 	[WDT_RST] = 0x38,
 	[WDT_EN] = 0x40,
@@ -37,7 +42,6 @@
 
 struct qcom_wdt {
 	struct watchdog_device	wdd;
-	struct clk		*clk;
 	unsigned long		rate;
 	void __iomem		*base;
 	const u32		*layout;
@@ -54,15 +58,35 @@
 	return container_of(wdd, struct qcom_wdt, wdd);
 }
 
+static inline int qcom_get_enable(struct watchdog_device *wdd)
+{
+	int enable = QCOM_WDT_ENABLE;
+
+	if (wdd->pretimeout)
+		enable |= QCOM_WDT_ENABLE_IRQ;
+
+	return enable;
+}
+
+static irqreturn_t qcom_wdt_isr(int irq, void *arg)
+{
+	struct watchdog_device *wdd = arg;
+
+	watchdog_notify_pretimeout(wdd);
+
+	return IRQ_HANDLED;
+}
+
 static int qcom_wdt_start(struct watchdog_device *wdd)
 {
 	struct qcom_wdt *wdt = to_qcom_wdt(wdd);
+	unsigned int bark = wdd->timeout - wdd->pretimeout;
 
 	writel(0, wdt_addr(wdt, WDT_EN));
 	writel(1, wdt_addr(wdt, WDT_RST));
-	writel(wdd->timeout * wdt->rate, wdt_addr(wdt, WDT_BARK_TIME));
+	writel(bark * wdt->rate, wdt_addr(wdt, WDT_BARK_TIME));
 	writel(wdd->timeout * wdt->rate, wdt_addr(wdt, WDT_BITE_TIME));
-	writel(1, wdt_addr(wdt, WDT_EN));
+	writel(qcom_get_enable(wdd), wdt_addr(wdt, WDT_EN));
 	return 0;
 }
 
@@ -89,6 +113,13 @@
 	return qcom_wdt_start(wdd);
 }
 
+static int qcom_wdt_set_pretimeout(struct watchdog_device *wdd,
+				   unsigned int timeout)
+{
+	wdd->pretimeout = timeout;
+	return qcom_wdt_start(wdd);
+}
+
 static int qcom_wdt_restart(struct watchdog_device *wdd, unsigned long action,
 			    void *data)
 {
@@ -105,7 +136,7 @@
 	writel(1, wdt_addr(wdt, WDT_RST));
 	writel(timeout, wdt_addr(wdt, WDT_BARK_TIME));
 	writel(timeout, wdt_addr(wdt, WDT_BITE_TIME));
-	writel(1, wdt_addr(wdt, WDT_EN));
+	writel(QCOM_WDT_ENABLE, wdt_addr(wdt, WDT_EN));
 
 	/*
 	 * Actually make sure the above sequence hits hardware before sleeping.
@@ -121,6 +152,7 @@
 	.stop		= qcom_wdt_stop,
 	.ping		= qcom_wdt_ping,
 	.set_timeout	= qcom_wdt_set_timeout,
+	.set_pretimeout	= qcom_wdt_set_pretimeout,
 	.restart        = qcom_wdt_restart,
 	.owner		= THIS_MODULE,
 };
@@ -133,6 +165,15 @@
 	.identity	= KBUILD_MODNAME,
 };
 
+static const struct watchdog_info qcom_wdt_pt_info = {
+	.options	= WDIOF_KEEPALIVEPING
+			| WDIOF_MAGICCLOSE
+			| WDIOF_SETTIMEOUT
+			| WDIOF_PRETIMEOUT
+			| WDIOF_CARDRESET,
+	.identity	= KBUILD_MODNAME,
+};
+
 static void qcom_clk_disable_unprepare(void *data)
 {
 	clk_disable_unprepare(data);
@@ -146,7 +187,8 @@
 	struct device_node *np = dev->of_node;
 	const u32 *regs;
 	u32 percpu_offset;
-	int ret;
+	int irq, ret;
+	struct clk *clk;
 
 	regs = of_device_get_match_data(dev);
 	if (!regs) {
@@ -173,19 +215,18 @@
 	if (IS_ERR(wdt->base))
 		return PTR_ERR(wdt->base);
 
-	wdt->clk = devm_clk_get(dev, NULL);
-	if (IS_ERR(wdt->clk)) {
+	clk = devm_clk_get(dev, NULL);
+	if (IS_ERR(clk)) {
 		dev_err(dev, "failed to get input clock\n");
-		return PTR_ERR(wdt->clk);
+		return PTR_ERR(clk);
 	}
 
-	ret = clk_prepare_enable(wdt->clk);
+	ret = clk_prepare_enable(clk);
 	if (ret) {
 		dev_err(dev, "failed to setup clock\n");
 		return ret;
 	}
-	ret = devm_add_action_or_reset(dev, qcom_clk_disable_unprepare,
-				       wdt->clk);
+	ret = devm_add_action_or_reset(dev, qcom_clk_disable_unprepare, clk);
 	if (ret)
 		return ret;
 
@@ -197,14 +238,31 @@
 	 * that it would bite before a second elapses it's usefulness is
 	 * limited.  Bail if this is the case.
 	 */
-	wdt->rate = clk_get_rate(wdt->clk);
+	wdt->rate = clk_get_rate(clk);
 	if (wdt->rate == 0 ||
 	    wdt->rate > 0x10000000U) {
 		dev_err(dev, "invalid clock rate\n");
 		return -EINVAL;
 	}
 
-	wdt->wdd.info = &qcom_wdt_info;
+	/* check if there is pretimeout support */
+	irq = platform_get_irq(pdev, 0);
+	if (irq > 0) {
+		ret = devm_request_irq(dev, irq, qcom_wdt_isr,
+				       IRQF_TRIGGER_RISING,
+				       "wdt_bark", &wdt->wdd);
+		if (ret)
+			return ret;
+
+		wdt->wdd.info = &qcom_wdt_pt_info;
+		wdt->wdd.pretimeout = 1;
+	} else {
+		if (irq == -EPROBE_DEFER)
+			return -EPROBE_DEFER;
+
+		wdt->wdd.info = &qcom_wdt_info;
+	}
+
 	wdt->wdd.ops = &qcom_wdt_ops;
 	wdt->wdd.min_timeout = 1;
 	wdt->wdd.max_timeout = 0x10000000U / wdt->rate;
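
The Qualcomm block distinguishes a "bark" (interrupt) from a "bite" (reset). The change above moves the bark to (timeout - pretimeout) and sets the IRQ-enable bit only when a pretimeout is configured; the bite stays at the full timeout. A standalone sketch of the programming sequence, using a plain struct instead of MMIO writes:

#include <stdint.h>

#define WDT_ENABLE	(1u << 0)
#define WDT_ENABLE_IRQ	(1u << 1)

struct wdt_regs { uint32_t en, rst, bark_time, bite_time; };

static void wdt_program(struct wdt_regs *r, unsigned long rate,
			unsigned int timeout, unsigned int pretimeout)
{
	r->en = 0;					/* stop while reprogramming */
	r->rst = 1;					/* restart the count */
	r->bark_time = (timeout - pretimeout) * rate;	/* IRQ fires first */
	r->bite_time = timeout * rate;			/* then the reset */
	r->en = WDT_ENABLE | (pretimeout ? WDT_ENABLE_IRQ : 0);
}

With pretimeout left at 0 the bark and bite coincide, which matches the old single-deadline behaviour.
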
diff --git a/drivers/watchdog/sprd_wdt.c b/drivers/watchdog/sprd_wdt.c
index edba4e2..0bb17b0 100644
--- a/drivers/watchdog/sprd_wdt.c
+++ b/drivers/watchdog/sprd_wdt.c
@@ -284,10 +284,8 @@
 	}
 
 	wdt->irq = platform_get_irq(pdev, 0);
-	if (wdt->irq < 0) {
-		dev_err(dev, "failed to get IRQ resource\n");
+	if (wdt->irq < 0)
 		return wdt->irq;
-	}
 
 	ret = devm_request_irq(dev, wdt->irq, sprd_wdt_isr, IRQF_NO_SUSPEND,
 			       "sprd-wdt", (void *)wdt);
diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c
index dec660c..4a363a8 100644
--- a/drivers/watchdog/ziirave_wdt.c
+++ b/drivers/watchdog/ziirave_wdt.c
@@ -21,8 +21,11 @@
 #include <linux/version.h>
 #include <linux/watchdog.h>
 
+#include <asm/unaligned.h>
+
 #define ZIIRAVE_TIMEOUT_MIN	3
 #define ZIIRAVE_TIMEOUT_MAX	255
+#define ZIIRAVE_TIMEOUT_DEFAULT	30
 
 #define ZIIRAVE_PING_VALUE	0x0
 
@@ -48,16 +51,12 @@
 
 #define ZIIRAVE_FIRM_PKT_TOTAL_SIZE	20
 #define ZIIRAVE_FIRM_PKT_DATA_SIZE	16
-#define ZIIRAVE_FIRM_FLASH_MEMORY_START	0x1600
-#define ZIIRAVE_FIRM_FLASH_MEMORY_END	0x2bbf
+#define ZIIRAVE_FIRM_FLASH_MEMORY_START	(2 * 0x1600)
+#define ZIIRAVE_FIRM_FLASH_MEMORY_END	(2 * 0x2bbf)
+#define ZIIRAVE_FIRM_PAGE_SIZE		128
 
 /* Received and ready for next Download packet. */
 #define ZIIRAVE_FIRM_DOWNLOAD_ACK	1
-/* Currently writing to flash. Retry Download status in a moment! */
-#define ZIIRAVE_FIRM_DOWNLOAD_BUSY	2
-
-/* Wait for ACK timeout in ms */
-#define ZIIRAVE_FIRM_WAIT_FOR_ACK_TIMEOUT	50
 
 /* Firmware commands */
 #define ZIIRAVE_CMD_DOWNLOAD_START		0x10
@@ -68,6 +67,12 @@
 #define ZIIRAVE_CMD_JUMP_TO_BOOTLOADER		0x0c
 #define ZIIRAVE_CMD_DOWNLOAD_PACKET		0x0e
 
+#define ZIIRAVE_CMD_JUMP_TO_BOOTLOADER_MAGIC	1
+#define ZIIRAVE_CMD_RESET_PROCESSOR_MAGIC	1
+
+#define ZIIRAVE_FW_VERSION_FMT	"02.%02u.%02u"
+#define ZIIRAVE_BL_VERSION_FMT	"01.%02u.%02u"
+
 struct ziirave_wdt_rev {
 	unsigned char major;
 	unsigned char minor;
@@ -165,67 +170,37 @@
 	return ret;
 }
 
-static int ziirave_firm_wait_for_ack(struct watchdog_device *wdd)
+static int ziirave_firm_read_ack(struct watchdog_device *wdd)
 {
 	struct i2c_client *client = to_i2c_client(wdd->parent);
 	int ret;
-	unsigned long timeout;
 
-	timeout = jiffies + msecs_to_jiffies(ZIIRAVE_FIRM_WAIT_FOR_ACK_TIMEOUT);
-	do {
-		if (time_after(jiffies, timeout))
-			return -ETIMEDOUT;
-
-		usleep_range(5000, 10000);
-
-		ret = i2c_smbus_read_byte(client);
-		if (ret < 0) {
-			dev_err(&client->dev, "Failed to read byte\n");
-			return ret;
-		}
-	} while (ret == ZIIRAVE_FIRM_DOWNLOAD_BUSY);
+	ret = i2c_smbus_read_byte(client);
+	if (ret < 0) {
+		dev_err(&client->dev, "Failed to read status byte\n");
+		return ret;
+	}
 
 	return ret == ZIIRAVE_FIRM_DOWNLOAD_ACK ? 0 : -EIO;
 }
 
-static int ziirave_firm_set_read_addr(struct watchdog_device *wdd, u16 addr)
+static int ziirave_firm_set_read_addr(struct watchdog_device *wdd, u32 addr)
 {
 	struct i2c_client *client = to_i2c_client(wdd->parent);
+	const u16 addr16 = (u16)addr / 2;
 	u8 address[2];
 
-	address[0] = addr & 0xff;
-	address[1] = (addr >> 8) & 0xff;
+	put_unaligned_le16(addr16, address);
 
 	return i2c_smbus_write_block_data(client,
 					  ZIIRAVE_CMD_DOWNLOAD_SET_READ_ADDR,
-					  ARRAY_SIZE(address), address);
+					  sizeof(address), address);
 }
 
-static int ziirave_firm_write_block_data(struct watchdog_device *wdd,
-					 u8 command, u8 length, const u8 *data,
-					 bool wait_for_ack)
+static bool ziirave_firm_addr_readonly(u32 addr)
 {
-	struct i2c_client *client = to_i2c_client(wdd->parent);
-	int ret;
-
-	ret = i2c_smbus_write_block_data(client, command, length, data);
-	if (ret) {
-		dev_err(&client->dev,
-			"Failed to send command 0x%02x: %d\n", command, ret);
-		return ret;
-	}
-
-	if (wait_for_ack)
-		ret = ziirave_firm_wait_for_ack(wdd);
-
-	return ret;
-}
-
-static int ziirave_firm_write_byte(struct watchdog_device *wdd, u8 command,
-				   u8 byte, bool wait_for_ack)
-{
-	return ziirave_firm_write_block_data(wdd, command, 1, &byte,
-					     wait_for_ack);
+	return addr < ZIIRAVE_FIRM_FLASH_MEMORY_START ||
+	       addr > ZIIRAVE_FIRM_FLASH_MEMORY_END;
 }
 
 /*
@@ -240,35 +215,53 @@
  *     Data0 .. Data15: Array of 16 bytes of data.
  *     Checksum: Checksum byte to verify data integrity.
  */
-static int ziirave_firm_write_pkt(struct watchdog_device *wdd,
-				  const struct ihex_binrec *rec)
+static int __ziirave_firm_write_pkt(struct watchdog_device *wdd,
+				    u32 addr, const u8 *data, u8 len)
 {
+	const u16 addr16 = (u16)addr / 2;
 	struct i2c_client *client = to_i2c_client(wdd->parent);
 	u8 i, checksum = 0, packet[ZIIRAVE_FIRM_PKT_TOTAL_SIZE];
 	int ret;
-	u16 addr;
 
-	memset(packet, 0, ARRAY_SIZE(packet));
+	/* Check max data size */
+	if (len > ZIIRAVE_FIRM_PKT_DATA_SIZE) {
+		dev_err(&client->dev, "Firmware packet too long (%d)\n",
+			len);
+		return -EMSGSIZE;
+	}
+
+	/*
+	 * Ignore packets that target program memory outside of the
+	 * app partition, since they will be ignored by the
+	 * bootloader. At the same time, make sure we still allow the
+	 * zero-length packet that is sent as the last step of the
+	 * firmware update.
+	 */
+	if (len && ziirave_firm_addr_readonly(addr))
+		return 0;
 
 	/* Packet length */
-	packet[0] = (u8)be16_to_cpu(rec->len);
+	packet[0] = len;
 	/* Packet address */
-	addr = (be32_to_cpu(rec->addr) & 0xffff) >> 1;
-	packet[1] = addr & 0xff;
-	packet[2] = (addr & 0xff00) >> 8;
+	put_unaligned_le16(addr16, packet + 1);
 
-	/* Packet data */
-	if (be16_to_cpu(rec->len) > ZIIRAVE_FIRM_PKT_DATA_SIZE)
-		return -EMSGSIZE;
-	memcpy(packet + 3, rec->data, be16_to_cpu(rec->len));
+	memcpy(packet + 3, data, len);
+	memset(packet + 3 + len, 0, ZIIRAVE_FIRM_PKT_DATA_SIZE - len);
 
 	/* Packet checksum */
-	for (i = 0; i < ZIIRAVE_FIRM_PKT_TOTAL_SIZE - 1; i++)
+	for (i = 0; i < len + 3; i++)
 		checksum += packet[i];
 	packet[ZIIRAVE_FIRM_PKT_TOTAL_SIZE - 1] = checksum;
 
-	ret = ziirave_firm_write_block_data(wdd, ZIIRAVE_CMD_DOWNLOAD_PACKET,
-					    ARRAY_SIZE(packet), packet, true);
+	ret = i2c_smbus_write_block_data(client, ZIIRAVE_CMD_DOWNLOAD_PACKET,
+					 sizeof(packet), packet);
+	if (ret) {
+		dev_err(&client->dev,
+			"Failed to send DOWNLOAD_PACKET: %d\n", ret);
+		return ret;
+	}
+
+	ret = ziirave_firm_read_ack(wdd);
 	if (ret)
 		dev_err(&client->dev,
 		      "Failed to write firmware packet at address 0x%04x: %d\n",
@@ -277,6 +270,30 @@
 	return ret;
 }
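
As the comment above spells out, a download packet is always ZIIRAVE_FIRM_PKT_TOTAL_SIZE (20) bytes: a length byte, a little-endian 16-bit word address (the byte address halved, since the PIC's flash is word-addressed), 16 data bytes zero-padded to full width, and a trailing checksum summed over the header plus the live payload (the zero padding contributes nothing). A host-side sketch of that layout, assuming only the format described above:

#include <stdint.h>
#include <string.h>

#define PKT_TOTAL 20
#define PKT_DATA  16

static void build_pkt(uint8_t pkt[PKT_TOTAL], uint32_t byte_addr,
		      const uint8_t *data, uint8_t len)
{
	uint16_t addr16 = (uint16_t)(byte_addr / 2);	/* word address */
	uint8_t sum = 0;
	int i;

	pkt[0] = len;
	pkt[1] = addr16 & 0xff;				/* little endian */
	pkt[2] = addr16 >> 8;
	if (len)
		memcpy(pkt + 3, data, len);
	memset(pkt + 3 + len, 0, PKT_DATA - len);	/* zero padding */
	for (i = 0; i < len + 3; i++)
		sum += pkt[i];
	pkt[PKT_TOTAL - 1] = sum;
}

A zero-length packet (len == 0, as sent at the end of the download) is still a full 20-byte frame whose checksum covers only the three header bytes.
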
 
+static int ziirave_firm_write_pkt(struct watchdog_device *wdd,
+				  u32 addr, const u8 *data, u8 len)
+{
+	const u8 max_write_len = ZIIRAVE_FIRM_PAGE_SIZE -
+		(addr - ALIGN_DOWN(addr, ZIIRAVE_FIRM_PAGE_SIZE));
+	int ret;
+
+	if (len > max_write_len) {
+		/*
+		 * If the data crosses a page boundary, we need to split
+		 * this write in two
+		 */
+		ret = __ziirave_firm_write_pkt(wdd, addr, data, max_write_len);
+		if (ret)
+			return ret;
+
+		addr += max_write_len;
+		data += max_write_len;
+		len  -= max_write_len;
+	}
+
+	return __ziirave_firm_write_pkt(wdd, addr, data, len);
+}
+
 static int ziirave_firm_verify(struct watchdog_device *wdd,
 			       const struct firmware *fw)
 {
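
The wrapper above handles the one remaining corner case: a packet whose data would straddle a ZIIRAVE_FIRM_PAGE_SIZE (128-byte) flash page. Because a packet carries at most 16 data bytes, crossing a boundary can require at most one split. A sketch of the split computation, with ALIGN_DOWN re-derived so it builds standalone:

#include <stdint.h>

#define PAGE_SZ 128u
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1u))

/* How many of `len` bytes fit before the next page boundary. */
static uint8_t first_chunk(uint32_t addr, uint8_t len)
{
	uint32_t room = PAGE_SZ - (addr - ALIGN_DOWN(addr, PAGE_SZ));

	return len > room ? (uint8_t)room : len;	/* rest goes in packet two */
}

For example, first_chunk(0x2c7a, 16) returns 6, leaving 10 bytes for a second packet starting at 0x2c80.
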
@@ -284,16 +301,12 @@
 	const struct ihex_binrec *rec;
 	int i, ret;
 	u8 data[ZIIRAVE_FIRM_PKT_DATA_SIZE];
-	u16 addr;
 
 	for (rec = (void *)fw->data; rec; rec = ihex_next_binrec(rec)) {
-		/* Zero length marks end of records */
-		if (!be16_to_cpu(rec->len))
-			break;
+		const u16 len = be16_to_cpu(rec->len);
+		const u32 addr = be32_to_cpu(rec->addr);
 
-		addr = (be32_to_cpu(rec->addr) & 0xffff) >> 1;
-		if (addr < ZIIRAVE_FIRM_FLASH_MEMORY_START ||
-		    addr > ZIIRAVE_FIRM_FLASH_MEMORY_END)
+		if (ziirave_firm_addr_readonly(addr))
 			continue;
 
 		ret = ziirave_firm_set_read_addr(wdd, addr);
@@ -304,7 +317,7 @@
 			return ret;
 		}
 
-		for (i = 0; i < ARRAY_SIZE(data); i++) {
+		for (i = 0; i < len; i++) {
 			ret = i2c_smbus_read_byte_data(client,
 						ZIIRAVE_CMD_DOWNLOAD_READ_BYTE);
 			if (ret < 0) {
@@ -315,7 +328,7 @@
 			data[i] = ret;
 		}
 
-		if (memcmp(data, rec->data, be16_to_cpu(rec->len))) {
+		if (memcmp(data, rec->data, len)) {
 			dev_err(&client->dev,
 				"Firmware mismatch at address 0x%04x\n", addr);
 			return -EINVAL;
@@ -329,97 +342,45 @@
 			       const struct firmware *fw)
 {
 	struct i2c_client *client = to_i2c_client(wdd->parent);
-	int ret, words_till_page_break;
 	const struct ihex_binrec *rec;
-	struct ihex_binrec *rec_new;
+	int ret;
 
-	ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_JUMP_TO_BOOTLOADER, 1,
-				      false);
-	if (ret)
+	ret = i2c_smbus_write_byte_data(client,
+					ZIIRAVE_CMD_JUMP_TO_BOOTLOADER,
+					ZIIRAVE_CMD_JUMP_TO_BOOTLOADER_MAGIC);
+	if (ret) {
+		dev_err(&client->dev, "Failed to jump to bootloader\n");
 		return ret;
+	}
 
 	msleep(500);
 
-	ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_DOWNLOAD_START, 1, true);
-	if (ret)
+	ret = i2c_smbus_write_byte(client, ZIIRAVE_CMD_DOWNLOAD_START);
+	if (ret) {
+		dev_err(&client->dev, "Failed to start download\n");
 		return ret;
+	}
+
+	ret = ziirave_firm_read_ack(wdd);
+	if (ret) {
+		dev_err(&client->dev, "No ACK for start download\n");
+		return ret;
+	}
 
 	msleep(500);
 
 	for (rec = (void *)fw->data; rec; rec = ihex_next_binrec(rec)) {
-		/* Zero length marks end of records */
-		if (!be16_to_cpu(rec->len))
-			break;
-
-		/* Check max data size */
-		if (be16_to_cpu(rec->len) > ZIIRAVE_FIRM_PKT_DATA_SIZE) {
-			dev_err(&client->dev, "Firmware packet too long (%d)\n",
-				be16_to_cpu(rec->len));
-			return -EMSGSIZE;
-		}
-
-		/* Calculate words till page break */
-		words_till_page_break = (64 - ((be32_to_cpu(rec->addr) >> 1) &
-					 0x3f));
-		if ((be16_to_cpu(rec->len) >> 1) > words_till_page_break) {
-			/*
-			 * Data in passes page boundary, so we need to split in
-			 * two blocks of data. Create a packet with the first
-			 * block of data.
-			 */
-			rec_new = kzalloc(sizeof(struct ihex_binrec) +
-					  (words_till_page_break << 1),
-					  GFP_KERNEL);
-			if (!rec_new)
-				return -ENOMEM;
-
-			rec_new->len = cpu_to_be16(words_till_page_break << 1);
-			rec_new->addr = rec->addr;
-			memcpy(rec_new->data, rec->data,
-			       be16_to_cpu(rec_new->len));
-
-			ret = ziirave_firm_write_pkt(wdd, rec_new);
-			kfree(rec_new);
-			if (ret)
-				return ret;
-
-			/* Create a packet with the second block of data */
-			rec_new = kzalloc(sizeof(struct ihex_binrec) +
-					  be16_to_cpu(rec->len) -
-					  (words_till_page_break << 1),
-					  GFP_KERNEL);
-			if (!rec_new)
-				return -ENOMEM;
-
-			/* Remaining bytes */
-			rec_new->len = rec->len -
-				       cpu_to_be16(words_till_page_break << 1);
-
-			rec_new->addr = cpu_to_be32(be32_to_cpu(rec->addr) +
-					(words_till_page_break << 1));
-
-			memcpy(rec_new->data,
-			       rec->data + (words_till_page_break << 1),
-			       be16_to_cpu(rec_new->len));
-
-			ret = ziirave_firm_write_pkt(wdd, rec_new);
-			kfree(rec_new);
-			if (ret)
-				return ret;
-		} else {
-			ret = ziirave_firm_write_pkt(wdd, rec);
-			if (ret)
-				return ret;
-		}
+		ret = ziirave_firm_write_pkt(wdd, be32_to_cpu(rec->addr),
+					     rec->data, be16_to_cpu(rec->len));
+		if (ret)
+			return ret;
 	}
 
-	/* For end of download, the length field will be set to 0 */
-	rec_new = kzalloc(sizeof(struct ihex_binrec) + 1, GFP_KERNEL);
-	if (!rec_new)
-		return -ENOMEM;
-
-	ret = ziirave_firm_write_pkt(wdd, rec_new);
-	kfree(rec_new);
+	/*
+	 * Finish the firmware download process by sending a zero-length
+	 * payload
+	 */
+	ret = ziirave_firm_write_pkt(wdd, 0, NULL, 0);
 	if (ret) {
 		dev_err(&client->dev, "Failed to send EMPTY packet: %d\n", ret);
 		return ret;
@@ -437,15 +398,22 @@
 	}
 
 	/* End download operation */
-	ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_DOWNLOAD_END, 1, false);
-	if (ret)
+	ret = i2c_smbus_write_byte(client, ZIIRAVE_CMD_DOWNLOAD_END);
+	if (ret) {
+		dev_err(&client->dev,
+			"Failed to end firmware download: %d\n", ret);
 		return ret;
+	}
 
 	/* Reset the processor */
-	ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_RESET_PROCESSOR, 1,
-				      false);
-	if (ret)
+	ret = i2c_smbus_write_byte_data(client,
+					ZIIRAVE_CMD_RESET_PROCESSOR,
+					ZIIRAVE_CMD_RESET_PROCESSOR_MAGIC);
+	if (ret) {
+		dev_err(&client->dev,
+			"Failed to reset the watchdog: %d\n", ret);
 		return ret;
+	}
 
 	msleep(500);
 
@@ -478,7 +446,7 @@
 	if (ret)
 		return ret;
 
-	ret = sprintf(buf, "02.%02u.%02u", w_priv->firmware_rev.major,
+	ret = sprintf(buf, ZIIRAVE_FW_VERSION_FMT, w_priv->firmware_rev.major,
 		      w_priv->firmware_rev.minor);
 
 	mutex_unlock(&w_priv->sysfs_mutex);
@@ -501,7 +469,7 @@
 	if (ret)
 		return ret;
 
-	ret = sprintf(buf, "01.%02u.%02u", w_priv->bootloader_rev.major,
+	ret = sprintf(buf, ZIIRAVE_BL_VERSION_FMT, w_priv->bootloader_rev.major,
 		      w_priv->bootloader_rev.minor);
 
 	mutex_unlock(&w_priv->sysfs_mutex);
@@ -568,7 +536,8 @@
 		goto unlock_mutex;
 	}
 
-	dev_info(&client->dev, "Firmware updated to version 02.%02u.%02u\n",
+	dev_info(&client->dev,
+		 "Firmware updated to version " ZIIRAVE_FW_VERSION_FMT "\n",
 		 w_priv->firmware_rev.major, w_priv->firmware_rev.minor);
 
 	/* Restore the watchdog timeout */
@@ -611,7 +580,7 @@
 						   &reset_duration);
 		if (ret) {
 			dev_info(&client->dev,
-				 "Unable to set reset pulse duration, using default\n");
+			 "No reset pulse duration specified, using default\n");
 			return 0;
 		}
 	}
@@ -633,7 +602,10 @@
 	struct ziirave_wdt_data *w_priv;
 	int val;
 
-	if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE_DATA))
+	if (!i2c_check_functionality(client->adapter,
+				     I2C_FUNC_SMBUS_BYTE |
+				     I2C_FUNC_SMBUS_BYTE_DATA |
+				     I2C_FUNC_SMBUS_WRITE_BLOCK_DATA))
 		return -ENODEV;
 
 	w_priv = devm_kzalloc(&client->dev, sizeof(*w_priv), GFP_KERNEL);
@@ -658,57 +630,80 @@
 	 */
 	if (w_priv->wdd.timeout == 0) {
 		val = i2c_smbus_read_byte_data(client, ZIIRAVE_WDT_TIMEOUT);
-		if (val < 0)
+		if (val < 0) {
+			dev_err(&client->dev, "Failed to read timeout\n");
 			return val;
+		}
 
-		if (val < ZIIRAVE_TIMEOUT_MIN)
-			return -ENODEV;
+		if (val > ZIIRAVE_TIMEOUT_MAX ||
+		    val < ZIIRAVE_TIMEOUT_MIN)
+			val = ZIIRAVE_TIMEOUT_DEFAULT;
 
 		w_priv->wdd.timeout = val;
-	} else {
-		ret = ziirave_wdt_set_timeout(&w_priv->wdd,
-					      w_priv->wdd.timeout);
-		if (ret)
-			return ret;
-
-		dev_info(&client->dev, "Timeout set to %ds.",
-			 w_priv->wdd.timeout);
 	}
 
+	ret = ziirave_wdt_set_timeout(&w_priv->wdd, w_priv->wdd.timeout);
+	if (ret) {
+		dev_err(&client->dev, "Failed to set timeout\n");
+		return ret;
+	}
+
+	dev_info(&client->dev, "Timeout set to %ds\n", w_priv->wdd.timeout);
+
 	watchdog_set_nowayout(&w_priv->wdd, nowayout);
 
 	i2c_set_clientdata(client, w_priv);
 
 	/* If in unconfigured state, set to stopped */
 	val = i2c_smbus_read_byte_data(client, ZIIRAVE_WDT_STATE);
-	if (val < 0)
+	if (val < 0) {
+		dev_err(&client->dev, "Failed to read state\n");
 		return val;
+	}
 
 	if (val == ZIIRAVE_STATE_INITIAL)
 		ziirave_wdt_stop(&w_priv->wdd);
 
 	ret = ziirave_wdt_init_duration(client);
-	if (ret)
+	if (ret) {
+		dev_err(&client->dev, "Failed to init duration\n");
 		return ret;
+	}
 
 	ret = ziirave_wdt_revision(client, &w_priv->firmware_rev,
 				   ZIIRAVE_WDT_FIRM_VER_MAJOR);
-	if (ret)
+	if (ret) {
+		dev_err(&client->dev, "Failed to read firmware version\n");
 		return ret;
+	}
+
+	dev_info(&client->dev,
+		 "Firmware version: " ZIIRAVE_FW_VERSION_FMT "\n",
+		 w_priv->firmware_rev.major, w_priv->firmware_rev.minor);
 
 	ret = ziirave_wdt_revision(client, &w_priv->bootloader_rev,
 				   ZIIRAVE_WDT_BOOT_VER_MAJOR);
-	if (ret)
+	if (ret) {
+		dev_err(&client->dev, "Failed to read bootloader version\n");
 		return ret;
+	}
+
+	dev_info(&client->dev,
+		 "Bootloader version: " ZIIRAVE_BL_VERSION_FMT "\n",
+		 w_priv->bootloader_rev.major, w_priv->bootloader_rev.minor);
 
 	w_priv->reset_reason = i2c_smbus_read_byte_data(client,
 						ZIIRAVE_WDT_RESET_REASON);
-	if (w_priv->reset_reason < 0)
+	if (w_priv->reset_reason < 0) {
+		dev_err(&client->dev, "Failed to read reset reason\n");
 		return w_priv->reset_reason;
+	}
 
 	if (w_priv->reset_reason >= ARRAY_SIZE(ziirave_reasons) ||
-	    !ziirave_reasons[w_priv->reset_reason])
+	    !ziirave_reasons[w_priv->reset_reason]) {
+		dev_err(&client->dev, "Invalid reset reason\n");
 		return -ENODEV;
+	}
 
 	ret = watchdog_register_device(&w_priv->wdd);
 
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 2e8570c..6c88439 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -247,7 +247,7 @@
  */
 unsigned int evtchn_from_irq(unsigned irq)
 {
-	if (unlikely(WARN(irq >= nr_irqs, "Invalid irq %d!\n", irq)))
+	if (WARN(irq >= nr_irqs, "Invalid irq %d!\n", irq))
 		return 0;
 
 	return info_for_irq(irq)->evtchn;
diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
index 3eeb9be..224df03 100644
--- a/drivers/xen/pci.c
+++ b/drivers/xen/pci.c
@@ -17,6 +17,8 @@
 #include "../pci/pci.h"
 #ifdef CONFIG_PCI_MMCONFIG
 #include <asm/pci_x86.h>
+
+static int xen_mcfg_late(void);
 #endif
 
 static bool __read_mostly pci_seg_supported = true;
@@ -28,7 +30,18 @@
 #ifdef CONFIG_PCI_IOV
 	struct pci_dev *physfn = pci_dev->physfn;
 #endif
-
+#ifdef CONFIG_PCI_MMCONFIG
+	static bool pci_mcfg_reserved = false;
+	/*
+	 * Reserve MCFG areas in Xen on the first invocation, since this
+	 * may be called from inside acpi_init immediately after the
+	 * MCFG table has been parsed.
+	 */
+	if (!pci_mcfg_reserved) {
+		xen_mcfg_late();
+		pci_mcfg_reserved = true;
+	}
+#endif
 	if (pci_seg_supported) {
 		struct {
 			struct physdev_pci_device_add add;
@@ -201,7 +214,7 @@
 arch_initcall(register_xen_pci_notifier);
 
 #ifdef CONFIG_PCI_MMCONFIG
-static int __init xen_mcfg_late(void)
+static int xen_mcfg_late(void)
 {
 	struct pci_mmcfg_region *cfg;
 	int rc;
@@ -240,8 +253,4 @@
 	}
 	return 0;
 }
-/*
- * Needs to be done after acpi_init which are subsys_initcall.
- */
-subsys_initcall_sync(xen_mcfg_late);
 #endif
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 58c9365..bd3a10d 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -39,6 +39,7 @@
 #include <asm/xen/page-coherent.h>
 
 #include <trace/events/swiotlb.h>
+#define MAX_DMA_BITS 32
 /*
  * Used to do a quick range check in swiotlb_tbl_unmap_single and
  * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
@@ -115,8 +116,6 @@
 	return 0;
 }
 
-static int max_dma_bits = 32;
-
 static int
 xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
 {
@@ -136,7 +135,7 @@
 				p + (i << IO_TLB_SHIFT),
 				get_order(slabs << IO_TLB_SHIFT),
 				dma_bits, &dma_handle);
-		} while (rc && dma_bits++ < max_dma_bits);
+		} while (rc && dma_bits++ < MAX_DMA_BITS);
 		if (rc)
 			return rc;
 
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 995e332e..eb2151f 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -51,6 +51,8 @@
 	if (!v9ses->cachetag) {
 		if (v9fs_random_cachetag(v9ses) < 0) {
 			v9ses->fscache = NULL;
+			kfree(v9ses->cachetag);
+			v9ses->cachetag = NULL;
 			return;
 		}
 	}
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 4cc966a..fe7f0bd 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -513,6 +513,7 @@
 	v9inode = V9FS_I(inode);
 	mutex_lock(&v9inode->v_mutex);
 	if (!v9inode->writeback_fid &&
+	    (vma->vm_flags & VM_SHARED) &&
 	    (vma->vm_flags & VM_WRITE)) {
 		/*
 		 * clone a fid and add it to writeback_fid
@@ -614,6 +615,8 @@
 			(vma->vm_end - vma->vm_start - 1),
 	};
 
+	if (!(vma->vm_flags & VM_SHARED))
+		return;
 
 	p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
 
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index ca243e6..74df32b 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -58,7 +58,7 @@
 
 static int
 v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
-		int flags, void *data)
+		int flags)
 {
 	int ret;
 
@@ -132,7 +132,7 @@
 		retval = PTR_ERR(sb);
 		goto clunk_fid;
 	}
-	retval = v9fs_fill_super(sb, v9ses, flags, data);
+	retval = v9fs_fill_super(sb, v9ses, flags);
 	if (retval)
 		goto release_sb;
 
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index bcd1baf..4150280 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -10,13 +10,6 @@
 #include <linux/dns_resolver.h>
 #include "internal.h"
 
-const struct file_operations afs_dynroot_file_operations = {
-	.open		= dcache_dir_open,
-	.release	= dcache_dir_close,
-	.iterate_shared	= dcache_readdir,
-	.llseek		= dcache_dir_lseek,
-};
-
 /*
  * Probe to see if a cell may exist.  This prevents positive dentries from
  * being created unnecessarily.
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 7b1c18c..46d2d7c 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -443,7 +443,7 @@
 	inode->i_mode		= S_IFDIR | S_IRUGO | S_IXUGO;
 	if (root) {
 		inode->i_op	= &afs_dynroot_inode_operations;
-		inode->i_fop	= &afs_dynroot_file_operations;
+		inode->i_fop	= &simple_dir_operations;
 	} else {
 		inode->i_op	= &afs_autocell_inode_operations;
 	}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 9cdfaba..759e057 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -910,7 +910,6 @@
 /*
  * dynroot.c
  */
-extern const struct file_operations afs_dynroot_file_operations;
 extern const struct inode_operations afs_dynroot_inode_operations;
 extern const struct dentry_operations afs_dynroot_dentry_operations;
 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d4e11b2..ad4c6b1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -670,26 +670,6 @@
  * libraries.  There is no binary dependent code anywhere else.
  */
 
-#ifndef STACK_RND_MASK
-#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))	/* 8MB of VA */
-#endif
-
-static unsigned long randomize_stack_top(unsigned long stack_top)
-{
-	unsigned long random_variable = 0;
-
-	if (current->flags & PF_RANDOMIZE) {
-		random_variable = get_random_long();
-		random_variable &= STACK_RND_MASK;
-		random_variable <<= PAGE_SHIFT;
-	}
-#ifdef CONFIG_STACK_GROWSUP
-	return PAGE_ALIGN(stack_top) + random_variable;
-#else
-	return PAGE_ALIGN(stack_top) - random_variable;
-#endif
-}
-
 static int load_elf_binary(struct linux_binprm *bprm)
 {
 	struct file *interpreter = NULL; /* to shut gcc up */
@@ -1141,7 +1121,8 @@
 		 * (since it grows up, and may collide early with the stack
 		 * growing down), and into the unused ELF_ET_DYN_BASE region.
 		 */
-		if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && !interpreter)
+		if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
+		    loc->elf_ex.e_type == ET_DYN && !interpreter)
 			current->mm->brk = current->mm->start_brk =
 				ELF_ET_DYN_BASE;
 
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index a699e32..c1da294 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,7 +6,7 @@
 obj-$(CONFIG_CEPH_FS) += ceph.o
 
 ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
-	export.o caps.o snap.o xattr.o quota.o \
+	export.o caps.o snap.o xattr.o quota.o io.o \
 	mds_client.o mdsmap.o strings.o ceph_frag.o \
 	debugfs.o
 
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index b3c8b88..7ab6166 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -189,8 +189,7 @@
 {
 	struct inode *inode = file_inode(filp);
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_osd_client *osdc =
-		&ceph_inode_to_client(inode)->client->osdc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	int err = 0;
 	u64 off = page_offset(page);
 	u64 len = PAGE_SIZE;
@@ -219,8 +218,8 @@
 
 	dout("readpage inode %p file %p page %p index %lu\n",
 	     inode, filp, page, page->index);
-	err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
-				  off, &len,
+	err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
+				  &ci->i_layout, off, &len,
 				  ci->i_truncate_seq, ci->i_truncate_size,
 				  &page, 1, 0);
 	if (err == -ENOENT)
@@ -228,6 +227,8 @@
 	if (err < 0) {
 		SetPageError(page);
 		ceph_fscache_readpage_cancel(inode, page);
+		if (err == -EBLACKLISTED)
+			fsc->blacklisted = true;
 		goto out;
 	}
 	if (err < PAGE_SIZE)
@@ -266,6 +267,8 @@
 	int i;
 
 	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
+	if (rc == -EBLACKLISTED)
+		ceph_inode_to_client(inode)->blacklisted = true;
 
 	/* unlock all pages, zeroing any data we didn't read */
 	osd_data = osd_req_op_extent_osd_data(req, 0);
@@ -323,7 +326,8 @@
 		/* caller of readpages does not hold buffer and read caps
 		 * (fadvise, madvise and readahead cases) */
 		int want = CEPH_CAP_FILE_CACHE;
-		ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got);
+		ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want,
+					true, &got);
 		if (ret < 0) {
 			dout("start_read %p, error getting cap\n", inode);
 		} else if (!(got & want)) {
@@ -569,7 +573,7 @@
 /*
  * Write a single page, but leave the page locked.
  *
- * If we get a write error, set the page error bit, but still adjust the
+ * If we get a write error, mark the mapping for error, but still adjust the
  * dirty page accounting (i.e., page is no longer dirty).
  */
 static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
@@ -640,9 +644,10 @@
 			end_page_writeback(page);
 			return err;
 		}
+		if (err == -EBLACKLISTED)
+			fsc->blacklisted = true;
 		dout("writepage setting page/mapping error %d %p\n",
 		     err, page);
-		SetPageError(page);
 		mapping_set_error(&inode->i_data, err);
 		wbc->pages_skipped++;
 	} else {
@@ -680,23 +685,6 @@
 }
 
 /*
- * lame release_pages helper.  release_pages() isn't exported to
- * modules.
- */
-static void ceph_release_pages(struct page **pages, int num)
-{
-	struct pagevec pvec;
-	int i;
-
-	pagevec_init(&pvec);
-	for (i = 0; i < num; i++) {
-		if (pagevec_add(&pvec, pages[i]) == 0)
-			pagevec_release(&pvec);
-	}
-	pagevec_release(&pvec);
-}
-
-/*
  * async writeback completion handler.
  *
  * If we get an error, set the mapping error bit, but not the individual
@@ -720,6 +708,8 @@
 	if (rc < 0) {
 		mapping_set_error(mapping, rc);
 		ceph_set_error_write(ci);
+		if (rc == -EBLACKLISTED)
+			fsc->blacklisted = true;
 	} else {
 		ceph_clear_error_write(ci);
 	}
@@ -769,7 +759,7 @@
 		dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
 		     inode, osd_data->length, rc >= 0 ? num_pages : 0);
 
-		ceph_release_pages(osd_data->pages, num_pages);
+		release_pages(osd_data->pages, num_pages);
 	}
 
 	ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
@@ -1452,7 +1442,8 @@
 		want = CEPH_CAP_FILE_CACHE;
 
 	got = 0;
-	err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+	err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1,
+			    &got, &pinned_page);
 	if (err < 0)
 		goto out_restore;
 
@@ -1540,6 +1531,7 @@
 	if (!prealloc_cf)
 		return VM_FAULT_OOM;
 
+	sb_start_pagefault(inode->i_sb);
 	ceph_block_sigs(&oldset);
 
 	if (ci->i_inline_version != CEPH_INLINE_NONE) {
@@ -1568,7 +1560,7 @@
 		want = CEPH_CAP_FILE_BUFFER;
 
 	got = 0;
-	err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
+	err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len,
 			    &got, NULL);
 	if (err < 0)
 		goto out_free;
@@ -1614,6 +1606,7 @@
 	ceph_put_cap_refs(ci, got);
 out_free:
 	ceph_restore_sigs(&oldset);
+	sb_end_pagefault(inode->i_sb);
 	ceph_free_cap_flush(prealloc_cf);
 	if (err < 0)
 		ret = vmf_error(err);
@@ -1946,12 +1939,17 @@
 
 	if (err >= 0 || err == -ENOENT)
 		have |= POOL_READ;
-	else if (err != -EPERM)
+	else if (err != -EPERM) {
+		if (err == -EBLACKLISTED)
+			fsc->blacklisted = true;
 		goto out_unlock;
+	}
 
 	if (err2 == 0 || err2 == -EEXIST)
 		have |= POOL_WRITE;
 	else if (err2 != -EPERM) {
+		if (err2 == -EBLACKLISTED)
+			fsc->blacklisted = true;
 		err = err2;
 		goto out_unlock;
 	}
@@ -1989,10 +1987,11 @@
 	return err;
 }
 
-int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
+int ceph_pool_perm_check(struct inode *inode, int need)
 {
-	s64 pool;
+	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_string *pool_ns;
+	s64 pool;
 	int ret, flags;
 
 	if (ci->i_vino.snap != CEPH_NOSNAP) {
@@ -2004,7 +2003,7 @@
 		return 0;
 	}
 
-	if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
+	if (ceph_test_mount_opt(ceph_inode_to_client(inode),
 				NOPOOLPERM))
 		return 0;
 
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index bc90cf6..b2ec29e 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -6,6 +6,8 @@
  *  Written by Milosz Tanski (milosz@adfin.com)
  */
 
+#include <linux/ceph/ceph_debug.h>
+
 #include "super.h"
 #include "cache.h"
 
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ce0f565..d3b9c9d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -458,37 +458,6 @@
 }
 
 /*
- * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
- */
-static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
-{
-	struct ceph_cap *cap;
-	int mds = -1;
-	struct rb_node *p;
-
-	/* prefer mds with WR|BUFFER|EXCL caps */
-	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
-		cap = rb_entry(p, struct ceph_cap, ci_node);
-		mds = cap->mds;
-		if (cap->issued & (CEPH_CAP_FILE_WR |
-				   CEPH_CAP_FILE_BUFFER |
-				   CEPH_CAP_FILE_EXCL))
-			break;
-	}
-	return mds;
-}
-
-int ceph_get_cap_mds(struct inode *inode)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	int mds;
-	spin_lock(&ci->i_ceph_lock);
-	mds = __ceph_get_cap_mds(ceph_inode(inode));
-	spin_unlock(&ci->i_ceph_lock);
-	return mds;
-}
-
-/*
  * Called under i_ceph_lock.
  */
 static void __insert_cap_node(struct ceph_inode_info *ci,
@@ -628,7 +597,7 @@
 /*
  * Add a capability under the given MDS session.
  *
- * Caller should hold session snap_rwsem (read) and s_mutex.
+ * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock.
  *
  * @fmode is the open file mode, if we are opening a file, otherwise
  * it is < 0.  (This is so we can atomically add the cap and add an
@@ -645,6 +614,9 @@
 	struct ceph_cap *cap;
 	int mds = session->s_mds;
 	int actual_wanted;
+	u32 gen;
+
+	lockdep_assert_held(&ci->i_ceph_lock);
 
 	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
 	     session->s_mds, cap_id, ceph_cap_string(issued), seq);
@@ -656,6 +628,10 @@
 	if (fmode >= 0)
 		wanted |= ceph_caps_for_mode(fmode);
 
+	spin_lock(&session->s_gen_ttl_lock);
+	gen = session->s_cap_gen;
+	spin_unlock(&session->s_gen_ttl_lock);
+
 	cap = __get_cap_for_mds(ci, mds);
 	if (!cap) {
 		cap = *new_cap;
@@ -681,7 +657,7 @@
 		list_move_tail(&cap->session_caps, &session->s_caps);
 		spin_unlock(&session->s_cap_lock);
 
-		if (cap->cap_gen < session->s_cap_gen)
+		if (cap->cap_gen < gen)
 			cap->issued = cap->implemented = CEPH_CAP_PIN;
 
 		/*
@@ -775,7 +751,7 @@
 	cap->seq = seq;
 	cap->issue_seq = seq;
 	cap->mseq = mseq;
-	cap->cap_gen = session->s_cap_gen;
+	cap->cap_gen = gen;
 
 	if (fmode >= 0)
 		__ceph_get_fmode(ci, fmode);
@@ -1284,10 +1260,6 @@
  * Make note of max_size reported/requested from mds, revoked caps
  * that have now been implemented.
  *
- * Make half-hearted attempt ot to invalidate page cache if we are
- * dropping RDCACHE.  Note that this will leave behind locked pages
- * that we'll then need to deal with elsewhere.
- *
  * Return non-zero if delayed release, or we experienced an error
  * such that the caller should requeue + retry later.
  *
@@ -1746,11 +1718,11 @@
  * Add dirty inode to the flushing list.  Assigned a seq number so we
  * can wait for caps to flush without starving.
  *
- * Called under i_ceph_lock.
+ * Called under i_ceph_lock. Returns the flush tid.
  */
-static int __mark_caps_flushing(struct inode *inode,
+static u64 __mark_caps_flushing(struct inode *inode,
 				struct ceph_mds_session *session, bool wake,
-				u64 *flush_tid, u64 *oldest_flush_tid)
+				u64 *oldest_flush_tid)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1789,8 +1761,7 @@
 
 	list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
 
-	*flush_tid = cf->tid;
-	return flushing;
+	return cf->tid;
 }
 
 /*
@@ -2028,11 +1999,6 @@
 		}
 
 ack:
-		if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
-			dout(" skipping %p I_NOFLUSH set\n", inode);
-			continue;
-		}
-
 		if (session && session != cap->session) {
 			dout("oops, wrong session %p mutex\n", session);
 			mutex_unlock(&session->s_mutex);
@@ -2080,9 +2046,9 @@
 		}
 
 		if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
-			flushing = __mark_caps_flushing(inode, session, false,
-							&flush_tid,
-							&oldest_flush_tid);
+			flushing = ci->i_dirty_caps;
+			flush_tid = __mark_caps_flushing(inode, session, false,
+							 &oldest_flush_tid);
 		} else {
 			flushing = 0;
 			flush_tid = 0;
@@ -2130,16 +2096,11 @@
 retry:
 	spin_lock(&ci->i_ceph_lock);
 retry_locked:
-	if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
-		spin_unlock(&ci->i_ceph_lock);
-		dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
-		goto out;
-	}
 	if (ci->i_dirty_caps && ci->i_auth_cap) {
 		struct ceph_cap *cap = ci->i_auth_cap;
 		int delayed;
 
-		if (!session || session != cap->session) {
+		if (session != cap->session) {
 			spin_unlock(&ci->i_ceph_lock);
 			if (session)
 				mutex_unlock(&session->s_mutex);
@@ -2161,8 +2122,9 @@
 			goto retry_locked;
 		}
 
-		flushing = __mark_caps_flushing(inode, session, true,
-						&flush_tid, &oldest_flush_tid);
+		flushing = ci->i_dirty_caps;
+		flush_tid = __mark_caps_flushing(inode, session, true,
+						 &oldest_flush_tid);
 
 		/* __send_cap drops i_ceph_lock */
 		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
@@ -2261,35 +2223,45 @@
 
 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
+	struct ceph_file_info *fi = file->private_data;
 	struct inode *inode = file->f_mapping->host;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 flush_tid;
-	int ret;
+	int ret, err;
 	int dirty;
 
 	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
 
 	ret = file_write_and_wait_range(file, start, end);
-	if (ret < 0)
-		goto out;
-
 	if (datasync)
 		goto out;
 
 	dirty = try_flush_caps(inode, &flush_tid);
 	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
 
-	ret = unsafe_request_wait(inode);
+	err = unsafe_request_wait(inode);
 
 	/*
 	 * only wait on non-file metadata writeback (the mds
 	 * can recover size and mtime, so we don't need to
 	 * wait for that)
 	 */
-	if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
-		ret = wait_event_interruptible(ci->i_cap_wq,
+	if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
+		err = wait_event_interruptible(ci->i_cap_wq,
 					caps_are_flushed(inode, flush_tid));
 	}
+
+	if (err < 0)
+		ret = err;
+
+	if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) {
+		spin_lock(&file->f_lock);
+		err = errseq_check_and_advance(&ci->i_meta_err,
+					       &fi->meta_err);
+		spin_unlock(&file->f_lock);
+		if (err < 0)
+			ret = err;
+	}
 out:
 	dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
 	return ret;
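
The errseq additions give each open file its own error cursor: metadata writeback errors are presumably recorded in ci->i_meta_err elsewhere, fi->meta_err holds the sequence sampled at open, and fsync reports the error once per file description before advancing the cursor (the lockless errseq_check() fast path avoids taking f_lock when nothing changed). A simplified userspace analog of that pattern; the real errseq_t packs the error code, a counter and a "seen" flag into a single word, which this sketch deliberately keeps separate:

#include <stdint.h>

struct errseq { int err; uint32_t seq; };	/* shared, per inode */
struct cursor { uint32_t seq; };		/* private, per open file */

static void errseq_record(struct errseq *e, int err)
{
	e->err = err;
	e->seq++;			/* a new error instance */
}

static int errseq_report_once(struct errseq *e, struct cursor *c)
{
	if (c->seq == e->seq)
		return 0;		/* already reported to this fd */
	c->seq = e->seq;		/* consume it, then succeed again */
	return e->err;
}
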
@@ -2560,10 +2532,15 @@
  *
  * FIXME: how does a 0 return differ from -EAGAIN?
  */
-static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
-			    loff_t endoff, bool nonblock, int *got)
+enum {
+	NON_BLOCKING	= 1,
+	CHECK_FILELOCK	= 2,
+};
+
+static int try_get_cap_refs(struct inode *inode, int need, int want,
+			    loff_t endoff, int flags, int *got)
 {
-	struct inode *inode = &ci->vfs_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	int ret = 0;
 	int have, implemented;
@@ -2576,6 +2553,13 @@
 again:
 	spin_lock(&ci->i_ceph_lock);
 
+	if ((flags & CHECK_FILELOCK) &&
+	    (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
+		dout("try_get_cap_refs %p error filelock\n", inode);
+		ret = -EIO;
+		goto out_unlock;
+	}
+
 	/* make sure file is actually open */
 	file_wanted = __ceph_caps_file_wanted(ci);
 	if ((file_wanted & need) != need) {
@@ -2637,7 +2621,7 @@
 					 * we can not call down_read() when
 					 * task isn't in TASK_RUNNING state
 					 */
-					if (nonblock) {
+					if (flags & NON_BLOCKING) {
 						ret = -EAGAIN;
 						goto out_unlock;
 					}
@@ -2731,18 +2715,19 @@
 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
 }
 
-int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
+int ceph_try_get_caps(struct inode *inode, int need, int want,
 		      bool nonblock, int *got)
 {
 	int ret;
 
 	BUG_ON(need & ~CEPH_CAP_FILE_RD);
 	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
-	ret = ceph_pool_perm_check(ci, need);
+	ret = ceph_pool_perm_check(inode, need);
 	if (ret < 0)
 		return ret;
 
-	ret = try_get_cap_refs(ci, need, want, 0, nonblock, got);
+	ret = try_get_cap_refs(inode, need, want, 0,
+			       (nonblock ? NON_BLOCKING : 0), got);
 	return ret == -EAGAIN ? 0 : ret;
 }
 
@@ -2751,30 +2736,40 @@
  * due to a small max_size, make sure we check_max_size (and possibly
  * ask the mds) so we don't get hung up indefinitely.
  */
-int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+int ceph_get_caps(struct file *filp, int need, int want,
 		  loff_t endoff, int *got, struct page **pinned_page)
 {
-	int _got, ret;
+	struct ceph_file_info *fi = filp->private_data;
+	struct inode *inode = file_inode(filp);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	int ret, _got, flags;
 
-	ret = ceph_pool_perm_check(ci, need);
+	ret = ceph_pool_perm_check(inode, need);
 	if (ret < 0)
 		return ret;
 
+	if ((fi->fmode & CEPH_FILE_MODE_WR) &&
+	    fi->filp_gen != READ_ONCE(fsc->filp_gen))
+		return -EBADF;
+
 	while (true) {
 		if (endoff > 0)
-			check_max_size(&ci->vfs_inode, endoff);
+			check_max_size(inode, endoff);
 
+		flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
 		_got = 0;
-		ret = try_get_cap_refs(ci, need, want, endoff,
-				       false, &_got);
+		ret = try_get_cap_refs(inode, need, want, endoff,
+				       flags, &_got);
 		if (ret == -EAGAIN)
 			continue;
 		if (!ret) {
 			DEFINE_WAIT_FUNC(wait, woken_wake_function);
 			add_wait_queue(&ci->i_cap_wq, &wait);
 
-			while (!(ret = try_get_cap_refs(ci, need, want, endoff,
-							true, &_got))) {
+			flags |= NON_BLOCKING;
+			while (!(ret = try_get_cap_refs(inode, need, want,
+							endoff, flags, &_got))) {
 				if (signal_pending(current)) {
 					ret = -ERESTARTSYS;
 					break;
@@ -2786,10 +2781,18 @@
 			if (ret == -EAGAIN)
 				continue;
 		}
+
+		if ((fi->fmode & CEPH_FILE_MODE_WR) &&
+		    fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
+			if (ret >= 0 && _got)
+				ceph_put_cap_refs(ci, _got);
+			return -EBADF;
+		}
+
 		if (ret < 0) {
 			if (ret == -ESTALE) {
 				/* session was killed, try renew caps */
-				ret = ceph_renew_caps(&ci->vfs_inode);
+				ret = ceph_renew_caps(inode);
 				if (ret == 0)
 					continue;
 			}
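
The new -EBADF returns above compare the generation number each file sampled at open time (fi->filp_gen) against the client-wide fsc->filp_gen, which the super.c hunk later in this patch bumps on forced umount. A toy analog of the scheme, with hypothetical toy_* names:

```c
#include <stdio.h>

#define EBADF 9	/* as on Linux */

/* Toy analog of fsc->filp_gen / fi->filp_gen. */
struct toy_fs   { unsigned int gen; };
struct toy_file { unsigned int gen; };

static void toy_open(struct toy_fs *fs, struct toy_file *f)
{
	f->gen = fs->gen;	/* snapshot at open, as ceph_init_file_info() does */
}

static int toy_write_check(struct toy_fs *fs, struct toy_file *f)
{
	if (f->gen != fs->gen)
		return -EBADF;	/* file predates the forced umount */
	return 0;
}

int main(void)
{
	struct toy_fs fs = { .gen = 1 };
	struct toy_file f;

	toy_open(&fs, &f);
	fs.gen++;		/* what ceph_umount_begin() does to filp_gen */
	printf("check after umount_begin: %d\n", toy_write_check(&fs, &f));
	return 0;
}
```
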
@@ -2798,9 +2801,9 @@
 
 		if (ci->i_inline_version != CEPH_INLINE_NONE &&
 		    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
-		    i_size_read(&ci->vfs_inode) > 0) {
+		    i_size_read(inode) > 0) {
 			struct page *page =
-				find_get_page(ci->vfs_inode.i_mapping, 0);
+				find_get_page(inode->i_mapping, 0);
 			if (page) {
 				if (PageUptodate(page)) {
 					*pinned_page = page;
@@ -2819,7 +2822,7 @@
 			 * getattr request will bring inline data into
 			 * page cache
 			 */
-			ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
+			ret = __ceph_do_getattr(inode, NULL,
 						CEPH_STAT_CAP_INLINE_DATA,
 						true);
 			if (ret < 0)
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 2eb88ed..facb387 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -294,7 +294,6 @@
 
 void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 {
-	return 0;
 }
 
 void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 15ff1b0..b6bfa94 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -35,7 +35,7 @@
 static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
 			      struct inode *parent_inode)
 {
-	const static int snap_handle_length =
+	static const int snap_handle_length =
 		sizeof(struct ceph_nfs_snapfh) >> 2;
 	struct ceph_nfs_snapfh *sfh = (void *)rawfh;
 	u64 snapid = ceph_snap(inode);
@@ -85,9 +85,9 @@
 static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
 			  struct inode *parent_inode)
 {
-	const static int handle_length =
+	static const int handle_length =
 		sizeof(struct ceph_nfs_fh) >> 2;
-	const static int connected_handle_length =
+	static const int connected_handle_length =
 		sizeof(struct ceph_nfs_confh) >> 2;
 	int type;
 
@@ -458,33 +458,33 @@
 		if (err < 0)
 			goto out;
 
-		 rinfo = &req->r_reply_info;
-		 for (i = 0; i < rinfo->dir_nr; i++) {
-			 rde = rinfo->dir_entries + i;
-			 BUG_ON(!rde->inode.in);
-			 if (ceph_snap(inode) ==
-			     le64_to_cpu(rde->inode.in->snapid)) {
-				 memcpy(name, rde->name, rde->name_len);
-				 name[rde->name_len] = '\0';
-				 err = 0;
-				 goto out;
-			 }
-		 }
+		rinfo = &req->r_reply_info;
+		for (i = 0; i < rinfo->dir_nr; i++) {
+			rde = rinfo->dir_entries + i;
+			BUG_ON(!rde->inode.in);
+			if (ceph_snap(inode) ==
+			    le64_to_cpu(rde->inode.in->snapid)) {
+				memcpy(name, rde->name, rde->name_len);
+				name[rde->name_len] = '\0';
+				err = 0;
+				goto out;
+			}
+		}
 
-		 if (rinfo->dir_end)
-			 break;
+		if (rinfo->dir_end)
+			break;
 
-		 BUG_ON(rinfo->dir_nr <= 0);
-		 rde = rinfo->dir_entries + (rinfo->dir_nr - 1);
-		 next_offset += rinfo->dir_nr;
-		 last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL);
-		 if (!last_name) {
-			 err = -ENOMEM;
-			 goto out;
-		 }
+		BUG_ON(rinfo->dir_nr <= 0);
+		rde = rinfo->dir_entries + (rinfo->dir_nr - 1);
+		next_offset += rinfo->dir_nr;
+		last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL);
+		if (!last_name) {
+			err = -ENOMEM;
+			goto out;
+		}
 
-		 ceph_mdsc_put_request(req);
-		 req = NULL;
+		ceph_mdsc_put_request(req);
+		req = NULL;
 	}
 	err = -ENOENT;
 out:
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 685a03c..d277f71 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -15,6 +15,7 @@
 #include "super.h"
 #include "mds_client.h"
 #include "cache.h"
+#include "io.h"
 
 static __le32 ceph_flags_sys2wire(u32 flags)
 {
@@ -201,6 +202,7 @@
 static int ceph_init_file_info(struct inode *inode, struct file *file,
 					int fmode, bool isdir)
 {
+	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_file_info *fi;
 
 	dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
@@ -211,7 +213,7 @@
 		struct ceph_dir_file_info *dfi =
 			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
 		if (!dfi) {
-			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+			ceph_put_fmode(ci, fmode); /* clean up */
 			return -ENOMEM;
 		}
 
@@ -222,7 +224,7 @@
 	} else {
 		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
 		if (!fi) {
-			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+			ceph_put_fmode(ci, fmode); /* clean up */
 			return -ENOMEM;
 		}
 
@@ -232,6 +234,8 @@
 	fi->fmode = fmode;
 	spin_lock_init(&fi->rw_contexts_lock);
 	INIT_LIST_HEAD(&fi->rw_contexts);
+	fi->meta_err = errseq_sample(&ci->i_meta_err);
+	fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
 
 	return 0;
 }
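
fi->meta_err records an errseq_t sample at open, so each file later reports only metadata errors that happened after it was opened. A simplified stand-in for the idea; the real errseq_t in <linux/errseq.h> packs the error and sequence into one word and also tracks whether an error was already seen, so this sketch only shows the sample/set/check flow:

```c
#include <stdio.h>

/* Toy errseq: a writer records an error and bumps a sequence number;
 * each reader samples the sequence at open and later asks "did a new
 * error happen since my sample?". */
struct toy_errseq { int err; unsigned int seq; };

static unsigned int toy_sample(struct toy_errseq *e)
{
	return e->seq;
}

static void toy_set(struct toy_errseq *e, int err)
{
	e->err = err;
	e->seq++;
}

static int toy_check(struct toy_errseq *e, unsigned int since)
{
	return e->seq != since ? e->err : 0;
}

int main(void)
{
	struct toy_errseq meta = { 0, 0 };
	unsigned int at_open = toy_sample(&meta);	/* fi->meta_err */

	toy_set(&meta, -5);				/* -EIO from a dropped request */
	printf("fsync sees: %d\n", toy_check(&meta, at_open));
	return 0;
}
```
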
@@ -695,7 +699,13 @@
 			ceph_release_page_vector(pages, num_pages);
 		}
 
-		if (ret <= 0 || off >= i_size || !more)
+		if (ret < 0) {
+			if (ret == -EBLACKLISTED)
+				fsc->blacklisted = true;
+			break;
+		}
+
+		if (off >= i_size || !more)
 			break;
 	}
 
@@ -921,7 +931,7 @@
 	struct ceph_aio_request *aio_req = NULL;
 	int num_pages = 0;
 	int flags;
-	int ret;
+	int ret = 0;
 	struct timespec64 mtime = current_time(inode);
 	size_t count = iov_iter_count(iter);
 	loff_t pos = iocb->ki_pos;
@@ -935,11 +945,6 @@
 	     (write ? "write" : "read"), file, pos, (unsigned)count,
 	     snapc, snapc ? snapc->seq : 0);
 
-	ret = filemap_write_and_wait_range(inode->i_mapping,
-					   pos, pos + count - 1);
-	if (ret < 0)
-		return ret;
-
 	if (write) {
 		int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
 					pos >> PAGE_SHIFT,
@@ -1260,7 +1265,8 @@
 		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
 	else
 		want = CEPH_CAP_FILE_CACHE;
-	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+	ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1,
+			    &got, &pinned_page);
 	if (ret < 0)
 		return ret;
 
@@ -1274,12 +1280,16 @@
 
 		if (ci->i_inline_version == CEPH_INLINE_NONE) {
 			if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
+				ceph_start_io_direct(inode);
 				ret = ceph_direct_read_write(iocb, to,
 							     NULL, NULL);
+				ceph_end_io_direct(inode);
 				if (ret >= 0 && ret < len)
 					retry_op = CHECK_EOF;
 			} else {
+				ceph_start_io_read(inode);
 				ret = ceph_sync_read(iocb, to, &retry_op);
+				ceph_end_io_read(inode);
 			}
 		} else {
 			retry_op = READ_INLINE;
@@ -1290,7 +1300,9 @@
 		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
 		     ceph_cap_string(got));
 		ceph_add_rw_context(fi, &rw_ctx);
+		ceph_start_io_read(inode);
 		ret = generic_file_read_iter(iocb, to);
+		ceph_end_io_read(inode);
 		ceph_del_rw_context(fi, &rw_ctx);
 	}
 	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
@@ -1399,7 +1411,10 @@
 		return -ENOMEM;
 
 retry_snap:
-	inode_lock(inode);
+	if (iocb->ki_flags & IOCB_DIRECT)
+		ceph_start_io_direct(inode);
+	else
+		ceph_start_io_write(inode);
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
@@ -1457,7 +1472,7 @@
 	else
 		want = CEPH_CAP_FILE_BUFFER;
 	got = 0;
-	err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count,
+	err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count,
 			    &got, NULL);
 	if (err < 0)
 		goto out;
@@ -1470,7 +1485,6 @@
 	    (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
 		struct ceph_snap_context *snapc;
 		struct iov_iter data;
-		inode_unlock(inode);
 
 		spin_lock(&ci->i_ceph_lock);
 		if (__ceph_have_pending_cap_snap(ci)) {
@@ -1487,11 +1501,14 @@
 
 		/* we might need to revert back to that point */
 		data = *from;
-		if (iocb->ki_flags & IOCB_DIRECT)
+		if (iocb->ki_flags & IOCB_DIRECT) {
 			written = ceph_direct_read_write(iocb, &data, snapc,
 							 &prealloc_cf);
-		else
+			ceph_end_io_direct(inode);
+		} else {
 			written = ceph_sync_write(iocb, &data, pos, snapc);
+			ceph_end_io_write(inode);
+		}
 		if (written > 0)
 			iov_iter_advance(from, written);
 		ceph_put_snap_context(snapc);
@@ -1506,7 +1523,7 @@
 		written = generic_perform_write(file, from, pos);
 		if (likely(written >= 0))
 			iocb->ki_pos = pos + written;
-		inode_unlock(inode);
+		ceph_end_io_write(inode);
 	}
 
 	if (written >= 0) {
@@ -1541,9 +1558,11 @@
 	}
 
 	goto out_unlocked;
-
 out:
-	inode_unlock(inode);
+	if (iocb->ki_flags & IOCB_DIRECT)
+		ceph_end_io_direct(inode);
+	else
+		ceph_end_io_write(inode);
 out_unlocked:
 	ceph_free_cap_flush(prealloc_cf);
 	current->backing_dev_info = NULL;
@@ -1781,7 +1800,7 @@
 	else
 		want = CEPH_CAP_FILE_BUFFER;
 
-	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
+	ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
 	if (ret < 0)
 		goto unlock;
 
@@ -1810,16 +1829,15 @@
  * src_ci.  Two attempts are made to obtain both caps, and an error is returned if
  * this fails; zero is returned on success.
  */
-static int get_rd_wr_caps(struct ceph_inode_info *src_ci,
-			  loff_t src_endoff, int *src_got,
-			  struct ceph_inode_info *dst_ci,
+static int get_rd_wr_caps(struct file *src_filp, int *src_got,
+			  struct file *dst_filp,
 			  loff_t dst_endoff, int *dst_got)
 {
 	int ret = 0;
 	bool retrying = false;
 
 retry_caps:
-	ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
+	ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
 			    dst_endoff, dst_got, NULL);
 	if (ret < 0)
 		return ret;
@@ -1829,24 +1847,24 @@
 	 * we would risk a deadlock by using ceph_get_caps.  Thus, we'll do some
 	 * retry dance instead to try to get both capabilities.
 	 */
-	ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
+	ret = ceph_try_get_caps(file_inode(src_filp),
+				CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
 				false, src_got);
 	if (ret <= 0) {
 		/* Start by dropping dst_ci caps and getting src_ci caps */
-		ceph_put_cap_refs(dst_ci, *dst_got);
+		ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got);
 		if (retrying) {
 			if (!ret)
 				/* ceph_try_get_caps masks EAGAIN */
 				ret = -EAGAIN;
 			return ret;
 		}
-		ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD,
-				    CEPH_CAP_FILE_SHARED, src_endoff,
-				    src_got, NULL);
+		ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD,
+				    CEPH_CAP_FILE_SHARED, -1, src_got, NULL);
 		if (ret < 0)
 			return ret;
 		/*... drop src_ci caps too, and retry */
-		ceph_put_cap_refs(src_ci, *src_got);
+		ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got);
 		retrying = true;
 		goto retry_caps;
 	}
@@ -1904,6 +1922,7 @@
 	struct ceph_inode_info *src_ci = ceph_inode(src_inode);
 	struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
 	struct ceph_cap_flush *prealloc_cf;
+	struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
 	struct ceph_object_locator src_oloc, dst_oloc;
 	struct ceph_object_id src_oid, dst_oid;
 	loff_t endoff = 0, size;
@@ -1913,10 +1932,16 @@
 	int src_got = 0, dst_got = 0, err, dirty;
 	bool do_final_copy = false;
 
-	if (src_inode == dst_inode)
-		return -EINVAL;
-	if (src_inode->i_sb != dst_inode->i_sb)
-		return -EXDEV;
+	if (src_inode->i_sb != dst_inode->i_sb) {
+		struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
+
+		if (ceph_fsid_compare(&src_fsc->client->fsid,
+				      &dst_fsc->client->fsid)) {
+			dout("Copying files across clusters: src: %pU dst: %pU\n",
+			     &src_fsc->client->fsid, &dst_fsc->client->fsid);
+			return -EXDEV;
+		}
+	}
 	if (ceph_snap(dst_inode) != CEPH_NOSNAP)
 		return -EROFS;
 
@@ -1928,7 +1953,7 @@
 	 * efficient).
 	 */
 
-	if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM))
+	if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
 		return -EOPNOTSUPP;
 
 	if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
@@ -1960,8 +1985,8 @@
 	 * clients may have dirty data in their caches.  And OSDs know nothing
 	 * about caps, so they can't safely do the remote object copies.
 	 */
-	err = get_rd_wr_caps(src_ci, (src_off + len), &src_got,
-			     dst_ci, (dst_off + len), &dst_got);
+	err = get_rd_wr_caps(src_file, &src_got,
+			     dst_file, (dst_off + len), &dst_got);
 	if (err < 0) {
 		dout("get_rd_wr_caps returned %d\n", err);
 		ret = -EOPNOTSUPP;
@@ -2018,9 +2043,8 @@
 			goto out;
 		}
 		len -= ret;
-		err = get_rd_wr_caps(src_ci, (src_off + len),
-				     &src_got, dst_ci,
-				     (dst_off + len), &dst_got);
+		err = get_rd_wr_caps(src_file, &src_got,
+				     dst_file, (dst_off + len), &dst_got);
 		if (err < 0)
 			goto out;
 		err = is_file_size_ok(src_inode, dst_inode,
@@ -2044,7 +2068,7 @@
 				dst_ci->i_vino.ino, dst_objnum);
 		/* Do an object remote copy */
 		err = ceph_osdc_copy_from(
-			&ceph_inode_to_client(src_inode)->client->osdc,
+			&src_fsc->client->osdc,
 			src_ci->i_vino.snap, 0,
 			&src_oid, &src_oloc,
 			CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 18500ede..9f13562 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -515,6 +515,8 @@
 
 	ceph_fscache_inode_init(ci);
 
+	ci->i_meta_err = 0;
+
 	return &ci->vfs_inode;
 }
 
@@ -801,7 +803,12 @@
 
 	/* update inode */
 	inode->i_rdev = le32_to_cpu(info->rdev);
-	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+	/* directories have fl_stripe_unit set to zero */
+	if (le32_to_cpu(info->layout.fl_stripe_unit))
+		inode->i_blkbits =
+			fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+	else
+		inode->i_blkbits = CEPH_BLOCK_SHIFT;
 
 	__ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
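
The i_blkbits hunk above guards against directories, whose fl_stripe_unit is zero: kernel fls(0) returns 0, so the old code computed i_blkbits = -1. A quick userspace check of the arithmetic, assuming fls() semantics match the kernel's:

```c
#include <stdio.h>

#define CEPH_BLOCK_SHIFT 22	/* 4 MB, as in fs/ceph/super.h */

/* Kernel fls(): 1-based index of the most significant set bit, 0 for 0. */
static int fls(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
	unsigned int stripe_unit = 4 * 1024 * 1024;	/* a typical file layout */
	int blkbits = stripe_unit ? fls(stripe_unit) - 1 : CEPH_BLOCK_SHIFT;

	/* fls(0x400000) == 23, so blkbits == 22, i.e. 4 MB blocks; a
	 * directory's stripe_unit of 0 now falls back to 22 instead of
	 * the old fls(0) - 1 == -1. */
	printf("blkbits=%d block=%u\n", blkbits, 1u << blkbits);
	return 0;
}
```
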
 
@@ -1982,7 +1989,7 @@
 int __ceph_setattr(struct inode *inode, struct iattr *attr)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	const unsigned int ia_valid = attr->ia_valid;
+	unsigned int ia_valid = attr->ia_valid;
 	struct ceph_mds_request *req;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_cap_flush *prealloc_cf;
@@ -2087,6 +2094,26 @@
 				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
 		}
 	}
+	if (ia_valid & ATTR_SIZE) {
+		dout("setattr %p size %lld -> %lld\n", inode,
+		     inode->i_size, attr->ia_size);
+		if ((issued & CEPH_CAP_FILE_EXCL) &&
+		    attr->ia_size > inode->i_size) {
+			i_size_write(inode, attr->ia_size);
+			inode->i_blocks = calc_inode_blocks(attr->ia_size);
+			ci->i_reported_size = attr->ia_size;
+			dirtied |= CEPH_CAP_FILE_EXCL;
+			ia_valid |= ATTR_MTIME;
+		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+			   attr->ia_size != inode->i_size) {
+			req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+			req->r_args.setattr.old_size =
+				cpu_to_le64(inode->i_size);
+			mask |= CEPH_SETATTR_SIZE;
+			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+		}
+	}
 	if (ia_valid & ATTR_MTIME) {
 		dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
 		     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
@@ -2109,25 +2136,6 @@
 				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
 		}
 	}
-	if (ia_valid & ATTR_SIZE) {
-		dout("setattr %p size %lld -> %lld\n", inode,
-		     inode->i_size, attr->ia_size);
-		if ((issued & CEPH_CAP_FILE_EXCL) &&
-		    attr->ia_size > inode->i_size) {
-			i_size_write(inode, attr->ia_size);
-			inode->i_blocks = calc_inode_blocks(attr->ia_size);
-			ci->i_reported_size = attr->ia_size;
-			dirtied |= CEPH_CAP_FILE_EXCL;
-		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
-			   attr->ia_size != inode->i_size) {
-			req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
-			req->r_args.setattr.old_size =
-				cpu_to_le64(inode->i_size);
-			mask |= CEPH_SETATTR_SIZE;
-			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
-				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
-		}
-	}
 
 	/* these do nothing */
 	if (ia_valid & ATTR_CTIME) {
diff --git a/fs/ceph/io.c b/fs/ceph/io.c
new file mode 100644
index 0000000..97602ea
--- /dev/null
+++ b/fs/ceph/io.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016 Trond Myklebust
+ * Copyright (c) 2019 Jeff Layton
+ *
+ * I/O and data path helper functionality.
+ *
+ * Heavily borrowed from equivalent code in fs/nfs/io.c
+ */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+
+#include "super.h"
+#include "io.h"
+
+/* Call with exclusively locked inode->i_rwsem */
+static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
+{
+	lockdep_assert_held_write(&inode->i_rwsem);
+
+	if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) {
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
+		spin_unlock(&ci->i_ceph_lock);
+		inode_dio_wait(inode);
+	}
+}
+
+/**
+ * ceph_start_io_read - declare the file is being used for buffered reads
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * CEPH_I_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+void
+ceph_start_io_read(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	/* Be an optimist! */
+	down_read(&inode->i_rwsem);
+	if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
+		return;
+	up_read(&inode->i_rwsem);
+	/* Slow path.... */
+	down_write(&inode->i_rwsem);
+	ceph_block_o_direct(ci, inode);
+	downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * ceph_end_io_read - declare that the buffered read operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+ceph_end_io_read(struct inode *inode)
+{
+	up_read(&inode->i_rwsem);
+}
+
+/**
+ * ceph_start_io_write - declare the file is being used for buffered writes
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+void
+ceph_start_io_write(struct inode *inode)
+{
+	down_write(&inode->i_rwsem);
+	ceph_block_o_direct(ceph_inode(inode), inode);
+}
+
+/**
+ * ceph_end_io_write - declare that the buffered write operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void
+ceph_end_io_write(struct inode *inode)
+{
+	up_write(&inode->i_rwsem);
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
+{
+	lockdep_assert_held_write(&inode->i_rwsem);
+
+	if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) {
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_ceph_flags |= CEPH_I_ODIRECT;
+		spin_unlock(&ci->i_ceph_lock);
+		/* FIXME: unmap_mapping_range? */
+		filemap_write_and_wait(inode->i_mapping);
+	}
+}
+
+/**
+ * ceph_start_io_direct - declare the file is being used for direct i/o
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the CEPH_I_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * CEPH_I_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+void
+ceph_start_io_direct(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	/* Be an optimist! */
+	down_read(&inode->i_rwsem);
+	if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
+		return;
+	up_read(&inode->i_rwsem);
+	/* Slow path.... */
+	down_write(&inode->i_rwsem);
+	ceph_block_buffered(ci, inode);
+	downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * ceph_end_io_direct - declare that the direct i/o operation is done
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+ceph_end_io_direct(struct inode *inode)
+{
+	up_read(&inode->i_rwsem);
+}
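
The new fs/ceph/io.c serialises buffered against direct I/O by pairing the CEPH_I_ODIRECT flag with i_rwsem: take the lock shared, and only when the mode flag must flip, retake it exclusively and downgrade. A rough userspace analog with POSIX rwlocks; note that pthreads has no atomic downgrade, so the unlock/rdlock pair below has a race window that the kernel's downgrade_write() does not:

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_rwlock_t rwsem = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int odirect;	/* stands in for CEPH_I_ODIRECT */

static void start_buffered_read(void)
{
	pthread_rwlock_rdlock(&rwsem);		/* be an optimist */
	if (!atomic_load(&odirect))
		return;				/* fast path: mode already fits */
	pthread_rwlock_unlock(&rwsem);
	pthread_rwlock_wrlock(&rwsem);		/* slow path: flip the mode */
	atomic_store(&odirect, 0);		/* kernel also waits for DIO here */
	pthread_rwlock_unlock(&rwsem);		/* emulated downgrade: not */
	pthread_rwlock_rdlock(&rwsem);		/* atomic, unlike downgrade_write */
}

static void end_buffered_read(void)
{
	pthread_rwlock_unlock(&rwsem);
}

int main(void)
{
	atomic_store(&odirect, 1);
	start_buffered_read();
	printf("odirect=%d, read lock held shared\n", atomic_load(&odirect));
	end_buffered_read();
	return 0;
}
```
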
diff --git a/fs/ceph/io.h b/fs/ceph/io.h
new file mode 100644
index 0000000..fa594cd
--- /dev/null
+++ b/fs/ceph/io.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_IO_H
+#define _FS_CEPH_IO_H
+
+void ceph_start_io_read(struct inode *inode);
+void ceph_end_io_read(struct inode *inode);
+void ceph_start_io_write(struct inode *inode);
+void ceph_end_io_write(struct inode *inode);
+void ceph_start_io_direct(struct inode *inode);
+void ceph_end_io_direct(struct inode *inode);
+
+#endif /* _FS_CEPH_IO_H */
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 5083e23..544e9e8 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -32,14 +32,18 @@
 
 static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
 {
-	struct inode *inode = file_inode(src->fl_file);
+	struct ceph_file_info *fi = dst->fl_file->private_data;
+	struct inode *inode = file_inode(dst->fl_file);
 	atomic_inc(&ceph_inode(inode)->i_filelock_ref);
+	atomic_inc(&fi->num_locks);
 }
 
 static void ceph_fl_release_lock(struct file_lock *fl)
 {
+	struct ceph_file_info *fi = fl->fl_file->private_data;
 	struct inode *inode = file_inode(fl->fl_file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	atomic_dec(&fi->num_locks);
 	if (atomic_dec_and_test(&ci->i_filelock_ref)) {
 		/* clear error when all locks are released */
 		spin_lock(&ci->i_ceph_lock);
@@ -73,7 +77,7 @@
 		 * window. Caller function will decrease the counter.
 		 */
 		fl->fl_ops = &ceph_fl_lock_ops;
-		atomic_inc(&ceph_inode(inode)->i_filelock_ref);
+		fl->fl_ops->fl_copy_lock(fl, NULL);
 	}
 
 	if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 920e9f0..a8a8f84 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -639,7 +639,6 @@
 	s->s_renew_seq = 0;
 	INIT_LIST_HEAD(&s->s_caps);
 	s->s_nr_caps = 0;
-	s->s_trim_caps = 0;
 	refcount_set(&s->s_ref, 1);
 	INIT_LIST_HEAD(&s->s_waiting);
 	INIT_LIST_HEAD(&s->s_unsafe);
@@ -1270,6 +1269,7 @@
 {
 	struct ceph_mds_request *req;
 	struct rb_node *p;
+	struct ceph_inode_info *ci;
 
 	dout("cleanup_session_requests mds%d\n", session->s_mds);
 	mutex_lock(&mdsc->mutex);
@@ -1278,6 +1278,16 @@
 				       struct ceph_mds_request, r_unsafe_item);
 		pr_warn_ratelimited(" dropping unsafe request %llu\n",
 				    req->r_tid);
+		if (req->r_target_inode) {
+			/* dropping unsafe change of inode's attributes */
+			ci = ceph_inode(req->r_target_inode);
+			errseq_set(&ci->i_meta_err, -EIO);
+		}
+		if (req->r_unsafe_dir) {
+			/* dropping unsafe directory operation */
+			ci = ceph_inode(req->r_unsafe_dir);
+			errseq_set(&ci->i_meta_err, -EIO);
+		}
 		__unregister_request(mdsc, req);
 	}
 	/* zero r_attempts, so kick_requests() will re-send requests */
@@ -1370,7 +1380,7 @@
 	struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	LIST_HEAD(to_remove);
-	bool drop = false;
+	bool dirty_dropped = false;
 	bool invalidate = false;
 
 	dout("removing cap %p, ci is %p, inode is %p\n",
@@ -1383,9 +1393,12 @@
 		struct ceph_cap_flush *cf;
 		struct ceph_mds_client *mdsc = fsc->mdsc;
 
-		if (ci->i_wrbuffer_ref > 0 &&
-		    READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
-			invalidate = true;
+		if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+			if (inode->i_data.nrpages > 0)
+				invalidate = true;
+			if (ci->i_wrbuffer_ref > 0)
+				mapping_set_error(&inode->i_data, -EIO);
+		}
 
 		while (!list_empty(&ci->i_cap_flush_list)) {
 			cf = list_first_entry(&ci->i_cap_flush_list,
@@ -1405,7 +1418,7 @@
 				inode, ceph_ino(inode));
 			ci->i_dirty_caps = 0;
 			list_del_init(&ci->i_dirty_item);
-			drop = true;
+			dirty_dropped = true;
 		}
 		if (!list_empty(&ci->i_flushing_item)) {
 			pr_warn_ratelimited(
@@ -1415,10 +1428,22 @@
 			ci->i_flushing_caps = 0;
 			list_del_init(&ci->i_flushing_item);
 			mdsc->num_cap_flushing--;
-			drop = true;
+			dirty_dropped = true;
 		}
 		spin_unlock(&mdsc->cap_dirty_lock);
 
+		if (dirty_dropped) {
+			errseq_set(&ci->i_meta_err, -EIO);
+
+			if (ci->i_wrbuffer_ref_head == 0 &&
+			    ci->i_wr_ref == 0 &&
+			    ci->i_dirty_caps == 0 &&
+			    ci->i_flushing_caps == 0) {
+				ceph_put_snap_context(ci->i_head_snapc);
+				ci->i_head_snapc = NULL;
+			}
+		}
+
 		if (atomic_read(&ci->i_filelock_ref) > 0) {
 			/* make further file lock syscall return -EIO */
 			ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
@@ -1430,15 +1455,6 @@
 			list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
 			ci->i_prealloc_cap_flush = NULL;
 		}
-
-               if (drop &&
-                  ci->i_wrbuffer_ref_head == 0 &&
-                  ci->i_wr_ref == 0 &&
-                  ci->i_dirty_caps == 0 &&
-                  ci->i_flushing_caps == 0) {
-                      ceph_put_snap_context(ci->i_head_snapc);
-                      ci->i_head_snapc = NULL;
-               }
 	}
 	spin_unlock(&ci->i_ceph_lock);
 	while (!list_empty(&to_remove)) {
@@ -1452,7 +1468,7 @@
 	wake_up_all(&ci->i_cap_wq);
 	if (invalidate)
 		ceph_queue_invalidate(inode);
-	if (drop)
+	if (dirty_dropped)
 		iput(inode);
 	return 0;
 }
@@ -1705,11 +1721,11 @@
  */
 static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
 {
-	struct ceph_mds_session *session = arg;
+	int *remaining = arg;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int used, wanted, oissued, mine;
 
-	if (session->s_trim_caps <= 0)
+	if (*remaining <= 0)
 		return -1;
 
 	spin_lock(&ci->i_ceph_lock);
@@ -1746,7 +1762,7 @@
 	if (oissued) {
 		/* we aren't the only cap.. just remove us */
 		__ceph_remove_cap(cap, true);
-		session->s_trim_caps--;
+		(*remaining)--;
 	} else {
 		struct dentry *dentry;
 		/* try dropping referring dentries */
@@ -1758,7 +1774,7 @@
 			d_prune_aliases(inode);
 			count = atomic_read(&inode->i_count);
 			if (count == 1)
-				session->s_trim_caps--;
+				(*remaining)--;
 			dout("trim_caps_cb %p cap %p pruned, count now %d\n",
 			     inode, cap, count);
 		} else {
@@ -1784,12 +1800,12 @@
 	dout("trim_caps mds%d start: %d / %d, trim %d\n",
 	     session->s_mds, session->s_nr_caps, max_caps, trim_caps);
 	if (trim_caps > 0) {
-		session->s_trim_caps = trim_caps;
-		ceph_iterate_session_caps(session, trim_caps_cb, session);
+		int remaining = trim_caps;
+
+		ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
 		dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
 		     session->s_mds, session->s_nr_caps, max_caps,
-			trim_caps - session->s_trim_caps);
-		session->s_trim_caps = 0;
+			trim_caps - remaining);
 	}
 
 	ceph_flush_cap_releases(mdsc, session);
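
Passing &remaining through the iterator's opaque callback argument, as the trim_caps hunks above do, keeps the trim budget on the caller's stack instead of in shared session state. The shape of that pattern, with hypothetical names:

```c
#include <stdio.h>

/* The iterator stays generic: it only threads the opaque cookie through. */
static void iterate(const int *caps, int n,
		    int (*cb)(int cap, void *arg), void *arg)
{
	for (int i = 0; i < n; i++)
		if (cb(caps[i], arg) < 0)
			break;
}

/* Like trim_caps_cb(): spend a caller-owned budget instead of mutating
 * a field in shared session state. */
static int trim_cb(int cap, void *arg)
{
	int *remaining = arg;

	(void)cap;
	if (*remaining <= 0)
		return -1;	/* budget exhausted, stop iterating */
	(*remaining)--;		/* "trimmed" this cap */
	return 0;
}

int main(void)
{
	int caps[] = { 1, 2, 3, 4, 5 };
	int remaining = 3;

	iterate(caps, 5, trim_cb, &remaining);
	printf("trimmed %d caps\n", 3 - remaining);
	return 0;
}
```
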
@@ -3015,18 +3031,23 @@
 	pr_err("mdsc_handle_forward decode error err=%d\n", err);
 }
 
-static int __decode_and_drop_session_metadata(void **p, void *end)
+static int __decode_session_metadata(void **p, void *end,
+				     bool *blacklisted)
 {
 	/* map<string,string> */
 	u32 n;
+	bool err_str;
 	ceph_decode_32_safe(p, end, n, bad);
 	while (n-- > 0) {
 		u32 len;
 		ceph_decode_32_safe(p, end, len, bad);
 		ceph_decode_need(p, end, len, bad);
+		err_str = !strncmp(*p, "error_string", len);
 		*p += len;
 		ceph_decode_32_safe(p, end, len, bad);
 		ceph_decode_need(p, end, len, bad);
+		if (err_str && strnstr(*p, "blacklisted", len))
+			*blacklisted = true;
 		*p += len;
 	}
 	return 0;
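
__decode_session_metadata() walks the metadata as a count-prefixed list of (u32 length, bytes) strings, remembering whether the key was "error_string" and then substring-matching the value. A hypothetical userspace decoder for the same layout, assuming little-endian lengths and using GNU memmem() in place of the kernel's strnstr():

```c
#define _GNU_SOURCE		/* for memmem(), a glibc extension */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy decoder: u32 pair count, then for each pair a (u32 len, bytes)
 * key and a (u32 len, bytes) value. Assumes a well-formed buffer. */
static bool session_blacklisted(const uint8_t *p)
{
	uint32_t n;

	memcpy(&n, p, 4); p += 4;
	while (n-- > 0) {
		uint32_t len;
		bool err_str;

		memcpy(&len, p, 4); p += 4;
		err_str = len == 12 && !memcmp(p, "error_string", 12);
		p += len;
		memcpy(&len, p, 4); p += 4;
		if (err_str && memmem(p, len, "blacklisted", 11))
			return true;
		p += len;
	}
	return false;
}

int main(void)
{
	/* one pair: "error_string" -> "client blacklisted by monitor" */
	uint8_t buf[64], *q = buf;
	uint32_t v;

	v = 1;  memcpy(q, &v, 4); q += 4;
	v = 12; memcpy(q, &v, 4); q += 4; memcpy(q, "error_string", 12); q += 12;
	v = 29; memcpy(q, &v, 4); q += 4;
	memcpy(q, "client blacklisted by monitor", 29);
	printf("blacklisted=%d\n", session_blacklisted(buf));
	return 0;
}
```
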
@@ -3050,6 +3071,7 @@
 	u64 seq;
 	unsigned long features = 0;
 	int wake = 0;
+	bool blacklisted = false;
 
 	/* decode */
 	ceph_decode_need(&p, end, sizeof(*h), bad);
@@ -3062,7 +3084,7 @@
 	if (msg_version >= 3) {
 		u32 len;
 		/* version >= 2, metadata */
-		if (__decode_and_drop_session_metadata(&p, end) < 0)
+		if (__decode_session_metadata(&p, end, &blacklisted) < 0)
 			goto bad;
 		/* version >= 3, feature bits */
 		ceph_decode_32_safe(&p, end, len, bad);
@@ -3149,6 +3171,8 @@
 		session->s_state = CEPH_MDS_SESSION_REJECTED;
 		cleanup_session_requests(mdsc, session);
 		remove_session_caps(session);
+		if (blacklisted)
+			mdsc->fsc->blacklisted = true;
 		wake = 2; /* for good measure */
 		break;
 
@@ -3998,7 +4022,27 @@
 	mutex_unlock(&mdsc->mutex);
 }
 
+static void maybe_recover_session(struct ceph_mds_client *mdsc)
+{
+	struct ceph_fs_client *fsc = mdsc->fsc;
 
+	if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
+		return;
+
+	if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
+		return;
+
+	if (!READ_ONCE(fsc->blacklisted))
+		return;
+
+	if (fsc->last_auto_reconnect &&
+	    time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
+		return;
+
+	pr_info("auto reconnect after blacklisted\n");
+	fsc->last_auto_reconnect = jiffies;
+	ceph_force_reconnect(fsc->sb);
+}
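
maybe_recover_session() is a chain of cheap bail-outs followed by a throttle of HZ * 60 * 30 jiffies, i.e. 30 minutes, on the expensive reconnect. The same structure with wall-clock time() standing in for jiffies:

```c
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static time_t last_auto_reconnect;

static bool maybe_reconnect(bool clean_recover, bool mounted, bool blacklisted)
{
	time_t now = time(NULL);

	/* cheap bail-outs first, exactly like the kernel helper */
	if (!clean_recover || !mounted || !blacklisted)
		return false;
	/* then the 30-minute rate limit on the expensive action */
	if (last_auto_reconnect && now < last_auto_reconnect + 30 * 60)
		return false;

	last_auto_reconnect = now;
	puts("auto reconnect after blacklisted");
	return true;	/* caller would invoke ceph_force_reconnect() here */
}

int main(void)
{
	printf("%d\n", maybe_reconnect(true, true, true));	/* fires */
	printf("%d\n", maybe_reconnect(true, true, true));	/* rate-limited */
	return 0;
}
```
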
 
 /*
  * delayed work -- periodically trim expired leases, renew caps with mds
@@ -4044,7 +4088,9 @@
 				pr_info("mds%d hung\n", s->s_mds);
 			}
 		}
-		if (s->s_state < CEPH_MDS_SESSION_OPEN) {
+		if (s->s_state == CEPH_MDS_SESSION_NEW ||
+		    s->s_state == CEPH_MDS_SESSION_RESTARTING ||
+		    s->s_state == CEPH_MDS_SESSION_REJECTED) {
 			/* this mds is failed or recovering, just wait */
 			ceph_put_mds_session(s);
 			continue;
@@ -4072,6 +4118,8 @@
 
 	ceph_trim_snapid_map(mdsc);
 
+	maybe_recover_session(mdsc);
+
 	schedule_delayed(mdsc);
 }
 
@@ -4355,7 +4403,12 @@
 		session = __ceph_lookup_mds_session(mdsc, mds);
 		if (!session)
 			continue;
+
+		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
+			__unregister_session(mdsc, session);
+		__wake_requests(mdsc, &session->s_waiting);
 		mutex_unlock(&mdsc->mutex);
+
 		mutex_lock(&session->s_mutex);
 		__close_session(mdsc, session);
 		if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
@@ -4364,6 +4417,7 @@
 		}
 		mutex_unlock(&session->s_mutex);
 		ceph_put_mds_session(session);
+
 		mutex_lock(&mdsc->mutex);
 		kick_requests(mdsc, mds);
 	}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index f7c8603..5cd131b 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -148,9 +148,9 @@
 	CEPH_MDS_SESSION_OPENING = 2,
 	CEPH_MDS_SESSION_OPEN = 3,
 	CEPH_MDS_SESSION_HUNG = 4,
-	CEPH_MDS_SESSION_CLOSING = 5,
-	CEPH_MDS_SESSION_RESTARTING = 6,
-	CEPH_MDS_SESSION_RECONNECTING = 7,
+	CEPH_MDS_SESSION_RESTARTING = 5,
+	CEPH_MDS_SESSION_RECONNECTING = 6,
+	CEPH_MDS_SESSION_CLOSING = 7,
 	CEPH_MDS_SESSION_REJECTED = 8,
 };
 
@@ -176,7 +176,7 @@
 	spinlock_t        s_cap_lock;
 	struct list_head  s_caps;     /* all caps issued by this session */
 	struct ceph_cap  *s_cap_iterator;
-	int               s_nr_caps, s_trim_caps;
+	int               s_nr_caps;
 	int               s_num_cap_releases;
 	int		  s_cap_reconnect;
 	int		  s_readonly;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 377fafc..edfd643 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -143,6 +143,7 @@
 	Opt_snapdirname,
 	Opt_mds_namespace,
 	Opt_fscache_uniq,
+	Opt_recover_session,
 	Opt_last_string,
 	/* string args above */
 	Opt_dirstat,
@@ -184,6 +185,7 @@
 	/* int args above */
 	{Opt_snapdirname, "snapdirname=%s"},
 	{Opt_mds_namespace, "mds_namespace=%s"},
+	{Opt_recover_session, "recover_session=%s"},
 	{Opt_fscache_uniq, "fsc=%s"},
 	/* string args above */
 	{Opt_dirstat, "dirstat"},
@@ -254,6 +256,17 @@
 		if (!fsopt->mds_namespace)
 			return -ENOMEM;
 		break;
+	case Opt_recover_session:
+		if (!strncmp(argstr[0].from, "no",
+			     argstr[0].to - argstr[0].from)) {
+			fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER;
+		} else if (!strncmp(argstr[0].from, "clean",
+				    argstr[0].to - argstr[0].from)) {
+			fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER;
+		} else {
+			return -EINVAL;
+		}
+		break;
 	case Opt_fscache_uniq:
 		kfree(fsopt->fscache_uniq);
 		fsopt->fscache_uniq = kstrndup(argstr[0].from,
@@ -576,6 +589,10 @@
 
 	if (fsopt->mds_namespace)
 		seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
+
+	if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
+		seq_show_option(m, "recover_session", "clean");
+
 	if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
 		seq_printf(m, ",wsize=%d", fsopt->wsize);
 	if (fsopt->rsize != CEPH_MAX_READ_SIZE)
@@ -664,6 +681,7 @@
 
 	fsc->sb = NULL;
 	fsc->mount_state = CEPH_MOUNT_MOUNTING;
+	fsc->filp_gen = 1;
 
 	atomic_long_set(&fsc->writeback_count, 0);
 
@@ -713,6 +731,7 @@
 {
 	dout("destroy_fs_client %p\n", fsc);
 
+	ceph_mdsc_destroy(fsc);
 	destroy_workqueue(fsc->inode_wq);
 	destroy_workqueue(fsc->cap_wq);
 
@@ -829,7 +848,7 @@
 	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
 	ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
 	ceph_mdsc_force_umount(fsc->mdsc);
-	return;
+	fsc->filp_gen++; /* invalidate open files */
 }
 
 static int ceph_remount(struct super_block *sb, int *flags, char *data)
@@ -1089,7 +1108,6 @@
 	}
 
 	if (ceph_sb_to_client(sb) != fsc) {
-		ceph_mdsc_destroy(fsc);
 		destroy_fs_client(fsc);
 		fsc = ceph_sb_to_client(sb);
 		dout("get_sb got existing client %p\n", fsc);
@@ -1115,7 +1133,6 @@
 	goto out_final;
 
 out:
-	ceph_mdsc_destroy(fsc);
 	destroy_fs_client(fsc);
 out_final:
 	dout("ceph_mount fail %ld\n", PTR_ERR(res));
@@ -1139,8 +1156,6 @@
 
 	ceph_fscache_unregister_fs(fsc);
 
-	ceph_mdsc_destroy(fsc);
-
 	destroy_fs_client(fsc);
 	free_anon_bdev(dev);
 }
@@ -1154,6 +1169,33 @@
 };
 MODULE_ALIAS_FS("ceph");
 
+int ceph_force_reconnect(struct super_block *sb)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+	int err = 0;
+
+	ceph_umount_begin(sb);
+
+	/* Make sure all page caches get invalidated.
+	 * see remove_session_caps_cb() */
+	flush_workqueue(fsc->inode_wq);
+
+	/* In case we were blacklisted. This also resets
+	 * all mon/osd connections. */
+	ceph_reset_client_addr(fsc->client);
+
+	ceph_osdc_clear_abort_err(&fsc->client->osdc);
+
+	fsc->blacklisted = false;
+	fsc->mount_state = CEPH_MOUNT_MOUNTED;
+
+	if (sb->s_root) {
+		err = __ceph_do_getattr(d_inode(sb->s_root), NULL,
+					CEPH_STAT_CAP_INODE, true);
+	}
+	return err;
+}
+
 static int __init init_ceph(void)
 {
 	int ret = init_caches();
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6b9f1ee..f98d924 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/posix_acl.h>
 #include <linux/refcount.h>
+#include <linux/security.h>
 
 #include <linux/ceph/libceph.h>
 
@@ -31,6 +32,7 @@
 #define CEPH_BLOCK_SHIFT   22  /* 4 MB */
 #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
 
+#define CEPH_MOUNT_OPT_CLEANRECOVER    (1<<1) /* auto reconnect (clean mode) after blacklisted */
 #define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
 #define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
 #define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
@@ -101,6 +103,11 @@
 	struct ceph_client *client;
 
 	unsigned long mount_state;
+
+	unsigned long last_auto_reconnect;
+	bool blacklisted;
+
+	u32 filp_gen;
 	loff_t max_file_size;
 
 	struct ceph_mds_client *mdsc;
@@ -395,6 +402,8 @@
 	struct fscache_cookie *fscache;
 	u32 i_fscache_gen;
 #endif
+	errseq_t i_meta_err;
+
 	struct inode vfs_inode; /* at end */
 };
 
@@ -499,17 +508,16 @@
 #define CEPH_I_DIR_ORDERED	(1 << 0)  /* dentries in dir are ordered */
 #define CEPH_I_NODELAY		(1 << 1)  /* do not delay cap release */
 #define CEPH_I_FLUSH		(1 << 2)  /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH		(1 << 3)  /* do not flush dirty caps */
-#define CEPH_I_POOL_PERM	(1 << 4)  /* pool rd/wr bits are valid */
-#define CEPH_I_POOL_RD		(1 << 5)  /* can read from pool */
-#define CEPH_I_POOL_WR		(1 << 6)  /* can write to pool */
-#define CEPH_I_SEC_INITED	(1 << 7)  /* security initialized */
-#define CEPH_I_CAP_DROPPED	(1 << 8)  /* caps were forcibly dropped */
-#define CEPH_I_KICK_FLUSH	(1 << 9)  /* kick flushing caps */
-#define CEPH_I_FLUSH_SNAPS	(1 << 10) /* need flush snapss */
-#define CEPH_I_ERROR_WRITE	(1 << 11) /* have seen write errors */
-#define CEPH_I_ERROR_FILELOCK	(1 << 12) /* have seen file lock errors */
-
+#define CEPH_I_POOL_PERM	(1 << 3)  /* pool rd/wr bits are valid */
+#define CEPH_I_POOL_RD		(1 << 4)  /* can read from pool */
+#define CEPH_I_POOL_WR		(1 << 5)  /* can write to pool */
+#define CEPH_I_SEC_INITED	(1 << 6)  /* security initialized */
+#define CEPH_I_CAP_DROPPED	(1 << 7)  /* caps were forcibly dropped */
+#define CEPH_I_KICK_FLUSH	(1 << 8)  /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS	(1 << 9)  /* need flush snapss */
+#define CEPH_I_ERROR_WRITE	(1 << 10) /* have seen write errors */
+#define CEPH_I_ERROR_FILELOCK	(1 << 11) /* have seen file lock errors */
+#define CEPH_I_ODIRECT		(1 << 12) /* inode in direct I/O mode */
 
 /*
  * Masks of ceph inode work.
@@ -703,6 +711,10 @@
 
 	spinlock_t rw_contexts_lock;
 	struct list_head rw_contexts;
+
+	errseq_t meta_err;
+	u32 filp_gen;
+	atomic_t num_locks;
 };
 
 struct ceph_dir_file_info {
@@ -842,7 +854,8 @@
 }
 
 
-
+/* super.c */
+extern int ceph_force_reconnect(struct super_block *sb);
 /* snap.c */
 struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
 					       u64 ino);
@@ -959,7 +972,10 @@
 #ifdef CONFIG_CEPH_FS_SECURITY_LABEL
 extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
 				     struct ceph_acl_sec_ctx *ctx);
-extern void ceph_security_invalidate_secctx(struct inode *inode);
+static inline void ceph_security_invalidate_secctx(struct inode *inode)
+{
+	security_inode_invalidate_secctx(inode);
+}
 #else
 static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
 					    struct ceph_acl_sec_ctx *ctx)
@@ -1039,7 +1055,6 @@
 				    struct ceph_mds_session *session);
 extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
 					     int mds);
-extern int ceph_get_cap_mds(struct inode *inode);
 extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
 extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
 extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
@@ -1058,9 +1073,9 @@
 				      struct inode *dir,
 				      int mds, int drop, int unless);
 
-extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+extern int ceph_get_caps(struct file *filp, int need, int want,
 			 loff_t endoff, int *got, struct page **pinned_page);
-extern int ceph_try_get_caps(struct ceph_inode_info *ci,
+extern int ceph_try_get_caps(struct inode *inode,
 			     int need, int want, bool nonblock, int *got);
 
 /* for counting open files by mode */
@@ -1071,7 +1086,7 @@
 extern const struct address_space_operations ceph_aops;
 extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
 extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
-extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need);
+extern int ceph_pool_perm_check(struct inode *inode, int need);
 extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
 
 /* file.c */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 939eab7..cb18ee6 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -20,7 +20,8 @@
 
 static bool ceph_is_valid_xattr(const char *name)
 {
-	return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
+	return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
+	       !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
 }
@@ -892,7 +893,8 @@
 	memcpy(value, xattr->val, xattr->val_len);
 
 	if (current->journal_info &&
-	    !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
+	    !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
+	    security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN))
 		ci->i_ceph_flags |= CEPH_I_SEC_INITED;
 out:
 	spin_unlock(&ci->i_ceph_lock);
@@ -903,11 +905,9 @@
 {
 	struct inode *inode = d_inode(dentry);
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
 	bool len_only = (size == 0);
 	u32 namelen;
 	int err;
-	int i;
 
 	spin_lock(&ci->i_ceph_lock);
 	dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
@@ -936,33 +936,6 @@
 		names = __copy_xattr_names(ci, names);
 		size -= namelen;
 	}
-
-
-	/* virtual xattr names, too */
-	if (vxattrs) {
-		for (i = 0; vxattrs[i].name; i++) {
-			size_t this_len;
-
-			if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN)
-				continue;
-			if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci))
-				continue;
-
-			this_len = strlen(vxattrs[i].name) + 1;
-			namelen += this_len;
-			if (len_only)
-				continue;
-
-			if (this_len > size) {
-				err = -ERANGE;
-				goto out;
-			}
-
-			memcpy(names, vxattrs[i].name, this_len);
-			names += this_len;
-			size -= this_len;
-		}
-	}
 	err = namelen;
 out:
 	spin_unlock(&ci->i_ceph_lock);
@@ -1293,42 +1266,8 @@
 		ceph_pagelist_release(pagelist);
 	return err;
 }
-
-void ceph_security_invalidate_secctx(struct inode *inode)
-{
-	security_inode_invalidate_secctx(inode);
-}
-
-static int ceph_xattr_set_security_label(const struct xattr_handler *handler,
-				    struct dentry *unused, struct inode *inode,
-				    const char *key, const void *buf,
-				    size_t buflen, int flags)
-{
-	if (security_ismaclabel(key)) {
-		const char *name = xattr_full_name(handler, key);
-		return __ceph_setxattr(inode, name, buf, buflen, flags);
-	}
-	return  -EOPNOTSUPP;
-}
-
-static int ceph_xattr_get_security_label(const struct xattr_handler *handler,
-				    struct dentry *unused, struct inode *inode,
-				    const char *key, void *buf, size_t buflen)
-{
-	if (security_ismaclabel(key)) {
-		const char *name = xattr_full_name(handler, key);
-		return __ceph_getxattr(inode, name, buf, buflen);
-	}
-	return  -EOPNOTSUPP;
-}
-
-static const struct xattr_handler ceph_security_label_handler = {
-	.prefix = XATTR_SECURITY_PREFIX,
-	.get    = ceph_xattr_get_security_label,
-	.set    = ceph_xattr_set_security_label,
-};
-#endif
-#endif
+#endif /* CONFIG_CEPH_FS_SECURITY_LABEL */
+#endif /* CONFIG_SECURITY */
 
 void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
 {
@@ -1352,9 +1291,6 @@
 	&posix_acl_access_xattr_handler,
 	&posix_acl_default_xattr_handler,
 #endif
-#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
-	&ceph_security_label_handler,
-#endif
 	&ceph_other_xattr_handler,
 	NULL,
 };
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index 6c3bd07..0f0dc1c 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -57,9 +57,18 @@
 	/* char buffer[]; */
 } __packed;
 
+struct smb3_key_debug_info {
+	__u64	Suid;
+	__u16	cipher_type;
+	__u8	auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */
+	__u8	smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
+	__u8	smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
+} __packed;
+
 #define CIFS_IOCTL_MAGIC	0xCF
 #define CIFS_IOC_COPYCHUNK_FILE	_IOW(CIFS_IOCTL_MAGIC, 3, int)
 #define CIFS_IOC_SET_INTEGRITY  _IO(CIFS_IOCTL_MAGIC, 4)
 #define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info)
 #define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array)
 #define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info)
+#define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info)
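
A userspace sketch of driving the new CIFS_DUMP_KEY ioctl, e.g. to hand session keys to a protocol decoder. The struct is duplicated locally, SMB3_SIGN_KEY_SIZE is assumed to be 16 to match the kernel headers, and per the ioctl handler later in this patch the call needs CAP_SYS_ADMIN on an encrypted mount (the path below is hypothetical):

```c
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define SMB3_SIGN_KEY_SIZE 16	/* assumed; matches fs/cifs/smb2pdu.h */

/* Local mirror of the new struct; layout must match the kernel's. */
struct smb3_key_debug_info {
	uint64_t Suid;
	uint16_t cipher_type;
	uint8_t  auth_key[16];	/* SMB2_NTLMV2_SESSKEY_SIZE */
	uint8_t  smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
	uint8_t  smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
} __attribute__((packed));

#define CIFS_IOCTL_MAGIC 0xCF
#define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info)

int main(int argc, char **argv)
{
	struct smb3_key_debug_info k = { 0 };
	/* hypothetical path: any open file on the encrypted cifs mount */
	int fd = open(argc > 1 ? argv[1] : "/mnt/cifs/file", O_RDONLY);

	if (fd < 0 || ioctl(fd, CIFS_DUMP_KEY, &k) < 0) {
		perror("CIFS_DUMP_KEY");
		return 1;
	}
	printf("session 0x%llx cipher %u\n",
	       (unsigned long long)k.Suid, k.cipher_type);
	close(fd);
	return 0;
}
```
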
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index eb42834..439b99c 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -90,8 +90,39 @@
 	__le32 num_aces;
 } __attribute__((packed));
 
+/* ACE types - see MS-DTYP 2.4.4.1 */
+#define ACCESS_ALLOWED_ACE_TYPE	0x00
+#define ACCESS_DENIED_ACE_TYPE	0x01
+#define SYSTEM_AUDIT_ACE_TYPE	0x02
+#define SYSTEM_ALARM_ACE_TYPE	0x03
+#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04
+#define ACCESS_ALLOWED_OBJECT_ACE_TYPE	0x05
+#define ACCESS_DENIED_OBJECT_ACE_TYPE	0x06
+#define SYSTEM_AUDIT_OBJECT_ACE_TYPE	0x07
+#define SYSTEM_ALARM_OBJECT_ACE_TYPE	0x08
+#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09
+#define ACCESS_DENIED_CALLBACK_ACE_TYPE	0x0A
+#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B
+#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE  0x0C
+#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE	0x0D
+#define SYSTEM_ALARM_CALLBACK_ACE_TYPE	0x0E /* Reserved */
+#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F
+#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* Reserved */
+#define SYSTEM_MANDATORY_LABEL_ACE_TYPE	0x11
+#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12
+#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13
+
+/* ACE flags */
+#define OBJECT_INHERIT_ACE	0x01
+#define CONTAINER_INHERIT_ACE	0x02
+#define NO_PROPAGATE_INHERIT_ACE 0x04
+#define INHERIT_ONLY_ACE	0x08
+#define INHERITED_ACE		0x10
+#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40
+#define FAILED_ACCESS_ACE_FLAG	0x80
+
 struct cifs_ace {
-	__u8 type;
+	__u8 type; /* see above and MS-DTYP 2.4.4.1 */
 	__u8 flags;
 	__le16 size;
 	__le32 access_req;
@@ -99,6 +130,54 @@
 } __attribute__((packed));
 
 /*
+ * The current SMB3 form of security descriptor is similar to what was used for
+ * cifs (see above), but some fields are split, and the fields in the struct
+ * below match the names used in the spec, MS-DTYP (see sections 2.4.5 and
+ * 2.4.6). Note that "CamelCase" fields are used in this struct in order to
+ * match the MS-DTYP and MS-SMB2 specs which define the wire format.
+ */
+struct smb3_sd {
+	__u8 Revision; /* revision level, MUST be one */
+	__u8 Sbz1; /* only meaningful if 'RM' flag set below */
+	__le16 Control;
+	__le32 OffsetOwner;
+	__le32 OffsetGroup;
+	__le32 OffsetSacl;
+	__le32 OffsetDacl;
+} __packed;
+
+/* Meaning of 'Control' field flags */
+#define ACL_CONTROL_SR	0x0001	/* Self relative */
+#define ACL_CONTROL_RM	0x0002	/* Resource manager control bits */
+#define ACL_CONTROL_PS	0x0004	/* SACL protected from inherits */
+#define ACL_CONTROL_PD	0x0008	/* DACL protected from inherits */
+#define ACL_CONTROL_SI	0x0010	/* SACL Auto-Inherited */
+#define ACL_CONTROL_DI	0x0020	/* DACL Auto-Inherited */
+#define ACL_CONTROL_SC	0x0040	/* SACL computed through inheritance */
+#define ACL_CONTROL_DC	0x0080	/* DACL computed through inheritance */
+#define ACL_CONTROL_SS	0x0100	/* Create server ACL */
+#define ACL_CONTROL_DT	0x0200	/* DACL provided by trusted source */
+#define ACL_CONTROL_SD	0x0400	/* SACL defaulted */
+#define ACL_CONTROL_SP	0x0800	/* SACL is present on object */
+#define ACL_CONTROL_DD	0x1000	/* DACL defaulted */
+#define ACL_CONTROL_DP	0x2000	/* DACL is present on object */
+#define ACL_CONTROL_GD	0x4000	/* Group was defaulted */
+#define ACL_CONTROL_OD	0x8000	/* User was defaulted */
+
+/* Meaning of AclRevision flags */
+#define ACL_REVISION	0x02 /* See section 2.4.4.1 of MS-DTYP */
+#define ACL_REVISION_DS	0x04 /* Additional AceTypes allowed */
+
+struct smb3_acl {
+	u8 AclRevision; /* revision level */
+	u8 Sbz1; /* MBZ */
+	__le16 AclSize;
+	__le16 AceCount;
+	__le16 Sbz2; /* MBZ */
+} __packed;
+
+
+/*
  * Minimum security identifier can be one for system defined Users
  * and Groups such as NULL SID and World or Built-in accounts such
  * as Administrator and Guest and consists of
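
Because the descriptor is self-relative, the Offset* fields locate every component inside a single buffer. A hypothetical parse of the DACL using the layouts above (little-endian host assumed, no bounds checking, illustration only):

```c
#include <stdint.h>
#include <stdio.h>

struct smb3_sd {
	uint8_t  Revision, Sbz1;
	uint16_t Control;
	uint32_t OffsetOwner, OffsetGroup, OffsetSacl, OffsetDacl;
} __attribute__((packed));

struct smb3_acl {
	uint8_t  AclRevision, Sbz1;
	uint16_t AclSize, AceCount, Sbz2;
} __attribute__((packed));

static void dump_dacl(const uint8_t *buf)
{
	const struct smb3_sd *sd = (const void *)buf;

	if (!(sd->Control & 0x2000))	/* ACL_CONTROL_DP: DACL present */
		return;
	const struct smb3_acl *dacl = (const void *)(buf + sd->OffsetDacl);
	printf("DACL rev %u: %u ACEs in %u bytes\n",
	       dacl->AclRevision, dacl->AceCount, dacl->AclSize);
}

int main(void)
{
	uint8_t buf[sizeof(struct smb3_sd) + sizeof(struct smb3_acl)] = { 0 };
	struct smb3_sd *sd = (void *)buf;
	struct smb3_acl *acl = (void *)(buf + sizeof(*sd));

	sd->Revision = 1;		/* MUST be one */
	sd->Control = 0x2000 | 0x0001;	/* DP | SR (self-relative) */
	sd->OffsetDacl = sizeof(*sd);
	acl->AclRevision = 2;		/* ACL_REVISION */
	acl->AclSize = sizeof(*acl);
	dump_dacl(buf);
	return 0;
}
```
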
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 54e2045..2e960e1 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -331,8 +331,9 @@
 			umode_t mode, struct cifs_tcon *tcon,
 			const char *full_path,
 			struct cifs_sb_info *cifs_sb);
-	int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *,
-		     struct cifs_sb_info *);
+	int (*mkdir)(const unsigned int xid, struct inode *inode, umode_t mode,
+		     struct cifs_tcon *tcon, const char *name,
+		     struct cifs_sb_info *sb);
 	/* set info on created directory */
 	void (*mkdir_setinfo)(struct inode *, const char *,
 			      struct cifs_sb_info *, struct cifs_tcon *,
@@ -1209,6 +1210,7 @@
 	bool smallBuf:1; /* so we know which buf_release function to call */
 };
 
+#define ACL_NO_MODE	-1
 struct cifs_open_parms {
 	struct cifs_tcon *tcon;
 	struct cifs_sb_info *cifs_sb;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 99b1b1e..e53e9f6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -372,7 +372,8 @@
 				  const struct nls_table *nls_codepage,
 				  int remap);
 
-extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon,
+extern int CIFSSMBMkDir(const unsigned int xid, struct inode *inode,
+			umode_t mode, struct cifs_tcon *tcon,
 			const char *name, struct cifs_sb_info *cifs_sb);
 extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon,
 			const char *name, struct cifs_sb_info *cifs_sb);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index dbee213..4f554f0 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1078,7 +1078,8 @@
 }
 
 int
-CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+CIFSSMBMkDir(const unsigned int xid, struct inode *inode, umode_t mode,
+	     struct cifs_tcon *tcon, const char *name,
 	     struct cifs_sb_info *cifs_sb)
 {
 	int rc = 0;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 26cdfbf..3bae2e5 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1622,13 +1622,14 @@
 	}
 
 	/* BB add setting the equivalent of mode via CreateX w/ACLs */
-	rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb);
+	rc = server->ops->mkdir(xid, inode, mode, tcon, full_path, cifs_sb);
 	if (rc) {
 		cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc);
 		d_drop(direntry);
 		goto mkdir_out;
 	}
 
+	/* TODO: skip this for smb2/smb3 */
 	rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon,
 			      xid);
 mkdir_out:
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 76ddd98b..1a01e10 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -164,6 +164,7 @@
 long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 {
 	struct inode *inode = file_inode(filep);
+	struct smb3_key_debug_info pkey_inf;
 	int rc = -ENOTTY; /* strange error - but the precedent */
 	unsigned int xid;
 	struct cifsFileInfo *pSMBFile = filep->private_data;
@@ -270,6 +271,34 @@
 			else
 				rc = -EOPNOTSUPP;
 			break;
+		case CIFS_DUMP_KEY:
+			if (pSMBFile == NULL)
+				break;
+			if (!capable(CAP_SYS_ADMIN)) {
+				rc = -EACCES;
+				break;
+			}
+
+			tcon = tlink_tcon(pSMBFile->tlink);
+			if (!smb3_encryption_required(tcon)) {
+				rc = -EOPNOTSUPP;
+				break;
+			}
+			pkey_inf.cipher_type =
+				le16_to_cpu(tcon->ses->server->cipher_type);
+			pkey_inf.Suid = tcon->ses->Suid;
+			memcpy(pkey_inf.auth_key, tcon->ses->auth_key.response,
+					16 /* SMB2_NTLMV2_SESSKEY_SIZE */);
+			memcpy(pkey_inf.smb3decryptionkey,
+			      tcon->ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
+			memcpy(pkey_inf.smb3encryptionkey,
+			      tcon->ses->smb3encryptionkey, SMB3_SIGN_KEY_SIZE);
+			if (copy_to_user((void __user *)arg, &pkey_inf,
+					sizeof(struct smb3_key_debug_info)))
+				rc = -EFAULT;
+			else
+				rc = 0;
+			break;
 		default:
 			cifs_dbg(FYI, "unsupported ioctl\n");
 			break;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 4c764ff..85bd644 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -698,7 +698,6 @@
 	char *bcc_ptr;
 	struct cifs_ses *ses = sess_data->ses;
 	char lnm_session_key[CIFS_AUTH_RESP_SIZE];
-	__u32 capabilities;
 	__u16 bytes_remaining;
 
 	/* lanman 2 style sessionsetup */
@@ -709,7 +708,7 @@
 
 	pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
 	bcc_ptr = sess_data->iov[2].iov_base;
-	capabilities = cifs_ssetup_hdr(ses, pSMB);
+	(void)cifs_ssetup_hdr(ses, pSMB);
 
 	pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
 
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index d2a3fb7..4121ac1 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -51,7 +51,7 @@
 smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 		 struct cifs_sb_info *cifs_sb, const char *full_path,
 		 __u32 desired_access, __u32 create_disposition,
-		 __u32 create_options, void *ptr, int command,
+		 __u32 create_options, umode_t mode, void *ptr, int command,
 		 struct cifsFileInfo *cfile)
 {
 	int rc;
@@ -103,6 +103,7 @@
 		oparms.create_options |= CREATE_OPEN_BACKUP_INTENT;
 	oparms.fid = &fid;
 	oparms.reconnect = false;
+	oparms.mode = mode;
 
 	memset(&open_iov, 0, sizeof(open_iov));
 	rqst[num_rqst].rq_iov = open_iov;
@@ -478,7 +479,7 @@
 	cifs_get_readable_path(tcon, full_path, &cfile);
 	rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
 			      FILE_READ_ATTRIBUTES, FILE_OPEN, create_options,
-			      smb2_data, SMB2_OP_QUERY_INFO, cfile);
+			      ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile);
 	if (rc == -EOPNOTSUPP) {
 		*symlink = true;
 		create_options |= OPEN_REPARSE_POINT;
@@ -486,8 +487,8 @@
 		/* Failed on a symbolic link - query a reparse point info */
 		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
 				      FILE_READ_ATTRIBUTES, FILE_OPEN,
-				      create_options, smb2_data,
-				      SMB2_OP_QUERY_INFO, NULL);
+				      create_options, ACL_NO_MODE,
+				      smb2_data, SMB2_OP_QUERY_INFO, NULL);
 	}
 	if (rc)
 		goto out;
@@ -499,12 +500,14 @@
 }
 
 int
-smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
+	   struct cifs_tcon *tcon, const char *name,
 	   struct cifs_sb_info *cifs_sb)
 {
 	return smb2_compound_op(xid, tcon, cifs_sb, name,
 				FILE_WRITE_ATTRIBUTES, FILE_CREATE,
-				CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR, NULL);
+				CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR,
+				NULL);
 }
 
 void
@@ -525,8 +528,8 @@
 	cifs_get_writable_path(tcon, name, &cfile);
 	tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
 				 FILE_WRITE_ATTRIBUTES, FILE_CREATE,
-				 CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO,
-				 cfile);
+				 CREATE_NOT_FILE, ACL_NO_MODE,
+				 &data, SMB2_OP_SET_INFO, cfile);
 	if (tmprc == 0)
 		cifs_i->cifsAttrs = dosattrs;
 }
@@ -536,7 +539,7 @@
 	   struct cifs_sb_info *cifs_sb)
 {
 	return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
-				CREATE_NOT_FILE,
+				CREATE_NOT_FILE, ACL_NO_MODE,
 				NULL, SMB2_OP_RMDIR, NULL);
 }
 
@@ -546,7 +549,7 @@
 {
 	return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
 				CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
-				NULL, SMB2_OP_DELETE, NULL);
+				ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL);
 }
 
 static int
@@ -564,7 +567,8 @@
 		goto smb2_rename_path;
 	}
 	rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
-			      FILE_OPEN, 0, smb2_to_name, command, cfile);
+			      FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name,
+			      command, cfile);
 smb2_rename_path:
 	kfree(smb2_to_name);
 	return rc;
@@ -601,8 +605,8 @@
 	__le64 eof = cpu_to_le64(size);
 
 	return smb2_compound_op(xid, tcon, cifs_sb, full_path,
-				FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
-				SMB2_OP_SET_EOF, NULL);
+				FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE,
+				&eof, SMB2_OP_SET_EOF, NULL);
 }
 
 int
@@ -623,8 +627,8 @@
 		return PTR_ERR(tlink);
 
 	rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path,
-			      FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
-			      SMB2_OP_SET_INFO, NULL);
+			      FILE_WRITE_ATTRIBUTES, FILE_OPEN,
+			      0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, NULL);
 	cifs_put_tlink(tlink);
 	return rc;
 }
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index eaed180..4c09225 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -751,6 +751,8 @@
 		goto oshr_exit;
 	}
 
+	atomic_inc(&tcon->num_remote_opens);
+
 	o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
 	oparms.fid->persistent_fid = o_rsp->PersistentFileId;
 	oparms.fid->volatile_fid = o_rsp->VolatileFileId;
@@ -1176,6 +1178,7 @@
 
 	rc = compound_send_recv(xid, ses, flags, 3, rqst,
 				resp_buftype, rsp_iov);
+	/* no need to bump num_remote_opens because handle immediately closed */
 
  sea_exit:
 	kfree(ea);
@@ -1518,6 +1521,8 @@
 				resp_buftype, rsp_iov);
 	if (rc)
 		goto iqinf_exit;
+
+	/* No need to bump num_remote_opens since handle immediately closed */
 	if (qi.flags & PASSTHRU_FSCTL) {
 		pqi = (struct smb_query_info __user *)arg;
 		io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base;
@@ -3328,6 +3333,11 @@
 	if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
 		return;
 
+	/* Check if the server granted an oplock rather than a lease */
+	if (oplock & SMB2_OPLOCK_LEVEL_EXCLUSIVE)
+		return smb2_set_oplock_level(cinode, oplock, epoch,
+					     purge_cache);
+
 	if (oplock & SMB2_LEASE_READ_CACHING_HE) {
 		new_oplock |= CIFS_CACHE_READ_FLG;
 		strcat(message, "R");
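
The atomic_inc() and the two comments above balance the tcon->num_remote_opens accounting: a compound operation that keeps a handle open must bump the counter, while one that opens and closes in a single round trip must not touch it. A self-contained sketch of that invariant (plain C, nothing assumed about the cifs structures):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int num_remote_opens;	/* stands in for tcon->num_remote_opens */

static void open_kept_handle(void)  { atomic_fetch_add(&num_remote_opens, 1); }
static void close_kept_handle(void) { atomic_fetch_sub(&num_remote_opens, 1); }

int main(void)
{
	open_kept_handle();	/* e.g. the cached root handle above */
	/* a compound open+close in one request never touches the counter */
	close_kept_handle();
	printf("outstanding handles: %d\n", atomic_load(&num_remote_opens));
	return 0;
}
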
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 87066f1..85f9d61 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -751,6 +751,8 @@
 	unsigned int num = *num_iovec;
 
 	iov[num].iov_base = create_posix_buf(mode);
+	if (mode == -1)
+		cifs_dbg(VFS, "illegal mode\n"); /* BB REMOVEME */
 	if (iov[num].iov_base == NULL)
 		return -ENOMEM;
 	iov[num].iov_len = sizeof(struct create_posix);
@@ -2352,6 +2354,7 @@
 	rqst.rq_iov = iov;
 	rqst.rq_nvec = n_iov;
 
+	/* no need to inc num_remote_opens because we close it just below */
 	trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE,
 				    FILE_WRITE_ATTRIBUTES);
 	/* resource #4: response buffer */
@@ -2416,6 +2419,7 @@
 	/* File attributes ignored on open (used in create though) */
 	req->FileAttributes = cpu_to_le32(file_attributes);
 	req->ShareAccess = FILE_SHARE_ALL_LE;
+
 	req->CreateDisposition = cpu_to_le32(oparms->disposition);
 	req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK);
 	req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req));
@@ -2517,6 +2521,23 @@
 			return rc;
 	}
 
+	/* TODO: add handling for the mode on create */
+	if (oparms->disposition == FILE_CREATE)
+		cifs_dbg(VFS, "mode is 0x%x\n", oparms->mode); /* BB REMOVEME */
+
+	if ((oparms->disposition == FILE_CREATE) && (oparms->mode != -1)) {
+		if (n_iov > 2) {
+			struct create_context *ccontext =
+			    (struct create_context *)iov[n_iov-1].iov_base;
+			ccontext->Next =
+				cpu_to_le32(iov[n_iov-1].iov_len);
+		}
+
+		/* rc = add_sd_context(iov, &n_iov, oparms->mode); */
+		if (rc)
+			return rc;
+	}
+
 	if (n_iov > 2) {
 		struct create_context *ccontext =
 			(struct create_context *)iov[n_iov-1].iov_base;
@@ -3180,7 +3201,7 @@
  * See MS-SMB2 2.2.35 and 2.2.36
  */
 
-int
+static int
 SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst,
 		struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid,
 		u32 completion_filter, bool watch_tree)
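
The new block in SMB2_open_init() splices an extra create context into the request by pointing the previous context's Next field at it, the same offset chaining the existing n_iov > 2 block performs. A simplified, self-contained sketch of that chaining (plain structs, not the SMB2 wire layout):

#include <stdint.h>
#include <stdio.h>

struct ctx {
	uint32_t next;		/* byte offset to the next context, 0 if last */
	uint32_t data_len;	/* illustrative payload length */
};

static void chain(struct ctx *c, unsigned int n)
{
	unsigned int i;

	for (i = 0; i + 1 < n; i++)
		c[i].next = (uint32_t)sizeof(c[i]) + c[i].data_len;
	c[n - 1].next = 0;	/* terminates the chain */
}

int main(void)
{
	struct ctx c[3] = { { 0, 16 }, { 0, 8 }, { 0, 24 } };
	unsigned int i;

	chain(c, 3);
	for (i = 0; i < 3; i++)
		printf("context %u: Next=%u\n", i, (unsigned int)c[i].next);
	return 0;
}
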
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 67a91b1..da3a6d5 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -84,7 +84,8 @@
 			       umode_t mode, struct cifs_tcon *tcon,
 			       const char *full_path,
 			       struct cifs_sb_info *cifs_sb);
-extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon,
+extern int smb2_mkdir(const unsigned int xid, struct inode *inode,
+		      umode_t mode, struct cifs_tcon *tcon,
 		      const char *name, struct cifs_sb_info *cifs_sb);
 extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path,
 			       struct cifs_sb_info *cifs_sb,
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 08628e6..1ff2852 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -144,6 +144,17 @@
 #define IO_REPARSE_APPXSTREAM	     0xC0000014
 /* NFS symlinks, Win 8/SMB3 and later */
 #define IO_REPARSE_TAG_NFS           0x80000014
+/*
+ * AzureFileSync - see
+ * https://docs.microsoft.com/en-us/azure/storage/files/storage-sync-cloud-tiering
+ */
+#define IO_REPARSE_TAG_AZ_FILE_SYNC  0x8000001e
+/* WSL reparse tags */
+#define IO_REPARSE_TAG_LX_SYMLINK    0xA000001D
+#define IO_REPARSE_TAG_AF_UNIX	     0x80000023
+#define IO_REPARSE_TAG_LX_FIFO	     0x80000024
+#define IO_REPARSE_TAG_LX_CHR	     0x80000025
+#define IO_REPARSE_TAG_LX_BLK	     0x80000026
 
 /* fsctl flags */
 /* If Flags is set to this value, the request is an FSCTL not ioctl request */
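
With the WSL tags defined, a client can classify the special files these reparse points represent. A small, self-contained lookup over the constants just added (only the tag values are taken from the header; the cifs plumbing around them is out of scope here):

#include <stdint.h>
#include <stdio.h>

#define IO_REPARSE_TAG_LX_SYMLINK	0xA000001D
#define IO_REPARSE_TAG_AF_UNIX		0x80000023
#define IO_REPARSE_TAG_LX_FIFO		0x80000024
#define IO_REPARSE_TAG_LX_CHR		0x80000025
#define IO_REPARSE_TAG_LX_BLK		0x80000026

static const char *wsl_tag_name(uint32_t tag)
{
	switch (tag) {
	case IO_REPARSE_TAG_LX_SYMLINK:	return "symlink";
	case IO_REPARSE_TAG_AF_UNIX:	return "unix socket";
	case IO_REPARSE_TAG_LX_FIFO:	return "fifo";
	case IO_REPARSE_TAG_LX_CHR:	return "char device";
	case IO_REPARSE_TAG_LX_BLK:	return "block device";
	default:			return "not a WSL tag";
	}
}

int main(void)
{
	printf("0xA000001D -> %s\n", wsl_tag_name(IO_REPARSE_TAG_LX_SYMLINK));
	printf("0x80000024 -> %s\n", wsl_tag_name(IO_REPARSE_TAG_LX_FIFO));
	return 0;
}
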
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 9076150..db4ba8f 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -31,7 +31,7 @@
 #include "cifs_fs_sb.h"
 #include "cifs_unicode.h"
 
-#define MAX_EA_VALUE_SIZE 65535
+#define MAX_EA_VALUE_SIZE CIFSMaxBufSize
 #define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
 #define CIFS_XATTR_ATTRIB "cifs.dosattrib"  /* full name: user.cifs.dosattrib */
 #define CIFS_XATTR_CREATETIME "cifs.creationtime"  /* user.cifs.creationtime */
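
Raising MAX_EA_VALUE_SIZE from a fixed 65535 to CIFSMaxBufSize lets setxattr(2) accept values the old cap rejected (the exact new limit depends on the CIFSMaxBufSize module setting). A userspace illustration; the path comes from argv and the EA name is a placeholder:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	size_t len = 64 * 1024;	/* 65536 bytes: just over the old 65535 cap */
	char *val;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file-on-cifs-mount>\n", argv[0]);
		return 1;
	}
	val = malloc(len);
	if (!val)
		return 1;
	memset(val, 'x', len);
	if (setxattr(argv[1], "user.bigattr", val, len, 0))
		perror("setxattr");	/* fails under the old limit */
	free(val);
	return 0;
}
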
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 93e4ca6..87846aa 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -19,6 +19,7 @@
 #include <linux/atomic.h>
 #include <linux/device.h>
 #include <linux/poll.h>
+#include <linux/security.h>
 
 #include "internal.h"
 
@@ -136,6 +137,25 @@
 }
 EXPORT_SYMBOL_GPL(debugfs_file_put);
 
+/*
+ * Only permit access to world-readable files when the kernel is locked down.
+ * We also need to exclude any file that has ways to write to or alter it,
+ * since root can bypass the permissions check.
+ */
+static bool debugfs_is_locked_down(struct inode *inode,
+				   struct file *filp,
+				   const struct file_operations *real_fops)
+{
+	if ((inode->i_mode & 07777) == 0444 &&
+	    !(filp->f_mode & FMODE_WRITE) &&
+	    !real_fops->unlocked_ioctl &&
+	    !real_fops->compat_ioctl &&
+	    !real_fops->mmap)
+		return false;
+
+	return security_locked_down(LOCKDOWN_DEBUGFS);
+}
+
 static int open_proxy_open(struct inode *inode, struct file *filp)
 {
 	struct dentry *dentry = F_DENTRY(filp);
@@ -147,6 +167,11 @@
 		return r == -EIO ? -ENOENT : r;
 
 	real_fops = debugfs_real_fops(filp);
+
+	r = debugfs_is_locked_down(inode, filp, real_fops);
+	if (r)
+		goto out;
+
 	real_fops = fops_get(real_fops);
 	if (!real_fops) {
 		/* Huh? Module did not clean up after itself at exit? */
@@ -272,6 +297,11 @@
 		return r == -EIO ? -ENOENT : r;
 
 	real_fops = debugfs_real_fops(filp);
+
+	r = debugfs_is_locked_down(inode, filp, real_fops);
+	if (r)
+		goto out;
+
 	real_fops = fops_get(real_fops);
 	if (!real_fops) {
 		/* Huh? Module did not clean up after itself at exit? */
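
debugfs_is_locked_down() above exempts only files that are provably read-only: world-readable 0444 mode, an open without FMODE_WRITE, and no fops (ioctl/mmap) that could mutate state behind the permission check; everything else is referred to the LSM. A condensed, self-contained restatement of the predicate (plain C, illustrative types):

#include <stdbool.h>
#include <stdio.h>

struct fops { bool unlocked_ioctl, compat_ioctl, mmap; };

/* true: access may proceed under lockdown; false: the LSM decides */
static bool read_only_exempt(unsigned int i_mode, bool open_for_write,
			     const struct fops *f)
{
	return (i_mode & 07777) == 0444 && !open_for_write &&
	       !f->unlocked_ioctl && !f->compat_ioctl && !f->mmap;
}

int main(void)
{
	struct fops plain = { false, false, false };
	struct fops mmaps = { false, false, true };

	printf("0444, ro open, plain fops: %s\n",
	       read_only_exempt(0444, false, &plain) ? "allowed" : "LSM decides");
	printf("0444, ro open, has mmap:   %s\n",
	       read_only_exempt(0444, false, &mmaps) ? "allowed" : "LSM decides");
	return 0;
}
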
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 042b688..7b975db 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -26,6 +26,7 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/security.h>
 
 #include "internal.h"
 
@@ -35,6 +36,32 @@
 static int debugfs_mount_count;
 static bool debugfs_registered;
 
+/*
+ * Don't allow access attributes to be changed whilst the kernel is locked down
+ * so that we can use the file mode as part of a heuristic to determine whether
+ * to lock down individual files.
+ */
+static int debugfs_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	int ret = security_locked_down(LOCKDOWN_DEBUGFS);
+
+	if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
+		return ret;
+	return simple_setattr(dentry, ia);
+}
+
+static const struct inode_operations debugfs_file_inode_operations = {
+	.setattr	= debugfs_setattr,
+};
+static const struct inode_operations debugfs_dir_inode_operations = {
+	.lookup		= simple_lookup,
+	.setattr	= debugfs_setattr,
+};
+static const struct inode_operations debugfs_symlink_inode_operations = {
+	.get_link	= simple_get_link,
+	.setattr	= debugfs_setattr,
+};
+
 static struct inode *debugfs_get_inode(struct super_block *sb)
 {
 	struct inode *inode = new_inode(sb);
@@ -369,6 +396,7 @@
 	inode->i_mode = mode;
 	inode->i_private = data;
 
+	inode->i_op = &debugfs_file_inode_operations;
 	inode->i_fop = proxy_fops;
 	dentry->d_fsdata = (void *)((unsigned long)real_fops |
 				DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
@@ -532,7 +560,7 @@
 	}
 
 	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
-	inode->i_op = &simple_dir_inode_operations;
+	inode->i_op = &debugfs_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 
 	/* directory inodes start off with i_nlink == 2 (for "." entry) */
@@ -632,7 +660,7 @@
 		return failed_creating(dentry);
 	}
 	inode->i_mode = S_IFLNK | S_IRWXUGO;
-	inode->i_op = &simple_symlink_inode_operations;
+	inode->i_op = &debugfs_symlink_inode_operations;
 	inode->i_link = link;
 	d_instantiate(dentry, inode);
 	return end_creating(dentry);
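
debugfs_setattr() above keeps that heuristic trustworthy: if mode, uid, or gid could change while locked down, a file's 0444 mode would no longer prove anything. A small sketch of the decision; the ATTR_* values mirror the kernel's low bits but are declared here purely for illustration:

#include <stdbool.h>
#include <stdio.h>

#define ATTR_MODE	0x01	/* illustrative local definitions */
#define ATTR_UID	0x02
#define ATTR_GID	0x04

static int setattr_under_lockdown(bool locked_down, unsigned int ia_valid)
{
	if (locked_down && (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
		return -1;	/* the kernel returns the LSM's error here */
	return 0;		/* fall through to simple_setattr() */
}

int main(void)
{
	printf("chmod while locked down: %d\n",
	       setattr_under_lockdown(true, ATTR_MODE));
	printf("timestamp-only change:   %d\n",
	       setattr_under_lockdown(true, 0x10 /* some other ATTR_* bit */));
	return 0;
}
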
diff --git a/fs/exec.c b/fs/exec.c
index f7f6a14..555e93c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1033,6 +1033,7 @@
 	}
 	task_lock(tsk);
 	active_mm = tsk->active_mm;
+	membarrier_exec_mmap(mm);
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
@@ -1825,7 +1826,6 @@
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
-	membarrier_execve(current);
 	rseq_execve(current);
 	acct_update_integrals(current);
 	task_numa_free(current, false);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 123e3dee..516faa2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4551,6 +4551,7 @@
 	struct buffer_head	*bh;
 	struct super_block	*sb = inode->i_sb;
 	ext4_fsblk_t		block;
+	struct blk_plug		plug;
 	int			inodes_per_block, inode_offset;
 
 	iloc->bh = NULL;
@@ -4639,6 +4640,7 @@
 		 * If we need to do any I/O, try to pre-readahead extra
 		 * blocks from the inode table.
 		 */
+		blk_start_plug(&plug);
 		if (EXT4_SB(sb)->s_inode_readahead_blks) {
 			ext4_fsblk_t b, end, table;
 			unsigned num;
@@ -4669,6 +4671,7 @@
 		get_bh(bh);
 		bh->b_end_io = end_buffer_read_sync;
 		submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+		blk_finish_plug(&plug);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			EXT4_ERROR_INODE_BLOCK(inode, block,
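
The ext4 hunks wrap the inode-table readahead and the inode block read in a block plug, so all the submit_bh() calls are batched into a single dispatch before the code blocks in wait_on_buffer(). A kernel-context sketch of the pattern (not standalone-compilable; buffer heads assumed already mapped):

#include <linux/blkdev.h>
#include <linux/buffer_head.h>

static void read_block_with_readahead(struct buffer_head *bh,
				      struct buffer_head **ra, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);		/* queue I/O, do not dispatch yet */
	for (i = 0; i < nr; i++)
		if (ra[i])
			submit_bh(REQ_OP_READ, REQ_RAHEAD, ra[i]);
	submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
	blk_finish_plug(&plug);		/* one batched dispatch to the device */
	wait_on_buffer(bh);		/* only now block on the needed buffer */
}
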
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 1bda2ab..054acd9f 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -88,9 +88,7 @@
 	int err, offset;
 
 next:
-	if (*bh)
-		brelse(*bh);
-
+	brelse(*bh);
 	*bh = NULL;
 	iblock = *pos >> sb->s_blocksize_bits;
 	err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
@@ -1100,8 +1098,11 @@
 			err = -ENOMEM;
 			goto error;
 		}
+		/* Avoid race with userspace read via bdev */
+		lock_buffer(bhs[n]);
 		memset(bhs[n]->b_data, 0, sb->s_blocksize);
 		set_buffer_uptodate(bhs[n]);
+		unlock_buffer(bhs[n]);
 		mark_buffer_dirty_inode(bhs[n], dir);
 
 		n++;
@@ -1158,6 +1159,8 @@
 	fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
 
 	de = (struct msdos_dir_entry *)bhs[0]->b_data;
+	/* Avoid race with userspace read via bdev */
+	lock_buffer(bhs[0]);
 	/* filling the new directory slots ("." and ".." entries) */
 	memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME);
 	memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME);
@@ -1180,6 +1183,7 @@
 	de[0].size = de[1].size = 0;
 	memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
 	set_buffer_uptodate(bhs[0]);
+	unlock_buffer(bhs[0]);
 	mark_buffer_dirty_inode(bhs[0], dir);
 
 	err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
@@ -1237,11 +1241,14 @@
 
 			/* fill the directory entry */
 			copy = min(size, sb->s_blocksize);
+			/* Avoid race with userspace read via bdev */
+			lock_buffer(bhs[n]);
 			memcpy(bhs[n]->b_data, slots, copy);
+			set_buffer_uptodate(bhs[n]);
+			unlock_buffer(bhs[n]);
+			mark_buffer_dirty_inode(bhs[n], dir);
 			slots += copy;
 			size -= copy;
-			set_buffer_uptodate(bhs[n]);
-			mark_buffer_dirty_inode(bhs[n], dir);
 			if (!size)
 				break;
 			n++;
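
All three fat hunks apply one discipline: hold the buffer lock across the modification and the set_buffer_uptodate() call, so a concurrent read through the block device cannot see a buffer flagged uptodate while its contents are still being written. The pattern as a kernel-context sketch (not standalone-compilable):

#include <linux/buffer_head.h>
#include <linux/string.h>

static void fill_dir_block(struct buffer_head *bh, const void *src,
			   size_t len, struct inode *dir)
{
	lock_buffer(bh);		/* excludes concurrent bdev readers */
	memcpy(bh->b_data, src, len);
	set_buffer_uptodate(bh);	/* only once the contents are whole */
	unlock_buffer(bh);
	mark_buffer_dirty_inode(bh, dir);
}
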
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 2659836..3647c65 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -388,8 +388,11 @@
 				err = -ENOMEM;
 				goto error;
 			}
+			/* Avoid race with userspace read via bdev */
+			lock_buffer(c_bh);
 			memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
 			set_buffer_uptodate(c_bh);
+			unlock_buffer(c_bh);
 			mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
 			if (sb->s_flags & SB_SYNCHRONOUS)
 				err = sync_dirty_buffer(c_bh);
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 0ee7274..01263ff 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -246,7 +246,7 @@
  * sys_open_by_handle_at: Open the file handle
  * @mountdirfd: directory file descriptor
  * @handle: file handle to be opened
- * @flag: open flags.
+ * @flags: open flags.
  *
  * @mountdirfd indicates the directory file descriptor
  * of the mount point. The file handle is decoded relative
diff --git a/fs/file_table.c b/fs/file_table.c
index b07b53f..30d55c9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -327,6 +327,7 @@
 {
 	delayed_fput(NULL);
 }
+EXPORT_SYMBOL_GPL(flush_delayed_fput);
 
 static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
 
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 87c2c96..138b5b4 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -504,7 +504,6 @@
 	put_net(fc->net_ns);
 	put_user_ns(fc->user_ns);
 	put_cred(fc->cred);
-	kfree(fc->subtype);
 	put_fc_log(fc);
 	put_filesystem(fc->fs_type);
 	kfree(fc->source);
@@ -571,17 +570,6 @@
 		return 0;
 	}
 
-	if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) &&
-	    strcmp(param->key, "subtype") == 0) {
-		if (param->type != fs_value_is_string)
-			return invalf(fc, "VFS: Legacy: Non-string subtype");
-		if (fc->subtype)
-			return invalf(fc, "VFS: Legacy: Multiple subtype");
-		fc->subtype = param->string;
-		param->string = NULL;
-		return 0;
-	}
-
 	if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
 		return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");
 
@@ -738,8 +726,6 @@
 	fc->s_fs_info = NULL;
 	fc->sb_flags = 0;
 	security_free_mnt_opts(&fc->security);
-	kfree(fc->subtype);
-	fc->subtype = NULL;
 	kfree(fc->source);
 	fc->source = NULL;
 
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 24fc5a5..0635cba 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -27,3 +27,14 @@
 
 	  If you want to develop or use a userspace character device
 	  based on CUSE, answer Y or M.
+
+config VIRTIO_FS
+	tristate "Virtio Filesystem"
+	depends on FUSE_FS
+	select VIRTIO
+	help
+	  The Virtio Filesystem allows guests to mount file systems from the
+	  host.
+
+	  If you want to share files between guests or with the host, answer Y
+	  or M.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 9485019..6419a2b 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -5,5 +5,6 @@
 
 obj-$(CONFIG_FUSE_FS) += fuse.o
 obj-$(CONFIG_CUSE) += cuse.o
+obj-$(CONFIG_VIRTIO_FS) += virtio_fs.o
 
 fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index bab7a0d..00015d8 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -142,11 +142,10 @@
 
 static int cuse_release(struct inode *inode, struct file *file)
 {
-	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
 
-	fuse_sync_release(fi, ff, file->f_flags);
+	fuse_sync_release(NULL, ff, file->f_flags);
 	fuse_conn_put(fc);
 
 	return 0;
@@ -299,6 +298,14 @@
 	kfree(dev);
 }
 
+struct cuse_init_args {
+	struct fuse_args_pages ap;
+	struct cuse_init_in in;
+	struct cuse_init_out out;
+	struct page *page;
+	struct fuse_page_desc desc;
+};
+
 /**
  * cuse_process_init_reply - finish initializing CUSE channel
  *
@@ -306,21 +313,22 @@
  * required data structures for it.  Please read the comment at the
  * top of this file for high level overview.
  */
-static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+static void cuse_process_init_reply(struct fuse_conn *fc,
+				    struct fuse_args *args, int error)
 {
+	struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args);
+	struct fuse_args_pages *ap = &ia->ap;
 	struct cuse_conn *cc = fc_to_cc(fc), *pos;
-	struct cuse_init_out *arg = req->out.args[0].value;
-	struct page *page = req->pages[0];
+	struct cuse_init_out *arg = &ia->out;
+	struct page *page = ap->pages[0];
 	struct cuse_devinfo devinfo = { };
 	struct device *dev;
 	struct cdev *cdev;
 	dev_t devt;
 	int rc, i;
 
-	if (req->out.h.error ||
-	    arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
+	if (error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11)
 		goto err;
-	}
 
 	fc->minor = arg->minor;
 	fc->max_read = max_t(unsigned, arg->max_read, 4096);
@@ -329,7 +337,7 @@
 	/* parse init reply */
 	cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL;
 
-	rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size,
+	rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size,
 				&devinfo);
 	if (rc)
 		goto err;
@@ -396,7 +404,7 @@
 	dev_set_uevent_suppress(dev, 0);
 	kobject_uevent(&dev->kobj, KOBJ_ADD);
 out:
-	kfree(arg);
+	kfree(ia);
 	__free_page(page);
 	return;
 
@@ -415,55 +423,49 @@
 static int cuse_send_init(struct cuse_conn *cc)
 {
 	int rc;
-	struct fuse_req *req;
 	struct page *page;
 	struct fuse_conn *fc = &cc->fc;
-	struct cuse_init_in *arg;
-	void *outarg;
+	struct cuse_init_args *ia;
+	struct fuse_args_pages *ap;
 
 	BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
 
-	req = fuse_get_req_for_background(fc, 1);
-	if (IS_ERR(req)) {
-		rc = PTR_ERR(req);
-		goto err;
-	}
-
 	rc = -ENOMEM;
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page)
-		goto err_put_req;
+		goto err;
 
-	outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL);
-	if (!outarg)
+	ia = kzalloc(sizeof(*ia), GFP_KERNEL);
+	if (!ia)
 		goto err_free_page;
 
-	arg = &req->misc.cuse_init_in;
-	arg->major = FUSE_KERNEL_VERSION;
-	arg->minor = FUSE_KERNEL_MINOR_VERSION;
-	arg->flags |= CUSE_UNRESTRICTED_IOCTL;
-	req->in.h.opcode = CUSE_INIT;
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(struct cuse_init_in);
-	req->in.args[0].value = arg;
-	req->out.numargs = 2;
-	req->out.args[0].size = sizeof(struct cuse_init_out);
-	req->out.args[0].value = outarg;
-	req->out.args[1].size = CUSE_INIT_INFO_MAX;
-	req->out.argvar = 1;
-	req->out.argpages = 1;
-	req->pages[0] = page;
-	req->page_descs[0].length = req->out.args[1].size;
-	req->num_pages = 1;
-	req->end = cuse_process_init_reply;
-	fuse_request_send_background(fc, req);
+	ap = &ia->ap;
+	ia->in.major = FUSE_KERNEL_VERSION;
+	ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
+	ia->in.flags |= CUSE_UNRESTRICTED_IOCTL;
+	ap->args.opcode = CUSE_INIT;
+	ap->args.in_numargs = 1;
+	ap->args.in_args[0].size = sizeof(ia->in);
+	ap->args.in_args[0].value = &ia->in;
+	ap->args.out_numargs = 2;
+	ap->args.out_args[0].size = sizeof(ia->out);
+	ap->args.out_args[0].value = &ia->out;
+	ap->args.out_args[1].size = CUSE_INIT_INFO_MAX;
+	ap->args.out_argvar = 1;
+	ap->args.out_pages = 1;
+	ap->num_pages = 1;
+	ap->pages = &ia->page;
+	ap->descs = &ia->desc;
+	ia->page = page;
+	ia->desc.length = ap->args.out_args[1].size;
+	ap->args.end = cuse_process_init_reply;
 
-	return 0;
-
+	rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+	if (rc) {
+		kfree(ia);
 err_free_page:
-	__free_page(page);
-err_put_req:
-	fuse_put_request(fc, req);
+		__free_page(page);
+	}
 err:
 	return rc;
 }
@@ -504,9 +506,9 @@
 	 * Limit the cuse channel to requests that can
 	 * be represented in file->f_cred->user_ns.
 	 */
-	fuse_conn_init(&cc->fc, file->f_cred->user_ns);
+	fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL);
 
-	fud = fuse_dev_alloc(&cc->fc);
+	fud = fuse_dev_alloc_install(&cc->fc);
 	if (!fud) {
 		kfree(cc);
 		return -ENOMEM;
@@ -519,6 +521,7 @@
 	rc = cuse_send_init(cc);
 	if (rc) {
 		fuse_dev_free(fud);
+		fuse_conn_put(&cc->fc);
 		return rc;
 	}
 	file->private_data = fud;
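
The conversion above also shows the new completion idiom: a request carries only a struct fuse_args pointer, and the ->end callback recovers its enclosing allocation with container_of(), so one allocation owns the args, the in/out buffers, and the page descriptors. A self-contained sketch of that idiom (types and names illustrative, not the fuse API):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct args {
	void (*end)(struct args *args, int error);
};

struct init_args {
	struct args args;	/* embedded, like cuse_init_args.ap.args */
	int payload;
};

static void init_end(struct args *args, int error)
{
	struct init_args *ia = container_of(args, struct init_args, args);

	printf("request done: error=%d payload=%d\n", error, ia->payload);
	free(ia);		/* the one allocation owns everything */
}

int main(void)
{
	struct init_args *ia = malloc(sizeof(*ia));

	if (!ia)
		return 1;
	ia->args.end = init_end;
	ia->payload = 42;
	ia->args.end(&ia->args, 0);	/* stand-in for fuse_request_end() */
	return 0;
}
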
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ea82375..dadd617 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -40,107 +40,30 @@
 	return READ_ONCE(file->private_data);
 }
 
-static void fuse_request_init(struct fuse_req *req, struct page **pages,
-			      struct fuse_page_desc *page_descs,
-			      unsigned npages)
+static void fuse_request_init(struct fuse_req *req)
 {
 	INIT_LIST_HEAD(&req->list);
 	INIT_LIST_HEAD(&req->intr_entry);
 	init_waitqueue_head(&req->waitq);
 	refcount_set(&req->count, 1);
-	req->pages = pages;
-	req->page_descs = page_descs;
-	req->max_pages = npages;
 	__set_bit(FR_PENDING, &req->flags);
 }
 
-static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags,
-					  struct fuse_page_desc **desc)
-{
-	struct page **pages;
-
-	pages = kzalloc(npages * (sizeof(struct page *) +
-				  sizeof(struct fuse_page_desc)), flags);
-	*desc = (void *) pages + npages * sizeof(struct page *);
-
-	return pages;
-}
-
-static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
+static struct fuse_req *fuse_request_alloc(gfp_t flags)
 {
 	struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags);
-	if (req) {
-		struct page **pages = NULL;
-		struct fuse_page_desc *page_descs = NULL;
+	if (req)
+		fuse_request_init(req);
 
-		WARN_ON(npages > FUSE_MAX_MAX_PAGES);
-		if (npages > FUSE_REQ_INLINE_PAGES) {
-			pages = fuse_req_pages_alloc(npages, flags,
-						     &page_descs);
-			if (!pages) {
-				kmem_cache_free(fuse_req_cachep, req);
-				return NULL;
-			}
-		} else if (npages) {
-			pages = req->inline_pages;
-			page_descs = req->inline_page_descs;
-		}
-
-		fuse_request_init(req, pages, page_descs, npages);
-	}
 	return req;
 }
 
-struct fuse_req *fuse_request_alloc(unsigned npages)
+static void fuse_request_free(struct fuse_req *req)
 {
-	return __fuse_request_alloc(npages, GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(fuse_request_alloc);
-
-struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
-{
-	return __fuse_request_alloc(npages, GFP_NOFS);
-}
-
-static void fuse_req_pages_free(struct fuse_req *req)
-{
-	if (req->pages != req->inline_pages)
-		kfree(req->pages);
-}
-
-bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req,
-			    gfp_t flags)
-{
-	struct page **pages;
-	struct fuse_page_desc *page_descs;
-	unsigned int npages = min_t(unsigned int,
-				    max_t(unsigned int, req->max_pages * 2,
-					  FUSE_DEFAULT_MAX_PAGES_PER_REQ),
-				    fc->max_pages);
-	WARN_ON(npages <= req->max_pages);
-
-	pages = fuse_req_pages_alloc(npages, flags, &page_descs);
-	if (!pages)
-		return false;
-
-	memcpy(pages, req->pages, sizeof(struct page *) * req->max_pages);
-	memcpy(page_descs, req->page_descs,
-	       sizeof(struct fuse_page_desc) * req->max_pages);
-	fuse_req_pages_free(req);
-	req->pages = pages;
-	req->page_descs = page_descs;
-	req->max_pages = npages;
-
-	return true;
-}
-
-void fuse_request_free(struct fuse_req *req)
-{
-	fuse_req_pages_free(req);
 	kmem_cache_free(fuse_req_cachep, req);
 }
 
-void __fuse_get_request(struct fuse_req *req)
+static void __fuse_get_request(struct fuse_req *req)
 {
 	refcount_inc(&req->count);
 }
@@ -177,8 +100,9 @@
 	}
 }
 
-static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
-				       bool for_background)
+static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
+
+static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
 {
 	struct fuse_req *req;
 	int err;
@@ -201,7 +125,7 @@
 	if (fc->conn_error)
 		goto out;
 
-	req = fuse_request_alloc(npages);
+	req = fuse_request_alloc(GFP_KERNEL);
 	err = -ENOMEM;
 	if (!req) {
 		if (for_background)
@@ -229,101 +153,7 @@
 	return ERR_PTR(err);
 }
 
-struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages)
-{
-	return __fuse_get_req(fc, npages, false);
-}
-EXPORT_SYMBOL_GPL(fuse_get_req);
-
-struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
-					     unsigned npages)
-{
-	return __fuse_get_req(fc, npages, true);
-}
-EXPORT_SYMBOL_GPL(fuse_get_req_for_background);
-
-/*
- * Return request in fuse_file->reserved_req.  However that may
- * currently be in use.  If that is the case, wait for it to become
- * available.
- */
-static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
-					 struct file *file)
-{
-	struct fuse_req *req = NULL;
-	struct fuse_inode *fi = get_fuse_inode(file_inode(file));
-	struct fuse_file *ff = file->private_data;
-
-	do {
-		wait_event(fc->reserved_req_waitq, ff->reserved_req);
-		spin_lock(&fi->lock);
-		if (ff->reserved_req) {
-			req = ff->reserved_req;
-			ff->reserved_req = NULL;
-			req->stolen_file = get_file(file);
-		}
-		spin_unlock(&fi->lock);
-	} while (!req);
-
-	return req;
-}
-
-/*
- * Put stolen request back into fuse_file->reserved_req
- */
-static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
-{
-	struct file *file = req->stolen_file;
-	struct fuse_inode *fi = get_fuse_inode(file_inode(file));
-	struct fuse_file *ff = file->private_data;
-
-	WARN_ON(req->max_pages);
-	spin_lock(&fi->lock);
-	memset(req, 0, sizeof(*req));
-	fuse_request_init(req, NULL, NULL, 0);
-	BUG_ON(ff->reserved_req);
-	ff->reserved_req = req;
-	wake_up_all(&fc->reserved_req_waitq);
-	spin_unlock(&fi->lock);
-	fput(file);
-}
-
-/*
- * Gets a requests for a file operation, always succeeds
- *
- * This is used for sending the FLUSH request, which must get to
- * userspace, due to POSIX locks which may need to be unlocked.
- *
- * If allocation fails due to OOM, use the reserved request in
- * fuse_file.
- *
- * This is very unlikely to deadlock accidentally, since the
- * filesystem should not have it's own file open.  If deadlock is
- * intentional, it can still be broken by "aborting" the filesystem.
- */
-struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
-					     struct file *file)
-{
-	struct fuse_req *req;
-
-	atomic_inc(&fc->num_waiting);
-	wait_event(fc->blocked_waitq, fc->initialized);
-	/* Matches smp_wmb() in fuse_set_initialized() */
-	smp_rmb();
-	req = fuse_request_alloc(0);
-	if (!req)
-		req = get_reserved_req(fc, file);
-
-	req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
-	req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
-	req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
-
-	__set_bit(FR_WAITING, &req->flags);
-	__clear_bit(FR_BACKGROUND, &req->flags);
-	return req;
-}
-
-void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	if (refcount_dec_and_test(&req->count)) {
 		if (test_bit(FR_BACKGROUND, &req->flags)) {
@@ -342,15 +172,11 @@
 			fuse_drop_waiting(fc);
 		}
 
-		if (req->stolen_file)
-			put_reserved_req(fc, req);
-		else
-			fuse_request_free(req);
+		fuse_request_free(req);
 	}
 }
-EXPORT_SYMBOL_GPL(fuse_put_request);
 
-static unsigned len_args(unsigned numargs, struct fuse_arg *args)
+unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args)
 {
 	unsigned nbytes = 0;
 	unsigned i;
@@ -360,25 +186,47 @@
 
 	return nbytes;
 }
+EXPORT_SYMBOL_GPL(fuse_len_args);
 
-static u64 fuse_get_unique(struct fuse_iqueue *fiq)
+u64 fuse_get_unique(struct fuse_iqueue *fiq)
 {
 	fiq->reqctr += FUSE_REQ_ID_STEP;
 	return fiq->reqctr;
 }
+EXPORT_SYMBOL_GPL(fuse_get_unique);
 
 static unsigned int fuse_req_hash(u64 unique)
 {
 	return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
 }
 
-static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req)
+/*
+ * A new request is available, wake fiq->waitq
+ */
+static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+	wake_up(&fiq->waitq);
+	kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+	spin_unlock(&fiq->lock);
+}
+
+const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
+	.wake_forget_and_unlock		= fuse_dev_wake_and_unlock,
+	.wake_interrupt_and_unlock	= fuse_dev_wake_and_unlock,
+	.wake_pending_and_unlock	= fuse_dev_wake_and_unlock,
+};
+EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);
+
+static void queue_request_and_unlock(struct fuse_iqueue *fiq,
+				     struct fuse_req *req)
+__releases(fiq->lock)
 {
 	req->in.h.len = sizeof(struct fuse_in_header) +
-		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
+		fuse_len_args(req->args->in_numargs,
+			      (struct fuse_arg *) req->args->in_args);
 	list_add_tail(&req->list, &fiq->pending);
-	wake_up_locked(&fiq->waitq);
-	kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+	fiq->ops->wake_pending_and_unlock(fiq);
 }
 
 void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
@@ -389,16 +237,15 @@
 	forget->forget_one.nodeid = nodeid;
 	forget->forget_one.nlookup = nlookup;
 
-	spin_lock(&fiq->waitq.lock);
+	spin_lock(&fiq->lock);
 	if (fiq->connected) {
 		fiq->forget_list_tail->next = forget;
 		fiq->forget_list_tail = forget;
-		wake_up_locked(&fiq->waitq);
-		kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+		fiq->ops->wake_forget_and_unlock(fiq);
 	} else {
 		kfree(forget);
+		spin_unlock(&fiq->lock);
 	}
-	spin_unlock(&fiq->waitq.lock);
 }
 
 static void flush_bg_queue(struct fuse_conn *fc)
@@ -412,10 +259,9 @@
 		req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
 		list_del(&req->list);
 		fc->active_background++;
-		spin_lock(&fiq->waitq.lock);
+		spin_lock(&fiq->lock);
 		req->in.h.unique = fuse_get_unique(fiq);
-		queue_request(fiq, req);
-		spin_unlock(&fiq->waitq.lock);
+		queue_request_and_unlock(fiq, req);
 	}
 }
 
@@ -427,9 +273,10 @@
  * the 'end' callback is called if given, else the reference to the
  * request is released
  */
-static void request_end(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
 {
 	struct fuse_iqueue *fiq = &fc->iq;
+	bool async = req->args->end;
 
 	if (test_and_set_bit(FR_FINISHED, &req->flags))
 		goto put_request;
@@ -439,9 +286,9 @@
 	 * smp_mb() from queue_interrupt().
 	 */
 	if (!list_empty(&req->intr_entry)) {
-		spin_lock(&fiq->waitq.lock);
+		spin_lock(&fiq->lock);
 		list_del_init(&req->intr_entry);
-		spin_unlock(&fiq->waitq.lock);
+		spin_unlock(&fiq->lock);
 	}
 	WARN_ON(test_bit(FR_PENDING, &req->flags));
 	WARN_ON(test_bit(FR_SENT, &req->flags));
@@ -475,18 +322,19 @@
 		wake_up(&req->waitq);
 	}
 
-	if (req->end)
-		req->end(fc, req);
+	if (async)
+		req->args->end(fc, req->args, req->out.h.error);
 put_request:
 	fuse_put_request(fc, req);
 }
+EXPORT_SYMBOL_GPL(fuse_request_end);
 
 static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
-	spin_lock(&fiq->waitq.lock);
+	spin_lock(&fiq->lock);
 	/* Check whether we've sent a request to interrupt this req */
 	if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
-		spin_unlock(&fiq->waitq.lock);
+		spin_unlock(&fiq->lock);
 		return -EINVAL;
 	}
 
@@ -499,13 +347,13 @@
 		smp_mb();
 		if (test_bit(FR_FINISHED, &req->flags)) {
 			list_del_init(&req->intr_entry);
-			spin_unlock(&fiq->waitq.lock);
+			spin_unlock(&fiq->lock);
 			return 0;
 		}
-		wake_up_locked(&fiq->waitq);
-		kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
+		fiq->ops->wake_interrupt_and_unlock(fiq);
+	} else {
+		spin_unlock(&fiq->lock);
 	}
-	spin_unlock(&fiq->waitq.lock);
 	return 0;
 }
 
@@ -535,16 +383,16 @@
 		if (!err)
 			return;
 
-		spin_lock(&fiq->waitq.lock);
+		spin_lock(&fiq->lock);
 		/* Request is not yet in userspace, bail out */
 		if (test_bit(FR_PENDING, &req->flags)) {
 			list_del(&req->list);
-			spin_unlock(&fiq->waitq.lock);
+			spin_unlock(&fiq->lock);
 			__fuse_put_request(req);
 			req->out.h.error = -EINTR;
 			return;
 		}
-		spin_unlock(&fiq->waitq.lock);
+		spin_unlock(&fiq->lock);
 	}
 
 	/*
@@ -559,101 +407,110 @@
 	struct fuse_iqueue *fiq = &fc->iq;
 
 	BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
-	spin_lock(&fiq->waitq.lock);
+	spin_lock(&fiq->lock);
 	if (!fiq->connected) {
-		spin_unlock(&fiq->waitq.lock);
+		spin_unlock(&fiq->lock);
 		req->out.h.error = -ENOTCONN;
 	} else {
 		req->in.h.unique = fuse_get_unique(fiq);
-		queue_request(fiq, req);
 		/* acquire extra reference, since request is still needed
-		   after request_end() */
+		   after fuse_request_end() */
 		__fuse_get_request(req);
-		spin_unlock(&fiq->waitq.lock);
+		queue_request_and_unlock(fiq, req);
 
 		request_wait_answer(fc, req);
-		/* Pairs with smp_wmb() in request_end() */
+		/* Pairs with smp_wmb() in fuse_request_end() */
 		smp_rmb();
 	}
 }
 
-void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
-{
-	__set_bit(FR_ISREPLY, &req->flags);
-	if (!test_bit(FR_WAITING, &req->flags)) {
-		__set_bit(FR_WAITING, &req->flags);
-		atomic_inc(&fc->num_waiting);
-	}
-	__fuse_request_send(fc, req);
-}
-EXPORT_SYMBOL_GPL(fuse_request_send);
-
 static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
 {
-	if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS)
-		args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE;
+	if (fc->minor < 4 && args->opcode == FUSE_STATFS)
+		args->out_args[0].size = FUSE_COMPAT_STATFS_SIZE;
 
 	if (fc->minor < 9) {
-		switch (args->in.h.opcode) {
+		switch (args->opcode) {
 		case FUSE_LOOKUP:
 		case FUSE_CREATE:
 		case FUSE_MKNOD:
 		case FUSE_MKDIR:
 		case FUSE_SYMLINK:
 		case FUSE_LINK:
-			args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
+			args->out_args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
 			break;
 		case FUSE_GETATTR:
 		case FUSE_SETATTR:
-			args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
+			args->out_args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
 			break;
 		}
 	}
 	if (fc->minor < 12) {
-		switch (args->in.h.opcode) {
+		switch (args->opcode) {
 		case FUSE_CREATE:
-			args->in.args[0].size = sizeof(struct fuse_open_in);
+			args->in_args[0].size = sizeof(struct fuse_open_in);
 			break;
 		case FUSE_MKNOD:
-			args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE;
+			args->in_args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE;
 			break;
 		}
 	}
 }
 
+static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req)
+{
+	req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
+	req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
+	req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
+}
+
+static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
+{
+	req->in.h.opcode = args->opcode;
+	req->in.h.nodeid = args->nodeid;
+	req->args = args;
+}
+
 ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
 {
 	struct fuse_req *req;
 	ssize_t ret;
 
-	req = fuse_get_req(fc, 0);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	if (args->force) {
+		atomic_inc(&fc->num_waiting);
+		req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL);
+
+		if (!args->nocreds)
+			fuse_force_creds(fc, req);
+
+		__set_bit(FR_WAITING, &req->flags);
+		__set_bit(FR_FORCE, &req->flags);
+	} else {
+		WARN_ON(args->nocreds);
+		req = fuse_get_req(fc, false);
+		if (IS_ERR(req))
+			return PTR_ERR(req);
+	}
 
 	/* Needs to be done after fuse_get_req() so that fc->minor is valid */
 	fuse_adjust_compat(fc, args);
+	fuse_args_to_req(req, args);
 
-	req->in.h.opcode = args->in.h.opcode;
-	req->in.h.nodeid = args->in.h.nodeid;
-	req->in.numargs = args->in.numargs;
-	memcpy(req->in.args, args->in.args,
-	       args->in.numargs * sizeof(struct fuse_in_arg));
-	req->out.argvar = args->out.argvar;
-	req->out.numargs = args->out.numargs;
-	memcpy(req->out.args, args->out.args,
-	       args->out.numargs * sizeof(struct fuse_arg));
-	fuse_request_send(fc, req);
+	if (!args->noreply)
+		__set_bit(FR_ISREPLY, &req->flags);
+	__fuse_request_send(fc, req);
 	ret = req->out.h.error;
-	if (!ret && args->out.argvar) {
-		BUG_ON(args->out.numargs != 1);
-		ret = req->out.args[0].size;
+	if (!ret && args->out_argvar) {
+		BUG_ON(args->out_numargs == 0);
+		ret = args->out_args[args->out_numargs - 1].size;
 	}
 	fuse_put_request(fc, req);
 
 	return ret;
 }
 
-bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req)
+static bool fuse_request_queue_background(struct fuse_conn *fc,
+					  struct fuse_req *req)
 {
 	bool queued = false;
 
@@ -681,56 +538,63 @@
 	return queued;
 }
 
-void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
+int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+			    gfp_t gfp_flags)
 {
-	WARN_ON(!req->end);
-	if (!fuse_request_queue_background(fc, req)) {
-		req->out.h.error = -ENOTCONN;
-		req->end(fc, req);
-		fuse_put_request(fc, req);
-	}
-}
-EXPORT_SYMBOL_GPL(fuse_request_send_background);
+	struct fuse_req *req;
 
-static int fuse_request_send_notify_reply(struct fuse_conn *fc,
-					  struct fuse_req *req, u64 unique)
+	if (args->force) {
+		WARN_ON(!args->nocreds);
+		req = fuse_request_alloc(gfp_flags);
+		if (!req)
+			return -ENOMEM;
+		__set_bit(FR_BACKGROUND, &req->flags);
+	} else {
+		WARN_ON(args->nocreds);
+		req = fuse_get_req(fc, true);
+		if (IS_ERR(req))
+			return PTR_ERR(req);
+	}
+
+	fuse_args_to_req(req, args);
+
+	if (!fuse_request_queue_background(fc, req)) {
+		fuse_put_request(fc, req);
+		return -ENOTCONN;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fuse_simple_background);
+
+static int fuse_simple_notify_reply(struct fuse_conn *fc,
+				    struct fuse_args *args, u64 unique)
 {
-	int err = -ENODEV;
+	struct fuse_req *req;
 	struct fuse_iqueue *fiq = &fc->iq;
+	int err = 0;
+
+	req = fuse_get_req(fc, false);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	__clear_bit(FR_ISREPLY, &req->flags);
 	req->in.h.unique = unique;
-	spin_lock(&fiq->waitq.lock);
+
+	fuse_args_to_req(req, args);
+
+	spin_lock(&fiq->lock);
 	if (fiq->connected) {
-		queue_request(fiq, req);
-		err = 0;
+		queue_request_and_unlock(fiq, req);
+	} else {
+		err = -ENODEV;
+		spin_unlock(&fiq->lock);
+		fuse_put_request(fc, req);
 	}
-	spin_unlock(&fiq->waitq.lock);
 
 	return err;
 }
 
-void fuse_force_forget(struct file *file, u64 nodeid)
-{
-	struct inode *inode = file_inode(file);
-	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req;
-	struct fuse_forget_in inarg;
-
-	memset(&inarg, 0, sizeof(inarg));
-	inarg.nlookup = 1;
-	req = fuse_get_req_nofail_nopages(fc, file);
-	req->in.h.opcode = FUSE_FORGET;
-	req->in.h.nodeid = nodeid;
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(inarg);
-	req->in.args[0].value = &inarg;
-	__clear_bit(FR_ISREPLY, &req->flags);
-	__fuse_request_send(fc, req);
-	/* ignore errors */
-	fuse_put_request(fc, req);
-}
-
 /*
  * Lock the request.  Up to the next unlock_request() there mustn't be
  * anything that could cause a page-fault.  If the request was already
@@ -1084,14 +948,14 @@
 {
 	unsigned i;
 	struct fuse_req *req = cs->req;
+	struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args);
 
-	for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
+	for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) {
 		int err;
-		unsigned offset = req->page_descs[i].offset;
-		unsigned count = min(nbytes, req->page_descs[i].length);
+		unsigned int offset = ap->descs[i].offset;
+		unsigned int count = min(nbytes, ap->descs[i].length);
 
-		err = fuse_copy_page(cs, &req->pages[i], offset, count,
-				     zeroing);
+		err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing);
 		if (err)
 			return err;
 
@@ -1149,12 +1014,12 @@
  * Unlike other requests this is assembled on demand, without a need
  * to allocate a separate fuse_req structure.
  *
- * Called with fiq->waitq.lock held, releases it
+ * Called with fiq->lock held, releases it
  */
 static int fuse_read_interrupt(struct fuse_iqueue *fiq,
 			       struct fuse_copy_state *cs,
 			       size_t nbytes, struct fuse_req *req)
-__releases(fiq->waitq.lock)
+__releases(fiq->lock)
 {
 	struct fuse_in_header ih;
 	struct fuse_interrupt_in arg;
@@ -1169,7 +1034,7 @@
 	ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT);
 	arg.unique = req->in.h.unique;
 
-	spin_unlock(&fiq->waitq.lock);
+	spin_unlock(&fiq->lock);
 	if (nbytes < reqsize)
 		return -EINVAL;
 
@@ -1181,9 +1046,9 @@
 	return err ? err : reqsize;
 }
 
-static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq,
-					       unsigned max,
-					       unsigned *countp)
+struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
+					     unsigned int max,
+					     unsigned int *countp)
 {
 	struct fuse_forget_link *head = fiq->forget_list_head.next;
 	struct fuse_forget_link **newhead = &head;
@@ -1202,14 +1067,15 @@
 
 	return head;
 }
+EXPORT_SYMBOL(fuse_dequeue_forget);
 
 static int fuse_read_single_forget(struct fuse_iqueue *fiq,
 				   struct fuse_copy_state *cs,
 				   size_t nbytes)
-__releases(fiq->waitq.lock)
+__releases(fiq->lock)
 {
 	int err;
-	struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL);
+	struct fuse_forget_link *forget = fuse_dequeue_forget(fiq, 1, NULL);
 	struct fuse_forget_in arg = {
 		.nlookup = forget->forget_one.nlookup,
 	};
@@ -1220,7 +1086,7 @@
 		.len = sizeof(ih) + sizeof(arg),
 	};
 
-	spin_unlock(&fiq->waitq.lock);
+	spin_unlock(&fiq->lock);
 	kfree(forget);
 	if (nbytes < ih.len)
 		return -EINVAL;
@@ -1238,7 +1104,7 @@
 
 static int fuse_read_batch_forget(struct fuse_iqueue *fiq,
 				   struct fuse_copy_state *cs, size_t nbytes)
-__releases(fiq->waitq.lock)
+__releases(fiq->lock)
 {
 	int err;
 	unsigned max_forgets;
@@ -1252,13 +1118,13 @@
 	};
 
 	if (nbytes < ih.len) {
-		spin_unlock(&fiq->waitq.lock);
+		spin_unlock(&fiq->lock);
 		return -EINVAL;
 	}
 
 	max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
-	head = dequeue_forget(fiq, max_forgets, &count);
-	spin_unlock(&fiq->waitq.lock);
+	head = fuse_dequeue_forget(fiq, max_forgets, &count);
+	spin_unlock(&fiq->lock);
 
 	arg.count = count;
 	ih.len += count * sizeof(struct fuse_forget_one);
@@ -1288,7 +1154,7 @@
 static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq,
 			    struct fuse_copy_state *cs,
 			    size_t nbytes)
-__releases(fiq->waitq.lock)
+__releases(fiq->lock)
 {
 	if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL)
 		return fuse_read_single_forget(fiq, cs, nbytes);
@@ -1302,7 +1168,7 @@
  * the pending list and copies request data to userspace buffer.  If
  * no reply is needed (FORGET) or request has been aborted or there
  * was an error during the copying then it's finished by calling
- * request_end().  Otherwise add it to the processing list, and set
+ * fuse_request_end().  Otherwise add it to the processing list, and set
  * the 'sent' flag.
  */
 static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
@@ -1313,21 +1179,42 @@
 	struct fuse_iqueue *fiq = &fc->iq;
 	struct fuse_pqueue *fpq = &fud->pq;
 	struct fuse_req *req;
-	struct fuse_in *in;
+	struct fuse_args *args;
 	unsigned reqsize;
 	unsigned int hash;
 
- restart:
-	spin_lock(&fiq->waitq.lock);
-	err = -EAGAIN;
-	if ((file->f_flags & O_NONBLOCK) && fiq->connected &&
-	    !request_pending(fiq))
-		goto err_unlock;
+	/*
+	 * Require a sane minimum read buffer - one with capacity for the fixed
+	 * part of any request header plus negotiated max_write room for data.
+	 *
+	 * Historically libfuse reserves 4K for fixed header room, but e.g.
+	 * GlusterFS reserves only 80 bytes
+	 *
+	 *	= `sizeof(fuse_in_header) + sizeof(fuse_write_in)`
+	 *
+	 * which is the absolute minimum any sane filesystem should be using
+	 * for header room.
+	 */
+	if (nbytes < max_t(size_t, FUSE_MIN_READ_BUFFER,
+			   sizeof(struct fuse_in_header) +
+			   sizeof(struct fuse_write_in) +
+			   fc->max_write))
+		return -EINVAL;
 
-	err = wait_event_interruptible_exclusive_locked(fiq->waitq,
+ restart:
+	for (;;) {
+		spin_lock(&fiq->lock);
+		if (!fiq->connected || request_pending(fiq))
+			break;
+		spin_unlock(&fiq->lock);
+
+		if (file->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+		err = wait_event_interruptible_exclusive(fiq->waitq,
 				!fiq->connected || request_pending(fiq));
-	if (err)
-		goto err_unlock;
+		if (err)
+			return err;
+	}
 
 	if (!fiq->connected) {
 		err = fc->aborted ? -ECONNABORTED : -ENODEV;
@@ -1351,28 +1238,28 @@
 	req = list_entry(fiq->pending.next, struct fuse_req, list);
 	clear_bit(FR_PENDING, &req->flags);
 	list_del_init(&req->list);
-	spin_unlock(&fiq->waitq.lock);
+	spin_unlock(&fiq->lock);
 
-	in = &req->in;
-	reqsize = in->h.len;
+	args = req->args;
+	reqsize = req->in.h.len;
 
 	/* If request is too large, reply with an error and restart the read */
 	if (nbytes < reqsize) {
 		req->out.h.error = -EIO;
 		/* SETXATTR is special, since it may contain too large data */
-		if (in->h.opcode == FUSE_SETXATTR)
+		if (args->opcode == FUSE_SETXATTR)
 			req->out.h.error = -E2BIG;
-		request_end(fc, req);
+		fuse_request_end(fc, req);
 		goto restart;
 	}
 	spin_lock(&fpq->lock);
 	list_add(&req->list, &fpq->io);
 	spin_unlock(&fpq->lock);
 	cs->req = req;
-	err = fuse_copy_one(cs, &in->h, sizeof(in->h));
+	err = fuse_copy_one(cs, &req->in.h, sizeof(req->in.h));
 	if (!err)
-		err = fuse_copy_args(cs, in->numargs, in->argpages,
-				     (struct fuse_arg *) in->args, 0);
+		err = fuse_copy_args(cs, args->in_numargs, args->in_pages,
+				     (struct fuse_arg *) args->in_args, 0);
 	fuse_copy_finish(cs);
 	spin_lock(&fpq->lock);
 	clear_bit(FR_LOCKED, &req->flags);
@@ -1405,11 +1292,11 @@
 	if (!test_bit(FR_PRIVATE, &req->flags))
 		list_del_init(&req->list);
 	spin_unlock(&fpq->lock);
-	request_end(fc, req);
+	fuse_request_end(fc, req);
 	return err;
 
  err_unlock:
-	spin_unlock(&fiq->waitq.lock);
+	spin_unlock(&fiq->lock);
 	return err;
 }
 
@@ -1728,9 +1615,19 @@
 	return err;
 }
 
-static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
+struct fuse_retrieve_args {
+	struct fuse_args_pages ap;
+	struct fuse_notify_retrieve_in inarg;
+};
+
+static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args,
+			      int error)
 {
-	release_pages(req->pages, req->num_pages);
+	struct fuse_retrieve_args *ra =
+		container_of(args, typeof(*ra), ap.args);
+
+	release_pages(ra->ap.pages, ra->ap.num_pages);
+	kfree(ra);
 }
 
 static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
@@ -1738,13 +1635,16 @@
 {
 	int err;
 	struct address_space *mapping = inode->i_mapping;
-	struct fuse_req *req;
 	pgoff_t index;
 	loff_t file_size;
 	unsigned int num;
 	unsigned int offset;
 	size_t total_len = 0;
 	unsigned int num_pages;
+	struct fuse_retrieve_args *ra;
+	size_t args_size = sizeof(*ra);
+	struct fuse_args_pages *ap;
+	struct fuse_args *args;
 
 	offset = outarg->offset & ~PAGE_MASK;
 	file_size = i_size_read(inode);
@@ -1758,19 +1658,26 @@
 	num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	num_pages = min(num_pages, fc->max_pages);
 
-	req = fuse_get_req(fc, num_pages);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0]));
 
-	req->in.h.opcode = FUSE_NOTIFY_REPLY;
-	req->in.h.nodeid = outarg->nodeid;
-	req->in.numargs = 2;
-	req->in.argpages = 1;
-	req->end = fuse_retrieve_end;
+	ra = kzalloc(args_size, GFP_KERNEL);
+	if (!ra)
+		return -ENOMEM;
+
+	ap = &ra->ap;
+	ap->pages = (void *) (ra + 1);
+	ap->descs = (void *) (ap->pages + num_pages);
+
+	args = &ap->args;
+	args->nodeid = outarg->nodeid;
+	args->opcode = FUSE_NOTIFY_REPLY;
+	args->in_numargs = 2;
+	args->in_pages = true;
+	args->end = fuse_retrieve_end;
 
 	index = outarg->offset >> PAGE_SHIFT;
 
-	while (num && req->num_pages < num_pages) {
+	while (num && ap->num_pages < num_pages) {
 		struct page *page;
 		unsigned int this_num;
 
@@ -1779,27 +1686,25 @@
 			break;
 
 		this_num = min_t(unsigned, num, PAGE_SIZE - offset);
-		req->pages[req->num_pages] = page;
-		req->page_descs[req->num_pages].offset = offset;
-		req->page_descs[req->num_pages].length = this_num;
-		req->num_pages++;
+		ap->pages[ap->num_pages] = page;
+		ap->descs[ap->num_pages].offset = offset;
+		ap->descs[ap->num_pages].length = this_num;
+		ap->num_pages++;
 
 		offset = 0;
 		num -= this_num;
 		total_len += this_num;
 		index++;
 	}
-	req->misc.retrieve_in.offset = outarg->offset;
-	req->misc.retrieve_in.size = total_len;
-	req->in.args[0].size = sizeof(req->misc.retrieve_in);
-	req->in.args[0].value = &req->misc.retrieve_in;
-	req->in.args[1].size = total_len;
+	ra->inarg.offset = outarg->offset;
+	ra->inarg.size = total_len;
+	args->in_args[0].size = sizeof(ra->inarg);
+	args->in_args[0].value = &ra->inarg;
+	args->in_args[1].size = total_len;
 
-	err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
-	if (err) {
-		fuse_retrieve_end(fc, req);
-		fuse_put_request(fc, req);
-	}
+	err = fuse_simple_notify_reply(fc, args, outarg->notify_unique);
+	if (err)
+		fuse_retrieve_end(fc, args, err);
 
 	return err;
 }
@@ -1885,27 +1790,25 @@
 	return NULL;
 }
 
-static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
+static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
 			 unsigned nbytes)
 {
 	unsigned reqsize = sizeof(struct fuse_out_header);
 
-	if (out->h.error)
-		return nbytes != reqsize ? -EINVAL : 0;
+	reqsize += fuse_len_args(args->out_numargs, args->out_args);
 
-	reqsize += len_args(out->numargs, out->args);
-
-	if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
+	if (reqsize < nbytes || (reqsize > nbytes && !args->out_argvar))
 		return -EINVAL;
 	else if (reqsize > nbytes) {
-		struct fuse_arg *lastarg = &out->args[out->numargs-1];
+		struct fuse_arg *lastarg = &args->out_args[args->out_numargs-1];
 		unsigned diffsize = reqsize - nbytes;
+
 		if (diffsize > lastarg->size)
 			return -EINVAL;
 		lastarg->size -= diffsize;
 	}
-	return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
-			      out->page_zeroing);
+	return fuse_copy_args(cs, args->out_numargs, args->out_pages,
+			      args->out_args, args->page_zeroing);
 }
 
 /*
@@ -1913,7 +1816,7 @@
  * the write buffer.  The request is then searched on the processing
  * list by the unique ID found in the header.  If found, then remove
  * it from the list and copy the rest of the buffer to the request.
- * The request is finished by calling request_end()
+ * The request is finished by calling fuse_request_end().
  */
 static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
 				 struct fuse_copy_state *cs, size_t nbytes)
@@ -1984,10 +1887,13 @@
 	set_bit(FR_LOCKED, &req->flags);
 	spin_unlock(&fpq->lock);
 	cs->req = req;
-	if (!req->out.page_replace)
+	if (!req->args->page_replace)
 		cs->move_pages = 0;
 
-	err = copy_out_args(cs, &req->out, nbytes);
+	if (oh.error)
+		err = nbytes != sizeof(oh) ? -EINVAL : 0;
+	else
+		err = copy_out_args(cs, req->args, nbytes);
 	fuse_copy_finish(cs);
 
 	spin_lock(&fpq->lock);
@@ -2000,7 +1906,7 @@
 		list_del_init(&req->list);
 	spin_unlock(&fpq->lock);
 
-	request_end(fc, req);
+	fuse_request_end(fc, req);
 out:
 	return err ? err : nbytes;
 
@@ -2121,12 +2027,12 @@
 	fiq = &fud->fc->iq;
 	poll_wait(file, &fiq->waitq, wait);
 
-	spin_lock(&fiq->waitq.lock);
+	spin_lock(&fiq->lock);
 	if (!fiq->connected)
 		mask = EPOLLERR;
 	else if (request_pending(fiq))
 		mask |= EPOLLIN | EPOLLRDNORM;
-	spin_unlock(&fiq->waitq.lock);
+	spin_unlock(&fiq->lock);
 
 	return mask;
 }
@@ -2140,7 +2046,7 @@
 		req->out.h.error = -ECONNABORTED;
 		clear_bit(FR_SENT, &req->flags);
 		list_del_init(&req->list);
-		request_end(fc, req);
+		fuse_request_end(fc, req);
 	}
 }
 
@@ -2221,15 +2127,15 @@
 		flush_bg_queue(fc);
 		spin_unlock(&fc->bg_lock);
 
-		spin_lock(&fiq->waitq.lock);
+		spin_lock(&fiq->lock);
 		fiq->connected = 0;
 		list_for_each_entry(req, &fiq->pending, list)
 			clear_bit(FR_PENDING, &req->flags);
 		list_splice_tail_init(&fiq->pending, &to_end);
 		while (forget_pending(fiq))
-			kfree(dequeue_forget(fiq, 1, NULL));
-		wake_up_all_locked(&fiq->waitq);
-		spin_unlock(&fiq->waitq.lock);
+			kfree(fuse_dequeue_forget(fiq, 1, NULL));
+		wake_up_all(&fiq->waitq);
+		spin_unlock(&fiq->lock);
 		kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 		end_polls(fc);
 		wake_up_all(&fc->blocked_waitq);
@@ -2296,7 +2202,7 @@
 	if (new->private_data)
 		return -EINVAL;
 
-	fud = fuse_dev_alloc(fc);
+	fud = fuse_dev_alloc_install(fc);
 	if (!fud)
 		return -ENOMEM;
 
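
One dev.c change worth making concrete is the new size check at the top of fuse_dev_do_read(): a read buffer must hold max(FUSE_MIN_READ_BUFFER, fixed write header + negotiated max_write). A worked example; the struct sizes below are approximations, not ABI guarantees:

#include <stddef.h>
#include <stdio.h>

#define FUSE_MIN_READ_BUFFER 8192	/* value from the FUSE uapi header */

int main(void)
{
	size_t in_header = 40;		/* ~sizeof(struct fuse_in_header) */
	size_t write_in  = 40;		/* ~sizeof(struct fuse_write_in)  */
	size_t max_write = 128 * 1024;	/* negotiated at FUSE_INIT time   */
	size_t need = in_header + write_in + max_write;

	if (need < FUSE_MIN_READ_BUFFER)
		need = FUSE_MIN_READ_BUFFER;
	printf("reads shorter than %zu bytes now fail with EINVAL\n", need);
	return 0;
}
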
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index dd0f64f..d572c90 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -24,20 +24,54 @@
 	set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
 }
 
+#if BITS_PER_LONG >= 64
+static inline void __fuse_dentry_settime(struct dentry *entry, u64 time)
+{
+	entry->d_fsdata = (void *) time;
+}
+
+static inline u64 fuse_dentry_time(const struct dentry *entry)
+{
+	return (u64)entry->d_fsdata;
+}
+
+#else
 union fuse_dentry {
 	u64 time;
 	struct rcu_head rcu;
 };
 
-static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
+static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time)
 {
-	((union fuse_dentry *) entry->d_fsdata)->time = time;
+	((union fuse_dentry *) dentry->d_fsdata)->time = time;
 }
 
-static inline u64 fuse_dentry_time(struct dentry *entry)
+static inline u64 fuse_dentry_time(const struct dentry *entry)
 {
 	return ((union fuse_dentry *) entry->d_fsdata)->time;
 }
+#endif
+
+static void fuse_dentry_settime(struct dentry *dentry, u64 time)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb);
+	bool delete = !time && fc->delete_stale;
+	/*
+	 * Mess with DCACHE_OP_DELETE because dput() will be faster without it.
+	 * Don't care about races; either way it's just an optimization.
+	 */
+	if ((!delete && (dentry->d_flags & DCACHE_OP_DELETE)) ||
+	    (delete && !(dentry->d_flags & DCACHE_OP_DELETE))) {
+		spin_lock(&dentry->d_lock);
+		if (!delete)
+			dentry->d_flags &= ~DCACHE_OP_DELETE;
+		else
+			dentry->d_flags |= DCACHE_OP_DELETE;
+		spin_unlock(&dentry->d_lock);
+	}
+
+	__fuse_dentry_settime(dentry, time);
+}
 
 /*
  * FUSE caches dentries and attributes with separate timeout.  The
@@ -139,14 +173,14 @@
 			     struct fuse_entry_out *outarg)
 {
 	memset(outarg, 0, sizeof(struct fuse_entry_out));
-	args->in.h.opcode = FUSE_LOOKUP;
-	args->in.h.nodeid = nodeid;
-	args->in.numargs = 1;
-	args->in.args[0].size = name->len + 1;
-	args->in.args[0].value = name->name;
-	args->out.numargs = 1;
-	args->out.args[0].size = sizeof(struct fuse_entry_out);
-	args->out.args[0].value = outarg;
+	args->opcode = FUSE_LOOKUP;
+	args->nodeid = nodeid;
+	args->in_numargs = 1;
+	args->in_args[0].size = name->len + 1;
+	args->in_args[0].value = name->name;
+	args->out_numargs = 1;
+	args->out_args[0].size = sizeof(struct fuse_entry_out);
+	args->out_args[0].value = outarg;
 }
 
 /*
@@ -242,9 +276,11 @@
 	goto out;
 }
 
+#if BITS_PER_LONG < 64
 static int fuse_dentry_init(struct dentry *dentry)
 {
-	dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL);
+	dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry),
+				   GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
 
 	return dentry->d_fsdata ? 0 : -ENOMEM;
 }
@@ -254,16 +290,27 @@
 
 	kfree_rcu(fd, rcu);
 }
+#endif
+
+static int fuse_dentry_delete(const struct dentry *dentry)
+{
+	return time_before64(fuse_dentry_time(dentry), get_jiffies_64());
+}
 
 const struct dentry_operations fuse_dentry_operations = {
 	.d_revalidate	= fuse_dentry_revalidate,
+	.d_delete	= fuse_dentry_delete,
+#if BITS_PER_LONG < 64
 	.d_init		= fuse_dentry_init,
 	.d_release	= fuse_dentry_release,
+#endif
 };
 
 const struct dentry_operations fuse_root_dentry_operations = {
+#if BITS_PER_LONG < 64
 	.d_init		= fuse_dentry_init,
 	.d_release	= fuse_dentry_release,
+#endif
 };
 
 int fuse_valid_type(int m)
@@ -410,18 +457,18 @@
 	inarg.flags = flags;
 	inarg.mode = mode;
 	inarg.umask = current_umask();
-	args.in.h.opcode = FUSE_CREATE;
-	args.in.h.nodeid = get_node_id(dir);
-	args.in.numargs = 2;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = entry->d_name.len + 1;
-	args.in.args[1].value = entry->d_name.name;
-	args.out.numargs = 2;
-	args.out.args[0].size = sizeof(outentry);
-	args.out.args[0].value = &outentry;
-	args.out.args[1].size = sizeof(outopen);
-	args.out.args[1].value = &outopen;
+	args.opcode = FUSE_CREATE;
+	args.nodeid = get_node_id(dir);
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = entry->d_name.len + 1;
+	args.in_args[1].value = entry->d_name.name;
+	args.out_numargs = 2;
+	args.out_args[0].size = sizeof(outentry);
+	args.out_args[0].value = &outentry;
+	args.out_args[1].size = sizeof(outopen);
+	args.out_args[1].value = &outopen;
 	err = fuse_simple_request(fc, &args);
 	if (err)
 		goto out_free_ff;
@@ -526,10 +573,10 @@
 		return -ENOMEM;
 
 	memset(&outarg, 0, sizeof(outarg));
-	args->in.h.nodeid = get_node_id(dir);
-	args->out.numargs = 1;
-	args->out.args[0].size = sizeof(outarg);
-	args->out.args[0].value = &outarg;
+	args->nodeid = get_node_id(dir);
+	args->out_numargs = 1;
+	args->out_args[0].size = sizeof(outarg);
+	args->out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, args);
 	if (err)
 		goto out_put_forget_req;
@@ -582,12 +629,12 @@
 	inarg.mode = mode;
 	inarg.rdev = new_encode_dev(rdev);
 	inarg.umask = current_umask();
-	args.in.h.opcode = FUSE_MKNOD;
-	args.in.numargs = 2;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = entry->d_name.len + 1;
-	args.in.args[1].value = entry->d_name.name;
+	args.opcode = FUSE_MKNOD;
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = entry->d_name.len + 1;
+	args.in_args[1].value = entry->d_name.name;
 	return create_new_entry(fc, &args, dir, entry, mode);
 }
 
@@ -609,12 +656,12 @@
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
 	inarg.umask = current_umask();
-	args.in.h.opcode = FUSE_MKDIR;
-	args.in.numargs = 2;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = entry->d_name.len + 1;
-	args.in.args[1].value = entry->d_name.name;
+	args.opcode = FUSE_MKDIR;
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = entry->d_name.len + 1;
+	args.in_args[1].value = entry->d_name.name;
 	return create_new_entry(fc, &args, dir, entry, S_IFDIR);
 }
 
@@ -625,12 +672,12 @@
 	unsigned len = strlen(link) + 1;
 	FUSE_ARGS(args);
 
-	args.in.h.opcode = FUSE_SYMLINK;
-	args.in.numargs = 2;
-	args.in.args[0].size = entry->d_name.len + 1;
-	args.in.args[0].value = entry->d_name.name;
-	args.in.args[1].size = len;
-	args.in.args[1].value = link;
+	args.opcode = FUSE_SYMLINK;
+	args.in_numargs = 2;
+	args.in_args[0].size = entry->d_name.len + 1;
+	args.in_args[0].value = entry->d_name.name;
+	args.in_args[1].size = len;
+	args.in_args[1].value = link;
 	return create_new_entry(fc, &args, dir, entry, S_IFLNK);
 }
 
@@ -648,11 +695,11 @@
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	FUSE_ARGS(args);
 
-	args.in.h.opcode = FUSE_UNLINK;
-	args.in.h.nodeid = get_node_id(dir);
-	args.in.numargs = 1;
-	args.in.args[0].size = entry->d_name.len + 1;
-	args.in.args[0].value = entry->d_name.name;
+	args.opcode = FUSE_UNLINK;
+	args.nodeid = get_node_id(dir);
+	args.in_numargs = 1;
+	args.in_args[0].size = entry->d_name.len + 1;
+	args.in_args[0].value = entry->d_name.name;
 	err = fuse_simple_request(fc, &args);
 	if (!err) {
 		struct inode *inode = d_inode(entry);
@@ -684,11 +731,11 @@
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	FUSE_ARGS(args);
 
-	args.in.h.opcode = FUSE_RMDIR;
-	args.in.h.nodeid = get_node_id(dir);
-	args.in.numargs = 1;
-	args.in.args[0].size = entry->d_name.len + 1;
-	args.in.args[0].value = entry->d_name.name;
+	args.opcode = FUSE_RMDIR;
+	args.nodeid = get_node_id(dir);
+	args.in_numargs = 1;
+	args.in_args[0].size = entry->d_name.len + 1;
+	args.in_args[0].value = entry->d_name.name;
 	err = fuse_simple_request(fc, &args);
 	if (!err) {
 		clear_nlink(d_inode(entry));
@@ -711,15 +758,15 @@
 	memset(&inarg, 0, argsize);
 	inarg.newdir = get_node_id(newdir);
 	inarg.flags = flags;
-	args.in.h.opcode = opcode;
-	args.in.h.nodeid = get_node_id(olddir);
-	args.in.numargs = 3;
-	args.in.args[0].size = argsize;
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = oldent->d_name.len + 1;
-	args.in.args[1].value = oldent->d_name.name;
-	args.in.args[2].size = newent->d_name.len + 1;
-	args.in.args[2].value = newent->d_name.name;
+	args.opcode = opcode;
+	args.nodeid = get_node_id(olddir);
+	args.in_numargs = 3;
+	args.in_args[0].size = argsize;
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = oldent->d_name.len + 1;
+	args.in_args[1].value = oldent->d_name.name;
+	args.in_args[2].size = newent->d_name.len + 1;
+	args.in_args[2].value = newent->d_name.name;
 	err = fuse_simple_request(fc, &args);
 	if (!err) {
 		/* ctime changes */
@@ -796,12 +843,12 @@
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.oldnodeid = get_node_id(inode);
-	args.in.h.opcode = FUSE_LINK;
-	args.in.numargs = 2;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = newent->d_name.len + 1;
-	args.in.args[1].value = newent->d_name.name;
+	args.opcode = FUSE_LINK;
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = newent->d_name.len + 1;
+	args.in_args[1].value = newent->d_name.name;
 	err = create_new_entry(fc, &args, newdir, newent, inode->i_mode);
 	/* Contrary to "normal" filesystems it can happen that link
 	   makes two "logical" inodes point to the same "physical"
@@ -884,14 +931,14 @@
 		inarg.getattr_flags |= FUSE_GETATTR_FH;
 		inarg.fh = ff->fh;
 	}
-	args.in.h.opcode = FUSE_GETATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
+	args.opcode = FUSE_GETATTR;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, &args);
 	if (!err) {
 		if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
@@ -1056,11 +1103,11 @@
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC);
-	args.in.h.opcode = FUSE_ACCESS;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
+	args.opcode = FUSE_ACCESS;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
 	err = fuse_simple_request(fc, &args);
 	if (err == -ENOSYS) {
 		fc->no_access = 1;
@@ -1152,38 +1199,36 @@
 static int fuse_readlink_page(struct inode *inode, struct page *page)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req;
-	int err;
+	struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 };
+	struct fuse_args_pages ap = {
+		.num_pages = 1,
+		.pages = &page,
+		.descs = &desc,
+	};
+	char *link;
+	ssize_t res;
 
-	req = fuse_get_req(fc, 1);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	ap.args.opcode = FUSE_READLINK;
+	ap.args.nodeid = get_node_id(inode);
+	ap.args.out_pages = true;
+	ap.args.out_argvar = true;
+	ap.args.page_zeroing = true;
+	ap.args.out_numargs = 1;
+	ap.args.out_args[0].size = desc.length;
+	res = fuse_simple_request(fc, &ap.args);
 
-	req->out.page_zeroing = 1;
-	req->out.argpages = 1;
-	req->num_pages = 1;
-	req->pages[0] = page;
-	req->page_descs[0].length = PAGE_SIZE - 1;
-	req->in.h.opcode = FUSE_READLINK;
-	req->in.h.nodeid = get_node_id(inode);
-	req->out.argvar = 1;
-	req->out.numargs = 1;
-	req->out.args[0].size = PAGE_SIZE - 1;
-	fuse_request_send(fc, req);
-	err = req->out.h.error;
-
-	if (!err) {
-		char *link = page_address(page);
-		size_t len = req->out.args[0].size;
-
-		BUG_ON(len >= PAGE_SIZE);
-		link[len] = '\0';
-	}
-
-	fuse_put_request(fc, req);
 	fuse_invalidate_atime(inode);
 
-	return err;
+	if (res < 0)
+		return res;
+
+	if (WARN_ON(res >= PAGE_SIZE))
+		return -EIO;
+
+	link = page_address(page);
+	link[res] = '\0';
+
+	return 0;
 }
 
 static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
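The rewritten fuse_readlink_page() above asks for at most PAGE_SIZE - 1 bytes (out_argvar makes the reply size variable), treats a positive return as the target length, and NUL-terminates in place. Userspace readlink(2) has the same contract, which makes for a faithful standalone model:

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[PATH_MAX];
	/* Reserve one byte: the call returns a length, not a C string. */
	ssize_t res = readlink("/proc/self/exe", buf, sizeof(buf) - 1);

	if (res < 0)
		return 1;
	buf[res] = '\0';		/* terminate in place, like link[res] */
	printf("-> %s\n", buf);
	return 0;
}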
@@ -1383,14 +1428,14 @@
 			      struct fuse_setattr_in *inarg_p,
 			      struct fuse_attr_out *outarg_p)
 {
-	args->in.h.opcode = FUSE_SETATTR;
-	args->in.h.nodeid = get_node_id(inode);
-	args->in.numargs = 1;
-	args->in.args[0].size = sizeof(*inarg_p);
-	args->in.args[0].value = inarg_p;
-	args->out.numargs = 1;
-	args->out.args[0].size = sizeof(*outarg_p);
-	args->out.args[0].value = outarg_p;
+	args->opcode = FUSE_SETATTR;
+	args->nodeid = get_node_id(inode);
+	args->in_numargs = 1;
+	args->in_args[0].size = sizeof(*inarg_p);
+	args->in_args[0].value = inarg_p;
+	args->out_numargs = 1;
+	args->out_args[0].size = sizeof(*outarg_p);
+	args->out_args[0].value = outarg_p;
 }
 
 /*
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5ae2828..0f022568 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -19,6 +19,18 @@
 #include <linux/falloc.h>
 #include <linux/uio.h>
 
+static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
+				      struct fuse_page_desc **desc)
+{
+	struct page **pages;
+
+	pages = kzalloc(npages * (sizeof(struct page *) +
+				  sizeof(struct fuse_page_desc)), flags);
+	*desc = (void *) (pages + npages);
+
+	return pages;
+}
+
 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 			  int opcode, struct fuse_open_out *outargp)
 {
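fuse_pages_alloc() above carves the page-pointer array and the fuse_page_desc array out of a single allocation, so one kfree() in the teardown paths releases both. A standalone sketch of the same layout trick, with plain calloc/free standing in for kzalloc/kfree and illustrative struct names:

#include <stdio.h>
#include <stdlib.h>

struct page;				/* opaque stand-in for struct page */
struct page_desc { unsigned int length, offset; };

static struct page **pages_alloc(unsigned int npages, struct page_desc **descs)
{
	/* One block sized for both arrays; descs start right after pages. */
	struct page **pages = calloc(npages, sizeof(struct page *) +
					     sizeof(struct page_desc));
	if (!pages)
		return NULL;
	*descs = (struct page_desc *)(pages + npages);
	return pages;
}

int main(void)
{
	struct page_desc *descs;
	struct page **pages = pages_alloc(8, &descs);

	if (!pages)
		return 1;
	descs[7].length = 4096;		/* both arrays share one lifetime */
	free(pages);			/* a single free releases both */
	return 0;
}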
@@ -29,29 +41,36 @@
 	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
 	if (!fc->atomic_o_trunc)
 		inarg.flags &= ~O_TRUNC;
-	args.in.h.opcode = opcode;
-	args.in.h.nodeid = nodeid;
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(*outargp);
-	args.out.args[0].value = outargp;
+	args.opcode = opcode;
+	args.nodeid = nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(*outargp);
+	args.out_args[0].value = outargp;
 
 	return fuse_simple_request(fc, &args);
 }
 
+struct fuse_release_args {
+	struct fuse_args args;
+	struct fuse_release_in inarg;
+	struct inode *inode;
+};
+
 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 {
 	struct fuse_file *ff;
 
-	ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL);
+	ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
 	if (unlikely(!ff))
 		return NULL;
 
 	ff->fc = fc;
-	ff->reserved_req = fuse_request_alloc(0);
-	if (unlikely(!ff->reserved_req)) {
+	ff->release_args = kzalloc(sizeof(*ff->release_args),
+				   GFP_KERNEL_ACCOUNT);
+	if (!ff->release_args) {
 		kfree(ff);
 		return NULL;
 	}
@@ -69,7 +88,7 @@
 
 void fuse_file_free(struct fuse_file *ff)
 {
-	fuse_request_free(ff->reserved_req);
+	kfree(ff->release_args);
 	mutex_destroy(&ff->readdir.lock);
 	kfree(ff);
 }
@@ -80,34 +99,31 @@
 	return ff;
 }
 
-static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args,
+			     int error)
 {
-	iput(req->misc.release.inode);
+	struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
+
+	iput(ra->inode);
+	kfree(ra);
 }
 
 static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
 {
 	if (refcount_dec_and_test(&ff->count)) {
-		struct fuse_req *req = ff->reserved_req;
+		struct fuse_args *args = &ff->release_args->args;
 
 		if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
-			/*
-			 * Drop the release request when client does not
-			 * implement 'open'
-			 */
-			__clear_bit(FR_BACKGROUND, &req->flags);
-			iput(req->misc.release.inode);
-			fuse_put_request(ff->fc, req);
+			/* Do nothing when client does not implement 'open' */
+			fuse_release_end(ff->fc, args, 0);
 		} else if (sync) {
-			__set_bit(FR_FORCE, &req->flags);
-			__clear_bit(FR_BACKGROUND, &req->flags);
-			fuse_request_send(ff->fc, req);
-			iput(req->misc.release.inode);
-			fuse_put_request(ff->fc, req);
+			fuse_simple_request(ff->fc, args);
+			fuse_release_end(ff->fc, args, 0);
 		} else {
-			req->end = fuse_release_end;
-			__set_bit(FR_BACKGROUND, &req->flags);
-			fuse_request_send_background(ff->fc, req);
+			args->end = fuse_release_end;
+			if (fuse_simple_background(ff->fc, args,
+						   GFP_KERNEL | __GFP_NOFAIL))
+				fuse_release_end(ff->fc, args, -ENOTCONN);
 		}
 		kfree(ff);
 	}
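With reserved_req gone, the release state is a fuse_release_args that embeds the generic fuse_args, and fuse_release_end() recovers it via container_of(). A self-contained model of that embed-and-recover pattern; args and release_args here are illustrative copies, not the kernel structures:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct args {
	void (*end)(struct args *args, int error);
};

struct release_args {
	struct args args;		/* generic part, what the core sees */
	int fh;				/* request-specific state */
};

static void release_end(struct args *args, int error)
{
	struct release_args *ra = container_of(args, struct release_args, args);

	printf("released fh %d (error %d)\n", ra->fh, error);
	free(ra);			/* the callback owns the wrapper */
}

int main(void)
{
	struct release_args *ra = calloc(1, sizeof(*ra));

	if (!ra)
		return 1;
	ra->fh = 42;
	ra->args.end = release_end;
	ra->args.end(&ra->args, 0);	/* what request completion would do */
	return 0;
}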
@@ -227,8 +243,7 @@
 				 int flags, int opcode)
 {
 	struct fuse_conn *fc = ff->fc;
-	struct fuse_req *req = ff->reserved_req;
-	struct fuse_release_in *inarg = &req->misc.release.in;
+	struct fuse_release_args *ra = ff->release_args;
 
 	/* Inode is NULL on error path of fuse_create_open() */
 	if (likely(fi)) {
@@ -243,32 +258,33 @@
 
 	wake_up_interruptible_all(&ff->poll_wait);
 
-	inarg->fh = ff->fh;
-	inarg->flags = flags;
-	req->in.h.opcode = opcode;
-	req->in.h.nodeid = ff->nodeid;
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(struct fuse_release_in);
-	req->in.args[0].value = inarg;
+	ra->inarg.fh = ff->fh;
+	ra->inarg.flags = flags;
+	ra->args.in_numargs = 1;
+	ra->args.in_args[0].size = sizeof(struct fuse_release_in);
+	ra->args.in_args[0].value = &ra->inarg;
+	ra->args.opcode = opcode;
+	ra->args.nodeid = ff->nodeid;
+	ra->args.force = true;
+	ra->args.nocreds = true;
 }
 
 void fuse_release_common(struct file *file, bool isdir)
 {
 	struct fuse_inode *fi = get_fuse_inode(file_inode(file));
 	struct fuse_file *ff = file->private_data;
-	struct fuse_req *req = ff->reserved_req;
+	struct fuse_release_args *ra = ff->release_args;
 	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
 
 	fuse_prepare_release(fi, ff, file->f_flags, opcode);
 
 	if (ff->flock) {
-		struct fuse_release_in *inarg = &req->misc.release.in;
-		inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
-		inarg->lock_owner = fuse_lock_owner_id(ff->fc,
-						       (fl_owner_t) file);
+		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
+		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc,
+							  (fl_owner_t) file);
 	}
 	/* Hold inode until release is finished */
-	req->misc.release.inode = igrab(file_inode(file));
+	ra->inode = igrab(file_inode(file));
 
 	/*
 	 * Normally this will send the RELEASE request, however if
@@ -279,7 +295,7 @@
 	 * synchronous RELEASE is allowed (and desirable) in this case
 	 * because the server can be trusted not to screw up.
 	 */
-	fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir);
+	fuse_file_put(ff, ff->fc->destroy, isdir);
 }
 
 static int fuse_open(struct inode *inode, struct file *file)
@@ -335,19 +351,27 @@
 	return (u64) v0 + ((u64) v1 << 32);
 }
 
-static struct fuse_req *fuse_find_writeback(struct fuse_inode *fi,
+struct fuse_writepage_args {
+	struct fuse_io_args ia;
+	struct list_head writepages_entry;
+	struct list_head queue_entry;
+	struct fuse_writepage_args *next;
+	struct inode *inode;
+};
+
+static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
 					    pgoff_t idx_from, pgoff_t idx_to)
 {
-	struct fuse_req *req;
+	struct fuse_writepage_args *wpa;
 
-	list_for_each_entry(req, &fi->writepages, writepages_entry) {
+	list_for_each_entry(wpa, &fi->writepages, writepages_entry) {
 		pgoff_t curr_index;
 
-		WARN_ON(get_fuse_inode(req->inode) != fi);
-		curr_index = req->misc.write.in.offset >> PAGE_SHIFT;
-		if (idx_from < curr_index + req->num_pages &&
+		WARN_ON(get_fuse_inode(wpa->inode) != fi);
+		curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
+		if (idx_from < curr_index + wpa->ia.ap.num_pages &&
 		    curr_index <= idx_to) {
-			return req;
+			return wpa;
 		}
 	}
 	return NULL;
@@ -383,12 +407,11 @@
  * Since fuse doesn't rely on the VM writeback tracking, this has to
  * use some other means.
  */
-static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
+static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
 {
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
-	return 0;
 }
 
 /*
@@ -411,8 +434,8 @@
 	struct inode *inode = file_inode(file);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_file *ff = file->private_data;
-	struct fuse_req *req;
 	struct fuse_flush_in inarg;
+	FUSE_ARGS(args);
 	int err;
 
 	if (is_bad_inode(inode))
@@ -433,19 +456,17 @@
 	if (err)
 		return err;
 
-	req = fuse_get_req_nofail_nopages(fc, file);
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
 	inarg.lock_owner = fuse_lock_owner_id(fc, id);
-	req->in.h.opcode = FUSE_FLUSH;
-	req->in.h.nodeid = get_node_id(inode);
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(inarg);
-	req->in.args[0].value = &inarg;
-	__set_bit(FR_FORCE, &req->flags);
-	fuse_request_send(fc, req);
-	err = req->out.h.error;
-	fuse_put_request(fc, req);
+	args.opcode = FUSE_FLUSH;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.force = true;
+
+	err = fuse_simple_request(fc, &args);
 	if (err == -ENOSYS) {
 		fc->no_flush = 1;
 		err = 0;
@@ -465,11 +486,11 @@
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
 	inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
-	args.in.h.opcode = opcode;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
+	args.opcode = opcode;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
 	return fuse_simple_request(fc, &args);
 }
 
@@ -523,35 +544,35 @@
 	return err;
 }
 
-void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
-		    size_t count, int opcode)
+void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
+			 size_t count, int opcode)
 {
-	struct fuse_read_in *inarg = &req->misc.read.in;
 	struct fuse_file *ff = file->private_data;
+	struct fuse_args *args = &ia->ap.args;
 
-	inarg->fh = ff->fh;
-	inarg->offset = pos;
-	inarg->size = count;
-	inarg->flags = file->f_flags;
-	req->in.h.opcode = opcode;
-	req->in.h.nodeid = ff->nodeid;
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(struct fuse_read_in);
-	req->in.args[0].value = inarg;
-	req->out.argvar = 1;
-	req->out.numargs = 1;
-	req->out.args[0].size = count;
+	ia->read.in.fh = ff->fh;
+	ia->read.in.offset = pos;
+	ia->read.in.size = count;
+	ia->read.in.flags = file->f_flags;
+	args->opcode = opcode;
+	args->nodeid = ff->nodeid;
+	args->in_numargs = 1;
+	args->in_args[0].size = sizeof(ia->read.in);
+	args->in_args[0].value = &ia->read.in;
+	args->out_argvar = true;
+	args->out_numargs = 1;
+	args->out_args[0].size = count;
 }
 
-static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty)
+static void fuse_release_user_pages(struct fuse_args_pages *ap,
+				    bool should_dirty)
 {
-	unsigned i;
+	unsigned int i;
 
-	for (i = 0; i < req->num_pages; i++) {
-		struct page *page = req->pages[i];
+	for (i = 0; i < ap->num_pages; i++) {
 		if (should_dirty)
-			set_page_dirty_lock(page);
-		put_page(page);
+			set_page_dirty_lock(ap->pages[i]);
+		put_page(ap->pages[i]);
 	}
 }
 
@@ -621,64 +642,94 @@
 	kref_put(&io->refcnt, fuse_io_release);
 }
 
-static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
+static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
+					  unsigned int npages)
 {
-	struct fuse_io_priv *io = req->io;
-	ssize_t pos = -1;
+	struct fuse_io_args *ia;
 
-	fuse_release_user_pages(req, io->should_dirty);
-
-	if (io->write) {
-		if (req->misc.write.in.size != req->misc.write.out.size)
-			pos = req->misc.write.in.offset - io->offset +
-				req->misc.write.out.size;
-	} else {
-		if (req->misc.read.in.size != req->out.args[0].size)
-			pos = req->misc.read.in.offset - io->offset +
-				req->out.args[0].size;
+	ia = kzalloc(sizeof(*ia), GFP_KERNEL);
+	if (ia) {
+		ia->io = io;
+		ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL,
+						&ia->ap.descs);
+		if (!ia->ap.pages) {
+			kfree(ia);
+			ia = NULL;
+		}
 	}
-
-	fuse_aio_complete(io, req->out.h.error, pos);
+	return ia;
 }
 
-static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
-		size_t num_bytes, struct fuse_io_priv *io)
+static void fuse_io_free(struct fuse_io_args *ia)
 {
+	kfree(ia->ap.pages);
+	kfree(ia);
+}
+
+static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
+				  int err)
+{
+	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
+	struct fuse_io_priv *io = ia->io;
+	ssize_t pos = -1;
+
+	fuse_release_user_pages(&ia->ap, io->should_dirty);
+
+	if (err) {
+		/* Nothing */
+	} else if (io->write) {
+		if (ia->write.out.size > ia->write.in.size) {
+			err = -EIO;
+		} else if (ia->write.in.size != ia->write.out.size) {
+			pos = ia->write.in.offset - io->offset +
+				ia->write.out.size;
+		}
+	} else {
+		u32 outsize = args->out_args[0].size;
+
+		if (ia->read.in.size != outsize)
+			pos = ia->read.in.offset - io->offset + outsize;
+	}
+
+	fuse_aio_complete(io, err, pos);
+	fuse_io_free(ia);
+}
+
+static ssize_t fuse_async_req_send(struct fuse_conn *fc,
+				   struct fuse_io_args *ia, size_t num_bytes)
+{
+	ssize_t err;
+	struct fuse_io_priv *io = ia->io;
+
 	spin_lock(&io->lock);
 	kref_get(&io->refcnt);
 	io->size += num_bytes;
 	io->reqs++;
 	spin_unlock(&io->lock);
 
-	req->io = io;
-	req->end = fuse_aio_complete_req;
+	ia->ap.args.end = fuse_aio_complete_req;
+	err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
 
-	__fuse_get_request(req);
-	fuse_request_send_background(fc, req);
-
-	return num_bytes;
+	return err ?: num_bytes;
 }
 
-static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
-			     loff_t pos, size_t count, fl_owner_t owner)
+static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
+			      fl_owner_t owner)
 {
-	struct file *file = io->iocb->ki_filp;
+	struct file *file = ia->io->iocb->ki_filp;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
 
-	fuse_read_fill(req, file, pos, count, FUSE_READ);
+	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
 	if (owner != NULL) {
-		struct fuse_read_in *inarg = &req->misc.read.in;
-
-		inarg->read_flags |= FUSE_READ_LOCKOWNER;
-		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
+		ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
+		ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner);
 	}
 
-	if (io->async)
-		return fuse_async_req_send(fc, req, count, io);
+	if (ia->io->async)
+		return fuse_async_req_send(fc, ia, count);
 
-	fuse_request_send(fc, req);
-	return req->out.args[0].size;
+	return fuse_simple_request(fc, &ia->ap.args);
 }
 
 static void fuse_read_update_size(struct inode *inode, loff_t size,
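fuse_async_req_send() above bumps io->size and io->reqs under io->lock before queuing the background request, and fuse_aio_complete_req() later balances the books from the ->end callback. A compilable model of that in-flight accounting, with a pthread mutex in place of the kernel spinlock:

#include <pthread.h>
#include <stdio.h>

struct io_state {
	pthread_mutex_t lock;
	size_t size;			/* bytes handed to async requests */
	int reqs;			/* requests still outstanding */
};

static void account_submit(struct io_state *io, size_t nbytes)
{
	pthread_mutex_lock(&io->lock);
	io->size += nbytes;
	io->reqs++;
	pthread_mutex_unlock(&io->lock);
}

static void account_complete(struct io_state *io)
{
	pthread_mutex_lock(&io->lock);
	if (--io->reqs == 0)
		printf("batch complete: %zu bytes\n", io->size);
	pthread_mutex_unlock(&io->lock);
}

int main(void)
{
	struct io_state io = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

	account_submit(&io, 4096);	/* before fuse_simple_background() */
	account_complete(&io);		/* from the ->end callback */
	return 0;
}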
@@ -696,10 +747,9 @@
 	spin_unlock(&fi->lock);
 }
 
-static void fuse_short_read(struct fuse_req *req, struct inode *inode,
-			    u64 attr_ver)
+static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
+			    struct fuse_args_pages *ap)
 {
-	size_t num_read = req->out.args[0].size;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
 	if (fc->writeback_cache) {
@@ -712,28 +762,31 @@
 		int start_idx = num_read >> PAGE_SHIFT;
 		size_t off = num_read & (PAGE_SIZE - 1);
 
-		for (i = start_idx; i < req->num_pages; i++) {
-			zero_user_segment(req->pages[i], off, PAGE_SIZE);
+		for (i = start_idx; i < ap->num_pages; i++) {
+			zero_user_segment(ap->pages[i], off, PAGE_SIZE);
 			off = 0;
 		}
 	} else {
-		loff_t pos = page_offset(req->pages[0]) + num_read;
+		loff_t pos = page_offset(ap->pages[0]) + num_read;
 		fuse_read_update_size(inode, pos, attr_ver);
 	}
 }
 
 static int fuse_do_readpage(struct file *file, struct page *page)
 {
-	struct kiocb iocb;
-	struct fuse_io_priv io;
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req;
-	size_t num_read;
 	loff_t pos = page_offset(page);
-	size_t count = PAGE_SIZE;
+	struct fuse_page_desc desc = { .length = PAGE_SIZE };
+	struct fuse_io_args ia = {
+		.ap.args.page_zeroing = true,
+		.ap.args.out_pages = true,
+		.ap.num_pages = 1,
+		.ap.pages = &page,
+		.ap.descs = &desc,
+	};
+	ssize_t res;
 	u64 attr_ver;
-	int err;
 
 	/*
 	 * Page writeback can extend beyond the lifetime of the
@@ -742,35 +795,21 @@
 	 */
 	fuse_wait_on_page_writeback(inode, page->index);
 
-	req = fuse_get_req(fc, 1);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-
 	attr_ver = fuse_get_attr_version(fc);
 
-	req->out.page_zeroing = 1;
-	req->out.argpages = 1;
-	req->num_pages = 1;
-	req->pages[0] = page;
-	req->page_descs[0].length = count;
-	init_sync_kiocb(&iocb, file);
-	io = (struct fuse_io_priv) FUSE_IO_PRIV_SYNC(&iocb);
-	num_read = fuse_send_read(req, &io, pos, count, NULL);
-	err = req->out.h.error;
+	fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
+	res = fuse_simple_request(fc, &ia.ap.args);
+	if (res < 0)
+		return res;
+	/*
+	 * Short read means EOF.  If file size is larger, truncate it
+	 */
+	if (res < desc.length)
+		fuse_short_read(inode, attr_ver, res, &ia.ap);
 
-	if (!err) {
-		/*
-		 * Short read means EOF.  If file size is larger, truncate it
-		 */
-		if (num_read < count)
-			fuse_short_read(req, inode, attr_ver);
+	SetPageUptodate(page);
 
-		SetPageUptodate(page);
-	}
-
-	fuse_put_request(fc, req);
-
-	return err;
+	return 0;
 }
 
 static int fuse_readpage(struct file *file, struct page *page)
@@ -789,15 +828,18 @@
 	return err;
 }
 
-static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
+			       int err)
 {
 	int i;
-	size_t count = req->misc.read.in.size;
-	size_t num_read = req->out.args[0].size;
+	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
+	struct fuse_args_pages *ap = &ia->ap;
+	size_t count = ia->read.in.size;
+	size_t num_read = args->out_args[0].size;
 	struct address_space *mapping = NULL;
 
-	for (i = 0; mapping == NULL && i < req->num_pages; i++)
-		mapping = req->pages[i]->mapping;
+	for (i = 0; mapping == NULL && i < ap->num_pages; i++)
+		mapping = ap->pages[i]->mapping;
 
 	if (mapping) {
 		struct inode *inode = mapping->host;
@@ -805,93 +847,97 @@
 		/*
 		 * Short read means EOF. If file size is larger, truncate it
 		 */
-		if (!req->out.h.error && num_read < count)
-			fuse_short_read(req, inode, req->misc.read.attr_ver);
+		if (!err && num_read < count)
+			fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
 
 		fuse_invalidate_atime(inode);
 	}
 
-	for (i = 0; i < req->num_pages; i++) {
-		struct page *page = req->pages[i];
-		if (!req->out.h.error)
+	for (i = 0; i < ap->num_pages; i++) {
+		struct page *page = ap->pages[i];
+
+		if (!err)
 			SetPageUptodate(page);
 		else
 			SetPageError(page);
 		unlock_page(page);
 		put_page(page);
 	}
-	if (req->ff)
-		fuse_file_put(req->ff, false, false);
+	if (ia->ff)
+		fuse_file_put(ia->ff, false, false);
+
+	fuse_io_free(ia);
 }
 
-static void fuse_send_readpages(struct fuse_req *req, struct file *file)
+static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
-	loff_t pos = page_offset(req->pages[0]);
-	size_t count = req->num_pages << PAGE_SHIFT;
+	struct fuse_args_pages *ap = &ia->ap;
+	loff_t pos = page_offset(ap->pages[0]);
+	size_t count = ap->num_pages << PAGE_SHIFT;
+	int err;
 
-	req->out.argpages = 1;
-	req->out.page_zeroing = 1;
-	req->out.page_replace = 1;
-	fuse_read_fill(req, file, pos, count, FUSE_READ);
-	req->misc.read.attr_ver = fuse_get_attr_version(fc);
+	ap->args.out_pages = true;
+	ap->args.page_zeroing = true;
+	ap->args.page_replace = true;
+	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
+	ia->read.attr_ver = fuse_get_attr_version(fc);
 	if (fc->async_read) {
-		req->ff = fuse_file_get(ff);
-		req->end = fuse_readpages_end;
-		fuse_request_send_background(fc, req);
+		ia->ff = fuse_file_get(ff);
+		ap->args.end = fuse_readpages_end;
+		err = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
+		if (!err)
+			return;
 	} else {
-		fuse_request_send(fc, req);
-		fuse_readpages_end(fc, req);
-		fuse_put_request(fc, req);
+		err = fuse_simple_request(fc, &ap->args);
 	}
+	fuse_readpages_end(fc, &ap->args, err);
 }
 
 struct fuse_fill_data {
-	struct fuse_req *req;
+	struct fuse_io_args *ia;
 	struct file *file;
 	struct inode *inode;
-	unsigned nr_pages;
+	unsigned int nr_pages;
+	unsigned int max_pages;
 };
 
 static int fuse_readpages_fill(void *_data, struct page *page)
 {
 	struct fuse_fill_data *data = _data;
-	struct fuse_req *req = data->req;
+	struct fuse_io_args *ia = data->ia;
+	struct fuse_args_pages *ap = &ia->ap;
 	struct inode *inode = data->inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
 	fuse_wait_on_page_writeback(inode, page->index);
 
-	if (req->num_pages &&
-	    (req->num_pages == fc->max_pages ||
-	     (req->num_pages + 1) * PAGE_SIZE > fc->max_read ||
-	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
-		unsigned int nr_alloc = min_t(unsigned int, data->nr_pages,
-					      fc->max_pages);
-		fuse_send_readpages(req, data->file);
-		if (fc->async_read)
-			req = fuse_get_req_for_background(fc, nr_alloc);
-		else
-			req = fuse_get_req(fc, nr_alloc);
-
-		data->req = req;
-		if (IS_ERR(req)) {
+	if (ap->num_pages &&
+	    (ap->num_pages == fc->max_pages ||
+	     (ap->num_pages + 1) * PAGE_SIZE > fc->max_read ||
+	     ap->pages[ap->num_pages - 1]->index + 1 != page->index)) {
+		data->max_pages = min_t(unsigned int, data->nr_pages,
+					fc->max_pages);
+		fuse_send_readpages(ia, data->file);
+		data->ia = ia = fuse_io_alloc(NULL, data->max_pages);
+		if (!ia) {
 			unlock_page(page);
-			return PTR_ERR(req);
+			return -ENOMEM;
 		}
+		ap = &ia->ap;
 	}
 
-	if (WARN_ON(req->num_pages >= req->max_pages)) {
+	if (WARN_ON(ap->num_pages >= data->max_pages)) {
 		unlock_page(page);
-		fuse_put_request(fc, req);
+		fuse_io_free(ia);
 		return -EIO;
 	}
 
 	get_page(page);
-	req->pages[req->num_pages] = page;
-	req->page_descs[req->num_pages].length = PAGE_SIZE;
-	req->num_pages++;
+	ap->pages[ap->num_pages] = page;
+	ap->descs[ap->num_pages].length = PAGE_SIZE;
+	ap->num_pages++;
 	data->nr_pages--;
 	return 0;
 }
@@ -903,7 +949,6 @@
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_fill_data data;
 	int err;
-	unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages);
 
 	err = -EIO;
 	if (is_bad_inode(inode))
@@ -911,21 +956,20 @@
 
 	data.file = file;
 	data.inode = inode;
-	if (fc->async_read)
-		data.req = fuse_get_req_for_background(fc, nr_alloc);
-	else
-		data.req = fuse_get_req(fc, nr_alloc);
 	data.nr_pages = nr_pages;
-	err = PTR_ERR(data.req);
-	if (IS_ERR(data.req))
+	data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages);

+	data.ia = fuse_io_alloc(NULL, data.max_pages);
+	err = -ENOMEM;
+	if (!data.ia)
 		goto out;
 
 	err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
 	if (!err) {
-		if (data.req->num_pages)
-			fuse_send_readpages(data.req, file);
+		if (data.ia->ap.num_pages)
+			fuse_send_readpages(data.ia, file);
 		else
-			fuse_put_request(fc, data.req);
+			fuse_io_free(data.ia);
 	}
 out:
 	return err;
@@ -952,54 +996,65 @@
 	return generic_file_read_iter(iocb, to);
 }
 
-static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
-			    loff_t pos, size_t count)
+static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
+				 loff_t pos, size_t count)
 {
-	struct fuse_write_in *inarg = &req->misc.write.in;
-	struct fuse_write_out *outarg = &req->misc.write.out;
+	struct fuse_args *args = &ia->ap.args;
 
-	inarg->fh = ff->fh;
-	inarg->offset = pos;
-	inarg->size = count;
-	req->in.h.opcode = FUSE_WRITE;
-	req->in.h.nodeid = ff->nodeid;
-	req->in.numargs = 2;
+	ia->write.in.fh = ff->fh;
+	ia->write.in.offset = pos;
+	ia->write.in.size = count;
+	args->opcode = FUSE_WRITE;
+	args->nodeid = ff->nodeid;
+	args->in_numargs = 2;
 	if (ff->fc->minor < 9)
-		req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
+		args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
 	else
-		req->in.args[0].size = sizeof(struct fuse_write_in);
-	req->in.args[0].value = inarg;
-	req->in.args[1].size = count;
-	req->out.numargs = 1;
-	req->out.args[0].size = sizeof(struct fuse_write_out);
-	req->out.args[0].value = outarg;
+		args->in_args[0].size = sizeof(ia->write.in);
+	args->in_args[0].value = &ia->write.in;
+	args->in_args[1].size = count;
+	args->out_numargs = 1;
+	args->out_args[0].size = sizeof(ia->write.out);
+	args->out_args[0].value = &ia->write.out;
 }
 
-static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
-			      loff_t pos, size_t count, fl_owner_t owner)
+static unsigned int fuse_write_flags(struct kiocb *iocb)
 {
-	struct kiocb *iocb = io->iocb;
+	unsigned int flags = iocb->ki_filp->f_flags;
+
+	if (iocb->ki_flags & IOCB_DSYNC)
+		flags |= O_DSYNC;
+	if (iocb->ki_flags & IOCB_SYNC)
+		flags |= O_SYNC;
+
+	return flags;
+}
+
+static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
+			       size_t count, fl_owner_t owner)
+{
+	struct kiocb *iocb = ia->io->iocb;
 	struct file *file = iocb->ki_filp;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
-	struct fuse_write_in *inarg = &req->misc.write.in;
+	struct fuse_write_in *inarg = &ia->write.in;
+	ssize_t err;
 
-	fuse_write_fill(req, ff, pos, count);
-	inarg->flags = file->f_flags;
-	if (iocb->ki_flags & IOCB_DSYNC)
-		inarg->flags |= O_DSYNC;
-	if (iocb->ki_flags & IOCB_SYNC)
-		inarg->flags |= O_SYNC;
+	fuse_write_args_fill(ia, ff, pos, count);
+	inarg->flags = fuse_write_flags(iocb);
 	if (owner != NULL) {
 		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
 		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
 	}
 
-	if (io->async)
-		return fuse_async_req_send(fc, req, count, io);
+	if (ia->io->async)
+		return fuse_async_req_send(fc, ia, count);
 
-	fuse_request_send(fc, req);
-	return req->misc.write.out.size;
+	err = fuse_simple_request(fc, &ia->ap.args);
+	if (!err && ia->write.out.size > count)
+		err = -EIO;
+
+	return err ?: ia->write.out.size;
 }
 
 bool fuse_write_update_size(struct inode *inode, loff_t pos)
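The fuse_write_flags() helper factored out above folds the per-iocb sync bits into the O_* flags reported to the server, so the cached and direct write paths agree. A rough userspace equivalent; the IOCB_* values are made-up stand-ins, not the kernel's:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

#define IOCB_DSYNC	(1 << 4)	/* illustrative, not the kernel value */
#define IOCB_SYNC	(1 << 5)

static unsigned int write_flags(unsigned int f_flags, unsigned int ki_flags)
{
	if (ki_flags & IOCB_DSYNC)
		f_flags |= O_DSYNC;
	if (ki_flags & IOCB_SYNC)
		f_flags |= O_SYNC;
	return f_flags;
}

int main(void)
{
	printf("flags: %#x\n", write_flags(O_WRONLY, IOCB_SYNC));
	return 0;
}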
@@ -1019,26 +1074,31 @@
 	return ret;
 }
 
-static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb,
-				    struct inode *inode, loff_t pos,
-				    size_t count)
+static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
+				     struct kiocb *iocb, struct inode *inode,
+				     loff_t pos, size_t count)
 {
-	size_t res;
-	unsigned offset;
-	unsigned i;
-	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+	struct fuse_args_pages *ap = &ia->ap;
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fc;
+	unsigned int offset, i;
+	int err;
 
-	for (i = 0; i < req->num_pages; i++)
-		fuse_wait_on_page_writeback(inode, req->pages[i]->index);
+	for (i = 0; i < ap->num_pages; i++)
+		fuse_wait_on_page_writeback(inode, ap->pages[i]->index);
 
-	res = fuse_send_write(req, &io, pos, count, NULL);
+	fuse_write_args_fill(ia, ff, pos, count);
+	ia->write.in.flags = fuse_write_flags(iocb);
 
-	offset = req->page_descs[0].offset;
-	count = res;
-	for (i = 0; i < req->num_pages; i++) {
-		struct page *page = req->pages[i];
+	err = fuse_simple_request(fc, &ap->args);
 
-		if (!req->out.h.error && !offset && count >= PAGE_SIZE)
+	offset = ap->descs[0].offset;
+	count = ia->write.out.size;
+	for (i = 0; i < ap->num_pages; i++) {
+		struct page *page = ap->pages[i];
+
+		if (!err && !offset && count >= PAGE_SIZE)
 			SetPageUptodate(page);
 
 		if (count > PAGE_SIZE - offset)
@@ -1051,20 +1111,21 @@
 		put_page(page);
 	}
 
-	return res;
+	return err;
 }
 
-static ssize_t fuse_fill_write_pages(struct fuse_req *req,
-			       struct address_space *mapping,
-			       struct iov_iter *ii, loff_t pos)
+static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
+				     struct address_space *mapping,
+				     struct iov_iter *ii, loff_t pos,
+				     unsigned int max_pages)
 {
 	struct fuse_conn *fc = get_fuse_conn(mapping->host);
 	unsigned offset = pos & (PAGE_SIZE - 1);
 	size_t count = 0;
 	int err;
 
-	req->in.argpages = 1;
-	req->page_descs[0].offset = offset;
+	ap->args.in_pages = true;
+	ap->descs[0].offset = offset;
 
 	do {
 		size_t tmp;
@@ -1100,9 +1161,9 @@
 		}
 
 		err = 0;
-		req->pages[req->num_pages] = page;
-		req->page_descs[req->num_pages].length = tmp;
-		req->num_pages++;
+		ap->pages[ap->num_pages] = page;
+		ap->descs[ap->num_pages].length = tmp;
+		ap->num_pages++;
 
 		count += tmp;
 		pos += tmp;
@@ -1113,7 +1174,7 @@
 		if (!fc->big_writes)
 			break;
 	} while (iov_iter_count(ii) && count < fc->max_write &&
-		 req->num_pages < req->max_pages && offset == 0);
+		 ap->num_pages < max_pages && offset == 0);
 
 	return count > 0 ? count : err;
 }
@@ -1141,27 +1202,27 @@
 		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
 	do {
-		struct fuse_req *req;
 		ssize_t count;
+		struct fuse_io_args ia = {};
+		struct fuse_args_pages *ap = &ia.ap;
 		unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
 						      fc->max_pages);
 
-		req = fuse_get_req(fc, nr_pages);
-		if (IS_ERR(req)) {
-			err = PTR_ERR(req);
+		ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
+		if (!ap->pages) {
+			err = -ENOMEM;
 			break;
 		}
 
-		count = fuse_fill_write_pages(req, mapping, ii, pos);
+		count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages);
 		if (count <= 0) {
 			err = count;
 		} else {
-			size_t num_written;
-
-			num_written = fuse_send_write_pages(req, iocb, inode,
-							    pos, count);
-			err = req->out.h.error;
+			err = fuse_send_write_pages(&ia, iocb, inode,
+						    pos, count);
 			if (!err) {
+				size_t num_written = ia.write.out.size;
+
 				res += num_written;
 				pos += num_written;
 
@@ -1170,7 +1231,7 @@
 					err = -EIO;
 			}
 		}
-		fuse_put_request(fc, req);
+		kfree(ap->pages);
 	} while (!err && iov_iter_count(ii));
 
 	if (res > 0)
@@ -1258,14 +1319,14 @@
 	return written ? written : err;
 }
 
-static inline void fuse_page_descs_length_init(struct fuse_req *req,
-		unsigned index, unsigned nr_pages)
+static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
+					       unsigned int index,
+					       unsigned int nr_pages)
 {
 	int i;
 
 	for (i = index; i < index + nr_pages; i++)
-		req->page_descs[i].length = PAGE_SIZE -
-			req->page_descs[i].offset;
+		descs[i].length = PAGE_SIZE - descs[i].offset;
 }
 
 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
@@ -1279,8 +1340,9 @@
 	return min(iov_iter_single_seg_count(ii), max_size);
 }
 
-static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
-			       size_t *nbytesp, int write)
+static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
+			       size_t *nbytesp, int write,
+			       unsigned int max_pages)
 {
 	size_t nbytes = 0;  /* # bytes already packed in req */
 	ssize_t ret = 0;
@@ -1291,21 +1353,21 @@
 		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
 
 		if (write)
-			req->in.args[1].value = (void *) user_addr;
+			ap->args.in_args[1].value = (void *) user_addr;
 		else
-			req->out.args[0].value = (void *) user_addr;
+			ap->args.out_args[0].value = (void *) user_addr;
 
 		iov_iter_advance(ii, frag_size);
 		*nbytesp = frag_size;
 		return 0;
 	}
 
-	while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
+	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
 		unsigned npages;
 		size_t start;
-		ret = iov_iter_get_pages(ii, &req->pages[req->num_pages],
+		ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
 					*nbytesp - nbytes,
-					req->max_pages - req->num_pages,
+					max_pages - ap->num_pages,
 					&start);
 		if (ret < 0)
 			break;
@@ -1316,18 +1378,18 @@
 		ret += start;
 		npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
 
-		req->page_descs[req->num_pages].offset = start;
-		fuse_page_descs_length_init(req, req->num_pages, npages);
+		ap->descs[ap->num_pages].offset = start;
+		fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);
 
-		req->num_pages += npages;
-		req->page_descs[req->num_pages - 1].length -=
+		ap->num_pages += npages;
+		ap->descs[ap->num_pages - 1].length -=
 			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
 	}
 
 	if (write)
-		req->in.argpages = 1;
+		ap->args.in_pages = 1;
 	else
-		req->out.argpages = 1;
+		ap->args.out_pages = 1;
 
 	*nbytesp = nbytes;
 
@@ -1349,17 +1411,16 @@
 	pgoff_t idx_from = pos >> PAGE_SHIFT;
 	pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
 	ssize_t res = 0;
-	struct fuse_req *req;
 	int err = 0;
+	struct fuse_io_args *ia;
+	unsigned int max_pages;
 
-	if (io->async)
-		req = fuse_get_req_for_background(fc, iov_iter_npages(iter,
-								fc->max_pages));
-	else
-		req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages));
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	max_pages = iov_iter_npages(iter, fc->max_pages);
+	ia = fuse_io_alloc(io, max_pages);
+	if (!ia)
+		return -ENOMEM;
 
 	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
 		if (!write)
 			inode_lock(inode);
@@ -1370,54 +1431,49 @@
 
 	io->should_dirty = !write && iter_is_iovec(iter);
 	while (count) {
-		size_t nres;
+		ssize_t nres;
 		fl_owner_t owner = current->files;
 		size_t nbytes = min(count, nmax);
-		err = fuse_get_user_pages(req, iter, &nbytes, write);
+
+		err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
+					  max_pages);
 		if (err && !nbytes)
 			break;
 
 		if (write) {
-			if (!capable(CAP_FSETID)) {
-				struct fuse_write_in *inarg;
+			if (!capable(CAP_FSETID))
+				ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV;
 
-				inarg = &req->misc.write.in;
-				inarg->write_flags |= FUSE_WRITE_KILL_PRIV;
-			}
-			nres = fuse_send_write(req, io, pos, nbytes, owner);
+			nres = fuse_send_write(ia, pos, nbytes, owner);
 		} else {
-			nres = fuse_send_read(req, io, pos, nbytes, owner);
+			nres = fuse_send_read(ia, pos, nbytes, owner);
 		}
 
-		if (!io->async)
-			fuse_release_user_pages(req, io->should_dirty);
-		if (req->out.h.error) {
-			err = req->out.h.error;
-			break;
-		} else if (nres > nbytes) {
-			res = 0;
-			err = -EIO;
+		if (!io->async || nres < 0) {
+			fuse_release_user_pages(&ia->ap, io->should_dirty);
+			fuse_io_free(ia);
+		}
+		ia = NULL;
+		if (nres < 0) {
+			err = nres;
 			break;
 		}
+		WARN_ON(nres > nbytes);
+
 		count -= nres;
 		res += nres;
 		pos += nres;
 		if (nres != nbytes)
 			break;
 		if (count) {
-			fuse_put_request(fc, req);
-			if (io->async)
-				req = fuse_get_req_for_background(fc,
-					iov_iter_npages(iter, fc->max_pages));
-			else
-				req = fuse_get_req(fc, iov_iter_npages(iter,
-								fc->max_pages));
-			if (IS_ERR(req))
+			max_pages = iov_iter_npages(iter, fc->max_pages);
+			ia = fuse_io_alloc(io, max_pages);
+			if (!ia)
 				break;
 		}
 	}
-	if (!IS_ERR(req))
-		fuse_put_request(fc, req);
+	if (ia)
+		fuse_io_free(ia);
 	if (res > 0)
 		*ppos = pos;
 
@@ -1509,45 +1565,53 @@
 		return fuse_direct_write_iter(iocb, from);
 }
 
-static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_writepage_free(struct fuse_writepage_args *wpa)
 {
+	struct fuse_args_pages *ap = &wpa->ia.ap;
 	int i;
 
-	for (i = 0; i < req->num_pages; i++)
-		__free_page(req->pages[i]);
+	for (i = 0; i < ap->num_pages; i++)
+		__free_page(ap->pages[i]);
 
-	if (req->ff)
-		fuse_file_put(req->ff, false, false);
+	if (wpa->ia.ff)
+		fuse_file_put(wpa->ia.ff, false, false);
+
+	kfree(ap->pages);
+	kfree(wpa);
 }
 
-static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_writepage_finish(struct fuse_conn *fc,
+				  struct fuse_writepage_args *wpa)
 {
-	struct inode *inode = req->inode;
+	struct fuse_args_pages *ap = &wpa->ia.ap;
+	struct inode *inode = wpa->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	int i;
 
-	list_del(&req->writepages_entry);
-	for (i = 0; i < req->num_pages; i++) {
+	list_del(&wpa->writepages_entry);
+	for (i = 0; i < ap->num_pages; i++) {
 		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-		dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+		dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
 		wb_writeout_inc(&bdi->wb);
 	}
 	wake_up(&fi->page_waitq);
 }
 
 /* Called under fi->lock, may release and reacquire it */
-static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
-				loff_t size)
+static void fuse_send_writepage(struct fuse_conn *fc,
+				struct fuse_writepage_args *wpa, loff_t size)
 __releases(fi->lock)
 __acquires(fi->lock)
 {
-	struct fuse_req *aux, *next;
-	struct fuse_inode *fi = get_fuse_inode(req->inode);
-	struct fuse_write_in *inarg = &req->misc.write.in;
-	__u64 data_size = req->num_pages * PAGE_SIZE;
-	bool queued;
+	struct fuse_writepage_args *aux, *next;
+	struct fuse_inode *fi = get_fuse_inode(wpa->inode);
+	struct fuse_write_in *inarg = &wpa->ia.write.in;
+	struct fuse_args *args = &wpa->ia.ap.args;
+	__u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
+	int err;
 
+	fi->writectr++;
 	if (inarg->offset + data_size <= size) {
 		inarg->size = data_size;
 	} else if (inarg->offset < size) {
@@ -1557,29 +1621,36 @@
 		goto out_free;
 	}
 
-	req->in.args[1].size = inarg->size;
-	queued = fuse_request_queue_background(fc, req);
+	args->in_args[1].size = inarg->size;
+	args->force = true;
+	args->nocreds = true;
+
+	err = fuse_simple_background(fc, args, GFP_ATOMIC);
+	if (err == -ENOMEM) {
+		spin_unlock(&fi->lock);
+		err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL);
+		spin_lock(&fi->lock);
+	}
+
 	/* Fails on broken connection only */
-	if (unlikely(!queued))
+	if (unlikely(err))
 		goto out_free;
 
-	fi->writectr++;
 	return;
 
  out_free:
-	fuse_writepage_finish(fc, req);
+	fi->writectr--;
+	fuse_writepage_finish(fc, wpa);
 	spin_unlock(&fi->lock);
 
 	/* After fuse_writepage_finish() aux request list is private */
-	for (aux = req->misc.write.next; aux; aux = next) {
-		next = aux->misc.write.next;
-		aux->misc.write.next = NULL;
-		fuse_writepage_free(fc, aux);
-		fuse_put_request(fc, aux);
+	for (aux = wpa->next; aux; aux = next) {
+		next = aux->next;
+		aux->next = NULL;
+		fuse_writepage_free(aux);
 	}
 
-	fuse_writepage_free(fc, req);
-	fuse_put_request(fc, req);
+	fuse_writepage_free(wpa);
 	spin_lock(&fi->lock);
 }
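fuse_send_writepage() above first tries GFP_ATOMIC while fi->lock is held, and only on -ENOMEM drops the lock for a sleepable GFP_NOFS | __GFP_NOFAIL retry. The shape of that fallback, modeled with a pthread mutex and stubbed submit paths (the stubs are placeholders, not kernel calls):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static bool submit_atomic(void)	{ return false; }	/* simulate -ENOMEM */
static void submit_blocking(void) { /* may sleep, cannot fail */ }

/* Called with 'lock' held; may release and reacquire it, matching the
 * __releases/__acquires annotations on fuse_send_writepage(). */
static void send_locked(void)
{
	if (submit_atomic())
		return;			/* fast path stayed under the lock */

	pthread_mutex_unlock(&lock);	/* blocking retry must not hold it */
	submit_blocking();
	pthread_mutex_lock(&lock);
}

int main(void)
{
	pthread_mutex_lock(&lock);
	send_locked();
	pthread_mutex_unlock(&lock);
	puts("writepage queued");
	return 0;
}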
 
@@ -1596,29 +1667,34 @@
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	loff_t crop = i_size_read(inode);
-	struct fuse_req *req;
+	struct fuse_writepage_args *wpa;
 
 	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
-		req = list_entry(fi->queued_writes.next, struct fuse_req, list);
-		list_del_init(&req->list);
-		fuse_send_writepage(fc, req, crop);
+		wpa = list_entry(fi->queued_writes.next,
+				 struct fuse_writepage_args, queue_entry);
+		list_del_init(&wpa->queue_entry);
+		fuse_send_writepage(fc, wpa, crop);
 	}
 }
 
-static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
+			       int error)
 {
-	struct inode *inode = req->inode;
+	struct fuse_writepage_args *wpa =
+		container_of(args, typeof(*wpa), ia.ap.args);
+	struct inode *inode = wpa->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
-	mapping_set_error(inode->i_mapping, req->out.h.error);
+	mapping_set_error(inode->i_mapping, error);
 	spin_lock(&fi->lock);
-	while (req->misc.write.next) {
+	while (wpa->next) {
 		struct fuse_conn *fc = get_fuse_conn(inode);
-		struct fuse_write_in *inarg = &req->misc.write.in;
-		struct fuse_req *next = req->misc.write.next;
-		req->misc.write.next = next->misc.write.next;
-		next->misc.write.next = NULL;
-		next->ff = fuse_file_get(req->ff);
+		struct fuse_write_in *inarg = &wpa->ia.write.in;
+		struct fuse_writepage_args *next = wpa->next;
+
+		wpa->next = next->next;
+		next->next = NULL;
+		next->ia.ff = fuse_file_get(wpa->ia.ff);
 		list_add(&next->writepages_entry, &fi->writepages);
 
 		/*
@@ -1647,9 +1723,9 @@
 		fuse_send_writepage(fc, next, inarg->offset + inarg->size);
 	}
 	fi->writectr--;
-	fuse_writepage_finish(fc, req);
+	fuse_writepage_finish(fc, wpa);
 	spin_unlock(&fi->lock);
-	fuse_writepage_free(fc, req);
+	fuse_writepage_free(wpa);
 }
 
 static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
@@ -1691,52 +1767,71 @@
 	return err;
 }
 
+static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
+{
+	struct fuse_writepage_args *wpa;
+	struct fuse_args_pages *ap;
+
+	wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
+	if (wpa) {
+		ap = &wpa->ia.ap;
+		ap->num_pages = 0;
+		ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs);
+		if (!ap->pages) {
+			kfree(wpa);
+			wpa = NULL;
+		}
+	}
+	return wpa;
+}
+
 static int fuse_writepage_locked(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	struct inode *inode = mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	struct fuse_req *req;
+	struct fuse_writepage_args *wpa;
+	struct fuse_args_pages *ap;
 	struct page *tmp_page;
 	int error = -ENOMEM;
 
 	set_page_writeback(page);
 
-	req = fuse_request_alloc_nofs(1);
-	if (!req)
+	wpa = fuse_writepage_args_alloc();
+	if (!wpa)
 		goto err;
+	ap = &wpa->ia.ap;
 
-	/* writeback always goes to bg_queue */
-	__set_bit(FR_BACKGROUND, &req->flags);
 	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 	if (!tmp_page)
 		goto err_free;
 
 	error = -EIO;
-	req->ff = fuse_write_file_get(fc, fi);
-	if (!req->ff)
+	wpa->ia.ff = fuse_write_file_get(fc, fi);
+	if (!wpa->ia.ff)
 		goto err_nofile;
 
-	fuse_write_fill(req, req->ff, page_offset(page), 0);
+	fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
 
 	copy_highpage(tmp_page, page);
-	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
-	req->misc.write.next = NULL;
-	req->in.argpages = 1;
-	req->num_pages = 1;
-	req->pages[0] = tmp_page;
-	req->page_descs[0].offset = 0;
-	req->page_descs[0].length = PAGE_SIZE;
-	req->end = fuse_writepage_end;
-	req->inode = inode;
+	wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
+	wpa->next = NULL;
+	ap->args.in_pages = true;
+	ap->num_pages = 1;
+	ap->pages[0] = tmp_page;
+	ap->descs[0].offset = 0;
+	ap->descs[0].length = PAGE_SIZE;
+	ap->args.end = fuse_writepage_end;
+	wpa->inode = inode;
 
 	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
 	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	spin_lock(&fi->lock);
-	list_add(&req->writepages_entry, &fi->writepages);
-	list_add_tail(&req->list, &fi->queued_writes);
+	list_add(&wpa->writepages_entry, &fi->writepages);
+	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
 	fuse_flush_writepages(inode);
 	spin_unlock(&fi->lock);
 
@@ -1747,7 +1842,7 @@
 err_nofile:
 	__free_page(tmp_page);
 err_free:
-	fuse_request_free(req);
+	kfree(wpa);
 err:
 	mapping_set_error(page->mapping, error);
 	end_page_writeback(page);
@@ -1767,6 +1862,7 @@
 		WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
 
 		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
 		return 0;
 	}
 
@@ -1777,23 +1873,50 @@
 }
 
 struct fuse_fill_wb_data {
-	struct fuse_req *req;
+	struct fuse_writepage_args *wpa;
 	struct fuse_file *ff;
 	struct inode *inode;
 	struct page **orig_pages;
+	unsigned int max_pages;
 };
 
+static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
+{
+	struct fuse_args_pages *ap = &data->wpa->ia.ap;
+	struct fuse_conn *fc = get_fuse_conn(data->inode);
+	struct page **pages;
+	struct fuse_page_desc *descs;
+	unsigned int npages = min_t(unsigned int,
+				    max_t(unsigned int, data->max_pages * 2,
+					  FUSE_DEFAULT_MAX_PAGES_PER_REQ),
+				    fc->max_pages);
+	WARN_ON(npages <= data->max_pages);
+
+	pages = fuse_pages_alloc(npages, GFP_NOFS, &descs);
+	if (!pages)
+		return false;
+
+	memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages);
+	memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages);
+	kfree(ap->pages);
+	ap->pages = pages;
+	ap->descs = descs;
+	data->max_pages = npages;
+
+	return true;
+}
+
 static void fuse_writepages_send(struct fuse_fill_wb_data *data)
 {
-	struct fuse_req *req = data->req;
+	struct fuse_writepage_args *wpa = data->wpa;
 	struct inode *inode = data->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	int num_pages = req->num_pages;
+	int num_pages = wpa->ia.ap.num_pages;
 	int i;
 
-	req->ff = fuse_file_get(data->ff);
+	wpa->ia.ff = fuse_file_get(data->ff);
 	spin_lock(&fi->lock);
-	list_add_tail(&req->list, &fi->queued_writes);
+	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
 	fuse_flush_writepages(inode);
 	spin_unlock(&fi->lock);
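fuse_pages_realloc() above grows the combined pages+descs block geometrically, clamped to fc->max_pages, and memcpy()s both arrays across. A simplified standalone version of that growth step, reusing the single-allocation layout (plain doubling with a cap; the kernel additionally floors the new size at FUSE_DEFAULT_MAX_PAGES_PER_REQ):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct page;				/* opaque stand-in */
struct page_desc { unsigned int length, offset; };

static struct page **pages_alloc(unsigned int n, struct page_desc **descs)
{
	struct page **p = calloc(n, sizeof(struct page *) +
				    sizeof(struct page_desc));
	if (p)
		*descs = (struct page_desc *)(p + n);
	return p;
}

/* Double the combined block, capped at 'limit'; copy both arrays over. */
static bool pages_grow(struct page ***pages, struct page_desc **descs,
		       unsigned int *max, unsigned int used, unsigned int limit)
{
	unsigned int n = *max * 2 > limit ? limit : *max * 2;
	struct page_desc *nd;
	struct page **np;

	if (n <= *max)
		return false;		/* already at the cap */
	np = pages_alloc(n, &nd);
	if (!np)
		return false;
	memcpy(np, *pages, sizeof(struct page *) * used);
	memcpy(nd, *descs, sizeof(struct page_desc) * used);
	free(*pages);			/* the old block held both arrays */
	*pages = np;
	*descs = nd;
	*max = n;
	return true;
}

int main(void)
{
	struct page_desc *descs;
	struct page **pages = pages_alloc(1, &descs);
	unsigned int max = 1;

	if (!pages || !pages_grow(&pages, &descs, &max, 1, 256))
		return 1;
	printf("capacity now %u\n", max);
	free(pages);
	return 0;
}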
 
@@ -1808,54 +1931,52 @@
  * this new request onto the auxiliary list, otherwise reuse the existing one by
  * copying the new page contents over to the old temporary page.
  */
-static bool fuse_writepage_in_flight(struct fuse_req *new_req,
+static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
 				     struct page *page)
 {
-	struct fuse_conn *fc = get_fuse_conn(new_req->inode);
-	struct fuse_inode *fi = get_fuse_inode(new_req->inode);
-	struct fuse_req *tmp;
-	struct fuse_req *old_req;
+	struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
+	struct fuse_writepage_args *tmp;
+	struct fuse_writepage_args *old_wpa;
+	struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
 
-	WARN_ON(new_req->num_pages != 0);
+	WARN_ON(new_ap->num_pages != 0);
 
 	spin_lock(&fi->lock);
-	list_del(&new_req->writepages_entry);
-	old_req = fuse_find_writeback(fi, page->index, page->index);
-	if (!old_req) {
-		list_add(&new_req->writepages_entry, &fi->writepages);
+	list_del(&new_wpa->writepages_entry);
+	old_wpa = fuse_find_writeback(fi, page->index, page->index);
+	if (!old_wpa) {
+		list_add(&new_wpa->writepages_entry, &fi->writepages);
 		spin_unlock(&fi->lock);
 		return false;
 	}
 
-	new_req->num_pages = 1;
-	for (tmp = old_req->misc.write.next; tmp; tmp = tmp->misc.write.next) {
+	new_ap->num_pages = 1;
+	for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
 		pgoff_t curr_index;
 
-		WARN_ON(tmp->inode != new_req->inode);
-		curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT;
+		WARN_ON(tmp->inode != new_wpa->inode);
+		curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
 		if (curr_index == page->index) {
-			WARN_ON(tmp->num_pages != 1);
-			WARN_ON(!test_bit(FR_PENDING, &tmp->flags));
-			swap(tmp->pages[0], new_req->pages[0]);
+			WARN_ON(tmp->ia.ap.num_pages != 1);
+			swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
 			break;
 		}
 	}
 
 	if (!tmp) {
-		new_req->misc.write.next = old_req->misc.write.next;
-		old_req->misc.write.next = new_req;
+		new_wpa->next = old_wpa->next;
+		old_wpa->next = new_wpa;
 	}
 
 	spin_unlock(&fi->lock);
 
 	if (tmp) {
-		struct backing_dev_info *bdi = inode_to_bdi(new_req->inode);
+		struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);
 
 		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-		dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP);
+		dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
 		wb_writeout_inc(&bdi->wb);
-		fuse_writepage_free(fc, new_req);
-		fuse_request_free(new_req);
+		fuse_writepage_free(new_wpa);
 	}
 
 	return true;
@@ -1865,7 +1986,8 @@
 		struct writeback_control *wbc, void *_data)
 {
 	struct fuse_fill_wb_data *data = _data;
-	struct fuse_req *req = data->req;
+	struct fuse_writepage_args *wpa = data->wpa;
+	struct fuse_args_pages *ap = &wpa->ia.ap;
 	struct inode *inode = data->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1888,16 +2010,16 @@
 	 */
 	is_writeback = fuse_page_is_writeback(inode, page->index);
 
-	if (req && req->num_pages &&
-	    (is_writeback || req->num_pages == fc->max_pages ||
-	     (req->num_pages + 1) * PAGE_SIZE > fc->max_write ||
-	     data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
+	if (wpa && ap->num_pages &&
+	    (is_writeback || ap->num_pages == fc->max_pages ||
+	     (ap->num_pages + 1) * PAGE_SIZE > fc->max_write ||
+	     data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) {
 		fuse_writepages_send(data);
-		data->req = NULL;
-	} else if (req && req->num_pages == req->max_pages) {
-		if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) {
+		data->wpa = NULL;
+	} else if (wpa && ap->num_pages == data->max_pages) {
+		if (!fuse_pages_realloc(data)) {
 			fuse_writepages_send(data);
-			req = data->req = NULL;
+			data->wpa = NULL;
 		}
 	}
 
@@ -1915,59 +2037,60 @@
 	 * This is ensured by holding the page lock in page_mkwrite() while
 	 * checking fuse_page_is_writeback().  We already hold the page lock
 	 * since clear_page_dirty_for_io() and keep it held until we add the
-	 * request to the fi->writepages list and increment req->num_pages.
+	 * request to the fi->writepages list and increment ap->num_pages.
 	 * After this fuse_page_is_writeback() will indicate that the page is
 	 * under writeback, so we can release the page lock.
 	 */
-	if (data->req == NULL) {
+	if (data->wpa == NULL) {
 		struct fuse_inode *fi = get_fuse_inode(inode);
 
 		err = -ENOMEM;
-		req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES);
-		if (!req) {
+		wpa = fuse_writepage_args_alloc();
+		if (!wpa) {
 			__free_page(tmp_page);
 			goto out_unlock;
 		}
+		data->max_pages = 1;
 
-		fuse_write_fill(req, data->ff, page_offset(page), 0);
-		req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
-		req->misc.write.next = NULL;
-		req->in.argpages = 1;
-		__set_bit(FR_BACKGROUND, &req->flags);
-		req->num_pages = 0;
-		req->end = fuse_writepage_end;
-		req->inode = inode;
+		ap = &wpa->ia.ap;
+		fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0);
+		wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
+		wpa->next = NULL;
+		ap->args.in_pages = true;
+		ap->args.end = fuse_writepage_end;
+		ap->num_pages = 0;
+		wpa->inode = inode;
 
 		spin_lock(&fi->lock);
-		list_add(&req->writepages_entry, &fi->writepages);
+		list_add(&wpa->writepages_entry, &fi->writepages);
 		spin_unlock(&fi->lock);
 
-		data->req = req;
+		data->wpa = wpa;
 	}
 	set_page_writeback(page);
 
 	copy_highpage(tmp_page, page);
-	req->pages[req->num_pages] = tmp_page;
-	req->page_descs[req->num_pages].offset = 0;
-	req->page_descs[req->num_pages].length = PAGE_SIZE;
+	ap->pages[ap->num_pages] = tmp_page;
+	ap->descs[ap->num_pages].offset = 0;
+	ap->descs[ap->num_pages].length = PAGE_SIZE;
 
 	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
 	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	err = 0;
-	if (is_writeback && fuse_writepage_in_flight(req, page)) {
+	if (is_writeback && fuse_writepage_in_flight(wpa, page)) {
 		end_page_writeback(page);
-		data->req = NULL;
+		data->wpa = NULL;
 		goto out_unlock;
 	}
-	data->orig_pages[req->num_pages] = page;
+	data->orig_pages[ap->num_pages] = page;
 
 	/*
 	 * Protected by fi->lock against concurrent access by
 	 * fuse_page_is_writeback().
 	 */
 	spin_lock(&fi->lock);
-	req->num_pages++;
+	ap->num_pages++;
 	spin_unlock(&fi->lock);
 
 out_unlock:
@@ -1989,7 +2112,7 @@
 		goto out;
 
 	data.inode = inode;
-	data.req = NULL;
+	data.wpa = NULL;
 	data.ff = NULL;
 
 	err = -ENOMEM;
@@ -2000,9 +2123,9 @@
 		goto out;
 
 	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
-	if (data.req) {
+	if (data.wpa) {
 		/* Ignore errors if we can write at least one page */
-		BUG_ON(!data.req->num_pages);
+		WARN_ON(!data.wpa->ia.ap.num_pages);
 		fuse_writepages_send(&data);
 		err = 0;
 	}
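Throughout these writepages hunks, data->req becomes data->wpa: struct fuse_writepage_args bundles a fuse_args_pages with the writeback bookkeeping (the inode, the ->next chain, writepages_entry) that previously lived in fuse_req. Note also that the BUG_ON() is relaxed to WARN_ON(): an empty page list here would indicate a kernel bug, but not one worth halting the machine over.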
@@ -2222,11 +2345,11 @@
 	inarg->lk.pid = pid;
 	if (flock)
 		inarg->lk_flags |= FUSE_LK_FLOCK;
-	args->in.h.opcode = opcode;
-	args->in.h.nodeid = get_node_id(inode);
-	args->in.numargs = 1;
-	args->in.args[0].size = sizeof(*inarg);
-	args->in.args[0].value = inarg;
+	args->opcode = opcode;
+	args->nodeid = get_node_id(inode);
+	args->in_numargs = 1;
+	args->in_args[0].size = sizeof(*inarg);
+	args->in_args[0].value = inarg;
 }
 
 static int fuse_getlk(struct file *file, struct file_lock *fl)
@@ -2239,9 +2362,9 @@
 	int err;
 
 	fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, &args);
 	if (!err)
 		err = convert_fuse_file_lock(fc, &outarg.lk, fl);
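The conversion above is mechanical: the nested args->in.h.* and args->out.* fields become flat members of struct fuse_args. As a hedged, illustrative fragment (assuming fc, inode and err are in scope, as in the surrounding call sites), a synchronous request is now built like so:

    FUSE_ARGS(args);
    struct fuse_getattr_in inarg = {};    /* any fixed-size input struct */
    struct fuse_attr_out outarg;

    args.opcode = FUSE_GETATTR;
    args.nodeid = get_node_id(inode);
    args.in_numargs = 1;
    args.in_args[0].size = sizeof(inarg);
    args.in_args[0].value = &inarg;
    args.out_numargs = 1;
    args.out_args[0].size = sizeof(outarg);
    args.out_args[0].value = &outarg;
    err = fuse_simple_request(fc, &args);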
@@ -2336,14 +2459,14 @@
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.block = block;
 	inarg.blocksize = inode->i_sb->s_blocksize;
-	args.in.h.opcode = FUSE_BMAP;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
+	args.opcode = FUSE_BMAP;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, &args);
 	if (err == -ENOSYS)
 		fc->no_bmap = 1;
@@ -2368,14 +2491,14 @@
 	if (fc->no_lseek)
 		goto fallback;
 
-	args.in.h.opcode = FUSE_LSEEK;
-	args.in.h.nodeid = ff->nodeid;
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
+	args.opcode = FUSE_LSEEK;
+	args.nodeid = ff->nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, &args);
 	if (err) {
 		if (err == -ENOSYS) {
@@ -2573,14 +2696,14 @@
 		.flags = flags
 	};
 	struct fuse_ioctl_out outarg;
-	struct fuse_req *req = NULL;
-	struct page **pages = NULL;
 	struct iovec *iov_page = NULL;
 	struct iovec *in_iov = NULL, *out_iov = NULL;
-	unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
-	size_t in_size, out_size, transferred, c;
+	unsigned int in_iovs = 0, out_iovs = 0, max_pages;
+	size_t in_size, out_size, c;
+	ssize_t transferred;
 	int err, i;
 	struct iov_iter ii;
+	struct fuse_args_pages ap = {};
 
 #if BITS_PER_LONG == 32
 	inarg.flags |= FUSE_IOCTL_32BIT;
@@ -2598,11 +2721,13 @@
 	BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
 
 	err = -ENOMEM;
-	pages = kcalloc(fc->max_pages, sizeof(pages[0]), GFP_KERNEL);
+	ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs);
 	iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
-	if (!pages || !iov_page)
+	if (!ap.pages || !iov_page)
 		goto out;
 
+	fuse_page_descs_length_init(ap.descs, 0, fc->max_pages);
+
 	/*
 	 * If restricted, initialize IO parameters as encoded in @cmd.
 	 * RETRY from server is not allowed.
@@ -2639,56 +2764,44 @@
 	err = -ENOMEM;
 	if (max_pages > fc->max_pages)
 		goto out;
-	while (num_pages < max_pages) {
-		pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
-		if (!pages[num_pages])
+	while (ap.num_pages < max_pages) {
+		ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+		if (!ap.pages[ap.num_pages])
 			goto out;
-		num_pages++;
+		ap.num_pages++;
 	}
 
-	req = fuse_get_req(fc, num_pages);
-	if (IS_ERR(req)) {
-		err = PTR_ERR(req);
-		req = NULL;
-		goto out;
-	}
-	memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
-	req->num_pages = num_pages;
-	fuse_page_descs_length_init(req, 0, req->num_pages);
 
 	/* okay, let's send it to the client */
-	req->in.h.opcode = FUSE_IOCTL;
-	req->in.h.nodeid = ff->nodeid;
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(inarg);
-	req->in.args[0].value = &inarg;
+	ap.args.opcode = FUSE_IOCTL;
+	ap.args.nodeid = ff->nodeid;
+	ap.args.in_numargs = 1;
+	ap.args.in_args[0].size = sizeof(inarg);
+	ap.args.in_args[0].value = &inarg;
 	if (in_size) {
-		req->in.numargs++;
-		req->in.args[1].size = in_size;
-		req->in.argpages = 1;
+		ap.args.in_numargs++;
+		ap.args.in_args[1].size = in_size;
+		ap.args.in_pages = true;
 
 		err = -EFAULT;
 		iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
-		for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) {
-			c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii);
+		for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
+			c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
 			if (c != PAGE_SIZE && iov_iter_count(&ii))
 				goto out;
 		}
 	}
 
-	req->out.numargs = 2;
-	req->out.args[0].size = sizeof(outarg);
-	req->out.args[0].value = &outarg;
-	req->out.args[1].size = out_size;
-	req->out.argpages = 1;
-	req->out.argvar = 1;
+	ap.args.out_numargs = 2;
+	ap.args.out_args[0].size = sizeof(outarg);
+	ap.args.out_args[0].value = &outarg;
+	ap.args.out_args[1].size = out_size;
+	ap.args.out_pages = true;
+	ap.args.out_argvar = true;
 
-	fuse_request_send(fc, req);
-	err = req->out.h.error;
-	transferred = req->out.args[1].size;
-	fuse_put_request(fc, req);
-	req = NULL;
-	if (err)
+	transferred = fuse_simple_request(fc, &ap.args);
+	err = transferred;
+	if (transferred < 0)
 		goto out;
 
 	/* did it ask for retry? */
@@ -2713,7 +2826,7 @@
 		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
 			goto out;
 
-		vaddr = kmap_atomic(pages[0]);
+		vaddr = kmap_atomic(ap.pages[0]);
 		err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
 					    transferred, in_iovs + out_iovs,
 					    (flags & FUSE_IOCTL_COMPAT) != 0);
@@ -2741,19 +2854,17 @@
 
 	err = -EFAULT;
 	iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
-	for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) {
-		c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii);
+	for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
+		c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
 		if (c != PAGE_SIZE && iov_iter_count(&ii))
 			goto out;
 	}
 	err = 0;
  out:
-	if (req)
-		fuse_put_request(fc, req);
 	free_page((unsigned long) iov_page);
-	while (num_pages)
-		__free_page(pages[--num_pages]);
-	kfree(pages);
+	while (ap.num_pages)
+		__free_page(ap.pages[--ap.num_pages]);
+	kfree(ap.pages);
 
 	return err ? err : outarg.result;
 }
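Note the return convention relied on here: fuse_simple_request() returns an ssize_t that is negative on error and, when out_argvar is set, the actual size of the variable-length last out-argument on success. That replaces the old dance of reading req->out.h.error and req->out.args[1].size separately, and makes the explicit fuse_put_request() calls unnecessary.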
@@ -2861,14 +2972,14 @@
 		fuse_register_polled_file(fc, ff);
 	}
 
-	args.in.h.opcode = FUSE_POLL;
-	args.in.h.nodeid = ff->nodeid;
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
+	args.opcode = FUSE_POLL;
+	args.nodeid = ff->nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, &args);
 
 	if (!err)
@@ -3076,11 +3187,11 @@
 	if (!(mode & FALLOC_FL_KEEP_SIZE))
 		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
-	args.in.h.opcode = FUSE_FALLOCATE;
-	args.in.h.nodeid = ff->nodeid;
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
+	args.opcode = FUSE_FALLOCATE;
+	args.nodeid = ff->nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
 	err = fuse_simple_request(fc, &args);
 	if (err == -ENOSYS) {
 		fc->no_fallocate = 1;
@@ -3168,14 +3279,14 @@
 	if (is_unstable)
 		set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
 
-	args.in.h.opcode = FUSE_COPY_FILE_RANGE;
-	args.in.h.nodeid = ff_in->nodeid;
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
+	args.opcode = FUSE_COPY_FILE_RANGE;
+	args.nodeid = ff_in->nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, &args);
 	if (err == -ENOSYS) {
 		fc->no_copy_file_range = 1;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 24dbca7..956aeaf 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -47,9 +47,6 @@
 /** Number of dentries for each connection in the control filesystem */
 #define FUSE_CTL_NUM_DENTRIES 5
 
-/** Number of page pointers embedded in fuse_req */
-#define FUSE_REQ_INLINE_PAGES 1
-
 /** List of active connections */
 extern struct list_head fuse_conn_list;
 
@@ -164,17 +161,15 @@
 };
 
 struct fuse_conn;
+struct fuse_release_args;
 
 /** FUSE specific file data */
 struct fuse_file {
 	/** Fuse connection for this file */
 	struct fuse_conn *fc;
 
-	/*
-	 * Request reserved for flush and release.
-	 * Modified under relative fuse_inode::lock.
-	 */
-	struct fuse_req *reserved_req;
+	/* Argument space reserved for release */
+	struct fuse_release_args *release_args;
 
 	/** Kernel file handle guaranteed to be unique */
 	u64 kh;
@@ -229,57 +224,12 @@
 	const void *value;
 };
 
-/** The request input */
-struct fuse_in {
-	/** The request header */
-	struct fuse_in_header h;
-
-	/** True if the data for the last argument is in req->pages */
-	unsigned argpages:1;
-
-	/** Number of arguments */
-	unsigned numargs;
-
-	/** Array of arguments */
-	struct fuse_in_arg args[3];
-};
-
 /** One output argument of a request */
 struct fuse_arg {
 	unsigned size;
 	void *value;
 };
 
-/** The request output */
-struct fuse_out {
-	/** Header returned from userspace */
-	struct fuse_out_header h;
-
-	/*
-	 * The following bitfields are not changed during the request
-	 * processing
-	 */
-
-	/** Last argument is variable length (can be shorter than
-	    arg->size) */
-	unsigned argvar:1;
-
-	/** Last argument is a list of pages to copy data to */
-	unsigned argpages:1;
-
-	/** Zero partially or not copied pages */
-	unsigned page_zeroing:1;
-
-	/** Pages may be replaced with new ones */
-	unsigned page_replace:1;
-
-	/** Number or arguments */
-	unsigned numargs;
-
-	/** Array of arguments */
-	struct fuse_arg args[2];
-};
-
 /** FUSE page descriptor */
 struct fuse_page_desc {
 	unsigned int length;
@@ -287,20 +237,28 @@
 };
 
 struct fuse_args {
-	struct {
-		struct {
-			uint32_t opcode;
-			uint64_t nodeid;
-		} h;
-		unsigned numargs;
-		struct fuse_in_arg args[3];
+	uint64_t nodeid;
+	uint32_t opcode;
+	unsigned short in_numargs;
+	unsigned short out_numargs;
+	bool force:1;
+	bool noreply:1;
+	bool nocreds:1;
+	bool in_pages:1;
+	bool out_pages:1;
+	bool out_argvar:1;
+	bool page_zeroing:1;
+	bool page_replace:1;
+	struct fuse_in_arg in_args[3];
+	struct fuse_arg out_args[2];
+	void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error);
+};
 
-	} in;
-	struct {
-		unsigned argvar:1;
-		unsigned numargs;
-		struct fuse_arg args[2];
-	} out;
+struct fuse_args_pages {
+	struct fuse_args args;
+	struct page **pages;
+	struct fuse_page_desc *descs;
+	unsigned int num_pages;
 };
 
 #define FUSE_ARGS(args) struct fuse_args args = {}
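For page-backed requests, struct fuse_args_pages takes over the role of the page vector formerly embedded in fuse_req. A minimal sketch, mirroring fuse_readdir_uncached() later in this patch (page, fc and res assumed in scope):

    struct fuse_args_pages ap = {};
    struct fuse_page_desc desc = { .length = PAGE_SIZE };

    ap.args.out_pages = true;    /* data for the last out-arg lands in ap.pages */
    ap.num_pages = 1;
    ap.pages = &page;
    ap.descs = &desc;
    /* ... fill ap.args.opcode, in_args[] and out_args[] as usual ... */
    res = fuse_simple_request(fc, &ap.args);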
@@ -373,83 +331,79 @@
 	/** Entry on the interrupts list  */
 	struct list_head intr_entry;
 
+	/* Input/output arguments */
+	struct fuse_args *args;
+
 	/** refcount */
 	refcount_t count;
 
 	/* Request flags, updated with test/set/clear_bit() */
 	unsigned long flags;
 
-	/** The request input */
-	struct fuse_in in;
+	/* The request input header */
+	struct {
+		struct fuse_in_header h;
+	} in;
 
-	/** The request output */
-	struct fuse_out out;
+	/* The request output header */
+	struct {
+		struct fuse_out_header h;
+	} out;
 
 	/** Used to wake up the task waiting for completion of request*/
 	wait_queue_head_t waitq;
 
-	/** Data for asynchronous requests */
-	union {
-		struct {
-			struct fuse_release_in in;
-			struct inode *inode;
-		} release;
-		struct fuse_init_in init_in;
-		struct fuse_init_out init_out;
-		struct cuse_init_in cuse_init_in;
-		struct {
-			struct fuse_read_in in;
-			u64 attr_ver;
-		} read;
-		struct {
-			struct fuse_write_in in;
-			struct fuse_write_out out;
-			struct fuse_req *next;
-		} write;
-		struct fuse_notify_retrieve_in retrieve_in;
-	} misc;
-
-	/** page vector */
-	struct page **pages;
-
-	/** page-descriptor vector */
-	struct fuse_page_desc *page_descs;
-
-	/** size of the 'pages' array */
-	unsigned max_pages;
-
-	/** inline page vector */
-	struct page *inline_pages[FUSE_REQ_INLINE_PAGES];
-
-	/** inline page-descriptor vector */
-	struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES];
-
-	/** number of pages in vector */
-	unsigned num_pages;
-
-	/** File used in the request (or NULL) */
-	struct fuse_file *ff;
-
-	/** Inode used in the request or NULL */
-	struct inode *inode;
-
-	/** AIO control block */
-	struct fuse_io_priv *io;
-
-	/** Link on fi->writepages */
-	struct list_head writepages_entry;
-
-	/** Request completion callback */
-	void (*end)(struct fuse_conn *, struct fuse_req *);
-
-	/** Request is stolen from fuse_file->reserved_req */
-	struct file *stolen_file;
+#if IS_ENABLED(CONFIG_VIRTIO_FS)
+	/** virtio-fs's physically contiguous buffer for in and out args */
+	void *argbuf;
+#endif
 };
 
+struct fuse_iqueue;
+
+/**
+ * Input queue callbacks
+ *
+ * Input queue signalling is device-specific.  For example, the /dev/fuse file
+ * uses fiq->waitq and fasync to wake processes that are waiting on queue
+ * readiness.  These callbacks allow other device types to respond to input
+ * queue activity.
+ */
+struct fuse_iqueue_ops {
+	/**
+	 * Signal that a forget has been queued
+	 */
+	void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
+		__releases(fiq->lock);
+
+	/**
+	 * Signal that an INTERRUPT request has been queued
+	 */
+	void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
+		__releases(fiq->lock);
+
+	/**
+	 * Signal that a request has been queued
+	 */
+	void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
+		__releases(fiq->lock);
+
+	/**
+	 * Clean up when fuse_iqueue is destroyed
+	 */
+	void (*release)(struct fuse_iqueue *fiq);
+};
+
+/** /dev/fuse input queue operations */
+extern const struct fuse_iqueue_ops fuse_dev_fiq_ops;
+
 struct fuse_iqueue {
 	/** Connection established */
 	unsigned connected;
 
+	/** Lock protecting accesses to members of this structure */
+	spinlock_t lock;
+
 	/** Readers of the connection are waiting on this */
 	wait_queue_head_t waitq;
 
@@ -471,6 +425,12 @@
 
 	/** O_ASYNC requests */
 	struct fasync_struct *fasync;
+
+	/** Device-specific callbacks */
+	const struct fuse_iqueue_ops *ops;
+
+	/** Device-specific state */
+	void *priv;
 };
 
 #define FUSE_PQ_HASH_BITS 8
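virtio_fs.c at the end of this patch is the first non-/dev/fuse user of these hooks. A minimal sketch of a device-specific implementation (the mydev transport is hypothetical); each wake callback is entered with fiq->lock held and must release it, per the __releases() annotations:

    static void mydev_wake_pending_and_unlock(struct fuse_iqueue *fiq)
        __releases(fiq->lock)
    {
        struct fuse_req *req;

        /* dequeue under fiq->lock, then drop it before doing I/O */
        req = list_first_entry(&fiq->pending, struct fuse_req, list);
        clear_bit(FR_PENDING, &req->flags);
        list_del_init(&req->list);
        spin_unlock(&fiq->lock);
        /* hand req to the device-specific transport here */
    }

    static const struct fuse_iqueue_ops mydev_fiq_ops = {
        .wake_forget_and_unlock    = mydev_wake_forget_and_unlock,    /* analogous, hypothetical */
        .wake_interrupt_and_unlock = mydev_wake_interrupt_and_unlock, /* analogous, hypothetical */
        .wake_pending_and_unlock   = mydev_wake_pending_and_unlock,
    };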
@@ -504,6 +464,29 @@
 	struct list_head entry;
 };
 
+struct fuse_fs_context {
+	int fd;
+	unsigned int rootmode;
+	kuid_t user_id;
+	kgid_t group_id;
+	bool is_bdev:1;
+	bool fd_present:1;
+	bool rootmode_present:1;
+	bool user_id_present:1;
+	bool group_id_present:1;
+	bool default_permissions:1;
+	bool allow_other:1;
+	bool destroy:1;
+	bool no_control:1;
+	bool no_force_umount:1;
+	unsigned int max_read;
+	unsigned int blksize;
+	const char *subtype;
+
+	/* fuse_dev pointer to fill in, should contain NULL on entry */
+	void **fudptr;
+};
+
 /**
  * A Fuse connection.
  *
@@ -584,9 +567,6 @@
 	/** waitq for blocked connection */
 	wait_queue_head_t blocked_waitq;
 
-	/** waitq for reserved requests */
-	wait_queue_head_t reserved_req_waitq;
-
 	/** Connection established, cleared on umount, connection
 	    abort and device release */
 	unsigned connected;
@@ -721,6 +701,18 @@
 	/** Does the filesystem support copy_file_range? */
 	unsigned no_copy_file_range:1;
 
+	/* Send DESTROY request */
+	unsigned int destroy:1;
+
+	/* Delete dentries that have gone stale */
+	unsigned int delete_stale:1;
+
+	/** Do not create entry in fusectl fs */
+	unsigned int no_control:1;
+
+	/** Do not allow MNT_FORCE umount */
+	unsigned int no_force_umount:1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
@@ -742,9 +734,6 @@
 	/** Key for lock owner ID scrambling */
 	u32 scramble_key[4];
 
-	/** Reserved request for the DESTROY message */
-	struct fuse_req *destroy_req;
-
 	/** Version counter for attribute changes */
 	atomic64_t attr_version;
 
@@ -820,14 +809,32 @@
 
 struct fuse_forget_link *fuse_alloc_forget(void);
 
-/* Used by READDIRPLUS */
-void fuse_force_forget(struct file *file, u64 nodeid);
+struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
+					     unsigned int max,
+					     unsigned int *countp);
 
-/**
+/*
  * Initialize READ or READDIR request
  */
-void fuse_read_fill(struct fuse_req *req, struct file *file,
-		    loff_t pos, size_t count, int opcode);
+struct fuse_io_args {
+	union {
+		struct {
+			struct fuse_read_in in;
+			u64 attr_ver;
+		} read;
+		struct {
+			struct fuse_write_in in;
+			struct fuse_write_out out;
+		} write;
+	};
+	struct fuse_args_pages ap;
+	struct fuse_io_priv *io;
+	struct fuse_file *ff;
+};
+
+void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
+			 size_t count, int opcode);
+
 
 /**
  * Send OPEN or OPENDIR request
@@ -900,61 +907,16 @@
 void __exit fuse_ctl_cleanup(void);
 
 /**
- * Allocate a request
- */
-struct fuse_req *fuse_request_alloc(unsigned npages);
-
-struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
-
-bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req,
-			    gfp_t flags);
-
-
-/**
- * Free a request
- */
-void fuse_request_free(struct fuse_req *req);
-
-/**
- * Get a request, may fail with -ENOMEM,
- * caller should specify # elements in req->pages[] explicitly
- */
-struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages);
-struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
-					     unsigned npages);
-
-/*
- * Increment reference count on request
- */
-void __fuse_get_request(struct fuse_req *req);
-
-/**
- * Gets a requests for a file operation, always succeeds
- */
-struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
-					     struct file *file);
-
-/**
- * Decrement reference count of a request.  If count goes to zero free
- * the request.
- */
-void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
-
-/**
- * Send a request (synchronous)
- */
-void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
-
-/**
  * Simple request sending that does request allocation and freeing
  */
 ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
+int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
+			   gfp_t gfp_flags);
 
 /**
- * Send a request in the background
+ * End a finished request
  */
-void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
-bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req);
 
 /* Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
@@ -980,15 +942,33 @@
 /**
  * Initialize fuse_conn
  */
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns);
+void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+		    const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv);
 
 /**
  * Release reference to fuse_conn
  */
 void fuse_conn_put(struct fuse_conn *fc);
 
-struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc);
+struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
+struct fuse_dev *fuse_dev_alloc(void);
+void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
 void fuse_dev_free(struct fuse_dev *fud);
+void fuse_send_init(struct fuse_conn *fc);
+
+/**
+ * Fill in superblock and initialize fuse connection
+ * @sb: partially-initialized superblock to fill in
+ * @ctx: mount context
+ */
+int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx);
+
+/**
+ * Disassociate fuse connection from superblock and kill the superblock
+ *
+ * Calls kill_anon_super(), do not use with bdev mounts.
+ */
+void fuse_kill_sb_anon(struct super_block *sb);
 
 /**
  * Add connection to control filesystem
@@ -1093,4 +1073,15 @@
 /* readdir.c */
 int fuse_readdir(struct file *file, struct dir_context *ctx);
 
+/**
+ * Return the number of bytes in an arguments list
+ */
+unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
+
+/**
+ * Get the next unique ID for a request
+ */
+u64 fuse_get_unique(struct fuse_iqueue *fiq);
+void fuse_free_conn(struct fuse_conn *fc);
+
 #endif /* _FS_FUSE_I_H */
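fuse_send_init() in inode.c below is the canonical fuse_simple_background() user: the caller embeds struct fuse_args in a larger allocation, points args->end at a completion callback, and frees the allocation there. Condensed to a hedged sketch (the wrapper, argument structs and callback names are hypothetical):

    struct my_async_args {        /* hypothetical wrapper */
        struct fuse_args args;
        struct my_in in;
        struct my_out out;
    };

    static void my_end(struct fuse_conn *fc, struct fuse_args *args, int error)
    {
        struct my_async_args *ia = container_of(args, typeof(*ia), args);

        /* consume ia->out (unless error is set), then free the wrapper */
        kfree(ia);
    }

    /* ... allocate ia, fill ia->args and ia->in, set ia->args.end = my_end, then: */
    if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0)
        my_end(fc, &ia->args, -ENOTCONN);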
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4bb885b..e040e2a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -15,7 +15,8 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 #include <linux/statfs.h>
 #include <linux/random.h>
 #include <linux/sched.h>
@@ -59,24 +60,13 @@
 /** Congestion starts at 75% of maximum */
 #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
 
-struct fuse_mount_data {
-	int fd;
-	unsigned rootmode;
-	kuid_t user_id;
-	kgid_t group_id;
-	unsigned fd_present:1;
-	unsigned rootmode_present:1;
-	unsigned user_id_present:1;
-	unsigned group_id_present:1;
-	unsigned default_permissions:1;
-	unsigned allow_other:1;
-	unsigned max_read;
-	unsigned blksize;
-};
+#ifdef CONFIG_BLOCK
+static struct file_system_type fuseblk_fs_type;
+#endif
 
 struct fuse_forget_link *fuse_alloc_forget(void)
 {
-	return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
+	return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT);
 }
 
 static struct inode *fuse_alloc_inode(struct super_block *sb)
@@ -374,19 +364,21 @@
 
 static void fuse_umount_begin(struct super_block *sb)
 {
-	fuse_abort_conn(get_fuse_conn_super(sb));
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+	if (!fc->no_force_umount)
+		fuse_abort_conn(fc);
 }
 
 static void fuse_send_destroy(struct fuse_conn *fc)
 {
-	struct fuse_req *req = fc->destroy_req;
-	if (req && fc->conn_init) {
-		fc->destroy_req = NULL;
-		req->in.h.opcode = FUSE_DESTROY;
-		__set_bit(FR_FORCE, &req->flags);
-		__clear_bit(FR_BACKGROUND, &req->flags);
-		fuse_request_send(fc, req);
-		fuse_put_request(fc, req);
+	if (fc->conn_init) {
+		FUSE_ARGS(args);
+
+		args.opcode = FUSE_DESTROY;
+		args.force = true;
+		args.nocreds = true;
+		fuse_simple_request(fc, &args);
 	}
 }
 
@@ -430,12 +422,12 @@
 	}
 
 	memset(&outarg, 0, sizeof(outarg));
-	args.in.numargs = 0;
-	args.in.h.opcode = FUSE_STATFS;
-	args.in.h.nodeid = get_node_id(d_inode(dentry));
-	args.out.numargs = 1;
-	args.out.args[0].size = sizeof(outarg);
-	args.out.args[0].value = &outarg;
+	args.in_numargs = 0;
+	args.opcode = FUSE_STATFS;
+	args.nodeid = get_node_id(d_inode(dentry));
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(outarg);
+	args.out_args[0].value = &outarg;
 	err = fuse_simple_request(fc, &args);
 	if (!err)
 		convert_fuse_statfs(buf, &outarg.st);
@@ -443,6 +435,8 @@
 }
 
 enum {
+	OPT_SOURCE,
+	OPT_SUBTYPE,
 	OPT_FD,
 	OPT_ROOTMODE,
 	OPT_USER_ID,
@@ -454,111 +448,109 @@
 	OPT_ERR
 };
 
-static const match_table_t tokens = {
-	{OPT_FD,			"fd=%u"},
-	{OPT_ROOTMODE,			"rootmode=%o"},
-	{OPT_USER_ID,			"user_id=%u"},
-	{OPT_GROUP_ID,			"group_id=%u"},
-	{OPT_DEFAULT_PERMISSIONS,	"default_permissions"},
-	{OPT_ALLOW_OTHER,		"allow_other"},
-	{OPT_MAX_READ,			"max_read=%u"},
-	{OPT_BLKSIZE,			"blksize=%u"},
-	{OPT_ERR,			NULL}
+static const struct fs_parameter_spec fuse_param_specs[] = {
+	fsparam_string	("source",		OPT_SOURCE),
+	fsparam_u32	("fd",			OPT_FD),
+	fsparam_u32oct	("rootmode",		OPT_ROOTMODE),
+	fsparam_u32	("user_id",		OPT_USER_ID),
+	fsparam_u32	("group_id",		OPT_GROUP_ID),
+	fsparam_flag	("default_permissions",	OPT_DEFAULT_PERMISSIONS),
+	fsparam_flag	("allow_other",		OPT_ALLOW_OTHER),
+	fsparam_u32	("max_read",		OPT_MAX_READ),
+	fsparam_u32	("blksize",		OPT_BLKSIZE),
+	fsparam_string	("subtype",		OPT_SUBTYPE),
+	{}
 };
 
-static int fuse_match_uint(substring_t *s, unsigned int *res)
+static const struct fs_parameter_description fuse_fs_parameters = {
+	.name		= "fuse",
+	.specs		= fuse_param_specs,
+};
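For example, the option string "fd=10,rootmode=40000,user_id=1000,group_id=1000" is split into single parameters by the VFS; fs_parse() matches each against this table and hands back OPT_FD with result.uint_32 == 10, OPT_ROOTMODE with the octal value 040000, and so on, so the switch below never parses text itself.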
+
+static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	int err = -ENOMEM;
-	char *buf = match_strdup(s);
-	if (buf) {
-		err = kstrtouint(buf, 10, res);
-		kfree(buf);
-	}
-	return err;
-}
+	struct fs_parse_result result;
+	struct fuse_fs_context *ctx = fc->fs_private;
+	int opt;
 
-static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev,
-			  struct user_namespace *user_ns)
-{
-	char *p;
-	memset(d, 0, sizeof(struct fuse_mount_data));
-	d->max_read = ~0;
-	d->blksize = FUSE_DEFAULT_BLKSIZE;
+	opt = fs_parse(fc, &fuse_fs_parameters, param, &result);
+	if (opt < 0)
+		return opt;
 
-	while ((p = strsep(&opt, ",")) != NULL) {
-		int token;
-		int value;
-		unsigned uv;
-		substring_t args[MAX_OPT_ARGS];
-		if (!*p)
-			continue;
+	switch (opt) {
+	case OPT_SOURCE:
+		if (fc->source)
+			return invalf(fc, "fuse: Multiple sources specified");
+		fc->source = param->string;
+		param->string = NULL;
+		break;
 
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case OPT_FD:
-			if (match_int(&args[0], &value))
-				return 0;
-			d->fd = value;
-			d->fd_present = 1;
-			break;
-
-		case OPT_ROOTMODE:
-			if (match_octal(&args[0], &value))
-				return 0;
-			if (!fuse_valid_type(value))
-				return 0;
-			d->rootmode = value;
-			d->rootmode_present = 1;
-			break;
-
-		case OPT_USER_ID:
-			if (fuse_match_uint(&args[0], &uv))
-				return 0;
-			d->user_id = make_kuid(user_ns, uv);
-			if (!uid_valid(d->user_id))
-				return 0;
-			d->user_id_present = 1;
-			break;
-
-		case OPT_GROUP_ID:
-			if (fuse_match_uint(&args[0], &uv))
-				return 0;
-			d->group_id = make_kgid(user_ns, uv);
-			if (!gid_valid(d->group_id))
-				return 0;
-			d->group_id_present = 1;
-			break;
-
-		case OPT_DEFAULT_PERMISSIONS:
-			d->default_permissions = 1;
-			break;
-
-		case OPT_ALLOW_OTHER:
-			d->allow_other = 1;
-			break;
-
-		case OPT_MAX_READ:
-			if (match_int(&args[0], &value))
-				return 0;
-			d->max_read = value;
-			break;
-
-		case OPT_BLKSIZE:
-			if (!is_bdev || match_int(&args[0], &value))
-				return 0;
-			d->blksize = value;
-			break;
-
-		default:
-			return 0;
-		}
-	}
-
-	if (!d->fd_present || !d->rootmode_present ||
-	    !d->user_id_present || !d->group_id_present)
+	case OPT_SUBTYPE:
+		if (ctx->subtype)
+			return invalf(fc, "fuse: Multiple subtypes specified");
+		ctx->subtype = param->string;
+		param->string = NULL;
 		return 0;
 
-	return 1;
+	case OPT_FD:
+		ctx->fd = result.uint_32;
+		ctx->fd_present = 1;
+		break;
+
+	case OPT_ROOTMODE:
+		if (!fuse_valid_type(result.uint_32))
+			return invalf(fc, "fuse: Invalid rootmode");
+		ctx->rootmode = result.uint_32;
+		ctx->rootmode_present = 1;
+		break;
+
+	case OPT_USER_ID:
+		ctx->user_id = make_kuid(fc->user_ns, result.uint_32);
+		if (!uid_valid(ctx->user_id))
+			return invalf(fc, "fuse: Invalid user_id");
+		ctx->user_id_present = 1;
+		break;
+
+	case OPT_GROUP_ID:
+		ctx->group_id = make_kgid(fc->user_ns, result.uint_32);
+		if (!gid_valid(ctx->group_id))
+			return invalf(fc, "fuse: Invalid group_id");
+		ctx->group_id_present = 1;
+		break;
+
+	case OPT_DEFAULT_PERMISSIONS:
+		ctx->default_permissions = 1;
+		break;
+
+	case OPT_ALLOW_OTHER:
+		ctx->allow_other = 1;
+		break;
+
+	case OPT_MAX_READ:
+		ctx->max_read = result.uint_32;
+		break;
+
+	case OPT_BLKSIZE:
+		if (!ctx->is_bdev)
+			return invalf(fc, "fuse: blksize only supported for fuseblk");
+		ctx->blksize = result.uint_32;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void fuse_free_fc(struct fs_context *fc)
+{
+	struct fuse_fs_context *ctx = fc->fs_private;
+
+	if (ctx) {
+		kfree(ctx->subtype);
+		kfree(ctx);
+	}
 }
 
 static int fuse_show_options(struct seq_file *m, struct dentry *root)
@@ -579,14 +571,19 @@
 	return 0;
 }
 
-static void fuse_iqueue_init(struct fuse_iqueue *fiq)
+static void fuse_iqueue_init(struct fuse_iqueue *fiq,
+			     const struct fuse_iqueue_ops *ops,
+			     void *priv)
 {
 	memset(fiq, 0, sizeof(struct fuse_iqueue));
+	spin_lock_init(&fiq->lock);
 	init_waitqueue_head(&fiq->waitq);
 	INIT_LIST_HEAD(&fiq->pending);
 	INIT_LIST_HEAD(&fiq->interrupts);
 	fiq->forget_list_tail = &fiq->forget_list_head;
 	fiq->connected = 1;
+	fiq->ops = ops;
+	fiq->priv = priv;
 }
 
 static void fuse_pqueue_init(struct fuse_pqueue *fpq)
@@ -600,7 +597,8 @@
 	fpq->connected = 1;
 }
 
-void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
+void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
+		    const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv)
 {
 	memset(fc, 0, sizeof(*fc));
 	spin_lock_init(&fc->lock);
@@ -609,8 +607,7 @@
 	refcount_set(&fc->count, 1);
 	atomic_set(&fc->dev_count, 1);
 	init_waitqueue_head(&fc->blocked_waitq);
-	init_waitqueue_head(&fc->reserved_req_waitq);
-	fuse_iqueue_init(&fc->iq);
+	fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
 	INIT_LIST_HEAD(&fc->bg_queue);
 	INIT_LIST_HEAD(&fc->entry);
 	INIT_LIST_HEAD(&fc->devices);
@@ -633,8 +630,10 @@
 void fuse_conn_put(struct fuse_conn *fc)
 {
 	if (refcount_dec_and_test(&fc->count)) {
-		if (fc->destroy_req)
-			fuse_request_free(fc->destroy_req);
+		struct fuse_iqueue *fiq = &fc->iq;
+
+		if (fiq->ops->release)
+			fiq->ops->release(fiq);
 		put_pid_ns(fc->pid_ns);
 		put_user_ns(fc->user_ns);
 		fc->release(fc);
@@ -822,9 +821,12 @@
 
 static void sanitize_global_limit(unsigned *limit)
 {
+	/*
+	 * The default maximum number of async requests is calculated to consume
+	 * 1/2^13 of the total memory, assuming 392 bytes per request.
+	 */
 	if (*limit == 0)
-		*limit = ((totalram_pages() << PAGE_SHIFT) >> 13) /
-			 sizeof(struct fuse_req);
+		*limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 392;
 
 	if (*limit >= 1 << 16)
 		*limit = (1 << 16) - 1;
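Worked example: with 8 GiB of RAM the default becomes (2^33 >> 13) / 392 = 1048576 / 392 ≈ 2675 background requests, well under the 65535 cap applied above. Hardcoding 392 (the previous size of struct fuse_req) keeps the default stable now that the structure has shrunk.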
@@ -870,11 +872,19 @@
 	spin_unlock(&fc->bg_lock);
 }
 
-static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
-{
-	struct fuse_init_out *arg = &req->misc.init_out;
+struct fuse_init_args {
+	struct fuse_args args;
+	struct fuse_init_in in;
+	struct fuse_init_out out;
+};
 
-	if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
+static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
+			       int error)
+{
+	struct fuse_init_args *ia = container_of(args, typeof(*ia), args);
+	struct fuse_init_out *arg = &ia->out;
+
+	if (error || arg->major != FUSE_KERNEL_VERSION)
 		fc->conn_error = 1;
 	else {
 		unsigned long ra_pages;
@@ -951,18 +961,23 @@
 		fc->max_write = max_t(unsigned, 4096, fc->max_write);
 		fc->conn_init = 1;
 	}
+	kfree(ia);
+
 	fuse_set_initialized(fc);
 	wake_up_all(&fc->blocked_waitq);
 }
 
-static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_send_init(struct fuse_conn *fc)
 {
-	struct fuse_init_in *arg = &req->misc.init_in;
+	struct fuse_init_args *ia;
 
-	arg->major = FUSE_KERNEL_VERSION;
-	arg->minor = FUSE_KERNEL_MINOR_VERSION;
-	arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
-	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
+	ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL);
+
+	ia->in.major = FUSE_KERNEL_VERSION;
+	ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
+	ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
+	ia->in.flags |=
+		FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
 		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
 		FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
 		FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
@@ -971,26 +986,32 @@
 		FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
 		FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
 		FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA;
-	req->in.h.opcode = FUSE_INIT;
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(*arg);
-	req->in.args[0].value = arg;
-	req->out.numargs = 1;
+	ia->args.opcode = FUSE_INIT;
+	ia->args.in_numargs = 1;
+	ia->args.in_args[0].size = sizeof(ia->in);
+	ia->args.in_args[0].value = &ia->in;
+	ia->args.out_numargs = 1;
 	/* Variable length argument used for backward compatibility
 	   with interface version < 7.5.  Rest of init_out is zeroed
 	   by do_get_request(), so a short reply is not a problem */
-	req->out.argvar = 1;
-	req->out.args[0].size = sizeof(struct fuse_init_out);
-	req->out.args[0].value = &req->misc.init_out;
-	req->end = process_init_reply;
-	fuse_request_send_background(fc, req);
-}
+	ia->args.out_argvar = 1;
+	ia->args.out_args[0].size = sizeof(ia->out);
+	ia->args.out_args[0].value = &ia->out;
+	ia->args.force = true;
+	ia->args.nocreds = true;
+	ia->args.end = process_init_reply;
 
-static void fuse_free_conn(struct fuse_conn *fc)
+	if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0)
+		process_init_reply(fc, &ia->args, -ENOTCONN);
+}
+EXPORT_SYMBOL_GPL(fuse_send_init);
+
+void fuse_free_conn(struct fuse_conn *fc)
 {
 	WARN_ON(!list_empty(&fc->devices));
 	kfree_rcu(fc, rcu);
 }
+EXPORT_SYMBOL_GPL(fuse_free_conn);
 
 static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 {
@@ -1032,7 +1053,7 @@
 	return 0;
 }
 
-struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc)
+struct fuse_dev *fuse_dev_alloc(void)
 {
 	struct fuse_dev *fud;
 	struct list_head *pq;
@@ -1048,17 +1069,34 @@
 	}
 
 	fud->pq.processing = pq;
-	fud->fc = fuse_conn_get(fc);
 	fuse_pqueue_init(&fud->pq);
 
-	spin_lock(&fc->lock);
-	list_add_tail(&fud->entry, &fc->devices);
-	spin_unlock(&fc->lock);
-
 	return fud;
 }
 EXPORT_SYMBOL_GPL(fuse_dev_alloc);
 
+void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc)
+{
+	fud->fc = fuse_conn_get(fc);
+	spin_lock(&fc->lock);
+	list_add_tail(&fud->entry, &fc->devices);
+	spin_unlock(&fc->lock);
+}
+EXPORT_SYMBOL_GPL(fuse_dev_install);
+
+struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc)
+{
+	struct fuse_dev *fud;
+
+	fud = fuse_dev_alloc();
+	if (!fud)
+		return NULL;
+
+	fuse_dev_install(fud, fc);
+	return fud;
+}
+EXPORT_SYMBOL_GPL(fuse_dev_alloc_install);
+
 void fuse_dev_free(struct fuse_dev *fud)
 {
 	struct fuse_conn *fc = fud->fc;
@@ -1075,17 +1113,13 @@
 }
 EXPORT_SYMBOL_GPL(fuse_dev_free);
 
-static int fuse_fill_super(struct super_block *sb, void *data, int silent)
+int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 {
 	struct fuse_dev *fud;
-	struct fuse_conn *fc;
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
 	struct inode *root;
-	struct fuse_mount_data d;
-	struct file *file;
 	struct dentry *root_dentry;
-	struct fuse_req *init_req;
 	int err;
-	int is_bdev = sb->s_bdev != NULL;
 
 	err = -EINVAL;
 	if (sb->s_flags & SB_MANDLOCK)
@@ -1093,19 +1127,19 @@
 
 	sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
 
-	if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns))
-		goto err;
-
-	if (is_bdev) {
+	if (ctx->is_bdev) {
 #ifdef CONFIG_BLOCK
 		err = -EINVAL;
-		if (!sb_set_blocksize(sb, d.blksize))
+		if (!sb_set_blocksize(sb, ctx->blksize))
 			goto err;
 #endif
 	} else {
 		sb->s_blocksize = PAGE_SIZE;
 		sb->s_blocksize_bits = PAGE_SHIFT;
 	}
+
+	sb->s_subtype = ctx->subtype;
+	ctx->subtype = NULL;
 	sb->s_magic = FUSE_SUPER_MAGIC;
 	sb->s_op = &fuse_super_operations;
 	sb->s_xattr = fuse_xattr_handlers;
@@ -1116,19 +1150,6 @@
 	if (sb->s_user_ns != &init_user_ns)
 		sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
 
-	file = fget(d.fd);
-	err = -EINVAL;
-	if (!file)
-		goto err;
-
-	/*
-	 * Require mount to happen from the same user namespace which
-	 * opened /dev/fuse to prevent potential attacks.
-	 */
-	if (file->f_op != &fuse_dev_operations ||
-	    file->f_cred->user_ns != sb->s_user_ns)
-		goto err_fput;
-
 	/*
 	 * If we are not in the initial user namespace posix
 	 * acls must be translated.
@@ -1136,17 +1157,9 @@
 	if (sb->s_user_ns != &init_user_ns)
 		sb->s_xattr = fuse_no_acl_xattr_handlers;
 
-	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
-	err = -ENOMEM;
-	if (!fc)
-		goto err_fput;
-
-	fuse_conn_init(fc, sb->s_user_ns);
-	fc->release = fuse_free_conn;
-
-	fud = fuse_dev_alloc(fc);
+	fud = fuse_dev_alloc_install(fc);
 	if (!fud)
-		goto err_put_conn;
+		goto err;
 
 	fc->dev = sb->s_dev;
 	fc->sb = sb;
@@ -1159,17 +1172,17 @@
 		fc->dont_mask = 1;
 	sb->s_flags |= SB_POSIXACL;
 
-	fc->default_permissions = d.default_permissions;
-	fc->allow_other = d.allow_other;
-	fc->user_id = d.user_id;
-	fc->group_id = d.group_id;
-	fc->max_read = max_t(unsigned, 4096, d.max_read);
-
-	/* Used by get_root_inode() */
-	sb->s_fs_info = fc;
+	fc->default_permissions = ctx->default_permissions;
+	fc->allow_other = ctx->allow_other;
+	fc->user_id = ctx->user_id;
+	fc->group_id = ctx->group_id;
+	fc->max_read = max_t(unsigned, 4096, ctx->max_read);
+	fc->destroy = ctx->destroy;
+	fc->no_control = ctx->no_control;
+	fc->no_force_umount = ctx->no_force_umount;
 
 	err = -ENOMEM;
-	root = fuse_get_root_inode(sb, d.rootmode);
+	root = fuse_get_root_inode(sb, ctx->rootmode);
 	sb->s_d_op = &fuse_root_dentry_operations;
 	root_dentry = d_make_root(root);
 	if (!root_dentry)
@@ -1177,20 +1190,9 @@
 	/* Root dentry doesn't have .d_revalidate */
 	sb->s_d_op = &fuse_dentry_operations;
 
-	init_req = fuse_request_alloc(0);
-	if (!init_req)
-		goto err_put_root;
-	__set_bit(FR_BACKGROUND, &init_req->flags);
-
-	if (is_bdev) {
-		fc->destroy_req = fuse_request_alloc(0);
-		if (!fc->destroy_req)
-			goto err_free_init_req;
-	}
-
 	mutex_lock(&fuse_mutex);
 	err = -EINVAL;
-	if (file->private_data)
+	if (*ctx->fudptr)
 		goto err_unlock;
 
 	err = fuse_ctl_add_conn(fc);
@@ -1199,27 +1201,62 @@
 
 	list_add_tail(&fc->entry, &fuse_conn_list);
 	sb->s_root = root_dentry;
-	file->private_data = fud;
+	*ctx->fudptr = fud;
 	mutex_unlock(&fuse_mutex);
+	return 0;
+
+ err_unlock:
+	mutex_unlock(&fuse_mutex);
+	dput(root_dentry);
+ err_dev_free:
+	fuse_dev_free(fud);
+ err:
+	return err;
+}
+EXPORT_SYMBOL_GPL(fuse_fill_super_common);
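fuse_fill_super_common() is exported so that transports other than /dev/fuse can reuse the superblock setup; virtio_fs.c below is such a caller. A rough sketch of the pattern, with illustrative field values and a hypothetical mydev transport:

    struct fuse_fs_context ctx = {
        .rootmode            = S_IFDIR,
        .default_permissions = 1,
        .allow_other         = 1,
        .max_read            = UINT_MAX,
        .blksize             = 512,
        .no_force_umount     = true,
        .fudptr              = &mydev->fud,    /* transport-private fuse_dev slot */
    };

    err = fuse_fill_super_common(sb, &ctx);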
+
+static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
+{
+	struct fuse_fs_context *ctx = fsc->fs_private;
+	struct file *file;
+	int err;
+	struct fuse_conn *fc;
+
+	err = -EINVAL;
+	file = fget(ctx->fd);
+	if (!file)
+		goto err;
+
+	/*
+	 * Require mount to happen from the same user namespace which
+	 * opened /dev/fuse to prevent potential attacks.
+	 */
+	if ((file->f_op != &fuse_dev_operations) ||
+	    (file->f_cred->user_ns != sb->s_user_ns))
+		goto err_fput;
+	ctx->fudptr = &file->private_data;
+
+	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+	err = -ENOMEM;
+	if (!fc)
+		goto err_fput;
+
+	fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
+	fc->release = fuse_free_conn;
+	sb->s_fs_info = fc;
+
+	err = fuse_fill_super_common(sb, ctx);
+	if (err)
+		goto err_put_conn;
 	/*
 	 * atomic_dec_and_test() in fput() provides the necessary
 	 * memory barrier for file->private_data to be visible on all
 	 * CPUs after this
 	 */
 	fput(file);
-
-	fuse_send_init(fc, init_req);
-
+	fuse_send_init(get_fuse_conn_super(sb));
 	return 0;
 
- err_unlock:
-	mutex_unlock(&fuse_mutex);
- err_free_init_req:
-	fuse_request_free(init_req);
- err_put_root:
-	dput(root_dentry);
- err_dev_free:
-	fuse_dev_free(fud);
  err_put_conn:
 	fuse_conn_put(fc);
 	sb->s_fs_info = NULL;
@@ -1229,11 +1266,52 @@
 	return err;
 }
 
-static struct dentry *fuse_mount(struct file_system_type *fs_type,
-		       int flags, const char *dev_name,
-		       void *raw_data)
+static int fuse_get_tree(struct fs_context *fc)
 {
-	return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
+	struct fuse_fs_context *ctx = fc->fs_private;
+
+	if (!ctx->fd_present || !ctx->rootmode_present ||
+	    !ctx->user_id_present || !ctx->group_id_present)
+		return -EINVAL;
+
+#ifdef CONFIG_BLOCK
+	if (ctx->is_bdev)
+		return get_tree_bdev(fc, fuse_fill_super);
+#endif
+
+	return get_tree_nodev(fc, fuse_fill_super);
+}
+
+static const struct fs_context_operations fuse_context_ops = {
+	.free		= fuse_free_fc,
+	.parse_param	= fuse_parse_param,
+	.get_tree	= fuse_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int fuse_init_fs_context(struct fs_context *fc)
+{
+	struct fuse_fs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->max_read = ~0;
+	ctx->blksize = FUSE_DEFAULT_BLKSIZE;
+
+#ifdef CONFIG_BLOCK
+	if (fc->fs_type == &fuseblk_fs_type) {
+		ctx->is_bdev = true;
+		ctx->destroy = true;
+	}
+#endif
+
+	fc->fs_private = ctx;
+	fc->ops = &fuse_context_ops;
+	return 0;
 }
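With this in place the fuse mount path runs entirely on the new mount API: the VFS calls fuse_init_fs_context() to allocate the fuse_fs_context, fuse_parse_param() once per mount option, and finally fuse_get_tree(), which verifies that the mandatory fd, rootmode, user_id and group_id options were all supplied before the superblock is created.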
 
 static void fuse_sb_destroy(struct super_block *sb)
@@ -1241,7 +1319,8 @@
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
 	if (fc) {
-		fuse_send_destroy(fc);
+		if (fc->destroy)
+			fuse_send_destroy(fc);
 
 		fuse_abort_conn(fc);
 		fuse_wait_aborted(fc);
@@ -1252,29 +1331,24 @@
 	}
 }
 
-static void fuse_kill_sb_anon(struct super_block *sb)
+void fuse_kill_sb_anon(struct super_block *sb)
 {
 	fuse_sb_destroy(sb);
 	kill_anon_super(sb);
 }
+EXPORT_SYMBOL_GPL(fuse_kill_sb_anon);
 
 static struct file_system_type fuse_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuse",
 	.fs_flags	= FS_HAS_SUBTYPE | FS_USERNS_MOUNT,
-	.mount		= fuse_mount,
+	.init_fs_context = fuse_init_fs_context,
+	.parameters	= &fuse_fs_parameters,
 	.kill_sb	= fuse_kill_sb_anon,
 };
 MODULE_ALIAS_FS("fuse");
 
 #ifdef CONFIG_BLOCK
-static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
-			   int flags, const char *dev_name,
-			   void *raw_data)
-{
-	return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
-}
-
 static void fuse_kill_sb_blk(struct super_block *sb)
 {
 	fuse_sb_destroy(sb);
@@ -1284,7 +1358,8 @@
 static struct file_system_type fuseblk_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuseblk",
-	.mount		= fuse_mount_blk,
+	.init_fs_context = fuse_init_fs_context,
+	.parameters	= &fuse_fs_parameters,
 	.kill_sb	= fuse_kill_sb_blk,
 	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
 };
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 574d03f..5c38b9d 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -249,6 +249,27 @@
 	return 0;
 }
 
+static void fuse_force_forget(struct file *file, u64 nodeid)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_forget_in inarg;
+	FUSE_ARGS(args);
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.nlookup = 1;
+	args.opcode = FUSE_FORGET;
+	args.nodeid = nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.force = true;
+	args.noreply = true;
+
+	fuse_simple_request(fc, &args);
+	/* ignore errors */
+}
+
 static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
 			     struct dir_context *ctx, u64 attr_version)
 {
@@ -295,62 +316,55 @@
 
 static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
 {
-	int plus, err;
-	size_t nbytes;
+	int plus;
+	ssize_t res;
 	struct page *page;
 	struct inode *inode = file_inode(file);
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req;
+	struct fuse_io_args ia = {};
+	struct fuse_args_pages *ap = &ia.ap;
+	struct fuse_page_desc desc = { .length = PAGE_SIZE };
 	u64 attr_version = 0;
 	bool locked;
 
-	req = fuse_get_req(fc, 1);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-
 	page = alloc_page(GFP_KERNEL);
-	if (!page) {
-		fuse_put_request(fc, req);
+	if (!page)
 		return -ENOMEM;
-	}
 
 	plus = fuse_use_readdirplus(inode, ctx);
-	req->out.argpages = 1;
-	req->num_pages = 1;
-	req->pages[0] = page;
-	req->page_descs[0].length = PAGE_SIZE;
+	ap->args.out_pages = 1;
+	ap->num_pages = 1;
+	ap->pages = &page;
+	ap->descs = &desc;
 	if (plus) {
 		attr_version = fuse_get_attr_version(fc);
-		fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
-			       FUSE_READDIRPLUS);
+		fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+				    FUSE_READDIRPLUS);
 	} else {
-		fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
-			       FUSE_READDIR);
+		fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+				    FUSE_READDIR);
 	}
 	locked = fuse_lock_inode(inode);
-	fuse_request_send(fc, req);
+	res = fuse_simple_request(fc, &ap->args);
 	fuse_unlock_inode(inode, locked);
-	nbytes = req->out.args[0].size;
-	err = req->out.h.error;
-	fuse_put_request(fc, req);
-	if (!err) {
-		if (!nbytes) {
+	if (res >= 0) {
+		if (!res) {
 			struct fuse_file *ff = file->private_data;
 
 			if (ff->open_flags & FOPEN_CACHE_DIR)
 				fuse_readdir_cache_end(file, ctx->pos);
 		} else if (plus) {
-			err = parse_dirplusfile(page_address(page), nbytes,
+			res = parse_dirplusfile(page_address(page), res,
 						file, ctx, attr_version);
 		} else {
-			err = parse_dirfile(page_address(page), nbytes, file,
+			res = parse_dirfile(page_address(page), res, file,
 					    ctx);
 		}
 	}
 
 	__free_page(page);
 	fuse_invalidate_atime(inode);
-	return err;
+	return res;
 }
 
 enum fuse_parse_result {
@@ -372,11 +386,13 @@
 	for (;;) {
 		struct fuse_dirent *dirent = addr + offset;
 		unsigned int nbytes = size - offset;
-		size_t reclen = FUSE_DIRENT_SIZE(dirent);
+		size_t reclen;
 
 		if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen)
 			break;
 
+		reclen = FUSE_DIRENT_SIZE(dirent); /* derefs ->namelen */
+
 		if (WARN_ON(dirent->namelen > FUSE_NAME_MAX))
 			return FOUND_ERR;
 		if (WARN_ON(reclen > nbytes))
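The reordering is the point of this hunk: FUSE_DIRENT_SIZE() dereferences dirent->namelen, so computing reclen before the nbytes < FUSE_NAME_OFFSET check could read past the end of the buffer when the server returns a truncated dirent header.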
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
new file mode 100644
index 0000000..6af3f13
--- /dev/null
+++ b/fs/fuse/virtio_fs.c
@@ -0,0 +1,1195 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * virtio-fs: Virtio Filesystem
+ * Copyright (C) 2018 Red Hat, Inc.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/virtio.h>
+#include <linux/virtio_fs.h>
+#include <linux/delay.h>
+#include <linux/fs_context.h>
+#include <linux/highmem.h>
+#include "fuse_i.h"
+
+/* List of virtio-fs device instances and a lock protecting the list.
+ * Also provides mutual exclusion between device removal and the mount
+ * path.
+ */
+static DEFINE_MUTEX(virtio_fs_mutex);
+static LIST_HEAD(virtio_fs_instances);
+
+enum {
+	VQ_HIPRIO,
+	VQ_REQUEST
+};
+
+/* Per-virtqueue state */
+struct virtio_fs_vq {
+	spinlock_t lock;
+	struct virtqueue *vq;     /* protected by ->lock */
+	struct work_struct done_work;
+	struct list_head queued_reqs;
+	struct delayed_work dispatch_work;
+	struct fuse_dev *fud;
+	bool connected;
+	long in_flight;
+	char name[24];
+} ____cacheline_aligned_in_smp;
+
+/* A virtio-fs device instance */
+struct virtio_fs {
+	struct kref refcount;
+	struct list_head list;    /* on virtio_fs_instances */
+	char *tag;
+	struct virtio_fs_vq *vqs;
+	unsigned int nvqs;               /* number of virtqueues */
+	unsigned int num_request_queues; /* number of request queues */
+};
+
+struct virtio_fs_forget {
+	struct fuse_in_header ih;
+	struct fuse_forget_in arg;
+	/* This request can be temporarily queued on the virtqueue */
+	struct list_head list;
+};
+
+static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq)
+{
+	struct virtio_fs *fs = vq->vdev->priv;
+
+	return &fs->vqs[vq->index];
+}
+
+static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq)
+{
+	return &vq_to_fsvq(vq)->fud->pq;
+}
+
+static void release_virtio_fs_obj(struct kref *ref)
+{
+	struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount);
+
+	kfree(vfs->vqs);
+	kfree(vfs);
+}
+
+/* Caller must hold virtio_fs_mutex */
+static void virtio_fs_put(struct virtio_fs *fs)
+{
+	kref_put(&fs->refcount, release_virtio_fs_obj);
+}
+
+static void virtio_fs_fiq_release(struct fuse_iqueue *fiq)
+{
+	struct virtio_fs *vfs = fiq->priv;
+
+	mutex_lock(&virtio_fs_mutex);
+	virtio_fs_put(vfs);
+	mutex_unlock(&virtio_fs_mutex);
+}
+
+static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq)
+{
+	WARN_ON(fsvq->in_flight < 0);
+
+	/* Wait for in-flight requests to finish. */
+	while (1) {
+		spin_lock(&fsvq->lock);
+		if (!fsvq->in_flight) {
+			spin_unlock(&fsvq->lock);
+			break;
+		}
+		spin_unlock(&fsvq->lock);
+		/* TODO use completion instead of timeout */
+		usleep_range(1000, 2000);
+	}
+
+	flush_work(&fsvq->done_work);
+	flush_delayed_work(&fsvq->dispatch_work);
+}
+
+static inline void drain_hiprio_queued_reqs(struct virtio_fs_vq *fsvq)
+{
+	struct virtio_fs_forget *forget;
+
+	spin_lock(&fsvq->lock);
+	while (1) {
+		forget = list_first_entry_or_null(&fsvq->queued_reqs,
+						struct virtio_fs_forget, list);
+		if (!forget)
+			break;
+		list_del(&forget->list);
+		kfree(forget);
+	}
+	spin_unlock(&fsvq->lock);
+}
+
+static void virtio_fs_drain_all_queues(struct virtio_fs *fs)
+{
+	struct virtio_fs_vq *fsvq;
+	int i;
+
+	for (i = 0; i < fs->nvqs; i++) {
+		fsvq = &fs->vqs[i];
+		if (i == VQ_HIPRIO)
+			drain_hiprio_queued_reqs(fsvq);
+
+		virtio_fs_drain_queue(fsvq);
+	}
+}
+
+static void virtio_fs_start_all_queues(struct virtio_fs *fs)
+{
+	struct virtio_fs_vq *fsvq;
+	int i;
+
+	for (i = 0; i < fs->nvqs; i++) {
+		fsvq = &fs->vqs[i];
+		spin_lock(&fsvq->lock);
+		fsvq->connected = true;
+		spin_unlock(&fsvq->lock);
+	}
+}
+
+/* Add a new instance to the list or return -EEXIST if tag name exists */
+static int virtio_fs_add_instance(struct virtio_fs *fs)
+{
+	struct virtio_fs *fs2;
+	bool duplicate = false;
+
+	mutex_lock(&virtio_fs_mutex);
+
+	list_for_each_entry(fs2, &virtio_fs_instances, list) {
+		if (strcmp(fs->tag, fs2->tag) == 0)
+			duplicate = true;
+	}
+
+	if (!duplicate)
+		list_add_tail(&fs->list, &virtio_fs_instances);
+
+	mutex_unlock(&virtio_fs_mutex);
+
+	if (duplicate)
+		return -EEXIST;
+	return 0;
+}
+
+/* Return the virtio_fs with a given tag, or NULL */
+static struct virtio_fs *virtio_fs_find_instance(const char *tag)
+{
+	struct virtio_fs *fs;
+
+	mutex_lock(&virtio_fs_mutex);
+
+	list_for_each_entry(fs, &virtio_fs_instances, list) {
+		if (strcmp(fs->tag, tag) == 0) {
+			kref_get(&fs->refcount);
+			goto found;
+		}
+	}
+
+	fs = NULL; /* not found */
+
+found:
+	mutex_unlock(&virtio_fs_mutex);
+
+	return fs;
+}
+
+static void virtio_fs_free_devs(struct virtio_fs *fs)
+{
+	unsigned int i;
+
+	for (i = 0; i < fs->nvqs; i++) {
+		struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+		if (!fsvq->fud)
+			continue;
+
+		fuse_dev_free(fsvq->fud);
+		fsvq->fud = NULL;
+	}
+}
+
+/* Read filesystem name from virtio config into fs->tag (devm-allocated) */
+static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+	char tag_buf[sizeof_field(struct virtio_fs_config, tag)];
+	char *end;
+	size_t len;
+
+	virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag),
+			   &tag_buf, sizeof(tag_buf));
+	end = memchr(tag_buf, '\0', sizeof(tag_buf));
+	if (end == tag_buf)
+		return -EINVAL; /* empty tag */
+	if (!end)
+		end = &tag_buf[sizeof(tag_buf)];
+
+	len = end - tag_buf;
+	fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL);
+	if (!fs->tag)
+		return -ENOMEM;
+	memcpy(fs->tag, tag_buf, len);
+	fs->tag[len] = '\0';
+	return 0;
+}
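For example, a device advertising the tag "myfs" padded with NUL bytes yields len == 4 from the memchr() scan, while a tag occupying the entire config field has no terminator at all, which is what the !end branch handles.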
+
+/* Work function for hiprio completion */
+static void virtio_fs_hiprio_done_work(struct work_struct *work)
+{
+	struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+						 done_work);
+	struct virtqueue *vq = fsvq->vq;
+
+	/* Free completed FUSE_FORGET requests */
+	spin_lock(&fsvq->lock);
+	do {
+		unsigned int len;
+		void *req;
+
+		virtqueue_disable_cb(vq);
+
+		while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
+			kfree(req);
+			fsvq->in_flight--;
+		}
+	} while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+	spin_unlock(&fsvq->lock);
+}
+
+static void virtio_fs_dummy_dispatch_work(struct work_struct *work)
+{
+}
+
+static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
+{
+	struct virtio_fs_forget *forget;
+	struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+						 dispatch_work.work);
+	struct virtqueue *vq = fsvq->vq;
+	struct scatterlist sg;
+	struct scatterlist *sgs[] = {&sg};
+	bool notify;
+	int ret;
+
+	pr_debug("virtio-fs: worker %s called.\n", __func__);
+	while (1) {
+		spin_lock(&fsvq->lock);
+		forget = list_first_entry_or_null(&fsvq->queued_reqs,
+					struct virtio_fs_forget, list);
+		if (!forget) {
+			spin_unlock(&fsvq->lock);
+			return;
+		}
+
+		list_del(&forget->list);
+		if (!fsvq->connected) {
+			spin_unlock(&fsvq->lock);
+			kfree(forget);
+			continue;
+		}
+
+		sg_init_one(&sg, forget, sizeof(*forget));
+
+		/* Enqueue the request */
+		dev_dbg(&vq->vdev->dev, "%s\n", __func__);
+		ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC);
+		if (ret < 0) {
+			if (ret == -ENOMEM || ret == -ENOSPC) {
+				pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n",
+					 ret);
+				list_add_tail(&forget->list,
+						&fsvq->queued_reqs);
+				schedule_delayed_work(&fsvq->dispatch_work,
+						msecs_to_jiffies(1));
+			} else {
+				pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
+					 ret);
+				kfree(forget);
+			}
+			spin_unlock(&fsvq->lock);
+			return;
+		}
+
+		fsvq->in_flight++;
+		notify = virtqueue_kick_prepare(vq);
+		spin_unlock(&fsvq->lock);
+
+		if (notify)
+			virtqueue_notify(vq);
+		pr_debug("virtio-fs: worker %s dispatched one forget request.\n",
+			 __func__);
+	}
+}
+
+/* Allocate and copy args into req->argbuf */
+static int copy_args_to_argbuf(struct fuse_req *req)
+{
+	struct fuse_args *args = req->args;
+	unsigned int offset = 0;
+	unsigned int num_in;
+	unsigned int num_out;
+	unsigned int len;
+	unsigned int i;
+
+	num_in = args->in_numargs - args->in_pages;
+	num_out = args->out_numargs - args->out_pages;
+	len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) +
+	      fuse_len_args(num_out, args->out_args);
+
+	req->argbuf = kmalloc(len, GFP_ATOMIC);
+	if (!req->argbuf)
+		return -ENOMEM;
+
+	for (i = 0; i < num_in; i++) {
+		memcpy(req->argbuf + offset,
+		       args->in_args[i].value,
+		       args->in_args[i].size);
+		offset += args->in_args[i].size;
+	}
+
+	return 0;
+}
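The resulting buffer layout is all fixed-size in-args followed by room for all fixed-size out-args: [in_args[0] | ... | in_args[num_in-1] | out_args[0] | ... | out_args[num_out-1]]. Only the in-arg portion is filled here; the out portion is left for the device to write. Page-backed arguments are excluded from both counts because pages are passed to the device as separate scatter-gather entries rather than through argbuf.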
+
+/* Copy args out of and free req->argbuf */
+static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
+{
+	unsigned int remaining;
+	unsigned int offset;
+	unsigned int num_in;
+	unsigned int num_out;
+	unsigned int i;
+
+	remaining = req->out.h.len - sizeof(req->out.h);
+	num_in = args->in_numargs - args->in_pages;
+	num_out = args->out_numargs - args->out_pages;
+	offset = fuse_len_args(num_in, (struct fuse_arg *)args->in_args);
+
+	for (i = 0; i < num_out; i++) {
+		unsigned int argsize = args->out_args[i].size;
+
+		if (args->out_argvar &&
+		    i == args->out_numargs - 1 &&
+		    argsize > remaining) {
+			argsize = remaining;
+		}
+
+		memcpy(args->out_args[i].value, req->argbuf + offset, argsize);
+		offset += argsize;
+
+		if (i != args->out_numargs - 1)
+			remaining -= argsize;
+	}
+
+	/* Store the actual size of the variable-length arg */
+	if (args->out_argvar)
+		args->out_args[args->out_numargs - 1].size = remaining;
+
+	kfree(req->argbuf);
+	req->argbuf = NULL;
+}
+
+/* Work function for request completion */
+static void virtio_fs_requests_done_work(struct work_struct *work)
+{
+	struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
+						 done_work);
+	struct fuse_pqueue *fpq = &fsvq->fud->pq;
+	struct fuse_conn *fc = fsvq->fud->fc;
+	struct virtqueue *vq = fsvq->vq;
+	struct fuse_req *req;
+	struct fuse_args_pages *ap;
+	struct fuse_req *next;
+	struct fuse_args *args;
+	unsigned int len, i, thislen;
+	struct page *page;
+	LIST_HEAD(reqs);
+
+	/* Collect completed requests off the virtqueue */
+	spin_lock(&fsvq->lock);
+	do {
+		virtqueue_disable_cb(vq);
+
+		while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
+			spin_lock(&fpq->lock);
+			list_move_tail(&req->list, &reqs);
+			spin_unlock(&fpq->lock);
+		}
+	} while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+	spin_unlock(&fsvq->lock);
+
+	/* End requests */
+	list_for_each_entry_safe(req, next, &reqs, list) {
+		/*
+		 * TODO verify that server properly follows FUSE protocol
+		 * (oh.uniq, oh.len)
+		 */
+		args = req->args;
+		copy_args_from_argbuf(args, req);
+
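+		/* Zero the tail of any page beyond what the server returned
+		 * so that a short read cannot expose stale data.
+		 */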
+		if (args->out_pages && args->page_zeroing) {
+			len = args->out_args[args->out_numargs - 1].size;
+			ap = container_of(args, typeof(*ap), args);
+			for (i = 0; i < ap->num_pages; i++) {
+				thislen = ap->descs[i].length;
+				if (len < thislen) {
+					WARN_ON(ap->descs[i].offset);
+					page = ap->pages[i];
+					zero_user_segment(page, len, thislen);
+					len = 0;
+				} else {
+					len -= thislen;
+				}
+			}
+		}
+
+		spin_lock(&fpq->lock);
+		clear_bit(FR_SENT, &req->flags);
+		list_del_init(&req->list);
+		spin_unlock(&fpq->lock);
+
+		fuse_request_end(fc, req);
+		spin_lock(&fsvq->lock);
+		fsvq->in_flight--;
+		spin_unlock(&fsvq->lock);
+	}
+}
+
+/* Virtqueue interrupt handler */
+static void virtio_fs_vq_done(struct virtqueue *vq)
+{
+	struct virtio_fs_vq *fsvq = vq_to_fsvq(vq);
+
+	dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name);
+
+	schedule_work(&fsvq->done_work);
+}
+
+/* Initialize virtqueues */
+static int virtio_fs_setup_vqs(struct virtio_device *vdev,
+			       struct virtio_fs *fs)
+{
+	struct virtqueue **vqs;
+	vq_callback_t **callbacks;
+	const char **names;
+	unsigned int i;
+	int ret = 0;
+
+	virtio_cread(vdev, struct virtio_fs_config, num_request_queues,
+		     &fs->num_request_queues);
+	if (fs->num_request_queues == 0)
+		return -EINVAL;
+
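+	/* One hiprio queue plus the device-reported request queues */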
+	fs->nvqs = 1 + fs->num_request_queues;
+	fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL);
+	if (!fs->vqs)
+		return -ENOMEM;
+
+	vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL);
+	callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
+					GFP_KERNEL);
+	names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
+	if (!vqs || !callbacks || !names) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
+	snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name),
+			"hiprio");
+	names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
+	INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work);
+	INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs);
+	INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work,
+			virtio_fs_hiprio_dispatch_work);
+	spin_lock_init(&fs->vqs[VQ_HIPRIO].lock);
+
+	/* Initialize the requests virtqueues */
+	for (i = VQ_REQUEST; i < fs->nvqs; i++) {
+		spin_lock_init(&fs->vqs[i].lock);
+		INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work);
+		INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work,
+					virtio_fs_dummy_dispatch_work);
+		INIT_LIST_HEAD(&fs->vqs[i].queued_reqs);
+		snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name),
+			 "requests.%u", i - VQ_REQUEST);
+		callbacks[i] = virtio_fs_vq_done;
+		names[i] = fs->vqs[i].name;
+	}
+
+	ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
+	if (ret < 0)
+		goto out;
+
+	for (i = 0; i < fs->nvqs; i++)
+		fs->vqs[i].vq = vqs[i];
+
+	virtio_fs_start_all_queues(fs);
+out:
+	kfree(names);
+	kfree(callbacks);
+	kfree(vqs);
+	if (ret)
+		kfree(fs->vqs);
+	return ret;
+}
+
+/* Free virtqueues (device must already be reset) */
+static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
+				  struct virtio_fs *fs)
+{
+	vdev->config->del_vqs(vdev);
+}
+
+static int virtio_fs_probe(struct virtio_device *vdev)
+{
+	struct virtio_fs *fs;
+	int ret;
+
+	fs = kzalloc(sizeof(*fs), GFP_KERNEL);
+	if (!fs)
+		return -ENOMEM;
+	kref_init(&fs->refcount);
+	vdev->priv = fs;
+
+	ret = virtio_fs_read_tag(vdev, fs);
+	if (ret < 0)
+		goto out;
+
+	ret = virtio_fs_setup_vqs(vdev, fs);
+	if (ret < 0)
+		goto out;
+
+	/* TODO vq affinity */
+
+	/* Bring the device online in case the filesystem is mounted and
+	 * requests need to be sent before we return.
+	 */
+	virtio_device_ready(vdev);
+
+	ret = virtio_fs_add_instance(fs);
+	if (ret < 0)
+		goto out_vqs;
+
+	return 0;
+
+out_vqs:
+	vdev->config->reset(vdev);
+	virtio_fs_cleanup_vqs(vdev, fs);
+
+out:
+	vdev->priv = NULL;
+	kfree(fs);
+	return ret;
+}
+
+static void virtio_fs_stop_all_queues(struct virtio_fs *fs)
+{
+	struct virtio_fs_vq *fsvq;
+	int i;
+
+	for (i = 0; i < fs->nvqs; i++) {
+		fsvq = &fs->vqs[i];
+		spin_lock(&fsvq->lock);
+		fsvq->connected = false;
+		spin_unlock(&fsvq->lock);
+	}
+}
+
+static void virtio_fs_remove(struct virtio_device *vdev)
+{
+	struct virtio_fs *fs = vdev->priv;
+
+	mutex_lock(&virtio_fs_mutex);
+	/* This device is going away. No one should get a new reference */
+	list_del_init(&fs->list);
+	virtio_fs_stop_all_queues(fs);
+	virtio_fs_drain_all_queues(fs);
+	vdev->config->reset(vdev);
+	virtio_fs_cleanup_vqs(vdev, fs);
+
+	vdev->priv = NULL;
+	/* Put device reference on virtio_fs object */
+	virtio_fs_put(fs);
+	mutex_unlock(&virtio_fs_mutex);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int virtio_fs_freeze(struct virtio_device *vdev)
+{
+	/* TODO need to save state here */
+	pr_warn("virtio-fs: suspend/resume not yet supported\n");
+	return -EOPNOTSUPP;
+}
+
+static int virtio_fs_restore(struct virtio_device *vdev)
+{
+	/* TODO need to restore state here */
+	return 0;
+}
+#endif /* CONFIG_PM_SLEEP */
+
+static const struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID },
+	{},
+};
+
+static const unsigned int feature_table[] = {};
+
+static struct virtio_driver virtio_fs_driver = {
+	.driver.name		= KBUILD_MODNAME,
+	.driver.owner		= THIS_MODULE,
+	.id_table		= id_table,
+	.feature_table		= feature_table,
+	.feature_table_size	= ARRAY_SIZE(feature_table),
+	.probe			= virtio_fs_probe,
+	.remove			= virtio_fs_remove,
+#ifdef CONFIG_PM_SLEEP
+	.freeze			= virtio_fs_freeze,
+	.restore		= virtio_fs_restore,
+#endif
+};
+
+static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+	struct fuse_forget_link *link;
+	struct virtio_fs_forget *forget;
+	struct scatterlist sg;
+	struct scatterlist *sgs[] = {&sg};
+	struct virtio_fs *fs;
+	struct virtqueue *vq;
+	struct virtio_fs_vq *fsvq;
+	bool notify;
+	u64 unique;
+	int ret;
+
+	link = fuse_dequeue_forget(fiq, 1, NULL);
+	unique = fuse_get_unique(fiq);
+
+	fs = fiq->priv;
+	fsvq = &fs->vqs[VQ_HIPRIO];
+	spin_unlock(&fiq->lock);
+
+	/* Allocate a buffer for the request */
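+	/* __GFP_NOFAIL: this path returns void, so an allocation failure
+	 * could not be reported back to the caller.
+	 */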
+	forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL);
+
+	forget->ih = (struct fuse_in_header){
+		.opcode = FUSE_FORGET,
+		.nodeid = link->forget_one.nodeid,
+		.unique = unique,
+		.len = sizeof(*forget),
+	};
+	forget->arg = (struct fuse_forget_in){
+		.nlookup = link->forget_one.nlookup,
+	};
+
+	sg_init_one(&sg, forget, sizeof(*forget));
+
+	/* Enqueue the request */
+	spin_lock(&fsvq->lock);
+
+	if (!fsvq->connected) {
+		kfree(forget);
+		spin_unlock(&fsvq->lock);
+		goto out;
+	}
+
+	vq = fsvq->vq;
+	dev_dbg(&vq->vdev->dev, "%s\n", __func__);
+
+	ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC);
+	if (ret < 0) {
+		if (ret == -ENOMEM || ret == -ENOSPC) {
+			pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later.\n",
+				 ret);
+			list_add_tail(&forget->list, &fsvq->queued_reqs);
+			schedule_delayed_work(&fsvq->dispatch_work,
+					msecs_to_jiffies(1));
+		} else {
+			pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n",
+				 ret);
+			kfree(forget);
+		}
+		spin_unlock(&fsvq->lock);
+		goto out;
+	}
+
+	fsvq->in_flight++;
+	notify = virtqueue_kick_prepare(vq);
+
+	spin_unlock(&fsvq->lock);
+
+	if (notify)
+		virtqueue_notify(vq);
+out:
+	kfree(link);
+}
+
+static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+	/*
+	 * TODO interrupts.
+	 *
+	 * Normal fs operations on a local filesystem aren't interruptible.
+	 * Exceptions are blocking lock operations; for example, fcntl(F_SETLKW)
+	 * with a lock shared between host and guest.
+	 */
+	spin_unlock(&fiq->lock);
+}
+
+/* Return the number of scatter-gather list elements required */
+static unsigned int sg_count_fuse_req(struct fuse_req *req)
+{
+	struct fuse_args *args = req->args;
+	struct fuse_args_pages *ap = container_of(args, typeof(*ap), args);
+	unsigned int total_sgs = 1 /* fuse_in_header */;
+
+	if (args->in_numargs - args->in_pages)
+		total_sgs += 1;
+
+	if (args->in_pages)
+		total_sgs += ap->num_pages;
+
+	if (!test_bit(FR_ISREPLY, &req->flags))
+		return total_sgs;
+
+	total_sgs += 1 /* fuse_out_header */;
+
+	if (args->out_numargs - args->out_pages)
+		total_sgs += 1;
+
+	if (args->out_pages)
+		total_sgs += ap->num_pages;
+
+	return total_sgs;
+}
+
+/* Add pages to scatter-gather list and return number of elements used */
+static unsigned int sg_init_fuse_pages(struct scatterlist *sg,
+				       struct page **pages,
+				       struct fuse_page_desc *page_descs,
+				       unsigned int num_pages,
+				       unsigned int total_len)
+{
+	unsigned int i;
+	unsigned int this_len;
+
+	for (i = 0; i < num_pages && total_len; i++) {
+		sg_init_table(&sg[i], 1);
+		this_len = min(page_descs[i].length, total_len);
+		sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset);
+		total_len -= this_len;
+	}
+
+	return i;
+}
+
+/* Add args to scatter-gather list and return number of elements used */
+static unsigned int sg_init_fuse_args(struct scatterlist *sg,
+				      struct fuse_req *req,
+				      struct fuse_arg *args,
+				      unsigned int numargs,
+				      bool argpages,
+				      void *argbuf,
+				      unsigned int *len_used)
+{
+	struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args);
+	unsigned int total_sgs = 0;
+	unsigned int len;
+
+	len = fuse_len_args(numargs - argpages, args);
+	if (len)
+		sg_init_one(&sg[total_sgs++], argbuf, len);
+
+	if (argpages)
+		total_sgs += sg_init_fuse_pages(&sg[total_sgs],
+						ap->pages, ap->descs,
+						ap->num_pages,
+						args[numargs - 1].size);
+
+	if (len_used)
+		*len_used = len;
+
+	return total_sgs;
+}
+
+/* Add a request to a virtqueue and kick the device */
+static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
+				 struct fuse_req *req)
+{
+	/* requests need at least 4 elements */
+	struct scatterlist *stack_sgs[6];
+	struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)];
+	struct scatterlist **sgs = stack_sgs;
+	struct scatterlist *sg = stack_sg;
+	struct virtqueue *vq;
+	struct fuse_args *args = req->args;
+	unsigned int argbuf_used = 0;
+	unsigned int out_sgs = 0;
+	unsigned int in_sgs = 0;
+	unsigned int total_sgs;
+	unsigned int i;
+	int ret;
+	bool notify;
+
+	/* Does the sglist fit on the stack? */
+	total_sgs = sg_count_fuse_req(req);
+	if (total_sgs > ARRAY_SIZE(stack_sgs)) {
+		sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC);
+		sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC);
+		if (!sgs || !sg) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/* Use a bounce buffer since stack args cannot be mapped */
+	ret = copy_args_to_argbuf(req);
+	if (ret < 0)
+		goto out;
+
+	/* Request elements */
+	sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h));
+	out_sgs += sg_init_fuse_args(&sg[out_sgs], req,
+				     (struct fuse_arg *)args->in_args,
+				     args->in_numargs, args->in_pages,
+				     req->argbuf, &argbuf_used);
+
+	/* Reply elements */
+	if (test_bit(FR_ISREPLY, &req->flags)) {
+		sg_init_one(&sg[out_sgs + in_sgs++],
+			    &req->out.h, sizeof(req->out.h));
+		in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req,
+					    args->out_args, args->out_numargs,
+					    args->out_pages,
+					    req->argbuf + argbuf_used, NULL);
+	}
+
+	WARN_ON(out_sgs + in_sgs != total_sgs);
+
+	for (i = 0; i < total_sgs; i++)
+		sgs[i] = &sg[i];
+
+	spin_lock(&fsvq->lock);
+
+	if (!fsvq->connected) {
+		spin_unlock(&fsvq->lock);
+		ret = -ENOTCONN;
+		goto out;
+	}
+
+	vq = fsvq->vq;
+	ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC);
+	if (ret < 0) {
+		spin_unlock(&fsvq->lock);
+		goto out;
+	}
+
+	fsvq->in_flight++;
+	notify = virtqueue_kick_prepare(vq);
+
+	spin_unlock(&fsvq->lock);
+
+	if (notify)
+		virtqueue_notify(vq);
+
+out:
+	if (ret < 0 && req->argbuf) {
+		kfree(req->argbuf);
+		req->argbuf = NULL;
+	}
+	if (sgs != stack_sgs) {
+		kfree(sgs);
+		kfree(sg);
+	}
+
+	return ret;
+}
+
+static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
+__releases(fiq->lock)
+{
+	unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
+	struct virtio_fs *fs;
+	struct fuse_conn *fc;
+	struct fuse_req *req;
+	struct fuse_pqueue *fpq;
+	int ret;
+
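+	/* The pending list is expected to hold exactly one request here */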
+	WARN_ON(list_empty(&fiq->pending));
+	req = list_last_entry(&fiq->pending, struct fuse_req, list);
+	clear_bit(FR_PENDING, &req->flags);
+	list_del_init(&req->list);
+	WARN_ON(!list_empty(&fiq->pending));
+	spin_unlock(&fiq->lock);
+
+	fs = fiq->priv;
+	fc = fs->vqs[queue_id].fud->fc;
+
+	pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
+		 __func__, req->in.h.opcode, req->in.h.unique,
+		 req->in.h.nodeid, req->in.h.len,
+		 fuse_len_args(req->args->out_numargs, req->args->out_args));
+
+	fpq = &fs->vqs[queue_id].fud->pq;
+	spin_lock(&fpq->lock);
+	if (!fpq->connected) {
+		spin_unlock(&fpq->lock);
+		req->out.h.error = -ENODEV;
+		pr_err("virtio-fs: %s disconnected\n", __func__);
+		fuse_request_end(fc, req);
+		return;
+	}
+	list_add_tail(&req->list, fpq->processing);
+	spin_unlock(&fpq->lock);
+	set_bit(FR_SENT, &req->flags);
+	/* matches barrier in request_wait_answer() */
+	smp_mb__after_atomic();
+
+retry:
+	ret = virtio_fs_enqueue_req(&fs->vqs[queue_id], req);
+	if (ret < 0) {
+		if (ret == -ENOMEM || ret == -ENOSPC) {
+			/* Virtqueue full. Retry submission */
+			/* TODO use completion instead of timeout */
+			usleep_range(20, 30);
+			goto retry;
+		}
+		req->out.h.error = ret;
+		pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret);
+		spin_lock(&fpq->lock);
+		clear_bit(FR_SENT, &req->flags);
+		list_del_init(&req->list);
+		spin_unlock(&fpq->lock);
+		fuse_request_end(fc, req);
+		return;
+	}
+}
+
+static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
+	.wake_forget_and_unlock		= virtio_fs_wake_forget_and_unlock,
+	.wake_interrupt_and_unlock	= virtio_fs_wake_interrupt_and_unlock,
+	.wake_pending_and_unlock	= virtio_fs_wake_pending_and_unlock,
+	.release			= virtio_fs_fiq_release,
+};
+
+static int virtio_fs_fill_super(struct super_block *sb)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+	struct virtio_fs *fs = fc->iq.priv;
+	unsigned int i;
+	int err;
+	struct fuse_fs_context ctx = {
+		.rootmode = S_IFDIR,
+		.default_permissions = 1,
+		.allow_other = 1,
+		.max_read = UINT_MAX,
+		.blksize = 512,
+		.destroy = true,
+		.no_control = true,
+		.no_force_umount = true,
+	};
+
+	mutex_lock(&virtio_fs_mutex);
+
+	/* After taking the mutex, make sure the virtiofs device is still
+	 * there. Though we are holding a reference to it, the driver's
+	 * ->remove might still have cleaned up the virtqueues. In that
+	 * case bail out.
+	 */
+	err = -EINVAL;
+	if (list_empty(&fs->list)) {
+		pr_info("virtio-fs: tag <%s> not found\n", fs->tag);
+		goto err;
+	}
+
+	err = -ENOMEM;
+	/* Allocate fuse_dev for hiprio and notification queues */
+	for (i = 0; i < VQ_REQUEST; i++) {
+		struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+		fsvq->fud = fuse_dev_alloc();
+		if (!fsvq->fud)
+			goto err_free_fuse_devs;
+	}
+
+	ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud;
+	err = fuse_fill_super_common(sb, &ctx);
+	if (err < 0)
+		goto err_free_fuse_devs;
+
+	fc = fs->vqs[VQ_REQUEST].fud->fc;
+
+	for (i = 0; i < fs->nvqs; i++) {
+		struct virtio_fs_vq *fsvq = &fs->vqs[i];
+
+		if (i == VQ_REQUEST)
+			continue; /* already initialized */
+		fuse_dev_install(fsvq->fud, fc);
+	}
+
+	/* A previous unmount will have stopped all queues. Start them again */
+	virtio_fs_start_all_queues(fs);
+	fuse_send_init(fc);
+	mutex_unlock(&virtio_fs_mutex);
+	return 0;
+
+err_free_fuse_devs:
+	virtio_fs_free_devs(fs);
+err:
+	mutex_unlock(&virtio_fs_mutex);
+	return err;
+}
+
+static void virtio_kill_sb(struct super_block *sb)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+	struct virtio_fs *vfs;
+	struct virtio_fs_vq *fsvq;
+
+	/* If mount failed, we can still be called without any fc */
+	if (!fc)
+		return fuse_kill_sb_anon(sb);
+
+	vfs = fc->iq.priv;
+	fsvq = &vfs->vqs[VQ_HIPRIO];
+
+	/* Stop the forget queue. The destroy request will be sent soon */
+	spin_lock(&fsvq->lock);
+	fsvq->connected = false;
+	spin_unlock(&fsvq->lock);
+	virtio_fs_drain_all_queues(vfs);
+
+	fuse_kill_sb_anon(sb);
+
+	/* fuse_kill_sb_anon() must have sent the destroy request. Stop all
+	 * queues, drain them once more, and free the fuse devices. Freeing
+	 * the fuse devices drops their reference on the fuse_conn, which in
+	 * turn drops its reference on the virtio_fs object.
+	 */
+	virtio_fs_stop_all_queues(vfs);
+	virtio_fs_drain_all_queues(vfs);
+	virtio_fs_free_devs(vfs);
+}
+
+static int virtio_fs_test_super(struct super_block *sb,
+				struct fs_context *fsc)
+{
+	struct fuse_conn *fc = fsc->s_fs_info;
+
+	return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv;
+}
+
+static int virtio_fs_set_super(struct super_block *sb,
+			       struct fs_context *fsc)
+{
+	int err;
+
+	err = get_anon_bdev(&sb->s_dev);
+	if (!err)
+		fuse_conn_get(fsc->s_fs_info);
+
+	return err;
+}
+
+static int virtio_fs_get_tree(struct fs_context *fsc)
+{
+	struct virtio_fs *fs;
+	struct super_block *sb;
+	struct fuse_conn *fc;
+	int err;
+
+	/* This gets a reference on the virtio_fs object. The pointer is
+	 * installed in fc->iq.priv. Once the fuse_conn goes away, it calls
+	 * ->put() to drop the reference to this object.
+	 */
+	fs = virtio_fs_find_instance(fsc->source);
+	if (!fs) {
+		pr_info("virtio-fs: tag <%s> not found\n", fsc->source);
+		return -EINVAL;
+	}
+
+	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
+	if (!fc) {
+		mutex_lock(&virtio_fs_mutex);
+		virtio_fs_put(fs);
+		mutex_unlock(&virtio_fs_mutex);
+		return -ENOMEM;
+	}
+
+	fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops,
+		       fs);
+	fc->release = fuse_free_conn;
+	fc->delete_stale = true;
+
+	fsc->s_fs_info = fc;
+	sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super);
+	fuse_conn_put(fc);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	if (!sb->s_root) {
+		err = virtio_fs_fill_super(sb);
+		if (err) {
+			deactivate_locked_super(sb);
+			return err;
+		}
+
+		sb->s_flags |= SB_ACTIVE;
+	}
+
+	WARN_ON(fsc->root);
+	fsc->root = dget(sb->s_root);
+	return 0;
+}
+
+static const struct fs_context_operations virtio_fs_context_ops = {
+	.get_tree	= virtio_fs_get_tree,
+};
+
+static int virtio_fs_init_fs_context(struct fs_context *fsc)
+{
+	fsc->ops = &virtio_fs_context_ops;
+	return 0;
+}
+
+static struct file_system_type virtio_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "virtiofs",
+	.init_fs_context = virtio_fs_init_fs_context,
+	.kill_sb	= virtio_kill_sb,
+};
+
+static int __init virtio_fs_init(void)
+{
+	int ret;
+
+	ret = register_virtio_driver(&virtio_fs_driver);
+	if (ret < 0)
+		return ret;
+
+	ret = register_filesystem(&virtio_fs_type);
+	if (ret < 0) {
+		unregister_virtio_driver(&virtio_fs_driver);
+		return ret;
+	}
+
+	return 0;
+}
+module_init(virtio_fs_init);
+
+static void __exit virtio_fs_exit(void)
+{
+	unregister_filesystem(&virtio_fs_type);
+	unregister_virtio_driver(&virtio_fs_driver);
+}
+module_exit(virtio_fs_exit);
+
+MODULE_AUTHOR("Stefan Hajnoczi <stefanha@redhat.com>");
+MODULE_DESCRIPTION("Virtio Filesystem");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_FS(KBUILD_MODNAME);
+MODULE_DEVICE_TABLE(virtio, id_table);
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 4337176..20d052e 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -25,15 +25,15 @@
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
 	inarg.flags = flags;
-	args.in.h.opcode = FUSE_SETXATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 3;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = strlen(name) + 1;
-	args.in.args[1].value = name;
-	args.in.args[2].size = size;
-	args.in.args[2].value = value;
+	args.opcode = FUSE_SETXATTR;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 3;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = strlen(name) + 1;
+	args.in_args[1].value = name;
+	args.in_args[2].size = size;
+	args.in_args[2].value = value;
 	err = fuse_simple_request(fc, &args);
 	if (err == -ENOSYS) {
 		fc->no_setxattr = 1;
@@ -60,22 +60,22 @@
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
-	args.in.h.opcode = FUSE_GETXATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 2;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
-	args.in.args[1].size = strlen(name) + 1;
-	args.in.args[1].value = name;
+	args.opcode = FUSE_GETXATTR;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	args.in_args[1].size = strlen(name) + 1;
+	args.in_args[1].value = name;
 	/* This is really two different operations rolled into one */
-	args.out.numargs = 1;
+	args.out_numargs = 1;
 	if (size) {
-		args.out.argvar = 1;
-		args.out.args[0].size = size;
-		args.out.args[0].value = value;
+		args.out_argvar = true;
+		args.out_args[0].size = size;
+		args.out_args[0].value = value;
 	} else {
-		args.out.args[0].size = sizeof(outarg);
-		args.out.args[0].value = &outarg;
+		args.out_args[0].size = sizeof(outarg);
+		args.out_args[0].value = &outarg;
 	}
 	ret = fuse_simple_request(fc, &args);
 	if (!ret && !size)
@@ -121,20 +121,20 @@
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
-	args.in.h.opcode = FUSE_LISTXATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = sizeof(inarg);
-	args.in.args[0].value = &inarg;
+	args.opcode = FUSE_LISTXATTR;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
 	/* This is really two different operations rolled into one */
-	args.out.numargs = 1;
+	args.out_numargs = 1;
 	if (size) {
-		args.out.argvar = 1;
-		args.out.args[0].size = size;
-		args.out.args[0].value = list;
+		args.out_argvar = true;
+		args.out_args[0].size = size;
+		args.out_args[0].value = list;
 	} else {
-		args.out.args[0].size = sizeof(outarg);
-		args.out.args[0].value = &outarg;
+		args.out_args[0].size = sizeof(outarg);
+		args.out_args[0].value = &outarg;
 	}
 	ret = fuse_simple_request(fc, &args);
 	if (!ret && !size)
@@ -157,11 +157,11 @@
 	if (fc->no_removexattr)
 		return -EOPNOTSUPP;
 
-	args.in.h.opcode = FUSE_REMOVEXATTR;
-	args.in.h.nodeid = get_node_id(inode);
-	args.in.numargs = 1;
-	args.in.args[0].size = strlen(name) + 1;
-	args.in.args[0].value = name;
+	args.opcode = FUSE_REMOVEXATTR;
+	args.nodeid = get_node_id(inode);
+	args.in_numargs = 1;
+	args.in_args[0].size = strlen(name) + 1;
+	args.in_args[0].value = name;
 	err = fuse_simple_request(fc, &args);
 	if (err == -ENOSYS) {
 		fc->no_removexattr = 1;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 6b45006..5f89c51 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -584,10 +584,10 @@
 	unsigned int ar_rgrplvb:1;		/* use lvbs for rgrp info */
 	unsigned int ar_loccookie:1;		/* use location based readdir
 						   cookies */
-	int ar_commit;				/* Commit interval */
-	int ar_statfs_quantum;			/* The fast statfs interval */
-	int ar_quota_quantum;			/* The quota interval */
-	int ar_statfs_percent;			/* The % change to force sync */
+	s32 ar_commit;				/* Commit interval */
+	s32 ar_statfs_quantum;			/* The fast statfs interval */
+	s32 ar_quota_quantum;			/* The quota interval */
+	s32 ar_statfs_percent;			/* The % change to force sync */
 };
 
 struct gfs2_tune {
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index f3fd5cd..681b446 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -21,6 +21,7 @@
 #include <linux/lockdep.h>
 #include <linux/module.h>
 #include <linux/backing-dev.h>
+#include <linux/fs_parser.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -1031,16 +1032,17 @@
 }
 
 /**
- * fill_super - Read in superblock
+ * gfs2_fill_super - Read in superblock
  * @sb: The VFS superblock
- * @data: Mount options
+ * @fc: The filesystem context, carrying the mount options
  * @silent: Don't complain if it's not a GFS2 filesystem
  *
- * Returns: errno
+ * Returns: -errno
  */
-
-static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent)
+static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
 {
+	struct gfs2_args *args = fc->fs_private;
+	int silent = fc->sb_flags & SB_SILENT;
 	struct gfs2_sbd *sdp;
 	struct gfs2_holder mount_gh;
 	int error;
@@ -1205,161 +1207,411 @@
 	return error;
 }
 
-static int set_gfs2_super(struct super_block *s, void *data)
+/**
+ * gfs2_get_tree - Get the GFS2 superblock and root directory
+ * @fc: The filesystem context
+ *
+ * Returns: 0 or -errno on error
+ */
+static int gfs2_get_tree(struct fs_context *fc)
 {
-	s->s_bdev = data;
-	s->s_dev = s->s_bdev->bd_dev;
-	s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
+	struct gfs2_args *args = fc->fs_private;
+	struct gfs2_sbd *sdp;
+	int error;
+
+	error = get_tree_bdev(fc, gfs2_fill_super);
+	if (error)
+		return error;
+
+	sdp = fc->root->d_sb->s_fs_info;
+	dput(fc->root);
+	if (args->ar_meta)
+		fc->root = dget(sdp->sd_master_dir);
+	else
+		fc->root = dget(sdp->sd_root_dir);
 	return 0;
 }
 
-static int test_gfs2_super(struct super_block *s, void *ptr)
+static void gfs2_fc_free(struct fs_context *fc)
 {
-	struct block_device *bdev = ptr;
-	return (bdev == s->s_bdev);
+	struct gfs2_args *args = fc->fs_private;
+
+	kfree(args);
 }
 
-/**
- * gfs2_mount - Get the GFS2 superblock
- * @fs_type: The GFS2 filesystem type
- * @flags: Mount flags
- * @dev_name: The name of the device
- * @data: The mount arguments
- *
- * Q. Why not use get_sb_bdev() ?
- * A. We need to select one of two root directories to mount, independent
- *    of whether this is the initial, or subsequent, mount of this sb
- *
- * Returns: 0 or -ve on error
- */
+enum gfs2_param {
+	Opt_lockproto,
+	Opt_locktable,
+	Opt_hostdata,
+	Opt_spectator,
+	Opt_ignore_local_fs,
+	Opt_localflocks,
+	Opt_localcaching,
+	Opt_debug,
+	Opt_upgrade,
+	Opt_acl,
+	Opt_quota,
+	Opt_suiddir,
+	Opt_data,
+	Opt_meta,
+	Opt_discard,
+	Opt_commit,
+	Opt_errors,
+	Opt_statfs_quantum,
+	Opt_statfs_percent,
+	Opt_quota_quantum,
+	Opt_barrier,
+	Opt_rgrplvb,
+	Opt_loccookie,
+};
 
-static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
-		       const char *dev_name, void *data)
+enum opt_quota {
+	Opt_quota_unset = 0,
+	Opt_quota_off,
+	Opt_quota_account,
+	Opt_quota_on,
+};
+
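+/* Maps the enum index returned by fs_parse() to GFS2_QUOTA_* values;
+ * index 0 (Opt_quota_unset) means a bare "quota" flag was given.
+ */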
+static const unsigned int opt_quota_values[] = {
+	[Opt_quota_off]     = GFS2_QUOTA_OFF,
+	[Opt_quota_account] = GFS2_QUOTA_ACCOUNT,
+	[Opt_quota_on]      = GFS2_QUOTA_ON,
+};
+
+enum opt_data {
+	Opt_data_writeback = GFS2_DATA_WRITEBACK,
+	Opt_data_ordered   = GFS2_DATA_ORDERED,
+};
+
+enum opt_errors {
+	Opt_errors_withdraw = GFS2_ERRORS_WITHDRAW,
+	Opt_errors_panic    = GFS2_ERRORS_PANIC,
+};
+
+static const struct fs_parameter_spec gfs2_param_specs[] = {
+	fsparam_string ("lockproto",          Opt_lockproto),
+	fsparam_string ("locktable",          Opt_locktable),
+	fsparam_string ("hostdata",           Opt_hostdata),
+	fsparam_flag   ("spectator",          Opt_spectator),
+	fsparam_flag   ("norecovery",         Opt_spectator),
+	fsparam_flag   ("ignore_local_fs",    Opt_ignore_local_fs),
+	fsparam_flag   ("localflocks",        Opt_localflocks),
+	fsparam_flag   ("localcaching",       Opt_localcaching),
+	fsparam_flag_no("debug",              Opt_debug),
+	fsparam_flag   ("upgrade",            Opt_upgrade),
+	fsparam_flag_no("acl",                Opt_acl),
+	fsparam_flag_no("suiddir",            Opt_suiddir),
+	fsparam_enum   ("data",               Opt_data),
+	fsparam_flag   ("meta",               Opt_meta),
+	fsparam_flag_no("discard",            Opt_discard),
+	fsparam_s32    ("commit",             Opt_commit),
+	fsparam_enum   ("errors",             Opt_errors),
+	fsparam_s32    ("statfs_quantum",     Opt_statfs_quantum),
+	fsparam_s32    ("statfs_percent",     Opt_statfs_percent),
+	fsparam_s32    ("quota_quantum",      Opt_quota_quantum),
+	fsparam_flag_no("barrier",            Opt_barrier),
+	fsparam_flag_no("rgrplvb",            Opt_rgrplvb),
+	fsparam_flag_no("loccookie",          Opt_loccookie),
+	/* quota can be a flag or an enum so it gets special treatment */
+	__fsparam(fs_param_is_enum, "quota", Opt_quota, fs_param_neg_with_no|fs_param_v_optional),
+	{}
+};
+
+static const struct fs_parameter_enum gfs2_param_enums[] = {
+	{ Opt_quota,    "off",        Opt_quota_off },
+	{ Opt_quota,    "account",    Opt_quota_account },
+	{ Opt_quota,    "on",         Opt_quota_on },
+	{ Opt_data,     "writeback",  Opt_data_writeback },
+	{ Opt_data,     "ordered",    Opt_data_ordered },
+	{ Opt_errors,   "withdraw",   Opt_errors_withdraw },
+	{ Opt_errors,   "panic",      Opt_errors_panic },
+	{}
+};
+
+const struct fs_parameter_description gfs2_fs_parameters = {
+	.name = "gfs2",
+	.specs = gfs2_param_specs,
+	.enums = gfs2_param_enums,
+};
+
+/* Parse a single mount parameter */
+static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	struct block_device *bdev;
-	struct super_block *s;
-	fmode_t mode = FMODE_READ | FMODE_EXCL;
-	int error;
-	struct gfs2_args args;
-	struct gfs2_sbd *sdp;
+	struct gfs2_args *args = fc->fs_private;
+	struct fs_parse_result result;
+	int o;
 
-	if (!(flags & SB_RDONLY))
-		mode |= FMODE_WRITE;
+	o = fs_parse(fc, &gfs2_fs_parameters, param, &result);
+	if (o < 0)
+		return o;
 
-	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
-	if (IS_ERR(bdev))
-		return ERR_CAST(bdev);
-
-	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
-	 */
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (bdev->bd_fsfreeze_count > 0) {
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		error = -EBUSY;
-		goto error_bdev;
+	switch (o) {
+	case Opt_lockproto:
+		strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN);
+		break;
+	case Opt_locktable:
+		strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN);
+		break;
+	case Opt_hostdata:
+		strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN);
+		break;
+	case Opt_spectator:
+		args->ar_spectator = 1;
+		break;
+	case Opt_ignore_local_fs:
+		/* Retained for backwards compat only */
+		break;
+	case Opt_localflocks:
+		args->ar_localflocks = 1;
+		break;
+	case Opt_localcaching:
+		/* Retained for backwards compat only */
+		break;
+	case Opt_debug:
+		if (result.boolean && args->ar_errors == GFS2_ERRORS_PANIC)
+			return invalf(fc, "gfs2: -o debug and -o errors=panic are mutually exclusive");
+		args->ar_debug = result.boolean;
+		break;
+	case Opt_upgrade:
+		/* Retained for backwards compat only */
+		break;
+	case Opt_acl:
+		args->ar_posix_acl = result.boolean;
+		break;
+	case Opt_quota:
+		/* The quota option can be a flag or an enum. A non-zero int_32
+		 * result means that we have an enum index. Otherwise we have
+		 * to rely on the 'negated' flag to tell us whether 'quota' or
+		 * 'noquota' was specified.
+		 */
+		if (result.negated)
+			args->ar_quota = GFS2_QUOTA_OFF;
+		else if (result.int_32 > 0)
+			args->ar_quota = opt_quota_values[result.int_32];
+		else
+			args->ar_quota = GFS2_QUOTA_ON;
+		break;
+	case Opt_suiddir:
+		args->ar_suiddir = result.boolean;
+		break;
+	case Opt_data:
+		/* The uint_32 result maps directly to GFS2_DATA_* */
+		args->ar_data = result.uint_32;
+		break;
+	case Opt_meta:
+		args->ar_meta = 1;
+		break;
+	case Opt_discard:
+		args->ar_discard = result.boolean;
+		break;
+	case Opt_commit:
+		if (result.int_32 <= 0)
+			return invalf(fc, "gfs2: commit mount option requires a positive numeric argument");
+		args->ar_commit = result.int_32;
+		break;
+	case Opt_statfs_quantum:
+		if (result.int_32 < 0)
+			return invalf(fc, "gfs2: statfs_quantum mount option requires a non-negative numeric argument");
+		args->ar_statfs_quantum = result.int_32;
+		break;
+	case Opt_quota_quantum:
+		if (result.int_32 <= 0)
+			return invalf(fc, "gfs2: quota_quantum mount option requires a positive numeric argument");
+		args->ar_quota_quantum = result.int_32;
+		break;
+	case Opt_statfs_percent:
+		if (result.int_32 < 0 || result.int_32 > 100)
+			return invalf(fc, "gfs2: statfs_percent mount option requires a numeric argument between 0 and 100");
+		args->ar_statfs_percent = result.int_32;
+		break;
+	case Opt_errors:
+		if (args->ar_debug && result.uint_32 == GFS2_ERRORS_PANIC)
+			return invalf(fc, "gfs2: -o debug and -o errors=panic are mutually exclusive");
+		args->ar_errors = result.uint_32;
+		break;
+	case Opt_barrier:
+		args->ar_nobarrier = result.boolean;
+		break;
+	case Opt_rgrplvb:
+		args->ar_rgrplvb = result.boolean;
+		break;
+	case Opt_loccookie:
+		args->ar_loccookie = result.boolean;
+		break;
+	default:
+		return invalf(fc, "gfs2: invalid mount option: %s", param->key);
 	}
-	s = sget(fs_type, test_gfs2_super, set_gfs2_super, flags, bdev);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-	error = PTR_ERR(s);
-	if (IS_ERR(s))
-		goto error_bdev;
+	return 0;
+}
 
-	if (s->s_root) {
-		/*
-		 * s_umount nests inside bd_mutex during
-		 * __invalidate_device().  blkdev_put() acquires
-		 * bd_mutex and can't be called under s_umount.  Drop
-		 * s_umount temporarily.  This is safe as we're
-		 * holding an active reference.
-		 */
-		up_write(&s->s_umount);
-		blkdev_put(bdev, mode);
-		down_write(&s->s_umount);
-	} else {
-		/* s_mode must be set before deactivate_locked_super calls */
-		s->s_mode = mode;
-	}
+static int gfs2_reconfigure(struct fs_context *fc)
+{
+	struct super_block *sb = fc->root->d_sb;
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_args *oldargs = &sdp->sd_args;
+	struct gfs2_args *newargs = fc->fs_private;
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	int error = 0;
 
-	memset(&args, 0, sizeof(args));
-	args.ar_quota = GFS2_QUOTA_DEFAULT;
-	args.ar_data = GFS2_DATA_DEFAULT;
-	args.ar_commit = 30;
-	args.ar_statfs_quantum = 30;
-	args.ar_quota_quantum = 60;
-	args.ar_errors = GFS2_ERRORS_DEFAULT;
+	sync_filesystem(sb);
 
-	error = gfs2_mount_args(&args, data);
-	if (error) {
-		pr_warn("can't parse mount arguments\n");
-		goto error_super;
-	}
-
-	if (s->s_root) {
-		error = -EBUSY;
-		if ((flags ^ s->s_flags) & SB_RDONLY)
-			goto error_super;
-	} else {
-		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
-		sb_set_blocksize(s, block_size(bdev));
-		error = fill_super(s, &args, flags & SB_SILENT ? 1 : 0);
-		if (error)
-			goto error_super;
-		s->s_flags |= SB_ACTIVE;
-		bdev->bd_super = s;
-	}
-
-	sdp = s->s_fs_info;
-	if (args.ar_meta)
-		return dget(sdp->sd_master_dir);
+	spin_lock(&gt->gt_spin);
+	oldargs->ar_commit = gt->gt_logd_secs;
+	oldargs->ar_quota_quantum = gt->gt_quota_quantum;
+	if (gt->gt_statfs_slow)
+		oldargs->ar_statfs_quantum = 0;
 	else
-		return dget(sdp->sd_root_dir);
+		oldargs->ar_statfs_quantum = gt->gt_statfs_quantum;
+	spin_unlock(&gt->gt_spin);
 
-error_super:
-	deactivate_locked_super(s);
-	return ERR_PTR(error);
-error_bdev:
-	blkdev_put(bdev, mode);
-	return ERR_PTR(error);
+	if (strcmp(newargs->ar_lockproto, oldargs->ar_lockproto)) {
+		errorf(fc, "gfs2: reconfiguration of locking protocol not allowed");
+		return -EINVAL;
+	}
+	if (strcmp(newargs->ar_locktable, oldargs->ar_locktable)) {
+		errorf(fc, "gfs2: reconfiguration of lock table not allowed");
+		return -EINVAL;
+	}
+	if (strcmp(newargs->ar_hostdata, oldargs->ar_hostdata)) {
+		errorf(fc, "gfs2: reconfiguration of host data not allowed");
+		return -EINVAL;
+	}
+	if (newargs->ar_spectator != oldargs->ar_spectator) {
+		errorf(fc, "gfs2: reconfiguration of spectator mode not allowed");
+		return -EINVAL;
+	}
+	if (newargs->ar_localflocks != oldargs->ar_localflocks) {
+		errorf(fc, "gfs2: reconfiguration of localflocks not allowed");
+		return -EINVAL;
+	}
+	if (newargs->ar_meta != oldargs->ar_meta) {
+		errorf(fc, "gfs2: switching between gfs2 and gfs2meta not allowed");
+		return -EINVAL;
+	}
+	if (oldargs->ar_spectator)
+		fc->sb_flags |= SB_RDONLY;
+
+	if ((sb->s_flags ^ fc->sb_flags) & SB_RDONLY) {
+		if (fc->sb_flags & SB_RDONLY) {
+			error = gfs2_make_fs_ro(sdp);
+			if (error)
+				errorf(fc, "gfs2: unable to remount read-only");
+		} else {
+			error = gfs2_make_fs_rw(sdp);
+			if (error)
+				errorf(fc, "gfs2: unable to remount read-write");
+		}
+	}
+	sdp->sd_args = *newargs;
+
+	if (sdp->sd_args.ar_posix_acl)
+		sb->s_flags |= SB_POSIXACL;
+	else
+		sb->s_flags &= ~SB_POSIXACL;
+	if (sdp->sd_args.ar_nobarrier)
+		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+	else
+		clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+	spin_lock(&gt->gt_spin);
+	gt->gt_logd_secs = newargs->ar_commit;
+	gt->gt_quota_quantum = newargs->ar_quota_quantum;
+	if (newargs->ar_statfs_quantum) {
+		gt->gt_statfs_slow = 0;
+		gt->gt_statfs_quantum = newargs->ar_statfs_quantum;
+	} else {
+		gt->gt_statfs_slow = 1;
+		gt->gt_statfs_quantum = 30;
+	}
+	spin_unlock(&gt->gt_spin);
+
+	gfs2_online_uevent(sdp);
+	return error;
 }
 
-static int set_meta_super(struct super_block *s, void *ptr)
+static const struct fs_context_operations gfs2_context_ops = {
+	.free        = gfs2_fc_free,
+	.parse_param = gfs2_parse_param,
+	.get_tree    = gfs2_get_tree,
+	.reconfigure = gfs2_reconfigure,
+};
+
+/* Set up the filesystem mount context */
+static int gfs2_init_fs_context(struct fs_context *fc)
+{
+	struct gfs2_args *args;
+
+	args = kzalloc(sizeof(*args), GFP_KERNEL);
+	if (args == NULL)
+		return -ENOMEM;
+
+	args->ar_quota = GFS2_QUOTA_DEFAULT;
+	args->ar_data = GFS2_DATA_DEFAULT;
+	args->ar_commit = 30;
+	args->ar_statfs_quantum = 30;
+	args->ar_quota_quantum = 60;
+	args->ar_errors = GFS2_ERRORS_DEFAULT;
+
+	fc->fs_private = args;
+	fc->ops = &gfs2_context_ops;
+	return 0;
+}
+
+static int set_meta_super(struct super_block *s, struct fs_context *fc)
 {
 	return -EINVAL;
 }
 
-static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
-			int flags, const char *dev_name, void *data)
+static int test_meta_super(struct super_block *s, struct fs_context *fc)
+{
+	return (fc->sget_key == s->s_bdev);
+}
+
+static int gfs2_meta_get_tree(struct fs_context *fc)
 {
 	struct super_block *s;
 	struct gfs2_sbd *sdp;
 	struct path path;
 	int error;
 
-	if (!dev_name || !*dev_name)
-		return ERR_PTR(-EINVAL);
+	if (!fc->source || !*fc->source)
+		return -EINVAL;
 
-	error = kern_path(dev_name, LOOKUP_FOLLOW, &path);
+	error = kern_path(fc->source, LOOKUP_FOLLOW, &path);
 	if (error) {
 		pr_warn("path_lookup on %s returned error %d\n",
-			dev_name, error);
-		return ERR_PTR(error);
+		        fc->source, error);
+		return error;
 	}
-	s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags,
-		 path.dentry->d_sb->s_bdev);
+	fc->fs_type = &gfs2_fs_type;
+	fc->sget_key = path.dentry->d_sb->s_bdev;
+	s = sget_fc(fc, test_meta_super, set_meta_super);
 	path_put(&path);
 	if (IS_ERR(s)) {
 		pr_warn("gfs2 mount does not exist\n");
-		return ERR_CAST(s);
+		return PTR_ERR(s);
 	}
-	if ((flags ^ s->s_flags) & SB_RDONLY) {
+	if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
 		deactivate_locked_super(s);
-		return ERR_PTR(-EBUSY);
+		return -EBUSY;
 	}
 	sdp = s->s_fs_info;
-	return dget(sdp->sd_master_dir);
+	fc->root = dget(sdp->sd_master_dir);
+	return 0;
+}
+
+static const struct fs_context_operations gfs2_meta_context_ops = {
+	.get_tree    = gfs2_meta_get_tree,
+};
+
+static int gfs2_meta_init_fs_context(struct fs_context *fc)
+{
+	int ret = gfs2_init_fs_context(fc);
+
+	if (ret)
+		return ret;
+
+	fc->ops = &gfs2_meta_context_ops;
+	return 0;
 }
 
 static void gfs2_kill_sb(struct super_block *sb)
@@ -1383,7 +1635,8 @@
 struct file_system_type gfs2_fs_type = {
 	.name = "gfs2",
 	.fs_flags = FS_REQUIRES_DEV,
-	.mount = gfs2_mount,
+	.init_fs_context = gfs2_init_fs_context,
+	.parameters = &gfs2_fs_parameters,
 	.kill_sb = gfs2_kill_sb,
 	.owner = THIS_MODULE,
 };
@@ -1392,7 +1645,7 @@
 struct file_system_type gfs2meta_fs_type = {
 	.name = "gfs2meta",
 	.fs_flags = FS_REQUIRES_DEV,
-	.mount = gfs2_mount_meta,
+	.init_fs_context = gfs2_meta_init_fs_context,
 	.owner = THIS_MODULE,
 };
 MODULE_ALIAS_FS("gfs2meta");
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 644c70a..5fa1eec 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -44,258 +44,6 @@
 #include "xattr.h"
 #include "lops.h"
 
-#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
-
-enum {
-	Opt_lockproto,
-	Opt_locktable,
-	Opt_hostdata,
-	Opt_spectator,
-	Opt_ignore_local_fs,
-	Opt_localflocks,
-	Opt_localcaching,
-	Opt_debug,
-	Opt_nodebug,
-	Opt_upgrade,
-	Opt_acl,
-	Opt_noacl,
-	Opt_quota_off,
-	Opt_quota_account,
-	Opt_quota_on,
-	Opt_quota,
-	Opt_noquota,
-	Opt_suiddir,
-	Opt_nosuiddir,
-	Opt_data_writeback,
-	Opt_data_ordered,
-	Opt_meta,
-	Opt_discard,
-	Opt_nodiscard,
-	Opt_commit,
-	Opt_err_withdraw,
-	Opt_err_panic,
-	Opt_statfs_quantum,
-	Opt_statfs_percent,
-	Opt_quota_quantum,
-	Opt_barrier,
-	Opt_nobarrier,
-	Opt_rgrplvb,
-	Opt_norgrplvb,
-	Opt_loccookie,
-	Opt_noloccookie,
-	Opt_error,
-};
-
-static const match_table_t tokens = {
-	{Opt_lockproto, "lockproto=%s"},
-	{Opt_locktable, "locktable=%s"},
-	{Opt_hostdata, "hostdata=%s"},
-	{Opt_spectator, "spectator"},
-	{Opt_spectator, "norecovery"},
-	{Opt_ignore_local_fs, "ignore_local_fs"},
-	{Opt_localflocks, "localflocks"},
-	{Opt_localcaching, "localcaching"},
-	{Opt_debug, "debug"},
-	{Opt_nodebug, "nodebug"},
-	{Opt_upgrade, "upgrade"},
-	{Opt_acl, "acl"},
-	{Opt_noacl, "noacl"},
-	{Opt_quota_off, "quota=off"},
-	{Opt_quota_account, "quota=account"},
-	{Opt_quota_on, "quota=on"},
-	{Opt_quota, "quota"},
-	{Opt_noquota, "noquota"},
-	{Opt_suiddir, "suiddir"},
-	{Opt_nosuiddir, "nosuiddir"},
-	{Opt_data_writeback, "data=writeback"},
-	{Opt_data_ordered, "data=ordered"},
-	{Opt_meta, "meta"},
-	{Opt_discard, "discard"},
-	{Opt_nodiscard, "nodiscard"},
-	{Opt_commit, "commit=%d"},
-	{Opt_err_withdraw, "errors=withdraw"},
-	{Opt_err_panic, "errors=panic"},
-	{Opt_statfs_quantum, "statfs_quantum=%d"},
-	{Opt_statfs_percent, "statfs_percent=%d"},
-	{Opt_quota_quantum, "quota_quantum=%d"},
-	{Opt_barrier, "barrier"},
-	{Opt_nobarrier, "nobarrier"},
-	{Opt_rgrplvb, "rgrplvb"},
-	{Opt_norgrplvb, "norgrplvb"},
-	{Opt_loccookie, "loccookie"},
-	{Opt_noloccookie, "noloccookie"},
-	{Opt_error, NULL}
-};
-
-/**
- * gfs2_mount_args - Parse mount options
- * @args: The structure into which the parsed options will be written
- * @options: The options to parse
- *
- * Return: errno
- */
-
-int gfs2_mount_args(struct gfs2_args *args, char *options)
-{
-	char *o;
-	int token;
-	substring_t tmp[MAX_OPT_ARGS];
-	int rv;
-
-	/* Split the options into tokens with the "," character and
-	   process them */
-
-	while (1) {
-		o = strsep(&options, ",");
-		if (o == NULL)
-			break;
-		if (*o == '\0')
-			continue;
-
-		token = match_token(o, tokens, tmp);
-		switch (token) {
-		case Opt_lockproto:
-			match_strlcpy(args->ar_lockproto, &tmp[0],
-				      GFS2_LOCKNAME_LEN);
-			break;
-		case Opt_locktable:
-			match_strlcpy(args->ar_locktable, &tmp[0],
-				      GFS2_LOCKNAME_LEN);
-			break;
-		case Opt_hostdata:
-			match_strlcpy(args->ar_hostdata, &tmp[0],
-				      GFS2_LOCKNAME_LEN);
-			break;
-		case Opt_spectator:
-			args->ar_spectator = 1;
-			break;
-		case Opt_ignore_local_fs:
-			/* Retained for backwards compat only */
-			break;
-		case Opt_localflocks:
-			args->ar_localflocks = 1;
-			break;
-		case Opt_localcaching:
-			/* Retained for backwards compat only */
-			break;
-		case Opt_debug:
-			if (args->ar_errors == GFS2_ERRORS_PANIC) {
-				pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
-				return -EINVAL;
-			}
-			args->ar_debug = 1;
-			break;
-		case Opt_nodebug:
-			args->ar_debug = 0;
-			break;
-		case Opt_upgrade:
-			/* Retained for backwards compat only */
-			break;
-		case Opt_acl:
-			args->ar_posix_acl = 1;
-			break;
-		case Opt_noacl:
-			args->ar_posix_acl = 0;
-			break;
-		case Opt_quota_off:
-		case Opt_noquota:
-			args->ar_quota = GFS2_QUOTA_OFF;
-			break;
-		case Opt_quota_account:
-			args->ar_quota = GFS2_QUOTA_ACCOUNT;
-			break;
-		case Opt_quota_on:
-		case Opt_quota:
-			args->ar_quota = GFS2_QUOTA_ON;
-			break;
-		case Opt_suiddir:
-			args->ar_suiddir = 1;
-			break;
-		case Opt_nosuiddir:
-			args->ar_suiddir = 0;
-			break;
-		case Opt_data_writeback:
-			args->ar_data = GFS2_DATA_WRITEBACK;
-			break;
-		case Opt_data_ordered:
-			args->ar_data = GFS2_DATA_ORDERED;
-			break;
-		case Opt_meta:
-			args->ar_meta = 1;
-			break;
-		case Opt_discard:
-			args->ar_discard = 1;
-			break;
-		case Opt_nodiscard:
-			args->ar_discard = 0;
-			break;
-		case Opt_commit:
-			rv = match_int(&tmp[0], &args->ar_commit);
-			if (rv || args->ar_commit <= 0) {
-				pr_warn("commit mount option requires a positive numeric argument\n");
-				return rv ? rv : -EINVAL;
-			}
-			break;
-		case Opt_statfs_quantum:
-			rv = match_int(&tmp[0], &args->ar_statfs_quantum);
-			if (rv || args->ar_statfs_quantum < 0) {
-				pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n");
-				return rv ? rv : -EINVAL;
-			}
-			break;
-		case Opt_quota_quantum:
-			rv = match_int(&tmp[0], &args->ar_quota_quantum);
-			if (rv || args->ar_quota_quantum <= 0) {
-				pr_warn("quota_quantum mount option requires a positive numeric argument\n");
-				return rv ? rv : -EINVAL;
-			}
-			break;
-		case Opt_statfs_percent:
-			rv = match_int(&tmp[0], &args->ar_statfs_percent);
-			if (rv || args->ar_statfs_percent < 0 ||
-			    args->ar_statfs_percent > 100) {
-				pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n");
-				return rv ? rv : -EINVAL;
-			}
-			break;
-		case Opt_err_withdraw:
-			args->ar_errors = GFS2_ERRORS_WITHDRAW;
-			break;
-		case Opt_err_panic:
-			if (args->ar_debug) {
-				pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
-				return -EINVAL;
-			}
-			args->ar_errors = GFS2_ERRORS_PANIC;
-			break;
-		case Opt_barrier:
-			args->ar_nobarrier = 0;
-			break;
-		case Opt_nobarrier:
-			args->ar_nobarrier = 1;
-			break;
-		case Opt_rgrplvb:
-			args->ar_rgrplvb = 1;
-			break;
-		case Opt_norgrplvb:
-			args->ar_rgrplvb = 0;
-			break;
-		case Opt_loccookie:
-			args->ar_loccookie = 1;
-			break;
-		case Opt_noloccookie:
-			args->ar_loccookie = 0;
-			break;
-		case Opt_error:
-		default:
-			pr_warn("invalid mount option: %s\n", o);
-			return -EINVAL;
-		}
-	}
-
-	return 0;
-}
-
 /**
  * gfs2_jindex_free - Clear all the journal index information
  * @sdp: The GFS2 superblock
@@ -847,7 +595,7 @@
  * Returns: errno
  */
 
-static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 {
 	struct gfs2_holder freeze_gh;
 	int error;
@@ -1227,84 +975,6 @@
 }
 
 /**
- * gfs2_remount_fs - called when the FS is remounted
- * @sb:  the filesystem
- * @flags:  the remount flags
- * @data:  extra data passed in (not used right now)
- *
- * Returns: errno
- */
-
-static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
-{
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	struct gfs2_args args = sdp->sd_args; /* Default to current settings */
-	struct gfs2_tune *gt = &sdp->sd_tune;
-	int error;
-
-	sync_filesystem(sb);
-
-	spin_lock(&gt->gt_spin);
-	args.ar_commit = gt->gt_logd_secs;
-	args.ar_quota_quantum = gt->gt_quota_quantum;
-	if (gt->gt_statfs_slow)
-		args.ar_statfs_quantum = 0;
-	else
-		args.ar_statfs_quantum = gt->gt_statfs_quantum;
-	spin_unlock(&gt->gt_spin);
-	error = gfs2_mount_args(&args, data);
-	if (error)
-		return error;
-
-	/* Not allowed to change locking details */
-	if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
-	    strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
-	    strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
-		return -EINVAL;
-
-	/* Some flags must not be changed */
-	if (args_neq(&args, &sdp->sd_args, spectator) ||
-	    args_neq(&args, &sdp->sd_args, localflocks) ||
-	    args_neq(&args, &sdp->sd_args, meta))
-		return -EINVAL;
-
-	if (sdp->sd_args.ar_spectator)
-		*flags |= SB_RDONLY;
-
-	if ((sb->s_flags ^ *flags) & SB_RDONLY) {
-		if (*flags & SB_RDONLY)
-			error = gfs2_make_fs_ro(sdp);
-		else
-			error = gfs2_make_fs_rw(sdp);
-	}
-
-	sdp->sd_args = args;
-	if (sdp->sd_args.ar_posix_acl)
-		sb->s_flags |= SB_POSIXACL;
-	else
-		sb->s_flags &= ~SB_POSIXACL;
-	if (sdp->sd_args.ar_nobarrier)
-		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
-	else
-		clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
-	spin_lock(&gt->gt_spin);
-	gt->gt_logd_secs = args.ar_commit;
-	gt->gt_quota_quantum = args.ar_quota_quantum;
-	if (args.ar_statfs_quantum) {
-		gt->gt_statfs_slow = 0;
-		gt->gt_statfs_quantum = args.ar_statfs_quantum;
-	}
-	else {
-		gt->gt_statfs_slow = 1;
-		gt->gt_statfs_quantum = 30;
-	}
-	spin_unlock(&gt->gt_spin);
-
-	gfs2_online_uevent(sdp);
-	return error;
-}
-
-/**
  * gfs2_drop_inode - Drop an inode (test for remote unlink)
  * @inode: The inode to drop
  *
@@ -1748,7 +1418,6 @@
 	.freeze_super		= gfs2_freeze,
 	.thaw_super		= gfs2_unfreeze,
 	.statfs			= gfs2_statfs,
-	.remount_fs		= gfs2_remount_fs,
 	.drop_inode		= gfs2_drop_inode,
 	.show_options		= gfs2_show_options,
 };
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 9d49eaa..b8bf811 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -24,8 +24,6 @@
 
 extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
 
-extern int gfs2_mount_args(struct gfs2_args *args, char *data);
-
 extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
 extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
 
@@ -33,6 +31,7 @@
 				     struct gfs2_inode **ipp);
 
 extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
+extern int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
 extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
 extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
 extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
diff --git a/fs/inode.c b/fs/inode.c
index 64bf28c..fef457a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -181,6 +181,9 @@
 	mapping->flags = 0;
 	mapping->wb_err = 0;
 	atomic_set(&mapping->i_mmap_writable, 0);
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+	atomic_set(&mapping->nr_thps, 0);
+#endif
 	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
 	mapping->private_data = NULL;
 	mapping->writeback_index = 0;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0dadbdb..aa8ac55 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -200,6 +200,7 @@
 		struct io_uring_sqe	*sq_sqes;
 
 		struct list_head	defer_list;
+		struct list_head	timeout_list;
 	} ____cacheline_aligned_in_smp;
 
 	/* IO offload */
@@ -216,6 +217,7 @@
 		struct wait_queue_head	cq_wait;
 		struct fasync_struct	*cq_fasync;
 		struct eventfd_ctx	*cq_ev_fd;
+		atomic_t		cq_timeouts;
 	} ____cacheline_aligned_in_smp;
 
 	struct io_rings	*rings;
@@ -283,6 +285,11 @@
 	struct wait_queue_entry		wait;
 };
 
+struct io_timeout {
+	struct file			*file;
+	struct hrtimer			timer;
+};
+
 /*
  * NOTE! Each of the iocb union members has the file pointer
  * as the first entry in their struct definition. So you can
@@ -294,6 +301,7 @@
 		struct file		*file;
 		struct kiocb		rw;
 		struct io_poll_iocb	poll;
+		struct io_timeout	timeout;
 	};
 
 	struct sqe_submit	submit;
@@ -313,6 +321,7 @@
 #define REQ_F_LINK_DONE		128	/* linked sqes done */
 #define REQ_F_FAIL_LINK		256	/* fail rest of links */
 #define REQ_F_SHADOW_DRAIN	512	/* link-drain shadow req */
+#define REQ_F_TIMEOUT		1024	/* timeout request */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -344,6 +353,8 @@
 };
 
 static void io_sq_wq_submit_work(struct work_struct *work);
+static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+				 long res);
 static void __io_free_req(struct io_kiocb *req);
 
 static struct kmem_cache *req_cachep;
@@ -400,26 +411,30 @@
 	INIT_LIST_HEAD(&ctx->poll_list);
 	INIT_LIST_HEAD(&ctx->cancel_list);
 	INIT_LIST_HEAD(&ctx->defer_list);
+	INIT_LIST_HEAD(&ctx->timeout_list);
 	return ctx;
 }
 
 static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 				     struct io_kiocb *req)
 {
-	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
+	/* timeout requests always honor sequence */
+	if (!(req->flags & REQ_F_TIMEOUT) &&
+	    (req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
 		return false;
 
 	return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped;
 }
 
-static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
+static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx,
+					      struct list_head *list)
 {
 	struct io_kiocb *req;
 
-	if (list_empty(&ctx->defer_list))
+	if (list_empty(list))
 		return NULL;
 
-	req = list_first_entry(&ctx->defer_list, struct io_kiocb, list);
+	req = list_first_entry(list, struct io_kiocb, list);
 	if (!io_sequence_defer(ctx, req)) {
 		list_del_init(&req->list);
 		return req;
@@ -428,6 +443,16 @@
 	return NULL;
 }
 
+static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
+{
+	return __io_get_deferred_req(ctx, &ctx->defer_list);
+}
+
+static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
+{
+	return __io_get_deferred_req(ctx, &ctx->timeout_list);
+}
+
 static void __io_commit_cqring(struct io_ring_ctx *ctx)
 {
 	struct io_rings *rings = ctx->rings;
@@ -446,25 +471,50 @@
 static inline void io_queue_async_work(struct io_ring_ctx *ctx,
 				       struct io_kiocb *req)
 {
-	int rw;
+	int rw = 0;
 
-	switch (req->submit.sqe->opcode) {
-	case IORING_OP_WRITEV:
-	case IORING_OP_WRITE_FIXED:
-		rw = !(req->rw.ki_flags & IOCB_DIRECT);
-		break;
-	default:
-		rw = 0;
-		break;
+	if (req->submit.sqe) {
+		switch (req->submit.sqe->opcode) {
+		case IORING_OP_WRITEV:
+		case IORING_OP_WRITE_FIXED:
+			rw = !(req->rw.ki_flags & IOCB_DIRECT);
+			break;
+		}
 	}
 
 	queue_work(ctx->sqo_wq[rw], &req->work);
 }
 
+static void io_kill_timeout(struct io_kiocb *req)
+{
+	int ret;
+
+	ret = hrtimer_try_to_cancel(&req->timeout.timer);
+	if (ret != -1) {
+		atomic_inc(&req->ctx->cq_timeouts);
+		list_del(&req->list);
+		io_cqring_fill_event(req->ctx, req->user_data, 0);
+		__io_free_req(req);
+	}
+}
+
+static void io_kill_timeouts(struct io_ring_ctx *ctx)
+{
+	struct io_kiocb *req, *tmp;
+
+	spin_lock_irq(&ctx->completion_lock);
+	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
+		io_kill_timeout(req);
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
 static void io_commit_cqring(struct io_ring_ctx *ctx)
 {
 	struct io_kiocb *req;
 
+	while ((req = io_get_timeout_req(ctx)) != NULL)
+		io_kill_timeout(req);
+
 	__io_commit_cqring(ctx);
 
 	while ((req = io_get_deferred_req(ctx)) != NULL) {
@@ -1248,6 +1298,51 @@
 	}
 }
 
+/*
+ * For files that don't have ->read_iter() and ->write_iter(), handle them
+ * by looping over ->read() or ->write() manually.
+ */
+static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
+			   struct iov_iter *iter)
+{
+	ssize_t ret = 0;
+
+	/*
+	 * Don't support polled IO through this interface, and we can't
+	 * support non-blocking either. For the latter, this just causes
+	 * the kiocb to be handled from an async context.
+	 */
+	if (kiocb->ki_flags & IOCB_HIPRI)
+		return -EOPNOTSUPP;
+	if (kiocb->ki_flags & IOCB_NOWAIT)
+		return -EAGAIN;
+
+	while (iov_iter_count(iter)) {
+		struct iovec iovec = iov_iter_iovec(iter);
+		ssize_t nr;
+
+		if (rw == READ) {
+			nr = file->f_op->read(file, iovec.iov_base,
+					      iovec.iov_len, &kiocb->ki_pos);
+		} else {
+			nr = file->f_op->write(file, iovec.iov_base,
+					       iovec.iov_len, &kiocb->ki_pos);
+		}
+
+		if (nr < 0) {
+			if (!ret)
+				ret = nr;
+			break;
+		}
+		ret += nr;
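+		/* a short transfer ends the loop; return the partial total */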
+		if (nr != iovec.iov_len)
+			break;
+		iov_iter_advance(iter, nr);
+	}
+
+	return ret;
+}
+
 static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 		   bool force_nonblock)
 {
@@ -1265,8 +1360,6 @@
 
 	if (unlikely(!(file->f_mode & FMODE_READ)))
 		return -EBADF;
-	if (unlikely(!file->f_op->read_iter))
-		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
 	if (ret < 0)
@@ -1281,7 +1374,11 @@
 	if (!ret) {
 		ssize_t ret2;
 
-		ret2 = call_read_iter(file, kiocb, &iter);
+		if (file->f_op->read_iter)
+			ret2 = call_read_iter(file, kiocb, &iter);
+		else
+			ret2 = loop_rw_iter(READ, file, kiocb, &iter);
+
 		/*
 		 * In case of a short read, punt to async. This can happen
 		 * if we have data partially cached. Alternatively we can
@@ -1326,8 +1423,6 @@
 	file = kiocb->ki_filp;
 	if (unlikely(!(file->f_mode & FMODE_WRITE)))
 		return -EBADF;
-	if (unlikely(!file->f_op->write_iter))
-		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
 	if (ret < 0)
@@ -1365,7 +1460,10 @@
 		}
 		kiocb->ki_flags |= IOCB_WRITE;
 
-		ret2 = call_write_iter(file, kiocb, &iter);
+		if (file->f_op->write_iter)
+			ret2 = call_write_iter(file, kiocb, &iter);
+		else
+			ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
 		if (!force_nonblock || ret2 != -EAGAIN) {
 			io_rw_done(kiocb, ret2);
 		} else {
@@ -1714,6 +1812,7 @@
 	if (!poll->file)
 		return -EBADF;
 
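+	/*
+	 * io_queue_async_work() now inspects req->submit.sqe; the sqe may be
+	 * stale by the time a poll request is punted, so clear it here.
+	 */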
+	req->submit.sqe = NULL;
 	INIT_WORK(&req->work, io_poll_complete_work);
 	events = READ_ONCE(sqe->poll_events);
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
@@ -1765,6 +1864,81 @@
 	return ipt.error;
 }
 
+static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
+{
+	struct io_ring_ctx *ctx;
+	struct io_kiocb *req;
+	unsigned long flags;
+
+	req = container_of(timer, struct io_kiocb, timeout.timer);
+	ctx = req->ctx;
+	atomic_inc(&ctx->cq_timeouts);
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	list_del(&req->list);
+
+	io_cqring_fill_event(ctx, req->user_data, -ETIME);
+	io_commit_cqring(ctx);
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	io_cqring_ev_posted(ctx);
+
+	io_put_req(req);
+	return HRTIMER_NORESTART;
+}
+
+static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	unsigned count, req_dist, tail_index;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct list_head *entry;
+	struct timespec ts;
+
+	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
+	    sqe->len != 1)
+		return -EINVAL;
+	if (copy_from_user(&ts, (void __user *) (unsigned long) sqe->addr,
+	    sizeof(ts)))
+		return -EFAULT;
+
+	/*
+	 * sqe->off holds how many events need to occur before this
+	 * timeout request is satisfied.
+	 */
+	count = READ_ONCE(sqe->off);
+	if (!count)
+		count = 1;
+
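+	/* the timeout is satisfied early once the CQ reaches this sequence */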
+	req->sequence = ctx->cached_sq_head + count - 1;
+	req->flags |= REQ_F_TIMEOUT;
+
+	/*
+	 * Insertion sort, keeping the list ordered so that the entry which
+	 * must fire first is always at the head.
+	 */
+	tail_index = ctx->cached_cq_tail - ctx->rings->sq_dropped;
+	req_dist = req->sequence - tail_index;
+	spin_lock_irq(&ctx->completion_lock);
+	list_for_each_prev(entry, &ctx->timeout_list) {
+		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
+		unsigned dist;
+
+		dist = nxt->sequence - tail_index;
+		if (req_dist >= dist)
+			break;
+	}
+	list_add(&req->list, entry);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	req->timeout.timer.function = io_timeout_fn;
+	hrtimer_start(&req->timeout.timer, timespec_to_ktime(ts),
+			HRTIMER_MODE_REL);
+	return 0;
+}
+
 static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			const struct io_uring_sqe *sqe)
 {
@@ -1842,6 +2016,9 @@
 	case IORING_OP_RECVMSG:
 		ret = io_recvmsg(req, s->sqe, force_nonblock);
 		break;
+	case IORING_OP_TIMEOUT:
+		ret = io_timeout(req, s->sqe);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -2098,13 +2275,11 @@
 	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
 		struct io_uring_sqe *sqe_copy;
 
-		sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
+		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
 		if (sqe_copy) {
 			struct async_list *list;
 
-			memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
 			s->sqe = sqe_copy;
-
 			memcpy(&req->submit, s, sizeof(*s));
 			list = io_async_list_from_sqe(ctx, s->sqe);
 			if (!io_add_to_prev_work(list, req)) {
@@ -2359,18 +2534,22 @@
 			io_queue_link_head(ctx, link, &link->submit, shadow_req,
 						true);
 			link = NULL;
+			shadow_req = NULL;
 		}
 		prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0;
 
 		if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) {
 			if (!shadow_req) {
 				shadow_req = io_get_req(ctx, NULL);
+				if (unlikely(!shadow_req))
+					goto out;
 				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
 				refcount_dec(&shadow_req->refs);
 			}
 			shadow_req->sequence = sqes[i].sequence;
 		}
 
+out:
 		if (unlikely(mm_fault)) {
 			io_cqring_add_event(ctx, sqes[i].sqe->user_data,
 						-EFAULT);
@@ -2436,7 +2615,7 @@
 			 * to sleep.
 			 */
 			if (inflight || !time_after(jiffies, timeout)) {
-				cpu_relax();
+				cond_resched();
 				continue;
 			}
 
@@ -2545,18 +2724,22 @@
 			io_queue_link_head(ctx, link, &link->submit, shadow_req,
 						force_nonblock);
 			link = NULL;
+			shadow_req = NULL;
 		}
 		prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
 
 		if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
 			if (!shadow_req) {
 				shadow_req = io_get_req(ctx, NULL);
+				if (unlikely(!shadow_req))
+					goto out;
 				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
 				refcount_dec(&shadow_req->refs);
 			}
 			shadow_req->sequence = s.sequence;
 		}
 
+out:
 		s.has_user = true;
 		s.needs_lock = false;
 		s.needs_fixed_file = false;
@@ -2585,6 +2768,38 @@
 	return submit;
 }
 
+struct io_wait_queue {
+	struct wait_queue_entry wq;
+	struct io_ring_ctx *ctx;
+	unsigned to_wait;
+	unsigned nr_timeouts;
+};
+
+static inline bool io_should_wake(struct io_wait_queue *iowq)
+{
+	struct io_ring_ctx *ctx = iowq->ctx;
+
+	/*
+	 * Wake up if we have enough events, or if a timeout occurred since we
+	 * started waiting. For timeouts, we always want to return to userspace,
+	 * regardless of event count.
+	 */
+	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
+}
+
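+/*
+ * Returning -1 leaves the entry queued without waking the task, so the
+ * waiter only wakes up once io_should_wake() is satisfied.
+ */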
+static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
+			    int wake_flags, void *key)
+{
+	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
+							wq);
+
+	if (!io_should_wake(iowq))
+		return -1;
+
+	return autoremove_wake_function(curr, mode, wake_flags, key);
+}
+
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
@@ -2592,6 +2807,15 @@
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			  const sigset_t __user *sig, size_t sigsz)
 {
+	struct io_wait_queue iowq = {
+		.wq = {
+			.private	= current,
+			.func		= io_wake_function,
+			.entry		= LIST_HEAD_INIT(iowq.wq.entry),
+		},
+		.ctx		= ctx,
+		.to_wait	= min_events,
+	};
 	struct io_rings *rings = ctx->rings;
 	int ret;
 
@@ -2611,7 +2835,21 @@
 			return ret;
 	}
 
-	ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events);
+	ret = 0;
+	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+	do {
+		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
+						TASK_INTERRUPTIBLE);
+		if (io_should_wake(&iowq))
+			break;
+		schedule();
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+	} while (1);
+	finish_wait(&ctx->wait, &iowq.wq);
+
 	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
@@ -3263,7 +3501,7 @@
 	if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
 	    ctx->rings->sq_ring_entries)
 		mask |= EPOLLOUT | EPOLLWRNORM;
-	if (READ_ONCE(ctx->rings->sq.head) != ctx->cached_cq_tail)
+	if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
@@ -3282,6 +3520,7 @@
 	percpu_ref_kill(&ctx->refs);
 	mutex_unlock(&ctx->uring_lock);
 
+	io_kill_timeouts(ctx);
 	io_poll_remove_all(ctx);
 	io_iopoll_reap_events(ctx);
 	wait_for_completion(&ctx->ctx_done);
@@ -3319,7 +3558,7 @@
 	}
 
 	page = virt_to_head_page(ptr);
-	if (sz > (PAGE_SIZE << compound_order(page)))
+	if (sz > page_size(page))
 		return -EINVAL;
 
 	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 10517ce..1fc28c2 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -24,7 +24,7 @@
 
 struct iomap_dio {
 	struct kiocb		*iocb;
-	iomap_dio_end_io_t	*end_io;
+	const struct iomap_dio_ops *dops;
 	loff_t			i_size;
 	loff_t			size;
 	atomic_t		ref;
@@ -72,18 +72,14 @@
 
 static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 {
+	const struct iomap_dio_ops *dops = dio->dops;
 	struct kiocb *iocb = dio->iocb;
 	struct inode *inode = file_inode(iocb->ki_filp);
 	loff_t offset = iocb->ki_pos;
-	ssize_t ret;
+	ssize_t ret = dio->error;
 
-	if (dio->end_io) {
-		ret = dio->end_io(iocb,
-				dio->error ? dio->error : dio->size,
-				dio->flags);
-	} else {
-		ret = dio->error;
-	}
+	if (dops && dops->end_io)
+		ret = dops->end_io(iocb, dio->size, ret, dio->flags);
 
 	if (likely(!ret)) {
 		ret = dio->size;
@@ -101,9 +97,9 @@
 	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
 	 * this invalidation fails, tough, the write still worked...
 	 *
-	 * And this page cache invalidation has to be after dio->end_io(), as
-	 * some filesystems convert unwritten extents to real allocations in
-	 * end_io() when necessary, otherwise a racing buffer read would cache
+	 * And this page cache invalidation has to be after ->end_io(), as some
+	 * filesystems convert unwritten extents to real allocations in
+	 * ->end_io() when necessary, otherwise a racing buffer read would cache
 	 * zeros from unwritten extents.
 	 */
 	if (!dio->error &&
@@ -396,7 +392,7 @@
  */
 ssize_t
 iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
-		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
+		const struct iomap_ops *ops, const struct iomap_dio_ops *dops)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = file_inode(iocb->ki_filp);
@@ -421,7 +417,7 @@
 	atomic_set(&dio->ref, 1);
 	dio->size = 0;
 	dio->i_size = i_size_read(inode);
-	dio->end_io = end_io;
+	dio->dops = dops;
 	dio->error = 0;
 	dio->flags = 0;
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 953990e..1c58859 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -89,8 +89,6 @@
 EXPORT_SYMBOL(jbd2_journal_invalidatepage);
 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
 EXPORT_SYMBOL(jbd2_journal_force_commit);
-EXPORT_SYMBOL(jbd2_journal_inode_add_write);
-EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
 EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
 EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index afc06da..bee8498 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2622,18 +2622,6 @@
 	return 0;
 }
 
-int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
-{
-	return jbd2_journal_file_inode(handle, jinode,
-			JI_WRITE_DATA | JI_WAIT_DATA, 0, LLONG_MAX);
-}
-
-int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
-{
-	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0,
-			LLONG_MAX);
-}
-
 int jbd2_journal_inode_ranged_write(handle_t *handle,
 		struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
 {
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index cbe7063..0e6406c 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -163,13 +163,11 @@
  * Opt_rp_size: size of reserved pool in KiB
  */
 enum {
-	Opt_source,
 	Opt_override_compr,
 	Opt_rp_size,
 };
 
 static const struct fs_parameter_spec jffs2_param_specs[] = {
-	fsparam_string	("source",	Opt_source),
 	fsparam_enum	("compr",	Opt_override_compr),
 	fsparam_u32	("rp_size",	Opt_rp_size),
 	{}
diff --git a/fs/locks.c b/fs/locks.c
index a364ebc..6970f55 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -212,6 +212,7 @@
 static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list);
 DEFINE_STATIC_PERCPU_RWSEM(file_rwsem);
 
+
 /*
  * The blocked_hash is used to find POSIX lock loops for deadlock detection.
  * It is protected by blocked_lock_lock.
@@ -1991,6 +1992,64 @@
 }
 EXPORT_SYMBOL(generic_setlease);
 
+#if IS_ENABLED(CONFIG_SRCU)
+/*
+ * Kernel subsystems can register to be notified on any attempt to set
+ * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd
+ * to close files that it may have cached when there is an attempt to set a
+ * conflicting lease.
+ */
+static struct srcu_notifier_head lease_notifier_chain;
+
+static inline void
+lease_notifier_chain_init(void)
+{
+	srcu_init_notifier_head(&lease_notifier_chain);
+}
+
+static inline void
+setlease_notifier(long arg, struct file_lock *lease)
+{
+	if (arg != F_UNLCK)
+		srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
+}
+
+int lease_register_notifier(struct notifier_block *nb)
+{
+	return srcu_notifier_chain_register(&lease_notifier_chain, nb);
+}
+EXPORT_SYMBOL_GPL(lease_register_notifier);
+
+void lease_unregister_notifier(struct notifier_block *nb)
+{
+	srcu_notifier_chain_unregister(&lease_notifier_chain, nb);
+}
+EXPORT_SYMBOL_GPL(lease_unregister_notifier);
+
+#else /* !IS_ENABLED(CONFIG_SRCU) */
+static inline void
+lease_notifier_chain_init(void)
+{
+}
+
+static inline void
+setlease_notifier(long arg, struct file_lock *lease)
+{
+}
+
+int lease_register_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(lease_register_notifier);
+
+void lease_unregister_notifier(struct notifier_block *nb)
+{
+}
+EXPORT_SYMBOL_GPL(lease_unregister_notifier);
+
+#endif /* IS_ENABLED(CONFIG_SRCU) */
+
 /**
  * vfs_setlease        -       sets a lease on an open file
  * @filp:	file pointer
@@ -2011,6 +2070,8 @@
 int
 vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
 {
+	if (lease)
+		setlease_notifier(arg, *lease);
 	if (filp->f_op->setlease)
 		return filp->f_op->setlease(filp, arg, lease, priv);
 	else
@@ -2924,6 +2985,7 @@
 		INIT_HLIST_HEAD(&fll->hlist);
 	}
 
+	lease_notifier_chain_init();
 	return 0;
 }
 core_initcall(filelock_init);
diff --git a/fs/namespace.c b/fs/namespace.c
index 93c0432..fe0e9e1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2802,8 +2802,6 @@
 				put_filesystem(type);
 				return -EINVAL;
 			}
-		} else {
-			subtype = "";
 		}
 	}
 
@@ -3028,7 +3026,7 @@
 	 * the remainder of the page.
 	 */
 	/* copy_from_user cannot cross TASK_SIZE ! */
-	size = TASK_SIZE - (unsigned long)data;
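+	/* strip any pointer tag bits (e.g. arm64 TBI) before the range check */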
+	size = TASK_SIZE - (unsigned long)untagged_addr(data);
 	if (size > PAGE_SIZE)
 		size = PAGE_SIZE;
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0adfd884..e180033 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1669,10 +1669,8 @@
 
 #endif /* CONFIG_NFSV4 */
 
-/*
- * Code common to create, mkdir, and mknod.
- */
-int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
+struct dentry *
+nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
 				struct nfs_fattr *fattr,
 				struct nfs4_label *label)
 {
@@ -1680,13 +1678,10 @@
 	struct inode *dir = d_inode(parent);
 	struct inode *inode;
 	struct dentry *d;
-	int error = -EACCES;
+	int error;
 
 	d_drop(dentry);
 
-	/* We may have been initialized further down */
-	if (d_really_is_positive(dentry))
-		goto out;
 	if (fhandle->size == 0) {
 		error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
 		if (error)
@@ -1702,18 +1697,32 @@
 	}
 	inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
 	d = d_splice_alias(inode, dentry);
-	if (IS_ERR(d)) {
-		error = PTR_ERR(d);
-		goto out_error;
-	}
-	dput(d);
 out:
 	dput(parent);
-	return 0;
+	return d;
 out_error:
 	nfs_mark_for_revalidate(dir);
-	dput(parent);
-	return error;
+	d = ERR_PTR(error);
+	goto out;
+}
+EXPORT_SYMBOL_GPL(nfs_add_or_obtain);
+
+/*
+ * Code common to create, mkdir, and mknod.
+ */
+int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
+				struct nfs_fattr *fattr,
+				struct nfs4_label *label)
+{
+	struct dentry *d;
+
+	d = nfs_add_or_obtain(dentry, fhandle, fattr, label);
+	if (IS_ERR(d))
+		return PTR_ERR(d);
+
+	/* Callers don't care about the returned dentry */
+	dput(d);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs_instantiate);
 
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 3cb073c..c9b605f 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -1164,6 +1164,7 @@
 	.id			= LAYOUT_NFSV4_1_FILES,
 	.name			= "LAYOUT_NFSV4_1_FILES",
 	.owner			= THIS_MODULE,
+	.flags			= PNFS_LAYOUTGET_ON_OPEN,
 	.max_layoutget_response	= 4096, /* 1 page or so... */
 	.alloc_layout_hdr	= filelayout_alloc_layout_hdr,
 	.free_layout_hdr	= filelayout_free_layout_hdr,
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e64f810..447a3c1 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -16,14 +16,6 @@
 
 struct nfs_string;
 
-/* Maximum number of readahead requests
- * FIXME: this should really be a sysctl so that users may tune it to suit
- *        their needs. People that do NFS over a slow network, might for
- *        instance want to reduce it to something closer to 1 for improved
- *        interactive response.
- */
-#define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1)
-
 static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
 {
 	if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index a3ad2d4..9eb2f1a 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -279,15 +279,17 @@
 	return data;
 }
 
-static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
+static struct dentry *
+nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
 {
 	int status;
 
 	status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
 	nfs_post_op_update_inode(dir, data->res.dir_attr);
-	if (status == 0)
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
-	return status;
+	if (status != 0)
+		return ERR_PTR(status);
+
+	return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr, NULL);
 }
 
 static void nfs3_free_createdata(struct nfs3_createdata *data)
@@ -304,6 +306,7 @@
 {
 	struct posix_acl *default_acl, *acl;
 	struct nfs3_createdata *data;
+	struct dentry *d_alias;
 	int status = -ENOMEM;
 
 	dprintk("NFS call  create %pd\n", dentry);
@@ -330,7 +333,8 @@
 		goto out;
 
 	for (;;) {
-		status = nfs3_do_create(dir, dentry, data);
+		d_alias = nfs3_do_create(dir, dentry, data);
+		status = PTR_ERR_OR_ZERO(d_alias);
 
 		if (status != -ENOTSUPP)
 			break;
@@ -355,6 +359,9 @@
 	if (status != 0)
 		goto out_release_acls;
 
+	if (d_alias)
+		dentry = d_alias;
+
 	/* When we created the file with exclusive semantics, make
 	 * sure we set the attributes afterwards. */
 	if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
@@ -372,11 +379,13 @@
 		nfs_post_op_update_inode(d_inode(dentry), data->res.fattr);
 		dprintk("NFS reply setattr (post-create): %d\n", status);
 		if (status != 0)
-			goto out_release_acls;
+			goto out_dput;
 	}
 
 	status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
 
+out_dput:
+	dput(d_alias);
 out_release_acls:
 	posix_acl_release(acl);
 	posix_acl_release(default_acl);
@@ -504,6 +513,7 @@
 		  unsigned int len, struct iattr *sattr)
 {
 	struct nfs3_createdata *data;
+	struct dentry *d_alias;
 	int status = -ENOMEM;
 
 	if (len > NFS3_MAXPATHLEN)
@@ -522,7 +532,11 @@
 	data->arg.symlink.pathlen = len;
 	data->arg.symlink.sattr = sattr;
 
-	status = nfs3_do_create(dir, dentry, data);
+	d_alias = nfs3_do_create(dir, dentry, data);
+	status = PTR_ERR_OR_ZERO(d_alias);
+
+	if (status == 0)
+		dput(d_alias);
 
 	nfs3_free_createdata(data);
 out:
@@ -535,6 +549,7 @@
 {
 	struct posix_acl *default_acl, *acl;
 	struct nfs3_createdata *data;
+	struct dentry *d_alias;
 	int status = -ENOMEM;
 
 	dprintk("NFS call  mkdir %pd\n", dentry);
@@ -553,12 +568,18 @@
 	data->arg.mkdir.len = dentry->d_name.len;
 	data->arg.mkdir.sattr = sattr;
 
-	status = nfs3_do_create(dir, dentry, data);
+	d_alias = nfs3_do_create(dir, dentry, data);
+	status = PTR_ERR_OR_ZERO(d_alias);
+
 	if (status != 0)
 		goto out_release_acls;
 
+	if (d_alias)
+		dentry = d_alias;
+
 	status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
 
+	dput(d_alias);
 out_release_acls:
 	posix_acl_release(acl);
 	posix_acl_release(default_acl);
@@ -660,6 +681,7 @@
 {
 	struct posix_acl *default_acl, *acl;
 	struct nfs3_createdata *data;
+	struct dentry *d_alias;
 	int status = -ENOMEM;
 
 	dprintk("NFS call  mknod %pd %u:%u\n", dentry,
@@ -698,12 +720,17 @@
 		goto out;
 	}
 
-	status = nfs3_do_create(dir, dentry, data);
+	d_alias = nfs3_do_create(dir, dentry, data);
+	status = PTR_ERR_OR_ZERO(d_alias);
 	if (status != 0)
 		goto out_release_acls;
 
+	if (d_alias)
+		dentry = d_alias;
+
 	status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
 
+	dput(d_alias);
 out_release_acls:
 	posix_acl_release(acl);
 	posix_acl_release(default_acl);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 3564da1..16b2e5c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -491,8 +491,6 @@
 extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
 		const struct nfs_lock_context *, nfs4_stateid *,
 		const struct cred **);
-extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst,
-		struct nfs4_state *state);
 extern bool nfs4_copy_open_stateid(nfs4_stateid *dst,
 		struct nfs4_state *state);
 
@@ -574,6 +572,15 @@
 	return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
 }
 
+static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1)
+{
+	u32 seqid = be32_to_cpu(s1->seqid);
+
+	if (++seqid == 0)
+		++seqid;
+	s1->seqid = cpu_to_be32(seqid);
+}
+
 static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
 {
 	return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1406858..11eafcf 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1073,14 +1073,26 @@
 	.rpc_call_done = nfs40_call_sync_done,
 };
 
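+/* Run a synchronous RPC task from a caller-built rpc_task_setup */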
+static int nfs4_call_sync_custom(struct rpc_task_setup *task_setup)
+{
+	int ret;
+	struct rpc_task *task;
+
+	task = rpc_run_task(task_setup);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	ret = task->tk_status;
+	rpc_put_task(task);
+	return ret;
+}
+
 static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
 				   struct nfs_server *server,
 				   struct rpc_message *msg,
 				   struct nfs4_sequence_args *args,
 				   struct nfs4_sequence_res *res)
 {
-	int ret;
-	struct rpc_task *task;
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_call_sync_data data = {
 		.seq_server = server,
@@ -1094,14 +1106,7 @@
 		.callback_data = &data
 	};
 
-	task = rpc_run_task(&task_setup);
-	if (IS_ERR(task))
-		ret = PTR_ERR(task);
-	else {
-		ret = task->tk_status;
-		rpc_put_task(task);
-	}
-	return ret;
+	return nfs4_call_sync_custom(&task_setup);
 }
 
 int nfs4_call_sync(struct rpc_clnt *clnt,
@@ -3308,6 +3313,75 @@
 	return pnfs_wait_on_layoutreturn(inode, task);
 }
 
+/*
+ * Update the seqid of an open stateid
+ */
+static void nfs4_sync_open_stateid(nfs4_stateid *dst,
+		struct nfs4_state *state)
+{
+	__be32 seqid_open;
+	u32 dst_seqid;
+	int seq;
+
+	for (;;) {
+		if (!nfs4_valid_open_stateid(state))
+			break;
+		seq = read_seqbegin(&state->seqlock);
+		if (!nfs4_state_match_open_stateid_other(state, dst)) {
+			nfs4_stateid_copy(dst, &state->open_stateid);
+			if (read_seqretry(&state->seqlock, seq))
+				continue;
+			break;
+		}
+		seqid_open = state->open_stateid.seqid;
+		if (read_seqretry(&state->seqlock, seq))
+			continue;
+
+		dst_seqid = be32_to_cpu(dst->seqid);
+		if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) < 0)
+			dst->seqid = seqid_open;
+		break;
+	}
+}
+
+/*
+ * Update the seqid of an open stateid after receiving
+ * NFS4ERR_OLD_STATEID
+ */
+static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
+		struct nfs4_state *state)
+{
+	__be32 seqid_open;
+	u32 dst_seqid;
+	bool ret;
+	int seq;
+
+	for (;;) {
+		ret = false;
+		if (!nfs4_valid_open_stateid(state))
+			break;
+		seq = read_seqbegin(&state->seqlock);
+		if (!nfs4_state_match_open_stateid_other(state, dst)) {
+			if (read_seqretry(&state->seqlock, seq))
+				continue;
+			break;
+		}
+		seqid_open = state->open_stateid.seqid;
+		if (read_seqretry(&state->seqlock, seq))
+			continue;
+
+		dst_seqid = be32_to_cpu(dst->seqid);
+		if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) >= 0)
+			dst->seqid = cpu_to_be32(dst_seqid + 1);
+		else
+			dst->seqid = seqid_open;
+		ret = true;
+		break;
+	}
+
+	return ret;
+}
+
 struct nfs4_closedata {
 	struct inode *inode;
 	struct nfs4_state *state;
@@ -3358,32 +3432,11 @@
 	trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);
 
 	/* Handle Layoutreturn errors */
-	if (calldata->arg.lr_args && task->tk_status != 0) {
-		switch (calldata->res.lr_ret) {
-		default:
-			calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
-			break;
-		case 0:
-			calldata->arg.lr_args = NULL;
-			calldata->res.lr_res = NULL;
-			break;
-		case -NFS4ERR_OLD_STATEID:
-			if (nfs4_layoutreturn_refresh_stateid(&calldata->arg.lr_args->stateid,
-						&calldata->arg.lr_args->range,
-						calldata->inode))
-				goto lr_restart;
-			/* Fallthrough */
-		case -NFS4ERR_ADMIN_REVOKED:
-		case -NFS4ERR_DELEG_REVOKED:
-		case -NFS4ERR_EXPIRED:
-		case -NFS4ERR_BAD_STATEID:
-		case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
-		case -NFS4ERR_WRONG_CRED:
-			calldata->arg.lr_args = NULL;
-			calldata->res.lr_res = NULL;
-			goto lr_restart;
-		}
-	}
+	if (pnfs_roc_done(task, calldata->inode,
+				&calldata->arg.lr_args,
+				&calldata->res.lr_res,
+				&calldata->res.lr_ret) == -EAGAIN)
+		goto out_restart;
 
 	/* hmm. we are done with the inode, and in the process of freeing
 	 * the state_owner. we keep this around to process errors
@@ -3403,7 +3456,7 @@
 			break;
 		case -NFS4ERR_OLD_STATEID:
 			/* Did we race with OPEN? */
-			if (nfs4_refresh_open_stateid(&calldata->arg.stateid,
+			if (nfs4_refresh_open_old_stateid(&calldata->arg.stateid,
 						state))
 				goto out_restart;
 			goto out_release;
@@ -3415,7 +3468,9 @@
 					task->tk_msg.rpc_cred);
 			/* Fallthrough */
 		case -NFS4ERR_BAD_STATEID:
-			break;
+			if (calldata->arg.fmode == 0)
+				break;
+			/* Fallthrough */
 		default:
 			task->tk_status = nfs4_async_handle_exception(task,
 					server, task->tk_status, &exception);
@@ -3430,8 +3485,6 @@
 	nfs_refresh_inode(calldata->inode, &calldata->fattr);
 	dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
 	return;
-lr_restart:
-	calldata->res.lr_ret = 0;
 out_restart:
 	task->tk_status = 0;
 	rpc_restart_call_prepare(task);
@@ -3472,8 +3525,8 @@
 	} else if (is_rdwr)
 		calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
 
-	if (!nfs4_valid_open_stateid(state) ||
-	    !nfs4_refresh_open_stateid(&calldata->arg.stateid, state))
+	nfs4_sync_open_stateid(&calldata->arg.stateid, state);
+	if (!nfs4_valid_open_stateid(state))
 		call_close = 0;
 	spin_unlock(&state->owner->so_lock);
 
@@ -6018,7 +6071,6 @@
 		.rpc_resp = res,
 		.rpc_cred = cred,
 	};
-	struct rpc_task *task;
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = clp->cl_rpcclient,
 		.rpc_message = &msg,
@@ -6051,17 +6103,12 @@
 	dprintk("NFS call  setclientid auth=%s, '%s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
 		clp->cl_owner_id);
-	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task)) {
-		status = PTR_ERR(task);
-		goto out;
-	}
-	status = task->tk_status;
+
+	status = nfs4_call_sync_custom(&task_setup_data);
 	if (setclientid.sc_cred) {
 		clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
 		put_rpccred(setclientid.sc_cred);
 	}
-	rpc_put_task(task);
 out:
 	trace_nfs4_setclientid(clp, status);
 	dprintk("NFS reply setclientid: %d\n", status);
@@ -6129,32 +6176,11 @@
 	trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
 
 	/* Handle Layoutreturn errors */
-	if (data->args.lr_args && task->tk_status != 0) {
-		switch(data->res.lr_ret) {
-		default:
-			data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
-			break;
-		case 0:
-			data->args.lr_args = NULL;
-			data->res.lr_res = NULL;
-			break;
-		case -NFS4ERR_OLD_STATEID:
-			if (nfs4_layoutreturn_refresh_stateid(&data->args.lr_args->stateid,
-						&data->args.lr_args->range,
-						data->inode))
-				goto lr_restart;
-			/* Fallthrough */
-		case -NFS4ERR_ADMIN_REVOKED:
-		case -NFS4ERR_DELEG_REVOKED:
-		case -NFS4ERR_EXPIRED:
-		case -NFS4ERR_BAD_STATEID:
-		case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
-		case -NFS4ERR_WRONG_CRED:
-			data->args.lr_args = NULL;
-			data->res.lr_res = NULL;
-			goto lr_restart;
-		}
-	}
+	if (pnfs_roc_done(task, data->inode,
+				&data->args.lr_args,
+				&data->res.lr_res,
+				&data->res.lr_ret) == -EAGAIN)
+		goto out_restart;
 
 	switch (task->tk_status) {
 	case 0:
@@ -6192,8 +6218,6 @@
 	}
 	data->rpc_status = task->tk_status;
 	return;
-lr_restart:
-	data->res.lr_ret = 0;
 out_restart:
 	task->tk_status = 0;
 	rpc_restart_call_prepare(task);
@@ -6386,6 +6410,42 @@
 	return err;
 }
 
+/*
+ * Update the seqid of a lock stateid after receiving
+ * NFS4ERR_OLD_STATEID
+ */
+static bool nfs4_refresh_lock_old_stateid(nfs4_stateid *dst,
+		struct nfs4_lock_state *lsp)
+{
+	struct nfs4_state *state = lsp->ls_state;
+	bool ret = false;
+
+	spin_lock(&state->state_lock);
+	if (!nfs4_stateid_match_other(dst, &lsp->ls_stateid))
+		goto out;
+	if (!nfs4_stateid_is_newer(&lsp->ls_stateid, dst))
+		nfs4_stateid_seqid_inc(dst);
+	else
+		dst->seqid = lsp->ls_stateid.seqid;
+	ret = true;
+out:
+	spin_unlock(&state->state_lock);
+	return ret;
+}
+
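+/*
+ * Resync with the current lock stateid. Returns true if the copy changed
+ * the caller's stateid, in which case the operation should be retried.
+ */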
+static bool nfs4_sync_lock_stateid(nfs4_stateid *dst,
+		struct nfs4_lock_state *lsp)
+{
+	struct nfs4_state *state = lsp->ls_state;
+	bool ret;
+
+	spin_lock(&state->state_lock);
+	ret = !nfs4_stateid_match_other(dst, &lsp->ls_stateid);
+	nfs4_stateid_copy(dst, &lsp->ls_stateid);
+	spin_unlock(&state->state_lock);
+	return ret;
+}
+
 struct nfs4_unlockdata {
 	struct nfs_locku_args arg;
 	struct nfs_locku_res res;
@@ -6403,7 +6463,8 @@
 		struct nfs_seqid *seqid)
 {
 	struct nfs4_unlockdata *p;
-	struct inode *inode = lsp->ls_state->inode;
+	struct nfs4_state *state = lsp->ls_state;
+	struct inode *inode = state->inode;
 
 	p = kzalloc(sizeof(*p), GFP_NOFS);
 	if (p == NULL)
@@ -6419,6 +6480,9 @@
 	locks_init_lock(&p->fl);
 	locks_copy_lock(&p->fl, fl);
 	p->server = NFS_SERVER(inode);
+	spin_lock(&state->state_lock);
+	nfs4_stateid_copy(&p->arg.stateid, &lsp->ls_stateid);
+	spin_unlock(&state->state_lock);
 	return p;
 }
 
@@ -6457,10 +6521,14 @@
 					task->tk_msg.rpc_cred);
 			/* Fall through */
 		case -NFS4ERR_BAD_STATEID:
-		case -NFS4ERR_OLD_STATEID:
 		case -NFS4ERR_STALE_STATEID:
-			if (!nfs4_stateid_match(&calldata->arg.stateid,
-						&calldata->lsp->ls_stateid))
+			if (nfs4_sync_lock_stateid(&calldata->arg.stateid,
+						calldata->lsp))
+				rpc_restart_call_prepare(task);
+			break;
+		case -NFS4ERR_OLD_STATEID:
+			if (nfs4_refresh_lock_old_stateid(&calldata->arg.stateid,
+						calldata->lsp))
 				rpc_restart_call_prepare(task);
 			break;
 		default:
@@ -6483,7 +6551,6 @@
 
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		goto out_wait;
-	nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid);
 	if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
 		/* Note: exit _without_ running nfs4_locku_done */
 		goto out_no_action;
@@ -7645,6 +7712,8 @@
 static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity)
 {
 	int status;
+	struct rpc_clnt *clnt = NFS_SERVER(dir)->client;
+	struct nfs_client *clp = NFS_SERVER(dir)->nfs_client;
 	struct nfs4_secinfo_arg args = {
 		.dir_fh = NFS_FH(dir),
 		.name   = name,
@@ -7657,26 +7726,37 @@
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
-	struct rpc_clnt *clnt = NFS_SERVER(dir)->client;
+	struct nfs4_call_sync_data data = {
+		.seq_server = NFS_SERVER(dir),
+		.seq_args = &args.seq_args,
+		.seq_res = &res.seq_res,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = clnt,
+		.rpc_message = &msg,
+		.callback_ops = clp->cl_mvops->call_sync_ops,
+		.callback_data = &data,
+		.flags = RPC_TASK_NO_ROUND_ROBIN,
+	};
 	const struct cred *cred = NULL;
 
 	if (use_integrity) {
-		clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient;
-		cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client);
+		clnt = clp->cl_rpcclient;
+		task_setup.rpc_client = clnt;
+
+		cred = nfs4_get_clid_cred(clp);
 		msg.rpc_cred = cred;
 	}
 
 	dprintk("NFS call  secinfo %s\n", name->name);
 
-	nfs4_state_protect(NFS_SERVER(dir)->nfs_client,
-		NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
+	nfs4_state_protect(clp, NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg);
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
+	status = nfs4_call_sync_custom(&task_setup);
 
-	status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args,
-				&res.seq_res, RPC_TASK_NO_ROUND_ROBIN);
 	dprintk("NFS reply  secinfo: %d\n", status);
 
 	put_cred(cred);
-
 	return status;
 }
 
@@ -8344,7 +8424,6 @@
 
 int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
 {
-	struct rpc_task *task;
 	struct nfs4_get_lease_time_args args;
 	struct nfs4_get_lease_time_res res = {
 		.lr_fsinfo = fsinfo,
@@ -8366,17 +8445,9 @@
 		.callback_data = &data,
 		.flags = RPC_TASK_TIMEOUT,
 	};
-	int status;
 
 	nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1);
-	task = rpc_run_task(&task_setup);
-
-	if (IS_ERR(task))
-		return PTR_ERR(task);
-
-	status = task->tk_status;
-	rpc_put_task(task);
-	return status;
+	return nfs4_call_sync_custom(&task_setup);
 }
 
 #ifdef CONFIG_NFS_V4_1
@@ -8845,7 +8916,6 @@
 		const struct cred *cred)
 {
 	struct nfs4_reclaim_complete_data *calldata;
-	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
 		.rpc_cred = cred,
@@ -8854,7 +8924,7 @@
 		.rpc_client = clp->cl_rpcclient,
 		.rpc_message = &msg,
 		.callback_ops = &nfs4_reclaim_complete_call_ops,
-		.flags = RPC_TASK_ASYNC | RPC_TASK_NO_ROUND_ROBIN,
+		.flags = RPC_TASK_NO_ROUND_ROBIN,
 	};
 	int status = -ENOMEM;
 
@@ -8869,15 +8939,7 @@
 	msg.rpc_argp = &calldata->arg;
 	msg.rpc_resp = &calldata->res;
 	task_setup_data.callback_data = calldata;
-	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task)) {
-		status = PTR_ERR(task);
-		goto out;
-	}
-	status = rpc_wait_for_completion_task(task);
-	if (status == 0)
-		status = task->tk_status;
-	rpc_put_task(task);
+	status = nfs4_call_sync_custom(&task_setup_data);
 out:
 	dprintk("<-- %s status=%d\n", __func__, status);
 	return status;
@@ -9103,10 +9165,19 @@
 	if (!nfs41_sequence_process(task, &lrp->res.seq_res))
 		return;
 
+	/*
+	 * On an RPC-level error for a call that was actually sent, assume
+	 * the server may have processed it, and release the layout.
+	 */
+	if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) {
+		lrp->res.lrs_present = 0;
+		return;
+	}
+
 	server = NFS_SERVER(lrp->args.inode);
 	switch (task->tk_status) {
 	case -NFS4ERR_OLD_STATEID:
-		if (nfs4_layoutreturn_refresh_stateid(&lrp->args.stateid,
+		if (nfs4_layout_refresh_old_stateid(&lrp->args.stateid,
 					&lrp->args.range,
 					lrp->args.inode))
 			goto out_restart;
@@ -9362,18 +9433,32 @@
 		.rpc_resp = &res,
 	};
 	struct rpc_clnt *clnt = server->client;
+	struct nfs4_call_sync_data data = {
+		.seq_server = server,
+		.seq_args = &args.seq_args,
+		.seq_res = &res.seq_res,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = server->nfs_client->cl_mvops->call_sync_ops,
+		.callback_data = &data,
+		.flags = RPC_TASK_NO_ROUND_ROBIN,
+	};
 	const struct cred *cred = NULL;
 	int status;
 
 	if (use_integrity) {
 		clnt = server->nfs_client->cl_rpcclient;
+		task_setup.rpc_client = clnt;
+
 		cred = nfs4_get_clid_cred(server->nfs_client);
 		msg.rpc_cred = cred;
 	}
 
 	dprintk("--> %s\n", __func__);
-	status = nfs4_call_sync(clnt, server, &msg, &args.seq_args,
-				&res.seq_res, RPC_TASK_NO_ROUND_ROBIN);
+	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
+	status = nfs4_call_sync_custom(&task_setup);
 	dprintk("<-- %s status=%d\n", __func__, status);
 
 	put_cred(cred);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index cad4e06..0c6d53d 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1015,22 +1015,6 @@
 	return ret;
 }
 
-bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
-{
-	bool ret;
-	int seq;
-
-	do {
-		ret = false;
-		seq = read_seqbegin(&state->seqlock);
-		if (nfs4_state_match_open_stateid_other(state, dst)) {
-			dst->seqid = state->open_stateid.seqid;
-			ret = true;
-		}
-	} while (read_seqretry(&state->seqlock, seq));
-	return ret;
-}
-
 bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
 {
 	bool ret;
@@ -2095,8 +2079,10 @@
 	}
 
 	status = nfs4_begin_drain_session(clp);
-	if (status != 0)
-		return status;
+	if (status != 0) {
+		result = status;
+		goto out;
+	}
 
 	status = nfs4_replace_transport(server, locations);
 	if (status != 0) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 46a8d63..ab07db0 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1174,7 +1174,7 @@
 		} else
 			*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
 	}
-	if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+	if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) {
 		*p++ = cpu_to_be32(label->lfs);
 		*p++ = cpu_to_be32(label->pi);
 		*p++ = cpu_to_be32(label->len);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4525d5a..bb80034 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -359,9 +359,10 @@
 }
 
 /*
- * Update the seqid of a layout stateid
+ * Update the seqid of a layout stateid after receiving
+ * NFS4ERR_OLD_STATEID
  */
-bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
+bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
 		struct pnfs_layout_range *dst_range,
 		struct inode *inode)
 {
@@ -377,7 +378,15 @@
 
 	spin_lock(&inode->i_lock);
 	lo = NFS_I(inode)->layout;
-	if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
+	if (lo && pnfs_layout_is_valid(lo) &&
+	    nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
+		/* Is our call using the most recent seqid? If so, bump it */
+		if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) {
+			nfs4_stateid_seqid_inc(dst);
+			ret = true;
+			goto out;
+		}
+		/* Try to update the seqid to the most recent */
 		err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
 		if (err != -EBUSY) {
 			dst->seqid = lo->plh_stateid.seqid;
@@ -385,6 +394,7 @@
 			ret = true;
 		}
 	}
+out:
 	spin_unlock(&inode->i_lock);
 	pnfs_free_lseg_list(&head);
 	return ret;
@@ -1440,6 +1450,52 @@
 	return false;
 }
 
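+/*
+ * Handle the result of a LAYOUTRETURN piggy-backed on a CLOSE or
+ * DELEGRETURN: returns -EAGAIN if the caller should restart the RPC,
+ * 0 if the result has been handled here.
+ */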
+int pnfs_roc_done(struct rpc_task *task, struct inode *inode,
+		struct nfs4_layoutreturn_args **argpp,
+		struct nfs4_layoutreturn_res **respp,
+		int *ret)
+{
+	struct nfs4_layoutreturn_args *arg = *argpp;
+	int retval = -EAGAIN;
+
+	if (!arg)
+		return 0;
+	/* Handle Layoutreturn errors */
+	switch (*ret) {
+	case 0:
+		retval = 0;
+		break;
+	case -NFS4ERR_NOMATCHING_LAYOUT:
+		/* Was there an RPC level error? If not, retry */
+		if (task->tk_rpc_status == 0)
+			break;
+		/* If the call was not sent, let caller handle it */
+		if (!RPC_WAS_SENT(task))
+			return 0;
+		/*
+		 * Otherwise, assume the call succeeded and
+		 * that we need to release the layout
+		 */
+		*ret = 0;
+		(*respp)->lrs_present = 0;
+		retval = 0;
+		break;
+	case -NFS4ERR_DELAY:
+		/* Let the caller handle the retry */
+		*ret = -NFS4ERR_NOMATCHING_LAYOUT;
+		return 0;
+	case -NFS4ERR_OLD_STATEID:
+		if (!nfs4_layout_refresh_old_stateid(&arg->stateid,
+					&arg->range, inode))
+			break;
+		*ret = -NFS4ERR_NOMATCHING_LAYOUT;
+		return -EAGAIN;
+	}
+	*argpp = NULL;
+	*respp = NULL;
+	return retval;
+}
+
 void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
 		struct nfs4_layoutreturn_res *res,
 		int ret)
@@ -1449,10 +1505,15 @@
 	const nfs4_stateid *res_stateid = NULL;
 	struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
 
-	if (ret == 0) {
-		arg_stateid = &args->stateid;
+	switch (ret) {
+	case -NFS4ERR_NOMATCHING_LAYOUT:
+		break;
+	case 0:
 		if (res->lrs_present)
 			res_stateid = &res->stateid;
+		/* Fallthrough */
+	default:
+		arg_stateid = &args->stateid;
 	}
 	pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
 			res_stateid);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f15609c..f8a3806 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -261,7 +261,7 @@
 		bool is_recall);
 int pnfs_destroy_layouts_byclid(struct nfs_client *clp,
 		bool is_recall);
-bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
+bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
 		struct pnfs_layout_range *dst_range,
 		struct inode *inode);
 void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -282,6 +282,10 @@
 		struct nfs4_layoutreturn_args *args,
 		struct nfs4_layoutreturn_res *res,
 		const struct cred *cred);
+int pnfs_roc_done(struct rpc_task *task, struct inode *inode,
+		struct nfs4_layoutreturn_args **argpp,
+		struct nfs4_layoutreturn_res **respp,
+		int *ret);
 void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
 		struct nfs4_layoutreturn_res *res,
 		int ret);
@@ -701,6 +705,15 @@
 	return false;
 }
 
+static inline int
+pnfs_roc_done(struct rpc_task *task, struct inode *inode,
+		struct nfs4_layoutreturn_args **argpp,
+		struct nfs4_layoutreturn_res **respp,
+		int *ret)
+{
+	return 0;
+}
+
 static inline void
 pnfs_roc_release(struct nfs4_layoutreturn_args *args,
 		struct nfs4_layoutreturn_res *res,
@@ -785,7 +798,7 @@
 {
 }
 
-static inline bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst,
+static inline bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
 		struct pnfs_layout_range *dst_range,
 		struct inode *inode)
 {
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 19a76cf..a84df7d6 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2645,6 +2645,13 @@
 }
 EXPORT_SYMBOL_GPL(nfs_clone_sb_security);
 
+static void nfs_set_readahead(struct backing_dev_info *bdi,
+			      unsigned long iomax_pages)
+{
+	bdi->ra_pages = VM_READAHEAD_PAGES;
+	bdi->io_pages = iomax_pages;
+}
+
 struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 				   int flags, const char *dev_name,
 				   struct nfs_mount_info *mount_info,
@@ -2687,7 +2694,7 @@
 			mntroot = ERR_PTR(error);
 			goto error_splat_super;
 		}
-		s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
+		nfs_set_readahead(s->s_bdi, server->rpages);
 		server->super = s;
 	}
 
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index d25f6bb..10cefb0 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -3,6 +3,7 @@
 	tristate "NFS server support"
 	depends on INET
 	depends on FILE_LOCKING
+	depends on FSNOTIFY
 	select LOCKD
 	select SUNRPC
 	select EXPORTFS
@@ -147,7 +148,7 @@
 
 config NFSD_FAULT_INJECTION
 	bool "NFS server manual fault injection"
-	depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS
+	depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS && BROKEN
 	help
 	  This option enables support for manually injecting faults
 	  into the NFS server.  This is intended to be used for
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 2bfb58e..6a40b1a 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -11,7 +11,8 @@
 nfsd-y			+= trace.o
 
 nfsd-y 			+= nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
-			   export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
+			   export.o auth.o lockd.o nfscache.o nfsxdr.o \
+			   stats.o filecache.o
 nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
 nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
 nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index 4cd7c69..ba14d2f 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -39,14 +39,6 @@
 struct svc_fh;
 struct svc_rqst;
 
-/*
- * Maximum ACL we'll accept from a client; chosen (somewhat
- * arbitrarily) so that kmalloc'ing the ACL shouldn't require a
- * high-order allocation.  This allows 204 ACEs on x86_64:
- */
-#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \
-			/ sizeof(struct nfs4_ace))
-
 int nfs4_acl_bytes(int entries);
 int nfs4_acl_get_whotype(char *, u32);
 __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 66d4c55..9bbaa67 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -15,6 +15,7 @@
 
 #include "blocklayoutxdr.h"
 #include "pnfs.h"
+#include "filecache.h"
 
 #define NFSDDBG_FACILITY	NFSDDBG_PNFS
 
@@ -404,7 +405,7 @@
 nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
 {
 	struct nfs4_client *clp = ls->ls_stid.sc_client;
-	struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev;
+	struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev;
 
 	bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
 			nfsd4_scsi_pr_key(clp), 0, true);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index baa0195..15422c9 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -22,6 +22,7 @@
 #include "nfsfh.h"
 #include "netns.h"
 #include "pnfs.h"
+#include "filecache.h"
 
 #define NFSDDBG_FACILITY	NFSDDBG_EXPORT
 
@@ -232,6 +233,17 @@
 		return NULL;
 }
 
+static void expkey_flush(void)
+{
+	/*
+	 * Take the nfsd_mutex here to ensure that the file cache is not
+	 * destroyed while we're in the middle of flushing.
+	 */
+	mutex_lock(&nfsd_mutex);
+	nfsd_file_cache_purge(current->nsproxy->net_ns);
+	mutex_unlock(&nfsd_mutex);
+}
+
 static const struct cache_detail svc_expkey_cache_template = {
 	.owner		= THIS_MODULE,
 	.hash_size	= EXPKEY_HASHMAX,
@@ -244,6 +256,7 @@
 	.init		= expkey_init,
 	.update       	= expkey_update,
 	.alloc		= expkey_alloc,
+	.flush		= expkey_flush,
 };
 
 static int
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
new file mode 100644
index 0000000..ef55e9b
--- /dev/null
+++ b/fs/nfsd/filecache.c
@@ -0,0 +1,934 @@
+/*
+ * Open file cache.
+ *
+ * (c) 2015 - Jeff Layton <jeff.layton@primarydata.com>
+ */
+
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/list_lru.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/fsnotify.h>
+#include <linux/seq_file.h>
+
+#include "vfs.h"
+#include "nfsd.h"
+#include "nfsfh.h"
+#include "netns.h"
+#include "filecache.h"
+#include "trace.h"
+
+#define NFSDDBG_FACILITY	NFSDDBG_FH
+
+/* FIXME: dynamically size this for the machine somehow? */
+#define NFSD_FILE_HASH_BITS                   12
+#define NFSD_FILE_HASH_SIZE                  (1 << NFSD_FILE_HASH_BITS)
+#define NFSD_LAUNDRETTE_DELAY		     (2 * HZ)
+
+#define NFSD_FILE_LRU_RESCAN		     (0)
+#define NFSD_FILE_SHUTDOWN		     (1)
+#define NFSD_FILE_LRU_THRESHOLD		     (4096UL)
+#define NFSD_FILE_LRU_LIMIT		     (NFSD_FILE_LRU_THRESHOLD << 2)
+
+/* We only care about NFSD_MAY_READ/WRITE for this cache */
+#define NFSD_FILE_MAY_MASK	(NFSD_MAY_READ|NFSD_MAY_WRITE)
+
+struct nfsd_fcache_bucket {
+	struct hlist_head	nfb_head;
+	spinlock_t		nfb_lock;
+	unsigned int		nfb_count;
+	unsigned int		nfb_maxcount;
+};
+
+static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
+
+static struct kmem_cache		*nfsd_file_slab;
+static struct kmem_cache		*nfsd_file_mark_slab;
+static struct nfsd_fcache_bucket	*nfsd_file_hashtbl;
+static struct list_lru			nfsd_file_lru;
+static long				nfsd_file_lru_flags;
+static struct fsnotify_group		*nfsd_file_fsnotify_group;
+static atomic_long_t			nfsd_filecache_count;
+static struct delayed_work		nfsd_filecache_laundrette;
+
+enum nfsd_file_laundrette_ctl {
+	NFSD_FILE_LAUNDRETTE_NOFLUSH = 0,
+	NFSD_FILE_LAUNDRETTE_MAY_FLUSH
+};
+
+static void
+nfsd_file_schedule_laundrette(enum nfsd_file_laundrette_ctl ctl)
+{
+	long count = atomic_long_read(&nfsd_filecache_count);
+
+	if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags))
+		return;
+
+	/* Be more aggressive about scanning if over the threshold */
+	if (count > NFSD_FILE_LRU_THRESHOLD)
+		mod_delayed_work(system_wq, &nfsd_filecache_laundrette, 0);
+	else
+		schedule_delayed_work(&nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY);
+
+	if (ctl == NFSD_FILE_LAUNDRETTE_NOFLUSH)
+		return;
+
+	/* ...and don't delay flushing if we're out of control */
+	if (count >= NFSD_FILE_LRU_LIMIT)
+		flush_delayed_work(&nfsd_filecache_laundrette);
+}
+
+static void
+nfsd_file_slab_free(struct rcu_head *rcu)
+{
+	struct nfsd_file *nf = container_of(rcu, struct nfsd_file, nf_rcu);
+
+	put_cred(nf->nf_cred);
+	kmem_cache_free(nfsd_file_slab, nf);
+}
+
+static void
+nfsd_file_mark_free(struct fsnotify_mark *mark)
+{
+	struct nfsd_file_mark *nfm = container_of(mark, struct nfsd_file_mark,
+						  nfm_mark);
+
+	kmem_cache_free(nfsd_file_mark_slab, nfm);
+}
+
+static struct nfsd_file_mark *
+nfsd_file_mark_get(struct nfsd_file_mark *nfm)
+{
+	if (!atomic_inc_not_zero(&nfm->nfm_ref))
+		return NULL;
+	return nfm;
+}
+
+static void
+nfsd_file_mark_put(struct nfsd_file_mark *nfm)
+{
+	if (atomic_dec_and_test(&nfm->nfm_ref)) {
+
+		fsnotify_destroy_mark(&nfm->nfm_mark, nfsd_file_fsnotify_group);
+		fsnotify_put_mark(&nfm->nfm_mark);
+	}
+}
+
+static struct nfsd_file_mark *
+nfsd_file_mark_find_or_create(struct nfsd_file *nf)
+{
+	int			err;
+	struct fsnotify_mark	*mark;
+	struct nfsd_file_mark	*nfm = NULL, *new;
+	struct inode *inode = nf->nf_inode;
+
+	do {
+		mutex_lock(&nfsd_file_fsnotify_group->mark_mutex);
+		mark = fsnotify_find_mark(&inode->i_fsnotify_marks,
+				nfsd_file_fsnotify_group);
+		if (mark) {
+			nfm = nfsd_file_mark_get(container_of(mark,
+						 struct nfsd_file_mark,
+						 nfm_mark));
+			mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+			fsnotify_put_mark(mark);
+			if (likely(nfm))
+				break;
+		} else
+			mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+
+		/* allocate a new nfm */
+		new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL);
+		if (!new)
+			return NULL;
+		fsnotify_init_mark(&new->nfm_mark, nfsd_file_fsnotify_group);
+		new->nfm_mark.mask = FS_ATTRIB|FS_DELETE_SELF;
+		atomic_set(&new->nfm_ref, 1);
+
+		err = fsnotify_add_inode_mark(&new->nfm_mark, inode, 0);
+
+		/*
+		 * If the add was successful, then return the object.
+		 * Otherwise, we need to put the reference we hold on the
+		 * nfm_mark. The fsnotify code will take a reference and put
+		 * it on failure, so we can't just free it directly. It's also
+		 * not safe to call fsnotify_destroy_mark on it as the
+		 * mark->group will be NULL. Thus, we can't let the nfm_ref
+		 * counter drive the destruction at this point.
+		 */
+		if (likely(!err))
+			nfm = new;
+		else
+			fsnotify_put_mark(&new->nfm_mark);
+	} while (unlikely(err == -EEXIST));
+
+	return nfm;
+}
+
+static struct nfsd_file *
+nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
+		struct net *net)
+{
+	struct nfsd_file *nf;
+
+	nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL);
+	if (nf) {
+		INIT_HLIST_NODE(&nf->nf_node);
+		INIT_LIST_HEAD(&nf->nf_lru);
+		nf->nf_file = NULL;
+		nf->nf_cred = get_current_cred();
+		nf->nf_net = net;
+		nf->nf_flags = 0;
+		nf->nf_inode = inode;
+		nf->nf_hashval = hashval;
+		atomic_set(&nf->nf_ref, 1);
+		nf->nf_may = may & NFSD_FILE_MAY_MASK;
+		if (may & NFSD_MAY_NOT_BREAK_LEASE) {
+			if (may & NFSD_MAY_WRITE)
+				__set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags);
+			if (may & NFSD_MAY_READ)
+				__set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
+		}
+		nf->nf_mark = NULL;
+		trace_nfsd_file_alloc(nf);
+	}
+	return nf;
+}
+
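+/*
+ * Free a cached open file. Returns true if a struct file was closed, in
+ * which case the caller may need to flush delayed fput work.
+ */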
+static bool
+nfsd_file_free(struct nfsd_file *nf)
+{
+	bool flush = false;
+
+	trace_nfsd_file_put_final(nf);
+	if (nf->nf_mark)
+		nfsd_file_mark_put(nf->nf_mark);
+	if (nf->nf_file) {
+		get_file(nf->nf_file);
+		filp_close(nf->nf_file, NULL);
+		fput(nf->nf_file);
+		flush = true;
+	}
+	call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
+	return flush;
+}
+
+static bool
+nfsd_file_check_writeback(struct nfsd_file *nf)
+{
+	struct file *file = nf->nf_file;
+	struct address_space *mapping;
+
+	if (!file || !(file->f_mode & FMODE_WRITE))
+		return false;
+	mapping = file->f_mapping;
+	return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
+		mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
+}
+
+static int
+nfsd_file_check_write_error(struct nfsd_file *nf)
+{
+	struct file *file = nf->nf_file;
+
+	if (!file || !(file->f_mode & FMODE_WRITE))
+		return 0;
+	return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
+}
+
+static bool
+nfsd_file_in_use(struct nfsd_file *nf)
+{
+	return nfsd_file_check_writeback(nf) ||
+			nfsd_file_check_write_error(nf);
+}
+
+static void
+nfsd_file_do_unhash(struct nfsd_file *nf)
+{
+	lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+
+	trace_nfsd_file_unhash(nf);
+
+	if (nfsd_file_check_write_error(nf))
+		nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id));
+	--nfsd_file_hashtbl[nf->nf_hashval].nfb_count;
+	hlist_del_rcu(&nf->nf_node);
+	if (!list_empty(&nf->nf_lru))
+		list_lru_del(&nfsd_file_lru, &nf->nf_lru);
+	atomic_long_dec(&nfsd_filecache_count);
+}
+
+static bool
+nfsd_file_unhash(struct nfsd_file *nf)
+{
+	if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+		nfsd_file_do_unhash(nf);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Return true if the file was unhashed.
+ */
+static bool
+nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose)
+{
+	lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+
+	trace_nfsd_file_unhash_and_release_locked(nf);
+	if (!nfsd_file_unhash(nf))
+		return false;
+	/* keep final reference for nfsd_file_lru_dispose */
+	if (atomic_add_unless(&nf->nf_ref, -1, 1))
+		return true;
+
+	list_add(&nf->nf_lru, dispose);
+	return true;
+}
+
+static int
+nfsd_file_put_noref(struct nfsd_file *nf)
+{
+	int count;
+	trace_nfsd_file_put(nf);
+
+	count = atomic_dec_return(&nf->nf_ref);
+	if (!count) {
+		WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
+		nfsd_file_free(nf);
+	}
+	return count;
+}
+
+void
+nfsd_file_put(struct nfsd_file *nf)
+{
+	bool is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0;
+	bool unused = !nfsd_file_in_use(nf);
+
+	set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+	if (nfsd_file_put_noref(nf) == 1 && is_hashed && unused)
+		nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_MAY_FLUSH);
+}
+
+struct nfsd_file *
+nfsd_file_get(struct nfsd_file *nf)
+{
+	if (likely(atomic_inc_not_zero(&nf->nf_ref)))
+		return nf;
+	return NULL;
+}
+
+static void
+nfsd_file_dispose_list(struct list_head *dispose)
+{
+	struct nfsd_file *nf;
+
+	while (!list_empty(dispose)) {
+		nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
+		list_del(&nf->nf_lru);
+		nfsd_file_put_noref(nf);
+	}
+}
+
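+/* as nfsd_file_dispose_list(), but wait for delayed final fputs to run */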
+static void
+nfsd_file_dispose_list_sync(struct list_head *dispose)
+{
+	bool flush = false;
+	struct nfsd_file *nf;
+
+	while (!list_empty(dispose)) {
+		nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
+		list_del(&nf->nf_lru);
+		if (!atomic_dec_and_test(&nf->nf_ref))
+			continue;
+		if (nfsd_file_free(nf))
+			flush = true;
+	}
+	if (flush)
+		flush_delayed_fput();
+}
+
+/*
+ * Note this can deadlock with nfsd_file_cache_purge.
+ */
+static enum lru_status
+nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
+		 spinlock_t *lock, void *arg)
+	__releases(lock)
+	__acquires(lock)
+{
+	struct list_head *head = arg;
+	struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
+
+	/*
+	 * Do a lockless refcount check. The hashtable holds one reference, so
+	 * we look to see if anything else has a reference, or if any have
+	 * been put since the shrinker last ran. Those don't get unhashed and
+	 * released.
+	 *
+	 * Note that in the put path, we set the flag and then decrement the
+	 * counter. Here we check the counter and then test and clear the flag.
+	 * That order is deliberate to ensure that we can do this locklessly.
+	 */
+	if (atomic_read(&nf->nf_ref) > 1)
+		goto out_skip;
+
+	/*
+	 * Don't throw out files that are still undergoing I/O or
+	 * that have uncleared errors pending.
+	 */
+	if (nfsd_file_check_writeback(nf))
+		goto out_skip;
+
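+	/* give entries used since the last scan a second pass before reaping */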
+	if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags))
+		goto out_rescan;
+
+	if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags))
+		goto out_skip;
+
+	list_lru_isolate_move(lru, &nf->nf_lru, head);
+	return LRU_REMOVED;
+out_rescan:
+	set_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags);
+out_skip:
+	return LRU_SKIP;
+}
+
+static void
+nfsd_file_lru_dispose(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct nfsd_file *nf = list_first_entry(head,
+				struct nfsd_file, nf_lru);
+		list_del_init(&nf->nf_lru);
+		spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+		nfsd_file_do_unhash(nf);
+		spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+		nfsd_file_put_noref(nf);
+	}
+}
+
+static unsigned long
+nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc)
+{
+	return list_lru_count(&nfsd_file_lru);
+}
+
+static unsigned long
+nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
+{
+	LIST_HEAD(head);
+	unsigned long ret;
+
+	ret = list_lru_shrink_walk(&nfsd_file_lru, sc, nfsd_file_lru_cb, &head);
+	nfsd_file_lru_dispose(&head);
+	return ret;
+}
+
+static struct shrinker	nfsd_file_shrinker = {
+	.scan_objects = nfsd_file_lru_scan,
+	.count_objects = nfsd_file_lru_count,
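+	/* below DEFAULT_SEEKS: bias the shrinker toward reclaiming these */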
+	.seeks = 1,
+};
+
+static void
+__nfsd_file_close_inode(struct inode *inode, unsigned int hashval,
+			struct list_head *dispose)
+{
+	struct nfsd_file	*nf;
+	struct hlist_node	*tmp;
+
+	spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+	hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) {
+		if (inode == nf->nf_inode)
+			nfsd_file_unhash_and_release_locked(nf, dispose);
+	}
+	spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+}
+
+/**
+ * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * @inode: inode of the file to attempt to remove
+ *
+ * Walk the whole hash bucket, looking for any files that correspond to "inode".
+ * If any do, then unhash them, drop the hashtable's reference to them, and
+ * destroy any that had their last reference put. Also ensure that any
+ * resulting delayed fputs have completed their final __fput.
+ */
+void
+nfsd_file_close_inode_sync(struct inode *inode)
+{
+	unsigned int		hashval = (unsigned int)hash_long(inode->i_ino,
+						NFSD_FILE_HASH_BITS);
+	LIST_HEAD(dispose);
+
+	__nfsd_file_close_inode(inode, hashval, &dispose);
+	trace_nfsd_file_close_inode_sync(inode, hashval, !list_empty(&dispose));
+	nfsd_file_dispose_list_sync(&dispose);
+}
+
+/**
+ * nfsd_file_close_inode - attempt to forcibly close a nfsd_file
+ * @inode: inode of the file to attempt to remove
+ *
+ * Walk the whole hash bucket, looking for any files that correspond to "inode".
+ * If any do, then unhash them, drop the hashtable's reference to them, and
+ * destroy any that had their last reference put.
+ */
+static void
+nfsd_file_close_inode(struct inode *inode)
+{
+	unsigned int		hashval = (unsigned int)hash_long(inode->i_ino,
+						NFSD_FILE_HASH_BITS);
+	LIST_HEAD(dispose);
+
+	__nfsd_file_close_inode(inode, hashval, &dispose);
+	trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose));
+	nfsd_file_dispose_list(&dispose);
+}
+
+/**
+ * nfsd_file_delayed_close - close unused nfsd_files
+ * @work: dummy
+ *
+ * Walk the LRU list and close any entries that have not been used since
+ * the last scan.
+ *
+ * Note this can deadlock with nfsd_file_cache_purge.
+ */
+static void
+nfsd_file_delayed_close(struct work_struct *work)
+{
+	LIST_HEAD(head);
+
+	list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, &head, LONG_MAX);
+
+	if (test_and_clear_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags))
+		nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_NOFLUSH);
+
+	if (!list_empty(&head)) {
+		nfsd_file_lru_dispose(&head);
+		flush_delayed_fput();
+	}
+}
+
+static int
+nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
+			    void *data)
+{
+	struct file_lock *fl = data;
+
+	/* Only close files for F_SETLEASE leases */
+	if (fl->fl_flags & FL_LEASE)
+		nfsd_file_close_inode_sync(file_inode(fl->fl_file));
+	return 0;
+}
+
+static struct notifier_block nfsd_file_lease_notifier = {
+	.notifier_call = nfsd_file_lease_notifier_call,
+};
+
+static int
+nfsd_file_fsnotify_handle_event(struct fsnotify_group *group,
+				struct inode *inode,
+				u32 mask, const void *data, int data_type,
+				const struct qstr *file_name, u32 cookie,
+				struct fsnotify_iter_info *iter_info)
+{
+	trace_nfsd_file_fsnotify_handle_event(inode, mask);
+
+	/* Should be no marks on non-regular files */
+	if (!S_ISREG(inode->i_mode)) {
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	/* don't close files if this was not the last link */
+	if (mask & FS_ATTRIB) {
+		if (inode->i_nlink)
+			return 0;
+	}
+
+	nfsd_file_close_inode(inode);
+	return 0;
+}
+
+static const struct fsnotify_ops nfsd_file_fsnotify_ops = {
+	.handle_event = nfsd_file_fsnotify_handle_event,
+	.free_mark = nfsd_file_mark_free,
+};
+
+int
+nfsd_file_cache_init(void)
+{
+	int		ret = -ENOMEM;
+	unsigned int	i;
+
+	clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
+
+	if (nfsd_file_hashtbl)
+		return 0;
+
+	nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE,
+				sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
+	if (!nfsd_file_hashtbl) {
+		pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
+		goto out_err;
+	}
+
+	nfsd_file_slab = kmem_cache_create("nfsd_file",
+				sizeof(struct nfsd_file), 0, 0, NULL);
+	if (!nfsd_file_slab) {
+		pr_err("nfsd: unable to create nfsd_file_slab\n");
+		goto out_err;
+	}
+
+	nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark",
+					sizeof(struct nfsd_file_mark), 0, 0, NULL);
+	if (!nfsd_file_mark_slab) {
+		pr_err("nfsd: unable to create nfsd_file_mark_slab\n");
+		goto out_err;
+	}
+
+	ret = list_lru_init(&nfsd_file_lru);
+	if (ret) {
+		pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret);
+		goto out_err;
+	}
+
+	ret = register_shrinker(&nfsd_file_shrinker);
+	if (ret) {
+		pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
+		goto out_lru;
+	}
+
+	ret = lease_register_notifier(&nfsd_file_lease_notifier);
+	if (ret) {
+		pr_err("nfsd: unable to register lease notifier: %d\n", ret);
+		goto out_shrinker;
+	}
+
+	nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops);
+	if (IS_ERR(nfsd_file_fsnotify_group)) {
+		pr_err("nfsd: unable to create fsnotify group: %ld\n",
+			PTR_ERR(nfsd_file_fsnotify_group));
+		nfsd_file_fsnotify_group = NULL;
+		goto out_notifier;
+	}
+
+	for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+		INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head);
+		spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock);
+	}
+
+	INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_delayed_close);
+out:
+	return ret;
+out_notifier:
+	lease_unregister_notifier(&nfsd_file_lease_notifier);
+out_shrinker:
+	unregister_shrinker(&nfsd_file_shrinker);
+out_lru:
+	list_lru_destroy(&nfsd_file_lru);
+out_err:
+	kmem_cache_destroy(nfsd_file_slab);
+	nfsd_file_slab = NULL;
+	kmem_cache_destroy(nfsd_file_mark_slab);
+	nfsd_file_mark_slab = NULL;
+	kfree(nfsd_file_hashtbl);
+	nfsd_file_hashtbl = NULL;
+	goto out;
+}
+
+/*
+ * Note this can deadlock with nfsd_file_lru_cb.
+ */
+void
+nfsd_file_cache_purge(struct net *net)
+{
+	unsigned int		i;
+	struct nfsd_file	*nf;
+	struct hlist_node	*next;
+	LIST_HEAD(dispose);
+	bool del;
+
+	if (!nfsd_file_hashtbl)
+		return;
+
+	for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+		struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i];
+
+		spin_lock(&nfb->nfb_lock);
+		hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) {
+			if (net && nf->nf_net != net)
+				continue;
+			del = nfsd_file_unhash_and_release_locked(nf, &dispose);
+
+			/*
+			 * Deadlock detected! Something marked this entry as
+			 * unhashed, but hasn't removed it from the hash list.
+			 */
+			WARN_ON_ONCE(!del);
+		}
+		spin_unlock(&nfb->nfb_lock);
+		nfsd_file_dispose_list(&dispose);
+	}
+}
+
+void
+nfsd_file_cache_shutdown(void)
+{
+	LIST_HEAD(dispose);
+
+	set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
+
+	lease_unregister_notifier(&nfsd_file_lease_notifier);
+	unregister_shrinker(&nfsd_file_shrinker);
+	/*
+	 * make sure all callers of nfsd_file_lru_cb are done before
+	 * calling nfsd_file_cache_purge
+	 */
+	cancel_delayed_work_sync(&nfsd_filecache_laundrette);
+	nfsd_file_cache_purge(NULL);
+	list_lru_destroy(&nfsd_file_lru);
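+	/* wait for RCU-deferred nfsd_file frees before destroying the slab */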
+	rcu_barrier();
+	fsnotify_put_group(nfsd_file_fsnotify_group);
+	nfsd_file_fsnotify_group = NULL;
+	kmem_cache_destroy(nfsd_file_slab);
+	nfsd_file_slab = NULL;
+	fsnotify_wait_marks_destroyed();
+	kmem_cache_destroy(nfsd_file_mark_slab);
+	nfsd_file_mark_slab = NULL;
+	kfree(nfsd_file_hashtbl);
+	nfsd_file_hashtbl = NULL;
+}
+
+static bool
+nfsd_match_cred(const struct cred *c1, const struct cred *c2)
+{
+	int i;
+
+	if (!uid_eq(c1->fsuid, c2->fsuid))
+		return false;
+	if (!gid_eq(c1->fsgid, c2->fsgid))
+		return false;
+	if (c1->group_info == NULL || c2->group_info == NULL)
+		return c1->group_info == c2->group_info;
+	if (c1->group_info->ngroups != c2->group_info->ngroups)
+		return false;
+	for (i = 0; i < c1->group_info->ngroups; i++) {
+		if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i]))
+			return false;
+	}
+	return true;
+}
+
+static struct nfsd_file *
+nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
+			unsigned int hashval, struct net *net)
+{
+	struct nfsd_file *nf;
+	unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
+
+	hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
+				 nf_node) {
+		if ((need & nf->nf_may) != need)
+			continue;
+		if (nf->nf_inode != inode)
+			continue;
+		if (nf->nf_net != net)
+			continue;
+		if (!nfsd_match_cred(nf->nf_cred, current_cred()))
+			continue;
+		if (nfsd_file_get(nf) != NULL)
+			return nf;
+	}
+	return NULL;
+}
+
+/**
+ * nfsd_file_is_cached - are there any cached open files for this inode?
+ * @inode: inode of the file to check
+ *
+ * Scan the hashtable for open files that match this inode. Returns true if there
+ * are any, and false if not.
+ */
+bool
+nfsd_file_is_cached(struct inode *inode)
+{
+	bool			ret = false;
+	struct nfsd_file	*nf;
+	unsigned int		hashval;
+
+	hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
+				 nf_node) {
+		if (inode == nf->nf_inode) {
+			ret = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	trace_nfsd_file_is_cached(inode, hashval, (int)ret);
+	return ret;
+}
+
+__be32
+nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		  unsigned int may_flags, struct nfsd_file **pnf)
+{
+	__be32	status;
+	struct net *net = SVC_NET(rqstp);
+	struct nfsd_file *nf, *new;
+	struct inode *inode;
+	unsigned int hashval;
+
+	/* FIXME: skip this if fh_dentry is already set? */
+	status = fh_verify(rqstp, fhp, S_IFREG,
+				may_flags|NFSD_MAY_OWNER_OVERRIDE);
+	if (status != nfs_ok)
+		return status;
+
+	inode = d_inode(fhp->fh_dentry);
+	hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
+retry:
+	rcu_read_lock();
+	nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
+	rcu_read_unlock();
+	if (nf)
+		goto wait_for_construction;
+
+	new = nfsd_file_alloc(inode, may_flags, hashval, net);
+	if (!new) {
+		trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags,
+					NULL, nfserr_jukebox);
+		return nfserr_jukebox;
+	}
+
+	spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+	nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
+	if (nf == NULL)
+		goto open_file;
+	spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+	nfsd_file_slab_free(&new->nf_rcu);
+
+wait_for_construction:
+	wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE);
+
+	/* Did construction of this file fail? */
+	if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+		nfsd_file_put_noref(nf);
+		goto retry;
+	}
+
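+	/*
+	 * Only lookups that found an entry count as hits; the thread that
+	 * constructs a new entry jumps straight to "out" below.
+	 */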
+	this_cpu_inc(nfsd_file_cache_hits);
+
+	if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) {
+		bool write = (may_flags & NFSD_MAY_WRITE);
+
+		if (test_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags) ||
+		    (test_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags) && write)) {
+			status = nfserrno(nfsd_open_break_lease(
+					file_inode(nf->nf_file), may_flags));
+			if (status == nfs_ok) {
+				clear_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
+				if (write)
+					clear_bit(NFSD_FILE_BREAK_WRITE,
+						  &nf->nf_flags);
+			}
+		}
+	}
+out:
+	if (status == nfs_ok) {
+		*pnf = nf;
+	} else {
+		nfsd_file_put(nf);
+		nf = NULL;
+	}
+
+	trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status);
+	return status;
+open_file:
+	nf = new;
+	/* Take reference for the hashtable */
+	atomic_inc(&nf->nf_ref);
+	__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
+	__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+	list_lru_add(&nfsd_file_lru, &nf->nf_lru);
+	hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head);
+	++nfsd_file_hashtbl[hashval].nfb_count;
+	nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount,
+			nfsd_file_hashtbl[hashval].nfb_count);
+	spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+	atomic_long_inc(&nfsd_filecache_count);
+
+	nf->nf_mark = nfsd_file_mark_find_or_create(nf);
+	if (nf->nf_mark)
+		status = nfsd_open_verified(rqstp, fhp, S_IFREG,
+				may_flags, &nf->nf_file);
+	else
+		status = nfserr_jukebox;
+	/*
+	 * If construction failed, or we raced with a call to unlink()
+	 * then unhash.
+	 */
+	if (status != nfs_ok || inode->i_nlink == 0) {
+		bool do_free;
+
+		spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+		do_free = nfsd_file_unhash(nf);
+		spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+		if (do_free)
+			nfsd_file_put_noref(nf);
+	}
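+	/*
+	 * Release PENDING and make the flag update visible before waking
+	 * anyone blocked in wait_on_bit() above.
+	 */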
+	clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
+	goto out;
+}
+
+/*
+ * Note that fields may be added, removed or reordered in the future. Programs
+ * scraping this file for info should test the labels to ensure they're
+ * getting the correct field.
+ */
+static int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
+{
+	unsigned int i, count = 0, longest = 0;
+	unsigned long hits = 0;
+
+	/*
+	 * No need for spinlocks here since we're not terribly interested in
+	 * accuracy. We do take the nfsd_mutex simply to ensure that we
+	 * don't end up racing with server shutdown.
+	 */
+	mutex_lock(&nfsd_mutex);
+	if (nfsd_file_hashtbl) {
+		for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+			count += nfsd_file_hashtbl[i].nfb_count;
+			longest = max(longest, nfsd_file_hashtbl[i].nfb_count);
+		}
+	}
+	mutex_unlock(&nfsd_mutex);
+
+	for_each_possible_cpu(i)
+		hits += per_cpu(nfsd_file_cache_hits, i);
+
+	seq_printf(m, "total entries: %u\n", count);
+	seq_printf(m, "longest chain: %u\n", longest);
+	seq_printf(m, "cache hits:    %lu\n", hits);
+	return 0;
+}
+
+int nfsd_file_cache_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, nfsd_file_cache_stats_show, NULL);
+}
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
new file mode 100644
index 0000000..851d9ab
--- /dev/null
+++ b/fs/nfsd/filecache.h
@@ -0,0 +1,61 @@
+#ifndef _FS_NFSD_FILECACHE_H
+#define _FS_NFSD_FILECACHE_H
+
+#include <linux/fsnotify_backend.h>
+
+/*
+ * This is the fsnotify_mark container that nfsd attaches to the files that it
+ * is holding open. Note that we have a separate refcount here aside from the
+ * one in the fsnotify_mark. We only want a single fsnotify_mark attached to
+ * the inode, and for each nfsd_file to hold a reference to it.
+ *
+ * The fsnotify_mark is itself refcounted, but that's not sufficient to tell us
+ * how to put that reference. If there are still outstanding nfsd_files that
+ * reference the mark, then we would want to call fsnotify_put_mark on it.
+ * If there were not, then we'd need to call fsnotify_destroy_mark. Since we
+ * can't really tell the difference, we use the nfm_mark to keep track of how
+ * many nfsd_files hold references to the mark. When that counter goes to zero
+ * then we know to call fsnotify_destroy_mark on it.
+ */
+struct nfsd_file_mark {
+	struct fsnotify_mark	nfm_mark;
+	atomic_t		nfm_ref;
+};
+
+/*
+ * A representation of a file that has been opened by knfsd. These are hashed
+ * in the hashtable by inode pointer value. Note that this object doesn't
+ * hold a reference to the inode by itself, so the nf_inode pointer should
+ * never be dereferenced, only used for comparison.
+ */
+struct nfsd_file {
+	struct hlist_node	nf_node;
+	struct list_head	nf_lru;
+	struct rcu_head		nf_rcu;
+	struct file		*nf_file;
+	const struct cred	*nf_cred;
+	struct net		*nf_net;
+#define NFSD_FILE_HASHED	(0)
+#define NFSD_FILE_PENDING	(1)
+#define NFSD_FILE_BREAK_READ	(2)
+#define NFSD_FILE_BREAK_WRITE	(3)
+#define NFSD_FILE_REFERENCED	(4)
+	unsigned long		nf_flags;
+	struct inode		*nf_inode;
+	unsigned int		nf_hashval;
+	atomic_t		nf_ref;
+	unsigned char		nf_may;
+	struct nfsd_file_mark	*nf_mark;
+};
+
+int nfsd_file_cache_init(void);
+void nfsd_file_cache_purge(struct net *);
+void nfsd_file_cache_shutdown(void);
+void nfsd_file_put(struct nfsd_file *nf);
+struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
+void nfsd_file_close_inode_sync(struct inode *inode);
+bool nfsd_file_is_cached(struct inode *inode);
+__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		  unsigned int may_flags, struct nfsd_file **nfp);
+int	nfsd_file_cache_stats_open(struct inode *, struct file *);
+#endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index bdfe5bc..9a4ef81 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -104,6 +104,7 @@
 
 	/* Time of server startup */
 	struct timespec64 nfssvc_boot;
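+	/* protects nfssvc_boot, which supplies the boot/write verifier */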
+	seqlock_t boot_lock;
 
 	/*
 	 * Max number of connections this nfsd container will allow. Defaults
@@ -179,4 +180,7 @@
 extern void nfsd_netns_free_versions(struct nfsd_net *nn);
 
 extern unsigned int nfsd_net_id;
+
+void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn);
+void nfsd_reset_boot_verifier(struct nfsd_net *nn);
 #endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9bc32af..cea68d8 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -172,13 +172,8 @@
 	nfserr = nfsd_read(rqstp, &resp->fh,
 				  argp->offset,
 			   	  rqstp->rq_vec, argp->vlen,
-				  &resp->count);
-	if (nfserr == 0) {
-		struct inode	*inode = d_inode(resp->fh.fh_dentry);
-		resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset,
-							inode->i_size);
-	}
-
+				  &resp->count,
+				  &resp->eof);
 	RETURN_STATUS(nfserr);
 }
 
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index fcf3182..86e5658 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -27,6 +27,7 @@
 	NF3SOCK, NF3BAD,  NF3LNK, NF3BAD,
 };
 
+
 /*
  * XDR functions for basic NFS types
  */
@@ -751,14 +752,16 @@
 {
 	struct nfsd3_writeres *resp = rqstp->rq_resp;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	__be32 verf[2];
 
 	p = encode_wcc_data(rqstp, p, &resp->fh);
 	if (resp->status == 0) {
 		*p++ = htonl(resp->count);
 		*p++ = htonl(resp->committed);
 		/* unique identifier, y2038 overflow can be ignored */
-		*p++ = htonl((u32)nn->nfssvc_boot.tv_sec);
-		*p++ = htonl(nn->nfssvc_boot.tv_nsec);
+		nfsd_copy_boot_verifier(verf, nn);
+		*p++ = verf[0];
+		*p++ = verf[1];
 	}
 	return xdr_ressize_check(rqstp, p);
 }
@@ -1125,13 +1128,15 @@
 {
 	struct nfsd3_commitres *resp = rqstp->rq_resp;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	__be32 verf[2];
 
 	p = encode_wcc_data(rqstp, p, &resp->fh);
 	/* Write verifier */
 	if (resp->status == 0) {
 		/* unique identifier, y2038 overflow can be ignored */
-		*p++ = htonl((u32)nn->nfssvc_boot.tv_sec);
-		*p++ = htonl(nn->nfssvc_boot.tv_nsec);
+		nfsd_copy_boot_verifier(verf, nn);
+		*p++ = verf[0];
+		*p++ = verf[1];
 	}
 	return xdr_ressize_check(rqstp, p);
 }
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 397eb78..5241114 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -512,11 +512,9 @@
 	if (unlikely(status))
 		return status;
 
-	if (cb != NULL) {
-		status = decode_cb_sequence4res(xdr, cb);
-		if (unlikely(status || cb->cb_seq_status))
-			return status;
-	}
+	status = decode_cb_sequence4res(xdr, cb);
+	if (unlikely(status || cb->cb_seq_status))
+		return status;
 
 	return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status);
 }
@@ -604,11 +602,10 @@
 	if (unlikely(status))
 		return status;
 
-	if (cb) {
-		status = decode_cb_sequence4res(xdr, cb);
-		if (unlikely(status || cb->cb_seq_status))
-			return status;
-	}
+	status = decode_cb_sequence4res(xdr, cb);
+	if (unlikely(status || cb->cb_seq_status))
+		return status;
+
 	return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status);
 }
 #endif /* CONFIG_NFSD_PNFS */
@@ -663,11 +660,10 @@
 	if (unlikely(status))
 		return status;
 
-	if (cb) {
-		status = decode_cb_sequence4res(xdr, cb);
-		if (unlikely(status || cb->cb_seq_status))
-			return status;
-	}
+	status = decode_cb_sequence4res(xdr, cb);
+	if (unlikely(status || cb->cb_seq_status))
+		return status;
+
 	return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status);
 }
 
@@ -759,11 +755,10 @@
 	if (unlikely(status))
 		return status;
 
-	if (cb) {
-		status = decode_cb_sequence4res(xdr, cb);
-		if (unlikely(status || cb->cb_seq_status))
-			return status;
-	}
+	status = decode_cb_sequence4res(xdr, cb);
+	if (unlikely(status || cb->cb_seq_status))
+		return status;
+
 	return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status);
 }
 /*
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index a79e24b..2681c70 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -169,8 +169,8 @@
 	spin_unlock(&fp->fi_lock);
 
 	if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
-		vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
-	fput(ls->ls_file);
+		vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls);
+	nfsd_file_put(ls->ls_file);
 
 	if (ls->ls_recalled)
 		atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
@@ -197,7 +197,7 @@
 	fl->fl_end = OFFSET_MAX;
 	fl->fl_owner = ls;
 	fl->fl_pid = current->tgid;
-	fl->fl_file = ls->ls_file;
+	fl->fl_file = ls->ls_file->nf_file;
 
 	status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
 	if (status) {
@@ -236,13 +236,13 @@
 			NFSPROC4_CLNT_CB_LAYOUT);
 
 	if (parent->sc_type == NFS4_DELEG_STID)
-		ls->ls_file = get_file(fp->fi_deleg_file);
+		ls->ls_file = nfsd_file_get(fp->fi_deleg_file);
 	else
 		ls->ls_file = find_any_file(fp);
 	BUG_ON(!ls->ls_file);
 
 	if (nfsd4_layout_setlease(ls)) {
-		fput(ls->ls_file);
+		nfsd_file_put(ls->ls_file);
 		put_nfs4_file(fp);
 		kmem_cache_free(nfs4_layout_stateid_cache, ls);
 		return NULL;
@@ -626,7 +626,7 @@
 
 	argv[0] = (char *)nfsd_recall_failed;
 	argv[1] = addr_str;
-	argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
+	argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id;
 	argv[3] = NULL;
 
 	error = call_usermodehelper(nfsd_recall_failed, argv, envp,
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8beda99..4e3e77b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -568,17 +568,11 @@
 
 static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
 {
-	__be32 verf[2];
-	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	__be32 *verf = (__be32 *)verifier->data;
 
-	/*
-	 * This is opaque to client, so no need to byte-swap. Use
-	 * __force to keep sparse happy. y2038 time_t overflow is
-	 * irrelevant in this usage.
-	 */
-	verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
-	verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
-	memcpy(verifier->data, verf, sizeof(verifier->data));
+	BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data));
+
+	nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id));
 }
 
 static __be32
@@ -761,7 +755,7 @@
 	struct nfsd4_read *read = &u->read;
 	__be32 status;
 
-	read->rd_filp = NULL;
+	read->rd_nf = NULL;
 	if (read->rd_offset >= OFFSET_MAX)
 		return nfserr_inval;
 
@@ -782,7 +776,7 @@
 	/* check stateid */
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
 					&read->rd_stateid, RD_STATE,
-					&read->rd_filp, &read->rd_tmp_file);
+					&read->rd_nf);
 	if (status) {
 		dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
 		goto out;
@@ -798,8 +792,8 @@
 static void
 nfsd4_read_release(union nfsd4_op_u *u)
 {
-	if (u->read.rd_filp)
-		fput(u->read.rd_filp);
+	if (u->read.rd_nf)
+		nfsd_file_put(u->read.rd_nf);
 	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
 			     u->read.rd_offset, u->read.rd_length);
 }
@@ -954,7 +948,7 @@
 	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
 		status = nfs4_preprocess_stateid_op(rqstp, cstate,
 				&cstate->current_fh, &setattr->sa_stateid,
-				WR_STATE, NULL, NULL);
+				WR_STATE, NULL);
 		if (status) {
 			dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
 			return status;
@@ -993,7 +987,7 @@
 {
 	struct nfsd4_write *write = &u->write;
 	stateid_t *stateid = &write->wr_stateid;
-	struct file *filp = NULL;
+	struct nfsd_file *nf = NULL;
 	__be32 status = nfs_ok;
 	unsigned long cnt;
 	int nvecs;
@@ -1005,7 +999,7 @@
 	trace_nfsd_write_start(rqstp, &cstate->current_fh,
 			       write->wr_offset, cnt);
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
-						stateid, WR_STATE, &filp, NULL);
+						stateid, WR_STATE, &nf);
 	if (status) {
 		dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
 		return status;
@@ -1018,10 +1012,10 @@
 				      &write->wr_head, write->wr_buflen);
 	WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
 
-	status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp,
+	status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf->nf_file,
 				write->wr_offset, rqstp->rq_vec, nvecs, &cnt,
 				write->wr_how_written);
-	fput(filp);
+	nfsd_file_put(nf);
 
 	write->wr_bytes_written = cnt;
 	trace_nfsd_write_done(rqstp, &cstate->current_fh,
@@ -1031,8 +1025,8 @@
 
 static __be32
 nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
-		  stateid_t *src_stateid, struct file **src,
-		  stateid_t *dst_stateid, struct file **dst)
+		  stateid_t *src_stateid, struct nfsd_file **src,
+		  stateid_t *dst_stateid, struct nfsd_file **dst)
 {
 	__be32 status;
 
@@ -1040,22 +1034,22 @@
 		return nfserr_nofilehandle;
 
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
-					    src_stateid, RD_STATE, src, NULL);
+					    src_stateid, RD_STATE, src);
 	if (status) {
 		dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
 		goto out;
 	}
 
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
-					    dst_stateid, WR_STATE, dst, NULL);
+					    dst_stateid, WR_STATE, dst);
 	if (status) {
 		dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
 		goto out_put_src;
 	}
 
 	/* fix up for NFS-specific error code */
-	if (!S_ISREG(file_inode(*src)->i_mode) ||
-	    !S_ISREG(file_inode(*dst)->i_mode)) {
+	if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) ||
+	    !S_ISREG(file_inode((*dst)->nf_file)->i_mode)) {
 		status = nfserr_wrong_type;
 		goto out_put_dst;
 	}
@@ -1063,9 +1057,9 @@
 out:
 	return status;
 out_put_dst:
-	fput(*dst);
+	nfsd_file_put(*dst);
 out_put_src:
-	fput(*src);
+	nfsd_file_put(*src);
 	goto out;
 }
 
@@ -1074,7 +1068,7 @@
 		union nfsd4_op_u *u)
 {
 	struct nfsd4_clone *clone = &u->clone;
-	struct file *src, *dst;
+	struct nfsd_file *src, *dst;
 	__be32 status;
 
 	status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src,
@@ -1082,11 +1076,11 @@
 	if (status)
 		goto out;
 
-	status = nfsd4_clone_file_range(src, clone->cl_src_pos,
-			dst, clone->cl_dst_pos, clone->cl_count);
+	status = nfsd4_clone_file_range(src->nf_file, clone->cl_src_pos,
+			dst->nf_file, clone->cl_dst_pos, clone->cl_count);
 
-	fput(dst);
-	fput(src);
+	nfsd_file_put(dst);
+	nfsd_file_put(src);
 out:
 	return status;
 }
@@ -1176,8 +1170,9 @@
 	do {
 		if (kthread_should_stop())
 			break;
-		bytes_copied = nfsd_copy_file_range(copy->file_src, src_pos,
-				copy->file_dst, dst_pos, bytes_total);
+		bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file,
+				src_pos, copy->nf_dst->nf_file, dst_pos,
+				bytes_total);
 		if (bytes_copied <= 0)
 			break;
 		bytes_total -= bytes_copied;
@@ -1204,8 +1199,8 @@
 		status = nfs_ok;
 	}
 
-	fput(copy->file_src);
-	fput(copy->file_dst);
+	nfsd_file_put(copy->nf_src);
+	nfsd_file_put(copy->nf_dst);
 	return status;
 }
 
@@ -1218,16 +1213,16 @@
 	memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res));
 	memcpy(&dst->fh, &src->fh, sizeof(src->fh));
 	dst->cp_clp = src->cp_clp;
-	dst->file_dst = get_file(src->file_dst);
-	dst->file_src = get_file(src->file_src);
+	dst->nf_dst = nfsd_file_get(src->nf_dst);
+	dst->nf_src = nfsd_file_get(src->nf_src);
 	memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid));
 }
 
 static void cleanup_async_copy(struct nfsd4_copy *copy)
 {
 	nfs4_free_cp_state(copy);
-	fput(copy->file_dst);
-	fput(copy->file_src);
+	nfsd_file_put(copy->nf_dst);
+	nfsd_file_put(copy->nf_src);
 	spin_lock(&copy->cp_clp->async_lock);
 	list_del(&copy->copies);
 	spin_unlock(&copy->cp_clp->async_lock);
@@ -1264,8 +1259,8 @@
 	struct nfsd4_copy *async_copy = NULL;
 
 	status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid,
-				   &copy->file_src, &copy->cp_dst_stateid,
-				   &copy->file_dst);
+				   &copy->nf_src, &copy->cp_dst_stateid,
+				   &copy->nf_dst);
 	if (status)
 		goto out;
 
@@ -1347,21 +1342,21 @@
 		struct nfsd4_fallocate *fallocate, int flags)
 {
 	__be32 status;
-	struct file *file;
+	struct nfsd_file *nf;
 
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
 					    &fallocate->falloc_stateid,
-					    WR_STATE, &file, NULL);
+					    WR_STATE, &nf);
 	if (status != nfs_ok) {
 		dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
 		return status;
 	}
 
-	status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file,
+	status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file,
 				     fallocate->falloc_offset,
 				     fallocate->falloc_length,
 				     flags);
-	fput(file);
+	nfsd_file_put(nf);
 	return status;
 }
 static __be32
@@ -1406,11 +1401,11 @@
 	struct nfsd4_seek *seek = &u->seek;
 	int whence;
 	__be32 status;
-	struct file *file;
+	struct nfsd_file *nf;
 
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
 					    &seek->seek_stateid,
-					    RD_STATE, &file, NULL);
+					    RD_STATE, &nf);
 	if (status) {
 		dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
 		return status;
@@ -1432,14 +1427,14 @@
 	 * Note:  This call does change file->f_pos, but nothing in NFSD
 	 *        should ever use file->f_pos.
 	 */
-	seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence);
+	seek->seek_pos = vfs_llseek(nf->nf_file, seek->seek_offset, whence);
 	if (seek->seek_pos < 0)
 		status = nfserrno(seek->seek_pos);
-	else if (seek->seek_pos >= i_size_read(file_inode(file)))
+	else if (seek->seek_pos >= i_size_read(file_inode(nf->nf_file)))
 		seek->seek_eof = true;
 
 out:
-	fput(file);
+	nfsd_file_put(nf);
 	return status;
 }
 
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 8767955..cdc75ad 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -59,8 +59,13 @@
 	void (*remove)(struct nfs4_client *);
 	int (*check)(struct nfs4_client *);
 	void (*grace_done)(struct nfsd_net *);
+	uint8_t version;
+	size_t msglen;
 };
 
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops;
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2;
+
 /* Globals */
 static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
 
@@ -173,6 +178,7 @@
 		const char *dname, int len, struct nfsd_net *nn)
 {
 	struct xdr_netobj name;
+	struct xdr_netobj princhash = { .len = 0, .data = NULL };
 	struct nfs4_client_reclaim *crp;
 
 	name.data = kmemdup(dname, len, GFP_KERNEL);
@@ -182,7 +188,7 @@
 		return;
 	}
 	name.len = len;
-	crp = nfs4_client_to_reclaim(name, nn);
+	crp = nfs4_client_to_reclaim(name, princhash, nn);
 	if (!crp) {
 		kfree(name.data);
 		return;
@@ -482,6 +488,7 @@
 load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 {
 	struct xdr_netobj name;
+	struct xdr_netobj princhash = { .len = 0, .data = NULL };
 
 	if (child->d_name.len != HEXDIR_LEN - 1) {
 		printk("%s: illegal name %pd in recovery directory\n",
@@ -496,7 +503,7 @@
 		goto out;
 	}
 	name.len = HEXDIR_LEN;
-	if (!nfs4_client_to_reclaim(name, nn))
+	if (!nfs4_client_to_reclaim(name, princhash, nn))
 		kfree(name.data);
 out:
 	return 0;
@@ -718,6 +725,8 @@
 	.remove		= nfsd4_remove_clid_dir,
 	.check		= nfsd4_check_legacy_client,
 	.grace_done	= nfsd4_recdir_purge_old,
+	.version	= 1,
+	.msglen		= 0,
 };
 
 /* Globals */
@@ -731,25 +740,32 @@
 	struct list_head	 cn_list;
 	unsigned int		 cn_xid;
 	bool			 cn_has_legacy;
+	struct crypto_shash	*cn_tfm;
 };
 
 struct cld_upcall {
 	struct list_head	 cu_list;
 	struct cld_net		*cu_net;
 	struct completion	 cu_done;
-	struct cld_msg		 cu_msg;
+	union {
+		struct cld_msg_hdr	 cu_hdr;
+		struct cld_msg		 cu_msg;
+		struct cld_msg_v2	 cu_msg_v2;
+	} cu_u;
 };
 
 static int
-__cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
+__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg)
 {
 	int ret;
 	struct rpc_pipe_msg msg;
-	struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_msg);
+	struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u);
+	struct nfsd_net *nn = net_generic(pipe->dentry->d_sb->s_fs_info,
+					  nfsd_net_id);
 
 	memset(&msg, 0, sizeof(msg));
 	msg.data = cmsg;
-	msg.len = sizeof(*cmsg);
+	msg.len = nn->client_tracking_ops->msglen;
 
 	ret = rpc_queue_upcall(pipe, &msg);
 	if (ret < 0) {
@@ -765,7 +781,7 @@
 }
 
 static int
-cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
+cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg)
 {
 	int ret;
 
@@ -781,11 +797,11 @@
 }
 
 static ssize_t
-__cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg,
+__cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
 		struct nfsd_net *nn)
 {
-	uint8_t cmd;
-	struct xdr_netobj name;
+	uint8_t cmd, princhashlen;
+	struct xdr_netobj name, princhash = { .len = 0, .data = NULL };
 	uint16_t namelen;
 	struct cld_net *cn = nn->cld_net;
 
@@ -794,22 +810,48 @@
 		return -EFAULT;
 	}
 	if (cmd == Cld_GraceStart) {
-		if (get_user(namelen, &cmsg->cm_u.cm_name.cn_len))
-			return -EFAULT;
-		name.data = memdup_user(&cmsg->cm_u.cm_name.cn_id, namelen);
-		if (IS_ERR_OR_NULL(name.data))
-			return -EFAULT;
-		name.len = namelen;
+		if (nn->client_tracking_ops->version >= 2) {
+			const struct cld_clntinfo __user *ci;
+
+			ci = &cmsg->cm_u.cm_clntinfo;
+			if (get_user(namelen, &ci->cc_name.cn_len))
+				return -EFAULT;
+			name.data = memdup_user(&ci->cc_name.cn_id, namelen);
+			if (IS_ERR_OR_NULL(name.data))
+				return -EFAULT;
+			name.len = namelen;
+			if (get_user(princhashlen, &ci->cc_princhash.cp_len)) {
+				kfree(name.data);
+				return -EFAULT;
+			}
+			if (princhashlen > 0) {
+				princhash.data = memdup_user(
+						&ci->cc_princhash.cp_data,
+						princhashlen);
+				if (IS_ERR_OR_NULL(princhash.data))
+					return -EFAULT;
+				princhash.len = princhashlen;
+			} else {
+				princhash.len = 0;
+			}
+		} else {
+			const struct cld_name __user *cnm;
+
+			cnm = &cmsg->cm_u.cm_name;
+			if (get_user(namelen, &cnm->cn_len))
+				return -EFAULT;
+			name.data = memdup_user(&cnm->cn_id, namelen);
+			if (IS_ERR_OR_NULL(name.data))
+				return -EFAULT;
+			name.len = namelen;
+		}
 		if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) {
 			name.len = name.len - 5;
 			memmove(name.data, name.data + 5, name.len);
 			cn->cn_has_legacy = true;
 		}
-		if (!nfs4_client_to_reclaim(name, nn)) {
+		if (!nfs4_client_to_reclaim(name, princhash, nn)) {
 			kfree(name.data);
+			kfree(princhash.data);
 			return -EFAULT;
 		}
-		return sizeof(*cmsg);
+		return nn->client_tracking_ops->msglen;
 	}
 	return -EFAULT;
 }
@@ -818,21 +860,22 @@
 cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
 	struct cld_upcall *tmp, *cup;
-	struct cld_msg __user *cmsg = (struct cld_msg __user *)src;
+	struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src;
+	struct cld_msg_v2 __user *cmsg = (struct cld_msg_v2 __user *)src;
 	uint32_t xid;
 	struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
 						nfsd_net_id);
 	struct cld_net *cn = nn->cld_net;
 	int16_t status;
 
-	if (mlen != sizeof(*cmsg)) {
+	if (mlen != nn->client_tracking_ops->msglen) {
 		dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen,
-			sizeof(*cmsg));
+			nn->client_tracking_ops->msglen);
 		return -EINVAL;
 	}
 
 	/* copy just the xid so we can try to find that */
-	if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) {
+	if (copy_from_user(&xid, &hdr->cm_xid, sizeof(xid)) != 0) {
 		dprintk("%s: error when copying xid from userspace", __func__);
 		return -EFAULT;
 	}
@@ -842,7 +885,7 @@
 	 * list (for -EINPROGRESS, we just want to make sure the xid is
 	 * valid, not remove the upcall from the list)
 	 */
-	if (get_user(status, &cmsg->cm_status)) {
+	if (get_user(status, &hdr->cm_status)) {
 		dprintk("%s: error when copying status from userspace", __func__);
 		return -EFAULT;
 	}
@@ -851,7 +894,7 @@
 	cup = NULL;
 	spin_lock(&cn->cn_lock);
 	list_for_each_entry(tmp, &cn->cn_list, cu_list) {
-		if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) {
+		if (get_unaligned(&tmp->cu_u.cu_hdr.cm_xid) == xid) {
 			cup = tmp;
 			if (status != -EINPROGRESS)
 				list_del_init(&cup->cu_list);
@@ -869,7 +912,7 @@
 	if (status == -EINPROGRESS)
 		return __cld_pipe_inprogress_downcall(cmsg, nn);
 
-	if (copy_from_user(&cup->cu_msg, src, mlen) != 0)
+	if (copy_from_user(&cup->cu_u.cu_msg_v2, src, mlen) != 0)
 		return -EFAULT;
 
 	complete(&cup->cu_done);
@@ -881,7 +924,7 @@
 {
 	struct cld_msg *cmsg = msg->data;
 	struct cld_upcall *cup = container_of(cmsg, struct cld_upcall,
-						 cu_msg);
+						 cu_u.cu_msg);
 
 	/* errno >= 0 means we got a downcall */
 	if (msg->errno >= 0)
@@ -1007,14 +1050,17 @@
 
 	nfsd4_cld_unregister_net(net, cn->cn_pipe);
 	rpc_destroy_pipe_data(cn->cn_pipe);
+	if (cn->cn_tfm)
+		crypto_free_shash(cn->cn_tfm);
 	kfree(nn->cld_net);
 	nn->cld_net = NULL;
 }
 
 static struct cld_upcall *
-alloc_cld_upcall(struct cld_net *cn)
+alloc_cld_upcall(struct nfsd_net *nn)
 {
 	struct cld_upcall *new, *tmp;
+	struct cld_net *cn = nn->cld_net;
 
 	new = kzalloc(sizeof(*new), GFP_KERNEL);
 	if (!new)
@@ -1024,20 +1070,20 @@
 restart_search:
 	spin_lock(&cn->cn_lock);
 	list_for_each_entry(tmp, &cn->cn_list, cu_list) {
-		if (tmp->cu_msg.cm_xid == cn->cn_xid) {
+		if (tmp->cu_u.cu_msg.cm_xid == cn->cn_xid) {
 			cn->cn_xid++;
 			spin_unlock(&cn->cn_lock);
 			goto restart_search;
 		}
 	}
 	init_completion(&new->cu_done);
-	new->cu_msg.cm_vers = CLD_UPCALL_VERSION;
-	put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid);
+	new->cu_u.cu_msg.cm_vers = nn->client_tracking_ops->version;
+	put_unaligned(cn->cn_xid++, &new->cu_u.cu_msg.cm_xid);
 	new->cu_net = cn;
 	list_add(&new->cu_list, &cn->cn_list);
 	spin_unlock(&cn->cn_lock);
 
-	dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid);
+	dprintk("%s: allocated xid %u\n", __func__, new->cu_u.cu_msg.cm_xid);
 
 	return new;
 }
@@ -1066,20 +1112,20 @@
 	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
 		return;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_Create;
-	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
-	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+	cup->cu_u.cu_msg.cm_cmd = Cld_Create;
+	cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+	memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
 			clp->cl_name.len);
 
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret) {
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
 	}
 
@@ -1092,6 +1138,75 @@
 
 /* Ask daemon to create a new record */
 static void
+nfsd4_cld_create_v2(struct nfs4_client *clp)
+{
+	int ret;
+	struct cld_upcall *cup;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+	struct cld_msg_v2 *cmsg;
+	struct crypto_shash *tfm = cn->cn_tfm;
+	struct xdr_netobj cksum;
+	char *principal = NULL;
+	SHASH_DESC_ON_STACK(desc, tfm);
+
+	/* Don't upcall if it's already stored */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
+
+	cup = alloc_cld_upcall(nn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	cmsg = &cup->cu_u.cu_msg_v2;
+	cmsg->cm_cmd = Cld_Create;
+	cmsg->cm_u.cm_clntinfo.cc_name.cn_len = clp->cl_name.len;
+	memcpy(cmsg->cm_u.cm_clntinfo.cc_name.cn_id, clp->cl_name.data,
+			clp->cl_name.len);
+	if (clp->cl_cred.cr_raw_principal)
+		principal = clp->cl_cred.cr_raw_principal;
+	else if (clp->cl_cred.cr_principal)
+		principal = clp->cl_cred.cr_principal;
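+	/* if a principal is known, send a SHA-256 digest of it in the record */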
+	if (principal) {
+		desc->tfm = tfm;
+		cksum.len = crypto_shash_digestsize(tfm);
+		cksum.data = kmalloc(cksum.len, GFP_KERNEL);
+		if (cksum.data == NULL) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = crypto_shash_digest(desc, principal, strlen(principal),
+					  cksum.data);
+		shash_desc_zero(desc);
+		if (ret) {
+			kfree(cksum.data);
+			goto out;
+		}
+		cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len;
+		memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data,
+		       cksum.data, cksum.len);
+		kfree(cksum.data);
+	} else {
+		cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0;
+	}
+
+	ret = cld_pipe_upcall(cn->cn_pipe, cmsg);
+	if (!ret) {
+		ret = cmsg->cm_status;
+		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	}
+
+out:
+	free_cld_upcall(cup);
+out_err:
+	if (ret)
+		pr_err("NFSD: Unable to create client record on stable storage: %d\n",
+				ret);
+}
+
+/* Ask daemon to create a new record */
+static void
 nfsd4_cld_remove(struct nfs4_client *clp)
 {
 	int ret;
@@ -1103,20 +1218,20 @@
 	if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
 		return;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_Remove;
-	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
-	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+	cup->cu_u.cu_msg.cm_cmd = Cld_Remove;
+	cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+	memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
 			clp->cl_name.len);
 
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret) {
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 		clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
 	}
 
@@ -1145,21 +1260,21 @@
 	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
 		return 0;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		printk(KERN_ERR "NFSD: Unable to check client record on "
 				"stable storage: %d\n", -ENOMEM);
 		return -ENOMEM;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_Check;
-	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
-	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+	cup->cu_u.cu_msg.cm_cmd = Cld_Check;
+	cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+	memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
 			clp->cl_name.len);
 
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret) {
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
 	}
 
@@ -1217,22 +1332,95 @@
 }
 
 static int
+nfsd4_cld_check_v2(struct nfs4_client *clp)
+{
+	struct nfs4_client_reclaim *crp;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+	int status;
+	char dname[HEXDIR_LEN];
+	struct xdr_netobj name;
+	struct crypto_shash *tfm = cn->cn_tfm;
+	struct xdr_netobj cksum;
+	char *principal = NULL;
+	SHASH_DESC_ON_STACK(desc, tfm);
+
+	/* did we already find that this client is stable? */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
+
+	/* look for it in the reclaim hashtable otherwise */
+	crp = nfsd4_find_reclaim_client(clp->cl_name, nn);
+	if (crp)
+		goto found;
+
+	if (cn->cn_has_legacy) {
+		status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+		if (status)
+			return -ENOENT;
+
+		name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL);
+		if (!name.data) {
+			dprintk("%s: failed to allocate memory for name.data\n",
+					__func__);
+			return -ENOENT;
+		}
+		name.len = HEXDIR_LEN;
+		crp = nfsd4_find_reclaim_client(name, nn);
+		kfree(name.data);
+		if (crp)
+			goto found;
+
+	}
+	return -ENOENT;
+found:
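+	/*
+	 * If the stored record carries a principal hash, the reclaiming
+	 * client must present a principal that digests to the same value.
+	 */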
+	if (crp->cr_princhash.len) {
+		if (clp->cl_cred.cr_raw_principal)
+			principal = clp->cl_cred.cr_raw_principal;
+		else if (clp->cl_cred.cr_principal)
+			principal = clp->cl_cred.cr_principal;
+		if (principal == NULL)
+			return -ENOENT;
+		desc->tfm = tfm;
+		cksum.len = crypto_shash_digestsize(tfm);
+		cksum.data = kmalloc(cksum.len, GFP_KERNEL);
+		if (cksum.data == NULL)
+			return -ENOENT;
+		status = crypto_shash_digest(desc, principal, strlen(principal),
+					     cksum.data);
+		shash_desc_zero(desc);
+		if (status) {
+			kfree(cksum.data);
+			return -ENOENT;
+		}
+		if (memcmp(crp->cr_princhash.data, cksum.data,
+				crp->cr_princhash.len)) {
+			kfree(cksum.data);
+			return -ENOENT;
+		}
+		kfree(cksum.data);
+	}
+	crp->cr_clp = clp;
+	return 0;
+}
+
+static int
 nfsd4_cld_grace_start(struct nfsd_net *nn)
 {
 	int ret;
 	struct cld_upcall *cup;
 	struct cld_net *cn = nn->cld_net;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_GraceStart;
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret)
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 
 	free_cld_upcall(cup);
 out_err:
@@ -1250,17 +1438,17 @@
 	struct cld_upcall *cup;
 	struct cld_net *cn = nn->cld_net;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_GraceDone;
-	cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone;
+	cup->cu_u.cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret)
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 
 	free_cld_upcall(cup);
 out_err:
@@ -1279,16 +1467,16 @@
 	struct cld_upcall *cup;
 	struct cld_net *cn = nn->cld_net;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_GraceDone;
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret)
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 
 	free_cld_upcall(cup);
 out_err:
@@ -1337,6 +1525,53 @@
 }
 
 static int
+nfsd4_cld_get_version(struct nfsd_net *nn)
+{
+	int ret = 0;
+	struct cld_upcall *cup;
+	struct cld_net *cn = nn->cld_net;
+	uint8_t version;
+
+	cup = alloc_cld_upcall(nn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+	cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
+	if (!ret) {
+		ret = cup->cu_u.cu_msg.cm_status;
+		if (ret)
+			goto out_free;
+		version = cup->cu_u.cu_msg.cm_u.cm_version;
+		dprintk("%s: userspace returned version %u\n",
+				__func__, version);
+		if (version < 1)
+			version = 1;
+		else if (version > CLD_UPCALL_VERSION)
+			version = CLD_UPCALL_VERSION;
+
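+		/* clamp to a mutually supported version, then pick the ops */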
+		switch (version) {
+		case 1:
+			nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
+			break;
+		case 2:
+			nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v2;
+			break;
+		default:
+			break;
+		}
+	}
+out_free:
+	free_cld_upcall(cup);
+out_err:
+	if (ret)
+		dprintk("%s: Unable to get version from userspace: %d\n",
+			__func__, ret);
+	return ret;
+}
+
+static int
 nfsd4_cld_tracking_init(struct net *net)
 {
 	int status;
@@ -1351,6 +1586,11 @@
 	status = __nfsd4_init_cld_pipe(net);
 	if (status)
 		goto err_shutdown;
+	nn->cld_net->cn_tfm = crypto_alloc_shash("sha256", 0, 0);
+	if (IS_ERR(nn->cld_net->cn_tfm)) {
+		status = PTR_ERR(nn->cld_net->cn_tfm);
+		goto err_remove;
+	}
 
 	/*
 	 * rpc pipe upcalls take 30 seconds to time out, so we don't want to
@@ -1368,10 +1608,14 @@
 		goto err_remove;
 	}
 
+	status = nfsd4_cld_get_version(nn);
+	if (status == -EOPNOTSUPP)
+		pr_warn("NFSD: nfsdcld GetVersion upcall failed. Please upgrade nfsdcld.\n");
+
 	status = nfsd4_cld_grace_start(nn);
 	if (status) {
 		if (status == -EOPNOTSUPP)
-			printk(KERN_WARNING "NFSD: Please upgrade nfsdcld.\n");
+			pr_warn("NFSD: nfsdcld GraceStart upcall failed. Please upgrade nfsdcld.\n");
 		nfs4_release_reclaim(nn);
 		goto err_remove;
 	} else
@@ -1403,6 +1647,8 @@
 	.remove		= nfsd4_cld_remove,
 	.check		= nfsd4_cld_check_v0,
 	.grace_done	= nfsd4_cld_grace_done_v0,
+	.version	= 1,
+	.msglen		= sizeof(struct cld_msg),
 };
 
 /* For newer nfsdcld's */
@@ -1413,6 +1659,20 @@
 	.remove		= nfsd4_cld_remove,
 	.check		= nfsd4_cld_check,
 	.grace_done	= nfsd4_cld_grace_done,
+	.version	= 1,
+	.msglen		= sizeof(struct cld_msg),
+};
+
+/* v2 create/check ops include the principal, if available */
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = {
+	.init		= nfsd4_cld_tracking_init,
+	.exit		= nfsd4_cld_tracking_exit,
+	.create		= nfsd4_cld_create_v2,
+	.remove		= nfsd4_cld_remove,
+	.check		= nfsd4_cld_check_v2,
+	.grace_done	= nfsd4_cld_grace_done,
+	.version	= 2,
+	.msglen		= sizeof(struct cld_msg_v2),
 };
 
 /* upcall via usermodehelper */
@@ -1760,6 +2020,8 @@
 	.remove		= nfsd4_umh_cltrack_remove,
 	.check		= nfsd4_umh_cltrack_check,
 	.grace_done	= nfsd4_umh_cltrack_grace_done,
+	.version	= 1,
+	.msglen		= 0,
 };
 
 int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7857942..c65aeaa 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -50,6 +50,7 @@
 
 #include "netns.h"
 #include "pnfs.h"
+#include "filecache.h"
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -429,18 +430,18 @@
 	}
 }
 
-static struct file *
+static struct nfsd_file *
 __nfs4_get_fd(struct nfs4_file *f, int oflag)
 {
 	if (f->fi_fds[oflag])
-		return get_file(f->fi_fds[oflag]);
+		return nfsd_file_get(f->fi_fds[oflag]);
 	return NULL;
 }
 
-static struct file *
+static struct nfsd_file *
 find_writeable_file_locked(struct nfs4_file *f)
 {
-	struct file *ret;
+	struct nfsd_file *ret;
 
 	lockdep_assert_held(&f->fi_lock);
 
@@ -450,10 +451,10 @@
 	return ret;
 }
 
-static struct file *
+static struct nfsd_file *
 find_writeable_file(struct nfs4_file *f)
 {
-	struct file *ret;
+	struct nfsd_file *ret;
 
 	spin_lock(&f->fi_lock);
 	ret = find_writeable_file_locked(f);
@@ -462,9 +463,10 @@
 	return ret;
 }
 
-static struct file *find_readable_file_locked(struct nfs4_file *f)
+static struct nfsd_file *
+find_readable_file_locked(struct nfs4_file *f)
 {
-	struct file *ret;
+	struct nfsd_file *ret;
 
 	lockdep_assert_held(&f->fi_lock);
 
@@ -474,10 +476,10 @@
 	return ret;
 }
 
-static struct file *
+static struct nfsd_file *
 find_readable_file(struct nfs4_file *f)
 {
-	struct file *ret;
+	struct nfsd_file *ret;
 
 	spin_lock(&f->fi_lock);
 	ret = find_readable_file_locked(f);
@@ -486,10 +488,10 @@
 	return ret;
 }
 
-struct file *
+struct nfsd_file *
 find_any_file(struct nfs4_file *f)
 {
-	struct file *ret;
+	struct nfsd_file *ret;
 
 	spin_lock(&f->fi_lock);
 	ret = __nfs4_get_fd(f, O_RDWR);
@@ -590,17 +592,17 @@
 	might_lock(&fp->fi_lock);
 
 	if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) {
-		struct file *f1 = NULL;
-		struct file *f2 = NULL;
+		struct nfsd_file *f1 = NULL;
+		struct nfsd_file *f2 = NULL;
 
 		swap(f1, fp->fi_fds[oflag]);
 		if (atomic_read(&fp->fi_access[1 - oflag]) == 0)
 			swap(f2, fp->fi_fds[O_RDWR]);
 		spin_unlock(&fp->fi_lock);
 		if (f1)
-			fput(f1);
+			nfsd_file_put(f1);
 		if (f2)
-			fput(f2);
+			nfsd_file_put(f2);
 	}
 }
 
@@ -933,25 +935,25 @@
 
 static void put_deleg_file(struct nfs4_file *fp)
 {
-	struct file *filp = NULL;
+	struct nfsd_file *nf = NULL;
 
 	spin_lock(&fp->fi_lock);
 	if (--fp->fi_delegees == 0)
-		swap(filp, fp->fi_deleg_file);
+		swap(nf, fp->fi_deleg_file);
 	spin_unlock(&fp->fi_lock);
 
-	if (filp)
-		fput(filp);
+	if (nf)
+		nfsd_file_put(nf);
 }
 
 static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp)
 {
 	struct nfs4_file *fp = dp->dl_stid.sc_file;
-	struct file *filp = fp->fi_deleg_file;
+	struct nfsd_file *nf = fp->fi_deleg_file;
 
 	WARN_ON_ONCE(!fp->fi_delegees);
 
-	vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp);
+	vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp);
 	put_deleg_file(fp);
 }
 
@@ -1289,11 +1291,14 @@
 {
 	struct nfs4_ol_stateid *stp = openlockstateid(stid);
 	struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
-	struct file *file;
+	struct nfsd_file *nf;
 
-	file = find_any_file(stp->st_stid.sc_file);
-	if (file)
-		filp_close(file, (fl_owner_t)lo);
+	nf = find_any_file(stp->st_stid.sc_file);
+	if (nf) {
+		get_file(nf->nf_file);
+		filp_close(nf->nf_file, (fl_owner_t)lo);
+		nfsd_file_put(nf);
+	}
 	nfs4_free_ol_stateid(stid);
 }
 
@@ -1563,21 +1568,39 @@
  * re-negotiate active sessions and reduce their slot usage to make
  * room for new connections. For now we just fail the create session.
  */
-static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
+static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn)
 {
 	u32 slotsize = slot_bytes(ca);
 	u32 num = ca->maxreqs;
 	unsigned long avail, total_avail;
+	unsigned int scale_factor;
 
 	spin_lock(&nfsd_drc_lock);
-	total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used;
+	if (nfsd_drc_max_mem > nfsd_drc_mem_used)
+		total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used;
+	else
+		/* We have handed out more space than we chose in
+		 * set_max_drc() to allow.  That isn't really a
+		 * problem as long as that doesn't make us think we
+		 * have lots more due to integer overflow.
+		 */
+		total_avail = 0;
 	avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail);
 	/*
-	 * Never use more than a third of the remaining memory,
-	 * unless it's the only way to give this client a slot:
+	 * Never use more than a fraction of the remaining memory,
+	 * unless it's the only way to give this client a slot.
+	 * The chosen fraction is either 1/8 or 1/number of threads,
+	 * whichever is smaller.  This ensures there are adequate
+	 * slots to support multiple clients per thread.
+	 * Give the client one slot even if that would require
+	 * over-allocation--it is better than failure.
 	 */
-	avail = clamp_t(unsigned long, avail, slotsize, total_avail/3);
+	scale_factor = max_t(unsigned int, 8, nn->nfsd_serv->sv_nrthreads);
+
+	avail = clamp_t(unsigned long, avail, slotsize,
+			total_avail/scale_factor);
 	num = min_t(int, num, avail / slotsize);
+	num = max_t(int, num, 1);
 	nfsd_drc_mem_used += num * slotsize;
 	spin_unlock(&nfsd_drc_lock);
 
@@ -2323,9 +2346,9 @@
 	spin_unlock(&clp->cl_lock);
 }
 
-static void nfs4_show_superblock(struct seq_file *s, struct file *f)
+static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f)
 {
-	struct inode *inode = file_inode(f);
+	struct inode *inode = f->nf_inode;
 
 	seq_printf(s, "superblock: \"%02x:%02x:%ld\"",
 					MAJOR(inode->i_sb->s_dev),
@@ -2343,7 +2366,7 @@
 {
 	struct nfs4_ol_stateid *ols;
 	struct nfs4_file *nf;
-	struct file *file;
+	struct nfsd_file *file;
 	struct nfs4_stateowner *oo;
 	unsigned int access, deny;
 
@@ -2370,7 +2393,7 @@
 	seq_printf(s, ", ");
 	nfs4_show_owner(s, oo);
 	seq_printf(s, " }\n");
-	fput(file);
+	nfsd_file_put(file);
 
 	return 0;
 }
@@ -2379,7 +2402,7 @@
 {
 	struct nfs4_ol_stateid *ols;
 	struct nfs4_file *nf;
-	struct file *file;
+	struct nfsd_file *file;
 	struct nfs4_stateowner *oo;
 
 	ols = openlockstateid(st);
@@ -2401,7 +2424,7 @@
 	seq_printf(s, ", ");
 	nfs4_show_owner(s, oo);
 	seq_printf(s, " }\n");
-	fput(file);
+	nfsd_file_put(file);
 
 	return 0;
 }
@@ -2410,7 +2433,7 @@
 {
 	struct nfs4_delegation *ds;
 	struct nfs4_file *nf;
-	struct file *file;
+	struct nfsd_file *file;
 
 	ds = delegstateid(st);
 	nf = st->sc_file;
@@ -2433,7 +2456,7 @@
 static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st)
 {
 	struct nfs4_layout_stateid *ls;
-	struct file *file;
+	struct nfsd_file *file;
 
 	ls = container_of(st, struct nfs4_layout_stateid, ls_stid);
 	file = ls->ls_file;
@@ -3169,10 +3192,10 @@
 	 * performance.  When short on memory we therefore prefer to
 	 * decrease number of slots instead of their size.  Clients that
 	 * request larger slots than they need will get poor results:
+	 * Note that we always allow at least one slot, because our
+	 * accounting is soft and provides no guarantees either way.
 	 */
-	ca->maxreqs = nfsd4_get_drc_mem(ca);
-	if (!ca->maxreqs)
-		return nfserr_jukebox;
+	ca->maxreqs = nfsd4_get_drc_mem(ca, nn);
 
 	return nfs_ok;
 }
@@ -4651,7 +4674,7 @@
 		struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
 		struct nfsd4_open *open)
 {
-	struct file *filp = NULL;
+	struct nfsd_file *nf = NULL;
 	__be32 status;
 	int oflag = nfs4_access_to_omode(open->op_share_access);
 	int access = nfs4_access_to_access(open->op_share_access);
@@ -4687,18 +4710,18 @@
 
 	if (!fp->fi_fds[oflag]) {
 		spin_unlock(&fp->fi_lock);
-		status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp);
+		status = nfsd_file_acquire(rqstp, cur_fh, access, &nf);
 		if (status)
 			goto out_put_access;
 		spin_lock(&fp->fi_lock);
 		if (!fp->fi_fds[oflag]) {
-			fp->fi_fds[oflag] = filp;
-			filp = NULL;
+			fp->fi_fds[oflag] = nf;
+			nf = NULL;
 		}
 	}
 	spin_unlock(&fp->fi_lock);
-	if (filp)
-		fput(filp);
+	if (nf)
+		nfsd_file_put(nf);
 
 	status = nfsd4_truncate(rqstp, cur_fh, open);
 	if (status)
@@ -4767,7 +4790,7 @@
 	fl->fl_end = OFFSET_MAX;
 	fl->fl_owner = (fl_owner_t)dp;
 	fl->fl_pid = current->tgid;
-	fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file;
+	fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file;
 	return fl;
 }
 
@@ -4777,7 +4800,7 @@
 {
 	int status = 0;
 	struct nfs4_delegation *dp;
-	struct file *filp;
+	struct nfsd_file *nf;
 	struct file_lock *fl;
 
 	/*
@@ -4788,8 +4811,8 @@
 	if (fp->fi_had_conflict)
 		return ERR_PTR(-EAGAIN);
 
-	filp = find_readable_file(fp);
-	if (!filp) {
+	nf = find_readable_file(fp);
+	if (!nf) {
 		/* We should always have a readable file here */
 		WARN_ON_ONCE(1);
 		return ERR_PTR(-EBADF);
@@ -4799,17 +4822,17 @@
 	if (nfs4_delegation_exists(clp, fp))
 		status = -EAGAIN;
 	else if (!fp->fi_deleg_file) {
-		fp->fi_deleg_file = filp;
+		fp->fi_deleg_file = nf;
 		/* increment early to prevent fi_deleg_file from being
 		 * cleared */
 		fp->fi_delegees = 1;
-		filp = NULL;
+		nf = NULL;
 	} else
 		fp->fi_delegees++;
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&state_lock);
-	if (filp)
-		fput(filp);
+	if (nf)
+		nfsd_file_put(nf);
 	if (status)
 		return ERR_PTR(status);
 
@@ -4822,7 +4845,7 @@
 	if (!fl)
 		goto out_clnt_odstate;
 
-	status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL);
+	status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL);
 	if (fl)
 		locks_free_lock(fl);
 	if (status)
@@ -4842,7 +4865,7 @@
 
 	return dp;
 out_unlock:
-	vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp);
+	vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
 out_clnt_odstate:
 	put_clnt_odstate(dp->dl_clnt_odstate);
 	nfs4_put_stid(&dp->dl_stid);
@@ -5513,7 +5536,7 @@
 	return nfs_ok;
 }
 
-static struct file *
+static struct nfsd_file *
 nfs4_find_file(struct nfs4_stid *s, int flags)
 {
 	if (!s)
@@ -5523,7 +5546,7 @@
 	case NFS4_DELEG_STID:
 		if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file))
 			return NULL;
-		return get_file(s->sc_file->fi_deleg_file);
+		return nfsd_file_get(s->sc_file->fi_deleg_file);
 	case NFS4_OPEN_STID:
 	case NFS4_LOCK_STID:
 		if (flags & RD_STATE)
@@ -5549,32 +5572,28 @@
 
 static __be32
 nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s,
-		struct file **filpp, bool *tmp_file, int flags)
+		struct nfsd_file **nfp, int flags)
 {
 	int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE;
-	struct file *file;
+	struct nfsd_file *nf;
 	__be32 status;
 
-	file = nfs4_find_file(s, flags);
-	if (file) {
+	nf = nfs4_find_file(s, flags);
+	if (nf) {
 		status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
 				acc | NFSD_MAY_OWNER_OVERRIDE);
 		if (status) {
-			fput(file);
-			return status;
+			nfsd_file_put(nf);
+			goto out;
 		}
-
-		*filpp = file;
 	} else {
-		status = nfsd_open(rqstp, fhp, S_IFREG, acc, filpp);
+		status = nfsd_file_acquire(rqstp, fhp, acc, &nf);
 		if (status)
 			return status;
-
-		if (tmp_file)
-			*tmp_file = true;
 	}
-
-	return 0;
+	*nfp = nf;
+out:
+	return status;
 }
 
 /*
@@ -5583,7 +5602,7 @@
 __be32
 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
-		stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file)
+		stateid_t *stateid, int flags, struct nfsd_file **nfp)
 {
 	struct inode *ino = d_inode(fhp->fh_dentry);
 	struct net *net = SVC_NET(rqstp);
@@ -5591,10 +5610,8 @@
 	struct nfs4_stid *s = NULL;
 	__be32 status;
 
-	if (filpp)
-		*filpp = NULL;
-	if (tmp_file)
-		*tmp_file = false;
+	if (nfp)
+		*nfp = NULL;
 
 	if (grace_disallows_io(net, ino))
 		return nfserr_grace;
@@ -5631,8 +5648,8 @@
 	status = nfs4_check_fh(fhp, s);
 
 done:
-	if (!status && filpp)
-		status = nfs4_check_file(rqstp, fhp, s, filpp, tmp_file, flags);
+	if (status == nfs_ok && nfp)
+		status = nfs4_check_file(rqstp, fhp, s, nfp, flags);
 out:
 	if (s)
 		nfs4_put_stid(s);
@@ -6392,7 +6409,7 @@
 	struct nfs4_ol_stateid *lock_stp = NULL;
 	struct nfs4_ol_stateid *open_stp = NULL;
 	struct nfs4_file *fp;
-	struct file *filp = NULL;
+	struct nfsd_file *nf = NULL;
 	struct nfsd4_blocked_lock *nbl = NULL;
 	struct file_lock *file_lock = NULL;
 	struct file_lock *conflock = NULL;
@@ -6474,8 +6491,8 @@
 			/* Fallthrough */
 		case NFS4_READ_LT:
 			spin_lock(&fp->fi_lock);
-			filp = find_readable_file_locked(fp);
-			if (filp)
+			nf = find_readable_file_locked(fp);
+			if (nf)
 				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
 			spin_unlock(&fp->fi_lock);
 			fl_type = F_RDLCK;
@@ -6486,8 +6503,8 @@
 			/* Fallthrough */
 		case NFS4_WRITE_LT:
 			spin_lock(&fp->fi_lock);
-			filp = find_writeable_file_locked(fp);
-			if (filp)
+			nf = find_writeable_file_locked(fp);
+			if (nf)
 				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
 			spin_unlock(&fp->fi_lock);
 			fl_type = F_WRLCK;
@@ -6497,7 +6514,7 @@
 		goto out;
 	}
 
-	if (!filp) {
+	if (!nf) {
 		status = nfserr_openmode;
 		goto out;
 	}
@@ -6513,7 +6530,7 @@
 	file_lock->fl_type = fl_type;
 	file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
 	file_lock->fl_pid = current->tgid;
-	file_lock->fl_file = filp;
+	file_lock->fl_file = nf->nf_file;
 	file_lock->fl_flags = fl_flags;
 	file_lock->fl_lmops = &nfsd_posix_mng_ops;
 	file_lock->fl_start = lock->lk_offset;
@@ -6535,7 +6552,7 @@
 		spin_unlock(&nn->blocked_locks_lock);
 	}
 
-	err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
+	err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, conflock);
 	switch (err) {
 	case 0: /* success! */
 		nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
@@ -6570,8 +6587,8 @@
 		}
 		free_blocked_lock(nbl);
 	}
-	if (filp)
-		fput(filp);
+	if (nf)
+		nfsd_file_put(nf);
 	if (lock_stp) {
 		/* Bump seqid manually if the 4.0 replay owner is openowner */
 		if (cstate->replay_owner &&
@@ -6606,11 +6623,11 @@
  */
 static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
 {
-	struct file *file;
-	__be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+	struct nfsd_file *nf;
+	__be32 err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
 	if (!err) {
-		err = nfserrno(vfs_test_lock(file, lock));
-		fput(file);
+		err = nfserrno(vfs_test_lock(nf->nf_file, lock));
+		nfsd_file_put(nf);
 	}
 	return err;
 }
@@ -6698,7 +6715,7 @@
 {
 	struct nfsd4_locku *locku = &u->locku;
 	struct nfs4_ol_stateid *stp;
-	struct file *filp = NULL;
+	struct nfsd_file *nf = NULL;
 	struct file_lock *file_lock = NULL;
 	__be32 status;
 	int err;
@@ -6716,8 +6733,8 @@
 					&stp, nn);
 	if (status)
 		goto out;
-	filp = find_any_file(stp->st_stid.sc_file);
-	if (!filp) {
+	nf = find_any_file(stp->st_stid.sc_file);
+	if (!nf) {
 		status = nfserr_lock_range;
 		goto put_stateid;
 	}
@@ -6725,13 +6742,13 @@
 	if (!file_lock) {
 		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
 		status = nfserr_jukebox;
-		goto fput;
+		goto put_file;
 	}
 
 	file_lock->fl_type = F_UNLCK;
 	file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner));
 	file_lock->fl_pid = current->tgid;
-	file_lock->fl_file = filp;
+	file_lock->fl_file = nf->nf_file;
 	file_lock->fl_flags = FL_POSIX;
 	file_lock->fl_lmops = &nfsd_posix_mng_ops;
 	file_lock->fl_start = locku->lu_offset;
@@ -6740,14 +6757,14 @@
 						locku->lu_length);
 	nfs4_transform_lock_offset(file_lock);
 
-	err = vfs_lock_file(filp, F_SETLK, file_lock, NULL);
+	err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, NULL);
 	if (err) {
 		dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
 		goto out_nfserr;
 	}
 	nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid);
-fput:
-	fput(filp);
+put_file:
+	nfsd_file_put(nf);
 put_stateid:
 	mutex_unlock(&stp->st_mutex);
 	nfs4_put_stid(&stp->st_stid);
@@ -6759,7 +6776,7 @@
 
 out_nfserr:
 	status = nfserrno(err);
-	goto fput;
+	goto put_file;
 }
 
 /*
@@ -6772,17 +6789,17 @@
 {
 	struct file_lock *fl;
 	int status = false;
-	struct file *filp = find_any_file(fp);
+	struct nfsd_file *nf = find_any_file(fp);
 	struct inode *inode;
 	struct file_lock_context *flctx;
 
-	if (!filp) {
+	if (!nf) {
 		/* Any valid lock stateid should have some sort of access */
 		WARN_ON_ONCE(1);
 		return status;
 	}
 
-	inode = locks_inode(filp);
+	inode = locks_inode(nf->nf_file);
 	flctx = inode->i_flctx;
 
 	if (flctx && !list_empty_careful(&flctx->flc_posix)) {
@@ -6795,7 +6812,7 @@
 		}
 		spin_unlock(&flctx->flc_lock);
 	}
-	fput(filp);
+	nfsd_file_put(nf);
 	return status;
 }
 
@@ -6888,7 +6905,8 @@
  * will be freed in nfs4_remove_reclaim_record in the normal case).
  */
 struct nfs4_client_reclaim *
-nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn)
+nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash,
+		struct nfsd_net *nn)
 {
 	unsigned int strhashval;
 	struct nfs4_client_reclaim *crp;
@@ -6901,6 +6919,8 @@
 		list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]);
 		crp->cr_name.data = name.data;
 		crp->cr_name.len = name.len;
+		crp->cr_princhash.data = princhash.data;
+		crp->cr_princhash.len = princhash.len;
 		crp->cr_clp = NULL;
 		nn->reclaim_str_hashtbl_size++;
 	}
@@ -6912,6 +6932,7 @@
 {
 	list_del(&crp->cr_strhash);
 	kfree(crp->cr_name.data);
+	kfree(crp->cr_princhash.data);
 	kfree(crp);
 	nn->reclaim_str_hashtbl_size--;
 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 4428118..533d0fc 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -49,6 +49,7 @@
 #include "cache.h"
 #include "netns.h"
 #include "pnfs.h"
+#include "filecache.h"
 
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #include <linux/security.h>
@@ -203,6 +204,13 @@
 	return p;
 }
 
+static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp)
+{
+	unsigned int this = (char *)argp->end - (char *)argp->p;
+
+	return this + argp->pagelen;
+}
+
 static int zero_clientid(clientid_t *clid)
 {
 	return (clid->cl_boot == 0) && (clid->cl_id == 0);
@@ -211,10 +219,10 @@
 /**
  * svcxdr_tmpalloc - allocate memory to be freed after compound processing
  * @argp: NFSv4 compound argument structure
- * @p: pointer to be freed (with kfree())
+ * @len: length of buffer to allocate
  *
- * Marks @p to be freed when processing the compound operation
- * described in @argp finishes.
+ * Allocates a buffer of size @len to be freed when processing the compound
+ * operation described in @argp finishes.
  */
 static void *
 svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len)
@@ -347,7 +355,12 @@
 		READ_BUF(4); len += 4;
 		nace = be32_to_cpup(p++);
 
-		if (nace > NFS4_ACL_MAX)
+		if (nace > compoundargs_bytes_left(argp)/20)
+			/*
+			 * Even with 4-byte names there wouldn't be
+			 * space for that many aces; something fishy is
+			 * going on:
+			 */
 			return nfserr_fbig;
 
 		*acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace));
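The divisor of 20 in the check above corresponds to the smallest possible
on-the-wire ACE: four 4-byte words (type, flag, access mask, name length)
plus a minimum 4-byte name. A trivial userspace restatement, with a made-up
buffer size:

    #include <stdio.h>

    /* Smallest NFSv4 ACE on the wire: type + flag + mask + namelen
     * (4 bytes each) plus a 4-byte name, hence the divisor of 20. */
    #define ACE_MIN_WIRE_BYTES 20

    int main(void)
    {
        unsigned int bytes_left = 1024; /* whatever remains in the buffer */

        printf("at most %u aces fit in %u bytes\n",
               bytes_left / ACE_MIN_WIRE_BYTES, bytes_left);
        return 0;
    }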
@@ -1418,7 +1431,6 @@
 			    struct nfsd4_create_session *sess)
 {
 	DECODE_HEAD;
-	u32 dummy;
 
 	READ_BUF(16);
 	COPYMEM(&sess->clientid, 8);
@@ -1427,7 +1439,7 @@
 
 	/* Fore channel attrs */
 	READ_BUF(28);
-	dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */
+	p++; /* headerpadsz is always 0 */
 	sess->fore_channel.maxreq_sz = be32_to_cpup(p++);
 	sess->fore_channel.maxresp_sz = be32_to_cpup(p++);
 	sess->fore_channel.maxresp_cached = be32_to_cpup(p++);
@@ -1444,7 +1456,7 @@
 
 	/* Back channel attrs */
 	READ_BUF(28);
-	dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */
+	p++; /* headerpadsz is always 0 */
 	sess->back_channel.maxreq_sz = be32_to_cpup(p++);
 	sess->back_channel.maxresp_sz = be32_to_cpup(p++);
 	sess->back_channel.maxresp_cached = be32_to_cpup(p++);
@@ -1736,7 +1748,6 @@
 nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
 {
 	DECODE_HEAD;
-	unsigned int tmp;
 
 	status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid);
 	if (status)
@@ -1751,7 +1762,7 @@
 	p = xdr_decode_hyper(p, &copy->cp_count);
 	p++; /* ca_consecutive: we always do consecutive copies */
 	copy->cp_synchronous = be32_to_cpup(p++);
-	tmp = be32_to_cpup(p); /* Source server list not supported */
+	/* tmp = be32_to_cpup(p); Source server list not supported */
 
 	DECODE_TAIL;
 }
@@ -3217,9 +3228,8 @@
 	if (!p)
 		return nfserr_resource;
 	encode_cinfo(p, &create->cr_cinfo);
-	nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
+	return nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
 			create->cr_bmval[1], create->cr_bmval[2]);
-	return 0;
 }
 
 static __be32
@@ -3462,7 +3472,7 @@
 
 	len = maxcount;
 	nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp,
-				  file, read->rd_offset, &maxcount);
+				  file, read->rd_offset, &maxcount, &eof);
 	read->rd_length = maxcount;
 	if (nfserr) {
 		/*
@@ -3474,9 +3484,6 @@
 		return nfserr;
 	}
 
-	eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
-				d_inode(read->rd_fhp->fh_dentry)->i_size);
-
 	*(p++) = htonl(eof);
 	*(p++) = htonl(maxcount);
 
@@ -3547,15 +3554,13 @@
 
 	len = maxcount;
 	nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
-			    resp->rqstp->rq_vec, read->rd_vlen, &maxcount);
+			    resp->rqstp->rq_vec, read->rd_vlen, &maxcount,
+			    &eof);
 	read->rd_length = maxcount;
 	if (nfserr)
 		return nfserr;
 	xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
 
-	eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
-				d_inode(read->rd_fhp->fh_dentry)->i_size);
-
 	tmp = htonl(eof);
 	write_bytes_to_xdr_buf(xdr->buf, starting_len    , &tmp, 4);
 	tmp = htonl(maxcount);
@@ -3574,11 +3579,14 @@
 {
 	unsigned long maxcount;
 	struct xdr_stream *xdr = &resp->xdr;
-	struct file *file = read->rd_filp;
+	struct file *file;
 	int starting_len = xdr->buf->len;
-	struct raparms *ra = NULL;
 	__be32 *p;
 
+	if (nfserr)
+		return nfserr;
+	file = read->rd_nf->nf_file;
+
 	p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
 	if (!p) {
 		WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags));
@@ -3596,18 +3604,12 @@
 			 (xdr->buf->buflen - xdr->buf->len));
 	maxcount = min_t(unsigned long, maxcount, read->rd_length);
 
-	if (read->rd_tmp_file)
-		ra = nfsd_init_raparms(file);
-
 	if (file->f_op->splice_read &&
 	    test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
 		nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
 	else
 		nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
 
-	if (ra)
-		nfsd_put_raparams(file, ra);
-
 	if (nfserr)
 		xdr_truncate_encode(xdr, starting_len);
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 2c21517..11b42c5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1476,6 +1476,7 @@
 
 	atomic_set(&nn->ntf_refcnt, 0);
 	init_waitqueue_head(&nn->ntf_wq);
+	seqlock_init(&nn->boot_lock);
 
 	mnt =  vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL);
 	if (IS_ERR(mnt)) {
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0d20fd1..c83ddac 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -172,6 +172,7 @@
 	struct nfsd_readargs *argp = rqstp->rq_argp;
 	struct nfsd_readres *resp = rqstp->rq_resp;
 	__be32	nfserr;
+	u32 eof;
 
 	dprintk("nfsd: READ    %s %d bytes at %d\n",
 		SVCFH_fmt(&argp->fh),
@@ -195,7 +196,8 @@
 	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
 				  argp->offset,
 			   	  rqstp->rq_vec, argp->vlen,
-				  &resp->count);
+				  &resp->count,
+				  &eof);
 
 	if (nfserr) return nfserr;
 	return fh_getattr(&resp->fh, &resp->stat);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 18d94ea..fdf7ed4 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -27,6 +27,7 @@
 #include "cache.h"
 #include "vfs.h"
 #include "netns.h"
+#include "filecache.h"
 
 #define NFSDDBG_FACILITY	NFSDDBG_SVC
 
@@ -313,22 +314,17 @@
 	if (nfsd_users++)
 		return 0;
 
-	/*
-	 * Readahead param cache - will no-op if it already exists.
-	 * (Note therefore results will be suboptimal if number of
-	 * threads is modified after nfsd start.)
-	 */
-	ret = nfsd_racache_init(2*nrservs);
+	ret = nfsd_file_cache_init();
 	if (ret)
 		goto dec_users;
 
 	ret = nfs4_state_start();
 	if (ret)
-		goto out_racache;
+		goto out_file_cache;
 	return 0;
 
-out_racache:
-	nfsd_racache_shutdown();
+out_file_cache:
+	nfsd_file_cache_shutdown();
 dec_users:
 	nfsd_users--;
 	return ret;
@@ -340,7 +336,7 @@
 		return;
 
 	nfs4_state_shutdown();
-	nfsd_racache_shutdown();
+	nfsd_file_cache_shutdown();
 }
 
 static bool nfsd_needs_lockd(struct nfsd_net *nn)
@@ -348,6 +344,35 @@
 	return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST);
 }
 
+void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn)
+{
+	int seq = 0;
+
+	do {
+		read_seqbegin_or_lock(&nn->boot_lock, &seq);
+		/*
+		 * This is opaque to the client, so there is no need to
+		 * byte-swap. Use __force to keep sparse happy. The y2038
+		 * time_t overflow is irrelevant in this usage.
+		 */
+		verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
+		verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
+	} while (need_seqretry(&nn->boot_lock, seq));
+	done_seqretry(&nn->boot_lock, seq);
+}
+
+static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn)
+{
+	ktime_get_real_ts64(&nn->nfssvc_boot);
+}
+
+void nfsd_reset_boot_verifier(struct nfsd_net *nn)
+{
+	write_seqlock(&nn->boot_lock);
+	nfsd_reset_boot_verifier_locked(nn);
+	write_sequnlock(&nn->boot_lock);
+}
+
 static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -391,6 +416,7 @@
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
+	nfsd_file_cache_purge(net);
 	nfs4_state_shutdown_net(net);
 	if (nn->lockd_up) {
 		lockd_down(net);
@@ -599,7 +625,7 @@
 #endif
 	}
 	atomic_inc(&nn->ntf_refcnt);
-	ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */
+	nfsd_reset_boot_verifier(nn);
 	return 0;
 }
 
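The boot-verifier helpers added above follow the standard seqlock retry
pattern: readers take no lock on the fast path and retry (falling back to
the lock) if a writer raced with them, while writers serialize through
write_seqlock(). A sketch of the same pattern for a generic two-word value
(the struct and function names here are hypothetical):

    #include <linux/seqlock.h>

    struct two_words {
        seqlock_t lock;
        u32 val[2];
    };

    /* Reader: lockless first pass; retries under the lock if needed. */
    static void two_words_read(struct two_words *w, u32 out[2])
    {
        int seq = 0;

        do {
            read_seqbegin_or_lock(&w->lock, &seq);
            out[0] = w->val[0];
            out[1] = w->val[1];
        } while (need_seqretry(&w->lock, seq));
        done_seqretry(&w->lock, seq);
    }

    /* Writer: excludes other writers and bumps the sequence count so
     * concurrent readers notice the update and retry. */
    static void two_words_write(struct two_words *w, u32 a, u32 b)
    {
        write_seqlock(&w->lock);
        w->val[0] = a;
        w->val[1] = b;
        write_sequnlock(&w->lock);
    }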
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 5dbd169..46f56af 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -378,6 +378,7 @@
 	struct list_head	cr_strhash;	/* hash by cr_name */
 	struct nfs4_client	*cr_clp;	/* pointer to associated clp */
 	struct xdr_netobj	cr_name;	/* recovery dir name */
+	struct xdr_netobj	cr_princhash;
 };
 
 /* A reasonable value for REPLAY_ISIZE was estimated as follows:  
@@ -506,7 +507,7 @@
 	};
 	struct list_head	fi_clnt_odstate;
 	/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
-	struct file *		fi_fds[3];
+	struct nfsd_file	*fi_fds[3];
 	/*
 	 * Each open or lock stateid contributes 0-4 to the counts
 	 * below depending on which bits are set in st_access_bitmap:
@@ -516,7 +517,7 @@
 	 */
 	atomic_t		fi_access[2];
 	u32			fi_share_deny;
-	struct file		*fi_deleg_file;
+	struct nfsd_file	*fi_deleg_file;
 	int			fi_delegees;
 	struct knfsd_fh		fi_fhandle;
 	bool			fi_had_conflict;
@@ -565,7 +566,7 @@
 	spinlock_t			ls_lock;
 	struct list_head		ls_layouts;
 	u32				ls_layout_type;
-	struct file			*ls_file;
+	struct nfsd_file		*ls_file;
 	struct nfsd4_callback		ls_recall;
 	stateid_t			ls_recall_sid;
 	bool				ls_recalled;
@@ -616,7 +617,7 @@
 
 extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
-		stateid_t *stateid, int flags, struct file **filp, bool *tmp_file);
+		stateid_t *stateid, int flags, struct nfsd_file **filp);
 __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 		     stateid_t *stateid, unsigned char typemask,
 		     struct nfs4_stid **s, struct nfsd_net *nn);
@@ -645,7 +646,7 @@
 extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
 extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
 extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
-							struct nfsd_net *nn);
+				struct xdr_netobj princhash, struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
 
 struct nfs4_file *find_file(struct knfsd_fh *fh);
@@ -657,7 +658,7 @@
 {
 	refcount_inc(&fi->fi_ref);
 }
-struct file *find_any_file(struct nfs4_file *f);
+struct nfsd_file *find_any_file(struct nfs4_file *f);
 
 /* grace period management */
 void nfsd4_end_grace(struct nfsd_net *nn);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 80933e4..ffc78a0 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -126,6 +126,8 @@
 DEFINE_NFSD_ERR_EVENT(write_err);
 
 #include "state.h"
+#include "filecache.h"
+#include "vfs.h"
 
 DECLARE_EVENT_CLASS(nfsd_stateid_class,
 	TP_PROTO(stateid_t *stp),
@@ -164,6 +166,144 @@
 DEFINE_STATEID_EVENT(layout_recall_fail);
 DEFINE_STATEID_EVENT(layout_recall_release);
 
+#define show_nf_flags(val)						\
+	__print_flags(val, "|",						\
+		{ 1 << NFSD_FILE_HASHED,	"HASHED" },		\
+		{ 1 << NFSD_FILE_PENDING,	"PENDING" },		\
+		{ 1 << NFSD_FILE_BREAK_READ,	"BREAK_READ" },		\
+		{ 1 << NFSD_FILE_BREAK_WRITE,	"BREAK_WRITE" },	\
+		{ 1 << NFSD_FILE_REFERENCED,	"REFERENCED"})
+
+/* FIXME: This should probably be fleshed out in the future. */
+#define show_nf_may(val)						\
+	__print_flags(val, "|",						\
+		{ NFSD_MAY_READ,		"READ" },		\
+		{ NFSD_MAY_WRITE,		"WRITE" },		\
+		{ NFSD_MAY_NOT_BREAK_LEASE,	"NOT_BREAK_LEASE" })
+
+DECLARE_EVENT_CLASS(nfsd_file_class,
+	TP_PROTO(struct nfsd_file *nf),
+	TP_ARGS(nf),
+	TP_STRUCT__entry(
+		__field(unsigned int, nf_hashval)
+		__field(void *, nf_inode)
+		__field(int, nf_ref)
+		__field(unsigned long, nf_flags)
+		__field(unsigned char, nf_may)
+		__field(struct file *, nf_file)
+	),
+	TP_fast_assign(
+		__entry->nf_hashval = nf->nf_hashval;
+		__entry->nf_inode = nf->nf_inode;
+		__entry->nf_ref = atomic_read(&nf->nf_ref);
+		__entry->nf_flags = nf->nf_flags;
+		__entry->nf_may = nf->nf_may;
+		__entry->nf_file = nf->nf_file;
+	),
+	TP_printk("hash=0x%x inode=0x%p ref=%d flags=%s may=%s file=%p",
+		__entry->nf_hashval,
+		__entry->nf_inode,
+		__entry->nf_ref,
+		show_nf_flags(__entry->nf_flags),
+		show_nf_may(__entry->nf_may),
+		__entry->nf_file)
+)
+
+#define DEFINE_NFSD_FILE_EVENT(name) \
+DEFINE_EVENT(nfsd_file_class, name, \
+	TP_PROTO(struct nfsd_file *nf), \
+	TP_ARGS(nf))
+
+DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked);
+
+TRACE_EVENT(nfsd_file_acquire,
+	TP_PROTO(struct svc_rqst *rqstp, unsigned int hash,
+		 struct inode *inode, unsigned int may_flags,
+		 struct nfsd_file *nf, __be32 status),
+
+	TP_ARGS(rqstp, hash, inode, may_flags, nf, status),
+
+	TP_STRUCT__entry(
+		__field(__be32, xid)
+		__field(unsigned int, hash)
+		__field(void *, inode)
+		__field(unsigned int, may_flags)
+		__field(int, nf_ref)
+		__field(unsigned long, nf_flags)
+		__field(unsigned char, nf_may)
+		__field(struct file *, nf_file)
+		__field(__be32, status)
+	),
+
+	TP_fast_assign(
+		__entry->xid = rqstp->rq_xid;
+		__entry->hash = hash;
+		__entry->inode = inode;
+		__entry->may_flags = may_flags;
+		__entry->nf_ref = nf ? atomic_read(&nf->nf_ref) : 0;
+		__entry->nf_flags = nf ? nf->nf_flags : 0;
+		__entry->nf_may = nf ? nf->nf_may : 0;
+		__entry->nf_file = nf ? nf->nf_file : NULL;
+		__entry->status = status;
+	),
+
+	TP_printk("xid=0x%x hash=0x%x inode=0x%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=0x%p status=%u",
+			be32_to_cpu(__entry->xid), __entry->hash, __entry->inode,
+			show_nf_may(__entry->may_flags), __entry->nf_ref,
+			show_nf_flags(__entry->nf_flags),
+			show_nf_may(__entry->nf_may), __entry->nf_file,
+			be32_to_cpu(__entry->status))
+);
+
+DECLARE_EVENT_CLASS(nfsd_file_search_class,
+	TP_PROTO(struct inode *inode, unsigned int hash, int found),
+	TP_ARGS(inode, hash, found),
+	TP_STRUCT__entry(
+		__field(struct inode *, inode)
+		__field(unsigned int, hash)
+		__field(int, found)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->hash = hash;
+		__entry->found = found;
+	),
+	TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash,
+			__entry->inode, __entry->found)
+);
+
+#define DEFINE_NFSD_FILE_SEARCH_EVENT(name)				\
+DEFINE_EVENT(nfsd_file_search_class, name,				\
+	TP_PROTO(struct inode *inode, unsigned int hash, int found),	\
+	TP_ARGS(inode, hash, found))
+
+DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync);
+DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode);
+DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_is_cached);
+
+TRACE_EVENT(nfsd_file_fsnotify_handle_event,
+	TP_PROTO(struct inode *inode, u32 mask),
+	TP_ARGS(inode, mask),
+	TP_STRUCT__entry(
+		__field(struct inode *, inode)
+		__field(unsigned int, nlink)
+		__field(umode_t, mode)
+		__field(u32, mask)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->nlink = inode->i_nlink;
+		__entry->mode = inode->i_mode;
+		__entry->mask = mask;
+	),
+	TP_printk("inode=0x%p nlink=%u mode=0%ho mask=0x%x", __entry->inode,
+			__entry->nlink, __entry->mode, __entry->mask)
+);
+
 #endif /* _NFSD_TRACE_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c85783e..bd0a385 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -44,38 +44,11 @@
 
 #include "nfsd.h"
 #include "vfs.h"
+#include "filecache.h"
 #include "trace.h"
 
 #define NFSDDBG_FACILITY		NFSDDBG_FILEOP
 
-
-/*
- * This is a cache of readahead params that help us choose the proper
- * readahead strategy. Initially, we set all readahead parameters to 0
- * and let the VFS handle things.
- * If you increase the number of cached files very much, you'll need to
- * add a hash table here.
- */
-struct raparms {
-	struct raparms		*p_next;
-	unsigned int		p_count;
-	ino_t			p_ino;
-	dev_t			p_dev;
-	int			p_set;
-	struct file_ra_state	p_ra;
-	unsigned int		p_hindex;
-};
-
-struct raparm_hbucket {
-	struct raparms		*pb_head;
-	spinlock_t		pb_lock;
-} ____cacheline_aligned_in_smp;
-
-#define RAPARM_HASH_BITS	4
-#define RAPARM_HASH_SIZE	(1<<RAPARM_HASH_BITS)
-#define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
-static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];
-
 /* 
  * Called from nfsd_lookup and encode_dirent. Check if we have crossed 
  * a mount point.
@@ -699,7 +672,7 @@
 }
 #endif /* CONFIG_NFSD_V3 */
 
-static int nfsd_open_break_lease(struct inode *inode, int access)
+int nfsd_open_break_lease(struct inode *inode, int access)
 {
 	unsigned int mode;
 
@@ -715,8 +688,8 @@
  * and additional flags.
  * N.B. After this call fhp needs an fh_put
  */
-__be32
-nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+static __be32
+__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 			int may_flags, struct file **filp)
 {
 	struct path	path;
@@ -726,25 +699,6 @@
 	__be32		err;
 	int		host_err = 0;
 
-	validate_process_creds();
-
-	/*
-	 * If we get here, then the client has already done an "open",
-	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
-	 * in case a chmod has now revoked permission.
-	 *
-	 * Arguably we should also allow the owner override for
-	 * directories, but we never have and it doesn't seem to have
-	 * caused anyone a problem.  If we were to change this, note
-	 * also that our filldir callbacks would need a variant of
-	 * lookup_one_len that doesn't check permissions.
-	 */
-	if (type == S_IFREG)
-		may_flags |= NFSD_MAY_OWNER_OVERRIDE;
-	err = fh_verify(rqstp, fhp, type, may_flags);
-	if (err)
-		goto out;
-
 	path.mnt = fhp->fh_export->ex_path.mnt;
 	path.dentry = fhp->fh_dentry;
 	inode = d_inode(path.dentry);
@@ -798,67 +752,46 @@
 out_nfserr:
 	err = nfserrno(host_err);
 out:
+	return err;
+}
+
+__be32
+nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+		int may_flags, struct file **filp)
+{
+	__be32 err;
+
+	validate_process_creds();
+	/*
+	 * If we get here, then the client has already done an "open",
+	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
+	 * in case a chmod has now revoked permission.
+	 *
+	 * Arguably we should also allow the owner override for
+	 * directories, but we never have and it doesn't seem to have
+	 * caused anyone a problem.  If we were to change this, note
+	 * also that our filldir callbacks would need a variant of
+	 * lookup_one_len that doesn't check permissions.
+	 */
+	if (type == S_IFREG)
+		may_flags |= NFSD_MAY_OWNER_OVERRIDE;
+	err = fh_verify(rqstp, fhp, type, may_flags);
+	if (!err)
+		err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
 	validate_process_creds();
 	return err;
 }
 
-struct raparms *
-nfsd_init_raparms(struct file *file)
+__be32
+nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+		int may_flags, struct file **filp)
 {
-	struct inode *inode = file_inode(file);
-	dev_t dev = inode->i_sb->s_dev;
-	ino_t ino = inode->i_ino;
-	struct raparms	*ra, **rap, **frap = NULL;
-	int depth = 0;
-	unsigned int hash;
-	struct raparm_hbucket *rab;
+	__be32 err;
 
-	hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK;
-	rab = &raparm_hash[hash];
-
-	spin_lock(&rab->pb_lock);
-	for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) {
-		if (ra->p_ino == ino && ra->p_dev == dev)
-			goto found;
-		depth++;
-		if (ra->p_count == 0)
-			frap = rap;
-	}
-	depth = nfsdstats.ra_size;
-	if (!frap) {	
-		spin_unlock(&rab->pb_lock);
-		return NULL;
-	}
-	rap = frap;
-	ra = *frap;
-	ra->p_dev = dev;
-	ra->p_ino = ino;
-	ra->p_set = 0;
-	ra->p_hindex = hash;
-found:
-	if (rap != &rab->pb_head) {
-		*rap = ra->p_next;
-		ra->p_next   = rab->pb_head;
-		rab->pb_head = ra;
-	}
-	ra->p_count++;
-	nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
-	spin_unlock(&rab->pb_lock);
-
-	if (ra->p_set)
-		file->f_ra = ra->p_ra;
-	return ra;
-}
-
-void nfsd_put_raparams(struct file *file, struct raparms *ra)
-{
-	struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
-
-	spin_lock(&rab->pb_lock);
-	ra->p_ra = file->f_ra;
-	ra->p_set = 1;
-	ra->p_count--;
-	spin_unlock(&rab->pb_lock);
+	validate_process_creds();
+	err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+	validate_process_creds();
+	return err;
 }
 
 /*
@@ -901,12 +834,23 @@
 	return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
 }
 
+static u32 nfsd_eof_on_read(struct file *file, loff_t offset, ssize_t len,
+		size_t expected)
+{
+	if (expected != 0 && len == 0)
+		return 1;
+	if (offset+len >= i_size_read(file_inode(file)))
+		return 1;
+	return 0;
+}
+
 static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			       struct file *file, loff_t offset,
-			       unsigned long *count, int host_err)
+			       unsigned long *count, u32 *eof, ssize_t host_err)
 {
 	if (host_err >= 0) {
 		nfsdstats.io_read += host_err;
+		*eof = nfsd_eof_on_read(file, offset, host_err, *count);
 		*count = host_err;
 		fsnotify_access(file);
 		trace_nfsd_read_io_done(rqstp, fhp, offset, *count);
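With the eof decision folded into nfsd_finish_read(), it is computed from
the same offset and byte count as the read it describes. A userspace
restatement of the rule in nfsd_eof_on_read() (the file size and offsets
below are made up):

    #include <stdio.h>
    #include <stddef.h>

    /* Same rule as nfsd_eof_on_read() above: a zero-byte result for a
     * non-zero request means eof, as does reaching the inode size. */
    static unsigned int eof_on_read(long long isize, long long offset,
                                    long long len, size_t expected)
    {
        if (expected != 0 && len == 0)
            return 1;
        if (offset + len >= isize)
            return 1;
        return 0;
    }

    int main(void)
    {
        printf("%u\n", eof_on_read(4096, 4000, 96, 96)); /* 1: hit the end */
        printf("%u\n", eof_on_read(4096, 0, 512, 512));  /* 0: more to read */
        return 0;
    }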
@@ -918,7 +862,8 @@
 }
 
 __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
-			struct file *file, loff_t offset, unsigned long *count)
+			struct file *file, loff_t offset, unsigned long *count,
+			u32 *eof)
 {
 	struct splice_desc sd = {
 		.len		= 0,
@@ -926,25 +871,27 @@
 		.pos		= offset,
 		.u.data		= rqstp,
 	};
-	int host_err;
+	ssize_t host_err;
 
 	trace_nfsd_read_splice(rqstp, fhp, offset, *count);
 	rqstp->rq_next_page = rqstp->rq_respages + 1;
 	host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
-	return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err);
+	return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
 }
 
 __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		  struct file *file, loff_t offset,
-		  struct kvec *vec, int vlen, unsigned long *count)
+		  struct kvec *vec, int vlen, unsigned long *count,
+		  u32 *eof)
 {
 	struct iov_iter iter;
-	int host_err;
+	loff_t ppos = offset;
+	ssize_t host_err;
 
 	trace_nfsd_read_vector(rqstp, fhp, offset, *count);
 	iov_iter_kvec(&iter, READ, vec, vlen, *count);
-	host_err = vfs_iter_read(file, &iter, &offset, 0);
-	return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err);
+	host_err = vfs_iter_read(file, &iter, &ppos, 0);
+	return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
 }
 
 /*
@@ -1025,8 +972,12 @@
 	nfsdstats.io_write += *cnt;
 	fsnotify_modify(file);
 
-	if (stable && use_wgather)
+	if (stable && use_wgather) {
 		host_err = wait_for_concurrent_writes(file);
+		if (host_err < 0)
+			nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
+						 nfsd_net_id));
+	}
 
 out_nfserr:
 	if (host_err >= 0) {
@@ -1047,27 +998,25 @@
  * N.B. After this call fhp needs an fh_put
  */
 __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
-	loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+	loff_t offset, struct kvec *vec, int vlen, unsigned long *count,
+	u32 *eof)
 {
+	struct nfsd_file	*nf;
 	struct file *file;
-	struct raparms	*ra;
 	__be32 err;
 
 	trace_nfsd_read_start(rqstp, fhp, offset, *count);
-	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+	err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
 	if (err)
 		return err;
 
-	ra = nfsd_init_raparms(file);
-
+	file = nf->nf_file;
 	if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
-		err = nfsd_splice_read(rqstp, fhp, file, offset, count);
+		err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
 	else
-		err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count);
+		err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof);
 
-	if (ra)
-		nfsd_put_raparams(file, ra);
-	fput(file);
+	nfsd_file_put(nf);
 
 	trace_nfsd_read_done(rqstp, fhp, offset, *count);
 
@@ -1083,17 +1032,18 @@
 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
 	   struct kvec *vec, int vlen, unsigned long *cnt, int stable)
 {
-	struct file *file = NULL;
-	__be32 err = 0;
+	struct nfsd_file *nf;
+	__be32 err;
 
 	trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
 
-	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+	err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
 	if (err)
 		goto out;
 
-	err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable);
-	fput(file);
+	err = nfsd_vfs_write(rqstp, fhp, nf->nf_file, offset, vec,
+			vlen, cnt, stable);
+	nfsd_file_put(nf);
 out:
 	trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
 	return err;
@@ -1113,9 +1063,9 @@
 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
                loff_t offset, unsigned long count)
 {
-	struct file	*file;
-	loff_t		end = LLONG_MAX;
-	__be32		err = nfserr_inval;
+	struct nfsd_file	*nf;
+	loff_t			end = LLONG_MAX;
+	__be32			err = nfserr_inval;
 
 	if (offset < 0)
 		goto out;
@@ -1125,20 +1075,27 @@
 			goto out;
 	}
 
-	err = nfsd_open(rqstp, fhp, S_IFREG,
-			NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file);
+	err = nfsd_file_acquire(rqstp, fhp,
+			NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf);
 	if (err)
 		goto out;
 	if (EX_ISSYNC(fhp->fh_export)) {
-		int err2 = vfs_fsync_range(file, offset, end, 0);
+		int err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
 
-		if (err2 != -EINVAL)
-			err = nfserrno(err2);
-		else
+		switch (err2) {
+		case 0:
+			break;
+		case -EINVAL:
 			err = nfserr_notsupp;
+			break;
+		default:
+			err = nfserrno(err2);
+			nfsd_reset_boot_verifier(net_generic(nf->nf_net,
+						 nfsd_net_id));
+		}
 	}
 
-	fput(file);
+	nfsd_file_put(nf);
 out:
 	return err;
 }
@@ -1659,6 +1616,26 @@
 	goto out_unlock;
 }
 
+static void
+nfsd_close_cached_files(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+
+	if (inode && S_ISREG(inode->i_mode))
+		nfsd_file_close_inode_sync(inode);
+}
+
+static bool
+nfsd_has_cached_files(struct dentry *dentry)
+{
+	bool		ret = false;
+	struct inode *inode = d_inode(dentry);
+
+	if (inode && S_ISREG(inode->i_mode))
+		ret = nfsd_file_is_cached(inode);
+	return ret;
+}
+
 /*
  * Rename a file
  * N.B. After this call _both_ ffhp and tfhp need an fh_put
@@ -1671,6 +1648,7 @@
 	struct inode	*fdir, *tdir;
 	__be32		err;
 	int		host_err;
+	bool		has_cached = false;
 
 	err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
 	if (err)
@@ -1689,6 +1667,7 @@
 	if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
 		goto out;
 
+retry:
 	host_err = fh_want_write(ffhp);
 	if (host_err) {
 		err = nfserrno(host_err);
@@ -1728,11 +1707,16 @@
 	if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
 		goto out_dput_new;
 
-	host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
-	if (!host_err) {
-		host_err = commit_metadata(tfhp);
-		if (!host_err)
-			host_err = commit_metadata(ffhp);
+	if (nfsd_has_cached_files(ndentry)) {
+		has_cached = true;
+		goto out_dput_old;
+	} else {
+		host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
+		if (!host_err) {
+			host_err = commit_metadata(tfhp);
+			if (!host_err)
+				host_err = commit_metadata(ffhp);
+		}
 	}
  out_dput_new:
 	dput(ndentry);
@@ -1745,12 +1729,26 @@
 	 * as that would do the wrong thing if the two directories
 	 * were the same, so again we do it by hand.
 	 */
-	fill_post_wcc(ffhp);
-	fill_post_wcc(tfhp);
+	if (!has_cached) {
+		fill_post_wcc(ffhp);
+		fill_post_wcc(tfhp);
+	}
 	unlock_rename(tdentry, fdentry);
 	ffhp->fh_locked = tfhp->fh_locked = false;
 	fh_drop_write(ffhp);
 
+	/*
+	 * If the target dentry has cached open files, then we need to try to
+	 * close them prior to doing the rename. Flushing delayed fput
+	 * shouldn't be done with locks held however, so we delay it until this
+	 * point and then reattempt the whole shebang.
+	 */
+	if (has_cached) {
+		has_cached = false;
+		nfsd_close_cached_files(ndentry);
+		dput(ndentry);
+		goto retry;
+	}
 out:
 	return err;
 }
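The rename path above now has a check-then-retry shape: the cached-files
test runs under the rename locks, but the flush itself is deferred until
the locks are dropped, after which the whole operation is retried. A
compilable sketch of that shape, with hypothetical helper declarations
standing in for the kernel primitives:

    #include <stdbool.h>

    /* Hypothetical helpers standing in for the kernel operations. */
    void take_locks(void);
    void drop_locks(void);
    bool target_has_cached_files(void);
    void close_cached_files(void);
    int do_rename_locked(void);

    int rename_with_cache_flush(void)
    {
        bool has_cached = false;
        int err = 0;

    retry:
        take_locks();
        if (target_has_cached_files())
            has_cached = true;
        else
            err = do_rename_locked();
        drop_locks();

        if (has_cached) {
            /* Must not run with the locks held; flush now and retry. */
            has_cached = false;
            close_cached_files();
            goto retry;
        }
        return err;
    }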
@@ -1797,10 +1795,13 @@
 	if (!type)
 		type = d_inode(rdentry)->i_mode & S_IFMT;
 
-	if (type != S_IFDIR)
+	if (type != S_IFDIR) {
+		nfsd_close_cached_files(rdentry);
 		host_err = vfs_unlink(dirp, rdentry, NULL);
-	else
+	} else {
 		host_err = vfs_rmdir(dirp, rdentry);
+	}
+
 	if (!host_err)
 		host_err = commit_metadata(fhp);
 	dput(rdentry);
@@ -2074,63 +2075,3 @@
 
 	return err? nfserrno(err) : 0;
 }
-
-void
-nfsd_racache_shutdown(void)
-{
-	struct raparms *raparm, *last_raparm;
-	unsigned int i;
-
-	dprintk("nfsd: freeing readahead buffers.\n");
-
-	for (i = 0; i < RAPARM_HASH_SIZE; i++) {
-		raparm = raparm_hash[i].pb_head;
-		while(raparm) {
-			last_raparm = raparm;
-			raparm = raparm->p_next;
-			kfree(last_raparm);
-		}
-		raparm_hash[i].pb_head = NULL;
-	}
-}
-/*
- * Initialize readahead param cache
- */
-int
-nfsd_racache_init(int cache_size)
-{
-	int	i;
-	int	j = 0;
-	int	nperbucket;
-	struct raparms **raparm = NULL;
-
-
-	if (raparm_hash[0].pb_head)
-		return 0;
-	nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
-	nperbucket = max(2, nperbucket);
-	cache_size = nperbucket * RAPARM_HASH_SIZE;
-
-	dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
-
-	for (i = 0; i < RAPARM_HASH_SIZE; i++) {
-		spin_lock_init(&raparm_hash[i].pb_lock);
-
-		raparm = &raparm_hash[i].pb_head;
-		for (j = 0; j < nperbucket; j++) {
-			*raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
-			if (!*raparm)
-				goto out_nomem;
-			raparm = &(*raparm)->p_next;
-		}
-		*raparm = NULL;
-	}
-
-	nfsdstats.ra_size = cache_size;
-	return 0;
-
-out_nomem:
-	dprintk("nfsd: kmalloc failed, freeing readahead buffers\n");
-	nfsd_racache_shutdown();
-	return -ENOMEM;
-}
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index db35124..a13fd9d 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -40,8 +40,6 @@
 typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
 
 /* nfsd/vfs.c */
-int		nfsd_racache_init(int);
-void		nfsd_racache_shutdown(void);
 int		nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 		                struct svc_export **expp);
 __be32		nfsd_lookup(struct svc_rqst *, struct svc_fh *,
@@ -75,18 +73,23 @@
 __be32		nfsd_commit(struct svc_rqst *, struct svc_fh *,
 				loff_t, unsigned long);
 #endif /* CONFIG_NFSD_V3 */
+int 		nfsd_open_break_lease(struct inode *, int);
 __be32		nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
 				int, struct file **);
-struct raparms;
+__be32		nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t,
+				int, struct file **);
 __be32		nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				struct file *file, loff_t offset,
-				unsigned long *count);
+				unsigned long *count,
+				u32 *eof);
 __be32		nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				struct file *file, loff_t offset,
 				struct kvec *vec, int vlen,
-				unsigned long *count);
+				unsigned long *count,
+				u32 *eof);
 __be32 		nfsd_read(struct svc_rqst *, struct svc_fh *,
-				loff_t, struct kvec *, int, unsigned long *);
+				loff_t, struct kvec *, int, unsigned long *,
+				u32 *eof);
 __be32 		nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
 				struct kvec *, int, unsigned long *, int);
 __be32		nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -115,9 +118,6 @@
 __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
 				struct dentry *, int);
 
-struct raparms *nfsd_init_raparms(struct file *file);
-void		nfsd_put_raparams(struct file *file, struct raparms *ra);
-
 static inline int fh_want_write(struct svc_fh *fh)
 {
 	int ret;
@@ -152,23 +152,4 @@
 	       || createmode == NFS4_CREATE_EXCLUSIVE4_1;
 }
 
-static inline bool nfsd_eof_on_read(long requested, long read,
-				loff_t offset, loff_t size)
-{
-	/* We assume a short read means eof: */
-	if (requested > read)
-		return true;
-	/*
-	 * A non-short read might also reach end of file.  The spec
-	 * still requires us to set eof in that case.
-	 *
-	 * Further operations may have modified the file size since
-	 * the read, so the following check is not atomic with the read.
-	 * We've only seen that cause a problem for a client in the case
-	 * where the read returned a count of 0 without setting eof.
-	 * That case was fixed by the addition of the above check.
-	 */
-	return (offset + read >= size);
-}
-
 #endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 2cb29e9..99ff9f4 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -151,7 +151,7 @@
 	__be32			status;
 	struct svc_fh		fh;
 	unsigned long		count;
-	int			eof;
+	__u32			eof;
 };
 
 struct nfsd3_writeres {
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d64c870..f4737d6 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -273,15 +273,14 @@
 
 
 struct nfsd4_read {
-	stateid_t	rd_stateid;         /* request */
-	u64		rd_offset;          /* request */
-	u32		rd_length;          /* request */
-	int		rd_vlen;
-	struct file     *rd_filp;
-	bool		rd_tmp_file;
+	stateid_t		rd_stateid;         /* request */
+	u64			rd_offset;          /* request */
+	u32			rd_length;          /* request */
+	int			rd_vlen;
+	struct nfsd_file	*rd_nf;
 	
-	struct svc_rqst *rd_rqstp;          /* response */
-	struct svc_fh * rd_fhp;             /* response */
+	struct svc_rqst		*rd_rqstp;          /* response */
+	struct svc_fh		*rd_fhp;             /* response */
 };
 
 struct nfsd4_readdir {
@@ -538,8 +537,8 @@
 
 	struct nfs4_client      *cp_clp;
 
-	struct file             *file_src;
-	struct file             *file_dst;
+	struct nfsd_file        *nf_src;
+	struct nfsd_file        *nf_dst;
 
 	stateid_t		cp_stateid;
 
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index c03758c..7a42c2e 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -13,6 +13,7 @@
 #include <linux/sched/signal.h>
 #include <linux/dnotify.h>
 #include <linux/init.h>
+#include <linux/security.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/fdtable.h>
@@ -279,6 +280,17 @@
 		goto out_err;
 	}
 
+	/*
+	 * convert the userspace DN_* "arg" to the internal FS_*
+	 * defined in fsnotify
+	 */
+	mask = convert_arg(arg);
+
+	error = security_path_notify(&filp->f_path, mask,
+			FSNOTIFY_OBJ_TYPE_INODE);
+	if (error)
+		goto out_err;
+
 	/* expect most fcntl to add new rather than augment old */
 	dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
 	if (!dn) {
@@ -293,9 +305,6 @@
 		goto out_err;
 	}
 
-	/* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
-	mask = convert_arg(arg);
-
 	/* set up the new_fsn_mark and new_dn_mark */
 	new_fsn_mark = &new_dn_mark->fsn_mark;
 	fsnotify_init_mark(new_fsn_mark, dnotify_group);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 91006f4..8508ab5 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -528,7 +528,8 @@
 };
 
 static int fanotify_find_path(int dfd, const char __user *filename,
-			      struct path *path, unsigned int flags)
+			      struct path *path, unsigned int flags, __u64 mask,
+			      unsigned int obj_type)
 {
 	int ret;
 
@@ -567,8 +568,15 @@
 
 	/* you can only watch an inode if you have read permissions on it */
 	ret = inode_permission(path->dentry->d_inode, MAY_READ);
+	if (ret) {
+		path_put(path);
+		goto out;
+	}
+
+	ret = security_path_notify(path, mask, obj_type);
 	if (ret)
 		path_put(path);
+
 out:
 	return ret;
 }
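All three notification backends now share the same shape: resolve the path,
check read permission, then let the LSM veto the watch via
security_path_notify(), dropping the path reference on any failure. A
condensed sketch of that sequence (the wrapper name is hypothetical; mask
and obj_type come from the caller):

    #include <linux/namei.h>
    #include <linux/security.h>

    static int resolve_watch_path(const char __user *name,
                                  unsigned int lookup_flags, __u64 mask,
                                  unsigned int obj_type, struct path *path)
    {
        int err = user_path_at(AT_FDCWD, name, lookup_flags, path);

        if (err)
            return err;
        /* Watching an inode requires read permission on it... */
        err = inode_permission(path->dentry->d_inode, MAY_READ);
        /* ...and the security module must not object. */
        if (!err)
            err = security_path_notify(path, mask, obj_type);
        if (err)
            path_put(path); /* every failure path drops the reference */
        return err;
    }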
@@ -947,6 +955,7 @@
 	__kernel_fsid_t __fsid, *fsid = NULL;
 	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+	unsigned int obj_type;
 	int ret;
 
 	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
@@ -961,8 +970,13 @@
 
 	switch (mark_type) {
 	case FAN_MARK_INODE:
+		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
+		break;
 	case FAN_MARK_MOUNT:
+		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
+		break;
 	case FAN_MARK_FILESYSTEM:
+		obj_type = FSNOTIFY_OBJ_TYPE_SB;
 		break;
 	default:
 		return -EINVAL;
@@ -1030,7 +1044,8 @@
 		goto fput_and_out;
 	}
 
-	ret = fanotify_find_path(dfd, pathname, &path, flags);
+	ret = fanotify_find_path(dfd, pathname, &path, flags,
+			(mask & ALL_FSNOTIFY_EVENTS), obj_type);
 	if (ret)
 		goto fput_and_out;
 
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 5a00121..f346282 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -54,8 +54,6 @@
 {
 	fsnotify_destroy_marks(&sb->s_fsnotify_marks);
 }
-/* Wait until all marks queued for destruction are destroyed */
-extern void fsnotify_wait_marks_destroyed(void);
 
 /*
  * update the dentry->d_flags of all of inode's children to indicate if inode cares
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 0391190..133f723 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -108,6 +108,7 @@
 	if (refcount_dec_and_test(&group->refcnt))
 		fsnotify_final_destroy_group(group);
 }
+EXPORT_SYMBOL_GPL(fsnotify_put_group);
 
 /*
  * Create a new fsnotify_group and hold a reference for the group returned.
@@ -137,6 +138,7 @@
 
 	return group;
 }
+EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
 
 int fsnotify_fasync(int fd, struct file *file, int on)
 {
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 0b81517..107537a 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -30,6 +30,7 @@
 #include <linux/poll.h>
 #include <linux/wait.h>
 #include <linux/memcontrol.h>
+#include <linux/security.h>
 
 #include "inotify.h"
 #include "../fdinfo.h"
@@ -331,7 +332,8 @@
 /*
  * find_inode - resolve a user-given path to a specific inode
  */
-static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags)
+static int inotify_find_inode(const char __user *dirname, struct path *path,
+						unsigned int flags, __u64 mask)
 {
 	int error;
 
@@ -340,8 +342,15 @@
 		return error;
 	/* you can only watch an inode if you have read permissions on it */
 	error = inode_permission(path->dentry->d_inode, MAY_READ);
+	if (error) {
+		path_put(path);
+		return error;
+	}
+	error = security_path_notify(path, mask,
+				FSNOTIFY_OBJ_TYPE_INODE);
 	if (error)
 		path_put(path);
+
 	return error;
 }
 
@@ -733,7 +742,8 @@
 	if (mask & IN_ONLYDIR)
 		flags |= LOOKUP_DIRECTORY;
 
-	ret = inotify_find_inode(pathname, &path, flags);
+	ret = inotify_find_inode(pathname, &path, flags,
+			(mask & IN_ALL_EVENTS));
 	if (ret)
 		goto fput_and_out;
 
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 99ddd12..1d96216 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -276,6 +276,7 @@
 	queue_delayed_work(system_unbound_wq, &reaper_work,
 			   FSNOTIFY_REAPER_DELAY);
 }
+EXPORT_SYMBOL_GPL(fsnotify_put_mark);
 
 /*
  * Get mark reference when we found the mark via lockless traversal of object
@@ -430,6 +431,7 @@
 	mutex_unlock(&group->mark_mutex);
 	fsnotify_free_mark(mark);
 }
+EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);
 
 /*
  * Sorting function for lists of fsnotify marks.
@@ -685,6 +687,7 @@
 	mutex_unlock(&group->mark_mutex);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(fsnotify_add_mark);
 
 /*
  * Given a list of marks, find the mark associated with given group. If found
@@ -711,6 +714,7 @@
 	spin_unlock(&conn->lock);
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(fsnotify_find_mark);
 
 /* Clear any marks in a group with given type mask */
 void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
@@ -809,6 +813,7 @@
 	mark->group = group;
 	WRITE_ONCE(mark->connector, NULL);
 }
+EXPORT_SYMBOL_GPL(fsnotify_init_mark);
 
 /*
  * Destroy all marks in destroy_list, waits for SRCU period to finish before
@@ -837,3 +842,4 @@
 {
 	flush_delayed_work(&reaper_work);
 }
+EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 20c841a..3aac5c9 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -71,7 +71,7 @@
 	}
 	/* Read, map, and pin the page. */
 	page = ntfs_map_page(mft_vi->i_mapping, index);
-	if (likely(!IS_ERR(page))) {
+	if (!IS_ERR(page)) {
 		/* Catch multi sector transfer fixup errors. */
 		if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
 				ofs)))) {
@@ -154,7 +154,7 @@
 	mutex_lock(&ni->mrec_lock);
 
 	m = map_mft_record_page(ni);
-	if (likely(!IS_ERR(m)))
+	if (!IS_ERR(m))
 		return m;
 
 	mutex_unlock(&ni->mrec_lock);
@@ -271,7 +271,7 @@
 		m = map_mft_record(ni);
 		/* map_mft_record() has incremented this on success. */
 		atomic_dec(&ni->count);
-		if (likely(!IS_ERR(m))) {
+		if (!IS_ERR(m)) {
 			/* Verify the sequence number. */
 			if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
 				ntfs_debug("Done 1.");
@@ -1303,7 +1303,7 @@
 	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
 	rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
 			(ll - 1) >> vol->cluster_size_bits, NULL);
-	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+	if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
 		up_write(&mftbmp_ni->runlist.lock);
 		ntfs_error(vol->sb, "Failed to determine last allocated "
 				"cluster of mft bitmap attribute.");
@@ -1734,7 +1734,7 @@
 	read_unlock_irqrestore(&mft_ni->size_lock, flags);
 	rl = ntfs_attr_find_vcn_nolock(mft_ni,
 			(ll - 1) >> vol->cluster_size_bits, NULL);
-	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+	if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
 		up_write(&mft_ni->runlist.lock);
 		ntfs_error(vol->sb, "Failed to determine last allocated "
 				"cluster of mft data attribute.");
@@ -1776,7 +1776,7 @@
 	do {
 		rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
 				true);
-		if (likely(!IS_ERR(rl2)))
+		if (!IS_ERR(rl2))
 			break;
 		if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
 			ntfs_error(vol->sb, "Failed to allocate the minimal "
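The likely()/unlikely() adjustments in the ntfs hunks above all follow one
rule: IS_ERR() already carries a branch hint of its own, so wrapping it in
another unlikely() is redundant, and the hint is kept only on the bare
conditions next to it. A rough userspace restatement of the err.h encoding
(MAX_ERRNO as in the kernel):

    #include <stdio.h>
    #include <stdbool.h>

    #define MAX_ERRNO   4095
    #define unlikely(x) __builtin_expect(!!(x), 0)

    /* Restates include/linux/err.h: the unlikely() lives inside
     * IS_ERR() itself, so callers need not add their own. */
    static inline bool IS_ERR(const void *ptr)
    {
        return unlikely((unsigned long)ptr >= (unsigned long)-MAX_ERRNO);
    }

    int main(void)
    {
        int x;
        void *good = &x;
        void *bad = (void *)-5L; /* an ERR_PTR(-EIO)-style encoding */

        printf("good=%d bad=%d\n", IS_ERR(good), IS_ERR(bad));
        return 0;
    }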
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 2d3cc9e..4e6a44b 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -115,7 +115,7 @@
 		dent_ino = MREF(mref);
 		ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino);
 		dent_inode = ntfs_iget(vol->sb, dent_ino);
-		if (likely(!IS_ERR(dent_inode))) {
+		if (!IS_ERR(dent_inode)) {
 			/* Consistency check. */
 			if (is_bad_inode(dent_inode) || MSEQNO(mref) ==
 					NTFS_I(dent_inode)->seq_no ||
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 508744a..97932fb 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -951,7 +951,7 @@
 	}
 	/* Now combine the new and old runlists checking for overlaps. */
 	old_rl = ntfs_runlists_merge(old_rl, rl);
-	if (likely(!IS_ERR(old_rl)))
+	if (!IS_ERR(old_rl))
 		return old_rl;
 	ntfs_free(rl);
 	ntfs_error(vol->sb, "Failed to merge runlists.");
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 29621d4..7dc3bc6 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1475,7 +1475,7 @@
 	kfree(name);
 	/* Get the inode. */
 	tmp_ino = ntfs_iget(vol->sb, MREF(mref));
-	if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) {
+	if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) {
 		if (!IS_ERR(tmp_ino))
 			iput(tmp_ino);
 		ntfs_error(vol->sb, "Failed to load $UsnJrnl.");
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0c335b5..f9baefc 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5993,6 +5993,7 @@
 	struct buffer_head *data_alloc_bh = NULL;
 	struct ocfs2_dinode *di;
 	struct ocfs2_truncate_log *tl;
+	struct ocfs2_journal *journal = osb->journal;
 
 	BUG_ON(inode_trylock(tl_inode));
 
@@ -6013,6 +6014,20 @@
 		goto out;
 	}
 
+	/* Appending to the truncate log (TA) and flushing it (TF) are two
+	 * separate transactions. Both may be committed but not yet
+	 * checkpointed. If a crash occurs then, replaying both would free
+	 * the clusters already released to the global bitmap a second time
+	 * when the truncate log is replayed, causing a cluster double free.
+	 */
+	jbd2_journal_lock_updates(journal->j_journal);
+	status = jbd2_journal_flush(journal->j_journal);
+	jbd2_journal_unlock_updates(journal->j_journal);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
 	data_alloc_inode = ocfs2_get_system_file_inode(osb,
 						       GLOBAL_BITMAP_SYSTEM_INODE,
 						       OCFS2_INVALID_SLOT);
@@ -6792,6 +6807,8 @@
 			      struct page *page, int zero, u64 *phys)
 {
 	int ret, partial = 0;
+	loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from;
+	loff_t length = to - from;
 
 	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
 	if (ret)
@@ -6811,7 +6828,8 @@
 	if (ret < 0)
 		mlog_errno(ret);
 	else if (ocfs2_should_order_data(inode)) {
-		ret = ocfs2_jbd2_file_inode(handle, inode);
+		ret = ocfs2_jbd2_inode_add_write(handle, inode,
+						 start_byte, length);
 		if (ret < 0)
 			mlog_errno(ret);
 	}
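ocfs2_jbd2_inode_add_write() takes an explicit byte range instead of
journalling the whole inode, so each caller now derives that range from the
page index plus the intra-page offsets. A small userspace check of the
computation (4 KiB pages and the sample offsets are assumptions):

    #include <stdio.h>

    #define PAGE_SHIFT 12 /* assuming 4 KiB pages for the example */

    int main(void)
    {
        unsigned long page_index = 3;   /* fourth page of the file */
        unsigned int from = 100, to = 612;

        /* Same computation as above: absolute byte range in the file. */
        long long start_byte = ((long long)page_index << PAGE_SHIFT) + from;
        long long length = to - from;

        printf("journal range: [%lld, %lld)\n",
               start_byte, start_byte + length);
        return 0;
    }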
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a4c905d..8de1c9d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -942,7 +942,8 @@
 
 		if (tmppage && page_has_buffers(tmppage)) {
 			if (ocfs2_should_order_data(inode))
-				ocfs2_jbd2_file_inode(wc->w_handle, inode);
+				ocfs2_jbd2_inode_add_write(wc->w_handle, inode,
+							   user_pos, user_len);
 
 			block_commit_write(tmppage, from, to);
 		}
@@ -2023,8 +2024,14 @@
 		}
 
 		if (page_has_buffers(tmppage)) {
-			if (handle && ocfs2_should_order_data(inode))
-				ocfs2_jbd2_file_inode(handle, inode);
+			if (handle && ocfs2_should_order_data(inode)) {
+				loff_t start_byte =
+					((loff_t)tmppage->index << PAGE_SHIFT) +
+					from;
+				loff_t length = to - from;
+				ocfs2_jbd2_inode_add_write(handle, inode,
+							   start_byte, length);
+			}
 			block_commit_write(tmppage, from, to);
 		}
 	}
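
These call sites now hand jbd2 an explicit byte range, so ordered mode only writes out the data the transaction actually touched instead of the whole inode. A sketch of how the range is derived from a page-relative write; the variables mirror the patch, the helper itself is illustrative:

#include <linux/mm.h>

static void page_write_range(struct page *page, unsigned int from,
			     unsigned int to, loff_t *start_byte,
			     loff_t *length)
{
	/* byte offset of the page in the file, plus the intra-page start */
	*start_byte = ((loff_t)page->index << PAGE_SHIFT) + from;
	*length = to - from;
}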
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index 429e6a8..eaf042f 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -231,14 +231,6 @@
 }
 DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
 
-static struct dentry *blockcheck_debugfs_create(const char *name,
-						struct dentry *parent,
-						u64 *value)
-{
-	return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
-				   &blockcheck_fops);
-}
-
 static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
 {
 	if (stats) {
@@ -250,16 +242,20 @@
 static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
 					   struct dentry *parent)
 {
-	stats->b_debug_dir = debugfs_create_dir("blockcheck", parent);
+	struct dentry *dir;
 
-	blockcheck_debugfs_create("blocks_checked", stats->b_debug_dir,
-				  &stats->b_check_count);
+	dir = debugfs_create_dir("blockcheck", parent);
+	stats->b_debug_dir = dir;
 
-	blockcheck_debugfs_create("checksums_failed", stats->b_debug_dir,
-				  &stats->b_failure_count);
+	debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir,
+			    &stats->b_check_count, &blockcheck_fops);
 
-	blockcheck_debugfs_create("ecc_recoveries", stats->b_debug_dir,
-				  &stats->b_recover_count);
+	debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir,
+			    &stats->b_failure_count, &blockcheck_fops);
+
+	debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir,
+			    &stats->b_recover_count, &blockcheck_fops);
+
 }
 #else
 static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
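
All the debugfs churn in this series follows the same rule: debugfs creation calls need no error checking, so callers keep only the parent directory dentry and tear the whole tree down with one recursive remove. A generic sketch of the idiom (names are illustrative):

#include <linux/debugfs.h>

static struct dentry *demo_dir;
static u64 demo_count;

static void demo_debugfs_init(void)
{
	demo_dir = debugfs_create_dir("demo", NULL);
	/* return values of the create calls are deliberately ignored */
	debugfs_create_u64("count", 0444, demo_dir, &demo_count);
}

static void demo_debugfs_exit(void)
{
	debugfs_remove_recursive(demo_dir);	/* removes the files too */
}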
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f1b6133..a368350 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -225,10 +225,6 @@
 	unsigned int		hr_region_num;
 
 	struct dentry		*hr_debug_dir;
-	struct dentry		*hr_debug_livenodes;
-	struct dentry		*hr_debug_regnum;
-	struct dentry		*hr_debug_elapsed_time;
-	struct dentry		*hr_debug_pinned;
 	struct o2hb_debug_buf	*hr_db_livenodes;
 	struct o2hb_debug_buf	*hr_db_regnum;
 	struct o2hb_debug_buf	*hr_db_elapsed_time;
@@ -1394,21 +1390,20 @@
 	kfree(o2hb_db_failedregions);
 }
 
-static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
-					struct o2hb_debug_buf **db, int db_len,
-					int type, int size, int len, void *data)
+static void o2hb_debug_create(const char *name, struct dentry *dir,
+			      struct o2hb_debug_buf **db, int db_len, int type,
+			      int size, int len, void *data)
 {
 	*db = kmalloc(db_len, GFP_KERNEL);
 	if (!*db)
-		return NULL;
+		return;
 
 	(*db)->db_type = type;
 	(*db)->db_size = size;
 	(*db)->db_len = len;
 	(*db)->db_data = data;
 
-	return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db,
-				   &o2hb_debug_fops);
+	debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops);
 }
 
 static void o2hb_debug_init(void)
@@ -1525,11 +1520,7 @@
 
 	kfree(reg->hr_slots);
 
-	debugfs_remove(reg->hr_debug_livenodes);
-	debugfs_remove(reg->hr_debug_regnum);
-	debugfs_remove(reg->hr_debug_elapsed_time);
-	debugfs_remove(reg->hr_debug_pinned);
-	debugfs_remove(reg->hr_debug_dir);
+	debugfs_remove_recursive(reg->hr_debug_dir);
 	kfree(reg->hr_db_livenodes);
 	kfree(reg->hr_db_regnum);
 	kfree(reg->hr_db_elapsed_time);
@@ -1988,69 +1979,33 @@
 		: NULL;
 }
 
-static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+static void o2hb_debug_region_init(struct o2hb_region *reg,
+				   struct dentry *parent)
 {
-	int ret = -ENOMEM;
+	struct dentry *dir;
 
-	reg->hr_debug_dir =
-		debugfs_create_dir(config_item_name(&reg->hr_item), dir);
-	if (!reg->hr_debug_dir) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	dir = debugfs_create_dir(config_item_name(&reg->hr_item), parent);
+	reg->hr_debug_dir = dir;
 
-	reg->hr_debug_livenodes =
-			o2hb_debug_create(O2HB_DEBUG_LIVENODES,
-					  reg->hr_debug_dir,
-					  &(reg->hr_db_livenodes),
-					  sizeof(*(reg->hr_db_livenodes)),
-					  O2HB_DB_TYPE_REGION_LIVENODES,
-					  sizeof(reg->hr_live_node_bitmap),
-					  O2NM_MAX_NODES, reg);
-	if (!reg->hr_debug_livenodes) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes),
+			  sizeof(*(reg->hr_db_livenodes)),
+			  O2HB_DB_TYPE_REGION_LIVENODES,
+			  sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES,
+			  reg);
 
-	reg->hr_debug_regnum =
-			o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
-					  reg->hr_debug_dir,
-					  &(reg->hr_db_regnum),
-					  sizeof(*(reg->hr_db_regnum)),
-					  O2HB_DB_TYPE_REGION_NUMBER,
-					  0, O2NM_MAX_NODES, reg);
-	if (!reg->hr_debug_regnum) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum),
+			  sizeof(*(reg->hr_db_regnum)),
+			  O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg);
 
-	reg->hr_debug_elapsed_time =
-			o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
-					  reg->hr_debug_dir,
-					  &(reg->hr_db_elapsed_time),
-					  sizeof(*(reg->hr_db_elapsed_time)),
-					  O2HB_DB_TYPE_REGION_ELAPSED_TIME,
-					  0, 0, reg);
-	if (!reg->hr_debug_elapsed_time) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir,
+			  &(reg->hr_db_elapsed_time),
+			  sizeof(*(reg->hr_db_elapsed_time)),
+			  O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg);
 
-	reg->hr_debug_pinned =
-			o2hb_debug_create(O2HB_DEBUG_REGION_PINNED,
-					  reg->hr_debug_dir,
-					  &(reg->hr_db_pinned),
-					  sizeof(*(reg->hr_db_pinned)),
-					  O2HB_DB_TYPE_REGION_PINNED,
-					  0, 0, reg);
-	if (!reg->hr_debug_pinned) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned),
+			  sizeof(*(reg->hr_db_pinned)),
+			  O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg);
 
-	ret = 0;
-bail:
-	return ret;
 }
 
 static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
@@ -2106,11 +2061,7 @@
 	if (ret)
 		goto unregister_handler;
 
-	ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
-	if (ret) {
-		config_item_put(&reg->hr_item);
-		goto unregister_handler;
-	}
+	o2hb_debug_region_init(reg, o2hb_debug_dir);
 
 	return &reg->hr_item;
 
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 784426d..bdef72c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3636,7 +3636,7 @@
 	int i, j, num_used;
 	u32 major_hash;
 	struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
-	struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
+	struct ocfs2_dx_entry_list *orig_list, *tmp_list;
 	struct ocfs2_dx_entry *dx_entry;
 
 	tmp_list = &tmp_dx_leaf->dl_list;
@@ -3645,7 +3645,6 @@
 		orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
 		orig_list = &orig_dx_leaf->dl_list;
 		new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
-		new_list = &new_dx_leaf->dl_list;
 
 		num_used = le16_to_cpu(orig_list->de_num_used);
 
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 69a429b..aaf2454 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -142,7 +142,6 @@
 	atomic_t res_tot_count;
 	atomic_t res_cur_count;
 
-	struct dlm_debug_ctxt *dlm_debug_ctxt;
 	struct dentry *dlm_debugfs_subroot;
 
 	/* NOTE: Next three are protected by dlm_domain_lock */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index a4b58ba..4d0b452 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -853,67 +853,34 @@
 /* files in subroot */
 void dlm_debug_init(struct dlm_ctxt *dlm)
 {
-	struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
-
 	/* for dumping dlm_ctxt */
-	dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE,
-						     S_IFREG|S_IRUSR,
-						     dlm->dlm_debugfs_subroot,
-						     dlm, &debug_state_fops);
+	debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR,
+			    dlm->dlm_debugfs_subroot, dlm, &debug_state_fops);
 
 	/* for dumping lockres */
-	dc->debug_lockres_dentry =
-			debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
-					    S_IFREG|S_IRUSR,
-					    dlm->dlm_debugfs_subroot,
-					    dlm, &debug_lockres_fops);
+	debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, S_IFREG|S_IRUSR,
+			    dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops);
 
 	/* for dumping mles */
-	dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE,
-						   S_IFREG|S_IRUSR,
-						   dlm->dlm_debugfs_subroot,
-						   dlm, &debug_mle_fops);
+	debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR,
+			    dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops);
 
 	/* for dumping lockres on the purge list */
-	dc->debug_purgelist_dentry =
-			debugfs_create_file(DLM_DEBUGFS_PURGE_LIST,
-					    S_IFREG|S_IRUSR,
-					    dlm->dlm_debugfs_subroot,
-					    dlm, &debug_purgelist_fops);
-}
-
-void dlm_debug_shutdown(struct dlm_ctxt *dlm)
-{
-	struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
-
-	if (dc) {
-		debugfs_remove(dc->debug_purgelist_dentry);
-		debugfs_remove(dc->debug_mle_dentry);
-		debugfs_remove(dc->debug_lockres_dentry);
-		debugfs_remove(dc->debug_state_dentry);
-		kfree(dc);
-		dc = NULL;
-	}
+	debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR,
+			    dlm->dlm_debugfs_subroot, dlm,
+			    &debug_purgelist_fops);
 }
 
 /* subroot - domain dir */
-int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
 {
-	dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt),
-				      GFP_KERNEL);
-	if (!dlm->dlm_debug_ctxt) {
-		mlog_errno(-ENOMEM);
-		return -ENOMEM;
-	}
-
 	dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
 						      dlm_debugfs_root);
-	return 0;
 }
 
 void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
 {
-	debugfs_remove(dlm->dlm_debugfs_subroot);
+	debugfs_remove_recursive(dlm->dlm_debugfs_subroot);
 }
 
 /* debugfs root */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 7d0c7c9..f8fd868 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -14,13 +14,6 @@
 
 #ifdef CONFIG_DEBUG_FS
 
-struct dlm_debug_ctxt {
-	struct dentry *debug_state_dentry;
-	struct dentry *debug_lockres_dentry;
-	struct dentry *debug_mle_dentry;
-	struct dentry *debug_purgelist_dentry;
-};
-
 struct debug_lockres {
 	int dl_len;
 	char *dl_buf;
@@ -29,9 +22,8 @@
 };
 
 void dlm_debug_init(struct dlm_ctxt *dlm);
-void dlm_debug_shutdown(struct dlm_ctxt *dlm);
 
-int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
+void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
 void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
 
 void dlm_create_debugfs_root(void);
@@ -42,13 +34,9 @@
 static inline void dlm_debug_init(struct dlm_ctxt *dlm)
 {
 }
-static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+static inline void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
 {
 }
-static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
-{
-	return 0;
-}
 static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
 {
 }
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 7338b5d..ee6f459 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -387,7 +387,6 @@
 static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
 {
 	dlm_unregister_domain_handlers(dlm);
-	dlm_debug_shutdown(dlm);
 	dlm_complete_thread(dlm);
 	dlm_complete_recovery_thread(dlm);
 	dlm_destroy_dlm_worker(dlm);
@@ -1938,7 +1937,6 @@
 
 	if (status) {
 		dlm_unregister_domain_handlers(dlm);
-		dlm_debug_shutdown(dlm);
 		dlm_complete_thread(dlm);
 		dlm_complete_recovery_thread(dlm);
 		dlm_destroy_dlm_worker(dlm);
@@ -1992,9 +1990,7 @@
 	dlm->key = key;
 	dlm->node_num = o2nm_this_node();
 
-	ret = dlm_create_debugfs_subroot(dlm);
-	if (ret < 0)
-		goto leave;
+	dlm_create_debugfs_subroot(dlm);
 
 	spin_lock_init(&dlm->spinlock);
 	spin_lock_init(&dlm->master_lock);
@@ -2056,6 +2052,7 @@
 	mlog(0, "context init: refcount %u\n",
 		  kref_read(&dlm->dlm_refs));
 
+	ret = 0;
 leave:
 	if (ret < 0 && dlm) {
 		if (dlm->master_hash)
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index e786577..3883633 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -90,7 +90,8 @@
 	enum dlm_status status;
 	int actions = 0;
 	int in_use;
-        u8 owner;
+	u8 owner;
+	int recovery_wait = 0;
 
 	mlog(0, "master_node = %d, valblk = %d\n", master_node,
 	     flags & LKM_VALBLK);
@@ -193,9 +194,12 @@
 		}
 		if (flags & LKM_CANCEL)
 			lock->cancel_pending = 0;
-		else
-			lock->unlock_pending = 0;
-
+		else {
+			if (!lock->unlock_pending)
+				recovery_wait = 1;
+			else
+				lock->unlock_pending = 0;
+		}
 	}
 
 	/* get an extra ref on lock.  if we are just switching
@@ -229,6 +233,17 @@
 	spin_unlock(&res->spinlock);
 	wake_up(&res->wq);
 
+	if (recovery_wait) {
+		spin_lock(&res->spinlock);
+		/* The unlock request succeeds immediately once the owner dies,
+		 * and the lock has already been removed from the grant list.
+		 * We must wait for RECOVERING to finish or we miss the chance
+		 * to purge it, since removal is much faster than recovery.
+		 */
+		__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING);
+		spin_unlock(&res->spinlock);
+	}
+
 	/* let the caller's final dlm_lock_put handle the actual kfree */
 	if (actions & DLM_UNLOCK_FREE_LOCK) {
 		/* this should always be coupled with list removal */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1420723..6e774c5 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2508,9 +2508,7 @@
 			ocfs2_inode_unlock(inode, ex);
 	}
 
-	if (local_bh)
-		brelse(local_bh);
-
+	brelse(local_bh);
 	return status;
 }
 
@@ -2593,8 +2591,7 @@
 		*level = 1;
 		if (ocfs2_should_update_atime(inode, vfsmnt))
 			ocfs2_update_inode_atime(inode, bh);
-		if (bh)
-			brelse(bh);
+		brelse(bh);
 	} else
 		*level = 0;
 
@@ -3012,8 +3009,6 @@
 
 	kref_init(&dlm_debug->d_refcnt);
 	INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
-	dlm_debug->d_locking_state = NULL;
-	dlm_debug->d_locking_filter = NULL;
 	dlm_debug->d_filter_secs = 0;
 out:
 	return dlm_debug;
@@ -3282,27 +3277,19 @@
 {
 	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
 
-	dlm_debug->d_locking_state = debugfs_create_file("locking_state",
-							 S_IFREG|S_IRUSR,
-							 osb->osb_debug_root,
-							 osb,
-							 &ocfs2_dlm_debug_fops);
+	debugfs_create_file("locking_state", S_IFREG|S_IRUSR,
+			    osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops);
 
-	dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter",
-						0600,
-						osb->osb_debug_root,
-						&dlm_debug->d_filter_secs);
+	debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root,
+			   &dlm_debug->d_filter_secs);
 }
 
 static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
 {
 	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
 
-	if (dlm_debug) {
-		debugfs_remove(dlm_debug->d_locking_state);
-		debugfs_remove(dlm_debug->d_locking_filter);
+	if (dlm_debug)
 		ocfs2_put_dlm_debug(dlm_debug);
-	}
 }
 
 int ocfs2_dlm_init(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index e66a249..e3e2d1b 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -590,8 +590,7 @@
 			*extent_flags = rec->e_flags;
 	}
 out:
-	if (eb_bh)
-		brelse(eb_bh);
+	brelse(eb_bh);
 	return ret;
 }
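
This cleanup, like the ones in dlmglue.c above, relies on brelse() being a no-op when handed NULL, much like kfree(), so the if (bh) guard is dead weight. A one-line sketch:

#include <linux/buffer_head.h>

static void demo_put_bh(struct buffer_head *bh)
{
	brelse(bh);	/* brelse(NULL) is safe; no guard needed */
}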
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4435df3..2e982db 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -706,7 +706,9 @@
  * Thus, we need to explicitly order the zeroed pages.
  */
 static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
-						struct buffer_head *di_bh)
+						      struct buffer_head *di_bh,
+						      loff_t start_byte,
+						      loff_t length)
 {
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	handle_t *handle = NULL;
@@ -722,7 +724,7 @@
 		goto out;
 	}
 
-	ret = ocfs2_jbd2_file_inode(handle, inode);
+	ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -761,7 +763,9 @@
 	BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
 	BUG_ON(abs_from & (inode->i_blkbits - 1));
 
-	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
+	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh,
+						      abs_from,
+						      abs_to - abs_from);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		goto out;
@@ -2126,7 +2130,6 @@
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = d_inode(dentry);
 	struct buffer_head *di_bh = NULL;
-	loff_t end;
 
 	/*
 	 * We start with a read level meta lock and only jump to an ex
@@ -2190,8 +2193,6 @@
 			}
 		}
 
-		end = pos + count;
-
 		ret = ocfs2_check_range_for_refcount(inode, pos, count);
 		if (ret == 1) {
 			ocfs2_inode_unlock(inode, meta_level);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7ad9d65..7c9dfd5 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -534,7 +534,7 @@
 	 */
 	mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) !=
 			!!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE),
-			"Inode %llu: system file state is ambigous\n",
+			"Inode %llu: system file state is ambiguous\n",
 			(unsigned long long)args->fi_blkno);
 
 	if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index c0fe6ed..3103ba7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -144,7 +144,6 @@
 void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
 void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
 void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
-void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
 
 void ocfs2_complete_recovery(struct work_struct *work);
 void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
@@ -232,8 +231,8 @@
  *                          ocfs2_journal_access_*() unless you intend to
  *                          manage the checksum by hand.
  *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
- *  ocfs2_jbd2_file_inode  - Mark an inode so that its data goes out before
- *                           the current handle commits.
+ *  ocfs2_jbd2_inode_add_write  - Mark an inode with range so that its data goes
+ *                                out before the current handle commits.
  */
 
 /* You must always start_trans with a number of buffs > 0, but it's
@@ -441,7 +440,7 @@
  * previous dirblock update in the free list */
 static inline int ocfs2_link_credits(struct super_block *sb)
 {
-	return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
+	return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 +
 	       ocfs2_quota_trans_credits(sb);
 }
 
@@ -575,37 +574,12 @@
 	return ocfs2_extent_recs_per_gd(sb);
 }
 
-static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
-						unsigned int clusters_to_del,
-						struct ocfs2_dinode *fe,
-						struct ocfs2_extent_list *last_el)
+static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode,
+					     loff_t start_byte, loff_t length)
 {
- 	/* for dinode + all headers in this pass + update to next leaf */
-	u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
-	u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
-	int credits = 1 + tree_depth + 1;
-	int i;
-
-	i = next_free - 1;
-	BUG_ON(i < 0);
-
-	/* We may be deleting metadata blocks, so metadata alloc dinode +
-	   one desc. block for each possible delete. */
-	if (tree_depth && next_free == 1 &&
-	    ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
-		credits += 1 + tree_depth;
-
-	/* update to the truncate log. */
-	credits += OCFS2_TRUNCATE_LOG_UPDATE;
-
-	credits += ocfs2_quota_trans_credits(sb);
-
-	return credits;
-}
-
-static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
-{
-	return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode);
+	return jbd2_journal_inode_ranged_write(handle,
+					       &OCFS2_I(inode)->ip_jinode,
+					       start_byte, length);
 }
 
 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 6f8e1c4..8ea51cf 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2486,7 +2486,6 @@
 	struct inode *inode = NULL;
 	struct inode *orphan_dir = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
-	struct ocfs2_dinode *di = NULL;
 	handle_t *handle = NULL;
 	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
 	struct buffer_head *parent_di_bh = NULL;
@@ -2552,7 +2551,6 @@
 		goto leave;
 	}
 
-	di = (struct ocfs2_dinode *)new_di_bh->b_data;
 	status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
 				  &orphan_insert, orphan_dir, false);
 	if (status < 0) {
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index fddbbd6..9150cfa 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -223,8 +223,6 @@
 
 struct ocfs2_dlm_debug {
 	struct kref d_refcnt;
-	struct dentry *d_locking_state;
-	struct dentry *d_locking_filter;
 	u32 d_filter_secs;
 	struct list_head d_lockres_tracking;
 };
@@ -401,7 +399,6 @@
 	struct ocfs2_dlm_debug *osb_dlm_debug;
 
 	struct dentry *osb_debug_root;
-	struct dentry *osb_ctxt;
 
 	wait_queue_head_t recovery_event;
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8b2f395..c81e86c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1080,10 +1080,8 @@
 	osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
 						 ocfs2_debugfs_root);
 
-	osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
-					    osb->osb_debug_root,
-					    osb,
-					    &ocfs2_osb_debug_fops);
+	debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root,
+			    osb, &ocfs2_osb_debug_fops);
 
 	if (ocfs2_meta_ecc(osb))
 		ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats,
@@ -1861,8 +1859,6 @@
 
 	kset_unregister(osb->osb_dev_kset);
 
-	debugfs_remove(osb->osb_ctxt);
-
 	/* Orphan scan should be stopped as early as possible */
 	ocfs2_orphan_scan_stop(osb);
 
@@ -1918,7 +1914,7 @@
 		ocfs2_dlm_shutdown(osb, hangup_needed);
 
 	ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
-	debugfs_remove(osb->osb_debug_root);
+	debugfs_remove_recursive(osb->osb_debug_root);
 
 	if (hangup_needed)
 		ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
diff --git a/fs/open.c b/fs/open.c
index a59abe3..b62f5c0 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -776,7 +776,7 @@
 		f->f_mode |= FMODE_ATOMIC_POS;
 
 	f->f_op = fops_get(inode->i_fop);
-	if (unlikely(WARN_ON(!f->f_op))) {
+	if (WARN_ON(!f->f_op)) {
 		error = -ENODEV;
 		goto cleanup_all;
 	}
@@ -818,6 +818,14 @@
 		if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
 			return -EINVAL;
 	}
+
+	/*
+	 * XXX: Huge page cache doesn't support writing yet. Drop all page
+	 * cache for this file before processing writes.
+	 */
+	if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping))
+		truncate_pagecache(inode, 0);
+
 	return 0;
 
 cleanup_all:
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index f583448..e2ed8e0 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -31,6 +31,7 @@
 #include <linux/ioport.h>
 #include <linux/memory.h>
 #include <linux/sched/task.h>
+#include <linux/security.h>
 #include <asm/sections.h>
 #include "internal.h"
 
@@ -545,9 +546,14 @@
 
 static int open_kcore(struct inode *inode, struct file *filp)
 {
+	int ret = security_locked_down(LOCKDOWN_KCORE);
+
 	if (!capable(CAP_SYS_RAWIO))
 		return -EPERM;
 
+	if (ret)
+		return ret;
+
 	filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!filp->private_data)
 		return -ENOMEM;
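
security_locked_down() is the kernel-lockdown LSM hook: it returns 0 when the operation named by the LOCKDOWN_* constant is permitted and a negative errno (typically -EPERM) when the kernel is locked down. A condensed sketch of the gate added above, with the buffer allocation omitted:

#include <linux/security.h>
#include <linux/capability.h>
#include <linux/errno.h>

static int demo_open_gate(void)
{
	int ret = security_locked_down(LOCKDOWN_KCORE);

	if (!capable(CAP_SYS_RAWIO))	/* capability failure wins */
		return -EPERM;
	return ret;			/* 0 unless locked down */
}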
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 465ea01..ac92473 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -8,7 +8,6 @@
 #include <linux/mmzone.h>
 #include <linux/proc_fs.h>
 #include <linux/percpu.h>
-#include <linux/quicklist.h>
 #include <linux/seq_file.h>
 #include <linux/swap.h>
 #include <linux/vmstat.h>
@@ -106,9 +105,6 @@
 		   global_zone_page_state(NR_KERNEL_STACK_KB));
 	show_val_kb(m, "PageTables:     ",
 		    global_zone_page_state(NR_PAGETABLE));
-#ifdef CONFIG_QUICKLIST
-	show_val_kb(m, "Quicklists:     ", quicklist_total_size());
-#endif
 
 	show_val_kb(m, "NFS_Unstable:   ",
 		    global_node_page_state(NR_UNSTABLE_NFS));
@@ -136,6 +132,10 @@
 		    global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
 	show_val_kb(m, "ShmemPmdMapped: ",
 		    global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
+	show_val_kb(m, "FileHugePages: ",
+		    global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
+	show_val_kb(m, "FilePmdMapped: ",
+		    global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
 #endif
 
 #ifdef CONFIG_CMA
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index bf43d1d..9442631 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -417,6 +417,7 @@
 	unsigned long lazyfree;
 	unsigned long anonymous_thp;
 	unsigned long shmem_thp;
+	unsigned long file_thp;
 	unsigned long swap;
 	unsigned long shared_hugetlb;
 	unsigned long private_hugetlb;
@@ -461,7 +462,7 @@
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
 		bool compound, bool young, bool dirty, bool locked)
 {
-	int i, nr = compound ? 1 << compound_order(page) : 1;
+	int i, nr = compound ? compound_nr(page) : 1;
 	unsigned long size = nr * PAGE_SIZE;
 
 	/*
@@ -588,7 +589,7 @@
 	else if (is_zone_device_page(page))
 		/* pass */;
 	else
-		VM_BUG_ON_PAGE(1, page);
+		mss->file_thp += HPAGE_PMD_SIZE;
 	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
 }
 #else
@@ -809,6 +810,7 @@
 	SEQ_PUT_DEC(" kB\nLazyFree:       ", mss->lazyfree);
 	SEQ_PUT_DEC(" kB\nAnonHugePages:  ", mss->anonymous_thp);
 	SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
+	SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
 	SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
 	seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
 				  mss->private_hugetlb >> 10, 7);
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index e16fb8f..273ee82 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -88,7 +88,7 @@
 static void show_type(struct seq_file *m, struct super_block *sb)
 {
 	mangle(m, sb->s_type->name);
-	if (sb->s_subtype && sb->s_subtype[0]) {
+	if (sb->s_subtype) {
 		seq_putc(m, '.');
 		mangle(m, sb->s_subtype);
 	}
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 9c02d96..4075e41 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -239,10 +239,8 @@
 static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
 {
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-	int item_pos = PATH_LAST_POSITION(tb->tb_path);
 	struct buffer_info bi;
 	int n;
-	struct item_head *ih;
 
 	RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
 	       "vs- 12000: level: wrong FR %z", tb->FR[0]);
@@ -251,7 +249,6 @@
 	RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0),
 	       "PAP-12010: tree can not be empty");
 
-	ih = item_head(tbS0, item_pos);
 	buffer_info_init_tbS0(tb, &bi);
 
 	/* Delete or truncate the item */
@@ -298,7 +295,6 @@
 	if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
 		/* part of new item falls into L[0] */
 		int new_item_len, shift;
-		int version;
 
 		ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
 
@@ -317,8 +313,6 @@
 		leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
 			     min_t(int, tb->zeroes_num, ih_item_len(ih)));
 
-		version = ih_version(ih);
-
 		/*
 		 * Calculate key component, item length and body to
 		 * insert into S[0]
@@ -632,7 +626,6 @@
 	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
 	int n = B_NR_ITEMS(tbS0);
 	struct buffer_info bi;
-	int ret;
 
 	/* new item or part of it doesn't fall into R[0] */
 	if (n - tb->rnum[0] >= tb->item_pos) {
@@ -646,13 +639,11 @@
 	if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {
 		loff_t old_key_comp, old_len, r_zeroes_number;
 		const char *r_body;
-		int version, shift;
+		int shift;
 		loff_t offset;
 
 		leaf_shift_right(tb, tb->rnum[0] - 1, -1);
 
-		version = ih_version(ih);
-
 		/* Remember key component and item length */
 		old_key_comp = le_ih_k_offset(ih);
 		old_len = ih_item_len(ih);
@@ -698,7 +689,7 @@
 		/* whole new item falls into R[0] */
 
 		/* Shift rnum[0]-1 items to R[0] */
-		ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
+		leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
 
 		/* Insert new item into R[0] */
 		buffer_info_init_right(tb, &bi);
@@ -950,14 +941,12 @@
 	if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) {
 		int old_key_comp, old_len, r_zeroes_number;
 		const char *r_body;
-		int version;
 
 		/* Move snum[i]-1 items from S[0] to S_new[i] */
 		leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1,
 				tb->S_new[i]);
 
 		/* Remember key component and item length */
-		version = ih_version(ih);
 		old_key_comp = le_ih_k_offset(ih);
 		old_len = ih_item_len(ih);
 
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 6b0ddb2..1170922 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -376,7 +376,6 @@
 		       int to, int to_bytes, short *snum012, int flow)
 {
 	int i;
-	int cur_free;
 	int units;
 	struct virtual_node *vn = tb->tb_vn;
 	int total_node_size, max_node_size, current_item_size;
@@ -438,7 +437,6 @@
 	/* leaf level */
 	needed_nodes = 1;
 	total_node_size = 0;
-	cur_free = max_node_size;
 
 	/* start from 'from'-th item */
 	start_item = from;
@@ -1734,14 +1732,12 @@
 	 * and Fh is its father.
 	 */
 	struct buffer_head *Sh, *Fh;
-	int maxsize, ret;
+	int ret;
 	int lfree, rfree /* free space in L and R */ ;
 
 	Sh = PATH_H_PBUFFER(tb->tb_path, h);
 	Fh = PATH_H_PPARENT(tb->tb_path, h);
 
-	maxsize = MAX_CHILD_SIZE(Sh);
-
 	/*
 	 * using tb->insert_size[h], which is negative in this case,
 	 * create_virtual_node calculates:
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 4517a13..4b3e3e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -891,7 +891,6 @@
 	struct list_head *entry;
 	unsigned int trans_id = jl->j_trans_id;
 	unsigned int other_trans_id;
-	unsigned int first_trans_id;
 
 find_first:
 	/*
@@ -914,8 +913,6 @@
 		return 0;
 	}
 
-	first_trans_id = first_jl->j_trans_id;
-
 	entry = &first_jl->j_list;
 	while (1) {
 		other_jl = JOURNAL_LIST_ENTRY(entry);
@@ -1351,7 +1348,7 @@
 			      struct reiserfs_journal_list *jl, int flushall)
 {
 	struct reiserfs_journal_list *pjl;
-	struct reiserfs_journal_cnode *cn, *last;
+	struct reiserfs_journal_cnode *cn;
 	int count;
 	int was_jwait = 0;
 	int was_dirty = 0;
@@ -1509,7 +1506,6 @@
 					 b_blocknr, __func__);
 		}
 free_cnode:
-		last = cn;
 		cn = cn->next;
 		if (saved_bh) {
 			/*
@@ -1792,7 +1788,6 @@
 {
 	unsigned long len = 0;
 	unsigned long cur_len;
-	int ret;
 	int i;
 	int limit = 256;
 	struct reiserfs_journal_list *tjl;
@@ -1829,9 +1824,9 @@
 	 * transactions, but only bother if we've actually spanned
 	 * across multiple lists
 	 */
-	if (flush_jl != jl) {
-		ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
-	}
+	if (flush_jl != jl)
+		kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
+
 	flush_journal_list(s, flush_jl, 1);
 	put_journal_list(s, flush_jl);
 	put_journal_list(s, jl);
@@ -1911,7 +1906,6 @@
 			      struct super_block *sb, int error)
 {
 	struct reiserfs_transaction_handle myth;
-	int flushed = 0;
 	struct reiserfs_journal *journal = SB_JOURNAL(sb);
 
 	/*
@@ -1933,7 +1927,6 @@
 						     1);
 			journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
 			do_journal_end(&myth, FLUSH_ALL);
-			flushed = 1;
 		}
 	}
 
@@ -3444,9 +3437,8 @@
 	if (cn == journal->j_last) {
 		journal->j_last = cn->prev;
 	}
-	if (bh)
-		remove_journal_hash(sb, journal->j_hash_table, NULL,
-				    bh->b_blocknr, 0);
+	remove_journal_hash(sb, journal->j_hash_table, NULL,
+			    bh->b_blocknr, 0);
 	clear_buffer_journaled(bh);	/* don't log this one */
 
 	if (!already_cleaned) {
@@ -3988,7 +3980,6 @@
 	struct buffer_head *c_bh;	/* commit bh */
 	struct buffer_head *d_bh;	/* desc bh */
 	int cur_write_start = 0;	/* start index of current log write */
-	int old_start;
 	int i;
 	int flush;
 	int wait_on_commit;
@@ -4245,7 +4236,6 @@
 	journal->j_num_work_lists++;
 
 	/* reset journal values for the next transaction */
-	old_start = journal->j_start;
 	journal->j_start =
 	    (journal->j_start + journal->j_len +
 	     2) % SB_ONDISK_JOURNAL_SIZE(sb);
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index f5cebd7..7f86856 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -1322,7 +1322,7 @@
 	char *item;
 	struct reiserfs_de_head *deh;
 	char *insert_point;
-	int i, old_entry_num;
+	int i;
 	struct buffer_head *bh = bi->bi_bh;
 
 	if (new_entry_count == 0)
@@ -1362,7 +1362,6 @@
 		put_deh_location(&deh[i],
 				 deh_location(&deh[i]) + paste_size);
 
-	old_entry_num = ih_entry_count(ih);
 	put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count);
 
 	/* prepare space for pasted records */
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index 415d66c..34baf5c 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -183,13 +183,12 @@
 	int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2;
 	int old_max = sb_oid_maxsize(disk_sb);
 	struct reiserfs_super_block_v1 *disk_sb_v1;
-	__le32 *objectid_map, *new_objectid_map;
+	__le32 *objectid_map;
 	int i;
 
 	disk_sb_v1 =
 	    (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
 	objectid_map = (__le32 *) (disk_sb_v1 + 1);
-	new_objectid_map = (__le32 *) (disk_sb + 1);
 
 	if (cur_size > new_size) {
 		/*
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 9fed1c0..500f200 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -746,9 +746,6 @@
 
 static void check_internal_block_head(struct buffer_head *bh)
 {
-	struct block_head *blkh;
-
-	blkh = B_BLK_HEAD(bh);
 	if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT))
 		reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh);
 
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 0037aea..da9ebe3 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -593,7 +593,6 @@
 	struct buffer_head *bh;
 	struct path_element *last_element;
 	int node_level, retval;
-	int right_neighbor_of_leaf_node;
 	int fs_gen;
 	struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
 	b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA];
@@ -614,8 +613,6 @@
 
 	pathrelse(search_path);
 
-	right_neighbor_of_leaf_node = 0;
-
 	/*
 	 * With each iteration of this loop we search through the items in the
 	 * current node, and calculate the next current node(next path element)
@@ -701,7 +698,6 @@
 			 */
 			block_number = SB_ROOT_BLOCK(sb);
 			expected_level = -1;
-			right_neighbor_of_leaf_node = 0;
 
 			/* repeat search from the root */
 			continue;
diff --git a/fs/super.c b/fs/super.c
index 8020974..f627b7c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1555,11 +1555,6 @@
 	sb = fc->root->d_sb;
 	WARN_ON(!sb->s_bdi);
 
-	if (fc->subtype && !sb->s_subtype) {
-		sb->s_subtype = fc->subtype;
-		fc->subtype = NULL;
-	}
-
 	/*
 	 * Write barrier is for super_cache_count(). We place it before setting
 	 * SB_BORN as the data dependency between the two functions is the
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index eeeae04..9fc14e3 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -20,6 +20,7 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/security.h>
 
 #define TRACEFS_DEFAULT_MODE	0700
 
@@ -27,6 +28,25 @@
 static int tracefs_mount_count;
 static bool tracefs_registered;
 
+static int default_open_file(struct inode *inode, struct file *filp)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct file_operations *real_fops;
+	int ret;
+
+	if (!dentry)
+		return -EINVAL;
+
+	ret = security_locked_down(LOCKDOWN_TRACEFS);
+	if (ret)
+		return ret;
+
+	real_fops = dentry->d_fsdata;
+	if (!real_fops->open)
+		return 0;
+	return real_fops->open(inode, filp);
+}
+
 static ssize_t default_read_file(struct file *file, char __user *buf,
 				 size_t count, loff_t *ppos)
 {
@@ -221,6 +241,12 @@
 	return 0;
 }
 
+static void tracefs_destroy_inode(struct inode *inode)
+{
+	if (S_ISREG(inode->i_mode))
+		kfree(inode->i_fop);
+}
+
 static int tracefs_remount(struct super_block *sb, int *flags, char *data)
 {
 	int err;
@@ -257,6 +283,7 @@
 static const struct super_operations tracefs_super_operations = {
 	.statfs		= simple_statfs,
 	.remount_fs	= tracefs_remount,
+	.destroy_inode  = tracefs_destroy_inode,
 	.show_options	= tracefs_show_options,
 };
 
@@ -387,6 +414,7 @@
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops)
 {
+	struct file_operations *proxy_fops;
 	struct dentry *dentry;
 	struct inode *inode;
 
@@ -402,8 +430,20 @@
 	if (unlikely(!inode))
 		return failed_creating(dentry);
 
+	proxy_fops = kzalloc(sizeof(struct file_operations), GFP_KERNEL);
+	if (unlikely(!proxy_fops)) {
+		iput(inode);
+		return failed_creating(dentry);
+	}
+
+	if (!fops)
+		fops = &tracefs_file_operations;
+
+	dentry->d_fsdata = (void *)fops;
+	memcpy(proxy_fops, fops, sizeof(*proxy_fops));
+	proxy_fops->open = default_open_file;
 	inode->i_mode = mode;
-	inode->i_fop = fops ? fops : &tracefs_file_operations;
+	inode->i_fop = proxy_fops;
 	inode->i_private = data;
 	d_instantiate(dentry, inode);
 	fsnotify_create(dentry->d_parent->d_inode, dentry);
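
The tracefs change interposes on every file open: each file gets a heap-allocated copy of its file_operations whose ->open first consults the lockdown LSM and then chains to the real ->open stashed in d_fsdata, and the copy is freed from ->destroy_inode. A condensed sketch of that proxy pattern (error paths trimmed, names illustrative):

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/security.h>

static int proxied_open(struct inode *inode, struct file *filp)
{
	const struct file_operations *real_fops = filp->f_path.dentry->d_fsdata;
	int ret = security_locked_down(LOCKDOWN_TRACEFS);

	if (ret)
		return ret;
	return real_fops->open ? real_fops->open(inode, filp) : 0;
}

static struct file_operations *make_proxy(struct dentry *dentry,
					  const struct file_operations *fops)
{
	struct file_operations *proxy = kmemdup(fops, sizeof(*fops), GFP_KERNEL);

	if (!proxy)
		return NULL;
	dentry->d_fsdata = (void *)fops;	/* remember the real fops */
	proxy->open = proxied_open;
	return proxy;				/* freed in ->destroy_inode */
}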
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index fe6d804..f9fd186 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1272,21 +1272,23 @@
 }
 
 static __always_inline int validate_range(struct mm_struct *mm,
-					  __u64 start, __u64 len)
+					  __u64 *start, __u64 len)
 {
 	__u64 task_size = mm->task_size;
 
-	if (start & ~PAGE_MASK)
+	*start = untagged_addr(*start);
+
+	if (*start & ~PAGE_MASK)
 		return -EINVAL;
 	if (len & ~PAGE_MASK)
 		return -EINVAL;
 	if (!len)
 		return -EINVAL;
-	if (start < mmap_min_addr)
+	if (*start < mmap_min_addr)
 		return -EINVAL;
-	if (start >= task_size)
+	if (*start >= task_size)
 		return -EINVAL;
-	if (len > task_size - start)
+	if (len > task_size - *start)
 		return -EINVAL;
 	return 0;
 }
@@ -1336,7 +1338,7 @@
 		goto out;
 	}
 
-	ret = validate_range(mm, uffdio_register.range.start,
+	ret = validate_range(mm, &uffdio_register.range.start,
 			     uffdio_register.range.len);
 	if (ret)
 		goto out;
@@ -1525,7 +1527,7 @@
 	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
 		goto out;
 
-	ret = validate_range(mm, uffdio_unregister.start,
+	ret = validate_range(mm, &uffdio_unregister.start,
 			     uffdio_unregister.len);
 	if (ret)
 		goto out;
@@ -1676,7 +1678,7 @@
 	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
 		goto out;
 
-	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+	ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
 	if (ret)
 		goto out;
 
@@ -1716,7 +1718,7 @@
 			   sizeof(uffdio_copy)-sizeof(__s64)))
 		goto out;
 
-	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+	ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
 	if (ret)
 		goto out;
 	/*
@@ -1772,7 +1774,7 @@
 			   sizeof(uffdio_zeropage)-sizeof(__s64)))
 		goto out;
 
-	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
+	ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
 			     uffdio_zeropage.range.len);
 	if (ret)
 		goto out;
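
validate_range() now takes the start address by reference so that the untagged form is written back and every caller operates on it from then on; on architectures with pointer tagging (arm64 TBI, SPARC ADI) a user-supplied address may carry tag bits in its top byte, and untagged_addr() strips them (it is a no-op elsewhere). A small sketch of the sanitise-then-validate shape, with illustrative names:

#include <linux/mm.h>
#include <linux/errno.h>

static int demo_validate(struct mm_struct *mm, __u64 *start, __u64 len)
{
	*start = untagged_addr(*start);		/* strip tag bits in place */

	if ((*start & ~PAGE_MASK) || (len & ~PAGE_MASK) || !len)
		return -EINVAL;
	if (*start >= mm->task_size || len > mm->task_size - *start)
		return -EINVAL;
	return 0;
}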
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 58fa85c..d6ed5d2 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -81,9 +81,10 @@
 /*
  * Defines for datatype
  */
-#define XFS_ALLOC_INITIAL_USER_DATA	(1 << 0)/* special case start of file */
-#define XFS_ALLOC_USERDATA_ZERO		(1 << 1)/* zero extent on allocation */
-#define XFS_ALLOC_NOBUSY		(1 << 2)/* Busy extents not allowed */
+#define XFS_ALLOC_USERDATA		(1 << 0)/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA	(1 << 1)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO		(1 << 2)/* zero extent on allocation */
+#define XFS_ALLOC_NOBUSY		(1 << 3)/* Busy extents not allowed */
 
 static inline bool
 xfs_alloc_is_userdata(int datatype)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 054b4ce..4edc25a 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4042,8 +4042,12 @@
 	 */
 	if (!(bma->flags & XFS_BMAPI_METADATA)) {
 		bma->datatype = XFS_ALLOC_NOBUSY;
-		if (whichfork == XFS_DATA_FORK && bma->offset == 0)
-			bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
+		if (whichfork == XFS_DATA_FORK) {
+			if (bma->offset == 0)
+				bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
+			else
+				bma->datatype |= XFS_ALLOC_USERDATA;
+		}
 		if (bma->flags & XFS_BMAPI_ZERO)
 			bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
 	}
@@ -5621,6 +5625,11 @@
 	if (error)
 		return error;
 
+	/* change to extent format if required after extent removal */
+	error = xfs_bmap_btree_to_extents(tp, ip, cur, logflags, whichfork);
+	if (error)
+		return error;
+
 done:
 	xfs_iext_remove(ip, icur, 0);
 	xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur);
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index a08dd8f..ac6cdca 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -928,7 +928,7 @@
 
 	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
-	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
+	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
 }
 
 /*
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index a43d181..5533e48 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -97,7 +97,6 @@
 	xfs_agnumber_t		agno = bs->cur->bc_private.a.agno;
 	xfs_agblock_t		bno;
 	xfs_extlen_t		len;
-	int			error = 0;
 
 	bno = be32_to_cpu(rec->alloc.ar_startblock);
 	len = be32_to_cpu(rec->alloc.ar_blockcount);
@@ -109,7 +108,7 @@
 
 	xchk_allocbt_xref(bs->sc, bno, len);
 
-	return error;
+	return 0;
 }
 
 /* Scrub the freespace btrees for some AG. */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 120ef99..21c2436 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -2097,7 +2097,7 @@
 	int			idx;
 
 	idx = xfs_sb_version_hascrc(&mp->m_sb);
-	if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
+	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
 		return false;
 	return dmagic == bp->b_ops->magic[idx];
 }
@@ -2115,7 +2115,7 @@
 	int			idx;
 
 	idx = xfs_sb_version_hascrc(&mp->m_sb);
-	if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
+	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
 		return false;
 	return dmagic == bp->b_ops->magic16[idx];
 }
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index d952d59..1ffb179 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -370,21 +370,23 @@
 xfs_dio_write_end_io(
 	struct kiocb		*iocb,
 	ssize_t			size,
+	int			error,
 	unsigned		flags)
 {
 	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
 	loff_t			offset = iocb->ki_pos;
 	unsigned int		nofs_flag;
-	int			error = 0;
 
 	trace_xfs_end_io_direct_write(ip, offset, size);
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if (size <= 0)
-		return size;
+	if (error)
+		return error;
+	if (!size)
+		return 0;
 
 	/*
 	 * Capture amount written on completion as we can't reliably account
@@ -441,6 +443,10 @@
 	return error;
 }
 
+static const struct iomap_dio_ops xfs_dio_write_ops = {
+	.end_io		= xfs_dio_write_end_io,
+};
+
 /*
  * xfs_file_dio_aio_write - handle direct IO writes
  *
@@ -541,7 +547,7 @@
 	}
 
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
-	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
+	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops);
 
 	/*
 	 * If unaligned, this is the only IO in-flight. If it has not yet
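
iomap's direct-I/O entry point now takes an ops structure rather than a bare completion callback, and the end_io hook receives the I/O error as an explicit argument instead of encoding it in the size. A sketch of the 5.4-era shape:

#include <linux/iomap.h>

static int demo_dio_end_io(struct kiocb *iocb, ssize_t size, int error,
			   unsigned int flags)
{
	if (error)
		return error;	/* the error now arrives separately */
	/* ... account for 'size' bytes of completed I/O ... */
	return 0;
}

static const struct iomap_dio_ops demo_dio_ops = {
	.end_io = demo_dio_end_io,
};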
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index ddd0bf7..f1bc88f 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -63,19 +63,6 @@
 	.store = xfs_sysfs_object_store,
 };
 
-/*
- * xfs_mount kobject. The mp kobject also serves as the per-mount parent object
- * that is identified by the fsname under sysfs.
- */
-
-static inline struct xfs_mount *
-to_mp(struct kobject *kobject)
-{
-	struct xfs_kobj *kobj = to_kobj(kobject);
-
-	return container_of(kobj, struct xfs_mount, m_kobj);
-}
-
 static struct attribute *xfs_mp_attrs[] = {
 	NULL,
 };
diff --git a/include/Kbuild b/include/Kbuild
index 4ae65e1..ffba794 100644
--- a/include/Kbuild
+++ b/include/Kbuild
@@ -312,7 +312,6 @@
 header-test-			+= linux/mfd/as3722.h
 header-test-			+= linux/mfd/da903x.h
 header-test-			+= linux/mfd/da9055/pdata.h
-header-test-			+= linux/mfd/da9063/pdata.h
 header-test-			+= linux/mfd/db8500-prcmu.h
 header-test-			+= linux/mfd/dbx500-prcmu.h
 header-test-			+= linux/mfd/dln2.h
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 7357a3c..384b5c8 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -10,6 +10,7 @@
 #define BUGFLAG_WARNING		(1 << 0)
 #define BUGFLAG_ONCE		(1 << 1)
 #define BUGFLAG_DONE		(1 << 2)
+#define BUGFLAG_NO_CUT_HERE	(1 << 3)	/* CUT_HERE already sent */
 #define BUGFLAG_TAINT(taint)	((taint) << 8)
 #define BUG_GET_TAINT(bug)	((bug)->flags >> 8)
 #endif
@@ -61,18 +62,6 @@
 #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0)
 #endif
 
-#ifdef __WARN_FLAGS
-#define __WARN_TAINT(taint)		__WARN_FLAGS(BUGFLAG_TAINT(taint))
-#define __WARN_ONCE_TAINT(taint)	__WARN_FLAGS(BUGFLAG_ONCE|BUGFLAG_TAINT(taint))
-
-#define WARN_ON_ONCE(condition) ({				\
-	int __ret_warn_on = !!(condition);			\
-	if (unlikely(__ret_warn_on))				\
-		__WARN_ONCE_TAINT(TAINT_WARN);			\
-	unlikely(__ret_warn_on);				\
-})
-#endif
-
 /*
  * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report
  * significant kernel issues that need prompt attention if they should ever
@@ -89,27 +78,27 @@
  *
  * Use the versions with printk format strings to provide better diagnostics.
  */
-#ifndef __WARN_TAINT
-extern __printf(3, 4)
-void warn_slowpath_fmt(const char *file, const int line,
-		       const char *fmt, ...);
+#ifndef __WARN_FLAGS
 extern __printf(4, 5)
-void warn_slowpath_fmt_taint(const char *file, const int line, unsigned taint,
-			     const char *fmt, ...);
-extern void warn_slowpath_null(const char *file, const int line);
-#define WANT_WARN_ON_SLOWPATH
-#define __WARN()		warn_slowpath_null(__FILE__, __LINE__)
-#define __WARN_printf(arg...)	warn_slowpath_fmt(__FILE__, __LINE__, arg)
-#define __WARN_printf_taint(taint, arg...)				\
-	warn_slowpath_fmt_taint(__FILE__, __LINE__, taint, arg)
+void warn_slowpath_fmt(const char *file, const int line, unsigned taint,
+		       const char *fmt, ...);
+#define __WARN()		__WARN_printf(TAINT_WARN, NULL)
+#define __WARN_printf(taint, arg...)					\
+	warn_slowpath_fmt(__FILE__, __LINE__, taint, arg)
 #else
 extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
-#define __WARN() do { \
-	printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \
-} while (0)
-#define __WARN_printf(arg...)	__WARN_printf_taint(TAINT_WARN, arg)
-#define __WARN_printf_taint(taint, arg...)				\
-	do { __warn_printk(arg); __WARN_TAINT(taint); } while (0)
+#define __WARN()		__WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN))
+#define __WARN_printf(taint, arg...) do {				\
+		__warn_printk(arg);					\
+		__WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\
+	} while (0)
+#define WARN_ON_ONCE(condition) ({				\
+	int __ret_warn_on = !!(condition);			\
+	if (unlikely(__ret_warn_on))				\
+		__WARN_FLAGS(BUGFLAG_ONCE |			\
+			     BUGFLAG_TAINT(TAINT_WARN));	\
+	unlikely(__ret_warn_on);				\
+})
 #endif
 
 /* used internally by panic.c */
@@ -132,7 +121,7 @@
 #define WARN(condition, format...) ({					\
 	int __ret_warn_on = !!(condition);				\
 	if (unlikely(__ret_warn_on))					\
-		__WARN_printf(format);					\
+		__WARN_printf(TAINT_WARN, format);			\
 	unlikely(__ret_warn_on);					\
 })
 #endif
@@ -140,7 +129,7 @@
 #define WARN_TAINT(condition, taint, format...) ({			\
 	int __ret_warn_on = !!(condition);				\
 	if (unlikely(__ret_warn_on))					\
-		__WARN_printf_taint(taint, format);			\
+		__WARN_printf(taint, format);				\
 	unlikely(__ret_warn_on);					\
 })
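
Whichever branch defines it, WARN_ON_ONCE() keeps the same contract: the condition is evaluated once per call, the warning (and TAINT_WARN taint) fires only the first time it is true, and the truth value is returned so the macro can sit directly in an if. A usage sketch:

#include <linux/bug.h>
#include <linux/errno.h>

static int demo_check(void *ptr)
{
	/* warns once, but the branch is taken on every violation */
	if (WARN_ON_ONCE(!ptr))
		return -EINVAL;
	return 0;
}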
 
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 8476175..73f7421 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -49,7 +49,7 @@
  * @mm: the mm_struct of the current context
  * @gfp: GFP flags to use for the allocation
  *
- * Allocates a page and runs the pgtable_page_ctor().
+ * Allocates a page and runs the pgtable_pte_page_ctor().
  *
  * This function is intended for architectures that need
  * anything beyond simple page allocation or must have custom GFP flags.
@@ -63,7 +63,7 @@
 	pte = alloc_page(gfp);
 	if (!pte)
 		return NULL;
-	if (!pgtable_page_ctor(pte)) {
+	if (!pgtable_pte_page_ctor(pte)) {
 		__free_page(pte);
 		return NULL;
 	}
@@ -76,7 +76,7 @@
  * pte_alloc_one - allocate a page for PTE-level user page table
  * @mm: the mm_struct of the current context
  *
- * Allocates a page and runs the pgtable_page_ctor().
+ * Allocates a page and runs the pgtable_pte_page_ctor().
  *
  * Return: `struct page` initialized as page table or %NULL on error
  */
@@ -98,15 +98,10 @@
  */
 static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
 {
-	pgtable_page_dtor(pte_page);
+	pgtable_pte_page_dtor(pte_page);
 	__free_page(pte_page);
 }
 
-#else /* CONFIG_MMU */
-
-/* This is enough for a nommu architecture */
-#define check_pgt_cache()          do { } while (0)
-
 #endif /* CONFIG_MMU */
 
 #endif /* __ASM_GENERIC_PGALLOC_H */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 75d9d68..8186918 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1002,9 +1002,8 @@
  * need this). If THP is not enabled, the pmd can't go away under the
  * code even if MADV_DONTNEED runs, but if THP is enabled we need to
  * run a pmd_trans_unstable before walking the ptes after
- * split_huge_page_pmd returns (because it may have run when the pmd
- * become null, but then a page fault can map in a THP and not a
- * regular page).
+ * split_huge_pmd returns (because it may have run when the pmd became
+ * null, but then a page fault can map in a THP and not a regular page).
  */
 static inline int pmd_trans_unstable(pmd_t *pmd)
 {
@@ -1126,7 +1125,7 @@
 static inline void init_espfix_bsp(void) { }
 #endif
 
-extern void __init pgd_cache_init(void);
+extern void __init pgtable_cache_init(void);
 
 #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
 static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index cd28f63..dae6460 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -215,8 +215,13 @@
 			__start_lsm_info = .;				\
 			KEEP(*(.lsm_info.init))				\
 			__end_lsm_info = .;
+#define EARLY_LSM_TABLE()	. = ALIGN(8);				\
+			__start_early_lsm_info = .;			\
+			KEEP(*(.early_lsm_info.init))			\
+			__end_early_lsm_info = .;
 #else
 #define LSM_TABLE()
+#define EARLY_LSM_TABLE()
 #endif
 
 #define ___OF_TABLE(cfg, name)	_OF_TABLE_##cfg(name)
@@ -627,7 +632,8 @@
 	ACPI_PROBE_TABLE(timer)						\
 	THERMAL_TABLE(governor)						\
 	EARLYCON_TABLE()						\
-	LSM_TABLE()
+	LSM_TABLE()							\
+	EARLY_LSM_TABLE()
 
 #define INIT_TEXT							\
 	*(.init.text .init.text.*)					\
diff --git a/include/crypto/pkcs7.h b/include/crypto/pkcs7.h
index 96071be..38ec7f5 100644
--- a/include/crypto/pkcs7.h
+++ b/include/crypto/pkcs7.h
@@ -9,6 +9,7 @@
 #define _CRYPTO_PKCS7_H
 
 #include <linux/verification.h>
+#include <linux/hash_info.h>
 #include <crypto/public_key.h>
 
 struct key;
@@ -40,4 +41,7 @@
 extern int pkcs7_supply_detached_data(struct pkcs7_message *pkcs7,
 				      const void *data, size_t datalen);
 
+extern int pkcs7_get_digest(struct pkcs7_message *pkcs7, const u8 **buf,
+			    u32 *len, enum hash_algo *hash_algo);
+
 #endif /* _CRYPTO_PKCS7_H */
diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h
index 7d14c11..408b6f4 100644
--- a/include/drm/drm_crtc.h
+++ b/include/drm/drm_crtc.h
@@ -285,12 +285,12 @@
 	u32 target_vblank;
 
 	/**
-	 * @pageflip_flags:
+	 * @async_flip:
 	 *
-	 * DRM_MODE_PAGE_FLIP_* flags, as passed to the page flip ioctl.
-	 * Zero in any other case.
+	 * This is set when DRM_MODE_PAGE_FLIP_ASYNC is set in the legacy
+	 * PAGE_FLIP IOCTL. It's not wired up for the atomic IOCTL itself yet.
 	 */
-	u32 pageflip_flags;
+	bool async_flip;
 
 	/**
 	 * @vrr_enabled:
@@ -1108,7 +1108,7 @@
 	/**
 	 * @self_refresh_data: Holds the state for the self refresh helpers
 	 *
-	 * Initialized via drm_self_refresh_helper_register().
+	 * Initialized via drm_self_refresh_helper_init().
 	 */
 	struct drm_self_refresh_data *self_refresh_data;
 };
diff --git a/include/drm/drm_self_refresh_helper.h b/include/drm/drm_self_refresh_helper.h
index 397a583..5b79d25 100644
--- a/include/drm/drm_self_refresh_helper.h
+++ b/include/drm/drm_self_refresh_helper.h
@@ -12,9 +12,9 @@
 struct drm_crtc;
 
 void drm_self_refresh_helper_alter_state(struct drm_atomic_state *state);
+void drm_self_refresh_helper_update_avg_times(struct drm_atomic_state *state,
+					      unsigned int commit_time_ms);
 
-int drm_self_refresh_helper_init(struct drm_crtc *crtc,
-				 unsigned int entry_delay_ms);
-
+int drm_self_refresh_helper_init(struct drm_crtc *crtc);
 void drm_self_refresh_helper_cleanup(struct drm_crtc *crtc);
 #endif
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 978cc23..8b4e516 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -643,6 +643,12 @@
 int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count);
 #endif
 
+#ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER
+static inline void acpi_arch_set_root_pointer(u64 addr)
+{
+}
+#endif
+
 #ifndef ACPI_HAVE_ARCH_GET_ROOT_POINTER
 static inline u64 acpi_arch_get_root_pointer(void)
 {
diff --git a/include/linux/backlight.h b/include/linux/backlight.h
index 0b589744..c7d6b2e 100644
--- a/include/linux/backlight.h
+++ b/include/linux/backlight.h
@@ -46,6 +46,12 @@
 	BACKLIGHT_UNREGISTERED,
 };
 
+enum backlight_scale {
+	BACKLIGHT_SCALE_UNKNOWN = 0,
+	BACKLIGHT_SCALE_LINEAR,
+	BACKLIGHT_SCALE_NON_LINEAR,
+};
+
 struct backlight_device;
 struct fb_info;
 
@@ -80,6 +86,8 @@
 	enum backlight_type type;
 	/* Flags used to signal drivers of state changes */
 	unsigned int state;
+	/* Type of the brightness scale (linear, non-linear, ...) */
+	enum backlight_scale scale;
 
 #define BL_CORE_SUSPENDED	(1 << 0)	/* backlight is suspended */
 #define BL_CORE_FBBLANK		(1 << 1)	/* backlight is under an fb blank event */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d9db32f..f3ea78b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1524,10 +1524,14 @@
 };
 
 typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *);
+typedef void (integrity_prepare_fn) (struct request *);
+typedef void (integrity_complete_fn) (struct request *, unsigned int);
 
 struct blk_integrity_profile {
 	integrity_processing_fn		*generate_fn;
 	integrity_processing_fn		*verify_fn;
+	integrity_prepare_fn		*prepare_fn;
+	integrity_complete_fn		*complete_fn;
 	const char			*name;
 };
 
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 82156da..b9dbda1 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -293,6 +293,7 @@
 struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client);
 u64 ceph_client_gid(struct ceph_client *client);
 extern void ceph_destroy_client(struct ceph_client *client);
+extern void ceph_reset_client_addr(struct ceph_client *client);
 extern int __ceph_open_session(struct ceph_client *client,
 			       unsigned long started);
 extern int ceph_open_session(struct ceph_client *client);
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 23895d1..c4458dc 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -337,6 +337,7 @@
 extern void ceph_messenger_init(struct ceph_messenger *msgr,
 				struct ceph_entity_addr *myaddr);
 extern void ceph_messenger_fini(struct ceph_messenger *msgr);
+extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr);
 
 extern void ceph_con_init(struct ceph_connection *con, void *private,
 			const struct ceph_connection_operations *ops,
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index b4d134d..dbb8a69 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -109,6 +109,7 @@
 
 extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
 extern void ceph_monc_stop(struct ceph_mon_client *monc);
+extern void ceph_monc_reopen_session(struct ceph_mon_client *monc);
 
 enum {
 	CEPH_SUB_MONMAP = 0,
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index ad7fe5d..eaffbdd 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -381,6 +381,7 @@
 extern int ceph_osdc_init(struct ceph_osd_client *osdc,
 			  struct ceph_client *client);
 extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
+extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc);
 
 extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
 				   struct ceph_msg *msg);
@@ -388,6 +389,7 @@
 				 struct ceph_msg *msg);
 void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
 void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err);
+void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc);
 
 #define osd_req_op_data(oreq, whch, typ, fld)				\
 ({									\
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 9569e7c..4b898cd 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -129,11 +129,8 @@
 	return false;
 }
 
-/*
- * Compaction  has backed off for some reason. It might be throttling or
- * lock contention. Retrying is still worthwhile.
- */
-static inline bool compaction_withdrawn(enum compact_result result)
+/* Compaction needs reclaim to be performed first, so it can continue. */
+static inline bool compaction_needs_reclaim(enum compact_result result)
 {
 	/*
 	 * Compaction backed off due to watermark checks for order-0
@@ -142,6 +139,16 @@
 	if (result == COMPACT_SKIPPED)
 		return true;
 
+	return false;
+}
+
+/*
+ * Compaction has backed off for some reason after doing some work or none
+ * at all. It might be throttling or lock contention. Retrying might still
+ * be worthwhile, but with a higher priority if allowed.
+ */
+static inline bool compaction_withdrawn(enum compact_result result)
+{
 	/*
 	 * If compaction is deferred for high-order allocations, it is
 	 * because sync compaction recently failed. If this is the case
@@ -207,6 +214,11 @@
 	return false;
 }
 
+static inline bool compaction_needs_reclaim(enum compact_result result)
+{
+	return false;
+}
+
 static inline bool compaction_withdrawn(enum compact_result result)
 {
 	return true;
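
Splitting compaction_needs_reclaim() out of compaction_withdrawn() lets the
allocator distinguish "compaction needs free pages first" from "compaction
backed off, retry at higher priority". Sketch of the consumer pattern (the
real user is should_compact_retry() in mm/page_alloc.c; the helper names
here are illustrative):

	if (compaction_needs_reclaim(result)) {
		/* COMPACT_SKIPPED: reclaim some order-0 pages, then retry. */
		run_reclaim_then_retry();
	} else if (compaction_withdrawn(result)) {
		/* Throttling or contention: retry, raising priority if allowed. */
		retry_with_higher_priority();
	}
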
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 88dc0c6..d0633eb 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -201,12 +201,14 @@
 extern enum cpuhp_smt_control cpu_smt_control;
 extern void cpu_smt_disable(bool force);
 extern void cpu_smt_check_topology(void);
+extern bool cpu_smt_possible(void);
 extern int cpuhp_smt_enable(void);
 extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval);
 #else
 # define cpu_smt_control		(CPU_SMT_NOT_IMPLEMENTED)
 static inline void cpu_smt_disable(bool force) { }
 static inline void cpu_smt_check_topology(void) { }
+static inline bool cpu_smt_possible(void) { return false; }
 static inline int cpuhp_smt_enable(void) { return 0; }
 static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; }
 #endif
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index b5a5a1e..78a73eb 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -200,8 +200,8 @@
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_wrap(cpu, mask, start)	\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start))
-#define for_each_cpu_and(cpu, mask, and)	\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and)
+#define for_each_cpu_and(cpu, mask1, mask2)	\
+	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2)
 #else
 /**
  * cpumask_first - get the first cpu in a cpumask
@@ -290,20 +290,20 @@
 /**
  * for_each_cpu_and - iterate over every cpu in both masks
  * @cpu: the (optionally unsigned) integer iterator
- * @mask: the first cpumask pointer
- * @and: the second cpumask pointer
+ * @mask1: the first cpumask pointer
+ * @mask2: the second cpumask pointer
  *
  * This saves a temporary CPU mask in many places.  It is equivalent to:
  *	struct cpumask tmp;
- *	cpumask_and(&tmp, &mask, &and);
+ *	cpumask_and(&tmp, &mask1, &mask2);
  *	for_each_cpu(cpu, &tmp)
  *		...
  *
  * After the loop, cpu is >= nr_cpu_ids.
  */
-#define for_each_cpu_and(cpu, mask, and)				\
+#define for_each_cpu_and(cpu, mask1, mask2)				\
 	for ((cpu) = -1;						\
-		(cpu) = cpumask_next_and((cpu), (mask), (and)),		\
+		(cpu) = cpumask_next_and((cpu), (mask1), (mask2)),	\
 		(cpu) < nr_cpu_ids;)
 #endif /* SMP */
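
The rename from (mask, and) to (mask1, mask2) is purely cosmetic -- 'and'
read like an operator. Usage is unchanged; a small example:

	unsigned int cpu;

	/* Walk CPUs present in both masks without a temporary cpumask: */
	for_each_cpu_and(cpu, cpu_online_mask, cpumask_of_node(nid))
		pr_info("node %d has online CPU %u\n", nid, cpu);
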
 
diff --git a/include/linux/cred.h b/include/linux/cred.h
index f7a30e0..18639c0 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -386,7 +386,6 @@
 #define current_fsgid() 	(current_cred_xxx(fsgid))
 #define current_cap()		(current_cred_xxx(cap_effective))
 #define current_user()		(current_cred_xxx(user))
-#define current_security()	(current_cred_xxx(security))
 
 extern struct user_namespace init_user_ns;
 #ifdef CONFIG_USER_NS
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 866268c..e0d909d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -429,6 +429,7 @@
  * @i_pages: Cached pages.
  * @gfp_mask: Memory allocation flags to use for allocating pages.
  * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @nr_thps: Number of THPs in the pagecache (non-shmem only).
  * @i_mmap: Tree of private and shared mappings.
  * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
  * @nrpages: Number of page entries, protected by the i_pages lock.
@@ -446,6 +447,10 @@
 	struct xarray		i_pages;
 	gfp_t			gfp_mask;
 	atomic_t		i_mmap_writable;
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+	/* number of thp, only for non-shmem files */
+	atomic_t		nr_thps;
+#endif
 	struct rb_root_cached	i_mmap;
 	struct rw_semaphore	i_mmap_rwsem;
 	unsigned long		nrpages;
@@ -1163,6 +1168,11 @@
 extern int generic_setlease(struct file *, long, struct file_lock **, void **priv);
 extern int vfs_setlease(struct file *, long, struct file_lock **, void **);
 extern int lease_modify(struct file_lock *, int, struct list_head *);
+
+struct notifier_block;
+extern int lease_register_notifier(struct notifier_block *);
+extern void lease_unregister_notifier(struct notifier_block *);
+
 struct files_struct;
 extern void show_fd_locks(struct seq_file *f,
 			 struct file *filp, struct files_struct *files);
@@ -2798,6 +2808,33 @@
 	return errseq_sample(&mapping->wb_err);
 }
 
+static inline int filemap_nr_thps(struct address_space *mapping)
+{
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+	return atomic_read(&mapping->nr_thps);
+#else
+	return 0;
+#endif
+}
+
+static inline void filemap_nr_thps_inc(struct address_space *mapping)
+{
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+	atomic_inc(&mapping->nr_thps);
+#else
+	WARN_ON_ONCE(1);
+#endif
+}
+
+static inline void filemap_nr_thps_dec(struct address_space *mapping)
+{
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+	atomic_dec(&mapping->nr_thps);
+#else
+	WARN_ON_ONCE(1);
+#endif
+}
+
 extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
 			   int datasync);
 extern int vfs_fsync(struct file *file, int datasync);
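
The nr_thps counter lets paths that cannot yet handle huge pages in the
page cache detect them cheaply. A hedged sketch of the intended check,
modeled on the open-for-write path (the exact call site is not part of
this hunk):

	/* Writes don't support THPs in the page cache yet: drop them. */
	if (filemap_nr_thps(inode->i_mapping))
		truncate_pagecache(inode, 0);
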
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 0424df7..e5c14e2 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -95,7 +95,6 @@
 	const struct cred	*cred;		/* The mounter's credentials */
 	struct fc_log		*log;		/* Logging buffer */
 	const char		*source;	/* The source name (eg. dev path) */
-	const char		*subtype;	/* The subtype to set on the superblock */
 	void			*security;	/* Linux S&M options */
 	void			*s_fs_info;	/* Proposed s_fs_info */
 	unsigned int		sb_flags;	/* Proposed superblock flags (SB_*) */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 2de3b2d..1915bdb 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -475,6 +475,8 @@
 extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
 /* free mark */
 extern void fsnotify_free_mark(struct fsnotify_mark *mark);
+/* Wait until all marks queued for destruction are destroyed */
+extern void fsnotify_wait_marks_destroyed(void);
 /* run all the marks in a group, and clear all of the marks attached to given object type */
 extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int type);
 /* run all the marks in a group, and clear all of the vfsmount marks */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f338816..fb07b50 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -510,18 +510,22 @@
 }
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 			struct vm_area_struct *vma, unsigned long addr,
-			int node);
+			int node, bool hugepage);
+#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
+	alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node, hugepage) \
+	alloc_pages(gfp_mask, order)
+#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
 	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 #define alloc_page_vma(gfp_mask, vma, addr)			\
-	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
+	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
 #define alloc_page_vma_node(gfp_mask, vma, addr, node)		\
-	alloc_pages_vma(gfp_mask, 0, vma, addr, node)
+	alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
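
alloc_pages_vma() gains an explicit hugepage hint so the NUMA policy code
can choose THP-friendly behaviour, and alloc_hugepage_vma() wraps the
common case. Sketch:

	struct page *page;

	/* Try an order-9 THP for this VMA, preferring the local node: */
	page = alloc_hugepage_vma(GFP_TRANSHUGE, vma, haddr, HPAGE_PMD_ORDER);
	if (!page)
		return VM_FAULT_FALLBACK;	/* caller falls back to base pages */
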
diff --git a/include/linux/hid.h b/include/linux/hid.h
index d770ab1..cd41f20 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -1154,29 +1154,32 @@
 #define hid_pidff_init NULL
 #endif
 
-#define dbg_hid(format, arg...)						\
+#define dbg_hid(fmt, ...)						\
 do {									\
 	if (hid_debug)							\
-		printk(KERN_DEBUG "%s: " format, __FILE__, ##arg);	\
+		printk(KERN_DEBUG "%s: " fmt, __FILE__, ##__VA_ARGS__);	\
 } while (0)
 
-#define hid_printk(level, hid, fmt, arg...)		\
-	dev_printk(level, &(hid)->dev, fmt, ##arg)
-#define hid_emerg(hid, fmt, arg...)			\
-	dev_emerg(&(hid)->dev, fmt, ##arg)
-#define hid_crit(hid, fmt, arg...)			\
-	dev_crit(&(hid)->dev, fmt, ##arg)
-#define hid_alert(hid, fmt, arg...)			\
-	dev_alert(&(hid)->dev, fmt, ##arg)
-#define hid_err(hid, fmt, arg...)			\
-	dev_err(&(hid)->dev, fmt, ##arg)
-#define hid_notice(hid, fmt, arg...)			\
-	dev_notice(&(hid)->dev, fmt, ##arg)
-#define hid_warn(hid, fmt, arg...)			\
-	dev_warn(&(hid)->dev, fmt, ##arg)
-#define hid_info(hid, fmt, arg...)			\
-	dev_info(&(hid)->dev, fmt, ##arg)
-#define hid_dbg(hid, fmt, arg...)			\
-	dev_dbg(&(hid)->dev, fmt, ##arg)
+#define hid_err(hid, fmt, ...)				\
+	dev_err(&(hid)->dev, fmt, ##__VA_ARGS__)
+#define hid_notice(hid, fmt, ...)			\
+	dev_notice(&(hid)->dev, fmt, ##__VA_ARGS__)
+#define hid_warn(hid, fmt, ...)				\
+	dev_warn(&(hid)->dev, fmt, ##__VA_ARGS__)
+#define hid_info(hid, fmt, ...)				\
+	dev_info(&(hid)->dev, fmt, ##__VA_ARGS__)
+#define hid_dbg(hid, fmt, ...)				\
+	dev_dbg(&(hid)->dev, fmt, ##__VA_ARGS__)
+
+#define hid_err_once(hid, fmt, ...)			\
+	dev_err_once(&(hid)->dev, fmt, ##__VA_ARGS__)
+#define hid_notice_once(hid, fmt, ...)			\
+	dev_notice_once(&(hid)->dev, fmt, ##__VA_ARGS__)
+#define hid_warn_once(hid, fmt, ...)			\
+	dev_warn_once(&(hid)->dev, fmt, ##__VA_ARGS__)
+#define hid_info_once(hid, fmt, ...)			\
+	dev_info_once(&(hid)->dev, fmt, ##__VA_ARGS__)
+#define hid_dbg_once(hid, fmt, ...)			\
+	dev_dbg_once(&(hid)->dev, fmt, ##__VA_ARGS__)
 
 #endif
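
Beyond the ##__VA_ARGS__ conversion, the _once variants are new and mirror
their dev_*_once() counterparts. Typical use:

	if (ret < 0)
		hid_err(hdev, "failed to submit urb: %d\n", ret);

	/* Log a firmware quirk a single time rather than per event: */
	hid_warn_once(hdev, "report descriptor longer than advertised\n");
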
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 45ede62..93d5cf0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -108,7 +108,12 @@
 
 	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
 		return true;
-
+	/*
+	 * For dax vmas, try to always use hugepage mappings. If the kernel
+	 * does not support hugepages, fsdax mappings will fall back to
+	 * PAGE_SIZE mappings, and device-dax namespaces, which try to
+	 * guarantee a given mapping size, will fail to enable.
+	 */
 	if (vma_is_dax(vma))
 		return true;
 
@@ -267,6 +272,15 @@
 	return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
 }
 
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+	/*
+	 * Global or memcg deferred list in the second tail pages is
+	 * occupied by compound_head.
+	 */
+	return &page[2].deferred_list;
+}
+
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index edfca42..53fc34f9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -454,7 +454,7 @@
 static inline struct hstate *page_hstate(struct page *page)
 {
 	VM_BUG_ON_PAGE(!PageHuge(page), page);
-	return size_to_hstate(PAGE_SIZE << compound_order(page));
+	return size_to_hstate(page_size(page));
 }
 
 static inline unsigned hstate_index_to_shift(unsigned index)
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 2afe6fd..b4a0170 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -245,7 +245,10 @@
 		} pipe;
 	} u;
 	/*
-	 * The sub_channel_index is defined in win8.
+	 * The sub_channel_index is defined in Win8: a value of zero means a
+	 * primary channel and a non-zero value means a sub-channel.
+	 *
+	 * Before Win8, the field is reserved, meaning it's always zero.
 	 */
 	u16 sub_channel_index;
 	u16 reserved3;
@@ -423,6 +426,9 @@
 	CHANNELMSG_COUNT
 };
 
+/* Hyper-V supports about 2048 channels, and the RELIDs start with 1. */
+#define INVALID_RELID	U32_MAX
+
 struct vmbus_channel_message_header {
 	enum vmbus_channel_message_type msgtype;
 	u32 padding;
@@ -934,6 +940,11 @@
 		  VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER);
 }
 
+static inline bool is_sub_channel(const struct vmbus_channel *c)
+{
+	return c->offermsg.offer.sub_channel_index != 0;
+}
+
 static inline void set_channel_affinity_state(struct vmbus_channel *c,
 					      enum hv_numa_policy policy)
 {
@@ -1149,6 +1160,9 @@
 	int (*remove)(struct hv_device *);
 	void (*shutdown)(struct hv_device *);
 
+	int (*suspend)(struct hv_device *);
+	int (*resume)(struct hv_device *);
+
 };
 
 /* Base device object */
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index c0a78c0..1361637 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -473,7 +473,7 @@
 devm_i2c_new_dummy_device(struct device *dev, struct i2c_adapter *adap, u16 address);
 
 extern struct i2c_client *
-i2c_new_secondary_device(struct i2c_client *client,
+i2c_new_ancillary_device(struct i2c_client *client,
 				const char *name,
 				u16 default_addr);
 
diff --git a/include/linux/ima.h b/include/linux/ima.h
index a20ad39..1c37f17 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -131,4 +131,13 @@
 	return 0;
 }
 #endif /* CONFIG_IMA_APPRAISE */
+
+#if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING)
+extern bool ima_appraise_signature(enum kernel_read_file_id func);
+#else
+static inline bool ima_appraise_signature(enum kernel_read_file_id func)
+{
+	return false;
+}
+#endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */
 #endif /* _LINUX_IMA_H */
diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h
index 8554761..aaa8a07 100644
--- a/include/linux/interval_tree_generic.h
+++ b/include/linux/interval_tree_generic.h
@@ -30,26 +30,8 @@
 									      \
 /* Callbacks for augmented rbtree insert and remove */			      \
 									      \
-static inline ITTYPE ITPREFIX ## _compute_subtree_last(ITSTRUCT *node)	      \
-{									      \
-	ITTYPE max = ITLAST(node), subtree_last;			      \
-	if (node->ITRB.rb_left) {					      \
-		subtree_last = rb_entry(node->ITRB.rb_left,		      \
-					ITSTRUCT, ITRB)->ITSUBTREE;	      \
-		if (max < subtree_last)					      \
-			max = subtree_last;				      \
-	}								      \
-	if (node->ITRB.rb_right) {					      \
-		subtree_last = rb_entry(node->ITRB.rb_right,		      \
-					ITSTRUCT, ITRB)->ITSUBTREE;	      \
-		if (max < subtree_last)					      \
-			max = subtree_last;				      \
-	}								      \
-	return max;							      \
-}									      \
-									      \
-RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB,	      \
-		     ITTYPE, ITSUBTREE, ITPREFIX ## _compute_subtree_last)    \
+RB_DECLARE_CALLBACKS_MAX(static, ITPREFIX ## _augment,			      \
+			 ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, ITLAST)	      \
 									      \
 /* Insert / remove interval nodes from the tree */			      \
 									      \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index bc499ce..7aa5d61 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -188,10 +188,14 @@
  */
 #define IOMAP_DIO_UNWRITTEN	(1 << 0)	/* covers unwritten extent(s) */
 #define IOMAP_DIO_COW		(1 << 1)	/* covers COW extent(s) */
-typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret,
-		unsigned flags);
+
+struct iomap_dio_ops {
+	int (*end_io)(struct kiocb *iocb, ssize_t size, int error,
+		      unsigned flags);
+};
+
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
-		const struct iomap_ops *ops, iomap_dio_end_io_t end_io);
+		const struct iomap_ops *ops, const struct iomap_dio_ops *dops);
 int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
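
The direct-I/O completion hook moves from a bare function pointer into a
const ops table, and the error is now passed separately from the byte
count. A hedged sketch of a filesystem conversion (the names are
illustrative):

	static int my_dio_write_end_io(struct kiocb *iocb, ssize_t size,
				       int error, unsigned flags)
	{
		if (error)
			return error;
		if (flags & IOMAP_DIO_UNWRITTEN)
			/* convert the now-written unwritten extents */;
		return 0;
	}

	static const struct iomap_dio_ops my_dio_write_ops = {
		.end_io = my_dio_write_end_io,
	};

	ret = iomap_dio_rw(iocb, iter, &my_iomap_ops, &my_dio_write_ops);
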
 
 #ifdef CONFIG_SWAP
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index df03825..603fbc4 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1410,8 +1410,6 @@
 extern int	   jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
 extern int	   jbd2_journal_force_commit(journal_t *);
 extern int	   jbd2_journal_force_commit_nested(journal_t *);
-extern int	   jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode);
-extern int	   jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode);
 extern int	   jbd2_journal_inode_ranged_write(handle_t *handle,
 			struct jbd2_inode *inode, loff_t start_byte,
 			loff_t length);
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index f0b8092..1776eb2 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -125,7 +125,7 @@
 			     unsigned long cmdline_len);
 typedef int (kexec_cleanup_t)(void *loader_data);
 
-#ifdef CONFIG_KEXEC_VERIFY_SIG
+#ifdef CONFIG_KEXEC_SIG
 typedef int (kexec_verify_sig_t)(const char *kernel_buf,
 				 unsigned long kernel_len);
 #endif
@@ -134,7 +134,7 @@
 	kexec_probe_t *probe;
 	kexec_load_t *load;
 	kexec_cleanup_t *cleanup;
-#ifdef CONFIG_KEXEC_VERIFY_SIG
+#ifdef CONFIG_KEXEC_SIG
 	kexec_verify_sig_t *verify_sig;
 #endif
 };
@@ -183,6 +183,8 @@
 				   bool get_value);
 void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name);
 
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+					 unsigned long buf_len);
 void * __weak arch_kexec_kernel_image_load(struct kimage *image);
 int __weak arch_kexec_apply_relocations_add(struct purgatory_info *pi,
 					    Elf_Shdr *section,
diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h
index fbf144a..b072aeb 100644
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -326,8 +326,10 @@
 	(raw_smp_processor_id() == atomic_read(&kgdb_active))
 extern bool dbg_is_early;
 extern void __init dbg_late_init(void);
+extern void kgdb_panic(const char *msg);
 #else /* ! CONFIG_KGDB */
 #define in_dbg_master() (0)
 #define dbg_late_init()
+static inline void kgdb_panic(const char *msg) {}
 #endif /* ! CONFIG_KGDB */
 #endif /* _KGDB_H_ */
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 082d1d2..bc45ea1 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -15,6 +15,14 @@
 extern void __khugepaged_exit(struct mm_struct *mm);
 extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
 				      unsigned long vm_flags);
+#ifdef CONFIG_SHMEM
+extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr);
+#else
+static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
+					   unsigned long addr)
+{
+}
+#endif
 
 #define khugepaged_enabled()					       \
 	(transparent_hugepage_flags &				       \
@@ -73,6 +81,10 @@
 {
 	return 0;
 }
+static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
+					   unsigned long addr)
+{
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_KHUGEPAGED_H */
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index df1318d..a376324 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -339,6 +339,9 @@
  *	Check for permission to change root directory.
  *	@path contains the path structure.
  *	Return 0 if permission is granted.
+ * @path_notify:
+ *	Check permissions before setting a watch on events as defined by @mask,
+ *	on an object at @path, whose type is defined by @obj_type.
  * @inode_readlink:
  *	Check the permission to read the symbolic link.
  *	@dentry contains the dentry structure for the file link.
@@ -1446,6 +1449,11 @@
  * @bpf_prog_free_security:
  *	Clean up the security information stored inside bpf prog.
  *
+ * @locked_down:
+ *	Determine whether a kernel feature that potentially enables arbitrary
+ *	code execution in kernel space should be permitted.
+ *
+ *	@what: kernel feature being accessed
  */
 union security_list_options {
 	int (*binder_set_context_mgr)(struct task_struct *mgr);
@@ -1535,7 +1543,9 @@
 	int (*path_chown)(const struct path *path, kuid_t uid, kgid_t gid);
 	int (*path_chroot)(const struct path *path);
 #endif
-
+	/* Needed for inode based security check */
+	int (*path_notify)(const struct path *path, u64 mask,
+				unsigned int obj_type);
 	int (*inode_alloc_security)(struct inode *inode);
 	void (*inode_free_security)(struct inode *inode);
 	int (*inode_init_security)(struct inode *inode, struct inode *dir,
@@ -1807,6 +1817,7 @@
 	int (*bpf_prog_alloc_security)(struct bpf_prog_aux *aux);
 	void (*bpf_prog_free_security)(struct bpf_prog_aux *aux);
 #endif /* CONFIG_BPF_SYSCALL */
+	int (*locked_down)(enum lockdown_reason what);
 };
 
 struct security_hook_heads {
@@ -1860,6 +1871,8 @@
 	struct hlist_head path_chown;
 	struct hlist_head path_chroot;
 #endif
+	/* Needed for inode based modules as well */
+	struct hlist_head path_notify;
 	struct hlist_head inode_alloc_security;
 	struct hlist_head inode_free_security;
 	struct hlist_head inode_init_security;
@@ -2046,6 +2059,7 @@
 	struct hlist_head bpf_prog_alloc_security;
 	struct hlist_head bpf_prog_free_security;
 #endif /* CONFIG_BPF_SYSCALL */
+	struct hlist_head locked_down;
 } __randomize_layout;
 
 /*
@@ -2104,12 +2118,18 @@
 };
 
 extern struct lsm_info __start_lsm_info[], __end_lsm_info[];
+extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];
 
 #define DEFINE_LSM(lsm)							\
 	static struct lsm_info __lsm_##lsm				\
 		__used __section(.lsm_info.init)			\
 		__aligned(sizeof(unsigned long))
 
+#define DEFINE_EARLY_LSM(lsm)						\
+	static struct lsm_info __early_lsm_##lsm			\
+		__used __section(.early_lsm_info.init)			\
+		__aligned(sizeof(unsigned long))
+
 #ifdef CONFIG_SECURITY_SELINUX_DISABLE
 /*
  * Assuring the safety of deleting a security module is up to
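
DEFINE_EARLY_LSM() places an LSM in its own section so it can be brought up
before the ordinary LSMs (lockdown is the in-tree user of both this and the
locked_down hook). A hedged sketch of wiring the two together; the policy
helper deny_feature() is illustrative:

	static int my_locked_down(enum lockdown_reason what)
	{
		/* deny_feature() stands in for whatever policy applies */
		return deny_feature(what) ? -EPERM : 0;
	}

	static struct security_hook_list my_hooks[] __lsm_ro_after_init = {
		LSM_HOOK_INIT(locked_down, my_locked_down),
	};

	static int __init my_lsm_init(void)
	{
		security_add_hooks(my_hooks, ARRAY_SIZE(my_hooks), "my_lsm");
		return 0;
	}

	DEFINE_EARLY_LSM(my_lsm) = {
		.name = "my_lsm",
		.init = my_lsm_init,
	};
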
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ad8f1a3..9b60863 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -128,9 +128,8 @@
 
 	struct mem_cgroup_reclaim_iter	iter[DEF_PRIORITY + 1];
 
-#ifdef CONFIG_MEMCG_KMEM
 	struct memcg_shrinker_map __rcu	*shrinker_map;
-#endif
+
 	struct rb_node		tree_node;	/* RB tree node */
 	unsigned long		usage_in_excess;/* Set to the value by which */
 						/* the soft limit is exceeded*/
@@ -331,6 +330,10 @@
 	struct list_head event_list;
 	spinlock_t event_list_lock;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	struct deferred_split deferred_split_queue;
+#endif
+
 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
 };
@@ -1311,6 +1314,11 @@
 	} while ((memcg = parent_mem_cgroup(memcg)));
 	return false;
 }
+
+extern int memcg_expand_shrinker_maps(int new_id);
+
+extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
+				   int nid, int shrinker_id);
 #else
 #define mem_cgroup_sockets_enabled 0
 static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
@@ -1319,6 +1327,11 @@
 {
 	return false;
 }
+
+static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
+					  int nid, int shrinker_id)
+{
+}
 #endif
 
 struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
@@ -1390,10 +1403,6 @@
 	return memcg ? memcg->kmemcg_id : -1;
 }
 
-extern int memcg_expand_shrinker_maps(int new_id);
-
-extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
-				   int nid, int shrinker_id);
 #else
 
 static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
@@ -1435,8 +1444,6 @@
 {
 }
 
-static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
-					  int nid, int shrinker_id) { }
 #endif /* CONFIG_MEMCG_KMEM */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 02e633f..0ebb105 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -25,7 +25,6 @@
 
 struct memory_block {
 	unsigned long start_section_nr;
-	unsigned long end_section_nr;
 	unsigned long state;		/* serialized by the dev->lock */
 	int section_count;		/* serialized by mem_sysfs_mutex */
 	int online_type;		/* for passing data to online routine */
@@ -80,9 +79,9 @@
 #define IPC_CALLBACK_PRI        10
 
 #ifndef CONFIG_MEMORY_HOTPLUG_SPARSE
-static inline int memory_dev_init(void)
+static inline void memory_dev_init(void)
 {
-	return 0;
+	return;
 }
 static inline int register_memory_notifier(struct notifier_block *nb)
 {
@@ -113,7 +112,7 @@
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
 int create_memory_block_devices(unsigned long start, unsigned long size);
 void remove_memory_block_devices(unsigned long start, unsigned long size);
-extern int memory_dev_init(void);
+extern void memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
 extern int memory_isolate_notify(unsigned long val, void *v);
 extern struct memory_block *find_memory_block(struct mem_section *);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index bac395f..5228c62 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -139,8 +139,6 @@
 struct mempolicy *get_task_policy(struct task_struct *p);
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
 		unsigned long addr);
-struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
-						unsigned long addr);
 bool vma_policy_mof(struct vm_area_struct *vma);
 
 extern void numa_default_policy(void);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index fb2a0bd8..6fefb09 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -17,6 +17,7 @@
  */
 struct vmem_altmap {
 	const unsigned long base_pfn;
+	const unsigned long end_pfn;
 	const unsigned long reserve;
 	unsigned long free;
 	unsigned long align;
@@ -111,7 +112,6 @@
 	struct completion done;
 	enum memory_type type;
 	unsigned int flags;
-	u64 pci_p2pdma_bus_offset;
 	const struct dev_pagemap_ops *ops;
 };
 
diff --git a/include/linux/mfd/da9063/pdata.h b/include/linux/mfd/da9063/pdata.h
deleted file mode 100644
index 085edbf..0000000
--- a/include/linux/mfd/da9063/pdata.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Platform configuration options for DA9063
- *
- * Copyright 2012 Dialog Semiconductor Ltd.
- *
- * Author: Michal Hajduk, Dialog Semiconductor
- * Author: Krystian Garbaciak, Dialog Semiconductor
- */
-
-#ifndef __MFD_DA9063_PDATA_H__
-#define __MFD_DA9063_PDATA_H__
-
-/*
- * RGB LED configuration
- */
-/* LED IDs for flags in struct led_info. */
-enum {
-	DA9063_GPIO11_LED,
-	DA9063_GPIO14_LED,
-	DA9063_GPIO15_LED,
-
-	DA9063_LED_NUM
-};
-#define DA9063_LED_ID_MASK		0x3
-
-/* LED polarity for flags in struct led_info. */
-#define DA9063_LED_HIGH_LEVEL_ACTIVE	0x0
-#define DA9063_LED_LOW_LEVEL_ACTIVE	0x4
-
-
-/*
- * General PMIC configuration
- */
-/* HWMON ADC channels configuration */
-#define DA9063_FLG_FORCE_IN0_MANUAL_MODE	0x0010
-#define DA9063_FLG_FORCE_IN0_AUTO_MODE		0x0020
-#define DA9063_FLG_FORCE_IN1_MANUAL_MODE	0x0040
-#define DA9063_FLG_FORCE_IN1_AUTO_MODE		0x0080
-#define DA9063_FLG_FORCE_IN2_MANUAL_MODE	0x0100
-#define DA9063_FLG_FORCE_IN2_AUTO_MODE		0x0200
-#define DA9063_FLG_FORCE_IN3_MANUAL_MODE	0x0400
-#define DA9063_FLG_FORCE_IN3_AUTO_MODE		0x0800
-
-/* Disable register caching. */
-#define DA9063_FLG_NO_CACHE			0x0008
-
-struct da9063;
-
-/* DA9063 platform data */
-struct da9063_pdata {
-	int				(*init)(struct da9063 *da9063);
-	int				irq_base;
-	bool				key_power;
-	unsigned			flags;
-	struct da9063_regulators_pdata	*regulators_pdata;
-	struct led_platform_data	*leds_pdata;
-};
-
-#endif	/* __MFD_DA9063_PDATA_H__ */
diff --git a/include/linux/mfd/intel_soc_pmic_mrfld.h b/include/linux/mfd/intel_soc_pmic_mrfld.h
new file mode 100644
index 0000000..4daecd6
--- /dev/null
+++ b/include/linux/mfd/intel_soc_pmic_mrfld.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Header file for Intel Merrifield Basin Cove PMIC
+ *
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ */
+
+#ifndef __INTEL_SOC_PMIC_MRFLD_H__
+#define __INTEL_SOC_PMIC_MRFLD_H__
+
+#include <linux/bits.h>
+
+#define BCOVE_ID		0x00
+
+#define BCOVE_ID_MINREV0	GENMASK(2, 0)
+#define BCOVE_ID_MAJREV0	GENMASK(5, 3)
+#define BCOVE_ID_VENDID0	GENMASK(7, 6)
+
+#define BCOVE_MINOR(x)		(unsigned int)(((x) & BCOVE_ID_MINREV0) >> 0)
+#define BCOVE_MAJOR(x)		(unsigned int)(((x) & BCOVE_ID_MAJREV0) >> 3)
+#define BCOVE_VENDOR(x)		(unsigned int)(((x) & BCOVE_ID_VENDID0) >> 6)
+
+#define BCOVE_IRQLVL1		0x01
+
+#define BCOVE_PBIRQ		0x02
+#define BCOVE_TMUIRQ		0x03
+#define BCOVE_THRMIRQ		0x04
+#define BCOVE_BCUIRQ		0x05
+#define BCOVE_ADCIRQ		0x06
+#define BCOVE_CHGRIRQ0		0x07
+#define BCOVE_CHGRIRQ1		0x08
+#define BCOVE_GPIOIRQ		0x09
+#define BCOVE_CRITIRQ		0x0B
+
+#define BCOVE_MIRQLVL1		0x0C
+
+#define BCOVE_MPBIRQ		0x0D
+#define BCOVE_MTMUIRQ		0x0E
+#define BCOVE_MTHRMIRQ		0x0F
+#define BCOVE_MBCUIRQ		0x10
+#define BCOVE_MADCIRQ		0x11
+#define BCOVE_MCHGRIRQ0		0x12
+#define BCOVE_MCHGRIRQ1		0x13
+#define BCOVE_MGPIOIRQ		0x14
+#define BCOVE_MCRITIRQ		0x16
+
+#define BCOVE_SCHGRIRQ0		0x4E
+#define BCOVE_SCHGRIRQ1		0x4F
+
+/* Level 1 IRQs */
+#define BCOVE_LVL1_PWRBTN	BIT(0)	/* power button */
+#define BCOVE_LVL1_TMU		BIT(1)	/* time management unit */
+#define BCOVE_LVL1_THRM		BIT(2)	/* thermal */
+#define BCOVE_LVL1_BCU		BIT(3)	/* burst control unit */
+#define BCOVE_LVL1_ADC		BIT(4)	/* ADC */
+#define BCOVE_LVL1_CHGR		BIT(5)	/* charger */
+#define BCOVE_LVL1_GPIO		BIT(6)	/* GPIO */
+#define BCOVE_LVL1_CRIT		BIT(7)	/* critical event */
+
+/* Level 2 IRQs: power button */
+#define BCOVE_PBIRQ_PBTN	BIT(0)
+#define BCOVE_PBIRQ_UBTN	BIT(1)
+
+/* Level 2 IRQs: ADC */
+#define BCOVE_ADCIRQ_BATTEMP	BIT(2)
+#define BCOVE_ADCIRQ_SYSTEMP	BIT(3)
+#define BCOVE_ADCIRQ_BATTID	BIT(4)
+#define BCOVE_ADCIRQ_VIBATT	BIT(5)
+#define BCOVE_ADCIRQ_CCTICK	BIT(7)
+
+/* Level 2 IRQs: charger */
+#define BCOVE_CHGRIRQ_BAT0ALRT	BIT(4)
+#define BCOVE_CHGRIRQ_BAT1ALRT	BIT(5)
+#define BCOVE_CHGRIRQ_BATCRIT	BIT(6)
+
+#define BCOVE_CHGRIRQ_VBUSDET	BIT(0)
+#define BCOVE_CHGRIRQ_DCDET	BIT(1)
+#define BCOVE_CHGRIRQ_BATTDET	BIT(2)
+#define BCOVE_CHGRIRQ_USBIDDET	BIT(3)
+
+#endif	/* __INTEL_SOC_PMIC_MRFLD_H__ */
diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h
index 25a95e7..fc88d31 100644
--- a/include/linux/mfd/mt6397/core.h
+++ b/include/linux/mfd/mt6397/core.h
@@ -7,6 +7,14 @@
 #ifndef __MFD_MT6397_CORE_H__
 #define __MFD_MT6397_CORE_H__
 
+#include <linux/mutex.h>
+
+enum chip_id {
+	MT6323_CHIP_ID = 0x23,
+	MT6391_CHIP_ID = 0x91,
+	MT6397_CHIP_ID = 0x97,
+};
+
 enum mt6397_irq_numbers {
 	MT6397_IRQ_SPKL_AB = 0,
 	MT6397_IRQ_SPKR_AB,
@@ -54,6 +62,9 @@
 	u16 irq_masks_cache[2];
 	u16 int_con[2];
 	u16 int_status[2];
+	u16 chip_id;
 };
 
+int mt6397_irq_init(struct mt6397_chip *chip);
+
 #endif /* __MFD_MT6397_CORE_H__ */
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index a487b68..138c50d 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -282,7 +282,6 @@
 	MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT   = 0x940,
 	MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT = 0x941,
 	MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT   = 0x942,
-	MLX5_CMD_OP_SYNC_STEERING                 = 0xb00,
 	MLX5_CMD_OP_FPGA_CREATE_QP                = 0x960,
 	MLX5_CMD_OP_FPGA_MODIFY_QP                = 0x961,
 	MLX5_CMD_OP_FPGA_QUERY_QP                 = 0x962,
@@ -296,6 +295,7 @@
 	MLX5_CMD_OP_DESTROY_UCTX                  = 0xa06,
 	MLX5_CMD_OP_CREATE_UMEM                   = 0xa08,
 	MLX5_CMD_OP_DESTROY_UMEM                  = 0xa0a,
+	MLX5_CMD_OP_SYNC_STEERING                 = 0xb00,
 	MLX5_CMD_OP_MAX
 };
 
@@ -487,7 +487,7 @@
 
 struct mlx5_ifc_fte_match_set_misc_bits {
 	u8         gre_c_present[0x1];
-	u8         reserved_auto1[0x1];
+	u8         reserved_at_1[0x1];
 	u8         gre_k_present[0x1];
 	u8         gre_s_present[0x1];
 	u8         source_vhca_port[0x4];
@@ -5054,50 +5054,50 @@
 
 struct mlx5_ifc_other_hca_cap_bits {
 	u8         roce[0x1];
-	u8         reserved_0[0x27f];
+	u8         reserved_at_1[0x27f];
 };
 
 struct mlx5_ifc_query_other_hca_cap_out_bits {
 	u8         status[0x8];
-	u8         reserved_0[0x18];
+	u8         reserved_at_8[0x18];
 
 	u8         syndrome[0x20];
 
-	u8         reserved_1[0x40];
+	u8         reserved_at_40[0x40];
 
 	struct     mlx5_ifc_other_hca_cap_bits other_capability;
 };
 
 struct mlx5_ifc_query_other_hca_cap_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_0[0x10];
+	u8         reserved_at_10[0x10];
 
-	u8         reserved_1[0x10];
+	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_2[0x10];
+	u8         reserved_at_40[0x10];
 	u8         function_id[0x10];
 
-	u8         reserved_3[0x20];
+	u8         reserved_at_60[0x20];
 };
 
 struct mlx5_ifc_modify_other_hca_cap_out_bits {
 	u8         status[0x8];
-	u8         reserved_0[0x18];
+	u8         reserved_at_8[0x18];
 
 	u8         syndrome[0x20];
 
-	u8         reserved_1[0x40];
+	u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_modify_other_hca_cap_in_bits {
 	u8         opcode[0x10];
-	u8         reserved_0[0x10];
+	u8         reserved_at_10[0x10];
 
-	u8         reserved_1[0x10];
+	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_2[0x10];
+	u8         reserved_at_40[0x10];
 	u8         function_id[0x10];
 	u8         field_select[0x20];
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7cf955f..cc29227 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -805,6 +805,24 @@
 	page[1].compound_order = order;
 }
 
+/* Returns the number of pages in this potentially compound page. */
+static inline unsigned long compound_nr(struct page *page)
+{
+	return 1UL << compound_order(page);
+}
+
+/* Returns the number of bytes in this potentially compound page. */
+static inline unsigned long page_size(struct page *page)
+{
+	return PAGE_SIZE << compound_order(page);
+}
+
+/* Returns the number of bits needed for the number of bytes in a page. */
+static inline unsigned int page_shift(struct page *page)
+{
+	return PAGE_SHIFT + compound_order(page);
+}
+
 void free_compound_page(struct page *page);
 
 #ifdef CONFIG_MMU
@@ -1057,8 +1075,9 @@
 	put_page(page);
 }
 
-void put_user_pages_dirty(struct page **pages, unsigned long npages);
-void put_user_pages_dirty_lock(struct page **pages, unsigned long npages);
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
+			       bool make_dirty);
+
 void put_user_pages(struct page **pages, unsigned long npages);
 
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
@@ -1405,7 +1424,11 @@
 
 extern void show_free_areas(unsigned int flags, nodemask_t *nodemask);
 
+#ifdef CONFIG_MMU
 extern bool can_do_mlock(void);
+#else
+static inline bool can_do_mlock(void) { return false; }
+#endif
 extern int user_shm_lock(size_t, struct user_struct *);
 extern void user_shm_unlock(size_t, struct user_struct *);
 
@@ -1926,7 +1949,7 @@
 	pgtable_cache_init();
 }
 
-static inline bool pgtable_page_ctor(struct page *page)
+static inline bool pgtable_pte_page_ctor(struct page *page)
 {
 	if (!ptlock_init(page))
 		return false;
@@ -1935,7 +1958,7 @@
 	return true;
 }
 
-static inline void pgtable_page_dtor(struct page *page)
+static inline void pgtable_pte_page_dtor(struct page *page)
 {
 	ptlock_free(page);
 	__ClearPageTable(page);
@@ -2305,6 +2328,8 @@
 				   unsigned long addr, unsigned long len,
 				   unsigned long flags, struct page **pages);
 
+unsigned long randomize_stack_top(unsigned long stack_top);
+
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
@@ -2568,6 +2593,7 @@
 #define FOLL_COW	0x4000	/* internal GUP flag */
 #define FOLL_ANON	0x8000	/* don't do file mappings */
 #define FOLL_LONGTERM	0x10000	/* mapping lifetime is indefinite: see below */
+#define FOLL_SPLIT_PMD	0x20000	/* split huge pmd before returning */
 
 /*
  * NOTE on FOLL_LONGTERM:
@@ -2845,5 +2871,12 @@
 static inline void setup_nr_node_ids(void) {}
 #endif
 
+extern int memcmp_pages(struct page *page1, struct page *page2);
+
+static inline int pages_identical(struct page *page1, struct page *page2)
+{
+	return !memcmp_pages(page1, page2);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
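
compound_nr(), page_size() and page_shift() replace the open-coded
"1 << compound_order()" / "PAGE_SIZE << compound_order()" patterns seen
throughout this series (e.g. the hugetlb hunk above), and pages_identical()
wraps the new memcmp_pages(). Sketch:

	unsigned long nr    = compound_nr(page);   /* pages in compound page */
	unsigned long bytes = page_size(page);     /* PAGE_SIZE for base pages */
	unsigned int shift  = page_shift(page);    /* log2 of page_size() */

	/* KSM-style content comparison without open-coding kmap+memcmp: */
	if (pages_identical(page, other_page))
		merge_them(page, other_page);	/* illustrative helper */
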
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0b739f3..2222fa7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -138,6 +138,7 @@
 		struct {	/* Second tail page of compound page */
 			unsigned long _compound_pad_1;	/* compound_head */
 			unsigned long _compound_pad_2;
+			/* For both global and memcg */
 			struct list_head deferred_list;
 		};
 		struct {	/* Page table pages */
@@ -382,6 +383,16 @@
 		unsigned long highest_vm_end;	/* highest vma end address */
 		pgd_t * pgd;
 
+#ifdef CONFIG_MEMBARRIER
+		/**
+		 * @membarrier_state: Flags controlling membarrier behavior.
+		 *
+		 * This field is close to @pgd to hopefully fit in the same
+		 * cache-line, which needs to be touched by switch_mm().
+		 */
+		atomic_t membarrier_state;
+#endif
+
 		/**
 		 * @mm_users: The number of users including userspace.
 		 *
@@ -451,9 +462,7 @@
 		unsigned long flags; /* Must use atomic bitops to access */
 
 		struct core_state *core_state; /* coredumping support */
-#ifdef CONFIG_MEMBARRIER
-		atomic_t membarrier_state;
-#endif
+
 #ifdef CONFIG_AIO
 		spinlock_t			ioctx_lock;
 		struct kioctx_table __rcu	*ioctx_table;
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index d7016dc..c1bc673 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -36,6 +36,10 @@
 	struct vm_area_struct *vmas[VMACACHE_SIZE];
 };
 
+/*
+ * When updating this, please also update struct resident_page_types[] in
+ * kernel/fork.c
+ */
 enum {
 	MM_FILEPAGES,	/* Resident file mapping pages */
 	MM_ANONPAGES,	/* Resident anonymous pages */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3f38c30..bda2028 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -235,6 +235,8 @@
 	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
 	NR_SHMEM_THPS,
 	NR_SHMEM_PMDMAPPED,
+	NR_FILE_THPS,
+	NR_FILE_PMDMAPPED,
 	NR_ANON_THPS,
 	NR_UNSTABLE_NFS,	/* NFS unstable pages */
 	NR_VMSCAN_WRITE,
@@ -677,6 +679,14 @@
 extern struct page *mem_map;
 #endif
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct deferred_split {
+	spinlock_t split_queue_lock;
+	struct list_head split_queue;
+	unsigned long split_queue_len;
+};
+#endif
+
 /*
  * On NUMA machines, each NUMA node would have a pg_data_t to describe
  * it's memory layout. On UMA machines there is a single pglist_data which
@@ -756,9 +766,7 @@
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	spinlock_t split_queue_lock;
-	struct list_head split_queue;
-	unsigned long split_queue_len;
+	struct deferred_split deferred_split_queue;
 #endif
 
 	/* Fields commonly accessed by the page reclaim scanner */
diff --git a/include/linux/module.h b/include/linux/module.h
index b1a6735..6d20895 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -26,9 +26,6 @@
 #include <linux/percpu.h>
 #include <asm/module.h>
 
-/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */
-#define MODULE_SIG_STRING "~Module signature appended~\n"
-
 /* Not Yet Implemented */
 #define MODULE_SUPPORTED_DEVICE(name)
 
diff --git a/include/linux/module_signature.h b/include/linux/module_signature.h
new file mode 100644
index 0000000..7eb4b00
--- /dev/null
+++ b/include/linux/module_signature.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Module signature handling.
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifndef _LINUX_MODULE_SIGNATURE_H
+#define _LINUX_MODULE_SIGNATURE_H
+
+#include <linux/types.h>
+
+/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */
+#define MODULE_SIG_STRING "~Module signature appended~\n"
+
+enum pkey_id_type {
+	PKEY_ID_PGP,		/* OpenPGP generated key ID */
+	PKEY_ID_X509,		/* X.509 arbitrary subjectKeyIdentifier */
+	PKEY_ID_PKCS7,		/* Signature in PKCS#7 message */
+};
+
+/*
+ * Module signature information block.
+ *
+ * The constituents of the signature section are, in order:
+ *
+ *	- Signer's name
+ *	- Key identifier
+ *	- Signature data
+ *	- Information block
+ */
+struct module_signature {
+	u8	algo;		/* Public-key crypto algorithm [0] */
+	u8	hash;		/* Digest algorithm [0] */
+	u8	id_type;	/* Key identifier type [PKEY_ID_PKCS7] */
+	u8	signer_len;	/* Length of signer's name [0] */
+	u8	key_id_len;	/* Length of key identifier [0] */
+	u8	__pad[3];
+	__be32	sig_len;	/* Length of signature data */
+};
+
+int mod_check_sig(const struct module_signature *ms, size_t file_len,
+		  const char *name);
+
+#endif /* _LINUX_MODULE_SIGNATURE_H */
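
The signature trailer sits at the very end of a signed module or kexec
image: signature data, then this descriptor, then MODULE_SIG_STRING. A
hedged sketch of locating it, in the spirit of mod_verify_sig() (error
handling trimmed; buf/file_len are the image being checked):

	const size_t marker_len = sizeof(MODULE_SIG_STRING) - 1;
	const struct module_signature *ms;
	size_t sig_len;

	if (file_len < marker_len + sizeof(*ms) ||
	    memcmp(buf + file_len - marker_len, MODULE_SIG_STRING, marker_len))
		return -ENOENT;			/* no appended signature */

	file_len -= marker_len;
	ms = (const void *)(buf + file_len) - sizeof(*ms);

	ret = mod_check_sig(ms, file_len, "module"); /* sanity-check the block */
	if (ret)
		return ret;
	sig_len = be32_to_cpu(ms->sig_len);	/* signature bytes precede *ms */
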
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 0a11712..570a60c 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -490,6 +490,9 @@
 extern const struct dentry_operations nfs_dentry_operations;
 
 extern void nfs_force_lookup_revalidate(struct inode *dir);
+extern struct dentry *nfs_add_or_obtain(struct dentry *dentry,
+			struct nfs_fh *fh, struct nfs_fattr *fattr,
+			struct nfs4_label *label);
 extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh,
 			struct nfs_fattr *fattr, struct nfs4_label *label);
 extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags);
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 0959295..682fd46 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -18,6 +18,7 @@
 
 enum page_ext_flags {
 	PAGE_EXT_OWNER,
+	PAGE_EXT_OWNER_ACTIVE,
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
 	PAGE_EXT_YOUNG,
 	PAGE_EXT_IDLE,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c755245..37a4d9e 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -333,6 +333,16 @@
 			mapping_gfp_mask(mapping));
 }
 
+static inline struct page *find_subpage(struct page *page, pgoff_t offset)
+{
+	if (PageHuge(page))
+		return page;
+
+	VM_BUG_ON_PAGE(PageTail(page), page);
+
+	return page + (offset & (compound_nr(page) - 1));
+}
+
 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
 struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
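
With only head pages stored in i_pages, lookups need a way back to the
subpage a given offset refers to; find_subpage() provides it (hugetlb
pages are addressed by the head page, so they are returned whole). Sketch
of the pattern inside a lookup, assuming 'head' came from the xarray:

	/* For a 2MB THP, this is head + (offset % HPAGE_PMD_NR): */
	struct page *page = find_subpage(head, offset);
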
diff --git a/include/linux/pci-aspm.h b/include/linux/pci-aspm.h
deleted file mode 100644
index 6706414..0000000
--- a/include/linux/pci-aspm.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *	aspm.h
- *
- *	PCI Express ASPM defines and function prototypes
- *
- *	Copyright (C) 2007 Intel Corp.
- *		Zhang Yanmin (yanmin.zhang@intel.com)
- *		Shaohua Li (shaohua.li@intel.com)
- *
- *	For more information, please consult the following manuals (look at
- *	http://www.pcisig.com/ for how to get them):
- *
- *	PCI Express Specification
- */
-
-#ifndef LINUX_ASPM_H
-#define LINUX_ASPM_H
-
-#include <linux/pci.h>
-
-#define PCIE_LINK_STATE_L0S	1
-#define PCIE_LINK_STATE_L1	2
-#define PCIE_LINK_STATE_CLKPM	4
-
-#ifdef CONFIG_PCIEASPM
-int pci_disable_link_state(struct pci_dev *pdev, int state);
-int pci_disable_link_state_locked(struct pci_dev *pdev, int state);
-void pcie_no_aspm(void);
-#else
-static inline int pci_disable_link_state(struct pci_dev *pdev, int state)
-{ return 0; }
-static inline void pcie_no_aspm(void) { }
-#endif
-
-#endif /* LINUX_ASPM_H */
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index bca9bc3..8318a97 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -30,8 +30,10 @@
 					 unsigned int *nents, u32 length);
 void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl);
 void pci_p2pmem_publish(struct pci_dev *pdev, bool publish);
-int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
-		      enum dma_data_direction dir);
+int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+		int nents, enum dma_data_direction dir, unsigned long attrs);
+void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
+		int nents, enum dma_data_direction dir, unsigned long attrs);
 int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
 			    bool *use_p2pdma);
 ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
@@ -81,11 +83,17 @@
 static inline void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
 {
 }
-static inline int pci_p2pdma_map_sg(struct device *dev,
-		struct scatterlist *sg, int nents, enum dma_data_direction dir)
+static inline int pci_p2pdma_map_sg_attrs(struct device *dev,
+		struct scatterlist *sg, int nents, enum dma_data_direction dir,
+		unsigned long attrs)
 {
 	return 0;
 }
+static inline void pci_p2pdma_unmap_sg_attrs(struct device *dev,
+		struct scatterlist *sg, int nents, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+}
 static inline int pci_p2pdma_enable_store(const char *page,
 		struct pci_dev **p2p_dev, bool *use_p2pdma)
 {
@@ -111,4 +119,16 @@
 	return pci_p2pmem_find_many(&client, 1);
 }
 
+static inline int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg,
+				    int nents, enum dma_data_direction dir)
+{
+	return pci_p2pdma_map_sg_attrs(dev, sg, nents, dir, 0);
+}
+
+static inline void pci_p2pdma_unmap_sg(struct device *dev,
+		struct scatterlist *sg, int nents, enum dma_data_direction dir)
+{
+	pci_p2pdma_unmap_sg_attrs(dev, sg, nents, dir, 0);
+}
+
 #endif /* _LINUX_PCI_P2P_H */
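
The map_sg API grows an attrs variant plus an unmap counterpart; the old
spelling survives as an inline wrapper passing attrs == 0. Equivalent
calls:

	nents = pci_p2pdma_map_sg(dev, sgl, nents, DMA_BIDIRECTIONAL);
	/* ... */
	pci_p2pdma_unmap_sg(dev, sgl, nents, DMA_BIDIRECTIONAL);

	/* Or explicitly, e.g. to skip CPU syncs for pure P2P traffic: */
	nents = pci_p2pdma_map_sg_attrs(dev, sgl, nents, DMA_BIDIRECTIONAL,
					DMA_ATTR_SKIP_CPU_SYNC);
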
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 82e4cd1..f9088c8 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -6,12 +6,18 @@
  *	Copyright 1994, Drew Eckhardt
  *	Copyright 1997--1999 Martin Mares <mj@ucw.cz>
  *
+ *	PCI Express ASPM defines and function prototypes
+ *	Copyright (c) 2007 Intel Corp.
+ *		Zhang Yanmin (yanmin.zhang@intel.com)
+ *		Shaohua Li (shaohua.li@intel.com)
+ *
  *	For more information, please consult the following manuals (look at
  *	http://www.pcisig.com/ for how to get them):
  *
  *	PCI BIOS Specification
  *	PCI Local Bus Specification
  *	PCI to PCI Bridge Specification
+ *	PCI Express Specification
  *	PCI System Design Guide
  */
 #ifndef LINUX_PCI_H
@@ -145,11 +151,6 @@
 	return pci_power_names[1 + (__force int) state];
 }
 
-#define PCI_PM_D2_DELAY		200
-#define PCI_PM_D3_WAIT		10
-#define PCI_PM_D3COLD_WAIT	100
-#define PCI_PM_BUS_WAIT		50
-
 /**
  * typedef pci_channel_state_t
  *
@@ -418,7 +419,6 @@
 	unsigned int	broken_intx_masking:1;	/* INTx masking can't be used */
 	unsigned int	io_window_1k:1;		/* Intel bridge 1K I/O windows */
 	unsigned int	irq_managed:1;
-	unsigned int	has_secondary_link:1;
 	unsigned int	non_compliant_bars:1;	/* Broken BARs; ignore them */
 	unsigned int	is_probed:1;		/* Device probing in progress */
 	unsigned int	link_active_reporting:1;/* Device capable of reporting link active */
@@ -649,9 +649,6 @@
 	return dev->bus->self;
 }
 
-struct device *pci_get_host_bridge_device(struct pci_dev *dev);
-void pci_put_host_bridge_device(struct device *dev);
-
 #ifdef CONFIG_PCI_MSI
 static inline bool pci_dev_msi_enabled(struct pci_dev *pci_dev)
 {
@@ -925,6 +922,11 @@
 	PCI_SCAN_ALL_PCIE_DEVS	= 0x00000040,	/* Scan all, not just dev 0 */
 };
 
+#define PCI_IRQ_LEGACY		(1 << 0) /* Allow legacy interrupts */
+#define PCI_IRQ_MSI		(1 << 1) /* Allow MSI interrupts */
+#define PCI_IRQ_MSIX		(1 << 2) /* Allow MSI-X interrupts */
+#define PCI_IRQ_AFFINITY	(1 << 3) /* Auto-assign affinity */
+
 /* These external functions are only available when PCI support is enabled */
 #ifdef CONFIG_PCI
 
@@ -969,7 +971,7 @@
 				resource_size_t,
 				resource_size_t);
 
-/* Weak but can be overriden by arch */
+/* Weak but can be overridden by arch */
 void pci_fixup_cardbus(struct pci_bus *);
 
 /* Generic PCI functions used internally */
@@ -995,7 +997,6 @@
 int pci_scan_root_bus_bridge(struct pci_host_bridge *bridge);
 struct pci_bus *pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev,
 				int busnr);
-void pcie_update_link_speed(struct pci_bus *bus, u16 link_status);
 struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr,
 				 const char *name,
 				 struct hotplug_slot *hotplug);
@@ -1241,19 +1242,12 @@
 int pci_prepare_to_sleep(struct pci_dev *dev);
 int pci_back_from_sleep(struct pci_dev *dev);
 bool pci_dev_run_wake(struct pci_dev *dev);
-bool pci_check_pme_status(struct pci_dev *dev);
-void pci_pme_wakeup_bus(struct pci_bus *bus);
 void pci_d3cold_enable(struct pci_dev *dev);
 void pci_d3cold_disable(struct pci_dev *dev);
 bool pcie_relaxed_ordering_enabled(struct pci_dev *dev);
 void pci_wakeup_bus(struct pci_bus *bus);
 void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state);
 
-/* PCI Virtual Channel */
-int pci_save_vc_state(struct pci_dev *dev);
-void pci_restore_vc_state(struct pci_dev *dev);
-void pci_allocate_vc_save_buffers(struct pci_dev *dev);
-
 /* For use by arch with custom probe code */
 void set_pcie_port_type(struct pci_dev *pdev);
 void set_pcie_hotplug_bridge(struct pci_dev *pdev);
@@ -1297,8 +1291,6 @@
 void pci_release_selected_regions(struct pci_dev *, int);
 
 /* drivers/pci/bus.c */
-struct pci_bus *pci_bus_get(struct pci_bus *bus);
-void pci_bus_put(struct pci_bus *bus);
 void pci_add_resource(struct list_head *resources, struct resource *res);
 void pci_add_resource_offset(struct list_head *resources, struct resource *res,
 			     resource_size_t offset);
@@ -1408,11 +1400,6 @@
 int pci_set_vga_state(struct pci_dev *pdev, bool decode,
 		      unsigned int command_bits, u32 flags);
 
-#define PCI_IRQ_LEGACY		(1 << 0) /* Allow legacy interrupts */
-#define PCI_IRQ_MSI		(1 << 1) /* Allow MSI interrupts */
-#define PCI_IRQ_MSIX		(1 << 2) /* Allow MSI-X interrupts */
-#define PCI_IRQ_AFFINITY	(1 << 3) /* Auto-assign affinity */
-
 /*
  * Virtual interrupts allow for more interrupts to be allocated
  * than the device has interrupts for. These are not programmed
@@ -1517,14 +1504,6 @@
 }
 #endif
 
-static inline int
-pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
-		      unsigned int max_vecs, unsigned int flags)
-{
-	return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, flags,
-					      NULL);
-}
-
 /**
  * pci_irqd_intx_xlate() - Translate PCI INTx value to an IRQ domain hwirq
  * @d: the INTx IRQ domain
@@ -1565,10 +1544,22 @@
 #define pcie_ports_native	false
 #endif
 
+#define PCIE_LINK_STATE_L0S	1
+#define PCIE_LINK_STATE_L1	2
+#define PCIE_LINK_STATE_CLKPM	4
+
 #ifdef CONFIG_PCIEASPM
+int pci_disable_link_state(struct pci_dev *pdev, int state);
+int pci_disable_link_state_locked(struct pci_dev *pdev, int state);
+void pcie_no_aspm(void);
 bool pcie_aspm_support_enabled(void);
 bool pcie_aspm_enabled(struct pci_dev *pdev);
 #else
+static inline int pci_disable_link_state(struct pci_dev *pdev, int state)
+{ return 0; }
+static inline int pci_disable_link_state_locked(struct pci_dev *pdev, int state)
+{ return 0; }
+static inline void pcie_no_aspm(void) { }
 static inline bool pcie_aspm_support_enabled(void) { return false; }
 static inline bool pcie_aspm_enabled(struct pci_dev *pdev) { return false; }
 #endif
@@ -1579,23 +1570,8 @@
 static inline bool pci_aer_available(void) { return false; }
 #endif
 
-#ifdef CONFIG_PCIE_ECRC
-void pcie_set_ecrc_checking(struct pci_dev *dev);
-void pcie_ecrc_get_policy(char *str);
-#else
-static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { }
-static inline void pcie_ecrc_get_policy(char *str) { }
-#endif
-
 bool pci_ats_disabled(void);
 
-#ifdef CONFIG_PCIE_PTM
-int pci_enable_ptm(struct pci_dev *dev, u8 *granularity);
-#else
-static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity)
-{ return -EINVAL; }
-#endif
-
 void pci_cfg_access_lock(struct pci_dev *dev);
 bool pci_cfg_access_trylock(struct pci_dev *dev);
 void pci_cfg_access_unlock(struct pci_dev *dev);
@@ -1749,11 +1725,6 @@
 
 static inline unsigned long pci_address_to_pio(phys_addr_t addr) { return -1; }
 
-static inline void pci_block_cfg_access(struct pci_dev *dev) { }
-static inline int pci_block_cfg_access_in_atomic(struct pci_dev *dev)
-{ return 0; }
-static inline void pci_unblock_cfg_access(struct pci_dev *dev) { }
-
 static inline struct pci_bus *pci_find_next_bus(const struct pci_bus *from)
 { return NULL; }
 static inline struct pci_dev *pci_get_slot(struct pci_bus *bus,
@@ -1782,17 +1753,36 @@
 							 struct pci_dev *dev)
 { return NULL; }
 static inline bool pci_ats_disabled(void) { return true; }
+
+static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr)
+{
+	return -EINVAL;
+}
+
+static inline int
+pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
+			       unsigned int max_vecs, unsigned int flags,
+			       struct irq_affinity *aff_desc)
+{
+	return -ENOSPC;
+}
 #endif /* CONFIG_PCI */
 
+static inline int
+pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
+		      unsigned int max_vecs, unsigned int flags)
+{
+	return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, flags,
+					      NULL);
+}
+
 #ifdef CONFIG_PCI_ATS
 /* Address Translation Service */
-void pci_ats_init(struct pci_dev *dev);
 int pci_enable_ats(struct pci_dev *dev, int ps);
 void pci_disable_ats(struct pci_dev *dev);
 int pci_ats_queue_depth(struct pci_dev *dev);
 int pci_ats_page_aligned(struct pci_dev *dev);
 #else
-static inline void pci_ats_init(struct pci_dev *d) { }
 static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; }
 static inline void pci_disable_ats(struct pci_dev *d) { }
 static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; }
@@ -1803,7 +1793,7 @@
 
 #include <asm/pci.h>
 
-/* These two functions provide almost identical functionality. Depennding
+/* These two functions provide almost identical functionality. Depending
  * on the architecture, one will be implemented as a wrapper around the
  * other (in drivers/pci/mmap.c).
  *
@@ -1872,25 +1862,9 @@
 	return dev_name(&pdev->dev);
 }
 
-
-/*
- * Some archs don't want to expose struct resource to userland as-is
- * in sysfs and /proc
- */
-#ifdef HAVE_ARCH_PCI_RESOURCE_TO_USER
 void pci_resource_to_user(const struct pci_dev *dev, int bar,
 			  const struct resource *rsrc,
 			  resource_size_t *start, resource_size_t *end);
-#else
-static inline void pci_resource_to_user(const struct pci_dev *dev, int bar,
-		const struct resource *rsrc, resource_size_t *start,
-		resource_size_t *end)
-{
-	*start = rsrc->start;
-	*end = rsrc->end;
-}
-#endif /* HAVE_ARCH_PCI_RESOURCE_TO_USER */
-
 
 /*
  * The world is not perfect and supplies us with broken PCI devices.
@@ -2032,10 +2006,6 @@
 extern u8 pci_dfl_cache_line_size;
 extern u8 pci_cache_line_size;
 
-extern unsigned long pci_hotplug_io_size;
-extern unsigned long pci_hotplug_mem_size;
-extern unsigned long pci_hotplug_bus_size;
-
 /* Architecture-specific versions may override these (weak) */
 void pcibios_disable_device(struct pci_dev *dev);
 void pcibios_set_master(struct pci_dev *dev);
@@ -2305,10 +2275,6 @@
 #ifdef CONFIG_OF
 struct device_node;
 struct irq_domain;
-void pci_set_of_node(struct pci_dev *dev);
-void pci_release_of_node(struct pci_dev *dev);
-void pci_set_bus_of_node(struct pci_bus *bus);
-void pci_release_bus_of_node(struct pci_bus *bus);
 struct irq_domain *pci_host_bridge_of_msi_domain(struct pci_bus *bus);
 int pci_parse_request_of_pci_ranges(struct device *dev,
 				    struct list_head *resources,
@@ -2318,10 +2284,6 @@
 struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus);
 
 #else	/* CONFIG_OF */
-static inline void pci_set_of_node(struct pci_dev *dev) { }
-static inline void pci_release_of_node(struct pci_dev *dev) { }
-static inline void pci_set_bus_of_node(struct pci_bus *bus) { }
-static inline void pci_release_bus_of_node(struct pci_bus *bus) { }
 static inline struct irq_domain *
 pci_host_bridge_of_msi_domain(struct pci_bus *bus) { return NULL; }
 static inline int pci_parse_request_of_pci_ranges(struct device *dev,
@@ -2435,4 +2397,7 @@
 #define pci_notice_ratelimited(pdev, fmt, arg...) \
 	dev_notice_ratelimited(&(pdev)->dev, fmt, ##arg)
 
+#define pci_info_ratelimited(pdev, fmt, arg...) \
+	dev_info_ratelimited(&(pdev)->dev, fmt, ##arg)
+
 #endif /* LINUX_PCI_H */
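
The stubs added above let !CONFIG_PCI code compile, while pci_alloc_irq_vectors() is now a plain wrapper available in both configurations. A minimal usage sketch of the allocation path (driver name, handler, and vector count are hypothetical, not part of this patch; assumes <linux/pci.h> and <linux/interrupt.h>):

	static irqreturn_t foo_handler(int irq, void *data)
	{
		return IRQ_HANDLED;
	}

	static int foo_setup_irqs(struct pci_dev *pdev)
	{
		int i, ret, nvec;

		/* Ask for 1..4 vectors of any type (MSI-X, MSI, or legacy). */
		nvec = pci_alloc_irq_vectors(pdev, 1, 4, PCI_IRQ_ALL_TYPES);
		if (nvec < 0)
			return nvec;	/* -ENOSPC if even min_vecs is impossible */

		for (i = 0; i < nvec; i++) {
			/* pci_irq_vector() maps vector index -> Linux IRQ number. */
			ret = request_irq(pci_irq_vector(pdev, i), foo_handler,
					  0, "foo", pdev);
			if (ret)
				goto err;
		}
		return 0;
	err:
		while (--i >= 0)
			free_irq(pci_irq_vector(pdev, i), pdev);
		pci_free_irq_vectors(pdev);
		return ret;
	}
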
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index f694eb2..b482e42 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -86,114 +86,14 @@
 #define pci_hp_initialize(slot, bus, nr, name) \
 	__pci_hp_initialize(slot, bus, nr, name, THIS_MODULE, KBUILD_MODNAME)
 
-/* PCI Setting Record (Type 0) */
-struct hpp_type0 {
-	u32 revision;
-	u8  cache_line_size;
-	u8  latency_timer;
-	u8  enable_serr;
-	u8  enable_perr;
-};
-
-/* PCI-X Setting Record (Type 1) */
-struct hpp_type1 {
-	u32 revision;
-	u8  max_mem_read;
-	u8  avg_max_split;
-	u16 tot_max_split;
-};
-
-/* PCI Express Setting Record (Type 2) */
-struct hpp_type2 {
-	u32 revision;
-	u32 unc_err_mask_and;
-	u32 unc_err_mask_or;
-	u32 unc_err_sever_and;
-	u32 unc_err_sever_or;
-	u32 cor_err_mask_and;
-	u32 cor_err_mask_or;
-	u32 adv_err_cap_and;
-	u32 adv_err_cap_or;
-	u16 pci_exp_devctl_and;
-	u16 pci_exp_devctl_or;
-	u16 pci_exp_lnkctl_and;
-	u16 pci_exp_lnkctl_or;
-	u32 sec_unc_err_sever_and;
-	u32 sec_unc_err_sever_or;
-	u32 sec_unc_err_mask_and;
-	u32 sec_unc_err_mask_or;
-};
-
-/*
- * _HPX PCI Express Setting Record (Type 3)
- */
-struct hpx_type3 {
-	u16 device_type;
-	u16 function_type;
-	u16 config_space_location;
-	u16 pci_exp_cap_id;
-	u16 pci_exp_cap_ver;
-	u16 pci_exp_vendor_id;
-	u16 dvsec_id;
-	u16 dvsec_rev;
-	u16 match_offset;
-	u32 match_mask_and;
-	u32 match_value;
-	u16 reg_offset;
-	u32 reg_mask_and;
-	u32 reg_mask_or;
-};
-
-struct hotplug_program_ops {
-	void (*program_type0)(struct pci_dev *dev, struct hpp_type0 *hpp);
-	void (*program_type1)(struct pci_dev *dev, struct hpp_type1 *hpp);
-	void (*program_type2)(struct pci_dev *dev, struct hpp_type2 *hpp);
-	void (*program_type3)(struct pci_dev *dev, struct hpx_type3 *hpp);
-};
-
-enum hpx_type3_dev_type {
-	HPX_TYPE_ENDPOINT	= BIT(0),
-	HPX_TYPE_LEG_END	= BIT(1),
-	HPX_TYPE_RC_END		= BIT(2),
-	HPX_TYPE_RC_EC		= BIT(3),
-	HPX_TYPE_ROOT_PORT	= BIT(4),
-	HPX_TYPE_UPSTREAM	= BIT(5),
-	HPX_TYPE_DOWNSTREAM	= BIT(6),
-	HPX_TYPE_PCI_BRIDGE	= BIT(7),
-	HPX_TYPE_PCIE_BRIDGE	= BIT(8),
-};
-
-enum hpx_type3_fn_type {
-	HPX_FN_NORMAL		= BIT(0),
-	HPX_FN_SRIOV_PHYS	= BIT(1),
-	HPX_FN_SRIOV_VIRT	= BIT(2),
-};
-
-enum hpx_type3_cfg_loc {
-	HPX_CFG_PCICFG		= 0,
-	HPX_CFG_PCIE_CAP	= 1,
-	HPX_CFG_PCIE_CAP_EXT	= 2,
-	HPX_CFG_VEND_CAP	= 3,
-	HPX_CFG_DVSEC		= 4,
-	HPX_CFG_MAX,
-};
-
 #ifdef CONFIG_ACPI
 #include <linux/acpi.h>
-int pci_acpi_program_hp_params(struct pci_dev *dev,
-			       const struct hotplug_program_ops *hp_ops);
 bool pciehp_is_native(struct pci_dev *bridge);
 int acpi_get_hp_hw_control_from_firmware(struct pci_dev *bridge);
 bool shpchp_is_native(struct pci_dev *bridge);
 int acpi_pci_check_ejectable(struct pci_bus *pbus, acpi_handle handle);
 int acpi_pci_detect_ejectable(acpi_handle handle);
 #else
-static inline int pci_acpi_program_hp_params(struct pci_dev *dev,
-				    const struct hotplug_program_ops *hp_ops)
-{
-	return -ENODEV;
-}
-
 static inline int acpi_get_hp_hw_control_from_firmware(struct pci_dev *bridge)
 {
 	return 0;
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index de1b75e..21a5724 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2134,6 +2134,7 @@
 #define PCI_VENDOR_ID_MYRICOM		0x14c1
 
 #define PCI_VENDOR_ID_MEDIATEK		0x14c3
+#define PCI_DEVICE_ID_MEDIATEK_7629	0x7629
 
 #define PCI_VENDOR_ID_TITAN		0x14D2
 #define PCI_DEVICE_ID_TITAN_010L	0x8001
@@ -2574,6 +2575,8 @@
 
 #define PCI_VENDOR_ID_ASMEDIA		0x1b21
 
+#define PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS	0x1c36
+
 #define PCI_VENDOR_ID_CIRCUITCO		0x1cc8
 #define PCI_SUBSYSTEM_ID_CIRCUITCO_MINNOWBOARD	0x0001
 
diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 7ccb875..9841568 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -5513,6 +5513,18 @@
 	uint8_t seed[FP_CONTEXT_TPM_BYTES];
 } __ec_align4;
 
+#define EC_CMD_FP_ENC_STATUS 0x0409
+
+/* FP TPM seed has been set or not */
+#define FP_ENC_STATUS_SEED_SET BIT(0)
+
+struct ec_response_fp_encryption_status {
+	/* Used bits in encryption engine status */
+	uint32_t valid_flags;
+	/* Encryption engine status */
+	uint32_t status;
+} __ec_align4;
+
 /*****************************************************************************/
 /* Touchpad MCU commands: range 0x0500-0x05FF */
 
diff --git a/include/linux/platform_data/db8500_thermal.h b/include/linux/platform_data/db8500_thermal.h
deleted file mode 100644
index 55e5575..0000000
--- a/include/linux/platform_data/db8500_thermal.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * db8500_thermal.h - DB8500 Thermal Management Implementation
- *
- * Copyright (C) 2012 ST-Ericsson
- * Copyright (C) 2012 Linaro Ltd.
- *
- * Author: Hongbo Zhang <hongbo.zhang@linaro.com>
- */
-
-#ifndef _DB8500_THERMAL_H_
-#define _DB8500_THERMAL_H_
-
-#include <linux/thermal.h>
-
-#define COOLING_DEV_MAX 8
-
-struct db8500_trip_point {
-	unsigned long temp;
-	enum thermal_trip_type type;
-	char cdev_name[COOLING_DEV_MAX][THERMAL_NAME_LENGTH];
-};
-
-struct db8500_thsens_platform_data {
-	struct db8500_trip_point trip_points[THERMAL_MAX_TRIPS];
-	int num_trips;
-};
-
-#endif /* _DB8500_THERMAL_H_ */
diff --git a/include/linux/platform_data/eth-netx.h b/include/linux/platform_data/eth-netx.h
deleted file mode 100644
index a3a6322..0000000
--- a/include/linux/platform_data/eth-netx.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (c) 2005 Sascha Hauer <s.hauer@pengutronix.de>, Pengutronix
- */
-
-#ifndef __ETH_NETX_H
-#define __ETH_NETX_H
-
-struct netxeth_platform_data {
-	unsigned int xcno;	/* number of xmac/xpec engine this eth uses */
-};
-
-#endif
diff --git a/include/linux/printk.h b/include/linux/printk.h
index cefd374..c09d67e 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -488,13 +488,6 @@
 extern void print_hex_dump(const char *level, const char *prefix_str,
 			   int prefix_type, int rowsize, int groupsize,
 			   const void *buf, size_t len, bool ascii);
-#if defined(CONFIG_DYNAMIC_DEBUG)
-#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len)	\
-	dynamic_hex_dump(prefix_str, prefix_type, 16, 1, buf, len, true)
-#else
-extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
-				 const void *buf, size_t len);
-#endif /* defined(CONFIG_DYNAMIC_DEBUG) */
 #else
 static inline void print_hex_dump(const char *level, const char *prefix_str,
 				  int prefix_type, int rowsize, int groupsize,
@@ -526,4 +519,19 @@
 }
 #endif
 
+/**
+ * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params
+ * @prefix_str: string to prefix each line with;
+ *  caller supplies trailing spaces for alignment if desired
+ * @prefix_type: controls whether prefix of an offset, address, or none
+ *  is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE)
+ * @buf: data blob to dump
+ * @len: number of bytes in the @buf
+ *
+ * Calls print_hex_dump(), with log level of KERN_DEBUG,
+ * rowsize of 16, groupsize of 1, and ASCII output included.
+ */
+#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len)	\
+	print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true)
+
 #endif
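
With the CONFIG_DYNAMIC_DEBUG special case removed, print_hex_dump_bytes() always expands to print_hex_dump_debug() with the documented defaults. A short illustrative call (buffer contents are made up):

	u8 buf[32];

	memset(buf, 0xa5, sizeof(buf));
	/* KERN_DEBUG output, 16 bytes per row, 1-byte groups, ASCII column: */
	print_hex_dump_bytes("foo: ", DUMP_PREFIX_OFFSET, buf, sizeof(buf));
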
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 24632a7..b2c9c46 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -262,7 +262,7 @@
 	int (*capture)(struct pwm_chip *chip, struct pwm_device *pwm,
 		       struct pwm_capture *result, unsigned long timeout);
 	int (*apply)(struct pwm_chip *chip, struct pwm_device *pwm,
-		     struct pwm_state *state);
+		     const struct pwm_state *state);
 	void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm,
 			  struct pwm_state *state);
 	struct module *owner;
@@ -316,7 +316,7 @@
 /* PWM user APIs */
 struct pwm_device *pwm_request(int pwm_id, const char *label);
 void pwm_free(struct pwm_device *pwm);
-int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state);
+int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state);
 int pwm_adjust_config(struct pwm_device *pwm);
 
 /**
diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h
deleted file mode 100644
index 034982c..0000000
--- a/include/linux/quicklist.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef LINUX_QUICKLIST_H
-#define LINUX_QUICKLIST_H
-/*
- * Fast allocations and disposal of pages. Pages must be in the condition
- * as needed after allocation when they are freed. Per cpu lists of pages
- * are kept that only contain node local pages.
- *
- * (C) 2007, SGI. Christoph Lameter <cl@linux.com>
- */
-#include <linux/kernel.h>
-#include <linux/gfp.h>
-#include <linux/percpu.h>
-
-#ifdef CONFIG_QUICKLIST
-
-struct quicklist {
-	void *page;
-	int nr_pages;
-};
-
-DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
-
-/*
- * The two key functions quicklist_alloc and quicklist_free are inline so
- * that they may be custom compiled for the platform.
- * Specifying a NULL ctor can remove constructor support. Specifying
- * a constant quicklist allows the determination of the exact address
- * in the per cpu area.
- *
- * The fast patch in quicklist_alloc touched only a per cpu cacheline and
- * the first cacheline of the page itself. There is minmal overhead involved.
- */
-static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *))
-{
-	struct quicklist *q;
-	void **p = NULL;
-
-	q =&get_cpu_var(quicklist)[nr];
-	p = q->page;
-	if (likely(p)) {
-		q->page = p[0];
-		p[0] = NULL;
-		q->nr_pages--;
-	}
-	put_cpu_var(quicklist);
-	if (likely(p))
-		return p;
-
-	p = (void *)__get_free_page(flags | __GFP_ZERO);
-	if (ctor && p)
-		ctor(p);
-	return p;
-}
-
-static inline void __quicklist_free(int nr, void (*dtor)(void *), void *p,
-	struct page *page)
-{
-	struct quicklist *q;
-
-	q = &get_cpu_var(quicklist)[nr];
-	*(void **)p = q->page;
-	q->page = p;
-	q->nr_pages++;
-	put_cpu_var(quicklist);
-}
-
-static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp)
-{
-	__quicklist_free(nr, dtor, pp, virt_to_page(pp));
-}
-
-static inline void quicklist_free_page(int nr, void (*dtor)(void *),
-							struct page *page)
-{
-	__quicklist_free(nr, dtor, page_address(page), page);
-}
-
-void quicklist_trim(int nr, void (*dtor)(void *),
-	unsigned long min_pages, unsigned long max_free);
-
-unsigned long quicklist_total_size(void);
-
-#else
-
-static inline unsigned long quicklist_total_size(void)
-{
-	return 0;
-}
-
-#endif
-
-#endif /* LINUX_QUICKLIST_H */
-
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index 179faab..fdd421b 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -60,41 +60,87 @@
 	rb_insert_augmented(node, &root->rb_root, augment);
 }
 
-#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield,	\
-			     rbtype, rbaugmented, rbcompute)		\
+/*
+ * Template for declaring augmented rbtree callbacks (generic case)
+ *
+ * RBSTATIC:    'static' or empty
+ * RBNAME:      name of the rb_augment_callbacks structure
+ * RBSTRUCT:    struct type of the tree nodes
+ * RBFIELD:     name of struct rb_node field within RBSTRUCT
+ * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
+ * RBCOMPUTE:   name of function that recomputes the RBAUGMENTED data
+ */
+
+#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,				\
+			     RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE)	\
 static inline void							\
-rbname ## _propagate(struct rb_node *rb, struct rb_node *stop)		\
+RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop)		\
 {									\
 	while (rb != stop) {						\
-		rbstruct *node = rb_entry(rb, rbstruct, rbfield);	\
-		rbtype augmented = rbcompute(node);			\
-		if (node->rbaugmented == augmented)			\
+		RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD);	\
+		if (RBCOMPUTE(node, true))				\
 			break;						\
-		node->rbaugmented = augmented;				\
-		rb = rb_parent(&node->rbfield);				\
+		rb = rb_parent(&node->RBFIELD);				\
 	}								\
 }									\
 static inline void							\
-rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)		\
+RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)		\
 {									\
-	rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);		\
-	rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);		\
-	new->rbaugmented = old->rbaugmented;				\
+	RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);		\
+	RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);		\
+	new->RBAUGMENTED = old->RBAUGMENTED;				\
 }									\
 static void								\
-rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)	\
+RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)	\
 {									\
-	rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);		\
-	rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);		\
-	new->rbaugmented = old->rbaugmented;				\
-	old->rbaugmented = rbcompute(old);				\
+	RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);		\
+	RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);		\
+	new->RBAUGMENTED = old->RBAUGMENTED;				\
+	RBCOMPUTE(old, false);						\
 }									\
-rbstatic const struct rb_augment_callbacks rbname = {			\
-	.propagate = rbname ## _propagate,				\
-	.copy = rbname ## _copy,					\
-	.rotate = rbname ## _rotate					\
+RBSTATIC const struct rb_augment_callbacks RBNAME = {			\
+	.propagate = RBNAME ## _propagate,				\
+	.copy = RBNAME ## _copy,					\
+	.rotate = RBNAME ## _rotate					\
 };
 
+/*
+ * Template for declaring augmented rbtree callbacks,
+ * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
+ *
+ * RBSTATIC:    'static' or empty
+ * RBNAME:      name of the rb_augment_callbacks structure
+ * RBSTRUCT:    struct type of the tree nodes
+ * RBFIELD:     name of struct rb_node field within RBSTRUCT
+ * RBTYPE:      type of the RBAUGMENTED field
+ * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
+ * RBCOMPUTE:   name of function that returns the per-node RBTYPE scalar
+ */
+
+#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD,	      \
+				 RBTYPE, RBAUGMENTED, RBCOMPUTE)	      \
+static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit)	      \
+{									      \
+	RBSTRUCT *child;						      \
+	RBTYPE max = RBCOMPUTE(node);					      \
+	if (node->RBFIELD.rb_left) {					      \
+		child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD);   \
+		if (child->RBAUGMENTED > max)				      \
+			max = child->RBAUGMENTED;			      \
+	}								      \
+	if (node->RBFIELD.rb_right) {					      \
+		child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD);  \
+		if (child->RBAUGMENTED > max)				      \
+			max = child->RBAUGMENTED;			      \
+	}								      \
+	if (exit && node->RBAUGMENTED == max)				      \
+		return true;						      \
+	node->RBAUGMENTED = max;					      \
+	return false;							      \
+}									      \
+RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,					      \
+		     RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)
+
 
 #define	RB_RED		0
 #define	RB_BLACK	1
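
A sketch of how the new RB_DECLARE_CALLBACKS_MAX form is meant to be used (struct and field names here are hypothetical, patterned on the interval-tree style users this template targets):

	struct mynode {
		struct rb_node rb;
		unsigned long value;		/* per-node scalar */
		unsigned long subtree_max;	/* max of ->value over subtree */
	};

	static inline unsigned long mynode_value(struct mynode *node)
	{
		return node->value;
	}

	/*
	 * Generates mytree_cb_compute_max() plus the propagate/copy/rotate
	 * callbacks, and defines:
	 *	static const struct rb_augment_callbacks mytree_cb;
	 * which is then passed to rb_insert_augmented()/rb_erase_augmented().
	 */
	RB_DECLARE_CALLBACKS_MAX(static, mytree_cb,
				 struct mynode, rb,
				 unsigned long, subtree_max, mynode_value)
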
diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h
index 563290f..75c97e4 100644
--- a/include/linux/rcuwait.h
+++ b/include/linux/rcuwait.h
@@ -6,16 +6,11 @@
 
 /*
  * rcuwait provides a way of blocking and waking up a single
- * task in an rcu-safe manner; where it is forbidden to use
- * after exit_notify(). task_struct is not properly rcu protected,
- * unless dealing with rcu-aware lists, ie: find_task_by_*().
+ * task in an rcu-safe manner.
  *
- * Alternatively we have task_rcu_dereference(), but the return
- * semantics have different implications which would break the
- * wakeup side. The only time @task is non-nil is when a user is
- * blocked (or checking if it needs to) on a condition, and reset
- * as soon as we know that the condition has succeeded and are
- * awoken.
+ * The only time @task is non-nil is when a user is blocked (or
+ * checking if it needs to) on a condition, and reset as soon as we
+ * know that the condition has succeeded and are awoken.
  */
 struct rcuwait {
 	struct task_struct __rcu *task;
@@ -37,13 +32,6 @@
  */
 #define rcuwait_wait_event(w, condition)				\
 ({									\
-	/*								\
-	 * Complain if we are called after do_exit()/exit_notify(),     \
-	 * as we cannot rely on the rcu critical region for the		\
-	 * wakeup side.							\
-	 */                                                             \
-	WARN_ON(current->exit_state);                                   \
-									\
 	rcu_assign_pointer((w)->task, current);				\
 	for (;;) {							\
 		/*							\
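
With the exit_state WARN gone, rcuwait still pairs one blocking side with one wakeup side. A minimal sketch of that pairing (the flag and its writer are illustrative, not from this patch):

	static struct rcuwait my_wait;		/* rcuwait_init(&my_wait) at setup */
	static bool done;

	/* Waiter: publishes itself in my_wait.task, sleeps until 'done'. */
	rcuwait_wait_event(&my_wait, READ_ONCE(done));

	/* Waker (another context): set the condition, then wake. */
	WRITE_ONCE(done, true);
	rcuwait_wake_up(&my_wait);
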
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 70db597..2c2e56b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1130,7 +1130,10 @@
 
 	struct tlbflush_unmap_batch	tlb_ubc;
 
-	struct rcu_head			rcu;
+	union {
+		refcount_t		rcu_users;
+		struct rcu_head		rcu;
+	};
 
 	/* Cache last used pipe for splice(): */
 	struct pipe_inode_info		*splice_pipe;
@@ -1839,7 +1842,10 @@
  * running or not.
  */
 #ifndef vcpu_is_preempted
-# define vcpu_is_preempted(cpu)	false
+static inline bool vcpu_is_preempted(int cpu)
+{
+	return false;
+}
 #endif
 
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
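
Turning the vcpu_is_preempted() fallback into a static inline keeps the same always-false answer but gives the cpu argument real type checking and evaluation, so call sites compile identically whether or not an architecture overrides it. A typical call site, sketched (the spin loop is illustrative; real users are the mutex/rwsem owner-spin heuristics):

	/* Stop burning cycles once the lock owner's vCPU has been
	 * scheduled out by the hypervisor. */
	if (vcpu_is_preempted(task_cpu(owner)))
		return false;	/* stop spinning, sleep instead */
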
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 4a79440..e677001 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -362,16 +362,16 @@
 
 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 {
+	if (current->mm != mm)
+		return;
 	if (likely(!(atomic_read(&mm->membarrier_state) &
 		     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
 		return;
 	sync_core_before_usermode();
 }
 
-static inline void membarrier_execve(struct task_struct *t)
-{
-	atomic_set(&t->mm->membarrier_state, 0);
-}
+extern void membarrier_exec_mmap(struct mm_struct *mm);
+
 #else
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
 static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -380,7 +380,7 @@
 {
 }
 #endif
-static inline void membarrier_execve(struct task_struct *t)
+static inline void membarrier_exec_mmap(struct mm_struct *mm)
 {
 }
 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 3d90ed8..4b1c3b6 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -119,7 +119,7 @@
 		__put_task_struct(t);
 }
 
-struct task_struct *task_rcu_dereference(struct task_struct **ptask);
+void put_task_struct_rcu_user(struct task_struct *task);
 
 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
 extern int arch_task_struct_size __read_mostly;
diff --git a/include/linux/security.h b/include/linux/security.h
index 5f7441a..a8d59d6 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -77,6 +77,54 @@
 	LSM_POLICY_CHANGE,
 };
 
+/*
+ * These are reasons that can be passed to the security_locked_down()
+ * LSM hook. Lockdown reasons that protect kernel integrity (ie, the
+ * ability for userland to modify kernel code) are placed before
+ * LOCKDOWN_INTEGRITY_MAX.  Lockdown reasons that protect kernel
+ * confidentiality (ie, the ability for userland to extract
+ * information from the running kernel that would otherwise be
+ * restricted) are placed before LOCKDOWN_CONFIDENTIALITY_MAX.
+ *
+ * LSM authors should note that the semantics of any given lockdown
+ * reason are not guaranteed to be stable - the same reason may block
+ * one set of features in one kernel release, and a slightly different
+ * set of features in a later kernel release. LSMs that seek to expose
+ * lockdown policy at any level of granularity other than "none",
+ * "integrity" or "confidentiality" are responsible for either
+ * ensuring that they expose a consistent level of functionality to
+ * userland, or ensuring that userland is aware that this is
+ * potentially a moving target. It is easy to misuse this information
+ * in a way that could break userspace. Please be careful not to do
+ * so.
+ *
+ * If you add to this, remember to extend lockdown_reasons in
+ * security/lockdown/lockdown.c.
+ */
+enum lockdown_reason {
+	LOCKDOWN_NONE,
+	LOCKDOWN_MODULE_SIGNATURE,
+	LOCKDOWN_DEV_MEM,
+	LOCKDOWN_KEXEC,
+	LOCKDOWN_HIBERNATION,
+	LOCKDOWN_PCI_ACCESS,
+	LOCKDOWN_IOPORT,
+	LOCKDOWN_MSR,
+	LOCKDOWN_ACPI_TABLES,
+	LOCKDOWN_PCMCIA_CIS,
+	LOCKDOWN_TIOCSSERIAL,
+	LOCKDOWN_MODULE_PARAMETERS,
+	LOCKDOWN_MMIOTRACE,
+	LOCKDOWN_DEBUGFS,
+	LOCKDOWN_INTEGRITY_MAX,
+	LOCKDOWN_KCORE,
+	LOCKDOWN_KPROBES,
+	LOCKDOWN_BPF_READ,
+	LOCKDOWN_PERF,
+	LOCKDOWN_TRACEFS,
+	LOCKDOWN_CONFIDENTIALITY_MAX,
+};
+
 /* These functions are in security/commoncap.c */
 extern int cap_capable(const struct cred *cred, struct user_namespace *ns,
 		       int cap, unsigned int opts);
@@ -195,6 +243,7 @@
 
 /* prototypes */
 extern int security_init(void);
+extern int early_security_init(void);
 
 /* Security operations */
 int security_binder_set_context_mgr(struct task_struct *mgr);
@@ -259,7 +308,8 @@
 					struct qstr *name,
 					const struct cred *old,
 					struct cred *new);
-
+int security_path_notify(const struct path *path, u64 mask,
+					unsigned int obj_type);
 int security_inode_alloc(struct inode *inode);
 void security_inode_free(struct inode *inode);
 int security_inode_init_security(struct inode *inode, struct inode *dir,
@@ -387,11 +437,11 @@
 int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen);
 int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid);
 void security_release_secctx(char *secdata, u32 seclen);
-
 void security_inode_invalidate_secctx(struct inode *inode);
 int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen);
 int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen);
 int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen);
+int security_locked_down(enum lockdown_reason what);
 #else /* CONFIG_SECURITY */
 
 static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data)
@@ -423,6 +473,11 @@
 	return 0;
 }
 
+static inline int early_security_init(void)
+{
+	return 0;
+}
+
 static inline int security_binder_set_context_mgr(struct task_struct *mgr)
 {
 	return 0;
@@ -621,6 +676,12 @@
 	return 0;
 }
 
+static inline int security_path_notify(const struct path *path, u64 mask,
+				unsigned int obj_type)
+{
+	return 0;
+}
+
 static inline int security_inode_alloc(struct inode *inode)
 {
 	return 0;
@@ -1204,6 +1265,10 @@
 {
 	return -EOPNOTSUPP;
 }
+static inline int security_locked_down(enum lockdown_reason what)
+{
+	return 0;
+}
 #endif	/* CONFIG_SECURITY */
 
 #ifdef CONFIG_SECURITY_NETWORK
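
Call sites gate a sensitive operation on the new security_locked_down() hook and propagate its error. A minimal sketch (the file-open context and the reason chosen are illustrative):

	static int foo_open_devmem(struct inode *inode, struct file *filp)
	{
		int ret;

		/* Refuse raw memory access while the kernel is locked down. */
		ret = security_locked_down(LOCKDOWN_DEV_MEM);
		if (ret)
			return ret;	/* typically -EPERM */

		return 0;
	}
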
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 9443caf..0f80123 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -69,7 +69,7 @@
 
 	/* These are for internal use */
 	struct list_head list;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
 	/* ID in shrinker_idr */
 	int id;
 #endif
@@ -81,6 +81,11 @@
 /* Flags */
 #define SHRINKER_NUMA_AWARE	(1 << 0)
 #define SHRINKER_MEMCG_AWARE	(1 << 1)
+/*
+ * For now this flag only makes sense when the shrinker is also
+ * MEMCG_AWARE; a non-MEMCG_AWARE shrinker should not have this flag set.
+ */
+#define SHRINKER_NONSLAB	(1 << 2)
 
 extern int prealloc_shrinker(struct shrinker *shrinker);
 extern void register_shrinker_prepared(struct shrinker *shrinker);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 907209c..e7d3b1a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4144,8 +4144,17 @@
 
 	return NULL;
 }
+
+static inline void skb_ext_reset(struct sk_buff *skb)
+{
+	if (unlikely(skb->active_extensions)) {
+		__skb_ext_put(skb->extensions);
+		skb->active_extensions = 0;
+	}
+}
 #else
 static inline void skb_ext_put(struct sk_buff *skb) {}
+static inline void skb_ext_reset(struct sk_buff *skb) {}
 static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
 static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
 static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 56c9c7e..ab2b98a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -595,68 +595,6 @@
 	return __kmalloc_node(size, flags, node);
 }
 
-struct memcg_cache_array {
-	struct rcu_head rcu;
-	struct kmem_cache *entries[0];
-};
-
-/*
- * This is the main placeholder for memcg-related information in kmem caches.
- * Both the root cache and the child caches will have it. For the root cache,
- * this will hold a dynamically allocated array large enough to hold
- * information about the currently limited memcgs in the system. To allow the
- * array to be accessed without taking any locks, on relocation we free the old
- * version only after a grace period.
- *
- * Root and child caches hold different metadata.
- *
- * @root_cache:	Common to root and child caches.  NULL for root, pointer to
- *		the root cache for children.
- *
- * The following fields are specific to root caches.
- *
- * @memcg_caches: kmemcg ID indexed table of child caches.  This table is
- *		used to index child cachces during allocation and cleared
- *		early during shutdown.
- *
- * @root_caches_node: List node for slab_root_caches list.
- *
- * @children:	List of all child caches.  While the child caches are also
- *		reachable through @memcg_caches, a child cache remains on
- *		this list until it is actually destroyed.
- *
- * The following fields are specific to child caches.
- *
- * @memcg:	Pointer to the memcg this cache belongs to.
- *
- * @children_node: List node for @root_cache->children list.
- *
- * @kmem_caches_node: List node for @memcg->kmem_caches list.
- */
-struct memcg_cache_params {
-	struct kmem_cache *root_cache;
-	union {
-		struct {
-			struct memcg_cache_array __rcu *memcg_caches;
-			struct list_head __root_caches_node;
-			struct list_head children;
-			bool dying;
-		};
-		struct {
-			struct mem_cgroup *memcg;
-			struct list_head children_node;
-			struct list_head kmem_caches_node;
-			struct percpu_ref refcnt;
-
-			void (*work_fn)(struct kmem_cache *);
-			union {
-				struct rcu_head rcu_head;
-				struct work_struct work;
-			};
-		};
-	};
-};
-
 int memcg_update_all_caches(int num_memcgs);
 
 /**
diff --git a/include/linux/string.h b/include/linux/string.h
index 4deb11f..b2f9df7 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -474,8 +474,9 @@
  * But this can lead to bugs due to typos, or if prefix is a pointer
  * and not a constant. Instead use str_has_prefix().
  *
- * Returns: 0 if @str does not start with @prefix
-         strlen(@prefix) if @str does start with @prefix
+ * Returns:
+ * * strlen(@prefix) if @str starts with @prefix
+ * * 0 if @str does not start with @prefix
  */
 static __always_inline size_t str_has_prefix(const char *str, const char *prefix)
 {
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index c7f38e8..f860372 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -87,6 +87,7 @@
 					      int has_died);
 
 	struct cache_head *	(*alloc)(void);
+	void			(*flush)(void);
 	int			(*match)(struct cache_head *orig, struct cache_head *new);
 	void			(*init)(struct cache_head *orig, struct cache_head *new);
 	void			(*update)(struct cache_head *orig, struct cache_head *new);
@@ -107,9 +108,9 @@
 	/* fields for communication over channel */
 	struct list_head	queue;
 
-	atomic_t		readers;		/* how many time is /chennel open */
-	time_t			last_close;		/* if no readers, when did last close */
-	time_t			last_warn;		/* when we last warned about no readers */
+	atomic_t		writers;		/* how many times /channel is open */
+	time_t			last_close;		/* if no writers, when did last close */
+	time_t			last_warn;		/* when we last warned about no writers */
 
 	union {
 		struct proc_dir_entry	*procfs;
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 27536b9..a6ef351 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -242,9 +242,6 @@
 void		rpc_sleep_on_priority(struct rpc_wait_queue *,
 					struct rpc_task *,
 					int priority);
-void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
-		struct rpc_wait_queue *queue,
-		struct rpc_task *task);
 void		rpc_wake_up_queued_task(struct rpc_wait_queue *,
 					struct rpc_task *);
 void		rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *,
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 981f0d7..40f6588 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -42,6 +42,7 @@
 
 #ifndef SVC_RDMA_H
 #define SVC_RDMA_H
+#include <linux/llist.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/rpc_rdma.h>
@@ -107,8 +108,7 @@
 	struct list_head     sc_read_complete_q;
 	struct work_struct   sc_work;
 
-	spinlock_t	     sc_recv_lock;
-	struct list_head     sc_recv_ctxts;
+	struct llist_head    sc_recv_ctxts;
 };
 /* sc_flags */
 #define RDMAXPRT_CONN_PENDING	3
@@ -125,6 +125,7 @@
 #define RPCSVC_MAXPAYLOAD_RDMA	RPCSVC_MAXPAYLOAD
 
 struct svc_rdma_recv_ctxt {
+	struct llist_node	rc_node;
 	struct list_head	rc_list;
 	struct ib_recv_wr	rc_recv_wr;
 	struct ib_cqe		rc_cqe;
@@ -200,7 +201,6 @@
 #endif
 
 /* svc_rdma.c */
-extern struct workqueue_struct *svc_rdma_wq;
 extern int svc_rdma_init(void);
 extern void svc_rdma_cleanup(void);
 
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 8a87d8b..f33e501 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -186,7 +186,7 @@
 extern void xdr_shift_buf(struct xdr_buf *, size_t);
 extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *);
 extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int);
-extern int xdr_buf_read_netobj(struct xdr_buf *, struct xdr_netobj *, unsigned int);
+extern int xdr_buf_read_mic(struct xdr_buf *, struct xdr_netobj *, unsigned int);
 extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
 extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
 
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 13e108b..d783e15 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -352,6 +352,7 @@
 void			xprt_request_enqueue_transmit(struct rpc_task *task);
 void			xprt_request_enqueue_receive(struct rpc_task *task);
 void			xprt_request_wait_receive(struct rpc_task *task);
+void			xprt_request_dequeue_xprt(struct rpc_task *task);
 bool			xprt_request_need_retransmit(struct rpc_task *task);
 void			xprt_transmit(struct rpc_task *task);
 void			xprt_end_transmit(struct rpc_task *task);
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 86fc38f..16c239e 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -49,9 +49,9 @@
  * fully-chunked NFS message (read chunks are the largest). Note only
  * a single chunk type per message is supported currently.
  */
-#define RPCRDMA_MIN_SLOT_TABLE	(2U)
+#define RPCRDMA_MIN_SLOT_TABLE	(4U)
 #define RPCRDMA_DEF_SLOT_TABLE	(128U)
-#define RPCRDMA_MAX_SLOT_TABLE	(256U)
+#define RPCRDMA_MAX_SLOT_TABLE	(16384U)
 
 #define RPCRDMA_MIN_INLINE  (1024)	/* min inline thresh */
 #define RPCRDMA_DEF_INLINE  (4096)	/* default inline thresh */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index de2c67a..063c0c1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -340,6 +340,7 @@
 extern void lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
+extern void deactivate_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
 extern void swap_setup(void);
 
@@ -364,6 +365,7 @@
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
+extern unsigned long reclaim_pages(struct list_head *page_list);
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index 3e2a80c..96305a6 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -53,18 +53,4 @@
 extern const struct blk_integrity_profile t10_pi_type3_crc;
 extern const struct blk_integrity_profile t10_pi_type3_ip;
 
-#ifdef CONFIG_BLK_DEV_INTEGRITY
-extern void t10_pi_prepare(struct request *rq, u8 protection_type);
-extern void t10_pi_complete(struct request *rq, u8 protection_type,
-			    unsigned int intervals);
-#else
-static inline void t10_pi_complete(struct request *rq, u8 protection_type,
-				   unsigned int intervals)
-{
-}
-static inline void t10_pi_prepare(struct request *rq, u8 protection_type)
-{
-}
-#endif
-
 #endif
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 8d8821b..659a440 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -134,7 +134,7 @@
 	WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
 }
 
-static __always_inline bool
+static __always_inline __must_check bool
 check_copy_size(const void *addr, size_t bytes, bool is_source)
 {
 	int sz = __compiletime_object_size(addr);
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 34a0385..70bbdc3 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -55,7 +55,7 @@
  * as usual) and both source and destination can trigger faults.
  */
 
-static __always_inline unsigned long
+static __always_inline __must_check unsigned long
 __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
 {
 	kasan_check_write(to, n);
@@ -63,7 +63,7 @@
 	return raw_copy_from_user(to, from, n);
 }
 
-static __always_inline unsigned long
+static __always_inline __must_check unsigned long
 __copy_from_user(void *to, const void __user *from, unsigned long n)
 {
 	might_fault();
@@ -85,7 +85,7 @@
  * The caller should also make sure he pins the user space address
  * so that we don't result in page fault and sleep.
  */
-static __always_inline unsigned long
+static __always_inline __must_check unsigned long
 __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
 {
 	kasan_check_read(from, n);
@@ -93,7 +93,7 @@
 	return raw_copy_to_user(to, from, n);
 }
 
-static __always_inline unsigned long
+static __always_inline __must_check unsigned long
 __copy_to_user(void __user *to, const void *from, unsigned long n)
 {
 	might_fault();
@@ -103,7 +103,7 @@
 }
 
 #ifdef INLINE_COPY_FROM_USER
-static inline unsigned long
+static inline __must_check unsigned long
 _copy_from_user(void *to, const void __user *from, unsigned long n)
 {
 	unsigned long res = n;
@@ -117,12 +117,12 @@
 	return res;
 }
 #else
-extern unsigned long
+extern __must_check unsigned long
 _copy_from_user(void *, const void __user *, unsigned long);
 #endif
 
 #ifdef INLINE_COPY_TO_USER
-static inline unsigned long
+static inline __must_check unsigned long
 _copy_to_user(void __user *to, const void *from, unsigned long n)
 {
 	might_fault();
@@ -133,7 +133,7 @@
 	return n;
 }
 #else
-extern unsigned long
+extern __must_check unsigned long
 _copy_to_user(void __user *, const void *, unsigned long);
 #endif
 
@@ -222,8 +222,9 @@
 
 #ifndef ARCH_HAS_NOCACHE_UACCESS
 
-static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
-				const void __user *from, unsigned long n)
+static inline __must_check unsigned long
+__copy_from_user_inatomic_nocache(void *to, const void __user *from,
+				  unsigned long n)
 {
 	return __copy_from_user_inatomic(to, from, n);
 }
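
With __must_check, silently dropping the "bytes not copied" return value now triggers a compiler warning. The expected calling pattern, for reference (the argument struct is hypothetical):

	struct foo_args args;

	/* A nonzero return is the number of bytes that could NOT be copied. */
	if (copy_from_user(&args, user_ptr, sizeof(args)))
		return -EFAULT;
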
diff --git a/include/linux/verification.h b/include/linux/verification.h
index 32d990d..911ab7c 100644
--- a/include/linux/verification.h
+++ b/include/linux/verification.h
@@ -32,6 +32,7 @@
 #ifdef CONFIG_SYSTEM_DATA_VERIFICATION
 
 struct key;
+struct pkcs7_message;
 
 extern int verify_pkcs7_signature(const void *data, size_t len,
 				  const void *raw_pkcs7, size_t pkcs7_len,
@@ -41,6 +42,15 @@
 						      const void *data, size_t len,
 						      size_t asn1hdrlen),
 				  void *ctx);
+extern int verify_pkcs7_message_sig(const void *data, size_t len,
+				    struct pkcs7_message *pkcs7,
+				    struct key *trusted_keys,
+				    enum key_being_used_for usage,
+				    int (*view_content)(void *ctx,
+							const void *data,
+							size_t len,
+							size_t asn1hdrlen),
+				    void *ctx);
 
 #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION
 extern int verify_pefile_signature(const void *pebuf, unsigned pelen,
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index dfa718f..4e78094 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -53,15 +53,21 @@
 	unsigned long va_start;
 	unsigned long va_end;
 
-	/*
-	 * Largest available free size in subtree.
-	 */
-	unsigned long subtree_max_size;
-	unsigned long flags;
 	struct rb_node rb_node;         /* address sorted rbtree */
 	struct list_head list;          /* address sorted list */
-	struct llist_node purge_list;    /* "lazy purge" list */
-	struct vm_struct *vm;
+
+	/*
+	 * The following three variables can be packed, because
+	 * a vmap_area object is always in one of the three states:
+	 *    1) in "free" tree (root is free_vmap_area_root)
+	 *    2) in "busy" tree (root is vmap_area_root)
+	 *    3) in purge list  (head is vmap_purge_list)
+	 */
+	union {
+		unsigned long subtree_max_size; /* in "free" tree */
+		struct vm_struct *vm;           /* in "busy" tree */
+		struct llist_node purge_list;   /* in purge list */
+	};
 };
 
 /*
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 7238865e..51bf430 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -46,6 +46,8 @@
 
 void zpool_destroy_pool(struct zpool *pool);
 
+bool zpool_malloc_support_movable(struct zpool *pool);
+
 int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp,
 			unsigned long *handle);
 
@@ -90,6 +92,7 @@
 			struct zpool *zpool);
 	void (*destroy)(void *pool);
 
+	bool malloc_support_movable;
 	int (*malloc)(void *pool, size_t size, gfp_t gfp,
 				unsigned long *handle);
 	void (*free)(void *pool, unsigned long handle);
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index aef38c1..dfd919b 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -71,6 +71,7 @@
 				tw_pad		: 2,	/* 2 bits hole */
 				tw_tos		: 8;
 	u32			tw_txhash;
+	u32			tw_priority;
 	struct timer_list	tw_timer;
 	struct inet_bind_bucket	*tw_tb;
 };
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 8dfc656..009605c 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -981,7 +981,7 @@
  *	upper-layer output functions
  */
 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
-	     __u32 mark, struct ipv6_txoptions *opt, int tclass);
+	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority);
 
 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr);
 
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 2655e03..001d294 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -889,6 +889,8 @@
 	NFT_CHAIN_HW_OFFLOAD		= 0x2,
 };
 
+#define NFT_CHAIN_POLICY_UNSET		U8_MAX
+
 /**
  *	struct nft_chain - nf_tables chain
  *
@@ -1181,6 +1183,10 @@
 					   const struct nlattr *nla,
 					   u8 genmask);
 
+void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx,
+				    struct nft_flowtable *flowtable,
+				    enum nft_trans_phase phase);
+
 void nft_register_flowtable_type(struct nf_flowtable_type *type);
 void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
 
diff --git a/include/net/route.h b/include/net/route.h
index dfce19c..6c51684 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -53,10 +53,11 @@
 	unsigned int		rt_flags;
 	__u16			rt_type;
 	__u8			rt_is_input;
-	u8			rt_gw_family;
+	__u8			rt_uses_gateway;
 
 	int			rt_iif;
 
+	u8			rt_gw_family;
 	/* Info on neighbour */
 	union {
 		__be32		rt_gw4;
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 43f5b7e..637548d 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -494,6 +494,11 @@
 	return q;
 }
 
+static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc)
+{
+	return rcu_dereference_bh(qdisc->dev_queue->qdisc);
+}
+
 static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc)
 {
 	return qdisc->dev_queue->qdisc_sleeping;
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index f6a4eaa..a138306 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -451,20 +451,81 @@
 
 	TP_STRUCT__entry(
 		__field(const void *, r_xprt)
+		__string(addr, rpcrdma_addrstr(r_xprt))
+		__string(port, rpcrdma_portstr(r_xprt))
 		__field(unsigned int, count)
 	),
 
 	TP_fast_assign(
 		__entry->r_xprt = r_xprt;
 		__entry->count = count;
+		__assign_str(addr, rpcrdma_addrstr(r_xprt));
+		__assign_str(port, rpcrdma_portstr(r_xprt));
 	),
 
-	TP_printk("r_xprt=%p: created %u MRs",
-		__entry->r_xprt, __entry->count
+	TP_printk("peer=[%s]:%s r_xprt=%p: created %u MRs",
+		__get_str(addr), __get_str(port), __entry->r_xprt,
+		__entry->count
 	)
 );
 
-DEFINE_RXPRT_EVENT(xprtrdma_nomrs);
+TRACE_EVENT(xprtrdma_mr_get,
+	TP_PROTO(
+		const struct rpcrdma_req *req
+	),
+
+	TP_ARGS(req),
+
+	TP_STRUCT__entry(
+		__field(const void *, req)
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, xid)
+	),
+
+	TP_fast_assign(
+		const struct rpc_rqst *rqst = &req->rl_slot;
+
+		__entry->req = req;
+		__entry->task_id = rqst->rq_task->tk_pid;
+		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+	),
+
+	TP_printk("task:%u@%u xid=0x%08x req=%p",
+		__entry->task_id, __entry->client_id, __entry->xid,
+		__entry->req
+	)
+);
+
+TRACE_EVENT(xprtrdma_nomrs,
+	TP_PROTO(
+		const struct rpcrdma_req *req
+	),
+
+	TP_ARGS(req),
+
+	TP_STRUCT__entry(
+		__field(const void *, req)
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, xid)
+	),
+
+	TP_fast_assign(
+		const struct rpc_rqst *rqst = &req->rl_slot;
+
+		__entry->req = req;
+		__entry->task_id = rqst->rq_task->tk_pid;
+		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+	),
+
+	TP_printk("task:%u@%u xid=0x%08x req=%p",
+		__entry->task_id, __entry->client_id, __entry->xid,
+		__entry->req
+	)
+);
 
 DEFINE_RDCH_EVENT(read);
 DEFINE_WRCH_EVENT(write);
@@ -623,21 +684,21 @@
 
 TRACE_EVENT(xprtrdma_post_recv,
 	TP_PROTO(
-		const struct ib_cqe *cqe
+		const struct rpcrdma_rep *rep
 	),
 
-	TP_ARGS(cqe),
+	TP_ARGS(rep),
 
 	TP_STRUCT__entry(
-		__field(const void *, cqe)
+		__field(const void *, rep)
 	),
 
 	TP_fast_assign(
-		__entry->cqe = cqe;
+		__entry->rep = rep;
 	),
 
-	TP_printk("cqe=%p",
-		__entry->cqe
+	TP_printk("rep=%p",
+		__entry->rep
 	)
 );
 
@@ -715,14 +776,15 @@
 	TP_ARGS(wc),
 
 	TP_STRUCT__entry(
-		__field(const void *, cqe)
+		__field(const void *, rep)
 		__field(u32, byte_len)
 		__field(unsigned int, status)
 		__field(u32, vendor_err)
 	),
 
 	TP_fast_assign(
-		__entry->cqe = wc->wr_cqe;
+		__entry->rep = container_of(wc->wr_cqe, struct rpcrdma_rep,
+					    rr_cqe);
 		__entry->status = wc->status;
 		if (wc->status) {
 			__entry->byte_len = 0;
@@ -733,8 +795,8 @@
 		}
 	),
 
-	TP_printk("cqe=%p %u bytes: %s (%u/0x%x)",
-		__entry->cqe, __entry->byte_len,
+	TP_printk("rep=%p %u bytes: %s (%u/0x%x)",
+		__entry->rep, __entry->byte_len,
 		rdma_show_wc_status(__entry->status),
 		__entry->status, __entry->vendor_err
 	)
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 3a27335..c2ce648 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -66,8 +66,9 @@
 	),
 
 	TP_fast_assign(
-		strncpy(__entry->name,
-			mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", 32);
+		strscpy_pad(__entry->name,
+			    mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)",
+			    32);
 		__entry->ino = mapping ? mapping->host->i_ino : 0;
 		__entry->index = page->index;
 	),
@@ -110,8 +111,8 @@
 		struct backing_dev_info *bdi = inode_to_bdi(inode);
 
 		/* may be called for files on pseudo FSes w/ unregistered bdi */
-		strncpy(__entry->name,
-			bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
+		strscpy_pad(__entry->name,
+			    bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
 		__entry->ino		= inode->i_ino;
 		__entry->state		= inode->i_state;
 		__entry->flags		= flags;
@@ -316,8 +317,8 @@
 	),
 
 	TP_fast_assign(
-		strncpy(__entry->name,
-			dev_name(inode_to_bdi(inode)->dev), 32);
+		strscpy_pad(__entry->name,
+			    dev_name(inode_to_bdi(inode)->dev), 32);
 		__entry->ino		= inode->i_ino;
 		__entry->sync_mode	= wbc->sync_mode;
 		__entry->cgroup_ino	= __trace_wbc_assign_cgroup(wbc);
@@ -360,8 +361,9 @@
 		__field(unsigned int, cgroup_ino)
 	),
 	TP_fast_assign(
-		strncpy(__entry->name,
-			wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32);
+		strscpy_pad(__entry->name,
+			    wb->bdi->dev ? dev_name(wb->bdi->dev) :
+			    "(unknown)", 32);
 		__entry->nr_pages = work->nr_pages;
 		__entry->sb_dev = work->sb ? work->sb->s_dev : 0;
 		__entry->sync_mode = work->sync_mode;
@@ -414,7 +416,7 @@
 		__field(unsigned int, cgroup_ino)
 	),
 	TP_fast_assign(
-		strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+		strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32);
 		__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
 	),
 	TP_printk("bdi %s: cgroup_ino=%u",
@@ -436,7 +438,7 @@
 		__array(char, name, 32)
 	),
 	TP_fast_assign(
-		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		strscpy_pad(__entry->name, dev_name(bdi->dev), 32);
 	),
 	TP_printk("bdi %s",
 		__entry->name
@@ -461,7 +463,7 @@
 	),
 
 	TP_fast_assign(
-		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		strscpy_pad(__entry->name, dev_name(bdi->dev), 32);
 		__entry->nr_to_write	= wbc->nr_to_write;
 		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->sync_mode	= wbc->sync_mode;
@@ -512,7 +514,7 @@
 	),
 	TP_fast_assign(
 		unsigned long *older_than_this = work->older_than_this;
-		strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+		strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32);
 		__entry->older	= older_than_this ?  *older_than_this : 0;
 		__entry->age	= older_than_this ?
 				  (jiffies - *older_than_this) * 1000 / HZ : -1;
@@ -598,7 +600,7 @@
 	),
 
 	TP_fast_assign(
-		strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
+		strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32);
 		__entry->write_bw	= KBps(wb->write_bandwidth);
 		__entry->avg_write_bw	= KBps(wb->avg_write_bandwidth);
 		__entry->dirty_rate	= KBps(dirty_rate);
@@ -663,7 +665,7 @@
 
 	TP_fast_assign(
 		unsigned long freerun = (thresh + bg_thresh) / 2;
-		strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
+		strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32);
 
 		__entry->limit		= global_wb_domain.dirty_limit;
 		__entry->setpoint	= (global_wb_domain.dirty_limit +
@@ -723,8 +725,8 @@
 	),
 
 	TP_fast_assign(
-		strncpy(__entry->name,
-		        dev_name(inode_to_bdi(inode)->dev), 32);
+		strscpy_pad(__entry->name,
+			    dev_name(inode_to_bdi(inode)->dev), 32);
 		__entry->ino		= inode->i_ino;
 		__entry->state		= inode->i_state;
 		__entry->dirtied_when	= inode->dirtied_when;
@@ -797,8 +799,8 @@
 	),
 
 	TP_fast_assign(
-		strncpy(__entry->name,
-			dev_name(inode_to_bdi(inode)->dev), 32);
+		strscpy_pad(__entry->name,
+			    dev_name(inode_to_bdi(inode)->dev), 32);
 		__entry->ino		= inode->i_ino;
 		__entry->state		= inode->i_state;
 		__entry->dirtied_when	= inode->dirtied_when;
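
The strncpy() -> strscpy_pad() conversions above matter because strncpy() leaves the destination unterminated when the source fills it, and may leave trailing garbage otherwise. A small sketch of the difference (buffer contents shown are illustrative):

	char buf[8];

	/* strscpy_pad() truncates safely: always NUL-terminates and
	 * zero-fills the rest of buf; returns -E2BIG on truncation. */
	strscpy_pad(buf, "too-long-name", sizeof(buf));	/* buf = "too-lon\0" */
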
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 63b1f50..c160a53 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -67,6 +67,9 @@
 #define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
 #define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
 
+#define MADV_COLD	20		/* deactivate these pages */
+#define MADV_PAGEOUT	21		/* reclaim these pages */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 63ae4a3..c02dec9 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -22,9 +22,9 @@
 };
 
 /* Max # of type identifier */
-#define BTF_MAX_TYPE	0x0000ffff
+#define BTF_MAX_TYPE	0x000fffff
 /* Max offset into the string section */
-#define BTF_MAX_NAME_OFFSET	0x0000ffff
+#define BTF_MAX_NAME_OFFSET	0x00ffffff
 /* Max # of struct/union/enum members or func args */
 #define BTF_MAX_VLEN	0xffff
 
diff --git a/include/uapi/linux/coff.h b/include/uapi/linux/coff.h
index e4a79f8..ab5c7e8 100644
--- a/include/uapi/linux/coff.h
+++ b/include/uapi/linux/coff.h
@@ -11,6 +11,9 @@
    more information about COFF, then O'Reilly has a very excellent book.
 */
 
+#ifndef _UAPI_LINUX_COFF_H
+#define _UAPI_LINUX_COFF_H
+
 #define  E_SYMNMLEN  8   /* Number of characters in a symbol name         */
 #define  E_FILNMLEN 14   /* Number of characters in a file name           */
 #define  E_DIMNUM    4   /* Number of array dimensions in auxiliary entry */
@@ -350,3 +353,5 @@
 
 /* For new sections we haven't heard of before */
 #define COFF_DEF_SECTION_ALIGNMENT       4
+
+#endif /* _UAPI_LINUX_COFF_H */
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 2971d29..802b037 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -133,6 +133,8 @@
  *
  *  7.31
  *  - add FUSE_WRITE_KILL_PRIV flag
+ *  - add FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING
+ *  - add map_alignment to fuse_init_out, add FUSE_MAP_ALIGNMENT flag
  */
 
 #ifndef _LINUX_FUSE_H
@@ -274,6 +276,7 @@
  * FUSE_CACHE_SYMLINKS: cache READLINK responses
  * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir
  * FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request
+ * FUSE_MAP_ALIGNMENT: map_alignment field is valid
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
@@ -301,6 +304,7 @@
 #define FUSE_CACHE_SYMLINKS	(1 << 23)
 #define FUSE_NO_OPENDIR_SUPPORT (1 << 24)
 #define FUSE_EXPLICIT_INVAL_DATA (1 << 25)
+#define FUSE_MAP_ALIGNMENT	(1 << 26)
 
 /**
  * CUSE INIT request/reply flags
@@ -422,9 +426,15 @@
 	FUSE_RENAME2		= 45,
 	FUSE_LSEEK		= 46,
 	FUSE_COPY_FILE_RANGE	= 47,
+	FUSE_SETUPMAPPING	= 48,
+	FUSE_REMOVEMAPPING	= 49,
 
 	/* CUSE specific operations */
 	CUSE_INIT		= 4096,
+
+	/* Reserved opcodes: helpful to detect structure endian-ness */
+	CUSE_INIT_BSWAP_RESERVED	= 1048576,	/* CUSE_INIT << 8 */
+	FUSE_INIT_BSWAP_RESERVED	= 436207616,	/* FUSE_INIT << 24 */
 };
 
 enum fuse_notify_code {
@@ -652,7 +662,7 @@
 	uint32_t	max_write;
 	uint32_t	time_gran;
 	uint16_t	max_pages;
-	uint16_t	padding;
+	uint16_t	map_alignment;
 	uint32_t	unused[8];
 };
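
The reserved values work because a byte-swapped 32-bit FUSE_INIT (26) reads back as 26 << 24 = 436207616, and a byte-swapped CUSE_INIT (4096) reads back as 4096 << 8 = 1048576. A server-side check might look like this (hypothetical helper, not part of this patch; assumes <stdint.h>):

	/* Detect a peer speaking the wrong byte order from the first
	 * request's opcode field. */
	static int fuse_opcode_byteswapped(uint32_t opcode)
	{
		return opcode == FUSE_INIT_BSWAP_RESERVED ||
		       opcode == CUSE_INIT_BSWAP_RESERVED;
	}
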
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 96ee9d9..ea57526a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -28,6 +28,7 @@
 		__u16		poll_events;
 		__u32		sync_range_flags;
 		__u32		msg_flags;
+		__u32		timeout_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -61,6 +62,7 @@
 #define IORING_OP_SYNC_FILE_RANGE	8
 #define IORING_OP_SENDMSG	9
 #define IORING_OP_RECVMSG	10
+#define IORING_OP_TIMEOUT	11
 
 /*
  * sqe->fsync_flags
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 233efbb..52641d8 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -999,6 +999,7 @@
 #define KVM_CAP_ARM_PTRAUTH_GENERIC 172
 #define KVM_CAP_PMU_EVENT_FILTER 173
 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174
+#define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1145,6 +1146,7 @@
 #define KVM_REG_S390		0x5000000000000000ULL
 #define KVM_REG_ARM64		0x6000000000000000ULL
 #define KVM_REG_MIPS		0x7000000000000000ULL
+#define KVM_REG_RISCV		0x8000000000000000ULL
 
 #define KVM_REG_SIZE_SHIFT	52
 #define KVM_REG_SIZE_MASK	0x00f0000000000000ULL
diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h
index 3b86c14..8076c94 100644
--- a/include/uapi/linux/netfilter_bridge/ebtables.h
+++ b/include/uapi/linux/netfilter_bridge/ebtables.h
@@ -123,7 +123,7 @@
 	union {
 		struct {
 			char name[EBT_EXTENSION_MAXNAMELEN];
-			uint8_t revision;
+			__u8 revision;
 		};
 		struct xt_match *match;
 	} u;
@@ -136,7 +136,7 @@
 	union {
 		struct {
 			char name[EBT_EXTENSION_MAXNAMELEN];
-			uint8_t revision;
+			__u8 revision;
 		};
 		struct xt_target *watcher;
 	} u;
@@ -149,7 +149,7 @@
 	union {
 		struct {
 			char name[EBT_EXTENSION_MAXNAMELEN];
-			uint8_t revision;
+			__u8 revision;
 		};
 		struct xt_target *target;
 	} u;
diff --git a/include/uapi/linux/nfsd/cld.h b/include/uapi/linux/nfsd/cld.h
index b1e9de4..a519313 100644
--- a/include/uapi/linux/nfsd/cld.h
+++ b/include/uapi/linux/nfsd/cld.h
@@ -26,17 +26,22 @@
 #include <linux/types.h>
 
 /* latest upcall version available */
-#define CLD_UPCALL_VERSION 1
+#define CLD_UPCALL_VERSION 2
 
 /* defined by RFC3530 */
 #define NFS4_OPAQUE_LIMIT 1024
 
+#ifndef SHA256_DIGEST_SIZE
+#define SHA256_DIGEST_SIZE      32
+#endif
+
 enum cld_command {
 	Cld_Create,		/* create a record for this cm_id */
 	Cld_Remove,		/* remove record of this cm_id */
 	Cld_Check,		/* is this cm_id allowed? */
 	Cld_GraceDone,		/* grace period is complete */
-	Cld_GraceStart,
+	Cld_GraceStart,		/* grace start (upload client records) */
+	Cld_GetVersion,		/* query max supported upcall version */
 };
 
 /* representation of long-form NFSv4 client ID */
@@ -45,6 +50,17 @@
 	unsigned char	cn_id[NFS4_OPAQUE_LIMIT];	/* client-provided */
 } __attribute__((packed));
 
+/* sha256 hash of the kerberos principal */
+struct cld_princhash {
+	__u8		cp_len;				/* length of cp_data */
+	unsigned char	cp_data[SHA256_DIGEST_SIZE];	/* hash of principal */
+} __attribute__((packed));
+
+struct cld_clntinfo {
+	struct cld_name		cc_name;
+	struct cld_princhash	cc_princhash;
+} __attribute__((packed));
+
 /* message struct for communication with userspace */
 struct cld_msg {
 	__u8		cm_vers;		/* upcall version */
@@ -54,7 +70,28 @@
 	union {
 		__s64		cm_gracetime;	/* grace period start time */
 		struct cld_name	cm_name;
+		__u8		cm_version;	/* for getting max version */
 	} __attribute__((packed)) cm_u;
 } __attribute__((packed));
 
+/* version 2 message can include hash of kerberos principal */
+struct cld_msg_v2 {
+	__u8		cm_vers;		/* upcall version */
+	__u8		cm_cmd;			/* upcall command */
+	__s16		cm_status;		/* return code */
+	__u32		cm_xid;			/* transaction id */
+	union {
+		struct cld_name	cm_name;
+		__u8		cm_version;	/* for getting max version */
+		struct cld_clntinfo cm_clntinfo; /* name & princ hash */
+	} __attribute__((packed)) cm_u;
+} __attribute__((packed));
+
+struct cld_msg_hdr {
+	__u8		cm_vers;		/* upcall version */
+	__u8		cm_cmd;			/* upcall command */
+	__s16		cm_status;		/* return code */
+	__u32		cm_xid;			/* transaction id */
+} __attribute__((packed));
+
 #endif /* !_NFSD_CLD_H */
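Note: a hedged sketch of the daemon side of the version probe: on Cld_GetVersion, a userspace client-tracking daemon answers with the highest upcall version it implements, after which the kernel may switch to cld_msg_v2 messages carrying the principal hash. The reply-in-place convention here is illustrative, not mandated by the header.

	static void cld_handle_getversion(struct cld_msg_v2 *msg)
	{
		/* advertise the newest upcall version this daemon speaks */
		msg->cm_u.cm_version = CLD_UPCALL_VERSION;
		msg->cm_status = 0;
	}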
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index f28e562..29d6e93 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -591,6 +591,7 @@
 #define  PCI_EXP_SLTCTL_CCIE	0x0010	/* Command Completed Interrupt Enable */
 #define  PCI_EXP_SLTCTL_HPIE	0x0020	/* Hot-Plug Interrupt Enable */
 #define  PCI_EXP_SLTCTL_AIC	0x00c0	/* Attention Indicator Control */
+#define  PCI_EXP_SLTCTL_ATTN_IND_SHIFT 6      /* Attention Indicator shift */
 #define  PCI_EXP_SLTCTL_ATTN_IND_ON    0x0040 /* Attention Indicator on */
 #define  PCI_EXP_SLTCTL_ATTN_IND_BLINK 0x0080 /* Attention Indicator blinking */
 #define  PCI_EXP_SLTCTL_ATTN_IND_OFF   0x00c0 /* Attention Indicator off */
@@ -713,7 +714,9 @@
 #define PCI_EXT_CAP_ID_DPC	0x1D	/* Downstream Port Containment */
 #define PCI_EXT_CAP_ID_L1SS	0x1E	/* L1 PM Substates */
 #define PCI_EXT_CAP_ID_PTM	0x1F	/* Precision Time Measurement */
-#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PTM
+#define PCI_EXT_CAP_ID_DLF	0x25	/* Data Link Feature */
+#define PCI_EXT_CAP_ID_PL_16GT	0x26	/* Physical Layer 16.0 GT/s */
+#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_16GT
 
 #define PCI_EXT_CAP_DSN_SIZEOF	12
 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40
@@ -1053,4 +1056,14 @@
 #define  PCI_L1SS_CTL1_LTR_L12_TH_SCALE	0xe0000000  /* LTR_L1.2_THRESHOLD_Scale */
 #define PCI_L1SS_CTL2		0x0c	/* Control 2 Register */
 
+/* Data Link Feature */
+#define PCI_DLF_CAP		0x04	/* Capabilities Register */
+#define  PCI_DLF_EXCHANGE_ENABLE	0x80000000  /* Data Link Feature Exchange Enable */
+
+/* Physical Layer 16.0 GT/s */
+#define PCI_PL_16GT_LE_CTRL	0x20	/* Lane Equalization Control Register */
+#define  PCI_PL_16GT_LE_CTRL_DSP_TX_PRESET_MASK		0x0000000F
+#define  PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_MASK		0x000000F0
+#define  PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_SHIFT	4
+
 #endif /* LINUX_PCI_REGS_H */
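Note: the new shift macro lets callers build the on/blink/off encodings instead of open-coding bit 6. A hedged read-modify-write sketch (the config-space accessors are elided):

	/* ind is 1 (on), 2 (blink) or 3 (off), matching the AIC encodings */
	static u16 sltctl_set_attn_ind(u16 sltctl, u16 ind)
	{
		sltctl &= ~PCI_EXP_SLTCTL_AIC;
		sltctl |= (ind << PCI_EXP_SLTCTL_ATTN_IND_SHIFT) & PCI_EXP_SLTCTL_AIC;
		return sltctl;
	}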
diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
index f163010..59e89a1 100644
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -31,15 +31,37 @@
 #define PTP_ENABLE_FEATURE (1<<0)
 #define PTP_RISING_EDGE    (1<<1)
 #define PTP_FALLING_EDGE   (1<<2)
+
+/*
+ * flag fields valid for the new PTP_EXTTS_REQUEST2 ioctl.
+ */
 #define PTP_EXTTS_VALID_FLAGS	(PTP_ENABLE_FEATURE |	\
 				 PTP_RISING_EDGE |	\
 				 PTP_FALLING_EDGE)
 
 /*
+ * flag fields valid for the original PTP_EXTTS_REQUEST ioctl.
+ * DO NOT ADD NEW FLAGS HERE.
+ */
+#define PTP_EXTTS_V1_VALID_FLAGS	(PTP_ENABLE_FEATURE |	\
+					 PTP_RISING_EDGE |	\
+					 PTP_FALLING_EDGE)
+
+/*
  * Bits of the ptp_perout_request.flags field:
  */
 #define PTP_PEROUT_ONE_SHOT (1<<0)
+
+/*
+ * flag fields valid for the new PTP_PEROUT_REQUEST2 ioctl.
+ */
 #define PTP_PEROUT_VALID_FLAGS	(PTP_PEROUT_ONE_SHOT)
+
+/*
+ * No flags are valid for the original PTP_PEROUT_REQUEST ioctl
+ */
+#define PTP_PEROUT_V1_VALID_FLAGS	(0)
+
 /*
  * struct ptp_clock_time - represents a time value
  *
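Note: splitting *_VALID_FLAGS from *_V1_VALID_FLAGS lets the core accept new flags only on the REQUEST2 ioctls while the originals stay frozen. A hedged sketch of such a check, with the is_v2 plumbing assumed:

	static int ptp_extts_flags_ok(unsigned int flags, int is_v2)
	{
		unsigned int valid = is_v2 ? PTP_EXTTS_VALID_FLAGS
					   : PTP_EXTTS_V1_VALID_FLAGS;

		return (flags & ~valid) == 0;	/* reject unknown bits */
	}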
diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h
new file mode 100644
index 0000000..b02eb2a
--- /dev/null
+++ b/include/uapi/linux/virtio_fs.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+
+#ifndef _UAPI_LINUX_VIRTIO_FS_H
+#define _UAPI_LINUX_VIRTIO_FS_H
+
+#include <linux/types.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_types.h>
+
+struct virtio_fs_config {
+	/* Filesystem name (UTF-8, not NUL-terminated, padded with NULs) */
+	__u8 tag[36];
+
+	/* Number of request queues */
+	__u32 num_request_queues;
+} __attribute__((packed));
+
+#endif /* _UAPI_LINUX_VIRTIO_FS_H */
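Note: since the tag is padded but not terminated, consumers must not treat the raw field as a C string. A hedged sketch, with cfg assumed to be a local copy of the device config:

	char tag[sizeof(cfg->tag) + 1];

	memcpy(tag, cfg->tag, sizeof(cfg->tag));
	tag[sizeof(cfg->tag)] = '\0';	/* force NUL termination */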
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index 348fd01..585e07b 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -44,6 +44,7 @@
 #define VIRTIO_ID_VSOCK        19 /* virtio vsock transport */
 #define VIRTIO_ID_CRYPTO       20 /* virtio crypto */
 #define VIRTIO_ID_IOMMU        23 /* virtio IOMMU */
+#define VIRTIO_ID_FS           26 /* virtio filesystem */
 #define VIRTIO_ID_PMEM         27 /* virtio pmem */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/init/Kconfig b/init/Kconfig
index 5712359..b4daad2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1963,6 +1963,10 @@
 	default 0 if BASE_FULL
 	default 1 if !BASE_FULL
 
+config MODULE_SIG_FORMAT
+	def_bool n
+	select SYSTEM_DATA_VERIFICATION
+
 menuconfig MODULES
 	bool "Enable loadable module support"
 	option modules
@@ -2047,7 +2051,7 @@
 
 config MODULE_SIG
 	bool "Module signature verification"
-	select SYSTEM_DATA_VERIFICATION
+	select MODULE_SIG_FORMAT
 	help
 	  Check modules for valid signatures upon load: the signature
 	  is simply appended to the module. For more information see
@@ -2057,6 +2061,11 @@
 	  kernel build dependency so that the signing tool can use its crypto
 	  library.
 
+	  You should enable this option if you wish to use either
+	  CONFIG_SECURITY_LOCKDOWN_LSM or lockdown functionality imposed via
+	  another LSM - otherwise unsigned modules will be loadable regardless
+	  of the lockdown policy.
+
 	  !!!WARNING!!!  If you enable this option, you MUST make sure that the
 	  module DOES NOT get stripped after being signed.  This includes the
 	  debuginfo strip done by some packagers (such as rpmbuild) and
diff --git a/init/main.c b/init/main.c
index 653693d..91f6ebb 100644
--- a/init/main.c
+++ b/init/main.c
@@ -507,7 +507,7 @@
 
 void __init __weak poking_init(void) { }
 
-void __init __weak pgd_cache_init(void) { }
+void __init __weak pgtable_cache_init(void) { }
 
 bool initcall_debug;
 core_param(initcall_debug, initcall_debug, bool, 0644);
@@ -556,6 +556,7 @@
 	report_meminit();
 	mem_init();
 	kmem_cache_init();
+	kmemleak_init();
 	pgtable_init();
 	debug_objects_mem_init();
 	vmalloc_init();
@@ -564,7 +565,6 @@
 	init_espfix_bsp();
 	/* Should be run after espfix64 is set up. */
 	pti_init();
-	pgd_cache_init();
 }
 
 void __init __weak arch_call_rest_init(void)
@@ -593,8 +593,8 @@
 	boot_cpu_init();
 	page_address_init();
 	pr_notice("%s", linux_banner);
+	early_security_init();
 	setup_arch(&command_line);
-	mm_init_cpumask(&init_mm);
 	setup_command_line(command_line);
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
@@ -740,7 +740,6 @@
 		initrd_start = 0;
 	}
 #endif
-	kmemleak_init();
 	setup_per_cpu_pageset();
 	numa_policy_init();
 	acpi_early_init();
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 7c15729..3d920ff 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1240,15 +1240,14 @@
 
 			/* create the notify skb */
 			nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
-			if (!nc) {
-				ret = -ENOMEM;
-				goto out;
-			}
+			if (!nc)
+				return -ENOMEM;
+
 			if (copy_from_user(nc->data,
 					notification->sigev_value.sival_ptr,
 					NOTIFY_COOKIE_LEN)) {
 				ret = -EFAULT;
-				goto out;
+				goto free_skb;
 			}
 
 			/* TODO: add a header? */
@@ -1264,8 +1263,7 @@
 			fdput(f);
 			if (IS_ERR(sock)) {
 				ret = PTR_ERR(sock);
-				sock = NULL;
-				goto out;
+				goto free_skb;
 			}
 
 			timeo = MAX_SCHEDULE_TIMEOUT;
@@ -1274,11 +1272,8 @@
 				sock = NULL;
 				goto retry;
 			}
-			if (ret) {
-				sock = NULL;
-				nc = NULL;
-				goto out;
-			}
+			if (ret)
+				return ret;
 		}
 	}
 
@@ -1333,7 +1328,8 @@
 out:
 	if (sock)
 		netlink_detachskb(sock, nc);
-	else if (nc)
+	else
+free_skb:
 		dev_kfree_skb(nc);
 
 	return ret;
diff --git a/ipc/sem.c b/ipc/sem.c
index 7da4504..ec97a70 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1852,7 +1852,8 @@
 {
 	struct sem_undo *un;
 
-	list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) {
+	list_for_each_entry_rcu(un, &ulp->list_proc, list_proc,
+				spin_is_locked(&ulp->lock)) {
 		if (un->semid == semid)
 			return un;
 	}
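Note: the extra argument is the optional lockdep condition that list_for_each_entry_rcu() now accepts: it tells lockdep the traversal is also legal when the given lock is held, not only under rcu_read_lock(). A hedged generic sketch of the pattern:

	struct item { int id; struct list_head node; };
	struct item *pos;

	/* traversal is valid either under RCU or with my_lock held */
	list_for_each_entry_rcu(pos, &my_list, node,
				lockdep_is_held(&my_lock)) {
		if (pos->id == id)
			return pos;
	}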
diff --git a/kernel/Makefile b/kernel/Makefile
index 25f9d83..daad787 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -58,6 +58,7 @@
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_MODULE_SIG) += module_signing.o
+obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_CRASH_CORE) += crash_core.o
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index adb3adc..29c7c06 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2332,7 +2332,7 @@
 		if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
 			btf_verifier_log_member(env, struct_type, member,
 						"Member is not byte aligned");
-				return -EINVAL;
+			return -EINVAL;
 		}
 
 		nr_bits = int_bitsize;
@@ -2377,9 +2377,8 @@
 		return -EINVAL;
 	}
 
-	if (t->size != sizeof(int)) {
-		btf_verifier_log_type(env, t, "Expected size:%zu",
-				      sizeof(int));
+	if (t->size > 8 || !is_power_of_2(t->size)) {
+		btf_verifier_log_type(env, t, "Unexpected size");
 		return -EINVAL;
 	}
 
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index cc0d0cf..a70f720 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -14,8 +14,9 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 #include <linux/kdev_t.h>
-#include <linux/parser.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
@@ -583,58 +584,52 @@
 
 enum {
 	OPT_MODE,
-	OPT_ERR,
 };
 
-static const match_table_t bpf_mount_tokens = {
-	{ OPT_MODE, "mode=%o" },
-	{ OPT_ERR, NULL },
+static const struct fs_parameter_spec bpf_param_specs[] = {
+	fsparam_u32oct	("mode",			OPT_MODE),
+	{}
+};
+
+static const struct fs_parameter_description bpf_fs_parameters = {
+	.name		= "bpf",
+	.specs		= bpf_param_specs,
 };
 
 struct bpf_mount_opts {
 	umode_t mode;
 };
 
-static int bpf_parse_options(char *data, struct bpf_mount_opts *opts)
+static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	substring_t args[MAX_OPT_ARGS];
-	int option, token;
-	char *ptr;
+	struct bpf_mount_opts *opts = fc->fs_private;
+	struct fs_parse_result result;
+	int opt;
 
-	opts->mode = S_IRWXUGO;
-
-	while ((ptr = strsep(&data, ",")) != NULL) {
-		if (!*ptr)
-			continue;
-
-		token = match_token(ptr, bpf_mount_tokens, args);
-		switch (token) {
-		case OPT_MODE:
-			if (match_octal(&args[0], &option))
-				return -EINVAL;
-			opts->mode = option & S_IALLUGO;
-			break;
+	opt = fs_parse(fc, &bpf_fs_parameters, param, &result);
+	if (opt < 0)
 		/* We might like to report bad mount options here, but
 		 * traditionally we've ignored all mount options, so we'd
 		 * better continue to ignore non-existing options for bpf.
 		 */
-		}
+		return opt == -ENOPARAM ? 0 : opt;
+
+	switch (opt) {
+	case OPT_MODE:
+		opts->mode = result.uint_32 & S_IALLUGO;
+		break;
 	}
 
 	return 0;
 }
 
-static int bpf_fill_super(struct super_block *sb, void *data, int silent)
+static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	static const struct tree_descr bpf_rfiles[] = { { "" } };
-	struct bpf_mount_opts opts;
+	struct bpf_mount_opts *opts = fc->fs_private;
 	struct inode *inode;
 	int ret;
 
-	ret = bpf_parse_options(data, &opts);
-	if (ret)
-		return ret;
-
 	ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
 	if (ret)
 		return ret;
@@ -644,21 +639,50 @@
 	inode = sb->s_root->d_inode;
 	inode->i_op = &bpf_dir_iops;
 	inode->i_mode &= ~S_IALLUGO;
-	inode->i_mode |= S_ISVTX | opts.mode;
+	inode->i_mode |= S_ISVTX | opts->mode;
 
 	return 0;
 }
 
-static struct dentry *bpf_mount(struct file_system_type *type, int flags,
-				const char *dev_name, void *data)
+static int bpf_get_tree(struct fs_context *fc)
 {
-	return mount_nodev(type, flags, data, bpf_fill_super);
+	return get_tree_nodev(fc, bpf_fill_super);
+}
+
+static void bpf_free_fc(struct fs_context *fc)
+{
+	kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations bpf_context_ops = {
+	.free		= bpf_free_fc,
+	.parse_param	= bpf_parse_param,
+	.get_tree	= bpf_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int bpf_init_fs_context(struct fs_context *fc)
+{
+	struct bpf_mount_opts *opts;
+
+	opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL);
+	if (!opts)
+		return -ENOMEM;
+
+	opts->mode = S_IRWXUGO;
+
+	fc->fs_private = opts;
+	fc->ops = &bpf_context_ops;
+	return 0;
 }
 
 static struct file_system_type bpf_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "bpf",
-	.mount		= bpf_mount,
+	.init_fs_context = bpf_init_fs_context,
+	.parameters	= &bpf_fs_parameters,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 942c662..82a1ffe 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -37,7 +37,7 @@
 
 	node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
 	if (!node)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	err = xsk_map_inc(map);
 	if (err) {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e1967e9..fc28e17 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -392,8 +392,7 @@
 
 void __init cpu_smt_disable(bool force)
 {
-	if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
-		cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+	if (!cpu_smt_possible())
 		return;
 
 	if (force) {
@@ -438,6 +437,14 @@
 	 */
 	return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
 }
+
+/* Returns true if SMT is not supported or forcefully (irreversibly) disabled */
+bool cpu_smt_possible(void)
+{
+	return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
+		cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
+}
+EXPORT_SYMBOL_GPL(cpu_smt_possible);
 #else
 static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
 #endif
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 10f1187..f76d6f7 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -893,29 +893,24 @@
 };
 #endif
 
-static int kgdb_panic_event(struct notifier_block *self,
-			    unsigned long val,
-			    void *data)
+void kgdb_panic(const char *msg)
 {
+	if (!kgdb_io_module_registered)
+		return;
+
 	/*
-	 * Avoid entering the debugger if we were triggered due to a panic
-	 * We don't want to get stuck waiting for input from user in such case.
-	 * panic_timeout indicates the system should automatically
+	 * We don't want to get stuck waiting for user input if
+	 * "panic_timeout" indicates the system should automatically
 	 * reboot on panic.
 	 */
 	if (panic_timeout)
-		return NOTIFY_DONE;
+		return;
 
 	if (dbg_kdb_mode)
-		kdb_printf("PANIC: %s\n", (char *)data);
-	kgdb_breakpoint();
-	return NOTIFY_DONE;
-}
+		kdb_printf("PANIC: %s\n", msg);
 
-static struct notifier_block kgdb_panic_event_nb = {
-       .notifier_call	= kgdb_panic_event,
-       .priority	= INT_MAX,
-};
+	kgdb_breakpoint();
+}
 
 void __weak kgdb_arch_late(void)
 {
@@ -965,8 +960,6 @@
 			kgdb_arch_late();
 		register_module_notifier(&dbg_module_load_nb);
 		register_reboot_notifier(&dbg_reboot_notifier);
-		atomic_notifier_chain_register(&panic_notifier_list,
-					       &kgdb_panic_event_nb);
 #ifdef CONFIG_MAGIC_SYSRQ
 		register_sysrq_key('g', &sysrq_dbg_op);
 #endif
@@ -980,16 +973,14 @@
 static void kgdb_unregister_callbacks(void)
 {
 	/*
-	 * When this routine is called KGDB should unregister from the
-	 * panic handler and clean up, making sure it is not handling any
+	 * When this routine is called KGDB should unregister from
+	 * handlers and clean up, making sure it is not handling any
 	 * break exceptions at the time.
 	 */
 	if (kgdb_io_module_registered) {
 		kgdb_io_module_registered = 0;
 		unregister_reboot_notifier(&dbg_reboot_notifier);
 		unregister_module_notifier(&dbg_module_load_nb);
-		atomic_notifier_chain_unregister(&panic_notifier_list,
-					       &kgdb_panic_event_nb);
 		kgdb_arch_exit();
 #ifdef CONFIG_MAGIC_SYSRQ
 		unregister_sysrq_key('g', &sysrq_dbg_op);
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
index fc482c8..57fb4dc 100644
--- a/kernel/elfcore.c
+++ b/kernel/elfcore.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/binfmts.h>
+#include <linux/elfcore.h>
 
 Elf_Half __weak elf_core_extra_phdrs(void)
 {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4f08b17..4655adb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2239,7 +2239,7 @@
  *
  * If event->ctx is a cloned context, callers must make sure that
  * every task struct that event->ctx->task could possibly point to
- * remains valid.  This condition is satisifed when called through
+ * remains valid.  This condition is satisfied when called through
  * perf_event_for_each_child or perf_event_for_each because they
  * hold the top-level event's child_mutex, so any descendant that
  * goes to exit will block in perf_event_exit_event().
@@ -6054,7 +6054,7 @@
  * Get remaining task size from user stack pointer.
  *
  * It'd be better to take stack vma map and limit this more
- * precisly, but there's no way to get it safely under interrupt,
+ * precisely, but there's no way to get it safely under interrupt,
  * so using TASK_SIZE as limit.
  */
 static u64 perf_ustack_task_size(struct pt_regs *regs)
@@ -6616,7 +6616,7 @@
 
 	if (sample_type & PERF_SAMPLE_STACK_USER) {
 		/*
-		 * Either we need PERF_SAMPLE_STACK_USER bit to be allways
+		 * Either we need PERF_SAMPLE_STACK_USER bit to be always
 		 * processed as the last one or have additional check added
 		 * in case new sample type is added, because we could eat
 		 * up the rest of the sample size.
@@ -10917,6 +10917,13 @@
 	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
+	err = security_locked_down(LOCKDOWN_PERF);
+	if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
+		/* REGS_INTR can leak data, lockdown must prevent this */
+		return err;
+
+	err = 0;
+
 	/*
 	 * In cgroup mode, the pid argument is used to pass the fd
 	 * opened to the cgroup directory in cgroupfs. The cpu argument
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 84fa004..94d38a3 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -26,6 +26,7 @@
 #include <linux/percpu-rwsem.h>
 #include <linux/task_work.h>
 #include <linux/shmem_fs.h>
+#include <linux/khugepaged.h>
 
 #include <linux/uprobes.h>
 
@@ -143,17 +144,19 @@
  *
  * @vma:      vma that holds the pte pointing to page
  * @addr:     address the old @page is mapped at
- * @page:     the cowed page we are replacing by kpage
- * @kpage:    the modified page we replace page by
+ * @old_page: the page to be replaced
+ * @new_page: the modified page that replaces @old_page
  *
- * Returns 0 on success, -EFAULT on failure.
+ * If @new_page is NULL, only unmap @old_page.
+ *
+ * Returns 0 on success, negative error code otherwise.
  */
 static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 				struct page *old_page, struct page *new_page)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct page_vma_mapped_walk pvmw = {
-		.page = old_page,
+		.page = compound_head(old_page),
 		.vma = vma,
 		.address = addr,
 	};
@@ -164,12 +167,12 @@
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
 				addr + PAGE_SIZE);
 
-	VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
-
-	err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
-			false);
-	if (err)
-		return err;
+	if (new_page) {
+		err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
+					    &memcg, false);
+		if (err)
+			return err;
+	}
 
 	/* For try_to_free_swap() and munlock_vma_page() below */
 	lock_page(old_page);
@@ -177,15 +180,20 @@
 	mmu_notifier_invalidate_range_start(&range);
 	err = -EAGAIN;
 	if (!page_vma_mapped_walk(&pvmw)) {
-		mem_cgroup_cancel_charge(new_page, memcg, false);
+		if (new_page)
+			mem_cgroup_cancel_charge(new_page, memcg, false);
 		goto unlock;
 	}
 	VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
 
-	get_page(new_page);
-	page_add_new_anon_rmap(new_page, vma, addr, false);
-	mem_cgroup_commit_charge(new_page, memcg, false, false);
-	lru_cache_add_active_or_unevictable(new_page, vma);
+	if (new_page) {
+		get_page(new_page);
+		page_add_new_anon_rmap(new_page, vma, addr, false);
+		mem_cgroup_commit_charge(new_page, memcg, false, false);
+		lru_cache_add_active_or_unevictable(new_page, vma);
+	} else
+		/* no new page, just dec_mm_counter for old_page */
+		dec_mm_counter(mm, MM_ANONPAGES);
 
 	if (!PageAnon(old_page)) {
 		dec_mm_counter(mm, mm_counter_file(old_page));
@@ -194,8 +202,9 @@
 
 	flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
 	ptep_clear_flush_notify(vma, addr, pvmw.pte);
-	set_pte_at_notify(mm, addr, pvmw.pte,
-			mk_pte(new_page, vma->vm_page_prot));
+	if (new_page)
+		set_pte_at_notify(mm, addr, pvmw.pte,
+				  mk_pte(new_page, vma->vm_page_prot));
 
 	page_remove_rmap(old_page, false);
 	if (!page_mapped(old_page))
@@ -464,6 +473,7 @@
 	struct page *old_page, *new_page;
 	struct vm_area_struct *vma;
 	int ret, is_register, ref_ctr_updated = 0;
+	bool orig_page_huge = false;
 
 	is_register = is_swbp_insn(&opcode);
 	uprobe = container_of(auprobe, struct uprobe, arch);
@@ -471,7 +481,7 @@
 retry:
 	/* Read the page with vaddr into memory */
 	ret = get_user_pages_remote(NULL, mm, vaddr, 1,
-			FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL);
+			FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL);
 	if (ret <= 0)
 		return ret;
 
@@ -488,6 +498,10 @@
 		ref_ctr_updated = 1;
 	}
 
+	ret = 0;
+	if (!is_register && !PageAnon(old_page))
+		goto put_old;
+
 	ret = anon_vma_prepare(vma);
 	if (ret)
 		goto put_old;
@@ -501,8 +515,33 @@
 	copy_highpage(new_page, old_page);
 	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 
+	if (!is_register) {
+		struct page *orig_page;
+		pgoff_t index;
+
+		VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
+
+		index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
+		orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
+					  index);
+
+		if (orig_page) {
+			if (PageUptodate(orig_page) &&
+			    pages_identical(new_page, orig_page)) {
+				/* let go new_page */
+				put_page(new_page);
+				new_page = NULL;
+
+				if (PageCompound(orig_page))
+					orig_page_huge = true;
+			}
+			put_page(orig_page);
+		}
+	}
+
 	ret = __replace_page(vma, vaddr, old_page, new_page);
-	put_page(new_page);
+	if (new_page)
+		put_page(new_page);
 put_old:
 	put_page(old_page);
 
@@ -513,6 +552,10 @@
 	if (ret && is_register && ref_ctr_updated)
 		update_ref_ctr(uprobe, mm, -1);
 
+	/* try collapse pmd for compound page */
+	if (!ret && orig_page_huge)
+		collapse_pte_mapped_thp(mm, vaddr);
+
 	return ret;
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 22ab6a4..a46a50d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -182,6 +182,11 @@
 	put_task_struct(tsk);
 }
 
+void put_task_struct_rcu_user(struct task_struct *task)
+{
+	if (refcount_dec_and_test(&task->rcu_users))
+		call_rcu(&task->rcu, delayed_put_task_struct);
+}
 
 void release_task(struct task_struct *p)
 {
@@ -222,76 +227,13 @@
 
 	write_unlock_irq(&tasklist_lock);
 	release_thread(p);
-	call_rcu(&p->rcu, delayed_put_task_struct);
+	put_task_struct_rcu_user(p);
 
 	p = leader;
 	if (unlikely(zap_leader))
 		goto repeat;
 }
 
-/*
- * Note that if this function returns a valid task_struct pointer (!NULL)
- * task->usage must remain >0 for the duration of the RCU critical section.
- */
-struct task_struct *task_rcu_dereference(struct task_struct **ptask)
-{
-	struct sighand_struct *sighand;
-	struct task_struct *task;
-
-	/*
-	 * We need to verify that release_task() was not called and thus
-	 * delayed_put_task_struct() can't run and drop the last reference
-	 * before rcu_read_unlock(). We check task->sighand != NULL,
-	 * but we can read the already freed and reused memory.
-	 */
-retry:
-	task = rcu_dereference(*ptask);
-	if (!task)
-		return NULL;
-
-	probe_kernel_address(&task->sighand, sighand);
-
-	/*
-	 * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
-	 * was already freed we can not miss the preceding update of this
-	 * pointer.
-	 */
-	smp_rmb();
-	if (unlikely(task != READ_ONCE(*ptask)))
-		goto retry;
-
-	/*
-	 * We've re-checked that "task == *ptask", now we have two different
-	 * cases:
-	 *
-	 * 1. This is actually the same task/task_struct. In this case
-	 *    sighand != NULL tells us it is still alive.
-	 *
-	 * 2. This is another task which got the same memory for task_struct.
-	 *    We can't know this of course, and we can not trust
-	 *    sighand != NULL.
-	 *
-	 *    In this case we actually return a random value, but this is
-	 *    correct.
-	 *
-	 *    If we return NULL - we can pretend that we actually noticed that
-	 *    *ptask was updated when the previous task has exited. Or pretend
-	 *    that probe_slab_address(&sighand) reads NULL.
-	 *
-	 *    If we return the new task (because sighand is not NULL for any
-	 *    reason) - this is fine too. This (new) task can't go away before
-	 *    another gp pass.
-	 *
-	 *    And note: We could even eliminate the false positive if re-read
-	 *    task->sighand once again to avoid the falsely NULL. But this case
-	 *    is very unlikely so we don't care.
-	 */
-	if (!sighand)
-		return NULL;
-
-	return task;
-}
-
 void rcuwait_wake_up(struct rcuwait *w)
 {
 	struct task_struct *task;
@@ -311,10 +253,6 @@
 	 */
 	smp_mb(); /* (B) */
 
-	/*
-	 * Avoid using task_rcu_dereference() magic as long as we are careful,
-	 * see comment in rcuwait_wait_event() regarding ->exit_state.
-	 */
 	task = rcu_dereference(w->task);
 	if (task)
 		wake_up_process(task);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5a0fd51..f9572f4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -125,6 +125,15 @@
 
 static int max_threads;		/* tunable limit on nr_threads */
 
+#define NAMED_ARRAY_INDEX(x)	[x] = __stringify(x)
+
+static const char * const resident_page_types[] = {
+	NAMED_ARRAY_INDEX(MM_FILEPAGES),
+	NAMED_ARRAY_INDEX(MM_ANONPAGES),
+	NAMED_ARRAY_INDEX(MM_SWAPENTS),
+	NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
+};
+
 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
@@ -645,12 +654,15 @@
 {
 	int i;
 
+	BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
+			 "Please make sure 'struct resident_page_types[]' is updated as well");
+
 	for (i = 0; i < NR_MM_COUNTERS; i++) {
 		long x = atomic_long_read(&mm->rss_stat.count[i]);
 
 		if (unlikely(x))
-			printk(KERN_ALERT "BUG: Bad rss-counter state "
-					  "mm:%p idx:%d val:%ld\n", mm, i, x);
+			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
+				 mm, resident_page_types[i], x);
 	}
 
 	if (mm_pgtables_bytes(mm))
@@ -903,10 +915,12 @@
 		tsk->cpus_ptr = &tsk->cpus_mask;
 
 	/*
-	 * One for us, one for whoever does the "release_task()" (usually
-	 * parent)
+	 * One for the user space visible state that goes away when reaped.
+	 * One for the scheduler.
 	 */
-	refcount_set(&tsk->usage, 2);
+	refcount_set(&tsk->rcu_users, 2);
+	/* One for the rcu users */
+	refcount_set(&tsk->usage, 1);
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	tsk->btrace_seq = 0;
 #endif
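Note: NAMED_ARRAY_INDEX() combines a designated initializer with __stringify() so each name lands at the slot of its enum value, and the BUILD_BUG_ON_MSG in check_mm() trips if a counter is added without one. A hedged stand-alone sketch of the idiom using a made-up enum:

	enum { EX_FOO, EX_BAR, NR_EX };		/* hypothetical counters */

	static const char * const ex_names[] = {
		NAMED_ARRAY_INDEX(EX_FOO),	/* [EX_FOO] = "EX_FOO" */
		NAMED_ARRAY_INDEX(EX_BAR),	/* [EX_BAR] = "EX_BAR" */
	};

	/* in any function: */
	BUILD_BUG_ON(ARRAY_SIZE(ex_names) != NR_EX);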
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 1b018f1..bc933c0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -206,6 +206,14 @@
 		return result;
 
 	/*
+	 * kexec can be used to circumvent module loading restrictions, so
+	 * prevent loading in that case
+	 */
+	result = security_locked_down(LOCKDOWN_KEXEC);
+	if (result)
+		return result;
+
+	/*
 	 * Verify we have a legal set of flags
 	 * This leaves us room for future extensions.
 	 */
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d5870723..15d70a9 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -300,6 +300,8 @@
 {
 	struct page *pages;
 
+	if (fatal_signal_pending(current))
+		return NULL;
 	pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
 	if (pages) {
 		unsigned int count, i;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b8cc032..79f252a 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -88,7 +88,7 @@
 	return kexec_image_post_load_cleanup_default(image);
 }
 
-#ifdef CONFIG_KEXEC_VERIFY_SIG
+#ifdef CONFIG_KEXEC_SIG
 static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
 					  unsigned long buf_len)
 {
@@ -177,6 +177,59 @@
 	image->image_loader_data = NULL;
 }
 
+#ifdef CONFIG_KEXEC_SIG
+static int
+kimage_validate_signature(struct kimage *image)
+{
+	const char *reason;
+	int ret;
+
+	ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+					   image->kernel_buf_len);
+	switch (ret) {
+	case 0:
+		break;
+
+		/* Certain verification errors are non-fatal, provided we
+		 * aren't mandating that there must be a valid signature on
+		 * the image.
+		 */
+	case -ENODATA:
+		reason = "kexec of unsigned image";
+		goto decide;
+	case -ENOPKG:
+		reason = "kexec of image with unsupported crypto";
+		goto decide;
+	case -ENOKEY:
+		reason = "kexec of image with unavailable key";
+	decide:
+		if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
+			pr_notice("%s rejected\n", reason);
+			return ret;
+		}
+
+		/* If IMA is guaranteed to appraise a signature on the kexec
+		 * image, permit it even if the kernel is otherwise locked
+		 * down.
+		 */
+		if (!ima_appraise_signature(READING_KEXEC_IMAGE) &&
+		    security_locked_down(LOCKDOWN_KEXEC))
+			return -EPERM;
+
+		return 0;
+
+		/* All other errors are fatal, including nomem, unparseable
+		 * signatures and signature check failures - even if signatures
+		 * aren't required.
+		 */
+	default:
+		pr_notice("kernel signature verification failed (%d).\n", ret);
+	}
+
+	return ret;
+}
+#endif
+
 /*
  * In file mode list of segments is prepared by kernel. Copy relevant
  * data from user space, do error checking, prepare segment list
@@ -186,7 +239,7 @@
 			     const char __user *cmdline_ptr,
 			     unsigned long cmdline_len, unsigned flags)
 {
-	int ret = 0;
+	int ret;
 	void *ldata;
 	loff_t size;
 
@@ -202,14 +255,11 @@
 	if (ret)
 		goto out;
 
-#ifdef CONFIG_KEXEC_VERIFY_SIG
-	ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
-					   image->kernel_buf_len);
-	if (ret) {
-		pr_debug("kernel signature verification failed.\n");
+#ifdef CONFIG_KEXEC_SIG
+	ret = kimage_validate_signature(image);
+
+	if (ret)
 		goto out;
-	}
-	pr_debug("kernel signature verification successful.\n");
 #endif
 	/* It is possible that there no initramfs is being loaded */
 	if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index c4ce08f..ab4a460 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -1175,6 +1175,7 @@
 	pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
 		patch->mod->name, obj->mod->name, obj->mod->name);
 	mod->klp_alive = false;
+	obj->mod = NULL;
 	klp_cleanup_module_patches_limited(mod, patch);
 	mutex_unlock(&klp_mutex);
 
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 89bab07..e84d21a 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -269,7 +269,7 @@
 	if ((loop & PV_PREV_CHECK_MASK) != 0)
 		return false;
 
-	return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu);
+	return READ_ONCE(prev->state) != vcpu_running;
 }
 
 /*
diff --git a/kernel/module.c b/kernel/module.c
index 32873bc..ff2d7359a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/extable.h>
 #include <linux/moduleloader.h>
+#include <linux/module_signature.h>
 #include <linux/trace_events.h>
 #include <linux/init.h>
 #include <linux/kallsyms.h>
@@ -2838,8 +2839,9 @@
 #ifdef CONFIG_MODULE_SIG
 static int module_sig_check(struct load_info *info, int flags)
 {
-	int err = -ENOKEY;
+	int err = -ENODATA;
 	const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+	const char *reason;
 	const void *mod = info->hdr;
 
 	/*
@@ -2854,16 +2856,38 @@
 		err = mod_verify_sig(mod, info);
 	}
 
-	if (!err) {
+	switch (err) {
+	case 0:
 		info->sig_ok = true;
 		return 0;
+
+		/* We don't permit modules to be loaded into trusted kernels
+		 * without a valid signature on them, but if we're not
+		 * enforcing, certain errors are non-fatal.
+		 */
+	case -ENODATA:
+		reason = "Loading of unsigned module";
+		goto decide;
+	case -ENOPKG:
+		reason = "Loading of module with unsupported crypto";
+		goto decide;
+	case -ENOKEY:
+		reason = "Loading of module with unavailable key";
+	decide:
+		if (is_module_sig_enforced()) {
+			pr_notice("%s is rejected\n", reason);
+			return -EKEYREJECTED;
+		}
+
+		return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
+
+		/* All other errors are fatal, including nomem, unparseable
+		 * signatures and signature check failures - even if signatures
+		 * aren't required.
+		 */
+	default:
+		return err;
 	}
-
-	/* Not having a signature is only an error if we're strict. */
-	if (err == -ENOKEY && !is_module_sig_enforced())
-		err = 0;
-
-	return err;
 }
 #else /* !CONFIG_MODULE_SIG */
 static int module_sig_check(struct load_info *info, int flags)
diff --git a/kernel/module_signature.c b/kernel/module_signature.c
new file mode 100644
index 0000000..4224a10
--- /dev/null
+++ b/kernel/module_signature.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/errno.h>
+#include <linux/printk.h>
+#include <linux/module_signature.h>
+#include <asm/byteorder.h>
+
+/**
+ * mod_check_sig - check that the given signature is sane
+ *
+ * @ms:		Signature to check.
+ * @file_len:	Size of the file to which @ms is appended.
+ * @name:	What is being checked. Used for error messages.
+ */
+int mod_check_sig(const struct module_signature *ms, size_t file_len,
+		  const char *name)
+{
+	if (be32_to_cpu(ms->sig_len) >= file_len - sizeof(*ms))
+		return -EBADMSG;
+
+	if (ms->id_type != PKEY_ID_PKCS7) {
+		pr_err("%s: Module is not signed with expected PKCS#7 message\n",
+		       name);
+		return -ENOPKG;
+	}
+
+	if (ms->algo != 0 ||
+	    ms->hash != 0 ||
+	    ms->signer_len != 0 ||
+	    ms->key_id_len != 0 ||
+	    ms->__pad[0] != 0 ||
+	    ms->__pad[1] != 0 ||
+	    ms->__pad[2] != 0) {
+		pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n",
+		       name);
+		return -EBADMSG;
+	}
+
+	return 0;
+}
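Note: mod_check_sig() only validates the fixed trailer; finding it assumes the caller has already stripped the magic marker string. A hedged sketch of the layout and of carving a buffer up accordingly (buf and buf_len are assumptions for illustration):

	/*
	 *   [ payload ][ PKCS#7 blob, sig_len bytes ][ struct module_signature ]
	 * (the "~Module signature appended~\n" marker was removed by the caller)
	 */
	const struct module_signature *ms =
			(const void *)(buf + buf_len - sizeof(*ms));

	if (mod_check_sig(ms, buf_len, "example") == 0)
		buf_len -= sizeof(*ms) + be32_to_cpu(ms->sig_len);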
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index b10fb19..9d9fc67 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -7,37 +7,13 @@
 
 #include <linux/kernel.h>
 #include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/module_signature.h>
 #include <linux/string.h>
 #include <linux/verification.h>
 #include <crypto/public_key.h>
 #include "module-internal.h"
 
-enum pkey_id_type {
-	PKEY_ID_PGP,		/* OpenPGP generated key ID */
-	PKEY_ID_X509,		/* X.509 arbitrary subjectKeyIdentifier */
-	PKEY_ID_PKCS7,		/* Signature in PKCS#7 message */
-};
-
-/*
- * Module signature information block.
- *
- * The constituents of the signature section are, in order:
- *
- *	- Signer's name
- *	- Key identifier
- *	- Signature data
- *	- Information block
- */
-struct module_signature {
-	u8	algo;		/* Public-key crypto algorithm [0] */
-	u8	hash;		/* Digest algorithm [0] */
-	u8	id_type;	/* Key identifier type [PKEY_ID_PKCS7] */
-	u8	signer_len;	/* Length of signer's name [0] */
-	u8	key_id_len;	/* Length of key identifier [0] */
-	u8	__pad[3];
-	__be32	sig_len;	/* Length of signature data */
-};
-
 /*
  * Verify the signature on a module.
  */
@@ -45,6 +21,7 @@
 {
 	struct module_signature ms;
 	size_t sig_len, modlen = info->len;
+	int ret;
 
 	pr_devel("==>%s(,%zu)\n", __func__, modlen);
 
@@ -52,32 +29,15 @@
 		return -EBADMSG;
 
 	memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
-	modlen -= sizeof(ms);
+
+	ret = mod_check_sig(&ms, modlen, info->name);
+	if (ret)
+		return ret;
 
 	sig_len = be32_to_cpu(ms.sig_len);
-	if (sig_len >= modlen)
-		return -EBADMSG;
-	modlen -= sig_len;
+	modlen -= sig_len + sizeof(ms);
 	info->len = modlen;
 
-	if (ms.id_type != PKEY_ID_PKCS7) {
-		pr_err("%s: Module is not signed with expected PKCS#7 message\n",
-		       info->name);
-		return -ENOPKG;
-	}
-
-	if (ms.algo != 0 ||
-	    ms.hash != 0 ||
-	    ms.signer_len != 0 ||
-	    ms.key_id_len != 0 ||
-	    ms.__pad[0] != 0 ||
-	    ms.__pad[1] != 0 ||
-	    ms.__pad[2] != 0) {
-		pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n",
-		       info->name);
-		return -EBADMSG;
-	}
-
 	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
 				      VERIFY_USE_SECONDARY_KEYRING,
 				      VERIFYING_MODULE_SIGNATURE,
diff --git a/kernel/panic.c b/kernel/panic.c
index 057540b..47e8ebc 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -12,6 +12,7 @@
 #include <linux/debug_locks.h>
 #include <linux/sched/debug.h>
 #include <linux/interrupt.h>
+#include <linux/kgdb.h>
 #include <linux/kmsg_dump.h>
 #include <linux/kallsyms.h>
 #include <linux/notifier.h>
@@ -220,6 +221,13 @@
 #endif
 
 	/*
+	 * If kgdb is enabled, give it a chance to run before we stop all
+	 * the other CPUs or else we won't be able to debug processes left
+	 * running on them.
+	 */
+	kgdb_panic(buf);
+
+	/*
 	 * If we have crashed and we have a crash kernel loaded let it handle
 	 * everything else.
 	 * If we want to run this after calling panic_notifiers, pass
@@ -551,9 +559,6 @@
 {
 	disable_trace_on_warning();
 
-	if (args)
-		pr_warn(CUT_HERE);
-
 	if (file)
 		pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
 			raw_smp_processor_id(), current->pid, file, line,
@@ -591,37 +596,26 @@
 	add_taint(taint, LOCKDEP_STILL_OK);
 }
 
-#ifdef WANT_WARN_ON_SLOWPATH
-void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
+#ifndef __WARN_FLAGS
+void warn_slowpath_fmt(const char *file, int line, unsigned taint,
+		       const char *fmt, ...)
 {
 	struct warn_args args;
 
-	args.fmt = fmt;
-	va_start(args.args, fmt);
-	__warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL,
-	       &args);
-	va_end(args.args);
-}
-EXPORT_SYMBOL(warn_slowpath_fmt);
+	pr_warn(CUT_HERE);
 
-void warn_slowpath_fmt_taint(const char *file, int line,
-			     unsigned taint, const char *fmt, ...)
-{
-	struct warn_args args;
+	if (!fmt) {
+		__warn(file, line, __builtin_return_address(0), taint,
+		       NULL, NULL);
+		return;
+	}
 
 	args.fmt = fmt;
 	va_start(args.args, fmt);
 	__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
 	va_end(args.args);
 }
-EXPORT_SYMBOL(warn_slowpath_fmt_taint);
-
-void warn_slowpath_null(const char *file, int line)
-{
-	pr_warn(CUT_HERE);
-	__warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
-}
-EXPORT_SYMBOL(warn_slowpath_null);
+EXPORT_SYMBOL(warn_slowpath_fmt);
 #else
 void __warn_printk(const char *fmt, ...)
 {
diff --git a/kernel/params.c b/kernel/params.c
index cf44878..8e56f8b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -12,6 +12,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/ctype.h>
+#include <linux/security.h>
 
 #ifdef CONFIG_SYSFS
 /* Protects all built-in parameters, modules use their own param_lock */
@@ -96,13 +97,19 @@
 	return parameqn(a, b, strlen(a)+1);
 }
 
-static void param_check_unsafe(const struct kernel_param *kp)
+static bool param_check_unsafe(const struct kernel_param *kp)
 {
+	if (kp->flags & KERNEL_PARAM_FL_HWPARAM &&
+	    security_locked_down(LOCKDOWN_MODULE_PARAMETERS))
+		return false;
+
 	if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
 		pr_notice("Setting dangerous option %s - tainting kernel\n",
 			  kp->name);
 		add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 	}
+
+	return true;
 }
 
 static int parse_one(char *param,
@@ -132,8 +139,10 @@
 			pr_debug("handling %s with %p\n", param,
 				params[i].ops->set);
 			kernel_param_lock(params[i].mod);
-			param_check_unsafe(&params[i]);
-			err = params[i].ops->set(val, &params[i]);
+			if (param_check_unsafe(&params[i]))
+				err = params[i].ops->set(val, &params[i]);
+			else
+				err = -EPERM;
 			kernel_param_unlock(params[i].mod);
 			return err;
 		}
@@ -553,8 +562,10 @@
 		return -EPERM;
 
 	kernel_param_lock(mk->mod);
-	param_check_unsafe(attribute->param);
-	err = attribute->param->ops->set(buf, attribute->param);
+	if (param_check_unsafe(attribute->param))
+		err = attribute->param->ops->set(buf, attribute->param);
+	else
+		err = -EPERM;
 	kernel_param_unlock(mk->mod);
 	if (!err)
 		return len;
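Note: the parameters this gates are the ones declared via the module_param_hw*() helpers, which set KERNEL_PARAM_FL_HWPARAM; under LOCKDOWN_MODULE_PARAMETERS they now fail with -EPERM rather than being applied. A hedged driver-side sketch:

	static int io = 0x300;
	/* the "ioport" hwtype marks this as a hardware parameter */
	module_param_hw(io, int, ioport, 0444);
	MODULE_PARM_DESC(io, "I/O port base (refused under lockdown)");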
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index cd7434e..3c0a5a8 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -30,6 +30,7 @@
 #include <linux/ctype.h>
 #include <linux/genhd.h>
 #include <linux/ktime.h>
+#include <linux/security.h>
 #include <trace/events/power.h>
 
 #include "power.h"
@@ -68,7 +69,7 @@
 
 bool hibernation_available(void)
 {
-	return (nohibernate == 0);
+	return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION);
 }
 
 /**
diff --git a/kernel/resource.c b/kernel/resource.c
index 74877e9..76036a4 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -487,8 +487,8 @@
 	while (start < end &&
 	       !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
 				    false, &res)) {
-		pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		end_pfn = (res.end + 1) >> PAGE_SHIFT;
+		pfn = PFN_UP(res.start);
+		end_pfn = PFN_DOWN(res.end + 1);
 		if (end_pfn > pfn)
 			ret = (*func)(pfn, end_pfn - pfn, arg);
 		if (ret)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f9a1346..7880f4f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1656,7 +1656,8 @@
 	if (cpumask_equal(p->cpus_ptr, new_mask))
 		goto out;
 
-	if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+	if (dest_cpu >= nr_cpu_ids) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1677,7 +1678,6 @@
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 
-	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
@@ -3254,7 +3254,7 @@
 		/* Task is done with its stack. */
 		put_task_stack(prev);
 
-		put_task_struct(prev);
+		put_task_struct_rcu_user(prev);
 	}
 
 	tick_nohz_task_switch();
@@ -3358,15 +3358,15 @@
 		else
 			prev->active_mm = NULL;
 	} else {                                        // to user
+		membarrier_switch_mm(rq, prev->active_mm, next->mm);
 		/*
 		 * sys_membarrier() requires an smp_mb() between setting
-		 * rq->curr and returning to userspace.
+		 * rq->curr / membarrier_switch_mm() and returning to userspace.
 		 *
 		 * The below provides this either through switch_mm(), or in
 		 * case 'prev->active_mm == next->mm' through
 		 * finish_task_switch()'s mmdrop().
 		 */
-
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
@@ -4042,7 +4042,11 @@
 
 	if (likely(prev != next)) {
 		rq->nr_switches++;
-		rq->curr = next;
+		/*
+		 * RCU users of rcu_dereference(rq->curr) may not see
+		 * changes to task_struct made by pick_next_task().
+		 */
+		RCU_INIT_POINTER(rq->curr, next);
 		/*
 		 * The membarrier system call requires each architecture
 		 * to have a full memory barrier after updating
@@ -4223,9 +4227,8 @@
 
 #ifdef CONFIG_PREEMPTION
 /*
- * this is the entry point to schedule() from in-kernel preemption
- * off of preempt_enable. Kernel preemptions off return from interrupt
- * occur there and call schedule directly.
+ * This is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable.
  */
 asmlinkage __visible void __sched notrace preempt_schedule(void)
 {
@@ -4296,7 +4299,7 @@
 #endif /* CONFIG_PREEMPTION */
 
 /*
- * this is the entry point to schedule() from kernel preemption
+ * This is the entry point to schedule() from kernel preemption
  * off of irq context.
  * Note that this is called and returns with irqs disabled. This will
  * protect us against recursive calling from irq.
@@ -6069,7 +6072,8 @@
 	__set_task_cpu(idle, cpu);
 	rcu_read_unlock();
 
-	rq->curr = rq->idle = idle;
+	rq->idle = idle;
+	rcu_assign_pointer(rq->curr, idle);
 	idle->on_rq = TASK_ON_RQ_QUEUED;
 #ifdef CONFIG_SMP
 	idle->on_cpu = 1;
@@ -6430,8 +6434,6 @@
 	}
 	rq_unlock_irqrestore(rq, &rf);
 
-	update_max_interval();
-
 	return 0;
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d4bbf68..83ab35e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -749,7 +749,6 @@
 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 static void attach_entity_cfs_rq(struct sched_entity *se);
 
 /*
@@ -1603,7 +1602,7 @@
 		return;
 
 	rcu_read_lock();
-	cur = task_rcu_dereference(&dst_rq->curr);
+	cur = rcu_dereference(dst_rq->curr);
 	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
 		cur = NULL;
 
@@ -4354,21 +4353,16 @@
 }
 
 /*
- * Replenish runtime according to assigned quota and update expiration time.
- * We use sched_clock_cpu directly instead of rq->clock to avoid adding
- * additional synchronization around rq->lock.
+ * Replenish runtime according to assigned quota. We use sched_clock_cpu
+ * directly instead of rq->clock to avoid adding additional synchronization
+ * around rq->lock.
  *
  * requires cfs_b->lock
  */
 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
 {
-	u64 now;
-
-	if (cfs_b->quota == RUNTIME_INF)
-		return;
-
-	now = sched_clock_cpu(smp_processor_id());
-	cfs_b->runtime = cfs_b->quota;
+	if (cfs_b->quota != RUNTIME_INF)
+		cfs_b->runtime = cfs_b->quota;
 }
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4376,15 +4370,6 @@
 	return &tg->cfs_bandwidth;
 }
 
-/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
-{
-	if (unlikely(cfs_rq->throttle_count))
-		return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
-
-	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
-}
-
 /* returns 0 on failure to allocate runtime */
 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
@@ -4476,7 +4461,6 @@
 
 	cfs_rq->throttle_count--;
 	if (!cfs_rq->throttle_count) {
-		/* adjust cfs_rq_clock_task() */
 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
 					     cfs_rq->throttled_clock_task;
 
@@ -4994,15 +4978,13 @@
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
-	u64 overrun;
-
 	lockdep_assert_held(&cfs_b->lock);
 
 	if (cfs_b->period_active)
 		return;
 
 	cfs_b->period_active = 1;
-	overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+	hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
 	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
 }
 
@@ -5080,11 +5062,6 @@
 	return false;
 }
 
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
-{
-	return rq_clock_task(rq_of(cfs_rq));
-}
-
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
@@ -6412,7 +6389,7 @@
 		}
 
 		/* Evaluate the energy impact of using this CPU. */
-		if (max_spare_cap_cpu >= 0) {
+		if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
 			cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
 			cur_delta -= base_energy_pd;
 			if (cur_delta < best_delta) {
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c892c62..8dad5aa 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -238,7 +238,6 @@
 	tick_nohz_idle_enter();
 
 	while (!need_resched()) {
-		check_pgt_cache();
 		rmb();
 
 		local_irq_disable();
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index aa8d758..a39bed2 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -30,10 +30,42 @@
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_sync_rq_state(void *info)
+{
+	struct mm_struct *mm = (struct mm_struct *) info;
+
+	if (current->mm != mm)
+		return;
+	this_cpu_write(runqueues.membarrier_state,
+		       atomic_read(&mm->membarrier_state));
+	/*
+	 * Issue a memory barrier after setting
+	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
+	 * guarantee that no memory access following registration is reordered
+	 * before registration.
+	 */
+	smp_mb();
+}
+
+void membarrier_exec_mmap(struct mm_struct *mm)
+{
+	/*
+	 * Issue a memory barrier before clearing membarrier_state to
+	 * guarantee that no memory access prior to exec is reordered after
+	 * clearing this state.
+	 */
+	smp_mb();
+	atomic_set(&mm->membarrier_state, 0);
+	/*
+	 * Keep the runqueue membarrier_state in sync with this mm
+	 * membarrier_state.
+	 */
+	this_cpu_write(runqueues.membarrier_state, 0);
+}
+
 static int membarrier_global_expedited(void)
 {
 	int cpu;
-	bool fallback = false;
 	cpumask_var_t tmpmask;
 
 	if (num_online_cpus() == 1)
@@ -45,17 +77,11 @@
 	 */
 	smp_mb();	/* system call entry is not a mb. */
 
-	/*
-	 * Expedited membarrier commands guarantee that they won't
-	 * block, hence the GFP_NOWAIT allocation flag and fallback
-	 * implementation.
-	 */
-	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
-		/* Fallback for OOM. */
-		fallback = true;
-	}
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
 
 	cpus_read_lock();
+	rcu_read_lock();
 	for_each_online_cpu(cpu) {
 		struct task_struct *p;
 
@@ -70,23 +96,28 @@
 		if (cpu == raw_smp_processor_id())
 			continue;
 
-		rcu_read_lock();
-		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
-		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
-				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
-			if (!fallback)
-				__cpumask_set_cpu(cpu, tmpmask);
-			else
-				smp_call_function_single(cpu, ipi_mb, NULL, 1);
-		}
-		rcu_read_unlock();
+		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
+		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
+			continue;
+
+		/*
+		 * Skip the CPU if it runs a kernel thread. The scheduler
+		 * leaves the prior task mm in place as an optimization when
+		 * scheduling a kthread.
+		 */
+		p = rcu_dereference(cpu_rq(cpu)->curr);
+		if (p->flags & PF_KTHREAD)
+			continue;
+
+		__cpumask_set_cpu(cpu, tmpmask);
 	}
-	if (!fallback) {
-		preempt_disable();
-		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
-		preempt_enable();
-		free_cpumask_var(tmpmask);
-	}
+	rcu_read_unlock();
+
+	preempt_disable();
+	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+	preempt_enable();
+
+	free_cpumask_var(tmpmask);
 	cpus_read_unlock();
 
 	/*
@@ -101,22 +132,22 @@
 static int membarrier_private_expedited(int flags)
 {
 	int cpu;
-	bool fallback = false;
 	cpumask_var_t tmpmask;
+	struct mm_struct *mm = current->mm;
 
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 			return -EINVAL;
-		if (!(atomic_read(&current->mm->membarrier_state) &
+		if (!(atomic_read(&mm->membarrier_state) &
 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 			return -EPERM;
 	} else {
-		if (!(atomic_read(&current->mm->membarrier_state) &
+		if (!(atomic_read(&mm->membarrier_state) &
 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
 			return -EPERM;
 	}
 
-	if (num_online_cpus() == 1)
+	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
 		return 0;
 
 	/*
@@ -125,17 +156,11 @@
 	 */
 	smp_mb();	/* system call entry is not a mb. */
 
-	/*
-	 * Expedited membarrier commands guarantee that they won't
-	 * block, hence the GFP_NOWAIT allocation flag and fallback
-	 * implementation.
-	 */
-	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
-		/* Fallback for OOM. */
-		fallback = true;
-	}
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
 
 	cpus_read_lock();
+	rcu_read_lock();
 	for_each_online_cpu(cpu) {
 		struct task_struct *p;
 
@@ -150,21 +175,16 @@
 		if (cpu == raw_smp_processor_id())
 			continue;
-		rcu_read_lock();
-		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
-		if (p && p->mm == current->mm) {
-			if (!fallback)
-				__cpumask_set_cpu(cpu, tmpmask);
-			else
-				smp_call_function_single(cpu, ipi_mb, NULL, 1);
-		}
-		rcu_read_unlock();
+		p = rcu_dereference(cpu_rq(cpu)->curr);
+		if (p && p->mm == mm)
+			__cpumask_set_cpu(cpu, tmpmask);
 	}
-	if (!fallback) {
-		preempt_disable();
-		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
-		preempt_enable();
-		free_cpumask_var(tmpmask);
-	}
+	rcu_read_unlock();
+
+	preempt_disable();
+	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+	preempt_enable();
+
+	free_cpumask_var(tmpmask);
 	cpus_read_unlock();
 
 	/*
@@ -177,32 +198,78 @@
 	return 0;
 }
 
+static int sync_runqueues_membarrier_state(struct mm_struct *mm)
+{
+	int membarrier_state = atomic_read(&mm->membarrier_state);
+	cpumask_var_t tmpmask;
+	int cpu;
+
+	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
+		this_cpu_write(runqueues.membarrier_state, membarrier_state);
+
+		/*
+		 * For single mm user, we can simply issue a memory barrier
+		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
+		 * mm and in the current runqueue to guarantee that no memory
+		 * access following registration is reordered before
+		 * registration.
+		 */
+		smp_mb();
+		return 0;
+	}
+
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	/*
+	 * For mm with multiple users, we need to ensure all future
+	 * scheduler executions will observe @mm's new membarrier
+	 * state.
+	 */
+	synchronize_rcu();
+
+	/*
+	 * For each cpu runqueue, if the task's mm match @mm, ensure that all
+	 * @mm's membarrier state set bits are also set in the runqueue's
+	 * membarrier state. This ensures that a runqueue scheduling
+	 * between threads which are users of @mm has its membarrier state
+	 * updated.
+	 */
+	cpus_read_lock();
+	rcu_read_lock();
+	for_each_online_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		struct task_struct *p;
+
+		p = rcu_dereference(rq->curr);
+		if (p && p->mm == mm)
+			__cpumask_set_cpu(cpu, tmpmask);
+	}
+	rcu_read_unlock();
+
+	preempt_disable();
+	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
+	preempt_enable();
+
+	free_cpumask_var(tmpmask);
+	cpus_read_unlock();
+
+	return 0;
+}
+
 static int membarrier_register_global_expedited(void)
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
+	int ret;
 
 	if (atomic_read(&mm->membarrier_state) &
 	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 		return 0;
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
-	if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
-		/*
-		 * For single mm user, single threaded process, we can
-		 * simply issue a memory barrier after setting
-		 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
-		 * no memory access following registration is reordered
-		 * before registration.
-		 */
-		smp_mb();
-	} else {
-		/*
-		 * For multi-mm user threads, we need to ensure all
-		 * future scheduler executions will observe the new
-		 * thread flag state for this mm.
-		 */
-		synchronize_rcu();
-	}
+	ret = sync_runqueues_membarrier_state(mm);
+	if (ret)
+		return ret;
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 		  &mm->membarrier_state);
 
@@ -213,12 +280,15 @@
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
-	int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
+	    ret;
 
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 			return -EINVAL;
-		state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+		ready_state =
+			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
 	}
 
 	/*
@@ -226,20 +296,15 @@
 	 * groups, which use the same mm. (CLONE_VM but not
 	 * CLONE_THREAD).
 	 */
-	if (atomic_read(&mm->membarrier_state) & state)
+	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
 		return 0;
-	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
-		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
-			  &mm->membarrier_state);
-	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
-		/*
-		 * Ensure all future scheduler executions will observe the
-		 * new thread flag state for this process.
-		 */
-		synchronize_rcu();
-	}
-	atomic_or(state, &mm->membarrier_state);
+		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+	atomic_or(set_state, &mm->membarrier_state);
+	ret = sync_runqueues_membarrier_state(mm);
+	if (ret)
+		return ret;
+	atomic_or(ready_state, &mm->membarrier_state);
 
 	return 0;
 }
@@ -253,8 +318,10 @@
  * command specified does not exist, not available on the running
  * kernel, or if the command argument is invalid, this system call
  * returns -EINVAL. For a given command, with flags argument set to 0,
- * this system call is guaranteed to always return the same value until
- * reboot.
+ * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
+ * always return the same value until reboot. In addition, it can return
+ * -ENOMEM if there is not enough memory available to perform the system
+ * call.
  *
  * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
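
For context, the registration paths above are driven from userspace through
the membarrier(2) system call. A minimal sketch of the private-expedited
flow (raw syscall(2) is used here since libc wrappers are uncommon; the
command constants come from <linux/membarrier.h>):

	#include <linux/membarrier.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int membarrier(int cmd, int flags)
	{
		return syscall(__NR_membarrier, cmd, flags);
	}

	int main(void)
	{
		/* Register once per process; per the updated documentation
		 * above, this can now also fail with -ENOMEM. */
		if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
			return 1;

		/* Expedited barrier against all other threads of this mm. */
		return membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0) ? 1 : 0;
	}
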
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3cb895..0db2c1b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -911,6 +911,10 @@
 
 	atomic_t		nr_iowait;
 
+#ifdef CONFIG_MEMBARRIER
+	int membarrier_state;
+#endif
+
 #ifdef CONFIG_SMP
 	struct root_domain		*rd;
 	struct sched_domain __rcu	*sd;
@@ -2438,3 +2442,33 @@
 static inline bool sched_energy_enabled(void) { return false; }
 
 #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+
+#ifdef CONFIG_MEMBARRIER
+/*
+ * The scheduler provides memory barriers required by membarrier between:
+ * - prior user-space memory accesses and store to rq->membarrier_state,
+ * - store to rq->membarrier_state and following user-space memory accesses.
+ * In the same way it provides those guarantees around store to rq->curr.
+ */
+static inline void membarrier_switch_mm(struct rq *rq,
+					struct mm_struct *prev_mm,
+					struct mm_struct *next_mm)
+{
+	int membarrier_state;
+
+	if (prev_mm == next_mm)
+		return;
+
+	membarrier_state = atomic_read(&next_mm->membarrier_state);
+	if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+		return;
+
+	WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+#else
+static inline void membarrier_switch_mm(struct rq *rq,
+					struct mm_struct *prev_mm,
+					struct mm_struct *next_mm)
+{
+}
+#endif
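
The companion change that calls this hook (in kernel/sched/core.c, not part
of this excerpt) wires it into the context switch before the actual mm
switch, so rq->membarrier_state always tracks the mm that rq->curr is about
to run with. Roughly, as a simplified sketch:

	/* context_switch(), simplified: */
	if (!next->mm) {			/* switching to a kernel thread */
		next->active_mm = prev->active_mm;
		mmgrab(prev->active_mm);
	} else {				/* switching to a user thread */
		membarrier_switch_mm(rq, prev->active_mm, next->mm);
		switch_mm_irqs_off(prev->active_mm, next->mm, next);
	}
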
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 078950d..00fcea2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -264,7 +264,8 @@
 extern struct ctl_table firmware_config_table[];
 #endif
 
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+    defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
 int sysctl_legacy_va_layout;
 #endif
 
@@ -1573,7 +1574,8 @@
 		.proc_handler	= proc_dointvec,
 		.extra1		= SYSCTL_ZERO,
 	},
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+    defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
 	{
 		.procname	= "legacy_va_layout",
 		.data		= &sysctl_legacy_va_layout,
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 0e315a2..4820823 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1678,24 +1678,26 @@
 static int collect_expired_timers(struct timer_base *base,
 				  struct hlist_head *heads)
 {
+	unsigned long now = READ_ONCE(jiffies);
+
 	/*
 	 * NOHZ optimization. After a long idle sleep we need to forward the
 	 * base to current jiffies. Avoid a loop by searching the bitfield for
 	 * the next expiring timer.
 	 */
-	if ((long)(jiffies - base->clk) > 2) {
+	if ((long)(now - base->clk) > 2) {
 		unsigned long next = __next_timer_interrupt(base);
 
 		/*
 		 * If the next timer is ahead of time forward to current
 		 * jiffies, otherwise forward to the next expiry time:
 		 */
-		if (time_after(next, jiffies)) {
+		if (time_after(next, now)) {
 			/*
 			 * The call site will increment base->clk and then
 			 * terminate the expiry loop immediately.
 			 */
-			base->clk = jiffies;
+			base->clk = now;
 			return 0;
 		}
 		base->clk = next;
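
The point of the change above is that jiffies is read exactly once: both the
"is the base behind?" test and the "forward to now or to the next expiry"
decision must see the same value, otherwise a tick landing between two reads
can make the branches disagree. The idiom, reduced to a hypothetical helper:

	/* Hypothetical sketch: base every decision on one snapshot. */
	static void forward_clk(struct timer_base *base)
	{
		unsigned long now = READ_ONCE(jiffies);	/* single read */
		unsigned long next = __next_timer_interrupt(base);

		base->clk = time_after(next, now) ? now : next;
	}
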
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d..44bd08f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -142,8 +142,13 @@
 {
 	int ret;
 
+	ret = security_locked_down(LOCKDOWN_BPF_READ);
+	if (ret < 0)
+		goto out;
+
 	ret = probe_kernel_read(dst, unsafe_ptr, size);
 	if (unlikely(ret < 0))
+out:
 		memset(dst, 0, size);
 
 	return ret;
@@ -500,14 +505,17 @@
 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
-static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);
+static DEFINE_PER_CPU(int, bpf_event_output_nest_level);
+struct bpf_nested_pt_regs {
+	struct pt_regs regs[3];
+};
+static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs);
+static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);
 
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
 {
-	struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
-	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
+	int nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
 	struct perf_raw_frag frag = {
 		.copy		= ctx_copy,
 		.size		= ctx_size,
@@ -522,12 +530,25 @@
 			.data	= meta,
 		},
 	};
+	struct perf_sample_data *sd;
+	struct pt_regs *regs;
+	u64 ret;
+
+	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) {
+		ret = -EBUSY;
+		goto out;
+	}
+	sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]);
+	regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]);
 
 	perf_fetch_caller_regs(regs);
 	perf_sample_data_init(sd, 0, 0);
 	sd->raw = &raw;
 
-	return __bpf_perf_event_output(regs, map, flags, sd);
+	ret = __bpf_perf_event_output(regs, map, flags, sd);
+out:
+	this_cpu_dec(bpf_event_output_nest_level);
+	return ret;
 }
 
 BPF_CALL_0(bpf_get_current_task)
@@ -569,6 +590,10 @@
 {
 	int ret;
 
+	ret = security_locked_down(LOCKDOWN_BPF_READ);
+	if (ret < 0)
+		goto out;
+
 	/*
 	 * The strncpy_from_unsafe() call will likely not fill the entire
 	 * buffer, but that's okay in this circumstance as we're probing
@@ -580,6 +605,7 @@
 	 */
 	ret = strncpy_from_unsafe(dst, unsafe_ptr, size);
 	if (unlikely(ret < 0))
+out:
 		memset(dst, 0, size);
 
 	return ret;
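
The bpf_event_output() fix above replaces single per-CPU scratch buffers
with a small per-CPU stack indexed by a nesting counter, so a BPF program
firing from irq or NMI context no longer clobbers the buffers of the context
it interrupted. The pattern, reduced to a sketch with hypothetical names:

	static DEFINE_PER_CPU(int, out_nest_level);
	static DEFINE_PER_CPU(struct pt_regs, out_regs[3]);

	u64 guarded_output(void)
	{
		/* One slot each for task, irq and NMI context. */
		int level = this_cpu_inc_return(out_nest_level);
		struct pt_regs *regs;
		u64 ret = 0;

		if (WARN_ON_ONCE(level > 3)) {
			ret = -EBUSY;
			goto out;
		}
		regs = this_cpu_ptr(&out_regs[level - 1]);
		/* ... use regs; an NMI re-entering here gets its own slot ... */
	out:
		this_cpu_dec(out_nest_level);
		return ret;
	}
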
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a6697e2..324ffbe 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -11,6 +11,7 @@
 #include <linux/uaccess.h>
 #include <linux/rculist.h>
 #include <linux/error-injection.h>
+#include <linux/security.h>
 
 #include <asm/setup.h>  /* for COMMAND_LINE_SIZE */
 
@@ -460,6 +461,10 @@
 {
 	int i, ret;
 
+	ret = security_locked_down(LOCKDOWN_KPROBES);
+	if (ret)
+		return ret;
+
 	if (trace_kprobe_is_registered(tk))
 		return -EINVAL;
 
@@ -549,10 +554,11 @@
 		for (i = 0; i < orig->tp.nr_args; i++) {
 			if (strcmp(orig->tp.args[i].comm,
 				   comp->tp.args[i].comm))
-				continue;
+				break;
 		}
 
-		return true;
+		if (i == orig->tp.nr_args)
+			return true;
 	}
 
 	return false;
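
The comparison fix above (repeated for uprobes below) is easy to misread, so
spelled out: with "continue", a mismatching argument merely skipped one
iteration and control always fell through to "return true", treating any two
probes with the same argument count as identical. A simplified illustration:

	/* Before: the loop could never fail, so this matched unconditionally. */
	for (i = 0; i < nr_args; i++)
		if (strcmp(orig_args[i], comp_args[i]))
			continue;
	return true;

	/* After: a mismatch stops the scan; match only if all args agreed. */
	for (i = 0; i < nr_args; i++)
		if (strcmp(orig_args[i], comp_args[i]))
			break;
	if (i == nr_args)
		return true;
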
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 34dd6d0..dd88434 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -431,10 +431,11 @@
 		for (i = 0; i < orig->tp.nr_args; i++) {
 			if (strcmp(orig->tp.args[i].comm,
 				   comp->tp.args[i].comm))
-				continue;
+				break;
 		}
 
-		return true;
+		if (i == orig->tp.nr_args)
+			return true;
 	}
 
 	return false;
diff --git a/lib/Kconfig b/lib/Kconfig
index 4e6b1c3..183f92a 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -555,11 +555,10 @@
 	  Implementation is done using GnuPG MPI library
 
 config DIMLIB
-	bool "DIM library"
-	default y
+	bool
 	help
 	  Dynamic Interrupt Moderation library.
-	  Implements an algorithm for dynamically change CQ modertion values
+	  Implements an algorithm for dynamically changing CQ moderation values
 	  according to run time performance.
 
 #
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e0e1478..93d97f9 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -311,7 +311,7 @@
 	  relevant for userspace, say 'Y'.
 
 config OPTIMIZE_INLINING
-	bool "Allow compiler to uninline functions marked 'inline'"
+	def_bool y
 	help
 	  This option determines if the kernel forces gcc to inline the functions
 	  developers have marked 'inline'. Doing so takes away freedom from gcc to
@@ -322,8 +322,6 @@
 	  decision will become the default in the future. Until then this option
 	  is there to test gcc for this.
 
-	  If unsure, say N.
-
 config DEBUG_SECTION_MISMATCH
 	bool "Enable full Section mismatch analysis"
 	help
@@ -576,17 +574,18 @@
 	  In order to access the kmemleak file, debugfs needs to be
 	  mounted (usually at /sys/kernel/debug).
 
-config DEBUG_KMEMLEAK_EARLY_LOG_SIZE
-	int "Maximum kmemleak early log entries"
+config DEBUG_KMEMLEAK_MEM_POOL_SIZE
+	int "Kmemleak memory pool size"
 	depends on DEBUG_KMEMLEAK
-	range 200 40000
-	default 400
+	range 200 1000000
+	default 16000
 	help
 	  Kmemleak must track all the memory allocations to avoid
 	  reporting false positives. Since memory may be allocated or
-	  freed before kmemleak is initialised, an early log buffer is
-	  used to store these actions. If kmemleak reports "early log
-	  buffer exceeded", please increase this value.
+	  freed before kmemleak is fully initialised, a static pool of
+	  metadata objects is used to track such callbacks. After kmemleak is
+	  fully initialised, this memory pool acts as an emergency one
+	  if slab allocations fail.
 
 config DEBUG_KMEMLEAK_TEST
 	tristate "Simple test for the kernel memory leak detector"
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 7fa97a8..6c9682c 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -134,6 +134,14 @@
 	  to 3TB of RAM with KASan enabled). This option allows forcing
 	  4-level paging instead.
 
+config KASAN_SW_TAGS_IDENTIFY
+	bool "Enable memory corruption identification"
+	depends on KASAN_SW_TAGS
+	help
+	  This option enables best-effort identification of bug type
+	  (use-after-free or out-of-bounds) at the cost of increased
+	  memory consumption.
+
 config TEST_KASAN
 	tristate "Module for testing KASAN for bug detection"
 	depends on m && KASAN
diff --git a/lib/bug.c b/lib/bug.c
index 1077366..8c98af0 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -181,6 +181,15 @@
 		}
 	}
 
+	/*
+	 * BUG() and WARN_ON() families don't print a custom debug message
+	 * before triggering the exception handler, so we must add the
+	 * "cut here" line now. WARN() issues its own "cut here" before the
+	 * extra debugging message it writes before triggering the handler.
+	 */
+	if ((bug->flags & BUGFLAG_NO_CUT_HERE) == 0)
+		printk(KERN_DEFAULT CUT_HERE);
+
 	if (warning) {
 		/* this is a WARN_ON rather than BUG/BUG_ON */
 		__warn(file, line, (void *)bugaddr, BUG_GET_TAINT(bug), regs,
@@ -188,8 +197,6 @@
 		return BUG_TRAP_TYPE_WARN;
 	}
 
-	printk(KERN_DEFAULT CUT_HERE);
-
 	if (file)
 		pr_crit("kernel BUG at %s:%u!\n", file, line);
 	else
diff --git a/lib/extable.c b/lib/extable.c
index 25da407..c3e59ca 100644
--- a/lib/extable.c
+++ b/lib/extable.c
@@ -10,6 +10,7 @@
 #include <linux/init.h>
 #include <linux/sort.h>
 #include <linux/uaccess.h>
+#include <linux/extable.h>
 
 #ifndef ARCH_HAS_RELATIVE_EXTABLE
 #define ex_to_insn(x)	((x)->insn)
diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c
index a7bafc4..ae25e2f 100644
--- a/lib/generic-radix-tree.c
+++ b/lib/generic-radix-tree.c
@@ -36,12 +36,12 @@
 #define GENRADIX_DEPTH_MASK				\
 	((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
 
-unsigned genradix_root_to_depth(struct genradix_root *r)
+static inline unsigned genradix_root_to_depth(struct genradix_root *r)
 {
 	return (unsigned long) r & GENRADIX_DEPTH_MASK;
 }
 
-struct genradix_node *genradix_root_to_node(struct genradix_root *r)
+static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
 {
 	return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
 }
diff --git a/lib/hexdump.c b/lib/hexdump.c
index b1d55b6..147133f 100644
--- a/lib/hexdump.c
+++ b/lib/hexdump.c
@@ -270,25 +270,4 @@
 }
 EXPORT_SYMBOL(print_hex_dump);
 
-#if !defined(CONFIG_DYNAMIC_DEBUG)
-/**
- * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params
- * @prefix_str: string to prefix each line with;
- *  caller supplies trailing spaces for alignment if desired
- * @prefix_type: controls whether prefix of an offset, address, or none
- *  is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE)
- * @buf: data blob to dump
- * @len: number of bytes in the @buf
- *
- * Calls print_hex_dump(), with log level of KERN_DEBUG,
- * rowsize of 16, groupsize of 1, and ASCII output included.
- */
-void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
-			  const void *buf, size_t len)
-{
-	print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, 16, 1,
-		       buf, len, true);
-}
-EXPORT_SYMBOL(print_hex_dump_bytes);
-#endif /* !defined(CONFIG_DYNAMIC_DEBUG) */
 #endif /* defined(CONFIG_PRINTK) */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f1e0569..639d5e7 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -878,7 +878,7 @@
 	head = compound_head(page);
 	v += (page - head) << PAGE_SHIFT;
 
-	if (likely(n <= v && v <= (PAGE_SIZE << compound_order(head))))
+	if (likely(n <= v && v <= (page_size(head))))
 		return true;
 	WARN_ON(1);
 	return false;
diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c
index ba16c08..717c940 100644
--- a/lib/lzo/lzo1x_compress.c
+++ b/lib/lzo/lzo1x_compress.c
@@ -83,17 +83,19 @@
 					ALIGN((uintptr_t)ir, 4)) &&
 					(ir < limit) && (*ir == 0))
 				ir++;
-			for (; (ir + 4) <= limit; ir += 4) {
-				dv = *((u32 *)ir);
-				if (dv) {
+			if (IS_ALIGNED((uintptr_t)ir, 4)) {
+				for (; (ir + 4) <= limit; ir += 4) {
+					dv = *((u32 *)ir);
+					if (dv) {
 #  if defined(__LITTLE_ENDIAN)
-					ir += __builtin_ctz(dv) >> 3;
+						ir += __builtin_ctz(dv) >> 3;
 #  elif defined(__BIG_ENDIAN)
-					ir += __builtin_clz(dv) >> 3;
+						ir += __builtin_clz(dv) >> 3;
 #  else
 #    error "missing endian definition"
 #  endif
-					break;
+						break;
+					}
 				}
 			}
 #endif
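
The added IS_ALIGNED() guard matters because the word-at-a-time loop casts
the byte pointer to u32 *; dereferencing a misaligned pointer that way is
undefined behaviour and can fault on strict-alignment CPUs. The macro, from
include/linux/kernel.h:

	#define IS_ALIGNED(x, a)	(((x) & ((typeof(x))(a) - 1)) == 0)

	/* Hypothetical example of what the guard prevents: */
	const unsigned char *ir = buf + 1;	/* misaligned by one byte */
	u32 dv = *(const u32 *)ir;		/* may trap without the check */
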
diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c
index 62b8ee9..41ae3c7 100644
--- a/lib/rbtree_test.c
+++ b/lib/rbtree_test.c
@@ -77,26 +77,10 @@
 }
 
 
-static inline u32 augment_recompute(struct test_node *node)
-{
-	u32 max = node->val, child_augmented;
-	if (node->rb.rb_left) {
-		child_augmented = rb_entry(node->rb.rb_left, struct test_node,
-					   rb)->augmented;
-		if (max < child_augmented)
-			max = child_augmented;
-	}
-	if (node->rb.rb_right) {
-		child_augmented = rb_entry(node->rb.rb_right, struct test_node,
-					   rb)->augmented;
-		if (max < child_augmented)
-			max = child_augmented;
-	}
-	return max;
-}
+#define NODE_VAL(node) ((node)->val)
 
-RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb,
-		     u32, augmented, augment_recompute)
+RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks,
+			 struct test_node, rb, u32, augmented, NODE_VAL)
 
 static void insert_augmented(struct test_node *node,
 			     struct rb_root_cached *root)
@@ -238,7 +222,20 @@
 	check(nr_nodes);
 	for (rb = rb_first(&root.rb_root); rb; rb = rb_next(rb)) {
 		struct test_node *node = rb_entry(rb, struct test_node, rb);
-		WARN_ON_ONCE(node->augmented != augment_recompute(node));
+		u32 subtree, max = node->val;
+		if (node->rb.rb_left) {
+			subtree = rb_entry(node->rb.rb_left, struct test_node,
+					   rb)->augmented;
+			if (max < subtree)
+				max = subtree;
+		}
+		if (node->rb.rb_right) {
+			subtree = rb_entry(node->rb.rb_right, struct test_node,
+					   rb)->augmented;
+			if (max < subtree)
+				max = subtree;
+		}
+		WARN_ON_ONCE(node->augmented != max);
 	}
 }
 
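
RB_DECLARE_CALLBACKS_MAX, new in this series, generates the propagate, copy
and rotate callbacks from a single "compute value" expression, keeping
->augmented equal to the maximum of NODE_VAL() over each subtree; the
hand-written augment_recompute() above becomes redundant. Call sites keep
handing the generated structure to the augmented API, e.g. (a sketch; the
test itself uses the _cached variants on an rb_root_cached):

	rb_insert_augmented_cached(&node->rb, &root, leftmost,
				   &augment_callbacks);
	rb_erase_augmented_cached(&node->rb, &root, &augment_callbacks);
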
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 5c86ef4..1c26c14 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -6,7 +6,6 @@
  */
 
 #include <linux/mm.h>
-#include <linux/quicklist.h>
 #include <linux/cma.h>
 
 void show_mem(unsigned int filter, nodemask_t *nodemask)
@@ -39,10 +38,6 @@
 #ifdef CONFIG_CMA
 	printk("%lu pages cma reserved\n", totalcma_pages);
 #endif
-#ifdef CONFIG_QUICKLIST
-	printk("%lu pages in pagetable cache\n",
-		quicklist_total_size());
-#endif
 #ifdef CONFIG_MEMORY_FAILURE
 	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
 #endif
diff --git a/lib/string.c b/lib/string.c
index 461fb62..cd7a10c 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -173,8 +173,9 @@
  * doesn't unnecessarily force the tail of the destination buffer to be
  * zeroed.  If zeroing is desired please use strscpy_pad().
  *
- * Return: The number of characters copied (not including the trailing
- *         %NUL) or -E2BIG if the destination buffer wasn't big enough.
+ * Returns:
+ * * The number of characters copied (not including the trailing %NUL)
+ * * -E2BIG if count is 0 or @src was truncated.
  */
 ssize_t strscpy(char *dest, const char *src, size_t count)
 {
@@ -182,7 +183,7 @@
 	size_t max = count;
 	long res = 0;
 
-	if (count == 0)
+	if (count == 0 || WARN_ON_ONCE(count > INT_MAX))
 		return -E2BIG;
 
 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -253,8 +254,9 @@
  * For full explanation of why you may want to consider using the
  * 'strscpy' functions please see the function docstring for strscpy().
  *
- * Return: The number of characters copied (not including the trailing
- *         %NUL) or -E2BIG if the destination buffer wasn't big enough.
+ * Returns:
+ * * The number of characters copied (not including the trailing %NUL)
+ * * -E2BIG if count is 0 or @src was truncated.
  */
 ssize_t strscpy_pad(char *dest, const char *src, size_t count)
 {
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index 023ba9f..dccb95a 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -6,6 +6,7 @@
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
+#include <linux/mm.h>
 
 #include <asm/byteorder.h>
 #include <asm/word-at-a-time.h>
@@ -108,7 +109,7 @@
 		return 0;
 
 	max_addr = user_addr_max();
-	src_addr = (unsigned long)src;
+	src_addr = (unsigned long)untagged_addr(src);
 	if (likely(src_addr < max_addr)) {
 		unsigned long max = max_addr - src_addr;
 		long retval;
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
index 7f2db3f..28ff554 100644
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -2,6 +2,7 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/uaccess.h>
+#include <linux/mm.h>
 
 #include <asm/word-at-a-time.h>
 
@@ -109,7 +110,7 @@
 		return 0;
 
 	max_addr = user_addr_max();
-	src_addr = (unsigned long)str;
+	src_addr = (unsigned long)untagged_addr(str);
 	if (likely(src_addr < max_addr)) {
 		unsigned long max = max_addr - src_addr;
 		long retval;
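
untagged_addr() strips any architecture-defined pointer tag before the
range check, so tagged user pointers compare correctly against
user_addr_max(). Approximately, as defined in this series:

	/* Generic fallback (include/linux/mm.h): tags are a no-op. */
	#ifndef untagged_addr
	#define untagged_addr(addr)	(addr)
	#endif

	/* arm64 with Top Byte Ignore: sign-extend from bit 55, clearing
	 * the tag for user addresses while leaving kernel addresses
	 * (which have bit 55 set) intact. */
	#define untagged_addr(addr)	\
		((__typeof__(addr))sign_extend64((u64)(addr), 55))
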
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index b63b367..49cc4d5 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -18,6 +18,9 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
+#include <linux/io.h>
+
+#include <asm/page.h>
 
 /*
  * Note: test functions are marked noinline so that their names appear in
@@ -337,6 +340,42 @@
 	kfree(ptr2);
 }
 
+static noinline void __init kfree_via_page(void)
+{
+	char *ptr;
+	size_t size = 8;
+	struct page *page;
+	unsigned long offset;
+
+	pr_info("invalid-free false positive (via page)\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	page = virt_to_page(ptr);
+	offset = offset_in_page(ptr);
+	kfree(page_address(page) + offset);
+}
+
+static noinline void __init kfree_via_phys(void)
+{
+	char *ptr;
+	size_t size = 8;
+	phys_addr_t phys;
+
+	pr_info("invalid-free false positive (via phys)\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	phys = virt_to_phys(ptr);
+	kfree(phys_to_virt(phys));
+}
+
 static noinline void __init kmem_cache_oob(void)
 {
 	char *p;
@@ -737,6 +776,8 @@
 	kmalloc_uaf();
 	kmalloc_uaf_memset();
 	kmalloc_uaf2();
+	kfree_via_page();
+	kfree_via_phys();
 	kmem_cache_oob();
 	memcg_accounted_kmem_cache();
 	kasan_stack_oob();
diff --git a/mm/Kconfig b/mm/Kconfig
index 2fe4902..a5dae9a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -273,11 +273,6 @@
 	  by default when ZONE_DMA or HIGHMEM is selected, but you
 	  may say n to override this.
 
-config NR_QUICK
-	int
-	depends on QUICKLIST
-	default "1"
-
 config VIRT_TO_BUS
 	bool
 	help
@@ -717,6 +712,17 @@
 config GUP_GET_PTE_LOW_HIGH
 	bool
 
+config READ_ONLY_THP_FOR_FS
+	bool "Read-only THP for filesystems (EXPERIMENTAL)"
+	depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
+
+	help
+	  Allow khugepaged to put read-only file-backed pages in THP.
+
+	  This is marked experimental because it is a new feature. Write
+	  support for file THPs will be developed in the next few release
+	  cycles.
+
 config ARCH_HAS_PTE_SPECIAL
 	bool
 
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 82b6a20..327b3eb 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -21,7 +21,9 @@
 	  Also, the state of page tracking structures is checked more often as
 	  pages are being allocated and freed, as unexpected state changes
 	  often happen for same reasons as memory corruption (e.g. double free,
-	  use-after-free).
+	  use-after-free). The error reports for these checks can be augmented
+	  with stack traces of the last allocation and freeing of the page,
+	  when PAGE_OWNER is also selected and enabled on boot.
 
 	  For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
 	  fill the pages with poison patterns after free_pages() and verify
diff --git a/mm/Makefile b/mm/Makefile
index d0b295c..d996846 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,6 +21,9 @@
 KCOV_INSTRUMENT_mmzone.o := n
 KCOV_INSTRUMENT_vmstat.o := n
 
+CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
+CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)
+
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
@@ -72,7 +75,6 @@
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_MEMTEST)		+= memtest.o
 obj-$(CONFIG_MIGRATION) += migrate.o
-obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
 obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 952dc2f..ce08b39 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -969,7 +969,7 @@
 			 * is safe to read and it's 0 for tail pages.
 			 */
 			if (unlikely(PageCompound(page))) {
-				low_pfn += (1UL << compound_order(page)) - 1;
+				low_pfn += compound_nr(page) - 1;
 				goto isolate_fail;
 			}
 		}
@@ -1737,8 +1737,7 @@
  * starting at the block pointed to by the migrate scanner pfn within
  * compact_control.
  */
-static isolate_migrate_t isolate_migratepages(struct zone *zone,
-					struct compact_control *cc)
+static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
 {
 	unsigned long block_start_pfn;
 	unsigned long block_end_pfn;
@@ -1756,8 +1755,8 @@
 	 */
 	low_pfn = fast_find_migrateblock(cc);
 	block_start_pfn = pageblock_start_pfn(low_pfn);
-	if (block_start_pfn < zone->zone_start_pfn)
-		block_start_pfn = zone->zone_start_pfn;
+	if (block_start_pfn < cc->zone->zone_start_pfn)
+		block_start_pfn = cc->zone->zone_start_pfn;
 
 	/*
 	 * fast_find_migrateblock marks a pageblock skipped so to avoid
@@ -1787,8 +1786,8 @@
 		if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
 			cond_resched();
 
-		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
-									zone);
+		page = pageblock_pfn_to_page(block_start_pfn,
+						block_end_pfn, cc->zone);
 		if (!page)
 			continue;
 
@@ -2078,6 +2077,17 @@
 	const bool sync = cc->mode != MIGRATE_ASYNC;
 	bool update_cached;
 
+	/*
+	 * These counters track activities during zone compaction.  Initialize
+	 * them before compacting a new zone.
+	 */
+	cc->total_migrate_scanned = 0;
+	cc->total_free_scanned = 0;
+	cc->nr_migratepages = 0;
+	cc->nr_freepages = 0;
+	INIT_LIST_HEAD(&cc->freepages);
+	INIT_LIST_HEAD(&cc->migratepages);
+
 	cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
 	ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
 							cc->classzone_idx);
@@ -2158,7 +2168,7 @@
 			cc->rescan = true;
 		}
 
-		switch (isolate_migratepages(cc->zone, cc)) {
+		switch (isolate_migratepages(cc)) {
 		case ISOLATE_ABORT:
 			ret = COMPACT_CONTENDED;
 			putback_movable_pages(&cc->migratepages);
@@ -2281,10 +2291,6 @@
 {
 	enum compact_result ret;
 	struct compact_control cc = {
-		.nr_freepages = 0,
-		.nr_migratepages = 0,
-		.total_migrate_scanned = 0,
-		.total_free_scanned = 0,
 		.order = order,
 		.search_order = order,
 		.gfp_mask = gfp_mask,
@@ -2305,8 +2311,6 @@
 
 	if (capture)
 		current->capture_control = &capc;
-	INIT_LIST_HEAD(&cc.freepages);
-	INIT_LIST_HEAD(&cc.migratepages);
 
 	ret = compact_zone(&cc, &capc);
 
@@ -2408,8 +2412,6 @@
 	struct zone *zone;
 	struct compact_control cc = {
 		.order = -1,
-		.total_migrate_scanned = 0,
-		.total_free_scanned = 0,
 		.mode = MIGRATE_SYNC,
 		.ignore_skip_hint = true,
 		.whole_zone = true,
@@ -2423,11 +2425,7 @@
 		if (!populated_zone(zone))
 			continue;
 
-		cc.nr_freepages = 0;
-		cc.nr_migratepages = 0;
 		cc.zone = zone;
-		INIT_LIST_HEAD(&cc.freepages);
-		INIT_LIST_HEAD(&cc.migratepages);
 
 		compact_zone(&cc, NULL);
 
@@ -2529,8 +2527,6 @@
 	struct compact_control cc = {
 		.order = pgdat->kcompactd_max_order,
 		.search_order = pgdat->kcompactd_max_order,
-		.total_migrate_scanned = 0,
-		.total_free_scanned = 0,
 		.classzone_idx = pgdat->kcompactd_classzone_idx,
 		.mode = MIGRATE_SYNC_LIGHT,
 		.ignore_skip_hint = false,
@@ -2554,16 +2550,10 @@
 							COMPACT_CONTINUE)
 			continue;
 
-		cc.nr_freepages = 0;
-		cc.nr_migratepages = 0;
-		cc.total_migrate_scanned = 0;
-		cc.total_free_scanned = 0;
-		cc.zone = zone;
-		INIT_LIST_HEAD(&cc.freepages);
-		INIT_LIST_HEAD(&cc.migratepages);
-
 		if (kthread_should_stop())
 			return;
+
+		cc.zone = zone;
 		status = compact_zone(&cc, NULL);
 
 		if (status == COMPACT_SUCCESS) {
diff --git a/mm/filemap.c b/mm/filemap.c
index 40667c2..1146fcf 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -126,7 +126,7 @@
 	/* hugetlb pages are represented by a single entry in the xarray */
 	if (!PageHuge(page)) {
 		xas_set_order(&xas, page->index, compound_order(page));
-		nr = 1U << compound_order(page);
+		nr = compound_nr(page);
 	}
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -203,8 +203,9 @@
 		__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
 		if (PageTransHuge(page))
 			__dec_node_page_state(page, NR_SHMEM_THPS);
-	} else {
-		VM_BUG_ON_PAGE(PageTransHuge(page), page);
+	} else if (PageTransHuge(page)) {
+		__dec_node_page_state(page, NR_FILE_THPS);
+		filemap_nr_thps_dec(mapping);
 	}
 
 	/*
@@ -281,11 +282,11 @@
  * @pvec: pagevec with pages to delete
  *
  * The function walks over mapping->i_pages and removes pages passed in @pvec
- * from the mapping. The function expects @pvec to be sorted by page index.
+ * from the mapping. The function expects @pvec to be sorted by page index
+ * and is optimised for it to be dense.
  * It tolerates holes in @pvec (mapping entries at those indices are not
  * modified). The function expects only THP head pages to be present in the
- * @pvec and takes care to delete all corresponding tail pages from the
- * mapping as well.
+ * @pvec.
  *
  * The function expects the i_pages lock to be held.
  */
@@ -294,40 +295,43 @@
 {
 	XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
 	int total_pages = 0;
-	int i = 0, tail_pages = 0;
+	int i = 0;
 	struct page *page;
 
 	mapping_set_update(&xas, mapping);
 	xas_for_each(&xas, page, ULONG_MAX) {
-		if (i >= pagevec_count(pvec) && !tail_pages)
+		if (i >= pagevec_count(pvec))
 			break;
+
+		/* A swap/dax/shadow entry got inserted? Skip it. */
 		if (xa_is_value(page))
 			continue;
-		if (!tail_pages) {
-			/*
-			 * Some page got inserted in our range? Skip it. We
-			 * have our pages locked so they are protected from
-			 * being removed.
-			 */
-			if (page != pvec->pages[i]) {
-				VM_BUG_ON_PAGE(page->index >
-						pvec->pages[i]->index, page);
-				continue;
-			}
-			WARN_ON_ONCE(!PageLocked(page));
-			if (PageTransHuge(page) && !PageHuge(page))
-				tail_pages = HPAGE_PMD_NR - 1;
-			page->mapping = NULL;
-			/*
-			 * Leave page->index set: truncation lookup relies
-			 * upon it
-			 */
-			i++;
-		} else {
-			VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
-					!= pvec->pages[i]->index, page);
-			tail_pages--;
+		/*
+		 * A page got inserted in our range? Skip it. We have our
+		 * pages locked so they are protected from being removed.
+		 * If we see a page whose index is higher than ours, it
+		 * means our page has been removed, which shouldn't be
+		 * possible because we're holding the PageLock.
+		 */
+		if (page != pvec->pages[i]) {
+			VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
+					page);
+			continue;
 		}
+
+		WARN_ON_ONCE(!PageLocked(page));
+
+		if (page->index == xas.xa_index)
+			page->mapping = NULL;
+		/* Leave page->index set: truncation lookup relies on it */
+
+		/*
+		 * Move to the next page in the vector if this is a regular
+		 * page or the index is of the last sub-page of this compound
+		 * page.
+		 */
+		if (page->index + compound_nr(page) - 1 == xas.xa_index)
+			i++;
 		xas_store(&xas, NULL);
 		total_pages++;
 	}
@@ -408,7 +412,8 @@
 		.range_end = end,
 	};
 
-	if (!mapping_cap_writeback_dirty(mapping))
+	if (!mapping_cap_writeback_dirty(mapping) ||
+	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
 
 	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
@@ -617,10 +622,13 @@
 }
 EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
 
+/* Returns true if writeback might be needed or already in progress. */
 static bool mapping_needs_writeback(struct address_space *mapping)
 {
-	return (!dax_mapping(mapping) && mapping->nrpages) ||
-	    (dax_mapping(mapping) && mapping->nrexceptional);
+	if (dax_mapping(mapping))
+		return mapping->nrexceptional;
+
+	return mapping->nrpages;
 }
 
 int filemap_write_and_wait(struct address_space *mapping)
@@ -1516,7 +1524,7 @@
 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
 {
 	XA_STATE(xas, &mapping->i_pages, offset);
-	struct page *head, *page;
+	struct page *page;
 
 	rcu_read_lock();
 repeat:
@@ -1531,25 +1539,19 @@
 	if (!page || xa_is_value(page))
 		goto out;
 
-	head = compound_head(page);
-	if (!page_cache_get_speculative(head))
+	if (!page_cache_get_speculative(page))
 		goto repeat;
 
-	/* The page was split under us? */
-	if (compound_head(page) != head) {
-		put_page(head);
-		goto repeat;
-	}
-
 	/*
-	 * Has the page moved?
+	 * Has the page moved or been split?
 	 * This is part of the lockless pagecache protocol. See
 	 * include/linux/pagemap.h for details.
 	 */
 	if (unlikely(page != xas_reload(&xas))) {
-		put_page(head);
+		put_page(page);
 		goto repeat;
 	}
+	page = find_subpage(page, offset);
 out:
 	rcu_read_unlock();
 
@@ -1646,7 +1648,7 @@
 		}
 
 		/* Has the page been truncated? */
-		if (unlikely(page->mapping != mapping)) {
+		if (unlikely(compound_head(page)->mapping != mapping)) {
 			unlock_page(page);
 			put_page(page);
 			goto repeat;
@@ -1731,7 +1733,6 @@
 
 	rcu_read_lock();
 	xas_for_each(&xas, page, ULONG_MAX) {
-		struct page *head;
 		if (xas_retry(&xas, page))
 			continue;
 		/*
@@ -1742,17 +1743,13 @@
 		if (xa_is_value(page))
 			goto export;
 
-		head = compound_head(page);
-		if (!page_cache_get_speculative(head))
+		if (!page_cache_get_speculative(page))
 			goto retry;
 
-		/* The page was split under us? */
-		if (compound_head(page) != head)
-			goto put_page;
-
-		/* Has the page moved? */
+		/* Has the page moved or been split? */
 		if (unlikely(page != xas_reload(&xas)))
 			goto put_page;
+		page = find_subpage(page, xas.xa_index);
 
 export:
 		indices[ret] = xas.xa_index;
@@ -1761,7 +1758,7 @@
 			break;
 		continue;
 put_page:
-		put_page(head);
+		put_page(page);
 retry:
 		xas_reset(&xas);
 	}
@@ -1803,33 +1800,27 @@
 
 	rcu_read_lock();
 	xas_for_each(&xas, page, end) {
-		struct page *head;
 		if (xas_retry(&xas, page))
 			continue;
 		/* Skip over shadow, swap and DAX entries */
 		if (xa_is_value(page))
 			continue;
 
-		head = compound_head(page);
-		if (!page_cache_get_speculative(head))
+		if (!page_cache_get_speculative(page))
 			goto retry;
 
-		/* The page was split under us? */
-		if (compound_head(page) != head)
-			goto put_page;
-
-		/* Has the page moved? */
+		/* Has the page moved or been split? */
 		if (unlikely(page != xas_reload(&xas)))
 			goto put_page;
 
-		pages[ret] = page;
+		pages[ret] = find_subpage(page, xas.xa_index);
 		if (++ret == nr_pages) {
 			*start = xas.xa_index + 1;
 			goto out;
 		}
 		continue;
 put_page:
-		put_page(head);
+		put_page(page);
 retry:
 		xas_reset(&xas);
 	}
@@ -1874,7 +1865,6 @@
 
 	rcu_read_lock();
 	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
-		struct page *head;
 		if (xas_retry(&xas, page))
 			continue;
 		/*
@@ -1884,24 +1874,19 @@
 		if (xa_is_value(page))
 			break;
 
-		head = compound_head(page);
-		if (!page_cache_get_speculative(head))
+		if (!page_cache_get_speculative(page))
 			goto retry;
 
-		/* The page was split under us? */
-		if (compound_head(page) != head)
-			goto put_page;
-
-		/* Has the page moved? */
+		/* Has the page moved or been split? */
 		if (unlikely(page != xas_reload(&xas)))
 			goto put_page;
 
-		pages[ret] = page;
+		pages[ret] = find_subpage(page, xas.xa_index);
 		if (++ret == nr_pages)
 			break;
 		continue;
 put_page:
-		put_page(head);
+		put_page(page);
 retry:
 		xas_reset(&xas);
 	}
@@ -1937,7 +1922,6 @@
 
 	rcu_read_lock();
 	xas_for_each_marked(&xas, page, end, tag) {
-		struct page *head;
 		if (xas_retry(&xas, page))
 			continue;
 		/*
@@ -1948,26 +1932,21 @@
 		if (xa_is_value(page))
 			continue;
 
-		head = compound_head(page);
-		if (!page_cache_get_speculative(head))
+		if (!page_cache_get_speculative(page))
 			goto retry;
 
-		/* The page was split under us? */
-		if (compound_head(page) != head)
-			goto put_page;
-
-		/* Has the page moved? */
+		/* Has the page moved or been split? */
 		if (unlikely(page != xas_reload(&xas)))
 			goto put_page;
 
-		pages[ret] = page;
+		pages[ret] = find_subpage(page, xas.xa_index);
 		if (++ret == nr_pages) {
 			*index = xas.xa_index + 1;
 			goto out;
 		}
 		continue;
 put_page:
-		put_page(head);
+		put_page(page);
 retry:
 		xas_reset(&xas);
 	}
@@ -2562,12 +2541,12 @@
 		goto out_retry;
 
 	/* Did it get truncated? */
-	if (unlikely(page->mapping != mapping)) {
+	if (unlikely(compound_head(page)->mapping != mapping)) {
 		unlock_page(page);
 		put_page(page);
 		goto retry_find;
 	}
-	VM_BUG_ON_PAGE(page->index != offset, page);
+	VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
 
 	/*
 	 * We have a locked page in the page cache, now we need to check
@@ -2648,7 +2627,7 @@
 	pgoff_t last_pgoff = start_pgoff;
 	unsigned long max_idx;
 	XA_STATE(xas, &mapping->i_pages, start_pgoff);
-	struct page *head, *page;
+	struct page *page;
 
 	rcu_read_lock();
 	xas_for_each(&xas, page, end_pgoff) {
@@ -2657,24 +2636,19 @@
 		if (xa_is_value(page))
 			goto next;
 
-		head = compound_head(page);
-
 		/*
 		 * Check for a locked page first, as a speculative
 		 * reference may adversely influence page migration.
 		 */
-		if (PageLocked(head))
+		if (PageLocked(page))
 			goto next;
-		if (!page_cache_get_speculative(head))
+		if (!page_cache_get_speculative(page))
 			goto next;
 
-		/* The page was split under us? */
-		if (compound_head(page) != head)
-			goto skip;
-
-		/* Has the page moved? */
+		/* Has the page moved or been split? */
 		if (unlikely(page != xas_reload(&xas)))
 			goto skip;
+		page = find_subpage(page, xas.xa_index);
 
 		if (!PageUptodate(page) ||
 				PageReadahead(page) ||
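
These filemap hunks convert the lookups to take references on head pages
only; find_subpage() then derives the precise subpage from the head page and
the requested index. As defined elsewhere in this series
(include/linux/pagemap.h), approximately:

	static inline struct page *find_subpage(struct page *page, pgoff_t offset)
	{
		if (PageHuge(page))
			return page;	/* hugetlb: one entry per huge page */

		VM_BUG_ON_PAGE(PageTail(page), page);

		return page + (offset & (compound_nr(page) - 1));
	}
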
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index c64dca6..c431ca8 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -46,6 +46,8 @@
 	if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
 		nr_frames = vec->nr_allocated;
 
+	start = untagged_addr(start);
+
 	down_read(&mm->mmap_sem);
 	locked = 1;
 	vma = find_vma_intersection(mm, start, start + 1);
diff --git a/mm/gup.c b/mm/gup.c
index 98f13ab..23a9f9c 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,85 +29,70 @@
 	unsigned int page_mask;
 };
 
-typedef int (*set_dirty_func_t)(struct page *page);
-
-static void __put_user_pages_dirty(struct page **pages,
-				   unsigned long npages,
-				   set_dirty_func_t sdf)
-{
-	unsigned long index;
-
-	for (index = 0; index < npages; index++) {
-		struct page *page = compound_head(pages[index]);
-
-		/*
-		 * Checking PageDirty at this point may race with
-		 * clear_page_dirty_for_io(), but that's OK. Two key cases:
-		 *
-		 * 1) This code sees the page as already dirty, so it skips
-		 * the call to sdf(). That could happen because
-		 * clear_page_dirty_for_io() called page_mkclean(),
-		 * followed by set_page_dirty(). However, now the page is
-		 * going to get written back, which meets the original
-		 * intention of setting it dirty, so all is well:
-		 * clear_page_dirty_for_io() goes on to call
-		 * TestClearPageDirty(), and write the page back.
-		 *
-		 * 2) This code sees the page as clean, so it calls sdf().
-		 * The page stays dirty, despite being written back, so it
-		 * gets written back again in the next writeback cycle.
-		 * This is harmless.
-		 */
-		if (!PageDirty(page))
-			sdf(page);
-
-		put_user_page(page);
-	}
-}
-
 /**
- * put_user_pages_dirty() - release and dirty an array of gup-pinned pages
- * @pages:  array of pages to be marked dirty and released.
+ * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
+ * @pages:  array of pages to be maybe marked dirty, and definitely released.
  * @npages: number of pages in the @pages array.
+ * @make_dirty: whether to mark the pages dirty
  *
  * "gup-pinned page" refers to a page that has had one of the get_user_pages()
  * variants called on that page.
  *
  * For each page in the @pages array, make that page (or its head page, if a
- * compound page) dirty, if it was previously listed as clean. Then, release
- * the page using put_user_page().
+ * compound page) dirty, if @make_dirty is true, and if the page was previously
+ * listed as clean. In any case, releases all pages using put_user_page(),
+ * possibly via put_user_pages(), for the non-dirty case.
  *
  * Please see the put_user_page() documentation for details.
  *
- * set_page_dirty(), which does not lock the page, is used here.
- * Therefore, it is the caller's responsibility to ensure that this is
- * safe. If not, then put_user_pages_dirty_lock() should be called instead.
+ * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
+ * required, then the caller should a) verify that this is really correct,
+ * because _lock() is usually required, and b) hand code it:
+ * set_page_dirty(), put_user_page().
  *
  */
-void put_user_pages_dirty(struct page **pages, unsigned long npages)
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
+			       bool make_dirty)
 {
-	__put_user_pages_dirty(pages, npages, set_page_dirty);
-}
-EXPORT_SYMBOL(put_user_pages_dirty);
+	unsigned long index;
 
-/**
- * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages
- * @pages:  array of pages to be marked dirty and released.
- * @npages: number of pages in the @pages array.
- *
- * For each page in the @pages array, make that page (or its head page, if a
- * compound page) dirty, if it was previously listed as clean. Then, release
- * the page using put_user_page().
- *
- * Please see the put_user_page() documentation for details.
- *
- * This is just like put_user_pages_dirty(), except that it invokes
- * set_page_dirty_lock(), instead of set_page_dirty().
- *
- */
-void put_user_pages_dirty_lock(struct page **pages, unsigned long npages)
-{
-	__put_user_pages_dirty(pages, npages, set_page_dirty_lock);
+	/*
+	 * TODO: this can be optimized for huge pages: if a series of pages is
+	 * physically contiguous and part of the same compound page, then a
+	 * single operation to the head page should suffice.
+	 */
+
+	if (!make_dirty) {
+		put_user_pages(pages, npages);
+		return;
+	}
+
+	for (index = 0; index < npages; index++) {
+		struct page *page = compound_head(pages[index]);
+		/*
+		 * Checking PageDirty at this point may race with
+		 * clear_page_dirty_for_io(), but that's OK. Two key
+		 * cases:
+		 *
+		 * 1) This code sees the page as already dirty, so it
+		 * skips the call to set_page_dirty(). That could happen
+		 * because clear_page_dirty_for_io() called
+		 * page_mkclean(), followed by set_page_dirty().
+		 * However, now the page is going to get written back,
+		 * which meets the original intention of setting it
+		 * dirty, so all is well: clear_page_dirty_for_io() goes
+		 * on to call TestClearPageDirty(), and write the page
+		 * back.
+		 *
+		 * 2) This code sees the page as clean, so it calls
+		 * set_page_dirty(). The page stays dirty, despite being
+		 * written back, so it gets written back again in the
+		 * next writeback cycle. This is harmless.
+		 */
+		if (!PageDirty(page))
+			set_page_dirty_lock(page);
+		put_user_page(page);
+	}
 }
 EXPORT_SYMBOL(put_user_pages_dirty_lock);
 
@@ -399,7 +384,7 @@
 		spin_unlock(ptl);
 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 	}
-	if (flags & FOLL_SPLIT) {
+	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
 		int ret;
 		page = pmd_page(*pmd);
 		if (is_huge_zero_page(page)) {
@@ -408,7 +393,7 @@
 			split_huge_pmd(vma, pmd, address);
 			if (pmd_trans_unstable(pmd))
 				ret = -EBUSY;
-		} else {
+		} else if (flags & FOLL_SPLIT) {
 			if (unlikely(!try_get_page(page))) {
 				spin_unlock(ptl);
 				return ERR_PTR(-ENOMEM);
@@ -420,6 +405,10 @@
 			put_page(page);
 			if (pmd_none(*pmd))
 				return no_page_table(vma, flags);
+		} else {  /* flags & FOLL_SPLIT_PMD */
+			spin_unlock(ptl);
+			split_huge_pmd(vma, pmd, address);
+			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
 		}
 
 		return ret ? ERR_PTR(ret) :
@@ -799,6 +788,8 @@
 	if (!nr_pages)
 		return 0;
 
+	start = untagged_addr(start);
+
 	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
 
 	/*
@@ -961,6 +952,8 @@
 	struct vm_area_struct *vma;
 	vm_fault_t ret, major = 0;
 
+	address = untagged_addr(address);
+
 	if (unlocked)
 		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 
@@ -1460,7 +1453,7 @@
 		 * gup may start from a tail page. Advance step by the left
 		 * part.
 		 */
-		step = (1 << compound_order(head)) - (pages[i] - head);
+		step = compound_nr(head) - (pages[i] - head);
 		/*
 		 * If we get a page from the CMA zone, since we are going to
 		 * be pinning these entries, we might as well move them out
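
Callers that previously chose between put_user_pages_dirty() and
put_user_pages_dirty_lock() now pass the dirty decision as an argument. A
typical caller, sketched (NR_PAGES and the DMA step are placeholders):

	struct page *pages[NR_PAGES];
	long nr;

	nr = get_user_pages_fast(start, NR_PAGES, FOLL_WRITE, pages);
	if (nr <= 0)
		return nr;

	/* ... device DMAs into the pinned pages ... */

	put_user_pages_dirty_lock(pages, nr, true);	/* dirty and release */
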
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de1f159..c5cb6dc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -496,11 +496,25 @@
 	return pmd;
 }
 
-static inline struct list_head *page_deferred_list(struct page *page)
+#ifdef CONFIG_MEMCG
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
 {
-	/* ->lru in the tail pages is occupied by compound_head. */
-	return &page[2].deferred_list;
+	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+	if (memcg)
+		return &memcg->deferred_split_queue;
+	else
+		return &pgdat->deferred_split_queue;
 }
+#else
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+{
+	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+	return &pgdat->deferred_split_queue;
+}
+#endif
 
 void prep_transhuge_page(struct page *page)
 {
@@ -645,40 +659,30 @@
  *	    available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 {
 	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
-	gfp_t this_node = 0;
 
-#ifdef CONFIG_NUMA
-	struct mempolicy *pol;
-	/*
-	 * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
-	 * specified, to express a general desire to stay on the current
-	 * node for optimistic allocation attempts. If the defrag mode
-	 * and/or madvise hint requires the direct reclaim then we prefer
-	 * to fallback to other node rather than node reclaim because that
-	 * can lead to excessive reclaim even though there is free memory
-	 * on other nodes. We expect that NUMA preferences are specified
-	 * by memory policies.
-	 */
-	pol = get_vma_policy(vma, addr);
-	if (pol->mode != MPOL_BIND)
-		this_node = __GFP_THISNODE;
-	mpol_cond_put(pol);
-#endif
-
+	/* Always do synchronous compaction */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+	/* Kick kcompactd and fail quickly */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
+		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+	/* Synchronous compaction if madvised, otherwise kick kcompactd */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     __GFP_KSWAPD_RECLAIM | this_node);
+		return GFP_TRANSHUGE_LIGHT |
+			(vma_madvised ? __GFP_DIRECT_RECLAIM :
+					__GFP_KSWAPD_RECLAIM);
+
+	/* Only do synchronous compaction if madvised */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     this_node);
-	return GFP_TRANSHUGE_LIGHT | this_node;
+		return GFP_TRANSHUGE_LIGHT |
+		       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
+	return GFP_TRANSHUGE_LIGHT;
 }
 
 /* Caller must hold page table lock. */
@@ -750,8 +754,8 @@
 			pte_free(vma->vm_mm, pgtable);
 		return ret;
 	}
-	gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+	gfp = alloc_hugepage_direct_gfpmask(vma);
+	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1358,9 +1362,8 @@
 alloc:
 	if (__transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-		new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
-				haddr, numa_node_id());
+		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
 	} else
 		new_page = NULL;
 
@@ -2497,6 +2500,8 @@
 	struct page *head = compound_head(page);
 	pg_data_t *pgdat = page_pgdat(head);
 	struct lruvec *lruvec;
+	struct address_space *swap_cache = NULL;
+	unsigned long offset = 0;
 	int i;
 
 	lruvec = mem_cgroup_page_lruvec(head, pgdat);
@@ -2504,6 +2509,14 @@
 	/* complete memcg works before add pages to LRU */
 	mem_cgroup_split_huge_fixup(head);
 
+	if (PageAnon(head) && PageSwapCache(head)) {
+		swp_entry_t entry = { .val = page_private(head) };
+
+		offset = swp_offset(entry);
+		swap_cache = swap_address_space(entry);
+		xa_lock(&swap_cache->i_pages);
+	}
+
 	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
 		__split_huge_page_tail(head, i, lruvec, list);
 		/* Some pages can be beyond i_size: drop them from page cache */
@@ -2513,6 +2526,12 @@
 			if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
 				shmem_uncharge(head->mapping->host, 1);
 			put_page(head + i);
+		} else if (!PageAnon(page)) {
+			__xa_store(&head->mapping->i_pages, head[i].index,
+					head + i, 0);
+		} else if (swap_cache) {
+			__xa_store(&swap_cache->i_pages, offset + i,
+					head + i, 0);
 		}
 	}
 
@@ -2523,10 +2542,12 @@
 	/* See comment in __split_huge_page_tail() */
 	if (PageAnon(head)) {
 		/* Additional pin to swap cache */
-		if (PageSwapCache(head))
+		if (PageSwapCache(head)) {
 			page_ref_add(head, 2);
-		else
+			xa_unlock(&swap_cache->i_pages);
+		} else {
 			page_ref_inc(head);
+		}
 	} else {
 		/* Additional pin to page cache */
 		page_ref_add(head, 2);
@@ -2673,6 +2694,7 @@
 {
 	struct page *head = compound_head(page);
 	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
+	struct deferred_split *ds_queue = get_deferred_split_queue(page);
 	struct anon_vma *anon_vma = NULL;
 	struct address_space *mapping = NULL;
 	int count, mapcount, extra_pins, ret;
@@ -2759,17 +2781,17 @@
 	}
 
 	/* Prevent deferred_split_scan() touching ->_refcount */
-	spin_lock(&pgdata->split_queue_lock);
+	spin_lock(&ds_queue->split_queue_lock);
 	count = page_count(head);
 	mapcount = total_mapcount(head);
 	if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
 		if (!list_empty(page_deferred_list(head))) {
-			pgdata->split_queue_len--;
+			ds_queue->split_queue_len--;
 			list_del(page_deferred_list(head));
 		}
 		if (mapping)
 			__dec_node_page_state(page, NR_SHMEM_THPS);
-		spin_unlock(&pgdata->split_queue_lock);
+		spin_unlock(&ds_queue->split_queue_lock);
 		__split_huge_page(page, list, end, flags);
 		if (PageSwapCache(head)) {
 			swp_entry_t entry = { .val = page_private(head) };
@@ -2786,7 +2808,7 @@
 			dump_page(page, "total_mapcount(head) > 0");
 			BUG();
 		}
-		spin_unlock(&pgdata->split_queue_lock);
+		spin_unlock(&ds_queue->split_queue_lock);
 fail:		if (mapping)
 			xa_unlock(&mapping->i_pages);
 		spin_unlock_irqrestore(&pgdata->lru_lock, flags);
@@ -2808,53 +2830,86 @@
 
 void free_transhuge_page(struct page *page)
 {
-	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+	struct deferred_split *ds_queue = get_deferred_split_queue(page);
 	unsigned long flags;
 
-	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	if (!list_empty(page_deferred_list(page))) {
-		pgdata->split_queue_len--;
+		ds_queue->split_queue_len--;
 		list_del(page_deferred_list(page));
 	}
-	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 	free_compound_page(page);
 }
 
 void deferred_split_huge_page(struct page *page)
 {
-	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+	struct deferred_split *ds_queue = get_deferred_split_queue(page);
+#ifdef CONFIG_MEMCG
+	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+#endif
 	unsigned long flags;
 
 	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 
-	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+	/*
+	 * The try_to_unmap() in the page reclaim path might reach here too;
+	 * this may cause a race that corrupts the deferred split queue.
+	 * And, if page reclaim is already handling the same page, it is
+	 * unnecessary to handle it again in the shrinker.
+	 *
+	 * Check PageSwapCache to determine if the page is being
+	 * handled by page reclaim since THP swap would add the page into
+	 * swap cache before calling try_to_unmap().
+	 */
+	if (PageSwapCache(page))
+		return;
+
+	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	if (list_empty(page_deferred_list(page))) {
 		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
-		list_add_tail(page_deferred_list(page), &pgdata->split_queue);
-		pgdata->split_queue_len++;
+		list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
+		ds_queue->split_queue_len++;
+#ifdef CONFIG_MEMCG
+		if (memcg)
+			memcg_set_shrinker_bit(memcg, page_to_nid(page),
+					       deferred_split_shrinker.id);
+#endif
 	}
-	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 }
 
 static unsigned long deferred_split_count(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
-	return READ_ONCE(pgdata->split_queue_len);
+	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+
+#ifdef CONFIG_MEMCG
+	if (sc->memcg)
+		ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+	return READ_ONCE(ds_queue->split_queue_len);
 }
 
 static unsigned long deferred_split_scan(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
+	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
 	unsigned long flags;
 	LIST_HEAD(list), *pos, *next;
 	struct page *page;
 	int split = 0;
 
-	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+#ifdef CONFIG_MEMCG
+	if (sc->memcg)
+		ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+
+	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	/* Take pin on all head pages to avoid freeing them under us */
-	list_for_each_safe(pos, next, &pgdata->split_queue) {
+	list_for_each_safe(pos, next, &ds_queue->split_queue) {
 		page = list_entry((void *)pos, struct page, mapping);
 		page = compound_head(page);
 		if (get_page_unless_zero(page)) {
@@ -2862,12 +2917,12 @@
 		} else {
 			/* We lost race with put_compound_page() */
 			list_del_init(page_deferred_list(page));
-			pgdata->split_queue_len--;
+			ds_queue->split_queue_len--;
 		}
 		if (!--sc->nr_to_scan)
 			break;
 	}
-	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 
 	list_for_each_safe(pos, next, &list) {
 		page = list_entry((void *)pos, struct page, mapping);
@@ -2881,15 +2936,15 @@
 		put_page(page);
 	}
 
-	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
-	list_splice_tail(&list, &pgdata->split_queue);
-	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+	list_splice_tail(&list, &ds_queue->split_queue);
+	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 
 	/*
 	 * Stop shrinker if we didn't split any page, but the queue is empty.
 	 * This can happen if pages were freed under us.
 	 */
-	if (!split && list_empty(&pgdata->split_queue))
+	if (!split && list_empty(&ds_queue->split_queue))
 		return SHRINK_STOP;
 	return split;
 }
@@ -2898,7 +2953,8 @@
 	.count_objects = deferred_split_count,
 	.scan_objects = deferred_split_scan,
 	.seeks = DEFAULT_SEEKS,
-	.flags = SHRINKER_NUMA_AWARE,
+	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
+		 SHRINKER_NONSLAB,
 };
 
 #ifdef CONFIG_DEBUG_FS
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6d7296d..ef37c85 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1405,12 +1405,25 @@
 }
 
 static struct page *alloc_buddy_huge_page(struct hstate *h,
-		gfp_t gfp_mask, int nid, nodemask_t *nmask)
+		gfp_t gfp_mask, int nid, nodemask_t *nmask,
+		nodemask_t *node_alloc_noretry)
 {
 	int order = huge_page_order(h);
 	struct page *page;
+	bool alloc_try_hard = true;
 
-	gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+	/*
+	 * By default we always try hard to allocate the page with
+	 * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating pages in
+	 * a loop (to adjust global huge page counts) and previous allocation
+	 * failed, do not continue to try hard on the same node.  Use the
+	 * node_alloc_noretry bitmap to manage this state information.
+	 */
+	if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
+		alloc_try_hard = false;
+	gfp_mask |= __GFP_COMP|__GFP_NOWARN;
+	if (alloc_try_hard)
+		gfp_mask |= __GFP_RETRY_MAYFAIL;
 	if (nid == NUMA_NO_NODE)
 		nid = numa_mem_id();
 	page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
@@ -1419,6 +1432,22 @@
 	else
 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
 
+	/*
+	 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page,
+	 * this indicates an overall state change.  Clear the bit so that we
+	 * resume normal 'try hard' allocations.
+	 */
+	if (node_alloc_noretry && page && !alloc_try_hard)
+		node_clear(nid, *node_alloc_noretry);
+
+	/*
+	 * If we tried hard to get a page but failed, set the bit so that
+	 * subsequent attempts will not try as hard until there is an
+	 * overall state change.
+	 */
+	if (node_alloc_noretry && !page && alloc_try_hard)
+		node_set(nid, *node_alloc_noretry);
+
 	return page;
 }
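The retry policy above is a small per-node state machine: try hard until a node fails, then back off on that node until an allocation succeeds there again. A minimal userspace sketch of the same bookkeeping (illustrative only; try_hard()/try_once() are hypothetical stand-ins for the buddy allocator, and a plain bool array stands in for the nodemask):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_NODES 4

/* Hypothetical stand-ins for the buddy allocator. */
static bool try_hard(int nid) { (void)nid; return rand() % 4 != 0; }
static bool try_once(int nid) { (void)nid; return rand() % 2 != 0; }

/* Models the node_alloc_noretry nodemask: one flag per node. */
static bool noretry[NR_NODES];

static bool alloc_on_node(int nid)
{
	bool alloc_try_hard = !noretry[nid];
	bool ok = alloc_try_hard ? try_hard(nid) : try_once(nid);

	if (ok && !alloc_try_hard)
		noretry[nid] = false;	/* state changed: resume trying hard */
	else if (!ok && alloc_try_hard)
		noretry[nid] = true;	/* back off on this node for now */
	return ok;
}

int main(void)
{
	for (int i = 0; i < 16; i++) {
		int nid = i % NR_NODES;

		printf("node %d: %s\n", nid,
		       alloc_on_node(nid) ? "ok" : "fail");
	}
	return 0;
}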
 
@@ -1427,7 +1456,8 @@
  * should use this function to get new hugetlb pages
  */
 static struct page *alloc_fresh_huge_page(struct hstate *h,
-		gfp_t gfp_mask, int nid, nodemask_t *nmask)
+		gfp_t gfp_mask, int nid, nodemask_t *nmask,
+		nodemask_t *node_alloc_noretry)
 {
 	struct page *page;
 
@@ -1435,7 +1465,7 @@
 		page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
 	else
 		page = alloc_buddy_huge_page(h, gfp_mask,
-				nid, nmask);
+				nid, nmask, node_alloc_noretry);
 	if (!page)
 		return NULL;
 
@@ -1450,14 +1480,16 @@
  * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
  * manner.
  */
-static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+				nodemask_t *node_alloc_noretry)
 {
 	struct page *page;
 	int nr_nodes, node;
 	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
 
 	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
-		page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+		page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
+						node_alloc_noretry);
 		if (page)
 			break;
 	}
@@ -1601,7 +1633,7 @@
 		goto out_unlock;
 	spin_unlock(&hugetlb_lock);
 
-	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
 	if (!page)
 		return NULL;
 
@@ -1637,7 +1669,7 @@
 	if (hstate_is_gigantic(h))
 		return NULL;
 
-	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
 	if (!page)
 		return NULL;
 
@@ -2207,13 +2239,33 @@
 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 {
 	unsigned long i;
+	nodemask_t *node_alloc_noretry;
+
+	if (!hstate_is_gigantic(h)) {
+		/*
+		 * Bit mask controlling how hard we retry per-node allocations.
+		 * Ignore errors as lower level routines can deal with
+		 * node_alloc_noretry == NULL.  If this kmalloc fails at boot
+		 * time, we are likely in bigger trouble.
+		 */
+		node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
+						GFP_KERNEL);
+	} else {
+		/* allocations done at boot time */
+		node_alloc_noretry = NULL;
+	}
+
+	/* bit mask controlling how hard we retry per-node allocations */
+	if (node_alloc_noretry)
+		nodes_clear(*node_alloc_noretry);
 
 	for (i = 0; i < h->max_huge_pages; ++i) {
 		if (hstate_is_gigantic(h)) {
 			if (!alloc_bootmem_huge_page(h))
 				break;
 		} else if (!alloc_pool_huge_page(h,
-					 &node_states[N_MEMORY]))
+					 &node_states[N_MEMORY],
+					 node_alloc_noretry))
 			break;
 		cond_resched();
 	}
@@ -2225,6 +2277,8 @@
 			h->max_huge_pages, buf, i);
 		h->max_huge_pages = i;
 	}
+
+	kfree(node_alloc_noretry);
 }
 
 static void __init hugetlb_init_hstates(void)
@@ -2323,6 +2377,17 @@
 			      nodemask_t *nodes_allowed)
 {
 	unsigned long min_count, ret;
+	NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
+
+	/*
+	 * Bit mask controlling how hard we retry per-node allocations.
+	 * If we cannot allocate the bit mask, do not attempt to allocate
+	 * the requested huge pages.
+	 */
+	if (node_alloc_noretry)
+		nodes_clear(*node_alloc_noretry);
+	else
+		return -ENOMEM;
 
 	spin_lock(&hugetlb_lock);
 
@@ -2356,6 +2421,7 @@
 	if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
 		if (count > persistent_huge_pages(h)) {
 			spin_unlock(&hugetlb_lock);
+			NODEMASK_FREE(node_alloc_noretry);
 			return -EINVAL;
 		}
 		/* Fall through to decrease pool */
@@ -2388,7 +2454,8 @@
 		/* yield cpu to avoid soft lockup */
 		cond_resched();
 
-		ret = alloc_pool_huge_page(h, nodes_allowed);
+		ret = alloc_pool_huge_page(h, nodes_allowed,
+						node_alloc_noretry);
 		spin_lock(&hugetlb_lock);
 		if (!ret)
 			goto out;
@@ -2429,6 +2496,8 @@
 	h->max_huge_pages = persistent_huge_pages(h);
 	spin_unlock(&hugetlb_lock);
 
+	NODEMASK_FREE(node_alloc_noretry);
+
 	return 0;
 }
 
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 68c2f2f..f1930fa 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -139,7 +139,7 @@
 	if (!page_hcg || page_hcg != h_cg)
 		goto out;
 
-	nr_pages = 1 << compound_order(page);
+	nr_pages = compound_nr(page);
 	if (!parent) {
 		parent = root_h_cgroup;
 		/* root has no limit */
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a787a31..fb1e150 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -35,6 +35,6 @@
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.user_ns	= &init_user_ns,
-	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
+	.cpu_bitmap	= CPU_BITS_NONE,
 	INIT_MM_CONTEXT(init_mm)
 };
diff --git a/mm/internal.h b/mm/internal.h
index e323908..0d5f720 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,7 +39,7 @@
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 
-static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
 {
 	return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
 }
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 95d16a4..6814d6d 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -304,7 +304,6 @@
 struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
 					const void *object)
 {
-	BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
 	return (void *)object + cache->kasan_info.alloc_meta_offset;
 }
 
@@ -315,14 +314,31 @@
 	return (void *)object + cache->kasan_info.free_meta_offset;
 }
 
+
+static void kasan_set_free_info(struct kmem_cache *cache,
+		void *object, u8 tag)
+{
+	struct kasan_alloc_meta *alloc_meta;
+	u8 idx = 0;
+
+	alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+	idx = alloc_meta->free_track_idx;
+	alloc_meta->free_pointer_tag[idx] = tag;
+	alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
+#endif
+
+	set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+}
+
 void kasan_poison_slab(struct page *page)
 {
 	unsigned long i;
 
-	for (i = 0; i < (1 << compound_order(page)); i++)
+	for (i = 0; i < compound_nr(page); i++)
 		page_kasan_tag_reset(page + i);
-	kasan_poison_shadow(page_address(page),
-			PAGE_SIZE << compound_order(page),
+	kasan_poison_shadow(page_address(page), page_size(page),
 			KASAN_KMALLOC_REDZONE);
 }
 
@@ -452,7 +468,8 @@
 			unlikely(!(cache->flags & SLAB_KASAN)))
 		return false;
 
-	set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT);
+	kasan_set_free_info(cache, object, tag);
+
 	quarantine_put(get_free_info(cache, object), cache);
 
 	return IS_ENABLED(CONFIG_KASAN_GENERIC);
@@ -524,7 +541,7 @@
 	page = virt_to_page(ptr);
 	redzone_start = round_up((unsigned long)(ptr + size),
 				KASAN_SHADOW_SCALE_SIZE);
-	redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page));
+	redzone_end = (unsigned long)ptr + page_size(page);
 
 	kasan_unpoison_shadow(ptr, size);
 	kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
@@ -560,8 +577,7 @@
 			kasan_report_invalid_free(ptr, ip);
 			return;
 		}
-		kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
-				KASAN_FREE_PAGE);
+		kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE);
 	} else {
 		__kasan_slab_free(page->slab_cache, ptr, ip, false);
 	}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 014f19e..35cff6b 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -95,9 +95,19 @@
 	depot_stack_handle_t stack;
 };
 
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+#define KASAN_NR_FREE_STACKS 5
+#else
+#define KASAN_NR_FREE_STACKS 1
+#endif
+
 struct kasan_alloc_meta {
 	struct kasan_track alloc_track;
-	struct kasan_track free_track;
+	struct kasan_track free_track[KASAN_NR_FREE_STACKS];
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+	u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
+	u8 free_track_idx;
+#endif
 };
 
 struct qlist_node {
@@ -146,6 +156,8 @@
 		bool is_write, unsigned long ip);
 void kasan_report_invalid_free(void *object, unsigned long ip);
 
+struct page *kasan_addr_to_page(const void *addr);
+
 #if defined(CONFIG_KASAN_GENERIC) && \
 	(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
 void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 0e5f965..6217821 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -111,7 +111,7 @@
 	}
 }
 
-static struct page *addr_to_page(const void *addr)
+struct page *kasan_addr_to_page(const void *addr)
 {
 	if ((addr >= (void *)PAGE_OFFSET) &&
 			(addr < high_memory))
@@ -151,15 +151,38 @@
 		(void *)(object_addr + cache->object_size));
 }
 
+static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+		void *object, u8 tag)
+{
+	struct kasan_alloc_meta *alloc_meta;
+	int i = 0;
+
+	alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+	for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+		if (alloc_meta->free_pointer_tag[i] == tag)
+			break;
+	}
+	if (i == KASAN_NR_FREE_STACKS)
+		i = alloc_meta->free_track_idx;
+#endif
+
+	return &alloc_meta->free_track[i];
+}
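Taken together, kasan_set_free_info() and kasan_get_free_track() implement a small tag-indexed ring of free stacks: the last KASAN_NR_FREE_STACKS frees are recorded keyed by pointer tag, and lookup falls back to the slot about to be overwritten (the oldest) when no tag matches. A rough userspace model of that ring (names are illustrative, not the kernel code):

#include <stdint.h>
#include <stdio.h>

#define NR_FREE_STACKS 5	/* mirrors KASAN_NR_FREE_STACKS */

struct meta {
	uint8_t tags[NR_FREE_STACKS];
	int stacks[NR_FREE_STACKS];	/* stands in for struct kasan_track */
	uint8_t idx;
};

static void record_free(struct meta *m, uint8_t tag, int stack)
{
	m->tags[m->idx] = tag;
	m->stacks[m->idx] = stack;
	m->idx = (m->idx + 1) % NR_FREE_STACKS;
}

static int lookup_free(const struct meta *m, uint8_t tag)
{
	for (int i = 0; i < NR_FREE_STACKS; i++)
		if (m->tags[i] == tag)
			return m->stacks[i];
	/* no tag match: fall back to the oldest (next-to-be-overwritten) slot */
	return m->stacks[m->idx];
}

int main(void)
{
	struct meta m = { { 0 }, { 0 }, 0 };

	record_free(&m, 0xAA, 101);
	record_free(&m, 0xBB, 102);
	printf("tag 0xBB freed at stack %d\n", lookup_free(&m, 0xBB));
	return 0;
}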
+
 static void describe_object(struct kmem_cache *cache, void *object,
-				const void *addr)
+				const void *addr, u8 tag)
 {
 	struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
 
 	if (cache->flags & SLAB_KASAN) {
+		struct kasan_track *free_track;
+
 		print_track(&alloc_info->alloc_track, "Allocated");
 		pr_err("\n");
-		print_track(&alloc_info->free_track, "Freed");
+		free_track = kasan_get_free_track(cache, object, tag);
+		print_track(free_track, "Freed");
 		pr_err("\n");
 	}
 
@@ -344,9 +367,9 @@
 	print_decoded_frame_descr(frame_descr);
 }
 
-static void print_address_description(void *addr)
+static void print_address_description(void *addr, u8 tag)
 {
-	struct page *page = addr_to_page(addr);
+	struct page *page = kasan_addr_to_page(addr);
 
 	dump_stack();
 	pr_err("\n");
@@ -355,7 +378,7 @@
 		struct kmem_cache *cache = page->slab_cache;
 		void *object = nearest_obj(cache, page,	addr);
 
-		describe_object(cache, object, addr);
+		describe_object(cache, object, addr, tag);
 	}
 
 	if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
@@ -435,13 +458,14 @@
 void kasan_report_invalid_free(void *object, unsigned long ip)
 {
 	unsigned long flags;
+	u8 tag = get_tag(object);
 
+	object = reset_tag(object);
 	start_report(&flags);
 	pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
-	print_tags(get_tag(object), reset_tag(object));
-	object = reset_tag(object);
+	print_tags(tag, object);
 	pr_err("\n");
-	print_address_description(object);
+	print_address_description(object, tag);
 	pr_err("\n");
 	print_shadow_for_address(object);
 	end_report(&flags);
@@ -479,7 +503,7 @@
 	pr_err("\n");
 
 	if (addr_has_shadow(untagged_addr)) {
-		print_address_description(untagged_addr);
+		print_address_description(untagged_addr, get_tag(tagged_addr));
 		pr_err("\n");
 		print_shadow_for_address(info.first_bad_addr);
 	} else {
diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c
index 8eaf5f7..969ae08 100644
--- a/mm/kasan/tags_report.c
+++ b/mm/kasan/tags_report.c
@@ -36,6 +36,30 @@
 
 const char *get_bug_type(struct kasan_access_info *info)
 {
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+	struct kasan_alloc_meta *alloc_meta;
+	struct kmem_cache *cache;
+	struct page *page;
+	const void *addr;
+	void *object;
+	u8 tag;
+	int i;
+
+	tag = get_tag(info->access_addr);
+	addr = reset_tag(info->access_addr);
+	page = kasan_addr_to_page(addr);
+	if (page && PageSlab(page)) {
+		cache = page->slab_cache;
+		object = nearest_obj(cache, page, (void *)addr);
+		alloc_meta = get_alloc_info(cache, object);
+
+		for (i = 0; i < KASAN_NR_FREE_STACKS; i++)
+			if (alloc_meta->free_pointer_tag[i] == tag)
+				return "use-after-free";
+		return "out-of-bounds";
+	}
+
+#endif
 	return "invalid-access";
 }
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ccede24..0a1b4b4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -48,6 +48,7 @@
 	SCAN_CGROUP_CHARGE_FAIL,
 	SCAN_EXCEED_SWAP_PTE,
 	SCAN_TRUNCATED,
+	SCAN_PAGE_HAS_PRIVATE,
 };
 
 #define CREATE_TRACE_POINTS
@@ -76,6 +77,8 @@
 
 static struct kmem_cache *mm_slot_cache __read_mostly;
 
+#define MAX_PTE_MAPPED_THP 8
+
 /**
  * struct mm_slot - hash lookup from mm to mm_slot
  * @hash: hash collision list
@@ -86,6 +89,10 @@
 	struct hlist_node hash;
 	struct list_head mm_node;
 	struct mm_struct *mm;
+
+	/* pte-mapped THP in this mm */
+	int nr_pte_mapped_thp;
+	unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
 };
 
 /**
@@ -404,7 +411,11 @@
 	    (vm_flags & VM_NOHUGEPAGE) ||
 	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
 		return false;
-	if (shmem_file(vma->vm_file)) {
+
+	if (shmem_file(vma->vm_file) ||
+	    (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+	     vma->vm_file &&
+	     (vm_flags & VM_DENYWRITE))) {
 		if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
 			return false;
 		return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
@@ -456,8 +467,9 @@
 	unsigned long hstart, hend;
 
 	/*
-	 * khugepaged does not yet work on non-shmem files or special
-	 * mappings. And file-private shmem THP is not supported.
+	 * For non-shmem files, khugepaged only supports read-only mappings.
+	 * khugepaged does not yet work on special mappings. And
+	 * file-private shmem THP is not supported.
 	 */
 	if (!hugepage_vma_check(vma, vm_flags))
 		return 0;
@@ -1248,6 +1260,159 @@
 }
 
 #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+/*
+ * Notify khugepaged that the given addr of the mm is a pte-mapped THP. Then
+ * khugepaged should try to collapse the page table.
+ */
+static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+					 unsigned long addr)
+{
+	struct mm_slot *mm_slot;
+
+	VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+
+	spin_lock(&khugepaged_mm_lock);
+	mm_slot = get_mm_slot(mm);
+	if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
+		mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
+	spin_unlock(&khugepaged_mm_lock);
+	return 0;
+}
+
+/**
+ * Try to collapse a pte-mapped THP for mm at address haddr.
+ *
+ * This function checks whether all the PTEs in the PMD are pointing to the
+ * right THP. If so, retract the page table so the THP can refault in as
+ * pmd-mapped.
+ */
+void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
+{
+	unsigned long haddr = addr & HPAGE_PMD_MASK;
+	struct vm_area_struct *vma = find_vma(mm, haddr);
+	struct page *hpage = NULL;
+	pte_t *start_pte, *pte;
+	pmd_t *pmd, _pmd;
+	spinlock_t *ptl;
+	int count = 0;
+	int i;
+
+	if (!vma || !vma->vm_file ||
+	    vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+		return;
+
+	/*
+	 * This vm_flags may not have VM_HUGEPAGE if the page was not
+	 * collapsed by this mm. But we can still collapse if the page is
+	 * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
+	 * a valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
+	 * will not fail the vma for a missing VM_HUGEPAGE.
+	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
+		return;
+
+	pmd = mm_find_pmd(mm, haddr);
+	if (!pmd)
+		return;
+
+	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+
+	/* step 1: check all mapped PTEs are to the right huge page */
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *page;
+
+		/* empty pte, skip */
+		if (pte_none(*pte))
+			continue;
+
+		/* page swapped out, abort */
+		if (!pte_present(*pte))
+			goto abort;
+
+		page = vm_normal_page(vma, addr, *pte);
+
+		if (!page || !PageCompound(page))
+			goto abort;
+
+		if (!hpage) {
+			hpage = compound_head(page);
+			/*
+			 * The mapping of the THP should not change.
+			 *
+			 * Note that uprobe, debugger, or MAP_PRIVATE may
+			 * change the page table, but the new page will
+			 * not pass PageCompound() check.
+			 */
+			if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
+				goto abort;
+		}
+
+		/*
+		 * Confirm the page maps to the correct subpage.
+		 *
+		 * Note that uprobe, debugger, or MAP_PRIVATE may change
+		 * the page table, but the new page will not pass
+		 * PageCompound() check.
+		 */
+		if (WARN_ON(hpage + i != page))
+			goto abort;
+		count++;
+	}
+
+	/* step 2: adjust rmap */
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *page;
+
+		if (pte_none(*pte))
+			continue;
+		page = vm_normal_page(vma, addr, *pte);
+		page_remove_rmap(page, false);
+	}
+
+	pte_unmap_unlock(start_pte, ptl);
+
+	/* step 3: set proper refcount and mm_counters. */
+	if (hpage) {
+		page_ref_sub(hpage, count);
+		add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+	}
+
+	/* step 4: collapse pmd */
+	ptl = pmd_lock(vma->vm_mm, pmd);
+	_pmd = pmdp_collapse_flush(vma, addr, pmd);
+	spin_unlock(ptl);
+	mm_dec_nr_ptes(mm);
+	pte_free(mm, pmd_pgtable(_pmd));
+	return;
+
+abort:
+	pte_unmap_unlock(start_pte, ptl);
+}
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+	struct mm_struct *mm = mm_slot->mm;
+	int i;
+
+	if (likely(mm_slot->nr_pte_mapped_thp == 0))
+		return 0;
+
+	if (!down_write_trylock(&mm->mmap_sem))
+		return -EBUSY;
+
+	if (unlikely(khugepaged_test_exit(mm)))
+		goto out;
+
+	for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
+		collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
+
+out:
+	mm_slot->nr_pte_mapped_thp = 0;
+	up_write(&mm->mmap_sem);
+	return 0;
+}
+
 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma;
@@ -1256,7 +1421,22 @@
 
 	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-		/* probably overkill */
+		/*
+		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+		 * got written to. These VMAs are likely not worth the
+		 * down_write(mmap_sem) as the PMD-mapping is likely to be
+		 * split later anyway.
+		 *
+		 * Note that the vma->anon_vma check is racy: it can be set up
+		 * after the check but before we take mmap_sem by the fault
+		 * path. But the page lock would prevent establishing any new
+		 * ptes of the page, so we are safe.
+		 *
+		 * An alternative would be to drop the check, but to check that
+		 * the page table is clear before calling pmdp_collapse_flush()
+		 * under ptl. That has a higher chance to recover the THP for
+		 * the VMA, but also a higher cost.
+		 */
 		if (vma->anon_vma)
 			continue;
 		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -1269,9 +1449,10 @@
 			continue;
 		/*
 		 * We need exclusive mmap_sem to retract page table.
-		 * If trylock fails we would end up with pte-mapped THP after
-		 * re-fault. Not ideal, but it's more important to not disturb
-		 * the system too much.
+		 *
+		 * We use trylock due to lock inversion: we need to acquire
+		 * mmap_sem while holding page lock. Fault path does it in
+		 * reverse order. Trylock is a way to avoid deadlock.
 		 */
 		if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
 			spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
@@ -1281,18 +1462,21 @@
 			up_write(&vma->vm_mm->mmap_sem);
 			mm_dec_nr_ptes(vma->vm_mm);
 			pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+		} else {
+			/* Try again later */
+			khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
 		}
 	}
 	i_mmap_unlock_write(mapping);
 }
 
 /**
- * collapse_shmem - collapse small tmpfs/shmem pages into huge one.
+ * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
  *
  * Basic scheme is simple, details are more complex:
  *  - allocate and lock a new huge page;
  *  - scan page cache replacing old pages with the new one
- *    + swap in pages if necessary;
+ *    + swap/gup in pages if necessary;
  *    + fill in gaps;
  *    + keep old pages around in case rollback is required;
  *  - if replacing succeeds:
@@ -1304,10 +1488,11 @@
  *    + restore gaps in the page cache;
  *    + unlock and free huge page;
  */
-static void collapse_shmem(struct mm_struct *mm,
-		struct address_space *mapping, pgoff_t start,
+static void collapse_file(struct mm_struct *mm,
+		struct file *file, pgoff_t start,
 		struct page **hpage, int node)
 {
+	struct address_space *mapping = file->f_mapping;
 	gfp_t gfp;
 	struct page *new_page;
 	struct mem_cgroup *memcg;
@@ -1315,7 +1500,9 @@
 	LIST_HEAD(pagelist);
 	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
 	int nr_none = 0, result = SCAN_SUCCEED;
+	bool is_shmem = shmem_file(file);
 
+	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
 	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
 
 	/* Only allocate from the target node */
@@ -1347,7 +1534,8 @@
 	} while (1);
 
 	__SetPageLocked(new_page);
-	__SetPageSwapBacked(new_page);
+	if (is_shmem)
+		__SetPageSwapBacked(new_page);
 	new_page->index = start;
 	new_page->mapping = mapping;
 
@@ -1362,41 +1550,75 @@
 		struct page *page = xas_next(&xas);
 
 		VM_BUG_ON(index != xas.xa_index);
-		if (!page) {
-			/*
-			 * Stop if extent has been truncated or hole-punched,
-			 * and is now completely empty.
-			 */
-			if (index == start) {
-				if (!xas_next_entry(&xas, end - 1)) {
-					result = SCAN_TRUNCATED;
+		if (is_shmem) {
+			if (!page) {
+				/*
+				 * Stop if extent has been truncated or
+				 * hole-punched, and is now completely
+				 * empty.
+				 */
+				if (index == start) {
+					if (!xas_next_entry(&xas, end - 1)) {
+						result = SCAN_TRUNCATED;
+						goto xa_locked;
+					}
+					xas_set(&xas, index);
+				}
+				if (!shmem_charge(mapping->host, 1)) {
+					result = SCAN_FAIL;
 					goto xa_locked;
 				}
-				xas_set(&xas, index);
+				xas_store(&xas, new_page);
+				nr_none++;
+				continue;
 			}
-			if (!shmem_charge(mapping->host, 1)) {
-				result = SCAN_FAIL;
+
+			if (xa_is_value(page) || !PageUptodate(page)) {
+				xas_unlock_irq(&xas);
+				/* swap in or instantiate fallocated page */
+				if (shmem_getpage(mapping->host, index, &page,
+						  SGP_NOHUGE)) {
+					result = SCAN_FAIL;
+					goto xa_unlocked;
+				}
+			} else if (trylock_page(page)) {
+				get_page(page);
+				xas_unlock_irq(&xas);
+			} else {
+				result = SCAN_PAGE_LOCK;
 				goto xa_locked;
 			}
-			xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
-			nr_none++;
-			continue;
-		}
-
-		if (xa_is_value(page) || !PageUptodate(page)) {
-			xas_unlock_irq(&xas);
-			/* swap in or instantiate fallocated page */
-			if (shmem_getpage(mapping->host, index, &page,
-						SGP_NOHUGE)) {
+		} else {	/* !is_shmem */
+			if (!page || xa_is_value(page)) {
+				xas_unlock_irq(&xas);
+				page_cache_sync_readahead(mapping, &file->f_ra,
+							  file, index,
+							  PAGE_SIZE);
+				/* drain pagevecs to help isolate_lru_page() */
+				lru_add_drain();
+				page = find_lock_page(mapping, index);
+				if (unlikely(page == NULL)) {
+					result = SCAN_FAIL;
+					goto xa_unlocked;
+				}
+			} else if (!PageUptodate(page)) {
+				xas_unlock_irq(&xas);
+				wait_on_page_locked(page);
+				if (!trylock_page(page)) {
+					result = SCAN_PAGE_LOCK;
+					goto xa_unlocked;
+				}
+				get_page(page);
+			} else if (PageDirty(page)) {
 				result = SCAN_FAIL;
-				goto xa_unlocked;
+				goto xa_locked;
+			} else if (trylock_page(page)) {
+				get_page(page);
+				xas_unlock_irq(&xas);
+			} else {
+				result = SCAN_PAGE_LOCK;
+				goto xa_locked;
 			}
-		} else if (trylock_page(page)) {
-			get_page(page);
-			xas_unlock_irq(&xas);
-		} else {
-			result = SCAN_PAGE_LOCK;
-			goto xa_locked;
 		}
 
 		/*
@@ -1425,6 +1647,12 @@
 			goto out_unlock;
 		}
 
+		if (page_has_private(page) &&
+		    !try_to_release_page(page, GFP_KERNEL)) {
+			result = SCAN_PAGE_HAS_PRIVATE;
+			goto out_unlock;
+		}
+
 		if (page_mapped(page))
 			unmap_mapping_pages(mapping, index, 1, false);
 
@@ -1454,7 +1682,7 @@
 		list_add_tail(&page->lru, &pagelist);
 
 		/* Finally, replace with the new page. */
-		xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+		xas_store(&xas, new_page);
 		continue;
 out_unlock:
 		unlock_page(page);
@@ -1462,12 +1690,20 @@
 		goto xa_unlocked;
 	}
 
-	__inc_node_page_state(new_page, NR_SHMEM_THPS);
+	if (is_shmem)
+		__inc_node_page_state(new_page, NR_SHMEM_THPS);
+	else {
+		__inc_node_page_state(new_page, NR_FILE_THPS);
+		filemap_nr_thps_inc(mapping);
+	}
+
 	if (nr_none) {
 		struct zone *zone = page_zone(new_page);
 
 		__mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
-		__mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
+		if (is_shmem)
+			__mod_node_page_state(zone->zone_pgdat,
+					      NR_SHMEM, nr_none);
 	}
 
 xa_locked:
@@ -1505,10 +1741,15 @@
 
 		SetPageUptodate(new_page);
 		page_ref_add(new_page, HPAGE_PMD_NR - 1);
-		set_page_dirty(new_page);
 		mem_cgroup_commit_charge(new_page, memcg, false, true);
+
+		if (is_shmem) {
+			set_page_dirty(new_page);
+			lru_cache_add_anon(new_page);
+		} else {
+			lru_cache_add_file(new_page);
+		}
 		count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
-		lru_cache_add_anon(new_page);
 
 		/*
 		 * Remove pte page tables, so we can re-fault the page as huge.
@@ -1523,7 +1764,9 @@
 		/* Something went wrong: roll back page cache changes */
 		xas_lock_irq(&xas);
 		mapping->nrpages -= nr_none;
-		shmem_uncharge(mapping->host, nr_none);
+
+		if (is_shmem)
+			shmem_uncharge(mapping->host, nr_none);
 
 		xas_set(&xas, start);
 		xas_for_each(&xas, page, end - 1) {
@@ -1563,11 +1806,11 @@
 	/* TODO: tracepoints */
 }
 
-static void khugepaged_scan_shmem(struct mm_struct *mm,
-		struct address_space *mapping,
-		pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm,
+		struct file *file, pgoff_t start, struct page **hpage)
 {
 	struct page *page = NULL;
+	struct address_space *mapping = file->f_mapping;
 	XA_STATE(xas, &mapping->i_pages, start);
 	int present, swap;
 	int node = NUMA_NO_NODE;
@@ -1606,7 +1849,8 @@
 			break;
 		}
 
-		if (page_count(page) != 1 + page_mapcount(page)) {
+		if (page_count(page) !=
+		    1 + page_mapcount(page) + page_has_private(page)) {
 			result = SCAN_PAGE_COUNT;
 			break;
 		}
@@ -1631,19 +1875,23 @@
 			result = SCAN_EXCEED_NONE_PTE;
 		} else {
 			node = khugepaged_find_target_node();
-			collapse_shmem(mm, mapping, start, hpage, node);
+			collapse_file(mm, file, start, hpage, node);
 		}
 	}
 
 	/* TODO: tracepoints */
 }
 #else
-static void khugepaged_scan_shmem(struct mm_struct *mm,
-		struct address_space *mapping,
-		pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm,
+		struct file *file, pgoff_t start, struct page **hpage)
 {
 	BUILD_BUG();
 }
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+	return 0;
+}
 #endif
 
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
@@ -1668,6 +1916,7 @@
 		khugepaged_scan.mm_slot = mm_slot;
 	}
 	spin_unlock(&khugepaged_mm_lock);
+	khugepaged_collapse_pte_mapped_thps(mm_slot);
 
 	mm = mm_slot->mm;
 	/*
@@ -1713,17 +1962,18 @@
 			VM_BUG_ON(khugepaged_scan.address < hstart ||
 				  khugepaged_scan.address + HPAGE_PMD_SIZE >
 				  hend);
-			if (shmem_file(vma->vm_file)) {
+			if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
 				struct file *file;
 				pgoff_t pgoff = linear_page_index(vma,
 						khugepaged_scan.address);
-				if (!shmem_huge_enabled(vma))
+
+				if (shmem_file(vma->vm_file)
+				    && !shmem_huge_enabled(vma))
 					goto skip;
 				file = get_file(vma->vm_file);
 				up_read(&mm->mmap_sem);
 				ret = 1;
-				khugepaged_scan_shmem(mm, file->f_mapping,
-						pgoff, hpage);
+				khugepaged_scan_file(mm, file, pgoff, hpage);
 				fput(file);
 			} else {
 				ret = khugepaged_scan_pmd(mm, vma,
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f6e6029..03a8d84 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -168,6 +168,8 @@
 #define OBJECT_REPORTED		(1 << 1)
 /* flag set to not scan the object */
 #define OBJECT_NO_SCAN		(1 << 2)
+/* flag set to fully scan the object when scan_area allocation failed */
+#define OBJECT_FULL_SCAN	(1 << 3)
 
 #define HEX_PREFIX		"    "
 /* number of bytes to print per line; must be 16 or 32 */
@@ -183,6 +185,10 @@
 static LIST_HEAD(object_list);
 /* the list of gray-colored objects (see color_gray comment below) */
 static LIST_HEAD(gray_list);
+/* memory pool allocation */
+static struct kmemleak_object mem_pool[CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE];
+static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
+static LIST_HEAD(mem_pool_free_list);
 /* search tree for object boundaries */
 static struct rb_root object_tree_root = RB_ROOT;
 /* rw_lock protecting the access to object_list and object_tree_root */
@@ -193,13 +199,11 @@
 static struct kmem_cache *scan_area_cache;
 
 /* set if tracing memory operations is enabled */
-static int kmemleak_enabled;
+static int kmemleak_enabled = 1;
 /* same as above but only for the kmemleak_free() callback */
-static int kmemleak_free_enabled;
+static int kmemleak_free_enabled = 1;
 /* set in the late_initcall if there were no errors */
 static int kmemleak_initialized;
-/* enables or disables early logging of the memory operations */
-static int kmemleak_early_log = 1;
 /* set if a kmemleak warning was issued */
 static int kmemleak_warning;
 /* set if a fatal kmemleak error has occurred */
@@ -227,49 +231,6 @@
 static bool kmemleak_verbose;
 module_param_named(verbose, kmemleak_verbose, bool, 0600);
 
-/*
- * Early object allocation/freeing logging. Kmemleak is initialized after the
- * kernel allocator. However, both the kernel allocator and kmemleak may
- * allocate memory blocks which need to be tracked. Kmemleak defines an
- * arbitrary buffer to hold the allocation/freeing information before it is
- * fully initialized.
- */
-
-/* kmemleak operation type for early logging */
-enum {
-	KMEMLEAK_ALLOC,
-	KMEMLEAK_ALLOC_PERCPU,
-	KMEMLEAK_FREE,
-	KMEMLEAK_FREE_PART,
-	KMEMLEAK_FREE_PERCPU,
-	KMEMLEAK_NOT_LEAK,
-	KMEMLEAK_IGNORE,
-	KMEMLEAK_SCAN_AREA,
-	KMEMLEAK_NO_SCAN,
-	KMEMLEAK_SET_EXCESS_REF
-};
-
-/*
- * Structure holding the information passed to kmemleak callbacks during the
- * early logging.
- */
-struct early_log {
-	int op_type;			/* kmemleak operation type */
-	int min_count;			/* minimum reference count */
-	const void *ptr;		/* allocated/freed memory block */
-	union {
-		size_t size;		/* memory block size */
-		unsigned long excess_ref; /* surplus reference passing */
-	};
-	unsigned long trace[MAX_TRACE];	/* stack trace */
-	unsigned int trace_len;		/* stack trace length */
-};
-
-/* early logging buffer and current position */
-static struct early_log
-	early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
-static int crt_early_log __initdata;
-
 static void kmemleak_disable(void);
 
 /*
@@ -450,6 +411,54 @@
 }
 
 /*
+ * Memory pool allocation and freeing. kmemleak_lock must not be held.
+ */
+static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	/* try the slab allocator first */
+	if (object_cache) {
+		object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+		if (object)
+			return object;
+	}
+
+	/* slab allocation failed, try the memory pool */
+	write_lock_irqsave(&kmemleak_lock, flags);
+	object = list_first_entry_or_null(&mem_pool_free_list,
+					  typeof(*object), object_list);
+	if (object)
+		list_del(&object->object_list);
+	else if (mem_pool_free_count)
+		object = &mem_pool[--mem_pool_free_count];
+	else
+		pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
+	write_unlock_irqrestore(&kmemleak_lock, flags);
+
+	return object;
+}
+
+/*
+ * Return the object to either the slab allocator or the memory pool.
+ */
+static void mem_pool_free(struct kmemleak_object *object)
+{
+	unsigned long flags;
+
+	if (object < mem_pool || object >= mem_pool + ARRAY_SIZE(mem_pool)) {
+		kmem_cache_free(object_cache, object);
+		return;
+	}
+
+	/* add the object to the memory pool free list */
+	write_lock_irqsave(&kmemleak_lock, flags);
+	list_add(&object->object_list, &mem_pool_free_list);
+	write_unlock_irqrestore(&kmemleak_lock, flags);
+}
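The allocation order above — slab cache first, then the free list of returned pool objects, then untouched pool slots — can be modelled compactly. A sketch under the assumption that malloc() stands in for the slab cache and a flag stands in for object_cache being ready:

#include <stdio.h>
#include <stdlib.h>

#define POOL_SIZE 8

struct object { struct object *next; };

static struct object pool[POOL_SIZE];
static int pool_free_count = POOL_SIZE;
static struct object *free_list;	/* objects returned to the pool */
static int slab_up;			/* 0 early in boot, 1 once slab works */

static struct object *obj_alloc(void)
{
	struct object *obj;

	if (slab_up && (obj = malloc(sizeof(*obj))))
		return obj;
	if (free_list) {			/* reuse a returned pool slot */
		obj = free_list;
		free_list = obj->next;
		return obj;
	}
	if (pool_free_count)			/* carve a fresh pool slot */
		return &pool[--pool_free_count];
	fprintf(stderr, "pool empty\n");
	return NULL;
}

static void obj_free(struct object *obj)
{
	if (obj < pool || obj >= pool + POOL_SIZE) {
		free(obj);			/* came from the "slab" */
		return;
	}
	obj->next = free_list;			/* back onto the pool free list */
	free_list = obj;
}

int main(void)
{
	struct object *a = obj_alloc(), *b = obj_alloc();

	obj_free(a);
	obj_free(b);
	slab_up = 1;
	obj_free(obj_alloc());			/* now served by malloc() */
	return 0;
}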
+
+/*
  * RCU callback to free a kmemleak_object.
  */
 static void free_object_rcu(struct rcu_head *rcu)
@@ -467,7 +476,7 @@
 		hlist_del(&area->node);
 		kmem_cache_free(scan_area_cache, area);
 	}
-	kmem_cache_free(object_cache, object);
+	mem_pool_free(object);
 }
 
 /*
@@ -485,7 +494,15 @@
 	/* should only get here after delete_object was called */
 	WARN_ON(object->flags & OBJECT_ALLOCATED);
 
-	call_rcu(&object->rcu, free_object_rcu);
+	/*
+	 * It may be too early for the RCU callbacks, however, there is no
+	 * It may be too early for the RCU callbacks; however, there is no
+	 * came from the memory pool. Free the object directly.
+	 */
+	if (object_cache)
+		call_rcu(&object->rcu, free_object_rcu);
+	else
+		free_object_rcu(&object->rcu);
 }
 
 /*
@@ -550,7 +567,7 @@
 	struct rb_node **link, *rb_parent;
 	unsigned long untagged_ptr;
 
-	object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+	object = mem_pool_alloc(gfp);
 	if (!object) {
 		pr_warn("Cannot allocate a kmemleak_object structure\n");
 		kmemleak_disable();
@@ -689,9 +706,7 @@
 	/*
 	 * Create one or two objects that may result from the memory block
 	 * split. Note that partial freeing is only done by free_bootmem() and
-	 * this happens before kmemleak_init() is called. The path below is
-	 * only executed during early log recording in kmemleak_init(), so
-	 * GFP_KERNEL is enough.
+	 * this happens before kmemleak_init() is called.
 	 */
 	start = object->pointer;
 	end = object->pointer + object->size;
@@ -763,7 +778,7 @@
 {
 	unsigned long flags;
 	struct kmemleak_object *object;
-	struct kmemleak_scan_area *area;
+	struct kmemleak_scan_area *area = NULL;
 
 	object = find_and_get_object(ptr, 1);
 	if (!object) {
@@ -772,13 +787,16 @@
 		return;
 	}
 
-	area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
-	if (!area) {
-		pr_warn("Cannot allocate a scan area\n");
-		goto out;
-	}
+	if (scan_area_cache)
+		area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
 
 	spin_lock_irqsave(&object->lock, flags);
+	if (!area) {
+		pr_warn_once("Cannot allocate a scan area, scanning the full object\n");
+		/* mark the object for full scan to avoid false positives */
+		object->flags |= OBJECT_FULL_SCAN;
+		goto out_unlock;
+	}
 	if (size == SIZE_MAX) {
 		size = object->pointer + object->size - ptr;
 	} else if (ptr + size > object->pointer + object->size) {
@@ -795,7 +813,6 @@
 	hlist_add_head(&area->node, &object->area_list);
 out_unlock:
 	spin_unlock_irqrestore(&object->lock, flags);
-out:
 	put_object(object);
 }
 
@@ -845,86 +862,6 @@
 	put_object(object);
 }
 
-/*
- * Log an early kmemleak_* call to the early_log buffer. These calls will be
- * processed later once kmemleak is fully initialized.
- */
-static void __init log_early(int op_type, const void *ptr, size_t size,
-			     int min_count)
-{
-	unsigned long flags;
-	struct early_log *log;
-
-	if (kmemleak_error) {
-		/* kmemleak stopped recording, just count the requests */
-		crt_early_log++;
-		return;
-	}
-
-	if (crt_early_log >= ARRAY_SIZE(early_log)) {
-		crt_early_log++;
-		kmemleak_disable();
-		return;
-	}
-
-	/*
-	 * There is no need for locking since the kernel is still in UP mode
-	 * at this stage. Disabling the IRQs is enough.
-	 */
-	local_irq_save(flags);
-	log = &early_log[crt_early_log];
-	log->op_type = op_type;
-	log->ptr = ptr;
-	log->size = size;
-	log->min_count = min_count;
-	log->trace_len = __save_stack_trace(log->trace);
-	crt_early_log++;
-	local_irq_restore(flags);
-}
-
-/*
- * Log an early allocated block and populate the stack trace.
- */
-static void early_alloc(struct early_log *log)
-{
-	struct kmemleak_object *object;
-	unsigned long flags;
-	int i;
-
-	if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr))
-		return;
-
-	/*
-	 * RCU locking needed to ensure object is not freed via put_object().
-	 */
-	rcu_read_lock();
-	object = create_object((unsigned long)log->ptr, log->size,
-			       log->min_count, GFP_ATOMIC);
-	if (!object)
-		goto out;
-	spin_lock_irqsave(&object->lock, flags);
-	for (i = 0; i < log->trace_len; i++)
-		object->trace[i] = log->trace[i];
-	object->trace_len = log->trace_len;
-	spin_unlock_irqrestore(&object->lock, flags);
-out:
-	rcu_read_unlock();
-}
-
-/*
- * Log an early allocated block and populate the stack trace.
- */
-static void early_alloc_percpu(struct early_log *log)
-{
-	unsigned int cpu;
-	const void __percpu *ptr = log->ptr;
-
-	for_each_possible_cpu(cpu) {
-		log->ptr = per_cpu_ptr(ptr, cpu);
-		early_alloc(log);
-	}
-}
-
 /**
  * kmemleak_alloc - register a newly allocated object
  * @ptr:	pointer to beginning of the object
@@ -946,8 +883,6 @@
 
 	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
 		create_object((unsigned long)ptr, size, min_count, gfp);
-	else if (kmemleak_early_log)
-		log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
 }
 EXPORT_SYMBOL_GPL(kmemleak_alloc);
 
@@ -975,8 +910,6 @@
 		for_each_possible_cpu(cpu)
 			create_object((unsigned long)per_cpu_ptr(ptr, cpu),
 				      size, 0, gfp);
-	else if (kmemleak_early_log)
-		log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
 }
 EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
 
@@ -1001,11 +934,6 @@
 		create_object((unsigned long)area->addr, size, 2, gfp);
 		object_set_excess_ref((unsigned long)area,
 				      (unsigned long)area->addr);
-	} else if (kmemleak_early_log) {
-		log_early(KMEMLEAK_ALLOC, area->addr, size, 2);
-		/* reusing early_log.size for storing area->addr */
-		log_early(KMEMLEAK_SET_EXCESS_REF,
-			  area, (unsigned long)area->addr, 0);
 	}
 }
 EXPORT_SYMBOL_GPL(kmemleak_vmalloc);
@@ -1023,8 +951,6 @@
 
 	if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
 		delete_object_full((unsigned long)ptr);
-	else if (kmemleak_early_log)
-		log_early(KMEMLEAK_FREE, ptr, 0, 0);
 }
 EXPORT_SYMBOL_GPL(kmemleak_free);
 
@@ -1043,8 +969,6 @@
 
 	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
 		delete_object_part((unsigned long)ptr, size);
-	else if (kmemleak_early_log)
-		log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
 }
 EXPORT_SYMBOL_GPL(kmemleak_free_part);
 
@@ -1065,8 +989,6 @@
 		for_each_possible_cpu(cpu)
 			delete_object_full((unsigned long)per_cpu_ptr(ptr,
 								      cpu));
-	else if (kmemleak_early_log)
-		log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0);
 }
 EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
 
@@ -1117,8 +1039,6 @@
 
 	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
 		make_gray_object((unsigned long)ptr);
-	else if (kmemleak_early_log)
-		log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
 }
 EXPORT_SYMBOL(kmemleak_not_leak);
 
@@ -1137,8 +1057,6 @@
 
 	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
 		make_black_object((unsigned long)ptr);
-	else if (kmemleak_early_log)
-		log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
 }
 EXPORT_SYMBOL(kmemleak_ignore);
 
@@ -1159,8 +1077,6 @@
 
 	if (kmemleak_enabled && ptr && size && !IS_ERR(ptr))
 		add_scan_area((unsigned long)ptr, size, gfp);
-	else if (kmemleak_early_log)
-		log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
 }
 EXPORT_SYMBOL(kmemleak_scan_area);
 
@@ -1179,8 +1095,6 @@
 
 	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
 		object_no_scan((unsigned long)ptr);
-	else if (kmemleak_early_log)
-		log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
 }
 EXPORT_SYMBOL(kmemleak_no_scan);
 
@@ -1408,7 +1322,8 @@
 	if (!(object->flags & OBJECT_ALLOCATED))
 		/* already freed object */
 		goto out;
-	if (hlist_empty(&object->area_list)) {
+	if (hlist_empty(&object->area_list) ||
+	    object->flags & OBJECT_FULL_SCAN) {
 		void *start = (void *)object->pointer;
 		void *end = (void *)(object->pointer + object->size);
 		void *next;
@@ -1966,7 +1881,6 @@
 
 	/* stop any memory operation tracing */
 	kmemleak_enabled = 0;
-	kmemleak_early_log = 0;
 
 	/* check whether it is too early for a kernel thread */
 	if (kmemleak_initialized)
@@ -1994,20 +1908,11 @@
 }
 early_param("kmemleak", kmemleak_boot_config);
 
-static void __init print_log_trace(struct early_log *log)
-{
-	pr_notice("Early log backtrace:\n");
-	stack_trace_print(log->trace, log->trace_len, 2);
-}
-
 /*
  * Kmemleak initialization.
  */
 void __init kmemleak_init(void)
 {
-	int i;
-	unsigned long flags;
-
 #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
 	if (!kmemleak_skip_disable) {
 		kmemleak_disable();
@@ -2015,28 +1920,15 @@
 	}
 #endif
 
+	if (kmemleak_error)
+		return;
+
 	jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
 	jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
 
 	object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
 	scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
 
-	if (crt_early_log > ARRAY_SIZE(early_log))
-		pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n",
-			crt_early_log);
-
-	/* the kernel is still in UP mode, so disabling the IRQs is enough */
-	local_irq_save(flags);
-	kmemleak_early_log = 0;
-	if (kmemleak_error) {
-		local_irq_restore(flags);
-		return;
-	} else {
-		kmemleak_enabled = 1;
-		kmemleak_free_enabled = 1;
-	}
-	local_irq_restore(flags);
-
 	/* register the data/bss sections */
 	create_object((unsigned long)_sdata, _edata - _sdata,
 		      KMEMLEAK_GREY, GFP_ATOMIC);
@@ -2047,57 +1939,6 @@
 		create_object((unsigned long)__start_ro_after_init,
 			      __end_ro_after_init - __start_ro_after_init,
 			      KMEMLEAK_GREY, GFP_ATOMIC);
-
-	/*
-	 * This is the point where tracking allocations is safe. Automatic
-	 * scanning is started during the late initcall. Add the early logged
-	 * callbacks to the kmemleak infrastructure.
-	 */
-	for (i = 0; i < crt_early_log; i++) {
-		struct early_log *log = &early_log[i];
-
-		switch (log->op_type) {
-		case KMEMLEAK_ALLOC:
-			early_alloc(log);
-			break;
-		case KMEMLEAK_ALLOC_PERCPU:
-			early_alloc_percpu(log);
-			break;
-		case KMEMLEAK_FREE:
-			kmemleak_free(log->ptr);
-			break;
-		case KMEMLEAK_FREE_PART:
-			kmemleak_free_part(log->ptr, log->size);
-			break;
-		case KMEMLEAK_FREE_PERCPU:
-			kmemleak_free_percpu(log->ptr);
-			break;
-		case KMEMLEAK_NOT_LEAK:
-			kmemleak_not_leak(log->ptr);
-			break;
-		case KMEMLEAK_IGNORE:
-			kmemleak_ignore(log->ptr);
-			break;
-		case KMEMLEAK_SCAN_AREA:
-			kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
-			break;
-		case KMEMLEAK_NO_SCAN:
-			kmemleak_no_scan(log->ptr);
-			break;
-		case KMEMLEAK_SET_EXCESS_REF:
-			object_set_excess_ref((unsigned long)log->ptr,
-					      log->excess_ref);
-			break;
-		default:
-			kmemleak_warn("Unknown early log operation: %d\n",
-				      log->op_type);
-		}
-
-		if (kmemleak_warning) {
-			print_log_trace(log);
-			kmemleak_warning = 0;
-		}
-	}
 }
 
 /*
@@ -2126,7 +1967,8 @@
 		mutex_unlock(&scan_mutex);
 	}
 
-	pr_info("Kernel memory leak detector initialized\n");
+	pr_info("Kernel memory leak detector initialized (mem pool available: %d)\n",
+		mem_pool_free_count);
 
 	return 0;
 }
diff --git a/mm/ksm.c b/mm/ksm.c
index 3dc4346..dbee2eb 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1029,24 +1029,6 @@
 	return checksum;
 }
 
-static int memcmp_pages(struct page *page1, struct page *page2)
-{
-	char *addr1, *addr2;
-	int ret;
-
-	addr1 = kmap_atomic(page1);
-	addr2 = kmap_atomic(page2);
-	ret = memcmp(addr1, addr2, PAGE_SIZE);
-	kunmap_atomic(addr2);
-	kunmap_atomic(addr1);
-	return ret;
-}
-
-static inline int pages_identical(struct page *page1, struct page *page2)
-{
-	return !memcmp_pages(page1, page2);
-}
-
 static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 			      pte_t *orig_pte)
 {
diff --git a/mm/madvise.c b/mm/madvise.c
index 88babcc..2be9f3f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
 #include <linux/syscalls.h>
 #include <linux/mempolicy.h>
 #include <linux/page-isolation.h>
+#include <linux/page_idle.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/hugetlb.h>
 #include <linux/falloc.h>
@@ -31,6 +32,11 @@
 
 #include "internal.h"
 
+struct madvise_walk_private {
+	struct mmu_gather *tlb;
+	bool pageout;
+};
+
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
  * take mmap_sem for writing. Others, which simply traverse vmas, need
@@ -42,6 +48,8 @@
 	case MADV_REMOVE:
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
+	case MADV_COLD:
+	case MADV_PAGEOUT:
 	case MADV_FREE:
 		return 0;
 	default:
@@ -107,28 +115,14 @@
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
 		error = ksm_madvise(vma, start, end, behavior, &new_flags);
-		if (error) {
-			/*
-			 * madvise() returns EAGAIN if kernel resources, such as
-			 * slab, are temporarily unavailable.
-			 */
-			if (error == -ENOMEM)
-				error = -EAGAIN;
-			goto out;
-		}
+		if (error)
+			goto out_convert_errno;
 		break;
 	case MADV_HUGEPAGE:
 	case MADV_NOHUGEPAGE:
 		error = hugepage_madvise(vma, &new_flags, behavior);
-		if (error) {
-			/*
-			 * madvise() returns EAGAIN if kernel resources, such as
-			 * slab, are temporarily unavailable.
-			 */
-			if (error == -ENOMEM)
-				error = -EAGAIN;
-			goto out;
-		}
+		if (error)
+			goto out_convert_errno;
 		break;
 	}
 
@@ -154,15 +148,8 @@
 			goto out;
 		}
 		error = __split_vma(mm, vma, start, 1);
-		if (error) {
-			/*
-			 * madvise() returns EAGAIN if kernel resources, such as
-			 * slab, are temporarily unavailable.
-			 */
-			if (error == -ENOMEM)
-				error = -EAGAIN;
-			goto out;
-		}
+		if (error)
+			goto out_convert_errno;
 	}
 
 	if (end != vma->vm_end) {
@@ -171,15 +158,8 @@
 			goto out;
 		}
 		error = __split_vma(mm, vma, end, 0);
-		if (error) {
-			/*
-			 * madvise() returns EAGAIN if kernel resources, such as
-			 * slab, are temporarily unavailable.
-			 */
-			if (error == -ENOMEM)
-				error = -EAGAIN;
-			goto out;
-		}
+		if (error)
+			goto out_convert_errno;
 	}
 
 success:
@@ -187,6 +167,14 @@
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 */
 	vma->vm_flags = new_flags;
+
+out_convert_errno:
+	/*
+	 * madvise() returns EAGAIN if kernel resources, such as
+	 * slab, are temporarily unavailable.
+	 */
+	if (error == -ENOMEM)
+		error = -EAGAIN;
 out:
 	return error;
 }
@@ -309,6 +297,254 @@
 	return 0;
 }
 
+static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
+				unsigned long addr, unsigned long end,
+				struct mm_walk *walk)
+{
+	struct madvise_walk_private *private = walk->private;
+	struct mmu_gather *tlb = private->tlb;
+	bool pageout = private->pageout;
+	struct mm_struct *mm = tlb->mm;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *orig_pte, *pte, ptent;
+	spinlock_t *ptl;
+	struct page *page = NULL;
+	LIST_HEAD(page_list);
+
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (pmd_trans_huge(*pmd)) {
+		pmd_t orig_pmd;
+		unsigned long next = pmd_addr_end(addr, end);
+
+		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+		ptl = pmd_trans_huge_lock(pmd, vma);
+		if (!ptl)
+			return 0;
+
+		orig_pmd = *pmd;
+		if (is_huge_zero_pmd(orig_pmd))
+			goto huge_unlock;
+
+		if (unlikely(!pmd_present(orig_pmd))) {
+			VM_BUG_ON(thp_migration_supported() &&
+					!is_pmd_migration_entry(orig_pmd));
+			goto huge_unlock;
+		}
+
+		page = pmd_page(orig_pmd);
+		if (next - addr != HPAGE_PMD_SIZE) {
+			int err;
+
+			if (page_mapcount(page) != 1)
+				goto huge_unlock;
+
+			get_page(page);
+			spin_unlock(ptl);
+			lock_page(page);
+			err = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (!err)
+				goto regular_page;
+			return 0;
+		}
+
+		if (pmd_young(orig_pmd)) {
+			pmdp_invalidate(vma, addr, pmd);
+			orig_pmd = pmd_mkold(orig_pmd);
+
+			set_pmd_at(mm, addr, pmd, orig_pmd);
+			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+		}
+
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+		if (pageout) {
+			if (!isolate_lru_page(page))
+				list_add(&page->lru, &page_list);
+		} else
+			deactivate_page(page);
+huge_unlock:
+		spin_unlock(ptl);
+		if (pageout)
+			reclaim_pages(&page_list);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+regular_page:
+#endif
+	tlb_change_page_size(tlb, PAGE_SIZE);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	flush_tlb_batched_pending(mm);
+	arch_enter_lazy_mmu_mode();
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+
+		if (pte_none(ptent))
+			continue;
+
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		/*
+		 * Creating a THP page is expensive so split it only if we
+		 * are sure it's worth it. Split it if we are the only owner.
+		 */
+		if (PageTransCompound(page)) {
+			if (page_mapcount(page) != 1)
+				break;
+			get_page(page);
+			if (!trylock_page(page)) {
+				put_page(page);
+				break;
+			}
+			pte_unmap_unlock(orig_pte, ptl);
+			if (split_huge_page(page)) {
+				unlock_page(page);
+				put_page(page);
+				pte_offset_map_lock(mm, pmd, addr, &ptl);
+				break;
+			}
+			unlock_page(page);
+			put_page(page);
+			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+			pte--;
+			addr -= PAGE_SIZE;
+			continue;
+		}
+
+		VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+		if (pte_young(ptent)) {
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
+							tlb->fullmm);
+			ptent = pte_mkold(ptent);
+			set_pte_at(mm, addr, pte, ptent);
+			tlb_remove_tlb_entry(tlb, pte, addr);
+		}
+
+		/*
+		 * We are deactivating a page to accelerate its reclaim.
+		 * The VM cannot reclaim the page unless we clear PG_young.
+		 * As a side effect, this confuses idle-page tracking,
+		 * which will miss the recent referenced history.
+		 */
+		ClearPageReferenced(page);
+		test_and_clear_page_young(page);
+		if (pageout) {
+			if (!isolate_lru_page(page))
+				list_add(&page->lru, &page_list);
+		} else
+			deactivate_page(page);
+	}
+
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(orig_pte, ptl);
+	if (pageout)
+		reclaim_pages(&page_list);
+	cond_resched();
+
+	return 0;
+}
+
+static const struct mm_walk_ops cold_walk_ops = {
+	.pmd_entry = madvise_cold_or_pageout_pte_range,
+};
+
+static void madvise_cold_page_range(struct mmu_gather *tlb,
+			     struct vm_area_struct *vma,
+			     unsigned long addr, unsigned long end)
+{
+	struct madvise_walk_private walk_private = {
+		.pageout = false,
+		.tlb = tlb,
+	};
+
+	tlb_start_vma(tlb, vma);
+	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+	tlb_end_vma(tlb, vma);
+}
+
+static long madvise_cold(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+	return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+			     struct vm_area_struct *vma,
+			     unsigned long addr, unsigned long end)
+{
+	struct madvise_walk_private walk_private = {
+		.pageout = true,
+		.tlb = tlb,
+	};
+
+	tlb_start_vma(tlb, vma);
+	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+	tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+	if (vma_is_anonymous(vma))
+		return true;
+	if (!vma->vm_file)
+		return false;
+	/*
+	 * paging out pagecache only for non-anonymous mappings that correspond
+	 * to the files the calling process could (if it tried) open for writing;
+	 * otherwise we'd be including shared non-exclusive mappings, which
+	 * opens a side channel.
+	 */
+	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+			struct vm_area_struct **prev,
+			unsigned long start_addr, unsigned long end_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_gather tlb;
+
+	*prev = vma;
+	if (!can_madv_lru_vma(vma))
+		return -EINVAL;
+
+	if (!can_do_pageout(vma))
+		return 0;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+	tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+	return 0;
+}
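From userspace, the two new hints are plain madvise() calls: MADV_COLD deactivates pages so they age out faster, MADV_PAGEOUT reclaims them immediately. A minimal sketch, assuming the MADV_COLD/MADV_PAGEOUT values added to the uapi headers by this series (20 and 21 on most architectures):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLD
#define MADV_COLD	20
#endif
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT	21
#endif

int main(void)
{
	size_t len = 1 << 20;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0xa5, len);

	/* cheap hint: move the range to the inactive LRU */
	if (madvise(buf, len, MADV_COLD))
		perror("MADV_COLD");
	/* stronger hint: reclaim the range to swap/backing store now */
	if (madvise(buf, len, MADV_PAGEOUT))
		perror("MADV_PAGEOUT");

	munmap(buf, len);
	return 0;
}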
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 
@@ -513,7 +749,7 @@
 				  int behavior)
 {
 	*prev = vma;
-	if (!can_madv_dontneed_vma(vma))
+	if (!can_madv_lru_vma(vma))
 		return -EINVAL;
 
 	if (!userfaultfd_remove(vma, start, end)) {
@@ -535,7 +771,7 @@
 			 */
 			return -ENOMEM;
 		}
-		if (!can_madv_dontneed_vma(vma))
+		if (!can_madv_lru_vma(vma))
 			return -EINVAL;
 		if (end > vma->vm_end) {
 			/*
@@ -689,6 +925,10 @@
 		return madvise_remove(vma, prev, start, end);
 	case MADV_WILLNEED:
 		return madvise_willneed(vma, prev, start, end);
+	case MADV_COLD:
+		return madvise_cold(vma, prev, start, end);
+	case MADV_PAGEOUT:
+		return madvise_pageout(vma, prev, start, end);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -710,6 +950,8 @@
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_FREE:
+	case MADV_COLD:
+	case MADV_PAGEOUT:
 #ifdef CONFIG_KSM
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
@@ -804,6 +1046,8 @@
 	size_t len;
 	struct blk_plug plug;
 
+	start = untagged_addr(start);
+
 	if (!madvise_behavior_valid(behavior))
 		return error;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f3c15bb..c313c49 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -57,6 +57,7 @@
 #include <linux/lockdep.h>
 #include <linux/file.h>
 #include <linux/tracehook.h>
+#include <linux/psi.h>
 #include <linux/seq_buf.h>
 #include "internal.h"
 #include <net/sock.h>
@@ -317,6 +318,7 @@
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
 
 static int memcg_shrinker_map_size;
 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -440,14 +442,6 @@
 	}
 }
 
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
-	return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -2270,21 +2264,22 @@
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
 		struct mem_cgroup *memcg;
+		bool flush = false;
 
+		rcu_read_lock();
 		memcg = stock->cached;
-		if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
-			continue;
-		if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
-			css_put(&memcg->css);
-			continue;
-		}
-		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+		if (memcg && stock->nr_pages &&
+		    mem_cgroup_is_descendant(memcg, root_memcg))
+			flush = true;
+		rcu_read_unlock();
+
+		if (flush &&
+		    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
 			if (cpu == curcpu)
 				drain_local_stock(&stock->work);
 			else
 				schedule_work_on(cpu, &stock->work);
 		}
-		css_put(&memcg->css);
 	}
 	put_cpu();
 	mutex_unlock(&percpu_charge_mutex);
@@ -2359,11 +2354,67 @@
 }
 
 /*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * long enough to cause a significant slowdown in most cases, while still
+ * allowing diagnostics and tracing to proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these on either side of the
+ * exponentiation to maintain precision and scale to a reasonable number of
+ * jiffies (see the table below).
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ *   overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
+ *   proposed penalty in order to reduce it to a reasonable number of jiffies,
+ *   and
+ *   to produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ *  +-------+------------------------+
+ *  | usage | time to allocate in ms |
+ *  +-------+------------------------+
+ *  | 100M  |                      0 |
+ *  | 101M  |                      6 |
+ *  | 102M  |                     25 |
+ *  | 103M  |                     57 |
+ *  | 104M  |                    102 |
+ *  | 105M  |                    159 |
+ *  | 106M  |                    230 |
+ *  | 107M  |                    313 |
+ *  | 108M  |                    409 |
+ *  | 109M  |                    518 |
+ *  | 110M  |                    639 |
+ *  | 111M  |                    774 |
+ *  | 112M  |                    921 |
+ *  | 113M  |                   1081 |
+ *  | 114M  |                   1254 |
+ *  | 115M  |                   1439 |
+ *  | 116M  |                   1638 |
+ *  | 117M  |                   1849 |
+ *  | 118M  |                   2000 |
+ *  | 119M  |                   2000 |
+ *  | 120M  |                   2000 |
+ *  +-------+------------------------+
+ */
+#define MEMCG_DELAY_PRECISION_SHIFT 20
+#define MEMCG_DELAY_SCALING_SHIFT 14
+
+/*
  * Scheduled by try_charge() to be executed from the userland return path
  * and reclaims memory over the high limit.
  */
 void mem_cgroup_handle_over_high(void)
 {
+	unsigned long usage, high, clamped_high;
+	unsigned long pflags;
+	unsigned long penalty_jiffies, overage;
 	unsigned int nr_pages = current->memcg_nr_pages_over_high;
 	struct mem_cgroup *memcg;
 
@@ -2372,8 +2423,75 @@
 
 	memcg = get_mem_cgroup_from_mm(current->mm);
 	reclaim_high(memcg, nr_pages, GFP_KERNEL);
-	css_put(&memcg->css);
 	current->memcg_nr_pages_over_high = 0;
+
+	/*
+	 * memory.high is breached and reclaim is unable to keep up. Throttle
+	 * allocators proactively to slow down excessive growth.
+	 *
+	 * We use overage compared to memory.high to calculate the number of
+	 * jiffies to sleep (penalty_jiffies). Ideally this value should be
+	 * fairly lenient on small overages, and increasingly harsh when the
+	 * memcg in question makes it clear that it has no intention of stopping
+	 * its crazy behaviour, so we exponentially increase the delay based on
+	 * overage amount.
+	 */
+
+	usage = page_counter_read(&memcg->memory);
+	high = READ_ONCE(memcg->high);
+
+	if (usage <= high)
+		goto out;
+
+	/*
+	 * Prevent division by 0 in overage calculation by acting as if it was a
+	 * threshold of 1 page
+	 */
+	clamped_high = max(high, 1UL);
+
+	overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
+			  clamped_high);
+
+	penalty_jiffies = ((u64)overage * overage * HZ)
+		>> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+
+	/*
+	 * Factor in the task's own contribution to the overage, such that four
+	 * N-sized allocations are throttled approximately the same as one
+	 * 4N-sized allocation.
+	 *
+	 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+	 * larger the current charge batch is than that.
+	 */
+	penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+
+	/*
+	 * Clamp the max delay per usermode return so as to still keep the
+	 * application moving forwards and also permit diagnostics, albeit
+	 * extremely slowly.
+	 */
+	penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+	/*
+	 * Don't sleep if the amount of jiffies this memcg owes us is so low
+	 * that it's not even worth doing, in an attempt to be nice to those who
+	 * go only a small amount over their memory.high value and maybe haven't
+	 * been aggressively reclaimed enough yet.
+	 */
+	if (penalty_jiffies <= HZ / 100)
+		goto out;
+
+	/*
+	 * If we exit early, we're guaranteed to die (since
+	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+	 * need to account for any ill-begotten jiffies to pay them off later.
+	 */
+	psi_memstall_enter(&pflags);
+	schedule_timeout_killable(penalty_jiffies);
+	psi_memstall_leave(&pflags);
+
+out:
+	css_put(&memcg->css);
 }
 
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
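As a cross-check of the table above: with HZ == 1000 and 4 KiB pages (so 100M
is 25600 pages), the same arithmetic in a standalone user-space program
reproduces the 110M row:

    #include <stdio.h>
    #include <stdint.h>

    #define HZ 1000ULL
    #define MEMCG_DELAY_PRECISION_SHIFT 20
    #define MEMCG_DELAY_SCALING_SHIFT 14

    int main(void)
    {
    	uint64_t high = 25600;	/* 100M in 4 KiB pages */
    	uint64_t usage = 28160;	/* 110M */
    	uint64_t overage, penalty;

    	overage = ((usage - high) << MEMCG_DELAY_PRECISION_SHIFT) / high;
    	penalty = (overage * overage * HZ)
    		>> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
    	printf("penalty = %llu jiffies\n", (unsigned long long)penalty);
    	return 0;	/* prints 639, matching the table */
    }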
@@ -2825,6 +2943,16 @@
 
 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
 	    !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+
+		/*
+		 * Enforce __GFP_NOFAIL allocation because callers are not
+		 * prepared to see failures and likely do not have any failure
+		 * handling code.
+		 */
+		if (gfp & __GFP_NOFAIL) {
+			page_counter_charge(&memcg->kmem, nr_pages);
+			return 0;
+		}
 		cancel_charge(memcg, nr_pages);
 		return -ENOMEM;
 	}
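For context on the hunk above: __GFP_NOFAIL callers by contract have no
failure path, so returning -ENOMEM here would hand NULL to code like the
(hypothetical) call below that never checks it; the charge is therefore forced
past the kmem limit instead:

    /* hypothetical call site in a cgroup already at its kmem limit */
    void *p = kmalloc(64, GFP_KERNEL | __GFP_NOFAIL);
    /* p is relied on to be non-NULL; there is no error handling */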
@@ -3512,6 +3640,9 @@
 			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
 			break;
 		case _KMEM:
+			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+				     "Please report your usecase to linux-mm@kvack.org if you "
+				     "depend on this functionality.\n");
 			ret = memcg_update_kmem_max(memcg, nr_pages);
 			break;
 		case _TCP:
@@ -4805,11 +4936,6 @@
 	}
 }
 
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
-{
-	mem_cgroup_id_get_many(memcg, 1);
-}
-
 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
 {
 	mem_cgroup_id_put_many(memcg, 1);
@@ -4955,6 +5081,11 @@
 		memcg->cgwb_frn[i].done =
 			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+	memcg->deferred_split_queue.split_queue_len = 0;
+#endif
 	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
 	return memcg;
 fail:
@@ -5333,6 +5464,14 @@
 		__mod_memcg_state(to, NR_WRITEBACK, nr_pages);
 	}
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (compound && !list_empty(page_deferred_list(page))) {
+		spin_lock(&from->deferred_split_queue.split_queue_lock);
+		list_del_init(page_deferred_list(page));
+		from->deferred_split_queue.split_queue_len--;
+		spin_unlock(&from->deferred_split_queue.split_queue_lock);
+	}
+#endif
 	/*
 	 * It is safe to change page->mem_cgroup here because the page
 	 * is referenced, charged, and isolated - we can't race with
@@ -5341,6 +5480,17 @@
 
 	/* caller should have done css_get */
 	page->mem_cgroup = to;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (compound && list_empty(page_deferred_list(page))) {
+		spin_lock(&to->deferred_split_queue.split_queue_lock);
+		list_add_tail(page_deferred_list(page),
+			      &to->deferred_split_queue.split_queue);
+		to->deferred_split_queue.split_queue_len++;
+		spin_unlock(&to->deferred_split_queue.split_queue_lock);
+	}
+#endif
+
 	spin_unlock_irqrestore(&from->move_lock, flags);
 
 	ret = 0;
@@ -6511,7 +6661,7 @@
 		unsigned int nr_pages = 1;
 
 		if (PageTransHuge(page)) {
-			nr_pages <<= compound_order(page);
+			nr_pages = compound_nr(page);
 			ug->nr_huge += nr_pages;
 		}
 		if (PageAnon(page))
@@ -6523,7 +6673,7 @@
 		}
 		ug->pgpgout++;
 	} else {
-		ug->nr_kmem += 1 << compound_order(page);
+		ug->nr_kmem += compound_nr(page);
 		__ClearPageKmemcg(page);
 	}
 
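The compound_nr() and page_size() conversions in this file, and in several
files below, use two helpers added to include/linux/mm.h earlier in this
series; they read roughly as:

    static inline unsigned long compound_nr(struct page *page)
    {
    	return 1UL << compound_order(page);	/* pages in this (compound) page */
    }

    static inline unsigned long page_size(struct page *page)
    {
    	return PAGE_SIZE << compound_order(page);	/* bytes in this page */
    }

The replacements are behaviour-neutral; they just name the idiom.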
diff --git a/mm/memfd.c b/mm/memfd.c
index 650e65a..2647c89 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -39,6 +39,7 @@
 	xas_for_each(xas, page, ULONG_MAX) {
 		if (xa_is_value(page))
 			continue;
+		page = find_subpage(page, xas->xa_index);
 		if (page_count(page) - page_mapcount(page) > 1)
 			xas_set_mark(xas, MEMFD_TAG_PINNED);
 
@@ -88,6 +89,7 @@
 			bool clear = true;
 			if (xa_is_value(page))
 				continue;
+			page = find_subpage(page, xas.xa_index);
 			if (page_count(page) - page_mapcount(page) != 1) {
 				/*
 				 * On the last scan, we clean up all those tags
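The find_subpage() calls are needed because the page cache now stores only
head pages (see the matching xas_store() changes in mm/migrate.c, mm/shmem.c
and mm/swap_state.c below), so a lookup may return a head page for any of its
tail indices. The helper, added to include/linux/pagemap.h by the same series,
is roughly:

    static inline struct page *find_subpage(struct page *head, pgoff_t index)
    {
    	/* HugeTLBfs wants the head page regardless of the index */
    	if (PageHuge(head))
    		return head;

    	return head + (index & (compound_nr(head) - 1));
    }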
diff --git a/mm/memory.c b/mm/memory.c
index b1dff75..b1ca51a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -518,7 +518,7 @@
 		 (long long)pte_val(pte), (long long)pmd_val(*pmd));
 	if (page)
 		dump_page(page, "bad pte");
-	pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
 		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
 	pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
 		 vma->vm_file,
@@ -1026,6 +1026,9 @@
 		if (pte_none(ptent))
 			continue;
 
+		if (need_resched())
+			break;
+
 		if (pte_present(ptent)) {
 			struct page *page;
 
@@ -1093,7 +1096,6 @@
 		if (unlikely(details))
 			continue;
 
-		entry = pte_to_swp_entry(ptent);
 		if (!non_swap_entry(entry))
 			rss[MM_SWAPENTS]--;
 		else if (is_migration_entry(entry)) {
@@ -1124,8 +1126,11 @@
 	if (force_flush) {
 		force_flush = 0;
 		tlb_flush_mmu(tlb);
-		if (addr != end)
-			goto again;
+	}
+
+	if (addr != end) {
+		cond_resched();
+		goto again;
 	}
 
 	return addr;
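Taken together, the two hunks above change the shape of the zap loop so that a
large munmap or exit can yield the CPU even when no TLB flush is pending;
previously the resched point was only reachable via the force_flush path.
Simplified sketch, with the pte handling elided:

    again:
    	for (; addr != end; addr += PAGE_SIZE) {
    		if (need_resched())
    			break;			/* leave with addr < end */
    		/* ... zap one pte ... */
    	}
    	if (force_flush)
    		tlb_flush_mmu(tlb);		/* flush before resuming */
    	if (addr != end) {
    		cond_resched();
    		goto again;
    	}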
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c73f099..b1be791 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -632,33 +632,30 @@
 #endif
 }
 
-static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
-{
-	unsigned long end = start + nr_pages;
-	int order, onlined_pages = 0;
-
-	while (start < end) {
-		order = min(MAX_ORDER - 1,
-			get_order(PFN_PHYS(end) - PFN_PHYS(start)));
-		(*online_page_callback)(pfn_to_page(start), order);
-
-		onlined_pages += (1UL << order);
-		start += (1UL << order);
-	}
-	return onlined_pages;
-}
-
 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
 			void *arg)
 {
-	unsigned long onlined_pages = *(unsigned long *)arg;
+	const unsigned long end_pfn = start_pfn + nr_pages;
+	unsigned long pfn;
+	int order;
 
-	if (PageReserved(pfn_to_page(start_pfn)))
-		onlined_pages += online_pages_blocks(start_pfn, nr_pages);
+	/*
+	 * Online the pages. The callback might decide to keep some pages
+	 * PG_reserved (to add them to the buddy later), but we still account
+	 * them as being online/belonging to this zone ("present").
+	 */
+	for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
+		order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
+		/* __free_pages_core() wants pfns to be aligned to the order */
+		if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
+			order = 0;
+		(*online_page_callback)(pfn_to_page(pfn), order);
+	}
 
-	online_mem_sections(start_pfn, start_pfn + nr_pages);
+	/* mark all involved sections as online */
+	online_mem_sections(start_pfn, end_pfn);
 
-	*(unsigned long *)arg = onlined_pages;
+	*(unsigned long *)arg += nr_pages;
 	return 0;
 }
 
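For concreteness: onlining a 128 MiB block (32768 4 KiB pages) at a pfn
aligned to 1 << 10, with MAX_ORDER == 11, hands the callback 32 chunks of
order 10; the IS_ALIGNED check only degrades to order 0 for a misaligned start
pfn. With those example numbers the loop above behaves like:

    /* 32768 pages starting at pfn 0x80000 (aligned to 1 << 10) */
    for (pfn = 0x80000; pfn < 0x88000; pfn += 1 << 10)
    	(*online_page_callback)(pfn_to_page(pfn), 10);	/* 32 calls */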
@@ -714,8 +711,13 @@
 		pgdat->node_start_pfn = start_pfn;
 
 	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
-}
 
+}
+/*
+ * Associate the pfn range with the given zone, initializing the memmaps
+ * and resizing the pgdat/zone data to span the added pages. After this
+ * call, all affected pages are PG_reserved.
+ */
 void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 		unsigned long nr_pages, struct vmem_altmap *altmap)
 {
@@ -804,20 +806,6 @@
 	return default_zone_for_pfn(nid, start_pfn, nr_pages);
 }
 
-/*
- * Associates the given pfn range with the given node and the zone appropriate
- * for the given online type.
- */
-static struct zone * __meminit move_pfn_range(int online_type, int nid,
-		unsigned long start_pfn, unsigned long nr_pages)
-{
-	struct zone *zone;
-
-	zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
-	move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL);
-	return zone;
-}
-
 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
 {
 	unsigned long flags;
@@ -840,7 +828,8 @@
 	put_device(&mem->dev);
 
 	/* associate pfn range with the zone */
-	zone = move_pfn_range(online_type, nid, pfn, nr_pages);
+	zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
+	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
 
 	arg.start_pfn = pfn;
 	arg.nr_pages = nr_pages;
@@ -864,6 +853,7 @@
 	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
 		online_pages_range);
 	if (ret) {
+		/* not a single memory resource was applicable */
 		if (need_zonelists_rebuild)
 			zone_pcp_reset(zone);
 		goto failed_addition;
@@ -877,27 +867,22 @@
 
 	shuffle_zone(zone);
 
-	if (onlined_pages) {
-		node_states_set_node(nid, &arg);
-		if (need_zonelists_rebuild)
-			build_all_zonelists(NULL);
-		else
-			zone_pcp_update(zone);
-	}
+	node_states_set_node(nid, &arg);
+	if (need_zonelists_rebuild)
+		build_all_zonelists(NULL);
+	else
+		zone_pcp_update(zone);
 
 	init_per_zone_wmark_min();
 
-	if (onlined_pages) {
-		kswapd_run(nid);
-		kcompactd_run(nid);
-	}
+	kswapd_run(nid);
+	kcompactd_run(nid);
 
 	vm_total_pages = nr_free_pagecache_pages();
 
 	writeback_set_ratelimit();
 
-	if (onlined_pages)
-		memory_notify(MEM_ONLINE, &arg);
+	memory_notify(MEM_ONLINE, &arg);
 	mem_hotplug_done();
 	return 0;
 
@@ -933,8 +918,11 @@
 		if (!pgdat)
 			return NULL;
 
+		pgdat->per_cpu_nodestats =
+			alloc_percpu(struct per_cpu_nodestat);
 		arch_refresh_nodedata(nid, pgdat);
 	} else {
+		int cpu;
 		/*
 		 * Reset the nr_zones, order and classzone_idx before reuse.
 		 * Note that kswapd will init kswapd_classzone_idx properly
@@ -943,6 +931,12 @@
 		pgdat->nr_zones = 0;
 		pgdat->kswapd_order = 0;
 		pgdat->kswapd_classzone_idx = 0;
+		for_each_online_cpu(cpu) {
+			struct per_cpu_nodestat *p;
+
+			p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+			memset(p, 0, sizeof(*p));
+		}
 	}
 
 	/* we can use NODE_DATA(nid) from here */
@@ -952,7 +946,6 @@
 
 	/* init node's zones as empty zones, we don't have any present pages.*/
 	free_area_init_core_hotplug(nid);
-	pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
 
 	/*
 	 * The node we allocated has no zone fallback lists. For avoiding
@@ -1309,7 +1302,7 @@
 		head = compound_head(page);
 		if (page_huge_active(head))
 			return pfn;
-		skip = (1 << compound_order(head)) - (page - head);
+		skip = compound_nr(head) - (page - head);
 		pfn += skip - 1;
 	}
 	return 0;
@@ -1347,7 +1340,7 @@
 
 		if (PageHuge(page)) {
 			struct page *head = compound_head(page);
-			pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
+			pfn = page_to_pfn(head) + compound_nr(head) - 1;
 			isolate_huge_page(head, &source);
 			continue;
 		} else if (PageTransHuge(page))
@@ -1662,7 +1655,7 @@
 		phys_addr_t beginpa, endpa;
 
 		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
-		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
+		endpa = beginpa + memory_block_size_bytes() - 1;
 		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
 			&beginpa, &endpa);
 
@@ -1800,7 +1793,7 @@
 {
 
 	/*
-	 * trigger BUG() is some memory is not offlined prior to calling this
+	 * trigger BUG() if some memory is not offlined prior to calling this
 	 * function
 	 */
 	if (try_remove_memory(nid, start, size))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f000771..4ae967b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1179,8 +1179,8 @@
 	} else if (PageTransHuge(page)) {
 		struct page *thp;
 
-		thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
-				address, numa_node_id());
+		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
+					 HPAGE_PMD_ORDER);
 		if (!thp)
 			return NULL;
 		prep_transhuge_page(thp);
@@ -1405,6 +1405,7 @@
 	int err;
 	unsigned short mode_flags;
 
+	start = untagged_addr(start);
 	mode_flags = mode & MPOL_MODE_FLAGS;
 	mode &= ~MPOL_MODE_FLAGS;
 	if (mode >= MPOL_MAX)
@@ -1512,10 +1513,6 @@
 	if (nodes_empty(*new))
 		goto out_put;
 
-	nodes_and(*new, *new, node_states[N_MEMORY]);
-	if (nodes_empty(*new))
-		goto out_put;
-
 	err = security_task_movememory(task);
 	if (err)
 		goto out_put;
@@ -1562,6 +1559,8 @@
 	int uninitialized_var(pval);
 	nodemask_t nodes;
 
+	addr = untagged_addr(addr);
+
 	if (nmask != NULL && maxnode < nr_node_ids)
 		return -EINVAL;
 
@@ -1733,7 +1732,7 @@
  * freeing by another task.  It is the caller's responsibility to free the
  * extra reference for shared policies.
  */
-struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
 						unsigned long addr)
 {
 	struct mempolicy *pol = __get_vma_policy(vma, addr);
@@ -2082,6 +2081,7 @@
  * 	@vma:  Pointer to VMA or NULL if not available.
  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
  *	@node: Which node to prefer for allocation (modulo policy).
+ *	@hugepage: for hugepages try only the preferred node if possible
  *
  * 	This function allocates a page from the kernel page pool and applies
  *	a NUMA policy associated with the VMA or the current process.
@@ -2092,7 +2092,7 @@
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, int node)
+		unsigned long addr, int node, bool hugepage)
 {
 	struct mempolicy *pol;
 	struct page *page;
@@ -2110,6 +2110,42 @@
 		goto out;
 	}
 
+	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
+		int hpage_node = node;
+
+		/*
+		 * For hugepage allocation and non-interleave policy which
+		 * allows the current node (or other explicitly preferred
+		 * node) we only try to allocate from the current/preferred
+		 * node and don't fall back to other nodes, as the cost of
+		 * remote accesses would likely offset THP benefits.
+		 *
+		 * If the policy is interleave, or does not allow the current
+		 * node in its nodemask, we allocate the standard way.
+		 */
+		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
+			hpage_node = pol->v.preferred_node;
+
+		nmask = policy_nodemask(gfp, pol);
+		if (!nmask || node_isset(hpage_node, *nmask)) {
+			mpol_cond_put(pol);
+			page = __alloc_pages_node(hpage_node,
+						gfp | __GFP_THISNODE, order);
+
+			/*
+			 * If hugepage allocations are configured to always
+			 * use synchronous compaction, or the vma has been
+			 * madvised to prefer hugepage backing, retry allowing
+			 * remote memory as well.
+			 */
+			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
+				page = __alloc_pages_node(hpage_node,
+						gfp | __GFP_NORETRY, order);
+
+			goto out;
+		}
+	}
+
 	nmask = policy_nodemask(gfp, pol);
 	preferred_nid = policy_node(gfp, pol, node);
 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
diff --git a/mm/migrate.c b/mm/migrate.c
index 9f4ed4e..4fe45d1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -460,7 +460,7 @@
 
 		for (i = 1; i < HPAGE_PMD_NR; i++) {
 			xas_next(&xas);
-			xas_store(&xas, newpage + i);
+			xas_store(&xas, newpage);
 		}
 	}
 
@@ -1612,7 +1612,7 @@
 			goto out_flush;
 		if (get_user(node, nodes + i))
 			goto out_flush;
-		addr = (unsigned long)p;
+		addr = (unsigned long)untagged_addr(p);
 
 		err = -ENODEV;
 		if (node < 0 || node >= MAX_NUMNODES)
@@ -1892,7 +1892,7 @@
 	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
 
 	/* Avoid migrating to a node that is nearly full */
-	if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
+	if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
 		return 0;
 
 	if (isolate_lru_page(page))
@@ -2218,17 +2218,15 @@
 		pte_t pte;
 
 		pte = *ptep;
-		pfn = pte_pfn(pte);
 
 		if (pte_none(pte)) {
 			mpfn = MIGRATE_PFN_MIGRATE;
 			migrate->cpages++;
-			pfn = 0;
 			goto next;
 		}
 
 		if (!pte_present(pte)) {
-			mpfn = pfn = 0;
+			mpfn = 0;
 
 			/*
 			 * Only care about unaddressable device page special
@@ -2245,10 +2243,10 @@
 			if (is_write_device_private_entry(entry))
 				mpfn |= MIGRATE_PFN_WRITE;
 		} else {
+			pfn = pte_pfn(pte);
 			if (is_zero_pfn(pfn)) {
 				mpfn = MIGRATE_PFN_MIGRATE;
 				migrate->cpages++;
-				pfn = 0;
 				goto next;
 			}
 			page = vm_normal_page(migrate->vma, addr, pte);
@@ -2258,10 +2256,9 @@
 
 		/* FIXME support THP */
 		if (!page || !page->mapping || PageTransCompound(page)) {
-			mpfn = pfn = 0;
+			mpfn = 0;
 			goto next;
 		}
-		pfn = page_to_pfn(page);
 
 		/*
 		 * By getting a reference on the page we pin it and that blocks
diff --git a/mm/mincore.c b/mm/mincore.c
index f9a9dbe..49b6fa2 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -256,6 +256,8 @@
 	unsigned long pages;
 	unsigned char *tmp;
 
+	start = untagged_addr(start);
+
 	/* Check the start address: needs to be page-aligned.. */
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
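This is the same one-line untagged_addr() fix-up that appears in mm/madvise.c
above and in mm/mlock.c, mm/mmap.c, mm/mprotect.c, mm/mremap.c and mm/msync.c
below: on arm64, user pointers may carry a tag in the top byte, which must be
stripped before the address is validated or page-aligned. Conceptually (a
user-space illustration only, not the kernel macro; on arm64 it amounts to a
sign-extension from bit 55):

    #include <stdint.h>

    static uint64_t untag(uint64_t addr)
    {
    	/* drop bits 63:56 by sign-extending from bit 55 */
    	return (uint64_t)((int64_t)(addr << 8) >> 8);
    }
    /* untag(0x5a00ffff12345000) == 0x0000ffff12345000 */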
diff --git a/mm/mlock.c b/mm/mlock.c
index a90099da4..a72c1ee 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -674,6 +674,8 @@
 	unsigned long lock_limit;
 	int error = -ENOMEM;
 
+	start = untagged_addr(start);
+
 	if (!can_do_mlock())
 		return -EPERM;
 
@@ -735,6 +737,8 @@
 {
 	int ret;
 
+	start = untagged_addr(start);
+
 	len = PAGE_ALIGN(len + (offset_in_page(start)));
 	start &= PAGE_MASK;
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 6bc21fca..a7d8c84 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -201,6 +201,8 @@
 	bool downgraded = false;
 	LIST_HEAD(uf);
 
+	brk = untagged_addr(brk);
+
 	if (down_write_killable(&mm->mmap_sem))
 		return -EINTR;
 
@@ -289,9 +291,9 @@
 	return retval;
 }
 
-static long vma_compute_subtree_gap(struct vm_area_struct *vma)
+static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
 {
-	unsigned long max, prev_end, subtree_gap;
+	unsigned long gap, prev_end;
 
 	/*
 	 * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
@@ -299,14 +301,21 @@
 	 * an unmapped area; whereas when expanding we only require one.
 	 * That's a little inconsistent, but keeps the code here simpler.
 	 */
-	max = vm_start_gap(vma);
+	gap = vm_start_gap(vma);
 	if (vma->vm_prev) {
 		prev_end = vm_end_gap(vma->vm_prev);
-		if (max > prev_end)
-			max -= prev_end;
+		if (gap > prev_end)
+			gap -= prev_end;
 		else
-			max = 0;
+			gap = 0;
 	}
+	return gap;
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
+{
+	unsigned long max = vma_compute_gap(vma), subtree_gap;
 	if (vma->vm_rb.rb_left) {
 		subtree_gap = rb_entry(vma->vm_rb.rb_left,
 				struct vm_area_struct, vm_rb)->rb_subtree_gap;
@@ -322,7 +331,6 @@
 	return max;
 }
 
-#ifdef CONFIG_DEBUG_VM_RB
 static int browse_rb(struct mm_struct *mm)
 {
 	struct rb_root *root = &mm->mm_rb;
@@ -428,8 +436,9 @@
 #define validate_mm(mm) do { } while (0)
 #endif
 
-RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
-		     unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
+RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
+			 struct vm_area_struct, vm_rb,
+			 unsigned long, rb_subtree_gap, vma_compute_gap)
 
 /*
  * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
@@ -439,8 +448,8 @@
 static void vma_gap_update(struct vm_area_struct *vma)
 {
 	/*
-	 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
-	 * function that does exactly what we want.
+	 * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
+	 * a callback function that does exactly what we want.
 	 */
 	vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
 }
@@ -1358,6 +1367,9 @@
 	if (S_ISBLK(inode->i_mode))
 		return MAX_LFS_FILESIZE;
 
+	if (S_ISSOCK(inode->i_mode))
+		return MAX_LFS_FILESIZE;
+
 	/* Special "we do even unsigned file positions" case */
 	if (file->f_mode & FMODE_UNSIGNED_OFFSET)
 		return 0;
@@ -1577,6 +1589,8 @@
 	struct file *file = NULL;
 	unsigned long retval;
 
+	addr = untagged_addr(addr);
+
 	if (!(flags & MAP_ANONYMOUS)) {
 		audit_mmap_fd(fd, flags);
 		file = fget(fd);
@@ -2274,12 +2288,9 @@
 	if (vma) {
 		*pprev = vma->vm_prev;
 	} else {
-		struct rb_node *rb_node = mm->mm_rb.rb_node;
-		*pprev = NULL;
-		while (rb_node) {
-			*pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
-			rb_node = rb_node->rb_right;
-		}
+		struct rb_node *rb_node = rb_last(&mm->mm_rb);
+
+		*pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
 	}
 	return vma;
 }
@@ -2878,6 +2889,7 @@
 
 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
 {
+	addr = untagged_addr(addr);
 	profile_munmap(addr);
 	return __vm_munmap(addr, len, true);
 }
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 8c943a6..7d70e5c 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -271,8 +271,6 @@
 
 	tlb_flush_mmu(tlb);
 
-	/* keep the page table cache within bounds */
-	check_pgt_cache();
 #ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
 	tlb_batch_list_free(tlb);
 #endif
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 675e5d3..7967825 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -459,6 +459,8 @@
 	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
 				(prot & PROT_READ);
 
+	start = untagged_addr(start);
+
 	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
 	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
 		return -EINVAL;
diff --git a/mm/mremap.c b/mm/mremap.c
index fc241d2..1fc8a29 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -606,6 +606,9 @@
 	LIST_HEAD(uf_unmap_early);
 	LIST_HEAD(uf_unmap);
 
+	addr = untagged_addr(addr);
+	new_addr = untagged_addr(new_addr);
+
 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
 		return ret;
 
diff --git a/mm/msync.c b/mm/msync.c
index ef30a42..c3bd3e7 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -37,6 +37,8 @@
 	int unmapped_error = 0;
 	int error = -EINVAL;
 
+	start = untagged_addr(start);
+
 	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
 		goto out;
 	if (offset_in_page(start))
diff --git a/mm/nommu.c b/mm/nommu.c
index fed1b6e..99b7ec3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -108,7 +108,7 @@
 	 * The ksize() function is only guaranteed to work for pointers
 	 * returned by kmalloc(). So handle arbitrary pointers here.
 	 */
-	return PAGE_SIZE << compound_order(page);
+	return page_size(page);
 }
 
 /**
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eda2e2a..71e3ace 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -73,7 +73,7 @@
 /**
 * oom_cpuset_eligible() - check task eligibility for kill
  * @start: task struct of which task to consider
- * @mask: nodemask passed to page allocator for mempolicy ooms
+ * @oc: pointer to struct oom_control
  *
  * Task eligibility is determined by whether or not a candidate task, @tsk,
  * shares the same mempolicy nodes as current if it is bound by such a policy
@@ -287,7 +287,7 @@
 	    !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
 		oc->totalpages = total_swap_pages;
 		for_each_node_mask(nid, *oc->nodemask)
-			oc->totalpages += node_spanned_pages(nid);
+			oc->totalpages += node_present_pages(nid);
 		return CONSTRAINT_MEMORY_POLICY;
 	}
 
@@ -300,7 +300,7 @@
 	if (cpuset_limited) {
 		oc->totalpages = total_swap_pages;
 		for_each_node_mask(nid, cpuset_current_mems_allowed)
-			oc->totalpages += node_spanned_pages(nid);
+			oc->totalpages += node_present_pages(nid);
 		return CONSTRAINT_CPUSET;
 	}
 	return CONSTRAINT_NONE;
@@ -523,7 +523,7 @@
 	set_bit(MMF_UNSTABLE, &mm->flags);
 
 	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
-		if (!can_madv_dontneed_vma(vma))
+		if (!can_madv_lru_vma(vma))
 			continue;
 
 		/*
@@ -884,12 +884,13 @@
 	 */
 	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
 	mark_oom_victim(victim);
-	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
-		message, task_pid_nr(victim), victim->comm,
-		K(victim->mm->total_vm),
-		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
-		K(get_mm_counter(victim->mm, MM_FILEPAGES)),
-		K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
+	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
+		message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
+		K(get_mm_counter(mm, MM_ANONPAGES)),
+		K(get_mm_counter(mm, MM_FILEPAGES)),
+		K(get_mm_counter(mm, MM_SHMEMPAGES)),
+		from_kuid(&init_user_ns, task_uid(victim)),
+		mm_pgtables_bytes(mm), victim->signal->oom_score_adj);
 	task_unlock(victim);
 
 	/*
@@ -1068,9 +1069,10 @@
 	 * The OOM killer does not compensate for IO-less reclaim.
 	 * pagefault_out_of_memory lost its gfp context so we have to
 	 * make sure exclude 0 mask - all other users should have at least
-	 * ___GFP_DIRECT_RECLAIM to get here.
+	 * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
+	 * invoke the OOM killer even if it is a GFP_NOFS allocation.
 	 */
-	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
+	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
 		return true;
 
 	/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ff5484f..15c2050 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -670,6 +670,7 @@
 
 void free_compound_page(struct page *page)
 {
+	mem_cgroup_uncharge(page);
 	__free_pages_ok(page, compound_order(page));
 }
 
@@ -3955,17 +3956,25 @@
 		goto check_priority;
 
 	/*
-	 * make sure the compaction wasn't deferred or didn't bail out early
-	 * due to locks contention before we declare that we should give up.
-	 * But do not retry if the given zonelist is not suitable for
-	 * compaction.
+	 * compaction was skipped because there are not enough order-0 pages
+	 * to work with, so we retry only if it looks like reclaim can help.
 	 */
-	if (compaction_withdrawn(compact_result)) {
+	if (compaction_needs_reclaim(compact_result)) {
 		ret = compaction_zonelist_suitable(ac, order, alloc_flags);
 		goto out;
 	}
 
 	/*
+	 * make sure the compaction wasn't deferred or didn't bail out early
+	 * due to locks contention before we declare that we should give up.
+	 * But the next retry should use a higher priority if allowed, so
+	 * we don't just keep bailing out endlessly.
+	 */
+	if (compaction_withdrawn(compact_result)) {
+		goto check_priority;
+	}
+
+	/*
 	 * !costly requests are much more important than __GFP_RETRY_MAYFAIL
 	 * costly ones because they are de facto nofail and invoke OOM
 	 * killer to move on while costly can fail and users are ready
@@ -4458,6 +4467,28 @@
 		if (page)
 			goto got_pg;
 
+		if (order >= pageblock_order && (gfp_mask & __GFP_IO)) {
+			/*
+			 * If allocating entire pageblock(s) and compaction
+			 * failed because all zones are below low watermarks
+			 * or is prohibited because it recently failed at this
+			 * order, fail immediately.
+			 *
+			 * Reclaim is
+			 *  - potentially very expensive because zones are far
+			 *    below their low watermarks or this is part of very
+			 *    bursty high order allocations,
+			 *  - not guaranteed to help because isolate_freepages()
+			 *    may not iterate over freed pages as part of its
+			 *    linear scan, and
+			 *  - unlikely to make entire pageblocks free on its
+			 *    own.
+			 */
+			if (compact_result == COMPACT_SKIPPED ||
+			    compact_result == COMPACT_DEFERRED)
+				goto nopage;
+		}
+
 		/*
 		 * Checks for costly allocations with __GFP_NORETRY, which
 		 * includes THP page fault allocations
@@ -6638,9 +6669,11 @@
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void pgdat_init_split_queue(struct pglist_data *pgdat)
 {
-	spin_lock_init(&pgdat->split_queue_lock);
-	INIT_LIST_HEAD(&pgdat->split_queue);
-	pgdat->split_queue_len = 0;
+	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
+
+	spin_lock_init(&ds_queue->split_queue_lock);
+	INIT_LIST_HEAD(&ds_queue->split_queue);
+	ds_queue->split_queue_len = 0;
 }
 #else
 static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
@@ -8196,7 +8229,7 @@
 			if (!hugepage_migration_supported(page_hstate(head)))
 				goto unmovable;
 
-			skip_pages = (1 << compound_order(head)) - (page - head);
+			skip_pages = compound_nr(head) - (page - head);
 			iter += skip_pages - 1;
 			continue;
 		}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index addcbb2..dee9311 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -24,6 +24,9 @@
 	short last_migrate_reason;
 	gfp_t gfp_mask;
 	depot_stack_handle_t handle;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	depot_stack_handle_t free_handle;
+#endif
 };
 
 static bool page_owner_disabled = true;
@@ -102,19 +105,6 @@
 	return (void *)page_ext + page_owner_ops.offset;
 }
 
-void __reset_page_owner(struct page *page, unsigned int order)
-{
-	int i;
-	struct page_ext *page_ext;
-
-	for (i = 0; i < (1 << order); i++) {
-		page_ext = lookup_page_ext(page + i);
-		if (unlikely(!page_ext))
-			continue;
-		__clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
-	}
-}
-
 static inline bool check_recursive_alloc(unsigned long *entries,
 					 unsigned int nr_entries,
 					 unsigned long ip)
@@ -154,18 +144,50 @@
 	return handle;
 }
 
-static inline void __set_page_owner_handle(struct page_ext *page_ext,
-	depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
+void __reset_page_owner(struct page *page, unsigned int order)
 {
+	int i;
+	struct page_ext *page_ext;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	depot_stack_handle_t handle = 0;
 	struct page_owner *page_owner;
 
-	page_owner = get_page_owner(page_ext);
-	page_owner->handle = handle;
-	page_owner->order = order;
-	page_owner->gfp_mask = gfp_mask;
-	page_owner->last_migrate_reason = -1;
+	if (debug_pagealloc_enabled())
+		handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+#endif
 
-	__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+	for (i = 0; i < (1 << order); i++) {
+		page_ext = lookup_page_ext(page + i);
+		if (unlikely(!page_ext))
+			continue;
+		__clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags);
+#ifdef CONFIG_DEBUG_PAGEALLOC
+		if (debug_pagealloc_enabled()) {
+			page_owner = get_page_owner(page_ext);
+			page_owner->free_handle = handle;
+		}
+#endif
+	}
+}
+
+static inline void __set_page_owner_handle(struct page *page,
+	struct page_ext *page_ext, depot_stack_handle_t handle,
+	unsigned int order, gfp_t gfp_mask)
+{
+	struct page_owner *page_owner;
+	int i;
+
+	for (i = 0; i < (1 << order); i++) {
+		page_owner = get_page_owner(page_ext);
+		page_owner->handle = handle;
+		page_owner->order = order;
+		page_owner->gfp_mask = gfp_mask;
+		page_owner->last_migrate_reason = -1;
+		__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+		__set_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags);
+
+		page_ext = lookup_page_ext(page + i);
+	}
 }
 
 noinline void __set_page_owner(struct page *page, unsigned int order,
@@ -178,7 +200,7 @@
 		return;
 
 	handle = save_stack(gfp_mask);
-	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
+	__set_page_owner_handle(page, page_ext, handle, order, gfp_mask);
 }
 
 void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -204,8 +226,11 @@
 
 	page_owner = get_page_owner(page_ext);
 	page_owner->order = 0;
-	for (i = 1; i < (1 << order); i++)
-		__copy_page_owner(page, page + i);
+	for (i = 1; i < (1 << order); i++) {
+		page_ext = lookup_page_ext(page + i);
+		page_owner = get_page_owner(page_ext);
+		page_owner->order = 0;
+	}
 }
 
 void __copy_page_owner(struct page *oldpage, struct page *newpage)
@@ -235,6 +260,7 @@
 	 * the new page, which will be freed.
 	 */
 	__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+	__set_bit(PAGE_EXT_OWNER_ACTIVE, &new_ext->flags);
 }
 
 void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -294,7 +320,7 @@
 			if (unlikely(!page_ext))
 				continue;
 
-			if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+			if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
 				continue;
 
 			page_owner = get_page_owner(page_ext);
@@ -405,20 +431,36 @@
 	mt = gfpflags_to_migratetype(gfp_mask);
 
 	if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
-		pr_alert("page_owner info is not active (free page?)\n");
+		pr_alert("page_owner info is not present (never set?)\n");
 		return;
 	}
 
+	if (test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
+		pr_alert("page_owner tracks the page as allocated\n");
+	else
+		pr_alert("page_owner tracks the page as freed\n");
+
+	pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
+		 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
+
 	handle = READ_ONCE(page_owner->handle);
 	if (!handle) {
-		pr_alert("page_owner info is not active (free page?)\n");
-		return;
+		pr_alert("page_owner allocation stack trace missing\n");
+	} else {
+		nr_entries = stack_depot_fetch(handle, &entries);
+		stack_trace_print(entries, nr_entries, 0);
 	}
 
-	nr_entries = stack_depot_fetch(handle, &entries);
-	pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
-		 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
-	stack_trace_print(entries, nr_entries, 0);
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	handle = READ_ONCE(page_owner->free_handle);
+	if (!handle) {
+		pr_alert("page_owner free stack trace missing\n");
+	} else {
+		nr_entries = stack_depot_fetch(handle, &entries);
+		pr_alert("page last free stack trace:\n");
+		stack_trace_print(entries, nr_entries, 0);
+	}
+#endif
 
 	if (page_owner->last_migrate_reason != -1)
 		pr_alert("page has been migrated, last migrate reason: %s\n",
@@ -481,9 +523,23 @@
 		if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
 			continue;
 
+		/*
+		 * Although we do have the info about past allocation of free
+		 * pages, it's not relevant for current memory usage.
+		 */
+		if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
+			continue;
+
 		page_owner = get_page_owner(page_ext);
 
 		/*
+		 * Don't print "tail" pages of high-order allocations as that
+		 * would inflate the stats.
+		 */
+		if (!IS_ALIGNED(pfn, 1 << page_owner->order))
+			continue;
+
+		/*
 		 * Access to page_ext->handle isn't synchronous so we should
 		 * be careful to access it.
 		 */
@@ -562,7 +618,8 @@
 				continue;
 
 			/* Found early allocated page */
-			__set_page_owner_handle(page_ext, early_handle, 0, 0);
+			__set_page_owner_handle(page, page_ext, early_handle,
+						0, 0);
 			count++;
 		}
 		cond_resched();
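With CONFIG_PAGE_OWNER=y and the page_owner=on boot parameter, the upshot of
the changes above is that dumping a freed page now reports both directions of
its life cycle (the free stack only under CONFIG_DEBUG_PAGEALLOC).
Illustrative output composed from the pr_alert() formats in this hunk; the
gfp value is an example, not taken from a real dump:

    page_owner tracks the page as freed
    page last allocated via order 0, migratetype Movable, gfp_mask 0x100cca(GFP_HIGHUSER_MOVABLE)
     ... allocation stack trace ...
    page last free stack trace:
     ... free stack trace ...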
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 21d4f97..34b9181 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -101,7 +101,7 @@
 	/*
 	 * Page poisoning when enabled poisons each and every page
 	 * that is freed to buddy. Thus no extra check is done to
-	 * see if a page was posioned.
+	 * see if a page was poisoned.
 	 */
 	check_poison_mem(addr, PAGE_SIZE);
 	kunmap_atomic(addr);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 11df03e..eff4b45 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -153,8 +153,7 @@
 
 	if (unlikely(PageHuge(pvmw->page))) {
 		/* when pud is not present, pte will be NULL */
-		pvmw->pte = huge_pte_offset(mm, pvmw->address,
-					    PAGE_SIZE << compound_order(page));
+		pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
 		if (!pvmw->pte)
 			return false;
 
diff --git a/mm/quicklist.c b/mm/quicklist.c
deleted file mode 100644
index 5e98ac7..0000000
--- a/mm/quicklist.c
+++ /dev/null
@@ -1,103 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Quicklist support.
- *
- * Quicklists are light weight lists of pages that have a defined state
- * on alloc and free. Pages must be in the quicklist specific defined state
- * (zero by default) when the page is freed. It seems that the initial idea
- * for such lists first came from Dave Miller and then various other people
- * improved on it.
- *
- * Copyright (C) 2007 SGI,
- * 	Christoph Lameter <cl@linux.com>
- * 		Generalized, added support for multiple lists and
- * 		constructors / destructors.
- */
-#include <linux/kernel.h>
-
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/quicklist.h>
-
-DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
-
-#define FRACTION_OF_NODE_MEM	16
-
-static unsigned long max_pages(unsigned long min_pages)
-{
-	unsigned long node_free_pages, max;
-	int node = numa_node_id();
-	struct zone *zones = NODE_DATA(node)->node_zones;
-	int num_cpus_on_node;
-
-	node_free_pages =
-#ifdef CONFIG_ZONE_DMA
-		zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
-		zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) +
-#endif
-		zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
-
-	max = node_free_pages / FRACTION_OF_NODE_MEM;
-
-	num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
-	max /= num_cpus_on_node;
-
-	return max(max, min_pages);
-}
-
-static long min_pages_to_free(struct quicklist *q,
-	unsigned long min_pages, long max_free)
-{
-	long pages_to_free;
-
-	pages_to_free = q->nr_pages - max_pages(min_pages);
-
-	return min(pages_to_free, max_free);
-}
-
-/*
- * Trim down the number of pages in the quicklist
- */
-void quicklist_trim(int nr, void (*dtor)(void *),
-	unsigned long min_pages, unsigned long max_free)
-{
-	long pages_to_free;
-	struct quicklist *q;
-
-	q = &get_cpu_var(quicklist)[nr];
-	if (q->nr_pages > min_pages) {
-		pages_to_free = min_pages_to_free(q, min_pages, max_free);
-
-		while (pages_to_free > 0) {
-			/*
-			 * We pass a gfp_t of 0 to quicklist_alloc here
-			 * because we will never call into the page allocator.
-			 */
-			void *p = quicklist_alloc(nr, 0, NULL);
-
-			if (dtor)
-				dtor(p);
-			free_page((unsigned long)p);
-			pages_to_free--;
-		}
-	}
-	put_cpu_var(quicklist);
-}
-
-unsigned long quicklist_total_size(void)
-{
-	unsigned long count = 0;
-	int cpu;
-	struct quicklist *ql, *q;
-
-	for_each_online_cpu(cpu) {
-		ql = per_cpu(quicklist, cpu);
-		for (q = ql; q < ql + CONFIG_NR_QUICK; q++)
-			count += q->nr_pages;
-	}
-	return count;
-}
-
diff --git a/mm/rmap.c b/mm/rmap.c
index 003377e..d9a23bb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -898,15 +898,13 @@
 	 */
 	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
 				0, vma, vma->vm_mm, address,
-				min(vma->vm_end, address +
-				    (PAGE_SIZE << compound_order(page))));
+				min(vma->vm_end, address + page_size(page)));
 	mmu_notifier_invalidate_range_start(&range);
 
 	while (page_vma_mapped_walk(&pvmw)) {
-		unsigned long cstart;
 		int ret = 0;
 
-		cstart = address = pvmw.address;
+		address = pvmw.address;
 		if (pvmw.pte) {
 			pte_t entry;
 			pte_t *pte = pvmw.pte;
@@ -933,7 +931,6 @@
 			entry = pmd_wrprotect(entry);
 			entry = pmd_mkclean(entry);
 			set_pmd_at(vma->vm_mm, address, pmd, entry);
-			cstart &= PMD_MASK;
 			ret = 1;
 #else
 			/* unexpected pmd-mapped page? */
@@ -1192,8 +1189,10 @@
 		}
 		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
 			goto out;
-		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
-		__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+		if (PageSwapBacked(page))
+			__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+		else
+			__inc_node_page_state(page, NR_FILE_PMDMAPPED);
 	} else {
 		if (PageTransCompound(page) && page_mapping(page)) {
 			VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1232,8 +1231,10 @@
 		}
 		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
 			goto out;
-		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
-		__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+		if (PageSwapBacked(page))
+			__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+		else
+			__dec_node_page_state(page, NR_FILE_PMDMAPPED);
 	} else {
 		if (!atomic_add_negative(-1, &page->_mapcount))
 			goto out;
@@ -1374,8 +1375,7 @@
 	 */
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
 				address,
-				min(vma->vm_end, address +
-				    (PAGE_SIZE << compound_order(page))));
+				min(vma->vm_end, address + page_size(page)));
 	if (PageHuge(page)) {
 		/*
 		 * If sharing is possible, start and end will be adjusted
@@ -1524,8 +1524,7 @@
 		if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
 			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
 			if (PageHuge(page)) {
-				int nr = 1 << compound_order(page);
-				hugetlb_count_sub(nr, mm);
+				hugetlb_count_sub(compound_nr(page), mm);
 				set_huge_swap_pte_at(mm, address,
 						     pvmw.pte, pteval,
 						     vma_mmu_pagesize(vma));
diff --git a/mm/shmem.c b/mm/shmem.c
index 0f7fd4a..cd570cc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -609,7 +609,7 @@
 {
 	XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
 	unsigned long i = 0;
-	unsigned long nr = 1UL << compound_order(page);
+	unsigned long nr = compound_nr(page);
 
 	VM_BUG_ON_PAGE(PageTail(page), page);
 	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -631,7 +631,7 @@
 		if (xas_error(&xas))
 			goto unlock;
 next:
-		xas_store(&xas, page + i);
+		xas_store(&xas, page);
 		if (++i < nr) {
 			xas_next(&xas);
 			goto next;
@@ -1481,7 +1481,7 @@
 
 	shmem_pseudo_vma_init(&pvma, info, hindex);
 	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
+			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
 	shmem_pseudo_vma_destroy(&pvma);
 	if (page)
 		prep_transhuge_page(page);
@@ -1734,7 +1734,7 @@
  * vm. If we swap it in we mark it dirty since we also free the swap
  * entry since a page cannot live in both the swap and page cache.
  *
- * fault_mm and fault_type are only supplied by shmem_fault:
+ * vmf and fault_type are only supplied by shmem_fault:
  * otherwise they are NULL.
  */
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
@@ -1884,7 +1884,7 @@
 	lru_cache_add_anon(page);
 
 	spin_lock_irq(&info->lock);
-	info->alloced += 1 << compound_order(page);
+	info->alloced += compound_nr(page);
 	inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
 	shmem_recalc_inode(inode);
 	spin_unlock_irq(&info->lock);
@@ -1925,7 +1925,7 @@
 		struct page *head = compound_head(page);
 		int i;
 
-		for (i = 0; i < (1 << compound_order(head)); i++) {
+		for (i = 0; i < compound_nr(head); i++) {
 			clear_highpage(head + i);
 			flush_dcache_page(head + i);
 		}
@@ -1952,7 +1952,7 @@
 	 * Error recovery.
 	 */
 unacct:
-	shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
+	shmem_inode_unacct_blocks(inode, compound_nr(page));
 
 	if (PageTransHuge(page)) {
 		unlock_page(page);
diff --git a/mm/slab.h b/mm/slab.h
index 9057b80..68e455f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -30,6 +30,69 @@
 	struct list_head list;	/* List of all slab caches on the system */
 };
 
+#else /* !CONFIG_SLOB */
+
+struct memcg_cache_array {
+	struct rcu_head rcu;
+	struct kmem_cache *entries[0];
+};
+
+/*
+ * This is the main placeholder for memcg-related information in kmem caches.
+ * Both the root cache and the child caches will have it. For the root cache,
+ * this will hold a dynamically allocated array large enough to hold
+ * information about the currently limited memcgs in the system. To allow the
+ * array to be accessed without taking any locks, on relocation we free the old
+ * version only after a grace period.
+ *
+ * Root and child caches hold different metadata.
+ *
+ * @root_cache:	Common to root and child caches.  NULL for root, pointer to
+ *		the root cache for children.
+ *
+ * The following fields are specific to root caches.
+ *
+ * @memcg_caches: kmemcg ID indexed table of child caches.  This table is
+ *		used to index child caches during allocation and cleared
+ *		early during shutdown.
+ *
+ * @root_caches_node: List node for slab_root_caches list.
+ *
+ * @children:	List of all child caches.  While the child caches are also
+ *		reachable through @memcg_caches, a child cache remains on
+ *		this list until it is actually destroyed.
+ *
+ * The following fields are specific to child caches.
+ *
+ * @memcg:	Pointer to the memcg this cache belongs to.
+ *
+ * @children_node: List node for @root_cache->children list.
+ *
+ * @kmem_caches_node: List node for @memcg->kmem_caches list.
+ */
+struct memcg_cache_params {
+	struct kmem_cache *root_cache;
+	union {
+		struct {
+			struct memcg_cache_array __rcu *memcg_caches;
+			struct list_head __root_caches_node;
+			struct list_head children;
+			bool dying;
+		};
+		struct {
+			struct mem_cgroup *memcg;
+			struct list_head children_node;
+			struct list_head kmem_caches_node;
+			struct percpu_ref refcnt;
+
+			void (*work_fn)(struct kmem_cache *);
+			union {
+				struct rcu_head rcu_head;
+				struct work_struct work;
+			};
+		};
+	};
+};
 #endif /* CONFIG_SLOB */
 
 #ifdef CONFIG_SLAB
@@ -174,6 +237,7 @@
 void __kmemcg_cache_deactivate(struct kmem_cache *s);
 void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
 void slab_kmem_cache_release(struct kmem_cache *);
+void kmem_cache_shrink_all(struct kmem_cache *s);
 
 struct seq_file;
 struct file;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 807490f..6491c3a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -981,6 +981,43 @@
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
 
+/**
+ * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache
+ * @s: The cache pointer
+ */
+void kmem_cache_shrink_all(struct kmem_cache *s)
+{
+	struct kmem_cache *c;
+
+	if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) {
+		kmem_cache_shrink(s);
+		return;
+	}
+
+	get_online_cpus();
+	get_online_mems();
+	kasan_cache_shrink(s);
+	__kmem_cache_shrink(s);
+
+	/*
+	 * We have to take the slab_mutex to protect from the memcg list
+	 * modification.
+	 */
+	mutex_lock(&slab_mutex);
+	for_each_memcg_cache(c, s) {
+		/*
+		 * Don't need to shrink deactivated memcg caches.
+		 */
+		if (s->flags & SLAB_DEACTIVATED)
+			continue;
+		kasan_cache_shrink(c);
+		__kmem_cache_shrink(c);
+	}
+	mutex_unlock(&slab_mutex);
+	put_online_mems();
+	put_online_cpus();
+}
+
 bool slab_is_available(void)
 {
 	return slab_state >= UP;
diff --git a/mm/slob.c b/mm/slob.c
index 7f421d0..cf377be 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -539,7 +539,7 @@
 
 	sp = virt_to_page(block);
 	if (unlikely(!PageSlab(sp)))
-		return PAGE_SIZE << compound_order(sp);
+		return page_size(sp);
 
 	align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 	m = (unsigned int *)(block - align);
diff --git a/mm/slub.c b/mm/slub.c
index 8834563..42c1b3a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -829,7 +829,7 @@
 		return 1;
 
 	start = page_address(page);
-	length = PAGE_SIZE << compound_order(page);
+	length = page_size(page);
 	end = start + length;
 	remainder = length % s->size;
 	if (!remainder)
@@ -1074,13 +1074,14 @@
 	init_tracking(s, object);
 }
 
-static void setup_page_debug(struct kmem_cache *s, void *addr, int order)
+static
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
 {
 	if (!(s->flags & SLAB_POISON))
 		return;
 
 	metadata_access_enable();
-	memset(addr, POISON_INUSE, PAGE_SIZE << order);
+	memset(addr, POISON_INUSE, page_size(page));
 	metadata_access_disable();
 }
 
@@ -1340,8 +1341,8 @@
 #else /* !CONFIG_SLUB_DEBUG */
 static inline void setup_object_debug(struct kmem_cache *s,
 			struct page *page, void *object) {}
-static inline void setup_page_debug(struct kmem_cache *s,
-			void *addr, int order) {}
+static inline
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}
 
 static inline int alloc_debug_processing(struct kmem_cache *s,
 	struct page *page, void *object, unsigned long addr) { return 0; }
@@ -1639,7 +1640,7 @@
 	struct kmem_cache_order_objects oo = s->oo;
 	gfp_t alloc_gfp;
 	void *start, *p, *next;
-	int idx, order;
+	int idx;
 	bool shuffle;
 
 	flags &= gfp_allowed_mask;
@@ -1673,7 +1674,6 @@
 
 	page->objects = oo_objects(oo);
 
-	order = compound_order(page);
 	page->slab_cache = s;
 	__SetPageSlab(page);
 	if (page_is_pfmemalloc(page))
@@ -1683,7 +1683,7 @@
 
 	start = page_address(page);
 
-	setup_page_debug(s, start, order);
+	setup_page_debug(s, page, start);
 
 	shuffle = shuffle_freelist(s, page);
 
@@ -2004,6 +2004,7 @@
 	return tid + TID_STEP;
 }
 
+#ifdef SLUB_DEBUG_CMPXCHG
 static inline unsigned int tid_to_cpu(unsigned long tid)
 {
 	return tid % TID_STEP;
@@ -2013,6 +2014,7 @@
 {
 	return tid / TID_STEP;
 }
+#endif
 
 static inline unsigned int init_tid(int cpu)
 {
@@ -3930,7 +3932,7 @@
 
 	if (unlikely(!PageSlab(page))) {
 		WARN_ON(!PageCompound(page));
-		return PAGE_SIZE << compound_order(page);
+		return page_size(page);
 	}
 
 	return slab_ksize(page->slab_cache);
@@ -5298,7 +5300,7 @@
 			const char *buf, size_t length)
 {
 	if (buf[0] == '1')
-		kmem_cache_shrink(s);
+		kmem_cache_shrink_all(s);
 	else
 		return -EINVAL;
 	return length;
diff --git a/mm/sparse.c b/mm/sparse.c
index 72f010d..bf32de9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -11,6 +11,8 @@
 #include <linux/export.h>
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 #include "internal.h"
 #include <asm/dma.h>
@@ -470,6 +472,12 @@
 static void *sparsemap_buf __meminitdata;
 static void *sparsemap_buf_end __meminitdata;
 
+static inline void __meminit sparse_buffer_free(unsigned long size)
+{
+	WARN_ON(!sparsemap_buf || size == 0);
+	memblock_free_early(__pa(sparsemap_buf), size);
+}
+
 static void __init sparse_buffer_init(unsigned long size, int nid)
 {
 	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
@@ -486,7 +494,7 @@
 	unsigned long size = sparsemap_buf_end - sparsemap_buf;
 
 	if (sparsemap_buf && size > 0)
-		memblock_free_early(__pa(sparsemap_buf), size);
+		sparse_buffer_free(size);
 	sparsemap_buf = NULL;
 }
 
@@ -495,11 +503,15 @@
 	void *ptr = NULL;
 
 	if (sparsemap_buf) {
-		ptr = PTR_ALIGN(sparsemap_buf, size);
+		ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
 		if (ptr + size > sparsemap_buf_end)
 			ptr = NULL;
-		else
+		else {
+			/* Free redundant aligned space */
+			if ((unsigned long)(ptr - sparsemap_buf) > 0)
+				sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
 			sparsemap_buf = ptr + size;
+		}
 	}
 	return ptr;
 }
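The leak being fixed here: the old PTR_ALIGN() could skip up to size - 1 bytes
of the bootmem buffer, and that padding was never handed out or returned; it
is now given back via sparse_buffer_free(). roundup() also copes with a size
that is not a power of two, which ALIGN()/PTR_ALIGN() assumes. Worked numbers,
purely illustrative:

    unsigned long buf  = 0x11000;		 /* current sparsemap_buf */
    unsigned long size = 0x10000;		 /* 64 KiB request */
    unsigned long ptr  = roundup(buf, size); /* 0x20000 */
    /* ptr - buf == 0xf000: freed back to memblock instead of leaked */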
@@ -867,7 +879,7 @@
 	 */
 	page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
 
-	ms = __pfn_to_section(start_pfn);
+	ms = __nr_to_section(section_nr);
 	set_section_nid(section_nr, nid);
 	section_mark_present(ms);
 
@@ -884,9 +896,6 @@
 {
 	int i;
 
-	if (!memmap)
-		return;
-
 	/*
 	 * A further optimization is to have per section refcounted
 	 * num_poisoned_pages.  But that would need more space per memmap, so
@@ -898,7 +907,7 @@
 
 	for (i = 0; i < nr_pages; i++) {
 		if (PageHWPoison(&memmap[i])) {
-			atomic_long_sub(1, &num_poisoned_pages);
+			num_poisoned_pages_dec();
 			ClearPageHWPoison(&memmap[i]);
 		}
 	}
diff --git a/mm/swap.c b/mm/swap.c
index ae30039..38c3fa4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,6 +47,7 @@
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
@@ -71,12 +72,12 @@
 		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
 	}
 	__ClearPageWaiters(page);
-	mem_cgroup_uncharge(page);
 }
 
 static void __put_single_page(struct page *page)
 {
 	__page_cache_release(page);
+	mem_cgroup_uncharge(page);
 	free_unref_page(page);
 }
 
@@ -515,7 +516,6 @@
 	del_page_from_lru_list(page, lruvec, lru + active);
 	ClearPageActive(page);
 	ClearPageReferenced(page);
-	add_page_to_lru_list(page, lruvec, lru);
 
 	if (PageWriteback(page) || PageDirty(page)) {
 		/*
@@ -523,13 +523,14 @@
 		 * It can make readahead confusing.  But the race window
 		 * is _really_ small and it's a non-critical problem.
 		 */
+		add_page_to_lru_list(page, lruvec, lru);
 		SetPageReclaim(page);
 	} else {
 		/*
 		 * The page's writeback ended while it was in the pagevec;
 		 * we move the page to the tail of the inactive list.
 		 */
-		list_move_tail(&page->lru, &lruvec->lists[lru]);
+		add_page_to_lru_list_tail(page, lruvec, lru);
 		__count_vm_event(PGROTATED);
 	}
 
@@ -538,6 +539,22 @@
 	update_page_reclaim_stat(lruvec, file, 0);
 }
 
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+			    void *arg)
+{
+	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+		int file = page_is_file_cache(page);
+		int lru = page_lru_base_type(page);
+
+		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+		ClearPageActive(page);
+		ClearPageReferenced(page);
+		add_page_to_lru_list(page, lruvec, lru);
+
+		__count_vm_events(PGDEACTIVATE, hpage_nr_pages(page));
+		update_page_reclaim_stat(lruvec, file, 0);
+	}
+}
 
 static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
 			    void *arg)
@@ -590,6 +607,10 @@
 	if (pagevec_count(pvec))
 		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
 
+	pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+	if (pagevec_count(pvec))
+		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
 	pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
 	if (pagevec_count(pvec))
 		pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
@@ -623,6 +644,26 @@
 	}
 }
 
+/*
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page.  This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+		get_page(page);
+		if (!pagevec_add(pvec, page) || PageCompound(page))
+			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+		put_cpu_var(lru_deactivate_pvecs);
+	}
+}
+
 /**
  * mark_page_lazyfree - make an anon page lazyfree
  * @page: page to deactivate
@@ -687,6 +728,7 @@
 		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
 		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
 		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
 		    pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
 		    need_activate_page_drain(cpu)) {
 			INIT_WORK(work, lru_add_drain_per_cpu);
@@ -844,17 +886,15 @@
 		get_page(page_tail);
 		list_add_tail(&page_tail->lru, list);
 	} else {
-		struct list_head *list_head;
 		/*
 		 * Head page has not yet been counted, as an hpage,
 		 * so we must account for each subpage individually.
 		 *
-		 * Use the standard add function to put page_tail on the list,
-		 * but then correct its position so they all end up in order.
+		 * Put page_tail on the list at the correct position
+		 * so they all end up in order.
 		 */
-		add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
-		list_head = page_tail->lru.prev;
-		list_move_tail(&page_tail->lru, list_head);
+		add_page_to_lru_list_tail(page_tail, lruvec,
+					  page_lru(page_tail));
 	}
 
 	if (!PageUnevictable(page))
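
The new deactivate_page()/lru_deactivate_pvecs pair follows the same
per-CPU batching pattern as the existing lru_*_pvecs users: stash pages
in a small per-CPU vector and pay the lock-taking move (here
pagevec_lru_move_fn() with lru_deactivate_fn) only when the vector
fills or is drained. A minimal userspace sketch of the pattern
(illustrative types, not the kernel API):

	#define BATCH_SIZE 15	/* in the spirit of a pagevec */

	struct batch {
		int count;
		void *items[BATCH_SIZE];
	};

	/* Costly operation, done once per batch rather than per item. */
	static void batch_drain(struct batch *b, void (*fn)(void *))
	{
		for (int i = 0; i < b->count; i++)
			fn(b->items[i]);
		b->count = 0;
	}

	/* Cheap per-item path: queue the item, drain only when full. */
	static void batch_add(struct batch *b, void *item, void (*fn)(void *))
	{
		b->items[b->count++] = item;
		if (b->count == BATCH_SIZE)
			batch_drain(b, fn);
	}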
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8368621..8e7ce9a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -116,7 +116,7 @@
 	struct address_space *address_space = swap_address_space(entry);
 	pgoff_t idx = swp_offset(entry);
 	XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
-	unsigned long i, nr = 1UL << compound_order(page);
+	unsigned long i, nr = compound_nr(page);
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -133,7 +133,7 @@
 		for (i = 0; i < nr; i++) {
 			VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
 			set_page_private(page + i, entry.val + i);
-			xas_store(&xas, page + i);
+			xas_store(&xas, page);
 			xas_next(&xas);
 		}
 		address_space->nrpages += nr;
@@ -168,7 +168,7 @@
 
 	for (i = 0; i < nr; i++) {
 		void *entry = xas_store(&xas, NULL);
-		VM_BUG_ON_PAGE(entry != page + i, entry);
+		VM_BUG_ON_PAGE(entry != page, entry);
 		set_page_private(page + i, 0);
 		xas_next(&xas);
 	}
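
The swap-cache hunks above store the head page at every one of the
compound page's indices (and the delete path now expects the head in
each slot too), so a lookup at any subpage index yields the compound
head directly. In sketch form, using the plain xa_store()/xa_load()
wrappers rather than the locked xas_* cursor the real code uses:

	/* Store the head at each subpage index; any index in the range
	 * then resolves to the same compound head.
	 */
	for (i = 0; i < nr; i++)
		xa_store(&address_space->i_pages, idx + i, page, GFP_KERNEL);

	VM_BUG_ON(xa_load(&address_space->i_pages, idx + nr - 1) != page);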
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 98e92486..660717a 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -11,6 +11,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/mm.h>
+#include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
@@ -227,7 +228,12 @@
 	if (!virt_addr_valid(ptr))
 		return;
 
-	page = virt_to_head_page(ptr);
+	/*
+	 * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the
+	 * highmem page or fall back to virt_to_page(). The following
+	 * is effectively a highmem-aware virt_to_head_page().
+	 */
+	page = compound_head(kmap_to_page((void *)ptr));
 
 	if (PageSlab(page)) {
 		/* Check slab allocator for flags and size. */
diff --git a/mm/util.c b/mm/util.c
index e6351a8..3ad6db9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -16,6 +16,13 @@
 #include <linux/hugetlb.h>
 #include <linux/vmalloc.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/elf.h>
+#include <linux/elf-randomize.h>
+#include <linux/personality.h>
+#include <linux/random.h>
+#include <linux/processor.h>
+#include <linux/sizes.h>
+#include <linux/compat.h>
 
 #include <linux/uaccess.h>
 
@@ -293,7 +300,105 @@
 	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
 }
 
-#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
+#ifndef STACK_RND_MASK
+#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
+#endif
+
+unsigned long randomize_stack_top(unsigned long stack_top)
+{
+	unsigned long random_variable = 0;
+
+	if (current->flags & PF_RANDOMIZE) {
+		random_variable = get_random_long();
+		random_variable &= STACK_RND_MASK;
+		random_variable <<= PAGE_SHIFT;
+	}
+#ifdef CONFIG_STACK_GROWSUP
+	return PAGE_ALIGN(stack_top) + random_variable;
+#else
+	return PAGE_ALIGN(stack_top) - random_variable;
+#endif
+}
+
+#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+	/* Is the current task 32bit ? */
+	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
+		return randomize_page(mm->brk, SZ_32M);
+
+	return randomize_page(mm->brk, SZ_1G);
+}
+
+unsigned long arch_mmap_rnd(void)
+{
+	unsigned long rnd;
+
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+	if (is_compat_task())
+		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
+	else
+#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
+		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+
+	return rnd << PAGE_SHIFT;
+}
+
+static int mmap_is_legacy(struct rlimit *rlim_stack)
+{
+	if (current->personality & ADDR_COMPAT_LAYOUT)
+		return 1;
+
+	if (rlim_stack->rlim_cur == RLIM_INFINITY)
+		return 1;
+
+	return sysctl_legacy_va_layout;
+}
+
+/*
+ * Leave enough space between the mmap area and the stack to honour ulimit in
+ * the face of randomisation.
+ */
+#define MIN_GAP		(SZ_128M)
+#define MAX_GAP		(STACK_TOP / 6 * 5)
+
+static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
+{
+	unsigned long gap = rlim_stack->rlim_cur;
+	unsigned long pad = stack_guard_gap;
+
+	/* Account for stack randomization if necessary */
+	if (current->flags & PF_RANDOMIZE)
+		pad += (STACK_RND_MASK << PAGE_SHIFT);
+
+	/* Values close to RLIM_INFINITY can overflow. */
+	if (gap + pad > gap)
+		gap += pad;
+
+	if (gap < MIN_GAP)
+		gap = MIN_GAP;
+	else if (gap > MAX_GAP)
+		gap = MAX_GAP;
+
+	return PAGE_ALIGN(STACK_TOP - gap - rnd);
+}
+
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+{
+	unsigned long random_factor = 0UL;
+
+	if (current->flags & PF_RANDOMIZE)
+		random_factor = arch_mmap_rnd();
+
+	if (mmap_is_legacy(rlim_stack)) {
+		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
+		mm->get_unmapped_area = arch_get_unmapped_area;
+	} else {
+		mm->mmap_base = mmap_base(random_factor, rlim_stack);
+		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+	}
+}
+#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
 void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 {
 	mm->mmap_base = TASK_UNMAPPED_BASE;
@@ -521,7 +626,7 @@
 		return true;
 	if (PageHuge(page))
 		return false;
-	for (i = 0; i < (1 << compound_order(page)); i++) {
+	for (i = 0; i < compound_nr(page); i++) {
 		if (atomic_read(&page[i]._mapcount) >= 0)
 			return true;
 	}
@@ -783,3 +888,16 @@
 out:
 	return res;
 }
+
+int memcmp_pages(struct page *page1, struct page *page2)
+{
+	char *addr1, *addr2;
+	int ret;
+
+	addr1 = kmap_atomic(page1);
+	addr2 = kmap_atomic(page2);
+	ret = memcmp(addr1, addr2, PAGE_SIZE);
+	kunmap_atomic(addr2);
+	kunmap_atomic(addr1);
+	return ret;
+}
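
With the default STACK_RND_MASK above and 4 KiB pages (PAGE_SHIFT ==
12), the mask is 0x7ff, so randomize_stack_top() moves the stack top by
up to 0x7ff << 12 bytes, just under 8 MiB, in page-sized steps. A tiny
userspace check of that arithmetic:

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define STACK_RND_MASK	(0x7ff >> (PAGE_SHIFT - 12))	/* 8MB of VA */

	int main(void)
	{
		unsigned long max_off =
			(unsigned long)STACK_RND_MASK << PAGE_SHIFT;

		/* Prints 8384512: 0x7ff pages of 4 KiB, 8 MiB minus one page. */
		printf("max stack offset: %lu bytes\n", max_off);
		return 0;
	}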
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c1246d7..a3c70e2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -329,8 +329,6 @@
 #define DEBUG_AUGMENT_PROPAGATE_CHECK 0
 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
 
-#define VM_LAZY_FREE	0x02
-#define VM_VM_AREA	0x04
 
 static DEFINE_SPINLOCK(vmap_area_lock);
 /* Export for kexec only */
@@ -398,9 +396,8 @@
 		get_subtree_max_size(va->rb_node.rb_right));
 }
 
-RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb,
-	struct vmap_area, rb_node, unsigned long, subtree_max_size,
-	compute_subtree_max_size)
+RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
+	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
 
 static void purge_vmap_area_lazy(void);
 static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
@@ -1116,7 +1113,7 @@
 
 	va->va_start = addr;
 	va->va_end = addr + size;
-	va->flags = 0;
+	va->vm = NULL;
 	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
 
 	spin_unlock(&vmap_area_lock);
@@ -1282,7 +1279,14 @@
 	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
 		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
 
-		__free_vmap_area(va);
+		/*
+		 * Finally insert or merge lazily-freed area. It is
+		 * detached and there is no need to "unlink" it from
+		 * anything.
+		 */
+		merge_or_add_vmap_area(va,
+			&free_vmap_area_root, &free_vmap_area_list);
+
 		atomic_long_sub(nr, &vmap_lazy_nr);
 
 		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
@@ -1324,6 +1328,10 @@
 {
 	unsigned long nr_lazy;
 
+	spin_lock(&vmap_area_lock);
+	unlink_va(va, &vmap_area_root);
+	spin_unlock(&vmap_area_lock);
+
 	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
 				PAGE_SHIFT, &vmap_lazy_nr);
 
@@ -1918,7 +1926,6 @@
 		if (WARN_ON_ONCE(!va))
 			continue;
 
-		va->flags = VM_VM_AREA;
 		va->va_start = (unsigned long)tmp->addr;
 		va->va_end = va->va_start + tmp->size;
 		va->vm = tmp;
@@ -2016,7 +2023,6 @@
 	vm->size = va->va_end - va->va_start;
 	vm->caller = caller;
 	va->vm = vm;
-	va->flags |= VM_VM_AREA;
 	spin_unlock(&vmap_area_lock);
 }
 
@@ -2121,10 +2127,10 @@
 	struct vmap_area *va;
 
 	va = find_vmap_area((unsigned long)addr);
-	if (va && va->flags & VM_VM_AREA)
-		return va->vm;
+	if (!va)
+		return NULL;
 
-	return NULL;
+	return va->vm;
 }
 
 /**
@@ -2143,14 +2149,12 @@
 
 	might_sleep();
 
-	va = find_vmap_area((unsigned long)addr);
-	if (va && va->flags & VM_VM_AREA) {
+	spin_lock(&vmap_area_lock);
+	va = __find_vmap_area((unsigned long)addr);
+	if (va && va->vm) {
 		struct vm_struct *vm = va->vm;
 
-		spin_lock(&vmap_area_lock);
 		va->vm = NULL;
-		va->flags &= ~VM_VM_AREA;
-		va->flags |= VM_LAZY_FREE;
 		spin_unlock(&vmap_area_lock);
 
 		kasan_free_shadow(vm);
@@ -2158,6 +2162,8 @@
 
 		return vm;
 	}
+
+	spin_unlock(&vmap_area_lock);
 	return NULL;
 }
 
@@ -2402,7 +2408,6 @@
 	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
 	array_size = (nr_pages * sizeof(struct page *));
 
-	area->nr_pages = nr_pages;
 	/* Please note that the recursion is strictly bounded. */
 	if (array_size > PAGE_SIZE) {
 		pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
@@ -2410,13 +2415,16 @@
 	} else {
 		pages = kmalloc_node(array_size, nested_gfp, node);
 	}
-	area->pages = pages;
-	if (!area->pages) {
+
+	if (!pages) {
 		remove_vm_area(area->addr);
 		kfree(area);
 		return NULL;
 	}
 
+	area->pages = pages;
+	area->nr_pages = nr_pages;
+
 	for (i = 0; i < area->nr_pages; i++) {
 		struct page *page;
 
@@ -2851,7 +2859,7 @@
 		if (!count)
 			break;
 
-		if (!(va->flags & VM_VM_AREA))
+		if (!va->vm)
 			continue;
 
 		vm = va->vm;
@@ -2931,7 +2939,7 @@
 		if (!count)
 			break;
 
-		if (!(va->flags & VM_VM_AREA))
+		if (!va->vm)
 			continue;
 
 		vm = va->vm;
@@ -3450,6 +3458,22 @@
 	}
 }
 
+static void show_purge_info(struct seq_file *m)
+{
+	struct llist_node *head;
+	struct vmap_area *va;
+
+	head = READ_ONCE(vmap_purge_list.first);
+	if (head == NULL)
+		return;
+
+	llist_for_each_entry(va, head, purge_list) {
+		seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
+			(void *)va->va_start, (void *)va->va_end,
+			va->va_end - va->va_start);
+	}
+}
+
 static int s_show(struct seq_file *m, void *p)
 {
 	struct vmap_area *va;
@@ -3458,14 +3482,13 @@
 	va = list_entry(p, struct vmap_area, list);
 
 	/*
-	 * s_show can encounter race with remove_vm_area, !VM_VM_AREA on
-	 * behalf of vmap area is being tear down or vm_map_ram allocation.
+	 * s_show can race with remove_vm_area: a NULL vm means the vmap
+	 * area is being torn down or is a vm_map_ram allocation.
 	 */
-	if (!(va->flags & VM_VM_AREA)) {
-		seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
+	if (!va->vm) {
+		seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
 			(void *)va->va_start, (void *)va->va_end,
-			va->va_end - va->va_start,
-			va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
+			va->va_end - va->va_start);
 
 		return 0;
 	}
@@ -3504,6 +3527,16 @@
 
 	show_numa_info(m, v);
 	seq_putc(m, '\n');
+
+	/*
+	 * As a final step, dump "unpurged" areas. Note,
+	 * that entire "/proc/vmallocinfo" output will not
+	 * be address sorted, because the purge list is not
+	 * sorted.
+	 */
+	if (list_is_last(&va->list, &vmap_area_list))
+		show_purge_info(m);
+
 	return 0;
 }
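
The __vmalloc_area_node() reordering above is a publish-after-success
fix: area->pages and area->nr_pages are set only once the page-array
allocation has succeeded, so the failure path never frees or exposes a
half-initialized area. The generic shape of the pattern, as a small
sketch with illustrative types:

	#include <stdlib.h>

	struct area {
		void **pages;
		unsigned int nr_pages;
	};

	static int area_setup(struct area *area, unsigned int nr)
	{
		void **pages = calloc(nr, sizeof(*pages));

		if (!pages)
			return -1;	/* area left untouched on failure */

		/* Publish only after the allocation has succeeded. */
		area->pages = pages;
		area->nr_pages = nr;
		return 0;
	}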
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a6c5d0b..e5d52d6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -171,11 +171,22 @@
  */
 unsigned long vm_total_pages;
 
+static void set_task_reclaim_state(struct task_struct *task,
+				   struct reclaim_state *rs)
+{
+	/* Check for an overwrite */
+	WARN_ON_ONCE(rs && task->reclaim_state);
+
+	/* Check for the nulling of an already-nulled member */
+	WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+	task->reclaim_state = rs;
+}
+
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
-#ifdef CONFIG_MEMCG_KMEM
-
+#ifdef CONFIG_MEMCG
 /*
  * We allow subsystems to populate their shrinker-related
  * LRU lists before register_shrinker_prepared() is called
@@ -227,30 +238,7 @@
 	idr_remove(&shrinker_idr, id);
 	up_write(&shrinker_rwsem);
 }
-#else /* CONFIG_MEMCG_KMEM */
-static int prealloc_memcg_shrinker(struct shrinker *shrinker)
-{
-	return 0;
-}
 
-static void unregister_memcg_shrinker(struct shrinker *shrinker)
-{
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
-static void set_task_reclaim_state(struct task_struct *task,
-				   struct reclaim_state *rs)
-{
-	/* Check for an overwrite */
-	WARN_ON_ONCE(rs && task->reclaim_state);
-
-	/* Check for the nulling of an already-nulled member */
-	WARN_ON_ONCE(!rs && !task->reclaim_state);
-
-	task->reclaim_state = rs;
-}
-
-#ifdef CONFIG_MEMCG
 static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
@@ -305,6 +293,15 @@
 
 }
 #else
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+	return 0;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+}
+
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
@@ -591,7 +588,7 @@
 	return freed;
 }
 
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 			struct mem_cgroup *memcg, int priority)
 {
@@ -599,7 +596,7 @@
 	unsigned long ret, freed = 0;
 	int i;
 
-	if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
+	if (!mem_cgroup_online(memcg))
 		return 0;
 
 	if (!down_read_trylock(&shrinker_rwsem))
@@ -625,6 +622,11 @@
 			continue;
 		}
 
+		/* Call non-slab shrinkers even though kmem is disabled */
+		if (!memcg_kmem_enabled() &&
+		    !(shrinker->flags & SHRINKER_NONSLAB))
+			continue;
+
 		ret = do_shrink_slab(&sc, shrinker, priority);
 		if (ret == SHRINK_EMPTY) {
 			clear_bit(i, map->map);
@@ -661,13 +663,13 @@
 	up_read(&shrinker_rwsem);
 	return freed;
 }
-#else /* CONFIG_MEMCG_KMEM */
+#else /* CONFIG_MEMCG */
 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 			struct mem_cgroup *memcg, int priority)
 {
 	return 0;
 }
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG */
 
 /**
  * shrink_slab - shrink slab caches
@@ -1121,7 +1123,7 @@
 				      struct scan_control *sc,
 				      enum ttu_flags ttu_flags,
 				      struct reclaim_stat *stat,
-				      bool force_reclaim)
+				      bool ignore_references)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
@@ -1135,7 +1137,7 @@
 		struct address_space *mapping;
 		struct page *page;
 		int may_enter_fs;
-		enum page_references references = PAGEREF_RECLAIM_CLEAN;
+		enum page_references references = PAGEREF_RECLAIM;
 		bool dirty, writeback;
 		unsigned int nr_pages;
 
@@ -1149,7 +1151,7 @@
 
 		VM_BUG_ON_PAGE(PageActive(page), page);
 
-		nr_pages = 1 << compound_order(page);
+		nr_pages = compound_nr(page);
 
 		/* Account the number of base pages even though THP */
 		sc->nr_scanned += nr_pages;
@@ -1266,7 +1268,7 @@
 			}
 		}
 
-		if (!force_reclaim)
+		if (!ignore_references)
 			references = page_check_references(page, sc);
 
 		switch (references) {
@@ -1487,10 +1489,9 @@
 		 * Is there a need to periodically free_page_list? It would
 		 * appear not, as the counts should be low.
 		 */
-		if (unlikely(PageTransHuge(page))) {
-			mem_cgroup_uncharge(page);
+		if (unlikely(PageTransHuge(page)))
 			(*get_compound_page_dtor(page))(page);
-		} else
+		else
 			list_add(&page->lru, &free_pages);
 		continue;
 
@@ -1705,7 +1706,7 @@
 
 		VM_BUG_ON_PAGE(!PageLRU(page), page);
 
-		nr_pages = 1 << compound_order(page);
+		nr_pages = compound_nr(page);
 		total_scan += nr_pages;
 
 		if (page_zonenum(page) > sc->reclaim_idx) {
@@ -1911,7 +1912,6 @@
 
 			if (unlikely(PageCompound(page))) {
 				spin_unlock_irq(&pgdat->lru_lock);
-				mem_cgroup_uncharge(page);
 				(*get_compound_page_dtor(page))(page);
 				spin_lock_irq(&pgdat->lru_lock);
 			} else
@@ -2145,6 +2145,62 @@
 			nr_deactivate, nr_rotated, sc->priority, file);
 }
 
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+	int nid = -1;
+	unsigned long nr_reclaimed = 0;
+	LIST_HEAD(node_page_list);
+	struct reclaim_stat dummy_stat;
+	struct page *page;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+	};
+
+	while (!list_empty(page_list)) {
+		page = lru_to_page(page_list);
+		if (nid == -1) {
+			nid = page_to_nid(page);
+			INIT_LIST_HEAD(&node_page_list);
+		}
+
+		if (nid == page_to_nid(page)) {
+			ClearPageActive(page);
+			list_move(&page->lru, &node_page_list);
+			continue;
+		}
+
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+
+		nid = -1;
+	}
+
+	if (!list_empty(&node_page_list)) {
+		nr_reclaimed += shrink_page_list(&node_page_list,
+						NODE_DATA(nid),
+						&sc, 0,
+						&dummy_stat, false);
+		while (!list_empty(&node_page_list)) {
+			page = lru_to_page(&node_page_list);
+			list_del(&page->lru);
+			putback_lru_page(page);
+		}
+	}
+
+	return nr_reclaimed;
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has
  * to do too much work.
@@ -2586,7 +2642,6 @@
  */
 static inline bool should_continue_reclaim(struct pglist_data *pgdat,
 					unsigned long nr_reclaimed,
-					unsigned long nr_scanned,
 					struct scan_control *sc)
 {
 	unsigned long pages_for_compaction;
@@ -2597,40 +2652,18 @@
 	if (!in_reclaim_compaction(sc))
 		return false;
 
-	/* Consider stopping depending on scan and reclaim activity */
-	if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
-		/*
-		 * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the
-		 * full LRU list has been scanned and we are still failing
-		 * to reclaim pages. This full LRU scan is potentially
-		 * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed
-		 */
-		if (!nr_reclaimed && !nr_scanned)
-			return false;
-	} else {
-		/*
-		 * For non-__GFP_RETRY_MAYFAIL allocations which can presumably
-		 * fail without consequence, stop if we failed to reclaim
-		 * any pages from the last SWAP_CLUSTER_MAX number of
-		 * pages that were scanned. This will return to the
-		 * caller faster at the risk reclaim/compaction and
-		 * the resulting allocation attempt fails
-		 */
-		if (!nr_reclaimed)
-			return false;
-	}
-
 	/*
-	 * If we have not reclaimed enough pages for compaction and the
-	 * inactive lists are large enough, continue reclaiming
+	 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
+	 * number of pages that were scanned. This will return to the caller
+	 * faster, at the risk that reclaim/compaction and the resulting
+	 * allocation attempt fail. In the past we tried harder for
+	 * __GFP_RETRY_MAYFAIL allocations by requiring that the full LRU list
+	 * be scanned first, assuming that a zero delta of sc->nr_scanned meant
+	 * a full LRU scan, but that approximation was wrong: there were corner
+	 * cases where a non-zero number of pages was always scanned.
 	 */
-	pages_for_compaction = compact_gap(sc->order);
-	inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
-	if (get_nr_swap_pages() > 0)
-		inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
-	if (sc->nr_reclaimed < pages_for_compaction &&
-			inactive_lru_pages > pages_for_compaction)
-		return true;
+	if (!nr_reclaimed)
+		return false;
 
 	/* If compaction would go ahead or the allocation would succeed, stop */
 	for (z = 0; z <= sc->reclaim_idx; z++) {
@@ -2647,7 +2680,17 @@
 			;
 		}
 	}
-	return true;
+
+	/*
+	 * If we have not reclaimed enough pages for compaction and the
+	 * inactive lists are large enough, continue reclaiming
+	 */
+	pages_for_compaction = compact_gap(sc->order);
+	inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
+	if (get_nr_swap_pages() > 0)
+		inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+	return inactive_lru_pages > pages_for_compaction;
 }
 
 static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
@@ -2664,10 +2707,6 @@
 
 	do {
 		struct mem_cgroup *root = sc->target_mem_cgroup;
-		struct mem_cgroup_reclaim_cookie reclaim = {
-			.pgdat = pgdat,
-			.priority = sc->priority,
-		};
 		unsigned long node_lru_pages = 0;
 		struct mem_cgroup *memcg;
 
@@ -2676,7 +2715,7 @@
 		nr_reclaimed = sc->nr_reclaimed;
 		nr_scanned = sc->nr_scanned;
 
-		memcg = mem_cgroup_iter(root, NULL, &reclaim);
+		memcg = mem_cgroup_iter(root, NULL, NULL);
 		do {
 			unsigned long lru_pages;
 			unsigned long reclaimed;
@@ -2719,21 +2758,7 @@
 				   sc->nr_scanned - scanned,
 				   sc->nr_reclaimed - reclaimed);
 
-			/*
-			 * Kswapd have to scan all memory cgroups to fulfill
-			 * the overall scan target for the node.
-			 *
-			 * Limit reclaim, on the other hand, only cares about
-			 * nr_to_reclaim pages to be reclaimed and it will
-			 * retry with decreasing priority if one round over the
-			 * whole hierarchy is not sufficient.
-			 */
-			if (!current_is_kswapd() &&
-					sc->nr_reclaimed >= sc->nr_to_reclaim) {
-				mem_cgroup_iter_break(root, memcg);
-				break;
-			}
-		} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
+		} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
 
 		if (reclaim_state) {
 			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2810,7 +2835,7 @@
 			wait_iff_congested(BLK_RW_ASYNC, HZ/10);
 
 	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
-					 sc->nr_scanned - nr_scanned, sc));
+					 sc));
 
 	/*
 	 * Kswapd gives up on balancing particular nodes after too
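
reclaim_pages() above processes an arbitrary page list in per-node
batches: it accumulates a run of pages sharing a NUMA node, hands the
run to shrink_page_list() against that node's pgdat, and flushes the
final run after the loop. The grouping logic in isolation (userspace
sketch; flush() stands in for shrink_page_list()):

	#include <stdio.h>

	/* Stand-in for shrink_page_list() against one node's pgdat. */
	static void flush(int nid, int len)
	{
		printf("reclaim %d pages on node %d\n", len, nid);
	}

	int main(void)
	{
		int nid_of[] = { 0, 0, 1, 1, 1, 0 };	/* per-page node ids */
		int n = sizeof(nid_of) / sizeof(nid_of[0]);
		int start = 0;

		for (int i = 1; i <= n; i++) {
			/* Flush when the node changes or the input ends. */
			if (i == n || nid_of[i] != nid_of[start]) {
				flush(nid_of[start], i - start);
				start = i;
			}
		}
		return 0;
	}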
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fd7e16c..6afc892 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1158,6 +1158,8 @@
 	"nr_shmem",
 	"nr_shmem_hugepages",
 	"nr_shmem_pmdmapped",
+	"nr_file_hugepages",
+	"nr_file_pmdmapped",
 	"nr_anon_transparent_hugepages",
 	"nr_unstable",
 	"nr_vmscan_write",
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 75b7962..05bdf90 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -41,7 +41,6 @@
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/wait.h>
 #include <linux/zpool.h>
 #include <linux/magic.h>
 
@@ -146,8 +145,6 @@
  * @release_wq:	workqueue for safe page release
  * @work:	work_struct for safe page release
  * @inode:	inode for z3fold pseudo filesystem
- * @destroying: bool to stop migration once we start destruction
- * @isolated: int to count the number of pages currently in isolation
  *
  * This structure is allocated at pool creation time and maintains metadata
  * pertaining to a particular z3fold pool.
@@ -166,11 +163,8 @@
 	const struct zpool_ops *zpool_ops;
 	struct workqueue_struct *compact_wq;
 	struct workqueue_struct *release_wq;
-	struct wait_queue_head isolate_wait;
 	struct work_struct work;
 	struct inode *inode;
-	bool destroying;
-	int isolated;
 };
 
 /*
@@ -301,14 +295,11 @@
  }
 
 /* Initializes the z3fold header of a newly allocated z3fold page */
-static struct z3fold_header *init_z3fold_page(struct page *page,
+static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
 					struct z3fold_pool *pool, gfp_t gfp)
 {
 	struct z3fold_header *zhdr = page_address(page);
-	struct z3fold_buddy_slots *slots = alloc_slots(pool, gfp);
-
-	if (!slots)
-		return NULL;
+	struct z3fold_buddy_slots *slots;
 
 	INIT_LIST_HEAD(&page->lru);
 	clear_bit(PAGE_HEADLESS, &page->private);
@@ -316,6 +307,12 @@
 	clear_bit(NEEDS_COMPACTING, &page->private);
 	clear_bit(PAGE_STALE, &page->private);
 	clear_bit(PAGE_CLAIMED, &page->private);
+	if (headless)
+		return zhdr;
+
+	slots = alloc_slots(pool, gfp);
+	if (!slots)
+		return NULL;
 
 	spin_lock_init(&zhdr->page_lock);
 	kref_init(&zhdr->refcount);
@@ -372,9 +369,10 @@
  * Encodes the handle of a particular buddy within a z3fold page
  * Pool lock should be held as this function accesses first_num
  */
-static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
+static unsigned long __encode_handle(struct z3fold_header *zhdr,
+				struct z3fold_buddy_slots *slots,
+				enum buddy bud)
 {
-	struct z3fold_buddy_slots *slots;
 	unsigned long h = (unsigned long)zhdr;
 	int idx = 0;
 
@@ -391,11 +389,15 @@
 	if (bud == LAST)
 		h |= (zhdr->last_chunks << BUDDY_SHIFT);
 
-	slots = zhdr->slots;
 	slots->slot[idx] = h;
 	return (unsigned long)&slots->slot[idx];
 }
 
+static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
+{
+	return __encode_handle(zhdr, zhdr->slots, bud);
+}
+
 /* Returns the z3fold page where a given handle is stored */
 static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
 {
@@ -630,6 +632,7 @@
 	}
 
 	if (unlikely(PageIsolated(page) ||
+		     test_bit(PAGE_CLAIMED, &page->private) ||
 		     test_bit(PAGE_STALE, &page->private))) {
 		z3fold_page_unlock(zhdr);
 		return;
@@ -775,7 +778,6 @@
 		goto out_c;
 	spin_lock_init(&pool->lock);
 	spin_lock_init(&pool->stale_lock);
-	init_waitqueue_head(&pool->isolate_wait);
 	pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
 	if (!pool->unbuddied)
 		goto out_pool;
@@ -815,15 +817,6 @@
 	return NULL;
 }
 
-static bool pool_isolated_are_drained(struct z3fold_pool *pool)
-{
-	bool ret;
-
-	spin_lock(&pool->lock);
-	ret = pool->isolated == 0;
-	spin_unlock(&pool->lock);
-	return ret;
-}
 /**
  * z3fold_destroy_pool() - destroys an existing z3fold pool
  * @pool:	the z3fold pool to be destroyed
@@ -833,22 +826,6 @@
 static void z3fold_destroy_pool(struct z3fold_pool *pool)
 {
 	kmem_cache_destroy(pool->c_handle);
-	/*
-	 * We set pool-> destroying under lock to ensure that
-	 * z3fold_page_isolate() sees any changes to destroying. This way we
-	 * avoid the need for any memory barriers.
-	 */
-
-	spin_lock(&pool->lock);
-	pool->destroying = true;
-	spin_unlock(&pool->lock);
-
-	/*
-	 * We need to ensure that no pages are being migrated while we destroy
-	 * these workqueues, as migration can queue work on either of the
-	 * workqueues.
-	 */
-	wait_event(pool->isolate_wait, !pool_isolated_are_drained(pool));
 
 	/*
 	 * We need to destroy pool->compact_wq before pool->release_wq,
@@ -956,7 +933,7 @@
 	if (!page)
 		return -ENOMEM;
 
-	zhdr = init_z3fold_page(page, pool, gfp);
+	zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
 	if (!zhdr) {
 		__free_page(page);
 		return -ENOMEM;
@@ -1132,6 +1109,7 @@
 	struct z3fold_header *zhdr = NULL;
 	struct page *page = NULL;
 	struct list_head *pos;
+	struct z3fold_buddy_slots slots;
 	unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
 
 	spin_lock(&pool->lock);
@@ -1150,16 +1128,22 @@
 			/* this bit could have been set by free, in which case
 			 * we pass over to the next page in the pool.
 			 */
-			if (test_and_set_bit(PAGE_CLAIMED, &page->private))
+			if (test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+				page = NULL;
 				continue;
+			}
 
-			if (unlikely(PageIsolated(page)))
+			if (unlikely(PageIsolated(page))) {
+				clear_bit(PAGE_CLAIMED, &page->private);
+				page = NULL;
 				continue;
+			}
+			zhdr = page_address(page);
 			if (test_bit(PAGE_HEADLESS, &page->private))
 				break;
 
-			zhdr = page_address(page);
 			if (!z3fold_page_trylock(zhdr)) {
+				clear_bit(PAGE_CLAIMED, &page->private);
 				zhdr = NULL;
 				continue; /* can't evict at this point */
 			}
@@ -1177,26 +1161,30 @@
 
 		if (!test_bit(PAGE_HEADLESS, &page->private)) {
 			/*
-			 * We need encode the handles before unlocking, since
-			 * we can race with free that will set
-			 * (first|last)_chunks to 0
+			 * We need to encode the handles before unlocking, and
+			 * use our local slots structure because z3fold_free
+			 * can zero out zhdr->slots and we can't do much
+			 * about that.
 			 */
 			first_handle = 0;
 			last_handle = 0;
 			middle_handle = 0;
 			if (zhdr->first_chunks)
-				first_handle = encode_handle(zhdr, FIRST);
+				first_handle = __encode_handle(zhdr, &slots,
+								FIRST);
 			if (zhdr->middle_chunks)
-				middle_handle = encode_handle(zhdr, MIDDLE);
+				middle_handle = __encode_handle(zhdr, &slots,
+								MIDDLE);
 			if (zhdr->last_chunks)
-				last_handle = encode_handle(zhdr, LAST);
+				last_handle = __encode_handle(zhdr, &slots,
+								LAST);
 			/*
 			 * it's safe to unlock here because we hold a
 			 * reference to this page
 			 */
 			z3fold_page_unlock(zhdr);
 		} else {
-			first_handle = encode_handle(zhdr, HEADLESS);
+			first_handle = __encode_handle(zhdr, &slots, HEADLESS);
 			last_handle = middle_handle = 0;
 		}
 
@@ -1226,9 +1214,9 @@
 			spin_lock(&pool->lock);
 			list_add(&page->lru, &pool->lru);
 			spin_unlock(&pool->lock);
+			clear_bit(PAGE_CLAIMED, &page->private);
 		} else {
 			z3fold_page_lock(zhdr);
-			clear_bit(PAGE_CLAIMED, &page->private);
 			if (kref_put(&zhdr->refcount,
 					release_z3fold_page_locked)) {
 				atomic64_dec(&pool->pages_nr);
@@ -1243,6 +1231,7 @@
 			list_add(&page->lru, &pool->lru);
 			spin_unlock(&pool->lock);
 			z3fold_page_unlock(zhdr);
+			clear_bit(PAGE_CLAIMED, &page->private);
 		}
 
 		/* We started off locked so we need to lock the pool back */
@@ -1339,28 +1328,6 @@
 	return atomic64_read(&pool->pages_nr);
 }
 
-/*
- * z3fold_dec_isolated() expects to be called while pool->lock is held.
- */
-static void z3fold_dec_isolated(struct z3fold_pool *pool)
-{
-	assert_spin_locked(&pool->lock);
-	VM_BUG_ON(pool->isolated <= 0);
-	pool->isolated--;
-
-	/*
-	 * If we have no more isolated pages, we have to see if
-	 * z3fold_destroy_pool() is waiting for a signal.
-	 */
-	if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait))
-		wake_up_all(&pool->isolate_wait);
-}
-
-static void z3fold_inc_isolated(struct z3fold_pool *pool)
-{
-	pool->isolated++;
-}
-
 static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
 {
 	struct z3fold_header *zhdr;
@@ -1369,7 +1336,8 @@
 	VM_BUG_ON_PAGE(!PageMovable(page), page);
 	VM_BUG_ON_PAGE(PageIsolated(page), page);
 
-	if (test_bit(PAGE_HEADLESS, &page->private))
+	if (test_bit(PAGE_HEADLESS, &page->private) ||
+	    test_bit(PAGE_CLAIMED, &page->private))
 		return false;
 
 	zhdr = page_address(page);
@@ -1387,34 +1355,6 @@
 		spin_lock(&pool->lock);
 		if (!list_empty(&page->lru))
 			list_del(&page->lru);
-		/*
-		 * We need to check for destruction while holding pool->lock, as
-		 * otherwise destruction could see 0 isolated pages, and
-		 * proceed.
-		 */
-		if (unlikely(pool->destroying)) {
-			spin_unlock(&pool->lock);
-			/*
-			 * If this page isn't stale, somebody else holds a
-			 * reference to it. Let't drop our refcount so that they
-			 * can call the release logic.
-			 */
-			if (unlikely(kref_put(&zhdr->refcount,
-					      release_z3fold_page_locked))) {
-				/*
-				 * If we get here we have kref problems, so we
-				 * should freak out.
-				 */
-				WARN(1, "Z3fold is experiencing kref problems\n");
-				z3fold_page_unlock(zhdr);
-				return false;
-			}
-			z3fold_page_unlock(zhdr);
-			return false;
-		}
-
-
-		z3fold_inc_isolated(pool);
 		spin_unlock(&pool->lock);
 		z3fold_page_unlock(zhdr);
 		return true;
@@ -1483,10 +1423,6 @@
 
 	queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
 
-	spin_lock(&pool->lock);
-	z3fold_dec_isolated(pool);
-	spin_unlock(&pool->lock);
-
 	page_mapcount_reset(page);
 	put_page(page);
 	return 0;
@@ -1506,14 +1442,10 @@
 	INIT_LIST_HEAD(&page->lru);
 	if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
 		atomic64_dec(&pool->pages_nr);
-		spin_lock(&pool->lock);
-		z3fold_dec_isolated(pool);
-		spin_unlock(&pool->lock);
 		return;
 	}
 	spin_lock(&pool->lock);
 	list_add(&page->lru, &pool->lru);
-	z3fold_dec_isolated(pool);
 	spin_unlock(&pool->lock);
 	z3fold_page_unlock(zhdr);
 }
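
The reclaim path above encodes handles into an on-stack
z3fold_buddy_slots while the page is still locked, a snapshot-under-lock
pattern: once the lock is dropped, a racing z3fold_free() may zero
zhdr->slots, but the local copy stays valid. Reduced to its essentials
(fragment, names as in the hunks above):

	struct z3fold_buddy_slots slots;	/* on-stack snapshot */

	z3fold_page_lock(zhdr);
	/* Copy out what we need while nobody can free it under us... */
	first_handle = __encode_handle(zhdr, &slots, FIRST);
	z3fold_page_unlock(zhdr);
	/* ...then keep using the snapshot after dropping the lock. */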
diff --git a/mm/zpool.c b/mm/zpool.c
index a2dd910..8636692 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -239,6 +239,22 @@
 }
 
 /**
+ * zpool_malloc_support_movable() - Check if the zpool supports
+ * allocating movable memory
+ * @zpool:	The zpool to check
+ *
+ * This returns whether the zpool supports allocating movable memory.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: true if the zpool supports allocating movable memory, false if not
+ */
+bool zpool_malloc_support_movable(struct zpool *zpool)
+{
+	return zpool->driver->malloc_support_movable;
+}
+
+/**
  * zpool_malloc() - Allocate memory
  * @zpool:	The zpool to allocate from.
  * @size:	The amount of memory to allocate.
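
Callers are expected to consult the new predicate before choosing
allocation flags; the zswap hunk further down does exactly this, adding
the highmem/movable flags only when the backing driver advertises
support (fragment with generic variable names):

	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;

	if (zpool_malloc_support_movable(pool))
		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
	ret = zpool_malloc(pool, len, gfp, &handle);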
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index e98bb6a..2b2b9aa 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -443,15 +443,16 @@
 }
 
 static struct zpool_driver zs_zpool_driver = {
-	.type =		"zsmalloc",
-	.owner =	THIS_MODULE,
-	.create =	zs_zpool_create,
-	.destroy =	zs_zpool_destroy,
-	.malloc =	zs_zpool_malloc,
-	.free =		zs_zpool_free,
-	.map =		zs_zpool_map,
-	.unmap =	zs_zpool_unmap,
-	.total_size =	zs_zpool_total_size,
+	.type =			  "zsmalloc",
+	.owner =		  THIS_MODULE,
+	.create =		  zs_zpool_create,
+	.destroy =		  zs_zpool_destroy,
+	.malloc_support_movable = true,
+	.malloc =		  zs_zpool_malloc,
+	.free =			  zs_zpool_free,
+	.map =			  zs_zpool_map,
+	.unmap =		  zs_zpool_unmap,
+	.total_size =		  zs_zpool_total_size,
 };
 
 MODULE_ALIAS("zpool-zsmalloc");
@@ -476,10 +477,6 @@
 	return zspage->inuse;
 }
 
-static inline void set_zspage_inuse(struct zspage *zspage, int val)
-{
-	zspage->inuse = val;
-}
 
 static inline void mod_zspage_inuse(struct zspage *zspage, int val)
 {
diff --git a/mm/zswap.c b/mm/zswap.c
index 0e22744..46a3223 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -856,7 +856,6 @@
 	/* extract swpentry from data */
 	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
 	swpentry = zhdr->swpentry; /* here */
-	zpool_unmap_handle(pool, handle);
 	tree = zswap_trees[swp_type(swpentry)];
 	offset = swp_offset(swpentry);
 
@@ -866,6 +865,7 @@
 	if (!entry) {
 		/* entry was invalidated */
 		spin_unlock(&tree->lock);
+		zpool_unmap_handle(pool, handle);
 		return 0;
 	}
 	spin_unlock(&tree->lock);
@@ -886,15 +886,13 @@
 	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
 		/* decompress */
 		dlen = PAGE_SIZE;
-		src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
-				ZPOOL_MM_RO) + sizeof(struct zswap_header);
+		src = (u8 *)zhdr + sizeof(struct zswap_header);
 		dst = kmap_atomic(page);
 		tfm = *get_cpu_ptr(entry->pool->tfm);
 		ret = crypto_comp_decompress(tfm, src, entry->length,
 					     dst, &dlen);
 		put_cpu_ptr(entry->pool->tfm);
 		kunmap_atomic(dst);
-		zpool_unmap_handle(entry->pool->zpool, entry->handle);
 		BUG_ON(ret);
 		BUG_ON(dlen != PAGE_SIZE);
 
@@ -940,6 +938,7 @@
 	spin_unlock(&tree->lock);
 
 end:
+	zpool_unmap_handle(pool, handle);
 	return ret;
 }
 
@@ -997,6 +996,7 @@
 	char *buf;
 	u8 *src, *dst;
 	struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
+	gfp_t gfp;
 
 	/* THP isn't supported */
 	if (PageTransHuge(page)) {
@@ -1070,9 +1070,10 @@
 
 	/* store */
 	hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
-	ret = zpool_malloc(entry->pool->zpool, hlen + dlen,
-			   __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
-			   &handle);
+	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+	if (zpool_malloc_support_movable(entry->pool->zpool))
+		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+	ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
 	if (ret == -ENOSPC) {
 		zswap_reject_compress_poor++;
 		goto put_dstmem;
diff --git a/net/9p/client.c b/net/9p/client.c
index 9622f3e..1d48afc 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -281,6 +281,7 @@
 
 	p9pdu_reset(&req->tc);
 	p9pdu_reset(&req->rc);
+	req->t_err = 0;
 	req->status = REQ_STATUS_ALLOC;
 	init_waitqueue_head(&req->wq);
 	INIT_LIST_HEAD(&req->req_list);
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 4072e9d..b41375d 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1023,6 +1023,11 @@
 	 */
 	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
 		goto out;
+
+	rc = -EPERM;
+	if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
+		goto out;
+
 	rc = -ENOMEM;
 	sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern);
 	if (!sk)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index ca52077..bb222b8 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -855,6 +855,8 @@
 		break;
 
 	case SOCK_RAW:
+		if (!capable(CAP_NET_RAW))
+			return -EPERM;
 		break;
 	default:
 		return -ESOCKTNOSUPPORT;
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index a3d188d..d5028af 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -12,11 +12,11 @@
 	depends on NET
 	select LIBCRC32C
 	help
-          B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
-          a routing protocol for multi-hop ad-hoc mesh networks. The
-          networks may be wired or wireless. See
-          https://www.open-mesh.org/ for more information and user space
-          tools.
+	  B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
+	  a routing protocol for multi-hop ad-hoc mesh networks. The
+	  networks may be wired or wireless. See
+	  https://www.open-mesh.org/ for more information and user space
+	  tools.
 
 config BATMAN_ADV_BATMAN_V
 	bool "B.A.T.M.A.N. V protocol"
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4eeea4d..2d56824 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -13,6 +13,7 @@
 #include <linux/nsproxy.h>
 #include <linux/parser.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/statfs.h>
@@ -185,18 +186,34 @@
 }
 EXPORT_SYMBOL(ceph_compare_options);
 
+/*
+ * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are
+ * compatible with (a superset of) GFP_KERNEL.  This is because while the
+ * actual pages are allocated with the specified flags, the page table pages
+ * are always allocated with GFP_KERNEL.  map_vm_area() doesn't even take
+ * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc().
+ *
+ * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO.
+ */
 void *ceph_kvmalloc(size_t size, gfp_t flags)
 {
-	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		void *ptr = kmalloc(size, flags | __GFP_NOWARN);
-		if (ptr)
-			return ptr;
+	void *p;
+
+	if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) {
+		p = kvmalloc(size, flags);
+	} else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) {
+		unsigned int nofs_flag = memalloc_nofs_save();
+		p = kvmalloc(size, GFP_KERNEL);
+		memalloc_nofs_restore(nofs_flag);
+	} else {
+		unsigned int noio_flag = memalloc_noio_save();
+		p = kvmalloc(size, GFP_KERNEL);
+		memalloc_noio_restore(noio_flag);
 	}
 
-	return __vmalloc(size, flags, PAGE_KERNEL);
+	return p;
 }
 
-
 static int parse_fsid(const char *str, struct ceph_fsid *fsid)
 {
 	int i = 0;
@@ -694,6 +711,14 @@
 }
 EXPORT_SYMBOL(ceph_destroy_client);
 
+void ceph_reset_client_addr(struct ceph_client *client)
+{
+	ceph_messenger_reset_nonce(&client->msgr);
+	ceph_monc_reopen_session(&client->monc);
+	ceph_osdc_reopen_osds(&client->osdc);
+}
+EXPORT_SYMBOL(ceph_reset_client_addr);
+
 /*
  * true if we have the mon map (and have thus joined the cluster)
  */
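
The ceph_kvmalloc() rewrite above leans on the memalloc scope API:
rather than passing a restricted gfp mask to kvmalloc() (which refuses
to fall back to vmalloc for anything not GFP_KERNEL-compatible), it
enters a NOFS or NOIO scope and calls kvmalloc() with plain GFP_KERNEL
inside it; the scope then constrains every allocation made within,
including the page-table pages vmalloc allocates internally. The NOFS
case in isolation (real kernel API, exactly as used above):

	unsigned int nofs_flag;
	void *p;

	nofs_flag = memalloc_nofs_save();	/* enter GFP_NOFS scope */
	p = kvmalloc(size, GFP_KERNEL);		/* may safely vmalloc */
	memalloc_nofs_restore(nofs_flag);	/* leave the scope */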
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 962f521..e4cb3db 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -3031,6 +3031,12 @@
 }
 
 
+void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
+{
+	u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
+	msgr->inst.addr.nonce = cpu_to_le32(nonce);
+	encode_my_addr(msgr);
+}
 
 /*
  * initialize a new messenger instance
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 0520bf9..7256c40 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -213,6 +213,13 @@
 	__open_session(monc);
 }
 
+void ceph_monc_reopen_session(struct ceph_mon_client *monc)
+{
+	mutex_lock(&monc->mutex);
+	reopen_session(monc);
+	mutex_unlock(&monc->mutex);
+}
+
 static void un_backoff(struct ceph_mon_client *monc)
 {
 	monc->hunt_mult /= 2; /* reduce by 50% */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 78ae6e8..ba45b07 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -841,6 +841,7 @@
 	struct ceph_pagelist *pagelist;
 	size_t payload_len = 0;
 	size_t size;
+	int ret;
 
 	op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0);
 
@@ -852,20 +853,27 @@
 	size = strlen(class);
 	BUG_ON(size > (size_t) U8_MAX);
 	op->cls.class_len = size;
-	ceph_pagelist_append(pagelist, class, size);
+	ret = ceph_pagelist_append(pagelist, class, size);
+	if (ret)
+		goto err_pagelist_free;
 	payload_len += size;
 
 	op->cls.method_name = method;
 	size = strlen(method);
 	BUG_ON(size > (size_t) U8_MAX);
 	op->cls.method_len = size;
-	ceph_pagelist_append(pagelist, method, size);
+	ret = ceph_pagelist_append(pagelist, method, size);
+	if (ret)
+		goto err_pagelist_free;
 	payload_len += size;
 
 	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
-
 	op->indata_len = payload_len;
 	return 0;
+
+err_pagelist_free:
+	ceph_pagelist_release(pagelist);
+	return ret;
 }
 EXPORT_SYMBOL(osd_req_op_cls_init);
 
@@ -877,6 +885,7 @@
 						      opcode, 0);
 	struct ceph_pagelist *pagelist;
 	size_t payload_len;
+	int ret;
 
 	BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
 
@@ -886,10 +895,14 @@
 
 	payload_len = strlen(name);
 	op->xattr.name_len = payload_len;
-	ceph_pagelist_append(pagelist, name, payload_len);
+	ret = ceph_pagelist_append(pagelist, name, payload_len);
+	if (ret)
+		goto err_pagelist_free;
 
 	op->xattr.value_len = size;
-	ceph_pagelist_append(pagelist, value, size);
+	ret = ceph_pagelist_append(pagelist, value, size);
+	if (ret)
+		goto err_pagelist_free;
 	payload_len += size;
 
 	op->xattr.cmp_op = cmp_op;
@@ -898,6 +911,10 @@
 	ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
 	op->indata_len = payload_len;
 	return 0;
+
+err_pagelist_free:
+	ceph_pagelist_release(pagelist);
+	return ret;
 }
 EXPORT_SYMBOL(osd_req_op_xattr_init);
 
@@ -1488,7 +1505,6 @@
 
 static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 					   struct ceph_osd_request_target *t,
-					   struct ceph_connection *con,
 					   bool any_change)
 {
 	struct ceph_pg_pool_info *pi;
@@ -2272,7 +2288,7 @@
 	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
 
 again:
-	ct_res = calc_target(osdc, &req->r_t, NULL, false);
+	ct_res = calc_target(osdc, &req->r_t, false);
 	if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
 		goto promote;
 
@@ -2476,6 +2492,14 @@
 }
 EXPORT_SYMBOL(ceph_osdc_abort_requests);
 
+void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc)
+{
+	down_write(&osdc->lock);
+	osdc->abort_err = 0;
+	up_write(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_clear_abort_err);
+
 static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
 {
 	if (likely(eb > osdc->epoch_barrier)) {
@@ -3087,7 +3111,7 @@
 		lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id;
 	}
 
-	calc_target(osdc, &lreq->t, NULL, false);
+	calc_target(osdc, &lreq->t, false);
 	osd = lookup_create_osd(osdc, lreq->t.osd, true);
 	link_linger(osd, lreq);
 
@@ -3704,7 +3728,7 @@
 	struct ceph_osd_client *osdc = lreq->osdc;
 	enum calc_target_result ct_res;
 
-	ct_res = calc_target(osdc, &lreq->t, NULL, true);
+	ct_res = calc_target(osdc, &lreq->t, true);
 	if (ct_res == CALC_TARGET_NEED_RESEND) {
 		struct ceph_osd *osd;
 
@@ -3776,8 +3800,7 @@
 		n = rb_next(n); /* unlink_request(), check_pool_dne() */
 
 		dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
-		ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con,
-				     false);
+		ct_res = calc_target(osdc, &req->r_t, false);
 		switch (ct_res) {
 		case CALC_TARGET_NO_ACTION:
 			force_resend_writes = cleared_full ||
@@ -3886,7 +3909,7 @@
 		n = rb_next(n);
 
 		if (req->r_t.epoch < osdc->osdmap->epoch) {
-			ct_res = calc_target(osdc, &req->r_t, NULL, false);
+			ct_res = calc_target(osdc, &req->r_t, false);
 			if (ct_res == CALC_TARGET_POOL_DNE) {
 				erase_request(need_resend, req);
 				check_pool_dne(req);
@@ -5087,6 +5110,24 @@
 EXPORT_SYMBOL(ceph_osdc_call);
 
 /*
+ * reset all osd connections
+ */
+void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
+{
+	struct rb_node *n;
+
+	down_write(&osdc->lock);
+	for (n = rb_first(&osdc->osds); n; ) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+		n = rb_next(n);
+		if (!reopen_osd(osd))
+			kick_osd_requests(osd);
+	}
+	up_write(&osdc->lock);
+}
+
+/*
  * init, shutdown
  */
 int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 9043790..4e0de14 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -973,11 +973,11 @@
 				 struct ceph_pg_pool_info, node);
 		__remove_pg_pool(&map->pg_pools, pi);
 	}
-	kfree(map->osd_state);
-	kfree(map->osd_weight);
-	kfree(map->osd_addr);
-	kfree(map->osd_primary_affinity);
-	kfree(map->crush_workspace);
+	kvfree(map->osd_state);
+	kvfree(map->osd_weight);
+	kvfree(map->osd_addr);
+	kvfree(map->osd_primary_affinity);
+	kvfree(map->crush_workspace);
 	kfree(map);
 }
 
@@ -986,28 +986,41 @@
  *
  * The new elements are properly initialized.
  */
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
 {
 	u32 *state;
 	u32 *weight;
 	struct ceph_entity_addr *addr;
+	u32 to_copy;
 	int i;
 
-	state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
-	if (!state)
+	dout("%s old %u new %u\n", __func__, map->max_osd, max);
+	if (max == map->max_osd)
+		return 0;
+
+	state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
+	weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
+	addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
+	if (!state || !weight || !addr) {
+		kvfree(state);
+		kvfree(weight);
+		kvfree(addr);
 		return -ENOMEM;
+	}
+
+	to_copy = min(map->max_osd, max);
+	if (map->osd_state) {
+		memcpy(state, map->osd_state, to_copy * sizeof(*state));
+		memcpy(weight, map->osd_weight, to_copy * sizeof(*weight));
+		memcpy(addr, map->osd_addr, to_copy * sizeof(*addr));
+		kvfree(map->osd_state);
+		kvfree(map->osd_weight);
+		kvfree(map->osd_addr);
+	}
+
 	map->osd_state = state;
-
-	weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
-	if (!weight)
-		return -ENOMEM;
 	map->osd_weight = weight;
-
-	addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
-	if (!addr)
-		return -ENOMEM;
 	map->osd_addr = addr;
-
 	for (i = map->max_osd; i < max; i++) {
 		map->osd_state[i] = 0;
 		map->osd_weight[i] = CEPH_OSD_OUT;
@@ -1017,12 +1030,16 @@
 	if (map->osd_primary_affinity) {
 		u32 *affinity;
 
-		affinity = krealloc(map->osd_primary_affinity,
-				    max*sizeof(*affinity), GFP_NOFS);
+		affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)),
+					 GFP_NOFS);
 		if (!affinity)
 			return -ENOMEM;
-		map->osd_primary_affinity = affinity;
 
+		memcpy(affinity, map->osd_primary_affinity,
+		       to_copy * sizeof(*affinity));
+		kvfree(map->osd_primary_affinity);
+
+		map->osd_primary_affinity = affinity;
 		for (i = map->max_osd; i < max; i++)
 			map->osd_primary_affinity[i] =
 			    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
@@ -1043,7 +1060,7 @@
 
 	work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
 	dout("%s work_size %zu bytes\n", __func__, work_size);
-	workspace = kmalloc(work_size, GFP_NOIO);
+	workspace = ceph_kvmalloc(work_size, GFP_NOIO);
 	if (!workspace) {
 		crush_destroy(crush);
 		return -ENOMEM;
@@ -1052,7 +1069,7 @@
 
 	if (map->crush)
 		crush_destroy(map->crush);
-	kfree(map->crush_workspace);
+	kvfree(map->crush_workspace);
 	map->crush = crush;
 	map->crush_workspace = workspace;
 	return 0;
@@ -1298,9 +1315,9 @@
 	if (!map->osd_primary_affinity) {
 		int i;
 
-		map->osd_primary_affinity = kmalloc_array(map->max_osd,
-							  sizeof(u32),
-							  GFP_NOFS);
+		map->osd_primary_affinity = ceph_kvmalloc(
+		    array_size(map->max_osd, sizeof(*map->osd_primary_affinity)),
+		    GFP_NOFS);
 		if (!map->osd_primary_affinity)
 			return -ENOMEM;
 
@@ -1321,7 +1338,7 @@
 
 	ceph_decode_32_safe(p, end, len, e_inval);
 	if (len == 0) {
-		kfree(map->osd_primary_affinity);
+		kvfree(map->osd_primary_affinity);
 		map->osd_primary_affinity = NULL;
 		return 0;
 	}
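
osdmap_set_max_osd() above drops krealloc() because the arrays may now
be vmalloc-backed, which krealloc() cannot resize; it allocates all
three replacement arrays up front (so a partial failure leaves the map
untouched), copies the overlapping entries, and kvfree()s the old
arrays. The resize step could be factored as a helper along these lines
(ceph_kvmalloc(), array_size(), min() and kvfree() are real APIs;
kvrealloc_array() itself is hypothetical):

	/* Hypothetical: resize a possibly vmalloc-backed array. */
	static void *kvrealloc_array(void *old, u32 old_n, u32 new_n,
				     size_t elem_size, gfp_t gfp)
	{
		void *new = ceph_kvmalloc(array_size(new_n, elem_size), gfp);

		if (!new)
			return NULL;
		if (old) {
			memcpy(new, old, min(old_n, new_n) * elem_size);
			kvfree(old);
		}
		return new;
	}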
diff --git a/net/core/dev.c b/net/core/dev.c
index 71b18e8..bf3ed41 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5666,7 +5666,7 @@
 static void napi_skb_free_stolen_head(struct sk_buff *skb)
 {
 	skb_dst_drop(skb);
-	secpath_reset(skb);
+	skb_ext_put(skb);
 	kmem_cache_free(skbuff_head_cache, skb);
 }
 
@@ -5733,7 +5733,7 @@
 	skb->encapsulation = 0;
 	skb_shinfo(skb)->gso_type = 0;
 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
-	secpath_reset(skb);
+	skb_ext_reset(skb);
 
 	napi->skb = skb;
 }
diff --git a/net/core/dst.c b/net/core/dst.c
index 1325316..193af52 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -172,7 +172,7 @@
 		int newrefcnt;
 
 		newrefcnt = atomic_dec_return(&dst->__refcnt);
-		if (unlikely(newrefcnt < 0))
+		if (WARN_ONCE(newrefcnt < 0, "dst_release underflow"))
 			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
 					     __func__, dst, newrefcnt);
 		if (!newrefcnt)
@@ -187,7 +187,7 @@
 		int newrefcnt;
 
 		newrefcnt = atomic_dec_return(&dst->__refcnt);
-		if (unlikely(newrefcnt < 0))
+		if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow"))
 			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
 					     __func__, dst, newrefcnt);
 		if (!newrefcnt)
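
The dst.c change works because WARN_ONCE() evaluates to its condition:
the test still gates the ratelimited message on every underflow, while
the first underflow additionally dumps a one-time backtrace. The shape
of the construct, as in the hunk above:

	if (WARN_ONCE(newrefcnt < 0, "dst_release underflow"))
		net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
				     __func__, dst, newrefcnt);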
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f12e8a0..01d6520 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5119,7 +5119,7 @@
 	skb->skb_iif = 0;
 	skb->ignore_df = 0;
 	skb_dst_drop(skb);
-	secpath_reset(skb);
+	skb_ext_reset(skb);
 	nf_reset(skb);
 	nf_reset_trace(skb);
 
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 1b7381f..25aab67 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -230,7 +230,8 @@
 		opt = ireq->ipv6_opt;
 		if (!opt)
 			opt = rcu_dereference(np->opt);
-		err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
+		err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass,
+			       sk->sk_priority);
 		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}
@@ -284,7 +285,7 @@
 	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
 	if (!IS_ERR(dst)) {
 		skb_dst_set(skb, dst);
-		ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0);
+		ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0);
 		DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
 		DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
 		return;
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index badc5cf..d93d453 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -1008,6 +1008,9 @@
 
 	switch (sock->type) {
 	case SOCK_RAW:
+		rc = -EPERM;
+		if (!capable(CAP_NET_RAW))
+			goto out;
 		proto = &ieee802154_raw_prot;
 		ops = &ieee802154_raw_ops;
 		break;
diff --git a/net/ife/Kconfig b/net/ife/Kconfig
index 6cd1f6d..bcf6505 100644
--- a/net/ife/Kconfig
+++ b/net/ife/Kconfig
@@ -5,7 +5,7 @@
 
 menuconfig NET_IFE
 	depends on NET
-        tristate "Inter-FE based on IETF ForCES InterFE LFB"
+	tristate "Inter-FE based on IETF ForCES InterFE LFB"
 	default n
 	help
 	  Say Y here to add support of IFE encapsulation protocol
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 974de4d2..03381f3 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -492,8 +492,8 @@
 	wired networks and throughput over wireless links.
 
 config TCP_CONG_HTCP
-        tristate "H-TCP"
-        default m
+	tristate "H-TCP"
+	default m
 	---help---
 	H-TCP is a send-side only modifications of the TCP Reno
 	protocol stack that optimizes the performance of TCP
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f5c163d..a918354 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -560,7 +560,7 @@
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
 		goto no_route;
-	if (opt && opt->opt.is_strictroute && rt->rt_gw_family)
+	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
 		goto route_err;
 	rcu_read_unlock();
 	return &rt->dst;
@@ -598,7 +598,7 @@
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
 		goto no_route;
-	if (opt && opt->opt.is_strictroute && rt->rt_gw_family)
+	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
 		goto route_err;
 	return &rt->dst;
 
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 06f6f280..00ec819 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -123,7 +123,7 @@
 
 	rt = skb_rtable(skb);
 
-	if (opt->is_strictroute && rt->rt_gw_family)
+	if (opt->is_strictroute && rt->rt_uses_gateway)
 		goto sr_failed;
 
 	IPCB(skb)->flags |= IPSKB_FORWARDED;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 5eb7377..28fca40 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -499,7 +499,7 @@
 	skb_dst_set_noref(skb, &rt->dst);
 
 packet_routed:
-	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family)
+	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
 		goto no_route;
 
 	/* OK, we know where to send it, allocate and build IP header. */
@@ -1694,7 +1694,6 @@
 
 	inet_sk(sk)->tos = arg->tos;
 
-	sk->sk_priority = skb->priority;
 	sk->sk_protocol = ip_hdr(skb)->protocol;
 	sk->sk_bound_dev_if = arg->bound_dev_if;
 	sk->sk_sndbuf = sysctl_wmem_default;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b6a6f18..7dcce72 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -635,6 +635,7 @@
 
 	if (fnhe->fnhe_gw) {
 		rt->rt_flags |= RTCF_REDIRECTED;
+		rt->rt_uses_gateway = 1;
 		rt->rt_gw_family = AF_INET;
 		rt->rt_gw4 = fnhe->fnhe_gw;
 	}
@@ -1313,7 +1314,7 @@
 	mtu = READ_ONCE(dst->dev->mtu);
 
 	if (unlikely(ip_mtu_locked(dst))) {
-		if (rt->rt_gw_family && mtu > 576)
+		if (rt->rt_uses_gateway && mtu > 576)
 			mtu = 576;
 	}
 
@@ -1569,6 +1570,7 @@
 		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
 
 		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
+			rt->rt_uses_gateway = 1;
 			rt->rt_gw_family = nhc->nhc_gw_family;
 			/* only INET and INET6 are supported */
 			if (likely(nhc->nhc_gw_family == AF_INET))
@@ -1634,6 +1636,7 @@
 		rt->rt_iif = 0;
 		rt->rt_pmtu = 0;
 		rt->rt_mtu_locked = 0;
+		rt->rt_uses_gateway = 0;
 		rt->rt_gw_family = 0;
 		rt->rt_gw4 = 0;
 		INIT_LIST_HEAD(&rt->rt_uncached);
@@ -2694,6 +2697,7 @@
 		rt->rt_genid = rt_genid_ipv4(net);
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
+		rt->rt_uses_gateway = ort->rt_uses_gateway;
 		rt->rt_gw_family = ort->rt_gw_family;
 		if (rt->rt_gw_family == AF_INET)
 			rt->rt_gw4 = ort->rt_gw4;
@@ -2778,21 +2782,23 @@
 		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
 			goto nla_put_failure;
 	}
-	if (rt->rt_gw_family == AF_INET &&
-	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
-		goto nla_put_failure;
-	} else if (rt->rt_gw_family == AF_INET6) {
-		int alen = sizeof(struct in6_addr);
-		struct nlattr *nla;
-		struct rtvia *via;
-
-		nla = nla_reserve(skb, RTA_VIA, alen + 2);
-		if (!nla)
+	if (rt->rt_uses_gateway) {
+		if (rt->rt_gw_family == AF_INET &&
+		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
 			goto nla_put_failure;
+		} else if (rt->rt_gw_family == AF_INET6) {
+			int alen = sizeof(struct in6_addr);
+			struct nlattr *nla;
+			struct rtvia *via;
 
-		via = nla_data(nla);
-		via->rtvia_family = AF_INET6;
-		memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
+			nla = nla_reserve(skb, RTA_VIA, alen + 2);
+			if (!nla)
+				goto nla_put_failure;
+
+			via = nla_data(nla);
+			via->rtvia_family = AF_INET6;
+			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
+		}
 	}
 
 	expires = rt->dst.expires;
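
The route.c hunks above split two notions the old code conflated: rt_uses_gateway records whether the next hop really is a gateway, while rt_gw_family only describes which address family a cached gateway address is stored in. The strict-route and locked-MTU checks in the earlier hunks now test the former. A minimal user-space sketch of the distinction, with a toy struct standing in for the kernel's rtable:

    #include <stdio.h>

    struct toy_rtable {
            unsigned char rt_uses_gateway; /* next hop is really a gateway */
            unsigned char rt_gw_family;    /* family of any cached gw address */
    };

    /* Strict source routing must fail only when a gateway is in use. */
    static int strict_route_fails(const struct toy_rtable *rt)
    {
            return rt->rt_uses_gateway != 0;
    }

    int main(void)
    {
            /* A cached gateway address without gateway forwarding: testing
             * rt_gw_family here would wrongly reject the route. */
            struct toy_rtable rt = { .rt_uses_gateway = 0, .rt_gw_family = 2 };

            printf("strict route fails: %d\n", strict_route_fails(&rt));
            return 0;
    }
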
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 95b5954..32772d6 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -388,7 +388,7 @@
  * which allows 2 outstanding 2-packet sequences, to try to keep pipe
  * full even with ACK-every-other-packet delayed ACKs.
  */
-static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain)
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
 {
 	struct bbr *bbr = inet_csk_ca(sk);
 
@@ -399,7 +399,7 @@
 	cwnd = (cwnd + 1) & ~1U;
 
 	/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
-	if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT)
+	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
 		cwnd += 2;
 
 	return cwnd;
@@ -411,7 +411,7 @@
 	u32 inflight;
 
 	inflight = bbr_bdp(sk, bw, gain);
-	inflight = bbr_quantization_budget(sk, inflight, gain);
+	inflight = bbr_quantization_budget(sk, inflight);
 
 	return inflight;
 }
@@ -531,7 +531,7 @@
 	 * due to aggregation (of data and/or ACKs) visible in the ACK stream.
 	 */
 	target_cwnd += bbr_ack_aggregation_cwnd(sk);
-	target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain);
+	target_cwnd = bbr_quantization_budget(sk, target_cwnd);
 
 	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
 	if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
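
The reworked bbr_quantization_budget() above drops the gain argument entirely; the two extra packets of probing headroom are now tied to PROBE_BW's first cycle phase (cycle_idx == 0) rather than to gain > BBR_UNIT. A compilable reduction of just that arithmetic follows; the enum value and struct are stand-ins, and the kernel function performs additional budgeting not visible in this hunk:

    #include <stdio.h>

    enum { BBR_PROBE_BW = 3 }; /* illustrative value */

    struct toy_bbr {
            int mode;
            int cycle_idx; /* phase 0 is the high-gain probing phase */
    };

    static unsigned int quantization_budget(const struct toy_bbr *bbr,
                                            unsigned int cwnd)
    {
            /* Round cwnd up to the next even number to tame delayed ACKs. */
            cwnd = (cwnd + 1) & ~1U;

            /* Headroom so gain cycling can push inflight above the BDP. */
            if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
                    cwnd += 2;
            return cwnd;
    }

    int main(void)
    {
            struct toy_bbr bbr = { .mode = BBR_PROBE_BW, .cycle_idx = 0 };

            printf("%u\n", quantization_budget(&bbr, 9)); /* 10 + 2 = 12 */
            return 0;
    }
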
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fd394ad..2ee45e3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -771,6 +771,8 @@
 	if (sk) {
 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+				   inet_twsk(sk)->tw_priority : sk->sk_priority;
 		transmit_time = tcp_transmit_time(sk);
 	}
 	ip_send_unicast_reply(ctl_sk,
@@ -866,6 +868,8 @@
 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
+	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+			   inet_twsk(sk)->tw_priority : sk->sk_priority;
 	transmit_time = tcp_transmit_time(sk);
 	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 8bcaf25..bb140a5 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -266,6 +266,7 @@
 
 		tw->tw_transparent	= inet->transparent;
 		tw->tw_mark		= sk->sk_mark;
+		tw->tw_priority		= sk->sk_priority;
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
 		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
 		tcptw->tw_snd_nxt	= tp->snd_nxt;
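
Taken together, the tcp_ipv4.c and tcp_minisocks.c changes let replies sent through the shared per-CPU control socket carry the right priority: the value is saved into the timewait socket when it is created and selected back out when a reset or ACK is generated. A small sketch of the selection, with toy types in place of the kernel's socket structures:

    #include <stdio.h>

    enum { TCP_ESTABLISHED = 1, TCP_TIME_WAIT = 6 };

    struct toy_sock   { int state; unsigned int sk_priority; };
    struct toy_twsock { unsigned int tw_priority; };

    /* Timewait sockets keep a saved copy; live sockets are read directly. */
    static unsigned int reply_priority(const struct toy_sock *sk,
                                       const struct toy_twsock *tw)
    {
            return sk->state == TCP_TIME_WAIT ? tw->tw_priority
                                              : sk->sk_priority;
    }

    int main(void)
    {
            struct toy_sock sk = { .state = TCP_TIME_WAIT, .sk_priority = 0 };
            struct toy_twsock tw = { .tw_priority = 6 };

            printf("%u\n", reply_priority(&sk, &tw)); /* 6 */
            return 0;
    }
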
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index dbd9d2d..40de2d2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -210,7 +210,7 @@
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
-	bool expired, do_reset;
+	bool expired = false, do_reset;
 	int retry_until;
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
@@ -242,9 +242,10 @@
 			if (tcp_out_of_resources(sk, do_reset))
 				return 1;
 		}
+	}
+	if (!expired)
 		expired = retransmits_timed_out(sk, retry_until,
 						icsk->icsk_user_timeout);
-	}
 	tcp_fastopen_active_detect_blackhole(sk, expired);
 
 	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
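
The tcp_timer.c hunk initializes expired and hoists the retransmits_timed_out() call out of the non-SYN branch: a branch that decides expiry on its own (the SYN-phase path, not fully shown here) is respected, and the common check runs exactly once otherwise. A compilable reduction of the control flow with the predicate stubbed:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stub: the real predicate compares elapsed time against the
     * retransmission schedule and any user timeout. */
    static bool retransmits_timed_out(int retries, int limit)
    {
            return retries >= limit;
    }

    static bool write_timeout(bool syn_phase, int retries, int limit)
    {
            bool expired = false; /* not every path assigns it */

            if (syn_phase)
                    expired = retries >= limit; /* branch decides directly */

            if (!expired) /* fall back to the common check exactly once */
                    expired = retransmits_timed_out(retries, limit);
            return expired;
    }

    int main(void)
    {
            printf("%d\n", write_timeout(true, 3, 5));  /* 0 */
            printf("%d\n", write_timeout(false, 6, 5)); /* 1 */
            return 0;
    }
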
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index cdef8f9..35b84b5 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -85,6 +85,7 @@
 	xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
 					      RTCF_LOCAL);
 	xdst->u.rt.rt_type = rt->rt_type;
+	xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
 	xdst->u.rt.rt_gw_family = rt->rt_gw_family;
 	if (rt->rt_gw_family == AF_INET)
 		xdst->u.rt.rt_gw4 = rt->rt_gw4;
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index d22b6c1..f9e8fe3 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -287,7 +287,8 @@
 	return false;
 
 suppress_route:
-	ip6_rt_put(rt);
+	if (!(arg->flags & FIB_LOOKUP_NOREF))
+		ip6_rt_put(rt);
 	return true;
 }
 
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 4da24aa..0a0945a 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -133,7 +133,7 @@
 	fl6.daddr = sk->sk_v6_daddr;
 
 	res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
-		       np->tclass);
+		       np->tclass, sk->sk_priority);
 	rcu_read_unlock();
 	return res;
 }
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 87f47bc..6e2af41 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -318,7 +318,7 @@
 	if (rt->dst.error == -EAGAIN) {
 		ip6_rt_put_flags(rt, flags);
 		rt = net->ipv6.ip6_null_entry;
-		if (!(flags | RT6_LOOKUP_F_DST_NOREF))
+		if (!(flags & RT6_LOOKUP_F_DST_NOREF))
 			dst_hold(&rt->dst);
 	}
 
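
The one-character ip6_fib.c fix deserves spelling out: OR-ing a nonzero constant into flags always produces a nonzero value, so !(flags | RT6_LOOKUP_F_DST_NOREF) was always false and the dst_hold() could never execute. A self-contained demonstration (the flag value is made up for the example):

    #include <stdio.h>

    #define RT6_LOOKUP_F_DST_NOREF 0x40 /* illustrative value */

    int main(void)
    {
            int flags = 0;

            /* Buggy form: the OR makes the operand unconditionally
             * nonzero, so the negation is unconditionally false. */
            printf("%d\n", !(flags | RT6_LOOKUP_F_DST_NOREF)); /* 0 */

            /* Fixed form: AND isolates the bit actually under test. */
            printf("%d\n", !(flags & RT6_LOOKUP_F_DST_NOREF)); /* 1 */
            return 0;
    }
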
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 89a4c7c..edadee4 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -193,7 +193,7 @@
  * which are using proper atomic operations or spinlocks.
  */
 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
-	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
+	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 {
 	struct net *net = sock_net(sk);
 	const struct ipv6_pinfo *np = inet6_sk(sk);
@@ -258,7 +258,7 @@
 	hdr->daddr = *first_hop;
 
 	skb->protocol = htons(ETH_P_IPV6);
-	skb->priority = sk->sk_priority;
+	skb->priority = priority;
 	skb->mark = mark;
 
 	mtu = dst_mtu(dst);
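
ip6_xmit() used to fix the priority itself by reading sk->sk_priority; threading it through as a parameter is what allows the tcp_ipv6.c and sctp call sites later in this diff to choose between a live socket's priority and a saved tw_priority when transmitting via a shared control socket. A toy before/after sketch of the same refactor:

    #include <stdio.h>

    struct toy_sock { unsigned int sk_priority; };
    struct toy_skb  { unsigned int priority; };

    /* Before: every user of a shared socket inherited its value. */
    static void xmit_old(const struct toy_sock *sk, struct toy_skb *skb)
    {
            skb->priority = sk->sk_priority;
    }

    /* After: the caller decides what the packet should carry. */
    static void xmit_new(struct toy_skb *skb, unsigned int priority)
    {
            skb->priority = priority;
    }

    int main(void)
    {
            struct toy_sock ctl = { .sk_priority = 0 };
            struct toy_skb skb;

            xmit_old(&ctl, &skb);
            xmit_new(&skb, 6); /* e.g. a saved tw_priority */
            printf("%u\n", skb.priority);
            return 0;
    }
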
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 6120a78..69443e9 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -170,13 +170,13 @@
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 config IP6_NF_MATCH_SRH
-        tristate '"srh" Segment Routing header match support'
-        depends on NETFILTER_ADVANCED
-        help
-          srh matching allows you to match packets based on the segment
+	tristate '"srh" Segment Routing header match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  srh matching allows you to match packets based on the segment
 	  routing header of the packet.
 
-          To compile it as a module, choose M here.  If unsure, say N.
+	  To compile it as a module, choose M here.  If unsure, say N.
 
 # The targets
 config IP6_NF_TARGET_HL
@@ -249,10 +249,10 @@
        depends on SECURITY
        depends on NETFILTER_ADVANCED
        help
-         This option adds a `security' table to iptables, for use
-         with Mandatory Access Control (MAC) policy.
+	 This option adds a `security' table to iptables, for use
+	 with Mandatory Access Control (MAC) policy.
 
-         If unsure, say N.
+	 If unsure, say N.
 
 config IP6_NF_NAT
 	tristate "ip6tables NAT support"
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 87f44d3..e3d9f45 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -512,7 +512,8 @@
 		opt = ireq->ipv6_opt;
 		if (!opt)
 			opt = rcu_dereference(np->opt);
-		err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass);
+		err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass,
+			       sk->sk_priority);
 		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}
@@ -803,7 +804,7 @@
 static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
 				 u32 ack, u32 win, u32 tsval, u32 tsecr,
 				 int oif, struct tcp_md5sig_key *key, int rst,
-				 u8 tclass, __be32 label)
+				 u8 tclass, __be32 label, u32 priority)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	struct tcphdr *t1;
@@ -907,7 +908,8 @@
 	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
 	if (!IS_ERR(dst)) {
 		skb_dst_set(buff, dst);
-		ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass);
+		ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass,
+			 priority);
 		TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 		if (rst)
 			TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
@@ -930,6 +932,7 @@
 	struct sock *sk1 = NULL;
 #endif
 	__be32 label = 0;
+	u32 priority = 0;
 	struct net *net;
 	int oif = 0;
 
@@ -990,16 +993,19 @@
 			trace_tcp_send_reset(sk, skb);
 			if (np->repflow)
 				label = ip6_flowlabel(ipv6h);
+			priority = sk->sk_priority;
 		}
-		if (sk->sk_state == TCP_TIME_WAIT)
+		if (sk->sk_state == TCP_TIME_WAIT) {
 			label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
+			priority = inet_twsk(sk)->tw_priority;
+		}
 	} else {
 		if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET)
 			label = ip6_flowlabel(ipv6h);
 	}
 
 	tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0,
-			     label);
+			     label, priority);
 
 #ifdef CONFIG_TCP_MD5SIG
 out:
@@ -1010,10 +1016,10 @@
 static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
 			    u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
 			    struct tcp_md5sig_key *key, u8 tclass,
-			    __be32 label)
+			    __be32 label, u32 priority)
 {
 	tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
-			     tclass, label);
+			     tclass, label, priority);
 }
 
 static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
@@ -1025,7 +1031,7 @@
 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
 			tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
-			tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel));
+			tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority);
 
 	inet_twsk_put(tw);
 }
@@ -1048,7 +1054,7 @@
 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 			req->ts_recent, sk->sk_bound_dev_if,
 			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
-			0, 0);
+			0, 0, sk->sk_priority);
 }
 
 
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 8f12f5c..ea9e734 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -378,8 +378,12 @@
 {
 	struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
 	struct bpf_prog *prog = psock->bpf_prog;
+	int res;
 
-	return BPF_PROG_RUN(prog, skb);
+	preempt_disable();
+	res = BPF_PROG_RUN(prog, skb);
+	preempt_enable();
+	return res;
 }
 
 static int kcm_read_sock_done(struct strparser *strp, int err)
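
The kcm change wraps BPF_PROG_RUN() in a preemption-disabled section, since BPF programs rely on per-CPU state that must not migrate mid-run. The guard pattern, with the kernel primitives stubbed so the sketch compiles in user space:

    #include <stdio.h>

    static void preempt_disable(void) { /* kernel primitive, stubbed */ }
    static void preempt_enable(void)  { /* kernel primitive, stubbed */ }

    static int run_prog(void)
    {
            return 42; /* stands in for BPF_PROG_RUN(prog, skb) */
    }

    static int parse_msg(void)
    {
            int res;

            preempt_disable();
            res = run_prog(); /* must not be preempted or migrated here */
            preempt_enable();
            return res;
    }

    int main(void)
    {
            printf("%d\n", parse_msg());
            return 0;
    }
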
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 0b3f067..ad3fd7f 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -264,9 +264,7 @@
 	ncsi_dev_state_config_ev,
 	ncsi_dev_state_config_sma,
 	ncsi_dev_state_config_ebf,
-#if IS_ENABLED(CONFIG_IPV6)
-	ncsi_dev_state_config_egmf,
-#endif
+	ncsi_dev_state_config_dgmf,
 	ncsi_dev_state_config_ecnt,
 	ncsi_dev_state_config_ec,
 	ncsi_dev_state_config_ae,
@@ -295,9 +293,6 @@
 #define NCSI_DEV_RESET		8            /* Reset state of NC          */
 	unsigned int        gma_flag;        /* OEM GMA flag               */
 	spinlock_t          lock;            /* Protect the NCSI device    */
-#if IS_ENABLED(CONFIG_IPV6)
-	unsigned int        inet6_addr_num;  /* Number of IPv6 addresses   */
-#endif
 	unsigned int        package_probe_id;/* Current ID during probe    */
 	unsigned int        package_num;     /* Number of packages         */
 	struct list_head    packages;        /* List of packages           */
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 755aab6..70fe026 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -14,7 +14,6 @@
 #include <net/sock.h>
 #include <net/addrconf.h>
 #include <net/ipv6.h>
-#include <net/if_inet6.h>
 #include <net/genetlink.h>
 
 #include "internal.h"
@@ -978,9 +977,7 @@
 	case ncsi_dev_state_config_ev:
 	case ncsi_dev_state_config_sma:
 	case ncsi_dev_state_config_ebf:
-#if IS_ENABLED(CONFIG_IPV6)
-	case ncsi_dev_state_config_egmf:
-#endif
+	case ncsi_dev_state_config_dgmf:
 	case ncsi_dev_state_config_ecnt:
 	case ncsi_dev_state_config_ec:
 	case ncsi_dev_state_config_ae:
@@ -1033,23 +1030,23 @@
 		} else if (nd->state == ncsi_dev_state_config_ebf) {
 			nca.type = NCSI_PKT_CMD_EBF;
 			nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap;
+			/* If multicast global filtering is supported then
+			 * disable it so that all multicast packets will be
+			 * forwarded to the management controller.
+			 */
+			if (nc->caps[NCSI_CAP_GENERIC].cap &
+			    NCSI_CAP_GENERIC_MC)
+				nd->state = ncsi_dev_state_config_dgmf;
+			else if (ncsi_channel_is_tx(ndp, nc))
+				nd->state = ncsi_dev_state_config_ecnt;
+			else
+				nd->state = ncsi_dev_state_config_ec;
+		} else if (nd->state == ncsi_dev_state_config_dgmf) {
+			nca.type = NCSI_PKT_CMD_DGMF;
 			if (ncsi_channel_is_tx(ndp, nc))
 				nd->state = ncsi_dev_state_config_ecnt;
 			else
 				nd->state = ncsi_dev_state_config_ec;
-#if IS_ENABLED(CONFIG_IPV6)
-			if (ndp->inet6_addr_num > 0 &&
-			    (nc->caps[NCSI_CAP_GENERIC].cap &
-			     NCSI_CAP_GENERIC_MC))
-				nd->state = ncsi_dev_state_config_egmf;
-		} else if (nd->state == ncsi_dev_state_config_egmf) {
-			nca.type = NCSI_PKT_CMD_EGMF;
-			nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
-			if (ncsi_channel_is_tx(ndp, nc))
-				nd->state = ncsi_dev_state_config_ecnt;
-			else
-				nd->state = ncsi_dev_state_config_ec;
-#endif /* CONFIG_IPV6 */
 		} else if (nd->state == ncsi_dev_state_config_ecnt) {
 			if (np->preferred_channel &&
 			    nc != np->preferred_channel)
@@ -1483,70 +1480,6 @@
 	return -ENODEV;
 }
 
-#if IS_ENABLED(CONFIG_IPV6)
-static int ncsi_inet6addr_event(struct notifier_block *this,
-				unsigned long event, void *data)
-{
-	struct inet6_ifaddr *ifa = data;
-	struct net_device *dev = ifa->idev->dev;
-	struct ncsi_dev *nd = ncsi_find_dev(dev);
-	struct ncsi_dev_priv *ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL;
-	struct ncsi_package *np;
-	struct ncsi_channel *nc;
-	struct ncsi_cmd_arg nca;
-	bool action;
-	int ret;
-
-	if (!ndp || (ipv6_addr_type(&ifa->addr) &
-	    (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK)))
-		return NOTIFY_OK;
-
-	switch (event) {
-	case NETDEV_UP:
-		action = (++ndp->inet6_addr_num) == 1;
-		nca.type = NCSI_PKT_CMD_EGMF;
-		break;
-	case NETDEV_DOWN:
-		action = (--ndp->inet6_addr_num == 0);
-		nca.type = NCSI_PKT_CMD_DGMF;
-		break;
-	default:
-		return NOTIFY_OK;
-	}
-
-	/* We might not have active channel or packages. The IPv6
-	 * required multicast will be enabled when active channel
-	 * or packages are chosen.
-	 */
-	np = ndp->active_package;
-	nc = ndp->active_channel;
-	if (!action || !np || !nc)
-		return NOTIFY_OK;
-
-	/* We needn't enable or disable it if the function isn't supported */
-	if (!(nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC))
-		return NOTIFY_OK;
-
-	nca.ndp = ndp;
-	nca.req_flags = 0;
-	nca.package = np->id;
-	nca.channel = nc->id;
-	nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
-	ret = ncsi_xmit_cmd(&nca);
-	if (ret) {
-		netdev_warn(dev, "Fail to %s global multicast filter (%d)\n",
-			    (event == NETDEV_UP) ? "enable" : "disable", ret);
-		return NOTIFY_DONE;
-	}
-
-	return NOTIFY_OK;
-}
-
-static struct notifier_block ncsi_inet6addr_notifier = {
-	.notifier_call = ncsi_inet6addr_event,
-};
-#endif /* CONFIG_IPV6 */
-
 static int ncsi_kick_channels(struct ncsi_dev_priv *ndp)
 {
 	struct ncsi_dev *nd = &ndp->ndev;
@@ -1725,11 +1658,6 @@
 	}
 
 	spin_lock_irqsave(&ncsi_dev_lock, flags);
-#if IS_ENABLED(CONFIG_IPV6)
-	ndp->inet6_addr_num = 0;
-	if (list_empty(&ncsi_dev_list))
-		register_inet6addr_notifier(&ncsi_inet6addr_notifier);
-#endif
 	list_add_tail_rcu(&ndp->node, &ncsi_dev_list);
 	spin_unlock_irqrestore(&ncsi_dev_lock, flags);
 
@@ -1896,10 +1824,6 @@
 
 	spin_lock_irqsave(&ncsi_dev_lock, flags);
 	list_del_rcu(&ndp->node);
-#if IS_ENABLED(CONFIG_IPV6)
-	if (list_empty(&ncsi_dev_list))
-		unregister_inet6addr_notifier(&ncsi_inet6addr_notifier);
-#endif
 	spin_unlock_irqrestore(&ncsi_dev_lock, flags);
 
 	ncsi_unregister_netlink(nd->dev);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 34ec7af..91efae8 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -697,7 +697,7 @@
 	tristate "Netfilter flow table mixed IPv4/IPv6 module"
 	depends on NF_FLOW_TABLE
 	help
-          This option adds the flow table mixed IPv4/IPv6 support.
+	  This option adds the flow table mixed IPv4/IPv6 support.
 
 	  To compile it as a module, choose M here.
 
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index f6f1a0d..5b672e0 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -135,7 +135,7 @@
 	  module, choose M here. If unsure, say N.
 
 config	IP_VS_LC
-        tristate "least-connection scheduling"
+	tristate "least-connection scheduling"
 	---help---
 	  The least-connection scheduling algorithm directs network
 	  connections to the server with the least number of active 
@@ -145,7 +145,7 @@
 	  module, choose M here. If unsure, say N.
 
 config	IP_VS_WLC
-        tristate "weighted least-connection scheduling"
+	tristate "weighted least-connection scheduling"
 	---help---
 	  The weighted least-connection scheduling algorithm directs network
 	  connections to the server with the least active connections
@@ -333,7 +333,7 @@
 
 config	IP_VS_PE_SIP
 	tristate "SIP persistence engine"
-        depends on IP_VS_PROTO_UDP
+	depends on IP_VS_PROTO_UDP
 	depends on NF_CONNTRACK_SIP
 	---help---
 	  Allow persistence based on the SIP Call-ID
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index e4a68dc..d481f9b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1715,7 +1715,7 @@
 		goto err2;
 	}
 
-	nft_trans_chain_policy(trans) = -1;
+	nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET;
 	if (nft_is_base_chain(chain))
 		nft_trans_chain_policy(trans) = policy;
 
@@ -3562,8 +3562,11 @@
 			      NFT_SET_OBJECT))
 			return -EINVAL;
 		/* Only one of these operations is supported */
-		if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) ==
-			     (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT))
+		if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) ==
+			     (NFT_SET_MAP | NFT_SET_OBJECT))
+			return -EOPNOTSUPP;
+		if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) ==
+			     (NFT_SET_EVAL | NFT_SET_OBJECT))
 			return -EOPNOTSUPP;
 	}
 
@@ -5595,6 +5598,22 @@
 }
 EXPORT_SYMBOL_GPL(nft_flowtable_lookup);
 
+void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx,
+				    struct nft_flowtable *flowtable,
+				    enum nft_trans_phase phase)
+{
+	switch (phase) {
+	case NFT_TRANS_PREPARE:
+	case NFT_TRANS_ABORT:
+	case NFT_TRANS_RELEASE:
+		flowtable->use--;
+		/* fall through */
+	default:
+		return;
+	}
+}
+EXPORT_SYMBOL_GPL(nf_tables_deactivate_flowtable);
+
 static struct nft_flowtable *
 nft_flowtable_lookup_byhandle(const struct nft_table *table,
 			      const struct nlattr *nla, u8 genmask)
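
nf_tables_deactivate_flowtable() above drops the use count only in the phases where the binding is going away, letting the fall-through collapse three cases into one decrement. A minimal sketch of that switch shape, with illustrative names:

    #include <stdio.h>

    enum trans_phase { PREPARE, ABORT, RELEASE, COMMIT };

    /* COMMIT keeps the count because the binding survives the
     * transaction; the other phases undo it. */
    static void deactivate(unsigned int *use, enum trans_phase phase)
    {
            switch (phase) {
            case PREPARE:
            case ABORT:
            case RELEASE:
                    (*use)--;
                    /* fall through */
            default:
                    return;
            }
    }

    int main(void)
    {
            unsigned int use = 2;

            deactivate(&use, ABORT);
            deactivate(&use, COMMIT);
            printf("%u\n", use); /* 1 */
            return 0;
    }
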
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index 21bb772..e546f75 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -313,7 +313,7 @@
 	policy = ppolicy ? *ppolicy : basechain->policy;
 
 	/* Only default policy to accept is supported for now. */
-	if (cmd == FLOW_BLOCK_BIND && policy != -1 && policy != NF_ACCEPT)
+	if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP)
 		return -EOPNOTSUPP;
 
 	if (dev->netdev_ops->ndo_setup_tc)
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 22cf236..f29bbc7 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -177,6 +177,23 @@
 	return nf_ct_netns_get(ctx->net, ctx->family);
 }
 
+static void nft_flow_offload_deactivate(const struct nft_ctx *ctx,
+					const struct nft_expr *expr,
+					enum nft_trans_phase phase)
+{
+	struct nft_flow_offload *priv = nft_expr_priv(expr);
+
+	nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase);
+}
+
+static void nft_flow_offload_activate(const struct nft_ctx *ctx,
+				      const struct nft_expr *expr)
+{
+	struct nft_flow_offload *priv = nft_expr_priv(expr);
+
+	priv->flowtable->use++;
+}
+
 static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
 				     const struct nft_expr *expr)
 {
@@ -205,6 +222,8 @@
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)),
 	.eval		= nft_flow_offload_eval,
 	.init		= nft_flow_offload_init,
+	.activate	= nft_flow_offload_activate,
+	.deactivate	= nft_flow_offload_deactivate,
 	.destroy	= nft_flow_offload_destroy,
 	.validate	= nft_flow_offload_validate,
 	.dump		= nft_flow_offload_dump,
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index c0560bf..660bad6 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -73,9 +73,6 @@
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 
-	if (set->flags & NFT_SET_EVAL)
-		return -EOPNOTSUPP;
-
 	priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]);
 	err = nft_validate_register_load(priv->sreg, set->klen);
 	if (err < 0)
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index 9b87429..8dfea26 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -1004,10 +1004,13 @@
 	    sock->type != SOCK_RAW)
 		return -ESOCKTNOSUPPORT;
 
-	if (sock->type == SOCK_RAW)
+	if (sock->type == SOCK_RAW) {
+		if (!capable(CAP_NET_RAW))
+			return -EPERM;
 		sock->ops = &llcp_rawsock_ops;
-	else
+	} else {
 		sock->ops = &llcp_sock_ops;
+	}
 
 	sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern);
 	if (sk == NULL)
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index dde9d76..f30e406 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2294,7 +2294,7 @@
 	[OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
 	[OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
 	[OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
-	[OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
+	[OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC },
 	[OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
 	[OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 },
 	[OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 },
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 6c8b0f6..88f98f2 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -150,6 +150,7 @@
 	list_del(&node->item);
 	mutex_unlock(&qrtr_node_lock);
 
+	cancel_work_sync(&node->work);
 	skb_queue_purge(&node->rx_queue);
 	kfree(node);
 }
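
The qrtr one-liner closes a use-after-free: a queued or running work item could still dereference the node after kfree(). cancel_work_sync() removes a pending instance and waits for a running one, so it must precede the free. A sketch of the ordering with the primitive stubbed:

    #include <stdlib.h>

    struct work { int pending; };
    struct node { struct work work; /* rx state lives here */ };

    /* Stub: the real cancel_work_sync() also blocks until any
     * currently executing instance of the work has finished. */
    static void cancel_work_sync(struct work *w)
    {
            w->pending = 0;
    }

    static void node_release(struct node *n)
    {
            cancel_work_sync(&n->work); /* first quiesce the worker... */
            free(n);                    /* ...then it is safe to free  */
    }

    int main(void)
    {
            node_release(calloc(1, sizeof(struct node)));
            return 0;
    }
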
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index 38ea7f0..c64e154 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -23,6 +23,6 @@
 	  This transport does not support RDMA operations.
 
 config RDS_DEBUG
-        bool "RDS debugging messages"
+	bool "RDS debugging messages"
 	depends on RDS
-        default n
+	default n
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 20c156a..5b5fb4c 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -244,7 +244,8 @@
 	 */
 	if (rs->rs_transport) {
 		trans = rs->rs_transport;
-		if (trans->laddr_check(sock_net(sock->sk),
+		if (!trans->laddr_check ||
+		    trans->laddr_check(sock_net(sock->sk),
 				       binding_addr, scope_id) != 0) {
 			ret = -ENOPROTOOPT;
 			goto out;
@@ -263,6 +264,8 @@
 
 	sock_set_flag(sk, SOCK_RCU_FREE);
 	ret = rds_add_bound(rs, binding_addr, &port, scope_id);
+	if (ret)
+		rs->rs_transport = NULL;
 
 out:
 	release_sock(sk);
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index b3faafe..2985509 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -324,7 +324,7 @@
 	tristate "Common Applications Kept Enhanced (CAKE)"
 	help
 	  Say Y here if you want to use the Common Applications Kept Enhanced
-          (CAKE) queue management algorithm.
+	  (CAKE) queue management algorithm.
 
 	  To compile this driver as a module, choose M here: the module
 	  will be called sch_cake.
@@ -730,8 +730,8 @@
 
 config NET_ACT_POLICE
 	tristate "Traffic Policing"
-        depends on NET_CLS_ACT
-        ---help---
+	depends on NET_CLS_ACT
+	---help---
 	  Say Y here if you want to do traffic policing, i.e. strict
 	  bandwidth limiting. This action replaces the existing policing
 	  module.
@@ -740,9 +740,9 @@
 	  module will be called act_police.
 
 config NET_ACT_GACT
-        tristate "Generic actions"
-        depends on NET_CLS_ACT
-        ---help---
+	tristate "Generic actions"
+	depends on NET_CLS_ACT
+	---help---
 	  Say Y here to take generic actions such as dropping and
 	  accepting packets.
 
@@ -750,15 +750,15 @@
 	  module will be called act_gact.
 
 config GACT_PROB
-        bool "Probability support"
-        depends on NET_ACT_GACT
-        ---help---
+	bool "Probability support"
+	depends on NET_ACT_GACT
+	---help---
 	  Say Y here to use the generic action randomly or deterministically.
 
 config NET_ACT_MIRRED
-        tristate "Redirecting and Mirroring"
-        depends on NET_CLS_ACT
-        ---help---
+	tristate "Redirecting and Mirroring"
+	depends on NET_CLS_ACT
+	---help---
 	  Say Y here to allow packets to be mirrored or redirected to
 	  other devices.
 
@@ -766,10 +766,10 @@
 	  module will be called act_mirred.
 
 config NET_ACT_SAMPLE
-        tristate "Traffic Sampling"
-        depends on NET_CLS_ACT
-        select PSAMPLE
-        ---help---
+	tristate "Traffic Sampling"
+	depends on NET_CLS_ACT
+	select PSAMPLE
+	---help---
 	  Say Y here to allow packet sampling tc action. The packet sample
 	  action consists of statistically choosing packets and sampling
 	  them using the psample module.
@@ -778,9 +778,9 @@
 	  module will be called act_sample.
 
 config NET_ACT_IPT
-        tristate "IPtables targets"
-        depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
-        ---help---
+	tristate "IPtables targets"
+	depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+	---help---
 	  Say Y here to be able to invoke iptables targets after successful
 	  classification.
 
@@ -788,9 +788,9 @@
 	  module will be called act_ipt.
 
 config NET_ACT_NAT
-        tristate "Stateless NAT"
-        depends on NET_CLS_ACT
-        ---help---
+	tristate "Stateless NAT"
+	depends on NET_CLS_ACT
+	---help---
 	  Say Y here to do stateless NAT on IPv4 packets.  You should use
 	  netfilter for NAT unless you know what you are doing.
 
@@ -798,18 +798,18 @@
 	  module will be called act_nat.
 
 config NET_ACT_PEDIT
-        tristate "Packet Editing"
-        depends on NET_CLS_ACT
-        ---help---
+	tristate "Packet Editing"
+	depends on NET_CLS_ACT
+	---help---
 	  Say Y here if you want to mangle the content of packets.
 
 	  To compile this code as a module, choose M here: the
 	  module will be called act_pedit.
 
 config NET_ACT_SIMP
-        tristate "Simple Example (Debug)"
-        depends on NET_CLS_ACT
-        ---help---
+	tristate "Simple Example (Debug)"
+	depends on NET_CLS_ACT
+	---help---
 	  Say Y here to add a simple action for demonstration purposes.
 	  It is meant as an example and for debugging purposes. It will
 	  print a configured policy string followed by the packet count
@@ -821,9 +821,9 @@
 	  module will be called act_simple.
 
 config NET_ACT_SKBEDIT
-        tristate "SKB Editing"
-        depends on NET_CLS_ACT
-        ---help---
+	tristate "SKB Editing"
+	depends on NET_CLS_ACT
+	---help---
 	  Say Y here to change skb priority or queue_mapping settings.
 
 	  If unsure, say N.
@@ -832,10 +832,10 @@
 	  module will be called act_skbedit.
 
 config NET_ACT_CSUM
-        tristate "Checksum Updating"
-        depends on NET_CLS_ACT && INET
-        select LIBCRC32C
-        ---help---
+	tristate "Checksum Updating"
+	depends on NET_CLS_ACT && INET
+	select LIBCRC32C
+	---help---
 	  Say Y here to update some common checksum after some direct
 	  packet alterations.
 
@@ -854,9 +854,9 @@
 	  module will be called act_mpls.
 
 config NET_ACT_VLAN
-        tristate "Vlan manipulation"
-        depends on NET_CLS_ACT
-        ---help---
+	tristate "Vlan manipulation"
+	depends on NET_CLS_ACT
+	---help---
 	  Say Y here to push or pop vlan headers.
 
 	  If unsure, say N.
@@ -865,9 +865,9 @@
 	  module will be called act_vlan.
 
 config NET_ACT_BPF
-        tristate "BPF based action"
-        depends on NET_CLS_ACT
-        ---help---
+	tristate "BPF based action"
+	depends on NET_CLS_ACT
+	---help---
 	  Say Y here to execute BPF code on packets. The BPF code will decide
 	  if the packet should be dropped or not.
 
@@ -877,10 +877,10 @@
 	  module will be called act_bpf.
 
 config NET_ACT_CONNMARK
-        tristate "Netfilter Connection Mark Retriever"
-        depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
-        depends on NF_CONNTRACK && NF_CONNTRACK_MARK
-        ---help---
+	tristate "Netfilter Connection Mark Retriever"
+	depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+	depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+	---help---
 	  Say Y here to allow retrieving of conn mark
 
 	  If unsure, say N.
@@ -889,10 +889,10 @@
 	  module will be called act_connmark.
 
 config NET_ACT_CTINFO
-        tristate "Netfilter Connection Mark Actions"
-        depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
-        depends on NF_CONNTRACK && NF_CONNTRACK_MARK
-        help
+	tristate "Netfilter Connection Mark Actions"
+	depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+	depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+	help
	  Say Y here to allow transfer of connmark-stored information.
 	  Current actions transfer connmark stored DSCP into
 	  ipv4/v6 diffserv and/or to transfer connmark to packet
@@ -906,21 +906,21 @@
 	  module will be called act_ctinfo.
 
 config NET_ACT_SKBMOD
-        tristate "skb data modification action"
-        depends on NET_CLS_ACT
-        ---help---
-         Say Y here to allow modification of skb data
+	tristate "skb data modification action"
+	depends on NET_CLS_ACT
+	---help---
+	 Say Y here to allow modification of skb data
 
-         If unsure, say N.
+	 If unsure, say N.
 
-         To compile this code as a module, choose M here: the
-         module will be called act_skbmod.
+	 To compile this code as a module, choose M here: the
+	 module will be called act_skbmod.
 
 config NET_ACT_IFE
-        tristate "Inter-FE action based on IETF ForCES InterFE LFB"
-        depends on NET_CLS_ACT
-        select NET_IFE
-        ---help---
+	tristate "Inter-FE action based on IETF ForCES InterFE LFB"
+	depends on NET_CLS_ACT
+	select NET_IFE
+	---help---
 	  Say Y here to allow for sourcing and terminating metadata
 	  For details refer to netdev01 paper:
 	  "Distributing Linux Traffic Control Classifier-Action Subsystem"
@@ -930,9 +930,9 @@
 	  module will be called act_ife.
 
 config NET_ACT_TUNNEL_KEY
-        tristate "IP tunnel metadata manipulation"
-        depends on NET_CLS_ACT
-        ---help---
+	tristate "IP tunnel metadata manipulation"
+	depends on NET_CLS_ACT
+	---help---
 	  Say Y here to set/release ip tunnel metadata.
 
 	  If unsure, say N.
@@ -941,9 +941,9 @@
 	  module will be called act_tunnel_key.
 
 config NET_ACT_CT
-        tristate "connection tracking tc action"
-        depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
-        help
+	tristate "connection tracking tc action"
+	depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
+	help
 	  Say Y here to allow sending the packets to conntrack module.
 
 	  If unsure, say N.
@@ -952,21 +952,20 @@
 	  module will be called act_ct.
 
 config NET_IFE_SKBMARK
-        tristate "Support to encoding decoding skb mark on IFE action"
-        depends on NET_ACT_IFE
+	tristate "Support to encoding decoding skb mark on IFE action"
+	depends on NET_ACT_IFE
 
 config NET_IFE_SKBPRIO
-        tristate "Support to encoding decoding skb prio on IFE action"
-        depends on NET_ACT_IFE
+	tristate "Support to encoding decoding skb prio on IFE action"
+	depends on NET_ACT_IFE
 
 config NET_IFE_SKBTCINDEX
-        tristate "Support to encoding decoding skb tcindex on IFE action"
-        depends on NET_ACT_IFE
+	tristate "Support to encoding decoding skb tcindex on IFE action"
+	depends on NET_ACT_IFE
 
 config NET_TC_SKB_EXT
 	bool "TC recirculation support"
 	depends on NET_CLS_ACT
-	default y if NET_CLS_ACT
 	select SKB_EXTENSIONS
 
 	help
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 3397122..2558f00 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -831,6 +831,15 @@
 	return c;
 }
 
+static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = {
+	[TCA_ACT_KIND]		= { .type = NLA_NUL_STRING,
+				    .len = IFNAMSIZ - 1 },
+	[TCA_ACT_INDEX]		= { .type = NLA_U32 },
+	[TCA_ACT_COOKIE]	= { .type = NLA_BINARY,
+				    .len = TC_COOKIE_MAX_SIZE },
+	[TCA_ACT_OPTIONS]	= { .type = NLA_NESTED },
+};
+
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 				    struct nlattr *nla, struct nlattr *est,
 				    char *name, int ovr, int bind,
@@ -846,8 +855,8 @@
 	int err;
 
 	if (name == NULL) {
-		err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL,
-						  extack);
+		err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+						  tcf_action_policy, extack);
 		if (err < 0)
 			goto err_out;
 		err = -EINVAL;
@@ -856,18 +865,9 @@
 			NL_SET_ERR_MSG(extack, "TC action kind must be specified");
 			goto err_out;
 		}
-		if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) {
-			NL_SET_ERR_MSG(extack, "TC action name too long");
-			goto err_out;
-		}
+		nla_strlcpy(act_name, kind, IFNAMSIZ);
+
 		if (tb[TCA_ACT_COOKIE]) {
-			int cklen = nla_len(tb[TCA_ACT_COOKIE]);
-
-			if (cklen > TC_COOKIE_MAX_SIZE) {
-				NL_SET_ERR_MSG(extack, "TC cookie size above the maximum");
-				goto err_out;
-			}
-
 			cookie = nla_memdup_cookie(tb);
 			if (!cookie) {
 				NL_SET_ERR_MSG(extack, "No memory to generate TC cookie");
@@ -1098,7 +1098,8 @@
 	int index;
 	int err;
 
-	err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack);
+	err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+					  tcf_action_policy, extack);
 	if (err < 0)
 		goto err_out;
 
@@ -1152,7 +1153,8 @@
 
 	b = skb_tail_pointer(skb);
 
-	err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack);
+	err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+					  tcf_action_policy, extack);
 	if (err < 0)
 		goto err_out;
 
@@ -1440,7 +1442,7 @@
 
 	if (tb[1] == NULL)
 		return NULL;
-	if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0)
+	if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], tcf_action_policy, NULL) < 0)
 		return NULL;
 	kind = tb2[TCA_ACT_KIND];
 
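
The act_api.c change replaces two open-coded length checks (the nla_strlcpy test on the action name and the cookie-size test) with a declarative nla_policy, so the netlink core rejects malformed attributes before any handler runs. A toy table-driven validator in the same spirit; the names and limits are illustrative:

    #include <stdio.h>
    #include <string.h>

    enum { ATTR_KIND, ATTR_COOKIE, ATTR_MAX };

    struct toy_policy { int max_len; };

    static const struct toy_policy policy[ATTR_MAX] = {
            [ATTR_KIND]   = { .max_len = 15 }, /* IFNAMSIZ - 1 on Linux */
            [ATTR_COOKIE] = { .max_len = 16 }, /* stand-in for TC_COOKIE_MAX_SIZE */
    };

    static int validate(int attr, int len)
    {
            return len <= policy[attr].max_len ? 0 : -1;
    }

    int main(void)
    {
            const char *kind = "this-action-name-is-far-too-long";

            printf("%d\n", validate(ATTR_KIND, (int)strlen(kind))); /* -1 */
            return 0;
    }
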
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 692c4c9..514456a 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -146,6 +146,7 @@
 	case ARPHRD_TUNNEL6:
 	case ARPHRD_SIT:
 	case ARPHRD_IPGRE:
+	case ARPHRD_IP6GRE:
 	case ARPHRD_VOID:
 	case ARPHRD_NONE:
 		return false;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 32577c2..64584a1 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -2894,8 +2894,10 @@
 void tcf_exts_destroy(struct tcf_exts *exts)
 {
 #ifdef CONFIG_NET_CLS_ACT
-	tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
-	kfree(exts->actions);
+	if (exts->actions) {
+		tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
+		kfree(exts->actions);
+	}
 	exts->nr_actions = 0;
 #endif
 }
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 1047825..81d58b2 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1390,7 +1390,8 @@
 }
 
 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
-	[TCA_KIND]		= { .type = NLA_STRING },
+	[TCA_KIND]		= { .type = NLA_NUL_STRING,
+				    .len = IFNAMSIZ - 1 },
 	[TCA_RATE]		= { .type = NLA_BINARY,
 				    .len = sizeof(struct tc_estimator) },
 	[TCA_STAB]		= { .type = NLA_NESTED },
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index 93b58fd..1bef152 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -392,7 +392,6 @@
 {
 	struct cbs_sched_data *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
-	int err;
 
 	if (!opt) {
 		NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory");
@@ -404,6 +403,10 @@
 	if (!q->qdisc)
 		return -ENOMEM;
 
+	spin_lock(&cbs_list_lock);
+	list_add(&q->cbs_list, &cbs_list);
+	spin_unlock(&cbs_list_lock);
+
 	qdisc_hash_add(q->qdisc, false);
 
 	q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
@@ -413,17 +416,7 @@
 
 	qdisc_watchdog_init(&q->watchdog, sch);
 
-	err = cbs_change(sch, opt, extack);
-	if (err)
-		return err;
-
-	if (!q->offload) {
-		spin_lock(&cbs_list_lock);
-		list_add(&q->cbs_list, &cbs_list);
-		spin_unlock(&cbs_list_lock);
-	}
-
-	return 0;
+	return cbs_change(sch, opt, extack);
 }
 
 static void cbs_destroy(struct Qdisc *sch)
@@ -431,15 +424,18 @@
 	struct cbs_sched_data *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
 
-	spin_lock(&cbs_list_lock);
-	list_del(&q->cbs_list);
-	spin_unlock(&cbs_list_lock);
+	/* Nothing to do if we couldn't create the underlying qdisc */
+	if (!q->qdisc)
+		return;
 
 	qdisc_watchdog_cancel(&q->watchdog);
 	cbs_disable_offload(dev, q);
 
-	if (q->qdisc)
-		qdisc_put(q->qdisc);
+	spin_lock(&cbs_list_lock);
+	list_del(&q->cbs_list);
+	spin_unlock(&cbs_list_lock);
+
+	qdisc_put(q->qdisc);
 }
 
 static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 7bcf20e..8184c87 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1302,6 +1302,7 @@
 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_HTB_MAX + 1];
+	struct Qdisc *parent_qdisc = NULL;
 	struct tc_htb_opt *hopt;
 	u64 rate64, ceil64;
 	int warn = 0;
@@ -1401,7 +1402,7 @@
 		if (parent && !parent->level) {
 			/* turn parent into inner node */
 			qdisc_purge_queue(parent->leaf.q);
-			qdisc_put(parent->leaf.q);
+			parent_qdisc = parent->leaf.q;
 			if (parent->prio_activity)
 				htb_deactivate(q, parent);
 
@@ -1480,6 +1481,7 @@
 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
 
 	sch_tree_unlock(sch);
+	qdisc_put(parent_qdisc);
 
 	if (warn)
 		pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n",
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index e108774..b2b7fdb 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -174,7 +174,8 @@
 {
 	struct multiq_sched_data *q = qdisc_priv(sch);
 	struct tc_multiq_qopt *qopt;
-	int i;
+	struct Qdisc **removed;
+	int i, n_removed = 0;
 
 	if (!netif_is_multiqueue(qdisc_dev(sch)))
 		return -EOPNOTSUPP;
@@ -185,6 +186,11 @@
 
 	qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
 
+	removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands),
+			  GFP_KERNEL);
+	if (!removed)
+		return -ENOMEM;
+
 	sch_tree_lock(sch);
 	q->bands = qopt->bands;
 	for (i = q->bands; i < q->max_bands; i++) {
@@ -192,13 +198,17 @@
 			struct Qdisc *child = q->queues[i];
 
 			q->queues[i] = &noop_qdisc;
-			qdisc_tree_flush_backlog(child);
-			qdisc_put(child);
+			qdisc_purge_queue(child);
+			removed[n_removed++] = child;
 		}
 	}
 
 	sch_tree_unlock(sch);
 
+	for (i = 0; i < n_removed; i++)
+		qdisc_put(removed[i]);
+	kfree(removed);
+
 	for (i = 0; i < q->bands; i++) {
 		if (q->queues[i] == &noop_qdisc) {
 			struct Qdisc *child, *old;
@@ -213,11 +223,10 @@
 				if (child != &noop_qdisc)
 					qdisc_hash_add(child, true);
 
-				if (old != &noop_qdisc) {
-					qdisc_tree_flush_backlog(old);
-					qdisc_put(old);
-				}
+				if (old != &noop_qdisc)
+					qdisc_purge_queue(old);
 				sch_tree_unlock(sch);
+				qdisc_put(old);
 			}
 		}
 	}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index b17f2ed..0e44039 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -476,7 +476,7 @@
 	 * skb will be queued.
 	 */
 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
-		struct Qdisc *rootq = qdisc_root(sch);
+		struct Qdisc *rootq = qdisc_root_bh(sch);
 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
 
 		q->duplicate = 0;
@@ -777,7 +777,7 @@
 	struct disttable *d;
 	int i;
 
-	if (n > NETEM_DIST_MAX)
+	if (!n || n > NETEM_DIST_MAX)
 		return -EINVAL;
 
 	d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
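
Besides switching the duplication path to qdisc_root_bh(), the sch_netem.c hunk rejects empty distribution tables: later lookups index the table modulo its size, so n == 0 is as fatal as an oversized request. A sketch of the guard; the limit mirrors NETEM_DIST_MAX but should be treated as illustrative:

    #include <stdio.h>
    #include <stdlib.h>

    #define DIST_MAX 16384 /* assumed to mirror NETEM_DIST_MAX */

    /* Reject both empty and oversized tables before allocating. */
    static short *alloc_dist_table(size_t n)
    {
            if (!n || n > DIST_MAX)
                    return NULL;
            return calloc(n, sizeof(short));
    }

    int main(void)
    {
            printf("%p\n", (void *)alloc_dist_table(0)); /* (nil) */
            free(alloc_dist_table(8));
            return 0;
    }
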
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 1dff850..d448fe3 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -488,7 +488,7 @@
 		      struct netlink_ext_ack *extack)
 {
 	struct sfb_sched_data *q = qdisc_priv(sch);
-	struct Qdisc *child;
+	struct Qdisc *child, *old;
 	struct nlattr *tb[TCA_SFB_MAX + 1];
 	const struct tc_sfb_qopt *ctl = &sfb_default_ops;
 	u32 limit;
@@ -518,8 +518,8 @@
 		qdisc_hash_add(child, true);
 	sch_tree_lock(sch);
 
-	qdisc_tree_flush_backlog(q->qdisc);
-	qdisc_put(q->qdisc);
+	qdisc_purge_queue(q->qdisc);
+	old = q->qdisc;
 	q->qdisc = child;
 
 	q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
@@ -542,6 +542,7 @@
 	sfb_init_perturbation(1, q);
 
 	sch_tree_unlock(sch);
+	qdisc_put(old);
 
 	return 0;
 }
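
The sfb change follows the same discipline as the htb and multiq hunks earlier in this diff: detach the old child qdisc while the tree lock is held, but defer qdisc_put() until after sch_tree_unlock(), since the teardown it triggers is not safe under that lock. A user-space sketch of the detach-then-release pattern:

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

    struct toy_qdisc { int dummy; };
    static struct toy_qdisc *active;

    static void heavyweight_release(struct toy_qdisc *q)
    {
            free(q); /* the kernel analogue may block or take other locks */
    }

    static void replace(struct toy_qdisc *new_q)
    {
            struct toy_qdisc *old;

            pthread_mutex_lock(&tree_lock);
            old = active;  /* detach under the lock */
            active = new_q;
            pthread_mutex_unlock(&tree_lock);

            heavyweight_release(old); /* release only after unlocking */
    }

    int main(void)
    {
            active = calloc(1, sizeof(*active));
            replace(calloc(1, sizeof(*active)));
            heavyweight_release(active);
            return 0;
    }
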
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e5f2fc7..dd860fe 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -215,7 +215,7 @@
 
 	rcu_read_lock();
 	res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
-		       tclass);
+		       tclass, sk->sk_priority);
 	rcu_read_unlock();
 	return res;
 }
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 4ce42c6..d75fddc 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1960,7 +1960,7 @@
 
 	if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len))
 		goto unwrap_failed;
-	if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset))
+	if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset))
 		goto unwrap_failed;
 	maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 6f1528f..a349094 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -373,7 +373,7 @@
 	spin_lock(&cache_list_lock);
 	cd->nextcheck = 0;
 	cd->entries = 0;
-	atomic_set(&cd->readers, 0);
+	atomic_set(&cd->writers, 0);
 	cd->last_close = 0;
 	cd->last_warn = -1;
 	list_add(&cd->others, &cache_list);
@@ -1029,11 +1029,13 @@
 		}
 		rp->offset = 0;
 		rp->q.reader = 1;
-		atomic_inc(&cd->readers);
+
 		spin_lock(&queue_lock);
 		list_add(&rp->q.list, &cd->queue);
 		spin_unlock(&queue_lock);
 	}
+	if (filp->f_mode & FMODE_WRITE)
+		atomic_inc(&cd->writers);
 	filp->private_data = rp;
 	return 0;
 }
@@ -1062,8 +1064,10 @@
 		filp->private_data = NULL;
 		kfree(rp);
 
+	}
+	if (filp->f_mode & FMODE_WRITE) {
+		atomic_dec(&cd->writers);
 		cd->last_close = seconds_since_boot();
-		atomic_dec(&cd->readers);
 	}
 	module_put(cd->owner);
 	return 0;
@@ -1171,7 +1175,7 @@
 
 static bool cache_listeners_exist(struct cache_detail *detail)
 {
-	if (atomic_read(&detail->readers))
+	if (atomic_read(&detail->writers))
 		return true;
 	if (detail->last_close == 0)
 		/* This cache was never opened */
@@ -1520,6 +1524,9 @@
 	cd->nextcheck = now;
 	cache_flush();
 
+	if (cd->flush)
+		cd->flush();
+
 	*ppos += count;
 	return count;
 }
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index a07b516..f7f7856 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1837,7 +1837,7 @@
 		return;
 	}
 
-	rpc_exit(task, -ERESTARTSYS);
+	rpc_call_rpcerror(task, -ERESTARTSYS);
 }
 
 static int
@@ -1862,6 +1862,7 @@
 		     req->rq_rbuffer,
 		     req->rq_rcvsize);
 
+	req->rq_reply_bytes_recvd = 0;
 	req->rq_snd_buf.head[0].iov_len = 0;
 	xdr_init_encode(&xdr, &req->rq_snd_buf,
 			req->rq_snd_buf.head[0].iov_base, req);
@@ -1881,6 +1882,8 @@
 	if (!rpc_task_need_encode(task))
 		goto out;
 	dprint_status(task);
+	/* Dequeue task from the receive queue while we're encoding */
+	xprt_request_dequeue_xprt(task);
 	/* Encode here so that rpcsec_gss can use correct sequence number. */
 	rpc_xdr_encode(task);
 	/* Did the encode result in an error condition? */
@@ -2479,6 +2482,7 @@
 	struct rpc_clnt	*clnt = task->tk_client;
 	struct rpc_rqst	*req = task->tk_rqstp;
 	struct xdr_stream xdr;
+	int err;
 
 	dprint_status(task);
 
@@ -2501,6 +2505,15 @@
 	 * before it changed req->rq_reply_bytes_recvd.
 	 */
 	smp_rmb();
+
+	/*
+	 * Did we ever call xprt_complete_rqst()? If not, we should assume
+	 * the message is incomplete.
+	 */
+	err = -EAGAIN;
+	if (!req->rq_reply_bytes_recvd)
+		goto out;
+
 	req->rq_rcv_buf.len = req->rq_private_buf.len;
 
 	/* Check that the softirq receive buffer is valid */
@@ -2509,7 +2522,9 @@
 
 	xdr_init_decode(&xdr, &req->rq_rcv_buf,
 			req->rq_rcv_buf.head[0].iov_base, req);
-	switch (rpc_decode_header(task, &xdr)) {
+	err = rpc_decode_header(task, &xdr);
+out:
+	switch (err) {
 	case 0:
 		task->tk_action = rpc_exit_task;
 		task->tk_status = rpcauth_unwrap_resp(task, &xdr);
@@ -2518,9 +2533,6 @@
 		return;
 	case -EAGAIN:
 		task->tk_status = 0;
-		xdr_free_bvec(&req->rq_rcv_buf);
-		req->rq_reply_bytes_recvd = 0;
-		req->rq_rcv_buf.len = 0;
 		if (task->tk_client->cl_discrtry)
 			xprt_conditional_disconnect(req->rq_xprt,
 						    req->rq_connect_cookie);
@@ -2561,7 +2573,7 @@
 	return 0;
 out_fail:
 	trace_rpc_bad_callhdr(task);
-	rpc_exit(task, error);
+	rpc_call_rpcerror(task, error);
 	return error;
 }
 
@@ -2628,7 +2640,7 @@
 		return -EAGAIN;
 	}
 out_err:
-	rpc_exit(task, error);
+	rpc_call_rpcerror(task, error);
 	return error;
 
 out_unparsable:
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 1f275ab..360afe1 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -541,33 +541,14 @@
 	return NULL;
 }
 
-static void
-rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
-		struct rpc_wait_queue *queue, struct rpc_task *task)
-{
-	rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL);
-}
-
 /*
  * Wake up a queued task while the queue lock is being held
  */
-static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue,
+					  struct rpc_task *task)
 {
-	rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
-}
-
-/*
- * Wake up a task on a specific queue
- */
-void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
-		struct rpc_wait_queue *queue,
-		struct rpc_task *task)
-{
-	if (!RPC_IS_QUEUED(task))
-		return;
-	spin_lock(&queue->lock);
-	rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
-	spin_unlock(&queue->lock);
+	rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue,
+						   task, NULL, NULL);
 }
 
 /*
@@ -930,8 +911,10 @@
 		/*
 		 * Signalled tasks should exit rather than sleep.
 		 */
-		if (RPC_SIGNALLED(task))
+		if (RPC_SIGNALLED(task)) {
+			task->tk_rpc_status = -ERESTARTSYS;
 			rpc_exit(task, -ERESTARTSYS);
+		}
 
 		/*
 		 * The queue->lock protects against races with
@@ -967,6 +950,7 @@
 			 */
 			dprintk("RPC: %5u got signal\n", task->tk_pid);
 			set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
+			task->tk_rpc_status = -ERESTARTSYS;
 			rpc_exit(task, -ERESTARTSYS);
 		}
 		dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 220b799..d11b705 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1233,8 +1233,8 @@
 
 	if (rqstp->rq_vers >= progp->pg_nvers )
 		goto err_bad_vers;
-	  versp = progp->pg_vers[rqstp->rq_vers];
-	  if (!versp)
+	versp = progp->pg_vers[rqstp->rq_vers];
+	if (!versp)
 		goto err_bad_vers;
 
 	/*
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 48c93b9..14ba9e7 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -560,7 +560,7 @@
  * required at the end of encoding, or any other time when the xdr_buf
  * data might be read.
  */
-void xdr_commit_encode(struct xdr_stream *xdr)
+inline void xdr_commit_encode(struct xdr_stream *xdr)
 {
 	int shift = xdr->scratch.iov_len;
 	void *page;
@@ -1236,43 +1236,60 @@
 }
 EXPORT_SYMBOL_GPL(xdr_encode_word);
 
-/* If the netobj starting offset bytes from the start of xdr_buf is contained
- * entirely in the head or the tail, set object to point to it; otherwise
- * try to find space for it at the end of the tail, copy it there, and
- * set obj to point to it. */
-int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset)
+/**
+ * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf
+ * @buf: pointer to buffer containing a mic
+ * @mic: on success, returns the address of the mic
+ * @offset: the offset in buf where mic may be found
+ *
+ * This function may modify the xdr buf if the mic is found to be straddling
+ * a boundary between head, pages, and tail.  On success the mic can be read
+ * from the address returned.  There is no need to free the mic.
+ *
+ * Return: Success returns 0, otherwise an integer error.
+ */
+int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset)
 {
 	struct xdr_buf subbuf;
+	unsigned int boundary;
 
-	if (xdr_decode_word(buf, offset, &obj->len))
+	if (xdr_decode_word(buf, offset, &mic->len))
 		return -EFAULT;
-	if (xdr_buf_subsegment(buf, &subbuf, offset + 4, obj->len))
+	offset += 4;
+
+	/* Is the mic partially in the head? */
+	boundary = buf->head[0].iov_len;
+	if (offset < boundary && (offset + mic->len) > boundary)
+		xdr_shift_buf(buf, boundary - offset);
+
+	/* Is the mic partially in the pages? */
+	boundary += buf->page_len;
+	if (offset < boundary && (offset + mic->len) > boundary)
+		xdr_shrink_pagelen(buf, boundary - offset);
+
+	if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len))
 		return -EFAULT;
 
-	/* Is the obj contained entirely in the head? */
-	obj->data = subbuf.head[0].iov_base;
-	if (subbuf.head[0].iov_len == obj->len)
+	/* Is the mic contained entirely in the head? */
+	mic->data = subbuf.head[0].iov_base;
+	if (subbuf.head[0].iov_len == mic->len)
 		return 0;
-	/* ..or is the obj contained entirely in the tail? */
-	obj->data = subbuf.tail[0].iov_base;
-	if (subbuf.tail[0].iov_len == obj->len)
+	/* ..or is the mic contained entirely in the tail? */
+	mic->data = subbuf.tail[0].iov_base;
+	if (subbuf.tail[0].iov_len == mic->len)
 		return 0;
 
-	/* use end of tail as storage for obj:
-	 * (We don't copy to the beginning because then we'd have
-	 * to worry about doing a potentially overlapping copy.
-	 * This assumes the object is at most half the length of the
-	 * tail.) */
-	if (obj->len > buf->buflen - buf->len)
+	/* Find a contiguous area in @buf to hold all of @mic */
+	if (mic->len > buf->buflen - buf->len)
 		return -ENOMEM;
 	if (buf->tail[0].iov_len != 0)
-		obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
+		mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
 	else
-		obj->data = buf->head[0].iov_base + buf->head[0].iov_len;
-	__read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len);
+		mic->data = buf->head[0].iov_base + buf->head[0].iov_len;
+	__read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(xdr_buf_read_netobj);
+EXPORT_SYMBOL_GPL(xdr_buf_read_mic);
 
 /* Returns 0 on success, or else a negative error code. */
 static int
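
The core of the new xdr_buf_read_mic() is the straddle test: an object at [offset, offset + len) crosses a segment boundary exactly when it starts before the boundary and ends after it, and only then does the buffer need shifting. A self-contained check with illustrative numbers:

    #include <stdio.h>

    static int straddles(unsigned int offset, unsigned int len,
                         unsigned int boundary)
    {
            return offset < boundary && offset + len > boundary;
    }

    int main(void)
    {
            unsigned int head_len = 128; /* end of the head segment */

            printf("%d\n", straddles(120, 16, head_len)); /* 1: shift needed */
            printf("%d\n", straddles(128, 16, head_len)); /* 0: fully in pages */
            return 0;
    }
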
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 2e71f54..8a45b3c 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -456,6 +456,12 @@
 }
 EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
 
+static void xprt_clear_congestion_window_wait_locked(struct rpc_xprt *xprt)
+{
+	if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state))
+		__xprt_lock_write_next_cong(xprt);
+}
+
 /*
  * Clear the congestion window wait flag and wake up the next
  * entry on xprt->sending
@@ -671,6 +677,7 @@
 	spin_lock(&xprt->transport_lock);
 	xprt_clear_connected(xprt);
 	xprt_clear_write_space_locked(xprt);
+	xprt_clear_congestion_window_wait_locked(xprt);
 	xprt_wake_pending_tasks(xprt, -ENOTCONN);
 	spin_unlock(&xprt->transport_lock);
 }
@@ -1324,6 +1331,36 @@
 }
 
 /**
+ * xprt_request_dequeue_xprt - remove a task from the transmit+receive queue
+ * @task: pointer to rpc_task
+ *
+ * Remove a task from the transmit and receive queues, and ensure that
+ * it is not pinned by the receive work item.
+ */
+void
+xprt_request_dequeue_xprt(struct rpc_task *task)
+{
+	struct rpc_rqst	*req = task->tk_rqstp;
+	struct rpc_xprt *xprt = req->rq_xprt;
+
+	if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) ||
+	    test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
+	    xprt_is_pinned_rqst(req)) {
+		spin_lock(&xprt->queue_lock);
+		xprt_request_dequeue_transmit_locked(task);
+		xprt_request_dequeue_receive_locked(task);
+		while (xprt_is_pinned_rqst(req)) {
+			set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
+			spin_unlock(&xprt->queue_lock);
+			xprt_wait_on_pinned_rqst(req);
+			spin_lock(&xprt->queue_lock);
+			clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
+		}
+		spin_unlock(&xprt->queue_lock);
+	}
+}
+
+/**
  * xprt_request_prepare - prepare an encoded request for transport
  * @req: pointer to rpc_rqst
  *
@@ -1747,28 +1784,6 @@
 	xprt_do_reserve(xprt, task);
 }
 
-static void
-xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req)
-{
-	struct rpc_xprt *xprt = req->rq_xprt;
-
-	if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) ||
-	    test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
-	    xprt_is_pinned_rqst(req)) {
-		spin_lock(&xprt->queue_lock);
-		xprt_request_dequeue_transmit_locked(task);
-		xprt_request_dequeue_receive_locked(task);
-		while (xprt_is_pinned_rqst(req)) {
-			set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
-			spin_unlock(&xprt->queue_lock);
-			xprt_wait_on_pinned_rqst(req);
-			spin_lock(&xprt->queue_lock);
-			clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
-		}
-		spin_unlock(&xprt->queue_lock);
-	}
-}
-
 /**
  * xprt_release - release an RPC request slot
  * @task: task which is finished with the slot
@@ -1788,7 +1803,7 @@
 	}
 
 	xprt = req->rq_xprt;
-	xprt_request_dequeue_all(task, req);
+	xprt_request_dequeue_xprt(task);
 	spin_lock(&xprt->transport_lock);
 	xprt->ops->release_xprt(xprt, task);
 	if (xprt->ops->release_request)
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 59e624b..50e075f 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -54,9 +54,7 @@
 
 unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt)
 {
-	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
-
-	return r_xprt->rx_buf.rb_bc_srv_max_requests;
+	return RPCRDMA_BACKWARD_WRS >> 1;
 }
 
 static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 0b6dad7..30065a2 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -7,67 +7,37 @@
 /* Lightweight memory registration using Fast Registration Work
  * Requests (FRWR).
  *
- * FRWR features ordered asynchronous registration and deregistration
- * of arbitrarily sized memory regions. This is the fastest and safest
+ * FRWR features ordered asynchronous registration and invalidation
+ * of arbitrarily-sized memory regions. This is the fastest and safest
  * but most complex memory registration mode.
  */
 
 /* Normal operation
  *
- * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG
+ * A Memory Region is prepared for RDMA Read or Write using a FAST_REG
  * Work Request (frwr_map). When the RDMA operation is finished, this
  * Memory Region is invalidated using a LOCAL_INV Work Request
- * (frwr_unmap_sync).
+ * (frwr_unmap_async and frwr_unmap_sync).
  *
- * Typically these Work Requests are not signaled, and neither are RDMA
- * SEND Work Requests (with the exception of signaling occasionally to
- * prevent provider work queue overflows). This greatly reduces HCA
+ * Typically FAST_REG Work Requests are not signaled, and neither are
+ * RDMA Send Work Requests (with the exception of signaling occasionally
+ * to prevent provider work queue overflows). This greatly reduces HCA
  * interrupt workload.
- *
- * As an optimization, frwr_unmap marks MRs INVALID before the
- * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on
- * rb_mrs immediately so that no work (like managing a linked list
- * under a spinlock) is needed in the completion upcall.
- *
- * But this means that frwr_map() can occasionally encounter an MR
- * that is INVALID but the LOCAL_INV WR has not completed. Work Queue
- * ordering prevents a subsequent FAST_REG WR from executing against
- * that MR while it is still being invalidated.
  */
 
 /* Transport recovery
  *
- * ->op_map and the transport connect worker cannot run at the same
- * time, but ->op_unmap can fire while the transport connect worker
- * is running. Thus MR recovery is handled in ->op_map, to guarantee
- * that recovered MRs are owned by a sending RPC, and not one where
- * ->op_unmap could fire at the same time transport reconnect is
- * being done.
+ * frwr_map and frwr_unmap_* cannot run at the same time the transport
+ * connect worker is running. The connect worker holds the transport
+ * send lock, just as ->send_request does. This prevents frwr_map and
+ * the connect worker from running concurrently. When a connection is
+ * closed, the Receive completion queue is drained before allowing
+ * the connect worker to get control. This prevents frwr_unmap and the
+ * connect worker from running concurrently.
  *
- * When the underlying transport disconnects, MRs are left in one of
- * four states:
- *
- * INVALID:	The MR was not in use before the QP entered ERROR state.
- *
- * VALID:	The MR was registered before the QP entered ERROR state.
- *
- * FLUSHED_FR:	The MR was being registered when the QP entered ERROR
- *		state, and the pending WR was flushed.
- *
- * FLUSHED_LI:	The MR was being invalidated when the QP entered ERROR
- *		state, and the pending WR was flushed.
- *
- * When frwr_map encounters FLUSHED and VALID MRs, they are recovered
- * with ib_dereg_mr and then are re-initialized. Because MR recovery
- * allocates fresh resources, it is deferred to a workqueue, and the
- * recovered MRs are placed back on the rb_mrs list when recovery is
- * complete. frwr_map allocates another MR for the current RPC while
- * the broken MR is reset.
- *
- * To ensure that frwr_map doesn't encounter an MR that is marked
- * INVALID but that is about to be flushed due to a previous transport
- * disconnect, the transport connect worker attempts to drain all
- * pending send queue WRs before the transport is reconnected.
+ * When the underlying transport disconnects, MRs that are in flight
+ * are flushed and are likely unusable. Thus all flushed MRs are
+ * destroyed. New MRs are created on demand.
  */
 
 #include <linux/sunrpc/rpc_rdma.h>
@@ -118,15 +88,8 @@
 	kfree(mr);
 }
 
-/* MRs are dynamically allocated, so simply clean up and release the MR.
- * A replacement MR will subsequently be allocated on demand.
- */
-static void
-frwr_mr_recycle_worker(struct work_struct *work)
+static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
 {
-	struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle);
-	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
-
 	trace_xprtrdma_mr_recycle(mr);
 
 	if (mr->mr_dir != DMA_NONE) {
@@ -136,14 +99,40 @@
 		mr->mr_dir = DMA_NONE;
 	}
 
-	spin_lock(&r_xprt->rx_buf.rb_mrlock);
+	spin_lock(&r_xprt->rx_buf.rb_lock);
 	list_del(&mr->mr_all);
 	r_xprt->rx_stats.mrs_recycled++;
-	spin_unlock(&r_xprt->rx_buf.rb_mrlock);
+	spin_unlock(&r_xprt->rx_buf.rb_lock);
 
 	frwr_release_mr(mr);
 }
 
+/* MRs are dynamically allocated, so simply clean up and release the MR.
+ * A replacement MR will subsequently be allocated on demand.
+ */
+static void
+frwr_mr_recycle_worker(struct work_struct *work)
+{
+	struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr,
+					     mr_recycle);
+
+	frwr_mr_recycle(mr->mr_xprt, mr);
+}
+
+/* frwr_recycle - Discard MRs
+ * @req: request whose MRs should be discarded
+ *
+ * Used after a reconnect. These MRs could still be in flight; we can't
+ * tell. The safe thing to do is release them.
+ */
+void frwr_recycle(struct rpcrdma_req *req)
+{
+	struct rpcrdma_mr *mr;
+
+	while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
+		frwr_mr_recycle(mr->mr_xprt, mr);
+}
+
 /* frwr_reset - Place MRs back on the free list
  * @req: request to reset
  *
@@ -156,12 +145,10 @@
  */
 void frwr_reset(struct rpcrdma_req *req)
 {
-	while (!list_empty(&req->rl_registered)) {
-		struct rpcrdma_mr *mr;
+	struct rpcrdma_mr *mr;
 
-		mr = rpcrdma_mr_pop(&req->rl_registered);
-		rpcrdma_mr_unmap_and_put(mr);
-	}
+	while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
+		rpcrdma_mr_put(mr);
 }
 
 /**
@@ -179,11 +166,14 @@
 	struct ib_mr *frmr;
 	int rc;
 
+	/* NB: ib_alloc_mr and device drivers typically allocate
+	 *     memory with GFP_KERNEL.
+	 */
 	frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
 	if (IS_ERR(frmr))
 		goto out_mr_err;
 
-	sg = kcalloc(depth, sizeof(*sg), GFP_KERNEL);
+	sg = kcalloc(depth, sizeof(*sg), GFP_NOFS);
 	if (!sg)
 		goto out_list_err;
 
@@ -203,8 +193,6 @@
 	return rc;
 
 out_list_err:
-	dprintk("RPC:       %s: sg allocation failure\n",
-		__func__);
 	ib_dereg_mr(frmr);
 	return -ENOMEM;
 }
@@ -290,8 +278,8 @@
 	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
 	ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
 
-	ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
-				ia->ri_max_frwr_depth);
+	ia->ri_max_segs =
+		DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
 	/* Reply chunks require segments for head and tail buffers */
 	ia->ri_max_segs += 2;
 	if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS)
@@ -323,31 +311,25 @@
  * @nsegs: number of segments remaining
  * @writing: true when RDMA Write will be used
  * @xid: XID of RPC using the registered memory
- * @out: initialized MR
+ * @mr: MR to fill in
  *
  * Prepare a REG_MR Work Request to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  *
  * Returns the next segment or a negative errno pointer.
- * On success, the prepared MR is planted in @out.
+ * On success, @mr is filled in.
  */
 struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 				struct rpcrdma_mr_seg *seg,
 				int nsegs, bool writing, __be32 xid,
-				struct rpcrdma_mr **out)
+				struct rpcrdma_mr *mr)
 {
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS;
-	struct rpcrdma_mr *mr;
-	struct ib_mr *ibmr;
 	struct ib_reg_wr *reg_wr;
+	struct ib_mr *ibmr;
 	int i, n;
 	u8 key;
 
-	mr = rpcrdma_mr_get(r_xprt);
-	if (!mr)
-		goto out_getmr_err;
-
 	if (nsegs > ia->ri_max_frwr_depth)
 		nsegs = ia->ri_max_frwr_depth;
 	for (i = 0; i < nsegs;) {
@@ -362,7 +344,7 @@
 
 		++seg;
 		++i;
-		if (holes_ok)
+		if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS)
 			continue;
 		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
 		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
@@ -397,22 +379,15 @@
 	mr->mr_offset = ibmr->iova;
 	trace_xprtrdma_mr_map(mr);
 
-	*out = mr;
 	return seg;
 
-out_getmr_err:
-	xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
-	return ERR_PTR(-EAGAIN);
-
 out_dmamap_err:
 	mr->mr_dir = DMA_NONE;
 	trace_xprtrdma_frwr_sgerr(mr, i);
-	rpcrdma_mr_put(mr);
 	return ERR_PTR(-EIO);
 
 out_mapmr_err:
 	trace_xprtrdma_frwr_maperr(mr, n);
-	rpcrdma_mr_recycle(mr);
 	return ERR_PTR(-EIO);
 }
 
@@ -485,7 +460,7 @@
 		if (mr->mr_handle == rep->rr_inv_rkey) {
 			list_del_init(&mr->mr_list);
 			trace_xprtrdma_mr_remoteinv(mr);
-			rpcrdma_mr_unmap_and_put(mr);
+			rpcrdma_mr_put(mr);
 			break;	/* only one invalidated MR per RPC */
 		}
 }
@@ -495,7 +470,7 @@
 	if (wc->status != IB_WC_SUCCESS)
 		rpcrdma_mr_recycle(mr);
 	else
-		rpcrdma_mr_unmap_and_put(mr);
+		rpcrdma_mr_put(mr);
 }
 
 /**
@@ -532,8 +507,8 @@
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
 	trace_xprtrdma_wc_li_wake(wc, frwr);
-	complete(&frwr->fr_linv_done);
 	__frwr_release_mr(wc, mr);
+	complete(&frwr->fr_linv_done);
 }
 
 /**
@@ -562,8 +537,7 @@
 	 */
 	frwr = NULL;
 	prev = &first;
-	while (!list_empty(&req->rl_registered)) {
-		mr = rpcrdma_mr_pop(&req->rl_registered);
+	while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
 
 		trace_xprtrdma_mr_localinv(mr);
 		r_xprt->rx_stats.local_inv_needed++;
@@ -632,11 +606,15 @@
 	struct rpcrdma_frwr *frwr =
 		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
 	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+	struct rpcrdma_rep *rep = mr->mr_req->rl_reply;
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
 	trace_xprtrdma_wc_li_done(wc, frwr);
-	rpcrdma_complete_rqst(frwr->fr_req->rl_reply);
 	__frwr_release_mr(wc, mr);
+
+	/* Ensure @rep is generated before __frwr_release_mr */
+	smp_rmb();
+	rpcrdma_complete_rqst(rep);
 }
 
 /**
@@ -662,15 +640,13 @@
 	 */
 	frwr = NULL;
 	prev = &first;
-	while (!list_empty(&req->rl_registered)) {
-		mr = rpcrdma_mr_pop(&req->rl_registered);
+	while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
 
 		trace_xprtrdma_mr_localinv(mr);
 		r_xprt->rx_stats.local_inv_needed++;
 
 		frwr = &mr->frwr;
 		frwr->fr_cqe.done = frwr_wc_localinv;
-		frwr->fr_req = req;
 		last = &frwr->fr_invwr;
 		last->next = NULL;
 		last->wr_cqe = &frwr->fr_cqe;
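Two loops above, in frwr_unmap_sync() and frwr_unmap_async(), were converted from an explicit list_empty() check to the NULL-returning pop idiom, matching the reworked rpcrdma_mr_pop() in xprt_rdma.h further below. A standalone sketch of the idiom with hypothetical types; the kernel helper wraps list_first_entry_or_null():

#include <stdio.h>
#include <stddef.h>

struct mr {
	struct mr *next;
};

/* The empty check lives in the helper, so callers need no list_empty(). */
static struct mr *mr_pop(struct mr **list)
{
	struct mr *mr = *list;

	if (mr)
		*list = mr->next;
	return mr;
}

int main(void)
{
	struct mr c = { NULL }, b = { &c }, a = { &b };
	struct mr *list = &a, *mr;

	while ((mr = mr_pop(&list)))	/* loop ends when pop returns NULL */
		printf("popped %p\n", (void *)mr);
	return 0;
}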
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 4345e69..b86b5fd 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -342,6 +342,32 @@
 	return 0;
 }
 
+static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
+						 struct rpcrdma_req *req,
+						 struct rpcrdma_mr_seg *seg,
+						 int nsegs, bool writing,
+						 struct rpcrdma_mr **mr)
+{
+	*mr = rpcrdma_mr_pop(&req->rl_free_mrs);
+	if (!*mr) {
+		*mr = rpcrdma_mr_get(r_xprt);
+		if (!*mr)
+			goto out_getmr_err;
+		trace_xprtrdma_mr_get(req);
+		(*mr)->mr_req = req;
+	}
+
+	rpcrdma_mr_push(*mr, &req->rl_registered);
+	return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);
+
+out_getmr_err:
+	trace_xprtrdma_nomrs(req);
+	xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
+	if (r_xprt->rx_ep.rep_connected != -ENODEV)
+		schedule_work(&r_xprt->rx_buf.rb_refresh_worker);
+	return ERR_PTR(-EAGAIN);
+}
+
 /* Register and XDR encode the Read list. Supports encoding a list of read
  * segments that belong to a single read chunk.
  *
@@ -356,9 +382,10 @@
  *
  * Only a single @pos value is currently supported.
  */
-static noinline int
-rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
-			 struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype)
+static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
+				    struct rpcrdma_req *req,
+				    struct rpc_rqst *rqst,
+				    enum rpcrdma_chunktype rtype)
 {
 	struct xdr_stream *xdr = &req->rl_stream;
 	struct rpcrdma_mr_seg *seg;
@@ -379,10 +406,9 @@
 		return nsegs;
 
 	do {
-		seg = frwr_map(r_xprt, seg, nsegs, false, rqst->rq_xid, &mr);
+		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr);
 		if (IS_ERR(seg))
 			return PTR_ERR(seg);
-		rpcrdma_mr_push(mr, &req->rl_registered);
 
 		if (encode_read_segment(xdr, mr, pos) < 0)
 			return -EMSGSIZE;
@@ -411,9 +437,10 @@
  *
  * Only a single Write chunk is currently supported.
  */
-static noinline int
-rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
-			  struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
+static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
+				     struct rpcrdma_req *req,
+				     struct rpc_rqst *rqst,
+				     enum rpcrdma_chunktype wtype)
 {
 	struct xdr_stream *xdr = &req->rl_stream;
 	struct rpcrdma_mr_seg *seg;
@@ -440,10 +467,9 @@
 
 	nchunks = 0;
 	do {
-		seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr);
+		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
 		if (IS_ERR(seg))
 			return PTR_ERR(seg);
-		rpcrdma_mr_push(mr, &req->rl_registered);
 
 		if (encode_rdma_segment(xdr, mr) < 0)
 			return -EMSGSIZE;
@@ -474,9 +500,10 @@
  * Returns zero on success, or a negative errno if a failure occurred.
  * @xdr is advanced to the next position in the stream.
  */
-static noinline int
-rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
-			   struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
+static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
+				      struct rpcrdma_req *req,
+				      struct rpc_rqst *rqst,
+				      enum rpcrdma_chunktype wtype)
 {
 	struct xdr_stream *xdr = &req->rl_stream;
 	struct rpcrdma_mr_seg *seg;
@@ -501,10 +528,9 @@
 
 	nchunks = 0;
 	do {
-		seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr);
+		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
 		if (IS_ERR(seg))
 			return PTR_ERR(seg);
-		rpcrdma_mr_push(mr, &req->rl_registered);
 
 		if (encode_rdma_segment(xdr, mr) < 0)
 			return -EMSGSIZE;
@@ -841,12 +867,7 @@
 	 * chunks. Very likely the connection has been replaced,
 	 * so these registrations are invalid and unusable.
 	 */
-	while (unlikely(!list_empty(&req->rl_registered))) {
-		struct rpcrdma_mr *mr;
-
-		mr = rpcrdma_mr_pop(&req->rl_registered);
-		rpcrdma_mr_recycle(mr);
-	}
+	frwr_recycle(req);
 
 	/* This implementation supports the following combinations
 	 * of chunk lists in one RPC-over-RDMA Call message:
@@ -1240,8 +1261,6 @@
 	struct rpc_rqst *rqst = rep->rr_rqst;
 	int status;
 
-	xprt->reestablish_timeout = 0;
-
 	switch (rep->rr_proc) {
 	case rdma_msg:
 		status = rpcrdma_decode_msg(r_xprt, rep, rqst);
@@ -1300,6 +1319,12 @@
 	u32 credits;
 	__be32 *p;
 
+	/* Any data means we had a useful conversation, so
+	 * we don't need to delay the next reconnect.
+	 */
+	if (xprt->reestablish_timeout)
+		xprt->reestablish_timeout = 0;
+
 	/* Fixed transport header fields */
 	xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
 			rep->rr_hdrbuf.head[0].iov_base, NULL);
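The new rpcrdma_mr_prepare() above layers two free lists: the per-request rl_free_mrs cache is tried first without any locking, then the shared buffer pool under its spinlock, and only when both are empty does it arrange a refill and return -EAGAIN so the caller backs off. A standalone sketch of that two-tier allocation, with hypothetical names and a plain flag standing in for schedule_work():

#include <errno.h>
#include <pthread.h>
#include <stddef.h>

struct mr { struct mr *next; };

struct req {
	struct mr *free_mrs;	/* per-request cache: owner-only, no lock */
};

struct pool {
	pthread_mutex_t lock;	/* protects mrs */
	struct mr *mrs;		/* shared pool */
	int refresh_wanted;	/* stand-in for schedule_work() */
};

/* Returns an MR, or NULL with errno set to EAGAIN after arranging a refill. */
static struct mr *mr_prepare(struct pool *pool, struct req *req)
{
	struct mr *mr = req->free_mrs;		/* tier 1: private cache */

	if (mr) {
		req->free_mrs = mr->next;
		return mr;
	}

	pthread_mutex_lock(&pool->lock);	/* tier 2: shared pool */
	mr = pool->mrs;
	if (mr)
		pool->mrs = mr->next;
	pthread_mutex_unlock(&pool->lock);

	if (!mr) {
		pool->refresh_wanted = 1;	/* kick the refill worker */
		errno = EAGAIN;			/* caller retries later */
	}
	return mr;
}

int main(void)
{
	struct mr m = { NULL };
	struct pool pool = { PTHREAD_MUTEX_INITIALIZER, &m, 0 };
	struct req req = { NULL };

	struct mr *got = mr_prepare(&pool, &req);	/* from the pool */
	struct mr *none = mr_prepare(&pool, &req);	/* NULL, EAGAIN */

	return got && !none && errno == EAGAIN ? 0 : 1;
}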
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index abdb300..97bca50 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -73,8 +73,6 @@
 atomic_t rdma_stat_sq_poll;
 atomic_t rdma_stat_sq_prod;
 
-struct workqueue_struct *svc_rdma_wq;
-
 /*
  * This function implements reading and resetting an atomic_t stat
  * variable through read/write to a proc file. Any write to the file
@@ -230,7 +228,6 @@
 void svc_rdma_cleanup(void)
 {
 	dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
-	destroy_workqueue(svc_rdma_wq);
 	if (svcrdma_table_header) {
 		unregister_sysctl_table(svcrdma_table_header);
 		svcrdma_table_header = NULL;
@@ -246,10 +243,6 @@
 	dprintk("\tmax_bc_requests  : %u\n", svcrdma_max_bc_requests);
 	dprintk("\tmax_inline       : %d\n", svcrdma_max_req_size);
 
-	svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
-	if (!svc_rdma_wq)
-		return -ENOMEM;
-
 	if (!svcrdma_table_header)
 		svcrdma_table_header =
 			register_sysctl_table(svcrdma_root_table);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 65e2fb9..96bccd3 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -172,9 +172,10 @@
 void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
 {
 	struct svc_rdma_recv_ctxt *ctxt;
+	struct llist_node *node;
 
-	while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts))) {
-		list_del(&ctxt->rc_list);
+	while ((node = llist_del_first(&rdma->sc_recv_ctxts))) {
+		ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
 		svc_rdma_recv_ctxt_destroy(rdma, ctxt);
 	}
 }
@@ -183,21 +184,18 @@
 svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
 {
 	struct svc_rdma_recv_ctxt *ctxt;
+	struct llist_node *node;
 
-	spin_lock(&rdma->sc_recv_lock);
-	ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts);
-	if (!ctxt)
+	node = llist_del_first(&rdma->sc_recv_ctxts);
+	if (!node)
 		goto out_empty;
-	list_del(&ctxt->rc_list);
-	spin_unlock(&rdma->sc_recv_lock);
+	ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
 
 out:
 	ctxt->rc_page_count = 0;
 	return ctxt;
 
 out_empty:
-	spin_unlock(&rdma->sc_recv_lock);
-
 	ctxt = svc_rdma_recv_ctxt_alloc(rdma);
 	if (!ctxt)
 		return NULL;
@@ -218,11 +216,9 @@
 	for (i = 0; i < ctxt->rc_page_count; i++)
 		put_page(ctxt->rc_pages[i]);
 
-	if (!ctxt->rc_temp) {
-		spin_lock(&rdma->sc_recv_lock);
-		list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts);
-		spin_unlock(&rdma->sc_recv_lock);
-	} else
+	if (!ctxt->rc_temp)
+		llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
+	else
 		svc_rdma_recv_ctxt_destroy(rdma, ctxt);
 }
 
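The svc_rdma_recv_ctxt changes above swap a spinlock-protected list for an llist: llist_add() may be called from any context without a lock, while llist_del_first() callers must serialize among themselves. A minimal userspace rendition of that contract with C11 atomics and hypothetical names; a sketch of the idea, not the kernel's llist implementation:

#include <stdatomic.h>
#include <stddef.h>

struct lnode { struct lnode *next; };

struct lhead { _Atomic(struct lnode *) first; };

/* Lock-free push: safe from any number of concurrent callers. */
static void lpush(struct lhead *h, struct lnode *n)
{
	struct lnode *old = atomic_load_explicit(&h->first,
						 memory_order_relaxed);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak_explicit(&h->first, &old, n,
							memory_order_release,
							memory_order_relaxed));
}

/* Pop: callers must serialize, like llist_del_first(). With a single
 * consumer, n->next cannot change under us once we hold the head.
 */
static struct lnode *lpop_first(struct lhead *h)
{
	struct lnode *n = atomic_load_explicit(&h->first,
					       memory_order_acquire);

	while (n && !atomic_compare_exchange_weak_explicit(&h->first, &n,
							   n->next,
							   memory_order_acquire,
							   memory_order_acquire))
		;
	return n;
}

int main(void)
{
	struct lhead h = { NULL };
	struct lnode n1, n2;

	lpush(&h, &n1);
	lpush(&h, &n2);
	while (lpop_first(&h))	/* drain, like recv_ctxts_destroy() above */
		;
	return 0;
}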
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 4d3db6e..145a361 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -140,14 +140,13 @@
 	INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts);
-	INIT_LIST_HEAD(&cma_xprt->sc_recv_ctxts);
+	init_llist_head(&cma_xprt->sc_recv_ctxts);
 	INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
 	init_waitqueue_head(&cma_xprt->sc_send_wait);
 
 	spin_lock_init(&cma_xprt->sc_lock);
 	spin_lock_init(&cma_xprt->sc_rq_dto_lock);
 	spin_lock_init(&cma_xprt->sc_send_lock);
-	spin_lock_init(&cma_xprt->sc_recv_lock);
 	spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
 
 	/*
@@ -630,8 +629,9 @@
 {
 	struct svcxprt_rdma *rdma =
 		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
 	INIT_WORK(&rdma->sc_work, __svc_rdma_free);
-	queue_work(svc_rdma_wq, &rdma->sc_work);
+	schedule_work(&rdma->sc_work);
 }
 
 static int svc_rdma_has_wspace(struct svc_xprt *xprt)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 2ec349e..160558b 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -423,8 +423,6 @@
 
 	if (ep->rep_connected == -ENODEV)
 		return;
-	if (ep->rep_connected > 0)
-		xprt->reestablish_timeout = 0;
 	rpcrdma_ep_disconnect(ep, ia);
 
 	/* Prepare @xprt for the next connection by reinitializing
@@ -434,6 +432,7 @@
 	xprt->cwnd = RPC_CWNDSHIFT;
 
 out:
+	xprt->reestablish_timeout = 0;
 	++xprt->connect_cookie;
 	xprt_disconnect_done(xprt);
 }
@@ -494,9 +493,9 @@
  * @reconnect_timeout: reconnect timeout after server disconnects
  *
  */
-static void xprt_rdma_tcp_set_connect_timeout(struct rpc_xprt *xprt,
-					      unsigned long connect_timeout,
-					      unsigned long reconnect_timeout)
+static void xprt_rdma_set_connect_timeout(struct rpc_xprt *xprt,
+					  unsigned long connect_timeout,
+					  unsigned long reconnect_timeout)
 {
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 
@@ -571,6 +570,7 @@
 	return;
 
 out_sleep:
+	set_bit(XPRT_CONGESTED, &xprt->state);
 	rpc_sleep_on(&xprt->backlog, task, NULL);
 	task->tk_status = -EAGAIN;
 }
@@ -589,7 +589,8 @@
 
 	memset(rqst, 0, sizeof(*rqst));
 	rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
-	rpc_wake_up_next(&xprt->backlog);
+	if (unlikely(!rpc_wake_up_next(&xprt->backlog)))
+		clear_bit(XPRT_CONGESTED, &xprt->state);
 }
 
 static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt,
@@ -803,7 +804,7 @@
 	.send_request		= xprt_rdma_send_request,
 	.close			= xprt_rdma_close,
 	.destroy		= xprt_rdma_destroy,
-	.set_connect_timeout	= xprt_rdma_tcp_set_connect_timeout,
+	.set_connect_timeout	= xprt_rdma_set_connect_timeout,
 	.print_stats		= xprt_rdma_print_stats,
 	.enable_swap		= xprt_rdma_enable_swap,
 	.disable_swap		= xprt_rdma_disable_swap,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index b10aa16..3a90753 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -53,6 +53,7 @@
 #include <linux/slab.h>
 #include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/svc_rdma.h>
+#include <linux/log2.h>
 
 #include <asm-generic/barrier.h>
 #include <asm/bitops.h>
@@ -74,8 +75,10 @@
  * internal functions
  */
 static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc);
+static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf);
 static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
 static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf);
+static void rpcrdma_mr_free(struct rpcrdma_mr *mr);
 static struct rpcrdma_regbuf *
 rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
 		     gfp_t flags);
@@ -405,9 +408,8 @@
 	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct rpcrdma_req *req;
-	struct rpcrdma_rep *rep;
 
-	cancel_delayed_work_sync(&buf->rb_refresh_worker);
+	cancel_work_sync(&buf->rb_refresh_worker);
 
 	/* This is similar to rpcrdma_ep_destroy, but:
 	 * - Don't cancel the connect worker.
@@ -429,8 +431,7 @@
 	/* The ULP is responsible for ensuring all DMA
 	 * mappings and MRs are gone.
 	 */
-	list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list)
-		rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf);
+	rpcrdma_reps_destroy(buf);
 	list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
 		rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf);
 		rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
@@ -604,10 +605,10 @@
  * Unlike a normal reconnection, a fresh PD and a new set
  * of MRs and buffers is needed.
  */
-static int
-rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
-			 struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
+				    struct ib_qp_init_attr *qp_init_attr)
 {
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	int rc, err;
 
 	trace_xprtrdma_reinsert(r_xprt);
@@ -624,7 +625,7 @@
 	}
 
 	rc = -ENETUNREACH;
-	err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+	err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr);
 	if (err) {
 		pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
 		goto out3;
@@ -641,16 +642,16 @@
 	return rc;
 }
 
-static int
-rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
-		     struct rpcrdma_ia *ia)
+static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt,
+				struct ib_qp_init_attr *qp_init_attr)
 {
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rdma_cm_id *id, *old;
 	int err, rc;
 
 	trace_xprtrdma_reconnect(r_xprt);
 
-	rpcrdma_ep_disconnect(ep, ia);
+	rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia);
 
 	rc = -EHOSTUNREACH;
 	id = rpcrdma_create_id(r_xprt, ia);
@@ -672,7 +673,7 @@
 		goto out_destroy;
 	}
 
-	err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
+	err = rdma_create_qp(id, ia->ri_pd, qp_init_attr);
 	if (err)
 		goto out_destroy;
 
@@ -697,25 +698,27 @@
 	struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
 						   rx_ia);
 	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+	struct ib_qp_init_attr qp_init_attr;
 	int rc;
 
 retry:
+	memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr));
 	switch (ep->rep_connected) {
 	case 0:
 		dprintk("RPC:       %s: connecting...\n", __func__);
-		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr);
 		if (rc) {
 			rc = -ENETUNREACH;
 			goto out_noupdate;
 		}
 		break;
 	case -ENODEV:
-		rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia);
+		rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr);
 		if (rc)
 			goto out_noupdate;
 		break;
 	default:
-		rc = rpcrdma_ep_reconnect(r_xprt, ep, ia);
+		rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr);
 		if (rc)
 			goto out;
 	}
@@ -729,6 +732,8 @@
 	if (rc)
 		goto out;
 
+	if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
+		xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
 	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
 	if (ep->rep_connected <= 0) {
 		if (ep->rep_connected == -EAGAIN)
@@ -942,14 +947,12 @@
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	unsigned int count;
-	LIST_HEAD(free);
-	LIST_HEAD(all);
 
 	for (count = 0; count < ia->ri_max_segs; count++) {
 		struct rpcrdma_mr *mr;
 		int rc;
 
-		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+		mr = kzalloc(sizeof(*mr), GFP_NOFS);
 		if (!mr)
 			break;
 
@@ -961,15 +964,13 @@
 
 		mr->mr_xprt = r_xprt;
 
-		list_add(&mr->mr_list, &free);
-		list_add(&mr->mr_all, &all);
+		spin_lock(&buf->rb_lock);
+		list_add(&mr->mr_list, &buf->rb_mrs);
+		list_add(&mr->mr_all, &buf->rb_all_mrs);
+		spin_unlock(&buf->rb_lock);
 	}
 
-	spin_lock(&buf->rb_mrlock);
-	list_splice(&free, &buf->rb_mrs);
-	list_splice(&all, &buf->rb_all);
 	r_xprt->rx_stats.mrs_allocated += count;
-	spin_unlock(&buf->rb_mrlock);
 	trace_xprtrdma_createmrs(r_xprt, count);
 }
 
@@ -977,7 +978,7 @@
 rpcrdma_mr_refresh_worker(struct work_struct *work)
 {
 	struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
-						  rb_refresh_worker.work);
+						  rb_refresh_worker);
 	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
 						   rx_buf);
 
@@ -999,12 +1000,18 @@
 	struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
 	struct rpcrdma_regbuf *rb;
 	struct rpcrdma_req *req;
+	size_t maxhdrsize;
 
 	req = kzalloc(sizeof(*req), flags);
 	if (req == NULL)
 		goto out1;
 
-	rb = rpcrdma_regbuf_alloc(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, flags);
+	/* Compute maximum header buffer size in bytes */
+	maxhdrsize = rpcrdma_fixed_maxsz + 3 +
+		     r_xprt->rx_ia.ri_max_segs * rpcrdma_readchunk_maxsz;
+	maxhdrsize *= sizeof(__be32);
+	rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
+				  DMA_TO_DEVICE, flags);
 	if (!rb)
 		goto out2;
 	req->rl_rdmabuf = rb;
@@ -1018,6 +1025,7 @@
 	if (!req->rl_recvbuf)
 		goto out4;
 
+	INIT_LIST_HEAD(&req->rl_free_mrs);
 	INIT_LIST_HEAD(&req->rl_registered);
 	spin_lock(&buffer->rb_lock);
 	list_add(&req->rl_all, &buffer->rb_allreqs);
@@ -1065,6 +1073,40 @@
 	return NULL;
 }
 
+static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
+{
+	rpcrdma_regbuf_free(rep->rr_rdmabuf);
+	kfree(rep);
+}
+
+static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
+{
+	struct llist_node *node;
+
+	/* Calls to llist_del_first are required to be serialized */
+	node = llist_del_first(&buf->rb_free_reps);
+	if (!node)
+		return NULL;
+	return llist_entry(node, struct rpcrdma_rep, rr_node);
+}
+
+static void rpcrdma_rep_put(struct rpcrdma_buffer *buf,
+			    struct rpcrdma_rep *rep)
+{
+	if (!rep->rr_temp)
+		llist_add(&rep->rr_node, &buf->rb_free_reps);
+	else
+		rpcrdma_rep_destroy(rep);
+}
+
+static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_rep *rep;
+
+	while ((rep = rpcrdma_rep_get_locked(buf)) != NULL)
+		rpcrdma_rep_destroy(rep);
+}
+
 /**
  * rpcrdma_buffer_create - Create initial set of req/rep objects
  * @r_xprt: transport instance to (re)initialize
@@ -1078,12 +1120,10 @@
 
 	buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests;
 	buf->rb_bc_srv_max_requests = 0;
-	spin_lock_init(&buf->rb_mrlock);
 	spin_lock_init(&buf->rb_lock);
 	INIT_LIST_HEAD(&buf->rb_mrs);
-	INIT_LIST_HEAD(&buf->rb_all);
-	INIT_DELAYED_WORK(&buf->rb_refresh_worker,
-			  rpcrdma_mr_refresh_worker);
+	INIT_LIST_HEAD(&buf->rb_all_mrs);
+	INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker);
 
 	rpcrdma_mrs_create(r_xprt);
 
@@ -1102,7 +1142,7 @@
 	}
 
 	buf->rb_credits = 1;
-	INIT_LIST_HEAD(&buf->rb_recv_bufs);
+	init_llist_head(&buf->rb_free_reps);
 
 	rc = rpcrdma_sendctxs_create(r_xprt);
 	if (rc)
@@ -1114,12 +1154,6 @@
 	return rc;
 }
 
-static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
-{
-	rpcrdma_regbuf_free(rep->rr_rdmabuf);
-	kfree(rep);
-}
-
 /**
  * rpcrdma_req_destroy - Destroy an rpcrdma_req object
  * @req: unused object to be destroyed
@@ -1127,11 +1161,13 @@
  * This function assumes that the caller prevents concurrent device
  * unload and transport tear-down.
  */
-void
-rpcrdma_req_destroy(struct rpcrdma_req *req)
+void rpcrdma_req_destroy(struct rpcrdma_req *req)
 {
 	list_del(&req->rl_all);
 
+	while (!list_empty(&req->rl_free_mrs))
+		rpcrdma_mr_free(rpcrdma_mr_pop(&req->rl_free_mrs));
+
 	rpcrdma_regbuf_free(req->rl_recvbuf);
 	rpcrdma_regbuf_free(req->rl_sendbuf);
 	rpcrdma_regbuf_free(req->rl_rdmabuf);
@@ -1147,25 +1183,19 @@
 	unsigned int count;
 
 	count = 0;
-	spin_lock(&buf->rb_mrlock);
-	while (!list_empty(&buf->rb_all)) {
-		mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all);
+	spin_lock(&buf->rb_lock);
+	while ((mr = list_first_entry_or_null(&buf->rb_all_mrs,
+					      struct rpcrdma_mr,
+					      mr_all)) != NULL) {
 		list_del(&mr->mr_all);
-
-		spin_unlock(&buf->rb_mrlock);
-
-		/* Ensure MW is not on any rl_registered list */
-		if (!list_empty(&mr->mr_list))
-			list_del(&mr->mr_list);
+		spin_unlock(&buf->rb_lock);
 
 		frwr_release_mr(mr);
 		count++;
-		spin_lock(&buf->rb_mrlock);
+		spin_lock(&buf->rb_lock);
 	}
-	spin_unlock(&buf->rb_mrlock);
+	spin_unlock(&buf->rb_lock);
 	r_xprt->rx_stats.mrs_allocated = 0;
-
-	dprintk("RPC:       %s: released %u MRs\n", __func__, count);
 }
 
 /**
@@ -1179,18 +1209,10 @@
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
-	cancel_delayed_work_sync(&buf->rb_refresh_worker);
+	cancel_work_sync(&buf->rb_refresh_worker);
 
 	rpcrdma_sendctxs_destroy(buf);
-
-	while (!list_empty(&buf->rb_recv_bufs)) {
-		struct rpcrdma_rep *rep;
-
-		rep = list_first_entry(&buf->rb_recv_bufs,
-				       struct rpcrdma_rep, rr_list);
-		list_del(&rep->rr_list);
-		rpcrdma_rep_destroy(rep);
-	}
+	rpcrdma_reps_destroy(buf);
 
 	while (!list_empty(&buf->rb_send_bufs)) {
 		struct rpcrdma_req *req;
@@ -1215,54 +1237,20 @@
 rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-	struct rpcrdma_mr *mr = NULL;
+	struct rpcrdma_mr *mr;
 
-	spin_lock(&buf->rb_mrlock);
-	if (!list_empty(&buf->rb_mrs))
-		mr = rpcrdma_mr_pop(&buf->rb_mrs);
-	spin_unlock(&buf->rb_mrlock);
-
-	if (!mr)
-		goto out_nomrs;
+	spin_lock(&buf->rb_lock);
+	mr = rpcrdma_mr_pop(&buf->rb_mrs);
+	spin_unlock(&buf->rb_lock);
 	return mr;
-
-out_nomrs:
-	trace_xprtrdma_nomrs(r_xprt);
-	if (r_xprt->rx_ep.rep_connected != -ENODEV)
-		schedule_delayed_work(&buf->rb_refresh_worker, 0);
-
-	/* Allow the reply handler and refresh worker to run */
-	cond_resched();
-
-	return NULL;
-}
-
-static void
-__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr)
-{
-	spin_lock(&buf->rb_mrlock);
-	rpcrdma_mr_push(mr, &buf->rb_mrs);
-	spin_unlock(&buf->rb_mrlock);
 }
 
 /**
- * rpcrdma_mr_put - Release an rpcrdma_mr object
- * @mr: object to release
+ * rpcrdma_mr_put - DMA unmap an MR and release it
+ * @mr: MR to release
  *
  */
-void
-rpcrdma_mr_put(struct rpcrdma_mr *mr)
-{
-	__rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr);
-}
-
-/**
- * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it
- * @mr: object to release
- *
- */
-void
-rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
+void rpcrdma_mr_put(struct rpcrdma_mr *mr)
 {
 	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
 
@@ -1272,7 +1260,19 @@
 				mr->mr_sg, mr->mr_nents, mr->mr_dir);
 		mr->mr_dir = DMA_NONE;
 	}
-	__rpcrdma_mr_put(&r_xprt->rx_buf, mr);
+
+	rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
+}
+
+static void rpcrdma_mr_free(struct rpcrdma_mr *mr)
+{
+	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+
+	mr->mr_req = NULL;
+	spin_lock(&buf->rb_lock);
+	rpcrdma_mr_push(mr, &buf->rb_mrs);
+	spin_unlock(&buf->rb_lock);
 }
 
 /**
@@ -1303,39 +1303,24 @@
  */
 void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
 {
-	struct rpcrdma_rep *rep = req->rl_reply;
-
+	if (req->rl_reply)
+		rpcrdma_rep_put(buffers, req->rl_reply);
 	req->rl_reply = NULL;
 
 	spin_lock(&buffers->rb_lock);
 	list_add(&req->rl_list, &buffers->rb_send_bufs);
-	if (rep) {
-		if (!rep->rr_temp) {
-			list_add(&rep->rr_list, &buffers->rb_recv_bufs);
-			rep = NULL;
-		}
-	}
 	spin_unlock(&buffers->rb_lock);
-	if (rep)
-		rpcrdma_rep_destroy(rep);
 }
 
-/*
- * Put reply buffers back into pool when not attached to
- * request. This happens in error conditions.
+/**
+ * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list
+ * @rep: rep to release
+ *
+ * Used after error conditions.
  */
-void
-rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
+void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
 {
-	struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
-
-	if (!rep->rr_temp) {
-		spin_lock(&buffers->rb_lock);
-		list_add(&rep->rr_list, &buffers->rb_recv_bufs);
-		spin_unlock(&buffers->rb_lock);
-	} else {
-		rpcrdma_rep_destroy(rep);
-	}
+	rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep);
 }
 
 /* Returns a pointer to a rpcrdma_regbuf object, or NULL.
@@ -1483,7 +1468,7 @@
 	count = 0;
 
 	needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
-	if (ep->rep_receive_count > needed)
+	if (likely(ep->rep_receive_count > needed))
 		goto out;
 	needed -= ep->rep_receive_count;
 	if (!temp)
@@ -1491,22 +1476,10 @@
 
 	/* fast path: all needed reps can be found on the free list */
 	wr = NULL;
-	spin_lock(&buf->rb_lock);
 	while (needed) {
-		rep = list_first_entry_or_null(&buf->rb_recv_bufs,
-					       struct rpcrdma_rep, rr_list);
+		rep = rpcrdma_rep_get_locked(buf);
 		if (!rep)
-			break;
-
-		list_del(&rep->rr_list);
-		rep->rr_recv_wr.next = wr;
-		wr = &rep->rr_recv_wr;
-		--needed;
-	}
-	spin_unlock(&buf->rb_lock);
-
-	while (needed) {
-		rep = rpcrdma_rep_create(r_xprt, temp);
+			rep = rpcrdma_rep_create(r_xprt, temp);
 		if (!rep)
 			break;
 
@@ -1523,7 +1496,7 @@
 		if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
 			goto release_wrs;
 
-		trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe);
+		trace_xprtrdma_post_recv(rep);
 		++count;
 	}
 
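As a worked example of the header buffer sizing in rpcrdma_req_create() above, with assumed values for the constants (rpcrdma_fixed_maxsz = 4 XDR words, rpcrdma_readchunk_maxsz = 6 XDR words, ri_max_segs = 18; illustrative numbers, not taken from this diff):

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	unsigned int fixed_maxsz = 4;		/* assumed fixed header words */
	unsigned int readchunk_maxsz = 6;	/* assumed words per Read segment */
	unsigned int max_segs = 18;		/* 16 data segments + head + tail */
	size_t words, bytes, alloc;

	words = fixed_maxsz + 3 + max_segs * readchunk_maxsz;	/* 115 */
	bytes = words * 4;			/* each XDR word is a __be32: 460 */

	/* rpcrdma_req_create() then rounds the allocation to a power of two. */
	alloc = 1;
	while (alloc < bytes)
		alloc <<= 1;
	printf("maxhdrsize = %zu bytes, allocated as %zu\n", bytes, alloc);
	return 0;	/* prints 460 bytes, allocated as 512 */
}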
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 92ce09f..65e6b0e 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -47,6 +47,7 @@
 #include <linux/atomic.h>		/* atomic_t, etc */
 #include <linux/kref.h>			/* struct kref */
 #include <linux/workqueue.h>		/* struct work_struct */
+#include <linux/llist.h>
 
 #include <rdma/rdma_cm.h>		/* RDMA connection api */
 #include <rdma/ib_verbs.h>		/* RDMA verbs api */
@@ -117,9 +118,6 @@
 #endif
 
 /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
- *
- * The below structure appears at the front of a large region of kmalloc'd
- * memory, which always starts on a good alignment boundary.
  */
 
 struct rpcrdma_regbuf {
@@ -158,25 +156,22 @@
 
 /* To ensure a transport can always make forward progress,
  * the number of RDMA segments allowed in header chunk lists
- * is capped at 8. This prevents less-capable devices and
- * memory registrations from overrunning the Send buffer
- * while building chunk lists.
+ * is capped at 16. This prevents less-capable devices from
+ * overrunning the Send buffer while building chunk lists.
  *
  * Elements of the Read list take up more room than the
- * Write list or Reply chunk. 8 read segments means the Read
- * list (or Write list or Reply chunk) cannot consume more
- * than
+ * Write list or Reply chunk. 16 read segments means the
+ * chunk lists cannot consume more than
  *
- * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
+ * ((16 + 2) * read segment size) + 1 XDR words,
  *
- * And the fixed part of the header is another 24 bytes.
- *
- * The smallest inline threshold is 1024 bytes, ensuring that
- * at least 750 bytes are available for RPC messages.
+ * or about 400 bytes. The fixed part of the header is
+ * another 24 bytes. Thus when the inline threshold is
+ * 1024 bytes, at least 600 bytes are available for RPC
+ * message bodies.
  */
 enum {
-	RPCRDMA_MAX_HDR_SEGS = 8,
-	RPCRDMA_HDRBUF_SIZE = 256,
+	RPCRDMA_MAX_HDR_SEGS = 16,
 };
 
 /*
@@ -206,7 +201,7 @@
 	struct rpc_rqst		*rr_rqst;
 	struct xdr_buf		rr_hdrbuf;
 	struct xdr_stream	rr_stream;
-	struct list_head	rr_list;
+	struct llist_node	rr_node;
 	struct ib_recv_wr	rr_recv_wr;
 };
 
@@ -240,20 +235,20 @@
  * An external memory region is any buffer or page that is registered
  * on the fly (ie, not pre-registered).
  */
-struct rpcrdma_req;
 struct rpcrdma_frwr {
 	struct ib_mr			*fr_mr;
 	struct ib_cqe			fr_cqe;
 	struct completion		fr_linv_done;
-	struct rpcrdma_req		*fr_req;
 	union {
 		struct ib_reg_wr	fr_regwr;
 		struct ib_send_wr	fr_invwr;
 	};
 };
 
+struct rpcrdma_req;
 struct rpcrdma_mr {
 	struct list_head	mr_list;
+	struct rpcrdma_req	*mr_req;
 	struct scatterlist	*mr_sg;
 	int			mr_nents;
 	enum dma_data_direction	mr_dir;
@@ -331,7 +326,8 @@
 	struct list_head	rl_all;
 	struct kref		rl_kref;
 
-	struct list_head	rl_registered;	/* registered segments */
+	struct list_head	rl_free_mrs;
+	struct list_head	rl_registered;
 	struct rpcrdma_mr_seg	rl_segments[RPCRDMA_MAX_SEGS];
 };
 
@@ -344,7 +340,7 @@
 static inline void
 rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list)
 {
-	list_add_tail(&mr->mr_list, list);
+	list_add(&mr->mr_list, list);
 }
 
 static inline struct rpcrdma_mr *
@@ -352,8 +348,9 @@
 {
 	struct rpcrdma_mr *mr;
 
-	mr = list_first_entry(list, struct rpcrdma_mr, mr_list);
-	list_del_init(&mr->mr_list);
+	mr = list_first_entry_or_null(list, struct rpcrdma_mr, mr_list);
+	if (mr)
+		list_del_init(&mr->mr_list);
 	return mr;
 }
 
@@ -364,19 +361,19 @@
  * One of these is associated with a transport instance
  */
 struct rpcrdma_buffer {
-	spinlock_t		rb_mrlock;	/* protect rb_mrs list */
+	spinlock_t		rb_lock;
+	struct list_head	rb_send_bufs;
 	struct list_head	rb_mrs;
-	struct list_head	rb_all;
 
 	unsigned long		rb_sc_head;
 	unsigned long		rb_sc_tail;
 	unsigned long		rb_sc_last;
 	struct rpcrdma_sendctx	**rb_sc_ctxs;
 
-	spinlock_t		rb_lock;	/* protect buf lists */
-	struct list_head	rb_send_bufs;
-	struct list_head	rb_recv_bufs;
 	struct list_head	rb_allreqs;
+	struct list_head	rb_all_mrs;
+
+	struct llist_head	rb_free_reps;
 
 	u32			rb_max_requests;
 	u32			rb_credits;	/* most recent credit grant */
@@ -384,7 +381,7 @@
 	u32			rb_bc_srv_max_requests;
 	u32			rb_bc_max_requests;
 
-	struct delayed_work	rb_refresh_worker;
+	struct work_struct	rb_refresh_worker;
 };
 
 /*
@@ -490,7 +487,6 @@
 
 struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
 void rpcrdma_mr_put(struct rpcrdma_mr *mr);
-void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr);
 
 static inline void
 rpcrdma_mr_recycle(struct rpcrdma_mr *mr)
@@ -546,6 +542,7 @@
 /* Memory registration calls xprtrdma/frwr_ops.c
  */
 bool frwr_is_supported(struct ib_device *device);
+void frwr_recycle(struct rpcrdma_req *req);
 void frwr_reset(struct rpcrdma_req *req);
 int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep);
 int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
@@ -554,7 +551,7 @@
 struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 				struct rpcrdma_mr_seg *seg,
 				int nsegs, bool writing, __be32 xid,
-				struct rpcrdma_mr **mr);
+				struct rpcrdma_mr *mr);
 int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
 void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
 void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
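The llist conversion above relies on llist_entry() to get from an embedded llist_node, such as the new rr_node, back to its enclosing structure; that is the container_of() pattern. A self-contained userspace rendition with hypothetical types:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct node { struct node *next; };

struct rep {
	int id;
	struct node rr_node;	/* embedded link, like rr_node above */
};

int main(void)
{
	struct rep r = { .id = 42 };
	struct node *n = &r.rr_node;
	struct rep *back = container_of(n, struct rep, rr_node);

	printf("%d\n", back->id);	/* prints 42 */
	return 0;
}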
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e2176c1..9ac8872 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -562,10 +562,14 @@
 		printk(KERN_WARNING "Callback slot table overflowed\n");
 		return -ESHUTDOWN;
 	}
+	if (transport->recv.copied && !req->rq_private_buf.len)
+		return -ESHUTDOWN;
 
 	ret = xs_read_stream_request(transport, msg, flags, req);
 	if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
 		xprt_complete_bc_request(req, transport->recv.copied);
+	else
+		req->rq_private_buf.len = transport->recv.copied;
 
 	return ret;
 }
@@ -587,7 +591,7 @@
 	/* Look up and lock the request corresponding to the given XID */
 	spin_lock(&xprt->queue_lock);
 	req = xprt_lookup_rqst(xprt, transport->recv.xid);
-	if (!req) {
+	if (!req || (transport->recv.copied && !req->rq_private_buf.len)) {
 		msg->msg_flags |= MSG_TRUNC;
 		goto out;
 	}
@@ -599,6 +603,8 @@
 	spin_lock(&xprt->queue_lock);
 	if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
 		xprt_complete_rqst(req->rq_task, transport->recv.copied);
+	else
+		req->rq_private_buf.len = transport->recv.copied;
 	xprt_unpin_rqst(req);
 out:
 	spin_unlock(&xprt->queue_lock);
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 947b8ff..16d5f35 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -206,14 +206,7 @@
 
 static void xdp_umem_unpin_pages(struct xdp_umem *umem)
 {
-	unsigned int i;
-
-	for (i = 0; i < umem->npgs; i++) {
-		struct page *page = umem->pgs[i];
-
-		set_page_dirty_lock(page);
-		put_page(page);
-	}
+	put_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
 
 	kfree(umem->pgs);
 	umem->pgs = NULL;
@@ -383,8 +376,6 @@
 			return -EINVAL;
 	}
 
-	headroom = ALIGN(headroom, 64);
-
 	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
 	if (size_chk < 0)
 		return -EINVAL;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index c2f1af3..fa8fbb8f 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -977,7 +977,7 @@
 	/* Matches the smp_wmb() in xsk_init_queue */
 	smp_rmb();
 	qpg = virt_to_head_page(q->ring);
-	if (size > (PAGE_SIZE << compound_order(qpg)))
+	if (size > page_size(qpg))
 		return -EINVAL;
 
 	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
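The xsk.c change replaces the open-coded expression with the page_size() helper, which evaluates to PAGE_SIZE << compound_order(page). A worked example with assumed values (4 KiB base pages, an order-2 compound page):

#include <stdio.h>

int main(void)
{
	unsigned long base = 4096;	/* assumed PAGE_SIZE */
	unsigned int order = 2;		/* assumed compound page order */

	/* page_size(page) == PAGE_SIZE << compound_order(page) */
	printf("%lu\n", base << order);	/* 16384: four base pages */
	return 0;
}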
diff --git a/scripts/Makefile b/scripts/Makefile
index c42891e1..3e86b30 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -17,7 +17,7 @@
 hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount
 hostprogs-$(CONFIG_BUILDTIME_EXTABLE_SORT) += sortextable
 hostprogs-$(CONFIG_ASN1)	 += asn1_compiler
-hostprogs-$(CONFIG_MODULE_SIG)	 += sign-file
+hostprogs-$(CONFIG_MODULE_SIG_FORMAT) += sign-file
 hostprogs-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += extract-cert
 hostprogs-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert
 
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 93a7edf..6fcc66a 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -62,6 +62,8 @@
 my $typedefsfile = "";
 my $color = "auto";
 my $allow_c99_comments = 1; # Can be overridden by --ignore C99_COMMENT_TOLERANCE
+# git output parsing needs US English output, so set the backtick child process LANGUAGE
+my $git_command = 'export LANGUAGE=en_US.UTF-8; git';
 
 sub help {
 	my ($exitcode) = @_;
@@ -904,7 +906,7 @@
 	$camelcase_seeded = 1;
 
 	if (-e ".git") {
-		my $git_last_include_commit = `git log --no-merges --pretty=format:"%h%n" -1 -- include`;
+		my $git_last_include_commit = `${git_command} log --no-merges --pretty=format:"%h%n" -1 -- include`;
 		chomp $git_last_include_commit;
 		$camelcase_cache = ".checkpatch-camelcase.git.$git_last_include_commit";
 	} else {
@@ -932,7 +934,7 @@
 	}
 
 	if (-e ".git") {
-		$files = `git ls-files "include/*.h"`;
+		$files = `${git_command} ls-files "include/*.h"`;
 		@include_files = split('\n', $files);
 	}
 
@@ -956,13 +958,13 @@
 
 	return ($id, $desc) if ((which("git") eq "") || !(-e ".git"));
 
-	my $output = `git log --no-color --format='%H %s' -1 $commit 2>&1`;
+	my $output = `${git_command} log --no-color --format='%H %s' -1 $commit 2>&1`;
 	$output =~ s/^\s*//gm;
 	my @lines = split("\n", $output);
 
 	return ($id, $desc) if ($#lines < 0);
 
-	if ($lines[0] =~ /^error: short SHA1 $commit is ambiguous\./) {
+	if ($lines[0] =~ /^error: short SHA1 $commit is ambiguous/) {
 # Maybe one day convert this block of bash into something that returns
 # all matching commit ids, but it's very slow...
 #
@@ -1006,7 +1008,7 @@
 		} else {
 			$git_range = "-1 $commit_expr";
 		}
-		my $lines = `git log --no-color --no-merges --pretty=format:'%H %s' $git_range`;
+		my $lines = `${git_command} log --no-color --no-merges --pretty=format:'%H %s' $git_range`;
 		foreach my $line (split(/\n/, $lines)) {
 			$line =~ /^([0-9a-fA-F]{40,40}) (.*)$/;
 			next if (!defined($1) || !defined($2));
@@ -2725,8 +2727,10 @@
 		    ($line =~ /^\s*(?:WARNING:|BUG:)/ ||
 		     $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
 					# timestamp
-		     $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
-					# stack dump address
+		     $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/ ||
+		     $line =~ /^(?:\s+\w+:\s+[0-9a-fA-F]+){3,3}/ ||
+		     $line =~ /^\s*\#\d+\s*\[[0-9a-fA-F]+\]\s*\w+ at [0-9a-fA-F]+/)) {
+					# stack dump address styles
 			$commit_log_possible_stack_dump = 1;
 		}
 
@@ -2898,6 +2902,17 @@
 			}
 		}
 
+# check for invalid commit id
+		if ($in_commit_log && $line =~ /(^fixes:|\bcommit)\s+([0-9a-f]{6,40})\b/i) {
+			my $id;
+			my $description;
+			($id, $description) = git_commit_info($2, undef, undef);
+			if (!defined($id)) {
+				WARN("UNKNOWN_COMMIT_ID",
+				     "Unknown commit id '$2', maybe rebased or not pulled?\n" . $herecurr);
+			}
+		}
+
 # ignore non-hunk lines and lines being removed
 		next if (!$hunk_line || $line =~ /^-/);
 
@@ -3069,21 +3084,21 @@
 # check SPDX comment style for .[chsS] files
 				if ($realfile =~ /\.[chsS]$/ &&
 				    $rawline =~ /SPDX-License-Identifier:/ &&
-				    $rawline !~ /^\+\s*\Q$comment\E\s*/) {
+				    $rawline !~ m@^\+\s*\Q$comment\E\s*@) {
 					WARN("SPDX_LICENSE_TAG",
 					     "Improper SPDX comment style for '$realfile', please use '$comment' instead\n" . $herecurr);
 				}
 
 				if ($comment !~ /^$/ &&
-				    $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) {
-					 WARN("SPDX_LICENSE_TAG",
-					      "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr);
+				    $rawline !~ m@^\+\Q$comment\E SPDX-License-Identifier: @) {
+					WARN("SPDX_LICENSE_TAG",
+					     "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr);
 				} elsif ($rawline =~ /(SPDX-License-Identifier: .*)/) {
-					 my $spdx_license = $1;
-					 if (!is_SPDX_License_valid($spdx_license)) {
-						  WARN("SPDX_LICENSE_TAG",
-						       "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr);
-					 }
+					my $spdx_license = $1;
+					if (!is_SPDX_License_valid($spdx_license)) {
+						WARN("SPDX_LICENSE_TAG",
+						     "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr);
+					}
 				}
 			}
 		}
@@ -4660,7 +4675,7 @@
 
 # closing brace should have a space following it when it has anything
 # on the line
-		if ($line =~ /}(?!(?:,|;|\)))\S/) {
+		if ($line =~ /}(?!(?:,|;|\)|\}))\S/) {
 			if (ERROR("SPACING",
 				  "space required after that close brace '}'\n" . $herecurr) &&
 			    $fix) {
@@ -5191,7 +5206,7 @@
 			        next if ($arg =~ /\.\.\./);
 			        next if ($arg =~ /^type$/i);
 				my $tmp_stmt = $define_stmt;
-				$tmp_stmt =~ s/\b(typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g;
+				$tmp_stmt =~ s/\b(sizeof|typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g;
 				$tmp_stmt =~ s/\#+\s*$arg\b//g;
 				$tmp_stmt =~ s/\b$arg\s*\#\#//g;
 				my $use_cnt = () = $tmp_stmt =~ /\b$arg\b/g;
@@ -5873,6 +5888,18 @@
 			     "__aligned(size) is preferred over __attribute__((aligned(size)))\n" . $herecurr);
 		}
 
+# Check for __attribute__ section, prefer __section
+		if ($realfile !~ m@\binclude/uapi/@ &&
+		    $line =~ /\b__attribute__\s*\(\s*\(.*_*section_*\s*\(\s*("[^"]*")/) {
+			my $old = substr($rawline, $-[1], $+[1] - $-[1]);
+			my $new = substr($old, 1, -1);
+			if (WARN("PREFER_SECTION",
+				 "__section($new) is preferred over __attribute__((section($old)))\n" . $herecurr) &&
+			    $fix) {
+				$fixed[$fixlinenr] =~ s/\b__attribute__\s*\(\s*\(\s*_*section_*\s*\(\s*\Q$old\E\s*\)\s*\)\s*\)/__section($new)/;
+			}
+		}
+
 # Check for __attribute__ format(printf, prefer __printf
 		if ($realfile !~ m@\binclude/uapi/@ &&
 		    $line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) {
@@ -6480,6 +6507,12 @@
 			     "Using $1 should generally have parentheses around the comparison\n" . $herecurr);
 		}
 
+# nested likely/unlikely calls
+		if ($line =~ /\b(?:(?:un)?likely)\s*\(\s*!?\s*(IS_ERR(?:_OR_NULL|_VALUE)?|WARN)/) {
+			WARN("LIKELY_MISUSE",
+			     "nested (un)?likely() calls, $1 already uses unlikely() internally\n" . $herecurr);
+		}
+
 # whine mightly about in_atomic
 		if ($line =~ /\bin_atomic\s*\(/) {
 			if ($realfile =~ m@^drivers/@) {
diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py
index 2f5b95f..34e40e9 100644
--- a/scripts/gdb/linux/symbols.py
+++ b/scripts/gdb/linux/symbols.py
@@ -77,12 +77,12 @@
             gdb.write("scanning for modules in {0}\n".format(path))
             for root, dirs, files in os.walk(path):
                 for name in files:
-                    if name.endswith(".ko"):
+                    if name.endswith(".ko") or name.endswith(".ko.debug"):
                         self.module_files.append(root + "/" + name)
         self.module_files_updated = True
 
     def _get_module_file(self, module_name):
-        module_pattern = ".*/{0}\.ko$".format(
+        module_pattern = ".*/{0}\.ko(?:\.debug)?$".format(
             module_name.replace("_", r"[_\-]"))
         for name in self.module_files:
             if re.match(module_pattern, name) and os.path.exists(name):
diff --git a/security/Kconfig b/security/Kconfig
index 0d65594..2a1a2d3 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -237,6 +237,7 @@
 source "security/loadpin/Kconfig"
 source "security/yama/Kconfig"
 source "security/safesetid/Kconfig"
+source "security/lockdown/Kconfig"
 
 source "security/integrity/Kconfig"
 
@@ -276,11 +277,11 @@
 
 config LSM
 	string "Ordered list of enabled LSMs"
-	default "yama,loadpin,safesetid,integrity,smack,selinux,tomoyo,apparmor" if DEFAULT_SECURITY_SMACK
-	default "yama,loadpin,safesetid,integrity,apparmor,selinux,smack,tomoyo" if DEFAULT_SECURITY_APPARMOR
-	default "yama,loadpin,safesetid,integrity,tomoyo" if DEFAULT_SECURITY_TOMOYO
-	default "yama,loadpin,safesetid,integrity" if DEFAULT_SECURITY_DAC
-	default "yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor"
+	default "lockdown,yama,loadpin,safesetid,integrity,smack,selinux,tomoyo,apparmor" if DEFAULT_SECURITY_SMACK
+	default "lockdown,yama,loadpin,safesetid,integrity,apparmor,selinux,smack,tomoyo" if DEFAULT_SECURITY_APPARMOR
+	default "lockdown,yama,loadpin,safesetid,integrity,tomoyo" if DEFAULT_SECURITY_TOMOYO
+	default "lockdown,yama,loadpin,safesetid,integrity" if DEFAULT_SECURITY_DAC
+	default "lockdown,yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor"
 	help
 	  A comma-separated list of LSMs, in initialization order.
 	  Any LSMs left off this list will be ignored. This can be
diff --git a/security/Makefile b/security/Makefile
index c598b90..be1dd9d 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -11,6 +11,7 @@
 subdir-$(CONFIG_SECURITY_YAMA)		+= yama
 subdir-$(CONFIG_SECURITY_LOADPIN)	+= loadpin
 subdir-$(CONFIG_SECURITY_SAFESETID)    += safesetid
+subdir-$(CONFIG_SECURITY_LOCKDOWN_LSM)	+= lockdown
 
 # always enable default capabilities
 obj-y					+= commoncap.o
@@ -27,6 +28,7 @@
 obj-$(CONFIG_SECURITY_YAMA)		+= yama/
 obj-$(CONFIG_SECURITY_LOADPIN)		+= loadpin/
 obj-$(CONFIG_SECURITY_SAFESETID)       += safesetid/
+obj-$(CONFIG_SECURITY_LOCKDOWN_LSM)	+= lockdown/
 obj-$(CONFIG_CGROUP_DEVICE)		+= device_cgroup.o
 
 # Object integrity file lists
diff --git a/security/integrity/Kconfig b/security/integrity/Kconfig
index c352532..0bae6ad 100644
--- a/security/integrity/Kconfig
+++ b/security/integrity/Kconfig
@@ -18,8 +18,8 @@
 
 config INTEGRITY_SIGNATURE
 	bool "Digital signature verification using multiple keyrings"
-	depends on KEYS
 	default n
+	select KEYS
 	select SIGNATURE
 	help
 	  This option enables digital signature verification support
diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c
index 868ade3..ea1aae3 100644
--- a/security/integrity/digsig.c
+++ b/security/integrity/digsig.c
@@ -39,11 +39,10 @@
 #define restrict_link_to_ima restrict_link_by_builtin_trusted
 #endif
 
-int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
-			    const char *digest, int digestlen)
+static struct key *integrity_keyring_from_id(const unsigned int id)
 {
-	if (id >= INTEGRITY_KEYRING_MAX || siglen < 2)
-		return -EINVAL;
+	if (id >= INTEGRITY_KEYRING_MAX)
+		return ERR_PTR(-EINVAL);
 
 	if (!keyring[id]) {
 		keyring[id] =
@@ -52,23 +51,49 @@
 			int err = PTR_ERR(keyring[id]);
 			pr_err("no %s keyring: %d\n", keyring_name[id], err);
 			keyring[id] = NULL;
-			return err;
+			return ERR_PTR(err);
 		}
 	}
 
+	return keyring[id];
+}
+
+int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
+			    const char *digest, int digestlen)
+{
+	struct key *keyring;
+
+	if (siglen < 2)
+		return -EINVAL;
+
+	keyring = integrity_keyring_from_id(id);
+	if (IS_ERR(keyring))
+		return PTR_ERR(keyring);
+
 	switch (sig[1]) {
 	case 1:
 		/* v1 API expect signature without xattr type */
-		return digsig_verify(keyring[id], sig + 1, siglen - 1,
-				     digest, digestlen);
+		return digsig_verify(keyring, sig + 1, siglen - 1, digest,
+				     digestlen);
 	case 2:
-		return asymmetric_verify(keyring[id], sig, siglen,
-					 digest, digestlen);
+		return asymmetric_verify(keyring, sig, siglen, digest,
+					 digestlen);
 	}
 
 	return -EOPNOTSUPP;
 }
 
+int integrity_modsig_verify(const unsigned int id, const struct modsig *modsig)
+{
+	struct key *keyring;
+
+	keyring = integrity_keyring_from_id(id);
+	if (IS_ERR(keyring))
+		return PTR_ERR(keyring);
+
+	return ima_modsig_verify(keyring, modsig);
+}
+
 static int __init __integrity_init_keyring(const unsigned int id,
 					   key_perm_t perm,
 					   struct key_restriction *restriction)
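The new integrity_keyring_from_id() above returns either a valid keyring pointer or an encoded errno, using the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() idiom from <linux/err.h>: small negative errnos live at the very top of the address range, where no valid pointer can be, so a single return value carries both cases. A standalone sketch of the encoding, simplified relative to the kernel's annotated versions:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int keyring_slot;	/* stands in for a real keyring object */

static void *keyring_from_id(int id)
{
	if (id < 0)
		return ERR_PTR(-EINVAL);	/* error rides in the pointer */
	return &keyring_slot;
}

int main(void)
{
	void *k = keyring_from_id(-1);

	if (IS_ERR(k))
		printf("error: %ld\n", PTR_ERR(k));	/* prints -22 */
	return 0;
}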
diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig
index 2ced99d..838476d 100644
--- a/security/integrity/ima/Kconfig
+++ b/security/integrity/ima/Kconfig
@@ -160,7 +160,7 @@
 
 config IMA_ARCH_POLICY
         bool "Enable loading an IMA architecture specific policy"
-        depends on (KEXEC_VERIFY_SIG && IMA) || IMA_APPRAISE \
+        depends on (KEXEC_SIG && IMA) || IMA_APPRAISE \
 		   && INTEGRITY_ASYMMETRIC_KEYS
         default n
         help
@@ -233,6 +233,19 @@
 	  This option enables the different "ima_appraise=" modes
 	  (eg. fix, log) from the boot command line.
 
+config IMA_APPRAISE_MODSIG
+	bool "Support module-style signatures for appraisal"
+	depends on IMA_APPRAISE
+	depends on INTEGRITY_ASYMMETRIC_KEYS
+	select PKCS7_MESSAGE_PARSER
+	select MODULE_SIG_FORMAT
+	default n
+	help
+	   Adds support for signatures appended to files. The format of the
+	   appended signature is the same as that used for signed kernel modules.
+	   The modsig keyword can be used in the IMA policy to allow a hook
+	   to accept such signatures.
+
 config IMA_TRUSTED_KEYRING
 	bool "Require all keys on the .ima keyring be signed (deprecated)"
 	depends on IMA_APPRAISE && SYSTEM_TRUSTED_KEYRING
diff --git a/security/integrity/ima/Makefile b/security/integrity/ima/Makefile
index d921dc4..31d57cd 100644
--- a/security/integrity/ima/Makefile
+++ b/security/integrity/ima/Makefile
@@ -9,5 +9,6 @@
 ima-y := ima_fs.o ima_queue.o ima_init.o ima_main.o ima_crypto.o ima_api.o \
 	 ima_policy.o ima_template.o ima_template_lib.o
 ima-$(CONFIG_IMA_APPRAISE) += ima_appraise.o
+ima-$(CONFIG_IMA_APPRAISE_MODSIG) += ima_modsig.o
 ima-$(CONFIG_HAVE_IMA_KEXEC) += ima_kexec.o
 obj-$(CONFIG_IMA_BLACKLIST_KEYRING) += ima_mok.o
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 011b91c..3689081 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -60,6 +60,7 @@
 	const unsigned char *filename;
 	struct evm_ima_xattr_data *xattr_value;
 	int xattr_len;
+	const struct modsig *modsig;
 	const char *violation;
 	const void *buf;
 	int buf_len;
@@ -113,6 +114,8 @@
 	u64 count;
 };
 
+extern const int read_idmap[];
+
 #ifdef CONFIG_HAVE_IMA_KEXEC
 void ima_load_kexec_buffer(void);
 #else
@@ -149,6 +152,7 @@
 			      int *num_fields);
 struct ima_template_desc *ima_template_desc_current(void);
 struct ima_template_desc *lookup_template_desc(const char *name);
+bool ima_template_has_modsig(const struct ima_template_desc *ima_template);
 int ima_restore_measurement_entry(struct ima_template_entry *entry);
 int ima_restore_measurement_list(loff_t bufsize, void *buf);
 int ima_measurements_show(struct seq_file *m, void *v);
@@ -196,6 +200,10 @@
 	__ima_hooks(__ima_hook_enumify)
 };
 
+extern const char *const func_tokens[];
+
+struct modsig;
+
 /* LIM API function definitions */
 int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
 		   int mask, enum ima_hooks func, int *pcr,
@@ -203,11 +211,11 @@
 int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func);
 int ima_collect_measurement(struct integrity_iint_cache *iint,
 			    struct file *file, void *buf, loff_t size,
-			    enum hash_algo algo);
+			    enum hash_algo algo, struct modsig *modsig);
 void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file,
 			   const unsigned char *filename,
 			   struct evm_ima_xattr_data *xattr_value,
-			   int xattr_len, int pcr,
+			   int xattr_len, const struct modsig *modsig, int pcr,
 			   struct ima_template_desc *template_desc);
 void ima_audit_measurement(struct integrity_iint_cache *iint,
 			   const unsigned char *filename);
@@ -249,7 +257,7 @@
 			     struct integrity_iint_cache *iint,
 			     struct file *file, const unsigned char *filename,
 			     struct evm_ima_xattr_data *xattr_value,
-			     int xattr_len);
+			     int xattr_len, const struct modsig *modsig);
 int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func);
 void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file);
 enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint,
@@ -265,7 +273,8 @@
 					   struct file *file,
 					   const unsigned char *filename,
 					   struct evm_ima_xattr_data *xattr_value,
-					   int xattr_len)
+					   int xattr_len,
+					   const struct modsig *modsig)
 {
 	return INTEGRITY_UNKNOWN;
 }
@@ -302,6 +311,51 @@
 
 #endif /* CONFIG_IMA_APPRAISE */
 
+#ifdef CONFIG_IMA_APPRAISE_MODSIG
+bool ima_hook_supports_modsig(enum ima_hooks func);
+int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len,
+		    struct modsig **modsig);
+void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size);
+int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo,
+			  const u8 **digest, u32 *digest_size);
+int ima_get_raw_modsig(const struct modsig *modsig, const void **data,
+		       u32 *data_len);
+void ima_free_modsig(struct modsig *modsig);
+#else
+static inline bool ima_hook_supports_modsig(enum ima_hooks func)
+{
+	return false;
+}
+
+static inline int ima_read_modsig(enum ima_hooks func, const void *buf,
+				  loff_t buf_len, struct modsig **modsig)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void ima_collect_modsig(struct modsig *modsig, const void *buf,
+				      loff_t size)
+{
+}
+
+static inline int ima_get_modsig_digest(const struct modsig *modsig,
+					enum hash_algo *algo, const u8 **digest,
+					u32 *digest_size)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int ima_get_raw_modsig(const struct modsig *modsig,
+				     const void **data, u32 *data_len)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void ima_free_modsig(struct modsig *modsig)
+{
+}
+#endif /* CONFIG_IMA_APPRAISE_MODSIG */
+
 /* LSM based policy rules require audit */
 #ifdef CONFIG_IMA_LSM_RULES
 
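
The !CONFIG_IMA_APPRAISE_MODSIG stubs above are what let every caller use the modsig API unconditionally: ima_read_modsig() reports -EOPNOTSUPP and ima_free_modsig() accepts NULL. A minimal caller sketch (not part of the patch; func, buf and size stand in for whatever the hook provides):

	struct modsig *modsig = NULL;

	if (ima_read_modsig(func, buf, size, &modsig))
		modsig = NULL;	/* no appended sig, or option compiled out */
	/* ... measurement and appraisal work with or without the modsig ... */
	ima_free_modsig(modsig);	/* NULL-safe in both configurations */
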
diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index f614e22..610759f 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -45,8 +45,8 @@
 	else
 		template_desc = ima_template_desc_current();
 
-	*entry = kzalloc(sizeof(**entry) + template_desc->num_fields *
-			 sizeof(struct ima_field_data), GFP_NOFS);
+	*entry = kzalloc(struct_size(*entry, template_data,
+				     template_desc->num_fields), GFP_NOFS);
 	if (!*entry)
 		return -ENOMEM;
 
@@ -205,7 +205,7 @@
  */
 int ima_collect_measurement(struct integrity_iint_cache *iint,
 			    struct file *file, void *buf, loff_t size,
-			    enum hash_algo algo)
+			    enum hash_algo algo, struct modsig *modsig)
 {
 	const char *audit_cause = "failed";
 	struct inode *inode = file_inode(file);
@@ -219,6 +219,14 @@
 		char digest[IMA_MAX_DIGEST_SIZE];
 	} hash;
 
+	/*
+	 * Always collect the modsig, because IMA might have already collected
+	 * the file digest without collecting the modsig in a previous
+	 * measurement rule.
+	 */
+	if (modsig)
+		ima_collect_modsig(modsig, buf, size);
+
 	if (iint->flags & IMA_COLLECTED)
 		goto out;
 
@@ -285,7 +293,7 @@
 void ima_store_measurement(struct integrity_iint_cache *iint,
 			   struct file *file, const unsigned char *filename,
 			   struct evm_ima_xattr_data *xattr_value,
-			   int xattr_len, int pcr,
+			   int xattr_len, const struct modsig *modsig, int pcr,
 			   struct ima_template_desc *template_desc)
 {
 	static const char op[] = "add_template_measure";
@@ -297,10 +305,17 @@
 					     .file = file,
 					     .filename = filename,
 					     .xattr_value = xattr_value,
-					     .xattr_len = xattr_len };
+					     .xattr_len = xattr_len,
+					     .modsig = modsig };
 	int violation = 0;
 
-	if (iint->measured_pcrs & (0x1 << pcr))
+	/*
+	 * We still need to store the measurement in the case of MODSIG because
+	 * we only have its contents to put in the list at the time of
+	 * appraisal, but a file measurement from earlier might already exist in
+	 * the measurement list.
+	 */
+	if (iint->measured_pcrs & (0x1 << pcr) && !modsig)
 		return;
 
 	result = ima_alloc_init_template(&event_data, &entry, template_desc);
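
A side cleanup in the same file: struct_size() (from <linux/overflow.h>) replaces the open-coded allocation size for the flexible template_data[] array. It computes roughly

	sizeof(**entry) + num_fields * sizeof((*entry)->template_data[0])

but saturates to SIZE_MAX on arithmetic overflow, so kzalloc() fails cleanly instead of returning an undersized buffer.
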
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index 89b8319..136ae4e 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -200,6 +200,110 @@
 }
 
 /*
+ * xattr_verify - verify xattr digest or signature
+ *
+ * Verify whether the hash or signature matches the file contents.
+ *
+ * Return 0 on success, error code otherwise.
+ */
+static int xattr_verify(enum ima_hooks func, struct integrity_iint_cache *iint,
+			struct evm_ima_xattr_data *xattr_value, int xattr_len,
+			enum integrity_status *status, const char **cause)
+{
+	int rc = -EINVAL, hash_start = 0;
+
+	switch (xattr_value->type) {
+	case IMA_XATTR_DIGEST_NG:
+		/* first byte contains algorithm id */
+		hash_start = 1;
+		/* fall through */
+	case IMA_XATTR_DIGEST:
+		if (iint->flags & IMA_DIGSIG_REQUIRED) {
+			*cause = "IMA-signature-required";
+			*status = INTEGRITY_FAIL;
+			break;
+		}
+		clear_bit(IMA_DIGSIG, &iint->atomic_flags);
+		if (xattr_len - sizeof(xattr_value->type) - hash_start >=
+				iint->ima_hash->length)
+			/*
+			 * xattr length may be longer. md5 hash in previous
+			 * version occupied 20 bytes in xattr, instead of 16
+			 */
+			rc = memcmp(&xattr_value->data[hash_start],
+				    iint->ima_hash->digest,
+				    iint->ima_hash->length);
+		else
+			rc = -EINVAL;
+		if (rc) {
+			*cause = "invalid-hash";
+			*status = INTEGRITY_FAIL;
+			break;
+		}
+		*status = INTEGRITY_PASS;
+		break;
+	case EVM_IMA_XATTR_DIGSIG:
+		set_bit(IMA_DIGSIG, &iint->atomic_flags);
+		rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA,
+					     (const char *)xattr_value,
+					     xattr_len,
+					     iint->ima_hash->digest,
+					     iint->ima_hash->length);
+		if (rc == -EOPNOTSUPP) {
+			*status = INTEGRITY_UNKNOWN;
+			break;
+		}
+		if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc &&
+		    func == KEXEC_KERNEL_CHECK)
+			rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM,
+						     (const char *)xattr_value,
+						     xattr_len,
+						     iint->ima_hash->digest,
+						     iint->ima_hash->length);
+		if (rc) {
+			*cause = "invalid-signature";
+			*status = INTEGRITY_FAIL;
+		} else {
+			*status = INTEGRITY_PASS;
+		}
+		break;
+	default:
+		*status = INTEGRITY_UNKNOWN;
+		*cause = "unknown-ima-data";
+		break;
+	}
+
+	return rc;
+}
+
+/*
+ * modsig_verify - verify modsig signature
+ *
+ * Verify whether the signature matches the file contents.
+ *
+ * Return 0 on success, error code otherwise.
+ */
+static int modsig_verify(enum ima_hooks func, const struct modsig *modsig,
+			 enum integrity_status *status, const char **cause)
+{
+	int rc;
+
+	rc = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig);
+	if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc &&
+	    func == KEXEC_KERNEL_CHECK)
+		rc = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM,
+					     modsig);
+	if (rc) {
+		*cause = "invalid-signature";
+		*status = INTEGRITY_FAIL;
+	} else {
+		*status = INTEGRITY_PASS;
+	}
+
+	return rc;
+}
+
+/*
  * ima_appraise_measurement - appraise file measurement
  *
  * Call evm_verifyxattr() to verify the integrity of 'security.ima'.
@@ -211,19 +315,22 @@
 			     struct integrity_iint_cache *iint,
 			     struct file *file, const unsigned char *filename,
 			     struct evm_ima_xattr_data *xattr_value,
-			     int xattr_len)
+			     int xattr_len, const struct modsig *modsig)
 {
 	static const char op[] = "appraise_data";
 	const char *cause = "unknown";
 	struct dentry *dentry = file_dentry(file);
 	struct inode *inode = d_backing_inode(dentry);
 	enum integrity_status status = INTEGRITY_UNKNOWN;
-	int rc = xattr_len, hash_start = 0;
+	int rc = xattr_len;
+	bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig;
 
-	if (!(inode->i_opflags & IOP_XATTR))
+	/* If not appraising a modsig, we need an xattr. */
+	if (!(inode->i_opflags & IOP_XATTR) && !try_modsig)
 		return INTEGRITY_UNKNOWN;
 
-	if (rc <= 0) {
+	/* If reading the xattr failed and there's no modsig, error out. */
+	if (rc <= 0 && !try_modsig) {
 		if (rc && rc != -ENODATA)
 			goto out;
 
@@ -246,6 +353,10 @@
 	case INTEGRITY_UNKNOWN:
 		break;
 	case INTEGRITY_NOXATTRS:	/* No EVM protected xattrs. */
+		/* It's fine not to have xattrs when using a modsig. */
+		if (try_modsig)
+			break;
+		/* fall through */
 	case INTEGRITY_NOLABEL:		/* No security.evm xattr. */
 		cause = "missing-HMAC";
 		goto out;
@@ -256,65 +367,18 @@
 		WARN_ONCE(true, "Unexpected integrity status %d\n", status);
 	}
 
-	switch (xattr_value->type) {
-	case IMA_XATTR_DIGEST_NG:
-		/* first byte contains algorithm id */
-		hash_start = 1;
-		/* fall through */
-	case IMA_XATTR_DIGEST:
-		if (iint->flags & IMA_DIGSIG_REQUIRED) {
-			cause = "IMA-signature-required";
-			status = INTEGRITY_FAIL;
-			break;
-		}
-		clear_bit(IMA_DIGSIG, &iint->atomic_flags);
-		if (xattr_len - sizeof(xattr_value->type) - hash_start >=
-				iint->ima_hash->length)
-			/* xattr length may be longer. md5 hash in previous
-			   version occupied 20 bytes in xattr, instead of 16
-			 */
-			rc = memcmp(&xattr_value->data[hash_start],
-				    iint->ima_hash->digest,
-				    iint->ima_hash->length);
-		else
-			rc = -EINVAL;
-		if (rc) {
-			cause = "invalid-hash";
-			status = INTEGRITY_FAIL;
-			break;
-		}
-		status = INTEGRITY_PASS;
-		break;
-	case EVM_IMA_XATTR_DIGSIG:
-		set_bit(IMA_DIGSIG, &iint->atomic_flags);
-		rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA,
-					     (const char *)xattr_value,
-					     xattr_len,
-					     iint->ima_hash->digest,
-					     iint->ima_hash->length);
-		if (rc == -EOPNOTSUPP) {
-			status = INTEGRITY_UNKNOWN;
-			break;
-		}
-		if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc &&
-		    func == KEXEC_KERNEL_CHECK)
-			rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM,
-						     (const char *)xattr_value,
-						     xattr_len,
-						     iint->ima_hash->digest,
-						     iint->ima_hash->length);
-		if (rc) {
-			cause = "invalid-signature";
-			status = INTEGRITY_FAIL;
-		} else {
-			status = INTEGRITY_PASS;
-		}
-		break;
-	default:
-		status = INTEGRITY_UNKNOWN;
-		cause = "unknown-ima-data";
-		break;
-	}
+	if (xattr_value)
+		rc = xattr_verify(func, iint, xattr_value, xattr_len, &status,
+				  &cause);
+
+	/*
+	 * If we have a modsig and either no imasig or the imasig's key isn't
+	 * known, then try verifying the modsig.
+	 */
+	if (try_modsig &&
+	    (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG ||
+	     rc == -ENOKEY))
+		rc = modsig_verify(func, modsig, &status, &cause);
 
 out:
 	/*
@@ -332,7 +396,7 @@
 				    op, cause, rc, 0);
 	} else if (status != INTEGRITY_PASS) {
 		/* Fix mode, but don't replace file signatures. */
-		if ((ima_appraise & IMA_APPRAISE_FIX) &&
+		if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig &&
 		    (!xattr_value ||
 		     xattr_value->type != EVM_IMA_XATTR_DIGSIG)) {
 			if (!ima_fix_xattr(dentry, iint))
@@ -371,7 +435,7 @@
 	    !(iint->flags & IMA_HASH))
 		return;
 
-	rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo);
+	rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL);
 	if (rc < 0)
 		return;
 
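
The conditions for falling back to the appended signature are compact but dense. A hypothetical helper (not in the patch) spelling them out:

	static bool should_try_modsig(bool try_modsig,
				      const struct evm_ima_xattr_data *xattr_value,
				      int xattr_rc)
	{
		/*
		 * Fall back when there is no xattr at all, when the xattr
		 * holds only a hash (which can never satisfy
		 * IMA_DIGSIG_REQUIRED), or when the xattr signature's key is
		 * unknown to the keyring.
		 */
		return try_modsig &&
		       (!xattr_value ||
			xattr_value->type == IMA_XATTR_DIGEST_NG ||
			xattr_rc == -ENOKEY);
	}
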
diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index d4c7b8e..73044fc 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -268,8 +268,16 @@
 		rbuf_len = min_t(loff_t, i_size - offset, rbuf_size[active]);
 		rc = integrity_kernel_read(file, offset, rbuf[active],
 					   rbuf_len);
-		if (rc != rbuf_len)
+		if (rc != rbuf_len) {
+			if (rc >= 0)
+				rc = -EINVAL;
+			/*
+			 * Forward current rc, do not overwrite with return value
+			 * from ahash_wait()
+			 */
+			ahash_wait(ahash_rc, &wait);
 			goto out3;
+		}
 
 		if (rbuf[1] && offset) {
 			/* Using two buffers, and it is not the first
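
Two details in this hunk: a positive short count from integrity_kernel_read() is normalized to -EINVAL so callers never mistake it for success, and the in-flight ahash request is still reaped via ahash_wait() while its return value is deliberately discarded, keeping the read error as the one reported. In sketch form:

	if (rc >= 0)
		rc = -EINVAL;		/* short read: report as an error */
	ahash_wait(ahash_rc, &wait);	/* reap the request, keep rc */
	goto out3;
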
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 5840197..60027c6 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -202,6 +202,7 @@
 	int rc = 0, action, must_appraise = 0;
 	int pcr = CONFIG_IMA_MEASURE_PCR_IDX;
 	struct evm_ima_xattr_data *xattr_value = NULL;
+	struct modsig *modsig = NULL;
 	int xattr_len = 0;
 	bool violation_check;
 	enum hash_algo hash_algo;
@@ -302,13 +303,27 @@
 	}
 
 	if ((action & IMA_APPRAISE_SUBMASK) ||
-		    strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0)
+	    strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0) {
 		/* read 'security.ima' */
 		xattr_len = ima_read_xattr(file_dentry(file), &xattr_value);
 
+		/*
+		 * Read the appended modsig if allowed by the policy, and allow
+		 * an additional measurement list entry, if needed, based on the
+		 * template format and whether the file was already measured.
+		 */
+		if (iint->flags & IMA_MODSIG_ALLOWED) {
+			rc = ima_read_modsig(func, buf, size, &modsig);
+
+			if (!rc && ima_template_has_modsig(template_desc) &&
+			    iint->flags & IMA_MEASURED)
+				action |= IMA_MEASURE;
+		}
+	}
+
 	hash_algo = ima_get_hash_algo(xattr_value, xattr_len);
 
-	rc = ima_collect_measurement(iint, file, buf, size, hash_algo);
+	rc = ima_collect_measurement(iint, file, buf, size, hash_algo, modsig);
 	if (rc != 0 && rc != -EBADF && rc != -EINVAL)
 		goto out_locked;
 
@@ -317,12 +332,12 @@
 
 	if (action & IMA_MEASURE)
 		ima_store_measurement(iint, file, pathname,
-				      xattr_value, xattr_len, pcr,
+				      xattr_value, xattr_len, modsig, pcr,
 				      template_desc);
 	if (rc == 0 && (action & IMA_APPRAISE_SUBMASK)) {
 		inode_lock(inode);
 		rc = ima_appraise_measurement(func, iint, file, pathname,
-					      xattr_value, xattr_len);
+					      xattr_value, xattr_len, modsig);
 		inode_unlock(inode);
 		if (!rc)
 			rc = mmap_violation_check(func, file, &pathbuf,
@@ -339,6 +354,7 @@
 		rc = -EACCES;
 	mutex_unlock(&iint->mutex);
 	kfree(xattr_value);
+	ima_free_modsig(modsig);
 out:
 	if (pathbuf)
 		__putname(pathbuf);
@@ -502,7 +518,7 @@
 	return 0;
 }
 
-static const int read_idmap[READING_MAX_ID] = {
+const int read_idmap[READING_MAX_ID] = {
 	[READING_FIRMWARE] = FIRMWARE_CHECK,
 	[READING_FIRMWARE_PREALLOC_BUFFER] = FIRMWARE_CHECK,
 	[READING_MODULE] = MODULE_CHECK,
@@ -574,7 +590,7 @@
 
 	switch (id) {
 	case LOADING_KEXEC_IMAGE:
-		if (IS_ENABLED(CONFIG_KEXEC_VERIFY_SIG)
+		if (IS_ENABLED(CONFIG_KEXEC_SIG)
 		    && arch_ima_get_secureboot()) {
 			pr_err("impossible to appraise a kernel image without a file descriptor; try using kexec_file_load syscall.\n");
 			return -EACCES;
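
Taken together, the ima_main.c changes give the modsig a clear lifecycle inside process_measurement() (the ellipses stand for the existing arguments shown in the hunks above; only the modsig parameter is new):

	ima_read_modsig(func, buf, size, &modsig);	/* parse appended sig */
	ima_collect_measurement(..., modsig);		/* hash file minus sig */
	ima_store_measurement(..., modsig, pcr, ...);	/* d-modsig/modsig fields */
	ima_appraise_measurement(..., modsig);		/* verify if required */
	ima_free_modsig(modsig);			/* always called, NULL-safe */
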
diff --git a/security/integrity/ima/ima_modsig.c b/security/integrity/ima/ima_modsig.c
new file mode 100644
index 0000000..d106885
--- /dev/null
+++ b/security/integrity/ima/ima_modsig.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * IMA support for appraising module-style appended signatures.
+ *
+ * Copyright (C) 2019  IBM Corporation
+ *
+ * Author:
+ * Thiago Jung Bauermann <bauerman@linux.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/module_signature.h>
+#include <keys/asymmetric-type.h>
+#include <crypto/pkcs7.h>
+
+#include "ima.h"
+
+struct modsig {
+	struct pkcs7_message *pkcs7_msg;
+
+	enum hash_algo hash_algo;
+
+	/* This digest will go in the 'd-modsig' field of the IMA template. */
+	const u8 *digest;
+	u32 digest_size;
+
+	/*
+	 * This is what will go to the measurement list if the template requires
+	 * storing the signature.
+	 */
+	int raw_pkcs7_len;
+	u8 raw_pkcs7[];
+};
+
+/**
+ * ima_hook_supports_modsig - can the policy allow modsig for this hook?
+ *
+ * modsig is only supported by hooks using ima_post_read_file(), because only
+ * they preload the contents of the file in a buffer. FILE_CHECK does that in
+ * some cases, but not when reached from vfs_open(). POLICY_CHECK could
+ * support it too, but policy files are plain text, so an appended signature
+ * is not useful in practice; deny it there as well.
+ */
+bool ima_hook_supports_modsig(enum ima_hooks func)
+{
+	switch (func) {
+	case KEXEC_KERNEL_CHECK:
+	case KEXEC_INITRAMFS_CHECK:
+	case MODULE_CHECK:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * ima_read_modsig - Read modsig from buf.
+ *
+ * Return: 0 on success, error code otherwise.
+ */
+int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len,
+		    struct modsig **modsig)
+{
+	const size_t marker_len = strlen(MODULE_SIG_STRING);
+	const struct module_signature *sig;
+	struct modsig *hdr;
+	size_t sig_len;
+	const void *p;
+	int rc;
+
+	if (buf_len <= marker_len + sizeof(*sig))
+		return -ENOENT;
+
+	p = buf + buf_len - marker_len;
+	if (memcmp(p, MODULE_SIG_STRING, marker_len))
+		return -ENOENT;
+
+	buf_len -= marker_len;
+	sig = (const struct module_signature *)(p - sizeof(*sig));
+
+	rc = mod_check_sig(sig, buf_len, func_tokens[func]);
+	if (rc)
+		return rc;
+
+	sig_len = be32_to_cpu(sig->sig_len);
+	buf_len -= sig_len + sizeof(*sig);
+
+	/* Allocate sig_len additional bytes to hold the raw PKCS#7 data. */
+	hdr = kzalloc(sizeof(*hdr) + sig_len, GFP_KERNEL);
+	if (!hdr)
+		return -ENOMEM;
+
+	hdr->pkcs7_msg = pkcs7_parse_message(buf + buf_len, sig_len);
+	if (IS_ERR(hdr->pkcs7_msg)) {
+		rc = PTR_ERR(hdr->pkcs7_msg);
+		kfree(hdr);
+		return rc;
+	}
+
+	memcpy(hdr->raw_pkcs7, buf + buf_len, sig_len);
+	hdr->raw_pkcs7_len = sig_len;
+
+	/* We don't know the hash algorithm yet. */
+	hdr->hash_algo = HASH_ALGO__LAST;
+
+	*modsig = hdr;
+
+	return 0;
+}
+
+/**
+ * ima_collect_modsig - Calculate the file hash without the appended signature.
+ *
+ * Since the modsig is part of the file contents, the hash used in its
+ * signature isn't the same one ordinarily calculated by IMA. Therefore the
+ * PKCS7 code calculates a separate one for signature verification.
+ */
+void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size)
+{
+	int rc;
+
+	/*
+	 * Provide the file contents (minus the appended sig) so that the PKCS7
+	 * code can calculate the file hash.
+	 */
+	size -= modsig->raw_pkcs7_len + strlen(MODULE_SIG_STRING) +
+		sizeof(struct module_signature);
+	rc = pkcs7_supply_detached_data(modsig->pkcs7_msg, buf, size);
+	if (rc)
+		return;
+
+	/* Ask the PKCS7 code to calculate the file hash. */
+	rc = pkcs7_get_digest(modsig->pkcs7_msg, &modsig->digest,
+			      &modsig->digest_size, &modsig->hash_algo);
+}
+
+int ima_modsig_verify(struct key *keyring, const struct modsig *modsig)
+{
+	return verify_pkcs7_message_sig(NULL, 0, modsig->pkcs7_msg, keyring,
+					VERIFYING_MODULE_SIGNATURE, NULL, NULL);
+}
+
+int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo,
+			  const u8 **digest, u32 *digest_size)
+{
+	*algo = modsig->hash_algo;
+	*digest = modsig->digest;
+	*digest_size = modsig->digest_size;
+
+	return 0;
+}
+
+int ima_get_raw_modsig(const struct modsig *modsig, const void **data,
+		       u32 *data_len)
+{
+	*data = &modsig->raw_pkcs7;
+	*data_len = modsig->raw_pkcs7_len;
+
+	return 0;
+}
+
+void ima_free_modsig(struct modsig *modsig)
+{
+	if (!modsig)
+		return;
+
+	pkcs7_free_message(modsig->pkcs7_msg);
+	kfree(modsig);
+}
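
For reference, the on-disk layout that ima_read_modsig() walks backwards from the end of the buffer; this is the same format used for signed kernel modules:

	+---------------+-------------+-------------------------+--------------------------------+
	| file contents | PKCS#7 blob | struct module_signature | "~Module signature appended~\n" |
	+---------------+-------------+-------------------------+--------------------------------+

sig->sig_len (big-endian) gives the size of the PKCS#7 blob, so after the two subtractions above buf_len is exactly the size of the original, unsigned file contents; that is what ima_collect_modsig() later feeds to pkcs7_supply_detached_data().
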
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 6df7f64..5380aca 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -6,6 +6,9 @@
  * ima_policy.c
  *	- initialize default measure policy rules
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/fs.h>
@@ -491,6 +494,9 @@
 	struct ima_rule_entry *entry;
 	int action = 0, actmask = flags | (flags << 1);
 
+	if (template_desc)
+		*template_desc = ima_template_desc_current();
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(entry, ima_rules, list) {
 
@@ -510,6 +516,7 @@
 				action |= IMA_FAIL_UNVERIFIABLE_SIGS;
 		}
 
 		if (entry->action & IMA_DO_MASK)
 			actmask &= ~(entry->action | entry->action << 1);
 		else
@@ -520,8 +527,6 @@
 
 		if (template_desc && entry->template)
 			*template_desc = entry->template;
-		else if (template_desc)
-			*template_desc = ima_template_desc_current();
 
 		if (!actmask)
 			break;
@@ -843,6 +848,38 @@
 	ima_log_string_op(ab, key, value, NULL);
 }
 
+/*
+ * Validating the appended signature included in the measurement list requires
+ * the file hash calculated without the appended signature (i.e., the 'd-modsig'
+ * field). Therefore, notify the user if they have the 'modsig' field but not
+ * the 'd-modsig' field in the template.
+ */
+static void check_template_modsig(const struct ima_template_desc *template)
+{
+#define MSG "template with 'modsig' field also needs 'd-modsig' field\n"
+	bool has_modsig, has_dmodsig;
+	static bool checked;
+	int i;
+
+	/* We only need to notify the user once. */
+	if (checked)
+		return;
+
+	has_modsig = has_dmodsig = false;
+	for (i = 0; i < template->num_fields; i++) {
+		if (!strcmp(template->fields[i]->field_id, "modsig"))
+			has_modsig = true;
+		else if (!strcmp(template->fields[i]->field_id, "d-modsig"))
+			has_dmodsig = true;
+	}
+
+	if (has_modsig && !has_dmodsig)
+		pr_notice(MSG);
+
+	checked = true;
+#undef MSG
+}
+
 static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
 {
 	struct audit_buffer *ab;
@@ -1128,6 +1165,10 @@
 			ima_log_string(ab, "appraise_type", args[0].from);
 			if ((strcmp(args[0].from, "imasig")) == 0)
 				entry->flags |= IMA_DIGSIG_REQUIRED;
+			else if (ima_hook_supports_modsig(entry->func) &&
+				 strcmp(args[0].from, "imasig|modsig") == 0)
+				entry->flags |= IMA_DIGSIG_REQUIRED |
+						IMA_MODSIG_ALLOWED;
 			else
 				result = -EINVAL;
 			break;
@@ -1181,6 +1222,12 @@
 	else if (entry->action == APPRAISE)
 		temp_ima_appraise |= ima_appraise_flag(entry->func);
 
+	if (!result && entry->flags & IMA_MODSIG_ALLOWED) {
+		template_desc = entry->template ? entry->template :
+						  ima_template_desc_current();
+		check_template_modsig(template_desc);
+	}
+
 	audit_log_format(ab, "res=%d", !result);
 	audit_log_end(ab);
 	return result;
@@ -1252,6 +1299,12 @@
 	}
 }
 
+#define __ima_hook_stringify(str)	(#str),
+
+const char *const func_tokens[] = {
+	__ima_hooks(__ima_hook_stringify)
+};
+
 #ifdef	CONFIG_IMA_READ_POLICY
 enum {
 	mask_exec = 0, mask_write, mask_read, mask_append
@@ -1264,12 +1317,6 @@
 	"^MAY_APPEND"
 };
 
-#define __ima_hook_stringify(str)	(#str),
-
-static const char *const func_tokens[] = {
-	__ima_hooks(__ima_hook_stringify)
-};
-
 void *ima_policy_start(struct seq_file *m, loff_t *pos)
 {
 	loff_t l = *pos;
@@ -1447,8 +1494,12 @@
 	}
 	if (entry->template)
 		seq_printf(m, "template=%s ", entry->template->name);
-	if (entry->flags & IMA_DIGSIG_REQUIRED)
-		seq_puts(m, "appraise_type=imasig ");
+	if (entry->flags & IMA_DIGSIG_REQUIRED) {
+		if (entry->flags & IMA_MODSIG_ALLOWED)
+			seq_puts(m, "appraise_type=imasig|modsig ");
+		else
+			seq_puts(m, "appraise_type=imasig ");
+	}
 	if (entry->flags & IMA_PERMIT_DIRECTIO)
 		seq_puts(m, "permit_directio ");
 	rcu_read_unlock();
@@ -1456,3 +1507,53 @@
 	return 0;
 }
 #endif	/* CONFIG_IMA_READ_POLICY */
+
+#if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING)
+/*
+ * ima_appraise_signature: whether IMA will appraise a given function using
+ * an IMA digital signature. This is restricted to cases where the kernel
+ * has a set of built-in trusted keys in order to avoid an attacker simply
+ * loading additional keys.
+ */
+bool ima_appraise_signature(enum kernel_read_file_id id)
+{
+	struct ima_rule_entry *entry;
+	bool found = false;
+	enum ima_hooks func;
+
+	if (id >= READING_MAX_ID)
+		return false;
+
+	func = read_idmap[id] ?: FILE_CHECK;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, ima_rules, list) {
+		if (entry->action != APPRAISE)
+			continue;
+
+		/*
+		 * A generic entry (one with no func specified) matches any
+		 * hook; otherwise the entry must match the func we're
+		 * looking for.
+		 */
+		if (entry->func && entry->func != func)
+			continue;
+
+		/*
+		 * We require this to be a digital signature, not a raw IMA
+		 * hash.
+		 */
+		if (entry->flags & IMA_DIGSIG_REQUIRED)
+			found = true;
+
+		/*
+		 * We've found a rule that matches, so break now even if it
+		 * didn't require a digital signature - a later rule that does
+		 * can't override it, so matching it would be a false positive.
+		 */
+		break;
+	}
+
+	rcu_read_unlock();
+	return found;
+}
+#endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */
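
func_tokens[] moves out of the CONFIG_IMA_READ_POLICY-only block because ima_read_modsig() now uses func_tokens[func] as the name string for mod_check_sig(). The x-macro expansion yields one quoted name per IMA hook; assuming the __ima_hooks() list starts as usual, it expands to something like:

	const char *const func_tokens[] = {
		"NONE",
		"FILE_CHECK",
		"MMAP_CHECK",
		/* ... one string per __ima_hooks() entry ... */
	};
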
diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c
index cb349d7b..6aa6408 100644
--- a/security/integrity/ima/ima_template.c
+++ b/security/integrity/ima/ima_template.c
@@ -23,6 +23,7 @@
 	{.name = "ima-ng", .fmt = "d-ng|n-ng"},
 	{.name = "ima-sig", .fmt = "d-ng|n-ng|sig"},
 	{.name = "ima-buf", .fmt = "d-ng|n-ng|buf"},
+	{.name = "ima-modsig", .fmt = "d-ng|n-ng|sig|d-modsig|modsig"},
 	{.name = "", .fmt = ""},	/* placeholder for a custom format */
 };
 
@@ -42,6 +43,10 @@
 	 .field_show = ima_show_template_sig},
 	{.field_id = "buf", .field_init = ima_eventbuf_init,
 	 .field_show = ima_show_template_buf},
+	{.field_id = "d-modsig", .field_init = ima_eventdigest_modsig_init,
+	 .field_show = ima_show_template_digest_ng},
+	{.field_id = "modsig", .field_init = ima_eventmodsig_init,
+	 .field_show = ima_show_template_sig},
 };
 
 /*
@@ -49,10 +54,29 @@
  * need to be accounted for since they shouldn't be defined in the same template
  * description as 'd-ng' and 'n-ng' respectively.
  */
-#define MAX_TEMPLATE_NAME_LEN sizeof("d-ng|n-ng|sig|buf")
+#define MAX_TEMPLATE_NAME_LEN sizeof("d-ng|n-ng|sig|buf|d-modsig|modsig")
 
 static struct ima_template_desc *ima_template;
 
+/**
+ * ima_template_has_modsig - Check whether template has modsig-related fields.
+ * @ima_template: IMA template to check.
+ *
+ * Tells whether the given template has fields referencing a file's appended
+ * signature.
+ */
+bool ima_template_has_modsig(const struct ima_template_desc *ima_template)
+{
+	int i;
+
+	for (i = 0; i < ima_template->num_fields; i++)
+		if (!strcmp(ima_template->fields[i]->field_id, "modsig") ||
+		    !strcmp(ima_template->fields[i]->field_id, "d-modsig"))
+			return true;
+
+	return false;
+}
+
 static int __init ima_template_setup(char *str)
 {
 	struct ima_template_desc *template_desc;
@@ -282,9 +306,8 @@
 	int ret = 0;
 	int i;
 
-	*entry = kzalloc(sizeof(**entry) +
-		    template_desc->num_fields * sizeof(struct ima_field_data),
-		    GFP_NOFS);
+	*entry = kzalloc(struct_size(*entry, template_data,
+				     template_desc->num_fields), GFP_NOFS);
 	if (!*entry)
 		return -ENOMEM;
 
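
The new template can be selected with the existing boot parameters; for example the built-in definition above, or a custom variant (the field order here is hypothetical):

	ima_template=ima-modsig
	ima_template_fmt=d-ng|n-ng|d-modsig|modsig

Whichever way it is chosen, ima_template_has_modsig() is what later tells ima_main.c that a second measurement entry is worth emitting once the appended signature has been parsed.
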
diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c
index 2fb9a10..32ae05d 100644
--- a/security/integrity/ima/ima_template_lib.c
+++ b/security/integrity/ima/ima_template_lib.c
@@ -225,7 +225,8 @@
 	return 0;
 }
 
-static int ima_eventdigest_init_common(u8 *digest, u32 digestsize, u8 hash_algo,
+static int ima_eventdigest_init_common(const u8 *digest, u32 digestsize,
+				       u8 hash_algo,
 				       struct ima_field_data *field_data)
 {
 	/*
@@ -328,6 +329,41 @@
 					   hash_algo, field_data);
 }
 
+/*
+ * This function writes the digest of the file which is expected to match the
+ * digest contained in the file's appended signature.
+ */
+int ima_eventdigest_modsig_init(struct ima_event_data *event_data,
+				struct ima_field_data *field_data)
+{
+	enum hash_algo hash_algo;
+	const u8 *cur_digest;
+	u32 cur_digestsize;
+
+	if (!event_data->modsig)
+		return 0;
+
+	if (event_data->violation) {
+		/* Recording a violation. */
+		hash_algo = HASH_ALGO_SHA1;
+		cur_digest = NULL;
+		cur_digestsize = 0;
+	} else {
+		int rc;
+
+		rc = ima_get_modsig_digest(event_data->modsig, &hash_algo,
+					   &cur_digest, &cur_digestsize);
+		if (rc)
+			return rc;
+		else if (hash_algo == HASH_ALGO__LAST || cur_digestsize == 0)
+			/* There was some error collecting the digest. */
+			return -EINVAL;
+	}
+
+	return ima_eventdigest_init_common(cur_digest, cur_digestsize,
+					   hash_algo, field_data);
+}
+
 static int ima_eventname_init_common(struct ima_event_data *event_data,
 				     struct ima_field_data *field_data,
 				     bool size_limit)
@@ -406,3 +442,29 @@
 					     event_data->buf_len, DATA_FMT_HEX,
 					     field_data);
 }
+
+/*
+ *  ima_eventmodsig_init - include the appended file signature as part of the
+ *  template data
+ */
+int ima_eventmodsig_init(struct ima_event_data *event_data,
+			 struct ima_field_data *field_data)
+{
+	const void *data;
+	u32 data_len;
+	int rc;
+
+	if (!event_data->modsig)
+		return 0;
+
+	/*
+	 * modsig is a runtime structure containing pointers. Get its raw data
+	 * instead.
+	 */
+	rc = ima_get_raw_modsig(event_data->modsig, &data, &data_len);
+	if (rc)
+		return rc;
+
+	return ima_write_template_field_data(data, data_len, DATA_FMT_HEX,
+					     field_data);
+}
diff --git a/security/integrity/ima/ima_template_lib.h b/security/integrity/ima/ima_template_lib.h
index 652aa5d..9a88c79 100644
--- a/security/integrity/ima/ima_template_lib.h
+++ b/security/integrity/ima/ima_template_lib.h
@@ -36,10 +36,14 @@
 		       struct ima_field_data *field_data);
 int ima_eventdigest_ng_init(struct ima_event_data *event_data,
 			    struct ima_field_data *field_data);
+int ima_eventdigest_modsig_init(struct ima_event_data *event_data,
+				struct ima_field_data *field_data);
 int ima_eventname_ng_init(struct ima_event_data *event_data,
 			  struct ima_field_data *field_data);
 int ima_eventsig_init(struct ima_event_data *event_data,
 		      struct ima_field_data *field_data);
 int ima_eventbuf_init(struct ima_event_data *event_data,
 		      struct ima_field_data *field_data);
+int ima_eventmodsig_init(struct ima_event_data *event_data,
+			 struct ima_field_data *field_data);
 #endif /* __LINUX_IMA_TEMPLATE_LIB_H */
diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h
index ed12d8e..d9323d3 100644
--- a/security/integrity/integrity.h
+++ b/security/integrity/integrity.h
@@ -31,6 +31,7 @@
 #define IMA_NEW_FILE		0x04000000
 #define EVM_IMMUTABLE_DIGSIG	0x08000000
 #define IMA_FAIL_UNVERIFIABLE_SIGS	0x10000000
+#define IMA_MODSIG_ALLOWED	0x20000000
 
 #define IMA_DO_MASK		(IMA_MEASURE | IMA_APPRAISE | IMA_AUDIT | \
 				 IMA_HASH | IMA_APPRAISE_SUBMASK)
@@ -147,10 +148,13 @@
 
 extern struct dentry *integrity_dir;
 
+struct modsig;
+
 #ifdef CONFIG_INTEGRITY_SIGNATURE
 
 int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
 			    const char *digest, int digestlen);
+int integrity_modsig_verify(unsigned int id, const struct modsig *modsig);
 
 int __init integrity_init_keyring(const unsigned int id);
 int __init integrity_load_x509(const unsigned int id, const char *path);
@@ -165,6 +169,12 @@
 	return -EOPNOTSUPP;
 }
 
+static inline int integrity_modsig_verify(unsigned int id,
+					  const struct modsig *modsig)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int integrity_init_keyring(const unsigned int id)
 {
 	return 0;
@@ -190,6 +200,16 @@
 }
 #endif
 
+#ifdef CONFIG_IMA_APPRAISE_MODSIG
+int ima_modsig_verify(struct key *keyring, const struct modsig *modsig);
+#else
+static inline int ima_modsig_verify(struct key *keyring,
+				    const struct modsig *modsig)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 #ifdef CONFIG_IMA_LOAD_X509
 void __init ima_load_x509(void);
 #else
diff --git a/security/keys/trusted.c b/security/keys/trusted.c
index ade6991..1fbd778 100644
--- a/security/keys/trusted.c
+++ b/security/keys/trusted.c
@@ -1228,11 +1228,16 @@
 
 static int __init init_digests(void)
 {
+	int i;
+
 	digests = kcalloc(chip->nr_allocated_banks, sizeof(*digests),
 			  GFP_KERNEL);
 	if (!digests)
 		return -ENOMEM;
 
+	for (i = 0; i < chip->nr_allocated_banks; i++)
+		digests[i].alg_id = chip->allocated_banks[i].alg_id;
+
 	return 0;
 }
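
Context for the fix: each entry of digests[] is a struct tpm_digest (include/linux/tpm.h):

	struct tpm_digest {
		u16 alg_id;
		u8 digest[TPM_MAX_DIGEST_SIZE];
	} __packed;

kcalloc() leaves alg_id zeroed, which is not a valid TPM 2.0 algorithm ID, so (as far as can be inferred from the fix) PCR operations built from these digests could not target the chip's allocated banks; copying each bank's algorithm ID into place fixes that.
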
 
diff --git a/security/lockdown/Kconfig b/security/lockdown/Kconfig
new file mode 100644
index 0000000..e84ddf4
--- /dev/null
+++ b/security/lockdown/Kconfig
@@ -0,0 +1,47 @@
+config SECURITY_LOCKDOWN_LSM
+	bool "Basic module for enforcing kernel lockdown"
+	depends on SECURITY
+	select MODULE_SIG if MODULES
+	help
+	  Build support for an LSM that enforces a coarse kernel lockdown
+	  behaviour.
+
+config SECURITY_LOCKDOWN_LSM_EARLY
+	bool "Enable lockdown LSM early in init"
+	depends on SECURITY_LOCKDOWN_LSM
+	help
+	  Enable the lockdown LSM early in boot. This is necessary in order
+	  to ensure that lockdown enforcement can be carried out on kernel
+	  boot parameters that are otherwise parsed before the security
+	  subsystem is fully initialised. If enabled, lockdown will
+	  unconditionally be called before any other LSMs.
+
+choice
+	prompt "Kernel default lockdown mode"
+	default LOCK_DOWN_KERNEL_FORCE_NONE
+	depends on SECURITY_LOCKDOWN_LSM
+	help
+	  The kernel can be configured to default to differing levels of
+	  lockdown.
+
+config LOCK_DOWN_KERNEL_FORCE_NONE
+	bool "None"
+	help
+	  No lockdown functionality is enabled by default. Lockdown may be
+	  enabled via the kernel commandline or /sys/kernel/security/lockdown.
+
+config LOCK_DOWN_KERNEL_FORCE_INTEGRITY
+	bool "Integrity"
+	help
+	  The kernel runs in integrity mode by default. Features that allow
+	  the kernel to be modified at runtime are disabled.
+
+config LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY
+	bool "Confidentiality"
+	help
+	  The kernel runs in confidentiality mode by default. Features that
+	  allow the kernel to be modified at runtime or that permit userland
+	  code to read confidential material held inside the kernel are
+	  disabled.
+
+endchoice
diff --git a/security/lockdown/Makefile b/security/lockdown/Makefile
new file mode 100644
index 0000000..e3634b9
--- /dev/null
+++ b/security/lockdown/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SECURITY_LOCKDOWN_LSM) += lockdown.o
diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c
new file mode 100644
index 0000000..8a10b43
--- /dev/null
+++ b/security/lockdown/lockdown.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Lock down the kernel
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/security.h>
+#include <linux/export.h>
+#include <linux/lsm_hooks.h>
+
+static enum lockdown_reason kernel_locked_down;
+
+static const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
+	[LOCKDOWN_NONE] = "none",
+	[LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading",
+	[LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port",
+	[LOCKDOWN_KEXEC] = "kexec of unsigned images",
+	[LOCKDOWN_HIBERNATION] = "hibernation",
+	[LOCKDOWN_PCI_ACCESS] = "direct PCI access",
+	[LOCKDOWN_IOPORT] = "raw io port access",
+	[LOCKDOWN_MSR] = "raw MSR access",
+	[LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables",
+	[LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage",
+	[LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO",
+	[LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters",
+	[LOCKDOWN_MMIOTRACE] = "unsafe mmio",
+	[LOCKDOWN_DEBUGFS] = "debugfs access",
+	[LOCKDOWN_INTEGRITY_MAX] = "integrity",
+	[LOCKDOWN_KCORE] = "/proc/kcore access",
+	[LOCKDOWN_KPROBES] = "use of kprobes",
+	[LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM",
+	[LOCKDOWN_PERF] = "unsafe use of perf",
+	[LOCKDOWN_TRACEFS] = "use of tracefs",
+	[LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
+};
+
+static const enum lockdown_reason lockdown_levels[] = {LOCKDOWN_NONE,
+						 LOCKDOWN_INTEGRITY_MAX,
+						 LOCKDOWN_CONFIDENTIALITY_MAX};
+
+/*
+ * Put the kernel into lock-down mode.
+ */
+static int lock_kernel_down(const char *where, enum lockdown_reason level)
+{
+	if (kernel_locked_down >= level)
+		return -EPERM;
+
+	kernel_locked_down = level;
+	pr_notice("Kernel is locked down from %s; see man kernel_lockdown.7\n",
+		  where);
+	return 0;
+}
+
+static int __init lockdown_param(char *level)
+{
+	if (!level)
+		return -EINVAL;
+
+	if (strcmp(level, "integrity") == 0)
+		lock_kernel_down("command line", LOCKDOWN_INTEGRITY_MAX);
+	else if (strcmp(level, "confidentiality") == 0)
+		lock_kernel_down("command line", LOCKDOWN_CONFIDENTIALITY_MAX);
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+early_param("lockdown", lockdown_param);
+
+/**
+ * lockdown_is_locked_down - Find out if the kernel is locked down
+ * @what: Tag to use in notice generated if lockdown is in effect
+ */
+static int lockdown_is_locked_down(enum lockdown_reason what)
+{
+	if (WARN(what >= LOCKDOWN_CONFIDENTIALITY_MAX,
+		 "Invalid lockdown reason"))
+		return -EPERM;
+
+	if (kernel_locked_down >= what) {
+		if (lockdown_reasons[what])
+			pr_notice("Lockdown: %s: %s is restricted; see man kernel_lockdown.7\n",
+				  current->comm, lockdown_reasons[what]);
+		return -EPERM;
+	}
+
+	return 0;
+}
+
+static struct security_hook_list lockdown_hooks[] __lsm_ro_after_init = {
+	LSM_HOOK_INIT(locked_down, lockdown_is_locked_down),
+};
+
+static int __init lockdown_lsm_init(void)
+{
+#if defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY)
+	lock_kernel_down("Kernel configuration", LOCKDOWN_INTEGRITY_MAX);
+#elif defined(CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY)
+	lock_kernel_down("Kernel configuration", LOCKDOWN_CONFIDENTIALITY_MAX);
+#endif
+	security_add_hooks(lockdown_hooks, ARRAY_SIZE(lockdown_hooks),
+			   "lockdown");
+	return 0;
+}
+
+static ssize_t lockdown_read(struct file *filp, char __user *buf, size_t count,
+			     loff_t *ppos)
+{
+	char temp[80];
+	int i, offset = 0;
+
+	for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) {
+		enum lockdown_reason level = lockdown_levels[i];
+
+		if (lockdown_reasons[level]) {
+			const char *label = lockdown_reasons[level];
+
+			if (kernel_locked_down == level)
+				offset += sprintf(temp+offset, "[%s] ", label);
+			else
+				offset += sprintf(temp+offset, "%s ", label);
+		}
+	}
+
+	/* Convert the last space to a newline if needed. */
+	if (offset > 0)
+		temp[offset-1] = '\n';
+
+	return simple_read_from_buffer(buf, count, ppos, temp, strlen(temp));
+}
+
+static ssize_t lockdown_write(struct file *file, const char __user *buf,
+			      size_t n, loff_t *ppos)
+{
+	char *state;
+	int i, len, err = -EINVAL;
+
+	state = memdup_user_nul(buf, n);
+	if (IS_ERR(state))
+		return PTR_ERR(state);
+
+	len = strlen(state);
+	if (len && state[len-1] == '\n') {
+		state[len-1] = '\0';
+		len--;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(lockdown_levels); i++) {
+		enum lockdown_reason level = lockdown_levels[i];
+		const char *label = lockdown_reasons[level];
+
+		if (label && !strcmp(state, label))
+			err = lock_kernel_down("securityfs", level);
+	}
+
+	kfree(state);
+	return err ? err : n;
+}
+
+static const struct file_operations lockdown_ops = {
+	.read  = lockdown_read,
+	.write = lockdown_write,
+};
+
+static int __init lockdown_secfs_init(void)
+{
+	struct dentry *dentry;
+
+	dentry = securityfs_create_file("lockdown", 0600, NULL, NULL,
+					&lockdown_ops);
+	return PTR_ERR_OR_ZERO(dentry);
+}
+
+core_initcall(lockdown_secfs_init);
+
+#ifdef CONFIG_SECURITY_LOCKDOWN_LSM_EARLY
+DEFINE_EARLY_LSM(lockdown) = {
+#else
+DEFINE_LSM(lockdown) = {
+#endif
+	.name = "lockdown",
+	.init = lockdown_lsm_init,
+};
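
Lockdown can thus be engaged three ways: the Kconfig default, lockdown=integrity or lockdown=confidentiality on the kernel command line, and a one-way write to /sys/kernel/security/lockdown (reading the file shows all levels with the current one bracketed). Enforcement points elsewhere in the kernel are expected to call the new security_locked_down() helper; a sketch of such a caller (the function is hypothetical, but LOCKDOWN_DEV_MEM comes from the table above):

	static int example_open_dev_mem(void)
	{
		int rc = security_locked_down(LOCKDOWN_DEV_MEM);

		if (rc)
			return rc;	/* -EPERM while locked down */
		/* ... proceed with the privileged access ... */
		return 0;
	}
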
diff --git a/security/safesetid/securityfs.c b/security/safesetid/securityfs.c
index d568e17..74a13d4 100644
--- a/security/safesetid/securityfs.c
+++ b/security/safesetid/securityfs.c
@@ -187,7 +187,8 @@
 out_free_buf:
 	kfree(buf);
 out_free_pol:
-	release_ruleset(pol);
+	if (pol)
+		release_ruleset(pol);
 	return err;
 }
 
diff --git a/security/security.c b/security/security.c
index 250ee2d..1bc000f 100644
--- a/security/security.c
+++ b/security/security.c
@@ -33,6 +33,7 @@
 
 /* How many LSMs were built into the kernel? */
 #define LSM_COUNT (__end_lsm_info - __start_lsm_info)
+#define EARLY_LSM_COUNT (__end_early_lsm_info - __start_early_lsm_info)
 
 struct security_hook_heads security_hook_heads __lsm_ro_after_init;
 static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain);
@@ -277,6 +278,8 @@
 static void __init lsm_early_cred(struct cred *cred);
 static void __init lsm_early_task(struct task_struct *task);
 
+static int lsm_append(const char *new, char **result);
+
 static void __init ordered_lsm_init(void)
 {
 	struct lsm_info **lsm;
@@ -323,6 +326,26 @@
 	kfree(ordered_lsms);
 }
 
+int __init early_security_init(void)
+{
+	int i;
+	struct hlist_head *list = (struct hlist_head *) &security_hook_heads;
+	struct lsm_info *lsm;
+
+	for (i = 0; i < sizeof(security_hook_heads) / sizeof(struct hlist_head);
+	     i++)
+		INIT_HLIST_HEAD(&list[i]);
+
+	for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
+		if (!lsm->enabled)
+			lsm->enabled = &lsm_enabled_true;
+		prepare_lsm(lsm);
+		initialize_lsm(lsm);
+	}
+
+	return 0;
+}
+
 /**
  * security_init - initializes the security framework
  *
@@ -330,14 +353,18 @@
  */
 int __init security_init(void)
 {
-	int i;
-	struct hlist_head *list = (struct hlist_head *) &security_hook_heads;
+	struct lsm_info *lsm;
 
 	pr_info("Security Framework initializing\n");
 
-	for (i = 0; i < sizeof(security_hook_heads) / sizeof(struct hlist_head);
-	     i++)
-		INIT_HLIST_HEAD(&list[i]);
+	/*
+	 * Append the names of the early LSM modules now that kmalloc() is
+	 * available
+	 */
+	for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
+		if (lsm->enabled)
+			lsm_append(lsm->name, &lsm_names);
+	}
 
 	/* Load LSMs in specified order. */
 	ordered_lsm_init();
@@ -384,7 +411,7 @@
 	return !strcmp(last, lsm);
 }
 
-static int lsm_append(char *new, char **result)
+static int lsm_append(const char *new, char **result)
 {
 	char *cp;
 
@@ -422,8 +449,15 @@
 		hooks[i].lsm = lsm;
 		hlist_add_tail_rcu(&hooks[i].list, hooks[i].head);
 	}
-	if (lsm_append(lsm, &lsm_names) < 0)
-		panic("%s - Cannot get early memory.\n", __func__);
+
+	/*
+	 * Don't try to append during early_security_init(); we'll come back
+	 * and fix this up afterwards.
+	 */
+	if (slab_is_available()) {
+		if (lsm_append(lsm, &lsm_names) < 0)
+			panic("%s - Cannot get early memory.\n", __func__);
+	}
 }
 
 int call_blocking_lsm_notifier(enum lsm_event event, void *data)
@@ -870,6 +904,12 @@
 	return call_int_hook(move_mount, 0, from_path, to_path);
 }
 
+int security_path_notify(const struct path *path, u64 mask,
+				unsigned int obj_type)
+{
+	return call_int_hook(path_notify, 0, path, mask, obj_type);
+}
+
 int security_inode_alloc(struct inode *inode)
 {
 	int rc = lsm_inode_alloc(inode);
@@ -2358,3 +2398,9 @@
 	call_void_hook(bpf_prog_free_security, aux);
 }
 #endif /* CONFIG_BPF_SYSCALL */
+
+int security_locked_down(enum lockdown_reason what)
+{
+	return call_int_hook(locked_down, 0, what);
+}
+EXPORT_SYMBOL(security_locked_down);
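
security_locked_down() is a thin wrapper over call_int_hook() with a default of 0, so a kernel with no lockdown LSM registered always answers "allowed". Expanded, the hook call behaves like:

	int rc = 0;
	struct security_hook_list *p;

	hlist_for_each_entry(p, &security_hook_heads.locked_down, list) {
		rc = p->hook.locked_down(what);
		if (rc != 0)
			break;		/* first LSM to object wins */
	}
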
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 74dd46d..9625b99 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -89,6 +89,8 @@
 #include <linux/kernfs.h>
 #include <linux/stringhash.h>	/* for hashlen_string() */
 #include <uapi/linux/mount.h>
+#include <linux/fsnotify.h>
+#include <linux/fanotify.h>
 
 #include "avc.h"
 #include "objsec.h"
@@ -3275,6 +3277,50 @@
 	return -EACCES;
 }
 
+static int selinux_path_notify(const struct path *path, u64 mask,
+						unsigned int obj_type)
+{
+	int ret;
+	u32 perm;
+
+	struct common_audit_data ad;
+
+	ad.type = LSM_AUDIT_DATA_PATH;
+	ad.u.path = *path;
+
+	/*
+	 * Set permission needed based on the type of mark being set.
+	 * Performs an additional check for sb watches.
+	 */
+	switch (obj_type) {
+	case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
+		perm = FILE__WATCH_MOUNT;
+		break;
+	case FSNOTIFY_OBJ_TYPE_SB:
+		perm = FILE__WATCH_SB;
+		ret = superblock_has_perm(current_cred(), path->dentry->d_sb,
+						FILESYSTEM__WATCH, &ad);
+		if (ret)
+			return ret;
+		break;
+	case FSNOTIFY_OBJ_TYPE_INODE:
+		perm = FILE__WATCH;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* blocking watches require the file:watch_with_perm permission */
+	if (mask & (ALL_FSNOTIFY_PERM_EVENTS))
+		perm |= FILE__WATCH_WITH_PERM;
+
+	/* watches on read-like events need the file:watch_reads permission */
+	if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE))
+		perm |= FILE__WATCH_READS;
+
+	return path_has_perm(current_cred(), path, perm);
+}
+
 /*
  * Copy the inode security context value to the user.
  *
@@ -3403,7 +3449,7 @@
 static int selinux_kernfs_init_security(struct kernfs_node *kn_dir,
 					struct kernfs_node *kn)
 {
-	const struct task_security_struct *tsec = current_security();
+	const struct task_security_struct *tsec = selinux_cred(current_cred());
 	u32 parent_sid, newsid, clen;
 	int rc;
 	char *context;
@@ -6818,6 +6864,7 @@
 	LSM_HOOK_INIT(inode_getsecid, selinux_inode_getsecid),
 	LSM_HOOK_INIT(inode_copy_up, selinux_inode_copy_up),
 	LSM_HOOK_INIT(inode_copy_up_xattr, selinux_inode_copy_up_xattr),
+	LSM_HOOK_INIT(path_notify, selinux_path_notify),
 
 	LSM_HOOK_INIT(kernfs_init_security, selinux_kernfs_init_security),
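
selinux_path_notify() is reached through the new security_path_notify() LSM call, which the fsnotify side is expected to invoke when a mark is created; schematically (a sketch, not the exact caller):

	ret = security_path_notify(&path, mask, FSNOTIFY_OBJ_TYPE_INODE);
	if (ret)
		return ret;	/* e.g. -EACCES without file { watch } */

The obj_type selects the base permission (watch, watch_mount, or watch_sb plus a filesystem:watch check), and the mask adds watch_with_perm for blocking fanotify permission events and watch_reads for access-style events.
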
 
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 201f7e5..32e9b03 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -7,7 +7,8 @@
 
 #define COMMON_FILE_PERMS COMMON_FILE_SOCK_PERMS, "unlink", "link", \
     "rename", "execute", "quotaon", "mounton", "audit_access", \
-    "open", "execmod"
+	"open", "execmod", "watch", "watch_mount", "watch_sb", \
+	"watch_with_perm", "watch_reads"
 
 #define COMMON_SOCK_PERMS COMMON_FILE_SOCK_PERMS, "bind", "connect", \
     "listen", "accept", "getopt", "setopt", "shutdown", "recvfrom",  \
@@ -60,7 +61,7 @@
 	{ "filesystem",
 	  { "mount", "remount", "unmount", "getattr",
 	    "relabelfrom", "relabelto", "associate", "quotamod",
-	    "quotaget", NULL } },
+	    "quotaget", "watch", NULL } },
 	{ "file",
 	  { COMMON_FILE_PERMS,
 	    "execute_no_trans", "entrypoint", NULL } },
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 91c5395..586b7ab 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -37,16 +37,6 @@
 	u32 sockcreate_sid;	/* fscreate SID */
 };
 
-/*
- * get the subjective security ID of the current task
- */
-static inline u32 current_sid(void)
-{
-	const struct task_security_struct *tsec = current_security();
-
-	return tsec->sid;
-}
-
 enum label_initialized {
 	LABEL_INVALID,		/* invalid or not initialized */
 	LABEL_INITIALIZED,	/* initialized */
@@ -185,4 +175,14 @@
 	return ipc->security + selinux_blob_sizes.lbs_ipc;
 }
 
+/*
+ * get the subjective security ID of the current task
+ */
+static inline u32 current_sid(void)
+{
+	const struct task_security_struct *tsec = selinux_cred(current_cred());
+
+	return tsec->sid;
+}
+
 #endif /* _SELINUX_OBJSEC_H_ */
diff --git a/security/selinux/netif.c b/security/selinux/netif.c
index 9cb83ee..e40fecd 100644
--- a/security/selinux/netif.c
+++ b/security/selinux/netif.c
@@ -132,9 +132,9 @@
  */
 static int sel_netif_sid_slow(struct net *ns, int ifindex, u32 *sid)
 {
-	int ret;
+	int ret = 0;
 	struct sel_netif *netif;
-	struct sel_netif *new = NULL;
+	struct sel_netif *new;
 	struct net_device *dev;
 
 	/* NOTE: we always use init's network namespace since we don't
@@ -151,32 +151,27 @@
 	netif = sel_netif_find(ns, ifindex);
 	if (netif != NULL) {
 		*sid = netif->nsec.sid;
-		ret = 0;
 		goto out;
 	}
+
+	ret = security_netif_sid(&selinux_state, dev->name, sid);
+	if (ret != 0)
+		goto out;
 	new = kzalloc(sizeof(*new), GFP_ATOMIC);
-	if (new == NULL) {
-		ret = -ENOMEM;
-		goto out;
+	if (new) {
+		new->nsec.ns = ns;
+		new->nsec.ifindex = ifindex;
+		new->nsec.sid = *sid;
+		if (sel_netif_insert(new))
+			kfree(new);
 	}
-	ret = security_netif_sid(&selinux_state, dev->name, &new->nsec.sid);
-	if (ret != 0)
-		goto out;
-	new->nsec.ns = ns;
-	new->nsec.ifindex = ifindex;
-	ret = sel_netif_insert(new);
-	if (ret != 0)
-		goto out;
-	*sid = new->nsec.sid;
 
 out:
 	spin_unlock_bh(&sel_netif_lock);
 	dev_put(dev);
-	if (unlikely(ret)) {
+	if (unlikely(ret))
 		pr_warn("SELinux: failure in %s(), unable to determine network interface label (%d)\n",
 			__func__, ifindex);
-		kfree(new);
-	}
 	return ret;
 }
 
diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c
index cae1fca..9ab84ef 100644
--- a/security/selinux/netnode.c
+++ b/security/selinux/netnode.c
@@ -189,9 +189,9 @@
  */
 static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid)
 {
-	int ret = -ENOMEM;
+	int ret;
 	struct sel_netnode *node;
-	struct sel_netnode *new = NULL;
+	struct sel_netnode *new;
 
 	spin_lock_bh(&sel_netnode_lock);
 	node = sel_netnode_find(addr, family);
@@ -200,38 +200,36 @@
 		spin_unlock_bh(&sel_netnode_lock);
 		return 0;
 	}
+
 	new = kzalloc(sizeof(*new), GFP_ATOMIC);
-	if (new == NULL)
-		goto out;
 	switch (family) {
 	case PF_INET:
 		ret = security_node_sid(&selinux_state, PF_INET,
 					addr, sizeof(struct in_addr), sid);
-		new->nsec.addr.ipv4 = *(__be32 *)addr;
+		if (new)
+			new->nsec.addr.ipv4 = *(__be32 *)addr;
 		break;
 	case PF_INET6:
 		ret = security_node_sid(&selinux_state, PF_INET6,
 					addr, sizeof(struct in6_addr), sid);
-		new->nsec.addr.ipv6 = *(struct in6_addr *)addr;
+		if (new)
+			new->nsec.addr.ipv6 = *(struct in6_addr *)addr;
 		break;
 	default:
 		BUG();
 		ret = -EINVAL;
 	}
-	if (ret != 0)
-		goto out;
+	if (ret == 0 && new) {
+		new->nsec.family = family;
+		new->nsec.sid = *sid;
+		sel_netnode_insert(new);
+	} else
+		kfree(new);
 
-	new->nsec.family = family;
-	new->nsec.sid = *sid;
-	sel_netnode_insert(new);
-
-out:
 	spin_unlock_bh(&sel_netnode_lock);
-	if (unlikely(ret)) {
+	if (unlikely(ret))
 		pr_warn("SELinux: failure in %s(), unable to determine network node label\n",
 			__func__);
-		kfree(new);
-	}
 	return ret;
 }
 
diff --git a/security/selinux/netport.c b/security/selinux/netport.c
index 364b6d5..3f8b2c0 100644
--- a/security/selinux/netport.c
+++ b/security/selinux/netport.c
@@ -137,9 +137,9 @@
  */
 static int sel_netport_sid_slow(u8 protocol, u16 pnum, u32 *sid)
 {
-	int ret = -ENOMEM;
+	int ret;
 	struct sel_netport *port;
-	struct sel_netport *new = NULL;
+	struct sel_netport *new;
 
 	spin_lock_bh(&sel_netport_lock);
 	port = sel_netport_find(protocol, pnum);
@@ -148,25 +148,23 @@
 		spin_unlock_bh(&sel_netport_lock);
 		return 0;
 	}
-	new = kzalloc(sizeof(*new), GFP_ATOMIC);
-	if (new == NULL)
-		goto out;
+
 	ret = security_port_sid(&selinux_state, protocol, pnum, sid);
 	if (ret != 0)
 		goto out;
-
-	new->psec.port = pnum;
-	new->psec.protocol = protocol;
-	new->psec.sid = *sid;
-	sel_netport_insert(new);
+	new = kzalloc(sizeof(*new), GFP_ATOMIC);
+	if (new) {
+		new->psec.port = pnum;
+		new->psec.protocol = protocol;
+		new->psec.sid = *sid;
+		sel_netport_insert(new);
+	}
 
 out:
 	spin_unlock_bh(&sel_netport_lock);
-	if (unlikely(ret)) {
+	if (unlikely(ret))
 		pr_warn("SELinux: failure in %s(), unable to determine network port label\n",
 			__func__);
-		kfree(new);
-	}
 	return ret;
 }
 
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index f8efaa9..1260f5fb 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -177,397 +177,6 @@
 }
 
 /*
- * Initialize the role table.
- */
-static int roles_init(struct policydb *p)
-{
-	char *key = NULL;
-	int rc;
-	struct role_datum *role;
-
-	role = kzalloc(sizeof(*role), GFP_KERNEL);
-	if (!role)
-		return -ENOMEM;
-
-	rc = -EINVAL;
-	role->value = ++p->p_roles.nprim;
-	if (role->value != OBJECT_R_VAL)
-		goto out;
-
-	rc = -ENOMEM;
-	key = kstrdup(OBJECT_R, GFP_KERNEL);
-	if (!key)
-		goto out;
-
-	rc = hashtab_insert(p->p_roles.table, key, role);
-	if (rc)
-		goto out;
-
-	return 0;
-out:
-	kfree(key);
-	kfree(role);
-	return rc;
-}
-
-static u32 filenametr_hash(struct hashtab *h, const void *k)
-{
-	const struct filename_trans *ft = k;
-	unsigned long hash;
-	unsigned int byte_num;
-	unsigned char focus;
-
-	hash = ft->stype ^ ft->ttype ^ ft->tclass;
-
-	byte_num = 0;
-	while ((focus = ft->name[byte_num++]))
-		hash = partial_name_hash(focus, hash);
-	return hash & (h->size - 1);
-}
-
-static int filenametr_cmp(struct hashtab *h, const void *k1, const void *k2)
-{
-	const struct filename_trans *ft1 = k1;
-	const struct filename_trans *ft2 = k2;
-	int v;
-
-	v = ft1->stype - ft2->stype;
-	if (v)
-		return v;
-
-	v = ft1->ttype - ft2->ttype;
-	if (v)
-		return v;
-
-	v = ft1->tclass - ft2->tclass;
-	if (v)
-		return v;
-
-	return strcmp(ft1->name, ft2->name);
-
-}
-
-static u32 rangetr_hash(struct hashtab *h, const void *k)
-{
-	const struct range_trans *key = k;
-	return (key->source_type + (key->target_type << 3) +
-		(key->target_class << 5)) & (h->size - 1);
-}
-
-static int rangetr_cmp(struct hashtab *h, const void *k1, const void *k2)
-{
-	const struct range_trans *key1 = k1, *key2 = k2;
-	int v;
-
-	v = key1->source_type - key2->source_type;
-	if (v)
-		return v;
-
-	v = key1->target_type - key2->target_type;
-	if (v)
-		return v;
-
-	v = key1->target_class - key2->target_class;
-
-	return v;
-}
-
-static int (*destroy_f[SYM_NUM]) (void *key, void *datum, void *datap);
-
-/*
- * Initialize a policy database structure.
- */
-static int policydb_init(struct policydb *p)
-{
-	int i, rc;
-
-	memset(p, 0, sizeof(*p));
-
-	for (i = 0; i < SYM_NUM; i++) {
-		rc = symtab_init(&p->symtab[i], symtab_sizes[i]);
-		if (rc)
-			goto out;
-	}
-
-	rc = avtab_init(&p->te_avtab);
-	if (rc)
-		goto out;
-
-	rc = roles_init(p);
-	if (rc)
-		goto out;
-
-	rc = cond_policydb_init(p);
-	if (rc)
-		goto out;
-
-	p->filename_trans = hashtab_create(filenametr_hash, filenametr_cmp, (1 << 10));
-	if (!p->filename_trans) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	p->range_tr = hashtab_create(rangetr_hash, rangetr_cmp, 256);
-	if (!p->range_tr) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	ebitmap_init(&p->filename_trans_ttypes);
-	ebitmap_init(&p->policycaps);
-	ebitmap_init(&p->permissive_map);
-
-	return 0;
-out:
-	hashtab_destroy(p->filename_trans);
-	hashtab_destroy(p->range_tr);
-	for (i = 0; i < SYM_NUM; i++) {
-		hashtab_map(p->symtab[i].table, destroy_f[i], NULL);
-		hashtab_destroy(p->symtab[i].table);
-	}
-	return rc;
-}
-
-/*
- * The following *_index functions are used to
- * define the val_to_name and val_to_struct arrays
- * in a policy database structure.  The val_to_name
- * arrays are used when converting security context
- * structures into string representations.  The
- * val_to_struct arrays are used when the attributes
- * of a class, role, or user are needed.
- */
-
-static int common_index(void *key, void *datum, void *datap)
-{
-	struct policydb *p;
-	struct common_datum *comdatum;
-
-	comdatum = datum;
-	p = datap;
-	if (!comdatum->value || comdatum->value > p->p_commons.nprim)
-		return -EINVAL;
-
-	p->sym_val_to_name[SYM_COMMONS][comdatum->value - 1] = key;
-
-	return 0;
-}
-
-static int class_index(void *key, void *datum, void *datap)
-{
-	struct policydb *p;
-	struct class_datum *cladatum;
-
-	cladatum = datum;
-	p = datap;
-	if (!cladatum->value || cladatum->value > p->p_classes.nprim)
-		return -EINVAL;
-
-	p->sym_val_to_name[SYM_CLASSES][cladatum->value - 1] = key;
-	p->class_val_to_struct[cladatum->value - 1] = cladatum;
-	return 0;
-}
-
-static int role_index(void *key, void *datum, void *datap)
-{
-	struct policydb *p;
-	struct role_datum *role;
-
-	role = datum;
-	p = datap;
-	if (!role->value
-	    || role->value > p->p_roles.nprim
-	    || role->bounds > p->p_roles.nprim)
-		return -EINVAL;
-
-	p->sym_val_to_name[SYM_ROLES][role->value - 1] = key;
-	p->role_val_to_struct[role->value - 1] = role;
-	return 0;
-}
-
-static int type_index(void *key, void *datum, void *datap)
-{
-	struct policydb *p;
-	struct type_datum *typdatum;
-
-	typdatum = datum;
-	p = datap;
-
-	if (typdatum->primary) {
-		if (!typdatum->value
-		    || typdatum->value > p->p_types.nprim
-		    || typdatum->bounds > p->p_types.nprim)
-			return -EINVAL;
-		p->sym_val_to_name[SYM_TYPES][typdatum->value - 1] = key;
-		p->type_val_to_struct_array[typdatum->value - 1] = typdatum;
-	}
-
-	return 0;
-}
-
-static int user_index(void *key, void *datum, void *datap)
-{
-	struct policydb *p;
-	struct user_datum *usrdatum;
-
-	usrdatum = datum;
-	p = datap;
-	if (!usrdatum->value
-	    || usrdatum->value > p->p_users.nprim
-	    || usrdatum->bounds > p->p_users.nprim)
-		return -EINVAL;
-
-	p->sym_val_to_name[SYM_USERS][usrdatum->value - 1] = key;
-	p->user_val_to_struct[usrdatum->value - 1] = usrdatum;
-	return 0;
-}
-
-static int sens_index(void *key, void *datum, void *datap)
-{
-	struct policydb *p;
-	struct level_datum *levdatum;
-
-	levdatum = datum;
-	p = datap;
-
-	if (!levdatum->isalias) {
-		if (!levdatum->level->sens ||
-		    levdatum->level->sens > p->p_levels.nprim)
-			return -EINVAL;
-
-		p->sym_val_to_name[SYM_LEVELS][levdatum->level->sens - 1] = key;
-	}
-
-	return 0;
-}
-
-static int cat_index(void *key, void *datum, void *datap)
-{
-	struct policydb *p;
-	struct cat_datum *catdatum;
-
-	catdatum = datum;
-	p = datap;
-
-	if (!catdatum->isalias) {
-		if (!catdatum->value || catdatum->value > p->p_cats.nprim)
-			return -EINVAL;
-
-		p->sym_val_to_name[SYM_CATS][catdatum->value - 1] = key;
-	}
-
-	return 0;
-}
-
-static int (*index_f[SYM_NUM]) (void *key, void *datum, void *datap) =
-{
-	common_index,
-	class_index,
-	role_index,
-	type_index,
-	user_index,
-	cond_index_bool,
-	sens_index,
-	cat_index,
-};
-
-#ifdef DEBUG_HASHES
-static void hash_eval(struct hashtab *h, const char *hash_name)
-{
-	struct hashtab_info info;
-
-	hashtab_stat(h, &info);
-	pr_debug("SELinux: %s:  %d entries and %d/%d buckets used, "
-	       "longest chain length %d\n", hash_name, h->nel,
-	       info.slots_used, h->size, info.max_chain_len);
-}
-
-static void symtab_hash_eval(struct symtab *s)
-{
-	int i;
-
-	for (i = 0; i < SYM_NUM; i++)
-		hash_eval(s[i].table, symtab_name[i]);
-}
-
-#else
-static inline void hash_eval(struct hashtab *h, char *hash_name)
-{
-}
-#endif
-
-/*
- * Define the other val_to_name and val_to_struct arrays
- * in a policy database structure.
- *
- * Caller must clean up on failure.
- */
-static int policydb_index(struct policydb *p)
-{
-	int i, rc;
-
-	if (p->mls_enabled)
-		pr_debug("SELinux:  %d users, %d roles, %d types, %d bools, %d sens, %d cats\n",
-			 p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim,
-			 p->p_bools.nprim, p->p_levels.nprim, p->p_cats.nprim);
-	else
-		pr_debug("SELinux:  %d users, %d roles, %d types, %d bools\n",
-			 p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim,
-			 p->p_bools.nprim);
-
-	pr_debug("SELinux:  %d classes, %d rules\n",
-		 p->p_classes.nprim, p->te_avtab.nel);
-
-#ifdef DEBUG_HASHES
-	avtab_hash_eval(&p->te_avtab, "rules");
-	symtab_hash_eval(p->symtab);
-#endif
-
-	p->class_val_to_struct = kcalloc(p->p_classes.nprim,
-					 sizeof(*p->class_val_to_struct),
-					 GFP_KERNEL);
-	if (!p->class_val_to_struct)
-		return -ENOMEM;
-
-	p->role_val_to_struct = kcalloc(p->p_roles.nprim,
-					sizeof(*p->role_val_to_struct),
-					GFP_KERNEL);
-	if (!p->role_val_to_struct)
-		return -ENOMEM;
-
-	p->user_val_to_struct = kcalloc(p->p_users.nprim,
-					sizeof(*p->user_val_to_struct),
-					GFP_KERNEL);
-	if (!p->user_val_to_struct)
-		return -ENOMEM;
-
-	p->type_val_to_struct_array = kvcalloc(p->p_types.nprim,
-					       sizeof(*p->type_val_to_struct_array),
-					       GFP_KERNEL);
-	if (!p->type_val_to_struct_array)
-		return -ENOMEM;
-
-	rc = cond_init_bool_indexes(p);
-	if (rc)
-		goto out;
-
-	for (i = 0; i < SYM_NUM; i++) {
-		p->sym_val_to_name[i] = kvcalloc(p->symtab[i].nprim,
-						 sizeof(char *),
-						 GFP_KERNEL);
-		if (!p->sym_val_to_name[i])
-			return -ENOMEM;
-
-		rc = hashtab_map(p->symtab[i].table, index_f[i], p);
-		if (rc)
-			goto out;
-	}
-	rc = 0;
-out:
-	return rc;
-}
-
-/*
  * The following *_destroy functions are used to
  * free any memory allocated for each kind of
  * symbol data in the policy database.
@@ -723,6 +332,7 @@
 static int filenametr_destroy(void *key, void *datum, void *p)
 {
 	struct filename_trans *ft = key;
+
 	kfree(ft->name);
 	kfree(key);
 	kfree(datum);
@@ -733,6 +343,7 @@
 static int range_tr_destroy(void *key, void *datum, void *p)
 {
 	struct mls_range *rt = datum;
+
 	kfree(key);
 	ebitmap_destroy(&rt->level[0].cat);
 	ebitmap_destroy(&rt->level[1].cat);
@@ -755,6 +366,397 @@
 }
 
 /*
+ * Initialize the role table.
+ */
+static int roles_init(struct policydb *p)
+{
+	char *key = NULL;
+	int rc;
+	struct role_datum *role;
+
+	role = kzalloc(sizeof(*role), GFP_KERNEL);
+	if (!role)
+		return -ENOMEM;
+
+	rc = -EINVAL;
+	role->value = ++p->p_roles.nprim;
+	if (role->value != OBJECT_R_VAL)
+		goto out;
+
+	rc = -ENOMEM;
+	key = kstrdup(OBJECT_R, GFP_KERNEL);
+	if (!key)
+		goto out;
+
+	rc = hashtab_insert(p->p_roles.table, key, role);
+	if (rc)
+		goto out;
+
+	return 0;
+out:
+	kfree(key);
+	kfree(role);
+	return rc;
+}
+
+static u32 filenametr_hash(struct hashtab *h, const void *k)
+{
+	const struct filename_trans *ft = k;
+	unsigned long hash;
+	unsigned int byte_num;
+	unsigned char focus;
+
+	hash = ft->stype ^ ft->ttype ^ ft->tclass;
+
+	byte_num = 0;
+	while ((focus = ft->name[byte_num++]))
+		hash = partial_name_hash(focus, hash);
+	return hash & (h->size - 1);
+}
+
+static int filenametr_cmp(struct hashtab *h, const void *k1, const void *k2)
+{
+	const struct filename_trans *ft1 = k1;
+	const struct filename_trans *ft2 = k2;
+	int v;
+
+	v = ft1->stype - ft2->stype;
+	if (v)
+		return v;
+
+	v = ft1->ttype - ft2->ttype;
+	if (v)
+		return v;
+
+	v = ft1->tclass - ft2->tclass;
+	if (v)
+		return v;
+
+	return strcmp(ft1->name, ft2->name);
+}
+
+static u32 rangetr_hash(struct hashtab *h, const void *k)
+{
+	const struct range_trans *key = k;
+
+	return (key->source_type + (key->target_type << 3) +
+		(key->target_class << 5)) & (h->size - 1);
+}
+
+static int rangetr_cmp(struct hashtab *h, const void *k1, const void *k2)
+{
+	const struct range_trans *key1 = k1, *key2 = k2;
+	int v;
+
+	v = key1->source_type - key2->source_type;
+	if (v)
+		return v;
+
+	v = key1->target_type - key2->target_type;
+	if (v)
+		return v;
+
+	v = key1->target_class - key2->target_class;
+
+	return v;
+}
+
+/*
+ * Initialize a policy database structure.
+ */
+static int policydb_init(struct policydb *p)
+{
+	int i, rc;
+
+	memset(p, 0, sizeof(*p));
+
+	for (i = 0; i < SYM_NUM; i++) {
+		rc = symtab_init(&p->symtab[i], symtab_sizes[i]);
+		if (rc)
+			goto out;
+	}
+
+	rc = avtab_init(&p->te_avtab);
+	if (rc)
+		goto out;
+
+	rc = roles_init(p);
+	if (rc)
+		goto out;
+
+	rc = cond_policydb_init(p);
+	if (rc)
+		goto out;
+
+	p->filename_trans = hashtab_create(filenametr_hash, filenametr_cmp,
+					   (1 << 10));
+	if (!p->filename_trans) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	p->range_tr = hashtab_create(rangetr_hash, rangetr_cmp, 256);
+	if (!p->range_tr) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	ebitmap_init(&p->filename_trans_ttypes);
+	ebitmap_init(&p->policycaps);
+	ebitmap_init(&p->permissive_map);
+
+	return 0;
+out:
+	hashtab_destroy(p->filename_trans);
+	hashtab_destroy(p->range_tr);
+	for (i = 0; i < SYM_NUM; i++) {
+		hashtab_map(p->symtab[i].table, destroy_f[i], NULL);
+		hashtab_destroy(p->symtab[i].table);
+	}
+	return rc;
+}
+
+/*
+ * The following *_index functions are used to
+ * define the val_to_name and val_to_struct arrays
+ * in a policy database structure.  The val_to_name
+ * arrays are used when converting security context
+ * structures into string representations.  The
+ * val_to_struct arrays are used when the attributes
+ * of a class, role, or user are needed.
+ */
+
+static int common_index(void *key, void *datum, void *datap)
+{
+	struct policydb *p;
+	struct common_datum *comdatum;
+
+	comdatum = datum;
+	p = datap;
+	if (!comdatum->value || comdatum->value > p->p_commons.nprim)
+		return -EINVAL;
+
+	p->sym_val_to_name[SYM_COMMONS][comdatum->value - 1] = key;
+
+	return 0;
+}
+
+static int class_index(void *key, void *datum, void *datap)
+{
+	struct policydb *p;
+	struct class_datum *cladatum;
+
+	cladatum = datum;
+	p = datap;
+	if (!cladatum->value || cladatum->value > p->p_classes.nprim)
+		return -EINVAL;
+
+	p->sym_val_to_name[SYM_CLASSES][cladatum->value - 1] = key;
+	p->class_val_to_struct[cladatum->value - 1] = cladatum;
+	return 0;
+}
+
+static int role_index(void *key, void *datum, void *datap)
+{
+	struct policydb *p;
+	struct role_datum *role;
+
+	role = datum;
+	p = datap;
+	if (!role->value
+	    || role->value > p->p_roles.nprim
+	    || role->bounds > p->p_roles.nprim)
+		return -EINVAL;
+
+	p->sym_val_to_name[SYM_ROLES][role->value - 1] = key;
+	p->role_val_to_struct[role->value - 1] = role;
+	return 0;
+}
+
+static int type_index(void *key, void *datum, void *datap)
+{
+	struct policydb *p;
+	struct type_datum *typdatum;
+
+	typdatum = datum;
+	p = datap;
+
+	if (typdatum->primary) {
+		if (!typdatum->value
+		    || typdatum->value > p->p_types.nprim
+		    || typdatum->bounds > p->p_types.nprim)
+			return -EINVAL;
+		p->sym_val_to_name[SYM_TYPES][typdatum->value - 1] = key;
+		p->type_val_to_struct[typdatum->value - 1] = typdatum;
+	}
+
+	return 0;
+}
+
+static int user_index(void *key, void *datum, void *datap)
+{
+	struct policydb *p;
+	struct user_datum *usrdatum;
+
+	usrdatum = datum;
+	p = datap;
+	if (!usrdatum->value
+	    || usrdatum->value > p->p_users.nprim
+	    || usrdatum->bounds > p->p_users.nprim)
+		return -EINVAL;
+
+	p->sym_val_to_name[SYM_USERS][usrdatum->value - 1] = key;
+	p->user_val_to_struct[usrdatum->value - 1] = usrdatum;
+	return 0;
+}
+
+static int sens_index(void *key, void *datum, void *datap)
+{
+	struct policydb *p;
+	struct level_datum *levdatum;
+
+	levdatum = datum;
+	p = datap;
+
+	if (!levdatum->isalias) {
+		if (!levdatum->level->sens ||
+		    levdatum->level->sens > p->p_levels.nprim)
+			return -EINVAL;
+
+		p->sym_val_to_name[SYM_LEVELS][levdatum->level->sens - 1] = key;
+	}
+
+	return 0;
+}
+
+static int cat_index(void *key, void *datum, void *datap)
+{
+	struct policydb *p;
+	struct cat_datum *catdatum;
+
+	catdatum = datum;
+	p = datap;
+
+	if (!catdatum->isalias) {
+		if (!catdatum->value || catdatum->value > p->p_cats.nprim)
+			return -EINVAL;
+
+		p->sym_val_to_name[SYM_CATS][catdatum->value - 1] = key;
+	}
+
+	return 0;
+}
+
+static int (*index_f[SYM_NUM]) (void *key, void *datum, void *datap) =
+{
+	common_index,
+	class_index,
+	role_index,
+	type_index,
+	user_index,
+	cond_index_bool,
+	sens_index,
+	cat_index,
+};
+
+#ifdef DEBUG_HASHES
+static void hash_eval(struct hashtab *h, const char *hash_name)
+{
+	struct hashtab_info info;
+
+	hashtab_stat(h, &info);
+	pr_debug("SELinux: %s:  %d entries and %d/%d buckets used, longest chain length %d\n",
+		 hash_name, h->nel, info.slots_used, h->size,
+		 info.max_chain_len);
+}
+
+static void symtab_hash_eval(struct symtab *s)
+{
+	int i;
+
+	for (i = 0; i < SYM_NUM; i++)
+		hash_eval(s[i].table, symtab_name[i]);
+}
+
+#else
+static inline void hash_eval(struct hashtab *h, char *hash_name)
+{
+}
+#endif
+
+/*
+ * Define the other val_to_name and val_to_struct arrays
+ * in a policy database structure.
+ *
+ * Caller must clean up on failure.
+ */
+static int policydb_index(struct policydb *p)
+{
+	int i, rc;
+
+	if (p->mls_enabled)
+		pr_debug("SELinux:  %d users, %d roles, %d types, %d bools, %d sens, %d cats\n",
+			 p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim,
+			 p->p_bools.nprim, p->p_levels.nprim, p->p_cats.nprim);
+	else
+		pr_debug("SELinux:  %d users, %d roles, %d types, %d bools\n",
+			 p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim,
+			 p->p_bools.nprim);
+
+	pr_debug("SELinux:  %d classes, %d rules\n",
+		 p->p_classes.nprim, p->te_avtab.nel);
+
+#ifdef DEBUG_HASHES
+	avtab_hash_eval(&p->te_avtab, "rules");
+	symtab_hash_eval(p->symtab);
+#endif
+
+	p->class_val_to_struct = kcalloc(p->p_classes.nprim,
+					 sizeof(*p->class_val_to_struct),
+					 GFP_KERNEL);
+	if (!p->class_val_to_struct)
+		return -ENOMEM;
+
+	p->role_val_to_struct = kcalloc(p->p_roles.nprim,
+					sizeof(*p->role_val_to_struct),
+					GFP_KERNEL);
+	if (!p->role_val_to_struct)
+		return -ENOMEM;
+
+	p->user_val_to_struct = kcalloc(p->p_users.nprim,
+					sizeof(*p->user_val_to_struct),
+					GFP_KERNEL);
+	if (!p->user_val_to_struct)
+		return -ENOMEM;
+
+	p->type_val_to_struct = kvcalloc(p->p_types.nprim,
+					 sizeof(*p->type_val_to_struct),
+					 GFP_KERNEL);
+	if (!p->type_val_to_struct)
+		return -ENOMEM;
+
+	rc = cond_init_bool_indexes(p);
+	if (rc)
+		goto out;
+
+	for (i = 0; i < SYM_NUM; i++) {
+		p->sym_val_to_name[i] = kvcalloc(p->symtab[i].nprim,
+						 sizeof(char *),
+						 GFP_KERNEL);
+		if (!p->sym_val_to_name[i])
+			return -ENOMEM;
+
+		rc = hashtab_map(p->symtab[i].table, index_f[i], p);
+		if (rc)
+			goto out;
+	}
+	rc = 0;
+out:
+	return rc;
+}
+
+/*
  * Free any memory allocated by a policy database structure.
  */
 void policydb_destroy(struct policydb *p)
@@ -777,7 +779,7 @@
 	kfree(p->class_val_to_struct);
 	kfree(p->role_val_to_struct);
 	kfree(p->user_val_to_struct);
-	kvfree(p->type_val_to_struct_array);
+	kvfree(p->type_val_to_struct);
 
 	avtab_destroy(&p->te_avtab);
 
@@ -1722,7 +1724,7 @@
 			return -EINVAL;
 		}
 
-		upper = p->type_val_to_struct_array[upper->bounds - 1];
+		upper = p->type_val_to_struct[upper->bounds - 1];
 		BUG_ON(!upper);
 
 		if (upper->attribute) {
diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h
index fcc6366..162d0e7 100644
--- a/security/selinux/ss/policydb.h
+++ b/security/selinux/ss/policydb.h
@@ -253,7 +253,7 @@
 	struct class_datum **class_val_to_struct;
 	struct role_datum **role_val_to_struct;
 	struct user_datum **user_val_to_struct;
-	struct type_datum **type_val_to_struct_array;
+	struct type_datum **type_val_to_struct;
 
 	/* type enforcement access vectors and transitions */
 	struct avtab te_avtab;
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index d61563a..3a29e7c 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -542,13 +542,13 @@
 	struct type_datum *target;
 	u32 masked = 0;
 
-	source = policydb->type_val_to_struct_array[scontext->type - 1];
+	source = policydb->type_val_to_struct[scontext->type - 1];
 	BUG_ON(!source);
 
 	if (!source->bounds)
 		return;
 
-	target = policydb->type_val_to_struct_array[tcontext->type - 1];
+	target = policydb->type_val_to_struct[tcontext->type - 1];
 	BUG_ON(!target);
 
 	memset(&lo_avd, 0, sizeof(lo_avd));
@@ -891,7 +891,7 @@
 
 	index = new_context->type;
 	while (true) {
-		type = policydb->type_val_to_struct_array[index - 1];
+		type = policydb->type_val_to_struct[index - 1];
 		BUG_ON(!type);
 
 		/* not bounded anymore */
diff --git a/security/selinux/ss/sidtab.c b/security/selinux/ss/sidtab.c
index 1f0a6ea..7d49994 100644
--- a/security/selinux/ss/sidtab.c
+++ b/security/selinux/ss/sidtab.c
@@ -12,7 +12,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
-#include <linux/atomic.h>
+#include <asm/barrier.h>
 #include "flask.h"
 #include "security.h"
 #include "sidtab.h"
@@ -23,14 +23,14 @@
 
 	memset(s->roots, 0, sizeof(s->roots));
 
+	/* max count is SIDTAB_MAX so valid index is always < SIDTAB_MAX */
 	for (i = 0; i < SIDTAB_RCACHE_SIZE; i++)
-		atomic_set(&s->rcache[i], -1);
+		s->rcache[i] = SIDTAB_MAX;
 
 	for (i = 0; i < SECINITSID_NUM; i++)
 		s->isids[i].set = 0;
 
-	atomic_set(&s->count, 0);
-
+	s->count = 0;
 	s->convert = NULL;
 
 	spin_lock_init(&s->lock);
@@ -130,14 +130,12 @@
 
 static struct context *sidtab_lookup(struct sidtab *s, u32 index)
 {
-	u32 count = (u32)atomic_read(&s->count);
+	/* read entries only after reading count */
+	u32 count = smp_load_acquire(&s->count);
 
 	if (index >= count)
 		return NULL;
 
-	/* read entries after reading count */
-	smp_rmb();
-
 	return sidtab_do_lookup(s, index, 0);
 }
 
@@ -210,10 +208,10 @@
 static void sidtab_rcache_update(struct sidtab *s, u32 index, u32 pos)
 {
 	while (pos > 0) {
-		atomic_set(&s->rcache[pos], atomic_read(&s->rcache[pos - 1]));
+		WRITE_ONCE(s->rcache[pos], READ_ONCE(s->rcache[pos - 1]));
 		--pos;
 	}
-	atomic_set(&s->rcache[0], (int)index);
+	WRITE_ONCE(s->rcache[0], index);
 }
 
 static void sidtab_rcache_push(struct sidtab *s, u32 index)
@@ -227,14 +225,14 @@
 	u32 i;
 
 	for (i = 0; i < SIDTAB_RCACHE_SIZE; i++) {
-		int v = atomic_read(&s->rcache[i]);
+		u32 v = READ_ONCE(s->rcache[i]);
 
-		if (v < 0)
+		if (v >= SIDTAB_MAX)
 			continue;
 
-		if (context_cmp(sidtab_do_lookup(s, (u32)v, 0), context)) {
-			sidtab_rcache_update(s, (u32)v, i);
-			*index = (u32)v;
+		if (context_cmp(sidtab_do_lookup(s, v, 0), context)) {
+			sidtab_rcache_update(s, v, i);
+			*index = v;
 			return 0;
 		}
 	}
@@ -245,8 +243,7 @@
 				 u32 *index)
 {
 	unsigned long flags;
-	u32 count = (u32)atomic_read(&s->count);
-	u32 count_locked, level, pos;
+	u32 count, count_locked, level, pos;
 	struct sidtab_convert_params *convert;
 	struct context *dst, *dst_convert;
 	int rc;
@@ -255,11 +252,10 @@
 	if (rc == 0)
 		return 0;
 
+	/* read entries only after reading count */
+	count = smp_load_acquire(&s->count);
 	level = sidtab_level_from_count(count);
 
-	/* read entries after reading count */
-	smp_rmb();
-
 	pos = 0;
 	rc = sidtab_find_context(s->roots[level], &pos, count, level,
 				 context, index);
@@ -272,7 +268,7 @@
 	spin_lock_irqsave(&s->lock, flags);
 
 	convert = s->convert;
-	count_locked = (u32)atomic_read(&s->count);
+	count_locked = s->count;
 	level = sidtab_level_from_count(count_locked);
 
 	/* if count has changed before we acquired the lock, then catch up */
@@ -320,7 +316,7 @@
 		}
 
 		/* at this point we know the insert won't fail */
-		atomic_set(&convert->target->count, count + 1);
+		convert->target->count = count + 1;
 	}
 
 	if (context->len)
@@ -331,9 +327,7 @@
 	*index = count;
 
 	/* write entries before writing new count */
-	smp_wmb();
-
-	atomic_set(&s->count, count + 1);
+	smp_store_release(&s->count, count + 1);
 
 	rc = 0;
 out_unlock:
@@ -423,7 +417,7 @@
 		return -EBUSY;
 	}
 
-	count = (u32)atomic_read(&s->count);
+	count = s->count;
 	level = sidtab_level_from_count(count);
 
 	/* allocate last leaf in the new sidtab (to avoid race with
@@ -436,7 +430,7 @@
 	}
 
 	/* set count in case no new entries are added during conversion */
-	atomic_set(&params->target->count, count);
+	params->target->count = count;
 
 	/* enable live convert of new entries */
 	s->convert = params;
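
The sidtab rework above replaces atomic_t plus explicit smp_rmb()/smp_wmb()
pairs with one release store on ->count and one acquire load of it. A minimal
sketch of that pattern, with hypothetical names (struct xlog is illustrative,
not part of sidtab):

	#include <linux/types.h>
	#include <linux/errno.h>
	#include <asm/barrier.h>

	#define XLOG_CAP 128

	struct xlog {
		int entries[XLOG_CAP];	/* slots below 'count' are stable */
		u32 count;		/* published via smp_store_release() */
	};

	/* reader side: no lock needed */
	static int xlog_read(struct xlog *l, u32 idx, int *out)
	{
		/* pairs with the release in xlog_append(): every entry below
		 * the count we observe here is fully written */
		u32 count = smp_load_acquire(&l->count);

		if (idx >= count)
			return -ENOENT;
		*out = l->entries[idx];
		return 0;
	}

	/* writer side: callers serialize, e.g. under a spinlock */
	static int xlog_append(struct xlog *l, int val)
	{
		u32 count = l->count;

		if (count >= XLOG_CAP)
			return -ENOSPC;
		l->entries[count] = val;
		/* order the entry write before the new count is visible */
		smp_store_release(&l->count, count + 1);
		return 0;
	}

The release store carries the ordering the old smp_wmb() provided, which is
why the insert path above can drop its separate smp_wmb() and the lookup path
its smp_rmb().
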
diff --git a/security/selinux/ss/sidtab.h b/security/selinux/ss/sidtab.h
index bbd5c0d..1f47631 100644
--- a/security/selinux/ss/sidtab.h
+++ b/security/selinux/ss/sidtab.h
@@ -40,8 +40,8 @@
 #define SIDTAB_LEAF_ENTRIES \
 	(SIDTAB_NODE_ALLOC_SIZE / sizeof(struct sidtab_entry_leaf))
 
-#define SIDTAB_MAX_BITS 31 /* limited to INT_MAX due to atomic_t range */
-#define SIDTAB_MAX (((u32)1 << SIDTAB_MAX_BITS) - 1)
+#define SIDTAB_MAX_BITS 32
+#define SIDTAB_MAX U32_MAX
 /* ensure enough tree levels for SIDTAB_MAX entries */
 #define SIDTAB_MAX_LEVEL \
 	DIV_ROUND_UP(SIDTAB_MAX_BITS - size_to_shift(SIDTAB_LEAF_ENTRIES), \
@@ -69,13 +69,22 @@
 #define SIDTAB_RCACHE_SIZE 3
 
 struct sidtab {
+	/*
+	 * lock-free read access only for as many items as a prior read of
+	 * 'count'
+	 */
 	union sidtab_entry_inner roots[SIDTAB_MAX_LEVEL + 1];
-	atomic_t count;
+	/*
+	 * access atomically via {READ|WRITE}_ONCE(); only increment under
+	 * spinlock
+	 */
+	u32 count;
+	/* access only under spinlock */
 	struct sidtab_convert_params *convert;
 	spinlock_t lock;
 
-	/* reverse lookup cache */
-	atomic_t rcache[SIDTAB_RCACHE_SIZE];
+	/* reverse lookup cache - access atomically via {READ|WRITE}_ONCE() */
+	u32 rcache[SIDTAB_RCACHE_SIZE];
 
 	/* index == SID - 1 (no entry for SECSID_NULL) */
 	struct sidtab_isid_entry isids[SECINITSID_NUM];
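
The rcache comment above relies on READ_ONCE()/WRITE_ONCE() being enough for
a cache whose hits are validated afterwards. A hypothetical single-slot
version of the same idea:

	#include <linux/types.h>
	#include <linux/kernel.h>
	#include <linux/compiler.h>

	static u32 cache_slot = U32_MAX; /* U32_MAX == empty, like SIDTAB_MAX */

	static void cache_store(u32 idx)
	{
		/* no ordering needed, just no store tearing */
		WRITE_ONCE(cache_slot, idx);
	}

	static bool cache_load(u32 *idx)
	{
		u32 v = READ_ONCE(cache_slot);	/* may race, never tears */

		if (v == U32_MAX)
			return false;
		*idx = v;	/* caller still validates the cached index */
		return true;
	}

A stale or lost update only costs a cache miss, which is why the sidtab can
use plain u32 slots instead of atomic_t here.
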
diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c
index f1c93a7..38ac3da 100644
--- a/security/smack/smack_access.c
+++ b/security/smack/smack_access.c
@@ -465,7 +465,7 @@
 	if (i == 0 || i >= SMK_LONGLABEL)
 		return ERR_PTR(-EINVAL);
 
-	smack = kzalloc(i + 1, GFP_KERNEL);
+	smack = kzalloc(i + 1, GFP_NOFS);
 	if (smack == NULL)
 		return ERR_PTR(-ENOMEM);
 
@@ -500,7 +500,7 @@
 			if ((m & *cp) == 0)
 				continue;
 			rc = netlbl_catmap_setbit(&sap->attr.mls.cat,
-						  cat, GFP_KERNEL);
+						  cat, GFP_NOFS);
 			if (rc < 0) {
 				netlbl_catmap_free(sap->attr.mls.cat);
 				return rc;
@@ -536,7 +536,7 @@
 	if (skp != NULL)
 		goto freeout;
 
-	skp = kzalloc(sizeof(*skp), GFP_KERNEL);
+	skp = kzalloc(sizeof(*skp), GFP_NOFS);
 	if (skp == NULL) {
 		skp = ERR_PTR(-ENOMEM);
 		goto freeout;
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 4c5e5a4..abeb09c 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -288,7 +288,7 @@
 	if (!(ip->i_opflags & IOP_XATTR))
 		return ERR_PTR(-EOPNOTSUPP);
 
-	buffer = kzalloc(SMK_LONGLABEL, GFP_KERNEL);
+	buffer = kzalloc(SMK_LONGLABEL, GFP_NOFS);
 	if (buffer == NULL)
 		return ERR_PTR(-ENOMEM);
 
@@ -307,7 +307,7 @@
 
 /**
  * init_inode_smack - initialize an inode security blob
- * @isp: the blob to initialize
+ * @inode: inode to extract the info from
  * @skp: a pointer to the Smack label entry to use in the blob
  *
  */
@@ -509,7 +509,7 @@
 
 /**
  * smack_syslog - Smack approval on syslog
- * @type: message type
+ * @typefrom_file: unused
  *
  * Returns 0 on success, error code otherwise.
  */
@@ -765,7 +765,7 @@
 /**
  * smack_set_mnt_opts - set Smack specific mount options
  * @sb: the file system superblock
- * @opts: Smack mount options
+ * @mnt_opts: Smack mount options
  * @kern_flags: mount option from kernel space or user space
  * @set_kern_flags: where to store converted mount opts
  *
@@ -937,7 +937,8 @@
 
 		if (rc != 0)
 			return rc;
-	} else if (bprm->unsafe)
+	}
+	if (bprm->unsafe & ~LSM_UNSAFE_PTRACE)
 		return -EPERM;
 
 	bsp->smk_task = isp->smk_task;
@@ -958,7 +959,7 @@
  * smack_inode_alloc_security - allocate an inode blob
  * @inode: the inode in need of a blob
  *
- * Returns 0 if it gets a blob, -ENOMEM otherwise
+ * Returns 0
  */
 static int smack_inode_alloc_security(struct inode *inode)
 {
@@ -1164,7 +1165,7 @@
  *
  * This is the important Smack hook.
  *
- * Returns 0 if access is permitted, -EACCES otherwise
+ * Returns 0 if access is permitted, an error code otherwise
  */
 static int smack_inode_permission(struct inode *inode, int mask)
 {
@@ -1222,8 +1223,7 @@
 
 /**
  * smack_inode_getattr - Smack check for getting attributes
- * @mnt: vfsmount of the object
- * @dentry: the object
+ * @path: path to extract the info from
  *
  * Returns 0 if access is permitted, an error code otherwise
  */
@@ -1870,14 +1870,13 @@
 /**
  * smack_file_open - Smack dentry open processing
  * @file: the object
- * @cred: task credential
  *
  * Set the security blob in the file structure.
  * Allow the open only if the task has read access. There are
  * many read operations (e.g. fstat) that you can do with an
  * fd even if you have the file open write-only.
  *
- * Returns 0
+ * Returns 0 if current has access, error code otherwise
  */
 static int smack_file_open(struct file *file)
 {
@@ -1900,7 +1899,7 @@
 
 /**
  * smack_cred_alloc_blank - "allocate" blank task-level security credentials
- * @new: the new credentials
+ * @cred: the new credentials
  * @gfp: the atomicity of any memory allocations
  *
  * Prepare a blank set of credentials for modification.  This must allocate all
@@ -1983,7 +1982,7 @@
 
 /**
  * smack_cred_getsecid - get the secid corresponding to a creds structure
- * @c: the object creds
+ * @cred: the object creds
  * @secid: where to put the result
  *
  * Sets the secid to contain a u32 version of the smack label.
@@ -2140,8 +2139,6 @@
 /**
  * smack_task_setscheduler - Smack check on setting scheduler
  * @p: the task object
- * @policy: unused
- * @lp: unused
  *
  * Return 0 if read access is permitted
  */
@@ -2611,8 +2608,9 @@
 
 /**
  * smk_ipv6_port_check - check Smack port access
- * @sock: socket
+ * @sk: socket
  * @address: address
+ * @act: the action being taken
  *
  * Create or update the port list entry
  */
@@ -2782,7 +2780,7 @@
  *
  * Cross reference the peer labels for SO_PEERSEC
  *
- * Returns 0 on success, and error code otherwise
+ * Returns 0
  */
 static int smack_socket_socketpair(struct socket *socka,
 		                   struct socket *sockb)
@@ -3014,13 +3012,13 @@
  *
  * Returns 0 if current has the requested access, error code otherwise
  */
-static int smack_shm_shmat(struct kern_ipc_perm *ipc, char __user *shmaddr,
+static int smack_shm_shmat(struct kern_ipc_perm *isp, char __user *shmaddr,
 			   int shmflg)
 {
 	int may;
 
 	may = smack_flags_to_may(shmflg);
-	return smk_curacc_shm(ipc, may);
+	return smk_curacc_shm(isp, may);
 }
 
 /**
@@ -3925,6 +3923,8 @@
 			skp = smack_ipv6host_label(&sadd);
 		if (skp == NULL)
 			skp = smack_net_ambient;
+		if (skb == NULL)
+			break;
 #ifdef CONFIG_AUDIT
 		smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
 		ad.a.u.net->family = family;
@@ -4762,7 +4762,7 @@
 /**
  * smack_init - initialize the smack system
  *
- * Returns 0
+ * Returns 0 on success, -ENOMEM if there's no memory
  */
 static __init int smack_init(void)
 {
diff --git a/sound/firewire/dice/dice-alesis.c b/sound/firewire/dice/dice-alesis.c
index 218292b..f5b3252 100644
--- a/sound/firewire/dice/dice-alesis.c
+++ b/sound/firewire/dice/dice-alesis.c
@@ -15,7 +15,7 @@
 
 static const unsigned int
 alesis_io26_tx_pcm_chs[MAX_STREAMS][SND_DICE_RATE_MODE_COUNT] = {
-	{10, 10, 8},	/* Tx0 = Analog + S/PDIF. */
+	{10, 10, 4},	/* Tx0 = Analog + S/PDIF. */
 	{16, 8, 0},	/* Tx1 = ADAT1 + ADAT2. */
 };
 
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 91e71be..240f4ca 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -2485,8 +2485,7 @@
 			 AZX_DCAPS_PM_RUNTIME },
 	/* AMD Raven */
 	{ PCI_DEVICE(0x1022, 0x15e3),
-	  .driver_data = AZX_DRIVER_GENERIC | AZX_DCAPS_PRESET_ATI_SB |
-			 AZX_DCAPS_PM_RUNTIME },
+	  .driver_data = AZX_DRIVER_GENERIC | AZX_DCAPS_PRESET_AMD_SB },
 	/* ATI HDMI */
 	{ PCI_DEVICE(0x1002, 0x0002),
 	  .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS },
diff --git a/sound/pci/hda/patch_analog.c b/sound/pci/hda/patch_analog.c
index e283966..bc9dd8e 100644
--- a/sound/pci/hda/patch_analog.c
+++ b/sound/pci/hda/patch_analog.c
@@ -357,6 +357,7 @@
 
 static const struct snd_pci_quirk ad1986a_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x103c, 0x30af, "HP B2800", AD1986A_FIXUP_LAPTOP_IMIC),
+	SND_PCI_QUIRK(0x1043, 0x1153, "ASUS M9V", AD1986A_FIXUP_LAPTOP_IMIC),
 	SND_PCI_QUIRK(0x1043, 0x1443, "ASUS Z99He", AD1986A_FIXUP_EAPD),
 	SND_PCI_QUIRK(0x1043, 0x1447, "ASUS A8JN", AD1986A_FIXUP_EAPD),
 	SND_PCI_QUIRK_MASK(0x1043, 0xff00, 0x8100, "ASUS P5", AD1986A_FIXUP_3STACK),
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index da16954..b000b36 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -5817,6 +5817,7 @@
 	ALC292_FIXUP_DELL_E7X,
 	ALC292_FIXUP_DISABLE_AAMIX,
 	ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK,
+	ALC298_FIXUP_ALIENWARE_MIC_NO_PRESENCE,
 	ALC298_FIXUP_DELL1_MIC_NO_PRESENCE,
 	ALC298_FIXUP_DELL_AIO_MIC_NO_PRESENCE,
 	ALC275_FIXUP_DELL_XPS,
@@ -5871,6 +5872,7 @@
 	ALC256_FIXUP_ASUS_MIC_NO_PRESENCE,
 	ALC299_FIXUP_PREDATOR_SPK,
 	ALC294_FIXUP_ASUS_INTSPK_HEADSET_MIC,
+	ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE,
 };
 
 static const struct hda_fixup alc269_fixups[] = {
@@ -6506,6 +6508,15 @@
 		.chained = true,
 		.chain_id = ALC292_FIXUP_DISABLE_AAMIX
 	},
+	[ALC298_FIXUP_ALIENWARE_MIC_NO_PRESENCE] = {
+		.type = HDA_FIXUP_PINS,
+		.v.pins = (const struct hda_pintbl[]) {
+			{ 0x18, 0x01a1913c }, /* headset mic w/o jack detect */
+			{ }
+		},
+		.chained_before = true,
+		.chain_id = ALC269_FIXUP_HEADSET_MODE,
+	},
 	[ALC298_FIXUP_DELL1_MIC_NO_PRESENCE] = {
 		.type = HDA_FIXUP_PINS,
 		.v.pins = (const struct hda_pintbl[]) {
@@ -6927,6 +6938,16 @@
 		.chained = true,
 		.chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC
 	},
+	[ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE] = {
+		.type = HDA_FIXUP_PINS,
+		.v.pins = (const struct hda_pintbl[]) {
+			{ 0x19, 0x04a11040 },
+			{ 0x21, 0x04211020 },
+			{ }
+		},
+		.chained = true,
+		.chain_id = ALC256_FIXUP_ASUS_HEADSET_MODE
+	},
 };
 
 static const struct snd_pci_quirk alc269_fixup_tbl[] = {
@@ -7190,6 +7211,7 @@
 	SND_PCI_QUIRK(0x17aa, 0x9e54, "LENOVO NB", ALC269_FIXUP_LENOVO_EAPD),
 	SND_PCI_QUIRK(0x19e5, 0x3204, "Huawei MACH-WX9", ALC256_FIXUP_HUAWEI_MACH_WX9_PINS),
 	SND_PCI_QUIRK(0x1b7d, 0xa831, "Ordissimo EVE2 ", ALC269VB_FIXUP_ORDISSIMO_EVE2), /* Also known as Malata PC-B1303 */
+	SND_PCI_QUIRK(0x10ec, 0x118c, "Medion EE4254 MD62100", ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE),
 
 #if 0
 	/* Below is a quirk table taken from the old code.
@@ -7358,6 +7380,7 @@
 	{.id = ALC295_FIXUP_CHROME_BOOK, .name = "alc-chrome-book"},
 	{.id = ALC299_FIXUP_PREDATOR_SPK, .name = "predator-spk"},
 	{.id = ALC298_FIXUP_HUAWEI_MBX_STEREO, .name = "huawei-mbx-stereo"},
+	{.id = ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE, .name = "alc256-medion-headset"},
 	{}
 };
 #define ALC225_STANDARD_PINS \
@@ -7770,6 +7793,11 @@
 		{0x17, 0x90170110},
 		{0x1a, 0x03011020},
 		{0x21, 0x03211030}),
+	SND_HDA_PIN_QUIRK(0x10ec0298, 0x1028, "Dell", ALC298_FIXUP_ALIENWARE_MIC_NO_PRESENCE,
+		{0x12, 0xb7a60140},
+		{0x17, 0x90170110},
+		{0x1a, 0x03a11030},
+		{0x21, 0x03211020}),
 	SND_HDA_PIN_QUIRK(0x10ec0299, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE,
 		ALC225_STANDARD_PINS,
 		{0x12, 0xb7a60130},
diff --git a/sound/soc/atmel/atmel_ssc_dai.c b/sound/soc/atmel/atmel_ssc_dai.c
index 48e9eef..ca60339 100644
--- a/sound/soc/atmel/atmel_ssc_dai.c
+++ b/sound/soc/atmel/atmel_ssc_dai.c
@@ -116,19 +116,16 @@
 static struct atmel_ssc_info ssc_info[NUM_SSC_DEVICES] = {
 	{
 	.name		= "ssc0",
-	.lock		= __SPIN_LOCK_UNLOCKED(ssc_info[0].lock),
 	.dir_mask	= SSC_DIR_MASK_UNUSED,
 	.initialized	= 0,
 	},
 	{
 	.name		= "ssc1",
-	.lock		= __SPIN_LOCK_UNLOCKED(ssc_info[1].lock),
 	.dir_mask	= SSC_DIR_MASK_UNUSED,
 	.initialized	= 0,
 	},
 	{
 	.name		= "ssc2",
-	.lock		= __SPIN_LOCK_UNLOCKED(ssc_info[2].lock),
 	.dir_mask	= SSC_DIR_MASK_UNUSED,
 	.initialized	= 0,
 	},
@@ -317,13 +314,10 @@
 
 	snd_soc_dai_set_dma_data(dai, substream, dma_params);
 
-	spin_lock_irq(&ssc_p->lock);
-	if (ssc_p->dir_mask & dir_mask) {
-		spin_unlock_irq(&ssc_p->lock);
+	if (ssc_p->dir_mask & dir_mask)
 		return -EBUSY;
-	}
+
 	ssc_p->dir_mask |= dir_mask;
-	spin_unlock_irq(&ssc_p->lock);
 
 	return 0;
 }
@@ -355,7 +349,6 @@
 
 	dir_mask = 1 << dir;
 
-	spin_lock_irq(&ssc_p->lock);
 	ssc_p->dir_mask &= ~dir_mask;
 	if (!ssc_p->dir_mask) {
 		if (ssc_p->initialized) {
@@ -369,7 +362,6 @@
 		ssc_p->cmr_div = ssc_p->tcmr_period = ssc_p->rcmr_period = 0;
 		ssc_p->forced_divider = 0;
 	}
-	spin_unlock_irq(&ssc_p->lock);
 
 	/* Shutdown the SSC clock. */
 	pr_debug("atmel_ssc_dai: Stopping clock\n");
diff --git a/sound/soc/atmel/atmel_ssc_dai.h b/sound/soc/atmel/atmel_ssc_dai.h
index ae764cb..3470b96 100644
--- a/sound/soc/atmel/atmel_ssc_dai.h
+++ b/sound/soc/atmel/atmel_ssc_dai.h
@@ -93,7 +93,6 @@
 struct atmel_ssc_info {
 	char *name;
 	struct ssc_device *ssc;
-	spinlock_t lock;	/* lock for dir_mask */
 	unsigned short dir_mask;	/* 0=unused, 1=playback, 2=capture */
 	unsigned short initialized;	/* true if SSC has been initialized */
 	unsigned short daifmt;
diff --git a/sound/soc/codecs/pcm3168a.c b/sound/soc/codecs/pcm3168a.c
index 50ed86d..88b7569 100644
--- a/sound/soc/codecs/pcm3168a.c
+++ b/sound/soc/codecs/pcm3168a.c
@@ -21,8 +21,7 @@
 
 #define PCM3168A_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | \
 			 SNDRV_PCM_FMTBIT_S24_3LE | \
-			 SNDRV_PCM_FMTBIT_S24_LE | \
-			 SNDRV_PCM_FMTBIT_S32_LE)
+			 SNDRV_PCM_FMTBIT_S24_LE)
 
 #define PCM3168A_FMT_I2S		0x0
 #define PCM3168A_FMT_LEFT_J		0x1
diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c
index ef0b746..b517e4b 100644
--- a/sound/soc/fsl/fsl_sai.c
+++ b/sound/soc/fsl/fsl_sai.c
@@ -628,6 +628,16 @@
 			   FSL_SAI_CR3_TRCE_MASK,
 			   FSL_SAI_CR3_TRCE);
 
+	/*
+	 * EDMA controller needs period size to be a multiple of
+	 * tx/rx maxburst
+	 */
+	if (sai->soc_data->use_edma)
+		snd_pcm_hw_constraint_step(substream->runtime, 0,
+					   SNDRV_PCM_HW_PARAM_PERIOD_SIZE,
+					   tx ? sai->dma_params_tx.maxburst :
+					   sai->dma_params_rx.maxburst);
+
 	ret = snd_pcm_hw_constraint_list(substream->runtime, 0,
 			SNDRV_PCM_HW_PARAM_RATE, &fsl_sai_rate_constraints);
 
@@ -1026,30 +1036,35 @@
 
 static const struct fsl_sai_soc_data fsl_sai_vf610_data = {
 	.use_imx_pcm = false,
+	.use_edma = false,
 	.fifo_depth = 32,
 	.reg_offset = 0,
 };
 
 static const struct fsl_sai_soc_data fsl_sai_imx6sx_data = {
 	.use_imx_pcm = true,
+	.use_edma = false,
 	.fifo_depth = 32,
 	.reg_offset = 0,
 };
 
 static const struct fsl_sai_soc_data fsl_sai_imx7ulp_data = {
 	.use_imx_pcm = true,
+	.use_edma = false,
 	.fifo_depth = 16,
 	.reg_offset = 8,
 };
 
 static const struct fsl_sai_soc_data fsl_sai_imx8mq_data = {
 	.use_imx_pcm = true,
+	.use_edma = false,
 	.fifo_depth = 128,
 	.reg_offset = 8,
 };
 
 static const struct fsl_sai_soc_data fsl_sai_imx8qm_data = {
 	.use_imx_pcm = true,
+	.use_edma = true,
 	.fifo_depth = 64,
 	.reg_offset = 0,
 };
diff --git a/sound/soc/fsl/fsl_sai.h b/sound/soc/fsl/fsl_sai.h
index b12cb57..76b15de 100644
--- a/sound/soc/fsl/fsl_sai.h
+++ b/sound/soc/fsl/fsl_sai.h
@@ -157,6 +157,7 @@
 
 struct fsl_sai_soc_data {
 	bool use_imx_pcm;
+	bool use_edma;
 	unsigned int fifo_depth;
 	unsigned int reg_offset;
 };
diff --git a/sound/soc/sh/rcar/ssi.c b/sound/soc/sh/rcar/ssi.c
index f6a7466..fc5d089 100644
--- a/sound/soc/sh/rcar/ssi.c
+++ b/sound/soc/sh/rcar/ssi.c
@@ -286,6 +286,11 @@
 	if (rsnd_ssi_is_multi_slave(mod, io))
 		return 0;
 
+	if (rsnd_runtime_is_tdm_split(io))
+		chan = rsnd_io_converted_chan(io);
+
+	chan = rsnd_channel_normalization(chan);
+
 	if (ssi->usrcnt > 0) {
 		if (ssi->rate != rate) {
 			dev_err(dev, "SSI parent/child should use same rate\n");
@@ -300,11 +305,6 @@
 		return 0;
 	}
 
-	if (rsnd_runtime_is_tdm_split(io))
-		chan = rsnd_io_converted_chan(io);
-
-	chan = rsnd_channel_normalization(chan);
-
 	main_rate = rsnd_ssi_clk_query(rdai, rate, chan, &idx);
 	if (!main_rate) {
 		dev_err(dev, "unsupported clock rate\n");
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 35f48e9..88978a3 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -978,7 +978,7 @@
 	/* For framework level robustness */
 	snd_soc_component_set_jack(component, NULL, NULL);
 
-	list_del(&component->card_list);
+	list_del_init(&component->card_list);
 	snd_soc_dapm_free(snd_soc_component_get_dapm(component));
 	soc_cleanup_component_debugfs(component);
 	component->card = NULL;
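
The soc-core.c one-liner above matters because list_del() poisons the node's
pointers while list_del_init() re-initializes them. A sketch of the failure
mode it avoids (names hypothetical):

	#include <linux/list.h>

	struct item {
		struct list_head node;
	};

	static void item_unlink(struct item *it)
	{
		/*
		 * With list_del(&it->node), a second removal (say, a
		 * component cleanup path that runs again after the card was
		 * unbound) would dereference LIST_POISON pointers.
		 * list_del_init() leaves the node as a valid empty list, so
		 * repeated unlinking and list_empty(&it->node) checks are
		 * both safe.
		 */
		list_del_init(&it->node);
	}
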
diff --git a/sound/soc/ti/Kconfig b/sound/soc/ti/Kconfig
index 87a9b9d..29f6105 100644
--- a/sound/soc/ti/Kconfig
+++ b/sound/soc/ti/Kconfig
@@ -200,11 +200,18 @@
 
 config SND_SOC_DM365_VOICE_CODEC
 	bool "Voice Codec - CQ93VC"
-	select MFD_DAVINCI_VOICECODEC
-	select SND_SOC_CQ0093VC
 	help
 	  Say Y if you want to add support for SoC On-chip voice codec
 endchoice
 
+config SND_SOC_DM365_VOICE_CODEC_MODULE
+	def_tristate y
+	depends on SND_SOC_DM365_VOICE_CODEC && SND_SOC
+	select MFD_DAVINCI_VOICECODEC
+	select SND_SOC_CQ0093VC
+	help
+	  This is an internal symbol needed to ensure that the codec
+	  and MFD driver can be built as loadable modules if necessary.
+
 endmenu
 
diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index 25faf2d..fbfde99 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -1658,6 +1658,8 @@
 	case 0x25ce:  /* Mytek devices */
 	case 0x278b:  /* Rotel? */
 	case 0x2ab6:  /* T+A devices */
+	case 0x3842:  /* EVGA */
+	case 0xc502:  /* HiBy devices */
 		if (fp->dsd_raw)
 			return SNDRV_PCM_FMTBIT_DSD_U32_BE;
 		break;
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 5171b9c..0652d3e 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -231,6 +231,8 @@
 #define X86_FEATURE_VMMCALL		( 8*32+15) /* Prefer VMMCALL to VMCALL */
 #define X86_FEATURE_XENPV		( 8*32+16) /* "" Xen paravirtual guest */
 #define X86_FEATURE_EPT_AD		( 8*32+17) /* Intel Extended Page Table access-dirty bit */
+#define X86_FEATURE_VMCALL		( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
+#define X86_FEATURE_VMW_VMMCALL		( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
 #define X86_FEATURE_FSGSBASE		( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
@@ -354,6 +356,7 @@
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
 #define X86_FEATURE_AVX512_4VNNIW	(18*32+ 2) /* AVX-512 Neural Network Instructions */
 #define X86_FEATURE_AVX512_4FMAPS	(18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */
 #define X86_FEATURE_MD_CLEAR		(18*32+10) /* VERW clears CPU buffers */
 #define X86_FEATURE_TSX_FORCE_ABORT	(18*32+13) /* "" TSX_FORCE_ABORT */
 #define X86_FEATURE_PCONFIG		(18*32+18) /* Intel PCONFIG */
diff --git a/tools/arch/x86/include/uapi/asm/unistd.h b/tools/arch/x86/include/uapi/asm/unistd.h
index 30d7d04..196fdd0 100644
--- a/tools/arch/x86/include/uapi/asm/unistd.h
+++ b/tools/arch/x86/include/uapi/asm/unistd.h
@@ -3,7 +3,7 @@
 #define _UAPI_ASM_X86_UNISTD_H
 
 /* x32 syscall flag bit */
-#define __X32_SYSCALL_BIT	0x40000000
+#define __X32_SYSCALL_BIT	0x40000000UL
 
 #ifndef __KERNEL__
 # ifdef __i386__
diff --git a/tools/hv/Build b/tools/hv/Build
new file mode 100644
index 0000000..6cf51fa
--- /dev/null
+++ b/tools/hv/Build
@@ -0,0 +1,3 @@
+hv_kvp_daemon-y += hv_kvp_daemon.o
+hv_vss_daemon-y += hv_vss_daemon.o
+hv_fcopy_daemon-y += hv_fcopy_daemon.o
diff --git a/tools/hv/Makefile b/tools/hv/Makefile
index 5db5e62..b57143d 100644
--- a/tools/hv/Makefile
+++ b/tools/hv/Makefile
@@ -1,28 +1,55 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for Hyper-V tools
-
-WARNINGS = -Wall -Wextra
-CFLAGS = $(WARNINGS) -g $(shell getconf LFS_CFLAGS)
-
-CFLAGS += -D__EXPORTED_HEADERS__ -I../../include/uapi -I../../include
+include ../scripts/Makefile.include
 
 sbindir ?= /usr/sbin
 libexecdir ?= /usr/libexec
 sharedstatedir ?= /var/lib
 
-ALL_PROGRAMS := hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon
+ifeq ($(srctree),)
+srctree := $(patsubst %/,%,$(dir $(CURDIR)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+endif
+
+# Do not use make's built-in rules
+# (this improves performance and avoids hard-to-debug behaviour);
+MAKEFLAGS += -r
+
+override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include
+
+ALL_TARGETS := hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon
+ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS))
 
 ALL_SCRIPTS := hv_get_dhcp_info.sh hv_get_dns_info.sh hv_set_ifconfig.sh
 
 all: $(ALL_PROGRAMS)
 
-%: %.c
-	$(CC) $(CFLAGS) -o $@ $^
+export srctree OUTPUT CC LD CFLAGS
+include $(srctree)/tools/build/Makefile.include
+
+HV_KVP_DAEMON_IN := $(OUTPUT)hv_kvp_daemon-in.o
+$(HV_KVP_DAEMON_IN): FORCE
+	$(Q)$(MAKE) $(build)=hv_kvp_daemon
+$(OUTPUT)hv_kvp_daemon: $(HV_KVP_DAEMON_IN)
+	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@
+
+HV_VSS_DAEMON_IN := $(OUTPUT)hv_vss_daemon-in.o
+$(HV_VSS_DAEMON_IN): FORCE
+	$(Q)$(MAKE) $(build)=hv_vss_daemon
+$(OUTPUT)hv_vss_daemon: $(HV_VSS_DAEMON_IN)
+	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@
+
+HV_FCOPY_DAEMON_IN := $(OUTPUT)hv_fcopy_daemon-in.o
+$(HV_FCOPY_DAEMON_IN): FORCE
+	$(Q)$(MAKE) $(build)=hv_fcopy_daemon
+$(OUTPUT)hv_fcopy_daemon: $(HV_FCOPY_DAEMON_IN)
+	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@
 
 clean:
-	$(RM) hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon
+	rm -f $(ALL_PROGRAMS)
+	find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete
 
-install: all
+install: $(ALL_PROGRAMS)
 	install -d -m 755 $(DESTDIR)$(sbindir); \
 	install -d -m 755 $(DESTDIR)$(libexecdir)/hypervkvpd; \
 	install -d -m 755 $(DESTDIR)$(sharedstatedir); \
@@ -33,3 +60,7 @@
 	for script in $(ALL_SCRIPTS); do \
 		install $$script -m 755 $(DESTDIR)$(libexecdir)/hypervkvpd/$${script%.sh}; \
 	done
+
+FORCE:
+
+.PHONY: all install clean FORCE prepare
diff --git a/tools/include/asm/bug.h b/tools/include/asm/bug.h
index bbd75ac..550223f 100644
--- a/tools/include/asm/bug.h
+++ b/tools/include/asm/bug.h
@@ -3,6 +3,7 @@
 #define _TOOLS_ASM_BUG_H
 
 #include <linux/compiler.h>
+#include <stdio.h>
 
 #define __WARN_printf(arg...)	do { fprintf(stderr, arg); } while (0)
 
diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h
index d83763a..e03b1ea 100644
--- a/tools/include/linux/rbtree.h
+++ b/tools/include/linux/rbtree.h
@@ -31,25 +31,9 @@
 	struct rb_node *rb_node;
 };
 
-/*
- * Leftmost-cached rbtrees.
- *
- * We do not cache the rightmost node based on footprint
- * size vs number of potential users that could benefit
- * from O(1) rb_last(). Just not worth it, users that want
- * this feature can always implement the logic explicitly.
- * Furthermore, users that want to cache both pointers may
- * find it a bit asymmetric, but that's ok.
- */
-struct rb_root_cached {
-	struct rb_root rb_root;
-	struct rb_node *rb_leftmost;
-};
-
 #define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))
 
 #define RB_ROOT	(struct rb_root) { NULL, }
-#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
 #define	rb_entry(ptr, type, member) container_of(ptr, type, member)
 
 #define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)
@@ -71,12 +55,6 @@
 extern struct rb_node *rb_first(const struct rb_root *);
 extern struct rb_node *rb_last(const struct rb_root *);
 
-extern void rb_insert_color_cached(struct rb_node *,
-				   struct rb_root_cached *, bool);
-extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
-/* Same as rb_first(), but O(1) */
-#define rb_first_cached(root) (root)->rb_leftmost
-
 /* Postorder iteration - always visit the parent after its children */
 extern struct rb_node *rb_first_postorder(const struct rb_root *);
 extern struct rb_node *rb_next_postorder(const struct rb_node *);
@@ -84,8 +62,6 @@
 /* Fast replacement of a single node without remove/rebalance/add/rebalance */
 extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 			    struct rb_root *root);
-extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new,
-				   struct rb_root_cached *root);
 
 static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
 				struct rb_node **rb_link)
@@ -129,4 +105,51 @@
 	rb_erase(n, root);
 	RB_CLEAR_NODE(n);
 }
+
+/*
+ * Leftmost-cached rbtrees.
+ *
+ * We do not cache the rightmost node based on footprint
+ * size vs number of potential users that could benefit
+ * from O(1) rb_last(). Just not worth it, users that want
+ * this feature can always implement the logic explicitly.
+ * Furthermore, users that want to cache both pointers may
+ * find it a bit asymmetric, but that's ok.
+ */
+struct rb_root_cached {
+	struct rb_root rb_root;
+	struct rb_node *rb_leftmost;
+};
+
+#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
+
+/* Same as rb_first(), but O(1) */
+#define rb_first_cached(root) (root)->rb_leftmost
+
+static inline void rb_insert_color_cached(struct rb_node *node,
+					  struct rb_root_cached *root,
+					  bool leftmost)
+{
+	if (leftmost)
+		root->rb_leftmost = node;
+	rb_insert_color(node, &root->rb_root);
+}
+
+static inline void rb_erase_cached(struct rb_node *node,
+				   struct rb_root_cached *root)
+{
+	if (root->rb_leftmost == node)
+		root->rb_leftmost = rb_next(node);
+	rb_erase(node, &root->rb_root);
+}
+
+static inline void rb_replace_node_cached(struct rb_node *victim,
+					  struct rb_node *new,
+					  struct rb_root_cached *root)
+{
+	if (root->rb_leftmost == victim)
+		root->rb_leftmost = new;
+	rb_replace_node(victim, new, &root->rb_root);
+}
+
 #endif /* __TOOLS_LINUX_PERF_RBTREE_H */
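
The cached-rbtree helpers moved into this header are used with the standard
insertion walk, tracking whether the new node stayed on the left spine. A
sketch with hypothetical names:

	#include <linux/rbtree.h>

	struct entry {
		struct rb_node node;
		unsigned long key;
	};

	static struct rb_root_cached tree = RB_ROOT_CACHED;

	static void entry_insert(struct entry *new)
	{
		struct rb_node **link = &tree.rb_root.rb_node;
		struct rb_node *parent = NULL;
		bool leftmost = true;

		while (*link) {
			struct entry *e = rb_entry(*link, struct entry, node);

			parent = *link;
			if (new->key < e->key) {
				link = &parent->rb_left;
			} else {
				link = &parent->rb_right;
				leftmost = false;	/* left spine abandoned */
			}
		}

		rb_link_node(&new->node, parent, link);
		/* updates tree.rb_leftmost only when 'leftmost' is true */
		rb_insert_color_cached(&new->node, &tree, leftmost);
	}

With the helpers inlined here, rb_first_cached(&tree) stays O(1) without the
extra leftmost plumbing the core __rb_insert() used to carry.
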
diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h
index ddd0100..381aa94 100644
--- a/tools/include/linux/rbtree_augmented.h
+++ b/tools/include/linux/rbtree_augmented.h
@@ -32,17 +32,16 @@
 	void (*rotate)(struct rb_node *old, struct rb_node *new);
 };
 
-extern void __rb_insert_augmented(struct rb_node *node,
-				  struct rb_root *root,
-				  bool newleft, struct rb_node **leftmost,
+extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
 	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
+
 /*
  * Fixup the rbtree and update the augmented information when rebalancing.
  *
  * On insertion, the user must update the augmented information on the path
  * leading to the inserted node, then call rb_link_node() as usual and
- * rb_augment_inserted() instead of the usual rb_insert_color() call.
- * If rb_augment_inserted() rebalances the rbtree, it will callback into
+ * rb_insert_augmented() instead of the usual rb_insert_color() call.
+ * If rb_insert_augmented() rebalances the rbtree, it will callback into
  * a user provided function to update the augmented information on the
  * affected subtrees.
  */
@@ -50,7 +49,7 @@
 rb_insert_augmented(struct rb_node *node, struct rb_root *root,
 		    const struct rb_augment_callbacks *augment)
 {
-	__rb_insert_augmented(node, root, false, NULL, augment->rotate);
+	__rb_insert_augmented(node, root, augment->rotate);
 }
 
 static inline void
@@ -58,45 +57,92 @@
 			   struct rb_root_cached *root, bool newleft,
 			   const struct rb_augment_callbacks *augment)
 {
-	__rb_insert_augmented(node, &root->rb_root,
-			      newleft, &root->rb_leftmost, augment->rotate);
+	if (newleft)
+		root->rb_leftmost = node;
+	rb_insert_augmented(node, &root->rb_root, augment);
 }
 
-#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield,	\
-			     rbtype, rbaugmented, rbcompute)		\
+/*
+ * Template for declaring augmented rbtree callbacks (generic case)
+ *
+ * RBSTATIC:    'static' or empty
+ * RBNAME:      name of the rb_augment_callbacks structure
+ * RBSTRUCT:    struct type of the tree nodes
+ * RBFIELD:     name of struct rb_node field within RBSTRUCT
+ * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
+ * RBCOMPUTE:   name of function that recomputes the RBAUGMENTED data
+ */
+
+#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,				\
+			     RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE)	\
 static inline void							\
-rbname ## _propagate(struct rb_node *rb, struct rb_node *stop)		\
+RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop)		\
 {									\
 	while (rb != stop) {						\
-		rbstruct *node = rb_entry(rb, rbstruct, rbfield);	\
-		rbtype augmented = rbcompute(node);			\
-		if (node->rbaugmented == augmented)			\
+		RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD);	\
+		if (RBCOMPUTE(node, true))				\
 			break;						\
-		node->rbaugmented = augmented;				\
-		rb = rb_parent(&node->rbfield);				\
+		rb = rb_parent(&node->RBFIELD);				\
 	}								\
 }									\
 static inline void							\
-rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)		\
+RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)		\
 {									\
-	rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);		\
-	rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);		\
-	new->rbaugmented = old->rbaugmented;				\
+	RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);		\
+	RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);		\
+	new->RBAUGMENTED = old->RBAUGMENTED;				\
 }									\
 static void								\
-rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)	\
+RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)	\
 {									\
-	rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);		\
-	rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);		\
-	new->rbaugmented = old->rbaugmented;				\
-	old->rbaugmented = rbcompute(old);				\
+	RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);		\
+	RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);		\
+	new->RBAUGMENTED = old->RBAUGMENTED;				\
+	RBCOMPUTE(old, false);						\
 }									\
-rbstatic const struct rb_augment_callbacks rbname = {			\
-	.propagate = rbname ## _propagate,				\
-	.copy = rbname ## _copy,					\
-	.rotate = rbname ## _rotate					\
+RBSTATIC const struct rb_augment_callbacks RBNAME = {			\
+	.propagate = RBNAME ## _propagate,				\
+	.copy = RBNAME ## _copy,					\
+	.rotate = RBNAME ## _rotate					\
 };
 
+/*
+ * Template for declaring augmented rbtree callbacks,
+ * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
+ *
+ * RBSTATIC:    'static' or empty
+ * RBNAME:      name of the rb_augment_callbacks structure
+ * RBSTRUCT:    struct type of the tree nodes
+ * RBFIELD:     name of struct rb_node field within RBSTRUCT
+ * RBTYPE:      type of the RBAUGMENTED field
+ * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
+ * RBCOMPUTE:   name of function that returns the per-node RBTYPE scalar
+ */
+
+#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD,	      \
+				 RBTYPE, RBAUGMENTED, RBCOMPUTE)	      \
+static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit)	      \
+{									      \
+	RBSTRUCT *child;						      \
+	RBTYPE max = RBCOMPUTE(node);					      \
+	if (node->RBFIELD.rb_left) {					      \
+		child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD);   \
+		if (child->RBAUGMENTED > max)				      \
+			max = child->RBAUGMENTED;			      \
+	}								      \
+	if (node->RBFIELD.rb_right) {					      \
+		child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD);  \
+		if (child->RBAUGMENTED > max)				      \
+			max = child->RBAUGMENTED;			      \
+	}								      \
+	if (exit && node->RBAUGMENTED == max)				      \
+		return true;						      \
+	node->RBAUGMENTED = max;					      \
+	return false;							      \
+}									      \
+RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,					      \
+		     RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)
+
 
 #define	RB_RED		0
 #define	RB_BLACK	1
@@ -139,7 +185,6 @@
 
 static __always_inline struct rb_node *
 __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
-		     struct rb_node **leftmost,
 		     const struct rb_augment_callbacks *augment)
 {
 	struct rb_node *child = node->rb_right;
@@ -147,9 +192,6 @@
 	struct rb_node *parent, *rebalance;
 	unsigned long pc;
 
-	if (leftmost && node == *leftmost)
-		*leftmost = rb_next(node);
-
 	if (!tmp) {
 		/*
 		 * Case 1: node to erase has no more than 1 child (easy!)
@@ -249,8 +291,7 @@
 rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 		   const struct rb_augment_callbacks *augment)
 {
-	struct rb_node *rebalance = __rb_erase_augmented(node, root,
-							 NULL, augment);
+	struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
 	if (rebalance)
 		__rb_erase_color(rebalance, root, augment->rotate);
 }
@@ -259,11 +300,9 @@
 rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
 			  const struct rb_augment_callbacks *augment)
 {
-	struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root,
-							 &root->rb_leftmost,
-							 augment);
-	if (rebalance)
-		__rb_erase_color(rebalance, &root->rb_root, augment->rotate);
+	if (root->rb_leftmost == node)
+		root->rb_leftmost = rb_next(node);
+	rb_erase_augmented(node, &root->rb_root, augment);
 }
 
 #endif /* _TOOLS_LINUX_RBTREE_AUGMENTED_H */
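
RB_DECLARE_CALLBACKS_MAX generates the _compute_max helper for the common
"subtree maximum" augmentation (interval trees are the classic user). A
minimal sketch with hypothetical names:

	#include <linux/rbtree_augmented.h>

	struct itnode {
		struct rb_node rb;
		unsigned long start, last;	/* this node's interval */
		unsigned long subtree_last;	/* max ->last in the subtree */
	};

	/* per-node scalar the subtree maximum is computed over */
	static inline unsigned long itnode_last(struct itnode *n)
	{
		return n->last;
	}

	/* emits itcb_propagate/_copy/_rotate plus the 'itcb' callbacks */
	RB_DECLARE_CALLBACKS_MAX(static, itcb, struct itnode, rb,
				 unsigned long, subtree_last, itnode_last)

After the usual walk and rb_link_node(), insertion would finish with
rb_insert_augmented(&n->rb, &root, &itcb); the RBCOMPUTE(node, true) contract
lets _propagate stop as soon as a node's cached maximum is already correct.
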
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index 1be0e79..1fc8faa 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -569,7 +569,7 @@
 __SC_COMP(__NR_semctl, sys_semctl, compat_sys_semctl)
 #if defined(__ARCH_WANT_TIME32_SYSCALLS) || __BITS_PER_LONG != 32
 #define __NR_semtimedop 192
-__SC_COMP(__NR_semtimedop, sys_semtimedop, sys_semtimedop_time32)
+__SC_3264(__NR_semtimedop, sys_semtimedop_time32, sys_semtimedop)
 #endif
 #define __NR_semop 193
 __SYSCALL(__NR_semop, sys_semop)
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 094bb03..7da1b37 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -181,7 +181,7 @@
 #define PR_GET_THP_DISABLE	42
 
 /*
- * Tell the kernel to start/stop helping userspace manage bounds tables.
+ * No longer implemented, but left here to ensure the numbers stay reserved:
  */
 #define PR_MPX_ENABLE_MANAGEMENT  43
 #define PR_MPX_DISABLE_MANAGEMENT 44
@@ -229,4 +229,9 @@
 # define PR_PAC_APDBKEY			(1UL << 3)
 # define PR_PAC_APGAKEY			(1UL << 4)
 
+/* Tagged user address controls for arm64 */
+#define PR_SET_TAGGED_ADDR_CTRL		55
+#define PR_GET_TAGGED_ADDR_CTRL		56
+# define PR_TAGGED_ADDR_ENABLE		(1UL << 0)
+
 #endif /* _LINUX_PRCTL_H */
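
The two new prctl() values are the arm64 tagged address ABI opt-in. From
userspace the call looks roughly like this (the constants are repeated only
in case the libc headers predate them):

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_SET_TAGGED_ADDR_CTRL
	#define PR_SET_TAGGED_ADDR_CTRL	55
	#define PR_GET_TAGGED_ADDR_CTRL	56
	#define PR_TAGGED_ADDR_ENABLE	(1UL << 0)
	#endif

	int main(void)
	{
		/* opt in to passing tagged pointers to syscalls; only
		 * arm64 accepts this, other arches return -EINVAL */
		if (prctl(PR_SET_TAGGED_ADDR_CTRL, PR_TAGGED_ADDR_ENABLE,
			  0, 0, 0))
			perror("PR_SET_TAGGED_ADDR_CTRL");

		printf("tagged addr ctrl: %d\n",
		       (int)prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0));
		return 0;
	}
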
diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 7159677..ede55fe 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -48,6 +48,8 @@
 	__u8 fwd_emitted: 1;
 	/* whether unique non-duplicate name was already assigned */
 	__u8 name_resolved: 1;
+	/* whether type is referenced from any other type */
+	__u8 referenced: 1;
 };
 
 struct btf_dump {
@@ -173,6 +175,7 @@
 	free(d);
 }
 
+static int btf_dump_mark_referenced(struct btf_dump *d);
 static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr);
 static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id);
 
@@ -213,6 +216,11 @@
 		/* VOID is special */
 		d->type_states[0].order_state = ORDERED;
 		d->type_states[0].emit_state = EMITTED;
+
+		/* eagerly determine referenced types for anon enums */
+		err = btf_dump_mark_referenced(d);
+		if (err)
+			return err;
 	}
 
 	d->emit_queue_cnt = 0;
@@ -226,6 +234,79 @@
 	return 0;
 }
 
+/*
+ * Mark all types that are referenced from any other type. This is used to
+ * determine top-level anonymous enums that need to be emitted as
+ * independent type declarations.
+ * Anonymous enums come in two flavors: either embedded in a struct's field
+ * definition, in which case they have to be declared inline as part of the
+ * field type declaration; or as a top-level anonymous enum, typically used
+ * for declaring global constants. It's impossible to distinguish between
+ * the two without knowing whether a given enum type was referenced from
+ * another type: a top-level anonymous enum won't be referenced by anything,
+ * while an embedded one will.
+ */
+static int btf_dump_mark_referenced(struct btf_dump *d)
+{
+	int i, j, n = btf__get_nr_types(d->btf);
+	const struct btf_type *t;
+	__u16 vlen;
+
+	for (i = 1; i <= n; i++) {
+		t = btf__type_by_id(d->btf, i);
+		vlen = btf_vlen(t);
+
+		switch (btf_kind(t)) {
+		case BTF_KIND_INT:
+		case BTF_KIND_ENUM:
+		case BTF_KIND_FWD:
+			break;
+
+		case BTF_KIND_VOLATILE:
+		case BTF_KIND_CONST:
+		case BTF_KIND_RESTRICT:
+		case BTF_KIND_PTR:
+		case BTF_KIND_TYPEDEF:
+		case BTF_KIND_FUNC:
+		case BTF_KIND_VAR:
+			d->type_states[t->type].referenced = 1;
+			break;
+
+		case BTF_KIND_ARRAY: {
+			const struct btf_array *a = btf_array(t);
+
+			d->type_states[a->index_type].referenced = 1;
+			d->type_states[a->type].referenced = 1;
+			break;
+		}
+		case BTF_KIND_STRUCT:
+		case BTF_KIND_UNION: {
+			const struct btf_member *m = btf_members(t);
+
+			for (j = 0; j < vlen; j++, m++)
+				d->type_states[m->type].referenced = 1;
+			break;
+		}
+		case BTF_KIND_FUNC_PROTO: {
+			const struct btf_param *p = btf_params(t);
+
+			for (j = 0; j < vlen; j++, p++)
+				d->type_states[p->type].referenced = 1;
+			break;
+		}
+		case BTF_KIND_DATASEC: {
+			const struct btf_var_secinfo *v = btf_var_secinfos(t);
+
+			for (j = 0; j < vlen; j++, v++)
+				d->type_states[v->type].referenced = 1;
+			break;
+		}
+		default:
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
 static int btf_dump_add_emit_queue_id(struct btf_dump *d, __u32 id)
 {
 	__u32 *new_queue;
@@ -395,7 +476,12 @@
 	}
 	case BTF_KIND_ENUM:
 	case BTF_KIND_FWD:
-		if (t->name_off != 0) {
+		/*
+		 * non-anonymous or non-referenced enums are top-level
+		 * declarations and should be emitted. The same logic can be
+		 * applied to FWDs; it won't hurt anyway.
+		 */
+		if (t->name_off != 0 || !tstate->referenced) {
 			err = btf_dump_add_emit_queue_id(d, id);
 			if (err)
 				return err;
@@ -536,11 +622,6 @@
 	t = btf__type_by_id(d->btf, id);
 	kind = btf_kind(t);
 
-	if (top_level_def && t->name_off == 0) {
-		pr_warning("unexpected nameless definition, id:[%u]\n", id);
-		return;
-	}
-
 	if (tstate->emit_state == EMITTING) {
 		if (tstate->fwd_emitted)
 			return;
@@ -1167,6 +1248,7 @@
 				return;
 			}
 
+			next_id = decls->ids[decls->cnt - 1];
 			next_t = btf__type_by_id(d->btf, next_id);
 			multidim = btf_is_array(next_t);
 			/* we need space if we have named non-pointer */
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index 842c4fd..24fa313 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -65,7 +65,6 @@
 	int xsks_map_fd;
 	__u32 queue_id;
 	char ifname[IFNAMSIZ];
-	bool zc;
 };
 
 struct xsk_nl_info {
@@ -491,7 +490,6 @@
 	void *rx_map = NULL, *tx_map = NULL;
 	struct sockaddr_xdp sxdp = {};
 	struct xdp_mmap_offsets off;
-	struct xdp_options opts;
 	struct xsk_socket *xsk;
 	socklen_t optlen;
 	int err;
@@ -611,15 +609,6 @@
 
 	xsk->prog_fd = -1;
 
-	optlen = sizeof(opts);
-	err = getsockopt(xsk->fd, SOL_XDP, XDP_OPTIONS, &opts, &optlen);
-	if (err) {
-		err = -errno;
-		goto out_mmap_tx;
-	}
-
-	xsk->zc = opts.flags & XDP_OPTIONS_ZEROCOPY;
-
 	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
 		err = xsk_setup_xdp_prog(xsk);
 		if (err)
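With the cached flag removed, a caller that still wants the zero-copy status can ask the kernel directly. A minimal sketch, assuming an already-bound AF_XDP socket descriptor and a kernel that supports the XDP_OPTIONS sockopt:

[source,c]
--
#include <errno.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

/* Returns 1 if zero-copy mode is active, 0 if not, -errno on failure. */
static int xsk_zero_copy_active(int fd)
{
	struct xdp_options opts;
	socklen_t optlen = sizeof(opts);

	if (getsockopt(fd, SOL_XDP, XDP_OPTIONS, &opts, &optlen))
		return -errno;
	return !!(opts.flags & XDP_OPTIONS_ZEROCOPY);
}
--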
diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c
index 804f145..2548ff8 100644
--- a/tools/lib/rbtree.c
+++ b/tools/lib/rbtree.c
@@ -83,14 +83,10 @@
 
 static __always_inline void
 __rb_insert(struct rb_node *node, struct rb_root *root,
-	    bool newleft, struct rb_node **leftmost,
 	    void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
 	struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
 
-	if (newleft)
-		*leftmost = node;
-
 	while (true) {
 		/*
 		 * Loop invariant: node is red.
@@ -436,34 +432,17 @@
 
 void rb_insert_color(struct rb_node *node, struct rb_root *root)
 {
-	__rb_insert(node, root, false, NULL, dummy_rotate);
+	__rb_insert(node, root, dummy_rotate);
 }
 
 void rb_erase(struct rb_node *node, struct rb_root *root)
 {
 	struct rb_node *rebalance;
-	rebalance = __rb_erase_augmented(node, root,
-					 NULL, &dummy_callbacks);
+	rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
 	if (rebalance)
 		____rb_erase_color(rebalance, root, dummy_rotate);
 }
 
-void rb_insert_color_cached(struct rb_node *node,
-			    struct rb_root_cached *root, bool leftmost)
-{
-	__rb_insert(node, &root->rb_root, leftmost,
-		    &root->rb_leftmost, dummy_rotate);
-}
-
-void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
-{
-	struct rb_node *rebalance;
-	rebalance = __rb_erase_augmented(node, &root->rb_root,
-					 &root->rb_leftmost, &dummy_callbacks);
-	if (rebalance)
-		____rb_erase_color(rebalance, &root->rb_root, dummy_rotate);
-}
-
 /*
  * Augmented rbtree manipulation functions.
  *
@@ -472,10 +451,9 @@
  */
 
 void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
-			   bool newleft, struct rb_node **leftmost,
 	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
-	__rb_insert(node, root, newleft, leftmost, augment_rotate);
+	__rb_insert(node, root, augment_rotate);
 }
 
 /*
@@ -580,15 +558,6 @@
 	__rb_change_child(victim, new, parent, root);
 }
 
-void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new,
-			    struct rb_root_cached *root)
-{
-	rb_replace_node(victim, new, &root->rb_root);
-
-	if (root->rb_leftmost == victim)
-		root->rb_leftmost = new;
-}
-
 static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
 {
 	for (;;) {
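The cached variants are not dropped from the API; upstream turns them into static inline wrappers in rbtree.h so the leftmost-tracking code is generated only at the call site. Roughly, a sketch of the header-side counterpart:

[source,c]
--
static inline void rb_insert_color_cached(struct rb_node *node,
					  struct rb_root_cached *root,
					  bool leftmost)
{
	if (leftmost)
		root->rb_leftmost = node;	/* maintain the cache inline */
	rb_insert_color(node, &root->rb_root);
}

static inline void rb_erase_cached(struct rb_node *node,
				   struct rb_root_cached *root)
{
	if (root->rb_leftmost == node)
		root->rb_leftmost = rb_next(node);
	rb_erase(node, &root->rb_root);
}
--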
diff --git a/tools/lib/traceevent/Build b/tools/lib/traceevent/Build
index ba54bfc..f9a5d79 100644
--- a/tools/lib/traceevent/Build
+++ b/tools/lib/traceevent/Build
@@ -6,14 +6,3 @@
 libtraceevent-y += kbuffer-parse.o
 libtraceevent-y += tep_strerror.o
 libtraceevent-y += event-parse-api.o
-
-plugin_jbd2-y         += plugin_jbd2.o
-plugin_hrtimer-y      += plugin_hrtimer.o
-plugin_kmem-y         += plugin_kmem.o
-plugin_kvm-y          += plugin_kvm.o
-plugin_mac80211-y     += plugin_mac80211.o
-plugin_sched_switch-y += plugin_sched_switch.o
-plugin_function-y     += plugin_function.o
-plugin_xen-y          += plugin_xen.o
-plugin_scsi-y         += plugin_scsi.o
-plugin_cfg80211-y     += plugin_cfg80211.o
diff --git a/tools/lib/traceevent/Documentation/libtraceevent-event_print.txt b/tools/lib/traceevent/Documentation/libtraceevent-event_print.txt
new file mode 100644
index 0000000..2c6a618
--- /dev/null
+++ b/tools/lib/traceevent/Documentation/libtraceevent-event_print.txt
@@ -0,0 +1,130 @@
+libtraceevent(3)
+================
+
+NAME
+----
+tep_print_event - Writes event information into a trace sequence.
+
+SYNOPSIS
+--------
+[verse]
+--
+*#include <event-parse.h>*
+*#include <trace-seq.h>*
+
+void *tep_print_event*(struct tep_handle pass:[*]_tep_, struct trace_seq pass:[*]_s_, struct tep_record pass:[*]_record_, const char pass:[*]_fmt_, _..._)
+--
+
+DESCRIPTION
+-----------
+
+The _tep_print_event()_ function parses the event information of the given
+_record_ and writes it into the trace sequence _s_, according to the format
+string _fmt_. The desired information is specified after the format string.
+The _fmt_ is a printf-like format string; the following arguments are supported:
+[verse]
+--
+	TEP_PRINT_PID, "%d"  - PID of the event.
+	TEP_PRINT_CPU, "%d"  - Event CPU.
+	TEP_PRINT_COMM, "%s" - Event command string.
+	TEP_PRINT_NAME, "%s" - Event name.
+	TEP_PRINT_LATENCY, "%s" - Latency of the event. It prints 4 or more
+			fields - interrupt state, scheduling state,
+			current context, and preemption count.
+			Field 1 is the interrupt enabled state:
+				d : Interrupts are disabled
+				. : Interrupts are enabled
+				X : The architecture does not support this
+				    information
+			Field 2 is the "need resched" state.
+				N : The task is set to call the scheduler when
+				    possible, as another higher priority task
+				    may need to be scheduled in.
+				. : The task is not set to call the scheduler.
+			Field 3 is the context state.
+				. : Normal context
+				s : Soft interrupt context
+				h : Hard interrupt context
+				H : Hard interrupt context which triggered
+				    during soft interrupt context.
+				z : NMI context
+				Z : NMI context which triggered during hard
+				    interrupt context
+			Field 4 is the preemption count.
+				. : The preempt count is zero.
+			On preemptible kernels (where the task can be scheduled
+			out in arbitrary locations while in kernel context), the
+			preempt count, when non-zero, will prevent the kernel
+			from scheduling out the current task. The preempt count
+			number is displayed when it is not zero.
+			Depending on the kernel, it may show other fields
+			(lock depth, or migration disabled, which are unique to
+			specialized kernels).
+	TEP_PRINT_TIME, "%d" - event time stamp. A divisor and precision can be
+			specified as part of this format string:
+			"%precision.divisord". Example:
+			"%3.1000d" - divide the time by 1000 and place the dot
+			before the last 3 digits. Thus, the time stamp
+			"123456000" will be printed as "123.456".
+	TEP_PRINT_INFO, "%s" - event information.
+	TEP_PRINT_INFO_RAW, "%s" - event information, in raw format.
+
+--
+EXAMPLE
+-------
+[source,c]
+--
+#include <event-parse.h>
+#include <trace-seq.h>
+...
+struct trace_seq seq;
+trace_seq_init(&seq);
+struct tep_handle *tep = tep_alloc();
+...
+void print_my_event(struct tep_record *record)
+{
+	trace_seq_reset(&seq);
+	tep_print_event(tep, &seq, record, "%16s-%-5d [%03d] %s %6.1000d %s %s",
+			TEP_PRINT_COMM, TEP_PRINT_PID, TEP_PRINT_CPU,
+			TEP_PRINT_LATENCY, TEP_PRINT_TIME, TEP_PRINT_NAME,
+			TEP_PRINT_INFO);
+}
+...
+--
+
+FILES
+-----
+[verse]
+--
+*event-parse.h*
+	Header file to include in order to have access to the library APIs.
+*trace-seq.h*
+	Header file to include in order to have access to trace sequences related APIs.
+	Trace sequences are used to allow a function to call several other functions
+	to create a string of data to use.
+*-ltraceevent*
+	Linker switch to add when building a program that uses the library.
+--
+
+SEE ALSO
+--------
+_libtraceevent(3)_, _trace-cmd(1)_
+
+AUTHOR
+------
+[verse]
+--
+*Steven Rostedt* <rostedt@goodmis.org>, author of *libtraceevent*.
+*Tzvetomir Stoyanov* <tz.stoyanov@gmail.com>, author of this man page.
+--
+REPORTING BUGS
+--------------
+Report bugs to <linux-trace-devel@vger.kernel.org>
+
+LICENSE
+-------
+libtraceevent is Free Software licensed under the GNU LGPL 2.1
+
+RESOURCES
+---------
+https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
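Once _tep_print_event()_ has filled the sequence, the text can be flushed with the trace-seq helpers; for instance, continuing the example above:

[source,c]
--
trace_seq_do_printf(&seq);	/* write the assembled text to stdout */
trace_seq_destroy(&seq);	/* release the sequence buffer */
--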
diff --git a/tools/lib/traceevent/Documentation/libtraceevent-func_apis.txt b/tools/lib/traceevent/Documentation/libtraceevent-func_apis.txt
index 38bfea3..f6aca0d 100644
--- a/tools/lib/traceevent/Documentation/libtraceevent-func_apis.txt
+++ b/tools/lib/traceevent/Documentation/libtraceevent-func_apis.txt
@@ -59,12 +59,12 @@
 
 The _tep_register_function()_ function registers a function name mapped to an
 address and (optional) module. This mapping is used in case the function tracer
-or events have "%pF" or "%pS" parameter in its format string. It is common to
-pass in the kallsyms function names with their corresponding addresses with this
+or events have a "%pS" parameter in their format string. It is common to pass in
+the kallsyms function names with their corresponding addresses with this
 function. The _tep_ argument is the trace event parser context. The _name_ is
-the name of the function, the string is copied internally. The _addr_ is
-the start address of the function. The _mod_ is the kernel module
-the function may be in (NULL for none).
+the name of the function, the string is copied internally. The _addr_ is the
+start address of the function. The _mod_ is the kernel module the function may
+be in (NULL for none).
 
 The _tep_register_print_string()_ function  registers a string by the address
 it was stored in the kernel. Some strings internal to the kernel with static
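A short sketch of feeding one kallsyms entry to the parser, matching the description above (the symbol name, address, and module here are hypothetical):

[source,c]
--
#include <event-parse.h>
...
struct tep_handle *tep = tep_alloc();
...
/* name is copied internally; pass NULL for mod if the symbol is built in */
tep_register_function(tep, "hyp_rx_handler", 0xffffffffc0123450ULL, "hyp_mod");
--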
diff --git a/tools/lib/traceevent/Documentation/libtraceevent-handle.txt b/tools/lib/traceevent/Documentation/libtraceevent-handle.txt
index 8d56831..45b2017 100644
--- a/tools/lib/traceevent/Documentation/libtraceevent-handle.txt
+++ b/tools/lib/traceevent/Documentation/libtraceevent-handle.txt
@@ -3,7 +3,7 @@
 
 NAME
 ----
-tep_alloc, tep_free,tep_ref, tep_unref,tep_ref_get - Create, destroy, manage
+tep_alloc, tep_free,tep_ref, tep_unref,tep_get_ref - Create, destroy, manage
 references of trace event parser context.
 
 SYNOPSIS
@@ -16,7 +16,7 @@
 void *tep_free*(struct tep_handle pass:[*]_tep_);
 void *tep_ref*(struct tep_handle pass:[*]_tep_);
 void *tep_unref*(struct tep_handle pass:[*]_tep_);
-int *tep_ref_get*(struct tep_handle pass:[*]_tep_);
+int *tep_get_ref*(struct tep_handle pass:[*]_tep_);
 --
 
 DESCRIPTION
@@ -57,9 +57,9 @@
 ...
 struct tep_handle *tep = tep_alloc();
 ...
-int ref = tep_ref_get(tep);
+int ref = tep_get_ref(tep);
 tep_ref(tep);
-if ( (ref+1) != tep_ref_get(tep)) {
+if ( (ref+1) != tep_get_ref(tep)) {
 	/* Something wrong happened, the counter is not incremented by 1 */
 }
 tep_unref(tep);
diff --git a/tools/lib/traceevent/Documentation/libtraceevent-plugins.txt b/tools/lib/traceevent/Documentation/libtraceevent-plugins.txt
new file mode 100644
index 0000000..596032a
--- /dev/null
+++ b/tools/lib/traceevent/Documentation/libtraceevent-plugins.txt
@@ -0,0 +1,99 @@
+libtraceevent(3)
+================
+
+NAME
+----
+tep_load_plugins, tep_unload_plugins - Load / unload traceevent plugins.
+
+SYNOPSIS
+--------
+[verse]
+--
+*#include <event-parse.h>*
+
+struct tep_plugin_list pass:[*]*tep_load_plugins*(struct tep_handle pass:[*]_tep_);
+void *tep_unload_plugins*(struct tep_plugin_list pass:[*]_plugin_list_, struct tep_handle pass:[*]_tep_);
+--
+
+DESCRIPTION
+-----------
+The _tep_load_plugins()_ function loads all plugins located in the plugin
+directories. The _tep_ argument is the trace event parser context.
+The plugin directories are:
+[verse]
+--
+	- The system's plugin directory, defined at library compile time. It
+	  depends on the library installation prefix and usually is
+	  _(install_prefix)/lib/traceevent/plugins_
+	- The directory defined by the environment variable _TRACEEVENT_PLUGIN_DIR_
+	- The user's plugin directory, located at _~/.local/lib/traceevent/plugins_
+--
+Loading of plugins can be controlled by the _tep_flags_, using the
+_tep_set_flag()_ API:
+[verse]
+--
+	_TEP_DISABLE_SYS_PLUGINS_	- do not load plugins located in
+					the system's plugin directory.
+	_TEP_DISABLE_PLUGINS_		- do not load any plugins.
+--
+The _tep_set_flag()_ API must be called before _tep_load_plugins()_ if
+loading all plugins is not desired.
+
+The _tep_unload_plugins()_ function unloads the plugins previously loaded by
+_tep_load_plugins()_. The _tep_ argument is the trace event parser context. The
+_plugin_list_ is the list of loaded plugins, returned by
+the _tep_load_plugins()_ function.
+
+RETURN VALUE
+------------
+The _tep_load_plugins()_ function returns a list of successfully loaded plugins,
+or NULL in case no plugins are loaded.
+
+EXAMPLE
+-------
+[source,c]
+--
+#include <event-parse.h>
+...
+struct tep_handle *tep = tep_alloc();
+...
+struct tep_plugin_list *plugins = tep_load_plugins(tep);
+if (plugins == NULL) {
+	/* no plugins are loaded */
+}
+...
+tep_unload_plugins(plugins, tep);
+--
+
+FILES
+-----
+[verse]
+--
+*event-parse.h*
+	Header file to include in order to have access to the library APIs.
+*-ltraceevent*
+	Linker switch to add when building a program that uses the library.
+--
+
+SEE ALSO
+--------
+_libtraceevent(3)_, _trace-cmd(1)_, _tep_set_flag(3)_
+
+AUTHOR
+------
+[verse]
+--
+*Steven Rostedt* <rostedt@goodmis.org>, author of *libtraceevent*.
+*Tzvetomir Stoyanov* <tz.stoyanov@gmail.com>, author of this man page.
+--
+REPORTING BUGS
+--------------
+Report bugs to <linux-trace-devel@vger.kernel.org>
+
+LICENSE
+-------
+libtraceevent is Free Software licensed under the GNU LGPL 2.1
+
+RESOURCES
+---------
+https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
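A sketch of suppressing the system-wide plugins before loading, using the flag described above:

[source,c]
--
#include <event-parse.h>
...
struct tep_handle *tep = tep_alloc();
tep_set_flag(tep, TEP_DISABLE_SYS_PLUGINS);
struct tep_plugin_list *plugins = tep_load_plugins(tep);
...
tep_unload_plugins(plugins, tep);
--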
diff --git a/tools/lib/traceevent/Documentation/libtraceevent.txt b/tools/lib/traceevent/Documentation/libtraceevent.txt
index fbd977b..d530a7c 100644
--- a/tools/lib/traceevent/Documentation/libtraceevent.txt
+++ b/tools/lib/traceevent/Documentation/libtraceevent.txt
@@ -16,7 +16,7 @@
 	void *tep_free*(struct tep_handle pass:[*]_tep_);
 	void *tep_ref*(struct tep_handle pass:[*]_tep_);
 	void *tep_unref*(struct tep_handle pass:[*]_tep_);
-	int *tep_ref_get*(struct tep_handle pass:[*]_tep_);
+	int *tep_get_ref*(struct tep_handle pass:[*]_tep_);
 	void *tep_set_flag*(struct tep_handle pass:[*]_tep_, enum tep_flag _flag_);
 	void *tep_clear_flag*(struct tep_handle pass:[*]_tep_, enum tep_flag _flag_);
 	bool *tep_test_flag*(struct tep_handle pass:[*]_tep_, enum tep_flag _flags_);
@@ -26,15 +26,12 @@
 	void *tep_set_long_size*(struct tep_handle pass:[*]_tep_, int _long_size_);
 	int *tep_get_page_size*(struct tep_handle pass:[*]_tep_);
 	void *tep_set_page_size*(struct tep_handle pass:[*]_tep_, int _page_size_);
-	bool *tep_is_latency_format*(struct tep_handle pass:[*]_tep_);
-	void *tep_set_latency_format*(struct tep_handle pass:[*]_tep_, int _lat_);
 	int *tep_get_header_page_size*(struct tep_handle pass:[*]_tep_);
 	int *tep_get_header_timestamp_size*(struct tep_handle pass:[*]_tep_);
 	bool *tep_is_old_format*(struct tep_handle pass:[*]_tep_);
 	int *tep_strerror*(struct tep_handle pass:[*]_tep_, enum tep_errno _errnum_, char pass:[*]_buf_, size_t _buflen_);
 
 Register / unregister APIs:
-	int *tep_register_trace_clock*(struct tep_handle pass:[*]_tep_, const char pass:[*]_trace_clock_);
 	int *tep_register_function*(struct tep_handle pass:[*]_tep_, char pass:[*]_name_, unsigned long long _addr_, char pass:[*]_mod_);
 	int *tep_register_event_handler*(struct tep_handle pass:[*]_tep_, int _id_, const char pass:[*]_sys_name_, const char pass:[*]_event_name_, tep_event_handler_func _func_, void pass:[*]_context_);
 	int *tep_unregister_event_handler*(struct tep_handle pass:[*]tep, int id, const char pass:[*]sys_name, const char pass:[*]event_name, tep_event_handler_func func, void pass:[*]_context_);
@@ -57,14 +54,7 @@
 	int *tep_get_events_count*(struct tep_handle pass:[*]_tep_);
 	struct tep_event pass:[*]pass:[*]*tep_list_events*(struct tep_handle pass:[*]_tep_, enum tep_event_sort_type _sort_type_);
 	struct tep_event pass:[*]pass:[*]*tep_list_events_copy*(struct tep_handle pass:[*]_tep_, enum tep_event_sort_type _sort_type_);
-
-Event printing:
-	void *tep_print_event*(struct tep_handle pass:[*]_tep_, struct trace_seq pass:[*]_s_, struct tep_record pass:[*]_record_, bool _use_trace_clock_);
-	void *tep_print_event_data*(struct tep_handle pass:[*]_tep_, struct trace_seq pass:[*]_s_, struct tep_event pass:[*]_event_, struct tep_record pass:[*]_record_);
-	void *tep_event_info*(struct trace_seq pass:[*]_s_, struct tep_event pass:[*]_event_, struct tep_record pass:[*]_record_);
-	void *tep_print_event_task*(struct tep_handle pass:[*]_tep_, struct trace_seq pass:[*]_s_, struct tep_event pass:[*]_event_, struct tep_record pass:[*]_record_);
-	void *tep_print_event_time*(struct tep_handle pass:[*]_tep_, struct trace_seq pass:[*]_s_, struct tep_event pass:[*]_event_, struct tep_record pass:[*]record, bool _use_trace_clock_);
-	void *tep_set_print_raw*(struct tep_handle pass:[*]_tep_, int _print_raw_);
+	void *tep_print_event*(struct tep_handle pass:[*]_tep_, struct trace_seq pass:[*]_s_, struct tep_record pass:[*]_record_, const char pass:[*]_fmt_, _..._);
 
 Event finding:
 	struct tep_event pass:[*]*tep_find_event*(struct tep_handle pass:[*]_tep_, int _id_);
@@ -116,7 +106,6 @@
 	int *tep_filter_compare*(struct tep_event_filter pass:[*]_filter1_, struct tep_event_filter pass:[*]_filter2_);
 
 Parsing various data from the records:
-	void *tep_data_latency_format*(struct tep_handle pass:[*]_tep_, struct trace_seq pass:[*]_s_, struct tep_record pass:[*]_record_);
 	int *tep_data_type*(struct tep_handle pass:[*]_tep_, struct tep_record pass:[*]_rec_);
 	int *tep_data_pid*(struct tep_handle pass:[*]_tep_, struct tep_record pass:[*]_rec_);
 	int *tep_data_preempt_count*(struct tep_handle pass:[*]_tep_, struct tep_record pass:[*]_rec_);
diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile
index a39cdd0..5315f37 100644
--- a/tools/lib/traceevent/Makefile
+++ b/tools/lib/traceevent/Makefile
@@ -58,30 +58,6 @@
 export DESTDIR DESTDIR_SQ
 export EVENT_PARSE_VERSION
 
-set_plugin_dir := 1
-
-# Set plugin_dir to preffered global plugin location
-# If we install under $HOME directory we go under
-# $(HOME)/.local/lib/traceevent/plugins
-#
-# We dont set PLUGIN_DIR in case we install under $HOME
-# directory, because by default the code looks under:
-# $(HOME)/.local/lib/traceevent/plugins by default.
-#
-ifeq ($(plugin_dir),)
-ifeq ($(prefix),$(HOME))
-override plugin_dir = $(HOME)/.local/lib/traceevent/plugins
-set_plugin_dir := 0
-else
-override plugin_dir = $(libdir)/traceevent/plugins
-endif
-endif
-
-ifeq ($(set_plugin_dir),1)
-PLUGIN_DIR = -DPLUGIN_DIR="$(plugin_dir)"
-PLUGIN_DIR_SQ = '$(subst ','\'',$(PLUGIN_DIR))'
-endif
-
 include ../../scripts/Makefile.include
 
 # copy a bit from Linux kbuild
@@ -105,7 +81,6 @@
 # Shell quotes
 libdir_SQ = $(subst ','\'',$(libdir))
 libdir_relative_SQ = $(subst ','\'',$(libdir_relative))
-plugin_dir_SQ = $(subst ','\'',$(plugin_dir))
 
 CONFIG_INCLUDES = 
 CONFIG_LIBS	=
@@ -151,29 +126,14 @@
 export srctree OUTPUT CC LD CFLAGS V
 build := -f $(srctree)/tools/build/Makefile.build dir=. obj
 
-PLUGINS  = plugin_jbd2.so
-PLUGINS += plugin_hrtimer.so
-PLUGINS += plugin_kmem.so
-PLUGINS += plugin_kvm.so
-PLUGINS += plugin_mac80211.so
-PLUGINS += plugin_sched_switch.so
-PLUGINS += plugin_function.so
-PLUGINS += plugin_xen.so
-PLUGINS += plugin_scsi.so
-PLUGINS += plugin_cfg80211.so
-
-PLUGINS    := $(addprefix $(OUTPUT),$(PLUGINS))
-PLUGINS_IN := $(PLUGINS:.so=-in.o)
-
 TE_IN      := $(OUTPUT)libtraceevent-in.o
 LIB_TARGET := $(addprefix $(OUTPUT),$(LIB_TARGET))
-DYNAMIC_LIST_FILE := $(OUTPUT)libtraceevent-dynamic-list
 
-CMD_TARGETS = $(LIB_TARGET) $(PLUGINS) $(DYNAMIC_LIST_FILE)
+CMD_TARGETS = $(LIB_TARGET)
 
 TARGETS = $(CMD_TARGETS)
 
-all: all_cmd
+all: all_cmd plugins
 
 all_cmd: $(CMD_TARGETS)
 
@@ -188,17 +148,6 @@
 $(OUTPUT)libtraceevent.a: $(TE_IN)
 	$(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
 
-$(OUTPUT)libtraceevent-dynamic-list: $(PLUGINS)
-	$(QUIET_GEN)$(call do_generate_dynamic_list_file, $(PLUGINS), $@)
-
-plugins: $(PLUGINS)
-
-__plugin_obj = $(notdir $@)
-  plugin_obj = $(__plugin_obj:-in.o=)
-
-$(PLUGINS_IN): force
-	$(Q)$(MAKE) $(build)=$(plugin_obj)
-
 $(OUTPUT)%.so: $(OUTPUT)%-in.o
 	$(QUIET_LINK)$(CC) $(CFLAGS) -shared $(LDFLAGS) -nostartfiles -o $@ $^
 
@@ -258,25 +207,6 @@
 	$(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR_SQ)$2'
 endef
 
-define do_install_plugins
-	for plugin in $1; do				\
-	  $(call do_install,$$plugin,$(plugin_dir_SQ));	\
-	done
-endef
-
-define do_generate_dynamic_list_file
-	symbol_type=`$(NM) -u -D $1 | awk 'NF>1 {print $$1}' | \
-	xargs echo "U w W" | tr 'w ' 'W\n' | sort -u | xargs echo`;\
-	if [ "$$symbol_type" = "U W" ];then				\
-		(echo '{';						\
-		$(NM) -u -D $1 | awk 'NF>1 {print "\t"$$2";"}' | sort -u;\
-		echo '};';						\
-		) > $2;							\
-	else								\
-		(echo Either missing one of [$1] or bad version of $(NM)) 1>&2;\
-	fi
-endef
-
 PKG_CONFIG_FILE = libtraceevent.pc
 define do_install_pkgconfig_file
 	if [ -n "${pkgconfig_dir}" ]; then 					\
@@ -296,10 +226,6 @@
 		$(call do_install_mkdir,$(libdir_SQ)); \
 		cp -fpR $(LIB_INSTALL) $(DESTDIR)$(libdir_SQ)
 
-install_plugins: $(PLUGINS)
-	$(call QUIET_INSTALL, trace_plugins) \
-		$(call do_install_plugins, $(PLUGINS))
-
 install_pkgconfig:
 	$(call QUIET_INSTALL, $(PKG_CONFIG_FILE)) \
 		$(call do_install_pkgconfig_file,$(prefix))
@@ -313,7 +239,7 @@
 
 install: install_lib
 
-clean:
+clean: clean_plugins
 	$(call QUIET_CLEAN, libtraceevent) \
 		$(RM) *.o *~ $(TARGETS) *.a *.so $(VERSION_FILES) .*.d .*.cmd; \
 		$(RM) TRACEEVENT-CFLAGS tags TAGS; \
@@ -351,7 +277,19 @@
 	@echo '  doc-install         - install the man pages'
 	@echo '  doc-uninstall       - uninstall the man pages'
 	@echo''
-PHONY += force plugins
+
+PHONY += plugins
+plugins:
+	$(call descend,plugins)
+
+PHONY += install_plugins
+install_plugins:
+	$(call descend,plugins,install)
+
+PHONY += clean_plugins
+clean_plugins:
+	$(call descend,plugins,clean)
+
 force:
 
 # Declare the contents of the .PHONY variable as phony.  We keep that
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index bb22238..d948475 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -4367,10 +4367,20 @@
 					switch (*ptr) {
 					case 's':
 					case 'S':
-					case 'f':
-					case 'F':
 					case 'x':
 						break;
+					case 'f':
+					case 'F':
+						/*
+						 * Pre-5.5 kernels use %pf and
+						 * %pF for printing symbols,
+						 * while kernels since 5.5 use
+						 * %pfw for fwnodes. So check
+						 * that %p[fF] isn't followed
+						 * by 'w'.
+						 */
+						if (ptr[1] != 'w')
+							break;
+						/* fall through */
 					default:
 						/*
 						 * Older kernels do not process
@@ -4487,12 +4497,12 @@
 
 	printk = find_printk(tep, addr);
 	if (!printk) {
-		if (asprintf(&format, "%%pf: (NO FORMAT FOUND at %llx)\n", addr) < 0)
+		if (asprintf(&format, "%%ps: (NO FORMAT FOUND at %llx)\n", addr) < 0)
 			return NULL;
 		return format;
 	}
 
-	if (asprintf(&format, "%s: %s", "%pf", printk->printk) < 0)
+	if (asprintf(&format, "%s: %s", "%ps", printk->printk) < 0)
 		return NULL;
 
 	return format;
@@ -5517,8 +5527,10 @@
 	if (divstr && isdigit(*(divstr + 1)))
 		div = atoi(divstr + 1);
 	time = record->ts;
-	if (div)
+	if (div) {
+		time += div / 2;
 		time /= div;
+	}
 	pr = prec;
 	while (pr--)
 		p10 *= 10;
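The added half-divisor bias makes the scaled timestamp round to nearest instead of truncating. A worked instance, assuming the 1000 divisor from a "%3.1000d" format:

[source,c]
--
unsigned long long ts = 123456789, div = 1000;

ts += div / 2;	/* 123457289: bias by half the divisor */
ts /= div;	/* 123457, where plain truncation gave 123456 */
--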
diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h
index d438ee4..b77837f 100644
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -441,6 +441,8 @@
 			      unsigned long long addr);
 bool tep_is_pid_registered(struct tep_handle *tep, int pid);
 
+struct tep_event *tep_get_event(struct tep_handle *tep, int index);
+
 #define TEP_PRINT_INFO		"INFO"
 #define TEP_PRINT_INFO_RAW	"INFO_RAW"
 #define TEP_PRINT_COMM		"COMM"
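A minimal sketch of walking events by index with the newly exported helper, assuming _tep_get_events_count()_ provides the bound:

[source,c]
--
int i, n = tep_get_events_count(tep);

for (i = 0; i < n; i++) {
	struct tep_event *event = tep_get_event(tep, i);

	if (event)
		printf("%s:%s\n", event->system, event->name);
}
--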
diff --git a/tools/lib/traceevent/plugins/Build b/tools/lib/traceevent/plugins/Build
new file mode 100644
index 0000000..210d269
--- /dev/null
+++ b/tools/lib/traceevent/plugins/Build
@@ -0,0 +1,10 @@
+plugin_jbd2-y         += plugin_jbd2.o
+plugin_hrtimer-y      += plugin_hrtimer.o
+plugin_kmem-y         += plugin_kmem.o
+plugin_kvm-y          += plugin_kvm.o
+plugin_mac80211-y     += plugin_mac80211.o
+plugin_sched_switch-y += plugin_sched_switch.o
+plugin_function-y     += plugin_function.o
+plugin_xen-y          += plugin_xen.o
+plugin_scsi-y         += plugin_scsi.o
+plugin_cfg80211-y     += plugin_cfg80211.o
diff --git a/tools/lib/traceevent/plugins/Makefile b/tools/lib/traceevent/plugins/Makefile
new file mode 100644
index 0000000..f440989
--- /dev/null
+++ b/tools/lib/traceevent/plugins/Makefile
@@ -0,0 +1,222 @@
+# SPDX-License-Identifier: GPL-2.0
+
+#MAKEFLAGS += --no-print-directory
+
+
+# Makefiles suck: This macro sets a default value of $(2) for the
+# variable named by $(1), unless the variable has been set by
+# environment or command line. This is necessary for CC and AR
+# because make sets default values, so the simpler ?= approach
+# won't work as expected.
+define allow-override
+  $(if $(or $(findstring environment,$(origin $(1))),\
+            $(findstring command line,$(origin $(1)))),,\
+    $(eval $(1) = $(2)))
+endef
+
+# Allow setting CC and AR, or setting CROSS_COMPILE as a prefix.
+$(call allow-override,CC,$(CROSS_COMPILE)gcc)
+$(call allow-override,AR,$(CROSS_COMPILE)ar)
+$(call allow-override,NM,$(CROSS_COMPILE)nm)
+$(call allow-override,PKG_CONFIG,pkg-config)
+
+EXT = -std=gnu99
+INSTALL = install
+
+# Use DESTDIR for installing into a different root directory.
+# This is useful for building a package. The program will be
+# installed in this directory as if it was the root directory.
+# Then the build tool can move it later.
+DESTDIR ?=
+DESTDIR_SQ = '$(subst ','\'',$(DESTDIR))'
+
+LP64 := $(shell echo __LP64__ | ${CC} ${CFLAGS} -E -x c - | tail -n 1)
+ifeq ($(LP64), 1)
+  libdir_relative = lib64
+else
+  libdir_relative = lib
+endif
+
+prefix ?= /usr/local
+libdir = $(prefix)/$(libdir_relative)
+
+set_plugin_dir := 1
+
+# Set plugin_dir to the preferred global plugin location.
+# If we install under the $HOME directory we go under
+# $(HOME)/.local/lib/traceevent/plugins
+#
+# We don't set PLUGIN_DIR in case we install under the $HOME
+# directory, because the code already looks under
+# $(HOME)/.local/lib/traceevent/plugins by default.
+#
+ifeq ($(plugin_dir),)
+ifeq ($(prefix),$(HOME))
+override plugin_dir = $(HOME)/.local/lib/traceevent/plugins
+set_plugin_dir := 0
+else
+override plugin_dir = $(libdir)/traceevent/plugins
+endif
+endif
+
+ifeq ($(set_plugin_dir),1)
+PLUGIN_DIR = -DPLUGIN_DIR="$(plugin_dir)"
+PLUGIN_DIR_SQ = '$(subst ','\'',$(PLUGIN_DIR))'
+endif
+
+include ../../../scripts/Makefile.include
+
+# copy a bit from Linux kbuild
+
+ifeq ("$(origin V)", "command line")
+  VERBOSE = $(V)
+endif
+ifndef VERBOSE
+  VERBOSE = 0
+endif
+
+ifeq ($(srctree),)
+srctree := $(patsubst %/,%,$(dir $(CURDIR)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+#$(info Determined 'srctree' to be $(srctree))
+endif
+
+export prefix libdir src obj
+
+# Shell quotes
+plugin_dir_SQ = $(subst ','\'',$(plugin_dir))
+
+CONFIG_INCLUDES =
+CONFIG_LIBS    =
+CONFIG_FLAGS   =
+
+OBJ            = $@
+N              =
+
+INCLUDES = -I. -I.. -I $(srctree)/tools/include $(CONFIG_INCLUDES)
+
+# Set compile option CFLAGS
+ifdef EXTRA_CFLAGS
+  CFLAGS := $(EXTRA_CFLAGS)
+else
+  CFLAGS := -g -Wall
+endif
+
+# Append required CFLAGS
+override CFLAGS += -fPIC
+override CFLAGS += $(CONFIG_FLAGS) $(INCLUDES) $(PLUGIN_DIR_SQ)
+override CFLAGS += $(udis86-flags) -D_GNU_SOURCE
+
+ifeq ($(VERBOSE),1)
+  Q =
+else
+  Q = @
+endif
+
+# Disable command line variables (CFLAGS) override from top
+# level Makefile (perf), otherwise the build Makefile will get
+# the same command line setup.
+MAKEOVERRIDES=
+
+export srctree OUTPUT CC LD CFLAGS V
+
+build := -f $(srctree)/tools/build/Makefile.build dir=. obj
+
+DYNAMIC_LIST_FILE := $(OUTPUT)libtraceevent-dynamic-list
+
+PLUGINS  = plugin_jbd2.so
+PLUGINS += plugin_hrtimer.so
+PLUGINS += plugin_kmem.so
+PLUGINS += plugin_kvm.so
+PLUGINS += plugin_mac80211.so
+PLUGINS += plugin_sched_switch.so
+PLUGINS += plugin_function.so
+PLUGINS += plugin_xen.so
+PLUGINS += plugin_scsi.so
+PLUGINS += plugin_cfg80211.so
+
+PLUGINS    := $(addprefix $(OUTPUT),$(PLUGINS))
+PLUGINS_IN := $(PLUGINS:.so=-in.o)
+
+plugins: $(PLUGINS) $(DYNAMIC_LIST_FILE)
+
+__plugin_obj = $(notdir $@)
+  plugin_obj = $(__plugin_obj:-in.o=)
+
+$(PLUGINS_IN): force
+	$(Q)$(MAKE) $(build)=$(plugin_obj)
+
+$(OUTPUT)libtraceevent-dynamic-list: $(PLUGINS)
+	$(QUIET_GEN)$(call do_generate_dynamic_list_file, $(PLUGINS), $@)
+
+$(OUTPUT)%.so: $(OUTPUT)%-in.o
+	$(QUIET_LINK)$(CC) $(CFLAGS) -shared $(LDFLAGS) -nostartfiles -o $@ $^
+
+define update_dir
+  (echo $1 > $@.tmp;                           \
+   if [ -r $@ ] && cmp -s $@ $@.tmp; then      \
+     rm -f $@.tmp;                             \
+   else                                                \
+     echo '  UPDATE                 $@';       \
+     mv -f $@.tmp $@;                          \
+   fi);
+endef
+
+tags:	force
+	$(RM) tags
+	find . -name '*.[ch]' | xargs ctags --extra=+f --c-kinds=+px \
+	--regex-c++='/_PE\(([^,)]*).*/TEP_ERRNO__\1/'
+
+TAGS:	force
+	$(RM) TAGS
+	find . -name '*.[ch]' | xargs etags \
+	--regex='/_PE(\([^,)]*\).*/TEP_ERRNO__\1/'
+
+define do_install_mkdir
+	if [ ! -d '$(DESTDIR_SQ)$1' ]; then             \
+		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$1'; \
+	fi
+endef
+
+define do_install
+	$(call do_install_mkdir,$2);                    \
+	$(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR_SQ)$2'
+endef
+
+define do_install_plugins
+       for plugin in $1; do                            \
+         $(call do_install,$$plugin,$(plugin_dir_SQ)); \
+       done
+endef
+
+define do_generate_dynamic_list_file
+	symbol_type=`$(NM) -u -D $1 | awk 'NF>1 {print $$1}' | \
+	xargs echo "U w W" | tr 'w ' 'W\n' | sort -u | xargs echo`;\
+	if [ "$$symbol_type" = "U W" ];then				\
+		(echo '{';                                              \
+		$(NM) -u -D $1 | awk 'NF>1 {print "\t"$$2";"}' | sort -u;\
+		echo '};';                                              \
+		) > $2;                                                 \
+	else                                                            \
+		(echo Either missing one of [$1] or bad version of $(NM)) 1>&2;\
+	fi
+endef
+
+install: $(PLUGINS)
+	$(call QUIET_INSTALL, trace_plugins) \
+	$(call do_install_plugins, $(PLUGINS))
+
+clean:
+	$(call QUIET_CLEAN, trace_plugins) \
+		$(RM) *.o *~ $(TARGETS) *.a *.so $(VERSION_FILES) .*.d .*.cmd; \
+		$(RM) $(OUTPUT)libtraceevent-dynamic-list; \
+		$(RM) TRACEEVENT-CFLAGS tags TAGS;
+
+PHONY += force plugins
+force:
+
+# Declare the contents of the .PHONY variable as phony.  We keep that
+# information in a variable so we can use it in if_changed and friends.
+.PHONY: $(PHONY)
diff --git a/tools/lib/traceevent/plugin_cfg80211.c b/tools/lib/traceevent/plugins/plugin_cfg80211.c
similarity index 100%
rename from tools/lib/traceevent/plugin_cfg80211.c
rename to tools/lib/traceevent/plugins/plugin_cfg80211.c
diff --git a/tools/lib/traceevent/plugin_function.c b/tools/lib/traceevent/plugins/plugin_function.c
similarity index 100%
rename from tools/lib/traceevent/plugin_function.c
rename to tools/lib/traceevent/plugins/plugin_function.c
diff --git a/tools/lib/traceevent/plugin_hrtimer.c b/tools/lib/traceevent/plugins/plugin_hrtimer.c
similarity index 100%
rename from tools/lib/traceevent/plugin_hrtimer.c
rename to tools/lib/traceevent/plugins/plugin_hrtimer.c
diff --git a/tools/lib/traceevent/plugin_jbd2.c b/tools/lib/traceevent/plugins/plugin_jbd2.c
similarity index 100%
rename from tools/lib/traceevent/plugin_jbd2.c
rename to tools/lib/traceevent/plugins/plugin_jbd2.c
diff --git a/tools/lib/traceevent/plugin_kmem.c b/tools/lib/traceevent/plugins/plugin_kmem.c
similarity index 100%
rename from tools/lib/traceevent/plugin_kmem.c
rename to tools/lib/traceevent/plugins/plugin_kmem.c
diff --git a/tools/lib/traceevent/plugin_kvm.c b/tools/lib/traceevent/plugins/plugin_kvm.c
similarity index 100%
rename from tools/lib/traceevent/plugin_kvm.c
rename to tools/lib/traceevent/plugins/plugin_kvm.c
diff --git a/tools/lib/traceevent/plugin_mac80211.c b/tools/lib/traceevent/plugins/plugin_mac80211.c
similarity index 100%
rename from tools/lib/traceevent/plugin_mac80211.c
rename to tools/lib/traceevent/plugins/plugin_mac80211.c
diff --git a/tools/lib/traceevent/plugin_sched_switch.c b/tools/lib/traceevent/plugins/plugin_sched_switch.c
similarity index 100%
rename from tools/lib/traceevent/plugin_sched_switch.c
rename to tools/lib/traceevent/plugins/plugin_sched_switch.c
diff --git a/tools/lib/traceevent/plugin_scsi.c b/tools/lib/traceevent/plugins/plugin_scsi.c
similarity index 100%
rename from tools/lib/traceevent/plugin_scsi.c
rename to tools/lib/traceevent/plugins/plugin_scsi.c
diff --git a/tools/lib/traceevent/plugin_xen.c b/tools/lib/traceevent/plugins/plugin_xen.c
similarity index 100%
rename from tools/lib/traceevent/plugin_xen.c
rename to tools/lib/traceevent/plugins/plugin_xen.c
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 176f2f0..044c9a3 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -138,7 +138,6 @@
 		"do_task_dead",
 		"__module_put_and_exit",
 		"complete_and_exit",
-		"kvm_spurious_fault",
 		"__reiserfs_panic",
 		"lbug_with_loc",
 		"fortify_panic",
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index a269d78..46f7fba 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -924,7 +924,7 @@
     JDIR=$(shell /usr/sbin/update-java-alternatives -l | head -1 | awk '{print $$3}')
   else
     ifneq (,$(wildcard /usr/sbin/alternatives))
-      JDIR=$(shell /usr/sbin/alternatives --display java | tail -1 | cut -d' ' -f 5 | sed 's%/jre/bin/java.%%g')
+      JDIR=$(shell /usr/sbin/alternatives --display java | tail -1 | cut -d' ' -f 5 | sed -e 's%/jre/bin/java.%%g' -e 's%/bin/java.%%g')
     endif
   endif
   ifndef JDIR
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index f9807d8..902c792 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -292,7 +292,7 @@
 LIBTRACEEVENT = $(TE_PATH)libtraceevent.a
 export LIBTRACEEVENT
 
-LIBTRACEEVENT_DYNAMIC_LIST = $(TE_PATH)libtraceevent-dynamic-list
+LIBTRACEEVENT_DYNAMIC_LIST = $(TE_PATH)plugins/libtraceevent-dynamic-list
 
 #
 # The static build has no dynsym table, so this does not work for
@@ -567,7 +567,7 @@
 # Create python binding output directory if not already present
 _dummy := $(shell [ -d '$(OUTPUT)python' ] || mkdir -p '$(OUTPUT)python')
 
-$(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(LIBTRACEEVENT_DYNAMIC_LIST)
+$(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(LIBTRACEEVENT_DYNAMIC_LIST) $(LIBPERF)
 	$(QUIET_GEN)LDSHARED="$(CC) -pthread -shared" \
         CFLAGS='$(CFLAGS)' LDFLAGS='$(LDFLAGS) $(LIBTRACEEVENT_DYNAMIC_LIST_LDFLAGS)' \
 	  $(PYTHON_WORD) util/setup.py \
@@ -737,7 +737,7 @@
 	$(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) plugins
 
 $(LIBTRACEEVENT_DYNAMIC_LIST): libtraceevent_plugins
-	$(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)libtraceevent-dynamic-list
+	$(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)plugins/libtraceevent-dynamic-list
 
 $(LIBTRACEEVENT)-clean:
 	$(call QUIET_CLEAN, libtraceevent)
diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index c32db09..ede040c 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -23,9 +23,10 @@
 #include "../../util/event.h"
 #include "../../util/evlist.h"
 #include "../../util/evsel.h"
+#include "../../util/evsel_config.h"
 #include "../../util/pmu.h"
 #include "../../util/cs-etm.h"
-#include "../../util/util.h"
+#include <internal/lib.h> // page_size
 #include "../../util/session.h"
 
 #include <errno.h>
@@ -416,7 +417,7 @@
 		if (err)
 			goto out;
 
-		tracking_evsel = perf_evlist__last(evlist);
+		tracking_evsel = evlist__last(evlist);
 		perf_evlist__set_tracking_event(evlist, tracking_evsel);
 
 		tracking_evsel->core.attr.freq = 0;
@@ -648,7 +649,7 @@
 	if (priv_size != cs_etm_info_priv_size(itr, session->evlist))
 		return -EINVAL;
 
-	if (!session->evlist->nr_mmaps)
+	if (!session->evlist->core.nr_mmaps)
 		return -EINVAL;
 
 	/* If the cpu_map is empty all online CPUs are involved */
diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c
index 4b36469..eba6541 100644
--- a/tools/perf/arch/arm64/util/arm-spe.c
+++ b/tools/perf/arch/arm64/util/arm-spe.c
@@ -16,7 +16,7 @@
 #include "../../util/evsel.h"
 #include "../../util/evlist.h"
 #include "../../util/session.h"
-#include "../../util/util.h"
+#include <internal/lib.h> // page_size
 #include "../../util/pmu.h"
 #include "../../util/debug.h"
 #include "../../util/auxtrace.h"
@@ -51,7 +51,7 @@
 	if (priv_size != ARM_SPE_AUXTRACE_PRIV_SIZE)
 		return -EINVAL;
 
-	if (!session->evlist->nr_mmaps)
+	if (!session->evlist->core.nr_mmaps)
 		return -EINVAL;
 
 	auxtrace_info->type = PERF_AUXTRACE_ARM_SPE;
@@ -129,7 +129,7 @@
 	if (err)
 		return err;
 
-	tracking_evsel = perf_evlist__last(evlist);
+	tracking_evsel = evlist__last(evlist);
 	perf_evlist__set_tracking_event(evlist, tracking_evsel);
 
 	tracking_evsel->core.attr.freq = 0;
diff --git a/tools/perf/arch/arm64/util/dwarf-regs.c b/tools/perf/arch/arm64/util/dwarf-regs.c
index b047b88..917b97d 100644
--- a/tools/perf/arch/arm64/util/dwarf-regs.c
+++ b/tools/perf/arch/arm64/util/dwarf-regs.c
@@ -11,7 +11,6 @@
 #include <dwarf-regs.h>
 #include <linux/ptrace.h> /* for struct user_pt_regs */
 #include <linux/stringify.h>
-#include "util.h"
 
 struct pt_regs_dwarfnum {
 	const char *name;
diff --git a/tools/perf/arch/arm64/util/header.c b/tools/perf/arch/arm64/util/header.c
index e41defa..a32e4b7 100644
--- a/tools/perf/arch/arm64/util/header.c
+++ b/tools/perf/arch/arm64/util/header.c
@@ -1,5 +1,7 @@
 #include <stdio.h>
 #include <stdlib.h>
+#include <perf/cpumap.h>
+#include <internal/cpumap.h>
 #include <api/fs/fs.h>
 #include "debug.h"
 #include "header.h"
@@ -29,7 +31,7 @@
 
 	/* read midr from list of cpus mapped to this pmu */
 	cpus = perf_cpu_map__get(pmu->cpus);
-	for (cpu = 0; cpu < cpus->nr; cpu++) {
+	for (cpu = 0; cpu < perf_cpu_map__nr(cpus); cpu++) {
 		scnprintf(path, PATH_MAX, "%s/devices/system/cpu/cpu%d"MIDR,
 				sysfs, cpus->map[cpu]);
 
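The perf_cpu_map__nr() accessor keeps callers out of the libperf struct internals. A usage sketch with a hypothetical CPU list:

[source,c]
--
#include <perf/cpumap.h>

struct perf_cpu_map *cpus = perf_cpu_map__new("0-3");	/* hypothetical list */
int cpu;

for (cpu = 0; cpu < perf_cpu_map__nr(cpus); cpu++) {
	/* per-CPU work goes here */
}
perf_cpu_map__put(cpus);	/* drop the reference when done */
--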
diff --git a/tools/perf/arch/arm64/util/unwind-libunwind.c b/tools/perf/arch/arm64/util/unwind-libunwind.c
index 002520d..1495a95 100644
--- a/tools/perf/arch/arm64/util/unwind-libunwind.c
+++ b/tools/perf/arch/arm64/util/unwind-libunwind.c
@@ -5,8 +5,8 @@
 #include <libunwind.h>
 #include "perf_regs.h"
 #include "../../util/unwind.h"
-#include "../../util/debug.h"
 #endif
+#include "../../util/debug.h"
 
 int LIBUNWIND__ARCH_REG_ID(int regnum)
 {
diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c b/tools/perf/arch/powerpc/util/dwarf-regs.c
index 4952890..0c4f4ca 100644
--- a/tools/perf/arch/powerpc/util/dwarf-regs.c
+++ b/tools/perf/arch/powerpc/util/dwarf-regs.c
@@ -12,7 +12,6 @@
 #include <linux/ptrace.h>
 #include <linux/kernel.h>
 #include <linux/stringify.h>
-#include "util.h"
 
 struct pt_regs_dwarfnum {
 	const char *name;
diff --git a/tools/perf/arch/powerpc/util/header.c b/tools/perf/arch/powerpc/util/header.c
index 0b24266..b6b7bc7 100644
--- a/tools/perf/arch/powerpc/util/header.c
+++ b/tools/perf/arch/powerpc/util/header.c
@@ -6,7 +6,6 @@
 #include <string.h>
 #include <linux/stringify.h>
 #include "header.h"
-#include "util.h"
 
 #define mfspr(rn)       ({unsigned long rval; \
 			 asm volatile("mfspr %0," __stringify(rn) \
diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index f0dbf7b..9cc1c4a 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -5,9 +5,11 @@
 #include "util/debug.h"
 #include "util/evsel.h"
 #include "util/evlist.h"
+#include "util/pmu.h"
 
 #include "book3s_hv_exits.h"
 #include "book3s_hcalls.h"
+#include <subcmd/parse-options.h>
 
 #define NR_TPS 4
 
@@ -172,3 +174,46 @@
 
 	return ret;
 }
+
+/*
+ * In case of the powerpc architecture, PMU registers are programmable
+ * by the guest kernel. So monitoring a guest via the host may not
+ * provide valid samples with the default 'cycles' event. It is better
+ * to use the 'trace_imc/trace_cycles' event for guest profiling, since
+ * it can track the guest instruction pointer in the trace record.
+ *
+ * Parse the arguments and return appropriate values.
+ */
+int kvm_add_default_arch_event(int *argc, const char **argv)
+{
+	const char **tmp;
+	bool event = false;
+	int i, j = *argc;
+
+	const struct option event_options[] = {
+		OPT_BOOLEAN('e', "event", &event, NULL),
+		OPT_END()
+	};
+
+	tmp = calloc(j + 1, sizeof(char *));
+	if (!tmp)
+		return -EINVAL;
+
+	for (i = 0; i < j; i++)
+		tmp[i] = argv[i];
+
+	parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN);
+	if (!event) {
+		if (pmu_have_event("trace_imc", "trace_cycles")) {
+			argv[j++] = strdup("-e");
+			argv[j++] = strdup("trace_imc/trace_cycles/");
+			*argc += 2;
+		} else {
+			free(tmp);
+			return -EINVAL;
+		}
+	}
+
+	free(tmp);
+	return 0;
+}
diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
index fc9c2f5..3018a05 100644
--- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c
+++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
@@ -13,6 +13,7 @@
 #include "util/callchain.h"
 #include "util/debug.h"
 #include "util/dso.h"
+#include "util/event.h" // struct ip_callchain
 #include "util/map.h"
 #include "util/symbol.h"
 
diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c
index 8a4b717..abb7a12 100644
--- a/tools/perf/arch/powerpc/util/sym-handling.c
+++ b/tools/perf/arch/powerpc/util/sym-handling.c
@@ -4,7 +4,6 @@
  * Copyright (C) 2015 Naveen N. Rao, IBM Corporation
  */
 
-#include "debug.h"
 #include "dso.h"
 #include "symbol.h"
 #include "map.h"
diff --git a/tools/perf/arch/s390/Makefile b/tools/perf/arch/s390/Makefile
index cb19878..6ac8887 100644
--- a/tools/perf/arch/s390/Makefile
+++ b/tools/perf/arch/s390/Makefile
@@ -4,6 +4,7 @@
 endif
 HAVE_KVM_STAT_SUPPORT := 1
 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
+PERF_HAVE_JITDUMP := 1
 
 #
 # Syscall table generation for perf
diff --git a/tools/perf/arch/s390/util/auxtrace.c b/tools/perf/arch/s390/util/auxtrace.c
index b0fb70e..0db5c58 100644
--- a/tools/perf/arch/s390/util/auxtrace.c
+++ b/tools/perf/arch/s390/util/auxtrace.c
@@ -1,4 +1,5 @@
 #include <stdbool.h>
+#include <stdlib.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/bitops.h>
diff --git a/tools/perf/arch/s390/util/machine.c b/tools/perf/arch/s390/util/machine.c
index c8c86a0..724efb2 100644
--- a/tools/perf/arch/s390/util/machine.c
+++ b/tools/perf/arch/s390/util/machine.c
@@ -2,7 +2,7 @@
 #include <unistd.h>
 #include <stdio.h>
 #include <string.h>
-#include "util.h"
+#include <internal/lib.h> // page_size
 #include "machine.h"
 #include "api/fs/fs.h"
 #include "debug.h"
diff --git a/tools/perf/arch/x86/tests/intel-cqm.c b/tools/perf/arch/x86/tests/intel-cqm.c
index 3b5cc33..3ec562a 100644
--- a/tools/perf/arch/x86/tests/intel-cqm.c
+++ b/tools/perf/arch/x86/tests/intel-cqm.c
@@ -5,7 +5,7 @@
 #include "evlist.h"
 #include "evsel.h"
 #include "arch-tests.h"
-#include "util.h"
+#include <internal/lib.h> // page_size
 
 #include <signal.h>
 #include <sys/mman.h>
@@ -63,9 +63,9 @@
 		goto out;
 	}
 
-	evsel = perf_evlist__first(evlist);
+	evsel = evlist__first(evlist);
 	if (!evsel) {
-		pr_debug("perf_evlist__first failed\n");
+		pr_debug("evlist__first failed\n");
 		goto out;
 	}
 
diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c
index eb36359..fa94795 100644
--- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c
+++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c
@@ -15,9 +15,9 @@
 #include "evlist.h"
 #include "evsel.h"
 #include "thread_map.h"
-#include "cpumap.h"
 #include "record.h"
 #include "tsc.h"
+#include "util/mmap.h"
 #include "tests/tests.h"
 
 #include "arch-tests.h"
@@ -66,7 +66,7 @@
 	union perf_event *event;
 	u64 test_tsc, comm1_tsc, comm2_tsc;
 	u64 test_time, comm1_time = 0, comm2_time = 0;
-	struct perf_mmap *md;
+	struct mmap *md;
 
 	threads = thread_map__new(-1, getpid(), UINT_MAX);
 	CHECK_NOT_NULL__(threads);
@@ -83,7 +83,7 @@
 
 	perf_evlist__config(evlist, &opts, NULL);
 
-	evsel = perf_evlist__first(evlist);
+	evsel = evlist__first(evlist);
 
 	evsel->core.attr.comm = 1;
 	evsel->core.attr.disabled = 1;
@@ -91,9 +91,9 @@
 
 	CHECK__(evlist__open(evlist));
 
-	CHECK__(perf_evlist__mmap(evlist, UINT_MAX));
+	CHECK__(evlist__mmap(evlist, UINT_MAX));
 
-	pc = evlist->mmap[0].base;
+	pc = evlist->mmap[0].core.base;
 	ret = perf_read_tsc_conversion(pc, &tc);
 	if (ret) {
 		if (ret == -EOPNOTSUPP) {
@@ -115,7 +115,7 @@
 
 	evlist__disable(evlist);
 
-	for (i = 0; i < evlist->nr_mmaps; i++) {
+	for (i = 0; i < evlist->core.nr_mmaps; i++) {
 		md = &evlist->mmap[i];
 		if (perf_mmap__read_init(md) < 0)
 			continue;
diff --git a/tools/perf/arch/x86/tests/rdpmc.c b/tools/perf/arch/x86/tests/rdpmc.c
index 6e67cee..1ea9166 100644
--- a/tools/perf/arch/x86/tests/rdpmc.c
+++ b/tools/perf/arch/x86/tests/rdpmc.c
@@ -13,7 +13,7 @@
 #include "tests/tests.h"
 #include "cloexec.h"
 #include "event.h"
-#include "util.h"
+#include <internal/lib.h> // page_size
 #include "arch-tests.h"
 
 static u64 rdpmc(unsigned int counter)
diff --git a/tools/perf/arch/x86/util/archinsn.c b/tools/perf/arch/x86/util/archinsn.c
index 9876c7a..3e67915 100644
--- a/tools/perf/arch/x86/util/archinsn.c
+++ b/tools/perf/arch/x86/util/archinsn.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "../../../../arch/x86/include/asm/insn.h"
 #include "archinsn.h"
+#include "event.h"
 #include "machine.h"
 #include "thread.h"
 #include "symbol.h"
diff --git a/tools/perf/arch/x86/util/event.c b/tools/perf/arch/x86/util/event.c
index a3a0b68..d357c62 100644
--- a/tools/perf/arch/x86/util/event.c
+++ b/tools/perf/arch/x86/util/event.c
@@ -3,6 +3,8 @@
 #include <linux/string.h>
 #include <linux/zalloc.h>
 
+#include "../../util/event.h"
+#include "../../util/synthetic-events.h"
 #include "../../util/machine.h"
 #include "../../util/tool.h"
 #include "../../util/map.h"
diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c
index d263430..f7f68a5 100644
--- a/tools/perf/arch/x86/util/intel-bts.c
+++ b/tools/perf/arch/x86/util/intel-bts.c
@@ -15,6 +15,7 @@
 #include "../../util/event.h"
 #include "../../util/evsel.h"
 #include "../../util/evlist.h"
+#include "../../util/mmap.h"
 #include "../../util/session.h"
 #include "../../util/pmu.h"
 #include "../../util/debug.h"
@@ -22,7 +23,7 @@
 #include "../../util/tsc.h"
 #include "../../util/auxtrace.h"
 #include "../../util/intel-bts.h"
-#include "../../util/util.h"
+#include <internal/lib.h> // page_size
 
 #define KiB(x) ((x) * 1024)
 #define MiB(x) ((x) * 1024 * 1024)
@@ -74,10 +75,10 @@
 	if (priv_size != INTEL_BTS_AUXTRACE_PRIV_SIZE)
 		return -EINVAL;
 
-	if (!session->evlist->nr_mmaps)
+	if (!session->evlist->core.nr_mmaps)
 		return -EINVAL;
 
-	pc = session->evlist->mmap[0].base;
+	pc = session->evlist->mmap[0].core.base;
 	if (pc) {
 		err = perf_read_tsc_conversion(pc, &tc);
 		if (err) {
@@ -230,7 +231,7 @@
 		if (err)
 			return err;
 
-		tracking_evsel = perf_evlist__last(evlist);
+		tracking_evsel = evlist__last(evlist);
 
 		perf_evlist__set_tracking_event(evlist, tracking_evsel);
 
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c
index cb7cf16..d6d2625 100644
--- a/tools/perf/arch/x86/util/intel-pt.c
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -18,6 +18,7 @@
 #include "../../util/evlist.h"
 #include "../../util/evsel.h"
 #include "../../util/cpumap.h"
+#include "../../util/mmap.h"
 #include <subcmd/parse-options.h>
 #include "../../util/parse-events.h"
 #include "../../util/pmu.h"
@@ -26,7 +27,7 @@
 #include "../../util/record.h"
 #include "../../util/target.h"
 #include "../../util/tsc.h"
-#include "../../util/util.h"
+#include <internal/lib.h> // page_size
 #include "../../util/intel-pt.h"
 
 #define KiB(x) ((x) * 1024)
@@ -351,10 +352,10 @@
 	filter = intel_pt_find_filter(session->evlist, ptr->intel_pt_pmu);
 	filter_str_len = filter ? strlen(filter) : 0;
 
-	if (!session->evlist->nr_mmaps)
+	if (!session->evlist->core.nr_mmaps)
 		return -EINVAL;
 
-	pc = session->evlist->mmap[0].base;
+	pc = session->evlist->mmap[0].core.base;
 	if (pc) {
 		err = perf_read_tsc_conversion(pc, &tc);
 		if (err) {
@@ -416,12 +417,12 @@
 		return err;
 	}
 
-	evsel = perf_evlist__last(evlist);
+	evsel = evlist__last(evlist);
 
 	perf_evsel__set_sample_bit(evsel, CPU);
 	perf_evsel__set_sample_bit(evsel, TIME);
 
-	evsel->system_wide = true;
+	evsel->core.system_wide = true;
 	evsel->no_aux_samples = true;
 	evsel->immediate = true;
 
@@ -716,13 +717,13 @@
 				if (err)
 					return err;
 
-				switch_evsel = perf_evlist__last(evlist);
+				switch_evsel = evlist__last(evlist);
 
 				switch_evsel->core.attr.freq = 0;
 				switch_evsel->core.attr.sample_period = 1;
 				switch_evsel->core.attr.context_switch = 1;
 
-				switch_evsel->system_wide = true;
+				switch_evsel->core.system_wide = true;
 				switch_evsel->no_aux_samples = true;
 				switch_evsel->immediate = true;
 
@@ -774,7 +775,7 @@
 		if (err)
 			return err;
 
-		tracking_evsel = perf_evlist__last(evlist);
+		tracking_evsel = evlist__last(evlist);
 
 		perf_evlist__set_tracking_event(evlist, tracking_evsel);
 
diff --git a/tools/perf/arch/x86/util/machine.c b/tools/perf/arch/x86/util/machine.c
index 1e9ec78..e17e080 100644
--- a/tools/perf/arch/x86/util/machine.c
+++ b/tools/perf/arch/x86/util/machine.c
@@ -1,9 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/types.h>
 #include <linux/string.h>
+#include <limits.h>
 #include <stdlib.h>
 
-#include "../../util/util.h"
+#include <internal/lib.h> // page_size
 #include "../../util/machine.h"
 #include "../../util/map.h"
 #include "../../util/symbol.h"
diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c
index c5197a1..2f55afb 100644
--- a/tools/perf/arch/x86/util/tsc.c
+++ b/tools/perf/arch/x86/util/tsc.c
@@ -8,6 +8,8 @@
 #include <linux/types.h>
 #include <asm/barrier.h>
 #include "../../../util/debug.h"
+#include "../../../util/event.h"
+#include "../../../util/synthetic-events.h"
 #include "../../../util/tsc.h"
 
 int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc,
diff --git a/tools/perf/arch/x86/util/unwind-libunwind.c b/tools/perf/arch/x86/util/unwind-libunwind.c
index 05920e3..4735797 100644
--- a/tools/perf/arch/x86/util/unwind-libunwind.c
+++ b/tools/perf/arch/x86/util/unwind-libunwind.c
@@ -1,11 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <errno.h>
+#include "../../util/debug.h"
 #ifndef REMOTE_UNWIND_LIBUNWIND
 #include <libunwind.h>
 #include "perf_regs.h"
 #include "../../util/unwind.h"
-#include "../../util/debug.h"
 #endif
 
 #ifdef HAVE_ARCH_X86_64_SUPPORT
diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c
index d1caa4a..bb617e5 100644
--- a/tools/perf/bench/epoll-ctl.c
+++ b/tools/perf/bench/epoll-ctl.c
@@ -21,12 +21,12 @@
 #include <sys/resource.h>
 #include <sys/epoll.h>
 #include <sys/eventfd.h>
+#include <internal/cpumap.h>
 #include <perf/cpumap.h>
 
 #include "../util/stat.h"
 #include <subcmd/parse-options.h>
 #include "bench.h"
-#include "cpumap.h"
 
 #include <err.h>
 
diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c
index f6b4472..7af6944 100644
--- a/tools/perf/bench/epoll-wait.c
+++ b/tools/perf/bench/epoll-wait.c
@@ -76,12 +76,12 @@
 #include <sys/epoll.h>
 #include <sys/eventfd.h>
 #include <sys/types.h>
+#include <internal/cpumap.h>
 #include <perf/cpumap.h>
 
 #include "../util/stat.h"
 #include <subcmd/parse-options.h>
 #include "bench.h"
-#include "cpumap.h"
 
 #include <err.h>
 
diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index 80e1389..8ba0c33 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -20,13 +20,13 @@
 #include <linux/kernel.h>
 #include <linux/zalloc.h>
 #include <sys/time.h>
+#include <internal/cpumap.h>
 #include <perf/cpumap.h>
 
 #include "../util/stat.h"
 #include <subcmd/parse-options.h>
 #include "bench.h"
 #include "futex.h"
-#include "cpumap.h"
 
 #include <err.h>
 
diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c
index c5d6d0a..d0cae81 100644
--- a/tools/perf/bench/futex-lock-pi.c
+++ b/tools/perf/bench/futex-lock-pi.c
@@ -14,10 +14,10 @@
 #include <linux/kernel.h>
 #include <linux/zalloc.h>
 #include <errno.h>
+#include <internal/cpumap.h>
 #include <perf/cpumap.h>
 #include "bench.h"
 #include "futex.h"
-#include "cpumap.h"
 
 #include <err.h>
 #include <stdlib.h>
diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c
index 75d3418c..a00a689 100644
--- a/tools/perf/bench/futex-requeue.c
+++ b/tools/perf/bench/futex-requeue.c
@@ -20,10 +20,10 @@
 #include <linux/kernel.h>
 #include <linux/time64.h>
 #include <errno.h>
+#include <internal/cpumap.h>
 #include <perf/cpumap.h>
 #include "bench.h"
 #include "futex.h"
-#include "cpumap.h"
 
 #include <err.h>
 #include <stdlib.h>
diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c
index 163fe16..a053cf2 100644
--- a/tools/perf/bench/futex-wake-parallel.c
+++ b/tools/perf/bench/futex-wake-parallel.c
@@ -29,7 +29,8 @@
 #include <linux/time64.h>
 #include <errno.h>
 #include "futex.h"
-#include "cpumap.h"
+#include <internal/cpumap.h>
+#include <perf/cpumap.h>
 
 #include <err.h>
 #include <stdlib.h>
diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c
index 77dcdc1..df81009 100644
--- a/tools/perf/bench/futex-wake.c
+++ b/tools/perf/bench/futex-wake.c
@@ -20,10 +20,10 @@
 #include <linux/kernel.h>
 #include <linux/time64.h>
 #include <errno.h>
+#include <internal/cpumap.h>
 #include <perf/cpumap.h>
 #include "bench.h"
 #include "futex.h"
-#include "cpumap.h"
 
 #include <err.h>
 #include <stdlib.h>
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 62b8ef4..5797253 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -9,7 +9,6 @@
 /* For the CLR_() macros */
 #include <pthread.h>
 
-#include "../builtin.h"
 #include <subcmd/parse-options.h>
 #include "../util/cloexec.h"
 
diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c
index c63eb9a..97e4a4f 100644
--- a/tools/perf/bench/sched-messaging.c
+++ b/tools/perf/bench/sched-messaging.c
@@ -10,9 +10,7 @@
  *
  */
 
-#include "../util/util.h"
 #include <subcmd/parse-options.h>
-#include "../builtin.h"
 #include "bench.h"
 
 /* Test groups of 20 processes spraying to 20 receivers */
diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c
index 35b07f1..3c88d1f 100644
--- a/tools/perf/bench/sched-pipe.c
+++ b/tools/perf/bench/sched-pipe.c
@@ -9,9 +9,7 @@
  *  http://people.redhat.com/mingo/cfs-scheduler/tools/pipe-test-1m.c
  * Ported to perf by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
  */
-#include "../util/util.h"
 #include <subcmd/parse-options.h>
-#include "../builtin.h"
 #include "bench.h"
 
 #include <unistd.h>
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 4e4d2e7..8db8fc9 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -27,6 +27,7 @@
 #include "util/sort.h"
 #include "util/hist.h"
 #include "util/dso.h"
+#include "util/machine.h"
 #include "util/map.h"
 #include "util/session.h"
 #include "util/tool.h"
@@ -39,6 +40,7 @@
 #include <dlfcn.h>
 #include <errno.h>
 #include <linux/bitmap.h>
+#include <linux/err.h>
 
 struct perf_annotate {
 	struct perf_tool tool;
@@ -583,8 +585,8 @@
 	data.path = input_name;
 
 	annotate.session = perf_session__new(&data, false, &annotate.tool);
-	if (annotate.session == NULL)
-		return -1;
+	if (IS_ERR(annotate.session))
+		return PTR_ERR(annotate.session);
 
 	annotate.has_br_stack = perf_header__has_feat(&annotate.session->header,
 						      HEADER_BRANCH_STACK);
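perf_session__new() now reports failure through the pointer itself rather than returning NULL, so callers recover the real errno. The linux/err.h convention in brief (a generic sketch, not perf-specific code):

[source,c]
--
#include <errno.h>
#include <linux/err.h>

void *obj = ERR_PTR(-ENOMEM);		/* error encoded in the pointer */

if (IS_ERR(obj)) {
	int err = PTR_ERR(obj);		/* recovers -ENOMEM */
	/* propagate err instead of a bare -1 */
}
--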
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c
index 1a69eb5..39efa51 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -28,6 +28,7 @@
 #include "util/util.h"
 #include "util/probe-file.h"
 #include <linux/string.h>
+#include <linux/err.h>
 
 static int build_id_cache__kcore_buildid(const char *proc_dir, char *sbuildid)
 {
@@ -422,8 +423,8 @@
 		data.force = force;
 
 		session = perf_session__new(&data, false, NULL);
-		if (session == NULL)
-			return -1;
+		if (IS_ERR(session))
+			return PTR_ERR(session);
 	}
 
 	if (symbol__init(session ? &session->header.env : NULL) < 0)
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index 5a0d8b3..e3ef755 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -18,6 +18,7 @@
 #include "util/symbol.h"
 #include "util/data.h"
 #include <errno.h>
+#include <linux/err.h>
 
 static int sysfs__fprintf_build_id(FILE *fp)
 {
@@ -65,8 +66,8 @@
 		goto out;
 
 	session = perf_session__new(&data, false, &build_id__mark_dso_hit_ops);
-	if (session == NULL)
-		return -1;
+	if (IS_ERR(session))
+		return PTR_ERR(session);
 
 	/*
 	 * We take all buildids when the file contains AUX area tracing data
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index b09b12e..3542b6a 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -13,6 +13,7 @@
 #include <errno.h>
 #include <inttypes.h>
 #include <linux/compiler.h>
+#include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/stringify.h>
 #include <linux/zalloc.h>
@@ -20,6 +21,7 @@
 #include <sys/param.h>
 #include "debug.h"
 #include "builtin.h"
+#include <perf/cpumap.h>
 #include <subcmd/pager.h>
 #include <subcmd/parse-options.h>
 #include "map_symbol.h"
@@ -2780,8 +2782,9 @@
 	}
 
 	session = perf_session__new(&data, 0, &c2c.tool);
-	if (session == NULL) {
-		pr_debug("No memory for session\n");
+	if (IS_ERR(session)) {
+		err = PTR_ERR(session);
+		pr_debug("Error creating perf session\n");
 		goto out;
 	}
 
diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index 42d8157..2603015 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -9,7 +9,6 @@
 
 #include "util/cache.h"
 #include <subcmd/parse-options.h>
-#include "util/util.h"
 #include "util/debug.h"
 #include "util/config.h"
 #include <linux/string.h>
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 827e480..c37a786 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -23,6 +23,7 @@
 #include "util/time-utils.h"
 #include "util/annotate.h"
 #include "util/map.h"
+#include <linux/err.h>
 #include <linux/zalloc.h>
 #include <subcmd/pager.h>
 #include <subcmd/parse-options.h>
@@ -1153,9 +1154,9 @@
 
 	data__for_each_file(i, d) {
 		d->session = perf_session__new(&d->data, false, &pdiff.tool);
-		if (!d->session) {
+		if (IS_ERR(d->session)) {
 			pr_err("Failed to open %s\n", d->data.path);
-			return -1;
+			return PTR_ERR(d->session);
 		}
 
 		has_br_stack = perf_header__has_feat(&d->session->header,
@@ -1185,9 +1186,9 @@
 
 	data__for_each_file(i, d) {
 		d->session = perf_session__new(&d->data, false, &pdiff.tool);
-		if (!d->session) {
+		if (IS_ERR(d->session)) {
+			ret = PTR_ERR(d->session);
 			pr_err("Failed to open %s\n", d->data.path);
-			ret = -1;
 			goto out_delete;
 		}
 
diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c
index 238fa38..4405019 100644
--- a/tools/perf/builtin-evlist.c
+++ b/tools/perf/builtin-evlist.c
@@ -5,18 +5,18 @@
  */
 #include "builtin.h"
 
-#include "util/util.h"
-
 #include <linux/list.h>
 
 #include "perf.h"
 #include "util/evlist.h"
 #include "util/evsel.h"
+#include "util/evsel_fprintf.h"
 #include "util/parse-events.h"
 #include <subcmd/parse-options.h>
 #include "util/session.h"
 #include "util/data.h"
 #include "util/debug.h"
+#include <linux/err.h>
 
 static int __cmd_evlist(const char *file_name, struct perf_attr_details *details)
 {
@@ -30,8 +30,8 @@
 	bool has_tracepoint = false;
 
 	session = perf_session__new(&data, 0, NULL);
-	if (session == NULL)
-		return -1;
+	if (IS_ERR(session))
+		return PTR_ERR(session);
 
 	evlist__for_each_entry(session->evlist, pos) {
 		perf_evsel__fprintf(pos, details, stdout);
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index c14f40b8..372ecb3 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -21,7 +21,9 @@
 #include "util/auxtrace.h"
 #include "util/jit.h"
 #include "util/symbol.h"
+#include "util/synthetic-events.h"
 #include "util/thread.h"
+#include <linux/err.h>
 
 #include <subcmd/parse-options.h>
 
@@ -834,8 +836,8 @@
 
 	data.path = inject.input_name;
 	inject.session = perf_session__new(&data, true, &inject.tool);
-	if (inject.session == NULL)
-		return -1;
+	if (IS_ERR(inject.session))
+		return PTR_ERR(inject.session);
 
 	if (zstd_init(&(inject.session->zstd_data), 0) < 0)
 		pr_warning("Decompression initialization failed.\n");
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index b5682be..1e61e353 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -14,6 +14,7 @@
 #include "util/tool.h"
 #include "util/callchain.h"
 #include "util/time-utils.h"
+#include <linux/err.h>
 
 #include <subcmd/pager.h>
 #include <subcmd/parse-options.h>
@@ -1956,8 +1957,8 @@
 	data.path = input_name;
 
 	kmem_session = session = perf_session__new(&data, false, &perf_kmem);
-	if (session == NULL)
-		return -1;
+	if (IS_ERR(session))
+		return PTR_ERR(session);
 
 	ret = -1;
 
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 0a4fcbe..2227e2f 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -5,6 +5,7 @@
 #include "util/build-id.h"
 #include "util/evsel.h"
 #include "util/evlist.h"
+#include "util/mmap.h"
 #include "util/term.h"
 #include "util/symbol.h"
 #include "util/thread.h"
@@ -17,9 +18,11 @@
 #include "util/debug.h"
 #include "util/tool.h"
 #include "util/stat.h"
+#include "util/synthetic-events.h"
 #include "util/top.h"
 #include "util/data.h"
 #include "util/ordered-events.h"
+#include "util/kvm-stat.h"
 #include "ui/ui.h"
 
 #include <sys/prctl.h>
@@ -31,6 +34,7 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 
+#include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/time64.h>
@@ -58,7 +62,6 @@
 }
 
 #ifdef HAVE_KVM_STAT_SUPPORT
-#include "util/kvm-stat.h"
 
 void exit_event_get_key(struct evsel *evsel,
 			struct perf_sample *sample,
@@ -748,7 +751,7 @@
 {
 	struct evlist *evlist = kvm->evlist;
 	union perf_event *event;
-	struct perf_mmap *md;
+	struct mmap *md;
 	u64 timestamp;
 	s64 n = 0;
 	int err;
@@ -799,7 +802,7 @@
 	s64 n, ntotal = 0;
 	u64 flush_time = ULLONG_MAX, mmap_time;
 
-	for (i = 0; i < kvm->evlist->nr_mmaps; i++) {
+	for (i = 0; i < kvm->evlist->core.nr_mmaps; i++) {
 		n = perf_kvm__mmap_read_idx(kvm, i, &mmap_time);
 		if (n < 0)
 			return -1;
@@ -964,10 +967,10 @@
 		goto out;
 	}
 
-	if (perf_evlist__add_pollfd(kvm->evlist, kvm->timerfd) < 0)
+	if (evlist__add_pollfd(kvm->evlist, kvm->timerfd) < 0)
 		goto out;
 
-	nr_stdin = perf_evlist__add_pollfd(kvm->evlist, fileno(stdin));
+	nr_stdin = evlist__add_pollfd(kvm->evlist, fileno(stdin));
 	if (nr_stdin < 0)
 		goto out;
 
@@ -978,7 +981,7 @@
 	evlist__enable(kvm->evlist);
 
 	while (!done) {
-		struct fdarray *fda = &kvm->evlist->pollfd;
+		struct fdarray *fda = &kvm->evlist->core.pollfd;
 		int rc;
 
 		rc = perf_kvm__mmap_read(kvm);
@@ -1058,7 +1061,7 @@
 		goto out;
 	}
 
-	if (perf_evlist__mmap(evlist, kvm->opts.mmap_pages) < 0) {
+	if (evlist__mmap(evlist, kvm->opts.mmap_pages) < 0) {
 		ui__error("Failed to mmap the events: %s\n",
 			  str_error_r(errno, sbuf, sizeof(sbuf)));
 		evlist__close(evlist);
@@ -1090,9 +1093,9 @@
 
 	kvm->tool = eops;
 	kvm->session = perf_session__new(&file, false, &kvm->tool);
-	if (!kvm->session) {
+	if (IS_ERR(kvm->session)) {
 		pr_err("Initializing perf session failed\n");
-		return -1;
+		return PTR_ERR(kvm->session);
 	}
 
 	symbol__init(&kvm->session->header.env);
@@ -1445,8 +1448,8 @@
 	 * perf session
 	 */
 	kvm->session = perf_session__new(&data, false, &kvm->tool);
-	if (kvm->session == NULL) {
-		err = -1;
+	if (IS_ERR(kvm->session)) {
+		err = PTR_ERR(kvm->session);
 		goto out;
 	}
 	kvm->session->evlist = kvm->evlist;
@@ -1513,11 +1516,21 @@
 }
 #endif /* HAVE_KVM_STAT_SUPPORT */
 
+int __weak kvm_add_default_arch_event(int *argc __maybe_unused,
+					const char **argv __maybe_unused)
+{
+	return 0;
+}
+
 static int __cmd_record(const char *file_name, int argc, const char **argv)
 {
-	int rec_argc, i = 0, j;
+	int rec_argc, i = 0, j, ret;
 	const char **rec_argv;
 
+	ret = kvm_add_default_arch_event(&argc, argv);
+	if (ret)
+		return -EINVAL;
+
 	rec_argc = argc + 2;
 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
 	rec_argv[i++] = strdup("record");
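
The kvm_add_default_arch_event() stub above is declared __weak: the generic
definition returning 0 is used unless an architecture object file provides a
strong definition of the same symbol, which then wins at link time. A sketch
of the mechanism (add_arch_event() is a hypothetical name):

	/* generic side: weak default, used when nothing overrides it */
	int __attribute__((weak)) add_arch_event(int *argc, const char **argv)
	{
		(void)argc; (void)argv;
		return 0;			/* nothing to inject */
	}

	/*
	 * arch side (a separate object file): defining the same symbol
	 * without the weak attribute replaces the default above and may
	 * rewrite argc/argv to append an arch-specific default event.
	 */
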
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
index e290f6b..08e62ae9 100644
--- a/tools/perf/builtin-list.c
+++ b/tools/perf/builtin-list.c
@@ -81,9 +81,9 @@
 						long_desc_flag, details_flag);
 		else if (strcmp(argv[i], "sdt") == 0)
 			print_sdt_events(NULL, NULL, raw_dump);
-		else if (strcmp(argv[i], "metric") == 0)
+		else if (strcmp(argv[i], "metric") == 0 || strcmp(argv[i], "metrics") == 0)
 			metricgroup__print(true, false, NULL, raw_dump, details_flag);
-		else if (strcmp(argv[i], "metricgroup") == 0)
+		else if (strcmp(argv[i], "metricgroup") == 0 || strcmp(argv[i], "metricgroups") == 0)
 			metricgroup__print(false, true, NULL, raw_dump, details_flag);
 		else if ((sep = strchr(argv[i], ':')) != NULL) {
 			int sep_idx;
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 4c2b7f4..474dfd5 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -30,6 +30,7 @@
 #include <linux/hash.h>
 #include <linux/kernel.h>
 #include <linux/zalloc.h>
+#include <linux/err.h>
 
 static struct perf_session *session;
 
@@ -872,9 +873,9 @@
 	};
 
 	session = perf_session__new(&data, false, &eops);
-	if (!session) {
+	if (IS_ERR(session)) {
 		pr_err("Initializing perf session failed\n");
-		return -1;
+		return PTR_ERR(session);
 	}
 
 	symbol__init(&session->header.env);
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 27d2bde..a13f581 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -17,6 +17,7 @@
 #include "util/dso.h"
 #include "util/map.h"
 #include "util/symbol.h"
+#include <linux/err.h>
 
 #define MEM_OPERATION_LOAD	0x1
 #define MEM_OPERATION_STORE	0x2
@@ -249,8 +250,8 @@
 	struct perf_session *session = perf_session__new(&data, false,
 							 &mem->tool);
 
-	if (session == NULL)
-		return -1;
+	if (IS_ERR(session))
+		return PTR_ERR(session);
 
 	if (mem->cpu_list) {
 		ret = perf_session__cpu_bitmap(session, mem->cpu_list,
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 1447004..2333286 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -20,6 +20,7 @@
 #include "util/evlist.h"
 #include "util/evsel.h"
 #include "util/debug.h"
+#include "util/mmap.h"
 #include "util/target.h"
 #include "util/session.h"
 #include "util/tool.h"
@@ -38,6 +39,7 @@
 #include "util/trigger.h"
 #include "util/perf-hooks.h"
 #include "util/cpu-set-sched.h"
+#include "util/synthetic-events.h"
 #include "util/time-utils.h"
 #include "util/units.h"
 #include "util/bpf-event.h"
@@ -53,6 +55,7 @@
 #include <signal.h>
 #include <sys/mman.h>
 #include <sys/wait.h>
+#include <linux/err.h>
 #include <linux/string.h>
 #include <linux/time64.h>
 #include <linux/zalloc.h>
@@ -117,7 +120,7 @@
 	       trigger_is_ready(&switch_output_trigger);
 }
 
-static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
+static int record__write(struct record *rec, struct mmap *map __maybe_unused,
 			 void *bf, size_t size)
 {
 	struct perf_data_file *file = &rec->session->data->file;
@@ -166,7 +169,7 @@
 	return rc;
 }
 
-static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
+static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
 {
 	void *rem_buf;
 	off_t rem_off;
@@ -212,7 +215,7 @@
 	return rc;
 }
 
-static int record__aio_sync(struct perf_mmap *md, bool sync_all)
+static int record__aio_sync(struct mmap *md, bool sync_all)
 {
 	struct aiocb **aiocb = md->aio.aiocb;
 	struct aiocb *cblocks = md->aio.cblocks;
@@ -253,12 +256,12 @@
 	size_t		size;
 };
 
-static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
+static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
 {
 	struct record_aio *aio = to;
 
 	/*
-	 * map->base data pointed by buf is copied into free map->aio.data[] buffer
+	 * map->core.base data pointed to by buf is copied into a free map->aio.data[] buffer
 	 * to release space in the kernel buffer as fast as possible, calling
 	 * perf_mmap__consume() from perf_mmap__push() function.
 	 *
@@ -298,7 +301,7 @@
 	return size;
 }
 
-static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off)
+static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
 {
 	int ret, idx;
 	int trace_fd = rec->session->data->file.fd;
@@ -349,15 +352,15 @@
 {
 	int i;
 	struct evlist *evlist = rec->evlist;
-	struct perf_mmap *maps = evlist->mmap;
+	struct mmap *maps = evlist->mmap;
 
 	if (!record__aio_enabled(rec))
 		return;
 
-	for (i = 0; i < evlist->nr_mmaps; i++) {
-		struct perf_mmap *map = &maps[i];
+	for (i = 0; i < evlist->core.nr_mmaps; i++) {
+		struct mmap *map = &maps[i];
 
-		if (map->base)
+		if (map->core.base)
 			record__aio_sync(map, true);
 	}
 }
@@ -385,7 +388,7 @@
 #else /* HAVE_AIO_SUPPORT */
 static int nr_cblocks_max = 0;
 
-static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused,
+static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
 			    off_t *off __maybe_unused)
 {
 	return -1;
@@ -437,7 +440,7 @@
 	if (!opts->mmap_flush)
 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
 
-	flush_max = perf_evlist__mmap_size(opts->mmap_pages);
+	flush_max = evlist__mmap_size(opts->mmap_pages);
 	flush_max /= 4;
 	if (opts->mmap_flush > flush_max)
 		opts->mmap_flush = flush_max;
@@ -480,7 +483,7 @@
 	return record__write(rec, NULL, event, event->header.size);
 }
 
-static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
+static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
 {
 	struct record *rec = to;
 
@@ -525,7 +528,7 @@
 #ifdef HAVE_AUXTRACE_SUPPORT
 
 static int record__process_auxtrace(struct perf_tool *tool,
-				    struct perf_mmap *map,
+				    struct mmap *map,
 				    union perf_event *event, void *data1,
 				    size_t len1, void *data2, size_t len2)
 {
@@ -563,7 +566,7 @@
 }
 
 static int record__auxtrace_mmap_read(struct record *rec,
-				      struct perf_mmap *map)
+				      struct mmap *map)
 {
 	int ret;
 
@@ -579,7 +582,7 @@
 }
 
 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
-					       struct perf_mmap *map)
+					       struct mmap *map)
 {
 	int ret;
 
@@ -600,8 +603,8 @@
 	int i;
 	int rc = 0;
 
-	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
-		struct perf_mmap *map = &rec->evlist->mmap[i];
+	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
+		struct mmap *map = &rec->evlist->mmap[i];
 
 		if (!map->auxtrace_mmap.base)
 			continue;
@@ -666,7 +669,7 @@
 
 static inline
 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
-			       struct perf_mmap *map __maybe_unused)
+			       struct mmap *map __maybe_unused)
 {
 	return 0;
 }
@@ -705,7 +708,7 @@
 	if (opts->affinity != PERF_AFFINITY_SYS)
 		cpu__setup_cpunode_map();
 
-	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
+	if (evlist__mmap_ex(evlist, opts->mmap_pages,
 				 opts->auxtrace_mmap_pages,
 				 opts->auxtrace_snapshot_mode,
 				 opts->nr_cblocks, opts->affinity,
@@ -753,9 +756,9 @@
 		if (perf_evlist__add_dummy(evlist))
 			return -ENOMEM;
 
-		pos = perf_evlist__first(evlist);
+		pos = evlist__first(evlist);
 		pos->tracking = 0;
-		pos = perf_evlist__last(evlist);
+		pos = evlist__last(evlist);
 		pos->tracking = 1;
 		pos->core.attr.enable_on_exec = 1;
 	}
@@ -786,6 +789,17 @@
 		pos->supported = true;
 	}
 
+	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
+		pr_warning(
+"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
+"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
+"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
+"file is not found in the buildid cache or in the vmlinux path.\n\n"
+"Samples in kernel modules won't be resolved at all.\n\n"
+"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
+"even with a suitable vmlinux or kallsyms file.\n\n");
+	}
+
 	if (perf_evlist__apply_filters(evlist, &pos)) {
 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
 			pos->filter, perf_evsel__name(pos), errno,
@@ -888,7 +902,7 @@
 	.type = PERF_RECORD_FINISHED_ROUND,
 };
 
-static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
+static void record__adjust_affinity(struct record *rec, struct mmap *map)
 {
 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
 	    !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
@@ -935,7 +949,7 @@
 	u64 bytes_written = rec->bytes_written;
 	int i;
 	int rc = 0;
-	struct perf_mmap *maps;
+	struct mmap *maps;
 	int trace_fd = rec->data.file.fd;
 	off_t off = 0;
 
@@ -952,20 +966,20 @@
 	if (record__aio_enabled(rec))
 		off = record__aio_get_pos(trace_fd);
 
-	for (i = 0; i < evlist->nr_mmaps; i++) {
+	for (i = 0; i < evlist->core.nr_mmaps; i++) {
 		u64 flush = 0;
-		struct perf_mmap *map = &maps[i];
+		struct mmap *map = &maps[i];
 
-		if (map->base) {
+		if (map->core.base) {
 			record__adjust_affinity(rec, map);
 			if (synch) {
-				flush = map->flush;
-				map->flush = 1;
+				flush = map->core.flush;
+				map->core.flush = 1;
 			}
 			if (!record__aio_enabled(rec)) {
 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
 					if (synch)
-						map->flush = flush;
+						map->core.flush = flush;
 					rc = -1;
 					goto out;
 				}
@@ -973,13 +987,13 @@
 				if (record__aio_push(rec, map, &off) < 0) {
 					record__aio_set_pos(trace_fd, off);
 					if (synch)
-						map->flush = flush;
+						map->core.flush = flush;
 					rc = -1;
 					goto out;
 				}
 			}
 			if (synch)
-				map->flush = flush;
+				map->core.flush = flush;
 		}
 
 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
@@ -1180,23 +1194,14 @@
 static void snapshot_sig_handler(int sig);
 static void alarm_sig_handler(int sig);
 
-int __weak
-perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
-			    struct perf_tool *tool __maybe_unused,
-			    perf_event__handler_t process __maybe_unused,
-			    struct machine *machine __maybe_unused)
-{
-	return 0;
-}
-
 static const struct perf_event_mmap_page *
 perf_evlist__pick_pc(struct evlist *evlist)
 {
 	if (evlist) {
-		if (evlist->mmap && evlist->mmap[0].base)
-			return evlist->mmap[0].base;
-		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
-			return evlist->overwrite_mmap[0].base;
+		if (evlist->mmap && evlist->mmap[0].core.base)
+			return evlist->mmap[0].core.base;
+		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
+			return evlist->overwrite_mmap[0].core.base;
 	}
 	return NULL;
 }
@@ -1362,9 +1367,9 @@
 	}
 
 	session = perf_session__new(data, false, tool);
-	if (session == NULL) {
+	if (IS_ERR(session)) {
 		pr_err("Perf session creation failed.\n");
-		return -1;
+		return PTR_ERR(session);
 	}
 
 	fd = perf_data__fd(data);
@@ -1407,7 +1412,7 @@
 		err = -1;
 		goto out_child;
 	}
-	session->header.env.comp_mmap_len = session->evlist->mmap_len;
+	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
 
 	err = bpf__apply_obj_config();
 	if (err) {
@@ -1610,7 +1615,7 @@
 		if (hits == rec->samples) {
 			if (done || draining)
 				break;
-			err = perf_evlist__poll(rec->evlist, -1);
+			err = evlist__poll(rec->evlist, -1);
 			/*
 			 * Propagate error, only if there's any. Ignore positive
 			 * number of returned events and interrupt error.
@@ -1619,7 +1624,7 @@
 				err = 0;
 			waking++;
 
-			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
+			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
 				draining = true;
 		}
 
@@ -1976,7 +1981,7 @@
 
 static void switch_output_size_warn(struct record *rec)
 {
-	u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
+	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
 	struct switch_output *s = &rec->switch_output;
 
 	wakeup_size /= 2;
@@ -2371,16 +2376,6 @@
 
 	err = -ENOMEM;
 
-	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
-		pr_warning(
-"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
-"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
-"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
-"file is not found in the buildid cache or in the vmlinux path.\n\n"
-"Samples in kernel modules won't be resolved at all.\n\n"
-"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
-"even with a suitable vmlinux or kallsyms file.\n\n");
-
 	if (rec->no_buildid_cache || rec->no_buildid) {
 		disable_buildid_cache();
 	} else if (rec->switch_output.enabled) {
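
The perf_mmap -> mmap churn in this file comes from the libperf split: the
generic ring-buffer state moves into libperf's struct perf_mmap, which
tools/perf embeds as the core member of its own struct mmap. A toy sketch of
the layering (not the real definitions, just the shape of the change):

	#include <linux/types.h>

	struct perf_mmap {		/* libperf: generic ring state */
		void	*base;
		int	 mask;
		__u64	 flush;
	};

	struct mmap {			/* tools/perf: wraps the libperf part */
		struct perf_mmap core;
		/* perf-only members (aio state, auxtrace_mmap, ...) stay here */
	};

	/* accesses change mechanically: map->base becomes map->core.base */
	static inline void *mmap_base(struct mmap *map)
	{
		return map->core.base;
	}
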
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index b18fab9..aae0e57 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -48,7 +48,7 @@
 #include "util/auxtrace.h"
 #include "util/units.h"
 #include "util/branch.h"
-#include "util/util.h"
+#include "util/util.h" // perf_tip()
 #include "ui/ui.h"
 #include "ui/progress.h"
 
@@ -1269,8 +1269,8 @@
 
 repeat:
 	session = perf_session__new(&data, false, &report.tool);
-	if (session == NULL)
-		return -1;
+	if (IS_ERR(session))
+		return PTR_ERR(session);
 
 	ret = evswitch__init(&report.evswitch, session->evlist, stderr);
 	if (ret)
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index ec96d64..5cacc4f 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -3,8 +3,10 @@
 #include "perf.h"
 #include "perf-sys.h"
 
+#include "util/cpumap.h"
 #include "util/evlist.h"
 #include "util/evsel.h"
+#include "util/evsel_fprintf.h"
 #include "util/symbol.h"
 #include "util/thread.h"
 #include "util/header.h"
@@ -23,6 +25,7 @@
 #include "util/trace-event.h"
 
 #include "util/debug.h"
+#include "util/event.h"
 
 #include <linux/kernel.h>
 #include <linux/log2.h>
@@ -36,7 +39,9 @@
 #include <pthread.h>
 #include <math.h>
 #include <api/fs/fs.h>
+#include <perf/cpumap.h>
 #include <linux/time64.h>
+#include <linux/err.h>
 
 #include <linux/ctype.h>
 
@@ -1794,9 +1799,9 @@
 	int rc = -1;
 
 	session = perf_session__new(&data, false, &sched->tool);
-	if (session == NULL) {
-		pr_debug("No Memory for session\n");
-		return -1;
+	if (IS_ERR(session)) {
+		pr_debug("Error creating perf session");
+		return PTR_ERR(session);
 	}
 
 	symbol__init(&session->header.env);
@@ -2051,7 +2056,7 @@
 			    EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE |
 			    EVSEL__PRINT_CALLCHAIN_ARROW |
 			    EVSEL__PRINT_SKIP_IGNORED,
-			    &callchain_cursor, stdout);
+			    &callchain_cursor, symbol_conf.bt_stop_list,  stdout);
 
 out:
 	printf("\n");
@@ -2986,8 +2991,8 @@
 	symbol_conf.use_callchain = sched->show_callchain;
 
 	session = perf_session__new(&data, false, &sched->tool);
-	if (session == NULL)
-		return -ENOMEM;
+	if (IS_ERR(session))
+		return PTR_ERR(session);
 
 	evlist = session->evlist;
 
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index e079b34..286fc70 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -17,6 +17,7 @@
 #include "util/trace-event.h"
 #include "util/evlist.h"
 #include "util/evsel.h"
+#include "util/evsel_fprintf.h"
 #include "util/evswitch.h"
 #include "util/sort.h"
 #include "util/data.h"
@@ -52,6 +53,7 @@
 #include <unistd.h>
 #include <subcmd/pager.h>
 #include <perf/evlist.h>
+#include <linux/err.h>
 #include "util/record.h"
 #include "util/util.h"
 #include "perf.h"
@@ -1324,7 +1326,8 @@
 		} else
 			printed += fprintf(fp, "\n");
 
-		printed += sample__fprintf_sym(sample, al, 0, print_opts, cursor, fp);
+		printed += sample__fprintf_sym(sample, al, 0, print_opts, cursor,
+					       symbol_conf.bt_stop_list, fp);
 	}
 
 	/* print branch_to information */
@@ -1866,7 +1869,8 @@
 			cursor = &callchain_cursor;
 
 		fputc(cursor ? '\n' : ' ', fp);
-		sample__fprintf_sym(sample, al, 0, output[type].print_ip_opts, cursor, fp);
+		sample__fprintf_sym(sample, al, 0, output[type].print_ip_opts, cursor,
+				    symbol_conf.bt_stop_list, fp);
 	}
 
 	if (PRINT_FIELD(IREGS))
@@ -1915,7 +1919,7 @@
 	int cpu, thread;
 	static int header_printed;
 
-	if (counter->system_wide)
+	if (counter->core.system_wide)
 		nthreads = 1;
 
 	if (!header_printed) {
@@ -2042,7 +2046,7 @@
 		return err;
 
 	evlist = *pevlist;
-	evsel = perf_evlist__last(*pevlist);
+	evsel = evlist__last(*pevlist);
 
 	if (!evsel->priv) {
 		if (scr->per_event_dump) {
@@ -3083,8 +3087,8 @@
 	int i = 0;
 
 	session = perf_session__new(&data, false, NULL);
-	if (!session)
-		return -1;
+	if (IS_ERR(session))
+		return PTR_ERR(session);
 
 	snprintf(scripts_path, MAXPATHLEN, "%s/scripts", get_argv_exec_path());
 
@@ -3754,8 +3758,8 @@
 	}
 
 	session = perf_session__new(&data, false, &script.tool);
-	if (session == NULL)
-		return -1;
+	if (IS_ERR(session))
+		return PTR_ERR(session);
 
 	if (header || header_only) {
 		script.tool.show_feat_hdr = SHOW_FEAT_HEADER;
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 7e17bf9..468fc49 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -61,6 +61,7 @@
 #include "util/tool.h"
 #include "util/string2.h"
 #include "util/metricgroup.h"
+#include "util/synthetic-events.h"
 #include "util/target.h"
 #include "util/time-utils.h"
 #include "util/top.h"
@@ -82,6 +83,7 @@
 #include <unistd.h>
 #include <sys/time.h>
 #include <sys/resource.h>
+#include <linux/err.h>
 
 #include <linux/ctype.h>
 #include <perf/evlist.h>
@@ -233,7 +235,7 @@
 #define WRITE_STAT_ROUND_EVENT(time, interval) \
 	write_stat_round_event(time, PERF_STAT_ROUND_TYPE__ ## interval)
 
-#define SID(e, x, y) xyarray__entry(e->sample_id, x, y)
+#define SID(e, x, y) xyarray__entry(e->core.sample_id, x, y)
 
 static int
 perf_evsel__write_stat_event(struct evsel *counter, u32 cpu, u32 thread,
@@ -276,7 +278,7 @@
 	if (!counter->supported)
 		return -ENOENT;
 
-	if (counter->system_wide)
+	if (counter->core.system_wide)
 		nthreads = 1;
 
 	for (thread = 0; thread < nthreads; thread++) {
@@ -540,8 +542,8 @@
 		if (err < 0)
 			return err;
 
-		err = perf_stat_synthesize_config(&stat_config, NULL, evsel_list,
-						  process_synthesized_event, is_pipe);
+		err = perf_event__synthesize_stat_events(&stat_config, NULL, evsel_list,
+							 process_synthesized_event, is_pipe);
 		if (err < 0)
 			return err;
 	}
@@ -822,18 +824,6 @@
 	return cpu_map__get_core(map, cpu, NULL);
 }
 
-static int cpu_map__get_max(struct perf_cpu_map *map)
-{
-	int i, max = -1;
-
-	for (i = 0; i < map->nr; i++) {
-		if (map->map[i] > max)
-			max = map->map[i];
-	}
-
-	return max;
-}
-
 static int perf_stat__get_aggr(struct perf_stat_config *config,
 			       aggr_get_id_t get_id, struct perf_cpu_map *map, int idx)
 {
@@ -928,7 +918,7 @@
 	 * taking the highest cpu number to be the size of
 	 * the aggregation translate cpumap.
 	 */
-	nr = cpu_map__get_max(evsel_list->core.cpus);
+	nr = perf_cpu_map__max(evsel_list->core.cpus);
 	stat_config.cpus_aggr_map = perf_cpu_map__empty_new(nr + 1);
 	return stat_config.cpus_aggr_map ? 0 : -ENOMEM;
 }
@@ -1447,9 +1437,9 @@
 	}
 
 	session = perf_session__new(data, false, NULL);
-	if (session == NULL) {
-		pr_err("Perf session creation failed.\n");
-		return -1;
+	if (IS_ERR(session)) {
+		pr_err("Perf session creation failed\n");
+		return PTR_ERR(session);
 	}
 
 	init_features(session);
@@ -1646,8 +1636,8 @@
 	perf_stat.data.mode = PERF_DATA_MODE_READ;
 
 	session = perf_session__new(&perf_stat.data, false, &perf_stat.tool);
-	if (session == NULL)
-		return -1;
+	if (IS_ERR(session))
+		return PTR_ERR(session);
 
 	perf_stat.session  = session;
 	stat_config.output = stderr;
@@ -1681,7 +1671,7 @@
 		struct evsel *counter;
 
 		evlist__for_each_entry(evsel_list, counter) {
-			if (!counter->system_wide)
+			if (!counter->core.system_wide)
 				return;
 		}
 
@@ -1963,8 +1953,11 @@
 			fprintf(output, "[ perf stat: executing run #%d ... ]\n",
 				run_idx + 1);
 
+		if (run_idx != 0)
+			perf_evlist__reset_prev_raw_counts(evsel_list);
+
 		status = run_perf_stat(argc, argv, run_idx);
-		if (forever && status != -1) {
+		if (forever && status != -1 && !interval) {
 			print_counters(NULL, argc, argv);
 			perf_stat__reset_stats();
 		}
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index e0e8226..9e84fae 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -35,6 +35,7 @@
 #include "util/tool.h"
 #include "util/data.h"
 #include "util/debug.h"
+#include <linux/err.h>
 
 #ifdef LACKS_OPEN_MEMSTREAM_PROTOTYPE
 FILE *open_memstream(char **ptr, size_t *sizeloc);
@@ -1601,8 +1602,8 @@
 							 &tchart->tool);
 	int ret = -EINVAL;
 
-	if (session == NULL)
-		return -1;
+	if (IS_ERR(session))
+		return PTR_ERR(session);
 
 	symbol__init(&session->header.env);
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 726e3f2..1f60124 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -27,11 +27,14 @@
 #include "util/dso.h"
 #include "util/evlist.h"
 #include "util/evsel.h"
+#include "util/evsel_config.h"
 #include "util/event.h"
 #include "util/machine.h"
 #include "util/map.h"
+#include "util/mmap.h"
 #include "util/session.h"
 #include "util/symbol.h"
+#include "util/synthetic-events.h"
 #include "util/top.h"
 #include "util/util.h"
 #include <linux/rbtree.h>
@@ -76,6 +79,7 @@
 #include <linux/stringify.h>
 #include <linux/time64.h>
 #include <linux/types.h>
+#include <linux/err.h>
 
 #include <linux/ctype.h>
 
@@ -528,7 +532,7 @@
 				prompt_integer(&counter, "Enter details event counter");
 
 				if (counter >= top->evlist->core.nr_entries) {
-					top->sym_evsel = perf_evlist__first(top->evlist);
+					top->sym_evsel = evlist__first(top->evlist);
 					fprintf(stderr, "Sorry, no such event, using %s.\n", perf_evsel__name(top->sym_evsel));
 					sleep(1);
 					break;
@@ -537,7 +541,7 @@
 					if (top->sym_evsel->idx == counter)
 						break;
 			} else
-				top->sym_evsel = perf_evlist__first(top->evlist);
+				top->sym_evsel = evlist__first(top->evlist);
 			break;
 		case 'f':
 			prompt_integer(&top->count_filter, "Enter display event count filter");
@@ -861,7 +865,7 @@
 {
 	struct record_opts *opts = &top->record_opts;
 	struct evlist *evlist = top->evlist;
-	struct perf_mmap *md;
+	struct mmap *md;
 	union perf_event *event;
 
 	md = opts->overwrite ? &evlist->overwrite_mmap[idx] : &evlist->mmap[idx];
@@ -901,7 +905,7 @@
 	if (overwrite)
 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_DATA_PENDING);
 
-	for (i = 0; i < top->evlist->nr_mmaps; i++)
+	for (i = 0; i < top->evlist->core.nr_mmaps; i++)
 		perf_top__mmap_read_idx(top, i);
 
 	if (overwrite) {
@@ -959,7 +963,7 @@
 		/* has term for current event */
 		if ((overwrite < 0) && (set >= 0)) {
 			/* if it's first event, set overwrite */
-			if (evsel == perf_evlist__first(evlist))
+			if (evsel == evlist__first(evlist))
 				overwrite = set;
 			else
 				return -1;
@@ -983,7 +987,7 @@
 		return 0;
 
 	/* only fall back when first event fails */
-	if (evsel != perf_evlist__first(evlist))
+	if (evsel != evlist__first(evlist))
 		return 0;
 
 	evlist__for_each_entry(evlist, counter)
@@ -1040,7 +1044,7 @@
 		}
 	}
 
-	if (perf_evlist__mmap(evlist, opts->mmap_pages) < 0) {
+	if (evlist__mmap(evlist, opts->mmap_pages) < 0) {
 		ui__error("Failed to mmap with %d (%s)\n",
 			    errno, str_error_r(errno, msg, sizeof(msg)));
 		goto out_err;
@@ -1304,7 +1308,7 @@
 	}
 
 	/* Wait for a minimal set of events before starting the snapshot */
-	perf_evlist__poll(top->evlist, 100);
+	evlist__poll(top->evlist, 100);
 
 	perf_top__mmap_read(top);
 
@@ -1314,7 +1318,7 @@
 		perf_top__mmap_read(top);
 
 		if (opts->overwrite || (hits == top->samples))
-			ret = perf_evlist__poll(top->evlist, 100);
+			ret = evlist__poll(top->evlist, 100);
 
 		if (resize) {
 			perf_top__resize(top);
@@ -1641,7 +1645,7 @@
 		goto out_delete_evlist;
 	}
 
-	top.sym_evsel = perf_evlist__first(top.evlist);
+	top.sym_evsel = evlist__first(top.evlist);
 
 	if (!callchain_param.enabled) {
 		symbol_conf.cumulate_callchain = false;
@@ -1671,8 +1675,8 @@
 	}
 
 	top.session = perf_session__new(NULL, false, NULL);
-	if (top.session == NULL) {
-		status = -1;
+	if (IS_ERR(top.session)) {
+		status = PTR_ERR(top.session);
 		goto out_delete_evlist;
 	}
 
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 0f633f0..bb5130d 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -28,8 +28,12 @@
 #include "util/dso.h"
 #include "util/env.h"
 #include "util/event.h"
+#include "util/evsel.h"
+#include "util/evsel_fprintf.h"
+#include "util/synthetic-events.h"
 #include "util/evlist.h"
 #include "util/evswitch.h"
+#include "util/mmap.h"
 #include <subcmd/pager.h>
 #include <subcmd/exec-cmd.h>
 #include "util/machine.h"
@@ -2074,7 +2078,7 @@
 				        EVSEL__PRINT_DSO |
 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
 
-	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
+	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, symbol_conf.bt_stop_list, trace->output);
 }
 
 static const char *errno_to_name(struct evsel *evsel, int err)
@@ -3408,7 +3412,7 @@
 	if (trace->dump.map)
 		bpf_map__fprintf(trace->dump.map, trace->output);
 
-	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
+	err = evlist__mmap(evlist, trace->opts.mmap_pages);
 	if (err < 0)
 		goto out_error_mmap;
 
@@ -3425,7 +3429,7 @@
 
 	trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
 				  evlist->core.threads->nr > 1 ||
-				  perf_evlist__first(evlist)->core.attr.inherit;
+				  evlist__first(evlist)->core.attr.inherit;
 
 	/*
 	 * Now that we already used evsel->core.attr to ask the kernel to setup the
@@ -3441,9 +3445,9 @@
 again:
 	before = trace->nr_events;
 
-	for (i = 0; i < evlist->nr_mmaps; i++) {
+	for (i = 0; i < evlist->core.nr_mmaps; i++) {
 		union perf_event *event;
-		struct perf_mmap *md;
+		struct mmap *md;
 
 		md = &evlist->mmap[i];
 		if (perf_mmap__read_init(md) < 0)
@@ -3472,8 +3476,8 @@
 	if (trace->nr_events == before) {
 		int timeout = done ? 100 : -1;
 
-		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
-			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
+		if (!draining && evlist__poll(evlist, timeout) > 0) {
+			if (evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
 				draining = true;
 
 			goto again;
@@ -3584,8 +3588,8 @@
 	trace->multiple_threads = true;
 
 	session = perf_session__new(&data, false, &trace->tool);
-	if (session == NULL)
-		return -1;
+	if (IS_ERR(session))
+		return PTR_ERR(session);
 
 	if (trace->opts.target.pid)
 		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
diff --git a/tools/perf/jvmti/Build b/tools/perf/jvmti/Build
index eaeb8cb..1e148bb 100644
--- a/tools/perf/jvmti/Build
+++ b/tools/perf/jvmti/Build
@@ -1,8 +1,17 @@
 jvmti-y += libjvmti.o
 jvmti-y += jvmti_agent.o
 
+# For strlcpy
+jvmti-y += libstring.o
+
 CFLAGS_jvmti         = -fPIC -DPIC -I$(JDIR)/include -I$(JDIR)/include/linux
 CFLAGS_REMOVE_jvmti  = -Wmissing-declarations
 CFLAGS_REMOVE_jvmti += -Wstrict-prototypes
 CFLAGS_REMOVE_jvmti += -Wextra
 CFLAGS_REMOVE_jvmti += -Wwrite-strings
+
+CFLAGS_libstring.o += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
+
+$(OUTPUT)jvmti/libstring.o: ../lib/string.c FORCE
+	$(call rule_mkdir)
+	$(call if_changed_dep,cc_o_c)
diff --git a/tools/perf/lib/Makefile b/tools/perf/lib/Makefile
index a67efb8..85ccb8c 100644
--- a/tools/perf/lib/Makefile
+++ b/tools/perf/lib/Makefile
@@ -59,7 +59,13 @@
   CFLAGS := -g -Wall
 endif
 
-INCLUDES = -I$(srctree)/tools/perf/lib/include -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(SRCARCH)/include/ -I$(srctree)/tools/arch/$(SRCARCH)/include/uapi -I$(srctree)/tools/include/uapi
+INCLUDES = \
+-I$(srctree)/tools/perf/lib/include \
+-I$(srctree)/tools/lib/ \
+-I$(srctree)/tools/include \
+-I$(srctree)/tools/arch/$(SRCARCH)/include/ \
+-I$(srctree)/tools/arch/$(SRCARCH)/include/uapi \
+-I$(srctree)/tools/include/uapi
 
 # Append required CFLAGS
 override CFLAGS += $(EXTRA_WARNINGS)
@@ -88,13 +94,34 @@
 
 LIBPERF_ALL := $(LIBPERF_A) $(OUTPUT)libperf.so*
 
+LIB_DIR := $(srctree)/tools/lib/api/
+
+ifneq ($(OUTPUT),)
+ifneq ($(subdir),)
+  API_PATH=$(OUTPUT)/../lib/api/
+else
+  API_PATH=$(OUTPUT)
+endif
+else
+  API_PATH=$(LIB_DIR)
+endif
+
+LIBAPI = $(API_PATH)libapi.a
+
+$(LIBAPI): FORCE
+	$(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) $(OUTPUT)libapi.a
+
+$(LIBAPI)-clean:
+	$(call QUIET_CLEAN, libapi)
+	$(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null
+
 $(LIBPERF_IN): FORCE
 	$(Q)$(MAKE) $(build)=libperf
 
 $(LIBPERF_A): $(LIBPERF_IN)
 	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIBPERF_IN)
 
-$(LIBPERF_SO): $(LIBPERF_IN)
+$(LIBPERF_SO): $(LIBPERF_IN) $(LIBAPI)
 	$(QUIET_LINK)$(CC) --shared -Wl,-soname,libperf.so \
                                     -Wl,--version-script=$(VERSION_SCRIPT) $^ -o $@
 	@ln -sf $(@F) $(OUTPUT)libperf.so
@@ -106,12 +133,12 @@
 all: fixdep
 	$(Q)$(MAKE) libs
 
-clean:
+clean: $(LIBAPI)-clean
 	$(call QUIET_CLEAN, libperf) $(RM) $(LIBPERF_A) \
                 *.o *~ *.a *.so *.so.$(VERSION) *.so.$(LIBPERF_VERSION) .*.d .*.cmd LIBPERF-CFLAGS $(LIBPERF_PC)
 	$(Q)$(MAKE) -C tests clean
 
-tests:
+tests: libs
 	$(Q)$(MAKE) -C tests
 	$(Q)$(MAKE) -C tests run
 
@@ -146,6 +173,7 @@
 		$(call do_install,include/perf/threadmap.h,$(prefix)/include/perf,644); \
 		$(call do_install,include/perf/evlist.h,$(prefix)/include/perf,644); \
 		$(call do_install,include/perf/evsel.h,$(prefix)/include/perf,644);
+		$(call do_install,include/perf/event.h,$(prefix)/include/perf,644);
 
 install_pkgconfig: $(LIBPERF_PC)
 	$(call QUIET_INSTALL, $(LIBPERF_PC)) \
diff --git a/tools/perf/lib/core.c b/tools/perf/lib/core.c
index 29d5e33..d0b9ae4 100644
--- a/tools/perf/lib/core.c
+++ b/tools/perf/lib/core.c
@@ -4,7 +4,9 @@
 
 #include <stdio.h>
 #include <stdarg.h>
+#include <unistd.h>
 #include <perf/core.h>
+#include <internal/lib.h>
 #include "internal.h"
 
 static int __base_pr(enum libperf_print_level level, const char *format,
@@ -15,11 +17,6 @@
 
 static libperf_print_fn_t __libperf_pr = __base_pr;
 
-void libperf_set_print(libperf_print_fn_t fn)
-{
-	__libperf_pr = fn;
-}
-
 __printf(2, 3)
 void libperf_print(enum libperf_print_level level, const char *format, ...)
 {
@@ -32,3 +29,9 @@
 	__libperf_pr(level, format, args);
 	va_end(args);
 }
+
+void libperf_init(libperf_print_fn_t fn)
+{
+	page_size = sysconf(_SC_PAGE_SIZE);
+	__libperf_pr = fn;
+}
diff --git a/tools/perf/lib/cpumap.c b/tools/perf/lib/cpumap.c
index 1f0e6f3..2ca1fafa 100644
--- a/tools/perf/lib/cpumap.c
+++ b/tools/perf/lib/cpumap.c
@@ -260,3 +260,15 @@
 
 	return -1;
 }
+
+int perf_cpu_map__max(struct perf_cpu_map *map)
+{
+	int i, max = -1;
+
+	for (i = 0; i < map->nr; i++) {
+		if (map->map[i] > max)
+			max = map->map[i];
+	}
+
+	return max;
+}
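
perf_cpu_map__max() returns the highest CPU number in the map, not the number
of entries; builtin-stat.c above drops its private copy in favour of this one.
Usage sketch (assuming libperf's perf_cpu_map__new(), which parses a CPU list
string):

	#include <perf/cpumap.h>
	#include <stdio.h>

	int main(void)
	{
		struct perf_cpu_map *cpus = perf_cpu_map__new("0,2,7");

		if (!cpus)
			return 1;
		printf("%d\n", perf_cpu_map__max(cpus));	/* 7, not 3 */
		perf_cpu_map__put(cpus);
		return 0;
	}
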
diff --git a/tools/perf/lib/evlist.c b/tools/perf/lib/evlist.c
index f4dc9a2..d1496fe 100644
--- a/tools/perf/lib/evlist.c
+++ b/tools/perf/lib/evlist.c
@@ -1,16 +1,30 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <perf/evlist.h>
 #include <perf/evsel.h>
+#include <linux/bitops.h>
 #include <linux/list.h>
+#include <linux/hash.h>
+#include <sys/ioctl.h>
 #include <internal/evlist.h>
 #include <internal/evsel.h>
+#include <internal/xyarray.h>
 #include <linux/zalloc.h>
 #include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <poll.h>
 #include <perf/cpumap.h>
 #include <perf/threadmap.h>
+#include <api/fd/array.h>
 
 void perf_evlist__init(struct perf_evlist *evlist)
 {
+	int i;
+
+	for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i)
+		INIT_HLIST_HEAD(&evlist->heads[i]);
 	INIT_LIST_HEAD(&evlist->entries);
 	evlist->nr_entries = 0;
 }
@@ -157,3 +171,113 @@
 	perf_evlist__for_each_entry(evlist, evsel)
 		perf_evsel__disable(evsel);
 }
+
+u64 perf_evlist__read_format(struct perf_evlist *evlist)
+{
+	struct perf_evsel *first = perf_evlist__first(evlist);
+
+	return first->attr.read_format;
+}
+
+#define SID(e, x, y) xyarray__entry(e->sample_id, x, y)
+
+static void perf_evlist__id_hash(struct perf_evlist *evlist,
+				 struct perf_evsel *evsel,
+				 int cpu, int thread, u64 id)
+{
+	int hash;
+	struct perf_sample_id *sid = SID(evsel, cpu, thread);
+
+	sid->id = id;
+	sid->evsel = evsel;
+	hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS);
+	hlist_add_head(&sid->node, &evlist->heads[hash]);
+}
+
+void perf_evlist__id_add(struct perf_evlist *evlist,
+			 struct perf_evsel *evsel,
+			 int cpu, int thread, u64 id)
+{
+	perf_evlist__id_hash(evlist, evsel, cpu, thread, id);
+	evsel->id[evsel->ids++] = id;
+}
+
+int perf_evlist__id_add_fd(struct perf_evlist *evlist,
+			   struct perf_evsel *evsel,
+			   int cpu, int thread, int fd)
+{
+	u64 read_data[4] = { 0, };
+	int id_idx = 1; /* The first entry is the counter value */
+	u64 id;
+	int ret;
+
+	ret = ioctl(fd, PERF_EVENT_IOC_ID, &id);
+	if (!ret)
+		goto add;
+
+	if (errno != ENOTTY)
+		return -1;
+
+	/* Legacy way to get the event id... All hail to old kernels! */
+
+	/*
+	 * This way does not work with group format read, so bail
+	 * out in that case.
+	 */
+	if (perf_evlist__read_format(evlist) & PERF_FORMAT_GROUP)
+		return -1;
+
+	if (!(evsel->attr.read_format & PERF_FORMAT_ID) ||
+	    read(fd, &read_data, sizeof(read_data)) == -1)
+		return -1;
+
+	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		++id_idx;
+	if (evsel->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		++id_idx;
+
+	id = read_data[id_idx];
+
+add:
+	perf_evlist__id_add(evlist, evsel, cpu, thread, id);
+	return 0;
+}
+
+int perf_evlist__alloc_pollfd(struct perf_evlist *evlist)
+{
+	int nr_cpus = perf_cpu_map__nr(evlist->cpus);
+	int nr_threads = perf_thread_map__nr(evlist->threads);
+	int nfds = 0;
+	struct perf_evsel *evsel;
+
+	perf_evlist__for_each_entry(evlist, evsel) {
+		if (evsel->system_wide)
+			nfds += nr_cpus;
+		else
+			nfds += nr_cpus * nr_threads;
+	}
+
+	if (fdarray__available_entries(&evlist->pollfd) < nfds &&
+	    fdarray__grow(&evlist->pollfd, nfds) < 0)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd,
+			    void *ptr, short revent)
+{
+	int pos = fdarray__add(&evlist->pollfd, fd, revent | POLLERR | POLLHUP);
+
+	if (pos >= 0) {
+		evlist->pollfd.priv[pos].ptr = ptr;
+		fcntl(fd, F_SETFL, O_NONBLOCK);
+	}
+
+	return pos;
+}
+
+int perf_evlist__poll(struct perf_evlist *evlist, int timeout)
+{
+	return fdarray__poll(&evlist->pollfd, timeout);
+}
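
The legacy branch of perf_evlist__id_add_fd() above has to recover the event
ID from a plain read(): the position of the ID in the returned buffer depends
on which PERF_FORMAT_* bits are set. A standalone sketch of that index
arithmetic:

	#include <linux/perf_event.h>
	#include <stdio.h>

	/* where the ID lands in read_data[] for a given read_format */
	static int id_index(__u64 read_format)
	{
		int idx = 1;	/* read_data[0] is the counter value */

		if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
			idx++;
		if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
			idx++;
		return idx;
	}

	int main(void)
	{
		/* { value, time_enabled, id } -> index 2 */
		printf("%d\n", id_index(PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED));
		return 0;
	}
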
diff --git a/tools/perf/lib/evsel.c b/tools/perf/lib/evsel.c
index 24abc80..a8cb582 100644
--- a/tools/perf/lib/evsel.c
+++ b/tools/perf/lib/evsel.c
@@ -230,3 +230,33 @@
 {
 	return &evsel->attr;
 }
+
+int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads)
+{
+	if (ncpus == 0 || nthreads == 0)
+		return 0;
+
+	if (evsel->system_wide)
+		nthreads = 1;
+
+	evsel->sample_id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id));
+	if (evsel->sample_id == NULL)
+		return -ENOMEM;
+
+	evsel->id = zalloc(ncpus * nthreads * sizeof(u64));
+	if (evsel->id == NULL) {
+		xyarray__delete(evsel->sample_id);
+		evsel->sample_id = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void perf_evsel__free_id(struct perf_evsel *evsel)
+{
+	xyarray__delete(evsel->sample_id);
+	evsel->sample_id = NULL;
+	zfree(&evsel->id);
+	evsel->ids = 0;
+}
diff --git a/tools/perf/lib/include/internal/evlist.h b/tools/perf/lib/include/internal/evlist.h
index 448891f..9f440ab 100644
--- a/tools/perf/lib/include/internal/evlist.h
+++ b/tools/perf/lib/include/internal/evlist.h
@@ -3,6 +3,11 @@
 #define __LIBPERF_INTERNAL_EVLIST_H
 
 #include <linux/list.h>
+#include <api/fd/array.h>
+#include <internal/evsel.h>
+
+#define PERF_EVLIST__HLIST_BITS 8
+#define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS)
 
 struct perf_cpu_map;
 struct perf_thread_map;
@@ -13,8 +18,16 @@
 	bool			 has_user_cpus;
 	struct perf_cpu_map	*cpus;
 	struct perf_thread_map	*threads;
+	int			 nr_mmaps;
+	size_t			 mmap_len;
+	struct fdarray		 pollfd;
+	struct hlist_head	 heads[PERF_EVLIST__HLIST_SIZE];
 };
 
+int perf_evlist__alloc_pollfd(struct perf_evlist *evlist);
+int perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd,
+			    void *ptr, short revent);
+
 /**
  * __perf_evlist__for_each_entry - iterate thru all the evsels
  * @list: list_head instance to iterate
@@ -47,4 +60,24 @@
 #define perf_evlist__for_each_entry_reverse(evlist, evsel) \
 	__perf_evlist__for_each_entry_reverse(&(evlist)->entries, evsel)
 
+static inline struct perf_evsel *perf_evlist__first(struct perf_evlist *evlist)
+{
+	return list_entry(evlist->entries.next, struct perf_evsel, node);
+}
+
+static inline struct perf_evsel *perf_evlist__last(struct perf_evlist *evlist)
+{
+	return list_entry(evlist->entries.prev, struct perf_evsel, node);
+}
+
+u64 perf_evlist__read_format(struct perf_evlist *evlist);
+
+void perf_evlist__id_add(struct perf_evlist *evlist,
+			 struct perf_evsel *evsel,
+			 int cpu, int thread, u64 id);
+
+int perf_evlist__id_add_fd(struct perf_evlist *evlist,
+			   struct perf_evsel *evsel,
+			   int cpu, int thread, int fd);
+
 #endif /* __LIBPERF_INTERNAL_EVLIST_H */
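
The heads[] array added above pairs with the hashing in lib/evlist.c: each
sample ID is filed under one of 1 << PERF_EVLIST__HLIST_BITS = 256 hlist
buckets. A sketch of the lookup side (assumption: mirrors the hashing in
perf_evlist__id_hash()):

	#include <linux/hash.h>
	#include <internal/evlist.h>

	/* bucket that perf_evlist__id_hash() files a sample ID under */
	static inline struct hlist_head *
	id_bucket(struct perf_evlist *evlist, __u64 id)
	{
		return &evlist->heads[hash_64(id, PERF_EVLIST__HLIST_BITS)];
	}
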
diff --git a/tools/perf/lib/include/internal/evsel.h b/tools/perf/lib/include/internal/evsel.h
index 8b854d1..a69b829 100644
--- a/tools/perf/lib/include/internal/evsel.h
+++ b/tools/perf/lib/include/internal/evsel.h
@@ -4,9 +4,35 @@
 
 #include <linux/types.h>
 #include <linux/perf_event.h>
+#include <stdbool.h>
+#include <sys/types.h>
 
 struct perf_cpu_map;
 struct perf_thread_map;
+struct xyarray;
+
+/*
+ * Per fd, to map back from PERF_SAMPLE_ID to evsel, only used when there are
+ * more than one entry in the evlist.
+ */
+struct perf_sample_id {
+	struct hlist_node	 node;
+	u64			 id;
+	struct perf_evsel	*evsel;
+	/*
+	* 'idx' will be used for AUX area sampling. A sample will have AUX area
+	* data that will be queued for decoding, where there are separate
+	* queues for each CPU (per-cpu tracing) or task (per-thread tracing).
+	* The sample ID can be used to look up 'idx', which is effectively the
+	* queue number.
+	*/
+	int			 idx;
+	int			 cpu;
+	pid_t			 tid;
+
+	/* Holds total ID period value for PERF_SAMPLE_READ processing. */
+	u64			 period;
+};
 
 struct perf_evsel {
 	struct list_head	 node;
@@ -15,9 +41,13 @@
 	struct perf_cpu_map	*own_cpus;
 	struct perf_thread_map	*threads;
 	struct xyarray		*fd;
+	struct xyarray		*sample_id;
+	u64			*id;
+	u32			 ids;
 
 	/* parse modifier helper */
 	int			 nr_members;
+	bool			 system_wide;
 };
 
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
@@ -26,4 +56,7 @@
 int perf_evsel__read_size(struct perf_evsel *evsel);
 int perf_evsel__apply_filter(struct perf_evsel *evsel, const char *filter);
 
+int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads);
+void perf_evsel__free_id(struct perf_evsel *evsel);
+
 #endif /* __LIBPERF_INTERNAL_EVSEL_H */
diff --git a/tools/perf/lib/include/internal/lib.h b/tools/perf/lib/include/internal/lib.h
index 0b56f12..5175d49 100644
--- a/tools/perf/lib/include/internal/lib.h
+++ b/tools/perf/lib/include/internal/lib.h
@@ -2,7 +2,9 @@
 #ifndef __LIBPERF_INTERNAL_LIB_H
 #define __LIBPERF_INTERNAL_LIB_H
 
-#include <unistd.h>
+#include <sys/types.h>
+
+extern unsigned int page_size;
 
 ssize_t readn(int fd, void *buf, size_t n);
 ssize_t writen(int fd, const void *buf, size_t n);
diff --git a/tools/perf/lib/include/internal/mmap.h b/tools/perf/lib/include/internal/mmap.h
new file mode 100644
index 0000000..ba1e519
--- /dev/null
+++ b/tools/perf/lib/include/internal/mmap.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LIBPERF_INTERNAL_MMAP_H
+#define __LIBPERF_INTERNAL_MMAP_H
+
+#include <linux/compiler.h>
+#include <linux/refcount.h>
+#include <linux/types.h>
+#include <stdbool.h>
+
+/* perf sample has a 16-bit size limit */
+#define PERF_SAMPLE_MAX_SIZE (1 << 16)
+
+/**
+ * struct perf_mmap - perf's ring buffer mmap details
+ *
+ * @refcnt - e.g. code using PERF_EVENT_IOC_SET_OUTPUT to share this
+ */
+struct perf_mmap {
+	void		*base;
+	int		 mask;
+	int		 fd;
+	int		 cpu;
+	refcount_t	 refcnt;
+	u64		 prev;
+	u64		 start;
+	u64		 end;
+	bool		 overwrite;
+	u64		 flush;
+	char		 event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8);
+};
+
+#endif /* __LIBPERF_INTERNAL_MMAP_H */
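
For orientation: base points at the whole mapping, i.e. a struct
perf_event_mmap_page header page followed by a power-of-two data area; mask
wraps offsets into that area, while prev/start/end track read positions. A
hedged sketch of how a reader indexes the ring (real readers also consult
data_head/data_tail in the header page with the appropriate memory barriers):

	#include <internal/lib.h>	/* page_size */
	#include <internal/mmap.h>

	/* byte at ring offset 'off'; the data area follows the header page */
	static unsigned char ring_byte(struct perf_mmap *map, __u64 off)
	{
		unsigned char *data = (unsigned char *)map->base + page_size;

		return data[off & map->mask];
	}
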
diff --git a/tools/perf/lib/include/perf/core.h b/tools/perf/lib/include/perf/core.h
index c341a7b..cfd70e7 100644
--- a/tools/perf/lib/include/perf/core.h
+++ b/tools/perf/lib/include/perf/core.h
@@ -17,6 +17,6 @@
 typedef int (*libperf_print_fn_t)(enum libperf_print_level level,
 				  const char *, va_list ap);
 
-LIBPERF_API void libperf_set_print(libperf_print_fn_t fn);
+LIBPERF_API void libperf_init(libperf_print_fn_t fn);
 
 #endif /* __LIBPERF_CORE_H */
diff --git a/tools/perf/lib/include/perf/cpumap.h b/tools/perf/lib/include/perf/cpumap.h
index 8aa995c..ac9aa49 100644
--- a/tools/perf/lib/include/perf/cpumap.h
+++ b/tools/perf/lib/include/perf/cpumap.h
@@ -16,6 +16,7 @@
 LIBPERF_API int perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx);
 LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus);
 LIBPERF_API bool perf_cpu_map__empty(const struct perf_cpu_map *map);
+LIBPERF_API int perf_cpu_map__max(struct perf_cpu_map *map);
 
 #define perf_cpu_map__for_each_cpu(cpu, idx, cpus)		\
 	for ((idx) = 0, (cpu) = perf_cpu_map__cpu(cpus, idx);	\
diff --git a/tools/perf/lib/include/perf/evlist.h b/tools/perf/lib/include/perf/evlist.h
index 38365f8..8a2ce07 100644
--- a/tools/perf/lib/include/perf/evlist.h
+++ b/tools/perf/lib/include/perf/evlist.h
@@ -31,5 +31,6 @@
 LIBPERF_API void perf_evlist__set_maps(struct perf_evlist *evlist,
 				       struct perf_cpu_map *cpus,
 				       struct perf_thread_map *threads);
+LIBPERF_API int perf_evlist__poll(struct perf_evlist *evlist, int timeout);
 
 #endif /* __LIBPERF_EVLIST_H */
diff --git a/tools/perf/lib/lib.c b/tools/perf/lib/lib.c
index 2a81819..1865893 100644
--- a/tools/perf/lib/lib.c
+++ b/tools/perf/lib/lib.c
@@ -5,6 +5,8 @@
 #include <linux/kernel.h>
 #include <internal/lib.h>
 
+unsigned int page_size;
+
 static ssize_t ion(bool is_read, int fd, void *buf, size_t n)
 {
 	void *buf_start = buf;
diff --git a/tools/perf/lib/libperf.map b/tools/perf/lib/libperf.map
index dc4d663..ab8dbde 100644
--- a/tools/perf/lib/libperf.map
+++ b/tools/perf/lib/libperf.map
@@ -1,6 +1,6 @@
 LIBPERF_0.0.1 {
 	global:
-		libperf_set_print;
+		libperf_init;
 		perf_cpu_map__dummy_new;
 		perf_cpu_map__get;
 		perf_cpu_map__put;
@@ -9,6 +9,7 @@
 		perf_cpu_map__nr;
 		perf_cpu_map__cpu;
 		perf_cpu_map__empty;
+		perf_cpu_map__max;
 		perf_thread_map__new_dummy;
 		perf_thread_map__set_pid;
 		perf_thread_map__comm;
@@ -38,6 +39,7 @@
 		perf_evlist__remove;
 		perf_evlist__next;
 		perf_evlist__set_maps;
+		perf_evlist__poll;
 	local:
 		*;
 };
diff --git a/tools/perf/lib/tests/test-cpumap.c b/tools/perf/lib/tests/test-cpumap.c
index 76a43cf..aa34c20d 100644
--- a/tools/perf/lib/tests/test-cpumap.c
+++ b/tools/perf/lib/tests/test-cpumap.c
@@ -1,13 +1,23 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <stdarg.h>
+#include <stdio.h>
 #include <perf/cpumap.h>
 #include <internal/tests.h>
 
+static int libperf_print(enum libperf_print_level level,
+			 const char *fmt, va_list ap)
+{
+	return vfprintf(stderr, fmt, ap);
+}
+
 int main(int argc, char **argv)
 {
 	struct perf_cpu_map *cpus;
 
 	__T_START;
 
+	libperf_init(libperf_print);
+
 	cpus = perf_cpu_map__dummy_new();
 	if (!cpus)
 		return -1;
diff --git a/tools/perf/lib/tests/test-evlist.c b/tools/perf/lib/tests/test-evlist.c
index 4e1407f..e6b2ab2 100644
--- a/tools/perf/lib/tests/test-evlist.c
+++ b/tools/perf/lib/tests/test-evlist.c
@@ -1,4 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdarg.h>
 #include <linux/perf_event.h>
 #include <perf/cpumap.h>
 #include <perf/threadmap.h>
@@ -6,6 +8,12 @@
 #include <perf/evsel.h>
 #include <internal/tests.h>
 
+static int libperf_print(enum libperf_print_level level,
+			 const char *fmt, va_list ap)
+{
+	return vfprintf(stderr, fmt, ap);
+}
+
 static int test_stat_cpu(void)
 {
 	struct perf_cpu_map *cpus;
@@ -177,6 +185,8 @@
 {
 	__T_START;
 
+	libperf_init(libperf_print);
+
 	test_stat_cpu();
 	test_stat_thread();
 	test_stat_thread_enable();
diff --git a/tools/perf/lib/tests/test-evsel.c b/tools/perf/lib/tests/test-evsel.c
index 2c648fe..1b6c428 100644
--- a/tools/perf/lib/tests/test-evsel.c
+++ b/tools/perf/lib/tests/test-evsel.c
@@ -1,10 +1,18 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <stdarg.h>
+#include <stdio.h>
 #include <linux/perf_event.h>
 #include <perf/cpumap.h>
 #include <perf/threadmap.h>
 #include <perf/evsel.h>
 #include <internal/tests.h>
 
+static int libperf_print(enum libperf_print_level level,
+			 const char *fmt, va_list ap)
+{
+	return vfprintf(stderr, fmt, ap);
+}
+
 static int test_stat_cpu(void)
 {
 	struct perf_cpu_map *cpus;
@@ -116,6 +124,8 @@
 {
 	__T_START;
 
+	libperf_init(libperf_print);
+
 	test_stat_cpu();
 	test_stat_thread();
 	test_stat_thread_enable();
diff --git a/tools/perf/lib/tests/test-threadmap.c b/tools/perf/lib/tests/test-threadmap.c
index 10a4f4c..8c5f472 100644
--- a/tools/perf/lib/tests/test-threadmap.c
+++ b/tools/perf/lib/tests/test-threadmap.c
@@ -1,13 +1,23 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <stdarg.h>
+#include <stdio.h>
 #include <perf/threadmap.h>
 #include <internal/tests.h>
 
+static int libperf_print(enum libperf_print_level level,
+			 const char *fmt, va_list ap)
+{
+	return vfprintf(stderr, fmt, ap);
+}
+
 int main(int argc, char **argv)
 {
 	struct perf_thread_map *threads;
 
 	__T_START;
 
+	libperf_init(libperf_print);
+
 	threads = perf_thread_map__new_dummy();
 	if (!threads)
 		return -1;
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 1193b923..27f94b0 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -12,6 +12,7 @@
 #include "util/build-id.h"
 #include "util/cache.h"
 #include "util/env.h"
+#include <internal/lib.h> // page_size
 #include <subcmd/exec-cmd.h>
 #include "util/config.h"
 #include <subcmd/run-command.h>
@@ -20,11 +21,12 @@
 #include "util/bpf-loader.h"
 #include "util/debug.h"
 #include "util/event.h"
-#include "util/util.h"
+#include "util/util.h" // usage()
 #include "ui/ui.h"
 #include "perf-sys.h"
 #include <api/fs/fs.h>
 #include <api/fs/tracing_path.h>
+#include <perf/core.h>
 #include <errno.h>
 #include <pthread.h>
 #include <signal.h>
@@ -428,6 +430,12 @@
 	pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 }
 
+static int libperf_print(enum libperf_print_level level,
+			 const char *fmt, va_list ap)
+{
+	return eprintf(level, verbose, fmt, ap);
+}
+
 int main(int argc, const char **argv)
 {
 	int err;
@@ -438,8 +446,7 @@
 	exec_cmd_init("perf", PREFIX, PERF_EXEC_PATH, EXEC_PATH_ENVIRONMENT);
 	pager_init(PERF_PAGER_ENVIRONMENT);
 
-	/* The page_size is placed in util object. */
-	page_size = sysconf(_SC_PAGE_SIZE);
+	libperf_init(libperf_print);
 
 	cmd = extract_argv0_path(argv[0]);
 	if (!cmd)
diff --git a/tools/perf/pmu-events/README b/tools/perf/pmu-events/README
index e62b09b..de7efa2 100644
--- a/tools/perf/pmu-events/README
+++ b/tools/perf/pmu-events/README
@@ -30,9 +30,9 @@
 All the topic JSON files for a CPU model/family should be in a separate
 sub directory. Thus for the Silvermont X86 CPU:
 
-	$ ls tools/perf/pmu-events/arch/x86/Silvermont_core
-	Cache.json 	Memory.json 	Virtual-Memory.json
-	Frontend.json 	Pipeline.json
+	$ ls tools/perf/pmu-events/arch/x86/silvermont
+	cache.json     memory.json    virtual-memory.json
+	frontend.json  pipeline.json
 
 The JSONs folder for a CPU model/family may be placed in the root arch
 folder, or may be placed in a vendor sub-folder under the arch folder
@@ -94,7 +94,7 @@
 
 where 'pm_1plus_ppc_cmpl' is a Power8 PMU event.
 
-However some errors in processing may cause the perf build to fail.
+However, some errors in processing may cause the alias build to fail.
 
 Mapfile format
 ===============
@@ -119,7 +119,7 @@
 
 	Header line
 		The header line is the first line in the file, which is
-		always _IGNORED_. It can empty.
+		always _IGNORED_. It can be empty.
 
 	CPUID:
 		CPUID is an arch-specific char string, that can be used
@@ -138,15 +138,15 @@
 		files, relative to the directory containing the mapfile.csv
 
 	Type:
-		indicates whether the events or "core" or "uncore" events.
+		indicates whether the events are "core" or "uncore" events.
 
 
 	Eg:
 
-	$ grep Silvermont tools/perf/pmu-events/arch/x86/mapfile.csv
-	GenuineIntel-6-37,V13,Silvermont_core,core
-	GenuineIntel-6-4D,V13,Silvermont_core,core
-	GenuineIntel-6-4C,V13,Silvermont_core,core
+	$ grep silvermont tools/perf/pmu-events/arch/x86/mapfile.csv
+	GenuineIntel-6-37,v13,silvermont,core
+	GenuineIntel-6-4D,v13,silvermont,core
+	GenuineIntel-6-4C,v13,silvermont,core
 
 	i.e the three CPU models use the JSON files (i.e PMU events) listed
-	in the directory 'tools/perf/pmu-events/arch/x86/Silvermont_core'.
+	in the directory 'tools/perf/pmu-events/arch/x86/silvermont'.
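
Each mapfile row is plain CSV in the order CPUID,Version,JSON-pathname,Type.
As an illustration only (jevents has its own parser; parse_mapfile_row is a
made-up helper), splitting one row might look like:

	#include <string.h>

	/* Split "CPUID,Version,JSON pathname,Type" in place.
	 * Returns 0 on success, -1 on a malformed row. */
	static int parse_mapfile_row(char *row, char **cpuid, char **version,
				     char **path, char **type)
	{
		*cpuid   = strtok(row, ",");
		*version = strtok(NULL, ",");
		*path    = strtok(NULL, ",");
		*type    = strtok(NULL, ",\n");
		return (*cpuid && *version && *path && *type) ? 0 : -1;
	}

For "GenuineIntel-6-37,v13,silvermont,core" this yields the CPUID string,
the version "v13", the events directory "silvermont" and the "core" type.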
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/branch.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/branch.json
new file mode 100644
index 0000000..b5e5d05
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/branch.json
@@ -0,0 +1,14 @@
+[
+    {
+        "PublicDescription": "Mispredicted or not predicted branch speculatively executed. This event counts any predictable branch instruction which is mispredicted either due to dynamic misprediction or because the MMU is off and the branches are statically predicted not taken.",
+        "EventCode": "0x10",
+        "EventName": "BR_MIS_PRED",
+        "BriefDescription": "Mispredicted or not predicted branch speculatively executed."
+    },
+    {
+        "PublicDescription": "Predictable branch speculatively executed. This event counts all predictable branches.",
+        "EventCode": "0x12",
+        "EventName": "BR_PRED",
+        "BriefDescription": "Predictable branch speculatively executed."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/bus.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/bus.json
new file mode 100644
index 0000000..fce7309
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/bus.json
@@ -0,0 +1,24 @@
+[
+    {
+        "EventCode": "0x11",
+        "EventName": "CPU_CYCLES",
+        "BriefDescription": "The number of core clock cycles."
+    },
+    {
+        "PublicDescription": "Bus access. This event counts for every beat of data transferred over the data channels between the core and the SCU. If both read and write data beats are transferred on a given cycle, this event is counted twice on that cycle. This event counts the sum of BUS_ACCESS_RD and BUS_ACCESS_WR.",
+        "EventCode": "0x19",
+        "EventName": "BUS_ACCESS",
+        "BriefDescription": "Bus access."
+    },
+    {
+        "EventCode": "0x1D",
+        "EventName": "BUS_CYCLES",
+        "BriefDescription": "Bus cycles. This event duplicates CPU_CYCLES."
+    },
+    {
+        "ArchStdEvent":  "BUS_ACCESS_RD"
+    },
+    {
+        "ArchStdEvent":  "BUS_ACCESS_WR"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/cache.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/cache.json
new file mode 100644
index 0000000..2459408
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/cache.json
@@ -0,0 +1,207 @@
+[
+    {
+        "PublicDescription": "L1 instruction cache refill. This event counts any instruction fetch which misses in the cache.",
+        "EventCode": "0x01",
+        "EventName": "L1I_CACHE_REFILL",
+        "BriefDescription": "L1 instruction cache refill"
+    },
+    {
+        "PublicDescription": "L1 instruction TLB refill. This event counts any refill of the instruction L1 TLB from the L2 TLB. This includes refills that result in a translation fault.",
+        "EventCode": "0x02",
+        "EventName": "L1I_TLB_REFILL",
+        "BriefDescription": "L1 instruction TLB refill"
+    },
+    {
+        "PublicDescription": "L1 data cache refill. This event counts any load or store operation or page table walk access which causes data to be read from outside the L1, including accesses which do not allocate into L1.",
+        "EventCode": "0x03",
+        "EventName": "L1D_CACHE_REFILL",
+        "BriefDescription": "L1 data cache refill"
+    },
+    {
+        "PublicDescription": "L1 data cache access. This event counts any load or store operation or page table walk access which looks up in the L1 data cache. In particular, any access which could count the L1D_CACHE_REFILL event causes this event to count.",
+        "EventCode": "0x04",
+        "EventName": "L1D_CACHE",
+        "BriefDescription": "L1 data cache access"
+    },
+    {
+        "PublicDescription": "L1 data TLB refill. This event counts any refill of the data L1 TLB from the L2 TLB. This includes refills that result in a translation fault.",
+        "EventCode": "0x05",
+        "EventName": "L1D_TLB_REFILL",
+        "BriefDescription": "L1 data TLB refill"
+    },
+    {
+        "PublicDescription": "Level 1 instruction cache access or Level 0 Macro-op cache access. This event counts any instruction fetch which accesses the L1 instruction cache or L0 Macro-op cache.",
+        "EventCode": "0x14",
+        "EventName": "L1I_CACHE",
+        "BriefDescription": "L1 instruction cache access"
+    },
+    {
+        "PublicDescription": "L1 data cache Write-Back. This event counts any write-back of data from the L1 data cache to L2 or L3. This counts both victim line evictions and snoops, including cache maintenance operations.",
+        "EventCode": "0x15",
+        "EventName": "L1D_CACHE_WB",
+        "BriefDescription": "L1 data cache Write-Back"
+    },
+    {
+        "PublicDescription": "L2 data cache access. This event counts any transaction from L1 which looks up in the L2 cache, and any write-back from the L1 to the L2. Snoops from outside the core and cache maintenance operations are not counted.",
+        "EventCode": "0x16",
+        "EventName": "L2D_CACHE",
+        "BriefDescription": "L2 data cache access"
+    },
+    {
+        "PublicDescription": "L2 data cache refill. This event counts any cacheable transaction from L1 which causes data to be read from outside the core. L2 refills caused by stashes into L2 should not be counted.",
+        "EventCode": "0x17",
+        "EventName": "L2D_CACHE_REFILL",
+        "BriefDescription": "L2 data cache refill"
+    },
+    {
+        "PublicDescription": "L2 data cache write-back. This event counts any write-back of data from the L2 cache to outside the core. This includes snoops to the L2 which return data, regardless of whether they cause an invalidation. Invalidations from the L2 which do not write data outside of the core and snoops which return data from the L1 are not counted.",
+        "EventCode": "0x18",
+        "EventName": "L2D_CACHE_WB",
+        "BriefDescription": "L2 data cache write-back"
+    },
+    {
+        "PublicDescription": "L2 data cache allocation without refill. This event counts any full cache line write into the L2 cache which does not cause a linefill, including write-backs from L1 to L2 and full-line writes which do not allocate into L1.",
+        "EventCode": "0x20",
+        "EventName": "L2D_CACHE_ALLOCATE",
+        "BriefDescription": "L2 data cache allocation without refill"
+    },
+    {
+        "PublicDescription": "Level 1 data TLB access. This event counts any load or store operation which accesses the data L1 TLB. If both a load and a store are executed on a cycle, this event counts twice. This event counts regardless of whether the MMU is enabled.",
+        "EventCode": "0x25",
+        "EventName": "L1D_TLB",
+        "BriefDescription": "Level 1 data TLB access."
+    },
+    {
+        "PublicDescription": "Level 1 instruction TLB access. This event counts any instruction fetch which accesses the instruction L1 TLB. This event counts regardless of whether the MMU is enabled.",
+        "EventCode": "0x26",
+        "EventName": "L1I_TLB",
+        "BriefDescription": "Level 1 instruction TLB access"
+    },
+    {
+        "PublicDescription": "This event counts any full cache line write into the L3 cache which does not cause a linefill, including write-backs from L2 to L3 and full-line writes which do not allocate into L2.",
+        "EventCode": "0x29",
+        "EventName": "L3D_CACHE_ALLOCATE",
+        "BriefDescription": "Allocation without refill"
+    },
+    {
+        "PublicDescription": "Attributable Level 3 unified cache refill. This event counts for any cacheable read transaction returning data from the SCU for which the data source was outside the cluster. Transactions such as ReadUnique are counted here as 'read' transactions, even though they can be generated by store instructions.",
+        "EventCode": "0x2A",
+        "EventName": "L3D_CACHE_REFILL",
+        "BriefDescription": "Attributable Level 3 unified cache refill."
+    },
+    {
+        "PublicDescription": "Attributable Level 3 unified cache access. This event counts for any cacheable read transaction returning data from the SCU, or for any cacheable write to the SCU.",
+        "EventCode": "0x2B",
+        "EventName": "L3D_CACHE",
+        "BriefDescription": "Attributable Level 3 unified cache access."
+    },
+    {
+        "PublicDescription": "Attributable L2 data or unified TLB refill. This event counts on any refill of the L2 TLB, caused by either an instruction or data access. This event does not count if the MMU is disabled.",
+        "EventCode": "0x2D",
+        "EventName": "L2D_TLB_REFILL",
+        "BriefDescription": "Attributable L2 data or unified TLB refill"
+    },
+    {
+        "PublicDescription": "Attributable L2 data or unified TLB access. This event counts on any access to the L2 TLB (caused by a refill of any of the L1 TLBs). This event does not count if the MMU is disabled.",
+        "EventCode": "0x2F",
+        "EventName": "L2D_TLB",
+        "BriefDescription": "Attributable L2 data or unified TLB access"
+    },
+    {
+        "PublicDescription": "Access to data TLB that caused a page table walk. This event counts on any data access which causes L2D_TLB_REFILL to count.",
+        "EventCode": "0x34",
+        "EventName": "DTLB_WALK",
+        "BriefDescription": "Access to data TLB that caused a page table walk."
+    },
+    {
+        "PublicDescription": "Access to instruction TLB that caused a page table walk. This event counts on any instruction access which causes L2D_TLB_REFILL to count.",
+        "EventCode": "0x35",
+        "EventName": "ITLB_WALK",
+        "BriefDescription": "Access to instruction TLB that caused a page table walk."
+    },
+    {
+        "EventCode": "0x36",
+        "EventName": "LL_CACHE_RD",
+        "BriefDescription": "Last level cache access, read"
+    },
+    {
+        "EventCode": "0x37",
+        "EventName": "LL_CACHE_MISS_RD",
+        "BriefDescription": "Last level cache miss, read"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_INVAL"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_RD"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_INNER"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_OUTER"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_RD"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_WR"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_WB_CLEAN"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_WB_VICTIM"
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_WR"
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_RD"
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_REFILL_RD"
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_REFILL_WR"
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_WR"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_INVAL"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_RD"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_REFILL_RD"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_REFILL_WR"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_WB_CLEAN"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_WB_VICTIM"
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_WR"
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_RD"
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_REFILL_RD"
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_REFILL_WR"
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_WR"
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_RD"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/exception.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/exception.json
new file mode 100644
index 0000000..98d29c8
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/exception.json
@@ -0,0 +1,52 @@
+[
+    {
+        "EventCode": "0x09",
+        "EventName": "EXC_TAKEN",
+        "BriefDescription": "Exception taken."
+    },
+    {
+        "PublicDescription": "Local memory error. This event counts any correctable or uncorrectable memory error (ECC or parity) in the protected core RAMs.",
+        "EventCode": "0x1A",
+        "EventName": "MEMORY_ERROR",
+        "BriefDescription": "Local memory error."
+    },
+    {
+        "ArchStdEvent": "EXC_DABORT"
+    },
+    {
+        "ArchStdEvent": "EXC_FIQ"
+    },
+    {
+        "ArchStdEvent": "EXC_HVC"
+    },
+    {
+        "ArchStdEvent": "EXC_IRQ"
+    },
+    {
+        "ArchStdEvent": "EXC_PABORT"
+    },
+    {
+        "ArchStdEvent": "EXC_SMC"
+    },
+    {
+        "ArchStdEvent": "EXC_SVC"
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_DABORT"
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_FIQ"
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_IRQ"
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_OTHER"
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_PABORT"
+    },
+    {
+        "ArchStdEvent": "EXC_UNDEF"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/instruction.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/instruction.json
new file mode 100644
index 0000000..c153ac7
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/instruction.json
@@ -0,0 +1,108 @@
+[
+    {
+        "PublicDescription": "Software increment. Instruction architecturally executed (condition code check pass).",
+        "EventCode": "0x00",
+        "EventName": "SW_INCR",
+        "BriefDescription": "Software increment."
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed. This event counts all retired instructions, including those that fail their condition check.",
+        "EventCode": "0x08",
+        "EventName": "INST_RETIRED",
+        "BriefDescription": "Instruction architecturally executed."
+    },
+    {
+        "EventCode": "0x0A",
+        "EventName": "EXC_RETURN",
+        "BriefDescription": "Instruction architecturally executed, condition code check pass, exception return."
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed, condition code check pass, write to CONTEXTIDR. This event only counts writes to CONTEXTIDR in AArch32 state, and via the CONTEXTIDR_EL1 mnemonic in AArch64 state.",
+        "EventCode": "0x0B",
+        "EventName": "CID_WRITE_RETIRED",
+        "BriefDescription": "Instruction architecturally executed, condition code check pass, write to CONTEXTIDR."
+    },
+    {
+        "EventCode": "0x1B",
+        "EventName": "INST_SPEC",
+        "BriefDescription": "Operation speculatively executed"
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed, condition code check pass, write to TTBR. This event only counts writes to TTBR0/TTBR1 in AArch32 state and TTBR0_EL1/TTBR1_EL1 in AArch64 state.",
+        "EventCode": "0x1C",
+        "EventName": "TTBR_WRITE_RETIRED",
+        "BriefDescription": "Instruction architecturally executed, condition code check pass, write to TTBR"
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed, branch. This event counts all branches, taken or not. This excludes exception entries, debug entries and CCFAIL branches.",
+        "EventCode": "0x21",
+        "EventName": "BR_RETIRED",
+        "BriefDescription": "Instruction architecturally executed, branch."
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed, mispredicted branch. This event counts any branch counted by BR_RETIRED which is not correctly predicted and causes a pipeline flush.",
+        "EventCode": "0x22",
+        "EventName": "BR_MIS_PRED_RETIRED",
+        "BriefDescription": "Instruction architecturally executed, mispredicted branch."
+    },
+    {
+        "ArchStdEvent": "ASE_SPEC"
+    },
+    {
+        "ArchStdEvent": "BR_IMMED_SPEC"
+    },
+    {
+        "ArchStdEvent": "BR_INDIRECT_SPEC"
+    },
+    {
+        "ArchStdEvent": "BR_RETURN_SPEC"
+    },
+    {
+        "ArchStdEvent": "CRYPTO_SPEC"
+    },
+    {
+        "ArchStdEvent": "DMB_SPEC"
+    },
+    {
+        "ArchStdEvent": "DP_SPEC"
+    },
+    {
+        "ArchStdEvent": "DSB_SPEC"
+    },
+    {
+        "ArchStdEvent": "ISB_SPEC"
+    },
+    {
+        "ArchStdEvent": "LDREX_SPEC"
+    },
+    {
+        "ArchStdEvent": "LDST_SPEC"
+    },
+    {
+        "ArchStdEvent": "LD_SPEC"
+    },
+    {
+        "ArchStdEvent": "PC_WRITE_SPEC"
+    },
+    {
+        "ArchStdEvent": "RC_LD_SPEC"
+    },
+    {
+        "ArchStdEvent": "RC_ST_SPEC"
+    },
+    {
+        "ArchStdEvent": "STREX_FAIL_SPEC"
+    },
+    {
+        "ArchStdEvent": "STREX_PASS_SPEC"
+    },
+    {
+        "ArchStdEvent": "STREX_SPEC"
+    },
+    {
+        "ArchStdEvent": "ST_SPEC"
+    },
+    {
+        "ArchStdEvent": "VFP_SPEC"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json
new file mode 100644
index 0000000..b866432
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json
@@ -0,0 +1,23 @@
+[
+    {
+        "PublicDescription": "Data memory access. This event counts memory accesses due to load or store instructions. This event counts the sum of MEM_ACCESS_RD and MEM_ACCESS_WR.",
+        "EventCode": "0x13",
+        "EventName": "MEM_ACCESS",
+        "BriefDescription": "Data memory access"
+    },
+    {
+         "ArchStdEvent": "MEM_ACCESS_RD"
+    },
+    {
+         "ArchStdEvent": "MEM_ACCESS_WR"
+    },
+    {
+         "ArchStdEvent": "UNALIGNED_LD_SPEC"
+    },
+    {
+         "ArchStdEvent": "UNALIGNED_ST_SPEC"
+    },
+    {
+         "ArchStdEvent": "UNALIGNED_LDST_SPEC"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json
new file mode 100644
index 0000000..8bde029
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json
@@ -0,0 +1,7 @@
+[
+    {
+        "EventCode": "0x31",
+        "EventName": "REMOTE_ACCESS",
+        "BriefDescription": "Access to another socket in a multi-socket system"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/pipeline.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/pipeline.json
new file mode 100644
index 0000000..010a647
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/pipeline.json
@@ -0,0 +1,14 @@
+[
+    {
+        "PublicDescription": "No operation issued because of the frontend. The counter counts on any cycle when there are no fetched instructions available to dispatch.",
+        "EventCode": "0x23",
+        "EventName": "STALL_FRONTEND",
+        "BriefDescription": "No operation issued because of the frontend."
+    },
+    {
+        "PublicDescription": "No operation issued because of the backend. The counter counts on any cycle when fetched instructions are not dispatched due to resource constraints.",
+        "EventCode": "0x24",
+        "EventName": "STALL_BACKEND",
+        "BriefDescription": "No operation issued because of the backend."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv
index 927fcdd..0d60914 100644
--- a/tools/perf/pmu-events/arch/arm64/mapfile.csv
+++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv
@@ -16,6 +16,8 @@
 0x00000000420f1000,v1,arm/cortex-a53,core
 0x00000000410fd070,v1,arm/cortex-a57-a72,core
 0x00000000410fd080,v1,arm/cortex-a57-a72,core
+0x00000000410fd0b0,v1,arm/cortex-a76-n1,core
+0x00000000410fd0c0,v1,arm/cortex-a76-n1,core
 0x00000000420f5160,v1,cavium/thunderx2,core
 0x00000000430f0af0,v1,cavium/thunderx2,core
 0x00000000480fd010,v1,hisilicon/hip08,core
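
The first column of this mapfile is the CPU's MIDR value with the variant
and revision fields zeroed. A small illustrative program decoding the two
new rows (bit layout per the Arm ARM: implementer in bits [31:24], part
number in bits [15:4]):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t midrs[] = { 0x00000000410fd0b0ULL,	/* Cortex-A76 */
				     0x00000000410fd0c0ULL };	/* Neoverse N1 */

		for (unsigned int i = 0; i < 2; i++) {
			unsigned int imp  = (midrs[i] >> 24) & 0xff; /* 0x41 = Arm */
			unsigned int part = (midrs[i] >> 4) & 0xfff; /* 0xd0b, 0xd0c */

			printf("implementer=0x%x part=0x%x\n", imp, part);
		}
		return 0;
	}

Both parts therefore share the arm/cortex-a76-n1 event directory added above.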
diff --git a/tools/perf/pmu-events/arch/powerpc/power8/other.json b/tools/perf/pmu-events/arch/powerpc/power8/other.json
index 9dc2f6b7..b2a3df0 100644
--- a/tools/perf/pmu-events/arch/powerpc/power8/other.json
+++ b/tools/perf/pmu-events/arch/powerpc/power8/other.json
@@ -1776,30 +1776,6 @@
     "PublicDescription": ""
   },
   {,
-    "EventCode": "0xa29084",
-    "EventName": "PM_L3_P0_GRP_PUMP",
-    "BriefDescription": "L3 pf sent with grp scope port 0",
-    "PublicDescription": ""
-  },
-  {,
-    "EventCode": "0x528084",
-    "EventName": "PM_L3_P0_LCO_DATA",
-    "BriefDescription": "lco sent with data port 0",
-    "PublicDescription": ""
-  },
-  {,
-    "EventCode": "0x518080",
-    "EventName": "PM_L3_P0_LCO_NO_DATA",
-    "BriefDescription": "dataless l3 lco sent port 0",
-    "PublicDescription": ""
-  },
-  {,
-    "EventCode": "0xa4908c",
-    "EventName": "PM_L3_P0_LCO_RTY",
-    "BriefDescription": "L3 LCO received retry port 0",
-    "PublicDescription": ""
-  },
-  {,
     "EventCode": "0x84908d",
     "EventName": "PM_L3_PF0_ALLOC",
     "BriefDescription": "lifetime, sample of PF machine 0 valid",
diff --git a/tools/perf/pmu-events/arch/x86/amdfam17h/cache.json b/tools/perf/pmu-events/arch/x86/amdfam17h/cache.json
index fad4af9..6221a84 100644
--- a/tools/perf/pmu-events/arch/x86/amdfam17h/cache.json
+++ b/tools/perf/pmu-events/arch/x86/amdfam17h/cache.json
@@ -283,5 +283,47 @@
     "BriefDescription": "Total cycles spent with one or more fill requests in flight from L2.",
     "PublicDescription": "Total cycles spent with one or more fill requests in flight from L2.",
     "UMask": "0x1"
+  },
+  {
+    "EventName": "l3_request_g1.caching_l3_cache_accesses",
+    "EventCode": "0x01",
+    "BriefDescription": "Caching: L3 cache accesses",
+    "UMask": "0x80",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_lookup_state.all_l3_req_typs",
+    "EventCode": "0x04",
+    "BriefDescription": "All L3 Request Types",
+    "UMask": "0xff",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_comb_clstr_state.other_l3_miss_typs",
+    "EventCode": "0x06",
+    "BriefDescription": "Other L3 Miss Request Types",
+    "UMask": "0xfe",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_comb_clstr_state.request_miss",
+    "EventCode": "0x06",
+    "BriefDescription": "L3 cache misses",
+    "UMask": "0x01",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "xi_sys_fill_latency",
+    "EventCode": "0x90",
+    "BriefDescription": "L3 Cache Miss Latency. Total cycles for all transactions divided by 16. Ignores SliceMask and ThreadMask.",
+    "UMask": "0x00",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "xi_ccx_sdp_req1.all_l3_miss_req_typs",
+    "EventCode": "0x9a",
+    "BriefDescription": "All L3 Miss Request Types. Ignores SliceMask and ThreadMask.",
+    "UMask": "0x3f",
+    "Unit": "L3PMC"
   }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/amdfam17h/core.json b/tools/perf/pmu-events/arch/x86/amdfam17h/core.json
index 7b285b0..1079544 100644
--- a/tools/perf/pmu-events/arch/x86/amdfam17h/core.json
+++ b/tools/perf/pmu-events/arch/x86/amdfam17h/core.json
@@ -13,7 +13,7 @@
   {
     "EventName": "ex_ret_brn",
     "EventCode": "0xc2",
-    "BriefDescription": "[Retired Branch Instructions.",
+    "BriefDescription": "Retired Branch Instructions.",
     "PublicDescription": "The number of branch instructions retired. This includes all types of architectural control flow changes, including exceptions and interrupts."
   },
   {
diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c
index d413761..9e37287 100644
--- a/tools/perf/pmu-events/jevents.c
+++ b/tools/perf/pmu-events/jevents.c
@@ -239,6 +239,7 @@
 	{ "hisi_sccl,ddrc", "hisi_sccl,ddrc" },
 	{ "hisi_sccl,hha", "hisi_sccl,hha" },
 	{ "hisi_sccl,l3c", "hisi_sccl,l3c" },
+	{ "L3PMC", "amd_l3" },
 	{}
 };
 
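The new table entry maps the "Unit" string found in the event JSON (here
"L3PMC") to the PMU name the kernel exposes ("amd_l3"). A hedged sketch of
such a lookup follows; the unit_map/unit_to_perf_pmu names are illustrative,
not necessarily jevents' own:

	#include <stddef.h>
	#include <strings.h>

	struct unit_map {
		const char *json;	/* "Unit" value in the event JSON */
		const char *perf;	/* PMU name exposed by the kernel */
	};

	static const struct unit_map unit_to_pmu[] = {
		{ "L3PMC", "amd_l3" },
		{ NULL, NULL },
	};

	/* Return the kernel PMU name for a JSON Unit, or NULL if unmapped. */
	static const char *unit_to_perf_pmu(const char *unit)
	{
		for (int i = 0; unit_to_pmu[i].json; i++)
			if (!strcasecmp(unit_to_pmu[i].json, unit))
				return unit_to_pmu[i].perf;
		return NULL;
	}

With this entry in place, the amdfam17h L3 events added earlier resolve
against the amd_l3 PMU and become addressable by name from perf.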
diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c
index a637a4a..338cd9f 100644
--- a/tools/perf/tests/backward-ring-buffer.c
+++ b/tools/perf/tests/backward-ring-buffer.c
@@ -10,6 +10,7 @@
 #include "tests.h"
 #include "debug.h"
 #include "parse-events.h"
+#include "util/mmap.h"
 #include <errno.h>
 #include <linux/string.h>
 
@@ -32,8 +33,8 @@
 {
 	int i;
 
-	for (i = 0; i < evlist->nr_mmaps; i++) {
-		struct perf_mmap *map = &evlist->overwrite_mmap[i];
+	for (i = 0; i < evlist->core.nr_mmaps; i++) {
+		struct mmap *map = &evlist->overwrite_mmap[i];
 		union perf_event *event;
 
 		perf_mmap__read_init(map);
@@ -63,9 +64,9 @@
 	int err;
 	char sbuf[STRERR_BUFSIZE];
 
-	err = perf_evlist__mmap(evlist, mmap_pages);
+	err = evlist__mmap(evlist, mmap_pages);
 	if (err < 0) {
-		pr_debug("perf_evlist__mmap: %s\n",
+		pr_debug("evlist__mmap: %s\n",
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
 		return TEST_FAIL;
 	}
@@ -75,7 +76,7 @@
 	evlist__disable(evlist);
 
 	err = count_samples(evlist, sample_count, comm_count);
-	perf_evlist__munmap(evlist);
+	evlist__munmap(evlist);
 	return err;
 }
 
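The ring-buffer walk renamed in this hunk recurs in the tests that follow;
below is a condensed sketch of the loop under the new names (struct mmap,
evlist->core.nr_mmaps). It assumes perf's internal headers and the
perf_mmap__*() API seen in these hunks; drain_events is a made-up name.

	/* Drain every mmap'ed ring buffer, returning the events seen. */
	static int drain_events(struct evlist *evlist)
	{
		union perf_event *event;
		int i, n = 0;

		for (i = 0; i < evlist->core.nr_mmaps; i++) {
			struct mmap *map = &evlist->mmap[i];

			if (perf_mmap__read_init(map) < 0)
				continue;	/* this buffer is empty */

			while ((event = perf_mmap__read_event(map)) != NULL) {
				n++;	/* a real test inspects event->header.type */
				perf_mmap__consume(map);
			}
			perf_mmap__read_done(map);
		}
		return n;
	}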
diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c
index db2aadf..96c1373 100644
--- a/tools/perf/tests/bitmap.c
+++ b/tools/perf/tests/bitmap.c
@@ -2,8 +2,8 @@
 #include <linux/compiler.h>
 #include <linux/bitmap.h>
 #include <perf/cpumap.h>
+#include <internal/cpumap.h>
 #include "tests.h"
-#include "cpumap.h"
 #include "debug.h"
 
 #define NBITS 100
diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c
index fc102e4..1eb0bff 100644
--- a/tools/perf/tests/bpf.c
+++ b/tools/perf/tests/bpf.c
@@ -19,6 +19,7 @@
 #include "llvm.h"
 #include "debug.h"
 #include "parse-events.h"
+#include "util/mmap.h"
 #define NR_ITERS       111
 #define PERF_TEST_BPF_PATH "/sys/fs/bpf/perf_test"
 
@@ -167,9 +168,9 @@
 		goto out_delete_evlist;
 	}
 
-	err = perf_evlist__mmap(evlist, opts.mmap_pages);
+	err = evlist__mmap(evlist, opts.mmap_pages);
 	if (err < 0) {
-		pr_debug("perf_evlist__mmap: %s\n",
+		pr_debug("evlist__mmap: %s\n",
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
@@ -178,9 +179,9 @@
 	(*func)();
 	evlist__disable(evlist);
 
-	for (i = 0; i < evlist->nr_mmaps; i++) {
+	for (i = 0; i < evlist->core.nr_mmaps; i++) {
 		union perf_event *event;
-		struct perf_mmap *md;
+		struct mmap *md;
 
 		md = &evlist->mmap[i];
 		if (perf_mmap__read_init(md) < 0)
diff --git a/tools/perf/tests/clang.c b/tools/perf/tests/clang.c
index f45fe11..2577d3e 100644
--- a/tools/perf/tests/clang.c
+++ b/tools/perf/tests/clang.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "tests.h"
-#include "debug.h"
-#include "util.h"
 #include "c++/clang-c.h"
 #include <linux/kernel.h>
 
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index c1c29e0..f5764a3 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -19,12 +19,13 @@
 #include "evlist.h"
 #include "evsel.h"
 #include "thread_map.h"
-#include "cpumap.h"
 #include "machine.h"
 #include "map.h"
 #include "symbol.h"
 #include "event.h"
 #include "record.h"
+#include "util/mmap.h"
+#include "util/synthetic-events.h"
 #include "thread.h"
 
 #include "tests.h"
@@ -419,10 +420,10 @@
 			  struct state *state)
 {
 	union perf_event *event;
-	struct perf_mmap *md;
+	struct mmap *md;
 	int i, ret;
 
-	for (i = 0; i < evlist->nr_mmaps; i++) {
+	for (i = 0; i < evlist->core.nr_mmaps; i++) {
 		md = &evlist->mmap[i];
 		if (perf_mmap__read_init(md) < 0)
 			continue;
@@ -651,7 +652,7 @@
 
 		perf_evlist__config(evlist, &opts, NULL);
 
-		evsel = perf_evlist__first(evlist);
+		evsel = evlist__first(evlist);
 
 		evsel->core.attr.comm = 1;
 		evsel->core.attr.disabled = 1;
@@ -685,9 +686,9 @@
 		break;
 	}
 
-	ret = perf_evlist__mmap(evlist, UINT_MAX);
+	ret = evlist__mmap(evlist, UINT_MAX);
 	if (ret < 0) {
-		pr_debug("perf_evlist__mmap failed\n");
+		pr_debug("evlist__mmap failed\n");
 		goto out_put;
 	}
 
diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c
index 39493de..8a0d236 100644
--- a/tools/perf/tests/cpumap.c
+++ b/tools/perf/tests/cpumap.c
@@ -3,6 +3,7 @@
 #include <stdio.h>
 #include "cpumap.h"
 #include "event.h"
+#include "util/synthetic-events.h"
 #include <string.h>
 #include <linux/bitops.h>
 #include <perf/cpumap.h>
diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c
index a4874d4..627c1aa 100644
--- a/tools/perf/tests/dso-data.c
+++ b/tools/perf/tests/dso-data.c
@@ -10,7 +10,6 @@
 #include <sys/resource.h>
 #include <api/fs/fs.h>
 #include "dso.h"
-#include "util.h"
 #include "machine.h"
 #include "symbol.h"
 #include "tests.h"
diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c
index 4125255..4f4ecbc 100644
--- a/tools/perf/tests/dwarf-unwind.c
+++ b/tools/perf/tests/dwarf-unwind.c
@@ -15,6 +15,7 @@
 #include "symbol.h"
 #include "thread.h"
 #include "callchain.h"
+#include "util/synthetic-events.h"
 
 #if defined (__x86_64__) || defined (__i386__) || defined (__powerpc__)
 #include "arch-tests.h"
diff --git a/tools/perf/tests/event-times.c b/tools/perf/tests/event-times.c
index d824a72..1ee8704 100644
--- a/tools/perf/tests/event-times.c
+++ b/tools/perf/tests/event-times.c
@@ -9,7 +9,6 @@
 #include "tests.h"
 #include "evlist.h"
 #include "evsel.h"
-#include "util.h"
 #include "debug.h"
 #include "parse-events.h"
 #include "thread_map.h"
@@ -17,7 +16,7 @@
 
 static int attach__enable_on_exec(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__last(evlist);
+	struct evsel *evsel = evlist__last(evlist);
 	struct target target = {
 		.uid = UINT_MAX,
 	};
@@ -59,7 +58,7 @@
 
 static int attach__current_disabled(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__last(evlist);
+	struct evsel *evsel = evlist__last(evlist);
 	struct perf_thread_map *threads;
 	int err;
 
@@ -85,7 +84,7 @@
 
 static int attach__current_enabled(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__last(evlist);
+	struct evsel *evsel = evlist__last(evlist);
 	struct perf_thread_map *threads;
 	int err;
 
@@ -105,14 +104,14 @@
 
 static int detach__disable(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__last(evlist);
+	struct evsel *evsel = evlist__last(evlist);
 
 	return evsel__enable(evsel);
 }
 
 static int attach__cpu_disabled(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__last(evlist);
+	struct evsel *evsel = evlist__last(evlist);
 	struct perf_cpu_map *cpus;
 	int err;
 
@@ -141,7 +140,7 @@
 
 static int attach__cpu_enabled(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__last(evlist);
+	struct evsel *evsel = evlist__last(evlist);
 	struct perf_cpu_map *cpus;
 	int err;
 
@@ -181,7 +180,7 @@
 		goto out_err;
 	}
 
-	evsel = perf_evlist__last(evlist);
+	evsel = evlist__last(evlist);
 	evsel->core.attr.read_format |=
 		PERF_FORMAT_TOTAL_TIME_ENABLED |
 		PERF_FORMAT_TOTAL_TIME_RUNNING;
diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c
index cac4290..c727379 100644
--- a/tools/perf/tests/event_update.c
+++ b/tools/perf/tests/event_update.c
@@ -2,10 +2,12 @@
 #include <linux/compiler.h>
 #include <perf/cpumap.h>
 #include <string.h>
+#include "cpumap.h"
 #include "evlist.h"
 #include "evsel.h"
 #include "header.h"
 #include "machine.h"
+#include "util/synthetic-events.h"
 #include "tool.h"
 #include "tests.h"
 #include "debug.h"
@@ -90,12 +92,12 @@
 	evlist = perf_evlist__new_default();
 	TEST_ASSERT_VAL("failed to get evlist", evlist);
 
-	evsel = perf_evlist__first(evlist);
+	evsel = evlist__first(evlist);
 
-	TEST_ASSERT_VAL("failed to allos ids",
-			!perf_evsel__alloc_id(evsel, 1, 1));
+	TEST_ASSERT_VAL("failed to allocate ids",
+			!perf_evsel__alloc_id(&evsel->core, 1, 1));
 
-	perf_evlist__id_add(evlist, evsel, 0, 0, 123);
+	perf_evlist__id_add(&evlist->core, &evsel->core, 0, 0, 123);
 
 	evsel->unit = strdup("KRAVA");
 
diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c
index 5330f10..956205b 100644
--- a/tools/perf/tests/evsel-roundtrip-name.c
+++ b/tools/perf/tests/evsel-roundtrip-name.c
@@ -34,7 +34,7 @@
 	}
 
 	idx = 0;
-	evsel = perf_evlist__first(evlist);
+	evsel = evlist__first(evlist);
 
 	for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) {
 		for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) {
diff --git a/tools/perf/tests/hists_common.c b/tools/perf/tests/hists_common.c
index de110d8..6f34d08 100644
--- a/tools/perf/tests/hists_common.c
+++ b/tools/perf/tests/hists_common.c
@@ -2,6 +2,7 @@
 #include <inttypes.h>
 #include "util/debug.h"
 #include "util/dso.h"
+#include "util/event.h" // struct perf_sample
 #include "util/map.h"
 #include "util/symbol.h"
 #include "util/sort.h"
@@ -10,6 +11,7 @@
 #include "util/thread.h"
 #include "tests/hists_common.h"
 #include <linux/kernel.h>
+#include <linux/perf_event.h>
 
 static struct {
 	u32 pid;
diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c
index fa55b7b..6367c8f 100644
--- a/tools/perf/tests/hists_cumulate.c
+++ b/tools/perf/tests/hists_cumulate.c
@@ -721,7 +721,7 @@
 	if (verbose > 1)
 		machine__fprintf(machine, stderr);
 
-	evsel = perf_evlist__first(evlist);
+	evsel = evlist__first(evlist);
 
 	for (i = 0; i < ARRAY_SIZE(testcases); i++) {
 		err = testcases[i](evsel, machine);
diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c
index 8be4d0b..a024d3f 100644
--- a/tools/perf/tests/hists_link.c
+++ b/tools/perf/tests/hists_link.c
@@ -8,6 +8,7 @@
 #include "machine.h"
 #include "parse-events.h"
 #include "hists_common.h"
+#include "util/mmap.h"
 #include <errno.h>
 #include <linux/kernel.h>
 
@@ -310,8 +311,8 @@
 			print_hists_in(hists);
 	}
 
-	first = perf_evlist__first(evlist);
-	evsel = perf_evlist__last(evlist);
+	first = evlist__first(evlist);
+	evsel = evlist__last(evlist);
 
 	first_hists = evsel__hists(first);
 	hists = evsel__hists(evsel);
diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c
index 3f6dfa2..38f804f 100644
--- a/tools/perf/tests/hists_output.c
+++ b/tools/perf/tests/hists_output.c
@@ -608,7 +608,7 @@
 	if (verbose > 1)
 		machine__fprintf(machine, stderr);
 
-	evsel = perf_evlist__first(evlist);
+	evsel = evlist__first(evlist);
 
 	for (i = 0; i < ARRAY_SIZE(testcases); i++) {
 		err = testcases[i](evsel, machine);
diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c
index 9f0762d..92c7d59 100644
--- a/tools/perf/tests/keep-tracking.c
+++ b/tools/perf/tests/keep-tracking.c
@@ -12,8 +12,8 @@
 #include "evsel.h"
 #include "record.h"
 #include "thread_map.h"
-#include "cpumap.h"
 #include "tests.h"
+#include "util/mmap.h"
 
 #define CHECK__(x) {				\
 	while ((x) < 0) {			\
@@ -32,11 +32,11 @@
 static int find_comm(struct evlist *evlist, const char *comm)
 {
 	union perf_event *event;
-	struct perf_mmap *md;
+	struct mmap *md;
 	int i, found;
 
 	found = 0;
-	for (i = 0; i < evlist->nr_mmaps; i++) {
+	for (i = 0; i < evlist->core.nr_mmaps; i++) {
 		md = &evlist->mmap[i];
 		if (perf_mmap__read_init(md) < 0)
 			continue;
@@ -93,7 +93,7 @@
 
 	perf_evlist__config(evlist, &opts, NULL);
 
-	evsel = perf_evlist__first(evlist);
+	evsel = evlist__first(evlist);
 
 	evsel->core.attr.comm = 1;
 	evsel->core.attr.disabled = 1;
@@ -105,7 +105,7 @@
 		goto out_err;
 	}
 
-	CHECK__(perf_evlist__mmap(evlist, UINT_MAX));
+	CHECK__(evlist__mmap(evlist, UINT_MAX));
 
 	/*
 	 * First, test that a 'comm' event can be found when the event is
@@ -132,7 +132,7 @@
 
 	evlist__enable(evlist);
 
-	evsel = perf_evlist__last(evlist);
+	evsel = evlist__last(evlist);
 
 	CHECK__(evsel__disable(evsel));
 
@@ -143,7 +143,7 @@
 
 	found = find_comm(evlist, comm);
 	if (found != 1) {
-		pr_debug("Seconf time, failed to find tracking event.\n");
+		pr_debug("Second time, failed to find tracking event.\n");
 		goto out_err;
 	}
 
diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c
index 022e4c9..ae6cda8 100644
--- a/tools/perf/tests/llvm.c
+++ b/tools/perf/tests/llvm.c
@@ -7,7 +7,6 @@
 #include "llvm.h"
 #include "tests.h"
 #include "debug.h"
-#include "util.h"
 
 #ifdef HAVE_LIBBPF_SUPPORT
 static int test__bpf_parsing(void *obj_buf, size_t obj_buf_sz)
diff --git a/tools/perf/tests/make b/tools/perf/tests/make
index 70c4847..c850d16 100644
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -100,7 +100,7 @@
 make_install_pdf    := install-pdf
 make_install_prefix       := install prefix=/tmp/krava
 make_install_prefix_slash := install prefix=/tmp/krava/
-make_static         := LDFLAGS=-static
+make_static         := LDFLAGS=-static NO_PERF_READ_VDSO32=1 NO_PERF_READ_VDSOX32=1 NO_JVMTI=1
 
 # all the NO_* variable combined
 make_minimal        := NO_LIBPERL=1 NO_LIBPYTHON=1 NO_NEWT=1 NO_GTK2=1
@@ -327,6 +327,10 @@
 	(make -C ../../tools $(PARALLEL_OPT) $(K_O_OPT) perf) > $@ 2>&1 && \
 	test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false)
 
+make_libperf:
+	@echo "- make -C lib";
+	make -C lib clean >$@ 2>&1; make -C lib >>$@ 2>&1 && rm $@
+
 FEATURES_DUMP_FILE := $(FULL_O)/BUILD_TEST_FEATURE_DUMP
 FEATURES_DUMP_FILE_STATIC := $(FULL_O)/BUILD_TEST_FEATURE_DUMP_STATIC
 
@@ -365,5 +369,5 @@
 			$(eval $(t) := $($(t)) FEATURES_DUMP=$(FEATURES_DUMP_FILE))))
 endif
 
-.PHONY: all $(run) $(run_O) tarpkg clean make_kernelsrc make_kernelsrc_tools
+.PHONY: all $(run) $(run_O) tarpkg clean make_kernelsrc make_kernelsrc_tools make_libperf
 endif # ifndef MK
diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c
index 7672ade..a258bd5 100644
--- a/tools/perf/tests/mem2node.c
+++ b/tools/perf/tests/mem2node.c
@@ -4,7 +4,7 @@
 #include <linux/kernel.h>
 #include <linux/zalloc.h>
 #include <perf/cpumap.h>
-#include "cpumap.h"
+#include <internal/cpumap.h>
 #include "debug.h"
 #include "env.h"
 #include "mem2node.h"
diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c
index 85e1d73..3a22dce 100644
--- a/tools/perf/tests/mmap-basic.c
+++ b/tools/perf/tests/mmap-basic.c
@@ -10,8 +10,8 @@
 #include "evlist.h"
 #include "evsel.h"
 #include "thread_map.h"
-#include "cpumap.h"
 #include "tests.h"
+#include "util/mmap.h"
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
@@ -43,7 +43,7 @@
 		     expected_nr_events[nsyscalls], i, j;
 	struct evsel *evsels[nsyscalls], *evsel;
 	char sbuf[STRERR_BUFSIZE];
-	struct perf_mmap *md;
+	struct mmap *md;
 
 	threads = thread_map__new(-1, getpid(), UINT_MAX);
 	if (threads == NULL) {
@@ -53,7 +53,7 @@
 
 	cpus = perf_cpu_map__new(NULL);
 	if (cpus == NULL) {
-		pr_debug("cpu_map__new\n");
+		pr_debug("perf_cpu_map__new\n");
 		goto out_free_threads;
 	}
 
@@ -100,7 +100,7 @@
 		expected_nr_events[i] = 1 + rand() % 127;
 	}
 
-	if (perf_evlist__mmap(evlist, 128) < 0) {
+	if (evlist__mmap(evlist, 128) < 0) {
 		pr_debug("failed to mmap events: %d (%s)\n", errno,
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c
index 360d70d..8d9d4cb 100644
--- a/tools/perf/tests/mmap-thread-lookup.c
+++ b/tools/perf/tests/mmap-thread-lookup.c
@@ -8,13 +8,15 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include "debug.h"
+#include "event.h"
 #include "tests.h"
 #include "machine.h"
 #include "thread_map.h"
 #include "map.h"
 #include "symbol.h"
+#include "util/synthetic-events.h"
 #include "thread.h"
-#include "util.h"
+#include <internal/lib.h> // page_size
 
 #define THREADS 4
 
diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c
index 9171f77..93c1765 100644
--- a/tools/perf/tests/openat-syscall-all-cpus.c
+++ b/tools/perf/tests/openat-syscall-all-cpus.c
@@ -14,7 +14,8 @@
 #include "evsel.h"
 #include "tests.h"
 #include "thread_map.h"
-#include "cpumap.h"
+#include <perf/cpumap.h>
+#include <internal/cpumap.h>
 #include "debug.h"
 #include "stat.h"
 #include "util/counts.h"
@@ -37,7 +38,7 @@
 
 	cpus = perf_cpu_map__new(NULL);
 	if (cpus == NULL) {
-		pr_debug("cpu_map__new\n");
+		pr_debug("perf_cpu_map__new\n");
 		goto out_thread_map_delete;
 	}
 
diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c
index b71167b..2b5c468 100644
--- a/tools/perf/tests/openat-syscall-tp-fields.c
+++ b/tools/perf/tests/openat-syscall-tp-fields.c
@@ -11,6 +11,7 @@
 #include "record.h"
 #include "tests.h"
 #include "debug.h"
+#include "util/mmap.h"
 #include <errno.h>
 
 #ifndef O_DIRECTORY
@@ -69,9 +70,9 @@
 		goto out_delete_evlist;
 	}
 
-	err = perf_evlist__mmap(evlist, UINT_MAX);
+	err = evlist__mmap(evlist, UINT_MAX);
 	if (err < 0) {
-		pr_debug("perf_evlist__mmap: %s\n",
+		pr_debug("evlist__mmap: %s\n",
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
@@ -86,9 +87,9 @@
 	while (1) {
 		int before = nr_events;
 
-		for (i = 0; i < evlist->nr_mmaps; i++) {
+		for (i = 0; i < evlist->core.nr_mmaps; i++) {
 			union perf_event *event;
-			struct perf_mmap *md;
+			struct mmap *md;
 
 			md = &evlist->mmap[i];
 			if (perf_mmap__read_init(md) < 0)
@@ -126,7 +127,7 @@
 		}
 
 		if (nr_events == before)
-			perf_evlist__poll(evlist, 10);
+			evlist__poll(evlist, 10);
 
 		if (++nr_polls > 5) {
 			pr_debug("%s: no events!\n", __func__);
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 02ba696..25e0ed2 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -6,7 +6,6 @@
 #include "tests.h"
 #include "debug.h"
 #include "pmu.h"
-#include "util.h"
 #include <dirent.h>
 #include <errno.h>
 #include <sys/types.h>
@@ -47,7 +46,7 @@
 
 static int test__checkevent_tracepoint(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong number of groups", 0 == evlist->nr_groups);
@@ -78,7 +77,7 @@
 
 static int test__checkevent_raw(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
@@ -88,7 +87,7 @@
 
 static int test__checkevent_numeric(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type", 1 == evsel->core.attr.type);
@@ -98,7 +97,7 @@
 
 static int test__checkevent_symbolic_name(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type);
@@ -109,7 +108,7 @@
 
 static int test__checkevent_symbolic_name_config(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type);
@@ -130,7 +129,7 @@
 
 static int test__checkevent_symbolic_alias(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type);
@@ -141,7 +140,7 @@
 
 static int test__checkevent_genhw(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->core.attr.type);
@@ -151,7 +150,7 @@
 
 static int test__checkevent_breakpoint(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type);
@@ -165,7 +164,7 @@
 
 static int test__checkevent_breakpoint_x(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type);
@@ -178,7 +177,7 @@
 
 static int test__checkevent_breakpoint_r(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type",
@@ -193,7 +192,7 @@
 
 static int test__checkevent_breakpoint_w(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type",
@@ -208,7 +207,7 @@
 
 static int test__checkevent_breakpoint_rw(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type",
@@ -223,7 +222,7 @@
 
 static int test__checkevent_tracepoint_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user);
 	TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel);
@@ -254,7 +253,7 @@
 
 static int test__checkevent_raw_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user);
 	TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel);
@@ -266,7 +265,7 @@
 
 static int test__checkevent_numeric_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user);
 	TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel);
@@ -278,7 +277,7 @@
 
 static int test__checkevent_symbolic_name_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user);
 	TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel);
@@ -290,7 +289,7 @@
 
 static int test__checkevent_exclude_host_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest);
 	TEST_ASSERT_VAL("wrong exclude host", evsel->core.attr.exclude_host);
@@ -300,7 +299,7 @@
 
 static int test__checkevent_exclude_guest_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude guest", evsel->core.attr.exclude_guest);
 	TEST_ASSERT_VAL("wrong exclude host", !evsel->core.attr.exclude_host);
@@ -310,7 +309,7 @@
 
 static int test__checkevent_symbolic_alias_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user);
 	TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel);
@@ -322,7 +321,7 @@
 
 static int test__checkevent_genhw_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user);
 	TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel);
@@ -334,7 +333,7 @@
 
 static int test__checkevent_exclude_idle_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude idle", evsel->core.attr.exclude_idle);
 	TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest);
@@ -349,7 +348,7 @@
 
 static int test__checkevent_exclude_idle_modifier_1(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude idle", evsel->core.attr.exclude_idle);
 	TEST_ASSERT_VAL("wrong exclude guest", !evsel->core.attr.exclude_guest);
@@ -364,7 +363,7 @@
 
 static int test__checkevent_breakpoint_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 
 	TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user);
@@ -379,7 +378,7 @@
 
 static int test__checkevent_breakpoint_x_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user);
 	TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel);
@@ -393,7 +392,7 @@
 
 static int test__checkevent_breakpoint_r_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user);
 	TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel);
@@ -407,7 +406,7 @@
 
 static int test__checkevent_breakpoint_w_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user);
 	TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel);
@@ -421,7 +420,7 @@
 
 static int test__checkevent_breakpoint_rw_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user);
 	TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel);
@@ -436,7 +435,7 @@
 static int test__checkevent_pmu(struct evlist *evlist)
 {
 
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
@@ -454,7 +453,7 @@
 
 static int test__checkevent_list(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->core.nr_entries);
 
@@ -493,7 +492,7 @@
 
 static int test__checkevent_pmu_name(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	/* cpu/config=1,name=krava/u */
 	TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
@@ -514,7 +513,7 @@
 
 static int test__checkevent_pmu_partial_time_callgraph(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	/* cpu/config=1,call-graph=fp,time,period=100000/ */
 	TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
@@ -547,7 +546,7 @@
 
 static int test__checkevent_pmu_events(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
@@ -565,7 +564,7 @@
 
 static int test__checkevent_pmu_events_mix(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	/* pmu-event:u */
 	TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
@@ -643,7 +642,7 @@
 	TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
 
 	/* instructions:k */
-	evsel = leader = perf_evlist__first(evlist);
+	evsel = leader = evlist__first(evlist);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type);
 	TEST_ASSERT_VAL("wrong config",
 			PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config);
@@ -685,7 +684,7 @@
 	TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
 
 	/* faults + :ku modifier */
-	evsel = leader = perf_evlist__first(evlist);
+	evsel = leader = evlist__first(evlist);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type);
 	TEST_ASSERT_VAL("wrong config",
 			PERF_COUNT_SW_PAGE_FAULTS == evsel->core.attr.config);
@@ -740,7 +739,7 @@
 	TEST_ASSERT_VAL("wrong number of groups", 2 == evlist->nr_groups);
 
 	/* group1 syscalls:sys_enter_openat:H */
-	evsel = leader = perf_evlist__first(evlist);
+	evsel = leader = evlist__first(evlist);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->core.attr.type);
 	TEST_ASSERT_VAL("wrong sample_type",
 		PERF_TP_SAMPLE_TYPE == evsel->core.attr.sample_type);
@@ -832,7 +831,7 @@
 	TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
 
 	/* cycles:u + p */
-	evsel = leader = perf_evlist__first(evlist);
+	evsel = leader = evlist__first(evlist);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type);
 	TEST_ASSERT_VAL("wrong config",
 			PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config);
@@ -876,7 +875,7 @@
 	TEST_ASSERT_VAL("wrong number of groups", 2 == evlist->nr_groups);
 
 	/* cycles + G */
-	evsel = leader = perf_evlist__first(evlist);
+	evsel = leader = evlist__first(evlist);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type);
 	TEST_ASSERT_VAL("wrong config",
 			PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config);
@@ -962,7 +961,7 @@
 	TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
 
 	/* cycles + :H group modifier */
-	evsel = leader = perf_evlist__first(evlist);
+	evsel = leader = evlist__first(evlist);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type);
 	TEST_ASSERT_VAL("wrong config",
 			PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config);
@@ -1002,7 +1001,7 @@
 	TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
 
 	/* cycles + :G group modifier */
-	evsel = leader = perf_evlist__first(evlist);
+	evsel = leader = evlist__first(evlist);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type);
 	TEST_ASSERT_VAL("wrong config",
 			PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config);
@@ -1042,7 +1041,7 @@
 	TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
 
 	/* cycles:G + :u group modifier */
-	evsel = leader = perf_evlist__first(evlist);
+	evsel = leader = evlist__first(evlist);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type);
 	TEST_ASSERT_VAL("wrong config",
 			PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config);
@@ -1082,7 +1081,7 @@
 	TEST_ASSERT_VAL("wrong number of groups", 1 == evlist->nr_groups);
 
 	/* cycles:G + :uG group modifier */
-	evsel = leader = perf_evlist__first(evlist);
+	evsel = leader = evlist__first(evlist);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type);
 	TEST_ASSERT_VAL("wrong config",
 			PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config);
@@ -1121,7 +1120,7 @@
 	TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->core.nr_entries);
 
 	/* cycles - sampling group leader */
-	evsel = leader = perf_evlist__first(evlist);
+	evsel = leader = evlist__first(evlist);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type);
 	TEST_ASSERT_VAL("wrong config",
 			PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config);
@@ -1174,7 +1173,7 @@
 	TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
 
 	/* instructions - sampling group leader */
-	evsel = leader = perf_evlist__first(evlist);
+	evsel = leader = evlist__first(evlist);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type);
 	TEST_ASSERT_VAL("wrong config",
 			PERF_COUNT_HW_INSTRUCTIONS == evsel->core.attr.config);
@@ -1208,7 +1207,7 @@
 
 static int test__checkevent_pinned_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user);
 	TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel);
@@ -1226,7 +1225,7 @@
 	TEST_ASSERT_VAL("wrong number of entries", 3 == evlist->core.nr_entries);
 
 	/* cycles - group leader */
-	evsel = leader = perf_evlist__first(evlist);
+	evsel = leader = evlist__first(evlist);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HARDWARE == evsel->core.attr.type);
 	TEST_ASSERT_VAL("wrong config",
 			PERF_COUNT_HW_CPU_CYCLES == evsel->core.attr.config);
@@ -1252,7 +1251,7 @@
 
 static int test__checkevent_breakpoint_len(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type);
@@ -1267,7 +1266,7 @@
 
 static int test__checkevent_breakpoint_len_w(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_BREAKPOINT == evsel->core.attr.type);
@@ -1283,7 +1282,7 @@
 static int
 test__checkevent_breakpoint_len_rw_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user);
 	TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel);
@@ -1295,7 +1294,7 @@
 
 static int test__checkevent_precise_max_modifier(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type);
@@ -1306,7 +1305,7 @@
 
 static int test__checkevent_config_symbol(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "insn") == 0);
 	return 0;
@@ -1314,7 +1313,7 @@
 
 static int test__checkevent_config_raw(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "rawpmu") == 0);
 	return 0;
@@ -1322,7 +1321,7 @@
 
 static int test__checkevent_config_num(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "numpmu") == 0);
 	return 0;
@@ -1330,7 +1329,7 @@
 
 static int test__checkevent_config_cache(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "cachepmu") == 0);
 	return 0;
@@ -1343,7 +1342,7 @@
 
 static int test__intel_pt(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "intel_pt//u") == 0);
 	return 0;
@@ -1351,7 +1350,7 @@
 
 static int test__checkevent_complex_name(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong complex name parsing", strcmp(evsel->name, "COMPLEX_CYCLES_NAME:orig=cycles,desc=chip-clock-ticks") == 0);
 	return 0;
@@ -1359,7 +1358,7 @@
 
 static int test__sym_event_slash(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong type", evsel->core.attr.type == PERF_TYPE_HARDWARE);
 	TEST_ASSERT_VAL("wrong config", evsel->core.attr.config == PERF_COUNT_HW_CPU_CYCLES);
@@ -1369,7 +1368,7 @@
 
 static int test__sym_event_dc(struct evlist *evlist)
 {
-	struct evsel *evsel = perf_evlist__first(evlist);
+	struct evsel *evsel = evlist__first(evlist);
 
 	TEST_ASSERT_VAL("wrong type", evsel->core.attr.type == PERF_TYPE_HARDWARE);
 	TEST_ASSERT_VAL("wrong config", evsel->core.attr.config == PERF_COUNT_HW_CPU_CYCLES);
diff --git a/tools/perf/tests/parse-no-sample-id-all.c b/tools/perf/tests/parse-no-sample-id-all.c
index 8284752..adf3c9c 100644
--- a/tools/perf/tests/parse-no-sample-id-all.c
+++ b/tools/perf/tests/parse-no-sample-id-all.c
@@ -1,4 +1,3 @@
-// SPDX-License-Identifier: GPL-2.0
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <stddef.h>
@@ -8,7 +7,6 @@
 #include "event.h"
 #include "evlist.h"
 #include "header.h"
-#include "util.h"
 #include "debug.h"
 
 static int process_event(struct evlist **pevlist, union perf_event *event)
diff --git a/tools/perf/tests/perf-hooks.c b/tools/perf/tests/perf-hooks.c
index a693bcf..dbc2719 100644
--- a/tools/perf/tests/perf-hooks.c
+++ b/tools/perf/tests/perf-hooks.c
@@ -4,7 +4,6 @@
 
 #include "tests.h"
 #include "debug.h"
-#include "util.h"
 #include "perf-hooks.h"
 
 static void sigsegv_handler(int sig __maybe_unused)
diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c
index e1b4229..437426b 100644
--- a/tools/perf/tests/perf-record.c
+++ b/tools/perf/tests/perf-record.c
@@ -11,6 +11,7 @@
 #include "debug.h"
 #include "record.h"
 #include "tests.h"
+#include "util/mmap.h"
 
 static int sched__get_first_possible_cpu(pid_t pid, cpu_set_t *maskp)
 {
@@ -103,7 +104,7 @@
 	/*
 	 * Config the evsels, setting attr->comm on the first one, etc.
 	 */
-	evsel = perf_evlist__first(evlist);
+	evsel = evlist__first(evlist);
 	perf_evsel__set_sample_bit(evsel, CPU);
 	perf_evsel__set_sample_bit(evsel, TID);
 	perf_evsel__set_sample_bit(evsel, TIME);
@@ -143,9 +144,9 @@
 	 * fds in the same CPU to be injected in the same mmap ring buffer
 	 * (using ioctl(PERF_EVENT_IOC_SET_OUTPUT)).
 	 */
-	err = perf_evlist__mmap(evlist, opts.mmap_pages);
+	err = evlist__mmap(evlist, opts.mmap_pages);
 	if (err < 0) {
-		pr_debug("perf_evlist__mmap: %s\n",
+		pr_debug("evlist__mmap: %s\n",
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
 	}
@@ -164,9 +165,9 @@
 	while (1) {
 		int before = total_events;
 
-		for (i = 0; i < evlist->nr_mmaps; i++) {
+		for (i = 0; i < evlist->core.nr_mmaps; i++) {
 			union perf_event *event;
-			struct perf_mmap *md;
+			struct mmap *md;
 
 			md = &evlist->mmap[i];
 			if (perf_mmap__read_init(md) < 0)
@@ -286,7 +287,7 @@
 		 * perf_event_attr.wakeup_events, just PERF_EVENT_SAMPLE does.
 		 */
 		if (total_events == before && false)
-			perf_evlist__poll(evlist, -1);
+			evlist__poll(evlist, -1);
 
 		sleep(1);
 		if (++wakeups > 5) {
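
Besides the evlist__mmap()/evlist__poll() renames, the ring-buffer state
moves with the libperf split: the count now lives in evlist->core.nr_mmaps
and the per-ring type is the tool-side 'struct mmap' from util/mmap.h
rather than 'struct perf_mmap'. A sketch of the canonical consume loop
under the new names, assuming the perf_mmap__read_*() helpers keep their
existing semantics:

	int i;

	for (i = 0; i < evlist->core.nr_mmaps; i++) {
		struct mmap *md = &evlist->mmap[i];
		union perf_event *event;

		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			/* ... decode/process the event here ... */
			perf_mmap__consume(md);
		}
		perf_mmap__read_done(md);
	}
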
diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c
index 14a7889..74379ff 100644
--- a/tools/perf/tests/pmu.c
+++ b/tools/perf/tests/pmu.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "parse-events.h"
 #include "pmu.h"
-#include "util.h"
 #include "tests.h"
 #include <errno.h>
 #include <stdio.h>
diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c
index 5fcc068..3a02426 100644
--- a/tools/perf/tests/sample-parsing.c
+++ b/tools/perf/tests/sample-parsing.c
@@ -9,10 +9,10 @@
 
 #include "map_symbol.h"
 #include "branch.h"
-#include "util.h"
 #include "event.h"
 #include "evsel.h"
 #include "debug.h"
+#include "util/synthetic-events.h"
 
 #include "tests.h"
 
diff --git a/tools/perf/tests/sdt.c b/tools/perf/tests/sdt.c
index cf1bd57..60f0e9e 100644
--- a/tools/perf/tests/sdt.c
+++ b/tools/perf/tests/sdt.c
@@ -3,6 +3,7 @@
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 #include <sys/epoll.h>
 #include <util/symbol.h>
 #include <linux/filter.h>
diff --git a/tools/perf/tests/stat.c b/tools/perf/tests/stat.c
index cc10b41..c191150 100644
--- a/tools/perf/tests/stat.c
+++ b/tools/perf/tests/stat.c
@@ -5,6 +5,7 @@
 #include "stat.h"
 #include "counts.h"
 #include "debug.h"
+#include "util/synthetic-events.h"
 
 static bool has_term(struct perf_record_stat_config *config,
 		     u64 tag, u64 val)
diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c
index 97694a0..84519df 100644
--- a/tools/perf/tests/sw-clock.c
+++ b/tools/perf/tests/sw-clock.c
@@ -12,6 +12,7 @@
 #include "util/evsel.h"
 #include "util/evlist.h"
 #include "util/cpumap.h"
+#include "util/mmap.h"
 #include "util/thread_map.h"
 #include <perf/evlist.h>
 
@@ -42,7 +43,7 @@
 	};
 	struct perf_cpu_map *cpus;
 	struct perf_thread_map *threads;
-	struct perf_mmap *md;
+	struct mmap *md;
 
 	attr.sample_freq = 500;
 
@@ -82,7 +83,7 @@
 		goto out_delete_evlist;
 	}
 
-	err = perf_evlist__mmap(evlist, 128);
+	err = evlist__mmap(evlist, 128);
 	if (err < 0) {
 		pr_debug("failed to mmap event: %d (%s)\n", errno,
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c
index 1a60fa1..ffa592e 100644
--- a/tools/perf/tests/switch-tracking.c
+++ b/tools/perf/tests/switch-tracking.c
@@ -14,9 +14,9 @@
 #include "evlist.h"
 #include "evsel.h"
 #include "thread_map.h"
-#include "cpumap.h"
 #include "record.h"
 #include "tests.h"
+#include "util/mmap.h"
 
 static int spin_sleep(void)
 {
@@ -144,7 +144,7 @@
 			return err;
 		/*
 		 * Check for no missing sched_switch events i.e. that the
-		 * evsel->system_wide flag has worked.
+		 * evsel->core.system_wide flag has worked.
 		 */
 		if (switch_tracking->tids[cpu] != -1 &&
 		    switch_tracking->tids[cpu] != prev_tid) {
@@ -264,10 +264,10 @@
 	unsigned pos, cnt = 0;
 	LIST_HEAD(events);
 	struct event_node *events_array, *node;
-	struct perf_mmap *md;
+	struct mmap *md;
 	int i, ret;
 
-	for (i = 0; i < evlist->nr_mmaps; i++) {
+	for (i = 0; i < evlist->core.nr_mmaps; i++) {
 		md = &evlist->mmap[i];
 		if (perf_mmap__read_init(md) < 0)
 			continue;
@@ -316,7 +316,7 @@
  *
  * This function implements a test that checks that sched_switch events and
  * tracking events can be recorded for a workload (current process) using the
- * evsel->system_wide and evsel->tracking flags (respectively) with other events
+ * evsel->core.system_wide and evsel->tracking flags (respectively) with other events
  * sometimes enabled or disabled.
  */
 int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_unused)
@@ -367,7 +367,7 @@
 		goto out_err;
 	}
 
-	cpu_clocks_evsel = perf_evlist__last(evlist);
+	cpu_clocks_evsel = evlist__last(evlist);
 
 	/* Second event */
 	err = parse_events(evlist, "cycles:u", NULL);
@@ -376,7 +376,7 @@
 		goto out_err;
 	}
 
-	cycles_evsel = perf_evlist__last(evlist);
+	cycles_evsel = evlist__last(evlist);
 
 	/* Third event */
 	if (!perf_evlist__can_select_event(evlist, sched_switch)) {
@@ -391,22 +391,22 @@
 		goto out_err;
 	}
 
-	switch_evsel = perf_evlist__last(evlist);
+	switch_evsel = evlist__last(evlist);
 
 	perf_evsel__set_sample_bit(switch_evsel, CPU);
 	perf_evsel__set_sample_bit(switch_evsel, TIME);
 
-	switch_evsel->system_wide = true;
+	switch_evsel->core.system_wide = true;
 	switch_evsel->no_aux_samples = true;
 	switch_evsel->immediate = true;
 
 	/* Test moving an event to the front */
-	if (cycles_evsel == perf_evlist__first(evlist)) {
+	if (cycles_evsel == evlist__first(evlist)) {
 		pr_debug("cycles event already at front");
 		goto out_err;
 	}
 	perf_evlist__to_front(evlist, cycles_evsel);
-	if (cycles_evsel != perf_evlist__first(evlist)) {
+	if (cycles_evsel != evlist__first(evlist)) {
 		pr_debug("Failed to move cycles event to front");
 		goto out_err;
 	}
@@ -421,7 +421,7 @@
 		goto out_err;
 	}
 
-	tracking_evsel = perf_evlist__last(evlist);
+	tracking_evsel = evlist__last(evlist);
 
 	perf_evlist__set_tracking_event(evlist, tracking_evsel);
 
@@ -434,7 +434,7 @@
 	perf_evlist__config(evlist, &opts, NULL);
 
 	/* Check moved event is still at the front */
-	if (cycles_evsel != perf_evlist__first(evlist)) {
+	if (cycles_evsel != evlist__first(evlist)) {
 		pr_debug("Front event no longer at front");
 		goto out_err;
 	}
@@ -461,9 +461,9 @@
 		goto out;
 	}
 
-	err = perf_evlist__mmap(evlist, UINT_MAX);
+	err = evlist__mmap(evlist, UINT_MAX);
 	if (err) {
-		pr_debug("perf_evlist__mmap failed!\n");
+		pr_debug("evlist__mmap failed!\n");
 		goto out_err;
 	}
 
diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c
index f610e8c..bce3a4c 100644
--- a/tools/perf/tests/task-exit.c
+++ b/tools/perf/tests/task-exit.c
@@ -4,12 +4,13 @@
 #include "evsel.h"
 #include "target.h"
 #include "thread_map.h"
-#include "cpumap.h"
 #include "tests.h"
+#include "util/mmap.h"
 
 #include <errno.h>
 #include <signal.h>
 #include <linux/string.h>
+#include <perf/cpumap.h>
 #include <perf/evlist.h>
 
 static int exited;
@@ -51,7 +52,7 @@
 	char sbuf[STRERR_BUFSIZE];
 	struct perf_cpu_map *cpus;
 	struct perf_thread_map *threads;
-	struct perf_mmap *md;
+	struct mmap *md;
 
 	signal(SIGCHLD, sig_handler);
 
@@ -87,7 +88,7 @@
 		goto out_delete_evlist;
 	}
 
-	evsel = perf_evlist__first(evlist);
+	evsel = evlist__first(evlist);
 	evsel->core.attr.task = 1;
 #ifdef __s390x__
 	evsel->core.attr.sample_freq = 1000000;
@@ -106,7 +107,7 @@
 		goto out_delete_evlist;
 	}
 
-	if (perf_evlist__mmap(evlist, 128) < 0) {
+	if (evlist__mmap(evlist, 128) < 0) {
 		pr_debug("failed to mmap events: %d (%s)\n", errno,
 			 str_error_r(errno, sbuf, sizeof(sbuf)));
 		goto out_delete_evlist;
@@ -129,7 +130,7 @@
 
 out_init:
 	if (!exited || !nr_exit) {
-		perf_evlist__poll(evlist, -1);
+		evlist__poll(evlist, -1);
 		goto retry;
 	}
 
diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c
index 39168c5..28f51c4 100644
--- a/tools/perf/tests/thread-map.c
+++ b/tools/perf/tests/thread-map.c
@@ -8,6 +8,7 @@
 #include "thread_map.h"
 #include "debug.h"
 #include "event.h"
+#include "util/synthetic-events.h"
 #include <linux/zalloc.h>
 #include <perf/event.h>
 
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index a4f9f51..4a80049 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -3,11 +3,12 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <perf/cpumap.h>
+#include "cpumap.h"
 #include "tests.h"
-#include "util.h"
 #include "session.h"
 #include "evlist.h"
 #include "debug.h"
+#include <linux/err.h>
 
 #define TEMPL "/tmp/perf-test-XXXXXX"
 #define DATA_SIZE	10
@@ -39,7 +40,7 @@
 	};
 
 	session = perf_session__new(&data, false, NULL);
-	TEST_ASSERT_VAL("can't get session", session);
+	TEST_ASSERT_VAL("can't get session", !IS_ERR(session));
 
 	session->evlist = perf_evlist__new_default();
 	TEST_ASSERT_VAL("can't get evlist", session->evlist);
@@ -70,7 +71,7 @@
 	int i;
 
 	session = perf_session__new(&data, false, NULL);
-	TEST_ASSERT_VAL("can't get session", session);
+	TEST_ASSERT_VAL("can't get session", !IS_ERR(session));
 
 	/* On platforms with large numbers of CPUs process_cpu_topology()
 	 * might issue an error while reading the perf.data file section
diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c
index 01f434c..aa296ff 100644
--- a/tools/perf/tests/vmlinux-kallsyms.c
+++ b/tools/perf/tests/vmlinux-kallsyms.c
@@ -7,7 +7,7 @@
 #include "dso.h"
 #include "map.h"
 #include "symbol.h"
-#include "util.h"
+#include <internal/lib.h> // page_size
 #include "tests.h"
 #include "debug.h"
 #include "machine.h"
diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c
index f93d40b..781afe4 100644
--- a/tools/perf/ui/browser.c
+++ b/tools/perf/ui/browser.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0
-#include "../util/util.h"
 #include "../util/string2.h"
 #include "../util/config.h"
 #include "libslang.h"
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index ac74ed2..82207db 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -2,7 +2,6 @@
 #include "../browser.h"
 #include "../helpline.h"
 #include "../ui.h"
-#include "../util.h"
 #include "../../util/annotate.h"
 #include "../../util/debug.h"
 #include "../../util/dso.h"
diff --git a/tools/perf/ui/browsers/header.c b/tools/perf/ui/browsers/header.c
index 0f59a70..57e6e43 100644
--- a/tools/perf/ui/browsers/header.c
+++ b/tools/perf/ui/browsers/header.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0
-#include "util/debug.h"
 #include "ui/browser.h"
 #include "ui/keysyms.h"
 #include "ui/ui.h"
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 589168c..7a7187e 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -3319,13 +3319,13 @@
 			switch (key) {
 			case K_TAB:
 				if (pos->core.node.next == &evlist->core.entries)
-					pos = perf_evlist__first(evlist);
+					pos = evlist__first(evlist);
 				else
 					pos = perf_evsel__next(pos);
 				goto browse_hists;
 			case K_UNTAB:
 				if (pos->core.node.prev == &evlist->core.entries)
-					pos = perf_evlist__last(evlist);
+					pos = evlist__last(evlist);
 				else
 					pos = perf_evsel__prev(pos);
 				goto browse_hists;
@@ -3417,7 +3417,7 @@
 
 single_entry:
 	if (nr_entries == 1) {
-		struct evsel *first = perf_evlist__first(evlist);
+		struct evsel *first = evlist__first(evlist);
 
 		return perf_evsel__hists_browse(first, nr_entries, help,
 						false, hbt, min_pcnt,
diff --git a/tools/perf/ui/browsers/map.c b/tools/perf/ui/browsers/map.c
index 893b065..3d49b916 100644
--- a/tools/perf/ui/browsers/map.c
+++ b/tools/perf/ui/browsers/map.c
@@ -5,7 +5,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <linux/bitops.h>
-#include "../../util/util.h"
 #include "../../util/debug.h"
 #include "../../util/map.h"
 #include "../../util/dso.h"
diff --git a/tools/perf/ui/browsers/res_sample.c b/tools/perf/ui/browsers/res_sample.c
index f16a38f..76d356a 100644
--- a/tools/perf/ui/browsers/res_sample.c
+++ b/tools/perf/ui/browsers/res_sample.c
@@ -7,7 +7,7 @@
 #include "config.h"
 #include "time-utils.h"
 #include "../util.h"
-#include "../../util/util.h"
+#include "../../util/util.h" // perf_exe()
 #include "../../perf.h"
 #include <stdlib.h>
 #include <string.h>
diff --git a/tools/perf/ui/browsers/scripts.c b/tools/perf/ui/browsers/scripts.c
index 586a21a..fc733a6 100644
--- a/tools/perf/ui/browsers/scripts.c
+++ b/tools/perf/ui/browsers/scripts.c
@@ -1,7 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "../../builtin.h"
 #include "../../perf.h"
-#include "../../util/util.h"
+#include "../../util/util.h" // perf_exe()
+#include "../util.h"
 #include "../../util/hist.h"
 #include "../../util/debug.h"
 #include "../../util/symbol.h"
diff --git a/tools/perf/ui/gtk/helpline.c b/tools/perf/ui/gtk/helpline.c
index e166da9..e40a006 100644
--- a/tools/perf/ui/gtk/helpline.c
+++ b/tools/perf/ui/gtk/helpline.c
@@ -6,7 +6,6 @@
 #include "gtk.h"
 #include "../ui.h"
 #include "../helpline.h"
-#include "../../util/debug.h"
 
 static void gtk_helpline_pop(void)
 {
diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c
index 6c2efc1..ed1a97b 100644
--- a/tools/perf/ui/gtk/hists.c
+++ b/tools/perf/ui/gtk/hists.c
@@ -8,6 +8,7 @@
 #include "../string2.h"
 #include "gtk.h"
 #include <signal.h>
+#include <stdlib.h>
 #include <linux/string.h>
 
 #define MAX_COLUMNS			32
diff --git a/tools/perf/ui/gtk/progress.c b/tools/perf/ui/gtk/progress.c
index b6ad885..eea6fcd 100644
--- a/tools/perf/ui/gtk/progress.c
+++ b/tools/perf/ui/gtk/progress.c
@@ -3,7 +3,6 @@
 
 #include "gtk.h"
 #include "../progress.h"
-#include "util.h"
 
 static GtkWidget *dialog;
 static GtkWidget *progress;
diff --git a/tools/perf/ui/gtk/setup.c b/tools/perf/ui/gtk/setup.c
index 1a2616b..f5eee4d 100644
--- a/tools/perf/ui/gtk/setup.c
+++ b/tools/perf/ui/gtk/setup.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "gtk.h"
-#include "../../util/debug.h"
+#include <linux/compiler.h>
+#include "../util.h"
 
 extern struct perf_error_ops perf_gtk_eops;
 
diff --git a/tools/perf/ui/gtk/util.c b/tools/perf/ui/gtk/util.c
index c2c5589..c47f5c3 100644
--- a/tools/perf/ui/gtk/util.c
+++ b/tools/perf/ui/gtk/util.c
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "../util.h"
-#include "../../util/debug.h"
 #include "gtk.h"
 
 #include <stdlib.h>
diff --git a/tools/perf/ui/helpline.c b/tools/perf/ui/helpline.c
index 54bcd08..911182b 100644
--- a/tools/perf/ui/helpline.c
+++ b/tools/perf/ui/helpline.c
@@ -3,10 +3,8 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "../util/debug.h"
 #include "helpline.h"
 #include "ui.h"
-#include "../util/util.h"
 
 char ui_helpline__current[512];
 
diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c
index 3e533de..f736755 100644
--- a/tools/perf/ui/hist.c
+++ b/tools/perf/ui/hist.c
@@ -8,7 +8,6 @@
 #include "../util/callchain.h"
 #include "../util/debug.h"
 #include "../util/hist.h"
-#include "../util/util.h"
 #include "../util/sort.h"
 #include "../util/evsel.h"
 #include "../util/evlist.h"
diff --git a/tools/perf/ui/setup.c b/tools/perf/ui/setup.c
index c7a86b4..700335c 100644
--- a/tools/perf/ui/setup.c
+++ b/tools/perf/ui/setup.c
@@ -1,11 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <pthread.h>
 #include <dlfcn.h>
+#include <unistd.h>
 
 #include <subcmd/pager.h>
 #include "../util/debug.h"
 #include "../util/hist.h"
-#include "../util/util.h"
 #include "ui.h"
 
 pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER;
diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c
index 832ca6c..5365606 100644
--- a/tools/perf/ui/stdio/hist.c
+++ b/tools/perf/ui/stdio/hist.c
@@ -5,6 +5,7 @@
 
 #include "../../util/callchain.h"
 #include "../../util/debug.h"
+#include "../../util/event.h"
 #include "../../util/hist.h"
 #include "../../util/map.h"
 #include "../../util/map_groups.h"
diff --git a/tools/perf/ui/tui/helpline.c b/tools/perf/ui/tui/helpline.c
index 5f188f6..298d6af 100644
--- a/tools/perf/ui/tui/helpline.c
+++ b/tools/perf/ui/tui/helpline.c
@@ -6,7 +6,6 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 
-#include "../../util/debug.h"
 #include "../helpline.h"
 #include "../ui.h"
 #include "../libslang.h"
diff --git a/tools/perf/ui/tui/setup.c b/tools/perf/ui/tui/setup.c
index 56651a4..e9bfe85 100644
--- a/tools/perf/ui/tui/setup.c
+++ b/tools/perf/ui/tui/setup.c
@@ -2,13 +2,13 @@
 #include <signal.h>
 #include <stdbool.h>
 #include <stdlib.h>
+#include <unistd.h>
 #include <linux/kernel.h>
 #ifdef HAVE_BACKTRACE_SUPPORT
 #include <execinfo.h>
 #endif
 
 #include "../../util/debug.h"
-#include "../../util/util.h"
 #include "../../perf.h"
 #include "../browser.h"
 #include "../helpline.h"
diff --git a/tools/perf/ui/tui/util.c b/tools/perf/ui/tui/util.c
index 087d9ab..b98dd0e 100644
--- a/tools/perf/ui/tui/util.c
+++ b/tools/perf/ui/tui/util.c
@@ -5,7 +5,6 @@
 #include <stdlib.h>
 #include <sys/ttydefaults.h>
 
-#include "../../util/debug.h"
 #include "../browser.h"
 #include "../keysyms.h"
 #include "../helpline.h"
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 0b4d8e0..8dcfca1 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -3,6 +3,7 @@
 perf-y += build-id.o
 perf-y += cacheline.o
 perf-y += config.o
+perf-y += copyfile.o
 perf-y += ctype.o
 perf-y += db-export.o
 perf-y += env.o
@@ -10,6 +11,7 @@
 perf-y += evlist.o
 perf-y += evsel.o
 perf-y += evsel_fprintf.o
+perf-y += perf_event_attr_fprintf.o
 perf-y += evswitch.o
 perf-y += find_bit.o
 perf-y += get_current_dir_name.o
@@ -86,6 +88,7 @@
 perf-y += record.o
 perf-y += srcline.o
 perf-y += srccode.o
+perf-y += synthetic-events.o
 perf-y += data.o
 perf-y += tsc.o
 perf-y += cloexec.o
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 1748f52..e830ead 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -14,7 +14,7 @@
 #include <bpf/btf.h>
 #include <bpf/libbpf.h>
 #include <linux/btf.h>
-#include "util.h"
+#include "util.h" // hex_width()
 #include "ui/ui.h"
 #include "sort.h"
 #include "build-id.h"
@@ -34,6 +34,7 @@
 #include "bpf-event.h"
 #include "block-range.h"
 #include "string2.h"
+#include "util/event.h"
 #include "arch/common.h"
 #include <regex.h>
 #include <pthread.h>
diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 8a7340f..53be12b 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -16,7 +16,6 @@
 #include <linux/log2.h>
 #include <linux/zalloc.h>
 
-#include "cpumap.h"
 #include "color.h"
 #include "evsel.h"
 #include "machine.h"
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index 6f25224..8470dfe 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -31,8 +31,8 @@
 #include "map.h"
 #include "pmu.h"
 #include "evsel.h"
-#include "cpumap.h"
 #include "symbol.h"
+#include "util/synthetic-events.h"
 #include "thread_map.h"
 #include "asm/bug.h"
 #include "auxtrace.h"
@@ -50,10 +50,12 @@
 #include "intel-bts.h"
 #include "arm-spe.h"
 #include "s390-cpumsf.h"
-#include "util.h"
+#include "util/mmap.h"
 
 #include <linux/ctype.h>
+#include <linux/kernel.h>
 #include "symbol/kallsyms.h"
+#include <internal/lib.h>
 
 static bool auxtrace__dont_decode(struct perf_session *session)
 {
@@ -1226,7 +1228,7 @@
 	return 0;
 }
 
-static int __auxtrace_mmap__read(struct perf_mmap *map,
+static int __auxtrace_mmap__read(struct mmap *map,
 				 struct auxtrace_record *itr,
 				 struct perf_tool *tool, process_auxtrace_t fn,
 				 bool snapshot, size_t snapshot_size)
@@ -1337,13 +1339,13 @@
 	return 1;
 }
 
-int auxtrace_mmap__read(struct perf_mmap *map, struct auxtrace_record *itr,
+int auxtrace_mmap__read(struct mmap *map, struct auxtrace_record *itr,
 			struct perf_tool *tool, process_auxtrace_t fn)
 {
 	return __auxtrace_mmap__read(map, itr, tool, fn, false, 0);
 }
 
-int auxtrace_mmap__read_snapshot(struct perf_mmap *map,
+int auxtrace_mmap__read_snapshot(struct mmap *map,
 				 struct auxtrace_record *itr,
 				 struct perf_tool *tool, process_auxtrace_t fn,
 				 size_t snapshot_size)
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
index 37e70dc..f201f36 100644
--- a/tools/perf/util/auxtrace.h
+++ b/tools/perf/util/auxtrace.h
@@ -11,21 +11,22 @@
 #include <errno.h>
 #include <stdbool.h>
 #include <stddef.h>
+#include <stdio.h> // FILE
 #include <linux/list.h>
 #include <linux/perf_event.h>
 #include <linux/types.h>
 #include <asm/bitsperlong.h>
 #include <asm/barrier.h>
 
-#include "event.h"
-
 union perf_event;
 struct perf_session;
 struct evlist;
 struct perf_tool;
-struct perf_mmap;
+struct mmap;
+struct perf_sample;
 struct option;
 struct record_opts;
+struct perf_record_auxtrace_error;
 struct perf_record_auxtrace_info;
 struct events_stats;
 
@@ -444,14 +445,14 @@
 				   bool per_cpu);
 
 typedef int (*process_auxtrace_t)(struct perf_tool *tool,
-				  struct perf_mmap *map,
+				  struct mmap *map,
 				  union perf_event *event, void *data1,
 				  size_t len1, void *data2, size_t len2);
 
-int auxtrace_mmap__read(struct perf_mmap *map, struct auxtrace_record *itr,
+int auxtrace_mmap__read(struct mmap *map, struct auxtrace_record *itr,
 			struct perf_tool *tool, process_auxtrace_t fn);
 
-int auxtrace_mmap__read_snapshot(struct perf_mmap *map,
+int auxtrace_mmap__read_snapshot(struct mmap *map,
 				 struct auxtrace_record *itr,
 				 struct perf_tool *tool, process_auxtrace_t fn,
 				 size_t snapshot_size);
@@ -524,10 +525,6 @@
 			  int code, int cpu, pid_t pid, pid_t tid, u64 ip,
 			  const char *msg, u64 timestamp);
 
-int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr,
-					 struct perf_tool *tool,
-					 struct perf_session *session,
-					 perf_event__handler_t process);
 int perf_event__process_auxtrace_info(struct perf_session *session,
 				      union perf_event *event);
 s64 perf_event__process_auxtrace(struct perf_session *session,
@@ -604,15 +601,6 @@
 {
 }
 
-static inline int
-perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr __maybe_unused,
-				     struct perf_tool *tool __maybe_unused,
-				     struct perf_session *session __maybe_unused,
-				     perf_event__handler_t process __maybe_unused)
-{
-	return -EINVAL;
-}
-
 static inline
 int auxtrace_record__options(struct auxtrace_record *itr __maybe_unused,
 			     struct evlist *evlist __maybe_unused,
diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c
index 7a3d4b1..f7ed5d1 100644
--- a/tools/perf/util/bpf-event.c
+++ b/tools/perf/util/bpf-event.c
@@ -16,6 +16,7 @@
 #include "map.h"
 #include "evlist.h"
 #include "record.h"
+#include "util/synthetic-events.h"
 
 #define ptr_to_u64(ptr)    ((__u64)(unsigned long)(ptr))
 
diff --git a/tools/perf/util/bpf-event.h b/tools/perf/util/bpf-event.h
index a01c2fd..81fdc88 100644
--- a/tools/perf/util/bpf-event.h
+++ b/tools/perf/util/bpf-event.h
@@ -6,9 +6,9 @@
 #include <linux/rbtree.h>
 #include <pthread.h>
 #include <api/fd/array.h>
-#include "event.h"
 #include <stdio.h>
 
+struct bpf_prog_info;
 struct machine;
 union perf_event;
 struct perf_env;
@@ -33,11 +33,6 @@
 #ifdef HAVE_LIBBPF_SUPPORT
 int machine__process_bpf(struct machine *machine, union perf_event *event,
 			 struct perf_sample *sample);
-
-int perf_event__synthesize_bpf_events(struct perf_session *session,
-				      perf_event__handler_t process,
-				      struct machine *machine,
-				      struct record_opts *opts);
 int bpf_event__add_sb_event(struct evlist **evlist,
 				 struct perf_env *env);
 void bpf_event__print_bpf_prog_info(struct bpf_prog_info *info,
@@ -51,14 +46,6 @@
 	return 0;
 }
 
-static inline int perf_event__synthesize_bpf_events(struct perf_session *session __maybe_unused,
-						    perf_event__handler_t process __maybe_unused,
-						    struct machine *machine __maybe_unused,
-						    struct record_opts *opts __maybe_unused)
-{
-	return 0;
-}
-
 static inline int bpf_event__add_sb_event(struct evlist **evlist __maybe_unused,
 					  struct perf_env *env __maybe_unused)
 {
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index 37283e8..10c187b 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -1568,7 +1568,7 @@
 			return ERR_PTR(-err);
 		}
 
-		evsel = perf_evlist__last(evlist);
+		evsel = evlist__last(evlist);
 	}
 
 	bpf__for_each_map_named(map, obj, tmp, name) {
diff --git a/tools/perf/util/branch.c b/tools/perf/util/branch.c
index 9d1e090..2285b1e 100644
--- a/tools/perf/util/branch.c
+++ b/tools/perf/util/branch.c
@@ -1,5 +1,3 @@
-#include "util/util.h"
-#include "util/debug.h"
 #include "util/map_symbol.h"
 #include "util/branch.h"
 #include <linux/kernel.h>
diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h
index 06f66da..88e00d2 100644
--- a/tools/perf/util/branch.h
+++ b/tools/perf/util/branch.h
@@ -1,8 +1,15 @@
 #ifndef _PERF_BRANCH_H
 #define _PERF_BRANCH_H 1
-
+/*
+ * The linux/stddef.h isn't needed here, but is needed for __always_inline used
+ * in files included from uapi/linux/perf_event.h such as
+ * /usr/include/linux/swab.h and /usr/include/linux/byteorder/little_endian.h,
+ * detected in at least musl libc, used in Alpine Linux. -acme
+ */
 #include <stdio.h>
 #include <stdint.h>
+#include <linux/compiler.h>
+#include <linux/stddef.h>
 #include <linux/perf_event.h>
 #include <linux/types.h>
 
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index e5fb777..c076fc7 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -7,12 +7,13 @@
  * Copyright (C) 2009, 2010 Red Hat Inc.
  * Copyright (C) 2009, 2010 Arnaldo Carvalho de Melo <acme@redhat.com>
  */
-#include "util.h"
+#include "util.h" // lsdir(), mkdir_p(), rm_rf()
 #include <dirent.h>
 #include <errno.h>
 #include <stdio.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include "util/copyfile.h"
 #include "dso.h"
 #include "build-id.h"
 #include "event.h"
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index c14646c..9a9b56e 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -23,6 +23,7 @@
 
 #include "debug.h"
 #include "dso.h"
+#include "event.h"
 #include "hist.h"
 #include "sort.h"
 #include "machine.h"
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index b042cee..83398e5 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -4,12 +4,15 @@
 
 #include <linux/list.h>
 #include <linux/rbtree.h>
-#include "event.h"
 #include "map_symbol.h"
 #include "branch.h"
 
+struct addr_location;
 struct evsel;
+struct ip_callchain;
 struct map;
+struct perf_sample;
+struct thread;
 
 #define HELP_PAD "\t\t\t\t"
 
diff --git a/tools/perf/util/cloexec.c b/tools/perf/util/cloexec.c
index 4e904fc..a12872f 100644
--- a/tools/perf/util/cloexec.c
+++ b/tools/perf/util/cloexec.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <errno.h>
 #include <sched.h>
-#include "util.h"
+#include "util.h" // for sched_getcpu()
 #include "../perf-sys.h"
 #include "cloexec.h"
 #include "event.h"
diff --git a/tools/perf/util/copyfile.c b/tools/perf/util/copyfile.c
new file mode 100644
index 0000000..3fa0db1
--- /dev/null
+++ b/tools/perf/util/copyfile.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "util/copyfile.h"
+#include "util/namespaces.h"
+#include <internal/lib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static int slow_copyfile(const char *from, const char *to, struct nsinfo *nsi)
+{
+	int err = -1;
+	char *line = NULL;
+	size_t n;
+	FILE *from_fp, *to_fp;
+	struct nscookie nsc;
+
+	nsinfo__mountns_enter(nsi, &nsc);
+	from_fp = fopen(from, "r");
+	nsinfo__mountns_exit(&nsc);
+	if (from_fp == NULL)
+		goto out;
+
+	to_fp = fopen(to, "w");
+	if (to_fp == NULL)
+		goto out_fclose_from;
+
+	while (getline(&line, &n, from_fp) > 0)
+		if (fputs(line, to_fp) == EOF)
+			goto out_fclose_to;
+	err = 0;
+out_fclose_to:
+	fclose(to_fp);
+	free(line);
+out_fclose_from:
+	fclose(from_fp);
+out:
+	return err;
+}
+
+int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size)
+{
+	void *ptr;
+	loff_t pgoff;
+
+	pgoff = off_in & ~(page_size - 1);
+	off_in -= pgoff;
+
+	ptr = mmap(NULL, off_in + size, PROT_READ, MAP_PRIVATE, ifd, pgoff);
+	if (ptr == MAP_FAILED)
+		return -1;
+
+	while (size) {
+		ssize_t ret = pwrite(ofd, ptr + off_in, size, off_out);
+		if (ret < 0 && errno == EINTR)
+			continue;
+		if (ret <= 0)
+			break;
+
+		size -= ret;
+		off_in += ret;
+		off_out += ret;
+	}
+	munmap(ptr, off_in + size);
+
+	return size ? -1 : 0;
+}
+
+static int copyfile_mode_ns(const char *from, const char *to, mode_t mode,
+			    struct nsinfo *nsi)
+{
+	int fromfd, tofd;
+	struct stat st;
+	int err;
+	char *tmp = NULL, *ptr = NULL;
+	struct nscookie nsc;
+
+	nsinfo__mountns_enter(nsi, &nsc);
+	err = stat(from, &st);
+	nsinfo__mountns_exit(&nsc);
+	if (err)
+		goto out;
+	err = -1;
+
+	/* extra 'x' at the end is to reserve space for '.' */
+	if (asprintf(&tmp, "%s.XXXXXXx", to) < 0) {
+		tmp = NULL;
+		goto out;
+	}
+	ptr = strrchr(tmp, '/');
+	if (!ptr)
+		goto out;
+	ptr = memmove(ptr + 1, ptr, strlen(ptr) - 1);
+	*ptr = '.';
+
+	tofd = mkstemp(tmp);
+	if (tofd < 0)
+		goto out;
+
+	if (fchmod(tofd, mode))
+		goto out_close_to;
+
+	if (st.st_size == 0) { /* /proc? do it slowly... */
+		err = slow_copyfile(from, tmp, nsi);
+		goto out_close_to;
+	}
+
+	nsinfo__mountns_enter(nsi, &nsc);
+	fromfd = open(from, O_RDONLY);
+	nsinfo__mountns_exit(&nsc);
+	if (fromfd < 0)
+		goto out_close_to;
+
+	err = copyfile_offset(fromfd, 0, tofd, 0, st.st_size);
+
+	close(fromfd);
+out_close_to:
+	close(tofd);
+	if (!err)
+		err = link(tmp, to);
+	unlink(tmp);
+out:
+	free(tmp);
+	return err;
+}
+
+int copyfile_ns(const char *from, const char *to, struct nsinfo *nsi)
+{
+	return copyfile_mode_ns(from, to, 0755, nsi);
+}
+
+int copyfile_mode(const char *from, const char *to, mode_t mode)
+{
+	return copyfile_mode_ns(from, to, mode, NULL);
+}
+
+int copyfile(const char *from, const char *to)
+{
+	return copyfile_mode(from, to, 0755);
+}
diff --git a/tools/perf/util/copyfile.h b/tools/perf/util/copyfile.h
new file mode 100644
index 0000000..e85d2f2
--- /dev/null
+++ b/tools/perf/util/copyfile.h
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef PERF_COPYFILE_H_
+#define PERF_COPYFILE_H_
+
+#include <linux/types.h>
+#include <sys/types.h>
+#include <fcntl.h>
+
+struct nsinfo;
+
+int copyfile(const char *from, const char *to);
+int copyfile_mode(const char *from, const char *to, mode_t mode);
+int copyfile_ns(const char *from, const char *to, struct nsinfo *nsi);
+int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size);
+
+#endif // PERF_COPYFILE_H_
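
This pair of new files splits the file-copy helpers out of util.[ch] into
their own translation unit (note the matching copyfile.o entry added to
util/Build above); the exported interface is just the four functions in
the header. A hypothetical caller, with the paths invented purely for
illustration:

	#include "util/copyfile.h"

	static int stash_kallsyms(void)
	{
		/* copies with the default 0755 mode; /proc files report
		 * st_size == 0 and take the line-by-line slow_copyfile()
		 * path internally */
		return copyfile("/proc/kallsyms", "/tmp/kallsyms.copy");
	}
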
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index 37d7c49..cd92a99 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -17,7 +17,6 @@
 #include "cs-etm.h"
 #include "cs-etm-decoder.h"
 #include "intlist.h"
-#include "util.h"
 
 /* use raw logging */
 #ifdef CS_DEBUG_RAW
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 707afdb..4ba0f87 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -35,7 +35,7 @@
 #include "thread.h"
 #include "thread-stack.h"
 #include <tools/libc_compat.h>
-#include "util.h"
+#include "util/synthetic-events.h"
 
 #define MAX_TIMESTAMP (~0ULL)
 
@@ -1298,7 +1298,7 @@
 	attr.read_format = evsel->core.attr.read_format;
 
 	/* create new id val to be a fixed offset from evsel id */
-	id = evsel->id[0] + 1000000000;
+	id = evsel->core.id[0] + 1000000000;
 
 	if (!id)
 		id = 1;
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 0c26844..dbc772b 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -30,6 +30,7 @@
 #include "machine.h"
 #include "config.h"
 #include <linux/ctype.h>
+#include <linux/err.h>
 
 #define pr_N(n, fmt, ...) \
 	eprintf(n, debug_data_convert, fmt, ##__VA_ARGS__)
@@ -1619,8 +1620,10 @@
 	err = -1;
 	/* perf.data session */
 	session = perf_session__new(&data, 0, &c.tool);
-	if (!session)
+	if (IS_ERR(session)) {
+		err = PTR_ERR(session);
 		goto free_writer;
+	}
 
 	if (c.queue_size) {
 		ordered_events__set_alloc_size(&session->ordered_events,
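
Both this hunk and the topology.c test above reflect a change in
perf_session__new()'s error convention: rather than returning NULL on
failure, it now returns an ERR_PTR()-encoded errno, so callers test with
IS_ERR() and recover the code with PTR_ERR(). A sketch of the updated
calling pattern, under that assumption:

	struct perf_session *session = perf_session__new(&data, false, NULL);

	if (IS_ERR(session))
		return PTR_ERR(session);	/* negative errno, not NULL */
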
diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
index e75c3a2..88fba2b 100644
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@@ -13,9 +13,10 @@
 #include <dirent.h>
 
 #include "data.h"
-#include "util.h"
+#include "util.h" // rm_rf_perf_data()
 #include "debug.h"
 #include "header.h"
+#include <internal/lib.h>
 
 static void close_dir(struct perf_data_file *files, int nr)
 {
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index a1b59bd..e55114f 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -17,7 +17,6 @@
 #include "event.h"
 #include "debug.h"
 #include "print_binary.h"
-#include "util.h"
 #include "target.h"
 #include "ui/helpline.h"
 #include "ui/ui.h"
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index b2deee9..d25ae1c 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -3,9 +3,9 @@
 #ifndef __PERF_DEBUG_H
 #define __PERF_DEBUG_H
 
+#include <stdarg.h>
 #include <stdbool.h>
 #include <linux/compiler.h>
-#include "../ui/util.h"
 
 extern int verbose;
 extern bool quiet, dump_trace;
diff --git a/tools/perf/util/demangle-java.c b/tools/perf/util/demangle-java.c
index 763328c..6fb7f34 100644
--- a/tools/perf/util/demangle-java.c
+++ b/tools/perf/util/demangle-java.c
@@ -3,7 +3,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "debug.h"
 #include "symbol.h"
 
 #include "demangle-java.h"
diff --git a/tools/perf/util/demangle-rust.c b/tools/perf/util/demangle-rust.c
index 423afbb..a659fc6 100644
--- a/tools/perf/util/demangle-rust.c
+++ b/tools/perf/util/demangle-rust.c
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <string.h>
-#include "util.h"
 #include "debug.h"
 
 #include "demangle-rust.h"
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index db55edd..1b49ece 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -5,7 +5,6 @@
  * Written by: Masami Hiramatsu <mhiramat@kernel.org>
  */
 
-#include <util.h>
 #include <debug.h>
 #include <dwarf-regs.h>
 #include <elf.h>
diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
index d8e083d..db40906 100644
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -4,9 +4,10 @@
 
 #include <linux/types.h>
 #include <linux/rbtree.h>
-#include "cpumap.h"
 #include "rwsem.h"
 
+struct perf_cpu_map;
+
 struct cpu_topology_map {
 	int	socket_id;
 	int	die_id;
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index f4afbb8..fc1e5a9 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -1,16 +1,16 @@
-#include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <inttypes.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
+#include <perf/cpumap.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>
 #include <uapi/linux/mman.h> /* To get things like MAP_HUGETLB even on older libc headers */
-#include <api/fs/fs.h>
 #include <linux/perf_event.h>
 #include <linux/zalloc.h>
+#include "cpumap.h"
 #include "dso.h"
 #include "event.h"
 #include "debug.h"
@@ -24,6 +24,7 @@
 #include "time-utils.h"
 #include <linux/ctype.h>
 #include "map.h"
+#include "util/namespaces.h"
 #include "symbol.h"
 #include "symbol/kallsyms.h"
 #include "asm/bug.h"
@@ -33,8 +34,6 @@
 #include "tool.h"
 #include "../perf.h"
 
-#define DEFAULT_PROC_MAP_PARSE_TIMEOUT 500
-
 static const char *perf_event__names[] = {
 	[0]					= "TOTAL",
 	[PERF_RECORD_MMAP]			= "MMAP",
@@ -75,18 +74,6 @@
 	[PERF_RECORD_COMPRESSED]		= "COMPRESSED",
 };
 
-static const char *perf_ns__names[] = {
-	[NET_NS_INDEX]		= "net",
-	[UTS_NS_INDEX]		= "uts",
-	[IPC_NS_INDEX]		= "ipc",
-	[PID_NS_INDEX]		= "pid",
-	[USER_NS_INDEX]		= "user",
-	[MNT_NS_INDEX]		= "mnt",
-	[CGROUP_NS_INDEX]	= "cgroup",
-};
-
-unsigned int proc_map_timeout = DEFAULT_PROC_MAP_PARSE_TIMEOUT;
-
 const char *perf_event__name(unsigned int id)
 {
 	if (id >= ARRAY_SIZE(perf_event__names))
@@ -96,775 +83,6 @@
 	return perf_event__names[id];
 }
 
-static const char *perf_ns__name(unsigned int id)
-{
-	if (id >= ARRAY_SIZE(perf_ns__names))
-		return "UNKNOWN";
-	return perf_ns__names[id];
-}
-
-int perf_tool__process_synth_event(struct perf_tool *tool,
-				   union perf_event *event,
-				   struct machine *machine,
-				   perf_event__handler_t process)
-{
-	struct perf_sample synth_sample = {
-	.pid	   = -1,
-	.tid	   = -1,
-	.time	   = -1,
-	.stream_id = -1,
-	.cpu	   = -1,
-	.period	   = 1,
-	.cpumode   = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK,
-	};
-
-	return process(tool, event, &synth_sample, machine);
-};
-
-/*
- * Assumes that the first 4095 bytes of /proc/pid/stat contains
- * the comm, tgid and ppid.
- */
-static int perf_event__get_comm_ids(pid_t pid, char *comm, size_t len,
-				    pid_t *tgid, pid_t *ppid)
-{
-	char filename[PATH_MAX];
-	char bf[4096];
-	int fd;
-	size_t size = 0;
-	ssize_t n;
-	char *name, *tgids, *ppids;
-
-	*tgid = -1;
-	*ppid = -1;
-
-	snprintf(filename, sizeof(filename), "/proc/%d/status", pid);
-
-	fd = open(filename, O_RDONLY);
-	if (fd < 0) {
-		pr_debug("couldn't open %s\n", filename);
-		return -1;
-	}
-
-	n = read(fd, bf, sizeof(bf) - 1);
-	close(fd);
-	if (n <= 0) {
-		pr_warning("Couldn't get COMM, tigd and ppid for pid %d\n",
-			   pid);
-		return -1;
-	}
-	bf[n] = '\0';
-
-	name = strstr(bf, "Name:");
-	tgids = strstr(bf, "Tgid:");
-	ppids = strstr(bf, "PPid:");
-
-	if (name) {
-		char *nl;
-
-		name = skip_spaces(name + 5);  /* strlen("Name:") */
-		nl = strchr(name, '\n');
-		if (nl)
-			*nl = '\0';
-
-		size = strlen(name);
-		if (size >= len)
-			size = len - 1;
-		memcpy(comm, name, size);
-		comm[size] = '\0';
-	} else {
-		pr_debug("Name: string not found for pid %d\n", pid);
-	}
-
-	if (tgids) {
-		tgids += 5;  /* strlen("Tgid:") */
-		*tgid = atoi(tgids);
-	} else {
-		pr_debug("Tgid: string not found for pid %d\n", pid);
-	}
-
-	if (ppids) {
-		ppids += 5;  /* strlen("PPid:") */
-		*ppid = atoi(ppids);
-	} else {
-		pr_debug("PPid: string not found for pid %d\n", pid);
-	}
-
-	return 0;
-}
-
-static int perf_event__prepare_comm(union perf_event *event, pid_t pid,
-				    struct machine *machine,
-				    pid_t *tgid, pid_t *ppid)
-{
-	size_t size;
-
-	*ppid = -1;
-
-	memset(&event->comm, 0, sizeof(event->comm));
-
-	if (machine__is_host(machine)) {
-		if (perf_event__get_comm_ids(pid, event->comm.comm,
-					     sizeof(event->comm.comm),
-					     tgid, ppid) != 0) {
-			return -1;
-		}
-	} else {
-		*tgid = machine->pid;
-	}
-
-	if (*tgid < 0)
-		return -1;
-
-	event->comm.pid = *tgid;
-	event->comm.header.type = PERF_RECORD_COMM;
-
-	size = strlen(event->comm.comm) + 1;
-	size = PERF_ALIGN(size, sizeof(u64));
-	memset(event->comm.comm + size, 0, machine->id_hdr_size);
-	event->comm.header.size = (sizeof(event->comm) -
-				(sizeof(event->comm.comm) - size) +
-				machine->id_hdr_size);
-	event->comm.tid = pid;
-
-	return 0;
-}
-
-pid_t perf_event__synthesize_comm(struct perf_tool *tool,
-					 union perf_event *event, pid_t pid,
-					 perf_event__handler_t process,
-					 struct machine *machine)
-{
-	pid_t tgid, ppid;
-
-	if (perf_event__prepare_comm(event, pid, machine, &tgid, &ppid) != 0)
-		return -1;
-
-	if (perf_tool__process_synth_event(tool, event, machine, process) != 0)
-		return -1;
-
-	return tgid;
-}
-
-static void perf_event__get_ns_link_info(pid_t pid, const char *ns,
-					 struct perf_ns_link_info *ns_link_info)
-{
-	struct stat64 st;
-	char proc_ns[128];
-
-	sprintf(proc_ns, "/proc/%u/ns/%s", pid, ns);
-	if (stat64(proc_ns, &st) == 0) {
-		ns_link_info->dev = st.st_dev;
-		ns_link_info->ino = st.st_ino;
-	}
-}
-
-int perf_event__synthesize_namespaces(struct perf_tool *tool,
-				      union perf_event *event,
-				      pid_t pid, pid_t tgid,
-				      perf_event__handler_t process,
-				      struct machine *machine)
-{
-	u32 idx;
-	struct perf_ns_link_info *ns_link_info;
-
-	if (!tool || !tool->namespace_events)
-		return 0;
-
-	memset(&event->namespaces, 0, (sizeof(event->namespaces) +
-	       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
-	       machine->id_hdr_size));
-
-	event->namespaces.pid = tgid;
-	event->namespaces.tid = pid;
-
-	event->namespaces.nr_namespaces = NR_NAMESPACES;
-
-	ns_link_info = event->namespaces.link_info;
-
-	for (idx = 0; idx < event->namespaces.nr_namespaces; idx++)
-		perf_event__get_ns_link_info(pid, perf_ns__name(idx),
-					     &ns_link_info[idx]);
-
-	event->namespaces.header.type = PERF_RECORD_NAMESPACES;
-
-	event->namespaces.header.size = (sizeof(event->namespaces) +
-			(NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
-			machine->id_hdr_size);
-
-	if (perf_tool__process_synth_event(tool, event, machine, process) != 0)
-		return -1;
-
-	return 0;
-}
-
-static int perf_event__synthesize_fork(struct perf_tool *tool,
-				       union perf_event *event,
-				       pid_t pid, pid_t tgid, pid_t ppid,
-				       perf_event__handler_t process,
-				       struct machine *machine)
-{
-	memset(&event->fork, 0, sizeof(event->fork) + machine->id_hdr_size);
-
-	/*
-	 * for main thread set parent to ppid from status file. For other
-	 * threads set parent pid to main thread. i.e., assume main thread
-	 * spawns all threads in a process.
-	 */
-	if (tgid == pid) {
-		event->fork.ppid = ppid;
-		event->fork.ptid = ppid;
-	} else {
-		event->fork.ppid = tgid;
-		event->fork.ptid = tgid;
-	}
-	event->fork.pid  = tgid;
-	event->fork.tid  = pid;
-	event->fork.header.type = PERF_RECORD_FORK;
-	event->fork.header.misc = PERF_RECORD_MISC_FORK_EXEC;
-
-	event->fork.header.size = (sizeof(event->fork) + machine->id_hdr_size);
-
-	if (perf_tool__process_synth_event(tool, event, machine, process) != 0)
-		return -1;
-
-	return 0;
-}
-
-int perf_event__synthesize_mmap_events(struct perf_tool *tool,
-				       union perf_event *event,
-				       pid_t pid, pid_t tgid,
-				       perf_event__handler_t process,
-				       struct machine *machine,
-				       bool mmap_data)
-{
-	char filename[PATH_MAX];
-	FILE *fp;
-	unsigned long long t;
-	bool truncation = false;
-	unsigned long long timeout = proc_map_timeout * 1000000ULL;
-	int rc = 0;
-	const char *hugetlbfs_mnt = hugetlbfs__mountpoint();
-	int hugetlbfs_mnt_len = hugetlbfs_mnt ? strlen(hugetlbfs_mnt) : 0;
-
-	if (machine__is_default_guest(machine))
-		return 0;
-
-	snprintf(filename, sizeof(filename), "%s/proc/%d/task/%d/maps",
-		 machine->root_dir, pid, pid);
-
-	fp = fopen(filename, "r");
-	if (fp == NULL) {
-		/*
-		 * We raced with a task exiting - just return:
-		 */
-		pr_debug("couldn't open %s\n", filename);
-		return -1;
-	}
-
-	event->header.type = PERF_RECORD_MMAP2;
-	t = rdclock();
-
-	while (1) {
-		char bf[BUFSIZ];
-		char prot[5];
-		char execname[PATH_MAX];
-		char anonstr[] = "//anon";
-		unsigned int ino;
-		size_t size;
-		ssize_t n;
-
-		if (fgets(bf, sizeof(bf), fp) == NULL)
-			break;
-
-		if ((rdclock() - t) > timeout) {
-			pr_warning("Reading %s time out. "
-				   "You may want to increase "
-				   "the time limit by --proc-map-timeout\n",
-				   filename);
-			truncation = true;
-			goto out;
-		}
-
-		/* ensure null termination since stack will be reused. */
-		strcpy(execname, "");
-
-		/* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
-		n = sscanf(bf, "%"PRI_lx64"-%"PRI_lx64" %s %"PRI_lx64" %x:%x %u %[^\n]\n",
-		       &event->mmap2.start, &event->mmap2.len, prot,
-		       &event->mmap2.pgoff, &event->mmap2.maj,
-		       &event->mmap2.min,
-		       &ino, execname);
-
-		/*
- 		 * Anon maps don't have the execname.
- 		 */
-		if (n < 7)
-			continue;
-
-		event->mmap2.ino = (u64)ino;
-
-		/*
-		 * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c
-		 */
-		if (machine__is_host(machine))
-			event->header.misc = PERF_RECORD_MISC_USER;
-		else
-			event->header.misc = PERF_RECORD_MISC_GUEST_USER;
-
-		/* map protection and flags bits */
-		event->mmap2.prot = 0;
-		event->mmap2.flags = 0;
-		if (prot[0] == 'r')
-			event->mmap2.prot |= PROT_READ;
-		if (prot[1] == 'w')
-			event->mmap2.prot |= PROT_WRITE;
-		if (prot[2] == 'x')
-			event->mmap2.prot |= PROT_EXEC;
-
-		if (prot[3] == 's')
-			event->mmap2.flags |= MAP_SHARED;
-		else
-			event->mmap2.flags |= MAP_PRIVATE;
-
-		if (prot[2] != 'x') {
-			if (!mmap_data || prot[0] != 'r')
-				continue;
-
-			event->header.misc |= PERF_RECORD_MISC_MMAP_DATA;
-		}
-
-out:
-		if (truncation)
-			event->header.misc |= PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT;
-
-		if (!strcmp(execname, ""))
-			strcpy(execname, anonstr);
-
-		if (hugetlbfs_mnt_len &&
-		    !strncmp(execname, hugetlbfs_mnt, hugetlbfs_mnt_len)) {
-			strcpy(execname, anonstr);
-			event->mmap2.flags |= MAP_HUGETLB;
-		}
-
-		size = strlen(execname) + 1;
-		memcpy(event->mmap2.filename, execname, size);
-		size = PERF_ALIGN(size, sizeof(u64));
-		event->mmap2.len -= event->mmap.start;
-		event->mmap2.header.size = (sizeof(event->mmap2) -
-					(sizeof(event->mmap2.filename) - size));
-		memset(event->mmap2.filename + size, 0, machine->id_hdr_size);
-		event->mmap2.header.size += machine->id_hdr_size;
-		event->mmap2.pid = tgid;
-		event->mmap2.tid = pid;
-
-		if (perf_tool__process_synth_event(tool, event, machine, process) != 0) {
-			rc = -1;
-			break;
-		}
-
-		if (truncation)
-			break;
-	}
-
-	fclose(fp);
-	return rc;
-}
-
-int perf_event__synthesize_modules(struct perf_tool *tool,
-				   perf_event__handler_t process,
-				   struct machine *machine)
-{
-	int rc = 0;
-	struct map *pos;
-	struct maps *maps = machine__kernel_maps(machine);
-	union perf_event *event = zalloc((sizeof(event->mmap) +
-					  machine->id_hdr_size));
-	if (event == NULL) {
-		pr_debug("Not enough memory synthesizing mmap event "
-			 "for kernel modules\n");
-		return -1;
-	}
-
-	event->header.type = PERF_RECORD_MMAP;
-
-	/*
-	 * kernel uses 0 for user space maps, see kernel/perf_event.c
-	 * __perf_event_mmap
-	 */
-	if (machine__is_host(machine))
-		event->header.misc = PERF_RECORD_MISC_KERNEL;
-	else
-		event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
-
-	for (pos = maps__first(maps); pos; pos = map__next(pos)) {
-		size_t size;
-
-		if (!__map__is_kmodule(pos))
-			continue;
-
-		size = PERF_ALIGN(pos->dso->long_name_len + 1, sizeof(u64));
-		event->mmap.header.type = PERF_RECORD_MMAP;
-		event->mmap.header.size = (sizeof(event->mmap) -
-				        (sizeof(event->mmap.filename) - size));
-		memset(event->mmap.filename + size, 0, machine->id_hdr_size);
-		event->mmap.header.size += machine->id_hdr_size;
-		event->mmap.start = pos->start;
-		event->mmap.len   = pos->end - pos->start;
-		event->mmap.pid   = machine->pid;
-
-		memcpy(event->mmap.filename, pos->dso->long_name,
-		       pos->dso->long_name_len + 1);
-		if (perf_tool__process_synth_event(tool, event, machine, process) != 0) {
-			rc = -1;
-			break;
-		}
-	}
-
-	free(event);
-	return rc;
-}
-
-static int __event__synthesize_thread(union perf_event *comm_event,
-				      union perf_event *mmap_event,
-				      union perf_event *fork_event,
-				      union perf_event *namespaces_event,
-				      pid_t pid, int full,
-				      perf_event__handler_t process,
-				      struct perf_tool *tool,
-				      struct machine *machine,
-				      bool mmap_data)
-{
-	char filename[PATH_MAX];
-	DIR *tasks;
-	struct dirent *dirent;
-	pid_t tgid, ppid;
-	int rc = 0;
-
-	/* special case: only send one comm event using passed in pid */
-	if (!full) {
-		tgid = perf_event__synthesize_comm(tool, comm_event, pid,
-						   process, machine);
-
-		if (tgid == -1)
-			return -1;
-
-		if (perf_event__synthesize_namespaces(tool, namespaces_event, pid,
-						      tgid, process, machine) < 0)
-			return -1;
-
-		/*
-		 * send mmap only for thread group leader
-		 * see thread__init_map_groups
-		 */
-		if (pid == tgid &&
-		    perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
-						       process, machine, mmap_data))
-			return -1;
-
-		return 0;
-	}
-
-	if (machine__is_default_guest(machine))
-		return 0;
-
-	snprintf(filename, sizeof(filename), "%s/proc/%d/task",
-		 machine->root_dir, pid);
-
-	tasks = opendir(filename);
-	if (tasks == NULL) {
-		pr_debug("couldn't open %s\n", filename);
-		return 0;
-	}
-
-	while ((dirent = readdir(tasks)) != NULL) {
-		char *end;
-		pid_t _pid;
-
-		_pid = strtol(dirent->d_name, &end, 10);
-		if (*end)
-			continue;
-
-		rc = -1;
-		if (perf_event__prepare_comm(comm_event, _pid, machine,
-					     &tgid, &ppid) != 0)
-			break;
-
-		if (perf_event__synthesize_fork(tool, fork_event, _pid, tgid,
-						ppid, process, machine) < 0)
-			break;
-
-		if (perf_event__synthesize_namespaces(tool, namespaces_event, _pid,
-						      tgid, process, machine) < 0)
-			break;
-
-		/*
-		 * Send the prepared comm event
-		 */
-		if (perf_tool__process_synth_event(tool, comm_event, machine, process) != 0)
-			break;
-
-		rc = 0;
-		if (_pid == pid) {
-			/* process the parent's maps too */
-			rc = perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
-						process, machine, mmap_data);
-			if (rc)
-				break;
-		}
-	}
-
-	closedir(tasks);
-	return rc;
-}
-
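The readdir() loop above relies on strtol()'s end pointer to skip ".", ".." and
anything else non-numeric under /proc/<pid>/task. The same filter in isolation,
with the perf plumbing stripped out:

	#include <dirent.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/types.h>

	/* Print every numeric entry, i.e. every tid, under /proc/<pid>/task. */
	static void list_tids(pid_t pid)
	{
		char path[64];
		struct dirent *dent;
		DIR *dir;

		snprintf(path, sizeof(path), "/proc/%d/task", (int)pid);
		dir = opendir(path);
		if (dir == NULL)
			return;

		while ((dent = readdir(dir)) != NULL) {
			char *end;
			long tid = strtol(dent->d_name, &end, 10);

			if (*end)	/* not purely numeric: ".", "..", ... */
				continue;
			printf("tid %ld\n", tid);
		}
		closedir(dir);
	}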
-int perf_event__synthesize_thread_map(struct perf_tool *tool,
-				      struct perf_thread_map *threads,
-				      perf_event__handler_t process,
-				      struct machine *machine,
-				      bool mmap_data)
-{
-	union perf_event *comm_event, *mmap_event, *fork_event;
-	union perf_event *namespaces_event;
-	int err = -1, thread, j;
-
-	comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
-	if (comm_event == NULL)
-		goto out;
-
-	mmap_event = malloc(sizeof(mmap_event->mmap2) + machine->id_hdr_size);
-	if (mmap_event == NULL)
-		goto out_free_comm;
-
-	fork_event = malloc(sizeof(fork_event->fork) + machine->id_hdr_size);
-	if (fork_event == NULL)
-		goto out_free_mmap;
-
-	namespaces_event = malloc(sizeof(namespaces_event->namespaces) +
-				  (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
-				  machine->id_hdr_size);
-	if (namespaces_event == NULL)
-		goto out_free_fork;
-
-	err = 0;
-	for (thread = 0; thread < threads->nr; ++thread) {
-		if (__event__synthesize_thread(comm_event, mmap_event,
-					       fork_event, namespaces_event,
-					       perf_thread_map__pid(threads, thread), 0,
-					       process, tool, machine,
-					       mmap_data)) {
-			err = -1;
-			break;
-		}
-
-		/*
-		 * comm.pid is set to thread group id by
-		 * perf_event__synthesize_comm
-		 */
-		if ((int) comm_event->comm.pid != perf_thread_map__pid(threads, thread)) {
-			bool need_leader = true;
-
-			/* is thread group leader in thread_map? */
-			for (j = 0; j < threads->nr; ++j) {
-				if ((int) comm_event->comm.pid == perf_thread_map__pid(threads, j)) {
-					need_leader = false;
-					break;
-				}
-			}
-
-			/* if not, generate events for it */
-			if (need_leader &&
-			    __event__synthesize_thread(comm_event, mmap_event,
-						       fork_event, namespaces_event,
-						       comm_event->comm.pid, 0,
-						       process, tool, machine,
-						       mmap_data)) {
-				err = -1;
-				break;
-			}
-		}
-	}
-	free(namespaces_event);
-out_free_fork:
-	free(fork_event);
-out_free_mmap:
-	free(mmap_event);
-out_free_comm:
-	free(comm_event);
-out:
-	return err;
-}
-
-static int __perf_event__synthesize_threads(struct perf_tool *tool,
-					    perf_event__handler_t process,
-					    struct machine *machine,
-					    bool mmap_data,
-					    struct dirent **dirent,
-					    int start,
-					    int num)
-{
-	union perf_event *comm_event, *mmap_event, *fork_event;
-	union perf_event *namespaces_event;
-	int err = -1;
-	char *end;
-	pid_t pid;
-	int i;
-
-	comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
-	if (comm_event == NULL)
-		goto out;
-
-	mmap_event = malloc(sizeof(mmap_event->mmap2) + machine->id_hdr_size);
-	if (mmap_event == NULL)
-		goto out_free_comm;
-
-	fork_event = malloc(sizeof(fork_event->fork) + machine->id_hdr_size);
-	if (fork_event == NULL)
-		goto out_free_mmap;
-
-	namespaces_event = malloc(sizeof(namespaces_event->namespaces) +
-				  (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
-				  machine->id_hdr_size);
-	if (namespaces_event == NULL)
-		goto out_free_fork;
-
-	for (i = start; i < start + num; i++) {
-		if (!isdigit(dirent[i]->d_name[0]))
-			continue;
-
-		pid = (pid_t)strtol(dirent[i]->d_name, &end, 10);
-		/* only interested in proper numerical dirents */
-		if (*end)
-			continue;
-		/*
-		 * We may race with an exiting thread, so don't stop just because
-		 * one thread couldn't be synthesized.
-		 */
-		__event__synthesize_thread(comm_event, mmap_event, fork_event,
-					   namespaces_event, pid, 1, process,
-					   tool, machine, mmap_data);
-	}
-	err = 0;
-
-	free(namespaces_event);
-out_free_fork:
-	free(fork_event);
-out_free_mmap:
-	free(mmap_event);
-out_free_comm:
-	free(comm_event);
-out:
-	return err;
-}
-
-struct synthesize_threads_arg {
-	struct perf_tool *tool;
-	perf_event__handler_t process;
-	struct machine *machine;
-	bool mmap_data;
-	struct dirent **dirent;
-	int num;
-	int start;
-};
-
-static void *synthesize_threads_worker(void *arg)
-{
-	struct synthesize_threads_arg *args = arg;
-
-	__perf_event__synthesize_threads(args->tool, args->process,
-					 args->machine, args->mmap_data,
-					 args->dirent,
-					 args->start, args->num);
-	return NULL;
-}
-
-int perf_event__synthesize_threads(struct perf_tool *tool,
-				   perf_event__handler_t process,
-				   struct machine *machine,
-				   bool mmap_data,
-				   unsigned int nr_threads_synthesize)
-{
-	struct synthesize_threads_arg *args = NULL;
-	pthread_t *synthesize_threads = NULL;
-	char proc_path[PATH_MAX];
-	struct dirent **dirent;
-	int num_per_thread;
-	int m, n, i, j;
-	int thread_nr;
-	int base = 0;
-	int err = -1;
-
-
-	if (machine__is_default_guest(machine))
-		return 0;
-
-	snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
-	n = scandir(proc_path, &dirent, 0, alphasort);
-	if (n < 0)
-		return err;
-
-	if (nr_threads_synthesize == UINT_MAX)
-		thread_nr = sysconf(_SC_NPROCESSORS_ONLN);
-	else
-		thread_nr = nr_threads_synthesize;
-
-	if (thread_nr <= 1) {
-		err = __perf_event__synthesize_threads(tool, process,
-						       machine, mmap_data,
-						       dirent, base, n);
-		goto free_dirent;
-	}
-	if (thread_nr > n)
-		thread_nr = n;
-
-	synthesize_threads = calloc(thread_nr, sizeof(pthread_t));
-	if (synthesize_threads == NULL)
-		goto free_dirent;
-
-	args = calloc(thread_nr, sizeof(*args));
-	if (args == NULL)
-		goto free_threads;
-
-	num_per_thread = n / thread_nr;
-	m = n % thread_nr;
-	for (i = 0; i < thread_nr; i++) {
-		args[i].tool = tool;
-		args[i].process = process;
-		args[i].machine = machine;
-		args[i].mmap_data = mmap_data;
-		args[i].dirent = dirent;
-	}
-	for (i = 0; i < m; i++) {
-		args[i].num = num_per_thread + 1;
-		args[i].start = i * args[i].num;
-	}
-	if (i != 0)
-		base = args[i-1].start + args[i-1].num;
-	for (j = i; j < thread_nr; j++) {
-		args[j].num = num_per_thread;
-		args[j].start = base + (j - i) * args[i].num;
-	}
-
-	for (i = 0; i < thread_nr; i++) {
-		if (pthread_create(&synthesize_threads[i], NULL,
-				   synthesize_threads_worker, &args[i]))
-			goto out_join;
-	}
-	err = 0;
-out_join:
-	for (i = 0; i < thread_nr; i++)
-		pthread_join(synthesize_threads[i], NULL);
-	free(args);
-free_threads:
-	free(synthesize_threads);
-free_dirent:
-	for (i = 0; i < n; i++)
-		zfree(&dirent[i]);
-	free(dirent);
-
-	return err;
-}
-
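The split above hands the first n % thread_nr workers one extra dirent each:
with n = 10 entries and thread_nr = 4, num_per_thread = 2 and m = 2, so the
workers get (start, num) = (0, 3), (3, 3), (6, 2) and (8, 2), covering all ten
entries exactly once. Note the second loop can read args[i].num safely because
its j == i iteration assigns num_per_thread to it first.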
 struct process_symbol_args {
 	const char *name;
 	u64	   start;
@@ -899,327 +117,6 @@
 	return 0;
 }
 
-int __weak perf_event__synthesize_extra_kmaps(struct perf_tool *tool __maybe_unused,
-					      perf_event__handler_t process __maybe_unused,
-					      struct machine *machine __maybe_unused)
-{
-	return 0;
-}
-
-static int __perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
-						perf_event__handler_t process,
-						struct machine *machine)
-{
-	size_t size;
-	struct map *map = machine__kernel_map(machine);
-	struct kmap *kmap;
-	int err;
-	union perf_event *event;
-
-	if (map == NULL)
-		return -1;
-
-	kmap = map__kmap(map);
-	if (!kmap->ref_reloc_sym)
-		return -1;
-
-	/*
-	 * We should get this from /sys/kernel/sections/.text, but until that
-	 * is available use this, and once it is available keep this as a
-	 * fallback for older kernels.
-	 */
-	event = zalloc((sizeof(event->mmap) + machine->id_hdr_size));
-	if (event == NULL) {
-		pr_debug("Not enough memory synthesizing mmap event "
-			 "for the kernel map\n");
-		return -1;
-	}
-
-	if (machine__is_host(machine)) {
-		/*
-		 * kernel uses PERF_RECORD_MISC_USER for user space maps,
-		 * see kernel/perf_event.c __perf_event_mmap
-		 */
-		event->header.misc = PERF_RECORD_MISC_KERNEL;
-	} else {
-		event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
-	}
-
-	size = snprintf(event->mmap.filename, sizeof(event->mmap.filename),
-			"%s%s", machine->mmap_name, kmap->ref_reloc_sym->name) + 1;
-	size = PERF_ALIGN(size, sizeof(u64));
-	event->mmap.header.type = PERF_RECORD_MMAP;
-	event->mmap.header.size = (sizeof(event->mmap) -
-			(sizeof(event->mmap.filename) - size) + machine->id_hdr_size);
-	event->mmap.pgoff = kmap->ref_reloc_sym->addr;
-	event->mmap.start = map->start;
-	event->mmap.len   = map->end - event->mmap.start;
-	event->mmap.pid   = machine->pid;
-
-	err = perf_tool__process_synth_event(tool, event, machine, process);
-	free(event);
-
-	return err;
-}
-
-int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
-				       perf_event__handler_t process,
-				       struct machine *machine)
-{
-	int err;
-
-	err = __perf_event__synthesize_kernel_mmap(tool, process, machine);
-	if (err < 0)
-		return err;
-
-	return perf_event__synthesize_extra_kmaps(tool, process, machine);
-}
-
-int perf_event__synthesize_thread_map2(struct perf_tool *tool,
-				      struct perf_thread_map *threads,
-				      perf_event__handler_t process,
-				      struct machine *machine)
-{
-	union perf_event *event;
-	int i, err, size;
-
-	size  = sizeof(event->thread_map);
-	size +=	threads->nr * sizeof(event->thread_map.entries[0]);
-
-	event = zalloc(size);
-	if (!event)
-		return -ENOMEM;
-
-	event->header.type = PERF_RECORD_THREAD_MAP;
-	event->header.size = size;
-	event->thread_map.nr = threads->nr;
-
-	for (i = 0; i < threads->nr; i++) {
-		struct perf_record_thread_map_entry *entry = &event->thread_map.entries[i];
-		char *comm = perf_thread_map__comm(threads, i);
-
-		if (!comm)
-			comm = (char *) "";
-
-		entry->pid = perf_thread_map__pid(threads, i);
-		strncpy((char *) &entry->comm, comm, sizeof(entry->comm));
-	}
-
-	err = process(tool, event, NULL, machine);
-
-	free(event);
-	return err;
-}
-
-static void synthesize_cpus(struct cpu_map_entries *cpus,
-			    struct perf_cpu_map *map)
-{
-	int i;
-
-	cpus->nr = map->nr;
-
-	for (i = 0; i < map->nr; i++)
-		cpus->cpu[i] = map->map[i];
-}
-
-static void synthesize_mask(struct perf_record_record_cpu_map *mask,
-			    struct perf_cpu_map *map, int max)
-{
-	int i;
-
-	mask->nr = BITS_TO_LONGS(max);
-	mask->long_size = sizeof(long);
-
-	for (i = 0; i < map->nr; i++)
-		set_bit(map->map[i], mask->mask);
-}
-
-static size_t cpus_size(struct perf_cpu_map *map)
-{
-	return sizeof(struct cpu_map_entries) + map->nr * sizeof(u16);
-}
-
-static size_t mask_size(struct perf_cpu_map *map, int *max)
-{
-	int i;
-
-	*max = 0;
-
-	for (i = 0; i < map->nr; i++) {
-		/* bit position of the cpu is + 1 */
-		int bit = map->map[i] + 1;
-
-		if (bit > *max)
-			*max = bit;
-	}
-
-	return sizeof(struct perf_record_record_cpu_map) + BITS_TO_LONGS(*max) * sizeof(long);
-}
-
-void *cpu_map_data__alloc(struct perf_cpu_map *map, size_t *size, u16 *type, int *max)
-{
-	size_t size_cpus, size_mask;
-	bool is_dummy = perf_cpu_map__empty(map);
-
-	/*
-	 * Both array and mask data have variable size based
-	 * on the number of cpus and their actual values.
-	 * The size of the 'struct perf_record_cpu_map_data' is:
-	 *
-	 *   array = size of 'struct cpu_map_entries' +
-	 *           number of cpus * sizeof(u64)
-	 *
-	 *   mask  = size of 'struct perf_record_record_cpu_map' +
-	 *           maximum cpu bit converted to size of longs
-	 *
-	 * and finally + the size of 'struct perf_record_cpu_map_data'.
-	 */
-	size_cpus = cpus_size(map);
-	size_mask = mask_size(map, max);
-
-	if (is_dummy || (size_cpus < size_mask)) {
-		*size += size_cpus;
-		*type  = PERF_CPU_MAP__CPUS;
-	} else {
-		*size += size_mask;
-		*type  = PERF_CPU_MAP__MASK;
-	}
-
-	*size += sizeof(struct perf_record_cpu_map_data);
-	*size = PERF_ALIGN(*size, sizeof(u64));
-	return zalloc(*size);
-}
-
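As a worked example of the array-versus-mask choice on a 64-bit build: a dense
map of CPUs 0-1023 needs 1024 u16 entries (2048 bytes) in array form but only
BITS_TO_LONGS(1024) = 16 longs (128 bytes) as a mask, so PERF_CPU_MAP__MASK
wins; a sparse map holding just CPU 1023 needs a single u16 in array form but
still the full 128-byte bitmap, so PERF_CPU_MAP__CPUS is chosen instead.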
-void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, struct perf_cpu_map *map,
-			      u16 type, int max)
-{
-	data->type = type;
-
-	switch (type) {
-	case PERF_CPU_MAP__CPUS:
-		synthesize_cpus((struct cpu_map_entries *) data->data, map);
-		break;
-	case PERF_CPU_MAP__MASK:
-		synthesize_mask((struct perf_record_record_cpu_map *)data->data, map, max);
-		break;
-	default:
-		break;
-	}
-}
-
-static struct perf_record_cpu_map *cpu_map_event__new(struct perf_cpu_map *map)
-{
-	size_t size = sizeof(struct perf_record_cpu_map);
-	struct perf_record_cpu_map *event;
-	int max;
-	u16 type;
-
-	event = cpu_map_data__alloc(map, &size, &type, &max);
-	if (!event)
-		return NULL;
-
-	event->header.type = PERF_RECORD_CPU_MAP;
-	event->header.size = size;
-	event->data.type   = type;
-
-	cpu_map_data__synthesize(&event->data, map, type, max);
-	return event;
-}
-
-int perf_event__synthesize_cpu_map(struct perf_tool *tool,
-				   struct perf_cpu_map *map,
-				   perf_event__handler_t process,
-				   struct machine *machine)
-{
-	struct perf_record_cpu_map *event;
-	int err;
-
-	event = cpu_map_event__new(map);
-	if (!event)
-		return -ENOMEM;
-
-	err = process(tool, (union perf_event *) event, NULL, machine);
-
-	free(event);
-	return err;
-}
-
-int perf_event__synthesize_stat_config(struct perf_tool *tool,
-				       struct perf_stat_config *config,
-				       perf_event__handler_t process,
-				       struct machine *machine)
-{
-	struct perf_record_stat_config *event;
-	int size, i = 0, err;
-
-	size  = sizeof(*event);
-	size += (PERF_STAT_CONFIG_TERM__MAX * sizeof(event->data[0]));
-
-	event = zalloc(size);
-	if (!event)
-		return -ENOMEM;
-
-	event->header.type = PERF_RECORD_STAT_CONFIG;
-	event->header.size = size;
-	event->nr          = PERF_STAT_CONFIG_TERM__MAX;
-
-#define ADD(__term, __val)					\
-	event->data[i].tag = PERF_STAT_CONFIG_TERM__##__term;	\
-	event->data[i].val = __val;				\
-	i++;
-
-	ADD(AGGR_MODE,	config->aggr_mode)
-	ADD(INTERVAL,	config->interval)
-	ADD(SCALE,	config->scale)
-
-	WARN_ONCE(i != PERF_STAT_CONFIG_TERM__MAX,
-		  "stat config terms unbalanced\n");
-#undef ADD
-
-	err = process(tool, (union perf_event *) event, NULL, machine);
-
-	free(event);
-	return err;
-}
-
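The ADD() helper expands to a tag/value assignment plus an index bump, so
ADD(AGGR_MODE, config->aggr_mode) above becomes:

	event->data[i].tag = PERF_STAT_CONFIG_TERM__AGGR_MODE;
	event->data[i].val = config->aggr_mode;
	i++;

and the WARN_ONCE() catches any PERF_STAT_CONFIG_TERM__* value added to the
enum without a matching ADD() line here.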
-int perf_event__synthesize_stat(struct perf_tool *tool,
-				u32 cpu, u32 thread, u64 id,
-				struct perf_counts_values *count,
-				perf_event__handler_t process,
-				struct machine *machine)
-{
-	struct perf_record_stat event;
-
-	event.header.type = PERF_RECORD_STAT;
-	event.header.size = sizeof(event);
-	event.header.misc = 0;
-
-	event.id        = id;
-	event.cpu       = cpu;
-	event.thread    = thread;
-	event.val       = count->val;
-	event.ena       = count->ena;
-	event.run       = count->run;
-
-	return process(tool, (union perf_event *) &event, NULL, machine);
-}
-
-int perf_event__synthesize_stat_round(struct perf_tool *tool,
-				      u64 evtime, u64 type,
-				      perf_event__handler_t process,
-				      struct machine *machine)
-{
-	struct perf_record_stat_round event;
-
-	event.header.type = PERF_RECORD_STAT_ROUND;
-	event.header.size = sizeof(event);
-	event.header.misc = 0;
-
-	event.time = evtime;
-	event.type = type;
-
-	return process(tool, (union perf_event *) &event, NULL, machine);
-}
-
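These two stay paired in 'perf stat record': each interval emits one such stat
event per cpu/thread/counter and then a single stat_round carrying the
timestamp, roughly as below (error handling omitted; PERF_STAT_ROUND_TYPE__INTERVAL
is the interval marker used by perf stat):

	perf_event__synthesize_stat(tool, cpu, thread, counter_id, &count,
				    process, machine);
	perf_event__synthesize_stat_round(tool, walltime_ns,
					  PERF_STAT_ROUND_TYPE__INTERVAL,
					  process, machine);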
 void perf_event__read_stat_config(struct perf_stat_config *config,
 				  struct perf_record_stat_config *event)
 {
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 47ad81d..a0a0c91 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -279,54 +279,13 @@
 
 void perf_event__print_totals(void);
 
-struct perf_tool;
-struct perf_thread_map;
 struct perf_cpu_map;
+struct perf_record_stat_config;
 struct perf_stat_config;
-struct perf_counts_values;
+struct perf_tool;
 
-typedef int (*perf_event__handler_t)(struct perf_tool *tool,
-				     union perf_event *event,
-				     struct perf_sample *sample,
-				     struct machine *machine);
-
-int perf_event__synthesize_thread_map(struct perf_tool *tool,
-				      struct perf_thread_map *threads,
-				      perf_event__handler_t process,
-				      struct machine *machine, bool mmap_data);
-int perf_event__synthesize_thread_map2(struct perf_tool *tool,
-				      struct perf_thread_map *threads,
-				      perf_event__handler_t process,
-				      struct machine *machine);
-int perf_event__synthesize_cpu_map(struct perf_tool *tool,
-				   struct perf_cpu_map *cpus,
-				   perf_event__handler_t process,
-				   struct machine *machine);
-int perf_event__synthesize_threads(struct perf_tool *tool,
-				   perf_event__handler_t process,
-				   struct machine *machine, bool mmap_data,
-				   unsigned int nr_threads_synthesize);
-int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
-				       perf_event__handler_t process,
-				       struct machine *machine);
-int perf_event__synthesize_stat_config(struct perf_tool *tool,
-				       struct perf_stat_config *config,
-				       perf_event__handler_t process,
-				       struct machine *machine);
 void perf_event__read_stat_config(struct perf_stat_config *config,
 				  struct perf_record_stat_config *event);
-int perf_event__synthesize_stat(struct perf_tool *tool,
-				u32 cpu, u32 thread, u64 id,
-				struct perf_counts_values *count,
-				perf_event__handler_t process,
-				struct machine *machine);
-int perf_event__synthesize_stat_round(struct perf_tool *tool,
-				      u64 time, u64 type,
-				      perf_event__handler_t process,
-				      struct machine *machine);
-int perf_event__synthesize_modules(struct perf_tool *tool,
-				   perf_event__handler_t process,
-				   struct machine *machine);
 
 int perf_event__process_comm(struct perf_tool *tool,
 			     union perf_event *event,
@@ -380,10 +339,6 @@
 			    union perf_event *event,
 			    struct perf_sample *sample,
 			    struct machine *machine);
-int perf_tool__process_synth_event(struct perf_tool *tool,
-				   union perf_event *event,
-				   struct machine *machine,
-				   perf_event__handler_t process);
 int perf_event__process(struct perf_tool *tool,
 			union perf_event *event,
 			struct perf_sample *sample,
@@ -405,34 +360,6 @@
 
 const char *perf_event__name(unsigned int id);
 
-size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,
-				     u64 read_format);
-int perf_event__synthesize_sample(union perf_event *event, u64 type,
-				  u64 read_format,
-				  const struct perf_sample *sample);
-
-pid_t perf_event__synthesize_comm(struct perf_tool *tool,
-				  union perf_event *event, pid_t pid,
-				  perf_event__handler_t process,
-				  struct machine *machine);
-
-int perf_event__synthesize_namespaces(struct perf_tool *tool,
-				      union perf_event *event,
-				      pid_t pid, pid_t tgid,
-				      perf_event__handler_t process,
-				      struct machine *machine);
-
-int perf_event__synthesize_mmap_events(struct perf_tool *tool,
-				       union perf_event *event,
-				       pid_t pid, pid_t tgid,
-				       perf_event__handler_t process,
-				       struct machine *machine,
-				       bool mmap_data);
-
-int perf_event__synthesize_extra_kmaps(struct perf_tool *tool,
-				       perf_event__handler_t process,
-				       struct machine *machine);
-
 size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_mmap2(union perf_event *event, FILE *fp);
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 095924a..d277a98 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -10,13 +10,14 @@
 #include <inttypes.h>
 #include <poll.h>
 #include "cpumap.h"
+#include "util/mmap.h"
 #include "thread_map.h"
 #include "target.h"
 #include "evlist.h"
 #include "evsel.h"
 #include "debug.h"
 #include "units.h"
-#include "util.h"
+#include <internal/lib.h> // page_size
 #include "../perf.h"
 #include "asm/bug.h"
 #include "bpf-event.h"
@@ -49,18 +50,14 @@
 #endif
 
 #define FD(e, x, y) (*(int *)xyarray__entry(e->core.fd, x, y))
-#define SID(e, x, y) xyarray__entry(e->sample_id, x, y)
+#define SID(e, x, y) xyarray__entry(e->core.sample_id, x, y)
 
 void evlist__init(struct evlist *evlist, struct perf_cpu_map *cpus,
 		  struct perf_thread_map *threads)
 {
-	int i;
-
-	for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i)
-		INIT_HLIST_HEAD(&evlist->heads[i]);
 	perf_evlist__init(&evlist->core);
 	perf_evlist__set_maps(&evlist->core, cpus, threads);
-	fdarray__init(&evlist->pollfd, 64);
+	fdarray__init(&evlist->core.pollfd, 64);
 	evlist->workload.pid = -1;
 	evlist->bkw_mmap_state = BKW_MMAP_NOTREADY;
 }
@@ -108,7 +105,7 @@
  */
 void perf_evlist__set_id_pos(struct evlist *evlist)
 {
-	struct evsel *first = perf_evlist__first(evlist);
+	struct evsel *first = evlist__first(evlist);
 
 	evlist->id_pos = first->id_pos;
 	evlist->is_pos = first->is_pos;
@@ -124,7 +121,7 @@
 	perf_evlist__set_id_pos(evlist);
 }
 
-static void perf_evlist__purge(struct evlist *evlist)
+static void evlist__purge(struct evlist *evlist)
 {
 	struct evsel *pos, *n;
 
@@ -137,11 +134,11 @@
 	evlist->core.nr_entries = 0;
 }
 
-void perf_evlist__exit(struct evlist *evlist)
+void evlist__exit(struct evlist *evlist)
 {
 	zfree(&evlist->mmap);
 	zfree(&evlist->overwrite_mmap);
-	fdarray__exit(&evlist->pollfd);
+	fdarray__exit(&evlist->core.pollfd);
 }
 
 void evlist__delete(struct evlist *evlist)
@@ -149,14 +146,14 @@
 	if (evlist == NULL)
 		return;
 
-	perf_evlist__munmap(evlist);
+	evlist__munmap(evlist);
 	evlist__close(evlist);
 	perf_cpu_map__put(evlist->core.cpus);
 	perf_thread_map__put(evlist->core.threads);
 	evlist->core.cpus = NULL;
 	evlist->core.threads = NULL;
-	perf_evlist__purge(evlist);
-	perf_evlist__exit(evlist);
+	evlist__purge(evlist);
+	evlist__exit(evlist);
 	free(evlist);
 }
 
@@ -318,7 +315,7 @@
 static int perf_evlist__nr_threads(struct evlist *evlist,
 				   struct evsel *evsel)
 {
-	if (evsel->system_wide)
+	if (evsel->core.system_wide)
 		return 1;
 	else
 		return perf_thread_map__nr(evlist->core.threads);
@@ -401,128 +398,29 @@
 		return perf_evlist__enable_event_thread(evlist, evsel, idx);
 }
 
-int perf_evlist__alloc_pollfd(struct evlist *evlist)
+int evlist__add_pollfd(struct evlist *evlist, int fd)
 {
-	int nr_cpus = perf_cpu_map__nr(evlist->core.cpus);
-	int nr_threads = perf_thread_map__nr(evlist->core.threads);
-	int nfds = 0;
-	struct evsel *evsel;
-
-	evlist__for_each_entry(evlist, evsel) {
-		if (evsel->system_wide)
-			nfds += nr_cpus;
-		else
-			nfds += nr_cpus * nr_threads;
-	}
-
-	if (fdarray__available_entries(&evlist->pollfd) < nfds &&
-	    fdarray__grow(&evlist->pollfd, nfds) < 0)
-		return -ENOMEM;
-
-	return 0;
-}
-
-static int __perf_evlist__add_pollfd(struct evlist *evlist, int fd,
-				     struct perf_mmap *map, short revent)
-{
-	int pos = fdarray__add(&evlist->pollfd, fd, revent | POLLERR | POLLHUP);
-	/*
-	 * Save the idx so that when we filter out fds POLLHUP'ed we can
-	 * close the associated evlist->mmap[] entry.
-	 */
-	if (pos >= 0) {
-		evlist->pollfd.priv[pos].ptr = map;
-
-		fcntl(fd, F_SETFL, O_NONBLOCK);
-	}
-
-	return pos;
-}
-
-int perf_evlist__add_pollfd(struct evlist *evlist, int fd)
-{
-	return __perf_evlist__add_pollfd(evlist, fd, NULL, POLLIN);
+	return perf_evlist__add_pollfd(&evlist->core, fd, NULL, POLLIN);
 }
 
 static void perf_evlist__munmap_filtered(struct fdarray *fda, int fd,
 					 void *arg __maybe_unused)
 {
-	struct perf_mmap *map = fda->priv[fd].ptr;
+	struct mmap *map = fda->priv[fd].ptr;
 
 	if (map)
 		perf_mmap__put(map);
 }
 
-int perf_evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask)
+int evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask)
 {
-	return fdarray__filter(&evlist->pollfd, revents_and_mask,
+	return fdarray__filter(&evlist->core.pollfd, revents_and_mask,
 			       perf_evlist__munmap_filtered, NULL);
 }
 
-int perf_evlist__poll(struct evlist *evlist, int timeout)
+int evlist__poll(struct evlist *evlist, int timeout)
 {
-	return fdarray__poll(&evlist->pollfd, timeout);
-}
-
-static void perf_evlist__id_hash(struct evlist *evlist,
-				 struct evsel *evsel,
-				 int cpu, int thread, u64 id)
-{
-	int hash;
-	struct perf_sample_id *sid = SID(evsel, cpu, thread);
-
-	sid->id = id;
-	sid->evsel = evsel;
-	hash = hash_64(sid->id, PERF_EVLIST__HLIST_BITS);
-	hlist_add_head(&sid->node, &evlist->heads[hash]);
-}
-
-void perf_evlist__id_add(struct evlist *evlist, struct evsel *evsel,
-			 int cpu, int thread, u64 id)
-{
-	perf_evlist__id_hash(evlist, evsel, cpu, thread, id);
-	evsel->id[evsel->ids++] = id;
-}
-
-int perf_evlist__id_add_fd(struct evlist *evlist,
-			   struct evsel *evsel,
-			   int cpu, int thread, int fd)
-{
-	u64 read_data[4] = { 0, };
-	int id_idx = 1; /* The first entry is the counter value */
-	u64 id;
-	int ret;
-
-	ret = ioctl(fd, PERF_EVENT_IOC_ID, &id);
-	if (!ret)
-		goto add;
-
-	if (errno != ENOTTY)
-		return -1;
-
-	/* Legacy way to get the event id... All hail to old kernels! */
-
-	/*
-	 * This way does not work with group format read, so bail
-	 * out in that case.
-	 */
-	if (perf_evlist__read_format(evlist) & PERF_FORMAT_GROUP)
-		return -1;
-
-	if (!(evsel->core.attr.read_format & PERF_FORMAT_ID) ||
-	    read(fd, &read_data, sizeof(read_data)) == -1)
-		return -1;
-
-	if (evsel->core.attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-		++id_idx;
-	if (evsel->core.attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-		++id_idx;
-
-	id = read_data[id_idx];
-
- add:
-	perf_evlist__id_add(evlist, evsel, cpu, thread, id);
-	return 0;
+	return perf_evlist__poll(&evlist->core, timeout);
 }
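The legacy fallback removed here (now provided by libperf's
perf_evlist__id_add_fd()) decodes the id from a plain read() of the counter.
With read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID, the buffer it parses is laid
out as:

	u64 read_data[4] = {
		value,		/* index 0: the counter value             */
		time_enabled,	/* index 1: skipped, bumps id_idx to 2    */
		time_running,	/* index 2: skipped, bumps id_idx to 3    */
		id,		/* index 3: what id_idx finally points at */
	};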
 
 static void perf_evlist__set_sid_idx(struct evlist *evlist,
@@ -535,7 +433,7 @@
 		sid->cpu = evlist->core.cpus->map[cpu];
 	else
 		sid->cpu = -1;
-	if (!evsel->system_wide && evlist->core.threads && thread >= 0)
+	if (!evsel->core.system_wide && evlist->core.threads && thread >= 0)
 		sid->tid = perf_thread_map__pid(evlist->core.threads, thread);
 	else
 		sid->tid = -1;
@@ -548,7 +446,7 @@
 	int hash;
 
 	hash = hash_64(id, PERF_EVLIST__HLIST_BITS);
-	head = &evlist->heads[hash];
+	head = &evlist->core.heads[hash];
 
 	hlist_for_each_entry(sid, head, node)
 		if (sid->id == id)
@@ -562,14 +460,14 @@
 	struct perf_sample_id *sid;
 
 	if (evlist->core.nr_entries == 1 || !id)
-		return perf_evlist__first(evlist);
+		return evlist__first(evlist);
 
 	sid = perf_evlist__id2sid(evlist, id);
 	if (sid)
-		return sid->evsel;
+		return container_of(sid->evsel, struct evsel, core);
 
 	if (!perf_evlist__sample_id_all(evlist))
-		return perf_evlist__first(evlist);
+		return evlist__first(evlist);
 
 	return NULL;
 }
@@ -584,7 +482,7 @@
 
 	sid = perf_evlist__id2sid(evlist, id);
 	if (sid)
-		return sid->evsel;
+		return container_of(sid->evsel, struct evsel, core);
 
 	return NULL;
 }
@@ -613,7 +511,7 @@
 struct evsel *perf_evlist__event2evsel(struct evlist *evlist,
 					    union perf_event *event)
 {
-	struct evsel *first = perf_evlist__first(evlist);
+	struct evsel *first = evlist__first(evlist);
 	struct hlist_head *head;
 	struct perf_sample_id *sid;
 	int hash;
@@ -634,11 +532,11 @@
 		return first;
 
 	hash = hash_64(id, PERF_EVLIST__HLIST_BITS);
-	head = &evlist->heads[hash];
+	head = &evlist->core.heads[hash];
 
 	hlist_for_each_entry(sid, head, node) {
 		if (sid->id == id)
-			return sid->evsel;
+			return container_of(sid->evsel, struct evsel, core);
 	}
 	return NULL;
 }
@@ -650,8 +548,8 @@
 	if (!evlist->overwrite_mmap)
 		return 0;
 
-	for (i = 0; i < evlist->nr_mmaps; i++) {
-		int fd = evlist->overwrite_mmap[i].fd;
+	for (i = 0; i < evlist->core.nr_mmaps; i++) {
+		int fd = evlist->overwrite_mmap[i].core.fd;
 		int err;
 
 		if (fd < 0)
@@ -673,42 +571,42 @@
 	return perf_evlist__set_paused(evlist, false);
 }
 
-static void perf_evlist__munmap_nofree(struct evlist *evlist)
+static void evlist__munmap_nofree(struct evlist *evlist)
 {
 	int i;
 
 	if (evlist->mmap)
-		for (i = 0; i < evlist->nr_mmaps; i++)
+		for (i = 0; i < evlist->core.nr_mmaps; i++)
 			perf_mmap__munmap(&evlist->mmap[i]);
 
 	if (evlist->overwrite_mmap)
-		for (i = 0; i < evlist->nr_mmaps; i++)
+		for (i = 0; i < evlist->core.nr_mmaps; i++)
 			perf_mmap__munmap(&evlist->overwrite_mmap[i]);
 }
 
-void perf_evlist__munmap(struct evlist *evlist)
+void evlist__munmap(struct evlist *evlist)
 {
-	perf_evlist__munmap_nofree(evlist);
+	evlist__munmap_nofree(evlist);
 	zfree(&evlist->mmap);
 	zfree(&evlist->overwrite_mmap);
 }
 
-static struct perf_mmap *perf_evlist__alloc_mmap(struct evlist *evlist,
-						 bool overwrite)
+static struct mmap *evlist__alloc_mmap(struct evlist *evlist,
+				       bool overwrite)
 {
 	int i;
-	struct perf_mmap *map;
+	struct mmap *map;
 
-	evlist->nr_mmaps = perf_cpu_map__nr(evlist->core.cpus);
+	evlist->core.nr_mmaps = perf_cpu_map__nr(evlist->core.cpus);
 	if (perf_cpu_map__empty(evlist->core.cpus))
-		evlist->nr_mmaps = perf_thread_map__nr(evlist->core.threads);
-	map = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap));
+		evlist->core.nr_mmaps = perf_thread_map__nr(evlist->core.threads);
+	map = zalloc(evlist->core.nr_mmaps * sizeof(struct mmap));
 	if (!map)
 		return NULL;
 
-	for (i = 0; i < evlist->nr_mmaps; i++) {
-		map[i].fd = -1;
-		map[i].overwrite = overwrite;
+	for (i = 0; i < evlist->core.nr_mmaps; i++) {
+		map[i].core.fd = -1;
+		map[i].core.overwrite = overwrite;
 		/*
 		 * When the perf_mmap() call is made we grab one refcount, plus
 		 * one extra to let perf_mmap__consume() get the last
@@ -718,7 +616,7 @@
 		 * Each PERF_EVENT_IOC_SET_OUTPUT points to this mmap and
 		 * thus does perf_mmap__get() on it.
 		 */
-		refcount_set(&map[i].refcnt, 0);
+		refcount_set(&map[i].core.refcnt, 0);
 	}
 	return map;
 }
@@ -732,7 +630,7 @@
 	return true;
 }
 
-static int perf_evlist__mmap_per_evsel(struct evlist *evlist, int idx,
+static int evlist__mmap_per_evsel(struct evlist *evlist, int idx,
 				       struct mmap_params *mp, int cpu_idx,
 				       int thread, int *_output, int *_output_overwrite)
 {
@@ -741,7 +639,7 @@
 	int evlist_cpu = cpu_map__cpu(evlist->core.cpus, cpu_idx);
 
 	evlist__for_each_entry(evlist, evsel) {
-		struct perf_mmap *maps = evlist->mmap;
+		struct mmap *maps = evlist->mmap;
 		int *output = _output;
 		int fd;
 		int cpu;
@@ -752,7 +650,7 @@
 			maps = evlist->overwrite_mmap;
 
 			if (!maps) {
-				maps = perf_evlist__alloc_mmap(evlist, true);
+				maps = evlist__alloc_mmap(evlist, true);
 				if (!maps)
 					return -1;
 				evlist->overwrite_mmap = maps;
@@ -762,7 +660,7 @@
 			mp->prot &= ~PROT_WRITE;
 		}
 
-		if (evsel->system_wide && thread)
+		if (evsel->core.system_wide && thread)
 			continue;
 
 		cpu = perf_cpu_map__idx(evsel->core.cpus, evlist_cpu);
@@ -792,14 +690,14 @@
 		 * other events, so it should not need to be polled anyway.
 		 * Therefore don't add it for polling.
 		 */
-		if (!evsel->system_wide &&
-		    __perf_evlist__add_pollfd(evlist, fd, &maps[idx], revent) < 0) {
+		if (!evsel->core.system_wide &&
+		     perf_evlist__add_pollfd(&evlist->core, fd, &maps[idx], revent) < 0) {
 			perf_mmap__put(&maps[idx]);
 			return -1;
 		}
 
 		if (evsel->core.attr.read_format & PERF_FORMAT_ID) {
-			if (perf_evlist__id_add_fd(evlist, evsel, cpu, thread,
+			if (perf_evlist__id_add_fd(&evlist->core, &evsel->core, cpu, thread,
 						   fd) < 0)
 				return -1;
 			perf_evlist__set_sid_idx(evlist, evsel, idx, cpu,
@@ -810,7 +708,7 @@
 	return 0;
 }
 
-static int perf_evlist__mmap_per_cpu(struct evlist *evlist,
+static int evlist__mmap_per_cpu(struct evlist *evlist,
 				     struct mmap_params *mp)
 {
 	int cpu, thread;
@@ -826,7 +724,7 @@
 					      true);
 
 		for (thread = 0; thread < nr_threads; thread++) {
-			if (perf_evlist__mmap_per_evsel(evlist, cpu, mp, cpu,
+			if (evlist__mmap_per_evsel(evlist, cpu, mp, cpu,
 							thread, &output, &output_overwrite))
 				goto out_unmap;
 		}
@@ -835,11 +733,11 @@
 	return 0;
 
 out_unmap:
-	perf_evlist__munmap_nofree(evlist);
+	evlist__munmap_nofree(evlist);
 	return -1;
 }
 
-static int perf_evlist__mmap_per_thread(struct evlist *evlist,
+static int evlist__mmap_per_thread(struct evlist *evlist,
 					struct mmap_params *mp)
 {
 	int thread;
@@ -853,7 +751,7 @@
 		auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, thread,
 					      false);
 
-		if (perf_evlist__mmap_per_evsel(evlist, thread, mp, 0, thread,
+		if (evlist__mmap_per_evsel(evlist, thread, mp, 0, thread,
 						&output, &output_overwrite))
 			goto out_unmap;
 	}
@@ -861,7 +759,7 @@
 	return 0;
 
 out_unmap:
-	perf_evlist__munmap_nofree(evlist);
+	evlist__munmap_nofree(evlist);
 	return -1;
 }
 
@@ -888,7 +786,7 @@
 	return pages;
 }
 
-size_t perf_evlist__mmap_size(unsigned long pages)
+size_t evlist__mmap_size(unsigned long pages)
 {
 	if (pages == UINT_MAX)
 		pages = perf_event_mlock_kb_in_pages();
@@ -971,7 +869,7 @@
 }
 
 /**
- * perf_evlist__mmap_ex - Create mmaps to receive events.
+ * evlist__mmap_ex - Create mmaps to receive events.
  * @evlist: list of events
  * @pages: map length in pages
  * @overwrite: overwrite older events?
@@ -979,7 +877,7 @@
  * @auxtrace_overwrite - overwrite older auxtrace data?
  *
  * If @overwrite is %false the user needs to signal event consumption using
- * perf_mmap__write_tail().  Using perf_evlist__mmap_read() does this
+ * perf_mmap__write_tail().  Using evlist__mmap_read() does this
  * automatically.
  *
  * Similarly, if @auxtrace_overwrite is %false the user needs to signal data
@@ -987,7 +885,7 @@
  *
  * Return: %0 on success, negative error code otherwise.
  */
-int perf_evlist__mmap_ex(struct evlist *evlist, unsigned int pages,
+int evlist__mmap_ex(struct evlist *evlist, unsigned int pages,
 			 unsigned int auxtrace_pages,
 			 bool auxtrace_overwrite, int nr_cblocks, int affinity, int flush,
 			 int comp_level)
@@ -1004,36 +902,36 @@
 				  .comp_level = comp_level };
 
 	if (!evlist->mmap)
-		evlist->mmap = perf_evlist__alloc_mmap(evlist, false);
+		evlist->mmap = evlist__alloc_mmap(evlist, false);
 	if (!evlist->mmap)
 		return -ENOMEM;
 
-	if (evlist->pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist) < 0)
+	if (evlist->core.pollfd.entries == NULL && perf_evlist__alloc_pollfd(&evlist->core) < 0)
 		return -ENOMEM;
 
-	evlist->mmap_len = perf_evlist__mmap_size(pages);
-	pr_debug("mmap size %zuB\n", evlist->mmap_len);
-	mp.mask = evlist->mmap_len - page_size - 1;
+	evlist->core.mmap_len = evlist__mmap_size(pages);
+	pr_debug("mmap size %zuB\n", evlist->core.mmap_len);
+	mp.mask = evlist->core.mmap_len - page_size - 1;
 
-	auxtrace_mmap_params__init(&mp.auxtrace_mp, evlist->mmap_len,
+	auxtrace_mmap_params__init(&mp.auxtrace_mp, evlist->core.mmap_len,
 				   auxtrace_pages, auxtrace_overwrite);
 
 	evlist__for_each_entry(evlist, evsel) {
 		if ((evsel->core.attr.read_format & PERF_FORMAT_ID) &&
-		    evsel->sample_id == NULL &&
-		    perf_evsel__alloc_id(evsel, perf_cpu_map__nr(cpus), threads->nr) < 0)
+		    evsel->core.sample_id == NULL &&
+		    perf_evsel__alloc_id(&evsel->core, perf_cpu_map__nr(cpus), threads->nr) < 0)
 			return -ENOMEM;
 	}
 
 	if (perf_cpu_map__empty(cpus))
-		return perf_evlist__mmap_per_thread(evlist, &mp);
+		return evlist__mmap_per_thread(evlist, &mp);
 
-	return perf_evlist__mmap_per_cpu(evlist, &mp);
+	return evlist__mmap_per_cpu(evlist, &mp);
 }
 
-int perf_evlist__mmap(struct evlist *evlist, unsigned int pages)
+int evlist__mmap(struct evlist *evlist, unsigned int pages)
 {
-	return perf_evlist__mmap_ex(evlist, pages, 0, false, 0, PERF_AFFINITY_SYS, 1, 0);
+	return evlist__mmap_ex(evlist, pages, 0, false, 0, PERF_AFFINITY_SYS, 1, 0);
 }
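The mask set up in evlist__mmap_ex() above assumes the ring buffer proper is a
power-of-two number of pages preceded by one control page: with 4 KiB pages and
pages = 128, evlist__mmap_size() yields a 129-page mmap_len, and
mp.mask = 128 * 4096 - 1 = 0x7ffff, which readers use to wrap their offsets
into the data area.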
 
 int perf_evlist__create_maps(struct evlist *evlist, struct target *target)
@@ -1225,7 +1123,7 @@
 
 bool perf_evlist__valid_read_format(struct evlist *evlist)
 {
-	struct evsel *first = perf_evlist__first(evlist), *pos = first;
+	struct evsel *first = evlist__first(evlist), *pos = first;
 	u64 read_format = first->core.attr.read_format;
 	u64 sample_type = first->core.attr.sample_type;
 
@@ -1243,15 +1141,9 @@
 	return true;
 }
 
-u64 perf_evlist__read_format(struct evlist *evlist)
-{
-	struct evsel *first = perf_evlist__first(evlist);
-	return first->core.attr.read_format;
-}
-
 u16 perf_evlist__id_hdr_size(struct evlist *evlist)
 {
-	struct evsel *first = perf_evlist__first(evlist);
+	struct evsel *first = evlist__first(evlist);
 	struct perf_sample *data;
 	u64 sample_type;
 	u16 size = 0;
@@ -1284,7 +1176,7 @@
 
 bool perf_evlist__valid_sample_id_all(struct evlist *evlist)
 {
-	struct evsel *first = perf_evlist__first(evlist), *pos = first;
+	struct evsel *first = evlist__first(evlist), *pos = first;
 
 	evlist__for_each_entry_continue(evlist, pos) {
 		if (first->core.attr.sample_id_all != pos->core.attr.sample_id_all)
@@ -1296,7 +1188,7 @@
 
 bool perf_evlist__sample_id_all(struct evlist *evlist)
 {
-	struct evsel *first = perf_evlist__first(evlist);
+	struct evsel *first = evlist__first(evlist);
 	return first->core.attr.sample_id_all;
 }
 
@@ -1529,19 +1421,6 @@
 	return perf_evsel__parse_sample_timestamp(evsel, event, timestamp);
 }
 
-size_t perf_evlist__fprintf(struct evlist *evlist, FILE *fp)
-{
-	struct evsel *evsel;
-	size_t printed = 0;
-
-	evlist__for_each_entry(evlist, evsel) {
-		printed += fprintf(fp, "%s%s", evsel->idx ? ", " : "",
-				   perf_evsel__name(evsel));
-	}
-
-	return printed + fprintf(fp, "\n");
-}
-
 int perf_evlist__strerror_open(struct evlist *evlist,
 			       int err, char *buf, size_t size)
 {
@@ -1571,7 +1450,7 @@
 				    "Hint:\tThe current value is %d.", value);
 		break;
 	case EINVAL: {
-		struct evsel *first = perf_evlist__first(evlist);
+		struct evsel *first = evlist__first(evlist);
 		int max_freq;
 
 		if (sysctl__read_int("kernel/perf_event_max_sample_rate", &max_freq) < 0)
@@ -1599,7 +1478,7 @@
 int perf_evlist__strerror_mmap(struct evlist *evlist, int err, char *buf, size_t size)
 {
 	char sbuf[STRERR_BUFSIZE], *emsg = str_error_r(err, sbuf, sizeof(sbuf));
-	int pages_attempted = evlist->mmap_len / 1024, pages_max_per_user, printed = 0;
+	int pages_attempted = evlist->core.mmap_len / 1024, pages_max_per_user, printed = 0;
 
 	switch (err) {
 	case EPERM:
@@ -1633,7 +1512,7 @@
 	struct evsel *evsel, *n;
 	LIST_HEAD(move);
 
-	if (move_evsel == perf_evlist__first(evlist))
+	if (move_evsel == evlist__first(evlist))
 		return;
 
 	evlist__for_each_entry_safe(evlist, n, evsel) {
@@ -1754,7 +1633,7 @@
 void perf_evlist__force_leader(struct evlist *evlist)
 {
 	if (!evlist->nr_groups) {
-		struct evsel *leader = perf_evlist__first(evlist);
+		struct evsel *leader = evlist__first(evlist);
 
 		perf_evlist__set_leader(evlist);
 		leader->forced_leader = true;
@@ -1780,7 +1659,7 @@
 			is_open = false;
 		if (c2->leader == leader) {
 			if (is_open)
-				evsel__close(c2);
+				perf_evsel__close(&c2->core);
 			c2->leader = c2;
 			c2->core.nr_members = 0;
 		}
@@ -1844,10 +1723,10 @@
 			draining = true;
 
 		if (!draining)
-			perf_evlist__poll(evlist, 1000);
+			evlist__poll(evlist, 1000);
 
-		for (i = 0; i < evlist->nr_mmaps; i++) {
-			struct perf_mmap *map = &evlist->mmap[i];
+		for (i = 0; i < evlist->core.nr_mmaps; i++) {
+			struct mmap *map = &evlist->mmap[i];
 			union perf_event *event;
 
 			if (perf_mmap__read_init(map))
@@ -1889,7 +1768,7 @@
 			goto out_delete_evlist;
 	}
 
-	if (perf_evlist__mmap(evlist, UINT_MAX))
+	if (evlist__mmap(evlist, UINT_MAX))
 		goto out_delete_evlist;
 
 	evlist__for_each_entry(evlist, counter) {
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index a55f0f2..7cfe755 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -7,11 +7,11 @@
 #include <linux/refcount.h>
 #include <linux/list.h>
 #include <api/fd/array.h>
-#include <stdio.h>
 #include <internal/evlist.h>
+#include <internal/evsel.h>
 #include "events_stats.h"
 #include "evsel.h"
-#include "mmap.h"
+#include <pthread.h>
 #include <signal.h>
 #include <unistd.h>
 
@@ -20,16 +20,38 @@
 struct perf_cpu_map;
 struct record_opts;
 
-#define PERF_EVLIST__HLIST_BITS 8
-#define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS)
+/*
+ * State machine of bkw_mmap_state:
+ *
+ *                     .________________(forbid)_____________.
+ *                     |                                     V
+ * NOTREADY --(0)--> RUNNING --(1)--> DATA_PENDING --(2)--> EMPTY
+ *                     ^  ^              |   ^               |
+ *                     |  |__(forbid)____/   |___(forbid)___/|
+ *                     |                                     |
+ *                      \_________________(3)_______________/
+ *
+ * NOTREADY     : Backward ring buffers are not ready
+ * RUNNING      : Backward ring buffers are recording
+ * DATA_PENDING : We are required to collect data from backward ring buffers
+ * EMPTY        : We have collected data from backward ring buffers.
+ *
+ * (0): Setup backward ring buffer
+ * (1): Pause ring buffers for reading
+ * (2): Read from ring buffers
+ * (3): Resume ring buffers for recording
+ */
+enum bkw_mmap_state {
+	BKW_MMAP_NOTREADY,
+	BKW_MMAP_RUNNING,
+	BKW_MMAP_DATA_PENDING,
+	BKW_MMAP_EMPTY,
+};
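A minimal sketch of one recording cycle against this state machine, assuming an
evlist whose backward ring buffers are already set up (the numbers refer to the
transitions in the diagram above):

	perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_DATA_PENDING); /* (1) pause  */
	/* ... drain events from evlist->overwrite_mmap here ...        (2) read   */
	perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);         /* drained   */
	perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_RUNNING);       /* (3) resume */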
 
 struct evlist {
 	struct perf_evlist core;
-	struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];
 	int		 nr_groups;
-	int		 nr_mmaps;
 	bool		 enabled;
-	size_t		 mmap_len;
 	int		 id_pos;
 	int		 is_pos;
 	u64		 combined_sample_type;
@@ -38,9 +60,8 @@
 		int	cork_fd;
 		pid_t	pid;
 	} workload;
-	struct fdarray	 pollfd;
-	struct perf_mmap *mmap;
-	struct perf_mmap *overwrite_mmap;
+	struct mmap *mmap;
+	struct mmap *overwrite_mmap;
 	struct evsel *selected;
 	struct events_stats stats;
 	struct perf_env	*env;
@@ -65,7 +86,7 @@
 struct evlist *perf_evlist__new_dummy(void);
 void evlist__init(struct evlist *evlist, struct perf_cpu_map *cpus,
 		  struct perf_thread_map *threads);
-void perf_evlist__exit(struct evlist *evlist);
+void evlist__exit(struct evlist *evlist);
 void evlist__delete(struct evlist *evlist);
 
 void evlist__add(struct evlist *evlist, struct evsel *entry);
@@ -119,17 +140,10 @@
 perf_evlist__find_tracepoint_by_name(struct evlist *evlist,
 				     const char *name);
 
-void perf_evlist__id_add(struct evlist *evlist, struct evsel *evsel,
-			 int cpu, int thread, u64 id);
-int perf_evlist__id_add_fd(struct evlist *evlist,
-			   struct evsel *evsel,
-			   int cpu, int thread, int fd);
+int evlist__add_pollfd(struct evlist *evlist, int fd);
+int evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask);
 
-int perf_evlist__add_pollfd(struct evlist *evlist, int fd);
-int perf_evlist__alloc_pollfd(struct evlist *evlist);
-int perf_evlist__filter_pollfd(struct evlist *evlist, short revents_and_mask);
-
-int perf_evlist__poll(struct evlist *evlist, int timeout);
+int evlist__poll(struct evlist *evlist, int timeout);
 
 struct evsel *perf_evlist__id2evsel(struct evlist *evlist, u64 id);
 struct evsel *perf_evlist__id2evsel_strict(struct evlist *evlist,
@@ -139,7 +153,7 @@
 
 void perf_evlist__toggle_bkw_mmap(struct evlist *evlist, enum bkw_mmap_state state);
 
-void perf_evlist__mmap_consume(struct evlist *evlist, int idx);
+void evlist__mmap_consume(struct evlist *evlist, int idx);
 
 int evlist__open(struct evlist *evlist);
 void evlist__close(struct evlist *evlist);
@@ -170,14 +184,14 @@
 
 unsigned long perf_event_mlock_kb_in_pages(void);
 
-int perf_evlist__mmap_ex(struct evlist *evlist, unsigned int pages,
+int evlist__mmap_ex(struct evlist *evlist, unsigned int pages,
 			 unsigned int auxtrace_pages,
 			 bool auxtrace_overwrite, int nr_cblocks,
 			 int affinity, int flush, int comp_level);
-int perf_evlist__mmap(struct evlist *evlist, unsigned int pages);
-void perf_evlist__munmap(struct evlist *evlist);
+int evlist__mmap(struct evlist *evlist, unsigned int pages);
+void evlist__munmap(struct evlist *evlist);
 
-size_t perf_evlist__mmap_size(unsigned long pages);
+size_t evlist__mmap_size(unsigned long pages);
 
 void evlist__disable(struct evlist *evlist);
 void evlist__enable(struct evlist *evlist);
@@ -195,7 +209,6 @@
 void __perf_evlist__set_leader(struct list_head *list);
 void perf_evlist__set_leader(struct evlist *evlist);
 
-u64 perf_evlist__read_format(struct evlist *evlist);
 u64 __perf_evlist__combined_sample_type(struct evlist *evlist);
 u64 perf_evlist__combined_sample_type(struct evlist *evlist);
 u64 perf_evlist__combined_branch_type(struct evlist *evlist);
@@ -221,17 +234,19 @@
 	return list_empty(&evlist->core.entries);
 }
 
-static inline struct evsel *perf_evlist__first(struct evlist *evlist)
+static inline struct evsel *evlist__first(struct evlist *evlist)
 {
-	return list_entry(evlist->core.entries.next, struct evsel, core.node);
+	struct perf_evsel *evsel = perf_evlist__first(&evlist->core);
+
+	return container_of(evsel, struct evsel, core);
 }
 
-static inline struct evsel *perf_evlist__last(struct evlist *evlist)
+static inline struct evsel *evlist__last(struct evlist *evlist)
 {
-	return list_entry(evlist->core.entries.prev, struct evsel, core.node);
-}
+	struct perf_evsel *evsel = perf_evlist__last(&evlist->core);
 
-size_t perf_evlist__fprintf(struct evlist *evlist, FILE *fp);
+	return container_of(evsel, struct evsel, core);
+}
 
 int perf_evlist__strerror_open(struct evlist *evlist, int err, char *buf, size_t size);
 int perf_evlist__strerror_mmap(struct evlist *evlist, int err, char *buf, size_t size);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 8582538..5591af8 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -30,8 +30,10 @@
 #include "counts.h"
 #include "event.h"
 #include "evsel.h"
+#include "util/evsel_config.h"
+#include "util/evsel_fprintf.h"
 #include "evlist.h"
-#include "cpumap.h"
+#include <perf/cpumap.h>
 #include "thread_map.h"
 #include "target.h"
 #include "perf_regs.h"
@@ -45,6 +47,7 @@
 #include "../perf-sys.h"
 #include "util/parse-branch-options.h"
 #include <internal/xyarray.h>
+#include <internal/lib.h>
 
 #include <linux/ctype.h>
 
@@ -1226,36 +1229,6 @@
 	return err;
 }
 
-int perf_evsel__alloc_id(struct evsel *evsel, int ncpus, int nthreads)
-{
-	if (ncpus == 0 || nthreads == 0)
-		return 0;
-
-	if (evsel->system_wide)
-		nthreads = 1;
-
-	evsel->sample_id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id));
-	if (evsel->sample_id == NULL)
-		return -ENOMEM;
-
-	evsel->id = zalloc(ncpus * nthreads * sizeof(u64));
-	if (evsel->id == NULL) {
-		xyarray__delete(evsel->sample_id);
-		evsel->sample_id = NULL;
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static void perf_evsel__free_id(struct evsel *evsel)
-{
-	xyarray__delete(evsel->sample_id);
-	evsel->sample_id = NULL;
-	zfree(&evsel->id);
-	evsel->ids = 0;
-}
-
 static void perf_evsel__free_config_terms(struct evsel *evsel)
 {
 	struct perf_evsel_config_term *term, *h;
@@ -1272,7 +1245,7 @@
 	assert(evsel->evlist == NULL);
 	perf_evsel__free_counts(evsel);
 	perf_evsel__free_fd(&evsel->core);
-	perf_evsel__free_id(evsel);
+	perf_evsel__free_id(&evsel->core);
 	perf_evsel__free_config_terms(evsel);
 	cgroup__put(evsel->cgrp);
 	perf_cpu_map__put(evsel->core.cpus);
@@ -1472,152 +1445,6 @@
 	return fd;
 }
 
-struct bit_names {
-	int bit;
-	const char *name;
-};
-
-static void __p_bits(char *buf, size_t size, u64 value, struct bit_names *bits)
-{
-	bool first_bit = true;
-	int i = 0;
-
-	do {
-		if (value & bits[i].bit) {
-			buf += scnprintf(buf, size, "%s%s", first_bit ? "" : "|", bits[i].name);
-			first_bit = false;
-		}
-	} while (bits[++i].name != NULL);
-}
-
-static void __p_sample_type(char *buf, size_t size, u64 value)
-{
-#define bit_name(n) { PERF_SAMPLE_##n, #n }
-	struct bit_names bits[] = {
-		bit_name(IP), bit_name(TID), bit_name(TIME), bit_name(ADDR),
-		bit_name(READ), bit_name(CALLCHAIN), bit_name(ID), bit_name(CPU),
-		bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW),
-		bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER),
-		bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC),
-		bit_name(WEIGHT), bit_name(PHYS_ADDR),
-		{ .name = NULL, }
-	};
-#undef bit_name
-	__p_bits(buf, size, value, bits);
-}
-
-static void __p_branch_sample_type(char *buf, size_t size, u64 value)
-{
-#define bit_name(n) { PERF_SAMPLE_BRANCH_##n, #n }
-	struct bit_names bits[] = {
-		bit_name(USER), bit_name(KERNEL), bit_name(HV), bit_name(ANY),
-		bit_name(ANY_CALL), bit_name(ANY_RETURN), bit_name(IND_CALL),
-		bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX),
-		bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP),
-		bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES),
-		{ .name = NULL, }
-	};
-#undef bit_name
-	__p_bits(buf, size, value, bits);
-}
-
-static void __p_read_format(char *buf, size_t size, u64 value)
-{
-#define bit_name(n) { PERF_FORMAT_##n, #n }
-	struct bit_names bits[] = {
-		bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING),
-		bit_name(ID), bit_name(GROUP),
-		{ .name = NULL, }
-	};
-#undef bit_name
-	__p_bits(buf, size, value, bits);
-}
-
-#define BUF_SIZE		1024
-
-#define p_hex(val)		snprintf(buf, BUF_SIZE, "%#"PRIx64, (uint64_t)(val))
-#define p_unsigned(val)		snprintf(buf, BUF_SIZE, "%"PRIu64, (uint64_t)(val))
-#define p_signed(val)		snprintf(buf, BUF_SIZE, "%"PRId64, (int64_t)(val))
-#define p_sample_type(val)	__p_sample_type(buf, BUF_SIZE, val)
-#define p_branch_sample_type(val) __p_branch_sample_type(buf, BUF_SIZE, val)
-#define p_read_format(val)	__p_read_format(buf, BUF_SIZE, val)
-
-#define PRINT_ATTRn(_n, _f, _p)				\
-do {							\
-	if (attr->_f) {					\
-		_p(attr->_f);				\
-		ret += attr__fprintf(fp, _n, buf, priv);\
-	}						\
-} while (0)
-
-#define PRINT_ATTRf(_f, _p)	PRINT_ATTRn(#_f, _f, _p)
-
-int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
-			     attr__fprintf_f attr__fprintf, void *priv)
-{
-	char buf[BUF_SIZE];
-	int ret = 0;
-
-	PRINT_ATTRf(type, p_unsigned);
-	PRINT_ATTRf(size, p_unsigned);
-	PRINT_ATTRf(config, p_hex);
-	PRINT_ATTRn("{ sample_period, sample_freq }", sample_period, p_unsigned);
-	PRINT_ATTRf(sample_type, p_sample_type);
-	PRINT_ATTRf(read_format, p_read_format);
-
-	PRINT_ATTRf(disabled, p_unsigned);
-	PRINT_ATTRf(inherit, p_unsigned);
-	PRINT_ATTRf(pinned, p_unsigned);
-	PRINT_ATTRf(exclusive, p_unsigned);
-	PRINT_ATTRf(exclude_user, p_unsigned);
-	PRINT_ATTRf(exclude_kernel, p_unsigned);
-	PRINT_ATTRf(exclude_hv, p_unsigned);
-	PRINT_ATTRf(exclude_idle, p_unsigned);
-	PRINT_ATTRf(mmap, p_unsigned);
-	PRINT_ATTRf(comm, p_unsigned);
-	PRINT_ATTRf(freq, p_unsigned);
-	PRINT_ATTRf(inherit_stat, p_unsigned);
-	PRINT_ATTRf(enable_on_exec, p_unsigned);
-	PRINT_ATTRf(task, p_unsigned);
-	PRINT_ATTRf(watermark, p_unsigned);
-	PRINT_ATTRf(precise_ip, p_unsigned);
-	PRINT_ATTRf(mmap_data, p_unsigned);
-	PRINT_ATTRf(sample_id_all, p_unsigned);
-	PRINT_ATTRf(exclude_host, p_unsigned);
-	PRINT_ATTRf(exclude_guest, p_unsigned);
-	PRINT_ATTRf(exclude_callchain_kernel, p_unsigned);
-	PRINT_ATTRf(exclude_callchain_user, p_unsigned);
-	PRINT_ATTRf(mmap2, p_unsigned);
-	PRINT_ATTRf(comm_exec, p_unsigned);
-	PRINT_ATTRf(use_clockid, p_unsigned);
-	PRINT_ATTRf(context_switch, p_unsigned);
-	PRINT_ATTRf(write_backward, p_unsigned);
-	PRINT_ATTRf(namespaces, p_unsigned);
-	PRINT_ATTRf(ksymbol, p_unsigned);
-	PRINT_ATTRf(bpf_event, p_unsigned);
-	PRINT_ATTRf(aux_output, p_unsigned);
-
-	PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned);
-	PRINT_ATTRf(bp_type, p_unsigned);
-	PRINT_ATTRn("{ bp_addr, config1 }", bp_addr, p_hex);
-	PRINT_ATTRn("{ bp_len, config2 }", bp_len, p_hex);
-	PRINT_ATTRf(branch_sample_type, p_branch_sample_type);
-	PRINT_ATTRf(sample_regs_user, p_hex);
-	PRINT_ATTRf(sample_stack_user, p_unsigned);
-	PRINT_ATTRf(clockid, p_signed);
-	PRINT_ATTRf(sample_regs_intr, p_hex);
-	PRINT_ATTRf(aux_watermark, p_unsigned);
-	PRINT_ATTRf(sample_max_stack, p_unsigned);
-
-	return ret;
-}
-
-static int __open_attr__fprintf(FILE *fp, const char *name, const char *val,
-				void *priv __maybe_unused)
-{
-	return fprintf(fp, "  %-32s %s\n", name, val);
-}
-
 static void perf_evsel__remove_fd(struct evsel *pos,
 				  int nr_cpus, int nr_threads,
 				  int thread_idx)
@@ -1662,7 +1489,7 @@
 		return false;
 
 	/* The system wide setup does not work with threads. */
-	if (evsel->system_wide)
+	if (evsel->core.system_wide)
 		return false;
 
 	/* The -ESRCH is perf event syscall errno for pid's not found. */
@@ -1688,6 +1515,12 @@
 	return true;
 }
 
+static int __open_attr__fprintf(FILE *fp, const char *name, const char *val,
+				void *priv __maybe_unused)
+{
+	return fprintf(fp, "  %-32s %s\n", name, val);
+}
+
 static void display_attr(struct perf_event_attr *attr)
 {
 	if (verbose >= 2) {
@@ -1771,7 +1604,7 @@
 		threads = empty_thread_map;
 	}
 
-	if (evsel->system_wide)
+	if (evsel->core.system_wide)
 		nthreads = 1;
 	else
 		nthreads = threads->nr;
@@ -1818,7 +1651,7 @@
 		for (thread = 0; thread < nthreads; thread++) {
 			int fd, group_fd;
 
-			if (!evsel->cgrp && !evsel->system_wide)
+			if (!evsel->cgrp && !evsel->core.system_wide)
 				pid = perf_thread_map__pid(threads, thread);
 
 			group_fd = get_group_fd(evsel, cpu, thread);
@@ -1991,7 +1824,7 @@
 void evsel__close(struct evsel *evsel)
 {
 	perf_evsel__close(&evsel->core);
-	perf_evsel__free_id(evsel);
+	perf_evsel__free_id(&evsel->core);
 }
 
 int perf_evsel__open_per_cpu(struct evsel *evsel,
@@ -2419,283 +2252,6 @@
 	return 0;
 }
 
-size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,
-				     u64 read_format)
-{
-	size_t sz, result = sizeof(struct perf_record_sample);
-
-	if (type & PERF_SAMPLE_IDENTIFIER)
-		result += sizeof(u64);
-
-	if (type & PERF_SAMPLE_IP)
-		result += sizeof(u64);
-
-	if (type & PERF_SAMPLE_TID)
-		result += sizeof(u64);
-
-	if (type & PERF_SAMPLE_TIME)
-		result += sizeof(u64);
-
-	if (type & PERF_SAMPLE_ADDR)
-		result += sizeof(u64);
-
-	if (type & PERF_SAMPLE_ID)
-		result += sizeof(u64);
-
-	if (type & PERF_SAMPLE_STREAM_ID)
-		result += sizeof(u64);
-
-	if (type & PERF_SAMPLE_CPU)
-		result += sizeof(u64);
-
-	if (type & PERF_SAMPLE_PERIOD)
-		result += sizeof(u64);
-
-	if (type & PERF_SAMPLE_READ) {
-		result += sizeof(u64);
-		if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-			result += sizeof(u64);
-		if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-			result += sizeof(u64);
-		/* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */
-		if (read_format & PERF_FORMAT_GROUP) {
-			sz = sample->read.group.nr *
-			     sizeof(struct sample_read_value);
-			result += sz;
-		} else {
-			result += sizeof(u64);
-		}
-	}
-
-	if (type & PERF_SAMPLE_CALLCHAIN) {
-		sz = (sample->callchain->nr + 1) * sizeof(u64);
-		result += sz;
-	}
-
-	if (type & PERF_SAMPLE_RAW) {
-		result += sizeof(u32);
-		result += sample->raw_size;
-	}
-
-	if (type & PERF_SAMPLE_BRANCH_STACK) {
-		sz = sample->branch_stack->nr * sizeof(struct branch_entry);
-		sz += sizeof(u64);
-		result += sz;
-	}
-
-	if (type & PERF_SAMPLE_REGS_USER) {
-		if (sample->user_regs.abi) {
-			result += sizeof(u64);
-			sz = hweight64(sample->user_regs.mask) * sizeof(u64);
-			result += sz;
-		} else {
-			result += sizeof(u64);
-		}
-	}
-
-	if (type & PERF_SAMPLE_STACK_USER) {
-		sz = sample->user_stack.size;
-		result += sizeof(u64);
-		if (sz) {
-			result += sz;
-			result += sizeof(u64);
-		}
-	}
-
-	if (type & PERF_SAMPLE_WEIGHT)
-		result += sizeof(u64);
-
-	if (type & PERF_SAMPLE_DATA_SRC)
-		result += sizeof(u64);
-
-	if (type & PERF_SAMPLE_TRANSACTION)
-		result += sizeof(u64);
-
-	if (type & PERF_SAMPLE_REGS_INTR) {
-		if (sample->intr_regs.abi) {
-			result += sizeof(u64);
-			sz = hweight64(sample->intr_regs.mask) * sizeof(u64);
-			result += sz;
-		} else {
-			result += sizeof(u64);
-		}
-	}
-
-	if (type & PERF_SAMPLE_PHYS_ADDR)
-		result += sizeof(u64);
-
-	return result;
-}
-
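As a quick check of the accounting above: for type = PERF_SAMPLE_IP |
PERF_SAMPLE_TID | PERF_SAMPLE_TIME the size is simply
sizeof(struct perf_record_sample) + 3 * sizeof(u64); only the variable-length
pieces (callchain, raw data, branch stack, user stack and register dumps)
depend on the sample contents.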
-int perf_event__synthesize_sample(union perf_event *event, u64 type,
-				  u64 read_format,
-				  const struct perf_sample *sample)
-{
-	__u64 *array;
-	size_t sz;
-	/*
-	 * used for cross-endian analysis. See git commit 65014ab3
-	 * for why this goofiness is needed.
-	 */
-	union u64_swap u;
-
-	array = event->sample.array;
-
-	if (type & PERF_SAMPLE_IDENTIFIER) {
-		*array = sample->id;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_IP) {
-		*array = sample->ip;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_TID) {
-		u.val32[0] = sample->pid;
-		u.val32[1] = sample->tid;
-		*array = u.val64;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_TIME) {
-		*array = sample->time;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_ADDR) {
-		*array = sample->addr;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_ID) {
-		*array = sample->id;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_STREAM_ID) {
-		*array = sample->stream_id;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_CPU) {
-		u.val32[0] = sample->cpu;
-		u.val32[1] = 0;
-		*array = u.val64;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_PERIOD) {
-		*array = sample->period;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_READ) {
-		if (read_format & PERF_FORMAT_GROUP)
-			*array = sample->read.group.nr;
-		else
-			*array = sample->read.one.value;
-		array++;
-
-		if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-			*array = sample->read.time_enabled;
-			array++;
-		}
-
-		if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-			*array = sample->read.time_running;
-			array++;
-		}
-
-		/* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */
-		if (read_format & PERF_FORMAT_GROUP) {
-			sz = sample->read.group.nr *
-			     sizeof(struct sample_read_value);
-			memcpy(array, sample->read.group.values, sz);
-			array = (void *)array + sz;
-		} else {
-			*array = sample->read.one.id;
-			array++;
-		}
-	}
-
-	if (type & PERF_SAMPLE_CALLCHAIN) {
-		sz = (sample->callchain->nr + 1) * sizeof(u64);
-		memcpy(array, sample->callchain, sz);
-		array = (void *)array + sz;
-	}
-
-	if (type & PERF_SAMPLE_RAW) {
-		u.val32[0] = sample->raw_size;
-		*array = u.val64;
-		array = (void *)array + sizeof(u32);
-
-		memcpy(array, sample->raw_data, sample->raw_size);
-		array = (void *)array + sample->raw_size;
-	}
-
-	if (type & PERF_SAMPLE_BRANCH_STACK) {
-		sz = sample->branch_stack->nr * sizeof(struct branch_entry);
-		sz += sizeof(u64);
-		memcpy(array, sample->branch_stack, sz);
-		array = (void *)array + sz;
-	}
-
-	if (type & PERF_SAMPLE_REGS_USER) {
-		if (sample->user_regs.abi) {
-			*array++ = sample->user_regs.abi;
-			sz = hweight64(sample->user_regs.mask) * sizeof(u64);
-			memcpy(array, sample->user_regs.regs, sz);
-			array = (void *)array + sz;
-		} else {
-			*array++ = 0;
-		}
-	}
-
-	if (type & PERF_SAMPLE_STACK_USER) {
-		sz = sample->user_stack.size;
-		*array++ = sz;
-		if (sz) {
-			memcpy(array, sample->user_stack.data, sz);
-			array = (void *)array + sz;
-			*array++ = sz;
-		}
-	}
-
-	if (type & PERF_SAMPLE_WEIGHT) {
-		*array = sample->weight;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_DATA_SRC) {
-		*array = sample->data_src;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_TRANSACTION) {
-		*array = sample->transaction;
-		array++;
-	}
-
-	if (type & PERF_SAMPLE_REGS_INTR) {
-		if (sample->intr_regs.abi) {
-			*array++ = sample->intr_regs.abi;
-			sz = hweight64(sample->intr_regs.mask) * sizeof(u64);
-			memcpy(array, sample->intr_regs.regs, sz);
-			array = (void *)array + sz;
-		} else {
-			*array++ = 0;
-		}
-	}
-
-	if (type & PERF_SAMPLE_PHYS_ADDR) {
-		*array = sample->phys_addr;
-		array++;
-	}
-
-	return 0;
-}
-
 struct tep_format_field *perf_evsel__field(struct evsel *evsel, const char *name)
 {
 	return tep_find_field(evsel->tp_format, name);
@@ -2811,9 +2367,11 @@
 		if (evsel->name)
 			free(evsel->name);
 		evsel->name = new_name;
-		scnprintf(msg, msgsize,
-"kernel.perf_event_paranoid=%d, trying to fall back to excluding kernel samples", paranoid);
+		scnprintf(msg, msgsize, "kernel.perf_event_paranoid=%d, trying "
+			  "to fall back to excluding kernel and hypervisor "
+			  "samples", paranoid);
 		evsel->core.attr.exclude_kernel = 1;
+		evsel->core.attr.exclude_hv     = 1;
 
 		return true;
 	}
@@ -2966,7 +2524,7 @@
 		     thread++) {
 			int fd = FD(evsel, cpu, thread);
 
-			if (perf_evlist__id_add_fd(evlist, evsel,
+			if (perf_evlist__id_add_fd(&evlist->core, &evsel->core,
 						   cpu, thread, fd) < 0)
 				return -1;
 		}
@@ -2980,7 +2538,7 @@
 	struct perf_cpu_map *cpus = evsel->core.cpus;
 	struct perf_thread_map *threads = evsel->core.threads;
 
-	if (perf_evsel__alloc_id(evsel, cpus->nr, threads->nr))
+	if (perf_evsel__alloc_id(&evsel->core, cpus->nr, threads->nr))
 		return -ENOMEM;
 
 	return store_evsel_ids(evsel, evlist);
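Editor's note: the two helpers deleted above are moved, not lost; this series relocates them into util/synthetic-events.c. They share one contract: a sample body is a flat array of u64 slots, and each PERF_SAMPLE_* bit set in sample_type contributes its slots in a fixed order, so the size calculation and the encoder must walk the bits identically. A hedged sketch of that pairing, with invented mini_* names and only three bits, not perf's real structs:

/* Hedged sketch with invented names -- not perf's real layout code. */
#include <stdint.h>
#include <stdio.h>

#define SAMPLE_IP   (1u << 0)
#define SAMPLE_TID  (1u << 1)
#define SAMPLE_TIME (1u << 2)

struct mini_sample { uint64_t ip, time; uint32_t pid, tid; };

/* The size helper and the encoder must visit the bits in the same order. */
static size_t mini_sample_size(uint32_t type)
{
	size_t slots = 0;

	if (type & SAMPLE_IP)   slots++;
	if (type & SAMPLE_TID)  slots++;	/* pid+tid share one slot */
	if (type & SAMPLE_TIME) slots++;
	return slots * sizeof(uint64_t);
}

static void mini_sample_encode(uint64_t *array, uint32_t type,
			       const struct mini_sample *s)
{
	if (type & SAMPLE_IP)
		*array++ = s->ip;
	if (type & SAMPLE_TID)	/* two u32s packed into one u64 slot */
		*array++ = (uint64_t)s->tid << 32 | s->pid;
	if (type & SAMPLE_TIME)
		*array++ = s->time;
}

int main(void)
{
	struct mini_sample s = { .ip = 0x81000000ull, .time = 42,
				 .pid = 1234, .tid = 1234 };
	uint32_t type = SAMPLE_IP | SAMPLE_TID | SAMPLE_TIME;
	uint64_t buf[3];

	mini_sample_encode(buf, type, &s);
	printf("body is %zu bytes\n", mini_sample_size(type));
	return 0;
}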
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 68321d1..ddc5ee6 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -4,7 +4,6 @@
 
 #include <linux/list.h>
 #include <stdbool.h>
-#include <stdio.h>
 #include <sys/types.h>
 #include <linux/perf_event.h>
 #include <linux/types.h>
@@ -13,79 +12,11 @@
 #include "symbol_conf.h"
 #include <internal/cpumap.h>
 
-struct addr_location;
-struct evsel;
-union perf_event;
-
-/*
- * Per fd, to map back from PERF_SAMPLE_ID to evsel, only used when there are
- * more than one entry in the evlist.
- */
-struct perf_sample_id {
-	struct hlist_node 	node;
-	u64		 	id;
-	struct evsel		*evsel;
-       /*
-	* 'idx' will be used for AUX area sampling. A sample will have AUX area
-	* data that will be queued for decoding, where there are separate
-	* queues for each CPU (per-cpu tracing) or task (per-thread tracing).
-	* The sample ID can be used to lookup 'idx' which is effectively the
-	* queue number.
-	*/
-	int			idx;
-	int			cpu;
-	pid_t			tid;
-
-	/* Holds total ID period value for PERF_SAMPLE_READ processing. */
-	u64			period;
-};
-
+struct bpf_object;
 struct cgroup;
-
-/*
- * The 'struct perf_evsel_config_term' is used to pass event
- * specific configuration data to perf_evsel__config routine.
- * It is allocated within event parsing and attached to
- * perf_evsel::config_terms list head.
-*/
-enum term_type {
-	PERF_EVSEL__CONFIG_TERM_PERIOD,
-	PERF_EVSEL__CONFIG_TERM_FREQ,
-	PERF_EVSEL__CONFIG_TERM_TIME,
-	PERF_EVSEL__CONFIG_TERM_CALLGRAPH,
-	PERF_EVSEL__CONFIG_TERM_STACK_USER,
-	PERF_EVSEL__CONFIG_TERM_INHERIT,
-	PERF_EVSEL__CONFIG_TERM_MAX_STACK,
-	PERF_EVSEL__CONFIG_TERM_MAX_EVENTS,
-	PERF_EVSEL__CONFIG_TERM_OVERWRITE,
-	PERF_EVSEL__CONFIG_TERM_DRV_CFG,
-	PERF_EVSEL__CONFIG_TERM_BRANCH,
-	PERF_EVSEL__CONFIG_TERM_PERCORE,
-	PERF_EVSEL__CONFIG_TERM_AUX_OUTPUT,
-};
-
-struct perf_evsel_config_term {
-	struct list_head	list;
-	enum term_type	type;
-	union {
-		u64	period;
-		u64	freq;
-		bool	time;
-		char	*callgraph;
-		char	*drv_cfg;
-		u64	stack_user;
-		int	max_stack;
-		bool	inherit;
-		bool	overwrite;
-		char	*branch;
-		unsigned long max_events;
-		bool	percore;
-		bool	aux_output;
-	} val;
-	bool weak;
-};
-
+struct perf_counts;
 struct perf_stat_evsel;
+union perf_event;
 
 typedef int (perf_evsel__sb_cb_t)(union perf_event *event, void *data);
 
@@ -94,10 +25,6 @@
 	PERF_TOOL_DURATION_TIME = 1,
 };
 
-struct bpf_object;
-struct perf_counts;
-struct xyarray;
-
 /** struct evsel - event selector
  *
  * @evlist - evlist this evsel is in, if it is in one.
@@ -117,12 +44,9 @@
 	struct perf_evsel	core;
 	struct evlist	*evlist;
 	char			*filter;
-	struct xyarray		*sample_id;
-	u64			*id;
 	struct perf_counts	*counts;
 	struct perf_counts	*prev_raw_counts;
 	int			idx;
-	u32			ids;
 	unsigned long		max_events;
 	unsigned long		nr_events_printed;
 	char			*name;
@@ -146,7 +70,6 @@
 	bool 			disabled;
 	bool			no_aux_samples;
 	bool			immediate;
-	bool			system_wide;
 	bool			tracking;
 	bool			per_pkg;
 	bool			precise_max;
@@ -179,11 +102,6 @@
 	} side_band;
 };
 
-union u64_swap {
-	u64 val64;
-	u32 val32[2];
-};
-
 struct perf_missing_features {
 	bool sample_id_all;
 	bool exclude_guest;
@@ -282,8 +200,6 @@
 const char *perf_evsel__group_name(struct evsel *evsel);
 int perf_evsel__group_desc(struct evsel *evsel, char *buf, size_t size);
 
-int perf_evsel__alloc_id(struct evsel *evsel, int ncpus, int nthreads);
-
 void __perf_evsel__set_sample_bit(struct evsel *evsel,
 				  enum perf_event_sample_format bit);
 void __perf_evsel__reset_sample_bit(struct evsel *evsel,
@@ -439,37 +355,6 @@
 	       perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK);
 }
 
-struct perf_attr_details {
-	bool freq;
-	bool verbose;
-	bool event_group;
-	bool force;
-	bool trace_fields;
-};
-
-int perf_evsel__fprintf(struct evsel *evsel,
-			struct perf_attr_details *details, FILE *fp);
-
-#define EVSEL__PRINT_IP			(1<<0)
-#define EVSEL__PRINT_SYM		(1<<1)
-#define EVSEL__PRINT_DSO		(1<<2)
-#define EVSEL__PRINT_SYMOFFSET		(1<<3)
-#define EVSEL__PRINT_ONELINE		(1<<4)
-#define EVSEL__PRINT_SRCLINE		(1<<5)
-#define EVSEL__PRINT_UNKNOWN_AS_ADDR	(1<<6)
-#define EVSEL__PRINT_CALLCHAIN_ARROW	(1<<7)
-#define EVSEL__PRINT_SKIP_IGNORED	(1<<8)
-
-struct callchain_cursor;
-
-int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment,
-			      unsigned int print_opts,
-			      struct callchain_cursor *cursor, FILE *fp);
-
-int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al,
-			int left_alignment, unsigned int print_opts,
-			struct callchain_cursor *cursor, FILE *fp);
-
 bool perf_evsel__fallback(struct evsel *evsel, int err,
 			  char *msg, size_t msgsize);
 int perf_evsel__open_strerror(struct evsel *evsel, struct target *target,
@@ -502,11 +387,6 @@
 	return (evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN) != 0;
 }
 
-typedef int (*attr__fprintf_f)(FILE *, const char *, const char *, void *);
-
-int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
-			     attr__fprintf_f attr__fprintf, void *priv);
-
 struct perf_env *perf_evsel__env(struct evsel *evsel);
 
 int perf_evsel__store_ids(struct evsel *evsel, struct evlist *evlist);
diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h
new file mode 100644
index 0000000..8a76480
--- /dev/null
+++ b/tools/perf/util/evsel_config.h
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __PERF_EVSEL_CONFIG_H
+#define __PERF_EVSEL_CONFIG_H 1
+
+#include <linux/types.h>
+#include <stdbool.h>
+
+/*
+ * The 'struct perf_evsel_config_term' is used to pass event
+ * specific configuration data to perf_evsel__config routine.
+ * It is allocated within event parsing and attached to
+ * perf_evsel::config_terms list head.
+*/
+enum evsel_term_type {
+	PERF_EVSEL__CONFIG_TERM_PERIOD,
+	PERF_EVSEL__CONFIG_TERM_FREQ,
+	PERF_EVSEL__CONFIG_TERM_TIME,
+	PERF_EVSEL__CONFIG_TERM_CALLGRAPH,
+	PERF_EVSEL__CONFIG_TERM_STACK_USER,
+	PERF_EVSEL__CONFIG_TERM_INHERIT,
+	PERF_EVSEL__CONFIG_TERM_MAX_STACK,
+	PERF_EVSEL__CONFIG_TERM_MAX_EVENTS,
+	PERF_EVSEL__CONFIG_TERM_OVERWRITE,
+	PERF_EVSEL__CONFIG_TERM_DRV_CFG,
+	PERF_EVSEL__CONFIG_TERM_BRANCH,
+	PERF_EVSEL__CONFIG_TERM_PERCORE,
+	PERF_EVSEL__CONFIG_TERM_AUX_OUTPUT,
+};
+
+struct perf_evsel_config_term {
+	struct list_head      list;
+	enum evsel_term_type  type;
+	union {
+		u64	      period;
+		u64	      freq;
+		bool	      time;
+		char	      *callgraph;
+		char	      *drv_cfg;
+		u64	      stack_user;
+		int	      max_stack;
+		bool	      inherit;
+		bool	      overwrite;
+		char	      *branch;
+		unsigned long max_events;
+		bool	      percore;
+		bool	      aux_output;
+	} val;
+	bool weak;
+};
+#endif // __PERF_EVSEL_CONFIG_H
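Editor's note: the header comment above describes a parse-then-apply split: event parsing allocates terms and queues them on perf_evsel::config_terms, and the config step consumes the list later. A hedged, self-contained sketch of that pattern, using an invented singly-linked list rather than the kernel's list_head:

#include <stdio.h>
#include <stdlib.h>

/* Invented names; perf's real terms hang off perf_evsel::config_terms. */
enum mini_term_type { MINI_TERM_PERIOD, MINI_TERM_FREQ };

struct mini_term {
	struct mini_term *next;
	enum mini_term_type type;
	unsigned long long val;
};

/* Event parsing allocates a term and queues it... */
static int queue_term(struct mini_term **head, enum mini_term_type type,
		      unsigned long long val)
{
	struct mini_term *t = malloc(sizeof(*t));

	if (!t)
		return -1;
	t->type = type;
	t->val  = val;
	t->next = *head;
	*head   = t;
	return 0;
}

/* ...and the config step later walks the list and applies each term. */
static void apply_terms(const struct mini_term *t,
			unsigned long long *period, unsigned long long *freq)
{
	for (; t; t = t->next) {
		switch (t->type) {
		case MINI_TERM_PERIOD: *period = t->val; break;
		case MINI_TERM_FREQ:   *freq   = t->val; break;
		}
	}
}

int main(void)
{
	struct mini_term *terms = NULL;
	unsigned long long period = 0, freq = 0;

	queue_term(&terms, MINI_TERM_PERIOD, 100000);
	apply_terms(terms, &period, &freq);
	printf("period=%llu freq=%llu\n", period, freq);
	while (terms) {
		struct mini_term *next = terms->next;
		free(terms);
		terms = next;
	}
	return 0;
}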
diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c
index 496fec0..028df7a 100644
--- a/tools/perf/util/evsel_fprintf.c
+++ b/tools/perf/util/evsel_fprintf.c
@@ -4,6 +4,8 @@
 #include <stdbool.h>
 #include <traceevent/event-parse.h>
 #include "evsel.h"
+#include "util/evsel_fprintf.h"
+#include "util/event.h"
 #include "callchain.h"
 #include "map.h"
 #include "strlist.h"
@@ -101,7 +103,7 @@
 
 int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment,
 			      unsigned int print_opts, struct callchain_cursor *cursor,
-			      FILE *fp)
+			      struct strlist *bt_stop_list, FILE *fp)
 {
 	int printed = 0;
 	struct callchain_cursor_node *node;
@@ -174,10 +176,8 @@
 				printed += fprintf(fp, "\n");
 
 			/* Add srccode here too? */
-			if (symbol_conf.bt_stop_list &&
-			    node->sym &&
-			    strlist__has_entry(symbol_conf.bt_stop_list,
-					       node->sym->name)) {
+			if (bt_stop_list && node->sym &&
+			    strlist__has_entry(bt_stop_list, node->sym->name)) {
 				break;
 			}
 
@@ -192,7 +192,7 @@
 
 int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al,
 			int left_alignment, unsigned int print_opts,
-			struct callchain_cursor *cursor, FILE *fp)
+			struct callchain_cursor *cursor, struct strlist *bt_stop_list, FILE *fp)
 {
 	int printed = 0;
 	int print_ip = print_opts & EVSEL__PRINT_IP;
@@ -203,8 +203,8 @@
 	int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR;
 
 	if (cursor != NULL) {
-		printed += sample__fprintf_callchain(sample, left_alignment,
-						     print_opts, cursor, fp);
+		printed += sample__fprintf_callchain(sample, left_alignment, print_opts,
+						     cursor, bt_stop_list, fp);
 	} else {
 		printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " ");
 
diff --git a/tools/perf/util/evsel_fprintf.h b/tools/perf/util/evsel_fprintf.h
new file mode 100644
index 0000000..47e6c84
--- /dev/null
+++ b/tools/perf/util/evsel_fprintf.h
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __PERF_EVSEL_FPRINTF_H
+#define __PERF_EVSEL_FPRINTF_H 1
+
+#include <stdio.h>
+#include <stdbool.h>
+
+struct evsel;
+
+struct perf_attr_details {
+	bool freq;
+	bool verbose;
+	bool event_group;
+	bool force;
+	bool trace_fields;
+};
+
+int perf_evsel__fprintf(struct evsel *evsel,
+			struct perf_attr_details *details, FILE *fp);
+
+#define EVSEL__PRINT_IP			(1<<0)
+#define EVSEL__PRINT_SYM		(1<<1)
+#define EVSEL__PRINT_DSO		(1<<2)
+#define EVSEL__PRINT_SYMOFFSET		(1<<3)
+#define EVSEL__PRINT_ONELINE		(1<<4)
+#define EVSEL__PRINT_SRCLINE		(1<<5)
+#define EVSEL__PRINT_UNKNOWN_AS_ADDR	(1<<6)
+#define EVSEL__PRINT_CALLCHAIN_ARROW	(1<<7)
+#define EVSEL__PRINT_SKIP_IGNORED	(1<<8)
+
+struct addr_location;
+struct perf_event_attr;
+struct perf_sample;
+struct callchain_cursor;
+struct strlist;
+
+int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment,
+			      unsigned int print_opts, struct callchain_cursor *cursor,
+			      struct strlist *bt_stop_list, FILE *fp);
+
+int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al,
+			int left_alignment, unsigned int print_opts,
+			struct callchain_cursor *cursor,
+			struct strlist *bt_stop_list, FILE *fp);
+
+typedef int (*attr__fprintf_f)(FILE *, const char *, const char *, void *);
+
+int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
+			     attr__fprintf_f attr__fprintf, void *priv);
+#endif // __PERF_EVSEL_FPRINTF_H
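Editor's note: the EVSEL__PRINT_* values above are single-bit flags, so callers compose a print_opts word with bitwise OR and the fprintf helpers test individual bits, as sample__fprintf_sym does in evsel_fprintf.c earlier in this diff. A small illustration, with the flag values copied from this header:

#include <stdio.h>

#define EVSEL__PRINT_IP		(1 << 0)
#define EVSEL__PRINT_SYM	(1 << 1)
#define EVSEL__PRINT_DSO	(1 << 2)

int main(void)
{
	unsigned int print_opts = EVSEL__PRINT_IP | EVSEL__PRINT_SYM;

	if (print_opts & EVSEL__PRINT_IP)
		printf("would print the instruction pointer\n");
	if (print_opts & EVSEL__PRINT_DSO)	/* bit not set: skipped */
		printf("would print the DSO name\n");
	return 0;
}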
diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h
index b72440b..d413755 100644
--- a/tools/perf/util/genelf.h
+++ b/tools/perf/util/genelf.h
@@ -35,6 +35,9 @@
 #elif defined(__sparc__)
 #define GEN_ELF_ARCH	EM_SPARC
 #define GEN_ELF_CLASS	ELFCLASS32
+#elif defined(__s390x__)
+#define GEN_ELF_ARCH	EM_S390
+#define GEN_ELF_CLASS	ELFCLASS64
 #else
 #error "unsupported architecture"
 #endif
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index b0c34dd..86d9396 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -25,6 +25,7 @@
 #include "dso.h"
 #include "evlist.h"
 #include "evsel.h"
+#include "util/evsel_fprintf.h"
 #include "header.h"
 #include "memswap.h"
 #include "trace-event.h"
@@ -42,11 +43,12 @@
 #include "tool.h"
 #include "time-utils.h"
 #include "units.h"
-#include "util.h"
+#include "util/util.h" // perf_exe()
 #include "cputopo.h"
 #include "bpf-event.h"
 
 #include <linux/ctype.h>
+#include <internal/lib.h>
 
 /*
  * magic2 = "PERFILE2"
@@ -70,15 +72,6 @@
 	struct perf_file_section	ids;
 };
 
-struct feat_fd {
-	struct perf_header	*ph;
-	int			fd;
-	void			*buf;	/* Either buf != NULL or fd >= 0 */
-	ssize_t			offset;
-	size_t			size;
-	struct evsel	*events;
-};
-
 void perf_header__set_feat(struct perf_header *header, int feat)
 {
 	set_bit(feat, header->adds_features);
@@ -524,7 +517,7 @@
 		 * copy into an nri to be independent of the
 		 * type of ids,
 		 */
-		nri = evsel->ids;
+		nri = evsel->core.ids;
 		ret = do_write(ff, &nri, sizeof(nri));
 		if (ret < 0)
 			return ret;
@@ -538,7 +531,7 @@
 		/*
 		 * write unique ids for this event
 		 */
-		ret = do_write(ff, evsel->id, evsel->ids * sizeof(u64));
+		ret = do_write(ff, evsel->core.id, evsel->core.ids * sizeof(u64));
 		if (ret < 0)
 			return ret;
 	}
@@ -1081,7 +1074,7 @@
 
 	scnprintf(file, PATH_MAX, "%s/shared_cpu_list", path);
 	if (sysfs__read_str(file, &cache->map, &len)) {
-		zfree(&cache->map);
+		zfree(&cache->size);
 		zfree(&cache->type);
 		return -1;
 	}
@@ -1598,7 +1591,7 @@
 
 	for (evsel = events; evsel->core.attr.size; evsel++) {
 		zfree(&evsel->name);
-		zfree(&evsel->id);
+		zfree(&evsel->core.id);
 	}
 
 	free(events);
@@ -1664,8 +1657,8 @@
 		id = calloc(nr, sizeof(*id));
 		if (!id)
 			goto error;
-		evsel->ids = nr;
-		evsel->id = id;
+		evsel->core.ids = nr;
+		evsel->core.id = id;
 
 		for (j = 0 ; j < nr; j++) {
 			if (do_read_u64(ff, id))
@@ -1707,9 +1700,9 @@
 	for (evsel = events; evsel->core.attr.size; evsel++) {
 		fprintf(fp, "# event : name = %s, ", evsel->name);
 
-		if (evsel->ids) {
+		if (evsel->core.ids) {
 			fprintf(fp, ", id = {");
-			for (j = 0, id = evsel->id; j < evsel->ids; j++, id++) {
+			for (j = 0, id = evsel->core.id; j < evsel->core.ids; j++, id++) {
 				if (j)
 					fputc(',', fp);
 				fprintf(fp, " %"PRIu64, *id);
@@ -2823,15 +2816,6 @@
 	return 0;
 }
 
-struct feature_ops {
-	int (*write)(struct feat_fd *ff, struct evlist *evlist);
-	void (*print)(struct feat_fd *ff, FILE *fp);
-	int (*process)(struct feat_fd *ff, void *data);
-	const char *name;
-	bool full_only;
-	bool synthesize;
-};
-
 #define FEAT_OPR(n, func, __full_only) \
 	[HEADER_##n] = {					\
 		.name	    = __stringify(n),			\
@@ -2858,8 +2842,10 @@
 #define process_branch_stack	NULL
 #define process_stat		NULL
 
+// Only used in util/synthetic-events.c
+const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE];
 
-static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
+const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = {
 	FEAT_OPN(TRACING_DATA,	tracing_data,	false),
 	FEAT_OPN(BUILD_ID,	build_id,	false),
 	FEAT_OPR(HOSTNAME,	hostname,	false),
@@ -3083,7 +3069,7 @@
 
 	evlist__for_each_entry(session->evlist, evsel) {
 		evsel->id_offset = lseek(fd, 0, SEEK_CUR);
-		err = do_write(&ff, evsel->id, evsel->ids * sizeof(u64));
+		err = do_write(&ff, evsel->core.id, evsel->core.ids * sizeof(u64));
 		if (err < 0) {
 			pr_debug("failed to write perf header\n");
 			return err;
@@ -3097,7 +3083,7 @@
 			.attr = evsel->core.attr,
 			.ids  = {
 				.offset = evsel->id_offset,
-				.size   = evsel->ids * sizeof(u64),
+				.size   = evsel->core.ids * sizeof(u64),
 			}
 		};
 		err = do_write(&ff, &f_attr, sizeof(f_attr));
@@ -3624,7 +3610,7 @@
 		 * for allocating the perf_sample_id table we fake 1 cpu and
 		 * hattr->ids threads.
 		 */
-		if (perf_evsel__alloc_id(evsel, 1, nr_ids))
+		if (perf_evsel__alloc_id(&evsel->core, 1, nr_ids))
 			goto out_delete_evlist;
 
 		lseek(fd, f_attr.ids.offset, SEEK_SET);
@@ -3633,7 +3619,7 @@
 			if (perf_header__getbuffer64(header, fd, &f_id, sizeof(f_id)))
 				goto out_errno;
 
-			perf_evlist__id_add(session->evlist, evsel, 0, j, f_id);
+			perf_evlist__id_add(&session->evlist->core, &evsel->core, 0, j, f_id);
 		}
 
 		lseek(fd, tmp, SEEK_SET);
@@ -3656,105 +3642,6 @@
 	return -ENOMEM;
 }
 
-int perf_event__synthesize_attr(struct perf_tool *tool,
-				struct perf_event_attr *attr, u32 ids, u64 *id,
-				perf_event__handler_t process)
-{
-	union perf_event *ev;
-	size_t size;
-	int err;
-
-	size = sizeof(struct perf_event_attr);
-	size = PERF_ALIGN(size, sizeof(u64));
-	size += sizeof(struct perf_event_header);
-	size += ids * sizeof(u64);
-
-	ev = zalloc(size);
-
-	if (ev == NULL)
-		return -ENOMEM;
-
-	ev->attr.attr = *attr;
-	memcpy(ev->attr.id, id, ids * sizeof(u64));
-
-	ev->attr.header.type = PERF_RECORD_HEADER_ATTR;
-	ev->attr.header.size = (u16)size;
-
-	if (ev->attr.header.size == size)
-		err = process(tool, ev, NULL, NULL);
-	else
-		err = -E2BIG;
-
-	free(ev);
-
-	return err;
-}
-
-int perf_event__synthesize_features(struct perf_tool *tool,
-				    struct perf_session *session,
-				    struct evlist *evlist,
-				    perf_event__handler_t process)
-{
-	struct perf_header *header = &session->header;
-	struct feat_fd ff;
-	struct perf_record_header_feature *fe;
-	size_t sz, sz_hdr;
-	int feat, ret;
-
-	sz_hdr = sizeof(fe->header);
-	sz = sizeof(union perf_event);
-	/* get a nice alignment */
-	sz = PERF_ALIGN(sz, page_size);
-
-	memset(&ff, 0, sizeof(ff));
-
-	ff.buf = malloc(sz);
-	if (!ff.buf)
-		return -ENOMEM;
-
-	ff.size = sz - sz_hdr;
-	ff.ph = &session->header;
-
-	for_each_set_bit(feat, header->adds_features, HEADER_FEAT_BITS) {
-		if (!feat_ops[feat].synthesize) {
-			pr_debug("No record header feature for header :%d\n", feat);
-			continue;
-		}
-
-		ff.offset = sizeof(*fe);
-
-		ret = feat_ops[feat].write(&ff, evlist);
-		if (ret || ff.offset <= (ssize_t)sizeof(*fe)) {
-			pr_debug("Error writing feature\n");
-			continue;
-		}
-		/* ff.buf may have changed due to realloc in do_write() */
-		fe = ff.buf;
-		memset(fe, 0, sizeof(*fe));
-
-		fe->feat_id = feat;
-		fe->header.type = PERF_RECORD_HEADER_FEATURE;
-		fe->header.size = ff.offset;
-
-		ret = process(tool, ff.buf, NULL, NULL);
-		if (ret) {
-			free(ff.buf);
-			return ret;
-		}
-	}
-
-	/* Send HEADER_LAST_FEATURE mark. */
-	fe = ff.buf;
-	fe->feat_id     = HEADER_LAST_FEATURE;
-	fe->header.type = PERF_RECORD_HEADER_FEATURE;
-	fe->header.size = sizeof(*fe);
-
-	ret = process(tool, ff.buf, NULL, NULL);
-
-	free(ff.buf);
-	return ret;
-}
-
 int perf_event__process_feature(struct perf_session *session,
 				union perf_event *event)
 {
@@ -3797,113 +3684,6 @@
 	return 0;
 }
 
-static struct perf_record_event_update *
-event_update_event__new(size_t size, u64 type, u64 id)
-{
-	struct perf_record_event_update *ev;
-
-	size += sizeof(*ev);
-	size  = PERF_ALIGN(size, sizeof(u64));
-
-	ev = zalloc(size);
-	if (ev) {
-		ev->header.type = PERF_RECORD_EVENT_UPDATE;
-		ev->header.size = (u16)size;
-		ev->type = type;
-		ev->id = id;
-	}
-	return ev;
-}
-
-int
-perf_event__synthesize_event_update_unit(struct perf_tool *tool,
-					 struct evsel *evsel,
-					 perf_event__handler_t process)
-{
-	struct perf_record_event_update *ev;
-	size_t size = strlen(evsel->unit);
-	int err;
-
-	ev = event_update_event__new(size + 1, PERF_EVENT_UPDATE__UNIT, evsel->id[0]);
-	if (ev == NULL)
-		return -ENOMEM;
-
-	strlcpy(ev->data, evsel->unit, size + 1);
-	err = process(tool, (union perf_event *)ev, NULL, NULL);
-	free(ev);
-	return err;
-}
-
-int
-perf_event__synthesize_event_update_scale(struct perf_tool *tool,
-					  struct evsel *evsel,
-					  perf_event__handler_t process)
-{
-	struct perf_record_event_update *ev;
-	struct perf_record_event_update_scale *ev_data;
-	int err;
-
-	ev = event_update_event__new(sizeof(*ev_data), PERF_EVENT_UPDATE__SCALE, evsel->id[0]);
-	if (ev == NULL)
-		return -ENOMEM;
-
-	ev_data = (struct perf_record_event_update_scale *)ev->data;
-	ev_data->scale = evsel->scale;
-	err = process(tool, (union perf_event*) ev, NULL, NULL);
-	free(ev);
-	return err;
-}
-
-int
-perf_event__synthesize_event_update_name(struct perf_tool *tool,
-					 struct evsel *evsel,
-					 perf_event__handler_t process)
-{
-	struct perf_record_event_update *ev;
-	size_t len = strlen(evsel->name);
-	int err;
-
-	ev = event_update_event__new(len + 1, PERF_EVENT_UPDATE__NAME, evsel->id[0]);
-	if (ev == NULL)
-		return -ENOMEM;
-
-	strlcpy(ev->data, evsel->name, len + 1);
-	err = process(tool, (union perf_event*) ev, NULL, NULL);
-	free(ev);
-	return err;
-}
-
-int
-perf_event__synthesize_event_update_cpus(struct perf_tool *tool,
-					struct evsel *evsel,
-					perf_event__handler_t process)
-{
-	size_t size = sizeof(struct perf_record_event_update);
-	struct perf_record_event_update *ev;
-	int max, err;
-	u16 type;
-
-	if (!evsel->core.own_cpus)
-		return 0;
-
-	ev = cpu_map_data__alloc(evsel->core.own_cpus, &size, &type, &max);
-	if (!ev)
-		return -ENOMEM;
-
-	ev->header.type = PERF_RECORD_EVENT_UPDATE;
-	ev->header.size = (u16)size;
-	ev->type = PERF_EVENT_UPDATE__CPUS;
-	ev->id   = evsel->id[0];
-
-	cpu_map_data__synthesize((struct perf_record_cpu_map_data *)ev->data,
-				 evsel->core.own_cpus,
-				 type, max);
-
-	err = process(tool, (union perf_event*) ev, NULL, NULL);
-	free(ev);
-	return err;
-}
-
 size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp)
 {
 	struct perf_record_event_update *ev = &event->event_update;
@@ -3943,93 +3723,6 @@
 	return ret;
 }
 
-int perf_event__synthesize_attrs(struct perf_tool *tool,
-				 struct evlist *evlist,
-				 perf_event__handler_t process)
-{
-	struct evsel *evsel;
-	int err = 0;
-
-	evlist__for_each_entry(evlist, evsel) {
-		err = perf_event__synthesize_attr(tool, &evsel->core.attr, evsel->ids,
-						  evsel->id, process);
-		if (err) {
-			pr_debug("failed to create perf header attribute\n");
-			return err;
-		}
-	}
-
-	return err;
-}
-
-static bool has_unit(struct evsel *counter)
-{
-	return counter->unit && *counter->unit;
-}
-
-static bool has_scale(struct evsel *counter)
-{
-	return counter->scale != 1;
-}
-
-int perf_event__synthesize_extra_attr(struct perf_tool *tool,
-				      struct evlist *evsel_list,
-				      perf_event__handler_t process,
-				      bool is_pipe)
-{
-	struct evsel *counter;
-	int err;
-
-	/*
-	 * Synthesize other events stuff not carried within
-	 * attr event - unit, scale, name
-	 */
-	evlist__for_each_entry(evsel_list, counter) {
-		if (!counter->supported)
-			continue;
-
-		/*
-		 * Synthesize unit and scale only if it's defined.
-		 */
-		if (has_unit(counter)) {
-			err = perf_event__synthesize_event_update_unit(tool, counter, process);
-			if (err < 0) {
-				pr_err("Couldn't synthesize evsel unit.\n");
-				return err;
-			}
-		}
-
-		if (has_scale(counter)) {
-			err = perf_event__synthesize_event_update_scale(tool, counter, process);
-			if (err < 0) {
-				pr_err("Couldn't synthesize evsel counter.\n");
-				return err;
-			}
-		}
-
-		if (counter->core.own_cpus) {
-			err = perf_event__synthesize_event_update_cpus(tool, counter, process);
-			if (err < 0) {
-				pr_err("Couldn't synthesize evsel cpus.\n");
-				return err;
-			}
-		}
-
-		/*
-		 * Name is needed only for pipe output,
-		 * perf.data carries event names.
-		 */
-		if (is_pipe) {
-			err = perf_event__synthesize_event_update_name(tool, counter, process);
-			if (err < 0) {
-				pr_err("Couldn't synthesize evsel name.\n");
-				return err;
-			}
-		}
-	}
-	return 0;
-}
-
 int perf_event__process_attr(struct perf_tool *tool __maybe_unused,
 			     union perf_event *event,
 			     struct evlist **pevlist)
@@ -4058,11 +3751,11 @@
 	 * for allocating the perf_sample_id table we fake 1 cpu and
 	 * hattr->ids threads.
 	 */
-	if (perf_evsel__alloc_id(evsel, 1, n_ids))
+	if (perf_evsel__alloc_id(&evsel->core, 1, n_ids))
 		return -ENOMEM;
 
 	for (i = 0; i < n_ids; i++) {
-		perf_evlist__id_add(evlist, evsel, 0, i, event->attr.id[i]);
+		perf_evlist__id_add(&evlist->core, &evsel->core, 0, i, event->attr.id[i]);
 	}
 
 	return 0;
@@ -4114,55 +3807,6 @@
 	return 0;
 }
 
-int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd,
-					struct evlist *evlist,
-					perf_event__handler_t process)
-{
-	union perf_event ev;
-	struct tracing_data *tdata;
-	ssize_t size = 0, aligned_size = 0, padding;
-	struct feat_fd ff;
-	int err __maybe_unused = 0;
-
-	/*
-	 * We are going to store the size of the data followed
-	 * by the data contents. Since the fd descriptor is a pipe,
-	 * we cannot seek back to store the size of the data once
-	 * we know it. Instead we:
-	 *
-	 * - write the tracing data to the temp file
-	 * - get/write the data size to pipe
-	 * - write the tracing data from the temp file
-	 *   to the pipe
-	 */
-	tdata = tracing_data_get(&evlist->core.entries, fd, true);
-	if (!tdata)
-		return -1;
-
-	memset(&ev, 0, sizeof(ev));
-
-	ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA;
-	size = tdata->size;
-	aligned_size = PERF_ALIGN(size, sizeof(u64));
-	padding = aligned_size - size;
-	ev.tracing_data.header.size = sizeof(ev.tracing_data);
-	ev.tracing_data.size = aligned_size;
-
-	process(tool, &ev, NULL, NULL);
-
-	/*
-	 * The put function will copy all the tracing data
-	 * stored in temp file to the pipe.
-	 */
-	tracing_data_put(tdata);
-
-	ff = (struct feat_fd){ .fd = fd };
-	if (write_padded(&ff, NULL, 0, padding))
-		return -1;
-
-	return aligned_size;
-}
-
 int perf_event__process_tracing_data(struct perf_session *session,
 				     union perf_event *event)
 {
@@ -4202,34 +3846,6 @@
 	return size_read + padding;
 }
 
-int perf_event__synthesize_build_id(struct perf_tool *tool,
-				    struct dso *pos, u16 misc,
-				    perf_event__handler_t process,
-				    struct machine *machine)
-{
-	union perf_event ev;
-	size_t len;
-	int err = 0;
-
-	if (!pos->hit)
-		return err;
-
-	memset(&ev, 0, sizeof(ev));
-
-	len = pos->long_name_len + 1;
-	len = PERF_ALIGN(len, NAME_ALIGN);
-	memcpy(&ev.build_id.build_id, pos->build_id, sizeof(pos->build_id));
-	ev.build_id.header.type = PERF_RECORD_HEADER_BUILD_ID;
-	ev.build_id.header.misc = misc;
-	ev.build_id.pid = machine->pid;
-	ev.build_id.header.size = sizeof(ev.build_id) + len;
-	memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len);
-
-	err = process(tool, &ev, NULL, machine);
-
-	return err;
-}
-
 int perf_event__process_build_id(struct perf_session *session,
 				 union perf_event *event)
 {
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 3e48ae3..ca53a92 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -5,10 +5,10 @@
 #include <linux/stddef.h>
 #include <linux/perf_event.h>
 #include <sys/types.h>
+#include <stdio.h> // FILE
 #include <stdbool.h>
 #include <linux/bitmap.h>
 #include <linux/types.h>
-#include "event.h"
 #include "env.h"
 #include "pmu.h"
 
@@ -92,8 +92,28 @@
 	struct perf_env 	env;
 };
 
+struct feat_fd {
+	struct perf_header *ph;
+	int		   fd;
+	void		   *buf;	/* Either buf != NULL or fd >= 0 */
+	ssize_t		   offset;
+	size_t		   size;
+	struct evsel	   *events;
+};
+
+struct perf_header_feature_ops {
+	int	   (*write)(struct feat_fd *ff, struct evlist *evlist);
+	void	   (*print)(struct feat_fd *ff, FILE *fp);
+	int	   (*process)(struct feat_fd *ff, void *data);
+	const char *name;
+	bool	   full_only;
+	bool	   synthesize;
+};
+
 struct evlist;
 struct perf_session;
+struct perf_tool;
+union perf_event;
 
 int perf_session__read_header(struct perf_session *session);
 int perf_session__write_header(struct perf_session *session,
@@ -115,54 +135,16 @@
 
 int perf_header__fprintf_info(struct perf_session *s, FILE *fp, bool full);
 
-int perf_event__synthesize_features(struct perf_tool *tool,
-				    struct perf_session *session,
-				    struct evlist *evlist,
-				    perf_event__handler_t process);
-
-int perf_event__synthesize_extra_attr(struct perf_tool *tool,
-				      struct evlist *evsel_list,
-				      perf_event__handler_t process,
-				      bool is_pipe);
-
 int perf_event__process_feature(struct perf_session *session,
 				union perf_event *event);
-
-int perf_event__synthesize_attr(struct perf_tool *tool,
-				struct perf_event_attr *attr, u32 ids, u64 *id,
-				perf_event__handler_t process);
-int perf_event__synthesize_attrs(struct perf_tool *tool,
-				 struct evlist *evlist,
-				 perf_event__handler_t process);
-int perf_event__synthesize_event_update_unit(struct perf_tool *tool,
-					     struct evsel *evsel,
-					     perf_event__handler_t process);
-int perf_event__synthesize_event_update_scale(struct perf_tool *tool,
-					      struct evsel *evsel,
-					      perf_event__handler_t process);
-int perf_event__synthesize_event_update_name(struct perf_tool *tool,
-					     struct evsel *evsel,
-					     perf_event__handler_t process);
-int perf_event__synthesize_event_update_cpus(struct perf_tool *tool,
-					     struct evsel *evsel,
-					     perf_event__handler_t process);
 int perf_event__process_attr(struct perf_tool *tool, union perf_event *event,
 			     struct evlist **pevlist);
 int perf_event__process_event_update(struct perf_tool *tool,
 				     union perf_event *event,
 				     struct evlist **pevlist);
 size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp);
-
-int perf_event__synthesize_tracing_data(struct perf_tool *tool,
-					int fd, struct evlist *evlist,
-					perf_event__handler_t process);
 int perf_event__process_tracing_data(struct perf_session *session,
 				     union perf_event *event);
-
-int perf_event__synthesize_build_id(struct perf_tool *tool,
-				    struct dso *pos, u16 misc,
-				    perf_event__handler_t process,
-				    struct machine *machine);
 int perf_event__process_build_id(struct perf_session *session,
 				 union perf_event *event);
 bool is_perf_magic(u64 magic);
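Editor's note: struct perf_header_feature_ops becomes visible in header.h so that util/synthetic-events.c can reach the now-exported feat_ops[] table. It is a per-feature dispatch table indexed by feature id; the synthesize flag is what the relocated perf_event__synthesize_features() checks before invoking ->write(). A hedged miniature of that dispatch shape, with invented names:

#include <stdio.h>

/* Hypothetical miniature of an ops table indexed by feature id. */
struct mini_ops {
	const char *name;
	int  (*write)(FILE *out);
	int  synthesize;	/* skip entries that cannot be synthesized */
};

static int write_hostname(FILE *out) { return fprintf(out, "hostname\n") < 0; }

enum { FEAT_HOSTNAME, FEAT_LAST };

static const struct mini_ops ops[FEAT_LAST] = {
	[FEAT_HOSTNAME] = { .name = "HOSTNAME", .write = write_hostname,
			    .synthesize = 1 },
};

int main(void)
{
	for (int feat = 0; feat < FEAT_LAST; feat++) {
		if (!ops[feat].synthesize)
			continue;	/* mirrors the feat_ops[feat].synthesize check */
		if (ops[feat].write(stdout))
			fprintf(stderr, "failed to write %s\n", ops[feat].name);
	}
	return 0;
}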
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 34803e3..6a186b6 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -15,6 +15,7 @@
 struct map_symbol;
 struct mem_info;
 struct branch_info;
+struct branch_stack;
 struct block_info;
 struct symbol;
 struct ui_progress;
diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c
index aacffa2..34cb380 100644
--- a/tools/perf/util/intel-bts.c
+++ b/tools/perf/util/intel-bts.c
@@ -14,7 +14,6 @@
 #include <linux/log2.h>
 #include <linux/zalloc.h>
 
-#include "cpumap.h"
 #include "color.h"
 #include "evsel.h"
 #include "evlist.h"
@@ -29,6 +28,7 @@
 #include "auxtrace.h"
 #include "intel-pt-decoder/intel-pt-insn-decoder.h"
 #include "intel-bts.h"
+#include "util/synthetic-events.h"
 
 #define MAX_TIMESTAMP (~0ULL)
 
@@ -768,7 +768,7 @@
 	int err;
 
 	evlist__for_each_entry(evlist, evsel) {
-		if (evsel->core.attr.type == bts->pmu_type && evsel->ids) {
+		if (evsel->core.attr.type == bts->pmu_type && evsel->core.ids) {
 			found = true;
 			break;
 		}
@@ -795,7 +795,7 @@
 	attr.sample_id_all = evsel->core.attr.sample_id_all;
 	attr.read_format = evsel->core.attr.read_format;
 
-	id = evsel->id[0] + 1000000000;
+	id = evsel->core.id[0] + 1000000000;
 	if (!id)
 		id = 1;
 
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 9b56fb7..a1c9eb6 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -33,6 +33,7 @@
 #include "tsc.h"
 #include "intel-pt.h"
 #include "config.h"
+#include "util/synthetic-events.h"
 #include "time-utils.h"
 
 #include "../arch/x86/include/uapi/asm/perf_regs.h"
@@ -1704,7 +1705,7 @@
 	struct intel_pt *pt = ptq->pt;
 	struct evsel *evsel = pt->pebs_evsel;
 	u64 sample_type = evsel->core.attr.sample_type;
-	u64 id = evsel->id[0];
+	u64 id = evsel->core.id[0];
 	u8 cpumode;
 
 	if (intel_pt_skip_event(pt))
@@ -2719,7 +2720,7 @@
 	struct evsel *evsel;
 
 	evlist__for_each_entry(evlist, evsel) {
-		if (evsel->id && evsel->id[0] == id) {
+		if (evsel->core.id && evsel->core.id[0] == id) {
 			if (evsel->name)
 				zfree(&evsel->name);
 			evsel->name = strdup(name);
@@ -2734,7 +2735,7 @@
 	struct evsel *evsel;
 
 	evlist__for_each_entry(evlist, evsel) {
-		if (evsel->core.attr.type == pt->pmu_type && evsel->ids)
+		if (evsel->core.attr.type == pt->pmu_type && evsel->core.ids)
 			return evsel;
 	}
 
@@ -2775,7 +2776,7 @@
 	attr.sample_id_all = evsel->core.attr.sample_id_all;
 	attr.read_format = evsel->core.attr.read_format;
 
-	id = evsel->id[0] + 1000000000;
+	id = evsel->core.id[0] + 1000000000;
 	if (!id)
 		id = 1;
 
@@ -2902,7 +2903,7 @@
 		return;
 
 	evlist__for_each_entry(pt->session->evlist, evsel) {
-		if (evsel->core.attr.aux_output && evsel->id) {
+		if (evsel->core.attr.aux_output && evsel->core.id) {
 			pt->sample_pebs = true;
 			pt->pebs_evsel = evsel;
 			return;
diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c
index b80f29b..1bdf4c6 100644
--- a/tools/perf/util/jitdump.c
+++ b/tools/perf/util/jitdump.c
@@ -15,7 +15,6 @@
 #include <linux/stringify.h>
 
 #include "build-id.h"
-#include "util.h"
 #include "event.h"
 #include "debug.h"
 #include "evlist.h"
@@ -27,7 +26,6 @@
 #include "jit.h"
 #include "jitdump.h"
 #include "genelf.h"
-#include "../builtin.h"
 
 #include <linux/ctype.h>
 #include <linux/zalloc.h>
@@ -779,7 +777,7 @@
 	 * track sample_type to compute id_all layout
 	 * perf sets the same sample type to all events as of now
 	 */
-	first = perf_evlist__first(session->evlist);
+	first = evlist__first(session->evlist);
 	jd.sample_type = first->core.attr.sample_type;
 
 	*nbytes = 0;
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index 4691363..6f0fa05 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -2,6 +2,8 @@
 #ifndef __PERF_KVM_STAT_H
 #define __PERF_KVM_STAT_H
 
+#ifdef HAVE_KVM_STAT_SUPPORT
+
 #include "tool.h"
 #include "stat.h"
 #include "record.h"
@@ -144,5 +146,7 @@
 extern const char *kvm_exit_reason;
 extern const char *kvm_entry_trace;
 extern const char *kvm_exit_trace;
+#endif /* HAVE_KVM_STAT_SUPPORT */
 
+extern int kvm_add_default_arch_event(int *argc, const char **argv);
 #endif /* __PERF_KVM_STAT_H */
diff --git a/tools/perf/util/libunwind/arm64.c b/tools/perf/util/libunwind/arm64.c
index 66756e6..6b4e5a0 100644
--- a/tools/perf/util/libunwind/arm64.c
+++ b/tools/perf/util/libunwind/arm64.c
@@ -22,7 +22,6 @@
 #define LIBUNWIND__ARCH_REG_SP PERF_REG_ARM64_SP
 
 #include "unwind.h"
-#include "debug.h"
 #include "libunwind-aarch64.h"
 #include <../../../../arch/arm64/include/uapi/asm/perf_regs.h>
 #include "../../arch/arm64/util/unwind-libunwind.c"
diff --git a/tools/perf/util/libunwind/x86_32.c b/tools/perf/util/libunwind/x86_32.c
index c5e5681..21c216c 100644
--- a/tools/perf/util/libunwind/x86_32.c
+++ b/tools/perf/util/libunwind/x86_32.c
@@ -22,7 +22,6 @@
 #define LIBUNWIND__ARCH_REG_SP PERF_REG_X86_SP
 
 #include "unwind.h"
-#include "debug.h"
 #include "libunwind-x86.h"
 #include <../../../../arch/x86/include/uapi/asm/perf_regs.h>
 
diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c
index 55fb4b3..8d04e3d 100644
--- a/tools/perf/util/llvm-utils.c
+++ b/tools/perf/util/llvm-utils.c
@@ -8,6 +8,7 @@
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 #include <linux/err.h>
 #include <linux/string.h>
 #include <linux/zalloc.h>
diff --git a/tools/perf/util/lzma.c b/tools/perf/util/lzma.c
index 3974470..39062df 100644
--- a/tools/perf/util/lzma.c
+++ b/tools/perf/util/lzma.c
@@ -7,10 +7,10 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include "compress.h"
-#include "util.h"
 #include "debug.h"
 #include <string.h>
 #include <unistd.h>
+#include <internal/lib.h>
 
 #define BUFSIZE 8192
 
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index b4749d3..70a9f871 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -32,6 +32,7 @@
 #include "linux/hash.h"
 #include "asm/bug.h"
 #include "bpf-event.h"
+#include <internal/lib.h> // page_size
 
 #include <linux/ctype.h>
 #include <symbol/kallsyms.h>
@@ -2609,21 +2610,6 @@
 	return rc;
 }
 
-int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
-				  struct target *target, struct perf_thread_map *threads,
-				  perf_event__handler_t process, bool data_mmap,
-				  unsigned int nr_threads_synthesize)
-{
-	if (target__has_task(target))
-		return perf_event__synthesize_thread_map(tool, threads, process, machine, data_mmap);
-	else if (target__has_cpu(target))
-		return perf_event__synthesize_threads(tool, process,
-						      machine, data_mmap,
-						      nr_threads_synthesize);
-	/* command specified */
-	return 0;
-}
-
 pid_t machine__get_current_tid(struct machine *machine, int cpu)
 {
 	int nr_cpus = min(machine->env->nr_cpus_online, MAX_NR_CPUS);
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index ffd391a..18e13c0 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -6,7 +6,6 @@
 #include <linux/rbtree.h>
 #include "map_groups.h"
 #include "dsos.h"
-#include "event.h"
 #include "rwsem.h"
 
 struct addr_location;
@@ -252,20 +251,6 @@
 			      int (*fn)(struct thread *thread, void *p),
 			      void *priv);
 
-int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
-				  struct target *target, struct perf_thread_map *threads,
-				  perf_event__handler_t process, bool data_mmap,
-				  unsigned int nr_threads_synthesize);
-static inline
-int machine__synthesize_threads(struct machine *machine, struct target *target,
-				struct perf_thread_map *threads, bool data_mmap,
-				unsigned int nr_threads_synthesize)
-{
-	return __machine__synthesize_threads(machine, NULL, target, threads,
-					     perf_event__process, data_mmap,
-					     nr_threads_synthesize);
-}
-
 pid_t machine__get_current_tid(struct machine *machine, int cpu);
 int machine__set_current_tid(struct machine *machine, int cpu, pid_t pid,
 			     pid_t tid);
diff --git a/tools/perf/util/memswap.h b/tools/perf/util/memswap.h
index 1e29ff9..2c38e8c 100644
--- a/tools/perf/util/memswap.h
+++ b/tools/perf/util/memswap.h
@@ -2,6 +2,13 @@
 #ifndef PERF_MEMSWAP_H_
 #define PERF_MEMSWAP_H_
 
+#include <linux/types.h>
+
+union u64_swap {
+	u64 val64;
+	u32 val32[2];
+};
+
 void mem_bswap_64(void *src, int byte_size);
 void mem_bswap_32(void *src, int byte_size);
 
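Editor's note: union u64_swap moves here because sample code packs two u32 values (pid/tid, cpu/padding) into one u64 slot; viewing the slot through val32[] lets cross-endian code swap each 32-bit half independently. A hedged demo (the bswap builtin is a GCC/Clang extension):

#include <stdint.h>
#include <stdio.h>

union u64_swap {
	uint64_t val64;
	uint32_t val32[2];
};

int main(void)
{
	union u64_swap u;

	/* Pack pid and tid into one u64 slot, as the sample encoder does. */
	u.val32[0] = 1234;	/* pid */
	u.val32[1] = 5678;	/* tid */
	printf("packed: %#llx\n", (unsigned long long)u.val64);

	/* A cross-endian reader would byte-swap each 32-bit half. */
	u.val32[0] = __builtin_bswap32(u.val32[0]);
	u.val32[1] = __builtin_bswap32(u.val32[1]);
	printf("halves swapped: %#llx\n", (unsigned long long)u.val64);
	return 0;
}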
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 33c5b54..a35dc57 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -12,6 +12,7 @@
 #include <linux/zalloc.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h> // sysconf()
 #ifdef HAVE_LIBNUMA_SUPPORT
 #include <numaif.h>
 #endif
@@ -20,25 +21,25 @@
 #include "event.h"
 #include "mmap.h"
 #include "../perf.h"
-#include "util.h" /* page_size */
+#include <internal/lib.h> /* page_size */
 
-size_t perf_mmap__mmap_len(struct perf_mmap *map)
+size_t perf_mmap__mmap_len(struct mmap *map)
 {
-	return map->mask + 1 + page_size;
+	return map->core.mask + 1 + page_size;
 }
 
 /* When check_messup is true, 'end' must point to a good entry */
-static union perf_event *perf_mmap__read(struct perf_mmap *map,
+static union perf_event *perf_mmap__read(struct mmap *map,
 					 u64 *startp, u64 end)
 {
-	unsigned char *data = map->base + page_size;
+	unsigned char *data = map->core.base + page_size;
 	union perf_event *event = NULL;
 	int diff = end - *startp;
 
 	if (diff >= (int)sizeof(event->header)) {
 		size_t size;
 
-		event = (union perf_event *)&data[*startp & map->mask];
+		event = (union perf_event *)&data[*startp & map->core.mask];
 		size = event->header.size;
 
 		if (size < sizeof(event->header) || diff < (int)size)
@@ -48,20 +49,20 @@
 		 * Event straddles the mmap boundary -- header should always
 		 * be inside due to u64 alignment of output.
 		 */
-		if ((*startp & map->mask) + size != ((*startp + size) & map->mask)) {
+		if ((*startp & map->core.mask) + size != ((*startp + size) & map->core.mask)) {
 			unsigned int offset = *startp;
 			unsigned int len = min(sizeof(*event), size), cpy;
-			void *dst = map->event_copy;
+			void *dst = map->core.event_copy;
 
 			do {
-				cpy = min(map->mask + 1 - (offset & map->mask), len);
-				memcpy(dst, &data[offset & map->mask], cpy);
+				cpy = min(map->core.mask + 1 - (offset & map->core.mask), len);
+				memcpy(dst, &data[offset & map->core.mask], cpy);
 				offset += cpy;
 				dst += cpy;
 				len -= cpy;
 			} while (len);
 
-			event = (union perf_event *)map->event_copy;
+			event = (union perf_event *)map->core.event_copy;
 		}
 
 		*startp += size;
@@ -82,55 +83,55 @@
  * }
  * perf_mmap__read_done()
  */
-union perf_event *perf_mmap__read_event(struct perf_mmap *map)
+union perf_event *perf_mmap__read_event(struct mmap *map)
 {
 	union perf_event *event;
 
 	/*
 	 * Check if event was unmapped due to a POLLHUP/POLLERR.
 	 */
-	if (!refcount_read(&map->refcnt))
+	if (!refcount_read(&map->core.refcnt))
 		return NULL;
 
 	/* non-overwrite doesn't pause the ringbuffer */
-	if (!map->overwrite)
-		map->end = perf_mmap__read_head(map);
+	if (!map->core.overwrite)
+		map->core.end = perf_mmap__read_head(map);
 
-	event = perf_mmap__read(map, &map->start, map->end);
+	event = perf_mmap__read(map, &map->core.start, map->core.end);
 
-	if (!map->overwrite)
-		map->prev = map->start;
+	if (!map->core.overwrite)
+		map->core.prev = map->core.start;
 
 	return event;
 }
 
-static bool perf_mmap__empty(struct perf_mmap *map)
+static bool perf_mmap__empty(struct mmap *map)
 {
-	return perf_mmap__read_head(map) == map->prev && !map->auxtrace_mmap.base;
+	return perf_mmap__read_head(map) == map->core.prev && !map->auxtrace_mmap.base;
 }
 
-void perf_mmap__get(struct perf_mmap *map)
+void perf_mmap__get(struct mmap *map)
 {
-	refcount_inc(&map->refcnt);
+	refcount_inc(&map->core.refcnt);
 }
 
-void perf_mmap__put(struct perf_mmap *map)
+void perf_mmap__put(struct mmap *map)
 {
-	BUG_ON(map->base && refcount_read(&map->refcnt) == 0);
+	BUG_ON(map->core.base && refcount_read(&map->core.refcnt) == 0);
 
-	if (refcount_dec_and_test(&map->refcnt))
+	if (refcount_dec_and_test(&map->core.refcnt))
 		perf_mmap__munmap(map);
 }
 
-void perf_mmap__consume(struct perf_mmap *map)
+void perf_mmap__consume(struct mmap *map)
 {
-	if (!map->overwrite) {
-		u64 old = map->prev;
+	if (!map->core.overwrite) {
+		u64 old = map->core.prev;
 
 		perf_mmap__write_tail(map, old);
 	}
 
-	if (refcount_read(&map->refcnt) == 1 && perf_mmap__empty(map))
+	if (refcount_read(&map->core.refcnt) == 1 && perf_mmap__empty(map))
 		perf_mmap__put(map);
 }
 
@@ -161,13 +162,13 @@
 }
 
 #ifdef HAVE_AIO_SUPPORT
-static int perf_mmap__aio_enabled(struct perf_mmap *map)
+static int perf_mmap__aio_enabled(struct mmap *map)
 {
 	return map->aio.nr_cblocks > 0;
 }
 
 #ifdef HAVE_LIBNUMA_SUPPORT
-static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx)
+static int perf_mmap__aio_alloc(struct mmap *map, int idx)
 {
 	map->aio.data[idx] = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE,
 				  MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
@@ -179,7 +180,7 @@
 	return 0;
 }
 
-static void perf_mmap__aio_free(struct perf_mmap *map, int idx)
+static void perf_mmap__aio_free(struct mmap *map, int idx)
 {
 	if (map->aio.data[idx]) {
 		munmap(map->aio.data[idx], perf_mmap__mmap_len(map));
@@ -187,7 +188,7 @@
 	}
 }
 
-static int perf_mmap__aio_bind(struct perf_mmap *map, int idx, int cpu, int affinity)
+static int perf_mmap__aio_bind(struct mmap *map, int idx, int cpu, int affinity)
 {
 	void *data;
 	size_t mmap_len;
@@ -207,7 +208,7 @@
 	return 0;
 }
 #else /* !HAVE_LIBNUMA_SUPPORT */
-static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx)
+static int perf_mmap__aio_alloc(struct mmap *map, int idx)
 {
 	map->aio.data[idx] = malloc(perf_mmap__mmap_len(map));
 	if (map->aio.data[idx] == NULL)
@@ -216,19 +217,19 @@
 	return 0;
 }
 
-static void perf_mmap__aio_free(struct perf_mmap *map, int idx)
+static void perf_mmap__aio_free(struct mmap *map, int idx)
 {
 	zfree(&(map->aio.data[idx]));
 }
 
-static int perf_mmap__aio_bind(struct perf_mmap *map __maybe_unused, int idx __maybe_unused,
+static int perf_mmap__aio_bind(struct mmap *map __maybe_unused, int idx __maybe_unused,
 		int cpu __maybe_unused, int affinity __maybe_unused)
 {
 	return 0;
 }
 #endif
 
-static int perf_mmap__aio_mmap(struct perf_mmap *map, struct mmap_params *mp)
+static int perf_mmap__aio_mmap(struct mmap *map, struct mmap_params *mp)
 {
 	int delta_max, i, prio, ret;
 
@@ -256,7 +257,7 @@
 				pr_debug2("failed to allocate data buffer area, error %m");
 				return -1;
 			}
-			ret = perf_mmap__aio_bind(map, i, map->cpu, mp->affinity);
+			ret = perf_mmap__aio_bind(map, i, map->core.cpu, mp->affinity);
 			if (ret == -1)
 				return -1;
 			/*
@@ -282,7 +283,7 @@
 	return 0;
 }
 
-static void perf_mmap__aio_munmap(struct perf_mmap *map)
+static void perf_mmap__aio_munmap(struct mmap *map)
 {
 	int i;
 
@@ -294,34 +295,34 @@
 	zfree(&map->aio.aiocb);
 }
 #else /* !HAVE_AIO_SUPPORT */
-static int perf_mmap__aio_enabled(struct perf_mmap *map __maybe_unused)
+static int perf_mmap__aio_enabled(struct mmap *map __maybe_unused)
 {
 	return 0;
 }
 
-static int perf_mmap__aio_mmap(struct perf_mmap *map __maybe_unused,
+static int perf_mmap__aio_mmap(struct mmap *map __maybe_unused,
 			       struct mmap_params *mp __maybe_unused)
 {
 	return 0;
 }
 
-static void perf_mmap__aio_munmap(struct perf_mmap *map __maybe_unused)
+static void perf_mmap__aio_munmap(struct mmap *map __maybe_unused)
 {
 }
 #endif
 
-void perf_mmap__munmap(struct perf_mmap *map)
+void perf_mmap__munmap(struct mmap *map)
 {
 	perf_mmap__aio_munmap(map);
 	if (map->data != NULL) {
 		munmap(map->data, perf_mmap__mmap_len(map));
 		map->data = NULL;
 	}
-	if (map->base != NULL) {
-		munmap(map->base, perf_mmap__mmap_len(map));
-		map->base = NULL;
-		map->fd = -1;
-		refcount_set(&map->refcnt, 0);
+	if (map->core.base != NULL) {
+		munmap(map->core.base, perf_mmap__mmap_len(map));
+		map->core.base = NULL;
+		map->core.fd = -1;
+		refcount_set(&map->core.refcnt, 0);
 	}
 	auxtrace_mmap__munmap(&map->auxtrace_mmap);
 }
@@ -343,16 +344,16 @@
 	}
 }
 
-static void perf_mmap__setup_affinity_mask(struct perf_mmap *map, struct mmap_params *mp)
+static void perf_mmap__setup_affinity_mask(struct mmap *map, struct mmap_params *mp)
 {
 	CPU_ZERO(&map->affinity_mask);
 	if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1)
-		build_node_mask(cpu__get_node(map->cpu), &map->affinity_mask);
+		build_node_mask(cpu__get_node(map->core.cpu), &map->affinity_mask);
 	else if (mp->affinity == PERF_AFFINITY_CPU)
-		CPU_SET(map->cpu, &map->affinity_mask);
+		CPU_SET(map->core.cpu, &map->affinity_mask);
 }
 
-int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int cpu)
+int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu)
 {
 	/*
 	 * The last one will be done at perf_mmap__consume(), so that we
@@ -367,23 +368,23 @@
 	 * evlist layer can't just drop it when filtering events in
 	 * perf_evlist__filter_pollfd().
 	 */
-	refcount_set(&map->refcnt, 2);
-	map->prev = 0;
-	map->mask = mp->mask;
-	map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot,
+	refcount_set(&map->core.refcnt, 2);
+	map->core.prev = 0;
+	map->core.mask = mp->mask;
+	map->core.base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot,
 			 MAP_SHARED, fd, 0);
-	if (map->base == MAP_FAILED) {
+	if (map->core.base == MAP_FAILED) {
 		pr_debug2("failed to mmap perf event ring buffer, error %d\n",
 			  errno);
-		map->base = NULL;
+		map->core.base = NULL;
 		return -1;
 	}
-	map->fd = fd;
-	map->cpu = cpu;
+	map->core.fd = fd;
+	map->core.cpu = cpu;
 
 	perf_mmap__setup_affinity_mask(map, mp);
 
-	map->flush = mp->flush;
+	map->core.flush = mp->flush;
 
 	map->comp_level = mp->comp_level;
 
@@ -399,7 +400,7 @@
 	}
 
 	if (auxtrace_mmap__mmap(&map->auxtrace_mmap,
-				&mp->auxtrace_mp, map->base, fd))
+				&mp->auxtrace_mp, map->core.base, fd))
 		return -1;
 
 	return perf_mmap__aio_mmap(map, mp);
@@ -440,25 +441,25 @@
 /*
  * Report the start and end of the available data in ringbuffer
  */
-static int __perf_mmap__read_init(struct perf_mmap *md)
+static int __perf_mmap__read_init(struct mmap *md)
 {
 	u64 head = perf_mmap__read_head(md);
-	u64 old = md->prev;
-	unsigned char *data = md->base + page_size;
+	u64 old = md->core.prev;
+	unsigned char *data = md->core.base + page_size;
 	unsigned long size;
 
-	md->start = md->overwrite ? head : old;
-	md->end = md->overwrite ? old : head;
+	md->core.start = md->core.overwrite ? head : old;
+	md->core.end = md->core.overwrite ? old : head;
 
-	if ((md->end - md->start) < md->flush)
+	if ((md->core.end - md->core.start) < md->core.flush)
 		return -EAGAIN;
 
-	size = md->end - md->start;
-	if (size > (unsigned long)(md->mask) + 1) {
-		if (!md->overwrite) {
+	size = md->core.end - md->core.start;
+	if (size > (unsigned long)(md->core.mask) + 1) {
+		if (!md->core.overwrite) {
 			WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
 
-			md->prev = head;
+			md->core.prev = head;
 			perf_mmap__consume(md);
 			return -EAGAIN;
 		}
@@ -467,29 +468,29 @@
 		 * Backward ring buffer is full. We still have a chance to read
 		 * most of data from it.
 		 */
-		if (overwrite_rb_find_range(data, md->mask, &md->start, &md->end))
+		if (overwrite_rb_find_range(data, md->core.mask, &md->core.start, &md->core.end))
 			return -EINVAL;
 	}
 
 	return 0;
 }
 
-int perf_mmap__read_init(struct perf_mmap *map)
+int perf_mmap__read_init(struct mmap *map)
 {
 	/*
 	 * Check if event was unmapped due to a POLLHUP/POLLERR.
 	 */
-	if (!refcount_read(&map->refcnt))
+	if (!refcount_read(&map->core.refcnt))
 		return -ENOENT;
 
 	return __perf_mmap__read_init(map);
 }
 
-int perf_mmap__push(struct perf_mmap *md, void *to,
-		    int push(struct perf_mmap *map, void *to, void *buf, size_t size))
+int perf_mmap__push(struct mmap *md, void *to,
+		    int push(struct mmap *map, void *to, void *buf, size_t size))
 {
 	u64 head = perf_mmap__read_head(md);
-	unsigned char *data = md->base + page_size;
+	unsigned char *data = md->core.base + page_size;
 	unsigned long size;
 	void *buf;
 	int rc = 0;
@@ -498,12 +499,12 @@
 	if (rc < 0)
 		return (rc == -EAGAIN) ? 1 : -1;
 
-	size = md->end - md->start;
+	size = md->core.end - md->core.start;
 
-	if ((md->start & md->mask) + size != (md->end & md->mask)) {
-		buf = &data[md->start & md->mask];
-		size = md->mask + 1 - (md->start & md->mask);
-		md->start += size;
+	if ((md->core.start & md->core.mask) + size != (md->core.end & md->core.mask)) {
+		buf = &data[md->core.start & md->core.mask];
+		size = md->core.mask + 1 - (md->core.start & md->core.mask);
+		md->core.start += size;
 
 		if (push(md, to, buf, size) < 0) {
 			rc = -1;
@@ -511,16 +512,16 @@
 		}
 	}
 
-	buf = &data[md->start & md->mask];
-	size = md->end - md->start;
-	md->start += size;
+	buf = &data[md->core.start & md->core.mask];
+	size = md->core.end - md->core.start;
+	md->core.start += size;
 
 	if (push(md, to, buf, size) < 0) {
 		rc = -1;
 		goto out;
 	}
 
-	md->prev = head;
+	md->core.prev = head;
 	perf_mmap__consume(md);
 out:
 	return rc;
@@ -529,16 +530,16 @@
 /*
  * Mandatory for overwrite mode
  * The direction of overwrite mode is backward.
- * The last perf_mmap__read() will set tail to map->prev.
- * Need to correct the map->prev to head which is the end of next read.
+ * The last perf_mmap__read() will set tail to map->core.prev.
+ * Need to correct the map->core.prev to head which is the end of next read.
  */
-void perf_mmap__read_done(struct perf_mmap *map)
+void perf_mmap__read_done(struct mmap *map)
 {
 	/*
 	 * Check if event was unmapped due to a POLLHUP/POLLERR.
 	 */
-	if (!refcount_read(&map->refcnt))
+	if (!refcount_read(&map->core.refcnt))
 		return;
 
-	map->prev = perf_mmap__read_head(map);
+	map->core.prev = perf_mmap__read_head(map);
 }
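Editor's note: most of the mmap.c churn is mechanical, map->X becoming map->core.X as the generic fields migrate into libperf's struct perf_mmap, but the wrap handling it touches deserves a restatement: the buffer size is a power of two, so pos & mask locates bytes, and a record straddling the top of the buffer is reassembled piecewise into event_copy. A hedged sketch of that copy loop, with invented names:

#include <stdio.h>
#include <string.h>

/* Invented miniature: a power-of-two ring, so size - 1 works as a mask. */
struct mini_ring {
	unsigned char *data;
	size_t mask;		/* size - 1 */
};

/* Copy len bytes from logical offset, reassembling across the wrap point
 * much like perf_mmap__read() fills event_copy for straddling records. */
static void ring_copy(const struct mini_ring *r, size_t offset,
		      void *dst, size_t len)
{
	unsigned char *out = dst;

	while (len) {
		size_t chunk = r->mask + 1 - (offset & r->mask);

		if (chunk > len)
			chunk = len;
		memcpy(out, &r->data[offset & r->mask], chunk);
		offset += chunk;
		out    += chunk;
		len    -= chunk;
	}
}

int main(void)
{
	unsigned char buf[8] = "ABCDEFGH";
	struct mini_ring r = { .data = buf, .mask = sizeof(buf) - 1 };
	char out[5] = { 0 };

	ring_copy(&r, 6, out, 4);	/* wraps: G, H, then A, B */
	printf("%s\n", out);		/* prints GHAB */
	return 0;
}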
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index 3857a49..e567c1c 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -1,6 +1,7 @@
 #ifndef __PERF_MMAP_H
 #define __PERF_MMAP_H 1
 
+#include <internal/mmap.h>
 #include <linux/compiler.h>
 #include <linux/refcount.h>
 #include <linux/types.h>
@@ -15,22 +16,13 @@
 
 struct aiocb;
 /**
- * struct perf_mmap - perf's ring buffer mmap details
+ * struct mmap - perf's ring buffer mmap details
  *
  * @refcnt - e.g. code using PERF_EVENT_IOC_SET_OUTPUT to share this
  */
-struct perf_mmap {
-	void		 *base;
-	int		 mask;
-	int		 fd;
-	int		 cpu;
-	refcount_t	 refcnt;
-	u64		 prev;
-	u64		 start;
-	u64		 end;
-	bool		 overwrite;
+struct mmap {
+	struct perf_mmap	core;
 	struct auxtrace_mmap auxtrace_mmap;
-	char		 event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8);
 #ifdef HAVE_AIO_SUPPORT
 	struct {
 		void		 **data;
@@ -40,71 +32,42 @@
 	} aio;
 #endif
 	cpu_set_t	affinity_mask;
-	u64		flush;
 	void		*data;
 	int		comp_level;
 };
 
-/*
- * State machine of bkw_mmap_state:
- *
- *                     .________________(forbid)_____________.
- *                     |                                     V
- * NOTREADY --(0)--> RUNNING --(1)--> DATA_PENDING --(2)--> EMPTY
- *                     ^  ^              |   ^               |
- *                     |  |__(forbid)____/   |___(forbid)___/|
- *                     |                                     |
- *                      \_________________(3)_______________/
- *
- * NOTREADY     : Backward ring buffers are not ready
- * RUNNING      : Backward ring buffers are recording
- * DATA_PENDING : We are required to collect data from backward ring buffers
- * EMPTY        : We have collected data from backward ring buffers.
- *
- * (0): Setup backward ring buffer
- * (1): Pause ring buffers for reading
- * (2): Read from ring buffers
- * (3): Resume ring buffers for recording
- */
-enum bkw_mmap_state {
-	BKW_MMAP_NOTREADY,
-	BKW_MMAP_RUNNING,
-	BKW_MMAP_DATA_PENDING,
-	BKW_MMAP_EMPTY,
-};
-
 struct mmap_params {
 	int prot, mask, nr_cblocks, affinity, flush, comp_level;
 	struct auxtrace_mmap_params auxtrace_mp;
 };
 
-int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int cpu);
-void perf_mmap__munmap(struct perf_mmap *map);
+int perf_mmap__mmap(struct mmap *map, struct mmap_params *mp, int fd, int cpu);
+void perf_mmap__munmap(struct mmap *map);
 
-void perf_mmap__get(struct perf_mmap *map);
-void perf_mmap__put(struct perf_mmap *map);
+void perf_mmap__get(struct mmap *map);
+void perf_mmap__put(struct mmap *map);
 
-void perf_mmap__consume(struct perf_mmap *map);
+void perf_mmap__consume(struct mmap *map);
 
-static inline u64 perf_mmap__read_head(struct perf_mmap *mm)
+static inline u64 perf_mmap__read_head(struct mmap *mm)
 {
-	return ring_buffer_read_head(mm->base);
+	return ring_buffer_read_head(mm->core.base);
 }
 
-static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail)
+static inline void perf_mmap__write_tail(struct mmap *md, u64 tail)
 {
-	ring_buffer_write_tail(md->base, tail);
+	ring_buffer_write_tail(md->core.base, tail);
 }
 
-union perf_event *perf_mmap__read_forward(struct perf_mmap *map);
+union perf_event *perf_mmap__read_forward(struct mmap *map);
 
-union perf_event *perf_mmap__read_event(struct perf_mmap *map);
+union perf_event *perf_mmap__read_event(struct mmap *map);
 
-int perf_mmap__push(struct perf_mmap *md, void *to,
-		    int push(struct perf_mmap *map, void *to, void *buf, size_t size));
+int perf_mmap__push(struct mmap *md, void *to,
+		    int push(struct mmap *map, void *to, void *buf, size_t size));
 
-size_t perf_mmap__mmap_len(struct perf_mmap *map);
+size_t perf_mmap__mmap_len(struct mmap *map);
 
-int perf_mmap__read_init(struct perf_mmap *md);
-void perf_mmap__read_done(struct perf_mmap *map);
+int perf_mmap__read_init(struct mmap *md);
+void perf_mmap__read_done(struct mmap *map);
 #endif /*__PERF_MMAP_H */
diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c
index 99be15d..285d6f3 100644
--- a/tools/perf/util/namespaces.c
+++ b/tools/perf/util/namespaces.c
@@ -17,8 +17,26 @@
 #include <string.h>
 #include <unistd.h>
 #include <asm/bug.h>
+#include <linux/kernel.h>
 #include <linux/zalloc.h>
 
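+/*
+ * Namespace names indexed by the *_NS_INDEX constants from namespaces.h;
+ * perf_ns__name() below falls back to "UNKNOWN" for out-of-range ids.
+ */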
+static const char *perf_ns__names[] = {
+	[NET_NS_INDEX]		= "net",
+	[UTS_NS_INDEX]		= "uts",
+	[IPC_NS_INDEX]		= "ipc",
+	[PID_NS_INDEX]		= "pid",
+	[USER_NS_INDEX]		= "user",
+	[MNT_NS_INDEX]		= "mnt",
+	[CGROUP_NS_INDEX]	= "cgroup",
+};
+
+const char *perf_ns__name(unsigned int id)
+{
+	if (id >= ARRAY_SIZE(perf_ns__names))
+		return "UNKNOWN";
+	return perf_ns__names[id];
+}
+
 struct namespaces *namespaces__new(struct perf_record_namespaces *event)
 {
 	struct namespaces *namespaces;
diff --git a/tools/perf/util/namespaces.h b/tools/perf/util/namespaces.h
index 40edef5..4b33f68 100644
--- a/tools/perf/util/namespaces.h
+++ b/tools/perf/util/namespaces.h
@@ -66,4 +66,6 @@
 
 #define nsinfo__zput(nsi) __nsinfo__zput(&nsi)
 
+const char *perf_ns__name(unsigned int id);
+
 #endif  /* __PERF_NAMESPACES_H */
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 5ec21d2..b5e2ade 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -30,11 +30,12 @@
 #include "parse-events-flex.h"
 #include "pmu.h"
 #include "thread_map.h"
-#include "cpumap.h"
 #include "probe-file.h"
 #include "asm/bug.h"
 #include "util/parse-branch-options.h"
 #include "metricgroup.h"
+#include "util/evsel_config.h"
+#include "util/event.h"
 
 #define MAX_NAME_LEN 100
 
@@ -335,7 +336,7 @@
 	(*idx)++;
 	evsel->core.cpus   = perf_cpu_map__get(cpus);
 	evsel->core.own_cpus = perf_cpu_map__get(cpus);
-	evsel->system_wide = pmu ? pmu->is_uncore : false;
+	evsel->core.system_wide = pmu ? pmu->is_uncore : false;
 	evsel->auto_merge_stats = auto_merge_stats;
 
 	if (name)
@@ -1936,7 +1937,7 @@
 
 		perf_evlist__splice_list_tail(evlist, &parse_state.list);
 		evlist->nr_groups += parse_state.nr_groups;
-		last = perf_evlist__last(evlist);
+		last = evlist__last(evlist);
 		last->cmdline_group_boundary = true;
 
 		return 0;
@@ -2050,7 +2051,7 @@
 	 * So no need to WARN here, let *func do this.
 	 */
 	if (evlist->core.nr_entries > 0)
-		last = perf_evlist__last(evlist);
+		last = evlist__last(evlist);
 
 	do {
 		err = (*func)(last, arg);
diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y
index f1c36ed..48126ae 100644
--- a/tools/perf/util/parse-events.y
+++ b/tools/perf/util/parse-events.y
@@ -9,13 +9,11 @@
 #define YYDEBUG 1
 
 #include <fnmatch.h>
+#include <stdio.h>
 #include <linux/compiler.h>
-#include <linux/list.h>
 #include <linux/types.h>
-#include "util.h"
 #include "pmu.h"
 #include "evsel.h"
-#include "debug.h"
 #include "parse-events.h"
 #include "parse-events-bison.h"
 
diff --git a/tools/perf/util/perf-hooks.c b/tools/perf/util/perf-hooks.c
index e635c59..7a0ab35 100644
--- a/tools/perf/util/perf-hooks.c
+++ b/tools/perf/util/perf-hooks.c
@@ -12,7 +12,6 @@
 #include <setjmp.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
-#include "util/util.h"
 #include "util/debug.h"
 #include "util/perf-hooks.h"
 
diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c
new file mode 100644
index 0000000..d4ad3f0
--- /dev/null
+++ b/tools/perf/util/perf_event_attr_fprintf.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/perf_event.h>
+#include "util/evsel_fprintf.h"
+
+struct bit_names {
+	int bit;
+	const char *name;
+};
+
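+/*
+ * Render the bits set in 'value' as a '|'-separated list of names from the
+ * NULL-terminated 'bits' table, e.g. "IP|TID|TIME" for a sample_type mask.
+ */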
+static void __p_bits(char *buf, size_t size, u64 value, struct bit_names *bits)
+{
+	bool first_bit = true;
+	int i = 0;
+
+	do {
+		if (value & bits[i].bit) {
+			buf += scnprintf(buf, size, "%s%s", first_bit ? "" : "|", bits[i].name);
+			first_bit = false;
+		}
+	} while (bits[++i].name != NULL);
+}
+
+static void __p_sample_type(char *buf, size_t size, u64 value)
+{
+#define bit_name(n) { PERF_SAMPLE_##n, #n }
+	struct bit_names bits[] = {
+		bit_name(IP), bit_name(TID), bit_name(TIME), bit_name(ADDR),
+		bit_name(READ), bit_name(CALLCHAIN), bit_name(ID), bit_name(CPU),
+		bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW),
+		bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER),
+		bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC),
+		bit_name(WEIGHT), bit_name(PHYS_ADDR),
+		{ .name = NULL, }
+	};
+#undef bit_name
+	__p_bits(buf, size, value, bits);
+}
+
+static void __p_branch_sample_type(char *buf, size_t size, u64 value)
+{
+#define bit_name(n) { PERF_SAMPLE_BRANCH_##n, #n }
+	struct bit_names bits[] = {
+		bit_name(USER), bit_name(KERNEL), bit_name(HV), bit_name(ANY),
+		bit_name(ANY_CALL), bit_name(ANY_RETURN), bit_name(IND_CALL),
+		bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX),
+		bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP),
+		bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES),
+		{ .name = NULL, }
+	};
+#undef bit_name
+	__p_bits(buf, size, value, bits);
+}
+
+static void __p_read_format(char *buf, size_t size, u64 value)
+{
+#define bit_name(n) { PERF_FORMAT_##n, #n }
+	struct bit_names bits[] = {
+		bit_name(TOTAL_TIME_ENABLED), bit_name(TOTAL_TIME_RUNNING),
+		bit_name(ID), bit_name(GROUP),
+		{ .name = NULL, }
+	};
+#undef bit_name
+	__p_bits(buf, size, value, bits);
+}
+
+#define BUF_SIZE		1024
+
+#define p_hex(val)		snprintf(buf, BUF_SIZE, "%#"PRIx64, (uint64_t)(val))
+#define p_unsigned(val)		snprintf(buf, BUF_SIZE, "%"PRIu64, (uint64_t)(val))
+#define p_signed(val)		snprintf(buf, BUF_SIZE, "%"PRId64, (int64_t)(val))
+#define p_sample_type(val)	__p_sample_type(buf, BUF_SIZE, val)
+#define p_branch_sample_type(val) __p_branch_sample_type(buf, BUF_SIZE, val)
+#define p_read_format(val)	__p_read_format(buf, BUF_SIZE, val)
+
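+/*
+ * Print an attr field only when it is non-zero, keeping the dump compact:
+ * _n is the printed label, _f the struct member and _p one of the p_*()
+ * formatters above, which render the value into 'buf'.
+ */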
+#define PRINT_ATTRn(_n, _f, _p)				\
+do {							\
+	if (attr->_f) {					\
+		_p(attr->_f);				\
+		ret += attr__fprintf(fp, _n, buf, priv);\
+	}						\
+} while (0)
+
+#define PRINT_ATTRf(_f, _p)	PRINT_ATTRn(#_f, _f, _p)
+
+int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
+			     attr__fprintf_f attr__fprintf, void *priv)
+{
+	char buf[BUF_SIZE];
+	int ret = 0;
+
+	PRINT_ATTRf(type, p_unsigned);
+	PRINT_ATTRf(size, p_unsigned);
+	PRINT_ATTRf(config, p_hex);
+	PRINT_ATTRn("{ sample_period, sample_freq }", sample_period, p_unsigned);
+	PRINT_ATTRf(sample_type, p_sample_type);
+	PRINT_ATTRf(read_format, p_read_format);
+
+	PRINT_ATTRf(disabled, p_unsigned);
+	PRINT_ATTRf(inherit, p_unsigned);
+	PRINT_ATTRf(pinned, p_unsigned);
+	PRINT_ATTRf(exclusive, p_unsigned);
+	PRINT_ATTRf(exclude_user, p_unsigned);
+	PRINT_ATTRf(exclude_kernel, p_unsigned);
+	PRINT_ATTRf(exclude_hv, p_unsigned);
+	PRINT_ATTRf(exclude_idle, p_unsigned);
+	PRINT_ATTRf(mmap, p_unsigned);
+	PRINT_ATTRf(comm, p_unsigned);
+	PRINT_ATTRf(freq, p_unsigned);
+	PRINT_ATTRf(inherit_stat, p_unsigned);
+	PRINT_ATTRf(enable_on_exec, p_unsigned);
+	PRINT_ATTRf(task, p_unsigned);
+	PRINT_ATTRf(watermark, p_unsigned);
+	PRINT_ATTRf(precise_ip, p_unsigned);
+	PRINT_ATTRf(mmap_data, p_unsigned);
+	PRINT_ATTRf(sample_id_all, p_unsigned);
+	PRINT_ATTRf(exclude_host, p_unsigned);
+	PRINT_ATTRf(exclude_guest, p_unsigned);
+	PRINT_ATTRf(exclude_callchain_kernel, p_unsigned);
+	PRINT_ATTRf(exclude_callchain_user, p_unsigned);
+	PRINT_ATTRf(mmap2, p_unsigned);
+	PRINT_ATTRf(comm_exec, p_unsigned);
+	PRINT_ATTRf(use_clockid, p_unsigned);
+	PRINT_ATTRf(context_switch, p_unsigned);
+	PRINT_ATTRf(write_backward, p_unsigned);
+	PRINT_ATTRf(namespaces, p_unsigned);
+	PRINT_ATTRf(ksymbol, p_unsigned);
+	PRINT_ATTRf(bpf_event, p_unsigned);
+	PRINT_ATTRf(aux_output, p_unsigned);
+
+	PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned);
+	PRINT_ATTRf(bp_type, p_unsigned);
+	PRINT_ATTRn("{ bp_addr, config1 }", bp_addr, p_hex);
+	PRINT_ATTRn("{ bp_len, config2 }", bp_len, p_hex);
+	PRINT_ATTRf(branch_sample_type, p_branch_sample_type);
+	PRINT_ATTRf(sample_regs_user, p_hex);
+	PRINT_ATTRf(sample_stack_user, p_unsigned);
+	PRINT_ATTRf(clockid, p_signed);
+	PRINT_ATTRf(sample_regs_intr, p_hex);
+	PRINT_ATTRf(aux_watermark, p_unsigned);
+	PRINT_ATTRf(sample_max_stack, p_unsigned);
+
+	return ret;
+}
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index fb597fa..5608da8 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -20,7 +20,6 @@
 #include "debug.h"
 #include "pmu.h"
 #include "parse-events.h"
-#include "cpumap.h"
 #include "header.h"
 #include "pmu-events/pmu-events.h"
 #include "string2.h"
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index b8e0967..91cab5f 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -2331,6 +2331,7 @@
 		}
 	}
 	zfree(&tev->args);
+	tev->nargs = 0;
 }
 
 struct kprobe_blacklist_node {
diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c
index d13db55a..b659466 100644
--- a/tools/perf/util/probe-file.c
+++ b/tools/perf/util/probe-file.c
@@ -16,6 +16,7 @@
 #include "strlist.h"
 #include "strfilter.h"
 #include "debug.h"
+#include "build-id.h"
 #include "dso.h"
 #include "color.h"
 #include "symbol.h"
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 505905f..cd9f95e 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1245,6 +1245,17 @@
 	return n;
 }
 
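+/* Check whether the probe address was already added to the trace event list. */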
+static bool trace_event_finder_overlap(struct trace_event_finder *tf)
+{
+	int i;
+
+	for (i = 0; i < tf->ntevs; i++) {
+		if (tf->pf.addr == tf->tevs[i].point.address)
+			return true;
+	}
+	return false;
+}
+
 /* Add a found probe point into trace event list */
 static int add_probe_trace_event(Dwarf_Die *sc_die, struct probe_finder *pf)
 {
@@ -1255,6 +1266,14 @@
 	struct perf_probe_arg *args = NULL;
 	int ret, i;
 
+	/*
+	 * For some reason (e.g. a different column assigned to the same
+	 * address), this callback can be called with an address that has
+	 * already been handled. Ignore such duplicates.
+	 */
+	if (trace_event_finder_overlap(tf))
+		return 0;
+
 	/* Check number of tevs */
 	if (tf->ntevs == tf->max_tevs) {
 		pr_warning("Too many( > %d) probe point found.\n",
diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources
index c6dd478..9af1838 100644
--- a/tools/perf/util/python-ext-sources
+++ b/tools/perf/util/python-ext-sources
@@ -10,6 +10,7 @@
 util/cap.c
 util/evlist.c
 util/evsel.c
+util/perf_event_attr_fprintf.c
 util/cpumap.c
 util/memswap.c
 util/mmap.c
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index 07ca453..53f3105 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -6,17 +6,15 @@
 #include <linux/err.h>
 #include <perf/cpumap.h>
 #include <traceevent/event-parse.h>
-#include "debug.h"
 #include "evlist.h"
 #include "callchain.h"
 #include "evsel.h"
 #include "event.h"
-#include "cpumap.h"
 #include "print_binary.h"
 #include "thread_map.h"
 #include "trace-event.h"
 #include "mmap.h"
-#include "util.h"
+#include <internal/lib.h>
 #include "../perf-sys.h"
 
 #if PY_MAJOR_VERSION < 3
@@ -61,6 +59,8 @@
  */
 int verbose;
 
+int eprintf(int level, int var, const char *fmt, ...);
+
 int eprintf(int level, int var, const char *fmt, ...)
 {
 	va_list args;
@@ -884,7 +884,7 @@
 
 static void pyrf_evlist__delete(struct pyrf_evlist *pevlist)
 {
-	perf_evlist__exit(&pevlist->evlist);
+	evlist__exit(&pevlist->evlist);
 	Py_TYPE(pevlist)->tp_free((PyObject*)pevlist);
 }
 
@@ -899,7 +899,7 @@
 					 &pages, &overwrite))
 		return NULL;
 
-	if (perf_evlist__mmap(evlist, pages) < 0) {
+	if (evlist__mmap(evlist, pages) < 0) {
 		PyErr_SetFromErrno(PyExc_OSError);
 		return NULL;
 	}
@@ -918,7 +918,7 @@
 	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|i", kwlist, &timeout))
 		return NULL;
 
-	n = perf_evlist__poll(evlist, timeout);
+	n = evlist__poll(evlist, timeout);
 	if (n < 0) {
 		PyErr_SetFromErrno(PyExc_OSError);
 		return NULL;
@@ -935,17 +935,17 @@
         PyObject *list = PyList_New(0);
 	int i;
 
-	for (i = 0; i < evlist->pollfd.nr; ++i) {
+	for (i = 0; i < evlist->core.pollfd.nr; ++i) {
 		PyObject *file;
 #if PY_MAJOR_VERSION < 3
-		FILE *fp = fdopen(evlist->pollfd.entries[i].fd, "r");
+		FILE *fp = fdopen(evlist->core.pollfd.entries[i].fd, "r");
 
 		if (fp == NULL)
 			goto free_list;
 
 		file = PyFile_FromFile(fp, "perf", "r", NULL);
 #else
-		file = PyFile_FromFd(evlist->pollfd.entries[i].fd, "perf", "r", -1,
+		file = PyFile_FromFd(evlist->core.pollfd.entries[i].fd, "perf", "r", -1,
 				     NULL, NULL, NULL, 0);
 #endif
 		if (file == NULL)
@@ -984,14 +984,14 @@
 	return Py_BuildValue("i", evlist->core.nr_entries);
 }
 
-static struct perf_mmap *get_md(struct evlist *evlist, int cpu)
+static struct mmap *get_md(struct evlist *evlist, int cpu)
 {
 	int i;
 
-	for (i = 0; i < evlist->nr_mmaps; i++) {
-		struct perf_mmap *md = &evlist->mmap[i];
+	for (i = 0; i < evlist->core.nr_mmaps; i++) {
+		struct mmap *md = &evlist->mmap[i];
 
-		if (md->cpu == cpu)
+		if (md->core.cpu == cpu)
 			return md;
 	}
 
@@ -1005,7 +1005,7 @@
 	union perf_event *event;
 	int sample_id_all = 1, cpu;
 	static char *kwlist[] = { "cpu", "sample_id_all", NULL };
-	struct perf_mmap *md;
+	struct mmap *md;
 	int err;
 
 	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|i", kwlist,
diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c
index 286fe81..8579505 100644
--- a/tools/perf/util/record.c
+++ b/tools/perf/util/record.c
@@ -2,7 +2,6 @@
 #include "debug.h"
 #include "evlist.h"
 #include "evsel.h"
-#include "cpumap.h"
 #include "parse-events.h"
 #include <errno.h>
 #include <limits.h>
@@ -10,7 +9,6 @@
 #include <api/fs/fs.h>
 #include <subcmd/parse-options.h>
 #include <perf/cpumap.h>
-#include "util.h"
 #include "cloexec.h"
 #include "record.h"
 #include "../perf-sys.h"
@@ -32,7 +30,7 @@
 	if (parse_events(evlist, str, NULL))
 		goto out_delete;
 
-	evsel = perf_evlist__first(evlist);
+	evsel = evlist__first(evlist);
 
 	while (1) {
 		fd = sys_perf_event_open(&evsel->core.attr, pid, cpu, -1, flags);
@@ -173,7 +171,7 @@
 		use_sample_identifier = perf_can_sample_identifier();
 		sample_id = true;
 	} else if (evlist->core.nr_entries > 1) {
-		struct evsel *first = perf_evlist__first(evlist);
+		struct evsel *first = evlist__first(evlist);
 
 		evlist__for_each_entry(evlist, evsel) {
 			if (evsel->core.attr.sample_type == first->core.attr.sample_type)
@@ -278,7 +276,7 @@
 	if (err)
 		goto out_delete;
 
-	evsel = perf_evlist__last(temp_evlist);
+	evsel = evlist__last(temp_evlist);
 
 	if (!evlist || perf_cpu_map__empty(evlist->core.cpus)) {
 		struct perf_cpu_map *cpus = perf_cpu_map__new(NULL);
diff --git a/tools/perf/util/rwsem.c b/tools/perf/util/rwsem.c
index 5e52e7b..f3d29d8 100644
--- a/tools/perf/util/rwsem.c
+++ b/tools/perf/util/rwsem.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 #include "util.h"
 #include "rwsem.h"
 
diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c
index 24a9990..6785cd8 100644
--- a/tools/perf/util/s390-cpumsf.c
+++ b/tools/perf/util/s390-cpumsf.c
@@ -151,7 +151,6 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 
-#include "cpumap.h"
 #include "color.h"
 #include "evsel.h"
 #include "evlist.h"
diff --git a/tools/perf/util/s390-sample-raw.c b/tools/perf/util/s390-sample-raw.c
index 4d9593e..05b43ab 100644
--- a/tools/perf/util/s390-sample-raw.c
+++ b/tools/perf/util/s390-sample-raw.c
@@ -22,7 +22,6 @@
 #include <asm/byteorder.h>
 
 #include "debug.h"
-#include "util.h"
 #include "session.h"
 #include "evlist.h"
 #include "color.h"
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 666a56e..5d341ef 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -37,7 +37,6 @@
 #include "../dso.h"
 #include "../callchain.h"
 #include "../evsel.h"
-#include "../util.h"
 #include "../event.h"
 #include "../thread.h"
 #include "../comm.h"
@@ -49,7 +48,6 @@
 #include "map.h"
 #include "symbol.h"
 #include "thread_map.h"
-#include "cpumap.h"
 #include "print_binary.h"
 #include "stat.h"
 #include "mem-events.h"
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index e9e4a04..061bb4d 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -22,7 +22,6 @@
 #include "symbol.h"
 #include "session.h"
 #include "tool.h"
-#include "cpumap.h"
 #include "perf_regs.h"
 #include "asm/bug.h"
 #include "auxtrace.h"
@@ -30,10 +29,11 @@
 #include "thread-stack.h"
 #include "sample-raw.h"
 #include "stat.h"
-#include "util.h"
 #include "ui/progress.h"
 #include "../perf.h"
 #include "arch/common.h"
+#include <internal/lib.h>
+#include <linux/err.h>
 
 #ifdef HAVE_ZSTD_SUPPORT
 static int perf_session__process_compressed_event(struct perf_session *session,
@@ -187,6 +187,7 @@
 struct perf_session *perf_session__new(struct perf_data *data,
 				       bool repipe, struct perf_tool *tool)
 {
+	int ret = -ENOMEM;
 	struct perf_session *session = zalloc(sizeof(*session));
 
 	if (!session)
@@ -201,13 +202,15 @@
 
 	perf_env__init(&session->header.env);
 	if (data) {
-		if (perf_data__open(data))
+		ret = perf_data__open(data);
+		if (ret < 0)
 			goto out_delete;
 
 		session->data = data;
 
 		if (perf_data__is_read(data)) {
-			if (perf_session__open(session) < 0)
+			ret = perf_session__open(session);
+			if (ret < 0)
 				goto out_delete;
 
 			/*
@@ -222,8 +225,11 @@
 			perf_evlist__init_trace_event_sample_raw(session->evlist);
 
 			/* Open the directory data. */
-			if (data->is_dir && perf_data__open_dir(data))
+			if (data->is_dir) {
+				ret = perf_data__open_dir(data);
+			if (ret)
 				goto out_delete;
+			}
 		}
 	} else  {
 		session->machines.host.env = &perf_env;
@@ -256,7 +262,7 @@
  out_delete:
 	perf_session__delete(session);
  out:
-	return NULL;
+	return ERR_PTR(ret);
 }
 
 static void perf_session__delete_threads(struct perf_session *session)
@@ -1317,6 +1323,7 @@
 				struct machine *machine)
 {
 	struct perf_sample_id *sid = perf_evlist__id2sid(evlist, v->id);
+	struct evsel *evsel;
 
 	if (sid) {
 		sample->id     = v->id;
@@ -1336,7 +1343,8 @@
 	if (!sample->period)
 		return 0;
 
-	return tool->sample(tool, event, sample, sid->evsel, machine);
+	evsel = container_of(sid->evsel, struct evsel, core);
+	return tool->sample(tool, event, sample, evsel, machine);
 }
 
 static int deliver_sample_group(struct evlist *evlist,
@@ -2412,73 +2420,3 @@
 	}
 	return 0;
 }
-
-int perf_event__synthesize_id_index(struct perf_tool *tool,
-				    perf_event__handler_t process,
-				    struct evlist *evlist,
-				    struct machine *machine)
-{
-	union perf_event *ev;
-	struct evsel *evsel;
-	size_t nr = 0, i = 0, sz, max_nr, n;
-	int err;
-
-	pr_debug2("Synthesizing id index\n");
-
-	max_nr = (UINT16_MAX - sizeof(struct perf_record_id_index)) /
-		 sizeof(struct id_index_entry);
-
-	evlist__for_each_entry(evlist, evsel)
-		nr += evsel->ids;
-
-	n = nr > max_nr ? max_nr : nr;
-	sz = sizeof(struct perf_record_id_index) + n * sizeof(struct id_index_entry);
-	ev = zalloc(sz);
-	if (!ev)
-		return -ENOMEM;
-
-	ev->id_index.header.type = PERF_RECORD_ID_INDEX;
-	ev->id_index.header.size = sz;
-	ev->id_index.nr = n;
-
-	evlist__for_each_entry(evlist, evsel) {
-		u32 j;
-
-		for (j = 0; j < evsel->ids; j++) {
-			struct id_index_entry *e;
-			struct perf_sample_id *sid;
-
-			if (i >= n) {
-				err = process(tool, ev, NULL, machine);
-				if (err)
-					goto out_err;
-				nr -= n;
-				i = 0;
-			}
-
-			e = &ev->id_index.entries[i++];
-
-			e->id = evsel->id[j];
-
-			sid = perf_evlist__id2sid(evlist, e->id);
-			if (!sid) {
-				free(ev);
-				return -ENOENT;
-			}
-
-			e->idx = sid->idx;
-			e->cpu = sid->cpu;
-			e->tid = sid->tid;
-		}
-	}
-
-	sz = sizeof(struct perf_record_id_index) + nr * sizeof(struct id_index_entry);
-	ev->id_index.header.size = sz;
-	ev->id_index.nr = nr;
-
-	err = process(tool, ev, NULL, machine);
-out_err:
-	free(ev);
-
-	return err;
-}
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index b7aa076..b4c9428 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -138,9 +138,4 @@
 int perf_event__process_id_index(struct perf_session *session,
 				 union perf_event *event);
 
-int perf_event__synthesize_id_index(struct perf_tool *tool,
-				    perf_event__handler_t process,
-				    struct evlist *evlist,
-				    struct machine *machine);
-
 #endif /* __PERF_SESSION_H */
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index a2308eb..43d1d41 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -2329,7 +2329,7 @@
 		if (nr > evlist->core.nr_entries)
 			return NULL;
 
-		evsel = perf_evlist__first(evlist);
+		evsel = evlist__first(evlist);
 		while (--nr > 0)
 			evsel = perf_evsel__next(evsel);
 
diff --git a/tools/perf/util/srccode.c b/tools/perf/util/srccode.c
index adfcf1f..d84ed8b 100644
--- a/tools/perf/util/srccode.c
+++ b/tools/perf/util/srccode.c
@@ -15,7 +15,7 @@
 #include <string.h>
 #include "srccode.h"
 #include "debug.h"
-#include "util.h"
+#include <internal/lib.h> // page_size
 
 #define MAXSRCCACHE (32*1024*1024)
 #define MAXSRCFILES     64
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 70c87fd..2c41d47 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -738,6 +738,8 @@
 	char *n, *pn;
 
 	expr__ctx_init(&pctx);
+	/* Must be first id entry */
+	expr__add_id(&pctx, name, avg);
 	for (i = 0; metric_events[i]; i++) {
 		struct saved_value *v;
 		struct stats *stats;
@@ -776,8 +778,6 @@
 			expr__add_id(&pctx, n, avg_stats(stats)*scale);
 	}
 
-	expr__add_id(&pctx, name, avg);
-
 	if (!metric_events[i]) {
 		const char *p = metric_expr;
 
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 8f1ea27..ebdd130 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -4,6 +4,7 @@
 #include <math.h>
 #include <string.h>
 #include "counts.h"
+#include "cpumap.h"
 #include "debug.h"
 #include "header.h"
 #include "stat.h"
@@ -161,6 +162,15 @@
 	evsel->prev_raw_counts = NULL;
 }
 
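+/* Zero the cached aggregate counts so a repeated run starts from scratch. */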
+static void perf_evsel__reset_prev_raw_counts(struct evsel *evsel)
+{
+	if (evsel->prev_raw_counts) {
+		evsel->prev_raw_counts->aggr.val = 0;
+		evsel->prev_raw_counts->aggr.ena = 0;
+		evsel->prev_raw_counts->aggr.run = 0;
+	}
+}
+
 static int perf_evsel__alloc_stats(struct evsel *evsel, bool alloc_raw)
 {
 	int ncpus = perf_evsel__nr_cpus(evsel);
@@ -211,6 +221,14 @@
 	}
 }
 
+void perf_evlist__reset_prev_raw_counts(struct evlist *evlist)
+{
+	struct evsel *evsel;
+
+	evlist__for_each_entry(evlist, evsel)
+		perf_evsel__reset_prev_raw_counts(evsel);
+}
+
 static void zero_per_pkg(struct evsel *counter)
 {
 	if (counter->per_pkg_mask)
@@ -318,7 +336,7 @@
 	int ncpus = perf_evsel__nr_cpus(counter);
 	int cpu, thread;
 
-	if (counter->system_wide)
+	if (counter->core.system_wide)
 		nthreads = 1;
 
 	for (thread = 0; thread < nthreads; thread++) {
@@ -493,45 +511,3 @@
 
 	return perf_evsel__open_per_thread(evsel, evsel->core.threads);
 }
-
-int perf_stat_synthesize_config(struct perf_stat_config *config,
-				struct perf_tool *tool,
-				struct evlist *evlist,
-				perf_event__handler_t process,
-				bool attrs)
-{
-	int err;
-
-	if (attrs) {
-		err = perf_event__synthesize_attrs(tool, evlist, process);
-		if (err < 0) {
-			pr_err("Couldn't synthesize attrs.\n");
-			return err;
-		}
-	}
-
-	err = perf_event__synthesize_extra_attr(tool, evlist, process,
-						attrs);
-
-	err = perf_event__synthesize_thread_map2(tool, evlist->core.threads,
-						 process, NULL);
-	if (err < 0) {
-		pr_err("Couldn't synthesize thread map.\n");
-		return err;
-	}
-
-	err = perf_event__synthesize_cpu_map(tool, evlist->core.cpus,
-					     process, NULL);
-	if (err < 0) {
-		pr_err("Couldn't synthesize thread map.\n");
-		return err;
-	}
-
-	err = perf_event__synthesize_stat_config(tool, config, process, NULL);
-	if (err < 0) {
-		pr_err("Couldn't synthesize config.\n");
-		return err;
-	}
-
-	return 0;
-}
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 14fe3e5..edbeb2f 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -7,8 +7,9 @@
 #include <sys/types.h>
 #include <sys/resource.h>
 #include "rblist.h"
-#include "event.h"
 
+struct perf_cpu_map;
+struct perf_stat_config;
 struct timespec;
 
 struct stats {
@@ -192,6 +193,7 @@
 int perf_evlist__alloc_stats(struct evlist *evlist, bool alloc_raw);
 void perf_evlist__free_stats(struct evlist *evlist);
 void perf_evlist__reset_stats(struct evlist *evlist);
+void perf_evlist__reset_prev_raw_counts(struct evlist *evlist);
 
 int perf_stat_process_counter(struct perf_stat_config *config,
 			      struct evsel *counter);
@@ -210,11 +212,6 @@
 int create_perf_stat_counter(struct evsel *evsel,
 			     struct perf_stat_config *config,
 			     struct target *target);
-int perf_stat_synthesize_config(struct perf_stat_config *config,
-				struct perf_tool *tool,
-				struct evlist *evlist,
-				perf_event__handler_t process,
-				bool attrs);
 void
 perf_evlist__print_counters(struct evlist *evlist,
 			    struct perf_stat_config *config,
diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c
index 582f4a6..96f941e 100644
--- a/tools/perf/util/svghelper.c
+++ b/tools/perf/util/svghelper.c
@@ -17,11 +17,11 @@
 #include <linux/string.h>
 #include <linux/time64.h>
 #include <linux/zalloc.h>
+#include <internal/cpumap.h>
 #include <perf/cpumap.h>
 
 #include "env.h"
 #include "svghelper.h"
-#include "cpumap.h"
 
 static u64 first_time, last_time;
 static u64 turbo_frequency, max_freq;
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 9428639..66f4be1 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -7,6 +7,7 @@
 #include <unistd.h>
 #include <inttypes.h>
 
+#include "dso.h"
 #include "map.h"
 #include "map_groups.h"
 #include "symbol.h"
@@ -16,10 +17,12 @@
 #include "machine.h"
 #include "vdso.h"
 #include "debug.h"
-#include "util.h"
+#include "util/copyfile.h"
 #include <linux/ctype.h>
+#include <linux/kernel.h>
 #include <linux/zalloc.h>
 #include <symbol/kallsyms.h>
+#include <internal/lib.h>
 
 #ifndef EM_AARCH64
 #define EM_AARCH64	183  /* ARM 64 bit */
diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c
index 7e2813e..d6e99af 100644
--- a/tools/perf/util/symbol-minimal.c
+++ b/tools/perf/util/symbol-minimal.c
@@ -1,8 +1,6 @@
-// SPDX-License-Identifier: GPL-2.0
 #include "dso.h"
 #include "symbol.h"
 #include "symsrc.h"
-#include "util.h"
 
 #include <errno.h>
 #include <unistd.h>
@@ -13,6 +11,7 @@
 #include <byteswap.h>
 #include <sys/stat.h>
 #include <linux/zalloc.h>
+#include <internal/lib.h>
 
 static bool check_need_swap(int file_endian)
 {
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 765c75d..a8f80e4 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -19,7 +19,7 @@
 #include "build-id.h"
 #include "cap.h"
 #include "dso.h"
-#include "util.h"
+#include "util.h" // lsdir()
 #include "debug.h"
 #include "event.h"
 #include "machine.h"
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
new file mode 100644
index 0000000..807cbca
--- /dev/null
+++ b/tools/perf/util/synthetic-events.c
@@ -0,0 +1,1884 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "util/debug.h"
+#include "util/dso.h"
+#include "util/event.h"
+#include "util/evlist.h"
+#include "util/machine.h"
+#include "util/map.h"
+#include "util/map_symbol.h"
+#include "util/branch.h"
+#include "util/memswap.h"
+#include "util/namespaces.h"
+#include "util/session.h"
+#include "util/stat.h"
+#include "util/symbol.h"
+#include "util/synthetic-events.h"
+#include "util/target.h"
+#include "util/time-utils.h"
+#include <linux/bitops.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/zalloc.h>
+#include <linux/perf_event.h>
+#include <asm/bug.h>
+#include <perf/evsel.h>
+#include <internal/cpumap.h>
+#include <perf/cpumap.h>
+#include <internal/lib.h> // page_size
+#include <internal/threadmap.h>
+#include <perf/threadmap.h>
+#include <symbol/kallsyms.h>
+#include <dirent.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+#include <uapi/linux/mman.h> /* To get things like MAP_HUGETLB even on older libc headers */
+#include <api/fs/fs.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#define DEFAULT_PROC_MAP_PARSE_TIMEOUT 500
+
+unsigned int proc_map_timeout = DEFAULT_PROC_MAP_PARSE_TIMEOUT;
+
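+/*
+ * Deliver a synthesized event through the tool's process callback with a
+ * minimal fake sample (pid/tid/cpu of -1, period of 1), since synthesized
+ * events carry no real sample data of their own.
+ */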
+int perf_tool__process_synth_event(struct perf_tool *tool,
+				   union perf_event *event,
+				   struct machine *machine,
+				   perf_event__handler_t process)
+{
+	struct perf_sample synth_sample = {
+		.pid	   = -1,
+		.tid	   = -1,
+		.time	   = -1,
+		.stream_id = -1,
+		.cpu	   = -1,
+		.period	   = 1,
+		.cpumode   = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK,
+	};
+
+	return process(tool, event, &synth_sample, machine);
+};
+
+/*
+ * Assumes that the first 4095 bytes of /proc/pid/status contain
+ * the comm, tgid and ppid.
+ */
+static int perf_event__get_comm_ids(pid_t pid, char *comm, size_t len,
+				    pid_t *tgid, pid_t *ppid)
+{
+	char filename[PATH_MAX];
+	char bf[4096];
+	int fd;
+	size_t size = 0;
+	ssize_t n;
+	char *name, *tgids, *ppids;
+
+	*tgid = -1;
+	*ppid = -1;
+
+	snprintf(filename, sizeof(filename), "/proc/%d/status", pid);
+
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		pr_debug("couldn't open %s\n", filename);
+		return -1;
+	}
+
+	n = read(fd, bf, sizeof(bf) - 1);
+	close(fd);
+	if (n <= 0) {
+		pr_warning("Couldn't get COMM, tigd and ppid for pid %d\n",
+			   pid);
+		return -1;
+	}
+	bf[n] = '\0';
+
+	name = strstr(bf, "Name:");
+	tgids = strstr(bf, "Tgid:");
+	ppids = strstr(bf, "PPid:");
+
+	if (name) {
+		char *nl;
+
+		name = skip_spaces(name + 5);  /* strlen("Name:") */
+		nl = strchr(name, '\n');
+		if (nl)
+			*nl = '\0';
+
+		size = strlen(name);
+		if (size >= len)
+			size = len - 1;
+		memcpy(comm, name, size);
+		comm[size] = '\0';
+	} else {
+		pr_debug("Name: string not found for pid %d\n", pid);
+	}
+
+	if (tgids) {
+		tgids += 5;  /* strlen("Tgid:") */
+		*tgid = atoi(tgids);
+	} else {
+		pr_debug("Tgid: string not found for pid %d\n", pid);
+	}
+
+	if (ppids) {
+		ppids += 5;  /* strlen("PPid:") */
+		*ppid = atoi(ppids);
+	} else {
+		pr_debug("PPid: string not found for pid %d\n", pid);
+	}
+
+	return 0;
+}
+
+static int perf_event__prepare_comm(union perf_event *event, pid_t pid,
+				    struct machine *machine,
+				    pid_t *tgid, pid_t *ppid)
+{
+	size_t size;
+
+	*ppid = -1;
+
+	memset(&event->comm, 0, sizeof(event->comm));
+
+	if (machine__is_host(machine)) {
+		if (perf_event__get_comm_ids(pid, event->comm.comm,
+					     sizeof(event->comm.comm),
+					     tgid, ppid) != 0) {
+			return -1;
+		}
+	} else {
+		*tgid = machine->pid;
+	}
+
+	if (*tgid < 0)
+		return -1;
+
+	event->comm.pid = *tgid;
+	event->comm.header.type = PERF_RECORD_COMM;
+
+	size = strlen(event->comm.comm) + 1;
+	size = PERF_ALIGN(size, sizeof(u64));
+	memset(event->comm.comm + size, 0, machine->id_hdr_size);
+	event->comm.header.size = (sizeof(event->comm) -
+				(sizeof(event->comm.comm) - size) +
+				machine->id_hdr_size);
+	event->comm.tid = pid;
+
+	return 0;
+}
+
+pid_t perf_event__synthesize_comm(struct perf_tool *tool,
+					 union perf_event *event, pid_t pid,
+					 perf_event__handler_t process,
+					 struct machine *machine)
+{
+	pid_t tgid, ppid;
+
+	if (perf_event__prepare_comm(event, pid, machine, &tgid, &ppid) != 0)
+		return -1;
+
+	if (perf_tool__process_synth_event(tool, event, machine, process) != 0)
+		return -1;
+
+	return tgid;
+}
+
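+/*
+ * Fill in the device/inode pair that uniquely identifies a namespace by
+ * stat()ing its /proc/<pid>/ns/<name> link.
+ */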
+static void perf_event__get_ns_link_info(pid_t pid, const char *ns,
+					 struct perf_ns_link_info *ns_link_info)
+{
+	struct stat64 st;
+	char proc_ns[128];
+
+	sprintf(proc_ns, "/proc/%u/ns/%s", pid, ns);
+	if (stat64(proc_ns, &st) == 0) {
+		ns_link_info->dev = st.st_dev;
+		ns_link_info->ino = st.st_ino;
+	}
+}
+
+int perf_event__synthesize_namespaces(struct perf_tool *tool,
+				      union perf_event *event,
+				      pid_t pid, pid_t tgid,
+				      perf_event__handler_t process,
+				      struct machine *machine)
+{
+	u32 idx;
+	struct perf_ns_link_info *ns_link_info;
+
+	if (!tool || !tool->namespace_events)
+		return 0;
+
+	memset(&event->namespaces, 0, (sizeof(event->namespaces) +
+	       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+	       machine->id_hdr_size));
+
+	event->namespaces.pid = tgid;
+	event->namespaces.tid = pid;
+
+	event->namespaces.nr_namespaces = NR_NAMESPACES;
+
+	ns_link_info = event->namespaces.link_info;
+
+	for (idx = 0; idx < event->namespaces.nr_namespaces; idx++)
+		perf_event__get_ns_link_info(pid, perf_ns__name(idx),
+					     &ns_link_info[idx]);
+
+	event->namespaces.header.type = PERF_RECORD_NAMESPACES;
+
+	event->namespaces.header.size = (sizeof(event->namespaces) +
+			(NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+			machine->id_hdr_size);
+
+	if (perf_tool__process_synth_event(tool, event, machine, process) != 0)
+		return -1;
+
+	return 0;
+}
+
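+/*
+ * Synthesize a PERF_RECORD_FORK for an existing thread, choosing the parent
+ * according to the main-thread assumption documented inside.
+ */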
+static int perf_event__synthesize_fork(struct perf_tool *tool,
+				       union perf_event *event,
+				       pid_t pid, pid_t tgid, pid_t ppid,
+				       perf_event__handler_t process,
+				       struct machine *machine)
+{
+	memset(&event->fork, 0, sizeof(event->fork) + machine->id_hdr_size);
+
+	/*
+	 * For the main thread, set the parent to the ppid from the status
+	 * file. For other threads, set the parent pid to the main thread,
+	 * i.e. assume the main thread spawns all threads in a process.
+	 */
+	if (tgid == pid) {
+		event->fork.ppid = ppid;
+		event->fork.ptid = ppid;
+	} else {
+		event->fork.ppid = tgid;
+		event->fork.ptid = tgid;
+	}
+	event->fork.pid  = tgid;
+	event->fork.tid  = pid;
+	event->fork.header.type = PERF_RECORD_FORK;
+	event->fork.header.misc = PERF_RECORD_MISC_FORK_EXEC;
+
+	event->fork.header.size = (sizeof(event->fork) + machine->id_hdr_size);
+
+	if (perf_tool__process_synth_event(tool, event, machine, process) != 0)
+		return -1;
+
+	return 0;
+}
+
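+/*
+ * Walk /proc/<pid>/task/<pid>/maps and emit one PERF_RECORD_MMAP2 per
+ * executable (or, with mmap_data, readable) mapping, truncating the scan if
+ * parsing exceeds proc_map_timeout milliseconds.
+ */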
+int perf_event__synthesize_mmap_events(struct perf_tool *tool,
+				       union perf_event *event,
+				       pid_t pid, pid_t tgid,
+				       perf_event__handler_t process,
+				       struct machine *machine,
+				       bool mmap_data)
+{
+	char filename[PATH_MAX];
+	FILE *fp;
+	unsigned long long t;
+	bool truncation = false;
+	unsigned long long timeout = proc_map_timeout * 1000000ULL;
+	int rc = 0;
+	const char *hugetlbfs_mnt = hugetlbfs__mountpoint();
+	int hugetlbfs_mnt_len = hugetlbfs_mnt ? strlen(hugetlbfs_mnt) : 0;
+
+	if (machine__is_default_guest(machine))
+		return 0;
+
+	snprintf(filename, sizeof(filename), "%s/proc/%d/task/%d/maps",
+		 machine->root_dir, pid, pid);
+
+	fp = fopen(filename, "r");
+	if (fp == NULL) {
+		/*
+		 * We raced with a task exiting - just return:
+		 */
+		pr_debug("couldn't open %s\n", filename);
+		return -1;
+	}
+
+	event->header.type = PERF_RECORD_MMAP2;
+	t = rdclock();
+
+	while (1) {
+		char bf[BUFSIZ];
+		char prot[5];
+		char execname[PATH_MAX];
+		char anonstr[] = "//anon";
+		unsigned int ino;
+		size_t size;
+		ssize_t n;
+
+		if (fgets(bf, sizeof(bf), fp) == NULL)
+			break;
+
+		if ((rdclock() - t) > timeout) {
+			pr_warning("Reading %s time out. "
+				   "You may want to increase "
+				   "the time limit by --proc-map-timeout\n",
+				   filename);
+			truncation = true;
+			goto out;
+		}
+
+		/* ensure null termination since stack will be reused. */
+		strcpy(execname, "");
+
+		/* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
+		n = sscanf(bf, "%"PRI_lx64"-%"PRI_lx64" %s %"PRI_lx64" %x:%x %u %[^\n]\n",
+		       &event->mmap2.start, &event->mmap2.len, prot,
+		       &event->mmap2.pgoff, &event->mmap2.maj,
+		       &event->mmap2.min,
+		       &ino, execname);
+
+		/*
+		 * Anon maps don't have the execname.
+		 */
+		if (n < 7)
+			continue;
+
+		event->mmap2.ino = (u64)ino;
+
+		/*
+		 * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c
+		 */
+		if (machine__is_host(machine))
+			event->header.misc = PERF_RECORD_MISC_USER;
+		else
+			event->header.misc = PERF_RECORD_MISC_GUEST_USER;
+
+		/* map protection and flags bits */
+		event->mmap2.prot = 0;
+		event->mmap2.flags = 0;
+		if (prot[0] == 'r')
+			event->mmap2.prot |= PROT_READ;
+		if (prot[1] == 'w')
+			event->mmap2.prot |= PROT_WRITE;
+		if (prot[2] == 'x')
+			event->mmap2.prot |= PROT_EXEC;
+
+		if (prot[3] == 's')
+			event->mmap2.flags |= MAP_SHARED;
+		else
+			event->mmap2.flags |= MAP_PRIVATE;
+
+		if (prot[2] != 'x') {
+			if (!mmap_data || prot[0] != 'r')
+				continue;
+
+			event->header.misc |= PERF_RECORD_MISC_MMAP_DATA;
+		}
+
+out:
+		if (truncation)
+			event->header.misc |= PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT;
+
+		if (!strcmp(execname, ""))
+			strcpy(execname, anonstr);
+
+		if (hugetlbfs_mnt_len &&
+		    !strncmp(execname, hugetlbfs_mnt, hugetlbfs_mnt_len)) {
+			strcpy(execname, anonstr);
+			event->mmap2.flags |= MAP_HUGETLB;
+		}
+
+		size = strlen(execname) + 1;
+		memcpy(event->mmap2.filename, execname, size);
+		size = PERF_ALIGN(size, sizeof(u64));
+		event->mmap2.len -= event->mmap.start;
+		event->mmap2.header.size = (sizeof(event->mmap2) -
+					(sizeof(event->mmap2.filename) - size));
+		memset(event->mmap2.filename + size, 0, machine->id_hdr_size);
+		event->mmap2.header.size += machine->id_hdr_size;
+		event->mmap2.pid = tgid;
+		event->mmap2.tid = pid;
+
+		if (perf_tool__process_synth_event(tool, event, machine, process) != 0) {
+			rc = -1;
+			break;
+		}
+
+		if (truncation)
+			break;
+	}
+
+	fclose(fp);
+	return rc;
+}
+
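+/*
+ * Emit one PERF_RECORD_MMAP per loaded kernel module so module addresses
+ * can later be resolved to symbols.
+ */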
+int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process,
+				   struct machine *machine)
+{
+	int rc = 0;
+	struct map *pos;
+	struct maps *maps = machine__kernel_maps(machine);
+	union perf_event *event = zalloc((sizeof(event->mmap) +
+					  machine->id_hdr_size));
+	if (event == NULL) {
+		pr_debug("Not enough memory synthesizing mmap event "
+			 "for kernel modules\n");
+		return -1;
+	}
+
+	event->header.type = PERF_RECORD_MMAP;
+
+	/*
+	 * kernel uses 0 for user space maps, see kernel/perf_event.c
+	 * __perf_event_mmap
+	 */
+	if (machine__is_host(machine))
+		event->header.misc = PERF_RECORD_MISC_KERNEL;
+	else
+		event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
+
+	for (pos = maps__first(maps); pos; pos = map__next(pos)) {
+		size_t size;
+
+		if (!__map__is_kmodule(pos))
+			continue;
+
+		size = PERF_ALIGN(pos->dso->long_name_len + 1, sizeof(u64));
+		event->mmap.header.type = PERF_RECORD_MMAP;
+		event->mmap.header.size = (sizeof(event->mmap) -
+				        (sizeof(event->mmap.filename) - size));
+		memset(event->mmap.filename + size, 0, machine->id_hdr_size);
+		event->mmap.header.size += machine->id_hdr_size;
+		event->mmap.start = pos->start;
+		event->mmap.len   = pos->end - pos->start;
+		event->mmap.pid   = machine->pid;
+
+		memcpy(event->mmap.filename, pos->dso->long_name,
+		       pos->dso->long_name_len + 1);
+		if (perf_tool__process_synth_event(tool, event, machine, process) != 0) {
+			rc = -1;
+			break;
+		}
+	}
+
+	free(event);
+	return rc;
+}
+
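+/*
+ * Synthesize events for one thread: with !full, only comm/namespaces (and,
+ * for the thread group leader, mmap) events for the given pid; with full,
+ * comm/fork/namespaces events for every task under /proc/<pid>/task.
+ */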
+static int __event__synthesize_thread(union perf_event *comm_event,
+				      union perf_event *mmap_event,
+				      union perf_event *fork_event,
+				      union perf_event *namespaces_event,
+				      pid_t pid, int full, perf_event__handler_t process,
+				      struct perf_tool *tool, struct machine *machine, bool mmap_data)
+{
+	char filename[PATH_MAX];
+	DIR *tasks;
+	struct dirent *dirent;
+	pid_t tgid, ppid;
+	int rc = 0;
+
+	/* special case: only send one comm event using passed in pid */
+	if (!full) {
+		tgid = perf_event__synthesize_comm(tool, comm_event, pid,
+						   process, machine);
+
+		if (tgid == -1)
+			return -1;
+
+		if (perf_event__synthesize_namespaces(tool, namespaces_event, pid,
+						      tgid, process, machine) < 0)
+			return -1;
+
+		/*
+		 * send mmap only for thread group leader
+		 * see thread__init_map_groups
+		 */
+		if (pid == tgid &&
+		    perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
+						       process, machine, mmap_data))
+			return -1;
+
+		return 0;
+	}
+
+	if (machine__is_default_guest(machine))
+		return 0;
+
+	snprintf(filename, sizeof(filename), "%s/proc/%d/task",
+		 machine->root_dir, pid);
+
+	tasks = opendir(filename);
+	if (tasks == NULL) {
+		pr_debug("couldn't open %s\n", filename);
+		return 0;
+	}
+
+	while ((dirent = readdir(tasks)) != NULL) {
+		char *end;
+		pid_t _pid;
+
+		_pid = strtol(dirent->d_name, &end, 10);
+		if (*end)
+			continue;
+
+		rc = -1;
+		if (perf_event__prepare_comm(comm_event, _pid, machine,
+					     &tgid, &ppid) != 0)
+			break;
+
+		if (perf_event__synthesize_fork(tool, fork_event, _pid, tgid,
+						ppid, process, machine) < 0)
+			break;
+
+		if (perf_event__synthesize_namespaces(tool, namespaces_event, _pid,
+						      tgid, process, machine) < 0)
+			break;
+
+		/*
+		 * Send the prepared comm event
+		 */
+		if (perf_tool__process_synth_event(tool, comm_event, machine, process) != 0)
+			break;
+
+		rc = 0;
+		if (_pid == pid) {
+			/* process the parent's maps too */
+			rc = perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
+						process, machine, mmap_data);
+			if (rc)
+				break;
+		}
+	}
+
+	closedir(tasks);
+	return rc;
+}
+
+int perf_event__synthesize_thread_map(struct perf_tool *tool,
+				      struct perf_thread_map *threads,
+				      perf_event__handler_t process,
+				      struct machine *machine,
+				      bool mmap_data)
+{
+	union perf_event *comm_event, *mmap_event, *fork_event;
+	union perf_event *namespaces_event;
+	int err = -1, thread, j;
+
+	comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
+	if (comm_event == NULL)
+		goto out;
+
+	mmap_event = malloc(sizeof(mmap_event->mmap2) + machine->id_hdr_size);
+	if (mmap_event == NULL)
+		goto out_free_comm;
+
+	fork_event = malloc(sizeof(fork_event->fork) + machine->id_hdr_size);
+	if (fork_event == NULL)
+		goto out_free_mmap;
+
+	namespaces_event = malloc(sizeof(namespaces_event->namespaces) +
+				  (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+				  machine->id_hdr_size);
+	if (namespaces_event == NULL)
+		goto out_free_fork;
+
+	err = 0;
+	for (thread = 0; thread < threads->nr; ++thread) {
+		if (__event__synthesize_thread(comm_event, mmap_event,
+					       fork_event, namespaces_event,
+					       perf_thread_map__pid(threads, thread), 0,
+					       process, tool, machine,
+					       mmap_data)) {
+			err = -1;
+			break;
+		}
+
+		/*
+		 * comm.pid is set to thread group id by
+		 * perf_event__synthesize_comm
+		 */
+		if ((int) comm_event->comm.pid != perf_thread_map__pid(threads, thread)) {
+			bool need_leader = true;
+
+			/* is thread group leader in thread_map? */
+			for (j = 0; j < threads->nr; ++j) {
+				if ((int) comm_event->comm.pid == perf_thread_map__pid(threads, j)) {
+					need_leader = false;
+					break;
+				}
+			}
+
+			/* if not, generate events for it */
+			if (need_leader &&
+			    __event__synthesize_thread(comm_event, mmap_event,
+						       fork_event, namespaces_event,
+						       comm_event->comm.pid, 0,
+						       process, tool, machine,
+						       mmap_data)) {
+				err = -1;
+				break;
+			}
+		}
+	}
+	free(namespaces_event);
+out_free_fork:
+	free(fork_event);
+out_free_mmap:
+	free(mmap_event);
+out_free_comm:
+	free(comm_event);
+out:
+	return err;
+}
+
+static int __perf_event__synthesize_threads(struct perf_tool *tool,
+					    perf_event__handler_t process,
+					    struct machine *machine,
+					    bool mmap_data,
+					    struct dirent **dirent,
+					    int start,
+					    int num)
+{
+	union perf_event *comm_event, *mmap_event, *fork_event;
+	union perf_event *namespaces_event;
+	int err = -1;
+	char *end;
+	pid_t pid;
+	int i;
+
+	comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
+	if (comm_event == NULL)
+		goto out;
+
+	mmap_event = malloc(sizeof(mmap_event->mmap2) + machine->id_hdr_size);
+	if (mmap_event == NULL)
+		goto out_free_comm;
+
+	fork_event = malloc(sizeof(fork_event->fork) + machine->id_hdr_size);
+	if (fork_event == NULL)
+		goto out_free_mmap;
+
+	namespaces_event = malloc(sizeof(namespaces_event->namespaces) +
+				  (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+				  machine->id_hdr_size);
+	if (namespaces_event == NULL)
+		goto out_free_fork;
+
+	for (i = start; i < start + num; i++) {
+		if (!isdigit(dirent[i]->d_name[0]))
+			continue;
+
+		pid = (pid_t)strtol(dirent[i]->d_name, &end, 10);
+		/* only interested in proper numerical dirents */
+		if (*end)
+			continue;
+		/*
+		 * We may race with an exiting thread, so don't stop just because
+		 * one thread couldn't be synthesized.
+		 */
+		__event__synthesize_thread(comm_event, mmap_event, fork_event,
+					   namespaces_event, pid, 1, process,
+					   tool, machine, mmap_data);
+	}
+	err = 0;
+
+	free(namespaces_event);
+out_free_fork:
+	free(fork_event);
+out_free_mmap:
+	free(mmap_event);
+out_free_comm:
+	free(comm_event);
+out:
+	return err;
+}
+
+struct synthesize_threads_arg {
+	struct perf_tool *tool;
+	perf_event__handler_t process;
+	struct machine *machine;
+	bool mmap_data;
+	struct dirent **dirent;
+	int num;
+	int start;
+};
+
+static void *synthesize_threads_worker(void *arg)
+{
+	struct synthesize_threads_arg *args = arg;
+
+	__perf_event__synthesize_threads(args->tool, args->process,
+					 args->machine, args->mmap_data,
+					 args->dirent,
+					 args->start, args->num);
+	return NULL;
+}
+
+int perf_event__synthesize_threads(struct perf_tool *tool,
+				   perf_event__handler_t process,
+				   struct machine *machine,
+				   bool mmap_data,
+				   unsigned int nr_threads_synthesize)
+{
+	struct synthesize_threads_arg *args = NULL;
+	pthread_t *synthesize_threads = NULL;
+	char proc_path[PATH_MAX];
+	struct dirent **dirent;
+	int num_per_thread;
+	int m, n, i, j;
+	int thread_nr;
+	int base = 0;
+	int err = -1;
+
+	if (machine__is_default_guest(machine))
+		return 0;
+
+	snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
+	n = scandir(proc_path, &dirent, 0, alphasort);
+	if (n < 0)
+		return err;
+
+	if (nr_threads_synthesize == UINT_MAX)
+		thread_nr = sysconf(_SC_NPROCESSORS_ONLN);
+	else
+		thread_nr = nr_threads_synthesize;
+
+	if (thread_nr <= 1) {
+		err = __perf_event__synthesize_threads(tool, process,
+						       machine, mmap_data,
+						       dirent, base, n);
+		goto free_dirent;
+	}
+	if (thread_nr > n)
+		thread_nr = n;
+
+	synthesize_threads = calloc(sizeof(pthread_t), thread_nr);
+	if (synthesize_threads == NULL)
+		goto free_dirent;
+
+	args = calloc(sizeof(*args), thread_nr);
+	if (args == NULL)
+		goto free_threads;
+
+	num_per_thread = n / thread_nr;
+	m = n % thread_nr;
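+	/*
+	 * Split the n /proc entries across the workers: the first m workers
+	 * take num_per_thread + 1 entries each, the remaining ones take
+	 * num_per_thread, so every entry is handled exactly once.
+	 */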
+	for (i = 0; i < thread_nr; i++) {
+		args[i].tool = tool;
+		args[i].process = process;
+		args[i].machine = machine;
+		args[i].mmap_data = mmap_data;
+		args[i].dirent = dirent;
+	}
+	for (i = 0; i < m; i++) {
+		args[i].num = num_per_thread + 1;
+		args[i].start = i * args[i].num;
+	}
+	if (i != 0)
+		base = args[i-1].start + args[i-1].num;
+	for (j = i; j < thread_nr; j++) {
+		args[j].num = num_per_thread;
+		args[j].start = base + (j - i) * args[i].num;
+	}
+
+	for (i = 0; i < thread_nr; i++) {
+		if (pthread_create(&synthesize_threads[i], NULL,
+				   synthesize_threads_worker, &args[i]))
+			goto out_join;
+	}
+	err = 0;
+out_join:
+	for (i = 0; i < thread_nr; i++)
+		pthread_join(synthesize_threads[i], NULL);
+	free(args);
+free_threads:
+	free(synthesize_threads);
+free_dirent:
+	for (i = 0; i < n; i++)
+		zfree(&dirent[i]);
+	free(dirent);
+
+	return err;
+}
+
+int __weak perf_event__synthesize_extra_kmaps(struct perf_tool *tool __maybe_unused,
+					      perf_event__handler_t process __maybe_unused,
+					      struct machine *machine __maybe_unused)
+{
+	return 0;
+}
+
+static int __perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
+						perf_event__handler_t process,
+						struct machine *machine)
+{
+	size_t size;
+	struct map *map = machine__kernel_map(machine);
+	struct kmap *kmap;
+	int err;
+	union perf_event *event;
+
+	if (map == NULL)
+		return -1;
+
+	kmap = map__kmap(map);
+	if (!kmap->ref_reloc_sym)
+		return -1;
+
+	/*
+	 * We should get this from /sys/kernel/sections/.text, but until that is
+	 * available use this, and once it is, keep this as a fallback for older
+	 * kernels.
+	 */
+	event = zalloc((sizeof(event->mmap) + machine->id_hdr_size));
+	if (event == NULL) {
+		pr_debug("Not enough memory synthesizing mmap event "
+			 "for kernel modules\n");
+		return -1;
+	}
+
+	if (machine__is_host(machine)) {
+		/*
+		 * kernel uses PERF_RECORD_MISC_USER for user space maps,
+		 * see kernel/perf_event.c __perf_event_mmap
+		 */
+		event->header.misc = PERF_RECORD_MISC_KERNEL;
+	} else {
+		event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
+	}
+
+	size = snprintf(event->mmap.filename, sizeof(event->mmap.filename),
+			"%s%s", machine->mmap_name, kmap->ref_reloc_sym->name) + 1;
+	size = PERF_ALIGN(size, sizeof(u64));
+	event->mmap.header.type = PERF_RECORD_MMAP;
+	event->mmap.header.size = (sizeof(event->mmap) -
+			(sizeof(event->mmap.filename) - size) + machine->id_hdr_size);
+	event->mmap.pgoff = kmap->ref_reloc_sym->addr;
+	event->mmap.start = map->start;
+	event->mmap.len   = map->end - event->mmap.start;
+	event->mmap.pid   = machine->pid;
+
+	err = perf_tool__process_synth_event(tool, event, machine, process);
+	free(event);
+
+	return err;
+}
+
+int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
+				       perf_event__handler_t process,
+				       struct machine *machine)
+{
+	int err;
+
+	err = __perf_event__synthesize_kernel_mmap(tool, process, machine);
+	if (err < 0)
+		return err;
+
+	return perf_event__synthesize_extra_kmaps(tool, process, machine);
+}
+
+int perf_event__synthesize_thread_map2(struct perf_tool *tool,
+				      struct perf_thread_map *threads,
+				      perf_event__handler_t process,
+				      struct machine *machine)
+{
+	union perf_event *event;
+	int i, err, size;
+
+	size  = sizeof(event->thread_map);
+	size +=	threads->nr * sizeof(event->thread_map.entries[0]);
+
+	event = zalloc(size);
+	if (!event)
+		return -ENOMEM;
+
+	event->header.type = PERF_RECORD_THREAD_MAP;
+	event->header.size = size;
+	event->thread_map.nr = threads->nr;
+
+	for (i = 0; i < threads->nr; i++) {
+		struct perf_record_thread_map_entry *entry = &event->thread_map.entries[i];
+		char *comm = perf_thread_map__comm(threads, i);
+
+		if (!comm)
+			comm = (char *) "";
+
+		entry->pid = perf_thread_map__pid(threads, i);
+		strncpy((char *) &entry->comm, comm, sizeof(entry->comm));
+	}
+
+	err = process(tool, event, NULL, machine);
+
+	free(event);
+	return err;
+}
+
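+/*
+ * A cpu map can be encoded either as a plain list of cpu ids or as a
+ * bitmask; cpu_map_data__alloc() below picks whichever encoding is smaller.
+ */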
+static void synthesize_cpus(struct cpu_map_entries *cpus,
+			    struct perf_cpu_map *map)
+{
+	int i;
+
+	cpus->nr = map->nr;
+
+	for (i = 0; i < map->nr; i++)
+		cpus->cpu[i] = map->map[i];
+}
+
+static void synthesize_mask(struct perf_record_record_cpu_map *mask,
+			    struct perf_cpu_map *map, int max)
+{
+	int i;
+
+	mask->nr = BITS_TO_LONGS(max);
+	mask->long_size = sizeof(long);
+
+	for (i = 0; i < map->nr; i++)
+		set_bit(map->map[i], mask->mask);
+}
+
+static size_t cpus_size(struct perf_cpu_map *map)
+{
+	return sizeof(struct cpu_map_entries) + map->nr * sizeof(u16);
+}
+
+static size_t mask_size(struct perf_cpu_map *map, int *max)
+{
+	int i;
+
+	*max = 0;
+
+	for (i = 0; i < map->nr; i++) {
+		/* the bit position of the cpu is its id + 1 */
+		int bit = map->map[i] + 1;
+
+		if (bit > *max)
+			*max = bit;
+	}
+
+	return sizeof(struct perf_record_record_cpu_map) + BITS_TO_LONGS(*max) * sizeof(long);
+}
+
+void *cpu_map_data__alloc(struct perf_cpu_map *map, size_t *size, u16 *type, int *max)
+{
+	size_t size_cpus, size_mask;
+	bool is_dummy = perf_cpu_map__empty(map);
+
+	/*
+	 * Both array and mask data have variable size based
+	 * on the number of cpus and their actual values.
+	 * The size of the 'struct perf_record_cpu_map_data' is:
+	 *
+	 *   array = size of 'struct cpu_map_entries' +
+	 *           number of cpus * sizeof(u64)
+	 *
+	 *   mask  = size of 'struct perf_record_record_cpu_map' +
+	 *           maximum cpu bit converted to size of longs
+	 *
+	 * and finally + the size of 'struct perf_record_cpu_map_data'.
+	 */
+	size_cpus = cpus_size(map);
+	size_mask = mask_size(map, max);
+
+	if (is_dummy || (size_cpus < size_mask)) {
+		*size += size_cpus;
+		*type  = PERF_CPU_MAP__CPUS;
+	} else {
+		*size += size_mask;
+		*type  = PERF_CPU_MAP__MASK;
+	}
+
+	*size += sizeof(struct perf_record_cpu_map_data);
+	*size = PERF_ALIGN(*size, sizeof(u64));
+	return zalloc(*size);
+}
+
+void cpu_map_data__synthesize(struct perf_record_cpu_map_data *data, struct perf_cpu_map *map,
+			      u16 type, int max)
+{
+	data->type = type;
+
+	switch (type) {
+	case PERF_CPU_MAP__CPUS:
+		synthesize_cpus((struct cpu_map_entries *) data->data, map);
+		break;
+	case PERF_CPU_MAP__MASK:
+		synthesize_mask((struct perf_record_record_cpu_map *)data->data, map, max);
+	default:
+		break;
+	};
+}
+
+static struct perf_record_cpu_map *cpu_map_event__new(struct perf_cpu_map *map)
+{
+	size_t size = sizeof(struct perf_record_cpu_map);
+	struct perf_record_cpu_map *event;
+	int max;
+	u16 type;
+
+	event = cpu_map_data__alloc(map, &size, &type, &max);
+	if (!event)
+		return NULL;
+
+	event->header.type = PERF_RECORD_CPU_MAP;
+	event->header.size = size;
+	event->data.type   = type;
+
+	cpu_map_data__synthesize(&event->data, map, type, max);
+	return event;
+}
+
+int perf_event__synthesize_cpu_map(struct perf_tool *tool,
+				   struct perf_cpu_map *map,
+				   perf_event__handler_t process,
+				   struct machine *machine)
+{
+	struct perf_record_cpu_map *event;
+	int err;
+
+	event = cpu_map_event__new(map);
+	if (!event)
+		return -ENOMEM;
+
+	err = process(tool, (union perf_event *) event, NULL, machine);
+
+	free(event);
+	return err;
+}
+
+int perf_event__synthesize_stat_config(struct perf_tool *tool,
+				       struct perf_stat_config *config,
+				       perf_event__handler_t process,
+				       struct machine *machine)
+{
+	struct perf_record_stat_config *event;
+	int size, i = 0, err;
+
+	size  = sizeof(*event);
+	size += (PERF_STAT_CONFIG_TERM__MAX * sizeof(event->data[0]));
+
+	event = zalloc(size);
+	if (!event)
+		return -ENOMEM;
+
+	event->header.type = PERF_RECORD_STAT_CONFIG;
+	event->header.size = size;
+	event->nr          = PERF_STAT_CONFIG_TERM__MAX;
+
+#define ADD(__term, __val)					\
+	event->data[i].tag = PERF_STAT_CONFIG_TERM__##__term;	\
+	event->data[i].val = __val;				\
+	i++;
+
+	ADD(AGGR_MODE,	config->aggr_mode)
+	ADD(INTERVAL,	config->interval)
+	ADD(SCALE,	config->scale)
+
+	WARN_ONCE(i != PERF_STAT_CONFIG_TERM__MAX,
+		  "stat config terms unbalanced\n");
+#undef ADD
+
+	err = process(tool, (union perf_event *) event, NULL, machine);
+
+	free(event);
+	return err;
+}
+
+int perf_event__synthesize_stat(struct perf_tool *tool,
+				u32 cpu, u32 thread, u64 id,
+				struct perf_counts_values *count,
+				perf_event__handler_t process,
+				struct machine *machine)
+{
+	struct perf_record_stat event;
+
+	event.header.type = PERF_RECORD_STAT;
+	event.header.size = sizeof(event);
+	event.header.misc = 0;
+
+	event.id        = id;
+	event.cpu       = cpu;
+	event.thread    = thread;
+	event.val       = count->val;
+	event.ena       = count->ena;
+	event.run       = count->run;
+
+	return process(tool, (union perf_event *) &event, NULL, machine);
+}
+
+int perf_event__synthesize_stat_round(struct perf_tool *tool,
+				      u64 evtime, u64 type,
+				      perf_event__handler_t process,
+				      struct machine *machine)
+{
+	struct perf_record_stat_round event;
+
+	event.header.type = PERF_RECORD_STAT_ROUND;
+	event.header.size = sizeof(event);
+	event.header.misc = 0;
+
+	event.time = evtime;
+	event.type = type;
+
+	return process(tool, (union perf_event *) &event, NULL, machine);
+}
+
+size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, u64 read_format)
+{
+	size_t sz, result = sizeof(struct perf_record_sample);
+
+	if (type & PERF_SAMPLE_IDENTIFIER)
+		result += sizeof(u64);
+
+	if (type & PERF_SAMPLE_IP)
+		result += sizeof(u64);
+
+	if (type & PERF_SAMPLE_TID)
+		result += sizeof(u64);
+
+	if (type & PERF_SAMPLE_TIME)
+		result += sizeof(u64);
+
+	if (type & PERF_SAMPLE_ADDR)
+		result += sizeof(u64);
+
+	if (type & PERF_SAMPLE_ID)
+		result += sizeof(u64);
+
+	if (type & PERF_SAMPLE_STREAM_ID)
+		result += sizeof(u64);
+
+	if (type & PERF_SAMPLE_CPU)
+		result += sizeof(u64);
+
+	if (type & PERF_SAMPLE_PERIOD)
+		result += sizeof(u64);
+
+	if (type & PERF_SAMPLE_READ) {
+		result += sizeof(u64);
+		if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+			result += sizeof(u64);
+		if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+			result += sizeof(u64);
+		/* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */
+		if (read_format & PERF_FORMAT_GROUP) {
+			sz = sample->read.group.nr *
+			     sizeof(struct sample_read_value);
+			result += sz;
+		} else {
+			result += sizeof(u64);
+		}
+	}
+
+	if (type & PERF_SAMPLE_CALLCHAIN) {
+		sz = (sample->callchain->nr + 1) * sizeof(u64);
+		result += sz;
+	}
+
+	if (type & PERF_SAMPLE_RAW) {
+		result += sizeof(u32);
+		result += sample->raw_size;
+	}
+
+	if (type & PERF_SAMPLE_BRANCH_STACK) {
+		sz = sample->branch_stack->nr * sizeof(struct branch_entry);
+		sz += sizeof(u64);
+		result += sz;
+	}
+
+	if (type & PERF_SAMPLE_REGS_USER) {
+		if (sample->user_regs.abi) {
+			result += sizeof(u64);
+			sz = hweight64(sample->user_regs.mask) * sizeof(u64);
+			result += sz;
+		} else {
+			result += sizeof(u64);
+		}
+	}
+
+	if (type & PERF_SAMPLE_STACK_USER) {
+		sz = sample->user_stack.size;
+		result += sizeof(u64);
+		if (sz) {
+			result += sz;
+			result += sizeof(u64);
+		}
+	}
+
+	if (type & PERF_SAMPLE_WEIGHT)
+		result += sizeof(u64);
+
+	if (type & PERF_SAMPLE_DATA_SRC)
+		result += sizeof(u64);
+
+	if (type & PERF_SAMPLE_TRANSACTION)
+		result += sizeof(u64);
+
+	if (type & PERF_SAMPLE_REGS_INTR) {
+		if (sample->intr_regs.abi) {
+			result += sizeof(u64);
+			sz = hweight64(sample->intr_regs.mask) * sizeof(u64);
+			result += sz;
+		} else {
+			result += sizeof(u64);
+		}
+	}
+
+	if (type & PERF_SAMPLE_PHYS_ADDR)
+		result += sizeof(u64);
+
+	return result;
+}
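+
+/*
+ * Example (illustrative): with type = PERF_SAMPLE_IP | PERF_SAMPLE_TID |
+ * PERF_SAMPLE_TIME and no read/callchain/raw data, the size is
+ * sizeof(struct perf_record_sample) + 3 * sizeof(u64).
+ */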
+
+int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_format,
+				  const struct perf_sample *sample)
+{
+	__u64 *array;
+	size_t sz;
+	/*
+	 * used for cross-endian analysis. See git commit 65014ab3
+	 * for why this goofiness is needed.
+	 */
+	union u64_swap u;
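+	/*
+	 * Example: PERF_SAMPLE_TID below packs pid and tid as the two
+	 * u32 halves of one u64 slot via this union, so a cross-endian
+	 * reader can byte-swap the u64 and then each u32 half.
+	 */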
+
+	array = event->sample.array;
+
+	if (type & PERF_SAMPLE_IDENTIFIER) {
+		*array = sample->id;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_IP) {
+		*array = sample->ip;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_TID) {
+		u.val32[0] = sample->pid;
+		u.val32[1] = sample->tid;
+		*array = u.val64;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_TIME) {
+		*array = sample->time;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_ADDR) {
+		*array = sample->addr;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_ID) {
+		*array = sample->id;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_STREAM_ID) {
+		*array = sample->stream_id;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_CPU) {
+		u.val32[0] = sample->cpu;
+		u.val32[1] = 0;
+		*array = u.val64;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_PERIOD) {
+		*array = sample->period;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_READ) {
+		if (read_format & PERF_FORMAT_GROUP)
+			*array = sample->read.group.nr;
+		else
+			*array = sample->read.one.value;
+		array++;
+
+		if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+			*array = sample->read.time_enabled;
+			array++;
+		}
+
+		if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+			*array = sample->read.time_running;
+			array++;
+		}
+
+		/* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */
+		if (read_format & PERF_FORMAT_GROUP) {
+			sz = sample->read.group.nr *
+			     sizeof(struct sample_read_value);
+			memcpy(array, sample->read.group.values, sz);
+			array = (void *)array + sz;
+		} else {
+			*array = sample->read.one.id;
+			array++;
+		}
+	}
+
+	if (type & PERF_SAMPLE_CALLCHAIN) {
+		sz = (sample->callchain->nr + 1) * sizeof(u64);
+		memcpy(array, sample->callchain, sz);
+		array = (void *)array + sz;
+	}
+
+	if (type & PERF_SAMPLE_RAW) {
+		u.val32[0] = sample->raw_size;
+		*array = u.val64;
+		array = (void *)array + sizeof(u32);
+
+		memcpy(array, sample->raw_data, sample->raw_size);
+		array = (void *)array + sample->raw_size;
+	}
+
+	if (type & PERF_SAMPLE_BRANCH_STACK) {
+		sz = sample->branch_stack->nr * sizeof(struct branch_entry);
+		sz += sizeof(u64);
+		memcpy(array, sample->branch_stack, sz);
+		array = (void *)array + sz;
+	}
+
+	if (type & PERF_SAMPLE_REGS_USER) {
+		if (sample->user_regs.abi) {
+			*array++ = sample->user_regs.abi;
+			sz = hweight64(sample->user_regs.mask) * sizeof(u64);
+			memcpy(array, sample->user_regs.regs, sz);
+			array = (void *)array + sz;
+		} else {
+			*array++ = 0;
+		}
+	}
+
+	if (type & PERF_SAMPLE_STACK_USER) {
+		sz = sample->user_stack.size;
+		*array++ = sz;
+		if (sz) {
+			memcpy(array, sample->user_stack.data, sz);
+			array = (void *)array + sz;
+			*array++ = sz;
+		}
+	}
+
+	if (type & PERF_SAMPLE_WEIGHT) {
+		*array = sample->weight;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_DATA_SRC) {
+		*array = sample->data_src;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_TRANSACTION) {
+		*array = sample->transaction;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_REGS_INTR) {
+		if (sample->intr_regs.abi) {
+			*array++ = sample->intr_regs.abi;
+			sz = hweight64(sample->intr_regs.mask) * sizeof(u64);
+			memcpy(array, sample->intr_regs.regs, sz);
+			array = (void *)array + sz;
+		} else {
+			*array++ = 0;
+		}
+	}
+
+	if (type & PERF_SAMPLE_PHYS_ADDR) {
+		*array = sample->phys_addr;
+		array++;
+	}
+
+	return 0;
+}
+
+int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process,
+				    struct evlist *evlist, struct machine *machine)
+{
+	union perf_event *ev;
+	struct evsel *evsel;
+	size_t nr = 0, i = 0, sz, max_nr, n;
+	int err;
+
+	pr_debug2("Synthesizing id index\n");
+
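+	/* perf_event_header.size is a u16, so split the index into chunks. */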
+	max_nr = (UINT16_MAX - sizeof(struct perf_record_id_index)) /
+		 sizeof(struct id_index_entry);
+
+	evlist__for_each_entry(evlist, evsel)
+		nr += evsel->core.ids;
+
+	n = nr > max_nr ? max_nr : nr;
+	sz = sizeof(struct perf_record_id_index) + n * sizeof(struct id_index_entry);
+	ev = zalloc(sz);
+	if (!ev)
+		return -ENOMEM;
+
+	ev->id_index.header.type = PERF_RECORD_ID_INDEX;
+	ev->id_index.header.size = sz;
+	ev->id_index.nr = n;
+
+	evlist__for_each_entry(evlist, evsel) {
+		u32 j;
+
+		for (j = 0; j < evsel->core.ids; j++) {
+			struct id_index_entry *e;
+			struct perf_sample_id *sid;
+
+			if (i >= n) {
+				err = process(tool, ev, NULL, machine);
+				if (err)
+					goto out_err;
+				nr -= n;
+				i = 0;
+			}
+
+			e = &ev->id_index.entries[i++];
+
+			e->id = evsel->core.id[j];
+
+			sid = perf_evlist__id2sid(evlist, e->id);
+			if (!sid) {
+				free(ev);
+				return -ENOENT;
+			}
+
+			e->idx = sid->idx;
+			e->cpu = sid->cpu;
+			e->tid = sid->tid;
+		}
+	}
+
+	sz = sizeof(struct perf_record_id_index) + nr * sizeof(struct id_index_entry);
+	ev->id_index.header.size = sz;
+	ev->id_index.nr = nr;
+
+	err = process(tool, ev, NULL, machine);
+out_err:
+	free(ev);
+
+	return err;
+}
+
+int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
+				  struct target *target, struct perf_thread_map *threads,
+				  perf_event__handler_t process, bool data_mmap,
+				  unsigned int nr_threads_synthesize)
+{
+	if (target__has_task(target))
+		return perf_event__synthesize_thread_map(tool, threads, process, machine, data_mmap);
+	else if (target__has_cpu(target))
+		return perf_event__synthesize_threads(tool, process,
+						      machine, data_mmap,
+						      nr_threads_synthesize);
+	/* command specified */
+	return 0;
+}
+
+int machine__synthesize_threads(struct machine *machine, struct target *target,
+				struct perf_thread_map *threads, bool data_mmap,
+				unsigned int nr_threads_synthesize)
+{
+	return __machine__synthesize_threads(machine, NULL, target, threads,
+					     perf_event__process, data_mmap,
+					     nr_threads_synthesize);
+}
+
+static struct perf_record_event_update *event_update_event__new(size_t size, u64 type, u64 id)
+{
+	struct perf_record_event_update *ev;
+
+	size += sizeof(*ev);
+	size  = PERF_ALIGN(size, sizeof(u64));
+
+	ev = zalloc(size);
+	if (ev) {
+		ev->header.type = PERF_RECORD_EVENT_UPDATE;
+		ev->header.size = (u16)size;
+		ev->type	= type;
+		ev->id		= id;
+	}
+	return ev;
+}
+
+int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct evsel *evsel,
+					     perf_event__handler_t process)
+{
+	size_t size = strlen(evsel->unit);
+	struct perf_record_event_update *ev;
+	int err;
+
+	ev = event_update_event__new(size + 1, PERF_EVENT_UPDATE__UNIT, evsel->core.id[0]);
+	if (ev == NULL)
+		return -ENOMEM;
+
+	strlcpy(ev->data, evsel->unit, size + 1);
+	err = process(tool, (union perf_event *)ev, NULL, NULL);
+	free(ev);
+	return err;
+}
+
+int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct evsel *evsel,
+					      perf_event__handler_t process)
+{
+	struct perf_record_event_update *ev;
+	struct perf_record_event_update_scale *ev_data;
+	int err;
+
+	ev = event_update_event__new(sizeof(*ev_data), PERF_EVENT_UPDATE__SCALE, evsel->core.id[0]);
+	if (ev == NULL)
+		return -ENOMEM;
+
+	ev_data = (struct perf_record_event_update_scale *)ev->data;
+	ev_data->scale = evsel->scale;
+	err = process(tool, (union perf_event *)ev, NULL, NULL);
+	free(ev);
+	return err;
+}
+
+int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evsel *evsel,
+					     perf_event__handler_t process)
+{
+	struct perf_record_event_update *ev;
+	size_t len = strlen(evsel->name);
+	int err;
+
+	ev = event_update_event__new(len + 1, PERF_EVENT_UPDATE__NAME, evsel->core.id[0]);
+	if (ev == NULL)
+		return -ENOMEM;
+
+	strlcpy(ev->data, evsel->name, len + 1);
+	err = process(tool, (union perf_event *)ev, NULL, NULL);
+	free(ev);
+	return err;
+}
+
+int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evsel *evsel,
+					     perf_event__handler_t process)
+{
+	size_t size = sizeof(struct perf_record_event_update);
+	struct perf_record_event_update *ev;
+	int max, err;
+	u16 type;
+
+	if (!evsel->core.own_cpus)
+		return 0;
+
+	ev = cpu_map_data__alloc(evsel->core.own_cpus, &size, &type, &max);
+	if (!ev)
+		return -ENOMEM;
+
+	ev->header.type = PERF_RECORD_EVENT_UPDATE;
+	ev->header.size = (u16)size;
+	ev->type	= PERF_EVENT_UPDATE__CPUS;
+	ev->id		= evsel->core.id[0];
+
+	cpu_map_data__synthesize((struct perf_record_cpu_map_data *)ev->data,
+				 evsel->core.own_cpus, type, max);
+
+	err = process(tool, (union perf_event *)ev, NULL, NULL);
+	free(ev);
+	return err;
+}
+
+int perf_event__synthesize_attrs(struct perf_tool *tool, struct evlist *evlist,
+				 perf_event__handler_t process)
+{
+	struct evsel *evsel;
+	int err = 0;
+
+	evlist__for_each_entry(evlist, evsel) {
+		err = perf_event__synthesize_attr(tool, &evsel->core.attr, evsel->core.ids,
+						  evsel->core.id, process);
+		if (err) {
+			pr_debug("failed to create perf header attribute\n");
+			return err;
+		}
+	}
+
+	return err;
+}
+
+static bool has_unit(struct evsel *evsel)
+{
+	return evsel->unit && *evsel->unit;
+}
+
+static bool has_scale(struct evsel *evsel)
+{
+	return evsel->scale != 1;
+}
+
+int perf_event__synthesize_extra_attr(struct perf_tool *tool, struct evlist *evsel_list,
+				      perf_event__handler_t process, bool is_pipe)
+{
+	struct evsel *evsel;
+	int err;
+
+	/*
+	 * Synthesize other event details not carried within
+	 * the attr event: unit, scale, name.
+	 */
+	evlist__for_each_entry(evsel_list, evsel) {
+		if (!evsel->supported)
+			continue;
+
+		/*
+		 * Synthesize unit and scale only if they are defined.
+		 */
+		if (has_unit(evsel)) {
+			err = perf_event__synthesize_event_update_unit(tool, evsel, process);
+			if (err < 0) {
+				pr_err("Couldn't synthesize evsel unit.\n");
+				return err;
+			}
+		}
+
+		if (has_scale(evsel)) {
+			err = perf_event__synthesize_event_update_scale(tool, evsel, process);
+			if (err < 0) {
+				pr_err("Couldn't synthesize evsel scale.\n");
+				return err;
+			}
+		}
+
+		if (evsel->core.own_cpus) {
+			err = perf_event__synthesize_event_update_cpus(tool, evsel, process);
+			if (err < 0) {
+				pr_err("Couldn't synthesize evsel cpus.\n");
+				return err;
+			}
+		}
+
+		/*
+		 * Name is needed only for pipe output,
+		 * perf.data carries event names.
+		 */
+		if (is_pipe) {
+			err = perf_event__synthesize_event_update_name(tool, evsel, process);
+			if (err < 0) {
+				pr_err("Couldn't synthesize evsel name.\n");
+				return err;
+			}
+		}
+	}
+	return 0;
+}
+
+int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr *attr,
+				u32 ids, u64 *id, perf_event__handler_t process)
+{
+	union perf_event *ev;
+	size_t size;
+	int err;
+
+	size = sizeof(struct perf_event_attr);
+	size = PERF_ALIGN(size, sizeof(u64));
+	size += sizeof(struct perf_event_header);
+	size += ids * sizeof(u64);
+
+	ev = zalloc(size);
+
+	if (ev == NULL)
+		return -ENOMEM;
+
+	ev->attr.attr = *attr;
+	memcpy(ev->attr.id, id, ids * sizeof(u64));
+
+	ev->attr.header.type = PERF_RECORD_HEADER_ATTR;
+	ev->attr.header.size = (u16)size;
+
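+	/* A header.size that no longer equals size means the u16 cast truncated it. */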
+	if (ev->attr.header.size == size)
+		err = process(tool, ev, NULL, NULL);
+	else
+		err = -E2BIG;
+
+	free(ev);
+
+	return err;
+}
+
+int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct evlist *evlist,
+					perf_event__handler_t process)
+{
+	union perf_event ev;
+	struct tracing_data *tdata;
+	ssize_t size = 0, aligned_size = 0, padding;
+	struct feat_fd ff;
+
+	/*
+	 * We are going to store the size of the data followed
+	 * by the data contents. Since the output fd is a pipe,
+	 * we cannot seek back to store the size of the data once
+	 * we know it. Instead we:
+	 *
+	 * - write the tracing data to the temp file
+	 * - get/write the data size to pipe
+	 * - write the tracing data from the temp file
+	 *   to the pipe
+	 */
+	tdata = tracing_data_get(&evlist->core.entries, fd, true);
+	if (!tdata)
+		return -1;
+
+	memset(&ev, 0, sizeof(ev));
+
+	ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA;
+	size = tdata->size;
+	aligned_size = PERF_ALIGN(size, sizeof(u64));
+	padding = aligned_size - size;
+	ev.tracing_data.header.size = sizeof(ev.tracing_data);
+	ev.tracing_data.size = aligned_size;
+
+	process(tool, &ev, NULL, NULL);
+
+	/*
+	 * The put function will copy all the tracing data
+	 * stored in temp file to the pipe.
+	 */
+	tracing_data_put(tdata);
+
+	ff = (struct feat_fd){ .fd = fd };
+	if (write_padded(&ff, NULL, 0, padding))
+		return -1;
+
+	return aligned_size;
+}
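+
+/*
+ * Resulting pipe layout (illustrative):
+ *
+ *   [ tracing_data event header | size ] [ tracing data ... ] [ padding ]
+ *
+ * where the padding brings the payload up to aligned_size (u64 aligned).
+ */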
+
+int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 misc,
+				    perf_event__handler_t process, struct machine *machine)
+{
+	union perf_event ev;
+	size_t len;
+
+	if (!pos->hit)
+		return 0;
+
+	memset(&ev, 0, sizeof(ev));
+
+	len = pos->long_name_len + 1;
+	len = PERF_ALIGN(len, NAME_ALIGN);
+	memcpy(&ev.build_id.build_id, pos->build_id, sizeof(pos->build_id));
+	ev.build_id.header.type = PERF_RECORD_HEADER_BUILD_ID;
+	ev.build_id.header.misc = misc;
+	ev.build_id.pid = machine->pid;
+	ev.build_id.header.size = sizeof(ev.build_id) + len;
+	memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len);
+
+	return process(tool, &ev, NULL, machine);
+}
+
+int perf_event__synthesize_stat_events(struct perf_stat_config *config, struct perf_tool *tool,
+				       struct evlist *evlist, perf_event__handler_t process, bool attrs)
+{
+	int err;
+
+	if (attrs) {
+		err = perf_event__synthesize_attrs(tool, evlist, process);
+		if (err < 0) {
+			pr_err("Couldn't synthesize attrs.\n");
+			return err;
+		}
+	}
+
+	err = perf_event__synthesize_extra_attr(tool, evlist, process, attrs);
+	if (err < 0) {
+		pr_err("Couldn't synthesize extra attrs.\n");
+		return err;
+	}
+
+	err = perf_event__synthesize_thread_map2(tool, evlist->core.threads, process, NULL);
+	if (err < 0) {
+		pr_err("Couldn't synthesize thread map.\n");
+		return err;
+	}
+
+	err = perf_event__synthesize_cpu_map(tool, evlist->core.cpus, process, NULL);
+	if (err < 0) {
+		pr_err("Couldn't synthesize cpu map.\n");
+		return err;
+	}
+
+	err = perf_event__synthesize_stat_config(tool, config, process, NULL);
+	if (err < 0) {
+		pr_err("Couldn't synthesize config.\n");
+		return err;
+	}
+
+	return 0;
+}
+
+int __weak perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
+				       struct perf_tool *tool __maybe_unused,
+				       perf_event__handler_t process __maybe_unused,
+				       struct machine *machine __maybe_unused)
+{
+	return 0;
+}
+
+extern const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE];
+
+int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session *session,
+				    struct evlist *evlist, perf_event__handler_t process)
+{
+	struct perf_header *header = &session->header;
+	struct perf_record_header_feature *fe;
+	struct feat_fd ff;
+	size_t sz, sz_hdr;
+	int feat, ret;
+
+	sz_hdr = sizeof(fe->header);
+	sz = sizeof(union perf_event);
+	/* get a nice alignment */
+	sz = PERF_ALIGN(sz, page_size);
+
+	memset(&ff, 0, sizeof(ff));
+
+	ff.buf = malloc(sz);
+	if (!ff.buf)
+		return -ENOMEM;
+
+	ff.size = sz - sz_hdr;
+	ff.ph = &session->header;
+
+	for_each_set_bit(feat, header->adds_features, HEADER_FEAT_BITS) {
+		if (!feat_ops[feat].synthesize) {
+			pr_debug("No record header feature for header %d\n", feat);
+			continue;
+		}
+
+		ff.offset = sizeof(*fe);
+
+		ret = feat_ops[feat].write(&ff, evlist);
+		if (ret || ff.offset <= (ssize_t)sizeof(*fe)) {
+			pr_debug("Error writing feature\n");
+			continue;
+		}
+		/* ff.buf may have changed due to realloc in do_write() */
+		fe = ff.buf;
+		memset(fe, 0, sizeof(*fe));
+
+		fe->feat_id = feat;
+		fe->header.type = PERF_RECORD_HEADER_FEATURE;
+		fe->header.size = ff.offset;
+
+		ret = process(tool, ff.buf, NULL, NULL);
+		if (ret) {
+			free(ff.buf);
+			return ret;
+		}
+	}
+
+	/* Send HEADER_LAST_FEATURE mark. */
+	fe = ff.buf;
+	fe->feat_id     = HEADER_LAST_FEATURE;
+	fe->header.type = PERF_RECORD_HEADER_FEATURE;
+	fe->header.size = sizeof(*fe);
+
+	ret = process(tool, ff.buf, NULL, NULL);
+
+	free(ff.buf);
+	return ret;
+}
diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h
new file mode 100644
index 0000000..baead0c
--- /dev/null
+++ b/tools/perf/util/synthetic-events.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PERF_SYNTHETIC_EVENTS_H
+#define __PERF_SYNTHETIC_EVENTS_H
+
+#include <stdbool.h>
+#include <sys/types.h> // pid_t
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+struct auxtrace_record;
+struct dso;
+struct evlist;
+struct evsel;
+struct machine;
+struct perf_counts_values;
+struct perf_cpu_map;
+struct perf_event_attr;
+struct perf_event_mmap_page;
+struct perf_sample;
+struct perf_session;
+struct perf_stat_config;
+struct perf_thread_map;
+struct perf_tool;
+struct record_opts;
+struct target;
+
+union perf_event;
+
+typedef int (*perf_event__handler_t)(struct perf_tool *tool, union perf_event *event,
+				     struct perf_sample *sample, struct machine *machine);
+
+int perf_event__synthesize_attrs(struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process);
+int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr *attr, u32 ids, u64 *id, perf_event__handler_t process);
+int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 misc, perf_event__handler_t process, struct machine *machine);
+int perf_event__synthesize_cpu_map(struct perf_tool *tool, struct perf_cpu_map *cpus, perf_event__handler_t process, struct machine *machine);
+int perf_event__synthesize_event_update_cpus(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process);
+int perf_event__synthesize_event_update_name(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process);
+int perf_event__synthesize_event_update_scale(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process);
+int perf_event__synthesize_event_update_unit(struct perf_tool *tool, struct evsel *evsel, perf_event__handler_t process);
+int perf_event__synthesize_extra_attr(struct perf_tool *tool, struct evlist *evsel_list, perf_event__handler_t process, bool is_pipe);
+int perf_event__synthesize_extra_kmaps(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine);
+int perf_event__synthesize_features(struct perf_tool *tool, struct perf_session *session, struct evlist *evlist, perf_event__handler_t process);
+int perf_event__synthesize_id_index(struct perf_tool *tool, perf_event__handler_t process, struct evlist *evlist, struct machine *machine);
+int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine);
+int perf_event__synthesize_mmap_events(struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine, bool mmap_data);
+int perf_event__synthesize_modules(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine);
+int perf_event__synthesize_namespaces(struct perf_tool *tool, union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, struct machine *machine);
+int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_format, const struct perf_sample *sample);
+int perf_event__synthesize_stat_config(struct perf_tool *tool, struct perf_stat_config *config, perf_event__handler_t process, struct machine *machine);
+int perf_event__synthesize_stat_events(struct perf_stat_config *config, struct perf_tool *tool, struct evlist *evlist, perf_event__handler_t process, bool attrs);
+int perf_event__synthesize_stat_round(struct perf_tool *tool, u64 time, u64 type, perf_event__handler_t process, struct machine *machine);
+int perf_event__synthesize_stat(struct perf_tool *tool, u32 cpu, u32 thread, u64 id, struct perf_counts_values *count, perf_event__handler_t process, struct machine *machine);
+int perf_event__synthesize_thread_map2(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine);
+int perf_event__synthesize_thread_map(struct perf_tool *tool, struct perf_thread_map *threads, perf_event__handler_t process, struct machine *machine, bool mmap_data);
+int perf_event__synthesize_threads(struct perf_tool *tool, perf_event__handler_t process, struct machine *machine, bool mmap_data, unsigned int nr_threads_synthesize);
+int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct evlist *evlist, perf_event__handler_t process);
+int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, struct perf_tool *tool, perf_event__handler_t process, struct machine *machine);
+pid_t perf_event__synthesize_comm(struct perf_tool *tool, union perf_event *event, pid_t pid, perf_event__handler_t process, struct machine *machine);
+
+int perf_tool__process_synth_event(struct perf_tool *tool, union perf_event *event, struct machine *machine, perf_event__handler_t process);
+
+size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, u64 read_format);
+
+int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
+				  struct target *target, struct perf_thread_map *threads,
+				  perf_event__handler_t process, bool data_mmap,
+				  unsigned int nr_threads_synthesize);
+int machine__synthesize_threads(struct machine *machine, struct target *target,
+				struct perf_thread_map *threads, bool data_mmap,
+				unsigned int nr_threads_synthesize);
+
+#ifdef HAVE_AUXTRACE_SUPPORT
+int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, struct perf_tool *tool,
+					 struct perf_session *session, perf_event__handler_t process);
+
+#else // HAVE_AUXTRACE_SUPPORT
+
+#include <errno.h>
+
+static inline int
+perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr __maybe_unused,
+				     struct perf_tool *tool __maybe_unused,
+				     struct perf_session *session __maybe_unused,
+				     perf_event__handler_t process __maybe_unused)
+{
+	return -EINVAL;
+}
+#endif // HAVE_AUXTRACE_SUPPORT
+
+#ifdef HAVE_LIBBPF_SUPPORT
+int perf_event__synthesize_bpf_events(struct perf_session *session, perf_event__handler_t process,
+				      struct machine *machine, struct record_opts *opts);
+#else // HAVE_LIBBPF_SUPPORT
+static inline int perf_event__synthesize_bpf_events(struct perf_session *session __maybe_unused,
+						    perf_event__handler_t process __maybe_unused,
+						    struct machine *machine __maybe_unused,
+						    struct record_opts *opts __maybe_unused)
+{
+	return 0;
+}
+#endif // HAVE_LIBBPF_SUPPORT
+
+#endif // __PERF_SYNTHETIC_EVENTS_H
diff --git a/tools/perf/util/target.c b/tools/perf/util/target.c
index 565f7ae..a3db13d 100644
--- a/tools/perf/util/target.c
+++ b/tools/perf/util/target.c
@@ -6,8 +6,6 @@
  */
 
 #include "target.h"
-#include "util.h"
-#include "debug.h"
 
 #include <pwd.h>
 #include <stdio.h>
diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c
index 51fb574..3dce2de 100644
--- a/tools/perf/util/top.c
+++ b/tools/perf/util/top.c
@@ -5,7 +5,6 @@
  * Refactored from builtin-top.c, see that file for further copyright notes.
  */
 
-#include "cpumap.h"
 #include "event.h"
 #include "evlist.h"
 #include "evsel.h"
@@ -72,7 +71,7 @@
 	}
 
 	if (top->evlist->core.nr_entries == 1) {
-		struct evsel *first = perf_evlist__first(top->evlist);
+		struct evsel *first = evlist__first(top->evlist);
 		ret += SNPRINTF(bf + ret, size - ret, "%" PRIu64 "%s ",
 				(uint64_t)first->core.attr.sample_period,
 				opts->freq ? "Hz" : "");
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index d63d542..086e98f 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -2,7 +2,6 @@
 /*
  * Copyright (C) 2008,2009, Steven Rostedt <srostedt@redhat.com>
  */
-#include "util.h"
 #include <dirent.h>
 #include <mntent.h>
 #include <stdio.h>
@@ -19,6 +18,7 @@
 #include <linux/list.h>
 #include <linux/kernel.h>
 #include <linux/zalloc.h>
+#include <internal/lib.h> // page_size
 
 #include "trace-event.h"
 #include <api/fs/tracing_path.h>
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index b6c0db0..8593d3c 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -15,7 +15,6 @@
 #include <unistd.h>
 #include <errno.h>
 
-#include "util.h"
 #include "trace-event.h"
 #include "debug.h"
 
diff --git a/tools/perf/util/trace-event.c b/tools/perf/util/trace-event.c
index 01b9d89..b3ee651 100644
--- a/tools/perf/util/trace-event.c
+++ b/tools/perf/util/trace-event.c
@@ -14,7 +14,6 @@
 #include <api/fs/fs.h>
 #include "trace-event.h"
 #include "machine.h"
-#include "util.h"
 
 /*
  * global trace_event object used by trace_event__tp_format
diff --git a/tools/perf/util/tsc.h b/tools/perf/util/tsc.h
index e0c3af3..3c5a632 100644
--- a/tools/perf/util/tsc.h
+++ b/tools/perf/util/tsc.h
@@ -4,13 +4,12 @@
 
 #include <linux/types.h>
 
-#include "event.h"
-
 struct perf_tsc_conversion {
 	u16 time_shift;
 	u32 time_mult;
 	u64 time_zero;
 };
+
 struct perf_event_mmap_page;
 
 int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc,
@@ -20,13 +19,4 @@
 u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc);
 u64 rdtsc(void);
 
-struct perf_event_mmap_page;
-struct perf_tool;
-struct machine;
-
-int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc,
-				struct perf_tool *tool,
-				perf_event__handler_t process,
-				struct machine *machine);
-
-#endif
+#endif // __PERF_TSC_H
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index 9ece188..15f6e46 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -17,7 +17,6 @@
 #include "event.h"
 #include "perf_regs.h"
 #include "callchain.h"
-#include "util.h"
 
 static char *debuginfo_path;
 
diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
index ebdbb05..1800887 100644
--- a/tools/perf/util/unwind-libunwind-local.c
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -37,7 +37,6 @@
 #include "unwind.h"
 #include "map.h"
 #include "symbol.h"
-#include "util.h"
 #include "debug.h"
 #include "asm/bug.h"
 #include "dso.h"
diff --git a/tools/perf/util/usage.c b/tools/perf/util/usage.c
index 3949a60..196438e 100644
--- a/tools/perf/util/usage.c
+++ b/tools/perf/util/usage.c
@@ -8,7 +8,6 @@
  * Copyright (C) Linus Torvalds, 2005
  */
 #include "util.h"
-#include "debug.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <linux/compiler.h>
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 32322a2..5eda6e1 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -2,9 +2,7 @@
 #include "util.h"
 #include "debug.h"
 #include "event.h"
-#include "namespaces.h"
 #include <api/fs/fs.h>
-#include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/utsname.h>
 #include <dirent.h>
@@ -41,8 +39,6 @@
 	perf_singlethreaded = false;
 }
 
-unsigned int page_size;
-
 int sysctl_perf_event_max_stack = PERF_MAX_STACK_DEPTH;
 int sysctl_perf_event_max_contexts_per_stack = PERF_MAX_CONTEXTS_PER_STACK;
 
@@ -234,138 +230,6 @@
 	return list;
 }
 
-static int slow_copyfile(const char *from, const char *to, struct nsinfo *nsi)
-{
-	int err = -1;
-	char *line = NULL;
-	size_t n;
-	FILE *from_fp, *to_fp;
-	struct nscookie nsc;
-
-	nsinfo__mountns_enter(nsi, &nsc);
-	from_fp = fopen(from, "r");
-	nsinfo__mountns_exit(&nsc);
-	if (from_fp == NULL)
-		goto out;
-
-	to_fp = fopen(to, "w");
-	if (to_fp == NULL)
-		goto out_fclose_from;
-
-	while (getline(&line, &n, from_fp) > 0)
-		if (fputs(line, to_fp) == EOF)
-			goto out_fclose_to;
-	err = 0;
-out_fclose_to:
-	fclose(to_fp);
-	free(line);
-out_fclose_from:
-	fclose(from_fp);
-out:
-	return err;
-}
-
-int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size)
-{
-	void *ptr;
-	loff_t pgoff;
-
-	pgoff = off_in & ~(page_size - 1);
-	off_in -= pgoff;
-
-	ptr = mmap(NULL, off_in + size, PROT_READ, MAP_PRIVATE, ifd, pgoff);
-	if (ptr == MAP_FAILED)
-		return -1;
-
-	while (size) {
-		ssize_t ret = pwrite(ofd, ptr + off_in, size, off_out);
-		if (ret < 0 && errno == EINTR)
-			continue;
-		if (ret <= 0)
-			break;
-
-		size -= ret;
-		off_in += ret;
-		off_out += ret;
-	}
-	munmap(ptr, off_in + size);
-
-	return size ? -1 : 0;
-}
-
-static int copyfile_mode_ns(const char *from, const char *to, mode_t mode,
-			    struct nsinfo *nsi)
-{
-	int fromfd, tofd;
-	struct stat st;
-	int err;
-	char *tmp = NULL, *ptr = NULL;
-	struct nscookie nsc;
-
-	nsinfo__mountns_enter(nsi, &nsc);
-	err = stat(from, &st);
-	nsinfo__mountns_exit(&nsc);
-	if (err)
-		goto out;
-	err = -1;
-
-	/* extra 'x' at the end is to reserve space for '.' */
-	if (asprintf(&tmp, "%s.XXXXXXx", to) < 0) {
-		tmp = NULL;
-		goto out;
-	}
-	ptr = strrchr(tmp, '/');
-	if (!ptr)
-		goto out;
-	ptr = memmove(ptr + 1, ptr, strlen(ptr) - 1);
-	*ptr = '.';
-
-	tofd = mkstemp(tmp);
-	if (tofd < 0)
-		goto out;
-
-	if (fchmod(tofd, mode))
-		goto out_close_to;
-
-	if (st.st_size == 0) { /* /proc? do it slowly... */
-		err = slow_copyfile(from, tmp, nsi);
-		goto out_close_to;
-	}
-
-	nsinfo__mountns_enter(nsi, &nsc);
-	fromfd = open(from, O_RDONLY);
-	nsinfo__mountns_exit(&nsc);
-	if (fromfd < 0)
-		goto out_close_to;
-
-	err = copyfile_offset(fromfd, 0, tofd, 0, st.st_size);
-
-	close(fromfd);
-out_close_to:
-	close(tofd);
-	if (!err)
-		err = link(tmp, to);
-	unlink(tmp);
-out:
-	free(tmp);
-	return err;
-}
-
-int copyfile_ns(const char *from, const char *to, struct nsinfo *nsi)
-{
-	return copyfile_mode_ns(from, to, 0755, nsi);
-}
-
-int copyfile_mode(const char *from, const char *to, mode_t mode)
-{
-	return copyfile_mode_ns(from, to, mode, NULL);
-}
-
-int copyfile(const char *from, const char *to)
-{
-	return copyfile_mode(from, to, 0755);
-}
-
 size_t hex_width(u64 v)
 {
 	size_t n = 1;
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 45a5c6f2..9969b8b 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -11,14 +11,12 @@
 #include <stddef.h>
 #include <linux/compiler.h>
 #include <sys/types.h>
-#include <internal/lib.h>
 
 /* General helper functions */
 void usage(const char *err) __noreturn;
 void die(const char *err, ...) __noreturn __printf(1, 2);
 
 struct dirent;
-struct nsinfo;
 struct strlist;
 
 int mkdir_p(char *path, mode_t mode);
@@ -26,15 +24,9 @@
 int rm_rf_perf_data(const char *path);
 struct strlist *lsdir(const char *name, bool (*filter)(const char *, struct dirent *));
 bool lsdir_no_dot_filter(const char *name, struct dirent *d);
-int copyfile(const char *from, const char *to);
-int copyfile_mode(const char *from, const char *to, mode_t mode);
-int copyfile_ns(const char *from, const char *to, struct nsinfo *nsi);
-int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size);
 
 size_t hex_width(u64 v);
 
-extern unsigned int page_size;
-
 int sysctl__max_stack(void);
 
 int fetch_kernel_version(unsigned int *puint,
diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c
index e5e6599..ba4b439 100644
--- a/tools/perf/util/vdso.c
+++ b/tools/perf/util/vdso.c
@@ -11,7 +11,7 @@
 
 #include "vdso.h"
 #include "dso.h"
-#include "util.h"
+#include <internal/lib.h>
 #include "map.h"
 #include "symbol.h"
 #include "machine.h"
diff --git a/tools/perf/util/zlib.c b/tools/perf/util/zlib.c
index 59d456f..78d2297 100644
--- a/tools/perf/util/zlib.c
+++ b/tools/perf/util/zlib.c
@@ -7,11 +7,9 @@
 #include <sys/mman.h>
 #include <zlib.h>
 #include <linux/compiler.h>
+#include <internal/lib.h>
 
 #include "util/compress.h"
-#include "util/util.h"
-#include "util/debug.h"
-
 
 #define CHUNK_SIZE  16384
 
diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c
index 59753b3..2a9890c 100644
--- a/tools/power/x86/intel-speed-select/isst-config.c
+++ b/tools/power/x86/intel-speed-select/isst-config.c
@@ -38,6 +38,7 @@
 static unsigned long long fact_trl;
 static int out_format_json;
 static int cmd_help;
+static int force_online_offline;
 
 /* clos related */
 static int current_clos = -1;
@@ -138,14 +139,14 @@
 int get_physical_package_id(int cpu)
 {
 	return parse_int_file(
-		1, "/sys/devices/system/cpu/cpu%d/topology/physical_package_id",
+		0, "/sys/devices/system/cpu/cpu%d/topology/physical_package_id",
 		cpu);
 }
 
 int get_physical_core_id(int cpu)
 {
 	return parse_int_file(
-		1, "/sys/devices/system/cpu/cpu%d/topology/core_id", cpu);
+		0, "/sys/devices/system/cpu/cpu%d/topology/core_id", cpu);
 }
 
 int get_physical_die_id(int cpu)
@@ -165,6 +166,26 @@
 	return topo_max_cpus;
 }
 
+static void set_cpu_online_offline(int cpu, int state)
+{
+	char buffer[128];
+	int fd;
+
+	snprintf(buffer, sizeof(buffer),
+		 "/sys/devices/system/cpu/cpu%d/online", cpu);
+
+	fd = open(buffer, O_WRONLY);
+	if (fd < 0)
+		err(-1, "%s open failed", buffer);
+
+	if (state)
+		write(fd, "1\n", 2);
+	else
+		write(fd, "0\n", 2);
+
+	close(fd);
+}
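+
+/*
+ * Example (illustrative): set_cpu_online_offline(5, 0) writes "0\n" to
+ * /sys/devices/system/cpu/cpu5/online, taking CPU 5 offline.
+ */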
+
 #define MAX_PACKAGE_COUNT 8
 #define MAX_DIE_PER_PACKAGE 2
 static void for_each_online_package_in_set(void (*callback)(int, void *, void *,
@@ -402,6 +423,9 @@
 			int j;
 
 			for (j = 0; j < topo_max_cpus; ++j) {
+				if (!CPU_ISSET_S(j, present_cpumask_size, present_cpumask))
+					continue;
+
 				if (cpu_map[j].pkg_id == pkg_id &&
 				    cpu_map[j].die_id == die_id &&
 				    cpu_map[j].punit_cpu_core == i) {
@@ -484,7 +508,7 @@
 		int write = 0;
 		int clos_id, core_id, ret = 0;
 
-		debug_printf("CLOS %d\n", cpu);
+		debug_printf("CPU %d\n", cpu);
 
 		if (parameter & BIT(MBOX_CMD_WRITE_BIT)) {
 			value = req_data;
@@ -649,8 +673,8 @@
 	if (ret)
 		perror("get_tdp_*");
 	else
-		isst_display_result(cpu, outf, "perf-profile", (char *)arg3,
-				    *(unsigned int *)arg4);
+		isst_ctdp_display_core_info(cpu, outf, arg3,
+					    *(unsigned int *)arg4);
 }
 
 #define _get_tdp_level(desc, suffix, object, help)                                \
@@ -733,9 +757,34 @@
 	ret = isst_set_tdp_level(cpu, tdp_level);
 	if (ret)
 		perror("set_tdp_level_for_cpu");
-	else
+	else {
 		isst_display_result(cpu, outf, "perf-profile", "set_tdp_level",
 				    ret);
+		if (force_online_offline) {
+			struct isst_pkg_ctdp_level_info ctdp_level;
+			int pkg_id = get_physical_package_id(cpu);
+			int die_id = get_physical_die_id(cpu);
+
+			fprintf(stderr, "Option is set to online/offline\n");
+			ctdp_level.core_cpumask_size =
+				alloc_cpu_set(&ctdp_level.core_cpumask);
+			isst_get_coremask_info(cpu, tdp_level, &ctdp_level);
+			if (ctdp_level.cpu_count) {
+				int i, max_cpus = get_topo_max_cpus();
+				for (i = 0; i < max_cpus; ++i) {
+					if (pkg_id != get_physical_package_id(i) || die_id != get_physical_die_id(i))
+						continue;
+					if (CPU_ISSET_S(i, ctdp_level.core_cpumask_size, ctdp_level.core_cpumask)) {
+						fprintf(stderr, "online cpu %d\n", i);
+						set_cpu_online_offline(i, 1);
+					} else {
+						fprintf(stderr, "offline cpu %d\n", i);
+						set_cpu_online_offline(i, 0);
+					}
+				}
+			}
+		}
+	}
 }
 
 static void set_tdp_level(void)
@@ -744,6 +793,8 @@
 		fprintf(stderr, "Set Config TDP level\n");
 		fprintf(stderr,
 			"\t Arguments: -l|--level : Specify tdp level\n");
+		fprintf(stderr,
+			"\t Optional Arguments: -o|--online : online/offline CPUs for the tdp level\n");
 		exit(0);
 	}
 
@@ -1082,6 +1133,40 @@
 	isst_ctdp_display_information_end(outf);
 }
 
+static void get_clos_info_for_cpu(int cpu, void *arg1, void *arg2, void *arg3,
+				  void *arg4)
+{
+	int enable, ret, prio_type;
+
+	ret = isst_clos_get_clos_information(cpu, &enable, &prio_type);
+	if (ret)
+		perror("isst_clos_get_info");
+	else
+		isst_clos_display_clos_information(cpu, outf, enable, prio_type);
+}
+
+static void dump_clos_info(void)
+{
+	if (cmd_help) {
+		fprintf(stderr,
+			"Print Intel Speed Select Technology core power information\n");
+		fprintf(stderr, "\tSpecify targeted cpu id with [--cpu|-c]\n");
+		exit(0);
+	}
+
+	if (!max_target_cpus) {
+		fprintf(stderr,
+			"Invalid target cpu. Specify with [-c|--cpu]\n");
+		exit(0);
+	}
+
+	isst_ctdp_display_information_start(outf);
+	for_each_online_target_cpu_in_set(get_clos_info_for_cpu, NULL,
+					  NULL, NULL, NULL);
+	isst_ctdp_display_information_end(outf);
+}
+
 static void set_clos_config_for_cpu(int cpu, void *arg1, void *arg2, void *arg3,
 				    void *arg4)
 {
@@ -1198,7 +1283,7 @@
 	if (ret)
 		perror("isst_clos_get_assoc_status");
 	else
-		isst_display_result(cpu, outf, "core-power", "get-assoc", clos);
+		isst_clos_display_assoc_information(cpu, outf, clos);
 }
 
 static void get_clos_assoc(void)
@@ -1208,13 +1293,17 @@
 		fprintf(stderr, "\tSpecify targeted cpu id with [--cpu|-c]\n");
 		exit(0);
 	}
-	if (max_target_cpus)
-		for_each_online_target_cpu_in_set(get_clos_assoc_for_cpu, NULL,
-						  NULL, NULL, NULL);
-	else {
+
+	if (!max_target_cpus) {
 		fprintf(stderr,
 			"Invalid target cpu. Specify with [-c|--cpu]\n");
+		exit(0);
 	}
+
+	isst_ctdp_display_information_start(outf);
+	for_each_online_target_cpu_in_set(get_clos_assoc_for_cpu, NULL,
+					  NULL, NULL, NULL);
+	isst_ctdp_display_information_end(outf);
 }
 
 static struct process_cmd_struct isst_cmds[] = {
@@ -1231,10 +1320,11 @@
 	{ "turbo-freq", "info", dump_fact_config },
 	{ "turbo-freq", "enable", set_fact_enable },
 	{ "turbo-freq", "disable", set_fact_disable },
-	{ "core-power", "info", dump_clos_config },
+	{ "core-power", "info", dump_clos_info },
 	{ "core-power", "enable", set_clos_enable },
 	{ "core-power", "disable", set_clos_disable },
 	{ "core-power", "config", set_clos_config },
+	{ "core-power", "get-config", dump_clos_config },
 	{ "core-power", "assoc", set_clos_assoc },
 	{ "core-power", "get-assoc", get_clos_assoc },
 	{ NULL, NULL, NULL }
@@ -1316,6 +1406,7 @@
 	static struct option long_options[] = {
 		{ "bucket", required_argument, 0, 'b' },
 		{ "level", required_argument, 0, 'l' },
+		{ "online", no_argument, 0, 'o' },
 		{ "trl-type", required_argument, 0, 'r' },
 		{ "trl", required_argument, 0, 't' },
 		{ "help", no_argument, 0, 'h' },
@@ -1332,7 +1423,7 @@
 	option_index = start;
 
 	optind = start + 1;
-	while ((opt = getopt_long(argc, argv, "b:l:t:c:d:e:n:m:p:w:h",
+	while ((opt = getopt_long(argc, argv, "b:l:t:c:d:e:n:m:p:w:ho",
 				  long_options, &option_index)) != -1) {
 		switch (opt) {
 		case 'b':
@@ -1344,6 +1435,9 @@
 		case 'l':
 			tdp_level = atoi(optarg);
 			break;
+		case 'o':
+			force_online_offline = 1;
+			break;
 		case 't':
 			sscanf(optarg, "0x%llx", &fact_trl);
 			break;
@@ -1362,7 +1456,6 @@
 		/* CLOS related */
 		case 'c':
 			current_clos = atoi(optarg);
-			printf("clos %d\n", current_clos);
 			break;
 		case 'd':
 			clos_desired = atoi(optarg);
@@ -1433,6 +1526,7 @@
 	printf("\tenable\n");
 	printf("\tdisable\n");
 	printf("\tconfig\n");
+	printf("\tget-config\n");
 	printf("\tassoc\n");
 	printf("\tget-assoc\n");
 }
diff --git a/tools/power/x86/intel-speed-select/isst-core.c b/tools/power/x86/intel-speed-select/isst-core.c
index 0bf341a..6dee533 100644
--- a/tools/power/x86/intel-speed-select/isst-core.c
+++ b/tools/power/x86/intel-speed-select/isst-core.c
@@ -619,6 +619,31 @@
 	return 0;
 }
 
+int isst_clos_get_clos_information(int cpu, int *enable, int *type)
+{
+	unsigned int resp;
+	int ret;
+
+	ret = isst_send_mbox_command(cpu, CONFIG_CLOS, CLOS_PM_QOS_CONFIG, 0, 0,
+				     &resp);
+	if (ret)
+		return ret;
+
+	debug_printf("cpu:%d CLOS_PM_QOS_CONFIG resp:%x\n", cpu, resp);
+
+	if (resp & BIT(1))
+		*enable = 1;
+	else
+		*enable = 0;
+
+	if (resp & BIT(2))
+		*type = 1;
+	else
+		*type = 0;
+
+	return 0;
+}
+
 int isst_pm_qos_config(int cpu, int enable_clos, int priority_type)
 {
 	unsigned int req, resp;
diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c
index df4aa99..40346d5 100644
--- a/tools/power/x86/intel-speed-select/isst-display.c
+++ b/tools/power/x86/intel-speed-select/isst-display.c
@@ -287,6 +287,26 @@
 	format_and_print(outf, base_level + 2, header, value);
 }
 
+void isst_ctdp_display_core_info(int cpu, FILE *outf, char *prefix,
+				 unsigned int val)
+{
+	char header[256];
+	char value[256];
+
+	snprintf(header, sizeof(header), "package-%d",
+		 get_physical_package_id(cpu));
+	format_and_print(outf, 1, header, NULL);
+	snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu));
+	format_and_print(outf, 2, header, NULL);
+	snprintf(header, sizeof(header), "cpu-%d", cpu);
+	format_and_print(outf, 3, header, NULL);
+
+	snprintf(value, sizeof(value), "%u", val);
+	format_and_print(outf, 4, prefix, value);
+
+	format_and_print(outf, 1, NULL, NULL);
+}
+
 void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level,
 				   struct isst_pkg_ctdp *pkg_dev)
 {
@@ -503,6 +523,57 @@
 	format_and_print(outf, 1, NULL, NULL);
 }
 
+void isst_clos_display_clos_information(int cpu, FILE *outf,
+					int clos_enable, int type)
+{
+	char header[256];
+	char value[256];
+
+	snprintf(header, sizeof(header), "package-%d",
+		 get_physical_package_id(cpu));
+	format_and_print(outf, 1, header, NULL);
+	snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu));
+	format_and_print(outf, 2, header, NULL);
+	snprintf(header, sizeof(header), "cpu-%d", cpu);
+	format_and_print(outf, 3, header, NULL);
+
+	snprintf(header, sizeof(header), "core-power");
+	format_and_print(outf, 4, header, NULL);
+
+	snprintf(header, sizeof(header), "enable-status");
+	snprintf(value, sizeof(value), "%d", clos_enable);
+	format_and_print(outf, 5, header, value);
+
+	snprintf(header, sizeof(header), "priority-type");
+	snprintf(value, sizeof(value), "%d", type);
+	format_and_print(outf, 5, header, value);
+
+	format_and_print(outf, 1, NULL, NULL);
+}
+
+void isst_clos_display_assoc_information(int cpu, FILE *outf, int clos)
+{
+	char header[256];
+	char value[256];
+
+	snprintf(header, sizeof(header), "package-%d",
+		 get_physical_package_id(cpu));
+	format_and_print(outf, 1, header, NULL);
+	snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu));
+	format_and_print(outf, 2, header, NULL);
+	snprintf(header, sizeof(header), "cpu-%d", cpu);
+	format_and_print(outf, 3, header, NULL);
+
+	snprintf(header, sizeof(header), "get-assoc");
+	format_and_print(outf, 4, header, NULL);
+
+	snprintf(header, sizeof(header), "clos");
+	snprintf(value, sizeof(value), "%d", clos);
+	format_and_print(outf, 5, header, value);
+
+	format_and_print(outf, 1, NULL, NULL);
+}
+
 void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd,
 			 int result)
 {
diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h
index 2f7f627..d280b27 100644
--- a/tools/power/x86/intel-speed-select/isst.h
+++ b/tools/power/x86/intel-speed-select/isst.h
@@ -187,12 +187,16 @@
 				 int write, unsigned long long *req_resp);
 
 extern int isst_get_ctdp_levels(int cpu, struct isst_pkg_ctdp *pkg_dev);
+extern int isst_get_coremask_info(int cpu, int config_index,
+			   struct isst_pkg_ctdp_level_info *ctdp_level);
 extern int isst_get_process_ctdp(int cpu, int tdp_level,
 				 struct isst_pkg_ctdp *pkg_dev);
 extern void isst_get_process_ctdp_complete(int cpu,
 					   struct isst_pkg_ctdp *pkg_dev);
 extern void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level,
 					  struct isst_pkg_ctdp *pkg_dev);
+extern void isst_ctdp_display_core_info(int cpu, FILE *outf, char *prefix,
+					unsigned int val);
 extern void isst_ctdp_display_information_start(FILE *outf);
 extern void isst_ctdp_display_information_end(FILE *outf);
 extern void isst_pbf_display_information(int cpu, FILE *outf, int level,
@@ -223,10 +227,14 @@
 extern int isst_clos_get_assoc_status(int cpu, int *clos_id);
 extern void isst_clos_display_information(int cpu, FILE *outf, int clos,
 					  struct isst_clos_config *clos_config);
-
+extern void isst_clos_display_assoc_information(int cpu, FILE *outf, int clos);
 extern int isst_read_reg(unsigned short reg, unsigned int *val);
 extern int isst_write_reg(int reg, unsigned int val);
 
 extern void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd,
 				int result);
+
+extern int isst_clos_get_clos_information(int cpu, int *enable, int *type);
+extern void isst_clos_display_clos_information(int cpu, FILE *outf,
+					       int clos_enable, int type);
 #endif
diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h
index 448d686..0bf5640 100644
--- a/tools/testing/nvdimm/test/nfit_test.h
+++ b/tools/testing/nvdimm/test/nfit_test.h
@@ -4,6 +4,7 @@
  */
 #ifndef __NFIT_TEST_H__
 #define __NFIT_TEST_H__
+#include <linux/acpi.h>
 #include <linux/list.h>
 #include <linux/uuid.h>
 #include <linux/ioport.h>
@@ -202,9 +203,6 @@
 	__u32 status;
 } __packed;
 
-union acpi_object;
-typedef void *acpi_handle;
-
 typedef struct nfit_test_resource *(*nfit_test_lookup_fn)(resource_size_t);
 typedef union acpi_object *(*nfit_test_evaluate_dsm_fn)(acpi_handle handle,
 		 const guid_t *guid, u64 rev, u64 func,
diff --git a/tools/testing/selftests/.gitignore b/tools/testing/selftests/.gitignore
index 8059ce8..61df01c 100644
--- a/tools/testing/selftests/.gitignore
+++ b/tools/testing/selftests/.gitignore
@@ -2,3 +2,5 @@
 gpiogpio-hammer
 gpioinclude/
 gpiolsgpio
+tpm2/SpaceTest.log
+tpm2/*.pyc
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 25b43a8..c3feccb 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -126,9 +126,9 @@
 # in the default INSTALL_HDR_PATH usr/include.
 khdr:
 ifeq (1,$(DEFAULT_INSTALL_HDR_PATH))
-	make --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install
+	$(MAKE) --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install
 else
-	make --no-builtin-rules INSTALL_HDR_PATH=$$BUILD/usr \
+	$(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$$BUILD/usr \
 		ARCH=$(ARCH) -C $(top_srcdir) headers_install
 endif
 
@@ -136,35 +136,35 @@
 	@for TARGET in $(TARGETS); do		\
 		BUILD_TARGET=$$BUILD/$$TARGET;	\
 		mkdir $$BUILD_TARGET  -p;	\
-		make OUTPUT=$$BUILD_TARGET -C $$TARGET;\
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET;\
 	done;
 
 run_tests: all
 	@for TARGET in $(TARGETS); do \
 		BUILD_TARGET=$$BUILD/$$TARGET;	\
-		make OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests;\
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests;\
 	done;
 
 hotplug:
 	@for TARGET in $(TARGETS_HOTPLUG); do \
 		BUILD_TARGET=$$BUILD/$$TARGET;	\
-		make OUTPUT=$$BUILD_TARGET -C $$TARGET;\
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET;\
 	done;
 
 run_hotplug: hotplug
 	@for TARGET in $(TARGETS_HOTPLUG); do \
 		BUILD_TARGET=$$BUILD/$$TARGET;	\
-		make OUTPUT=$$BUILD_TARGET -C $$TARGET run_full_test;\
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_full_test;\
 	done;
 
 clean_hotplug:
 	@for TARGET in $(TARGETS_HOTPLUG); do \
 		BUILD_TARGET=$$BUILD/$$TARGET;	\
-		make OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\
 	done;
 
 run_pstore_crash:
-	make -C pstore run_crash
+	$(MAKE) -C pstore run_crash
 
 # Use $BUILD as the default install root. $BUILD points to the
 # right output location for the following cases:
@@ -184,7 +184,7 @@
 	install -m 744 kselftest/prefix.pl $(INSTALL_PATH)/kselftest/
 	@for TARGET in $(TARGETS); do \
 		BUILD_TARGET=$$BUILD/$$TARGET;	\
-		make OUTPUT=$$BUILD_TARGET -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install; \
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install; \
 	done;
 
 	@# Ask all targets to emit their test scripts
@@ -203,7 +203,7 @@
 		echo "[ -w /dev/kmsg ] && echo \"kselftest: Running tests in $$TARGET\" >> /dev/kmsg" >> $(ALL_SCRIPT); \
 		echo "cd $$TARGET" >> $(ALL_SCRIPT); \
 		echo -n "run_many" >> $(ALL_SCRIPT); \
-		make -s --no-print-directory OUTPUT=$$BUILD_TARGET -C $$TARGET emit_tests >> $(ALL_SCRIPT); \
+		$(MAKE) -s --no-print-directory OUTPUT=$$BUILD_TARGET -C $$TARGET emit_tests >> $(ALL_SCRIPT); \
 		echo "" >> $(ALL_SCRIPT);	    \
 		echo "cd \$$ROOT" >> $(ALL_SCRIPT); \
 	done;
@@ -216,7 +216,7 @@
 clean:
 	@for TARGET in $(TARGETS); do \
 		BUILD_TARGET=$$BUILD/$$TARGET;	\
-		make OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\
 	done;
 
 .PHONY: khdr all run_tests hotplug run_hotplug clean_hotplug run_pstore_crash install clean
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
index fdc0b36..a82da55 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
@@ -203,14 +203,24 @@
 	return fd;
 }
 
+static pthread_mutex_t server_started_mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t server_started = PTHREAD_COND_INITIALIZER;
+
 static void *server_thread(void *arg)
 {
 	struct sockaddr_storage addr;
 	socklen_t len = sizeof(addr);
 	int fd = *(int *)arg;
 	int client_fd;
+	int err;
 
-	if (CHECK_FAIL(listen(fd, 1)) < 0) {
+	err = listen(fd, 1);
+
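+	/*
+	 * Wake the main thread regardless of the listen() result; the
+	 * result itself is checked below, after the waiter is released.
+	 */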
+	pthread_mutex_lock(&server_started_mtx);
+	pthread_cond_signal(&server_started);
+	pthread_mutex_unlock(&server_started_mtx);
+
+	if (CHECK_FAIL(err < 0)) {
 		perror("Failed to listen on socket");
 		return NULL;
 	}
@@ -248,7 +258,14 @@
 	if (CHECK_FAIL(server_fd < 0))
 		goto close_cgroup_fd;
 
-	pthread_create(&tid, NULL, server_thread, (void *)&server_fd);
+	if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread,
+				      (void *)&server_fd)))
+		goto close_cgroup_fd;
+
+	pthread_mutex_lock(&server_started_mtx);
+	pthread_cond_wait(&server_started, &server_started_mtx);
+	pthread_mutex_unlock(&server_started_mtx);
+
 	CHECK_FAIL(run_test(cgroup_fd, server_fd));
 	close(server_fd);
 close_cgroup_fd:
diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h
index 8a399bd..067eb62 100644
--- a/tools/testing/selftests/bpf/progs/strobemeta.h
+++ b/tools/testing/selftests/bpf/progs/strobemeta.h
@@ -413,7 +413,10 @@
 #else
 #pragma unroll
 #endif
-	for (int i = 0; i < STROBE_MAX_MAP_ENTRIES && i < map.cnt; ++i) {
+	for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) {
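+		/*
+		 * Keep the loop bound constant and honor the runtime
+		 * limit map.cnt with an explicit early exit.
+		 */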
+		if (i >= map.cnt)
+			break;
+
 		descr->key_lens[i] = 0;
 		len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN,
 					 map.entries[i].key);
diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
index 4f8ec1f..a320e38 100644
--- a/tools/testing/selftests/bpf/test_sysctl.c
+++ b/tools/testing/selftests/bpf/test_sysctl.c
@@ -1385,7 +1385,6 @@
 		uint8_t raw[sizeof(uint64_t)];
 		uint64_t num;
 	} value = {};
-	uint8_t c, i;
 
 	if (buf_len > sizeof(value)) {
 		log_err("Value is too big (%zd) to use in fixup", buf_len);
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh
index 5dcdfa2..126caf2 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh
@@ -224,13 +224,6 @@
 	local vid=10
 
 	bridge vlan add vid $vid dev $swp2 master
-	# During initialization the firmware enables all the VLAN filters and
-	# the driver does not turn them off since the traffic will be discarded
-	# by the STP filter whose default is DISCARD state. Add the VID on the
-	# ingress bridge port and then remove it to make sure it is not member
-	# in the VLAN.
-	bridge vlan add vid $vid dev $swp1 master
-	bridge vlan del vid $vid dev $swp1 master
 
 	RET=0
 
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-snapshot.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-snapshot.tc
index 7717c0a..ac73850 100644
--- a/tools/testing/selftests/ftrace/test.d/trigger/trigger-snapshot.tc
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-snapshot.tc
@@ -28,7 +28,7 @@
     exit_unsupported
 fi
 
-echo "Test snapshot tigger"
+echo "Test snapshot trigger"
 echo 0 > snapshot
 echo 1 > events/sched/sched_process_fork/enable
 ( echo "forked")
diff --git a/tools/testing/selftests/kexec/test_kexec_file_load.sh b/tools/testing/selftests/kexec/test_kexec_file_load.sh
index fa7c24e..2ff6003 100755
--- a/tools/testing/selftests/kexec/test_kexec_file_load.sh
+++ b/tools/testing/selftests/kexec/test_kexec_file_load.sh
@@ -37,11 +37,20 @@
 	# sequentially.  As a result, a policy rule may be defined, but
 	# might not necessarily be used.  This test assumes if a policy
 	# rule is specified, that is the intent.
+
+	# First check for appended signature (modsig), then xattr
 	if [ $ima_read_policy -eq 1 ]; then
 		check_ima_policy "appraise" "func=KEXEC_KERNEL_CHECK" \
-			"appraise_type=imasig"
+			"appraise_type=imasig|modsig"
 		ret=$?
-		[ $ret -eq 1 ] && log_info "IMA signature required";
+		if [ $ret -eq 1 ]; then
+			log_info "IMA or appended(modsig) signature required"
+		else
+			check_ima_policy "appraise" "func=KEXEC_KERNEL_CHECK" \
+				"appraise_type=imasig"
+			ret=$?
+			[ $ret -eq 1 ] && log_info "IMA signature required";
+		fi
 	fi
 	return $ret
 }
@@ -84,6 +93,22 @@
 	return $ret
 }
 
+# Return 1 if an appended (modsig) signature is found and 0 if not.
+check_for_modsig()
+{
+	local module_sig_string="~Module signature appended~"
+	local sig="$(tail --bytes $((${#module_sig_string} + 1)) $KERNEL_IMAGE)"
+	local ret=0
+
+	if [ "$sig" == "$module_sig_string" ]; then
+		ret=1
+		log_info "kexec kernel image modsig signed"
+	else
+		log_info "kexec kernel image not modsig signed"
+	fi
+	return $ret
+}
+
 kexec_file_load_test()
 {
 	local succeed_msg="kexec_file_load succeeded"
@@ -98,7 +123,8 @@
 		# In secureboot mode with an architecture  specific
 		# policy, make sure either an IMA or PE signature exists.
 		if [ $secureboot -eq 1 ] && [ $arch_policy -eq 1 ] && \
-			[ $ima_signed -eq 0 ] && [ $pe_signed -eq 0 ]; then
+			[ $ima_signed -eq 0 ] && [ $pe_signed -eq 0 ] \
+			  && [ $ima_modsig -eq 0 ]; then
 			log_fail "$succeed_msg (missing sig)"
 		fi
 
@@ -107,7 +133,8 @@
 			log_fail "$succeed_msg (missing PE sig)"
 		fi
 
-		if [ $ima_sig_required -eq 1 ] && [ $ima_signed -eq 0 ]; then
+		if [ $ima_sig_required -eq 1 ] && [ $ima_signed -eq 0 ] \
+		     && [ $ima_modsig -eq 0 ]; then
 			log_fail "$succeed_msg (missing IMA sig)"
 		fi
 
@@ -204,5 +231,8 @@
 check_for_imasig
 ima_signed=$?
 
+check_for_modsig
+ima_modsig=$?
+
 # Test loading the kernel image via kexec_file_load syscall
 kexec_file_load_test
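
check_for_modsig() above recognizes an appended signature by the "~Module signature appended~" marker at the end of the image; the script reads marker length + 1 bytes because the marker is followed by a trailing newline, which $(...) then strips before the comparison. A sketch of the same check in C, under that trailing-newline assumption (error handling trimmed):

	#include <stdio.h>
	#include <string.h>

	#define MODSIG_MARKER "~Module signature appended~\n"

	/* Return 1 if the file at @path ends with the signature marker. */
	static int has_modsig(const char *path)
	{
		char tail[sizeof(MODSIG_MARKER)] = "";	/* zero-filled */
		FILE *f = fopen(path, "rb");
		int ret = 0;

		if (!f)
			return 0;
		if (fseek(f, -(long)(sizeof(MODSIG_MARKER) - 1), SEEK_END) == 0 &&
		    fread(tail, 1, sizeof(MODSIG_MARKER) - 1, f) ==
						sizeof(MODSIG_MARKER) - 1)
			ret = strcmp(tail, MODSIG_MARKER) == 0;
		fclose(f);
		return ret;
	}
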
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index dc3346e..5614222 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -19,8 +19,6 @@
 #include "kvm_util.h"
 #include "processor.h"
 
-#define DEBUG printf
-
 #define VCPU_ID				1
 
 /* The memory slot index to track dirty pages */
@@ -249,14 +247,12 @@
 }
 
 static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
-				uint64_t extra_mem_pages, void *guest_code,
-				unsigned long type)
+				uint64_t extra_mem_pages, void *guest_code)
 {
 	struct kvm_vm *vm;
 	uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;
 
-	vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages,
-			O_RDWR, type);
+	vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
 	kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
 #ifdef __x86_64__
 	vm_create_irqchip(vm);
@@ -265,67 +261,35 @@
 	return vm;
 }
 
+#define DIRTY_MEM_BITS 30 /* 1G */
+#define PAGE_SHIFT_4K  12
+
 static void run_test(enum vm_guest_mode mode, unsigned long iterations,
 		     unsigned long interval, uint64_t phys_offset)
 {
-	unsigned int guest_pa_bits, guest_page_shift;
 	pthread_t vcpu_thread;
 	struct kvm_vm *vm;
-	uint64_t max_gfn;
 	unsigned long *bmap;
-	unsigned long type = 0;
 
-	switch (mode) {
-	case VM_MODE_P52V48_4K:
-		guest_pa_bits = 52;
-		guest_page_shift = 12;
-		break;
-	case VM_MODE_P52V48_64K:
-		guest_pa_bits = 52;
-		guest_page_shift = 16;
-		break;
-	case VM_MODE_P48V48_4K:
-		guest_pa_bits = 48;
-		guest_page_shift = 12;
-		break;
-	case VM_MODE_P48V48_64K:
-		guest_pa_bits = 48;
-		guest_page_shift = 16;
-		break;
-	case VM_MODE_P40V48_4K:
-		guest_pa_bits = 40;
-		guest_page_shift = 12;
-		break;
-	case VM_MODE_P40V48_64K:
-		guest_pa_bits = 40;
-		guest_page_shift = 16;
-		break;
-	default:
-		TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode);
-	}
-
-	DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode));
-
-#ifdef __x86_64__
 	/*
-	 * FIXME
-	 * The x86_64 kvm selftests framework currently only supports a
-	 * single PML4 which restricts the number of physical address
-	 * bits we can change to 39.
+	 * We reserve page table memory for twice the extra dirty mem,
+	 * which will definitely cover the original (1G+) test range.
+	 * The calculation is done with the 4K page size, the smallest
+	 * supported, so the page count is sufficient for all archs
+	 * (e.g., a 64K page size guest needs even less memory for
+	 * page tables).
 	 */
-	guest_pa_bits = 39;
-#endif
-#ifdef __aarch64__
-	if (guest_pa_bits != 40)
-		type = KVM_VM_TYPE_ARM_IPA_SIZE(guest_pa_bits);
-#endif
-	max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1;
-	guest_page_size = (1ul << guest_page_shift);
+	vm = create_vm(mode, VCPU_ID,
+		       2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K),
+		       guest_code);
+
+	guest_page_size = vm_get_page_size(vm);
 	/*
 	 * A little more than 1G of guest page sized pages.  Cover the
 	 * case where the size is not aligned to 64 pages.
 	 */
-	guest_num_pages = (1ul << (30 - guest_page_shift)) + 16;
+	guest_num_pages = (1ul << (DIRTY_MEM_BITS -
+				   vm_get_page_shift(vm))) + 16;
 #ifdef __s390x__
 	/* Round up to multiple of 1M (segment size) */
 	guest_num_pages = (guest_num_pages + 0xff) & ~0xffUL;
@@ -335,7 +299,8 @@
 			 !!((guest_num_pages * guest_page_size) % host_page_size);
 
 	if (!phys_offset) {
-		guest_test_phys_mem = (max_gfn - guest_num_pages) * guest_page_size;
+		guest_test_phys_mem = (vm_get_max_gfn(vm) -
+				       guest_num_pages) * guest_page_size;
 		guest_test_phys_mem &= ~(host_page_size - 1);
 	} else {
 		guest_test_phys_mem = phys_offset;
@@ -351,8 +316,6 @@
 	bmap = bitmap_alloc(host_num_pages);
 	host_bmap_track = bitmap_alloc(host_num_pages);
 
-	vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code, type);
-
 #ifdef USE_CLEAR_DIRTY_LOG
 	struct kvm_enable_cap cap = {};
 
@@ -482,7 +445,7 @@
 #endif
 
 #ifdef __x86_64__
-	vm_guest_mode_params_init(VM_MODE_P52V48_4K, true, true);
+	vm_guest_mode_params_init(VM_MODE_PXXV48_4K, true, true);
 #endif
 #ifdef __aarch64__
 	vm_guest_mode_params_init(VM_MODE_P40V48_4K, true, true);
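
The new create_vm() call above sizes the extra page-table backing before the guest page size is known, so the math is done at the smallest (4K) granularity. A worked check of the constants (plain C; the 512 and 2 factors mirror create_vm()):

	#include <assert.h>

	int main(void)
	{
		unsigned long dirty_mem_bits = 30;	/* 1G test range */
		unsigned long page_shift_4k = 12;	/* smallest page size */
		unsigned long extra_mem_pages =
				2ul << (dirty_mem_bits - page_shift_4k);

		assert(extra_mem_pages == 524288);	/* 2G of 4K pages */
		/* create_vm() converts this into page-table pages */
		assert(extra_mem_pages / 512 * 2 == 2048);
		return 0;
	}
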
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 5463b78..29cccaf 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -24,6 +24,12 @@
 typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */
 typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
 
+#ifndef NDEBUG
+#define DEBUG(...) printf(__VA_ARGS__);
+#else
+#define DEBUG(...)
+#endif
+
 /* Minimum allocated guest virtual and physical addresses */
 #define KVM_UTIL_MIN_VADDR		0x2000
 
@@ -38,11 +44,14 @@
 	VM_MODE_P48V48_64K,
 	VM_MODE_P40V48_4K,
 	VM_MODE_P40V48_64K,
+	VM_MODE_PXXV48_4K,	/* For 48-bit VA but ANY-bit PA */
 	NUM_VM_MODES,
 };
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 #define VM_MODE_DEFAULT VM_MODE_P40V48_4K
+#elif defined(__x86_64__)
+#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K
 #else
 #define VM_MODE_DEFAULT VM_MODE_P52V48_4K
 #endif
@@ -60,8 +69,7 @@
 int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
 
 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
-struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages,
-			  int perm, unsigned long type);
+struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
 void kvm_vm_free(struct kvm_vm *vmp);
 void kvm_vm_restart(struct kvm_vm *vmp, int perm);
 void kvm_vm_release(struct kvm_vm *vmp);
@@ -146,6 +154,10 @@
 
 bool vm_is_unrestricted_guest(struct kvm_vm *vm);
 
+unsigned int vm_get_page_size(struct kvm_vm *vm);
+unsigned int vm_get_page_shift(struct kvm_vm *vm);
+unsigned int vm_get_max_gfn(struct kvm_vm *vm);
+
 struct kvm_userspace_memory_region *
 kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
 				 uint64_t end);
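
One caveat with the DEBUG() macro added above in kvm_util.h: it expands with a trailing semicolon, which is fine for the call sites in this series but breaks if/else composition (`if (x) DEBUG("..."); else ...`). The conventional guard is the do/while(0) form, sketched here (not what the patch uses):

	#include <stdio.h>

	/* Swallows the semicolon at the call site, so the macro composes
	 * safely with if/else.
	 */
	#ifndef NDEBUG
	#define DEBUG(...) do { printf(__VA_ARGS__); } while (0)
	#else
	#define DEBUG(...) do { } while (0)
	#endif
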
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 80d1974..0c17f2e 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -325,6 +325,9 @@
 void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
 	  	  uint64_t msr_value);
 
+uint32_t kvm_get_cpuid_max(void);
+void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits);
+
 /*
  * Basic CPU control in CR0
  */
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
index 486400a..86036a5 100644
--- a/tools/testing/selftests/kvm/lib/aarch64/processor.c
+++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c
@@ -264,6 +264,9 @@
 	case VM_MODE_P52V48_4K:
 		TEST_ASSERT(false, "AArch64 does not support 4K sized pages "
 				   "with 52-bit physical address ranges");
+	case VM_MODE_PXXV48_4K:
+		TEST_ASSERT(false, "AArch64 does not support 4K sized pages "
+				   "with ANY-bit physical address ranges");
 	case VM_MODE_P52V48_64K:
 		tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
 		tcr_el1 |= 6ul << 32; /* IPS = 52 bits */
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 6e49bb0..80a338b 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -8,6 +8,7 @@
 #include "test_util.h"
 #include "kvm_util.h"
 #include "kvm_util_internal.h"
+#include "processor.h"
 
 #include <assert.h>
 #include <sys/mman.h>
@@ -84,7 +85,7 @@
 	return ret;
 }
 
-static void vm_open(struct kvm_vm *vm, int perm, unsigned long type)
+static void vm_open(struct kvm_vm *vm, int perm)
 {
 	vm->kvm_fd = open(KVM_DEV_PATH, perm);
 	if (vm->kvm_fd < 0)
@@ -95,18 +96,19 @@
 		exit(KSFT_SKIP);
 	}
 
-	vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, type);
+	vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, vm->type);
 	TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, "
 		"rc: %i errno: %i", vm->fd, errno);
 }
 
 const char * const vm_guest_mode_string[] = {
-	"PA-bits:52, VA-bits:48, 4K pages",
-	"PA-bits:52, VA-bits:48, 64K pages",
-	"PA-bits:48, VA-bits:48, 4K pages",
-	"PA-bits:48, VA-bits:48, 64K pages",
-	"PA-bits:40, VA-bits:48, 4K pages",
-	"PA-bits:40, VA-bits:48, 64K pages",
+	"PA-bits:52,  VA-bits:48,  4K pages",
+	"PA-bits:52,  VA-bits:48, 64K pages",
+	"PA-bits:48,  VA-bits:48,  4K pages",
+	"PA-bits:48,  VA-bits:48, 64K pages",
+	"PA-bits:40,  VA-bits:48,  4K pages",
+	"PA-bits:40,  VA-bits:48, 64K pages",
+	"PA-bits:ANY, VA-bits:48,  4K pages",
 };
 _Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
 	       "Missing new mode strings?");
@@ -130,17 +132,17 @@
  * descriptor to control the created VM is created with the permissions
  * given by perm (e.g. O_RDWR).
  */
-struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages,
-			  int perm, unsigned long type)
+struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 {
 	struct kvm_vm *vm;
 
+	DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+
 	vm = calloc(1, sizeof(*vm));
 	TEST_ASSERT(vm != NULL, "Insufficient Memory");
 
 	vm->mode = mode;
-	vm->type = type;
-	vm_open(vm, perm, type);
+	vm->type = 0;
 
 	/* Setup mode specific traits. */
 	switch (vm->mode) {
@@ -186,10 +188,32 @@
 		vm->page_size = 0x10000;
 		vm->page_shift = 16;
 		break;
+	case VM_MODE_PXXV48_4K:
+#ifdef __x86_64__
+		kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits);
+		TEST_ASSERT(vm->va_bits == 48, "Linear address width "
+			    "(%d bits) not supported", vm->va_bits);
+		vm->pgtable_levels = 4;
+		vm->page_size = 0x1000;
+		vm->page_shift = 12;
+		DEBUG("Guest physical address width detected: %d\n",
+		      vm->pa_bits);
+#else
+		TEST_ASSERT(false, "VM_MODE_PXXV48_4K not supported on "
+			    "non-x86 platforms");
+#endif
+		break;
 	default:
 		TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode);
 	}
 
+#ifdef __aarch64__
+	if (vm->pa_bits != 40)
+		vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits);
+#endif
+
+	vm_open(vm, perm);
+
 	/* Limit to VA-bit canonical virtual addresses. */
 	vm->vpages_valid = sparsebit_alloc();
 	sparsebit_set_num(vm->vpages_valid,
@@ -212,7 +236,7 @@
 
 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 {
-	return _vm_create(mode, phy_pages, perm, 0);
+	return _vm_create(mode, phy_pages, perm);
 }
 
 /*
@@ -232,7 +256,7 @@
 {
 	struct userspace_mem_region *region;
 
-	vm_open(vmp, perm, vmp->type);
+	vm_open(vmp, perm);
 	if (vmp->has_irqchip)
 		vm_create_irqchip(vmp);
 
@@ -1628,3 +1652,18 @@
 
 	return val == 'Y';
 }
+
+unsigned int vm_get_page_size(struct kvm_vm *vm)
+{
+	return vm->page_size;
+}
+
+unsigned int vm_get_page_shift(struct kvm_vm *vm)
+{
+	return vm->page_shift;
+}
+
+unsigned int vm_get_max_gfn(struct kvm_vm *vm)
+{
+	return vm->max_gfn;
+}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 0a5e487..c53dbc6 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -228,7 +228,7 @@
 
 void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
 {
-	TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use "
+	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
 		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
 
 	/* If needed, create page map l4 table. */
@@ -261,7 +261,7 @@
 	uint16_t index[4];
 	struct pageMapL4Entry *pml4e;
 
-	TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use "
+	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
 		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
 
 	TEST_ASSERT((vaddr % vm->page_size) == 0,
@@ -547,7 +547,7 @@
 	struct pageDirectoryEntry *pde;
 	struct pageTableEntry *pte;
 
-	TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use "
+	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
 		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
 
 	index[0] = (gva >> 12) & 0x1ffu;
@@ -621,7 +621,7 @@
 	kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot);
 
 	switch (vm->mode) {
-	case VM_MODE_P52V48_4K:
+	case VM_MODE_PXXV48_4K:
 		sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
 		sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
 		sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
@@ -1157,3 +1157,25 @@
 	chunk = (const uint32_t *)("GenuineIntel");
 	return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
 }
+
+uint32_t kvm_get_cpuid_max(void)
+{
+	return kvm_get_supported_cpuid_entry(0x80000000)->eax;
+}
+
+void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
+{
+	struct kvm_cpuid_entry2 *entry;
+	bool pae;
+
+	/* SDM 4.1.4 */
+	if (kvm_get_cpuid_max() < 0x80000008) {
+		pae = kvm_get_supported_cpuid_entry(1)->edx & (1 << 6);
+		*pa_bits = pae ? 36 : 32;
+		*va_bits = 32;
+	} else {
+		entry = kvm_get_supported_cpuid_entry(0x80000008);
+		*pa_bits = entry->eax & 0xff;
+		*va_bits = (entry->eax >> 8) & 0xff;
+	}
+}
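
kvm_get_cpu_address_width() above follows SDM 4.1.4: CPUID leaf 0x80000008 reports the physical-address width in EAX[7:0] and the linear-address width in EAX[15:8]; when that leaf is absent, PAE support (CPUID.1:EDX bit 6) selects 36- versus 32-bit physical addressing. The same decode can be run against the host CPU with the compiler's cpuid helper (GCC/clang sketch):

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx)) {
			printf("PA bits: %u\n", eax & 0xff);	    /* EAX[7:0] */
			printf("VA bits: %u\n", (eax >> 8) & 0xff); /* EAX[15:8] */
		} else if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
			printf("PA bits: %u\n", (edx & (1 << 6)) ? 36 : 32);
			printf("VA bits: 32\n");
		}
		return 0;
	}
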
diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c
index 4bfc9a9..da4d89a 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c
@@ -32,7 +32,7 @@
 	va_end(va);
 
 	asm volatile("in %[port], %%al"
-		: : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax");
+		: : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax", "memory");
 }
 
 uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
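
The added "memory" clobber matters because the host reads the ucall struct through the guest pointer passed in RDI; the constraints alone make only the pointer an input, not the pointee, so without the clobber the compiler may delay or elide the stores that fill in uc. The hazard in isolation (0xe0 is an arbitrary example port; guest-side code, shown for compilation only):

	/* Without "memory", the compiler knows only that the asm consumes
	 * the pointer value, not the bytes it points to, so the store
	 * below could be sunk past the port write.
	 */
	static void hypercall_out(unsigned long *data)
	{
		*data = 42;			/* must be visible to the host */
		asm volatile("in %[port], %%al"
			     : : [port] "d" (0xe0), "D" (data)
			     : "rax", "memory");
	}
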
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
index ee59831..443a2b5 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
@@ -26,6 +26,25 @@
 {
 }
 
+static int smt_possible(void)
+{
+	char buf[16];
+	FILE *f;
+	bool res = 1;
+
+	f = fopen("/sys/devices/system/cpu/smt/control", "r");
+	if (f) {
+		if (fread(buf, sizeof(*buf), sizeof(buf), f) > 0) {
+			if (!strncmp(buf, "forceoff", 8) ||
+			    !strncmp(buf, "notsupported", 12))
+				res = 0;
+		}
+		fclose(f);
+	}
+
+	return res;
+}
+
 static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
 			  int evmcs_enabled)
 {
@@ -59,6 +78,14 @@
 		TEST_ASSERT(!entry->padding[0] && !entry->padding[1] &&
 			    !entry->padding[2], "padding should be zero");
 
+		if (entry->function == 0x40000004) {
+			int nononarchcs = !!(entry->eax & (1UL << 18));
+
+			TEST_ASSERT(nononarchcs == !smt_possible(),
+				    "NoNonArchitecturalCoreSharing bit"
+				    " doesn't reflect SMT setting");
+		}
+
 		/*
 		 * If needed for debug:
 		 * fprintf(stdout,
diff --git a/tools/testing/selftests/livepatch/config b/tools/testing/selftests/livepatch/config
index 0dd7700..ad23100 100644
--- a/tools/testing/selftests/livepatch/config
+++ b/tools/testing/selftests/livepatch/config
@@ -1 +1,3 @@
+CONFIG_LIVEPATCH=y
+CONFIG_DYNAMIC_DEBUG=y
 CONFIG_TEST_LIVEPATCH=m
diff --git a/tools/testing/selftests/membarrier/.gitignore b/tools/testing/selftests/membarrier/.gitignore
index 020c44f4..f2f7ec0 100644
--- a/tools/testing/selftests/membarrier/.gitignore
+++ b/tools/testing/selftests/membarrier/.gitignore
@@ -1 +1,2 @@
-membarrier_test
+membarrier_test_multi_thread
+membarrier_test_single_thread
diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile
index 97e3bdf..34d1c81 100644
--- a/tools/testing/selftests/membarrier/Makefile
+++ b/tools/testing/selftests/membarrier/Makefile
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0-only
 CFLAGS += -g -I../../../../usr/include/
+LDLIBS += -lpthread
 
-TEST_GEN_PROGS := membarrier_test
+TEST_GEN_PROGS := membarrier_test_single_thread \
+		membarrier_test_multi_thread
 
 include ../lib.mk
-
diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test_impl.h
similarity index 95%
rename from tools/testing/selftests/membarrier/membarrier_test.c
rename to tools/testing/selftests/membarrier/membarrier_test_impl.h
index 70b4ddb..186be69 100644
--- a/tools/testing/selftests/membarrier/membarrier_test.c
+++ b/tools/testing/selftests/membarrier/membarrier_test_impl.h
@@ -1,10 +1,11 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
 #define _GNU_SOURCE
 #include <linux/membarrier.h>
 #include <syscall.h>
 #include <stdio.h>
 #include <errno.h>
 #include <string.h>
+#include <pthread.h>
 
 #include "../kselftest.h"
 
@@ -223,7 +224,7 @@
 	return 0;
 }
 
-static int test_membarrier(void)
+static int test_membarrier_fail(void)
 {
 	int status;
 
@@ -233,10 +234,27 @@
 	status = test_membarrier_flags_fail();
 	if (status)
 		return status;
-	status = test_membarrier_global_success();
+	status = test_membarrier_private_expedited_fail();
 	if (status)
 		return status;
-	status = test_membarrier_private_expedited_fail();
+	status = sys_membarrier(MEMBARRIER_CMD_QUERY, 0);
+	if (status < 0) {
+		ksft_test_result_fail("sys_membarrier() failed\n");
+		return status;
+	}
+	if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) {
+		status = test_membarrier_private_expedited_sync_core_fail();
+		if (status)
+			return status;
+	}
+	return 0;
+}
+
+static int test_membarrier_success(void)
+{
+	int status;
+
+	status = test_membarrier_global_success();
 	if (status)
 		return status;
 	status = test_membarrier_register_private_expedited_success();
@@ -251,9 +269,6 @@
 		return status;
 	}
 	if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) {
-		status = test_membarrier_private_expedited_sync_core_fail();
-		if (status)
-			return status;
 		status = test_membarrier_register_private_expedited_sync_core_success();
 		if (status)
 			return status;
@@ -300,14 +315,3 @@
 	ksft_test_result_pass("sys_membarrier available\n");
 	return 0;
 }
-
-int main(int argc, char **argv)
-{
-	ksft_print_header();
-	ksft_set_plan(13);
-
-	test_membarrier_query();
-	test_membarrier();
-
-	return ksft_exit_pass();
-}
diff --git a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
new file mode 100644
index 0000000..ac5613e
--- /dev/null
+++ b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <linux/membarrier.h>
+#include <syscall.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "membarrier_test_impl.h"
+
+static int thread_ready, thread_quit;
+static pthread_mutex_t test_membarrier_thread_mutex =
+	PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t test_membarrier_thread_cond =
+	PTHREAD_COND_INITIALIZER;
+
+void *test_membarrier_thread(void *arg)
+{
+	pthread_mutex_lock(&test_membarrier_thread_mutex);
+	thread_ready = 1;
+	pthread_cond_broadcast(&test_membarrier_thread_cond);
+	pthread_mutex_unlock(&test_membarrier_thread_mutex);
+
+	pthread_mutex_lock(&test_membarrier_thread_mutex);
+	while (!thread_quit)
+		pthread_cond_wait(&test_membarrier_thread_cond,
+				  &test_membarrier_thread_mutex);
+	pthread_mutex_unlock(&test_membarrier_thread_mutex);
+
+	return NULL;
+}
+
+static int test_mt_membarrier(void)
+{
+	pthread_t test_thread;
+
+	pthread_create(&test_thread, NULL,
+		       test_membarrier_thread, NULL);
+
+	pthread_mutex_lock(&test_membarrier_thread_mutex);
+	while (!thread_ready)
+		pthread_cond_wait(&test_membarrier_thread_cond,
+				  &test_membarrier_thread_mutex);
+	pthread_mutex_unlock(&test_membarrier_thread_mutex);
+
+	test_membarrier_fail();
+
+	test_membarrier_success();
+
+	pthread_mutex_lock(&test_membarrier_thread_mutex);
+	thread_quit = 1;
+	pthread_cond_broadcast(&test_membarrier_thread_cond);
+	pthread_mutex_unlock(&test_membarrier_thread_mutex);
+
+	pthread_join(test_thread, NULL);
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	ksft_print_header();
+	ksft_set_plan(13);
+
+	test_membarrier_query();
+
+	/* Multi-threaded */
+	test_mt_membarrier();
+
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
new file mode 100644
index 0000000..c1c9639
--- /dev/null
+++ b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <linux/membarrier.h>
+#include <syscall.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "membarrier_test_impl.h"
+
+int main(int argc, char **argv)
+{
+	ksft_print_header();
+	ksft_set_plan(13);
+
+	test_membarrier_query();
+
+	test_membarrier_fail();
+
+	test_membarrier_success();
+
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/net/fib_nexthop_multiprefix.sh b/tools/testing/selftests/net/fib_nexthop_multiprefix.sh
index e682873..9dc35a1 100755
--- a/tools/testing/selftests/net/fib_nexthop_multiprefix.sh
+++ b/tools/testing/selftests/net/fib_nexthop_multiprefix.sh
@@ -15,6 +15,8 @@
 PAUSE_ON_FAIL=no
 VERBOSE=0
 
+which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping)
+
 ################################################################################
 # helpers
 
@@ -200,7 +202,7 @@
 	local rc
 
 	if [ ${ping_sz} != "0" ]; then
-		run_cmd ip netns exec h0 ping6 -s ${ping_sz} -c5 -w5 ${dst}
+		run_cmd ip netns exec h0 ${ping6} -s ${ping_sz} -c5 -w5 ${dst}
 	fi
 
 	if [ "$VERBOSE" = "1" ]; then
@@ -243,7 +245,7 @@
 		run_cmd taskset -c ${c} ip netns exec h0 ping -c1 -w1 172.16.10${i}.1
 		[ $? -ne 0 ] && printf "\nERROR: ping to h${i} failed\n" && ret=1
 
-		run_cmd taskset -c ${c} ip netns exec h0 ping6 -c1 -w1 2001:db8:10${i}::1
+		run_cmd taskset -c ${c} ip netns exec h0 ${ping6} -c1 -w1 2001:db8:10${i}::1
 		[ $? -ne 0 ] && printf "\nERROR: ping6 to h${i} failed\n" && ret=1
 
 		[ $ret -ne 0 ] && break
diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh
index f9ebeac..796670e 100755
--- a/tools/testing/selftests/net/fib_nexthops.sh
+++ b/tools/testing/selftests/net/fib_nexthops.sh
@@ -940,6 +940,20 @@
 	run_cmd "$IP nexthop add id 104 group 1 dev veth1"
 	log_test $? 2 "Nexthop group and device"
 
+	# Tests to ensure that flushing works as expected.
+	run_cmd "$IP nexthop add id 105 blackhole proto 99"
+	run_cmd "$IP nexthop add id 106 blackhole proto 100"
+	run_cmd "$IP nexthop add id 107 blackhole proto 99"
+	run_cmd "$IP nexthop flush proto 99"
+	check_nexthop "id 105" ""
+	check_nexthop "id 106" "id 106 blackhole proto 100"
+	check_nexthop "id 107" ""
+	run_cmd "$IP nexthop flush proto 100"
+	check_nexthop "id 106" ""
+
+	run_cmd "$IP nexthop flush proto 100"
+	log_test $? 0 "Test proto flush"
+
 	run_cmd "$IP nexthop add id 104 group 1 blackhole"
 	log_test $? 2 "Nexthop group and blackhole"
 
diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 4465fc2..c4ba0ff 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -9,7 +9,7 @@
 ksft_skip=4
 
 # all tests in this script. Can be overridden with -t option
-TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter"
+TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter"
 
 VERBOSE=0
 PAUSE_ON_FAIL=no
@@ -17,6 +17,8 @@
 IP="ip -netns ns1"
 NS_EXEC="ip netns exec ns1"
 
+which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping)
+
 log_test()
 {
 	local rc=$1
@@ -614,6 +616,20 @@
 	cleanup
 }
 
+fib_suppress_test()
+{
+	$IP link add dummy1 type dummy
+	$IP link set dummy1 up
+	$IP -6 route add default dev dummy1
+	$IP -6 rule add table main suppress_prefixlength 0
+	ping -f -c 1000 -W 1 1234::1 || true
+	$IP -6 rule del table main suppress_prefixlength 0
+	$IP link del dummy1
+
+	# If we got here without crashing, we're good.
+	return 0
+}
+
 ################################################################################
 # Tests on route add and replace
 
@@ -1086,7 +1102,7 @@
 	log_test $rc 0 "Multipath route with mtu metric"
 
 	$IP -6 ro add 2001:db8:104::/64 via 2001:db8:101::2 mtu 1300
-	run_cmd "ip netns exec ns1 ping6 -w1 -c1 -s 1500 2001:db8:104::1"
+	run_cmd "ip netns exec ns1 ${ping6} -w1 -c1 -s 1500 2001:db8:104::1"
 	log_test $? 0 "Using route with mtu metric"
 
 	run_cmd "$IP -6 ro add 2001:db8:114::/64 via  2001:db8:101::2  congctl lock foo"
@@ -1591,6 +1607,7 @@
 	fib_carrier_test|carrier)	fib_carrier_test;;
 	fib_rp_filter_test|rp_filter)	fib_rp_filter_test;;
 	fib_nexthop_test|nexthop)	fib_nexthop_test;;
+	fib_suppress_test|suppress)	fib_suppress_test;;
 	ipv6_route_test|ipv6_rt)	ipv6_route_test;;
 	ipv4_route_test|ipv4_rt)	ipv4_route_test;;
 	ipv6_addr_metric)		ipv6_addr_metric_test;;
diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile
index f1fbc15..ed15658 100644
--- a/tools/testing/selftests/powerpc/mm/Makefile
+++ b/tools/testing/selftests/powerpc/mm/Makefile
@@ -4,6 +4,7 @@
 
 TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr \
 		  large_vm_fork_separation
+TEST_GEN_PROGS_EXTENDED := tlbie_test
 TEST_GEN_FILES := tempfile
 
 top_srcdir = ../../../../..
@@ -19,3 +20,4 @@
 $(OUTPUT)/tempfile:
 	dd if=/dev/zero of=$@ bs=64k count=1
 
+$(OUTPUT)/tlbie_test: LDLIBS += -lpthread
diff --git a/tools/testing/selftests/powerpc/mm/tlbie_test.c b/tools/testing/selftests/powerpc/mm/tlbie_test.c
new file mode 100644
index 0000000..9868a5d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/tlbie_test.c
@@ -0,0 +1,734 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2019, Nick Piggin, Gautham R. Shenoy, Aneesh Kumar K.V, IBM Corp.
+ */
+
+/*
+ *
+ * Test the tlbie/mtpidr race. We have 4 threads doing a
+ * flush/load/compare/store sequence in a loop. The same CPUs also run
+ * a context-switch task that does sched_yield() in a loop.
+ *
+ * The snapshot thread marks the mmap area PROT_READ in between, makes a
+ * copy and copies it back to the original area. This helps us detect
+ * whether any store continued to happen after the PROT_READ marking.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <linux/futex.h>
+#include <unistd.h>
+#include <asm/unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <time.h>
+#include <stdarg.h>
+#include <sched.h>
+#include <pthread.h>
+#include <signal.h>
+#include <sys/prctl.h>
+
+static inline void dcbf(volatile unsigned int *addr)
+{
+	__asm__ __volatile__ ("dcbf %y0; sync" : : "Z"(*(unsigned char *)addr) : "memory");
+}
+
+static void err_msg(char *msg)
+{
+	time_t now;
+	time(&now);
+	printf("=================================\n");
+	printf("    Error: %s\n", msg);
+	printf("    %s", ctime(&now));
+	printf("=================================\n");
+	exit(1);
+}
+
+static char *map1;
+static char *map2;
+static pid_t rim_process_pid;
+
+/*
+ * A "rim-sequence" is defined to be the sequence of the following
+ * operations performed on a memory word:
+ *	1) FLUSH the contents of that word.
+ *	2) LOAD the contents of that word.
+ *	3) COMPARE the contents of that word with the content that was
+ *	           previously stored at that word
+ *	4) STORE new content into that word.
+ *
+ * The threads in this test that perform the rim-sequence are termed
+ * as rim_threads.
+ */
+
+/*
+ * A "corruption" is defined to be the failed COMPARE operation in a
+ * rim-sequence.
+ *
+ * A rim_thread that detects a corruption informs about it to all the
+ * other rim_threads, and the mem_snapshot thread.
+ */
+static volatile unsigned int corruption_found;
+
+/*
+ * This defines the maximum number of rim_threads in this test.
+ *
+ * The THREAD_ID_BITS denote the number of bits required
+ * to represent the thread_ids [0..MAX_THREADS - 1].
+ * We are being a bit paranoid here and set it to 8 bits,
+ * though 6 bits suffice.
+ *
+ */
+#define MAX_THREADS 		64
+#define THREAD_ID_BITS		8
+#define THREAD_ID_MASK		((1 << THREAD_ID_BITS) - 1)
+static unsigned int rim_thread_ids[MAX_THREADS];
+static pthread_t rim_threads[MAX_THREADS];
+
+
+/*
+ * Each rim_thread works on an exclusive "chunk" of size
+ * RIM_CHUNK_SIZE.
+ *
+ * The ith rim_thread works on the ith chunk.
+ *
+ * The ith chunk begins at
+ * map1 + (i * RIM_CHUNK_SIZE)
+ */
+#define RIM_CHUNK_SIZE  	1024
+#define BITS_PER_BYTE 		8
+#define WORD_SIZE     		(sizeof(unsigned int))
+#define WORD_BITS		(WORD_SIZE * BITS_PER_BYTE)
+#define WORDS_PER_CHUNK		(RIM_CHUNK_SIZE/WORD_SIZE)
+
+static inline char *compute_chunk_start_addr(unsigned int thread_id)
+{
+	char *chunk_start;
+
+	chunk_start = (char *)((unsigned long)map1 +
+			       (thread_id * RIM_CHUNK_SIZE));
+
+	return chunk_start;
+}
+
+/*
+ * The "word-offset" of a word-aligned address inside a chunk, is
+ * defined to be the number of words that precede the address in that
+ * chunk.
+ *
+ * WORD_OFFSET_BITS denote the number of bits required to represent
+ * the word-offsets of all the word-aligned addresses of a chunk.
+ */
+#define WORD_OFFSET_BITS	(__builtin_ctz(WORDS_PER_CHUNK))
+#define WORD_OFFSET_MASK	((1 << WORD_OFFSET_BITS) - 1)
+
+static inline unsigned int compute_word_offset(char *start, unsigned int *addr)
+{
+	unsigned int delta_bytes, ret;
+	delta_bytes = (unsigned long)addr - (unsigned long)start;
+
+	ret = delta_bytes/WORD_SIZE;
+
+	return ret;
+}
+
+/*
+ * A "sweep" is defined to be the sequential execution of the
+ * rim-sequence by a rim_thread on its chunk one word at a time,
+ * starting from the first word of its chunk and ending with the last
+ * word of its chunk.
+ *
+ * Each sweep of a rim_thread is uniquely identified by a sweep_id.
+ * SWEEP_ID_BITS denote the number of bits required to represent
+ * the sweep_ids of rim_threads.
+ *
+ * As to why SWEEP_ID_BITS are computed as a function of THREAD_ID_BITS,
+ * WORD_OFFSET_BITS, and WORD_BITS, see the "store-pattern" below.
+ */
+#define SWEEP_ID_BITS		(WORD_BITS - (THREAD_ID_BITS + WORD_OFFSET_BITS))
+#define SWEEP_ID_MASK		((1 << SWEEP_ID_BITS) - 1)
+
+/*
+ * A "store-pattern" is the word-pattern that is stored into a word
+ * location in the 4)STORE step of the rim-sequence.
+ *
+ * In the store-pattern, we shall encode:
+ *
+ *      - The thread-id of the rim_thread performing the store
+ *        (The most significant THREAD_ID_BITS)
+ *
+ *      - The word-offset of the address into which the store is being
+ *        performed (The next WORD_OFFSET_BITS)
+ *
+ *      - The sweep_id of the current sweep in which the store is
+ *        being performed. (The lower SWEEP_ID_BITS)
+ *
+ * Store Pattern: 32 bits
+ * |------------------|--------------------|---------------------------------|
+ * |    Thread id     |  Word offset       |         sweep_id                |
+ * |------------------|--------------------|---------------------------------|
+ *    THREAD_ID_BITS     WORD_OFFSET_BITS          SWEEP_ID_BITS
+ *
+ * In the store pattern, the (Thread-id + Word-offset) uniquely identify the
+ * address to which the store is being performed i.e,
+ *    address == map1 +
+ *              (Thread-id * RIM_CHUNK_SIZE) + (Word-offset * WORD_SIZE)
+ *
+ * And the sweep_id in the store pattern identifies the time when the
+ * store was performed by the rim_thread.
+ *
+ * We shall use this property in the 3)COMPARE step of the
+ * rim-sequence.
+ */
+#define SWEEP_ID_SHIFT	0
+#define WORD_OFFSET_SHIFT	(SWEEP_ID_BITS)
+#define THREAD_ID_SHIFT		(WORD_OFFSET_BITS + SWEEP_ID_BITS)
+
+/*
+ * Compute the store pattern for a given thread with id @tid, at
+ * location @addr in the sweep identified by @sweep_id
+ */
+static inline unsigned int compute_store_pattern(unsigned int tid,
+						 unsigned int *addr,
+						 unsigned int sweep_id)
+{
+	unsigned int ret = 0;
+	char *start = compute_chunk_start_addr(tid);
+	unsigned int word_offset = compute_word_offset(start, addr);
+
+	ret += (tid & THREAD_ID_MASK) << THREAD_ID_SHIFT;
+	ret += (word_offset & WORD_OFFSET_MASK) << WORD_OFFSET_SHIFT;
+	ret += (sweep_id & SWEEP_ID_MASK) << SWEEP_ID_SHIFT;
+	return ret;
+}
+
+/* Extract the thread-id from the given store-pattern */
+static inline unsigned int extract_tid(unsigned int pattern)
+{
+	unsigned int ret;
+
+	ret = (pattern >> THREAD_ID_SHIFT) & THREAD_ID_MASK;
+	return ret;
+}
+
+/* Extract the word-offset from the given store-pattern */
+static inline unsigned int extract_word_offset(unsigned int pattern)
+{
+	unsigned int ret;
+
+	ret = (pattern >> WORD_OFFSET_SHIFT) & WORD_OFFSET_MASK;
+
+	return ret;
+}
+
+/* Extract the sweep-id from the given store-pattern */
+static inline unsigned int extract_sweep_id(unsigned int pattern)
+{
+	unsigned int ret;
+
+	ret = (pattern >> SWEEP_ID_SHIFT) & SWEEP_ID_MASK;
+
+	return ret;
+}
+
+/************************************************************
+ *                                                          *
+ *          Logging the output of the verification          *
+ *                                                          *
+ ************************************************************/
+#define LOGDIR_NAME_SIZE 100
+static char logdir[LOGDIR_NAME_SIZE];
+
+static FILE *fp[MAX_THREADS];
+static const char logfilename[] = "Thread-%02d-Chunk";
+
+static inline void start_verification_log(unsigned int tid,
+					  unsigned int *addr,
+					  unsigned int cur_sweep_id,
+					  unsigned int prev_sweep_id)
+{
+	FILE *f;
+	char logfile[30];
+	char path[LOGDIR_NAME_SIZE + 30];
+	char separator[2] = "/";
+	char *chunk_start = compute_chunk_start_addr(tid);
+	unsigned int size = RIM_CHUNK_SIZE;
+
+	sprintf(logfile, logfilename, tid);
+	strcpy(path, logdir);
+	strcat(path, separator);
+	strcat(path, logfile);
+	f = fopen(path, "w");
+
+	if (!f) {
+		err_msg("Unable to create logfile\n");
+	}
+
+	fp[tid] = f;
+
+	fprintf(f, "----------------------------------------------------------\n");
+	fprintf(f, "PID                = %d\n", rim_process_pid);
+	fprintf(f, "Thread id          = %02d\n", tid);
+	fprintf(f, "Chunk Start Addr   = 0x%016lx\n", (unsigned long)chunk_start);
+	fprintf(f, "Chunk Size         = %d\n", size);
+	fprintf(f, "Next Store Addr    = 0x%016lx\n", (unsigned long)addr);
+	fprintf(f, "Current sweep-id   = 0x%08x\n", cur_sweep_id);
+	fprintf(f, "Previous sweep-id  = 0x%08x\n", prev_sweep_id);
+	fprintf(f, "----------------------------------------------------------\n");
+}
+
+static inline void log_anamoly(unsigned int tid, unsigned int *addr,
+			       unsigned int expected, unsigned int observed)
+{
+	FILE *f = fp[tid];
+
+	fprintf(f, "Thread %02d: Addr 0x%lx: Expected 0x%x, Observed 0x%x\n",
+	        tid, (unsigned long)addr, expected, observed);
+	fprintf(f, "Thread %02d: Expected Thread id   = %02d\n", tid, extract_tid(expected));
+	fprintf(f, "Thread %02d: Observed Thread id   = %02d\n", tid, extract_tid(observed));
+	fprintf(f, "Thread %02d: Expected Word offset = %03d\n", tid, extract_word_offset(expected));
+	fprintf(f, "Thread %02d: Observed Word offset = %03d\n", tid, extract_word_offset(observed));
+	fprintf(f, "Thread %02d: Expected sweep-id    = 0x%x\n", tid, extract_sweep_id(expected));
+	fprintf(f, "Thread %02d: Observed sweep-id    = 0x%x\n", tid, extract_sweep_id(observed));
+	fprintf(f, "----------------------------------------------------------\n");
+}
+
+static inline void end_verification_log(unsigned int tid, unsigned nr_anamolies)
+{
+	FILE *f = fp[tid];
+	char logfile[30];
+	char path[LOGDIR_NAME_SIZE + 30];
+	char separator[] = "/";
+
+	fclose(f);
+
+	/* Build the path first; it is needed even when removing the log. */
+	sprintf(logfile, logfilename, tid);
+	strcpy(path, logdir);
+	strcat(path, separator);
+	strcat(path, logfile);
+
+	if (nr_anamolies == 0) {
+		remove(path);
+		return;
+	}
+
+	printf("Thread %02d chunk has %d corrupted words. For details check %s\n",
+		tid, nr_anamolies, path);
+}
+
+/*
+ * When a COMPARE step of a rim-sequence fails, the rim_thread informs
+ * everyone else via the shared_memory pointed to by
+ * corruption_found variable. On seeing this, every thread verifies the
+ * content of its chunk as follows.
+ *
+ * Suppose a thread identified with @tid was about to store (but not
+ * yet stored) to @next_store_addr in its current sweep identified
+ * @cur_sweep_id. Let @prev_sweep_id indicate the previous sweep_id.
+ *
+ * This implies that for all the addresses @addr < @next_store_addr,
+ * Thread @tid has already performed a store as part of its current
+ * sweep. Hence we expect the content of such @addr to be:
+ *    |-------------------------------------------------|
+ *    | tid   | word_offset(addr) |    cur_sweep_id     |
+ *    |-------------------------------------------------|
+ *
+ * Since Thread @tid is yet to perform stores on address
+ * @next_store_addr and above, we expect the content of such an
+ * address @addr to be:
+ *    |-------------------------------------------------|
+ *    | tid   | word_offset(addr) |    prev_sweep_id    |
+ *    |-------------------------------------------------|
+ *
+ * The verifier function @verify_chunk does this verification and logs
+ * any anamolies that it finds.
+ */
+static void verify_chunk(unsigned int tid, unsigned int *next_store_addr,
+		  unsigned int cur_sweep_id,
+		  unsigned int prev_sweep_id)
+{
+	unsigned int *iter_ptr;
+	unsigned int size = RIM_CHUNK_SIZE;
+	unsigned int expected;
+	unsigned int observed;
+	char *chunk_start = compute_chunk_start_addr(tid);
+
+	int nr_anamolies = 0;
+
+	start_verification_log(tid, next_store_addr,
+			       cur_sweep_id, prev_sweep_id);
+
+	for (iter_ptr = (unsigned int *)chunk_start;
+	     (unsigned long)iter_ptr < (unsigned long)chunk_start + size;
+	     iter_ptr++) {
+		unsigned int expected_sweep_id;
+
+		if (iter_ptr < next_store_addr) {
+			expected_sweep_id = cur_sweep_id;
+		} else {
+			expected_sweep_id = prev_sweep_id;
+		}
+
+		expected = compute_store_pattern(tid, iter_ptr, expected_sweep_id);
+
+		dcbf((volatile unsigned int*)iter_ptr); //Flush before reading
+		observed = *iter_ptr;
+
+		if (observed != expected) {
+			nr_anamolies++;
+			log_anamoly(tid, iter_ptr, expected, observed);
+		}
+	}
+
+	end_verification_log(tid, nr_anamolies);
+}
+
+static void set_pthread_cpu(pthread_t th, int cpu)
+{
+	cpu_set_t run_cpu_mask;
+	struct sched_param param;
+
+	CPU_ZERO(&run_cpu_mask);
+	CPU_SET(cpu, &run_cpu_mask);
+	pthread_setaffinity_np(th, sizeof(cpu_set_t), &run_cpu_mask);
+
+	param.sched_priority = 1;
+	if (0 && sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
+		/* haven't reproduced with this setting, it kills random preemption which may be a factor */
+		fprintf(stderr, "could not set SCHED_FIFO, run as root?\n");
+	}
+}
+
+static void set_mycpu(int cpu)
+{
+	cpu_set_t run_cpu_mask;
+	struct sched_param param;
+
+	CPU_ZERO(&run_cpu_mask);
+	CPU_SET(cpu, &run_cpu_mask);
+	sched_setaffinity(0, sizeof(cpu_set_t), &run_cpu_mask);
+
+	param.sched_priority = 1;
+	if (0 && sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
+		fprintf(stderr, "could not set SCHED_FIFO, run as root?\n");
+	}
+}
+
+static volatile int segv_wait;
+
+static void segv_handler(int signo, siginfo_t *info, void *extra)
+{
+	while (segv_wait) {
+		sched_yield();
+	}
+
+}
+
+static void set_segv_handler(void)
+{
+	struct sigaction sa;
+
+	sa.sa_flags = SA_SIGINFO;
+	sa.sa_sigaction = segv_handler;
+
+	if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+		perror("sigaction");
+		exit(EXIT_FAILURE);
+	}
+}
+
+int timeout = 0;
+/*
+ * This function is executed by every rim_thread.
+ *
+ * This function performs sweeps over the exclusive chunks of the
+ * rim_threads executing the rim-sequence one word at a time.
+ */
+static void *rim_fn(void *arg)
+{
+	unsigned int tid = *((unsigned int *)arg);
+
+	int size = RIM_CHUNK_SIZE;
+	char *chunk_start = compute_chunk_start_addr(tid);
+
+	unsigned int prev_sweep_id;
+	unsigned int cur_sweep_id = 0;
+
+	/* word access */
+	unsigned int pattern = cur_sweep_id;
+	unsigned int *pattern_ptr = &pattern;
+	unsigned int *w_ptr, read_data;
+
+	set_segv_handler();
+
+	/*
+	 * Let us initialize the chunk:
+	 *
+	 * Each word-aligned address addr in the chunk,
+	 * is initialized to :
+	 *    |-------------------------------------------------|
+	 *    | tid   | word_offset(addr) |         0           |
+	 *    |-------------------------------------------------|
+	 */
+	for (w_ptr = (unsigned int *)chunk_start;
+	     (unsigned long)w_ptr < (unsigned long)(chunk_start) + size;
+	     w_ptr++) {
+
+		*pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id);
+		*w_ptr = *pattern_ptr;
+	}
+
+	while (!corruption_found && !timeout) {
+		prev_sweep_id = cur_sweep_id;
+		cur_sweep_id = cur_sweep_id + 1;
+
+		for (w_ptr = (unsigned int *)chunk_start;
+		     (unsigned long)w_ptr < (unsigned long)(chunk_start) + size;
+		     w_ptr++)  {
+			unsigned int old_pattern;
+
+			/*
+			 * Compute the pattern that we would have
+			 * stored at this location in the previous
+			 * sweep.
+			 */
+			old_pattern = compute_store_pattern(tid, w_ptr, prev_sweep_id);
+
+			/*
+			 * FLUSH:Ensure that we flush the contents of
+			 *       the cache before loading
+			 */
+			dcbf((volatile unsigned int*)w_ptr); //Flush
+
+			/* LOAD: Read the value */
+			read_data = *w_ptr; //Load
+
+			/*
+			 * COMPARE: Is it the same as what we had stored
+			 *          in the previous sweep ? It better be!
+			 */
+			if (read_data != old_pattern) {
+				/* No it isn't! Tell everyone */
+				corruption_found = 1;
+			}
+
+			/*
+			 * Before performing a store, let us check if
+			 * any rim_thread has found a corruption.
+			 */
+			if (corruption_found || timeout) {
+				/*
+				 * Yes. Someone (including us!) has found
+				 * a corruption :(
+				 *
+				 * Let us verify that our chunk is
+				 * correct.
+				 */
+				/* But first, let us allow the dust to settle down! */
+				verify_chunk(tid, w_ptr, cur_sweep_id, prev_sweep_id);
+
+				return 0;
+			}
+
+			/*
+			 * Compute the new pattern that we are going
+			 * to write to this location
+			 */
+			*pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id);
+
+			/*
+			 * STORE: Now let us write this pattern into
+			 *        the location
+			 */
+			*w_ptr = *pattern_ptr;
+		}
+	}
+
+	return NULL;
+}
+
+
+static unsigned long start_cpu = 0;
+static unsigned long nrthreads = 4;
+
+static pthread_t mem_snapshot_thread;
+
+static void *mem_snapshot_fn(void *arg)
+{
+	int page_size = getpagesize();
+	size_t size = page_size;
+	void *tmp = malloc(size);
+
+	while (!corruption_found && !timeout) {
+		/* Stop memory migration once corruption is found */
+		segv_wait = 1;
+
+		mprotect(map1, size, PROT_READ);
+
+		/*
+		 * Load from the working alias (map1). Loading from map2
+		 * also fails.
+		 */
+		memcpy(tmp, map1, size);
+
+		/*
+		 * Stores must go via map2 which has write permissions, but
+		 * the corrupted data tends to be seen in the snapshot buffer,
+		 * so corruption does not appear to be introduced at the
+		 * copy-back via map2 alias here.
+		 */
+		memcpy(map2, tmp, size);
+		/*
+		 * Before releasing other threads, must ensure the copy
+		 * back via map2 has completed.
+		 */
+		asm volatile("sync" ::: "memory");
+		mprotect(map1, size, PROT_READ|PROT_WRITE);
+		asm volatile("sync" ::: "memory");
+		segv_wait = 0;
+
+		usleep(1); /* This value makes a big difference */
+	}
+
+	return 0;
+}
+
+void alrm_sighandler(int sig)
+{
+	timeout = 1;
+}
+
+int main(int argc, char *argv[])
+{
+	int c;
+	int page_size = getpagesize();
+	time_t now;
+	int i, dir_error;
+	pthread_attr_t attr;
+	key_t shm_key = (key_t) getpid();
+	int shmid, run_time = 20 * 60;
+	struct sigaction sa_alrm;
+
+	snprintf(logdir, LOGDIR_NAME_SIZE,
+		 "/tmp/logdir-%u", (unsigned int)getpid());
+	while ((c = getopt(argc, argv, "r:hn:l:t:")) != -1) {
+		switch(c) {
+		case 'r':
+			start_cpu = strtoul(optarg, NULL, 10);
+			break;
+		case 'h':
+			printf("%s [-r <start_cpu>] [-n <nrthreads>] [-l <logdir>] [-t <timeout>]\n", argv[0]);
+			exit(0);
+			break;
+		case 'n':
+			nrthreads = strtoul(optarg, NULL, 10);
+			break;
+		case 'l':
+			strncpy(logdir, optarg, LOGDIR_NAME_SIZE);
+			break;
+		case 't':
+			run_time = strtoul(optarg, NULL, 10);
+			break;
+		default:
+			printf("invalid option\n");
+			exit(0);
+			break;
+		}
+	}
+
+	if (nrthreads > MAX_THREADS)
+		nrthreads = MAX_THREADS;
+
+	shmid = shmget(shm_key, page_size, IPC_CREAT|0666);
+	if (shmid < 0) {
+		err_msg("Failed shmget\n");
+	}
+
+	map1 = shmat(shmid, NULL, 0);
+	if (map1 == (void *) -1) {
+		err_msg("Failed shmat");
+	}
+
+	map2 = shmat(shmid, NULL, 0);
+	if (map2 == (void *) -1) {
+		err_msg("Failed shmat");
+	}
+
+	dir_error = mkdir(logdir, 0755);
+
+	if (dir_error) {
+		err_msg("Failed mkdir");
+	}
+
+	printf("start_cpu list:%lu\n", start_cpu);
+	printf("number of worker threads:%lu + 1 snapshot thread\n", nrthreads);
+	printf("Allocated address:0x%016lx + secondary map:0x%016lx\n", (unsigned long)map1, (unsigned long)map2);
+	printf("logdir at : %s\n", logdir);
+	printf("Timeout: %d seconds\n", run_time);
+
+	time(&now);
+	printf("=================================\n");
+	printf("     Starting Test\n");
+	printf("     %s", ctime(&now));
+	printf("=================================\n");
+
+	for (i = 0; i < nrthreads; i++) {
+		if (1 && !fork()) {
+			prctl(PR_SET_PDEATHSIG, SIGKILL);
+			set_mycpu(start_cpu + i);
+			for (;;)
+				sched_yield();
+			exit(0);
+		}
+	}
+
+
+	sa_alrm.sa_handler = &alrm_sighandler;
+	sigemptyset(&sa_alrm.sa_mask);
+	sa_alrm.sa_flags = 0;
+
+	if (sigaction(SIGALRM, &sa_alrm, 0) == -1) {
+		err_msg("Failed signal handler registration\n");
+	}
+
+	alarm(run_time);
+
+	pthread_attr_init(&attr);
+	for (i = 0; i < nrthreads; i++) {
+		rim_thread_ids[i] = i;
+		pthread_create(&rim_threads[i], &attr, rim_fn, &rim_thread_ids[i]);
+		set_pthread_cpu(rim_threads[i], start_cpu + i);
+	}
+
+	pthread_create(&mem_snapshot_thread, &attr, mem_snapshot_fn, map1);
+	set_pthread_cpu(mem_snapshot_thread, start_cpu + i);
+
+
+	pthread_join(mem_snapshot_thread, NULL);
+	for (i = 0; i < nrthreads; i++) {
+		pthread_join(rim_threads[i], NULL);
+	}
+
+	if (!timeout) {
+		time(&now);
+		printf("=================================\n");
+		printf("      Data Corruption Detected\n");
+		printf("      %s", ctime(&now));
+		printf("      See logfiles in %s\n", logdir);
+		printf("=================================\n");
+		return 1;
+	}
+	return 0;
+}
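
For reference, with the constants in tlbie_test.c: RIM_CHUNK_SIZE = 1024 and WORD_SIZE = 4 give WORDS_PER_CHUNK = 256, hence WORD_OFFSET_BITS = 8 and SWEEP_ID_BITS = 32 - (8 + 8) = 16, so a store pattern is tid<<24 | word_offset<<16 | sweep_id. A standalone check of one encode/decode round trip:

	#include <assert.h>

	int main(void)
	{
		unsigned int tid = 3, off = 5, sweep = 7;
		unsigned int pattern = (tid << 24) | (off << 16) | sweep;

		assert(pattern == 0x03050007);
		assert((pattern >> 24) == tid);		/* THREAD_ID_BITS  */
		assert(((pattern >> 16) & 0xff) == off);/* WORD_OFFSET_BITS */
		assert((pattern & 0xffff) == sweep);	/* SWEEP_ID_BITS   */
		return 0;
	}
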
diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore
index 951fe85..98f2708 100644
--- a/tools/testing/selftests/powerpc/tm/.gitignore
+++ b/tools/testing/selftests/powerpc/tm/.gitignore
@@ -17,3 +17,4 @@
 tm-unavailable
 tm-trap
 tm-sigreturn
+tm-poison
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
index c0734ed..b15a1a3 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -5,7 +5,7 @@
 TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \
 	tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \
 	$(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt \
-	tm-signal-context-force-tm
+	tm-signal-context-force-tm tm-poison
 
 top_srcdir = ../../../../..
 include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/tm/tm-poison.c b/tools/testing/selftests/powerpc/tm/tm-poison.c
new file mode 100644
index 0000000..9775584
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-poison.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2019, Gustavo Romero, Michael Neuling, IBM Corp.
+ *
+ * This test spawns two processes, both attached to the same CPU
+ * (CPU 0). The child loops writing a known value, called poison, to FP
+ * register f31 and VMX/VEC/Altivec register vr31, calling the
+ * sched_yield syscall afterwards to let the parent take the CPU. The
+ * parent sets f31 and vr31 to 1 and checks in a loop that they remain
+ * 1 as expected, up to a given timeout (2m). If the issue is present,
+ * the child's poison will leak into the parent's f31 or vr31 registers;
+ * otherwise it never will.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <sched.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <inttypes.h>
+
+#include "tm.h"
+
+int tm_poison_test(void)
+{
+	int pid;
+	cpu_set_t cpuset;
+	uint64_t poison = 0xdeadbeefc0dec0fe;
+	uint64_t unknown = 0;
+	bool fail_fp = false;
+	bool fail_vr = false;
+
+	SKIP_IF(!have_htm());
+
+	/* Attach both Child and Parent to CPU 0 */
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	sched_setaffinity(0, sizeof(cpuset), &cpuset);
+
+	pid = fork();
+	if (!pid) {
+		/**
+		 * child
+		 */
+		while (1) {
+			sched_yield();
+			asm (
+				"mtvsrd 31, %[poison];" // f31 = poison
+				"mtvsrd 63, %[poison];" // vr31 = poison
+
+				: : [poison] "r" (poison) : );
+		}
+	}
+
+	/**
+	 * parent
+	 */
+	asm (
+		/*
+		 * Set r3, r4, and f31 to known value 1 before entering
+		 * in transaction. They won't be written after that.
+		 */
+		"       li      3, 0x1          ;"
+		"       li      4, 0x1          ;"
+		"       mtvsrd  31, 4           ;"
+
+		/*
+		 * The Time Base (TB) is a 64-bit counter register that is
+		 * independent of the CPU clock and which is incremented
+		 * at a frequency of 512000000 Hz, i.e. every 1.953125 ns.
+		 * Hence 120 s / 1.953125 ns = 61440000000 increments are
+		 * needed for a 2 minute timeout. Below we set that value
+		 * in r5, record the initial TB value in r6, and read the
+		 * current TB into r7 at every iteration, comparing it to
+		 * r6. Once r7 (current) - r6 (initial) > 61440000000 we
+		 * bail out, since we have surely spent 2 minutes looping.
+		 * SPR 268 is the TB register.
+		 */
+		"       lis     5, 14           ;"
+		"       ori     5, 5, 19996     ;"
+		"       sldi    5, 5, 16        ;" // r5 = 61440000000
+
+		"       mfspr   6, 268          ;" // r6 (TB initial)
+		"1:     mfspr   7, 268          ;" // r7 (TB current)
+		"       subf    7, 6, 7         ;" // r7 - r6 > 61440000000 ?
+		"       cmpd    7, 5            ;"
+		"       bgt     3f              ;" // yes, exit
+
+		/*
+		 * Main loop to check f31
+		 */
+		"       tbegin.                 ;" // no, try again
+		"       beq     1b              ;" // restart if no timeout
+		"       mfvsrd  3, 31           ;" // read f31
+		"       cmpd    3, 4            ;" // f31 == 1 ?
+		"       bne     2f              ;" // broken :-(
+		"       tabort. 3               ;" // try another transaction
+		"2:     tend.                   ;" // commit transaction
+		"3:     mr    %[unknown], 3     ;" // record r3
+
+		: [unknown] "=r" (unknown)
+		:
+		: "cr0", "r3", "r4", "r5", "r6", "r7", "vs31"
+
+		);
+
+	/*
+	 * On leak 'unknown' will contain 'poison' value from child,
+	 * otherwise (no leak) 'unknown' will contain the same value
+	 * as r3 before entering in transactional mode, i.e. 0x1.
+	 */
+	fail_fp = unknown != 0x1;
+	if (fail_fp)
+		printf("Unknown value %#"PRIx64" leaked into f31!\n", unknown);
+	else
+		printf("Good, no poison or leaked value into FP registers\n");
+
+	asm (
+		/*
+		 * Set r3, r4, and vr31 to known value 1 before entering
+		 * in transaction. They won't be written after that.
+		 */
+		"       li      3, 0x1          ;"
+		"       li      4, 0x1          ;"
+		"       mtvsrd  63, 4           ;"
+
+		"       lis     5, 14           ;"
+		"       ori     5, 5, 19996     ;"
+		"       sldi    5, 5, 16        ;" // r5 = 61440000000
+
+		"       mfspr   6, 268          ;" // r6 (TB initial)
+		"1:     mfspr   7, 268          ;" // r7 (TB current)
+		"       subf    7, 6, 7         ;" // r7 - r6 > 61440000000 ?
+		"       cmpd    7, 5            ;"
+		"       bgt     3f              ;" // yes, exit
+
+		/*
+		 * Main loop to check vr31
+		 */
+		"       tbegin.                 ;" // no, try again
+		"       beq     1b              ;" // restart if no timeout
+		"       mfvsrd  3, 63           ;" // read vr31
+		"       cmpd    3, 4            ;" // vr31 == 1 ?
+		"       bne     2f              ;" // broken :-(
+		"       tabort. 3               ;" // try another transaction
+		"2:     tend.                   ;" // commit transaction
+		"3:     mr    %[unknown], 3     ;" // record r3
+
+		: [unknown] "=r" (unknown)
+		:
+		: "cr0", "r3", "r4", "r5", "r6", "r7", "vs63"
+
+		);
+
+	/*
+	 * On leak 'unknown' will contain 'poison' value from child,
+	 * otherwise (no leak) 'unknown' will contain the same value
+	 * as r3 before entering in transactional mode, i.e. 0x1.
+	 */
+	fail_vr = unknown != 0x1;
+	if (fail_vr)
+		printf("Unknown value %#"PRIx64" leaked into vr31!\n", unknown);
+	else
+		printf("Good, no poison or leaked value into VEC registers\n");
+
+	kill(pid, SIGKILL);
+
+	return (fail_fp | fail_vr);
+}
+
+int main(int argc, char *argv[])
+{
+	/* Test completes in about 4m */
+	test_harness_set_timeout(250);
+	return test_harness(tm_poison_test, "tm_poison_test");
+}
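
The immediate built by the lis/ori/sldi triplet in both asm blocks above is ((14 << 16) | 19996) << 16; a quick standalone check (on a 64-bit target) that this equals the 2-minute tick count at the 512 MHz timebase:

	#include <assert.h>

	int main(void)
	{
		/* lis 5,14 ; ori 5,5,19996 ; sldi 5,5,16 */
		unsigned long r5 = ((14ul << 16) | 19996) << 16;

		assert(r5 == 61440000000ul);		/* constant in the comment */
		assert(r5 == 120ul * 512000000ul);	/* 120 s at 512 MHz */
		return 0;
	}
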
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 6ef7f16..7f8b5c8 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -199,6 +199,11 @@
 };
 #endif
 
+#ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY
+#define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
+#define PTRACE_EVENTMSG_SYSCALL_EXIT	2
+#endif
+
 #ifndef seccomp
 int seccomp(unsigned int op, unsigned int flags, void *args)
 {
diff --git a/tools/testing/selftests/tpm2/Makefile b/tools/testing/selftests/tpm2/Makefile
index 9dd8484..1a5db1e 100644
--- a/tools/testing/selftests/tpm2/Makefile
+++ b/tools/testing/selftests/tpm2/Makefile
@@ -2,3 +2,4 @@
 include ../lib.mk
 
 TEST_PROGS := test_smoke.sh test_space.sh
+TEST_PROGS_EXTENDED := tpm2.py tpm2_tests.py
diff --git a/tools/testing/selftests/watchdog/watchdog-test.c b/tools/testing/selftests/watchdog/watchdog-test.c
index c2333c7..afff120 100644
--- a/tools/testing/selftests/watchdog/watchdog-test.c
+++ b/tools/testing/selftests/watchdog/watchdog-test.c
@@ -19,7 +19,7 @@
 
 int fd;
 const char v = 'V';
-static const char sopts[] = "bdehp:t:Tn:NL";
+static const char sopts[] = "bdehp:t:Tn:NLf:";
 static const struct option lopts[] = {
 	{"bootstatus",          no_argument, NULL, 'b'},
 	{"disable",             no_argument, NULL, 'd'},
@@ -31,6 +31,7 @@
 	{"pretimeout",    required_argument, NULL, 'n'},
 	{"getpretimeout",       no_argument, NULL, 'N'},
 	{"gettimeleft",		no_argument, NULL, 'L'},
+	{"file",          required_argument, NULL, 'f'},
 	{NULL,                  no_argument, NULL, 0x0}
 };
 
@@ -69,16 +70,19 @@
 static void usage(char *progname)
 {
 	printf("Usage: %s [options]\n", progname);
-	printf(" -b, --bootstatus    Get last boot status (Watchdog/POR)\n");
-	printf(" -d, --disable       Turn off the watchdog timer\n");
-	printf(" -e, --enable        Turn on the watchdog timer\n");
-	printf(" -h, --help          Print the help message\n");
-	printf(" -p, --pingrate=P    Set ping rate to P seconds (default %d)\n", DEFAULT_PING_RATE);
-	printf(" -t, --timeout=T     Set timeout to T seconds\n");
-	printf(" -T, --gettimeout    Get the timeout\n");
-	printf(" -n, --pretimeout=T  Set the pretimeout to T seconds\n");
-	printf(" -N, --getpretimeout Get the pretimeout\n");
-	printf(" -L, --gettimeleft   Get the time left until timer expires\n");
+	printf(" -f, --file\t\tOpen watchdog device file\n");
+	printf("\t\t\tDefault is /dev/watchdog\n");
+	printf(" -b, --bootstatus\tGet last boot status (Watchdog/POR)\n");
+	printf(" -d, --disable\t\tTurn off the watchdog timer\n");
+	printf(" -e, --enable\t\tTurn on the watchdog timer\n");
+	printf(" -h, --help\t\tPrint the help message\n");
+	printf(" -p, --pingrate=P\tSet ping rate to P seconds (default %d)\n",
+	       DEFAULT_PING_RATE);
+	printf(" -t, --timeout=T\tSet timeout to T seconds\n");
+	printf(" -T, --gettimeout\tGet the timeout\n");
+	printf(" -n, --pretimeout=T\tSet the pretimeout to T seconds\n");
+	printf(" -N, --getpretimeout\tGet the pretimeout\n");
+	printf(" -L, --gettimeleft\tGet the time left until timer expires\n");
 	printf("\n");
 	printf("Parameters are parsed left-to-right in real-time.\n");
 	printf("Example: %s -d -t 10 -p 5 -e\n", progname);
@@ -92,14 +96,20 @@
 	int ret;
 	int c;
 	int oneshot = 0;
+	char *file = "/dev/watchdog";
 
 	setbuf(stdout, NULL);
 
-	fd = open("/dev/watchdog", O_WRONLY);
+	while ((c = getopt_long(argc, argv, sopts, lopts, NULL)) != -1) {
+		if (c == 'f')
+			file = optarg;
+	}
+
+	fd = open(file, O_WRONLY);
 
 	if (fd == -1) {
 		if (errno == ENOENT)
-			printf("Watchdog device not enabled.\n");
+			printf("Watchdog device (%s) not found.\n", file);
 		else if (errno == EACCES)
 			printf("Run watchdog as root.\n");
 		else
@@ -108,6 +118,8 @@
 		exit(-1);
 	}
 
+	optind = 0;	/* glibc: also resets getopt's internal scanning state */
+
 	while ((c = getopt_long(argc, argv, sopts, lopts, NULL)) != -1) {
 		switch (c) {
 		case 'b':
@@ -190,6 +202,9 @@
 			else
 				printf("WDIOC_GETTIMELEFT error '%s'\n", strerror(errno));
 			break;
+		case 'f':
+			/* Handled above */
+			break;
 
 		default:
 			usage(argv[0]);
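
The change above turns option handling into a two-pass parse: a first getopt_long() walk extracts only -f, so the right device node is open before any other option issues an ioctl on fd, then optind is rewound and the options are processed for real. Setting optind to 0 rather than POSIX's 1 is the glibc way to also reset getopt's internal state. The pattern in isolation (hypothetical stand-alone program, not part of the patch):

	#include <getopt.h>
	#include <stdio.h>

	static const char sopts[] = "ef:";
	static const struct option lopts[] = {
		{"enable", no_argument,       NULL, 'e'},
		{"file",   required_argument, NULL, 'f'},
		{NULL,     0,                 NULL, 0}
	};

	int main(int argc, char *argv[])
	{
		const char *file = "/dev/watchdog";
		int c;

		/* Pass 1: only -f matters; other options are recognized but skipped. */
		while ((c = getopt_long(argc, argv, sopts, lopts, NULL)) != -1)
			if (c == 'f')
				file = optarg;

		/* open(file, O_WRONLY) would happen here, before anything uses fd. */

		optind = 0;	/* glibc extension: restart and reset getopt state */

		/* Pass 2: act on the options in command-line order. */
		while ((c = getopt_long(argc, argv, sopts, lopts, NULL)) != -1) {
			switch (c) {
			case 'e':
				printf("enabling watchdog on %s\n", file);
				break;
			case 'f':	/* already consumed in pass 1 */
				break;
			}
		}
		return 0;
	}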
diff --git a/usr/Makefile b/usr/Makefile
index 6a89eb0..e6f7cb2 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -11,6 +11,9 @@
 datafile_d_y = .$(datafile_y).d
 AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/$(datafile_y)"
 
+# The clean rules do not see CONFIG_INITRAMFS_COMPRESSION, so clean up the
+# artifacts of all possible compression formats.
+clean-files += initramfs_data.cpio*
 
 # Generate builtin.o based on initramfs_data.o
 obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o
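
Since CONFIG_INITRAMFS_COMPRESSION selects the suffix at build time, the glob has to match every file the build could have produced: initramfs_data.cpio itself plus (with the compressors available here) the .gz, .bz2, .lzma, .xz, .lzo and .lz4 variants, all now removed by "make clean" in usr/.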
diff --git a/usr/include/Makefile b/usr/include/Makefile
index 05c71ef..c9449aaf 100644
--- a/usr/include/Makefile
+++ b/usr/include/Makefile
@@ -38,7 +38,6 @@
 header-test- += linux/jffs2.h
 header-test- += linux/kexec.h
 header-test- += linux/matroxfb.h
-header-test- += linux/netfilter_bridge/ebtables.h
 header-test- += linux/netfilter_ipv4/ipt_LOG.h
 header-test- += linux/netfilter_ipv6/ip6t_LOG.h
 header-test- += linux/nfc.h
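
header-test- is an opt-out list: every exported header not on it must compile standalone as part of the UAPI header test, so this removal promotes linux/netfilter_bridge/ebtables.h to being compile-tested again. Conceptually the harness builds, per header, a translation unit that is nothing but the include (a sketch of the idea, not the literal kbuild rule):

	/* generated <header>.hdrtest.c - the header must be self-contained */
	#include <linux/netfilter_bridge/ebtables.h>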