Merge tag 'dax-fixes-5.6-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull dax fixes from Dan Williams:
"A fix for an xfstest failure, a cleanup, and an update that removes an
fsdax dependency on block devices.
Summary:
- Fix RWF_NOWAIT writes to properly return -EAGAIN
- Clean up an unused helper
- Update dax_writeback_mapping_range to not need a block_device
argument"
* tag 'dax-fixes-5.6-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
dax: pass NOWAIT flag to iomap_apply
dax: Get rid of fs_dax_get_by_host() helper
dax: Pass dax_dev instead of bdev to dax_writeback_mapping_range()
diff --git a/.mailmap b/.mailmap
index 00581c1..ffb8f28 100644
--- a/.mailmap
+++ b/.mailmap
@@ -18,6 +18,7 @@
Aleksandar Markovic <aleksandar.markovic@mips.com> <aleksandar.markovic@imgtec.com>
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@intel.com>
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@linaro.org>
+Alexandre Belloni <alexandre.belloni@bootlin.com> <alexandre.belloni@free-electrons.com>
Alexei Starovoitov <ast@kernel.org> <ast@plumgrid.com>
Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com>
Alexei Starovoitov <ast@kernel.org> <ast@fb.com>
@@ -27,6 +28,8 @@
Andreas Herrmann <aherrman@de.ibm.com>
Andrey Ryabinin <ryabinin.a.a@gmail.com> <a.ryabinin@samsung.com>
Andrew Morton <akpm@linux-foundation.org>
+Andrew Murray <amurray@thegoodpenguin.co.uk> <andrew.murray@arm.com>
+Andrew Murray <amurray@thegoodpenguin.co.uk> <amurray@embedded-bits.co.uk>
Andrew Vasquez <andrew.vasquez@qlogic.com>
Andy Adamson <andros@citi.umich.edu>
Antoine Tenart <antoine.tenart@free-electrons.com>
@@ -74,6 +77,7 @@
Domen Puncer <domen@coderock.org>
Douglas Gilbert <dougg@torque.net>
Ed L. Cashin <ecashin@coraid.com>
+Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com>
Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Felipe W Damasio <felipewd@terra.com.br>
Felix Kuhling <fxkuehl@gmx.de>
@@ -99,6 +103,7 @@
Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk@google.com>
Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk@motorola.com>
Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk.kim@samsung.com>
+Jakub Kicinski <kuba@kernel.org> <jakub.kicinski@netronome.com>
James Bottomley <jejb@mulgrave.(none)>
James Bottomley <jejb@titanic.il.steeleye.com>
James E Wilson <wilson@specifix.com>
@@ -137,6 +142,7 @@
Juha Yrjola <juha.yrjola@nokia.com>
Juha Yrjola <juha.yrjola@solidboot.com>
Julien Thierry <julien.thierry.kdev@gmail.com> <julien.thierry@arm.com>
+Kamil Konieczny <k.konieczny@samsung.com> <k.konieczny@partner.samsung.com>
Kay Sievers <kay.sievers@vrfy.org>
Kenneth W Chen <kenneth.w.chen@intel.com>
Konstantin Khlebnikov <koct9i@gmail.com> <k.khlebnikov@samsung.com>
@@ -152,6 +158,7 @@
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
+Lukasz Luba <lukasz.luba@arm.com> <l.luba@partner.samsung.com>
Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
Marc Zyngier <maz@kernel.org> <marc.zyngier@arm.com>
Marcin Nowakowski <marcin.nowakowski@mips.com> <marcin.nowakowski@imgtec.com>
@@ -207,6 +214,10 @@
Patrick Mochel <mochel@digitalimplant.org>
Paul Burton <paulburton@kernel.org> <paul.burton@imgtec.com>
Paul Burton <paulburton@kernel.org> <paul.burton@mips.com>
+Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.ibm.com>
+Paul E. McKenney <paulmck@kernel.org> <paulmck@linux.vnet.ibm.com>
+Paul E. McKenney <paulmck@kernel.org> <paul.mckenney@linaro.org>
+Paul E. McKenney <paulmck@kernel.org> <paulmck@us.ibm.com>
Peter A Jonsson <pj@ludd.ltu.se>
Peter Oruba <peter@oruba.de>
Peter Oruba <peter.oruba@amd.com>
@@ -215,6 +226,7 @@
Punit Agrawal <punitagrawal@gmail.com> <punit.agrawal@arm.com>
Qais Yousef <qsyousef@gmail.com> <qais.yousef@imgtec.com>
Quentin Perret <qperret@qperret.net> <quentin.perret@arm.com>
+Rafael J. Wysocki <rjw@rjwysocki.net> <rjw@sisk.pl>
Rajesh Shah <rajesh.shah@intel.com>
Ralf Baechle <ralf@linux-mips.org>
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
@@ -250,6 +262,7 @@
Tejun Heo <htejun@gmail.com>
Thomas Graf <tgraf@suug.ch>
Thomas Pedersen <twp@codeaurora.org>
+Tiezhu Yang <yangtiezhu@loongson.cn> <kernelpatch@126.com>
Todor Tomov <todor.too@gmail.com> <todor.tomov@linaro.org>
Tony Luck <tony.luck@intel.com>
TripleX Chung <xxx.phy@gmail.com> <zhongyu@18mail.cn>
@@ -265,6 +278,7 @@
Viresh Kumar <vireshk@kernel.org> <viresh.kumar@st.com>
Viresh Kumar <vireshk@kernel.org> <viresh.linux@gmail.com>
Viresh Kumar <vireshk@kernel.org> <viresh.kumar2@arm.com>
+Vivien Didelot <vivien.didelot@gmail.com> <vivien.didelot@savoirfairelinux.com>
Vlad Dogaru <ddvlad@gmail.com> <vlad.dogaru@intel.com>
Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@virtuozzo.com>
Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@parallels.com>
diff --git a/CREDITS b/CREDITS
index 9602b0f..a97d328 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3302,7 +3302,9 @@
N: Aleksa Sarai
E: cyphar@cyphar.com
W: https://www.cyphar.com/
-D: `pids` cgroup subsystem
+D: /sys/fs/cgroup/pids
+D: openat2(2)
+S: Sydney, Australia
N: Dipankar Sarma
E: dipankar@in.ibm.com
diff --git a/Documentation/ABI/obsolete/sysfs-selinux-disable b/Documentation/ABI/obsolete/sysfs-selinux-disable
new file mode 100644
index 0000000..c340278
--- /dev/null
+++ b/Documentation/ABI/obsolete/sysfs-selinux-disable
@@ -0,0 +1,26 @@
+What: /sys/fs/selinux/disable
+Date: April 2005 (predates git)
+KernelVersion: 2.6.12-rc2 (predates git)
+Contact: selinux@vger.kernel.org
+Description:
+
+ The selinuxfs "disable" node allows SELinux to be disabled at runtime
+ prior to a policy being loaded into the kernel. If disabled via this
+ mechanism, SELinux will remain disabled until the system is rebooted.
+
+ The preferred method of disabling SELinux is via the "selinux=0" boot
+ parameter, but the selinuxfs "disable" node was created to make it
+ easier for systems with primitive bootloaders that did not allow for
+ easy modification of the kernel command line. Unfortunately, allowing
+ for SELinux to be disabled at runtime makes it difficult to secure the
+ kernel's LSM hooks using the "__ro_after_init" feature.
+
+ Thankfully, the need for the SELinux runtime disable appears to be
+ gone, the default Kconfig configuration disables this selinuxfs node,
+ and only one of the major distributions, Fedora, supports disabling
+ SELinux at runtime. Fedora is in the process of removing the
+ selinuxfs "disable" node and once that is complete we will start the
+ slow process of removing this code from the kernel.
+
+ More information on /sys/fs/selinux/disable can be found under the
+ CONFIG_SECURITY_SELINUX_DISABLE Kconfig option.
diff --git a/Documentation/ABI/stable/sysfs-class-tpm b/Documentation/ABI/stable/sysfs-class-tpm
index c0e2383..58e94e7 100644
--- a/Documentation/ABI/stable/sysfs-class-tpm
+++ b/Documentation/ABI/stable/sysfs-class-tpm
@@ -1,7 +1,7 @@
What: /sys/class/tpm/tpmX/device/
Date: April 2005
KernelVersion: 2.6.12
-Contact: tpmdd-devel@lists.sf.net
+Contact: linux-integrity@vger.kernel.org
Description: The device/ directory under a specific TPM instance exposes
the properties of that TPM chip
@@ -9,7 +9,7 @@
What: /sys/class/tpm/tpmX/device/active
Date: April 2006
KernelVersion: 2.6.17
-Contact: tpmdd-devel@lists.sf.net
+Contact: linux-integrity@vger.kernel.org
Description: The "active" property prints a '1' if the TPM chip is accepting
commands. An inactive TPM chip still contains all the state of
an active chip (Storage Root Key, NVRAM, etc), and can be
@@ -21,7 +21,7 @@
What: /sys/class/tpm/tpmX/device/cancel
Date: June 2005
KernelVersion: 2.6.13
-Contact: tpmdd-devel@lists.sf.net
+Contact: linux-integrity@vger.kernel.org
Description: The "cancel" property allows you to cancel the currently
pending TPM command. Writing any value to cancel will call the
TPM vendor specific cancel operation.
@@ -29,7 +29,7 @@
What: /sys/class/tpm/tpmX/device/caps
Date: April 2005
KernelVersion: 2.6.12
-Contact: tpmdd-devel@lists.sf.net
+Contact: linux-integrity@vger.kernel.org
Description: The "caps" property contains TPM manufacturer and version info.
Example output:
@@ -46,7 +46,7 @@
What: /sys/class/tpm/tpmX/device/durations
Date: March 2011
KernelVersion: 3.1
-Contact: tpmdd-devel@lists.sf.net
+Contact: linux-integrity@vger.kernel.org
Description: The "durations" property shows the 3 vendor-specific values
used to wait for a short, medium and long TPM command. All
TPM commands are categorized as short, medium or long in
@@ -69,7 +69,7 @@
What: /sys/class/tpm/tpmX/device/enabled
Date: April 2006
KernelVersion: 2.6.17
-Contact: tpmdd-devel@lists.sf.net
+Contact: linux-integrity@vger.kernel.org
Description: The "enabled" property prints a '1' if the TPM chip is enabled,
meaning that it should be visible to the OS. This property
may be visible but produce a '0' after some operation that
@@ -78,7 +78,7 @@
What: /sys/class/tpm/tpmX/device/owned
Date: April 2006
KernelVersion: 2.6.17
-Contact: tpmdd-devel@lists.sf.net
+Contact: linux-integrity@vger.kernel.org
Description: The "owned" property produces a '1' if the TPM_TakeOwnership
ordinal has been executed successfully in the chip. A '0'
indicates that ownership hasn't been taken.
@@ -86,7 +86,7 @@
What: /sys/class/tpm/tpmX/device/pcrs
Date: April 2005
KernelVersion: 2.6.12
-Contact: tpmdd-devel@lists.sf.net
+Contact: linux-integrity@vger.kernel.org
Description: The "pcrs" property will dump the current value of all Platform
Configuration Registers in the TPM. Note that since these
values may be constantly changing, the output is only valid
@@ -109,7 +109,7 @@
What: /sys/class/tpm/tpmX/device/pubek
Date: April 2005
KernelVersion: 2.6.12
-Contact: tpmdd-devel@lists.sf.net
+Contact: linux-integrity@vger.kernel.org
Description: The "pubek" property will return the TPM's public endorsement
key if possible. If the TPM has had ownership established and
is version 1.2, the pubek will not be available without the
@@ -161,7 +161,7 @@
What: /sys/class/tpm/tpmX/device/temp_deactivated
Date: April 2006
KernelVersion: 2.6.17
-Contact: tpmdd-devel@lists.sf.net
+Contact: linux-integrity@vger.kernel.org
Description: The "temp_deactivated" property returns a '1' if the chip has
been temporarily deactivated, usually until the next power
cycle. Whether a warm boot (reboot) will clear a TPM chip
@@ -170,7 +170,7 @@
What: /sys/class/tpm/tpmX/device/timeouts
Date: March 2011
KernelVersion: 3.1
-Contact: tpmdd-devel@lists.sf.net
+Contact: linux-integrity@vger.kernel.org
Description: The "timeouts" property shows the 4 vendor-specific values
for the TPM's interface spec timeouts. The use of these
timeouts is defined by the TPM interface spec that the chip
@@ -183,3 +183,14 @@
The four timeout values are shown in usecs, with a trailing
"[original]" or "[adjusted]" depending on whether the values
were scaled by the driver to be reported in usec from msecs.
+
+What: /sys/class/tpm/tpmX/tpm_version_major
+Date: October 2019
+KernelVersion: 5.5
+Contact: linux-integrity@vger.kernel.org
+Description: The "tpm_version_major" property shows the TCG spec major version
+ implemented by the TPM device.
+
+ Example output:
+
+ 2
diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd
new file mode 100644
index 0000000..f4be46cc
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd
@@ -0,0 +1,171 @@
+What: sys/bus/dsa/devices/dsa<m>/cdev_major
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The major number that the character device driver assigned to
+ this device.
+
+What: sys/bus/dsa/devices/dsa<m>/errors
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The error information for this device.
+
+What: sys/bus/dsa/devices/dsa<m>/max_batch_size
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The largest number of work descriptors in a batch.
+
+What: sys/bus/dsa/devices/dsa<m>/max_work_queues_size
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The maximum work queue size supported by this device.
+
+What: sys/bus/dsa/devices/dsa<m>/max_engines
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The maximum number of engines supported by this device.
+
+What: sys/bus/dsa/devices/dsa<m>/max_groups
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The maximum number of groups that can be created under this device.
+
+What: sys/bus/dsa/devices/dsa<m>/max_tokens
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The total number of bandwidth tokens supported by this device.
+ The bandwidth tokens represent resources within the DSA
+ implementation, and these resources are allocated by engines to
+ support operations.
+
+What: sys/bus/dsa/devices/dsa<m>/max_transfer_size
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The number of bytes to be read from the source address to
+ perform the operation. The maximum transfer size is dependent on
+ the workqueue the descriptor was submitted to.
+
+What: sys/bus/dsa/devices/dsa<m>/max_work_queues
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The maximum work queue number that this device supports.
+
+What: sys/bus/dsa/devices/dsa<m>/numa_node
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The numa node number for this device.
+
+What: sys/bus/dsa/devices/dsa<m>/op_cap
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The operation capability bit mask specifies the operation types
+ supported by this device.
+
+What: sys/bus/dsa/devices/dsa<m>/state
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The state information of this device. It can be either enabled
+ or disabled.
+
+What: sys/bus/dsa/devices/dsa<m>/group<m>.<n>
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The assigned group under this device.
+
+What: sys/bus/dsa/devices/dsa<m>/engine<m>.<n>
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The assigned engine under this device.
+
+What: sys/bus/dsa/devices/dsa<m>/wq<m>.<n>
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The assigned work queue under this device.
+
+What: sys/bus/dsa/devices/dsa<m>/configurable
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: To indicate if this device is configurable or not.
+
+What: sys/bus/dsa/devices/dsa<m>/token_limit
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The maximum number of bandwidth tokens that may be in use at
+ one time by operations that access low bandwidth memory in the
+ device.
+
+What: sys/bus/dsa/devices/wq<m>.<n>/group_id
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The group id that this work queue belongs to.
+
+What: sys/bus/dsa/devices/wq<m>.<n>/size
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The work queue size for this work queue.
+
+What: sys/bus/dsa/devices/wq<m>.<n>/type
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The type of this work queue, it can be "kernel" type for work
+ queue usages in the kernel space or "user" type for work queue
+ usages by applications in user space.
+
+What: sys/bus/dsa/devices/wq<m>.<n>/cdev_minor
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The minor number assigned to this work queue by the character
+ device driver.
+
+What: sys/bus/dsa/devices/wq<m>.<n>/mode
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The work queue mode type for this work queue.
+
+What: sys/bus/dsa/devices/wq<m>.<n>/priority
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The priority value of this work queue, it is a value relative to
+ other work queues in the same group to control quality of service
+ for dispatching work from multiple workqueues in the same group.
+
+What: sys/bus/dsa/devices/wq<m>.<n>/state
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The current state of the work queue.
+
+What: sys/bus/dsa/devices/wq<m>.<n>/threshold
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The number of entries in this work queue that may be filled
+ via a limited portal.
+
+What: sys/bus/dsa/devices/engine<m>.<n>/group_id
+Date: Oct 25, 2019
+KernelVersion: 5.6.0
+Contact: dmaengine@vger.kernel.org
+Description: The group that this engine belongs to.
diff --git a/Documentation/ABI/stable/sysfs-driver-mlxreg-io b/Documentation/ABI/stable/sysfs-driver-mlxreg-io
index 8ca4984..b0d90cc 100644
--- a/Documentation/ABI/stable/sysfs-driver-mlxreg-io
+++ b/Documentation/ABI/stable/sysfs-driver-mlxreg-io
@@ -1,5 +1,4 @@
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/asic_health
-
Date: June 2018
KernelVersion: 4.19
Contact: Vadim Pasternak <vadimpmellanox.com>
@@ -19,7 +18,6 @@
The files are read only.
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/fan_dir
-
Date: December 2018
KernelVersion: 5.0
Contact: Vadim Pasternak <vadimpmellanox.com>
@@ -29,18 +27,16 @@
The files are read only.
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/jtag_enable
-
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld3_version
Date: November 2018
KernelVersion: 5.0
Contact: Vadim Pasternak <vadimpmellanox.com>
Description: These files show with which CPLD versions have been burned
- on LED board.
+ on LED or Gearbox board.
The files are read only.
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/jtag_enable
-
Date: November 2018
KernelVersion: 5.0
Contact: Vadim Pasternak <vadimpmellanox.com>
@@ -108,7 +104,6 @@
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_from_comex
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_system
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_voltmon_upgrade_fail
-
Date: November 2018
KernelVersion: 5.0
Contact: Vadim Pasternak <vadimpmellanox.com>
@@ -121,6 +116,21 @@
The files are read only.
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld4_version
+Date: November 2018
+KernelVersion: 5.0
+Contact: Vadim Pasternak <vadimpmellanox.com>
+Description: These files show with which CPLD versions have been burned
+ on LED board.
+
+ The files are read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_thermal
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_wd
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_from_asic
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_reload_bios
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_sff_wd
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_swb_wd
Date: June 2019
KernelVersion: 5.3
Contact: Vadim Pasternak <vadimpmellanox.com>
@@ -134,9 +144,65 @@
The files are read only.
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_thermal
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_wd
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_from_asic
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_reload_bios
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_sff_wd
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_swb_wd
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/config1
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/config2
+Date: January 2020
+KernelVersion: 5.6
+Contact: Vadim Pasternak <vadimpmellanox.com>
+Description: These files show system static topology identification
+ like system's static I2C topology, number and type of FPGA
+ devices within the system and so on.
+
+ The files are read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_ac_pwr_fail
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_platform
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_soc
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_sw_pwr_off
+Date: January 2020
+KernelVersion: 5.6
+Contact: Vadim Pasternak <vadimpmellanox.com>
+Description: These files show the system reset causes, as following: reset
+ due to AC power failure, reset invoked from software by
+ assertion reset signal through CPLD, reset caused by signal
+ asserted by SOC through ACPI register, reset invoked from
+ software by assertion power off signal through CPLD.
+
+ The files are read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/pcie_asic_reset_dis
+Date: January 2020
+KernelVersion: 5.6
+Contact: Vadim Pasternak <vadimpmellanox.com>
+Description: This file allows the ASIC to be kept up during PCIe root complex
+ reset, when the attribute is set to 1.
+
+ The file is read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/vpd_wp
+Date: January 2020
+KernelVersion: 5.6
+Contact: Vadim Pasternak <vadimpmellanox.com>
+Description: This file allows to overwrite system VPD hardware write
+ protection when attribute is set 1.
+
+ The file is read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/voltreg_update_status
+Date: January 2020
+KernelVersion: 5.6
+Contact: Vadim Pasternak <vadimpmellanox.com>
+Description: This file exposes the configuration update status of burnable
+ voltage regulator devices. The status values are as following:
+ 0 - OK; 1 - CRC failure; 2 - I2C failure; 3 - in progress.
+
+ The file is read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/ufm_version
+Date: January 2020
+KernelVersion: 5.6
+Contact: Vadim Pasternak <vadimpmellanox.com>
+Description: This file exposes the firmware version of burnable voltage
+ regulator devices.
+
+ The file is read only.
diff --git a/Documentation/ABI/testing/configfs-usb-gadget b/Documentation/ABI/testing/configfs-usb-gadget
index 95a3658..4594cc2 100644
--- a/Documentation/ABI/testing/configfs-usb-gadget
+++ b/Documentation/ABI/testing/configfs-usb-gadget
@@ -16,6 +16,10 @@
write UDC's name found in /sys/class/udc/*
to bind a gadget, empty string "" to unbind.
+ max_speed - maximum speed the driver supports. Valid
+ names are super-speed-plus, super-speed,
+ high-speed, full-speed, and low-speed.
+
bDeviceClass - USB device class code
bDeviceSubClass - USB device subclass code
bDeviceProtocol - USB device protocol code
diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy
index 29aaedf..cd57291 100644
--- a/Documentation/ABI/testing/ima_policy
+++ b/Documentation/ABI/testing/ima_policy
@@ -25,11 +25,11 @@
lsm: [[subj_user=] [subj_role=] [subj_type=]
[obj_user=] [obj_role=] [obj_type=]]
option: [[appraise_type=]] [template=] [permit_directio]
- [appraise_flag=]
+ [appraise_flag=] [keyrings=]
base: func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
[FIRMWARE_CHECK]
[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
- [KEXEC_CMDLINE]
+ [KEXEC_CMDLINE] [KEY_CHECK]
mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
[[^]MAY_EXEC]
fsmagic:= hex value
@@ -42,6 +42,9 @@
appraise_flag:= [check_blacklist]
Currently, blacklist check is only for files signed with appended
signature.
+ keyrings:= list of keyrings
+ (eg, .builtin_trusted_keys|.ima). Only valid
+ when action is "measure" and func is KEY_CHECK.
template:= name of a defined IMA template type
(eg, ima-ng). Only valid when action is "measure".
pcr:= decimal value
@@ -113,3 +116,12 @@
Example of appraise rule allowing modsig appended signatures:
appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig
+
+ Example of measure rule using KEY_CHECK to measure all keys:
+
+ measure func=KEY_CHECK
+
+ Example of measure rule using KEY_CHECK to only measure
+ keys added to .builtin_trusted_keys or .ima keyring:
+
+ measure func=KEY_CHECK keyrings=.builtin_trusted_keys|.ima
diff --git a/Documentation/ABI/testing/rtc-cdev b/Documentation/ABI/testing/rtc-cdev
index 9744728..25910c3 100644
--- a/Documentation/ABI/testing/rtc-cdev
+++ b/Documentation/ABI/testing/rtc-cdev
@@ -33,6 +33,14 @@
Requires a separate RTC_PIE_ON call to enable the periodic
interrupts.
+ * RTC_VL_READ: Read the voltage inputs status of the RTC when
+ supported. The value is a bit field of RTC_VL_*, giving the
+ status of the main and backup voltages.
+
+ * RTC_VL_CLEAR: Clear the voltage status of the RTC. Some RTCs
+ need user interaction when the backup power provider is
+ replaced or charged to be able to clear the status.
+
The ioctl() calls supported by the older /dev/rtc interface are
also supported by the newer RTC class framework. However,
because the chips and systems are not standardized, some PC/AT
diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index faaa216..d3e53a6 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -1726,3 +1726,16 @@
Description:
List of valid periods (in seconds) for which the light intensity
must be above the threshold level before interrupt is asserted.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_filter_notch_center_frequency
+KernelVersion: 5.5
+Contact: linux-iio@vger.kernel.org
+Description:
+ Center frequency in Hz for a notch filter. Used e.g. for line
+ noise suppression.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_temp_thermocouple_type
+KernelVersion: 5.5
+Contact: linux-iio@vger.kernel.org
+Description:
+ One of the following thermocouple types: B, E, J, K, N, R, S, T.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-dma-buffer b/Documentation/ABI/testing/sysfs-bus-iio-dma-buffer
new file mode 100644
index 0000000..d526e65
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-dma-buffer
@@ -0,0 +1,19 @@
+What: /sys/bus/iio/devices/iio:deviceX/buffer/length_align_bytes
+KernelVersion: 5.4
+Contact: linux-iio@vger.kernel.org
+Description:
+ DMA buffers tend to have an alignment requirement for the
+ buffers. If this alignment requirement is not met samples might
+ be dropped from the buffer.
+
+ This property reports the alignment requirements in bytes.
+ This means that the buffer size in bytes needs to be an integer
+ multiple of the number reported by this file.
+
+ The alignment requirements in number of sample sets will depend
+ on the enabled channels and the bytes per channel. This means
+ that the alignment requirement in sample sets might change
+ depending on which and how many channels are enabled. Whereas
+ the alignment requirement reported in bytes by this property
+ will remain static and does not depend on which channels are
+ enabled.
diff --git a/Documentation/ABI/testing/sysfs-bus-mdio b/Documentation/ABI/testing/sysfs-bus-mdio
new file mode 100644
index 0000000..da86efc
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-mdio
@@ -0,0 +1,63 @@
+What: /sys/bus/mdio_bus/devices/.../statistics/
+Date: January 2020
+KernelVersion: 5.6
+Contact: netdev@vger.kernel.org
+Description:
+ This folder contains global and per MDIO bus address
+ statistics.
+
+What: /sys/bus/mdio_bus/devices/.../statistics/transfers
+Date: January 2020
+KernelVersion: 5.6
+Contact: netdev@vger.kernel.org
+Description:
+ Total number of transfers for this MDIO bus.
+
+What: /sys/bus/mdio_bus/devices/.../statistics/errors
+Date: January 2020
+KernelVersion: 5.6
+Contact: netdev@vger.kernel.org
+Description:
+ Total number of transfer errors for this MDIO bus.
+
+What: /sys/bus/mdio_bus/devices/.../statistics/writes
+Date: January 2020
+KernelVersion: 5.6
+Contact: netdev@vger.kernel.org
+Description:
+ Total number of write transactions for this MDIO bus.
+
+What: /sys/bus/mdio_bus/devices/.../statistics/reads
+Date: January 2020
+KernelVersion: 5.6
+Contact: netdev@vger.kernel.org
+Description:
+ Total number of read transactions for this MDIO bus.
+
+What: /sys/bus/mdio_bus/devices/.../statistics/transfers_<addr>
+Date: January 2020
+KernelVersion: 5.6
+Contact: netdev@vger.kernel.org
+Description:
+ Total number of transfers for this MDIO bus address.
+
+What: /sys/bus/mdio_bus/devices/.../statistics/errors_<addr>
+Date: January 2020
+KernelVersion: 5.6
+Contact: netdev@vger.kernel.org
+Description:
+ Total number of transfer errors for this MDIO bus address.
+
+What: /sys/bus/mdio_bus/devices/.../statistics/writes_<addr>
+Date: January 2020
+KernelVersion: 5.6
+Contact: netdev@vger.kernel.org
+Description:
+ Total number of write transactions for this MDIO bus address.
+
+What: /sys/bus/mdio_bus/devices/.../statistics/reads_<addr>
+Date: January 2020
+KernelVersion: 5.6
+Contact: netdev@vger.kernel.org
+Description:
+ Total number of read transactions for this MDIO bus address.
diff --git a/Documentation/ABI/testing/sysfs-class-devfreq b/Documentation/ABI/testing/sysfs-class-devfreq
index 01196e1..9758eb8 100644
--- a/Documentation/ABI/testing/sysfs-class-devfreq
+++ b/Documentation/ABI/testing/sysfs-class-devfreq
@@ -7,6 +7,13 @@
The name of devfreq object denoted as ... is same as the
name of device using devfreq.
+What: /sys/class/devfreq/.../name
+Date: November 2019
+Contact: Chanwoo Choi <cw00.choi@samsung.com>
+Description:
+ The /sys/class/devfreq/.../name shows the name of device
+ of the corresponding devfreq object.
+
What: /sys/class/devfreq/.../governor
Date: September 2011
Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
@@ -48,12 +55,15 @@
Date: October 2012
Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
Description:
- This ABI shows the statistics of devfreq behavior on a
- specific device. It shows the time spent in each state and
- the number of transitions between states.
+ This ABI shows or clears the statistics of devfreq behavior
+ on a specific device. It shows the time spent in each state
+ and the number of transitions between states.
In order to activate this ABI, the devfreq target device
driver should provide the list of available frequencies
- with its profile.
+ with its profile. To reset the statistics of devfreq
+ behavior on a specific device, write 0 (zero) to 'trans_stat'
+ as follows:
+ echo 0 > /sys/class/devfreq/.../trans_stat
What: /sys/class/devfreq/.../userspace/set_freq
Date: September 2011
diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power
index 27edc06..bf3b48f 100644
--- a/Documentation/ABI/testing/sysfs-class-power
+++ b/Documentation/ABI/testing/sysfs-class-power
@@ -189,7 +189,8 @@
Access: Read
Valid values: "Unknown", "Good", "Overheat", "Dead",
"Over voltage", "Unspecified failure", "Cold",
- "Watchdog timer expire", "Safety timer expire"
+ "Watchdog timer expire", "Safety timer expire",
+ "Over current"
What: /sys/class/power_supply/<supply_name>/precharge_current
Date: June 2017
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index fc20cde..2e0e3b4 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -196,6 +196,12 @@
does not reflect it. Likewise, if one enables a deep state but a
lighter state still is disabled, then this has no effect.
+What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/default_status
+Date: December 2019
+KernelVersion: v5.6
+Contact: Linux power management list <linux-pm@vger.kernel.org>
+Description:
+ (RO) The default status of this state, "enabled" or "disabled".
What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/residency
Date: March 2014
diff --git a/Documentation/ABI/testing/sysfs-driver-pciback b/Documentation/ABI/testing/sysfs-driver-pciback
index 6a733bf..73308c2 100644
--- a/Documentation/ABI/testing/sysfs-driver-pciback
+++ b/Documentation/ABI/testing/sysfs-driver-pciback
@@ -11,3 +11,16 @@
#echo 00:19.0-E0:2:FF > /sys/bus/pci/drivers/pciback/quirks
will allow the guest to read and write to the configuration
register 0x0E.
+
+What: /sys/bus/pci/drivers/pciback/allow_interrupt_control
+Date: Jan 2020
+KernelVersion: 5.6
+Contact: xen-devel@lists.xenproject.org
+Description:
+ List of devices which can have interrupt control flag (INTx,
+ MSI, MSI-X) set by a connected guest. It is meant to be set
+ only when the guest is a stubdomain hosting device model (qemu)
+ and the actual device is assigned to an HVM. It is not safe
+ (similar to the permissive attribute) to set for a device assigned
+ to a PV guest. The device is automatically removed from this
+ list when the connected pcifront terminates.
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkback b/Documentation/ABI/testing/sysfs-driver-xen-blkback
index 4e7babb3..ecb7942 100644
--- a/Documentation/ABI/testing/sysfs-driver-xen-blkback
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkback
@@ -25,3 +25,13 @@
allocated without being in use. The time is in
seconds, 0 means indefinitely long.
The default is 60 seconds.
+
+What: /sys/module/xen_blkback/parameters/buffer_squeeze_duration_ms
+Date: December 2019
+KernelVersion: 5.6
+Contact: SeongJae Park <sjpark@amazon.de>
+Description:
+ When memory pressure is reported to blkback this option
+ controls the duration in milliseconds that blkback will not
+ cache any page not backed by a grant mapping.
+ The default is 10ms.
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index aedeae1..1a6cd539 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -1,37 +1,40 @@
What: /sys/fs/f2fs/<disk>/gc_max_sleep_time
Date: July 2013
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
-Description:
- Controls the maximun sleep time for gc_thread. Time
- is in milliseconds.
+Description: Controls the maximum sleep time for gc_thread. Time
+ is in milliseconds.
What: /sys/fs/f2fs/<disk>/gc_min_sleep_time
Date: July 2013
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
-Description:
- Controls the minimum sleep time for gc_thread. Time
- is in milliseconds.
+Description: Controls the minimum sleep time for gc_thread. Time
+ is in milliseconds.
What: /sys/fs/f2fs/<disk>/gc_no_gc_sleep_time
Date: July 2013
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
-Description:
- Controls the default sleep time for gc_thread. Time
- is in milliseconds.
+Description: Controls the default sleep time for gc_thread. Time
+ is in milliseconds.
What: /sys/fs/f2fs/<disk>/gc_idle
Date: July 2013
Contact: "Namjae Jeon" <namjae.jeon@samsung.com>
-Description:
- Controls the victim selection policy for garbage collection.
+Description: Controls the victim selection policy for garbage collection.
+ Setting gc_idle = 0(default) will disable this option. Setting
+ gc_idle = 1 will select the Cost Benefit approach & setting
+ gc_idle = 2 will select the greedy approach.
What: /sys/fs/f2fs/<disk>/reclaim_segments
Date: October 2013
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
-Description:
- Controls the issue rate of segment discard commands.
+Description: This parameter controls the number of prefree segments to be
+ reclaimed. If the number of prefree segments is larger than
+ the number of segments in the proportion to the percentage
+ over total volume size, f2fs tries to conduct checkpoint to
+ reclaim the prefree segments to free segments.
+ By default, 5% over total # of segments.
-What: /sys/fs/f2fs/<disk>/max_blkaddr
+What: /sys/fs/f2fs/<disk>/main_blkaddr
Date: November 2019
Contact: "Ramon Pantin" <pantin@google.com>
Description:
@@ -40,227 +43,278 @@
What: /sys/fs/f2fs/<disk>/ipu_policy
Date: November 2013
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
-Description:
- Controls the in-place-update policy.
+Description: Controls the policy for in-place
+ updates in f2fs. User can set:
+ 0x01: F2FS_IPU_FORCE, 0x02: F2FS_IPU_SSR,
+ 0x04: F2FS_IPU_UTIL, 0x08: F2FS_IPU_SSR_UTIL,
+ 0x10: F2FS_IPU_FSYNC, 0x20: F2FS_IPU_ASYNC,
+ 0x40: F2FS_IPU_NOCACHE.
+ Refer to segment.h for details.
What: /sys/fs/f2fs/<disk>/min_ipu_util
Date: November 2013
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
-Description:
- Controls the FS utilization condition for the in-place-update
- policies.
+Description: Controls the FS utilization condition for the in-place-update
+ policies. It is used by F2FS_IPU_UTIL and F2FS_IPU_SSR_UTIL policies.
What: /sys/fs/f2fs/<disk>/min_fsync_blocks
Date: September 2014
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
-Description:
- Controls the dirty page count condition for the in-place-update
- policies.
+Description: Controls the dirty page count condition for the in-place-update
+ policies.
What: /sys/fs/f2fs/<disk>/min_seq_blocks
Date: August 2018
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
-Description:
- Controls the dirty page count condition for batched sequential
- writes in ->writepages.
-
+Description: Controls the dirty page count condition for batched sequential
+ writes in writepages.
What: /sys/fs/f2fs/<disk>/min_hot_blocks
Date: March 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
-Description:
- Controls the dirty page count condition for redefining hot data.
+Description: Controls the dirty page count condition for redefining hot data.
What: /sys/fs/f2fs/<disk>/min_ssr_sections
Date: October 2017
Contact: "Chao Yu" <yuchao0@huawei.com>
-Description:
- Controls the fee section threshold to trigger SSR allocation.
+Description: Controls the free section threshold to trigger SSR allocation.
+ If this is large, SSR mode will be enabled early.
What: /sys/fs/f2fs/<disk>/max_small_discards
Date: November 2013
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
-Description:
- Controls the issue rate of small discard commands.
+Description: Controls the issue rate of discard commands that consist of small
+ blocks less than 2MB. The candidates to be discarded are cached until
+ checkpoint is triggered, and issued during the checkpoint.
+ By default, it is disabled with 0.
-What: /sys/fs/f2fs/<disk>/discard_granularity
-Date: July 2017
-Contact: "Chao Yu" <yuchao0@huawei.com>
-Description:
- Controls discard granularity of inner discard thread, inner thread
+What: /sys/fs/f2fs/<disk>/discard_granularity
+Date: July 2017
+Contact: "Chao Yu" <yuchao0@huawei.com>
+Description: Controls discard granularity of inner discard thread. Inner thread
will not issue discards with size that is smaller than granularity.
- The unit size is one block, now only support configuring in range
- of [1, 512].
+ The unit size is one block(4KB), now only support configuring
+ in range of [1, 512]. Default value is 4(=16KB).
-What: /sys/fs/f2fs/<disk>/umount_discard_timeout
-Date: January 2019
-Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
-Description:
- Set timeout to issue discard commands during umount.
- Default: 5 secs
+What: /sys/fs/f2fs/<disk>/umount_discard_timeout
+Date: January 2019
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description: Set timeout to issue discard commands during umount.
+ Default: 5 secs
What: /sys/fs/f2fs/<disk>/max_victim_search
Date: January 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
-Description:
- Controls the number of trials to find a victim segment.
+Description: Controls the number of trials to find a victim segment
+ when conducting SSR and cleaning operations. The default value
+ is 4096 which covers 8GB block address range.
What: /sys/fs/f2fs/<disk>/migration_granularity
Date: October 2018
Contact: "Chao Yu" <yuchao0@huawei.com>
-Description:
- Controls migration granularity of garbage collection on large
- section, it can let GC move partial segment{s} of one section
- in one GC cycle, so that dispersing heavy overhead GC to
- multiple lightweight one.
+Description: Controls migration granularity of garbage collection on large
+ sections. It lets GC move partial segment{s} of one section
+ in one GC cycle, thereby dispersing one heavy-overhead GC into
+ multiple lightweight ones.
What: /sys/fs/f2fs/<disk>/dir_level
Date: March 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
-Description:
- Controls the directory level for large directory.
+Description: Controls the directory level for large directory. If a
+ directory has a number of files, it can reduce the file lookup
+ latency by increasing this dir_level value. Otherwise, it
+ needs to decrease this value to reduce the space overhead.
+ The default value is 0.
What: /sys/fs/f2fs/<disk>/ram_thresh
Date: March 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
-Description:
- Controls the memory footprint used by f2fs.
+Description: Controls the memory footprint used by free nids and cached
+ nat entries. By default, 1 is set, which indicates
+ 10 MB / 1 GB RAM.
What: /sys/fs/f2fs/<disk>/batched_trim_sections
Date: February 2015
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
-Description:
- Controls the trimming rate in batch mode.
- <deprecated>
+Description: Controls the trimming rate in batch mode.
+ <deprecated>
What: /sys/fs/f2fs/<disk>/cp_interval
Date: October 2015
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
-Description:
- Controls the checkpoint timing.
+Description: Controls the checkpoint timing, set to 60 seconds by default.
What: /sys/fs/f2fs/<disk>/idle_interval
Date: January 2016
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
-Description:
- Controls the idle timing for all paths other than
- discard and gc path.
+Description: Controls the idle timing of system, if there is no FS operation
+ during given interval.
+ Set to 5 seconds by default.
What: /sys/fs/f2fs/<disk>/discard_idle_interval
Date: September 2018
Contact: "Chao Yu" <yuchao0@huawei.com>
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
-Description:
- Controls the idle timing for discard path.
+Description: Controls the idle timing of discard thread given
+ this time interval.
+ Default is 5 secs.
What: /sys/fs/f2fs/<disk>/gc_idle_interval
Date: September 2018
Contact: "Chao Yu" <yuchao0@huawei.com>
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
-Description:
- Controls the idle timing for gc path.
+Description: Controls the idle timing for gc path. Set to 5 seconds by default.
What: /sys/fs/f2fs/<disk>/iostat_enable
Date: August 2017
Contact: "Chao Yu" <yuchao0@huawei.com>
-Description:
- Controls to enable/disable IO stat.
+Description: Controls to enable/disable IO stat.
What: /sys/fs/f2fs/<disk>/ra_nid_pages
Date: October 2015
Contact: "Chao Yu" <chao2.yu@samsung.com>
-Description:
- Controls the count of nid pages to be readaheaded.
+Description: Controls the count of nid pages to be readaheaded.
+ When building free nids, F2FS reads NAT blocks ahead for
+ speed up. Default is 0.
What: /sys/fs/f2fs/<disk>/dirty_nats_ratio
Date: January 2016
Contact: "Chao Yu" <chao2.yu@samsung.com>
-Description:
- Controls dirty nat entries ratio threshold, if current
- ratio exceeds configured threshold, checkpoint will
- be triggered for flushing dirty nat entries.
+Description: Controls dirty nat entries ratio threshold, if current
+ ratio exceeds configured threshold, checkpoint will
+ be triggered for flushing dirty nat entries.
What: /sys/fs/f2fs/<disk>/lifetime_write_kbytes
Date: January 2016
Contact: "Shuoran Liu" <liushuoran@huawei.com>
-Description:
- Shows total written kbytes issued to disk.
+Description: Shows total written kbytes issued to disk.
What: /sys/fs/f2fs/<disk>/features
Date: July 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
-Description:
- Shows all enabled features in current device.
+Description: Shows all enabled features in current device.
What: /sys/fs/f2fs/<disk>/inject_rate
Date: May 2016
Contact: "Sheng Yong" <shengyong1@huawei.com>
-Description:
- Controls the injection rate.
+Description: Controls the injection rate of arbitrary faults.
What: /sys/fs/f2fs/<disk>/inject_type
Date: May 2016
Contact: "Sheng Yong" <shengyong1@huawei.com>
-Description:
- Controls the injection type.
+Description: Controls the injection type of arbitrary faults.
+
+What: /sys/fs/f2fs/<disk>/dirty_segments
+Date: October 2017
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description: Shows the number of dirty segments.
What: /sys/fs/f2fs/<disk>/reserved_blocks
Date: June 2017
Contact: "Chao Yu" <yuchao0@huawei.com>
-Description:
- Controls target reserved blocks in system, the threshold
- is soft, it could exceed current available user space.
+Description: Controls target reserved blocks in system, the threshold
+ is soft, it could exceed current available user space.
What: /sys/fs/f2fs/<disk>/current_reserved_blocks
Date: October 2017
Contact: "Yunlong Song" <yunlong.song@huawei.com>
Contact: "Chao Yu" <yuchao0@huawei.com>
-Description:
- Shows current reserved blocks in system, it may be temporarily
- smaller than target_reserved_blocks, but will gradually
- increase to target_reserved_blocks when more free blocks are
- freed by user later.
+Description: Shows current reserved blocks in system, it may be temporarily
+ smaller than target_reserved_blocks, but will gradually
+ increase to target_reserved_blocks when more free blocks are
+ freed by user later.
What: /sys/fs/f2fs/<disk>/gc_urgent
Date: August 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
-Description:
- Do background GC agressively
+Description: Do background GC aggressively when set. When gc_urgent = 1,
+ background thread starts to do GC by given gc_urgent_sleep_time
+ interval. It is set to 0 by default.
What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
Date: August 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
-Description:
- Controls sleep time of GC urgent mode
+Description: Controls sleep time of GC urgent mode. Set to 500ms by default.
What: /sys/fs/f2fs/<disk>/readdir_ra
Date: November 2017
Contact: "Sheng Yong" <shengyong1@huawei.com>
-Description:
- Controls readahead inode block in readdir.
+Description: Controls readahead inode block in readdir. Enabled by default.
+
+What: /sys/fs/f2fs/<disk>/gc_pin_file_thresh
+Date: January 2018
+Contact: Jaegeuk Kim <jaegeuk@kernel.org>
+Description: This indicates how many GC attempts can fail for the pinned
+ file. If it exceeds this, F2FS doesn't guarantee its pinning
+ state. 2048 trials is set by default.
What: /sys/fs/f2fs/<disk>/extension_list
Date: Feburary 2018
Contact: "Chao Yu" <yuchao0@huawei.com>
-Description:
- Used to control configure extension list:
- - Query: cat /sys/fs/f2fs/<disk>/extension_list
- - Add: echo '[h/c]extension' > /sys/fs/f2fs/<disk>/extension_list
- - Del: echo '[h/c]!extension' > /sys/fs/f2fs/<disk>/extension_list
- - [h] means add/del hot file extension
- - [c] means add/del cold file extension
+Description: Used to configure the extension list:
+ - Query: cat /sys/fs/f2fs/<disk>/extension_list
+ - Add: echo '[h/c]extension' > /sys/fs/f2fs/<disk>/extension_list
+ - Del: echo '[h/c]!extension' > /sys/fs/f2fs/<disk>/extension_list
+ - [h] means add/del hot file extension
+ - [c] means add/del cold file extension
What: /sys/fs/f2fs/<disk>/unusable
Date April 2019
Contact: "Daniel Rosenberg" <drosen@google.com>
-Description:
- If checkpoint=disable, it displays the number of blocks that are unusable.
- If checkpoint=enable it displays the enumber of blocks that would be unusable
- if checkpoint=disable were to be set.
+Description: If checkpoint=disable, it displays the number of blocks that
+ are unusable.
+ If checkpoint=enable it displays the number of blocks that
+ would be unusable if checkpoint=disable were to be set.
What: /sys/fs/f2fs/<disk>/encoding
Date July 2019
Contact: "Daniel Rosenberg" <drosen@google.com>
-Description:
- Displays name and version of the encoding set for the filesystem.
- If no encoding is set, displays (none)
+Description: Displays name and version of the encoding set for the filesystem.
+ If no encoding is set, displays (none)
+
+What: /sys/fs/f2fs/<disk>/free_segments
+Date: September 2019
+Contact: "Hridya Valsaraju" <hridya@google.com>
+Description: Number of free segments in disk.
+
+What: /sys/fs/f2fs/<disk>/cp_foreground_calls
+Date: September 2019
+Contact: "Hridya Valsaraju" <hridya@google.com>
+Description: Number of checkpoint operations performed on demand. Available when
+ CONFIG_F2FS_STAT_FS=y.
+
+What: /sys/fs/f2fs/<disk>/cp_background_calls
+Date: September 2019
+Contact: "Hridya Valsaraju" <hridya@google.com>
+Description: Number of checkpoint operations performed in the background to
+ free segments. Available when CONFIG_F2FS_STAT_FS=y.
+
+What: /sys/fs/f2fs/<disk>/gc_foreground_calls
+Date: September 2019
+Contact: "Hridya Valsaraju" <hridya@google.com>
+Description: Number of garbage collection operations performed on demand.
+ Available when CONFIG_F2FS_STAT_FS=y.
+
+What: /sys/fs/f2fs/<disk>/gc_background_calls
+Date: September 2019
+Contact: "Hridya Valsaraju" <hridya@google.com>
+Description: Number of garbage collection operations triggered in background.
+ Available when CONFIG_F2FS_STAT_FS=y.
+
+What: /sys/fs/f2fs/<disk>/moved_blocks_foreground
+Date: September 2019
+Contact: "Hridya Valsaraju" <hridya@google.com>
+Description: Number of blocks moved by garbage collection in foreground.
+ Available when CONFIG_F2FS_STAT_FS=y.
+
+What: /sys/fs/f2fs/<disk>/moved_blocks_background
+Date: September 2019
+Contact: "Hridya Valsaraju" <hridya@google.com>
+Description: Number of blocks moved by garbage collection in background.
+ Available when CONFIG_F2FS_STAT_FS=y.
+
+What: /sys/fs/f2fs/<disk>/avg_vblocks
+Date: September 2019
+Contact: "Hridya Valsaraju" <hridya@google.com>
+Description: Average number of valid blocks.
+ Available when CONFIG_F2FS_STAT_FS=y.
diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi
index 9e99f29..1efac0d 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi
+++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi
@@ -46,3 +46,13 @@
* 0 - normal,
* 1 - overboost,
* 2 - silent
+
+What: /sys/devices/platform/<platform>/throttle_thermal_policy
+Date: Dec 2019
+KernelVersion: 5.6
+Contact: "Leonid Maksymchuk" <leonmaxx@gmail.com>
+Description:
+ Throttle thermal policy mode:
+ * 0 - default,
+ * 1 - overboost,
+ * 2 - silent
diff --git a/Documentation/ABI/testing/sysfs-platform-mellanox-bootctl b/Documentation/ABI/testing/sysfs-platform-mellanox-bootctl
index c65a805..401d202 100644
--- a/Documentation/ABI/testing/sysfs-platform-mellanox-bootctl
+++ b/Documentation/ABI/testing/sysfs-platform-mellanox-bootctl
@@ -1,4 +1,4 @@
-What: /sys/bus/platform/devices/MLNXBF04:00/driver/lifecycle_state
+What: /sys/bus/platform/devices/MLNXBF04:00/lifecycle_state
Date: Oct 2019
KernelVersion: 5.5
Contact: "Liming Sun <lsun@mellanox.com>"
@@ -10,7 +10,7 @@
GA Non-Secured - Non-Secure chip and not able to change state
RMA - Return Merchandise Authorization
-What: /sys/bus/platform/devices/MLNXBF04:00/driver/post_reset_wdog
+What: /sys/bus/platform/devices/MLNXBF04:00/post_reset_wdog
Date: Oct 2019
KernelVersion: 5.5
Contact: "Liming Sun <lsun@mellanox.com>"
@@ -19,7 +19,7 @@
to reboot the chip and recover it to the old state if the new
boot partition fails.
-What: /sys/bus/platform/devices/MLNXBF04:00/driver/reset_action
+What: /sys/bus/platform/devices/MLNXBF04:00/reset_action
Date: Oct 2019
KernelVersion: 5.5
Contact: "Liming Sun <lsun@mellanox.com>"
@@ -30,7 +30,7 @@
emmc - boot from the onchip eMMC
emmc_legacy - boot from the onchip eMMC in legacy (slow) mode
-What: /sys/bus/platform/devices/MLNXBF04:00/driver/second_reset_action
+What: /sys/bus/platform/devices/MLNXBF04:00/second_reset_action
Date: Oct 2019
KernelVersion: 5.5
Contact: "Liming Sun <lsun@mellanox.com>"
@@ -44,7 +44,7 @@
swap_emmc - swap the primary / secondary boot partition
none - cancel the action
-What: /sys/bus/platform/devices/MLNXBF04:00/driver/secure_boot_fuse_state
+What: /sys/bus/platform/devices/MLNXBF04:00/secure_boot_fuse_state
Date: Oct 2019
KernelVersion: 5.5
Contact: "Liming Sun <lsun@mellanox.com>"
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index 6f87b9d..5e6ead2 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -407,3 +407,16 @@
Description:
The /sys/power/suspend_stats/last_failed_step file contains
the last failed step in the suspend/resume path.
+
+What: /sys/power/sync_on_suspend
+Date: October 2019
+Contact: Jonas Meurer <jonas@freesources.org>
+Description:
+ This file controls whether or not the kernel will sync()
+ filesystems during system suspend (after freezing user space
+ and before suspending devices).
+
+ Writing a "1" to this file enables the sync() and writing a "0"
+ disables it. Reads from the file return the current value.
+ The default is "1" if the build-time "SUSPEND_SKIP_SYNC" config
+ flag is unset, or "0" otherwise.
diff --git a/Documentation/ABI/testing/usb-charger-uevent b/Documentation/ABI/testing/usb-charger-uevent
new file mode 100644
index 0000000..419a92d
--- /dev/null
+++ b/Documentation/ABI/testing/usb-charger-uevent
@@ -0,0 +1,46 @@
+What: Raise a uevent when a USB charger is inserted or removed
+Date: 2020-01-14
+KernelVersion: 5.6
+Contact: linux-usb@vger.kernel.org
+Description: There are two USB charger states:
+ USB_CHARGER_ABSENT
+ USB_CHARGER_PRESENT
+ There are five USB charger types:
+ USB_CHARGER_UNKNOWN_TYPE: Charger type is unknown
+ USB_CHARGER_SDP_TYPE: Standard Downstream Port
+ USB_CHARGER_CDP_TYPE: Charging Downstream Port
+ USB_CHARGER_DCP_TYPE: Dedicated Charging Port
+ USB_CHARGER_ACA_TYPE: Accessory Charging Adapter
+ https://www.usb.org/document-library/battery-charging-v12-spec-and-adopters-agreement
+
+ Here are two examples taken using udevadm monitor -p when
+ USB charger is online:
+ UDEV change /devices/soc0/usbphynop1 (platform)
+ ACTION=change
+ DEVPATH=/devices/soc0/usbphynop1
+ DRIVER=usb_phy_generic
+ MODALIAS=of:Nusbphynop1T(null)Cusb-nop-xceiv
+ OF_COMPATIBLE_0=usb-nop-xceiv
+ OF_COMPATIBLE_N=1
+ OF_FULLNAME=/usbphynop1
+ OF_NAME=usbphynop1
+ SEQNUM=2493
+ SUBSYSTEM=platform
+ USB_CHARGER_STATE=USB_CHARGER_PRESENT
+ USB_CHARGER_TYPE=USB_CHARGER_SDP_TYPE
+ USEC_INITIALIZED=227422826
+
+ USB charger is offline:
+ KERNEL change /devices/soc0/usbphynop1 (platform)
+ ACTION=change
+ DEVPATH=/devices/soc0/usbphynop1
+ DRIVER=usb_phy_generic
+ MODALIAS=of:Nusbphynop1T(null)Cusb-nop-xceiv
+ OF_COMPATIBLE_0=usb-nop-xceiv
+ OF_COMPATIBLE_N=1
+ OF_FULLNAME=/usbphynop1
+ OF_NAME=usbphynop1
+ SEQNUM=2494
+ SUBSYSTEM=platform
+ USB_CHARGER_STATE=USB_CHARGER_ABSENT
+ USB_CHARGER_TYPE=USB_CHARGER_UNKNOWN_TYPE
diff --git a/Documentation/PCI/msi-howto.rst b/Documentation/PCI/msi-howto.rst
index 994cbb6..aa2046a 100644
--- a/Documentation/PCI/msi-howto.rst
+++ b/Documentation/PCI/msi-howto.rst
@@ -283,5 +283,5 @@
to bridges between the PCI root and the device, MSIs are disabled.
It is also worth checking the device driver to see whether it supports MSIs.
-For example, it may contain calls to pci_irq_alloc_vectors() with the
+For example, it may contain calls to pci_alloc_irq_vectors() with the
PCI_IRQ_MSI or PCI_IRQ_MSIX flags.
diff --git a/Documentation/RCU/NMI-RCU.rst b/Documentation/RCU/NMI-RCU.rst
new file mode 100644
index 0000000..1809583
--- /dev/null
+++ b/Documentation/RCU/NMI-RCU.rst
@@ -0,0 +1,124 @@
+.. _NMI_rcu_doc:
+
+Using RCU to Protect Dynamic NMI Handlers
+=========================================
+
+
+Although RCU is usually used to protect read-mostly data structures,
+it is possible to use RCU to provide dynamic non-maskable interrupt
+handlers, as well as dynamic irq handlers. This document describes
+how to do this, drawing loosely from Zwane Mwaikambo's NMI-timer
+work in "arch/x86/oprofile/nmi_timer_int.c" and in
+"arch/x86/kernel/traps.c".
+
+The relevant pieces of code are listed below, each followed by a
+brief explanation::
+
+ static int dummy_nmi_callback(struct pt_regs *regs, int cpu)
+ {
+ return 0;
+ }
+
+The dummy_nmi_callback() function is a "dummy" NMI handler that does
+nothing, but returns zero, thus saying that it did nothing, allowing
+the NMI handler to take the default machine-specific action::
+
+ static nmi_callback_t nmi_callback = dummy_nmi_callback;
+
+This nmi_callback variable is a global function pointer to the current
+NMI handler::
+
+ void do_nmi(struct pt_regs * regs, long error_code)
+ {
+ int cpu;
+
+ nmi_enter();
+
+ cpu = smp_processor_id();
+ ++nmi_count(cpu);
+
+ if (!rcu_dereference_sched(nmi_callback)(regs, cpu))
+ default_do_nmi(regs);
+
+ nmi_exit();
+ }
+
+The do_nmi() function processes each NMI. It first disables preemption
+in the same way that a hardware irq would, then increments the per-CPU
+count of NMIs. It then invokes the NMI handler stored in the nmi_callback
+function pointer. If this handler returns zero, do_nmi() invokes the
+default_do_nmi() function to handle a machine-specific NMI. Finally,
+preemption is restored.
+
+In theory, rcu_dereference_sched() is not needed, since this code runs
+only on i386, which in theory does not need rcu_dereference_sched()
+anyway. However, in practice it is a good documentation aid, particularly
+for anyone attempting to do something similar on Alpha or on systems
+with aggressive optimizing compilers.
+
+Quick Quiz:
+ Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only?
+
+:ref:`Answer to Quick Quiz <answer_quick_quiz_NMI>`
+
+Back to the discussion of NMI and RCU::
+
+ void set_nmi_callback(nmi_callback_t callback)
+ {
+ rcu_assign_pointer(nmi_callback, callback);
+ }
+
+The set_nmi_callback() function registers an NMI handler. Note that any
+data that is to be used by the callback must be initialized up -before-
+the call to set_nmi_callback(). On architectures that do not order
+writes, the rcu_assign_pointer() ensures that the NMI handler sees the
+initialized values::
+
+ void unset_nmi_callback(void)
+ {
+ rcu_assign_pointer(nmi_callback, dummy_nmi_callback);
+ }
+
+This function unregisters an NMI handler, restoring the original
+dummy_nmi_callback(). However, there may well be an NMI handler
+currently executing on some other CPU. We therefore cannot free
+up any data structures used by the old NMI handler until execution
+of it completes on all other CPUs.
+
+One way to accomplish this is via synchronize_rcu(), perhaps as
+follows::
+
+ unset_nmi_callback();
+ synchronize_rcu();
+ kfree(my_nmi_data);
+
+This works because (as of v4.20) synchronize_rcu() blocks until all
+CPUs complete any preemption-disabled segments of code that they were
+executing.
+Since NMI handlers disable preemption, synchronize_rcu() is guaranteed
+not to return until all ongoing NMI handlers exit. It is therefore safe
+to free up the handler's data as soon as synchronize_rcu() returns.
+
+Important note: for this to work, the architecture in question must
+invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
+
+.. _answer_quick_quiz_NMI:
+
+Answer to Quick Quiz:
+ Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only?
+
+ The caller to set_nmi_callback() might well have
+ initialized some data that is to be used by the new NMI
+ handler. In this case, the rcu_dereference_sched() would
+ be needed, because otherwise a CPU that received an NMI
+ just after the new handler was set might see the pointer
+ to the new NMI handler, but the old pre-initialized
+ version of the handler's data.
+
+ This same sad story can happen on other CPUs when using
+ a compiler with aggressive pointer-value speculation
+ optimizations.
+
+ More important, the rcu_dereference_sched() makes it
+ clear to someone reading the code that the pointer is
+ being protected by RCU-sched.
diff --git a/Documentation/RCU/NMI-RCU.txt b/Documentation/RCU/NMI-RCU.txt
deleted file mode 100644
index 881353f..0000000
--- a/Documentation/RCU/NMI-RCU.txt
+++ /dev/null
@@ -1,121 +0,0 @@
-Using RCU to Protect Dynamic NMI Handlers
-
-
-Although RCU is usually used to protect read-mostly data structures,
-it is possible to use RCU to provide dynamic non-maskable interrupt
-handlers, as well as dynamic irq handlers. This document describes
-how to do this, drawing loosely from Zwane Mwaikambo's NMI-timer
-work in "arch/x86/oprofile/nmi_timer_int.c" and in
-"arch/x86/kernel/traps.c".
-
-The relevant pieces of code are listed below, each followed by a
-brief explanation.
-
- static int dummy_nmi_callback(struct pt_regs *regs, int cpu)
- {
- return 0;
- }
-
-The dummy_nmi_callback() function is a "dummy" NMI handler that does
-nothing, but returns zero, thus saying that it did nothing, allowing
-the NMI handler to take the default machine-specific action.
-
- static nmi_callback_t nmi_callback = dummy_nmi_callback;
-
-This nmi_callback variable is a global function pointer to the current
-NMI handler.
-
- void do_nmi(struct pt_regs * regs, long error_code)
- {
- int cpu;
-
- nmi_enter();
-
- cpu = smp_processor_id();
- ++nmi_count(cpu);
-
- if (!rcu_dereference_sched(nmi_callback)(regs, cpu))
- default_do_nmi(regs);
-
- nmi_exit();
- }
-
-The do_nmi() function processes each NMI. It first disables preemption
-in the same way that a hardware irq would, then increments the per-CPU
-count of NMIs. It then invokes the NMI handler stored in the nmi_callback
-function pointer. If this handler returns zero, do_nmi() invokes the
-default_do_nmi() function to handle a machine-specific NMI. Finally,
-preemption is restored.
-
-In theory, rcu_dereference_sched() is not needed, since this code runs
-only on i386, which in theory does not need rcu_dereference_sched()
-anyway. However, in practice it is a good documentation aid, particularly
-for anyone attempting to do something similar on Alpha or on systems
-with aggressive optimizing compilers.
-
-Quick Quiz: Why might the rcu_dereference_sched() be necessary on Alpha,
- given that the code referenced by the pointer is read-only?
-
-
-Back to the discussion of NMI and RCU...
-
- void set_nmi_callback(nmi_callback_t callback)
- {
- rcu_assign_pointer(nmi_callback, callback);
- }
-
-The set_nmi_callback() function registers an NMI handler. Note that any
-data that is to be used by the callback must be initialized up -before-
-the call to set_nmi_callback(). On architectures that do not order
-writes, the rcu_assign_pointer() ensures that the NMI handler sees the
-initialized values.
-
- void unset_nmi_callback(void)
- {
- rcu_assign_pointer(nmi_callback, dummy_nmi_callback);
- }
-
-This function unregisters an NMI handler, restoring the original
-dummy_nmi_handler(). However, there may well be an NMI handler
-currently executing on some other CPU. We therefore cannot free
-up any data structures used by the old NMI handler until execution
-of it completes on all other CPUs.
-
-One way to accomplish this is via synchronize_rcu(), perhaps as
-follows:
-
- unset_nmi_callback();
- synchronize_rcu();
- kfree(my_nmi_data);
-
-This works because (as of v4.20) synchronize_rcu() blocks until all
-CPUs complete any preemption-disabled segments of code that they were
-executing.
-Since NMI handlers disable preemption, synchronize_rcu() is guaranteed
-not to return until all ongoing NMI handlers exit. It is therefore safe
-to free up the handler's data as soon as synchronize_rcu() returns.
-
-Important note: for this to work, the architecture in question must
-invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
-
-
-Answer to Quick Quiz
-
- Why might the rcu_dereference_sched() be necessary on Alpha, given
- that the code referenced by the pointer is read-only?
-
- Answer: The caller to set_nmi_callback() might well have
- initialized some data that is to be used by the new NMI
- handler. In this case, the rcu_dereference_sched() would
- be needed, because otherwise a CPU that received an NMI
- just after the new handler was set might see the pointer
- to the new NMI handler, but the old pre-initialized
- version of the handler's data.
-
- This same sad story can happen on other CPUs when using
- a compiler with aggressive pointer-value speculation
- optimizations.
-
- More important, the rcu_dereference_sched() makes it
- clear to someone reading the code that the pointer is
- being protected by RCU-sched.
diff --git a/Documentation/RCU/arrayRCU.rst b/Documentation/RCU/arrayRCU.rst
new file mode 100644
index 0000000..4051ea3
--- /dev/null
+++ b/Documentation/RCU/arrayRCU.rst
@@ -0,0 +1,165 @@
+.. _array_rcu_doc:
+
+Using RCU to Protect Read-Mostly Arrays
+=======================================
+
+Although RCU is more commonly used to protect linked lists, it can
+also be used to protect arrays. Three situations are as follows:
+
+1. :ref:`Hash Tables <hash_tables>`
+
+2. :ref:`Static Arrays <static_arrays>`
+
+3. :ref:`Resizable Arrays <resizable_arrays>`
+
+Each of these three situations involves an RCU-protected pointer to an
+array that is separately indexed. It might be tempting to consider use
+of RCU to instead protect the index into an array, however, this use
+case is **not** supported. The problem with RCU-protected indexes into
+arrays is that compilers can play way too many optimization games with
+integers, which means that the rules governing handling of these indexes
+are far more trouble than they are worth. If RCU-protected indexes into
+arrays prove to be particularly valuable (which they have not thus far),
+explicit cooperation from the compiler will be required to permit them
+to be safely used.
+
+That aside, each of the three RCU-protected pointer situations is
+described in the following sections.
+
+.. _hash_tables:
+
+Situation 1: Hash Tables
+------------------------
+
+Hash tables are often implemented as an array, where each array entry
+has a linked-list hash chain. Each hash chain can be protected by RCU
+as described in the listRCU.rst document. This approach also applies
+to other array-of-list situations, such as radix trees.
+
+.. _static_arrays:
+
+Situation 2: Static Arrays
+--------------------------
+
+Static arrays, where the data (rather than a pointer to the data) is
+located in each array element, and where the array is never resized,
+have not been used with RCU. Rik van Riel recommends using seqlock in
+this situation, which would also have minimal read-side overhead as long
+as updates are rare.
+
+Quick Quiz:
+ Why is it so important that updates be rare when using seqlock?
+
+:ref:`Answer to Quick Quiz <answer_quick_quiz_seqlock>`
+
+.. _resizable_arrays:
+
+Situation 3: Resizable Arrays
+------------------------------
+
+Use of RCU for resizable arrays is demonstrated by the grow_ary()
+function formerly used by the System V IPC code. The array is used
+to map from semaphore, message-queue, and shared-memory IDs to the data
+structure that represents the corresponding IPC construct. The grow_ary()
+function does not acquire any locks; instead its caller must hold the
+ids->sem semaphore.
+
+The grow_ary() function, shown below, does some limit checks, allocates a
+new ipc_id_ary, copies the old array into the first part of the new, initializes
+the remainder of the new, updates the ids->entries pointer to point to
+the new array, and invokes ipc_rcu_putref() to free up the old array.
+Note that rcu_assign_pointer() is used to update the ids->entries pointer,
+which includes any memory barriers required on whatever architecture
+you are running on::
+
+ static int grow_ary(struct ipc_ids* ids, int newsize)
+ {
+ struct ipc_id_ary* new;
+ struct ipc_id_ary* old;
+ int i;
+ int size = ids->entries->size;
+
+ if(newsize > IPCMNI)
+ newsize = IPCMNI;
+ if(newsize <= size)
+ return newsize;
+
+ new = ipc_rcu_alloc(sizeof(struct kern_ipc_perm *)*newsize +
+ sizeof(struct ipc_id_ary));
+ if(new == NULL)
+ return size;
+ new->size = newsize;
+ memcpy(new->p, ids->entries->p,
+ sizeof(struct kern_ipc_perm *)*size +
+ sizeof(struct ipc_id_ary));
+ for(i=size;i<newsize;i++) {
+ new->p[i] = NULL;
+ }
+ old = ids->entries;
+
+ /*
+ * Use rcu_assign_pointer() to make sure the memcpyed
+ * contents of the new array are visible before the new
+ * array becomes visible.
+ */
+ rcu_assign_pointer(ids->entries, new);
+
+ ipc_rcu_putref(old);
+ return newsize;
+ }
+
+The ipc_rcu_putref() function decrements the array's reference count
+and then, if the reference count has dropped to zero, uses call_rcu()
+to free the array after a grace period has elapsed.
+
+The array is traversed by the ipc_lock() function. This function
+indexes into the array under the protection of rcu_read_lock(),
+using rcu_dereference() to pick up the pointer to the array so
+that it may later safely be dereferenced -- memory barriers are
+required on the Alpha CPU. Since the size of the array is stored
+with the array itself, there can be no array-size mismatches, so
+a simple check suffices. The pointer to the structure corresponding
+to the desired IPC object is placed in "out", with NULL indicating
+a non-existent entry. After acquiring "out->lock", the "out->deleted"
+flag indicates whether the IPC object is in the process of being
+deleted, and, if not, the pointer is returned::
+
+ struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
+ {
+ struct kern_ipc_perm* out;
+ int lid = id % SEQ_MULTIPLIER;
+ struct ipc_id_ary* entries;
+
+ rcu_read_lock();
+ entries = rcu_dereference(ids->entries);
+ if(lid >= entries->size) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ out = entries->p[lid];
+ if(out == NULL) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ spin_lock(&out->lock);
+
+ /* ipc_rmid() may have already freed the ID while ipc_lock
+ * was spinning: here verify that the structure is still valid
+ */
+ if (out->deleted) {
+ spin_unlock(&out->lock);
+ rcu_read_unlock();
+ return NULL;
+ }
+ return out;
+ }
+
+.. _answer_quick_quiz_seqlock:
+
+Answer to Quick Quiz:
+ Why is it so important that updates be rare when using seqlock?
+
+ The reason that it is important that updates be rare when
+ using seqlock is that frequent updates can livelock readers.
+ One way to avoid this problem is to assign a seqlock for
+ each array entry rather than to the entire array.
diff --git a/Documentation/RCU/arrayRCU.txt b/Documentation/RCU/arrayRCU.txt
deleted file mode 100644
index f05a9af..0000000
--- a/Documentation/RCU/arrayRCU.txt
+++ /dev/null
@@ -1,153 +0,0 @@
-Using RCU to Protect Read-Mostly Arrays
-
-
-Although RCU is more commonly used to protect linked lists, it can
-also be used to protect arrays. Three situations are as follows:
-
-1. Hash Tables
-
-2. Static Arrays
-
-3. Resizeable Arrays
-
-Each of these three situations involves an RCU-protected pointer to an
-array that is separately indexed. It might be tempting to consider use
-of RCU to instead protect the index into an array, however, this use
-case is -not- supported. The problem with RCU-protected indexes into
-arrays is that compilers can play way too many optimization games with
-integers, which means that the rules governing handling of these indexes
-are far more trouble than they are worth. If RCU-protected indexes into
-arrays prove to be particularly valuable (which they have not thus far),
-explicit cooperation from the compiler will be required to permit them
-to be safely used.
-
-That aside, each of the three RCU-protected pointer situations are
-described in the following sections.
-
-
-Situation 1: Hash Tables
-
-Hash tables are often implemented as an array, where each array entry
-has a linked-list hash chain. Each hash chain can be protected by RCU
-as described in the listRCU.txt document. This approach also applies
-to other array-of-list situations, such as radix trees.
-
-
-Situation 2: Static Arrays
-
-Static arrays, where the data (rather than a pointer to the data) is
-located in each array element, and where the array is never resized,
-have not been used with RCU. Rik van Riel recommends using seqlock in
-this situation, which would also have minimal read-side overhead as long
-as updates are rare.
-
-Quick Quiz: Why is it so important that updates be rare when
- using seqlock?
-
-
-Situation 3: Resizeable Arrays
-
-Use of RCU for resizeable arrays is demonstrated by the grow_ary()
-function formerly used by the System V IPC code. The array is used
-to map from semaphore, message-queue, and shared-memory IDs to the data
-structure that represents the corresponding IPC construct. The grow_ary()
-function does not acquire any locks; instead its caller must hold the
-ids->sem semaphore.
-
-The grow_ary() function, shown below, does some limit checks, allocates a
-new ipc_id_ary, copies the old to the new portion of the new, initializes
-the remainder of the new, updates the ids->entries pointer to point to
-the new array, and invokes ipc_rcu_putref() to free up the old array.
-Note that rcu_assign_pointer() is used to update the ids->entries pointer,
-which includes any memory barriers required on whatever architecture
-you are running on.
-
- static int grow_ary(struct ipc_ids* ids, int newsize)
- {
- struct ipc_id_ary* new;
- struct ipc_id_ary* old;
- int i;
- int size = ids->entries->size;
-
- if(newsize > IPCMNI)
- newsize = IPCMNI;
- if(newsize <= size)
- return newsize;
-
- new = ipc_rcu_alloc(sizeof(struct kern_ipc_perm *)*newsize +
- sizeof(struct ipc_id_ary));
- if(new == NULL)
- return size;
- new->size = newsize;
- memcpy(new->p, ids->entries->p,
- sizeof(struct kern_ipc_perm *)*size +
- sizeof(struct ipc_id_ary));
- for(i=size;i<newsize;i++) {
- new->p[i] = NULL;
- }
- old = ids->entries;
-
- /*
- * Use rcu_assign_pointer() to make sure the memcpyed
- * contents of the new array are visible before the new
- * array becomes visible.
- */
- rcu_assign_pointer(ids->entries, new);
-
- ipc_rcu_putref(old);
- return newsize;
- }
-
-The ipc_rcu_putref() function decrements the array's reference count
-and then, if the reference count has dropped to zero, uses call_rcu()
-to free the array after a grace period has elapsed.
-
-The array is traversed by the ipc_lock() function. This function
-indexes into the array under the protection of rcu_read_lock(),
-using rcu_dereference() to pick up the pointer to the array so
-that it may later safely be dereferenced -- memory barriers are
-required on the Alpha CPU. Since the size of the array is stored
-with the array itself, there can be no array-size mismatches, so
-a simple check suffices. The pointer to the structure corresponding
-to the desired IPC object is placed in "out", with NULL indicating
-a non-existent entry. After acquiring "out->lock", the "out->deleted"
-flag indicates whether the IPC object is in the process of being
-deleted, and, if not, the pointer is returned.
-
- struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
- {
- struct kern_ipc_perm* out;
- int lid = id % SEQ_MULTIPLIER;
- struct ipc_id_ary* entries;
-
- rcu_read_lock();
- entries = rcu_dereference(ids->entries);
- if(lid >= entries->size) {
- rcu_read_unlock();
- return NULL;
- }
- out = entries->p[lid];
- if(out == NULL) {
- rcu_read_unlock();
- return NULL;
- }
- spin_lock(&out->lock);
-
- /* ipc_rmid() may have already freed the ID while ipc_lock
- * was spinning: here verify that the structure is still valid
- */
- if (out->deleted) {
- spin_unlock(&out->lock);
- rcu_read_unlock();
- return NULL;
- }
- return out;
- }
-
-
-Answer to Quick Quiz:
-
- The reason that it is important that updates be rare when
- using seqlock is that frequent updates can livelock readers.
- One way to avoid this problem is to assign a seqlock for
- each array entry rather than to the entire array.
diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index 5c99185..81a0a1e 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -7,8 +7,13 @@
.. toctree::
:maxdepth: 3
+ arrayRCU
+ rcubarrier
+ rcu_dereference
+ whatisRCU
rcu
listRCU
+ NMI-RCU
UP
Design/Memory-Ordering/Tree-RCU-Memory-Ordering
diff --git a/Documentation/RCU/lockdep-splat.txt b/Documentation/RCU/lockdep-splat.txt
index 9c01597..b809631 100644
--- a/Documentation/RCU/lockdep-splat.txt
+++ b/Documentation/RCU/lockdep-splat.txt
@@ -99,7 +99,7 @@
read-side critical section, which again would have suppressed the
above lockdep-RCU splat.
-But in this particular case, we don't actually deference the pointer
+But in this particular case, we don't actually dereference the pointer
returned from rcu_dereference(). Instead, that pointer is just compared
to the cic pointer, which means that the rcu_dereference() can be replaced
by rcu_access_pointer() as follows:
diff --git a/Documentation/RCU/rcu_dereference.rst b/Documentation/RCU/rcu_dereference.rst
new file mode 100644
index 0000000..c9667eb
--- /dev/null
+++ b/Documentation/RCU/rcu_dereference.rst
@@ -0,0 +1,463 @@
+.. _rcu_dereference_doc:
+
+PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference()
+===============================================================
+
+Most of the time, you can use values from rcu_dereference() or one of
+the similar primitives without worries. Dereferencing (prefix "*"),
+field selection ("->"), assignment ("="), address-of ("&"), addition and
+subtraction of constants, and casts all work quite naturally and safely.
+
+It is nevertheless possible to get into trouble with other operations.
+Follow these rules to keep your RCU code working properly:
+
+- You must use one of the rcu_dereference() family of primitives
+ to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU
+ will complain. Worse yet, your code can see random memory-corruption
+ bugs due to games that compilers and DEC Alpha can play.
+ Without one of the rcu_dereference() primitives, compilers
+ can reload the value, and won't your code have fun with two
+ different values for a single pointer! Without rcu_dereference(),
+ DEC Alpha can load a pointer, dereference that pointer, and
+ return data preceding initialization that preceded the store of
+ the pointer.
+
+ In addition, the volatile cast in rcu_dereference() prevents the
+ compiler from deducing the resulting pointer value. Please see
+ the section entitled "EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH"
+ for an example where the compiler can in fact deduce the exact
+ value of the pointer, and thus cause misordering.
+
+- You are only permitted to use rcu_dereference() on pointer values.
+ The compiler simply knows too much about integral values to
+ trust it to carry dependencies through integer operations.
+ There are a very few exceptions, namely that you can temporarily
+ cast the pointer to uintptr_t in order to:
+
+ - Set bits and clear bits down in the must-be-zero low-order
+ bits of that pointer. This clearly means that the pointer
+ must have alignment constraints, for example, this does
+ -not- work in general for char* pointers.
+
+ - XOR bits to translate pointers, as is done in some
+ classic buddy-allocator algorithms.
+
+ It is important to cast the value back to pointer before
+ doing much of anything else with it.
+
+- Avoid cancellation when using the "+" and "-" infix arithmetic
+ operators. For example, for a given variable "x", avoid
+ "(x-(uintptr_t)x)" for char* pointers. The compiler is within its
+ rights to substitute zero for this sort of expression, so that
+ subsequent accesses no longer depend on the rcu_dereference(),
+ again possibly resulting in bugs due to misordering.
+
+ Of course, if "p" is a pointer from rcu_dereference(), and "a"
+ and "b" are integers that happen to be equal, the expression
+ "p+a-b" is safe because its value still necessarily depends on
+ the rcu_dereference(), thus maintaining proper ordering.
+
+- If you are using RCU to protect JITed functions, so that the
+ "()" function-invocation operator is applied to a value obtained
+ (directly or indirectly) from rcu_dereference(), you may need to
+ interact directly with the hardware to flush instruction caches.
+ This issue arises on some systems when a newly JITed function is
+ using the same memory that was used by an earlier JITed function.
+
+- Do not use the results from relational operators ("==", "!=",
+ ">", ">=", "<", or "<=") when dereferencing. For example,
+ the following (quite strange) code is buggy::
+
+ int *p;
+ int *q;
+
+ ...
+
+ p = rcu_dereference(gp);
+ q = &global_q;
+ q += p > &oom_p;
+ r1 = *q; /* BUGGY!!! */
+
+ As before, the reason this is buggy is that relational operators
+ are often compiled using branches. And as before, although
+ weak-memory machines such as ARM or PowerPC do order stores
+ after such branches, they can speculate loads, which can again
+ result in misordering bugs.
+
+- Be very careful about comparing pointers obtained from
+ rcu_dereference() against non-NULL values. As Linus Torvalds
+ explained, if the two pointers are equal, the compiler could
+ substitute the pointer you are comparing against for the pointer
+ obtained from rcu_dereference(). For example::
+
+ p = rcu_dereference(gp);
+ if (p == &default_struct)
+ do_default(p->a);
+
+ Because the compiler now knows that the value of "p" is exactly
+ the address of the variable "default_struct", it is free to
+ transform this code into the following::
+
+ p = rcu_dereference(gp);
+ if (p == &default_struct)
+ do_default(default_struct.a);
+
+ On ARM and Power hardware, the load from "default_struct.a"
+ can now be speculated, such that it might happen before the
+ rcu_dereference(). This could result in bugs due to misordering.
+
+ However, comparisons are OK in the following cases:
+
+ - The comparison was against the NULL pointer. If the
+ compiler knows that the pointer is NULL, you had better
+ not be dereferencing it anyway. If the comparison is
+ non-equal, the compiler is none the wiser. Therefore,
+ it is safe to compare pointers from rcu_dereference()
+ against NULL pointers.
+
+ - The pointer is never dereferenced after being compared.
+ Since there are no subsequent dereferences, the compiler
+ cannot use anything it learned from the comparison
+ to reorder the non-existent subsequent dereferences.
+ This sort of comparison occurs frequently when scanning
+ RCU-protected circular linked lists.
+
+ Note that if checks for being within an RCU read-side
+ critical section are not required and the pointer is never
+ dereferenced, rcu_access_pointer() should be used in place
+ of rcu_dereference().
+
+ - The comparison is against a pointer that references memory
+ that was initialized "a long time ago." The reason
+ this is safe is that even if misordering occurs, the
+ misordering will not affect the accesses that follow
+ the comparison. So exactly how long ago is "a long
+ time ago"? Here are some possibilities:
+
+ - Compile time.
+
+ - Boot time.
+
+ - Module-init time for module code.
+
+ - Prior to kthread creation for kthread code.
+
+ - During some prior acquisition of the lock that
+ we now hold.
+
+ - Before mod_timer() time for a timer handler.
+
+ There are many other possibilities involving the Linux
+ kernel's wide array of primitives that cause code to
+ be invoked at a later time.
+
+ - The pointer being compared against also came from
+ rcu_dereference(). In this case, both pointers depend
+ on one rcu_dereference() or another, so you get proper
+ ordering either way.
+
+ That said, this situation can make certain RCU usage
+ bugs more likely to happen. Which can be a good thing,
+ at least if they happen during testing. An example
+ of such an RCU usage bug is shown in the section titled
+ "EXAMPLE OF AMPLIFIED RCU-USAGE BUG".
+
+ - All of the accesses following the comparison are stores,
+ so that a control dependency preserves the needed ordering.
+ That said, it is easy to get control dependencies wrong.
+ Please see the "CONTROL DEPENDENCIES" section of
+ Documentation/memory-barriers.txt for more details.
+
+ - The pointers are not equal -and- the compiler does
+ not have enough information to deduce the value of the
+ pointer. Note that the volatile cast in rcu_dereference()
+ will normally prevent the compiler from knowing too much.
+
+ However, please note that if the compiler knows that the
+ pointer takes on only one of two values, a not-equal
+ comparison will provide exactly the information that the
+ compiler needs to deduce the value of the pointer.
+
+- Disable any value-speculation optimizations that your compiler
+ might provide, especially if you are making use of feedback-based
+ optimizations that take data collected from prior runs. Such
+ value-speculation optimizations reorder operations by design.
+
+ There is one exception to this rule: Value-speculation
+ optimizations that leverage the branch-prediction hardware are
+ safe on strongly ordered systems (such as x86), but not on weakly
+ ordered systems (such as ARM or Power). Choose your compiler
+ command-line options wisely!
+
+
+EXAMPLE OF AMPLIFIED RCU-USAGE BUG
+----------------------------------
+
+Because updaters can run concurrently with RCU readers, RCU readers can
+see stale and/or inconsistent values. If RCU readers need fresh or
+consistent values, which they sometimes do, they need to take proper
+precautions. To see this, consider the following code fragment::
+
+ struct foo {
+ int a;
+ int b;
+ int c;
+ };
+ struct foo *gp1;
+ struct foo *gp2;
+
+ void updater(void)
+ {
+ struct foo *p;
+
+ p = kmalloc(...);
+ if (p == NULL)
+ deal_with_it();
+ p->a = 42; /* Each field in its own cache line. */
+ p->b = 43;
+ p->c = 44;
+ rcu_assign_pointer(gp1, p);
+ p->b = 143;
+ p->c = 144;
+ rcu_assign_pointer(gp2, p);
+ }
+
+ void reader(void)
+ {
+ struct foo *p;
+ struct foo *q;
+ int r1, r2;
+
+ p = rcu_dereference(gp2);
+ if (p == NULL)
+ return;
+ r1 = p->b; /* Guaranteed to get 143. */
+ q = rcu_dereference(gp1); /* Guaranteed non-NULL. */
+ if (p == q) {
+ /* The compiler decides that q->c is same as p->c. */
+ r2 = p->c; /* Could get 44 on weakly ordered system. */
+ }
+ do_something_with(r1, r2);
+ }
+
+You might be surprised that the outcome (r1 == 143 && r2 == 44) is possible,
+but you should not be. After all, the updater might have been invoked
+a second time between the time reader() loaded into "r1" and the time
+that it loaded into "r2". The fact that this same result can occur due
+to some reordering from the compiler and CPUs is beside the point.
+
+But suppose that the reader needs a consistent view?
+
+Then one approach is to use locking, for example, as follows::
+
+ struct foo {
+ int a;
+ int b;
+ int c;
+ spinlock_t lock;
+ };
+ struct foo *gp1;
+ struct foo *gp2;
+
+ void updater(void)
+ {
+ struct foo *p;
+
+ p = kmalloc(...);
+ if (p == NULL)
+ deal_with_it();
+ spin_lock(&p->lock);
+ p->a = 42; /* Each field in its own cache line. */
+ p->b = 43;
+ p->c = 44;
+ spin_unlock(&p->lock);
+ rcu_assign_pointer(gp1, p);
+ spin_lock(&p->lock);
+ p->b = 143;
+ p->c = 144;
+ spin_unlock(&p->lock);
+ rcu_assign_pointer(gp2, p);
+ }
+
+ void reader(void)
+ {
+ struct foo *p;
+ struct foo *q;
+ int r1, r2;
+
+ p = rcu_dereference(gp2);
+ if (p == NULL)
+ return;
+ spin_lock(&p->lock);
+ r1 = p->b; /* Guaranteed to get 143. */
+ q = rcu_dereference(gp1); /* Guaranteed non-NULL. */
+ if (p == q) {
+ /* The compiler decides that q->c is same as p->c. */
+ r2 = p->c; /* Locking guarantees r2 == 144. */
+ }
+ spin_unlock(&p->lock);
+ do_something_with(r1, r2);
+ }
+
+As always, use the right tool for the job!
+
+
+EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH
+-----------------------------------------
+
+If a pointer obtained from rcu_dereference() compares not-equal to some
+other pointer, the compiler normally has no clue what the value of the
+first pointer might be. This lack of knowledge prevents the compiler
+from carrying out optimizations that otherwise might destroy the ordering
+guarantees that RCU depends on. And the volatile cast in rcu_dereference()
+should prevent the compiler from guessing the value.
+
+But without rcu_dereference(), the compiler knows more than you might
+expect. Consider the following code fragment::
+
+ struct foo {
+ int a;
+ int b;
+ };
+ static struct foo variable1;
+ static struct foo variable2;
+ static struct foo *gp = &variable1;
+
+ void updater(void)
+ {
+ initialize_foo(&variable2);
+ rcu_assign_pointer(gp, &variable2);
+ /*
+ * The above is the only store to gp in this translation unit,
+ * and the address of gp is not exported in any way.
+ */
+ }
+
+ int reader(void)
+ {
+ struct foo *p;
+
+ p = gp;
+ barrier();
+ if (p == &variable1)
+ return p->a; /* Must be variable1.a. */
+ else
+ return p->b; /* Must be variable2.b. */
+ }
+
+Because the compiler can see all stores to "gp", it knows that the only
+possible values of "gp" are "variable1" on the one hand and "variable2"
+on the other. The comparison in reader() therefore tells the compiler
+the exact value of "p" even in the not-equals case. This allows the
+compiler to make the return values independent of the load from "gp",
+in turn destroying the ordering between this load and the loads of the
+return values. This can result in "p->b" returning pre-initialization
+garbage values.
+
+In short, rcu_dereference() is -not- optional when you are going to
+dereference the resulting pointer.
+
+
+WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE?
+------------------------------------------------------------
+
+First, please avoid using rcu_dereference_raw() and also please avoid
+using rcu_dereference_check() and rcu_dereference_protected() with a
+second argument with a constant value of 1 (or true, for that matter).
+With that caution out of the way, here is some guidance for which
+member of the rcu_dereference() family to use in various situations:
+
+1. If the access needs to be within an RCU read-side critical
+ section, use rcu_dereference(). With the new consolidated
+ RCU flavors, an RCU read-side critical section is entered
+ using rcu_read_lock(), anything that disables bottom halves,
+ anything that disables interrupts, or anything that disables
+ preemption.
+
+2. If the access might be within an RCU read-side critical section
+ on the one hand, or protected by (say) my_lock on the other,
+ use rcu_dereference_check(), for example::
+
+ p1 = rcu_dereference_check(p->rcu_protected_pointer,
+ lockdep_is_held(&my_lock));
+
+
+3. If the access might be within an RCU read-side critical section
+ on the one hand, or protected by either my_lock or your_lock on
+ the other, again use rcu_dereference_check(), for example::
+
+ p1 = rcu_dereference_check(p->rcu_protected_pointer,
+ lockdep_is_held(&my_lock) ||
+ lockdep_is_held(&your_lock));
+
+4. If the access is on the update side, so that it is always protected
+ by my_lock, use rcu_dereference_protected()::
+
+ p1 = rcu_dereference_protected(p->rcu_protected_pointer,
+ lockdep_is_held(&my_lock));
+
+ This can be extended to handle multiple locks as in #3 above,
+ and both can be extended to check other conditions as well.
+
+5. If the protection is supplied by the caller, and is thus unknown
+ to this code, that is the rare case when rcu_dereference_raw()
+ is appropriate. In addition, rcu_dereference_raw() might be
+ appropriate when the lockdep expression would be excessively
+ complex, except that a better approach in that case might be to
+ take a long hard look at your synchronization design. Still,
+ there are data-locking cases where any one of a very large number
+ of locks or reference counters suffices to protect the pointer,
+ so rcu_dereference_raw() does have its place.
+
+ However, its place is probably quite a bit smaller than one
+ might expect given the number of uses in the current kernel.
+ Ditto for its synonym, rcu_dereference_check( ... , 1), and
+ its close relative, rcu_dereference_protected(... , 1).
+
+
+SPARSE CHECKING OF RCU-PROTECTED POINTERS
+-----------------------------------------
+
+The sparse static-analysis tool checks for direct access to RCU-protected
+pointers, which can result in "interesting" bugs due to compiler
+optimizations involving invented loads and perhaps also load tearing.
+For example, suppose someone mistakenly does something like this::
+
+ p = q->rcu_protected_pointer;
+ do_something_with(p->a);
+ do_something_else_with(p->b);
+
+If register pressure is high, the compiler might optimize "p" out
+of existence, transforming the code to something like this::
+
+ do_something_with(q->rcu_protected_pointer->a);
+ do_something_else_with(q->rcu_protected_pointer->b);
+
+This could fatally disappoint your code if q->rcu_protected_pointer
+changed in the meantime. Nor is this a theoretical problem: Exactly
+this sort of bug cost Paul E. McKenney (and several of his innocent
+colleagues) a three-day weekend back in the early 1990s.
+
+Load tearing could of course result in dereferencing a mashup of a pair
+of pointers, which also might fatally disappoint your code.
+
+These problems could have been avoided simply by making the code instead
+read as follows::
+
+ p = rcu_dereference(q->rcu_protected_pointer);
+ do_something_with(p->a);
+ do_something_else_with(p->b);
+
+Unfortunately, these sorts of bugs can be extremely hard to spot during
+review. This is where the sparse tool comes into play, along with the
+"__rcu" marker. If you mark a pointer declaration, whether in a structure
+or as a formal parameter, with "__rcu", this tells sparse to complain if
+this pointer is accessed directly. It will also cause sparse to complain
+if a pointer not marked with "__rcu" is accessed using rcu_dereference()
+and friends. For example, ->rcu_protected_pointer might be declared as
+follows::
+
+ struct foo __rcu *rcu_protected_pointer;
+
+Use of "__rcu" is opt-in. If you choose not to use it, then you should
+ignore the sparse warnings.
diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt
deleted file mode 100644
index bf699e8..0000000
--- a/Documentation/RCU/rcu_dereference.txt
+++ /dev/null
@@ -1,456 +0,0 @@
-PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference()
-
-Most of the time, you can use values from rcu_dereference() or one of
-the similar primitives without worries. Dereferencing (prefix "*"),
-field selection ("->"), assignment ("="), address-of ("&"), addition and
-subtraction of constants, and casts all work quite naturally and safely.
-
-It is nevertheless possible to get into trouble with other operations.
-Follow these rules to keep your RCU code working properly:
-
-o You must use one of the rcu_dereference() family of primitives
- to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU
- will complain. Worse yet, your code can see random memory-corruption
- bugs due to games that compilers and DEC Alpha can play.
- Without one of the rcu_dereference() primitives, compilers
- can reload the value, and won't your code have fun with two
- different values for a single pointer! Without rcu_dereference(),
- DEC Alpha can load a pointer, dereference that pointer, and
- return data preceding initialization that preceded the store of
- the pointer.
-
- In addition, the volatile cast in rcu_dereference() prevents the
- compiler from deducing the resulting pointer value. Please see
- the section entitled "EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH"
- for an example where the compiler can in fact deduce the exact
- value of the pointer, and thus cause misordering.
-
-o You are only permitted to use rcu_dereference on pointer values.
- The compiler simply knows too much about integral values to
- trust it to carry dependencies through integer operations.
- There are a very few exceptions, namely that you can temporarily
- cast the pointer to uintptr_t in order to:
-
- o Set bits and clear bits down in the must-be-zero low-order
- bits of that pointer. This clearly means that the pointer
- must have alignment constraints, for example, this does
- -not- work in general for char* pointers.
-
- o XOR bits to translate pointers, as is done in some
- classic buddy-allocator algorithms.
-
- It is important to cast the value back to pointer before
- doing much of anything else with it.
-
-o Avoid cancellation when using the "+" and "-" infix arithmetic
- operators. For example, for a given variable "x", avoid
- "(x-(uintptr_t)x)" for char* pointers. The compiler is within its
- rights to substitute zero for this sort of expression, so that
- subsequent accesses no longer depend on the rcu_dereference(),
- again possibly resulting in bugs due to misordering.
-
- Of course, if "p" is a pointer from rcu_dereference(), and "a"
- and "b" are integers that happen to be equal, the expression
- "p+a-b" is safe because its value still necessarily depends on
- the rcu_dereference(), thus maintaining proper ordering.
-
-o If you are using RCU to protect JITed functions, so that the
- "()" function-invocation operator is applied to a value obtained
- (directly or indirectly) from rcu_dereference(), you may need to
- interact directly with the hardware to flush instruction caches.
- This issue arises on some systems when a newly JITed function is
- using the same memory that was used by an earlier JITed function.
-
-o Do not use the results from relational operators ("==", "!=",
- ">", ">=", "<", or "<=") when dereferencing. For example,
- the following (quite strange) code is buggy:
-
- int *p;
- int *q;
-
- ...
-
- p = rcu_dereference(gp)
- q = &global_q;
- q += p > &oom_p;
- r1 = *q; /* BUGGY!!! */
-
- As before, the reason this is buggy is that relational operators
- are often compiled using branches. And as before, although
- weak-memory machines such as ARM or PowerPC do order stores
- after such branches, but can speculate loads, which can again
- result in misordering bugs.
-
-o Be very careful about comparing pointers obtained from
- rcu_dereference() against non-NULL values. As Linus Torvalds
- explained, if the two pointers are equal, the compiler could
- substitute the pointer you are comparing against for the pointer
- obtained from rcu_dereference(). For example:
-
- p = rcu_dereference(gp);
- if (p == &default_struct)
- do_default(p->a);
-
- Because the compiler now knows that the value of "p" is exactly
- the address of the variable "default_struct", it is free to
- transform this code into the following:
-
- p = rcu_dereference(gp);
- if (p == &default_struct)
- do_default(default_struct.a);
-
- On ARM and Power hardware, the load from "default_struct.a"
- can now be speculated, such that it might happen before the
- rcu_dereference(). This could result in bugs due to misordering.
-
- However, comparisons are OK in the following cases:
-
- o The comparison was against the NULL pointer. If the
- compiler knows that the pointer is NULL, you had better
- not be dereferencing it anyway. If the comparison is
- non-equal, the compiler is none the wiser. Therefore,
- it is safe to compare pointers from rcu_dereference()
- against NULL pointers.
-
- o The pointer is never dereferenced after being compared.
- Since there are no subsequent dereferences, the compiler
- cannot use anything it learned from the comparison
- to reorder the non-existent subsequent dereferences.
- This sort of comparison occurs frequently when scanning
- RCU-protected circular linked lists.
-
- Note that if checks for being within an RCU read-side
- critical section are not required and the pointer is never
- dereferenced, rcu_access_pointer() should be used in place
- of rcu_dereference().
-
- o The comparison is against a pointer that references memory
- that was initialized "a long time ago." The reason
- this is safe is that even if misordering occurs, the
- misordering will not affect the accesses that follow
- the comparison. So exactly how long ago is "a long
- time ago"? Here are some possibilities:
-
- o Compile time.
-
- o Boot time.
-
- o Module-init time for module code.
-
- o Prior to kthread creation for kthread code.
-
- o During some prior acquisition of the lock that
- we now hold.
-
- o Before mod_timer() time for a timer handler.
-
- There are many other possibilities involving the Linux
- kernel's wide array of primitives that cause code to
- be invoked at a later time.
-
- o The pointer being compared against also came from
- rcu_dereference(). In this case, both pointers depend
- on one rcu_dereference() or another, so you get proper
- ordering either way.
-
- That said, this situation can make certain RCU usage
- bugs more likely to happen. Which can be a good thing,
- at least if they happen during testing. An example
- of such an RCU usage bug is shown in the section titled
- "EXAMPLE OF AMPLIFIED RCU-USAGE BUG".
-
- o All of the accesses following the comparison are stores,
- so that a control dependency preserves the needed ordering.
- That said, it is easy to get control dependencies wrong.
- Please see the "CONTROL DEPENDENCIES" section of
- Documentation/memory-barriers.txt for more details.
-
- o The pointers are not equal -and- the compiler does
- not have enough information to deduce the value of the
- pointer. Note that the volatile cast in rcu_dereference()
- will normally prevent the compiler from knowing too much.
-
- However, please note that if the compiler knows that the
- pointer takes on only one of two values, a not-equal
- comparison will provide exactly the information that the
- compiler needs to deduce the value of the pointer.
-
-o Disable any value-speculation optimizations that your compiler
- might provide, especially if you are making use of feedback-based
- optimizations that take data collected from prior runs. Such
- value-speculation optimizations reorder operations by design.
-
- There is one exception to this rule: Value-speculation
- optimizations that leverage the branch-prediction hardware are
- safe on strongly ordered systems (such as x86), but not on weakly
- ordered systems (such as ARM or Power). Choose your compiler
- command-line options wisely!
-
-
-EXAMPLE OF AMPLIFIED RCU-USAGE BUG
-
-Because updaters can run concurrently with RCU readers, RCU readers can
-see stale and/or inconsistent values. If RCU readers need fresh or
-consistent values, which they sometimes do, they need to take proper
-precautions. To see this, consider the following code fragment:
-
- struct foo {
- int a;
- int b;
- int c;
- };
- struct foo *gp1;
- struct foo *gp2;
-
- void updater(void)
- {
- struct foo *p;
-
- p = kmalloc(...);
- if (p == NULL)
- deal_with_it();
- p->a = 42; /* Each field in its own cache line. */
- p->b = 43;
- p->c = 44;
- rcu_assign_pointer(gp1, p);
- p->b = 143;
- p->c = 144;
- rcu_assign_pointer(gp2, p);
- }
-
- void reader(void)
- {
- struct foo *p;
- struct foo *q;
- int r1, r2;
-
- p = rcu_dereference(gp2);
- if (p == NULL)
- return;
- r1 = p->b; /* Guaranteed to get 143. */
- q = rcu_dereference(gp1); /* Guaranteed non-NULL. */
- if (p == q) {
- /* The compiler decides that q->c is same as p->c. */
- r2 = p->c; /* Could get 44 on weakly order system. */
- }
- do_something_with(r1, r2);
- }
-
-You might be surprised that the outcome (r1 == 143 && r2 == 44) is possible,
-but you should not be. After all, the updater might have been invoked
-a second time between the time reader() loaded into "r1" and the time
-that it loaded into "r2". The fact that this same result can occur due
-to some reordering from the compiler and CPUs is beside the point.
-
-But suppose that the reader needs a consistent view?
-
-Then one approach is to use locking, for example, as follows:
-
- struct foo {
- int a;
- int b;
- int c;
- spinlock_t lock;
- };
- struct foo *gp1;
- struct foo *gp2;
-
- void updater(void)
- {
- struct foo *p;
-
- p = kmalloc(...);
- if (p == NULL)
- deal_with_it();
- spin_lock(&p->lock);
- p->a = 42; /* Each field in its own cache line. */
- p->b = 43;
- p->c = 44;
- spin_unlock(&p->lock);
- rcu_assign_pointer(gp1, p);
- spin_lock(&p->lock);
- p->b = 143;
- p->c = 144;
- spin_unlock(&p->lock);
- rcu_assign_pointer(gp2, p);
- }
-
- void reader(void)
- {
- struct foo *p;
- struct foo *q;
- int r1, r2;
-
- p = rcu_dereference(gp2);
- if (p == NULL)
- return;
- spin_lock(&p->lock);
- r1 = p->b; /* Guaranteed to get 143. */
- q = rcu_dereference(gp1); /* Guaranteed non-NULL. */
- if (p == q) {
- /* The compiler decides that q->c is same as p->c. */
- r2 = p->c; /* Locking guarantees r2 == 144. */
- }
- spin_unlock(&p->lock);
- do_something_with(r1, r2);
- }
-
-As always, use the right tool for the job!
-
-
-EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH
-
-If a pointer obtained from rcu_dereference() compares not-equal to some
-other pointer, the compiler normally has no clue what the value of the
-first pointer might be. This lack of knowledge prevents the compiler
-from carrying out optimizations that otherwise might destroy the ordering
-guarantees that RCU depends on. And the volatile cast in rcu_dereference()
-should prevent the compiler from guessing the value.
-
-But without rcu_dereference(), the compiler knows more than you might
-expect. Consider the following code fragment:
-
- struct foo {
- int a;
- int b;
- };
- static struct foo variable1;
- static struct foo variable2;
- static struct foo *gp = &variable1;
-
- void updater(void)
- {
- initialize_foo(&variable2);
- rcu_assign_pointer(gp, &variable2);
- /*
- * The above is the only store to gp in this translation unit,
- * and the address of gp is not exported in any way.
- */
- }
-
- int reader(void)
- {
- struct foo *p;
-
- p = gp;
- barrier();
- if (p == &variable1)
- return p->a; /* Must be variable1.a. */
- else
- return p->b; /* Must be variable2.b. */
- }
-
-Because the compiler can see all stores to "gp", it knows that the only
-possible values of "gp" are "variable1" on the one hand and "variable2"
-on the other. The comparison in reader() therefore tells the compiler
-the exact value of "p" even in the not-equals case. This allows the
-compiler to make the return values independent of the load from "gp",
-in turn destroying the ordering between this load and the loads of the
-return values. This can result in "p->b" returning pre-initialization
-garbage values.
-
-In short, rcu_dereference() is -not- optional when you are going to
-dereference the resulting pointer.
-
-
-WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE?
-
-First, please avoid using rcu_dereference_raw() and also please avoid
-using rcu_dereference_check() and rcu_dereference_protected() with a
-second argument with a constant value of 1 (or true, for that matter).
-With that caution out of the way, here is some guidance for which
-member of the rcu_dereference() to use in various situations:
-
-1. If the access needs to be within an RCU read-side critical
- section, use rcu_dereference(). With the new consolidated
- RCU flavors, an RCU read-side critical section is entered
- using rcu_read_lock(), anything that disables bottom halves,
- anything that disables interrupts, or anything that disables
- preemption.
-
-2. If the access might be within an RCU read-side critical section
- on the one hand, or protected by (say) my_lock on the other,
- use rcu_dereference_check(), for example:
-
- p1 = rcu_dereference_check(p->rcu_protected_pointer,
- lockdep_is_held(&my_lock));
-
-
-3. If the access might be within an RCU read-side critical section
- on the one hand, or protected by either my_lock or your_lock on
- the other, again use rcu_dereference_check(), for example:
-
- p1 = rcu_dereference_check(p->rcu_protected_pointer,
- lockdep_is_held(&my_lock) ||
- lockdep_is_held(&your_lock));
-
-4. If the access is on the update side, so that it is always protected
- by my_lock, use rcu_dereference_protected():
-
- p1 = rcu_dereference_protected(p->rcu_protected_pointer,
- lockdep_is_held(&my_lock));
-
- This can be extended to handle multiple locks as in #3 above,
- and both can be extended to check other conditions as well.
-
-5. If the protection is supplied by the caller, and is thus unknown
- to this code, that is the rare case when rcu_dereference_raw()
- is appropriate. In addition, rcu_dereference_raw() might be
- appropriate when the lockdep expression would be excessively
- complex, except that a better approach in that case might be to
- take a long hard look at your synchronization design. Still,
- there are data-locking cases where any one of a very large number
- of locks or reference counters suffices to protect the pointer,
- so rcu_dereference_raw() does have its place.
-
- However, its place is probably quite a bit smaller than one
- might expect given the number of uses in the current kernel.
- Ditto for its synonym, rcu_dereference_check( ... , 1), and
- its close relative, rcu_dereference_protected(... , 1).
-
-
-SPARSE CHECKING OF RCU-PROTECTED POINTERS
-
-The sparse static-analysis tool checks for direct access to RCU-protected
-pointers, which can result in "interesting" bugs due to compiler
-optimizations involving invented loads and perhaps also load tearing.
-For example, suppose someone mistakenly does something like this:
-
- p = q->rcu_protected_pointer;
- do_something_with(p->a);
- do_something_else_with(p->b);
-
-If register pressure is high, the compiler might optimize "p" out
-of existence, transforming the code to something like this:
-
- do_something_with(q->rcu_protected_pointer->a);
- do_something_else_with(q->rcu_protected_pointer->b);
-
-This could fatally disappoint your code if q->rcu_protected_pointer
-changed in the meantime. Nor is this a theoretical problem: Exactly
-this sort of bug cost Paul E. McKenney (and several of his innocent
-colleagues) a three-day weekend back in the early 1990s.
-
-Load tearing could of course result in dereferencing a mashup of a pair
-of pointers, which also might fatally disappoint your code.
-
-These problems could have been avoided simply by making the code instead
-read as follows:
-
- p = rcu_dereference(q->rcu_protected_pointer);
- do_something_with(p->a);
- do_something_else_with(p->b);
-
-Unfortunately, these sorts of bugs can be extremely hard to spot during
-review. This is where the sparse tool comes into play, along with the
-"__rcu" marker. If you mark a pointer declaration, whether in a structure
-or as a formal parameter, with "__rcu", which tells sparse to complain if
-this pointer is accessed directly. It will also cause sparse to complain
-if a pointer not marked with "__rcu" is accessed using rcu_dereference()
-and friends. For example, ->rcu_protected_pointer might be declared as
-follows:
-
- struct foo __rcu *rcu_protected_pointer;
-
-Use of "__rcu" is opt-in. If you choose not to use it, then you should
-ignore the sparse warnings.
diff --git a/Documentation/RCU/rcubarrier.rst b/Documentation/RCU/rcubarrier.rst
new file mode 100644
index 0000000..f64f4413
--- /dev/null
+++ b/Documentation/RCU/rcubarrier.rst
@@ -0,0 +1,353 @@
+.. _rcu_barrier:
+
+RCU and Unloadable Modules
+==========================
+
+[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/]
+
+RCU (read-copy update) is a synchronization mechanism that can be thought
+of as a replacement for reader-writer locking (among other things), but with
+very low-overhead readers that are immune to deadlock, priority inversion,
+and unbounded latency. RCU read-side critical sections are delimited
+by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT
+kernels, generate no code whatsoever.
+
+This means that RCU writers are unaware of the presence of concurrent
+readers, so that RCU updates to shared data must be undertaken quite
+carefully, leaving an old version of the data structure in place until all
+pre-existing readers have finished. These old versions are needed because
+such readers might hold a reference to them. RCU updates can therefore be
+rather expensive, and RCU is thus best suited for read-mostly situations.
+
+How can an RCU writer possibly determine when all readers are finished,
+given that readers might well leave absolutely no trace of their
+presence? There is a synchronize_rcu() primitive that blocks until all
+pre-existing readers have completed. An updater wishing to delete an
+element p from a linked list might do the following, while holding an
+appropriate lock, of course::
+
+ list_del_rcu(p);
+ synchronize_rcu();
+ kfree(p);
+
+But the above code cannot be used in IRQ context -- the call_rcu()
+primitive must be used instead. This primitive takes a pointer to an
+rcu_head struct placed within the RCU-protected data structure and
+another pointer to a function that may be invoked later to free that
+structure. Code to delete an element p from the linked list from IRQ
+context might then be as follows::
+
+ list_del_rcu(p);
+ call_rcu(&p->rcu, p_callback);
+
+Since call_rcu() never blocks, this code can safely be used from within
+IRQ context. The function p_callback() might be defined as follows::
+
+ static void p_callback(struct rcu_head *rp)
+ {
+ struct pstruct *p = container_of(rp, struct pstruct, rcu);
+
+ kfree(p);
+ }
+
+
+Unloading Modules That Use call_rcu()
+-------------------------------------
+
+But what if p_callback is defined in an unloadable module?
+
+If we unload the module while some RCU callbacks are pending,
+the CPUs executing these callbacks are going to be severely
+disappointed when they are later invoked, as fancifully depicted at
+http://lwn.net/images/ns/kernel/rcu-drop.jpg.
+
+We could try placing a synchronize_rcu() in the module-exit code path,
+but this is not sufficient. Although synchronize_rcu() does wait for a
+grace period to elapse, it does not wait for the callbacks to complete.
+
+One might be tempted to try several back-to-back synchronize_rcu()
+calls, but this is still not guaranteed to work. If there is a very
+heavy RCU-callback load, then some of the callbacks might be deferred
+in order to allow other processing to proceed. Such deferral is required
+in realtime kernels in order to avoid excessive scheduling latencies.
+
+
+rcu_barrier()
+-------------
+
+We instead need the rcu_barrier() primitive. Rather than waiting for
+a grace period to elapse, rcu_barrier() waits for all outstanding RCU
+callbacks to complete. Please note that rcu_barrier() does **not** imply
+synchronize_rcu(). In particular, if there are no RCU callbacks queued
+anywhere, rcu_barrier() is within its rights to return immediately,
+without waiting for a grace period to elapse.
+
+Pseudo-code using rcu_barrier() is as follows:
+
+ 1. Prevent any new RCU callbacks from being posted.
+ 2. Execute rcu_barrier().
+ 3. Allow the module to be unloaded.
+
+There is also an srcu_barrier() function for SRCU, and you of course
+must match the flavor of rcu_barrier() with that of call_rcu(). If your
+module uses multiple flavors of call_rcu(), then it must also use multiple
+flavors of rcu_barrier() when unloading that module. For example, if
+it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on
+srcu_struct_2, then the following three lines of code will be required
+when unloading::
+
+ 1 rcu_barrier();
+ 2 srcu_barrier(&srcu_struct_1);
+ 3 srcu_barrier(&srcu_struct_2);
+
+The rcutorture module makes use of rcu_barrier() in its exit function
+as follows::
+
+ 1 static void
+ 2 rcu_torture_cleanup(void)
+ 3 {
+ 4 int i;
+ 5
+ 6 fullstop = 1;
+ 7 if (shuffler_task != NULL) {
+ 8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
+ 9 kthread_stop(shuffler_task);
+ 10 }
+ 11 shuffler_task = NULL;
+ 12
+ 13 if (writer_task != NULL) {
+ 14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
+ 15 kthread_stop(writer_task);
+ 16 }
+ 17 writer_task = NULL;
+ 18
+ 19 if (reader_tasks != NULL) {
+ 20 for (i = 0; i < nrealreaders; i++) {
+ 21 if (reader_tasks[i] != NULL) {
+ 22 VERBOSE_PRINTK_STRING(
+ 23 "Stopping rcu_torture_reader task");
+ 24 kthread_stop(reader_tasks[i]);
+ 25 }
+ 26 reader_tasks[i] = NULL;
+ 27 }
+ 28 kfree(reader_tasks);
+ 29 reader_tasks = NULL;
+ 30 }
+ 31 rcu_torture_current = NULL;
+ 32
+ 33 if (fakewriter_tasks != NULL) {
+ 34 for (i = 0; i < nfakewriters; i++) {
+ 35 if (fakewriter_tasks[i] != NULL) {
+ 36 VERBOSE_PRINTK_STRING(
+ 37 "Stopping rcu_torture_fakewriter task");
+ 38 kthread_stop(fakewriter_tasks[i]);
+ 39 }
+ 40 fakewriter_tasks[i] = NULL;
+ 41 }
+ 42 kfree(fakewriter_tasks);
+ 43 fakewriter_tasks = NULL;
+ 44 }
+ 45
+ 46 if (stats_task != NULL) {
+ 47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
+ 48 kthread_stop(stats_task);
+ 49 }
+ 50 stats_task = NULL;
+ 51
+ 52 /* Wait for all RCU callbacks to fire. */
+ 53 rcu_barrier();
+ 54
+ 55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
+ 56
+ 57 if (cur_ops->cleanup != NULL)
+ 58 cur_ops->cleanup();
+ 59 if (atomic_read(&n_rcu_torture_error))
+ 60 rcu_torture_print_module_parms("End of test: FAILURE");
+ 61 else
+ 62 rcu_torture_print_module_parms("End of test: SUCCESS");
+ 63 }
+
+Line 6 sets a global variable that prevents any RCU callbacks from
+re-posting themselves. This will not be necessary in most cases, since
+RCU callbacks rarely include calls to call_rcu(). However, the rcutorture
+module is an exception to this rule, and therefore needs to set this
+global variable.
+
+Lines 7-50 stop all the kernel tasks associated with the rcutorture
+module. Therefore, once execution reaches line 53, no more rcutorture
+RCU callbacks will be posted. The rcu_barrier() call on line 53 waits
+for any pre-existing callbacks to complete.
+
+Then lines 55-62 print status and do operation-specific cleanup, and
+then return, permitting the module-unload operation to be completed.
+
+.. _rcubarrier_quiz_1:
+
+Quick Quiz #1:
+ Is there any other situation where rcu_barrier() might
+ be required?
+
+:ref:`Answer to Quick Quiz #1 <answer_rcubarrier_quiz_1>`
+
+Your module might have additional complications. For example, if your
+module invokes call_rcu() from timers, you will need to first cancel all
+the timers, and only then invoke rcu_barrier() to wait for any remaining
+RCU callbacks to complete.
+
+Of course, if your module uses call_rcu(), you will need to invoke
+rcu_barrier() before unloading. Similarly, if your module uses
+call_srcu(), you will need to invoke srcu_barrier() before unloading,
+and on the same srcu_struct structure. If your module uses call_rcu()
+**and** call_srcu(), then you will need to invoke rcu_barrier() **and**
+srcu_barrier().
+
+
+Implementing rcu_barrier()
+--------------------------
+
+Dipankar Sarma's implementation of rcu_barrier() makes use of the fact
+that RCU callbacks are never reordered once queued on one of the per-CPU
+queues. His implementation queues an RCU callback on each of the per-CPU
+callback queues, and then waits until they have all started executing, at
+which point, all earlier RCU callbacks are guaranteed to have completed.
+
+The original code for rcu_barrier() was as follows::
+
+ 1 void rcu_barrier(void)
+ 2 {
+ 3 BUG_ON(in_interrupt());
+ 4 /* Take cpucontrol mutex to protect against CPU hotplug */
+ 5 mutex_lock(&rcu_barrier_mutex);
+ 6 init_completion(&rcu_barrier_completion);
+ 7 atomic_set(&rcu_barrier_cpu_count, 0);
+ 8 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
+ 9 wait_for_completion(&rcu_barrier_completion);
+ 10 mutex_unlock(&rcu_barrier_mutex);
+ 11 }
+
+Line 3 verifies that the caller is in process context, and lines 5 and 10
+use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the
+global completion and counters at a time, which are initialized on lines
+6 and 7. Line 8 causes each CPU to invoke rcu_barrier_func(), which is
+shown below. Note that the final "1" in on_each_cpu()'s argument list
+ensures that all the calls to rcu_barrier_func() will have completed
+before on_each_cpu() returns. Line 9 then waits for the completion.
+
+This code was rewritten in 2008 and several times thereafter, but this
+still gives the general idea.
+
+The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
+to post an RCU callback, as follows::
+
+ 1 static void rcu_barrier_func(void *notused)
+ 2 {
+ 3 int cpu = smp_processor_id();
+ 4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ 5 struct rcu_head *head;
+ 6
+ 7 head = &rdp->barrier;
+ 8 atomic_inc(&rcu_barrier_cpu_count);
+ 9 call_rcu(head, rcu_barrier_callback);
+ 10 }
+
+Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure,
+which contains the struct rcu_head that is needed for the later call to
+call_rcu(). Line 7 picks up a pointer to this struct rcu_head, and line
+8 increments a global counter. This counter will later be decremented
+by the callback. Line 9 then registers the rcu_barrier_callback() on
+the current CPU's queue.
+
+The rcu_barrier_callback() function simply atomically decrements the
+rcu_barrier_cpu_count variable and finalizes the completion when it
+reaches zero, as follows::
+
+ 1 static void rcu_barrier_callback(struct rcu_head *notused)
+ 2 {
+ 3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+ 4 complete(&rcu_barrier_completion);
+ 5 }
+
+.. _rcubarrier_quiz_2:
+
+Quick Quiz #2:
+ What happens if CPU 0's rcu_barrier_func() executes
+ immediately (thus incrementing rcu_barrier_cpu_count to the
+ value one), but the other CPU's rcu_barrier_func() invocations
+ are delayed for a full grace period? Couldn't this result in
+ rcu_barrier() returning prematurely?
+
+:ref:`Answer to Quick Quiz #2 <answer_rcubarrier_quiz_2>`
+
+The current rcu_barrier() implementation is more complex, due to the need
+to avoid disturbing idle CPUs (especially on battery-powered systems)
+and the need to minimally disturb non-idle CPUs in real-time systems.
+However, the code above illustrates the concepts.
+
+
+rcu_barrier() Summary
+---------------------
+
+The rcu_barrier() primitive has seen relatively little use, since most
+code using RCU is in the core kernel rather than in modules. However, if
+you are using RCU from an unloadable module, you need to use rcu_barrier()
+so that your module may be safely unloaded.
+
+
+Answers to Quick Quizzes
+------------------------
+
+.. _answer_rcubarrier_quiz_1:
+
+Quick Quiz #1:
+ Is there any other situation where rcu_barrier() might
+ be required?
+
+Answer: Interestingly enough, rcu_barrier() was not originally
+ implemented for module unloading. Nikita Danilov was using
+ RCU in a filesystem, which resulted in a similar situation at
+ filesystem-unmount time. Dipankar Sarma coded up rcu_barrier()
+ in response, so that Nikita could invoke it during the
+ filesystem-unmount process.
+
+ Much later, yours truly hit the RCU module-unload problem when
+ implementing rcutorture, and found that rcu_barrier() solves
+ this problem as well.
+
+:ref:`Back to Quick Quiz #1 <rcubarrier_quiz_1>`
+
+.. _answer_rcubarrier_quiz_2:
+
+Quick Quiz #2:
+ What happens if CPU 0's rcu_barrier_func() executes
+ immediately (thus incrementing rcu_barrier_cpu_count to the
+ value one), but the other CPU's rcu_barrier_func() invocations
+ are delayed for a full grace period? Couldn't this result in
+ rcu_barrier() returning prematurely?
+
+Answer: This cannot happen. The reason is that on_each_cpu() has its last
+ argument, the wait flag, set to "1". This flag is passed through
+ to smp_call_function() and further to smp_call_function_on_cpu(),
+ causing this latter to spin until the cross-CPU invocation of
+ rcu_barrier_func() has completed. This by itself would prevent
+ a grace period from completing on non-CONFIG_PREEMPT kernels,
+ since each CPU must undergo a context switch (or other quiescent
+ state) before the grace period can complete. However, this is
+ of no use in CONFIG_PREEMPT kernels.
+
+ Therefore, on_each_cpu() disables preemption across its call
+ to smp_call_function() and also across the local call to
+ rcu_barrier_func(). This prevents the local CPU from context
+ switching, again preventing grace periods from completing. This
+ means that all CPUs have executed rcu_barrier_func() before
+ the first rcu_barrier_callback() can possibly execute, in turn
+ preventing rcu_barrier_cpu_count from prematurely reaching zero.
+
+ Currently, -rt implementations of RCU keep but a single global
+ queue for RCU callbacks, and thus do not suffer from this
+ problem. However, when the -rt RCU eventually does have per-CPU
+ callback queues, things will have to change. One simple change
+ is to add an rcu_read_lock() before line 8 of rcu_barrier()
+ and an rcu_read_unlock() after line 8 of this same function. If
+ you can think of a better change, please let me know!
+
+:ref:`Back to Quick Quiz #2 <rcubarrier_quiz_2>`
diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt
deleted file mode 100644
index a2782df..0000000
--- a/Documentation/RCU/rcubarrier.txt
+++ /dev/null
@@ -1,325 +0,0 @@
-RCU and Unloadable Modules
-
-[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/]
-
-RCU (read-copy update) is a synchronization mechanism that can be thought
-of as a replacement for read-writer locking (among other things), but with
-very low-overhead readers that are immune to deadlock, priority inversion,
-and unbounded latency. RCU read-side critical sections are delimited
-by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT
-kernels, generate no code whatsoever.
-
-This means that RCU writers are unaware of the presence of concurrent
-readers, so that RCU updates to shared data must be undertaken quite
-carefully, leaving an old version of the data structure in place until all
-pre-existing readers have finished. These old versions are needed because
-such readers might hold a reference to them. RCU updates can therefore be
-rather expensive, and RCU is thus best suited for read-mostly situations.
-
-How can an RCU writer possibly determine when all readers are finished,
-given that readers might well leave absolutely no trace of their
-presence? There is a synchronize_rcu() primitive that blocks until all
-pre-existing readers have completed. An updater wishing to delete an
-element p from a linked list might do the following, while holding an
-appropriate lock, of course:
-
- list_del_rcu(p);
- synchronize_rcu();
- kfree(p);
-
-But the above code cannot be used in IRQ context -- the call_rcu()
-primitive must be used instead. This primitive takes a pointer to an
-rcu_head struct placed within the RCU-protected data structure and
-another pointer to a function that may be invoked later to free that
-structure. Code to delete an element p from the linked list from IRQ
-context might then be as follows:
-
- list_del_rcu(p);
- call_rcu(&p->rcu, p_callback);
-
-Since call_rcu() never blocks, this code can safely be used from within
-IRQ context. The function p_callback() might be defined as follows:
-
- static void p_callback(struct rcu_head *rp)
- {
- struct pstruct *p = container_of(rp, struct pstruct, rcu);
-
- kfree(p);
- }
-
-
-Unloading Modules That Use call_rcu()
-
-But what if p_callback is defined in an unloadable module?
-
-If we unload the module while some RCU callbacks are pending,
-the CPUs executing these callbacks are going to be severely
-disappointed when they are later invoked, as fancifully depicted at
-http://lwn.net/images/ns/kernel/rcu-drop.jpg.
-
-We could try placing a synchronize_rcu() in the module-exit code path,
-but this is not sufficient. Although synchronize_rcu() does wait for a
-grace period to elapse, it does not wait for the callbacks to complete.
-
-One might be tempted to try several back-to-back synchronize_rcu()
-calls, but this is still not guaranteed to work. If there is a very
-heavy RCU-callback load, then some of the callbacks might be deferred
-in order to allow other processing to proceed. Such deferral is required
-in realtime kernels in order to avoid excessive scheduling latencies.
-
-
-rcu_barrier()
-
-We instead need the rcu_barrier() primitive. Rather than waiting for
-a grace period to elapse, rcu_barrier() waits for all outstanding RCU
-callbacks to complete. Please note that rcu_barrier() does -not- imply
-synchronize_rcu(), in particular, if there are no RCU callbacks queued
-anywhere, rcu_barrier() is within its rights to return immediately,
-without waiting for a grace period to elapse.
-
-Pseudo-code using rcu_barrier() is as follows:
-
- 1. Prevent any new RCU callbacks from being posted.
- 2. Execute rcu_barrier().
- 3. Allow the module to be unloaded.
-
-There is also an srcu_barrier() function for SRCU, and you of course
-must match the flavor of rcu_barrier() with that of call_rcu(). If your
-module uses multiple flavors of call_rcu(), then it must also use multiple
-flavors of rcu_barrier() when unloading that module. For example, if
-it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on
-srcu_struct_2(), then the following three lines of code will be required
-when unloading:
-
- 1 rcu_barrier();
- 2 srcu_barrier(&srcu_struct_1);
- 3 srcu_barrier(&srcu_struct_2);
-
-The rcutorture module makes use of rcu_barrier() in its exit function
-as follows:
-
- 1 static void
- 2 rcu_torture_cleanup(void)
- 3 {
- 4 int i;
- 5
- 6 fullstop = 1;
- 7 if (shuffler_task != NULL) {
- 8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
- 9 kthread_stop(shuffler_task);
-10 }
-11 shuffler_task = NULL;
-12
-13 if (writer_task != NULL) {
-14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
-15 kthread_stop(writer_task);
-16 }
-17 writer_task = NULL;
-18
-19 if (reader_tasks != NULL) {
-20 for (i = 0; i < nrealreaders; i++) {
-21 if (reader_tasks[i] != NULL) {
-22 VERBOSE_PRINTK_STRING(
-23 "Stopping rcu_torture_reader task");
-24 kthread_stop(reader_tasks[i]);
-25 }
-26 reader_tasks[i] = NULL;
-27 }
-28 kfree(reader_tasks);
-29 reader_tasks = NULL;
-30 }
-31 rcu_torture_current = NULL;
-32
-33 if (fakewriter_tasks != NULL) {
-34 for (i = 0; i < nfakewriters; i++) {
-35 if (fakewriter_tasks[i] != NULL) {
-36 VERBOSE_PRINTK_STRING(
-37 "Stopping rcu_torture_fakewriter task");
-38 kthread_stop(fakewriter_tasks[i]);
-39 }
-40 fakewriter_tasks[i] = NULL;
-41 }
-42 kfree(fakewriter_tasks);
-43 fakewriter_tasks = NULL;
-44 }
-45
-46 if (stats_task != NULL) {
-47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
-48 kthread_stop(stats_task);
-49 }
-50 stats_task = NULL;
-51
-52 /* Wait for all RCU callbacks to fire. */
-53 rcu_barrier();
-54
-55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
-56
-57 if (cur_ops->cleanup != NULL)
-58 cur_ops->cleanup();
-59 if (atomic_read(&n_rcu_torture_error))
-60 rcu_torture_print_module_parms("End of test: FAILURE");
-61 else
-62 rcu_torture_print_module_parms("End of test: SUCCESS");
-63 }
-
-Line 6 sets a global variable that prevents any RCU callbacks from
-re-posting themselves. This will not be necessary in most cases, since
-RCU callbacks rarely include calls to call_rcu(). However, the rcutorture
-module is an exception to this rule, and therefore needs to set this
-global variable.
-
-Lines 7-50 stop all the kernel tasks associated with the rcutorture
-module. Therefore, once execution reaches line 53, no more rcutorture
-RCU callbacks will be posted. The rcu_barrier() call on line 53 waits
-for any pre-existing callbacks to complete.
-
-Then lines 55-62 print status and do operation-specific cleanup, and
-then return, permitting the module-unload operation to be completed.
-
-Quick Quiz #1: Is there any other situation where rcu_barrier() might
- be required?
-
-Your module might have additional complications. For example, if your
-module invokes call_rcu() from timers, you will need to first cancel all
-the timers, and only then invoke rcu_barrier() to wait for any remaining
-RCU callbacks to complete.
-
-Of course, if you module uses call_rcu(), you will need to invoke
-rcu_barrier() before unloading. Similarly, if your module uses
-call_srcu(), you will need to invoke srcu_barrier() before unloading,
-and on the same srcu_struct structure. If your module uses call_rcu()
--and- call_srcu(), then you will need to invoke rcu_barrier() -and-
-srcu_barrier().
-
-
-Implementing rcu_barrier()
-
-Dipankar Sarma's implementation of rcu_barrier() makes use of the fact
-that RCU callbacks are never reordered once queued on one of the per-CPU
-queues. His implementation queues an RCU callback on each of the per-CPU
-callback queues, and then waits until they have all started executing, at
-which point, all earlier RCU callbacks are guaranteed to have completed.
-
-The original code for rcu_barrier() was as follows:
-
- 1 void rcu_barrier(void)
- 2 {
- 3 BUG_ON(in_interrupt());
- 4 /* Take cpucontrol mutex to protect against CPU hotplug */
- 5 mutex_lock(&rcu_barrier_mutex);
- 6 init_completion(&rcu_barrier_completion);
- 7 atomic_set(&rcu_barrier_cpu_count, 0);
- 8 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
- 9 wait_for_completion(&rcu_barrier_completion);
-10 mutex_unlock(&rcu_barrier_mutex);
-11 }
-
-Line 3 verifies that the caller is in process context, and lines 5 and 10
-use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the
-global completion and counters at a time, which are initialized on lines
-6 and 7. Line 8 causes each CPU to invoke rcu_barrier_func(), which is
-shown below. Note that the final "1" in on_each_cpu()'s argument list
-ensures that all the calls to rcu_barrier_func() will have completed
-before on_each_cpu() returns. Line 9 then waits for the completion.
-
-This code was rewritten in 2008 and several times thereafter, but this
-still gives the general idea.
-
-The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
-to post an RCU callback, as follows:
-
- 1 static void rcu_barrier_func(void *notused)
- 2 {
- 3 int cpu = smp_processor_id();
- 4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
- 5 struct rcu_head *head;
- 6
- 7 head = &rdp->barrier;
- 8 atomic_inc(&rcu_barrier_cpu_count);
- 9 call_rcu(head, rcu_barrier_callback);
-10 }
-
-Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure,
-which contains the struct rcu_head that needed for the later call to
-call_rcu(). Line 7 picks up a pointer to this struct rcu_head, and line
-8 increments a global counter. This counter will later be decremented
-by the callback. Line 9 then registers the rcu_barrier_callback() on
-the current CPU's queue.
-
-The rcu_barrier_callback() function simply atomically decrements the
-rcu_barrier_cpu_count variable and finalizes the completion when it
-reaches zero, as follows:
-
- 1 static void rcu_barrier_callback(struct rcu_head *notused)
- 2 {
- 3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
- 4 complete(&rcu_barrier_completion);
- 5 }
-
-Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes
- immediately (thus incrementing rcu_barrier_cpu_count to the
- value one), but the other CPU's rcu_barrier_func() invocations
- are delayed for a full grace period? Couldn't this result in
- rcu_barrier() returning prematurely?
-
-The current rcu_barrier() implementation is more complex, due to the need
-to avoid disturbing idle CPUs (especially on battery-powered systems)
-and the need to minimally disturb non-idle CPUs in real-time systems.
-However, the code above illustrates the concepts.
-
-
-rcu_barrier() Summary
-
-The rcu_barrier() primitive has seen relatively little use, since most
-code using RCU is in the core kernel rather than in modules. However, if
-you are using RCU from an unloadable module, you need to use rcu_barrier()
-so that your module may be safely unloaded.
-
-
-Answers to Quick Quizzes
-
-Quick Quiz #1: Is there any other situation where rcu_barrier() might
- be required?
-
-Answer: Interestingly enough, rcu_barrier() was not originally
- implemented for module unloading. Nikita Danilov was using
- RCU in a filesystem, which resulted in a similar situation at
- filesystem-unmount time. Dipankar Sarma coded up rcu_barrier()
- in response, so that Nikita could invoke it during the
- filesystem-unmount process.
-
- Much later, yours truly hit the RCU module-unload problem when
- implementing rcutorture, and found that rcu_barrier() solves
- this problem as well.
-
-Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes
- immediately (thus incrementing rcu_barrier_cpu_count to the
- value one), but the other CPU's rcu_barrier_func() invocations
- are delayed for a full grace period? Couldn't this result in
- rcu_barrier() returning prematurely?
-
-Answer: This cannot happen. The reason is that on_each_cpu() has its last
- argument, the wait flag, set to "1". This flag is passed through
- to smp_call_function() and further to smp_call_function_on_cpu(),
- causing this latter to spin until the cross-CPU invocation of
- rcu_barrier_func() has completed. This by itself would prevent
- a grace period from completing on non-CONFIG_PREEMPT kernels,
- since each CPU must undergo a context switch (or other quiescent
- state) before the grace period can complete. However, this is
- of no use in CONFIG_PREEMPT kernels.
-
- Therefore, on_each_cpu() disables preemption across its call
- to smp_call_function() and also across the local call to
- rcu_barrier_func(). This prevents the local CPU from context
- switching, again preventing grace periods from completing. This
- means that all CPUs have executed rcu_barrier_func() before
- the first rcu_barrier_callback() can possibly execute, in turn
- preventing rcu_barrier_cpu_count from prematurely reaching zero.
-
- Currently, -rt implementations of RCU keep but a single global
- queue for RCU callbacks, and thus do not suffer from this
- problem. However, when the -rt RCU eventually does have per-CPU
- callback queues, things will have to change. One simple change
- is to add an rcu_read_lock() before line 8 of rcu_barrier()
- and an rcu_read_unlock() after line 8 of this same function. If
- you can think of a better change, please let me know!
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index f48f462..a360a87 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -225,18 +225,13 @@
In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
for each CPU:
- 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D
+ 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1
The "last_accelerate:" prints the low-order 16 bits (in hex) of the
jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
-rcu_prepare_for_idle(). The "Nonlazy posted:" indicates lazy-callback
-status, so that an "l" indicates that all callbacks were lazy at the start
-of the last idle period and an "L" indicates that there are currently
-no non-lazy callbacks (in both cases, "." is printed otherwise, as
-shown above) and "D" indicates that dyntick-idle processing is enabled
-("." is printed otherwise, for example, if disabled via the "nohz="
-kernel boot parameter).
+rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle
+processing is enabled.
If the grace period ends just as the stall warning starts printing,
there will be a spurious stall-warning message, which will include
diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
new file mode 100644
index 0000000..c7f147b
--- /dev/null
+++ b/Documentation/RCU/whatisRCU.rst
@@ -0,0 +1,1154 @@
+.. _whatisrcu_doc:
+
+What is RCU? -- "Read, Copy, Update"
+======================================
+
+Please note that the "What is RCU?" LWN series is an excellent place
+to start learning about RCU:
+
+| 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
+| 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
+| 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/
+| 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
+| 2010 Big API Table http://lwn.net/Articles/419086/
+| 5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/
+| 2014 Big API Table http://lwn.net/Articles/609973/
+
+
+What is RCU?
+
+RCU is a synchronization mechanism that was added to the Linux kernel
+during the 2.5 development effort that is optimized for read-mostly
+situations. Although RCU is actually quite simple once you understand it,
+getting there can sometimes be a challenge. Part of the problem is that
+most of the past descriptions of RCU have been written with the mistaken
+assumption that there is "one true way" to describe RCU. Instead,
+the experience has been that different people must take different paths
+to arrive at an understanding of RCU. This document provides several
+different paths, as follows:
+
+:ref:`1. RCU OVERVIEW <1_whatisRCU>`
+
+:ref:`2. WHAT IS RCU'S CORE API? <2_whatisRCU>`
+
+:ref:`3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? <3_whatisRCU>`
+
+:ref:`4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? <4_whatisRCU>`
+
+:ref:`5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? <5_whatisRCU>`
+
+:ref:`6. ANALOGY WITH READER-WRITER LOCKING <6_whatisRCU>`
+
+:ref:`7. FULL LIST OF RCU APIs <7_whatisRCU>`
+
+:ref:`8. ANSWERS TO QUICK QUIZZES <8_whatisRCU>`
+
+People who prefer starting with a conceptual overview should focus on
+Section 1, though most readers will profit by reading this section at
+some point. People who prefer to start with an API that they can then
+experiment with should focus on Section 2. People who prefer to start
+with example uses should focus on Sections 3 and 4. People who need to
+understand the RCU implementation should focus on Section 5, then dive
+into the kernel source code. People who reason best by analogy should
+focus on Section 6. Section 7 serves as an index to the docbook API
+documentation, and Section 8 is the traditional answer key.
+
+So, start with the section that makes the most sense to you and your
+preferred method of learning. If you need to know everything about
+everything, feel free to read the whole thing -- but if you are really
+that type of person, you have perused the source code and will therefore
+never need this document anyway. ;-)
+
+.. _1_whatisRCU:
+
+1. RCU OVERVIEW
+----------------
+
+The basic idea behind RCU is to split updates into "removal" and
+"reclamation" phases. The removal phase removes references to data items
+within a data structure (possibly by replacing them with references to
+new versions of these data items), and can run concurrently with readers.
+The reason that it is safe to run the removal phase concurrently with
+readers is that the semantics of modern CPUs guarantee that readers will see
+either the old or the new version of the data structure rather than a
+partially updated reference. The reclamation phase does the work of reclaiming
+(e.g., freeing) the data items removed from the data structure during the
+removal phase. Because reclaiming data items can disrupt any readers
+concurrently referencing those data items, the reclamation phase must
+not start until readers no longer hold references to those data items.
+
+Splitting the update into removal and reclamation phases permits the
+updater to perform the removal phase immediately, and to defer the
+reclamation phase until all readers active during the removal phase have
+completed, either by blocking until they finish or by registering a
+callback that is invoked after they finish. Only readers that are active
+during the removal phase need be considered, because any reader starting
+after the removal phase will be unable to gain a reference to the removed
+data items, and therefore cannot be disrupted by the reclamation phase.
+
+So the typical RCU update sequence goes something like the following:
+
+a. Remove pointers to a data structure, so that subsequent
+ readers cannot gain a reference to it.
+
+b. Wait for all previous readers to complete their RCU read-side
+ critical sections.
+
+c. At this point, there cannot be any readers who hold references
+ to the data structure, so it now may safely be reclaimed
+ (e.g., kfree()d).
+
+Step (b) above is the key idea underlying RCU's deferred destruction.
+The ability to wait until all readers are done allows RCU readers to
+use much lighter-weight synchronization, in some cases, absolutely no
+synchronization at all. In contrast, in more conventional lock-based
+schemes, readers must use heavy-weight synchronization in order to
+prevent an updater from deleting the data structure out from under them.
+This is because lock-based updaters typically update data items in place,
+and must therefore exclude readers. In contrast, RCU-based updaters
+typically take advantage of the fact that writes to single aligned
+pointers are atomic on modern CPUs, allowing atomic insertion, removal,
+and replacement of data items in a linked structure without disrupting
+readers. Concurrent RCU readers can then continue accessing the old
+versions, and can dispense with the atomic operations, memory barriers,
+and communications cache misses that are so expensive on present-day
+SMP computer systems, even in the absence of lock contention.
+
+In the three-step procedure shown above, the updater is performing both
+the removal and the reclamation step, but it is often helpful for an
+entirely different thread to do the reclamation, as is in fact the case
+in the Linux kernel's directory-entry cache (dcache). Even if the same
+thread performs both the update step (step (a) above) and the reclamation
+step (step (c) above), it is often helpful to think of them separately.
+For example, RCU readers and updaters need not communicate at all,
+but RCU provides implicit low-overhead communication between readers
+and reclaimers, namely, in step (b) above.
+
+So how the heck can a reclaimer tell when a reader is done, given
+that readers are not doing any sort of synchronization operations???
+Read on to learn about how RCU's API makes this easy.
+
+.. _2_whatisRCU:
+
+2. WHAT IS RCU'S CORE API?
+---------------------------
+
+The core RCU API is quite small:
+
+a. rcu_read_lock()
+b. rcu_read_unlock()
+c. synchronize_rcu() / call_rcu()
+d. rcu_assign_pointer()
+e. rcu_dereference()
+
+There are many other members of the RCU API, but the rest can be
+expressed in terms of these five, though most implementations instead
+express synchronize_rcu() in terms of the call_rcu() callback API.
+
+The five core RCU APIs are described below; the other 18 will be enumerated
+later. See the kernel docbook documentation for more info, or look directly
+at the function header comments.
+
+rcu_read_lock()
+^^^^^^^^^^^^^^^
+ void rcu_read_lock(void);
+
+ Used by a reader to inform the reclaimer that the reader is
+ entering an RCU read-side critical section. It is illegal
+ to block while in an RCU read-side critical section, though
+ kernels built with CONFIG_PREEMPT_RCU can preempt RCU
+ read-side critical sections. Any RCU-protected data structure
+ accessed during an RCU read-side critical section is guaranteed to
+ remain unreclaimed for the full duration of that critical section.
+ Reference counts may be used in conjunction with RCU to maintain
+ longer-term references to data structures.
+
+rcu_read_unlock()
+^^^^^^^^^^^^^^^^^
+ void rcu_read_unlock(void);
+
+ Used by a reader to inform the reclaimer that the reader is
+ exiting an RCU read-side critical section. Note that RCU
+ read-side critical sections may be nested and/or overlapping.
+
+synchronize_rcu()
+^^^^^^^^^^^^^^^^^
+ void synchronize_rcu(void);
+
+ Marks the end of updater code and the beginning of reclaimer
+ code. It does this by blocking until all pre-existing RCU
+ read-side critical sections on all CPUs have completed.
+ Note that synchronize_rcu() will **not** necessarily wait for
+ any subsequent RCU read-side critical sections to complete.
+ For example, consider the following sequence of events::
+
+ CPU 0 CPU 1 CPU 2
+ ----------------- ------------------------- ---------------
+ 1. rcu_read_lock()
+ 2. enters synchronize_rcu()
+ 3. rcu_read_lock()
+ 4. rcu_read_unlock()
+ 5. exits synchronize_rcu()
+ 6. rcu_read_unlock()
+
+ To reiterate, synchronize_rcu() waits only for ongoing RCU
+ read-side critical sections to complete, not necessarily for
+ any that begin after synchronize_rcu() is invoked.
+
+ Of course, synchronize_rcu() does not necessarily return
+ **immediately** after the last pre-existing RCU read-side critical
+ section completes. For one thing, there might well be scheduling
+ delays. For another thing, many RCU implementations process
+ requests in batches in order to improve efficiencies, which can
+ further delay synchronize_rcu().
+
+ Since synchronize_rcu() is the API that must figure out when
+ readers are done, its implementation is key to RCU. For RCU
+ to be useful in all but the most read-intensive situations,
+ synchronize_rcu()'s overhead must also be quite small.
+
+ The call_rcu() API is a callback form of synchronize_rcu(),
+ and is described in more detail in a later section. Instead of
+ blocking, it registers a function and argument which are invoked
+ after all ongoing RCU read-side critical sections have completed.
+ This callback variant is particularly useful in situations where
+ it is illegal to block or where update-side performance is
+ critically important.
+
+ However, the call_rcu() API should not be used lightly, as use
+ of the synchronize_rcu() API generally results in simpler code.
+ In addition, the synchronize_rcu() API has the nice property
+ of automatically limiting update rate should grace periods
+ be delayed. This property results in system resilience in face
+ of denial-of-service attacks. Code using call_rcu() should limit
+ update rate in order to gain this same sort of resilience. See
+ checklist.txt for some approaches to limiting the update rate.
+
+rcu_assign_pointer()
+^^^^^^^^^^^^^^^^^^^^
+ void rcu_assign_pointer(p, typeof(p) v);
+
+ Yes, rcu_assign_pointer() **is** implemented as a macro, though it
+ would be cool to be able to declare a function in this manner.
+ (Compiler experts will no doubt disagree.)
+
+ The updater uses this function to assign a new value to an
+ RCU-protected pointer, in order to safely communicate the change
+ in value from the updater to the reader. This macro does not
+ evaluate to an rvalue, but it does execute any memory-barrier
+ instructions required for a given CPU architecture.
+
+ Perhaps just as important, it serves to document (1) which
+ pointers are protected by RCU and (2) the point at which a
+ given structure becomes accessible to other CPUs. That said,
+ rcu_assign_pointer() is most frequently used indirectly, via
+ the _rcu list-manipulation primitives such as list_add_rcu().
+
+rcu_dereference()
+^^^^^^^^^^^^^^^^^
+ typeof(p) rcu_dereference(p);
+
+ Like rcu_assign_pointer(), rcu_dereference() must be implemented
+ as a macro.
+
+ The reader uses rcu_dereference() to fetch an RCU-protected
+ pointer, which returns a value that may then be safely
+ dereferenced. Note that rcu_dereference() does not actually
+ dereference the pointer, instead, it protects the pointer for
+ later dereferencing. It also executes any needed memory-barrier
+ instructions for a given CPU architecture. Currently, only Alpha
+ needs memory barriers within rcu_dereference() -- on other CPUs,
+ it compiles to nothing, not even a compiler directive.
+
+ Common coding practice uses rcu_dereference() to copy an
+ RCU-protected pointer to a local variable, then dereferences
+ this local variable, for example as follows::
+
+ p = rcu_dereference(head.next);
+ return p->data;
+
+ However, in this case, one could just as easily combine these
+ into one statement::
+
+ return rcu_dereference(head.next)->data;
+
+ If you are going to be fetching multiple fields from the
+ RCU-protected structure, using the local variable is of
+ course preferred. Repeated rcu_dereference() calls look
+ ugly, do not guarantee that the same pointer will be returned
+ if an update happened while in the critical section, and incur
+ unnecessary overhead on Alpha CPUs.
+
+ Note that the value returned by rcu_dereference() is valid
+ only within the enclosing RCU read-side critical section [1]_.
+ For example, the following is **not** legal::
+
+ rcu_read_lock();
+ p = rcu_dereference(head.next);
+ rcu_read_unlock();
+ x = p->address; /* BUG!!! */
+ rcu_read_lock();
+ y = p->data; /* BUG!!! */
+ rcu_read_unlock();
+
+ Holding a reference from one RCU read-side critical section
+ to another is just as illegal as holding a reference from
+ one lock-based critical section to another! Similarly,
+ using a reference outside of the critical section in which
+ it was acquired is just as illegal as doing so with normal
+ locking.
+
+ As with rcu_assign_pointer(), an important function of
+ rcu_dereference() is to document which pointers are protected by
+ RCU, in particular, flagging a pointer that is subject to changing
+ at any time, including immediately after the rcu_dereference().
+ And, again like rcu_assign_pointer(), rcu_dereference() is
+ typically used indirectly, via the _rcu list-manipulation
+ primitives, such as list_for_each_entry_rcu() [2]_.
+
+.. [1] The variant rcu_dereference_protected() can be used outside
+ of an RCU read-side critical section as long as the usage is
+ protected by locks acquired by the update-side code. This variant
+ avoids the lockdep warning that would happen when using (for
+ example) rcu_dereference() without rcu_read_lock() protection.
+ Using rcu_dereference_protected() also has the advantage
+ of permitting compiler optimizations that rcu_dereference()
+ must prohibit. The rcu_dereference_protected() variant takes
+ a lockdep expression to indicate which locks must be acquired
+ by the caller. If the indicated protection is not provided,
+ a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst
+ and the API's code comments for more details and example usage.
+
+.. [2] If the list_for_each_entry_rcu() instance might be used by
+ update-side code as well as by RCU readers, then an additional
+ lockdep expression can be added to its list of arguments.
+ For example, given an additional "lock_is_held(&mylock)" argument,
+ the RCU lockdep code would complain only if this instance was
+ invoked outside of an RCU read-side critical section and without
+ the protection of mylock.
+
+The following diagram shows how each API communicates among the
+reader, updater, and reclaimer.
+::
+
+
+ rcu_assign_pointer()
+ +--------+
+ +---------------------->| reader |---------+
+ | +--------+ |
+ | | |
+ | | | Protect:
+ | | | rcu_read_lock()
+ | | | rcu_read_unlock()
+ | rcu_dereference() | |
+ +---------+ | |
+ | updater |<----------------+ |
+ +---------+ V
+ | +-----------+
+ +----------------------------------->| reclaimer |
+ +-----------+
+ Defer:
+ synchronize_rcu() & call_rcu()
+
+
+The RCU infrastructure observes the time sequence of rcu_read_lock(),
+rcu_read_unlock(), synchronize_rcu(), and call_rcu() invocations in
+order to determine when (1) synchronize_rcu() invocations may return
+to their callers and (2) call_rcu() callbacks may be invoked. Efficient
+implementations of the RCU infrastructure make heavy use of batching in
+order to amortize their overhead over many uses of the corresponding APIs.
+
+There are at least three flavors of RCU usage in the Linux kernel. The diagram
+above shows the most common one. On the updater side, the rcu_assign_pointer(),
+synchronize_rcu() and call_rcu() primitives used are the same for all three
+flavors. However, for protection (on the reader side), the primitives used vary
+depending on the flavor:
+
+a. rcu_read_lock() / rcu_read_unlock()
+ rcu_dereference()
+
+b. rcu_read_lock_bh() / rcu_read_unlock_bh()
+ local_bh_disable() / local_bh_enable()
+ rcu_dereference_bh()
+
+c. rcu_read_lock_sched() / rcu_read_unlock_sched()
+ preempt_disable() / preempt_enable()
+ local_irq_save() / local_irq_restore()
+ hardirq enter / hardirq exit
+ NMI enter / NMI exit
+ rcu_dereference_sched()
+
+These three flavors are used as follows:
+
+a. RCU applied to normal data structures.
+
+b. RCU applied to networking data structures that may be subjected
+ to remote denial-of-service attacks.
+
+c. RCU applied to scheduler and interrupt/NMI-handler tasks.
+
+Again, most uses will be of (a). The (b) and (c) cases are important
+for specialized uses, but are relatively uncommon.
+
+.. _3_whatisRCU:
+
+3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
+-----------------------------------------------
+
+This section shows a simple use of the core RCU API to protect a
+global pointer to a dynamically allocated structure. More-typical
+uses of RCU may be found in :ref:`listRCU.rst <list_rcu_doc>`,
+:ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst <NMI_rcu_doc>`.
+::
+
+ struct foo {
+ int a;
+ char b;
+ long c;
+ };
+ DEFINE_SPINLOCK(foo_mutex);
+
+ struct foo __rcu *gbl_foo;
+
+ /*
+ * Create a new struct foo that is the same as the one currently
+ * pointed to by gbl_foo, except that field "a" is replaced
+ * with "new_a". Points gbl_foo to the new structure, and
+ * frees up the old structure after a grace period.
+ *
+ * Uses rcu_assign_pointer() to ensure that concurrent readers
+ * see the initialized version of the new structure.
+ *
+ * Uses synchronize_rcu() to ensure that any readers that might
+ * have references to the old structure complete before freeing
+ * the old structure.
+ */
+ void foo_update_a(int new_a)
+ {
+ struct foo *new_fp;
+ struct foo *old_fp;
+
+ new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
+ spin_lock(&foo_mutex);
+ old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex));
+ *new_fp = *old_fp;
+ new_fp->a = new_a;
+ rcu_assign_pointer(gbl_foo, new_fp);
+ spin_unlock(&foo_mutex);
+ synchronize_rcu();
+ kfree(old_fp);
+ }
+
+ /*
+ * Return the value of field "a" of the current gbl_foo
+ * structure. Use rcu_read_lock() and rcu_read_unlock()
+ * to ensure that the structure does not get deleted out
+ * from under us, and use rcu_dereference() to ensure that
+ * we see the initialized version of the structure (important
+ * for DEC Alpha and for people reading the code).
+ */
+ int foo_get_a(void)
+ {
+ int retval;
+
+ rcu_read_lock();
+ retval = rcu_dereference(gbl_foo)->a;
+ rcu_read_unlock();
+ return retval;
+ }
+
+So, to sum up:
+
+- Use rcu_read_lock() and rcu_read_unlock() to guard RCU
+ read-side critical sections.
+
+- Within an RCU read-side critical section, use rcu_dereference()
+ to dereference RCU-protected pointers.
+
+- Use some solid scheme (such as locks or semaphores) to
+ keep concurrent updates from interfering with each other.
+
+- Use rcu_assign_pointer() to update an RCU-protected pointer.
+ This primitive protects concurrent readers from the updater,
+ **not** concurrent updates from each other! You therefore still
+ need to use locking (or something similar) to keep concurrent
+ rcu_assign_pointer() primitives from interfering with each other.
+
+- Use synchronize_rcu() **after** removing a data element from an
+ RCU-protected data structure, but **before** reclaiming/freeing
+ the data element, in order to wait for the completion of all
+ RCU read-side critical sections that might be referencing that
+ data item.
+
+See checklist.txt for additional rules to follow when using RCU.
+And again, more-typical uses of RCU may be found in :ref:`listRCU.rst
+<list_rcu_doc>`, :ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst
+<NMI_rcu_doc>`.
+
+.. _4_whatisRCU:
+
+4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
+--------------------------------------------
+
+In the example above, foo_update_a() blocks until a grace period elapses.
+This is quite simple, but in some cases one cannot afford to wait so
+long -- there might be other high-priority work to be done.
+
+In such cases, one uses call_rcu() rather than synchronize_rcu().
+The call_rcu() API is as follows::
+
+ void call_rcu(struct rcu_head * head,
+ void (*func)(struct rcu_head *head));
+
+This function invokes func(head) after a grace period has elapsed.
+This invocation might happen from either softirq or process context,
+so the function is not permitted to block. The foo struct needs to
+have an rcu_head structure added, perhaps as follows::
+
+ struct foo {
+ int a;
+ char b;
+ long c;
+ struct rcu_head rcu;
+ };
+
+The foo_update_a() function might then be written as follows::
+
+ /*
+ * Create a new struct foo that is the same as the one currently
+ * pointed to by gbl_foo, except that field "a" is replaced
+ * with "new_a". Points gbl_foo to the new structure, and
+ * frees up the old structure after a grace period.
+ *
+ * Uses rcu_assign_pointer() to ensure that concurrent readers
+ * see the initialized version of the new structure.
+ *
+ * Uses call_rcu() to ensure that any readers that might have
+ * references to the old structure complete before freeing the
+ * old structure.
+ */
+ void foo_update_a(int new_a)
+ {
+ struct foo *new_fp;
+ struct foo *old_fp;
+
+ new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
+ spin_lock(&foo_mutex);
+ old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex));
+ *new_fp = *old_fp;
+ new_fp->a = new_a;
+ rcu_assign_pointer(gbl_foo, new_fp);
+ spin_unlock(&foo_mutex);
+ call_rcu(&old_fp->rcu, foo_reclaim);
+ }
+
+The foo_reclaim() function might appear as follows::
+
+ void foo_reclaim(struct rcu_head *rp)
+ {
+ struct foo *fp = container_of(rp, struct foo, rcu);
+
+ foo_cleanup(fp->a);
+
+ kfree(fp);
+ }
+
+The container_of() primitive is a macro that, given a pointer into a
+struct, the type of the struct, and the pointed-to field within the
+struct, returns a pointer to the beginning of the struct.
+
+The use of call_rcu() permits the caller of foo_update_a() to
+immediately regain control, without needing to worry further about the
+old version of the newly updated element. It also clearly shows the
+RCU distinction between updater, namely foo_update_a(), and reclaimer,
+namely foo_reclaim().
+
+The summary of advice is the same as for the previous section, except
+that we are now using call_rcu() rather than synchronize_rcu():
+
+- Use call_rcu() **after** removing a data element from an
+ RCU-protected data structure in order to register a callback
+ function that will be invoked after the completion of all RCU
+ read-side critical sections that might be referencing that
+ data item.
+
+If the callback for call_rcu() is not doing anything more than calling
+kfree() on the structure, you can use kfree_rcu() instead of call_rcu()
+to avoid having to write your own callback::
+
+ kfree_rcu(old_fp, rcu);
+
+Again, see checklist.txt for additional rules governing the use of RCU.
+
+.. _5_whatisRCU:
+
+5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
+------------------------------------------------
+
+One of the nice things about RCU is that it has extremely simple "toy"
+implementations that are a good first step towards understanding the
+production-quality implementations in the Linux kernel. This section
+presents two such "toy" implementations of RCU, one that is implemented
+in terms of familiar locking primitives, and another that more closely
+resembles "classic" RCU. Both are way too simple for real-world use,
+lacking both functionality and performance. However, they are useful
+in getting a feel for how RCU works. See kernel/rcu/update.c for a
+production-quality implementation, and see:
+
+ http://www.rdrop.com/users/paulmck/RCU
+
+for papers describing the Linux kernel RCU implementation. The OLS'01
+and OLS'02 papers are a good introduction, and the dissertation provides
+more details on the current implementation as of early 2004.
+
+
+5A. "TOY" IMPLEMENTATION #1: LOCKING
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+This section presents a "toy" RCU implementation that is based on
+familiar locking primitives. Its overhead makes it a non-starter for
+real-life use, as does its lack of scalability. It is also unsuitable
+for realtime use, since it allows scheduling latency to "bleed" from
+one read-side critical section to another. It also assumes recursive
+reader-writer locks: If you try this with non-recursive locks, and
+you allow nested rcu_read_lock() calls, you can deadlock.
+
+However, it is probably the easiest implementation to relate to, so is
+a good starting point.
+
+It is extremely simple::
+
+ static DEFINE_RWLOCK(rcu_gp_mutex);
+
+ void rcu_read_lock(void)
+ {
+ read_lock(&rcu_gp_mutex);
+ }
+
+ void rcu_read_unlock(void)
+ {
+ read_unlock(&rcu_gp_mutex);
+ }
+
+ void synchronize_rcu(void)
+ {
+ write_lock(&rcu_gp_mutex);
+ smp_mb__after_spinlock();
+ write_unlock(&rcu_gp_mutex);
+ }
+
+[You can ignore rcu_assign_pointer() and rcu_dereference() without missing
+much. But here are simplified versions anyway. And whatever you do,
+don't forget about them when submitting patches making use of RCU!]::
+
+ #define rcu_assign_pointer(p, v) \
+ ({ \
+ smp_store_release(&(p), (v)); \
+ })
+
+ #define rcu_dereference(p) \
+ ({ \
+ typeof(p) _________p1 = READ_ONCE(p); \
+ (_________p1); \
+ })
+
+
+The rcu_read_lock() and rcu_read_unlock() primitives read-acquire
+and release a global reader-writer lock. The synchronize_rcu()
+primitive write-acquires this same lock, then releases it. This means
+that once synchronize_rcu() exits, all RCU read-side critical sections
+that were in progress before synchronize_rcu() was called are guaranteed
+to have completed -- there is no way that synchronize_rcu() would have
+been able to write-acquire the lock otherwise. The smp_mb__after_spinlock()
+promotes synchronize_rcu() to a full memory barrier in compliance with
+the "Memory-Barrier Guarantees" listed in:
+
+ Documentation/RCU/Design/Requirements/Requirements.rst
+
+It is possible to nest rcu_read_lock(), since reader-writer locks may
+be recursively acquired. Note also that rcu_read_lock() is immune
+from deadlock (an important property of RCU). The reason for this is
+that the only thing that can block rcu_read_lock() is a synchronize_rcu().
+But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex,
+so there can be no deadlock cycle.
+
+.. _quiz_1:
+
+Quick Quiz #1:
+ Why is this argument naive? How could a deadlock
+ occur when using this algorithm in a real-world Linux
+ kernel? How could this deadlock be avoided?
+
+:ref:`Answers to Quick Quiz <8_whatisRCU>`
+
+5B. "TOY" EXAMPLE #2: CLASSIC RCU
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+This section presents a "toy" RCU implementation that is based on
+"classic RCU". It is also short on performance (but only for updates) and
+on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT
+kernels. The definitions of rcu_dereference() and rcu_assign_pointer()
+are the same as those shown in the preceding section, so they are omitted.
+::
+
+ void rcu_read_lock(void) { }
+
+ void rcu_read_unlock(void) { }
+
+ void synchronize_rcu(void)
+ {
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ run_on(cpu);
+ }
+
+Note that rcu_read_lock() and rcu_read_unlock() do absolutely nothing.
+This is the great strength of classic RCU in a non-preemptive kernel:
+read-side overhead is precisely zero, at least on non-Alpha CPUs.
+And there is absolutely no way that rcu_read_lock() can possibly
+participate in a deadlock cycle!
+
+The implementation of synchronize_rcu() simply schedules itself on each
+CPU in turn. The run_on() primitive can be implemented straightforwardly
+in terms of the sched_setaffinity() primitive. Of course, a somewhat less
+"toy" implementation would restore the affinity upon completion rather
+than just leaving all tasks running on the last CPU, but when I said
+"toy", I meant **toy**!
+
+So how the heck is this supposed to work???
+
+Remember that it is illegal to block while in an RCU read-side critical
+section. Therefore, if a given CPU executes a context switch, we know
+that it must have completed all preceding RCU read-side critical sections.
+Once **all** CPUs have executed a context switch, then **all** preceding
+RCU read-side critical sections will have completed.
+
+So, suppose that we remove a data item from its structure and then invoke
+synchronize_rcu(). Once synchronize_rcu() returns, we are guaranteed
+that there are no RCU read-side critical sections holding a reference
+to that data item, so we can safely reclaim it.
+
+.. _quiz_2:
+
+Quick Quiz #2:
+ Give an example where Classic RCU's read-side
+ overhead is **negative**.
+
+:ref:`Answers to Quick Quiz <8_whatisRCU>`
+
+.. _quiz_3:
+
+Quick Quiz #3:
+ If it is illegal to block in an RCU read-side
+ critical section, what the heck do you do in
+ PREEMPT_RT, where normal spinlocks can block???
+
+:ref:`Answers to Quick Quiz <8_whatisRCU>`
+
+.. _6_whatisRCU:
+
+6. ANALOGY WITH READER-WRITER LOCKING
+--------------------------------------
+
+Although RCU can be used in many different ways, a very common use of
+RCU is analogous to reader-writer locking. The following unified
+diff shows how closely related RCU and reader-writer locking can be.
+::
+
+ @@ -5,5 +5,5 @@ struct el {
+ int data;
+ /* Other data fields */
+ };
+ -rwlock_t listmutex;
+ +spinlock_t listmutex;
+ struct el head;
+
+ @@ -13,15 +14,15 @@
+ struct list_head *lp;
+ struct el *p;
+
+ - read_lock(&listmutex);
+ - list_for_each_entry(p, head, lp) {
+ + rcu_read_lock();
+ + list_for_each_entry_rcu(p, head, lp) {
+ if (p->key == key) {
+ *result = p->data;
+ - read_unlock(&listmutex);
+ + rcu_read_unlock();
+ return 1;
+ }
+ }
+ - read_unlock(&listmutex);
+ + rcu_read_unlock();
+ return 0;
+ }
+
+ @@ -29,15 +30,16 @@
+ {
+ struct el *p;
+
+ - write_lock(&listmutex);
+ + spin_lock(&listmutex);
+ list_for_each_entry(p, head, lp) {
+ if (p->key == key) {
+ - list_del(&p->list);
+ - write_unlock(&listmutex);
+ + list_del_rcu(&p->list);
+ + spin_unlock(&listmutex);
+ + synchronize_rcu();
+ kfree(p);
+ return 1;
+ }
+ }
+ - write_unlock(&listmutex);
+ + spin_unlock(&listmutex);
+ return 0;
+ }
+
+Or, for those who prefer a side-by-side listing::
+
+ 1 struct el { 1 struct el {
+ 2 struct list_head list; 2 struct list_head list;
+ 3 long key; 3 long key;
+ 4 spinlock_t mutex; 4 spinlock_t mutex;
+ 5 int data; 5 int data;
+ 6 /* Other data fields */ 6 /* Other data fields */
+ 7 }; 7 };
+ 8 rwlock_t listmutex; 8 spinlock_t listmutex;
+ 9 struct el head; 9 struct el head;
+
+::
+
+ 1 int search(long key, int *result) 1 int search(long key, int *result)
+ 2 { 2 {
+ 3 struct list_head *lp; 3 struct list_head *lp;
+ 4 struct el *p; 4 struct el *p;
+ 5 5
+ 6 read_lock(&listmutex); 6 rcu_read_lock();
+ 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) {
+ 8 if (p->key == key) { 8 if (p->key == key) {
+ 9 *result = p->data; 9 *result = p->data;
+ 10 read_unlock(&listmutex); 10 rcu_read_unlock();
+ 11 return 1; 11 return 1;
+ 12 } 12 }
+ 13 } 13 }
+ 14 read_unlock(&listmutex); 14 rcu_read_unlock();
+ 15 return 0; 15 return 0;
+ 16 } 16 }
+
+::
+
+ 1 int delete(long key) 1 int delete(long key)
+ 2 { 2 {
+ 3 struct el *p; 3 struct el *p;
+ 4 4
+ 5 write_lock(&listmutex); 5 spin_lock(&listmutex);
+ 6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) {
+ 7 if (p->key == key) { 7 if (p->key == key) {
+ 8 list_del(&p->list); 8 list_del_rcu(&p->list);
+ 9 write_unlock(&listmutex); 9 spin_unlock(&listmutex);
+ 10 synchronize_rcu();
+ 10 kfree(p); 11 kfree(p);
+ 11 return 1; 12 return 1;
+ 12 } 13 }
+ 13 } 14 }
+ 14 write_unlock(&listmutex); 15 spin_unlock(&listmutex);
+ 15 return 0; 16 return 0;
+ 16 } 17 }
+
+Either way, the differences are quite small. Read-side locking moves
+to rcu_read_lock() and rcu_read_unlock(), update-side locking moves from
+a reader-writer lock to a simple spinlock, and a synchronize_rcu()
+precedes the kfree().
+
+However, there is one potential catch: the read-side and update-side
+critical sections can now run concurrently. In many cases, this will
+not be a problem, but it is necessary to check carefully regardless.
+For example, if multiple independent list updates must be seen as
+a single atomic update, converting to RCU will require special care.
+
+Also, the presence of synchronize_rcu() means that the RCU version of
+delete() can now block. If this is a problem, there is a callback-based
+mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can
+be used in place of synchronize_rcu().
+
+.. _7_whatisRCU:
+
+7. FULL LIST OF RCU APIs
+-------------------------
+
+The RCU APIs are documented in docbook-format header comments in the
+Linux-kernel source code, but it helps to have a full list of the
+APIs, since there does not appear to be a way to categorize them
+in docbook. Here is the list, by category.
+
+RCU list traversal::
+
+ list_entry_rcu
+ list_entry_lockless
+ list_first_entry_rcu
+ list_next_rcu
+ list_for_each_entry_rcu
+ list_for_each_entry_continue_rcu
+ list_for_each_entry_from_rcu
+ list_first_or_null_rcu
+ list_next_or_null_rcu
+ hlist_first_rcu
+ hlist_next_rcu
+ hlist_pprev_rcu
+ hlist_for_each_entry_rcu
+ hlist_for_each_entry_rcu_bh
+ hlist_for_each_entry_from_rcu
+ hlist_for_each_entry_continue_rcu
+ hlist_for_each_entry_continue_rcu_bh
+ hlist_nulls_first_rcu
+ hlist_nulls_for_each_entry_rcu
+ hlist_bl_first_rcu
+ hlist_bl_for_each_entry_rcu
+
+RCU pointer/list update::
+
+ rcu_assign_pointer
+ list_add_rcu
+ list_add_tail_rcu
+ list_del_rcu
+ list_replace_rcu
+ hlist_add_behind_rcu
+ hlist_add_before_rcu
+ hlist_add_head_rcu
+ hlist_add_tail_rcu
+ hlist_del_rcu
+ hlist_del_init_rcu
+ hlist_replace_rcu
+ list_splice_init_rcu
+ list_splice_tail_init_rcu
+ hlist_nulls_del_init_rcu
+ hlist_nulls_del_rcu
+ hlist_nulls_add_head_rcu
+ hlist_bl_add_head_rcu
+ hlist_bl_del_init_rcu
+ hlist_bl_del_rcu
+ hlist_bl_set_first_rcu
+
+RCU::
+
+ Critical sections Grace period Barrier
+
+ rcu_read_lock synchronize_net rcu_barrier
+ rcu_read_unlock synchronize_rcu
+ rcu_dereference synchronize_rcu_expedited
+ rcu_read_lock_held call_rcu
+ rcu_dereference_check kfree_rcu
+ rcu_dereference_protected
+
+bh::
+
+ Critical sections Grace period Barrier
+
+ rcu_read_lock_bh call_rcu rcu_barrier
+ rcu_read_unlock_bh synchronize_rcu
+ [local_bh_disable] synchronize_rcu_expedited
+ [and friends]
+ rcu_dereference_bh
+ rcu_dereference_bh_check
+ rcu_dereference_bh_protected
+ rcu_read_lock_bh_held
+
+sched::
+
+ Critical sections Grace period Barrier
+
+ rcu_read_lock_sched call_rcu rcu_barrier
+ rcu_read_unlock_sched synchronize_rcu
+ [preempt_disable] synchronize_rcu_expedited
+ [and friends]
+ rcu_read_lock_sched_notrace
+ rcu_read_unlock_sched_notrace
+ rcu_dereference_sched
+ rcu_dereference_sched_check
+ rcu_dereference_sched_protected
+ rcu_read_lock_sched_held
+
+
+SRCU::
+
+ Critical sections Grace period Barrier
+
+ srcu_read_lock call_srcu srcu_barrier
+ srcu_read_unlock synchronize_srcu
+ srcu_dereference synchronize_srcu_expedited
+ srcu_dereference_check
+ srcu_read_lock_held
+
+SRCU: Initialization/cleanup::
+
+ DEFINE_SRCU
+ DEFINE_STATIC_SRCU
+ init_srcu_struct
+ cleanup_srcu_struct
+
+All: lockdep-checked RCU-protected pointer access::
+
+ rcu_access_pointer
+ rcu_dereference_raw
+ RCU_LOCKDEP_WARN
+ rcu_sleep_check
+ RCU_NONIDLE
+
+See the comment headers in the source code (or the docbook generated
+from them) for more information.
+
+However, given that there are no fewer than four families of RCU APIs
+in the Linux kernel, how do you choose which one to use? The following
+list can be helpful:
+
+a. Will readers need to block? If so, you need SRCU.
+
+b. What about the -rt patchset? If readers would need to block
+ in a non-rt kernel, you need SRCU. If readers would block
+ in a -rt kernel, but not in a non-rt kernel, SRCU is not
+ necessary. (The -rt patchset turns spinlocks into sleeplocks,
+ hence this distinction.)
+
+c. Do you need to treat NMI handlers, hardirq handlers,
+ and code segments with preemption disabled (whether
+ via preempt_disable(), local_irq_save(), local_bh_disable(),
+ or some other mechanism) as if they were explicit RCU readers?
+ If so, RCU-sched is the only choice that will work for you.
+
+d. Do you need RCU grace periods to complete even in the face
+ of softirq monopolization of one or more of the CPUs? For
+ example, is your code subject to network-based denial-of-service
+ attacks? If so, you should disable softirq across your readers,
+ for example, by using rcu_read_lock_bh().
+
+e. Is your workload too update-intensive for normal use of
+ RCU, but inappropriate for other synchronization mechanisms?
+ If so, consider SLAB_TYPESAFE_BY_RCU (which was originally
+ named SLAB_DESTROY_BY_RCU). But please be careful!
+
+f. Do you need read-side critical sections that are respected
+ even though they are in the middle of the idle loop, during
+ user-mode execution, or on an offlined CPU? If so, SRCU is the
+ only choice that will work for you.
+
+g. Otherwise, use RCU.
+
+Of course, this all assumes that you have determined that RCU is in fact
+the right tool for your job.
+
+.. _8_whatisRCU:
+
+8. ANSWERS TO QUICK QUIZZES
+----------------------------
+
+Quick Quiz #1:
+ Why is this argument naive? How could a deadlock
+ occur when using this algorithm in a real-world Linux
+ kernel? [Referring to the lock-based "toy" RCU
+ algorithm.]
+
+Answer:
+ Consider the following sequence of events:
+
+ 1. CPU 0 acquires some unrelated lock, call it
+ "problematic_lock", disabling irq via
+ spin_lock_irqsave().
+
+ 2. CPU 1 enters synchronize_rcu(), write-acquiring
+ rcu_gp_mutex.
+
+ 3. CPU 0 enters rcu_read_lock(), but must wait
+ because CPU 1 holds rcu_gp_mutex.
+
+ 4. CPU 1 is interrupted, and the irq handler
+ attempts to acquire problematic_lock.
+
+ The system is now deadlocked.
+
+ One way to avoid this deadlock is to use an approach like
+ that of CONFIG_PREEMPT_RT, where all normal spinlocks
+ become blocking locks, and all irq handlers execute in
+ the context of special tasks. In this case, in step 4
+ above, the irq handler would block, allowing CPU 1 to
+ release rcu_gp_mutex, avoiding the deadlock.
+
+ Even in the absence of deadlock, this RCU implementation
+ allows latency to "bleed" from readers to other
+ readers through synchronize_rcu(). To see this,
+ consider task A in an RCU read-side critical section
+ (thus read-holding rcu_gp_mutex), task B blocked
+ attempting to write-acquire rcu_gp_mutex, and
+ task C blocked in rcu_read_lock() attempting to
+ read-acquire rcu_gp_mutex. Task A's RCU read-side
+ latency is holding up task C, albeit indirectly via
+ task B.
+
+ Realtime RCU implementations therefore use a counter-based
+ approach where tasks in RCU read-side critical sections
+ cannot be blocked by tasks executing synchronize_rcu().
+
+:ref:`Back to Quick Quiz #1 <quiz_1>`
+
+Quick Quiz #2:
+ Give an example where Classic RCU's read-side
+ overhead is **negative**.
+
+Answer:
+ Imagine a single-CPU system with a non-CONFIG_PREEMPT
+ kernel where a routing table is used by process-context
+ code, but can be updated by irq-context code (for example,
+ by an "ICMP REDIRECT" packet). The usual way of handling
+ this would be to have the process-context code disable
+ interrupts while searching the routing table. Use of
+ RCU allows such interrupt-disabling to be dispensed with.
+ Thus, without RCU, you pay the cost of disabling interrupts,
+ and with RCU you don't.
+
+ One can argue that the overhead of RCU in this
+ case is negative with respect to the single-CPU
+ interrupt-disabling approach. Others might argue that
+ the overhead of RCU is merely zero, and that replacing
+ the positive overhead of the interrupt-disabling scheme
+ with the zero-overhead RCU scheme does not constitute
+ negative overhead.
+
+ In real life, of course, things are more complex. But
+ even the theoretical possibility of negative overhead for
+ a synchronization primitive is a bit unexpected. ;-)
+
+:ref:`Back to Quick Quiz #2 <quiz_2>`
+
+Quick Quiz #3:
+ If it is illegal to block in an RCU read-side
+ critical section, what the heck do you do in
+ PREEMPT_RT, where normal spinlocks can block???
+
+Answer:
+ Just as PREEMPT_RT permits preemption of spinlock
+ critical sections, it permits preemption of RCU
+ read-side critical sections. It also permits
+ spinlocks blocking while in RCU read-side critical
+ sections.
+
+ Why the apparent inconsistency? Because it is
+ possible to use priority boosting to keep the RCU
+ grace periods short if need be (for example, if running
+ short of memory). In contrast, if blocking waiting
+ for (say) network reception, there is no way to know
+ what should be boosted. Especially given that the
+ process we need to boost might well be a human being
+ who just went out for a pizza or something. And although
+ a computer-operated cattle prod might arouse serious
+ interest, it might also provoke serious objections.
+ Besides, how does the computer know what pizza parlor
+ the human being went to???
+
+:ref:`Back to Quick Quiz #3 <quiz_3>`
+
+ACKNOWLEDGEMENTS
+
+My thanks to the people who helped make this human-readable, including
+Jon Walpole, Josh Triplett, Serge Hallyn, Suzanne Wood, and Alan Stern.
+
+
+For more information, see http://www.rdrop.com/users/paulmck/RCU.
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
deleted file mode 100644
index 58ba05c..0000000
--- a/Documentation/RCU/whatisRCU.txt
+++ /dev/null
@@ -1,1079 +0,0 @@
-What is RCU? -- "Read, Copy, Update"
-
-Please note that the "What is RCU?" LWN series is an excellent place
-to start learning about RCU:
-
-1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
-2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
-3. RCU part 3: the RCU API http://lwn.net/Articles/264090/
-4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
- 2010 Big API Table http://lwn.net/Articles/419086/
-5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/
- 2014 Big API Table http://lwn.net/Articles/609973/
-
-
-What is RCU?
-
-RCU is a synchronization mechanism that was added to the Linux kernel
-during the 2.5 development effort that is optimized for read-mostly
-situations. Although RCU is actually quite simple once you understand it,
-getting there can sometimes be a challenge. Part of the problem is that
-most of the past descriptions of RCU have been written with the mistaken
-assumption that there is "one true way" to describe RCU. Instead,
-the experience has been that different people must take different paths
-to arrive at an understanding of RCU. This document provides several
-different paths, as follows:
-
-1. RCU OVERVIEW
-2. WHAT IS RCU'S CORE API?
-3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
-4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
-5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
-6. ANALOGY WITH READER-WRITER LOCKING
-7. FULL LIST OF RCU APIs
-8. ANSWERS TO QUICK QUIZZES
-
-People who prefer starting with a conceptual overview should focus on
-Section 1, though most readers will profit by reading this section at
-some point. People who prefer to start with an API that they can then
-experiment with should focus on Section 2. People who prefer to start
-with example uses should focus on Sections 3 and 4. People who need to
-understand the RCU implementation should focus on Section 5, then dive
-into the kernel source code. People who reason best by analogy should
-focus on Section 6. Section 7 serves as an index to the docbook API
-documentation, and Section 8 is the traditional answer key.
-
-So, start with the section that makes the most sense to you and your
-preferred method of learning. If you need to know everything about
-everything, feel free to read the whole thing -- but if you are really
-that type of person, you have perused the source code and will therefore
-never need this document anyway. ;-)
-
-
-1. RCU OVERVIEW
-
-The basic idea behind RCU is to split updates into "removal" and
-"reclamation" phases. The removal phase removes references to data items
-within a data structure (possibly by replacing them with references to
-new versions of these data items), and can run concurrently with readers.
-The reason that it is safe to run the removal phase concurrently with
-readers is that the semantics of modern CPUs guarantee that readers will see
-either the old or the new version of the data structure rather than a
-partially updated reference. The reclamation phase does the work of reclaiming
-(e.g., freeing) the data items removed from the data structure during the
-removal phase. Because reclaiming data items can disrupt any readers
-concurrently referencing those data items, the reclamation phase must
-not start until readers no longer hold references to those data items.
-
-Splitting the update into removal and reclamation phases permits the
-updater to perform the removal phase immediately, and to defer the
-reclamation phase until all readers active during the removal phase have
-completed, either by blocking until they finish or by registering a
-callback that is invoked after they finish. Only readers that are active
-during the removal phase need be considered, because any reader starting
-after the removal phase will be unable to gain a reference to the removed
-data items, and therefore cannot be disrupted by the reclamation phase.
-
-So the typical RCU update sequence goes something like the following:
-
-a. Remove pointers to a data structure, so that subsequent
- readers cannot gain a reference to it.
-
-b. Wait for all previous readers to complete their RCU read-side
- critical sections.
-
-c. At this point, there cannot be any readers who hold references
- to the data structure, so it now may safely be reclaimed
- (e.g., kfree()d).
-
-Step (b) above is the key idea underlying RCU's deferred destruction.
-The ability to wait until all readers are done allows RCU readers to
-use much lighter-weight synchronization, in some cases, absolutely no
-synchronization at all. In contrast, in more conventional lock-based
-schemes, readers must use heavy-weight synchronization in order to
-prevent an updater from deleting the data structure out from under them.
-This is because lock-based updaters typically update data items in place,
-and must therefore exclude readers. In contrast, RCU-based updaters
-typically take advantage of the fact that writes to single aligned
-pointers are atomic on modern CPUs, allowing atomic insertion, removal,
-and replacement of data items in a linked structure without disrupting
-readers. Concurrent RCU readers can then continue accessing the old
-versions, and can dispense with the atomic operations, memory barriers,
-and communications cache misses that are so expensive on present-day
-SMP computer systems, even in the absence of lock contention.
-
-In the three-step procedure shown above, the updater is performing both
-the removal and the reclamation step, but it is often helpful for an
-entirely different thread to do the reclamation, as is in fact the case
-in the Linux kernel's directory-entry cache (dcache). Even if the same
-thread performs both the update step (step (a) above) and the reclamation
-step (step (c) above), it is often helpful to think of them separately.
-For example, RCU readers and updaters need not communicate at all,
-but RCU provides implicit low-overhead communication between readers
-and reclaimers, namely, in step (b) above.
-
-So how the heck can a reclaimer tell when a reader is done, given
-that readers are not doing any sort of synchronization operations???
-Read on to learn about how RCU's API makes this easy.
-
-
-2. WHAT IS RCU'S CORE API?
-
-The core RCU API is quite small:
-
-a. rcu_read_lock()
-b. rcu_read_unlock()
-c. synchronize_rcu() / call_rcu()
-d. rcu_assign_pointer()
-e. rcu_dereference()
-
-There are many other members of the RCU API, but the rest can be
-expressed in terms of these five, though most implementations instead
-express synchronize_rcu() in terms of the call_rcu() callback API.
-
-The five core RCU APIs are described below; the other 18 will be enumerated
-later. See the kernel docbook documentation for more info, or look directly
-at the function header comments.
-
-rcu_read_lock()
-
- void rcu_read_lock(void);
-
- Used by a reader to inform the reclaimer that the reader is
- entering an RCU read-side critical section. It is illegal
- to block while in an RCU read-side critical section, though
- kernels built with CONFIG_PREEMPT_RCU can preempt RCU
- read-side critical sections. Any RCU-protected data structure
- accessed during an RCU read-side critical section is guaranteed to
- remain unreclaimed for the full duration of that critical section.
- Reference counts may be used in conjunction with RCU to maintain
- longer-term references to data structures.
-
-rcu_read_unlock()
-
- void rcu_read_unlock(void);
-
- Used by a reader to inform the reclaimer that the reader is
- exiting an RCU read-side critical section. Note that RCU
- read-side critical sections may be nested and/or overlapping.
-
-synchronize_rcu()
-
- void synchronize_rcu(void);
-
- Marks the end of updater code and the beginning of reclaimer
- code. It does this by blocking until all pre-existing RCU
- read-side critical sections on all CPUs have completed.
- Note that synchronize_rcu() will -not- necessarily wait for
- any subsequent RCU read-side critical sections to complete.
- For example, consider the following sequence of events:
-
- CPU 0 CPU 1 CPU 2
- ----------------- ------------------------- ---------------
- 1. rcu_read_lock()
- 2. enters synchronize_rcu()
- 3. rcu_read_lock()
- 4. rcu_read_unlock()
- 5. exits synchronize_rcu()
- 6. rcu_read_unlock()
-
- To reiterate, synchronize_rcu() waits only for ongoing RCU
- read-side critical sections to complete, not necessarily for
- any that begin after synchronize_rcu() is invoked.
-
- Of course, synchronize_rcu() does not necessarily return
- -immediately- after the last pre-existing RCU read-side critical
- section completes. For one thing, there might well be scheduling
- delays. For another thing, many RCU implementations process
- requests in batches in order to improve efficiencies, which can
- further delay synchronize_rcu().
-
- Since synchronize_rcu() is the API that must figure out when
- readers are done, its implementation is key to RCU. For RCU
- to be useful in all but the most read-intensive situations,
- synchronize_rcu()'s overhead must also be quite small.
-
- The call_rcu() API is a callback form of synchronize_rcu(),
- and is described in more detail in a later section. Instead of
- blocking, it registers a function and argument which are invoked
- after all ongoing RCU read-side critical sections have completed.
- This callback variant is particularly useful in situations where
- it is illegal to block or where update-side performance is
- critically important.
-
- However, the call_rcu() API should not be used lightly, as use
- of the synchronize_rcu() API generally results in simpler code.
- In addition, the synchronize_rcu() API has the nice property
- of automatically limiting update rate should grace periods
- be delayed. This property results in system resilience in face
- of denial-of-service attacks. Code using call_rcu() should limit
- update rate in order to gain this same sort of resilience. See
- checklist.txt for some approaches to limiting the update rate.
-
-rcu_assign_pointer()
-
- void rcu_assign_pointer(p, typeof(p) v);
-
- Yes, rcu_assign_pointer() -is- implemented as a macro, though it
- would be cool to be able to declare a function in this manner.
- (Compiler experts will no doubt disagree.)
-
- The updater uses this function to assign a new value to an
- RCU-protected pointer, in order to safely communicate the change
- in value from the updater to the reader. This macro does not
- evaluate to an rvalue, but it does execute any memory-barrier
- instructions required for a given CPU architecture.
-
- Perhaps just as important, it serves to document (1) which
- pointers are protected by RCU and (2) the point at which a
- given structure becomes accessible to other CPUs. That said,
- rcu_assign_pointer() is most frequently used indirectly, via
- the _rcu list-manipulation primitives such as list_add_rcu().
-
-rcu_dereference()
-
- typeof(p) rcu_dereference(p);
-
- Like rcu_assign_pointer(), rcu_dereference() must be implemented
- as a macro.
-
- The reader uses rcu_dereference() to fetch an RCU-protected
- pointer, which returns a value that may then be safely
- dereferenced. Note that rcu_dereference() does not actually
- dereference the pointer, instead, it protects the pointer for
- later dereferencing. It also executes any needed memory-barrier
- instructions for a given CPU architecture. Currently, only Alpha
- needs memory barriers within rcu_dereference() -- on other CPUs,
- it compiles to nothing, not even a compiler directive.
-
- Common coding practice uses rcu_dereference() to copy an
- RCU-protected pointer to a local variable, then dereferences
- this local variable, for example as follows:
-
- p = rcu_dereference(head.next);
- return p->data;
-
- However, in this case, one could just as easily combine these
- into one statement:
-
- return rcu_dereference(head.next)->data;
-
- If you are going to be fetching multiple fields from the
- RCU-protected structure, using the local variable is of
- course preferred. Repeated rcu_dereference() calls look
- ugly, do not guarantee that the same pointer will be returned
- if an update happened while in the critical section, and incur
- unnecessary overhead on Alpha CPUs.
-
- Note that the value returned by rcu_dereference() is valid
- only within the enclosing RCU read-side critical section [1].
- For example, the following is -not- legal:
-
- rcu_read_lock();
- p = rcu_dereference(head.next);
- rcu_read_unlock();
- x = p->address; /* BUG!!! */
- rcu_read_lock();
- y = p->data; /* BUG!!! */
- rcu_read_unlock();
-
- Holding a reference from one RCU read-side critical section
- to another is just as illegal as holding a reference from
- one lock-based critical section to another! Similarly,
- using a reference outside of the critical section in which
- it was acquired is just as illegal as doing so with normal
- locking.
-
- As with rcu_assign_pointer(), an important function of
- rcu_dereference() is to document which pointers are protected by
- RCU, in particular, flagging a pointer that is subject to changing
- at any time, including immediately after the rcu_dereference().
- And, again like rcu_assign_pointer(), rcu_dereference() is
- typically used indirectly, via the _rcu list-manipulation
- primitives, such as list_for_each_entry_rcu() [2].
-
- [1] The variant rcu_dereference_protected() can be used outside
- of an RCU read-side critical section as long as the usage is
- protected by locks acquired by the update-side code. This variant
- avoids the lockdep warning that would happen when using (for
- example) rcu_dereference() without rcu_read_lock() protection.
- Using rcu_dereference_protected() also has the advantage
- of permitting compiler optimizations that rcu_dereference()
- must prohibit. The rcu_dereference_protected() variant takes
- a lockdep expression to indicate which locks must be acquired
- by the caller. If the indicated protection is not provided,
- a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst
- and the API's code comments for more details and example usage.
-
- [2] If the list_for_each_entry_rcu() instance might be used by
- update-side code as well as by RCU readers, then an additional
- lockdep expression can be added to its list of arguments.
- For example, given an additional "lock_is_held(&mylock)" argument,
- the RCU lockdep code would complain only if this instance was
- invoked outside of an RCU read-side critical section and without
- the protection of mylock.
-
-The following diagram shows how each API communicates among the
-reader, updater, and reclaimer.
-
-
- rcu_assign_pointer()
- +--------+
- +---------------------->| reader |---------+
- | +--------+ |
- | | |
- | | | Protect:
- | | | rcu_read_lock()
- | | | rcu_read_unlock()
- | rcu_dereference() | |
- +---------+ | |
- | updater |<----------------+ |
- +---------+ V
- | +-----------+
- +----------------------------------->| reclaimer |
- +-----------+
- Defer:
- synchronize_rcu() & call_rcu()
-
-
-The RCU infrastructure observes the time sequence of rcu_read_lock(),
-rcu_read_unlock(), synchronize_rcu(), and call_rcu() invocations in
-order to determine when (1) synchronize_rcu() invocations may return
-to their callers and (2) call_rcu() callbacks may be invoked. Efficient
-implementations of the RCU infrastructure make heavy use of batching in
-order to amortize their overhead over many uses of the corresponding APIs.
-
-There are at least three flavors of RCU usage in the Linux kernel. The diagram
-above shows the most common one. On the updater side, the rcu_assign_pointer(),
-synchronize_rcu() and call_rcu() primitives used are the same for all three
-flavors. However for protection (on the reader side), the primitives used vary
-depending on the flavor:
-
-a. rcu_read_lock() / rcu_read_unlock()
- rcu_dereference()
-
-b. rcu_read_lock_bh() / rcu_read_unlock_bh()
- local_bh_disable() / local_bh_enable()
- rcu_dereference_bh()
-
-c. rcu_read_lock_sched() / rcu_read_unlock_sched()
- preempt_disable() / preempt_enable()
- local_irq_save() / local_irq_restore()
- hardirq enter / hardirq exit
- NMI enter / NMI exit
- rcu_dereference_sched()
-
-These three flavors are used as follows:
-
-a. RCU applied to normal data structures.
-
-b. RCU applied to networking data structures that may be subjected
- to remote denial-of-service attacks.
-
-c. RCU applied to scheduler and interrupt/NMI-handler tasks.
-
-Again, most uses will be of (a). The (b) and (c) cases are important
-for specialized uses, but are relatively uncommon.
-
-
-3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API?
-
-This section shows a simple use of the core RCU API to protect a
-global pointer to a dynamically allocated structure. More-typical
-uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
-
- struct foo {
- int a;
- char b;
- long c;
- };
- DEFINE_SPINLOCK(foo_mutex);
-
- struct foo __rcu *gbl_foo;
-
- /*
- * Create a new struct foo that is the same as the one currently
- * pointed to by gbl_foo, except that field "a" is replaced
- * with "new_a". Points gbl_foo to the new structure, and
- * frees up the old structure after a grace period.
- *
- * Uses rcu_assign_pointer() to ensure that concurrent readers
- * see the initialized version of the new structure.
- *
- * Uses synchronize_rcu() to ensure that any readers that might
- * have references to the old structure complete before freeing
- * the old structure.
- */
- void foo_update_a(int new_a)
- {
- struct foo *new_fp;
- struct foo *old_fp;
-
- new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
- spin_lock(&foo_mutex);
- old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex));
- *new_fp = *old_fp;
- new_fp->a = new_a;
- rcu_assign_pointer(gbl_foo, new_fp);
- spin_unlock(&foo_mutex);
- synchronize_rcu();
- kfree(old_fp);
- }
-
- /*
- * Return the value of field "a" of the current gbl_foo
- * structure. Use rcu_read_lock() and rcu_read_unlock()
- * to ensure that the structure does not get deleted out
- * from under us, and use rcu_dereference() to ensure that
- * we see the initialized version of the structure (important
- * for DEC Alpha and for people reading the code).
- */
- int foo_get_a(void)
- {
- int retval;
-
- rcu_read_lock();
- retval = rcu_dereference(gbl_foo)->a;
- rcu_read_unlock();
- return retval;
- }
-
-So, to sum up:
-
-o Use rcu_read_lock() and rcu_read_unlock() to guard RCU
- read-side critical sections.
-
-o Within an RCU read-side critical section, use rcu_dereference()
- to dereference RCU-protected pointers.
-
-o Use some solid scheme (such as locks or semaphores) to
- keep concurrent updates from interfering with each other.
-
-o Use rcu_assign_pointer() to update an RCU-protected pointer.
- This primitive protects concurrent readers from the updater,
- -not- concurrent updates from each other! You therefore still
- need to use locking (or something similar) to keep concurrent
- rcu_assign_pointer() primitives from interfering with each other.
-
-o Use synchronize_rcu() -after- removing a data element from an
- RCU-protected data structure, but -before- reclaiming/freeing
- the data element, in order to wait for the completion of all
- RCU read-side critical sections that might be referencing that
- data item.
-
-See checklist.txt for additional rules to follow when using RCU.
-And again, more-typical uses of RCU may be found in listRCU.txt,
-arrayRCU.txt, and NMI-RCU.txt.
-
-
-4. WHAT IF MY UPDATING THREAD CANNOT BLOCK?
-
-In the example above, foo_update_a() blocks until a grace period elapses.
-This is quite simple, but in some cases one cannot afford to wait so
-long -- there might be other high-priority work to be done.
-
-In such cases, one uses call_rcu() rather than synchronize_rcu().
-The call_rcu() API is as follows:
-
- void call_rcu(struct rcu_head * head,
- void (*func)(struct rcu_head *head));
-
-This function invokes func(head) after a grace period has elapsed.
-This invocation might happen from either softirq or process context,
-so the function is not permitted to block. The foo struct needs to
-have an rcu_head structure added, perhaps as follows:
-
- struct foo {
- int a;
- char b;
- long c;
- struct rcu_head rcu;
- };
-
-The foo_update_a() function might then be written as follows:
-
- /*
- * Create a new struct foo that is the same as the one currently
- * pointed to by gbl_foo, except that field "a" is replaced
- * with "new_a". Points gbl_foo to the new structure, and
- * frees up the old structure after a grace period.
- *
- * Uses rcu_assign_pointer() to ensure that concurrent readers
- * see the initialized version of the new structure.
- *
- * Uses call_rcu() to ensure that any readers that might have
- * references to the old structure complete before freeing the
- * old structure.
- */
- void foo_update_a(int new_a)
- {
- struct foo *new_fp;
- struct foo *old_fp;
-
- new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
- spin_lock(&foo_mutex);
- old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex));
- *new_fp = *old_fp;
- new_fp->a = new_a;
- rcu_assign_pointer(gbl_foo, new_fp);
- spin_unlock(&foo_mutex);
- call_rcu(&old_fp->rcu, foo_reclaim);
- }
-
-The foo_reclaim() function might appear as follows:
-
- void foo_reclaim(struct rcu_head *rp)
- {
- struct foo *fp = container_of(rp, struct foo, rcu);
-
- foo_cleanup(fp->a);
-
- kfree(fp);
- }
-
-The container_of() primitive is a macro that, given a pointer into a
-struct, the type of the struct, and the pointed-to field within the
-struct, returns a pointer to the beginning of the struct.
-
-The use of call_rcu() permits the caller of foo_update_a() to
-immediately regain control, without needing to worry further about the
-old version of the newly updated element. It also clearly shows the
-RCU distinction between updater, namely foo_update_a(), and reclaimer,
-namely foo_reclaim().
-
-The summary of advice is the same as for the previous section, except
-that we are now using call_rcu() rather than synchronize_rcu():
-
-o Use call_rcu() -after- removing a data element from an
- RCU-protected data structure in order to register a callback
- function that will be invoked after the completion of all RCU
- read-side critical sections that might be referencing that
- data item.
-
-If the callback for call_rcu() is not doing anything more than calling
-kfree() on the structure, you can use kfree_rcu() instead of call_rcu()
-to avoid having to write your own callback:
-
- kfree_rcu(old_fp, rcu);
-
-Again, see checklist.txt for additional rules governing the use of RCU.
-
-
-5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU?
-
-One of the nice things about RCU is that it has extremely simple "toy"
-implementations that are a good first step towards understanding the
-production-quality implementations in the Linux kernel. This section
-presents two such "toy" implementations of RCU, one that is implemented
-in terms of familiar locking primitives, and another that more closely
-resembles "classic" RCU. Both are way too simple for real-world use,
-lacking both functionality and performance. However, they are useful
-in getting a feel for how RCU works. See kernel/rcu/update.c for a
-production-quality implementation, and see:
-
- http://www.rdrop.com/users/paulmck/RCU
-
-for papers describing the Linux kernel RCU implementation. The OLS'01
-and OLS'02 papers are a good introduction, and the dissertation provides
-more details on the current implementation as of early 2004.
-
-
-5A. "TOY" IMPLEMENTATION #1: LOCKING
-
-This section presents a "toy" RCU implementation that is based on
-familiar locking primitives. Its overhead makes it a non-starter for
-real-life use, as does its lack of scalability. It is also unsuitable
-for realtime use, since it allows scheduling latency to "bleed" from
-one read-side critical section to another. It also assumes recursive
-reader-writer locks: If you try this with non-recursive locks, and
-you allow nested rcu_read_lock() calls, you can deadlock.
-
-However, it is probably the easiest implementation to relate to, so is
-a good starting point.
-
-It is extremely simple:
-
- static DEFINE_RWLOCK(rcu_gp_mutex);
-
- void rcu_read_lock(void)
- {
- read_lock(&rcu_gp_mutex);
- }
-
- void rcu_read_unlock(void)
- {
- read_unlock(&rcu_gp_mutex);
- }
-
- void synchronize_rcu(void)
- {
- write_lock(&rcu_gp_mutex);
- smp_mb__after_spinlock();
- write_unlock(&rcu_gp_mutex);
- }
-
-[You can ignore rcu_assign_pointer() and rcu_dereference() without missing
-much. But here are simplified versions anyway. And whatever you do,
-don't forget about them when submitting patches making use of RCU!]
-
- #define rcu_assign_pointer(p, v) \
- ({ \
- smp_store_release(&(p), (v)); \
- })
-
- #define rcu_dereference(p) \
- ({ \
- typeof(p) _________p1 = READ_ONCE(p); \
- (_________p1); \
- })
-
-
-The rcu_read_lock() and rcu_read_unlock() primitives read-acquire
-and release a global reader-writer lock. The synchronize_rcu()
-primitive write-acquires this same lock, then releases it. This means
-that once synchronize_rcu() exits, all RCU read-side critical sections
-that were in progress before synchronize_rcu() was called are guaranteed
-to have completed -- there is no way that synchronize_rcu() would have
-been able to write-acquire the lock otherwise. The smp_mb__after_spinlock()
-promotes synchronize_rcu() to a full memory barrier in compliance with
-the "Memory-Barrier Guarantees" listed in:
-
- Documentation/RCU/Design/Requirements/Requirements.rst
-
-It is possible to nest rcu_read_lock(), since reader-writer locks may
-be recursively acquired. Note also that rcu_read_lock() is immune
-from deadlock (an important property of RCU). The reason for this is
-that the only thing that can block rcu_read_lock() is a synchronize_rcu().
-But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex,
-so there can be no deadlock cycle.
-
-Quick Quiz #1: Why is this argument naive? How could a deadlock
- occur when using this algorithm in a real-world Linux
- kernel? How could this deadlock be avoided?
-
-
-5B. "TOY" EXAMPLE #2: CLASSIC RCU
-
-This section presents a "toy" RCU implementation that is based on
-"classic RCU". It is also short on performance (but only for updates) and
-on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT
-kernels. The definitions of rcu_dereference() and rcu_assign_pointer()
-are the same as those shown in the preceding section, so they are omitted.
-
- void rcu_read_lock(void) { }
-
- void rcu_read_unlock(void) { }
-
- void synchronize_rcu(void)
- {
- int cpu;
-
- for_each_possible_cpu(cpu)
- run_on(cpu);
- }
-
-Note that rcu_read_lock() and rcu_read_unlock() do absolutely nothing.
-This is the great strength of classic RCU in a non-preemptive kernel:
-read-side overhead is precisely zero, at least on non-Alpha CPUs.
-And there is absolutely no way that rcu_read_lock() can possibly
-participate in a deadlock cycle!
-
-The implementation of synchronize_rcu() simply schedules itself on each
-CPU in turn. The run_on() primitive can be implemented straightforwardly
-in terms of the sched_setaffinity() primitive. Of course, a somewhat less
-"toy" implementation would restore the affinity upon completion rather
-than just leaving all tasks running on the last CPU, but when I said
-"toy", I meant -toy-!
-
-So how the heck is this supposed to work???
-
-Remember that it is illegal to block while in an RCU read-side critical
-section. Therefore, if a given CPU executes a context switch, we know
-that it must have completed all preceding RCU read-side critical sections.
-Once -all- CPUs have executed a context switch, then -all- preceding
-RCU read-side critical sections will have completed.
-
-So, suppose that we remove a data item from its structure and then invoke
-synchronize_rcu(). Once synchronize_rcu() returns, we are guaranteed
-that there are no RCU read-side critical sections holding a reference
-to that data item, so we can safely reclaim it.
-
-Quick Quiz #2: Give an example where Classic RCU's read-side
- overhead is -negative-.
-
-Quick Quiz #3: If it is illegal to block in an RCU read-side
- critical section, what the heck do you do in
- PREEMPT_RT, where normal spinlocks can block???
-
-
-6. ANALOGY WITH READER-WRITER LOCKING
-
-Although RCU can be used in many different ways, a very common use of
-RCU is analogous to reader-writer locking. The following unified
-diff shows how closely related RCU and reader-writer locking can be.
-
- @@ -5,5 +5,5 @@ struct el {
- int data;
- /* Other data fields */
- };
- -rwlock_t listmutex;
- +spinlock_t listmutex;
- struct el head;
-
- @@ -13,15 +14,15 @@
- struct list_head *lp;
- struct el *p;
-
- - read_lock(&listmutex);
- - list_for_each_entry(p, head, lp) {
- + rcu_read_lock();
- + list_for_each_entry_rcu(p, head, lp) {
- if (p->key == key) {
- *result = p->data;
- - read_unlock(&listmutex);
- + rcu_read_unlock();
- return 1;
- }
- }
- - read_unlock(&listmutex);
- + rcu_read_unlock();
- return 0;
- }
-
- @@ -29,15 +30,16 @@
- {
- struct el *p;
-
- - write_lock(&listmutex);
- + spin_lock(&listmutex);
- list_for_each_entry(p, head, lp) {
- if (p->key == key) {
- - list_del(&p->list);
- - write_unlock(&listmutex);
- + list_del_rcu(&p->list);
- + spin_unlock(&listmutex);
- + synchronize_rcu();
- kfree(p);
- return 1;
- }
- }
- - write_unlock(&listmutex);
- + spin_unlock(&listmutex);
- return 0;
- }
-
-Or, for those who prefer a side-by-side listing:
-
- 1 struct el { 1 struct el {
- 2 struct list_head list; 2 struct list_head list;
- 3 long key; 3 long key;
- 4 spinlock_t mutex; 4 spinlock_t mutex;
- 5 int data; 5 int data;
- 6 /* Other data fields */ 6 /* Other data fields */
- 7 }; 7 };
- 8 rwlock_t listmutex; 8 spinlock_t listmutex;
- 9 struct el head; 9 struct el head;
-
- 1 int search(long key, int *result) 1 int search(long key, int *result)
- 2 { 2 {
- 3 struct list_head *lp; 3 struct list_head *lp;
- 4 struct el *p; 4 struct el *p;
- 5 5
- 6 read_lock(&listmutex); 6 rcu_read_lock();
- 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) {
- 8 if (p->key == key) { 8 if (p->key == key) {
- 9 *result = p->data; 9 *result = p->data;
-10 read_unlock(&listmutex); 10 rcu_read_unlock();
-11 return 1; 11 return 1;
-12 } 12 }
-13 } 13 }
-14 read_unlock(&listmutex); 14 rcu_read_unlock();
-15 return 0; 15 return 0;
-16 } 16 }
-
- 1 int delete(long key) 1 int delete(long key)
- 2 { 2 {
- 3 struct el *p; 3 struct el *p;
- 4 4
- 5 write_lock(&listmutex); 5 spin_lock(&listmutex);
- 6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) {
- 7 if (p->key == key) { 7 if (p->key == key) {
- 8 list_del(&p->list); 8 list_del_rcu(&p->list);
- 9 write_unlock(&listmutex); 9 spin_unlock(&listmutex);
- 10 synchronize_rcu();
-10 kfree(p); 11 kfree(p);
-11 return 1; 12 return 1;
-12 } 13 }
-13 } 14 }
-14 write_unlock(&listmutex); 15 spin_unlock(&listmutex);
-15 return 0; 16 return 0;
-16 } 17 }
-
-Either way, the differences are quite small. Read-side locking moves
-to rcu_read_lock() and rcu_read_unlock(), update-side locking moves from
-a reader-writer lock to a simple spinlock, and a synchronize_rcu()
-precedes the kfree().
-
-However, there is one potential catch: the read-side and update-side
-critical sections can now run concurrently. In many cases, this will
-not be a problem, but it is necessary to check carefully regardless.
-For example, if multiple independent list updates must be seen as
-a single atomic update, converting to RCU will require special care.
-
-Also, the presence of synchronize_rcu() means that the RCU version of
-delete() can now block. If this is a problem, there is a callback-based
-mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can
-be used in place of synchronize_rcu().
-
-
-7. FULL LIST OF RCU APIs
-
-The RCU APIs are documented in docbook-format header comments in the
-Linux-kernel source code, but it helps to have a full list of the
-APIs, since there does not appear to be a way to categorize them
-in docbook. Here is the list, by category.
-
-RCU list traversal:
-
- list_entry_rcu
- list_first_entry_rcu
- list_next_rcu
- list_for_each_entry_rcu
- list_for_each_entry_continue_rcu
- list_for_each_entry_from_rcu
- hlist_first_rcu
- hlist_next_rcu
- hlist_pprev_rcu
- hlist_for_each_entry_rcu
- hlist_for_each_entry_rcu_bh
- hlist_for_each_entry_from_rcu
- hlist_for_each_entry_continue_rcu
- hlist_for_each_entry_continue_rcu_bh
- hlist_nulls_first_rcu
- hlist_nulls_for_each_entry_rcu
- hlist_bl_first_rcu
- hlist_bl_for_each_entry_rcu
-
-RCU pointer/list update:
-
- rcu_assign_pointer
- list_add_rcu
- list_add_tail_rcu
- list_del_rcu
- list_replace_rcu
- hlist_add_behind_rcu
- hlist_add_before_rcu
- hlist_add_head_rcu
- hlist_del_rcu
- hlist_del_init_rcu
- hlist_replace_rcu
- list_splice_init_rcu
- hlist_nulls_del_init_rcu
- hlist_nulls_del_rcu
- hlist_nulls_add_head_rcu
- hlist_bl_add_head_rcu
- hlist_bl_del_init_rcu
- hlist_bl_del_rcu
- hlist_bl_set_first_rcu
-
-RCU: Critical sections Grace period Barrier
-
- rcu_read_lock synchronize_net rcu_barrier
- rcu_read_unlock synchronize_rcu
- rcu_dereference synchronize_rcu_expedited
- rcu_read_lock_held call_rcu
- rcu_dereference_check kfree_rcu
- rcu_dereference_protected
-
-bh: Critical sections Grace period Barrier
-
- rcu_read_lock_bh call_rcu rcu_barrier
- rcu_read_unlock_bh synchronize_rcu
- [local_bh_disable] synchronize_rcu_expedited
- [and friends]
- rcu_dereference_bh
- rcu_dereference_bh_check
- rcu_dereference_bh_protected
- rcu_read_lock_bh_held
-
-sched: Critical sections Grace period Barrier
-
- rcu_read_lock_sched call_rcu rcu_barrier
- rcu_read_unlock_sched synchronize_rcu
- [preempt_disable] synchronize_rcu_expedited
- [and friends]
- rcu_read_lock_sched_notrace
- rcu_read_unlock_sched_notrace
- rcu_dereference_sched
- rcu_dereference_sched_check
- rcu_dereference_sched_protected
- rcu_read_lock_sched_held
-
-
-SRCU: Critical sections Grace period Barrier
-
- srcu_read_lock call_srcu srcu_barrier
- srcu_read_unlock synchronize_srcu
- srcu_dereference synchronize_srcu_expedited
- srcu_dereference_check
- srcu_read_lock_held
-
-SRCU: Initialization/cleanup
- DEFINE_SRCU
- DEFINE_STATIC_SRCU
- init_srcu_struct
- cleanup_srcu_struct
-
-All: lockdep-checked RCU-protected pointer access
-
- rcu_access_pointer
- rcu_dereference_raw
- RCU_LOCKDEP_WARN
- rcu_sleep_check
- RCU_NONIDLE
-
-See the comment headers in the source code (or the docbook generated
-from them) for more information.
-
-However, given that there are no fewer than four families of RCU APIs
-in the Linux kernel, how do you choose which one to use? The following
-list can be helpful:
-
-a. Will readers need to block? If so, you need SRCU.
-
-b. What about the -rt patchset? If readers would need to block
- in a non-rt kernel, you need SRCU. If readers would block
- in a -rt kernel, but not in a non-rt kernel, SRCU is not
- necessary. (The -rt patchset turns spinlocks into sleeplocks,
- hence this distinction.)
-
-c. Do you need to treat NMI handlers, hardirq handlers,
- and code segments with preemption disabled (whether
- via preempt_disable(), local_irq_save(), local_bh_disable(),
- or some other mechanism) as if they were explicit RCU readers?
- If so, RCU-sched is the only choice that will work for you.
-
-d. Do you need RCU grace periods to complete even in the face
- of softirq monopolization of one or more of the CPUs? For
- example, is your code subject to network-based denial-of-service
- attacks? If so, you should disable softirq across your readers,
- for example, by using rcu_read_lock_bh().
-
-e. Is your workload too update-intensive for normal use of
- RCU, but inappropriate for other synchronization mechanisms?
- If so, consider SLAB_TYPESAFE_BY_RCU (which was originally
- named SLAB_DESTROY_BY_RCU). But please be careful!
-
-f. Do you need read-side critical sections that are respected
- even though they are in the middle of the idle loop, during
- user-mode execution, or on an offlined CPU? If so, SRCU is the
- only choice that will work for you.
-
-g. Otherwise, use RCU.
-
-Of course, this all assumes that you have determined that RCU is in fact
-the right tool for your job.
-
-
-8. ANSWERS TO QUICK QUIZZES
-
-Quick Quiz #1: Why is this argument naive? How could a deadlock
- occur when using this algorithm in a real-world Linux
- kernel? [Referring to the lock-based "toy" RCU
- algorithm.]
-
-Answer: Consider the following sequence of events:
-
- 1. CPU 0 acquires some unrelated lock, call it
- "problematic_lock", disabling irq via
- spin_lock_irqsave().
-
- 2. CPU 1 enters synchronize_rcu(), write-acquiring
- rcu_gp_mutex.
-
- 3. CPU 0 enters rcu_read_lock(), but must wait
- because CPU 1 holds rcu_gp_mutex.
-
- 4. CPU 1 is interrupted, and the irq handler
- attempts to acquire problematic_lock.
-
- The system is now deadlocked.
-
- One way to avoid this deadlock is to use an approach like
- that of CONFIG_PREEMPT_RT, where all normal spinlocks
- become blocking locks, and all irq handlers execute in
- the context of special tasks. In this case, in step 4
- above, the irq handler would block, allowing CPU 1 to
- release rcu_gp_mutex, avoiding the deadlock.
-
- Even in the absence of deadlock, this RCU implementation
- allows latency to "bleed" from readers to other
- readers through synchronize_rcu(). To see this,
- consider task A in an RCU read-side critical section
- (thus read-holding rcu_gp_mutex), task B blocked
- attempting to write-acquire rcu_gp_mutex, and
- task C blocked in rcu_read_lock() attempting to
- read-acquire rcu_gp_mutex. Task A's RCU read-side
- latency is holding up task C, albeit indirectly via
- task B.
-
- Realtime RCU implementations therefore use a counter-based
- approach where tasks in RCU read-side critical sections
- cannot be blocked by tasks executing synchronize_rcu().
-
-Quick Quiz #2: Give an example where Classic RCU's read-side
- overhead is -negative-.
-
-Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT
- kernel where a routing table is used by process-context
- code, but can be updated by irq-context code (for example,
- by an "ICMP REDIRECT" packet). The usual way of handling
- this would be to have the process-context code disable
- interrupts while searching the routing table. Use of
- RCU allows such interrupt-disabling to be dispensed with.
- Thus, without RCU, you pay the cost of disabling interrupts,
- and with RCU you don't.
-
- One can argue that the overhead of RCU in this
- case is negative with respect to the single-CPU
- interrupt-disabling approach. Others might argue that
- the overhead of RCU is merely zero, and that replacing
- the positive overhead of the interrupt-disabling scheme
- with the zero-overhead RCU scheme does not constitute
- negative overhead.
-
- In real life, of course, things are more complex. But
- even the theoretical possibility of negative overhead for
- a synchronization primitive is a bit unexpected. ;-)
-
-Quick Quiz #3: If it is illegal to block in an RCU read-side
- critical section, what the heck do you do in
- PREEMPT_RT, where normal spinlocks can block???
-
-Answer: Just as PREEMPT_RT permits preemption of spinlock
- critical sections, it permits preemption of RCU
- read-side critical sections. It also permits
- spinlocks blocking while in RCU read-side critical
- sections.
-
- Why the apparent inconsistency? Because it is
- possible to use priority boosting to keep the RCU
- grace periods short if need be (for example, if running
- short of memory). In contrast, if blocking waiting
- for (say) network reception, there is no way to know
- what should be boosted. Especially given that the
- process we need to boost might well be a human being
- who just went out for a pizza or something. And although
- a computer-operated cattle prod might arouse serious
- interest, it might also provoke serious objections.
- Besides, how does the computer know what pizza parlor
- the human being went to???
-
-
-ACKNOWLEDGEMENTS
-
-My thanks to the people who helped make this human-readable, including
-Jon Walpole, Josh Triplett, Serge Hallyn, Suzanne Wood, and Alan Stern.
-
-
-For more information, see http://www.rdrop.com/users/paulmck/RCU.
diff --git a/Documentation/admin-guide/acpi/fan_performance_states.rst b/Documentation/admin-guide/acpi/fan_performance_states.rst
new file mode 100644
index 0000000..21d233c
--- /dev/null
+++ b/Documentation/admin-guide/acpi/fan_performance_states.rst
@@ -0,0 +1,62 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================
+ACPI Fan Performance States
+===========================
+
+When the optional _FPS object is present under an ACPI device representing a
+fan (for example, PNP0C0B or INT3404), the ACPI fan driver creates additional
+"state*" attributes in the sysfs directory of the ACPI device in question.
+These attributes list properties of fan performance states.
+
+For more information on _FPS refer to the ACPI specification at:
+
+http://uefi.org/specifications
+
+For instance, the contents of the INT3404 ACPI device sysfs directory
+may look as follows::
+
+ $ ls -l /sys/bus/acpi/devices/INT3404:00/
+ total 0
+ ...
+ -r--r--r-- 1 root root 4096 Dec 13 20:38 state0
+ -r--r--r-- 1 root root 4096 Dec 13 20:38 state1
+ -r--r--r-- 1 root root 4096 Dec 13 20:38 state10
+ -r--r--r-- 1 root root 4096 Dec 13 20:38 state11
+ -r--r--r-- 1 root root 4096 Dec 13 20:38 state2
+ -r--r--r-- 1 root root 4096 Dec 13 20:38 state3
+ -r--r--r-- 1 root root 4096 Dec 13 20:38 state4
+ -r--r--r-- 1 root root 4096 Dec 13 20:38 state5
+ -r--r--r-- 1 root root 4096 Dec 13 20:38 state6
+ -r--r--r-- 1 root root 4096 Dec 13 20:38 state7
+ -r--r--r-- 1 root root 4096 Dec 13 20:38 state8
+ -r--r--r-- 1 root root 4096 Dec 13 20:38 state9
+ -r--r--r-- 1 root root 4096 Dec 13 01:00 status
+ ...
+
+where each of the "state*" files represents one performance state of the fan
+and contains a colon-separated list of 5 integer numbers (fields) with the
+following interpretation::
+
+control_percent:trip_point_index:speed_rpm:noise_level_mdb:power_mw
+
+* ``control_percent``: The percent value to be used to set the fan speed to a
+ specific level using the _FSL object (0-100).
+
+* ``trip_point_index``: The active cooling trip point number that corresponds
+ to this performance state (0-9).
+
+* ``speed_rpm``: Speed of the fan in rotations per minute.
+
+* ``noise_level_mdb``: Audible noise emitted by the fan in this state in
+ millidecibels.
+
+* ``power_mw``: Power draw of the fan in this state in milliwatts.
+
+For example::
+
+ $ cat /sys/bus/acpi/devices/INT3404:00/state1
+ 25:0:3200:12500:1250
+
+When a given field is not populated or its value provided by the platform
+firmware is invalid, the "not-defined" string is shown instead of the value.
diff --git a/Documentation/admin-guide/acpi/index.rst b/Documentation/admin-guide/acpi/index.rst
index 4d13eee..7127768 100644
--- a/Documentation/admin-guide/acpi/index.rst
+++ b/Documentation/admin-guide/acpi/index.rst
@@ -12,3 +12,4 @@
dsdt-override
ssdt-overlays
cppc_sysfs
+ fan_performance_states
diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst
index 6eccf13..27c77d8 100644
--- a/Documentation/admin-guide/blockdev/zram.rst
+++ b/Documentation/admin-guide/blockdev/zram.rst
@@ -1,15 +1,15 @@
========================================
-zram: Compressed RAM based block devices
+zram: Compressed RAM-based block devices
========================================
Introduction
============
-The zram module creates RAM based block devices named /dev/zram<id>
+The zram module creates RAM-based block devices named /dev/zram<id>
(<id> = 0, 1, ...). Pages written to these disks are compressed and stored
in memory itself. These disks allow very fast I/O and compression provides
-good amounts of memory savings. Some of the usecases include /tmp storage,
-use as swap disks, various caches under /var and maybe many more :)
+good amounts of memory savings. Some of the use cases include /tmp storage,
+use as swap disks, various caches under /var and maybe many more. :)
Statistics for individual zram devices are exported through sysfs nodes at
/sys/block/zram<id>/
@@ -43,17 +43,17 @@
======== =============================================================
-EBUSY an attempt to modify an attribute that cannot be changed once
- the device has been initialised. Please reset device first;
+ the device has been initialised. Please reset device first.
-ENOMEM zram was not able to allocate enough memory to fulfil your
- needs;
+ needs.
-EINVAL invalid input has been provided.
======== =============================================================
-If you use 'echo', the returned value that is changed by 'echo' utility,
+If you use 'echo', the returned value is set by the 'echo' utility,
and, in general case, something like::
echo 3 > /sys/block/zram0/max_comp_streams
- if [ $? -ne 0 ];
+ if [ $? -ne 0 ]; then
handle_error
fi
@@ -65,7 +65,8 @@
::
modprobe zram num_devices=4
- This creates 4 devices: /dev/zram{0,1,2,3}
+
+This creates 4 devices: /dev/zram{0,1,2,3}
num_devices parameter is optional and tells zram how many devices should be
pre-created. Default: 1.
@@ -73,12 +74,12 @@
2) Set max number of compression streams
========================================
-Regardless the value passed to this attribute, ZRAM will always
-allocate multiple compression streams - one per online CPUs - thus
+Regardless of the value passed to this attribute, ZRAM will always
+allocate multiple compression streams - one per online CPU - thus
allowing several concurrent compression operations. The number of
allocated compression streams goes down when some of the CPUs
become offline. There is no single-compression-stream mode anymore,
-unless you are running a UP system or has only 1 CPU online.
+unless you are running a UP system or have only 1 CPU online.
To find out how many streams are currently available::
@@ -89,7 +90,7 @@
Using comp_algorithm device attribute one can see available and
currently selected (shown in square brackets) compression algorithms,
-change selected compression algorithm (once the device is initialised
+or change the selected compression algorithm (once the device is initialised
there is no way to change compression algorithm).
Examples::
@@ -167,9 +168,9 @@
zram provides a control interface, which enables dynamic (on-demand) device
addition and removal.
-In order to add a new /dev/zramX device, perform read operation on hot_add
-attribute. This will return either new device's device id (meaning that you
-can use /dev/zram<id>) or error code.
+In order to add a new /dev/zramX device, perform a read operation on the hot_add
+attribute. This will return either the new device's device id (meaning that you
+can use /dev/zram<id>) or an error code.
Example::
@@ -186,8 +187,8 @@
Per-device statistics are exported as various nodes under /sys/block/zram<id>/
-A brief description of exported device attributes. For more details please
-read Documentation/ABI/testing/sysfs-block-zram.
+A brief description of exported device attributes follows. For more details
+please read Documentation/ABI/testing/sysfs-block-zram.
====================== ====== ===============================================
Name access description
@@ -245,7 +246,7 @@
File /sys/block/zram<id>/mm_stat
-The stat file represents device's mm statistics. It consists of a single
+The mm_stat file represents the device's mm statistics. It consists of a single
line of text and contains the following stats separated by whitespace:
================ =============================================================
@@ -261,7 +262,7 @@
Unit: bytes
mem_limit the maximum amount of memory ZRAM can use to store
the compressed data
- mem_used_max the maximum amount of memory zram have consumed to
+ mem_used_max the maximum amount of memory zram has consumed to
store the data
same_pages the number of same element filled pages written to this disk.
No memory is allocated for such pages.
@@ -271,7 +272,7 @@
File /sys/block/zram<id>/bd_stat
-The stat file represents device's backing device statistics. It consists of
+The bd_stat file represents a device's backing device statistics. It consists of
a single line of text and contains the following stats separated by whitespace:
============== =============================================================
@@ -316,9 +317,9 @@
echo /dev/sda5 > /sys/block/zramX/backing_dev
before disksize setting. It supports only partition at this moment.
-If admin want to use incompressible page writeback, they could do via::
+If admin wants to use incompressible page writeback, they could do it via::
- echo huge > /sys/block/zramX/write
+ echo huge > /sys/block/zramX/writeback
To use idle page writeback, first, user need to declare zram pages
as idle::
@@ -326,7 +327,7 @@
echo all > /sys/block/zramX/idle
From now on, any pages on zram are idle pages. The idle mark
-will be removed until someone request access of the block.
+will not be removed until someone requests access to the block.
IOW, unless there is access request, those pages are still idle pages.
Admin can request writeback of those idle pages at right timing via::
@@ -341,16 +342,16 @@
To overcome the concern, zram supports "writeback_limit" feature.
The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
-any writeback. IOW, if admin want to apply writeback budget, he should
+any writeback. IOW, if admin wants to apply writeback budget, he should
enable writeback_limit_enable via::
$ echo 1 > /sys/block/zramX/writeback_limit_enable
Once writeback_limit_enable is set, zram doesn't allow any writeback
-until admin set the budget via /sys/block/zramX/writeback_limit.
+until admin sets the budget via /sys/block/zramX/writeback_limit.
(If admin doesn't enable writeback_limit_enable, writeback_limit's value
-assigned via /sys/block/zramX/writeback_limit is meaninless.)
+assigned via /sys/block/zramX/writeback_limit is meaningless.)
If admin want to limit writeback as per-day 400M, he could do it
like below::
@@ -361,13 +362,13 @@
/sys/block/zram0/writeback_limit.
$ echo 1 > /sys/block/zram0/writeback_limit_enable
-If admin want to allow further write again once the bugdet is exausted,
+If admin wants to allow further write again once the budget is exhausted,
he could do it like below::
$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
/sys/block/zram0/writeback_limit
-If admin want to see remaining writeback budget since he set::
+If admin wants to see remaining writeback budget since last set::
$ cat /sys/block/zramX/writeback_limit
@@ -375,12 +376,12 @@
$ echo 0 > /sys/block/zramX/writeback_limit_enable
-The writeback_limit count will reset whenever you reset zram(e.g.,
+The writeback_limit count will reset whenever you reset zram (e.g.,
system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of
writeback happened until you reset the zram to allocate extra writeback
budget in next setting is user's job.
-If admin want to measure writeback count in a certain period, he could
+If admin wants to measure writeback count in a certain period, he could
know it via /sys/block/zram0/bd_stat's 3rd column.
memory tracking
diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
new file mode 100644
index 0000000..b342a67
--- /dev/null
+++ b/Documentation/admin-guide/bootconfig.rst
@@ -0,0 +1,190 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _bootconfig:
+
+==================
+Boot Configuration
+==================
+
+:Author: Masami Hiramatsu <mhiramat@kernel.org>
+
+Overview
+========
+
+The boot configuration expands the current kernel command line to support
+additional key-value data when booting the kernel in an efficient way.
+This allows administrators to pass a structured-Key config file.
+
+Config File Syntax
+==================
+
+The boot config syntax is a simple structured key-value. Each key consists
+of dot-connected-words, and key and value are connected by ``=``. The value
+has to be terminated by semi-colon (``;``) or newline (``\n``).
+For array value, array entries are separated by comma (``,``). ::
+
+KEY[.WORD[...]] = VALUE[, VALUE2[...]][;]
+
+Unlike the kernel command line syntax, spaces are OK around the comma and ``=``.
+
+Each key word must contain only letters, numbers, dashes (``-``) or underscores
+(``_``). And each value only contains printable characters or spaces except
+for delimiters such as semi-colon (``;``), new-line (``\n``), comma (``,``),
+hash (``#``) and closing brace (``}``).
+
+If you want to use those delimiters in a value, you can use either double-
+quotes (``"VALUE"``) or single-quotes (``'VALUE'``) to quote it. Note that
+you can not escape these quotes.
+
+There can be a key which doesn't have value or has an empty value. Those keys
+are used for checking if the key exists or not (like a boolean).
+
+Key-Value Syntax
+----------------
+
+The boot config file syntax allows the user to merge keys that share the same
+leading words by using braces. For example::
+
+ foo.bar.baz = value1
+ foo.bar.qux.quux = value2
+
+These can be written also in::
+
+ foo.bar {
+ baz = value1
+ qux.quux = value2
+ }
+
+Or, more concisely, as follows::
+
+ foo.bar { baz = value1; qux.quux = value2 }
+
+In both styles, identical key words are automatically merged when parsed
+at boot time. So you can append similar trees or key-values.
+
+Comments
+--------
+
+The config syntax accepts shell-script style comments. The comments starting
+with hash ("#") until newline ("\n") will be ignored.
+
+::
+
+ # comment line
+ foo = value # value is set to foo.
+ bar = 1, # 1st element
+ 2, # 2nd element
+ 3 # 3rd element
+
+This is parsed as below::
+
+ foo = value
+ bar = 1, 2, 3
+
+Note that you can not put a comment between value and delimiter (``,`` or
+``;``). This means the following config has a syntax error ::
+
+ key = 1 # comment
+ ,2
+
+
+/proc/bootconfig
+================
+
+/proc/bootconfig is a user-space interface of the boot config.
+Unlike /proc/cmdline, this file shows the key-value style list.
+Each key-value pair is shown on its own line in the following style::
+
+ KEY[.WORDS...] = "[VALUE]"[,"VALUE2"...]
+
+
+Boot Kernel With a Boot Config
+==============================
+
+Since the boot configuration file is loaded with initrd, it will be added
+to the end of the initrd (initramfs) image file. The Linux kernel decodes
+the last part of the initrd image in memory to get the boot configuration
+data.
+Because of this "piggyback" method, there is no need to change or
+update the boot loader and the kernel image itself.
+
+To do this operation, Linux kernel provides "bootconfig" command under
+tools/bootconfig, which allows admin to apply or delete the config file
+to/from initrd image. You can build it by the following command::
+
+ # make -C tools/bootconfig
+
+To add your boot config file to initrd image, run bootconfig as below
+(old data is removed automatically if it exists)::
+
+ # tools/bootconfig/bootconfig -a your-config /boot/initrd.img-X.Y.Z
+
+To remove the config from the image, you can use -d option as below::
+
+ # tools/bootconfig/bootconfig -d /boot/initrd.img-X.Y.Z
+
+Then add "bootconfig" on the normal kernel command line to tell the
+kernel to look for the bootconfig at the end of the initrd file.
+
+Config File Limitation
+======================
+
+Currently the maximum config size is 32KB and the total key-words (not
+key-value entries) must be under 1024 nodes.
+Note: this is not the number of entries but nodes; an entry must consume
+at least 2 nodes (a key-word and a value). So theoretically, it will be
+up to 512 key-value pairs. If keys contain 3 words on average, it can
+contain 256 key-value pairs. In most cases, the number of config items
+will be under 100 entries and smaller than 8KB, so it would be enough.
+If the node number exceeds 1024, parser returns an error even if the file
+size is smaller than 32KB.
+Anyway, since bootconfig command verifies it when appending a boot config
+to initrd image, user can notice it before boot.
+
+
+Bootconfig APIs
+===============
+
+User can query or loop on key-value pairs. It is also possible to find
+a root (prefix) key node and find key-values under that node.
+
+If you have a key string, you can query the value directly with the key
+using xbc_find_value(). If you want to know what keys exist in the boot
+config, you can use xbc_for_each_key_value() to iterate key-value pairs.
+Note that you need to use xbc_array_for_each_value() for accessing
+each array's value, e.g.::
+
+ vnode = NULL;
+ xbc_find_value("key.word", &vnode);
+ if (vnode && xbc_node_is_array(vnode))
+ xbc_array_for_each_value(vnode, value) {
+ printk("%s ", value);
+ }
+
+If you want to focus on keys which have a prefix string, you can use
+xbc_find_node() to find a node by the prefix string, and iterate
+keys under the prefix node with xbc_node_for_each_key_value().
+
+But the most typical usage is to get the named value under prefix
+or get the named array under prefix as below::
+
+ root = xbc_find_node("key.prefix");
+ value = xbc_node_find_value(root, "option", &vnode);
+ ...
+ xbc_node_for_each_array_value(root, "array-option", value, anode) {
+ ...
+ }
+
+This accesses a value of "key.prefix.option" and an array of
+"key.prefix.array-option".
+
+Locking is not needed, since after initialization, the config becomes
+read-only. All data and keys must be copied if you need to modify them.
+
+
+Functions and structures
+========================
+
+.. kernel-doc:: include/linux/bootconfig.h
+.. kernel-doc:: lib/bootconfig.c
+
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 0636bcb..3f80146 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -61,6 +61,8 @@
5-6. Device
5-7. RDMA
5-7-1. RDMA Interface Files
+ 5-8. HugeTLB
+ 5-8-1. HugeTLB Interface Files
5-8. Misc
5-8-1. perf_event
5-N. Non-normative information
@@ -2056,6 +2058,33 @@
mlx4_0 hca_handle=1 hca_object=20
ocrdma1 hca_handle=1 hca_object=23
+HugeTLB
+-------
+
+The HugeTLB controller allows limiting the HugeTLB usage per control group and
+enforces the controller limit during page fault.
+
+HugeTLB Interface Files
+~~~~~~~~~~~~~~~~~~~~~~~
+
+ hugetlb.<hugepagesize>.current
+ Show current usage for "hugepagesize" hugetlb. It exists for all
+ cgroups except root.
+
+ hugetlb.<hugepagesize>.max
+ Set/show the hard limit of "hugepagesize" hugetlb usage.
+ The default value is "max". It exists for all cgroups except root.
+
+ hugetlb.<hugepagesize>.events
+ A read-only flat-keyed file which exists on non-root cgroups.
+
+ max
+ The number of allocation failures due to the HugeTLB limit
+
+ hugetlb.<hugepagesize>.events.local
+ Similar to hugetlb.<hugepagesize>.events but the fields in the file
+ are local to the cgroup i.e. not hierarchical. The file modified event
+ generated on this file reflects only the local events.
Misc
----
diff --git a/Documentation/admin-guide/device-mapper/dm-raid.rst b/Documentation/admin-guide/device-mapper/dm-raid.rst
index f634467..695a2ea 100644
--- a/Documentation/admin-guide/device-mapper/dm-raid.rst
+++ b/Documentation/admin-guide/device-mapper/dm-raid.rst
@@ -419,3 +419,5 @@
rebuild errors.
1.15.0 Fix size extensions not being synchronized in case of new MD bitmap
pages allocated; also fix those not occurring after previous reductions
+ 1.15.1 Fix argument count and arguments for rebuild/write_mostly/journal_(dev|mode)
+ on the status line.
diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt
index 1c5d228..2a97aae 100644
--- a/Documentation/admin-guide/devices.txt
+++ b/Documentation/admin-guide/devices.txt
@@ -319,7 +319,7 @@
182 = /dev/perfctr Performance-monitoring counters
183 = /dev/hwrng Generic random number generator
184 = /dev/cpu/microcode CPU microcode update interface
- 186 = /dev/atomicps Atomic shapshot of process state data
+ 186 = /dev/atomicps Atomic snapshot of process state data
187 = /dev/irnet IrNET device
188 = /dev/smbusbios SMBus BIOS
189 = /dev/ussp_ctl User space serial port control
diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst
index 059ddcb..9443fce 100644
--- a/Documentation/admin-guide/ext4.rst
+++ b/Documentation/admin-guide/ext4.rst
@@ -92,6 +92,8 @@
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
the ordering)
* Case-insensitive file name lookups
+* file-based encryption support (fscrypt)
+* file-based verity support (fsverity)
[1] Filesystems with a block size of 1k may see a limit imposed by the
directory hash tree having a maximum depth of two.
@@ -181,14 +183,17 @@
system after its metadata has been committed to the journal.
commit=nrsec (*)
- Ext4 can be told to sync all its data and metadata every 'nrsec'
- seconds. The default value is 5 seconds. This means that if you lose
- your power, you will lose as much as the latest 5 seconds of work (your
- filesystem will not be damaged though, thanks to the journaling). This
- default value (or any low value) will hurt performance, but it's good
- for data-safety. Setting it to 0 will have the same effect as leaving
- it at the default (5 seconds). Setting it to very large values will
- improve performance.
+ This setting limits the maximum age of the running transaction to
+ 'nrsec' seconds. The default value is 5 seconds. This means that if
+ you lose your power, you will lose as much as the latest 5 seconds of
+ metadata changes (your filesystem will not be damaged though, thanks
+ to the journaling). This default value (or any low value) will hurt
+ performance, but it's good for data-safety. Setting it to 0 will have
+ the same effect as leaving it at the default (5 seconds). Setting it
+ to very large values will improve performance. Note that due to
+ delayed allocation even older data can be lost on power failure since
+ writeback of those data begins only after time set in
+ /proc/sys/vm/dirty_expire_centisecs.
barrier=<0|1(*)>, barrier(*), nobarrier
This enables/disables the use of write barriers in the jbd code.
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 4405b74..f1d0ccf 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -64,6 +64,7 @@
binderfs
binfmt-misc
blockdev/index
+ bootconfig
braille-console
btmrvl
cgroup-v1/index
@@ -76,6 +77,7 @@
device-mapper/index
efi-stub
ext4
+ nfs/index
gpio/index
highuid
hw_random
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index ade4e6e..dbc22d6 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -437,6 +437,12 @@
no delay (0).
Format: integer
+ bootconfig [KNL]
+ Extended command line options can be added to an initrd
+ and this will cause the kernel to look for it.
+
+ See Documentation/admin-guide/bootconfig.rst
+
bert_disable [ACPI]
Disable BERT OS support on buggy BIOSes.
@@ -511,7 +517,7 @@
1 -- check protection requested by application.
Default value is set via a kernel config option.
Value can be changed at runtime via
- /selinux/checkreqprot.
+ /sys/fs/selinux/checkreqprot.
cio_ignore= [S390]
See Documentation/s390/common_io.rst for details.
@@ -834,6 +840,18 @@
dump out devices still on the deferred probe list after
retrying.
+ dfltcc= [HW,S390]
+ Format: { on | off | def_only | inf_only | always }
+ on: s390 zlib hardware support for compression on
+ level 1 and decompression (default)
+ off: No s390 zlib hardware support
+ def_only: s390 zlib hardware support for deflate
+ only (compression on level 1)
+ inf_only: s390 zlib hardware support for inflate
+ only (decompression)
+ always: Same as 'on' but ignores the selected compression
+ level always using hardware support (used for debugging)
+
dhash_entries= [KNL]
Set number of hash buckets for dentry cache.
@@ -1165,10 +1183,10 @@
efi= [EFI]
Format: { "old_map", "nochunk", "noruntime", "debug",
- "nosoftreserve" }
+ "nosoftreserve", "disable_early_pci_dma",
+ "no_disable_early_pci_dma" }
old_map [X86-64]: switch to the old ioremap-based EFI
- runtime services mapping. 32-bit still uses this one by
- default.
+ runtime services mapping. [Needs CONFIG_X86_UV=y]
nochunk: disable reading files in "chunks" in the EFI
boot stub, as chunking can cause problems with some
firmware implementations.
@@ -1180,6 +1198,10 @@
claim. Specify efi=nosoftreserve to disable this
reservation and treat the memory by its base type
(i.e. EFI_CONVENTIONAL_MEMORY / "System RAM").
+ disable_early_pci_dma: Disable the busmaster bit on all
+ PCI bridges while in the EFI boot stub
+ no_disable_early_pci_dma: Leave the busmaster bit set
+ on all PCI bridges while in the EFI boot stub
efi_no_storage_paranoia [EFI; X86]
Using this parameter you can use more than 50% of
@@ -1245,7 +1267,8 @@
0 -- permissive (log only, no denials).
1 -- enforcing (deny and log).
Default value is 0.
- Value can be changed at runtime via /selinux/enforce.
+ Value can be changed at runtime via
+ /sys/fs/selinux/enforce.
erst_disable [ACPI]
Disable Error Record Serialization Table (ERST)
@@ -1933,10 +1956,32 @@
<cpu number> begins at 0 and the maximum value is
"number of CPUs in system - 1".
+ managed_irq
+
+ Isolate from being targeted by managed interrupts
+ which have an interrupt mask containing isolated
+ CPUs. The affinity of managed interrupts is
+ handled by the kernel and cannot be changed via
+ the /proc/irq/* interfaces.
+
+ This isolation is best effort and only effective
+ if the automatically assigned interrupt mask of a
+ device queue contains isolated and housekeeping
+ CPUs. If housekeeping CPUs are online then such
+ interrupts are directed to the housekeeping CPU
+ so that IO submitted on the housekeeping CPU
+ cannot disturb the isolated CPU.
+
+ If a queue's affinity mask contains only isolated
+ CPUs then this parameter has no effect on the
+ interrupt routing decision, though interrupts are
+ only delivered when tasks running on those
+ isolated CPUs submit IO. IO submitted on
+ housekeeping CPUs has no influence on those
+ queues.
+
The format of <cpu-list> is described above.
-
-
iucv= [HW,NET]
ivrs_ioapic [HW,X86_64]
@@ -3978,6 +4023,19 @@
test until boot completes in order to avoid
interference.
+ rcuperf.kfree_rcu_test= [KNL]
+ Set to measure performance of kfree_rcu() flooding.
+
+ rcuperf.kfree_nthreads= [KNL]
+ The number of threads running loops of kfree_rcu().
+
+ rcuperf.kfree_alloc_num= [KNL]
+ Number of allocations and frees done in an iteration.
+
+ rcuperf.kfree_loops= [KNL]
+ Number of loops doing rcuperf.kfree_alloc_num number
+ of allocations and frees.
+
rcuperf.nreaders= [KNL]
Set number of RCU readers. The value -1 selects
N, where N is the number of CPUs. A value
@@ -4348,9 +4406,7 @@
See security/selinux/Kconfig help text.
0 -- disable.
1 -- enable.
- Default value is set via kernel config option.
- If enabled at boot time, /selinux/disable can be used
- later to disable prior to initial policy load.
+ Default value is 1.
apparmor= [APPARMOR] Disable or enable AppArmor at boot time
Format: { "0" | "1" }
diff --git a/Documentation/admin-guide/nfs/fault_injection.rst b/Documentation/admin-guide/nfs/fault_injection.rst
new file mode 100644
index 0000000..eb029c0
--- /dev/null
+++ b/Documentation/admin-guide/nfs/fault_injection.rst
@@ -0,0 +1,70 @@
+===================
+NFS Fault Injection
+===================
+
+Fault injection is a method for forcing errors that may not normally occur, or
+may be difficult to reproduce. Forcing these errors in a controlled environment
+can help the developer find and fix bugs before their code is shipped in a
+production system. Injecting an error on the Linux NFS server will allow us to
+observe how the client reacts and if it manages to recover its state correctly.
+
+NFSD_FAULT_INJECTION must be selected when configuring the kernel to use this
+feature.
+
+
+Using Fault Injection
+=====================
+On the client, mount the fault injection server through NFS v4.0+ and do some
+work over NFS (open files, take locks, ...).
+
+On the server, mount the debugfs filesystem to <debug_dir> and ls
+<debug_dir>/nfsd. This will show a list of files that will be used for
+injecting faults on the NFS server. As root, write a number n to the file
+corresponding to the action you want the server to take. The server will then
+process the first n items it finds. So if you want to forget 5 locks, echo '5'
+to <debug_dir>/nfsd/forget_locks. A value of 0 will tell the server to forget
+all corresponding items. A log message will be created containing the number
+of items forgotten (check dmesg).
+
+Go back to work on the client and check if the client recovered from the error
+correctly.
+
+
+Available Faults
+================
+forget_clients:
+ The NFS server keeps a list of clients that have placed a mount call. If
+ this list is cleared, the server will have no knowledge of who the client
+ is, forcing the client to reauthenticate with the server.
+
+forget_openowners:
+ The NFS server keeps a list of what files are currently opened and who
+ they were opened by. Clearing this list will force the client to reopen
+ its files.
+
+forget_locks:
+ The NFS server keeps a list of what files are currently locked in the VFS.
+ Clearing this list will force the client to reclaim its locks (files are
+ unlocked through the VFS as they are cleared from this list).
+
+forget_delegations:
+ A delegation is used to assure the client that a file, or part of a file,
+ has not changed since the delegation was awarded. Clearing this list will
+ force the client to reacquire its delegation before accessing the file
+ again.
+
+recall_delegations:
+ Delegations can be recalled by the server when another client attempts to
+ access a file. This test will notify the client that its delegation has
+ been revoked, forcing the client to reacquire the delegation before using
+ the file again.
+
+
+tools/nfs/inject_faults.sh script
+=================================
+This script has been created to ease the fault injection process. This script
+will detect the mounted debugfs directory and write to the files located there
+based on the arguments passed by the user. For example, running
+`inject_faults.sh forget_locks 1` as root will instruct the server to forget
+one lock. Running `inject_faults.sh forget_locks` will instruct the server to
+forget all locks.
diff --git a/Documentation/admin-guide/nfs/index.rst b/Documentation/admin-guide/nfs/index.rst
new file mode 100644
index 0000000..6b5a3c9
--- /dev/null
+++ b/Documentation/admin-guide/nfs/index.rst
@@ -0,0 +1,15 @@
+=============
+NFS
+=============
+
+.. toctree::
+ :maxdepth: 1
+
+ nfs-client
+ nfsroot
+ nfs-rdma
+ nfsd-admin-interfaces
+ nfs-idmapper
+ pnfs-block-server
+ pnfs-scsi-server
+ fault_injection
diff --git a/Documentation/admin-guide/nfs/nfs-client.rst b/Documentation/admin-guide/nfs/nfs-client.rst
new file mode 100644
index 0000000..c4b777c
--- /dev/null
+++ b/Documentation/admin-guide/nfs/nfs-client.rst
@@ -0,0 +1,141 @@
+==========
+NFS Client
+==========
+
+The NFS client
+==============
+
+The NFS version 2 protocol was first documented in RFC1094 (March 1989).
+Since then two more major releases of NFS have been published, with NFSv3
+being documented in RFC1813 (June 1995), and NFSv4 in RFC3530 (April
+2003).
+
+The Linux NFS client currently supports all the above published versions,
+and work is in progress on adding support for minor version 1 of the NFSv4
+protocol.
+
+The purpose of this document is to provide information on some of the
+special features of the NFS client that can be configured by system
+administrators.
+
+
+The nfs4_unique_id parameter
+============================
+
+NFSv4 requires clients to identify themselves to servers with a unique
+string. File open and lock state shared between one client and one server
+is associated with this identity. To support robust NFSv4 state recovery
+and transparent state migration, this identity string must not change
+across client reboots.
+
+Without any other intervention, the Linux client uses a string that contains
+the local system's node name. System administrators, however, often do not
+take care to ensure that node names are fully qualified and do not change
+over the lifetime of a client system. Node names can have other
+administrative requirements that require particular behavior that does not
+work well as part of an nfs_client_id4 string.
+
+The nfs.nfs4_unique_id boot parameter specifies a unique string that can be
+used instead of a system's node name when an NFS client identifies itself to
+a server. Thus, if the system's node name is not unique, or it changes, its
+nfs.nfs4_unique_id stays the same, preventing collision with other clients
+or loss of state during NFS reboot recovery or transparent state migration.
+
+The nfs.nfs4_unique_id string is typically a UUID, though it can contain
+anything that is believed to be unique across all NFS clients. An
+nfs4_unique_id string should be chosen when a client system is installed,
+just as a system's root file system gets a fresh UUID in its label at
+install time.
+
+The string should remain fixed for the lifetime of the client. It can be
+changed safely if care is taken that the client shuts down cleanly and all
+outstanding NFSv4 state has expired, to prevent loss of NFSv4 state.
+
+This string can be stored in an NFS client's grub.conf, or it can be provided
+via a net boot facility such as PXE. It may also be specified as an nfs.ko
+module parameter. Specifying a uniquifier string is not supported for NFS
+clients running in containers.
+
+
+The DNS resolver
+================
+
+NFSv4 allows for one server to refer the NFS client to data that has been
+migrated onto another server by means of the special "fs_locations"
+attribute. See `RFC3530 Section 6: Filesystem Migration and Replication`_ and
+`Implementation Guide for Referrals in NFSv4`_.
+
+.. _RFC3530 Section 6\: Filesystem Migration and Replication: http://tools.ietf.org/html/rfc3530#section-6
+.. _Implementation Guide for Referrals in NFSv4: http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00
+
+The fs_locations information can take the form of either an ip address and
+a path, or a DNS hostname and a path. The latter requires the NFS client to
+do a DNS lookup in order to mount the new volume, and hence the need for an
+upcall to allow userland to provide this service.
+
+Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual
+/var/lib/nfs/rpc_pipefs, the upcall consists of the following steps:
+
+ (1) The process checks the dns_resolve cache to see if it contains a
+ valid entry. If so, it returns that entry and exits.
+
+ (2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent'
+ (may be changed using the 'nfs.cache_getent' kernel boot parameter)
+ is run, with two arguments:
+ - the cache name, "dns_resolve"
+ - the hostname to resolve
+
+ (3) After looking up the corresponding ip address, the helper script
+ writes the result into the rpc_pipefs pseudo-file
+ '/var/lib/nfs/rpc_pipefs/cache/dns_resolve/channel'
+ in the following (text) format:
+
+ "<ip address> <hostname> <ttl>\n"
+
+ Where <ip address> is in the usual IPv4 (192.0.2.10) or IPv6
+ (ffee:ddcc:bbaa:9988:7766:5544:3322:1100, ffee::1100, ...) format.
+ <hostname> is identical to the second argument of the helper
+ script, and <ttl> is the 'time to live' of this cache entry (in
+ units of seconds).
+
+ .. note::
+ If <ip address> is invalid, say the string "0", then a negative
+ entry is created, which will cause the kernel to treat the hostname
+ as having no valid DNS translation.
+
+
+
+
+A basic sample /sbin/nfs_cache_getent
+=====================================
+.. code-block:: sh
+
+ #!/bin/bash
+ #
+ ttl=600
+ #
+ cut=/usr/bin/cut
+ getent=/usr/bin/getent
+ rpc_pipefs=/var/lib/nfs/rpc_pipefs
+ #
+ die()
+ {
+ echo "Usage: $0 cache_name entry_name"
+ exit 1
+ }
+
+ [ $# -lt 2 ] && die
+ cachename="$1"
+ cache_path=${rpc_pipefs}/cache/${cachename}/channel
+
+ case "${cachename}" in
+ dns_resolve)
+ name="$2"
+ result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )"
+ [ -z "${result}" ] && result="0"
+ ;;
+ *)
+ die
+ ;;
+ esac
+ echo "${result} ${name} ${ttl}" >${cache_path}
diff --git a/Documentation/admin-guide/nfs/nfs-idmapper.rst b/Documentation/admin-guide/nfs/nfs-idmapper.rst
new file mode 100644
index 0000000..58b8e63
--- /dev/null
+++ b/Documentation/admin-guide/nfs/nfs-idmapper.rst
@@ -0,0 +1,78 @@
+=============
+NFS ID Mapper
+=============
+
+Id mapper is used by NFS to translate user and group ids into names, and to
+translate user and group names into ids. Part of this translation involves
+performing an upcall to userspace to request the information. There are two
+ways NFS could obtain this information: placing a call to /sbin/request-key
+or by placing a call to the rpc.idmap daemon.
+
+NFS will attempt to call /sbin/request-key first. If this succeeds, the
+result will be cached using the generic request-key cache. This call should
+only fail if /etc/request-key.conf is not configured for the id_resolver key
+type, see the "Configuring" section below if you wish to use the request-key
+method.
+
+If the call to /sbin/request-key fails (if /etc/request-key.conf is not
+configured with the id_resolver key type), then the idmapper will ask the
+legacy rpc.idmap daemon for the id mapping. This result will be stored
+in a custom NFS idmap cache.
+
+
+Configuring
+===========
+
+The file /etc/request-key.conf will need to be modified so /sbin/request-key can
+direct the upcall. The following line should be added:
+
+``#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...``
+``#====== ======= =============== =============== ===============================``
+``create id_resolver * * /usr/sbin/nfs.idmap %k %d 600``
+
+
+This will direct all id_resolver requests to the program /usr/sbin/nfs.idmap.
+The last parameter, 600, defines how many seconds into the future the key will
+expire. This parameter is optional for /usr/sbin/nfs.idmap. When the timeout
+is not specified, nfs.idmap will default to 600 seconds.
+
+id mapper uses four key descriptions::
+
+ uid: Find the UID for the given user
+ gid: Find the GID for the given group
+ user: Find the user name for the given UID
+ group: Find the group name for the given GID
+
+You can handle any of these individually, rather than using the generic upcall
+program. If you would like to use your own program for a uid lookup then you
+would edit your request-key.conf so it looks similar to this:
+
+``#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...``
+``#====== ======= =============== =============== ===============================``
+``create id_resolver uid:* * /some/other/program %k %d 600``
+``create id_resolver * * /usr/sbin/nfs.idmap %k %d 600``
+
+
+Notice that the new line was added above the line for the generic program.
+request-key will find the first matching line and corresponding program. In
+this case, /some/other/program will handle all uid lookups and
+/usr/sbin/nfs.idmap will handle gid, user, and group lookups.
+
+See Documentation/security/keys/request-key.rst for more information
+about the request-key function.
+
+
+nfs.idmap
+=========
+
+nfs.idmap is designed to be called by request-key, and should not be run "by
+hand". This program takes two arguments, a serialized key and a key
+description. The serialized key is first converted into a key_serial_t, and
+then passed as an argument to keyctl_instantiate (both are part of keyutils.h).
+
+The actual lookups are performed by functions found in nfsidmap.h. nfs.idmap
+determines the correct function to call by looking at the first part of the
+description string. For example, a uid lookup description will appear as
+"uid:user@domain".
+
+nfs.idmap will return 0 if the key was instantiated, and non-zero otherwise.
diff --git a/Documentation/admin-guide/nfs/nfs-rdma.rst b/Documentation/admin-guide/nfs/nfs-rdma.rst
new file mode 100644
index 0000000..ef0f367
--- /dev/null
+++ b/Documentation/admin-guide/nfs/nfs-rdma.rst
@@ -0,0 +1,292 @@
+===================
+Setting up NFS/RDMA
+===================
+
+:Author:
+ NetApp and Open Grid Computing (May 29, 2008)
+
+.. warning::
+ This document is probably obsolete.
+
+Overview
+========
+
+This document describes how to install and setup the Linux NFS/RDMA client
+and server software.
+
+The NFS/RDMA client was first included in Linux 2.6.24. The NFS/RDMA server
+was first included in the following release, Linux 2.6.25.
+
+In our testing, we have obtained excellent performance results (full 10Gbit
+wire bandwidth at minimal client CPU) under many workloads. The code passes
+the full Connectathon test suite and operates over both Infiniband and iWARP
+RDMA adapters.
+
+Getting Help
+============
+
+If you get stuck, you can ask questions on the
+nfs-rdma-devel@lists.sourceforge.net mailing list.
+
+Installation
+============
+
+These instructions are a step by step guide to building a machine for
+use with NFS/RDMA.
+
+- Install an RDMA device
+
+ Any device supported by the drivers in drivers/infiniband/hw is acceptable.
+
+ Testing has been performed using several Mellanox-based IB cards, the
+ Ammasso AMS1100 iWARP adapter, and the Chelsio cxgb3 iWARP adapter.
+
+- Install a Linux distribution and tools
+
+ The first kernel release to contain both the NFS/RDMA client and server was
+ Linux 2.6.25. Therefore, a distribution compatible with this and subsequent
+ Linux kernel releases should be installed.
+
+ The procedures described in this document have been tested with
+ distributions from Red Hat's Fedora Project (http://fedora.redhat.com/).
+
+- Install nfs-utils-1.1.2 or greater on the client
+
+ An NFS/RDMA mount point can be obtained by using the mount.nfs command in
+ nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils
+ version with support for NFS/RDMA mounts, but for various reasons we
+ recommend using nfs-utils-1.1.2 or greater). To see which version of
+ mount.nfs you are using, type:
+
+ .. code-block:: sh
+
+ $ /sbin/mount.nfs -V
+
+ If the version is less than 1.1.2 or the command does not exist,
+ you should install the latest version of nfs-utils.
+
+ Download the latest package from: http://www.kernel.org/pub/linux/utils/nfs
+
+ Uncompress the package and follow the installation instructions.
+
+ If you will not need the idmapper and gssd executables (you do not need
+ these to create an NFS/RDMA enabled mount command), the installation
+ process can be simplified by disabling these features when running
+ configure:
+
+ .. code-block:: sh
+
+ $ ./configure --disable-gss --disable-nfsv4
+
+ To build nfs-utils you will need the tcp_wrappers package installed. For
+ more information on this see the package's README and INSTALL files.
+
+ After building the nfs-utils package, there will be a mount.nfs binary in
+ the utils/mount directory. This binary can be used to initiate NFS v2, v3,
+ or v4 mounts. To initiate a v4 mount, the binary must be called
+ mount.nfs4. The standard technique is to create a symlink called
+ mount.nfs4 to mount.nfs.
+
+ This mount.nfs binary should be installed at /sbin/mount.nfs as follows:
+
+ .. code-block:: sh
+
+ $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs
+
+ In this location, mount.nfs will be invoked automatically for NFS mounts
+ by the system mount command.
+
+ .. note::
+ mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed
+ on the NFS client machine. You do not need this specific version of
+ nfs-utils on the server. Furthermore, only the mount.nfs command from
+ nfs-utils-1.1.2 is needed on the client.
+
+- Install a Linux kernel with NFS/RDMA
+
+ The NFS/RDMA client and server are both included in the mainline Linux
+ kernel version 2.6.25 and later. This and other versions of the Linux
+ kernel can be found at: https://www.kernel.org/pub/linux/kernel/
+
+ Download the sources and place them in an appropriate location.
+
+- Configure the RDMA stack
+
+ Make sure your kernel configuration has RDMA support enabled. Under
+ Device Drivers -> InfiniBand support, update the kernel configuration
+ to enable InfiniBand support [NOTE: the option name is misleading. Enabling
+ InfiniBand support is required for all RDMA devices (IB, iWARP, etc.)].
+
+ Enable the appropriate IB HCA support (mlx4, mthca, ehca, ipath, etc.) or
+ iWARP adapter support (amso, cxgb3, etc.).
+
+ If you are using InfiniBand, be sure to enable IP-over-InfiniBand support.
+
+- Configure the NFS client and server
+
+ Your kernel configuration must also have NFS file system support and/or
+ NFS server support enabled. These and other NFS related configuration
+ options can be found under File Systems -> Network File Systems.
+
+- Build, install, reboot
+
+ The NFS/RDMA code will be enabled automatically if NFS and RDMA
+ are turned on. The NFS/RDMA client and server are configured via the hidden
+ SUNRPC_XPRT_RDMA config option that depends on SUNRPC and INFINIBAND. The
+ value of SUNRPC_XPRT_RDMA will be:
+
+ #. N if either SUNRPC or INFINIBAND are N, in this case the NFS/RDMA client
+ and server will not be built
+
+ #. M if both SUNRPC and INFINIBAND are on (M or Y) and at least one is M,
+ in this case the NFS/RDMA client and server will be built as modules
+
+ #. Y if both SUNRPC and INFINIBAND are Y, in this case the NFS/RDMA client
+ and server will be built into the kernel
+
+ Therefore, if you have followed the steps above and turned on NFS and RDMA,
+ the NFS/RDMA client and server will be built.
+
+ Build a new kernel, install it, boot it.
+
+Check RDMA and NFS Setup
+========================
+
+Before configuring the NFS/RDMA software, it is a good idea to test
+your new kernel to ensure that the kernel is working correctly.
+In particular, it is a good idea to verify that the RDMA stack
+is functioning as expected and standard NFS over TCP/IP and/or UDP/IP
+is working properly.
+
+- Check RDMA Setup
+
+ If you built the RDMA components as modules, load them at
+ this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel
+ card:
+
+ .. code-block:: sh
+
+ $ modprobe ib_mthca
+ $ modprobe ib_ipoib
+
+ If you are using InfiniBand, make sure there is a Subnet Manager (SM)
+ running on the network. If your IB switch has an embedded SM, you can
+ use it. Otherwise, you will need to run an SM, such as OpenSM, on one
+ of your end nodes.
+
+ If an SM is running on your network, you should see the following:
+
+ .. code-block:: sh
+
+ $ cat /sys/class/infiniband/driverX/ports/1/state
+ 4: ACTIVE
+
+ where driverX is mthca0, ipath5, ehca3, etc.
+
+ To further test the InfiniBand software stack, use IPoIB (this
+ assumes you have two IB hosts named host1 and host2):
+
+ .. code-block:: sh
+
+ host1$ ip link set dev ib0 up
+ host1$ ip address add dev ib0 a.b.c.x
+ host2$ ip link set dev ib0 up
+ host2$ ip address add dev ib0 a.b.c.y
+ host1$ ping a.b.c.y
+ host2$ ping a.b.c.x
+
+ For other device types, follow the appropriate procedures.
+
+- Check NFS Setup
+
+ For the NFS components enabled above (client and/or server),
+ test their functionality over standard Ethernet using TCP/IP or UDP/IP.
+
+NFS/RDMA Setup
+==============
+
+We recommend that you use two machines, one to act as the client and
+one to act as the server.
+
+One time configuration:
+-----------------------
+
+- On the server system, configure the /etc/exports file and start the NFS/RDMA server.
+
+ Exports entries with the following formats have been tested::
+
+ /vol0 192.168.0.47(fsid=0,rw,async,insecure,no_root_squash)
+ /vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash)
+
+ The IP address(es) is(are) the client's IPoIB address for an InfiniBand
+ HCA or the client's iWARP address(es) for an RNIC.
+
+ .. note::
+ The "insecure" option must be used because the NFS/RDMA client does
+ not use a reserved port.
+
+Each time a machine boots:
+--------------------------
+
+- Load and configure the RDMA drivers
+
+ For InfiniBand using a Mellanox adapter:
+
+ .. code-block:: sh
+
+ $ modprobe ib_mthca
+ $ modprobe ib_ipoib
+ $ ip li set dev ib0 up
+ $ ip addr add dev ib0 a.b.c.d
+
+ .. note::
+ Please use unique addresses for the client and server!
+
+- Start the NFS server
+
+ If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
+ kernel config), load the RDMA transport module:
+
+ .. code-block:: sh
+
+ $ modprobe svcrdma
+
+ Regardless of how the server was built (module or built-in), start the
+ server:
+
+ .. code-block:: sh
+
+ $ /etc/init.d/nfs start
+
+ or
+
+ .. code-block:: sh
+
+ $ service nfs start
+
+ Instruct the server to listen on the RDMA transport:
+
+ .. code-block:: sh
+
+ $ echo rdma 20049 > /proc/fs/nfsd/portlist
+
+- On the client system
+
+ If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
+ kernel config), load the RDMA client module:
+
+ .. code-block:: sh
+
+ $ modprobe xprtrdma
+
+ Regardless of how the client was built (module or built-in), use this
+ command to mount the NFS/RDMA server:
+
+ .. code-block:: sh
+
+ $ mount -o rdma,port=20049 <IPoIB-server-name-or-address>:/<export> /mnt
+
+ To verify that the mount is using RDMA, run "cat /proc/mounts" and check
+ the "proto" field for the given mount.
+
+ Congratulations! You're using NFS/RDMA!
diff --git a/Documentation/admin-guide/nfs/nfsd-admin-interfaces.rst b/Documentation/admin-guide/nfs/nfsd-admin-interfaces.rst
new file mode 100644
index 0000000..c05926f
--- /dev/null
+++ b/Documentation/admin-guide/nfs/nfsd-admin-interfaces.rst
@@ -0,0 +1,40 @@
+==================================
+Administrative interfaces for nfsd
+==================================
+
+Note that normally these interfaces are used only by the utilities in
+nfs-utils.
+
+nfsd is controlled mainly by pseudofiles under the "nfsd" filesystem,
+which is normally mounted at /proc/fs/nfsd/.
+
+The server is always started by the first write of a nonzero value to
+nfsd/threads.
+
+Before doing that, NFSD can be told which sockets to listen on by
+writing to nfsd/portlist; that write may be:
+
+ - an ascii-encoded file descriptor, which should refer to a
+ bound (and listening, for tcp) socket, or
+ - "transportname port", where transportname is currently either
+ "udp", "tcp", or "rdma".
+
+If nfsd is started without doing any of these, then it will create one
+udp and one tcp listener at port 2049 (see nfsd_init_socks).
+
+On startup, nfsd and lockd grace periods start. nfsd is shut down by a write of
+0 to nfsd/threads. All locks and state are thrown away at that point.
+
+Between startup and shutdown, the number of threads may be adjusted up
+or down by additional writes to nfsd/threads or by writes to
+nfsd/pool_threads.
+
+For more detail about files under nfsd/ and what they control, see
+fs/nfsd/nfsctl.c; most of them have detailed comments.
+
+Implementation notes
+====================
+
+Note that the rpc server requires the caller to serialize addition and
+removal of listening sockets, and startup and shutdown of the server.
+For nfsd this is done using nfsd_mutex.
diff --git a/Documentation/admin-guide/nfs/nfsroot.rst b/Documentation/admin-guide/nfs/nfsroot.rst
new file mode 100644
index 0000000..82a4fda
--- /dev/null
+++ b/Documentation/admin-guide/nfs/nfsroot.rst
@@ -0,0 +1,364 @@
+===============================================
+Mounting the root filesystem via NFS (nfsroot)
+===============================================
+
+:Authors:
+ Written 1996 by Gero Kuhlmann <gero@gkminix.han.de>
+
+ Updated 1997 by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+
+ Updated 2006 by Nico Schottelius <nico-kernel-nfsroot@schottelius.org>
+
+ Updated 2006 by Horms <horms@verge.net.au>
+
+ Updated 2018 by Chris Novakovic <chris@chrisn.me.uk>
+
+
+
+In order to use a diskless system, such as an X-terminal or printer server for
+example, it is necessary for the root filesystem to be present on a non-disk
+device. This may be an initramfs (see
+Documentation/filesystems/ramfs-rootfs-initramfs.txt), a ramdisk (see
+Documentation/admin-guide/initrd.rst) or a filesystem mounted via NFS. The
+following text describes how to use NFS for the root filesystem. For the rest
+of this text 'client' means the diskless system, and 'server' means the NFS
+server.
+
+
+
+
+Enabling nfsroot capabilities
+=============================
+
+In order to use nfsroot, NFS client support needs to be selected as
+built-in during configuration. Once this has been selected, the nfsroot
+option will become available, which should also be selected.
+
+In the networking options, kernel level autoconfiguration can be selected,
+along with the types of autoconfiguration to support. Selecting all of
+DHCP, BOOTP and RARP is safe.
+
+
+
+
+Kernel command line
+===================
+
+When the kernel has been loaded by a boot loader (see below) it needs to be
+told what root fs device to use. And in the case of nfsroot, where to find
+both the server and the name of the directory on the server to mount as root.
+This can be established using the following kernel command line parameters:
+
+
+root=/dev/nfs
+ This is necessary to enable the pseudo-NFS-device. Note that it's not a
+ real device but just a synonym to tell the kernel to use NFS instead of
+ a real device.
+
+
+nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
+ If the ``nfsroot`` parameter is NOT given on the command line,
+ the default ``"/tftpboot/%s"`` will be used.
+
+ <server-ip> Specifies the IP address of the NFS server.
+ The default address is determined by the ip parameter
+ (see below). This parameter allows the use of different
+ servers for IP autoconfiguration and NFS.
+
+ <root-dir> Name of the directory on the server to mount as root.
+ If there is a "%s" token in the string, it will be
+ replaced by the ASCII-representation of the client's
+ IP address.
+
+ <nfs-options> Standard NFS options. All options are separated by commas.
+ The following defaults are used::
+
+ port = as given by server portmap daemon
+ rsize = 4096
+ wsize = 4096
+ timeo = 7
+ retrans = 3
+ acregmin = 3
+ acregmax = 60
+ acdirmin = 30
+ acdirmax = 60
+ flags = hard, nointr, noposix, cto, ac
+
+
+ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:<dns0-ip>:<dns1-ip>:<ntp0-ip>
+ This parameter tells the kernel how to configure IP addresses of devices
+ and also how to set up the IP routing table. It was originally called
+ nfsaddrs, but now the boot-time IP configuration works independently of
+ NFS, so it was renamed to ip and the old name remained as an alias for
+ compatibility reasons.
+
+ If this parameter is missing from the kernel command line, all fields are
+ assumed to be empty, and the defaults mentioned below apply. In general
+ this means that the kernel tries to configure everything using
+ autoconfiguration.
+
+ The <autoconf> parameter can appear alone as the value to the ip
+ parameter (without all the ':' characters before). If the value is
+ "ip=off" or "ip=none", no autoconfiguration will take place, otherwise
+ autoconfiguration will take place. The most common way to use this
+ is "ip=dhcp".
+
+ <client-ip> IP address of the client.
+ Default: Determined using autoconfiguration.
+
+ <server-ip> IP address of the NFS server.
+ If RARP is used to determine
+ the client address and this parameter is NOT empty only
+ replies from the specified server are accepted.
+
+ Only required for NFS root. That is, autoconfiguration
+ will not be triggered if it is missing and NFS root is not
+ in operation.
+
+ Value is exported to /proc/net/pnp with the prefix "bootserver "
+ (see below).
+
+ Default: Determined using autoconfiguration.
+ The address of the autoconfiguration server is used.
+
+ <gw-ip> IP address of a gateway if the server is on a different subnet.
+ Default: Determined using autoconfiguration.
+
+ <netmask> Netmask for local network interface.
+ If unspecified the netmask is derived from the client IP address
+ assuming classful addressing.
+
+ Default: Determined using autoconfiguration.
+
+ <hostname> Name of the client.
+ If a '.' character is present, anything
+ before the first '.' is used as the client's hostname, and anything
+ after it is used as its NIS domain name. May be supplied by
+ autoconfiguration, but its absence will not trigger autoconfiguration.
+ If specified and DHCP is used, the user-provided hostname (and NIS
+ domain name, if present) will be carried in the DHCP request; this
+ may cause a DNS record to be created or updated for the client.
+
+ Default: Client IP address is used in ASCII notation.
+
+ <device> Name of network device to use.
+ Default: If the host only has one device, it is used.
+ Otherwise the device is determined using
+ autoconfiguration. This is done by sending
+ autoconfiguration requests out of all devices,
+ and using the device that received the first reply.
+
+ <autoconf> Method to use for autoconfiguration.
+ In the case of options
+ which specify multiple autoconfiguration protocols,
+ requests are sent using all protocols, and the first one
+ to reply is used.
+
+ Only autoconfiguration protocols that have been compiled
+ into the kernel will be used, regardless of the value of
+ this option::
+
+ off or none: don't use autoconfiguration
+ (do static IP assignment instead)
+ on or any: use any protocol available in the kernel
+ (default)
+ dhcp: use DHCP
+ bootp: use BOOTP
+ rarp: use RARP
+ both: use both BOOTP and RARP but not DHCP
+ (old option kept for backwards compatibility)
+
+ If dhcp is used, the client identifier can be specified using the
+ following format: "ip=dhcp,client-id-type,client-id-value"
+
+ Default: any
+
+ <dns0-ip> IP address of primary nameserver.
+ Value is exported to /proc/net/pnp with the prefix "nameserver "
+ (see below).
+
+ Default: None if not using autoconfiguration; determined
+ automatically if using autoconfiguration.
+
+ <dns1-ip> IP address of secondary nameserver.
+ See <dns0-ip>.
+
+ <ntp0-ip> IP address of a Network Time Protocol (NTP) server.
+ Value is exported to /proc/net/ipconfig/ntp_servers, but is
+ otherwise unused (see below).
+
+ Default: None if not using autoconfiguration; determined
+ automatically if using autoconfiguration.
+
+ After configuration (whether manual or automatic) is complete, two files
+ are created in the following format; lines are omitted if their respective
+ value is empty following configuration:
+
+ - /proc/net/pnp:
+
+ #PROTO: <DHCP|BOOTP|RARP|MANUAL> (depending on configuration method)
+ domain <dns-domain> (if autoconfigured, the DNS domain)
+ nameserver <dns0-ip> (primary name server IP)
+ nameserver <dns1-ip> (secondary name server IP)
+ nameserver <dns2-ip> (tertiary name server IP)
+ bootserver <server-ip> (NFS server IP)
+
+ - /proc/net/ipconfig/ntp_servers:
+
+ <ntp0-ip> (NTP server IP)
+ <ntp1-ip> (NTP server IP)
+ <ntp2-ip> (NTP server IP)
+
+ <dns-domain> and <dns2-ip> (in /proc/net/pnp) and <ntp1-ip> and <ntp2-ip>
+ (in /proc/net/ipconfig/ntp_servers) are requested during autoconfiguration;
+ they cannot be specified as part of the "ip=" kernel command line parameter.
+
+ Because the "domain" and "nameserver" options are recognised by DNS
+ resolvers, /etc/resolv.conf is often linked to /proc/net/pnp on systems
+ that use an NFS root filesystem.
+
+ Note that the kernel will not synchronise the system time with any NTP
+ servers it discovers; this is the responsibility of a user space process
+ (e.g. an initrd/initramfs script that passes the IP addresses listed in
+ /proc/net/ipconfig/ntp_servers to an NTP client before mounting the real
+ root filesystem if it is on NFS).
+
+
+nfsrootdebug
+ This parameter enables debugging messages to appear in the kernel
+ log at boot time so that administrators can verify that the correct
+ NFS mount options, server address, and root path are passed to the
+ NFS client.
+
+
+rdinit=<executable file>
+ To specify which file contains the program that starts system
+ initialization, administrators can use this command line parameter.
+ The default value of this parameter is "/init". If the specified
+ file exists and the kernel can execute it, root filesystem related
+ kernel command line parameters, including 'nfsroot=', are ignored.
+
+ A description of the process of mounting the root file system can be
+ found in Documentation/driver-api/early-userspace/early_userspace_support.rst
+
+
+Boot Loader
+===========
+
+To get the kernel into memory different approaches can be used.
+They depend on various facilities being available:
+
+
+- Booting from a floppy using syslinux
+
+ When building kernels, an easy way to create a boot floppy that uses
+ syslinux is to use the zdisk or bzdisk make targets which use zimage
+ and bzimage images respectively. Both targets accept the
+ FDARGS parameter which can be used to set the kernel command line.
+
+ e.g::
+
+ make bzdisk FDARGS="root=/dev/nfs"
+
+ Note that the user running this command will need to have
+ access to the floppy drive device, /dev/fd0
+
+ For more information on syslinux, including how to create bootdisks
+ for prebuilt kernels, see http://syslinux.zytor.com/
+
+ .. note::
+ Previously it was possible to write a kernel directly to
+ a floppy using dd, configure the boot device using rdev, and
+ boot using the resulting floppy. Linux no longer supports this
+ method of booting.
+
+- Booting from a cdrom using isolinux
+
+ When building kernels, an easy way to create a bootable cdrom that
+ uses isolinux is to use the isoimage target which uses a bzimage
+ image. Like zdisk and bzdisk, this target accepts the FDARGS
+ parameter which can be used to set the kernel command line.
+
+ e.g::
+
+ make isoimage FDARGS="root=/dev/nfs"
+
+ The resulting iso image will be arch/<ARCH>/boot/image.iso
+ This can be written to a cdrom using a variety of tools including
+ cdrecord.
+
+ e.g::
+
+ cdrecord dev=ATAPI:1,0,0 arch/x86/boot/image.iso
+
+ For more information on isolinux, including how to create bootdisks
+ for prebuilt kernels, see http://syslinux.zytor.com/
+
+- Using LILO
+
+ When using LILO all the necessary command line parameters may be
+ specified using the 'append=' directive in the LILO configuration
+ file.
+
+ However, to use the 'root=' directive you also need to create
+ a dummy root device, which may be removed after LILO is run.
+
+ e.g::
+
+ mknod /dev/boot255 c 0 255
+
+ For information on configuring LILO, please refer to its documentation.
+
+- Using GRUB
+
+ When using GRUB, kernel parameters are simply appended after the kernel
+ specification: kernel <kernel> <parameters>
+
+- Using loadlin
+
+ loadlin may be used to boot Linux from a DOS command prompt without
+ requiring a local hard disk to mount as root. This has not been
+ thoroughly tested by the authors of this document, but in general
+ it should be possible to configure the kernel command line similarly
+ to the configuration of LILO.
+
+ Please refer to the loadlin documentation for further information.
+
+- Using a boot ROM
+
+ This is probably the most elegant way of booting a diskless client.
+ With a boot ROM the kernel is loaded using the TFTP protocol. The
+ authors of this document are not aware of any commercial boot
+ ROMs that support booting Linux over the network. However, there
+ are two free implementations of a boot ROM, netboot-nfs and
+ etherboot, both of which are available on sunsite.unc.edu, and both
+ of which contain everything you need to boot a diskless Linux client.
+
+- Using pxelinux
+
+ Pxelinux may be used to boot linux using the PXE boot loader
+ which is present on many modern network cards.
+
+ When using pxelinux, the kernel image is specified using
+ "kernel <relative-path-below /tftpboot>". The nfsroot parameters
+ are passed to the kernel by adding them to the "append" line.
+ It is common to use serial console in conjunction with pxelinux,
+ see Documentation/admin-guide/serial-console.rst for more information.
+
+ For more information on pxelinux, including how to create bootdisks
+ for prebuilt kernels, see http://syslinux.zytor.com/
+
+
+
+
+Credits
+=======
+
+ The nfsroot code in the kernel and the RARP support have been written
+ by Gero Kuhlmann <gero@gkminix.han.de>.
+
+ The rest of the IP layer autoconfiguration code has been written
+ by Martin Mares <mj@atrey.karlin.mff.cuni.cz>.
+
+ In order to write the initial version of nfsroot I would like to thank
+ Jens-Uwe Mager <jum@anubis.han.de> for his help.
diff --git a/Documentation/admin-guide/nfs/pnfs-block-server.rst b/Documentation/admin-guide/nfs/pnfs-block-server.rst
new file mode 100644
index 0000000..b00a2e7
--- /dev/null
+++ b/Documentation/admin-guide/nfs/pnfs-block-server.rst
@@ -0,0 +1,42 @@
+===================================
+pNFS block layout server user guide
+===================================
+
+The Linux NFS server now supports the pNFS block layout extension. In this
+case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition
+to handling all the metadata access to the NFS export also hands out layouts
+to the clients to directly access the underlying block devices that are
+shared with the client.
+
+To use pNFS block layouts with the Linux NFS server the exported file
+system needs to support the pNFS block layouts (currently just XFS), and the
+file system must sit on shared storage (typically iSCSI) that is accessible
+to the clients in addition to the MDS. As of now the file system needs to
+sit directly on the exported volume, striping or concatenation of
+volumes on the MDS and clients is not supported yet.
+
+On the server, pNFS block volume support is automatically enabled if the file
+system supports it. On the client make sure the kernel has the CONFIG_PNFS_BLOCK
+option enabled, the blkmapd daemon from nfs-utils is running, and the
+file system is mounted using the NFSv4.1 protocol version (mount -o vers=4.1).
+
+If the nfsd server needs to fence a non-responding client it calls
+/sbin/nfsd-recall-failed with the first argument set to the IP address of
+the client, and the second argument set to the device node without the /dev
+prefix for the file system to be fenced. Below is an example file that shows
+how to translate the device into a serial number from SCSI EVPD 0x80::
+
+ cat > /sbin/nfsd-recall-failed << EOF
+
+.. code-block:: sh
+
+ #!/bin/sh
+
+ CLIENT="$1"
+ DEV="/dev/$2"
+ EVPD=`sg_inq --page=0x80 ${DEV} | \
+ grep "Unit serial number:" | \
+ awk -F ': ' '{print $2}'`
+
+ echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
+ EOF
diff --git a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst
new file mode 100644
index 0000000..d2f6ee5
--- /dev/null
+++ b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst
@@ -0,0 +1,24 @@
+
+==================================
+pNFS SCSI layout server user guide
+==================================
+
+This document describes support for pNFS SCSI layouts in the Linux NFS server.
+With pNFS SCSI layouts, the NFS server acts as Metadata Server (MDS) for pNFS,
+which in addition to handling all the metadata access to the NFS export,
+also hands out layouts to the clients so that they can directly access the
+underlying SCSI LUNs that are shared with the client.
+
+To use pNFS SCSI layouts with the Linux NFS server, the exported file
+system needs to support the pNFS SCSI layouts (currently just XFS), and the
+file system must sit on a SCSI LUN that is accessible to the clients in
+addition to the MDS. As of now the file system needs to sit directly on the
+exported LUN, striping or concatenation of LUNs on the MDS and clients
+is not supported yet.
+
+On a server built with CONFIG_NFSD_SCSI, the pNFS SCSI volume support is
+automatically enabled if the file system is exported using the "pnfs"
+option and the underlying SCSI device supports persistent reservations.
+On the client make sure the kernel has the CONFIG_PNFS_BLOCK option
+enabled, and the file system is mounted using the NFSv4.1 protocol
+version (mount -o vers=4.1).
diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst
index e70b365..6a06dc4 100644
--- a/Documentation/admin-guide/pm/cpuidle.rst
+++ b/Documentation/admin-guide/pm/cpuidle.rst
@@ -506,6 +506,9 @@
``disable``
Whether or not this idle state is disabled.
+``default_status``
+ The default status of this state, "enabled" or "disabled".
+
``latency``
Exit latency of the idle state in microseconds.
@@ -629,16 +632,16 @@
will be used, again, to determine the new effective value for the whole list
and that value will become the new real constraint.
-In turn, for each CPU there is only one resume latency PM QoS request
-associated with the :file:`power/pm_qos_resume_latency_us` file under
+In turn, for each CPU there is one resume latency PM QoS request associated with
+the :file:`power/pm_qos_resume_latency_us` file under
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs`` and writing to it causes
this single PM QoS request to be updated regardless of which user space
process does that. In other words, this PM QoS request is shared by the entire
user space, so access to the file associated with it needs to be arbitrated
to avoid confusion. [Arguably, the only legitimate use of this mechanism in
practice is to pin a process to the CPU in question and let it use the
-``sysfs`` interface to control the resume latency constraint for it.] It
-still only is a request, however. It is a member of a priority list used to
+``sysfs`` interface to control the resume latency constraint for it.] It is
+still only a request, however. It is an entry in a priority list used to
determine the effective value to be set as the resume latency constraint for the
CPU in question every time the list of requests is updated this way or another
(there may be other requests coming from kernel code in that list).
diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst
new file mode 100644
index 0000000..89309e1
--- /dev/null
+++ b/Documentation/admin-guide/pm/intel_idle.rst
@@ -0,0 +1,268 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+==============================================
+``intel_idle`` CPU Idle Time Management Driver
+==============================================
+
+:Copyright: |copy| 2020 Intel Corporation
+
+:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+
+General Information
+===================
+
+``intel_idle`` is a part of the
+:doc:`CPU idle time management subsystem <cpuidle>` in the Linux kernel
+(``CPUIdle``). It is the default CPU idle time management driver for the
+Nehalem and later generations of Intel processors, but the level of support for
+a particular processor model in it depends on whether or not it recognizes that
+processor model and may also depend on information coming from the platform
+firmware. [To understand ``intel_idle`` it is necessary to know how ``CPUIdle``
+works in general, so this is the time to get familiar with :doc:`cpuidle` if you
+have not done that yet.]
+
+``intel_idle`` uses the ``MWAIT`` instruction to inform the processor that the
+logical CPU executing it is idle and so it may be possible to put some of the
+processor's functional blocks into low-power states. That instruction takes two
+arguments (passed in the ``EAX`` and ``ECX`` registers of the target CPU), the
+first of which, referred to as a *hint*, can be used by the processor to
+determine what can be done (for details refer to Intel Software Developer’s
+Manual [1]_). Accordingly, ``intel_idle`` refuses to work with processors in
+which the support for the ``MWAIT`` instruction has been disabled (for example,
+via the platform firmware configuration menu) or which do not support that
+instruction at all.
+
+``intel_idle`` is not modular, so it cannot be unloaded, which means that the
+only way to pass early-configuration-time parameters to it is via the kernel
+command line.
+
+
+.. _intel-idle-enumeration-of-states:
+
+Enumeration of Idle States
+==========================
+
+Each ``MWAIT`` hint value is interpreted by the processor as a license to
+reconfigure itself in a certain way in order to save energy. The processor
+configurations (with reduced power draw) resulting from that are referred to
+as C-states (in the ACPI terminology) or idle states. The list of meaningful
+``MWAIT`` hint values and idle states (i.e. low-power configurations of the
+processor) corresponding to them depends on the processor model and it may also
+depend on the configuration of the platform.
+
+In order to create a list of available idle states required by the ``CPUIdle``
+subsystem (see :ref:`idle-states-representation` in :doc:`cpuidle`),
+``intel_idle`` can use two sources of information: static tables of idle states
+for different processor models included in the driver itself and the ACPI tables
+of the system. The former are always used if the processor model at hand is
+recognized by ``intel_idle`` and the latter are used if that is required for
+the given processor model (which is the case for all server processor models
+recognized by ``intel_idle``) or if the processor model is not recognized.
+[There is a module parameter that can be used to make the driver use the ACPI
+tables with any processor model recognized by it; see
+`below <intel-idle-parameters_>`_.]
+
+If the ACPI tables are going to be used for building the list of available idle
+states, ``intel_idle`` first looks for a ``_CST`` object under one of the ACPI
+objects corresponding to the CPUs in the system (refer to the ACPI specification
+[2]_ for the description of ``_CST`` and its output package). Because the
+``CPUIdle`` subsystem expects that the list of idle states supplied by the
+driver will be suitable for all of the CPUs handled by it and ``intel_idle`` is
+registered as the ``CPUIdle`` driver for all of the CPUs in the system, the
+driver looks for the first ``_CST`` object returning at least one valid idle
+state description and such that all of the idle states included in its return
+package are of the FFH (Functional Fixed Hardware) type, which means that the
+``MWAIT`` instruction is expected to be used to tell the processor that it can
+enter one of them. The return package of that ``_CST`` is then assumed to be
+applicable to all of the other CPUs in the system and the idle state
+descriptions extracted from it are stored in a preliminary list of idle states
+coming from the ACPI tables. [This step is skipped if ``intel_idle`` is
+configured to ignore the ACPI tables; see `below <intel-idle-parameters_>`_.]
+
+Next, the first (index 0) entry in the list of available idle states is
+initialized to represent a "polling idle state" (a pseudo-idle state in which
+the target CPU continuously fetches and executes instructions), and the
+subsequent (real) idle state entries are populated as follows.
+
+If the processor model at hand is recognized by ``intel_idle``, there is a
+(static) table of idle state descriptions for it in the driver. In that case,
+the "internal" table is the primary source of information on idle states and the
+information from it is copied to the final list of available idle states. If
+using the ACPI tables for the enumeration of idle states is not required
+(depending on the processor model), all of the listed idle states are enabled by
+default (so all of them will be taken into consideration by ``CPUIdle``
+governors during CPU idle state selection). Otherwise, some of the listed idle
+states may not be enabled by default if there are no matching entries in the
+preliminary list of idle states coming from the ACPI tables. In that case user
+space still can enable them later (on a per-CPU basis) with the help of
+the ``disable`` idle state attribute in ``sysfs`` (see
+:ref:`idle-states-representation` in :doc:`cpuidle`). This basically means that
+the idle states "known" to the driver may not be enabled by default if they have
+not been exposed by the platform firmware (through the ACPI tables).
+
+If the given processor model is not recognized by ``intel_idle``, but it
+supports ``MWAIT``, the preliminary list of idle states coming from the ACPI
+tables is used for building the final list that will be supplied to the
+``CPUIdle`` core during driver registration. For each idle state in that list,
+the description, ``MWAIT`` hint and exit latency are copied to the corresponding
+entry in the final list of idle states. The name of the idle state represented
+by it (to be returned by the ``name`` idle state attribute in ``sysfs``) is
+"CX_ACPI", where X is the index of that idle state in the final list (note that
+the minimum value of X is 1, because 0 is reserved for the "polling" state), and
+its target residency is based on the exit latency value. Specifically, for
+C1-type idle states the exit latency value is also used as the target residency
+(for compatibility with the majority of the "internal" tables of idle states for
+various processor models recognized by ``intel_idle``) and for the other idle
+state types (C2 and C3) the target residency value is 3 times the exit latency
+(again, that is because it reflects the target residency to exit latency ratio
+in the majority of cases for the processor models recognized by ``intel_idle``).
+All of the idle states in the final list are enabled by default in this case.
+
+
+.. _intel-idle-initialization:
+
+Initialization
+==============
+
+The initialization of ``intel_idle`` starts with checking if the kernel command
+line options forbid the use of the ``MWAIT`` instruction. If that is the case,
+an error code is returned right away.
+
+The next step is to check whether or not the processor model is known to the
+driver, which determines the idle states enumeration method (see
+`above <intel-idle-enumeration-of-states_>`_), and whether or not the processor
+supports ``MWAIT`` (the initialization fails if that is not the case). Then,
+the ``MWAIT`` support in the processor is enumerated through ``CPUID`` and the
+driver initialization fails if the level of support is not as expected (for
+example, if the total number of ``MWAIT`` substates returned is 0).
+
+Next, if the driver is not configured to ignore the ACPI tables (see
+`below <intel-idle-parameters_>`_), the idle states information provided by the
+platform firmware is extracted from them.
+
+Then, ``CPUIdle`` device objects are allocated for all CPUs and the list of
+available idle states is created as explained
+`above <intel-idle-enumeration-of-states_>`_.
+
+Finally, ``intel_idle`` is registered with the help of cpuidle_register_driver()
+as the ``CPUIdle`` driver for all CPUs in the system and a CPU online callback
+for configuring individual CPUs is registered via cpuhp_setup_state(), which
+(among other things) causes the callback routine to be invoked for all of the
+CPUs present in the system at that time (each CPU executes its own instance of
+the callback routine). That routine registers a ``CPUIdle`` device for the CPU
+running it (which enables the ``CPUIdle`` subsystem to operate that CPU) and
+optionally performs some CPU-specific initialization actions that may be
+required for the given processor model.
+
+
+.. _intel-idle-parameters:
+
+Kernel Command Line Options and Module Parameters
+=================================================
+
+The *x86* architecture support code recognizes three kernel command line
+options related to CPU idle time management: ``idle=poll``, ``idle=halt``,
+and ``idle=nomwait``. If any of them is present in the kernel command line, the
+``MWAIT`` instruction is not allowed to be used, so the initialization of
+``intel_idle`` will fail.
+
+Apart from that there are four module parameters recognized by ``intel_idle``
+itself that can be set via the kernel command line (they cannot be updated via
+sysfs, so that is the only way to change their values).
+
+The ``max_cstate`` parameter value is the maximum idle state index in the list
+of idle states supplied to the ``CPUIdle`` core during the registration of the
+driver. It is also the maximum number of regular (non-polling) idle states that
+can be used by ``intel_idle``, so the enumeration of idle states is terminated
+after finding that number of usable idle states (the other idle states that
+potentially might have been used if ``max_cstate`` had been greater are not
+taken into consideration at all). Setting ``max_cstate`` can prevent
+``intel_idle`` from exposing idle states that are regarded as "too deep" for
+some reason to the ``CPUIdle`` core, but it does so by making them effectively
+invisible until the system is shut down and started again which may not always
+be desirable. In practice, it is only really necessary to do that if the idle
+states in question cannot be enabled during system startup, because in the
+working state of the system the CPU power management quality of service (PM
+QoS) feature can be used to prevent ``CPUIdle`` from touching those idle states
+even if they have been enumerated (see :ref:`cpu-pm-qos` in :doc:`cpuidle`).
+Setting ``max_cstate`` to 0 causes the ``intel_idle`` initialization to fail.
+
+The ``no_acpi`` and ``use_acpi`` module parameters (recognized by ``intel_idle``
+if the kernel has been configured with ACPI support) can be set to make the
+driver ignore the system's ACPI tables entirely or use them for all of the
+recognized processor models, respectively (they both are unset by default and
+``use_acpi`` has no effect if ``no_acpi`` is set).
+
+The value of the ``states_off`` module parameter (0 by default) represents a
+list of idle states to be disabled by default in the form of a bitmask.
+
+Namely, the positions of the bits that are set in the ``states_off`` value are
+the indices of idle states to be disabled by default (as reflected by the names
+of the corresponding idle state directories in ``sysfs``, :file:`state0`,
+:file:`state1` ... :file:`state<i>` ..., where ``<i>`` is the index of the given
+idle state; see :ref:`idle-states-representation` in :doc:`cpuidle`).
+
+For example, if ``states_off`` is equal to 3, the driver will disable idle
+states 0 and 1 by default, and if it is equal to 8, idle state 3 will be
+disabled by default and so on (bit positions beyond the maximum idle state index
+are ignored).
+
+The idle states disabled this way can be enabled (on a per-CPU basis) from user
+space via ``sysfs``.
+
+
+.. _intel-idle-core-and-package-idle-states:
+
+Core and Package Levels of Idle States
+======================================
+
+Typically, in a processor supporting the ``MWAIT`` instruction there are (at
+least) two levels of idle states (or C-states). One level, referred to as
+"core C-states", covers individual cores in the processor, whereas the other
+level, referred to as "package C-states", covers the entire processor package
+and it may also involve other components of the system (GPUs, memory
+controllers, I/O hubs etc.).
+
+Some of the ``MWAIT`` hint values allow the processor to use core C-states only
+(most importantly, that is the case for the ``MWAIT`` hint value corresponding
+to the ``C1`` idle state), but the majority of them give it a license to put
+the target core (i.e. the core containing the logical CPU executing ``MWAIT``
+with the given hint value) into a specific core C-state and then (if possible)
+to enter a specific package C-state at the deeper level. For example, the
+``MWAIT`` hint value representing the ``C3`` idle state allows the processor to
+put the target core into the low-power state referred to as "core ``C3``" (or
+``CC3``), which happens if all of the logical CPUs (SMT siblings) in that core
+have executed ``MWAIT`` with the ``C3`` hint value (or with a hint value
+representing a deeper idle state), and in addition to that (in the majority of
+cases) it gives the processor a license to put the entire package (possibly
+including some non-CPU components such as a GPU or a memory controller) into the
+low-power state referred to as "package ``C3``" (or ``PC3``), which happens if
+all of the cores have gone into the ``CC3`` state and (possibly) some additional
+conditions are satisfied (for instance, if the GPU is covered by ``PC3``, it may
+be required to be in a certain GPU-specific low-power state for ``PC3`` to be
+reachable).
+
+As a rule, there is no simple way to make the processor use core C-states only
+if the conditions for entering the corresponding package C-states are met, so
+the logical CPU executing ``MWAIT`` with a hint value that is not core-level
+only (like for ``C1``) must always assume that this may cause the processor to
+enter a package C-state. [That is why the exit latency and target residency
+values corresponding to the majority of ``MWAIT`` hint values in the "internal"
+tables of idle states in ``intel_idle`` reflect the properties of package
+C-states.] If using package C-states is not desirable at all, either
+:ref:`PM QoS <cpu-pm-qos>` or the ``max_cstate`` module parameter of
+``intel_idle`` described `above <intel-idle-parameters_>`_ must be used to
+restrict the range of permissible idle states to the ones with core-level only
+``MWAIT`` hint values (like ``C1``).
+
+
+References
+==========
+
+.. [1] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 2B*,
+ https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-2b-manual.html
+
+.. [2] *Advanced Configuration and Power Interface (ACPI) Specification*,
+ https://uefi.org/specifications
diff --git a/Documentation/admin-guide/pm/sleep-states.rst b/Documentation/admin-guide/pm/sleep-states.rst
index cd3a28c..ee55a46 100644
--- a/Documentation/admin-guide/pm/sleep-states.rst
+++ b/Documentation/admin-guide/pm/sleep-states.rst
@@ -153,8 +153,11 @@
Basic ``sysfs`` Interfaces for System Suspend and Hibernation
=============================================================
-The following files located in the :file:`/sys/power/` directory can be used by
-user space for sleep states control.
+The power management subsystem provides userspace with a unified ``sysfs``
+interface for system sleep regardless of the underlying system architecture or
+platform. That interface is located in the :file:`/sys/power/` directory
+(assuming that ``sysfs`` is mounted at :file:`/sys`) and it consists of the
+following attributes (files):
``state``
This file contains a list of strings representing sleep states supported
@@ -162,9 +165,9 @@
to start a transition of the system into the sleep state represented by
that string.
- In particular, the strings "disk", "freeze" and "standby" represent the
+ In particular, the "disk", "freeze" and "standby" strings represent the
:ref:`hibernation <hibernation>`, :ref:`suspend-to-idle <s2idle>` and
- :ref:`standby <standby>` sleep states, respectively. The string "mem"
+ :ref:`standby <standby>` sleep states, respectively. The "mem" string
is interpreted in accordance with the contents of the ``mem_sleep`` file
described below.
@@ -177,7 +180,7 @@
associated with the "mem" string in the ``state`` file described above.
The strings that may be present in this file are "s2idle", "shallow"
- and "deep". The string "s2idle" always represents :ref:`suspend-to-idle
+ and "deep". The "s2idle" string always represents :ref:`suspend-to-idle
<s2idle>` and, by convention, "shallow" and "deep" represent
:ref:`standby <standby>` and :ref:`suspend-to-RAM <s2ram>`,
respectively.
@@ -185,15 +188,17 @@
Writing one of the listed strings into this file causes the system
suspend variant represented by it to be associated with the "mem" string
in the ``state`` file. The string representing the suspend variant
- currently associated with the "mem" string in the ``state`` file
- is listed in square brackets.
+ currently associated with the "mem" string in the ``state`` file is
+ shown in square brackets.
If the kernel does not support system suspend, this file is not present.
``disk``
- This file contains a list of strings representing different operations
- that can be carried out after the hibernation image has been saved. The
- possible options are as follows:
+ This file controls the operating mode of hibernation (Suspend-to-Disk).
+ Specifically, it tells the kernel what to do after creating a
+ hibernation image.
+
+ Reading from it returns a list of supported options encoded as:
``platform``
Put the system into a special low-power state (e.g. ACPI S4) to
@@ -201,6 +206,11 @@
platform firmware to take a simplified initialization path after
wakeup.
+ It is only available if the platform provides a special
+ mechanism to put the system to sleep after creating a
+ hibernation image (platforms with ACPI do that as a rule, for
+ example).
+
``shutdown``
Power off the system.
@@ -214,22 +224,53 @@
the hibernation image and continue. Otherwise, use the image
to restore the previous state of the system.
+ It is available if system suspend is supported.
+
``test_resume``
Diagnostic operation. Load the image as though the system had
just woken up from hibernation and the currently running kernel
instance was a restore kernel and follow up with full system
resume.
- Writing one of the listed strings into this file causes the option
+ Writing one of the strings listed above into this file causes the option
represented by it to be selected.
- The currently selected option is shown in square brackets which means
+ The currently selected option is shown in square brackets, which means
that the operation represented by it will be carried out after creating
- and saving the image next time hibernation is triggered by writing
- ``disk`` to :file:`/sys/power/state`.
+ and saving the image when hibernation is triggered by writing ``disk``
+ to :file:`/sys/power/state`.
If the kernel does not support hibernation, this file is not present.
+``image_size``
+ This file controls the size of hibernation images.
+
+ It accepts a string representing a non-negative integer that will
+ be used as a best-effort upper limit of the image size, in bytes. The
+ hibernation core will do its best to ensure that the image size will not
+ exceed that number, but if that turns out to be impossible to achieve, a
+ hibernation image will still be created and its size will be as small as
+ possible. In particular, writing '0' to this file causes the size of
+ hibernation images to be minimum.
+
+ Reading from it returns the current image size limit, which is set to
+ around 2/5 of the available RAM size by default.
+
+``pm_trace``
+ This file controls the "PM trace" mechanism saving the last suspend
+ or resume event point in the RTC memory across reboots. It helps to
+ debug hard lockups or reboots due to device driver failures that occur
+ during system suspend or resume (which is more common) more effectively.
+
+ If it contains "1", the fingerprint of each suspend/resume event point
+ in turn will be stored in the RTC memory (overwriting the actual RTC
+ information), so it will survive a system crash if one occurs right
+ after storing it and it can be used later to identify the driver that
+ caused the crash to happen.
+
+ It contains "0" by default, which may be changed to "1" by writing a
+ string representing a nonzero integer into it.
+
According to the above, there are two ways to make the system go into the
:ref:`suspend-to-idle <s2idle>` state. The first one is to write "freeze"
directly to :file:`/sys/power/state`. The second one is to write "s2idle" to
@@ -244,6 +285,7 @@
The default suspend variant (ie. the one to be used without writing anything
into :file:`/sys/power/mem_sleep`) is either "deep" (on the majority of systems
supporting :ref:`suspend-to-RAM <s2ram>`) or "s2idle", but it can be overridden
-by the value of the "mem_sleep_default" parameter in the kernel command line.
-On some ACPI-based systems, depending on the information in the ACPI tables, the
-default may be "s2idle" even if :ref:`suspend-to-RAM <s2ram>` is supported.
+by the value of the ``mem_sleep_default`` parameter in the kernel command line.
+On some systems with ACPI, depending on the information in the ACPI tables, the
+default may be "s2idle" even if :ref:`suspend-to-RAM <s2ram>` is supported in
+principle.
diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst
index fc298eb..88f717e 100644
--- a/Documentation/admin-guide/pm/working-state.rst
+++ b/Documentation/admin-guide/pm/working-state.rst
@@ -8,6 +8,7 @@
:maxdepth: 2
cpuidle
+ intel_idle
cpufreq
intel_pstate
intel_epb
diff --git a/Documentation/admin-guide/thunderbolt.rst b/Documentation/admin-guide/thunderbolt.rst
index 898ad78..10c4f0c 100644
--- a/Documentation/admin-guide/thunderbolt.rst
+++ b/Documentation/admin-guide/thunderbolt.rst
@@ -1,6 +1,28 @@
-=============
- Thunderbolt
-=============
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+ USB4 and Thunderbolt
+======================
+USB4 is the public specification based on Thunderbolt 3 protocol with
+some differences at the register level among other things. Connection
+manager is an entity running on the host router (host controller)
+responsible for enumerating routers and establishing tunnels. A
+connection manager can be implemented either in firmware or software.
+Typically PCs come with a firmware connection manager for Thunderbolt 3
+and early USB4 capable systems. Apple systems on the other hand use
+software connection manager and the later USB4 compliant devices follow
+suit.
+
+The Linux Thunderbolt driver supports both and can detect at runtime which
+connection manager implementation is to be used. To be on the safe side the
+software connection manager in Linux also advertises security level
+``user`` which means PCIe tunneling is disabled by default. The
+documentation below applies to both implementations with the exception that
+the software connection manager only supports ``user`` security level and
+is expected to be accompanied by IOMMU-based DMA protection.
+
+Security levels and how to use them
+-----------------------------------
The interface presented here is not meant for end users. Instead there
should be a userspace tool that handles all the low-level details, keeps
a database of the authorized devices and prompts users for new connections.
@@ -18,8 +40,6 @@
keep in mind that this bypasses the security levels and makes the system
vulnerable to DMA attacks.
-Security levels and how to use them
------------------------------------
Starting with Intel Falcon Ridge Thunderbolt controller there are 4
security levels available. Intel Titan Ridge added one more security level
(usbonly). The reason for these is the fact that the connected devices can
diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst
index fb5b39f7..ad911be 100644
--- a/Documentation/admin-guide/xfs.rst
+++ b/Documentation/admin-guide/xfs.rst
@@ -253,7 +253,7 @@
pool.
fs.xfs.speculative_prealloc_lifetime
- (Units: seconds Min: 1 Default: 300 Max: 86400)
+ (Units: seconds Min: 1 Default: 300 Max: 86400)
The interval at which the background scanning for inodes
with unused speculative preallocation runs. The scan
removes unused preallocation from clean inodes and releases
diff --git a/Documentation/arm/microchip.rst b/Documentation/arm/microchip.rst
index 1adf53d..05e5f2d 100644
--- a/Documentation/arm/microchip.rst
+++ b/Documentation/arm/microchip.rst
@@ -92,6 +92,12 @@
http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001517A.pdf
+ - sam9x60
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/SAM9X60-Data-Sheet-DS60001579A.pdf
+
* ARM Cortex-A5 based SoCs
- sama5d3 family
diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst
index b6e4488..41937a8 100644
--- a/Documentation/arm64/cpu-feature-registers.rst
+++ b/Documentation/arm64/cpu-feature-registers.rst
@@ -117,6 +117,8 @@
+------------------------------+---------+---------+
| Name | bits | visible |
+------------------------------+---------+---------+
+ | RNDR | [63-60] | y |
+ +------------------------------+---------+---------+
| TS | [55-52] | y |
+------------------------------+---------+---------+
| FHM | [51-48] | y |
@@ -200,6 +202,12 @@
+------------------------------+---------+---------+
| Name | bits | visible |
+------------------------------+---------+---------+
+ | I8MM | [55-52] | y |
+ +------------------------------+---------+---------+
+ | DGH | [51-48] | y |
+ +------------------------------+---------+---------+
+ | BF16 | [47-44] | y |
+ +------------------------------+---------+---------+
| SB | [39-36] | y |
+------------------------------+---------+---------+
| FRINTTS | [35-32] | y |
@@ -234,10 +242,18 @@
+------------------------------+---------+---------+
| Name | bits | visible |
+------------------------------+---------+---------+
+ | F64MM | [59-56] | y |
+ +------------------------------+---------+---------+
+ | F32MM | [55-52] | y |
+ +------------------------------+---------+---------+
+ | I8MM | [47-44] | y |
+ +------------------------------+---------+---------+
| SM4 | [43-40] | y |
+------------------------------+---------+---------+
| SHA3 | [35-32] | y |
+------------------------------+---------+---------+
+ | BF16 | [23-20] | y |
+ +------------------------------+---------+---------+
| BitPerm | [19-16] | y |
+------------------------------+---------+---------+
| AES | [7-4] | y |
diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
index 7fa3d21..7dfb97d 100644
--- a/Documentation/arm64/elf_hwcaps.rst
+++ b/Documentation/arm64/elf_hwcaps.rst
@@ -204,6 +204,37 @@
Functionality implied by ID_AA64ISAR1_EL1.FRINTTS == 0b0001.
+HWCAP2_SVEI8MM
+
+ Functionality implied by ID_AA64ZFR0_EL1.I8MM == 0b0001.
+
+HWCAP2_SVEF32MM
+
+ Functionality implied by ID_AA64ZFR0_EL1.F32MM == 0b0001.
+
+HWCAP2_SVEF64MM
+
+ Functionality implied by ID_AA64ZFR0_EL1.F64MM == 0b0001.
+
+HWCAP2_SVEBF16
+
+ Functionality implied by ID_AA64ZFR0_EL1.BF16 == 0b0001.
+
+HWCAP2_I8MM
+
+ Functionality implied by ID_AA64ISAR1_EL1.I8MM == 0b0001.
+
+HWCAP2_BF16
+
+ Functionality implied by ID_AA64ISAR1_EL1.BF16 == 0b0001.
+
+HWCAP2_DGH
+
+ Functionality implied by ID_AA64ISAR1_EL1.DGH == 0b0001.
+
+HWCAP2_RNG
+
+ Functionality implied by ID_AA64ISAR0_EL1.RNDR == 0b0001.
4. Unused AT_HWCAP bits
-----------------------
diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst
index 99b2545..9120e59 100644
--- a/Documentation/arm64/silicon-errata.rst
+++ b/Documentation/arm64/silicon-errata.rst
@@ -88,6 +88,8 @@
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A76 | #1463225 | ARM64_ERRATUM_1463225 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A55 | #1530923 | ARM64_ERRATUM_1530923 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N1 | #1349291 | N/A |
diff --git a/Documentation/asm-annotations.rst b/Documentation/asm-annotations.rst
index f55c2bb..32ea574 100644
--- a/Documentation/asm-annotations.rst
+++ b/Documentation/asm-annotations.rst
@@ -73,10 +73,11 @@
three main groups:
1. ``SYM_FUNC_*`` -- to annotate C-like functions. This means functions with
- standard C calling conventions, i.e. the stack contains a return address at
- the predefined place and a return from the function can happen in a
- standard way. When frame pointers are enabled, save/restore of frame
- pointer shall happen at the start/end of a function, respectively, too.
+ standard C calling conventions. For example, on x86, this means that the
+ stack contains a return address at the predefined place and a return from
+ the function can happen in a standard way. When frame pointers are enabled,
+ save/restore of frame pointer shall happen at the start/end of a function,
+ respectively, too.
Checking tools like ``objtool`` should ensure such marked functions conform
to these rules. The tools can also easily annotate these functions with
diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst
index 86fa66c8..ad303a2 100644
--- a/Documentation/block/biovecs.rst
+++ b/Documentation/block/biovecs.rst
@@ -47,7 +47,7 @@
advantages:
* Before, iterating over bios was very awkward when you weren't processing
- exactly one bvec at a time - for example, bio_copy_data() in fs/bio.c,
+ exactly one bvec at a time - for example, bio_copy_data() in block/bio.c,
which copies the contents of one bio into another. Because the biovecs
wouldn't necessarily be the same size, the old code was tricky convoluted -
it had to walk two different bios at the same time, keeping both bi_idx and
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index ab0eae1..a501dc1 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -31,6 +31,7 @@
generic-radix-tree
memory-allocation
mm-api
+ pin_user_pages
gfp_mask-from-fs-io
timekeeping
boot-time-mm
@@ -39,6 +40,8 @@
../RCU/index
gcc-plugins
symbol-namespaces
+ padata
+ ioctl
Interfaces for kernel debugging
diff --git a/Documentation/core-api/ioctl.rst b/Documentation/core-api/ioctl.rst
new file mode 100644
index 0000000..c455db0
--- /dev/null
+++ b/Documentation/core-api/ioctl.rst
@@ -0,0 +1,253 @@
+======================
+ioctl based interfaces
+======================
+
+ioctl() is the most common way for applications to interface
+with device drivers. It is flexible and easily extended by adding new
+commands and can be passed through character devices, block devices as
+well as sockets and other special file descriptors.
+
+However, it is also very easy to get ioctl command definitions wrong,
+and hard to fix them later without breaking existing applications,
+so this documentation tries to help developers get it right.
+
+Command number definitions
+==========================
+
+The command number, or request number, is the second argument passed to
+the ioctl system call. While this can be any 32-bit number that uniquely
+identifies an action for a particular driver, there are a number of
+conventions around defining them.
+
+``include/uapi/asm-generic/ioctl.h`` provides four macros for defining
+ioctl commands that follow modern conventions: ``_IO``, ``_IOR``,
+``_IOW``, and ``_IOWR``. These should be used for all new commands,
+with the correct parameters:
+
+_IO/_IOR/_IOW/_IOWR
+ The macro name specifies how the argument will be used. It may be a
+ pointer to data to be passed into the kernel (_IOW), out of the kernel
+ (_IOR), or both (_IOWR). _IO can indicate either commands with no
+ argument or those passing an integer value instead of a pointer.
+ It is recommended to only use _IO for commands without arguments,
+ and use pointers for passing data.
+
+type
+ An 8-bit number, often a character literal, specific to a subsystem
+ or driver, and listed in :doc:`../userspace-api/ioctl/ioctl-number`
+
+nr
+ An 8-bit number identifying the specific command, unique for a given
+ value of 'type'
+
+data_type
+ The name of the data type pointed to by the argument, the command number
+ encodes the ``sizeof(data_type)`` value in a 13-bit or 14-bit integer,
+ leading to a limit of 8191 bytes for the maximum size of the argument.
+ Note: do not pass sizeof(data_type) type into _IOR/_IOW/_IOWR, as that
+ will lead to encoding sizeof(sizeof(data_type)), i.e. sizeof(size_t).
+ _IO does not have a data_type parameter.
+
+
+Interface versions
+==================
+
+Some subsystems use version numbers in data structures to overload
+commands with different interpretations of the argument.
+
+This is generally a bad idea, since changes to existing commands tend
+to break existing applications.
+
+A better approach is to add a new ioctl command with a new number. The
+old command still needs to be implemented in the kernel for compatibility,
+but this can be a wrapper around the new implementation.
+
+Return code
+===========
+
+ioctl commands can return negative error codes as documented in errno(3);
+these get turned into errno values in user space. On success, the return
+code should be zero. It is also possible but not recommended to return
+a positive 'long' value.
+
+When the ioctl callback is called with an unknown command number, the
+handler returns either -ENOTTY or -ENOIOCTLCMD, which also results in
+-ENOTTY being returned from the system call. Some subsystems return
+-ENOSYS or -EINVAL here for historic reasons, but this is wrong.
+
+Prior to Linux 5.5, compat_ioctl handlers were required to return
+-ENOIOCTLCMD in order to use the fallback conversion into native
+commands. As all subsystems are now responsible for handling compat
+mode themselves, this is no longer needed, but it may be important to
+consider when backporting bug fixes to older kernels.
+
+Timestamps
+==========
+
+Traditionally, timestamps and timeout values are passed as ``struct
+timespec`` or ``struct timeval``, but these are problematic because of
+incompatible definitions of these structures in user space after the
+move to 64-bit time_t.
+
+The ``struct __kernel_timespec`` type can be used instead to be embedded
+in other data structures when separate second/nanosecond values are
+desired, or passed to user space directly. This is still not ideal though,
+as the structure matches neither the kernel's timespec64 nor the user
+space timespec exactly. The get_timespec64() and put_timespec64() helper
+functions can be used to ensure that the layout remains compatible with
+user space and the padding is treated correctly.
+
+As it is cheap to convert seconds to nanoseconds, but the opposite
+requires an expensive 64-bit division, a simple __u64 nanosecond value
+can be simpler and more efficient.
+
+Timeout values and timestamps should ideally use CLOCK_MONOTONIC time,
+as returned by ktime_get_ns() or ktime_get_ts64(). Unlike
+CLOCK_REALTIME, this makes the timestamps immune from jumping backwards
+or forwards due to leap second adjustments and clock_settime() calls.
+
+ktime_get_real_ns() can be used for CLOCK_REALTIME timestamps that
+need to be persistent across a reboot or between multiple machines.
+
+32-bit compat mode
+==================
+
+In order to support 32-bit user space running on a 64-bit machine, each
+subsystem or driver that implements an ioctl callback handler must also
+implement the corresponding compat_ioctl handler.
+
+As long as all the rules for data structures are followed, this is as
+easy as setting the .compat_ioctl pointer to a helper function such as
+compat_ptr_ioctl() or blkdev_compat_ptr_ioctl().
+
+compat_ptr()
+------------
+
+On the s390 architecture, 31-bit user space has ambiguous representations
+for data pointers, with the upper bit being ignored. When running such
+a process in compat mode, the compat_ptr() helper must be used to
+clear the upper bit of a compat_uptr_t and turn it into a valid 64-bit
+pointer. On other architectures, this macro only performs a cast to a
+``void __user *`` pointer.
+
+In a compat_ioctl() callback, the last argument is an unsigned long,
+which can be interpreted as either a pointer or a scalar depending on
+the command. If it is a scalar, then compat_ptr() must not be used, to
+ensure that the 64-bit kernel behaves the same way as a 32-bit kernel
+for arguments with the upper bit set.
+
+The compat_ptr_ioctl() helper can be used in place of a custom
+compat_ioctl file operation for drivers that only take arguments that
+are pointers to compatible data structures.
+
+Structure layout
+----------------
+
+Compatible data structures have the same layout on all architectures,
+avoiding all problematic members:
+
+* ``long`` and ``unsigned long`` are the size of a register, so
+ they can be either 32-bit or 64-bit wide and cannot be used in portable
+ data structures. Fixed-length replacements are ``__s32``, ``__u32``,
+ ``__s64`` and ``__u64``.
+
+* Pointers have the same problem, in addition to requiring the
+ use of compat_ptr(). The best workaround is to use ``__u64``
+ in place of pointers, which requires a cast to ``uintptr_t`` in user
+ space, and the use of u64_to_user_ptr() in the kernel to convert
+ it back into a user pointer.
+
+* On the x86-32 (i386) architecture, the alignment of 64-bit variables
+ is only 32-bit, but they are naturally aligned on most other
+ architectures including x86-64. This means a structure like::
+
+ struct foo {
+ __u32 a;
+ __u64 b;
+ __u32 c;
+ };
+
+ has four bytes of padding between a and b on x86-64, plus another four
+ bytes of padding at the end, but no padding on i386, and it needs a
+ compat_ioctl conversion handler to translate between the two formats.
+
+ To avoid this problem, all structures should have their members
+ naturally aligned, or explicit reserved fields added in place of the
+ implicit padding. The ``pahole`` tool can be used for checking the
+ alignment.
+
+* On ARM OABI user space, structures are padded to multiples of 32-bit,
+ making some structs incompatible with modern EABI kernels if they
+ do not end on a 32-bit boundary.
+
+* On the m68k architecture, struct members are not guaranteed to have an
+ alignment greater than 16-bit, which is a problem when relying on
+ implicit padding.
+
+* Bitfields and enums generally work as one would expect them to,
+ but some properties of them are implementation-defined, so it is better
+ to avoid them completely in ioctl interfaces.
+
+* ``char`` members can be either signed or unsigned, depending on
+ the architecture, so the __u8 and __s8 types should be used for 8-bit
+ integer values, though char arrays are clearer for fixed-length strings.
+
+Information leaks
+=================
+
+Uninitialized data must not be copied back to user space, as this can
+cause an information leak, which can be used to defeat kernel address
+space layout randomization (KASLR), helping in an attack.
+
+For this reason (and for compat support) it is best to avoid any
+implicit padding in data structures. Where there is implicit padding
+in an existing structure, kernel drivers must be careful to fully
+initialize an instance of the structure before copying it to user
+space. This is usually done by calling memset() before assigning to
+individual members.
+
+Subsystem abstractions
+======================
+
+While some device drivers implement their own ioctl function, most
+subsystems implement the same command for multiple drivers. Ideally the
+subsystem has an .ioctl() handler that copies the arguments from and
+to user space, passing them into subsystem specific callback functions
+through normal kernel pointers.
+
+This helps in various ways:
+
+* Applications written for one driver are more likely to work for
+ another one in the same subsystem if there are no subtle differences
+ in the user space ABI.
+
+* The complexity of user space access and data structure layout is handled
+ in one place, reducing the potential for implementation bugs.
+
+* It is more likely to be reviewed by experienced developers
+ that can spot problems in the interface when the ioctl is shared
+ between multiple drivers than when it is only used in a single driver.
+
+Alternatives to ioctl
+=====================
+
+There are many cases in which ioctl is not the best solution for a
+problem. Alternatives include:
+
+* System calls are a better choice for a system-wide feature that
+ is not tied to a physical device or constrained by the file system
+ permissions of a character device node
+
+* netlink is the preferred way of configuring any network related
+ objects through sockets.
+
+* debugfs is used for ad-hoc interfaces for debugging functionality
+ that does not need to be exposed as a stable interface to applications.
+
+* sysfs is a good way to expose the state of an in-kernel object
+ that is not tied to a file descriptor.
+
+* configfs can be used for more complex configuration than sysfs
+
+* A custom file system can provide extra flexibility with a simple
+ user interface but adds a lot of complexity to the implementation.
diff --git a/Documentation/core-api/padata.rst b/Documentation/core-api/padata.rst
new file mode 100644
index 0000000..9a24c11
--- /dev/null
+++ b/Documentation/core-api/padata.rst
@@ -0,0 +1,169 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================================
+The padata parallel execution mechanism
+=======================================
+
+:Date: December 2019
+
+Padata is a mechanism by which the kernel can farm jobs out to be done in
+parallel on multiple CPUs while retaining their ordering. It was developed for
+use with the IPsec code, which needs to be able to perform encryption and
+decryption on large numbers of packets without reordering those packets. The
+crypto developers made a point of writing padata in a sufficiently general
+fashion that it could be put to other uses as well.
+
+Usage
+=====
+
+Initializing
+------------
+
+The first step in using padata is to set up a padata_instance structure for
+overall control of how jobs are to be run::
+
+ #include <linux/padata.h>
+
+ struct padata_instance *padata_alloc_possible(const char *name);
+
+'name' simply identifies the instance.
+
+There are functions for enabling and disabling the instance::
+
+ int padata_start(struct padata_instance *pinst);
+ void padata_stop(struct padata_instance *pinst);
+
+These functions are setting or clearing the "PADATA_INIT" flag; if that flag is
+not set, other functions will refuse to work. padata_start() returns zero on
+success (flag set) or -EINVAL if the padata cpumask contains no active CPU
+(flag not set). padata_stop() clears the flag and blocks until the padata
+instance is unused.
+
+Finally, complete padata initialization by allocating a padata_shell::
+
+ struct padata_shell *padata_alloc_shell(struct padata_instance *pinst);
+
+A padata_shell is used to submit a job to padata and allows a series of such
+jobs to be serialized independently. A padata_instance may have one or more
+padata_shells associated with it, each allowing a separate series of jobs.
+
+Modifying cpumasks
+------------------
+
+The CPUs used to run jobs can be changed in two ways, programmatically with
+padata_set_cpumask() or via sysfs. The former is defined::
+
+ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
+ cpumask_var_t cpumask);
+
+Here cpumask_type is one of PADATA_CPU_PARALLEL or PADATA_CPU_SERIAL, where a
+parallel cpumask describes which processors will be used to execute jobs
+submitted to this instance in parallel and a serial cpumask defines which
+processors are allowed to be used as the serialization callback processor.
+cpumask specifies the new cpumask to use.
+
+There may be sysfs files for an instance's cpumasks. For example, pcrypt's
+live in /sys/kernel/pcrypt/<instance-name>. Within an instance's directory
+there are two files, parallel_cpumask and serial_cpumask, and either cpumask
+may be changed by echoing a bitmask into the file, for example::
+
+ echo f > /sys/kernel/pcrypt/pencrypt/parallel_cpumask
+
+Reading one of these files shows the user-supplied cpumask, which may be
+different from the 'usable' cpumask.
+
+Padata maintains two pairs of cpumasks internally, the user-supplied cpumasks
+and the 'usable' cpumasks. (Each pair consists of a parallel and a serial
+cpumask.) The user-supplied cpumasks default to all possible CPUs on instance
+allocation and may be changed as above. The usable cpumasks are always a
+subset of the user-supplied cpumasks and contain only the online CPUs in the
+user-supplied masks; these are the cpumasks padata actually uses. So it is
+legal to supply a cpumask to padata that contains offline CPUs. Once an
+offline CPU in the user-supplied cpumask comes online, padata is going to use
+it.
+
+Changing the CPU masks is an expensive operation, so it should not be done with
+great frequency.
+
+Running A Job
+-------------
+
+Actually submitting work to the padata instance requires the creation of a
+padata_priv structure, which represents one job::
+
+ struct padata_priv {
+ /* Other stuff here... */
+ void (*parallel)(struct padata_priv *padata);
+ void (*serial)(struct padata_priv *padata);
+ };
+
+This structure will almost certainly be embedded within some larger
+structure specific to the work to be done. Most of its fields are private to
+padata, but the structure should be zeroed at initialisation time, and the
+parallel() and serial() functions should be provided. Those functions will
+be called in the process of getting the work done as we will see
+momentarily.
+
+The submission of the job is done with::
+
+ int padata_do_parallel(struct padata_shell *ps,
+ struct padata_priv *padata, int *cb_cpu);
+
+The ps and padata structures must be set up as described above; cb_cpu
+points to the preferred CPU to be used for the final callback when the job is
+done; it must be in the current instance's CPU mask (if not the cb_cpu pointer
+is updated to point to the CPU actually chosen). The return value from
+padata_do_parallel() is zero on success, indicating that the job is in
+progress. -EBUSY means that somebody, somewhere else is messing with the
+instance's CPU mask, while -EINVAL is a complaint about cb_cpu not being in the
+serial cpumask, no online CPUs in the parallel or serial cpumasks, or a stopped
+instance.
+
+Each job submitted to padata_do_parallel() will, in turn, be passed to
+exactly one call to the above-mentioned parallel() function, on one CPU, so
+true parallelism is achieved by submitting multiple jobs. parallel() runs with
+software interrupts disabled and thus cannot sleep. The parallel()
+function gets the padata_priv structure pointer as its lone parameter;
+information about the actual work to be done is probably obtained by using
+container_of() to find the enclosing structure.
+
+Note that parallel() has no return value; the padata subsystem assumes that
+parallel() will take responsibility for the job from this point. The job
+need not be completed during this call, but, if parallel() leaves work
+outstanding, it should be prepared to be called again with a new job before
+the previous one completes.
+
+Serializing Jobs
+----------------
+
+When a job does complete, parallel() (or whatever function actually finishes
+the work) should inform padata of the fact with a call to::
+
+ void padata_do_serial(struct padata_priv *padata);
+
+At some point in the future, padata_do_serial() will trigger a call to the
+serial() function in the padata_priv structure. That call will happen on
+the CPU requested in the initial call to padata_do_parallel(); it, too, is
+run with local software interrupts disabled.
+Note that this call may be deferred for a while since the padata code takes
+pains to ensure that jobs are completed in the order in which they were
+submitted.
+
+Destroying
+----------
+
+Cleaning up a padata instance predictably involves calling the three free
+functions that correspond to the allocation in reverse::
+
+ void padata_free_shell(struct padata_shell *ps);
+ void padata_stop(struct padata_instance *pinst);
+ void padata_free(struct padata_instance *pinst);
+
+It is the user's responsibility to ensure all outstanding jobs are complete
+before any of the above are called.
+
+Interface
+=========
+
+.. kernel-doc:: include/linux/padata.h
+.. kernel-doc:: kernel/padata.c
diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst
new file mode 100644
index 0000000..1d49015
--- /dev/null
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -0,0 +1,232 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================================================
+pin_user_pages() and related calls
+====================================================
+
+.. contents:: :local:
+
+Overview
+========
+
+This document describes the following functions::
+
+ pin_user_pages()
+ pin_user_pages_fast()
+ pin_user_pages_remote()
+
+Basic description of FOLL_PIN
+=============================
+
+FOLL_PIN and FOLL_LONGTERM are flags that can be passed to the get_user_pages*()
+("gup") family of functions. FOLL_PIN has significant interactions and
+interdependencies with FOLL_LONGTERM, so both are covered here.
+
+FOLL_PIN is internal to gup, meaning that it should not appear at the gup call
+sites. This allows the associated wrapper functions (pin_user_pages*() and
+others) to set the correct combination of these flags, and to check for problems
+as well.
+
+FOLL_LONGTERM, on the other hand, *is* allowed to be set at the gup call sites.
+This is in order to avoid creating a large number of wrapper functions to cover
+all combinations of get*(), pin*(), FOLL_LONGTERM, and more. Also, the
+pin_user_pages*() APIs are clearly distinct from the get_user_pages*() APIs, so
+that's a natural dividing line, and a good point to make separate wrapper calls.
+In other words, use pin_user_pages*() for DMA-pinned pages, and
+get_user_pages*() for other cases. There are four cases described later on in
+this document, to further clarify that concept.
+
+FOLL_PIN and FOLL_GET are mutually exclusive for a given gup call. However,
+multiple threads and call sites are free to pin the same struct pages, via both
+FOLL_PIN and FOLL_GET. It's just the call site that needs to choose one or the
+other, not the struct page(s).
+
+The FOLL_PIN implementation is nearly the same as FOLL_GET, except that FOLL_PIN
+uses a different reference counting technique.
+
+FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying that is,
+FOLL_LONGTERM is a specific, more restrictive case of FOLL_PIN.
+
+Which flags are set by each wrapper
+===================================
+
+For these pin_user_pages*() functions, FOLL_PIN is OR'd in with whatever gup
+flags the caller provides. The caller is required to pass in a non-null struct
+pages* array, and the function then pins pages by incrementing each by a special
+value. For now, that value is +1, just like get_user_pages*().::
+
+ Function
+ --------
+ pin_user_pages FOLL_PIN is always set internally by this function.
+ pin_user_pages_fast FOLL_PIN is always set internally by this function.
+ pin_user_pages_remote FOLL_PIN is always set internally by this function.
+
+For these get_user_pages*() functions, FOLL_GET might not even be specified.
+Behavior is a little more complex than above. If FOLL_GET was *not* specified,
+but the caller passed in a non-null struct pages* array, then the function
+sets FOLL_GET for you, and proceeds to pin pages by incrementing the refcount
+of each page by +1.::
+
+ Function
+ --------
+ get_user_pages FOLL_GET is sometimes set internally by this function.
+ get_user_pages_fast FOLL_GET is sometimes set internally by this function.
+ get_user_pages_remote FOLL_GET is sometimes set internally by this function.
+
+Tracking dma-pinned pages
+=========================
+
+Some of the key design constraints, and solutions, for tracking dma-pinned
+pages:
+
+* An actual reference count, per struct page, is required. This is because
+ multiple processes may pin and unpin a page.
+
+* False positives (reporting that a page is dma-pinned, when in fact it is not)
+ are acceptable, but false negatives are not.
+
+* struct page may not be increased in size for this, and all fields are already
+ used.
+
+* Given the above, we can overload the page->_refcount field by using, sort of,
+ the upper bits in that field for a dma-pinned count. "Sort of", means that,
+ rather than dividing page->_refcount into bit fields, we simply add a medium-
+ large value (GUP_PIN_COUNTING_BIAS, initially chosen to be 1024: 10 bits) to
+ page->_refcount. This provides fuzzy behavior: if a page has get_page() called
+ on it 1024 times, then it will appear to have a single dma-pinned count.
+ And again, that's acceptable.
+
+This also leads to limitations: there are only 31-10==21 bits available for a
+counter that increments 10 bits at a time.
+
+TODO: for 1GB and larger huge pages, this is cutting it close. That's because
+when pin_user_pages() follows such pages, it increments the head page by "1"
+(where "1" used to mean "+1" for get_user_pages(), but now means "+1024" for
+pin_user_pages()) for each tail page. So if you have a 1GB huge page:
+
+* There are 256K (18 bits) worth of 4 KB tail pages.
+* There are 21 bits available to count up via GUP_PIN_COUNTING_BIAS (that is,
+ 10 bits at a time)
+* There are 21 - 18 == 3 bits available to count. Except that there aren't,
+ because you need to allow for a few normal get_page() calls on the head page,
+ as well. Fortunately, the approach of using addition, rather than "hard"
+ bitfields, within page->_refcount, allows for sharing these bits gracefully.
+ But we're still looking at about 8 references.
+
+This, however, is a missing feature more than anything else, because it's easily
+solved by addressing an obvious inefficiency in the original get_user_pages()
+approach of retrieving pages: stop treating all the pages as if they were
+PAGE_SIZE. Retrieve huge pages as huge pages. The callers need to be aware of
+this, so some work is required. Once that's in place, this limitation mostly
+disappears from view, because there will be ample refcounting range available.
+
+* Callers must specifically request "dma-pinned tracking of pages". In other
+ words, just calling get_user_pages() will not suffice; a new set of functions,
+ pin_user_pages() and related, must be used.
+
+FOLL_PIN, FOLL_GET, FOLL_LONGTERM: when to use which flags
+==========================================================
+
+Thanks to Jan Kara, Vlastimil Babka and several other -mm people, for describing
+these categories:
+
+CASE 1: Direct IO (DIO)
+-----------------------
+There are GUP references to pages that are serving
+as DIO buffers. These buffers are needed for a relatively short time (so they
+are not "long term"). No special synchronization with page_mkclean() or
+munmap() is provided. Therefore, flags to set at the call site are: ::
+
+ FOLL_PIN
+
+...but rather than setting FOLL_PIN directly, call sites should use one of
+the pin_user_pages*() routines that set FOLL_PIN.
+
+CASE 2: RDMA
+------------
+There are GUP references to pages that are serving as DMA
+buffers. These buffers are needed for a long time ("long term"). No special
+synchronization with page_mkclean() or munmap() is provided. Therefore, flags
+to set at the call site are: ::
+
+ FOLL_PIN | FOLL_LONGTERM
+
+NOTE: Some pages, such as DAX pages, cannot be pinned with longterm pins. That's
+because DAX pages do not have a separate page cache, and so "pinning" implies
+locking down file system blocks, which is not (yet) supported in that way.
+
+CASE 3: Hardware with page faulting support
+-------------------------------------------
+Here, a well-written driver doesn't normally need to pin pages at all. However,
+if the driver does choose to do so, it can register MMU notifiers for the range,
+and will be called back upon invalidation. Either way (avoiding page pinning, or
+using MMU notifiers to unpin upon request), there is proper synchronization with
+both filesystem and mm (page_mkclean(), munmap(), etc).
+
+Therefore, neither flag needs to be set.
+
+In this case, ideally, neither get_user_pages() nor pin_user_pages() should be
+called. Instead, the software should be written so that it does not pin pages.
+This allows mm and filesystems to operate more efficiently and reliably.
+
+CASE 4: Pinning for struct page manipulation only
+-------------------------------------------------
+Here, normal GUP calls are sufficient, so neither flag needs to be set.
+
+page_dma_pinned(): the whole point of pinning
+=============================================
+
+The whole point of marking pages as "DMA-pinned" or "gup-pinned" is to be able
+to query, "is this page DMA-pinned?" That allows code such as page_mkclean()
+(and file system writeback code in general) to make informed decisions about
+what to do when a page cannot be unmapped due to such pins.
+
+What to do in those cases is the subject of a years-long series of discussions
+and debates (see the References at the end of this document). It's a TODO item
+here: fill in the details once that's worked out. Meanwhile, it's safe to say
+that having this available: ::
+
+ static inline bool page_dma_pinned(struct page *page)
+
+...is a prerequisite to solving the long-running gup+DMA problem.
+
+Another way of thinking about FOLL_GET, FOLL_PIN, and FOLL_LONGTERM
+===================================================================
+
+Another way of thinking about these flags is as a progression of restrictions:
+FOLL_GET is for struct page manipulation, without affecting the data that the
+struct page refers to. FOLL_PIN is a *replacement* for FOLL_GET, and is for
+short term pins on pages whose data *will* get accessed. As such, FOLL_PIN is
+a "more severe" form of pinning. And finally, FOLL_LONGTERM is an even more
+restrictive case that has FOLL_PIN as a prerequisite: this is for pages that
+will be pinned longterm, and whose data will be accessed.
+
+Unit testing
+============
+This file::
+
+ tools/testing/selftests/vm/gup_benchmark.c
+
+has the following new calls to exercise the new pin*() wrapper functions:
+
+* PIN_FAST_BENCHMARK (./gup_benchmark -a)
+* PIN_BENCHMARK (./gup_benchmark -b)
+
+You can monitor how many total dma-pinned pages have been acquired and released
+since the system was booted, via two new /proc/vmstat entries: ::
+
+ /proc/vmstat/nr_foll_pin_acquired
+ /proc/vmstat/nr_foll_pin_released
+
+Those are both going to show zero, unless CONFIG_DEBUG_VM is set. This is
+because there is a noticeable performance drop in unpin_user_page(), when they
+are activated.
+
+References
+==========
+
+* `Some slow progress on get_user_pages() (Apr 2, 2019) <https://lwn.net/Articles/784574/>`_
+* `DMA and get_user_pages() (LPC: Dec 12, 2018) <https://lwn.net/Articles/774411/>`_
+* `The trouble with get_user_pages() (Apr 30, 2018) <https://lwn.net/Articles/753027/>`_
+
+John Hubbard, October, 2019
diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst
index fcedc53..640934b 100644
--- a/Documentation/core-api/xarray.rst
+++ b/Documentation/core-api/xarray.rst
@@ -25,10 +25,6 @@
``ULONG_MAX`` then the XArray is not the data type for you. The most
important user of the XArray is the page cache.
-Each non-``NULL`` entry in the array has three bits associated with
-it called marks. Each mark may be set or cleared independently of
-the others. You can iterate over entries which are marked.
-
Normal pointers may be stored in the XArray directly. They must be 4-byte
aligned, which is true for any pointer returned from kmalloc() and
alloc_page(). It isn't true for arbitrary user-space pointers,
@@ -41,12 +37,11 @@
a value entry by calling xa_is_value(), and convert it back to
an integer by calling xa_to_value().
-Some users want to store tagged pointers instead of using the marks
-described above. They can call xa_tag_pointer() to create an
-entry with a tag, xa_untag_pointer() to turn a tagged entry
-back into an untagged pointer and xa_pointer_tag() to retrieve
-the tag of an entry. Tagged pointers use the same bits that are used
-to distinguish value entries from normal pointers, so each user must
+Some users want to tag the pointers they store in the XArray. You can
+call xa_tag_pointer() to create an entry with a tag, xa_untag_pointer()
+to turn a tagged entry back into an untagged pointer and xa_pointer_tag()
+to retrieve the tag of an entry. Tagged pointers use the same bits that
+are used to distinguish value entries from normal pointers, so each user must
decide whether they want to store value entries or tagged pointers in
any particular XArray.
@@ -56,10 +51,9 @@
An unusual feature of the XArray is the ability to create entries which
occupy a range of indices. Once stored to, looking up any index in
the range will return the same entry as looking up any other index in
-the range. Setting a mark on one index will set it on all of them.
-Storing to any index will store to all of them. Multi-index entries can
-be explicitly split into smaller entries, or storing ``NULL`` into any
-entry will cause the XArray to forget about the range.
+the range. Storing to any index will store to all of them. Multi-index
+entries can be explicitly split into smaller entries, or storing ``NULL``
+into any entry will cause the XArray to forget about the range.
Normal API
==========
@@ -87,17 +81,11 @@
at that index is ``NULL``, you can use xa_insert() which
returns ``-EBUSY`` if the entry is not empty.
-You can enquire whether a mark is set on an entry by using
-xa_get_mark(). If the entry is not ``NULL``, you can set a mark
-on it by using xa_set_mark() and remove the mark from an entry by
-calling xa_clear_mark(). You can ask whether any entry in the
-XArray has a particular mark set by calling xa_marked().
-
You can copy entries out of the XArray into a plain array by calling
-xa_extract(). Or you can iterate over the present entries in
-the XArray by calling xa_for_each(). You may prefer to use
-xa_find() or xa_find_after() to move to the next present
-entry in the XArray.
+xa_extract(). Or you can iterate over the present entries in the XArray
+by calling xa_for_each(), xa_for_each_start() or xa_for_each_range().
+You may prefer to use xa_find() or xa_find_after() to move to the next
+present entry in the XArray.
Calling xa_store_range() stores the same entry in a range
of indices. If you do this, some of the other operations will behave
@@ -124,6 +112,31 @@
to free the entries first. You can do this by iterating over all present
entries in the XArray using the xa_for_each() iterator.
+Search Marks
+------------
+
+Each entry in the array has three bits associated with it called marks.
+Each mark may be set or cleared independently of the others. You can
+iterate over marked entries by using the xa_for_each_marked() iterator.
+
+You can enquire whether a mark is set on an entry by using
+xa_get_mark(). If the entry is not ``NULL``, you can set a mark on it
+by using xa_set_mark() and remove the mark from an entry by calling
+xa_clear_mark(). You can ask whether any entry in the XArray has a
+particular mark set by calling xa_marked(). Erasing an entry from the
+XArray causes all marks associated with that entry to be cleared.
+
+Setting or clearing a mark on any index of a multi-index entry will
+affect all indices covered by that entry. Querying the mark on any
+index will return the same result.
+
+There is no way to iterate over entries which are not marked; the data
+structure does not allow this to be implemented efficiently. There are
+not currently iterators to search for logical combinations of bits (eg
+iterate over all entries which have both ``XA_MARK_1`` and ``XA_MARK_2``
+set, or iterate over all entries which have ``XA_MARK_0`` or ``XA_MARK_2``
+set). It would be possible to add these if a user arises.
+
Allocating XArrays
------------------
@@ -180,6 +193,8 @@
Takes RCU read lock:
* xa_load()
* xa_for_each()
+ * xa_for_each_start()
+ * xa_for_each_range()
* xa_find()
* xa_find_after()
* xa_extract()
@@ -419,10 +434,9 @@
then it is good manners to pause the iteration and reenable interrupts
every ``XA_CHECK_SCHED`` entries.
-The xas_get_mark(), xas_set_mark() and
-xas_clear_mark() functions require the xa_state cursor to have
-been moved to the appropriate location in the xarray; they will do
-nothing if you have called xas_pause() or xas_set()
+The xas_get_mark(), xas_set_mark() and xas_clear_mark() functions require
+the xa_state cursor to have been moved to the appropriate location in the
+XArray; they will do nothing if you have called xas_pause() or xas_set()
immediately before.
You can call xas_set_update() to have a callback function
diff --git a/Documentation/crypto/devel-algos.rst b/Documentation/crypto/devel-algos.rst
index f9d2880..f225a95 100644
--- a/Documentation/crypto/devel-algos.rst
+++ b/Documentation/crypto/devel-algos.rst
@@ -31,33 +31,23 @@
::
- int crypto_unregister_alg(struct crypto_alg *alg);
- int crypto_unregister_algs(struct crypto_alg *algs, int count);
+ void crypto_unregister_alg(struct crypto_alg *alg);
+ void crypto_unregister_algs(struct crypto_alg *algs, int count);
-Notice that both registration and unregistration functions do return a
-value, so make sure to handle errors. A return code of zero implies
-success. Any return code < 0 implies an error.
+The registration functions return 0 on success, or a negative errno
+value on failure. crypto_register_algs() succeeds only if it
+successfully registered all the given algorithms; if it fails partway
+through, then any changes are rolled back.
-The bulk registration/unregistration functions register/unregister each
-transformation in the given array of length count. They handle errors as
-follows:
-
-- crypto_register_algs() succeeds if and only if it successfully
- registers all the given transformations. If an error occurs partway
- through, then it rolls back successful registrations before returning
- the error code. Note that if a driver needs to handle registration
- errors for individual transformations, then it will need to use the
- non-bulk function crypto_register_alg() instead.
-
-- crypto_unregister_algs() tries to unregister all the given
- transformations, continuing on error. It logs errors and always
- returns zero.
+The unregistration functions always succeed, so they don't have a
+return value. Don't try to unregister algorithms that aren't
+currently registered.
Single-Block Symmetric Ciphers [CIPHER]
---------------------------------------
-Example of transformations: aes, arc4, ...
+Example of transformations: aes, serpent, ...
This section describes the simplest of all transformation
implementations, that being the CIPHER type used for symmetric ciphers.
@@ -108,7 +98,7 @@
Multi-Block Ciphers
-------------------
-Example of transformations: cbc(aes), ecb(arc4), ...
+Example of transformations: cbc(aes), chacha20, ...
This section describes the multi-block cipher transformation
implementations. The multi-block ciphers are used for transformations
@@ -169,10 +159,10 @@
::
- int crypto_unregister_ahash(struct ahash_alg *alg);
+ void crypto_unregister_ahash(struct ahash_alg *alg);
- int crypto_unregister_shash(struct shash_alg *alg);
- int crypto_unregister_shashes(struct shash_alg *algs, int count);
+ void crypto_unregister_shash(struct shash_alg *alg);
+ void crypto_unregister_shashes(struct shash_alg *algs, int count);
Cipher Definition With struct shash_alg and ahash_alg
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index e4d66e7..c652d740 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -21,8 +21,8 @@
Tag-based KASAN is only supported in Clang and requires version 7.0.0 or later.
-Currently generic KASAN is supported for the x86_64, arm64, xtensa and s390
-architectures, and tag-based KASAN is supported only for arm64.
+Currently generic KASAN is supported for the x86_64, arm64, xtensa, s390 and
+riscv architectures, and tag-based KASAN is supported only for arm64.
Usage
-----
diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst
index 36890b0..1c4e182 100644
--- a/Documentation/dev-tools/kcov.rst
+++ b/Documentation/dev-tools/kcov.rst
@@ -251,11 +251,11 @@
.. code-block:: c
struct kcov_remote_arg {
- unsigned trace_mode;
- unsigned area_size;
- unsigned num_handles;
- uint64_t common_handle;
- uint64_t handles[0];
+ __u32 trace_mode;
+ __u32 area_size;
+ __u32 num_handles;
+ __aligned_u64 common_handle;
+ __aligned_u64 handles[0];
};
#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long)
diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst
index ecdfdc9..61ae13c 100644
--- a/Documentation/dev-tools/kselftest.rst
+++ b/Documentation/dev-tools/kselftest.rst
@@ -203,12 +203,12 @@
Kselftest tests the kernel from userspace. Sometimes things need
testing from within the kernel, one method of doing this is to create a
test module. We can tie the module into the kselftest framework by
-using a shell script test runner. ``kselftest_module.sh`` is designed
+using a shell script test runner. ``kselftest/module.sh`` is designed
to facilitate this process. There is also a header file provided to
assist writing kernel modules that are for use with kselftest:
- ``tools/testing/kselftest/kselftest_module.h``
-- ``tools/testing/kselftest/kselftest_module.sh``
+- ``tools/testing/selftests/kselftest/module.sh``
How to use
----------
@@ -247,7 +247,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
- #include "../tools/testing/selftests/kselftest_module.h"
+ #include "../tools/testing/selftests/kselftest/module.h"
KSTM_MODULE_GLOBALS();
@@ -276,7 +276,7 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0+
- $(dirname $0)/../kselftest_module.sh "foo" test_foo
+ $(dirname $0)/../kselftest/module.sh "foo" test_foo
Test Harness
diff --git a/Documentation/dev-tools/kunit/faq.rst b/Documentation/dev-tools/kunit/faq.rst
index bf20951..ea55b24 100644
--- a/Documentation/dev-tools/kunit/faq.rst
+++ b/Documentation/dev-tools/kunit/faq.rst
@@ -29,7 +29,8 @@
For the most part, the KUnit core framework (what you use to write the tests)
can compile to any architecture; it compiles like just another part of the
-kernel and runs when the kernel boots. However, there is some infrastructure,
+kernel and runs when the kernel boots or, if built as a module, when the
+module is loaded. However, there is some infrastructure,
like the KUnit Wrapper (``tools/testing/kunit/kunit.py``) that does not support
other architectures.
diff --git a/Documentation/dev-tools/kunit/index.rst b/Documentation/dev-tools/kunit/index.rst
index 26ffb46..d16a4d2 100644
--- a/Documentation/dev-tools/kunit/index.rst
+++ b/Documentation/dev-tools/kunit/index.rst
@@ -9,6 +9,7 @@
start
usage
+ kunit-tool
api/index
faq
@@ -48,6 +49,9 @@
of a host operating system; to be clear, it does not require any virtualization
support; it is just a regular program.
+Alternatively, kunit and kunit tests can be built as modules and tests will
+run when the test module is loaded.
+
KUnit is fast. Excluding build time, from invocation to completion KUnit can run
several dozen tests in only 10 to 20 seconds; this might not sound like a big
deal to some people, but having such fast and easy to run tests fundamentally
diff --git a/Documentation/dev-tools/kunit/kunit-tool.rst b/Documentation/dev-tools/kunit/kunit-tool.rst
new file mode 100644
index 0000000..50d4639
--- /dev/null
+++ b/Documentation/dev-tools/kunit/kunit-tool.rst
@@ -0,0 +1,57 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================
+kunit_tool How-To
+=================
+
+What is kunit_tool?
+===================
+
+kunit_tool is a script (``tools/testing/kunit/kunit.py``) that aids in building
+the Linux kernel as UML (`User Mode Linux
+<http://user-mode-linux.sourceforge.net/>`_), running KUnit tests, parsing
+the test results and displaying them in a user friendly manner.
+
+What is a kunitconfig?
+======================
+
+It's just a defconfig that kunit_tool looks for in the base directory.
+kunit_tool uses it to generate a .config as you might expect. In addition, it
+verifies that the generated .config contains the CONFIG options in the
+kunitconfig; the reason it does this is so that it is easy to be sure that a
+CONFIG that enables a test actually ends up in the .config.
+
+How do I use kunit_tool?
+========================
+
+If a kunitconfig is present at the root directory, all you have to do is:
+
+.. code-block:: bash
+
+ ./tools/testing/kunit/kunit.py run
+
+However, you most likely want to use it with the following options:
+
+.. code-block:: bash
+
+ ./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all`
+
+- ``--timeout`` sets a maximum amount of time to allow tests to run.
+- ``--jobs`` sets the number of threads to use to build the kernel.
+
+If you just want to use the defconfig that ships with the kernel, you can
+append the ``--defconfig`` flag as well:
+
+.. code-block:: bash
+
+ ./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all` --defconfig
+
+.. note::
+ This command is particularly helpful for getting started because it
+ just works. No kunitconfig needs to be present.
+
+For a list of all the flags supported by kunit_tool, you can run:
+
+.. code-block:: bash
+
+ ./tools/testing/kunit/kunit.py run --help
diff --git a/Documentation/dev-tools/kunit/start.rst b/Documentation/dev-tools/kunit/start.rst
index aeeddfa..4e1d24d 100644
--- a/Documentation/dev-tools/kunit/start.rst
+++ b/Documentation/dev-tools/kunit/start.rst
@@ -19,21 +19,21 @@
.. code-block:: bash
- ./tools/testing/kunit/kunit.py run
+ ./tools/testing/kunit/kunit.py run --defconfig
-Creating a kunitconfig
-======================
-The Python script is a thin wrapper around Kbuild as such, it needs to be
-configured with a ``kunitconfig`` file. This file essentially contains the
+For more information on this wrapper (also called kunit_tool) check out the
+:doc:`kunit-tool` page.
+
+Creating a .kunitconfig
+=======================
+The Python script is a thin wrapper around Kbuild. As such, it needs to be
+configured with a ``.kunitconfig`` file. This file essentially contains the
regular Kernel config, with the specific test targets as well.
.. code-block:: bash
- git clone -b master https://kunit.googlesource.com/kunitconfig $PATH_TO_KUNITCONFIG_REPO
cd $PATH_TO_LINUX_REPO
- ln -s $PATH_TO_KUNIT_CONFIG_REPO/kunitconfig kunitconfig
-
-You may want to add kunitconfig to your local gitignore.
+ cp arch/um/configs/kunit_defconfig .kunitconfig
Verifying KUnit Works
---------------------
@@ -59,8 +59,8 @@
followed by a list of tests that are run. All of them should be passing.
.. note::
- Because it is building a lot of sources for the first time, the ``Building
- kunit kernel`` step may take a while.
+ Because it is building a lot of sources for the first time, the
+ ``Building KUnit kernel`` step may take a while.
Writing your first test
=======================
@@ -148,7 +148,7 @@
obj-$(CONFIG_MISC_EXAMPLE_TEST) += example-test.o
-Now add it to your ``kunitconfig``:
+Now add it to your ``.kunitconfig``:
.. code-block:: none
@@ -159,7 +159,7 @@
.. code-block:: bash
- ./tools/testing/kunit/kunit.py
+ ./tools/testing/kunit/kunit.py run
You should see the following failure:
diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst
index c6e6963..7cd56a1 100644
--- a/Documentation/dev-tools/kunit/usage.rst
+++ b/Documentation/dev-tools/kunit/usage.rst
@@ -16,7 +16,7 @@
=============================
This document is organized into two main sections: Testing and Isolating
-Behavior. The first covers what a unit test is and how to use KUnit to write
+Behavior. The first covers what unit tests are and how to use KUnit to write
them. The second covers how to use KUnit to isolate code and make it possible
to unit test code that was otherwise un-unit-testable.
@@ -174,13 +174,13 @@
~~~~~~~~~~~
Now obviously one unit test isn't very helpful; the power comes from having
-many test cases covering all of your behaviors. Consequently it is common to
-have many *similar* tests; in order to reduce duplication in these closely
-related tests most unit testing frameworks provide the concept of a *test
-suite*, in KUnit we call it a *test suite*; all it is is just a collection of
-test cases for a unit of code with a set up function that gets invoked before
-every test cases and then a tear down function that gets invoked after every
-test case completes.
+many test cases covering all of a unit's behaviors. Consequently it is common
+to have many *similar* tests; in order to reduce duplication in these closely
+related tests most unit testing frameworks - including KUnit - provide the
+concept of a *test suite*. A *test suite* is just a collection of test cases
+for a unit of code with a set up function that gets invoked before every test
+case and then a tear down function that gets invoked after every test case
+completes.
Example:
@@ -211,7 +211,7 @@
.. note::
A test case will only be run if it is associated with a test suite.
-For a more information on these types of things see the :doc:`api/test`.
+For more information on these types of things see the :doc:`api/test`.
Isolating Behavior
==================
@@ -338,7 +338,7 @@
return count;
}
- ssize_t fake_eeprom_write(struct eeprom *this, size_t offset, const char *buffer, size_t count)
+ ssize_t fake_eeprom_write(struct eeprom *parent, size_t offset, const char *buffer, size_t count)
{
struct fake_eeprom *this = container_of(parent, struct fake_eeprom, parent);
@@ -454,7 +454,7 @@
By default KUnit uses UML as a way to provide dependencies for code under test.
Under most circumstances KUnit's usage of UML should be treated as an
implementation detail of how KUnit works under the hood. Nevertheless, there
-are instances where being able to run architecture specific code, or test
+are instances where being able to run architecture specific code or test
against real hardware is desirable. For these reasons KUnit supports running on
other architectures.
@@ -539,6 +539,22 @@
Congratulations, you just ran a KUnit test on the x86 architecture!
+In a similar manner, kunit and kunit tests can also be built as modules,
+so if you wanted to run tests in this way you might add the following config
+options to your ``.config``:
+
+.. code-block:: none
+
+ CONFIG_KUNIT=m
+ CONFIG_KUNIT_EXAMPLE_TEST=m
+
+Once the kernel is built and installed, a simple
+
+.. code-block:: bash
+
+ modprobe example-test
+
+...will run the tests.
+
Writing new tests for other architectures
-----------------------------------------
@@ -557,7 +573,7 @@
.. important::
Always prefer tests that run on UML to tests that only run under a particular
architecture, and always prefer tests that run under QEMU or another easy
- (and monitarily free) to obtain software environment to a specific piece of
+ (and monetarily free) to obtain software environment to a specific piece of
hardware.
Nevertheless, there are still valid reasons to write an architecture or hardware
diff --git a/Documentation/devicetree/bindings/arm/amlogic.yaml b/Documentation/devicetree/bindings/arm/amlogic.yaml
index c6a4433..f74aba4 100644
--- a/Documentation/devicetree/bindings/arm/amlogic.yaml
+++ b/Documentation/devicetree/bindings/arm/amlogic.yaml
@@ -59,6 +59,7 @@
- friendlyarm,nanopi-k2
- hardkernel,odroid-c2
- nexbox,a95x
+ - videostrong,kii-pro
- wetek,hub
- wetek,play2
- const: amlogic,meson-gxbb
@@ -104,6 +105,7 @@
- enum:
- amlogic,p230
- amlogic,p231
+ - libretech,aml-s905d-pc
- phicomm,n1
- const: amlogic,s905d
- const: amlogic,meson-gxl
@@ -115,6 +117,7 @@
- amlogic,q201
- khadas,vim2
- kingnovel,r-box-pro
+ - libretech,aml-s912-pc
- nexbox,a1
- tronsmart,vega-s96
- const: amlogic,s912
diff --git a/Documentation/devicetree/bindings/arm/arm-boards b/Documentation/devicetree/bindings/arm/arm-boards
index b2a9f9f..96b1dad 100644
--- a/Documentation/devicetree/bindings/arm/arm-boards
+++ b/Documentation/devicetree/bindings/arm/arm-boards
@@ -121,7 +121,7 @@
Required nodes:
- soc: some node of the RealView platforms must be the SoC
- node that contain the SoC-specific devices, withe the compatible
+ node that contain the SoC-specific devices, with the compatible
string set to one of these tuples:
"arm,realview-eb-soc", "simple-bus"
"arm,realview-pb1176-soc", "simple-bus"
diff --git a/Documentation/devicetree/bindings/arm/atmel-at91.yaml b/Documentation/devicetree/bindings/arm/atmel-at91.yaml
index 6dd8be4..0357314 100644
--- a/Documentation/devicetree/bindings/arm/atmel-at91.yaml
+++ b/Documentation/devicetree/bindings/arm/atmel-at91.yaml
@@ -37,6 +37,16 @@
- items:
- enum:
+ - overkiz,kizboxmini-base # Overkiz kizbox Mini Base Board
+ - overkiz,kizboxmini-mb # Overkiz kizbox Mini Mother Board
+ - overkiz,kizboxmini-rd # Overkiz kizbox Mini RailDIN
+ - overkiz,smartkiz # Overkiz SmartKiz Board
+ - const: atmel,at91sam9g25
+ - const: atmel,at91sam9x5
+ - const: atmel,at91sam9
+
+ - items:
+ - enum:
- atmel,at91sam9g15
- atmel,at91sam9g25
- atmel,at91sam9g35
@@ -52,11 +62,32 @@
- const: atmel,sama5d2
- const: atmel,sama5
+ - description: Microchip SAMA5D27 WLSOM1
+ items:
+ - const: microchip,sama5d27-wlsom1
+ - const: atmel,sama5d27
+ - const: atmel,sama5d2
+ - const: atmel,sama5
+
+ - description: Microchip SAMA5D27 WLSOM1 Evaluation Kit
+ items:
+ - const: microchip,sama5d27-wlsom1-ek
+ - const: microchip,sama5d27-wlsom1
+ - const: atmel,sama5d27
+ - const: atmel,sama5d2
+ - const: atmel,sama5
+
- items:
- const: atmel,sama5d27
- const: atmel,sama5d2
- const: atmel,sama5
+ - description: SAM9X60-EK board
+ items:
+ - const: microchip,sam9x60ek
+ - const: microchip,sam9x60
+ - const: atmel,at91sam9
+
- description: Nattis v2 board with Natte v2 power board
items:
- const: axentia,nattis-2
diff --git a/Documentation/devicetree/bindings/arm/atmel-sysregs.txt b/Documentation/devicetree/bindings/arm/atmel-sysregs.txt
index 9fbde40..62cd4e8 100644
--- a/Documentation/devicetree/bindings/arm/atmel-sysregs.txt
+++ b/Documentation/devicetree/bindings/arm/atmel-sysregs.txt
@@ -10,6 +10,12 @@
- interrupts: Should contain interrupt for the PIT which is the IRQ line
shared across all System Controller members.
+PIT64B Timer required properties:
+- compatible: Should be "microchip,sam9x60-pit64b"
+- reg: Should contain registers location and length
+- interrupts: Should contain interrupt for PIT64B timer
+- clocks: Should contain the available clock sources for PIT64B timer.
+
System Timer (ST) required properties:
- compatible: Should be "atmel,at91rm9200-st", "syscon", "simple-mfd"
- reg: Should contain registers location and length
@@ -39,6 +45,7 @@
"atmel,at91sam9260-sdramc",
"atmel,at91sam9g45-ddramc",
"atmel,sama5d3-ddramc",
+ "microchip,sam9x60-ddramc"
- reg: Should contain registers location and length
Examples:
diff --git a/Documentation/devicetree/bindings/arm/cpus.yaml b/Documentation/devicetree/bindings/arm/cpus.yaml
index c23c24f..7a9c3ce 100644
--- a/Documentation/devicetree/bindings/arm/cpus.yaml
+++ b/Documentation/devicetree/bindings/arm/cpus.yaml
@@ -242,6 +242,21 @@
where voltage is in V, frequency is in MHz.
+ power-domains:
+ $ref: '/schemas/types.yaml#/definitions/phandle-array'
+ description:
+ List of phandles and PM domain specifiers, as defined by bindings of the
+ PM domain provider (see also ../power_domain.txt).
+
+ power-domain-names:
+ $ref: '/schemas/types.yaml#/definitions/string-array'
+ description:
+ A list of power domain name strings sorted in the same order as the
+ power-domains property.
+
+ For PSCI based platforms, the name corresponding to the index of the PSCI
+ PM domain provider must be "psci".
+
qcom,saw:
$ref: '/schemas/types.yaml#/definitions/phandle'
description: |
diff --git a/Documentation/devicetree/bindings/arm/fsl.yaml b/Documentation/devicetree/bindings/arm/fsl.yaml
index f79683a..a8e0b4a 100644
--- a/Documentation/devicetree/bindings/arm/fsl.yaml
+++ b/Documentation/devicetree/bindings/arm/fsl.yaml
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
%YAML 1.2
---
-$id: http://devicetree.org/schemas/bindings/arm/fsl.yaml#
+$id: http://devicetree.org/schemas/arm/fsl.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Freescale i.MX Platforms Device Tree Bindings
@@ -128,6 +128,27 @@
- variscite,dt6customboard
- const: fsl,imx6q
+ - description: i.MX6Q Gateworks Ventana Boards
+ items:
+ - enum:
+ - gw,imx6q-gw51xx
+ - gw,imx6q-gw52xx
+ - gw,imx6q-gw53xx
+ - gw,imx6q-gw5400-a
+ - gw,imx6q-gw54xx
+ - gw,imx6q-gw551x
+ - gw,imx6q-gw552x
+ - gw,imx6q-gw553x
+ - gw,imx6q-gw560x
+ - gw,imx6q-gw5903
+ - gw,imx6q-gw5904
+ - gw,imx6q-gw5907
+ - gw,imx6q-gw5910
+ - gw,imx6q-gw5912
+ - gw,imx6q-gw5913
+ - const: gw,ventana
+ - const: fsl,imx6q
+
- description: i.MX6QP based Boards
items:
- enum:
@@ -154,10 +175,31 @@
- ysoft,imx6dl-yapp4-ursa # i.MX6 Solo Y Soft IOTA Ursa board
- const: fsl,imx6dl
+ - description: i.MX6DL Gateworks Ventana Boards
+ items:
+ - enum:
+ - gw,imx6dl-gw51xx
+ - gw,imx6dl-gw52xx
+ - gw,imx6dl-gw53xx
+ - gw,imx6dl-gw54xx
+ - gw,imx6dl-gw551x
+ - gw,imx6dl-gw552x
+ - gw,imx6dl-gw553x
+ - gw,imx6dl-gw560x
+ - gw,imx6dl-gw5903
+ - gw,imx6dl-gw5904
+ - gw,imx6dl-gw5907
+ - gw,imx6dl-gw5910
+ - gw,imx6dl-gw5912
+ - gw,imx6dl-gw5913
+ - const: gw,ventana
+ - const: fsl,imx6dl
+
- description: i.MX6SL based Boards
items:
- enum:
- fsl,imx6sl-evk # i.MX6 SoloLite EVK Board
+ - kobo,tolino-shine3
- const: fsl,imx6sl
- description: i.MX6SLL based Boards
@@ -172,6 +214,7 @@
- enum:
- fsl,imx6sx-sabreauto # i.MX6 SoloX Sabre Auto Board
- fsl,imx6sx-sdb # i.MX6 SoloX SDB Board
+ - fsl,imx6sx-sdb-reva # i.MX6 SoloX SDB Rev-A Board
- const: fsl,imx6sx
- description: i.MX6UL based Boards
@@ -239,6 +282,7 @@
items:
- enum:
- fsl,imx7d-sdb # i.MX7 SabreSD Board
+ - fsl,imx7d-sdb-reva # i.MX7 SabreSD Rev-A Board
- novtech,imx7d-meerkat96 # i.MX7 Meerkat96 Board
- toradex,colibri-imx7d # Colibri iMX7 Dual Module
- toradex,colibri-imx7d-emmc # Colibri iMX7 Dual 1GB (eMMC) Module
@@ -263,6 +307,7 @@
- description: i.MX7ULP based Boards
items:
- enum:
+ - ea,imx7ulp-com # i.MX7ULP Embedded Artists COM Board
- fsl,imx7ulp-evk # i.MX7ULP Evaluation Kit
- const: fsl,imx7ulp
@@ -283,7 +328,9 @@
items:
- enum:
- boundary,imx8mq-nitrogen8m # i.MX8MQ NITROGEN Board
+ - einfochips,imx8mq-thor96 # i.MX8MQ Thor96 Board
- fsl,imx8mq-evk # i.MX8MQ EVK Board
+ - google,imx8mq-phanbell # Google Coral Edge TPU
- purism,librem5-devkit # Purism Librem5 devkit
- solidrun,hummingboard-pulse # SolidRun Hummingboard Pulse
- technexion,pico-pi-imx8m # TechNexion PICO-PI-8M evk
@@ -385,6 +432,13 @@
- fsl,ls2088a-rdb
- const: fsl,ls2088a
+ - description: LX2160A based Boards
+ items:
+ - enum:
+ - fsl,lx2160a-qds
+ - fsl,lx2160a-rdb
+ - const: fsl,lx2160a
+
- description: S32V234 based Boards
items:
- enum:
diff --git a/Documentation/devicetree/bindings/arm/idle-states.txt b/Documentation/devicetree/bindings/arm/idle-states.txt
deleted file mode 100644
index 771f5d2..0000000
--- a/Documentation/devicetree/bindings/arm/idle-states.txt
+++ /dev/null
@@ -1,706 +0,0 @@
-==========================================
-ARM idle states binding description
-==========================================
-
-==========================================
-1 - Introduction
-==========================================
-
-ARM systems contain HW capable of managing power consumption dynamically,
-where cores can be put in different low-power states (ranging from simple
-wfi to power gating) according to OS PM policies. The CPU states representing
-the range of dynamic idle states that a processor can enter at run-time, can be
-specified through device tree bindings representing the parameters required
-to enter/exit specific idle states on a given processor.
-
-According to the Server Base System Architecture document (SBSA, [3]), the
-power states an ARM CPU can be put into are identified by the following list:
-
-- Running
-- Idle_standby
-- Idle_retention
-- Sleep
-- Off
-
-The power states described in the SBSA document define the basic CPU states on
-top of which ARM platforms implement power management schemes that allow an OS
-PM implementation to put the processor in different idle states (which include
-states listed above; "off" state is not an idle state since it does not have
-wake-up capabilities, hence it is not considered in this document).
-
-Idle state parameters (e.g. entry latency) are platform specific and need to be
-characterized with bindings that provide the required information to OS PM
-code so that it can build the required tables and use them at runtime.
-
-The device tree binding definition for ARM idle states is the subject of this
-document.
-
-===========================================
-2 - idle-states definitions
-===========================================
-
-Idle states are characterized for a specific system through a set of
-timing and energy related properties, that underline the HW behaviour
-triggered upon idle states entry and exit.
-
-The following diagram depicts the CPU execution phases and related timing
-properties required to enter and exit an idle state:
-
-..__[EXEC]__|__[PREP]__|__[ENTRY]__|__[IDLE]__|__[EXIT]__|__[EXEC]__..
- | | | | |
-
- |<------ entry ------->|
- | latency |
- |<- exit ->|
- | latency |
- |<-------- min-residency -------->|
- |<------- wakeup-latency ------->|
-
- Diagram 1: CPU idle state execution phases
-
-EXEC: Normal CPU execution.
-
-PREP: Preparation phase before committing the hardware to idle mode
- like cache flushing. This is abortable on pending wake-up
- event conditions. The abort latency is assumed to be negligible
- (i.e. less than the ENTRY + EXIT duration). If aborted, CPU
- goes back to EXEC. This phase is optional. If not abortable,
- this should be included in the ENTRY phase instead.
-
-ENTRY: The hardware is committed to idle mode. This period must run
- to completion up to IDLE before anything else can happen.
-
-IDLE: This is the actual energy-saving idle period. This may last
- between 0 and infinite time, until a wake-up event occurs.
-
-EXIT: Period during which the CPU is brought back to operational
- mode (EXEC).
-
-entry-latency: Worst case latency required to enter the idle state. The
-exit-latency may be guaranteed only after entry-latency has passed.
-
-min-residency: Minimum period, including preparation and entry, for a given
-idle state to be worthwhile energywise.
-
-wakeup-latency: Maximum delay between the signaling of a wake-up event and the
-CPU being able to execute normal code again. If not specified, this is assumed
-to be entry-latency + exit-latency.
-
-These timing parameters can be used by an OS in different circumstances.
-
-An idle CPU requires the expected min-residency time to select the most
-appropriate idle state based on the expected expiry time of the next IRQ
-(i.e. wake-up) that causes the CPU to return to the EXEC phase.
-
-An operating system scheduler may need to compute the shortest wake-up delay
-for CPUs in the system by detecting how long will it take to get a CPU out
-of an idle state, e.g.:
-
-wakeup-delay = exit-latency + max(entry-latency - (now - entry-timestamp), 0)
-
-In other words, the scheduler can make its scheduling decision by selecting
-(e.g. waking-up) the CPU with the shortest wake-up delay.
-The wake-up delay must take into account the entry latency if that period
-has not expired. The abortable nature of the PREP period can be ignored
-if it cannot be relied upon (e.g. the PREP deadline may occur much sooner than
-the worst case since it depends on the CPU operating conditions, i.e. caches
-state).
-
-An OS has to reliably probe the wakeup-latency since some devices can enforce
-latency constraint guarantees to work properly, so the OS has to detect the
-worst case wake-up latency it can incur if a CPU is allowed to enter an
-idle state, and possibly to prevent that to guarantee reliable device
-functioning.
-
-The min-residency time parameter deserves further explanation since it is
-expressed in time units but must factor in energy consumption coefficients.
-
-The energy consumption of a cpu when it enters a power state can be roughly
-characterised by the following graph:
-
- |
- |
- |
- e |
- n | /---
- e | /------
- r | /------
- g | /-----
- y | /------
- | ----
- | /|
- | / |
- | / |
- | / |
- | / |
- | / |
- |/ |
- -----|-------+----------------------------------
- 0| 1 time(ms)
-
- Graph 1: Energy vs time example
-
-The graph is split in two parts delimited by time 1ms on the X-axis.
-The graph curve with X-axis values = { x | 0 < x < 1ms } has a steep slope
-and denotes the energy costs incurred while entering and leaving the idle
-state.
-The graph curve in the area delimited by X-axis values = {x | x > 1ms } has
-shallower slope and essentially represents the energy consumption of the idle
-state.
-
-min-residency is defined for a given idle state as the minimum expected
-residency time for a state (inclusive of preparation and entry) after
-which choosing that state become the most energy efficient option. A good
-way to visualise this, is by taking the same graph above and comparing some
-states energy consumptions plots.
-
-For sake of simplicity, let's consider a system with two idle states IDLE1,
-and IDLE2:
-
- |
- |
- |
- | /-- IDLE1
- e | /---
- n | /----
- e | /---
- r | /-----/--------- IDLE2
- g | /-------/---------
- y | ------------ /---|
- | / /---- |
- | / /--- |
- | / /---- |
- | / /--- |
- | --- |
- | / |
- | / |
- |/ | time
- ---/----------------------------+------------------------
- |IDLE1-energy < IDLE2-energy | IDLE2-energy < IDLE1-energy
- |
- IDLE2-min-residency
-
- Graph 2: idle states min-residency example
-
-In graph 2 above, that takes into account idle states entry/exit energy
-costs, it is clear that if the idle state residency time (i.e. time till next
-wake-up IRQ) is less than IDLE2-min-residency, IDLE1 is the better idle state
-choice energywise.
-
-This is mainly down to the fact that IDLE1 entry/exit energy costs are lower
-than IDLE2.
-
-However, the lower power consumption (i.e. shallower energy curve slope) of
-idle state IDLE2 implies that after a suitable time, IDLE2 becomes more energy
-efficient.
-
-The time at which IDLE2 becomes more energy efficient than IDLE1 (and other
-shallower states in a system with multiple idle states) is defined
-IDLE2-min-residency and corresponds to the time when energy consumption of
-IDLE1 and IDLE2 states breaks even.
-
-The definitions provided in this section underpin the idle states
-properties specification that is the subject of the following sections.
-
-===========================================
-3 - idle-states node
-===========================================
-
-ARM processor idle states are defined within the idle-states node, which is
-a direct child of the cpus node [1] and provides a container where the
-processor idle states, defined as device tree nodes, are listed.
-
-- idle-states node
-
- Usage: Optional - On ARM systems, it is a container of processor idle
- states nodes. If the system does not provide CPU
- power management capabilities, or the processor just
- supports idle_standby, an idle-states node is not
- required.
-
- Description: idle-states node is a container node, where its
- subnodes describe the CPU idle states.
-
- Node name must be "idle-states".
-
- The idle-states node's parent node must be the cpus node.
-
- The idle-states node's child nodes can be:
-
- - one or more state nodes
-
- Any other configuration is considered invalid.
-
- An idle-states node defines the following properties:
-
- - entry-method
- Value type: <stringlist>
- Usage and definition depend on ARM architecture version.
- # On ARM v8 64-bit this property is required and must
- be:
- - "psci"
- # On ARM 32-bit systems this property is optional
-
-This assumes that the "enable-method" property is set to "psci" in the cpu
-node[6] that is responsible for setting up CPU idle management in the OS
-implementation.
-
-The nodes describing the idle states (state) can only be defined
-within the idle-states node, any other configuration is considered invalid
-and therefore must be ignored.
-
-===========================================
-4 - state node
-===========================================
-
-A state node represents an idle state description and must be defined as
-follows:
-
-- state node
-
- Description: must be child of the idle-states node
-
- The state node name shall follow standard device tree naming
- rules ([5], 2.2.1 "Node names"), in particular state nodes which
- are siblings within a single common parent must be given a unique name.
-
- The idle state entered by executing the wfi instruction (idle_standby
- SBSA,[3][4]) is considered standard on all ARM platforms and therefore
- must not be listed.
-
- With the definitions provided above, the following list represents
- the valid properties for a state node:
-
- - compatible
- Usage: Required
- Value type: <stringlist>
- Definition: Must be "arm,idle-state".
-
- - local-timer-stop
- Usage: See definition
- Value type: <none>
- Definition: if present the CPU local timer control logic is
- lost on state entry, otherwise it is retained.
-
- - entry-latency-us
- Usage: Required
- Value type: <prop-encoded-array>
- Definition: u32 value representing worst case latency in
- microseconds required to enter the idle state.
-
- - exit-latency-us
- Usage: Required
- Value type: <prop-encoded-array>
- Definition: u32 value representing worst case latency
- in microseconds required to exit the idle state.
- The exit-latency-us duration may be guaranteed
- only after entry-latency-us has passed.
-
- - min-residency-us
- Usage: Required
- Value type: <prop-encoded-array>
- Definition: u32 value representing minimum residency duration
- in microseconds, inclusive of preparation and
- entry, for this idle state to be considered
- worthwhile energy wise (refer to section 2 of
- this document for a complete description).
-
- - wakeup-latency-us:
- Usage: Optional
- Value type: <prop-encoded-array>
- Definition: u32 value representing maximum delay between the
- signaling of a wake-up event and the CPU being
- able to execute normal code again. If omitted,
- this is assumed to be equal to:
-
- entry-latency-us + exit-latency-us
-
- It is important to supply this value on systems
- where the duration of PREP phase (see diagram 1,
- section 2) is non-neglibigle.
- In such systems entry-latency-us + exit-latency-us
- will exceed wakeup-latency-us by this duration.
-
- - status:
- Usage: Optional
- Value type: <string>
- Definition: A standard device tree property [5] that indicates
- the operational status of an idle-state.
- If present, it shall be:
- "okay": to indicate that the idle state is
- operational.
- "disabled": to indicate that the idle state has
- been disabled in firmware so it is not
- operational.
- If the property is not present the idle-state must
- be considered operational.
-
- - idle-state-name:
- Usage: Optional
- Value type: <string>
- Definition: A string used as a descriptive name for the idle
- state.
-
- In addition to the properties listed above, a state node may require
- additional properties specific to the entry-method defined in the
- idle-states node. Please refer to the entry-method bindings
- documentation for properties definitions.
-
-===========================================
-4 - Examples
-===========================================
-
-Example 1 (ARM 64-bit, 16-cpu system, PSCI enable-method):
-
-cpus {
- #size-cells = <0>;
- #address-cells = <2>;
-
- CPU0: cpu@0 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x0>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- CPU1: cpu@1 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x1>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- CPU2: cpu@100 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x100>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- CPU3: cpu@101 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x101>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- CPU4: cpu@10000 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x10000>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- CPU5: cpu@10001 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x10001>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- CPU6: cpu@10100 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x10100>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- CPU7: cpu@10101 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x10101>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- CPU8: cpu@100000000 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x0>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- CPU9: cpu@100000001 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x1>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- CPU10: cpu@100000100 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x100>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- CPU11: cpu@100000101 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x101>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- CPU12: cpu@100010000 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x10000>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- CPU13: cpu@100010001 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x10001>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- CPU14: cpu@100010100 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x10100>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- CPU15: cpu@100010101 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x10101>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- idle-states {
- entry-method = "psci";
-
- CPU_RETENTION_0_0: cpu-retention-0-0 {
- compatible = "arm,idle-state";
- arm,psci-suspend-param = <0x0010000>;
- entry-latency-us = <20>;
- exit-latency-us = <40>;
- min-residency-us = <80>;
- };
-
- CLUSTER_RETENTION_0: cluster-retention-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- arm,psci-suspend-param = <0x1010000>;
- entry-latency-us = <50>;
- exit-latency-us = <100>;
- min-residency-us = <250>;
- wakeup-latency-us = <130>;
- };
-
- CPU_SLEEP_0_0: cpu-sleep-0-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- arm,psci-suspend-param = <0x0010000>;
- entry-latency-us = <250>;
- exit-latency-us = <500>;
- min-residency-us = <950>;
- };
-
- CLUSTER_SLEEP_0: cluster-sleep-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- arm,psci-suspend-param = <0x1010000>;
- entry-latency-us = <600>;
- exit-latency-us = <1100>;
- min-residency-us = <2700>;
- wakeup-latency-us = <1500>;
- };
-
- CPU_RETENTION_1_0: cpu-retention-1-0 {
- compatible = "arm,idle-state";
- arm,psci-suspend-param = <0x0010000>;
- entry-latency-us = <20>;
- exit-latency-us = <40>;
- min-residency-us = <90>;
- };
-
- CLUSTER_RETENTION_1: cluster-retention-1 {
- compatible = "arm,idle-state";
- local-timer-stop;
- arm,psci-suspend-param = <0x1010000>;
- entry-latency-us = <50>;
- exit-latency-us = <100>;
- min-residency-us = <270>;
- wakeup-latency-us = <100>;
- };
-
- CPU_SLEEP_1_0: cpu-sleep-1-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- arm,psci-suspend-param = <0x0010000>;
- entry-latency-us = <70>;
- exit-latency-us = <100>;
- min-residency-us = <300>;
- wakeup-latency-us = <150>;
- };
-
- CLUSTER_SLEEP_1: cluster-sleep-1 {
- compatible = "arm,idle-state";
- local-timer-stop;
- arm,psci-suspend-param = <0x1010000>;
- entry-latency-us = <500>;
- exit-latency-us = <1200>;
- min-residency-us = <3500>;
- wakeup-latency-us = <1300>;
- };
- };
-
-};
-
-Example 2 (ARM 32-bit, 8-cpu system, two clusters):
-
-cpus {
- #size-cells = <0>;
- #address-cells = <1>;
-
- CPU0: cpu@0 {
- device_type = "cpu";
- compatible = "arm,cortex-a15";
- reg = <0x0>;
- cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
- };
-
- CPU1: cpu@1 {
- device_type = "cpu";
- compatible = "arm,cortex-a15";
- reg = <0x1>;
- cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
- };
-
- CPU2: cpu@2 {
- device_type = "cpu";
- compatible = "arm,cortex-a15";
- reg = <0x2>;
- cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
- };
-
- CPU3: cpu@3 {
- device_type = "cpu";
- compatible = "arm,cortex-a15";
- reg = <0x3>;
- cpu-idle-states = <&CPU_SLEEP_0_0 &CLUSTER_SLEEP_0>;
- };
-
- CPU4: cpu@100 {
- device_type = "cpu";
- compatible = "arm,cortex-a7";
- reg = <0x100>;
- cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
- };
-
- CPU5: cpu@101 {
- device_type = "cpu";
- compatible = "arm,cortex-a7";
- reg = <0x101>;
- cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
- };
-
- CPU6: cpu@102 {
- device_type = "cpu";
- compatible = "arm,cortex-a7";
- reg = <0x102>;
- cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
- };
-
- CPU7: cpu@103 {
- device_type = "cpu";
- compatible = "arm,cortex-a7";
- reg = <0x103>;
- cpu-idle-states = <&CPU_SLEEP_1_0 &CLUSTER_SLEEP_1>;
- };
-
- idle-states {
- CPU_SLEEP_0_0: cpu-sleep-0-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- entry-latency-us = <200>;
- exit-latency-us = <100>;
- min-residency-us = <400>;
- wakeup-latency-us = <250>;
- };
-
- CLUSTER_SLEEP_0: cluster-sleep-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- entry-latency-us = <500>;
- exit-latency-us = <1500>;
- min-residency-us = <2500>;
- wakeup-latency-us = <1700>;
- };
-
- CPU_SLEEP_1_0: cpu-sleep-1-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- entry-latency-us = <300>;
- exit-latency-us = <500>;
- min-residency-us = <900>;
- wakeup-latency-us = <600>;
- };
-
- CLUSTER_SLEEP_1: cluster-sleep-1 {
- compatible = "arm,idle-state";
- local-timer-stop;
- entry-latency-us = <800>;
- exit-latency-us = <2000>;
- min-residency-us = <6500>;
- wakeup-latency-us = <2300>;
- };
- };
-
-};
-
-===========================================
-5 - References
-===========================================
-
-[1] ARM Linux Kernel documentation - CPUs bindings
- Documentation/devicetree/bindings/arm/cpus.yaml
-
-[2] ARM Linux Kernel documentation - PSCI bindings
- Documentation/devicetree/bindings/arm/psci.yaml
-
-[3] ARM Server Base System Architecture (SBSA)
- http://infocenter.arm.com/help/index.jsp
-
-[4] ARM Architecture Reference Manuals
- http://infocenter.arm.com/help/index.jsp
-
-[5] Devicetree Specification
- https://www.devicetree.org/specifications/
-
-[6] ARM Linux Kernel documentation - Booting AArch64 Linux
- Documentation/arm64/booting.rst
diff --git a/Documentation/devicetree/bindings/arm/idle-states.yaml b/Documentation/devicetree/bindings/arm/idle-states.yaml
new file mode 100644
index 0000000..ea805c1
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/idle-states.yaml
@@ -0,0 +1,661 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/idle-states.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM idle states binding description
+
+maintainers:
+ - Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+
+description: |+
+ ==========================================
+ 1 - Introduction
+ ==========================================
+
+ ARM systems contain HW capable of managing power consumption dynamically,
+ where cores can be put in different low-power states (ranging from simple wfi
+ to power gating) according to OS PM policies. The CPU states representing the
+ range of dynamic idle states that a processor can enter at run-time, can be
+ specified through device tree bindings representing the parameters required to
+ enter/exit specific idle states on a given processor.
+
+ According to the Server Base System Architecture document (SBSA, [3]), the
+ power states an ARM CPU can be put into are identified by the following list:
+
+ - Running
+ - Idle_standby
+ - Idle_retention
+ - Sleep
+ - Off
+
+ The power states described in the SBSA document define the basic CPU states on
+ top of which ARM platforms implement power management schemes that allow an OS
+ PM implementation to put the processor in different idle states (which include
+ states listed above; "off" state is not an idle state since it does not have
+ wake-up capabilities, hence it is not considered in this document).
+
+ Idle state parameters (e.g. entry latency) are platform specific and need to
+ be characterized with bindings that provide the required information to OS PM
+ code so that it can build the required tables and use them at runtime.
+
+ The device tree binding definition for ARM idle states is the subject of this
+ document.
+
+ ===========================================
+ 2 - idle-states definitions
+ ===========================================
+
+ Idle states are characterized for a specific system through a set of
+ timing and energy related properties, that underline the HW behaviour
+ triggered upon idle states entry and exit.
+
+ The following diagram depicts the CPU execution phases and related timing
+ properties required to enter and exit an idle state:
+
+ ..__[EXEC]__|__[PREP]__|__[ENTRY]__|__[IDLE]__|__[EXIT]__|__[EXEC]__..
+ | | | | |
+
+ |<------ entry ------->|
+ | latency |
+ |<- exit ->|
+ | latency |
+ |<-------- min-residency -------->|
+ |<------- wakeup-latency ------->|
+
+ Diagram 1: CPU idle state execution phases
+
+ EXEC: Normal CPU execution.
+
+ PREP: Preparation phase before committing the hardware to idle mode
+ like cache flushing. This is abortable on pending wake-up
+ event conditions. The abort latency is assumed to be negligible
+ (i.e. less than the ENTRY + EXIT duration). If aborted, CPU
+ goes back to EXEC. This phase is optional. If not abortable,
+ this should be included in the ENTRY phase instead.
+
+ ENTRY: The hardware is committed to idle mode. This period must run
+ to completion up to IDLE before anything else can happen.
+
+ IDLE: This is the actual energy-saving idle period. This may last
+ between 0 and infinite time, until a wake-up event occurs.
+
+ EXIT: Period during which the CPU is brought back to operational
+ mode (EXEC).
+
+ entry-latency: Worst case latency required to enter the idle state. The
+ exit-latency may be guaranteed only after entry-latency has passed.
+
+ min-residency: Minimum period, including preparation and entry, for a given
+ idle state to be worthwhile energywise.
+
+ wakeup-latency: Maximum delay between the signaling of a wake-up event and the
+ CPU being able to execute normal code again. If not specified, this is assumed
+ to be entry-latency + exit-latency.
+
+ These timing parameters can be used by an OS in different circumstances.
+
+ An idle CPU requires the expected min-residency time to select the most
+ appropriate idle state based on the expected expiry time of the next IRQ
+ (i.e. wake-up) that causes the CPU to return to the EXEC phase.
+
+ An operating system scheduler may need to compute the shortest wake-up delay
+ for CPUs in the system by detecting how long it will take to get a CPU out
+ of an idle state, e.g.:
+
+ wakeup-delay = exit-latency + max(entry-latency - (now - entry-timestamp), 0)
+
+ In other words, the scheduler can make its scheduling decision by selecting
+ (e.g. waking-up) the CPU with the shortest wake-up delay.
+ The wake-up delay must take into account the entry latency if that period
+ has not expired. The abortable nature of the PREP period can be ignored
+ if it cannot be relied upon (e.g. the PREP deadline may occur much sooner than
+ the worst case since it depends on the CPU operating conditions, i.e. caches
+ state).
+
+ An OS has to reliably probe the wakeup-latency since some devices can enforce
+ latency constraint guarantees to work properly, so the OS has to detect the
+ worst case wake-up latency it can incur if a CPU is allowed to enter an
+ idle state, and possibly to prevent that to guarantee reliable device
+ functioning.
+
+ The min-residency time parameter deserves further explanation since it is
+ expressed in time units but must factor in energy consumption coefficients.
+
+ The energy consumption of a cpu when it enters a power state can be roughly
+ characterised by the following graph:
+
+ |
+ |
+ |
+ e |
+ n | /---
+ e | /------
+ r | /------
+ g | /-----
+ y | /------
+ | ----
+ | /|
+ | / |
+ | / |
+ | / |
+ | / |
+ | / |
+ |/ |
+ -----|-------+----------------------------------
+ 0| 1 time(ms)
+
+ Graph 1: Energy vs time example
+
+ The graph is split in two parts delimited by time 1ms on the X-axis.
+ The graph curve with X-axis values = { x | 0 < x < 1ms } has a steep slope
+ and denotes the energy costs incurred while entering and leaving the idle
+ state.
+ The graph curve in the area delimited by X-axis values = {x | x > 1ms } has a
+ shallower slope and essentially represents the energy consumption of the idle
+ state.
+
+ min-residency is defined for a given idle state as the minimum expected
+ residency time for a state (inclusive of preparation and entry) after
+ which choosing that state becomes the most energy efficient option. A good
+ way to visualise this is by taking the same graph above and comparing some
+ states' energy consumption plots.
+
+ For the sake of simplicity, let's consider a system with two idle states IDLE1,
+ and IDLE2:
+
+ |
+ |
+ |
+ | /-- IDLE1
+ e | /---
+ n | /----
+ e | /---
+ r | /-----/--------- IDLE2
+ g | /-------/---------
+ y | ------------ /---|
+ | / /---- |
+ | / /--- |
+ | / /---- |
+ | / /--- |
+ | --- |
+ | / |
+ | / |
+ |/ | time
+ ---/----------------------------+------------------------
+ |IDLE1-energy < IDLE2-energy | IDLE2-energy < IDLE1-energy
+ |
+ IDLE2-min-residency
+
+ Graph 2: idle states min-residency example
+
+ In graph 2 above, that takes into account idle states entry/exit energy
+ costs, it is clear that if the idle state residency time (i.e. time till next
+ wake-up IRQ) is less than IDLE2-min-residency, IDLE1 is the better idle state
+ choice energywise.
+
+ This is mainly down to the fact that IDLE1 entry/exit energy costs are lower
+ than IDLE2.
+
+ However, the lower power consumption (i.e. shallower energy curve slope) of
+ idle state IDLE2 implies that after a suitable time, IDLE2 becomes more energy
+ efficient.
+
+ The time at which IDLE2 becomes more energy efficient than IDLE1 (and other
+ shallower states in a system with multiple idle states) is defined as
+ IDLE2-min-residency and corresponds to the time when energy consumption of
+ IDLE1 and IDLE2 states breaks even.
+
+ The definitions provided in this section underpin the idle states
+ properties specification that is the subject of the following sections.
+
+ ===========================================
+ 3 - idle-states node
+ ===========================================
+
+ ARM processor idle states are defined within the idle-states node, which is
+ a direct child of the cpus node [1] and provides a container where the
+ processor idle states, defined as device tree nodes, are listed.
+
+ On ARM systems, it is a container of processor idle states nodes. If the
+ system does not provide CPU power management capabilities, or the processor
+ just supports idle_standby, an idle-states node is not required.
+
+ ===========================================
+ 4 - References
+ ===========================================
+
+ [1] ARM Linux Kernel documentation - CPUs bindings
+ Documentation/devicetree/bindings/arm/cpus.yaml
+
+ [2] ARM Linux Kernel documentation - PSCI bindings
+ Documentation/devicetree/bindings/arm/psci.yaml
+
+ [3] ARM Server Base System Architecture (SBSA)
+ http://infocenter.arm.com/help/index.jsp
+
+ [4] ARM Architecture Reference Manuals
+ http://infocenter.arm.com/help/index.jsp
+
+ [6] ARM Linux Kernel documentation - Booting AArch64 Linux
+ Documentation/arm64/booting.rst
+
+properties:
+ $nodename:
+ const: idle-states
+
+ entry-method:
+ description: |
+ Usage and definition depend on ARM architecture version.
+
+ On ARM v8 64-bit this property is required.
+ On ARM 32-bit systems this property is optional.
+
+ This assumes that the "enable-method" property is set to "psci" in the cpu
+ node[6] that is responsible for setting up CPU idle management in the OS
+ implementation.
+ const: psci
+
+patternProperties:
+ "^(cpu|cluster)-":
+ type: object
+ description: |
+ Each state node represents an idle state description and must be defined
+ as follows.
+
+ The idle state entered by executing the wfi instruction (idle_standby
+ SBSA,[3][4]) is considered standard on all ARM platforms and therefore
+ must not be listed.
+
+ In addition to the properties listed below, a state node may require
+ additional properties specific to the entry-method defined in the
+ idle-states node. Please refer to the entry-method bindings
+ documentation for properties definitions.
+
+ properties:
+ compatible:
+ const: arm,idle-state
+
+ local-timer-stop:
+ description:
+ If present the CPU local timer control logic is
+ lost on state entry, otherwise it is retained.
+ type: boolean
+
+ entry-latency-us:
+ description:
+ Worst case latency in microseconds required to enter the idle state.
+
+ exit-latency-us:
+ description:
+ Worst case latency in microseconds required to exit the idle state.
+ The exit-latency-us duration may be guaranteed only after
+ entry-latency-us has passed.
+
+ min-residency-us:
+ description:
+ Minimum residency duration in microseconds, inclusive of preparation
+ and entry, for this idle state to be considered worthwhile energy wise
+ (refer to section 2 of this document for a complete description).
+
+ wakeup-latency-us:
+ description: |
+ Maximum delay between the signaling of a wake-up event and the CPU
+ being able to execute normal code again. If omitted, this is assumed
+ to be equal to:
+
+ entry-latency-us + exit-latency-us
+
+ It is important to supply this value on systems where the duration of
+ PREP phase (see diagram 1, section 2) is non-negligible. In such
+ systems entry-latency-us + exit-latency-us will exceed
+ wakeup-latency-us by this duration.
+
+ idle-state-name:
+ $ref: /schemas/types.yaml#/definitions/string
+ description:
+ A string used as a descriptive name for the idle state.
+
+ required:
+ - compatible
+ - entry-latency-us
+ - exit-latency-us
+ - min-residency-us
+
+additionalProperties: false
+
+examples:
+ - |
+
+ cpus {
+ #size-cells = <0>;
+ #address-cells = <2>;
+
+ cpu@0 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x0>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
+ &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
+ };
+
+ cpu@1 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x1>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
+ &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
+ };
+
+ cpu@100 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x100>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
+ &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
+ };
+
+ cpu@101 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x101>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
+ &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
+ };
+
+ cpu@10000 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x10000>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
+ &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
+ };
+
+ cpu@10001 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x10001>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
+ &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
+ };
+
+ cpu@10100 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x10100>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
+ &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
+ };
+
+ cpu@10101 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x10101>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
+ &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
+ };
+
+ cpu@100000000 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x0>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
+ &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100000001 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x1>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
+ &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100000100 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x100>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
+ &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100000101 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x101>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
+ &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100010000 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x10000>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
+ &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100010001 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x10001>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
+ &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100010100 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x10100>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
+ &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100010101 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x10101>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
+ &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
+ };
+
+ idle-states {
+ entry-method = "psci";
+
+ CPU_RETENTION_0_0: cpu-retention-0-0 {
+ compatible = "arm,idle-state";
+ arm,psci-suspend-param = <0x0010000>;
+ entry-latency-us = <20>;
+ exit-latency-us = <40>;
+ min-residency-us = <80>;
+ };
+
+ CLUSTER_RETENTION_0: cluster-retention-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ arm,psci-suspend-param = <0x1010000>;
+ entry-latency-us = <50>;
+ exit-latency-us = <100>;
+ min-residency-us = <250>;
+ wakeup-latency-us = <130>;
+ };
+
+ CPU_SLEEP_0_0: cpu-sleep-0-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ arm,psci-suspend-param = <0x0010000>;
+ entry-latency-us = <250>;
+ exit-latency-us = <500>;
+ min-residency-us = <950>;
+ };
+
+ CLUSTER_SLEEP_0: cluster-sleep-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ arm,psci-suspend-param = <0x1010000>;
+ entry-latency-us = <600>;
+ exit-latency-us = <1100>;
+ min-residency-us = <2700>;
+ wakeup-latency-us = <1500>;
+ };
+
+ CPU_RETENTION_1_0: cpu-retention-1-0 {
+ compatible = "arm,idle-state";
+ arm,psci-suspend-param = <0x0010000>;
+ entry-latency-us = <20>;
+ exit-latency-us = <40>;
+ min-residency-us = <90>;
+ };
+
+ CLUSTER_RETENTION_1: cluster-retention-1 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ arm,psci-suspend-param = <0x1010000>;
+ entry-latency-us = <50>;
+ exit-latency-us = <100>;
+ min-residency-us = <270>;
+ wakeup-latency-us = <100>;
+ };
+
+ CPU_SLEEP_1_0: cpu-sleep-1-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ arm,psci-suspend-param = <0x0010000>;
+ entry-latency-us = <70>;
+ exit-latency-us = <100>;
+ min-residency-us = <300>;
+ wakeup-latency-us = <150>;
+ };
+
+ CLUSTER_SLEEP_1: cluster-sleep-1 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ arm,psci-suspend-param = <0x1010000>;
+ entry-latency-us = <500>;
+ exit-latency-us = <1200>;
+ min-residency-us = <3500>;
+ wakeup-latency-us = <1300>;
+ };
+ };
+ };
+
+ - |
+ // Example 2 (ARM 32-bit, 8-cpu system, two clusters):
+
+ cpus {
+ #size-cells = <0>;
+ #address-cells = <1>;
+
+ cpu@0 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a15";
+ reg = <0x0>;
+ cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
+ };
+
+ cpu@1 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a15";
+ reg = <0x1>;
+ cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
+ };
+
+ cpu@2 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a15";
+ reg = <0x2>;
+ cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
+ };
+
+ cpu@3 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a15";
+ reg = <0x3>;
+ cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
+ };
+
+ cpu@100 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a7";
+ reg = <0x100>;
+ cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
+ };
+
+ cpu@101 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a7";
+ reg = <0x101>;
+ cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
+ };
+
+ cpu@102 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a7";
+ reg = <0x102>;
+ cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
+ };
+
+ cpu@103 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a7";
+ reg = <0x103>;
+ cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
+ };
+
+ idle-states {
+ cpu_sleep_0_0: cpu-sleep-0-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ entry-latency-us = <200>;
+ exit-latency-us = <100>;
+ min-residency-us = <400>;
+ wakeup-latency-us = <250>;
+ };
+
+ cluster_sleep_0: cluster-sleep-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ entry-latency-us = <500>;
+ exit-latency-us = <1500>;
+ min-residency-us = <2500>;
+ wakeup-latency-us = <1700>;
+ };
+
+ cpu_sleep_1_0: cpu-sleep-1-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ entry-latency-us = <300>;
+ exit-latency-us = <500>;
+ min-residency-us = <900>;
+ wakeup-latency-us = <600>;
+ };
+
+ cluster_sleep_1: cluster-sleep-1 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ entry-latency-us = <800>;
+ exit-latency-us = <2000>;
+ min-residency-us = <6500>;
+ wakeup-latency-us = <2300>;
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/msm/qcom,llcc.yaml b/Documentation/devicetree/bindings/arm/msm/qcom,llcc.yaml
index 5587490..79902f4 100644
--- a/Documentation/devicetree/bindings/arm/msm/qcom,llcc.yaml
+++ b/Documentation/devicetree/bindings/arm/msm/qcom,llcc.yaml
@@ -47,7 +47,7 @@
- |
#include <dt-bindings/interrupt-controller/arm-gic.h>
- cache-controller@1100000 {
+ system-cache-controller@1100000 {
compatible = "qcom,sdm845-llcc";
reg = <0x1100000 0x200000>, <0x1300000 0x50000> ;
reg-names = "llcc_base", "llcc_broadcast_base";
diff --git a/Documentation/devicetree/bindings/arm/psci.yaml b/Documentation/devicetree/bindings/arm/psci.yaml
index 7abdf58b3..8ef8542 100644
--- a/Documentation/devicetree/bindings/arm/psci.yaml
+++ b/Documentation/devicetree/bindings/arm/psci.yaml
@@ -102,6 +102,34 @@
[1] Kernel documentation - ARM idle states bindings
Documentation/devicetree/bindings/arm/idle-states.txt
+ "#power-domain-cells":
+ description:
+ The number of cells in a PM domain specifier as per binding in [3].
+ Must be 0 so as to represent a single PM domain.
+
+ ARM systems can have multiple cores, sometimes in a hierarchical
+ arrangement. This often, but not always, maps directly to the processor
+ power topology of the system. Individual nodes in a topology have their
+ own specific power states and can be better represented hierarchically.
+
+ For these cases, the definitions of the idle states for the CPUs and the
+ CPU topology, must conform to the binding in [3]. The idle states
+ themselves must conform to the binding in [4] and must specify the
+ arm,psci-suspend-param property.
+
+ It should also be noted that, in PSCI firmware v1.0 the OS-Initiated
+ (OSI) CPU suspend mode is introduced. Using a hierarchical representation
+ helps to implement support for OSI mode and OS implementations may choose
+ to mandate it.
+
+ [3] Documentation/devicetree/bindings/power/power_domain.txt
+ [4] Documentation/devicetree/bindings/power/domain-idle-state.txt
+
+ power-domains:
+ $ref: '/schemas/types.yaml#/definitions/phandle-array'
+ description:
+ List of phandles and PM domain specifiers, as defined by bindings of the
+ PM domain provider.
required:
- compatible
@@ -160,4 +188,80 @@
cpu_on = <0x95c10002>;
cpu_off = <0x95c10001>;
};
+
+ - |+
+
+ // Case 4: CPUs and CPU idle states described using the hierarchical model.
+
+ cpus {
+ #size-cells = <0>;
+ #address-cells = <1>;
+
+ CPU0: cpu@0 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53", "arm,armv8";
+ reg = <0x0>;
+ enable-method = "psci";
+ power-domains = <&CPU_PD0>;
+ power-domain-names = "psci";
+ };
+
+ CPU1: cpu@100 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57", "arm,armv8";
+ reg = <0x100>;
+ enable-method = "psci";
+ power-domains = <&CPU_PD1>;
+ power-domain-names = "psci";
+ };
+
+ idle-states {
+
+ CPU_PWRDN: cpu-power-down {
+ compatible = "arm,idle-state";
+ arm,psci-suspend-param = <0x0000001>;
+ entry-latency-us = <10>;
+ exit-latency-us = <10>;
+ min-residency-us = <100>;
+ };
+
+ CLUSTER_RET: cluster-retention {
+ compatible = "domain-idle-state";
+ arm,psci-suspend-param = <0x1000011>;
+ entry-latency-us = <500>;
+ exit-latency-us = <500>;
+ min-residency-us = <2000>;
+ };
+
+ CLUSTER_PWRDN: cluster-power-down {
+ compatible = "domain-idle-state";
+ arm,psci-suspend-param = <0x1000031>;
+ entry-latency-us = <2000>;
+ exit-latency-us = <2000>;
+ min-residency-us = <6000>;
+ };
+ };
+ };
+
+ psci {
+ compatible = "arm,psci-1.0";
+ method = "smc";
+
+ CPU_PD0: cpu-pd0 {
+ #power-domain-cells = <0>;
+ domain-idle-states = <&CPU_PWRDN>;
+ power-domains = <&CLUSTER_PD>;
+ };
+
+ CPU_PD1: cpu-pd1 {
+ #power-domain-cells = <0>;
+ domain-idle-states = <&CPU_PWRDN>;
+ power-domains = <&CLUSTER_PD>;
+ };
+
+ CLUSTER_PD: cluster-pd {
+ #power-domain-cells = <0>;
+ domain-idle-states = <&CLUSTER_RET>, <&CLUSTER_PWRDN>;
+ };
+ };
...
diff --git a/Documentation/devicetree/bindings/arm/qcom.yaml b/Documentation/devicetree/bindings/arm/qcom.yaml
index e39d8f0..5976c0b 100644
--- a/Documentation/devicetree/bindings/arm/qcom.yaml
+++ b/Documentation/devicetree/bindings/arm/qcom.yaml
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
%YAML 1.2
---
-$id: http://devicetree.org/schemas/bindings/arm/qcom.yaml#
+$id: http://devicetree.org/schemas/arm/qcom.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: QCOM device tree bindings
@@ -24,28 +24,30 @@
The 'SoC' element must be one of the following strings:
- apq8016
- apq8074
- apq8084
- apq8096
- msm8916
- msm8974
- msm8992
- msm8994
- msm8996
- mdm9615
- ipq8074
- sdm845
+ apq8016
+ apq8074
+ apq8084
+ apq8096
+ ipq8074
+ mdm9615
+ msm8916
+ msm8974
+ msm8992
+ msm8994
+ msm8996
+ sc7180
+ sdm845
The 'board' element must be one of the following strings:
- cdp
- liquid
- dragonboard
- mtp
- sbc
- hk01
- qrd
+ cdp
+ dragonboard
+ hk01
+ idp
+ liquid
+ mtp
+ qrd
+ sbc
The 'soc_version' and 'board_version' elements take the form of v<Major>.<Minor>
where the minor number may be omitted when it's zero, i.e. v1.0 is the same
@@ -144,4 +146,8 @@
- qcom,ipq8074-hk01
- const: qcom,ipq8074
+ - items:
+ - enum:
+ - qcom,sc7180-idp
+ - const: qcom,sc7180
...
diff --git a/Documentation/devicetree/bindings/arm/rockchip.yaml b/Documentation/devicetree/bindings/arm/rockchip.yaml
index d9847b3..874b0ea 100644
--- a/Documentation/devicetree/bindings/arm/rockchip.yaml
+++ b/Documentation/devicetree/bindings/arm/rockchip.yaml
@@ -409,6 +409,9 @@
- description: Pine64 RockPro64
items:
+ - enum:
+ - pine64,rockpro64-v2.1
+ - pine64,rockpro64-v2.0
- const: pine64,rockpro64
- const: rockchip,rk3399
@@ -422,6 +425,12 @@
- const: radxa,rockpi4
- const: rockchip,rk3399
+ - description: Radxa ROCK Pi N10
+ items:
+ - const: radxa,rockpi-n10
+ - const: vamrs,rk3399pro-vmarc-som
+ - const: rockchip,rk3399pro
+
- description: Radxa Rock2 Square
items:
- const: radxa,rock2-square
diff --git a/Documentation/devicetree/bindings/arm/sprd.yaml b/Documentation/devicetree/bindings/arm/sprd.yaml
deleted file mode 100644
index c35fb84..0000000
--- a/Documentation/devicetree/bindings/arm/sprd.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
-# Copyright 2019 Unisoc Inc.
-%YAML 1.2
----
-$id: http://devicetree.org/schemas/arm/sprd.yaml#
-$schema: http://devicetree.org/meta-schemas/core.yaml#
-
-title: Unisoc platforms device tree bindings
-
-maintainers:
- - Orson Zhai <orsonzhai@gmail.com>
- - Baolin Wang <baolin.wang7@gmail.com>
- - Chunyan Zhang <zhang.lyra@gmail.com>
-
-properties:
- $nodename:
- const: '/'
- compatible:
- oneOf:
- - items:
- - enum:
- - sprd,sc9836-openphone
- - const: sprd,sc9836
- - items:
- - enum:
- - sprd,sp9860g-1h10
- - const: sprd,sc9860
- - items:
- - enum:
- - sprd,sp9863a-1h10
- - const: sprd,sc9863a
-
-...
diff --git a/Documentation/devicetree/bindings/arm/sprd/sprd.yaml b/Documentation/devicetree/bindings/arm/sprd/sprd.yaml
new file mode 100644
index 0000000..0258a96b
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/sprd/sprd.yaml
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright 2019 Unisoc Inc.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/sprd/sprd.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Unisoc platforms device tree bindings
+
+maintainers:
+ - Orson Zhai <orsonzhai@gmail.com>
+ - Baolin Wang <baolin.wang7@gmail.com>
+ - Chunyan Zhang <zhang.lyra@gmail.com>
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - sprd,sc9836-openphone
+ - const: sprd,sc9836
+ - items:
+ - enum:
+ - sprd,sp9860g-1h10
+ - const: sprd,sc9860
+ - items:
+ - enum:
+ - sprd,sp9863a-1h10
+ - const: sprd,sc9863a
+
+...
diff --git a/Documentation/devicetree/bindings/arm/stm32/mlahb.txt b/Documentation/devicetree/bindings/arm/stm32/mlahb.txt
deleted file mode 100644
index 25307aa..0000000
--- a/Documentation/devicetree/bindings/arm/stm32/mlahb.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-ML-AHB interconnect bindings
-
-These bindings describe the STM32 SoCs ML-AHB interconnect bus which connects
-a Cortex-M subsystem with dedicated memories.
-The MCU SRAM and RETRAM memory parts can be accessed through different addresses
-(see "RAM aliases" in [1]) using different buses (see [2]) : balancing the
-Cortex-M firmware accesses among those ports allows to tune the system
-performance.
-
-[1]: https://www.st.com/resource/en/reference_manual/dm00327659.pdf
-[2]: https://wiki.st.com/stm32mpu/wiki/STM32MP15_RAM_mapping
-
-Required properties:
-- compatible: should be "simple-bus"
-- dma-ranges: describes memory addresses translation between the local CPU and
- the remote Cortex-M processor. Each memory region, is declared with
- 3 parameters:
- - param 1: device base address (Cortex-M processor address)
- - param 2: physical base address (local CPU address)
- - param 3: size of the memory region.
-
-The Cortex-M remote processor accessed via the mlahb interconnect is described
-by a child node.
-
-Example:
-mlahb {
- compatible = "simple-bus";
- #address-cells = <1>;
- #size-cells = <1>;
- dma-ranges = <0x00000000 0x38000000 0x10000>,
- <0x10000000 0x10000000 0x60000>,
- <0x30000000 0x30000000 0x60000>;
-
- m4_rproc: m4@10000000 {
- ...
- };
-};
diff --git a/Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml b/Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml
new file mode 100644
index 0000000..68917bb
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/stm32/st,mlahb.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: STMicroelectronics STM32 ML-AHB interconnect bindings
+
+maintainers:
+ - Fabien Dessenne <fabien.dessenne@st.com>
+ - Arnaud Pouliquen <arnaud.pouliquen@st.com>
+
+description: |
+ These bindings describe the STM32 SoCs ML-AHB interconnect bus which connects
+ a Cortex-M subsystem with dedicated memories. The MCU SRAM and RETRAM memory
+ parts can be accessed through different addresses (see "RAM aliases" in [1])
+ using different buses (see [2]): balancing the Cortex-M firmware accesses
+ among those ports allows tuning of the system performance.
+ [1]: https://www.st.com/resource/en/reference_manual/dm00327659.pdf
+ [2]: https://wiki.st.com/stm32mpu/wiki/STM32MP15_RAM_mapping
+
+allOf:
+ - $ref: /schemas/simple-bus.yaml#
+
+properties:
+ compatible:
+ contains:
+ enum:
+ - st,mlahb
+
+ dma-ranges:
+ description: |
+ Describe memory addresses translation between the local CPU and the
+ remote Cortex-M processor. Each memory region is declared with
+ 3 parameters:
+ - param 1: device base address (Cortex-M processor address)
+ - param 2: physical base address (local CPU address)
+ - param 3: size of the memory region.
+ maxItems: 3
+
+ '#address-cells':
+ const: 1
+
+ '#size-cells':
+ const: 1
+
+required:
+ - compatible
+ - '#address-cells'
+ - '#size-cells'
+ - dma-ranges
+
+examples:
+ - |
+ mlahb: ahb {
+ compatible = "st,mlahb", "simple-bus";
+ #address-cells = <1>;
+ #size-cells = <1>;
+ reg = <0x10000000 0x40000>;
+ ranges;
+ dma-ranges = <0x00000000 0x38000000 0x10000>,
+ <0x10000000 0x10000000 0x60000>,
+ <0x30000000 0x30000000 0x60000>;
+
+ m4_rproc: m4@10000000 {
+ reg = <0x10000000 0x40000>;
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/stm32/st,stm32-syscon.yaml b/Documentation/devicetree/bindings/arm/stm32/st,stm32-syscon.yaml
new file mode 100644
index 0000000..0dedf94
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/stm32/st,stm32-syscon.yaml
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/stm32/st,stm32-syscon.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: STMicroelectronics STM32 Platforms System Controller bindings
+
+maintainers:
+ - Alexandre Torgue <alexandre.torgue@st.com>
+ - Christophe Roullier <christophe.roullier@st.com>
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - st,stm32mp157-syscfg
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+
+examples:
+ - |
+ #include <dt-bindings/clock/stm32mp1-clks.h>
+ syscfg: syscon@50020000 {
+ compatible = "st,stm32mp157-syscfg", "syscon";
+ reg = <0x50020000 0x400>;
+ clocks = <&rcc SYSCFG>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/stm32/stm32-syscon.txt b/Documentation/devicetree/bindings/arm/stm32/stm32-syscon.txt
deleted file mode 100644
index c92d411..0000000
--- a/Documentation/devicetree/bindings/arm/stm32/stm32-syscon.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-STMicroelectronics STM32 Platforms System Controller
-
-Properties:
- - compatible : should contain two values. First value must be :
- - " st,stm32mp157-syscfg " - for stm32mp157 based SoCs,
- second value must be always "syscon".
- - reg : offset and length of the register set.
- - clocks: phandle to the syscfg clock
-
- Example:
- syscfg: syscon@50020000 {
- compatible = "st,stm32mp157-syscfg", "syscon";
- reg = <0x50020000 0x400>;
- clocks = <&rcc SYSCFG>;
- };
-
diff --git a/Documentation/devicetree/bindings/arm/sunxi.yaml b/Documentation/devicetree/bindings/arm/sunxi.yaml
index cffe8bb..327ce67 100644
--- a/Documentation/devicetree/bindings/arm/sunxi.yaml
+++ b/Documentation/devicetree/bindings/arm/sunxi.yaml
@@ -342,6 +342,16 @@
- const: libretech,all-h3-cc-h5
- const: allwinner,sun50i-h5
+ - description: Libre Computer Board ALL-H3-IT H5
+ items:
+ - const: libretech,all-h3-it-h5
+ - const: allwinner,sun50i-h5
+
+ - description: Libre Computer Board ALL-H5-CC H5
+ items:
+ - const: libretech,all-h5-cc-h5
+ - const: allwinner,sun50i-h5
+
- description: Lichee Pi One
items:
- const: licheepi,licheepi-one
@@ -470,6 +480,12 @@
- const: emlid,neutis-n5
- const: allwinner,sun50i-h5
+ - description: Emlid Neutis N5H3 Developer Board
+ items:
+ - const: emlid,neutis-n5h3-devboard
+ - const: emlid,neutis-n5h3
+ - const: allwinner,sun8i-h3
+
- description: NextThing Co. CHIP
items:
- const: nextthing,chip
@@ -599,11 +615,16 @@
- const: pine64,pine64-plus
- const: allwinner,sun50i-a64
- - description: Pine64 PineH64
+ - description: Pine64 PineH64 model A
items:
- const: pine64,pine-h64
- const: allwinner,sun50i-h6
+ - description: Pine64 PineH64 model B
+ items:
+ - const: pine64,pine-h64-model-b
+ - const: allwinner,sun50i-h6
+
- description: Pine64 LTS
items:
- const: pine64,pine64-lts
diff --git a/Documentation/devicetree/bindings/arm/sunxi/allwinner,sun4i-a10-mbus.yaml b/Documentation/devicetree/bindings/arm/sunxi/allwinner,sun4i-a10-mbus.yaml
new file mode 100644
index 0000000..9370e64
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/sunxi/allwinner,sun4i-a10-mbus.yaml
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/sunxi/allwinner,sun4i-a10-mbus.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner Memory Bus (MBUS) controller
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+description: |
+ The MBUS controller drives the MBUS that other devices in the SoC
+ will use to perform DMA. It also has a register interface that
+ allows monitoring and controlling the bandwidth and priorities for
+ masters on that bus.
+
+ Each device having to perform their DMA through the MBUS must have
+ the interconnects and interconnect-names properties set to the MBUS
+ controller and with "dma-mem" as the interconnect name.
+
+properties:
+ "#interconnect-cells":
+ const: 1
+ description:
+ The content of the cell is the MBUS ID.
+
+ compatible:
+ enum:
+ - allwinner,sun5i-a13-mbus
+ - allwinner,sun8i-h3-mbus
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ dma-ranges:
+ description:
+ See section 2.3.9 of the DeviceTree Specification.
+
+required:
+ - "#interconnect-cells"
+ - compatible
+ - reg
+ - clocks
+ - dma-ranges
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/sun5i-ccu.h>
+
+ mbus: dram-controller@1c01000 {
+ compatible = "allwinner,sun5i-a13-mbus";
+ reg = <0x01c01000 0x1000>;
+ clocks = <&ccu CLK_MBUS>;
+ dma-ranges = <0x00000000 0x40000000 0x20000000>;
+ #interconnect-cells = <1>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/sunxi/sunxi-mbus.txt b/Documentation/devicetree/bindings/arm/sunxi/sunxi-mbus.txt
deleted file mode 100644
index 2005bb4..0000000
--- a/Documentation/devicetree/bindings/arm/sunxi/sunxi-mbus.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-Allwinner Memory Bus (MBUS) controller
-
-The MBUS controller drives the MBUS that other devices in the SoC will
-use to perform DMA. It also has a register interface that allows to
-monitor and control the bandwidth and priorities for masters on that
-bus.
-
-Required properties:
- - compatible: Must be one of:
- - allwinner,sun5i-a13-mbus
- - allwinner,sun8i-h3-mbus
- - reg: Offset and length of the register set for the controller
- - clocks: phandle to the clock driving the controller
- - dma-ranges: See section 2.3.9 of the DeviceTree Specification
- - #interconnect-cells: Must be one, with the argument being the MBUS
- port ID
-
-Each device having to perform their DMA through the MBUS must have the
-interconnects and interconnect-names properties set to the MBUS
-controller and with "dma-mem" as the interconnect name.
-
-Example:
-
-mbus: dram-controller@1c01000 {
- compatible = "allwinner,sun5i-a13-mbus";
- reg = <0x01c01000 0x1000>;
- clocks = <&ccu CLK_MBUS>;
- dma-ranges = <0x00000000 0x40000000 0x20000000>;
- #interconnect-cells = <1>;
-};
-
-fe0: display-frontend@1e00000 {
- compatible = "allwinner,sun5i-a13-display-frontend";
- ...
- interconnects = <&mbus 19>;
- interconnect-names = "dma-mem";
-};
diff --git a/Documentation/devicetree/bindings/arm/ux500.yaml b/Documentation/devicetree/bindings/arm/ux500.yaml
new file mode 100644
index 0000000..accaee9
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/ux500.yaml
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: GPL-2.0-only
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/ux500.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Ux500 platforms device tree bindings
+
+maintainers:
+ - Linus Walleij <linus.walleij@linaro.org>
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ oneOf:
+
+ - description: ST-Ericsson HREF (pre-v60)
+ items:
+ - const: st-ericsson,mop500
+ - const: st-ericsson,u8500
+
+ - description: ST-Ericsson HREF (v60+)
+ items:
+ - const: st-ericsson,hrefv60+
+ - const: st-ericsson,u8500
+
+ - description: Calao Systems Snowball
+ items:
+ - const: calaosystems,snowball-a9500
+ - const: st-ericsson,u9500
+
+ - description: Samsung Galaxy S III mini (GT-I8190)
+ items:
+ - const: samsung,golden
+ - const: st-ericsson,u8500
diff --git a/Documentation/devicetree/bindings/ata/ahci-platform.txt b/Documentation/devicetree/bindings/ata/ahci-platform.txt
index 55c6fab..77091a2 100644
--- a/Documentation/devicetree/bindings/ata/ahci-platform.txt
+++ b/Documentation/devicetree/bindings/ata/ahci-platform.txt
@@ -9,8 +9,6 @@
Required properties:
- compatible : compatible string, one of:
- - "allwinner,sun4i-a10-ahci"
- - "allwinner,sun8i-r40-ahci"
- "brcm,iproc-ahci"
- "hisilicon,hisi-ahci"
- "cavium,octeon-7130-ahci"
@@ -45,8 +43,6 @@
- #address-cells : number of cells to encode an address
- #size-cells : number of cells representing the size of an address
-For allwinner,sun8i-r40-ahci, the reset property must be present.
-
Sub-nodes required properties:
- reg : the port number
And at least one of the following properties:
@@ -60,14 +56,6 @@
interrupts = <115>;
};
- ahci: sata@1c18000 {
- compatible = "allwinner,sun4i-a10-ahci";
- reg = <0x01c18000 0x1000>;
- interrupts = <56>;
- clocks = <&pll6 0>, <&ahb_gates 25>;
- target-supply = <&reg_ahci_5v>;
- };
-
With sub-nodes:
sata@f7e90000 {
compatible = "marvell,berlin2q-achi", "generic-ahci";
diff --git a/Documentation/devicetree/bindings/ata/allwinner,sun4i-a10-ahci.yaml b/Documentation/devicetree/bindings/ata/allwinner,sun4i-a10-ahci.yaml
new file mode 100644
index 0000000..cb530b4
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/allwinner,sun4i-a10-ahci.yaml
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/allwinner,sun4i-a10-ahci.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 AHCI SATA Controller bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+properties:
+ compatible:
+ const: allwinner,sun4i-a10-ahci
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: AHCI Bus Clock
+ - description: AHCI Module Clock
+
+ interrupts:
+ maxItems: 1
+
+ target-supply:
+ description: Regulator for SATA target power
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |
+ ahci: sata@1c18000 {
+ compatible = "allwinner,sun4i-a10-ahci";
+ reg = <0x01c18000 0x1000>;
+ interrupts = <56>;
+ clocks = <&pll6 0>, <&ahb_gates 25>;
+ target-supply = <&reg_ahci_5v>;
+ };
diff --git a/Documentation/devicetree/bindings/ata/allwinner,sun8i-r40-ahci.yaml b/Documentation/devicetree/bindings/ata/allwinner,sun8i-r40-ahci.yaml
new file mode 100644
index 0000000..e6b42a1
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/allwinner,sun8i-r40-ahci.yaml
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/allwinner,sun8i-r40-ahci.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner R40 AHCI SATA Controller bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+properties:
+ compatible:
+ const: allwinner,sun8i-r40-ahci
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: AHCI Bus Clock
+ - description: AHCI Module Clock
+
+ interrupts:
+ maxItems: 1
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ const: ahci
+
+ ahci-supply:
+ description: Regulator for the AHCI controller
+
+ phy-supply:
+ description: Regulator for the SATA PHY power
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - interrupts
+ - resets
+ - reset-names
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/sun8i-r40-ccu.h>
+ #include <dt-bindings/reset/sun8i-r40-ccu.h>
+
+ ahci: sata@1c18000 {
+ compatible = "allwinner,sun8i-r40-ahci";
+ reg = <0x01c18000 0x1000>;
+ interrupts = <GIC_SPI 56 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&ccu CLK_BUS_SATA>, <&ccu CLK_SATA>;
+ resets = <&ccu RST_BUS_SATA>;
+ reset-names = "ahci";
+ ahci-supply = <&reg_dldo4>;
+ phy-supply = <&reg_eldo3>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/ata/brcm,sata-brcm.txt b/Documentation/devicetree/bindings/ata/brcm,sata-brcm.txt
index 7713a41..b9ae4ce 100644
--- a/Documentation/devicetree/bindings/ata/brcm,sata-brcm.txt
+++ b/Documentation/devicetree/bindings/ata/brcm,sata-brcm.txt
@@ -5,6 +5,7 @@
Required properties:
- compatible : should be one or more of
+ "brcm,bcm7216-ahci"
"brcm,bcm7425-ahci"
"brcm,bcm7445-ahci"
"brcm,bcm-nsp-ahci"
@@ -14,6 +15,12 @@
- reg-names : "ahci" and "top-ctrl"
- interrupts : interrupt mapping for SATA IRQ
+Optional properties:
+
+- resets: for "brcm,bcm7216-ahci" must be a valid reset phandle
+ pointing to the RESCAL reset controller provider node.
+- reset-names: for "brcm,bcm7216-ahci", must be "rescal".
+
Also see ahci-platform.txt.
Example:
diff --git a/Documentation/devicetree/bindings/ata/faraday,ftide010.txt b/Documentation/devicetree/bindings/ata/faraday,ftide010.txt
deleted file mode 100644
index a0c64a2..0000000
--- a/Documentation/devicetree/bindings/ata/faraday,ftide010.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-* Faraday Technology FTIDE010 PATA controller
-
-This controller is the first Faraday IDE interface block, used in the
-StorLink SL2312 and SL3516, later known as the Cortina Systems Gemini
-platform. The controller can do PIO modes 0 through 4, Multi-word DMA
-(MWDM)modes 0 through 2 and Ultra DMA modes 0 through 6.
-
-On the Gemini platform, this PATA block is accompanied by a PATA to
-SATA bridge in order to support SATA. This is why a phandle to that
-controller is compulsory on that platform.
-
-The timing properties are unique per-SoC, not per-board.
-
-Required properties:
-- compatible: should be one of
- "cortina,gemini-pata", "faraday,ftide010"
- "faraday,ftide010"
-- interrupts: interrupt for the block
-- reg: registers and size for the block
-
-Optional properties:
-- clocks: a SoC clock running the peripheral.
-- clock-names: should be set to "PCLK" for the peripheral clock.
-
-Required properties for "cortina,gemini-pata" compatible:
-- sata: a phande to the Gemini PATA to SATA bridge, see
- cortina,gemini-sata-bridge.txt for details.
-
-Example:
-
-ata@63000000 {
- compatible = "cortina,gemini-pata", "faraday,ftide010";
- reg = <0x63000000 0x100>;
- interrupts = <4 IRQ_TYPE_EDGE_RISING>;
- clocks = <&gcc GEMINI_CLK_GATE_IDE>;
- clock-names = "PCLK";
- sata = <&sata>;
-};
diff --git a/Documentation/devicetree/bindings/ata/faraday,ftide010.yaml b/Documentation/devicetree/bindings/ata/faraday,ftide010.yaml
new file mode 100644
index 0000000..bfc6357
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/faraday,ftide010.yaml
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/faraday,ftide010.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Faraday Technology FTIDE010 PATA controller
+
+maintainers:
+ - Linus Walleij <linus.walleij@linaro.org>
+
+description: |
+ This controller is the first Faraday IDE interface block, used in the
+ StorLink SL3512 and SL3516, later known as the Cortina Systems Gemini
+ platform. The controller can do PIO modes 0 through 4, Multi-word DMA
+ (MWDM) modes 0 through 2 and Ultra DMA modes 0 through 6.
+
+ On the Gemini platform, this PATA block is accompanied by a PATA to
+ SATA bridge in order to support SATA. This is why a phandle to that
+ controller is compulsory on that platform.
+
+ The timing properties are unique per-SoC, not per-board.
+
+properties:
+ compatible:
+ oneOf:
+ - const: faraday,ftide010
+ - items:
+ - const: cortina,gemini-pata
+ - const: faraday,ftide010
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+
+ clock-names:
+ const: PCLK
+
+ sata:
+ description:
+ phandle to the Gemini PATA to SATA bridge, if available
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+allOf:
+ - $ref: pata-common.yaml#
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: cortina,gemini-pata
+
+ then:
+ required:
+ - sata
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/clock/cortina,gemini-clock.h>
+
+ ide@63000000 {
+ compatible = "cortina,gemini-pata", "faraday,ftide010";
+ reg = <0x63000000 0x100>;
+ interrupts = <4 IRQ_TYPE_EDGE_RISING>;
+ clocks = <&gcc GEMINI_CLK_GATE_IDE>;
+ clock-names = "PCLK";
+ sata = <&sata>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ ide-port@0 {
+ reg = <0>;
+ };
+ ide-port@1 {
+ reg = <1>;
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/ata/pata-common.yaml b/Documentation/devicetree/bindings/ata/pata-common.yaml
new file mode 100644
index 0000000..fc5ebbe
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/pata-common.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/pata-common.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Common Properties for Parallel AT attachment (PATA) controllers
+
+maintainers:
+ - Linus Walleij <linus.walleij@linaro.org>
+
+description: |
+ This document defines device tree properties common to most Parallel
+ ATA (PATA, also known as IDE) AT attachment storage devices.
+ It doesn't constitute a device tree binding specification by itself but is
+ meant to be referenced by device tree bindings.
+
+ The PATA (IDE) controller-specific device tree bindings are responsible for
+ defining whether each property is required or optional.
+
+properties:
+ $nodename:
+ pattern: "^ide(@.*)?$"
+ description:
+ Specifies the host controller node. PATA host controller nodes are named
+ "ide".
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+patternProperties:
+ "^ide-port@[0-1]$":
+ description: |
+ DT nodes for ports connected on the PATA host. The master drive will have
+ ID number 0 and the slave drive will have ID number 1. The PATA port
+ nodes will be named "ide-port".
+ type: object
+
+ properties:
+ reg:
+ minimum: 0
+ maximum: 1
+ description:
+ The ID number of the drive port, 0 for the master port and 1 for the
+ slave port.
+
+...
diff --git a/Documentation/devicetree/bindings/ata/sata-common.yaml b/Documentation/devicetree/bindings/ata/sata-common.yaml
new file mode 100644
index 0000000..6783a4d
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/sata-common.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/sata-common.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Common Properties for Serial AT attachment (SATA) controllers
+
+maintainers:
+ - Linus Walleij <linus.walleij@linaro.org>
+
+description: |
+ This document defines device tree properties common to most Serial
+ AT attachment (SATA) storage devices. It doesn't constitute a device tree
+ binding specification by itself but is meant to be referenced by device
+ tree bindings.
+
+ The SATA controller-specific device tree bindings are responsible for
+ defining whether each property is required or optional.
+
+properties:
+ $nodename:
+ pattern: "^sata(@.*)?$"
+ description:
+ Specifies the host controller node. SATA host controller nodes are named
+ "sata"
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+patternProperties:
+ "^sata-port@[0-9a-e]$":
+ description: |
+ DT nodes for ports connected on the SATA host. The SATA port
+ nodes will be named "sata-port".
+ type: object
+
+ properties:
+ reg:
+ minimum: 0
+ maximum: 14
+ description:
+ The ID number of the drive port. SATA can potentially use a port
+ multiplier making it possible to connect up to 15 disks to a single
+ SATA port.
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ahb-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ahb-clk.yaml
new file mode 100644
index 0000000..558db4b
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ahb-clk.yaml
@@ -0,0 +1,108 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-ahb-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 AHB Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ enum:
+ - allwinner,sun4i-a10-ahb-clk
+ - allwinner,sun6i-a31-ahb1-clk
+ - allwinner,sun8i-h3-ahb2-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 4
+ description: >
+ The parent order must match the hardware programming order.
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun4i-a10-ahb-clk
+
+ then:
+ properties:
+ clocks:
+ maxItems: 1
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun6i-a31-ahb1-clk
+
+ then:
+ properties:
+ clocks:
+ maxItems: 4
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun8i-h3-ahb2-clk
+
+ then:
+ properties:
+ clocks:
+ maxItems: 2
+
+examples:
+ - |
+ ahb@1c20054 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun4i-a10-ahb-clk";
+ reg = <0x01c20054 0x4>;
+ clocks = <&axi>;
+ clock-output-names = "ahb";
+ };
+
+ - |
+ ahb1@1c20054 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun6i-a31-ahb1-clk";
+ reg = <0x01c20054 0x4>;
+ clocks = <&osc32k>, <&osc24M>, <&axi>, <&pll6 0>;
+ clock-output-names = "ahb1";
+ };
+
+ - |
+ ahb2_clk@1c2005c {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun8i-h3-ahb2-clk";
+ reg = <0x01c2005c 0x4>;
+ clocks = <&ahb1>, <&pll6d2>;
+ clock-output-names = "ahb2";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb0-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb0-clk.yaml
new file mode 100644
index 0000000..b1e3d73
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb0-clk.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-apb0-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 APB0 Bus Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun4i-a10-apb0-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ apb0@1c20054 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun4i-a10-apb0-clk";
+ reg = <0x01c20054 0x4>;
+ clocks = <&ahb>;
+ clock-output-names = "apb0";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb1-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb1-clk.yaml
new file mode 100644
index 0000000..51b7a6d
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb1-clk.yaml
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-apb1-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 APB1 Bus Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun4i-a10-apb1-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 3
+ description: >
+ The parent order must match the hardware programming order.
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c20058 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun4i-a10-apb1-clk";
+ reg = <0x01c20058 0x4>;
+ clocks = <&osc24M>, <&pll6 1>, <&osc32k>;
+ clock-output-names = "apb1";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-axi-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-axi-clk.yaml
new file mode 100644
index 0000000..d801158
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-axi-clk.yaml
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-axi-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 AXI Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ enum:
+ - allwinner,sun4i-a10-axi-clk
+ - allwinner,sun8i-a23-axi-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ axi@1c20054 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun4i-a10-axi-clk";
+ reg = <0x01c20054 0x4>;
+ clocks = <&cpu>;
+ clock-output-names = "axi";
+ };
+
+ - |
+ axi_clk@1c20050 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun8i-a23-axi-clk";
+ reg = <0x01c20050 0x4>;
+ clocks = <&cpu>;
+ clock-output-names = "axi";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-cpu-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-cpu-clk.yaml
new file mode 100644
index 0000000..0dfafba
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-cpu-clk.yaml
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-cpu-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 CPU Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun4i-a10-cpu-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 4
+ description: >
+ The parent order must match the hardware programming order.
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ cpu@1c20054 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun4i-a10-cpu-clk";
+ reg = <0x01c20054 0x4>;
+ clocks = <&osc32k>, <&osc24M>, <&pll1>, <&dummy>;
+ clock-output-names = "cpu";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-display-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-display-clk.yaml
new file mode 100644
index 0000000..7484a7a
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-display-clk.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-display-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 Display Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ "#reset-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun4i-a10-display-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 3
+ description: >
+ The parent order must match the hardware programming order.
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - "#reset-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c20104 {
+ #clock-cells = <0>;
+ #reset-cells = <0>;
+ compatible = "allwinner,sun4i-a10-display-clk";
+ reg = <0x01c20104 0x4>;
+ clocks = <&pll3>, <&pll7>, <&pll5 1>;
+ clock-output-names = "de-be";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-gates-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-gates-clk.yaml
new file mode 100644
index 0000000..ed1b212
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-gates-clk.yaml
@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-gates-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 Bus Gates Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 1
+ description: >
+ This additional argument passed to that clock is the offset of
+ the bit controlling this particular gate in the register.
+
+ compatible:
+ oneOf:
+ - const: allwinner,sun4i-a10-gates-clk
+ - const: allwinner,sun4i-a10-axi-gates-clk
+ - const: allwinner,sun4i-a10-ahb-gates-clk
+ - const: allwinner,sun5i-a10s-ahb-gates-clk
+ - const: allwinner,sun5i-a13-ahb-gates-clk
+ - const: allwinner,sun7i-a20-ahb-gates-clk
+ - const: allwinner,sun6i-a31-ahb1-gates-clk
+ - const: allwinner,sun8i-a23-ahb1-gates-clk
+ - const: allwinner,sun9i-a80-ahb0-gates-clk
+ - const: allwinner,sun9i-a80-ahb1-gates-clk
+ - const: allwinner,sun9i-a80-ahb2-gates-clk
+ - const: allwinner,sun4i-a10-apb0-gates-clk
+ - const: allwinner,sun5i-a10s-apb0-gates-clk
+ - const: allwinner,sun5i-a13-apb0-gates-clk
+ - const: allwinner,sun7i-a20-apb0-gates-clk
+ - const: allwinner,sun9i-a80-apb0-gates-clk
+ - const: allwinner,sun8i-a83t-apb0-gates-clk
+ - const: allwinner,sun4i-a10-apb1-gates-clk
+ - const: allwinner,sun5i-a13-apb1-gates-clk
+ - const: allwinner,sun5i-a10s-apb1-gates-clk
+ - const: allwinner,sun6i-a31-apb1-gates-clk
+ - const: allwinner,sun7i-a20-apb1-gates-clk
+ - const: allwinner,sun8i-a23-apb1-gates-clk
+ - const: allwinner,sun9i-a80-apb1-gates-clk
+ - const: allwinner,sun6i-a31-apb2-gates-clk
+ - const: allwinner,sun8i-a23-apb2-gates-clk
+ - const: allwinner,sun8i-a83t-bus-gates-clk
+ - const: allwinner,sun9i-a80-apbs-gates-clk
+ - const: allwinner,sun4i-a10-dram-gates-clk
+
+ - items:
+ - const: allwinner,sun5i-a13-dram-gates-clk
+ - const: allwinner,sun4i-a10-gates-clk
+
+ - items:
+ - const: allwinner,sun8i-h3-apb0-gates-clk
+ - const: allwinner,sun4i-a10-gates-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-indices:
+ minItems: 1
+ maxItems: 64
+
+ clock-output-names:
+ minItems: 1
+ maxItems: 64
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-indices
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c2005c {
+ #clock-cells = <1>;
+ compatible = "allwinner,sun4i-a10-axi-gates-clk";
+ reg = <0x01c2005c 0x4>;
+ clocks = <&axi>;
+ clock-indices = <0>;
+ clock-output-names = "axi_dram";
+ };
+
+ - |
+ clk@1c20060 {
+ #clock-cells = <1>;
+ compatible = "allwinner,sun4i-a10-ahb-gates-clk";
+ reg = <0x01c20060 0x8>;
+ clocks = <&ahb>;
+ clock-indices = <0>, <1>,
+ <2>, <3>,
+ <4>, <5>, <6>,
+ <7>, <8>, <9>,
+ <10>, <11>, <12>,
+ <13>, <14>, <16>,
+ <17>, <18>, <20>,
+ <21>, <22>, <23>,
+ <24>, <25>, <26>,
+ <32>, <33>, <34>,
+ <35>, <36>, <37>,
+ <40>, <41>, <43>,
+ <44>, <45>,
+ <46>, <47>,
+ <50>, <52>;
+ clock-output-names = "ahb_usb0", "ahb_ehci0",
+ "ahb_ohci0", "ahb_ehci1",
+ "ahb_ohci1", "ahb_ss", "ahb_dma",
+ "ahb_bist", "ahb_mmc0", "ahb_mmc1",
+ "ahb_mmc2", "ahb_mmc3", "ahb_ms",
+ "ahb_nand", "ahb_sdram", "ahb_ace",
+ "ahb_emac", "ahb_ts", "ahb_spi0",
+ "ahb_spi1", "ahb_spi2", "ahb_spi3",
+ "ahb_pata", "ahb_sata", "ahb_gps",
+ "ahb_ve", "ahb_tvd", "ahb_tve0",
+ "ahb_tve1", "ahb_lcd0", "ahb_lcd1",
+ "ahb_csi0", "ahb_csi1", "ahb_hdmi",
+ "ahb_de_be0", "ahb_de_be1",
+ "ahb_de_fe0", "ahb_de_fe1",
+ "ahb_mp", "ahb_mali400";
+ };
+
+
+ - |
+ clk@1c20068 {
+ #clock-cells = <1>;
+ compatible = "allwinner,sun4i-a10-apb0-gates-clk";
+ reg = <0x01c20068 0x4>;
+ clocks = <&apb0>;
+ clock-indices = <0>, <1>,
+ <2>, <3>,
+ <5>, <6>,
+ <7>, <10>;
+ clock-output-names = "apb0_codec", "apb0_spdif",
+ "apb0_ac97", "apb0_iis",
+ "apb0_pio", "apb0_ir0",
+ "apb0_ir1", "apb0_keypad";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mbus-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mbus-clk.yaml
new file mode 100644
index 0000000..18f131e
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mbus-clk.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-mbus-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 MBUS Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ enum:
+ - allwinner,sun5i-a13-mbus-clk
+ - allwinner,sun8i-a23-mbus-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 3
+ description: >
+ The parent order must match the hardware programming order.
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c2015c {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun5i-a13-mbus-clk";
+ reg = <0x01c2015c 0x4>;
+ clocks = <&osc24M>, <&pll6 1>, <&pll5 1>;
+ clock-output-names = "mbus";
+ };
+
+ - |
+ clk@1c2015c {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun8i-a23-mbus-clk";
+ reg = <0x01c2015c 0x4>;
+ clocks = <&osc24M>, <&pll6 1>, <&pll5>;
+ clock-output-names = "mbus";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mmc-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mmc-clk.yaml
new file mode 100644
index 0000000..5199285
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mmc-clk.yaml
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-mmc-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 MMC Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 1
+ description: >
+ There are three different outputs: the main clock, with the ID 0,
+ and the output and sample clocks, with the IDs 1 and 2,
+ respectively.
+
+ compatible:
+ enum:
+ - allwinner,sun4i-a10-mmc-clk
+ - allwinner,sun9i-a80-mmc-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 2
+ maxItems: 3
+ description: >
+ The parent order must match the hardware programming order.
+
+ clock-output-names:
+ maxItems: 3
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun4i-a10-mmc-clk
+
+then:
+ properties:
+ clocks:
+ maxItems: 3
+
+else:
+ properties:
+ clocks:
+ maxItems: 2
+
+examples:
+ - |
+ clk@1c20088 {
+ #clock-cells = <1>;
+ compatible = "allwinner,sun4i-a10-mmc-clk";
+ reg = <0x01c20088 0x4>;
+ clocks = <&osc24M>, <&pll6 1>, <&pll5 1>;
+ clock-output-names = "mmc0",
+ "mmc0_output",
+ "mmc0_sample";
+ };
+
+ - |
+ clk@6000410 {
+ #clock-cells = <1>;
+ compatible = "allwinner,sun9i-a80-mmc-clk";
+ reg = <0x06000410 0x4>;
+ clocks = <&osc24M>, <&pll4>;
+ clock-output-names = "mmc0", "mmc0_output",
+ "mmc0_sample";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod0-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod0-clk.yaml
new file mode 100644
index 0000000..3e2abe3
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod0-clk.yaml
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-mod0-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 Module 0 Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+select:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - allwinner,sun4i-a10-mod0-clk
+ - allwinner,sun9i-a80-mod0-clk
+
+ # The PRCM on the A31 and A23 will have the reg property missing,
+ # since it's set at the upper level node, and will be validated by
+ # PRCM's schema. Make sure we only validate standalone nodes.
+ required:
+ - compatible
+ - reg
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ enum:
+ - allwinner,sun4i-a10-mod0-clk
+ - allwinner,sun9i-a80-mod0-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ # On the A80, the PRCM mod0 clocks have 2 parents.
+ minItems: 2
+ maxItems: 3
+ description: >
+ The parent order must match the hardware programming order.
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c20080 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun4i-a10-mod0-clk";
+ reg = <0x01c20080 0x4>;
+ clocks = <&osc24M>, <&pll6 1>, <&pll5 1>;
+ clock-output-names = "nand";
+ };
+
+ - |
+ clk@8001454 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun4i-a10-mod0-clk";
+ reg = <0x08001454 0x4>;
+ clocks = <&osc32k>, <&osc24M>;
+ clock-output-names = "r_ir";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod1-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod1-clk.yaml
new file mode 100644
index 0000000..7ddb55c
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod1-clk.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-mod1-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 Module 1 Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun4i-a10-mod1-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 4
+ description: >
+ The parent order must match the hardware programming order.
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/sun4i-a10-pll2.h>
+
+ clk@1c200c0 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun4i-a10-mod1-clk";
+ reg = <0x01c200c0 0x4>;
+ clocks = <&pll2 SUN4I_A10_PLL2_8X>,
+ <&pll2 SUN4I_A10_PLL2_4X>,
+ <&pll2 SUN4I_A10_PLL2_2X>,
+ <&pll2 SUN4I_A10_PLL2_1X>;
+ clock-output-names = "spdif";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-osc-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-osc-clk.yaml
new file mode 100644
index 0000000..69cfa4a
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-osc-clk.yaml
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-osc-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 Gatable Oscillator Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun4i-a10-osc-clk
+
+ reg:
+ maxItems: 1
+
+ clock-frequency:
+ description: >
+ Frequency of the main oscillator.
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clock-frequency
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ osc24M: clk@1c20050 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun4i-a10-osc-clk";
+ reg = <0x01c20050 0x4>;
+ clock-frequency = <24000000>;
+ clock-output-names = "osc24M";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll1-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll1-clk.yaml
new file mode 100644
index 0000000..e9c4cf8
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll1-clk.yaml
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-pll1-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 CPU PLL Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ enum:
+ - allwinner,sun4i-a10-pll1-clk
+ - allwinner,sun6i-a31-pll1-clk
+ - allwinner,sun8i-a23-pll1-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c20000 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun4i-a10-pll1-clk";
+ reg = <0x01c20000 0x4>;
+ clocks = <&osc24M>;
+ clock-output-names = "pll1";
+ };
+
+ - |
+ clk@1c20000 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun6i-a31-pll1-clk";
+ reg = <0x01c20000 0x4>;
+ clocks = <&osc24M>;
+ clock-output-names = "pll1";
+ };
+
+ - |
+ clk@1c20000 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun8i-a23-pll1-clk";
+ reg = <0x01c20000 0x4>;
+ clocks = <&osc24M>;
+ clock-output-names = "pll1";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll3-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll3-clk.yaml
new file mode 100644
index 0000000..4b80a42
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll3-clk.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-pll3-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 Video PLL Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun4i-a10-pll3-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c20010 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun4i-a10-pll3-clk";
+ reg = <0x01c20010 0x4>;
+ clocks = <&osc3M>;
+ clock-output-names = "pll3";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll5-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll5-clk.yaml
new file mode 100644
index 0000000..415bd77
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll5-clk.yaml
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-pll5-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 DRAM PLL Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 1
+ description: >
+ The first output is the DRAM clock output, the second is meant
+ for peripherals on the SoC.
+
+ compatible:
+ const: allwinner,sun4i-a10-pll5-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-output-names:
+ maxItems: 2
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c20020 {
+ #clock-cells = <1>;
+ compatible = "allwinner,sun4i-a10-pll5-clk";
+ reg = <0x01c20020 0x4>;
+ clocks = <&osc24M>;
+ clock-output-names = "pll5_ddr", "pll5_other";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll6-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll6-clk.yaml
new file mode 100644
index 0000000..ec5652f7
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll6-clk.yaml
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-pll6-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 Peripheral PLL Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 1
+ description: >
+ The first output is the SATA clock output, the second is the
+ regular PLL output, the third is a PLL output at twice the rate.
+
+ compatible:
+ const: allwinner,sun4i-a10-pll6-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-output-names:
+ maxItems: 3
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c20028 {
+ #clock-cells = <1>;
+ compatible = "allwinner,sun4i-a10-pll6-clk";
+ reg = <0x01c20028 0x4>;
+ clocks = <&osc24M>;
+ clock-output-names = "pll6_sata", "pll6_other", "pll6";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-tcon-ch0-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-tcon-ch0-clk.yaml
new file mode 100644
index 0000000..0a335c6
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-tcon-ch0-clk.yaml
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-tcon-ch0-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 TCON Channel 0 Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ "#reset-cells":
+ const: 1
+
+ compatible:
+ enum:
+ - allwinner,sun4i-a10-tcon-ch0-clk
+ - allwinner,sun4i-a10-tcon-ch1-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 4
+ description: >
+ The parent order must match the hardware programming order.
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun4i-a10-tcon-ch0-clk
+
+then:
+ required:
+ - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c20118 {
+ #clock-cells = <0>;
+ #reset-cells = <1>;
+ compatible = "allwinner,sun4i-a10-tcon-ch0-clk";
+ reg = <0x01c20118 0x4>;
+ clocks = <&pll3>, <&pll7>, <&pll3x2>, <&pll7x2>;
+ clock-output-names = "tcon-ch0-sclk";
+ };
+
+ - |
+ clk@1c2012c {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun4i-a10-tcon-ch1-clk";
+ reg = <0x01c2012c 0x4>;
+ clocks = <&pll3>, <&pll7>, <&pll3x2>, <&pll7x2>;
+ clock-output-names = "tcon-ch1-sclk";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-usb-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-usb-clk.yaml
new file mode 100644
index 0000000..cd95d25
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-usb-clk.yaml
@@ -0,0 +1,166 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-usb-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 USB Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 1
+ description: >
+ The additional ID argument passed to the clock shall refer to
+ the index of the output.
+
+ "#reset-cells":
+ const: 1
+
+ compatible:
+ enum:
+ - allwinner,sun4i-a10-usb-clk
+ - allwinner,sun5i-a13-usb-clk
+ - allwinner,sun6i-a31-usb-clk
+ - allwinner,sun8i-a23-usb-clk
+ - allwinner,sun8i-h3-usb-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-output-names:
+ minItems: 2
+ maxItems: 8
+
+required:
+ - "#clock-cells"
+ - "#reset-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun4i-a10-usb-clk
+
+ then:
+ properties:
+ clock-output-names:
+ maxItems: 3
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun5i-a13-usb-clk
+
+ then:
+ properties:
+ clock-output-names:
+ maxItems: 2
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun6i-a31-usb-clk
+
+ then:
+ properties:
+ clock-output-names:
+ maxItems: 6
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun8i-a23-usb-clk
+
+ then:
+ properties:
+ clock-output-names:
+ maxItems: 5
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun8i-h3-usb-clk
+
+ then:
+ properties:
+ clock-output-names:
+ maxItems: 8
+
+examples:
+ - |
+ clk@1c200cc {
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ compatible = "allwinner,sun4i-a10-usb-clk";
+ reg = <0x01c200cc 0x4>;
+ clocks = <&pll6 1>;
+ clock-output-names = "usb_ohci0", "usb_ohci1", "usb_phy";
+ };
+
+ - |
+ clk@1c200cc {
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ compatible = "allwinner,sun5i-a13-usb-clk";
+ reg = <0x01c200cc 0x4>;
+ clocks = <&pll6 1>;
+ clock-output-names = "usb_ohci0", "usb_phy";
+ };
+
+ - |
+ clk@1c200cc {
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ compatible = "allwinner,sun6i-a31-usb-clk";
+ reg = <0x01c200cc 0x4>;
+ clocks = <&osc24M>;
+ clock-output-names = "usb_phy0", "usb_phy1", "usb_phy2",
+ "usb_ohci0", "usb_ohci1",
+ "usb_ohci2";
+ };
+
+ - |
+ clk@1c200cc {
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ compatible = "allwinner,sun8i-a23-usb-clk";
+ reg = <0x01c200cc 0x4>;
+ clocks = <&osc24M>;
+ clock-output-names = "usb_phy0", "usb_phy1", "usb_hsic",
+ "usb_hsic_12M", "usb_ohci0";
+ };
+
+ - |
+ clk@1c200cc {
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ compatible = "allwinner,sun8i-h3-usb-clk";
+ reg = <0x01c200cc 0x4>;
+ clocks = <&osc24M>;
+ clock-output-names = "usb_phy0", "usb_phy1",
+ "usb_phy2", "usb_phy3",
+ "usb_ohci0", "usb_ohci1",
+ "usb_ohci2", "usb_ohci3";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ve-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ve-clk.yaml
new file mode 100644
index 0000000..5dfd0c1
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ve-clk.yaml
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-ve-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 Video Engine Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ "#reset-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun4i-a10-ve-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - "#reset-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c2013c {
+ #clock-cells = <0>;
+ #reset-cells = <0>;
+ compatible = "allwinner,sun4i-a10-ve-clk";
+ reg = <0x01c2013c 0x4>;
+ clocks = <&pll4>;
+ clock-output-names = "ve";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun5i-a13-ahb-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun5i-a13-ahb-clk.yaml
new file mode 100644
index 0000000..99add79
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun5i-a13-ahb-clk.yaml
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun5i-a13-ahb-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A13 AHB Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun5i-a13-ahb-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 3
+ description: >
+ The parent order must match the hardware programming order.
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ ahb@1c20054 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun5i-a13-ahb-clk";
+ reg = <0x01c20054 0x4>;
+ clocks = <&axi>, <&cpu>, <&pll6 1>;
+ clock-output-names = "ahb";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun6i-a31-pll6-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun6i-a31-pll6-clk.yaml
new file mode 100644
index 0000000..5f37720
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun6i-a31-pll6-clk.yaml
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun6i-a31-pll6-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A31 Peripheral PLL Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 1
+ description: >
+ The first output is the regular PLL output, the second is a PLL
+ output at twice the rate.
+
+ compatible:
+ const: allwinner,sun6i-a31-pll6-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-output-names:
+ maxItems: 2
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c20028 {
+ #clock-cells = <1>;
+ compatible = "allwinner,sun6i-a31-pll6-clk";
+ reg = <0x01c20028 0x4>;
+ clocks = <&osc24M>;
+ clock-output-names = "pll6", "pll6x2";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-gmac-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-gmac-clk.yaml
new file mode 100644
index 0000000..59e5dce
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-gmac-clk.yaml
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun7i-a20-gmac-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A20 GMAC TX Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun7i-a20-gmac-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 2
+ description: >
+ The parent clocks shall be fixed rate dummy clocks at 25 MHz and
+ 125 MHz, respectively.
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c20164 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun7i-a20-gmac-clk";
+ reg = <0x01c20164 0x4>;
+ clocks = <&mii_phy_tx_clk>, <&gmac_int_tx_clk>;
+ clock-output-names = "gmac_tx";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-out-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-out-clk.yaml
new file mode 100644
index 0000000..c745733
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-out-clk.yaml
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun7i-a20-out-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A20 Output Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun7i-a20-out-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 3
+ description: >
+ The parent order must match the hardware programming order.
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c201f0 {
+ #clock-cells = <0>;
+ compatible = "allwinner,sun7i-a20-out-clk";
+ reg = <0x01c201f0 0x4>;
+ clocks = <&osc24M_32k>, <&osc32k>, <&osc24M>;
+ clock-output-names = "clk_out_a";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun8i-a83t-de2-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun8i-a83t-de2-clk.yaml
new file mode 100644
index 0000000..3f995d2
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun8i-a83t-de2-clk.yaml
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: GPL-2.0+
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun8i-a83t-de2-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A83T Display Engine 2/3 Clock Controller Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+properties:
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ compatible:
+ oneOf:
+ - const: allwinner,sun8i-a83t-de2-clk
+ - const: allwinner,sun8i-h3-de2-clk
+ - const: allwinner,sun8i-v3s-de2-clk
+ - const: allwinner,sun50i-a64-de2-clk
+ - const: allwinner,sun50i-h5-de2-clk
+ - const: allwinner,sun50i-h6-de2-clk
+ - items:
+ - const: allwinner,sun8i-r40-de2-clk
+ - const: allwinner,sun8i-h3-de2-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: Bus Clock
+ - description: Module Clock
+
+ clock-names:
+ items:
+ - const: bus
+ - const: mod
+
+ resets:
+ maxItems: 1
+
+required:
+ - "#clock-cells"
+ - "#reset-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - resets
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/sun8i-h3-ccu.h>
+ #include <dt-bindings/reset/sun8i-h3-ccu.h>
+
+ de2_clocks: clock@1000000 {
+ compatible = "allwinner,sun8i-h3-de2-clk";
+ reg = <0x01000000 0x100000>;
+ clocks = <&ccu CLK_BUS_DE>,
+ <&ccu CLK_DE>;
+ clock-names = "bus",
+ "mod";
+ resets = <&ccu RST_BUS_DE>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun8i-h3-bus-gates-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun8i-h3-bus-gates-clk.yaml
new file mode 100644
index 0000000..3eb2bf6
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun8i-h3-bus-gates-clk.yaml
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/allwinner,sun8i-h3-bus-gates-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner H3 Bus Gates Clock Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+deprecated: true
+
+properties:
+ "#clock-cells":
+ const: 1
+ description: >
+ This additional argument passed to that clock is the offset of
+ the bit controlling this particular gate in the register.
+
+ compatible:
+ const: allwinner,sun8i-h3-bus-gates-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 4
+
+ clock-names:
+ maxItems: 4
+ description: >
+ The parent order must match the hardware programming order.
+
+ clock-indices:
+ minItems: 1
+ maxItems: 64
+
+ clock-output-names:
+ minItems: 1
+ maxItems: 64
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-indices
+ - clock-names
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clk@1c20060 {
+ #clock-cells = <1>;
+ compatible = "allwinner,sun8i-h3-bus-gates-clk";
+ reg = <0x01c20060 0x14>;
+ clocks = <&ahb1>, <&ahb2>, <&apb1>, <&apb2>;
+ clock-names = "ahb1", "ahb2", "apb1", "apb2";
+ clock-indices = <5>, <6>, <8>,
+ <9>, <10>, <13>,
+ <14>, <17>, <18>,
+ <19>, <20>,
+ <21>, <23>,
+ <24>, <25>,
+ <26>, <27>,
+ <28>, <29>,
+ <30>, <31>, <32>,
+ <35>, <36>, <37>,
+ <40>, <41>, <43>,
+ <44>, <52>, <53>,
+ <54>, <64>,
+ <65>, <69>, <72>,
+ <76>, <77>, <78>,
+ <96>, <97>, <98>,
+ <112>, <113>,
+ <114>, <115>,
+ <116>, <128>, <135>;
+ clock-output-names = "bus_ce", "bus_dma", "bus_mmc0",
+ "bus_mmc1", "bus_mmc2", "bus_nand",
+ "bus_sdram", "bus_gmac", "bus_ts",
+ "bus_hstimer", "bus_spi0",
+ "bus_spi1", "bus_otg",
+ "bus_otg_ehci0", "bus_ehci1",
+ "bus_ehci2", "bus_ehci3",
+ "bus_otg_ohci0", "bus_ohci1",
+ "bus_ohci2", "bus_ohci3", "bus_ve",
+ "bus_lcd0", "bus_lcd1", "bus_deint",
+ "bus_csi", "bus_tve", "bus_hdmi",
+ "bus_de", "bus_gpu", "bus_msgbox",
+ "bus_spinlock", "bus_codec",
+