Merge branch 'linux-linaro-lsk-v3.18' into linux-linaro-lsk-v3.18-rt
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index c81839b..692171f 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -5,7 +5,7 @@
 using the kernel crypto API.
 
 For a more detailed description of supported parameters see:
-http://code.google.com/p/cryptsetup/wiki/DMCrypt
+https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt
 
 Parameters: <cipher> <key> <iv_offset> <device path> \
 	      <offset> [<#opt_params> <opt_params>]
@@ -51,7 +51,7 @@
     Otherwise #opt_params is the number of following arguments.
 
     Example of optional parameters section:
-        1 allow_discards
+        3 allow_discards same_cpu_crypt submit_from_crypt_cpus
 
 allow_discards
     Block discard requests (a.k.a. TRIM) are passed through the crypt device.
@@ -63,11 +63,24 @@
     used space etc.) if the discarded blocks can be located easily on the
     device later.
 
+same_cpu_crypt
+    Perform encryption using the same cpu that IO was submitted on.
+    The default is to use an unbound workqueue so that encryption work
+    is automatically balanced between available CPUs.
+
+submit_from_crypt_cpus
+    Disable offloading writes to a separate thread after encryption.
+    There are some situations where offloading write bios from the
+    encryption threads to a single thread degrades performance
+    significantly.  The default is to offload write bios to the same
+    thread because it benefits CFQ to have writes submitted using the
+    same context.
+
 Example scripts
 ===============
 LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
 encryption with dm-crypt using the 'cryptsetup' utility, see
-http://code.google.com/p/cryptsetup/
+https://gitlab.com/cryptsetup/cryptsetup
 
 [[
 #!/bin/sh
diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt
index 9884681..0075f70 100644
--- a/Documentation/device-mapper/verity.txt
+++ b/Documentation/device-mapper/verity.txt
@@ -125,7 +125,7 @@
 
 The full specification of kernel parameters and on-disk metadata format
 is available at the cryptsetup project's wiki page
-  http://code.google.com/p/cryptsetup/wiki/DMVerity
+  https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity
 
 Status
 ======
@@ -142,7 +142,7 @@
 
 A command line tool veritysetup is available to compute or verify
 the hash tree or activate the kernel device. This is available from
-the cryptsetup upstream repository http://code.google.com/p/cryptsetup/
+the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/
 (as a libcryptsetup extension).
 
 Create hash on the device:
diff --git a/Documentation/devicetree/bindings/thermal/thermal.txt b/Documentation/devicetree/bindings/thermal/thermal.txt
index f5db6b7..99d6608 100644
--- a/Documentation/devicetree/bindings/thermal/thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/thermal.txt
@@ -167,6 +167,13 @@
 			by means of sensor ID. Additional coefficients are
 			interpreted as constant offset.
 
+- sustainable-power:	An estimate of the sustainable power (in mW) that the
+  Type: unsigned	thermal zone can dissipate at the desired
+  Size: one cell	control temperature.  For reference, the
+			sustainable power of a 4'' phone is typically
+			2000mW, while that of a 10'' tablet is around
+			4500mW.
+
 Note: The delay properties are bound to the maximum dT/dt (temperature
 derivative over time) in two situations for a thermal zone:
 (i)  - when passive cooling is activated (polling-delay-passive); and
@@ -546,6 +553,8 @@
 		 */
 		coefficients =		<1200	-345	890>;
 
+		sustainable-power = <2500>;
+
 		trips {
 			/* Trips are based on resulting linear equation */
 			cpu-trip: cpu-trip {
diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt
index fca24c9..7165358 100644
--- a/Documentation/thermal/cpu-cooling-api.txt
+++ b/Documentation/thermal/cpu-cooling-api.txt
@@ -3,7 +3,7 @@
 
 Written by Amit Daniel Kachhap <amit.kachhap@linaro.org>
 
-Updated: 12 May 2012
+Updated: 6 Jan 2015
 
 Copyright (c)  2012 Samsung Electronics Co., Ltd(http://www.samsung.com)
 
@@ -25,8 +25,173 @@
 
    clip_cpus: cpumask of cpus where the frequency constraints will happen.
 
-1.1.2 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
+1.1.2 struct thermal_cooling_device *of_cpufreq_cooling_register(
+	struct device_node *np, const struct cpumask *clip_cpus)
+
+    This interface function registers the cpufreq cooling device with
+    the name "thermal-cpufreq-%x", linking it with a device tree node
+    so that it can be bound via the thermal DT code. This API supports
+    multiple instances of cpufreq cooling devices.
+
+    np: pointer to the cooling device device tree node
+    clip_cpus: cpumask of cpus where the frequency constraints will happen.
+
+1.1.3 struct thermal_cooling_device *cpufreq_power_cooling_register(
+    const struct cpumask *clip_cpus, u32 capacitance,
+    get_static_t plat_static_func)
+
+Similar to cpufreq_cooling_register, this function registers a cpufreq
+cooling device.  Using this function, the cooling device will
+implement the power extensions by using a simple cpu power model.  The
+cpus must have registered their OPPs using the OPP library.
+
+The additional parameters are needed for the power model (See 2. Power
+models).  "capacitance" is the dynamic power coefficient (See 2.1
+Dynamic power).  "plat_static_func" is a function to calculate the
+static power consumed by these cpus (See 2.2 Static power).
+
+1.1.4 struct thermal_cooling_device *of_cpufreq_power_cooling_register(
+    struct device_node *np, const struct cpumask *clip_cpus, u32 capacitance,
+    get_static_t plat_static_func)
+
+Similar to cpufreq_power_cooling_register, this function registers a
+cpufreq cooling device with power extensions using the device tree
+information supplied by the np parameter.
+
+1.1.5 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 
     This interface function unregisters the "thermal-cpufreq-%x" cooling device.
 
     cdev: Cooling device pointer which has to be unregistered.
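+
+A minimal registration sketch is shown below.  It is illustrative
+only and not part of the kernel sources: the capacitance value of 350,
+the example_*() names and the plat_get_static() callback are
+hypothetical, platform-specific inputs (see 2. Power models), and the
+code assumes <linux/cpu_cooling.h> and <linux/err.h>:
+
+	static struct thermal_cooling_device *example_cdev;
+
+	/* np: device tree node of the cooling device */
+	static int example_register_cooling(struct device_node *np)
+	{
+		/* 350 is only an example dynamic power coefficient */
+		example_cdev = of_cpufreq_power_cooling_register(np,
+						cpu_present_mask, 350,
+						plat_get_static);
+		if (IS_ERR(example_cdev))
+			return PTR_ERR(example_cdev);
+
+		return 0;
+	}
+
+	static void example_unregister_cooling(void)
+	{
+		cpufreq_cooling_unregister(example_cdev);
+	}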
+
+2. Power models
+
+The power API registration functions provide a simple power model for
+CPUs.  The current power is calculated as dynamic + (optionally)
+static power.  This power model requires that the operating-points of
+the CPUs are registered using the kernel's opp library and the
+`cpufreq_frequency_table` is assigned to the `struct device` of the
+cpu.  If you are using CONFIG_CPUFREQ_DT then the
+`cpufreq_frequency_table` should already be assigned to the cpu
+device.
+
+The `plat_static_func` parameter of `cpufreq_power_cooling_register()`
+and `of_cpufreq_power_cooling_register()` is optional.  If you don't
+provide it, only dynamic power will be considered.
+
+2.1 Dynamic power
+
+The dynamic power consumption of a processor depends on many factors.
+For a given processor implementation the primary factors are:
+
+- The time the processor spends running, consuming dynamic power, as
+  compared to the time in idle states where dynamic consumption is
+  negligible.  Herein we refer to this as 'utilisation'.
+- The voltage and frequency levels as a result of DVFS.  The DVFS
+  level is a dominant factor governing power consumption.
+- While running, the 'execution' behaviour (instruction types, memory
+  access patterns and so forth) causes, in most cases, a second order
+  variation.  In pathological cases this variation can be significant,
+  but typically it has a much smaller impact than the factors above.
+
+A high level dynamic power consumption model may then be represented as:
+
+Pdyn = f(run) * Voltage^2 * Frequency * Utilisation
+
+f(run) here represents the described execution behaviour and its
+result has units of Watts/Hz/Volt^2 (this is often expressed in
+mW/MHz/uVolt^2).
+
+The detailed behaviour for f(run) could be modelled on-line.  However,
+in practice, such an on-line model depends on a number of
+implementation specific processor support and characterisation
+factors.  Therefore, in the initial implementation that contribution
+is represented as a constant coefficient.  This is a simplification
+consistent with the relative contribution to overall power variation.
+
+In this simplified representation our model becomes:
+
+Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation
+
+Where `capacitance` is a constant that represents an indicative
+running time dynamic power coefficient in fundamental units of
+mW/MHz/uVolt^2.  Typical values for mobile CPUs might lie in the range
+from 100 to 500.  For reference, the approximate values for the SoC in
+ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and
+140 for the Cortex-A53 cluster.
+
+
+2.2 Static power
+
+Static leakage power consumption depends on a number of factors.  For a
+given circuit implementation the primary factors are:
+
+- Time the circuit spends in each 'power state'
+- Temperature
+- Operating voltage
+- Process grade
+
+At first order, the 'power states' a circuit spends time in over a
+given evaluation period are simply OFF or ON.  However, 'retention'
+states can also be supported, reducing power during inactive periods
+without loss of context.
+
+Note: The visibility of state entries to the OS can vary, according to
+platform specifics, and this can then impact the accuracy of a model
+based on OS state information alone.  It might be possible in some
+cases to extract more accurate information from system resources.
+
+The temperature, operating voltage and process 'grade' (slow to fast)
+of the circuit are all significant factors in static leakage power
+consumption.  All of these have complex relationships to static power.
+
+Circuit implementation specific factors include the chosen silicon
+process as well as the type, number and size of transistors in both
+the logic gates and any RAM elements included.
+
+The static power consumption modelling must take into account the
+power managed regions that are implemented.  Taking the example of an
+ARM processor cluster, the modelling would take into account whether
+each CPU can be powered OFF separately or if only a single power
+region is implemented for the complete cluster.
+
+In one view (there are others), a static power consumption model can
+then start from a set of reference values for each power managed
+region (e.g. CPU, Cluster/L2) in each state (e.g. ON, OFF) at an
+arbitrary process grade, voltage and temperature point.  These values
+are then scaled for all of the following: the time in each state, the
+process grade, the current temperature and the operating voltage.
+However, since both implementation specific and complex relationships
+dominate the estimate, the appropriate interface to the model from the
+cpu cooling device is to provide a function callback that calculates
+the static power in this platform.  When registering the cpu cooling
+device pass a function pointer that follows the `get_static_t`
+prototype:
+
+    int plat_get_static(cpumask_t *cpumask, int interval,
+                        unsigned long voltage, u32 *power);
+
+`cpumask` is the cpumask of the cpus involved in the calculation.
+`voltage` is the voltage at which they are operating.  The function
+should calculate the average static power for the last `interval`
+milliseconds.  It returns 0 on success, -E* on error.  If it
+succeeds, it should store the static power in `power`.  Reading the
+temperature of the cpus described by `cpumask` is left for
+plat_get_static() to do as the platform knows best which thermal
+sensor is closest to the cpu.
+
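+A minimal sketch of such a callback is shown below.  It is
+illustrative only: the leakage table, the example_read_temp_index()
+helper and the simple scaling are hypothetical, platform-specific
+details, and a real implementation would also account for `voltage`
+and for the `interval` over which the average is requested:
+
+    /* hypothetical per-cpu leakage (mW), indexed by coarse temperature band */
+    static const u32 example_leakage_mw[] = { 10, 15, 25, 45, 80 };
+
+    static int plat_get_static(cpumask_t *cpumask, int interval,
+                               unsigned long voltage, u32 *power)
+    {
+        /* hypothetical helper: returns a valid band index or -E* */
+        int band = example_read_temp_index(cpumask);
+
+        if (band < 0)
+            return band;
+
+        /* scale the per-cpu reference value by the number of cpus */
+        *power = example_leakage_mw[band] * cpumask_weight(cpumask);
+        return 0;
+    }
+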
+If `plat_static_func` is NULL, static power is considered to be
+negligible for this platform and only dynamic power is considered.
+
+The platform specific callback can then use any combination of tables
+and/or equations to compute the estimated value.  Process grade
+information is not passed to the model since access to such data, from
+on-chip measurement capability or manufacture time data, is platform
+specific.
+
+Note: the significance of static power for CPUs in comparison to
+dynamic power is highly dependent on implementation.  Given the
+potential complexity in implementation, the importance and accuracy of
+its inclusion when using cpu cooling devices should be assessed on a
+case by case basis.
+
diff --git a/Documentation/thermal/power_allocator.txt b/Documentation/thermal/power_allocator.txt
new file mode 100644
index 0000000..c3797b5
--- /dev/null
+++ b/Documentation/thermal/power_allocator.txt
@@ -0,0 +1,247 @@
+Power allocator governor tunables
+=================================
+
+Trip points
+-----------
+
+The governor requires the following two passive trip points:
+
+1.  "switch on" trip point: temperature above which the governor
+    control loop starts operating.  This is the first passive trip
+    point of the thermal zone.
+
+2.  "desired temperature" trip point: it should be higher than the
+    "switch on" trip point.  This the target temperature the governor
+    is controlling for.  This is the last passive trip point of the
+    thermal zone.
+
+PID Controller
+--------------
+
+The power allocator governor implements a
+Proportional-Integral-Derivative controller (PID controller) with
+temperature as the control input and power as the controlled output:
+
+    P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power
+
+where
+    e = desired_temperature - current_temperature
+    err_integral is the sum of previous errors
+    diff_err = e - previous_error
+
+It is similar to the one depicted below:
+
+                                      k_d
+                                       |
+current_temp                           |
+     |                                 v
+     |                +----------+   +---+
+     |         +----->| diff_err |-->| X |------+
+     |         |      +----------+   +---+      |
+     |         |                                |      tdp        actor
+     |         |                      k_i       |       |  get_requested_power()
+     |         |                       |        |       |        |     |
+     |         |                       |        |       |        |     | ...
+     v         |                       v        v       v        v     v
+   +---+       |      +-------+      +---+    +---+   +---+   +----------+
+   | S |-------+----->| sum e |----->| X |--->| S |-->| S |-->|power     |
+   +---+       |      +-------+      +---+    +---+   +---+   |allocation|
+     ^         |                                ^             +----------+
+     |         |                                |                |     |
+     |         |        +---+                   |                |     |
+     |         +------->| X |-------------------+                v     v
+     |                  +---+                               granted performance
+desired_temperature       ^
+                          |
+                          |
+                      k_po/k_pu
+
+Sustainable power
+-----------------
+
+An estimate of the sustainable dissipatable power (in mW) should be
+provided while registering the thermal zone.  This estimates the
+sustained power that can be dissipated at the desired control
+temperature.  This is the maximum sustained power for allocation at
+the desired maximum temperature.  The actual sustained power can vary
+for a number of reasons.  The closed loop controller will take care of
+variations such as environmental conditions, and some factors related
+to the speed-grade of the silicon.  `sustainable_power` is therefore
+simply an estimate, and may be tuned to affect the aggressiveness of
+the thermal ramp. For reference, the sustainable power of a 4" phone
+is typically 2000mW, while that of a 10" tablet is around 4500mW
+(may vary depending on screen size).
+
+If you are using device tree, do add it as a property of the
+thermal-zone.  For example:
+
+	thermal-zones {
+		soc_thermal {
+			polling-delay = <1000>;
+			polling-delay-passive = <100>;
+			sustainable-power = <2500>;
+			...
+
+If instead the thermal zone is registered from platform code, pass a
+`thermal_zone_params` that has a `sustainable_power`.  If you were not
+passing any `thermal_zone_params` before, then something like the
+following will suffice:
+
+	static const struct thermal_zone_params tz_params = {
+		.sustainable_power = 3500,
+	};
+
+and then pass `tz_params` (the `struct thermal_zone_params` pointer)
+when calling `thermal_zone_device_register()`.
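+
+A call sketch is shown below (illustrative only; every name other than
+`tz_params` is a placeholder for the platform's own data):
+
+	tz = thermal_zone_device_register("example_tz", num_trips, trip_mask,
+					  devdata, &example_tz_ops, &tz_params,
+					  passive_delay, polling_delay);
+	if (IS_ERR(tz))
+		return PTR_ERR(tz);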
+
+k_po and k_pu
+-------------
+
+The implementation of the PID controller in the power allocator
+thermal governor allows the configuration of two proportional term
+constants: `k_po` and `k_pu`.  `k_po` is the proportional term
+constant during temperature overshoot periods (current temperature is
+above "desired temperature" trip point).  Conversely, `k_pu` is the
+proportional term constant during temperature undershoot periods
+(current temperature below "desired temperature" trip point).
+
+These controls are intended as the primary mechanism for configuring
+the permitted thermal "ramp" of the system.  For instance, a lower
+`k_pu` value will provide a slower ramp, at the cost of capping
+available capacity at a low temperature.  On the other hand, a high
+value of `k_pu` will result in the governor granting very high power
+whilst temperature is low, and may lead to temperature overshooting.
+
+The default value for `k_pu` is:
+
+    2 * sustainable_power / (desired_temperature - switch_on_temp)
+
+This means that at `switch_on_temp` the output of the controller's
+proportional term will be 2 * `sustainable_power`.  The default value
+for `k_po` is:
+
+    sustainable_power / (desired_temperature - switch_on_temp)
+
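+As a worked example, assuming temperatures are expressed in
+millicelsius (as the thermal framework does internally), a zone with
+sustainable_power = 2500, desired_temperature = 75000 and
+switch_on_temp = 65000 would get default values of
+k_pu = 2 * 2500 / 10000 = 0.5 and k_po = 2500 / 10000 = 0.25,
+i.e. milliwatts of output per millicelsius of temperature error.
+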
+Focusing on the proportional and feed forward values of the PID
+controller equation we have:
+
+    P_max = k_p * e + sustainable_power
+
+The proportional term is proportional to the difference between the
+desired temperature and the current one.  When the current temperature
+is the desired one, then the proportional component is zero and
+`P_max` = `sustainable_power`.  That is, the system should operate in
+thermal equilibrium under constant load.  `sustainable_power` is only
+an estimate, which is the reason for closed-loop control such as this.
+
+Expanding `k_pu` we get:
+    P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) +
+        sustainable_power
+
+where
+    T_set is the desired temperature
+    T is the current temperature
+    T_on is the switch on temperature
+
+When the current temperature is the switch_on temperature, the above
+formula becomes:
+
+    P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) +
+        sustainable_power = 2 * sustainable_power + sustainable_power =
+        3 * sustainable_power
+
+Therefore, the proportional term alone linearly decreases power from
+3 * `sustainable_power` to `sustainable_power` as the temperature
+rises from the switch on temperature to the desired temperature.
+
+k_i and integral_cutoff
+-----------------------
+
+`k_i` configures the PID loop's integral term constant.  This term
+allows the PID controller to compensate for long term drift and for
+the quantized nature of the output control: cooling devices can't set
+the exact power that the governor requests.  When the temperature
+error is below `integral_cutoff`, errors are accumulated in the
+integral term.  This term is then multiplied by `k_i` and the result
+added to the output of the controller.  Typically `k_i` is set low (1
+or 2) and `integral_cutoff` is 0.
+
+k_d
+---
+
+`k_d` configures the PID loop's derivative term constant.  It's
+recommended to leave it as the default: 0.
+
+Cooling device power API
+========================
+
+Cooling devices controlled by this governor must supply the additional
+"power" API in their `cooling_device_ops`.  It consists on three ops:
+
+1. int get_requested_power(struct thermal_cooling_device *cdev,
+	struct thermal_zone_device *tz, u32 *power);
+@cdev: The `struct thermal_cooling_device` pointer
+@tz: thermal zone in which we are currently operating
+@power: pointer in which to store the calculated power
+
+`get_requested_power()` calculates the power requested by the device
+in milliwatts and stores it in @power.  It should return 0 on
+success, -E* on failure.  This is currently used by the power
+allocator governor to calculate how much power to give to each cooling
+device.
+
+2. int state2power(struct thermal_cooling_device *cdev, struct
+        thermal_zone_device *tz, unsigned long state, u32 *power);
+@cdev: The `struct thermal_cooling_device` pointer
+@tz: thermal zone in which we are currently operating
+@state: A cooling device state
+@power: pointer in which to store the equivalent power
+
+Convert cooling device state @state into power consumption in
+milliwatts and store it in @power.  It should return 0 on success, -E*
+on failure.  This is currently used by thermal core to calculate the
+maximum power that an actor can consume.
+
+3. int power2state(struct thermal_cooling_device *cdev, u32 power,
+	unsigned long *state);
+@cdev: The `struct thermal_cooling_device` pointer
+@power: power in milliwatts
+@state: pointer in which to store the resulting state
+
+Calculate a cooling device state that would make the device consume at
+most @power mW and store it in @state.  It should return 0 on success,
+-E* on failure.  This is currently used by the thermal core to convert
+a given power set by the power allocator governor to a state that the
+cooling device can set.  It is a function because this conversion may
+depend on external factors that may change, so this function should
+return the best conversion given the current circumstances.
+
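+A skeletal example of the three ops is shown below.  It is
+illustrative only: the example_*() names and the "500 mW per state"
+figure are made up, and a real driver would derive its power numbers
+from its own hardware model.
+
+	/* hypothetical: 8 states, each deeper state removes roughly 500 mW */
+	static u32 example_state_to_mw(unsigned long state)
+	{
+		if (state > 8)
+			state = 8;
+		return 4000 - 500 * state;
+	}
+
+	static int example_get_requested_power(struct thermal_cooling_device *cdev,
+					       struct thermal_zone_device *tz,
+					       u32 *power)
+	{
+		unsigned long cur_state;
+		int ret;
+
+		ret = cdev->ops->get_cur_state(cdev, &cur_state);
+		if (ret)
+			return ret;
+
+		*power = example_state_to_mw(cur_state);
+		return 0;
+	}
+
+	static int example_state2power(struct thermal_cooling_device *cdev,
+				       struct thermal_zone_device *tz,
+				       unsigned long state, u32 *power)
+	{
+		*power = example_state_to_mw(state);
+		return 0;
+	}
+
+	static int example_power2state(struct thermal_cooling_device *cdev,
+				       u32 power, unsigned long *state)
+	{
+		unsigned long s;
+
+		/* pick the shallowest state whose power estimate fits the budget */
+		for (s = 0; s < 8 && example_state_to_mw(s) > power; s++)
+			;
+
+		*state = s;
+		return 0;
+	}
+
+	static const struct thermal_cooling_device_ops example_cooling_ops = {
+		/* .get_max_state/.get_cur_state/.set_cur_state as usual, plus: */
+		.get_requested_power	= example_get_requested_power,
+		.state2power		= example_state2power,
+		.power2state		= example_power2state,
+	};
+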
+Cooling device weights
+----------------------
+
+Weights are a mechanism to bias the allocation among cooling
+devices.  They express the relative power efficiency of different
+cooling devices.  Higher weight can be used to express higher power
+efficiency.  Weighting is relative such that if each cooling device
+has a weight of one they are considered equal.  This is particularly
+useful in heterogeneous systems where two cooling devices may perform
+the same kind of compute, but with different efficiency.  For example,
+a system with two different types of processors.
+
+If the thermal zone is registered using
+`thermal_zone_device_register()` (i.e., platform code), then weights
+are passed as part of the thermal zone's `thermal_bind_params`.
+If the platform is registered using device tree, then they are passed
+as the `contribution` property of each map in the `cooling-maps` node.
+
+Limitations of the power allocator governor
+===========================================
+
+The power allocator governor's PID controller works best if there is a
+periodic tick.  If you have a driver that calls
+`thermal_zone_device_update()` (or anything that ends up calling the
+governor's `throttle()` function) repetitively, the governor response
+won't be very good.  Note that this is not particular to this
+governor, step-wise will also misbehave if you call its throttle()
+faster than the normal thermal framework tick (due to interrupts for
+example) as it will overreact.
diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt
index 87519cb..c1f6864 100644
--- a/Documentation/thermal/sysfs-api.txt
+++ b/Documentation/thermal/sysfs-api.txt
@@ -95,7 +95,7 @@
 1.3 interface for binding a thermal zone device with a thermal cooling device
 1.3.1 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
 	int trip, struct thermal_cooling_device *cdev,
-	unsigned long upper, unsigned long lower);
+	unsigned long upper, unsigned long lower, unsigned int weight);
 
     This interface function binds a thermal cooling device to a given trip
     point of a thermal zone device.
@@ -110,6 +110,8 @@
     lower:the Minimum cooling state can be used for this trip point.
           THERMAL_NO_LIMIT means no lower limit,
 	  and the cooling device can be in cooling state 0.
+    weight: the influence of this cooling device in this thermal
+            zone.  See 1.4.1 below for more information.
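+
+    For example (sketch only; `tz` and `cdev` are placeholders), binding
+    a cooling device to trip point 0 with no state limits and the
+    default weight:
+
+	ret = thermal_zone_bind_cooling_device(tz, 0, cdev,
+					       THERMAL_NO_LIMIT,
+					       THERMAL_NO_LIMIT,
+					       THERMAL_WEIGHT_DEFAULT);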
 
 1.3.2 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
 		int trip, struct thermal_cooling_device *cdev);
@@ -127,9 +129,15 @@
     This structure defines the following parameters that are used to bind
     a zone with a cooling device for a particular trip point.
     .cdev: The cooling device pointer
-    .weight: The 'influence' of a particular cooling device on this zone.
-             This is on a percentage scale. The sum of all these weights
-             (for a particular zone) cannot exceed 100.
+    .weight: The 'influence' of a particular cooling device on this
+             zone. This is relative to the rest of the cooling
+             devices. For example, if all cooling devices have a
+             weight of 1, then they all contribute the same. You can
+             use percentages if you want, but it's not mandatory. A
+             weight of 0 means that this cooling device doesn't
+             contribute to the cooling of this zone unless all cooling
+             devices have a weight of 0. If all weights are 0, then
+             they all contribute the same.
     .trip_mask:This is a bit mask that gives the binding relation between
                this thermal zone and cdev, for a particular trip point.
                If nth bit is set, then the cdev and thermal zone are bound
@@ -176,6 +184,14 @@
     |---trip_point_[0-*]_type:	Trip point type
     |---trip_point_[0-*]_hyst:	Hysteresis value for this trip point
     |---emul_temp:		Emulated temperature set node
+    |---sustainable_power:      Sustainable dissipatable power
+    |---k_po:                   Proportional term during temperature overshoot
+    |---k_pu:                   Proportional term during temperature undershoot
+    |---k_i:                    PID's integral term in the power allocator gov
+    |---k_d:                    PID's derivative term in the power allocator
+    |---integral_cutoff:        Offset above which errors are accumulated
+    |---slope:                  Slope constant applied as linear extrapolation
+    |---offset:                 Offset constant applied as linear extrapolation
 
 Thermal cooling device sys I/F, created once it's registered:
 /sys/class/thermal/cooling_device[0-*]:
@@ -192,6 +208,8 @@
 /sys/class/thermal/thermal_zone[0-*]:
     |---cdev[0-*]:		[0-*]th cooling device in current thermal zone
     |---cdev[0-*]_trip_point:	Trip point that cdev[0-*] is associated with
+    |---cdev[0-*]_weight:       Influence of the cooling device in
+                                this thermal zone
 
 Besides the thermal zone device sysfs I/F and cooling device sysfs I/F,
 the generic thermal driver also creates a hwmon sysfs I/F for each _type_
@@ -265,6 +283,14 @@
 	point.
 	RO, Optional
 
+cdev[0-*]_weight
+        The influence of cdev[0-*] in this thermal zone. This value
+        is relative to the rest of cooling devices in the thermal
+        zone. For example, if a cooling device has a weight double
+        that of another, it's twice as effective at cooling the
+        thermal zone.
+        RW, Optional
+
 passive
 	Attribute is only present for zones in which the passive cooling
 	policy is not supported by native thermal driver. Default is zero
@@ -289,6 +315,66 @@
 	  because userland can easily disable the thermal policy by simply
 	  flooding this sysfs node with low temperature values.
 
+sustainable_power
+	An estimate of the sustained power that can be dissipated by
+	the thermal zone. Used by the power allocator governor. For
+	more information see Documentation/thermal/power_allocator.txt
+	Unit: milliwatts
+	RW, Optional
+
+k_po
+	The proportional term of the power allocator governor's PID
+	controller during temperature overshoot. Temperature overshoot
+	is when the current temperature is above the "desired
+	temperature" trip point. For more information see
+	Documentation/thermal/power_allocator.txt
+	RW, Optional
+
+k_pu
+	The proportional term of the power allocator governor's PID
+	controller during temperature undershoot. Temperature undershoot
+	is when the current temperature is below the "desired
+	temperature" trip point. For more information see
+	Documentation/thermal/power_allocator.txt
+	RW, Optional
+
+k_i
+	The integral term of the power allocator governor's PID
+	controller. This term allows the PID controller to compensate
+	for long term drift. For more information see
+	Documentation/thermal/power_allocator.txt
+	RW, Optional
+
+k_d
+	The derivative term of the power allocator governor's PID
+	controller. For more information see
+	Documentation/thermal/power_allocator.txt
+	RW, Optional
+
+integral_cutoff
+	Temperature offset from the desired temperature trip point
+	above which the integral term of the power allocator
+	governor's PID controller starts accumulating errors. For
+	example, if integral_cutoff is 0, then the integral term only
+	accumulates error when temperature is above the desired
+	temperature trip point. For more information see
+	Documentation/thermal/power_allocator.txt
+	RW, Optional
+
+slope
+	The slope constant used in a linear extrapolation model
+	to determine a hotspot temperature based off the sensor's
+	raw readings. It is up to the device driver to determine
+	the usage of these values.
+	RW, Optional
+
+offset
+	The offset constant used in a linear extrapolation model
+	to determine a hotspot temperature based off the sensor's
+	raw readings. It is up to the device driver to determine
+	the usage of these values.
+	RW, Optional
+
 *****************************
 * Cooling device attributes *
 *****************************
@@ -318,7 +404,8 @@
 active[0] and active[1] at the same time, it may register itself as a
 thermal_zone_device (thermal_zone1) with 4 trip points in all.
 It has one processor and one fan, which are both registered as
-thermal_cooling_device.
+thermal_cooling_device. Both are considered to have the same
+effectiveness in cooling the thermal zone.
 
 If the processor is listed in _PSL method, and the fan is listed in _AL0
 method, the sys I/F structure will be built like this:
@@ -340,8 +427,10 @@
     |---trip_point_3_type:	active1
     |---cdev0:			--->/sys/class/thermal/cooling_device0
     |---cdev0_trip_point:	1	/* cdev0 can be used for passive */
+    |---cdev0_weight:           1024
     |---cdev1:			--->/sys/class/thermal/cooling_device3
     |---cdev1_trip_point:	2	/* cdev1 can be used for active[0]*/
+    |---cdev1_weight:           1024
 
 |cooling_device0:
     |---type:			Processor
diff --git a/MAINTAINERS b/MAINTAINERS
index 636c5ab..0c921c8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9305,6 +9305,7 @@
 S:	Supported
 F:	drivers/thermal/
 F:	include/linux/thermal.h
+F:	include/uapi/linux/thermal.h
 F:	include/linux/cpu_cooling.h
 F:	Documentation/devicetree/bindings/thermal/
 
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 03120e6..2ecc7d1 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -84,6 +84,7 @@
 obj-$(CONFIG_IWMMXT)		+= iwmmxt.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_regs.o
 obj-$(CONFIG_HW_PERF_EVENTS)	+= perf_event.o perf_event_cpu.o
+CFLAGS_pj4-cp0.o		:= -marm
 AFLAGS_iwmmxt.o			:= -Wa,-mcpu=iwmmxt
 obj-$(CONFIG_ARM_CPU_TOPOLOGY)  += topology.o
 
diff --git a/arch/arm/kernel/iwmmxt.S b/arch/arm/kernel/iwmmxt.S
index ad58e56..49fadbd 100644
--- a/arch/arm/kernel/iwmmxt.S
+++ b/arch/arm/kernel/iwmmxt.S
@@ -58,6 +58,7 @@
 #define MMX_SIZE		(0x98)
 
 	.text
+	.arm
 
 /*
  * Lazy switching of Concan coprocessor context
@@ -182,6 +183,8 @@
 	tmcr	wCon, r2
 	ret	lr
 
+ENDPROC(iwmmxt_task_enable)
+
 /*
  * Back up Concan regs to save area and disable access to them
  * (mainly for gdb or sleep mode usage)
@@ -232,6 +235,8 @@
 1:	msr	cpsr_c, ip			@ restore interrupt mode
 	ldmfd	sp!, {r4, pc}
 
+ENDPROC(iwmmxt_task_disable)
+
 /*
  * Copy Concan state to given memory address
  *
@@ -268,6 +273,8 @@
 	msr	cpsr_c, ip			@ restore interrupt mode
 	ret	r3
 
+ENDPROC(iwmmxt_task_copy)
+
 /*
  * Restore Concan state from given memory address
  *
@@ -304,6 +311,8 @@
 	msr	cpsr_c, ip			@ restore interrupt mode
 	ret	r3
 
+ENDPROC(iwmmxt_task_restore)
+
 /*
  * Concan handling on task switch
  *
@@ -335,6 +344,8 @@
 	mrc	p15, 0, r1, c2, c0, 0
 	sub	pc, lr, r1, lsr #32		@ cpwait and return
 
+ENDPROC(iwmmxt_task_switch)
+
 /*
  * Remove Concan ownership of given task
  *
@@ -353,6 +364,8 @@
 	msr	cpsr_c, r2			@ restore interrupts
 	ret	lr
 
+ENDPROC(iwmmxt_task_release)
+
 	.data
 concan_owner:
 	.word	0
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index d24fa19..6d4e44e 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -800,7 +800,8 @@
 				result =
 					thermal_zone_bind_cooling_device
 					(thermal, trip, cdev,
-					 THERMAL_NO_LIMIT, THERMAL_NO_LIMIT);
+					 THERMAL_NO_LIMIT, THERMAL_NO_LIMIT,
+					 THERMAL_WEIGHT_DEFAULT);
 			else
 				result =
 					thermal_zone_unbind_cooling_device
@@ -824,7 +825,8 @@
 			if (bind)
 				result = thermal_zone_bind_cooling_device
 					(thermal, trip, cdev,
-					 THERMAL_NO_LIMIT, THERMAL_NO_LIMIT);
+					 THERMAL_NO_LIMIT, THERMAL_NO_LIMIT,
+					 THERMAL_WEIGHT_DEFAULT);
 			else
 				result = thermal_zone_unbind_cooling_device
 					(thermal, trip, cdev);
@@ -841,7 +843,8 @@
 				result = thermal_zone_bind_cooling_device
 						(thermal, THERMAL_TRIPS_NONE,
 						 cdev, THERMAL_NO_LIMIT,
-						 THERMAL_NO_LIMIT);
+						 THERMAL_NO_LIMIT,
+						 THERMAL_WEIGHT_DEFAULT);
 			else
 				result = thermal_zone_unbind_cooling_device
 						(thermal, THERMAL_TRIPS_NONE,
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index ba71522..07ce046 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -35,9 +35,6 @@
 #include <drm/drm_fb_helper.h>
 #include <drm/drm_dp_mst_helper.h>
 
-#define DIV_ROUND_CLOSEST_ULL(ll, d)	\
-({ unsigned long long _tmp = (ll)+(d)/2; do_div(_tmp, d); _tmp; })
-
 /**
  * _wait_for - magic (register) wait macro
  *
diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c
index 8bc193f..ce5a385 100644
--- a/drivers/gpu/drm/i915/intel_panel.c
+++ b/drivers/gpu/drm/i915/intel_panel.c
@@ -30,6 +30,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/kernel.h>
 #include <linux/moduleparam.h>
 #include "intel_drv.h"
 
diff --git a/drivers/hwmon/lm75.c b/drivers/hwmon/lm75.c
index d16dbb3..e7c8bf9 100644
--- a/drivers/hwmon/lm75.c
+++ b/drivers/hwmon/lm75.c
@@ -176,6 +176,10 @@
 };
 ATTRIBUTE_GROUPS(lm75);
 
+static const struct thermal_zone_of_device_ops lm75_of_thermal_ops = {
+	.get_temp = lm75_read_temp,
+};
+
 /*-----------------------------------------------------------------------*/
 
 /* device probe and removal */
@@ -291,10 +295,9 @@
 	if (IS_ERR(data->hwmon_dev))
 		return PTR_ERR(data->hwmon_dev);
 
-	data->tz = thermal_zone_of_sensor_register(data->hwmon_dev,
-						   0,
+	data->tz = thermal_zone_of_sensor_register(data->hwmon_dev, 0,
 						   data->hwmon_dev,
-						   lm75_read_temp, NULL);
+						   &lm75_of_thermal_ops);
 	if (IS_ERR(data->tz))
 		data->tz = NULL;
 
diff --git a/drivers/hwmon/ntc_thermistor.c b/drivers/hwmon/ntc_thermistor.c
index 31597c5..b2c90bd 100644
--- a/drivers/hwmon/ntc_thermistor.c
+++ b/drivers/hwmon/ntc_thermistor.c
@@ -495,6 +495,10 @@
 	.attrs = ntc_attributes,
 };
 
+static const struct thermal_zone_of_device_ops ntc_of_thermal_ops = {
+	.get_temp = ntc_read_temp,
+};
+
 static int ntc_thermistor_probe(struct platform_device *pdev)
 {
 	const struct of_device_id *of_id =
@@ -588,7 +592,7 @@
 								pdev_id->name);
 
 	data->tz = thermal_zone_of_sensor_register(data->dev, 0, data->dev,
-						ntc_read_temp, NULL);
+						   &ntc_of_thermal_ops);
 	if (IS_ERR(data->tz)) {
 		dev_dbg(&pdev->dev, "Failed to register to thermal fw.\n");
 		data->tz = NULL;
diff --git a/drivers/hwmon/tmp102.c b/drivers/hwmon/tmp102.c
index 5171995..ba9f478 100644
--- a/drivers/hwmon/tmp102.c
+++ b/drivers/hwmon/tmp102.c
@@ -158,6 +158,10 @@
 #define TMP102_CONFIG  (TMP102_CONF_TM | TMP102_CONF_EM | TMP102_CONF_CR1)
 #define TMP102_CONFIG_RD_ONLY (TMP102_CONF_R0 | TMP102_CONF_R1 | TMP102_CONF_AL)
 
+static const struct thermal_zone_of_device_ops tmp102_of_thermal_ops = {
+	.get_temp = tmp102_read_temp,
+};
+
 static int tmp102_probe(struct i2c_client *client,
 				  const struct i2c_device_id *id)
 {
@@ -215,7 +219,7 @@
 	}
 	tmp102->hwmon_dev = hwmon_dev;
 	tmp102->tz = thermal_zone_of_sensor_register(hwmon_dev, 0, hwmon_dev,
-						     tmp102_read_temp, NULL);
+						     &tmp102_of_thermal_ops);
 	if (IS_ERR(tmp102->tz))
 		tmp102->tz = NULL;
 
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 08981be..5503e43 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,9 +18,11 @@
 #include <linux/slab.h>
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <linux/backing-dev.h>
 #include <linux/atomic.h>
 #include <linux/scatterlist.h>
+#include <linux/rbtree.h>
 #include <asm/page.h>
 #include <asm/unaligned.h>
 #include <crypto/hash.h>
@@ -58,7 +60,8 @@
 	atomic_t io_pending;
 	int error;
 	sector_t sector;
-	struct dm_crypt_io *base_io;
+
+	struct rb_node rb_node;
 } CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
@@ -108,7 +111,8 @@
  * Crypt: maps a linear range of a block device
  * and encrypts / decrypts at the same time.
  */
-enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
+enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
+	     DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
 
 /*
  * The fields in here must be read only after initialization.
@@ -121,14 +125,18 @@
 	 * pool for per bio private data, crypto requests and
 	 * encryption requeusts/buffer pages
 	 */
-	mempool_t *io_pool;
 	mempool_t *req_pool;
 	mempool_t *page_pool;
 	struct bio_set *bs;
+	struct mutex bio_alloc_lock;
 
 	struct workqueue_struct *io_queue;
 	struct workqueue_struct *crypt_queue;
 
+	struct task_struct *write_thread;
+	wait_queue_head_t write_thread_wait;
+	struct rb_root write_tree;
+
 	char *cipher;
 	char *cipher_string;
 
@@ -172,9 +180,6 @@
 };
 
 #define MIN_IOS        16
-#define MIN_POOL_PAGES 32
-
-static struct kmem_cache *_crypt_io_pool;
 
 static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
@@ -223,7 +228,7 @@
  *
  * tcw:  Compatible implementation of the block chaining mode used
  *       by the TrueCrypt device encryption system (prior to version 4.1).
- *       For more info see: http://www.truecrypt.org
+ *       For more info see: https://gitlab.com/cryptsetup/cryptsetup/wikis/TrueCryptOnDiskFormat
  *       It operates on full 512 byte sectors and uses CBC
  *       with an IV derived from initial key and the sector number.
  *       In addition, whitening value is applied on every sector, whitening
@@ -946,57 +951,70 @@
 	return 0;
 }
 
+static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone);
+
 /*
  * Generate a new unfragmented bio with the given size
  * This should never violate the device limitations
- * May return a smaller bio when running out of pages, indicated by
- * *out_of_pages set to 1.
+ *
+ * This function may be called concurrently. If we allocate from the mempool
+ * concurrently, there is a possibility of deadlock. For example, if we have a
+ * mempool of 256 pages and two processes, each wanting 256 pages, allocate from
+ * the mempool concurrently, it may deadlock in a situation where both processes
+ * have allocated 128 pages and the mempool is exhausted.
+ *
+ * In order to avoid this scenario we allocate the pages under a mutex.
+ *
+ * In order to not degrade performance with excessive locking, we try
+ * non-blocking allocations without a mutex first, but on failure we fall back
+ * to blocking allocations with a mutex.
  */
-static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
-				      unsigned *out_of_pages)
+static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *clone;
 	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
-	unsigned i, len;
+	gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
+	unsigned i, len, remaining_size;
 	struct page *page;
+	struct bio_vec *bvec;
+
+retry:
+	if (unlikely(gfp_mask & __GFP_WAIT))
+		mutex_lock(&cc->bio_alloc_lock);
 
 	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
 	if (!clone)
-		return NULL;
+		goto return_clone;
 
 	clone_init(io, clone);
-	*out_of_pages = 0;
+
+	remaining_size = size;
 
 	for (i = 0; i < nr_iovecs; i++) {
 		page = mempool_alloc(cc->page_pool, gfp_mask);
 		if (!page) {
-			*out_of_pages = 1;
-			break;
+			crypt_free_buffer_pages(cc, clone);
+			bio_put(clone);
+			gfp_mask |= __GFP_WAIT;
+			goto retry;
 		}
 
-		/*
-		 * If additional pages cannot be allocated without waiting,
-		 * return a partially-allocated bio.  The caller will then try
-		 * to allocate more bios while submitting this partial bio.
-		 */
-		gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
+		len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size;
 
-		len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
+		bvec = &clone->bi_io_vec[clone->bi_vcnt++];
+		bvec->bv_page = page;
+		bvec->bv_len = len;
+		bvec->bv_offset = 0;
 
-		if (!bio_add_page(clone, page, len, 0)) {
-			mempool_free(page, cc->page_pool);
-			break;
-		}
+		clone->bi_iter.bi_size += len;
 
-		size -= len;
+		remaining_size -= len;
 	}
 
-	if (!clone->bi_iter.bi_size) {
-		bio_put(clone);
-		return NULL;
-	}
+return_clone:
+	if (unlikely(gfp_mask & __GFP_WAIT))
+		mutex_unlock(&cc->bio_alloc_lock);
 
 	return clone;
 }
@@ -1020,7 +1038,6 @@
 	io->base_bio = bio;
 	io->sector = sector;
 	io->error = 0;
-	io->base_io = NULL;
 	io->ctx.req = NULL;
 	atomic_set(&io->io_pending, 0);
 }
@@ -1033,13 +1050,11 @@
 /*
  * One of the bios was finished. Check for completion of
  * the whole request and correctly clean up the buffer.
- * If base_io is set, wait for the last fragment to complete.
  */
 static void crypt_dec_pending(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *base_bio = io->base_bio;
-	struct dm_crypt_io *base_io = io->base_io;
 	int error = io->error;
 
 	if (!atomic_dec_and_test(&io->io_pending))
@@ -1047,16 +1062,8 @@
 
 	if (io->ctx.req)
 		crypt_free_req(cc, io->ctx.req, base_bio);
-	if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
-		mempool_free(io, cc->io_pool);
 
-	if (likely(!base_io))
-		bio_endio(base_bio, error);
-	else {
-		if (error && !base_io->error)
-			base_io->error = error;
-		crypt_dec_pending(base_io);
-	}
+	bio_endio(base_bio, error);
 }
 
 /*
@@ -1117,15 +1124,15 @@
 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 {
 	struct crypt_config *cc = io->cc;
-	struct bio *base_bio = io->base_bio;
 	struct bio *clone;
 
 	/*
-	 * The block layer might modify the bvec array, so always
-	 * copy the required bvecs because we need the original
-	 * one in order to decrypt the whole bio data *afterwards*.
+	 * We need the original biovec array in order to decrypt
+	 * the whole bio data *afterwards* -- thanks to immutable
+	 * biovecs we don't need to worry about the block layer
+	 * modifying the biovec array; so leverage bio_clone_fast().
 	 */
-	clone = bio_clone_bioset(base_bio, gfp, cc->bs);
+	clone = bio_clone_fast(io->base_bio, gfp, cc->bs);
 	if (!clone)
 		return 1;
 
@@ -1138,37 +1145,97 @@
 	return 0;
 }
 
-static void kcryptd_io_write(struct dm_crypt_io *io)
-{
-	struct bio *clone = io->ctx.bio_out;
-	generic_make_request(clone);
-}
-
-static void kcryptd_io(struct work_struct *work)
+static void kcryptd_io_read_work(struct work_struct *work)
 {
 	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
 
-	if (bio_data_dir(io->base_bio) == READ) {
-		crypt_inc_pending(io);
-		if (kcryptd_io_read(io, GFP_NOIO))
-			io->error = -ENOMEM;
-		crypt_dec_pending(io);
-	} else
-		kcryptd_io_write(io);
+	crypt_inc_pending(io);
+	if (kcryptd_io_read(io, GFP_NOIO))
+		io->error = -ENOMEM;
+	crypt_dec_pending(io);
 }
 
-static void kcryptd_queue_io(struct dm_crypt_io *io)
+static void kcryptd_queue_read(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->cc;
 
-	INIT_WORK(&io->work, kcryptd_io);
+	INIT_WORK(&io->work, kcryptd_io_read_work);
 	queue_work(cc->io_queue, &io->work);
 }
 
+static void kcryptd_io_write(struct dm_crypt_io *io)
+{
+	struct bio *clone = io->ctx.bio_out;
+
+	generic_make_request(clone);
+}
+
+#define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node)
+
+static int dmcrypt_write(void *data)
+{
+	struct crypt_config *cc = data;
+	struct dm_crypt_io *io;
+
+	while (1) {
+		struct rb_root write_tree;
+		struct blk_plug plug;
+
+		DECLARE_WAITQUEUE(wait, current);
+
+		spin_lock_irq(&cc->write_thread_wait.lock);
+continue_locked:
+
+		if (!RB_EMPTY_ROOT(&cc->write_tree))
+			goto pop_from_list;
+
+		__set_current_state(TASK_INTERRUPTIBLE);
+		__add_wait_queue(&cc->write_thread_wait, &wait);
+
+		spin_unlock_irq(&cc->write_thread_wait.lock);
+
+		if (unlikely(kthread_should_stop())) {
+			set_task_state(current, TASK_RUNNING);
+			remove_wait_queue(&cc->write_thread_wait, &wait);
+			break;
+		}
+
+		schedule();
+
+		set_task_state(current, TASK_RUNNING);
+		spin_lock_irq(&cc->write_thread_wait.lock);
+		__remove_wait_queue(&cc->write_thread_wait, &wait);
+		goto continue_locked;
+
+pop_from_list:
+		write_tree = cc->write_tree;
+		cc->write_tree = RB_ROOT;
+		spin_unlock_irq(&cc->write_thread_wait.lock);
+
+		BUG_ON(rb_parent(write_tree.rb_node));
+
+		/*
+		 * Note: we cannot walk the tree here with rb_next because
+		 * the structures may be freed when kcryptd_io_write is called.
+		 */
+		blk_start_plug(&plug);
+		do {
+			io = crypt_io_from_node(rb_first(&write_tree));
+			rb_erase(&io->rb_node, &write_tree);
+			kcryptd_io_write(io);
+		} while (!RB_EMPTY_ROOT(&write_tree));
+		blk_finish_plug(&plug);
+	}
+	return 0;
+}
+
 static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 {
 	struct bio *clone = io->ctx.bio_out;
 	struct crypt_config *cc = io->cc;
+	unsigned long flags;
+	sector_t sector;
+	struct rb_node **rbp, *parent;
 
 	if (unlikely(io->error < 0)) {
 		crypt_free_buffer_pages(cc, clone);
@@ -1182,20 +1249,34 @@
 
 	clone->bi_iter.bi_sector = cc->start + io->sector;
 
-	if (async)
-		kcryptd_queue_io(io);
-	else
+	if (likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) {
 		generic_make_request(clone);
+		return;
+	}
+
+	spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
+	rbp = &cc->write_tree.rb_node;
+	parent = NULL;
+	sector = io->sector;
+	while (*rbp) {
+		parent = *rbp;
+		if (sector < crypt_io_from_node(parent)->sector)
+			rbp = &(*rbp)->rb_left;
+		else
+			rbp = &(*rbp)->rb_right;
+	}
+	rb_link_node(&io->rb_node, parent, rbp);
+	rb_insert_color(&io->rb_node, &cc->write_tree);
+
+	wake_up_locked(&cc->write_thread_wait);
+	spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags);
 }
 
 static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *clone;
-	struct dm_crypt_io *new_io;
 	int crypt_finished;
-	unsigned out_of_pages = 0;
-	unsigned remaining = io->base_bio->bi_iter.bi_size;
 	sector_t sector = io->sector;
 	int r;
 
@@ -1205,80 +1286,30 @@
 	crypt_inc_pending(io);
 	crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector);
 
-	/*
-	 * The allocated buffers can be smaller than the whole bio,
-	 * so repeat the whole process until all the data can be handled.
-	 */
-	while (remaining) {
-		clone = crypt_alloc_buffer(io, remaining, &out_of_pages);
-		if (unlikely(!clone)) {
-			io->error = -ENOMEM;
-			break;
-		}
-
-		io->ctx.bio_out = clone;
-		io->ctx.iter_out = clone->bi_iter;
-
-		remaining -= clone->bi_iter.bi_size;
-		sector += bio_sectors(clone);
-
-		crypt_inc_pending(io);
-
-		r = crypt_convert(cc, &io->ctx);
-		if (r < 0)
-			io->error = -EIO;
-
-		crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
-
-		/* Encryption was already finished, submit io now */
-		if (crypt_finished) {
-			kcryptd_crypt_write_io_submit(io, 0);
-
-			/*
-			 * If there was an error, do not try next fragments.
-			 * For async, error is processed in async handler.
-			 */
-			if (unlikely(r < 0))
-				break;
-
-			io->sector = sector;
-		}
-
-		/*
-		 * Out of memory -> run queues
-		 * But don't wait if split was due to the io size restriction
-		 */
-		if (unlikely(out_of_pages))
-			congestion_wait(BLK_RW_ASYNC, HZ/100);
-
-		/*
-		 * With async crypto it is unsafe to share the crypto context
-		 * between fragments, so switch to a new dm_crypt_io structure.
-		 */
-		if (unlikely(!crypt_finished && remaining)) {
-			new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
-			crypt_io_init(new_io, io->cc, io->base_bio, sector);
-			crypt_inc_pending(new_io);
-			crypt_convert_init(cc, &new_io->ctx, NULL,
-					   io->base_bio, sector);
-			new_io->ctx.iter_in = io->ctx.iter_in;
-
-			/*
-			 * Fragments after the first use the base_io
-			 * pending count.
-			 */
-			if (!io->base_io)
-				new_io->base_io = io;
-			else {
-				new_io->base_io = io->base_io;
-				crypt_inc_pending(io->base_io);
-				crypt_dec_pending(io);
-			}
-
-			io = new_io;
-		}
+	clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
+	if (unlikely(!clone)) {
+		io->error = -EIO;
+		goto dec;
 	}
 
+	io->ctx.bio_out = clone;
+	io->ctx.iter_out = clone->bi_iter;
+
+	sector += bio_sectors(clone);
+
+	crypt_inc_pending(io);
+	r = crypt_convert(cc, &io->ctx);
+	if (r)
+		io->error = -EIO;
+	crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
+
+	/* Encryption was already finished, submit io now */
+	if (crypt_finished) {
+		kcryptd_crypt_write_io_submit(io, 0);
+		io->sector = sector;
+	}
+
+dec:
 	crypt_dec_pending(io);
 }
 
@@ -1481,6 +1512,9 @@
 	if (!cc)
 		return;
 
+	if (cc->write_thread)
+		kthread_stop(cc->write_thread);
+
 	if (cc->io_queue)
 		destroy_workqueue(cc->io_queue);
 	if (cc->crypt_queue)
@@ -1495,8 +1529,6 @@
 		mempool_destroy(cc->page_pool);
 	if (cc->req_pool)
 		mempool_destroy(cc->req_pool);
-	if (cc->io_pool)
-		mempool_destroy(cc->io_pool);
 
 	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
 		cc->iv_gen_ops->dtr(cc);
@@ -1688,7 +1720,7 @@
 	char dummy;
 
 	static struct dm_arg _args[] = {
-		{0, 1, "Invalid number of feature args"},
+		{0, 3, "Invalid number of feature args"},
 	};
 
 	if (argc < 5) {
@@ -1710,13 +1742,6 @@
 	if (ret < 0)
 		goto bad;
 
-	ret = -ENOMEM;
-	cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
-	if (!cc->io_pool) {
-		ti->error = "Cannot allocate crypt io mempool";
-		goto bad;
-	}
-
 	cc->dmreq_start = sizeof(struct ablkcipher_request);
 	cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
 	cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
@@ -1734,6 +1759,7 @@
 		iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc));
 	}
 
+	ret = -ENOMEM;
 	cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
 			sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size);
 	if (!cc->req_pool) {
@@ -1746,7 +1772,7 @@
 		      sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size,
 		      ARCH_KMALLOC_MINALIGN);
 
-	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
+	cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0);
 	if (!cc->page_pool) {
 		ti->error = "Cannot allocate page mempool";
 		goto bad;
@@ -1758,6 +1784,8 @@
 		goto bad;
 	}
 
+	mutex_init(&cc->bio_alloc_lock);
+
 	ret = -EINVAL;
 	if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid iv_offset sector";
@@ -1788,15 +1816,27 @@
 		if (ret)
 			goto bad;
 
-		opt_string = dm_shift_arg(&as);
+		ret = -EINVAL;
+		while (opt_params--) {
+			opt_string = dm_shift_arg(&as);
+			if (!opt_string) {
+				ti->error = "Not enough feature arguments";
+				goto bad;
+			}
 
-		if (opt_params == 1 && opt_string &&
-		    !strcasecmp(opt_string, "allow_discards"))
-			ti->num_discard_bios = 1;
-		else if (opt_params) {
-			ret = -EINVAL;
-			ti->error = "Invalid feature arguments";
-			goto bad;
+			if (!strcasecmp(opt_string, "allow_discards"))
+				ti->num_discard_bios = 1;
+
+			else if (!strcasecmp(opt_string, "same_cpu_crypt"))
+				set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+
+			else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
+				set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+
+			else {
+				ti->error = "Invalid feature arguments";
+				goto bad;
+			}
 		}
 	}
 
@@ -1807,13 +1847,28 @@
 		goto bad;
 	}
 
-	cc->crypt_queue = alloc_workqueue("kcryptd",
-					  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+	if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
+		cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+	else
+		cc->crypt_queue = alloc_workqueue("kcryptd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+						  num_online_cpus());
 	if (!cc->crypt_queue) {
 		ti->error = "Couldn't create kcryptd queue";
 		goto bad;
 	}
 
+	init_waitqueue_head(&cc->write_thread_wait);
+	cc->write_tree = RB_ROOT;
+
+	cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
+	if (IS_ERR(cc->write_thread)) {
+		ret = PTR_ERR(cc->write_thread);
+		cc->write_thread = NULL;
+		ti->error = "Couldn't spawn write thread";
+		goto bad;
+	}
+	wake_up_process(cc->write_thread);
+
 	ti->num_flush_bios = 1;
 	ti->discard_zeroes_data_unsupported = true;
 
@@ -1848,7 +1903,7 @@
 
 	if (bio_data_dir(io->base_bio) == READ) {
 		if (kcryptd_io_read(io, GFP_NOWAIT))
-			kcryptd_queue_io(io);
+			kcryptd_queue_read(io);
 	} else
 		kcryptd_queue_crypt(io);
 
@@ -1860,6 +1915,7 @@
 {
 	struct crypt_config *cc = ti->private;
 	unsigned i, sz = 0;
+	int num_feature_args = 0;
 
 	switch (type) {
 	case STATUSTYPE_INFO:
@@ -1878,8 +1934,18 @@
 		DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
 				cc->dev->name, (unsigned long long)cc->start);
 
-		if (ti->num_discard_bios)
-			DMEMIT(" 1 allow_discards");
+		num_feature_args += !!ti->num_discard_bios;
+		num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+		num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
+		if (num_feature_args) {
+			DMEMIT(" %d", num_feature_args);
+			if (ti->num_discard_bios)
+				DMEMIT(" allow_discards");
+			if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
+				DMEMIT(" same_cpu_crypt");
+			if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
+				DMEMIT(" submit_from_crypt_cpus");
+		}
 
 		break;
 	}
@@ -1976,7 +2042,7 @@
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 13, 0},
+	.version = {1, 14, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
@@ -1994,15 +2060,9 @@
 {
 	int r;
 
-	_crypt_io_pool = KMEM_CACHE(dm_crypt_io, 0);
-	if (!_crypt_io_pool)
-		return -ENOMEM;
-
 	r = dm_register_target(&crypt_target);
-	if (r < 0) {
+	if (r < 0)
 		DMERR("register failed %d", r);
-		kmem_cache_destroy(_crypt_io_pool);
-	}
 
 	return r;
 }
@@ -2010,7 +2070,6 @@
 static void __exit dm_crypt_exit(void)
 {
 	dm_unregister_target(&crypt_target);
-	kmem_cache_destroy(_crypt_io_pool);
 }
 
 module_init(dm_crypt_init);
diff --git a/drivers/platform/x86/acerhdf.c b/drivers/platform/x86/acerhdf.c
index f94467c..7f21f75 100644
--- a/drivers/platform/x86/acerhdf.c
+++ b/drivers/platform/x86/acerhdf.c
@@ -330,7 +330,8 @@
 		return 0;
 
 	if (thermal_zone_bind_cooling_device(thermal, 0, cdev,
-			THERMAL_NO_LIMIT, THERMAL_NO_LIMIT)) {
+			THERMAL_NO_LIMIT, THERMAL_NO_LIMIT,
+			THERMAL_WEIGHT_DEFAULT)) {
 		pr_err("error binding cooling dev\n");
 		return -EINVAL;
 	}
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index f554d25..8b7d47f 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -42,6 +42,17 @@
 	  Say 'Y' here if you need to build thermal infrastructure
 	  based on device tree.
 
+config THERMAL_WRITABLE_TRIPS
+	bool "Enable writable trip points"
+	help
+	  This option allows the system integrator to choose whether
+	  trip temperatures can be changed from userspace. The
+	  writable trips need to be specified when setting up the
+	  thermal zone but the choice here takes precedence.
+
+	  Say 'Y' here if you would like to allow userspace tools to
+	  change trip temperatures.
+
 choice
 	prompt "Default Thermal governor"
 	default THERMAL_DEFAULT_GOV_STEP_WISE
@@ -71,6 +82,14 @@
 	  Select this if you want to let the user space manage the
 	  platform thermals.
 
+config THERMAL_DEFAULT_GOV_POWER_ALLOCATOR
+	bool "power_allocator"
+	select THERMAL_GOV_POWER_ALLOCATOR
+	help
+	  Select this if you want to control temperature based on
+	  system and device power allocation. This governor can only
+	  operate on cooling devices that implement the power API.
+
 endchoice
 
 config THERMAL_GOV_FAIR_SHARE
@@ -99,6 +118,13 @@
 	help
 	  Enable this to let the user space manage the platform thermals.
 
+config THERMAL_GOV_POWER_ALLOCATOR
+	bool "Power allocator thermal governor"
+	select THERMAL_POWER_ACTOR
+	help
+	  Enable this to manage platform thermals by dynamically
+	  allocating and limiting power to devices.
+
 config CPU_THERMAL
 	bool "generic cpu cooling support"
 	depends on CPU_FREQ
@@ -112,6 +138,18 @@
 
 	  If you want this support, you should say Y here.
 
+config CLOCK_THERMAL
+	bool "Generic clock cooling support"
+	depends on COMMON_CLK
+	depends on PM_OPP
+	help
+	  This entry implements the generic clock cooling mechanism through
+	  frequency clipping. Typically used to cool off co-processors. The
+	  device that is configured to use this cooling mechanism will be
+	  controlled to reduce clock frequency whenever temperature is high.
+
+	  If you want this support, you should say Y here.
+
 config THERMAL_EMULATION
 	bool "Thermal emulation mode support"
 	help
@@ -185,6 +223,16 @@
 	  Enable this option if you want to have support for thermal management
 	  controller present in Armada 370 and Armada XP SoC.
 
+config TEGRA_SOCTHERM
+	tristate "Tegra SOCTHERM thermal management"
+	depends on ARCH_TEGRA
+	help
+	  Enable this option for integrated thermal management support on NVIDIA
+	  Tegra124 systems-on-chip. The driver supports four thermal zones
+	  (CPU, GPU, MEM, PLLX). Cooling devices can be bound to the thermal
+	  zones to manage temperatures. This option is also required for the
+	  emergency thermal reset (thermtrip) feature to function.
+
 config DB8500_CPUFREQ_COOLING
 	tristate "DB8500 cpufreq cooling"
 	depends on ARCH_U8500
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 39c4fe8..dae37c0 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -14,10 +14,14 @@
 thermal_sys-$(CONFIG_THERMAL_GOV_BANG_BANG)	+= gov_bang_bang.o
 thermal_sys-$(CONFIG_THERMAL_GOV_STEP_WISE)	+= step_wise.o
 thermal_sys-$(CONFIG_THERMAL_GOV_USER_SPACE)	+= user_space.o
+thermal_sys-$(CONFIG_THERMAL_GOV_POWER_ALLOCATOR)	+= power_allocator.o
 
 # cpufreq cooling
 thermal_sys-$(CONFIG_CPU_THERMAL)	+= cpu_cooling.o
 
+# clock cooling
+thermal_sys-$(CONFIG_CLOCK_THERMAL)	+= clock_cooling.o
+
 # platform thermal drivers
 obj-$(CONFIG_SPEAR_THERMAL)	+= spear_thermal.o
 obj-$(CONFIG_RCAR_THERMAL)	+= rcar_thermal.o
@@ -34,3 +38,4 @@
 obj-$(CONFIG_TI_SOC_THERMAL)	+= ti-soc-thermal/
 obj-$(CONFIG_INT340X_THERMAL)  += int340x_thermal/
 obj-$(CONFIG_ST_THERMAL)	+= st/
+obj-$(CONFIG_TEGRA_SOCTHERM)	+= tegra_soctherm.o
diff --git a/drivers/thermal/clock_cooling.c b/drivers/thermal/clock_cooling.c
new file mode 100644
index 0000000..1b4ff0f
--- /dev/null
+++ b/drivers/thermal/clock_cooling.c
@@ -0,0 +1,485 @@
+/*
+ *  drivers/thermal/clock_cooling.c
+ *
+ *  Copyright (C) 2014 Eduardo Valentin <edubezval@gmail.com>
+ *
+ *  Copyright (C) 2013	Texas Instruments Inc.
+ *  Contact:  Eduardo Valentin <eduardo.valentin@ti.com>
+ *
+ *  Highly based on cpu_cooling.c.
+ *  Copyright (C) 2012	Samsung Electronics Co., Ltd(http://www.samsung.com)
+ *  Copyright (C) 2012  Amit Daniel <amit.kachhap@linaro.org>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ */
+#include <linux/clk.h>
+#include <linux/cpufreq.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/pm_opp.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+#include <linux/clock_cooling.h>
+
+/**
+ * struct clock_cooling_device - data for cooling device with clock
+ * @id: unique integer value corresponding to each clock_cooling_device
+ *	registered.
+ * @dev: struct device pointer to the device being used to cool off using
+ *       clock frequencies.
+ * @cdev: thermal_cooling_device pointer to keep track of the
+ *	registered cooling device.
+ * @clk_rate_change_nb: reference to notifier block used to receive clock
+ *                      rate changes.
+ * @freq_table: frequency table used to keep track of available frequencies.
+ * @clock_state: integer value representing the current state of clock
+ *	cooling	devices.
+ * @clock_val: integer value representing the absolute value of the clipped
+ *	frequency.
+ * @clk: struct clk reference used to enforce clock limits.
+ * @lock: mutex lock to protect this struct.
+ *
+ * This structure is required for keeping information of each
+ * clock_cooling_device registered. In order to prevent corruption of this
+ * structure, a mutex @lock is used.
+ */
+struct clock_cooling_device {
+	int id;
+	struct device *dev;
+	struct thermal_cooling_device *cdev;
+	struct notifier_block clk_rate_change_nb;
+	struct cpufreq_frequency_table *freq_table;
+	unsigned long clock_state;
+	unsigned long clock_val;
+	struct clk *clk;
+	struct mutex lock; /* lock to protect the content of this struct */
+};
+#define to_clock_cooling_device(x) \
+		container_of(x, struct clock_cooling_device, clk_rate_change_nb)
+static DEFINE_IDR(clock_idr);
+static DEFINE_MUTEX(cooling_clock_lock);
+
+/**
+ * clock_cooling_get_idr - function to get a unique id.
+ * @id: int * value generated by this function.
+ *
+ * This function will populate @id with a unique
+ * id, using the idr API.
+ *
+ * Return: 0 on success, an error code on failure.
+ */
+static int clock_cooling_get_idr(int *id)
+{
+	int ret;
+
+	mutex_lock(&cooling_clock_lock);
+	ret = idr_alloc(&clock_idr, NULL, 0, 0, GFP_KERNEL);
+	mutex_unlock(&cooling_clock_lock);
+	if (unlikely(ret < 0))
+		return ret;
+	*id = ret;
+
+	return 0;
+}
+
+/**
+ * release_idr - function to free the unique id.
+ * @id: int value representing the unique id.
+ */
+static void release_idr(int id)
+{
+	mutex_lock(&cooling_clock_lock);
+	idr_remove(&clock_idr, id);
+	mutex_unlock(&cooling_clock_lock);
+}
+
+/* Below code defines functions to be used for clock as cooling device */
+
+enum clock_cooling_property {
+	GET_LEVEL,
+	GET_FREQ,
+	GET_MAXL,
+};
+
+/**
+ * clock_cooling_get_property - fetch a property of interest for a given device.
+ * @ccdev: clock cooling device reference
+ * @input: query parameter
+ * @output: query return
+ * @property: type of query (frequency, level, max level)
+ *
+ * This is the common function to
+ * 1. get maximum clock cooling states
+ * 2. translate frequency to cooling state
+ * 3. translate cooling state to frequency
+ * Note that the code may not be in good shape
+ * but it is written this way in order to:
+ * a) reduce duplicate code as most of the code can be shared.
+ * b) make sure the logic is consistent when translating between
+ *    cooling states and frequencies.
+ *
+ * Return: 0 on success, -EINVAL when invalid parameters are passed.
+ */
+static int clock_cooling_get_property(struct clock_cooling_device *ccdev,
+				      unsigned long input,
+				      unsigned long *output,
+				      enum clock_cooling_property property)
+{
+	int i;
+	unsigned long max_level = 0, level = 0;
+	unsigned int freq = CPUFREQ_ENTRY_INVALID;
+	int descend = -1;
+	struct cpufreq_frequency_table *pos, *table = ccdev->freq_table;
+
+	if (!output)
+		return -EINVAL;
+
+	if (!table)
+		return -EINVAL;
+
+	cpufreq_for_each_valid_entry(pos, table) {
+		/* ignore duplicate entry */
+		if (freq == pos->frequency)
+			continue;
+
+		/* get the frequency order */
+		if (freq != CPUFREQ_ENTRY_INVALID && descend == -1)
+			descend = freq > pos->frequency;
+
+		freq = pos->frequency;
+		max_level++;
+	}
+
+	/* No valid cpu frequency entry */
+	if (max_level == 0)
+		return -EINVAL;
+
+	/* max_level is an index, not a counter */
+	max_level--;
+
+	/* get max level */
+	if (property == GET_MAXL) {
+		*output = max_level;
+		return 0;
+	}
+
+	if (property == GET_FREQ)
+		level = descend ? input : (max_level - input);
+
+	i = 0;
+	cpufreq_for_each_valid_entry(pos, table) {
+		/* ignore duplicate entry */
+		if (freq == pos->frequency)
+			continue;
+
+		/* now we have a valid frequency entry */
+		freq = pos->frequency;
+
+		if (property == GET_LEVEL && (unsigned int)input == freq) {
+			/* get level by frequency */
+			*output = descend ? i : (max_level - i);
+			return 0;
+		}
+		if (property == GET_FREQ && level == i) {
+			/* get frequency by level */
+			*output = freq;
+			return 0;
+		}
+		i++;
+	}
+
+	return -EINVAL;
+}
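
To make the level/frequency translation above concrete, here is a minimal standalone sketch (editor's illustration with made-up numbers, not part of the patch): with a descending three-entry table, max_level is 2, level 0 selects the highest frequency and level 2 the lowest; for an ascending table the same logic mirrors the index via max_level - i.

#include <stdio.h>

int main(void)
{
	/* hypothetical frequency table in kHz, descending order */
	unsigned long table[] = { 1000000, 800000, 600000 };
	unsigned long max_level = sizeof(table) / sizeof(table[0]) - 1;
	unsigned long level;

	/* GET_MAXL: 2.  GET_FREQ: level 0 -> 1000000 kHz, level 2 -> 600000 kHz */
	for (level = 0; level <= max_level; level++)
		printf("level %lu -> %lu kHz\n", level, table[level]);

	/* GET_LEVEL would map 800000 kHz back to level 1 */
	return 0;
}
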
+
+/**
+ * clock_cooling_get_level - return the cooling level of given clock cooling.
+ * @cdev: reference to a thermal cooling device used as a clock cooling device
+ * @freq: the frequency of interest
+ *
+ * This function will match the cooling level corresponding to the
+ * requested @freq and return it.
+ *
+ * Return: The matched cooling level on success or THERMAL_CSTATE_INVALID
+ * otherwise.
+ */
+unsigned long clock_cooling_get_level(struct thermal_cooling_device *cdev,
+				      unsigned long freq)
+{
+	struct clock_cooling_device *ccdev = cdev->devdata;
+	unsigned long val;
+
+	if (clock_cooling_get_property(ccdev, (unsigned long)freq, &val,
+				       GET_LEVEL))
+		return THERMAL_CSTATE_INVALID;
+
+	return val;
+}
+EXPORT_SYMBOL_GPL(clock_cooling_get_level);
+
+/**
+ * clock_cooling_get_frequency - get the absolute value of frequency from level.
+ * @ccdev: clock cooling device reference
+ * @level: cooling level
+ *
+ * This function matches a cooling level with a frequency. Given a cooling
+ * level (equal to a cooling state of the clock cooling device), it returns
+ * the corresponding frequency.
+ *	e.g level=0 --> 1st MAX FREQ, level=1 ---> 2nd MAX FREQ, .... etc
+ *
+ * Return: 0 on error, the corresponding frequency otherwise.
+ */
+static unsigned long
+clock_cooling_get_frequency(struct clock_cooling_device *ccdev,
+			    unsigned long level)
+{
+	int ret = 0;
+	unsigned long freq;
+
+	ret = clock_cooling_get_property(ccdev, level, &freq, GET_FREQ);
+	if (ret)
+		return 0;
+
+	return freq;
+}
+
+/**
+ * clock_cooling_apply - function to apply frequency clipping.
+ * @ccdev: clock_cooling_device pointer containing frequency clipping data.
+ * @cooling_state: value of the cooling state.
+ *
+ * Function used to make sure the clock layer is aware of current thermal
+ * limits. The limits are applied by updating the clock rate in case it is
+ * higher than the corresponding frequency based on the requested cooling_state.
+ *
+ * Return: 0 on success, an error code otherwise (-EINVAL in case wrong
+ * cooling state).
+ */
+static int clock_cooling_apply(struct clock_cooling_device *ccdev,
+			       unsigned long cooling_state)
+{
+	unsigned long clip_freq, cur_freq;
+	int ret = 0;
+
+	/* Here we write the clipping */
+	/* Check if the old cooling action is same as new cooling action */
+	if (ccdev->clock_state == cooling_state)
+		return 0;
+
+	clip_freq = clock_cooling_get_frequency(ccdev, cooling_state);
+	if (!clip_freq)
+		return -EINVAL;
+
+	cur_freq = clk_get_rate(ccdev->clk);
+
+	mutex_lock(&ccdev->lock);
+	ccdev->clock_state = cooling_state;
+	ccdev->clock_val = clip_freq;
+	/* enforce clock level */
+	if (cur_freq > clip_freq)
+		ret = clk_set_rate(ccdev->clk, clip_freq);
+	mutex_unlock(&ccdev->lock);
+
+	return ret;
+}
+
+/**
+ * clock_cooling_clock_notifier - notifier callback on clock rate changes.
+ * @nb:	struct notifier_block * with callback info.
+ * @event: value showing the clock event for which this function is invoked.
+ * @data: callback-specific data
+ *
+ * Callback to hijack the notification on clock transitions.
+ * Every time there is a clock rate change, we intercept all pre-change events
+ * and block the transition in case the new rate violates thermal limits.
+ *
+ * Return: NOTIFY_DONE (success) or NOTIFY_BAD (new_rate > thermal limit).
+ */
+static int clock_cooling_clock_notifier(struct notifier_block *nb,
+					unsigned long event, void *data)
+{
+	struct clk_notifier_data *ndata = data;
+	struct clock_cooling_device *ccdev = to_clock_cooling_device(nb);
+
+	switch (event) {
+	case PRE_RATE_CHANGE:
+		/*
+		 * checks on current state
+		 * TODO: the current method is not the best we can find, as it
+		 * possibly allows voltage transitions when the DVFS
+		 * layer is also intercepting clock pre-change notifications.
+		 */
+		if (ndata->new_rate > ccdev->clock_val)
+			return NOTIFY_BAD;
+		/* fall through */
+	case POST_RATE_CHANGE:
+	case ABORT_RATE_CHANGE:
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+/* clock cooling device thermal callback functions are defined below */
+
+/**
+ * clock_cooling_get_max_state - callback function to get the max cooling state.
+ * @cdev: thermal cooling device pointer.
+ * @state: fill this variable with the max cooling state.
+ *
+ * Callback for the thermal cooling device to return the clock
+ * max cooling state.
+ *
+ * Return: 0 on success, an error code otherwise.
+ */
+static int clock_cooling_get_max_state(struct thermal_cooling_device *cdev,
+				       unsigned long *state)
+{
+	struct clock_cooling_device *ccdev = cdev->devdata;
+	unsigned long count = 0;
+	int ret;
+
+	ret = clock_cooling_get_property(ccdev, 0, &count, GET_MAXL);
+	if (!ret)
+		*state = count;
+
+	return ret;
+}
+
+/**
+ * clock_cooling_get_cur_state - function to get the current cooling state.
+ * @cdev: thermal cooling device pointer.
+ * @state: fill this variable with the current cooling state.
+ *
+ * Callback for the thermal cooling device to return the clock
+ * current cooling state.
+ *
+ * Return: 0 (success)
+ */
+static int clock_cooling_get_cur_state(struct thermal_cooling_device *cdev,
+				       unsigned long *state)
+{
+	struct clock_cooling_device *ccdev = cdev->devdata;
+
+	*state = ccdev->clock_state;
+
+	return 0;
+}
+
+/**
+ * clock_cooling_set_cur_state - function to set the current cooling state.
+ * @cdev: thermal cooling device pointer.
+ * @state: set this variable to the current cooling state.
+ *
+ * Callback for the thermal cooling device to change the clock cooling
+ * current cooling state.
+ *
+ * Return: 0 on success, an error code otherwise.
+ */
+static int clock_cooling_set_cur_state(struct thermal_cooling_device *cdev,
+				       unsigned long state)
+{
+	struct clock_cooling_device *clock_device = cdev->devdata;
+
+	return clock_cooling_apply(clock_device, state);
+}
+
+/* Bind clock callbacks to thermal cooling device ops */
+static struct thermal_cooling_device_ops const clock_cooling_ops = {
+	.get_max_state = clock_cooling_get_max_state,
+	.get_cur_state = clock_cooling_get_cur_state,
+	.set_cur_state = clock_cooling_set_cur_state,
+};
+
+/**
+ * clock_cooling_register - function to create clock cooling device.
+ * @dev: struct device pointer to the device used as clock cooling device.
+ * @clock_name: string containing the clock used as cooling mechanism.
+ *
+ * This interface function registers the clock cooling device with the name
+ * "thermal-clock-%x". The cooling device is based on clock frequencies.
+ * The struct device is assumed to be capable of DVFS transitions.
+ * The OPP layer is used to fetch and fill the available frequencies for
+ * the referred device. The ordered frequency table is used to control
+ * the clock cooling device cooling states and to limit clock transitions
+ * based on the cooling state requested by the thermal framework.
+ *
+ * Return: a valid struct thermal_cooling_device pointer on success,
+ * on failure, it returns a corresponding ERR_PTR().
+ */
+struct thermal_cooling_device *
+clock_cooling_register(struct device *dev, const char *clock_name)
+{
+	struct thermal_cooling_device *cdev;
+	struct clock_cooling_device *ccdev = NULL;
+	char dev_name[THERMAL_NAME_LENGTH];
+	int ret = 0;
+
+	ccdev = devm_kzalloc(dev, sizeof(*ccdev), GFP_KERNEL);
+	if (!ccdev)
+		return ERR_PTR(-ENOMEM);
+
+	ccdev->dev = dev;
+	ccdev->clk = devm_clk_get(dev, clock_name);
+	if (IS_ERR(ccdev->clk))
+		return ERR_CAST(ccdev->clk);
+
+	ret = clock_cooling_get_idr(&ccdev->id);
+	if (ret)
+		return ERR_PTR(-EINVAL);
+
+	snprintf(dev_name, sizeof(dev_name), "thermal-clock-%d", ccdev->id);
+
+	cdev = thermal_cooling_device_register(dev_name, ccdev,
+					       &clock_cooling_ops);
+	if (IS_ERR(cdev)) {
+		release_idr(ccdev->id);
+		return ERR_PTR(-EINVAL);
+	}
+	ccdev->cdev = cdev;
+	ccdev->clk_rate_change_nb.notifier_call = clock_cooling_clock_notifier;
+
+	/* Assuming someone has already filled the opp table for this device */
+	ret = dev_pm_opp_init_cpufreq_table(dev, &ccdev->freq_table);
+	if (ret) {
+		release_idr(ccdev->id);
+		return ERR_PTR(ret);
+	}
+	ccdev->clock_state = 0;
+	ccdev->clock_val = clock_cooling_get_frequency(ccdev, 0);
+
+	clk_notifier_register(ccdev->clk, &ccdev->clk_rate_change_nb);
+
+	return cdev;
+}
+EXPORT_SYMBOL_GPL(clock_cooling_register);
+
+/**
+ * clock_cooling_unregister - function to remove clock cooling device.
+ * @cdev: thermal cooling device pointer.
+ *
+ * This interface function unregisters the "thermal-clock-%d" cooling device.
+ */
+void clock_cooling_unregister(struct thermal_cooling_device *cdev)
+{
+	struct clock_cooling_device *ccdev;
+
+	if (!cdev)
+		return;
+
+	ccdev = cdev->devdata;
+
+	clk_notifier_unregister(ccdev->clk, &ccdev->clk_rate_change_nb);
+	dev_pm_opp_free_cpufreq_table(ccdev->dev, &ccdev->freq_table);
+
+	thermal_cooling_device_unregister(ccdev->cdev);
+	release_idr(ccdev->id);
+}
+EXPORT_SYMBOL_GPL(clock_cooling_unregister);
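
For context, a hedged sketch of how a platform driver might consume the API added above; the clock name "fclk", the driver names and the probe wiring are assumptions for illustration and do not come from this patch (it builds only inside a kernel tree).

#include <linux/clock_cooling.h>
#include <linux/err.h>
#include <linux/platform_device.h>

static struct thermal_cooling_device *example_cdev;

static int example_probe(struct platform_device *pdev)
{
	/* assumes the OPP table for &pdev->dev has already been filled */
	example_cdev = clock_cooling_register(&pdev->dev, "fclk");
	if (IS_ERR(example_cdev))
		return PTR_ERR(example_cdev);

	return 0;
}

static int example_remove(struct platform_device *pdev)
{
	clock_cooling_unregister(example_cdev);
	return 0;
}
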
diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index ad09e51..07a9629 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -4,6 +4,8 @@
  *  Copyright (C) 2012	Samsung Electronics Co., Ltd(http://www.samsung.com)
  *  Copyright (C) 2012  Amit Daniel <amit.kachhap@linaro.org>
  *
+ *  Copyright (C) 2014  Viresh Kumar <viresh.kumar@linaro.org>
+ *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -24,10 +26,40 @@
 #include <linux/thermal.h>
 #include <linux/cpufreq.h>
 #include <linux/err.h>
+#include <linux/pm_opp.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/cpu_cooling.h>
 
+#include <trace/events/thermal.h>
+
+/*
+ * Cooling state <-> CPUFreq frequency
+ *
+ * Cooling states are translated to frequencies throughout this driver and this
+ * is the relation between them.
+ *
+ * Highest cooling state corresponds to lowest possible frequency.
+ *
+ * i.e.
+ *	level 0 --> 1st Max Freq
+ *	level 1 --> 2nd Max Freq
+ *	...
+ */
+
+/**
+ * struct power_table - frequency to power conversion
+ * @frequency:	frequency in KHz
+ * @power:	power in mW
+ *
+ * This structure is built when the cooling device registers and helps
+ * in translating frequency to power and vice versa.
+ */
+struct power_table {
+	u32 frequency;
+	u32 power;
+};
+
 /**
  * struct cpufreq_cooling_device - data for cooling device with cpufreq
  * @id: unique integer value corresponding to each cpufreq_cooling_device
@@ -38,25 +70,43 @@
  *	cooling	devices.
  * @cpufreq_val: integer value representing the absolute value of the clipped
  *	frequency.
+ * @max_level: maximum cooling level. One less than total number of valid
+ *	cpufreq frequencies.
  * @allowed_cpus: all the cpus involved for this cpufreq_cooling_device.
+ * @node: list_head to link all cpufreq_cooling_device together.
+ * @last_load: load measured by the latest call to cpufreq_get_requested_power()
+ * @time_in_idle: previous reading of the absolute time that this cpu was idle
+ * @time_in_idle_timestamp: wall time of the last invocation of
+ *	get_cpu_idle_time_us()
+ * @dyn_power_table: array of struct power_table for frequency to power
+ *	conversion, sorted in ascending order.
+ * @dyn_power_table_entries: number of entries in the @dyn_power_table array
+ * @cpu_dev: the first cpu_device from @allowed_cpus that has OPPs registered
+ * @plat_get_static_power: callback to calculate the static power
  *
- * This structure is required for keeping information of each
- * cpufreq_cooling_device registered. In order to prevent corruption of this a
- * mutex lock cooling_cpufreq_lock is used.
+ * This structure is required for keeping information of each registered
+ * cpufreq_cooling_device.
  */
 struct cpufreq_cooling_device {
 	int id;
 	struct thermal_cooling_device *cool_dev;
 	unsigned int cpufreq_state;
 	unsigned int cpufreq_val;
+	unsigned int max_level;
+	unsigned int *freq_table;	/* In descending order */
 	struct cpumask allowed_cpus;
 	struct list_head node;
+	u32 last_load;
+	u64 *time_in_idle;
+	u64 *time_in_idle_timestamp;
+	struct power_table *dyn_power_table;
+	int dyn_power_table_entries;
+	struct device *cpu_dev;
+	get_static_t plat_get_static_power;
 };
 static DEFINE_IDR(cpufreq_idr);
 static DEFINE_MUTEX(cooling_cpufreq_lock);
 
-static unsigned int cpufreq_dev_count;
-
 static LIST_HEAD(cpufreq_dev_list);
 
 /**
@@ -98,120 +148,30 @@
 /* Below code defines functions to be used for cpufreq as cooling device */
 
 /**
- * is_cpufreq_valid - function to check frequency transitioning capability.
- * @cpu: cpu for which check is needed.
+ * get_level: Find the level for a particular frequency
+ * @cpufreq_dev: cpufreq_dev for which the property is required
+ * @freq: Frequency
  *
- * This function will check the current state of the system if
- * it is capable of changing the frequency for a given @cpu.
- *
- * Return: 0 if the system is not currently capable of changing
- * the frequency of given cpu. !0 in case the frequency is changeable.
+ * Return: level on success, THERMAL_CSTATE_INVALID on error.
  */
-static int is_cpufreq_valid(int cpu)
+static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_dev,
+			       unsigned int freq)
 {
-	struct cpufreq_policy policy;
+	unsigned long level;
 
-	return !cpufreq_get_policy(&policy, cpu);
-}
+	for (level = 0; level <= cpufreq_dev->max_level; level++) {
+		if (freq == cpufreq_dev->freq_table[level])
+			return level;
 
-enum cpufreq_cooling_property {
-	GET_LEVEL,
-	GET_FREQ,
-	GET_MAXL,
-};
-
-/**
- * get_property - fetch a property of interest for a give cpu.
- * @cpu: cpu for which the property is required
- * @input: query parameter
- * @output: query return
- * @property: type of query (frequency, level, max level)
- *
- * This is the common function to
- * 1. get maximum cpu cooling states
- * 2. translate frequency to cooling state
- * 3. translate cooling state to frequency
- * Note that the code may be not in good shape
- * but it is written in this way in order to:
- * a) reduce duplicate code as most of the code can be shared.
- * b) make sure the logic is consistent when translating between
- *    cooling states and frequencies.
- *
- * Return: 0 on success, -EINVAL when invalid parameters are passed.
- */
-static int get_property(unsigned int cpu, unsigned long input,
-			unsigned int *output,
-			enum cpufreq_cooling_property property)
-{
-	int i;
-	unsigned long max_level = 0, level = 0;
-	unsigned int freq = CPUFREQ_ENTRY_INVALID;
-	int descend = -1;
-	struct cpufreq_frequency_table *pos, *table =
-					cpufreq_frequency_get_table(cpu);
-
-	if (!output)
-		return -EINVAL;
-
-	if (!table)
-		return -EINVAL;
-
-	cpufreq_for_each_valid_entry(pos, table) {
-		/* ignore duplicate entry */
-		if (freq == pos->frequency)
-			continue;
-
-		/* get the frequency order */
-		if (freq != CPUFREQ_ENTRY_INVALID && descend == -1)
-			descend = freq > pos->frequency;
-
-		freq = pos->frequency;
-		max_level++;
+		if (freq > cpufreq_dev->freq_table[level])
+			break;
 	}
 
-	/* No valid cpu frequency entry */
-	if (max_level == 0)
-		return -EINVAL;
-
-	/* max_level is an index, not a counter */
-	max_level--;
-
-	/* get max level */
-	if (property == GET_MAXL) {
-		*output = (unsigned int)max_level;
-		return 0;
-	}
-
-	if (property == GET_FREQ)
-		level = descend ? input : (max_level - input);
-
-	i = 0;
-	cpufreq_for_each_valid_entry(pos, table) {
-		/* ignore duplicate entry */
-		if (freq == pos->frequency)
-			continue;
-
-		/* now we have a valid frequency entry */
-		freq = pos->frequency;
-
-		if (property == GET_LEVEL && (unsigned int)input == freq) {
-			/* get level by frequency */
-			*output = descend ? i : (max_level - i);
-			return 0;
-		}
-		if (property == GET_FREQ && level == i) {
-			/* get frequency by level */
-			*output = freq;
-			return 0;
-		}
-		i++;
-	}
-
-	return -EINVAL;
+	return THERMAL_CSTATE_INVALID;
 }
 
 /**
- * cpufreq_cooling_get_level - for a give cpu, return the cooling level.
+ * cpufreq_cooling_get_level - for a given cpu, return the cooling level.
  * @cpu: cpu for which the level is required
  * @freq: the frequency of interest
  *
@@ -223,79 +183,23 @@
  */
 unsigned long cpufreq_cooling_get_level(unsigned int cpu, unsigned int freq)
 {
-	unsigned int val;
+	struct cpufreq_cooling_device *cpufreq_dev;
 
-	if (get_property(cpu, (unsigned long)freq, &val, GET_LEVEL))
-		return THERMAL_CSTATE_INVALID;
+	mutex_lock(&cooling_cpufreq_lock);
+	list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
+		if (cpumask_test_cpu(cpu, &cpufreq_dev->allowed_cpus)) {
+			mutex_unlock(&cooling_cpufreq_lock);
+			return get_level(cpufreq_dev, freq);
+		}
+	}
+	mutex_unlock(&cooling_cpufreq_lock);
 
-	return (unsigned long)val;
+	pr_err("%s: cpu:%d not part of any cooling device\n", __func__, cpu);
+	return THERMAL_CSTATE_INVALID;
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_get_level);
 
 /**
- * get_cpu_frequency - get the absolute value of frequency from level.
- * @cpu: cpu for which frequency is fetched.
- * @level: cooling level
- *
- * This function matches cooling level with frequency. Based on a cooling level
- * of frequency, equals cooling state of cpu cooling device, it will return
- * the corresponding frequency.
- *	e.g level=0 --> 1st MAX FREQ, level=1 ---> 2nd MAX FREQ, .... etc
- *
- * Return: 0 on error, the corresponding frequency otherwise.
- */
-static unsigned int get_cpu_frequency(unsigned int cpu, unsigned long level)
-{
-	int ret = 0;
-	unsigned int freq;
-
-	ret = get_property(cpu, level, &freq, GET_FREQ);
-	if (ret)
-		return 0;
-
-	return freq;
-}
-
-/**
- * cpufreq_apply_cooling - function to apply frequency clipping.
- * @cpufreq_device: cpufreq_cooling_device pointer containing frequency
- *	clipping data.
- * @cooling_state: value of the cooling state.
- *
- * Function used to make sure the cpufreq layer is aware of current thermal
- * limits. The limits are applied by updating the cpufreq policy.
- *
- * Return: 0 on success, an error code otherwise (-EINVAL in case wrong
- * cooling state).
- */
-static int cpufreq_apply_cooling(struct cpufreq_cooling_device *cpufreq_device,
-				 unsigned long cooling_state)
-{
-	unsigned int cpuid, clip_freq;
-	struct cpumask *mask = &cpufreq_device->allowed_cpus;
-	unsigned int cpu = cpumask_any(mask);
-
-
-	/* Check if the old cooling action is same as new cooling action */
-	if (cpufreq_device->cpufreq_state == cooling_state)
-		return 0;
-
-	clip_freq = get_cpu_frequency(cpu, cooling_state);
-	if (!clip_freq)
-		return -EINVAL;
-
-	cpufreq_device->cpufreq_state = cooling_state;
-	cpufreq_device->cpufreq_val = clip_freq;
-
-	for_each_cpu(cpuid, mask) {
-		if (is_cpufreq_valid(cpuid))
-			cpufreq_update_policy(cpuid);
-	}
-
-	return 0;
-}
-
-/**
  * cpufreq_thermal_notifier - notifier callback for cpufreq policy change.
  * @nb:	struct notifier_block * with callback info.
  * @event: value showing cpufreq event for which this function invoked.
@@ -314,28 +218,233 @@
 	unsigned long max_freq = 0;
 	struct cpufreq_cooling_device *cpufreq_dev;
 
-	if (event != CPUFREQ_ADJUST)
-		return 0;
+	switch (event) {
 
-	mutex_lock(&cooling_cpufreq_lock);
-	list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
-		if (!cpumask_test_cpu(policy->cpu,
-					&cpufreq_dev->allowed_cpus))
-			continue;
+	case CPUFREQ_ADJUST:
+		mutex_lock(&cooling_cpufreq_lock);
+		list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
+			if (!cpumask_test_cpu(policy->cpu,
+					      &cpufreq_dev->allowed_cpus))
+				continue;
 
-		if (!cpufreq_dev->cpufreq_val)
-			cpufreq_dev->cpufreq_val = get_cpu_frequency(
-					cpumask_any(&cpufreq_dev->allowed_cpus),
-					cpufreq_dev->cpufreq_state);
+			max_freq = cpufreq_dev->cpufreq_val;
 
-		max_freq = cpufreq_dev->cpufreq_val;
-
-		if (policy->max != max_freq)
-			cpufreq_verify_within_limits(policy, 0, max_freq);
+			if (policy->max != max_freq)
+				cpufreq_verify_within_limits(policy, 0,
+							     max_freq);
+		}
+		mutex_unlock(&cooling_cpufreq_lock);
+		break;
+	default:
+		return NOTIFY_DONE;
 	}
-	mutex_unlock(&cooling_cpufreq_lock);
 
-	return 0;
+	return NOTIFY_OK;
+}
+
+/**
+ * build_dyn_power_table() - create a dynamic power to frequency table
+ * @cpufreq_device:	the cpufreq cooling device in which to store the table
+ * @capacitance: dynamic power coefficient for these cpus
+ *
+ * Build a dynamic power to frequency table for this cpu and store it
+ * in @cpufreq_device.  This table will be used in cpu_power_to_freq() and
+ * cpu_freq_to_power() to convert between power and frequency
+ * efficiently.  Power is stored in mW, frequency in KHz.  The
+ * resulting table is in ascending order.
+ *
+ * Return: 0 on success, -E* on error.
+ */
+static int build_dyn_power_table(struct cpufreq_cooling_device *cpufreq_device,
+				 u32 capacitance)
+{
+	struct power_table *power_table;
+	struct dev_pm_opp *opp;
+	struct device *dev = NULL;
+	int num_opps = 0, cpu, i, ret = 0;
+	unsigned long freq;
+
+	rcu_read_lock();
+
+	for_each_cpu(cpu, &cpufreq_device->allowed_cpus) {
+		dev = get_cpu_device(cpu);
+		if (!dev) {
+			dev_warn(&cpufreq_device->cool_dev->device,
+				 "No cpu device for cpu %d\n", cpu);
+			continue;
+		}
+
+		num_opps = dev_pm_opp_get_opp_count(dev);
+		if (num_opps > 0) {
+			break;
+		} else if (num_opps < 0) {
+			ret = num_opps;
+			goto unlock;
+		}
+	}
+
+	if (num_opps == 0) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	power_table = kcalloc(num_opps, sizeof(*power_table), GFP_KERNEL);
+	if (!power_table) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	for (freq = 0, i = 0;
+	     opp = dev_pm_opp_find_freq_ceil(dev, &freq), !IS_ERR(opp);
+	     freq++, i++) {
+		u32 freq_mhz, voltage_mv;
+		u64 power;
+
+		freq_mhz = freq / 1000000;
+		voltage_mv = dev_pm_opp_get_voltage(opp) / 1000;
+
+		/*
+		 * Do the multiplication with MHz and millivolt so as
+		 * to not overflow.
+		 */
+		power = (u64)capacitance * freq_mhz * voltage_mv * voltage_mv;
+		do_div(power, 1000000000);
+
+		/* frequency is stored in power_table in KHz */
+		power_table[i].frequency = freq / 1000;
+
+		/* power is stored in mW */
+		power_table[i].power = power;
+	}
+
+	if (i == 0) {
+		ret = PTR_ERR(opp);
+		goto unlock;
+	}
+
+	cpufreq_device->cpu_dev = dev;
+	cpufreq_device->dyn_power_table = power_table;
+	cpufreq_device->dyn_power_table_entries = i;
+
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
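
The scaling above keeps the intermediate product inside 64 bits by multiplying in MHz and millivolts, i.e. power_mW = C * f_MHz * V_mV^2 / 10^9. A hedged worked example with invented values (capacitance 105, an OPP at 1000 MHz and 900 mV) yields roughly 85 mW:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	/* invented values: dynamic power coefficient, OPP frequency and voltage */
	uint64_t capacitance = 105, freq_mhz = 1000, voltage_mv = 900;
	uint64_t power_mw;

	/* same scaling as build_dyn_power_table(): result in mW */
	power_mw = capacitance * freq_mhz * voltage_mv * voltage_mv;
	power_mw /= 1000000000ULL;

	printf("%" PRIu64 " mW\n", power_mw);	/* prints 85 */
	return 0;
}
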
+
+static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_device,
+			     u32 freq)
+{
+	int i;
+	struct power_table *pt = cpufreq_device->dyn_power_table;
+
+	for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++)
+		if (freq < pt[i].frequency)
+			break;
+
+	return pt[i - 1].power;
+}
+
+static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_device,
+			     u32 power)
+{
+	int i;
+	struct power_table *pt = cpufreq_device->dyn_power_table;
+
+	for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++)
+		if (power < pt[i].power)
+			break;
+
+	return pt[i - 1].frequency;
+}
+
+/**
+ * get_load() - get load for a cpu since last updated
+ * @cpufreq_device:	&struct cpufreq_cooling_device for this cpu
+ * @cpu:	cpu number
+ *
+ * Return: The average load of cpu @cpu in percentage since this
+ * function was last called.
+ */
+static u32 get_load(struct cpufreq_cooling_device *cpufreq_device, int cpu)
+{
+	u32 load;
+	u64 now, now_idle, delta_time, delta_idle;
+
+	now_idle = get_cpu_idle_time(cpu, &now, 0);
+	delta_idle = now_idle - cpufreq_device->time_in_idle[cpu];
+	delta_time = now - cpufreq_device->time_in_idle_timestamp[cpu];
+
+	if (delta_time <= delta_idle)
+		load = 0;
+	else
+		load = div64_u64(100 * (delta_time - delta_idle), delta_time);
+
+	cpufreq_device->time_in_idle[cpu] = now_idle;
+	cpufreq_device->time_in_idle_timestamp[cpu] = now;
+
+	return load;
+}
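
get_load() turns the idle-time delta since the previous call into a percentage. As a hedged illustration with made-up numbers, 25 ms of idle time over a 100 ms window reports a load of 75:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t delta_time = 100000, delta_idle = 25000;	/* microseconds */
	uint32_t load;

	if (delta_time <= delta_idle)
		load = 0;
	else
		load = (uint32_t)(100 * (delta_time - delta_idle) / delta_time);

	printf("load = %u%%\n", load);	/* prints 75 */
	return 0;
}
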
+
+/**
+ * get_static_power() - calculate the static power consumed by the cpus
+ * @cpufreq_device:	struct &cpufreq_cooling_device for this cpu cdev
+ * @tz:		thermal zone device in which we're operating
+ * @freq:	frequency in KHz
+ * @power:	pointer in which to store the calculated static power
+ *
+ * Calculate the static power consumed by the cpus described by
+ * @cpufreq_device running at frequency @freq.  This function relies on a
+ * platform specific function that should have been provided when the
+ * actor was registered.  If it wasn't, the static power is assumed to
+ * be negligible.  The calculated static power is stored in @power.
+ *
+ * Return: 0 on success, -E* on failure.
+ */
+static int get_static_power(struct cpufreq_cooling_device *cpufreq_device,
+			    struct thermal_zone_device *tz, unsigned long freq,
+			    u32 *power)
+{
+	struct dev_pm_opp *opp;
+	unsigned long voltage;
+	struct cpumask *cpumask = &cpufreq_device->allowed_cpus;
+	unsigned long freq_hz = freq * 1000;
+
+	if (!cpufreq_device->plat_get_static_power ||
+	    !cpufreq_device->cpu_dev) {
+		*power = 0;
+		return 0;
+	}
+
+	rcu_read_lock();
+
+	opp = dev_pm_opp_find_freq_exact(cpufreq_device->cpu_dev, freq_hz,
+					 true);
+	voltage = dev_pm_opp_get_voltage(opp);
+
+	rcu_read_unlock();
+
+	if (voltage == 0) {
+		dev_warn_ratelimited(cpufreq_device->cpu_dev,
+				     "Failed to get voltage for frequency %lu: %ld\n",
+				     freq_hz, IS_ERR(opp) ? PTR_ERR(opp) : 0);
+		return -EINVAL;
+	}
+
+	return cpufreq_device->plat_get_static_power(cpumask, tz->passive_delay,
+						     voltage, power);
+}
+
+/**
+ * get_dynamic_power() - calculate the dynamic power
+ * @cpufreq_device:	&cpufreq_cooling_device for this cdev
+ * @freq:	current frequency
+ *
+ * Return: the dynamic power consumed by the cpus described by
+ * @cpufreq_device.
+ */
+static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_device,
+			     unsigned long freq)
+{
+	u32 raw_cpu_power;
+
+	raw_cpu_power = cpu_freq_to_power(cpufreq_device, freq);
+	return (raw_cpu_power * cpufreq_device->last_load) / 100;
 }
 
 /* cpufreq cooling device callback functions are defined below */
@@ -354,19 +463,9 @@
 				 unsigned long *state)
 {
 	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
-	struct cpumask *mask = &cpufreq_device->allowed_cpus;
-	unsigned int cpu;
-	unsigned int count = 0;
-	int ret;
 
-	cpu = cpumask_any(mask);
-
-	ret = get_property(cpu, 0, &count, GET_MAXL);
-
-	if (count > 0)
-		*state = count;
-
-	return ret;
+	*state = cpufreq_device->max_level;
+	return 0;
 }
 
 /**
@@ -403,12 +502,225 @@
 				 unsigned long state)
 {
 	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
+	unsigned int cpu = cpumask_any(&cpufreq_device->allowed_cpus);
+	unsigned int clip_freq;
 
-	return cpufreq_apply_cooling(cpufreq_device, state);
+	/* Request state should be less than max_level */
+	if (WARN_ON(state > cpufreq_device->max_level))
+		return -EINVAL;
+
+	/* Check if the old cooling action is same as new cooling action */
+	if (cpufreq_device->cpufreq_state == state)
+		return 0;
+
+	clip_freq = cpufreq_device->freq_table[state];
+	cpufreq_device->cpufreq_state = state;
+	cpufreq_device->cpufreq_val = clip_freq;
+
+	cpufreq_update_policy(cpu);
+
+	return 0;
+}
+
+/**
+ * cpufreq_get_requested_power() - get the current power
+ * @cdev:	&thermal_cooling_device pointer
+ * @tz:		a valid thermal zone device pointer
+ * @power:	pointer in which to store the resulting power
+ *
+ * Calculate the current power consumption of the cpus in milliwatts
+ * and store it in @power.  This function should actually calculate
+ * the requested power, but it's hard to get the frequency that
+ * cpufreq would have assigned if there were no thermal limits.
+ * Instead, we calculate the current power on the assumption that the
+ * immediate future will look like the immediate past.
+ *
+ * We use the current frequency and the average load since this
+ * function was last called.  In reality, there could have been
+ * multiple opps since this function was last called and that affects
+ * the load calculation.  While it's not perfectly accurate, this
+ * simplification is good enough and works.  REVISIT this, as more
+ * complex code may be needed if experiments show that it's not
+ * accurate enough.
+ *
+ * Return: 0 on success, -E* if getting the static power failed.
+ */
+static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
+				       struct thermal_zone_device *tz,
+				       u32 *power)
+{
+	unsigned long freq;
+	int i = 0, cpu, ret;
+	u32 static_power, dynamic_power, total_load = 0;
+	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
+	u32 *load_cpu = NULL;
+
+	cpu = cpumask_any_and(&cpufreq_device->allowed_cpus, cpu_online_mask);
+
+	/*
+	 * All the CPUs are offline, thus the requested power by
+	 * the cdev is 0
+	 */
+	if (cpu >= nr_cpu_ids) {
+		*power = 0;
+		return 0;
+	}
+
+	freq = cpufreq_quick_get(cpu);
+
+	if (trace_thermal_power_cpu_get_power_enabled()) {
+		u32 ncpus = cpumask_weight(&cpufreq_device->allowed_cpus);
+
+		load_cpu = devm_kcalloc(&cdev->device, ncpus, sizeof(*load_cpu),
+					GFP_KERNEL);
+	}
+
+	for_each_cpu(cpu, &cpufreq_device->allowed_cpus) {
+		u32 load;
+
+		if (cpu_online(cpu))
+			load = get_load(cpufreq_device, cpu);
+		else
+			load = 0;
+
+		total_load += load;
+		if (trace_thermal_power_cpu_limit_enabled() && load_cpu)
+			load_cpu[i] = load;
+
+		i++;
+	}
+
+	cpufreq_device->last_load = total_load;
+
+	dynamic_power = get_dynamic_power(cpufreq_device, freq);
+	ret = get_static_power(cpufreq_device, tz, freq, &static_power);
+	if (ret) {
+		if (load_cpu)
+			devm_kfree(&cdev->device, load_cpu);
+		return ret;
+	}
+
+	if (load_cpu) {
+		trace_thermal_power_cpu_get_power(
+			&cpufreq_device->allowed_cpus,
+			freq, load_cpu, i, dynamic_power, static_power);
+
+		devm_kfree(&cdev->device, load_cpu);
+	}
+
+	*power = static_power + dynamic_power;
+	return 0;
+}
+
+/**
+ * cpufreq_state2power() - convert a cpu cdev state to power consumed
+ * @cdev:	&thermal_cooling_device pointer
+ * @tz:		a valid thermal zone device pointer
+ * @state:	cooling device state to be converted
+ * @power:	pointer in which to store the resulting power
+ *
+ * Convert cooling device state @state into power consumption in
+ * milliwatts assuming 100% load.  Store the calculated power in
+ * @power.
+ *
+ * Return: 0 on success, -EINVAL if the cooling device state could not
+ * be converted into a frequency or other -E* if there was an error
+ * when calculating the static power.
+ */
+static int cpufreq_state2power(struct thermal_cooling_device *cdev,
+			       struct thermal_zone_device *tz,
+			       unsigned long state, u32 *power)
+{
+	unsigned int freq, num_cpus;
+	cpumask_t cpumask;
+	u32 static_power, dynamic_power;
+	int ret;
+	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
+
+	cpumask_and(&cpumask, &cpufreq_device->allowed_cpus, cpu_online_mask);
+	num_cpus = cpumask_weight(&cpumask);
+
+	/* None of our cpus are online, so no power */
+	if (num_cpus == 0) {
+		*power = 0;
+		return 0;
+	}
+
+	freq = cpufreq_device->freq_table[state];
+	if (!freq)
+		return -EINVAL;
+
+	dynamic_power = cpu_freq_to_power(cpufreq_device, freq) * num_cpus;
+	ret = get_static_power(cpufreq_device, tz, freq, &static_power);
+	if (ret)
+		return ret;
+
+	*power = static_power + dynamic_power;
+	return 0;
+}
+
+/**
+ * cpufreq_power2state() - convert power to a cooling device state
+ * @cdev:	&thermal_cooling_device pointer
+ * @tz:		a valid thermal zone device pointer
+ * @power:	power in milliwatts to be converted
+ * @state:	pointer in which to store the resulting state
+ *
+ * Calculate a cooling device state for the cpus described by @cdev
+ * that would allow them to consume at most @power mW and store it in
+ * @state.  Note that this calculation depends on external factors
+ * such as the cpu load or the current static power.  Calling this
+ * function with the same power as input can yield different cooling
+ * device states depending on those external factors.
+ *
+ * Return: 0 on success, -ENODEV if no cpus are online or -EINVAL if
+ * the calculated frequency could not be converted to a valid state.
+ * The latter should not happen unless the frequencies available to
+ * cpufreq have changed since the initialization of the cpu cooling
+ * device.
+ */
+static int cpufreq_power2state(struct thermal_cooling_device *cdev,
+			       struct thermal_zone_device *tz, u32 power,
+			       unsigned long *state)
+{
+	unsigned int cpu, cur_freq, target_freq;
+	int ret;
+	s32 dyn_power;
+	u32 last_load, normalised_power, static_power;
+	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
+
+	cpu = cpumask_any_and(&cpufreq_device->allowed_cpus, cpu_online_mask);
+
+	/* None of our cpus are online */
+	if (cpu >= nr_cpu_ids)
+		return -ENODEV;
+
+	cur_freq = cpufreq_quick_get(cpu);
+	ret = get_static_power(cpufreq_device, tz, cur_freq, &static_power);
+	if (ret)
+		return ret;
+
+	dyn_power = power - static_power;
+	dyn_power = dyn_power > 0 ? dyn_power : 0;
+	last_load = cpufreq_device->last_load ?: 1;
+	normalised_power = (dyn_power * 100) / last_load;
+	target_freq = cpu_power_to_freq(cpufreq_device, normalised_power);
+
+	*state = cpufreq_cooling_get_level(cpu, target_freq);
+	if (*state == THERMAL_CSTATE_INVALID) {
+		dev_warn_ratelimited(&cdev->device,
+				     "Failed to convert %dKHz for cpu %d into a cdev state\n",
+				     target_freq, cpu);
+		return -EINVAL;
+	}
+
+	trace_thermal_power_cpu_limit(&cpufreq_device->allowed_cpus,
+				      target_freq, *state, power);
+	return 0;
 }
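
Tying the helpers together, cpufreq_power2state() removes the static power from the budget, re-normalises by the last observed load and looks the result up in the power table. A hedged worked example with invented figures: a 2000 mW budget, 200 mW static power and a last_load of 60% give a normalised dynamic power of 3000 mW, which is then converted to a clip frequency and finally to a cooling state:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int32_t power = 2000, static_power = 200;	/* mW, invented */
	uint32_t last_load = 60;			/* percent, invented */
	int32_t dyn_power;
	uint32_t normalised_power;

	dyn_power = power - static_power;
	dyn_power = dyn_power > 0 ? dyn_power : 0;
	normalised_power = (dyn_power * 100) / last_load;

	/* 3000 mW would then be mapped to a frequency via cpu_power_to_freq() */
	printf("normalised dynamic power = %u mW\n", normalised_power);
	return 0;
}
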
 
 /* Bind cpufreq callbacks to thermal cooling device ops */
-static struct thermal_cooling_device_ops const cpufreq_cooling_ops = {
+static struct thermal_cooling_device_ops cpufreq_cooling_ops = {
 	.get_max_state = cpufreq_get_max_state,
 	.get_cur_state = cpufreq_get_cur_state,
 	.set_cur_state = cpufreq_set_cur_state,
@@ -419,10 +731,28 @@
 	.notifier_call = cpufreq_thermal_notifier,
 };
 
+static unsigned int find_next_max(struct cpufreq_frequency_table *table,
+				  unsigned int prev_max)
+{
+	struct cpufreq_frequency_table *pos;
+	unsigned int max = 0;
+
+	cpufreq_for_each_valid_entry(pos, table) {
+		if (pos->frequency > max && pos->frequency < prev_max)
+			max = pos->frequency;
+	}
+
+	return max;
+}
+
 /**
  * __cpufreq_cooling_register - helper function to create cpufreq cooling device
  * @np: a valid struct device_node to the cooling device device tree node
  * @clip_cpus: cpumask of cpus where the frequency constraints will happen.
+ * Normally this should be the same as cpufreq policy->related_cpus.
+ * @capacitance: dynamic power coefficient for these cpus
+ * @plat_static_func: function to calculate the static power consumed by these
+ *                    cpus (optional)
  *
  * This interface function registers the cpufreq cooling device with the name
  * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
@@ -434,40 +764,77 @@
  */
 static struct thermal_cooling_device *
 __cpufreq_cooling_register(struct device_node *np,
-			   const struct cpumask *clip_cpus)
+			const struct cpumask *clip_cpus, u32 capacitance,
+			get_static_t plat_static_func)
 {
 	struct thermal_cooling_device *cool_dev;
-	struct cpufreq_cooling_device *cpufreq_dev = NULL;
-	unsigned int min = 0, max = 0;
+	struct cpufreq_cooling_device *cpufreq_dev;
 	char dev_name[THERMAL_NAME_LENGTH];
-	int ret = 0, i;
-	struct cpufreq_policy policy;
+	struct cpufreq_frequency_table *pos, *table;
+	unsigned int freq, i, num_cpus;
+	int ret;
 
-	/* Verify that all the clip cpus have same freq_min, freq_max limit */
-	for_each_cpu(i, clip_cpus) {
-		/* continue if cpufreq policy not found and not return error */
-		if (!cpufreq_get_policy(&policy, i))
-			continue;
-		if (min == 0 && max == 0) {
-			min = policy.cpuinfo.min_freq;
-			max = policy.cpuinfo.max_freq;
-		} else {
-			if (min != policy.cpuinfo.min_freq ||
-			    max != policy.cpuinfo.max_freq)
-				return ERR_PTR(-EINVAL);
-		}
+	table = cpufreq_frequency_get_table(cpumask_first(clip_cpus));
+	if (!table) {
+		pr_debug("%s: CPUFreq table not found\n", __func__);
+		return ERR_PTR(-EPROBE_DEFER);
 	}
-	cpufreq_dev = kzalloc(sizeof(struct cpufreq_cooling_device),
-			      GFP_KERNEL);
+
+	cpufreq_dev = kzalloc(sizeof(*cpufreq_dev), GFP_KERNEL);
 	if (!cpufreq_dev)
 		return ERR_PTR(-ENOMEM);
 
+	num_cpus = cpumask_weight(clip_cpus);
+	cpufreq_dev->time_in_idle = kcalloc(num_cpus,
+					    sizeof(*cpufreq_dev->time_in_idle),
+					    GFP_KERNEL);
+	if (!cpufreq_dev->time_in_idle) {
+		cool_dev = ERR_PTR(-ENOMEM);
+		goto free_cdev;
+	}
+
+	cpufreq_dev->time_in_idle_timestamp =
+		kcalloc(num_cpus, sizeof(*cpufreq_dev->time_in_idle_timestamp),
+			GFP_KERNEL);
+	if (!cpufreq_dev->time_in_idle_timestamp) {
+		cool_dev = ERR_PTR(-ENOMEM);
+		goto free_time_in_idle;
+	}
+
+	/* Find max levels */
+	cpufreq_for_each_valid_entry(pos, table)
+		cpufreq_dev->max_level++;
+
+	cpufreq_dev->freq_table = kmalloc(sizeof(*cpufreq_dev->freq_table) *
+					  cpufreq_dev->max_level, GFP_KERNEL);
+	if (!cpufreq_dev->freq_table) {
+		cool_dev = ERR_PTR(-ENOMEM);
+		goto free_time_in_idle_timestamp;
+	}
+
+	/* max_level is an index, not a counter */
+	cpufreq_dev->max_level--;
+
 	cpumask_copy(&cpufreq_dev->allowed_cpus, clip_cpus);
 
+	if (capacitance) {
+		cpufreq_cooling_ops.get_requested_power =
+			cpufreq_get_requested_power;
+		cpufreq_cooling_ops.state2power = cpufreq_state2power;
+		cpufreq_cooling_ops.power2state = cpufreq_power2state;
+		cpufreq_dev->plat_get_static_power = plat_static_func;
+
+		ret = build_dyn_power_table(cpufreq_dev, capacitance);
+		if (ret) {
+			cool_dev = ERR_PTR(ret);
+			goto free_table;
+		}
+	}
+
 	ret = get_idr(&cpufreq_idr, &cpufreq_dev->id);
 	if (ret) {
-		kfree(cpufreq_dev);
-		return ERR_PTR(-EINVAL);
+		cool_dev = ERR_PTR(ret);
+		goto free_table;
 	}
 
 	snprintf(dev_name, sizeof(dev_name), "thermal-cpufreq-%d",
@@ -475,25 +842,48 @@
 
 	cool_dev = thermal_of_cooling_device_register(np, dev_name, cpufreq_dev,
 						      &cpufreq_cooling_ops);
-	if (IS_ERR(cool_dev)) {
-		release_idr(&cpufreq_idr, cpufreq_dev->id);
-		kfree(cpufreq_dev);
-		return cool_dev;
+	if (IS_ERR(cool_dev))
+		goto remove_idr;
+
+	/* Fill freq-table in descending order of frequencies */
+	for (i = 0, freq = -1; i <= cpufreq_dev->max_level; i++) {
+		freq = find_next_max(table, freq);
+		cpufreq_dev->freq_table[i] = freq;
+
+		/* Warn for duplicate entries */
+		if (!freq)
+			pr_warn("%s: table has duplicate entries\n", __func__);
+		else
+			pr_debug("%s: freq:%u KHz\n", __func__, freq);
 	}
+
+	cpufreq_dev->cpufreq_val = cpufreq_dev->freq_table[0];
 	cpufreq_dev->cool_dev = cool_dev;
-	cpufreq_dev->cpufreq_state = 0;
+
 	mutex_lock(&cooling_cpufreq_lock);
 
 	/* Register the notifier for first cpufreq cooling device */
-	if (cpufreq_dev_count == 0)
+	if (list_empty(&cpufreq_dev_list))
 		cpufreq_register_notifier(&thermal_cpufreq_notifier_block,
 					  CPUFREQ_POLICY_NOTIFIER);
-	cpufreq_dev_count++;
 	list_add(&cpufreq_dev->node, &cpufreq_dev_list);
 
 	mutex_unlock(&cooling_cpufreq_lock);
 
 	return cool_dev;
+
+remove_idr:
+	release_idr(&cpufreq_idr, cpufreq_dev->id);
+free_table:
+	kfree(cpufreq_dev->freq_table);
+free_time_in_idle_timestamp:
+	kfree(cpufreq_dev->time_in_idle_timestamp);
+free_time_in_idle:
+	kfree(cpufreq_dev->time_in_idle);
+free_cdev:
+	kfree(cpufreq_dev);
+
+	return cool_dev;
 }
 
 /**
@@ -510,7 +900,7 @@
 struct thermal_cooling_device *
 cpufreq_cooling_register(const struct cpumask *clip_cpus)
 {
-	return __cpufreq_cooling_register(NULL, clip_cpus);
+	return __cpufreq_cooling_register(NULL, clip_cpus, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
 
@@ -534,11 +924,78 @@
 	if (!np)
 		return ERR_PTR(-EINVAL);
 
-	return __cpufreq_cooling_register(np, clip_cpus);
+	return __cpufreq_cooling_register(np, clip_cpus, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register);
 
 /**
+ * cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
+ * @clip_cpus:	cpumask of cpus where the frequency constraints will happen
+ * @capacitance:	dynamic power coefficient for these cpus
+ * @plat_static_func:	function to calculate the static power consumed by these
+ *			cpus (optional)
+ *
+ * This interface function registers the cpufreq cooling device with
+ * the name "thermal-cpufreq-%x".  This api can support multiple
+ * instances of cpufreq cooling devices.  Using this function, the
+ * cooling device will implement the power extensions by using a
+ * simple cpu power model.  The cpus must have registered their OPPs
+ * using the OPP library.
+ *
+ * An optional @plat_static_func may be provided to calculate the
+ * static power consumed by these cpus.  If the platform's static
+ * power consumption is unknown or negligible, make it NULL.
+ *
+ * Return: a valid struct thermal_cooling_device pointer on success,
+ * on failure, it returns a corresponding ERR_PTR().
+ */
+struct thermal_cooling_device *
+cpufreq_power_cooling_register(const struct cpumask *clip_cpus, u32 capacitance,
+			       get_static_t plat_static_func)
+{
+	return __cpufreq_cooling_register(NULL, clip_cpus, capacitance,
+				plat_static_func);
+}
+EXPORT_SYMBOL(cpufreq_power_cooling_register);
+
+/**
+ * of_cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
+ * @np:	a valid struct device_node to the cooling device device tree node
+ * @clip_cpus:	cpumask of cpus where the frequency constraints will happen
+ * @capacitance:	dynamic power coefficient for these cpus
+ * @plat_static_func:	function to calculate the static power consumed by these
+ *			cpus (optional)
+ *
+ * This interface function registers the cpufreq cooling device with
+ * the name "thermal-cpufreq-%x".  This api can support multiple
+ * instances of cpufreq cooling devices.  Using this API, the cpufreq
+ * cooling device will be linked to the device tree node provided.
+ * Using this function, the cooling device will implement the power
+ * extensions by using a simple cpu power model.  The cpus must have
+ * registered their OPPs using the OPP library.
+ *
+ * An optional @plat_static_func may be provided to calculate the
+ * static power consumed by these cpus.  If the platform's static
+ * power consumption is unknown or negligible, make it NULL.
+ *
+ * Return: a valid struct thermal_cooling_device pointer on success,
+ * on failure, it returns a corresponding ERR_PTR().
+ */
+struct thermal_cooling_device *
+of_cpufreq_power_cooling_register(struct device_node *np,
+				  const struct cpumask *clip_cpus,
+				  u32 capacitance,
+				  get_static_t plat_static_func)
+{
+	if (!np)
+		return ERR_PTR(-EINVAL);
+
+	return __cpufreq_cooling_register(np, clip_cpus, capacitance,
+				plat_static_func);
+}
+EXPORT_SYMBOL(of_cpufreq_power_cooling_register);
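
As a hedged usage sketch (the capacitance value is an assumption for illustration, and the NULL static-power callback treats static power as negligible), a cpufreq driver could register the power-aware cooling device from its device tree node like this (builds only inside a kernel tree):

#include <linux/cpu_cooling.h>
#include <linux/cpumask.h>
#include <linux/of.h>

static struct thermal_cooling_device *
example_register_cpu_cooling(struct device_node *np)
{
	u32 capacitance = 105;	/* assumed dynamic power coefficient */

	return of_cpufreq_power_cooling_register(np, cpu_possible_mask,
						 capacitance, NULL);
}
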
+
+/**
  * cpufreq_cooling_unregister - function to remove cpufreq cooling device.
  * @cdev: thermal cooling device pointer.
  *
@@ -554,16 +1011,18 @@
 	cpufreq_dev = cdev->devdata;
 	mutex_lock(&cooling_cpufreq_lock);
 	list_del(&cpufreq_dev->node);
-	cpufreq_dev_count--;
 
 	/* Unregister the notifier for the last cpufreq cooling device */
-	if (cpufreq_dev_count == 0)
+	if (list_empty(&cpufreq_dev_list))
 		cpufreq_unregister_notifier(&thermal_cpufreq_notifier_block,
 					    CPUFREQ_POLICY_NOTIFIER);
 	mutex_unlock(&cooling_cpufreq_lock);
 
 	thermal_cooling_device_unregister(cpufreq_dev->cool_dev);
 	release_idr(&cpufreq_idr, cpufreq_dev->id);
+	kfree(cpufreq_dev->time_in_idle_timestamp);
+	kfree(cpufreq_dev->time_in_idle);
+	kfree(cpufreq_dev->freq_table);
 	kfree(cpufreq_dev);
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);
diff --git a/drivers/thermal/db8500_cpufreq_cooling.c b/drivers/thermal/db8500_cpufreq_cooling.c
index 786d192..1ac7ec6 100644
--- a/drivers/thermal/db8500_cpufreq_cooling.c
+++ b/drivers/thermal/db8500_cpufreq_cooling.c
@@ -18,7 +18,6 @@
  */
 
 #include <linux/cpu_cooling.h>
-#include <linux/cpufreq.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/of.h>
@@ -30,10 +29,6 @@
 	struct thermal_cooling_device *cdev;
 	struct cpumask mask_val;
 
-	/* make sure cpufreq driver has been initialized */
-	if (!cpufreq_frequency_get_table(0))
-		return -EPROBE_DEFER;
-
 	cpumask_set_cpu(0, &mask_val);
 	cdev = cpufreq_cooling_register(&mask_val);
 
diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c
index 1e3b3bf..e3ccc221 100644
--- a/drivers/thermal/db8500_thermal.c
+++ b/drivers/thermal/db8500_thermal.c
@@ -76,7 +76,7 @@
 		upper = lower = i > max_state ? max_state : i;
 
 		ret = thermal_zone_bind_cooling_device(thermal, i, cdev,
-			upper, lower);
+			upper, lower, THERMAL_WEIGHT_DEFAULT);
 
 		dev_info(&cdev->device, "%s bind to %d: %d-%s\n", cdev->type,
 			i, ret, ret ? "fail" : "succeed");
diff --git a/drivers/thermal/fair_share.c b/drivers/thermal/fair_share.c
index 6e0a3fb..8c50b8d6 100644
--- a/drivers/thermal/fair_share.c
+++ b/drivers/thermal/fair_share.c
@@ -59,13 +59,13 @@
 }
 
 static long get_target_state(struct thermal_zone_device *tz,
-		struct thermal_cooling_device *cdev, int weight, int level)
+		struct thermal_cooling_device *cdev, int percentage, int level)
 {
 	unsigned long max_state;
 
 	cdev->ops->get_max_state(cdev, &max_state);
 
-	return (long)(weight * level * max_state) / (100 * tz->trips);
+	return (long)(percentage * level * max_state) / (100 * tz->trips);
 }
 
 /**
@@ -77,7 +77,7 @@
  *
  * Parameters used for Throttling:
  * P1. max_state: Maximum throttle state exposed by the cooling device.
- * P2. weight[i]/100:
+ * P2. percentage[i]/100:
  *	How 'effective' the 'i'th device is, in cooling the given zone.
  * P3. cur_trip_level/max_no_of_trips:
  *	This describes the extent to which the devices should be throttled.
@@ -88,28 +88,33 @@
  */
 static int fair_share_throttle(struct thermal_zone_device *tz, int trip)
 {
-	const struct thermal_zone_params *tzp;
-	struct thermal_cooling_device *cdev;
 	struct thermal_instance *instance;
-	int i;
+	int total_weight = 0;
+	int total_instance = 0;
 	int cur_trip_level = get_trip_level(tz);
 
-	if (!tz->tzp || !tz->tzp->tbp)
-		return -EINVAL;
-
-	tzp = tz->tzp;
-
-	for (i = 0; i < tzp->num_tbps; i++) {
-		if (!tzp->tbp[i].cdev)
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		if (instance->trip != trip)
 			continue;
 
-		cdev = tzp->tbp[i].cdev;
-		instance = get_thermal_instance(tz, cdev, trip);
-		if (!instance)
+		total_weight += instance->weight;
+		total_instance++;
+	}
+
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		int percentage;
+		struct thermal_cooling_device *cdev = instance->cdev;
+
+		if (instance->trip != trip)
 			continue;
 
-		instance->target = get_target_state(tz, cdev,
-					tzp->tbp[i].weight, cur_trip_level);
+		if (!total_weight)
+			percentage = 100 / total_instance;
+		else
+			percentage = (instance->weight * 100) / total_weight;
+
+		instance->target = get_target_state(tz, cdev, percentage,
+						    cur_trip_level);
 
 		instance->cdev->updated = false;
 		thermal_cdev_update(cdev);
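
With weights now read from each thermal instance, the split is straightforward. A hedged worked example with invented numbers: two instances on the same trip with weights 1 and 3 get 25% and 75%; with max_state = 10, 4 trips and the zone at trip level 2, the targets are 25*2*10/400 = 1 and 75*2*10/400 = 3:

#include <stdio.h>

static long target(int percentage, int level, long max_state, int trips)
{
	return (long)(percentage * level * max_state) / (100 * trips);
}

int main(void)
{
	int weights[] = { 1, 3 }, total_weight = 4;	/* invented weights */
	int level = 2, trips = 4;
	long max_state = 10;
	int i;

	for (i = 0; i < 2; i++) {
		int percentage = weights[i] * 100 / total_weight;

		printf("instance %d: %d%% -> target %ld\n",
		       i, percentage, target(percentage, level, max_state, trips));
	}
	return 0;
}
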
diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c
index 5a1f107..31ada06 100644
--- a/drivers/thermal/imx_thermal.c
+++ b/drivers/thermal/imx_thermal.c
@@ -9,7 +9,6 @@
 
 #include <linux/clk.h>
 #include <linux/cpu_cooling.h>
-#include <linux/cpufreq.h>
 #include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/init.h>
@@ -307,7 +306,8 @@
 
 	ret = thermal_zone_bind_cooling_device(tz, IMX_TRIP_PASSIVE, cdev,
 					       THERMAL_NO_LIMIT,
-					       THERMAL_NO_LIMIT);
+					       THERMAL_NO_LIMIT,
+					       THERMAL_WEIGHT_DEFAULT);
 	if (ret) {
 		dev_err(&tz->device,
 			"binding zone %s with cdev %s failed:%d\n",
@@ -459,10 +459,6 @@
 	int measure_freq;
 	int ret;
 
-	if (!cpufreq_get_current_driver()) {
-		dev_dbg(&pdev->dev, "no cpufreq driver!");
-		return -EPROBE_DEFER;
-	}
 	data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
diff --git a/drivers/thermal/int340x_thermal/int3403_thermal.c b/drivers/thermal/int340x_thermal/int3403_thermal.c
index 6e9fb62..76910d3 100644
--- a/drivers/thermal/int340x_thermal/int3403_thermal.c
+++ b/drivers/thermal/int340x_thermal/int3403_thermal.c
@@ -471,7 +471,6 @@
 	.remove = int3403_remove,
 	.driver = {
 		.name = "int3403 thermal",
-		.owner  = THIS_MODULE,
 		.acpi_match_table = int3403_device_ids,
 	},
 };
diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c
index 62143ba..b295b2b 100644
--- a/drivers/thermal/of-thermal.c
+++ b/drivers/thermal/of-thermal.c
@@ -30,27 +30,13 @@
 #include <linux/err.h>
 #include <linux/export.h>
 #include <linux/string.h>
+#include <linux/thermal.h>
 
 #include "thermal_core.h"
 
 /***   Private data structures to represent thermal device tree data ***/
 
 /**
- * struct __thermal_trip - representation of a point in temperature domain
- * @np: pointer to struct device_node that this trip point was created from
- * @temperature: temperature value in miliCelsius
- * @hysteresis: relative hysteresis in miliCelsius
- * @type: trip point type
- */
-
-struct __thermal_trip {
-	struct device_node *np;
-	unsigned long int temperature;
-	unsigned long int hysteresis;
-	enum thermal_trip_type type;
-};
-
-/**
  * struct __thermal_bind_param - a match between trip and cooling device
  * @cooling_device: a pointer to identify the referred cooling device
  * @trip_id: the trip point index
@@ -72,23 +58,26 @@
  * @mode: current thermal zone device mode (enabled/disabled)
  * @passive_delay: polling interval while passive cooling is activated
  * @polling_delay: zone polling interval
+ * @slope: slope of the temperature adjustment curve
+ * @offset: offset of the temperature adjustment curve
  * @ntrips: number of trip points
  * @trips: an array of trip points (0..ntrips - 1)
  * @num_tbps: number of thermal bind params
  * @tbps: an array of thermal bind params (0..num_tbps - 1)
  * @sensor_data: sensor private data used while reading temperature and trend
- * @get_temp: sensor callback to read temperature
- * @get_trend: sensor callback to read temperature trend
+ * @ops: set of callbacks to handle the thermal zone based on DT
  */
 
 struct __thermal_zone {
 	enum thermal_device_mode mode;
 	int passive_delay;
 	int polling_delay;
+	int slope;
+	int offset;
 
 	/* trip data */
 	int ntrips;
-	struct __thermal_trip *trips;
+	struct thermal_trip *trips;
 
 	/* cooling binding data */
 	int num_tbps;
@@ -96,8 +85,7 @@
 
 	/* sensor interface */
 	void *sensor_data;
-	int (*get_temp)(void *, long *);
-	int (*get_trend)(void *, long *);
+	const struct thermal_zone_of_device_ops *ops;
 };
 
 /***   DT thermal zone device callbacks   ***/
@@ -107,10 +95,96 @@
 {
 	struct __thermal_zone *data = tz->devdata;
 
-	if (!data->get_temp)
+	if (!data->ops->get_temp)
 		return -EINVAL;
 
-	return data->get_temp(data->sensor_data, temp);
+	return data->ops->get_temp(data->sensor_data, temp);
+}
+
+/**
+ * of_thermal_get_ntrips - function to export the number of available
+ *			   trip points.
+ * @tz: pointer to a thermal zone
+ *
+ * This function is a globally visible wrapper to get the number of trip
+ * points stored in the local struct __thermal_zone.
+ *
+ * Return: number of available trip points, -ENODEV when data not available
+ */
+int of_thermal_get_ntrips(struct thermal_zone_device *tz)
+{
+	struct __thermal_zone *data = tz->devdata;
+
+	if (!data || IS_ERR(data))
+		return -ENODEV;
+
+	return data->ntrips;
+}
+EXPORT_SYMBOL_GPL(of_thermal_get_ntrips);
+
+/**
+ * of_thermal_is_trip_valid - function to check if trip point is valid
+ *
+ * @tz:	pointer to a thermal zone
+ * @trip:	trip point to evaluate
+ *
+ * This function checks whether the passed trip point is valid.
+ *
+ * Return: true if trip point is valid, false otherwise
+ */
+bool of_thermal_is_trip_valid(struct thermal_zone_device *tz, int trip)
+{
+	struct __thermal_zone *data = tz->devdata;
+
+	if (!data || trip >= data->ntrips || trip < 0)
+		return false;
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(of_thermal_is_trip_valid);
+
+/**
+ * of_thermal_get_trip_points - function to get access to the globally
+ *				exported trip points
+ *
+ * @tz:	pointer to a thermal zone
+ *
+ * This function provides a pointer to the trip points table.
+ *
+ * Return: pointer to the trip points table, NULL otherwise
+ */
+const struct thermal_trip *
+of_thermal_get_trip_points(struct thermal_zone_device *tz)
+{
+	struct __thermal_zone *data = tz->devdata;
+
+	if (!data)
+		return NULL;
+
+	return data->trips;
+}
+EXPORT_SYMBOL_GPL(of_thermal_get_trip_points);
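A usage sketch for the helpers above, assuming struct thermal_trip keeps the temperature/hysteresis/type fields that the removed __thermal_trip had; the function name and the register write are illustrative only:

static void example_program_hw_trips(struct thermal_zone_device *tz)
{
	const struct thermal_trip *trips = of_thermal_get_trip_points(tz);
	int i, ntrips = of_thermal_get_ntrips(tz);

	if (!trips || ntrips <= 0)
		return;

	for (i = 0; i < ntrips; i++) {
		if (!of_thermal_is_trip_valid(tz, i))
			continue;
		/* e.g. program trips[i].temperature into a threshold register */
	}
}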
+
+/**
+ * of_thermal_set_emul_temp - function to set emulated temperature
+ *
+ * @tz:	pointer to a thermal zone
+ * @temp:	temperature to set
+ *
+ * This function gives the ability to set an emulated temperature value,
+ * which is handy for debugging.
+ *
+ * Return: zero on success, error code otherwise
+ */
+static int of_thermal_set_emul_temp(struct thermal_zone_device *tz,
+				    unsigned long temp)
+{
+	struct __thermal_zone *data = tz->devdata;
+
+	if (!data->ops || !data->ops->set_emul_temp)
+		return -EINVAL;
+
+	return data->ops->set_emul_temp(data->sensor_data, temp);
 }
 
 static int of_thermal_get_trend(struct thermal_zone_device *tz, int trip,
@@ -120,10 +194,10 @@
 	long dev_trend;
 	int r;
 
-	if (!data->get_trend)
+	if (!data->ops->get_trend)
 		return -EINVAL;
 
-	r = data->get_trend(data->sensor_data, &dev_trend);
+	r = data->ops->get_trend(data->sensor_data, &dev_trend);
 	if (r)
 		return r;
 
@@ -157,7 +231,8 @@
 			ret = thermal_zone_bind_cooling_device(thermal,
 						tbp->trip_id, cdev,
 						tbp->max,
-						tbp->min);
+						tbp->min,
+						tbp->usage);
 			if (ret)
 				return ret;
 		}
@@ -324,8 +399,7 @@
 static struct thermal_zone_device *
 thermal_zone_of_add_sensor(struct device_node *zone,
 			   struct device_node *sensor, void *data,
-			   int (*get_temp)(void *, long *),
-			   int (*get_trend)(void *, long *))
+			   const struct thermal_zone_of_device_ops *ops)
 {
 	struct thermal_zone_device *tzd;
 	struct __thermal_zone *tz;
@@ -336,13 +410,16 @@
 
 	tz = tzd->devdata;
 
+	if (!ops)
+		return ERR_PTR(-EINVAL);
+
 	mutex_lock(&tzd->lock);
-	tz->get_temp = get_temp;
-	tz->get_trend = get_trend;
+	tz->ops = ops;
 	tz->sensor_data = data;
 
 	tzd->ops->get_temp = of_thermal_get_temp;
 	tzd->ops->get_trend = of_thermal_get_trend;
+	tzd->ops->set_emul_temp = of_thermal_set_emul_temp;
 	mutex_unlock(&tzd->lock);
 
 	return tzd;
@@ -356,8 +433,7 @@
  *             than one sensors
  * @data: a private pointer (owned by the caller) that will be passed
  *        back, when a temperature reading is needed.
- * @get_temp: a pointer to a function that reads the sensor temperature.
- * @get_trend: a pointer to a function that reads the sensor temperature trend.
+ * @ops: struct thermal_zone_of_device_ops *. Must contain at least .get_temp.
  *
  * This function will search the list of thermal zones described in device
  * tree and look for the zone that refer to the sensor device pointed by
@@ -382,9 +458,8 @@
  * check the return value with help of IS_ERR() helper.
  */
 struct thermal_zone_device *
-thermal_zone_of_sensor_register(struct device *dev, int sensor_id,
-				void *data, int (*get_temp)(void *, long *),
-				int (*get_trend)(void *, long *))
+thermal_zone_of_sensor_register(struct device *dev, int sensor_id, void *data,
+				const struct thermal_zone_of_device_ops *ops)
 {
 	struct device_node *np, *child, *sensor_np;
 	struct thermal_zone_device *tzd = ERR_PTR(-ENODEV);
@@ -426,9 +501,10 @@
 
 		if (sensor_specs.np == sensor_np && id == sensor_id) {
 			tzd = thermal_zone_of_add_sensor(child, sensor_np,
-							 data,
-							 get_temp,
-							 get_trend);
+							 data, ops);
+			if (!IS_ERR(tzd))
+				tzd->ops->set_mode(tzd, THERMAL_DEVICE_ENABLED);
+
 			of_node_put(sensor_specs.np);
 			of_node_put(child);
 			goto exit;
@@ -475,9 +551,9 @@
 	mutex_lock(&tzd->lock);
 	tzd->ops->get_temp = NULL;
 	tzd->ops->get_trend = NULL;
+	tzd->ops->set_emul_temp = NULL;
 
-	tz->get_temp = NULL;
-	tz->get_trend = NULL;
+	tz->ops = NULL;
 	tz->sensor_data = NULL;
 	mutex_unlock(&tzd->lock);
 }
@@ -501,7 +577,7 @@
  */
 static int thermal_of_populate_bind_params(struct device_node *np,
 					   struct __thermal_bind_params *__tbp,
-					   struct __thermal_trip *trips,
+					   struct thermal_trip *trips,
 					   int ntrips)
 {
 	struct of_phandle_args cooling_spec;
@@ -510,7 +586,7 @@
 	u32 prop;
 
 	/* Default weight. Usage is optional */
-	__tbp->usage = 0;
+	__tbp->usage = THERMAL_WEIGHT_DEFAULT;
 	ret = of_property_read_u32(np, "contribution", &prop);
 	if (ret == 0)
 		__tbp->usage = prop;
@@ -604,7 +680,7 @@
  * Return: 0 on success, proper error code otherwise
  */
 static int thermal_of_populate_trip(struct device_node *np,
-				    struct __thermal_trip *trip)
+				    struct thermal_trip *trip)
 {
 	int prop;
 	int ret;
@@ -644,7 +720,7 @@
  * @np parameter and fills the read data into a __thermal_zone data structure
  * and return this pointer.
  *
- * TODO: Missing properties to parse: thermal-sensor-names and coefficients
+ * TODO: Missing properties to parse: thermal-sensor-names
  *
  * Return: On success returns a valid struct __thermal_zone,
  * otherwise, it returns a corresponding ERR_PTR(). Caller must
@@ -656,7 +732,7 @@
 	struct device_node *child = NULL, *gchild;
 	struct __thermal_zone *tz;
 	int ret, i;
-	u32 prop;
+	u32 prop, coef[2];
 
 	if (!np) {
 		pr_err("no thermal zone np\n");
@@ -681,6 +757,20 @@
 	}
 	tz->polling_delay = prop;
 
+	/*
+	 * REVISIT: for now, the thermal framework supports only
+	 * one sensor per thermal zone. Thus, we are considering
+	 * only the first two values as slope and offset.
+	 */
+	ret = of_property_read_u32_array(np, "coefficients", coef, 2);
+	if (ret == 0) {
+		tz->slope = coef[0];
+		tz->offset = coef[1];
+	} else {
+		tz->slope = 1;
+		tz->offset = 0;
+	}
+
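The framework only stores these two values in the zone parameters; how they are applied is left to the sensor driver. One plausible use, purely as an assumption, is a linear correction of the raw reading:

/* illustrative only: slope/offset semantics are defined by each driver */
static long example_apply_coefficients(long raw_temp, int slope, int offset)
{
	return raw_temp * slope + offset;
}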
 	/* trips */
 	child = of_get_child_by_name(np, "trips");
 
@@ -794,6 +884,8 @@
 	for_each_child_of_node(np, child) {
 		struct thermal_zone_device *zone;
 		struct thermal_zone_params *tzp;
+		int i, mask = 0;
+		u32 prop;
 
 		/* Check whether child is enabled or not */
 		if (!of_device_is_available(child))
@@ -820,8 +912,18 @@
 		/* No hwmon because there might be hwmon drivers registering */
 		tzp->no_hwmon = true;
 
+		if (!of_property_read_u32(child, "sustainable-power", &prop))
+			tzp->sustainable_power = prop;
+
+		for (i = 0; i < tz->ntrips; i++)
+			mask |= 1 << i;
+
+		/* these two are left for temperature drivers to use */
+		tzp->slope = tz->slope;
+		tzp->offset = tz->offset;
+
 		zone = thermal_zone_device_register(child->name, tz->ntrips,
-						    0, tz,
+						    mask, tz,
 						    ops, tzp,
 						    tz->passive_delay,
 						    tz->polling_delay);
diff --git a/drivers/thermal/power_allocator.c b/drivers/thermal/power_allocator.c
new file mode 100644
index 0000000..4672250
--- /dev/null
+++ b/drivers/thermal/power_allocator.c
@@ -0,0 +1,539 @@
+/*
+ * A power allocator to manage temperature
+ *
+ * Copyright (C) 2014 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "Power allocator: " fmt
+
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/thermal_power_allocator.h>
+
+#include "thermal_core.h"
+
+#define FRAC_BITS 10
+#define int_to_frac(x) ((x) << FRAC_BITS)
+#define frac_to_int(x) ((x) >> FRAC_BITS)
+
+/**
+ * mul_frac() - multiply two fixed-point numbers
+ * @x:	first multiplicand
+ * @y:	second multiplicand
+ *
+ * Return: the result of multiplying two fixed-point numbers.  The
+ * result is also a fixed-point number.
+ */
+static inline s64 mul_frac(s64 x, s64 y)
+{
+	return (x * y) >> FRAC_BITS;
+}
+
+/**
+ * div_frac() - divide two fixed-point numbers
+ * @x:	the dividend
+ * @y:	the divisor
+ *
+ * Return: the result of dividing two fixed-point numbers.  The
+ * result is also a fixed-point number.
+ */
+static inline s64 div_frac(s64 x, s64 y)
+{
+	return div_s64(x << FRAC_BITS, y);
+}
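A throwaway illustration of the Q10 fixed-point helpers above (values chosen only for the example, not used anywhere in the governor):

static void fixed_point_example(void)
{
	s64 a = int_to_frac(3);	/* 3072 with FRAC_BITS == 10 */
	s64 b = int_to_frac(2);	/* 2048 */

	/* mul_frac(a, b) == 6144 == int_to_frac(6) */
	/* div_frac(a, b) == 1536, i.e. 1.5 in Q10 */
	(void)mul_frac(a, b);
	(void)div_frac(a, b);
}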
+
+/**
+ * struct power_allocator_params - parameters for the power allocator governor
+ * @err_integral:	accumulated error in the PID controller.
+ * @prev_err:	error in the previous iteration of the PID controller.
+ *		Used to calculate the derivative term.
+ * @trip_switch_on:	first passive trip point of the thermal zone.  The
+ *			governor switches on when this trip point is crossed.
+ * @trip_max_desired_temperature:	last passive trip point of the thermal
+ *					zone.  The temperature we are
+ *					controlling for.
+ */
+struct power_allocator_params {
+	s64 err_integral;
+	s32 prev_err;
+	int trip_switch_on;
+	int trip_max_desired_temperature;
+};
+
+/**
+ * pid_controller() - PID controller
+ * @tz:	thermal zone we are operating in
+ * @current_temp:	the current temperature in millicelsius
+ * @control_temp:	the target temperature in millicelsius
+ * @max_allocatable_power:	maximum allocatable power for this thermal zone
+ *
+ * This PID controller increases the available power budget so that the
+ * temperature of the thermal zone gets as close as possible to
+ * @control_temp and limits the power if it exceeds it.  k_po is the
+ * proportional term when we are overshooting, k_pu is the
+ * proportional term when we are undershooting.  integral_cutoff is a
+ * threshold below which we stop accumulating the error.  The
+ * accumulated error is only valid if the requested power will make
+ * the system warmer.  If the system is mostly idle, there's no point
+ * in accumulating positive error.
+ *
+ * Return: The power budget for the next period.
+ */
+static u32 pid_controller(struct thermal_zone_device *tz,
+			  unsigned long current_temp,
+			  unsigned long control_temp,
+			  u32 max_allocatable_power)
+{
+	s64 p, i, d, power_range;
+	s32 err, max_power_frac;
+	struct power_allocator_params *params = tz->governor_data;
+
+	max_power_frac = int_to_frac(max_allocatable_power);
+
+	err = ((s32)control_temp - (s32)current_temp);
+	err = int_to_frac(err);
+
+	/* Calculate the proportional term */
+	p = mul_frac(err < 0 ? tz->tzp->k_po : tz->tzp->k_pu, err);
+
+	/*
+	 * Calculate the integral term
+	 *
+	 * if the error is less than the cutoff, allow integration (but
+	 * the integral is limited to max power)
+	 */
+	i = mul_frac(tz->tzp->k_i, params->err_integral);
+
+	if (err < int_to_frac(tz->tzp->integral_cutoff)) {
+		s64 i_next = i + mul_frac(tz->tzp->k_i, err);
+
+		if (abs64(i_next) < max_power_frac) {
+			i = i_next;
+			params->err_integral += err;
+		}
+	}
+
+	/*
+	 * Calculate the derivative term
+	 *
+	 * We do err - prev_err, so with a positive k_d, a decreasing
+	 * error (i.e. driving closer to the line) results in less
+	 * power being applied, slowing down the controller.
+	 */
+	d = mul_frac(tz->tzp->k_d, err - params->prev_err);
+	d = div_frac(d, tz->passive_delay);
+	params->prev_err = err;
+
+	power_range = p + i + d;
+
+	/* feed-forward the known sustainable dissipatable power */
+	power_range = tz->tzp->sustainable_power + frac_to_int(power_range);
+
+	power_range = clamp(power_range, (s64)0, (s64)max_allocatable_power);
+
+	trace_thermal_power_allocator_pid(tz, frac_to_int(err),
+					  frac_to_int(params->err_integral),
+					  frac_to_int(p), frac_to_int(i),
+					  frac_to_int(d), power_range);
+
+	return power_range;
+}
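As a worked example of the proportional path only (made-up gain and temperatures, integral and derivative terms left at zero): with k_pu == int_to_frac(1) and the zone 500 mC below the control temperature, the budget is sustainable_power plus 500 mW.

static u32 pid_proportional_example(void)
{
	u32 sustainable_power = 2000;		/* mW, example value */
	s32 err = int_to_frac(500);		/* 500 mC of undershoot */
	s64 p = mul_frac(int_to_frac(1), err);	/* == int_to_frac(500) */

	return sustainable_power + frac_to_int(p);	/* 2500 mW */
}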
+
+/**
+ * divvy_up_power() - divvy the allocated power between the actors
+ * @req_power:	each actor's requested power
+ * @max_power:	each actor's maximum available power
+ * @num_actors:	size of the @req_power, @max_power and @granted_power arrays
+ * @total_req_power: sum of @req_power
+ * @power_range:	total allocated power
+ * @granted_power:	output array: each actor's granted power
+ * @extra_actor_power:	an appropriately sized array to be used in the
+ *			function as temporary storage of the extra power given
+ *			to the actors
+ *
+ * This function divides the total allocated power (@power_range)
+ * fairly between the actors.  It first tries to give each actor a
+ * share of the @power_range according to how much power it requested
+ * compared to the rest of the actors.  For example, if only one actor
+ * requests power, then it receives all the @power_range.  If
+ * three actors each requests 1mW, each receives a third of the
+ * @power_range.
+ *
+ * If any actor received more than their maximum power, then that
+ * surplus is re-divvied among the actors based on how far they are
+ * from their respective maximums.
+ *
+ * Granted power for each actor is written to @granted_power, which
+ * should've been allocated by the calling function.
+ */
+static void divvy_up_power(u32 *req_power, u32 *max_power, int num_actors,
+			   u32 total_req_power, u32 power_range,
+			   u32 *granted_power, u32 *extra_actor_power)
+{
+	u32 extra_power, capped_extra_power;
+	int i;
+
+	/*
+	 * Prevent division by 0 if none of the actors request power.
+	 */
+	if (!total_req_power)
+		total_req_power = 1;
+
+	capped_extra_power = 0;
+	extra_power = 0;
+	for (i = 0; i < num_actors; i++) {
+		u64 req_range = req_power[i] * power_range;
+
+		granted_power[i] = DIV_ROUND_CLOSEST_ULL(req_range,
+							 total_req_power);
+
+		if (granted_power[i] > max_power[i]) {
+			extra_power += granted_power[i] - max_power[i];
+			granted_power[i] = max_power[i];
+		}
+
+		extra_actor_power[i] = max_power[i] - granted_power[i];
+		capped_extra_power += extra_actor_power[i];
+	}
+
+	if (!extra_power)
+		return;
+
+	/*
+	 * Re-divvy the reclaimed extra among actors based on
+	 * how far they are from the max
+	 */
+	extra_power = min(extra_power, capped_extra_power);
+	if (capped_extra_power > 0)
+		for (i = 0; i < num_actors; i++)
+			granted_power[i] += (extra_actor_power[i] *
+					extra_power) / capped_extra_power;
+}
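A worked example of the two passes above, with illustrative numbers only: 1000 mW to distribute between two actors requesting 600 mW and 200 mW, with maxima of 500 mW and 1000 mW.

static void divvy_example(void)
{
	u32 req_p[2] = { 600, 200 };	/* requested power, mW */
	u32 max_p[2] = { 500, 1000 };	/* maximum power, mW */
	u32 granted[2], extra[2];

	/*
	 * The first pass grants 750/250 mW, actor 0 is capped at its
	 * 500 mW maximum, and the reclaimed 250 mW goes to actor 1:
	 * the final result is 500 mW each.
	 */
	divvy_up_power(req_p, max_p, 2, 800, 1000, granted, extra);
}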
+
+static int allocate_power(struct thermal_zone_device *tz,
+			  unsigned long current_temp,
+			  unsigned long control_temp)
+{
+	struct thermal_instance *instance;
+	struct power_allocator_params *params = tz->governor_data;
+	u32 *req_power, *max_power, *granted_power, *extra_actor_power;
+	u32 total_req_power, max_allocatable_power;
+	u32 total_granted_power, power_range;
+	int i, num_actors, total_weight, ret = 0;
+	int trip_max_desired_temperature = params->trip_max_desired_temperature;
+
+	mutex_lock(&tz->lock);
+
+	num_actors = 0;
+	total_weight = 0;
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		if ((instance->trip == trip_max_desired_temperature) &&
+		    cdev_is_power_actor(instance->cdev)) {
+			num_actors++;
+			total_weight += instance->weight;
+		}
+	}
+
+	/*
+	 * We need to allocate three arrays of the same size:
+	 * req_power, max_power and granted_power.  They are going to
+	 * be needed until this function returns.  Allocate them all
+	 * in one go to simplify the allocation and deallocation
+	 * logic.
+	 */
+	BUILD_BUG_ON(sizeof(*req_power) != sizeof(*max_power));
+	BUILD_BUG_ON(sizeof(*req_power) != sizeof(*granted_power));
+	BUILD_BUG_ON(sizeof(*req_power) != sizeof(*extra_actor_power));
+	req_power = devm_kcalloc(&tz->device, num_actors * 4,
+				 sizeof(*req_power), GFP_KERNEL);
+	if (!req_power) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	max_power = &req_power[num_actors];
+	granted_power = &req_power[2 * num_actors];
+	extra_actor_power = &req_power[3 * num_actors];
+
+	i = 0;
+	total_req_power = 0;
+	max_allocatable_power = 0;
+
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		int weight;
+		struct thermal_cooling_device *cdev = instance->cdev;
+
+		if (instance->trip != trip_max_desired_temperature)
+			continue;
+
+		if (!cdev_is_power_actor(cdev))
+			continue;
+
+		if (cdev->ops->get_requested_power(cdev, tz, &req_power[i]))
+			continue;
+
+		if (!total_weight)
+			weight = 1 << FRAC_BITS;
+		else
+			weight = instance->weight;
+
+		req_power[i] = frac_to_int(weight * req_power[i]);
+
+		if (power_actor_get_max_power(cdev, tz, &max_power[i]))
+			continue;
+
+		total_req_power += req_power[i];
+		max_allocatable_power += max_power[i];
+
+		i++;
+	}
+
+	power_range = pid_controller(tz, current_temp, control_temp,
+				     max_allocatable_power);
+
+	divvy_up_power(req_power, max_power, num_actors, total_req_power,
+		       power_range, granted_power, extra_actor_power);
+
+	total_granted_power = 0;
+	i = 0;
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		if (instance->trip != trip_max_desired_temperature)
+			continue;
+
+		if (!cdev_is_power_actor(instance->cdev))
+			continue;
+
+		power_actor_set_power(instance->cdev, instance,
+				      granted_power[i]);
+		total_granted_power += granted_power[i];
+
+		i++;
+	}
+
+	trace_thermal_power_allocator(tz, req_power, total_req_power,
+				      granted_power, total_granted_power,
+				      num_actors, power_range,
+				      max_allocatable_power, current_temp,
+				      (s32)control_temp - (s32)current_temp);
+
+	devm_kfree(&tz->device, req_power);
+unlock:
+	mutex_unlock(&tz->lock);
+
+	return ret;
+}
+
+static int get_governor_trips(struct thermal_zone_device *tz,
+			      struct power_allocator_params *params)
+{
+	int i, ret, last_passive;
+	bool found_first_passive;
+
+	found_first_passive = false;
+	last_passive = -1;
+	ret = -EINVAL;
+
+	for (i = 0; i < tz->trips; i++) {
+		enum thermal_trip_type type;
+
+		ret = tz->ops->get_trip_type(tz, i, &type);
+		if (ret)
+			return ret;
+
+		if (!found_first_passive) {
+			if (type == THERMAL_TRIP_PASSIVE) {
+				params->trip_switch_on = i;
+				found_first_passive = true;
+			}
+		} else if (type == THERMAL_TRIP_PASSIVE) {
+			last_passive = i;
+		} else {
+			break;
+		}
+	}
+
+	if (last_passive != -1) {
+		params->trip_max_desired_temperature = last_passive;
+		ret = 0;
+	} else {
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static void reset_pid_controller(struct power_allocator_params *params)
+{
+	params->err_integral = 0;
+	params->prev_err = 0;
+}
+
+static void allow_maximum_power(struct thermal_zone_device *tz)
+{
+	struct thermal_instance *instance;
+	struct power_allocator_params *params = tz->governor_data;
+
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		if ((instance->trip != params->trip_max_desired_temperature) ||
+		    (!cdev_is_power_actor(instance->cdev)))
+			continue;
+
+		instance->target = 0;
+		instance->cdev->updated = false;
+		thermal_cdev_update(instance->cdev);
+	}
+}
+
+/**
+ * power_allocator_bind() - bind the power_allocator governor to a thermal zone
+ * @tz:	thermal zone to bind it to
+ *
+ * Check that the thermal zone is valid for this governor, that is, it
+ * has two thermal trips.  If so, initialize the PID controller
+ * parameters and bind it to the thermal zone.
+ *
+ * Return: 0 on success, -EINVAL if the trips were invalid or -ENOMEM
+ * if we ran out of memory.
+ */
+static int power_allocator_bind(struct thermal_zone_device *tz)
+{
+	int ret;
+	struct power_allocator_params *params;
+	unsigned long switch_on_temp, control_temp;
+	u32 temperature_threshold;
+
+	if (!tz->tzp || !tz->tzp->sustainable_power) {
+		dev_err(&tz->device,
+			"power_allocator: missing sustainable_power\n");
+		return -EINVAL;
+	}
+
+	params = devm_kzalloc(&tz->device, sizeof(*params), GFP_KERNEL);
+	if (!params)
+		return -ENOMEM;
+
+	ret = get_governor_trips(tz, params);
+	if (ret) {
+		dev_err(&tz->device,
+			"thermal zone %s has wrong trip setup for power allocator\n",
+			tz->type);
+		goto free;
+	}
+
+	ret = tz->ops->get_trip_temp(tz, params->trip_switch_on,
+				     &switch_on_temp);
+	if (ret)
+		goto free;
+
+	ret = tz->ops->get_trip_temp(tz, params->trip_max_desired_temperature,
+				     &control_temp);
+	if (ret)
+		goto free;
+
+	temperature_threshold = control_temp - switch_on_temp;
+
+	tz->tzp->k_po = tz->tzp->k_po ?:
+		int_to_frac(tz->tzp->sustainable_power) / temperature_threshold;
+	tz->tzp->k_pu = tz->tzp->k_pu ?:
+		int_to_frac(2 * tz->tzp->sustainable_power) /
+		temperature_threshold;
+	tz->tzp->k_i = tz->tzp->k_i ?: int_to_frac(10) / 1000;
+	/*
+	 * The default for k_d and integral_cutoff is 0, so we can
+	 * leave them as they are.
+	 */
+
+	reset_pid_controller(params);
+
+	tz->governor_data = params;
+
+	return 0;
+
+free:
+	devm_kfree(&tz->device, params);
+	return ret;
+}
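To make the defaults above concrete, an illustrative computation (none of these numbers come from the patch): a sustainable_power of 2000 mW and passive trips at 75 C and 85 C give a 10000 mC threshold.

static void default_gains_example(void)
{
	u32 sustainable_power = 2000;			/* mW */
	u32 temperature_threshold = 85000 - 75000;	/* mC */
	s32 k_po = int_to_frac(sustainable_power) / temperature_threshold;
	s32 k_pu = int_to_frac(2 * sustainable_power) / temperature_threshold;
	s32 k_i = int_to_frac(10) / 1000;

	/* k_po == 204, k_pu == 409, k_i == 10, all in Q10 fixed point */
	(void)k_po; (void)k_pu; (void)k_i;
}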
+
+static void power_allocator_unbind(struct thermal_zone_device *tz)
+{
+	dev_dbg(&tz->device, "Unbinding from thermal zone %d\n", tz->id);
+	devm_kfree(&tz->device, tz->governor_data);
+	tz->governor_data = NULL;
+}
+
+static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
+{
+	int ret;
+	unsigned long switch_on_temp, control_temp, current_temp;
+	struct power_allocator_params *params = tz->governor_data;
+
+	/*
+	 * We get called for every trip point but we only need to do
+	 * our calculations once
+	 */
+	if (trip != params->trip_max_desired_temperature)
+		return 0;
+
+	ret = thermal_zone_get_temp(tz, &current_temp);
+	if (ret) {
+		dev_warn(&tz->device, "Failed to get temperature: %d\n", ret);
+		return ret;
+	}
+
+	ret = tz->ops->get_trip_temp(tz, params->trip_switch_on,
+				     &switch_on_temp);
+	if (ret) {
+		dev_warn(&tz->device,
+			 "Failed to get switch on temperature: %d\n", ret);
+		return ret;
+	}
+
+	if (current_temp < switch_on_temp) {
+		tz->passive = 0;
+		reset_pid_controller(params);
+		allow_maximum_power(tz);
+		return 0;
+	}
+
+	tz->passive = 1;
+
+	ret = tz->ops->get_trip_temp(tz, params->trip_max_desired_temperature,
+				&control_temp);
+	if (ret) {
+		dev_warn(&tz->device,
+			 "Failed to get the maximum desired temperature: %d\n",
+			 ret);
+		return ret;
+	}
+
+	return allocate_power(tz, current_temp, control_temp);
+}
+
+static struct thermal_governor thermal_gov_power_allocator = {
+	.name		= "power_allocator",
+	.bind_to_tz	= power_allocator_bind,
+	.unbind_from_tz	= power_allocator_unbind,
+	.throttle	= power_allocator_throttle,
+};
+
+int thermal_gov_power_allocator_register(void)
+{
+	return thermal_register_governor(&thermal_gov_power_allocator);
+}
+
+void thermal_gov_power_allocator_unregister(void)
+{
+	thermal_unregister_governor(&thermal_gov_power_allocator);
+}
diff --git a/drivers/thermal/samsung/Kconfig b/drivers/thermal/samsung/Kconfig
index f760389..c43306e 100644
--- a/drivers/thermal/samsung/Kconfig
+++ b/drivers/thermal/samsung/Kconfig
@@ -1,6 +1,6 @@
 config EXYNOS_THERMAL
 	tristate "Exynos thermal management unit driver"
-	depends on ARCH_HAS_BANDGAP && OF
+	depends on OF
 	help
 	  If you say yes here you get support for the TMU (Thermal Management
 	  Unit) driver for SAMSUNG EXYNOS series of SoCs. This driver initialises
diff --git a/drivers/thermal/samsung/exynos_thermal_common.c b/drivers/thermal/samsung/exynos_thermal_common.c
index b6be572..7c97db1 100644
--- a/drivers/thermal/samsung/exynos_thermal_common.c
+++ b/drivers/thermal/samsung/exynos_thermal_common.c
@@ -163,7 +163,7 @@
 		case MONITOR_ZONE:
 		case WARN_ZONE:
 			if (thermal_zone_bind_cooling_device(thermal, i, cdev,
-								level, 0)) {
+							     level, 0, THERMAL_WEIGHT_DEFAULT)) {
 				dev_err(data->dev,
 					"error unbinding cdev inst=%d\n", i);
 				ret = -EINVAL;
@@ -371,9 +371,11 @@
 		th_zone->cool_dev[th_zone->cool_dev_size] =
 					cpufreq_cooling_register(&mask_val);
 		if (IS_ERR(th_zone->cool_dev[th_zone->cool_dev_size])) {
-			dev_err(sensor_conf->dev,
-				"Failed to register cpufreq cooling device\n");
-			ret = -EINVAL;
+			ret = PTR_ERR(th_zone->cool_dev[th_zone->cool_dev_size]);
+			if (ret != -EPROBE_DEFER)
+				dev_err(sensor_conf->dev,
+					"Failed to register cpufreq cooling device: %d\n",
+					ret);
 			goto err_unregister;
 		}
 		th_zone->cool_dev_size++;
diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c
index 49c0924..2afca9b 100644
--- a/drivers/thermal/samsung/exynos_tmu.c
+++ b/drivers/thermal/samsung/exynos_tmu.c
@@ -683,7 +683,10 @@
 	/* Register the sensor with thermal management interface */
 	ret = exynos_register_thermal(sensor_conf);
 	if (ret) {
-		dev_err(&pdev->dev, "Failed to register thermal interface\n");
+		if (ret != -EPROBE_DEFER)
+			dev_err(&pdev->dev,
+				"Failed to register thermal interface: %d\n",
+				ret);
 		goto err_clk;
 	}
 	data->reg_conf = sensor_conf;
diff --git a/drivers/thermal/tegra_soctherm.c b/drivers/thermal/tegra_soctherm.c
new file mode 100644
index 0000000..9197fc0
--- /dev/null
+++ b/drivers/thermal/tegra_soctherm.c
@@ -0,0 +1,476 @@
+/*
+ * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Author:
+ *	Mikko Perttunen <mperttunen@nvidia.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/reset.h>
+#include <linux/thermal.h>
+
+#include <soc/tegra/fuse.h>
+
+#define SENSOR_CONFIG0				0
+#define SENSOR_CONFIG0_STOP			BIT(0)
+#define SENSOR_CONFIG0_TALL_SHIFT		8
+#define SENSOR_CONFIG0_TCALC_OVER		BIT(4)
+#define SENSOR_CONFIG0_OVER			BIT(3)
+#define SENSOR_CONFIG0_CPTR_OVER		BIT(2)
+
+#define SENSOR_CONFIG1				4
+#define SENSOR_CONFIG1_TSAMPLE_SHIFT		0
+#define SENSOR_CONFIG1_TIDDQ_EN_SHIFT		15
+#define SENSOR_CONFIG1_TEN_COUNT_SHIFT		24
+#define SENSOR_CONFIG1_TEMP_ENABLE		BIT(31)
+
+#define SENSOR_CONFIG2				8
+#define SENSOR_CONFIG2_THERMA_SHIFT		16
+#define SENSOR_CONFIG2_THERMB_SHIFT		0
+
+#define SENSOR_PDIV				0x1c0
+#define SENSOR_PDIV_T124			0x8888
+#define SENSOR_HOTSPOT_OFF			0x1c4
+#define SENSOR_HOTSPOT_OFF_T124			0x00060600
+#define SENSOR_TEMP1				0x1c8
+#define SENSOR_TEMP2				0x1cc
+
+#define SENSOR_TEMP_MASK			0xffff
+#define READBACK_VALUE_MASK			0xff00
+#define READBACK_VALUE_SHIFT			8
+#define READBACK_ADD_HALF			BIT(7)
+#define READBACK_NEGATE				BIT(1)
+
+#define FUSE_TSENSOR8_CALIB			0x180
+#define FUSE_SPARE_REALIGNMENT_REG_0		0x1fc
+
+#define FUSE_TSENSOR_CALIB_CP_TS_BASE_MASK	0x1fff
+#define FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK	(0x1fff << 13)
+#define FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT	13
+
+#define FUSE_TSENSOR8_CALIB_CP_TS_BASE_MASK	0x3ff
+#define FUSE_TSENSOR8_CALIB_FT_TS_BASE_MASK	(0x7ff << 10)
+#define FUSE_TSENSOR8_CALIB_FT_TS_BASE_SHIFT	10
+
+#define FUSE_SPARE_REALIGNMENT_REG_SHIFT_CP_MASK 0x3f
+#define FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_MASK (0x1f << 21)
+#define FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_SHIFT 21
+
+#define NOMINAL_CALIB_FT_T124			105
+#define NOMINAL_CALIB_CP_T124			25
+
+struct tegra_tsensor_configuration {
+	u32 tall, tsample, tiddq_en, ten_count, pdiv, tsample_ate, pdiv_ate;
+};
+
+struct tegra_tsensor {
+	const struct tegra_tsensor_configuration *config;
+	u32 base, calib_fuse_offset;
+	/* Correction values used to modify values read from calibration fuses */
+	s32 fuse_corr_alpha, fuse_corr_beta;
+};
+
+struct tegra_thermctl_zone {
+	void __iomem *reg;
+	unsigned int shift;
+};
+
+static const struct tegra_tsensor_configuration t124_tsensor_config = {
+	.tall = 16300,
+	.tsample = 120,
+	.tiddq_en = 1,
+	.ten_count = 1,
+	.pdiv = 8,
+	.tsample_ate = 480,
+	.pdiv_ate = 8
+};
+
+static const struct tegra_tsensor t124_tsensors[] = {
+	{
+		.config = &t124_tsensor_config,
+		.base = 0xc0,
+		.calib_fuse_offset = 0x098,
+		.fuse_corr_alpha = 1135400,
+		.fuse_corr_beta = -6266900,
+	},
+	{
+		.config = &t124_tsensor_config,
+		.base = 0xe0,
+		.calib_fuse_offset = 0x084,
+		.fuse_corr_alpha = 1122220,
+		.fuse_corr_beta = -5700700,
+	},
+	{
+		.config = &t124_tsensor_config,
+		.base = 0x100,
+		.calib_fuse_offset = 0x088,
+		.fuse_corr_alpha = 1127000,
+		.fuse_corr_beta = -6768200,
+	},
+	{
+		.config = &t124_tsensor_config,
+		.base = 0x120,
+		.calib_fuse_offset = 0x12c,
+		.fuse_corr_alpha = 1110900,
+		.fuse_corr_beta = -6232000,
+	},
+	{
+		.config = &t124_tsensor_config,
+		.base = 0x140,
+		.calib_fuse_offset = 0x158,
+		.fuse_corr_alpha = 1122300,
+		.fuse_corr_beta = -5936400,
+	},
+	{
+		.config = &t124_tsensor_config,
+		.base = 0x160,
+		.calib_fuse_offset = 0x15c,
+		.fuse_corr_alpha = 1145700,
+		.fuse_corr_beta = -7124600,
+	},
+	{
+		.config = &t124_tsensor_config,
+		.base = 0x180,
+		.calib_fuse_offset = 0x154,
+		.fuse_corr_alpha = 1120100,
+		.fuse_corr_beta = -6000500,
+	},
+	{
+		.config = &t124_tsensor_config,
+		.base = 0x1a0,
+		.calib_fuse_offset = 0x160,
+		.fuse_corr_alpha = 1106500,
+		.fuse_corr_beta = -6729300,
+	},
+};
+
+struct tegra_soctherm {
+	struct reset_control *reset;
+	struct clk *clock_tsensor;
+	struct clk *clock_soctherm;
+	void __iomem *regs;
+
+	struct thermal_zone_device *thermctl_tzs[4];
+};
+
+struct tsensor_shared_calibration {
+	u32 base_cp, base_ft;
+	u32 actual_temp_cp, actual_temp_ft;
+};
+
+static int calculate_shared_calibration(struct tsensor_shared_calibration *r)
+{
+	u32 val, shifted_cp, shifted_ft;
+	int err;
+
+	err = tegra_fuse_readl(FUSE_TSENSOR8_CALIB, &val);
+	if (err)
+		return err;
+	r->base_cp = val & FUSE_TSENSOR8_CALIB_CP_TS_BASE_MASK;
+	r->base_ft = (val & FUSE_TSENSOR8_CALIB_FT_TS_BASE_MASK)
+		>> FUSE_TSENSOR8_CALIB_FT_TS_BASE_SHIFT;
+	val = ((val & FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_MASK)
+		>> FUSE_SPARE_REALIGNMENT_REG_SHIFT_FT_SHIFT);
+	shifted_ft = sign_extend32(val, 4);
+
+	err = tegra_fuse_readl(FUSE_SPARE_REALIGNMENT_REG_0, &val);
+	if (err)
+		return err;
+	shifted_cp = sign_extend32(val, 5);
+
+	r->actual_temp_cp = 2 * NOMINAL_CALIB_CP_T124 + shifted_cp;
+	r->actual_temp_ft = 2 * NOMINAL_CALIB_FT_T124 + shifted_ft;
+
+	return 0;
+}
+
+static s64 div64_s64_precise(s64 a, s64 b)
+{
+	s64 r, al;
+
+	/* Scale up for increased precision division */
+	al = a << 16;
+
+	r = div64_s64(al * 2 + 1, 2 * b);
+	return r >> 16;
+}
+
+static int
+calculate_tsensor_calibration(const struct tegra_tsensor *sensor,
+			      const struct tsensor_shared_calibration *shared,
+			      u32 *calib)
+{
+	u32 val;
+	s32 actual_tsensor_ft, actual_tsensor_cp, delta_sens, delta_temp,
+	    mult, div;
+	s16 therma, thermb;
+	s64 tmp;
+	int err;
+
+	err = tegra_fuse_readl(sensor->calib_fuse_offset, &val);
+	if (err)
+		return err;
+
+	actual_tsensor_cp = (shared->base_cp * 64) + sign_extend32(val, 12);
+	val = (val & FUSE_TSENSOR_CALIB_FT_TS_BASE_MASK)
+		>> FUSE_TSENSOR_CALIB_FT_TS_BASE_SHIFT;
+	actual_tsensor_ft = (shared->base_ft * 32) + sign_extend32(val, 12);
+
+	delta_sens = actual_tsensor_ft - actual_tsensor_cp;
+	delta_temp = shared->actual_temp_ft - shared->actual_temp_cp;
+
+	mult = sensor->config->pdiv * sensor->config->tsample_ate;
+	div = sensor->config->tsample * sensor->config->pdiv_ate;
+
+	therma = div64_s64_precise((s64) delta_temp * (1LL << 13) * mult,
+				   (s64) delta_sens * div);
+
+	tmp = (s64)actual_tsensor_ft * shared->actual_temp_cp -
+	      (s64)actual_tsensor_cp * shared->actual_temp_ft;
+	thermb = div64_s64_precise(tmp, (s64)delta_sens);
+
+	therma = div64_s64_precise((s64)therma * sensor->fuse_corr_alpha,
+				   (s64)1000000LL);
+	thermb = div64_s64_precise((s64)thermb * sensor->fuse_corr_alpha +
+				   sensor->fuse_corr_beta, (s64)1000000LL);
+
+	*calib = ((u16)therma << SENSOR_CONFIG2_THERMA_SHIFT) |
+		 ((u16)thermb << SENSOR_CONFIG2_THERMB_SHIFT);
+
+	return 0;
+}
+
+static int enable_tsensor(struct tegra_soctherm *tegra,
+			  const struct tegra_tsensor *sensor,
+			  const struct tsensor_shared_calibration *shared)
+{
+	void __iomem *base = tegra->regs + sensor->base;
+	unsigned int val;
+	u32 calib;
+	int err;
+
+	err = calculate_tsensor_calibration(sensor, shared, &calib);
+	if (err)
+		return err;
+
+	val = sensor->config->tall << SENSOR_CONFIG0_TALL_SHIFT;
+	writel(val, base + SENSOR_CONFIG0);
+
+	val  = (sensor->config->tsample - 1) << SENSOR_CONFIG1_TSAMPLE_SHIFT;
+	val |= sensor->config->tiddq_en << SENSOR_CONFIG1_TIDDQ_EN_SHIFT;
+	val |= sensor->config->ten_count << SENSOR_CONFIG1_TEN_COUNT_SHIFT;
+	val |= SENSOR_CONFIG1_TEMP_ENABLE;
+	writel(val, base + SENSOR_CONFIG1);
+
+	writel(calib, base + SENSOR_CONFIG2);
+
+	return 0;
+}
+
+/*
+ * Translate from soctherm readback format to millicelsius.
+ * The soctherm readback format in bits is as follows:
+ *   TTTTTTTT H______N
+ * where T's contain the temperature in Celsius,
+ * H denotes an addition of 0.5 Celsius and N denotes negation
+ * of the final value.
+ */
+static long translate_temp(u16 val)
+{
+	long t;
+
+	t = ((val & READBACK_VALUE_MASK) >> READBACK_VALUE_SHIFT) * 1000;
+	if (val & READBACK_ADD_HALF)
+		t += 500;
+	if (val & READBACK_NEGATE)
+		t *= -1;
+
+	return t;
+}
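Two decoded examples of the readback format described above (raw values invented for illustration):

static void translate_temp_example(void)
{
	/* 0x23 -> 35 C, bit 7 adds 0.5 C, bit 1 negates the result */
	long a = translate_temp(0x2380);	/* 35500 mC */
	long b = translate_temp(0x2382);	/* -35500 mC */

	(void)a; (void)b;
}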
+
+static int tegra_thermctl_get_temp(void *data, long *out_temp)
+{
+	struct tegra_thermctl_zone *zone = data;
+	u32 val;
+
+	val = (readl(zone->reg) >> zone->shift) & SENSOR_TEMP_MASK;
+	*out_temp = translate_temp(val);
+
+	return 0;
+}
+
+static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
+	.get_temp = tegra_thermctl_get_temp,
+};
+
+static const struct of_device_id tegra_soctherm_of_match[] = {
+	{ .compatible = "nvidia,tegra124-soctherm" },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, tegra_soctherm_of_match);
+
+struct thermctl_zone_desc {
+	unsigned int offset;
+	unsigned int shift;
+};
+
+static const struct thermctl_zone_desc t124_thermctl_temp_zones[] = {
+	{ SENSOR_TEMP1, 16 },
+	{ SENSOR_TEMP2, 16 },
+	{ SENSOR_TEMP1, 0 },
+	{ SENSOR_TEMP2, 0 }
+};
+
+static int tegra_soctherm_probe(struct platform_device *pdev)
+{
+	struct tegra_soctherm *tegra;
+	struct thermal_zone_device *tz;
+	struct tsensor_shared_calibration shared_calib;
+	struct resource *res;
+	unsigned int i;
+	int err;
+
+	const struct tegra_tsensor *tsensors = t124_tsensors;
+
+	tegra = devm_kzalloc(&pdev->dev, sizeof(*tegra), GFP_KERNEL);
+	if (!tegra)
+		return -ENOMEM;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	tegra->regs = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(tegra->regs))
+		return PTR_ERR(tegra->regs);
+
+	tegra->reset = devm_reset_control_get(&pdev->dev, "soctherm");
+	if (IS_ERR(tegra->reset)) {
+		dev_err(&pdev->dev, "can't get soctherm reset\n");
+		return PTR_ERR(tegra->reset);
+	}
+
+	tegra->clock_tsensor = devm_clk_get(&pdev->dev, "tsensor");
+	if (IS_ERR(tegra->clock_tsensor)) {
+		dev_err(&pdev->dev, "can't get tsensor clock\n");
+		return PTR_ERR(tegra->clock_tsensor);
+	}
+
+	tegra->clock_soctherm = devm_clk_get(&pdev->dev, "soctherm");
+	if (IS_ERR(tegra->clock_soctherm)) {
+		dev_err(&pdev->dev, "can't get soctherm clock\n");
+		return PTR_ERR(tegra->clock_soctherm);
+	}
+
+	reset_control_assert(tegra->reset);
+
+	err = clk_prepare_enable(tegra->clock_soctherm);
+	if (err)
+		return err;
+
+	err = clk_prepare_enable(tegra->clock_tsensor);
+	if (err) {
+		clk_disable_unprepare(tegra->clock_soctherm);
+		return err;
+	}
+
+	reset_control_deassert(tegra->reset);
+
+	/* Initialize raw sensors */
+
+	err = calculate_shared_calibration(&shared_calib);
+	if (err)
+		goto disable_clocks;
+
+	for (i = 0; i < ARRAY_SIZE(t124_tsensors); ++i) {
+		err = enable_tsensor(tegra, tsensors + i, &shared_calib);
+		if (err)
+			goto disable_clocks;
+	}
+
+	writel(SENSOR_PDIV_T124, tegra->regs + SENSOR_PDIV);
+	writel(SENSOR_HOTSPOT_OFF_T124, tegra->regs + SENSOR_HOTSPOT_OFF);
+
+	/* Initialize thermctl sensors */
+
+	for (i = 0; i < ARRAY_SIZE(tegra->thermctl_tzs); ++i) {
+		struct tegra_thermctl_zone *zone =
+			devm_kzalloc(&pdev->dev, sizeof(*zone), GFP_KERNEL);
+		if (!zone) {
+			err = -ENOMEM;
+			goto unregister_tzs;
+		}
+
+		zone->reg = tegra->regs + t124_thermctl_temp_zones[i].offset;
+		zone->shift = t124_thermctl_temp_zones[i].shift;
+
+		tz = thermal_zone_of_sensor_register(&pdev->dev, i, zone,
+						     &tegra_of_thermal_ops);
+		if (IS_ERR(tz)) {
+			err = PTR_ERR(tz);
+			dev_err(&pdev->dev, "failed to register sensor: %d\n",
+				err);
+			goto unregister_tzs;
+		}
+
+		tegra->thermctl_tzs[i] = tz;
+	}
+
+	return 0;
+
+unregister_tzs:
+	while (i--)
+		thermal_zone_of_sensor_unregister(&pdev->dev,
+						  tegra->thermctl_tzs[i]);
+
+disable_clocks:
+	clk_disable_unprepare(tegra->clock_tsensor);
+	clk_disable_unprepare(tegra->clock_soctherm);
+
+	return err;
+}
+
+static int tegra_soctherm_remove(struct platform_device *pdev)
+{
+	struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(tegra->thermctl_tzs); ++i) {
+		thermal_zone_of_sensor_unregister(&pdev->dev,
+						  tegra->thermctl_tzs[i]);
+	}
+
+	clk_disable_unprepare(tegra->clock_tsensor);
+	clk_disable_unprepare(tegra->clock_soctherm);
+
+	return 0;
+}
+
+static struct platform_driver tegra_soctherm_driver = {
+	.probe = tegra_soctherm_probe,
+	.remove = tegra_soctherm_remove,
+	.driver = {
+		.name = "tegra-soctherm",
+		.of_match_table = tegra_soctherm_of_match,
+	},
+};
+module_platform_driver(tegra_soctherm_driver);
+
+MODULE_AUTHOR("Mikko Perttunen <mperttunen@nvidia.com>");
+MODULE_DESCRIPTION("NVIDIA Tegra SOCTHERM thermal management driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 488e9bf..249b612 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -75,6 +75,58 @@
 	return NULL;
 }
 
+/**
+ * bind_previous_governor() - bind the previous governor of the thermal zone
+ * @tz:		a valid pointer to a struct thermal_zone_device
+ * @failed_gov_name:	the name of the governor that failed to register
+ *
+ * Register the previous governor of the thermal zone after a new
+ * governor has failed to be bound.
+ */
+static void bind_previous_governor(struct thermal_zone_device *tz,
+				   const char *failed_gov_name)
+{
+	if (tz->governor && tz->governor->bind_to_tz) {
+		if (tz->governor->bind_to_tz(tz)) {
+			dev_err(&tz->device,
+				"governor %s failed to bind and the previous one (%s) failed to bind again, thermal zone %s has no governor\n",
+				failed_gov_name, tz->governor->name, tz->type);
+			tz->governor = NULL;
+		}
+	}
+}
+
+/**
+ * thermal_set_governor() - Switch to another governor
+ * @tz:		a valid pointer to a struct thermal_zone_device
+ * @new_gov:	pointer to the new governor
+ *
+ * Change the governor of thermal zone @tz.
+ *
+ * Return: 0 on success, an error if the new governor's bind_to_tz() failed.
+ */
+static int thermal_set_governor(struct thermal_zone_device *tz,
+				struct thermal_governor *new_gov)
+{
+	int ret = 0;
+
+	if (tz->governor && tz->governor->unbind_from_tz)
+		tz->governor->unbind_from_tz(tz);
+
+	if (new_gov && new_gov->bind_to_tz) {
+		ret = new_gov->bind_to_tz(tz);
+		if (ret) {
+			bind_previous_governor(tz, new_gov->name);
+
+			return ret;
+		}
+	}
+
+	tz->governor = new_gov;
+
+	return ret;
+}
+
 int thermal_register_governor(struct thermal_governor *governor)
 {
 	int err;
@@ -107,8 +159,15 @@
 
 		name = pos->tzp->governor_name;
 
-		if (!strncasecmp(name, governor->name, THERMAL_NAME_LENGTH))
-			pos->governor = governor;
+		if (!strncasecmp(name, governor->name, THERMAL_NAME_LENGTH)) {
+			int ret;
+
+			ret = thermal_set_governor(pos, governor);
+			if (ret)
+				dev_err(&pos->device,
+					"Failed to set governor %s for thermal zone %s: %d\n",
+					governor->name, pos->type, ret);
+		}
 	}
 
 	mutex_unlock(&thermal_list_lock);
@@ -134,7 +193,7 @@
 	list_for_each_entry(pos, &thermal_tz_list, node) {
 		if (!strncasecmp(pos->governor->name, governor->name,
 						THERMAL_NAME_LENGTH))
-			pos->governor = NULL;
+			thermal_set_governor(pos, NULL);
 	}
 
 	mutex_unlock(&thermal_list_lock);
@@ -218,7 +277,8 @@
 
 static void __bind(struct thermal_zone_device *tz, int mask,
 			struct thermal_cooling_device *cdev,
-			unsigned long *limits)
+			unsigned long *limits,
+			unsigned int weight)
 {
 	int i, ret;
 
@@ -233,7 +293,8 @@
 				upper = limits[i * 2 + 1];
 			}
 			ret = thermal_zone_bind_cooling_device(tz, i, cdev,
-							       upper, lower);
+							       upper, lower,
+							       weight);
 			if (ret)
 				print_bind_err_msg(tz, cdev, ret);
 		}
@@ -280,7 +341,8 @@
 				continue;
 			tzp->tbp[i].cdev = cdev;
 			__bind(pos, tzp->tbp[i].trip_mask, cdev,
-			       tzp->tbp[i].binding_limits);
+			       tzp->tbp[i].binding_limits,
+			       tzp->tbp[i].weight);
 		}
 	}
 
@@ -319,7 +381,8 @@
 				continue;
 			tzp->tbp[i].cdev = pos;
 			__bind(tz, tzp->tbp[i].trip_mask, pos,
-			       tzp->tbp[i].binding_limits);
+			       tzp->tbp[i].binding_limits,
+			       tzp->tbp[i].weight);
 		}
 	}
 exit:
@@ -368,7 +431,7 @@
 	tz->ops->get_trip_temp(tz, trip, &trip_temp);
 
 	/* If we have not crossed the trip_temp, we do not care. */
-	if (tz->temperature < trip_temp)
+	if (trip_temp <= 0 || tz->temperature < trip_temp)
 		return;
 
 	trace_thermal_zone_trip(tz, trip, trip_type);
@@ -711,7 +774,8 @@
 				thermal_zone_bind_cooling_device(tz,
 						THERMAL_TRIPS_NONE, cdev,
 						THERMAL_NO_LIMIT,
-						THERMAL_NO_LIMIT);
+						THERMAL_NO_LIMIT,
+						THERMAL_WEIGHT_DEFAULT);
 		}
 		mutex_unlock(&thermal_list_lock);
 		if (!tz->passive_delay)
@@ -757,15 +821,18 @@
 	snprintf(name, sizeof(name), "%s", buf);
 
 	mutex_lock(&thermal_governor_lock);
+	mutex_lock(&tz->lock);
 
 	gov = __find_governor(strim(name));
 	if (!gov)
 		goto exit;
 
-	tz->governor = gov;
-	ret = count;
+	ret = thermal_set_governor(tz, gov);
+	if (!ret)
+		ret = count;
 
 exit:
+	mutex_unlock(&tz->lock);
 	mutex_unlock(&thermal_governor_lock);
 	return ret;
 }
@@ -806,6 +873,158 @@
 static DEVICE_ATTR(emul_temp, S_IWUSR, NULL, emul_temp_store);
 #endif/*CONFIG_THERMAL_EMULATION*/
 
+static ssize_t
+sustainable_power_show(struct device *dev, struct device_attribute *devattr,
+		       char *buf)
+{
+	struct thermal_zone_device *tz = to_thermal_zone(dev);
+
+	if (tz->tzp)
+		return sprintf(buf, "%u\n", tz->tzp->sustainable_power);
+	else
+		return -EIO;
+}
+
+static ssize_t
+sustainable_power_store(struct device *dev, struct device_attribute *devattr,
+			const char *buf, size_t count)
+{
+	struct thermal_zone_device *tz = to_thermal_zone(dev);
+	u32 sustainable_power;
+
+	if (!tz->tzp)
+		return -EIO;
+
+	if (kstrtou32(buf, 10, &sustainable_power))
+		return -EINVAL;
+
+	tz->tzp->sustainable_power = sustainable_power;
+
+	return count;
+}
+static DEVICE_ATTR(sustainable_power, S_IWUSR | S_IRUGO, sustainable_power_show,
+		sustainable_power_store);
+
+#define create_s32_tzp_attr(name)					\
+	static ssize_t							\
+	name##_show(struct device *dev, struct device_attribute *devattr, \
+		char *buf)						\
+	{								\
+	struct thermal_zone_device *tz = to_thermal_zone(dev);		\
+									\
+	if (tz->tzp)							\
+		return sprintf(buf, "%u\n", tz->tzp->name);		\
+	else								\
+		return -EIO;						\
+	}								\
+									\
+	static ssize_t							\
+	name##_store(struct device *dev, struct device_attribute *devattr, \
+		const char *buf, size_t count)				\
+	{								\
+		struct thermal_zone_device *tz = to_thermal_zone(dev);	\
+		s32 value;						\
+									\
+		if (!tz->tzp)						\
+			return -EIO;					\
+									\
+		if (kstrtos32(buf, 10, &value))				\
+			return -EINVAL;					\
+									\
+		tz->tzp->name = value;					\
+									\
+		return count;						\
+	}								\
+	static DEVICE_ATTR(name, S_IWUSR | S_IRUGO, name##_show, name##_store)
+
+create_s32_tzp_attr(k_po);
+create_s32_tzp_attr(k_pu);
+create_s32_tzp_attr(k_i);
+create_s32_tzp_attr(k_d);
+create_s32_tzp_attr(integral_cutoff);
+create_s32_tzp_attr(slope);
+create_s32_tzp_attr(offset);
+#undef create_s32_tzp_attr
+
+static struct device_attribute *dev_tzp_attrs[] = {
+	&dev_attr_sustainable_power,
+	&dev_attr_k_po,
+	&dev_attr_k_pu,
+	&dev_attr_k_i,
+	&dev_attr_k_d,
+	&dev_attr_integral_cutoff,
+	&dev_attr_slope,
+	&dev_attr_offset,
+};
+
+static int create_tzp_attrs(struct device *dev)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(dev_tzp_attrs); i++) {
+		int ret;
+		struct device_attribute *dev_attr = dev_tzp_attrs[i];
+
+		ret = device_create_file(dev, dev_attr);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * power_actor_get_max_power() - get the maximum power that a cdev can consume
+ * @cdev:	pointer to &thermal_cooling_device
+ * @tz:		a valid thermal zone device pointer
+ * @max_power:	pointer in which to store the maximum power
+ *
+ * Calculate the maximum power consumption in milliwatts that the
+ * cooling device can currently consume and store it in @max_power.
+ *
+ * Return: 0 on success, -EINVAL if @cdev doesn't support the
+ * power_actor API or -E* on other error.
+ */
+int power_actor_get_max_power(struct thermal_cooling_device *cdev,
+			      struct thermal_zone_device *tz, u32 *max_power)
+{
+	if (!cdev_is_power_actor(cdev))
+		return -EINVAL;
+
+	return cdev->ops->state2power(cdev, tz, 0, max_power);
+}
+
+/**
+ * power_actor_set_power() - limit the maximum power that a cooling device can consume
+ * @cdev:	pointer to &thermal_cooling_device
+ * @instance:	thermal instance to update
+ * @power:	the power in milliwatts
+ *
+ * Set the cooling device to consume at most @power milliwatts.
+ *
+ * Return: 0 on success, -EINVAL if the cooling device does not
+ * implement the power actor API or -E* for other failures.
+ */
+int power_actor_set_power(struct thermal_cooling_device *cdev,
+			  struct thermal_instance *instance, u32 power)
+{
+	unsigned long state;
+	int ret;
+
+	if (!cdev_is_power_actor(cdev))
+		return -EINVAL;
+
+	ret = cdev->ops->power2state(cdev, instance->tz, power, &state);
+	if (ret)
+		return ret;
+
+	instance->target = state;
+	cdev->updated = false;
+	thermal_cdev_update(cdev);
+
+	return 0;
+}
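A minimal sketch of how a governor might combine the two helpers above for a single power-actor instance; the capping policy shown is an example, not something the patch prescribes.

static int example_cap_actor(struct thermal_zone_device *tz,
			     struct thermal_instance *instance, u32 budget)
{
	u32 max_power;
	int ret;

	ret = power_actor_get_max_power(instance->cdev, tz, &max_power);
	if (ret)
		return ret;

	return power_actor_set_power(instance->cdev, instance,
				     min(budget, max_power));
}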
+
 static DEVICE_ATTR(type, 0444, type_show, NULL);
 static DEVICE_ATTR(temp, 0444, temp_show, NULL);
 static DEVICE_ATTR(mode, 0644, mode_show, mode_store);
@@ -897,6 +1116,50 @@
 		return sprintf(buf, "%d\n", instance->trip);
 }
 
+static struct attribute *cooling_device_attrs[] = {
+	&dev_attr_cdev_type.attr,
+	&dev_attr_max_state.attr,
+	&dev_attr_cur_state.attr,
+	NULL,
+};
+
+static const struct attribute_group cooling_device_attr_group = {
+	.attrs = cooling_device_attrs,
+};
+
+static const struct attribute_group *cooling_device_attr_groups[] = {
+	&cooling_device_attr_group,
+	NULL,
+};
+
+static ssize_t
+thermal_cooling_device_weight_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct thermal_instance *instance;
+
+	instance = container_of(attr, struct thermal_instance, weight_attr);
+
+	return sprintf(buf, "%d\n", instance->weight);
+}
+
+static ssize_t
+thermal_cooling_device_weight_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	struct thermal_instance *instance;
+	int ret, weight;
+
+	ret = kstrtoint(buf, 0, &weight);
+	if (ret)
+		return ret;
+
+	instance = container_of(attr, struct thermal_instance, weight_attr);
+	instance->weight = weight;
+
+	return count;
+}
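From userspace the new per-binding weight appears next to the existing cdevN files of the zone; a small sketch of updating it, where the sysfs path and the value 300 are examples only:

#include <stdio.h>

int main(void)
{
	/* hypothetical zone/binding; adjust the path to the actual system */
	FILE *f = fopen("/sys/class/thermal/thermal_zone0/cdev0_weight", "w");

	if (!f)
		return 1;
	fprintf(f, "%d\n", 300);
	fclose(f);
	return 0;
}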
 /* Device management */
 
 /**
@@ -911,6 +1174,9 @@
  * @lower:	the Minimum cooling state can be used for this trip point.
  *		THERMAL_NO_LIMIT means no lower limit,
  *		and the cooling device can be in cooling state 0.
+ * @weight:	The weight of the cooling device to be bound to the
+ *		thermal zone. Use THERMAL_WEIGHT_DEFAULT for the
+ *		default value.
  *
  * This interface function bind a thermal cooling device to the certain trip
  * point of a thermal zone device.
@@ -921,7 +1187,8 @@
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
 				     int trip,
 				     struct thermal_cooling_device *cdev,
-				     unsigned long upper, unsigned long lower)
+				     unsigned long upper, unsigned long lower,
+				     unsigned int weight)
 {
 	struct thermal_instance *dev;
 	struct thermal_instance *pos;
@@ -964,6 +1231,7 @@
 	dev->upper = upper;
 	dev->lower = lower;
 	dev->target = THERMAL_NO_TARGET;
+	dev->weight = weight;
 
 	result = get_idr(&tz->idr, &tz->lock, &dev->id);
 	if (result)
@@ -984,6 +1252,16 @@
 	if (result)
 		goto remove_symbol_link;
 
+	sprintf(dev->weight_attr_name, "cdev%d_weight", dev->id);
+	sysfs_attr_init(&dev->weight_attr.attr);
+	dev->weight_attr.attr.name = dev->weight_attr_name;
+	dev->weight_attr.attr.mode = S_IWUSR | S_IRUGO;
+	dev->weight_attr.show = thermal_cooling_device_weight_show;
+	dev->weight_attr.store = thermal_cooling_device_weight_store;
+	result = device_create_file(&tz->device, &dev->weight_attr);
+	if (result)
+		goto remove_trip_file;
+
 	mutex_lock(&tz->lock);
 	mutex_lock(&cdev->lock);
 	list_for_each_entry(pos, &tz->thermal_instances, tz_node)
@@ -1001,6 +1279,8 @@
 	if (!result)
 		return 0;
 
+	device_remove_file(&tz->device, &dev->weight_attr);
+remove_trip_file:
 	device_remove_file(&tz->device, &dev->attr);
 remove_symbol_link:
 	sysfs_remove_link(&tz->device.kobj, dev->name);
@@ -1126,6 +1406,7 @@
 	cdev->ops = ops;
 	cdev->updated = false;
 	cdev->device.class = &thermal_class;
+	cdev->device.groups = cooling_device_attr_groups;
 	cdev->devdata = devdata;
 	dev_set_name(&cdev->device, "cooling_device%d", cdev->id);
 	result = device_register(&cdev->device);
@@ -1135,21 +1416,6 @@
 		return ERR_PTR(result);
 	}
 
-	/* sys I/F */
-	if (type) {
-		result = device_create_file(&cdev->device, &dev_attr_cdev_type);
-		if (result)
-			goto unregister;
-	}
-
-	result = device_create_file(&cdev->device, &dev_attr_max_state);
-	if (result)
-		goto unregister;
-
-	result = device_create_file(&cdev->device, &dev_attr_cur_state);
-	if (result)
-		goto unregister;
-
 	/* Add 'this' new cdev to the global cdev list */
 	mutex_lock(&thermal_list_lock);
 	list_add(&cdev->node, &thermal_cdev_list);
@@ -1159,11 +1425,6 @@
 	bind_cdev(cdev);
 
 	return cdev;
-
-unregister:
-	release_idr(&thermal_cdev_idr, &thermal_idr_lock, cdev->id);
-	device_unregister(&cdev->device);
-	return ERR_PTR(result);
 }
 
 /**
@@ -1374,7 +1635,8 @@
 						tz->trip_temp_attrs[indx].name;
 		tz->trip_temp_attrs[indx].attr.attr.mode = S_IRUGO;
 		tz->trip_temp_attrs[indx].attr.show = trip_point_temp_show;
-		if (mask & (1 << indx)) {
+		if (IS_ENABLED(CONFIG_THERMAL_WRITABLE_TRIPS) &&
+		    mask & (1 << indx)) {
 			tz->trip_temp_attrs[indx].attr.attr.mode |= S_IWUSR;
 			tz->trip_temp_attrs[indx].attr.store =
 							trip_point_temp_store;
@@ -1451,7 +1713,7 @@
 struct thermal_zone_device *thermal_zone_device_register(const char *type,
 	int trips, int mask, void *devdata,
 	struct thermal_zone_device_ops *ops,
-	const struct thermal_zone_params *tzp,
+	struct thermal_zone_params *tzp,
 	int passive_delay, int polling_delay)
 {
 	struct thermal_zone_device *tz;
@@ -1459,6 +1721,7 @@
 	int result;
 	int count;
 	int passive = 0;
+	struct thermal_governor *governor;
 
 	if (type && strlen(type) >= THERMAL_NAME_LENGTH)
 		return ERR_PTR(-EINVAL);
@@ -1545,13 +1808,24 @@
 	if (result)
 		goto unregister;
 
+	/* Add thermal zone params */
+	result = create_tzp_attrs(&tz->device);
+	if (result)
+		goto unregister;
+
 	/* Update 'this' zone's governor information */
 	mutex_lock(&thermal_governor_lock);
 
 	if (tz->tzp)
-		tz->governor = __find_governor(tz->tzp->governor_name);
+		governor = __find_governor(tz->tzp->governor_name);
 	else
-		tz->governor = def_governor;
+		governor = def_governor;
+
+	result = thermal_set_governor(tz, governor);
+	if (result) {
+		mutex_unlock(&thermal_governor_lock);
+		goto unregister;
+	}
 
 	mutex_unlock(&thermal_governor_lock);
 
@@ -1640,7 +1914,7 @@
 		device_remove_file(&tz->device, &dev_attr_mode);
 	device_remove_file(&tz->device, &dev_attr_policy);
 	remove_trip_attrs(tz);
-	tz->governor = NULL;
+	thermal_set_governor(tz, NULL);
 
 	thermal_remove_hwmon_sysfs(tz);
 	release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
@@ -1800,7 +2074,11 @@
 	if (result)
 		return result;
 
-	return thermal_gov_user_space_register();
+	result = thermal_gov_user_space_register();
+	if (result)
+		return result;
+
+	return thermal_gov_power_allocator_register();
 }
 
 static void thermal_unregister_governors(void)
@@ -1809,6 +2087,7 @@
 	thermal_gov_fair_share_unregister();
 	thermal_gov_bang_bang_unregister();
 	thermal_gov_user_space_unregister();
+	thermal_gov_power_allocator_unregister();
 }
 
 static int __init thermal_init(void)
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index d15d243..8a66244 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -46,8 +46,11 @@
 	unsigned long target;	/* expected cooling state */
 	char attr_name[THERMAL_NAME_LENGTH];
 	struct device_attribute attr;
+	char weight_attr_name[THERMAL_NAME_LENGTH];
+	struct device_attribute weight_attr;
 	struct list_head tz_node; /* node in tz->thermal_instances */
 	struct list_head cdev_node; /* node in cdev->thermal_instances */
+	unsigned int weight; /* The weight of the cooling device */
 };
 
 int thermal_register_governor(struct thermal_governor *);
@@ -85,13 +88,39 @@
 static inline void thermal_gov_user_space_unregister(void) {}
 #endif /* CONFIG_THERMAL_GOV_USER_SPACE */
 
+#ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
+int thermal_gov_power_allocator_register(void);
+void thermal_gov_power_allocator_unregister(void);
+#else
+static inline int thermal_gov_power_allocator_register(void) { return 0; }
+static inline void thermal_gov_power_allocator_unregister(void) {}
+#endif /* CONFIG_THERMAL_GOV_POWER_ALLOCATOR */
+
 /* device tree support */
 #ifdef CONFIG_THERMAL_OF
 int of_parse_thermal_zones(void);
 void of_thermal_destroy_zones(void);
+int of_thermal_get_ntrips(struct thermal_zone_device *);
+bool of_thermal_is_trip_valid(struct thermal_zone_device *, int);
+const struct thermal_trip *
+of_thermal_get_trip_points(struct thermal_zone_device *);
 #else
 static inline int of_parse_thermal_zones(void) { return 0; }
 static inline void of_thermal_destroy_zones(void) { }
+static inline int of_thermal_get_ntrips(struct thermal_zone_device *tz)
+{
+	return 0;
+}
+static inline bool of_thermal_is_trip_valid(struct thermal_zone_device *tz,
+					    int trip)
+{
+	return 0;
+}
+static inline const struct thermal_trip *
+of_thermal_get_trip_points(struct thermal_zone_device *tz)
+{
+	return NULL;
+}
 #endif
 
 #endif /* __THERMAL_CORE_H__ */
diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
index 9eec26d..68f53fc 100644
--- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
+++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
@@ -28,7 +28,6 @@
 #include <linux/kernel.h>
 #include <linux/workqueue.h>
 #include <linux/thermal.h>
-#include <linux/cpufreq.h>
 #include <linux/cpumask.h>
 #include <linux/cpu_cooling.h>
 #include <linux/of.h>
@@ -147,7 +146,8 @@
 	return thermal_zone_bind_cooling_device(thermal, 0, cdev,
 	/* bind with min and max states defined by cpu_cooling */
 						THERMAL_NO_LIMIT,
-						THERMAL_NO_LIMIT);
+						THERMAL_NO_LIMIT,
+						THERMAL_WEIGHT_DEFAULT);
 }
 
 /* Unbind callback functions for thermal zone */
@@ -286,6 +286,11 @@
 	return ti_thermal_get_trip_temp(thermal, OMAP_TRIP_NUMBER - 1, temp);
 }
 
+static const struct thermal_zone_of_device_ops ti_of_thermal_ops = {
+	.get_temp = __ti_thermal_get_temp,
+	.get_trend = __ti_thermal_get_trend,
+};
+
 static struct thermal_zone_device_ops ti_thermal_ops = {
 	.get_temp = ti_thermal_get_temp,
 	.get_trend = ti_thermal_get_trend,
@@ -333,8 +338,7 @@
 
 	/* in case this is specified by DT */
 	data->ti_thermal = thermal_zone_of_sensor_register(bgp->dev, id,
-					data, __ti_thermal_get_temp,
-					__ti_thermal_get_trend);
+					data, &ti_of_thermal_ops);
 	if (IS_ERR(data->ti_thermal)) {
 		/* Create thermal zone */
 		data->ti_thermal = thermal_zone_device_register(domain,
@@ -403,11 +407,6 @@
 	if (!data)
 		return -EINVAL;
 
-	if (!cpufreq_get_current_driver()) {
-		dev_dbg(bgp->dev, "no cpufreq driver yet\n");
-		return -EPROBE_DEFER;
-	}
-
 	/* Register cooling device */
 	data->cool_dev = cpufreq_cooling_register(cpu_present_mask);
 	if (IS_ERR(data->cool_dev)) {
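
The ti-soc-thermal bind callback above now passes THERMAL_WEIGHT_DEFAULT as the new sixth argument to thermal_zone_bind_cooling_device(). A minimal sketch of a bind callback that asks for a non-default relative weight instead; the callback name and the weight value 2 are illustrative assumptions, not part of this series:

static int example_tz_bind(struct thermal_zone_device *tz,
			   struct thermal_cooling_device *cdev)
{
	/*
	 * Bind cdev to trip 0 with a relative weight of 2, so governors
	 * that honour weights treat it as twice as effective as a
	 * weight-1 device bound to the same trip.
	 */
	return thermal_zone_bind_cooling_device(tz, 0, cdev,
						THERMAL_NO_LIMIT,
						THERMAL_NO_LIMIT,
						2);
}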
diff --git a/include/dt-bindings/thermal/thermal.h b/include/dt-bindings/thermal/thermal.h
index 59822a9..b5e6b00 100644
--- a/include/dt-bindings/thermal/thermal.h
+++ b/include/dt-bindings/thermal/thermal.h
@@ -11,7 +11,7 @@
 #define _DT_BINDINGS_THERMAL_THERMAL_H
 
 /* On cooling devices upper and lower limits */
-#define THERMAL_NO_LIMIT		(-1UL)
+#define THERMAL_NO_LIMIT		(~0)
 
 #endif
 
diff --git a/include/linux/clock_cooling.h b/include/linux/clock_cooling.h
new file mode 100644
index 0000000..4d1019d
--- /dev/null
+++ b/include/linux/clock_cooling.h
@@ -0,0 +1,65 @@
+/*
+ *  linux/include/linux/clock_cooling.h
+ *
+ *  Copyright (C) 2014 Eduardo Valentin <edubezval@gmail.com>
+ *
+ *  Copyright (C) 2013	Texas Instruments Inc.
+ *  Contact:  Eduardo Valentin <eduardo.valentin@ti.com>
+ *
+ *  Highly based on cpu_cooling.c.
+ *  Copyright (C) 2012	Samsung Electronics Co., Ltd(http://www.samsung.com)
+ *  Copyright (C) 2012  Amit Daniel <amit.kachhap@linaro.org>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ */
+
+#ifndef __CPU_COOLING_H__
+#define __CPU_COOLING_H__
+
+#include <linux/of.h>
+#include <linux/thermal.h>
+#include <linux/cpumask.h>
+
+#ifdef CONFIG_CLOCK_THERMAL
+/**
+ * clock_cooling_register - function to create clock cooling device.
+ * @dev: struct device pointer to the device used as clock cooling device.
+ * @clock_name: string containing the clock used as cooling mechanism.
+ */
+struct thermal_cooling_device *
+clock_cooling_register(struct device *dev, const char *clock_name);
+
+/**
+ * clock_cooling_unregister - function to remove clock cooling device.
+ * @cdev: thermal cooling device pointer.
+ */
+void clock_cooling_unregister(struct thermal_cooling_device *cdev);
+
+unsigned long clock_cooling_get_level(struct thermal_cooling_device *cdev,
+				      unsigned long freq);
+#else /* !CONFIG_CLOCK_THERMAL */
+static inline struct thermal_cooling_device *
+clock_cooling_register(struct device *dev, const char *clock_name)
+{
+	return NULL;
+}
+static inline
+void clock_cooling_unregister(struct thermal_cooling_device *cdev)
+{
+}
+static inline
+unsigned long clock_cooling_get_level(struct thermal_cooling_device *cdev,
+				      unsigned long freq)
+{
+	return THERMAL_CSTATE_INVALID;
+}
+#endif	/* CONFIG_CLOCK_THERMAL */
+
+#endif /* __CPU_COOLING_H__ */
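
clock_cooling_register()/clock_cooling_unregister() mirror the cpufreq cooling API for a generic clock. A minimal registration sketch from a hypothetical platform driver probe; the clock name "cpu_clk" and the driver functions are assumptions for illustration only:

#include <linux/clock_cooling.h>
#include <linux/platform_device.h>

static struct thermal_cooling_device *example_clk_cdev;

static int example_probe(struct platform_device *pdev)
{
	/* Throttle by scaling the (assumed) "cpu_clk" clock of this device. */
	example_clk_cdev = clock_cooling_register(&pdev->dev, "cpu_clk");
	if (IS_ERR(example_clk_cdev))
		return PTR_ERR(example_clk_cdev);
	return 0;
}

static int example_remove(struct platform_device *pdev)
{
	clock_cooling_unregister(example_clk_cdev);
	return 0;
}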
diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h
index c303d38..8ac0331 100644
--- a/include/linux/cpu_cooling.h
+++ b/include/linux/cpu_cooling.h
@@ -28,6 +28,9 @@
 #include <linux/thermal.h>
 #include <linux/cpumask.h>
 
+typedef int (*get_static_t)(cpumask_t *cpumask, int interval,
+			    unsigned long voltage, u32 *power);
+
 #ifdef CONFIG_CPU_THERMAL
 /**
  * cpufreq_cooling_register - function to create cpufreq cooling device.
@@ -36,6 +39,10 @@
 struct thermal_cooling_device *
 cpufreq_cooling_register(const struct cpumask *clip_cpus);
 
+struct thermal_cooling_device *
+cpufreq_power_cooling_register(const struct cpumask *clip_cpus,
+			       u32 capacitance, get_static_t plat_static_func);
+
 /**
  * of_cpufreq_cooling_register - create cpufreq cooling device based on DT.
  * @np: a valid struct device_node to the cooling device device tree node.
@@ -45,6 +52,12 @@
 struct thermal_cooling_device *
 of_cpufreq_cooling_register(struct device_node *np,
 			    const struct cpumask *clip_cpus);
+
+struct thermal_cooling_device *
+of_cpufreq_power_cooling_register(struct device_node *np,
+				  const struct cpumask *clip_cpus,
+				  u32 capacitance,
+				  get_static_t plat_static_func);
 #else
 static inline struct thermal_cooling_device *
 of_cpufreq_cooling_register(struct device_node *np,
@@ -52,6 +65,15 @@
 {
 	return NULL;
 }
+
+static inline struct thermal_cooling_device *
+of_cpufreq_power_cooling_register(struct device_node *np,
+				  const struct cpumask *clip_cpus,
+				  u32 capacitance,
+				  get_static_t plat_static_func)
+{
+	return NULL;
+}
 #endif
 
 /**
@@ -68,11 +90,28 @@
 	return NULL;
 }
 static inline struct thermal_cooling_device *
+cpufreq_power_cooling_register(const struct cpumask *clip_cpus,
+			       u32 capacitance, get_static_t plat_static_func)
+{
+	return NULL;
+}
+
+static inline struct thermal_cooling_device *
 of_cpufreq_cooling_register(struct device_node *np,
 			    const struct cpumask *clip_cpus)
 {
 	return NULL;
 }
+
+static inline struct thermal_cooling_device *
+of_cpufreq_power_cooling_register(struct device_node *np,
+				  const struct cpumask *clip_cpus,
+				  u32 capacitance,
+				  get_static_t plat_static_func)
+{
+	return NULL;
+}
+
 static inline
 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 {
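
cpufreq_power_cooling_register() and its OF variant extend the plain cpufreq cooling device with a power model: a dynamic-power capacitance coefficient plus an optional get_static_t callback for static (leakage) power. A rough sketch under assumed values; the coefficient 100 and the callback body are placeholders, and platforms without a static-power model are expected to pass NULL:

/* Hypothetical static-power model: report no leakage regardless of voltage. */
static int example_get_static_power(cpumask_t *cpumask, int interval,
				    unsigned long voltage, u32 *power)
{
	*power = 0;
	return 0;
}

static struct thermal_cooling_device *example_register_cpu_cooling(void)
{
	/* 100 is an illustrative capacitance (dynamic power coefficient). */
	return cpufreq_power_cooling_register(cpu_present_mask, 100,
					      example_get_static_power);
}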
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 004fac2..708de89 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -44,6 +44,10 @@
 const char *ftrace_print_hex_seq(struct trace_seq *p,
 				 const unsigned char *buf, int len);
 
+const char *ftrace_print_array_seq(struct trace_seq *p,
+				   const void *buf, int buf_len,
+				   size_t el_size);
+
 struct trace_iterator;
 struct trace_event;
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 8df5e32..0844663 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -103,6 +103,18 @@
 		(((__x) - ((__d) / 2)) / (__d));	\
 }							\
 )
+/*
+ * Same as above but for u64 dividends. divisor must be a 32-bit
+ * number.
+ */
+#define DIV_ROUND_CLOSEST_ULL(x, divisor)(		\
+{							\
+	typeof(divisor) __d = divisor;			\
+	unsigned long long _tmp = (x) + (__d) / 2;	\
+	do_div(_tmp, __d);				\
+	_tmp;						\
+}							\
+)
 
 /*
  * Multiplies an integer by a fraction, while avoiding unnecessary
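
DIV_ROUND_CLOSEST_ULL() added above divides a 64-bit value and rounds to the nearest integer instead of truncating, using do_div() so it is safe on 32-bit architectures. A quick illustration with made-up values:

	u64 a = DIV_ROUND_CLOSEST_ULL(7ULL, 2);	/* 4: (7 + 1) / 2 */
	u64 b = DIV_ROUND_CLOSEST_ULL(9ULL, 4);	/* 2: (9 + 2) / 4 */
	u64 c = 7ULL / 2;			/* 3: plain division truncates */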
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index ef90838..037e9df 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -29,16 +29,19 @@
 #include <linux/idr.h>
 #include <linux/device.h>
 #include <linux/workqueue.h>
+#include <uapi/linux/thermal.h>
 
 #define THERMAL_TRIPS_NONE	-1
 #define THERMAL_MAX_TRIPS	12
-#define THERMAL_NAME_LENGTH	20
 
 /* invalid cooling state */
 #define THERMAL_CSTATE_INVALID -1UL
 
 /* No upper/lower limit requirement */
-#define THERMAL_NO_LIMIT	THERMAL_CSTATE_INVALID
+#define THERMAL_NO_LIMIT	((u32)~0)
+
+/* Default weight of a bound cooling device */
+#define THERMAL_WEIGHT_DEFAULT 0
 
 /* Unit conversion macros */
 #define KELVIN_TO_CELSIUS(t)	(long)(((long)t-2732 >= 0) ?	\
@@ -49,11 +52,6 @@
 #define MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, off) (((t) / 100) + (off))
 #define MILLICELSIUS_TO_DECI_KELVIN(t) MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, 2732)
 
-/* Adding event notification support elements */
-#define THERMAL_GENL_FAMILY_NAME                "thermal_event"
-#define THERMAL_GENL_VERSION                    0x01
-#define THERMAL_GENL_MCAST_GROUP_NAME           "thermal_mc_grp"
-
 /* Default Thermal Governor */
 #if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE)
 #define DEFAULT_THERMAL_GOVERNOR       "step_wise"
@@ -61,10 +59,13 @@
 #define DEFAULT_THERMAL_GOVERNOR       "fair_share"
 #elif defined(CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE)
 #define DEFAULT_THERMAL_GOVERNOR       "user_space"
+#elif defined(CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR)
+#define DEFAULT_THERMAL_GOVERNOR       "power_allocator"
 #endif
 
 struct thermal_zone_device;
 struct thermal_cooling_device;
+struct thermal_instance;
 
 enum thermal_device_mode {
 	THERMAL_DEVICE_DISABLED = 0,
@@ -86,30 +87,6 @@
 	THERMAL_TREND_DROP_FULL, /* apply lowest cooling action */
 };
 
-/* Events supported by Thermal Netlink */
-enum events {
-	THERMAL_AUX0,
-	THERMAL_AUX1,
-	THERMAL_CRITICAL,
-	THERMAL_DEV_FAULT,
-};
-
-/* attributes of thermal_genl_family */
-enum {
-	THERMAL_GENL_ATTR_UNSPEC,
-	THERMAL_GENL_ATTR_EVENT,
-	__THERMAL_GENL_ATTR_MAX,
-};
-#define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1)
-
-/* commands supported by the thermal_genl_family */
-enum {
-	THERMAL_GENL_CMD_UNSPEC,
-	THERMAL_GENL_CMD_EVENT,
-	__THERMAL_GENL_CMD_MAX,
-};
-#define THERMAL_GENL_CMD_MAX (__THERMAL_GENL_CMD_MAX - 1)
-
 struct thermal_zone_device_ops {
 	int (*bind) (struct thermal_zone_device *,
 		     struct thermal_cooling_device *);
@@ -142,6 +119,12 @@
 	int (*get_max_state) (struct thermal_cooling_device *, unsigned long *);
 	int (*get_cur_state) (struct thermal_cooling_device *, unsigned long *);
 	int (*set_cur_state) (struct thermal_cooling_device *, unsigned long);
+	int (*get_requested_power)(struct thermal_cooling_device *,
+				   struct thermal_zone_device *, u32 *);
+	int (*state2power)(struct thermal_cooling_device *,
+			   struct thermal_zone_device *, unsigned long, u32 *);
+	int (*power2state)(struct thermal_cooling_device *,
+			   struct thermal_zone_device *, u32, unsigned long *);
 };
 
 struct thermal_cooling_device {
@@ -173,8 +156,7 @@
  * @devdata:	private pointer for device private data
  * @trips:	number of trip points the thermal zone supports
  * @passive_delay:	number of milliseconds to wait between polls when
- *			performing passive cooling.  Currenty only used by the
- *			step-wise governor
+ *			performing passive cooling.
  * @polling_delay:	number of milliseconds to wait between polls when
  *			checking whether trip points have been crossed (0 for
  *			interrupt driven systems)
@@ -184,13 +166,13 @@
  * @last_temperature:	previous temperature read
  * @emul_temperature:	emulated temperature when using CONFIG_THERMAL_EMULATION
  * @passive:		1 if you've crossed a passive trip point, 0 otherwise.
- *			Currenty only used by the step-wise governor.
  * @forced_passive:	If > 0, temperature at which to switch on all ACPI
  *			processor cooling devices.  Currently only used by the
  *			step-wise governor.
  * @ops:	operations this &thermal_zone_device supports
  * @tzp:	thermal zone parameters
  * @governor:	pointer to the governor for this thermal zone
+ * @governor_data:	private pointer for governor data
  * @thermal_instances:	list of &struct thermal_instance of this thermal zone
  * @idr:	&struct idr to generate unique id for this zone's cooling
  *		devices
@@ -215,8 +197,9 @@
 	int passive;
 	unsigned int forced_passive;
 	struct thermal_zone_device_ops *ops;
-	const struct thermal_zone_params *tzp;
+	struct thermal_zone_params *tzp;
 	struct thermal_governor *governor;
+	void *governor_data;
 	struct list_head thermal_instances;
 	struct idr idr;
 	struct mutex lock;
@@ -227,12 +210,19 @@
 /**
  * struct thermal_governor - structure that holds thermal governor information
  * @name:	name of the governor
+ * @bind_to_tz: callback called when binding to a thermal zone.  If it
+ *		returns 0, the governor is bound to the thermal zone,
+ *		otherwise it fails.
+ * @unbind_from_tz:	callback called when a governor is unbound from a
+ *			thermal zone.
  * @throttle:	callback called for every trip point even if temperature is
  *		below the trip point temperature
  * @governor_list:	node in thermal_governor_list (in thermal_core.c)
  */
 struct thermal_governor {
 	char name[THERMAL_NAME_LENGTH];
+	int (*bind_to_tz)(struct thermal_zone_device *tz);
+	void (*unbind_from_tz)(struct thermal_zone_device *tz);
 	int (*throttle)(struct thermal_zone_device *tz, int trip);
 	struct list_head	governor_list;
 };
@@ -243,9 +233,12 @@
 
 	/*
 	 * This is a measure of 'how effectively these devices can
-	 * cool 'this' thermal zone. The shall be determined by platform
-	 * characterization. This is on a 'percentage' scale.
-	 * See Documentation/thermal/sysfs-api.txt for more information.
+	 * cool 'this' thermal zone. It shall be determined by
+	 * platform characterization. This value is relative to the
+	 * rest of the weights so a cooling device whose weight is
+	 * double that of another cooling device is twice as
+	 * effective. See Documentation/thermal/sysfs-api.txt for more
+	 * information.
 	 */
 	int weight;
 
@@ -282,6 +275,44 @@
 
 	int num_tbps;	/* Number of tbp entries */
 	struct thermal_bind_params *tbp;
+
+	/*
+	 * Sustainable power (heat) that this thermal zone can dissipate in
+	 * mW
+	 */
+	u32 sustainable_power;
+
+	/*
+	 * Proportional parameter of the PID controller when
+	 * overshooting (i.e., when temperature is below the target)
+	 */
+	s32 k_po;
+
+	/*
+	 * Proportional parameter of the PID controller when
+	 * undershooting
+	 */
+	s32 k_pu;
+
+	/* Integral parameter of the PID controller */
+	s32 k_i;
+
+	/* Derivative parameter of the PID controller */
+	s32 k_d;
+
+	/* threshold below which the error is no longer accumulated */
+	s32 integral_cutoff;
+
+	/*
+	 * @slope:	slope of a linear temperature adjustment curve.
+	 * 		Used by thermal zone drivers.
+	 */
+	int slope;
+	/*
+	 * @offset:	offset of a linear temperature adjustment curve.
+	 * 		Used by thermal zone drivers (default 0).
+	 */
+	int offset;
 };
 
 struct thermal_genl_event {
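
The sustainable_power, k_po/k_pu, k_i, k_d and integral_cutoff fields added above are the tuning knobs of the power_allocator governor's PID controller. A rough sketch of how such a controller combines them; the real arithmetic, fixed-point scaling and clamping live in drivers/thermal/power_allocator.c and differ in detail:

static s64 example_pid_power_range(const struct thermal_zone_params *tzp,
				   long control_temp, long current_temp,
				   s64 err_integral, s32 prev_err)
{
	s32 err = control_temp - current_temp;	/* millicelsius */
	/* k_po applies when overshooting, k_pu when undershooting. */
	s64 p = (s64)(err < 0 ? tzp->k_po : tzp->k_pu) * err;
	s64 i = (s64)tzp->k_i * err_integral;	/* accumulation gated by integral_cutoff */
	s64 d = (s64)tzp->k_d * (err - prev_err);

	return p + i + d;	/* extra power budget to hand out to cooling devices */
}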
@@ -289,19 +320,49 @@
 	enum events event;
 };
 
+/**
+ * struct thermal_zone_of_device_ops - callbacks for handling DT based zones
+ *
+ * Mandatory:
+ * @get_temp: a pointer to a function that reads the sensor temperature.
+ *
+ * Optional:
+ * @get_trend: a pointer to a function that reads the sensor temperature trend.
+ * @set_emul_temp: a pointer to a function that sets sensor emulated
+ *		   temperature.
+ */
+struct thermal_zone_of_device_ops {
+	int (*get_temp)(void *, long *);
+	int (*get_trend)(void *, long *);
+	int (*set_emul_temp)(void *, unsigned long);
+};
+
+/**
+ * struct thermal_trip - representation of a point in temperature domain
+ * @np: pointer to struct device_node that this trip point was created from
+ * @temperature: temperature value in millicelsius
+ * @hysteresis: relative hysteresis in millicelsius
+ * @type: trip point type
+ */
+
+struct thermal_trip {
+	struct device_node *np;
+	unsigned long int temperature;
+	unsigned long int hysteresis;
+	enum thermal_trip_type type;
+};
+
 /* Function declarations */
 #ifdef CONFIG_THERMAL_OF
 struct thermal_zone_device *
-thermal_zone_of_sensor_register(struct device *dev, int id,
-				void *data, int (*get_temp)(void *, long *),
-				int (*get_trend)(void *, long *));
+thermal_zone_of_sensor_register(struct device *dev, int id, void *data,
+				const struct thermal_zone_of_device_ops *ops);
 void thermal_zone_of_sensor_unregister(struct device *dev,
 				       struct thermal_zone_device *tz);
 #else
 static inline struct thermal_zone_device *
-thermal_zone_of_sensor_register(struct device *dev, int id,
-				void *data, int (*get_temp)(void *, long *),
-				int (*get_trend)(void *, long *))
+thermal_zone_of_sensor_register(struct device *dev, int id, void *data,
+				const struct thermal_zone_of_device_ops *ops)
 {
 	return NULL;
 }
@@ -313,14 +374,27 @@
 }
 
 #endif
+
+#if IS_ENABLED(CONFIG_THERMAL)
+static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev)
+{
+	return cdev->ops->get_requested_power && cdev->ops->state2power &&
+		cdev->ops->power2state;
+}
+
+int power_actor_get_max_power(struct thermal_cooling_device *,
+			      struct thermal_zone_device *tz, u32 *max_power);
+int power_actor_set_power(struct thermal_cooling_device *,
+			  struct thermal_instance *, u32);
 struct thermal_zone_device *thermal_zone_device_register(const char *, int, int,
 		void *, struct thermal_zone_device_ops *,
-		const struct thermal_zone_params *, int, int);
+		struct thermal_zone_params *, int, int);
 void thermal_zone_device_unregister(struct thermal_zone_device *);
 
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int,
 				     struct thermal_cooling_device *,
-				     unsigned long, unsigned long);
+				     unsigned long, unsigned long,
+				     unsigned int);
 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int,
 				       struct thermal_cooling_device *);
 void thermal_zone_device_update(struct thermal_zone_device *);
@@ -339,8 +413,66 @@
 		struct thermal_cooling_device *, int);
 void thermal_cdev_update(struct thermal_cooling_device *);
 void thermal_notify_framework(struct thermal_zone_device *, int);
+#else
+static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev)
+{ return false; }
+static inline int power_actor_get_max_power(struct thermal_cooling_device *cdev,
+			      struct thermal_zone_device *tz, u32 *max_power)
+{ return 0; }
+static inline int power_actor_set_power(struct thermal_cooling_device *cdev,
+			  struct thermal_instance *tz, u32 power)
+{ return 0; }
+static inline struct thermal_zone_device *thermal_zone_device_register(
+	const char *type, int trips, int mask, void *devdata,
+	struct thermal_zone_device_ops *ops,
+	const struct thermal_zone_params *tzp,
+	int passive_delay, int polling_delay)
+{ return ERR_PTR(-ENODEV); }
+static inline void thermal_zone_device_unregister(
+	struct thermal_zone_device *tz)
+{ }
+static inline int thermal_zone_bind_cooling_device(
+	struct thermal_zone_device *tz, int trip,
+	struct thermal_cooling_device *cdev,
+	unsigned long upper, unsigned long lower, unsigned int weight)
+{ return -ENODEV; }
+static inline int thermal_zone_unbind_cooling_device(
+	struct thermal_zone_device *tz, int trip,
+	struct thermal_cooling_device *cdev)
+{ return -ENODEV; }
+static inline void thermal_zone_device_update(struct thermal_zone_device *tz)
+{ }
+static inline struct thermal_cooling_device *
+thermal_cooling_device_register(char *type, void *devdata,
+	const struct thermal_cooling_device_ops *ops)
+{ return ERR_PTR(-ENODEV); }
+static inline struct thermal_cooling_device *
+thermal_of_cooling_device_register(struct device_node *np,
+	char *type, void *devdata, const struct thermal_cooling_device_ops *ops)
+{ return ERR_PTR(-ENODEV); }
+static inline void thermal_cooling_device_unregister(
+	struct thermal_cooling_device *cdev)
+{ }
+static inline struct thermal_zone_device *thermal_zone_get_zone_by_name(
+		const char *name)
+{ return ERR_PTR(-ENODEV); }
+static inline int thermal_zone_get_temp(
+		struct thermal_zone_device *tz, unsigned long *temp)
+{ return -ENODEV; }
+static inline int get_tz_trend(struct thermal_zone_device *tz, int trip)
+{ return -ENODEV; }
+static inline struct thermal_instance *
+get_thermal_instance(struct thermal_zone_device *tz,
+	struct thermal_cooling_device *cdev, int trip)
+{ return ERR_PTR(-ENODEV); }
+static inline void thermal_cdev_update(struct thermal_cooling_device *cdev)
+{ }
+static inline void thermal_notify_framework(struct thermal_zone_device *tz,
+	int trip)
+{ }
+#endif /* CONFIG_THERMAL */
 
-#ifdef CONFIG_NET
+#if defined(CONFIG_NET) && IS_ENABLED(CONFIG_THERMAL)
 extern int thermal_generate_netlink_event(struct thermal_zone_device *tz,
 						enum events event);
 #else
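
When a cooling device implements the three new power ops, cdev_is_power_actor() returns true and the power_actor_*() helpers let a governor reason in milliwatts instead of cooling states. A minimal governor-side sketch; the power budget and the thermal_instance pointer (normally obtained via get_thermal_instance()) are assumptions for the example:

static void example_grant_power(struct thermal_zone_device *tz,
				struct thermal_cooling_device *cdev,
				struct thermal_instance *instance,
				u32 budget)
{
	u32 max_power;

	if (!cdev_is_power_actor(cdev))
		return;

	/* Maximum power the device could draw in its least-throttled state. */
	if (power_actor_get_max_power(cdev, tz, &max_power))
		return;

	/* Grant at most the budget; the helper maps power back to a state. */
	power_actor_set_power(cdev, instance, min(max_power, budget));
}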
diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h
index 0f4f95d..8b1f806 100644
--- a/include/trace/events/thermal.h
+++ b/include/trace/events/thermal.h
@@ -77,6 +77,64 @@
 		__entry->trip_type)
 );
 
+TRACE_EVENT(thermal_power_cpu_get_power,
+	TP_PROTO(const struct cpumask *cpus, unsigned long freq, u32 *load,
+		size_t load_len, u32 dynamic_power, u32 static_power),
+
+	TP_ARGS(cpus, freq, load, load_len, dynamic_power, static_power),
+
+	TP_STRUCT__entry(
+		__bitmask(cpumask, num_possible_cpus())
+		__field(unsigned long, freq          )
+		__dynamic_array(u32,   load, load_len)
+		__field(size_t,        load_len      )
+		__field(u32,           dynamic_power )
+		__field(u32,           static_power  )
+	),
+
+	TP_fast_assign(
+		__assign_bitmask(cpumask, cpumask_bits(cpus),
+				num_possible_cpus());
+		__entry->freq = freq;
+		memcpy(__get_dynamic_array(load), load,
+			load_len * sizeof(*load));
+		__entry->load_len = load_len;
+		__entry->dynamic_power = dynamic_power;
+		__entry->static_power = static_power;
+	),
+
+	TP_printk("cpus=%s freq=%lu load={%s} dynamic_power=%d static_power=%d",
+		__get_bitmask(cpumask), __entry->freq,
+		__print_array(__get_dynamic_array(load), __entry->load_len, 4),
+		__entry->dynamic_power, __entry->static_power)
+);
+
+TRACE_EVENT(thermal_power_cpu_limit,
+	TP_PROTO(const struct cpumask *cpus, unsigned int freq,
+		unsigned long cdev_state, u32 power),
+
+	TP_ARGS(cpus, freq, cdev_state, power),
+
+	TP_STRUCT__entry(
+		__bitmask(cpumask, num_possible_cpus())
+		__field(unsigned int,  freq      )
+		__field(unsigned long, cdev_state)
+		__field(u32,           power     )
+	),
+
+	TP_fast_assign(
+		__assign_bitmask(cpumask, cpumask_bits(cpus),
+				num_possible_cpus());
+		__entry->freq = freq;
+		__entry->cdev_state = cdev_state;
+		__entry->power = power;
+	),
+
+	TP_printk("cpus=%s freq=%u cdev_state=%lu power=%u",
+		__get_bitmask(cpumask), __entry->freq, __entry->cdev_state,
+		__entry->power)
+);
+
 #endif /* _TRACE_THERMAL_H */
 
 /* This part must be outside protection */
diff --git a/include/trace/events/thermal_power_allocator.h b/include/trace/events/thermal_power_allocator.h
new file mode 100644
index 0000000..12e1321
--- /dev/null
+++ b/include/trace/events/thermal_power_allocator.h
@@ -0,0 +1,87 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM thermal_power_allocator
+
+#if !defined(_TRACE_THERMAL_POWER_ALLOCATOR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_THERMAL_POWER_ALLOCATOR_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(thermal_power_allocator,
+	TP_PROTO(struct thermal_zone_device *tz, u32 *req_power,
+		 u32 total_req_power, u32 *granted_power,
+		 u32 total_granted_power, size_t num_actors,
+		 u32 power_range, u32 max_allocatable_power,
+		 unsigned long current_temp, s32 delta_temp),
+	TP_ARGS(tz, req_power, total_req_power, granted_power,
+		total_granted_power, num_actors, power_range,
+		max_allocatable_power, current_temp, delta_temp),
+	TP_STRUCT__entry(
+		__field(int,           tz_id          )
+		__dynamic_array(u32,   req_power, num_actors    )
+		__field(u32,           total_req_power          )
+		__dynamic_array(u32,   granted_power, num_actors)
+		__field(u32,           total_granted_power      )
+		__field(size_t,        num_actors               )
+		__field(u32,           power_range              )
+		__field(u32,           max_allocatable_power    )
+		__field(unsigned long, current_temp             )
+		__field(s32,           delta_temp               )
+	),
+	TP_fast_assign(
+		__entry->tz_id = tz->id;
+		memcpy(__get_dynamic_array(req_power), req_power,
+			num_actors * sizeof(*req_power));
+		__entry->total_req_power = total_req_power;
+		memcpy(__get_dynamic_array(granted_power), granted_power,
+			num_actors * sizeof(*granted_power));
+		__entry->total_granted_power = total_granted_power;
+		__entry->num_actors = num_actors;
+		__entry->power_range = power_range;
+		__entry->max_allocatable_power = max_allocatable_power;
+		__entry->current_temp = current_temp;
+		__entry->delta_temp = delta_temp;
+	),
+
+	TP_printk("thermal_zone_id=%d req_power={%s} total_req_power=%u granted_power={%s} total_granted_power=%u power_range=%u max_allocatable_power=%u current_temperature=%lu delta_temperature=%d",
+		__entry->tz_id,
+		__print_array(__get_dynamic_array(req_power),
+                              __entry->num_actors, 4),
+		__entry->total_req_power,
+		__print_array(__get_dynamic_array(granted_power),
+                              __entry->num_actors, 4),
+		__entry->total_granted_power, __entry->power_range,
+		__entry->max_allocatable_power, __entry->current_temp,
+		__entry->delta_temp)
+);
+
+TRACE_EVENT(thermal_power_allocator_pid,
+	TP_PROTO(struct thermal_zone_device *tz, s32 err, s32 err_integral,
+		 s64 p, s64 i, s64 d, s32 output),
+	TP_ARGS(tz, err, err_integral, p, i, d, output),
+	TP_STRUCT__entry(
+		__field(int, tz_id       )
+		__field(s32, err         )
+		__field(s32, err_integral)
+		__field(s64, p           )
+		__field(s64, i           )
+		__field(s64, d           )
+		__field(s32, output      )
+	),
+	TP_fast_assign(
+		__entry->tz_id = tz->id;
+		__entry->err = err;
+		__entry->err_integral = err_integral;
+		__entry->p = p;
+		__entry->i = i;
+		__entry->d = d;
+		__entry->output = output;
+	),
+
+	TP_printk("thermal_zone_id=%d err=%d err_integral=%d p=%lld i=%lld d=%lld output=%d",
+		  __entry->tz_id, __entry->err, __entry->err_integral,
+		  __entry->p, __entry->i, __entry->d, __entry->output)
+);
+#endif /* _TRACE_THERMAL_POWER_ALLOCATOR_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 26b4f2e..f5d9371 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -263,6 +263,14 @@
 #undef __print_hex
 #define __print_hex(buf, buf_len) ftrace_print_hex_seq(p, buf, buf_len)
 
+#undef __print_array
+#define __print_array(array, count, el_size)				\
+	({								\
+		BUILD_BUG_ON(el_size != 1 && el_size != 2 &&		\
+			     el_size != 4 && el_size != 8);		\
+		ftrace_print_array_seq(p, array, count, el_size);	\
+	})
+
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static notrace enum print_line_t					\
@@ -676,6 +684,7 @@
 #undef __get_dynamic_array_len
 #undef __get_str
 #undef __get_bitmask
+#undef __print_array
 
 #undef TP_printk
 #define TP_printk(fmt, args...) "\"" fmt "\", "  __stringify(args)
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 8523f9b..fe47395 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -383,6 +383,7 @@
 header-y += tcp_metrics.h
 header-y += telephony.h
 header-y += termios.h
+header-y += thermal.h
 header-y += time.h
 header-y += times.h
 header-y += timex.h
diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h
new file mode 100644
index 0000000..ac55358
--- /dev/null
+++ b/include/uapi/linux/thermal.h
@@ -0,0 +1,35 @@
+#ifndef _UAPI_LINUX_THERMAL_H
+#define _UAPI_LINUX_THERMAL_H
+
+#define THERMAL_NAME_LENGTH	20
+
+/* Adding event notification support elements */
+#define THERMAL_GENL_FAMILY_NAME                "thermal_event"
+#define THERMAL_GENL_VERSION                    0x01
+#define THERMAL_GENL_MCAST_GROUP_NAME           "thermal_mc_grp"
+
+/* Events supported by Thermal Netlink */
+enum events {
+	THERMAL_AUX0,
+	THERMAL_AUX1,
+	THERMAL_CRITICAL,
+	THERMAL_DEV_FAULT,
+};
+
+/* attributes of thermal_genl_family */
+enum {
+	THERMAL_GENL_ATTR_UNSPEC,
+	THERMAL_GENL_ATTR_EVENT,
+	__THERMAL_GENL_ATTR_MAX,
+};
+#define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1)
+
+/* commands supported by the thermal_genl_family */
+enum {
+	THERMAL_GENL_CMD_UNSPEC,
+	THERMAL_GENL_CMD_EVENT,
+	__THERMAL_GENL_CMD_MAX,
+};
+#define THERMAL_GENL_CMD_MAX (__THERMAL_GENL_CMD_MAX - 1)
+
+#endif /* _UAPI_LINUX_THERMAL_H */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d726844..9dcf91d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -186,6 +186,50 @@
 }
 EXPORT_SYMBOL(ftrace_print_hex_seq);
 
+const char *
+ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len,
+		       size_t el_size)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	const char *prefix = "";
+	void *ptr = (void *)buf;
+
+	trace_seq_putc(p, '{');
+
+	while (ptr < buf + buf_len) {
+		switch (el_size) {
+		case 1:
+			trace_seq_printf(p, "%s0x%x", prefix,
+					 *(u8 *)ptr);
+			break;
+		case 2:
+			trace_seq_printf(p, "%s0x%x", prefix,
+					 *(u16 *)ptr);
+			break;
+		case 4:
+			trace_seq_printf(p, "%s0x%x", prefix,
+					 *(u32 *)ptr);
+			break;
+		case 8:
+			trace_seq_printf(p, "%s0x%llx", prefix,
+					 *(u64 *)ptr);
+			break;
+		default:
+			trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size,
+					 *(u8 *)ptr);
+			el_size = 1;
+		}
+		prefix = ",";
+		ptr += el_size;
+	}
+
+	trace_seq_putc(p, '}');
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+EXPORT_SYMBOL(ftrace_print_array_seq);
+
 int ftrace_raw_output_prep(struct trace_iterator *iter,
 			   struct trace_event *trace_event)
 {
diff --git a/linaro/configs/debug.conf b/linaro/configs/debug.conf
deleted file mode 100644
index 3698056..0000000
--- a/linaro/configs/debug.conf
+++ /dev/null
@@ -1 +0,0 @@
-CONFIG_PROVE_LOCKING=y
diff --git a/linaro/configs/highbank.conf b/linaro/configs/highbank.conf
deleted file mode 100644
index bf0f3c1..0000000
--- a/linaro/configs/highbank.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-CONFIG_EXPERIMENTAL=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_ARCH_HIGHBANK=y
-CONFIG_ARM_ERRATA_754322=y
-CONFIG_SMP=y
-CONFIG_SCHED_MC=y
-CONFIG_AEABI=y
-CONFIG_CMDLINE="console=ttyAMA0"
-CONFIG_CPU_IDLE=y
-CONFIG_VFP=y
-CONFIG_NEON=y
-CONFIG_NET=y
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_ATA=y
-CONFIG_SATA_AHCI_PLATFORM=y
-CONFIG_SATA_HIGHBANK=y
-CONFIG_NETDEVICES=y
-CONFIG_NET_CALXEDA_XGMAC=y
-CONFIG_SERIAL_AMBA_PL011=y
-CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
-CONFIG_IPMI_HANDLER=y
-CONFIG_IPMI_SI=y
-CONFIG_I2C=y
-CONFIG_I2C_DESIGNWARE_PLATFORM=y
-CONFIG_SPI=y
-CONFIG_SPI_PL022=y
-CONFIG_GPIO_PL061=y
-CONFIG_MMC=y
-CONFIG_MMC_SDHCI=y
-CONFIG_MMC_SDHCI_PLTFM=y
-CONFIG_EDAC=y
-CONFIG_EDAC_MM_EDAC=y
-CONFIG_EDAC_HIGHBANK_MC=y
-CONFIG_EDAC_HIGHBANK_L2=y
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_PL031=y
-CONFIG_DMADEVICES=y
-CONFIG_PL330_DMA=y
diff --git a/linaro/configs/linaro-base.conf b/linaro/configs/linaro-base.conf
index c8a353f..0a1d7bc 100644
--- a/linaro/configs/linaro-base.conf
+++ b/linaro/configs/linaro-base.conf
@@ -121,3 +121,9 @@
 CONFIG_CORESIGHT=y
 CONFIG_CORESIGHT_LINKS_AND_SINKS=y
 CONFIG_CORESIGHT_SOURCE_ETM=y
+
+# enable ZRAM features
+CONFIG_ZPOOL=y
+CONFIG_ZSMALLOC=y
+CONFIG_ZRAM=y
+CONFIG_ZRAM_LZ4_COMPRESS=y
diff --git a/linaro/configs/ubuntu-minimal.conf b/linaro/configs/ubuntu-minimal.conf
deleted file mode 120000
index 794e82f..0000000
--- a/linaro/configs/ubuntu-minimal.conf
+++ /dev/null
@@ -1 +0,0 @@
-distribution.conf
\ No newline at end of file
diff --git a/linaro/configs/xen.conf b/linaro/configs/xen.conf
deleted file mode 100644
index d24fabb..0000000
--- a/linaro/configs/xen.conf
+++ /dev/null
@@ -1,7 +0,0 @@
-CONFIG_XEN=y
-CONFIG_XEN_NETDEV_FRONTEND=y
-CONFIG_XEN_NETDEV_BACKEND=y
-CONFIG_XEN_BLKDEV_FRONTEND=y
-CONFIG_XEN_BLKDEV_BACKEND=y
-CONFIG_XENFS=y
-CONFIG_XEN_COMPAT_XENFS=y