Merge branch 'x86/urgent'
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 27e67a9..1750fce 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -287,6 +287,8 @@
- semantics and behavior of local atomic operations.
lockdep-design.txt
- documentation on the runtime locking correctness validator.
+locking/
+ - directory with info about kernel locking primitives
lockstat.txt
- info on collecting statistics on locks (and contention).
lockup-watchdogs.txt
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
index e584ee1..7c9cc48 100644
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -1972,7 +1972,7 @@
<itemizedlist>
<listitem>
<para>
- <filename>Documentation/spinlocks.txt</filename>:
+ <filename>Documentation/locking/spinlocks.txt</filename>:
Linus Torvalds' spinlocking tutorial in the kernel sources.
</para>
</listitem>
diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index 0a523c9..482c749 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -794,6 +794,7 @@
<http://www.kroah.com/log/linux/maintainer-03.html>
<http://www.kroah.com/log/linux/maintainer-04.html>
<http://www.kroah.com/log/linux/maintainer-05.html>
+ <http://www.kroah.com/log/linux/maintainer-06.html>
NO!!!! No more huge patch bombs to linux-kernel@vger.kernel.org people!
<https://lkml.org/lkml/2005/7/11/336>
diff --git a/Documentation/devicetree/bindings/input/atmel,maxtouch.txt b/Documentation/devicetree/bindings/input/atmel,maxtouch.txt
index baef432..0ac23f2 100644
--- a/Documentation/devicetree/bindings/input/atmel,maxtouch.txt
+++ b/Documentation/devicetree/bindings/input/atmel,maxtouch.txt
@@ -15,6 +15,17 @@
keycode generated by each GPIO. Linux keycodes are defined in
<dt-bindings/input/input.h>.
+- linux,gpio-keymap: When enabled, the SPT_GPIOPWM_T19 object sends messages
+ on GPIO bit changes. An array of up to 8 entries can be provided
+ indicating the Linux keycode mapped to each bit of the status byte,
+ starting at the LSB. Linux keycodes are defined in
+ <dt-bindings/input/input.h>.
+
+ Note: the numbering of the GPIOs and the bit they start at varies between
+ maXTouch devices. You must either refer to the documentation, or
+ experiment to determine which bit corresponds to which input. Use
+ KEY_RESERVED for unused padding values.
+
Example:
touch@4b {
diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index 9b03c57..e45ac3f 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -39,6 +39,10 @@
further clocks may be specified in derived bindings.
- clock-names: One name for each entry in the clocks property, the
first one should be "stmmaceth".
+- clk_ptp_ref: this is the PTP reference clock; in case of the PTP is
+ available this clock is used for programming the Timestamp Addend Register.
+ If not passed then the system clock will be used and this is fine on some
+ platforms.
Examples:
diff --git a/Documentation/devicetree/bindings/regulator/tps65090.txt b/Documentation/devicetree/bindings/regulator/tps65090.txt
index 34098023..ca69f5e 100644
--- a/Documentation/devicetree/bindings/regulator/tps65090.txt
+++ b/Documentation/devicetree/bindings/regulator/tps65090.txt
@@ -45,8 +45,8 @@
infet5-supply = <&some_reg>;
infet6-supply = <&some_reg>;
infet7-supply = <&some_reg>;
- vsys_l1-supply = <&some_reg>;
- vsys_l2-supply = <&some_reg>;
+ vsys-l1-supply = <&some_reg>;
+ vsys-l2-supply = <&some_reg>;
regulators {
dcdc1 {
diff --git a/Documentation/devicetree/bindings/sound/adi,axi-spdif-tx.txt b/Documentation/devicetree/bindings/sound/adi,axi-spdif-tx.txt
index 46f3449..4eb7997 100644
--- a/Documentation/devicetree/bindings/sound/adi,axi-spdif-tx.txt
+++ b/Documentation/devicetree/bindings/sound/adi,axi-spdif-tx.txt
@@ -1,7 +1,7 @@
ADI AXI-SPDIF controller
Required properties:
- - compatible : Must be "adi,axi-spdif-1.00.a"
+ - compatible : Must be "adi,axi-spdif-tx-1.00.a"
- reg : Must contain SPDIF core's registers location and length
- clocks : Pairs of phandle and specifier referencing the controller's clocks.
The controller expects two clocks, the clock used for the AXI interface and
diff --git a/Documentation/filesystems/nfs/nfs-rdma.txt b/Documentation/filesystems/nfs/nfs-rdma.txt
index e386f7e..7240438 100644
--- a/Documentation/filesystems/nfs/nfs-rdma.txt
+++ b/Documentation/filesystems/nfs/nfs-rdma.txt
@@ -138,9 +138,9 @@
- Build, install, reboot
The NFS/RDMA code will be enabled automatically if NFS and RDMA
- are turned on. The NFS/RDMA client and server are configured via the hidden
- SUNRPC_XPRT_RDMA config option that depends on SUNRPC and INFINIBAND. The
- value of SUNRPC_XPRT_RDMA will be:
+ are turned on. The NFS/RDMA client and server are configured via the
+ SUNRPC_XPRT_RDMA_CLIENT and SUNRPC_XPRT_RDMA_SERVER config options that both
+ depend on SUNRPC and INFINIBAND. The default value of both options will be:
- N if either SUNRPC or INFINIBAND are N, in this case the NFS/RDMA client
and server will not be built
@@ -235,8 +235,9 @@
- Start the NFS server
- If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
- kernel config), load the RDMA transport module:
+ If the NFS/RDMA server was built as a module
+ (CONFIG_SUNRPC_XPRT_RDMA_SERVER=m in kernel config), load the RDMA
+ transport module:
$ modprobe svcrdma
@@ -255,8 +256,9 @@
- On the client system
- If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in
- kernel config), load the RDMA client module:
+ If the NFS/RDMA client was built as a module
+ (CONFIG_SUNRPC_XPRT_RDMA_CLIENT=m in kernel config), load the RDMA client
+ module:
$ modprobe xprtrdma.ko
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
index 1fe0ccb..8ea3e90 100644
--- a/Documentation/filesystems/seq_file.txt
+++ b/Documentation/filesystems/seq_file.txt
@@ -235,6 +235,39 @@
private field of the seq_file structure; that value can then be retrieved
by the iterator functions.
+There is also a wrapper function to seq_open() called seq_open_private(). It
+kmallocs a zero filled block of memory and stores a pointer to it in the
+private field of the seq_file structure, returning 0 on success. The
+block size is specified in a third parameter to the function, e.g.:
+
+ static int ct_open(struct inode *inode, struct file *file)
+ {
+ return seq_open_private(file, &ct_seq_ops,
+ sizeof(struct mystruct));
+ }
+
+There is also a variant function, __seq_open_private(), which is functionally
+identical except that, if successful, it returns the pointer to the allocated
+memory block, allowing further initialisation e.g.:
+
+ static int ct_open(struct inode *inode, struct file *file)
+ {
+ struct mystruct *p =
+ __seq_open_private(file, &ct_seq_ops, sizeof(*p));
+
+ if (!p)
+ return -ENOMEM;
+
+ p->foo = bar; /* initialize my stuff */
+ ...
+ p->baz = true;
+
+ return 0;
+ }
+
+A corresponding close function, seq_release_private() is available which
+frees the memory allocated in the corresponding open.
+
The other operations of interest - read(), llseek(), and release() - are
all implemented by the seq_file code itself. So a virtual file's
file_operations structure will look like:
diff --git a/Documentation/gpio/consumer.txt b/Documentation/gpio/consumer.txt
index 7654632..6ce5441 100644
--- a/Documentation/gpio/consumer.txt
+++ b/Documentation/gpio/consumer.txt
@@ -53,7 +53,20 @@
if and only if no GPIO has been assigned to the device/function/index triplet,
other error codes are used for cases where a GPIO has been assigned but an error
occurred while trying to acquire it. This is useful to discriminate between mere
-errors and an absence of GPIO for optional GPIO parameters.
+errors and an absence of GPIO for optional GPIO parameters. For the common
+pattern where a GPIO is optional, the gpiod_get_optional() and
+gpiod_get_index_optional() functions can be used. These functions return NULL
+instead of -ENOENT if no GPIO has been assigned to the requested function:
+
+
+ struct gpio_desc *gpiod_get_optional(struct device *dev,
+ const char *con_id,
+ enum gpiod_flags flags)
+
+ struct gpio_desc *gpiod_get_index_optional(struct device *dev,
+ const char *con_id,
+ unsigned int index,
+ enum gpiod_flags flags)
Device-managed variants of these functions are also defined:
@@ -65,6 +78,15 @@
unsigned int idx,
enum gpiod_flags flags)
+ struct gpio_desc *devm_gpiod_get_optional(struct device *dev,
+ const char *con_id,
+ enum gpiod_flags flags)
+
+ struct gpio_desc * devm_gpiod_get_index_optional(struct device *dev,
+ const char *con_id,
+ unsigned int index,
+ enum gpiod_flags flags)
+
A GPIO descriptor can be disposed of using the gpiod_put() function:
void gpiod_put(struct gpio_desc *desc)
diff --git a/Documentation/i2c/dev-interface b/Documentation/i2c/dev-interface
index 3e742ba..2ac78ae 100644
--- a/Documentation/i2c/dev-interface
+++ b/Documentation/i2c/dev-interface
@@ -57,12 +57,12 @@
I2C to communicate with your device. SMBus commands are preferred if
the device supports them. Both are illustrated below.
- __u8 register = 0x10; /* Device register to access */
+ __u8 reg = 0x10; /* Device register to access */
__s32 res;
char buf[10];
/* Using SMBus commands */
- res = i2c_smbus_read_word_data(file, register);
+ res = i2c_smbus_read_word_data(file, reg);
if (res < 0) {
/* ERROR HANDLING: i2c transaction failed */
} else {
@@ -70,11 +70,11 @@
}
/* Using I2C Write, equivalent of
- i2c_smbus_write_word_data(file, register, 0x6543) */
- buf[0] = register;
+ i2c_smbus_write_word_data(file, reg, 0x6543) */
+ buf[0] = reg;
buf[1] = 0x43;
buf[2] = 0x65;
- if (write(file, buf, 3) ! =3) {
+ if (write(file, buf, 3) != 3) {
/* ERROR HANDLING: i2c transaction failed */
}
diff --git a/Documentation/lockdep-design.txt b/Documentation/locking/lockdep-design.txt
similarity index 100%
rename from Documentation/lockdep-design.txt
rename to Documentation/locking/lockdep-design.txt
diff --git a/Documentation/lockstat.txt b/Documentation/locking/lockstat.txt
similarity index 98%
rename from Documentation/lockstat.txt
rename to Documentation/locking/lockstat.txt
index 72d0106..7428773 100644
--- a/Documentation/lockstat.txt
+++ b/Documentation/locking/lockstat.txt
@@ -12,7 +12,7 @@
- HOW
Lockdep already has hooks in the lock functions and maps lock instances to
-lock classes. We build on that (see Documentation/lockdep-design.txt).
+lock classes. We build on that (see Documentation/locking/lockdep-design.txt).
The graph below shows the relation between the lock functions and the various
hooks therein.
diff --git a/Documentation/mutex-design.txt b/Documentation/locking/mutex-design.txt
similarity index 96%
rename from Documentation/mutex-design.txt
rename to Documentation/locking/mutex-design.txt
index ee231ed..60c482d 100644
--- a/Documentation/mutex-design.txt
+++ b/Documentation/locking/mutex-design.txt
@@ -145,9 +145,9 @@
Unlike its original design and purpose, 'struct mutex' is larger than
most locks in the kernel. E.g: on x86-64 it is 40 bytes, almost twice
-as large as 'struct semaphore' (24 bytes) and 8 bytes shy of the
-'struct rw_semaphore' variant. Larger structure sizes mean more CPU
-cache and memory footprint.
+as large as 'struct semaphore' (24 bytes) and tied, along with rwsems,
+for the largest lock in the kernel. Larger structure sizes mean more
+CPU cache and memory footprint.
When to use mutexes
-------------------
diff --git a/Documentation/rt-mutex-design.txt b/Documentation/locking/rt-mutex-design.txt
similarity index 100%
rename from Documentation/rt-mutex-design.txt
rename to Documentation/locking/rt-mutex-design.txt
diff --git a/Documentation/rt-mutex.txt b/Documentation/locking/rt-mutex.txt
similarity index 100%
rename from Documentation/rt-mutex.txt
rename to Documentation/locking/rt-mutex.txt
diff --git a/Documentation/spinlocks.txt b/Documentation/locking/spinlocks.txt
similarity index 97%
rename from Documentation/spinlocks.txt
rename to Documentation/locking/spinlocks.txt
index 97eaf57..ff35e40 100644
--- a/Documentation/spinlocks.txt
+++ b/Documentation/locking/spinlocks.txt
@@ -105,9 +105,9 @@
spin_unlock(&lock);
(and the equivalent read-write versions too, of course). The spinlock will
-guarantee the same kind of exclusive access, and it will be much faster.
+guarantee the same kind of exclusive access, and it will be much faster.
This is useful if you know that the data in question is only ever
-manipulated from a "process context", ie no interrupts involved.
+manipulated from a "process context", ie no interrupts involved.
The reasons you mustn't use these versions if you have interrupts that
play with the spinlock is that you can get deadlocks:
@@ -122,21 +122,21 @@
interrupt happens on the same CPU that already holds the lock, because the
lock will obviously never be released (because the interrupt is waiting
for the lock, and the lock-holder is interrupted by the interrupt and will
-not continue until the interrupt has been processed).
+not continue until the interrupt has been processed).
(This is also the reason why the irq-versions of the spinlocks only need
to disable the _local_ interrupts - it's ok to use spinlocks in interrupts
on other CPU's, because an interrupt on another CPU doesn't interrupt the
CPU that holds the lock, so the lock-holder can continue and eventually
-releases the lock).
+releases the lock).
Note that you can be clever with read-write locks and interrupts. For
example, if you know that the interrupt only ever gets a read-lock, then
you can use a non-irq version of read locks everywhere - because they
-don't block on each other (and thus there is no dead-lock wrt interrupts.
-But when you do the write-lock, you have to use the irq-safe version.
+don't block on each other (and thus there is no dead-lock wrt interrupts.
+But when you do the write-lock, you have to use the irq-safe version.
-For an example of being clever with rw-locks, see the "waitqueue_lock"
+For an example of being clever with rw-locks, see the "waitqueue_lock"
handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from
within an interrupt, they only read the queue in order to know whom to
wake up. So read-locks are safe (which is good: they are very common
diff --git a/Documentation/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt
similarity index 100%
rename from Documentation/ww-mutex-design.txt
rename to Documentation/locking/ww-mutex-design.txt
diff --git a/Documentation/misc-devices/lis3lv02d b/Documentation/misc-devices/lis3lv02d
index af815b9..f89960a 100644
--- a/Documentation/misc-devices/lis3lv02d
+++ b/Documentation/misc-devices/lis3lv02d
@@ -59,7 +59,7 @@
from the device. It supports blocking operations, poll/select and
fasync operation modes. You must read 1 bytes from the device. The
result is number of free-fall interrupts since the last successful
-read (or 255 if number of interrupts would not fit). See the hpfall.c
+read (or 255 if number of interrupts would not fit). See the freefall.c
file for an example on using the device.
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt
index 81c0e2b..8afb236c 100644
--- a/Documentation/power/regulator/consumer.txt
+++ b/Documentation/power/regulator/consumer.txt
@@ -143,8 +143,9 @@
on all its consumers) and change operating mode (if necessary and permitted)
to best match the current operating load.
-The load_uA value can be determined from the consumers datasheet. e.g.most
-datasheets have tables showing the max current consumed in certain situations.
+The load_uA value can be determined from the consumer's datasheet. e.g. most
+datasheets have tables showing the maximum current consumed in certain
+situations.
Most consumers will use indirect operating mode control since they have no
knowledge of the regulator or whether the regulator is shared with other
@@ -173,7 +174,7 @@
int regulator_register_notifier(struct regulator *regulator,
struct notifier_block *nb);
-Consumers can uregister interest by calling :-
+Consumers can unregister interest by calling :-
int regulator_unregister_notifier(struct regulator *regulator,
struct notifier_block *nb);
diff --git a/Documentation/power/regulator/design.txt b/Documentation/power/regulator/design.txt
index f9b56b7..fdd919b 100644
--- a/Documentation/power/regulator/design.txt
+++ b/Documentation/power/regulator/design.txt
@@ -9,14 +9,14 @@
- Errors in regulator configuration can have very serious consequences
for the system, potentially including lasting hardware damage.
- - It is not possible to automatically determine the power confugration
+ - It is not possible to automatically determine the power configuration
of the system - software-equivalent variants of the same chip may
- have different power requirments, and not all components with power
+ have different power requirements, and not all components with power
requirements are visible to software.
=> The API should make no changes to the hardware state unless it has
- specific knowledge that these changes are safe to do perform on
- this particular system.
+ specific knowledge that these changes are safe to perform on this
+ particular system.
Consumer use cases
------------------
diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt
index ce63af0..757e3b5 100644
--- a/Documentation/power/regulator/machine.txt
+++ b/Documentation/power/regulator/machine.txt
@@ -11,7 +11,7 @@
+-> [Consumer B @ 3.3V]
The drivers for consumers A & B must be mapped to the correct regulator in
-order to control their power supply. This mapping can be achieved in machine
+order to control their power supplies. This mapping can be achieved in machine
initialisation code by creating a struct regulator_consumer_supply for
each regulator.
@@ -39,7 +39,7 @@
Constraints can now be registered by defining a struct regulator_init_data
for each regulator power domain. This structure also maps the consumers
-to their supply regulator :-
+to their supply regulators :-
static struct regulator_init_data regulator1_data = {
.constraints = {
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt
index 8ed1758..40ca2d6 100644
--- a/Documentation/power/regulator/overview.txt
+++ b/Documentation/power/regulator/overview.txt
@@ -36,11 +36,11 @@
Consumers can be classified into two types:-
Static: consumer does not change its supply voltage or
- current limit. It only needs to enable or disable it's
+ current limit. It only needs to enable or disable its
power supply. Its supply voltage is set by the hardware,
bootloader, firmware or kernel board initialisation code.
- Dynamic: consumer needs to change it's supply voltage or
+ Dynamic: consumer needs to change its supply voltage or
current limit to meet operation demands.
@@ -156,7 +156,7 @@
This interface is for machine specific code and allows the creation of
voltage/current domains (with constraints) for each regulator. It can
provide regulator constraints that will prevent device damage through
- overvoltage or over current caused by buggy client drivers. It also
+ overvoltage or overcurrent caused by buggy client drivers. It also
allows the creation of a regulator tree whereby some regulators are
supplied by others (similar to a clock tree).
diff --git a/Documentation/power/regulator/regulator.txt b/Documentation/power/regulator/regulator.txt
index 1390277..b17e583 100644
--- a/Documentation/power/regulator/regulator.txt
+++ b/Documentation/power/regulator/regulator.txt
@@ -13,7 +13,7 @@
struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
const struct regulator_config *config);
-This will register the regulators capabilities and operations to the regulator
+This will register the regulator's capabilities and operations to the regulator
core.
Regulators can be unregistered by calling :-
@@ -23,8 +23,8 @@
Regulator Events
================
-Regulators can send events (e.g. over temp, under voltage, etc) to consumer
-drivers by calling :-
+Regulators can send events (e.g. overtemperature, undervoltage, etc) to
+consumer drivers by calling :-
int regulator_notifier_call_chain(struct regulator_dev *rdev,
unsigned long event, void *data);
diff --git a/MAINTAINERS b/MAINTAINERS
index cf24bb5..7d72ba9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5594,8 +5594,8 @@
L: linux-kernel@vger.kernel.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/locking
S: Maintained
-F: Documentation/lockdep*.txt
-F: Documentation/lockstat.txt
+F: Documentation/locking/lockdep*.txt
+F: Documentation/locking/lockstat.txt
F: include/linux/lockdep.h
F: kernel/locking/
@@ -6145,6 +6145,14 @@
W: http://www.native-instruments.com
F: sound/usb/caiaq/
+NATIVE LINUX KVM TOOL
+M: Pekka Enberg <penberg@kernel.org>
+M: Sasha Levin <levinsasha928@gmail.com>
+M: Asias He <asias.hejun@gmail.com>
+L: kvm@vger.kernel.org
+S: Maintained
+F: tools/kvm/
+
NCP FILESYSTEM
M: Petr Vandrovec <petr@vandrovec.name>
S: Odd Fixes
@@ -10070,9 +10078,9 @@
F: arch/x86/
X86 PLATFORM DRIVERS
-M: Matthew Garrett <matthew.garrett@nebula.com>
+M: Darren Hart <dvhart@infradead.org>
L: platform-driver-x86@vger.kernel.org
-T: git git://cavan.codon.org.uk/platform-drivers-x86.git
+T: git git://git.infradead.org/users/dvhart/linux-platform-drivers-x86.git
S: Maintained
F: drivers/platform/x86/
diff --git a/Makefile b/Makefile
index 2893d7f..1a60bdd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 3
PATCHLEVEL = 17
SUBLEVEL = 0
-EXTRAVERSION = -rc3
+EXTRAVERSION = -rc4
NAME = Shuffling Zombie Juror
# *DOCUMENTATION*
diff --git a/arch/Kconfig b/arch/Kconfig
index 0eae9df..05d7a8a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -323,6 +323,17 @@
results in the system call being skipped immediately.
- seccomp syscall wired up
+ For best performance, an arch should use seccomp_phase1 and
+ seccomp_phase2 directly. It should call seccomp_phase1 for all
+ syscalls if TIF_SECCOMP is set, but seccomp_phase1 does not
+ need to be called from a ptrace-safe context. It must then
+ call seccomp_phase2 if seccomp_phase1 returns anything other
+ than SECCOMP_PHASE1_OK or SECCOMP_PHASE1_SKIP.
+
+ As an additional optimization, an arch may provide seccomp_data
+ directly to seccomp_phase1; this avoids multiple calls
+ to the syscall_xyz helpers for every syscall.
+
config SECCOMP_FILTER
def_bool y
depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index ed60a1e..6fbb53a 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -29,145 +29,92 @@
* branch back to restart the operation.
*/
-static __inline__ void atomic_add(int i, atomic_t * v)
-{
- unsigned long temp;
- __asm__ __volatile__(
- "1: ldl_l %0,%1\n"
- " addl %0,%2,%0\n"
- " stl_c %0,%1\n"
- " beq %0,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- :"=&r" (temp), "=m" (v->counter)
- :"Ir" (i), "m" (v->counter));
+#define ATOMIC_OP(op) \
+static __inline__ void atomic_##op(int i, atomic_t * v) \
+{ \
+ unsigned long temp; \
+ __asm__ __volatile__( \
+ "1: ldl_l %0,%1\n" \
+ " " #op "l %0,%2,%0\n" \
+ " stl_c %0,%1\n" \
+ " beq %0,2f\n" \
+ ".subsection 2\n" \
+ "2: br 1b\n" \
+ ".previous" \
+ :"=&r" (temp), "=m" (v->counter) \
+ :"Ir" (i), "m" (v->counter)); \
+}
+
+#define ATOMIC_OP_RETURN(op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ long temp, result; \
+ smp_mb(); \
+ __asm__ __volatile__( \
+ "1: ldl_l %0,%1\n" \
+ " " #op "l %0,%3,%2\n" \
+ " " #op "l %0,%3,%0\n" \
+ " stl_c %0,%1\n" \
+ " beq %0,2f\n" \
+ ".subsection 2\n" \
+ "2: br 1b\n" \
+ ".previous" \
+ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \
+ :"Ir" (i), "m" (v->counter) : "memory"); \
+ smp_mb(); \
+ return result; \
}
-static __inline__ void atomic64_add(long i, atomic64_t * v)
-{
- unsigned long temp;
- __asm__ __volatile__(
- "1: ldq_l %0,%1\n"
- " addq %0,%2,%0\n"
- " stq_c %0,%1\n"
- " beq %0,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- :"=&r" (temp), "=m" (v->counter)
- :"Ir" (i), "m" (v->counter));
+#define ATOMIC64_OP(op) \
+static __inline__ void atomic64_##op(long i, atomic64_t * v) \
+{ \
+ unsigned long temp; \
+ __asm__ __volatile__( \
+ "1: ldq_l %0,%1\n" \
+ " " #op "q %0,%2,%0\n" \
+ " stq_c %0,%1\n" \
+ " beq %0,2f\n" \
+ ".subsection 2\n" \
+ "2: br 1b\n" \
+ ".previous" \
+ :"=&r" (temp), "=m" (v->counter) \
+ :"Ir" (i), "m" (v->counter)); \
+}
+
+#define ATOMIC64_OP_RETURN(op) \
+static __inline__ long atomic64_##op##_return(long i, atomic64_t * v) \
+{ \
+ long temp, result; \
+ smp_mb(); \
+ __asm__ __volatile__( \
+ "1: ldq_l %0,%1\n" \
+ " " #op "q %0,%3,%2\n" \
+ " " #op "q %0,%3,%0\n" \
+ " stq_c %0,%1\n" \
+ " beq %0,2f\n" \
+ ".subsection 2\n" \
+ "2: br 1b\n" \
+ ".previous" \
+ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \
+ :"Ir" (i), "m" (v->counter) : "memory"); \
+ smp_mb(); \
+ return result; \
}
-static __inline__ void atomic_sub(int i, atomic_t * v)
-{
- unsigned long temp;
- __asm__ __volatile__(
- "1: ldl_l %0,%1\n"
- " subl %0,%2,%0\n"
- " stl_c %0,%1\n"
- " beq %0,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- :"=&r" (temp), "=m" (v->counter)
- :"Ir" (i), "m" (v->counter));
-}
+#define ATOMIC_OPS(opg) \
+ ATOMIC_OP(opg) \
+ ATOMIC_OP_RETURN(opg) \
+ ATOMIC64_OP(opg) \
+ ATOMIC64_OP_RETURN(opg)
-static __inline__ void atomic64_sub(long i, atomic64_t * v)
-{
- unsigned long temp;
- __asm__ __volatile__(
- "1: ldq_l %0,%1\n"
- " subq %0,%2,%0\n"
- " stq_c %0,%1\n"
- " beq %0,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- :"=&r" (temp), "=m" (v->counter)
- :"Ir" (i), "m" (v->counter));
-}
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
-
-/*
- * Same as above, but return the result value
- */
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- long temp, result;
- smp_mb();
- __asm__ __volatile__(
- "1: ldl_l %0,%1\n"
- " addl %0,%3,%2\n"
- " addl %0,%3,%0\n"
- " stl_c %0,%1\n"
- " beq %0,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- :"=&r" (temp), "=m" (v->counter), "=&r" (result)
- :"Ir" (i), "m" (v->counter) : "memory");
- smp_mb();
- return result;
-}
-
-static __inline__ long atomic64_add_return(long i, atomic64_t * v)
-{
- long temp, result;
- smp_mb();
- __asm__ __volatile__(
- "1: ldq_l %0,%1\n"
- " addq %0,%3,%2\n"
- " addq %0,%3,%0\n"
- " stq_c %0,%1\n"
- " beq %0,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- :"=&r" (temp), "=m" (v->counter), "=&r" (result)
- :"Ir" (i), "m" (v->counter) : "memory");
- smp_mb();
- return result;
-}
-
-static __inline__ long atomic_sub_return(int i, atomic_t * v)
-{
- long temp, result;
- smp_mb();
- __asm__ __volatile__(
- "1: ldl_l %0,%1\n"
- " subl %0,%3,%2\n"
- " subl %0,%3,%0\n"
- " stl_c %0,%1\n"
- " beq %0,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- :"=&r" (temp), "=m" (v->counter), "=&r" (result)
- :"Ir" (i), "m" (v->counter) : "memory");
- smp_mb();
- return result;
-}
-
-static __inline__ long atomic64_sub_return(long i, atomic64_t * v)
-{
- long temp, result;
- smp_mb();
- __asm__ __volatile__(
- "1: ldq_l %0,%1\n"
- " subq %0,%3,%2\n"
- " subq %0,%3,%0\n"
- " stq_c %0,%1\n"
- " beq %0,2f\n"
- ".subsection 2\n"
- "2: br 1b\n"
- ".previous"
- :"=&r" (temp), "=m" (v->counter), "=&r" (result)
- :"Ir" (i), "m" (v->counter) : "memory");
- smp_mb();
- return result;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), old, new))
#define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 83f03ca..173f303 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -25,79 +25,36 @@
#define atomic_set(v, i) (((v)->counter) = (i))
-static inline void atomic_add(int i, atomic_t *v)
-{
- unsigned int temp;
+#define ATOMIC_OP(op, c_op, asm_op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned int temp; \
+ \
+ __asm__ __volatile__( \
+ "1: llock %0, [%1] \n" \
+ " " #asm_op " %0, %0, %2 \n" \
+ " scond %0, [%1] \n" \
+ " bnz 1b \n" \
+ : "=&r"(temp) /* Early clobber, to prevent reg reuse */ \
+ : "r"(&v->counter), "ir"(i) \
+ : "cc"); \
+}
- __asm__ __volatile__(
- "1: llock %0, [%1] \n"
- " add %0, %0, %2 \n"
- " scond %0, [%1] \n"
- " bnz 1b \n"
- : "=&r"(temp) /* Early clobber, to prevent reg reuse */
- : "r"(&v->counter), "ir"(i)
- : "cc");
-}
-
-static inline void atomic_sub(int i, atomic_t *v)
-{
- unsigned int temp;
-
- __asm__ __volatile__(
- "1: llock %0, [%1] \n"
- " sub %0, %0, %2 \n"
- " scond %0, [%1] \n"
- " bnz 1b \n"
- : "=&r"(temp)
- : "r"(&v->counter), "ir"(i)
- : "cc");
-}
-
-/* add and also return the new value */
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- unsigned int temp;
-
- __asm__ __volatile__(
- "1: llock %0, [%1] \n"
- " add %0, %0, %2 \n"
- " scond %0, [%1] \n"
- " bnz 1b \n"
- : "=&r"(temp)
- : "r"(&v->counter), "ir"(i)
- : "cc");
-
- return temp;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- unsigned int temp;
-
- __asm__ __volatile__(
- "1: llock %0, [%1] \n"
- " sub %0, %0, %2 \n"
- " scond %0, [%1] \n"
- " bnz 1b \n"
- : "=&r"(temp)
- : "r"(&v->counter), "ir"(i)
- : "cc");
-
- return temp;
-}
-
-static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
-{
- unsigned int temp;
-
- __asm__ __volatile__(
- "1: llock %0, [%1] \n"
- " bic %0, %0, %2 \n"
- " scond %0, [%1] \n"
- " bnz 1b \n"
- : "=&r"(temp)
- : "r"(addr), "ir"(mask)
- : "cc");
+#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ unsigned int temp; \
+ \
+ __asm__ __volatile__( \
+ "1: llock %0, [%1] \n" \
+ " " #asm_op " %0, %0, %2 \n" \
+ " scond %0, [%1] \n" \
+ " bnz 1b \n" \
+ : "=&r"(temp) \
+ : "r"(&v->counter), "ir"(i) \
+ : "cc"); \
+ \
+ return temp; \
}
#else /* !CONFIG_ARC_HAS_LLSC */
@@ -126,6 +83,7 @@
v->counter = i;
atomic_ops_unlock(flags);
}
+
#endif
/*
@@ -133,63 +91,47 @@
* Locking would change to irq-disabling only (UP) and spinlocks (SMP)
*/
-static inline void atomic_add(int i, atomic_t *v)
-{
- unsigned long flags;
-
- atomic_ops_lock(flags);
- v->counter += i;
- atomic_ops_unlock(flags);
+#define ATOMIC_OP(op, c_op, asm_op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ \
+ atomic_ops_lock(flags); \
+ v->counter c_op i; \
+ atomic_ops_unlock(flags); \
}
-static inline void atomic_sub(int i, atomic_t *v)
-{
- unsigned long flags;
-
- atomic_ops_lock(flags);
- v->counter -= i;
- atomic_ops_unlock(flags);
-}
-
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- unsigned long flags;
- unsigned long temp;
-
- atomic_ops_lock(flags);
- temp = v->counter;
- temp += i;
- v->counter = temp;
- atomic_ops_unlock(flags);
-
- return temp;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- unsigned long flags;
- unsigned long temp;
-
- atomic_ops_lock(flags);
- temp = v->counter;
- temp -= i;
- v->counter = temp;
- atomic_ops_unlock(flags);
-
- return temp;
-}
-
-static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
-{
- unsigned long flags;
-
- atomic_ops_lock(flags);
- *addr &= ~mask;
- atomic_ops_unlock(flags);
+#define ATOMIC_OP_RETURN(op, c_op, asm_op)				\
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ unsigned long temp; \
+ \
+ atomic_ops_lock(flags); \
+ temp = v->counter; \
+ temp c_op i; \
+ v->counter = temp; \
+ atomic_ops_unlock(flags); \
+ \
+ return temp; \
}
#endif /* !CONFIG_ARC_HAS_LLSC */
+#define ATOMIC_OPS(op, c_op, asm_op) \
+ ATOMIC_OP(op, c_op, asm_op) \
+ ATOMIC_OP_RETURN(op, c_op, asm_op)
+
+ATOMIC_OPS(add, +=, add)
+ATOMIC_OPS(sub, -=, sub)
+ATOMIC_OP(and, &=, and)
+
+#define atomic_clear_mask(mask, v) atomic_and(~(mask), (v))
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
/**
* __atomic_add_unless - add unless the number is a given value
* @v: pointer of type atomic_t
diff --git a/arch/arc/mm/cache_arc700.c b/arch/arc/mm/cache_arc700.c
index e88ddbf..9e11427 100644
--- a/arch/arc/mm/cache_arc700.c
+++ b/arch/arc/mm/cache_arc700.c
@@ -427,7 +427,7 @@
static void __ic_line_inv_vaddr_helper(void *info)
{
- struct ic_inv *ic_inv_args = (struct ic_inv_args *) info;
+ struct ic_inv_args *ic_inv = info;
__ic_line_inv_vaddr_local(ic_inv->paddr, ic_inv->vaddr, ic_inv->sz);
}
diff --git a/arch/arm/boot/dts/am4372.dtsi b/arch/arm/boot/dts/am4372.dtsi
index 9b3d2ba..8689949 100644
--- a/arch/arm/boot/dts/am4372.dtsi
+++ b/arch/arm/boot/dts/am4372.dtsi
@@ -804,7 +804,7 @@
usb1: usb@48390000 {
compatible = "synopsys,dwc3";
- reg = <0x48390000 0x17000>;
+ reg = <0x48390000 0x10000>;
interrupts = <GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>;
phys = <&usb2_phy1>;
phy-names = "usb2-phy";
@@ -826,7 +826,7 @@
usb2: usb@483d0000 {
compatible = "synopsys,dwc3";
- reg = <0x483d0000 0x17000>;
+ reg = <0x483d0000 0x10000>;
interrupts = <GIC_SPI 174 IRQ_TYPE_LEVEL_HIGH>;
phys = <&usb2_phy2>;
phy-names = "usb2-phy";
diff --git a/arch/arm/boot/dts/am437x-gp-evm.dts b/arch/arm/boot/dts/am437x-gp-evm.dts
index 646a6ea..e7ac47f 100644
--- a/arch/arm/boot/dts/am437x-gp-evm.dts
+++ b/arch/arm/boot/dts/am437x-gp-evm.dts
@@ -260,7 +260,7 @@
status = "okay";
pinctrl-names = "default";
pinctrl-0 = <&i2c0_pins>;
- clock-frequency = <400000>;
+ clock-frequency = <100000>;
tps65218: tps65218@24 {
reg = <0x24>;
@@ -424,7 +424,7 @@
ranges = <0 0 0 0x01000000>; /* minimum GPMC partition = 16MB */
nand@0,0 {
reg = <0 0 4>; /* device IO registers */
- ti,nand-ecc-opt = "bch8";
+ ti,nand-ecc-opt = "bch16";
ti,elm-id = <&elm>;
nand-bus-width = <8>;
gpmc,device-width = <1>;
@@ -443,8 +443,6 @@
gpmc,rd-cycle-ns = <40>;
gpmc,wr-cycle-ns = <40>;
gpmc,wait-pin = <0>;
- gpmc,wait-on-read;
- gpmc,wait-on-write;
gpmc,bus-turnaround-ns = <0>;
gpmc,cycle2cycle-delay-ns = <0>;
gpmc,clk-activation-ns = <0>;
diff --git a/arch/arm/boot/dts/am43x-epos-evm.dts b/arch/arm/boot/dts/am43x-epos-evm.dts
index ed7dd23..ac3e485 100644
--- a/arch/arm/boot/dts/am43x-epos-evm.dts
+++ b/arch/arm/boot/dts/am43x-epos-evm.dts
@@ -435,13 +435,13 @@
};
&gpmc {
- status = "okay";
+ status = "okay"; /* Disable QSPI when enabling GPMC (NAND) */
pinctrl-names = "default";
pinctrl-0 = <&nand_flash_x8>;
ranges = <0 0 0x08000000 0x10000000>; /* CS0: NAND */
nand@0,0 {
reg = <0 0 0>; /* CS0, offset 0 */
- ti,nand-ecc-opt = "bch8";
+ ti,nand-ecc-opt = "bch16";
ti,elm-id = <&elm>;
nand-bus-width = <8>;
gpmc,device-width = <1>;
@@ -459,8 +459,7 @@
gpmc,access-ns = <30>; /* tCEA + 4*/
gpmc,rd-cycle-ns = <40>;
gpmc,wr-cycle-ns = <40>;
- gpmc,wait-on-read = "true";
- gpmc,wait-on-write = "true";
+ gpmc,wait-pin = <0>;
gpmc,bus-turnaround-ns = <0>;
gpmc,cycle2cycle-delay-ns = <0>;
gpmc,clk-activation-ns = <0>;
@@ -557,7 +556,7 @@
};
&qspi {
- status = "okay";
+ status = "disabled"; /* Disable GPMC (NAND) when enabling QSPI */
pinctrl-names = "default";
pinctrl-0 = <&qspi1_default>;
diff --git a/arch/arm/boot/dts/at91rm9200.dtsi b/arch/arm/boot/dts/at91rm9200.dtsi
index 65ccf56..6c97d4a 100644
--- a/arch/arm/boot/dts/at91rm9200.dtsi
+++ b/arch/arm/boot/dts/at91rm9200.dtsi
@@ -149,7 +149,7 @@
usb: usbck {
compatible = "atmel,at91rm9200-clk-usb";
#clock-cells = <0>;
- atmel,clk-divisors = <1 2>;
+ atmel,clk-divisors = <1 2 0 0>;
clocks = <&pllb>;
};
diff --git a/arch/arm/boot/dts/at91sam9g20.dtsi b/arch/arm/boot/dts/at91sam9g20.dtsi
index 31f7652..4e0abbd 100644
--- a/arch/arm/boot/dts/at91sam9g20.dtsi
+++ b/arch/arm/boot/dts/at91sam9g20.dtsi
@@ -40,6 +40,7 @@
};
pllb: pllbck {
+ compatible = "atmel,at91sam9g20-clk-pllb";
atmel,clk-input-range = <2000000 32000000>;
atmel,pll-clk-output-ranges = <30000000 100000000 0 0>;
};
diff --git a/arch/arm/boot/dts/dra7-evm.dts b/arch/arm/boot/dts/dra7-evm.dts
index 50f8022..e03fbf3 100644
--- a/arch/arm/boot/dts/dra7-evm.dts
+++ b/arch/arm/boot/dts/dra7-evm.dts
@@ -8,6 +8,7 @@
/dts-v1/;
#include "dra74x.dtsi"
+#include <dt-bindings/gpio/gpio.h>
/ {
model = "TI DRA742";
@@ -24,9 +25,29 @@
regulator-min-microvolt = <3300000>;
regulator-max-microvolt = <3300000>;
};
+
+ vtt_fixed: fixedregulator-vtt {
+ compatible = "regulator-fixed";
+ regulator-name = "vtt_fixed";
+ regulator-min-microvolt = <1350000>;
+ regulator-max-microvolt = <1350000>;
+ regulator-always-on;
+ regulator-boot-on;
+ enable-active-high;
+ gpio = <&gpio7 11 GPIO_ACTIVE_HIGH>;
+ };
};
&dra7_pmx_core {
+ pinctrl-names = "default";
+ pinctrl-0 = <&vtt_pin>;
+
+ vtt_pin: pinmux_vtt_pin {
+ pinctrl-single,pins = <
+ 0x3b4 (PIN_OUTPUT | MUX_MODE14) /* spi1_cs1.gpio7_11 */
+ >;
+ };
+
i2c1_pins: pinmux_i2c1_pins {
pinctrl-single,pins = <
0x400 (PIN_INPUT | MUX_MODE0) /* i2c1_sda */
@@ -43,20 +64,19 @@
i2c3_pins: pinmux_i2c3_pins {
pinctrl-single,pins = <
- 0x410 (PIN_INPUT | MUX_MODE0) /* i2c3_sda */
- 0x414 (PIN_INPUT | MUX_MODE0) /* i2c3_scl */
+ 0x288 (PIN_INPUT | MUX_MODE9) /* gpio6_14.i2c3_sda */
+ 0x28c (PIN_INPUT | MUX_MODE9) /* gpio6_15.i2c3_scl */
>;
};
mcspi1_pins: pinmux_mcspi1_pins {
pinctrl-single,pins = <
- 0x3a4 (PIN_INPUT | MUX_MODE0) /* spi2_clk */
- 0x3a8 (PIN_INPUT | MUX_MODE0) /* spi2_d1 */
- 0x3ac (PIN_INPUT | MUX_MODE0) /* spi2_d0 */
- 0x3b0 (PIN_INPUT_SLEW | MUX_MODE0) /* spi2_cs0 */
- 0x3b4 (PIN_INPUT_SLEW | MUX_MODE0) /* spi2_cs1 */
- 0x3b8 (PIN_INPUT_SLEW | MUX_MODE6) /* spi2_cs2 */
- 0x3bc (PIN_INPUT_SLEW | MUX_MODE6) /* spi2_cs3 */
+ 0x3a4 (PIN_INPUT | MUX_MODE0) /* spi1_sclk */
+ 0x3a8 (PIN_INPUT | MUX_MODE0) /* spi1_d1 */
+ 0x3ac (PIN_INPUT | MUX_MODE0) /* spi1_d0 */
+ 0x3b0 (PIN_INPUT_SLEW | MUX_MODE0) /* spi1_cs0 */
+ 0x3b8 (PIN_INPUT_SLEW | MUX_MODE6) /* spi1_cs2.hdmi1_hpd */
+ 0x3bc (PIN_INPUT_SLEW | MUX_MODE6) /* spi1_cs3.hdmi1_cec */
>;
};
@@ -284,7 +304,7 @@
status = "okay";
pinctrl-names = "default";
pinctrl-0 = <&i2c3_pins>;
- clock-frequency = <3400000>;
+ clock-frequency = <400000>;
};
&mcspi1 {
@@ -483,7 +503,7 @@
reg = <0x001c0000 0x00020000>;
};
partition@7 {
- label = "NAND.u-boot-env";
+ label = "NAND.u-boot-env.backup1";
reg = <0x001e0000 0x00020000>;
};
partition@8 {
@@ -504,3 +524,8 @@
&usb2_phy2 {
phy-supply = <&ldousb_reg>;
};
+
+&gpio7 {
+ ti,no-reset-on-init;
+ ti,no-idle-on-init;
+};
diff --git a/arch/arm/boot/dts/omap3xxx-clocks.dtsi b/arch/arm/boot/dts/omap3xxx-clocks.dtsi
index e47ff69..5c37500 100644
--- a/arch/arm/boot/dts/omap3xxx-clocks.dtsi
+++ b/arch/arm/boot/dts/omap3xxx-clocks.dtsi
@@ -467,6 +467,7 @@
ti,bit-shift = <0x1e>;
reg = <0x0d00>;
ti,set-bit-to-disable;
+ ti,set-rate-parent;
};
dpll4_m6_ck: dpll4_m6_ck {
diff --git a/arch/arm/boot/dts/ste-snowball.dts b/arch/arm/boot/dts/ste-snowball.dts
index 4a2000c..3e97a66 100644
--- a/arch/arm/boot/dts/ste-snowball.dts
+++ b/arch/arm/boot/dts/ste-snowball.dts
@@ -116,7 +116,6 @@
msp2: msp@80117000 {
pinctrl-names = "default";
pinctrl-0 = <&msp2_default_mode>;
- status = "okay";
};
msp3: msp@80125000 {
diff --git a/arch/arm/common/edma.c b/arch/arm/common/edma.c
index 8809917..d86771a 100644
--- a/arch/arm/common/edma.c
+++ b/arch/arm/common/edma.c
@@ -1443,14 +1443,14 @@
EXPORT_SYMBOL(edma_assign_channel_eventq);
static int edma_setup_from_hw(struct device *dev, struct edma_soc_info *pdata,
- struct edma *edma_cc)
+ struct edma *edma_cc, int cc_id)
{
int i;
u32 value, cccfg;
s8 (*queue_priority_map)[2];
/* Decode the eDMA3 configuration from CCCFG register */
- cccfg = edma_read(0, EDMA_CCCFG);
+ cccfg = edma_read(cc_id, EDMA_CCCFG);
value = GET_NUM_REGN(cccfg);
edma_cc->num_region = BIT(value);
@@ -1464,7 +1464,8 @@
value = GET_NUM_EVQUE(cccfg);
edma_cc->num_tc = value + 1;
- dev_dbg(dev, "eDMA3 HW configuration (cccfg: 0x%08x):\n", cccfg);
+ dev_dbg(dev, "eDMA3 CC%d HW configuration (cccfg: 0x%08x):\n", cc_id,
+ cccfg);
dev_dbg(dev, "num_region: %u\n", edma_cc->num_region);
dev_dbg(dev, "num_channel: %u\n", edma_cc->num_channels);
dev_dbg(dev, "num_slot: %u\n", edma_cc->num_slots);
@@ -1684,7 +1685,7 @@
return -ENOMEM;
/* Get eDMA3 configuration from IP */
- ret = edma_setup_from_hw(dev, info[j], edma_cc[j]);
+ ret = edma_setup_from_hw(dev, info[j], edma_cc[j], j);
if (ret)
return ret;
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 3040359..832f1cd 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -37,84 +37,47 @@
* store exclusive to ensure that these are atomic. We may loop
* to ensure that the update happens.
*/
-static inline void atomic_add(int i, atomic_t *v)
-{
- unsigned long tmp;
- int result;
- prefetchw(&v->counter);
- __asm__ __volatile__("@ atomic_add\n"
-"1: ldrex %0, [%3]\n"
-" add %0, %0, %4\n"
-" strex %1, %0, [%3]\n"
-" teq %1, #0\n"
-" bne 1b"
- : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
- : "r" (&v->counter), "Ir" (i)
- : "cc");
-}
+#define ATOMIC_OP(op, c_op, asm_op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned long tmp; \
+ int result; \
+ \
+ prefetchw(&v->counter); \
+ __asm__ __volatile__("@ atomic_" #op "\n" \
+"1: ldrex %0, [%3]\n" \
+" " #asm_op " %0, %0, %4\n" \
+" strex %1, %0, [%3]\n" \
+" teq %1, #0\n" \
+" bne 1b" \
+ : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) \
+ : "r" (&v->counter), "Ir" (i) \
+ : "cc"); \
+} \
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- unsigned long tmp;
- int result;
-
- smp_mb();
- prefetchw(&v->counter);
-
- __asm__ __volatile__("@ atomic_add_return\n"
-"1: ldrex %0, [%3]\n"
-" add %0, %0, %4\n"
-" strex %1, %0, [%3]\n"
-" teq %1, #0\n"
-" bne 1b"
- : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
- : "r" (&v->counter), "Ir" (i)
- : "cc");
-
- smp_mb();
-
- return result;
-}
-
-static inline void atomic_sub(int i, atomic_t *v)
-{
- unsigned long tmp;
- int result;
-
- prefetchw(&v->counter);
- __asm__ __volatile__("@ atomic_sub\n"
-"1: ldrex %0, [%3]\n"
-" sub %0, %0, %4\n"
-" strex %1, %0, [%3]\n"
-" teq %1, #0\n"
-" bne 1b"
- : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
- : "r" (&v->counter), "Ir" (i)
- : "cc");
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- unsigned long tmp;
- int result;
-
- smp_mb();
- prefetchw(&v->counter);
-
- __asm__ __volatile__("@ atomic_sub_return\n"
-"1: ldrex %0, [%3]\n"
-" sub %0, %0, %4\n"
-" strex %1, %0, [%3]\n"
-" teq %1, #0\n"
-" bne 1b"
- : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
- : "r" (&v->counter), "Ir" (i)
- : "cc");
-
- smp_mb();
-
- return result;
+#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ unsigned long tmp; \
+ int result; \
+ \
+ smp_mb(); \
+ prefetchw(&v->counter); \
+ \
+ __asm__ __volatile__("@ atomic_" #op "_return\n" \
+"1: ldrex %0, [%3]\n" \
+" " #asm_op " %0, %0, %4\n" \
+" strex %1, %0, [%3]\n" \
+" teq %1, #0\n" \
+" bne 1b" \
+ : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) \
+ : "r" (&v->counter), "Ir" (i) \
+ : "cc"); \
+ \
+ smp_mb(); \
+ \
+ return result; \
}
static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
@@ -174,33 +137,29 @@
#error SMP not supported on pre-ARMv6 CPUs
#endif
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- unsigned long flags;
- int val;
+#define ATOMIC_OP(op, c_op, asm_op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ \
+ raw_local_irq_save(flags); \
+ v->counter c_op i; \
+ raw_local_irq_restore(flags); \
+} \
- raw_local_irq_save(flags);
- val = v->counter;
- v->counter = val += i;
- raw_local_irq_restore(flags);
-
- return val;
+#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ int val; \
+ \
+ raw_local_irq_save(flags); \
+ v->counter c_op i; \
+ val = v->counter; \
+ raw_local_irq_restore(flags); \
+ \
+ return val; \
}
-#define atomic_add(i, v) (void) atomic_add_return(i, v)
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- unsigned long flags;
- int val;
-
- raw_local_irq_save(flags);
- val = v->counter;
- v->counter = val -= i;
- raw_local_irq_restore(flags);
-
- return val;
-}
-#define atomic_sub(i, v) (void) atomic_sub_return(i, v)
static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
{
@@ -228,6 +187,17 @@
#endif /* __LINUX_ARM_ARCH__ */
+#define ATOMIC_OPS(op, c_op, asm_op) \
+ ATOMIC_OP(op, c_op, asm_op) \
+ ATOMIC_OP_RETURN(op, c_op, asm_op)
+
+ATOMIC_OPS(add, +=, add)
+ATOMIC_OPS(sub, -=, sub)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
#define atomic_inc(v) atomic_add(1, v)
@@ -300,89 +270,60 @@
}
#endif
-static inline void atomic64_add(long long i, atomic64_t *v)
-{
- long long result;
- unsigned long tmp;
+#define ATOMIC64_OP(op, op1, op2) \
+static inline void atomic64_##op(long long i, atomic64_t *v) \
+{ \
+ long long result; \
+ unsigned long tmp; \
+ \
+ prefetchw(&v->counter); \
+ __asm__ __volatile__("@ atomic64_" #op "\n" \
+"1: ldrexd %0, %H0, [%3]\n" \
+" " #op1 " %Q0, %Q0, %Q4\n" \
+" " #op2 " %R0, %R0, %R4\n" \
+" strexd %1, %0, %H0, [%3]\n" \
+" teq %1, #0\n" \
+" bne 1b" \
+ : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) \
+ : "r" (&v->counter), "r" (i) \
+ : "cc"); \
+} \
- prefetchw(&v->counter);
- __asm__ __volatile__("@ atomic64_add\n"
-"1: ldrexd %0, %H0, [%3]\n"
-" adds %Q0, %Q0, %Q4\n"
-" adc %R0, %R0, %R4\n"
-" strexd %1, %0, %H0, [%3]\n"
-" teq %1, #0\n"
-" bne 1b"
- : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
- : "r" (&v->counter), "r" (i)
- : "cc");
+#define ATOMIC64_OP_RETURN(op, op1, op2) \
+static inline long long atomic64_##op##_return(long long i, atomic64_t *v) \
+{ \
+ long long result; \
+ unsigned long tmp; \
+ \
+ smp_mb(); \
+ prefetchw(&v->counter); \
+ \
+ __asm__ __volatile__("@ atomic64_" #op "_return\n" \
+"1: ldrexd %0, %H0, [%3]\n" \
+" " #op1 " %Q0, %Q0, %Q4\n" \
+" " #op2 " %R0, %R0, %R4\n" \
+" strexd %1, %0, %H0, [%3]\n" \
+" teq %1, #0\n" \
+" bne 1b" \
+ : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) \
+ : "r" (&v->counter), "r" (i) \
+ : "cc"); \
+ \
+ smp_mb(); \
+ \
+ return result; \
}
-static inline long long atomic64_add_return(long long i, atomic64_t *v)
-{
- long long result;
- unsigned long tmp;
+#define ATOMIC64_OPS(op, op1, op2) \
+ ATOMIC64_OP(op, op1, op2) \
+ ATOMIC64_OP_RETURN(op, op1, op2)
- smp_mb();
- prefetchw(&v->counter);
+ATOMIC64_OPS(add, adds, adc)
+ATOMIC64_OPS(sub, subs, sbc)
- __asm__ __volatile__("@ atomic64_add_return\n"
-"1: ldrexd %0, %H0, [%3]\n"
-" adds %Q0, %Q0, %Q4\n"
-" adc %R0, %R0, %R4\n"
-" strexd %1, %0, %H0, [%3]\n"
-" teq %1, #0\n"
-" bne 1b"
- : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
- : "r" (&v->counter), "r" (i)
- : "cc");
-
- smp_mb();
-
- return result;
-}
-
-static inline void atomic64_sub(long long i, atomic64_t *v)
-{
- long long result;
- unsigned long tmp;
-
- prefetchw(&v->counter);
- __asm__ __volatile__("@ atomic64_sub\n"
-"1: ldrexd %0, %H0, [%3]\n"
-" subs %Q0, %Q0, %Q4\n"
-" sbc %R0, %R0, %R4\n"
-" strexd %1, %0, %H0, [%3]\n"
-" teq %1, #0\n"
-" bne 1b"
- : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
- : "r" (&v->counter), "r" (i)
- : "cc");
-}
-
-static inline long long atomic64_sub_return(long long i, atomic64_t *v)
-{
- long long result;
- unsigned long tmp;
-
- smp_mb();
- prefetchw(&v->counter);
-
- __asm__ __volatile__("@ atomic64_sub_return\n"
-"1: ldrexd %0, %H0, [%3]\n"
-" subs %Q0, %Q0, %Q4\n"
-" sbc %R0, %R0, %R4\n"
-" strexd %1, %0, %H0, [%3]\n"
-" teq %1, #0\n"
-" bne 1b"
- : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
- : "r" (&v->counter), "r" (i)
- : "cc");
-
- smp_mb();
-
- return result;
-}
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
static inline long long atomic64_cmpxchg(atomic64_t *ptr, long long old,
long long new)
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 0c27ed6..5e772a2 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -933,8 +933,13 @@
current_thread_info()->syscall = scno;
/* Do the secure computing check first; failures should be fast. */
- if (secure_computing(scno) == -1)
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+ if (secure_computing() == -1)
return -1;
+#else
+ /* XXX: remove this once OABI gets fixed */
+ secure_computing_strict(scno);
+#endif
if (test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index 4c979d4..a96a804 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -93,6 +93,8 @@
else
kvm_vcpu_block(vcpu);
+ kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+
return 1;
}
diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S
index 991415d..3988e72 100644
--- a/arch/arm/kvm/init.S
+++ b/arch/arm/kvm/init.S
@@ -99,6 +99,10 @@
mrc p15, 0, r0, c10, c2, 1
mcr p15, 4, r0, c10, c2, 1
+ @ Invalidate the stale TLBs from Bootloader
+ mcr p15, 4, r0, c8, c7, 0 @ TLBIALLH
+ dsb ish
+
@ Set the HSCTLR to:
@ - ARM/THUMB exceptions: Kernel config (Thumb-2 kernel)
@ - Endianness: Kernel config
diff --git a/arch/arm/mach-at91/board-dt-rm9200.c b/arch/arm/mach-at91/board-dt-rm9200.c
index 3a185fa..f4b6e91 100644
--- a/arch/arm/mach-at91/board-dt-rm9200.c
+++ b/arch/arm/mach-at91/board-dt-rm9200.c
@@ -14,6 +14,7 @@
#include <linux/gpio.h>
#include <linux/of.h>
#include <linux/of_irq.h>
+#include <linux/clk-provider.h>
#include <asm/setup.h>
#include <asm/irq.h>
@@ -35,13 +36,21 @@
of_irq_init(irq_of_match);
}
+static void __init at91rm9200_dt_timer_init(void)
+{
+#if defined(CONFIG_COMMON_CLK)
+ of_clk_init(NULL);
+#endif
+ at91rm9200_timer_init();
+}
+
static const char *at91rm9200_dt_board_compat[] __initdata = {
"atmel,at91rm9200",
NULL
};
DT_MACHINE_START(at91rm9200_dt, "Atmel AT91RM9200 (Device Tree)")
- .init_time = at91rm9200_timer_init,
+ .init_time = at91rm9200_dt_timer_init,
.map_io = at91_map_io,
.handle_irq = at91_aic_handle_irq,
.init_early = at91rm9200_dt_initialize,
diff --git a/arch/arm/mach-omap2/gpmc.c b/arch/arm/mach-omap2/gpmc.c
index 9f42d54..2f97228 100644
--- a/arch/arm/mach-omap2/gpmc.c
+++ b/arch/arm/mach-omap2/gpmc.c
@@ -1207,8 +1207,7 @@
}
}
- if ((p->wait_on_read || p->wait_on_write) &&
- (p->wait_pin > gpmc_nr_waitpins)) {
+ if (p->wait_pin > gpmc_nr_waitpins) {
pr_err("%s: invalid wait-pin (%d)\n", __func__, p->wait_pin);
return -EINVAL;
}
@@ -1288,8 +1287,8 @@
p->wait_on_write = of_property_read_bool(np,
"gpmc,wait-on-write");
if (!p->wait_on_read && !p->wait_on_write)
- pr_warn("%s: read/write wait monitoring not enabled!\n",
- __func__);
+ pr_debug("%s: rd/wr wait monitoring not enabled!\n",
+ __func__);
}
}
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index c294e67..ae67e88 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -150,7 +150,6 @@
kernel_neon_begin_partial(28);
sha2_ce_transform(blocks, data, sctx->state, NULL, len);
kernel_neon_end();
- data += blocks * SHA256_BLOCK_SIZE;
}
static int sha224_finup(struct shash_desc *desc, const u8 *data,
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index 65f1569..b83c325 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -43,69 +43,51 @@
* store exclusive to ensure that these are atomic. We may loop
* to ensure that the update happens.
*/
-static inline void atomic_add(int i, atomic_t *v)
-{
- unsigned long tmp;
- int result;
- asm volatile("// atomic_add\n"
-"1: ldxr %w0, %2\n"
-" add %w0, %w0, %w3\n"
-" stxr %w1, %w0, %2\n"
-" cbnz %w1, 1b"
- : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
- : "Ir" (i));
+#define ATOMIC_OP(op, asm_op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned long tmp; \
+ int result; \
+ \
+ asm volatile("// atomic_" #op "\n" \
+"1: ldxr %w0, %2\n" \
+" " #asm_op " %w0, %w0, %w3\n" \
+" stxr %w1, %w0, %2\n" \
+" cbnz %w1, 1b" \
+ : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) \
+ : "Ir" (i)); \
+} \
+
+#define ATOMIC_OP_RETURN(op, asm_op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ unsigned long tmp; \
+ int result; \
+ \
+ asm volatile("// atomic_" #op "_return\n" \
+"1: ldxr %w0, %2\n" \
+" " #asm_op " %w0, %w0, %w3\n" \
+" stlxr %w1, %w0, %2\n" \
+" cbnz %w1, 1b" \
+ : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) \
+ : "Ir" (i) \
+ : "memory"); \
+ \
+ smp_mb(); \
+ return result; \
}
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- unsigned long tmp;
- int result;
+#define ATOMIC_OPS(op, asm_op) \
+ ATOMIC_OP(op, asm_op) \
+ ATOMIC_OP_RETURN(op, asm_op)
- asm volatile("// atomic_add_return\n"
-"1: ldxr %w0, %2\n"
-" add %w0, %w0, %w3\n"
-" stlxr %w1, %w0, %2\n"
-" cbnz %w1, 1b"
- : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
- : "Ir" (i)
- : "memory");
+ATOMIC_OPS(add, add)
+ATOMIC_OPS(sub, sub)
- smp_mb();
- return result;
-}
-
-static inline void atomic_sub(int i, atomic_t *v)
-{
- unsigned long tmp;
- int result;
-
- asm volatile("// atomic_sub\n"
-"1: ldxr %w0, %2\n"
-" sub %w0, %w0, %w3\n"
-" stxr %w1, %w0, %2\n"
-" cbnz %w1, 1b"
- : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
- : "Ir" (i));
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- unsigned long tmp;
- int result;
-
- asm volatile("// atomic_sub_return\n"
-"1: ldxr %w0, %2\n"
-" sub %w0, %w0, %w3\n"
-" stlxr %w1, %w0, %2\n"
-" cbnz %w1, 1b"
- : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
- : "Ir" (i)
- : "memory");
-
- smp_mb();
- return result;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
{
@@ -160,69 +142,50 @@
#define atomic64_read(v) (*(volatile long *)&(v)->counter)
#define atomic64_set(v,i) (((v)->counter) = (i))
-static inline void atomic64_add(u64 i, atomic64_t *v)
-{
- long result;
- unsigned long tmp;
+#define ATOMIC64_OP(op, asm_op) \
+static inline void atomic64_##op(long i, atomic64_t *v) \
+{ \
+ long result; \
+ unsigned long tmp; \
+ \
+ asm volatile("// atomic64_" #op "\n" \
+"1: ldxr %0, %2\n" \
+" " #asm_op " %0, %0, %3\n" \
+" stxr %w1, %0, %2\n" \
+" cbnz %w1, 1b" \
+ : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) \
+ : "Ir" (i)); \
+} \
- asm volatile("// atomic64_add\n"
-"1: ldxr %0, %2\n"
-" add %0, %0, %3\n"
-" stxr %w1, %0, %2\n"
-" cbnz %w1, 1b"
- : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
- : "Ir" (i));
+#define ATOMIC64_OP_RETURN(op, asm_op) \
+static inline long atomic64_##op##_return(long i, atomic64_t *v) \
+{ \
+ long result; \
+ unsigned long tmp; \
+ \
+ asm volatile("// atomic64_" #op "_return\n" \
+"1: ldxr %0, %2\n" \
+" " #asm_op " %0, %0, %3\n" \
+" stlxr %w1, %0, %2\n" \
+" cbnz %w1, 1b" \
+ : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) \
+ : "Ir" (i) \
+ : "memory"); \
+ \
+ smp_mb(); \
+ return result; \
}
-static inline long atomic64_add_return(long i, atomic64_t *v)
-{
- long result;
- unsigned long tmp;
+#define ATOMIC64_OPS(op, asm_op) \
+ ATOMIC64_OP(op, asm_op) \
+ ATOMIC64_OP_RETURN(op, asm_op)
- asm volatile("// atomic64_add_return\n"
-"1: ldxr %0, %2\n"
-" add %0, %0, %3\n"
-" stlxr %w1, %0, %2\n"
-" cbnz %w1, 1b"
- : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
- : "Ir" (i)
- : "memory");
+ATOMIC64_OPS(add, add)
+ATOMIC64_OPS(sub, sub)
- smp_mb();
- return result;
-}
-
-static inline void atomic64_sub(u64 i, atomic64_t *v)
-{
- long result;
- unsigned long tmp;
-
- asm volatile("// atomic64_sub\n"
-"1: ldxr %0, %2\n"
-" sub %0, %0, %3\n"
-" stxr %w1, %0, %2\n"
-" cbnz %w1, 1b"
- : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
- : "Ir" (i));
-}
-
-static inline long atomic64_sub_return(long i, atomic64_t *v)
-{
- long result;
- unsigned long tmp;
-
- asm volatile("// atomic64_sub_return\n"
-"1: ldxr %0, %2\n"
-" sub %0, %0, %3\n"
-" stlxr %w1, %0, %2\n"
-" cbnz %w1, 1b"
- : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
- : "Ir" (i)
- : "memory");
-
- smp_mb();
- return result;
-}
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
static inline long atomic64_cmpxchg(atomic64_t *ptr, long old, long new)
{
diff --git a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h
index d064047..52b484b 100644
--- a/arch/arm64/include/asm/hw_breakpoint.h
+++ b/arch/arm64/include/asm/hw_breakpoint.h
@@ -79,7 +79,6 @@
*/
#define ARM_MAX_BRP 16
#define ARM_MAX_WRP 16
-#define ARM_MAX_HBP_SLOTS (ARM_MAX_BRP + ARM_MAX_WRP)
/* Virtual debug register bases. */
#define AARCH64_DBG_REG_BVR 0
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 3df21fe..286b1be 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -139,7 +139,7 @@
((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1)
#define KSTK_EIP(tsk) ((unsigned long)task_pt_regs(tsk)->pc)
-#define KSTK_ESP(tsk) ((unsigned long)task_pt_regs(tsk)->sp)
+#define KSTK_ESP(tsk) user_stack_pointer(task_pt_regs(tsk))
/*
* Prefetching support
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 501000f..41ed9e1 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -137,7 +137,7 @@
(!((regs)->pstate & PSR_F_BIT))
#define user_stack_pointer(regs) \
- (!compat_user_mode(regs)) ? ((regs)->sp) : ((regs)->compat_sp)
+ (!compat_user_mode(regs) ? (regs)->sp : (regs)->compat_sp)
static inline unsigned long regs_return_value(struct pt_regs *regs)
{
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index ad8aebb..3dca156 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -270,6 +270,7 @@
case CPU_PM_ENTER:
if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))
fpsimd_save_state(¤t->thread.fpsimd_state);
+ this_cpu_write(fpsimd_last_state, NULL);
break;
case CPU_PM_EXIT:
if (current->mm)
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index bed0283..8730690 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -373,10 +373,6 @@
.long 0
.popsection
- .align 3
-2: .quad .
- .quad PAGE_OFFSET
-
#ifdef CONFIG_SMP
.align 3
1: .quad .
diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c
index 422ebd6..6762ad7 100644
--- a/arch/arm64/kernel/perf_regs.c
+++ b/arch/arm64/kernel/perf_regs.c
@@ -24,6 +24,12 @@
return regs->compat_lr;
}
+ if ((u32)idx == PERF_REG_ARM64_SP)
+ return regs->sp;
+
+ if ((u32)idx == PERF_REG_ARM64_PC)
+ return regs->pc;
+
return regs->regs[idx];
}
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 70526cf..fe63ac5 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -87,7 +87,8 @@
break;
}
}
- for (i = ARM_MAX_BRP; i < ARM_MAX_HBP_SLOTS && !bp; ++i) {
+
+ for (i = 0; i < ARM_MAX_WRP; ++i) {
if (current->thread.debug.hbp_watch[i] == bp) {
info.si_errno = -((i << 1) + 1);
break;
@@ -662,8 +663,10 @@
kbuf += sizeof(reg);
} else {
ret = copy_to_user(ubuf, ®, sizeof(reg));
- if (ret)
+ if (ret) {
+ ret = -EFAULT;
break;
+ }
ubuf += sizeof(reg);
}
@@ -701,8 +704,10 @@
kbuf += sizeof(reg);
} else {
ret = copy_from_user(®, ubuf, sizeof(reg));
- if (ret)
- return ret;
+ if (ret) {
+ ret = -EFAULT;
+ break;
+ }
ubuf += sizeof(reg);
}
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index f6f0ccf..edb146d 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -78,6 +78,7 @@
#endif
static const char *cpu_name;
+static const char *machine_name;
phys_addr_t __fdt_pointer __initdata;
/*
@@ -309,6 +310,8 @@
while (true)
cpu_relax();
}
+
+ machine_name = of_flat_dt_get_machine_name();
}
/*
@@ -447,21 +450,10 @@
{
int i;
- /*
- * Dump out the common processor features in a single line. Userspace
- * should read the hwcaps with getauxval(AT_HWCAP) rather than
- * attempting to parse this.
- */
- seq_puts(m, "features\t:");
- for (i = 0; hwcap_str[i]; i++)
- if (elf_hwcap & (1 << i))
- seq_printf(m, " %s", hwcap_str[i]);
- seq_puts(m, "\n\n");
+ seq_printf(m, "Processor\t: %s rev %d (%s)\n",
+ cpu_name, read_cpuid_id() & 15, ELF_PLATFORM);
for_each_online_cpu(i) {
- struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i);
- u32 midr = cpuinfo->reg_midr;
-
/*
* glibc reads /proc/cpuinfo to determine the number of
* online processors, looking for lines beginning with
@@ -470,13 +462,25 @@
#ifdef CONFIG_SMP
seq_printf(m, "processor\t: %d\n", i);
#endif
- seq_printf(m, "implementer\t: 0x%02x\n",
- MIDR_IMPLEMENTOR(midr));
- seq_printf(m, "variant\t\t: 0x%x\n", MIDR_VARIANT(midr));
- seq_printf(m, "partnum\t\t: 0x%03x\n", MIDR_PARTNUM(midr));
- seq_printf(m, "revision\t: 0x%x\n\n", MIDR_REVISION(midr));
}
+ /* dump out the processor features */
+ seq_puts(m, "Features\t: ");
+
+ for (i = 0; hwcap_str[i]; i++)
+ if (elf_hwcap & (1 << i))
+ seq_printf(m, "%s ", hwcap_str[i]);
+
+ seq_printf(m, "\nCPU implementer\t: 0x%02x\n", read_cpuid_id() >> 24);
+ seq_printf(m, "CPU architecture: AArch64\n");
+ seq_printf(m, "CPU variant\t: 0x%x\n", (read_cpuid_id() >> 20) & 15);
+ seq_printf(m, "CPU part\t: 0x%03x\n", (read_cpuid_id() >> 4) & 0xfff);
+ seq_printf(m, "CPU revision\t: %d\n", read_cpuid_id() & 15);
+
+ seq_puts(m, "\n");
+
+ seq_printf(m, "Hardware\t: %s\n", machine_name);
+
return 0;
}
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index e28be51..34b8bd0 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -66,6 +66,8 @@
else
kvm_vcpu_block(vcpu);
+ kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+
return 1;
}
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index d968796..c319116 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -80,6 +80,10 @@
msr mair_el2, x4
isb
+ /* Invalidate the stale TLBs from Bootloader */
+ tlbi alle2
+ dsb sy
+
mrs x4, sctlr_el2
and x4, x4, #SCTLR_EL2_EE // preserve endianness of EL2
ldr x5, =SCTLR_EL2_FLAGS
diff --git a/arch/avr32/include/asm/atomic.h b/arch/avr32/include/asm/atomic.h
index 0780f3f..83e980a 100644
--- a/arch/avr32/include/asm/atomic.h
+++ b/arch/avr32/include/asm/atomic.h
@@ -22,31 +22,44 @@
#define atomic_read(v) (*(volatile int *)&(v)->counter)
#define atomic_set(v, i) (((v)->counter) = i)
-/*
- * atomic_sub_return - subtract the atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v. Returns the resulting value.
- */
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- int result;
-
- asm volatile(
- "/* atomic_sub_return */\n"
- "1: ssrf 5\n"
- " ld.w %0, %2\n"
- " sub %0, %3\n"
- " stcond %1, %0\n"
- " brne 1b"
- : "=&r"(result), "=o"(v->counter)
- : "m"(v->counter), "rKs21"(i)
- : "cc");
-
- return result;
+#define ATOMIC_OP_RETURN(op, asm_op, asm_con) \
+static inline int __atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ int result; \
+ \
+ asm volatile( \
+ "/* atomic_" #op "_return */\n" \
+ "1: ssrf 5\n" \
+ " ld.w %0, %2\n" \
+ " " #asm_op " %0, %3\n" \
+ " stcond %1, %0\n" \
+ " brne 1b" \
+ : "=&r" (result), "=o" (v->counter) \
+ : "m" (v->counter), #asm_con (i) \
+ : "cc"); \
+ \
+ return result; \
}
+ATOMIC_OP_RETURN(sub, sub, rKs21)
+ATOMIC_OP_RETURN(add, add, r)
+
+#undef ATOMIC_OP_RETURN
+
+/*
+ * Probably found the reason why we want to use sub with the signed 21-bit
+ * limit, it uses one less register than the add instruction that can add up to
+ * 32-bit values.
+ *
+ * Both instructions are 32-bit, to use a 16-bit instruction the immediate is
+ * very small; 4 bit.
+ *
+ * sub 32-bit, type IV, takes a register and subtracts a 21-bit immediate.
+ * add 32-bit, type II, adds two register values together.
+ */
+#define IS_21BIT_CONST(i) \
+ (__builtin_constant_p(i) && ((i) >= -1048575) && ((i) <= 1048576))
+
/*
* atomic_add_return - add integer to atomic variable
* @i: integer value to add
@@ -56,51 +69,25 @@
*/
static inline int atomic_add_return(int i, atomic_t *v)
{
- int result;
+ if (IS_21BIT_CONST(i))
+ return __atomic_sub_return(-i, v);
- if (__builtin_constant_p(i) && (i >= -1048575) && (i <= 1048576))
- result = atomic_sub_return(-i, v);
- else
- asm volatile(
- "/* atomic_add_return */\n"
- "1: ssrf 5\n"
- " ld.w %0, %1\n"
- " add %0, %3\n"
- " stcond %2, %0\n"
- " brne 1b"
- : "=&r"(result), "=o"(v->counter)
- : "m"(v->counter), "r"(i)
- : "cc", "memory");
-
- return result;
+ return __atomic_add_return(i, v);
}
/*
- * atomic_sub_unless - sub unless the number is a given value
+ * atomic_sub_return - subtract the atomic variable
+ * @i: integer value to subtract
* @v: pointer of type atomic_t
- * @a: the amount to subtract from v...
- * @u: ...unless v is equal to u.
*
- * Atomically subtract @a from @v, so long as it was not @u.
- * Returns the old value of @v.
-*/
-static inline void atomic_sub_unless(atomic_t *v, int a, int u)
+ * Atomically subtracts @i from @v. Returns the resulting value.
+ */
+static inline int atomic_sub_return(int i, atomic_t *v)
{
- int tmp;
+ if (IS_21BIT_CONST(i))
+ return __atomic_sub_return(i, v);
- asm volatile(
- "/* atomic_sub_unless */\n"
- "1: ssrf 5\n"
- " ld.w %0, %2\n"
- " cp.w %0, %4\n"
- " breq 1f\n"
- " sub %0, %3\n"
- " stcond %1, %0\n"
- " brne 1b\n"
- "1:"
- : "=&r"(tmp), "=o"(v->counter)
- : "m"(v->counter), "rKs21"(a), "rKs21"(u)
- : "cc", "memory");
+ return __atomic_add_return(-i, v);
}
/*
@@ -116,9 +103,21 @@
{
int tmp, old = atomic_read(v);
- if (__builtin_constant_p(a) && (a >= -1048575) && (a <= 1048576))
- atomic_sub_unless(v, -a, u);
- else {
+ if (IS_21BIT_CONST(a)) {
+ asm volatile(
+ "/* __atomic_sub_unless */\n"
+ "1: ssrf 5\n"
+ " ld.w %0, %2\n"
+ " cp.w %0, %4\n"
+ " breq 1f\n"
+ " sub %0, %3\n"
+ " stcond %1, %0\n"
+ " brne 1b\n"
+ "1:"
+ : "=&r"(tmp), "=o"(v->counter)
+ : "m"(v->counter), "rKs21"(-a), "rKs21"(u)
+ : "cc", "memory");
+ } else {
asm volatile(
"/* __atomic_add_unless */\n"
"1: ssrf 5\n"
@@ -137,6 +136,8 @@
return old;
}
+#undef IS_21BIT_CONST
+
/*
* atomic_sub_if_positive - conditionally subtract integer from atomic variable
* @i: integer value to subtract
diff --git a/arch/cris/include/asm/atomic.h b/arch/cris/include/asm/atomic.h
index aa429ba..0033f9d 100644
--- a/arch/cris/include/asm/atomic.h
+++ b/arch/cris/include/asm/atomic.h
@@ -22,44 +22,37 @@
/* These should be written in asm but we do it in C for now. */
-static inline void atomic_add(int i, volatile atomic_t *v)
-{
- unsigned long flags;
- cris_atomic_save(v, flags);
- v->counter += i;
- cris_atomic_restore(v, flags);
+#define ATOMIC_OP(op, c_op) \
+static inline void atomic_##op(int i, volatile atomic_t *v) \
+{ \
+ unsigned long flags; \
+ cris_atomic_save(v, flags); \
+ v->counter c_op i; \
+ cris_atomic_restore(v, flags); \
+} \
+
+#define ATOMIC_OP_RETURN(op, c_op) \
+static inline int atomic_##op##_return(int i, volatile atomic_t *v) \
+{ \
+ unsigned long flags; \
+ int retval; \
+ cris_atomic_save(v, flags); \
+ retval = (v->counter c_op i); \
+ cris_atomic_restore(v, flags); \
+ return retval; \
}
-static inline void atomic_sub(int i, volatile atomic_t *v)
-{
- unsigned long flags;
- cris_atomic_save(v, flags);
- v->counter -= i;
- cris_atomic_restore(v, flags);
-}
+#define ATOMIC_OPS(op, c_op) ATOMIC_OP(op, c_op) ATOMIC_OP_RETURN(op, c_op)
-static inline int atomic_add_return(int i, volatile atomic_t *v)
-{
- unsigned long flags;
- int retval;
- cris_atomic_save(v, flags);
- retval = (v->counter += i);
- cris_atomic_restore(v, flags);
- return retval;
-}
+ATOMIC_OPS(add, +=)
+ATOMIC_OPS(sub, -=)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
#define atomic_add_negative(a, v) (atomic_add_return((a), (v)) < 0)
-static inline int atomic_sub_return(int i, volatile atomic_t *v)
-{
- unsigned long flags;
- int retval;
- cris_atomic_save(v, flags);
- retval = (v->counter -= i);
- cris_atomic_restore(v, flags);
- return retval;
-}
-
static inline int atomic_sub_and_test(int i, volatile atomic_t *v)
{
int retval;
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index de916b1..93d0702 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -94,41 +94,47 @@
return __oldval;
}
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- int output;
+#define ATOMIC_OP(op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ int output; \
+ \
+ __asm__ __volatile__ ( \
+ "1: %0 = memw_locked(%1);\n" \
+ " %0 = "#op "(%0,%2);\n" \
+ " memw_locked(%1,P3)=%0;\n" \
+ " if !P3 jump 1b;\n" \
+ : "=&r" (output) \
+ : "r" (&v->counter), "r" (i) \
+ : "memory", "p3" \
+ ); \
+} \
- __asm__ __volatile__ (
- "1: %0 = memw_locked(%1);\n"
- " %0 = add(%0,%2);\n"
- " memw_locked(%1,P3)=%0;\n"
- " if !P3 jump 1b;\n"
- : "=&r" (output)
- : "r" (&v->counter), "r" (i)
- : "memory", "p3"
- );
- return output;
-
+#define ATOMIC_OP_RETURN(op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ int output; \
+ \
+ __asm__ __volatile__ ( \
+ "1: %0 = memw_locked(%1);\n" \
+ " %0 = "#op "(%0,%2);\n" \
+ " memw_locked(%1,P3)=%0;\n" \
+ " if !P3 jump 1b;\n" \
+ : "=&r" (output) \
+ : "r" (&v->counter), "r" (i) \
+ : "memory", "p3" \
+ ); \
+ return output; \
}
-#define atomic_add(i, v) atomic_add_return(i, (v))
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- int output;
- __asm__ __volatile__ (
- "1: %0 = memw_locked(%1);\n"
- " %0 = sub(%0,%2);\n"
- " memw_locked(%1,P3)=%0\n"
- " if !P3 jump 1b;\n"
- : "=&r" (output)
- : "r" (&v->counter), "r" (i)
- : "memory", "p3"
- );
- return output;
-}
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
-#define atomic_sub(i, v) atomic_sub_return(i, (v))
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
/**
* __atomic_add_unless - add unless the number is a given value
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index 0f8bf48..42919a8 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -27,61 +27,93 @@
#define atomic_set(v,i) (((v)->counter) = (i))
#define atomic64_set(v,i) (((v)->counter) = (i))
-static __inline__ int
-ia64_atomic_add (int i, atomic_t *v)
-{
- __s32 old, new;
- CMPXCHG_BUGCHECK_DECL
-
- do {
- CMPXCHG_BUGCHECK(v);
- old = atomic_read(v);
- new = old + i;
- } while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic_t)) != old);
- return new;
+#define ATOMIC_OP(op, c_op) \
+static __inline__ int \
+ia64_atomic_##op (int i, atomic_t *v) \
+{ \
+ __s32 old, new; \
+ CMPXCHG_BUGCHECK_DECL \
+ \
+ do { \
+ CMPXCHG_BUGCHECK(v); \
+ old = atomic_read(v); \
+ new = old c_op i; \
+ } while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic_t)) != old); \
+ return new; \
}
-static __inline__ long
-ia64_atomic64_add (__s64 i, atomic64_t *v)
-{
- __s64 old, new;
- CMPXCHG_BUGCHECK_DECL
+ATOMIC_OP(add, +)
+ATOMIC_OP(sub, -)
- do {
- CMPXCHG_BUGCHECK(v);
- old = atomic64_read(v);
- new = old + i;
- } while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic64_t)) != old);
- return new;
+#undef ATOMIC_OP
+
+#define atomic_add_return(i,v) \
+({ \
+ int __ia64_aar_i = (i); \
+ (__builtin_constant_p(i) \
+ && ( (__ia64_aar_i == 1) || (__ia64_aar_i == 4) \
+ || (__ia64_aar_i == 8) || (__ia64_aar_i == 16) \
+ || (__ia64_aar_i == -1) || (__ia64_aar_i == -4) \
+ || (__ia64_aar_i == -8) || (__ia64_aar_i == -16))) \
+ ? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter) \
+ : ia64_atomic_add(__ia64_aar_i, v); \
+})
+
+#define atomic_sub_return(i,v) \
+({ \
+ int __ia64_asr_i = (i); \
+ (__builtin_constant_p(i) \
+ && ( (__ia64_asr_i == 1) || (__ia64_asr_i == 4) \
+ || (__ia64_asr_i == 8) || (__ia64_asr_i == 16) \
+ || (__ia64_asr_i == -1) || (__ia64_asr_i == -4) \
+ || (__ia64_asr_i == -8) || (__ia64_asr_i == -16))) \
+ ? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter) \
+ : ia64_atomic_sub(__ia64_asr_i, v); \
+})
+
+#define ATOMIC64_OP(op, c_op) \
+static __inline__ long \
+ia64_atomic64_##op (__s64 i, atomic64_t *v) \
+{ \
+ __s64 old, new; \
+ CMPXCHG_BUGCHECK_DECL \
+ \
+ do { \
+ CMPXCHG_BUGCHECK(v); \
+ old = atomic64_read(v); \
+ new = old c_op i; \
+ } while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic64_t)) != old); \
+ return new; \
}
-static __inline__ int
-ia64_atomic_sub (int i, atomic_t *v)
-{
- __s32 old, new;
- CMPXCHG_BUGCHECK_DECL
+ATOMIC64_OP(add, +)
+ATOMIC64_OP(sub, -)
- do {
- CMPXCHG_BUGCHECK(v);
- old = atomic_read(v);
- new = old - i;
- } while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic_t)) != old);
- return new;
-}
+#undef ATOMIC64_OP
-static __inline__ long
-ia64_atomic64_sub (__s64 i, atomic64_t *v)
-{
- __s64 old, new;
- CMPXCHG_BUGCHECK_DECL
+#define atomic64_add_return(i,v) \
+({ \
+ long __ia64_aar_i = (i); \
+ (__builtin_constant_p(i) \
+ && ( (__ia64_aar_i == 1) || (__ia64_aar_i == 4) \
+ || (__ia64_aar_i == 8) || (__ia64_aar_i == 16) \
+ || (__ia64_aar_i == -1) || (__ia64_aar_i == -4) \
+ || (__ia64_aar_i == -8) || (__ia64_aar_i == -16))) \
+ ? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter) \
+ : ia64_atomic64_add(__ia64_aar_i, v); \
+})
- do {
- CMPXCHG_BUGCHECK(v);
- old = atomic64_read(v);
- new = old - i;
- } while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic64_t)) != old);
- return new;
-}
+#define atomic64_sub_return(i,v) \
+({ \
+ long __ia64_asr_i = (i); \
+ (__builtin_constant_p(i) \
+ && ( (__ia64_asr_i == 1) || (__ia64_asr_i == 4) \
+ || (__ia64_asr_i == 8) || (__ia64_asr_i == 16) \
+ || (__ia64_asr_i == -1) || (__ia64_asr_i == -4) \
+ || (__ia64_asr_i == -8) || (__ia64_asr_i == -16))) \
+ ? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter) \
+ : ia64_atomic64_sub(__ia64_asr_i, v); \
+})
#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), old, new))
#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
@@ -123,30 +155,6 @@
#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
-#define atomic_add_return(i,v) \
-({ \
- int __ia64_aar_i = (i); \
- (__builtin_constant_p(i) \
- && ( (__ia64_aar_i == 1) || (__ia64_aar_i == 4) \
- || (__ia64_aar_i == 8) || (__ia64_aar_i == 16) \
- || (__ia64_aar_i == -1) || (__ia64_aar_i == -4) \
- || (__ia64_aar_i == -8) || (__ia64_aar_i == -16))) \
- ? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter) \
- : ia64_atomic_add(__ia64_aar_i, v); \
-})
-
-#define atomic64_add_return(i,v) \
-({ \
- long __ia64_aar_i = (i); \
- (__builtin_constant_p(i) \
- && ( (__ia64_aar_i == 1) || (__ia64_aar_i == 4) \
- || (__ia64_aar_i == 8) || (__ia64_aar_i == 16) \
- || (__ia64_aar_i == -1) || (__ia64_aar_i == -4) \
- || (__ia64_aar_i == -8) || (__ia64_aar_i == -16))) \
- ? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter) \
- : ia64_atomic64_add(__ia64_aar_i, v); \
-})
-
/*
* Atomically add I to V and return TRUE if the resulting value is
* negative.
@@ -163,30 +171,6 @@
return atomic64_add_return(i, v) < 0;
}
-#define atomic_sub_return(i,v) \
-({ \
- int __ia64_asr_i = (i); \
- (__builtin_constant_p(i) \
- && ( (__ia64_asr_i == 1) || (__ia64_asr_i == 4) \
- || (__ia64_asr_i == 8) || (__ia64_asr_i == 16) \
- || (__ia64_asr_i == -1) || (__ia64_asr_i == -4) \
- || (__ia64_asr_i == -8) || (__ia64_asr_i == -16))) \
- ? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter) \
- : ia64_atomic_sub(__ia64_asr_i, v); \
-})
-
-#define atomic64_sub_return(i,v) \
-({ \
- long __ia64_asr_i = (i); \
- (__builtin_constant_p(i) \
- && ( (__ia64_asr_i == 1) || (__ia64_asr_i == 4) \
- || (__ia64_asr_i == 8) || (__ia64_asr_i == 16) \
- || (__ia64_asr_i == -1) || (__ia64_asr_i == -4) \
- || (__ia64_asr_i == -8) || (__ia64_asr_i == -16))) \
- ? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter) \
- : ia64_atomic64_sub(__ia64_asr_i, v); \
-})
-
#define atomic_dec_return(v) atomic_sub_return(1, (v))
#define atomic_inc_return(v) atomic_add_return(1, (v))
#define atomic64_dec_return(v) atomic64_sub_return(1, (v))
@@ -199,13 +183,13 @@
#define atomic64_dec_and_test(v) (atomic64_sub_return(1, (v)) == 0)
#define atomic64_inc_and_test(v) (atomic64_add_return(1, (v)) == 0)
-#define atomic_add(i,v) atomic_add_return((i), (v))
-#define atomic_sub(i,v) atomic_sub_return((i), (v))
+#define atomic_add(i,v) (void)atomic_add_return((i), (v))
+#define atomic_sub(i,v) (void)atomic_sub_return((i), (v))
#define atomic_inc(v) atomic_add(1, (v))
#define atomic_dec(v) atomic_sub(1, (v))
-#define atomic64_add(i,v) atomic64_add_return((i), (v))
-#define atomic64_sub(i,v) atomic64_sub_return((i), (v))
+#define atomic64_add(i,v) (void)atomic64_add_return((i), (v))
+#define atomic64_sub(i,v) (void)atomic64_sub_return((i), (v))
#define atomic64_inc(v) atomic64_add(1, (v))
#define atomic64_dec(v) atomic64_sub(1, (v))
diff --git a/arch/m32r/include/asm/atomic.h b/arch/m32r/include/asm/atomic.h
index 8ad0ed4..3946b2c 100644
--- a/arch/m32r/include/asm/atomic.h
+++ b/arch/m32r/include/asm/atomic.h
@@ -39,85 +39,64 @@
*/
#define atomic_set(v,i) (((v)->counter) = (i))
-/**
- * atomic_add_return - add integer to atomic variable and return it
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v and return (@i + @v).
- */
-static __inline__ int atomic_add_return(int i, atomic_t *v)
-{
- unsigned long flags;
- int result;
-
- local_irq_save(flags);
- __asm__ __volatile__ (
- "# atomic_add_return \n\t"
- DCACHE_CLEAR("%0", "r4", "%1")
- M32R_LOCK" %0, @%1; \n\t"
- "add %0, %2; \n\t"
- M32R_UNLOCK" %0, @%1; \n\t"
- : "=&r" (result)
- : "r" (&v->counter), "r" (i)
- : "memory"
#ifdef CONFIG_CHIP_M32700_TS1
- , "r4"
-#endif /* CONFIG_CHIP_M32700_TS1 */
- );
- local_irq_restore(flags);
+#define __ATOMIC_CLOBBER , "r4"
+#else
+#define __ATOMIC_CLOBBER
+#endif
- return result;
+#define ATOMIC_OP(op) \
+static __inline__ void atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ int result; \
+ \
+ local_irq_save(flags); \
+ __asm__ __volatile__ ( \
+ "# atomic_" #op " \n\t" \
+ DCACHE_CLEAR("%0", "r4", "%1") \
+ M32R_LOCK" %0, @%1; \n\t" \
+ #op " %0, %2; \n\t" \
+ M32R_UNLOCK" %0, @%1; \n\t" \
+ : "=&r" (result) \
+ : "r" (&v->counter), "r" (i) \
+ : "memory" \
+ __ATOMIC_CLOBBER \
+ ); \
+ local_irq_restore(flags); \
+} \
+
+#define ATOMIC_OP_RETURN(op) \
+static __inline__ int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ int result; \
+ \
+ local_irq_save(flags); \
+ __asm__ __volatile__ ( \
+ "# atomic_" #op "_return \n\t" \
+ DCACHE_CLEAR("%0", "r4", "%1") \
+ M32R_LOCK" %0, @%1; \n\t" \
+ #op " %0, %2; \n\t" \
+ M32R_UNLOCK" %0, @%1; \n\t" \
+ : "=&r" (result) \
+ : "r" (&v->counter), "r" (i) \
+ : "memory" \
+ __ATOMIC_CLOBBER \
+ ); \
+ local_irq_restore(flags); \
+ \
+ return result; \
}
-/**
- * atomic_sub_return - subtract integer from atomic variable and return it
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v and return (@v - @i).
- */
-static __inline__ int atomic_sub_return(int i, atomic_t *v)
-{
- unsigned long flags;
- int result;
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
- local_irq_save(flags);
- __asm__ __volatile__ (
- "# atomic_sub_return \n\t"
- DCACHE_CLEAR("%0", "r4", "%1")
- M32R_LOCK" %0, @%1; \n\t"
- "sub %0, %2; \n\t"
- M32R_UNLOCK" %0, @%1; \n\t"
- : "=&r" (result)
- : "r" (&v->counter), "r" (i)
- : "memory"
-#ifdef CONFIG_CHIP_M32700_TS1
- , "r4"
-#endif /* CONFIG_CHIP_M32700_TS1 */
- );
- local_irq_restore(flags);
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
- return result;
-}
-
-/**
- * atomic_add - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v.
- */
-#define atomic_add(i,v) ((void) atomic_add_return((i), (v)))
-
-/**
- * atomic_sub - subtract the atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v.
- */
-#define atomic_sub(i,v) ((void) atomic_sub_return((i), (v)))
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
/**
* atomic_sub_and_test - subtract value from variable and test result
@@ -151,9 +130,7 @@
: "=&r" (result)
: "r" (&v->counter)
: "memory"
-#ifdef CONFIG_CHIP_M32700_TS1
- , "r4"
-#endif /* CONFIG_CHIP_M32700_TS1 */
+ __ATOMIC_CLOBBER
);
local_irq_restore(flags);
@@ -181,9 +158,7 @@
: "=&r" (result)
: "r" (&v->counter)
: "memory"
-#ifdef CONFIG_CHIP_M32700_TS1
- , "r4"
-#endif /* CONFIG_CHIP_M32700_TS1 */
+ __ATOMIC_CLOBBER
);
local_irq_restore(flags);
@@ -280,9 +255,7 @@
: "=&r" (tmp)
: "r" (addr), "r" (~mask)
: "memory"
-#ifdef CONFIG_CHIP_M32700_TS1
- , "r5"
-#endif /* CONFIG_CHIP_M32700_TS1 */
+ __ATOMIC_CLOBBER
);
local_irq_restore(flags);
}
@@ -302,9 +275,7 @@
: "=&r" (tmp)
: "r" (addr), "r" (mask)
: "memory"
-#ifdef CONFIG_CHIP_M32700_TS1
- , "r5"
-#endif /* CONFIG_CHIP_M32700_TS1 */
+ __ATOMIC_CLOBBER
);
local_irq_restore(flags);
}
diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h
index 5569521..663d4ba 100644
--- a/arch/m68k/include/asm/atomic.h
+++ b/arch/m68k/include/asm/atomic.h
@@ -30,16 +30,57 @@
#define ASM_DI "di"
#endif
-static inline void atomic_add(int i, atomic_t *v)
-{
- __asm__ __volatile__("addl %1,%0" : "+m" (*v) : ASM_DI (i));
+#define ATOMIC_OP(op, c_op, asm_op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ __asm__ __volatile__(#asm_op "l %1,%0" : "+m" (*v) : ASM_DI (i));\
+} \
+
+#ifdef CONFIG_RMW_INSNS
+
+#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ int t, tmp; \
+ \
+ __asm__ __volatile__( \
+ "1: movel %2,%1\n" \
+ " " #asm_op "l %3,%1\n" \
+ " casl %2,%1,%0\n" \
+ " jne 1b" \
+ : "+m" (*v), "=&d" (t), "=&d" (tmp) \
+ : "g" (i), "2" (atomic_read(v))); \
+ return t; \
}
-static inline void atomic_sub(int i, atomic_t *v)
-{
- __asm__ __volatile__("subl %1,%0" : "+m" (*v) : ASM_DI (i));
+#else
+
+#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
+static inline int atomic_##op##_return(int i, atomic_t * v) \
+{ \
+ unsigned long flags; \
+ int t; \
+ \
+ local_irq_save(flags); \
+ t = (v->counter c_op i); \
+ local_irq_restore(flags); \
+ \
+ return t; \
}
+#endif /* CONFIG_RMW_INSNS */
+
+#define ATOMIC_OPS(op, c_op, asm_op) \
+ ATOMIC_OP(op, c_op, asm_op) \
+ ATOMIC_OP_RETURN(op, c_op, asm_op)
+
+ATOMIC_OPS(add, +=, add)
+ATOMIC_OPS(sub, -=, sub)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
static inline void atomic_inc(atomic_t *v)
{
__asm__ __volatile__("addql #1,%0" : "+m" (*v));
@@ -76,67 +117,11 @@
#ifdef CONFIG_RMW_INSNS
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- int t, tmp;
-
- __asm__ __volatile__(
- "1: movel %2,%1\n"
- " addl %3,%1\n"
- " casl %2,%1,%0\n"
- " jne 1b"
- : "+m" (*v), "=&d" (t), "=&d" (tmp)
- : "g" (i), "2" (atomic_read(v)));
- return t;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- int t, tmp;
-
- __asm__ __volatile__(
- "1: movel %2,%1\n"
- " subl %3,%1\n"
- " casl %2,%1,%0\n"
- " jne 1b"
- : "+m" (*v), "=&d" (t), "=&d" (tmp)
- : "g" (i), "2" (atomic_read(v)));
- return t;
-}
-
#define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
#else /* !CONFIG_RMW_INSNS */
-static inline int atomic_add_return(int i, atomic_t * v)
-{
- unsigned long flags;
- int t;
-
- local_irq_save(flags);
- t = atomic_read(v);
- t += i;
- atomic_set(v, t);
- local_irq_restore(flags);
-
- return t;
-}
-
-static inline int atomic_sub_return(int i, atomic_t * v)
-{
- unsigned long flags;
- int t;
-
- local_irq_save(flags);
- t = atomic_read(v);
- t -= i;
- atomic_set(v, t);
- local_irq_restore(flags);
-
- return t;
-}
-
static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
{
unsigned long flags;
diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h
index 1fcdd34..4ef7a54 100644
--- a/arch/m68k/include/asm/unistd.h
+++ b/arch/m68k/include/asm/unistd.h
@@ -4,7 +4,7 @@
#include <uapi/asm/unistd.h>
-#define NR_syscalls 352
+#define NR_syscalls 354
#define __ARCH_WANT_OLD_READDIR
#define __ARCH_WANT_OLD_STAT
diff --git a/arch/m68k/include/uapi/asm/unistd.h b/arch/m68k/include/uapi/asm/unistd.h
index 9cd82fb..b419c6b 100644
--- a/arch/m68k/include/uapi/asm/unistd.h
+++ b/arch/m68k/include/uapi/asm/unistd.h
@@ -357,5 +357,7 @@
#define __NR_sched_setattr 349
#define __NR_sched_getattr 350
#define __NR_renameat2 351
+#define __NR_getrandom 352
+#define __NR_memfd_create 353
#endif /* _UAPI_ASM_M68K_UNISTD_H_ */
diff --git a/arch/m68k/kernel/syscalltable.S b/arch/m68k/kernel/syscalltable.S
index 501e102..05b46c2 100644
--- a/arch/m68k/kernel/syscalltable.S
+++ b/arch/m68k/kernel/syscalltable.S
@@ -372,4 +372,6 @@
.long sys_sched_setattr
.long sys_sched_getattr /* 350 */
.long sys_renameat2
+ .long sys_getrandom
+ .long sys_memfd_create
diff --git a/arch/metag/include/asm/atomic_lnkget.h b/arch/metag/include/asm/atomic_lnkget.h
index d2e60a1..948d868 100644
--- a/arch/metag/include/asm/atomic_lnkget.h
+++ b/arch/metag/include/asm/atomic_lnkget.h
@@ -27,85 +27,56 @@
return temp;
}
-static inline void atomic_add(int i, atomic_t *v)
-{
- int temp;
+#define ATOMIC_OP(op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ int temp; \
+ \
+ asm volatile ( \
+ "1: LNKGETD %0, [%1]\n" \
+ " " #op " %0, %0, %2\n" \
+ " LNKSETD [%1], %0\n" \
+ " DEFR %0, TXSTAT\n" \
+ " ANDT %0, %0, #HI(0x3f000000)\n" \
+ " CMPT %0, #HI(0x02000000)\n" \
+ " BNZ 1b\n" \
+ : "=&d" (temp) \
+ : "da" (&v->counter), "bd" (i) \
+ : "cc"); \
+} \
- asm volatile (
- "1: LNKGETD %0, [%1]\n"
- " ADD %0, %0, %2\n"
- " LNKSETD [%1], %0\n"
- " DEFR %0, TXSTAT\n"
- " ANDT %0, %0, #HI(0x3f000000)\n"
- " CMPT %0, #HI(0x02000000)\n"
- " BNZ 1b\n"
- : "=&d" (temp)
- : "da" (&v->counter), "bd" (i)
- : "cc");
+#define ATOMIC_OP_RETURN(op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ int result, temp; \
+ \
+ smp_mb(); \
+ \
+ asm volatile ( \
+ "1: LNKGETD %1, [%2]\n" \
+ " " #op " %1, %1, %3\n" \
+ " LNKSETD [%2], %1\n" \
+ " DEFR %0, TXSTAT\n" \
+ " ANDT %0, %0, #HI(0x3f000000)\n" \
+ " CMPT %0, #HI(0x02000000)\n" \
+ " BNZ 1b\n" \
+ : "=&d" (temp), "=&da" (result) \
+ : "da" (&v->counter), "bd" (i) \
+ : "cc"); \
+ \
+ smp_mb(); \
+ \
+ return result; \
}
-static inline void atomic_sub(int i, atomic_t *v)
-{
- int temp;
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
- asm volatile (
- "1: LNKGETD %0, [%1]\n"
- " SUB %0, %0, %2\n"
- " LNKSETD [%1], %0\n"
- " DEFR %0, TXSTAT\n"
- " ANDT %0, %0, #HI(0x3f000000)\n"
- " CMPT %0, #HI(0x02000000)\n"
- " BNZ 1b\n"
- : "=&d" (temp)
- : "da" (&v->counter), "bd" (i)
- : "cc");
-}
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- int result, temp;
-
- smp_mb();
-
- asm volatile (
- "1: LNKGETD %1, [%2]\n"
- " ADD %1, %1, %3\n"
- " LNKSETD [%2], %1\n"
- " DEFR %0, TXSTAT\n"
- " ANDT %0, %0, #HI(0x3f000000)\n"
- " CMPT %0, #HI(0x02000000)\n"
- " BNZ 1b\n"
- : "=&d" (temp), "=&da" (result)
- : "da" (&v->counter), "bd" (i)
- : "cc");
-
- smp_mb();
-
- return result;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- int result, temp;
-
- smp_mb();
-
- asm volatile (
- "1: LNKGETD %1, [%2]\n"
- " SUB %1, %1, %3\n"
- " LNKSETD [%2], %1\n"
- " DEFR %0, TXSTAT\n"
- " ANDT %0, %0, #HI(0x3f000000)\n"
- " CMPT %0, #HI(0x02000000)\n"
- " BNZ 1b\n"
- : "=&d" (temp), "=&da" (result)
- : "da" (&v->counter), "bd" (i)
- : "cc");
-
- smp_mb();
-
- return result;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
static inline void atomic_clear_mask(unsigned int mask, atomic_t *v)
{
diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h
index e578955..f5d5898 100644
--- a/arch/metag/include/asm/atomic_lock1.h
+++ b/arch/metag/include/asm/atomic_lock1.h
@@ -37,55 +37,41 @@
return i;
}
-static inline void atomic_add(int i, atomic_t *v)
-{
- unsigned long flags;
+#define ATOMIC_OP(op, c_op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ \
+ __global_lock1(flags); \
+ fence(); \
+ v->counter c_op i; \
+ __global_unlock1(flags); \
+} \
- __global_lock1(flags);
- fence();
- v->counter += i;
- __global_unlock1(flags);
+#define ATOMIC_OP_RETURN(op, c_op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ unsigned long result; \
+ unsigned long flags; \
+ \
+ __global_lock1(flags); \
+ result = v->counter; \
+ result c_op i; \
+ fence(); \
+ v->counter = result; \
+ __global_unlock1(flags); \
+ \
+ return result; \
}
-static inline void atomic_sub(int i, atomic_t *v)
-{
- unsigned long flags;
+#define ATOMIC_OPS(op, c_op) ATOMIC_OP(op, c_op) ATOMIC_OP_RETURN(op, c_op)
- __global_lock1(flags);
- fence();
- v->counter -= i;
- __global_unlock1(flags);
-}
+ATOMIC_OPS(add, +=)
+ATOMIC_OPS(sub, -=)
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- unsigned long result;
- unsigned long flags;
-
- __global_lock1(flags);
- result = v->counter;
- result += i;
- fence();
- v->counter = result;
- __global_unlock1(flags);
-
- return result;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- unsigned long result;
- unsigned long flags;
-
- __global_lock1(flags);
- result = v->counter;
- result -= i;
- fence();
- v->counter = result;
- __global_unlock1(flags);
-
- return result;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
static inline void atomic_clear_mask(unsigned int mask, atomic_t *v)
{
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index 37b2bef..476fe3b 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -40,195 +40,103 @@
*/
#define atomic_set(v, i) ((v)->counter = (i))
-/*
- * atomic_add - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v.
- */
-static __inline__ void atomic_add(int i, atomic_t * v)
-{
- if (kernel_uses_llsc && R10000_LLSC_WAR) {
- int temp;
+#define ATOMIC_OP(op, c_op, asm_op) \
+static __inline__ void atomic_##op(int i, atomic_t * v) \
+{ \
+ if (kernel_uses_llsc && R10000_LLSC_WAR) { \
+ int temp; \
+ \
+ __asm__ __volatile__( \
+ " .set arch=r4000 \n" \
+ "1: ll %0, %1 # atomic_" #op " \n" \
+ " " #asm_op " %0, %2 \n" \
+ " sc %0, %1 \n" \
+ " beqzl %0, 1b \n" \
+ " .set mips0 \n" \
+ : "=&r" (temp), "+m" (v->counter) \
+ : "Ir" (i)); \
+ } else if (kernel_uses_llsc) { \
+ int temp; \
+ \
+ do { \
+ __asm__ __volatile__( \
+ " .set arch=r4000 \n" \
+ " ll %0, %1 # atomic_" #op "\n" \
+ " " #asm_op " %0, %2 \n" \
+ " sc %0, %1 \n" \
+ " .set mips0 \n" \
+ : "=&r" (temp), "+m" (v->counter) \
+ : "Ir" (i)); \
+ } while (unlikely(!temp)); \
+ } else { \
+ unsigned long flags; \
+ \
+ raw_local_irq_save(flags); \
+ v->counter c_op i; \
+ raw_local_irq_restore(flags); \
+ } \
+} \
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- "1: ll %0, %1 # atomic_add \n"
- " addu %0, %2 \n"
- " sc %0, %1 \n"
- " beqzl %0, 1b \n"
- " .set mips0 \n"
- : "=&r" (temp), "+m" (v->counter)
- : "Ir" (i));
- } else if (kernel_uses_llsc) {
- int temp;
-
- do {
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- " ll %0, %1 # atomic_add \n"
- " addu %0, %2 \n"
- " sc %0, %1 \n"
- " .set mips0 \n"
- : "=&r" (temp), "+m" (v->counter)
- : "Ir" (i));
- } while (unlikely(!temp));
- } else {
- unsigned long flags;
-
- raw_local_irq_save(flags);
- v->counter += i;
- raw_local_irq_restore(flags);
- }
+#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
+static __inline__ int atomic_##op##_return(int i, atomic_t * v) \
+{ \
+ int result; \
+ \
+ smp_mb__before_llsc(); \
+ \
+ if (kernel_uses_llsc && R10000_LLSC_WAR) { \
+ int temp; \
+ \
+ __asm__ __volatile__( \
+ " .set arch=r4000 \n" \
+ "1: ll %1, %2 # atomic_" #op "_return \n" \
+ " " #asm_op " %0, %1, %3 \n" \
+ " sc %0, %2 \n" \
+ " beqzl %0, 1b \n" \
+ " addu %0, %1, %3 \n" \
+ " .set mips0 \n" \
+ : "=&r" (result), "=&r" (temp), "+m" (v->counter) \
+ : "Ir" (i)); \
+ } else if (kernel_uses_llsc) { \
+ int temp; \
+ \
+ do { \
+ __asm__ __volatile__( \
+ " .set arch=r4000 \n" \
+ " ll %1, %2 # atomic_" #op "_return \n" \
+ " " #asm_op " %0, %1, %3 \n" \
+ " sc %0, %2 \n" \
+ " .set mips0 \n" \
+ : "=&r" (result), "=&r" (temp), "+m" (v->counter) \
+ : "Ir" (i)); \
+ } while (unlikely(!result)); \
+ \
+ result = temp + i; \
+ } else { \
+ unsigned long flags; \
+ \
+ raw_local_irq_save(flags); \
+ result = v->counter; \
+ result c_op i; \
+ v->counter = result; \
+ raw_local_irq_restore(flags); \
+ } \
+ \
+ smp_llsc_mb(); \
+ \
+ return result; \
}
-/*
- * atomic_sub - subtract the atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v.
- */
-static __inline__ void atomic_sub(int i, atomic_t * v)
-{
- if (kernel_uses_llsc && R10000_LLSC_WAR) {
- int temp;
+#define ATOMIC_OPS(op, c_op, asm_op) \
+ ATOMIC_OP(op, c_op, asm_op) \
+ ATOMIC_OP_RETURN(op, c_op, asm_op)
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- "1: ll %0, %1 # atomic_sub \n"
- " subu %0, %2 \n"
- " sc %0, %1 \n"
- " beqzl %0, 1b \n"
- " .set mips0 \n"
- : "=&r" (temp), "+m" (v->counter)
- : "Ir" (i));
- } else if (kernel_uses_llsc) {
- int temp;
+ATOMIC_OPS(add, +=, addu)
+ATOMIC_OPS(sub, -=, subu)
- do {
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- " ll %0, %1 # atomic_sub \n"
- " subu %0, %2 \n"
- " sc %0, %1 \n"
- " .set mips0 \n"
- : "=&r" (temp), "+m" (v->counter)
- : "Ir" (i));
- } while (unlikely(!temp));
- } else {
- unsigned long flags;
-
- raw_local_irq_save(flags);
- v->counter -= i;
- raw_local_irq_restore(flags);
- }
-}
-
-/*
- * Same as above, but return the result value
- */
-static __inline__ int atomic_add_return(int i, atomic_t * v)
-{
- int result;
-
- smp_mb__before_llsc();
-
- if (kernel_uses_llsc && R10000_LLSC_WAR) {
- int temp;
-
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- "1: ll %1, %2 # atomic_add_return \n"
- " addu %0, %1, %3 \n"
- " sc %0, %2 \n"
- " beqzl %0, 1b \n"
- " addu %0, %1, %3 \n"
- " .set mips0 \n"
- : "=&r" (result), "=&r" (temp), "+m" (v->counter)
- : "Ir" (i));
- } else if (kernel_uses_llsc) {
- int temp;
-
- do {
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- " ll %1, %2 # atomic_add_return \n"
- " addu %0, %1, %3 \n"
- " sc %0, %2 \n"
- " .set mips0 \n"
- : "=&r" (result), "=&r" (temp), "+m" (v->counter)
- : "Ir" (i));
- } while (unlikely(!result));
-
- result = temp + i;
- } else {
- unsigned long flags;
-
- raw_local_irq_save(flags);
- result = v->counter;
- result += i;
- v->counter = result;
- raw_local_irq_restore(flags);
- }
-
- smp_llsc_mb();
-
- return result;
-}
-
-static __inline__ int atomic_sub_return(int i, atomic_t * v)
-{
- int result;
-
- smp_mb__before_llsc();
-
- if (kernel_uses_llsc && R10000_LLSC_WAR) {
- int temp;
-
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- "1: ll %1, %2 # atomic_sub_return \n"
- " subu %0, %1, %3 \n"
- " sc %0, %2 \n"
- " beqzl %0, 1b \n"
- " subu %0, %1, %3 \n"
- " .set mips0 \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
- : "memory");
-
- result = temp - i;
- } else if (kernel_uses_llsc) {
- int temp;
-
- do {
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- " ll %1, %2 # atomic_sub_return \n"
- " subu %0, %1, %3 \n"
- " sc %0, %2 \n"
- " .set mips0 \n"
- : "=&r" (result), "=&r" (temp), "+m" (v->counter)
- : "Ir" (i));
- } while (unlikely(!result));
-
- result = temp - i;
- } else {
- unsigned long flags;
-
- raw_local_irq_save(flags);
- result = v->counter;
- result -= i;
- v->counter = result;
- raw_local_irq_restore(flags);
- }
-
- smp_llsc_mb();
-
- return result;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
/*
* atomic_sub_if_positive - conditionally subtract integer from atomic variable
@@ -407,195 +315,104 @@
*/
#define atomic64_set(v, i) ((v)->counter = (i))
-/*
- * atomic64_add - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic64_t
- *
- * Atomically adds @i to @v.
- */
-static __inline__ void atomic64_add(long i, atomic64_t * v)
-{
- if (kernel_uses_llsc && R10000_LLSC_WAR) {
- long temp;
+#define ATOMIC64_OP(op, c_op, asm_op) \
+static __inline__ void atomic64_##op(long i, atomic64_t * v) \
+{ \
+ if (kernel_uses_llsc && R10000_LLSC_WAR) { \
+ long temp; \
+ \
+ __asm__ __volatile__( \
+ " .set arch=r4000 \n" \
+ "1: lld %0, %1 # atomic64_" #op " \n" \
+ " " #asm_op " %0, %2 \n" \
+ " scd %0, %1 \n" \
+ " beqzl %0, 1b \n" \
+ " .set mips0 \n" \
+ : "=&r" (temp), "+m" (v->counter) \
+ : "Ir" (i)); \
+ } else if (kernel_uses_llsc) { \
+ long temp; \
+ \
+ do { \
+ __asm__ __volatile__( \
+ " .set arch=r4000 \n" \
+ " lld %0, %1 # atomic64_" #op "\n" \
+ " " #asm_op " %0, %2 \n" \
+ " scd %0, %1 \n" \
+ " .set mips0 \n" \
+ : "=&r" (temp), "+m" (v->counter) \
+ : "Ir" (i)); \
+ } while (unlikely(!temp)); \
+ } else { \
+ unsigned long flags; \
+ \
+ raw_local_irq_save(flags); \
+ v->counter c_op i; \
+ raw_local_irq_restore(flags); \
+ } \
+} \
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- "1: lld %0, %1 # atomic64_add \n"
- " daddu %0, %2 \n"
- " scd %0, %1 \n"
- " beqzl %0, 1b \n"
- " .set mips0 \n"
- : "=&r" (temp), "+m" (v->counter)
- : "Ir" (i));
- } else if (kernel_uses_llsc) {
- long temp;
-
- do {
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- " lld %0, %1 # atomic64_add \n"
- " daddu %0, %2 \n"
- " scd %0, %1 \n"
- " .set mips0 \n"
- : "=&r" (temp), "+m" (v->counter)
- : "Ir" (i));
- } while (unlikely(!temp));
- } else {
- unsigned long flags;
-
- raw_local_irq_save(flags);
- v->counter += i;
- raw_local_irq_restore(flags);
- }
+#define ATOMIC64_OP_RETURN(op, c_op, asm_op) \
+static __inline__ long atomic64_##op##_return(long i, atomic64_t * v) \
+{ \
+ long result; \
+ \
+ smp_mb__before_llsc(); \
+ \
+ if (kernel_uses_llsc && R10000_LLSC_WAR) { \
+ long temp; \
+ \
+ __asm__ __volatile__( \
+ " .set arch=r4000 \n" \
+ "1: lld %1, %2 # atomic64_" #op "_return\n" \
+ " " #asm_op " %0, %1, %3 \n" \
+ " scd %0, %2 \n" \
+ " beqzl %0, 1b \n" \
+ " " #asm_op " %0, %1, %3 \n" \
+ " .set mips0 \n" \
+ : "=&r" (result), "=&r" (temp), "+m" (v->counter) \
+ : "Ir" (i)); \
+ } else if (kernel_uses_llsc) { \
+ long temp; \
+ \
+ do { \
+ __asm__ __volatile__( \
+ " .set arch=r4000 \n" \
+ " lld %1, %2 # atomic64_" #op "_return\n" \
+ " " #asm_op " %0, %1, %3 \n" \
+ " scd %0, %2 \n" \
+ " .set mips0 \n" \
+ : "=&r" (result), "=&r" (temp), "=m" (v->counter) \
+ : "Ir" (i), "m" (v->counter) \
+ : "memory"); \
+ } while (unlikely(!result)); \
+ \
+ result = temp + i; \
+ } else { \
+ unsigned long flags; \
+ \
+ raw_local_irq_save(flags); \
+ result = v->counter; \
+ result c_op i; \
+ v->counter = result; \
+ raw_local_irq_restore(flags); \
+ } \
+ \
+ smp_llsc_mb(); \
+ \
+ return result; \
}
-/*
- * atomic64_sub - subtract the atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic64_t
- *
- * Atomically subtracts @i from @v.
- */
-static __inline__ void atomic64_sub(long i, atomic64_t * v)
-{
- if (kernel_uses_llsc && R10000_LLSC_WAR) {
- long temp;
+#define ATOMIC64_OPS(op, c_op, asm_op) \
+ ATOMIC64_OP(op, c_op, asm_op) \
+ ATOMIC64_OP_RETURN(op, c_op, asm_op)
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- "1: lld %0, %1 # atomic64_sub \n"
- " dsubu %0, %2 \n"
- " scd %0, %1 \n"
- " beqzl %0, 1b \n"
- " .set mips0 \n"
- : "=&r" (temp), "+m" (v->counter)
- : "Ir" (i));
- } else if (kernel_uses_llsc) {
- long temp;
+ATOMIC64_OPS(add, +=, daddu)
+ATOMIC64_OPS(sub, -=, dsubu)
- do {
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- " lld %0, %1 # atomic64_sub \n"
- " dsubu %0, %2 \n"
- " scd %0, %1 \n"
- " .set mips0 \n"
- : "=&r" (temp), "+m" (v->counter)
- : "Ir" (i));
- } while (unlikely(!temp));
- } else {
- unsigned long flags;
-
- raw_local_irq_save(flags);
- v->counter -= i;
- raw_local_irq_restore(flags);
- }
-}
-
-/*
- * Same as above, but return the result value
- */
-static __inline__ long atomic64_add_return(long i, atomic64_t * v)
-{
- long result;
-
- smp_mb__before_llsc();
-
- if (kernel_uses_llsc && R10000_LLSC_WAR) {
- long temp;
-
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- "1: lld %1, %2 # atomic64_add_return \n"
- " daddu %0, %1, %3 \n"
- " scd %0, %2 \n"
- " beqzl %0, 1b \n"
- " daddu %0, %1, %3 \n"
- " .set mips0 \n"
- : "=&r" (result), "=&r" (temp), "+m" (v->counter)
- : "Ir" (i));
- } else if (kernel_uses_llsc) {
- long temp;
-
- do {
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- " lld %1, %2 # atomic64_add_return \n"
- " daddu %0, %1, %3 \n"
- " scd %0, %2 \n"
- " .set mips0 \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
- : "memory");
- } while (unlikely(!result));
-
- result = temp + i;
- } else {
- unsigned long flags;
-
- raw_local_irq_save(flags);
- result = v->counter;
- result += i;
- v->counter = result;
- raw_local_irq_restore(flags);
- }
-
- smp_llsc_mb();
-
- return result;
-}
-
-static __inline__ long atomic64_sub_return(long i, atomic64_t * v)
-{
- long result;
-
- smp_mb__before_llsc();
-
- if (kernel_uses_llsc && R10000_LLSC_WAR) {
- long temp;
-
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- "1: lld %1, %2 # atomic64_sub_return \n"
- " dsubu %0, %1, %3 \n"
- " scd %0, %2 \n"
- " beqzl %0, 1b \n"
- " dsubu %0, %1, %3 \n"
- " .set mips0 \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
- : "memory");
- } else if (kernel_uses_llsc) {
- long temp;
-
- do {
- __asm__ __volatile__(
- " .set arch=r4000 \n"
- " lld %1, %2 # atomic64_sub_return \n"
- " dsubu %0, %1, %3 \n"
- " scd %0, %2 \n"
- " .set mips0 \n"
- : "=&r" (result), "=&r" (temp), "=m" (v->counter)
- : "Ir" (i), "m" (v->counter)
- : "memory");
- } while (unlikely(!result));
-
- result = temp - i;
- } else {
- unsigned long flags;
-
- raw_local_irq_save(flags);
- result = v->counter;
- result -= i;
- v->counter = result;
- raw_local_irq_restore(flags);
- }
-
- smp_llsc_mb();
-
- return result;
-}
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
/*
* atomic64_sub_if_positive - conditionally subtract integer from atomic variable
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index 645b3c4..f7aac5b 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -770,7 +770,7 @@
long ret = 0;
user_exit();
- if (secure_computing(syscall) == -1)
+ if (secure_computing() == -1)
return -1;
if (test_thread_flag(TIF_SYSCALL_TRACE) &&
diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h
index cadeb1e..5be655e 100644
--- a/arch/mn10300/include/asm/atomic.h
+++ b/arch/mn10300/include/asm/atomic.h
@@ -33,7 +33,6 @@
* @v: pointer of type atomic_t
*
* Atomically reads the value of @v. Note that the guaranteed
- * useful range of an atomic_t is only 24 bits.
*/
#define atomic_read(v) (ACCESS_ONCE((v)->counter))
@@ -43,102 +42,62 @@
* @i: required value
*
* Atomically sets the value of @v to @i. Note that the guaranteed
- * useful range of an atomic_t is only 24 bits.
*/
#define atomic_set(v, i) (((v)->counter) = (i))
-/**
- * atomic_add_return - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v and returns the result
- * Note that the guaranteed useful range of an atomic_t is only 24 bits.
- */
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- int retval;
-#ifdef CONFIG_SMP
- int status;
-
- asm volatile(
- "1: mov %4,(_AAR,%3) \n"
- " mov (_ADR,%3),%1 \n"
- " add %5,%1 \n"
- " mov %1,(_ADR,%3) \n"
- " mov (_ADR,%3),%0 \n" /* flush */
- " mov (_ASR,%3),%0 \n"
- " or %0,%0 \n"
- " bne 1b \n"
- : "=&r"(status), "=&r"(retval), "=m"(v->counter)
- : "a"(ATOMIC_OPS_BASE_ADDR), "r"(&v->counter), "r"(i)
- : "memory", "cc");
-
-#else
- unsigned long flags;
-
- flags = arch_local_cli_save();
- retval = v->counter;
- retval += i;
- v->counter = retval;
- arch_local_irq_restore(flags);
-#endif
- return retval;
+#define ATOMIC_OP(op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ int retval, status; \
+ \
+ asm volatile( \
+ "1: mov %4,(_AAR,%3) \n" \
+ " mov (_ADR,%3),%1 \n" \
+ " " #op " %5,%1 \n" \
+ " mov %1,(_ADR,%3) \n" \
+ " mov (_ADR,%3),%0 \n" /* flush */ \
+ " mov (_ASR,%3),%0 \n" \
+ " or %0,%0 \n" \
+ " bne 1b \n" \
+ : "=&r"(status), "=&r"(retval), "=m"(v->counter) \
+ : "a"(ATOMIC_OPS_BASE_ADDR), "r"(&v->counter), "r"(i) \
+ : "memory", "cc"); \
}
-/**
- * atomic_sub_return - subtract integer from atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v and returns the result
- * Note that the guaranteed useful range of an atomic_t is only 24 bits.
- */
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- int retval;
-#ifdef CONFIG_SMP
- int status;
-
- asm volatile(
- "1: mov %4,(_AAR,%3) \n"
- " mov (_ADR,%3),%1 \n"
- " sub %5,%1 \n"
- " mov %1,(_ADR,%3) \n"
- " mov (_ADR,%3),%0 \n" /* flush */
- " mov (_ASR,%3),%0 \n"
- " or %0,%0 \n"
- " bne 1b \n"
- : "=&r"(status), "=&r"(retval), "=m"(v->counter)
- : "a"(ATOMIC_OPS_BASE_ADDR), "r"(&v->counter), "r"(i)
- : "memory", "cc");
-
-#else
- unsigned long flags;
- flags = arch_local_cli_save();
- retval = v->counter;
- retval -= i;
- v->counter = retval;
- arch_local_irq_restore(flags);
-#endif
- return retval;
+#define ATOMIC_OP_RETURN(op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ int retval, status; \
+ \
+ asm volatile( \
+ "1: mov %4,(_AAR,%3) \n" \
+ " mov (_ADR,%3),%1 \n" \
+ " " #op " %5,%1 \n" \
+ " mov %1,(_ADR,%3) \n" \
+ " mov (_ADR,%3),%0 \n" /* flush */ \
+ " mov (_ASR,%3),%0 \n" \
+ " or %0,%0 \n" \
+ " bne 1b \n" \
+ : "=&r"(status), "=&r"(retval), "=m"(v->counter) \
+ : "a"(ATOMIC_OPS_BASE_ADDR), "r"(&v->counter), "r"(i) \
+ : "memory", "cc"); \
+ return retval; \
}
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
+
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
static inline int atomic_add_negative(int i, atomic_t *v)
{
return atomic_add_return(i, v) < 0;
}
-static inline void atomic_add(int i, atomic_t *v)
-{
- atomic_add_return(i, v);
-}
-
-static inline void atomic_sub(int i, atomic_t *v)
-{
- atomic_sub_return(i, v);
-}
-
static inline void atomic_inc(atomic_t *v)
{
atomic_add_return(1, v);
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 0be2db2..219750b 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -55,24 +55,7 @@
* are atomic, so a reader never sees inconsistent values.
*/
-/* It's possible to reduce all atomic operations to either
- * __atomic_add_return, atomic_set and atomic_read (the latter
- * is there only for consistency).
- */
-
-static __inline__ int __atomic_add_return(int i, atomic_t *v)
-{
- int ret;
- unsigned long flags;
- _atomic_spin_lock_irqsave(v, flags);
-
- ret = (v->counter += i);
-
- _atomic_spin_unlock_irqrestore(v, flags);
- return ret;
-}
-
-static __inline__ void atomic_set(atomic_t *v, int i)
+static __inline__ void atomic_set(atomic_t *v, int i)
{
unsigned long flags;
_atomic_spin_lock_irqsave(v, flags);
@@ -115,16 +98,43 @@
return c;
}
+#define ATOMIC_OP(op, c_op) \
+static __inline__ void atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ \
+ _atomic_spin_lock_irqsave(v, flags); \
+ v->counter c_op i; \
+ _atomic_spin_unlock_irqrestore(v, flags); \
+} \
-#define atomic_add(i,v) ((void)(__atomic_add_return( (i),(v))))
-#define atomic_sub(i,v) ((void)(__atomic_add_return(-((int) (i)),(v))))
-#define atomic_inc(v) ((void)(__atomic_add_return( 1,(v))))
-#define atomic_dec(v) ((void)(__atomic_add_return( -1,(v))))
+#define ATOMIC_OP_RETURN(op, c_op) \
+static __inline__ int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ int ret; \
+ \
+ _atomic_spin_lock_irqsave(v, flags); \
+ ret = (v->counter c_op i); \
+ _atomic_spin_unlock_irqrestore(v, flags); \
+ \
+ return ret; \
+}
-#define atomic_add_return(i,v) (__atomic_add_return( (i),(v)))
-#define atomic_sub_return(i,v) (__atomic_add_return(-(i),(v)))
-#define atomic_inc_return(v) (__atomic_add_return( 1,(v)))
-#define atomic_dec_return(v) (__atomic_add_return( -1,(v)))
+#define ATOMIC_OPS(op, c_op) ATOMIC_OP(op, c_op) ATOMIC_OP_RETURN(op, c_op)
+
+ATOMIC_OPS(add, +=)
+ATOMIC_OPS(sub, -=)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
+#define atomic_inc(v) (atomic_add( 1,(v)))
+#define atomic_dec(v) (atomic_add( -1,(v)))
+
+#define atomic_inc_return(v) (atomic_add_return( 1,(v)))
+#define atomic_dec_return(v) (atomic_add_return( -1,(v)))
#define atomic_add_negative(a, v) (atomic_add_return((a), (v)) < 0)
@@ -148,19 +158,38 @@
#define ATOMIC64_INIT(i) { (i) }
-static __inline__ s64
-__atomic64_add_return(s64 i, atomic64_t *v)
-{
- s64 ret;
- unsigned long flags;
- _atomic_spin_lock_irqsave(v, flags);
+#define ATOMIC64_OP(op, c_op) \
+static __inline__ void atomic64_##op(s64 i, atomic64_t *v) \
+{ \
+ unsigned long flags; \
+ \
+ _atomic_spin_lock_irqsave(v, flags); \
+ v->counter c_op i; \
+ _atomic_spin_unlock_irqrestore(v, flags); \
+} \
- ret = (v->counter += i);
-
- _atomic_spin_unlock_irqrestore(v, flags);
- return ret;
+#define ATOMIC64_OP_RETURN(op, c_op) \
+static __inline__ s64 atomic64_##op##_return(s64 i, atomic64_t *v) \
+{ \
+ unsigned long flags; \
+ s64 ret; \
+ \
+ _atomic_spin_lock_irqsave(v, flags); \
+ ret = (v->counter c_op i); \
+ _atomic_spin_unlock_irqrestore(v, flags); \
+ \
+ return ret; \
}
+#define ATOMIC64_OPS(op, c_op) ATOMIC64_OP(op, c_op) ATOMIC64_OP_RETURN(op, c_op)
+
+ATOMIC64_OPS(add, +=)
+ATOMIC64_OPS(sub, -=)
+
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
+
static __inline__ void
atomic64_set(atomic64_t *v, s64 i)
{
@@ -178,15 +207,11 @@
return (*(volatile long *)&(v)->counter);
}
-#define atomic64_add(i,v) ((void)(__atomic64_add_return( ((s64)(i)),(v))))
-#define atomic64_sub(i,v) ((void)(__atomic64_add_return(-((s64)(i)),(v))))
-#define atomic64_inc(v) ((void)(__atomic64_add_return( 1,(v))))
-#define atomic64_dec(v) ((void)(__atomic64_add_return( -1,(v))))
+#define atomic64_inc(v) (atomic64_add( 1,(v)))
+#define atomic64_dec(v) (atomic64_add( -1,(v)))
-#define atomic64_add_return(i,v) (__atomic64_add_return( ((s64)(i)),(v)))
-#define atomic64_sub_return(i,v) (__atomic64_add_return(-((s64)(i)),(v)))
-#define atomic64_inc_return(v) (__atomic64_add_return( 1,(v)))
-#define atomic64_dec_return(v) (__atomic64_add_return( -1,(v)))
+#define atomic64_inc_return(v) (atomic64_add_return( 1,(v)))
+#define atomic64_dec_return(v) (atomic64_add_return( -1,(v)))
#define atomic64_add_negative(a, v) (atomic64_add_return((a), (v)) < 0)
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 28992d0..512d278 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -26,76 +26,53 @@
__asm__ __volatile__("stw%U0%X0 %1,%0" : "=m"(v->counter) : "r"(i));
}
-static __inline__ void atomic_add(int a, atomic_t *v)
-{
- int t;
+#define ATOMIC_OP(op, asm_op) \
+static __inline__ void atomic_##op(int a, atomic_t *v) \
+{ \
+ int t; \
+ \
+ __asm__ __volatile__( \
+"1: lwarx %0,0,%3 # atomic_" #op "\n" \
+ #asm_op " %0,%2,%0\n" \
+ PPC405_ERR77(0,%3) \
+" stwcx. %0,0,%3 \n" \
+" bne- 1b\n" \
+ : "=&r" (t), "+m" (v->counter) \
+ : "r" (a), "r" (&v->counter) \
+ : "cc"); \
+} \
- __asm__ __volatile__(
-"1: lwarx %0,0,%3 # atomic_add\n\
- add %0,%2,%0\n"
- PPC405_ERR77(0,%3)
-" stwcx. %0,0,%3 \n\
- bne- 1b"
- : "=&r" (t), "+m" (v->counter)
- : "r" (a), "r" (&v->counter)
- : "cc");
+#define ATOMIC_OP_RETURN(op, asm_op) \
+static __inline__ int atomic_##op##_return(int a, atomic_t *v) \
+{ \
+ int t; \
+ \
+ __asm__ __volatile__( \
+ PPC_ATOMIC_ENTRY_BARRIER \
+"1: lwarx %0,0,%2 # atomic_" #op "_return\n" \
+ #asm_op " %0,%1,%0\n" \
+ PPC405_ERR77(0,%2) \
+" stwcx. %0,0,%2 \n" \
+" bne- 1b\n" \
+ PPC_ATOMIC_EXIT_BARRIER \
+ : "=&r" (t) \
+ : "r" (a), "r" (&v->counter) \
+ : "cc", "memory"); \
+ \
+ return t; \
}
-static __inline__ int atomic_add_return(int a, atomic_t *v)
-{
- int t;
+#define ATOMIC_OPS(op, asm_op) ATOMIC_OP(op, asm_op) ATOMIC_OP_RETURN(op, asm_op)
- __asm__ __volatile__(
- PPC_ATOMIC_ENTRY_BARRIER
-"1: lwarx %0,0,%2 # atomic_add_return\n\
- add %0,%1,%0\n"
- PPC405_ERR77(0,%2)
-" stwcx. %0,0,%2 \n\
- bne- 1b"
- PPC_ATOMIC_EXIT_BARRIER
- : "=&r" (t)
- : "r" (a), "r" (&v->counter)
- : "cc", "memory");
+ATOMIC_OPS(add, add)
+ATOMIC_OPS(sub, subf)
- return t;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
#define atomic_add_negative(a, v) (atomic_add_return((a), (v)) < 0)
-static __inline__ void atomic_sub(int a, atomic_t *v)
-{
- int t;
-
- __asm__ __volatile__(
-"1: lwarx %0,0,%3 # atomic_sub\n\
- subf %0,%2,%0\n"
- PPC405_ERR77(0,%3)
-" stwcx. %0,0,%3 \n\
- bne- 1b"
- : "=&r" (t), "+m" (v->counter)
- : "r" (a), "r" (&v->counter)
- : "cc");
-}
-
-static __inline__ int atomic_sub_return(int a, atomic_t *v)
-{
- int t;
-
- __asm__ __volatile__(
- PPC_ATOMIC_ENTRY_BARRIER
-"1: lwarx %0,0,%2 # atomic_sub_return\n\
- subf %0,%1,%0\n"
- PPC405_ERR77(0,%2)
-" stwcx. %0,0,%2 \n\
- bne- 1b"
- PPC_ATOMIC_EXIT_BARRIER
- : "=&r" (t)
- : "r" (a), "r" (&v->counter)
- : "cc", "memory");
-
- return t;
-}
-
static __inline__ void atomic_inc(atomic_t *v)
{
int t;
@@ -289,72 +266,51 @@
__asm__ __volatile__("std%U0%X0 %1,%0" : "=m"(v->counter) : "r"(i));
}
-static __inline__ void atomic64_add(long a, atomic64_t *v)
-{
- long t;
-
- __asm__ __volatile__(
-"1: ldarx %0,0,%3 # atomic64_add\n\
- add %0,%2,%0\n\
- stdcx. %0,0,%3 \n\
- bne- 1b"
- : "=&r" (t), "+m" (v->counter)
- : "r" (a), "r" (&v->counter)
- : "cc");
+#define ATOMIC64_OP(op, asm_op) \
+static __inline__ void atomic64_##op(long a, atomic64_t *v) \
+{ \
+ long t; \
+ \
+ __asm__ __volatile__( \
+"1: ldarx %0,0,%3 # atomic64_" #op "\n" \
+ #asm_op " %0,%2,%0\n" \
+" stdcx. %0,0,%3 \n" \
+" bne- 1b\n" \
+ : "=&r" (t), "+m" (v->counter) \
+ : "r" (a), "r" (&v->counter) \
+ : "cc"); \
}
-static __inline__ long atomic64_add_return(long a, atomic64_t *v)
-{
- long t;
-
- __asm__ __volatile__(
- PPC_ATOMIC_ENTRY_BARRIER
-"1: ldarx %0,0,%2 # atomic64_add_return\n\
- add %0,%1,%0\n\
- stdcx. %0,0,%2 \n\
- bne- 1b"
- PPC_ATOMIC_EXIT_BARRIER
- : "=&r" (t)
- : "r" (a), "r" (&v->counter)
- : "cc", "memory");
-
- return t;
+#define ATOMIC64_OP_RETURN(op, asm_op) \
+static __inline__ long atomic64_##op##_return(long a, atomic64_t *v) \
+{ \
+ long t; \
+ \
+ __asm__ __volatile__( \
+ PPC_ATOMIC_ENTRY_BARRIER \
+"1: ldarx %0,0,%2 # atomic64_" #op "_return\n" \
+ #asm_op " %0,%1,%0\n" \
+" stdcx. %0,0,%2 \n" \
+" bne- 1b\n" \
+ PPC_ATOMIC_EXIT_BARRIER \
+ : "=&r" (t) \
+ : "r" (a), "r" (&v->counter) \
+ : "cc", "memory"); \
+ \
+ return t; \
}
+#define ATOMIC64_OPS(op, asm_op) ATOMIC64_OP(op, asm_op) ATOMIC64_OP_RETURN(op, asm_op)
+
+ATOMIC64_OPS(add, add)
+ATOMIC64_OPS(sub, subf)
+
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
+
#define atomic64_add_negative(a, v) (atomic64_add_return((a), (v)) < 0)
-static __inline__ void atomic64_sub(long a, atomic64_t *v)
-{
- long t;
-
- __asm__ __volatile__(
-"1: ldarx %0,0,%3 # atomic64_sub\n\
- subf %0,%2,%0\n\
- stdcx. %0,0,%3 \n\
- bne- 1b"
- : "=&r" (t), "+m" (v->counter)
- : "r" (a), "r" (&v->counter)
- : "cc");
-}
-
-static __inline__ long atomic64_sub_return(long a, atomic64_t *v)
-{
- long t;
-
- __asm__ __volatile__(
- PPC_ATOMIC_ENTRY_BARRIER
-"1: ldarx %0,0,%2 # atomic64_sub_return\n\
- subf %0,%1,%0\n\
- stdcx. %0,0,%2 \n\
- bne- 1b"
- PPC_ATOMIC_EXIT_BARRIER
- : "=&r" (t)
- : "r" (a), "r" (&v->counter)
- : "cc", "memory");
-
- return t;
-}
-
static __inline__ void atomic64_inc(atomic64_t *v)
{
long t;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 72c20bb..79294c4 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -62,10 +62,10 @@
}
kvm->arch.hpt_cma_alloc = 0;
- page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
+ page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT));
if (page) {
hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
- memset((void *)hpt, 0, (1 << order));
+ memset((void *)hpt, 0, (1ul << order));
kvm->arch.hpt_cma_alloc = 1;
}
diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h
index 2fcccc0..c81661e 100644
--- a/arch/s390/include/asm/ipl.h
+++ b/arch/s390/include/asm/ipl.h
@@ -17,12 +17,12 @@
#define IPL_PARM_BLK_FCP_LEN (sizeof(struct ipl_list_hdr) + \
sizeof(struct ipl_block_fcp))
-#define IPL_PARM_BLK0_FCP_LEN (sizeof(struct ipl_block_fcp) + 8)
+#define IPL_PARM_BLK0_FCP_LEN (sizeof(struct ipl_block_fcp) + 16)
#define IPL_PARM_BLK_CCW_LEN (sizeof(struct ipl_list_hdr) + \
sizeof(struct ipl_block_ccw))
-#define IPL_PARM_BLK0_CCW_LEN (sizeof(struct ipl_block_ccw) + 8)
+#define IPL_PARM_BLK0_CCW_LEN (sizeof(struct ipl_block_ccw) + 16)
#define IPL_MAX_SUPPORTED_VERSION (0)
@@ -38,10 +38,11 @@
u8 pbt;
u8 flags;
u16 reserved2;
+ u8 loadparm[8];
} __attribute__((packed));
struct ipl_block_fcp {
- u8 reserved1[313-1];
+ u8 reserved1[305-1];
u8 opt;
u8 reserved2[3];
u16 reserved3;
@@ -62,7 +63,6 @@
offsetof(struct ipl_block_fcp, scp_data)))
struct ipl_block_ccw {
- u8 load_parm[8];
u8 reserved1[84];
u8 reserved2[2];
u16 devno;
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index b76317c..5efb2fe 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1127,7 +1127,7 @@
unsigned long addr, pte_t *ptep)
{
pgste_t pgste;
- pte_t pte;
+ pte_t pte, oldpte;
int young;
if (mm_has_pgste(vma->vm_mm)) {
@@ -1135,12 +1135,13 @@
pgste = pgste_ipte_notify(vma->vm_mm, ptep, pgste);
}
- pte = *ptep;
+ oldpte = pte = *ptep;
ptep_flush_direct(vma->vm_mm, addr, ptep);
young = pte_young(pte);
pte = pte_mkold(pte);
if (mm_has_pgste(vma->vm_mm)) {
+ pgste = pgste_update_all(&oldpte, pgste, vma->vm_mm);
pgste = pgste_set_pte(ptep, pgste, pte);
pgste_set_unlock(ptep, pgste);
} else
@@ -1330,6 +1331,7 @@
ptep_flush_direct(vma->vm_mm, address, ptep);
if (mm_has_pgste(vma->vm_mm)) {
+ pgste_set_key(ptep, pgste, entry, vma->vm_mm);
pgste = pgste_set_pte(ptep, pgste, entry);
pgste_set_unlock(ptep, pgste);
} else
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 22aac58..39badb9 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -455,22 +455,6 @@
DEFINE_IPL_ATTR_RO(ipl_fcp, br_lba, "%lld\n", (unsigned long long)
IPL_PARMBLOCK_START->ipl_info.fcp.br_lba);
-static struct attribute *ipl_fcp_attrs[] = {
- &sys_ipl_type_attr.attr,
- &sys_ipl_device_attr.attr,
- &sys_ipl_fcp_wwpn_attr.attr,
- &sys_ipl_fcp_lun_attr.attr,
- &sys_ipl_fcp_bootprog_attr.attr,
- &sys_ipl_fcp_br_lba_attr.attr,
- NULL,
-};
-
-static struct attribute_group ipl_fcp_attr_group = {
- .attrs = ipl_fcp_attrs,
-};
-
-/* CCW ipl device attributes */
-
static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
@@ -487,6 +471,23 @@
static struct kobj_attribute sys_ipl_ccw_loadparm_attr =
__ATTR(loadparm, 0444, ipl_ccw_loadparm_show, NULL);
+static struct attribute *ipl_fcp_attrs[] = {
+ &sys_ipl_type_attr.attr,
+ &sys_ipl_device_attr.attr,
+ &sys_ipl_fcp_wwpn_attr.attr,
+ &sys_ipl_fcp_lun_attr.attr,
+ &sys_ipl_fcp_bootprog_attr.attr,
+ &sys_ipl_fcp_br_lba_attr.attr,
+ &sys_ipl_ccw_loadparm_attr.attr,
+ NULL,
+};
+
+static struct attribute_group ipl_fcp_attr_group = {
+ .attrs = ipl_fcp_attrs,
+};
+
+/* CCW ipl device attributes */
+
static struct attribute *ipl_ccw_attrs_vm[] = {
&sys_ipl_type_attr.attr,
&sys_ipl_device_attr.attr,
@@ -765,28 +766,10 @@
DEFINE_IPL_ATTR_RW(reipl_fcp, device, "0.0.%04llx\n", "0.0.%llx\n",
reipl_block_fcp->ipl_info.fcp.devno);
-static struct attribute *reipl_fcp_attrs[] = {
- &sys_reipl_fcp_device_attr.attr,
- &sys_reipl_fcp_wwpn_attr.attr,
- &sys_reipl_fcp_lun_attr.attr,
- &sys_reipl_fcp_bootprog_attr.attr,
- &sys_reipl_fcp_br_lba_attr.attr,
- NULL,
-};
-
-static struct attribute_group reipl_fcp_attr_group = {
- .attrs = reipl_fcp_attrs,
-};
-
-/* CCW reipl device attributes */
-
-DEFINE_IPL_ATTR_RW(reipl_ccw, device, "0.0.%04llx\n", "0.0.%llx\n",
- reipl_block_ccw->ipl_info.ccw.devno);
-
static void reipl_get_ascii_loadparm(char *loadparm,
struct ipl_parameter_block *ibp)
{
- memcpy(loadparm, ibp->ipl_info.ccw.load_parm, LOADPARM_LEN);
+ memcpy(loadparm, ibp->hdr.loadparm, LOADPARM_LEN);
EBCASC(loadparm, LOADPARM_LEN);
loadparm[LOADPARM_LEN] = 0;
strim(loadparm);
@@ -821,13 +804,50 @@
return -EINVAL;
}
/* initialize loadparm with blanks */
- memset(ipb->ipl_info.ccw.load_parm, ' ', LOADPARM_LEN);
+ memset(ipb->hdr.loadparm, ' ', LOADPARM_LEN);
/* copy and convert to ebcdic */
- memcpy(ipb->ipl_info.ccw.load_parm, buf, lp_len);
- ASCEBC(ipb->ipl_info.ccw.load_parm, LOADPARM_LEN);
+ memcpy(ipb->hdr.loadparm, buf, lp_len);
+ ASCEBC(ipb->hdr.loadparm, LOADPARM_LEN);
return len;
}
+/* FCP wrapper */
+static ssize_t reipl_fcp_loadparm_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return reipl_generic_loadparm_show(reipl_block_fcp, page);
+}
+
+static ssize_t reipl_fcp_loadparm_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ return reipl_generic_loadparm_store(reipl_block_fcp, buf, len);
+}
+
+static struct kobj_attribute sys_reipl_fcp_loadparm_attr =
+ __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_fcp_loadparm_show,
+ reipl_fcp_loadparm_store);
+
+static struct attribute *reipl_fcp_attrs[] = {
+ &sys_reipl_fcp_device_attr.attr,
+ &sys_reipl_fcp_wwpn_attr.attr,
+ &sys_reipl_fcp_lun_attr.attr,
+ &sys_reipl_fcp_bootprog_attr.attr,
+ &sys_reipl_fcp_br_lba_attr.attr,
+ &sys_reipl_fcp_loadparm_attr.attr,
+ NULL,
+};
+
+static struct attribute_group reipl_fcp_attr_group = {
+ .attrs = reipl_fcp_attrs,
+};
+
+/* CCW reipl device attributes */
+
+DEFINE_IPL_ATTR_RW(reipl_ccw, device, "0.0.%04llx\n", "0.0.%llx\n",
+ reipl_block_ccw->ipl_info.ccw.devno);
+
/* NSS wrapper */
static ssize_t reipl_nss_loadparm_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
@@ -1125,11 +1145,10 @@
/* LOADPARM */
/* check if read scp info worked and set loadparm */
if (sclp_ipl_info.is_valid)
- memcpy(ipb->ipl_info.ccw.load_parm,
- &sclp_ipl_info.loadparm, LOADPARM_LEN);
+ memcpy(ipb->hdr.loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN);
else
/* read scp info failed: set empty loadparm (EBCDIC blanks) */
- memset(ipb->ipl_info.ccw.load_parm, 0x40, LOADPARM_LEN);
+ memset(ipb->hdr.loadparm, 0x40, LOADPARM_LEN);
ipb->hdr.flags = DIAG308_FLAGS_LP_VALID;
/* VM PARM */
@@ -1251,9 +1270,16 @@
return rc;
}
- if (ipl_info.type == IPL_TYPE_FCP)
+ if (ipl_info.type == IPL_TYPE_FCP) {
memcpy(reipl_block_fcp, IPL_PARMBLOCK_START, PAGE_SIZE);
- else {
+ /*
+ * Fix loadparm: There are systems where the (SCSI) LOADPARM
+ * is invalid in the SCSI IPL parameter block, so take it
+ * always from sclp_ipl_info.
+ */
+ memcpy(reipl_block_fcp->hdr.loadparm, sclp_ipl_info.loadparm,
+ LOADPARM_LEN);
+ } else {
reipl_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN;
reipl_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION;
reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN;
@@ -1864,7 +1890,23 @@
static int __init s390_ipl_init(void)
{
+ char str[8] = {0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40};
+
sclp_get_ipl_info(&sclp_ipl_info);
+ /*
+ * Fix loadparm: There are systems where the (SCSI) LOADPARM
+ * returned by read SCP info is invalid (contains EBCDIC blanks)
+ * when the system has been booted via diag308. In that case we use
+ * the value from diag308, if available.
+ *
+ * There are also systems where diag308 store does not work in
+ * case the system is booted from HMC. Fortunately in this case
+ * READ SCP info provides the correct value.
+ */
+ if (memcmp(sclp_ipl_info.loadparm, str, sizeof(str)) == 0 &&
+ diag308_set_works)
+ memcpy(sclp_ipl_info.loadparm, ipl_block.hdr.loadparm,
+ LOADPARM_LEN);
shutdown_actions_init();
shutdown_triggers_init();
return 0;
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 5dc7ad9..bebacad 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -803,7 +803,7 @@
long ret = 0;
/* Do the secure computing check first. */
- if (secure_computing(regs->gprs[2])) {
+ if (secure_computing()) {
/* seccomp failures shouldn't expose any additional code. */
ret = -1;
goto out;
diff --git a/arch/s390/kernel/vdso32/clock_gettime.S b/arch/s390/kernel/vdso32/clock_gettime.S
index 65fc397..7cf18f8 100644
--- a/arch/s390/kernel/vdso32/clock_gettime.S
+++ b/arch/s390/kernel/vdso32/clock_gettime.S
@@ -22,13 +22,11 @@
basr %r5,0
0: al %r5,21f-0b(%r5) /* get &_vdso_data */
chi %r2,__CLOCK_REALTIME
- je 10f
+ je 11f
chi %r2,__CLOCK_MONOTONIC
jne 19f
/* CLOCK_MONOTONIC */
- ltr %r3,%r3
- jz 9f /* tp == NULL */
1: l %r4,__VDSO_UPD_COUNT+4(%r5) /* load update counter */
tml %r4,0x0001 /* pending update ? loop */
jnz 1b
@@ -67,12 +65,10 @@
j 6b
8: st %r2,0(%r3) /* store tp->tv_sec */
st %r1,4(%r3) /* store tp->tv_nsec */
-9: lhi %r2,0
+ lhi %r2,0
br %r14
/* CLOCK_REALTIME */
-10: ltr %r3,%r3 /* tp == NULL */
- jz 18f
11: l %r4,__VDSO_UPD_COUNT+4(%r5) /* load update counter */
tml %r4,0x0001 /* pending update ? loop */
jnz 11b
@@ -111,7 +107,7 @@
j 15b
17: st %r2,0(%r3) /* store tp->tv_sec */
st %r1,4(%r3) /* store tp->tv_nsec */
-18: lhi %r2,0
+ lhi %r2,0
br %r14
/* Fallback to system call */
diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S
index 91940ed..3f34e09 100644
--- a/arch/s390/kernel/vdso64/clock_gettime.S
+++ b/arch/s390/kernel/vdso64/clock_gettime.S
@@ -21,7 +21,7 @@
.cfi_startproc
larl %r5,_vdso_data
cghi %r2,__CLOCK_REALTIME
- je 4f
+ je 5f
cghi %r2,__CLOCK_THREAD_CPUTIME_ID
je 9f
cghi %r2,-2 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */
@@ -30,8 +30,6 @@
jne 12f
/* CLOCK_MONOTONIC */
- ltgr %r3,%r3
- jz 3f /* tp == NULL */
0: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
tmll %r4,0x0001 /* pending update ? loop */
jnz 0b
@@ -53,12 +51,10 @@
j 1b
2: stg %r0,0(%r3) /* store tp->tv_sec */
stg %r1,8(%r3) /* store tp->tv_nsec */
-3: lghi %r2,0
+ lghi %r2,0
br %r14
/* CLOCK_REALTIME */
-4: ltr %r3,%r3 /* tp == NULL */
- jz 8f
5: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
tmll %r4,0x0001 /* pending update ? loop */
jnz 5b
@@ -80,7 +76,7 @@
j 6b
7: stg %r0,0(%r3) /* store tp->tv_sec */
stg %r1,8(%r3) /* store tp->tv_nsec */
-8: lghi %r2,0
+ lghi %r2,0
br %r14
/* CLOCK_THREAD_CPUTIME_ID for this thread */
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ce81eb2..81b0e11 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1317,19 +1317,6 @@
return -EINVAL;
}
- switch (kvm_run->exit_reason) {
- case KVM_EXIT_S390_SIEIC:
- case KVM_EXIT_UNKNOWN:
- case KVM_EXIT_INTR:
- case KVM_EXIT_S390_RESET:
- case KVM_EXIT_S390_UCONTROL:
- case KVM_EXIT_S390_TSCH:
- case KVM_EXIT_DEBUG:
- break;
- default:
- BUG();
- }
-
vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask;
vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr;
if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) {
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 19daa53..5404a62 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -986,11 +986,21 @@
pte_t *ptep;
down_read(&mm->mmap_sem);
+retry:
ptep = get_locked_pte(current->mm, addr, &ptl);
if (unlikely(!ptep)) {
up_read(&mm->mmap_sem);
return -EFAULT;
}
+ if (!(pte_val(*ptep) & _PAGE_INVALID) &&
+ (pte_val(*ptep) & _PAGE_PROTECT)) {
+ pte_unmap_unlock(*ptep, ptl);
+ if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
+ up_read(&mm->mmap_sem);
+ return -EFAULT;
+ }
+ goto retry;
+ }
new = old = pgste_get_lock(ptep);
pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
diff --git a/arch/sh/include/asm/atomic-grb.h b/arch/sh/include/asm/atomic-grb.h
index a273c88..97a5fda 100644
--- a/arch/sh/include/asm/atomic-grb.h
+++ b/arch/sh/include/asm/atomic-grb.h
@@ -1,85 +1,56 @@
#ifndef __ASM_SH_ATOMIC_GRB_H
#define __ASM_SH_ATOMIC_GRB_H
-static inline void atomic_add(int i, atomic_t *v)
-{
- int tmp;
+#define ATOMIC_OP(op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ int tmp; \
+ \
+ __asm__ __volatile__ ( \
+ " .align 2 \n\t" \
+ " mova 1f, r0 \n\t" /* r0 = end point */ \
+ " mov r15, r1 \n\t" /* r1 = saved sp */ \
+ " mov #-6, r15 \n\t" /* LOGIN: r15 = size */ \
+ " mov.l @%1, %0 \n\t" /* load old value */ \
+ " " #op " %2, %0 \n\t" /* $op */ \
+ " mov.l %0, @%1 \n\t" /* store new value */ \
+ "1: mov r1, r15 \n\t" /* LOGOUT */ \
+ : "=&r" (tmp), \
+ "+r" (v) \
+ : "r" (i) \
+ : "memory" , "r0", "r1"); \
+} \
- __asm__ __volatile__ (
- " .align 2 \n\t"
- " mova 1f, r0 \n\t" /* r0 = end point */
- " mov r15, r1 \n\t" /* r1 = saved sp */
- " mov #-6, r15 \n\t" /* LOGIN: r15 = size */
- " mov.l @%1, %0 \n\t" /* load old value */
- " add %2, %0 \n\t" /* add */
- " mov.l %0, @%1 \n\t" /* store new value */
- "1: mov r1, r15 \n\t" /* LOGOUT */
- : "=&r" (tmp),
- "+r" (v)
- : "r" (i)
- : "memory" , "r0", "r1");
+#define ATOMIC_OP_RETURN(op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ int tmp; \
+ \
+ __asm__ __volatile__ ( \
+ " .align 2 \n\t" \
+ " mova 1f, r0 \n\t" /* r0 = end point */ \
+ " mov r15, r1 \n\t" /* r1 = saved sp */ \
+ " mov #-6, r15 \n\t" /* LOGIN: r15 = size */ \
+ " mov.l @%1, %0 \n\t" /* load old value */ \
+ " " #op " %2, %0 \n\t" /* $op */ \
+ " mov.l %0, @%1 \n\t" /* store new value */ \
+ "1: mov r1, r15 \n\t" /* LOGOUT */ \
+ : "=&r" (tmp), \
+ "+r" (v) \
+ : "r" (i) \
+ : "memory" , "r0", "r1"); \
+ \
+ return tmp; \
}
-static inline void atomic_sub(int i, atomic_t *v)
-{
- int tmp;
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
- __asm__ __volatile__ (
- " .align 2 \n\t"
- " mova 1f, r0 \n\t" /* r0 = end point */
- " mov r15, r1 \n\t" /* r1 = saved sp */
- " mov #-6, r15 \n\t" /* LOGIN: r15 = size */
- " mov.l @%1, %0 \n\t" /* load old value */
- " sub %2, %0 \n\t" /* sub */
- " mov.l %0, @%1 \n\t" /* store new value */
- "1: mov r1, r15 \n\t" /* LOGOUT */
- : "=&r" (tmp),
- "+r" (v)
- : "r" (i)
- : "memory" , "r0", "r1");
-}
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- int tmp;
-
- __asm__ __volatile__ (
- " .align 2 \n\t"
- " mova 1f, r0 \n\t" /* r0 = end point */
- " mov r15, r1 \n\t" /* r1 = saved sp */
- " mov #-6, r15 \n\t" /* LOGIN: r15 = size */
- " mov.l @%1, %0 \n\t" /* load old value */
- " add %2, %0 \n\t" /* add */
- " mov.l %0, @%1 \n\t" /* store new value */
- "1: mov r1, r15 \n\t" /* LOGOUT */
- : "=&r" (tmp),
- "+r" (v)
- : "r" (i)
- : "memory" , "r0", "r1");
-
- return tmp;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- int tmp;
-
- __asm__ __volatile__ (
- " .align 2 \n\t"
- " mova 1f, r0 \n\t" /* r0 = end point */
- " mov r15, r1 \n\t" /* r1 = saved sp */
- " mov #-6, r15 \n\t" /* LOGIN: r15 = size */
- " mov.l @%1, %0 \n\t" /* load old value */
- " sub %2, %0 \n\t" /* sub */
- " mov.l %0, @%1 \n\t" /* store new value */
- "1: mov r1, r15 \n\t" /* LOGOUT */
- : "=&r" (tmp),
- "+r" (v)
- : "r" (i)
- : "memory", "r0", "r1");
-
- return tmp;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
static inline void atomic_clear_mask(unsigned int mask, atomic_t *v)
{
diff --git a/arch/sh/include/asm/atomic-irq.h b/arch/sh/include/asm/atomic-irq.h
index 9f7c566..61d1075 100644
--- a/arch/sh/include/asm/atomic-irq.h
+++ b/arch/sh/include/asm/atomic-irq.h
@@ -8,49 +8,39 @@
* forward to code at the end of this object's .text section, then
* branch back to restart the operation.
*/
-static inline void atomic_add(int i, atomic_t *v)
-{
- unsigned long flags;
- raw_local_irq_save(flags);
- v->counter += i;
- raw_local_irq_restore(flags);
+#define ATOMIC_OP(op, c_op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ \
+ raw_local_irq_save(flags); \
+ v->counter c_op i; \
+ raw_local_irq_restore(flags); \
}
-static inline void atomic_sub(int i, atomic_t *v)
-{
- unsigned long flags;
-
- raw_local_irq_save(flags);
- v->counter -= i;
- raw_local_irq_restore(flags);
+#define ATOMIC_OP_RETURN(op, c_op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ unsigned long temp, flags; \
+ \
+ raw_local_irq_save(flags); \
+ temp = v->counter; \
+ temp c_op i; \
+ v->counter = temp; \
+ raw_local_irq_restore(flags); \
+ \
+ return temp; \
}
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- unsigned long temp, flags;
+#define ATOMIC_OPS(op, c_op) ATOMIC_OP(op, c_op) ATOMIC_OP_RETURN(op, c_op)
- raw_local_irq_save(flags);
- temp = v->counter;
- temp += i;
- v->counter = temp;
- raw_local_irq_restore(flags);
+ATOMIC_OPS(add, +=)
+ATOMIC_OPS(sub, -=)
- return temp;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- unsigned long temp, flags;
-
- raw_local_irq_save(flags);
- temp = v->counter;
- temp -= i;
- v->counter = temp;
- raw_local_irq_restore(flags);
-
- return temp;
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
static inline void atomic_clear_mask(unsigned int mask, atomic_t *v)
{
diff --git a/arch/sh/include/asm/atomic-llsc.h b/arch/sh/include/asm/atomic-llsc.h
index 4b00b78..8575dcc 100644
--- a/arch/sh/include/asm/atomic-llsc.h
+++ b/arch/sh/include/asm/atomic-llsc.h
@@ -2,39 +2,6 @@
#define __ASM_SH_ATOMIC_LLSC_H
/*
- * To get proper branch prediction for the main line, we must branch
- * forward to code at the end of this object's .text section, then
- * branch back to restart the operation.
- */
-static inline void atomic_add(int i, atomic_t *v)
-{
- unsigned long tmp;
-
- __asm__ __volatile__ (
-"1: movli.l @%2, %0 ! atomic_add \n"
-" add %1, %0 \n"
-" movco.l %0, @%2 \n"
-" bf 1b \n"
- : "=&z" (tmp)
- : "r" (i), "r" (&v->counter)
- : "t");
-}
-
-static inline void atomic_sub(int i, atomic_t *v)
-{
- unsigned long tmp;
-
- __asm__ __volatile__ (
-"1: movli.l @%2, %0 ! atomic_sub \n"
-" sub %1, %0 \n"
-" movco.l %0, @%2 \n"
-" bf 1b \n"
- : "=&z" (tmp)
- : "r" (i), "r" (&v->counter)
- : "t");
-}
-
-/*
* SH-4A note:
*
* We basically get atomic_xxx_return() for free compared with
@@ -42,40 +9,54 @@
* encoding, so the retval is automatically set without having to
* do any special work.
*/
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- unsigned long temp;
+/*
+ * To get proper branch prediction for the main line, we must branch
+ * forward to code at the end of this object's .text section, then
+ * branch back to restart the operation.
+ */
- __asm__ __volatile__ (
-"1: movli.l @%2, %0 ! atomic_add_return \n"
-" add %1, %0 \n"
-" movco.l %0, @%2 \n"
-" bf 1b \n"
-" synco \n"
- : "=&z" (temp)
- : "r" (i), "r" (&v->counter)
- : "t");
-
- return temp;
+#define ATOMIC_OP(op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned long tmp; \
+ \
+ __asm__ __volatile__ ( \
+"1: movli.l @%2, %0 ! atomic_" #op "\n" \
+" " #op " %1, %0 \n" \
+" movco.l %0, @%2 \n" \
+" bf 1b \n" \
+ : "=&z" (tmp) \
+ : "r" (i), "r" (&v->counter) \
+ : "t"); \
}
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- unsigned long temp;
-
- __asm__ __volatile__ (
-"1: movli.l @%2, %0 ! atomic_sub_return \n"
-" sub %1, %0 \n"
-" movco.l %0, @%2 \n"
-" bf 1b \n"
-" synco \n"
- : "=&z" (temp)
- : "r" (i), "r" (&v->counter)
- : "t");
-
- return temp;
+#define ATOMIC_OP_RETURN(op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ unsigned long temp; \
+ \
+ __asm__ __volatile__ ( \
+"1: movli.l @%2, %0 ! atomic_" #op "_return \n" \
+" " #op " %1, %0 \n" \
+" movco.l %0, @%2 \n" \
+" bf 1b \n" \
+" synco \n" \
+ : "=&z" (temp) \
+ : "r" (i), "r" (&v->counter) \
+ : "t"); \
+ \
+ return temp; \
}
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
+
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
static inline void atomic_clear_mask(unsigned int mask, atomic_t *v)
{
unsigned long tmp;
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index 7aed2be..7b024f0 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -20,7 +20,7 @@
#define ATOMIC_INIT(i) { (i) }
-int __atomic_add_return(int, atomic_t *);
+int atomic_add_return(int, atomic_t *);
int atomic_cmpxchg(atomic_t *, int, int);
#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
int __atomic_add_unless(atomic_t *, int, int);
@@ -28,15 +28,14 @@
#define atomic_read(v) (*(volatile int *)&(v)->counter)
-#define atomic_add(i, v) ((void)__atomic_add_return( (int)(i), (v)))
-#define atomic_sub(i, v) ((void)__atomic_add_return(-(int)(i), (v)))
-#define atomic_inc(v) ((void)__atomic_add_return( 1, (v)))
-#define atomic_dec(v) ((void)__atomic_add_return( -1, (v)))
+#define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v)))
+#define atomic_sub(i, v) ((void)atomic_add_return(-(int)(i), (v)))
+#define atomic_inc(v) ((void)atomic_add_return( 1, (v)))
+#define atomic_dec(v) ((void)atomic_add_return( -1, (v)))
-#define atomic_add_return(i, v) (__atomic_add_return( (int)(i), (v)))
-#define atomic_sub_return(i, v) (__atomic_add_return(-(int)(i), (v)))
-#define atomic_inc_return(v) (__atomic_add_return( 1, (v)))
-#define atomic_dec_return(v) (__atomic_add_return( -1, (v)))
+#define atomic_sub_return(i, v) (atomic_add_return(-(int)(i), (v)))
+#define atomic_inc_return(v) (atomic_add_return( 1, (v)))
+#define atomic_dec_return(v) (atomic_add_return( -1, (v)))
#define atomic_add_negative(a, v) (atomic_add_return((a), (v)) < 0)
diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index bb894c8..7e4ca1e 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -20,27 +20,28 @@
#define atomic_set(v, i) (((v)->counter) = i)
#define atomic64_set(v, i) (((v)->counter) = i)
-void atomic_add(int, atomic_t *);
-void atomic64_add(long, atomic64_t *);
-void atomic_sub(int, atomic_t *);
-void atomic64_sub(long, atomic64_t *);
+#define ATOMIC_OP(op) \
+void atomic_##op(int, atomic_t *); \
+void atomic64_##op(long, atomic64_t *);
-int atomic_add_ret(int, atomic_t *);
-long atomic64_add_ret(long, atomic64_t *);
-int atomic_sub_ret(int, atomic_t *);
-long atomic64_sub_ret(long, atomic64_t *);
+#define ATOMIC_OP_RETURN(op) \
+int atomic_##op##_return(int, atomic_t *); \
+long atomic64_##op##_return(long, atomic64_t *);
-#define atomic_dec_return(v) atomic_sub_ret(1, v)
-#define atomic64_dec_return(v) atomic64_sub_ret(1, v)
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
-#define atomic_inc_return(v) atomic_add_ret(1, v)
-#define atomic64_inc_return(v) atomic64_add_ret(1, v)
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
-#define atomic_sub_return(i, v) atomic_sub_ret(i, v)
-#define atomic64_sub_return(i, v) atomic64_sub_ret(i, v)
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
-#define atomic_add_return(i, v) atomic_add_ret(i, v)
-#define atomic64_add_return(i, v) atomic64_add_ret(i, v)
+#define atomic_dec_return(v) atomic_sub_return(1, v)
+#define atomic64_dec_return(v) atomic64_sub_return(1, v)
+
+#define atomic_inc_return(v) atomic_add_return(1, v)
+#define atomic64_inc_return(v) atomic64_add_return(1, v)
/*
* atomic_inc_and_test - increment and test
@@ -53,11 +54,11 @@
#define atomic_inc_and_test(v) (atomic_inc_return(v) == 0)
#define atomic64_inc_and_test(v) (atomic64_inc_return(v) == 0)
-#define atomic_sub_and_test(i, v) (atomic_sub_ret(i, v) == 0)
-#define atomic64_sub_and_test(i, v) (atomic64_sub_ret(i, v) == 0)
+#define atomic_sub_and_test(i, v) (atomic_sub_return(i, v) == 0)
+#define atomic64_sub_and_test(i, v) (atomic64_sub_return(i, v) == 0)
-#define atomic_dec_and_test(v) (atomic_sub_ret(1, v) == 0)
-#define atomic64_dec_and_test(v) (atomic64_sub_ret(1, v) == 0)
+#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0)
+#define atomic64_dec_and_test(v) (atomic64_sub_return(1, v) == 0)
#define atomic_inc(v) atomic_add(1, v)
#define atomic64_inc(v) atomic64_add(1, v)
@@ -65,8 +66,8 @@
#define atomic_dec(v) atomic_sub(1, v)
#define atomic64_dec(v) atomic64_sub(1, v)
-#define atomic_add_negative(i, v) (atomic_add_ret(i, v) < 0)
-#define atomic64_add_negative(i, v) (atomic64_add_ret(i, v) < 0)
+#define atomic_add_negative(i, v) (atomic_add_return(i, v) < 0)
+#define atomic64_add_negative(i, v) (atomic64_add_return(i, v) < 0)
#define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index f7ba875..6a9ec5c 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1138,7 +1138,7 @@
void smp_capture(void)
{
- int result = atomic_add_ret(1, &smp_capture_depth);
+ int result = atomic_add_return(1, &smp_capture_depth);
if (result == 1) {
int ncpus = num_online_cpus();
diff --git a/arch/sparc/lib/atomic32.c b/arch/sparc/lib/atomic32.c
index 1d32b54..a7c418a 100644
--- a/arch/sparc/lib/atomic32.c
+++ b/arch/sparc/lib/atomic32.c
@@ -27,18 +27,23 @@
#endif /* SMP */
-int __atomic_add_return(int i, atomic_t *v)
-{
- int ret;
- unsigned long flags;
- spin_lock_irqsave(ATOMIC_HASH(v), flags);
+#define ATOMIC_OP(op, cop) \
+int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ int ret; \
+ unsigned long flags; \
+ spin_lock_irqsave(ATOMIC_HASH(v), flags); \
+ \
+ ret = (v->counter cop i); \
+ \
+ spin_unlock_irqrestore(ATOMIC_HASH(v), flags); \
+ return ret; \
+} \
+EXPORT_SYMBOL(atomic_##op##_return);
- ret = (v->counter += i);
+ATOMIC_OP(add, +=)
- spin_unlock_irqrestore(ATOMIC_HASH(v), flags);
- return ret;
-}
-EXPORT_SYMBOL(__atomic_add_return);
+#undef ATOMIC_OP
int atomic_cmpxchg(atomic_t *v, int old, int new)
{
diff --git a/arch/sparc/lib/atomic_64.S b/arch/sparc/lib/atomic_64.S
index 85c233d..96d70b4 100644
--- a/arch/sparc/lib/atomic_64.S
+++ b/arch/sparc/lib/atomic_64.S
@@ -14,109 +14,80 @@
* memory barriers, and a second which returns
* a value and does the barriers.
*/
-ENTRY(atomic_add) /* %o0 = increment, %o1 = atomic_ptr */
- BACKOFF_SETUP(%o2)
-1: lduw [%o1], %g1
- add %g1, %o0, %g7
- cas [%o1], %g1, %g7
- cmp %g1, %g7
- bne,pn %icc, BACKOFF_LABEL(2f, 1b)
- nop
- retl
- nop
-2: BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic_add)
-ENTRY(atomic_sub) /* %o0 = decrement, %o1 = atomic_ptr */
- BACKOFF_SETUP(%o2)
-1: lduw [%o1], %g1
- sub %g1, %o0, %g7
- cas [%o1], %g1, %g7
- cmp %g1, %g7
- bne,pn %icc, BACKOFF_LABEL(2f, 1b)
- nop
- retl
- nop
-2: BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic_sub)
+#define ATOMIC_OP(op) \
+ENTRY(atomic_##op) /* %o0 = increment, %o1 = atomic_ptr */ \
+ BACKOFF_SETUP(%o2); \
+1: lduw [%o1], %g1; \
+ op %g1, %o0, %g7; \
+ cas [%o1], %g1, %g7; \
+ cmp %g1, %g7; \
+ bne,pn %icc, BACKOFF_LABEL(2f, 1b); \
+ nop; \
+ retl; \
+ nop; \
+2: BACKOFF_SPIN(%o2, %o3, 1b); \
+ENDPROC(atomic_##op); \
-ENTRY(atomic_add_ret) /* %o0 = increment, %o1 = atomic_ptr */
- BACKOFF_SETUP(%o2)
-1: lduw [%o1], %g1
- add %g1, %o0, %g7
- cas [%o1], %g1, %g7
- cmp %g1, %g7
- bne,pn %icc, BACKOFF_LABEL(2f, 1b)
- add %g1, %o0, %g1
- retl
- sra %g1, 0, %o0
-2: BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic_add_ret)
+#define ATOMIC_OP_RETURN(op) \
+ENTRY(atomic_##op##_return) /* %o0 = increment, %o1 = atomic_ptr */ \
+ BACKOFF_SETUP(%o2); \
+1: lduw [%o1], %g1; \
+ op %g1, %o0, %g7; \
+ cas [%o1], %g1, %g7; \
+ cmp %g1, %g7; \
+ bne,pn %icc, BACKOFF_LABEL(2f, 1b); \
+ op %g1, %o0, %g1; \
+ retl; \
+ sra %g1, 0, %o0; \
+2: BACKOFF_SPIN(%o2, %o3, 1b); \
+ENDPROC(atomic_##op##_return);
-ENTRY(atomic_sub_ret) /* %o0 = decrement, %o1 = atomic_ptr */
- BACKOFF_SETUP(%o2)
-1: lduw [%o1], %g1
- sub %g1, %o0, %g7
- cas [%o1], %g1, %g7
- cmp %g1, %g7
- bne,pn %icc, BACKOFF_LABEL(2f, 1b)
- sub %g1, %o0, %g1
- retl
- sra %g1, 0, %o0
-2: BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic_sub_ret)
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
-ENTRY(atomic64_add) /* %o0 = increment, %o1 = atomic_ptr */
- BACKOFF_SETUP(%o2)
-1: ldx [%o1], %g1
- add %g1, %o0, %g7
- casx [%o1], %g1, %g7
- cmp %g1, %g7
- bne,pn %xcc, BACKOFF_LABEL(2f, 1b)
- nop
- retl
- nop
-2: BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic64_add)
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
-ENTRY(atomic64_sub) /* %o0 = decrement, %o1 = atomic_ptr */
- BACKOFF_SETUP(%o2)
-1: ldx [%o1], %g1
- sub %g1, %o0, %g7
- casx [%o1], %g1, %g7
- cmp %g1, %g7
- bne,pn %xcc, BACKOFF_LABEL(2f, 1b)
- nop
- retl
- nop
-2: BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic64_sub)
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
-ENTRY(atomic64_add_ret) /* %o0 = increment, %o1 = atomic_ptr */
- BACKOFF_SETUP(%o2)
-1: ldx [%o1], %g1
- add %g1, %o0, %g7
- casx [%o1], %g1, %g7
- cmp %g1, %g7
- bne,pn %xcc, BACKOFF_LABEL(2f, 1b)
- nop
- retl
- add %g1, %o0, %o0
-2: BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic64_add_ret)
+#define ATOMIC64_OP(op) \
+ENTRY(atomic64_##op) /* %o0 = increment, %o1 = atomic_ptr */ \
+ BACKOFF_SETUP(%o2); \
+1: ldx [%o1], %g1; \
+ op %g1, %o0, %g7; \
+ casx [%o1], %g1, %g7; \
+ cmp %g1, %g7; \
+ bne,pn %xcc, BACKOFF_LABEL(2f, 1b); \
+ nop; \
+ retl; \
+ nop; \
+2: BACKOFF_SPIN(%o2, %o3, 1b); \
+ENDPROC(atomic64_##op); \
-ENTRY(atomic64_sub_ret) /* %o0 = decrement, %o1 = atomic_ptr */
- BACKOFF_SETUP(%o2)
-1: ldx [%o1], %g1
- sub %g1, %o0, %g7
- casx [%o1], %g1, %g7
- cmp %g1, %g7
- bne,pn %xcc, BACKOFF_LABEL(2f, 1b)
- nop
- retl
- sub %g1, %o0, %o0
-2: BACKOFF_SPIN(%o2, %o3, 1b)
-ENDPROC(atomic64_sub_ret)
+#define ATOMIC64_OP_RETURN(op) \
+ENTRY(atomic64_##op##_return) /* %o0 = increment, %o1 = atomic_ptr */ \
+ BACKOFF_SETUP(%o2); \
+1: ldx [%o1], %g1; \
+ op %g1, %o0, %g7; \
+ casx [%o1], %g1, %g7; \
+ cmp %g1, %g7; \
+ bne,pn %xcc, BACKOFF_LABEL(2f, 1b); \
+ nop; \
+ retl; \
+ op %g1, %o0, %o0; \
+2: BACKOFF_SPIN(%o2, %o3, 1b); \
+ENDPROC(atomic64_##op##_return);
+
+#define ATOMIC64_OPS(op) ATOMIC64_OP(op) ATOMIC64_OP_RETURN(op)
+
+ATOMIC64_OPS(add)
+ATOMIC64_OPS(sub)
+
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
ENTRY(atomic64_dec_if_positive) /* %o0 = atomic_ptr */
BACKOFF_SETUP(%o2)
diff --git a/arch/sparc/lib/ksyms.c b/arch/sparc/lib/ksyms.c
index 323335b..1d649a9 100644
--- a/arch/sparc/lib/ksyms.c
+++ b/arch/sparc/lib/ksyms.c
@@ -99,14 +99,23 @@
EXPORT_SYMBOL(__clear_user);
/* Atomic counter implementation. */
-EXPORT_SYMBOL(atomic_add);
-EXPORT_SYMBOL(atomic_add_ret);
-EXPORT_SYMBOL(atomic_sub);
-EXPORT_SYMBOL(atomic_sub_ret);
-EXPORT_SYMBOL(atomic64_add);
-EXPORT_SYMBOL(atomic64_add_ret);
-EXPORT_SYMBOL(atomic64_sub);
-EXPORT_SYMBOL(atomic64_sub_ret);
+#define ATOMIC_OP(op) \
+EXPORT_SYMBOL(atomic_##op); \
+EXPORT_SYMBOL(atomic64_##op);
+
+#define ATOMIC_OP_RETURN(op) \
+EXPORT_SYMBOL(atomic_##op##_return); \
+EXPORT_SYMBOL(atomic64_##op##_return);
+
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
+
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
EXPORT_SYMBOL(atomic64_dec_if_positive);
/* Atomic bit operations. */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 778178f..4b663e1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -552,6 +552,39 @@
If in doubt, say "Y".
+config KVMTOOL_TEST_ENABLE
+ bool "Enable options to create a bootable tools/kvm/ kernel"
+ select NET
+ select NETDEVICES
+ select PCI
+ select BLOCK
+ select BLK_DEV
+ select NETWORK_FILESYSTEMS
+ select INET
+ select EXPERIMENTAL
+ select TTY
+ select SERIAL_8250
+ select SERIAL_8250_CONSOLE
+ select IP_PNP
+ select IP_PNP_DHCP
+ select BINFMT_ELF
+ select PCI_MSI
+ select HAVE_ARCH_KGDB
+ select DEBUG_KERNEL
+ select KGDB
+ select KGDB_SERIAL_CONSOLE
+ select VIRTUALIZATION
+ select VIRTIO
+ select VIRTIO_RING
+ select VIRTIO_PCI
+ select VIRTIO_BLK
+ select VIRTIO_CONSOLE
+ select VIRTIO_NET
+ select AVERAGE
+ select 9P_FS
+ select NET_9P
+ select NET_9P_VIRTIO
+
menuconfig HYPERVISOR_GUEST
bool "Linux guest support"
---help---
@@ -2443,9 +2476,19 @@
depends on STA2X11
config IOSF_MBI
- tristate
- default m
+ tristate "Intel System On Chip IOSF Sideband support"
depends on PCI
+ ---help---
+ Enables sideband access to mailbox registers on SoC's. The sideband is
+ available on the following platforms. This list is not meant to be
+ exclusive.
+ - BayTrail
+ - Cherryview
+ - Braswell
+ - Quark
+
+ You should say Y if you are running a kernel on one of these
+ platforms.
config PMC_ATOM
def_bool y
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 6dd1c7d..bf20c81 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -219,21 +219,6 @@
return *v;
}
-#ifdef CONFIG_X86_64
-/**
- * atomic_or_long - OR of two long integers
- * @v1: pointer to type unsigned long
- * @v2: pointer to type unsigned long
- *
- * Atomically ORs @v1 and @v2
- * Returns the result of the OR
- */
-static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
-{
- asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
-}
-#endif
-
/* These are x86-specific, used by some header files */
#define atomic_clear_mask(mask, addr) \
asm volatile(LOCK_PREFIX "andl %0,%1" \
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index cb4c73b..76659b6 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -85,7 +85,7 @@
#define ARGOFFSET R11
#define SWFRAME ORIG_RAX
- .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
+ .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
subq $9*8+\addskip, %rsp
CFI_ADJUST_CFA_OFFSET 9*8+\addskip
movq_cfi rdi, 8*8
@@ -96,7 +96,11 @@
movq_cfi rcx, 5*8
.endif
+ .if \rax_enosys
+ movq $-ENOSYS, 4*8(%rsp)
+ .else
movq_cfi rax, 4*8
+ .endif
.if \save_r891011
movq_cfi r8, 3*8
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 412ecec..e97622f 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -344,7 +344,7 @@
static inline void __thread_fpu_begin(struct task_struct *tsk)
{
- if (!static_cpu_has_safe(X86_FEATURE_EAGER_FPU))
+ if (!use_eager_fpu())
clts();
__thread_set_has_fpu(tsk);
}
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index 9067166..bbe296e 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -43,7 +43,7 @@
#define DWSIZE (sizeof(u32))
#define get_totalsize(mc) \
- (((struct microcode_intel *)mc)->hdr.totalsize ? \
+ (((struct microcode_intel *)mc)->hdr.datasize ? \
((struct microcode_intel *)mc)->hdr.totalsize : \
DEFAULT_UCODE_TOTALSIZE)
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 8249df4..8dfc9fd 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -51,6 +51,14 @@
ARCH_PERFMON_EVENTSEL_EDGE | \
ARCH_PERFMON_EVENTSEL_INV | \
ARCH_PERFMON_EVENTSEL_CMASK)
+#define X86_ALL_EVENT_FLAGS \
+ (ARCH_PERFMON_EVENTSEL_EDGE | \
+ ARCH_PERFMON_EVENTSEL_INV | \
+ ARCH_PERFMON_EVENTSEL_CMASK | \
+ ARCH_PERFMON_EVENTSEL_ANY | \
+ ARCH_PERFMON_EVENTSEL_PIN_CONTROL | \
+ HSW_IN_TX | \
+ HSW_IN_TX_CHECKPOINTED)
#define AMD64_RAW_EVENT_MASK \
(X86_RAW_EVENT_MASK | \
AMD64_EVENTSEL_EVENT)
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 9ee3221..b6c0b40 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -32,9 +32,6 @@
static inline void check_pgt_cache(void) { }
void paging_init(void);
-extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
-
-
/*
* Define this if things work differently on an i386 and an i486:
* it will (on an i486) warn about kernel memory accesses that are
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6205f0c..86fc2bb 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -75,6 +75,11 @@
extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
int error_code, int si_code);
+
+extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch);
+extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,
+ unsigned long phase1_result);
+
extern long syscall_trace_enter(struct pt_regs *);
extern void syscall_trace_leave(struct pt_regs *);
diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h
index 628c801..460b84f 100644
--- a/arch/x86/include/asm/serial.h
+++ b/arch/x86/include/asm/serial.h
@@ -6,24 +6,24 @@
*
* It'd be nice if someone built a serial card with a 24.576 MHz
* clock, since the 16550A is capable of handling a top speed of 1.5
- * megabits/second; but this requires the faster clock.
+ * megabits/second; but this requires a faster clock.
*/
-#define BASE_BAUD ( 1843200 / 16 )
+#define BASE_BAUD (1843200/16)
/* Standard COM flags (except for COM4, because of the 8514 problem) */
#ifdef CONFIG_SERIAL_DETECT_IRQ
-#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ)
-#define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_AUTO_IRQ)
+# define STD_COMX_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ)
+# define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | 0 | ASYNC_AUTO_IRQ)
#else
-#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST)
-#define STD_COM4_FLAGS ASYNC_BOOT_AUTOCONF
+# define STD_COMX_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | 0 )
+# define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | 0 | 0 )
#endif
-#define SERIAL_PORT_DFNS \
- /* UART CLK PORT IRQ FLAGS */ \
- { 0, BASE_BAUD, 0x3F8, 4, STD_COM_FLAGS }, /* ttyS0 */ \
- { 0, BASE_BAUD, 0x2F8, 3, STD_COM_FLAGS }, /* ttyS1 */ \
- { 0, BASE_BAUD, 0x3E8, 4, STD_COM_FLAGS }, /* ttyS2 */ \
- { 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */
+#define SERIAL_PORT_DFNS \
+ /* UART CLK PORT IRQ FLAGS */ \
+ { .uart = 0, BASE_BAUD, 0x3F8, 4, STD_COMX_FLAGS }, /* ttyS0 */ \
+ { .uart = 0, BASE_BAUD, 0x2F8, 3, STD_COMX_FLAGS }, /* ttyS1 */ \
+ { .uart = 0, BASE_BAUD, 0x3E8, 4, STD_COMX_FLAGS }, /* ttyS2 */ \
+ { .uart = 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */
#endif /* _ASM_X86_SERIAL_H */
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index bbae024..d993e33 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -21,11 +21,6 @@
* this size.
*/
-/*
- * Odd: 'make headers_check' complains about numa.h if I try
- * to collapse the next two #ifdef lines to a single line:
- * #if defined(__KERNEL__) && defined(CONFIG_EFI)
- */
#ifndef __KERNEL__
#define E820_X_MAX E820MAX
#endif
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 004f017..8e9dcfd 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -204,7 +204,6 @@
static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
{
-#ifdef CONFIG_SMP
unsigned long val;
int pnode;
@@ -223,7 +222,6 @@
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
atomic_set(&init_deasserted, 1);
-#endif
return 0;
}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 7fd54f0..7e1fd4e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -36,7 +36,9 @@
endif
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_uncore_snb.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore_snbep.o perf_event_intel_uncore_nhmex.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o
endif
diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
index 617a9e2..7aa1acc 100644
--- a/arch/x86/kernel/cpu/microcode/amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -27,7 +27,7 @@
u8 amd_ucode_patch[PATCH_MAX_SIZE];
static u16 this_equiv_id;
-struct cpio_data ucode_cpio;
+static struct cpio_data ucode_cpio;
/*
* Microcode patch container file is prepended to the initrd in cpio format.
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index a276fa7..c6826d1 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -127,7 +127,7 @@
return get_matching_microcode(csig, cpf, mc_intel, crev);
}
-int apply_microcode(int cpu)
+static int apply_microcode_intel(int cpu)
{
struct microcode_intel *mc_intel;
struct ucode_cpu_info *uci;
@@ -314,7 +314,7 @@
.request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_fw,
.collect_cpu_info = collect_cpu_info,
- .apply_microcode = apply_microcode,
+ .apply_microcode = apply_microcode_intel,
.microcode_fini_cpu = microcode_fini_cpu,
};
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 18f7391..b88343f 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -28,8 +28,8 @@
#include <asm/tlbflush.h>
#include <asm/setup.h>
-unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
-struct mc_saved_data {
+static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
+static struct mc_saved_data {
unsigned int mc_saved_count;
struct microcode_intel **mc_saved;
} mc_saved_data;
@@ -415,7 +415,7 @@
struct ucode_cpu_info uci;
if (mc_saved_data.mc_saved_count == 0) {
- pr_debug("no micorcode data saved.\n");
+ pr_debug("no microcode data saved.\n");
return;
}
pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
@@ -506,7 +506,7 @@
if (mc_saved && mc_saved_count)
memcpy(mc_saved_tmp, mc_saved,
- mc_saved_count * sizeof(struct mirocode_intel *));
+ mc_saved_count * sizeof(struct microcode_intel *));
/*
* Save the microcode patch mc in mc_save_tmp structure if it's a newer
* version.
@@ -526,7 +526,7 @@
show_saved_mc();
/*
- * Free old saved microcod data.
+ * Free old saved microcode data.
*/
if (mc_saved) {
for (i = 0; i < mc_saved_count_init; i++)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2879ecd..0646d3b 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -387,7 +387,7 @@
precise++;
/* Support for IP fixup */
- if (x86_pmu.lbr_nr)
+ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
precise++;
}
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 8ade931..fc5eb39 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -67,8 +67,10 @@
*/
#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */
#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style st data sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style datala, store */
#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */
+#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */
struct amd_nb {
int nb_id; /* NorthBridge id */
@@ -252,18 +254,52 @@
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
#define INTEL_PLD_CONSTRAINT(c, n) \
- __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
#define INTEL_PST_CONSTRAINT(c, n) \
- __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
-/* DataLA version of store sampling without extra enable bit. */
-#define INTEL_PST_HSW_CONSTRAINT(c, n) \
- __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+/* Event constraint, but match on all event flags too. */
+#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
+/* Check only flags, but allow all event/umask */
+#define INTEL_ALL_EVENT_CONSTRAINT(code, n) \
+ EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
+
+/* Check flags and event code, and set the HSW store flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+/* Check flags and event code, and set the HSW load flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+/* Check flags and event code/umask, and set the HSW store flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+/* Check flags and event code/umask, and set the HSW load flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+/* Check flags and event code/umask, and set the HSW N/A flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|INTEL_ARCH_EVENT_MASK, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
+
+
/*
* We define the end marker as having a weight of -1
* to enable blacklisting of events using a counter bitmask
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 2502d0d..89bc750 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2367,15 +2367,15 @@
* Install the hw-cache-events table:
*/
switch (boot_cpu_data.x86_model) {
- case 14: /* 65 nm core solo/duo, "Yonah" */
+ case 14: /* 65nm Core "Yonah" */
pr_cont("Core events, ");
break;
- case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+ case 15: /* 65nm Core2 "Merom" */
x86_add_quirk(intel_clovertown_quirk);
- case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
- case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
- case 29: /* six-core 45 nm xeon "Dunnington" */
+ case 22: /* 65nm Core2 "Merom-L" */
+ case 23: /* 45nm Core2 "Penryn" */
+ case 29: /* 45nm Core2 "Dunnington (MP) */
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -2386,9 +2386,9 @@
pr_cont("Core2 events, ");
break;
- case 26: /* 45 nm nehalem, "Bloomfield" */
- case 30: /* 45 nm nehalem, "Lynnfield" */
- case 46: /* 45 nm nehalem-ex, "Beckton" */
+ case 30: /* 45nm Nehalem */
+ case 26: /* 45nm Nehalem-EP */
+ case 46: /* 45nm Nehalem-EX */
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -2415,11 +2415,11 @@
pr_cont("Nehalem events, ");
break;
- case 28: /* Atom */
- case 38: /* Lincroft */
- case 39: /* Penwell */
- case 53: /* Cloverview */
- case 54: /* Cedarview */
+ case 28: /* 45nm Atom "Pineview" */
+ case 38: /* 45nm Atom "Lincroft" */
+ case 39: /* 32nm Atom "Penwell" */
+ case 53: /* 32nm Atom "Cloverview" */
+ case 54: /* 32nm Atom "Cedarview" */
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -2430,8 +2430,8 @@
pr_cont("Atom events, ");
break;
- case 55: /* Atom 22nm "Silvermont" */
- case 77: /* Avoton "Silvermont" */
+ case 55: /* 22nm Atom "Silvermont" */
+ case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
@@ -2446,9 +2446,9 @@
pr_cont("Silvermont events, ");
break;
- case 37: /* 32 nm nehalem, "Clarkdale" */
- case 44: /* 32 nm nehalem, "Gulftown" */
- case 47: /* 32 nm Xeon E7 */
+ case 37: /* 32nm Westmere */
+ case 44: /* 32nm Westmere-EP */
+ case 47: /* 32nm Westmere-EX */
memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -2474,8 +2474,8 @@
pr_cont("Westmere events, ");
break;
- case 42: /* SandyBridge */
- case 45: /* SandyBridge, "Romely-EP" */
+ case 42: /* 32nm SandyBridge */
+ case 45: /* 32nm SandyBridge-E/EN/EP */
x86_add_quirk(intel_sandybridge_quirk);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -2506,8 +2506,9 @@
pr_cont("SandyBridge events, ");
break;
- case 58: /* IvyBridge */
- case 62: /* IvyBridge EP */
+
+ case 58: /* 22nm IvyBridge */
+ case 62: /* 22nm IvyBridge-EP/EX */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
/* dTLB-load-misses on IVB is different than SNB */
@@ -2539,11 +2540,11 @@
break;
- case 60: /* Haswell Client */
- case 70:
- case 71:
+ case 60: /* 22nm Haswell */
case 63:
case 69:
+ case 70:
+ case 71:
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -2552,7 +2553,7 @@
x86_pmu.event_constraints = intel_hsw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
- x86_pmu.extra_regs = intel_snb_extra_regs;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
/* all extra regs are per-cpu when HT is on */
x86_pmu.er_flags |= ERF_HAS_RSP_1;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 696ade3..b1553d0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -108,14 +108,16 @@
return val;
}
-static u64 precise_store_data_hsw(struct perf_event *event, u64 status)
+static u64 precise_datala_hsw(struct perf_event *event, u64 status)
{
union perf_mem_data_src dse;
- u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK;
- dse.val = 0;
- dse.mem_op = PERF_MEM_OP_STORE;
- dse.mem_lvl = PERF_MEM_LVL_NA;
+ dse.val = PERF_MEM_NA;
+
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+ dse.mem_op = PERF_MEM_OP_STORE;
+ else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW)
+ dse.mem_op = PERF_MEM_OP_LOAD;
/*
* L1 info only valid for following events:
@@ -125,15 +127,12 @@
* MEM_UOPS_RETIRED.SPLIT_STORES
* MEM_UOPS_RETIRED.ALL_STORES
*/
- if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0)
- return dse.mem_lvl;
-
- if (status & 1)
- dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
- else
- dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
-
- /* Nothing else supported. Sorry. */
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) {
+ if (status & 1)
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+ else
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+ }
return dse.val;
}
@@ -569,28 +568,10 @@
};
struct event_constraint intel_slm_pebs_event_constraints[] = {
- INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */
- INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */
- INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */
- INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */
- INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */
- INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */
- INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */
- INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */
- INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */
- INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */
- INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */
- INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */
- INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */
- INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */
- INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */
- INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */
- INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */
- INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */
- INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */
- INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */
- INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */
- INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
EVENT_CONSTRAINT_END
};
@@ -626,68 +607,44 @@
struct event_constraint intel_snb_pebs_event_constraints[] = {
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
- INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
- INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
- INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
};
struct event_constraint intel_ivb_pebs_event_constraints[] = {
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
- INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
- INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
- INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
};
struct event_constraint intel_hsw_pebs_event_constraints[] = {
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
- INTEL_PST_HSW_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
- INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */
- INTEL_UEVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */
- INTEL_UEVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.NEAR_TAKEN */
- INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.* */
- /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf),
- /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
- INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf),
- INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
- /* MEM_UOPS_RETIRED.SPLIT_STORES */
- INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf),
- INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
- INTEL_PST_HSW_CONSTRAINT(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
- INTEL_UEVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */
- INTEL_UEVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */
- INTEL_UEVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L3_HIT */
- /* MEM_LOAD_UOPS_RETIRED.HIT_LFB */
- INTEL_UEVENT_CONSTRAINT(0x40d1, 0xf),
- /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */
- INTEL_UEVENT_CONSTRAINT(0x01d2, 0xf),
- /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */
- INTEL_UEVENT_CONSTRAINT(0x02d2, 0xf),
- /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM */
- INTEL_UEVENT_CONSTRAINT(0x01d3, 0xf),
- INTEL_UEVENT_CONSTRAINT(0x04c8, 0xf), /* HLE_RETIRED.Abort */
- INTEL_UEVENT_CONSTRAINT(0x04c9, 0xf), /* RTM_RETIRED.Abort */
-
+ INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
};
@@ -864,6 +821,10 @@
static void __intel_pmu_pebs_event(struct perf_event *event,
struct pt_regs *iregs, void *__pebs)
{
+#define PERF_X86_EVENT_PEBS_HSW_PREC \
+ (PERF_X86_EVENT_PEBS_ST_HSW | \
+ PERF_X86_EVENT_PEBS_LD_HSW | \
+ PERF_X86_EVENT_PEBS_NA_HSW)
/*
* We cast to the biggest pebs_record but are careful not to
* unconditionally access the 'extra' entries.
@@ -873,42 +834,40 @@
struct perf_sample_data data;
struct pt_regs regs;
u64 sample_type;
- int fll, fst;
+ int fll, fst, dsrc;
+ int fl = event->hw.flags;
if (!intel_pmu_save_and_restart(event))
return;
- fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT;
- fst = event->hw.flags & (PERF_X86_EVENT_PEBS_ST |
- PERF_X86_EVENT_PEBS_ST_HSW);
+ sample_type = event->attr.sample_type;
+ dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
+
+ fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
+ fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
perf_sample_data_init(&data, 0, event->hw.last_period);
data.period = event->hw.last_period;
- sample_type = event->attr.sample_type;
/*
- * if PEBS-LL or PreciseStore
+ * Use latency for weight (only avail with PEBS-LL)
*/
- if (fll || fst) {
- /*
- * Use latency for weight (only avail with PEBS-LL)
- */
- if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
- data.weight = pebs->lat;
+ if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
+ data.weight = pebs->lat;
- /*
- * data.data_src encodes the data source
- */
- if (sample_type & PERF_SAMPLE_DATA_SRC) {
- if (fll)
- data.data_src.val = load_latency_data(pebs->dse);
- else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
- data.data_src.val =
- precise_store_data_hsw(event, pebs->dse);
- else
- data.data_src.val = precise_store_data(pebs->dse);
- }
+ /*
+ * data.data_src encodes the data source
+ */
+ if (dsrc) {
+ u64 val = PERF_MEM_NA;
+ if (fll)
+ val = load_latency_data(pebs->dse);
+ else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
+ val = precise_datala_hsw(event, pebs->dse);
+ else if (fst)
+ val = precise_store_data(pebs->dse);
+ data.data_src.val = val;
}
/*
@@ -935,16 +894,16 @@
else
regs.flags &= ~PERF_EFLAGS_EXACT;
- if ((event->attr.sample_type & PERF_SAMPLE_ADDR) &&
+ if ((sample_type & PERF_SAMPLE_ADDR) &&
x86_pmu.intel_cap.pebs_format >= 1)
data.addr = pebs->dla;
if (x86_pmu.intel_cap.pebs_format >= 2) {
/* Only set the TSX weight when no memory weight. */
- if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll)
+ if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
data.weight = intel_hsw_weight(pebs);
- if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION)
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
data.txn = intel_hsw_transaction(pebs);
}
@@ -1055,7 +1014,7 @@
* BTS, PEBS probe and setup
*/
-void intel_ds_init(void)
+void __init intel_ds_init(void)
{
/*
* No support for 32bit formats
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 9dd2459..4af1061 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -697,7 +697,7 @@
};
/* core */
-void intel_pmu_lbr_init_core(void)
+void __init intel_pmu_lbr_init_core(void)
{
x86_pmu.lbr_nr = 4;
x86_pmu.lbr_tos = MSR_LBR_TOS;
@@ -712,7 +712,7 @@
}
/* nehalem/westmere */
-void intel_pmu_lbr_init_nhm(void)
+void __init intel_pmu_lbr_init_nhm(void)
{
x86_pmu.lbr_nr = 16;
x86_pmu.lbr_tos = MSR_LBR_TOS;
@@ -733,7 +733,7 @@
}
/* sandy bridge */
-void intel_pmu_lbr_init_snb(void)
+void __init intel_pmu_lbr_init_snb(void)
{
x86_pmu.lbr_nr = 16;
x86_pmu.lbr_tos = MSR_LBR_TOS;
@@ -753,7 +753,7 @@
}
/* atom */
-void intel_pmu_lbr_init_atom(void)
+void __init intel_pmu_lbr_init_atom(void)
{
/*
* only models starting at stepping 10 seems
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 0939f86..812ec5d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -1,83 +1,39 @@
#include "perf_event_intel_uncore.h"
static struct intel_uncore_type *empty_uncore[] = { NULL, };
-static struct intel_uncore_type **msr_uncores = empty_uncore;
-static struct intel_uncore_type **pci_uncores = empty_uncore;
-/* pci bus to socket mapping */
-static int pcibus_to_physid[256] = { [0 ... 255] = -1, };
+struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
+struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
-static struct pci_dev *extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
+static bool pcidrv_registered;
+struct pci_driver *uncore_pci_driver;
+/* pci bus to socket mapping */
+int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
static DEFINE_RAW_SPINLOCK(uncore_box_lock);
-
/* mask of cpus that collect uncore events */
static cpumask_t uncore_cpu_mask;
/* constraint for the fixed counter */
-static struct event_constraint constraint_fixed =
+static struct event_constraint uncore_constraint_fixed =
EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
-static struct event_constraint constraint_empty =
+struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
-#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
- ((1ULL << (n)) - 1)))
+ssize_t uncore_event_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct uncore_event_desc *event =
+ container_of(attr, struct uncore_event_desc, attr);
+ return sprintf(buf, "%s", event->config);
+}
-DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
-DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
-DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
-DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
-DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
-DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
-DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
-DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
-DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
-DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
-DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
-DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
-DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
-DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
-DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
-DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
-
-static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
-static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
-static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
-static void uncore_pmu_event_read(struct perf_event *event);
-
-static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
{
return container_of(event->pmu, struct intel_uncore_pmu, pmu);
}
-static struct intel_uncore_box *
-uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
{
struct intel_uncore_box *box;
@@ -98,7 +54,7 @@
return *per_cpu_ptr(pmu->box, cpu);
}
-static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
{
/*
* perf core schedules event on the basis of cpu, uncore events are
@@ -107,7 +63,7 @@
return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
}
-static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
{
u64 count;
@@ -119,7 +75,7 @@
/*
* generic get constraint function for shared match/mask registers.
*/
-static struct event_constraint *
+struct event_constraint *
uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
{
struct intel_uncore_extra_reg *er;
@@ -154,10 +110,10 @@
return NULL;
}
- return &constraint_empty;
+ return &uncore_constraint_empty;
}
-static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
{
struct intel_uncore_extra_reg *er;
struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
@@ -178,7 +134,7 @@
reg1->alloc = 0;
}
-static u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
{
struct intel_uncore_extra_reg *er;
unsigned long flags;
@@ -193,2936 +149,6 @@
return config;
}
-/* Sandy Bridge-EP uncore support */
-static struct intel_uncore_type snbep_uncore_cbox;
-static struct intel_uncore_type snbep_uncore_pcu;
-
-static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
- int box_ctl = uncore_pci_box_ctl(box);
- u32 config = 0;
-
- if (!pci_read_config_dword(pdev, box_ctl, &config)) {
- config |= SNBEP_PMON_BOX_CTL_FRZ;
- pci_write_config_dword(pdev, box_ctl, config);
- }
-}
-
-static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
- int box_ctl = uncore_pci_box_ctl(box);
- u32 config = 0;
-
- if (!pci_read_config_dword(pdev, box_ctl, &config)) {
- config &= ~SNBEP_PMON_BOX_CTL_FRZ;
- pci_write_config_dword(pdev, box_ctl, config);
- }
-}
-
-static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
-
- pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
-
- pci_write_config_dword(pdev, hwc->config_base, hwc->config);
-}
-
-static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
- u64 count = 0;
-
- pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
- pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
-
- return count;
-}
-
-static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
-
- pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
- u64 config;
- unsigned msr;
-
- msr = uncore_msr_box_ctl(box);
- if (msr) {
- rdmsrl(msr, config);
- config |= SNBEP_PMON_BOX_CTL_FRZ;
- wrmsrl(msr, config);
- }
-}
-
-static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
- u64 config;
- unsigned msr;
-
- msr = uncore_msr_box_ctl(box);
- if (msr) {
- rdmsrl(msr, config);
- config &= ~SNBEP_PMON_BOX_CTL_FRZ;
- wrmsrl(msr, config);
- }
-}
-
-static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
- if (reg1->idx != EXTRA_REG_NONE)
- wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
-
- wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
- struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- wrmsrl(hwc->config_base, hwc->config);
-}
-
-static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
-{
- unsigned msr = uncore_msr_box_ctl(box);
-
- if (msr)
- wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static struct attribute *snbep_uncore_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- NULL,
-};
-
-static struct attribute *snbep_uncore_ubox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh5.attr,
- NULL,
-};
-
-static struct attribute *snbep_uncore_cbox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_tid_en.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- &format_attr_filter_tid.attr,
- &format_attr_filter_nid.attr,
- &format_attr_filter_state.attr,
- &format_attr_filter_opc.attr,
- NULL,
-};
-
-static struct attribute *snbep_uncore_pcu_formats_attr[] = {
- &format_attr_event_ext.attr,
- &format_attr_occ_sel.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh5.attr,
- &format_attr_occ_invert.attr,
- &format_attr_occ_edge.attr,
- &format_attr_filter_band0.attr,
- &format_attr_filter_band1.attr,
- &format_attr_filter_band2.attr,
- &format_attr_filter_band3.attr,
- NULL,
-};
-
-static struct attribute *snbep_uncore_qpi_formats_attr[] = {
- &format_attr_event_ext.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- &format_attr_match_rds.attr,
- &format_attr_match_rnid30.attr,
- &format_attr_match_rnid4.attr,
- &format_attr_match_dnid.attr,
- &format_attr_match_mc.attr,
- &format_attr_match_opc.attr,
- &format_attr_match_vnw.attr,
- &format_attr_match0.attr,
- &format_attr_match1.attr,
- &format_attr_mask_rds.attr,
- &format_attr_mask_rnid30.attr,
- &format_attr_mask_rnid4.attr,
- &format_attr_mask_dnid.attr,
- &format_attr_mask_mc.attr,
- &format_attr_mask_opc.attr,
- &format_attr_mask_vnw.attr,
- &format_attr_mask0.attr,
- &format_attr_mask1.attr,
- NULL,
-};
-
-static struct uncore_event_desc snbep_uncore_imc_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
- INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
- INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
- { /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc snbep_uncore_qpi_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
- INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
- INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"),
- INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"),
- { /* end: all zeroes */ },
-};
-
-static struct attribute_group snbep_uncore_format_group = {
- .name = "format",
- .attrs = snbep_uncore_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_ubox_format_group = {
- .name = "format",
- .attrs = snbep_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_cbox_format_group = {
- .name = "format",
- .attrs = snbep_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_pcu_format_group = {
- .name = "format",
- .attrs = snbep_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_qpi_format_group = {
- .name = "format",
- .attrs = snbep_uncore_qpi_formats_attr,
-};
-
-#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
- .init_box = snbep_uncore_msr_init_box, \
- .disable_box = snbep_uncore_msr_disable_box, \
- .enable_box = snbep_uncore_msr_enable_box, \
- .disable_event = snbep_uncore_msr_disable_event, \
- .enable_event = snbep_uncore_msr_enable_event, \
- .read_counter = uncore_msr_read_counter
-
-static struct intel_uncore_ops snbep_uncore_msr_ops = {
- SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \
- .init_box = snbep_uncore_pci_init_box, \
- .disable_box = snbep_uncore_pci_disable_box, \
- .enable_box = snbep_uncore_pci_enable_box, \
- .disable_event = snbep_uncore_pci_disable_event, \
- .read_counter = snbep_uncore_pci_read_counter
-
-static struct intel_uncore_ops snbep_uncore_pci_ops = {
- SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
- .enable_event = snbep_uncore_pci_enable_event, \
-};
-
-static struct event_constraint snbep_uncore_cbox_constraints[] = {
- UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
- UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
- UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
- UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
- EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
- UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
- UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
- UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
- EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snbep_uncore_ubox = {
- .name = "ubox",
- .num_counters = 2,
- .num_boxes = 1,
- .perf_ctr_bits = 44,
- .fixed_ctr_bits = 48,
- .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
- .event_ctl = SNBEP_U_MSR_PMON_CTL0,
- .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
- .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
- .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
- .ops = &snbep_uncore_msr_ops,
- .format_group = &snbep_uncore_ubox_format_group,
-};
-
-static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
- SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
- SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
- EVENT_EXTRA_END
-};
-
-static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct intel_uncore_extra_reg *er = &box->shared_regs[0];
- int i;
-
- if (uncore_box_is_fake(box))
- return;
-
- for (i = 0; i < 5; i++) {
- if (reg1->alloc & (0x1 << i))
- atomic_sub(1 << (i * 6), &er->ref);
- }
- reg1->alloc = 0;
-}
-
-static struct event_constraint *
-__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
- u64 (*cbox_filter_mask)(int fields))
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct intel_uncore_extra_reg *er = &box->shared_regs[0];
- int i, alloc = 0;
- unsigned long flags;
- u64 mask;
-
- if (reg1->idx == EXTRA_REG_NONE)
- return NULL;
-
- raw_spin_lock_irqsave(&er->lock, flags);
- for (i = 0; i < 5; i++) {
- if (!(reg1->idx & (0x1 << i)))
- continue;
- if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
- continue;
-
- mask = cbox_filter_mask(0x1 << i);
- if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
- !((reg1->config ^ er->config) & mask)) {
- atomic_add(1 << (i * 6), &er->ref);
- er->config &= ~mask;
- er->config |= reg1->config & mask;
- alloc |= (0x1 << i);
- } else {
- break;
- }
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
- if (i < 5)
- goto fail;
-
- if (!uncore_box_is_fake(box))
- reg1->alloc |= alloc;
-
- return NULL;
-fail:
- for (; i >= 0; i--) {
- if (alloc & (0x1 << i))
- atomic_sub(1 << (i * 6), &er->ref);
- }
- return &constraint_empty;
-}
-
-static u64 snbep_cbox_filter_mask(int fields)
-{
- u64 mask = 0;
-
- if (fields & 0x1)
- mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
- if (fields & 0x2)
- mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
- if (fields & 0x4)
- mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
- if (fields & 0x8)
- mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
-
- return mask;
-}
-
-static struct event_constraint *
-snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
-}
-
-static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct extra_reg *er;
- int idx = 0;
-
- for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
- if (er->event != (event->hw.config & er->config_mask))
- continue;
- idx |= er->idx;
- }
-
- if (idx) {
- reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
- SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
- reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
- reg1->idx = idx;
- }
- return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_cbox_ops = {
- SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
- .hw_config = snbep_cbox_hw_config,
- .get_constraint = snbep_cbox_get_constraint,
- .put_constraint = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_cbox = {
- .name = "cbox",
- .num_counters = 4,
- .num_boxes = 8,
- .perf_ctr_bits = 44,
- .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
- .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
- .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
- .msr_offset = SNBEP_CBO_MSR_OFFSET,
- .num_shared_regs = 1,
- .constraints = snbep_uncore_cbox_constraints,
- .ops = &snbep_uncore_cbox_ops,
- .format_group = &snbep_uncore_cbox_format_group,
-};
-
-static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- u64 config = reg1->config;
-
- if (new_idx > reg1->idx)
- config <<= 8 * (new_idx - reg1->idx);
- else
- config >>= 8 * (reg1->idx - new_idx);
-
- if (modify) {
- hwc->config += new_idx - reg1->idx;
- reg1->config = config;
- reg1->idx = new_idx;
- }
- return config;
-}
-
-static struct event_constraint *
-snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct intel_uncore_extra_reg *er = &box->shared_regs[0];
- unsigned long flags;
- int idx = reg1->idx;
- u64 mask, config1 = reg1->config;
- bool ok = false;
-
- if (reg1->idx == EXTRA_REG_NONE ||
- (!uncore_box_is_fake(box) && reg1->alloc))
- return NULL;
-again:
- mask = 0xffULL << (idx * 8);
- raw_spin_lock_irqsave(&er->lock, flags);
- if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
- !((config1 ^ er->config) & mask)) {
- atomic_add(1 << (idx * 8), &er->ref);
- er->config &= ~mask;
- er->config |= config1 & mask;
- ok = true;
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- if (!ok) {
- idx = (idx + 1) % 4;
- if (idx != reg1->idx) {
- config1 = snbep_pcu_alter_er(event, idx, false);
- goto again;
- }
- return &constraint_empty;
- }
-
- if (!uncore_box_is_fake(box)) {
- if (idx != reg1->idx)
- snbep_pcu_alter_er(event, idx, true);
- reg1->alloc = 1;
- }
- return NULL;
-}
-
-static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-
- if (uncore_box_is_fake(box) || !reg1->alloc)
- return;
-
- atomic_sub(1 << (reg1->idx * 8), &er->ref);
- reg1->alloc = 0;
-}
-
-static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
-
- if (ev_sel >= 0xb && ev_sel <= 0xe) {
- reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
- reg1->idx = ev_sel - 0xb;
- reg1->config = event->attr.config1 & (0xff << reg1->idx);
- }
- return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_pcu_ops = {
- SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
- .hw_config = snbep_pcu_hw_config,
- .get_constraint = snbep_pcu_get_constraint,
- .put_constraint = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_pcu = {
- .name = "pcu",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
- .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
- .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
- .num_shared_regs = 1,
- .ops = &snbep_uncore_pcu_ops,
- .format_group = &snbep_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *snbep_msr_uncores[] = {
- &snbep_uncore_ubox,
- &snbep_uncore_cbox,
- &snbep_uncore_pcu,
- NULL,
-};
-
-enum {
- SNBEP_PCI_QPI_PORT0_FILTER,
- SNBEP_PCI_QPI_PORT1_FILTER,
-};
-
-static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
- if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
- reg1->idx = 0;
- reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
- reg1->config = event->attr.config1;
- reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
- reg2->config = event->attr.config2;
- }
- return 0;
-}
-
-static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
- if (reg1->idx != EXTRA_REG_NONE) {
- int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
- struct pci_dev *filter_pdev = extra_pci_dev[box->phys_id][idx];
- WARN_ON_ONCE(!filter_pdev);
- if (filter_pdev) {
- pci_write_config_dword(filter_pdev, reg1->reg,
- (u32)reg1->config);
- pci_write_config_dword(filter_pdev, reg1->reg + 4,
- (u32)(reg1->config >> 32));
- pci_write_config_dword(filter_pdev, reg2->reg,
- (u32)reg2->config);
- pci_write_config_dword(filter_pdev, reg2->reg + 4,
- (u32)(reg2->config >> 32));
- }
- }
-
- pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops snbep_uncore_qpi_ops = {
- SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
- .enable_event = snbep_qpi_enable_event,
- .hw_config = snbep_qpi_hw_config,
- .get_constraint = uncore_get_constraint,
- .put_constraint = uncore_put_constraint,
-};
-
-#define SNBEP_UNCORE_PCI_COMMON_INIT() \
- .perf_ctr = SNBEP_PCI_PMON_CTR0, \
- .event_ctl = SNBEP_PCI_PMON_CTL0, \
- .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
- .ops = &snbep_uncore_pci_ops, \
- .format_group = &snbep_uncore_format_group
-
-static struct intel_uncore_type snbep_uncore_ha = {
- .name = "ha",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_imc = {
- .name = "imc",
- .num_counters = 4,
- .num_boxes = 4,
- .perf_ctr_bits = 48,
- .fixed_ctr_bits = 48,
- .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
- .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
- .event_descs = snbep_uncore_imc_events,
- SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_qpi = {
- .name = "qpi",
- .num_counters = 4,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCI_PMON_CTR0,
- .event_ctl = SNBEP_PCI_PMON_CTL0,
- .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
- .num_shared_regs = 1,
- .ops = &snbep_uncore_qpi_ops,
- .event_descs = snbep_uncore_qpi_events,
- .format_group = &snbep_uncore_qpi_format_group,
-};
-
-
-static struct intel_uncore_type snbep_uncore_r2pcie = {
- .name = "r2pcie",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 44,
- .constraints = snbep_uncore_r2pcie_constraints,
- SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_r3qpi = {
- .name = "r3qpi",
- .num_counters = 3,
- .num_boxes = 2,
- .perf_ctr_bits = 44,
- .constraints = snbep_uncore_r3qpi_constraints,
- SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
- SNBEP_PCI_UNCORE_HA,
- SNBEP_PCI_UNCORE_IMC,
- SNBEP_PCI_UNCORE_QPI,
- SNBEP_PCI_UNCORE_R2PCIE,
- SNBEP_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *snbep_pci_uncores[] = {
- [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha,
- [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc,
- [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi,
- [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie,
- [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi,
- NULL,
-};
-
-static const struct pci_device_id snbep_uncore_pci_ids[] = {
- { /* Home Agent */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
- },
- { /* MC Channel 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
- },
- { /* MC Channel 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
- },
- { /* MC Channel 2 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
- },
- { /* MC Channel 3 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
- },
- { /* QPI Port 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
- },
- { /* QPI Port 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
- },
- { /* R2PCIe */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
- },
- { /* R3QPI Link 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
- },
- { /* R3QPI Link 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
- },
- { /* QPI Port 0 filter */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- SNBEP_PCI_QPI_PORT0_FILTER),
- },
- { /* QPI Port 0 filter */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- SNBEP_PCI_QPI_PORT1_FILTER),
- },
- { /* end: all zeroes */ }
-};
-
-static struct pci_driver snbep_uncore_pci_driver = {
- .name = "snbep_uncore",
- .id_table = snbep_uncore_pci_ids,
-};
-
-/*
- * build pci bus to socket mapping
- */
-static int snbep_pci2phy_map_init(int devid)
-{
- struct pci_dev *ubox_dev = NULL;
- int i, bus, nodeid;
- int err = 0;
- u32 config = 0;
-
- while (1) {
- /* find the UBOX device */
- ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
- if (!ubox_dev)
- break;
- bus = ubox_dev->bus->number;
- /* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, 0x40, &config);
- if (err)
- break;
- nodeid = config;
- /* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, 0x54, &config);
- if (err)
- break;
- /*
- * every three bits in the Node ID mapping register maps
- * to a particular node.
- */
- for (i = 0; i < 8; i++) {
- if (nodeid == ((config >> (3 * i)) & 0x7)) {
- pcibus_to_physid[bus] = i;
- break;
- }
- }
- }
-
- if (!err) {
- /*
- * For PCI bus with no UBOX device, find the next bus
- * that has UBOX device and use its mapping.
- */
- i = -1;
- for (bus = 255; bus >= 0; bus--) {
- if (pcibus_to_physid[bus] >= 0)
- i = pcibus_to_physid[bus];
- else
- pcibus_to_physid[bus] = i;
- }
- }
-
- if (ubox_dev)
- pci_dev_put(ubox_dev);
-
- return err ? pcibios_err_to_errno(err) : 0;
-}
-/* end of Sandy Bridge-EP uncore support */
-
-/* IvyTown uncore support */
-static void ivt_uncore_msr_init_box(struct intel_uncore_box *box)
-{
- unsigned msr = uncore_msr_box_ctl(box);
- if (msr)
- wrmsrl(msr, IVT_PMON_BOX_CTL_INT);
-}
-
-static void ivt_uncore_pci_init_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
-
- pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVT_PMON_BOX_CTL_INT);
-}
-
-#define IVT_UNCORE_MSR_OPS_COMMON_INIT() \
- .init_box = ivt_uncore_msr_init_box, \
- .disable_box = snbep_uncore_msr_disable_box, \
- .enable_box = snbep_uncore_msr_enable_box, \
- .disable_event = snbep_uncore_msr_disable_event, \
- .enable_event = snbep_uncore_msr_enable_event, \
- .read_counter = uncore_msr_read_counter
-
-static struct intel_uncore_ops ivt_uncore_msr_ops = {
- IVT_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-static struct intel_uncore_ops ivt_uncore_pci_ops = {
- .init_box = ivt_uncore_pci_init_box,
- .disable_box = snbep_uncore_pci_disable_box,
- .enable_box = snbep_uncore_pci_enable_box,
- .disable_event = snbep_uncore_pci_disable_event,
- .enable_event = snbep_uncore_pci_enable_event,
- .read_counter = snbep_uncore_pci_read_counter,
-};
-
-#define IVT_UNCORE_PCI_COMMON_INIT() \
- .perf_ctr = SNBEP_PCI_PMON_CTR0, \
- .event_ctl = SNBEP_PCI_PMON_CTL0, \
- .event_mask = IVT_PMON_RAW_EVENT_MASK, \
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
- .ops = &ivt_uncore_pci_ops, \
- .format_group = &ivt_uncore_format_group
-
-static struct attribute *ivt_uncore_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- NULL,
-};
-
-static struct attribute *ivt_uncore_ubox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh5.attr,
- NULL,
-};
-
-static struct attribute *ivt_uncore_cbox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_tid_en.attr,
- &format_attr_thresh8.attr,
- &format_attr_filter_tid.attr,
- &format_attr_filter_link.attr,
- &format_attr_filter_state2.attr,
- &format_attr_filter_nid2.attr,
- &format_attr_filter_opc2.attr,
- NULL,
-};
-
-static struct attribute *ivt_uncore_pcu_formats_attr[] = {
- &format_attr_event_ext.attr,
- &format_attr_occ_sel.attr,
- &format_attr_edge.attr,
- &format_attr_thresh5.attr,
- &format_attr_occ_invert.attr,
- &format_attr_occ_edge.attr,
- &format_attr_filter_band0.attr,
- &format_attr_filter_band1.attr,
- &format_attr_filter_band2.attr,
- &format_attr_filter_band3.attr,
- NULL,
-};
-
-static struct attribute *ivt_uncore_qpi_formats_attr[] = {
- &format_attr_event_ext.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_thresh8.attr,
- &format_attr_match_rds.attr,
- &format_attr_match_rnid30.attr,
- &format_attr_match_rnid4.attr,
- &format_attr_match_dnid.attr,
- &format_attr_match_mc.attr,
- &format_attr_match_opc.attr,
- &format_attr_match_vnw.attr,
- &format_attr_match0.attr,
- &format_attr_match1.attr,
- &format_attr_mask_rds.attr,
- &format_attr_mask_rnid30.attr,
- &format_attr_mask_rnid4.attr,
- &format_attr_mask_dnid.attr,
- &format_attr_mask_mc.attr,
- &format_attr_mask_opc.attr,
- &format_attr_mask_vnw.attr,
- &format_attr_mask0.attr,
- &format_attr_mask1.attr,
- NULL,
-};
-
-static struct attribute_group ivt_uncore_format_group = {
- .name = "format",
- .attrs = ivt_uncore_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_ubox_format_group = {
- .name = "format",
- .attrs = ivt_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_cbox_format_group = {
- .name = "format",
- .attrs = ivt_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_pcu_format_group = {
- .name = "format",
- .attrs = ivt_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_qpi_format_group = {
- .name = "format",
- .attrs = ivt_uncore_qpi_formats_attr,
-};
-
-static struct intel_uncore_type ivt_uncore_ubox = {
- .name = "ubox",
- .num_counters = 2,
- .num_boxes = 1,
- .perf_ctr_bits = 44,
- .fixed_ctr_bits = 48,
- .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
- .event_ctl = SNBEP_U_MSR_PMON_CTL0,
- .event_mask = IVT_U_MSR_PMON_RAW_EVENT_MASK,
- .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
- .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
- .ops = &ivt_uncore_msr_ops,
- .format_group = &ivt_uncore_ubox_format_group,
-};
-
-static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
- SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
- SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
- SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
-
- SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
- EVENT_EXTRA_END
-};
-
-static u64 ivt_cbox_filter_mask(int fields)
-{
- u64 mask = 0;
-
- if (fields & 0x1)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_TID;
- if (fields & 0x2)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_LINK;
- if (fields & 0x4)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_STATE;
- if (fields & 0x8)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_NID;
- if (fields & 0x10)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_OPC;
-
- return mask;
-}
-
-static struct event_constraint *
-ivt_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- return __snbep_cbox_get_constraint(box, event, ivt_cbox_filter_mask);
-}
-
-static int ivt_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct extra_reg *er;
- int idx = 0;
-
- for (er = ivt_uncore_cbox_extra_regs; er->msr; er++) {
- if (er->event != (event->hw.config & er->config_mask))
- continue;
- idx |= er->idx;
- }
-
- if (idx) {
- reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
- SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
- reg1->config = event->attr.config1 & ivt_cbox_filter_mask(idx);
- reg1->idx = idx;
- }
- return 0;
-}
-
-static void ivt_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
- if (reg1->idx != EXTRA_REG_NONE) {
- u64 filter = uncore_shared_reg_config(box, 0);
- wrmsrl(reg1->reg, filter & 0xffffffff);
- wrmsrl(reg1->reg + 6, filter >> 32);
- }
-
- wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops ivt_uncore_cbox_ops = {
- .init_box = ivt_uncore_msr_init_box,
- .disable_box = snbep_uncore_msr_disable_box,
- .enable_box = snbep_uncore_msr_enable_box,
- .disable_event = snbep_uncore_msr_disable_event,
- .enable_event = ivt_cbox_enable_event,
- .read_counter = uncore_msr_read_counter,
- .hw_config = ivt_cbox_hw_config,
- .get_constraint = ivt_cbox_get_constraint,
- .put_constraint = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type ivt_uncore_cbox = {
- .name = "cbox",
- .num_counters = 4,
- .num_boxes = 15,
- .perf_ctr_bits = 44,
- .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
- .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
- .event_mask = IVT_CBO_MSR_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
- .msr_offset = SNBEP_CBO_MSR_OFFSET,
- .num_shared_regs = 1,
- .constraints = snbep_uncore_cbox_constraints,
- .ops = &ivt_uncore_cbox_ops,
- .format_group = &ivt_uncore_cbox_format_group,
-};
-
-static struct intel_uncore_ops ivt_uncore_pcu_ops = {
- IVT_UNCORE_MSR_OPS_COMMON_INIT(),
- .hw_config = snbep_pcu_hw_config,
- .get_constraint = snbep_pcu_get_constraint,
- .put_constraint = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type ivt_uncore_pcu = {
- .name = "pcu",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
- .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
- .event_mask = IVT_PCU_MSR_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
- .num_shared_regs = 1,
- .ops = &ivt_uncore_pcu_ops,
- .format_group = &ivt_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *ivt_msr_uncores[] = {
- &ivt_uncore_ubox,
- &ivt_uncore_cbox,
- &ivt_uncore_pcu,
- NULL,
-};
-
-static struct intel_uncore_type ivt_uncore_ha = {
- .name = "ha",
- .num_counters = 4,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivt_uncore_imc = {
- .name = "imc",
- .num_counters = 4,
- .num_boxes = 8,
- .perf_ctr_bits = 48,
- .fixed_ctr_bits = 48,
- .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
- .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
- IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-/* registers in IRP boxes are not properly aligned */
-static unsigned ivt_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
-static unsigned ivt_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
-
-static void ivt_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
-
- pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx],
- hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void ivt_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
-
- pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], hwc->config);
-}
-
-static u64 ivt_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
- u64 count = 0;
-
- pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
- pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
-
- return count;
-}
-
-static struct intel_uncore_ops ivt_uncore_irp_ops = {
- .init_box = ivt_uncore_pci_init_box,
- .disable_box = snbep_uncore_pci_disable_box,
- .enable_box = snbep_uncore_pci_enable_box,
- .disable_event = ivt_uncore_irp_disable_event,
- .enable_event = ivt_uncore_irp_enable_event,
- .read_counter = ivt_uncore_irp_read_counter,
-};
-
-static struct intel_uncore_type ivt_uncore_irp = {
- .name = "irp",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .event_mask = IVT_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
- .ops = &ivt_uncore_irp_ops,
- .format_group = &ivt_uncore_format_group,
-};
-
-static struct intel_uncore_ops ivt_uncore_qpi_ops = {
- .init_box = ivt_uncore_pci_init_box,
- .disable_box = snbep_uncore_pci_disable_box,
- .enable_box = snbep_uncore_pci_enable_box,
- .disable_event = snbep_uncore_pci_disable_event,
- .enable_event = snbep_qpi_enable_event,
- .read_counter = snbep_uncore_pci_read_counter,
- .hw_config = snbep_qpi_hw_config,
- .get_constraint = uncore_get_constraint,
- .put_constraint = uncore_put_constraint,
-};
-
-static struct intel_uncore_type ivt_uncore_qpi = {
- .name = "qpi",
- .num_counters = 4,
- .num_boxes = 3,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCI_PMON_CTR0,
- .event_ctl = SNBEP_PCI_PMON_CTL0,
- .event_mask = IVT_QPI_PCI_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
- .num_shared_regs = 1,
- .ops = &ivt_uncore_qpi_ops,
- .format_group = &ivt_uncore_qpi_format_group,
-};
-
-static struct intel_uncore_type ivt_uncore_r2pcie = {
- .name = "r2pcie",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 44,
- .constraints = snbep_uncore_r2pcie_constraints,
- IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivt_uncore_r3qpi = {
- .name = "r3qpi",
- .num_counters = 3,
- .num_boxes = 2,
- .perf_ctr_bits = 44,
- .constraints = snbep_uncore_r3qpi_constraints,
- IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
- IVT_PCI_UNCORE_HA,
- IVT_PCI_UNCORE_IMC,
- IVT_PCI_UNCORE_IRP,
- IVT_PCI_UNCORE_QPI,
- IVT_PCI_UNCORE_R2PCIE,
- IVT_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *ivt_pci_uncores[] = {
- [IVT_PCI_UNCORE_HA] = &ivt_uncore_ha,
- [IVT_PCI_UNCORE_IMC] = &ivt_uncore_imc,
- [IVT_PCI_UNCORE_IRP] = &ivt_uncore_irp,
- [IVT_PCI_UNCORE_QPI] = &ivt_uncore_qpi,
- [IVT_PCI_UNCORE_R2PCIE] = &ivt_uncore_r2pcie,
- [IVT_PCI_UNCORE_R3QPI] = &ivt_uncore_r3qpi,
- NULL,
-};
-
-static const struct pci_device_id ivt_uncore_pci_ids[] = {
- { /* Home Agent 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0),
- },
- { /* Home Agent 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 1),
- },
- { /* MC0 Channel 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 0),
- },
- { /* MC0 Channel 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 1),
- },
- { /* MC0 Channel 3 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 2),
- },
- { /* MC0 Channel 4 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 3),
- },
- { /* MC1 Channel 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 4),
- },
- { /* MC1 Channel 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 5),
- },
- { /* MC1 Channel 3 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 6),
- },
- { /* MC1 Channel 4 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7),
- },
- { /* IRP */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IRP, 0),
- },
- { /* QPI0 Port 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0),
- },
- { /* QPI0 Port 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 1),
- },
- { /* QPI1 Port 2 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 2),
- },
- { /* R2PCIe */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R2PCIE, 0),
- },
- { /* R3QPI0 Link 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 0),
- },
- { /* R3QPI0 Link 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 1),
- },
- { /* R3QPI1 Link 2 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2),
- },
- { /* QPI Port 0 filter */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- SNBEP_PCI_QPI_PORT0_FILTER),
- },
- { /* QPI Port 0 filter */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- SNBEP_PCI_QPI_PORT1_FILTER),
- },
- { /* end: all zeroes */ }
-};
-
-static struct pci_driver ivt_uncore_pci_driver = {
- .name = "ivt_uncore",
- .id_table = ivt_uncore_pci_ids,
-};
-/* end of IvyTown uncore support */
-
-/* Sandy Bridge uncore support */
-static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (hwc->idx < UNCORE_PMC_IDX_FIXED)
- wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
- else
- wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
-}
-
-static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- wrmsrl(event->hw.config_base, 0);
-}
-
-static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
-{
- if (box->pmu->pmu_idx == 0) {
- wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
- SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
- }
-}
-
-static struct uncore_event_desc snb_uncore_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
- { /* end: all zeroes */ },
-};
-
-static struct attribute *snb_uncore_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_cmask5.attr,
- NULL,
-};
-
-static struct attribute_group snb_uncore_format_group = {
- .name = "format",
- .attrs = snb_uncore_formats_attr,
-};
-
-static struct intel_uncore_ops snb_uncore_msr_ops = {
- .init_box = snb_uncore_msr_init_box,
- .disable_event = snb_uncore_msr_disable_event,
- .enable_event = snb_uncore_msr_enable_event,
- .read_counter = uncore_msr_read_counter,
-};
-
-static struct event_constraint snb_uncore_cbox_constraints[] = {
- UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
- EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snb_uncore_cbox = {
- .name = "cbox",
- .num_counters = 2,
- .num_boxes = 4,
- .perf_ctr_bits = 44,
- .fixed_ctr_bits = 48,
- .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
- .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
- .fixed_ctr = SNB_UNC_FIXED_CTR,
- .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
- .single_fixed = 1,
- .event_mask = SNB_UNC_RAW_EVENT_MASK,
- .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
- .constraints = snb_uncore_cbox_constraints,
- .ops = &snb_uncore_msr_ops,
- .format_group = &snb_uncore_format_group,
- .event_descs = snb_uncore_events,
-};
-
-static struct intel_uncore_type *snb_msr_uncores[] = {
- &snb_uncore_cbox,
- NULL,
-};
-
-enum {
- SNB_PCI_UNCORE_IMC,
-};
-
-static struct uncore_event_desc snb_uncore_imc_events[] = {
- INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"),
- INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
- INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
-
- INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
- INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
- INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
-
- { /* end: all zeroes */ },
-};
-
-#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff
-#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48
-
-/* page size multiple covering all config regs */
-#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000
-
-#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1
-#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050
-#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2
-#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054
-#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE
-
-static struct attribute *snb_uncore_imc_formats_attr[] = {
- &format_attr_event.attr,
- NULL,
-};
-
-static struct attribute_group snb_uncore_imc_format_group = {
- .name = "format",
- .attrs = snb_uncore_imc_formats_attr,
-};
-
-static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
- int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
- resource_size_t addr;
- u32 pci_dword;
-
- pci_read_config_dword(pdev, where, &pci_dword);
- addr = pci_dword;
-
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
- pci_read_config_dword(pdev, where + 4, &pci_dword);
- addr |= ((resource_size_t)pci_dword << 32);
-#endif
-
- addr &= ~(PAGE_SIZE - 1);
-
- box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
- box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
-}
-
-static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
-{}
-
-static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
-{}
-
-static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{}
-
-static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{}
-
-static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
-}
-
-/*
- * custom event_init() function because we define our own fixed, free
- * running counters, so we do not want to conflict with generic uncore
- * logic. Also simplifies processing
- */
-static int snb_uncore_imc_event_init(struct perf_event *event)
-{
- struct intel_uncore_pmu *pmu;
- struct intel_uncore_box *box;
- struct hw_perf_event *hwc = &event->hw;
- u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
- int idx, base;
-
- if (event->attr.type != event->pmu->type)
- return -ENOENT;
-
- pmu = uncore_event_to_pmu(event);
- /* no device found for this pmu */
- if (pmu->func_id < 0)
- return -ENOENT;
-
- /* Sampling not supported yet */
- if (hwc->sample_period)
- return -EINVAL;
-
- /* unsupported modes and filters */
- if (event->attr.exclude_user ||
- event->attr.exclude_kernel ||
- event->attr.exclude_hv ||
- event->attr.exclude_idle ||
- event->attr.exclude_host ||
- event->attr.exclude_guest ||
- event->attr.sample_period) /* no sampling */
- return -EINVAL;
-
- /*
- * Place all uncore events for a particular physical package
- * onto a single cpu
- */
- if (event->cpu < 0)
- return -EINVAL;
-
- /* check only supported bits are set */
- if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
- return -EINVAL;
-
- box = uncore_pmu_to_box(pmu, event->cpu);
- if (!box || box->cpu < 0)
- return -EINVAL;
-
- event->cpu = box->cpu;
-
- event->hw.idx = -1;
- event->hw.last_tag = ~0ULL;
- event->hw.extra_reg.idx = EXTRA_REG_NONE;
- event->hw.branch_reg.idx = EXTRA_REG_NONE;
- /*
- * check event is known (whitelist, determines counter)
- */
- switch (cfg) {
- case SNB_UNCORE_PCI_IMC_DATA_READS:
- base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
- idx = UNCORE_PMC_IDX_FIXED;
- break;
- case SNB_UNCORE_PCI_IMC_DATA_WRITES:
- base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
- idx = UNCORE_PMC_IDX_FIXED + 1;
- break;
- default:
- return -EINVAL;
- }
-
- /* must be done before validate_group */
- event->hw.event_base = base;
- event->hw.config = cfg;
- event->hw.idx = idx;
-
- /* no group validation needed, we have free running counters */
-
- return 0;
-}
-
-static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- return 0;
-}
-
-static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- u64 count;
-
- if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
- return;
-
- event->hw.state = 0;
- box->n_active++;
-
- list_add_tail(&event->active_entry, &box->active_list);
-
- count = snb_uncore_imc_read_counter(box, event);
- local64_set(&event->hw.prev_count, count);
-
- if (box->n_active == 1)
- uncore_pmu_start_hrtimer(box);
-}
-
-static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- struct hw_perf_event *hwc = &event->hw;
-
- if (!(hwc->state & PERF_HES_STOPPED)) {
- box->n_active--;
-
- WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
- hwc->state |= PERF_HES_STOPPED;
-
- list_del(&event->active_entry);
-
- if (box->n_active == 0)
- uncore_pmu_cancel_hrtimer(box);
- }
-
- if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
- /*
- * Drain the remaining delta count out of a event
- * that we are disabling:
- */
- uncore_perf_event_update(box, event);
- hwc->state |= PERF_HES_UPTODATE;
- }
-}
-
-static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- struct hw_perf_event *hwc = &event->hw;
-
- if (!box)
- return -ENODEV;
-
- hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
- if (!(flags & PERF_EF_START))
- hwc->state |= PERF_HES_ARCH;
-
- snb_uncore_imc_event_start(event, 0);
-
- box->n_events++;
-
- return 0;
-}
-
-static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- int i;
-
- snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
-
- for (i = 0; i < box->n_events; i++) {
- if (event == box->event_list[i]) {
- --box->n_events;
- break;
- }
- }
-}
-
-static int snb_pci2phy_map_init(int devid)
-{
- struct pci_dev *dev = NULL;
- int bus;
-
- dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
- if (!dev)
- return -ENOTTY;
-
- bus = dev->bus->number;
-
- pcibus_to_physid[bus] = 0;
-
- pci_dev_put(dev);
-
- return 0;
-}
-
-static struct pmu snb_uncore_imc_pmu = {
- .task_ctx_nr = perf_invalid_context,
- .event_init = snb_uncore_imc_event_init,
- .add = snb_uncore_imc_event_add,
- .del = snb_uncore_imc_event_del,
- .start = snb_uncore_imc_event_start,
- .stop = snb_uncore_imc_event_stop,
- .read = uncore_pmu_event_read,
-};
-
-static struct intel_uncore_ops snb_uncore_imc_ops = {
- .init_box = snb_uncore_imc_init_box,
- .enable_box = snb_uncore_imc_enable_box,
- .disable_box = snb_uncore_imc_disable_box,
- .disable_event = snb_uncore_imc_disable_event,
- .enable_event = snb_uncore_imc_enable_event,
- .hw_config = snb_uncore_imc_hw_config,
- .read_counter = snb_uncore_imc_read_counter,
-};
-
-static struct intel_uncore_type snb_uncore_imc = {
- .name = "imc",
- .num_counters = 2,
- .num_boxes = 1,
- .fixed_ctr_bits = 32,
- .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE,
- .event_descs = snb_uncore_imc_events,
- .format_group = &snb_uncore_imc_format_group,
- .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
- .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK,
- .ops = &snb_uncore_imc_ops,
- .pmu = &snb_uncore_imc_pmu,
-};
-
-static struct intel_uncore_type *snb_pci_uncores[] = {
- [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc,
- NULL,
-};
-
-static const struct pci_device_id snb_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* end: all zeroes */ },
-};
-
-static const struct pci_device_id ivb_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* end: all zeroes */ },
-};
-
-static const struct pci_device_id hsw_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* end: all zeroes */ },
-};
-
-static struct pci_driver snb_uncore_pci_driver = {
- .name = "snb_uncore",
- .id_table = snb_uncore_pci_ids,
-};
-
-static struct pci_driver ivb_uncore_pci_driver = {
- .name = "ivb_uncore",
- .id_table = ivb_uncore_pci_ids,
-};
-
-static struct pci_driver hsw_uncore_pci_driver = {
- .name = "hsw_uncore",
- .id_table = hsw_uncore_pci_ids,
-};
-
-/* end of Sandy Bridge uncore support */
-
-/* Nehalem uncore support */
-static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
- wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
-}
-
-static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
- wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
-}
-
-static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (hwc->idx < UNCORE_PMC_IDX_FIXED)
- wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
- else
- wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
-}
-
-static struct attribute *nhm_uncore_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_cmask8.attr,
- NULL,
-};
-
-static struct attribute_group nhm_uncore_format_group = {
- .name = "format",
- .attrs = nhm_uncore_formats_attr,
-};
-
-static struct uncore_event_desc nhm_uncore_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
- INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
- INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
- { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhm_uncore_msr_ops = {
- .disable_box = nhm_uncore_msr_disable_box,
- .enable_box = nhm_uncore_msr_enable_box,
- .disable_event = snb_uncore_msr_disable_event,
- .enable_event = nhm_uncore_msr_enable_event,
- .read_counter = uncore_msr_read_counter,
-};
-
-static struct intel_uncore_type nhm_uncore = {
- .name = "",
- .num_counters = 8,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .fixed_ctr_bits = 48,
- .event_ctl = NHM_UNC_PERFEVTSEL0,
- .perf_ctr = NHM_UNC_UNCORE_PMC0,
- .fixed_ctr = NHM_UNC_FIXED_CTR,
- .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
- .event_mask = NHM_UNC_RAW_EVENT_MASK,
- .event_descs = nhm_uncore_events,
- .ops = &nhm_uncore_msr_ops,
- .format_group = &nhm_uncore_format_group,
-};
-
-static struct intel_uncore_type *nhm_msr_uncores[] = {
- &nhm_uncore,
- NULL,
-};
-/* end of Nehalem uncore support */
-
-/* Nehalem-EX uncore support */
-DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
-DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
-DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
-
-static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
-{
- wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
-}
-
-static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
- unsigned msr = uncore_msr_box_ctl(box);
- u64 config;
-
- if (msr) {
- rdmsrl(msr, config);
- config &= ~((1ULL << uncore_num_counters(box)) - 1);
- /* WBox has a fixed counter */
- if (uncore_msr_fixed_ctl(box))
- config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
- wrmsrl(msr, config);
- }
-}
-
-static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
- unsigned msr = uncore_msr_box_ctl(box);
- u64 config;
-
- if (msr) {
- rdmsrl(msr, config);
- config |= (1ULL << uncore_num_counters(box)) - 1;
- /* WBox has a fixed counter */
- if (uncore_msr_fixed_ctl(box))
- config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
- wrmsrl(msr, config);
- }
-}
-
-static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- wrmsrl(event->hw.config_base, 0);
-}
-
-static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
- wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
- else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
- wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
- else
- wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-#define NHMEX_UNCORE_OPS_COMMON_INIT() \
- .init_box = nhmex_uncore_msr_init_box, \
- .disable_box = nhmex_uncore_msr_disable_box, \
- .enable_box = nhmex_uncore_msr_enable_box, \
- .disable_event = nhmex_uncore_msr_disable_event, \
- .read_counter = uncore_msr_read_counter
-
-static struct intel_uncore_ops nhmex_uncore_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_uncore_msr_enable_event,
-};
-
-static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_edge.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_ubox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_ubox_formats_attr,
-};
-
-static struct intel_uncore_type nhmex_uncore_ubox = {
- .name = "ubox",
- .num_counters = 1,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_U_MSR_PMON_EV_SEL,
- .perf_ctr = NHMEX_U_MSR_PMON_CTR,
- .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL,
- .ops = &nhmex_uncore_ops,
- .format_group = &nhmex_uncore_ubox_format_group
-};
-
-static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_cbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_cbox_formats_attr,
-};
-
-/* msr offset for each instance of cbox */
-static unsigned nhmex_cbox_msr_offsets[] = {
- 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
-};
-
-static struct intel_uncore_type nhmex_uncore_cbox = {
- .name = "cbox",
- .num_counters = 6,
- .num_boxes = 10,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0,
- .perf_ctr = NHMEX_C0_MSR_PMON_CTR0,
- .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
- .msr_offsets = nhmex_cbox_msr_offsets,
- .pair_ctr_ctl = 1,
- .ops = &nhmex_uncore_ops,
- .format_group = &nhmex_uncore_cbox_format_group
-};
-
-static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
- { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_type nhmex_uncore_wbox = {
- .name = "wbox",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_W_MSR_PMON_CNT0,
- .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0,
- .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR,
- .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL,
- .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_W_MSR_GLOBAL_CTL,
- .pair_ctr_ctl = 1,
- .event_descs = nhmex_uncore_wbox_events,
- .ops = &nhmex_uncore_ops,
- .format_group = &nhmex_uncore_cbox_format_group
-};
-
-static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
- int ctr, ev_sel;
-
- ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
- NHMEX_B_PMON_CTR_SHIFT;
- ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
- NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
-
- /* events that do not use the match/mask registers */
- if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
- (ctr == 2 && ev_sel != 0x4) || ctr == 3)
- return 0;
-
- if (box->pmu->pmu_idx == 0)
- reg1->reg = NHMEX_B0_MSR_MATCH;
- else
- reg1->reg = NHMEX_B1_MSR_MATCH;
- reg1->idx = 0;
- reg1->config = event->attr.config1;
- reg2->config = event->attr.config2;
- return 0;
-}
-
-static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
- if (reg1->idx != EXTRA_REG_NONE) {
- wrmsrl(reg1->reg, reg1->config);
- wrmsrl(reg1->reg + 1, reg2->config);
- }
- wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
- (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
-}
-
-/*
- * The Bbox has 4 counters, but each counter monitors different events.
- * Use bits 6-7 in the event config to select counter.
- */
-static struct event_constraint nhmex_uncore_bbox_constraints[] = {
- EVENT_CONSTRAINT(0 , 1, 0xc0),
- EVENT_CONSTRAINT(0x40, 2, 0xc0),
- EVENT_CONSTRAINT(0x80, 4, 0xc0),
- EVENT_CONSTRAINT(0xc0, 8, 0xc0),
- EVENT_CONSTRAINT_END,
-};
-
-static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
- &format_attr_event5.attr,
- &format_attr_counter.attr,
- &format_attr_match.attr,
- &format_attr_mask.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_bbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_bbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_bbox_msr_enable_event,
- .hw_config = nhmex_bbox_hw_config,
- .get_constraint = uncore_get_constraint,
- .put_constraint = uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_bbox = {
- .name = "bbox",
- .num_counters = 4,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_B0_MSR_PMON_CTL0,
- .perf_ctr = NHMEX_B0_MSR_PMON_CTR0,
- .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
- .msr_offset = NHMEX_B_MSR_OFFSET,
- .pair_ctr_ctl = 1,
- .num_shared_regs = 1,
- .constraints = nhmex_uncore_bbox_constraints,
- .ops = &nhmex_uncore_bbox_ops,
- .format_group = &nhmex_uncore_bbox_format_group
-};
-
-static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
- /* only TO_R_PROG_EV event uses the match/mask register */
- if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
- NHMEX_S_EVENT_TO_R_PROG_EV)
- return 0;
-
- if (box->pmu->pmu_idx == 0)
- reg1->reg = NHMEX_S0_MSR_MM_CFG;
- else
- reg1->reg = NHMEX_S1_MSR_MM_CFG;
- reg1->idx = 0;
- reg1->config = event->attr.config1;
- reg2->config = event->attr.config2;
- return 0;
-}
-
-static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
- if (reg1->idx != EXTRA_REG_NONE) {
- wrmsrl(reg1->reg, 0);
- wrmsrl(reg1->reg + 1, reg1->config);
- wrmsrl(reg1->reg + 2, reg2->config);
- wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
- }
- wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
-}
-
-static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- &format_attr_match.attr,
- &format_attr_mask.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_sbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_sbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_sbox_msr_enable_event,
- .hw_config = nhmex_sbox_hw_config,
- .get_constraint = uncore_get_constraint,
- .put_constraint = uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_sbox = {
- .name = "sbox",
- .num_counters = 4,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_S0_MSR_PMON_CTL0,
- .perf_ctr = NHMEX_S0_MSR_PMON_CTR0,
- .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
- .msr_offset = NHMEX_S_MSR_OFFSET,
- .pair_ctr_ctl = 1,
- .num_shared_regs = 1,
- .ops = &nhmex_uncore_sbox_ops,
- .format_group = &nhmex_uncore_sbox_format_group
-};
-
-enum {
- EXTRA_REG_NHMEX_M_FILTER,
- EXTRA_REG_NHMEX_M_DSP,
- EXTRA_REG_NHMEX_M_ISS,
- EXTRA_REG_NHMEX_M_MAP,
- EXTRA_REG_NHMEX_M_MSC_THR,
- EXTRA_REG_NHMEX_M_PGT,
- EXTRA_REG_NHMEX_M_PLD,
- EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
-};
-
-static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
- MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
- MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
- MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
- MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
- /* event 0xa uses two extra registers */
- MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
- MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
- MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
- /* events 0xd ~ 0x10 use the same extra register */
- MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
- MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
- MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
- MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
- MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
- MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
- MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
- MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
- MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
- EVENT_EXTRA_END
-};
-
-/* Nehalem-EX or Westmere-EX ? */
-static bool uncore_nhmex;
-
-static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
-{
- struct intel_uncore_extra_reg *er;
- unsigned long flags;
- bool ret = false;
- u64 mask;
-
- if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
- er = &box->shared_regs[idx];
- raw_spin_lock_irqsave(&er->lock, flags);
- if (!atomic_read(&er->ref) || er->config == config) {
- atomic_inc(&er->ref);
- er->config = config;
- ret = true;
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- return ret;
- }
- /*
- * The ZDP_CTL_FVC MSR has 4 fields which are used to control
- * events 0xd ~ 0x10. Besides these 4 fields, there are additional
- * fields which are shared.
- */
- idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- if (WARN_ON_ONCE(idx >= 4))
- return false;
-
- /* mask of the shared fields */
- if (uncore_nhmex)
- mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
- else
- mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
- er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-
- raw_spin_lock_irqsave(&er->lock, flags);
- /* add mask of the non-shared field if it's in use */
- if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
- if (uncore_nhmex)
- mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- else
- mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- }
-
- if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
- atomic_add(1 << (idx * 8), &er->ref);
- if (uncore_nhmex)
- mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
- NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- else
- mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
- WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- er->config &= ~mask;
- er->config |= (config & mask);
- ret = true;
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- return ret;
-}
-
-static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
-{
- struct intel_uncore_extra_reg *er;
-
- if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
- er = &box->shared_regs[idx];
- atomic_dec(&er->ref);
- return;
- }
-
- idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
- atomic_sub(1 << (idx * 8), &er->ref);
-}
-
-static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
- u64 config = reg1->config;
-
- /* get the non-shared control bits and shift them */
- idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- if (uncore_nhmex)
- config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- else
- config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- if (new_idx > orig_idx) {
- idx = new_idx - orig_idx;
- config <<= 3 * idx;
- } else {
- idx = orig_idx - new_idx;
- config >>= 3 * idx;
- }
-
- /* add the shared control bits back */
- if (uncore_nhmex)
- config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
- else
- config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
- config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
- if (modify) {
- /* adjust the main event selector */
- if (new_idx > orig_idx)
- hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
- else
- hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
- reg1->config = config;
- reg1->idx = ~0xff | new_idx;
- }
- return config;
-}
-
-static struct event_constraint *
-nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
- int i, idx[2], alloc = 0;
- u64 config1 = reg1->config;
-
- idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
- idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
-again:
- for (i = 0; i < 2; i++) {
- if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
- idx[i] = 0xff;
-
- if (idx[i] == 0xff)
- continue;
-
- if (!nhmex_mbox_get_shared_reg(box, idx[i],
- __BITS_VALUE(config1, i, 32)))
- goto fail;
- alloc |= (0x1 << i);
- }
-
- /* for the match/mask registers */
- if (reg2->idx != EXTRA_REG_NONE &&
- (uncore_box_is_fake(box) || !reg2->alloc) &&
- !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
- goto fail;
-
- /*
- * If it's a fake box -- as per validate_{group,event}() we
- * shouldn't touch event state and we can avoid doing so
- * since both will only call get_event_constraints() once
- * on each event, this avoids the need for reg->alloc.
- */
- if (!uncore_box_is_fake(box)) {
- if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
- nhmex_mbox_alter_er(event, idx[0], true);
- reg1->alloc |= alloc;
- if (reg2->idx != EXTRA_REG_NONE)
- reg2->alloc = 1;
- }
- return NULL;
-fail:
- if (idx[0] != 0xff && !(alloc & 0x1) &&
- idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
- /*
- * events 0xd ~ 0x10 are functional identical, but are
- * controlled by different fields in the ZDP_CTL_FVC
- * register. If we failed to take one field, try the
- * rest 3 choices.
- */
- BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
- idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- idx[0] = (idx[0] + 1) % 4;
- idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
- config1 = nhmex_mbox_alter_er(event, idx[0], false);
- goto again;
- }
- }
-
- if (alloc & 0x1)
- nhmex_mbox_put_shared_reg(box, idx[0]);
- if (alloc & 0x2)
- nhmex_mbox_put_shared_reg(box, idx[1]);
- return &constraint_empty;
-}
-
-static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-
- if (uncore_box_is_fake(box))
- return;
-
- if (reg1->alloc & 0x1)
- nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
- if (reg1->alloc & 0x2)
- nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
- reg1->alloc = 0;
-
- if (reg2->alloc) {
- nhmex_mbox_put_shared_reg(box, reg2->idx);
- reg2->alloc = 0;
- }
-}
-
-static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
-{
- if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
- return er->idx;
- return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
-}
-
-static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct intel_uncore_type *type = box->pmu->type;
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
- struct extra_reg *er;
- unsigned msr;
- int reg_idx = 0;
- /*
- * The mbox events may require 2 extra MSRs at the most. But only
- * the lower 32 bits in these MSRs are significant, so we can use
- * config1 to pass two MSRs' config.
- */
- for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
- if (er->event != (event->hw.config & er->config_mask))
- continue;
- if (event->attr.config1 & ~er->valid_mask)
- return -EINVAL;
-
- msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
- if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
- return -EINVAL;
-
- /* always use the 32~63 bits to pass the PLD config */
- if (er->idx == EXTRA_REG_NHMEX_M_PLD)
- reg_idx = 1;
- else if (WARN_ON_ONCE(reg_idx > 0))
- return -EINVAL;
-
- reg1->idx &= ~(0xff << (reg_idx * 8));
- reg1->reg &= ~(0xffff << (reg_idx * 16));
- reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
- reg1->reg |= msr << (reg_idx * 16);
- reg1->config = event->attr.config1;
- reg_idx++;
- }
- /*
- * The mbox only provides ability to perform address matching
- * for the PLD events.
- */
- if (reg_idx == 2) {
- reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
- if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
- reg2->config = event->attr.config2;
- else
- reg2->config = ~0ULL;
- if (box->pmu->pmu_idx == 0)
- reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
- else
- reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
- }
- return 0;
-}
-
-static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
-{
- struct intel_uncore_extra_reg *er;
- unsigned long flags;
- u64 config;
-
- if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
- return box->shared_regs[idx].config;
-
- er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
- raw_spin_lock_irqsave(&er->lock, flags);
- config = er->config;
- raw_spin_unlock_irqrestore(&er->lock, flags);
- return config;
-}
-
-static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
- int idx;
-
- idx = __BITS_VALUE(reg1->idx, 0, 8);
- if (idx != 0xff)
- wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
- nhmex_mbox_shared_reg_config(box, idx));
- idx = __BITS_VALUE(reg1->idx, 1, 8);
- if (idx != 0xff)
- wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
- nhmex_mbox_shared_reg_config(box, idx));
-
- if (reg2->idx != EXTRA_REG_NONE) {
- wrmsrl(reg2->reg, 0);
- if (reg2->config != ~0ULL) {
- wrmsrl(reg2->reg + 1,
- reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
- wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
- (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
- wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
- }
- }
-
- wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3");
-DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5");
-DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6");
-DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7");
-DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13");
-DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21");
-DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63");
-DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33");
-DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61");
-DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63");
-
-static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
- &format_attr_count_mode.attr,
- &format_attr_storage_mode.attr,
- &format_attr_wrap_mode.attr,
- &format_attr_flag_mode.attr,
- &format_attr_inc_sel.attr,
- &format_attr_set_flag_sel.attr,
- &format_attr_filter_cfg_en.attr,
- &format_attr_filter_match.attr,
- &format_attr_filter_mask.attr,
- &format_attr_dsp.attr,
- &format_attr_thr.attr,
- &format_attr_fvc.attr,
- &format_attr_pgt.attr,
- &format_attr_map.attr,
- &format_attr_iss.attr,
- &format_attr_pld.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_mbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_mbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
- INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
- INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
- { /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
- INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
- INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
- { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_mbox_msr_enable_event,
- .hw_config = nhmex_mbox_hw_config,
- .get_constraint = nhmex_mbox_get_constraint,
- .put_constraint = nhmex_mbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_mbox = {
- .name = "mbox",
- .num_counters = 6,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_M0_MSR_PMU_CTL0,
- .perf_ctr = NHMEX_M0_MSR_PMU_CNT0,
- .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL,
- .msr_offset = NHMEX_M_MSR_OFFSET,
- .pair_ctr_ctl = 1,
- .num_shared_regs = 8,
- .event_descs = nhmex_uncore_mbox_events,
- .ops = &nhmex_uncore_mbox_ops,
- .format_group = &nhmex_uncore_mbox_format_group,
-};
-
-static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
- /* adjust the main event selector and extra register index */
- if (reg1->idx % 2) {
- reg1->idx--;
- hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
- } else {
- reg1->idx++;
- hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
- }
-
- /* adjust extra register config */
- switch (reg1->idx % 6) {
- case 2:
- /* shift the 8~15 bits to the 0~7 bits */
- reg1->config >>= 8;
- break;
- case 3:
- /* shift the 0~7 bits to the 8~15 bits */
- reg1->config <<= 8;
- break;
- };
-}
-
-/*
- * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7.
- * An event set consists of 6 events, the 3rd and 4th events in
- * an event set use the same extra register. So an event set uses
- * 5 extra registers.
- */
-static struct event_constraint *
-nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
- struct intel_uncore_extra_reg *er;
- unsigned long flags;
- int idx, er_idx;
- u64 config1;
- bool ok = false;
-
- if (!uncore_box_is_fake(box) && reg1->alloc)
- return NULL;
-
- idx = reg1->idx % 6;
- config1 = reg1->config;
-again:
- er_idx = idx;
- /* the 3rd and 4th events use the same extra register */
- if (er_idx > 2)
- er_idx--;
- er_idx += (reg1->idx / 6) * 5;
-
- er = &box->shared_regs[er_idx];
- raw_spin_lock_irqsave(&er->lock, flags);
- if (idx < 2) {
- if (!atomic_read(&er->ref) || er->config == reg1->config) {
- atomic_inc(&er->ref);
- er->config = reg1->config;
- ok = true;
- }
- } else if (idx == 2 || idx == 3) {
- /*
- * these two events use different fields in a extra register,
- * the 0~7 bits and the 8~15 bits respectively.
- */
- u64 mask = 0xff << ((idx - 2) * 8);
- if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
- !((er->config ^ config1) & mask)) {
- atomic_add(1 << ((idx - 2) * 8), &er->ref);
- er->config &= ~mask;
- er->config |= config1 & mask;
- ok = true;
- }
- } else {
- if (!atomic_read(&er->ref) ||
- (er->config == (hwc->config >> 32) &&
- er->config1 == reg1->config &&
- er->config2 == reg2->config)) {
- atomic_inc(&er->ref);
- er->config = (hwc->config >> 32);
- er->config1 = reg1->config;
- er->config2 = reg2->config;
- ok = true;
- }
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- if (!ok) {
- /*
- * The Rbox events are always in pairs. The paired
- * events are functional identical, but use different
- * extra registers. If we failed to take an extra
- * register, try the alternative.
- */
- idx ^= 1;
- if (idx != reg1->idx % 6) {
- if (idx == 2)
- config1 >>= 8;
- else if (idx == 3)
- config1 <<= 8;
- goto again;
- }
- } else {
- if (!uncore_box_is_fake(box)) {
- if (idx != reg1->idx % 6)
- nhmex_rbox_alter_er(box, event);
- reg1->alloc = 1;
- }
- return NULL;
- }
- return &constraint_empty;
-}
-
-static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct intel_uncore_extra_reg *er;
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- int idx, er_idx;
-
- if (uncore_box_is_fake(box) || !reg1->alloc)
- return;
-
- idx = reg1->idx % 6;
- er_idx = idx;
- if (er_idx > 2)
- er_idx--;
- er_idx += (reg1->idx / 6) * 5;
-
- er = &box->shared_regs[er_idx];
- if (idx == 2 || idx == 3)
- atomic_sub(1 << ((idx - 2) * 8), &er->ref);
- else
- atomic_dec(&er->ref);
-
- reg1->alloc = 0;
-}
-
-static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
- int idx;
-
- idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
- NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
- if (idx >= 0x18)
- return -EINVAL;
-
- reg1->idx = idx;
- reg1->config = event->attr.config1;
-
- switch (idx % 6) {
- case 4:
- case 5:
- hwc->config |= event->attr.config & (~0ULL << 32);
- reg2->config = event->attr.config2;
- break;
- };
- return 0;
-}
-
-static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
- int idx, port;
-
- idx = reg1->idx;
- port = idx / 6 + box->pmu->pmu_idx * 4;
-
- switch (idx % 6) {
- case 0:
- wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
- break;
- case 1:
- wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
- break;
- case 2:
- case 3:
- wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
- uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
- break;
- case 4:
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
- hwc->config >> 32);
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
- break;
- case 5:
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
- hwc->config >> 32);
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
- break;
- };
-
- wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
- (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
-}
-
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
-DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
-
-static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
- &format_attr_event5.attr,
- &format_attr_xbr_mm_cfg.attr,
- &format_attr_xbr_match.attr,
- &format_attr_xbr_mask.attr,
- &format_attr_qlx_cfg.attr,
- &format_attr_iperf_cfg.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_rbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_rbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
- INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"),
- INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"),
- INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"),
- INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"),
- INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"),
- INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"),
- { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_rbox_msr_enable_event,
- .hw_config = nhmex_rbox_hw_config,
- .get_constraint = nhmex_rbox_get_constraint,
- .put_constraint = nhmex_rbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_rbox = {
- .name = "rbox",
- .num_counters = 8,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_R_MSR_PMON_CTL0,
- .perf_ctr = NHMEX_R_MSR_PMON_CNT0,
- .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_R_MSR_GLOBAL_CTL,
- .msr_offset = NHMEX_R_MSR_OFFSET,
- .pair_ctr_ctl = 1,
- .num_shared_regs = 20,
- .event_descs = nhmex_uncore_rbox_events,
- .ops = &nhmex_uncore_rbox_ops,
- .format_group = &nhmex_uncore_rbox_format_group
-};
-
-static struct intel_uncore_type *nhmex_msr_uncores[] = {
- &nhmex_uncore_ubox,
- &nhmex_uncore_cbox,
- &nhmex_uncore_bbox,
- &nhmex_uncore_sbox,
- &nhmex_uncore_mbox,
- &nhmex_uncore_rbox,
- &nhmex_uncore_wbox,
- NULL,
-};
-/* end of Nehalem-EX uncore support */
-
static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx)
{
struct hw_perf_event *hwc = &event->hw;
@@ -3140,7 +166,7 @@
hwc->event_base = uncore_perf_ctr(box, hwc->idx);
}
-static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
{
u64 prev_count, new_count, delta;
int shift;
@@ -3201,14 +227,14 @@
return HRTIMER_RESTART;
}
-static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
{
__hrtimer_start_range_ns(&box->hrtimer,
ns_to_ktime(box->hrtimer_duration), 0,
HRTIMER_MODE_REL_PINNED, 0);
}
-static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
{
hrtimer_cancel(&box->hrtimer);
}
@@ -3291,7 +317,7 @@
}
if (event->attr.config == UNCORE_FIXED_EVENT)
- return &constraint_fixed;
+ return &uncore_constraint_fixed;
if (type->constraints) {
for_each_event_constraint(c, type->constraints) {
@@ -3496,7 +522,7 @@
event->hw.last_tag = ~0ULL;
}
-static void uncore_pmu_event_read(struct perf_event *event)
+void uncore_pmu_event_read(struct perf_event *event)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
uncore_perf_event_update(box, event);
@@ -3635,7 +661,7 @@
.attrs = uncore_pmu_attrs,
};
-static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
+static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
{
int ret;
@@ -3758,9 +784,6 @@
return ret;
}
-static struct pci_driver *uncore_pci_driver;
-static bool pcidrv_registered;
-
/*
* add a pci uncore device
*/
@@ -3771,17 +794,18 @@
struct intel_uncore_type *type;
int phys_id;
- phys_id = pcibus_to_physid[pdev->bus->number];
+ phys_id = uncore_pcibus_to_physid[pdev->bus->number];
if (phys_id < 0)
return -ENODEV;
if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
- extra_pci_dev[phys_id][UNCORE_PCI_DEV_IDX(id->driver_data)] = pdev;
+ int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
+ uncore_extra_pci_dev[phys_id][idx] = pdev;
pci_set_drvdata(pdev, NULL);
return 0;
}
- type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
+ type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
box = uncore_alloc_box(type, NUMA_NO_NODE);
if (!box)
return -ENOMEM;
@@ -3813,13 +837,13 @@
{
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
- int i, cpu, phys_id = pcibus_to_physid[pdev->bus->number];
+ int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
- if (extra_pci_dev[phys_id][i] == pdev) {
- extra_pci_dev[phys_id][i] = NULL;
+ if (uncore_extra_pci_dev[phys_id][i] == pdev) {
+ uncore_extra_pci_dev[phys_id][i] = NULL;
break;
}
}
@@ -3854,46 +878,29 @@
switch (boot_cpu_data.x86_model) {
case 45: /* Sandy Bridge-EP */
- ret = snbep_pci2phy_map_init(0x3ce0);
- if (ret)
- return ret;
- pci_uncores = snbep_pci_uncores;
- uncore_pci_driver = &snbep_uncore_pci_driver;
+ ret = snbep_uncore_pci_init();
break;
- case 62: /* IvyTown */
- ret = snbep_pci2phy_map_init(0x0e1e);
- if (ret)
- return ret;
- pci_uncores = ivt_pci_uncores;
- uncore_pci_driver = &ivt_uncore_pci_driver;
+ case 62: /* Ivy Bridge-EP */
+ ret = ivbep_uncore_pci_init();
break;
case 42: /* Sandy Bridge */
- ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC);
- if (ret)
- return ret;
- pci_uncores = snb_pci_uncores;
- uncore_pci_driver = &snb_uncore_pci_driver;
+ ret = snb_uncore_pci_init();
break;
case 58: /* Ivy Bridge */
- ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC);
- if (ret)
- return ret;
- pci_uncores = snb_pci_uncores;
- uncore_pci_driver = &ivb_uncore_pci_driver;
+ ret = ivb_uncore_pci_init();
break;
case 60: /* Haswell */
case 69: /* Haswell Celeron */
- ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC);
- if (ret)
- return ret;
- pci_uncores = snb_pci_uncores;
- uncore_pci_driver = &hsw_uncore_pci_driver;
+ ret = hsw_uncore_pci_init();
break;
default:
return 0;
}
- ret = uncore_types_init(pci_uncores);
+ if (ret)
+ return ret;
+
+ ret = uncore_types_init(uncore_pci_uncores);
if (ret)
return ret;
@@ -3904,7 +911,7 @@
if (ret == 0)
pcidrv_registered = true;
else
- uncore_types_exit(pci_uncores);
+ uncore_types_exit(uncore_pci_uncores);
return ret;
}
@@ -3914,7 +921,7 @@
if (pcidrv_registered) {
pcidrv_registered = false;
pci_unregister_driver(uncore_pci_driver);
- uncore_types_exit(pci_uncores);
+ uncore_types_exit(uncore_pci_uncores);
}
}
@@ -3940,8 +947,8 @@
struct intel_uncore_box *box;
int i, j;
- for (i = 0; msr_uncores[i]; i++) {
- type = msr_uncores[i];
+ for (i = 0; uncore_msr_uncores[i]; i++) {
+ type = uncore_msr_uncores[i];
for (j = 0; j < type->num_boxes; j++) {
pmu = &type->pmus[j];
box = *per_cpu_ptr(pmu->box, cpu);
@@ -3961,8 +968,8 @@
phys_id = topology_physical_package_id(cpu);
- for (i = 0; msr_uncores[i]; i++) {
- type = msr_uncores[i];
+ for (i = 0; uncore_msr_uncores[i]; i++) {
+ type = uncore_msr_uncores[i];
for (j = 0; j < type->num_boxes; j++) {
pmu = &type->pmus[j];
box = *per_cpu_ptr(pmu->box, cpu);
@@ -4002,8 +1009,8 @@
struct intel_uncore_box *box;
int i, j;
- for (i = 0; msr_uncores[i]; i++) {
- type = msr_uncores[i];
+ for (i = 0; uncore_msr_uncores[i]; i++) {
+ type = uncore_msr_uncores[i];
for (j = 0; j < type->num_boxes; j++) {
pmu = &type->pmus[j];
if (pmu->func_id < 0)
@@ -4083,8 +1090,8 @@
if (target >= 0)
cpumask_set_cpu(target, &uncore_cpu_mask);
- uncore_change_context(msr_uncores, cpu, target);
- uncore_change_context(pci_uncores, cpu, target);
+ uncore_change_context(uncore_msr_uncores, cpu, target);
+ uncore_change_context(uncore_pci_uncores, cpu, target);
}
static void uncore_event_init_cpu(int cpu)
@@ -4099,8 +1106,8 @@
cpumask_set_cpu(cpu, &uncore_cpu_mask);
- uncore_change_context(msr_uncores, -1, cpu);
- uncore_change_context(pci_uncores, -1, cpu);
+ uncore_change_context(uncore_msr_uncores, -1, cpu);
+ uncore_change_context(uncore_pci_uncores, -1, cpu);
}
static int uncore_cpu_notifier(struct notifier_block *self,
@@ -4160,47 +1167,35 @@
static int __init uncore_cpu_init(void)
{
- int ret, max_cores;
+ int ret;
- max_cores = boot_cpu_data.x86_max_cores;
switch (boot_cpu_data.x86_model) {
case 26: /* Nehalem */
case 30:
case 37: /* Westmere */
case 44:
- msr_uncores = nhm_msr_uncores;
+ nhm_uncore_cpu_init();
break;
case 42: /* Sandy Bridge */
case 58: /* Ivy Bridge */
- if (snb_uncore_cbox.num_boxes > max_cores)
- snb_uncore_cbox.num_boxes = max_cores;
- msr_uncores = snb_msr_uncores;
+ snb_uncore_cpu_init();
break;
case 45: /* Sandy Bridge-EP */
- if (snbep_uncore_cbox.num_boxes > max_cores)
- snbep_uncore_cbox.num_boxes = max_cores;
- msr_uncores = snbep_msr_uncores;
+ snbep_uncore_cpu_init();
break;
case 46: /* Nehalem-EX */
- uncore_nhmex = true;
case 47: /* Westmere-EX aka. Xeon E7 */
- if (!uncore_nhmex)
- nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
- if (nhmex_uncore_cbox.num_boxes > max_cores)
- nhmex_uncore_cbox.num_boxes = max_cores;
- msr_uncores = nhmex_msr_uncores;
+ nhmex_uncore_cpu_init();
break;
- case 62: /* IvyTown */
- if (ivt_uncore_cbox.num_boxes > max_cores)
- ivt_uncore_cbox.num_boxes = max_cores;
- msr_uncores = ivt_msr_uncores;
+ case 62: /* Ivy Bridge-EP */
+ ivbep_uncore_cpu_init();
break;
default:
return 0;
}
- ret = uncore_types_init(msr_uncores);
+ ret = uncore_types_init(uncore_msr_uncores);
if (ret)
return ret;
@@ -4213,16 +1208,16 @@
struct intel_uncore_type *type;
int i, j;
- for (i = 0; msr_uncores[i]; i++) {
- type = msr_uncores[i];
+ for (i = 0; uncore_msr_uncores[i]; i++) {
+ type = uncore_msr_uncores[i];
for (j = 0; j < type->num_boxes; j++) {
pmu = &type->pmus[j];
uncore_pmu_register(pmu);
}
}
- for (i = 0; pci_uncores[i]; i++) {
- type = pci_uncores[i];
+ for (i = 0; uncore_pci_uncores[i]; i++) {
+ type = uncore_pci_uncores[i];
for (j = 0; j < type->num_boxes; j++) {
pmu = &type->pmus[j];
uncore_pmu_register(pmu);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 90236f0..1d7e894 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -24,395 +24,6 @@
#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
-/* SNB event control */
-#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
-#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
-#define SNB_UNC_CTL_EDGE_DET (1 << 18)
-#define SNB_UNC_CTL_EN (1 << 22)
-#define SNB_UNC_CTL_INVERT (1 << 23)
-#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
-#define NHM_UNC_CTL_CMASK_MASK 0xff000000
-#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
-
-#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
- SNB_UNC_CTL_UMASK_MASK | \
- SNB_UNC_CTL_EDGE_DET | \
- SNB_UNC_CTL_INVERT | \
- SNB_UNC_CTL_CMASK_MASK)
-
-#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
- SNB_UNC_CTL_UMASK_MASK | \
- SNB_UNC_CTL_EDGE_DET | \
- SNB_UNC_CTL_INVERT | \
- NHM_UNC_CTL_CMASK_MASK)
-
-/* SNB global control register */
-#define SNB_UNC_PERF_GLOBAL_CTL 0x391
-#define SNB_UNC_FIXED_CTR_CTRL 0x394
-#define SNB_UNC_FIXED_CTR 0x395
-
-/* SNB uncore global control */
-#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
-#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
-
-/* SNB Cbo register */
-#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
-#define SNB_UNC_CBO_0_PER_CTR0 0x706
-#define SNB_UNC_CBO_MSR_OFFSET 0x10
-
-/* NHM global control register */
-#define NHM_UNC_PERF_GLOBAL_CTL 0x391
-#define NHM_UNC_FIXED_CTR 0x394
-#define NHM_UNC_FIXED_CTR_CTRL 0x395
-
-/* NHM uncore global control */
-#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
-#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
-
-/* NHM uncore register */
-#define NHM_UNC_PERFEVTSEL0 0x3c0
-#define NHM_UNC_UNCORE_PMC0 0x3b0
-
-/* SNB-EP Box level control */
-#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
-#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
-#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8)
-#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16)
-#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
- SNBEP_PMON_BOX_CTL_RST_CTRS | \
- SNBEP_PMON_BOX_CTL_FRZ_EN)
-/* SNB-EP event control */
-#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff
-#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00
-#define SNBEP_PMON_CTL_RST (1 << 17)
-#define SNBEP_PMON_CTL_EDGE_DET (1 << 18)
-#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21)
-#define SNBEP_PMON_CTL_EN (1 << 22)
-#define SNBEP_PMON_CTL_INVERT (1 << 23)
-#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000
-#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_UMASK_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_INVERT | \
- SNBEP_PMON_CTL_TRESH_MASK)
-
-/* SNB-EP Ubox event control */
-#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000
-#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_UMASK_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_INVERT | \
- SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-
-#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19)
-#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
- SNBEP_CBO_PMON_CTL_TID_EN)
-
-/* SNB-EP PCU event control */
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000
-#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30)
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31)
-#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_EV_SEL_EXT | \
- SNBEP_PMON_CTL_INVERT | \
- SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-
-#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_RAW_EVENT_MASK | \
- SNBEP_PMON_CTL_EV_SEL_EXT)
-
-/* SNB-EP pci control register */
-#define SNBEP_PCI_PMON_BOX_CTL 0xf4
-#define SNBEP_PCI_PMON_CTL0 0xd8
-/* SNB-EP pci counter register */
-#define SNBEP_PCI_PMON_CTR0 0xa0
-
-/* SNB-EP home agent register */
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44
-#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48
-/* SNB-EP memory controller register */
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0
-/* SNB-EP QPI register */
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c
-
-/* SNB-EP Ubox register */
-#define SNBEP_U_MSR_PMON_CTR0 0xc16
-#define SNBEP_U_MSR_PMON_CTL0 0xc10
-
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09
-
-/* SNB-EP Cbo register */
-#define SNBEP_C0_MSR_PMON_CTR0 0xd16
-#define SNBEP_C0_MSR_PMON_CTL0 0xd10
-#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04
-#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14
-#define SNBEP_CBO_MSR_OFFSET 0x20
-
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000
-
-#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \
- .event = (e), \
- .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \
- .config_mask = (m), \
- .idx = (i) \
-}
-
-/* SNB-EP PCU register */
-#define SNBEP_PCU_MSR_PMON_CTR0 0xc36
-#define SNBEP_PCU_MSR_PMON_CTL0 0xc30
-#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff
-#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc
-#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd
-
-/* IVT event control */
-#define IVT_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
- SNBEP_PMON_BOX_CTL_RST_CTRS)
-#define IVT_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_UMASK_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_TRESH_MASK)
-/* IVT Ubox */
-#define IVT_U_MSR_PMON_GLOBAL_CTL 0xc00
-#define IVT_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
-#define IVT_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29)
-
-#define IVT_U_MSR_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_UMASK_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-/* IVT Cbo */
-#define IVT_CBO_MSR_PMON_RAW_EVENT_MASK (IVT_PMON_RAW_EVENT_MASK | \
- SNBEP_CBO_PMON_CTL_TID_EN)
-
-#define IVT_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_IOSC (0x1ULL << 63)
-
-/* IVT home agent */
-#define IVT_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16)
-#define IVT_HA_PCI_PMON_RAW_EVENT_MASK \
- (IVT_PMON_RAW_EVENT_MASK | \
- IVT_HA_PCI_PMON_CTL_Q_OCC_RST)
-/* IVT PCU */
-#define IVT_PCU_MSR_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_EV_SEL_EXT | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-/* IVT QPI */
-#define IVT_QPI_PCI_PMON_RAW_EVENT_MASK \
- (IVT_PMON_RAW_EVENT_MASK | \
- SNBEP_PMON_CTL_EV_SEL_EXT)
-
-/* NHM-EX event control */
-#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff
-#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00
-#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0)
-#define NHMEX_PMON_CTL_EDGE_DET (1 << 18)
-#define NHMEX_PMON_CTL_PMI_EN (1 << 20)
-#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22)
-#define NHMEX_PMON_CTL_INVERT (1 << 23)
-#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000
-#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \
- NHMEX_PMON_CTL_UMASK_MASK | \
- NHMEX_PMON_CTL_EDGE_DET | \
- NHMEX_PMON_CTL_INVERT | \
- NHMEX_PMON_CTL_TRESH_MASK)
-
-/* NHM-EX Ubox */
-#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00
-#define NHMEX_U_MSR_PMON_CTR 0xc11
-#define NHMEX_U_MSR_PMON_EV_SEL 0xc10
-
-#define NHMEX_U_PMON_GLOBAL_EN (1 << 0)
-#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e
-#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28)
-#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29)
-#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
-
-#define NHMEX_U_PMON_RAW_EVENT_MASK \
- (NHMEX_PMON_CTL_EV_SEL_MASK | \
- NHMEX_PMON_CTL_EDGE_DET)
-
-/* NHM-EX Cbox */
-#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00
-#define NHMEX_C0_MSR_PMON_CTR0 0xd11
-#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10
-#define NHMEX_C_MSR_OFFSET 0x20
-
-/* NHM-EX Bbox */
-#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20
-#define NHMEX_B0_MSR_PMON_CTR0 0xc31
-#define NHMEX_B0_MSR_PMON_CTL0 0xc30
-#define NHMEX_B_MSR_OFFSET 0x40
-#define NHMEX_B0_MSR_MATCH 0xe45
-#define NHMEX_B0_MSR_MASK 0xe46
-#define NHMEX_B1_MSR_MATCH 0xe4d
-#define NHMEX_B1_MSR_MASK 0xe4e
-
-#define NHMEX_B_PMON_CTL_EN (1 << 0)
-#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1
-#define NHMEX_B_PMON_CTL_EV_SEL_MASK \
- (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_B_PMON_CTR_SHIFT 6
-#define NHMEX_B_PMON_CTR_MASK \
- (0x3 << NHMEX_B_PMON_CTR_SHIFT)
-#define NHMEX_B_PMON_RAW_EVENT_MASK \
- (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
- NHMEX_B_PMON_CTR_MASK)
-
-/* NHM-EX Sbox */
-#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40
-#define NHMEX_S0_MSR_PMON_CTR0 0xc51
-#define NHMEX_S0_MSR_PMON_CTL0 0xc50
-#define NHMEX_S_MSR_OFFSET 0x80
-#define NHMEX_S0_MSR_MM_CFG 0xe48
-#define NHMEX_S0_MSR_MATCH 0xe49
-#define NHMEX_S0_MSR_MASK 0xe4a
-#define NHMEX_S1_MSR_MM_CFG 0xe58
-#define NHMEX_S1_MSR_MATCH 0xe59
-#define NHMEX_S1_MSR_MASK 0xe5a
-
-#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63)
-#define NHMEX_S_EVENT_TO_R_PROG_EV 0
-
-/* NHM-EX Mbox */
-#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0
-#define NHMEX_M0_MSR_PMU_DSP 0xca5
-#define NHMEX_M0_MSR_PMU_ISS 0xca6
-#define NHMEX_M0_MSR_PMU_MAP 0xca7
-#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8
-#define NHMEX_M0_MSR_PMU_PGT 0xca9
-#define NHMEX_M0_MSR_PMU_PLD 0xcaa
-#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab
-#define NHMEX_M0_MSR_PMU_CTL0 0xcb0
-#define NHMEX_M0_MSR_PMU_CNT0 0xcb1
-#define NHMEX_M_MSR_OFFSET 0x40
-#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54
-#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c
-
-#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63)
-#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34
-
-#define NHMEX_M_PMON_CTL_EN (1 << 0)
-#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1)
-#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2
-#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \
- (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \
- (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6)
-#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7)
-#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9
-#define NHMEX_M_PMON_CTL_INC_SEL_MASK \
- (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \
- (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
-#define NHMEX_M_PMON_RAW_EVENT_MASK \
- (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \
- NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \
- NHMEX_M_PMON_CTL_WRAP_MODE | \
- NHMEX_M_PMON_CTL_FLAG_MODE | \
- NHMEX_M_PMON_CTL_INC_SEL_MASK | \
- NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
-
-#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23))
-#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
-
-#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24))
-#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
-
-/*
- * use the 9~13 bits to select event If the 7th bit is not set,
- * otherwise use the 19~21 bits to select event.
- */
-#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
- NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
- NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
- NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_EXTAR_REG(c, r) \
- EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
- MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
-#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
- EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
- MBOX_SET_FLAG_SEL_MASK, \
- (u64)-1, NHMEX_M_##r)
-
-/* NHM-EX Rbox */
-#define NHMEX_R_MSR_GLOBAL_CTL 0xe00
-#define NHMEX_R_MSR_PMON_CTL0 0xe10
-#define NHMEX_R_MSR_PMON_CNT0 0xe11
-#define NHMEX_R_MSR_OFFSET 0x20
-
-#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \
- ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n))
-#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \
- (((n) < 4 ? 0 : 0x10) + (n) * 4)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \
- (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \
- (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \
- (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \
- (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \
- (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \
- (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
-
-#define NHMEX_R_PMON_CTL_EN (1 << 0)
-#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1
-#define NHMEX_R_PMON_CTL_EV_SEL_MASK \
- (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6)
-#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK
-
-/* NHM-EX Wbox */
-#define NHMEX_W_MSR_GLOBAL_CTL 0xc80
-#define NHMEX_W_MSR_PMON_CNT0 0xc90
-#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91
-#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394
-#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395
-
-#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31)
-
struct intel_uncore_ops;
struct intel_uncore_pmu;
struct intel_uncore_box;
@@ -505,6 +116,9 @@
const char *config;
};
+ssize_t uncore_event_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf);
+
#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
{ \
.attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
@@ -522,15 +136,6 @@
static struct kobj_attribute format_attr_##_var = \
__ATTR(_name, 0444, __uncore_##_var##_show, NULL)
-
-static ssize_t uncore_event_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct uncore_event_desc *event =
- container_of(attr, struct uncore_event_desc, attr);
- return sprintf(buf, "%s", event->config);
-}
-
static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
{
return box->pmu->type->box_ctl;
@@ -694,3 +299,39 @@
{
return (box->phys_id < 0);
}
+
+struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event);
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
+struct intel_uncore_box *uncore_event_to_box(struct perf_event *event);
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_event_read(struct perf_event *event);
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
+struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
+
+extern struct intel_uncore_type **uncore_msr_uncores;
+extern struct intel_uncore_type **uncore_pci_uncores;
+extern struct pci_driver *uncore_pci_driver;
+extern int uncore_pcibus_to_physid[256];
+extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
+extern struct event_constraint uncore_constraint_empty;
+
+/* perf_event_intel_uncore_snb.c */
+int snb_uncore_pci_init(void);
+int ivb_uncore_pci_init(void);
+int hsw_uncore_pci_init(void);
+void snb_uncore_cpu_init(void);
+void nhm_uncore_cpu_init(void);
+
+/* perf_event_intel_uncore_snbep.c */
+int snbep_uncore_pci_init(void);
+void snbep_uncore_cpu_init(void);
+int ivbep_uncore_pci_init(void);
+void ivbep_uncore_cpu_init(void);
+
+/* perf_event_intel_uncore_nhmex.c */
+void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c
new file mode 100644
index 0000000..2749965
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c
@@ -0,0 +1,1221 @@
+/* Nehalem-EX/Westmere-EX uncore support */
+#include "perf_event_intel_uncore.h"
+
+/* NHM-EX event control */
+#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff
+#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00
+#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0)
+#define NHMEX_PMON_CTL_EDGE_DET (1 << 18)
+#define NHMEX_PMON_CTL_PMI_EN (1 << 20)
+#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22)
+#define NHMEX_PMON_CTL_INVERT (1 << 23)
+#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000
+#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_PMON_CTL_UMASK_MASK | \
+ NHMEX_PMON_CTL_EDGE_DET | \
+ NHMEX_PMON_CTL_INVERT | \
+ NHMEX_PMON_CTL_TRESH_MASK)
+
+/* NHM-EX Ubox */
+#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00
+#define NHMEX_U_MSR_PMON_CTR 0xc11
+#define NHMEX_U_MSR_PMON_EV_SEL 0xc10
+
+#define NHMEX_U_PMON_GLOBAL_EN (1 << 0)
+#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e
+#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28)
+#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29)
+#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
+
+#define NHMEX_U_PMON_RAW_EVENT_MASK \
+ (NHMEX_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_PMON_CTL_EDGE_DET)
+
+/* NHM-EX Cbox */
+#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00
+#define NHMEX_C0_MSR_PMON_CTR0 0xd11
+#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10
+#define NHMEX_C_MSR_OFFSET 0x20
+
+/* NHM-EX Bbox */
+#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20
+#define NHMEX_B0_MSR_PMON_CTR0 0xc31
+#define NHMEX_B0_MSR_PMON_CTL0 0xc30
+#define NHMEX_B_MSR_OFFSET 0x40
+#define NHMEX_B0_MSR_MATCH 0xe45
+#define NHMEX_B0_MSR_MASK 0xe46
+#define NHMEX_B1_MSR_MATCH 0xe4d
+#define NHMEX_B1_MSR_MASK 0xe4e
+
+#define NHMEX_B_PMON_CTL_EN (1 << 0)
+#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1
+#define NHMEX_B_PMON_CTL_EV_SEL_MASK \
+ (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_B_PMON_CTR_SHIFT 6
+#define NHMEX_B_PMON_CTR_MASK \
+ (0x3 << NHMEX_B_PMON_CTR_SHIFT)
+#define NHMEX_B_PMON_RAW_EVENT_MASK \
+ (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_B_PMON_CTR_MASK)
+
+/* NHM-EX Sbox */
+#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40
+#define NHMEX_S0_MSR_PMON_CTR0 0xc51
+#define NHMEX_S0_MSR_PMON_CTL0 0xc50
+#define NHMEX_S_MSR_OFFSET 0x80
+#define NHMEX_S0_MSR_MM_CFG 0xe48
+#define NHMEX_S0_MSR_MATCH 0xe49
+#define NHMEX_S0_MSR_MASK 0xe4a
+#define NHMEX_S1_MSR_MM_CFG 0xe58
+#define NHMEX_S1_MSR_MATCH 0xe59
+#define NHMEX_S1_MSR_MASK 0xe5a
+
+#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63)
+#define NHMEX_S_EVENT_TO_R_PROG_EV 0
+
+/* NHM-EX Mbox */
+#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0
+#define NHMEX_M0_MSR_PMU_DSP 0xca5
+#define NHMEX_M0_MSR_PMU_ISS 0xca6
+#define NHMEX_M0_MSR_PMU_MAP 0xca7
+#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8
+#define NHMEX_M0_MSR_PMU_PGT 0xca9
+#define NHMEX_M0_MSR_PMU_PLD 0xcaa
+#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab
+#define NHMEX_M0_MSR_PMU_CTL0 0xcb0
+#define NHMEX_M0_MSR_PMU_CNT0 0xcb1
+#define NHMEX_M_MSR_OFFSET 0x40
+#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54
+#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c
+
+#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63)
+#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34
+
+#define NHMEX_M_PMON_CTL_EN (1 << 0)
+#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1)
+#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2
+#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \
+ (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \
+ (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6)
+#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7)
+#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9
+#define NHMEX_M_PMON_CTL_INC_SEL_MASK \
+ (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \
+ (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
+#define NHMEX_M_PMON_RAW_EVENT_MASK \
+ (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \
+ NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \
+ NHMEX_M_PMON_CTL_WRAP_MODE | \
+ NHMEX_M_PMON_CTL_FLAG_MODE | \
+ NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+ NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
+
+#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23))
+#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
+
+#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24))
+#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
+
+/*
+ * Use bits 9~13 to select the event if the 7th bit is not set;
+ * otherwise use bits 19~21 to select the event.
+ */
+#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_EXTAR_REG(c, r) \
+ EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+ MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
+#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
+ EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+ MBOX_SET_FLAG_SEL_MASK, \
+ (u64)-1, NHMEX_M_##r)
+
+/* NHM-EX Rbox */
+#define NHMEX_R_MSR_GLOBAL_CTL 0xe00
+#define NHMEX_R_MSR_PMON_CTL0 0xe10
+#define NHMEX_R_MSR_PMON_CNT0 0xe11
+#define NHMEX_R_MSR_OFFSET 0x20
+
+#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \
+ ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n))
+#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \
+ (((n) < 4 ? 0 : 0x10) + (n) * 4)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \
+ (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \
+ (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
+
+#define NHMEX_R_PMON_CTL_EN (1 << 0)
+#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1
+#define NHMEX_R_PMON_CTL_EV_SEL_MASK \
+ (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6)
+#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK
+
+/* NHM-EX Wbox */
+#define NHMEX_W_MSR_GLOBAL_CTL 0xc80
+#define NHMEX_W_MSR_PMON_CNT0 0xc90
+#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91
+#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394
+#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395
+
+#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31)
+
+#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
+ ((1ULL << (n)) - 1)))
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
+DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
+
+static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
+}
+
+static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+ u64 config;
+
+ if (msr) {
+ rdmsrl(msr, config);
+ config &= ~((1ULL << uncore_num_counters(box)) - 1);
+ /* WBox has a fixed counter */
+ if (uncore_msr_fixed_ctl(box))
+ config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
+ wrmsrl(msr, config);
+ }
+}
+
+static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+ u64 config;
+
+ if (msr) {
+ rdmsrl(msr, config);
+ config |= (1ULL << uncore_num_counters(box)) - 1;
+ /* WBox has a fixed counter */
+ if (uncore_msr_fixed_ctl(box))
+ config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
+ wrmsrl(msr, config);
+ }
+}
+
+static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ wrmsrl(event->hw.config_base, 0);
+}
+
+static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
+ wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
+ else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
+ wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+ else
+ wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+#define NHMEX_UNCORE_OPS_COMMON_INIT() \
+ .init_box = nhmex_uncore_msr_init_box, \
+ .disable_box = nhmex_uncore_msr_disable_box, \
+ .enable_box = nhmex_uncore_msr_enable_box, \
+ .disable_event = nhmex_uncore_msr_disable_event, \
+ .read_counter = uncore_msr_read_counter
+
+static struct intel_uncore_ops nhmex_uncore_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_uncore_msr_enable_event,
+};
+
+static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_edge.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type nhmex_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_U_MSR_PMON_EV_SEL,
+ .perf_ctr = NHMEX_U_MSR_PMON_CTR,
+ .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_ubox_format_group
+};
+
+static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_cbox_formats_attr,
+};
+
+/* msr offset for each instance of cbox */
+static unsigned nhmex_cbox_msr_offsets[] = {
+ 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
+};
+
+static struct intel_uncore_type nhmex_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 6,
+ .num_boxes = 10,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0,
+ .perf_ctr = NHMEX_C0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
+ .msr_offsets = nhmex_cbox_msr_offsets,
+ .pair_ctr_ctl = 1,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_cbox_format_group
+};
+
+static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type nhmex_uncore_wbox = {
+ .name = "wbox",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_W_MSR_PMON_CNT0,
+ .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0,
+ .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR,
+ .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_W_MSR_GLOBAL_CTL,
+ .pair_ctr_ctl = 1,
+ .event_descs = nhmex_uncore_wbox_events,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_cbox_format_group
+};
+
+static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ int ctr, ev_sel;
+
+ ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
+ NHMEX_B_PMON_CTR_SHIFT;
+ ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
+ NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
+
+ /* events that do not use the match/mask registers */
+ if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
+ (ctr == 2 && ev_sel != 0x4) || ctr == 3)
+ return 0;
+
+ if (box->pmu->pmu_idx == 0)
+ reg1->reg = NHMEX_B0_MSR_MATCH;
+ else
+ reg1->reg = NHMEX_B1_MSR_MATCH;
+ reg1->idx = 0;
+ reg1->config = event->attr.config1;
+ reg2->config = event->attr.config2;
+ return 0;
+}
+
+static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ wrmsrl(reg1->reg, reg1->config);
+ wrmsrl(reg1->reg + 1, reg2->config);
+ }
+ wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+ (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
+}
+
+/*
+ * The Bbox has 4 counters, but each counter monitors different events.
+ * Use bits 6-7 in the event config to select counter.
+ */
+static struct event_constraint nhmex_uncore_bbox_constraints[] = {
+ EVENT_CONSTRAINT(0 , 1, 0xc0),
+ EVENT_CONSTRAINT(0x40, 2, 0xc0),
+ EVENT_CONSTRAINT(0x80, 4, 0xc0),
+ EVENT_CONSTRAINT(0xc0, 8, 0xc0),
+ EVENT_CONSTRAINT_END,
+};
+
+static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
+ &format_attr_event5.attr,
+ &format_attr_counter.attr,
+ &format_attr_match.attr,
+ &format_attr_mask.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_bbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_bbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_bbox_msr_enable_event,
+ .hw_config = nhmex_bbox_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_bbox = {
+ .name = "bbox",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_B0_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_B0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
+ .msr_offset = NHMEX_B_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 1,
+ .constraints = nhmex_uncore_bbox_constraints,
+ .ops = &nhmex_uncore_bbox_ops,
+ .format_group = &nhmex_uncore_bbox_format_group
+};
+
+static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ /* only TO_R_PROG_EV event uses the match/mask register */
+ if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
+ NHMEX_S_EVENT_TO_R_PROG_EV)
+ return 0;
+
+ if (box->pmu->pmu_idx == 0)
+ reg1->reg = NHMEX_S0_MSR_MM_CFG;
+ else
+ reg1->reg = NHMEX_S1_MSR_MM_CFG;
+ reg1->idx = 0;
+ reg1->config = event->attr.config1;
+ reg2->config = event->attr.config2;
+ return 0;
+}
+
+static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ wrmsrl(reg1->reg, 0);
+ wrmsrl(reg1->reg + 1, reg1->config);
+ wrmsrl(reg1->reg + 2, reg2->config);
+ wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
+ }
+ wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+}
+
+static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match.attr,
+ &format_attr_mask.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_sbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_sbox_msr_enable_event,
+ .hw_config = nhmex_sbox_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_S0_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_S0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
+ .msr_offset = NHMEX_S_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 1,
+ .ops = &nhmex_uncore_sbox_ops,
+ .format_group = &nhmex_uncore_sbox_format_group
+};
+
+enum {
+ EXTRA_REG_NHMEX_M_FILTER,
+ EXTRA_REG_NHMEX_M_DSP,
+ EXTRA_REG_NHMEX_M_ISS,
+ EXTRA_REG_NHMEX_M_MAP,
+ EXTRA_REG_NHMEX_M_MSC_THR,
+ EXTRA_REG_NHMEX_M_PGT,
+ EXTRA_REG_NHMEX_M_PLD,
+ EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
+};
+
+static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
+ MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
+ MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
+ MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
+ MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
+ /* event 0xa uses two extra registers */
+ MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
+ MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
+ MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
+ /* events 0xd ~ 0x10 use the same extra register */
+ MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
+ EVENT_EXTRA_END
+};
+
+/* Nehalem-EX or Westmere-EX ? */
+static bool uncore_nhmex;
+
+static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
+{
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ bool ret = false;
+ u64 mask;
+
+ if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+ er = &box->shared_regs[idx];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (!atomic_read(&er->ref) || er->config == config) {
+ atomic_inc(&er->ref);
+ er->config = config;
+ ret = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ return ret;
+ }
+ /*
+ * The ZDP_CTL_FVC MSR has 4 fields which are used to control
+ * events 0xd ~ 0x10. Besides these 4 fields, there are additional
+ * fields which are shared.
+ */
+ idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ if (WARN_ON_ONCE(idx >= 4))
+ return false;
+
+ /* mask of the shared fields */
+ if (uncore_nhmex)
+ mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
+ else
+ mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
+ er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+
+ raw_spin_lock_irqsave(&er->lock, flags);
+ /* add mask of the non-shared field if it's in use */
+ if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
+ if (uncore_nhmex)
+ mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ }
+
+ if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
+ atomic_add(1 << (idx * 8), &er->ref);
+ if (uncore_nhmex)
+ mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
+ NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
+ WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ er->config &= ~mask;
+ er->config |= (config & mask);
+ ret = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ return ret;
+}
+
+static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
+{
+ struct intel_uncore_extra_reg *er;
+
+ if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+ er = &box->shared_regs[idx];
+ atomic_dec(&er->ref);
+ return;
+ }
+
+ idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+ atomic_sub(1 << (idx * 8), &er->ref);
+}
+
+static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
+ u64 config = reg1->config;
+
+ /* get the non-shared control bits and shift them */
+ idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ if (uncore_nhmex)
+ config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ if (new_idx > orig_idx) {
+ idx = new_idx - orig_idx;
+ config <<= 3 * idx;
+ } else {
+ idx = orig_idx - new_idx;
+ config >>= 3 * idx;
+ }
+
+ /* add the shared control bits back */
+ if (uncore_nhmex)
+ config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+ else
+ config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+ config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+ if (modify) {
+ /* adjust the main event selector */
+ if (new_idx > orig_idx)
+ hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+ else
+ hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+ reg1->config = config;
+ reg1->idx = ~0xff | new_idx;
+ }
+ return config;
+}
+
+static struct event_constraint *
+nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ int i, idx[2], alloc = 0;
+ u64 config1 = reg1->config;
+
+ idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
+ idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
+again:
+ for (i = 0; i < 2; i++) {
+ if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+ idx[i] = 0xff;
+
+ if (idx[i] == 0xff)
+ continue;
+
+ if (!nhmex_mbox_get_shared_reg(box, idx[i],
+ __BITS_VALUE(config1, i, 32)))
+ goto fail;
+ alloc |= (0x1 << i);
+ }
+
+ /* for the match/mask registers */
+ if (reg2->idx != EXTRA_REG_NONE &&
+ (uncore_box_is_fake(box) || !reg2->alloc) &&
+ !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
+ goto fail;
+
+ /*
+ * If it's a fake box -- as per validate_{group,event}() we
+ * shouldn't touch event state and we can avoid doing so
+ * since both will only call get_event_constraints() once
+ * on each event, this avoids the need for reg->alloc.
+ */
+ if (!uncore_box_is_fake(box)) {
+ if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
+ nhmex_mbox_alter_er(event, idx[0], true);
+ reg1->alloc |= alloc;
+ if (reg2->idx != EXTRA_REG_NONE)
+ reg2->alloc = 1;
+ }
+ return NULL;
+fail:
+ if (idx[0] != 0xff && !(alloc & 0x1) &&
+ idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+		/*
+		 * Events 0xd ~ 0x10 are functionally identical, but are
+		 * controlled by different fields in the ZDP_CTL_FVC
+		 * register. If we failed to take one field, try the
+		 * remaining 3 choices.
+		 */
+ BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
+ idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ idx[0] = (idx[0] + 1) % 4;
+ idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
+ config1 = nhmex_mbox_alter_er(event, idx[0], false);
+ goto again;
+ }
+ }
+
+ if (alloc & 0x1)
+ nhmex_mbox_put_shared_reg(box, idx[0]);
+ if (alloc & 0x2)
+ nhmex_mbox_put_shared_reg(box, idx[1]);
+ return &uncore_constraint_empty;
+}
+
+static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+
+ if (uncore_box_is_fake(box))
+ return;
+
+ if (reg1->alloc & 0x1)
+ nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
+ if (reg1->alloc & 0x2)
+ nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
+ reg1->alloc = 0;
+
+ if (reg2->alloc) {
+ nhmex_mbox_put_shared_reg(box, reg2->idx);
+ reg2->alloc = 0;
+ }
+}
+
+static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
+{
+ if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+ return er->idx;
+ return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
+}
+
+static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_type *type = box->pmu->type;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ struct extra_reg *er;
+ unsigned msr;
+ int reg_idx = 0;
+ /*
+ * The mbox events may require 2 extra MSRs at the most. But only
+ * the lower 32 bits in these MSRs are significant, so we can use
+ * config1 to pass two MSRs' config.
+ */
+ for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ if (event->attr.config1 & ~er->valid_mask)
+ return -EINVAL;
+
+ msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
+ if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
+ return -EINVAL;
+
+ /* always use the 32~63 bits to pass the PLD config */
+ if (er->idx == EXTRA_REG_NHMEX_M_PLD)
+ reg_idx = 1;
+ else if (WARN_ON_ONCE(reg_idx > 0))
+ return -EINVAL;
+
+ reg1->idx &= ~(0xff << (reg_idx * 8));
+ reg1->reg &= ~(0xffff << (reg_idx * 16));
+ reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
+ reg1->reg |= msr << (reg_idx * 16);
+ reg1->config = event->attr.config1;
+ reg_idx++;
+ }
+ /*
+ * The mbox only provides ability to perform address matching
+ * for the PLD events.
+ */
+ if (reg_idx == 2) {
+ reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
+ if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
+ reg2->config = event->attr.config2;
+ else
+ reg2->config = ~0ULL;
+ if (box->pmu->pmu_idx == 0)
+ reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
+ else
+ reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
+ }
+ return 0;
+}
+
+static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ u64 config;
+
+ if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+ return box->shared_regs[idx].config;
+
+ er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ config = er->config;
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+ return config;
+}
+
+static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ int idx;
+
+ idx = __BITS_VALUE(reg1->idx, 0, 8);
+ if (idx != 0xff)
+ wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
+ nhmex_mbox_shared_reg_config(box, idx));
+ idx = __BITS_VALUE(reg1->idx, 1, 8);
+ if (idx != 0xff)
+ wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
+ nhmex_mbox_shared_reg_config(box, idx));
+
+ if (reg2->idx != EXTRA_REG_NONE) {
+ wrmsrl(reg2->reg, 0);
+ if (reg2->config != ~0ULL) {
+ wrmsrl(reg2->reg + 1,
+ reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
+ wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
+ (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
+ wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
+ }
+ }
+
+ wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3");
+DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5");
+DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6");
+DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7");
+DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13");
+DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61");
+DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63");
+
+static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
+ &format_attr_count_mode.attr,
+ &format_attr_storage_mode.attr,
+ &format_attr_wrap_mode.attr,
+ &format_attr_flag_mode.attr,
+ &format_attr_inc_sel.attr,
+ &format_attr_set_flag_sel.attr,
+ &format_attr_filter_cfg_en.attr,
+ &format_attr_filter_match.attr,
+ &format_attr_filter_mask.attr,
+ &format_attr_dsp.attr,
+ &format_attr_thr.attr,
+ &format_attr_fvc.attr,
+ &format_attr_pgt.attr,
+ &format_attr_map.attr,
+ &format_attr_iss.attr,
+ &format_attr_pld.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_mbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_mbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
+ { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_mbox_msr_enable_event,
+ .hw_config = nhmex_mbox_hw_config,
+ .get_constraint = nhmex_mbox_get_constraint,
+ .put_constraint = nhmex_mbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_mbox = {
+ .name = "mbox",
+ .num_counters = 6,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_M0_MSR_PMU_CTL0,
+ .perf_ctr = NHMEX_M0_MSR_PMU_CNT0,
+ .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL,
+ .msr_offset = NHMEX_M_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 8,
+ .event_descs = nhmex_uncore_mbox_events,
+ .ops = &nhmex_uncore_mbox_ops,
+ .format_group = &nhmex_uncore_mbox_format_group,
+};
+
+static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ /* adjust the main event selector and extra register index */
+ if (reg1->idx % 2) {
+ reg1->idx--;
+ hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+ } else {
+ reg1->idx++;
+ hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+ }
+
+ /* adjust extra register config */
+ switch (reg1->idx % 6) {
+ case 2:
+ /* shift the 8~15 bits to the 0~7 bits */
+ reg1->config >>= 8;
+ break;
+ case 3:
+ /* shift the 0~7 bits to the 8~15 bits */
+ reg1->config <<= 8;
+ break;
+ }
+}
+
+/*
+ * Each rbox has 4 event sets which monitor PQI port 0~3 or 4~7.
+ * An event set consists of 6 events, the 3rd and 4th events in
+ * an event set use the same extra register. So an event set uses
+ * 5 extra registers.
+ */
+static struct event_constraint *
+nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ int idx, er_idx;
+ u64 config1;
+ bool ok = false;
+
+ if (!uncore_box_is_fake(box) && reg1->alloc)
+ return NULL;
+
+ idx = reg1->idx % 6;
+ config1 = reg1->config;
+again:
+ er_idx = idx;
+ /* the 3rd and 4th events use the same extra register */
+ if (er_idx > 2)
+ er_idx--;
+ er_idx += (reg1->idx / 6) * 5;
+
+ er = &box->shared_regs[er_idx];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (idx < 2) {
+ if (!atomic_read(&er->ref) || er->config == reg1->config) {
+ atomic_inc(&er->ref);
+ er->config = reg1->config;
+ ok = true;
+ }
+ } else if (idx == 2 || idx == 3) {
+ /*
+		 * these two events use different fields in an extra register,
+ * the 0~7 bits and the 8~15 bits respectively.
+ */
+ u64 mask = 0xff << ((idx - 2) * 8);
+ if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
+ !((er->config ^ config1) & mask)) {
+ atomic_add(1 << ((idx - 2) * 8), &er->ref);
+ er->config &= ~mask;
+ er->config |= config1 & mask;
+ ok = true;
+ }
+ } else {
+ if (!atomic_read(&er->ref) ||
+ (er->config == (hwc->config >> 32) &&
+ er->config1 == reg1->config &&
+ er->config2 == reg2->config)) {
+ atomic_inc(&er->ref);
+ er->config = (hwc->config >> 32);
+ er->config1 = reg1->config;
+ er->config2 = reg2->config;
+ ok = true;
+ }
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ if (!ok) {
+ /*
+ * The Rbox events are always in pairs. The paired
+		 * events are functionally identical, but use different
+ * extra registers. If we failed to take an extra
+ * register, try the alternative.
+ */
+ idx ^= 1;
+ if (idx != reg1->idx % 6) {
+ if (idx == 2)
+ config1 >>= 8;
+ else if (idx == 3)
+ config1 <<= 8;
+ goto again;
+ }
+ } else {
+ if (!uncore_box_is_fake(box)) {
+ if (idx != reg1->idx % 6)
+ nhmex_rbox_alter_er(box, event);
+ reg1->alloc = 1;
+ }
+ return NULL;
+ }
+ return &uncore_constraint_empty;
+}
+
+static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_extra_reg *er;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ int idx, er_idx;
+
+ if (uncore_box_is_fake(box) || !reg1->alloc)
+ return;
+
+ idx = reg1->idx % 6;
+ er_idx = idx;
+ if (er_idx > 2)
+ er_idx--;
+ er_idx += (reg1->idx / 6) * 5;
+
+ er = &box->shared_regs[er_idx];
+ if (idx == 2 || idx == 3)
+ atomic_sub(1 << ((idx - 2) * 8), &er->ref);
+ else
+ atomic_dec(&er->ref);
+
+ reg1->alloc = 0;
+}
+
+static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ int idx;
+
+ idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
+ NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+ if (idx >= 0x18)
+ return -EINVAL;
+
+ reg1->idx = idx;
+ reg1->config = event->attr.config1;
+
+ switch (idx % 6) {
+ case 4:
+ case 5:
+ hwc->config |= event->attr.config & (~0ULL << 32);
+ reg2->config = event->attr.config2;
+ break;
+ }
+ return 0;
+}
+
+static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ int idx, port;
+
+ idx = reg1->idx;
+ port = idx / 6 + box->pmu->pmu_idx * 4;
+
+ switch (idx % 6) {
+ case 0:
+ wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
+ break;
+ case 1:
+ wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
+ break;
+ case 2:
+ case 3:
+ wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
+ uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
+ break;
+ case 4:
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
+ hwc->config >> 32);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
+ break;
+ case 5:
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
+ hwc->config >> 32);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
+ break;
+ }
+
+ wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+ (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
+DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
+
+static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
+ &format_attr_event5.attr,
+ &format_attr_xbr_mm_cfg.attr,
+ &format_attr_xbr_match.attr,
+ &format_attr_xbr_mask.attr,
+ &format_attr_qlx_cfg.attr,
+ &format_attr_iperf_cfg.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_rbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_rbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_rbox_msr_enable_event,
+ .hw_config = nhmex_rbox_hw_config,
+ .get_constraint = nhmex_rbox_get_constraint,
+ .put_constraint = nhmex_rbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_rbox = {
+ .name = "rbox",
+ .num_counters = 8,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_R_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_R_MSR_PMON_CNT0,
+ .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_R_MSR_GLOBAL_CTL,
+ .msr_offset = NHMEX_R_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 20,
+ .event_descs = nhmex_uncore_rbox_events,
+ .ops = &nhmex_uncore_rbox_ops,
+ .format_group = &nhmex_uncore_rbox_format_group
+};
+
+static struct intel_uncore_type *nhmex_msr_uncores[] = {
+ &nhmex_uncore_ubox,
+ &nhmex_uncore_cbox,
+ &nhmex_uncore_bbox,
+ &nhmex_uncore_sbox,
+ &nhmex_uncore_mbox,
+ &nhmex_uncore_rbox,
+ &nhmex_uncore_wbox,
+ NULL,
+};
+
+void nhmex_uncore_cpu_init(void)
+{
+ if (boot_cpu_data.x86_model == 46)
+ uncore_nhmex = true;
+ else
+ nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
+ if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+ uncore_msr_uncores = nhmex_msr_uncores;
+}
+/* end of Nehalem-EX uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
new file mode 100644
index 0000000..e0e934c
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -0,0 +1,603 @@
+/* Nehalem/SandyBridge/Haswell uncore support */
+#include "perf_event_intel_uncore.h"
+
+/* SNB event control */
+#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
+#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
+#define SNB_UNC_CTL_EDGE_DET (1 << 18)
+#define SNB_UNC_CTL_EN (1 << 22)
+#define SNB_UNC_CTL_INVERT (1 << 23)
+#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
+#define NHM_UNC_CTL_CMASK_MASK 0xff000000
+#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
+
+#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ SNB_UNC_CTL_CMASK_MASK)
+
+#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ NHM_UNC_CTL_CMASK_MASK)
+
+/* SNB global control register */
+#define SNB_UNC_PERF_GLOBAL_CTL 0x391
+#define SNB_UNC_FIXED_CTR_CTRL 0x394
+#define SNB_UNC_FIXED_CTR 0x395
+
+/* SNB uncore global control */
+#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
+#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
+
+/* SNB Cbo register */
+#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
+#define SNB_UNC_CBO_0_PER_CTR0 0x706
+#define SNB_UNC_CBO_MSR_OFFSET 0x10
+
+/* NHM global control register */
+#define NHM_UNC_PERF_GLOBAL_CTL 0x391
+#define NHM_UNC_FIXED_CTR 0x394
+#define NHM_UNC_FIXED_CTR_CTRL 0x395
+
+/* NHM uncore global control */
+#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
+#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
+
+/* NHM uncore register */
+#define NHM_UNC_PERFEVTSEL0 0x3c0
+#define NHM_UNC_UNCORE_PMC0 0x3b0
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+
+/* Sandy Bridge uncore support */
+static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+ wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+ else
+ wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
+}
+
+static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ wrmsrl(event->hw.config_base, 0);
+}
+
+static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0) {
+ wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
+ SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+ }
+}
+
+static struct uncore_event_desc snb_uncore_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ { /* end: all zeroes */ },
+};
+
+static struct attribute *snb_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask5.attr,
+ NULL,
+};
+
+static struct attribute_group snb_uncore_format_group = {
+ .name = "format",
+ .attrs = snb_uncore_formats_attr,
+};
+
+static struct intel_uncore_ops snb_uncore_msr_ops = {
+ .init_box = snb_uncore_msr_init_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct event_constraint snb_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snb_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 2,
+ .num_boxes = 4,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
+ .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
+ .fixed_ctr = SNB_UNC_FIXED_CTR,
+ .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .constraints = snb_uncore_cbox_constraints,
+ .ops = &snb_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+ .event_descs = snb_uncore_events,
+};
+
+static struct intel_uncore_type *snb_msr_uncores[] = {
+ &snb_uncore_cbox,
+ NULL,
+};
+
+void snb_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = snb_msr_uncores;
+ if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+}
+
+enum {
+ SNB_PCI_UNCORE_IMC,
+};
+
+static struct uncore_event_desc snb_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
+
+ { /* end: all zeroes */ },
+};
+
+#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff
+#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48
+
+/* page size multiple covering all config regs */
+#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000
+
+#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1
+#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054
+#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE
+
+static struct attribute *snb_uncore_imc_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group snb_uncore_imc_format_group = {
+ .name = "format",
+ .attrs = snb_uncore_imc_formats_attr,
+};
+
+static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
+ resource_size_t addr;
+ u32 pci_dword;
+
+ pci_read_config_dword(pdev, where, &pci_dword);
+ addr = pci_dword;
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ pci_read_config_dword(pdev, where + 4, &pci_dword);
+ addr |= ((resource_size_t)pci_dword << 32);
+#endif
+
+ addr &= ~(PAGE_SIZE - 1);
+
+ box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+ box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
+}
+
+static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
+}
+
+/*
+ * custom event_init() function because we define our own fixed, free
+ * running counters, so we do not want to conflict with generic uncore
+ * logic. Also simplifies processing
+ */
+static int snb_uncore_imc_event_init(struct perf_event *event)
+{
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
+ int idx, base;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ pmu = uncore_event_to_pmu(event);
+ /* no device found for this pmu */
+ if (pmu->func_id < 0)
+ return -ENOENT;
+
+ /* Sampling not supported yet */
+ if (hwc->sample_period)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ /*
+ * Place all uncore events for a particular physical package
+ * onto a single cpu
+ */
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ /* check only supported bits are set */
+ if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
+ return -EINVAL;
+
+ box = uncore_pmu_to_box(pmu, event->cpu);
+ if (!box || box->cpu < 0)
+ return -EINVAL;
+
+ event->cpu = box->cpu;
+
+ event->hw.idx = -1;
+ event->hw.last_tag = ~0ULL;
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
+ /*
+ * check event is known (whitelist, determines counter)
+ */
+ switch (cfg) {
+ case SNB_UNCORE_PCI_IMC_DATA_READS:
+ base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
+ idx = UNCORE_PMC_IDX_FIXED;
+ break;
+ case SNB_UNCORE_PCI_IMC_DATA_WRITES:
+ base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
+ idx = UNCORE_PMC_IDX_FIXED + 1;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* must be done before validate_group */
+ event->hw.event_base = base;
+ event->hw.config = cfg;
+ event->hw.idx = idx;
+
+ /* no group validation needed, we have free running counters */
+
+ return 0;
+}
+
+static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return 0;
+}
+
+static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ u64 count;
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+ box->n_active++;
+
+ list_add_tail(&event->active_entry, &box->active_list);
+
+ count = snb_uncore_imc_read_counter(box, event);
+ local64_set(&event->hw.prev_count, count);
+
+ if (box->n_active == 1)
+ uncore_pmu_start_hrtimer(box);
+}
+
+static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ box->n_active--;
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+
+ list_del(&event->active_entry);
+
+ if (box->n_active == 0)
+ uncore_pmu_cancel_hrtimer(box);
+ }
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+		 * Drain the remaining delta count out of an event
+ * that we are disabling:
+ */
+ uncore_perf_event_update(box, event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box)
+ return -ENODEV;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
+
+ snb_uncore_imc_event_start(event, 0);
+
+ box->n_events++;
+
+ return 0;
+}
+
+static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ int i;
+
+ snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
+
+ for (i = 0; i < box->n_events; i++) {
+ if (event == box->event_list[i]) {
+ --box->n_events;
+ break;
+ }
+ }
+}
+
+static int snb_pci2phy_map_init(int devid)
+{
+ struct pci_dev *dev = NULL;
+ int bus;
+
+ dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
+ if (!dev)
+ return -ENOTTY;
+
+ bus = dev->bus->number;
+
+ uncore_pcibus_to_physid[bus] = 0;
+
+ pci_dev_put(dev);
+
+ return 0;
+}
+
+static struct pmu snb_uncore_imc_pmu = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = snb_uncore_imc_event_init,
+ .add = snb_uncore_imc_event_add,
+ .del = snb_uncore_imc_event_del,
+ .start = snb_uncore_imc_event_start,
+ .stop = snb_uncore_imc_event_stop,
+ .read = uncore_pmu_event_read,
+};
+
+static struct intel_uncore_ops snb_uncore_imc_ops = {
+ .init_box = snb_uncore_imc_init_box,
+ .enable_box = snb_uncore_imc_enable_box,
+ .disable_box = snb_uncore_imc_disable_box,
+ .disable_event = snb_uncore_imc_disable_event,
+ .enable_event = snb_uncore_imc_enable_event,
+ .hw_config = snb_uncore_imc_hw_config,
+ .read_counter = snb_uncore_imc_read_counter,
+};
+
+static struct intel_uncore_type snb_uncore_imc = {
+ .name = "imc",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .fixed_ctr_bits = 32,
+ .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE,
+ .event_descs = snb_uncore_imc_events,
+ .format_group = &snb_uncore_imc_format_group,
+ .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
+ .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK,
+ .ops = &snb_uncore_imc_ops,
+ .pmu = &snb_uncore_imc_pmu,
+};
+
+static struct intel_uncore_type *snb_pci_uncores[] = {
+ [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc,
+ NULL,
+};
+
+static const struct pci_device_id snb_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id ivb_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id hsw_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static struct pci_driver snb_uncore_pci_driver = {
+ .name = "snb_uncore",
+ .id_table = snb_uncore_pci_ids,
+};
+
+static struct pci_driver ivb_uncore_pci_driver = {
+ .name = "ivb_uncore",
+ .id_table = ivb_uncore_pci_ids,
+};
+
+static struct pci_driver hsw_uncore_pci_driver = {
+ .name = "hsw_uncore",
+ .id_table = hsw_uncore_pci_ids,
+};
+
+int snb_uncore_pci_init(void)
+{
+ int ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC);
+ if (ret)
+ return ret;
+ uncore_pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = &snb_uncore_pci_driver;
+ return 0;
+}
+
+int ivb_uncore_pci_init(void)
+{
+ int ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC);
+ if (ret)
+ return ret;
+ uncore_pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = &ivb_uncore_pci_driver;
+ return 0;
+}
+
+int hsw_uncore_pci_init(void)
+{
+ int ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC);
+ if (ret)
+ return ret;
+ uncore_pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = &hsw_uncore_pci_driver;
+ return 0;
+}
+
+/* end of Sandy Bridge uncore support */
+
+/* Nehalem uncore support */
+static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
+}
+
+static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+ wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+ else
+ wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
+}
+
+static struct attribute *nhm_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask8.attr,
+ NULL,
+};
+
+static struct attribute_group nhm_uncore_format_group = {
+ .name = "format",
+ .attrs = nhm_uncore_formats_attr,
+};
+
+static struct uncore_event_desc nhm_uncore_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
+ INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhm_uncore_msr_ops = {
+ .disable_box = nhm_uncore_msr_disable_box,
+ .enable_box = nhm_uncore_msr_enable_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = nhm_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type nhm_uncore = {
+ .name = "",
+ .num_counters = 8,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .event_ctl = NHM_UNC_PERFEVTSEL0,
+ .perf_ctr = NHM_UNC_UNCORE_PMC0,
+ .fixed_ctr = NHM_UNC_FIXED_CTR,
+ .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
+ .event_mask = NHM_UNC_RAW_EVENT_MASK,
+ .event_descs = nhm_uncore_events,
+ .ops = &nhm_uncore_msr_ops,
+ .format_group = &nhm_uncore_format_group,
+};
+
+static struct intel_uncore_type *nhm_msr_uncores[] = {
+ &nhm_uncore,
+ NULL,
+};
+
+void nhm_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = nhm_msr_uncores;
+}
+
+/* end of Nehalem uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
new file mode 100644
index 0000000..6606ed0
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -0,0 +1,1645 @@
+/* SandyBridge-EP/IvyTown uncore support */
+#include "perf_event_intel_uncore.h"
+
+
+/* SNB-EP Box level control */
+#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
+#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
+#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8)
+#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16)
+#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+ SNBEP_PMON_BOX_CTL_RST_CTRS | \
+ SNBEP_PMON_BOX_CTL_FRZ_EN)
+/* SNB-EP event control */
+#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff
+#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00
+#define SNBEP_PMON_CTL_RST (1 << 17)
+#define SNBEP_PMON_CTL_EDGE_DET (1 << 18)
+#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21)
+#define SNBEP_PMON_CTL_EN (1 << 22)
+#define SNBEP_PMON_CTL_INVERT (1 << 23)
+#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000
+#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_PMON_CTL_TRESH_MASK)
+
+/* SNB-EP Ubox event control */
+#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000
+#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+
+#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19)
+#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* SNB-EP PCU event control */
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000
+#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30)
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31)
+#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_EV_SEL_EXT | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_PMON_CTL_EV_SEL_EXT)
+
+/* SNB-EP pci control register */
+#define SNBEP_PCI_PMON_BOX_CTL 0xf4
+#define SNBEP_PCI_PMON_CTL0 0xd8
+/* SNB-EP pci counter register */
+#define SNBEP_PCI_PMON_CTR0 0xa0
+
+/* SNB-EP home agent register */
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44
+#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48
+/* SNB-EP memory controller register */
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0
+/* SNB-EP QPI register */
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c
+
+/* SNB-EP Ubox register */
+#define SNBEP_U_MSR_PMON_CTR0 0xc16
+#define SNBEP_U_MSR_PMON_CTL0 0xc10
+
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09
+
+/* SNB-EP Cbo register */
+#define SNBEP_C0_MSR_PMON_CTR0 0xd16
+#define SNBEP_C0_MSR_PMON_CTL0 0xd10
+#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04
+#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14
+#define SNBEP_CBO_MSR_OFFSET 0x20
+
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000
+
+#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \
+ .event = (e), \
+ .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \
+ .config_mask = (m), \
+ .idx = (i) \
+}
+
+/* SNB-EP PCU register */
+#define SNBEP_PCU_MSR_PMON_CTR0 0xc36
+#define SNBEP_PCU_MSR_PMON_CTL0 0xc30
+#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff
+#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc
+#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd
+
+/* IVBEP event control */
+#define IVBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+ SNBEP_PMON_BOX_CTL_RST_CTRS)
+#define IVBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_TRESH_MASK)
+/* IVBEP Ubox */
+#define IVBEP_U_MSR_PMON_GLOBAL_CTL 0xc00
+#define IVBEP_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
+#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29)
+
+#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+/* IVBEP Cbo */
+#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK (IVBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_IOSC (0x1ULL << 63)
+
+/* IVBEP home agent */
+#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16)
+#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK \
+ (IVBEP_PMON_RAW_EVENT_MASK | \
+ IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST)
+/* IVBEP PCU */
+#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_EV_SEL_EXT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+/* IVBEP QPI */
+#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
+ (IVBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_PMON_CTL_EV_SEL_EXT)
+
+#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
+ ((1ULL << (n)) - 1)))
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
+DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
+DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
+DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
+
+static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+ u32 config = 0;
+
+ if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+ config |= SNBEP_PMON_BOX_CTL_FRZ;
+ pci_write_config_dword(pdev, box_ctl, config);
+ }
+}
+
+static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+ u32 config = 0;
+
+ if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+ config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+ pci_write_config_dword(pdev, box_ctl, config);
+ }
+}
+
+static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config);
+}
+
+static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
+ pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ u64 config;
+ unsigned msr;
+
+ msr = uncore_msr_box_ctl(box);
+ if (msr) {
+ rdmsrl(msr, config);
+ config |= SNBEP_PMON_BOX_CTL_FRZ;
+ wrmsrl(msr, config);
+ }
+}
+
+static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ u64 config;
+ unsigned msr;
+
+ msr = uncore_msr_box_ctl(box);
+ if (msr) {
+ rdmsrl(msr, config);
+ config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+ wrmsrl(msr, config);
+ }
+}
+
+static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE)
+ wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
+
+ wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, hwc->config);
+}
+
+static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+
+ if (msr)
+ wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static struct attribute *snbep_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid.attr,
+ &format_attr_filter_nid.attr,
+ &format_attr_filter_state.attr,
+ &format_attr_filter_opc.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_pcu_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge.attr,
+ &format_attr_filter_band0.attr,
+ &format_attr_filter_band1.attr,
+ &format_attr_filter_band2.attr,
+ &format_attr_filter_band3.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_qpi_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match_rds.attr,
+ &format_attr_match_rnid30.attr,
+ &format_attr_match_rnid4.attr,
+ &format_attr_match_dnid.attr,
+ &format_attr_match_mc.attr,
+ &format_attr_match_opc.attr,
+ &format_attr_match_vnw.attr,
+ &format_attr_match0.attr,
+ &format_attr_match1.attr,
+ &format_attr_mask_rds.attr,
+ &format_attr_mask_rnid30.attr,
+ &format_attr_mask_rnid4.attr,
+ &format_attr_mask_dnid.attr,
+ &format_attr_mask_mc.attr,
+ &format_attr_mask_opc.attr,
+ &format_attr_mask_vnw.attr,
+ &format_attr_mask0.attr,
+ &format_attr_mask1.attr,
+ NULL,
+};
+
+static struct uncore_event_desc snbep_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+ { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc snbep_uncore_qpi_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
+ INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
+ INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"),
+ INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"),
+ { /* end: all zeroes */ },
+};
+
+static struct attribute_group snbep_uncore_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_ubox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_cbox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_pcu_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_qpi_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_qpi_formats_attr,
+};
+
+#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
+ .init_box = snbep_uncore_msr_init_box, \
+ .disable_box = snbep_uncore_msr_disable_box, \
+ .enable_box = snbep_uncore_msr_enable_box, \
+ .disable_event = snbep_uncore_msr_disable_event, \
+ .enable_event = snbep_uncore_msr_enable_event, \
+ .read_counter = uncore_msr_read_counter
+
+static struct intel_uncore_ops snbep_uncore_msr_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \
+ .init_box = snbep_uncore_pci_init_box, \
+ .disable_box = snbep_uncore_pci_disable_box, \
+ .enable_box = snbep_uncore_pci_enable_box, \
+ .disable_event = snbep_uncore_pci_disable_event, \
+ .read_counter = snbep_uncore_pci_read_counter
+
+static struct intel_uncore_ops snbep_uncore_pci_ops = {
+ SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+	.enable_event	= snbep_uncore_pci_enable_event,
+};
+
+static struct event_constraint snbep_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
+ EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snbep_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &snbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
+ EVENT_EXTRA_END
+};
+
+static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ int i;
+
+ if (uncore_box_is_fake(box))
+ return;
+
+ for (i = 0; i < 5; i++) {
+ if (reg1->alloc & (0x1 << i))
+ atomic_sub(1 << (i * 6), &er->ref);
+ }
+ reg1->alloc = 0;
+}
+
+static struct event_constraint *
+__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
+ u64 (*cbox_filter_mask)(int fields))
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ int i, alloc = 0;
+ unsigned long flags;
+ u64 mask;
+
+ if (reg1->idx == EXTRA_REG_NONE)
+ return NULL;
+
+ raw_spin_lock_irqsave(&er->lock, flags);
+ for (i = 0; i < 5; i++) {
+ if (!(reg1->idx & (0x1 << i)))
+ continue;
+ if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+ continue;
+
+ mask = cbox_filter_mask(0x1 << i);
+ if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
+ !((reg1->config ^ er->config) & mask)) {
+ atomic_add(1 << (i * 6), &er->ref);
+ er->config &= ~mask;
+ er->config |= reg1->config & mask;
+ alloc |= (0x1 << i);
+ } else {
+ break;
+ }
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+ if (i < 5)
+ goto fail;
+
+ if (!uncore_box_is_fake(box))
+ reg1->alloc |= alloc;
+
+ return NULL;
+fail:
+ for (; i >= 0; i--) {
+ if (alloc & (0x1 << i))
+ atomic_sub(1 << (i * 6), &er->ref);
+ }
+ return &uncore_constraint_empty;
+}
+
+static u64 snbep_cbox_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+ if (fields & 0x4)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x8)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+
+ return mask;
+}
+
+static struct event_constraint *
+snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
+}
+
+static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+ SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_cbox_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_cbox_hw_config,
+ .get_constraint = snbep_cbox_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 44,
+ .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = SNBEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = snbep_uncore_cbox_constraints,
+ .ops = &snbep_uncore_cbox_ops,
+ .format_group = &snbep_uncore_cbox_format_group,
+};
+
+static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ u64 config = reg1->config;
+
+ if (new_idx > reg1->idx)
+ config <<= 8 * (new_idx - reg1->idx);
+ else
+ config >>= 8 * (reg1->idx - new_idx);
+
+ if (modify) {
+ hwc->config += new_idx - reg1->idx;
+ reg1->config = config;
+ reg1->idx = new_idx;
+ }
+ return config;
+}
+
+static struct event_constraint *
+snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ unsigned long flags;
+ int idx = reg1->idx;
+ u64 mask, config1 = reg1->config;
+ bool ok = false;
+
+ if (reg1->idx == EXTRA_REG_NONE ||
+ (!uncore_box_is_fake(box) && reg1->alloc))
+ return NULL;
+again:
+ mask = 0xffULL << (idx * 8);
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
+ !((config1 ^ er->config) & mask)) {
+ atomic_add(1 << (idx * 8), &er->ref);
+ er->config &= ~mask;
+ er->config |= config1 & mask;
+ ok = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ if (!ok) {
+ idx = (idx + 1) % 4;
+ if (idx != reg1->idx) {
+ config1 = snbep_pcu_alter_er(event, idx, false);
+ goto again;
+ }
+ return &uncore_constraint_empty;
+ }
+
+ if (!uncore_box_is_fake(box)) {
+ if (idx != reg1->idx)
+ snbep_pcu_alter_er(event, idx, true);
+ reg1->alloc = 1;
+ }
+ return NULL;
+}
+
+static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+
+ if (uncore_box_is_fake(box) || !reg1->alloc)
+ return;
+
+ atomic_sub(1 << (reg1->idx * 8), &er->ref);
+ reg1->alloc = 0;
+}
+
+static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+ if (ev_sel >= 0xb && ev_sel <= 0xe) {
+ reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
+ reg1->idx = ev_sel - 0xb;
+ reg1->config = event->attr.config1 & (0xff << reg1->idx);
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_pcu_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_pcu_ops,
+ .format_group = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *snbep_msr_uncores[] = {
+ &snbep_uncore_ubox,
+ &snbep_uncore_cbox,
+ &snbep_uncore_pcu,
+ NULL,
+};
+
+void snbep_uncore_cpu_init(void)
+{
+ if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+ uncore_msr_uncores = snbep_msr_uncores;
+}
+
+enum {
+ SNBEP_PCI_QPI_PORT0_FILTER,
+ SNBEP_PCI_QPI_PORT1_FILTER,
+};
+
+static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
+ reg1->idx = 0;
+ reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
+ reg1->config = event->attr.config1;
+ reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
+ reg2->config = event->attr.config2;
+ }
+ return 0;
+}
+
+static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
+ struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx];
+ WARN_ON_ONCE(!filter_pdev);
+ if (filter_pdev) {
+ pci_write_config_dword(filter_pdev, reg1->reg,
+ (u32)reg1->config);
+ pci_write_config_dword(filter_pdev, reg1->reg + 4,
+ (u32)(reg1->config >> 32));
+ pci_write_config_dword(filter_pdev, reg2->reg,
+ (u32)reg2->config);
+ pci_write_config_dword(filter_pdev, reg2->reg + 4,
+ (u32)(reg2->config >> 32));
+ }
+ }
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops snbep_uncore_qpi_ops = {
+ SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+ .enable_event = snbep_qpi_enable_event,
+ .hw_config = snbep_qpi_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+#define SNBEP_UNCORE_PCI_COMMON_INIT() \
+ .perf_ctr = SNBEP_PCI_PMON_CTR0, \
+ .event_ctl = SNBEP_PCI_PMON_CTL0, \
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
+ .ops = &snbep_uncore_pci_ops, \
+ .format_group = &snbep_uncore_format_group
+
+static struct intel_uncore_type snbep_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = snbep_uncore_imc_events,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .event_descs = snbep_uncore_qpi_events,
+ .format_group = &snbep_uncore_qpi_format_group,
+};
+
+
+static struct intel_uncore_type snbep_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r3qpi_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ SNBEP_PCI_UNCORE_HA,
+ SNBEP_PCI_UNCORE_IMC,
+ SNBEP_PCI_UNCORE_QPI,
+ SNBEP_PCI_UNCORE_R2PCIE,
+ SNBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *snbep_pci_uncores[] = {
+ [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha,
+ [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc,
+ [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi,
+ [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie,
+ [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi,
+ NULL,
+};
+
+static const struct pci_device_id snbep_uncore_pci_ids[] = {
+ { /* Home Agent */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
+ },
+ { /* MC Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
+ },
+ { /* QPI Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+	{ /* QPI Port 1 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver snbep_uncore_pci_driver = {
+ .name = "snbep_uncore",
+ .id_table = snbep_uncore_pci_ids,
+};
+
+/*
+ * build pci bus to socket mapping
+ */
+static int snbep_pci2phy_map_init(int devid)
+{
+ struct pci_dev *ubox_dev = NULL;
+ int i, bus, nodeid;
+ int err = 0;
+ u32 config = 0;
+
+ while (1) {
+ /* find the UBOX device */
+ ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
+ if (!ubox_dev)
+ break;
+ bus = ubox_dev->bus->number;
+ /* get the Node ID of the local register */
+ err = pci_read_config_dword(ubox_dev, 0x40, &config);
+ if (err)
+ break;
+ nodeid = config;
+ /* get the Node ID mapping */
+ err = pci_read_config_dword(ubox_dev, 0x54, &config);
+ if (err)
+ break;
+ /*
+ * every three bits in the Node ID mapping register maps
+ * to a particular node.
+ */
+ for (i = 0; i < 8; i++) {
+ if (nodeid == ((config >> (3 * i)) & 0x7)) {
+ uncore_pcibus_to_physid[bus] = i;
+ break;
+ }
+ }
+ }
+
+ if (!err) {
+ /*
+ * For PCI bus with no UBOX device, find the next bus
+ * that has UBOX device and use its mapping.
+ */
+ i = -1;
+ for (bus = 255; bus >= 0; bus--) {
+ if (uncore_pcibus_to_physid[bus] >= 0)
+ i = uncore_pcibus_to_physid[bus];
+ else
+ uncore_pcibus_to_physid[bus] = i;
+ }
+ }
+
+ if (ubox_dev)
+ pci_dev_put(ubox_dev);
+
+ return err ? pcibios_err_to_errno(err) : 0;
+}
+
+int snbep_uncore_pci_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x3ce0);
+ if (ret)
+ return ret;
+ uncore_pci_uncores = snbep_pci_uncores;
+ uncore_pci_driver = &snbep_uncore_pci_driver;
+ return 0;
+}
+/* end of Sandy Bridge-EP uncore support */
+
+/* IvyTown uncore support */
+static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+ if (msr)
+ wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT() \
+ .init_box = ivbep_uncore_msr_init_box, \
+ .disable_box = snbep_uncore_msr_disable_box, \
+ .enable_box = snbep_uncore_msr_enable_box, \
+ .disable_event = snbep_uncore_msr_disable_event, \
+ .enable_event = snbep_uncore_msr_enable_event, \
+ .read_counter = uncore_msr_read_counter
+
+static struct intel_uncore_ops ivbep_uncore_msr_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+static struct intel_uncore_ops ivbep_uncore_pci_ops = {
+ .init_box = ivbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+#define IVBEP_UNCORE_PCI_COMMON_INIT() \
+ .perf_ctr = SNBEP_PCI_PMON_CTR0, \
+ .event_ctl = SNBEP_PCI_PMON_CTL0, \
+ .event_mask = IVBEP_PMON_RAW_EVENT_MASK, \
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
+ .ops = &ivbep_uncore_pci_ops, \
+ .format_group = &ivbep_uncore_format_group
+
+static struct attribute *ivbep_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute *ivbep_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid.attr,
+ &format_attr_filter_link.attr,
+ &format_attr_filter_state2.attr,
+ &format_attr_filter_nid2.attr,
+ &format_attr_filter_opc2.attr,
+ NULL,
+};
+
+static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_thresh5.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge.attr,
+ &format_attr_filter_band0.attr,
+ &format_attr_filter_band1.attr,
+ &format_attr_filter_band2.attr,
+ &format_attr_filter_band3.attr,
+ NULL,
+};
+
+static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match_rds.attr,
+ &format_attr_match_rnid30.attr,
+ &format_attr_match_rnid4.attr,
+ &format_attr_match_dnid.attr,
+ &format_attr_match_mc.attr,
+ &format_attr_match_opc.attr,
+ &format_attr_match_vnw.attr,
+ &format_attr_match0.attr,
+ &format_attr_match1.attr,
+ &format_attr_mask_rds.attr,
+ &format_attr_mask_rnid30.attr,
+ &format_attr_mask_rnid4.attr,
+ &format_attr_mask_dnid.attr,
+ &format_attr_mask_mc.attr,
+ &format_attr_mask_opc.attr,
+ &format_attr_mask_vnw.attr,
+ &format_attr_mask0.attr,
+ &format_attr_mask1.attr,
+ NULL,
+};
+
+static struct attribute_group ivbep_uncore_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_ubox_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_cbox_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_pcu_formats_attr,
+};
+
+static struct attribute_group ivbep_uncore_qpi_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_qpi_formats_attr,
+};
+
+static struct intel_uncore_type ivbep_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_U_MSR_PMON_CTL0,
+ .event_mask = IVBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg ivbep_uncore_cbox_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+ EVENT_EXTRA_END
+};
+
+static u64 ivbep_cbox_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK;
+ if (fields & 0x4)
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x8)
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+ if (fields & 0x10)
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+
+ return mask;
+}
+
+static struct event_constraint *
+ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask);
+}
+
+static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+ SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ u64 filter = uncore_shared_reg_config(box, 0);
+ wrmsrl(reg1->reg, filter & 0xffffffff);
+ wrmsrl(reg1->reg + 6, filter >> 32);
+ }
+
+ wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops ivbep_uncore_cbox_ops = {
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = ivbep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = ivbep_cbox_hw_config,
+ .get_constraint = ivbep_cbox_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 15,
+ .perf_ctr_bits = 44,
+ .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
+ .event_mask = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = SNBEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = snbep_uncore_cbox_constraints,
+ .ops = &ivbep_uncore_cbox_ops,
+ .format_group = &ivbep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_pcu_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
+ .event_mask = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivbep_uncore_pcu_ops,
+ .format_group = &ivbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *ivbep_msr_uncores[] = {
+ &ivbep_uncore_ubox,
+ &ivbep_uncore_cbox,
+ &ivbep_uncore_pcu,
+ NULL,
+};
+
+void ivbep_uncore_cpu_init(void)
+{
+ if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+ uncore_msr_uncores = ivbep_msr_uncores;
+}
+
+static struct intel_uncore_type ivbep_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = snbep_uncore_imc_events,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+/* registers in IRP boxes are not properly aligned */
+static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
+static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
+
+static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx],
+ hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config);
+}
+
+static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+ pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static struct intel_uncore_ops ivbep_uncore_irp_ops = {
+ .init_box = ivbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = ivbep_uncore_irp_disable_event,
+ .enable_event = ivbep_uncore_irp_enable_event,
+ .read_counter = ivbep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type ivbep_uncore_irp = {
+ .name = "irp",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_mask = IVBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_irp_ops,
+ .format_group = &ivbep_uncore_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_qpi_ops = {
+ .init_box = ivbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_qpi_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+ .hw_config = snbep_qpi_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivbep_uncore_qpi_ops,
+ .format_group = &ivbep_uncore_qpi_format_group,
+};
+
+static struct intel_uncore_type ivbep_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r2pcie_constraints,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r3qpi_constraints,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ IVBEP_PCI_UNCORE_HA,
+ IVBEP_PCI_UNCORE_IMC,
+ IVBEP_PCI_UNCORE_IRP,
+ IVBEP_PCI_UNCORE_QPI,
+ IVBEP_PCI_UNCORE_R2PCIE,
+ IVBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *ivbep_pci_uncores[] = {
+ [IVBEP_PCI_UNCORE_HA] = &ivbep_uncore_ha,
+ [IVBEP_PCI_UNCORE_IMC] = &ivbep_uncore_imc,
+ [IVBEP_PCI_UNCORE_IRP] = &ivbep_uncore_irp,
+ [IVBEP_PCI_UNCORE_QPI] = &ivbep_uncore_qpi,
+ [IVBEP_PCI_UNCORE_R2PCIE] = &ivbep_uncore_r2pcie,
+ [IVBEP_PCI_UNCORE_R3QPI] = &ivbep_uncore_r3qpi,
+ NULL,
+};
+
+static const struct pci_device_id ivbep_uncore_pci_ids[] = {
+ { /* Home Agent 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0),
+ },
+ { /* Home Agent 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1),
+ },
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC0 Channel 4 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5),
+ },
+ { /* MC1 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6),
+ },
+ { /* MC1 Channel 4 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0),
+ },
+ { /* QPI0 Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI0 Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1),
+ },
+ { /* QPI1 Port 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* R3QPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+	{ /* QPI Port 1 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver ivbep_uncore_pci_driver = {
+ .name = "ivbep_uncore",
+ .id_table = ivbep_uncore_pci_ids,
+};
+
+int ivbep_uncore_pci_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x0e1e);
+ if (ret)
+ return ret;
+ uncore_pci_uncores = ivbep_pci_uncores;
+ uncore_pci_driver = &ivbep_uncore_pci_driver;
+ return 0;
+}
+/* end of IvyTown uncore support */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2fac134..df088bb 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -404,8 +404,8 @@
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,0
- movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
+ SAVE_ARGS 8, 0, rax_enosys=1
+ movq_cfi rax,(ORIG_RAX-ARGOFFSET)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
@@ -417,7 +417,7 @@
andl $__SYSCALL_MASK,%eax
cmpl $__NR_syscall_max,%eax
#endif
- ja badsys
+ ja ret_from_sys_call /* and return regs->ax */
movq %r10,%rcx
call *sys_call_table(,%rax,8) # XXX: rip relative
movq %rax,RAX-ARGOFFSET(%rsp)
@@ -476,28 +476,8 @@
FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
jmp int_check_syscall_exit_work
-badsys:
- movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
- jmp ret_from_sys_call
-
#ifdef CONFIG_AUDITSYSCALL
/*
- * Fast path for syscall audit without full syscall trace.
- * We just call __audit_syscall_entry() directly, and then
- * jump back to the normal fast path.
- */
-auditsys:
- movq %r10,%r9 /* 6th arg: 4th syscall arg */
- movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
- movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
- movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
- movq %rax,%rsi /* 2nd arg: syscall number */
- movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
- call __audit_syscall_entry
- LOAD_ARGS 0 /* reload call-clobbered registers */
- jmp system_call_fastpath
-
- /*
* Return fast path for syscall audit. Call __audit_syscall_exit()
* directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
* masked off.
@@ -514,18 +494,25 @@
/* Do syscall tracing */
tracesys:
-#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- jz auditsys
-#endif
+ leaq -REST_SKIP(%rsp), %rdi
+ movq $AUDIT_ARCH_X86_64, %rsi
+ call syscall_trace_enter_phase1
+ test %rax, %rax
+ jnz tracesys_phase2 /* if needed, run the slow path */
+ LOAD_ARGS 0 /* else restore clobbered regs */
+ jmp system_call_fastpath /* and return to the fast path */
+
+tracesys_phase2:
SAVE_REST
- movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
FIXUP_TOP_OF_STACK %rdi
- movq %rsp,%rdi
- call syscall_trace_enter
+ movq %rsp, %rdi
+ movq $AUDIT_ARCH_X86_64, %rsi
+ movq %rax,%rdx
+ call syscall_trace_enter_phase2
+
/*
* Reload arg registers from stack in case ptrace changed them.
- * We don't reload %rax because syscall_trace_enter() returned
- * We don't reload %rax because syscall_trace_enter_phase2() returned
* the value it wants us to use in the table lookup.
*/
LOAD_ARGS ARGOFFSET, 1
@@ -536,7 +523,7 @@
andl $__SYSCALL_MASK,%eax
cmpl $__NR_syscall_max,%eax
#endif
- ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
+ ja int_ret_from_sys_call /* RAX(%rsp) is already set */
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
index 9030e83..0a2faa3 100644
--- a/arch/x86/kernel/iosf_mbi.c
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -22,6 +22,8 @@
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
+#include <linux/debugfs.h>
+#include <linux/capability.h>
#include <asm/iosf_mbi.h>
@@ -187,6 +189,75 @@
}
EXPORT_SYMBOL(iosf_mbi_available);
+/********************** debugfs begin ****************************/
+static u32 dbg_mdr;
+static u32 dbg_mcr;
+static u32 dbg_mcrx;
+
+static int mcr_get(void *data, u64 *val)
+{
+ *val = *(u32 *)data;
+ return 0;
+}
+
+static int mcr_set(void *data, u64 val)
+{
+ u8 command = ((u32)val & 0xFF000000) >> 24,
+ port = ((u32)val & 0x00FF0000) >> 16,
+ offset = ((u32)val & 0x0000FF00) >> 8;
+ int err;
+
+ *(u32 *)data = val;
+
+ if (!capable(CAP_SYS_RAWIO))
+ return -EACCES;
+
+ if (command & 1u)
+ err = iosf_mbi_write(port,
+ command,
+ dbg_mcrx | offset,
+ dbg_mdr);
+ else
+ err = iosf_mbi_read(port,
+ command,
+ dbg_mcrx | offset,
+ &dbg_mdr);
+
+ return err;
+}
+DEFINE_SIMPLE_ATTRIBUTE(iosf_mcr_fops, mcr_get, mcr_set , "%llx\n");
+
+static struct dentry *iosf_dbg;
+/*
+ * Create the iosf_sb debugfs hierarchy (mdr/mcrx/mcr). On any creation
+ * failure the whole directory is torn down; debugfs being unavailable
+ * is not an error for the driver.
+ */
+static void iosf_sideband_debug_init(void)
+{
+	struct dentry *d;
+
+	iosf_dbg = debugfs_create_dir("iosf_sb", NULL);
+	if (IS_ERR_OR_NULL(iosf_dbg))
+		return;
+
+	/* mdr */
+	d = debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr);
+	if (IS_ERR_OR_NULL(d))
+		goto cleanup;
+
+	/* mcrx */
+	d = debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx);
+	if (IS_ERR_OR_NULL(d))
+		goto cleanup;
+
+	/* mcr - initiates mailbox transaction */
+	d = debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops);
+	if (IS_ERR_OR_NULL(d))
+		goto cleanup;
+
+	return;
+
+cleanup:
+	/* remove the directory we created, not the (possibly ERR_PTR) leaf */
+	debugfs_remove_recursive(iosf_dbg);
+}
+/********************** debugfs end ****************************/
+
static int iosf_mbi_probe(struct pci_dev *pdev,
const struct pci_device_id *unused)
{
@@ -217,11 +288,14 @@
static int __init iosf_mbi_init(void)
{
+ iosf_sideband_debug_init();
return pci_register_driver(&iosf_mbi_pci_driver);
}
static void __exit iosf_mbi_exit(void)
{
+ debugfs_remove_recursive(iosf_dbg);
+
pci_unregister_driver(&iosf_mbi_pci_driver);
if (mbi_pdev) {
pci_dev_put(mbi_pdev);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index f304773..f1314d0 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -338,8 +338,10 @@
* a relative jump.
*/
rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
- if (abs(rel) > 0x7fffffff)
+ if (abs(rel) > 0x7fffffff) {
+ __arch_remove_optimized_kprobe(op, 0);
return -ERANGE;
+ }
buf = (u8 *)op->optinsn.insn;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f804dc9..e127dda 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -64,14 +64,16 @@
*/
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- int ret;
-
*dst = *src;
- if (fpu_allocated(&src->thread.fpu)) {
- memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
- ret = fpu_alloc(&dst->thread.fpu);
- if (ret)
- return ret;
+
+ dst->thread.fpu_counter = 0;
+ dst->thread.fpu.has_fpu = 0;
+ dst->thread.fpu.last_cpu = ~0;
+ dst->thread.fpu.state = NULL;
+ if (tsk_used_math(src)) {
+ int err = fpu_alloc(&dst->thread.fpu);
+ if (err)
+ return err;
fpu_copy(dst, src);
}
return 0;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 7bc86bb..8f3ebfe 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -138,6 +138,7 @@
p->thread.sp = (unsigned long) childregs;
p->thread.sp0 = (unsigned long) (childregs+1);
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
if (unlikely(p->flags & PF_KTHREAD)) {
/* kernel thread */
@@ -152,9 +153,7 @@
childregs->orig_ax = -1;
childregs->cs = __KERNEL_CS | get_kernel_rpl();
childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
- p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
- memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
return 0;
}
*childregs = *current_pt_regs();
@@ -165,13 +164,10 @@
p->thread.ip = (unsigned long) ret_from_fork;
task_user_gs(p) = get_user_gs(current_pt_regs());
- p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
tsk = current;
err = -ENOMEM;
- memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
-
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ca5b02d..3ed4a68 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -163,7 +163,6 @@
p->thread.sp = (unsigned long) childregs;
p->thread.usersp = me->thread.usersp;
set_tsk_thread_flag(p, TIF_FORK);
- p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
savesegment(gs, p->thread.gsindex);
@@ -193,8 +192,6 @@
childregs->sp = sp;
err = -ENOMEM;
- memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
-
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 678c0ad..29576c2 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1441,24 +1441,126 @@
force_sig_info(SIGTRAP, &info, tsk);
}
-
-#ifdef CONFIG_X86_32
-# define IS_IA32 1
-#elif defined CONFIG_IA32_EMULATION
-# define IS_IA32 is_compat_task()
-#else
-# define IS_IA32 0
+static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
+{
+#ifdef CONFIG_X86_64
+ if (arch == AUDIT_ARCH_X86_64) {
+ audit_syscall_entry(arch, regs->orig_ax, regs->di,
+ regs->si, regs->dx, regs->r10);
+ } else
#endif
+ {
+ audit_syscall_entry(arch, regs->orig_ax, regs->bx,
+ regs->cx, regs->dx, regs->si);
+ }
+}
/*
- * We must return the syscall number to actually look up in the table.
- * This can be -1L to skip running any syscall at all.
+ * We can return 0 to resume the syscall or anything else to go to phase
+ * 2. If we resume the syscall, we need to put something appropriate in
+ * regs->orig_ax.
+ *
+ * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
+ * are fully functional.
+ *
+ * For phase 2's benefit, our return value is:
+ * 0: resume the syscall
+ * 1: go to phase 2; no seccomp phase 2 needed
+ * anything else: go to phase 2; pass return value to seccomp
*/
-long syscall_trace_enter(struct pt_regs *regs)
+unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
+{
+ unsigned long ret = 0;
+ u32 work;
+
+ BUG_ON(regs != task_pt_regs(current));
+
+ work = ACCESS_ONCE(current_thread_info()->flags) &
+ _TIF_WORK_SYSCALL_ENTRY;
+
+ /*
+ * If TIF_NOHZ is set, we are required to call user_exit() before
+ * doing anything that could touch RCU.
+ */
+ if (work & _TIF_NOHZ) {
+ user_exit();
+ work &= ~TIF_NOHZ;
+ }
+
+#ifdef CONFIG_SECCOMP
+ /*
+ * Do seccomp first -- it should minimize exposure of other
+ * code, and keeping seccomp fast is probably more valuable
+ * than the rest of this.
+ */
+ if (work & _TIF_SECCOMP) {
+ struct seccomp_data sd;
+
+ sd.arch = arch;
+ sd.nr = regs->orig_ax;
+ sd.instruction_pointer = regs->ip;
+#ifdef CONFIG_X86_64
+ if (arch == AUDIT_ARCH_X86_64) {
+ sd.args[0] = regs->di;
+ sd.args[1] = regs->si;
+ sd.args[2] = regs->dx;
+ sd.args[3] = regs->r10;
+ sd.args[4] = regs->r8;
+ sd.args[5] = regs->r9;
+ } else
+#endif
+ {
+ sd.args[0] = regs->bx;
+ sd.args[1] = regs->cx;
+ sd.args[2] = regs->dx;
+ sd.args[3] = regs->si;
+ sd.args[4] = regs->di;
+ sd.args[5] = regs->bp;
+ }
+
+ BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
+ BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
+
+ ret = seccomp_phase1(&sd);
+ if (ret == SECCOMP_PHASE1_SKIP) {
+ regs->orig_ax = -1;
+ ret = 0;
+ } else if (ret != SECCOMP_PHASE1_OK) {
+ return ret; /* Go directly to phase 2 */
+ }
+
+ work &= ~_TIF_SECCOMP;
+ }
+#endif
+
+ /* Do our best to finish without phase 2. */
+ if (work == 0)
+ return ret; /* seccomp and/or nohz only (ret == 0 here) */
+
+#ifdef CONFIG_AUDITSYSCALL
+ if (work == _TIF_SYSCALL_AUDIT) {
+ /*
+ * If there is no more work to be done except auditing,
+ * then audit in phase 1. Phase 2 always audits, so, if
+ * we audit here, then we can't go on to phase 2.
+ */
+ do_audit_syscall_entry(regs, arch);
+ return 0;
+ }
+#endif
+
+ return 1; /* Something is enabled that we can't handle in phase 1 */
+}
+
+/* Returns the syscall nr to run (which should match regs->orig_ax). */
+long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
+ unsigned long phase1_result)
{
long ret = 0;
+ u32 work = ACCESS_ONCE(current_thread_info()->flags) &
+ _TIF_WORK_SYSCALL_ENTRY;
- user_exit();
+ BUG_ON(regs != task_pt_regs(current));
/*
* If we stepped into a sysenter/syscall insn, it trapped in
@@ -1467,17 +1569,21 @@
* do_debug() and we need to set it again to restore the user
* state. If we entered on the slow path, TF was already set.
*/
- if (test_thread_flag(TIF_SINGLESTEP))
+ if (work & _TIF_SINGLESTEP)
regs->flags |= X86_EFLAGS_TF;
- /* do the secure computing check first */
- if (secure_computing(regs->orig_ax)) {
+#ifdef CONFIG_SECCOMP
+ /*
+ * Call seccomp_phase2 before running the other hooks so that
+ * they can see any changes made by a seccomp tracer.
+ */
+ if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
/* seccomp failures shouldn't expose any additional code. */
- ret = -1L;
- goto out;
+ return -1;
}
+#endif
- if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+ if (unlikely(work & _TIF_SYSCALL_EMU))
ret = -1L;
if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
@@ -1487,23 +1593,22 @@
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_enter(regs, regs->orig_ax);
- if (IS_IA32)
- audit_syscall_entry(AUDIT_ARCH_I386,
- regs->orig_ax,
- regs->bx, regs->cx,
- regs->dx, regs->si);
-#ifdef CONFIG_X86_64
- else
- audit_syscall_entry(AUDIT_ARCH_X86_64,
- regs->orig_ax,
- regs->di, regs->si,
- regs->dx, regs->r10);
-#endif
+ do_audit_syscall_entry(regs, arch);
-out:
return ret ?: regs->orig_ax;
}
+long syscall_trace_enter(struct pt_regs *regs)
+{
+ u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+ unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
+
+ if (phase1_result == 0)
+ return regs->orig_ax;
+ else
+ return syscall_trace_enter_phase2(regs, arch, phase1_result);
+}
+
void syscall_trace_leave(struct pt_regs *regs)
{
bool step;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 2851d63..ed37a76 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -675,6 +675,11 @@
* handler too.
*/
regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
+ /*
+ * Ensure the signal handler starts with the new fpu state.
+ */
+ if (used_math())
+ drop_init_fpu(current);
}
signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index e1e1e80..957779f 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -216,7 +216,7 @@
*/
regs->orig_ax = syscall_nr;
regs->ax = -ENOSYS;
- tmp = secure_computing(syscall_nr);
+ tmp = secure_computing();
if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
warn_bad_vsyscall(KERN_DEBUG, regs,
"seccomp tried to change syscall nr or ip");
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 940b142..4c540c4 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -271,8 +271,6 @@
if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
return -1;
- drop_init_fpu(tsk); /* trigger finit */
-
return 0;
}
@@ -402,8 +400,11 @@
set_used_math();
}
- if (use_eager_fpu())
+ if (use_eager_fpu()) {
+ preempt_disable();
math_state_restore();
+ preempt_enable();
+ }
return err;
} else {
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 4dd8cf6..75cc097 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -59,41 +59,6 @@
__flush_tlb_one(vaddr);
}
-/*
- * Associate a large virtual page frame with a given physical page frame
- * and protection flags for that frame. pfn is for the base of the page,
- * vaddr is what the page gets mapped to - both must be properly aligned.
- * The pmd must already be instantiated. Assumes PAE mode.
- */
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
-
- if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
- printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
- return; /* BUG(); */
- }
- if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
- printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
- return; /* BUG(); */
- }
- pgd = swapper_pg_dir + pgd_index(vaddr);
- if (pgd_none(*pgd)) {
- printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
- return; /* BUG(); */
- }
- pud = pud_offset(pgd, vaddr);
- pmd = pmd_offset(pud, vaddr);
- set_pmd(pmd, pfn_pmd(pfn, flags));
- /*
- * It's enough to flush this one mapping.
- * (PGE mappings get flushed as well)
- */
- __flush_tlb_one(vaddr);
-}
-
unsigned long __FIXADDR_TOP = 0xfffff000;
EXPORT_SYMBOL(__FIXADDR_TOP);
diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h
index e5103b4..6266766 100644
--- a/arch/xtensa/include/asm/atomic.h
+++ b/arch/xtensa/include/asm/atomic.h
@@ -58,165 +58,96 @@
*/
#define atomic_set(v,i) ((v)->counter = (i))
-/**
- * atomic_add - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v.
- */
-static inline void atomic_add(int i, atomic_t * v)
-{
#if XCHAL_HAVE_S32C1I
- unsigned long tmp;
- int result;
+#define ATOMIC_OP(op) \
+static inline void atomic_##op(int i, atomic_t * v) \
+{ \
+ unsigned long tmp; \
+ int result; \
+ \
+ __asm__ __volatile__( \
+ "1: l32i %1, %3, 0\n" \
+ " wsr %1, scompare1\n" \
+ " " #op " %0, %1, %2\n" \
+ " s32c1i %0, %3, 0\n" \
+ " bne %0, %1, 1b\n" \
+ : "=&a" (result), "=&a" (tmp) \
+ : "a" (i), "a" (v) \
+ : "memory" \
+ ); \
+} \
- __asm__ __volatile__(
- "1: l32i %1, %3, 0\n"
- " wsr %1, scompare1\n"
- " add %0, %1, %2\n"
- " s32c1i %0, %3, 0\n"
- " bne %0, %1, 1b\n"
- : "=&a" (result), "=&a" (tmp)
- : "a" (i), "a" (v)
- : "memory"
- );
-#else
- unsigned int vval;
-
- __asm__ __volatile__(
- " rsil a15, "__stringify(LOCKLEVEL)"\n"
- " l32i %0, %2, 0\n"
- " add %0, %0, %1\n"
- " s32i %0, %2, 0\n"
- " wsr a15, ps\n"
- " rsync\n"
- : "=&a" (vval)
- : "a" (i), "a" (v)
- : "a15", "memory"
- );
-#endif
+#define ATOMIC_OP_RETURN(op) \
+static inline int atomic_##op##_return(int i, atomic_t * v) \
+{ \
+ unsigned long tmp; \
+ int result; \
+ \
+ __asm__ __volatile__( \
+ "1: l32i %1, %3, 0\n" \
+ " wsr %1, scompare1\n" \
+ " " #op " %0, %1, %2\n" \
+ " s32c1i %0, %3, 0\n" \
+ " bne %0, %1, 1b\n" \
+ " " #op " %0, %0, %2\n" \
+ : "=&a" (result), "=&a" (tmp) \
+ : "a" (i), "a" (v) \
+ : "memory" \
+ ); \
+ \
+ return result; \
}
-/**
- * atomic_sub - subtract the atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v.
- */
-static inline void atomic_sub(int i, atomic_t *v)
-{
-#if XCHAL_HAVE_S32C1I
- unsigned long tmp;
- int result;
+#else /* XCHAL_HAVE_S32C1I */
- __asm__ __volatile__(
- "1: l32i %1, %3, 0\n"
- " wsr %1, scompare1\n"
- " sub %0, %1, %2\n"
- " s32c1i %0, %3, 0\n"
- " bne %0, %1, 1b\n"
- : "=&a" (result), "=&a" (tmp)
- : "a" (i), "a" (v)
- : "memory"
- );
-#else
- unsigned int vval;
+#define ATOMIC_OP(op) \
+static inline void atomic_##op(int i, atomic_t * v) \
+{ \
+ unsigned int vval; \
+ \
+ __asm__ __volatile__( \
+ " rsil a15, "__stringify(LOCKLEVEL)"\n"\
+ " l32i %0, %2, 0\n" \
+ " " #op " %0, %0, %1\n" \
+ " s32i %0, %2, 0\n" \
+ " wsr a15, ps\n" \
+ " rsync\n" \
+ : "=&a" (vval) \
+ : "a" (i), "a" (v) \
+ : "a15", "memory" \
+ ); \
+} \
- __asm__ __volatile__(
- " rsil a15, "__stringify(LOCKLEVEL)"\n"
- " l32i %0, %2, 0\n"
- " sub %0, %0, %1\n"
- " s32i %0, %2, 0\n"
- " wsr a15, ps\n"
- " rsync\n"
- : "=&a" (vval)
- : "a" (i), "a" (v)
- : "a15", "memory"
- );
-#endif
+#define ATOMIC_OP_RETURN(op) \
+static inline int atomic_##op##_return(int i, atomic_t * v) \
+{ \
+ unsigned int vval; \
+ \
+ __asm__ __volatile__( \
+ " rsil a15,"__stringify(LOCKLEVEL)"\n" \
+ " l32i %0, %2, 0\n" \
+ " " #op " %0, %0, %1\n" \
+ " s32i %0, %2, 0\n" \
+ " wsr a15, ps\n" \
+ " rsync\n" \
+ : "=&a" (vval) \
+ : "a" (i), "a" (v) \
+ : "a15", "memory" \
+ ); \
+ \
+ return vval; \
}
-/*
- * We use atomic_{add|sub}_return to define other functions.
- */
+#endif /* XCHAL_HAVE_S32C1I */
-static inline int atomic_add_return(int i, atomic_t * v)
-{
-#if XCHAL_HAVE_S32C1I
- unsigned long tmp;
- int result;
+#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op)
- __asm__ __volatile__(
- "1: l32i %1, %3, 0\n"
- " wsr %1, scompare1\n"
- " add %0, %1, %2\n"
- " s32c1i %0, %3, 0\n"
- " bne %0, %1, 1b\n"
- " add %0, %0, %2\n"
- : "=&a" (result), "=&a" (tmp)
- : "a" (i), "a" (v)
- : "memory"
- );
+ATOMIC_OPS(add)
+ATOMIC_OPS(sub)
- return result;
-#else
- unsigned int vval;
-
- __asm__ __volatile__(
- " rsil a15,"__stringify(LOCKLEVEL)"\n"
- " l32i %0, %2, 0\n"
- " add %0, %0, %1\n"
- " s32i %0, %2, 0\n"
- " wsr a15, ps\n"
- " rsync\n"
- : "=&a" (vval)
- : "a" (i), "a" (v)
- : "a15", "memory"
- );
-
- return vval;
-#endif
-}
-
-static inline int atomic_sub_return(int i, atomic_t * v)
-{
-#if XCHAL_HAVE_S32C1I
- unsigned long tmp;
- int result;
-
- __asm__ __volatile__(
- "1: l32i %1, %3, 0\n"
- " wsr %1, scompare1\n"
- " sub %0, %1, %2\n"
- " s32c1i %0, %3, 0\n"
- " bne %0, %1, 1b\n"
- " sub %0, %0, %2\n"
- : "=&a" (result), "=&a" (tmp)
- : "a" (i), "a" (v)
- : "memory"
- );
-
- return result;
-#else
- unsigned int vval;
-
- __asm__ __volatile__(
- " rsil a15,"__stringify(LOCKLEVEL)"\n"
- " l32i %0, %2, 0\n"
- " sub %0, %0, %1\n"
- " s32i %0, %2, 0\n"
- " wsr a15, ps\n"
- " rsync\n"
- : "=&a" (vval)
- : "a" (i), "a" (v)
- : "a15", "memory"
- );
-
- return vval;
-#endif
-}
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
/**
* atomic_sub_and_test - subtract value from variable and test result
diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c
index 97eb0019..2f6e4fb 100644
--- a/crypto/asymmetric_keys/public_key.c
+++ b/crypto/asymmetric_keys/public_key.c
@@ -121,6 +121,7 @@
struct asymmetric_key_subtype public_key_subtype = {
.owner = THIS_MODULE,
.name = "public_key",
+ .name_len = sizeof("public_key") - 1,
.describe = public_key_describe,
.destroy = public_key_destroy,
.verify_signature = public_key_verify_signature_2,
diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c
index 79175e6..2421f46 100644
--- a/crypto/asymmetric_keys/verify_pefile.c
+++ b/crypto/asymmetric_keys/verify_pefile.c
@@ -128,6 +128,7 @@
{
struct win_certificate wrapper;
const u8 *pkcs7;
+ unsigned len;
if (ctx->sig_len < sizeof(wrapper)) {
pr_debug("Signature wrapper too short\n");
@@ -154,33 +155,49 @@
return -ENOTSUPP;
}
- /* Looks like actual pkcs signature length is in wrapper->length.
- * size obtained from data dir entries lists the total size of
- * certificate table which is also aligned to octawrod boundary.
- *
- * So set signature length field appropriately.
+ /* It looks like the pkcs signature length in wrapper->length and the
+ * size obtained from the data dir entries, which lists the total size
+ * of certificate table, are both aligned to an octaword boundary, so
+ * we may have to deal with some padding.
*/
ctx->sig_len = wrapper.length;
ctx->sig_offset += sizeof(wrapper);
ctx->sig_len -= sizeof(wrapper);
- if (ctx->sig_len == 0) {
+ if (ctx->sig_len < 4) {
pr_debug("Signature data missing\n");
return -EKEYREJECTED;
}
- /* What's left should a PKCS#7 cert */
+ /* What's left should be a PKCS#7 cert */
pkcs7 = pebuf + ctx->sig_offset;
- if (pkcs7[0] == (ASN1_CONS_BIT | ASN1_SEQ)) {
- if (pkcs7[1] == 0x82 &&
- pkcs7[2] == (((ctx->sig_len - 4) >> 8) & 0xff) &&
- pkcs7[3] == ((ctx->sig_len - 4) & 0xff))
- return 0;
- if (pkcs7[1] == 0x80)
- return 0;
- if (pkcs7[1] > 0x82)
- return -EMSGSIZE;
+ if (pkcs7[0] != (ASN1_CONS_BIT | ASN1_SEQ))
+ goto not_pkcs7;
+
+ switch (pkcs7[1]) {
+ case 0 ... 0x7f:
+ len = pkcs7[1] + 2;
+ goto check_len;
+ case ASN1_INDEFINITE_LENGTH:
+ return 0;
+ case 0x81:
+ len = pkcs7[2] + 3;
+ goto check_len;
+ case 0x82:
+ len = ((pkcs7[2] << 8) | pkcs7[3]) + 4;
+ goto check_len;
+ case 0x83 ... 0xff:
+ return -EMSGSIZE;
+ default:
+ goto not_pkcs7;
}
+check_len:
+ if (len <= ctx->sig_len) {
+ /* There may be padding */
+ ctx->sig_len = len;
+ return 0;
+ }
+not_pkcs7:
pr_debug("Signature data not PKCS#7\n");
return -ELIBBAD;
}
diff --git a/drivers/acpi/acpica/nsprepkg.c b/drivers/acpi/acpica/nsprepkg.c
index 68f7258..1b13b92 100644
--- a/drivers/acpi/acpica/nsprepkg.c
+++ b/drivers/acpi/acpica/nsprepkg.c
@@ -316,6 +316,45 @@
acpi_ns_check_package_list(info, package, elements, count);
break;
+ case ACPI_PTYPE2_UUID_PAIR:
+
+ /* The package must contain pairs of (UUID + type) */
+
+ if (count & 1) {
+ expected_count = count + 1;
+ goto package_too_small;
+ }
+
+ while (count > 0) {
+ status = acpi_ns_check_object_type(info, elements,
+ package->ret_info.
+ object_type1, 0);
+ if (ACPI_FAILURE(status)) {
+ return (status);
+ }
+
+ /* Validate length of the UUID buffer */
+
+ if ((*elements)->buffer.length != 16) {
+ ACPI_WARN_PREDEFINED((AE_INFO,
+ info->full_pathname,
+ info->node_flags,
+ "Invalid length for UUID Buffer"));
+ return (AE_AML_OPERAND_VALUE);
+ }
+
+ status = acpi_ns_check_object_type(info, elements + 1,
+ package->ret_info.
+ object_type2, 0);
+ if (ACPI_FAILURE(status)) {
+ return (status);
+ }
+
+ elements += 2;
+ count -= 2;
+ }
+ break;
+
default:
/* Should not get here if predefined info table is correct */
diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 9922cc4..cb6066c 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -1030,6 +1030,10 @@
DMI_MATCH(DMI_SYS_VENDOR, "Quanta"),
DMI_MATCH(DMI_PRODUCT_NAME, "TW9/SW9"),}, NULL},
{
+ ec_flag_msi, "Clevo W350etq", {
+ DMI_MATCH(DMI_SYS_VENDOR, "CLEVO CO."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "W35_37ET"),}, NULL},
+ {
ec_validate_ecdt, "ASUS hardware", {
DMI_MATCH(DMI_BIOS_VENDOR, "ASUS") }, NULL},
{
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 3dca36d..17f9ec5 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -1071,9 +1071,9 @@
if (pr->id == 0 && cpuidle_get_driver() == &acpi_idle_driver) {
- cpuidle_pause_and_lock();
/* Protect against cpu-hotplug */
get_online_cpus();
+ cpuidle_pause_and_lock();
/* Disable all cpuidle devices */
for_each_online_cpu(cpu) {
@@ -1100,8 +1100,8 @@
cpuidle_enable_device(dev);
}
}
- put_online_cpus();
cpuidle_resume_and_unlock();
+ put_online_cpus();
}
return 0;
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 9a92989..3bf7764 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -667,8 +667,14 @@
acpi_device_sun_show(struct device *dev, struct device_attribute *attr,
char *buf) {
struct acpi_device *acpi_dev = to_acpi_device(dev);
+ acpi_status status;
+ unsigned long long sun;
- return sprintf(buf, "%lu\n", acpi_dev->pnp.sun);
+ status = acpi_evaluate_integer(acpi_dev->handle, "_SUN", NULL, &sun);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ return sprintf(buf, "%llu\n", sun);
}
static DEVICE_ATTR(sun, 0444, acpi_device_sun_show, NULL);
@@ -690,7 +696,6 @@
{
struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
acpi_status status;
- unsigned long long sun;
int result = 0;
/*
@@ -731,14 +736,10 @@
if (dev->pnp.unique_id)
result = device_create_file(&dev->dev, &dev_attr_uid);
- status = acpi_evaluate_integer(dev->handle, "_SUN", NULL, &sun);
- if (ACPI_SUCCESS(status)) {
- dev->pnp.sun = (unsigned long)sun;
+ if (acpi_has_method(dev->handle, "_SUN")) {
result = device_create_file(&dev->dev, &dev_attr_sun);
if (result)
goto end;
- } else {
- dev->pnp.sun = (unsigned long)-1;
}
if (acpi_has_method(dev->handle, "_STA")) {
diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
index 8268843..fcbda10 100644
--- a/drivers/acpi/video.c
+++ b/drivers/acpi/video.c
@@ -82,9 +82,9 @@
* For Windows 8 systems: used to decide if video module
* should skip registering backlight interface of its own.
*/
-static int use_native_backlight_param = 1;
+static int use_native_backlight_param = -1;
module_param_named(use_native_backlight, use_native_backlight_param, int, 0444);
-static bool use_native_backlight_dmi = false;
+static bool use_native_backlight_dmi = true;
static int register_count;
static struct mutex video_list_lock;
@@ -417,6 +417,12 @@
return 0;
}
+static int __init video_disable_native_backlight(const struct dmi_system_id *d)
+{
+ use_native_backlight_dmi = false;
+ return 0;
+}
+
static struct dmi_system_id video_dmi_table[] __initdata = {
/*
* Broken _BQC workaround http://bugzilla.kernel.org/show_bug.cgi?id=13121
@@ -720,6 +726,41 @@
DMI_MATCH(DMI_PRODUCT_NAME, "HP EliteBook 8780w"),
},
},
+
+ /*
+ * These models have a working acpi_video backlight control, and using
+ * native backlight causes a regression where backlight does not work
+ * when userspace is not handling brightness key events. Disable
+ * native_backlight on these to fix this:
+ * https://bugzilla.kernel.org/show_bug.cgi?id=81691
+ */
+ {
+ .callback = video_disable_native_backlight,
+ .ident = "ThinkPad T420",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T420"),
+ },
+ },
+ {
+ .callback = video_disable_native_backlight,
+ .ident = "ThinkPad T520",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T520"),
+ },
+ },
+
+ /* The native backlight controls do not work on some older machines */
+ {
+ /* https://bugs.freedesktop.org/show_bug.cgi?id=81515 */
+ .callback = video_disable_native_backlight,
+ .ident = "HP ENVY 15 Notebook",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP ENVY 15 Notebook PC"),
+ },
+ },
{}
};
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index a29f801..a0cc0ed 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -305,6 +305,14 @@
{ PCI_VDEVICE(INTEL, 0x9c85), board_ahci }, /* Wildcat Point-LP RAID */
{ PCI_VDEVICE(INTEL, 0x9c87), board_ahci }, /* Wildcat Point-LP RAID */
{ PCI_VDEVICE(INTEL, 0x9c8f), board_ahci }, /* Wildcat Point-LP RAID */
+ { PCI_VDEVICE(INTEL, 0x8c82), board_ahci }, /* 9 Series AHCI */
+ { PCI_VDEVICE(INTEL, 0x8c83), board_ahci }, /* 9 Series AHCI */
+ { PCI_VDEVICE(INTEL, 0x8c84), board_ahci }, /* 9 Series RAID */
+ { PCI_VDEVICE(INTEL, 0x8c85), board_ahci }, /* 9 Series RAID */
+ { PCI_VDEVICE(INTEL, 0x8c86), board_ahci }, /* 9 Series RAID */
+ { PCI_VDEVICE(INTEL, 0x8c87), board_ahci }, /* 9 Series RAID */
+ { PCI_VDEVICE(INTEL, 0x8c8e), board_ahci }, /* 9 Series RAID */
+ { PCI_VDEVICE(INTEL, 0x8c8f), board_ahci }, /* 9 Series RAID */
/* JMicron 360/1/3/5/6, match class to avoid IDE function */
{ PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
@@ -442,6 +450,8 @@
{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL_EXT, 0x917a),
.driver_data = board_ahci_yes_fbs }, /* 88se9172 */
{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL_EXT, 0x9172),
+ .driver_data = board_ahci_yes_fbs }, /* 88se9182 */
+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL_EXT, 0x9182),
.driver_data = board_ahci_yes_fbs }, /* 88se9172 */
{ PCI_DEVICE(PCI_VENDOR_ID_MARVELL_EXT, 0x9192),
.driver_data = board_ahci_yes_fbs }, /* 88se9172 on some Gigabyte */
@@ -1329,6 +1339,18 @@
else if (pdev->vendor == 0x1c44 && pdev->device == 0x8000)
ahci_pci_bar = AHCI_PCI_BAR_ENMOTUS;
+ /*
+ * The JMicron chip 361/363 contains one SATA controller and one
+ * PATA controller,for powering on these both controllers, we must
+ * follow the sequence one by one, otherwise one of them can not be
+ * powered on successfully, so here we disable the async suspend
+ * method for these chips.
+ */
+ if (pdev->vendor == PCI_VENDOR_ID_JMICRON &&
+ (pdev->device == PCI_DEVICE_ID_JMICRON_JMB363 ||
+ pdev->device == PCI_DEVICE_ID_JMICRON_JMB361))
+ device_disable_async_suspend(&pdev->dev);
+
/* acquire resources */
rc = pcim_enable_device(pdev);
if (rc)
diff --git a/drivers/ata/ahci_tegra.c b/drivers/ata/ahci_tegra.c
index f1fef74..0329044 100644
--- a/drivers/ata/ahci_tegra.c
+++ b/drivers/ata/ahci_tegra.c
@@ -18,14 +18,17 @@
*/
#include <linux/ahci_platform.h>
-#include <linux/reset.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/of_device.h>
#include <linux/platform_device.h>
#include <linux/regulator/consumer.h>
+#include <linux/reset.h>
+
+#include <soc/tegra/fuse.h>
#include <soc/tegra/pmc.h>
+
#include "ahci.h"
#define SATA_CONFIGURATION_0 0x180
@@ -180,9 +183,12 @@
/* Pad calibration */
- /* FIXME Always use calibration 0. Change this to read the calibration
- * fuse once the fuse driver has landed. */
- val = 0;
+ ret = tegra_fuse_readl(FUSE_SATA_CALIB, &val);
+ if (ret) {
+ dev_err(&tegra->pdev->dev,
+ "failed to read calibration fuse: %d\n", ret);
+ return ret;
+ }
calib = tegra124_pad_calibration[val & FUSE_SATA_CALIB_MASK];
diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c
index c696230..f03aab1 100644
--- a/drivers/ata/ahci_xgene.c
+++ b/drivers/ata/ahci_xgene.c
@@ -78,6 +78,9 @@
#define CFG_MEM_RAM_SHUTDOWN 0x00000070
#define BLOCK_MEM_RDY 0x00000074
+/* Max retry for link down */
+#define MAX_LINK_DOWN_RETRY 3
+
struct xgene_ahci_context {
struct ahci_host_priv *hpriv;
struct device *dev;
@@ -145,6 +148,14 @@
return rc;
}
+static bool xgene_ahci_is_memram_inited(struct xgene_ahci_context *ctx)
+{
+ void __iomem *diagcsr = ctx->csr_diag;
+
+ return (readl(diagcsr + CFG_MEM_RAM_SHUTDOWN) == 0 &&
+ readl(diagcsr + BLOCK_MEM_RDY) == 0xFFFFFFFF);
+}
+
/**
* xgene_ahci_read_id - Read ID data from the specified device
* @dev: device
@@ -229,8 +240,11 @@
* and Gen1 (1.5Gbps). Otherwise during long IO stress test, the PHY will
* report disparity error and etc. In addition, during COMRESET, there can
* be error reported in the register PORT_SCR_ERR. For SERR_DISPARITY and
- * SERR_10B_8B_ERR, the PHY receiver line must be reseted. The following
- * algorithm is followed to proper configure the hardware PHY during COMRESET:
+ * SERR_10B_8B_ERR, the PHY receiver line must be reset. Also, during long
+ * reboot cycle regression, sometimes the PHY reports link down even if the
+ * device is present because of speed negotiation failure, so we need to retry
+ * the COMRESET to get the link up. The following algorithm is followed to
+ * properly configure the hardware PHY during COMRESET:
*
* Alg Part 1:
* 1. Start the PHY at Gen3 speed (default setting)
@@ -246,9 +260,15 @@
* Alg Part 2:
* 1. On link up, if there are any SERR_DISPARITY and SERR_10B_8B_ERR error
* reported in the register PORT_SCR_ERR, then reset the PHY receiver line
- * 2. Go to Alg Part 3
+ * 2. Go to Alg Part 4
*
* Alg Part 3:
+ * 1. Check the PORT_SCR_STAT to see whether a device was detected but PHY
+ * communication establishment failed; if so, and the number of link-down
+ * attempts is less than the maximum (3), then go to Alg Part 1.
+ * 2. Go to Alg Part 4.
+ *
+ * Alg Part 4:
* 1. Clear any pending from register PORT_SCR_ERR.
*
* NOTE: For the initial version, we will NOT support Gen1/Gen2. In addition
@@ -267,19 +287,27 @@
u8 *d2h_fis = pp->rx_fis + RX_FIS_D2H_REG;
void __iomem *port_mmio = ahci_port_base(ap);
struct ata_taskfile tf;
+ int link_down_retry = 0;
int rc;
- u32 val;
+ u32 val, sstatus;
- /* clear D2H reception area to properly wait for D2H FIS */
- ata_tf_init(link->device, &tf);
- tf.command = ATA_BUSY;
- ata_tf_to_fis(&tf, 0, 0, d2h_fis);
- rc = sata_link_hardreset(link, timing, deadline, online,
+ do {
+ /* clear D2H reception area to properly wait for D2H FIS */
+ ata_tf_init(link->device, &tf);
+ tf.command = ATA_BUSY;
+ ata_tf_to_fis(&tf, 0, 0, d2h_fis);
+ rc = sata_link_hardreset(link, timing, deadline, online,
ahci_check_ready);
+ if (*online) {
+ val = readl(port_mmio + PORT_SCR_ERR);
+ if (val & (SERR_DISPARITY | SERR_10B_8B_ERR))
+ dev_warn(ctx->dev, "link has error\n");
+ break;
+ }
- val = readl(port_mmio + PORT_SCR_ERR);
- if (val & (SERR_DISPARITY | SERR_10B_8B_ERR))
- dev_warn(ctx->dev, "link has error\n");
+ sata_scr_read(link, SCR_STATUS, &sstatus);
+ } while (link_down_retry++ < MAX_LINK_DOWN_RETRY &&
+ (sstatus & 0xff) == 0x1);
/* clear all errors if any pending */
val = readl(port_mmio + PORT_SCR_ERR);
@@ -467,6 +495,11 @@
return -ENODEV;
}
+ if (xgene_ahci_is_memram_inited(ctx)) {
+ dev_info(dev, "skip clock and PHY initialization\n");
+ goto skip_clk_phy;
+ }
+
/* Due to errata, HW requires full toggle transition */
rc = ahci_platform_enable_clks(hpriv);
if (rc)
@@ -479,7 +512,7 @@
/* Configure the host controller */
xgene_ahci_hw_init(hpriv);
-
+skip_clk_phy:
hpriv->flags = AHCI_HFLAG_NO_PMP | AHCI_HFLAG_NO_NCQ;
rc = ahci_platform_init_host(pdev, hpriv, &xgene_ahci_port_info);
diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 893e30e..ffbe625 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -340,6 +340,14 @@
{ 0x8086, 0x0F21, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_2port_sata_byt },
/* SATA Controller IDE (Coleto Creek) */
{ 0x8086, 0x23a6, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_2port_sata },
+ /* SATA Controller IDE (9 Series) */
+ { 0x8086, 0x8c88, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_2port_sata_snb },
+ /* SATA Controller IDE (9 Series) */
+ { 0x8086, 0x8c89, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_2port_sata_snb },
+ /* SATA Controller IDE (9 Series) */
+ { 0x8086, 0x8c80, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_sata_snb },
+ /* SATA Controller IDE (9 Series) */
+ { 0x8086, 0x8c81, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich8_sata_snb },
{ } /* terminate list */
};
diff --git a/drivers/ata/pata_jmicron.c b/drivers/ata/pata_jmicron.c
index 4d1a5d2..47e418b 100644
--- a/drivers/ata/pata_jmicron.c
+++ b/drivers/ata/pata_jmicron.c
@@ -143,6 +143,18 @@
};
const struct ata_port_info *ppi[] = { &info, NULL };
+ /*
+ * The JMicron chip 361/363 contains one SATA controller and one
+ * PATA controller; for powering on both of these controllers, we must
+ * follow the sequence one by one, otherwise one of them cannot be
+ * powered on successfully, so here we disable the async suspend
+ * method for these chips.
+ */
+ if (pdev->vendor == PCI_VENDOR_ID_JMICRON &&
+ (pdev->device == PCI_DEVICE_ID_JMICRON_JMB363 ||
+ pdev->device == PCI_DEVICE_ID_JMICRON_JMB361))
+ device_disable_async_suspend(&pdev->dev);
+
return ata_pci_bmdma_init_one(pdev, ppi, &jmicron_sht, NULL, 0);
}
diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index 7d13269..bfc90b8 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -146,6 +146,9 @@
enum regcache_type type;
int (*init)(struct regmap *map);
int (*exit)(struct regmap *map);
+#ifdef CONFIG_DEBUG_FS
+ void (*debugfs_init)(struct regmap *map);
+#endif
int (*read)(struct regmap *map, unsigned int reg, unsigned int *value);
int (*write)(struct regmap *map, unsigned int reg, unsigned int value);
int (*sync)(struct regmap *map, unsigned int min, unsigned int max);
diff --git a/drivers/base/regmap/regcache-rbtree.c b/drivers/base/regmap/regcache-rbtree.c
index 6a7e4fa..f3e8fe0 100644
--- a/drivers/base/regmap/regcache-rbtree.c
+++ b/drivers/base/regmap/regcache-rbtree.c
@@ -194,10 +194,6 @@
{
debugfs_create_file("rbtree", 0400, map->debugfs, map, &rbtree_fops);
}
-#else
-static void rbtree_debugfs_init(struct regmap *map)
-{
-}
#endif
static int regcache_rbtree_init(struct regmap *map)
@@ -222,8 +218,6 @@
goto err;
}
- rbtree_debugfs_init(map);
-
return 0;
err:
@@ -532,6 +526,9 @@
.name = "rbtree",
.init = regcache_rbtree_init,
.exit = regcache_rbtree_exit,
+#ifdef CONFIG_DEBUG_FS
+ .debugfs_init = rbtree_debugfs_init,
+#endif
.read = regcache_rbtree_read,
.write = regcache_rbtree_write,
.sync = regcache_rbtree_sync,
diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c
index 29b4128..5617da6 100644
--- a/drivers/base/regmap/regcache.c
+++ b/drivers/base/regmap/regcache.c
@@ -698,7 +698,7 @@
unsigned int block_base, unsigned int start,
unsigned int end)
{
- if (regmap_can_raw_write(map))
+ if (regmap_can_raw_write(map) && !map->use_single_rw)
return regcache_sync_block_raw(map, block, cache_present,
block_base, start, end);
else
diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c
index 45d812c..65ea7b2 100644
--- a/drivers/base/regmap/regmap-debugfs.c
+++ b/drivers/base/regmap/regmap-debugfs.c
@@ -538,6 +538,9 @@
next = rb_next(&range_node->node);
}
+
+ if (map->cache_ops && map->cache_ops->debugfs_init)
+ map->cache_ops->debugfs_init(map);
}
void regmap_debugfs_exit(struct regmap *map)
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 78f43fb..1cf427b 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -109,7 +109,7 @@
bool regmap_volatile(struct regmap *map, unsigned int reg)
{
- if (!regmap_readable(map, reg))
+ if (!map->format.format_write && !regmap_readable(map, reg))
return false;
if (map->volatile_reg)
diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c
index 294a7dd..f032ed6 100644
--- a/drivers/bcma/host_pci.c
+++ b/drivers/bcma/host_pci.c
@@ -282,6 +282,7 @@
{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x43a9) },
{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x43aa) },
{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4727) },
+ { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 43227) }, /* 0xA8DB */
{ 0, },
};
MODULE_DEVICE_TABLE(pci, bcma_pci_bridge_tbl);
diff --git a/drivers/bus/arm-ccn.c b/drivers/bus/arm-ccn.c
index 6f550d9..a60f264 100644
--- a/drivers/bus/arm-ccn.c
+++ b/drivers/bus/arm-ccn.c
@@ -586,6 +586,30 @@
return 0;
}
+static void arm_ccn_pmu_event_destroy(struct perf_event *event)
+{
+ struct arm_ccn *ccn = pmu_to_arm_ccn(event->pmu);
+ struct hw_perf_event *hw = &event->hw;
+
+ if (hw->idx == CCN_IDX_PMU_CYCLE_COUNTER) {
+ clear_bit(CCN_IDX_PMU_CYCLE_COUNTER, ccn->dt.pmu_counters_mask);
+ } else {
+ struct arm_ccn_component *source =
+ ccn->dt.pmu_counters[hw->idx].source;
+
+ if (CCN_CONFIG_TYPE(event->attr.config) == CCN_TYPE_XP &&
+ CCN_CONFIG_EVENT(event->attr.config) ==
+ CCN_EVENT_WATCHPOINT)
+ clear_bit(hw->config_base, source->xp.dt_cmp_mask);
+ else
+ clear_bit(hw->config_base, source->pmu_events_mask);
+ clear_bit(hw->idx, ccn->dt.pmu_counters_mask);
+ }
+
+ ccn->dt.pmu_counters[hw->idx].source = NULL;
+ ccn->dt.pmu_counters[hw->idx].event = NULL;
+}
+
static int arm_ccn_pmu_event_init(struct perf_event *event)
{
struct arm_ccn *ccn;
@@ -599,6 +623,7 @@
return -ENOENT;
ccn = pmu_to_arm_ccn(event->pmu);
+ event->destroy = arm_ccn_pmu_event_destroy;
if (hw->sample_period) {
dev_warn(ccn->dev, "Sampling not supported!\n");
@@ -731,30 +756,6 @@
return 0;
}
-static void arm_ccn_pmu_event_free(struct perf_event *event)
-{
- struct arm_ccn *ccn = pmu_to_arm_ccn(event->pmu);
- struct hw_perf_event *hw = &event->hw;
-
- if (hw->idx == CCN_IDX_PMU_CYCLE_COUNTER) {
- clear_bit(CCN_IDX_PMU_CYCLE_COUNTER, ccn->dt.pmu_counters_mask);
- } else {
- struct arm_ccn_component *source =
- ccn->dt.pmu_counters[hw->idx].source;
-
- if (CCN_CONFIG_TYPE(event->attr.config) == CCN_TYPE_XP &&
- CCN_CONFIG_EVENT(event->attr.config) ==
- CCN_EVENT_WATCHPOINT)
- clear_bit(hw->config_base, source->xp.dt_cmp_mask);
- else
- clear_bit(hw->config_base, source->pmu_events_mask);
- clear_bit(hw->idx, ccn->dt.pmu_counters_mask);
- }
-
- ccn->dt.pmu_counters[hw->idx].source = NULL;
- ccn->dt.pmu_counters[hw->idx].event = NULL;
-}
-
static u64 arm_ccn_pmu_read_counter(struct arm_ccn *ccn, int idx)
{
u64 res;
@@ -1027,8 +1028,6 @@
static void arm_ccn_pmu_event_del(struct perf_event *event, int flags)
{
arm_ccn_pmu_event_stop(event, PERF_EF_UPDATE);
-
- arm_ccn_pmu_event_free(event);
}
static void arm_ccn_pmu_event_read(struct perf_event *event)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index e396ad3..0668b38 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -708,10 +708,6 @@
static int intel_pstate_set_policy(struct cpufreq_policy *policy)
{
- struct cpudata *cpu;
-
- cpu = all_cpu_data[policy->cpu];
-
if (!policy->cpuinfo.max_freq)
return -ENODEV;
diff --git a/drivers/gpio/gpio-bt8xx.c b/drivers/gpio/gpio-bt8xx.c
index 6557147..7e4c43c 100644
--- a/drivers/gpio/gpio-bt8xx.c
+++ b/drivers/gpio/gpio-bt8xx.c
@@ -241,9 +241,6 @@
bgwrite(~0x0, BT848_INT_STAT);
bgwrite(0x0, BT848_GPIO_OUT_EN);
- iounmap(bg->mmio);
- release_mem_region(pci_resource_start(pdev, 0),
- pci_resource_len(pdev, 0));
pci_disable_device(pdev);
}
diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c
index 0dc57d5..3a02e5e 100644
--- a/drivers/gpu/drm/drm_modeset_lock.c
+++ b/drivers/gpu/drm/drm_modeset_lock.c
@@ -35,7 +35,7 @@
* of extra utility/tracking out of our acquire-ctx. This is provided
* by drm_modeset_lock / drm_modeset_acquire_ctx.
*
- * For basic principles of ww_mutex, see: Documentation/ww-mutex-design.txt
+ * For basic principles of ww_mutex, see: Documentation/locking/ww-mutex-design.txt
*
* The basic usage pattern is to:
*
diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c
index a669550..eee79e1 100644
--- a/drivers/gpu/drm/i915/intel_bios.c
+++ b/drivers/gpu/drm/i915/intel_bios.c
@@ -1123,7 +1123,7 @@
}
}
-static int __init intel_no_opregion_vbt_callback(const struct dmi_system_id *id)
+static int intel_no_opregion_vbt_callback(const struct dmi_system_id *id)
{
DRM_DEBUG_KMS("Falling back to manually reading VBT from "
"VBIOS ROM for %s\n",
diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c
index e8abfce..9212e65 100644
--- a/drivers/gpu/drm/i915/intel_crt.c
+++ b/drivers/gpu/drm/i915/intel_crt.c
@@ -804,7 +804,7 @@
.destroy = intel_encoder_destroy,
};
-static int __init intel_no_crt_dmi_callback(const struct dmi_system_id *id)
+static int intel_no_crt_dmi_callback(const struct dmi_system_id *id)
{
DRM_INFO("Skipping CRT initialization for %s\n", id->ident);
return 1;
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index d074d70..d8324c6 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -2233,6 +2233,15 @@
if (need_vtd_wa(dev) && alignment < 256 * 1024)
alignment = 256 * 1024;
+ /*
+ * Global gtt pte registers are special registers which actually forward
+ * writes to a chunk of system memory. Which means that there is no risk
+ * that the register values disappear as soon as we call
+ * intel_runtime_pm_put(), so it is correct to wrap only the
+ * pin/unpin/fence and not more.
+ */
+ intel_runtime_pm_get(dev_priv);
+
dev_priv->mm.interruptible = false;
ret = i915_gem_object_pin_to_display_plane(obj, alignment, pipelined);
if (ret)
@@ -2250,12 +2259,14 @@
i915_gem_object_pin_fence(obj);
dev_priv->mm.interruptible = true;
+ intel_runtime_pm_put(dev_priv);
return 0;
err_unpin:
i915_gem_object_unpin_from_display_plane(obj);
err_interruptible:
dev_priv->mm.interruptible = true;
+ intel_runtime_pm_put(dev_priv);
return ret;
}
@@ -4188,10 +4199,6 @@
intel_set_pch_fifo_underrun_reporting(dev, pipe, false);
intel_disable_pipe(dev_priv, pipe);
-
- if (intel_crtc->config.dp_encoder_is_mst)
- intel_ddi_set_vc_payload_alloc(crtc, false);
-
ironlake_pfit_disable(intel_crtc);
for_each_encoder_on_crtc(dev, crtc, encoder)
@@ -4256,6 +4263,9 @@
intel_set_pch_fifo_underrun_reporting(dev, TRANSCODER_A, false);
intel_disable_pipe(dev_priv, pipe);
+ if (intel_crtc->config.dp_encoder_is_mst)
+ intel_ddi_set_vc_payload_alloc(crtc, false);
+
intel_ddi_disable_transcoder_func(dev_priv, cpu_transcoder);
ironlake_pfit_disable(intel_crtc);
@@ -8240,6 +8250,15 @@
goto fail_locked;
}
+ /*
+ * Global gtt pte registers are special registers which actually
+ * forward writes to a chunk of system memory. Which means that
+ * there is no risk that the register values disappear as soon
+ * as we call intel_runtime_pm_put(), so it is correct to wrap
+ * only the pin/unpin/fence and not more.
+ */
+ intel_runtime_pm_get(dev_priv);
+
/* Note that the w/a also requires 2 PTE of padding following
* the bo. We currently fill all unused PTE with the shadow
* page and so we should always have valid PTE following the
@@ -8252,16 +8271,20 @@
ret = i915_gem_object_pin_to_display_plane(obj, alignment, NULL);
if (ret) {
DRM_DEBUG_KMS("failed to move cursor bo into the GTT\n");
+ intel_runtime_pm_put(dev_priv);
goto fail_locked;
}
ret = i915_gem_object_put_fence(obj);
if (ret) {
DRM_DEBUG_KMS("failed to release fence for cursor");
+ intel_runtime_pm_put(dev_priv);
goto fail_unpin;
}
addr = i915_gem_obj_ggtt_offset(obj);
+
+ intel_runtime_pm_put(dev_priv);
} else {
int align = IS_I830(dev) ? 16 * 1024 : 256;
ret = i915_gem_object_attach_phys(obj, align);
@@ -12481,6 +12504,9 @@
/* Acer C720 and C720P Chromebooks (Celeron 2955U) have backlights */
{ 0x0a06, 0x1025, 0x0a11, quirk_backlight_present },
+ /* Acer C720 Chromebook (Core i3 4005U) */
+ { 0x0a16, 0x1025, 0x0a11, quirk_backlight_present },
+
/* Toshiba CB35 Chromebook (Celeron 2955U) */
{ 0x0a06, 0x1179, 0x0a88, quirk_backlight_present },
diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c
index 67cfed6..81d7681 100644
--- a/drivers/gpu/drm/i915/intel_dp.c
+++ b/drivers/gpu/drm/i915/intel_dp.c
@@ -3661,24 +3661,12 @@
return intel_dp_detect_dpcd(intel_dp);
}
-static enum drm_connector_status
-g4x_dp_detect(struct intel_dp *intel_dp)
+static int g4x_digital_port_connected(struct drm_device *dev,
+ struct intel_digital_port *intel_dig_port)
{
- struct drm_device *dev = intel_dp_to_dev(intel_dp);
struct drm_i915_private *dev_priv = dev->dev_private;
- struct intel_digital_port *intel_dig_port = dp_to_dig_port(intel_dp);
uint32_t bit;
- /* Can't disconnect eDP, but you can close the lid... */
- if (is_edp(intel_dp)) {
- enum drm_connector_status status;
-
- status = intel_panel_detect(dev);
- if (status == connector_status_unknown)
- status = connector_status_connected;
- return status;
- }
-
if (IS_VALLEYVIEW(dev)) {
switch (intel_dig_port->port) {
case PORT_B:
@@ -3691,7 +3679,7 @@
bit = PORTD_HOTPLUG_LIVE_STATUS_VLV;
break;
default:
- return connector_status_unknown;
+ return -EINVAL;
}
} else {
switch (intel_dig_port->port) {
@@ -3705,11 +3693,36 @@
bit = PORTD_HOTPLUG_LIVE_STATUS_G4X;
break;
default:
- return connector_status_unknown;
+ return -EINVAL;
}
}
if ((I915_READ(PORT_HOTPLUG_STAT) & bit) == 0)
+ return 0;
+ return 1;
+}
+
+static enum drm_connector_status
+g4x_dp_detect(struct intel_dp *intel_dp)
+{
+ struct drm_device *dev = intel_dp_to_dev(intel_dp);
+ struct intel_digital_port *intel_dig_port = dp_to_dig_port(intel_dp);
+ int ret;
+
+ /* Can't disconnect eDP, but you can close the lid... */
+ if (is_edp(intel_dp)) {
+ enum drm_connector_status status;
+
+ status = intel_panel_detect(dev);
+ if (status == connector_status_unknown)
+ status = connector_status_connected;
+ return status;
+ }
+
+ ret = g4x_digital_port_connected(dev, intel_dig_port);
+ if (ret == -EINVAL)
+ return connector_status_unknown;
+ else if (ret == 0)
return connector_status_disconnected;
return intel_dp_detect_dpcd(intel_dp);
@@ -4066,8 +4079,14 @@
intel_display_power_get(dev_priv, power_domain);
if (long_hpd) {
- if (!ibx_digital_port_connected(dev_priv, intel_dig_port))
- goto mst_fail;
+
+ if (HAS_PCH_SPLIT(dev)) {
+ if (!ibx_digital_port_connected(dev_priv, intel_dig_port))
+ goto mst_fail;
+ } else {
+ if (g4x_digital_port_connected(dev, intel_dig_port) != 1)
+ goto mst_fail;
+ }
if (!intel_dp_get_dpcd(intel_dp)) {
goto mst_fail;
diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c
index 881361c..fdf4026 100644
--- a/drivers/gpu/drm/i915/intel_lvds.c
+++ b/drivers/gpu/drm/i915/intel_lvds.c
@@ -538,7 +538,7 @@
.destroy = intel_encoder_destroy,
};
-static int __init intel_no_lvds_dmi_callback(const struct dmi_system_id *id)
+static int intel_no_lvds_dmi_callback(const struct dmi_system_id *id)
{
DRM_INFO("Skipping LVDS initialization for %s\n", id->ident);
return 1;
diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c
index 59b028f..8e37444 100644
--- a/drivers/gpu/drm/i915/intel_panel.c
+++ b/drivers/gpu/drm/i915/intel_panel.c
@@ -801,7 +801,7 @@
cpu_ctl2 = I915_READ(BLC_PWM_CPU_CTL2);
if (cpu_ctl2 & BLM_PWM_ENABLE) {
- WARN(1, "cpu backlight already enabled\n");
+ DRM_DEBUG_KMS("cpu backlight already enabled\n");
cpu_ctl2 &= ~BLM_PWM_ENABLE;
I915_WRITE(BLC_PWM_CPU_CTL2, cpu_ctl2);
}
@@ -845,7 +845,7 @@
ctl = I915_READ(BLC_PWM_CTL);
if (ctl & BACKLIGHT_DUTY_CYCLE_MASK_PNV) {
- WARN(1, "backlight already enabled\n");
+ DRM_DEBUG_KMS("backlight already enabled\n");
I915_WRITE(BLC_PWM_CTL, 0);
}
@@ -876,7 +876,7 @@
ctl2 = I915_READ(BLC_PWM_CTL2);
if (ctl2 & BLM_PWM_ENABLE) {
- WARN(1, "backlight already enabled\n");
+ DRM_DEBUG_KMS("backlight already enabled\n");
ctl2 &= ~BLM_PWM_ENABLE;
I915_WRITE(BLC_PWM_CTL2, ctl2);
}
@@ -910,7 +910,7 @@
ctl2 = I915_READ(VLV_BLC_PWM_CTL2(pipe));
if (ctl2 & BLM_PWM_ENABLE) {
- WARN(1, "backlight already enabled\n");
+ DRM_DEBUG_KMS("backlight already enabled\n");
ctl2 &= ~BLM_PWM_ENABLE;
I915_WRITE(VLV_BLC_PWM_CTL2(pipe), ctl2);
}
diff --git a/drivers/gpu/drm/i915/intel_tv.c b/drivers/gpu/drm/i915/intel_tv.c
index 32186a6..c69d3ce 100644
--- a/drivers/gpu/drm/i915/intel_tv.c
+++ b/drivers/gpu/drm/i915/intel_tv.c
@@ -1311,6 +1311,7 @@
{
struct drm_display_mode mode;
struct intel_tv *intel_tv = intel_attached_tv(connector);
+ enum drm_connector_status status;
int type;
DRM_DEBUG_KMS("[CONNECTOR:%d:%s] force=%d\n",
@@ -1328,16 +1329,19 @@
if (intel_get_load_detect_pipe(connector, &mode, &tmp, &ctx)) {
type = intel_tv_detect_type(intel_tv, connector);
intel_release_load_detect_pipe(connector, &tmp);
+ status = type < 0 ?
+ connector_status_disconnected :
+ connector_status_connected;
} else
- return connector_status_unknown;
+ status = connector_status_unknown;
drm_modeset_drop_locks(&ctx);
drm_modeset_acquire_fini(&ctx);
} else
return connector->status;
- if (type < 0)
- return connector_status_disconnected;
+ if (status != connector_status_connected)
+ return status;
intel_tv->type = type;
intel_tv_find_better_format(connector);
diff --git a/drivers/gpu/drm/nouveau/core/core/parent.c b/drivers/gpu/drm/nouveau/core/core/parent.c
index 8701968..30a2911 100644
--- a/drivers/gpu/drm/nouveau/core/core/parent.c
+++ b/drivers/gpu/drm/nouveau/core/core/parent.c
@@ -86,7 +86,7 @@
sclass = nv_parent(parent)->sclass;
while (sclass) {
if (++nr < size)
- lclass[nr] = sclass->oclass->handle;
+ lclass[nr] = sclass->oclass->handle & 0xffff;
sclass = sclass->sclass;
}
@@ -96,7 +96,7 @@
if (engine && (oclass = engine->sclass)) {
while (oclass->ofuncs) {
if (++nr < size)
- lclass[nr] = oclass->handle;
+ lclass[nr] = oclass->handle & 0xffff;
oclass++;
}
}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
index 7bfdaa1..36b8716 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
@@ -450,11 +450,11 @@
res,
id_loc - sw_context->buf_start);
if (unlikely(ret != 0))
- goto out_err;
+ return ret;
ret = vmw_resource_val_add(sw_context, res, &node);
if (unlikely(ret != 0))
- goto out_err;
+ return ret;
if (res_type == vmw_res_context && dev_priv->has_mob &&
node->first_usage) {
@@ -468,13 +468,13 @@
ret = vmw_resource_context_res_add(dev_priv, sw_context, res);
if (unlikely(ret != 0))
- goto out_err;
+ return ret;
node->staged_bindings =
kzalloc(sizeof(*node->staged_bindings), GFP_KERNEL);
if (node->staged_bindings == NULL) {
DRM_ERROR("Failed to allocate context binding "
"information.\n");
- goto out_err;
+ return -ENOMEM;
}
INIT_LIST_HEAD(&node->staged_bindings->list);
}
@@ -482,8 +482,7 @@
if (p_val)
*p_val = node;
-out_err:
- return ret;
+ return 0;
}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
index 6ccd993..6eae14d 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
@@ -180,8 +180,9 @@
mutex_lock(&dev_priv->hw_mutex);
+ vmw_write(dev_priv, SVGA_REG_SYNC, SVGA_SYNC_GENERIC);
while (vmw_read(dev_priv, SVGA_REG_BUSY) != 0)
- vmw_write(dev_priv, SVGA_REG_SYNC, SVGA_SYNC_GENERIC);
+ ;
dev_priv->last_read_seqno = ioread32(fifo_mem + SVGA_FIFO_FENCE);
diff --git a/drivers/hwmon/ds1621.c b/drivers/hwmon/ds1621.c
index fc6f5d5..8890870 100644
--- a/drivers/hwmon/ds1621.c
+++ b/drivers/hwmon/ds1621.c
@@ -309,6 +309,7 @@
data->conf |= (resol << DS1621_REG_CONFIG_RESOL_SHIFT);
i2c_smbus_write_byte_data(client, DS1621_REG_CONF, data->conf);
data->update_interval = ds1721_convrates[resol];
+ data->zbits = 7 - resol;
mutex_unlock(&data->update_lock);
return count;
diff --git a/drivers/i2c/busses/i2c-at91.c b/drivers/i2c/busses/i2c-at91.c
index 79a6899..917d545 100644
--- a/drivers/i2c/busses/i2c-at91.c
+++ b/drivers/i2c/busses/i2c-at91.c
@@ -101,6 +101,7 @@
unsigned twi_cwgr_reg;
struct at91_twi_pdata *pdata;
bool use_dma;
+ bool recv_len_abort;
struct at91_twi_dma dma;
};
@@ -267,12 +268,24 @@
*dev->buf = at91_twi_read(dev, AT91_TWI_RHR) & 0xff;
--dev->buf_len;
+ /* return if aborting, we only needed to read RHR to clear RXRDY*/
+ if (dev->recv_len_abort)
+ return;
+
/* handle I2C_SMBUS_BLOCK_DATA */
if (unlikely(dev->msg->flags & I2C_M_RECV_LEN)) {
- dev->msg->flags &= ~I2C_M_RECV_LEN;
- dev->buf_len += *dev->buf;
- dev->msg->len = dev->buf_len + 1;
- dev_dbg(dev->dev, "received block length %d\n", dev->buf_len);
+ /* ensure length byte is a valid value */
+ if (*dev->buf <= I2C_SMBUS_BLOCK_MAX && *dev->buf > 0) {
+ dev->msg->flags &= ~I2C_M_RECV_LEN;
+ dev->buf_len += *dev->buf;
+ dev->msg->len = dev->buf_len + 1;
+ dev_dbg(dev->dev, "received block length %d\n",
+ dev->buf_len);
+ } else {
+ /* abort and send the stop by reading one more byte */
+ dev->recv_len_abort = true;
+ dev->buf_len = 1;
+ }
}
/* send stop if second but last byte has been read */
@@ -421,8 +434,8 @@
}
}
- ret = wait_for_completion_interruptible_timeout(&dev->cmd_complete,
- dev->adapter.timeout);
+ ret = wait_for_completion_io_timeout(&dev->cmd_complete,
+ dev->adapter.timeout);
if (ret == 0) {
dev_err(dev->dev, "controller timed out\n");
at91_init_twi_bus(dev);
@@ -444,6 +457,12 @@
ret = -EIO;
goto error;
}
+ if (dev->recv_len_abort) {
+ dev_err(dev->dev, "invalid smbus block length recvd\n");
+ ret = -EPROTO;
+ goto error;
+ }
+
dev_dbg(dev->dev, "transfer complete\n");
return 0;
@@ -500,6 +519,7 @@
dev->buf_len = m_start->len;
dev->buf = m_start->buf;
dev->msg = m_start;
+ dev->recv_len_abort = false;
ret = at91_do_twi_transfer(dev);
diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c
index 6dc5ded..2f64273 100644
--- a/drivers/i2c/busses/i2c-mv64xxx.c
+++ b/drivers/i2c/busses/i2c-mv64xxx.c
@@ -746,8 +746,7 @@
}
tclk = clk_get_rate(drv_data->clk);
- rc = of_property_read_u32(np, "clock-frequency", &bus_freq);
- if (rc)
+ if (of_property_read_u32(np, "clock-frequency", &bus_freq))
bus_freq = 100000; /* 100kHz by default */
if (!mv64xxx_find_baud_factors(bus_freq, tclk,
diff --git a/drivers/i2c/busses/i2c-rcar.c b/drivers/i2c/busses/i2c-rcar.c
index f3c7139..1cc146c 100644
--- a/drivers/i2c/busses/i2c-rcar.c
+++ b/drivers/i2c/busses/i2c-rcar.c
@@ -34,6 +34,7 @@
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
+#include <linux/spinlock.h>
/* register offsets */
#define ICSCR 0x00 /* slave ctrl */
@@ -95,6 +96,7 @@
struct i2c_msg *msg;
struct clk *clk;
+ spinlock_t lock;
wait_queue_head_t wait;
int pos;
@@ -365,20 +367,20 @@
struct rcar_i2c_priv *priv = ptr;
u32 msr;
+ /*-------------- spin lock -----------------*/
+ spin_lock(&priv->lock);
+
msr = rcar_i2c_read(priv, ICMSR);
+ /* Only handle interrupts that are currently enabled */
+ msr &= rcar_i2c_read(priv, ICMIER);
+
/* Arbitration lost */
if (msr & MAL) {
rcar_i2c_flags_set(priv, (ID_DONE | ID_ARBLOST));
goto out;
}
- /* Stop */
- if (msr & MST) {
- rcar_i2c_flags_set(priv, ID_DONE);
- goto out;
- }
-
/* Nack */
if (msr & MNR) {
/* go to stop phase */
@@ -388,6 +390,12 @@
goto out;
}
+ /* Stop */
+ if (msr & MST) {
+ rcar_i2c_flags_set(priv, ID_DONE);
+ goto out;
+ }
+
if (rcar_i2c_is_recv(priv))
rcar_i2c_flags_set(priv, rcar_i2c_irq_recv(priv, msr));
else
@@ -400,6 +408,9 @@
wake_up(&priv->wait);
}
+ spin_unlock(&priv->lock);
+ /*-------------- spin unlock -----------------*/
+
return IRQ_HANDLED;
}
@@ -409,14 +420,21 @@
{
struct rcar_i2c_priv *priv = i2c_get_adapdata(adap);
struct device *dev = rcar_i2c_priv_to_dev(priv);
+ unsigned long flags;
int i, ret, timeout;
pm_runtime_get_sync(dev);
+ /*-------------- spin lock -----------------*/
+ spin_lock_irqsave(&priv->lock, flags);
+
rcar_i2c_init(priv);
/* start clock */
rcar_i2c_write(priv, ICCCR, priv->icccr);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ /*-------------- spin unlock -----------------*/
+
ret = rcar_i2c_bus_barrier(priv);
if (ret < 0)
goto out;
@@ -428,6 +446,9 @@
break;
}
+ /*-------------- spin lock -----------------*/
+ spin_lock_irqsave(&priv->lock, flags);
+
/* init each data */
priv->msg = &msgs[i];
priv->pos = 0;
@@ -437,6 +458,9 @@
ret = rcar_i2c_prepare_msg(priv);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ /*-------------- spin unlock -----------------*/
+
if (ret < 0)
break;
@@ -540,6 +564,7 @@
irq = platform_get_irq(pdev, 0);
init_waitqueue_head(&priv->wait);
+ spin_lock_init(&priv->lock);
adap = &priv->adap;
adap->nr = pdev->id;
diff --git a/drivers/i2c/busses/i2c-rk3x.c b/drivers/i2c/busses/i2c-rk3x.c
index 69e1185..e637c32 100644
--- a/drivers/i2c/busses/i2c-rk3x.c
+++ b/drivers/i2c/busses/i2c-rk3x.c
@@ -323,6 +323,10 @@
/* ack interrupt */
i2c_writel(i2c, REG_INT_MBRF, REG_IPD);
+ /* Can only handle a maximum of 32 bytes at a time */
+ if (len > 32)
+ len = 32;
+
/* read the data from receive buffer */
for (i = 0; i < len; ++i) {
if (i % 4 == 0)
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index e1e558a..af82563 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1089,6 +1089,30 @@
return err;
}
+static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_attr,
+ u64 *reg_id)
+{
+ void *ib_flow;
+ union ib_flow_spec *ib_spec;
+ struct mlx4_dev *dev = to_mdev(qp->device)->dev;
+ int err = 0;
+
+ if (dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
+ return 0; /* do nothing */
+
+ ib_flow = flow_attr + 1;
+ ib_spec = (union ib_flow_spec *)ib_flow;
+
+ if (ib_spec->type != IB_FLOW_SPEC_ETH || flow_attr->num_of_specs != 1)
+ return 0; /* do nothing */
+
+ err = mlx4_tunnel_steer_add(to_mdev(qp->device)->dev, ib_spec->eth.val.dst_mac,
+ flow_attr->port, qp->qp_num,
+ MLX4_DOMAIN_UVERBS | (flow_attr->priority & 0xff),
+ reg_id);
+ return err;
+}
+
static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp,
struct ib_flow_attr *flow_attr,
int domain)
@@ -1136,6 +1160,12 @@
i++;
}
+ if (i < ARRAY_SIZE(type) && flow_attr->type == IB_FLOW_ATTR_NORMAL) {
+ err = mlx4_ib_tunnel_steer_add(qp, flow_attr, &mflow->reg_id[i]);
+ if (err)
+ goto err_free;
+ }
+
return &mflow->ibflow;
err_free:
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 6778045..efb9eff 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1677,9 +1677,15 @@
}
}
- if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET)
+ if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |
MLX4_IB_LINK_TYPE_ETH;
+ if (dev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+ /* set QP to receive both tunneled & non-tunneled packets */
+ if (!(context->flags & (1 << MLX4_RSS_QPC_FLAG_OFFSET)))
+ context->srqn = cpu_to_be32(7 << 28);
+ }
+ }
if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) {
int is_eth = rdma_port_get_link_layer(
diff --git a/drivers/input/input-mt.c b/drivers/input/input-mt.c
index c30204f..fbe29fc 100644
--- a/drivers/input/input-mt.c
+++ b/drivers/input/input-mt.c
@@ -236,6 +236,18 @@
}
EXPORT_SYMBOL(input_mt_report_pointer_emulation);
+static void __input_mt_drop_unused(struct input_dev *dev, struct input_mt *mt)
+{
+ int i;
+
+ for (i = 0; i < mt->num_slots; i++) {
+ if (!input_mt_is_used(mt, &mt->slots[i])) {
+ input_mt_slot(dev, i);
+ input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1);
+ }
+ }
+}
+
/**
* input_mt_drop_unused() - Inactivate slots not seen in this frame
* @dev: input device with allocated MT slots
@@ -245,19 +257,11 @@
void input_mt_drop_unused(struct input_dev *dev)
{
struct input_mt *mt = dev->mt;
- int i;
- if (!mt)
- return;
-
- for (i = 0; i < mt->num_slots; i++) {
- if (!input_mt_is_used(mt, &mt->slots[i])) {
- input_mt_slot(dev, i);
- input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1);
- }
+ if (mt) {
+ __input_mt_drop_unused(dev, mt);
+ mt->frame++;
}
-
- mt->frame++;
}
EXPORT_SYMBOL(input_mt_drop_unused);
@@ -278,12 +282,14 @@
return;
if (mt->flags & INPUT_MT_DROP_UNUSED)
- input_mt_drop_unused(dev);
+ __input_mt_drop_unused(dev, mt);
if ((mt->flags & INPUT_MT_POINTER) && !(mt->flags & INPUT_MT_SEMI_MT))
use_count = true;
input_mt_report_pointer_emulation(dev, use_count);
+
+ mt->frame++;
}
EXPORT_SYMBOL(input_mt_sync_frame);
diff --git a/drivers/input/mouse/alps.c b/drivers/input/mouse/alps.c
index a59a1a6..a956b98 100644
--- a/drivers/input/mouse/alps.c
+++ b/drivers/input/mouse/alps.c
@@ -2234,8 +2234,8 @@
return 0;
}
- psmouse_info(psmouse,
- "Unknown ALPS touchpad: E7=%3ph, EC=%3ph\n", e7, ec);
+ psmouse_dbg(psmouse,
+ "Likely not an ALPS touchpad: E7=%3ph, EC=%3ph\n", e7, ec);
return -EINVAL;
}
diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c
index ee2a04d..da51738 100644
--- a/drivers/input/mouse/elantech.c
+++ b/drivers/input/mouse/elantech.c
@@ -18,6 +18,7 @@
#include <linux/input/mt.h>
#include <linux/serio.h>
#include <linux/libps2.h>
+#include <asm/unaligned.h>
#include "psmouse.h"
#include "elantech.h"
@@ -403,6 +404,68 @@
input_sync(dev);
}
+static void elantech_report_trackpoint(struct psmouse *psmouse,
+ int packet_type)
+{
+ /*
+ * byte 0: 0 0 sx sy 0 M R L
+ * byte 1:~sx 0 0 0 0 0 0 0
+ * byte 2:~sy 0 0 0 0 0 0 0
+ * byte 3: 0 0 ~sy ~sx 0 1 1 0
+ * byte 4: x7 x6 x5 x4 x3 x2 x1 x0
+ * byte 5: y7 y6 y5 y4 y3 y2 y1 y0
+ *
+ * x and y are written in two's complement spread
+ * over 9 bits with sx/sy the relative top bit and
+ * x7..x0 and y7..y0 the lower bits.
+ * The sign of y is opposite to what the input driver
+ * expects for a relative movement
+ */
+
+ struct elantech_data *etd = psmouse->private;
+ struct input_dev *tp_dev = etd->tp_dev;
+ unsigned char *packet = psmouse->packet;
+ int x, y;
+ u32 t;
+
+ if (dev_WARN_ONCE(&psmouse->ps2dev.serio->dev,
+ !tp_dev,
+ psmouse_fmt("Unexpected trackpoint message\n"))) {
+ if (etd->debug == 1)
+ elantech_packet_dump(psmouse);
+ return;
+ }
+
+ t = get_unaligned_le32(&packet[0]);
+
+ switch (t & ~7U) {
+ case 0x06000030U:
+ case 0x16008020U:
+ case 0x26800010U:
+ case 0x36808000U:
+ x = packet[4] - (int)((packet[1]^0x80) << 1);
+ y = (int)((packet[2]^0x80) << 1) - packet[5];
+
+ input_report_key(tp_dev, BTN_LEFT, packet[0] & 0x01);
+ input_report_key(tp_dev, BTN_RIGHT, packet[0] & 0x02);
+ input_report_key(tp_dev, BTN_MIDDLE, packet[0] & 0x04);
+
+ input_report_rel(tp_dev, REL_X, x);
+ input_report_rel(tp_dev, REL_Y, y);
+
+ input_sync(tp_dev);
+
+ break;
+
+ default:
+ /* Dump unexpected packet sequences if debug=1 (default) */
+ if (etd->debug == 1)
+ elantech_packet_dump(psmouse);
+
+ break;
+ }
+}
+
/*
* Interpret complete data packets and report absolute mode input events for
* hardware version 3. (12 byte packets for two fingers)
@@ -715,6 +778,8 @@
if ((packet[0] & 0x0c) == 0x0c && (packet[3] & 0xce) == 0x0c)
return PACKET_V3_TAIL;
+ if ((packet[3] & 0x0f) == 0x06)
+ return PACKET_TRACKPOINT;
}
return PACKET_UNKNOWN;
@@ -791,14 +856,23 @@
case 3:
packet_type = elantech_packet_check_v3(psmouse);
- /* ignore debounce */
- if (packet_type == PACKET_DEBOUNCE)
- return PSMOUSE_FULL_PACKET;
-
- if (packet_type == PACKET_UNKNOWN)
+ switch (packet_type) {
+ case PACKET_UNKNOWN:
return PSMOUSE_BAD_DATA;
- elantech_report_absolute_v3(psmouse, packet_type);
+ case PACKET_DEBOUNCE:
+ /* ignore debounce */
+ break;
+
+ case PACKET_TRACKPOINT:
+ elantech_report_trackpoint(psmouse, packet_type);
+ break;
+
+ default:
+ elantech_report_absolute_v3(psmouse, packet_type);
+ break;
+ }
+
break;
case 4:
@@ -1018,8 +1092,10 @@
* Asus UX31 0x361f00 20, 15, 0e clickpad
* Asus UX32VD 0x361f02 00, 15, 0e clickpad
* Avatar AVIU-145A2 0x361f00 ? clickpad
+ * Fujitsu H730 0x570f00 c0, 14, 0c 3 hw buttons (**)
* Gigabyte U2442 0x450f01 58, 17, 0c 2 hw buttons
* Lenovo L430 0x350f02 b9, 15, 0c 2 hw buttons (*)
+ * Lenovo L530 0x350f02 b9, 15, 0c 2 hw buttons (*)
* Samsung NF210 0x150b00 78, 14, 0a 2 hw buttons
* Samsung NP770Z5E 0x575f01 10, 15, 0f clickpad
* Samsung NP700Z5B 0x361f06 21, 15, 0f clickpad
@@ -1029,6 +1105,8 @@
* Samsung RF710 0x450f00 ? 2 hw buttons
* System76 Pangolin 0x250f01 ? 2 hw buttons
* (*) + 3 trackpoint buttons
+ * (**) + 0 trackpoint buttons
+ * Note: Lenovo L430 and Lenovo L430 have the same fw_version/caps
*/
static void elantech_set_buttonpad_prop(struct psmouse *psmouse)
{
@@ -1324,6 +1402,10 @@
*/
static void elantech_disconnect(struct psmouse *psmouse)
{
+ struct elantech_data *etd = psmouse->private;
+
+ if (etd->tp_dev)
+ input_unregister_device(etd->tp_dev);
sysfs_remove_group(&psmouse->ps2dev.serio->dev.kobj,
&elantech_attr_group);
kfree(psmouse->private);
@@ -1438,8 +1520,10 @@
int elantech_init(struct psmouse *psmouse)
{
struct elantech_data *etd;
- int i, error;
+ int i;
+ int error = -EINVAL;
unsigned char param[3];
+ struct input_dev *tp_dev;
psmouse->private = etd = kzalloc(sizeof(struct elantech_data), GFP_KERNEL);
if (!etd)
@@ -1498,14 +1582,49 @@
goto init_fail;
}
+ /* The MSB indicates the presence of the trackpoint */
+ if ((etd->capabilities[0] & 0x80) == 0x80) {
+ tp_dev = input_allocate_device();
+
+ if (!tp_dev) {
+ error = -ENOMEM;
+ goto init_fail_tp_alloc;
+ }
+
+ etd->tp_dev = tp_dev;
+ snprintf(etd->tp_phys, sizeof(etd->tp_phys), "%s/input1",
+ psmouse->ps2dev.serio->phys);
+ tp_dev->phys = etd->tp_phys;
+ tp_dev->name = "Elantech PS/2 TrackPoint";
+ tp_dev->id.bustype = BUS_I8042;
+ tp_dev->id.vendor = 0x0002;
+ tp_dev->id.product = PSMOUSE_ELANTECH;
+ tp_dev->id.version = 0x0000;
+ tp_dev->dev.parent = &psmouse->ps2dev.serio->dev;
+ tp_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL);
+ tp_dev->relbit[BIT_WORD(REL_X)] =
+ BIT_MASK(REL_X) | BIT_MASK(REL_Y);
+ tp_dev->keybit[BIT_WORD(BTN_LEFT)] =
+ BIT_MASK(BTN_LEFT) | BIT_MASK(BTN_MIDDLE) |
+ BIT_MASK(BTN_RIGHT);
+ error = input_register_device(etd->tp_dev);
+ if (error < 0)
+ goto init_fail_tp_reg;
+ }
+
psmouse->protocol_handler = elantech_process_byte;
psmouse->disconnect = elantech_disconnect;
psmouse->reconnect = elantech_reconnect;
psmouse->pktsize = etd->hw_version > 1 ? 6 : 4;
return 0;
-
+ init_fail_tp_reg:
+ input_free_device(tp_dev);
+ init_fail_tp_alloc:
+ sysfs_remove_group(&psmouse->ps2dev.serio->dev.kobj,
+ &elantech_attr_group);
init_fail:
+ psmouse_reset(psmouse);
kfree(etd);
- return -1;
+ return error;
}
diff --git a/drivers/input/mouse/elantech.h b/drivers/input/mouse/elantech.h
index 9e0e2a1..6f3afec 100644
--- a/drivers/input/mouse/elantech.h
+++ b/drivers/input/mouse/elantech.h
@@ -94,6 +94,7 @@
#define PACKET_V4_HEAD 0x05
#define PACKET_V4_MOTION 0x06
#define PACKET_V4_STATUS 0x07
+#define PACKET_TRACKPOINT 0x08
/*
* track up to 5 fingers for v4 hardware
@@ -114,6 +115,8 @@
};
struct elantech_data {
+ struct input_dev *tp_dev; /* Relative device for trackpoint */
+ char tp_phys[32];
unsigned char reg_07;
unsigned char reg_10;
unsigned char reg_11;
diff --git a/drivers/input/serio/i8042-sparcio.h b/drivers/input/serio/i8042-sparcio.h
index d6aa4c6..93cb791 100644
--- a/drivers/input/serio/i8042-sparcio.h
+++ b/drivers/input/serio/i8042-sparcio.h
@@ -17,7 +17,6 @@
#define I8042_MUX_PHYS_DESC "sparcps2/serio%d"
static void __iomem *kbd_iobase;
-static struct resource *kbd_res;
#define I8042_COMMAND_REG (kbd_iobase + 0x64UL)
#define I8042_DATA_REG (kbd_iobase + 0x60UL)
@@ -44,6 +43,8 @@
#ifdef CONFIG_PCI
+static struct resource *kbd_res;
+
#define OBP_PS2KBD_NAME1 "kb_ps2"
#define OBP_PS2KBD_NAME2 "keyboard"
#define OBP_PS2MS_NAME1 "kdmouse"
diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index 129729d..aa29198 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -15,10 +15,10 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/device.h>
+#include <linux/timer.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/leds.h>
-#include <linux/workqueue.h>
#include "leds.h"
static struct class *leds_class;
@@ -97,10 +97,9 @@
NULL,
};
-static void led_work_function(struct work_struct *ws)
+static void led_timer_function(unsigned long data)
{
- struct led_classdev *led_cdev =
- container_of(ws, struct led_classdev, blink_work.work);
+ struct led_classdev *led_cdev = (void *)data;
unsigned long brightness;
unsigned long delay;
@@ -144,8 +143,7 @@
}
}
- queue_delayed_work(system_wq, &led_cdev->blink_work,
- msecs_to_jiffies(delay));
+ mod_timer(&led_cdev->blink_timer, jiffies + msecs_to_jiffies(delay));
}
static void set_brightness_delayed(struct work_struct *ws)
@@ -233,7 +231,9 @@
INIT_WORK(&led_cdev->set_brightness_work, set_brightness_delayed);
- INIT_DELAYED_WORK(&led_cdev->blink_work, led_work_function);
+ init_timer(&led_cdev->blink_timer);
+ led_cdev->blink_timer.function = led_timer_function;
+ led_cdev->blink_timer.data = (unsigned long)led_cdev;
#ifdef CONFIG_LEDS_TRIGGERS
led_trigger_set_default(led_cdev);
diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c
index 4bb1168..71b40d3 100644
--- a/drivers/leds/led-core.c
+++ b/drivers/leds/led-core.c
@@ -16,7 +16,6 @@
#include <linux/module.h>
#include <linux/rwsem.h>
#include <linux/leds.h>
-#include <linux/workqueue.h>
#include "leds.h"
DECLARE_RWSEM(leds_list_lock);
@@ -52,7 +51,7 @@
return;
}
- queue_delayed_work(system_wq, &led_cdev->blink_work, 1);
+ mod_timer(&led_cdev->blink_timer, jiffies + 1);
}
@@ -76,7 +75,7 @@
unsigned long *delay_on,
unsigned long *delay_off)
{
- cancel_delayed_work_sync(&led_cdev->blink_work);
+ del_timer_sync(&led_cdev->blink_timer);
led_cdev->flags &= ~LED_BLINK_ONESHOT;
led_cdev->flags &= ~LED_BLINK_ONESHOT_STOP;
@@ -91,7 +90,7 @@
int invert)
{
if ((led_cdev->flags & LED_BLINK_ONESHOT) &&
- delayed_work_pending(&led_cdev->blink_work))
+ timer_pending(&led_cdev->blink_timer))
return;
led_cdev->flags |= LED_BLINK_ONESHOT;
@@ -108,7 +107,7 @@
void led_stop_software_blink(struct led_classdev *led_cdev)
{
- cancel_delayed_work_sync(&led_cdev->blink_work);
+ del_timer_sync(&led_cdev->blink_timer);
led_cdev->blink_delay_on = 0;
led_cdev->blink_delay_off = 0;
}
@@ -117,7 +116,7 @@
void led_set_brightness(struct led_classdev *led_cdev,
enum led_brightness brightness)
{
- /* delay brightness setting if need to stop soft-blink work */
+ /* delay brightness setting if need to stop soft-blink timer */
if (led_cdev->blink_delay_on || led_cdev->blink_delay_off) {
led_cdev->delayed_set_value = brightness;
schedule_work(&led_cdev->set_brightness_work);
diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index 5a4bfe3..46c4643 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -1434,6 +1434,10 @@
mutex_lock(&chip->mutex);
ret = get_chip(map, chip, base, FL_LOCKING);
+ if (ret) {
+ mutex_unlock(&chip->mutex);
+ return ret;
+ }
/* Enter lock register command */
cfi_send_gen_cmd(0xAA, cfi->addr_unlock1,
diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
index 059c741..3fe45c7 100644
--- a/drivers/net/ethernet/3com/3c59x.c
+++ b/drivers/net/ethernet/3com/3c59x.c
@@ -2177,10 +2177,10 @@
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
vp->tx_ring[entry].frag[i+1].addr =
- cpu_to_le32(pci_map_single(
- VORTEX_PCI(vp),
- (void *)skb_frag_address(frag),
- skb_frag_size(frag), PCI_DMA_TODEVICE));
+ cpu_to_le32(skb_frag_dma_map(
+ &VORTEX_PCI(vp)->dev,
+ frag,
+ frag->page_offset, frag->size, DMA_TO_DEVICE));
if (i == skb_shinfo(skb)->nr_frags-1)
vp->tx_ring[entry].frag[i+1].length = cpu_to_le32(skb_frag_size(frag)|LAST_FRAG);
diff --git a/drivers/net/ethernet/aeroflex/greth.c b/drivers/net/ethernet/aeroflex/greth.c
index 23578df..3005155 100644
--- a/drivers/net/ethernet/aeroflex/greth.c
+++ b/drivers/net/ethernet/aeroflex/greth.c
@@ -123,6 +123,12 @@
GRETH_REGORIN(greth->regs->control, GRETH_TXEN);
}
+static inline void greth_enable_tx_and_irq(struct greth_private *greth)
+{
+ wmb(); /* BDs must been written to memory before enabling TX */
+ GRETH_REGORIN(greth->regs->control, GRETH_TXEN | GRETH_TXI);
+}
+
static inline void greth_disable_tx(struct greth_private *greth)
{
GRETH_REGANDIN(greth->regs->control, ~GRETH_TXEN);
@@ -447,29 +453,30 @@
return err;
}
+static inline u16 greth_num_free_bds(u16 tx_last, u16 tx_next)
+{
+ if (tx_next < tx_last)
+ return (tx_last - tx_next) - 1;
+ else
+ return GRETH_TXBD_NUM - (tx_next - tx_last) - 1;
+}
static netdev_tx_t
greth_start_xmit_gbit(struct sk_buff *skb, struct net_device *dev)
{
struct greth_private *greth = netdev_priv(dev);
struct greth_bd *bdp;
- u32 status = 0, dma_addr, ctrl;
+ u32 status, dma_addr;
int curr_tx, nr_frags, i, err = NETDEV_TX_OK;
unsigned long flags;
+ u16 tx_last;
nr_frags = skb_shinfo(skb)->nr_frags;
+ tx_last = greth->tx_last;
+ rmb(); /* tx_last is updated by the poll task */
- /* Clean TX Ring */
- greth_clean_tx_gbit(dev);
-
- if (greth->tx_free < nr_frags + 1) {
- spin_lock_irqsave(&greth->devlock, flags);/*save from poll/irq*/
- ctrl = GRETH_REGLOAD(greth->regs->control);
- /* Enable TX IRQ only if not already in poll() routine */
- if (ctrl & GRETH_RXI)
- GRETH_REGSAVE(greth->regs->control, ctrl | GRETH_TXI);
+ if (greth_num_free_bds(tx_last, greth->tx_next) < nr_frags + 1) {
netif_stop_queue(dev);
- spin_unlock_irqrestore(&greth->devlock, flags);
err = NETDEV_TX_BUSY;
goto out;
}
@@ -488,6 +495,8 @@
/* Linear buf */
if (nr_frags != 0)
status = GRETH_TXBD_MORE;
+ else
+ status = GRETH_BD_IE;
if (skb->ip_summed == CHECKSUM_PARTIAL)
status |= GRETH_TXBD_CSALL;
@@ -545,14 +554,12 @@
/* Enable the descriptor chain by enabling the first descriptor */
bdp = greth->tx_bd_base + greth->tx_next;
- greth_write_bd(&bdp->stat, greth_read_bd(&bdp->stat) | GRETH_BD_EN);
- greth->tx_next = curr_tx;
- greth->tx_free -= nr_frags + 1;
-
- wmb();
+ greth_write_bd(&bdp->stat,
+ greth_read_bd(&bdp->stat) | GRETH_BD_EN);
spin_lock_irqsave(&greth->devlock, flags); /*save from poll/irq*/
- greth_enable_tx(greth);
+ greth->tx_next = curr_tx;
+ greth_enable_tx_and_irq(greth);
spin_unlock_irqrestore(&greth->devlock, flags);
return NETDEV_TX_OK;
@@ -648,7 +655,6 @@
if (greth->tx_free > 0) {
netif_wake_queue(dev);
}
-
}
static inline void greth_update_tx_stats(struct net_device *dev, u32 stat)
@@ -670,20 +676,22 @@
{
struct greth_private *greth;
struct greth_bd *bdp, *bdp_last_frag;
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
u32 stat;
int nr_frags, i;
+ u16 tx_last;
greth = netdev_priv(dev);
+ tx_last = greth->tx_last;
- while (greth->tx_free < GRETH_TXBD_NUM) {
+ while (tx_last != greth->tx_next) {
- skb = greth->tx_skbuff[greth->tx_last];
+ skb = greth->tx_skbuff[tx_last];
nr_frags = skb_shinfo(skb)->nr_frags;
/* We only clean fully completed SKBs */
- bdp_last_frag = greth->tx_bd_base + SKIP_TX(greth->tx_last, nr_frags);
+ bdp_last_frag = greth->tx_bd_base + SKIP_TX(tx_last, nr_frags);
GRETH_REGSAVE(greth->regs->status, GRETH_INT_TE | GRETH_INT_TX);
mb();
@@ -692,14 +700,14 @@
if (stat & GRETH_BD_EN)
break;
- greth->tx_skbuff[greth->tx_last] = NULL;
+ greth->tx_skbuff[tx_last] = NULL;
greth_update_tx_stats(dev, stat);
dev->stats.tx_bytes += skb->len;
- bdp = greth->tx_bd_base + greth->tx_last;
+ bdp = greth->tx_bd_base + tx_last;
- greth->tx_last = NEXT_TX(greth->tx_last);
+ tx_last = NEXT_TX(tx_last);
dma_unmap_single(greth->dev,
greth_read_bd(&bdp->addr),
@@ -708,21 +716,26 @@
for (i = 0; i < nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- bdp = greth->tx_bd_base + greth->tx_last;
+ bdp = greth->tx_bd_base + tx_last;
dma_unmap_page(greth->dev,
greth_read_bd(&bdp->addr),
skb_frag_size(frag),
DMA_TO_DEVICE);
- greth->tx_last = NEXT_TX(greth->tx_last);
+ tx_last = NEXT_TX(tx_last);
}
- greth->tx_free += nr_frags+1;
dev_kfree_skb(skb);
}
+ if (skb) { /* skb is set only if the above while loop was entered */
+ wmb();
+ greth->tx_last = tx_last;
- if (netif_queue_stopped(dev) && (greth->tx_free > (MAX_SKB_FRAGS+1)))
- netif_wake_queue(dev);
+ if (netif_queue_stopped(dev) &&
+ (greth_num_free_bds(tx_last, greth->tx_next) >
+ (MAX_SKB_FRAGS+1)))
+ netif_wake_queue(dev);
+ }
}
static int greth_rx(struct net_device *dev, int limit)
@@ -965,16 +978,12 @@
greth = container_of(napi, struct greth_private, napi);
restart_txrx_poll:
- if (netif_queue_stopped(greth->netdev)) {
- if (greth->gbit_mac)
- greth_clean_tx_gbit(greth->netdev);
- else
- greth_clean_tx(greth->netdev);
- }
-
if (greth->gbit_mac) {
+ greth_clean_tx_gbit(greth->netdev);
work_done += greth_rx_gbit(greth->netdev, budget - work_done);
} else {
+ if (netif_queue_stopped(greth->netdev))
+ greth_clean_tx(greth->netdev);
work_done += greth_rx(greth->netdev, budget - work_done);
}
@@ -983,7 +992,8 @@
spin_lock_irqsave(&greth->devlock, flags);
ctrl = GRETH_REGLOAD(greth->regs->control);
- if (netif_queue_stopped(greth->netdev)) {
+ if ((greth->gbit_mac && (greth->tx_last != greth->tx_next)) ||
+ (!greth->gbit_mac && netif_queue_stopped(greth->netdev))) {
GRETH_REGSAVE(greth->regs->control,
ctrl | GRETH_TXI | GRETH_RXI);
mask = GRETH_INT_RX | GRETH_INT_RE |
diff --git a/drivers/net/ethernet/aeroflex/greth.h b/drivers/net/ethernet/aeroflex/greth.h
index 232a622..ae16ac9 100644
--- a/drivers/net/ethernet/aeroflex/greth.h
+++ b/drivers/net/ethernet/aeroflex/greth.h
@@ -107,7 +107,7 @@
u16 tx_next;
u16 tx_last;
- u16 tx_free;
+ u16 tx_free; /* only used on 10/100Mbit */
u16 rx_cur;
struct greth_regs *regs; /* Address of controller registers. */
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
index 346592d..a3c1135 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
@@ -272,8 +272,8 @@
struct xgbe_prv_data *pdata = filp->private_data;
unsigned int value;
- value = pdata->hw_if.read_mmd_regs(pdata, pdata->debugfs_xpcs_mmd,
- pdata->debugfs_xpcs_reg);
+ value = XMDIO_READ(pdata, pdata->debugfs_xpcs_mmd,
+ pdata->debugfs_xpcs_reg);
return xgbe_common_read(buffer, count, ppos, value);
}
@@ -290,8 +290,8 @@
if (len < 0)
return len;
- pdata->hw_if.write_mmd_regs(pdata, pdata->debugfs_xpcs_mmd,
- pdata->debugfs_xpcs_reg, value);
+ XMDIO_WRITE(pdata, pdata->debugfs_xpcs_mmd, pdata->debugfs_xpcs_reg,
+ value);
return len;
}
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
index edaca44..ea27383 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
@@ -348,7 +348,7 @@
/* Clear MAC flow control */
max_q_count = XGMAC_MAX_FLOW_CONTROL_QUEUES;
- q_count = min_t(unsigned int, pdata->rx_q_count, max_q_count);
+ q_count = min_t(unsigned int, pdata->tx_q_count, max_q_count);
reg = MAC_Q0TFCR;
for (i = 0; i < q_count; i++) {
reg_val = XGMAC_IOREAD(pdata, reg);
@@ -373,7 +373,7 @@
/* Set MAC flow control */
max_q_count = XGMAC_MAX_FLOW_CONTROL_QUEUES;
- q_count = min_t(unsigned int, pdata->rx_q_count, max_q_count);
+ q_count = min_t(unsigned int, pdata->tx_q_count, max_q_count);
reg = MAC_Q0TFCR;
for (i = 0; i < q_count; i++) {
reg_val = XGMAC_IOREAD(pdata, reg);
@@ -509,8 +509,8 @@
XGMAC_IOWRITE(pdata, MAC_IER, mac_ier);
/* Enable all counter interrupts */
- XGMAC_IOWRITE_BITS(pdata, MMC_RIER, ALL_INTERRUPTS, 0xff);
- XGMAC_IOWRITE_BITS(pdata, MMC_TIER, ALL_INTERRUPTS, 0xff);
+ XGMAC_IOWRITE_BITS(pdata, MMC_RIER, ALL_INTERRUPTS, 0xffffffff);
+ XGMAC_IOWRITE_BITS(pdata, MMC_TIER, ALL_INTERRUPTS, 0xffffffff);
}
static int xgbe_set_gmii_speed(struct xgbe_prv_data *pdata)
@@ -1633,6 +1633,9 @@
{
unsigned int i, count;
+ if (XGMAC_GET_BITS(pdata->hw_feat.version, MAC_VR, SNPSVER) < 0x21)
+ return 0;
+
for (i = 0; i < pdata->tx_q_count; i++)
XGMAC_MTL_IOWRITE_BITS(pdata, i, MTL_Q_TQOMR, FTQ, 1);
@@ -1703,8 +1706,8 @@
XGMAC_IOWRITE_BITS(pdata, MTL_OMR, RAA, MTL_RAA_SP);
}
-static unsigned int xgbe_calculate_per_queue_fifo(unsigned long fifo_size,
- unsigned char queue_count)
+static unsigned int xgbe_calculate_per_queue_fifo(unsigned int fifo_size,
+ unsigned int queue_count)
{
unsigned int q_fifo_size = 0;
enum xgbe_mtl_fifo_size p_fifo = XGMAC_MTL_FIFO_SIZE_256;
@@ -1748,6 +1751,10 @@
q_fifo_size = XGBE_FIFO_SIZE_KB(256);
break;
}
+
+ /* The configured value is not the actual amount of fifo RAM */
+ q_fifo_size = min_t(unsigned int, XGBE_FIFO_MAX, q_fifo_size);
+
q_fifo_size = q_fifo_size / queue_count;
/* Set the queue fifo size programmable value */
@@ -1947,6 +1954,32 @@
xgbe_disable_rx_vlan_stripping(pdata);
}
+static u64 xgbe_mmc_read(struct xgbe_prv_data *pdata, unsigned int reg_lo)
+{
+ bool read_hi;
+ u64 val;
+
+ switch (reg_lo) {
+ /* These registers are always 64 bit */
+ case MMC_TXOCTETCOUNT_GB_LO:
+ case MMC_TXOCTETCOUNT_G_LO:
+ case MMC_RXOCTETCOUNT_GB_LO:
+ case MMC_RXOCTETCOUNT_G_LO:
+ read_hi = true;
+ break;
+
+ default:
+ read_hi = false;
+ };
+
+ val = XGMAC_IOREAD(pdata, reg_lo);
+
+ if (read_hi)
+ val |= ((u64)XGMAC_IOREAD(pdata, reg_lo + 4) << 32);
+
+ return val;
+}
+
static void xgbe_tx_mmc_int(struct xgbe_prv_data *pdata)
{
struct xgbe_mmc_stats *stats = &pdata->mmc_stats;
@@ -1954,75 +1987,75 @@
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXOCTETCOUNT_GB))
stats->txoctetcount_gb +=
- XGMAC_IOREAD(pdata, MMC_TXOCTETCOUNT_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TXOCTETCOUNT_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXFRAMECOUNT_GB))
stats->txframecount_gb +=
- XGMAC_IOREAD(pdata, MMC_TXFRAMECOUNT_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TXFRAMECOUNT_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXBROADCASTFRAMES_G))
stats->txbroadcastframes_g +=
- XGMAC_IOREAD(pdata, MMC_TXBROADCASTFRAMES_G_LO);
+ xgbe_mmc_read(pdata, MMC_TXBROADCASTFRAMES_G_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXMULTICASTFRAMES_G))
stats->txmulticastframes_g +=
- XGMAC_IOREAD(pdata, MMC_TXMULTICASTFRAMES_G_LO);
+ xgbe_mmc_read(pdata, MMC_TXMULTICASTFRAMES_G_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TX64OCTETS_GB))
stats->tx64octets_gb +=
- XGMAC_IOREAD(pdata, MMC_TX64OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TX64OCTETS_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TX65TO127OCTETS_GB))
stats->tx65to127octets_gb +=
- XGMAC_IOREAD(pdata, MMC_TX65TO127OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TX65TO127OCTETS_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TX128TO255OCTETS_GB))
stats->tx128to255octets_gb +=
- XGMAC_IOREAD(pdata, MMC_TX128TO255OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TX128TO255OCTETS_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TX256TO511OCTETS_GB))
stats->tx256to511octets_gb +=
- XGMAC_IOREAD(pdata, MMC_TX256TO511OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TX256TO511OCTETS_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TX512TO1023OCTETS_GB))
stats->tx512to1023octets_gb +=
- XGMAC_IOREAD(pdata, MMC_TX512TO1023OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TX512TO1023OCTETS_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TX1024TOMAXOCTETS_GB))
stats->tx1024tomaxoctets_gb +=
- XGMAC_IOREAD(pdata, MMC_TX1024TOMAXOCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TX1024TOMAXOCTETS_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXUNICASTFRAMES_GB))
stats->txunicastframes_gb +=
- XGMAC_IOREAD(pdata, MMC_TXUNICASTFRAMES_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TXUNICASTFRAMES_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXMULTICASTFRAMES_GB))
stats->txmulticastframes_gb +=
- XGMAC_IOREAD(pdata, MMC_TXMULTICASTFRAMES_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TXMULTICASTFRAMES_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXBROADCASTFRAMES_GB))
stats->txbroadcastframes_g +=
- XGMAC_IOREAD(pdata, MMC_TXBROADCASTFRAMES_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TXBROADCASTFRAMES_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXUNDERFLOWERROR))
stats->txunderflowerror +=
- XGMAC_IOREAD(pdata, MMC_TXUNDERFLOWERROR_LO);
+ xgbe_mmc_read(pdata, MMC_TXUNDERFLOWERROR_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXOCTETCOUNT_G))
stats->txoctetcount_g +=
- XGMAC_IOREAD(pdata, MMC_TXOCTETCOUNT_G_LO);
+ xgbe_mmc_read(pdata, MMC_TXOCTETCOUNT_G_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXFRAMECOUNT_G))
stats->txframecount_g +=
- XGMAC_IOREAD(pdata, MMC_TXFRAMECOUNT_G_LO);
+ xgbe_mmc_read(pdata, MMC_TXFRAMECOUNT_G_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXPAUSEFRAMES))
stats->txpauseframes +=
- XGMAC_IOREAD(pdata, MMC_TXPAUSEFRAMES_LO);
+ xgbe_mmc_read(pdata, MMC_TXPAUSEFRAMES_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_TISR, TXVLANFRAMES_G))
stats->txvlanframes_g +=
- XGMAC_IOREAD(pdata, MMC_TXVLANFRAMES_G_LO);
+ xgbe_mmc_read(pdata, MMC_TXVLANFRAMES_G_LO);
}
static void xgbe_rx_mmc_int(struct xgbe_prv_data *pdata)
@@ -2032,95 +2065,95 @@
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXFRAMECOUNT_GB))
stats->rxframecount_gb +=
- XGMAC_IOREAD(pdata, MMC_RXFRAMECOUNT_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RXFRAMECOUNT_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXOCTETCOUNT_GB))
stats->rxoctetcount_gb +=
- XGMAC_IOREAD(pdata, MMC_RXOCTETCOUNT_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RXOCTETCOUNT_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXOCTETCOUNT_G))
stats->rxoctetcount_g +=
- XGMAC_IOREAD(pdata, MMC_RXOCTETCOUNT_G_LO);
+ xgbe_mmc_read(pdata, MMC_RXOCTETCOUNT_G_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXBROADCASTFRAMES_G))
stats->rxbroadcastframes_g +=
- XGMAC_IOREAD(pdata, MMC_RXBROADCASTFRAMES_G_LO);
+ xgbe_mmc_read(pdata, MMC_RXBROADCASTFRAMES_G_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXMULTICASTFRAMES_G))
stats->rxmulticastframes_g +=
- XGMAC_IOREAD(pdata, MMC_RXMULTICASTFRAMES_G_LO);
+ xgbe_mmc_read(pdata, MMC_RXMULTICASTFRAMES_G_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXCRCERROR))
stats->rxcrcerror +=
- XGMAC_IOREAD(pdata, MMC_RXCRCERROR_LO);
+ xgbe_mmc_read(pdata, MMC_RXCRCERROR_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXRUNTERROR))
stats->rxrunterror +=
- XGMAC_IOREAD(pdata, MMC_RXRUNTERROR);
+ xgbe_mmc_read(pdata, MMC_RXRUNTERROR);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXJABBERERROR))
stats->rxjabbererror +=
- XGMAC_IOREAD(pdata, MMC_RXJABBERERROR);
+ xgbe_mmc_read(pdata, MMC_RXJABBERERROR);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXUNDERSIZE_G))
stats->rxundersize_g +=
- XGMAC_IOREAD(pdata, MMC_RXUNDERSIZE_G);
+ xgbe_mmc_read(pdata, MMC_RXUNDERSIZE_G);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXOVERSIZE_G))
stats->rxoversize_g +=
- XGMAC_IOREAD(pdata, MMC_RXOVERSIZE_G);
+ xgbe_mmc_read(pdata, MMC_RXOVERSIZE_G);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RX64OCTETS_GB))
stats->rx64octets_gb +=
- XGMAC_IOREAD(pdata, MMC_RX64OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RX64OCTETS_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RX65TO127OCTETS_GB))
stats->rx65to127octets_gb +=
- XGMAC_IOREAD(pdata, MMC_RX65TO127OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RX65TO127OCTETS_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RX128TO255OCTETS_GB))
stats->rx128to255octets_gb +=
- XGMAC_IOREAD(pdata, MMC_RX128TO255OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RX128TO255OCTETS_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RX256TO511OCTETS_GB))
stats->rx256to511octets_gb +=
- XGMAC_IOREAD(pdata, MMC_RX256TO511OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RX256TO511OCTETS_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RX512TO1023OCTETS_GB))
stats->rx512to1023octets_gb +=
- XGMAC_IOREAD(pdata, MMC_RX512TO1023OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RX512TO1023OCTETS_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RX1024TOMAXOCTETS_GB))
stats->rx1024tomaxoctets_gb +=
- XGMAC_IOREAD(pdata, MMC_RX1024TOMAXOCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RX1024TOMAXOCTETS_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXUNICASTFRAMES_G))
stats->rxunicastframes_g +=
- XGMAC_IOREAD(pdata, MMC_RXUNICASTFRAMES_G_LO);
+ xgbe_mmc_read(pdata, MMC_RXUNICASTFRAMES_G_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXLENGTHERROR))
stats->rxlengtherror +=
- XGMAC_IOREAD(pdata, MMC_RXLENGTHERROR_LO);
+ xgbe_mmc_read(pdata, MMC_RXLENGTHERROR_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXOUTOFRANGETYPE))
stats->rxoutofrangetype +=
- XGMAC_IOREAD(pdata, MMC_RXOUTOFRANGETYPE_LO);
+ xgbe_mmc_read(pdata, MMC_RXOUTOFRANGETYPE_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXPAUSEFRAMES))
stats->rxpauseframes +=
- XGMAC_IOREAD(pdata, MMC_RXPAUSEFRAMES_LO);
+ xgbe_mmc_read(pdata, MMC_RXPAUSEFRAMES_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXFIFOOVERFLOW))
stats->rxfifooverflow +=
- XGMAC_IOREAD(pdata, MMC_RXFIFOOVERFLOW_LO);
+ xgbe_mmc_read(pdata, MMC_RXFIFOOVERFLOW_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXVLANFRAMES_GB))
stats->rxvlanframes_gb +=
- XGMAC_IOREAD(pdata, MMC_RXVLANFRAMES_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RXVLANFRAMES_GB_LO);
if (XGMAC_GET_BITS(mmc_isr, MMC_RISR, RXWATCHDOGERROR))
stats->rxwatchdogerror +=
- XGMAC_IOREAD(pdata, MMC_RXWATCHDOGERROR);
+ xgbe_mmc_read(pdata, MMC_RXWATCHDOGERROR);
}
static void xgbe_read_mmc_stats(struct xgbe_prv_data *pdata)
@@ -2131,127 +2164,127 @@
XGMAC_IOWRITE_BITS(pdata, MMC_CR, MCF, 1);
stats->txoctetcount_gb +=
- XGMAC_IOREAD(pdata, MMC_TXOCTETCOUNT_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TXOCTETCOUNT_GB_LO);
stats->txframecount_gb +=
- XGMAC_IOREAD(pdata, MMC_TXFRAMECOUNT_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TXFRAMECOUNT_GB_LO);
stats->txbroadcastframes_g +=
- XGMAC_IOREAD(pdata, MMC_TXBROADCASTFRAMES_G_LO);
+ xgbe_mmc_read(pdata, MMC_TXBROADCASTFRAMES_G_LO);
stats->txmulticastframes_g +=
- XGMAC_IOREAD(pdata, MMC_TXMULTICASTFRAMES_G_LO);
+ xgbe_mmc_read(pdata, MMC_TXMULTICASTFRAMES_G_LO);
stats->tx64octets_gb +=
- XGMAC_IOREAD(pdata, MMC_TX64OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TX64OCTETS_GB_LO);
stats->tx65to127octets_gb +=
- XGMAC_IOREAD(pdata, MMC_TX65TO127OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TX65TO127OCTETS_GB_LO);
stats->tx128to255octets_gb +=
- XGMAC_IOREAD(pdata, MMC_TX128TO255OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TX128TO255OCTETS_GB_LO);
stats->tx256to511octets_gb +=
- XGMAC_IOREAD(pdata, MMC_TX256TO511OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TX256TO511OCTETS_GB_LO);
stats->tx512to1023octets_gb +=
- XGMAC_IOREAD(pdata, MMC_TX512TO1023OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TX512TO1023OCTETS_GB_LO);
stats->tx1024tomaxoctets_gb +=
- XGMAC_IOREAD(pdata, MMC_TX1024TOMAXOCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TX1024TOMAXOCTETS_GB_LO);
stats->txunicastframes_gb +=
- XGMAC_IOREAD(pdata, MMC_TXUNICASTFRAMES_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TXUNICASTFRAMES_GB_LO);
stats->txmulticastframes_gb +=
- XGMAC_IOREAD(pdata, MMC_TXMULTICASTFRAMES_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TXMULTICASTFRAMES_GB_LO);
stats->txbroadcastframes_g +=
- XGMAC_IOREAD(pdata, MMC_TXBROADCASTFRAMES_GB_LO);
+ xgbe_mmc_read(pdata, MMC_TXBROADCASTFRAMES_GB_LO);
stats->txunderflowerror +=
- XGMAC_IOREAD(pdata, MMC_TXUNDERFLOWERROR_LO);
+ xgbe_mmc_read(pdata, MMC_TXUNDERFLOWERROR_LO);
stats->txoctetcount_g +=
- XGMAC_IOREAD(pdata, MMC_TXOCTETCOUNT_G_LO);
+ xgbe_mmc_read(pdata, MMC_TXOCTETCOUNT_G_LO);
stats->txframecount_g +=
- XGMAC_IOREAD(pdata, MMC_TXFRAMECOUNT_G_LO);
+ xgbe_mmc_read(pdata, MMC_TXFRAMECOUNT_G_LO);
stats->txpauseframes +=
- XGMAC_IOREAD(pdata, MMC_TXPAUSEFRAMES_LO);
+ xgbe_mmc_read(pdata, MMC_TXPAUSEFRAMES_LO);
stats->txvlanframes_g +=
- XGMAC_IOREAD(pdata, MMC_TXVLANFRAMES_G_LO);
+ xgbe_mmc_read(pdata, MMC_TXVLANFRAMES_G_LO);
stats->rxframecount_gb +=
- XGMAC_IOREAD(pdata, MMC_RXFRAMECOUNT_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RXFRAMECOUNT_GB_LO);
stats->rxoctetcount_gb +=
- XGMAC_IOREAD(pdata, MMC_RXOCTETCOUNT_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RXOCTETCOUNT_GB_LO);
stats->rxoctetcount_g +=
- XGMAC_IOREAD(pdata, MMC_RXOCTETCOUNT_G_LO);
+ xgbe_mmc_read(pdata, MMC_RXOCTETCOUNT_G_LO);
stats->rxbroadcastframes_g +=
- XGMAC_IOREAD(pdata, MMC_RXBROADCASTFRAMES_G_LO);
+ xgbe_mmc_read(pdata, MMC_RXBROADCASTFRAMES_G_LO);
stats->rxmulticastframes_g +=
- XGMAC_IOREAD(pdata, MMC_RXMULTICASTFRAMES_G_LO);
+ xgbe_mmc_read(pdata, MMC_RXMULTICASTFRAMES_G_LO);
stats->rxcrcerror +=
- XGMAC_IOREAD(pdata, MMC_RXCRCERROR_LO);
+ xgbe_mmc_read(pdata, MMC_RXCRCERROR_LO);
stats->rxrunterror +=
- XGMAC_IOREAD(pdata, MMC_RXRUNTERROR);
+ xgbe_mmc_read(pdata, MMC_RXRUNTERROR);
stats->rxjabbererror +=
- XGMAC_IOREAD(pdata, MMC_RXJABBERERROR);
+ xgbe_mmc_read(pdata, MMC_RXJABBERERROR);
stats->rxundersize_g +=
- XGMAC_IOREAD(pdata, MMC_RXUNDERSIZE_G);
+ xgbe_mmc_read(pdata, MMC_RXUNDERSIZE_G);
stats->rxoversize_g +=
- XGMAC_IOREAD(pdata, MMC_RXOVERSIZE_G);
+ xgbe_mmc_read(pdata, MMC_RXOVERSIZE_G);
stats->rx64octets_gb +=
- XGMAC_IOREAD(pdata, MMC_RX64OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RX64OCTETS_GB_LO);
stats->rx65to127octets_gb +=
- XGMAC_IOREAD(pdata, MMC_RX65TO127OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RX65TO127OCTETS_GB_LO);
stats->rx128to255octets_gb +=
- XGMAC_IOREAD(pdata, MMC_RX128TO255OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RX128TO255OCTETS_GB_LO);
stats->rx256to511octets_gb +=
- XGMAC_IOREAD(pdata, MMC_RX256TO511OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RX256TO511OCTETS_GB_LO);
stats->rx512to1023octets_gb +=
- XGMAC_IOREAD(pdata, MMC_RX512TO1023OCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RX512TO1023OCTETS_GB_LO);
stats->rx1024tomaxoctets_gb +=
- XGMAC_IOREAD(pdata, MMC_RX1024TOMAXOCTETS_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RX1024TOMAXOCTETS_GB_LO);
stats->rxunicastframes_g +=
- XGMAC_IOREAD(pdata, MMC_RXUNICASTFRAMES_G_LO);
+ xgbe_mmc_read(pdata, MMC_RXUNICASTFRAMES_G_LO);
stats->rxlengtherror +=
- XGMAC_IOREAD(pdata, MMC_RXLENGTHERROR_LO);
+ xgbe_mmc_read(pdata, MMC_RXLENGTHERROR_LO);
stats->rxoutofrangetype +=
- XGMAC_IOREAD(pdata, MMC_RXOUTOFRANGETYPE_LO);
+ xgbe_mmc_read(pdata, MMC_RXOUTOFRANGETYPE_LO);
stats->rxpauseframes +=
- XGMAC_IOREAD(pdata, MMC_RXPAUSEFRAMES_LO);
+ xgbe_mmc_read(pdata, MMC_RXPAUSEFRAMES_LO);
stats->rxfifooverflow +=
- XGMAC_IOREAD(pdata, MMC_RXFIFOOVERFLOW_LO);
+ xgbe_mmc_read(pdata, MMC_RXFIFOOVERFLOW_LO);
stats->rxvlanframes_gb +=
- XGMAC_IOREAD(pdata, MMC_RXVLANFRAMES_GB_LO);
+ xgbe_mmc_read(pdata, MMC_RXVLANFRAMES_GB_LO);
stats->rxwatchdogerror +=
- XGMAC_IOREAD(pdata, MMC_RXWATCHDOGERROR);
+ xgbe_mmc_read(pdata, MMC_RXWATCHDOGERROR);
/* Un-freeze counters */
XGMAC_IOWRITE_BITS(pdata, MMC_CR, MCF, 0);
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index dc84f71..b26d758 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -361,6 +361,8 @@
memset(hw_feat, 0, sizeof(*hw_feat));
+ hw_feat->version = XGMAC_IOREAD(pdata, MAC_VR);
+
/* Hardware feature register 0 */
hw_feat->gmii = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, GMIISEL);
hw_feat->vlhash = XGMAC_GET_BITS(mac_hfr0, MAC_HWF0R, VLHASH);
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index a076aca..46f6130 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -361,15 +361,16 @@
struct ethtool_drvinfo *drvinfo)
{
struct xgbe_prv_data *pdata = netdev_priv(netdev);
+ struct xgbe_hw_features *hw_feat = &pdata->hw_feat;
strlcpy(drvinfo->driver, XGBE_DRV_NAME, sizeof(drvinfo->driver));
strlcpy(drvinfo->version, XGBE_DRV_VERSION, sizeof(drvinfo->version));
strlcpy(drvinfo->bus_info, dev_name(pdata->dev),
sizeof(drvinfo->bus_info));
snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d.%d.%d",
- XGMAC_IOREAD_BITS(pdata, MAC_VR, USERVER),
- XGMAC_IOREAD_BITS(pdata, MAC_VR, DEVID),
- XGMAC_IOREAD_BITS(pdata, MAC_VR, SNPSVER));
+ XGMAC_GET_BITS(hw_feat->version, MAC_VR, USERVER),
+ XGMAC_GET_BITS(hw_feat->version, MAC_VR, DEVID),
+ XGMAC_GET_BITS(hw_feat->version, MAC_VR, SNPSVER));
drvinfo->n_stats = XGBE_STATS_COUNT;
}
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
index 8aa6a93..bdf9cfa 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
@@ -172,7 +172,7 @@
}
if (i < pdata->rx_ring_count) {
- spin_lock_init(&tx_ring->lock);
+ spin_lock_init(&rx_ring->lock);
channel->rx_ring = rx_ring++;
}
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 07bf70a..e9fe6e6 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -183,6 +183,7 @@
#define XGMAC_DRIVER_CONTEXT 1
#define XGMAC_IOCTL_CONTEXT 2
+#define XGBE_FIFO_MAX 81920
#define XGBE_FIFO_SIZE_B(x) (x)
#define XGBE_FIFO_SIZE_KB(x) (x * 1024)
@@ -526,6 +527,9 @@
* or configurations are present in the device.
*/
struct xgbe_hw_features {
+ /* HW Version */
+ unsigned int version;
+
/* HW Feature Register0 */
unsigned int gmii; /* 1000 Mbps support */
unsigned int vlhash; /* VLAN Hash Filter */
diff --git a/drivers/net/ethernet/apm/xgene/Kconfig b/drivers/net/ethernet/apm/xgene/Kconfig
index 616dff6..f4054d24 100644
--- a/drivers/net/ethernet/apm/xgene/Kconfig
+++ b/drivers/net/ethernet/apm/xgene/Kconfig
@@ -1,5 +1,6 @@
config NET_XGENE
tristate "APM X-Gene SoC Ethernet Driver"
+ depends on HAS_DMA
select PHYLIB
help
This is the Ethernet driver for the on-chip ethernet interface on the
diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
index 7dcfb19..d8d07a8 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -84,7 +84,7 @@
config CNIC
tristate "QLogic CNIC support"
- depends on PCI
+ depends on PCI && (IPV6 || IPV6=n)
select BNX2
select UIO
---help---
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h
index 5ba8af5..c4daa06 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h
@@ -2233,7 +2233,12 @@
u32 reserved3; /* Offset 0x14C */
u32 reserved4; /* Offset 0x150 */
u32 link_attr_sync[PORT_MAX]; /* Offset 0x154 */
- #define LINK_ATTR_SYNC_KR2_ENABLE (1<<0)
+ #define LINK_ATTR_SYNC_KR2_ENABLE 0x00000001
+ #define LINK_SFP_EEPROM_COMP_CODE_MASK 0x0000ff00
+ #define LINK_SFP_EEPROM_COMP_CODE_SHIFT 8
+ #define LINK_SFP_EEPROM_COMP_CODE_SR 0x00001000
+ #define LINK_SFP_EEPROM_COMP_CODE_LR 0x00002000
+ #define LINK_SFP_EEPROM_COMP_CODE_LRM 0x00004000
u32 reserved5[2];
u32 reserved6[PORT_MAX];
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
index 53fb4fa..549549e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
@@ -154,15 +154,22 @@
LINK_STATUS_LINK_PARTNER_ASYMMETRIC_PAUSE)
#define SFP_EEPROM_CON_TYPE_ADDR 0x2
+ #define SFP_EEPROM_CON_TYPE_VAL_UNKNOWN 0x0
#define SFP_EEPROM_CON_TYPE_VAL_LC 0x7
#define SFP_EEPROM_CON_TYPE_VAL_COPPER 0x21
#define SFP_EEPROM_CON_TYPE_VAL_RJ45 0x22
-#define SFP_EEPROM_COMP_CODE_ADDR 0x3
- #define SFP_EEPROM_COMP_CODE_SR_MASK (1<<4)
- #define SFP_EEPROM_COMP_CODE_LR_MASK (1<<5)
- #define SFP_EEPROM_COMP_CODE_LRM_MASK (1<<6)
+#define SFP_EEPROM_10G_COMP_CODE_ADDR 0x3
+ #define SFP_EEPROM_10G_COMP_CODE_SR_MASK (1<<4)
+ #define SFP_EEPROM_10G_COMP_CODE_LR_MASK (1<<5)
+ #define SFP_EEPROM_10G_COMP_CODE_LRM_MASK (1<<6)
+
+#define SFP_EEPROM_1G_COMP_CODE_ADDR 0x6
+ #define SFP_EEPROM_1G_COMP_CODE_SX (1<<0)
+ #define SFP_EEPROM_1G_COMP_CODE_LX (1<<1)
+ #define SFP_EEPROM_1G_COMP_CODE_CX (1<<2)
+ #define SFP_EEPROM_1G_COMP_CODE_BASE_T (1<<3)
#define SFP_EEPROM_FC_TX_TECH_ADDR 0x8
#define SFP_EEPROM_FC_TX_TECH_BITMASK_COPPER_PASSIVE 0x4
@@ -3633,8 +3640,8 @@
reg_set[i].val);
/* Start KR2 work-around timer which handles BCM8073 link-parner */
- vars->link_attr_sync |= LINK_ATTR_SYNC_KR2_ENABLE;
- bnx2x_update_link_attr(params, vars->link_attr_sync);
+ params->link_attr_sync |= LINK_ATTR_SYNC_KR2_ENABLE;
+ bnx2x_update_link_attr(params, params->link_attr_sync);
}
static void bnx2x_disable_kr2(struct link_params *params,
@@ -3666,8 +3673,8 @@
for (i = 0; i < ARRAY_SIZE(reg_set); i++)
bnx2x_cl45_write(bp, phy, reg_set[i].devad, reg_set[i].reg,
reg_set[i].val);
- vars->link_attr_sync &= ~LINK_ATTR_SYNC_KR2_ENABLE;
- bnx2x_update_link_attr(params, vars->link_attr_sync);
+ params->link_attr_sync &= ~LINK_ATTR_SYNC_KR2_ENABLE;
+ bnx2x_update_link_attr(params, params->link_attr_sync);
vars->check_kr2_recovery_cnt = CHECK_KR2_RECOVERY_CNT;
}
@@ -4810,7 +4817,7 @@
~FEATURE_CONFIG_PFC_ENABLED;
if (SHMEM2_HAS(bp, link_attr_sync))
- vars->link_attr_sync = SHMEM2_RD(bp,
+ params->link_attr_sync = SHMEM2_RD(bp,
link_attr_sync[params->port]);
DP(NETIF_MSG_LINK, "link_status 0x%x phy_link_up %x int_mask 0x%x\n",
@@ -8057,21 +8064,24 @@
{
struct bnx2x *bp = params->bp;
u32 sync_offset = 0, phy_idx, media_types;
- u8 gport, val[2], check_limiting_mode = 0;
+ u8 val[SFP_EEPROM_FC_TX_TECH_ADDR + 1], check_limiting_mode = 0;
*edc_mode = EDC_MODE_LIMITING;
phy->media_type = ETH_PHY_UNSPECIFIED;
/* First check for copper cable */
if (bnx2x_read_sfp_module_eeprom(phy,
params,
I2C_DEV_ADDR_A0,
- SFP_EEPROM_CON_TYPE_ADDR,
- 2,
+ 0,
+ SFP_EEPROM_FC_TX_TECH_ADDR + 1,
(u8 *)val) != 0) {
DP(NETIF_MSG_LINK, "Failed to read from SFP+ module EEPROM\n");
return -EINVAL;
}
-
- switch (val[0]) {
+ params->link_attr_sync &= ~LINK_SFP_EEPROM_COMP_CODE_MASK;
+ params->link_attr_sync |= val[SFP_EEPROM_10G_COMP_CODE_ADDR] <<
+ LINK_SFP_EEPROM_COMP_CODE_SHIFT;
+ bnx2x_update_link_attr(params, params->link_attr_sync);
+ switch (val[SFP_EEPROM_CON_TYPE_ADDR]) {
case SFP_EEPROM_CON_TYPE_VAL_COPPER:
{
u8 copper_module_type;
@@ -8079,17 +8089,7 @@
/* Check if its active cable (includes SFP+ module)
* of passive cable
*/
- if (bnx2x_read_sfp_module_eeprom(phy,
- params,
- I2C_DEV_ADDR_A0,
- SFP_EEPROM_FC_TX_TECH_ADDR,
- 1,
- &copper_module_type) != 0) {
- DP(NETIF_MSG_LINK,
- "Failed to read copper-cable-type"
- " from SFP+ EEPROM\n");
- return -EINVAL;
- }
+ copper_module_type = val[SFP_EEPROM_FC_TX_TECH_ADDR];
if (copper_module_type &
SFP_EEPROM_FC_TX_TECH_BITMASK_COPPER_ACTIVE) {
@@ -8115,16 +8115,18 @@
}
break;
}
+ case SFP_EEPROM_CON_TYPE_VAL_UNKNOWN:
case SFP_EEPROM_CON_TYPE_VAL_LC:
case SFP_EEPROM_CON_TYPE_VAL_RJ45:
check_limiting_mode = 1;
- if ((val[1] & (SFP_EEPROM_COMP_CODE_SR_MASK |
- SFP_EEPROM_COMP_CODE_LR_MASK |
- SFP_EEPROM_COMP_CODE_LRM_MASK)) == 0) {
+ if ((val[SFP_EEPROM_10G_COMP_CODE_ADDR] &
+ (SFP_EEPROM_10G_COMP_CODE_SR_MASK |
+ SFP_EEPROM_10G_COMP_CODE_LR_MASK |
+ SFP_EEPROM_10G_COMP_CODE_LRM_MASK)) == 0) {
DP(NETIF_MSG_LINK, "1G SFP module detected\n");
- gport = params->port;
phy->media_type = ETH_PHY_SFP_1G_FIBER;
if (phy->req_line_speed != SPEED_1000) {
+ u8 gport = params->port;
phy->req_line_speed = SPEED_1000;
if (!CHIP_IS_E1x(bp)) {
gport = BP_PATH(bp) +
@@ -8134,6 +8136,12 @@
"Warning: Link speed was forced to 1000Mbps. Current SFP module in port %d is not compliant with 10G Ethernet\n",
gport);
}
+ if (val[SFP_EEPROM_1G_COMP_CODE_ADDR] &
+ SFP_EEPROM_1G_COMP_CODE_BASE_T) {
+ bnx2x_sfp_set_transmitter(params, phy, 0);
+ msleep(40);
+ bnx2x_sfp_set_transmitter(params, phy, 1);
+ }
} else {
int idx, cfg_idx = 0;
DP(NETIF_MSG_LINK, "10G Optic module detected\n");
@@ -8149,7 +8157,7 @@
break;
default:
DP(NETIF_MSG_LINK, "Unable to determine module type 0x%x !!!\n",
- val[0]);
+ val[SFP_EEPROM_CON_TYPE_ADDR]);
return -EINVAL;
}
sync_offset = params->shmem_base +
@@ -13507,7 +13515,7 @@
sigdet = bnx2x_warpcore_get_sigdet(phy, params);
if (!sigdet) {
- if (!(vars->link_attr_sync & LINK_ATTR_SYNC_KR2_ENABLE)) {
+ if (!(params->link_attr_sync & LINK_ATTR_SYNC_KR2_ENABLE)) {
bnx2x_kr2_recovery(params, vars, phy);
DP(NETIF_MSG_LINK, "No sigdet\n");
}
@@ -13525,7 +13533,7 @@
/* CL73 has not begun yet */
if (base_page == 0) {
- if (!(vars->link_attr_sync & LINK_ATTR_SYNC_KR2_ENABLE)) {
+ if (!(params->link_attr_sync & LINK_ATTR_SYNC_KR2_ENABLE)) {
bnx2x_kr2_recovery(params, vars, phy);
DP(NETIF_MSG_LINK, "No BP\n");
}
@@ -13541,7 +13549,7 @@
((next_page & 0xe0) == 0x20))));
/* In case KR2 is already disabled, check if we need to re-enable it */
- if (!(vars->link_attr_sync & LINK_ATTR_SYNC_KR2_ENABLE)) {
+ if (!(params->link_attr_sync & LINK_ATTR_SYNC_KR2_ENABLE)) {
if (!not_kr2_device) {
DP(NETIF_MSG_LINK, "BP=0x%x, NP=0x%x\n", base_page,
next_page);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h
index 389f5f8..d9cce4c 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.h
@@ -323,6 +323,9 @@
#define LINK_FLAGS_INT_DISABLED (1<<0)
#define PHY_INITIALIZED (1<<1)
u32 lfa_base;
+
+ /* The same definitions as the shmem2 parameter */
+ u32 link_attr_sync;
};
/* Output parameters */
@@ -364,8 +367,6 @@
u8 rx_tx_asic_rst;
u8 turn_to_run_wc_rt;
u16 rsrv2;
- /* The same definitions as the shmem2 parameter */
- u32 link_attr_sync;
};
/***********************************************************/
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 900cab4..d1c093d 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -6849,6 +6849,37 @@
bnx2x_release_phy_lock(bp);
}
+static void bnx2x_config_endianity(struct bnx2x *bp, u32 val)
+{
+ REG_WR(bp, PXP2_REG_RQ_QM_ENDIAN_M, val);
+ REG_WR(bp, PXP2_REG_RQ_TM_ENDIAN_M, val);
+ REG_WR(bp, PXP2_REG_RQ_SRC_ENDIAN_M, val);
+ REG_WR(bp, PXP2_REG_RQ_CDU_ENDIAN_M, val);
+ REG_WR(bp, PXP2_REG_RQ_DBG_ENDIAN_M, val);
+
+ /* make sure this value is 0 */
+ REG_WR(bp, PXP2_REG_RQ_HC_ENDIAN_M, 0);
+
+ REG_WR(bp, PXP2_REG_RD_QM_SWAP_MODE, val);
+ REG_WR(bp, PXP2_REG_RD_TM_SWAP_MODE, val);
+ REG_WR(bp, PXP2_REG_RD_SRC_SWAP_MODE, val);
+ REG_WR(bp, PXP2_REG_RD_CDURD_SWAP_MODE, val);
+}
+
+static void bnx2x_set_endianity(struct bnx2x *bp)
+{
+#ifdef __BIG_ENDIAN
+ bnx2x_config_endianity(bp, 1);
+#else
+ bnx2x_config_endianity(bp, 0);
+#endif
+}
+
+static void bnx2x_reset_endianity(struct bnx2x *bp)
+{
+ bnx2x_config_endianity(bp, 0);
+}
+
/**
* bnx2x_init_hw_common - initialize the HW at the COMMON phase.
*
@@ -6915,23 +6946,7 @@
bnx2x_init_block(bp, BLOCK_PXP2, PHASE_COMMON);
bnx2x_init_pxp(bp);
-
-#ifdef __BIG_ENDIAN
- REG_WR(bp, PXP2_REG_RQ_QM_ENDIAN_M, 1);
- REG_WR(bp, PXP2_REG_RQ_TM_ENDIAN_M, 1);
- REG_WR(bp, PXP2_REG_RQ_SRC_ENDIAN_M, 1);
- REG_WR(bp, PXP2_REG_RQ_CDU_ENDIAN_M, 1);
- REG_WR(bp, PXP2_REG_RQ_DBG_ENDIAN_M, 1);
- /* make sure this value is 0 */
- REG_WR(bp, PXP2_REG_RQ_HC_ENDIAN_M, 0);
-
-/* REG_WR(bp, PXP2_REG_RD_PBF_SWAP_MODE, 1); */
- REG_WR(bp, PXP2_REG_RD_QM_SWAP_MODE, 1);
- REG_WR(bp, PXP2_REG_RD_TM_SWAP_MODE, 1);
- REG_WR(bp, PXP2_REG_RD_SRC_SWAP_MODE, 1);
- REG_WR(bp, PXP2_REG_RD_CDURD_SWAP_MODE, 1);
-#endif
-
+ bnx2x_set_endianity(bp);
bnx2x_ilt_init_page_size(bp, INITOP_SET);
if (CHIP_REV_IS_FPGA(bp) && CHIP_IS_E1H(bp))
@@ -13169,9 +13184,15 @@
bnx2x_iov_remove_one(bp);
/* Power on: we can't let PCI layer write to us while we are in D3 */
- if (IS_PF(bp))
+ if (IS_PF(bp)) {
bnx2x_set_power_state(bp, PCI_D0);
+ /* Set endianity registers to reset values in case next driver
+	 * boots in different endianity environment.
+ */
+ bnx2x_reset_endianity(bp);
+ }
+
/* Disable MSI/MSI-X */
bnx2x_disable_msi(bp);
diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c
index 27861a6..a6a9f28 100644
--- a/drivers/net/ethernet/broadcom/cnic.c
+++ b/drivers/net/ethernet/broadcom/cnic.c
@@ -31,7 +31,7 @@
#include <linux/if_vlan.h>
#include <linux/prefetch.h>
#include <linux/random.h>
-#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
+#if IS_ENABLED(CONFIG_VLAN_8021Q)
#define BCM_VLAN 1
#endif
#include <net/ip.h>
@@ -3685,7 +3685,7 @@
static int cnic_get_v6_route(struct sockaddr_in6 *dst_addr,
struct dst_entry **dst)
{
-#if defined(CONFIG_IPV6) || (defined(CONFIG_IPV6_MODULE) && defined(MODULE))
+#if IS_ENABLED(CONFIG_IPV6)
struct flowi6 fl6;
memset(&fl6, 0, sizeof(fl6));
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 3ac5d23..cb77ae9 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -11617,6 +11617,12 @@
struct tg3 *tp = netdev_priv(dev);
int err;
+ if (tp->pcierr_recovery) {
+ netdev_err(dev, "Failed to open device. PCI error recovery "
+ "in progress\n");
+ return -EAGAIN;
+ }
+
if (tp->fw_needed) {
err = tg3_request_firmware(tp);
if (tg3_asic_rev(tp) == ASIC_REV_57766) {
@@ -11674,6 +11680,12 @@
{
struct tg3 *tp = netdev_priv(dev);
+ if (tp->pcierr_recovery) {
+ netdev_err(dev, "Failed to close device. PCI error recovery "
+ "in progress\n");
+ return -EAGAIN;
+ }
+
tg3_ptp_fini(tp);
tg3_stop(tp);
@@ -17561,6 +17573,7 @@
tp->rx_mode = TG3_DEF_RX_MODE;
tp->tx_mode = TG3_DEF_TX_MODE;
tp->irq_sync = 1;
+ tp->pcierr_recovery = false;
if (tg3_debug > 0)
tp->msg_enable = tg3_debug;
@@ -18071,6 +18084,8 @@
rtnl_lock();
+ tp->pcierr_recovery = true;
+
/* We probably don't have netdev yet */
if (!netdev || !netif_running(netdev))
goto done;
@@ -18195,6 +18210,7 @@
tg3_phy_start(tp);
done:
+ tp->pcierr_recovery = false;
rtnl_unlock();
}
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index 461acca..31c9f82 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -3407,6 +3407,7 @@
struct device *hwmon_dev;
bool link_up;
+ bool pcierr_recovery;
};
/* Accessor macros for chip and asic attributes
diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c
index ff8cae5..ffc92a4 100644
--- a/drivers/net/ethernet/brocade/bna/bnad.c
+++ b/drivers/net/ethernet/brocade/bna/bnad.c
@@ -2506,7 +2506,7 @@
* For TSO, the TCP checksum field is seeded with pseudo-header sum
* excluding the length field.
*/
- if (skb->protocol == htons(ETH_P_IP)) {
+ if (vlan_get_protocol(skb) == htons(ETH_P_IP)) {
struct iphdr *iph = ip_hdr(skb);
/* Do we really need these? */
@@ -2870,12 +2870,13 @@
}
if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ __be16 net_proto = vlan_get_protocol(skb);
u8 proto = 0;
- if (skb->protocol == htons(ETH_P_IP))
+ if (net_proto == htons(ETH_P_IP))
proto = ip_hdr(skb)->protocol;
#ifdef NETIF_F_IPV6_CSUM
- else if (skb->protocol == htons(ETH_P_IPV6)) {
+ else if (net_proto == htons(ETH_P_IPV6)) {
/* nexthdr may not be TCP immediately. */
proto = ipv6_hdr(skb)->nexthdr;
}
diff --git a/drivers/net/ethernet/calxeda/Kconfig b/drivers/net/ethernet/calxeda/Kconfig
index 184a063..07d2201 100644
--- a/drivers/net/ethernet/calxeda/Kconfig
+++ b/drivers/net/ethernet/calxeda/Kconfig
@@ -1,6 +1,7 @@
config NET_CALXEDA_XGMAC
tristate "Calxeda 1G/10G XGMAC Ethernet driver"
depends on HAS_IOMEM && HAS_DMA
+ depends on ARCH_HIGHBANK || COMPILE_TEST
select CRC32
help
This is the driver for the XGMAC Ethernet IP block found on Calxeda
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 18fb9c6..8c34811 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -1253,7 +1253,9 @@
goto freeout;
}
- t4_write_reg(adap, MPS_TRC_RSS_CONTROL,
+ t4_write_reg(adap, is_t4(adap->params.chip) ?
+ MPS_TRC_RSS_CONTROL :
+ MPS_T5_TRC_RSS_CONTROL,
RSSCONTROL(netdev2pinfo(adap->port[0])->tx_chan) |
QUEUENUMBER(s->ethrxq[0].rspq.abs_id));
return 0;
@@ -1761,7 +1763,8 @@
0xd004, 0xd03c,
0xdfc0, 0xdfe0,
0xe000, 0xea7c,
- 0xf000, 0x11190,
+ 0xf000, 0x11110,
+ 0x11118, 0x11190,
0x19040, 0x1906c,
0x19078, 0x19080,
0x1908c, 0x19124,
@@ -1968,7 +1971,8 @@
0xd004, 0xd03c,
0xdfc0, 0xdfe0,
0xe000, 0x11088,
- 0x1109c, 0x1117c,
+ 0x1109c, 0x11110,
+ 0x11118, 0x1117c,
0x11190, 0x11204,
0x19040, 0x1906c,
0x19078, 0x19080,
@@ -5955,7 +5959,8 @@
params[3] = FW_PARAM_PFVF(CQ_END);
params[4] = FW_PARAM_PFVF(OCQ_START);
params[5] = FW_PARAM_PFVF(OCQ_END);
- ret = t4_query_params(adap, 0, 0, 0, 6, params, val);
+ ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 6, params,
+ val);
if (ret < 0)
goto bye;
adap->vres.qp.start = val[0];
@@ -5967,7 +5972,8 @@
params[0] = FW_PARAM_DEV(MAXORDIRD_QP);
params[1] = FW_PARAM_DEV(MAXIRD_ADAPTER);
- ret = t4_query_params(adap, 0, 0, 0, 2, params, val);
+ ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 2, params,
+ val);
if (ret < 0) {
adap->params.max_ordird_qp = 8;
adap->params.max_ird_adapter = 32 * adap->tids.ntids;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index a853133..41d0446 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -168,6 +168,34 @@
}
/*
+ * t4_report_fw_error - report firmware error
+ * @adap: the adapter
+ *
+ * The adapter firmware can indicate error conditions to the host.
+ * If the firmware has indicated an error, print out the reason for
+ * the firmware error.
+ */
+static void t4_report_fw_error(struct adapter *adap)
+{
+ static const char *const reason[] = {
+ "Crash", /* PCIE_FW_EVAL_CRASH */
+ "During Device Preparation", /* PCIE_FW_EVAL_PREP */
+ "During Device Configuration", /* PCIE_FW_EVAL_CONF */
+ "During Device Initialization", /* PCIE_FW_EVAL_INIT */
+ "Unexpected Event", /* PCIE_FW_EVAL_UNEXPECTEDEVENT */
+ "Insufficient Airflow", /* PCIE_FW_EVAL_OVERHEAT */
+ "Device Shutdown", /* PCIE_FW_EVAL_DEVICESHUTDOWN */
+ "Reserved", /* reserved */
+ };
+ u32 pcie_fw;
+
+ pcie_fw = t4_read_reg(adap, MA_PCIE_FW);
+ if (pcie_fw & FW_PCIE_FW_ERR)
+ dev_err(adap->pdev_dev, "Firmware reports adapter error: %s\n",
+ reason[FW_PCIE_FW_EVAL_GET(pcie_fw)]);
+}
+
+/*
* Get the reply to a mailbox command and store it in @rpl in big-endian order.
*/
static void get_mbox_rpl(struct adapter *adap, __be64 *rpl, int nflit,
@@ -300,6 +328,7 @@
dump_mbox(adap, mbox, data_reg);
dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n",
*(const u8 *)cmd, mbox);
+ t4_report_fw_error(adap);
return -ETIMEDOUT;
}
@@ -566,6 +595,7 @@
#define VPD_BASE 0x400
#define VPD_BASE_OLD 0
#define VPD_LEN 1024
+#define CHELSIO_VPD_UNIQUE_ID 0x82
/**
* t4_seeprom_wp - enable/disable EEPROM write protection
@@ -603,7 +633,14 @@
ret = pci_read_vpd(adapter->pdev, VPD_BASE, sizeof(u32), vpd);
if (ret < 0)
goto out;
- addr = *vpd == 0x82 ? VPD_BASE : VPD_BASE_OLD;
+
+ /* The VPD shall have a unique identifier specified by the PCI SIG.
+ * For chelsio adapters, the identifier is 0x82. The first byte of a VPD
+ * shall be CHELSIO_VPD_UNIQUE_ID (0x82). The VPD programming software
+ * is expected to automatically put this entry at the
+ * beginning of the VPD.
+ */
+ addr = *vpd == CHELSIO_VPD_UNIQUE_ID ? VPD_BASE : VPD_BASE_OLD;
ret = pci_read_vpd(adapter->pdev, addr, VPD_LEN, vpd);
if (ret < 0)
@@ -667,6 +704,7 @@
i = pci_vpd_info_field_size(vpd + sn - PCI_VPD_INFO_FLD_HDR_SIZE);
memcpy(p->sn, vpd + sn, min(i, SERNUM_LEN));
strim(p->sn);
+ i = pci_vpd_info_field_size(vpd + pn - PCI_VPD_INFO_FLD_HDR_SIZE);
memcpy(p->pn, vpd + pn, min(i, PN_LEN));
strim(p->pn);
@@ -1394,15 +1432,18 @@
int fat;
- fat = t4_handle_intr_status(adapter,
- PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS,
- sysbus_intr_info) +
- t4_handle_intr_status(adapter,
- PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS,
- pcie_port_intr_info) +
- t4_handle_intr_status(adapter, PCIE_INT_CAUSE,
- is_t4(adapter->params.chip) ?
- pcie_intr_info : t5_pcie_intr_info);
+ if (is_t4(adapter->params.chip))
+ fat = t4_handle_intr_status(adapter,
+ PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS,
+ sysbus_intr_info) +
+ t4_handle_intr_status(adapter,
+ PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS,
+ pcie_port_intr_info) +
+ t4_handle_intr_status(adapter, PCIE_INT_CAUSE,
+ pcie_intr_info);
+ else
+ fat = t4_handle_intr_status(adapter, PCIE_INT_CAUSE,
+ t5_pcie_intr_info);
if (fat)
t4_fatal_err(adapter);
@@ -1521,6 +1562,9 @@
int fat;
+ if (t4_read_reg(adapter, MA_PCIE_FW) & FW_PCIE_FW_ERR)
+ t4_report_fw_error(adapter);
+
fat = t4_handle_intr_status(adapter, CIM_HOST_INT_CAUSE,
cim_intr_info) +
t4_handle_intr_status(adapter, CIM_HOST_UPACC_INT_CAUSE,
@@ -1768,10 +1812,16 @@
{
u32 v, status = t4_read_reg(adap, MA_INT_CAUSE);
- if (status & MEM_PERR_INT_CAUSE)
+ if (status & MEM_PERR_INT_CAUSE) {
dev_alert(adap->pdev_dev,
"MA parity error, parity status %#x\n",
t4_read_reg(adap, MA_PARITY_ERROR_STATUS));
+ if (is_t5(adap->params.chip))
+ dev_alert(adap->pdev_dev,
+ "MA parity error, parity status %#x\n",
+ t4_read_reg(adap,
+ MA_PARITY_ERROR_STATUS2));
+ }
if (status & MEM_WRAP_INT_CAUSE) {
v = t4_read_reg(adap, MA_INT_WRAP_STATUS);
dev_alert(adap->pdev_dev, "MA address wrap-around error by "
@@ -2733,12 +2783,16 @@
/*
* Issue the HELLO command to the firmware. If it's not successful
* but indicates that we got a "busy" or "timeout" condition, retry
- * the HELLO until we exhaust our retry limit.
+ * the HELLO until we exhaust our retry limit. If we do exceed our
+ * retry limit, check to see if the firmware left us any error
+ * information and report that if so.
*/
ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c);
if (ret < 0) {
if ((ret == -EBUSY || ret == -ETIMEDOUT) && retries-- > 0)
goto retry;
+ if (t4_read_reg(adap, MA_PCIE_FW) & FW_PCIE_FW_ERR)
+ t4_report_fw_error(adap);
return ret;
}
@@ -3742,6 +3796,7 @@
lc->link_ok = link_ok;
lc->speed = speed;
lc->fc = fc;
+ lc->supported = be16_to_cpu(p->u.info.pcap);
t4_os_link_changed(adap, port, link_ok);
}
if (mod != pi->mod_type) {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
index e3146e8..39fb325 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
@@ -511,6 +511,7 @@
#define MEM_WRAP_CLIENT_NUM_GET(x) (((x) & MEM_WRAP_CLIENT_NUM_MASK) >> MEM_WRAP_CLIENT_NUM_SHIFT)
#define MA_PCIE_FW 0x30b8
#define MA_PARITY_ERROR_STATUS 0x77f4
+#define MA_PARITY_ERROR_STATUS2 0x7804
#define MA_EXT_MEMORY1_BAR 0x7808
#define EDC_0_BASE_ADDR 0x7900
@@ -959,6 +960,7 @@
#define TRCMULTIFILTER 0x00000001U
#define MPS_TRC_RSS_CONTROL 0x9808
+#define MPS_T5_TRC_RSS_CONTROL 0xa00c
#define RSSCONTROL_MASK 0x00ff0000U
#define RSSCONTROL_SHIFT 16
#define RSSCONTROL(x) ((x) << RSSCONTROL_SHIFT)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index 5f2729e..3409756 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -2228,6 +2228,10 @@
#define FW_PCIE_FW_MASTER(x) ((x) << FW_PCIE_FW_MASTER_SHIFT)
#define FW_PCIE_FW_MASTER_GET(x) (((x) >> FW_PCIE_FW_MASTER_SHIFT) & \
FW_PCIE_FW_MASTER_MASK)
+#define FW_PCIE_FW_EVAL_MASK 0x7
+#define FW_PCIE_FW_EVAL_SHIFT 24
+#define FW_PCIE_FW_EVAL_GET(x) (((x) >> FW_PCIE_FW_EVAL_SHIFT) & \
+ FW_PCIE_FW_EVAL_MASK)
struct fw_hdr {
u8 ver;
diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index a0b418e..566b17d 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -1994,7 +1994,7 @@
{
swqe->tx_control |= EHEA_SWQE_IMM_DATA_PRESENT | EHEA_SWQE_CRC;
- if (skb->protocol != htons(ETH_P_IP))
+ if (vlan_get_protocol(skb) != htons(ETH_P_IP))
return;
if (skb->ip_summed == CHECKSUM_PARTIAL)
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index cbc330b..ad3d5d1 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -2674,7 +2674,8 @@
#define E1000_TX_FLAGS_VLAN_SHIFT 16
static int e1000_tso(struct e1000_adapter *adapter,
- struct e1000_tx_ring *tx_ring, struct sk_buff *skb)
+ struct e1000_tx_ring *tx_ring, struct sk_buff *skb,
+ __be16 protocol)
{
struct e1000_context_desc *context_desc;
struct e1000_buffer *buffer_info;
@@ -2692,7 +2693,7 @@
hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
mss = skb_shinfo(skb)->gso_size;
- if (skb->protocol == htons(ETH_P_IP)) {
+ if (protocol == htons(ETH_P_IP)) {
struct iphdr *iph = ip_hdr(skb);
iph->tot_len = 0;
iph->check = 0;
@@ -2702,7 +2703,7 @@
0);
cmd_length = E1000_TXD_CMD_IP;
ipcse = skb_transport_offset(skb) - 1;
- } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ } else if (skb_is_gso_v6(skb)) {
ipv6_hdr(skb)->payload_len = 0;
tcp_hdr(skb)->check =
~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
@@ -2745,7 +2746,8 @@
}
static bool e1000_tx_csum(struct e1000_adapter *adapter,
- struct e1000_tx_ring *tx_ring, struct sk_buff *skb)
+ struct e1000_tx_ring *tx_ring, struct sk_buff *skb,
+ __be16 protocol)
{
struct e1000_context_desc *context_desc;
struct e1000_buffer *buffer_info;
@@ -2756,7 +2758,7 @@
if (skb->ip_summed != CHECKSUM_PARTIAL)
return false;
- switch (skb->protocol) {
+ switch (protocol) {
case cpu_to_be16(ETH_P_IP):
if (ip_hdr(skb)->protocol == IPPROTO_TCP)
cmd_len |= E1000_TXD_CMD_TCP;
@@ -3097,6 +3099,7 @@
int count = 0;
int tso;
unsigned int f;
+ __be16 protocol = vlan_get_protocol(skb);
/* This goes back to the question of how to logically map a Tx queue
* to a flow. Right now, performance is impacted slightly negatively
@@ -3210,7 +3213,7 @@
first = tx_ring->next_to_use;
- tso = e1000_tso(adapter, tx_ring, skb);
+ tso = e1000_tso(adapter, tx_ring, skb, protocol);
if (tso < 0) {
dev_kfree_skb_any(skb);
return NETDEV_TX_OK;
@@ -3220,10 +3223,10 @@
if (likely(hw->mac_type != e1000_82544))
tx_ring->last_tx_tso = true;
tx_flags |= E1000_TX_FLAGS_TSO;
- } else if (likely(e1000_tx_csum(adapter, tx_ring, skb)))
+ } else if (likely(e1000_tx_csum(adapter, tx_ring, skb, protocol)))
tx_flags |= E1000_TX_FLAGS_CSUM;
- if (likely(skb->protocol == htons(ETH_P_IP)))
+ if (protocol == htons(ETH_P_IP))
tx_flags |= E1000_TX_FLAGS_IPV4;
if (unlikely(skb->no_fcs))
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 65c3aef..247335d 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -5164,7 +5164,8 @@
#define E1000_TX_FLAGS_VLAN_MASK 0xffff0000
#define E1000_TX_FLAGS_VLAN_SHIFT 16
-static int e1000_tso(struct e1000_ring *tx_ring, struct sk_buff *skb)
+static int e1000_tso(struct e1000_ring *tx_ring, struct sk_buff *skb,
+ __be16 protocol)
{
struct e1000_context_desc *context_desc;
struct e1000_buffer *buffer_info;
@@ -5183,7 +5184,7 @@
hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
mss = skb_shinfo(skb)->gso_size;
- if (skb->protocol == htons(ETH_P_IP)) {
+ if (protocol == htons(ETH_P_IP)) {
struct iphdr *iph = ip_hdr(skb);
iph->tot_len = 0;
iph->check = 0;
@@ -5231,7 +5232,8 @@
return 1;
}
-static bool e1000_tx_csum(struct e1000_ring *tx_ring, struct sk_buff *skb)
+static bool e1000_tx_csum(struct e1000_ring *tx_ring, struct sk_buff *skb,
+ __be16 protocol)
{
struct e1000_adapter *adapter = tx_ring->adapter;
struct e1000_context_desc *context_desc;
@@ -5239,16 +5241,10 @@
unsigned int i;
u8 css;
u32 cmd_len = E1000_TXD_CMD_DEXT;
- __be16 protocol;
if (skb->ip_summed != CHECKSUM_PARTIAL)
return false;
- if (skb->protocol == cpu_to_be16(ETH_P_8021Q))
- protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
- else
- protocol = skb->protocol;
-
switch (protocol) {
case cpu_to_be16(ETH_P_IP):
if (ip_hdr(skb)->protocol == IPPROTO_TCP)
@@ -5546,6 +5542,7 @@
int count = 0;
int tso;
unsigned int f;
+ __be16 protocol = vlan_get_protocol(skb);
if (test_bit(__E1000_DOWN, &adapter->state)) {
dev_kfree_skb_any(skb);
@@ -5620,7 +5617,7 @@
first = tx_ring->next_to_use;
- tso = e1000_tso(tx_ring, skb);
+ tso = e1000_tso(tx_ring, skb, protocol);
if (tso < 0) {
dev_kfree_skb_any(skb);
return NETDEV_TX_OK;
@@ -5628,14 +5625,14 @@
if (tso)
tx_flags |= E1000_TX_FLAGS_TSO;
- else if (e1000_tx_csum(tx_ring, skb))
+ else if (e1000_tx_csum(tx_ring, skb, protocol))
tx_flags |= E1000_TX_FLAGS_CSUM;
/* Old method was to assume IPv4 packet by default if TSO was enabled.
* 82571 hardware supports TSO capabilities for IPv6 as well...
* no longer assume, we must.
*/
- if (skb->protocol == htons(ETH_P_IP))
+ if (protocol == htons(ETH_P_IP))
tx_flags |= E1000_TX_FLAGS_IPV4;
if (unlikely(skb->no_fcs))
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index a51aa37..369848e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2295,7 +2295,7 @@
goto out_drop;
/* obtain protocol of skb */
- protocol = skb->protocol;
+ protocol = vlan_get_protocol(skb);
/* record the location of the first descriptor for this packet */
first = &tx_ring->tx_bi[tx_ring->next_to_use];
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 79bf96c..95a3ec2 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1597,7 +1597,7 @@
goto out_drop;
/* obtain protocol of skb */
- protocol = skb->protocol;
+ protocol = vlan_get_protocol(skb);
/* record the location of the first descriptor for this packet */
first = &tx_ring->tx_bi[tx_ring->next_to_use];
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index c9f1d1b..ade067d 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -20,6 +20,7 @@
#include <linux/mbus.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <linux/io.h>
@@ -1371,15 +1372,16 @@
{
if (skb->ip_summed == CHECKSUM_PARTIAL) {
int ip_hdr_len = 0;
+ __be16 l3_proto = vlan_get_protocol(skb);
u8 l4_proto;
- if (skb->protocol == htons(ETH_P_IP)) {
+ if (l3_proto == htons(ETH_P_IP)) {
struct iphdr *ip4h = ip_hdr(skb);
/* Calculate IPv4 checksum and L4 checksum */
ip_hdr_len = ip4h->ihl;
l4_proto = ip4h->protocol;
- } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ } else if (l3_proto == htons(ETH_P_IPV6)) {
struct ipv6hdr *ip6h = ipv6_hdr(skb);
/* Read l4_protocol from one of IPv6 extra headers */
@@ -1390,7 +1392,7 @@
return MVNETA_TX_L4_CSUM_NOT;
return mvneta_txq_desc_csum(skb_network_offset(skb),
- skb->protocol, ip_hdr_len, l4_proto);
+ l3_proto, ip_hdr_len, l4_proto);
}
return MVNETA_TX_L4_CSUM_NOT;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index bb536aa..abddcf8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -474,39 +474,12 @@
int qpn, u64 *reg_id)
{
int err;
- struct mlx4_spec_list spec_eth_outer = { {NULL} };
- struct mlx4_spec_list spec_vxlan = { {NULL} };
- struct mlx4_spec_list spec_eth_inner = { {NULL} };
-
- struct mlx4_net_trans_rule rule = {
- .queue_mode = MLX4_NET_TRANS_Q_FIFO,
- .exclusive = 0,
- .allow_loopback = 1,
- .promisc_mode = MLX4_FS_REGULAR,
- .priority = MLX4_DOMAIN_NIC,
- };
-
- __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
return 0; /* do nothing */
- rule.port = priv->port;
- rule.qpn = qpn;
- INIT_LIST_HEAD(&rule.list);
-
- spec_eth_outer.id = MLX4_NET_TRANS_RULE_ID_ETH;
- memcpy(spec_eth_outer.eth.dst_mac, addr, ETH_ALEN);
- memcpy(spec_eth_outer.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
-
- spec_vxlan.id = MLX4_NET_TRANS_RULE_ID_VXLAN; /* any vxlan header */
- spec_eth_inner.id = MLX4_NET_TRANS_RULE_ID_ETH; /* any inner eth header */
-
- list_add_tail(&spec_eth_outer.list, &rule.list);
- list_add_tail(&spec_vxlan.list, &rule.list);
- list_add_tail(&spec_eth_inner.list, &rule.list);
-
- err = mlx4_flow_attach(priv->mdev->dev, &rule, reg_id);
+ err = mlx4_tunnel_steer_add(priv->mdev->dev, addr, priv->port, qpn,
+ MLX4_DOMAIN_NIC, reg_id);
if (err) {
en_err(priv, "failed to add vxlan steering rule, err %d\n", err);
return err;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c
index d80e7a6..ca0f98c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mcg.c
+++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c
@@ -1020,6 +1020,44 @@
}
EXPORT_SYMBOL_GPL(mlx4_flow_detach);
+int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr,
+ int port, int qpn, u16 prio, u64 *reg_id)
+{
+ int err;
+ struct mlx4_spec_list spec_eth_outer = { {NULL} };
+ struct mlx4_spec_list spec_vxlan = { {NULL} };
+ struct mlx4_spec_list spec_eth_inner = { {NULL} };
+
+ struct mlx4_net_trans_rule rule = {
+ .queue_mode = MLX4_NET_TRANS_Q_FIFO,
+ .exclusive = 0,
+ .allow_loopback = 1,
+ .promisc_mode = MLX4_FS_REGULAR,
+ };
+
+ __be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+ rule.port = port;
+ rule.qpn = qpn;
+ rule.priority = prio;
+ INIT_LIST_HEAD(&rule.list);
+
+ spec_eth_outer.id = MLX4_NET_TRANS_RULE_ID_ETH;
+ memcpy(spec_eth_outer.eth.dst_mac, addr, ETH_ALEN);
+ memcpy(spec_eth_outer.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+
+ spec_vxlan.id = MLX4_NET_TRANS_RULE_ID_VXLAN; /* any vxlan header */
+ spec_eth_inner.id = MLX4_NET_TRANS_RULE_ID_ETH; /* any inner eth header */
+
+ list_add_tail(&spec_eth_outer.list, &rule.list);
+ list_add_tail(&spec_vxlan.list, &rule.list);
+ list_add_tail(&spec_eth_inner.list, &rule.list);
+
+ err = mlx4_flow_attach(dev, &rule, reg_id);
+ return err;
+}
+EXPORT_SYMBOL(mlx4_tunnel_steer_add);
+
int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn,
u32 max_range_qpn)
{
diff --git a/drivers/net/ethernet/moxa/moxart_ether.c b/drivers/net/ethernet/moxa/moxart_ether.c
index 5020fd4..2f12c88 100644
--- a/drivers/net/ethernet/moxa/moxart_ether.c
+++ b/drivers/net/ethernet/moxa/moxart_ether.c
@@ -206,7 +206,7 @@
int rx_head = priv->rx_head;
int rx = 0;
- while (1) {
+ while (rx < budget) {
desc = priv->rx_desc_base + (RX_REG_DESC_SIZE * rx_head);
desc0 = readl(desc + RX_REG_OFFSET_DESC0);
@@ -218,7 +218,7 @@
net_dbg_ratelimited("packet error\n");
priv->stats.rx_dropped++;
priv->stats.rx_errors++;
- continue;
+ goto rx_next;
}
len = desc0 & RX_DESC0_FRAME_LEN_MASK;
@@ -226,13 +226,19 @@
if (len > RX_BUF_SIZE)
len = RX_BUF_SIZE;
- skb = build_skb(priv->rx_buf[rx_head], priv->rx_buf_size);
+ dma_sync_single_for_cpu(&ndev->dev,
+ priv->rx_mapping[rx_head],
+ priv->rx_buf_size, DMA_FROM_DEVICE);
+ skb = netdev_alloc_skb_ip_align(ndev, len);
+
if (unlikely(!skb)) {
- net_dbg_ratelimited("build_skb failed\n");
+ net_dbg_ratelimited("netdev_alloc_skb_ip_align failed\n");
priv->stats.rx_dropped++;
priv->stats.rx_errors++;
+ goto rx_next;
}
+ memcpy(skb->data, priv->rx_buf[rx_head], len);
skb_put(skb, len);
skb->protocol = eth_type_trans(skb, ndev);
napi_gro_receive(&priv->napi, skb);
@@ -244,18 +250,15 @@
if (desc0 & RX_DESC0_MULTICAST)
priv->stats.multicast++;
+rx_next:
writel(RX_DESC0_DMA_OWN, desc + RX_REG_OFFSET_DESC0);
rx_head = RX_NEXT(rx_head);
priv->rx_head = rx_head;
-
- if (rx >= budget)
- break;
}
if (rx < budget) {
- napi_gro_flush(napi, false);
- __napi_complete(napi);
+ napi_complete(napi);
}
priv->reg_imr |= RPKT_FINISH_M;
@@ -346,10 +349,12 @@
len = ETH_ZLEN;
}
- txdes1 = readl(desc + TX_REG_OFFSET_DESC1);
- txdes1 |= TX_DESC1_LTS | TX_DESC1_FTS;
- txdes1 &= ~(TX_DESC1_FIFO_COMPLETE | TX_DESC1_INTR_COMPLETE);
- txdes1 |= (len & TX_DESC1_BUF_SIZE_MASK);
+ dma_sync_single_for_device(&ndev->dev, priv->tx_mapping[tx_head],
+ priv->tx_buf_size, DMA_TO_DEVICE);
+
+ txdes1 = TX_DESC1_LTS | TX_DESC1_FTS | (len & TX_DESC1_BUF_SIZE_MASK);
+ if (tx_head == TX_DESC_NUM_MASK)
+ txdes1 |= TX_DESC1_END;
writel(txdes1, desc + TX_REG_OFFSET_DESC1);
writel(TX_DESC0_DMA_OWN, desc + TX_REG_OFFSET_DESC0);
@@ -465,8 +470,7 @@
spin_lock_init(&priv->txlock);
priv->tx_buf_size = TX_BUF_SIZE;
- priv->rx_buf_size = RX_BUF_SIZE +
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ priv->rx_buf_size = RX_BUF_SIZE;
priv->tx_desc_base = dma_alloc_coherent(NULL, TX_REG_DESC_SIZE *
TX_DESC_NUM, &priv->tx_base,
diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c
index 8706c0d..a44a03c 100644
--- a/drivers/net/ethernet/nxp/lpc_eth.c
+++ b/drivers/net/ethernet/nxp/lpc_eth.c
@@ -1220,6 +1220,9 @@
__lpc_eth_clock_enable(pldat, true);
+ /* Suspended PHY makes LPC ethernet core block, so resume now */
+ phy_resume(pldat->phy_dev);
+
/* Reset and initialize */
__lpc_eth_reset(pldat);
__lpc_eth_init(pldat);
diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
index 188626e..3e96f26 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c
+++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
@@ -2556,6 +2556,7 @@
if (skb_is_gso(skb)) {
int err;
+ __be16 l3_proto = vlan_get_protocol(skb);
err = skb_cow_head(skb, 0);
if (err < 0)
@@ -2572,7 +2573,7 @@
<< OB_MAC_TRANSPORT_HDR_SHIFT);
mac_iocb_ptr->mss = cpu_to_le16(skb_shinfo(skb)->gso_size);
mac_iocb_ptr->flags2 |= OB_MAC_TSO_IOCB_LSO;
- if (likely(skb->protocol == htons(ETH_P_IP))) {
+ if (likely(l3_proto == htons(ETH_P_IP))) {
struct iphdr *iph = ip_hdr(skb);
iph->check = 0;
mac_iocb_ptr->flags1 |= OB_MAC_TSO_IOCB_IP4;
@@ -2580,7 +2581,7 @@
iph->daddr, 0,
IPPROTO_TCP,
0);
- } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ } else if (l3_proto == htons(ETH_P_IPV6)) {
mac_iocb_ptr->flags1 |= OB_MAC_TSO_IOCB_IP6;
tcp_hdr(skb)->check =
~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
diff --git a/drivers/net/ethernet/renesas/Kconfig b/drivers/net/ethernet/renesas/Kconfig
index 9e757c7..196e98a 100644
--- a/drivers/net/ethernet/renesas/Kconfig
+++ b/drivers/net/ethernet/renesas/Kconfig
@@ -5,6 +5,7 @@
config SH_ETH
tristate "Renesas SuperH Ethernet support"
depends on HAS_DMA
+ depends on ARCH_SHMOBILE || SUPERH || COMPILE_TEST
select CRC32
select MII
select MDIO_BITBANG
diff --git a/drivers/net/ethernet/stmicro/stmmac/chain_mode.c b/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
index c553f6b..cf28dab 100644
--- a/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
+++ b/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
@@ -28,7 +28,7 @@
#include "stmmac.h"
-static unsigned int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
+static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
{
struct stmmac_priv *priv = (struct stmmac_priv *)p;
unsigned int txsize = priv->dma_tx_size;
@@ -47,7 +47,9 @@
desc->des2 = dma_map_single(priv->device, skb->data,
bmax, DMA_TO_DEVICE);
- priv->tx_skbuff_dma[entry] = desc->des2;
+ if (dma_mapping_error(priv->device, desc->des2))
+ return -1;
+ priv->tx_skbuff_dma[entry].buf = desc->des2;
priv->hw->desc->prepare_tx_desc(desc, 1, bmax, csum, STMMAC_CHAIN_MODE);
while (len != 0) {
@@ -59,7 +61,9 @@
desc->des2 = dma_map_single(priv->device,
(skb->data + bmax * i),
bmax, DMA_TO_DEVICE);
- priv->tx_skbuff_dma[entry] = desc->des2;
+ if (dma_mapping_error(priv->device, desc->des2))
+ return -1;
+ priv->tx_skbuff_dma[entry].buf = desc->des2;
priv->hw->desc->prepare_tx_desc(desc, 0, bmax, csum,
STMMAC_CHAIN_MODE);
priv->hw->desc->set_tx_owner(desc);
@@ -69,7 +73,9 @@
desc->des2 = dma_map_single(priv->device,
(skb->data + bmax * i), len,
DMA_TO_DEVICE);
- priv->tx_skbuff_dma[entry] = desc->des2;
+ if (dma_mapping_error(priv->device, desc->des2))
+ return -1;
+ priv->tx_skbuff_dma[entry].buf = desc->des2;
priv->hw->desc->prepare_tx_desc(desc, 0, len, csum,
STMMAC_CHAIN_MODE);
priv->hw->desc->set_tx_owner(desc);
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index de507c3..593e6c4 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -220,10 +220,10 @@
handle_tx = 0x8,
};
-#define CORE_IRQ_TX_PATH_IN_LPI_MODE (1 << 1)
-#define CORE_IRQ_TX_PATH_EXIT_LPI_MODE (1 << 2)
-#define CORE_IRQ_RX_PATH_IN_LPI_MODE (1 << 3)
-#define CORE_IRQ_RX_PATH_EXIT_LPI_MODE (1 << 4)
+#define CORE_IRQ_TX_PATH_IN_LPI_MODE (1 << 0)
+#define CORE_IRQ_TX_PATH_EXIT_LPI_MODE (1 << 1)
+#define CORE_IRQ_RX_PATH_IN_LPI_MODE (1 << 2)
+#define CORE_IRQ_RX_PATH_EXIT_LPI_MODE (1 << 3)
#define CORE_PCS_ANE_COMPLETE (1 << 5)
#define CORE_PCS_LINK_STATUS (1 << 6)
@@ -287,7 +287,7 @@
/* Default LPI timers */
#define STMMAC_DEFAULT_LIT_LS 0x3E8
-#define STMMAC_DEFAULT_TWT_LS 0x0
+#define STMMAC_DEFAULT_TWT_LS 0x1E
#define STMMAC_CHAIN_MODE 0x1
#define STMMAC_RING_MODE 0x2
@@ -425,7 +425,7 @@
void (*init) (void *des, dma_addr_t phy_addr, unsigned int size,
unsigned int extend_desc);
unsigned int (*is_jumbo_frm) (int len, int ehn_desc);
- unsigned int (*jumbo_frm) (void *priv, struct sk_buff *skb, int csum);
+ int (*jumbo_frm)(void *priv, struct sk_buff *skb, int csum);
int (*set_16kib_bfsize)(int mtu);
void (*init_desc3)(struct dma_desc *p);
void (*refill_desc3) (void *priv, struct dma_desc *p);
@@ -445,6 +445,7 @@
int multicast_filter_bins;
int unicast_filter_entries;
int mcast_bits_log2;
+ unsigned int rx_csum;
};
struct mac_device_info *dwmac1000_setup(void __iomem *ioaddr, int mcbins,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
index 71b5419..64d8f56 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
@@ -153,7 +153,7 @@
#define GMAC_CONTROL_RE 0x00000004 /* Receiver Enable */
#define GMAC_CORE_INIT (GMAC_CONTROL_JD | GMAC_CONTROL_PS | GMAC_CONTROL_ACS | \
- GMAC_CONTROL_BE)
+ GMAC_CONTROL_BE | GMAC_CONTROL_DCRS)
/* GMAC Frame Filter defines */
#define GMAC_FRAME_FILTER_PR 0x00000001 /* Promiscuous Mode */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
index d8ef187..5efe60e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -58,7 +58,11 @@
void __iomem *ioaddr = hw->pcsr;
u32 value = readl(ioaddr + GMAC_CONTROL);
- value |= GMAC_CONTROL_IPC;
+ if (hw->rx_csum)
+ value |= GMAC_CONTROL_IPC;
+ else
+ value &= ~GMAC_CONTROL_IPC;
+
writel(value, ioaddr + GMAC_CONTROL);
value = readl(ioaddr + GMAC_CONTROL);
diff --git a/drivers/net/ethernet/stmicro/stmmac/mmc.h b/drivers/net/ethernet/stmicro/stmmac/mmc.h
index 8607488..192c249 100644
--- a/drivers/net/ethernet/stmicro/stmmac/mmc.h
+++ b/drivers/net/ethernet/stmicro/stmmac/mmc.h
@@ -68,7 +68,7 @@
unsigned int mmc_rx_octetcount_g;
unsigned int mmc_rx_broadcastframe_g;
unsigned int mmc_rx_multicastframe_g;
- unsigned int mmc_rx_crc_errror;
+ unsigned int mmc_rx_crc_error;
unsigned int mmc_rx_align_error;
unsigned int mmc_rx_run_error;
unsigned int mmc_rx_jabber_error;
diff --git a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c
index 50617c5..08c483b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/mmc_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/mmc_core.c
@@ -196,7 +196,7 @@
mmc->mmc_rx_octetcount_g += readl(ioaddr + MMC_RX_OCTETCOUNT_G);
mmc->mmc_rx_broadcastframe_g += readl(ioaddr + MMC_RX_BROADCASTFRAME_G);
mmc->mmc_rx_multicastframe_g += readl(ioaddr + MMC_RX_MULTICASTFRAME_G);
- mmc->mmc_rx_crc_errror += readl(ioaddr + MMC_RX_CRC_ERRROR);
+ mmc->mmc_rx_crc_error += readl(ioaddr + MMC_RX_CRC_ERRROR);
mmc->mmc_rx_align_error += readl(ioaddr + MMC_RX_ALIGN_ERROR);
mmc->mmc_rx_run_error += readl(ioaddr + MMC_RX_RUN_ERROR);
mmc->mmc_rx_jabber_error += readl(ioaddr + MMC_RX_JABBER_ERROR);
diff --git a/drivers/net/ethernet/stmicro/stmmac/ring_mode.c b/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
index 650a4be..5dd50c6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
+++ b/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
@@ -28,7 +28,7 @@
#include "stmmac.h"
-static unsigned int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
+static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
{
struct stmmac_priv *priv = (struct stmmac_priv *)p;
unsigned int txsize = priv->dma_tx_size;
@@ -53,7 +53,10 @@
desc->des2 = dma_map_single(priv->device, skb->data,
bmax, DMA_TO_DEVICE);
- priv->tx_skbuff_dma[entry] = desc->des2;
+ if (dma_mapping_error(priv->device, desc->des2))
+ return -1;
+
+ priv->tx_skbuff_dma[entry].buf = desc->des2;
desc->des3 = desc->des2 + BUF_SIZE_4KiB;
priv->hw->desc->prepare_tx_desc(desc, 1, bmax, csum,
STMMAC_RING_MODE);
@@ -68,7 +71,9 @@
desc->des2 = dma_map_single(priv->device, skb->data + bmax,
len, DMA_TO_DEVICE);
- priv->tx_skbuff_dma[entry] = desc->des2;
+ if (dma_mapping_error(priv->device, desc->des2))
+ return -1;
+ priv->tx_skbuff_dma[entry].buf = desc->des2;
desc->des3 = desc->des2 + BUF_SIZE_4KiB;
priv->hw->desc->prepare_tx_desc(desc, 0, len, csum,
STMMAC_RING_MODE);
@@ -77,7 +82,9 @@
} else {
desc->des2 = dma_map_single(priv->device, skb->data,
nopaged_len, DMA_TO_DEVICE);
- priv->tx_skbuff_dma[entry] = desc->des2;
+ if (dma_mapping_error(priv->device, desc->des2))
+ return -1;
+ priv->tx_skbuff_dma[entry].buf = desc->des2;
desc->des3 = desc->des2 + BUF_SIZE_4KiB;
priv->hw->desc->prepare_tx_desc(desc, 1, nopaged_len, csum,
STMMAC_RING_MODE);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index ca01035..58097c0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -34,6 +34,11 @@
#include <linux/ptp_clock_kernel.h>
#include <linux/reset.h>
+struct stmmac_tx_info {
+ dma_addr_t buf;
+ bool map_as_page;
+};
+
struct stmmac_priv {
/* Frequently used values are kept adjacent for cache effect */
struct dma_extended_desc *dma_etx ____cacheline_aligned_in_smp;
@@ -45,7 +50,7 @@
u32 tx_count_frames;
u32 tx_coal_frames;
u32 tx_coal_timer;
- dma_addr_t *tx_skbuff_dma;
+ struct stmmac_tx_info *tx_skbuff_dma;
dma_addr_t dma_tx_phy;
int tx_coalesce;
int hwts_tx_en;
@@ -105,6 +110,8 @@
struct ptp_clock *ptp_clock;
struct ptp_clock_info ptp_clock_ops;
unsigned int default_addend;
+ struct clk *clk_ptp_ref;
+ unsigned int clk_ptp_rate;
u32 adv_ts;
int use_riwt;
int irq_wake;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 9af50ba..cf4f38d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -175,7 +175,7 @@
STMMAC_MMC_STAT(mmc_rx_octetcount_g),
STMMAC_MMC_STAT(mmc_rx_broadcastframe_g),
STMMAC_MMC_STAT(mmc_rx_multicastframe_g),
- STMMAC_MMC_STAT(mmc_rx_crc_errror),
+ STMMAC_MMC_STAT(mmc_rx_crc_error),
STMMAC_MMC_STAT(mmc_rx_align_error),
STMMAC_MMC_STAT(mmc_rx_run_error),
STMMAC_MMC_STAT(mmc_rx_jabber_error),
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 08addd6..6e6ee22 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -275,6 +275,7 @@
*/
bool stmmac_eee_init(struct stmmac_priv *priv)
{
+ char *phy_bus_name = priv->plat->phy_bus_name;
bool ret = false;
/* Using PCS we cannot dial with the phy registers at this stage
@@ -284,6 +285,10 @@
(priv->pcs == STMMAC_PCS_RTBI))
goto out;
+ /* Never init EEE in case of a switch is attached */
+ if (phy_bus_name && (!strcmp(phy_bus_name, "fixed")))
+ goto out;
+
/* MAC core supports the EEE feature. */
if (priv->dma_cap.eee) {
int tx_lpi_timer = priv->tx_lpi_timer;
@@ -316,10 +321,9 @@
priv->hw->mac->set_eee_timer(priv->hw,
STMMAC_DEFAULT_LIT_LS,
tx_lpi_timer);
- } else
- /* Set HW EEE according to the speed */
- priv->hw->mac->set_eee_pls(priv->hw,
- priv->phydev->link);
+ }
+ /* Set HW EEE according to the speed */
+ priv->hw->mac->set_eee_pls(priv->hw, priv->phydev->link);
pr_debug("stmmac: Energy-Efficient Ethernet initialized\n");
@@ -603,16 +607,16 @@
/* calculate default added value:
* formula is :
* addend = (2^32)/freq_div_ratio;
- * where, freq_div_ratio = STMMAC_SYSCLOCK/50MHz
- * hence, addend = ((2^32) * 50MHz)/STMMAC_SYSCLOCK;
- * NOTE: STMMAC_SYSCLOCK should be >= 50MHz to
+ * where, freq_div_ratio = clk_ptp_ref_i/50MHz
+ * hence, addend = ((2^32) * 50MHz)/clk_ptp_ref_i;
+ * NOTE: clk_ptp_ref_i should be >= 50MHz to
* achive 20ns accuracy.
*
* 2^x * y == (y << x), hence
* 2^32 * 50000000 ==> (50000000 << 32)
*/
temp = (u64) (50000000ULL << 32);
- priv->default_addend = div_u64(temp, STMMAC_SYSCLOCK);
+ priv->default_addend = div_u64(temp, priv->clk_ptp_rate);
priv->hw->ptp->config_addend(priv->ioaddr,
priv->default_addend);
@@ -638,6 +642,16 @@
if (!(priv->dma_cap.time_stamp || priv->dma_cap.atime_stamp))
return -EOPNOTSUPP;
+ /* Fall-back to main clock in case of no PTP ref is passed */
+ priv->clk_ptp_ref = devm_clk_get(priv->device, "clk_ptp_ref");
+ if (IS_ERR(priv->clk_ptp_ref)) {
+ priv->clk_ptp_rate = clk_get_rate(priv->stmmac_clk);
+ priv->clk_ptp_ref = NULL;
+ } else {
+ clk_prepare_enable(priv->clk_ptp_ref);
+ priv->clk_ptp_rate = clk_get_rate(priv->clk_ptp_ref);
+ }
+
priv->adv_ts = 0;
if (priv->dma_cap.atime_stamp && priv->extend_desc)
priv->adv_ts = 1;
@@ -657,6 +671,8 @@
static void stmmac_release_ptp(struct stmmac_priv *priv)
{
+ if (priv->clk_ptp_ref)
+ clk_disable_unprepare(priv->clk_ptp_ref);
stmmac_ptp_unregister(priv);
}
@@ -1061,7 +1077,8 @@
else
p = priv->dma_tx + i;
p->des2 = 0;
- priv->tx_skbuff_dma[i] = 0;
+ priv->tx_skbuff_dma[i].buf = 0;
+ priv->tx_skbuff_dma[i].map_as_page = false;
priv->tx_skbuff[i] = NULL;
}
@@ -1100,17 +1117,24 @@
else
p = priv->dma_tx + i;
- if (priv->tx_skbuff_dma[i]) {
- dma_unmap_single(priv->device,
- priv->tx_skbuff_dma[i],
- priv->hw->desc->get_tx_len(p),
- DMA_TO_DEVICE);
- priv->tx_skbuff_dma[i] = 0;
+ if (priv->tx_skbuff_dma[i].buf) {
+ if (priv->tx_skbuff_dma[i].map_as_page)
+ dma_unmap_page(priv->device,
+ priv->tx_skbuff_dma[i].buf,
+ priv->hw->desc->get_tx_len(p),
+ DMA_TO_DEVICE);
+ else
+ dma_unmap_single(priv->device,
+ priv->tx_skbuff_dma[i].buf,
+ priv->hw->desc->get_tx_len(p),
+ DMA_TO_DEVICE);
}
if (priv->tx_skbuff[i] != NULL) {
dev_kfree_skb_any(priv->tx_skbuff[i]);
priv->tx_skbuff[i] = NULL;
+ priv->tx_skbuff_dma[i].buf = 0;
+ priv->tx_skbuff_dma[i].map_as_page = false;
}
}
}
@@ -1131,7 +1155,8 @@
if (!priv->rx_skbuff)
goto err_rx_skbuff;
- priv->tx_skbuff_dma = kmalloc_array(txsize, sizeof(dma_addr_t),
+ priv->tx_skbuff_dma = kmalloc_array(txsize,
+ sizeof(*priv->tx_skbuff_dma),
GFP_KERNEL);
if (!priv->tx_skbuff_dma)
goto err_tx_skbuff_dma;
@@ -1293,12 +1318,19 @@
pr_debug("%s: curr %d, dirty %d\n", __func__,
priv->cur_tx, priv->dirty_tx);
- if (likely(priv->tx_skbuff_dma[entry])) {
- dma_unmap_single(priv->device,
- priv->tx_skbuff_dma[entry],
- priv->hw->desc->get_tx_len(p),
- DMA_TO_DEVICE);
- priv->tx_skbuff_dma[entry] = 0;
+ if (likely(priv->tx_skbuff_dma[entry].buf)) {
+ if (priv->tx_skbuff_dma[entry].map_as_page)
+ dma_unmap_page(priv->device,
+ priv->tx_skbuff_dma[entry].buf,
+ priv->hw->desc->get_tx_len(p),
+ DMA_TO_DEVICE);
+ else
+ dma_unmap_single(priv->device,
+ priv->tx_skbuff_dma[entry].buf,
+ priv->hw->desc->get_tx_len(p),
+ DMA_TO_DEVICE);
+ priv->tx_skbuff_dma[entry].buf = 0;
+ priv->tx_skbuff_dma[entry].map_as_page = false;
}
priv->hw->mode->clean_desc3(priv, p);
@@ -1637,6 +1669,13 @@
/* Initialize the MAC Core */
priv->hw->mac->core_init(priv->hw, dev->mtu);
+ ret = priv->hw->mac->rx_ipc(priv->hw);
+ if (!ret) {
+ pr_warn(" RX IPC Checksum Offload disabled\n");
+ priv->plat->rx_coe = STMMAC_RX_COE_NONE;
+ priv->hw->rx_csum = 0;
+ }
+
/* Enable the MAC Rx/Tx */
stmmac_set_mac(priv->ioaddr, true);
@@ -1887,12 +1926,16 @@
if (likely(!is_jumbo)) {
desc->des2 = dma_map_single(priv->device, skb->data,
nopaged_len, DMA_TO_DEVICE);
- priv->tx_skbuff_dma[entry] = desc->des2;
+ if (dma_mapping_error(priv->device, desc->des2))
+ goto dma_map_err;
+ priv->tx_skbuff_dma[entry].buf = desc->des2;
priv->hw->desc->prepare_tx_desc(desc, 1, nopaged_len,
csum_insertion, priv->mode);
} else {
desc = first;
entry = priv->hw->mode->jumbo_frm(priv, skb, csum_insertion);
+ if (unlikely(entry < 0))
+ goto dma_map_err;
}
for (i = 0; i < nfrags; i++) {
@@ -1908,7 +1951,11 @@
desc->des2 = skb_frag_dma_map(priv->device, frag, 0, len,
DMA_TO_DEVICE);
- priv->tx_skbuff_dma[entry] = desc->des2;
+ if (dma_mapping_error(priv->device, desc->des2))
+ goto dma_map_err; /* should reuse desc w/o issues */
+
+ priv->tx_skbuff_dma[entry].buf = desc->des2;
+ priv->tx_skbuff_dma[entry].map_as_page = true;
priv->hw->desc->prepare_tx_desc(desc, 0, len, csum_insertion,
priv->mode);
wmb();
@@ -1975,7 +2022,12 @@
priv->hw->dma->enable_dma_transmission(priv->ioaddr);
spin_unlock(&priv->tx_lock);
+ return NETDEV_TX_OK;
+dma_map_err:
+ dev_err(priv->device, "Tx dma map failed\n");
+ dev_kfree_skb(skb);
+ priv->dev->stats.tx_dropped++;
return NETDEV_TX_OK;
}
@@ -2028,7 +2080,12 @@
priv->rx_skbuff_dma[entry] =
dma_map_single(priv->device, skb->data, bfsize,
DMA_FROM_DEVICE);
-
+ if (dma_mapping_error(priv->device,
+ priv->rx_skbuff_dma[entry])) {
+ dev_err(priv->device, "Rx dma map failed\n");
+ dev_kfree_skb(skb);
+ break;
+ }
p->des2 = priv->rx_skbuff_dma[entry];
priv->hw->mode->refill_desc3(priv, p);
@@ -2055,7 +2112,7 @@
unsigned int entry = priv->cur_rx % rxsize;
unsigned int next_entry;
unsigned int count = 0;
- int coe = priv->plat->rx_coe;
+ int coe = priv->hw->rx_csum;
if (netif_msg_rx_status(priv)) {
pr_debug("%s: descriptor ring:\n", __func__);
@@ -2276,8 +2333,7 @@
if (priv->plat->rx_coe == STMMAC_RX_COE_NONE)
features &= ~NETIF_F_RXCSUM;
- else if (priv->plat->rx_coe == STMMAC_RX_COE_TYPE1)
- features &= ~NETIF_F_IPV6_CSUM;
+
if (!priv->plat->tx_coe)
features &= ~NETIF_F_ALL_CSUM;
@@ -2292,6 +2348,24 @@
return features;
}
+static int stmmac_set_features(struct net_device *netdev,
+ netdev_features_t features)
+{
+ struct stmmac_priv *priv = netdev_priv(netdev);
+
+ /* Keep the COE Type in case of csum is supporting */
+ if (features & NETIF_F_RXCSUM)
+ priv->hw->rx_csum = priv->plat->rx_coe;
+ else
+ priv->hw->rx_csum = 0;
+ /* No check needed because rx_coe has been set before and it will be
+ * fixed in case of issue.
+ */
+ priv->hw->mac->rx_ipc(priv->hw);
+
+ return 0;
+}
+
/**
* stmmac_interrupt - main ISR
* @irq: interrupt number.
@@ -2572,6 +2646,7 @@
.ndo_stop = stmmac_release,
.ndo_change_mtu = stmmac_change_mtu,
.ndo_fix_features = stmmac_fix_features,
+ .ndo_set_features = stmmac_set_features,
.ndo_set_rx_mode = stmmac_set_rx_mode,
.ndo_tx_timeout = stmmac_tx_timeout,
.ndo_do_ioctl = stmmac_ioctl,
@@ -2592,7 +2667,6 @@
*/
static int stmmac_hw_init(struct stmmac_priv *priv)
{
- int ret;
struct mac_device_info *mac;
/* Identify the MAC HW device */
@@ -2649,15 +2723,11 @@
/* To use alternate (extended) or normal descriptor structures */
stmmac_selec_desc_mode(priv);
- ret = priv->hw->mac->rx_ipc(priv->hw);
- if (!ret) {
- pr_warn(" RX IPC Checksum Offload not configured.\n");
- priv->plat->rx_coe = STMMAC_RX_COE_NONE;
- }
-
- if (priv->plat->rx_coe)
+ if (priv->plat->rx_coe) {
+ priv->hw->rx_csum = priv->plat->rx_coe;
pr_info(" RX Checksum Offload Engine supported (type %d)\n",
priv->plat->rx_coe);
+ }
if (priv->plat->tx_coe)
pr_info(" TX Checksum insertion supported\n");
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index b7ad356..c5ee79d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -206,6 +206,7 @@
{
if (priv->ptp_clock) {
ptp_clock_unregister(priv->ptp_clock);
+ priv->ptp_clock = NULL;
pr_debug("Removed PTP HW clock successfully on %s\n",
priv->dev->name);
}
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
index 3dbc047..4535df3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
@@ -25,8 +25,6 @@
#ifndef __STMMAC_PTP_H__
#define __STMMAC_PTP_H__
-#define STMMAC_SYSCLOCK 62500000
-
/* IEEE 1588 PTP register offsets */
#define PTP_TCR 0x0700 /* Timestamp Control Reg */
#define PTP_SSIR 0x0704 /* Sub-Second Increment Reg */
diff --git a/drivers/net/fddi/skfp/h/skfbi.h b/drivers/net/fddi/skfp/h/skfbi.h
index c1ba26c..3de2f0d 100644
--- a/drivers/net/fddi/skfp/h/skfbi.h
+++ b/drivers/net/fddi/skfp/h/skfbi.h
@@ -147,11 +147,6 @@
#define PCI_MEM64BIT (2<<1) /* Base addr anywhere in 64 Bit range */
#define PCI_MEMSPACE 0x00000001L /* Bit 0: Memory Space Indic. */
-/* PCI_BASE_2ND 32 bit 2nd Base address */
-#define PCI_IOBASE 0xffffff00L /* Bit 31..8: I/O Base address */
-#define PCI_IOSIZE 0x000000fcL /* Bit 7..2: I/O Size Requirements */
-#define PCI_IOSPACE 0x00000001L /* Bit 0: I/O Space Indicator */
-
/* PCI_SUB_VID 16 bit Subsystem Vendor ID */
/* PCI_SUB_ID 16 bit Subsystem ID */
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index c94e2a2..a854d38 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1036,31 +1036,31 @@
/* First check if the EEE ability is supported */
eee_cap = phy_read_mmd_indirect(phydev, MDIO_PCS_EEE_ABLE,
MDIO_MMD_PCS, phydev->addr);
- if (eee_cap < 0)
- return eee_cap;
+ if (eee_cap <= 0)
+ goto eee_exit_err;
cap = mmd_eee_cap_to_ethtool_sup_t(eee_cap);
if (!cap)
- return -EPROTONOSUPPORT;
+ goto eee_exit_err;
/* Check which link settings negotiated and verify it in
* the EEE advertising registers.
*/
eee_lp = phy_read_mmd_indirect(phydev, MDIO_AN_EEE_LPABLE,
MDIO_MMD_AN, phydev->addr);
- if (eee_lp < 0)
- return eee_lp;
+ if (eee_lp <= 0)
+ goto eee_exit_err;
eee_adv = phy_read_mmd_indirect(phydev, MDIO_AN_EEE_ADV,
MDIO_MMD_AN, phydev->addr);
- if (eee_adv < 0)
- return eee_adv;
+ if (eee_adv <= 0)
+ goto eee_exit_err;
adv = mmd_eee_adv_to_ethtool_adv_t(eee_adv);
lp = mmd_eee_adv_to_ethtool_adv_t(eee_lp);
idx = phy_find_setting(phydev->speed, phydev->duplex);
if (!(lp & adv & settings[idx].setting))
- return -EPROTONOSUPPORT;
+ goto eee_exit_err;
if (clk_stop_enable) {
/* Configure the PHY to stop receiving xMII
@@ -1080,7 +1080,7 @@
return 0; /* EEE supported */
}
-
+eee_exit_err:
return -EPROTONOSUPPORT;
}
EXPORT_SYMBOL(phy_init_eee);
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index d6e90c7..6dfcbf5 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -2056,7 +2056,6 @@
if (!netdev_mc_empty(netdev)) {
new_table = vmxnet3_copy_mc(netdev);
if (new_table) {
- new_mode |= VMXNET3_RXM_MCAST;
rxConf->mfTableLen = cpu_to_le16(
netdev_mc_count(netdev) * ETH_ALEN);
new_table_pa = dma_map_single(
@@ -2064,15 +2063,18 @@
new_table,
rxConf->mfTableLen,
PCI_DMA_TODEVICE);
+ }
+
+ if (new_table_pa) {
+ new_mode |= VMXNET3_RXM_MCAST;
rxConf->mfTablePA = cpu_to_le64(new_table_pa);
} else {
- netdev_info(netdev, "failed to copy mcast list"
- ", setting ALL_MULTI\n");
+ netdev_info(netdev,
+ "failed to copy mcast list, setting ALL_MULTI\n");
new_mode |= VMXNET3_RXM_ALL_MULTI;
}
}
-
if (!(new_mode & VMXNET3_RXM_MCAST)) {
rxConf->mfTableLen = 0;
rxConf->mfTablePA = 0;
@@ -2091,11 +2093,10 @@
VMXNET3_CMD_UPDATE_MAC_FILTERS);
spin_unlock_irqrestore(&adapter->cmd_lock, flags);
- if (new_table) {
+ if (new_table_pa)
dma_unmap_single(&adapter->pdev->dev, new_table_pa,
rxConf->mfTableLen, PCI_DMA_TODEVICE);
- kfree(new_table);
- }
+ kfree(new_table);
}
void
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index 29ee77f2..3759479 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -69,10 +69,10 @@
/*
* Version numbers
*/
-#define VMXNET3_DRIVER_VERSION_STRING "1.2.0.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING "1.2.1.0-k"
/* a 32-bit int, each byte encode a verion number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM 0x01020000
+#define VMXNET3_DRIVER_VERSION_NUM 0x01020100
#if defined(CONFIG_PCI_MSI)
/* RSS only makes sense if MSI-X is supported. */
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 1fb7b37..beb377b 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1327,7 +1327,7 @@
} else if (vxlan->flags & VXLAN_F_L3MISS) {
union vxlan_addr ipa = {
.sin.sin_addr.s_addr = tip,
- .sa.sa_family = AF_INET,
+ .sin.sin_family = AF_INET,
};
vxlan_ip_miss(dev, &ipa);
@@ -1488,7 +1488,7 @@
} else if (vxlan->flags & VXLAN_F_L3MISS) {
union vxlan_addr ipa = {
.sin6.sin6_addr = msg->target,
- .sa.sa_family = AF_INET6,
+ .sin6.sin6_family = AF_INET6,
};
vxlan_ip_miss(dev, &ipa);
@@ -1521,7 +1521,7 @@
if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
union vxlan_addr ipa = {
.sin.sin_addr.s_addr = pip->daddr,
- .sa.sa_family = AF_INET,
+ .sin.sin_family = AF_INET,
};
vxlan_ip_miss(dev, &ipa);
@@ -1542,7 +1542,7 @@
if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
union vxlan_addr ipa = {
.sin6.sin6_addr = pip6->daddr,
- .sa.sa_family = AF_INET6,
+ .sin6.sin6_family = AF_INET6,
};
vxlan_ip_miss(dev, &ipa);
diff --git a/drivers/net/wireless/at76c50x-usb.c b/drivers/net/wireless/at76c50x-usb.c
index 334c2ec..da92bfa 100644
--- a/drivers/net/wireless/at76c50x-usb.c
+++ b/drivers/net/wireless/at76c50x-usb.c
@@ -2423,8 +2423,6 @@
kfree_skb(priv->rx_skb);
- usb_put_dev(priv->udev);
-
at76_dbg(DBG_PROC_ENTRY, "%s: before freeing priv/ieee80211_hw",
__func__);
ieee80211_free_hw(priv->hw);
@@ -2558,6 +2556,7 @@
wiphy_info(priv->hw->wiphy, "disconnecting\n");
at76_delete_device(priv);
+ usb_put_dev(priv->udev);
dev_info(&interface->dev, "disconnected\n");
}
diff --git a/drivers/net/wireless/ath/ath9k/spectral.c b/drivers/net/wireless/ath/ath9k/spectral.c
index 5fe29b9..8f68426 100644
--- a/drivers/net/wireless/ath/ath9k/spectral.c
+++ b/drivers/net/wireless/ath/ath9k/spectral.c
@@ -253,7 +253,7 @@
if (strncmp("trigger", buf, 7) == 0) {
ath9k_spectral_scan_trigger(sc->hw);
- } else if (strncmp("background", buf, 9) == 0) {
+ } else if (strncmp("background", buf, 10) == 0) {
ath9k_spectral_scan_config(sc->hw, SPECTRAL_BACKGROUND);
ath_dbg(common, CONFIG, "spectral scan: background mode enabled\n");
} else if (strncmp("chanscan", buf, 8) == 0) {
diff --git a/drivers/net/wireless/iwlwifi/Kconfig b/drivers/net/wireless/iwlwifi/Kconfig
index 6451d2b..824f5e2 100644
--- a/drivers/net/wireless/iwlwifi/Kconfig
+++ b/drivers/net/wireless/iwlwifi/Kconfig
@@ -51,7 +51,6 @@
config IWLDVM
tristate "Intel Wireless WiFi DVM Firmware support"
- depends on m
default IWLWIFI
help
This is the driver that supports the DVM firmware which is
@@ -60,7 +59,6 @@
config IWLMVM
tristate "Intel Wireless WiFi MVM Firmware support"
- depends on m
help
This is the driver that supports the MVM firmware which is
currently only available for 7260 and 3160 devices.
diff --git a/drivers/net/wireless/iwlwifi/dvm/rxon.c b/drivers/net/wireless/iwlwifi/dvm/rxon.c
index 6dc5dd3..ed50de6 100644
--- a/drivers/net/wireless/iwlwifi/dvm/rxon.c
+++ b/drivers/net/wireless/iwlwifi/dvm/rxon.c
@@ -1068,6 +1068,13 @@
/* recalculate basic rates */
iwl_calc_basic_rates(priv, ctx);
+ /*
+ * force CTS-to-self frames protection if RTS-CTS is not preferred
+ * one aggregation protection method
+ */
+ if (!priv->hw_params.use_rts_for_aggregation)
+ ctx->staging.flags |= RXON_FLG_SELF_CTS_EN;
+
if ((ctx->vif && ctx->vif->bss_conf.use_short_slot) ||
!(ctx->staging.flags & RXON_FLG_BAND_24G_MSK))
ctx->staging.flags |= RXON_FLG_SHORT_SLOT_MSK;
@@ -1473,6 +1480,11 @@
else
ctx->staging.flags &= ~RXON_FLG_TGG_PROTECT_MSK;
+ if (bss_conf->use_cts_prot)
+ ctx->staging.flags |= RXON_FLG_SELF_CTS_EN;
+ else
+ ctx->staging.flags &= ~RXON_FLG_SELF_CTS_EN;
+
memcpy(ctx->staging.bssid_addr, bss_conf->bssid, ETH_ALEN);
if (vif->type == NL80211_IFTYPE_AP ||
diff --git a/drivers/net/wireless/iwlwifi/iwl-7000.c b/drivers/net/wireless/iwlwifi/iwl-7000.c
index 4873006..d67a37a 100644
--- a/drivers/net/wireless/iwlwifi/iwl-7000.c
+++ b/drivers/net/wireless/iwlwifi/iwl-7000.c
@@ -67,8 +67,8 @@
#include "iwl-agn-hw.h"
/* Highest firmware API version supported */
-#define IWL7260_UCODE_API_MAX 9
-#define IWL3160_UCODE_API_MAX 9
+#define IWL7260_UCODE_API_MAX 10
+#define IWL3160_UCODE_API_MAX 10
/* Oldest version we won't warn about */
#define IWL7260_UCODE_API_OK 9
diff --git a/drivers/net/wireless/iwlwifi/iwl-8000.c b/drivers/net/wireless/iwlwifi/iwl-8000.c
index 44b19e0..e93c697 100644
--- a/drivers/net/wireless/iwlwifi/iwl-8000.c
+++ b/drivers/net/wireless/iwlwifi/iwl-8000.c
@@ -67,7 +67,7 @@
#include "iwl-agn-hw.h"
/* Highest firmware API version supported */
-#define IWL8000_UCODE_API_MAX 9
+#define IWL8000_UCODE_API_MAX 10
/* Oldest version we won't warn about */
#define IWL8000_UCODE_API_OK 8
diff --git a/drivers/net/wireless/rtlwifi/btcoexist/halbtcoutsrc.c b/drivers/net/wireless/rtlwifi/btcoexist/halbtcoutsrc.c
index 33da3df..d4bd550 100644
--- a/drivers/net/wireless/rtlwifi/btcoexist/halbtcoutsrc.c
+++ b/drivers/net/wireless/rtlwifi/btcoexist/halbtcoutsrc.c
@@ -101,7 +101,7 @@
bool is_legacy = false;
- if ((mac->mode == WIRELESS_MODE_B) || (mac->mode == WIRELESS_MODE_B))
+ if ((mac->mode == WIRELESS_MODE_B) || (mac->mode == WIRELESS_MODE_G))
is_legacy = true;
return is_legacy;
diff --git a/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c b/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c
index 361435f..1ac6383 100644
--- a/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c
+++ b/drivers/net/wireless/rtlwifi/rtl8192cu/sw.c
@@ -317,6 +317,7 @@
{RTL_USB_DEVICE(0x0bda, 0x5088, rtl92cu_hal_cfg)}, /*Thinkware-CC&C*/
{RTL_USB_DEVICE(0x0df6, 0x0052, rtl92cu_hal_cfg)}, /*Sitecom - Edimax*/
{RTL_USB_DEVICE(0x0df6, 0x005c, rtl92cu_hal_cfg)}, /*Sitecom - Edimax*/
+ {RTL_USB_DEVICE(0x0df6, 0x0070, rtl92cu_hal_cfg)}, /*Sitecom - 150N */
{RTL_USB_DEVICE(0x0df6, 0x0077, rtl92cu_hal_cfg)}, /*Sitecom-WLA2100V2*/
{RTL_USB_DEVICE(0x0eb0, 0x9071, rtl92cu_hal_cfg)}, /*NO Brand - Etop*/
{RTL_USB_DEVICE(0x4856, 0x0091, rtl92cu_hal_cfg)}, /*NetweeN - Feixun*/
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index e29e15d..f379689 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -576,6 +576,9 @@
init_waitqueue_head(&queue->dealloc_wq);
atomic_set(&queue->inflight_packets, 0);
+ netif_napi_add(queue->vif->dev, &queue->napi, xenvif_poll,
+ XENVIF_NAPI_WEIGHT);
+
if (tx_evtchn == rx_evtchn) {
/* feature-split-event-channels == 0 */
err = bind_interdomain_evtchn_to_irqhandler(
@@ -629,9 +632,6 @@
wake_up_process(queue->task);
wake_up_process(queue->dealloc_task);
- netif_napi_add(queue->vif->dev, &queue->napi, xenvif_poll,
- XENVIF_NAPI_WEIGHT);
-
return 0;
err_rx_unbind:
diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
index 8922c37..90f5cca 100644
--- a/drivers/pci/host/Kconfig
+++ b/drivers/pci/host/Kconfig
@@ -56,7 +56,7 @@
controller, such as the one emulated by kvmtool.
config PCIE_SPEAR13XX
- tristate "STMicroelectronics SPEAr PCIe controller"
+ bool "STMicroelectronics SPEAr PCIe controller"
depends on ARCH_SPEAR13XX
select PCIEPORTBUS
select PCIE_DW
diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c
index fc468a3..02152de 100644
--- a/drivers/platform/x86/ideapad-laptop.c
+++ b/drivers/platform/x86/ideapad-laptop.c
@@ -88,7 +88,6 @@
struct dentry *debug;
unsigned long cfg;
bool has_hw_rfkill_switch;
- bool has_touchpad_control;
};
static bool no_bt_rfkill;
@@ -456,7 +455,7 @@
int type;
};
-const const struct ideapad_rfk_data ideapad_rfk_data[] = {
+static const struct ideapad_rfk_data ideapad_rfk_data[] = {
{ "ideapad_wlan", CFG_WIFI_BIT, VPCCMD_W_WIFI, RFKILL_TYPE_WLAN },
{ "ideapad_bluetooth", CFG_BT_BIT, VPCCMD_W_BT, RFKILL_TYPE_BLUETOOTH },
{ "ideapad_3g", CFG_3G_BIT, VPCCMD_W_3G, RFKILL_TYPE_WWAN },
@@ -767,9 +766,6 @@
{
unsigned long value;
- if (!priv->has_touchpad_control)
- return;
-
/* Without reading from EC touchpad LED doesn't switch state */
if (!read_ec_data(priv->adev->handle, VPCCMD_R_TOUCHPAD, &value)) {
/* Some IdeaPads don't really turn off touchpad - they only
@@ -833,29 +829,7 @@
* always results in 0 on these models, causing ideapad_laptop to wrongly
* report all radios as hardware-blocked.
*/
-static struct dmi_system_id no_hw_rfkill_list[] = {
- {
- .ident = "Lenovo Yoga 2 11 / 13 / Pro",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
- DMI_MATCH(DMI_PRODUCT_VERSION, "Lenovo Yoga 2"),
- },
- },
- {}
-};
-
-/*
- * Some models don't offer touchpad ctrl through the ideapad interface, causing
- * ideapad_sync_touchpad_state to send wrong touchpad enable/disable events.
- */
-static struct dmi_system_id no_touchpad_ctrl_list[] = {
- {
- .ident = "Lenovo Yoga 1 series",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
- DMI_MATCH(DMI_PRODUCT_VERSION, "Lenovo IdeaPad Yoga"),
- },
- },
+static const struct dmi_system_id no_hw_rfkill_list[] = {
{
.ident = "Lenovo Yoga 2 11 / 13 / Pro",
.matches = {
@@ -889,7 +863,6 @@
priv->adev = adev;
priv->platform_device = pdev;
priv->has_hw_rfkill_switch = !dmi_check_system(no_hw_rfkill_list);
- priv->has_touchpad_control = !dmi_check_system(no_touchpad_ctrl_list);
ret = ideapad_sysfs_init(priv);
if (ret)
diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c
index b062d3d..d0dce73 100644
--- a/drivers/platform/x86/toshiba_acpi.c
+++ b/drivers/platform/x86/toshiba_acpi.c
@@ -1255,10 +1255,15 @@
const char *buf, size_t count)
{
struct toshiba_acpi_dev *toshiba = dev_get_drvdata(dev);
- int mode = -1;
- int time = -1;
+ int mode;
+ int time;
+ int ret;
- if (sscanf(buf, "%i", &mode) != 1 && (mode != 2 || mode != 1))
+
+ ret = kstrtoint(buf, 0, &mode);
+ if (ret)
+ return ret;
+ if (mode != SCI_KBD_MODE_FNZ && mode != SCI_KBD_MODE_AUTO)
return -EINVAL;
/* Set the Keyboard Backlight Mode where:
@@ -1266,11 +1271,12 @@
* Auto - KBD backlight turns off automatically in given time
* FN-Z - KBD backlight "toggles" when hotkey pressed
*/
- if (mode != -1 && toshiba->kbd_mode != mode) {
+ if (toshiba->kbd_mode != mode) {
time = toshiba->kbd_time << HCI_MISC_SHIFT;
time = time + toshiba->kbd_mode;
- if (toshiba_kbd_illum_status_set(toshiba, time) < 0)
- return -EIO;
+ ret = toshiba_kbd_illum_status_set(toshiba, time);
+ if (ret)
+ return ret;
toshiba->kbd_mode = mode;
}
@@ -1857,9 +1863,16 @@
{
struct toshiba_acpi_dev *dev = acpi_driver_data(to_acpi_device(device));
u32 result;
+ acpi_status status;
- if (dev->hotkey_dev)
+ if (dev->hotkey_dev) {
+ status = acpi_evaluate_object(dev->acpi_dev->handle, "ENAB",
+ NULL, NULL);
+ if (ACPI_FAILURE(status))
+ pr_info("Unable to re-enable hotkeys\n");
+
hci_write1(dev, HCI_HOTKEY_EVENT, HCI_HOTKEY_ENABLE, &result);
+ }
return 0;
}
diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index b1cda6f..45e05b3 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -953,6 +953,7 @@
{ X86_VENDOR_INTEL, 6, 0x3a},/* Ivy Bridge */
{ X86_VENDOR_INTEL, 6, 0x3c},/* Haswell */
{ X86_VENDOR_INTEL, 6, 0x3d},/* Broadwell */
+ { X86_VENDOR_INTEL, 6, 0x3f},/* Haswell */
{ X86_VENDOR_INTEL, 6, 0x45},/* Haswell ULT */
/* TODO: Add more CPU IDs after testing */
{}
@@ -1166,11 +1167,10 @@
for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
/* use physical package id to read counters */
- if (!rapl_check_domain(cpu, i))
+ if (!rapl_check_domain(cpu, i)) {
rp->domain_map |= 1 << i;
- else
- pr_warn("RAPL domain %s detection failed\n",
- rapl_domain_names[i]);
+ pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
+ }
}
rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
if (!rp->nr_domains) {
diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c
index 2ead7e7..14ba80b 100644
--- a/drivers/s390/block/dasd_devmap.c
+++ b/drivers/s390/block/dasd_devmap.c
@@ -77,7 +77,7 @@
* strings when running as a module.
*/
static char *dasd[256];
-module_param_array(dasd, charp, NULL, 0);
+module_param_array(dasd, charp, NULL, S_IRUGO);
/*
* Single spinlock to protect devmap and servermap structures and lists.
diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 97ef37b..e7646ce 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -889,6 +889,7 @@
extern const struct attribute_group *qeth_osn_attr_groups[];
extern struct workqueue_struct *qeth_wq;
+int qeth_card_hw_is_reachable(struct qeth_card *);
const char *qeth_get_cardname_short(struct qeth_card *);
int qeth_realloc_buffer_pool(struct qeth_card *, int);
int qeth_core_load_discipline(struct qeth_card *, enum qeth_discipline_id);
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index c0d6ba8..fd22c81 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -73,6 +73,13 @@
struct workqueue_struct *qeth_wq;
EXPORT_SYMBOL_GPL(qeth_wq);
+int qeth_card_hw_is_reachable(struct qeth_card *card)
+{
+ return (card->state == CARD_STATE_SOFTSETUP) ||
+ (card->state == CARD_STATE_UP);
+}
+EXPORT_SYMBOL_GPL(qeth_card_hw_is_reachable);
+
static void qeth_close_dev_handler(struct work_struct *work)
{
struct qeth_card *card;
@@ -5790,6 +5797,7 @@
struct qeth_card *card = netdev->ml_priv;
enum qeth_link_types link_type;
struct carrier_info carrier_info;
+ int rc;
u32 speed;
if ((card->info.type == QETH_CARD_TYPE_IQD) || (card->info.guestlan))
@@ -5832,8 +5840,14 @@
/* Check if we can obtain more accurate information. */
/* If QUERY_CARD_INFO command is not supported or fails, */
/* just return the heuristics that was filled above. */
- if (qeth_query_card_info(card, &carrier_info) != 0)
+ if (!qeth_card_hw_is_reachable(card))
+ return -ENODEV;
+ rc = qeth_query_card_info(card, &carrier_info);
+ if (rc == -EOPNOTSUPP) /* for old hardware, return heuristic */
return 0;
+ if (rc) /* report error from the hardware operation */
+ return rc;
+ /* on success, fill in the information got from the hardware */
netdev_dbg(netdev,
"card info: card_type=0x%02x, port_mode=0x%04x, port_speed=0x%08x\n",
diff --git a/drivers/s390/net/qeth_l2_sys.c b/drivers/s390/net/qeth_l2_sys.c
index ae1bc04..59e3aa5 100644
--- a/drivers/s390/net/qeth_l2_sys.c
+++ b/drivers/s390/net/qeth_l2_sys.c
@@ -5,17 +5,12 @@
#include <linux/slab.h>
#include <asm/ebcdic.h>
+#include "qeth_core.h"
#include "qeth_l2.h"
#define QETH_DEVICE_ATTR(_id, _name, _mode, _show, _store) \
struct device_attribute dev_attr_##_id = __ATTR(_name, _mode, _show, _store)
-static int qeth_card_hw_is_reachable(struct qeth_card *card)
-{
- return (card->state == CARD_STATE_SOFTSETUP) ||
- (card->state == CARD_STATE_UP);
-}
-
static ssize_t qeth_bridge_port_role_state_show(struct device *dev,
struct device_attribute *attr, char *buf,
int show_state)
diff --git a/drivers/ssb/b43_pci_bridge.c b/drivers/ssb/b43_pci_bridge.c
index 19396dc..bed2fede 100644
--- a/drivers/ssb/b43_pci_bridge.c
+++ b/drivers/ssb/b43_pci_bridge.c
@@ -38,6 +38,7 @@
{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x432b) },
{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x432c) },
{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4350) },
+ { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4351) },
{ 0, },
};
MODULE_DEVICE_TABLE(pci, b43_pci_bridge_tbl);
diff --git a/fs/aio.c b/fs/aio.c
index 97bc62c..7337500 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -793,6 +793,8 @@
for (i = 0; i < table->nr; ++i) {
struct kioctx *ctx = table->table[i];
+ struct completion requests_done =
+ COMPLETION_INITIALIZER_ONSTACK(requests_done);
if (!ctx)
continue;
@@ -804,7 +806,10 @@
* that it needs to unmap the area, just set it to 0.
*/
ctx->mmap_size = 0;
- kill_ioctx(mm, ctx, NULL);
+ kill_ioctx(mm, ctx, &requests_done);
+
+ /* Wait until all IO for the context is done. */
+ wait_for_completion(&requests_done);
}
RCU_INIT_POINTER(mm->ioctx_table, NULL);
@@ -1111,6 +1116,12 @@
tail = ring->tail;
kunmap_atomic(ring);
+ /*
+ * Ensure that once we've read the current tail pointer, that
+ * we also see the events that were stored up to the tail.
+ */
+ smp_rmb();
+
pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
if (head == tail)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 90a3cdc..603e4eb 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3240,6 +3240,7 @@
&new.de, &new.inlined);
if (IS_ERR(new.bh)) {
retval = PTR_ERR(new.bh);
+ new.bh = NULL;
goto end_rename;
}
if (new.bh) {
@@ -3386,6 +3387,7 @@
&new.de, &new.inlined);
if (IS_ERR(new.bh)) {
retval = PTR_ERR(new.bh);
+ new.bh = NULL;
goto end_rename;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bb0e80f..1e43b90 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -575,6 +575,7 @@
bh = bclean(handle, sb, block);
if (IS_ERR(bh)) {
err = PTR_ERR(bh);
+ bh = NULL;
goto out;
}
overhead = ext4_group_overhead_blocks(sb, group);
@@ -603,6 +604,7 @@
bh = bclean(handle, sb, block);
if (IS_ERR(bh)) {
err = PTR_ERR(bh);
+ bh = NULL;
goto out;
}
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 214fe10..736a348 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -23,7 +23,7 @@
mounted as f2fs. Each file shows the whole f2fs information.
/sys/kernel/debug/f2fs/status includes:
- - major file system information managed by f2fs currently
+ - major filesystem information managed by f2fs currently
- average SIT information about whole segments
- current memory footprint consumed by f2fs.
@@ -68,6 +68,6 @@
bool "F2FS consistency checking feature"
depends on F2FS_FS
help
- Enables BUG_ONs which check the file system consistency in runtime.
+ Enables BUG_ONs which check the filesystem consistency in runtime.
If you want to improve the performance, say N.
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 6aeed5b..ec3b7a5 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -160,14 +160,11 @@
goto redirty_out;
if (wbc->for_reclaim)
goto redirty_out;
-
- /* Should not write any meta pages, if any IO error was occurred */
- if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
- goto no_write;
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto redirty_out;
f2fs_wait_on_page_writeback(page, META);
write_meta_page(sbi, page);
-no_write:
dec_page_count(sbi, F2FS_DIRTY_META);
unlock_page(page);
return 0;
@@ -348,7 +345,7 @@
return e ? true : false;
}
-static void release_dirty_inode(struct f2fs_sb_info *sbi)
+void release_dirty_inode(struct f2fs_sb_info *sbi)
{
struct ino_entry *e, *tmp;
int i;
@@ -446,8 +443,8 @@
struct f2fs_orphan_block *orphan_blk = NULL;
unsigned int nentries = 0;
unsigned short index;
- unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans +
- (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
+ unsigned short orphan_blocks =
+ (unsigned short)GET_ORPHAN_BLOCKS(sbi->n_orphans);
struct page *page = NULL;
struct ino_entry *orphan = NULL;
@@ -737,7 +734,7 @@
/*
* Freeze all the FS-operations for checkpoint.
*/
-static void block_operations(struct f2fs_sb_info *sbi)
+static int block_operations(struct f2fs_sb_info *sbi)
{
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
@@ -745,6 +742,7 @@
.for_reclaim = 0,
};
struct blk_plug plug;
+ int err = 0;
blk_start_plug(&plug);
@@ -754,11 +752,15 @@
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
f2fs_unlock_all(sbi);
sync_dirty_dir_inodes(sbi);
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto out;
+ }
goto retry_flush_dents;
}
/*
- * POR: we should ensure that there is no dirty node pages
+ * POR: we should ensure that there are no dirty node pages
* until finishing nat/sit flush.
*/
retry_flush_nodes:
@@ -767,9 +769,16 @@
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
up_write(&sbi->node_write);
sync_node_pages(sbi, 0, &wbc);
+ if (unlikely(f2fs_cp_error(sbi))) {
+ f2fs_unlock_all(sbi);
+ err = -EIO;
+ goto out;
+ }
goto retry_flush_nodes;
}
+out:
blk_finish_plug(&plug);
+ return err;
}
static void unblock_operations(struct f2fs_sb_info *sbi)
@@ -813,8 +822,11 @@
discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));
/* Flush all the NAT/SIT pages */
- while (get_pages(sbi, F2FS_DIRTY_META))
+ while (get_pages(sbi, F2FS_DIRTY_META)) {
sync_meta_pages(sbi, META, LONG_MAX);
+ if (unlikely(f2fs_cp_error(sbi)))
+ return;
+ }
next_free_nid(sbi, &last_nid);
@@ -825,7 +837,7 @@
ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
- for (i = 0; i < 3; i++) {
+ for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
ckpt->cur_node_segno[i] =
cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
ckpt->cur_node_blkoff[i] =
@@ -833,7 +845,7 @@
ckpt->alloc_type[i + CURSEG_HOT_NODE] =
curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
}
- for (i = 0; i < 3; i++) {
+ for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
ckpt->cur_data_segno[i] =
cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
ckpt->cur_data_blkoff[i] =
@@ -848,24 +860,23 @@
/* 2 cp + n data seg summary + orphan inode blocks */
data_sum_blocks = npages_for_summary_flush(sbi);
- if (data_sum_blocks < 3)
+ if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
else
clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
- orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1)
- / F2FS_ORPHANS_PER_BLOCK;
+ orphan_blocks = GET_ORPHAN_BLOCKS(sbi->n_orphans);
ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
orphan_blocks);
if (is_umount) {
set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
- ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+ ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
cp_payload_blks + data_sum_blocks +
orphan_blocks + NR_CURSEG_NODE_TYPE);
} else {
clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
- ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+ ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
cp_payload_blks + data_sum_blocks +
orphan_blocks);
}
@@ -924,6 +935,9 @@
/* wait for previous submitted node/meta pages writeback */
wait_on_all_pages_writeback(sbi);
+ if (unlikely(f2fs_cp_error(sbi)))
+ return;
+
filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
@@ -934,15 +948,17 @@
/* Here, we only have one bio having CP pack */
sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
- if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
- clear_prefree_segments(sbi);
- release_dirty_inode(sbi);
- F2FS_RESET_SB_DIRT(sbi);
- }
+ release_dirty_inode(sbi);
+
+ if (unlikely(f2fs_cp_error(sbi)))
+ return;
+
+ clear_prefree_segments(sbi);
+ F2FS_RESET_SB_DIRT(sbi);
}
/*
- * We guarantee that this checkpoint procedure should not fail.
+ * We guarantee that this checkpoint procedure will not fail.
*/
void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
{
@@ -952,7 +968,13 @@
trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops");
mutex_lock(&sbi->cp_mutex);
- block_operations(sbi);
+
+ if (!sbi->s_dirty)
+ goto out;
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto out;
+ if (block_operations(sbi))
+ goto out;
trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
@@ -976,9 +998,9 @@
do_checkpoint(sbi, is_umount);
unblock_operations(sbi);
- mutex_unlock(&sbi->cp_mutex);
-
stat_inc_cp_count(sbi->stat_info);
+out:
+ mutex_unlock(&sbi->cp_mutex);
trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
}
@@ -999,8 +1021,8 @@
* for cp pack we can have max 1020*504 orphan entries
*/
sbi->n_orphans = 0;
- sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE)
- * F2FS_ORPHANS_PER_BLOCK;
+ sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
+ NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK;
}
int __init create_checkpoint_caches(void)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 03313099..76de83e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -53,7 +53,7 @@
struct page *page = bvec->bv_page;
if (unlikely(err)) {
- SetPageError(page);
+ set_page_dirty(page);
set_bit(AS_EIO, &page->mapping->flags);
f2fs_stop_checkpoint(sbi);
}
@@ -691,7 +691,7 @@
allocated = true;
blkaddr = dn.data_blkaddr;
}
- /* Give more consecutive addresses for the read ahead */
+ /* Give more consecutive addresses for the readahead */
if (blkaddr == (bh_result->b_blocknr + ofs)) {
ofs++;
dn.ofs_in_node++;
@@ -739,7 +739,7 @@
trace_f2fs_readpage(page, DATA);
- /* If the file has inline data, try to read it directlly */
+ /* If the file has inline data, try to read it directly */
if (f2fs_has_inline_data(inode))
ret = f2fs_read_inline_data(inode, page);
else
@@ -836,10 +836,19 @@
/* Dentry blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode)) {
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto redirty_out;
err = do_write_data_page(page, &fio);
goto done;
}
+ /* we should bypass data pages to proceed the kworker jobs */
+ if (unlikely(f2fs_cp_error(sbi))) {
+ SetPageError(page);
+ unlock_page(page);
+ return 0;
+ }
+
if (!wbc->for_reclaim)
need_balance_fs = true;
else if (has_not_enough_free_secs(sbi, 0))
@@ -927,7 +936,7 @@
if (to > inode->i_size) {
truncate_pagecache(inode, inode->i_size);
- truncate_blocks(inode, inode->i_size);
+ truncate_blocks(inode, inode->i_size, true);
}
}
@@ -946,7 +955,7 @@
f2fs_balance_fs(sbi);
repeat:
- err = f2fs_convert_inline_data(inode, pos + len);
+ err = f2fs_convert_inline_data(inode, pos + len, NULL);
if (err)
goto fail;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a441ba3..fecebdb 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -32,7 +32,7 @@
struct f2fs_stat_info *si = F2FS_STAT(sbi);
int i;
- /* valid check of the segment numbers */
+ /* validation check of the segment numbers */
si->hit_ext = sbi->read_hit_ext;
si->total_ext = sbi->total_hit_ext;
si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
@@ -152,7 +152,7 @@
si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
- /* buld nm */
+ /* build nm */
si->base_mem += sizeof(struct f2fs_nm_info);
si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index bcf893c..155fb05 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -124,7 +124,7 @@
/*
* For the most part, it should be a bug when name_len is zero.
- * We stop here for figuring out where the bugs are occurred.
+ * We stop here for figuring out where the bugs have occurred.
*/
f2fs_bug_on(!de->name_len);
@@ -391,7 +391,7 @@
error:
/* once the failed inode becomes a bad inode, i_mode is S_IFREG */
truncate_inode_pages(&inode->i_data, 0);
- truncate_blocks(inode, 0);
+ truncate_blocks(inode, 0, false);
remove_dirty_dir_inode(inode);
remove_inode_page(inode);
return ERR_PTR(err);
@@ -563,7 +563,7 @@
}
/*
- * It only removes the dentry from the dentry page,corresponding name
+ * It only removes the dentry from the dentry page, corresponding name
* entry in name page does not need to be touched during deletion.
*/
void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4dab533..e921242 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -24,7 +24,7 @@
#define f2fs_bug_on(condition) BUG_ON(condition)
#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
#else
-#define f2fs_bug_on(condition)
+#define f2fs_bug_on(condition) WARN_ON(condition)
#define f2fs_down_write(x, y) down_write(x)
#endif
@@ -395,7 +395,7 @@
};
/*
- * The below are the page types of bios used in submti_bio().
+ * The below are the page types of bios used in submit_bio().
* The available types are:
* DATA User data pages. It operates as async mode.
* NODE Node pages. It operates as async mode.
@@ -470,7 +470,7 @@
struct list_head dir_inode_list; /* dir inode list */
spinlock_t dir_inode_lock; /* for dir inode list lock */
- /* basic file system units */
+ /* basic filesystem units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
unsigned int log_blocksize; /* log2 block size */
unsigned int blocksize; /* block size */
@@ -799,7 +799,7 @@
/*
* odd numbered checkpoint should at cp segment 0
- * and even segent must be at cp segment 1
+ * and even segment must be at cp segment 1
*/
if (!(ckpt_version & 1))
start_addr += sbi->blocks_per_seg;
@@ -1096,6 +1096,11 @@
return sb->s_flags & MS_RDONLY;
}
+static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
+{
+ return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+}
+
static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
{
set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
@@ -1117,7 +1122,7 @@
*/
int f2fs_sync_file(struct file *, loff_t, loff_t, int);
void truncate_data_blocks(struct dnode_of_data *);
-int truncate_blocks(struct inode *, u64);
+int truncate_blocks(struct inode *, u64, bool);
void f2fs_truncate(struct inode *);
int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
int f2fs_setattr(struct dentry *, struct iattr *);
@@ -1202,10 +1207,8 @@
bool alloc_nid(struct f2fs_sb_info *, nid_t *);
void alloc_nid_done(struct f2fs_sb_info *, nid_t);
void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
-void recover_node_page(struct f2fs_sb_info *, struct page *,
- struct f2fs_summary *, struct node_info *, block_t);
void recover_inline_xattr(struct inode *, struct page *);
-bool recover_xattr_data(struct inode *, struct page *, block_t);
+void recover_xattr_data(struct inode *, struct page *, block_t);
int recover_inode_page(struct f2fs_sb_info *, struct page *);
int restore_node_summary(struct f2fs_sb_info *, unsigned int,
struct f2fs_summary_block *);
@@ -1238,8 +1241,6 @@
void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
void recover_data_page(struct f2fs_sb_info *, struct page *,
struct f2fs_summary *, block_t, block_t);
-void rewrite_node_page(struct f2fs_sb_info *, struct page *,
- struct f2fs_summary *, block_t, block_t);
void allocate_data_block(struct f2fs_sb_info *, struct page *,
block_t, block_t *, struct f2fs_summary *, int);
void f2fs_wait_on_page_writeback(struct page *, enum page_type);
@@ -1262,6 +1263,7 @@
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
+void release_dirty_inode(struct f2fs_sb_info *);
bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
int acquire_orphan_inode(struct f2fs_sb_info *);
void release_orphan_inode(struct f2fs_sb_info *);
@@ -1439,8 +1441,8 @@
*/
bool f2fs_may_inline(struct inode *);
int f2fs_read_inline_data(struct inode *, struct page *);
-int f2fs_convert_inline_data(struct inode *, pgoff_t);
+int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *);
int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
void truncate_inline_data(struct inode *, u64);
-int recover_inline_data(struct inode *, struct page *);
+bool recover_inline_data(struct inode *, struct page *);
#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 208f1a9..060aee6 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -41,6 +41,11 @@
sb_start_pagefault(inode->i_sb);
+ /* force to convert with normal data indices */
+ err = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, page);
+ if (err)
+ goto out;
+
/* block allocation */
f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -110,6 +115,25 @@
return 1;
}
+static inline bool need_do_checkpoint(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ bool need_cp = false;
+
+ if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
+ need_cp = true;
+ else if (file_wrong_pino(inode))
+ need_cp = true;
+ else if (!space_for_roll_forward(sbi))
+ need_cp = true;
+ else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
+ need_cp = true;
+ else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
+ need_cp = true;
+
+ return need_cp;
+}
+
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
@@ -154,23 +178,12 @@
/* guarantee free sections for fsync */
f2fs_balance_fs(sbi);
- down_read(&fi->i_sem);
-
/*
* Both of fdatasync() and fsync() are able to be recovered from
* sudden-power-off.
*/
- if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
- need_cp = true;
- else if (file_wrong_pino(inode))
- need_cp = true;
- else if (!space_for_roll_forward(sbi))
- need_cp = true;
- else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
- need_cp = true;
- else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
- need_cp = true;
-
+ down_read(&fi->i_sem);
+ need_cp = need_do_checkpoint(inode);
up_read(&fi->i_sem);
if (need_cp) {
@@ -288,7 +301,7 @@
if (err && err != -ENOENT) {
goto fail;
} else if (err == -ENOENT) {
- /* direct node is not exist */
+ /* direct node does not exist */
if (whence == SEEK_DATA) {
pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
F2FS_I(inode));
@@ -417,7 +430,7 @@
f2fs_put_page(page, 1);
}
-int truncate_blocks(struct inode *inode, u64 from)
+int truncate_blocks(struct inode *inode, u64 from, bool lock)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
unsigned int blocksize = inode->i_sb->s_blocksize;
@@ -433,14 +446,16 @@
free_from = (pgoff_t)
((from + blocksize - 1) >> (sbi->log_blocksize));
- f2fs_lock_op(sbi);
+ if (lock)
+ f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
if (err) {
if (err == -ENOENT)
goto free_next;
- f2fs_unlock_op(sbi);
+ if (lock)
+ f2fs_unlock_op(sbi);
trace_f2fs_truncate_blocks_exit(inode, err);
return err;
}
@@ -458,7 +473,8 @@
f2fs_put_dnode(&dn);
free_next:
err = truncate_inode_blocks(inode, free_from);
- f2fs_unlock_op(sbi);
+ if (lock)
+ f2fs_unlock_op(sbi);
done:
/* lastly zero out the first data page */
truncate_partial_data_page(inode, from);
@@ -475,7 +491,7 @@
trace_f2fs_truncate(inode);
- if (!truncate_blocks(inode, i_size_read(inode))) {
+ if (!truncate_blocks(inode, i_size_read(inode), true)) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
}
@@ -533,7 +549,7 @@
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- err = f2fs_convert_inline_data(inode, attr->ia_size);
+ err = f2fs_convert_inline_data(inode, attr->ia_size, NULL);
if (err)
return err;
@@ -622,7 +638,7 @@
loff_t off_start, off_end;
int ret = 0;
- ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1);
+ ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
if (ret)
return ret;
@@ -678,7 +694,7 @@
if (ret)
return ret;
- ret = f2fs_convert_inline_data(inode, offset + len);
+ ret = f2fs_convert_inline_data(inode, offset + len, NULL);
if (ret)
return ret;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index d7947d9..943a31d 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -58,7 +58,7 @@
* 3. IO subsystem is idle by checking the # of requests in
* bdev's request list.
*
- * Note) We have to avoid triggering GCs too much frequently.
+ * Note) We have to avoid triggering GCs too frequently.
* Because it is possible that some segments can be
* invalidated soon after by user update or deletion.
* So, I'd like to wait some time to collect dirty segments.
@@ -222,7 +222,7 @@
u = (vblocks * 100) >> sbi->log_blocks_per_seg;
- /* Handle if the system time is changed by user */
+ /* Handle if the system time has been changed by the user */
if (mtime < sit_i->min_mtime)
sit_i->min_mtime = mtime;
if (mtime > sit_i->max_mtime)
@@ -593,7 +593,7 @@
if (phase == 2) {
inode = f2fs_iget(sb, dni.ino);
- if (IS_ERR(inode))
+ if (IS_ERR(inode) || is_bad_inode(inode))
continue;
start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
@@ -693,7 +693,7 @@
gc_more:
if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
goto stop;
- if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+ if (unlikely(f2fs_cp_error(sbi)))
goto stop;
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 5d5eb60..16f0b2b 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -91,7 +91,7 @@
block_t invalid_user_blocks = sbi->user_block_count -
written_block_count(sbi);
/*
- * Background GC is triggered with the following condition.
+ * Background GC is triggered with the following conditions.
* 1. There are a number of invalid blocks.
* 2. There is not enough free space.
*/
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 948d17bf..a844fcf 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -42,7 +42,8 @@
buf[1] += b1;
}
-static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num)
+static void str2hashbuf(const unsigned char *msg, size_t len,
+ unsigned int *buf, int num)
{
unsigned pad, val;
int i;
@@ -73,9 +74,9 @@
{
__u32 hash;
f2fs_hash_t f2fs_hash;
- const char *p;
+ const unsigned char *p;
__u32 in[8], buf[4];
- const char *name = name_info->name;
+ const unsigned char *name = name_info->name;
size_t len = name_info->len;
if ((len <= 2) && (name[0] == '.') &&
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 5beecce..3e8ecdf 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -68,7 +68,7 @@
static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
{
- int err;
+ int err = 0;
struct page *ipage;
struct dnode_of_data dn;
void *src_addr, *dst_addr;
@@ -86,6 +86,10 @@
goto out;
}
+ /* someone else converted inline_data already */
+ if (!f2fs_has_inline_data(inode))
+ goto out;
+
/*
* i_addr[0] is not used for inline data,
* so reserving new block will not destroy inline data
@@ -124,9 +128,10 @@
return err;
}
-int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
+int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size,
+ struct page *page)
{
- struct page *page;
+ struct page *new_page = page;
int err;
if (!f2fs_has_inline_data(inode))
@@ -134,17 +139,20 @@
else if (to_size <= MAX_INLINE_DATA)
return 0;
- page = grab_cache_page(inode->i_mapping, 0);
- if (!page)
- return -ENOMEM;
+ if (!page || page->index != 0) {
+ new_page = grab_cache_page(inode->i_mapping, 0);
+ if (!new_page)
+ return -ENOMEM;
+ }
- err = __f2fs_convert_inline_data(inode, page);
- f2fs_put_page(page, 1);
+ err = __f2fs_convert_inline_data(inode, new_page);
+ if (!page || page->index != 0)
+ f2fs_put_page(new_page, 1);
return err;
}
int f2fs_write_inline_data(struct inode *inode,
- struct page *page, unsigned size)
+ struct page *page, unsigned size)
{
void *src_addr, *dst_addr;
struct page *ipage;
@@ -199,7 +207,7 @@
f2fs_put_page(ipage, 1);
}
-int recover_inline_data(struct inode *inode, struct page *npage)
+bool recover_inline_data(struct inode *inode, struct page *npage)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct f2fs_inode *ri = NULL;
@@ -218,7 +226,7 @@
ri = F2FS_INODE(npage);
if (f2fs_has_inline_data(inode) &&
- ri && ri->i_inline & F2FS_INLINE_DATA) {
+ ri && (ri->i_inline & F2FS_INLINE_DATA)) {
process_inline:
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(IS_ERR(ipage));
@@ -230,7 +238,7 @@
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
- return -1;
+ return true;
}
if (f2fs_has_inline_data(inode)) {
@@ -242,10 +250,10 @@
clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
- } else if (ri && ri->i_inline & F2FS_INLINE_DATA) {
- truncate_blocks(inode, 0);
+ } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
+ truncate_blocks(inode, 0, false);
set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
goto process_inline;
}
- return 0;
+ return false;
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 27b0377..ee103fd 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -134,9 +134,7 @@
return 0;
out:
clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
+ iget_failed(inode);
alloc_nid_failed(sbi, ino);
return err;
}
@@ -229,7 +227,7 @@
f2fs_delete_entry(de, page, inode);
f2fs_unlock_op(sbi);
- /* In order to evict this inode, we set it dirty */
+ /* In order to evict this inode, we set it dirty */
mark_inode_dirty(inode);
fail:
trace_f2fs_unlink_exit(inode, err);
@@ -267,9 +265,7 @@
return err;
out:
clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
+ iget_failed(inode);
alloc_nid_failed(sbi, inode->i_ino);
return err;
}
@@ -308,9 +304,7 @@
out_fail:
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
+ iget_failed(inode);
alloc_nid_failed(sbi, inode->i_ino);
return err;
}
@@ -354,9 +348,7 @@
return 0;
out:
clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
+ iget_failed(inode);
alloc_nid_failed(sbi, inode->i_ino);
return err;
}
@@ -688,9 +680,7 @@
out:
f2fs_unlock_op(sbi);
clear_nlink(inode);
- unlock_new_inode(inode);
- make_bad_inode(inode);
- iput(inode);
+ iget_failed(inode);
alloc_nid_failed(sbi, inode->i_ino);
return err;
}
@@ -704,7 +694,6 @@
.mkdir = f2fs_mkdir,
.rmdir = f2fs_rmdir,
.mknod = f2fs_mknod,
- .rename = f2fs_rename,
.rename2 = f2fs_rename2,
.tmpfile = f2fs_tmpfile,
.getattr = f2fs_getattr,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index d3d90d2..4537819 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -237,7 +237,7 @@
nat_get_blkaddr(e) != NULL_ADDR &&
new_blkaddr == NEW_ADDR);
- /* increament version no as node is removed */
+ /* increment version no as node is removed */
if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
unsigned char version = nat_get_version(e);
nat_set_version(e, inc_node_version(version));
@@ -274,7 +274,7 @@
}
/*
- * This function returns always success
+ * This function always returns success
*/
void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
{
@@ -650,7 +650,7 @@
/* get indirect nodes in the path */
for (i = 0; i < idx + 1; i++) {
- /* refernece count'll be increased */
+ /* reference count'll be increased */
pages[i] = get_node_page(sbi, nid[i]);
if (IS_ERR(pages[i])) {
err = PTR_ERR(pages[i]);
@@ -823,22 +823,26 @@
*/
void remove_inode_page(struct inode *inode)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- struct page *page;
- nid_t ino = inode->i_ino;
struct dnode_of_data dn;
- page = get_node_page(sbi, ino);
- if (IS_ERR(page))
+ set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
+ if (get_dnode_of_data(&dn, 0, LOOKUP_NODE))
return;
- if (truncate_xattr_node(inode, page)) {
- f2fs_put_page(page, 1);
+ if (truncate_xattr_node(inode, dn.inode_page)) {
+ f2fs_put_dnode(&dn);
return;
}
- /* 0 is possible, after f2fs_new_inode() is failed */
+
+ /* remove potential inline_data blocks */
+ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode))
+ truncate_data_blocks_range(&dn, 1);
+
+ /* 0 is possible, after f2fs_new_inode() has failed */
f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1);
- set_new_dnode(&dn, inode, page, page, ino);
+
+ /* will put inode & node pages */
truncate_node(&dn);
}
@@ -1129,8 +1133,11 @@
set_fsync_mark(page, 0);
set_dentry_mark(page, 0);
}
- NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
- wrote++;
+
+ if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
+ unlock_page(page);
+ else
+ wrote++;
if (--wbc->nr_to_write == 0)
break;
@@ -1212,6 +1219,8 @@
if (unlikely(sbi->por_doing))
goto redirty_out;
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto redirty_out;
f2fs_wait_on_page_writeback(page, NODE);
@@ -1540,15 +1549,6 @@
kmem_cache_free(free_nid_slab, i);
}
-void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
- struct f2fs_summary *sum, struct node_info *ni,
- block_t new_blkaddr)
-{
- rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
- set_node_addr(sbi, ni, new_blkaddr, false);
- clear_node_page_dirty(page);
-}
-
void recover_inline_xattr(struct inode *inode, struct page *page)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -1557,40 +1557,33 @@
struct page *ipage;
struct f2fs_inode *ri;
- if (!f2fs_has_inline_xattr(inode))
- return;
-
- if (!IS_INODE(page))
- return;
-
- ri = F2FS_INODE(page);
- if (!(ri->i_inline & F2FS_INLINE_XATTR))
- return;
-
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(IS_ERR(ipage));
+ ri = F2FS_INODE(page);
+ if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
+ clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR);
+ goto update_inode;
+ }
+
dst_addr = inline_xattr_addr(ipage);
src_addr = inline_xattr_addr(page);
inline_size = inline_xattr_size(inode);
f2fs_wait_on_page_writeback(ipage, NODE);
memcpy(dst_addr, src_addr, inline_size);
-
+update_inode:
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
}
-bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
+void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
nid_t new_xnid = nid_of_node(page);
struct node_info ni;
- if (!f2fs_has_xattr_block(ofs_of_node(page)))
- return false;
-
/* 1: invalidate the previous xattr nid */
if (!prev_xnid)
goto recover_xnid;
@@ -1618,7 +1611,6 @@
set_node_addr(sbi, &ni, blkaddr, false);
update_inode_page(inode);
- return true;
}
int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
@@ -1637,7 +1629,7 @@
if (!ipage)
return -ENOMEM;
- /* Should not use this inode from free nid list */
+ /* Should not use this inode from free nid list */
remove_free_nid(NM_I(sbi), ino);
SetPageUptodate(ipage);
@@ -1651,6 +1643,7 @@
dst->i_blocks = cpu_to_le64(1);
dst->i_links = cpu_to_le32(1);
dst->i_xattr_nid = 0;
+ dst->i_inline = src->i_inline & F2FS_INLINE_XATTR;
new_ni = old_ni;
new_ni.ino = ino;
@@ -1659,13 +1652,14 @@
WARN_ON(1);
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
inc_valid_inode_count(sbi);
+ set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
return 0;
}
/*
* ra_sum_pages() merge contiguous pages into one bio and submit.
- * these pre-readed pages are alloced in bd_inode's mapping tree.
+ * these pre-read pages are allocated in bd_inode's mapping tree.
*/
static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
int start, int nrpages)
@@ -1709,7 +1703,7 @@
for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
nrpages = min(last_offset - i, bio_blocks);
- /* read ahead node pages */
+ /* readahead node pages */
nrpages = ra_sum_pages(sbi, pages, addr, nrpages);
if (!nrpages)
return -ENOMEM;
@@ -1967,7 +1961,7 @@
nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
/* not used nids: 0, node, meta, (and root counted as valid node) */
- nm_i->available_nids = nm_i->max_nid - 3;
+ nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM;
nm_i->fcnt = 0;
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index fe1c6d9..756c41c 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -62,8 +62,10 @@
}
retry:
de = f2fs_find_entry(dir, &name, &page);
- if (de && inode->i_ino == le32_to_cpu(de->ino))
+ if (de && inode->i_ino == le32_to_cpu(de->ino)) {
+ clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
goto out_unmap_put;
+ }
if (de) {
einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
if (IS_ERR(einode)) {
@@ -300,14 +302,19 @@
struct node_info ni;
int err = 0, recovered = 0;
- recover_inline_xattr(inode, page);
+ /* step 1: recover xattr */
+ if (IS_INODE(page)) {
+ recover_inline_xattr(inode, page);
+ } else if (f2fs_has_xattr_block(ofs_of_node(page))) {
+ recover_xattr_data(inode, page, blkaddr);
+ goto out;
+ }
+ /* step 2: recover inline data */
if (recover_inline_data(inode, page))
goto out;
- if (recover_xattr_data(inode, page, blkaddr))
- goto out;
-
+ /* step 3: recover data indices */
start = start_bidx_of_node(ofs_of_node(page), fi);
end = start + ADDRS_PER_PAGE(page, fi);
@@ -364,8 +371,6 @@
fill_node_footer(dn.node_page, dn.nid, ni.ino,
ofs_of_node(page), false);
set_page_dirty(dn.node_page);
-
- recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
err:
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
@@ -452,6 +457,9 @@
/* step #1: find fsynced inode numbers */
sbi->por_doing = true;
+ /* prevent checkpoint */
+ mutex_lock(&sbi->cp_mutex);
+
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
err = find_fsync_dnodes(sbi, &inode_list);
@@ -465,7 +473,8 @@
/* step #2: recover data */
err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
- f2fs_bug_on(!list_empty(&inode_list));
+ if (!err)
+ f2fs_bug_on(!list_empty(&inode_list));
out:
destroy_fsync_dnodes(&inode_list);
kmem_cache_destroy(fsync_entry_slab);
@@ -482,8 +491,13 @@
/* Flush all the NAT/SIT pages */
while (get_pages(sbi, F2FS_DIRTY_META))
sync_meta_pages(sbi, META, LONG_MAX);
+ set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+ mutex_unlock(&sbi->cp_mutex);
} else if (need_writecp) {
+ mutex_unlock(&sbi->cp_mutex);
write_checkpoint(sbi, false);
+ } else {
+ mutex_unlock(&sbi->cp_mutex);
}
return err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0dfeeba..0aa337c 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -62,7 +62,7 @@
}
/*
- * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue
+ * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
* f2fs_set_bit makes MSB and LSB reversed in a byte.
* Example:
* LSB <--> MSB
@@ -808,7 +808,7 @@
}
/*
- * This function always allocates a used segment (from dirty seglist) by SSR
+ * This function always allocates a used segment (from dirty seglist) by SSR
* manner, so it should recover the existing segment information of valid blocks
*/
static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
@@ -1103,55 +1103,6 @@
mutex_unlock(&curseg->curseg_mutex);
}
-void rewrite_node_page(struct f2fs_sb_info *sbi,
- struct page *page, struct f2fs_summary *sum,
- block_t old_blkaddr, block_t new_blkaddr)
-{
- struct sit_info *sit_i = SIT_I(sbi);
- int type = CURSEG_WARM_NODE;
- struct curseg_info *curseg;
- unsigned int segno, old_cursegno;
- block_t next_blkaddr = next_blkaddr_of_node(page);
- unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
- struct f2fs_io_info fio = {
- .type = NODE,
- .rw = WRITE_SYNC,
- };
-
- curseg = CURSEG_I(sbi, type);
-
- mutex_lock(&curseg->curseg_mutex);
- mutex_lock(&sit_i->sentry_lock);
-
- segno = GET_SEGNO(sbi, new_blkaddr);
- old_cursegno = curseg->segno;
-
- /* change the current segment */
- if (segno != curseg->segno) {
- curseg->next_segno = segno;
- change_curseg(sbi, type, true);
- }
- curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
- __add_sum_entry(sbi, type, sum);
-
- /* change the current log to the next block addr in advance */
- if (next_segno != segno) {
- curseg->next_segno = next_segno;
- change_curseg(sbi, type, true);
- }
- curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr);
-
- /* rewrite node page */
- set_page_writeback(page);
- f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
- f2fs_submit_merged_bio(sbi, NODE, WRITE);
- refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
- locate_dirty_segment(sbi, old_cursegno);
-
- mutex_unlock(&sit_i->sentry_lock);
- mutex_unlock(&curseg->curseg_mutex);
-}
-
static inline bool is_merged_page(struct f2fs_sb_info *sbi,
struct page *page, enum page_type type)
{
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 55973f7..ff48325 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -549,7 +549,7 @@
}
/*
- * Summary block is always treated as invalid block
+ * Summary block is always treated as an invalid block
*/
static inline void check_block_count(struct f2fs_sb_info *sbi,
int segno, struct f2fs_sit_entry *raw_sit)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 657582f..41bdf51 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -432,9 +432,15 @@
stop_gc_thread(sbi);
/* We don't need to do checkpoint when it's clean */
- if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES))
+ if (sbi->s_dirty)
write_checkpoint(sbi, true);
+ /*
+ * normally the superblock is clean, so we need to release this.
+ * In addition, EIO will skip the checkpoint, so we need this as well.
+ */
+ release_dirty_inode(sbi);
+
iput(sbi->node_inode);
iput(sbi->meta_inode);
@@ -457,9 +463,6 @@
trace_f2fs_sync_fs(sb, sync);
- if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
- return 0;
-
if (sync) {
mutex_lock(&sbi->gc_mutex);
write_checkpoint(sbi, false);
@@ -505,8 +508,8 @@
buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
buf->f_bavail = user_block_count - valid_user_blocks(sbi);
- buf->f_files = sbi->total_node_count;
- buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi);
+ buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
+ buf->f_ffree = buf->f_files - valid_inode_count(sbi);
buf->f_namelen = F2FS_NAME_LEN;
buf->f_fsid.val[0] = (u32)id;
@@ -663,7 +666,7 @@
if (need_restart_gc) {
if (start_gc_thread(sbi))
f2fs_msg(sbi->sb, KERN_WARNING,
- "background gc thread is stop");
+ "background gc thread has stopped");
} else if (need_stop_gc) {
stop_gc_thread(sbi);
}
@@ -812,7 +815,7 @@
if (unlikely(fsmeta >= total))
return 1;
- if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
+ if (unlikely(f2fs_cp_error(sbi))) {
f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
return 1;
}
@@ -899,8 +902,10 @@
struct buffer_head *raw_super_buf;
struct inode *root;
long err = -EINVAL;
+ bool retry = true;
int i;
+try_onemore:
/* allocate memory for f2fs-specific super block info */
sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
if (!sbi)
@@ -1080,9 +1085,11 @@
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
err = recover_fsync_data(sbi);
- if (err)
+ if (err) {
f2fs_msg(sb, KERN_ERR,
"Cannot recover all fsync data errno=%ld", err);
+ goto free_kobj;
+ }
}
/*
@@ -1123,6 +1130,13 @@
brelse(raw_super_buf);
free_sbi:
kfree(sbi);
+
+ /* give only one more chance */
+ if (retry) {
+ retry = 0;
+ shrink_dcache_sb(sb);
+ goto try_onemore;
+ }
return err;
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 8bea941..728a5dc 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -528,7 +528,7 @@
int free;
/*
* If value is NULL, it is remove operation.
- * In case of update operation, we caculate free.
+ * In case of update operation, we calculate free.
*/
free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr);
if (found)
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 8f27c93..ec9e082f 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -253,13 +253,11 @@
error = make_socks(serv, net);
if (error < 0)
- goto err_socks;
+ goto err_bind;
set_grace_period(net);
dprintk("lockd_up_net: per-net data created; net=%p\n", net);
return 0;
-err_socks:
- svc_rpcb_cleanup(serv, net);
err_bind:
ln->nlmsvc_users--;
return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index a01c773..ef42d9b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1217,6 +1217,11 @@
head.first->pprev = &head.first;
INIT_HLIST_HEAD(&unmounted);
+ /* undo decrements we'd done in umount_tree() */
+ hlist_for_each_entry(mnt, &head, mnt_hash)
+ if (mnt->mnt_ex_mountpoint.mnt)
+ mntget(mnt->mnt_ex_mountpoint.mnt);
+
up_write(&namespace_sem);
synchronize_rcu();
@@ -1253,6 +1258,9 @@
hlist_add_head(&p->mnt_hash, &tmp_list);
}
+ hlist_for_each_entry(p, &tmp_list, mnt_hash)
+ list_del_init(&p->mnt_child);
+
if (how)
propagate_umount(&tmp_list);
@@ -1263,9 +1271,9 @@
p->mnt_ns = NULL;
if (how < 2)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
- list_del_init(&p->mnt_child);
if (mnt_has_parent(p)) {
put_mountpoint(p->mnt_mp);
+ mnt_add_count(p->mnt_parent, -1);
/* move the reference to mountpoint into ->mnt_ex_mountpoint */
p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f9821ce..e94457c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2657,6 +2657,7 @@
struct xdr_stream *xdr = cd->xdr;
int start_offset = xdr->buf->len;
int cookie_offset;
+ u32 name_and_cookie;
int entry_bytes;
__be32 nfserr = nfserr_toosmall;
__be64 wire_offset;
@@ -2718,7 +2719,14 @@
cd->rd_maxcount -= entry_bytes;
if (!cd->rd_dircount)
goto fail;
- cd->rd_dircount--;
+ /*
+ * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
+ * let's always let through the first entry, at least:
+ */
+ name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8;
+ if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
+ goto fail;
+ cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
cd->cookie_offset = cookie_offset;
skip_entry:
cd->common.err = nfs_ok;
@@ -3321,6 +3329,10 @@
}
maxcount = min_t(int, maxcount-16, bytes_left);
+ /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
+ if (!readdir->rd_dircount)
+ readdir->rd_dircount = INT_MAX;
+
readdir->xdr = xdr;
readdir->rd_maxcount = maxcount;
readdir->common.err = 0;
diff --git a/fs/pnode.c b/fs/pnode.c
index 302bf22..aae331a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -381,6 +381,7 @@
* other children
*/
if (child && list_empty(&child->mnt_mounts)) {
+ list_del_init(&child->mnt_child);
hlist_del_init_rcu(&child->mnt_hash);
hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
}
diff --git a/fs/sync.c b/fs/sync.c
index b28d1dd..bdc729d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -65,7 +65,7 @@
return ret;
return __sync_filesystem(sb, 1);
}
-EXPORT_SYMBOL_GPL(sync_filesystem);
+EXPORT_SYMBOL(sync_filesystem);
static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 80c3502..b46ffa9 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -333,8 +333,7 @@
spin_lock_irq(&ctx->wqh.lock);
if (!timerfd_canceled(ctx)) {
ctx->ticks = ticks;
- if (ticks)
- wake_up_locked(&ctx->wqh);
+ wake_up_locked(&ctx->wqh);
} else
ret = -ECANCELED;
spin_unlock_irq(&ctx->wqh.lock);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7c580c9..be7d42c 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -902,9 +902,6 @@
invalidate_inode_buffers(inode);
clear_inode(inode);
- if (want_delete) {
- lock_ufs(inode->i_sb);
- ufs_free_inode (inode);
- unlock_ufs(inode->i_sb);
- }
+ if (want_delete)
+ ufs_free_inode(inode);
}
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 90d74b8..2df62a7 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -126,12 +126,12 @@
if (l > sb->s_blocksize)
goto out_notlocked;
- lock_ufs(dir->i_sb);
inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
err = PTR_ERR(inode);
if (IS_ERR(inode))
- goto out;
+ goto out_notlocked;
+ lock_ufs(dir->i_sb);
if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
/* slow symlink */
inode->i_op = &ufs_symlink_inode_operations;
@@ -181,13 +181,9 @@
struct inode * inode;
int err;
- lock_ufs(dir->i_sb);
- inode_inc_link_count(dir);
-
inode = ufs_new_inode(dir, S_IFDIR|mode);
- err = PTR_ERR(inode);
if (IS_ERR(inode))
- goto out_dir;
+ return PTR_ERR(inode);
inode->i_op = &ufs_dir_inode_operations;
inode->i_fop = &ufs_dir_operations;
@@ -195,6 +191,9 @@
inode_inc_link_count(inode);
+ lock_ufs(dir->i_sb);
+ inode_inc_link_count(dir);
+
err = ufs_make_empty(inode, dir);
if (err)
goto out_fail;
@@ -212,7 +211,6 @@
inode_dec_link_count(inode);
inode_dec_link_count(inode);
iput (inode);
-out_dir:
inode_dec_link_count(dir);
unlock_ufs(dir->i_sb);
goto out;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index de2d26d..86df952 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5424,7 +5424,7 @@
struct xfs_bmap_free *flist,
int num_exts)
{
- struct xfs_btree_cur *cur;
+ struct xfs_btree_cur *cur = NULL;
struct xfs_bmbt_rec_host *gotp;
struct xfs_bmbt_irec got;
struct xfs_bmbt_irec left;
@@ -5435,7 +5435,7 @@
int error = 0;
int i;
int whichfork = XFS_DATA_FORK;
- int logflags;
+ int logflags = 0;
xfs_filblks_t blockcount = 0;
int total_extents;
@@ -5478,16 +5478,11 @@
}
}
- /* We are going to change core inode */
- logflags = XFS_ILOG_CORE;
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
cur->bc_private.b.flist = flist;
cur->bc_private.b.flags = 0;
- } else {
- cur = NULL;
- logflags |= XFS_ILOG_DEXT;
}
/*
@@ -5545,11 +5540,14 @@
blockcount = left.br_blockcount +
got.br_blockcount;
xfs_iext_remove(ip, *current_ext, 1, 0);
+ logflags |= XFS_ILOG_CORE;
if (cur) {
error = xfs_btree_delete(cur, &i);
if (error)
goto del_cursor;
XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+ } else {
+ logflags |= XFS_ILOG_DEXT;
}
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
@@ -5575,6 +5573,7 @@
got.br_startoff = startoff;
}
+ logflags |= XFS_ILOG_CORE;
if (cur) {
error = xfs_bmbt_update(cur, got.br_startoff,
got.br_startblock,
@@ -5582,6 +5581,8 @@
got.br_state);
if (error)
goto del_cursor;
+ } else {
+ logflags |= XFS_ILOG_DEXT;
}
(*current_ext)++;
@@ -5597,6 +5598,7 @@
xfs_btree_del_cursor(cur,
error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
- xfs_trans_log_inode(tp, ip, logflags);
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
return error;
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 11e9b4c..b984647 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1753,11 +1753,72 @@
return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}
+/*
+ * This is basically a copy of __set_page_dirty_buffers() with one
+ * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
+ * dirty, we'll never be able to clean them because we don't write buffers
+ * beyond EOF, and that means we can't invalidate pages that span EOF
+ * that have been marked dirty. Further, the dirty state can leak into
+ * the file interior if the file is extended, resulting in all sorts of
+ * bad things happening as the state does not match the underlying data.
+ *
+ * XXX: this really indicates that bufferheads in XFS need to die. Warts like
+ * this only exist because of bufferheads and how the generic code manages them.
+ */
+STATIC int
+xfs_vm_set_page_dirty(
+ struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ loff_t end_offset;
+ loff_t offset;
+ int newly_dirty;
+
+ if (unlikely(!mapping))
+ return !TestSetPageDirty(page);
+
+ end_offset = i_size_read(inode);
+ offset = page_offset(page);
+
+ spin_lock(&mapping->private_lock);
+ if (page_has_buffers(page)) {
+ struct buffer_head *head = page_buffers(page);
+ struct buffer_head *bh = head;
+
+ do {
+ if (offset < end_offset)
+ set_buffer_dirty(bh);
+ bh = bh->b_this_page;
+ offset += 1 << inode->i_blkbits;
+ } while (bh != head);
+ }
+ newly_dirty = !TestSetPageDirty(page);
+ spin_unlock(&mapping->private_lock);
+
+ if (newly_dirty) {
+ /* sigh - __set_page_dirty() is static, so copy it here, too */
+ unsigned long flags;
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ if (page->mapping) { /* Race with truncate? */
+ WARN_ON_ONCE(!PageUptodate(page));
+ account_page_dirtied(page, mapping);
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ }
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ }
+ return newly_dirty;
+}
+
const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
.readpages = xfs_vm_readpages,
.writepage = xfs_vm_writepage,
.writepages = xfs_vm_writepages,
+ .set_page_dirty = xfs_vm_set_page_dirty,
.releasepage = xfs_vm_releasepage,
.invalidatepage = xfs_vm_invalidatepage,
.write_begin = xfs_vm_write_begin,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 2f1e30d..1707980 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1470,6 +1470,26 @@
start_fsb = XFS_B_TO_FSB(mp, offset + len);
shift_fsb = XFS_B_TO_FSB(mp, len);
+ /*
+ * Writeback the entire file and force remove any post-eof blocks. The
+ * writeback prevents changes to the extent list via concurrent
+ * writeback and the eofblocks trim prevents the extent shift algorithm
+ * from running into a post-eof delalloc extent.
+ *
+ * XXX: This is a temporary fix until the extent shift loop below is
+ * converted to use offsets and lookups within the ILOCK rather than
+ * carrying around the index into the extent list for the next
+ * iteration.
+ */
+ error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ if (error)
+ return error;
+ if (xfs_can_free_eofblocks(ip, true)) {
+ error = xfs_free_eofblocks(mp, ip, false);
+ if (error)
+ return error;
+ }
+
error = xfs_free_file_space(ip, offset, len);
if (error)
return error;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 076b170..de5368c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -291,12 +291,22 @@
if (inode->i_mapping->nrpages) {
ret = filemap_write_and_wait_range(
VFS_I(ip)->i_mapping,
- pos, -1);
+ pos, pos + size - 1);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
- truncate_pagecache_range(VFS_I(ip), pos, -1);
+
+ /*
+ * Invalidate whole pages. This can return an error if
+ * we fail to invalidate a page, but this should never
+ * happen on XFS. Warn if it does fail.
+ */
+ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ (pos + size - 1) >> PAGE_CACHE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
}
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
@@ -632,10 +642,19 @@
if (mapping->nrpages) {
ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- pos, -1);
+ pos, pos + count - 1);
if (ret)
goto out;
- truncate_pagecache_range(VFS_I(ip), pos, -1);
+ /*
+ * Invalidate whole pages. This can return an error if
+ * we fail to invalidate a page, but this should never
+ * happen on XFS. Warn if it does fail.
+ */
+ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ (pos + count - 1) >> PAGE_CACHE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
}
/*
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index bcfd808..c1c9de1 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -246,7 +246,6 @@
acpi_device_name device_name; /* Driver-determined */
acpi_device_class device_class; /* " */
union acpi_object *str_obj; /* unicode string for _STR method */
- unsigned long sun; /* _SUN */
};
#define acpi_device_bid(d) ((d)->pnp.bus_id)
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index 9c79e76..56d4d36 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -18,14 +18,100 @@
#include <asm/cmpxchg.h>
#include <asm/barrier.h>
+/*
+ * atomic_$op() - $op integer to atomic variable
+ * @i: integer value to $op
+ * @v: pointer to the atomic variable
+ *
+ * Atomically $ops @i to @v. Does not strictly guarantee a memory-barrier, use
+ * smp_mb__{before,after}_atomic().
+ */
+
+/*
+ * atomic_$op_return() - $op integer to atomic variable and returns the result
+ * @i: integer value to $op
+ * @v: pointer to the atomic variable
+ *
+ * Atomically $ops @i to @v. Does imply a full memory barrier.
+ */
+
#ifdef CONFIG_SMP
-/* Force people to define core atomics */
-# if !defined(atomic_add_return) || !defined(atomic_sub_return) || \
- !defined(atomic_clear_mask) || !defined(atomic_set_mask)
-# error "SMP requires a little arch-specific magic"
-# endif
+
+/* we can build all atomic primitives from cmpxchg */
+
+#define ATOMIC_OP(op, c_op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ int c, old; \
+ \
+ c = v->counter; \
+ while ((old = cmpxchg(&v->counter, c, c c_op i)) != c) \
+ c = old; \
+}
+
+#define ATOMIC_OP_RETURN(op, c_op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ int c, old; \
+ \
+ c = v->counter; \
+ while ((old = cmpxchg(&v->counter, c, c c_op i)) != c) \
+ c = old; \
+ \
+ return c c_op i; \
+}
+
+#else
+
+#include <linux/irqflags.h>
+
+#define ATOMIC_OP(op, c_op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ \
+ raw_local_irq_save(flags); \
+ v->counter = v->counter c_op i; \
+ raw_local_irq_restore(flags); \
+}
+
+#define ATOMIC_OP_RETURN(op, c_op) \
+static inline int atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ unsigned long flags; \
+ int ret; \
+ \
+ raw_local_irq_save(flags); \
+ ret = (v->counter = v->counter c_op i); \
+ raw_local_irq_restore(flags); \
+ \
+ return ret; \
+}
+
+#endif /* CONFIG_SMP */
+
+#ifndef atomic_add_return
+ATOMIC_OP_RETURN(add, +)
#endif
+#ifndef atomic_sub_return
+ATOMIC_OP_RETURN(sub, -)
+#endif
+
+#ifndef atomic_clear_mask
+ATOMIC_OP(and, &)
+#define atomic_clear_mask(i, v) atomic_and(~(i), (v))
+#endif
+
+#ifndef atomic_set_mask
+#define CONFIG_ARCH_HAS_ATOMIC_OR
+ATOMIC_OP(or, |)
+#define atomic_set_mask(i, v) atomic_or((i), (v))
+#endif
+
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
/*
* Atomic operations that C can't guarantee us. Useful for
* resource counting etc..
@@ -33,8 +119,6 @@
#define ATOMIC_INIT(i) { (i) }
-#ifdef __KERNEL__
-
/**
* atomic_read - read atomic variable
* @v: pointer of type atomic_t
@@ -56,52 +140,6 @@
#include <linux/irqflags.h>
-/**
- * atomic_add_return - add integer to atomic variable
- * @i: integer value to add
- * @v: pointer of type atomic_t
- *
- * Atomically adds @i to @v and returns the result
- */
-#ifndef atomic_add_return
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- unsigned long flags;
- int temp;
-
- raw_local_irq_save(flags); /* Don't trace it in an irqsoff handler */
- temp = v->counter;
- temp += i;
- v->counter = temp;
- raw_local_irq_restore(flags);
-
- return temp;
-}
-#endif
-
-/**
- * atomic_sub_return - subtract integer from atomic variable
- * @i: integer value to subtract
- * @v: pointer of type atomic_t
- *
- * Atomically subtracts @i from @v and returns the result
- */
-#ifndef atomic_sub_return
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- unsigned long flags;
- int temp;
-
- raw_local_irq_save(flags); /* Don't trace it in an irqsoff handler */
- temp = v->counter;
- temp -= i;
- v->counter = temp;
- raw_local_irq_restore(flags);
-
- return temp;
-}
-#endif
-
static inline int atomic_add_negative(int i, atomic_t *v)
{
return atomic_add_return(i, v) < 0;
@@ -139,49 +177,11 @@
static inline int __atomic_add_unless(atomic_t *v, int a, int u)
{
- int c, old;
- c = atomic_read(v);
- while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c)
- c = old;
- return c;
+ int c, old;
+ c = atomic_read(v);
+ while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c)
+ c = old;
+ return c;
}
-/**
- * atomic_clear_mask - Atomically clear bits in atomic variable
- * @mask: Mask of the bits to be cleared
- * @v: pointer of type atomic_t
- *
- * Atomically clears the bits set in @mask from @v
- */
-#ifndef atomic_clear_mask
-static inline void atomic_clear_mask(unsigned long mask, atomic_t *v)
-{
- unsigned long flags;
-
- mask = ~mask;
- raw_local_irq_save(flags); /* Don't trace it in a irqsoff handler */
- v->counter &= mask;
- raw_local_irq_restore(flags);
-}
-#endif
-
-/**
- * atomic_set_mask - Atomically set bits in atomic variable
- * @mask: Mask of the bits to be set
- * @v: pointer of type atomic_t
- *
- * Atomically sets the bits set in @mask in @v
- */
-#ifndef atomic_set_mask
-static inline void atomic_set_mask(unsigned int mask, atomic_t *v)
-{
- unsigned long flags;
-
- raw_local_irq_save(flags); /* Don't trace it in a irqsoff handler */
- v->counter |= mask;
- raw_local_irq_restore(flags);
-}
-#endif
-
-#endif /* __KERNEL__ */
#endif /* __ASM_GENERIC_ATOMIC_H */
diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
index b18ce4f..30ad9c8 100644
--- a/include/asm-generic/atomic64.h
+++ b/include/asm-generic/atomic64.h
@@ -20,10 +20,22 @@
extern long long atomic64_read(const atomic64_t *v);
extern void atomic64_set(atomic64_t *v, long long i);
-extern void atomic64_add(long long a, atomic64_t *v);
-extern long long atomic64_add_return(long long a, atomic64_t *v);
-extern void atomic64_sub(long long a, atomic64_t *v);
-extern long long atomic64_sub_return(long long a, atomic64_t *v);
+
+#define ATOMIC64_OP(op) \
+extern void atomic64_##op(long long a, atomic64_t *v);
+
+#define ATOMIC64_OP_RETURN(op) \
+extern long long atomic64_##op##_return(long long a, atomic64_t *v);
+
+#define ATOMIC64_OPS(op) ATOMIC64_OP(op) ATOMIC64_OP_RETURN(op)
+
+ATOMIC64_OPS(add)
+ATOMIC64_OPS(sub)
+
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
+
extern long long atomic64_dec_if_positive(atomic64_t *v);
extern long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n);
extern long long atomic64_xchg(atomic64_t *v, long long new);
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index fef3a80..5b08a85 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -3,42 +3,6 @@
#define _LINUX_ATOMIC_H
#include <asm/atomic.h>
-/*
- * Provide __deprecated wrappers for the new interface, avoid flag day changes.
- * We need the ugly external functions to break header recursion hell.
- */
-#ifndef smp_mb__before_atomic_inc
-static inline void __deprecated smp_mb__before_atomic_inc(void)
-{
- extern void __smp_mb__before_atomic(void);
- __smp_mb__before_atomic();
-}
-#endif
-
-#ifndef smp_mb__after_atomic_inc
-static inline void __deprecated smp_mb__after_atomic_inc(void)
-{
- extern void __smp_mb__after_atomic(void);
- __smp_mb__after_atomic();
-}
-#endif
-
-#ifndef smp_mb__before_atomic_dec
-static inline void __deprecated smp_mb__before_atomic_dec(void)
-{
- extern void __smp_mb__before_atomic(void);
- __smp_mb__before_atomic();
-}
-#endif
-
-#ifndef smp_mb__after_atomic_dec
-static inline void __deprecated smp_mb__after_atomic_dec(void)
-{
- extern void __smp_mb__after_atomic(void);
- __smp_mb__after_atomic();
-}
-#endif
-
/**
* atomic_add_unless - add unless the number is already a given value
* @v: pointer of type atomic_t
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cbc5833..be5fd38 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -32,26 +32,6 @@
*/
#include <asm/bitops.h>
-/*
- * Provide __deprecated wrappers for the new interface, avoid flag day changes.
- * We need the ugly external functions to break header recursion hell.
- */
-#ifndef smp_mb__before_clear_bit
-static inline void __deprecated smp_mb__before_clear_bit(void)
-{
- extern void __smp_mb__before_atomic(void);
- __smp_mb__before_atomic();
-}
-#endif
-
-#ifndef smp_mb__after_clear_bit
-static inline void __deprecated smp_mb__after_clear_bit(void)
-{
- extern void __smp_mb__after_atomic(void);
- __smp_mb__after_atomic();
-}
-#endif
-
#define for_each_set_bit(bit, addr, size) \
for ((bit) = find_first_bit((addr), (size)); \
(bit) < (size); \
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 6ff0b0b..08ed2b0 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -24,6 +24,9 @@
#define NULL_ADDR ((block_t)0) /* used as block_t addresses */
#define NEW_ADDR ((block_t)-1) /* used as block_t addresses */
+/* 0, 1(node nid), 2(meta nid) are reserved node id */
+#define F2FS_RESERVED_NODE_NUM 3
+
#define F2FS_ROOT_INO(sbi) (sbi->root_ino_num)
#define F2FS_NODE_INO(sbi) (sbi->node_ino_num)
#define F2FS_META_INO(sbi) (sbi->meta_ino_num)
@@ -87,6 +90,8 @@
#define CP_ORPHAN_PRESENT_FLAG 0x00000002
#define CP_UMOUNT_FLAG 0x00000001
+#define F2FS_CP_PACKS 2 /* # of checkpoint packs */
+
struct f2fs_checkpoint {
__le64 checkpoint_ver; /* checkpoint block version number */
__le64 user_block_count; /* # of user blocks */
@@ -123,6 +128,9 @@
*/
#define F2FS_ORPHANS_PER_BLOCK 1020
+#define GET_ORPHAN_BLOCKS(n) ((n + F2FS_ORPHANS_PER_BLOCK - 1) / \
+ F2FS_ORPHANS_PER_BLOCK)
+
struct f2fs_orphan_block {
__le32 ino[F2FS_ORPHANS_PER_BLOCK]; /* inode numbers */
__le32 reserved; /* reserved */
@@ -144,6 +152,7 @@
#define F2FS_NAME_LEN 255
#define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */
#define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */
+#define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */
#define ADDRS_PER_INODE(fi) addrs_per_inode(fi)
#define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */
#define NIDS_PER_BLOCK 1018 /* Node IDs in an Indirect Block */
@@ -163,8 +172,9 @@
#define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \
F2FS_INLINE_XATTR_ADDRS - 1))
-#define INLINE_DATA_OFFSET (PAGE_CACHE_SIZE - sizeof(struct node_footer) \
- - sizeof(__le32) * (DEF_ADDRS_PER_INODE + 5 - 1))
+#define INLINE_DATA_OFFSET (PAGE_CACHE_SIZE - sizeof(struct node_footer) -\
+ sizeof(__le32) * (DEF_ADDRS_PER_INODE + \
+ DEF_NIDS_PER_INODE - 1))
struct f2fs_inode {
__le16 i_mode; /* file mode */
@@ -194,7 +204,7 @@
__le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */
- __le32 i_nid[5]; /* direct(2), indirect(2),
+ __le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2),
double_indirect(1) node id */
} __packed;
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index c7e17de..12f146f 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -38,60 +38,32 @@
struct gpio_desc *__must_check __gpiod_get(struct device *dev,
const char *con_id,
enum gpiod_flags flags);
-#define __gpiod_get(dev, con_id, flags, ...) __gpiod_get(dev, con_id, flags)
-#define gpiod_get(varargs...) __gpiod_get(varargs, 0)
struct gpio_desc *__must_check __gpiod_get_index(struct device *dev,
const char *con_id,
unsigned int idx,
enum gpiod_flags flags);
-#define __gpiod_get_index(dev, con_id, index, flags, ...) \
- __gpiod_get_index(dev, con_id, index, flags)
-#define gpiod_get_index(varargs...) __gpiod_get_index(varargs, 0)
struct gpio_desc *__must_check __gpiod_get_optional(struct device *dev,
const char *con_id,
enum gpiod_flags flags);
-#define __gpiod_get_optional(dev, con_id, flags, ...) \
- __gpiod_get_optional(dev, con_id, flags)
-#define gpiod_get_optional(varargs...) __gpiod_get_optional(varargs, 0)
struct gpio_desc *__must_check __gpiod_get_index_optional(struct device *dev,
const char *con_id,
unsigned int index,
enum gpiod_flags flags);
-#define __gpiod_get_index_optional(dev, con_id, index, flags, ...) \
- __gpiod_get_index_optional(dev, con_id, index, flags)
-#define gpiod_get_index_optional(varargs...) \
- __gpiod_get_index_optional(varargs, 0)
-
void gpiod_put(struct gpio_desc *desc);
struct gpio_desc *__must_check __devm_gpiod_get(struct device *dev,
const char *con_id,
enum gpiod_flags flags);
-#define __devm_gpiod_get(dev, con_id, flags, ...) \
- __devm_gpiod_get(dev, con_id, flags)
-#define devm_gpiod_get(varargs...) __devm_gpiod_get(varargs, 0)
struct gpio_desc *__must_check __devm_gpiod_get_index(struct device *dev,
const char *con_id,
unsigned int idx,
enum gpiod_flags flags);
-#define __devm_gpiod_get_index(dev, con_id, index, flags, ...) \
- __devm_gpiod_get_index(dev, con_id, index, flags)
-#define devm_gpiod_get_index(varargs...) __devm_gpiod_get_index(varargs, 0)
struct gpio_desc *__must_check __devm_gpiod_get_optional(struct device *dev,
const char *con_id,
enum gpiod_flags flags);
-#define __devm_gpiod_get_optional(dev, con_id, flags, ...) \
- __devm_gpiod_get_optional(dev, con_id, flags)
-#define devm_gpiod_get_optional(varargs...) \
- __devm_gpiod_get_optional(varargs, 0)
struct gpio_desc *__must_check
__devm_gpiod_get_index_optional(struct device *dev, const char *con_id,
unsigned int index, enum gpiod_flags flags);
-#define __devm_gpiod_get_index_optional(dev, con_id, index, flags, ...) \
- __devm_gpiod_get_index_optional(dev, con_id, index, flags)
-#define devm_gpiod_get_index_optional(varargs...) \
- __devm_gpiod_get_index_optional(varargs, 0)
-
void devm_gpiod_put(struct device *dev, struct gpio_desc *desc);
int gpiod_get_direction(const struct gpio_desc *desc);
@@ -124,27 +96,31 @@
#else /* CONFIG_GPIOLIB */
-static inline struct gpio_desc *__must_check gpiod_get(struct device *dev,
- const char *con_id)
+static inline struct gpio_desc *__must_check __gpiod_get(struct device *dev,
+ const char *con_id,
+ enum gpiod_flags flags)
{
return ERR_PTR(-ENOSYS);
}
-static inline struct gpio_desc *__must_check gpiod_get_index(struct device *dev,
- const char *con_id,
- unsigned int idx)
+static inline struct gpio_desc *__must_check
+__gpiod_get_index(struct device *dev,
+ const char *con_id,
+ unsigned int idx,
+ enum gpiod_flags flags)
{
return ERR_PTR(-ENOSYS);
}
static inline struct gpio_desc *__must_check
-gpiod_get_optional(struct device *dev, const char *con_id)
+__gpiod_get_optional(struct device *dev, const char *con_id,
+ enum gpiod_flags flags)
{
return ERR_PTR(-ENOSYS);
}
static inline struct gpio_desc *__must_check
-gpiod_get_index_optional(struct device *dev, const char *con_id,
- unsigned int index)
+__gpiod_get_index_optional(struct device *dev, const char *con_id,
+ unsigned int index, enum gpiod_flags flags)
{
return ERR_PTR(-ENOSYS);
}
@@ -157,28 +133,33 @@
WARN_ON(1);
}
-static inline struct gpio_desc *__must_check devm_gpiod_get(struct device *dev,
- const char *con_id)
+static inline struct gpio_desc *__must_check
+__devm_gpiod_get(struct device *dev,
+ const char *con_id,
+ enum gpiod_flags flags)
{
return ERR_PTR(-ENOSYS);
}
static inline
-struct gpio_desc *__must_check devm_gpiod_get_index(struct device *dev,
- const char *con_id,
- unsigned int idx)
+struct gpio_desc *__must_check
+__devm_gpiod_get_index(struct device *dev,
+ const char *con_id,
+ unsigned int idx,
+ enum gpiod_flags flags)
{
return ERR_PTR(-ENOSYS);
}
static inline struct gpio_desc *__must_check
-devm_gpiod_get_optional(struct device *dev, const char *con_id)
+__devm_gpiod_get_optional(struct device *dev, const char *con_id,
+ enum gpiod_flags flags)
{
return ERR_PTR(-ENOSYS);
}
static inline struct gpio_desc *__must_check
-devm_gpiod_get_index_optional(struct device *dev, const char *con_id,
- unsigned int index)
+__devm_gpiod_get_index_optional(struct device *dev, const char *con_id,
+ unsigned int index, enum gpiod_flags flags)
{
return ERR_PTR(-ENOSYS);
}
@@ -303,9 +284,43 @@
return -EINVAL;
}
-
#endif /* CONFIG_GPIOLIB */
+/*
+ * Vararg-hacks! This is done to transition the kernel to always pass
+ * the options flags argument to the below functions. During a transition
+ * phase these vararg macros make both old-and-newstyle code compile,
+ * but when all calls to the elder API are removed, these should go away
+ * and the __gpiod_get() etc functions above be renamed just gpiod_get()
+ * etc.
+ */
+#define __gpiod_get(dev, con_id, flags, ...) __gpiod_get(dev, con_id, flags)
+#define gpiod_get(varargs...) __gpiod_get(varargs, 0)
+#define __gpiod_get_index(dev, con_id, index, flags, ...) \
+ __gpiod_get_index(dev, con_id, index, flags)
+#define gpiod_get_index(varargs...) __gpiod_get_index(varargs, 0)
+#define __gpiod_get_optional(dev, con_id, flags, ...) \
+ __gpiod_get_optional(dev, con_id, flags)
+#define gpiod_get_optional(varargs...) __gpiod_get_optional(varargs, 0)
+#define __gpiod_get_index_optional(dev, con_id, index, flags, ...) \
+ __gpiod_get_index_optional(dev, con_id, index, flags)
+#define gpiod_get_index_optional(varargs...) \
+ __gpiod_get_index_optional(varargs, 0)
+#define __devm_gpiod_get(dev, con_id, flags, ...) \
+ __devm_gpiod_get(dev, con_id, flags)
+#define devm_gpiod_get(varargs...) __devm_gpiod_get(varargs, 0)
+#define __devm_gpiod_get_index(dev, con_id, index, flags, ...) \
+ __devm_gpiod_get_index(dev, con_id, index, flags)
+#define devm_gpiod_get_index(varargs...) __devm_gpiod_get_index(varargs, 0)
+#define __devm_gpiod_get_optional(dev, con_id, flags, ...) \
+ __devm_gpiod_get_optional(dev, con_id, flags)
+#define devm_gpiod_get_optional(varargs...) \
+ __devm_gpiod_get_optional(varargs, 0)
+#define __devm_gpiod_get_index_optional(dev, con_id, index, flags, ...) \
+ __devm_gpiod_get_index_optional(dev, con_id, index, flags)
+#define devm_gpiod_get_index_optional(varargs...) \
+ __devm_gpiod_get_index_optional(varargs, 0)
+
#if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_GPIO_SYSFS)
int gpiod_export(struct gpio_desc *desc, bool direction_may_change);
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 784304b..98f923b6 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -8,28 +8,28 @@
* Copyright (C) 2011-2012 Peter Zijlstra <pzijlstr@redhat.com>
*
* Jump labels provide an interface to generate dynamic branches using
- * self-modifying code. Assuming toolchain and architecture support the result
- * of a "if (static_key_false(&key))" statement is a unconditional branch (which
+ * self-modifying code. Assuming toolchain and architecture support, the result
+ * of a "if (static_key_false(&key))" statement is an unconditional branch (which
* defaults to false - and the true block is placed out of line).
*
* However at runtime we can change the branch target using
* static_key_slow_{inc,dec}(). These function as a 'reference' count on the key
- * object and for as long as there are references all branches referring to
+ * object, and for as long as there are references all branches referring to
* that particular key will point to the (out of line) true block.
*
- * Since this relies on modifying code the static_key_slow_{inc,dec}() functions
+ * Since this relies on modifying code, the static_key_slow_{inc,dec}() functions
* must be considered absolute slow paths (machine wide synchronization etc.).
- * OTOH, since the affected branches are unconditional their runtime overhead
+ * OTOH, since the affected branches are unconditional, their runtime overhead
* will be absolutely minimal, esp. in the default (off) case where the total
* effect is a single NOP of appropriate size. The on case will patch in a jump
* to the out-of-line block.
*
- * When the control is directly exposed to userspace it is prudent to delay the
+ * When the control is directly exposed to userspace, it is prudent to delay the
* decrement to avoid high frequency code modifications which can (and do)
* cause significant performance degradation. Struct static_key_deferred and
* static_key_slow_dec_deferred() provide for this.
*
- * Lacking toolchain and or architecture support, it falls back to a simple
+ * Lacking toolchain and/or architecture support, jump labels fall back to a simple
* conditional branch.
*
* struct static_key my_key = STATIC_KEY_INIT_TRUE;
@@ -43,8 +43,7 @@
*
* Not initializing the key (static data is initialized to 0s anyway) is the
* same as using STATIC_KEY_INIT_FALSE.
- *
-*/
+ */
#include <linux/types.h>
#include <linux/compiler.h>
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 6a599dc..e436864 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -15,6 +15,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rwsem.h>
+#include <linux/timer.h>
#include <linux/workqueue.h>
struct device;
@@ -68,7 +69,7 @@
const char *default_trigger; /* Trigger to use */
unsigned long blink_delay_on, blink_delay_off;
- struct delayed_work blink_work;
+ struct timer_list blink_timer;
int blink_brightness;
struct work_struct set_brightness_work;
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 008388f9..b5a84b62 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -4,7 +4,7 @@
* Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*
- * see Documentation/lockdep-design.txt for more details.
+ * see Documentation/locking/lockdep-design.txt for more details.
*/
#ifndef __LINUX_LOCKDEP_H
#define __LINUX_LOCKDEP_H
@@ -478,16 +478,24 @@
* on the per lock-class debug mode:
*/
+/*
+ * Read states in the 2-bit held_lock:read field:
+ * 0: Exclusive lock
+ * 1: Shareable lock, cannot be recursively called
+ * 2: Shareable lock, can be recursively called
+ * 3: Shareable lock, cannot be recursively called except in interrupt context
+ */
#define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i)
#define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i)
#define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i)
+#define lock_acquire_shared_irecursive(l, s, t, n, i) lock_acquire(l, s, t, 3, 1, n, i)
#define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i)
#define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i)
#define spin_release(l, n, i) lock_release(l, n, i)
#define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i)
-#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i)
+#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_irecursive(l, s, t, NULL, i)
#define rwlock_release(l, n, i) lock_release(l, n, i)
#define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i)
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 071f6b2..511c6e0 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1196,6 +1196,9 @@
enum mlx4_net_trans_rule_id id);
int mlx4_hw_rule_sz(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id);
+int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr,
+ int port, int qpn, u16 prio, u64 *reg_id);
+
void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port,
int i, int val);
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 3083c53..c300db3 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -949,7 +949,7 @@
: 0;
}
-/**
+/*
* struct nand_sdr_timings - SDR NAND chip timings
*
* This struct defines the timing requirements of a SDR NAND chip.
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 8d5535c..cc31498 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -52,7 +52,7 @@
atomic_t count;
spinlock_t wait_lock;
struct list_head wait_list;
-#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
+#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
struct task_struct *owner;
#endif
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
@@ -133,7 +133,7 @@
/*
* See kernel/locking/mutex.c for detailed documentation of these APIs.
- * Also see Documentation/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.txt.
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3837739..c8e388e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3176,7 +3176,7 @@
}
/**
- * __dev_uc_unsync - Remove synchonized addresses from device
+ * __dev_uc_unsync - Remove synchronized addresses from device
* @dev: device to sync
* @unsync: function to call if address should be removed
*
@@ -3220,7 +3220,7 @@
}
/**
- * __dev_mc_unsync - Remove synchonized addresses from device
+ * __dev_mc_unsync - Remove synchronized addresses from device
* @dev: device to sync
* @unsync: function to call if address should be removed
*
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 2077489..2517ece 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -9,6 +9,7 @@
#include <linux/in6.h>
#include <linux/wait.h>
#include <linux/list.h>
+#include <linux/static_key.h>
#include <uapi/linux/netfilter.h>
#ifdef CONFIG_NETFILTER
static inline int NF_DROP_GETERR(int verdict)
@@ -99,9 +100,9 @@
extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
-#if defined(CONFIG_JUMP_LABEL)
-#include <linux/static_key.h>
+#ifdef HAVE_JUMP_LABEL
extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
+
static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook)
{
if (__builtin_constant_p(pf) &&
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 707617a..893a0d0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -52,6 +52,7 @@
#include <linux/atomic.h>
#include <linux/sysfs.h>
#include <linux/perf_regs.h>
+#include <linux/workqueue.h>
#include <asm/local.h>
struct perf_callchain_entry {
@@ -268,6 +269,7 @@
* enum perf_event_active_state - the states of a event
*/
enum perf_event_active_state {
+ PERF_EVENT_STATE_EXIT = -3,
PERF_EVENT_STATE_ERROR = -2,
PERF_EVENT_STATE_OFF = -1,
PERF_EVENT_STATE_INACTIVE = 0,
@@ -507,6 +509,9 @@
int nr_cgroups; /* cgroup evts */
int nr_branch_stack; /* branch_stack evt */
struct rcu_head rcu_head;
+
+ struct delayed_work orphans_remove;
+ bool orphans_remove_sched;
};
/*
@@ -604,6 +609,13 @@
u64 txn;
};
+/* default value for data source */
+#define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\
+ PERF_MEM_S(LVL, NA) |\
+ PERF_MEM_S(SNOOP, NA) |\
+ PERF_MEM_S(LOCK, NA) |\
+ PERF_MEM_S(TLB, NA))
+
static inline void perf_sample_data_init(struct perf_sample_data *data,
u64 addr, u64 period)
{
@@ -616,7 +628,7 @@
data->regs_user.regs = NULL;
data->stack_user_size = 0;
data->weight = 0;
- data->data_src.val = 0;
+ data->data_src.val = PERF_MEM_NA;
data->txn = 0;
}
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 7c1d252..ebc4c76 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -60,7 +60,7 @@
struct mutex lock;
struct dev_power_governor *gov;
struct work_struct power_off_work;
- char *name;
+ const char *name;
unsigned int in_progress; /* Number of devices being suspended now */
atomic_t sd_count; /* Number of subdomains with power "on" */
enum gpd_status status; /* Current state of the domain */
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index bbe03a1..4efa1ed 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -218,6 +218,8 @@
* @linear_min_sel: Minimal selector for starting linear mapping
* @fixed_uV: Fixed voltage of rails.
* @ramp_delay: Time to settle down after voltage change (unit: uV/us)
+ * @linear_ranges: A constant table of possible voltage ranges.
+ * @n_linear_ranges: Number of entries in the @linear_ranges table.
* @volt_table: Voltage mapping table (if table based mapping)
*
* @vsel_reg: Register for selector when using regulator_regmap_X_voltage_
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 730e638..0b08d05 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -85,6 +85,7 @@
* bootloader then it will be enabled when the constraints are
* applied.
* @apply_uV: Apply the voltage constraint when initialising.
+ * @ramp_disable: Disable ramp delay when initialising or when setting voltage.
*
* @input_uV: Input voltage for regulator when supplied by another regulator.
*
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 035d3c5..8f498cd 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -149,7 +149,7 @@
* static then another method for expressing nested locking is
* the explicit definition of lock class keys and the use of
* lockdep_set_class() at lock initialization time.
- * See Documentation/lockdep-design.txt for more details.)
+ * See Documentation/locking/lockdep-design.txt for more details.)
*/
extern void down_read_nested(struct rw_semaphore *sem, int subclass);
extern void down_write_nested(struct rw_semaphore *sem, int subclass);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c2c885..dd9eb48 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -645,6 +645,7 @@
* Live threads maintain their own counters and add to these
* in __exit_signal, except for the group leader.
*/
+ seqlock_t stats_lock;
cputime_t utime, stime, cutime, cstime;
cputime_t gtime;
cputime_t cgtime;
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 5d586a4..a19ddac 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -27,19 +27,23 @@
struct seccomp_filter *filter;
};
-extern int __secure_computing(int);
-static inline int secure_computing(int this_syscall)
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+extern int __secure_computing(void);
+static inline int secure_computing(void)
{
if (unlikely(test_thread_flag(TIF_SECCOMP)))
- return __secure_computing(this_syscall);
+ return __secure_computing();
return 0;
}
-/* A wrapper for architectures supporting only SECCOMP_MODE_STRICT. */
-static inline void secure_computing_strict(int this_syscall)
-{
- BUG_ON(secure_computing(this_syscall) != 0);
-}
+#define SECCOMP_PHASE1_OK 0
+#define SECCOMP_PHASE1_SKIP 1
+
+extern u32 seccomp_phase1(struct seccomp_data *sd);
+int seccomp_phase2(u32 phase1_result);
+#else
+extern void secure_computing_strict(int this_syscall);
+#endif
extern long prctl_get_seccomp(void);
extern long prctl_set_seccomp(unsigned long, char __user *);
@@ -56,8 +60,11 @@
struct seccomp { };
struct seccomp_filter { };
-static inline int secure_computing(int this_syscall) { return 0; }
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+static inline int secure_computing(void) { return 0; }
+#else
static inline void secure_computing_strict(int this_syscall) { return; }
+#endif
static inline long prctl_get_seccomp(void)
{
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 3f2867f..262ba4e 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -197,7 +197,13 @@
_raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map); \
} while (0)
#else
-# define raw_spin_lock_nested(lock, subclass) _raw_spin_lock(lock)
+/*
+ * Always evaluate the 'subclass' argument to avoid that the compiler
+ * warns about set-but-not-used variables when building with
+ * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1.
+ */
+# define raw_spin_lock_nested(lock, subclass) \
+ _raw_spin_lock(((void)(subclass), (lock)))
# define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock)
#endif
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 0590523..9a82c7d 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -183,13 +183,8 @@
extern void tick_nohz_init(void);
extern void __tick_nohz_full_check(void);
+extern void tick_nohz_full_kick(void);
extern void tick_nohz_full_kick_cpu(int cpu);
-
-static inline void tick_nohz_full_kick(void)
-{
- tick_nohz_full_kick_cpu(smp_processor_id());
-}
-
extern void tick_nohz_full_kick_all(void);
extern void __tick_nohz_task_switch(struct task_struct *tsk);
#else
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 6fb1ba5..034f6fc 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -280,9 +280,11 @@
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
- * The function returns 0 if the @timeout elapsed, or the remaining
- * jiffies (at least 1) if the @condition evaluated to %true before
- * the @timeout elapsed.
+ * Returns:
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * or the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed.
*/
#define wait_event_timeout(wq, condition, timeout) \
({ \
@@ -363,9 +365,11 @@
* change the result of the wait condition.
*
* Returns:
- * 0 if the @timeout elapsed, -%ERESTARTSYS if it was interrupted by
- * a signal, or the remaining jiffies (at least 1) if the @condition
- * evaluated to %true before the @timeout elapsed.
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
+ * interrupted by a signal.
*/
#define wait_event_interruptible_timeout(wq, condition, timeout) \
({ \
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index b5d5af3..6f884e6 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -464,6 +464,8 @@
HCI_AUTO_CONN_ALWAYS,
HCI_AUTO_CONN_LINK_LOSS,
} auto_connect;
+
+ struct hci_conn *conn;
};
extern struct list_head hci_dev_list;
diff --git a/include/net/netns/ieee802154_6lowpan.h b/include/net/netns/ieee802154_6lowpan.h
index e207096..8170f8d 100644
--- a/include/net/netns/ieee802154_6lowpan.h
+++ b/include/net/netns/ieee802154_6lowpan.h
@@ -16,7 +16,6 @@
struct netns_ieee802154_lowpan {
struct netns_sysctl_lowpan sysctl;
struct netns_frags frags;
- int max_dsize;
};
#endif
diff --git a/include/net/regulatory.h b/include/net/regulatory.h
index 2599924..dad7ab2 100644
--- a/include/net/regulatory.h
+++ b/include/net/regulatory.h
@@ -167,7 +167,7 @@
struct ieee80211_regdomain {
struct rcu_head rcu_head;
u32 n_reg_rules;
- char alpha2[2];
+ char alpha2[3];
enum nl80211_dfs_regions dfs_region;
struct ieee80211_reg_rule reg_rules[];
};
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index f6e7397..9fbd856 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -320,6 +320,19 @@
return asoc ? asoc->assoc_id : 0;
}
+static inline enum sctp_sstat_state
+sctp_assoc_to_state(const struct sctp_association *asoc)
+{
+ /* SCTP's uapi always had SCTP_EMPTY(=0) as a dummy state, but we
+ * got rid of it in kernel space. Therefore SCTP_CLOSED et al
+ * start at =1 in user space, but actually as =0 in kernel space.
+ * Now that we can not break user space and SCTP_EMPTY is exposed
+ * there, we need to fix it up with an ugly offset not to break
+ * applications. :(
+ */
+ return asoc->state + 1;
+}
+
/* Look up the association by its id. */
struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id);
diff --git a/include/net/sock.h b/include/net/sock.h
index 7f2ab72..b9a5bd0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2165,9 +2165,7 @@
*/
if (sock_flag(sk, SOCK_RCVTSTAMP) ||
(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
- (kt.tv64 &&
- (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE ||
- skb_shinfo(skb)->tx_flags & SKBTX_ANY_SW_TSTAMP)) ||
+ (kt.tv64 && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
(hwtstamps->hwtstamp.tv64 &&
(sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
__sock_recv_timestamp(msg, sk, skb);
diff --git a/include/net/wimax.h b/include/net/wimax.h
index e52ef53..c52b685 100644
--- a/include/net/wimax.h
+++ b/include/net/wimax.h
@@ -290,7 +290,7 @@
* This operation has to be synchronous, and return only when the
* reset is complete. In case of having had to resort to bus/cold
* reset implying a device disconnection, the call is allowed to
- * return inmediately.
+ * return immediately.
* NOTE: wimax_dev->mutex is NOT locked when this op is being
* called; however, wimax_dev->mutex_reset IS locked to ensure
* serialization of calls to wimax_reset().
diff --git a/include/sound/soc.h b/include/sound/soc.h
index be6ecae..c83a334 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -277,7 +277,7 @@
.access = SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE | \
SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK, \
.tlv.c = (snd_soc_bytes_tlv_callback), \
- .info = snd_soc_info_bytes_ext, \
+ .info = snd_soc_bytes_info_ext, \
.private_value = (unsigned long)&(struct soc_bytes_ext) \
{.max = xcount, .get = xhandler_get, .put = xhandler_put, } }
#define SOC_SINGLE_XR_SX(xname, xregbase, xregcount, xnbits, \
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 1c09820..3608beb 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -107,7 +107,7 @@
* @vec_nr: softirq vector number
*
* When used in combination with the softirq_exit tracepoint
- * we can determine the softirq handler runtine.
+ * we can determine the softirq handler routine.
*/
DEFINE_EVENT(softirq, softirq_entry,
@@ -121,7 +121,7 @@
* @vec_nr: softirq vector number
*
* When used in combination with the softirq_entry tracepoint
- * we can determine the softirq handler runtine.
+ * we can determine the softirq handler routine.
*/
DEFINE_EVENT(softirq, softirq_exit,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7dc8788..940aced 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1035,6 +1035,11 @@
css_get(&cgrp->self);
}
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+ return css_tryget(&cgrp->self);
+}
+
static void cgroup_put(struct cgroup *cgrp)
{
css_put(&cgrp->self);
@@ -1147,7 +1152,8 @@
* protection against removal. Ensure @cgrp stays accessible and
* break the active_ref protection.
*/
- cgroup_get(cgrp);
+ if (!cgroup_tryget(cgrp))
+ return NULL;
kernfs_break_active_protection(kn);
mutex_lock(&cgroup_mutex);
@@ -3271,8 +3277,17 @@
{
struct cftype *cft;
- for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
- cft->flags |= __CFTYPE_NOT_ON_DFL;
+ /*
+ * If legacy_files_on_dfl, we want to show the legacy files on the
+ * dfl hierarchy but iff the target subsystem hasn't been updated
+ * for the dfl hierarchy yet.
+ */
+ if (!cgroup_legacy_files_on_dfl ||
+ ss->dfl_cftypes != ss->legacy_cftypes) {
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_NOT_ON_DFL;
+ }
+
return cgroup_add_cftypes(ss, cfts);
}
@@ -4387,6 +4402,15 @@
/* cgroup release path */
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
+
+ /*
+ * There are two control paths which try to determine
+ * cgroup from dentry without going through kernfs -
+ * cgroupstats_build() and css_tryget_online_from_dir().
+ * Those are supported by RCU protecting clearing of
+ * cgrp->kn->priv backpointer.
+ */
+ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
}
mutex_unlock(&cgroup_mutex);
@@ -4543,6 +4567,11 @@
struct cftype *base_files;
int ssid, ret;
+ /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
+ */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
parent = cgroup_kn_lock_live(parent_kn);
if (!parent)
return -ENODEV;
@@ -4820,16 +4849,6 @@
cgroup_kn_unlock(kn);
- /*
- * There are two control paths which try to determine cgroup from
- * dentry without going through kernfs - cgroupstats_build() and
- * css_tryget_online_from_dir(). Those are supported by RCU
- * protecting clearing of cgrp->kn->priv backpointer, which should
- * happen after all files under it have been removed.
- */
- if (!ret)
- RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
-
cgroup_put(cgrp);
return ret;
}
@@ -5416,7 +5435,7 @@
/*
* This path doesn't originate from kernfs and @kn could already
* have been or be removed at any point. @kn->priv is RCU
- * protected for this access. See cgroup_rmdir() for details.
+ * protected for this access. See css_release_work_fn() for details.
*/
cgrp = rcu_dereference(kn->priv);
if (cgrp)
diff --git a/kernel/compat.c b/kernel/compat.c
index 633394f..ebb3c36 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -226,7 +226,7 @@
ret = hrtimer_nanosleep_restart(restart);
set_fs(oldfs);
- if (ret) {
+ if (ret == -ERESTART_RESTARTBLOCK) {
rmtp = restart->nanosleep.compat_rmtp;
if (rmtp && compat_put_timespec(&rmt, rmtp))
@@ -256,7 +256,26 @@
HRTIMER_MODE_REL, CLOCK_MONOTONIC);
set_fs(oldfs);
- if (ret) {
+ /*
+ * hrtimer_nanosleep() can only return 0 or
+ * -ERESTART_RESTARTBLOCK here because:
+ *
+ * - we call it with HRTIMER_MODE_REL and therefore exclude the
+ * -ERESTARTNOHAND return path.
+ *
+ * - we supply the rmtp argument from the task stack (due to
+ * the necessary compat conversion). So the update cannot
+ * fail, which excludes the -EFAULT return path as well. If
+ * it fails nevertheless we have a bigger problem and won't
+ * reach this place anymore.
+ *
+ * - if the return value is 0, we do not have to update rmtp
+ * because there is no remaining time.
+ *
+ * We check for -ERESTART_RESTARTBLOCK nevertheless if the
+ * core implementation decides to return random nonsense.
+ */
+ if (ret == -ERESTART_RESTARTBLOCK) {
struct restart_block *restart
= ¤t_thread_info()->restart_block;
@@ -266,7 +285,6 @@
if (rmtp && compat_put_timespec(&rmt, rmtp))
return -EFAULT;
}
-
return ret;
}
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 97b67df..f2a88de 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -52,7 +52,7 @@
struct callchain_cpus_entries *entries;
entries = callchain_cpus_entries;
- rcu_assign_pointer(callchain_cpus_entries, NULL);
+ RCU_INIT_POINTER(callchain_cpus_entries, NULL);
call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f9c1ed0..1212cc4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -47,6 +47,8 @@
#include <asm/irq_regs.h>
+static struct workqueue_struct *perf_wq;
+
struct remote_function_call {
struct task_struct *p;
int (*func)(void *info);
@@ -120,6 +122,13 @@
return data.ret;
}
+#define EVENT_OWNER_KERNEL ((void *) -1)
+
+static bool is_kernel_event(struct perf_event *event)
+{
+ return event->owner == EVENT_OWNER_KERNEL;
+}
+
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP |\
@@ -1375,6 +1384,45 @@
perf_event__header_size(tmp);
}
+/*
+ * User event without the task.
+ */
+static bool is_orphaned_event(struct perf_event *event)
+{
+ return event && !is_kernel_event(event) && !event->owner;
+}
+
+/*
+ * Event has a parent but parent's task finished and it's
+ * alive only because of children holding reference.
+ */
+static bool is_orphaned_child(struct perf_event *event)
+{
+ return is_orphaned_event(event->parent);
+}
+
+static void orphans_remove_work(struct work_struct *work);
+
+static void schedule_orphans_remove(struct perf_event_context *ctx)
+{
+ if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
+ return;
+
+ if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
+ get_ctx(ctx);
+ ctx->orphans_remove_sched = true;
+ }
+}
+
+static int __init perf_workqueue_init(void)
+{
+ perf_wq = create_singlethread_workqueue("perf");
+ WARN(!perf_wq, "failed to create perf workqueue\n");
+ return perf_wq ? 0 : -1;
+}
+
+core_initcall(perf_workqueue_init);
+
static inline int
event_filter_match(struct perf_event *event)
{
@@ -1424,6 +1472,9 @@
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
+ if (is_orphaned_child(event))
+ schedule_orphans_remove(ctx);
+
perf_pmu_enable(event->pmu);
}
@@ -1524,6 +1575,11 @@
*/
if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
goto retry;
}
@@ -1726,6 +1782,9 @@
if (event->attr.exclusive)
cpuctx->exclusive = 1;
+ if (is_orphaned_child(event))
+ schedule_orphans_remove(ctx);
+
out:
perf_pmu_enable(event->pmu);
@@ -1967,6 +2026,11 @@
*/
if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
goto retry;
}
@@ -3068,6 +3132,7 @@
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
+ INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
}
static struct perf_event_context *
@@ -3313,16 +3378,12 @@
}
/*
- * Called when the last reference to the file is gone.
+ * Remove user event from the owner task.
*/
-static void put_event(struct perf_event *event)
+static void perf_remove_from_owner(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
struct task_struct *owner;
- if (!atomic_long_dec_and_test(&event->refcount))
- return;
-
rcu_read_lock();
owner = ACCESS_ONCE(event->owner);
/*
@@ -3355,6 +3416,20 @@
mutex_unlock(&owner->perf_event_mutex);
put_task_struct(owner);
}
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static void put_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+
+ if (!atomic_long_dec_and_test(&event->refcount))
+ return;
+
+ if (!is_kernel_event(event))
+ perf_remove_from_owner(event);
WARN_ON_ONCE(ctx->parent_ctx);
/*
@@ -3389,6 +3464,42 @@
return 0;
}
+/*
+ * Remove all orphaned events from the context.
+ */
+static void orphans_remove_work(struct work_struct *work)
+{
+ struct perf_event_context *ctx;
+ struct perf_event *event, *tmp;
+
+ ctx = container_of(work, struct perf_event_context,
+ orphans_remove.work);
+
+ mutex_lock(&ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
+ struct perf_event *parent_event = event->parent;
+
+ if (!is_orphaned_child(event))
+ continue;
+
+ perf_remove_from_context(event, true);
+
+ mutex_lock(&parent_event->child_mutex);
+ list_del_init(&event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ free_event(event);
+ put_event(parent_event);
+ }
+
+ raw_spin_lock_irq(&ctx->lock);
+ ctx->orphans_remove_sched = false;
+ raw_spin_unlock_irq(&ctx->lock);
+ mutex_unlock(&ctx->mutex);
+
+ put_ctx(ctx);
+}
+
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
@@ -3500,7 +3611,8 @@
* error state (i.e. because it was pinned but it couldn't be
* scheduled on to the CPU at some point).
*/
- if (event->state == PERF_EVENT_STATE_ERROR)
+ if ((event->state == PERF_EVENT_STATE_ERROR) ||
+ (event->state == PERF_EVENT_STATE_EXIT))
return 0;
if (count < event->read_size)
@@ -3527,7 +3639,12 @@
{
struct perf_event *event = file->private_data;
struct ring_buffer *rb;
- unsigned int events = POLL_HUP;
+ unsigned int events = POLLHUP;
+
+ poll_wait(file, &event->waitq, wait);
+
+ if (event->state == PERF_EVENT_STATE_EXIT)
+ return events;
/*
* Pin the event->rb by taking event->mmap_mutex; otherwise
@@ -3538,9 +3655,6 @@
if (rb)
events = atomic_xchg(&rb->poll, 0);
mutex_unlock(&event->mmap_mutex);
-
- poll_wait(file, &event->waitq, wait);
-
return events;
}
@@ -5804,7 +5918,7 @@
if (!hlist)
return;
- rcu_assign_pointer(swhash->swevent_hlist, NULL);
+ RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
kfree_rcu(hlist, rcu_head);
}
@@ -7387,6 +7501,9 @@
goto err;
}
+ /* Mark owner so we can distinguish it from user events. */
+ event->owner = EVENT_OWNER_KERNEL;
+
account_event(event);
ctx = find_get_context(event->pmu, task, cpu);
@@ -7507,6 +7624,9 @@
if (child_event->parent) {
sync_child_event(child_event, child);
free_event(child_event);
+ } else {
+ child_event->state = PERF_EVENT_STATE_EXIT;
+ perf_event_wakeup(child_event);
}
}
@@ -7710,7 +7830,8 @@
if (IS_ERR(child_event))
return child_event;
- if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ if (is_orphaned_event(parent_event) ||
+ !atomic_long_inc_not_zero(&parent_event->refcount)) {
free_event(child_event);
return NULL;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 32c58f7..fa09b86 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -115,32 +115,33 @@
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
- /*
- * Accumulate here the counters for all threads but the
- * group leader as they die, so they can be added into
- * the process-wide totals when those are taken.
- * The group leader stays around as a zombie as long
- * as there are other threads. When it gets reaped,
- * the exit.c code will add its counts into these totals.
- * We won't ever get here for the group leader, since it
- * will have been the last reference on the signal_struct.
- */
- task_cputime(tsk, &utime, &stime);
- sig->utime += utime;
- sig->stime += stime;
- sig->gtime += task_gtime(tsk);
- sig->min_flt += tsk->min_flt;
- sig->maj_flt += tsk->maj_flt;
- sig->nvcsw += tsk->nvcsw;
- sig->nivcsw += tsk->nivcsw;
- sig->inblock += task_io_get_inblock(tsk);
- sig->oublock += task_io_get_oublock(tsk);
- task_io_accounting_add(&sig->ioac, &tsk->ioac);
- sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
}
+ /*
+ * Accumulate here the counters for all threads but the group leader
+ * as they die, so they can be added into the process-wide totals
+ * when those are taken. The group leader stays around as a zombie as
+ * long as there are other threads. When it gets reaped, the exit.c
+ * code will add its counts into these totals. We won't ever get here
+ * for the group leader, since it will have been the last reference on
+ * the signal_struct.
+ */
+ task_cputime(tsk, &utime, &stime);
+ write_seqlock(&sig->stats_lock);
+ sig->utime += utime;
+ sig->stime += stime;
+ sig->gtime += task_gtime(tsk);
+ sig->min_flt += tsk->min_flt;
+ sig->maj_flt += tsk->maj_flt;
+ sig->nvcsw += tsk->nvcsw;
+ sig->nivcsw += tsk->nivcsw;
+ sig->inblock += task_io_get_inblock(tsk);
+ sig->oublock += task_io_get_oublock(tsk);
+ task_io_accounting_add(&sig->ioac, &tsk->ioac);
+ sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
__unhash_process(tsk, group_dead);
+ write_sequnlock(&sig->stats_lock);
/*
* Do this under ->siglock, we can race with another thread
@@ -1043,6 +1044,7 @@
spin_lock_irq(&p->real_parent->sighand->siglock);
psig = p->real_parent->signal;
sig = p->signal;
+ write_seqlock(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1065,6 +1067,7 @@
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
+ write_sequnlock(&psig->stats_lock);
spin_unlock_irq(&p->real_parent->sighand->siglock);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 0cf9cdb..9387ae8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1068,6 +1068,7 @@
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
+ seqlock_init(&sig->stats_lock);
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a2b28a2..6223fab 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -517,6 +517,7 @@
chip->irq_eoi(&desc->irq_data);
raw_spin_unlock(&desc->lock);
}
+EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
/**
* handle_edge_irq - edge type IRQ handler
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 88d0d44..420ba68 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3597,6 +3597,12 @@
raw_local_irq_save(flags);
check_flags(flags);
+ /*
+ * An interrupt recursive read in interrupt context can be considered
+ * to be the same as a recursive read from checking perspective.
+ */
+ if ((read == 3) && in_interrupt())
+ read = 2;
current->lockdep_recursion = 1;
trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
__lock_acquire(lock, subclass, trylock, read, check,
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 23e89c5..4d60986 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -56,9 +56,6 @@
* If the lock has already been acquired, then this will proceed to spin
* on this node->locked until the previous lock holder sets the node->locked
* in mcs_spin_unlock().
- *
- * We don't inline mcs_spin_lock() so that perf can correctly account for the
- * time spent in this lock function.
*/
static inline
void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index ae712b2..dadbf88 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -15,7 +15,7 @@
* by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
* and Sven Dietrich.
*
- * Also see Documentation/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.txt.
*/
#include <linux/mutex.h>
#include <linux/ww_mutex.h>
@@ -106,6 +106,92 @@
EXPORT_SYMBOL(mutex_lock);
#endif
+static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
+ struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
+
+ /*
+ * Naughty, using a different class will lead to undefined behavior!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+}
+
+/*
+ * after acquiring lock with fastpath or when we lost out in contested
+ * slowpath, set ctx and wake up any waiters so they can recheck.
+ *
+ * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
+ * as the fastpath and opportunistic spinning are disabled in that case.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ctx)
+{
+ unsigned long flags;
+ struct mutex_waiter *cur;
+
+ ww_mutex_lock_acquired(lock, ctx);
+
+ lock->ctx = ctx;
+
+ /*
+ * The lock->ctx update should be visible on all cores before
+ * the atomic read is done, otherwise contended waiters might be
+ * missed. The contended waiters will either see ww_ctx == NULL
+ * and keep spinning, or it will acquire wait_lock, add itself
+ * to waiter list and sleep.
+ */
+ smp_mb(); /* ^^^ */
+
+ /*
+ * Check if lock is contended, if not there is nobody to wake up
+ */
+ if (likely(atomic_read(&lock->base.count) == 0))
+ return;
+
+ /*
+ * Uh oh, we raced in fastpath, wake up everyone in this case,
+ * so they can see the new lock->ctx.
+ */
+ spin_lock_mutex(&lock->base.wait_lock, flags);
+ list_for_each_entry(cur, &lock->base.wait_list, list) {
+ debug_mutex_wake_waiter(&lock->base, cur);
+ wake_up_process(cur->task);
+ }
+ spin_unlock_mutex(&lock->base.wait_lock, flags);
+}
+
+
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
* In order to avoid a stampede of mutex spinners from acquiring the mutex
@@ -180,6 +266,129 @@
*/
return retval;
}
+
+/*
+ * Atomically try to take the lock when it is available
+ */
+static inline bool mutex_try_to_acquire(struct mutex *lock)
+{
+ return !mutex_is_locked(lock) &&
+ (atomic_cmpxchg(&lock->count, 1, 0) == 1);
+}
+
+/*
+ * Optimistic spinning.
+ *
+ * We try to spin for acquisition when we find that the lock owner
+ * is currently running on a (different) CPU and while we don't
+ * need to reschedule. The rationale is that if the lock owner is
+ * running, it is likely to release the lock soon.
+ *
+ * Since this needs the lock owner, and this mutex implementation
+ * doesn't track the owner atomically in the lock field, we need to
+ * track it non-atomically.
+ *
+ * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
+ * to serialize everything.
+ *
+ * The mutex spinners are queued up using MCS lock so that only one
+ * spinner can compete for the mutex. However, if mutex spinning isn't
+ * going to happen, there is no point in going through the lock/unlock
+ * overhead.
+ *
+ * Returns true when the lock was taken, otherwise false, indicating
+ * that we need to jump to the slowpath and sleep.
+ */
+static bool mutex_optimistic_spin(struct mutex *lock,
+ struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+{
+ struct task_struct *task = current;
+
+ if (!mutex_can_spin_on_owner(lock))
+ goto done;
+
+ if (!osq_lock(&lock->osq))
+ goto done;
+
+ while (true) {
+ struct task_struct *owner;
+
+ if (use_ww_ctx && ww_ctx->acquired > 0) {
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ /*
+ * If ww->ctx is set the contents are undefined, only
+ * by acquiring wait_lock there is a guarantee that
+ * they are not invalid when reading.
+ *
+ * As such, when deadlock detection needs to be
+ * performed the optimistic spinning cannot be done.
+ */
+ if (ACCESS_ONCE(ww->ctx))
+ break;
+ }
+
+ /*
+ * If there's an owner, wait for it to either
+ * release the lock or go to sleep.
+ */
+ owner = ACCESS_ONCE(lock->owner);
+ if (owner && !mutex_spin_on_owner(lock, owner))
+ break;
+
+ /* Try to acquire the mutex if it is unlocked. */
+ if (mutex_try_to_acquire(lock)) {
+ lock_acquired(&lock->dep_map, ip);
+
+ if (use_ww_ctx) {
+ struct ww_mutex *ww;
+ ww = container_of(lock, struct ww_mutex, base);
+
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
+ }
+
+ mutex_set_owner(lock);
+ osq_unlock(&lock->osq);
+ return true;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(task)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax_lowlatency();
+ }
+
+ osq_unlock(&lock->osq);
+done:
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock the mutex. This avoids getting
+ * scheduled out right after we obtained the mutex.
+ */
+ if (need_resched())
+ schedule_preempt_disabled();
+
+ return false;
+}
+#else
+static bool mutex_optimistic_spin(struct mutex *lock,
+ struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+{
+ return false;
+}
#endif
__visible __used noinline
@@ -277,91 +486,6 @@
return 0;
}
-static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
- struct ww_acquire_ctx *ww_ctx)
-{
-#ifdef CONFIG_DEBUG_MUTEXES
- /*
- * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
- * but released with a normal mutex_unlock in this call.
- *
- * This should never happen, always use ww_mutex_unlock.
- */
- DEBUG_LOCKS_WARN_ON(ww->ctx);
-
- /*
- * Not quite done after calling ww_acquire_done() ?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
-
- if (ww_ctx->contending_lock) {
- /*
- * After -EDEADLK you tried to
- * acquire a different ww_mutex? Bad!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
-
- /*
- * You called ww_mutex_lock after receiving -EDEADLK,
- * but 'forgot' to unlock everything else first?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
- ww_ctx->contending_lock = NULL;
- }
-
- /*
- * Naughty, using a different class will lead to undefined behavior!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
-#endif
- ww_ctx->acquired++;
-}
-
-/*
- * after acquiring lock with fastpath or when we lost out in contested
- * slowpath, set ctx and wake up any waiters so they can recheck.
- *
- * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
- * as the fastpath and opportunistic spinning are disabled in that case.
- */
-static __always_inline void
-ww_mutex_set_context_fastpath(struct ww_mutex *lock,
- struct ww_acquire_ctx *ctx)
-{
- unsigned long flags;
- struct mutex_waiter *cur;
-
- ww_mutex_lock_acquired(lock, ctx);
-
- lock->ctx = ctx;
-
- /*
- * The lock->ctx update should be visible on all cores before
- * the atomic read is done, otherwise contended waiters might be
- * missed. The contended waiters will either see ww_ctx == NULL
- * and keep spinning, or it will acquire wait_lock, add itself
- * to waiter list and sleep.
- */
- smp_mb(); /* ^^^ */
-
- /*
- * Check if lock is contended, if not there is nobody to wake up
- */
- if (likely(atomic_read(&lock->base.count) == 0))
- return;
-
- /*
- * Uh oh, we raced in fastpath, wake up everyone in this case,
- * so they can see the new lock->ctx.
- */
- spin_lock_mutex(&lock->base.wait_lock, flags);
- list_for_each_entry(cur, &lock->base.wait_list, list) {
- debug_mutex_wake_waiter(&lock->base, cur);
- wake_up_process(cur->task);
- }
- spin_unlock_mutex(&lock->base.wait_lock, flags);
-}
-
/*
* Lock a mutex (possibly interruptible), slowpath:
*/
@@ -378,104 +502,12 @@
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- /*
- * Optimistic spinning.
- *
- * We try to spin for acquisition when we find that the lock owner
- * is currently running on a (different) CPU and while we don't
- * need to reschedule. The rationale is that if the lock owner is
- * running, it is likely to release the lock soon.
- *
- * Since this needs the lock owner, and this mutex implementation
- * doesn't track the owner atomically in the lock field, we need to
- * track it non-atomically.
- *
- * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
- * to serialize everything.
- *
- * The mutex spinners are queued up using MCS lock so that only one
- * spinner can compete for the mutex. However, if mutex spinning isn't
- * going to happen, there is no point in going through the lock/unlock
- * overhead.
- */
- if (!mutex_can_spin_on_owner(lock))
- goto slowpath;
-
- if (!osq_lock(&lock->osq))
- goto slowpath;
-
- for (;;) {
- struct task_struct *owner;
-
- if (use_ww_ctx && ww_ctx->acquired > 0) {
- struct ww_mutex *ww;
-
- ww = container_of(lock, struct ww_mutex, base);
- /*
- * If ww->ctx is set the contents are undefined, only
- * by acquiring wait_lock there is a guarantee that
- * they are not invalid when reading.
- *
- * As such, when deadlock detection needs to be
- * performed the optimistic spinning cannot be done.
- */
- if (ACCESS_ONCE(ww->ctx))
- break;
- }
-
- /*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
- */
- owner = ACCESS_ONCE(lock->owner);
- if (owner && !mutex_spin_on_owner(lock, owner))
- break;
-
- /* Try to acquire the mutex if it is unlocked. */
- if (!mutex_is_locked(lock) &&
- (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
- lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx) {
- struct ww_mutex *ww;
- ww = container_of(lock, struct ww_mutex, base);
-
- ww_mutex_set_context_fastpath(ww, ww_ctx);
- }
-
- mutex_set_owner(lock);
- osq_unlock(&lock->osq);
- preempt_enable();
- return 0;
- }
-
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!owner && (need_resched() || rt_task(task)))
- break;
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- cpu_relax_lowlatency();
+ if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
+ /* got the lock, yay! */
+ preempt_enable();
+ return 0;
}
- osq_unlock(&lock->osq);
-slowpath:
- /*
- * If we fell out of the spin path because of need_resched(),
- * reschedule now, before we try-lock the mutex. This avoids getting
- * scheduled out right after we obtained the mutex.
- */
- if (need_resched())
- schedule_preempt_disabled();
-#endif
+
spin_lock_mutex(&lock->wait_lock, flags);
/*
@@ -679,15 +711,21 @@
* Release the lock, slowpath:
*/
static inline void
-__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
+__mutex_unlock_common_slowpath(struct mutex *lock, int nested)
{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
unsigned long flags;
/*
- * some architectures leave the lock unlocked in the fastpath failure
+ * As a performance measurement, release the lock before doing other
+ * wakeup related duties to follow. This allows other tasks to acquire
+ * the lock sooner, while still handling cleanups in past unlock calls.
+ * This can be done as we do not enforce strict equivalence between the
+ * mutex counter and wait_list.
+ *
+ *
+ * Some architectures leave the lock unlocked in the fastpath failure
* case, others need to leave it locked. In the later case we have to
- * unlock it here
+ * unlock it here - as the lock counter is currently 0 or negative.
*/
if (__mutex_slowpath_needs_to_unlock())
atomic_set(&lock->count, 1);
@@ -716,7 +754,9 @@
__visible void
__mutex_unlock_slowpath(atomic_t *lock_count)
{
- __mutex_unlock_common_slowpath(lock_count, 1);
+ struct mutex *lock = container_of(lock_count, struct mutex, count);
+
+ __mutex_unlock_common_slowpath(lock, 1);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 4115fbf..5cda397 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -16,7 +16,7 @@
#define mutex_remove_waiter(lock, waiter, ti) \
__list_del((waiter)->list.prev, (waiter)->list.next)
-#ifdef CONFIG_SMP
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
static inline void mutex_set_owner(struct mutex *lock)
{
lock->owner = current;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a0ea2a1..7c98873 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,7 +8,7 @@
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
*
- * See Documentation/rt-mutex-design.txt for details.
+ * See Documentation/locking/rt-mutex-design.txt for details.
*/
#include <linux/spinlock.h>
#include <linux/export.h>
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 6815171..b8120ab 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -36,7 +36,7 @@
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
-static noinline int __down_timeout(struct semaphore *sem, long jiffies);
+static noinline int __down_timeout(struct semaphore *sem, long timeout);
static noinline void __up(struct semaphore *sem);
/**
@@ -145,14 +145,14 @@
/**
* down_timeout - acquire the semaphore within a specified time
* @sem: the semaphore to be acquired
- * @jiffies: how long to wait before failing
+ * @timeout: how long to wait before failing
*
* Attempts to acquire the semaphore. If no more tasks are allowed to
* acquire the semaphore, calling this function will put the task to sleep.
* If the semaphore is not released within the specified number of jiffies,
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
*/
-int down_timeout(struct semaphore *sem, long jiffies)
+int down_timeout(struct semaphore *sem, long timeout)
{
unsigned long flags;
int result = 0;
@@ -161,7 +161,7 @@
if (likely(sem->count > 0))
sem->count--;
else
- result = __down_timeout(sem, jiffies);
+ result = __down_timeout(sem, timeout);
raw_spin_unlock_irqrestore(&sem->lock, flags);
return result;
@@ -248,9 +248,9 @@
return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
}
-static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies)
+static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
{
- return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies);
+ return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
static noinline void __sched __up(struct semaphore *sem)
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 5d49dca..2df883a 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -179,6 +179,7 @@
#ifdef CONFIG_SUSPEND
/* kernel/power/suspend.c */
+extern const char *pm_labels[];
extern const char *pm_states[];
extern int suspend_devices_and_enter(suspend_state_t state);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6dadb25..18c6219 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,7 +31,7 @@
#include "power.h"
-static const char *pm_labels[] = { "mem", "standby", "freeze", };
+const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
const char *pm_states[PM_SUSPEND_MAX];
static const struct platform_suspend_ops *suspend_ops;
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 2f52492..bd91bc1 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -129,20 +129,20 @@
* at startup time. They're normally disabled, for faster boot and because
* we can't know which states really work on this particular system.
*/
-static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+static const char *test_state_label __initdata;
static char warn_bad_state[] __initdata =
KERN_WARNING "PM: can't test '%s' suspend state\n";
static int __init setup_test_suspend(char *value)
{
- suspend_state_t i;
+ int i;
/* "=mem" ==> "mem" */
value++;
- for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
- if (!strcmp(pm_states[i], value)) {
- test_state = i;
+ for (i = 0; pm_labels[i]; i++)
+ if (!strcmp(pm_labels[i], value)) {
+ test_state_label = pm_labels[i];
return 0;
}
@@ -158,13 +158,21 @@
struct rtc_device *rtc = NULL;
struct device *dev;
+ suspend_state_t test_state;
/* PM is initialized by now; is that state testable? */
- if (test_state == PM_SUSPEND_ON)
- goto done;
- if (!pm_states[test_state]) {
- printk(warn_bad_state, pm_states[test_state]);
- goto done;
+ if (!test_state_label)
+ return 0;
+
+ for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) {
+ const char *state_label = pm_states[test_state];
+
+ if (state_label && !strcmp(test_state_label, state_label))
+ break;
+ }
+ if (test_state == PM_SUSPEND_MAX) {
+ printk(warn_bad_state, test_state_label);
+ return 0;
}
/* RTCs have initialized by now too ... can we use one? */
@@ -173,13 +181,12 @@
rtc = rtc_class_open(dev_name(dev));
if (!rtc) {
printk(warn_no_rtc);
- goto done;
+ return 0;
}
/* go for it */
test_wakealarm(rtc, test_state);
rtc_class_close(rtc);
-done:
return 0;
}
late_initcall(test_suspend);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 71e64c7..6a86eb7 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -358,7 +358,7 @@
struct rcu_head **nocb_gp_tail;
long nocb_gp_count;
long nocb_gp_count_lazy;
- bool nocb_leader_wake; /* Is the nocb leader thread awake? */
+ bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
struct rcu_data *nocb_next_follower;
/* Next follower in wakeup chain. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 00dc411..a7997e2 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2074,9 +2074,9 @@
if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
return;
- if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) {
+ if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior xchg orders against prior callback enqueue. */
- ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true;
+ ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
wake_up(&rdp_leader->nocb_wq);
}
}
@@ -2253,7 +2253,7 @@
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
wait_event_interruptible(my_rdp->nocb_wq,
- ACCESS_ONCE(my_rdp->nocb_leader_wake));
+ !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
/* Memory barrier handled by smp_mb() calls below and repoll. */
} else if (firsttime) {
firsttime = false; /* Don't drown trace log with "Poll"! */
@@ -2292,12 +2292,12 @@
schedule_timeout_interruptible(1);
/* Rescan in case we were a victim of memory ordering. */
- my_rdp->nocb_leader_wake = false;
- smp_mb(); /* Ensure _wake false before scan. */
+ my_rdp->nocb_leader_sleep = true;
+ smp_mb(); /* Ensure _sleep true before scan. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
if (ACCESS_ONCE(rdp->nocb_head)) {
/* Found CB, so short-circuit next wait. */
- my_rdp->nocb_leader_wake = true;
+ my_rdp->nocb_leader_sleep = false;
break;
}
goto wait_again;
@@ -2307,17 +2307,17 @@
rcu_nocb_wait_gp(my_rdp);
/*
- * We left ->nocb_leader_wake set to reduce cache thrashing.
- * We clear it now, but recheck for new callbacks while
+ * We left ->nocb_leader_sleep unset to reduce cache thrashing.
+ * We set it now, but recheck for new callbacks while
* traversing our follower list.
*/
- my_rdp->nocb_leader_wake = false;
- smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */
+ my_rdp->nocb_leader_sleep = true;
+ smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
/* Each pass through the following loop wakes a follower, if needed. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
if (ACCESS_ONCE(rdp->nocb_head))
- my_rdp->nocb_leader_wake = true; /* No need to wait. */
+ my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
if (!rdp->nocb_gp_head)
continue; /* No CBs, so no need to wake follower. */
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e73efba..8a2e230 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -148,11 +148,8 @@
if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
goto out;
- t = p;
- do {
+ for_each_thread(p, t)
sched_move_task(t);
- } while_each_thread(p, t);
-
out:
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec1a286..07d67dd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,22 +90,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-#ifdef smp_mb__before_atomic
-void __smp_mb__before_atomic(void)
-{
- smp_mb__before_atomic();
-}
-EXPORT_SYMBOL(__smp_mb__before_atomic);
-#endif
-
-#ifdef smp_mb__after_atomic
-void __smp_mb__after_atomic(void)
-{
- smp_mb__after_atomic();
-}
-EXPORT_SYMBOL(__smp_mb__after_atomic);
-#endif
-
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
unsigned long delta;
@@ -333,9 +317,12 @@
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p)))
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
return rq;
raw_spin_unlock(&rq->lock);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
}
}
@@ -352,10 +339,13 @@
raw_spin_lock_irqsave(&p->pi_lock, *flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p)))
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
return rq;
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
}
}
@@ -449,7 +439,15 @@
void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
+ ktime_t time;
+ s64 delta;
+
+ /*
+ * Don't schedule slices shorter than 10000ns, that just
+ * doesn't make sense and can cause timer DoS.
+ */
+ delta = max_t(s64, delay, 10000LL);
+ time = ktime_add_ns(timer->base->get_time(), delta);
hrtimer_set_expires(timer, time);
@@ -1043,7 +1041,7 @@
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
+ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
rq->skip_clock_update = 1;
}
@@ -1088,7 +1086,7 @@
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
- if (p->on_rq) {
+ if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
src_rq = task_rq(p);
@@ -1214,7 +1212,7 @@
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
unsigned long flags;
- int running, on_rq;
+ int running, queued;
unsigned long ncsw;
struct rq *rq;
@@ -1252,7 +1250,7 @@
rq = task_rq_lock(p, &flags);
trace_sched_wait_task(p);
running = task_running(rq, p);
- on_rq = p->on_rq;
+ queued = task_on_rq_queued(p);
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1284,7 +1282,7 @@
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
- if (unlikely(on_rq)) {
+ if (unlikely(queued)) {
ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1478,7 +1476,7 @@
static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
activate_task(rq, p, en_flags);
- p->on_rq = 1;
+ p->on_rq = TASK_ON_RQ_QUEUED;
/* if a worker is waking up, notify workqueue */
if (p->flags & PF_WQ_WORKER)
@@ -1537,7 +1535,7 @@
int ret = 0;
rq = __task_rq_lock(p);
- if (p->on_rq) {
+ if (task_on_rq_queued(p)) {
/* check_preempt_curr() may use rq clock */
update_rq_clock(rq);
ttwu_do_wakeup(rq, p, wake_flags);
@@ -1742,7 +1740,7 @@
if (!(p->state & TASK_NORMAL))
goto out;
- if (!p->on_rq)
+ if (!task_on_rq_queued(p))
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
ttwu_do_wakeup(rq, p, 0);
@@ -2095,7 +2093,7 @@
init_task_runnable_average(p);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
- p->on_rq = 1;
+ p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
@@ -2451,7 +2449,7 @@
* project cycles that may never be accounted to this
* thread, breaking clock_gettime().
*/
- if (task_current(rq, p) && p->on_rq) {
+ if (task_current(rq, p) && task_on_rq_queued(p)) {
update_rq_clock(rq);
ns = rq_clock_task(rq) - p->se.exec_start;
if ((s64)ns < 0)
@@ -2497,7 +2495,7 @@
* If we see ->on_cpu without ->on_rq, the task is leaving, and has
* been accounted, so we're correct here as well.
*/
- if (!p->on_cpu || !p->on_rq)
+ if (!p->on_cpu || !task_on_rq_queued(p))
return p->se.sum_exec_runtime;
#endif
@@ -2801,7 +2799,7 @@
switch_count = &prev->nvcsw;
}
- if (prev->on_rq || rq->skip_clock_update < 0)
+ if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
update_rq_clock(rq);
next = pick_next_task(rq, prev);
@@ -2966,7 +2964,7 @@
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, on_rq, running, enqueue_flag = 0;
+ int oldprio, queued, running, enqueue_flag = 0;
struct rq *rq;
const struct sched_class *prev_class;
@@ -2995,9 +2993,9 @@
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
prev_class = p->sched_class;
- on_rq = p->on_rq;
+ queued = task_on_rq_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
@@ -3037,7 +3035,7 @@
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, p, enqueue_flag);
check_class_changed(rq, p, prev_class, oldprio);
@@ -3048,7 +3046,7 @@
void set_user_nice(struct task_struct *p, long nice)
{
- int old_prio, delta, on_rq;
+ int old_prio, delta, queued;
unsigned long flags;
struct rq *rq;
@@ -3069,8 +3067,8 @@
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
- on_rq = p->on_rq;
- if (on_rq)
+ queued = task_on_rq_queued(p);
+ if (queued)
dequeue_task(rq, p, 0);
p->static_prio = NICE_TO_PRIO(nice);
@@ -3079,7 +3077,7 @@
p->prio = effective_prio(p);
delta = p->prio - old_prio;
- if (on_rq) {
+ if (queued) {
enqueue_task(rq, p, 0);
/*
* If the task increased its priority or is running and
@@ -3351,7 +3349,7 @@
{
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, on_rq, running;
+ int retval, oldprio, oldpolicy = -1, queued, running;
int policy = attr->sched_policy;
unsigned long flags;
const struct sched_class *prev_class;
@@ -3548,9 +3546,9 @@
return 0;
}
- on_rq = p->on_rq;
+ queued = task_on_rq_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
@@ -3560,7 +3558,7 @@
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (queued) {
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
@@ -4512,7 +4510,7 @@
" task PC stack pid father\n");
#endif
rcu_read_lock();
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
/*
* reset the NMI-timeout, listing all files on a slow
* console might take a lot of time:
@@ -4520,7 +4518,7 @@
touch_nmi_watchdog();
if (!state_filter || (p->state & state_filter))
sched_show_task(p);
- } while_each_thread(g, p);
+ }
touch_all_softlockup_watchdogs();
@@ -4575,7 +4573,7 @@
rcu_read_unlock();
rq->curr = rq->idle = idle;
- idle->on_rq = 1;
+ idle->on_rq = TASK_ON_RQ_QUEUED;
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
@@ -4652,7 +4650,7 @@
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (p->on_rq) {
+ if (task_on_rq_queued(p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, p, &flags);
@@ -4680,20 +4678,20 @@
*/
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
- struct rq *rq_dest, *rq_src;
+ struct rq *rq;
int ret = 0;
if (unlikely(!cpu_active(dest_cpu)))
return ret;
- rq_src = cpu_rq(src_cpu);
- rq_dest = cpu_rq(dest_cpu);
+ rq = cpu_rq(src_cpu);
raw_spin_lock(&p->pi_lock);
- double_rq_lock(rq_src, rq_dest);
+ raw_spin_lock(&rq->lock);
/* Already moved. */
if (task_cpu(p) != src_cpu)
goto done;
+
/* Affinity changed (again). */
if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
goto fail;
@@ -4702,16 +4700,23 @@
* If we're not on a rq, the next wake-up will ensure we're
* placed properly.
*/
- if (p->on_rq) {
- dequeue_task(rq_src, p, 0);
+ if (task_on_rq_queued(p)) {
+ dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, dest_cpu);
- enqueue_task(rq_dest, p, 0);
- check_preempt_curr(rq_dest, p, 0);
+ raw_spin_unlock(&rq->lock);
+
+ rq = cpu_rq(dest_cpu);
+ raw_spin_lock(&rq->lock);
+ BUG_ON(task_rq(p) != rq);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ enqueue_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
}
done:
ret = 1;
fail:
- double_rq_unlock(rq_src, rq_dest);
+ raw_spin_unlock(&rq->lock);
raw_spin_unlock(&p->pi_lock);
return ret;
}
@@ -4743,13 +4748,13 @@
{
struct rq *rq;
unsigned long flags;
- bool on_rq, running;
+ bool queued, running;
rq = task_rq_lock(p, &flags);
- on_rq = p->on_rq;
+ queued = task_on_rq_queued(p);
running = task_current(rq, p);
- if (on_rq)
+ if (queued)
dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
@@ -4758,7 +4763,7 @@
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, p, 0);
task_rq_unlock(rq, p, &flags);
}
@@ -4778,6 +4783,12 @@
* be on another cpu but it doesn't matter.
*/
local_irq_disable();
+ /*
+ * We need to explicitly wake pending tasks before running
+ * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+ */
+ sched_ttwu_pending();
__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
local_irq_enable();
return 0;
@@ -5746,7 +5757,7 @@
const struct cpumask *span = sched_domain_span(sd);
struct cpumask *covered = sched_domains_tmpmask;
struct sd_data *sdd = sd->private;
- struct sched_domain *child;
+ struct sched_domain *sibling;
int i;
cpumask_clear(covered);
@@ -5757,10 +5768,10 @@
if (cpumask_test_cpu(i, covered))
continue;
- child = *per_cpu_ptr(sdd->sd, i);
+ sibling = *per_cpu_ptr(sdd->sd, i);
/* See the comment near build_group_mask(). */
- if (!cpumask_test_cpu(i, sched_domain_span(child)))
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
continue;
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
@@ -5770,10 +5781,9 @@
goto fail;
sg_span = sched_group_cpus(sg);
- if (child->child) {
- child = child->child;
- cpumask_copy(sg_span, sched_domain_span(child));
- } else
+ if (sibling->child)
+ cpumask_copy(sg_span, sched_domain_span(sibling->child));
+ else
cpumask_set_cpu(i, sg_span);
cpumask_or(covered, covered, sg_span);
@@ -7124,13 +7134,13 @@
.sched_policy = SCHED_NORMAL,
};
int old_prio = p->prio;
- int on_rq;
+ int queued;
- on_rq = p->on_rq;
- if (on_rq)
+ queued = task_on_rq_queued(p);
+ if (queued)
dequeue_task(rq, p, 0);
__setscheduler(rq, p, &attr);
- if (on_rq) {
+ if (queued) {
enqueue_task(rq, p, 0);
resched_curr(rq);
}
@@ -7145,7 +7155,7 @@
struct rq *rq;
read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
/*
* Only normalize user tasks:
*/
@@ -7176,8 +7186,7 @@
__task_rq_unlock(rq);
raw_spin_unlock(&p->pi_lock);
- } while_each_thread(g, p);
-
+ }
read_unlock_irqrestore(&tasklist_lock, flags);
}
@@ -7318,16 +7327,16 @@
void sched_move_task(struct task_struct *tsk)
{
struct task_group *tg;
- int on_rq, running;
+ int queued, running;
unsigned long flags;
struct rq *rq;
rq = task_rq_lock(tsk, &flags);
running = task_current(rq, tsk);
- on_rq = tsk->on_rq;
+ queued = task_on_rq_queued(tsk);
- if (on_rq)
+ if (queued)
dequeue_task(rq, tsk, 0);
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
@@ -7340,14 +7349,14 @@
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk, on_rq);
+ tsk->sched_class->task_move_group(tsk, queued);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
- if (on_rq)
+ if (queued)
enqueue_task(rq, tsk, 0);
task_rq_unlock(rq, tsk, &flags);
@@ -7365,10 +7374,10 @@
{
struct task_struct *g, *p;
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (rt_task(p) && task_rq(p)->rt.tg == tg)
return 1;
- } while_each_thread(g, p);
+ }
return 0;
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 72fdf06..2b57031 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -288,24 +288,28 @@
struct signal_struct *sig = tsk->signal;
cputime_t utime, stime;
struct task_struct *t;
-
- times->utime = sig->utime;
- times->stime = sig->stime;
- times->sum_exec_runtime = sig->sum_sched_runtime;
+ unsigned int seq, nextseq;
rcu_read_lock();
- /* make sure we can trust tsk->thread_group list */
- if (!likely(pid_alive(tsk)))
- goto out;
-
- t = tsk;
+ /* Attempt a lockless read on the first round. */
+ nextseq = 0;
do {
- task_cputime(t, &utime, &stime);
- times->utime += utime;
- times->stime += stime;
- times->sum_exec_runtime += task_sched_runtime(t);
- } while_each_thread(tsk, t);
-out:
+ seq = nextseq;
+ read_seqbegin_or_lock(&sig->stats_lock, &seq);
+ times->utime = sig->utime;
+ times->stime = sig->stime;
+ times->sum_exec_runtime = sig->sum_sched_runtime;
+
+ for_each_thread(tsk, t) {
+ task_cputime(t, &utime, &stime);
+ times->utime += utime;
+ times->stime += stime;
+ times->sum_exec_runtime += task_sched_runtime(t);
+ }
+ /* If lockless access failed, take the lock. */
+ nextseq = 1;
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry(&sig->stats_lock, seq);
rcu_read_unlock();
}
@@ -598,9 +602,12 @@
* If the tick based count grows faster than the scheduler one,
* the result of the scaling may go backward.
* Let's enforce monotonicity.
+ * Atomic exchange protects against concurrent cputime_adjust().
*/
- prev->stime = max(prev->stime, stime);
- prev->utime = max(prev->utime, utime);
+ while (stime > (rtime = ACCESS_ONCE(prev->stime)))
+ cmpxchg(&prev->stime, rtime, stime);
+ while (utime > (rtime = ACCESS_ONCE(prev->utime)))
+ cmpxchg(&prev->utime, rtime, utime);
out:
*ut = prev->utime;
@@ -617,9 +624,6 @@
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
-/*
- * Must be called with siglock held.
- */
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
struct task_cputime cputime;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 255ce13..cc4eb89 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -530,7 +530,7 @@
update_rq_clock(rq);
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
- if (p->on_rq) {
+ if (task_on_rq_queued(p)) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (task_has_dl_policy(rq->curr))
check_preempt_curr_dl(rq, p, 0);
@@ -997,10 +997,7 @@
#ifdef CONFIG_SCHED_HRTICK
static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
{
- s64 delta = p->dl.dl_runtime - p->dl.runtime;
-
- if (delta > 10000)
- hrtick_start(rq, p->dl.runtime);
+ hrtick_start(rq, p->dl.runtime);
}
#endif
@@ -1030,7 +1027,7 @@
* means a stop task can slip in, in which case we need to
* re-start task selection.
*/
- if (rq->stop && rq->stop->on_rq)
+ if (rq->stop && task_on_rq_queued(rq->stop))
return RETRY_TASK;
}
@@ -1257,7 +1254,8 @@
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(later_rq->cpu,
&task->cpus_allowed) ||
- task_running(rq, task) || !task->on_rq)) {
+ task_running(rq, task) ||
+ !task_on_rq_queued(task))) {
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -1296,7 +1294,7 @@
BUG_ON(task_current(rq, p));
BUG_ON(p->nr_cpus_allowed <= 1);
- BUG_ON(!p->on_rq);
+ BUG_ON(!task_on_rq_queued(p));
BUG_ON(!dl_task(p));
return p;
@@ -1443,7 +1441,7 @@
dl_time_before(p->dl.deadline,
this_rq->dl.earliest_dl.curr))) {
WARN_ON(p == src_rq->curr);
- WARN_ON(!p->on_rq);
+ WARN_ON(!task_on_rq_queued(p));
/*
* Then we pull iff p has actually an earlier
@@ -1596,7 +1594,7 @@
if (unlikely(p->dl.dl_throttled))
return;
- if (p->on_rq && rq->curr != p) {
+ if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
/* Only reschedule if pushing failed */
@@ -1614,7 +1612,7 @@
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (p->on_rq || rq->curr == p) {
+ if (task_on_rq_queued(p) || rq->curr == p) {
#ifdef CONFIG_SMP
/*
* This might be too much, but unfortunately
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 627b3c3..c7fe1ea0 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -160,14 +160,12 @@
"----------------------------------------------------\n");
read_lock_irqsave(&tasklist_lock, flags);
-
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (task_cpu(p) != rq_cpu)
continue;
print_task(m, rq, p);
- } while_each_thread(g, p);
-
+ }
read_unlock_irqrestore(&tasklist_lock, flags);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfa3c86..be9e97b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1038,7 +1038,8 @@
*/
static void update_numa_stats(struct numa_stats *ns, int nid)
{
- int cpu, cpus = 0;
+ int smt, cpu, cpus = 0;
+ unsigned long capacity;
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1062,8 +1063,12 @@
if (!cpus)
return;
- ns->task_capacity =
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
+ /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
+ smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
+ capacity = cpus / smt; /* cores */
+
+ ns->task_capacity = min_t(unsigned, capacity,
+ DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
@@ -1206,7 +1211,7 @@
if (!cur) {
/* Is there capacity at our destination? */
- if (env->src_stats.has_free_capacity &&
+ if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
!env->dst_stats.has_free_capacity)
goto unlock;
@@ -1775,7 +1780,7 @@
list_del(&p->numa_entry);
grp->nr_tasks--;
spin_unlock_irqrestore(&grp->lock, flags);
- rcu_assign_pointer(p->numa_group, NULL);
+ RCU_INIT_POINTER(p->numa_group, NULL);
put_numa_group(grp);
}
@@ -2377,6 +2382,9 @@
tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
tg_contrib -= cfs_rq->tg_load_contrib;
+ if (!tg_contrib)
+ return;
+
if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
atomic_long_add(tg_contrib, &tg->load_avg);
cfs_rq->tg_load_contrib += tg_contrib;
@@ -3892,14 +3900,6 @@
resched_curr(rq);
return;
}
-
- /*
- * Don't schedule slices shorter than 10000ns, that just
- * doesn't make sense. Rely on vruntime for fairness.
- */
- if (rq->curr != p)
- delta = max_t(s64, 10000LL, delta);
-
hrtick_start(rq, delta);
}
}
@@ -4704,7 +4704,7 @@
return;
/*
- * This is possible from callers such as move_task(), in which we
+ * This is possible from callers such as attach_tasks(), in which we
* unconditionally check_prempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
@@ -5112,27 +5112,18 @@
unsigned int loop_max;
enum fbq_type fbq_type;
+ struct list_head tasks;
};
/*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
- */
-static void move_task(struct task_struct *p, struct lb_env *env)
-{
- deactivate_task(env->src_rq, p, 0);
- set_task_cpu(p, env->dst_cpu);
- activate_task(env->dst_rq, p, 0);
- check_preempt_curr(env->dst_rq, p, 0);
-}
-
-/*
* Is this task likely cache-hot:
*/
static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;
+ lockdep_assert_held(&env->src_rq->lock);
+
if (p->sched_class != &fair_sched_class)
return 0;
@@ -5252,6 +5243,9 @@
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
@@ -5336,47 +5330,63 @@
}
/*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
- * part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
+ * detach_task() -- detach the task for the migration specified in env
*/
-static int move_one_task(struct lb_env *env)
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+ lockdep_assert_held(&env->src_rq->lock);
+
+ deactivate_task(env->src_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ set_task_cpu(p, env->dst_cpu);
+}
+
+/*
+ * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
+ * part of active balancing operations within "domain".
+ *
+ * Returns a task if successful and NULL otherwise.
+ */
+static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p, *n;
+ lockdep_assert_held(&env->src_rq->lock);
+
list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;
- move_task(p, env);
+ detach_task(p, env);
+
/*
- * Right now, this is only the second place move_task()
- * is called, so we can safely collect move_task()
- * stats here rather than inside move_task().
+ * Right now, this is only the second place where
+ * lb_gained[env->idle] is updated (other is detach_tasks)
+ * so we can safely collect stats here rather than
+ * inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
- return 1;
+ return p;
}
- return 0;
+ return NULL;
}
static const unsigned int sched_nr_migrate_break = 32;
/*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
*
- * Called with both runqueues locked.
+ * Returns number of detached tasks if successful and 0 otherwise.
*/
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
unsigned long load;
- int pulled = 0;
+ int detached = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);
if (env->imbalance <= 0)
return 0;
@@ -5407,14 +5417,16 @@
if ((load / 2) > env->imbalance)
goto next;
- move_task(p, env);
- pulled++;
+ detach_task(p, env);
+ list_add(&p->se.group_node, &env->tasks);
+
+ detached++;
env->imbalance -= load;
#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
- * kernels will stop after the first task is pulled to minimize
+ * kernels will stop after the first task is detached to minimize
* the critical section.
*/
if (env->idle == CPU_NEWLY_IDLE)
@@ -5434,13 +5446,58 @@
}
/*
- * Right now, this is one of only two places move_task() is called,
- * so we can safely collect move_task() stats here rather than
- * inside move_task().
+ * Right now, this is one of only two places we collect this stat
+ * so we can safely collect detach_one_task() stats here rather
+ * than inside detach_one_task().
*/
- schedstat_add(env->sd, lb_gained[env->idle], pulled);
+ schedstat_add(env->sd, lb_gained[env->idle], detached);
- return pulled;
+ return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_held(&rq->lock);
+
+ BUG_ON(task_rq(p) != rq);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ activate_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+ raw_spin_lock(&rq->lock);
+ attach_task(rq, p);
+ raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+ struct list_head *tasks = &env->tasks;
+ struct task_struct *p;
+
+ raw_spin_lock(&env->dst_rq->lock);
+
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
+ list_del_init(&p->se.group_node);
+
+ attach_task(env->dst_rq, p);
+ }
+
+ raw_spin_unlock(&env->dst_rq->lock);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -5559,6 +5616,13 @@
#endif
/********** Helpers for find_busiest_group ************************/
+
+enum group_type {
+ group_other = 0,
+ group_imbalanced,
+ group_overloaded,
+};
+
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
@@ -5572,7 +5636,7 @@
unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
- int group_imb; /* Is there an imbalance in the group ? */
+ enum group_type group_type;
int group_has_free_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -5610,6 +5674,8 @@
.total_capacity = 0UL,
.busiest_stat = {
.avg_load = 0UL,
+ .sum_nr_running = 0,
+ .group_type = group_other,
},
};
}
@@ -5891,6 +5957,18 @@
return capacity_factor;
}
+static enum group_type
+group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ return group_overloaded;
+
+ if (sg_imbalanced(group))
+ return group_imbalanced;
+
+ return group_other;
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -5942,9 +6020,8 @@
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
-
- sgs->group_imb = sg_imbalanced(group);
sgs->group_capacity_factor = sg_capacity_factor(env, group);
+ sgs->group_type = group_classify(group, sgs);
if (sgs->group_capacity_factor > sgs->sum_nr_running)
sgs->group_has_free_capacity = 1;
@@ -5968,13 +6045,19 @@
struct sched_group *sg,
struct sg_lb_stats *sgs)
{
- if (sgs->avg_load <= sds->busiest_stat.avg_load)
- return false;
+ struct sg_lb_stats *busiest = &sds->busiest_stat;
- if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ if (sgs->group_type > busiest->group_type)
return true;
- if (sgs->group_imb)
+ if (sgs->group_type < busiest->group_type)
+ return false;
+
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+
+ /* This is the busiest node in its class. */
+ if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
/*
@@ -5982,8 +6065,7 @@
* numbered CPUs in the group, therefore mark all groups
* higher than ourself as busy.
*/
- if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
- env->dst_cpu < group_first_cpu(sg)) {
+ if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
if (!sds->busiest)
return true;
@@ -6228,7 +6310,7 @@
local = &sds->local_stat;
busiest = &sds->busiest_stat;
- if (busiest->group_imb) {
+ if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
* to ensure cpu-load equilibrium, look at wider averages. XXX
@@ -6248,12 +6330,11 @@
return fix_small_imbalance(env, sds);
}
- if (!busiest->group_imb) {
- /*
- * Don't want to pull so many tasks that a group would go idle.
- * Except of course for the group_imb case, since then we might
- * have to drop below capacity to reach cpu-load equilibrium.
- */
+ /*
+ * If there aren't any idle cpus, avoid creating some.
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type == group_overloaded) {
load_above_capacity =
(busiest->sum_nr_running - busiest->group_capacity_factor);
@@ -6337,7 +6418,7 @@
* work because they assume all things are equal, which typically
* isn't true due to cpus_allowed constraints and the like.
*/
- if (busiest->group_imb)
+ if (busiest->group_type == group_imbalanced)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
@@ -6550,6 +6631,7 @@
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
};
/*
@@ -6599,16 +6681,29 @@
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
- local_irq_save(flags);
- double_rq_lock(env.dst_rq, busiest);
+ raw_spin_lock_irqsave(&busiest->lock, flags);
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
- cur_ld_moved = move_tasks(&env);
- ld_moved += cur_ld_moved;
- double_rq_unlock(env.dst_rq, busiest);
+ cur_ld_moved = detach_tasks(&env);
+
+ /*
+ * We've detached some tasks from busiest_rq. Every
+ * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
+ * unlock busiest->lock, and we are able to be sure
+ * that nobody can manipulate the tasks in parallel.
+ * See task_rq_lock() family for the details.
+ */
+
+ raw_spin_unlock(&busiest->lock);
+
+ if (cur_ld_moved) {
+ attach_tasks(&env);
+ ld_moved += cur_ld_moved;
+ }
+
local_irq_restore(flags);
/*
@@ -6744,7 +6839,7 @@
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
- * move_tasks).
+ * detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
@@ -6914,6 +7009,7 @@
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
+ struct task_struct *p = NULL;
raw_spin_lock_irq(&busiest_rq->lock);
@@ -6933,9 +7029,6 @@
*/
BUG_ON(busiest_rq == target_rq);
- /* move a task from busiest_rq to target_rq */
- double_lock_balance(busiest_rq, target_rq);
-
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
@@ -6956,16 +7049,22 @@
schedstat_inc(sd, alb_count);
- if (move_one_task(&env))
+ p = detach_one_task(&env);
+ if (p)
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
}
rcu_read_unlock();
- double_unlock_balance(busiest_rq, target_rq);
out_unlock:
busiest_rq->active_balance = 0;
- raw_spin_unlock_irq(&busiest_rq->lock);
+ raw_spin_unlock(&busiest_rq->lock);
+
+ if (p)
+ attach_one_task(target_rq, p);
+
+ local_irq_enable();
+
return 0;
}
@@ -7465,7 +7564,7 @@
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (!p->se.on_rq)
+ if (!task_on_rq_queued(p))
return;
/*
@@ -7490,11 +7589,11 @@
* switched back to the fair class the enqueue_entity(.flags=0) will
* do the right thing.
*
- * If it's on_rq, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it's !on_rq, then only when
+ * If it's queued, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it's !queued, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!p->on_rq && p->state != TASK_RUNNING) {
+ if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
@@ -7521,15 +7620,15 @@
*/
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
- struct sched_entity *se = &p->se;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *se = &p->se;
/*
* Since the real-depth could have been changed (only FAIR
* class maintain depth value), reset depth properly.
*/
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
- if (!se->on_rq)
+ if (!task_on_rq_queued(p))
return;
/*
@@ -7575,7 +7674,7 @@
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int queued)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq;
@@ -7594,7 +7693,7 @@
* fair sleeper stuff for the first placement, but who cares.
*/
/*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
+ * When !queued, vruntime of the task has usually NOT been normalized.
* But there are some cases where it has already been normalized:
*
* - Moving a forked child which is waiting for being woken up by
@@ -7605,14 +7704,14 @@
* To prevent boost or penalty in the new cfs_rq caused by delta
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
*/
- if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
- on_rq = 1;
+ if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
+ queued = 1;
- if (!on_rq)
+ if (!queued)
se->vruntime -= cfs_rq_of(se)->min_vruntime;
set_task_rq(p, task_cpu(p));
se->depth = se->parent ? se->parent->depth + 1 : 0;
- if (!on_rq) {
+ if (!queued) {
cfs_rq = cfs_rq_of(se);
se->vruntime += cfs_rq->min_vruntime;
#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca..4feac8f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1448,7 +1448,7 @@
* means a dl or stop task can slip in, in which case we need
* to re-start task selection.
*/
- if (unlikely((rq->stop && rq->stop->on_rq) ||
+ if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
rq->dl.dl_nr_running))
return RETRY_TASK;
}
@@ -1624,7 +1624,7 @@
!cpumask_test_cpu(lowest_rq->cpu,
tsk_cpus_allowed(task)) ||
task_running(rq, task) ||
- !task->on_rq)) {
+ !task_on_rq_queued(task))) {
double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1658,7 +1658,7 @@
BUG_ON(task_current(rq, p));
BUG_ON(p->nr_cpus_allowed <= 1);
- BUG_ON(!p->on_rq);
+ BUG_ON(!task_on_rq_queued(p));
BUG_ON(!rt_task(p));
return p;
@@ -1809,7 +1809,7 @@
*/
if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
- WARN_ON(!p->on_rq);
+ WARN_ON(!task_on_rq_queued(p));
/*
* There's a chance that p is higher in priority
@@ -1870,7 +1870,7 @@
BUG_ON(!rt_task(p));
- if (!p->on_rq)
+ if (!task_on_rq_queued(p))
return;
weight = cpumask_weight(new_mask);
@@ -1936,7 +1936,7 @@
* we may need to handle the pulling of RT tasks
* now.
*/
- if (!p->on_rq || rq->rt.rt_nr_running)
+ if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
return;
if (pull_rt_task(rq))
@@ -1970,7 +1970,7 @@
* If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (p->on_rq && rq->curr != p) {
+ if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
/* Don't resched if we changed runqueues */
@@ -1989,7 +1989,7 @@
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (!p->on_rq)
+ if (!task_on_rq_queued(p))
return;
if (rq->curr == p) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f..aa0f73b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -15,6 +15,10 @@
struct rq;
+/* task_struct::on_rq states: */
+#define TASK_ON_RQ_QUEUED 1
+#define TASK_ON_RQ_MIGRATING 2
+
extern __read_mostly int scheduler_running;
extern unsigned long calc_load_update;
@@ -647,7 +651,7 @@
#endif
}
-DECLARE_PER_CPU(struct rq, runqueues);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
@@ -942,6 +946,15 @@
#endif
}
+static inline int task_on_rq_queued(struct task_struct *p)
+{
+ return p->on_rq == TASK_ON_RQ_QUEUED;
+}
+
+static inline int task_on_rq_migrating(struct task_struct *p)
+{
+ return p->on_rq == TASK_ON_RQ_MIGRATING;
+}
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0eda..67426e5 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@
{
struct task_struct *stop = rq->stop;
- if (!stop || !stop->on_rq)
+ if (!stop || !task_on_rq_queued(stop))
return NULL;
put_prev_task(rq, prev);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 44eb005..1285cb2 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -21,10 +21,11 @@
#include <linux/slab.h>
#include <linux/syscalls.h>
-/* #define SECCOMP_DEBUG 1 */
+#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+#include <asm/syscall.h>
+#endif
#ifdef CONFIG_SECCOMP_FILTER
-#include <asm/syscall.h>
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
@@ -172,10 +173,10 @@
*
* Returns valid seccomp BPF response codes.
*/
-static u32 seccomp_run_filters(int syscall)
+static u32 seccomp_run_filters(struct seccomp_data *sd)
{
struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
- struct seccomp_data sd;
+ struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
/* Ensure unexpected behavior doesn't result in failing open. */
@@ -185,14 +186,17 @@
/* Make sure cross-thread synced filter points somewhere sane. */
smp_read_barrier_depends();
- populate_seccomp_data(&sd);
+ if (!sd) {
+ populate_seccomp_data(&sd_local);
+ sd = &sd_local;
+ }
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
for (; f; f = f->prev) {
- u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd);
+ u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
ret = cur_ret;
@@ -564,11 +568,55 @@
};
#endif
-int __secure_computing(int this_syscall)
+static void __secure_computing_strict(int this_syscall)
{
- int exit_sig = 0;
- int *syscall;
- u32 ret;
+ int *syscall_whitelist = mode1_syscalls;
+#ifdef CONFIG_COMPAT
+ if (is_compat_task())
+ syscall_whitelist = mode1_syscalls_32;
+#endif
+ do {
+ if (*syscall_whitelist == this_syscall)
+ return;
+ } while (*++syscall_whitelist);
+
+#ifdef SECCOMP_DEBUG
+ dump_stack();
+#endif
+ audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
+ do_exit(SIGKILL);
+}
+
+#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
+void secure_computing_strict(int this_syscall)
+{
+ int mode = current->seccomp.mode;
+
+ if (mode == 0)
+ return;
+ else if (mode == SECCOMP_MODE_STRICT)
+ __secure_computing_strict(this_syscall);
+ else
+ BUG();
+}
+#else
+int __secure_computing(void)
+{
+ u32 phase1_result = seccomp_phase1(NULL);
+
+ if (likely(phase1_result == SECCOMP_PHASE1_OK))
+ return 0;
+ else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
+ return -1;
+ else
+ return seccomp_phase2(phase1_result);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
+{
+ u32 filter_ret, action;
+ int data;
/*
* Make sure that any changes to mode from another thread have
@@ -576,86 +624,128 @@
*/
rmb();
- switch (current->seccomp.mode) {
- case SECCOMP_MODE_STRICT:
- syscall = mode1_syscalls;
-#ifdef CONFIG_COMPAT
- if (is_compat_task())
- syscall = mode1_syscalls_32;
-#endif
- do {
- if (*syscall == this_syscall)
- return 0;
- } while (*++syscall);
- exit_sig = SIGKILL;
- ret = SECCOMP_RET_KILL;
- break;
-#ifdef CONFIG_SECCOMP_FILTER
- case SECCOMP_MODE_FILTER: {
- int data;
- struct pt_regs *regs = task_pt_regs(current);
- ret = seccomp_run_filters(this_syscall);
- data = ret & SECCOMP_RET_DATA;
- ret &= SECCOMP_RET_ACTION;
- switch (ret) {
- case SECCOMP_RET_ERRNO:
- /* Set the low-order 16-bits as a errno. */
- syscall_set_return_value(current, regs,
- -data, 0);
- goto skip;
- case SECCOMP_RET_TRAP:
- /* Show the handler the original registers. */
- syscall_rollback(current, regs);
- /* Let the filter pass back 16 bits of data. */
- seccomp_send_sigsys(this_syscall, data);
- goto skip;
- case SECCOMP_RET_TRACE:
- /* Skip these calls if there is no tracer. */
- if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
- syscall_set_return_value(current, regs,
- -ENOSYS, 0);
- goto skip;
- }
- /* Allow the BPF to provide the event message */
- ptrace_event(PTRACE_EVENT_SECCOMP, data);
- /*
- * The delivery of a fatal signal during event
- * notification may silently skip tracer notification.
- * Terminating the task now avoids executing a system
- * call that may not be intended.
- */
- if (fatal_signal_pending(current))
- break;
- if (syscall_get_nr(current, regs) < 0)
- goto skip; /* Explicit request to skip. */
+ filter_ret = seccomp_run_filters(sd);
+ data = filter_ret & SECCOMP_RET_DATA;
+ action = filter_ret & SECCOMP_RET_ACTION;
- return 0;
- case SECCOMP_RET_ALLOW:
- return 0;
- case SECCOMP_RET_KILL:
- default:
- break;
- }
- exit_sig = SIGSYS;
- break;
+ switch (action) {
+ case SECCOMP_RET_ERRNO:
+ /* Set the low-order 16-bits as a errno. */
+ syscall_set_return_value(current, task_pt_regs(current),
+ -data, 0);
+ goto skip;
+
+ case SECCOMP_RET_TRAP:
+ /* Show the handler the original registers. */
+ syscall_rollback(current, task_pt_regs(current));
+ /* Let the filter pass back 16 bits of data. */
+ seccomp_send_sigsys(this_syscall, data);
+ goto skip;
+
+ case SECCOMP_RET_TRACE:
+ return filter_ret; /* Save the rest for phase 2. */
+
+ case SECCOMP_RET_ALLOW:
+ return SECCOMP_PHASE1_OK;
+
+ case SECCOMP_RET_KILL:
+ default:
+ audit_seccomp(this_syscall, SIGSYS, action);
+ do_exit(SIGSYS);
}
+
+ unreachable();
+
+skip:
+ audit_seccomp(this_syscall, 0, action);
+ return SECCOMP_PHASE1_SKIP;
+}
+#endif
+
+/**
+ * seccomp_phase1() - run fast path seccomp checks on the current syscall
+ * @arg sd: The seccomp_data or NULL
+ *
+ * This only reads pt_regs via the syscall_xyz helpers. The only change
+ * it will make to pt_regs is via syscall_set_return_value, and it will
+ * only do that if it returns SECCOMP_PHASE1_SKIP.
+ *
+ * If sd is provided, it will not read pt_regs at all.
+ *
+ * It may also call do_exit or force a signal; these actions must be
+ * safe.
+ *
+ * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
+ * be processed normally.
+ *
+ * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
+ * invoked. In this case, seccomp_phase1 will have set the return value
+ * using syscall_set_return_value.
+ *
+ * If it returns anything else, then the return value should be passed
+ * to seccomp_phase2 from a context in which ptrace hooks are safe.
+ */
+u32 seccomp_phase1(struct seccomp_data *sd)
+{
+ int mode = current->seccomp.mode;
+ int this_syscall = sd ? sd->nr :
+ syscall_get_nr(current, task_pt_regs(current));
+
+ switch (mode) {
+ case SECCOMP_MODE_STRICT:
+ __secure_computing_strict(this_syscall); /* may call do_exit */
+ return SECCOMP_PHASE1_OK;
+#ifdef CONFIG_SECCOMP_FILTER
+ case SECCOMP_MODE_FILTER:
+ return __seccomp_phase1_filter(this_syscall, sd);
#endif
default:
BUG();
}
-
-#ifdef SECCOMP_DEBUG
- dump_stack();
-#endif
- audit_seccomp(this_syscall, exit_sig, ret);
- do_exit(exit_sig);
-#ifdef CONFIG_SECCOMP_FILTER
-skip:
- audit_seccomp(this_syscall, exit_sig, ret);
-#endif
- return -1;
}
+/**
+ * seccomp_phase2() - finish slow path seccomp work for the current syscall
+ * @phase1_result: The return value from seccomp_phase1()
+ *
+ * This must be called from a context in which ptrace hooks can be used.
+ *
+ * Returns 0 if the syscall should be processed or -1 to skip the syscall.
+ */
+int seccomp_phase2(u32 phase1_result)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ u32 action = phase1_result & SECCOMP_RET_ACTION;
+ int data = phase1_result & SECCOMP_RET_DATA;
+
+ BUG_ON(action != SECCOMP_RET_TRACE);
+
+ audit_seccomp(syscall_get_nr(current, regs), 0, action);
+
+ /* Skip these calls if there is no tracer. */
+ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+ syscall_set_return_value(current, regs,
+ -ENOSYS, 0);
+ return -1;
+ }
+
+ /* Allow the BPF to provide the event message */
+ ptrace_event(PTRACE_EVENT_SECCOMP, data);
+ /*
+ * The delivery of a fatal signal during event
+ * notification may silently skip tracer notification.
+ * Terminating the task now avoids executing a system
+ * call that may not be intended.
+ */
+ if (fatal_signal_pending(current))
+ do_exit(SIGSYS);
+ if (syscall_get_nr(current, regs) < 0)
+ return -1; /* Explicit request to skip. */
+
+ return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
+
long prctl_get_seccomp(void)
{
return current->seccomp.mode;
diff --git a/kernel/sys.c b/kernel/sys.c
index ce81291..b663664 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -862,11 +862,9 @@
{
cputime_t tgutime, tgstime, cutime, cstime;
- spin_lock_irq(¤t->sighand->siglock);
thread_group_cputime_adjusted(current, &tgutime, &tgstime);
cutime = current->signal->cutime;
cstime = current->signal->cstime;
- spin_unlock_irq(¤t->sighand->siglock);
tms->tms_utime = cputime_to_clock_t(tgutime);
tms->tms_stime = cputime_to_clock_t(tgstime);
tms->tms_cutime = cputime_to_clock_t(cutime);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b89464..492b986 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -272,22 +272,8 @@
if (same_thread_group(tsk, current))
err = cpu_clock_sample(which_clock, tsk, &rtn);
} else {
- unsigned long flags;
- struct sighand_struct *sighand;
-
- /*
- * while_each_thread() is not yet entirely RCU safe,
- * keep locking the group while sampling process
- * clock for now.
- */
- sighand = lock_task_sighand(tsk, &flags);
- if (!sighand)
- return err;
-
if (tsk == current || thread_group_leader(tsk))
err = cpu_clock_sample_group(which_clock, tsk, &rtn);
-
- unlock_task_sighand(tsk, &flags);
}
if (!err)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99aa6ee..905f3e2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -225,6 +225,20 @@
};
/*
+ * Kick this CPU if it's full dynticks in order to force it to
+ * re-evaluate its dependency on the tick and restart it if necessary.
+ * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
+ * is NMI safe.
+ */
+void tick_nohz_full_kick(void)
+{
+ if (!tick_nohz_full_cpu(smp_processor_id()))
+ return;
+
+ irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+}
+
+/*
* Kick the CPU if it's full dynticks in order to force it to
* re-evaluate its dependency on the tick and restart it if necessary.
*/
@@ -968,6 +982,10 @@
tick_sched_do_timer(now);
tick_sched_handle(ts, regs);
+ /* No need to reprogram if we are running tickless */
+ if (unlikely(ts->tick_stopped))
+ return;
+
while (tick_nohz_reprogram(ts, now)) {
now = ktime_get();
tick_do_update_jiffies64(now);
@@ -1095,6 +1113,10 @@
if (regs)
tick_sched_handle(ts, regs);
+ /* No need to reprogram if we are in idle or full dynticks mode */
+ if (unlikely(ts->tick_stopped))
+ return HRTIMER_NORESTART;
+
hrtimer_forward(timer, now, tick_period);
return HRTIMER_RESTART;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fb4a9c2..ec1791f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -442,11 +442,12 @@
tk->ntp_error = 0;
ntp_clear();
}
- update_vsyscall(tk);
- update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
tk_update_ktime_data(tk);
+ update_vsyscall(tk);
+ update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+
if (action & TK_MIRROR)
memcpy(&shadow_timekeeper, &tk_core.timekeeper,
sizeof(tk_core.timekeeper));
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a8d6914..8759d0b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -15,11 +15,6 @@
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/lockdep.h>
-#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/smpboot.h>
@@ -514,7 +509,10 @@
/* should be in cleanup, but blocks oprofile */
perf_event_release_kernel(event);
}
- return;
+ if (cpu == 0) {
+ /* watchdog_nmi_enable() expects this to be zero initially. */
+ cpu0_err = 0;
+ }
}
#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a285900..ffc4772 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -952,7 +952,7 @@
the proof of observed correctness is also maintained for an
arbitrary combination of these separate locking variants.
- For more details, see Documentation/lockdep-design.txt.
+ For more details, see Documentation/locking/lockdep-design.txt.
config LOCKDEP
bool
@@ -973,7 +973,7 @@
help
This feature enables tracking lock contention points
- For more details, see Documentation/lockstat.txt
+ For more details, see Documentation/locking/lockstat.txt
This also enables lock events required by "perf lock",
subcommand of perf.
diff --git a/lib/assoc_array.c b/lib/assoc_array.c
index c0b1007..ae146f0 100644
--- a/lib/assoc_array.c
+++ b/lib/assoc_array.c
@@ -1735,7 +1735,7 @@
gc_complete:
edit->set[0].to = new_root;
assoc_array_apply_edit(edit);
- edit->array->nr_leaves_on_tree = nr_leaves_on_tree;
+ array->nr_leaves_on_tree = nr_leaves_on_tree;
return 0;
enomem:
diff --git a/lib/atomic64.c b/lib/atomic64.c
index 08a4f06..1298c05e 100644
--- a/lib/atomic64.c
+++ b/lib/atomic64.c
@@ -70,53 +70,42 @@
}
EXPORT_SYMBOL(atomic64_set);
-void atomic64_add(long long a, atomic64_t *v)
-{
- unsigned long flags;
- raw_spinlock_t *lock = lock_addr(v);
+#define ATOMIC64_OP(op, c_op) \
+void atomic64_##op(long long a, atomic64_t *v) \
+{ \
+ unsigned long flags; \
+ raw_spinlock_t *lock = lock_addr(v); \
+ \
+ raw_spin_lock_irqsave(lock, flags); \
+ v->counter c_op a; \
+ raw_spin_unlock_irqrestore(lock, flags); \
+} \
+EXPORT_SYMBOL(atomic64_##op);
- raw_spin_lock_irqsave(lock, flags);
- v->counter += a;
- raw_spin_unlock_irqrestore(lock, flags);
-}
-EXPORT_SYMBOL(atomic64_add);
+#define ATOMIC64_OP_RETURN(op, c_op) \
+long long atomic64_##op##_return(long long a, atomic64_t *v) \
+{ \
+ unsigned long flags; \
+ raw_spinlock_t *lock = lock_addr(v); \
+ long long val; \
+ \
+ raw_spin_lock_irqsave(lock, flags); \
+ val = (v->counter c_op a); \
+ raw_spin_unlock_irqrestore(lock, flags); \
+ return val; \
+} \
+EXPORT_SYMBOL(atomic64_##op##_return);
-long long atomic64_add_return(long long a, atomic64_t *v)
-{
- unsigned long flags;
- raw_spinlock_t *lock = lock_addr(v);
- long long val;
+#define ATOMIC64_OPS(op, c_op) \
+ ATOMIC64_OP(op, c_op) \
+ ATOMIC64_OP_RETURN(op, c_op)
- raw_spin_lock_irqsave(lock, flags);
- val = v->counter += a;
- raw_spin_unlock_irqrestore(lock, flags);
- return val;
-}
-EXPORT_SYMBOL(atomic64_add_return);
+ATOMIC64_OPS(add, +=)
+ATOMIC64_OPS(sub, -=)
-void atomic64_sub(long long a, atomic64_t *v)
-{
- unsigned long flags;
- raw_spinlock_t *lock = lock_addr(v);
-
- raw_spin_lock_irqsave(lock, flags);
- v->counter -= a;
- raw_spin_unlock_irqrestore(lock, flags);
-}
-EXPORT_SYMBOL(atomic64_sub);
-
-long long atomic64_sub_return(long long a, atomic64_t *v)
-{
- unsigned long flags;
- raw_spinlock_t *lock = lock_addr(v);
- long long val;
-
- raw_spin_lock_irqsave(lock, flags);
- val = v->counter -= a;
- raw_spin_unlock_irqrestore(lock, flags);
- return val;
-}
-EXPORT_SYMBOL(atomic64_sub_return);
+#undef ATOMIC64_OPS
+#undef ATOMIC64_OP_RETURN
+#undef ATOMIC64_OP
long long atomic64_dec_if_positive(atomic64_t *v)
{
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 872a15a..62af709 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -267,19 +267,46 @@
#undef E
/*
- * Special-case for read-locking, they are
- * allowed to recurse on the same lock class:
+ * Special-case for read-locking, they are not allowed to
+ * recurse on the same lock class except under interrupt context:
*/
static void rlock_AA1(void)
{
RL(X1);
- RL(X1); // this one should NOT fail
+ RL(X1); // this one should fail
}
static void rlock_AA1B(void)
{
RL(X1);
- RL(X2); // this one should NOT fail
+ RL(X2); // this one should fail
+}
+
+static void rlock_AHA1(void)
+{
+ RL(X1);
+ HARDIRQ_ENTER();
+ RL(X1); // this one should NOT fail
+ HARDIRQ_EXIT();
+}
+
+static void rlock_AHA1B(void)
+{
+ RL(X1);
+ HARDIRQ_ENTER();
+ RL(X2); // this one should NOT fail
+ HARDIRQ_EXIT();
+}
+
+static void rlock_ASAHA1(void)
+{
+ RL(X1);
+ SOFTIRQ_ENTER();
+ RL(X1); // this one should NOT fail
+ HARDIRQ_ENTER();
+ RL(X1); // this one should NOT fail
+ HARDIRQ_EXIT();
+ SOFTIRQ_EXIT();
}
static void rsem_AA1(void)
@@ -1069,7 +1096,7 @@
print_testname(desc); \
dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \
dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK); \
- dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \
+ dotest(name##_rlock, FAILURE, LOCKTYPE_RWLOCK); \
dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \
dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \
dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \
@@ -1830,14 +1857,14 @@
printk(" --------------------------------------------------------------------------\n");
print_testname("recursive read-lock");
printk(" |");
- dotest(rlock_AA1, SUCCESS, LOCKTYPE_RWLOCK);
+ dotest(rlock_AA1, FAILURE, LOCKTYPE_RWLOCK);
printk(" |");
dotest(rsem_AA1, FAILURE, LOCKTYPE_RWSEM);
printk("\n");
print_testname("recursive read-lock #2");
printk(" |");
- dotest(rlock_AA1B, SUCCESS, LOCKTYPE_RWLOCK);
+ dotest(rlock_AA1B, FAILURE, LOCKTYPE_RWLOCK);
printk(" |");
dotest(rsem_AA1B, FAILURE, LOCKTYPE_RWSEM);
printk("\n");
@@ -1856,6 +1883,21 @@
dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM);
printk("\n");
+ print_testname("recursive rlock with interrupt");
+ printk(" |");
+ dotest(rlock_AHA1, SUCCESS, LOCKTYPE_RWLOCK);
+ printk("\n");
+
+ print_testname("recursive rlock with interrupt #2");
+ printk(" |");
+ dotest(rlock_AHA1B, SUCCESS, LOCKTYPE_RWLOCK);
+ printk("\n");
+
+ print_testname("recursive rlock with interrupt #3");
+ printk(" |");
+ dotest(rlock_ASAHA1, SUCCESS, LOCKTYPE_RWLOCK);
+ printk("\n");
+
printk(" --------------------------------------------------------------------------\n");
/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ec4dcf1..085dc6d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2534,6 +2534,8 @@
unsigned long long size;
int ret = 0;
+ if (mem_cgroup_is_root(memcg))
+ goto done;
retry:
if (consume_stock(memcg, nr_pages))
goto done;
@@ -2611,9 +2613,7 @@
if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
bypass:
- memcg = root_mem_cgroup;
- ret = -EINTR;
- goto retry;
+ return -EINTR;
done_restock:
if (batch > nr_pages)
@@ -2626,6 +2626,9 @@
{
unsigned long bytes = nr_pages * PAGE_SIZE;
+ if (mem_cgroup_is_root(memcg))
+ return;
+
res_counter_uncharge(&memcg->res, bytes);
if (do_swap_account)
res_counter_uncharge(&memcg->memsw, bytes);
@@ -2640,6 +2643,9 @@
{
unsigned long bytes = nr_pages * PAGE_SIZE;
+ if (mem_cgroup_is_root(memcg))
+ return;
+
res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
if (do_swap_account)
res_counter_uncharge_until(&memcg->memsw,
@@ -4093,6 +4099,46 @@
return retval;
}
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_stat_index idx)
+{
+ struct mem_cgroup *iter;
+ long val = 0;
+
+ /* Per-cpu values can be negative, use a signed accumulator */
+ for_each_mem_cgroup_tree(iter, memcg)
+ val += mem_cgroup_read_stat(iter, idx);
+
+ if (val < 0) /* race ? */
+ val = 0;
+ return val;
+}
+
+static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+{
+ u64 val;
+
+ if (!mem_cgroup_is_root(memcg)) {
+ if (!swap)
+ return res_counter_read_u64(&memcg->res, RES_USAGE);
+ else
+ return res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ }
+
+ /*
+ * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
+ * as well as in MEM_CGROUP_STAT_RSS_HUGE.
+ */
+ val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+
+ if (swap)
+ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
+
+ return val << PAGE_SHIFT;
+}
+
+
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -4102,8 +4148,12 @@
switch (type) {
case _MEM:
+ if (name == RES_USAGE)
+ return mem_cgroup_usage(memcg, false);
return res_counter_read_u64(&memcg->res, name);
case _MEMSWAP:
+ if (name == RES_USAGE)
+ return mem_cgroup_usage(memcg, true);
return res_counter_read_u64(&memcg->memsw, name);
case _KMEM:
return res_counter_read_u64(&memcg->kmem, name);
@@ -4572,10 +4622,7 @@
if (!t)
goto unlock;
- if (!swap)
- usage = res_counter_read_u64(&memcg->res, RES_USAGE);
- else
- usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ usage = mem_cgroup_usage(memcg, swap);
/*
* current_threshold points to threshold just below or equal to usage.
@@ -4673,10 +4720,10 @@
if (type == _MEM) {
thresholds = &memcg->thresholds;
- usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ usage = mem_cgroup_usage(memcg, false);
} else if (type == _MEMSWAP) {
thresholds = &memcg->memsw_thresholds;
- usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ usage = mem_cgroup_usage(memcg, true);
} else
BUG();
@@ -4762,10 +4809,10 @@
if (type == _MEM) {
thresholds = &memcg->thresholds;
- usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ usage = mem_cgroup_usage(memcg, false);
} else if (type == _MEMSWAP) {
thresholds = &memcg->memsw_thresholds;
- usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ usage = mem_cgroup_usage(memcg, true);
} else
BUG();
@@ -5525,9 +5572,9 @@
* core guarantees its existence.
*/
} else {
- res_counter_init(&memcg->res, &root_mem_cgroup->res);
- res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw);
- res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
+ res_counter_init(&memcg->res, NULL);
+ res_counter_init(&memcg->memsw, NULL);
+ res_counter_init(&memcg->kmem, NULL);
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
@@ -5969,8 +6016,9 @@
/* we must fixup refcnts and charges */
if (mc.moved_swap) {
/* uncharge swap account from the old cgroup */
- res_counter_uncharge(&mc.from->memsw,
- PAGE_SIZE * mc.moved_swap);
+ if (!mem_cgroup_is_root(mc.from))
+ res_counter_uncharge(&mc.from->memsw,
+ PAGE_SIZE * mc.moved_swap);
for (i = 0; i < mc.moved_swap; i++)
css_put(&mc.from->css);
@@ -5979,8 +6027,9 @@
* we charged both to->res and to->memsw, so we should
* uncharge to->res.
*/
- res_counter_uncharge(&mc.to->res,
- PAGE_SIZE * mc.moved_swap);
+ if (!mem_cgroup_is_root(mc.to))
+ res_counter_uncharge(&mc.to->res,
+ PAGE_SIZE * mc.moved_swap);
/* we've already done css_get(mc.to) */
mc.moved_swap = 0;
}
@@ -6345,7 +6394,8 @@
rcu_read_lock();
memcg = mem_cgroup_lookup(id);
if (memcg) {
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+ if (!mem_cgroup_is_root(memcg))
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
css_put(&memcg->css);
}
@@ -6509,12 +6559,15 @@
{
unsigned long flags;
- if (nr_mem)
- res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE);
- if (nr_memsw)
- res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE);
-
- memcg_oom_recover(memcg);
+ if (!mem_cgroup_is_root(memcg)) {
+ if (nr_mem)
+ res_counter_uncharge(&memcg->res,
+ nr_mem * PAGE_SIZE);
+ if (nr_memsw)
+ res_counter_uncharge(&memcg->memsw,
+ nr_memsw * PAGE_SIZE);
+ memcg_oom_recover(memcg);
+ }
local_irq_save(flags);
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 3707c71..5110816 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -108,7 +108,7 @@
int page_start, int page_end)
{
const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
- unsigned int cpu;
+ unsigned int cpu, tcpu;
int i;
for_each_possible_cpu(cpu) {
@@ -116,14 +116,23 @@
struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
- if (!*pagep) {
- pcpu_free_pages(chunk, pages, populated,
- page_start, page_end);
- return -ENOMEM;
- }
+ if (!*pagep)
+ goto err;
}
}
return 0;
+
+err:
+ while (--i >= page_start)
+ __free_page(pages[pcpu_page_idx(cpu, i)]);
+
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ for (i = page_start; i < page_end; i++)
+ __free_page(pages[pcpu_page_idx(tcpu, i)]);
+ }
+ return -ENOMEM;
}
/**
@@ -263,6 +272,7 @@
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
page_end - page_start);
}
+ pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
return err;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 2139e30..da997f9 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1932,6 +1932,8 @@
if (pcpu_setup_first_chunk(ai, fc) < 0)
panic("Failed to initialize percpu areas.");
+
+ pcpu_free_alloc_info(ai);
}
#endif /* CONFIG_SMP */
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index b50dabb..faff624 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -589,6 +589,14 @@
void hci_le_conn_failed(struct hci_conn *conn, u8 status)
{
struct hci_dev *hdev = conn->hdev;
+ struct hci_conn_params *params;
+
+ params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst,
+ conn->dst_type);
+ if (params && params->conn) {
+ hci_conn_drop(params->conn);
+ params->conn = NULL;
+ }
conn->state = BT_CLOSED;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index c32d361..1d9c29a 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2536,8 +2536,13 @@
{
struct hci_conn_params *p;
- list_for_each_entry(p, &hdev->le_conn_params, list)
+ list_for_each_entry(p, &hdev->le_conn_params, list) {
+ if (p->conn) {
+ hci_conn_drop(p->conn);
+ p->conn = NULL;
+ }
list_del_init(&p->action);
+ }
BT_DBG("All LE pending actions cleared");
}
@@ -2578,8 +2583,8 @@
hci_dev_lock(hdev);
hci_inquiry_cache_flush(hdev);
- hci_conn_hash_flush(hdev);
hci_pend_le_actions_clear(hdev);
+ hci_conn_hash_flush(hdev);
hci_dev_unlock(hdev);
hci_notify(hdev, HCI_DEV_DOWN);
@@ -3727,6 +3732,9 @@
if (!params)
return;
+ if (params->conn)
+ hci_conn_drop(params->conn);
+
list_del(¶ms->action);
list_del(¶ms->list);
kfree(params);
@@ -3757,6 +3765,8 @@
struct hci_conn_params *params, *tmp;
list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) {
+ if (params->conn)
+ hci_conn_drop(params->conn);
list_del(¶ms->action);
list_del(¶ms->list);
kfree(params);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index be35598..a600082 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4221,8 +4221,13 @@
hci_proto_connect_cfm(conn, ev->status);
params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
- if (params)
+ if (params) {
list_del_init(¶ms->action);
+ if (params->conn) {
+ hci_conn_drop(params->conn);
+ params->conn = NULL;
+ }
+ }
unlock:
hci_update_background_scan(hdev);
@@ -4304,8 +4309,16 @@
conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW,
HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER);
- if (!IS_ERR(conn))
+ if (!IS_ERR(conn)) {
+ /* Store the pointer since we don't really have any
+ * other owner of the object besides the params that
+ * triggered it. This way we can abort the connection if
+ * the parameters get removed and keep the reference
+ * count consistent once the connection is established.
+ */
+ params->conn = conn;
return;
+ }
switch (PTR_ERR(conn)) {
case -EBUSY:
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 488dd1a..fdbc9a8 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -775,7 +775,7 @@
EXPORT_SYMBOL(__skb_checksum_complete);
/**
- * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec.
+ * skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
* @skb: skbuff
* @hlen: hardware length
* @iov: io vector
diff --git a/net/core/dev.c b/net/core/dev.c
index b65a505..ab9a165 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2587,13 +2587,19 @@
return harmonize_features(skb, features);
}
- features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
- NETIF_F_HW_VLAN_STAG_TX);
+ features = netdev_intersect_features(features,
+ skb->dev->vlan_features |
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX);
if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
- features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
- NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
- NETIF_F_HW_VLAN_STAG_TX;
+ features = netdev_intersect_features(features,
+ NETIF_F_SG |
+ NETIF_F_HIGHDMA |
+ NETIF_F_FRAGLIST |
+ NETIF_F_GEN_CSUM |
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX);
return harmonize_features(skb, features);
}
@@ -4889,7 +4895,8 @@
if (adj->master)
sysfs_remove_link(&(dev->dev.kobj), "master");
- if (netdev_adjacent_is_neigh_list(dev, dev_list))
+ if (netdev_adjacent_is_neigh_list(dev, dev_list) &&
+ net_eq(dev_net(dev),dev_net(adj_dev)))
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
list_del_rcu(&adj->list);
@@ -5159,11 +5166,65 @@
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
+void netdev_adjacent_add_links(struct net_device *dev)
+{
+ struct netdev_adjacent *iter;
+
+ struct net *net = dev_net(dev);
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_add(dev, iter->dev,
+ &dev->adj_list.upper);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_add(iter->dev, dev,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_add(dev, iter->dev,
+ &dev->adj_list.lower);
+ }
+}
+
+void netdev_adjacent_del_links(struct net_device *dev)
+{
+ struct netdev_adjacent *iter;
+
+ struct net *net = dev_net(dev);
+
+ list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, dev->name,
+ &iter->dev->adj_list.lower);
+ netdev_adjacent_sysfs_del(dev, iter->dev->name,
+ &dev->adj_list.upper);
+ }
+
+ list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
+ netdev_adjacent_sysfs_del(iter->dev, dev->name,
+ &iter->dev->adj_list.upper);
+ netdev_adjacent_sysfs_del(dev, iter->dev->name,
+ &dev->adj_list.lower);
+ }
+}
+
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
struct netdev_adjacent *iter;
+ struct net *net = dev_net(dev);
+
list_for_each_entry(iter, &dev->adj_list.upper, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
netdev_adjacent_sysfs_del(iter->dev, oldname,
&iter->dev->adj_list.lower);
netdev_adjacent_sysfs_add(iter->dev, dev,
@@ -5171,6 +5232,8 @@
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
+ if (!net_eq(net,dev_net(iter->dev)))
+ continue;
netdev_adjacent_sysfs_del(iter->dev, oldname,
&iter->dev->adj_list.upper);
netdev_adjacent_sysfs_add(iter->dev, dev,
@@ -6773,6 +6836,7 @@
/* Send a netdev-removed uevent to the old namespace */
kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
+ netdev_adjacent_del_links(dev);
/* Actually switch the network namespace */
dev_net_set(dev, net);
@@ -6787,6 +6851,7 @@
/* Send a netdev-add uevent to the new namespace */
kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
+ netdev_adjacent_add_links(dev);
/* Fixup kobjects */
err = device_rename(&dev->dev, dev->name);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 6b5b6e7..9d33dff 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -197,7 +197,7 @@
* as destination. A new timer with the interval specified in the
* configuration TLV is created. Upon each interval, the latest statistics
* will be read from &bstats and the estimated rate will be stored in
- * &rate_est with the statistics lock grabed during this period.
+ * &rate_est with the statistics lock grabbed during this period.
*
* Returns 0 on success or a negative error code.
*
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 9d3d9e7..2ddbce4 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -206,7 +206,7 @@
* @st: application specific statistics data
* @len: length of data
*
- * Appends the application sepecific statistics to the top level TLV created by
+ * Appends the application specific statistics to the top level TLV created by
* gnet_stats_start_copy() and remembers the data for XSTATS if the dumping
* handle is in backward compatibility mode.
*
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 163b673..da1378a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2647,7 +2647,7 @@
* skb_seq_read() will return the remaining part of the block.
*
* Note 1: The size of each block of data returned can be arbitrary,
- * this limitation is the cost for zerocopy seqeuental
+ * this limitation is the cost for zerocopy sequential
* reads of potentially non linear data.
*
* Note 2: Fragment lists within fragments are not implemented
@@ -2781,7 +2781,7 @@
/**
* skb_append_datato_frags - append the user data to a skb
* @sk: sock structure
- * @skb: skb structure to be appened with user data.
+ * @skb: skb structure to be appended with user data.
* @getfrag: call back function to be used for getting the user data
* @from: pointer to user message iov
* @length: length of the iov message
diff --git a/net/core/sock.c b/net/core/sock.c
index 2714811..d372b4b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -166,7 +166,7 @@
/**
* sk_capable - Socket global capability test
* @sk: Socket to use a capability on or through
- * @cap: The global capbility to use
+ * @cap: The global capability to use
*
* Test to see if the opener of the socket had when the socket was
* created and the current process has the capability @cap in all user
@@ -183,7 +183,7 @@
* @sk: Socket to use a capability on or through
* @cap: The capability to use
*
- * Test to see if the opener of the socket had when the socke was created
+ * Test to see if the opener of the socket had when the socket was created
* and the current process has the capability @cap over the network namespace
* the socket is a member of.
*/
@@ -1822,6 +1822,9 @@
order);
if (page)
goto fill_page;
+ /* Do not retry other high order allocations */
+ order = 1;
+ max_page_order = 0;
}
order--;
}
@@ -1869,10 +1872,8 @@
* no guarantee that allocations succeed. Therefore, @sz MUST be
* less or equal than PAGE_SIZE.
*/
-bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
+bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
- int order;
-
if (pfrag->page) {
if (atomic_read(&pfrag->page->_count) == 1) {
pfrag->offset = 0;
@@ -1883,20 +1884,21 @@
put_page(pfrag->page);
}
- order = SKB_FRAG_PAGE_ORDER;
- do {
- gfp_t gfp = prio;
-
- if (order)
- gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
- pfrag->page = alloc_pages(gfp, order);
+ pfrag->offset = 0;
+ if (SKB_FRAG_PAGE_ORDER) {
+ pfrag->page = alloc_pages(gfp | __GFP_COMP |
+ __GFP_NOWARN | __GFP_NORETRY,
+ SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
- pfrag->offset = 0;
- pfrag->size = PAGE_SIZE << order;
+ pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
return true;
}
- } while (--order >= 0);
-
+ }
+ pfrag->page = alloc_page(gfp);
+ if (likely(pfrag->page)) {
+ pfrag->size = PAGE_SIZE;
+ return true;
+ }
return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);
diff --git a/net/ieee802154/6lowpan_rtnl.c b/net/ieee802154/6lowpan_rtnl.c
index 016b77e..6591d27 100644
--- a/net/ieee802154/6lowpan_rtnl.c
+++ b/net/ieee802154/6lowpan_rtnl.c
@@ -246,7 +246,7 @@
return ERR_PTR(-rc);
}
} else {
- frag = ERR_PTR(ENOMEM);
+ frag = ERR_PTR(-ENOMEM);
}
return frag;
@@ -437,7 +437,7 @@
/* Frame Control + Sequence Number + Address fields + Security Header */
dev->hard_header_len = 2 + 1 + 20 + 14;
dev->needed_tailroom = 2; /* FCS */
- dev->mtu = 1281;
+ dev->mtu = IPV6_MIN_MTU;
dev->tx_queue_len = 0;
dev->flags = IFF_BROADCAST | IFF_MULTICAST;
dev->watchdog_timeo = 0;
diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c
index ffec6ce..32755cb 100644
--- a/net/ieee802154/reassembly.c
+++ b/net/ieee802154/reassembly.c
@@ -355,8 +355,6 @@
struct net *net = dev_net(skb->dev);
struct lowpan_frag_info *frag_info = lowpan_cb(skb);
struct ieee802154_addr source, dest;
- struct netns_ieee802154_lowpan *ieee802154_lowpan =
- net_ieee802154_lowpan(net);
int err;
source = mac_cb(skb)->source;
@@ -366,8 +364,10 @@
if (err < 0)
goto err;
- if (frag_info->d_size > ieee802154_lowpan->max_dsize)
+ if (frag_info->d_size > IPV6_MIN_MTU) {
+ net_warn_ratelimited("lowpan_frag_rcv: datagram size exceeds MTU\n");
goto err;
+ }
fq = fq_find(net, frag_info, &source, &dest);
if (fq != NULL) {
@@ -415,13 +415,6 @@
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
- .procname = "6lowpanfrag_max_datagram_size",
- .data = &init_net.ieee802154_lowpan.max_dsize,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
{ }
};
@@ -458,7 +451,6 @@
table[1].data = &ieee802154_lowpan->frags.low_thresh;
table[1].extra2 = &ieee802154_lowpan->frags.high_thresh;
table[2].data = &ieee802154_lowpan->frags.timeout;
- table[3].data = &ieee802154_lowpan->max_dsize;
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
@@ -533,7 +525,6 @@
ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
- ieee802154_lowpan->max_dsize = 0xFFFF;
inet_frags_init_net(&ieee802154_lowpan->frags);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index fb17312..7cbcaf4 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -82,6 +82,52 @@
help
This option enables the ARP support for nf_tables.
+config NF_NAT_IPV4
+ tristate "IPv4 NAT"
+ depends on NF_CONNTRACK_IPV4
+ default m if NETFILTER_ADVANCED=n
+ select NF_NAT
+ help
+ The IPv4 NAT option allows masquerading, port forwarding and other
+ forms of full Network Address Port Translation. This can be
+ controlled by iptables or nft.
+
+if NF_NAT_IPV4
+
+config NF_NAT_SNMP_BASIC
+ tristate "Basic SNMP-ALG support"
+ depends on NF_CONNTRACK_SNMP
+ depends on NETFILTER_ADVANCED
+ default NF_NAT && NF_CONNTRACK_SNMP
+ ---help---
+
+ This module implements an Application Layer Gateway (ALG) for
+ SNMP payloads. In conjunction with NAT, it allows a network
+ management system to access multiple private networks with
+ conflicting addresses. It works by modifying IP addresses
+ inside SNMP payloads to match IP-layer NAT mapping.
+
+ This is the "basic" form of SNMP-ALG, as described in RFC 2962
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_NAT_PROTO_GRE
+ tristate
+ depends on NF_CT_PROTO_GRE
+
+config NF_NAT_PPTP
+ tristate
+ depends on NF_CONNTRACK
+ default NF_CONNTRACK_PPTP
+ select NF_NAT_PROTO_GRE
+
+config NF_NAT_H323
+ tristate
+ depends on NF_CONNTRACK
+ default NF_CONNTRACK_H323
+
+endif # NF_NAT_IPV4
+
config IP_NF_IPTABLES
tristate "IP tables support (required for filtering/masq/NAT)"
default m if NETFILTER_ADVANCED=n
@@ -170,19 +216,21 @@
To compile it as a module, choose M here. If unsure, say N.
# NAT + specific targets: nf_conntrack
-config NF_NAT_IPV4
- tristate "IPv4 NAT"
+config IP_NF_NAT
+ tristate "iptables NAT support"
depends on NF_CONNTRACK_IPV4
default m if NETFILTER_ADVANCED=n
select NF_NAT
+ select NF_NAT_IPV4
+ select NETFILTER_XT_NAT
help
- The IPv4 NAT option allows masquerading, port forwarding and other
- forms of full Network Address Port Translation. It is controlled by
- the `nat' table in iptables: see the man page for iptables(8).
+ This enables the `nat' table in iptables. This allows masquerading,
+ port forwarding and other forms of full Network Address Port
+ Translation.
To compile it as a module, choose M here. If unsure, say N.
-if NF_NAT_IPV4
+if IP_NF_NAT
config IP_NF_TARGET_MASQUERADE
tristate "MASQUERADE target support"
@@ -214,47 +262,7 @@
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_TARGET_REDIRECT.
-endif
-
-config NF_NAT_SNMP_BASIC
- tristate "Basic SNMP-ALG support"
- depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4
- depends on NETFILTER_ADVANCED
- default NF_NAT && NF_CONNTRACK_SNMP
- ---help---
-
- This module implements an Application Layer Gateway (ALG) for
- SNMP payloads. In conjunction with NAT, it allows a network
- management system to access multiple private networks with
- conflicting addresses. It works by modifying IP addresses
- inside SNMP payloads to match IP-layer NAT mapping.
-
- This is the "basic" form of SNMP-ALG, as described in RFC 2962
-
- To compile it as a module, choose M here. If unsure, say N.
-
-# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y),
-# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker.
-# From kconfig-language.txt:
-#
-# <expr> '&&' <expr> (6)
-#
-# (6) Returns the result of min(/expr/, /expr/).
-
-config NF_NAT_PROTO_GRE
- tristate
- depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE
-
-config NF_NAT_PPTP
- tristate
- depends on NF_CONNTRACK && NF_NAT_IPV4
- default NF_NAT_IPV4 && NF_CONNTRACK_PPTP
- select NF_NAT_PROTO_GRE
-
-config NF_NAT_H323
- tristate
- depends on NF_CONNTRACK && NF_NAT_IPV4
- default NF_NAT_IPV4 && NF_CONNTRACK_H323
+endif # IP_NF_NAT
# mangle + specific targets
config IP_NF_MANGLE
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 3300162..edf4af3 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -43,7 +43,7 @@
# the three instances of ip_tables
obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
-obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 0b239fc..fc1fac2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1690,14 +1690,12 @@
addrconf_mod_dad_work(ifp, 0);
}
-/* Join to solicited addr multicast group. */
-
+/* Join to solicited addr multicast group.
+ * caller must hold RTNL */
void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr)
{
struct in6_addr maddr;
- ASSERT_RTNL();
-
if (dev->flags&(IFF_LOOPBACK|IFF_NOARP))
return;
@@ -1705,12 +1703,11 @@
ipv6_dev_mc_inc(dev, &maddr);
}
+/* caller must hold RTNL */
void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr)
{
struct in6_addr maddr;
- ASSERT_RTNL();
-
if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP))
return;
@@ -1718,12 +1715,11 @@
__ipv6_dev_mc_dec(idev, &maddr);
}
+/* caller must hold RTNL */
static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
{
struct in6_addr addr;
- ASSERT_RTNL();
-
if (ifp->prefix_len >= 127) /* RFC 6164 */
return;
ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
@@ -1732,12 +1728,11 @@
ipv6_dev_ac_inc(ifp->idev->dev, &addr);
}
+/* caller must hold RTNL */
static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
{
struct in6_addr addr;
- ASSERT_RTNL();
-
if (ifp->prefix_len >= 127) /* RFC 6164 */
return;
ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
@@ -4773,15 +4768,11 @@
addrconf_leave_solict(ifp->idev, &ifp->addr);
if (!ipv6_addr_any(&ifp->peer_addr)) {
struct rt6_info *rt;
- struct net_device *dev = ifp->idev->dev;
- rt = rt6_lookup(dev_net(dev), &ifp->peer_addr, NULL,
- dev->ifindex, 1);
- if (rt) {
- dst_hold(&rt->dst);
- if (ip6_del_rt(rt))
- dst_free(&rt->dst);
- }
+ rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
+ ifp->idev->dev, 0, 0);
+ if (rt && ip6_del_rt(rt))
+ dst_free(&rt->dst);
}
dst_hold(&ifp->rt->dst);
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 2101832..ff2de7d 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -77,6 +77,7 @@
pac->acl_next = NULL;
pac->acl_addr = *addr;
+ rtnl_lock();
rcu_read_lock();
if (ifindex == 0) {
struct rt6_info *rt;
@@ -137,6 +138,7 @@
error:
rcu_read_unlock();
+ rtnl_unlock();
if (pac)
sock_kfree_s(sk, pac, sizeof(*pac));
return err;
@@ -171,11 +173,13 @@
spin_unlock_bh(&ipv6_sk_ac_lock);
+ rtnl_lock();
rcu_read_lock();
dev = dev_get_by_index_rcu(net, pac->acl_ifindex);
if (dev)
ipv6_dev_ac_dec(dev, &pac->acl_addr);
rcu_read_unlock();
+ rtnl_unlock();
sock_kfree_s(sk, pac, sizeof(*pac));
return 0;
@@ -198,6 +202,7 @@
spin_unlock_bh(&ipv6_sk_ac_lock);
prev_index = 0;
+ rtnl_lock();
rcu_read_lock();
while (pac) {
struct ipv6_ac_socklist *next = pac->acl_next;
@@ -212,6 +217,7 @@
pac = next;
}
rcu_read_unlock();
+ rtnl_unlock();
}
static void aca_put(struct ifacaddr6 *ac)
@@ -233,6 +239,8 @@
struct rt6_info *rt;
int err;
+ ASSERT_RTNL();
+
idev = in6_dev_get(dev);
if (idev == NULL)
@@ -302,6 +310,8 @@
{
struct ifacaddr6 *aca, *prev_aca;
+ ASSERT_RTNL();
+
write_lock_bh(&idev->lock);
prev_aca = NULL;
for (aca = idev->ac_list; aca; aca = aca->aca_next) {
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 617f095..a23b655 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -172,6 +172,7 @@
mc_lst->next = NULL;
mc_lst->addr = *addr;
+ rtnl_lock();
rcu_read_lock();
if (ifindex == 0) {
struct rt6_info *rt;
@@ -185,6 +186,7 @@
if (dev == NULL) {
rcu_read_unlock();
+ rtnl_unlock();
sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
return -ENODEV;
}
@@ -202,6 +204,7 @@
if (err) {
rcu_read_unlock();
+ rtnl_unlock();
sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
return err;
}
@@ -212,6 +215,7 @@
spin_unlock(&ipv6_sk_mc_lock);
rcu_read_unlock();
+ rtnl_unlock();
return 0;
}
@@ -229,6 +233,7 @@
if (!ipv6_addr_is_multicast(addr))
return -EINVAL;
+ rtnl_lock();
spin_lock(&ipv6_sk_mc_lock);
for (lnk = &np->ipv6_mc_list;
(mc_lst = rcu_dereference_protected(*lnk,
@@ -252,12 +257,15 @@
} else
(void) ip6_mc_leave_src(sk, mc_lst, NULL);
rcu_read_unlock();
+ rtnl_unlock();
+
atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
kfree_rcu(mc_lst, rcu);
return 0;
}
}
spin_unlock(&ipv6_sk_mc_lock);
+ rtnl_unlock();
return -EADDRNOTAVAIL;
}
@@ -302,6 +310,7 @@
if (!rcu_access_pointer(np->ipv6_mc_list))
return;
+ rtnl_lock();
spin_lock(&ipv6_sk_mc_lock);
while ((mc_lst = rcu_dereference_protected(np->ipv6_mc_list,
lockdep_is_held(&ipv6_sk_mc_lock))) != NULL) {
@@ -328,6 +337,7 @@
spin_lock(&ipv6_sk_mc_lock);
}
spin_unlock(&ipv6_sk_mc_lock);
+ rtnl_unlock();
}
int ip6_mc_source(int add, int omode, struct sock *sk,
@@ -845,6 +855,8 @@
struct ifmcaddr6 *mc;
struct inet6_dev *idev;
+ ASSERT_RTNL();
+
/* we need to take a reference on idev */
idev = in6_dev_get(dev);
@@ -916,6 +928,8 @@
{
struct ifmcaddr6 *ma, **map;
+ ASSERT_RTNL();
+
write_lock_bh(&idev->lock);
for (map = &idev->mc_list; (ma=*map) != NULL; map = &ma->next) {
if (ipv6_addr_equal(&ma->mca_addr, addr)) {
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index ac93df1..2812816 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -57,9 +57,19 @@
config NF_LOG_IPV6
tristate "IPv6 packet logging"
- depends on NETFILTER_ADVANCED
+ default m if NETFILTER_ADVANCED=n
select NF_LOG_COMMON
+config NF_NAT_IPV6
+ tristate "IPv6 NAT"
+ depends on NF_CONNTRACK_IPV6
+ depends on NETFILTER_ADVANCED
+ select NF_NAT
+ help
+ The IPv6 NAT option allows masquerading, port forwarding and other
+ forms of full Network Address Port Translation. This can be
+ controlled by iptables or nft.
+
config IP6_NF_IPTABLES
tristate "IP6 tables support (required for filtering)"
depends on INET && IPV6
@@ -232,19 +242,21 @@
If unsure, say N.
-config NF_NAT_IPV6
- tristate "IPv6 NAT"
+config IP6_NF_NAT
+ tristate "ip6tables NAT support"
depends on NF_CONNTRACK_IPV6
depends on NETFILTER_ADVANCED
select NF_NAT
+ select NF_NAT_IPV6
+ select NETFILTER_XT_NAT
help
- The IPv6 NAT option allows masquerading, port forwarding and other
- forms of full Network Address Port Translation. It is controlled by
- the `nat' table in ip6tables, see the man page for ip6tables(8).
+ This enables the `nat' table in ip6tables. This allows masquerading,
+ port forwarding and other forms of full Network Address Port
+ Translation.
To compile it as a module, choose M here. If unsure, say N.
-if NF_NAT_IPV6
+if IP6_NF_NAT
config IP6_NF_TARGET_MASQUERADE
tristate "MASQUERADE target support"
@@ -265,7 +277,7 @@
To compile it as a module, choose M here. If unsure, say N.
-endif # NF_NAT_IPV6
+endif # IP6_NF_NAT
endif # IP6_NF_IPTABLES
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index c0b2631..c3d3286 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -8,7 +8,7 @@
obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
-obj-$(CONFIG_NF_NAT_IPV6) += ip6table_nat.o
+obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o
# objects for l3 independent conntrack
nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 13752d9..b704a93 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -755,7 +755,8 @@
/* If PMTU discovery was enabled, use the MTU that was discovered */
dst = sk_dst_get(tunnel->sock);
if (dst != NULL) {
- u32 pmtu = dst_mtu(__sk_dst_get(tunnel->sock));
+ u32 pmtu = dst_mtu(dst);
+
if (pmtu != 0)
session->mtu = session->mru = pmtu -
PPPOL2TP_HEADER_OVERHEAD;
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 0375009..399ad82 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -541,6 +541,8 @@
continue;
if (rcu_access_pointer(sdata->vif.chanctx_conf) != conf)
continue;
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ continue;
if (!compat)
compat = &sdata->vif.bss_conf.chandef;
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 3db9664..86173c0 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -167,7 +167,7 @@
p += scnprintf(p, sizeof(buf) + buf - p, "next dialog_token: %#02x\n",
sta->ampdu_mlme.dialog_token_allocator + 1);
p += scnprintf(p, sizeof(buf) + buf - p,
- "TID\t\tRX active\tDTKN\tSSN\t\tTX\tDTKN\tpending\n");
+ "TID\t\tRX\tDTKN\tSSN\t\tTX\tDTKN\tpending\n");
for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[i]);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 01eede7..f75e5f1 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1175,8 +1175,8 @@
if (sta) {
u16 last_seq;
- last_seq = le16_to_cpu(
- sta->last_seq_ctrl[rx_agg->tid]);
+ last_seq = IEEE80211_SEQ_TO_SN(le16_to_cpu(
+ sta->last_seq_ctrl[rx_agg->tid]));
__ieee80211_start_rx_ba_session(sta,
0, 0,
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 63b8741..c47194d 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -959,7 +959,8 @@
if (!matches_local)
event = CNF_RJCT;
if (!mesh_plink_free_count(sdata) ||
- (sta->llid != llid || sta->plid != plid))
+ sta->llid != llid ||
+ (sta->plid && sta->plid != plid))
event = CNF_IGNR;
else
event = CNF_ACPT;
@@ -1080,6 +1081,10 @@
goto unlock_rcu;
}
+ /* 802.11-2012 13.3.7.2 - update plid on CNF if not set */
+ if (!sta->plid && event == CNF_ACPT)
+ sta->plid = plid;
+
changed |= mesh_plink_fsm(sdata, sta, event);
unlock_rcu:
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 31a8afa..b82a12a 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -4376,8 +4376,7 @@
rcu_read_unlock();
if (bss->wmm_used && bss->uapsd_supported &&
- (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD) &&
- sdata->wmm_acm != 0xff) {
+ (sdata->local->hw.flags & IEEE80211_HW_SUPPORTS_UAPSD)) {
assoc_data->uapsd = true;
ifmgd->flags |= IEEE80211_STA_UAPSD_ENABLED;
} else {
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index c6ee213..441875f 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1094,8 +1094,11 @@
unsigned long flags;
struct ps_data *ps;
- if (sdata->vif.type == NL80211_IFTYPE_AP ||
- sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ sdata = container_of(sdata->bss, struct ieee80211_sub_if_data,
+ u.ap);
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
ps = &sdata->bss->ps;
else if (ieee80211_vif_is_mesh(&sdata->vif))
ps = &sdata->u.mesh.ps;
diff --git a/net/mac802154/wpan.c b/net/mac802154/wpan.c
index 3c3069f..5478388 100644
--- a/net/mac802154/wpan.c
+++ b/net/mac802154/wpan.c
@@ -462,7 +462,10 @@
skb->pkt_type = PACKET_OTHERHOST;
break;
default:
- break;
+ spin_unlock_bh(&sdata->mib_lock);
+ pr_debug("invalid dest mode\n");
+ kfree_skb(skb);
+ return NET_RX_DROP;
}
spin_unlock_bh(&sdata->mib_lock);
@@ -573,6 +576,7 @@
ret = mac802154_parse_frame_start(skb, &hdr);
if (ret) {
pr_debug("got invalid frame\n");
+ kfree_skb(skb);
return;
}
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index ad751fe..b5c1d3a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -499,7 +499,7 @@
config NFT_NAT
depends on NF_TABLES
depends on NF_CONNTRACK
- depends on NF_NAT
+ select NF_NAT
tristate "Netfilter nf_tables nat module"
help
This option adds the "nat" expression that you can use to perform
@@ -747,7 +747,9 @@
config NETFILTER_XT_TARGET_LOG
tristate "LOG target support"
- depends on NF_LOG_IPV4 && NF_LOG_IPV6
+ select NF_LOG_COMMON
+ select NF_LOG_IPV4
+ select NF_LOG_IPV6 if IPV6
default m if NETFILTER_ADVANCED=n
help
This option adds a `LOG' target, which allows you to create rules in
@@ -764,6 +766,14 @@
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_MARK (combined mark/MARK module).
+config NETFILTER_XT_NAT
+ tristate '"SNAT and DNAT" targets support'
+ depends on NF_NAT
+ ---help---
+ This option enables the SNAT and DNAT targets.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_TARGET_NETMAP
tristate '"NETMAP" target support'
depends on NF_NAT
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 8308624..fad5fdb 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -95,7 +95,7 @@
obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o
-obj-$(CONFIG_NF_NAT) += xt_nat.o
+obj-$(CONFIG_NETFILTER_XT_NAT) += xt_nat.o
# targets
obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index a93c97f..024a2e2 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -54,7 +54,7 @@
struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
EXPORT_SYMBOL(nf_hooks);
-#if defined(CONFIG_JUMP_LABEL)
+#ifdef HAVE_JUMP_LABEL
struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
EXPORT_SYMBOL(nf_hooks_needed);
#endif
@@ -72,7 +72,7 @@
}
list_add_rcu(®->list, elem->list.prev);
mutex_unlock(&nf_hook_mutex);
-#if defined(CONFIG_JUMP_LABEL)
+#ifdef HAVE_JUMP_LABEL
static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]);
#endif
return 0;
@@ -84,7 +84,7 @@
mutex_lock(&nf_hook_mutex);
list_del_rcu(®->list);
mutex_unlock(&nf_hook_mutex);
-#if defined(CONFIG_JUMP_LABEL)
+#ifdef HAVE_JUMP_LABEL
static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
#endif
synchronize_net();
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index e683675..5c34e8d 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1906,7 +1906,7 @@
{
.hook = ip_vs_local_reply6,
.owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
+ .pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST + 1,
},
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 6f70bdd..56896a4 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -38,6 +38,7 @@
#include <net/route.h> /* for ip_route_output */
#include <net/ipv6.h>
#include <net/ip6_route.h>
+#include <net/ip_tunnels.h>
#include <net/addrconf.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
@@ -862,11 +863,15 @@
old_iph = ip_hdr(skb);
}
- skb->transport_header = skb->network_header;
-
/* fix old IP header checksum */
ip_send_check(old_iph);
+ skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+ if (IS_ERR(skb))
+ goto tx_error;
+
+ skb->transport_header = skb->network_header;
+
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -900,7 +905,8 @@
return NF_STOLEN;
tx_error:
- kfree_skb(skb);
+ if (!IS_ERR(skb))
+ kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
@@ -953,6 +959,11 @@
old_iph = ipv6_hdr(skb);
}
+ /* GSO: we need to provide proper SKB_GSO_ value for IPv6 */
+ skb = iptunnel_handle_offloads(skb, false, 0); /* SKB_GSO_SIT/IPV6 */
+ if (IS_ERR(skb))
+ goto tx_error;
+
skb->transport_header = skb->network_header;
skb_push(skb, sizeof(struct ipv6hdr));
@@ -988,7 +999,8 @@
return NF_STOLEN;
tx_error:
- kfree_skb(skb);
+ if (!IS_ERR(skb))
+ kfree_skb(skb);
rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index f4e8330..7198d66 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -31,7 +31,7 @@
if (info->invert & ~1)
return -EINVAL;
- return info->id ? 0 : -EINVAL;
+ return 0;
}
static bool
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 7228ec3..91d66b7 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -265,8 +265,11 @@
upcall.key = &key;
upcall.userdata = NULL;
upcall.portid = ovs_vport_find_upcall_portid(p, skb);
- ovs_dp_upcall(dp, skb, &upcall);
- consume_skb(skb);
+ error = ovs_dp_upcall(dp, skb, &upcall);
+ if (unlikely(error))
+ kfree_skb(skb);
+ else
+ consume_skb(skb);
stats_counter = &stats->n_missed;
goto out;
}
@@ -404,7 +407,7 @@
{
struct ovs_header *upcall;
struct sk_buff *nskb = NULL;
- struct sk_buff *user_skb; /* to be queued to userspace */
+ struct sk_buff *user_skb = NULL; /* to be queued to userspace */
struct nlattr *nla;
struct genl_info info = {
.dst_sk = ovs_dp_get_net(dp)->genl_sock,
@@ -494,9 +497,11 @@
((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
+ user_skb = NULL;
out:
if (err)
skb_tx_error(skb);
+ kfree_skb(user_skb);
kfree_skb(nskb);
return err;
}
diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c
index 14c98e4..02a86a2 100644
--- a/net/rfkill/rfkill-gpio.c
+++ b/net/rfkill/rfkill-gpio.c
@@ -158,6 +158,7 @@
{ "BCM2E1A", RFKILL_TYPE_BLUETOOTH },
{ "BCM2E39", RFKILL_TYPE_BLUETOOTH },
{ "BCM2E3D", RFKILL_TYPE_BLUETOOTH },
+ { "BCM2E64", RFKILL_TYPE_BLUETOOTH },
{ "BCM4752", RFKILL_TYPE_GPS },
{ "LNV4752", RFKILL_TYPE_GPS },
{ },
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index eb71d49..634a2ab 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4243,7 +4243,7 @@
transport = asoc->peer.primary_path;
status.sstat_assoc_id = sctp_assoc2id(asoc);
- status.sstat_state = asoc->state;
+ status.sstat_state = sctp_assoc_to_state(asoc);
status.sstat_rwnd = asoc->peer.rwnd;
status.sstat_unackdata = asoc->unack_data;
diff --git a/net/socket.c b/net/socket.c
index 95ee7d8..2e2586e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -734,8 +734,7 @@
}
memset(&tss, 0, sizeof(tss));
- if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE ||
- skb_shinfo(skb)->tx_flags & SKBTX_ANY_SW_TSTAMP) &&
+ if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
ktime_to_timespec_cond(skb->tstamp, tss.ts + 0))
empty = 0;
if (shhwtstamps &&
@@ -2602,7 +2601,7 @@
*
* This function is called by a protocol handler that wants to
* advertise its address family, and have it linked into the
- * socket interface. The value ops->family coresponds to the
+ * socket interface. The value ops->family corresponds to the
* socket system call protocol family.
*/
int sock_register(const struct net_proto_family *ops)
diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile
index 9c4d241..a7e0862 100644
--- a/scripts/kconfig/Makefile
+++ b/scripts/kconfig/Makefile
@@ -3,7 +3,7 @@
# These targets are used from top-level makefile
PHONY += oldconfig xconfig gconfig menuconfig config silentoldconfig update-po-config \
- localmodconfig localyesconfig
+ localmodconfig localyesconfig kvmconfig
ifdef KBUILD_KCONFIG
Kconfig := $(KBUILD_KCONFIG)
@@ -36,6 +36,11 @@
$(Q)mkdir -p include/config include/generated
$< --$@ $(Kconfig)
+kvmconfig:
+ $(Q)$(CONFIG_SHELL) $(srctree)/scripts/config -e KVMTOOL_TEST_ENABLE
+ $(Q)yes "" | make oldconfig > /dev/null
+ @echo 'Kernel configuration modified to run as KVM guest.'
+
localyesconfig localmodconfig: $(obj)/streamline_config.pl $(obj)/conf
$(Q)mkdir -p include/config include/generated
$(Q)perl $< --$@ $(srctree) $(Kconfig) > .tmp.config
diff --git a/security/keys/key.c b/security/keys/key.c
index b90a68c..6d0cad1 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -27,8 +27,8 @@
struct rb_root key_user_tree; /* tree of quota records indexed by UID */
DEFINE_SPINLOCK(key_user_lock);
-unsigned int key_quota_root_maxkeys = 200; /* root's key count quota */
-unsigned int key_quota_root_maxbytes = 20000; /* root's key space quota */
+unsigned int key_quota_root_maxkeys = 1000000; /* root's key count quota */
+unsigned int key_quota_root_maxbytes = 25000000; /* root's key space quota */
unsigned int key_quota_maxkeys = 200; /* general key count quota */
unsigned int key_quota_maxbytes = 20000; /* general key space quota */
diff --git a/sound/firewire/amdtp.c b/sound/firewire/amdtp.c
index f96bf4c..95fc2eaf 100644
--- a/sound/firewire/amdtp.c
+++ b/sound/firewire/amdtp.c
@@ -507,7 +507,16 @@
static void update_pcm_pointers(struct amdtp_stream *s,
struct snd_pcm_substream *pcm,
unsigned int frames)
-{ unsigned int ptr;
+{
+ unsigned int ptr;
+
+ /*
+ * In IEC 61883-6, one data block represents one event. In ALSA, one
+ * event equals to one PCM frame. But Dice has a quirk to transfer
+ * two PCM frames in one data block.
+ */
+ if (s->double_pcm_frames)
+ frames *= 2;
ptr = s->pcm_buffer_pointer + frames;
if (ptr >= pcm->runtime->buffer_size)
diff --git a/sound/firewire/amdtp.h b/sound/firewire/amdtp.h
index d8ee7b0..4823c08 100644
--- a/sound/firewire/amdtp.h
+++ b/sound/firewire/amdtp.h
@@ -125,6 +125,7 @@
unsigned int pcm_buffer_pointer;
unsigned int pcm_period_pointer;
bool pointer_flush;
+ bool double_pcm_frames;
struct snd_rawmidi_substream *midi[AMDTP_MAX_CHANNELS_FOR_MIDI * 8];
diff --git a/sound/firewire/dice.c b/sound/firewire/dice.c
index a9a30c0..e3a04d6 100644
--- a/sound/firewire/dice.c
+++ b/sound/firewire/dice.c
@@ -567,10 +567,14 @@
return err;
/*
- * At rates above 96 kHz, pretend that the stream runs at half the
- * actual sample rate with twice the number of channels; two samples
- * of a channel are stored consecutively in the packet. Requires
- * blocking mode and PCM buffer size should be aligned to SYT_INTERVAL.
+ * At 176.4/192.0 kHz, Dice has a quirk to transfer two PCM frames in
+ * one data block of AMDTP packet. Thus sampling transfer frequency is
+ * a half of PCM sampling frequency, i.e. PCM frames at 192.0 kHz are
+ * transferred on AMDTP packets at 96 kHz. Two successive samples of a
+ * channel are stored consecutively in the packet. This quirk is called
+ * as 'Dual Wire'.
+ * For this quirk, blocking mode is required and PCM buffer size should
+ * be aligned to SYT_INTERVAL.
*/
channels = params_channels(hw_params);
if (rate_index > 4) {
@@ -579,18 +583,25 @@
return err;
}
- for (i = 0; i < channels; i++) {
- dice->stream.pcm_positions[i * 2] = i;
- dice->stream.pcm_positions[i * 2 + 1] = i + channels;
- }
-
rate /= 2;
channels *= 2;
+ dice->stream.double_pcm_frames = true;
+ } else {
+ dice->stream.double_pcm_frames = false;
}
mode = rate_index_to_mode(rate_index);
amdtp_stream_set_parameters(&dice->stream, rate, channels,
dice->rx_midi_ports[mode]);
+ if (rate_index > 4) {
+ channels /= 2;
+
+ for (i = 0; i < channels; i++) {
+ dice->stream.pcm_positions[i] = i * 2;
+ dice->stream.pcm_positions[i + channels] = i * 2 + 1;
+ }
+ }
+
amdtp_stream_set_pcm_format(&dice->stream,
params_format(hw_params));
diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
index 6f2fa83..6e5d0cb 100644
--- a/sound/pci/hda/patch_conexant.c
+++ b/sound/pci/hda/patch_conexant.c
@@ -217,6 +217,7 @@
CXT_FIXUP_HEADPHONE_MIC_PIN,
CXT_FIXUP_HEADPHONE_MIC,
CXT_FIXUP_GPIO1,
+ CXT_FIXUP_ASPIRE_DMIC,
CXT_FIXUP_THINKPAD_ACPI,
CXT_FIXUP_OLPC_XO,
CXT_FIXUP_CAP_MIX_AMP,
@@ -664,6 +665,12 @@
{ }
},
},
+ [CXT_FIXUP_ASPIRE_DMIC] = {
+ .type = HDA_FIXUP_FUNC,
+ .v.func = cxt_fixup_stereo_dmic,
+ .chained = true,
+ .chain_id = CXT_FIXUP_GPIO1,
+ },
[CXT_FIXUP_THINKPAD_ACPI] = {
.type = HDA_FIXUP_FUNC,
.v.func = hda_fixup_thinkpad_acpi,
@@ -744,7 +751,7 @@
static const struct snd_pci_quirk cxt5066_fixups[] = {
SND_PCI_QUIRK(0x1025, 0x0543, "Acer Aspire One 522", CXT_FIXUP_STEREO_DMIC),
- SND_PCI_QUIRK(0x1025, 0x054c, "Acer Aspire 3830TG", CXT_FIXUP_GPIO1),
+ SND_PCI_QUIRK(0x1025, 0x054c, "Acer Aspire 3830TG", CXT_FIXUP_ASPIRE_DMIC),
SND_PCI_QUIRK(0x1043, 0x138d, "Asus", CXT_FIXUP_HEADPHONE_MIC_PIN),
SND_PCI_QUIRK(0x152d, 0x0833, "OLPC XO-1.5", CXT_FIXUP_OLPC_XO),
SND_PCI_QUIRK(0x17aa, 0x20f2, "Lenovo T400", CXT_PINCFG_LENOVO_TP410),
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index d446ac3..1ba22fb 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -328,6 +328,7 @@
case 0x10ec0885:
case 0x10ec0887:
/*case 0x10ec0889:*/ /* this causes an SPDIF problem */
+ case 0x10ec0900:
alc889_coef_init(codec);
break;
case 0x10ec0888:
@@ -2350,6 +2351,7 @@
switch (codec->vendor_id) {
case 0x10ec0882:
case 0x10ec0885:
+ case 0x10ec0900:
break;
default:
/* ALC883 and variants */
diff --git a/sound/soc/codecs/cs4265.c b/sound/soc/codecs/cs4265.c
index a20b30c..9852320 100644
--- a/sound/soc/codecs/cs4265.c
+++ b/sound/soc/codecs/cs4265.c
@@ -282,10 +282,10 @@
/*64k*/
{8192000, 64000, 1, 0},
- {1228800, 64000, 1, 1},
- {1693440, 64000, 1, 2},
- {2457600, 64000, 1, 3},
- {3276800, 64000, 1, 4},
+ {12288000, 64000, 1, 1},
+ {16934400, 64000, 1, 2},
+ {24576000, 64000, 1, 3},
+ {32768000, 64000, 1, 4},
/* 88.2k */
{11289600, 88200, 1, 0},
@@ -435,10 +435,10 @@
index = cs4265_get_clk_index(cs4265->sysclk, params_rate(params));
if (index >= 0) {
snd_soc_update_bits(codec, CS4265_ADC_CTL,
- CS4265_ADC_FM, clk_map_table[index].fm_mode);
+ CS4265_ADC_FM, clk_map_table[index].fm_mode << 6);
snd_soc_update_bits(codec, CS4265_MCLK_FREQ,
CS4265_MCLK_FREQ_MASK,
- clk_map_table[index].mclkdiv);
+ clk_map_table[index].mclkdiv << 4);
} else {
dev_err(codec->dev, "can't get correct mclk\n");
diff --git a/sound/soc/codecs/da732x.h b/sound/soc/codecs/da732x.h
index 1dceafe..f586cbd 100644
--- a/sound/soc/codecs/da732x.h
+++ b/sound/soc/codecs/da732x.h
@@ -11,7 +11,7 @@
*/
#ifndef __DA732X_H_
-#define __DA732X_H
+#define __DA732X_H_
#include <sound/soc.h>
diff --git a/sound/soc/codecs/rt5640.c b/sound/soc/codecs/rt5640.c
index 6bc6efd..f1ec6e6 100644
--- a/sound/soc/codecs/rt5640.c
+++ b/sound/soc/codecs/rt5640.c
@@ -2059,6 +2059,7 @@
static const struct regmap_config rt5640_regmap = {
.reg_bits = 8,
.val_bits = 16,
+ .use_single_rw = true,
.max_register = RT5640_VENDOR_ID2 + 1 + (ARRAY_SIZE(rt5640_ranges) *
RT5640_PR_SPACING),
diff --git a/sound/soc/codecs/rt5677.c b/sound/soc/codecs/rt5677.c
index 67f1455..5337c44 100644
--- a/sound/soc/codecs/rt5677.c
+++ b/sound/soc/codecs/rt5677.c
@@ -2135,10 +2135,10 @@
{ "BST2", NULL, "IN2P" },
{ "BST2", NULL, "IN2N" },
- { "IN1P", NULL, "micbias1" },
- { "IN1N", NULL, "micbias1" },
- { "IN2P", NULL, "micbias1" },
- { "IN2N", NULL, "micbias1" },
+ { "IN1P", NULL, "MICBIAS1" },
+ { "IN1N", NULL, "MICBIAS1" },
+ { "IN2P", NULL, "MICBIAS1" },
+ { "IN2N", NULL, "MICBIAS1" },
{ "ADC 1", NULL, "BST1" },
{ "ADC 1", NULL, "ADC 1 power" },
diff --git a/sound/soc/generic/simple-card.c b/sound/soc/generic/simple-card.c
index 159e517f..cef7776 100644
--- a/sound/soc/generic/simple-card.c
+++ b/sound/soc/generic/simple-card.c
@@ -481,12 +481,19 @@
snd_soc_card_set_drvdata(&priv->snd_card, priv);
ret = devm_snd_soc_register_card(&pdev->dev, &priv->snd_card);
+ if (ret >= 0)
+ return ret;
err:
asoc_simple_card_unref(pdev);
return ret;
}
+static int asoc_simple_card_remove(struct platform_device *pdev)
+{
+ return asoc_simple_card_unref(pdev);
+}
+
static const struct of_device_id asoc_simple_of_match[] = {
{ .compatible = "simple-audio-card", },
{},
@@ -500,6 +507,7 @@
.of_match_table = asoc_simple_of_match,
},
.probe = asoc_simple_card_probe,
+ .remove = asoc_simple_card_remove,
};
module_platform_driver(asoc_simple_card);
diff --git a/sound/soc/omap/omap-twl4030.c b/sound/soc/omap/omap-twl4030.c
index f8a6adc..4336d18 100644
--- a/sound/soc/omap/omap-twl4030.c
+++ b/sound/soc/omap/omap-twl4030.c
@@ -260,7 +260,7 @@
.stream_name = "TWL4030 Voice",
.cpu_dai_name = "omap-mcbsp.3",
.codec_dai_name = "twl4030-voice",
- .platform_name = "omap-mcbsp.2",
+ .platform_name = "omap-mcbsp.3",
.codec_name = "twl4030-codec",
.dai_fmt = SND_SOC_DAIFMT_DSP_A | SND_SOC_DAIFMT_IB_NF |
SND_SOC_DAIFMT_CBM_CFM,
diff --git a/sound/soc/sh/rcar/gen.c b/sound/soc/sh/rcar/gen.c
index 3fdf3be..f95e7ab 100644
--- a/sound/soc/sh/rcar/gen.c
+++ b/sound/soc/sh/rcar/gen.c
@@ -247,7 +247,7 @@
};
/* it shouldn't happen */
- if (use_dvc & !use_src)
+ if (use_dvc && !use_src)
dev_err(dev, "DVC is selected without SRC\n");
/* use SSIU or SSI ? */
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index d4bfd4a..889f4e3 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -1325,7 +1325,7 @@
device_initialize(rtd->dev);
rtd->dev->parent = rtd->card->dev;
rtd->dev->release = rtd_release;
- rtd->dev->init_name = name;
+ dev_set_name(rtd->dev, "%s", name);
dev_set_drvdata(rtd->dev, rtd);
mutex_init(&rtd->pcm_mutex);
INIT_LIST_HEAD(&rtd->dpcm[SNDRV_PCM_STREAM_PLAYBACK].be_clients);
diff --git a/sound/soc/tegra/tegra_asoc_utils.h b/sound/soc/tegra/tegra_asoc_utils.h
index 9577121..ca80376 100644
--- a/sound/soc/tegra/tegra_asoc_utils.h
+++ b/sound/soc/tegra/tegra_asoc_utils.h
@@ -21,7 +21,7 @@
*/
#ifndef __TEGRA_ASOC_UTILS_H__
-#define __TEGRA_ASOC_UTILS_H_
+#define __TEGRA_ASOC_UTILS_H__
struct clk;
struct device;
diff --git a/tools/kvm/.gitignore b/tools/kvm/.gitignore
new file mode 100644
index 0000000..60dd6db
--- /dev/null
+++ b/tools/kvm/.gitignore
@@ -0,0 +1,12 @@
+/lkvm
+/vm
+*.o
+*.d
+.cscope
+tags
+include/common-cmds.h
+tests/boot/boot_test.iso
+tests/boot/rootfs/
+guest/init
+guest/init_stage2
+KVMTOOLS-VERSION-FILE
diff --git a/tools/kvm/CREDITS-Git b/tools/kvm/CREDITS-Git
new file mode 100644
index 0000000..c2ddcb3
--- /dev/null
+++ b/tools/kvm/CREDITS-Git
@@ -0,0 +1,30 @@
+Most of the infrastructure that 'perf' uses here has been reused
+from the Git project, as of version:
+
+ 66996ec: Sync with 1.6.2.4
+
+Here is an (incomplete!) list of main contributors to those files
+in util/* and elsewhere:
+
+ Alex Riesen
+ Christian Couder
+ Dmitry Potapov
+ Jeff King
+ Johannes Schindelin
+ Johannes Sixt
+ Junio C Hamano
+ Linus Torvalds
+ Matthias Kestenholz
+ Michal Ostrowski
+ Miklos Vajna
+ Petr Baudis
+ Pierre Habouzit
+ René Scharfe
+ Samuel Tardieu
+ Shawn O. Pearce
+ Steffen Prohaska
+ Steve Haslam
+
+Thanks guys!
+
+The full history of the files can be found in the upstream Git commits.
diff --git a/tools/kvm/Documentation/kernel-debugging.txt b/tools/kvm/Documentation/kernel-debugging.txt
new file mode 100644
index 0000000..98b9438
--- /dev/null
+++ b/tools/kvm/Documentation/kernel-debugging.txt
@@ -0,0 +1,15 @@
+This document explains how to debug a guest's kernel using KGDB.
+
+1. Run the guest:
+ 'lkvm run -k [vmlinuz] -p "kgdboc=ttyS1 kgdbwait" --tty 1'
+
+And see which PTY got assigned to ttyS1 (you'll see:
+' Info: Assigned terminal 1 to pty /dev/pts/X').
+
+2. Run GDB on the host:
+ 'gdb [vmlinuz]'
+
+3. Connect to the guest (from within GDB):
+ 'target remote /dev/pts/X'
+
+4. Start debugging! (enter 'continue' to continue boot).
diff --git a/tools/kvm/Documentation/kvm-balloon.txt b/tools/kvm/Documentation/kvm-balloon.txt
new file mode 100644
index 0000000..efc0a87
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-balloon.txt
@@ -0,0 +1,24 @@
+lkvm-balloon(1)
+================
+
+NAME
+----
+lkvm-balloon - Inflate or deflate the virtio balloon
+
+SYNOPSIS
+--------
+[verse]
+'lkvm balloon [command] [size] [instance]'
+
+DESCRIPTION
+-----------
+The command inflates or deflates the virtio balloon located in the
+specified instance.
+For a list of running instances see 'lkvm list'.
+
+Command can be either 'inflate' or 'deflate'. Inflate increases the
+size of the balloon, thus decreasing the amount of virtual RAM available
+for the guest. Deflation returns previously inflated memory back to the
+guest.
+
+size is specified in MiB.
diff --git a/tools/kvm/Documentation/kvm-debug.txt b/tools/kvm/Documentation/kvm-debug.txt
new file mode 100644
index 0000000..a8eb2c0
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-debug.txt
@@ -0,0 +1,16 @@
+lkvm-debug(1)
+================
+
+NAME
+----
+lkvm-debug - Print debug information from a running instance
+
+SYNOPSIS
+--------
+[verse]
+'lkvm debug [instance]'
+
+DESCRIPTION
+-----------
+The command prints debug information from a running instance.
+For a list of running instances see 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-list.txt b/tools/kvm/Documentation/kvm-list.txt
new file mode 100644
index 0000000..a245607
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-list.txt
@@ -0,0 +1,16 @@
+lkvm-list(1)
+================
+
+NAME
+----
+lkvm-list - Print a list of running instances on the host.
+
+SYNOPSIS
+--------
+[verse]
+'lkvm list'
+
+DESCRIPTION
+-----------
+This command prints a list of running instances on the host which
+belong to the user who currently ran 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-pause.txt b/tools/kvm/Documentation/kvm-pause.txt
new file mode 100644
index 0000000..1ea2a23
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-pause.txt
@@ -0,0 +1,16 @@
+lkvm-pause(1)
+================
+
+NAME
+----
+lkvm-pause - Pause the virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'lkvm pause [instance]'
+
+DESCRIPTION
+-----------
+The command pauses a virtual machine.
+For a list of running instances see 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-resume.txt b/tools/kvm/Documentation/kvm-resume.txt
new file mode 100644
index 0000000..a36c4df
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-resume.txt
@@ -0,0 +1,16 @@
+lkvm-resume(1)
+================
+
+NAME
+----
+lkvm-resume - Resume the virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'lkvm resume [instance]'
+
+DESCRIPTION
+-----------
+The command resumes a virtual machine.
+For a list of running instances see 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-run.txt b/tools/kvm/Documentation/kvm-run.txt
new file mode 100644
index 0000000..8ddf470
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-run.txt
@@ -0,0 +1,62 @@
+lkvm-run(1)
+================
+
+NAME
+----
+lkvm-run - Start the virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'lkvm run' [-k <kernel image> | --kernel <kernel image>]
+
+DESCRIPTION
+-----------
+The command starts a virtual machine.
+
+OPTIONS
+-------
+-m::
+--mem=::
+ Virtual machine memory size in MiB.
+
+-p::
+--params::
+ Additional kernel command line arguments.
+
+-r::
+--initrd=::
+ Initial RAM disk image.
+
+-k::
+--kernel=::
+ The virtual machine kernel.
+
+--dev=::
+ KVM device file.
+
+-i::
+--image=::
+ A disk image file.
+
+-s::
+--single-step::
+ Enable single stepping.
+
+-g::
+--ioport-debug::
+ Enable ioport debugging.
+
+-c::
+--enable-virtio-console::
+ Enable the virtual IO console.
+
+--cpus::
+ The number of virtual CPUs to run.
+
+--debug::
+ Enable debug messages.
+
+SEE ALSO
+--------
+linkkvm:
diff --git a/tools/kvm/Documentation/kvm-sandbox.txt b/tools/kvm/Documentation/kvm-sandbox.txt
new file mode 100644
index 0000000..2d7f558
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-sandbox.txt
@@ -0,0 +1,16 @@
+lkvm-sandbox(1)
+================
+
+NAME
+----
+lkvm-sandbox - Run a command in a sandboxed guest
+
+SYNOPSIS
+--------
+[verse]
+'lkvm sandbox ['lkvm run' arguments] -- [sandboxed command]'
+
+DESCRIPTION
+-----------
+The sandboxed command will run in a guest as part of its init
+command.
diff --git a/tools/kvm/Documentation/kvm-setup.txt b/tools/kvm/Documentation/kvm-setup.txt
new file mode 100644
index 0000000..4b6e331
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-setup.txt
@@ -0,0 +1,15 @@
+lkvm-setup(1)
+================
+
+NAME
+----
+lkvm-setup - Setup a new virtual machine
+
+SYNOPSIS
+--------
+[verse]
+'lkvm setup <name>'
+
+DESCRIPTION
+-----------
+The command sets up a virtual machine.
diff --git a/tools/kvm/Documentation/kvm-stat.txt b/tools/kvm/Documentation/kvm-stat.txt
new file mode 100644
index 0000000..101ce7a
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-stat.txt
@@ -0,0 +1,19 @@
+lkvm-stat(1)
+================
+
+NAME
+----
+lkvm-stat - Print statistics about a running instance
+
+SYNOPSIS
+--------
+[verse]
+'lkvm [command] [-n instance] [-p instance pid] [--all]'
+
+DESCRIPTION
+-----------
+The command prints statistics about a running instance.
+For a list of running instances see 'lkvm list'.
+
+Commands:
+ --memory, -m Display memory statistics
diff --git a/tools/kvm/Documentation/kvm-stop.txt b/tools/kvm/Documentation/kvm-stop.txt
new file mode 100644
index 0000000..6e4bc83
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-stop.txt
@@ -0,0 +1,16 @@
+lkvm-stop(1)
+================
+
+NAME
+----
+lkvm-stop - Stop a running instance
+
+SYNOPSIS
+--------
+[verse]
+'lkvm stop [instance]'
+
+DESCRIPTION
+-----------
+The command stops a running instance.
+For a list of running instances see 'lkvm list'.
diff --git a/tools/kvm/Documentation/kvm-version.txt b/tools/kvm/Documentation/kvm-version.txt
new file mode 100644
index 0000000..41003d2
--- /dev/null
+++ b/tools/kvm/Documentation/kvm-version.txt
@@ -0,0 +1,21 @@
+lkvm-version(1)
+================
+
+NAME
+----
+lkvm-version - Print the version of the kernel tree kvm tools
+was built on.
+
+SYNOPSIS
+--------
+[verse]
+'lkvm version'
+
+DESCRIPTION
+-----------
+The command prints the version of the kernel that was used to build
+kvm tools.
+
+Note that the version is not the version of the kernel which is currently
+running on the host, but is the version of the kernel tree from which kvm
+tools was built.
diff --git a/tools/kvm/Documentation/virtio-console.txt b/tools/kvm/Documentation/virtio-console.txt
new file mode 100644
index 0000000..4a58d56
--- /dev/null
+++ b/tools/kvm/Documentation/virtio-console.txt
@@ -0,0 +1,41 @@
+General
+--------
+
+virtio-console as the name implies is a console over virtio transport. Here is
+a simple head to head comparison of the virtio-console vs regular 8250 console:
+
+8250 serial console:
+
+ - Requires CONFIG_SERIAL_8250=y and CONFIG_SERIAL_8250_CONSOLE=y kernel configs,
+which are enabled almost everywhere.
+ - Doesn't require guest-side changes.
+ - Compatible with older guests.
+
+virtio-console:
+
+ - Requires CONFIG_VIRTIO_CONSOLE=y (along with all other virtio dependencies),
+which got enabled only in recent kernels (but not all of them).
+ - Much faster.
+ - Consumes less processing resources.
+ - Requires guest-side changes.
+
+Enabling virtio-console
+------------------------
+
+First, make sure guest kernel is built with CONFIG_VIRTIO_CONSOLE=y. Once this
+is done, the following has to be done inside guest image:
+
+ - Add the following line to /etc/inittab:
+ 'hvc0:2345:respawn:/sbin/agetty -L 9600 hvc0'
+ - Add 'hvc0' to /etc/securetty (so you could actually log on)
+ - Start the guest with '--console virtio'
+
+Common errors
+--------------
+
+Q: I don't see anything on the screen!
+A: Make sure CONFIG_VIRTIO_CONSOLE=y is enabled in the *guest* kernel, also
+make sure you've updated /etc/inittab
+
+Q: It won't accept my username/password, but I enter them correctly!
+A: You didn't add 'hvc0' to /etc/securetty
diff --git a/tools/kvm/Makefile b/tools/kvm/Makefile
new file mode 100644
index 0000000..fba60f1
--- /dev/null
+++ b/tools/kvm/Makefile
@@ -0,0 +1,520 @@
+#
+# Define WERROR=0 to disable -Werror.
+#
+
+ifeq ($(strip $(V)),)
+ E = @echo
+ Q = @
+else
+ E = @\#
+ Q =
+endif
+ifneq ($(I), )
+ KINCL_PATH=$(I)
+else
+ KINCL_PATH=../..
+endif
+export E Q KINCL_PATH
+
+include config/utilities.mak
+include config/feature-tests.mak
+
+CC := $(CROSS_COMPILE)gcc
+LD := $(CROSS_COMPILE)ld
+
+FIND := find
+CSCOPE := cscope
+TAGS := ctags
+INSTALL := install
+
+prefix = $(HOME)
+bindir_relative = bin
+bindir = $(prefix)/$(bindir_relative)
+
+DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
+bindir_SQ = $(subst ','\'',$(bindir))
+
+PROGRAM := lkvm
+PROGRAM_ALIAS := vm
+
+GUEST_INIT := guest/init
+
+OBJS += builtin-balloon.o
+OBJS += builtin-debug.o
+OBJS += builtin-help.o
+OBJS += builtin-list.o
+OBJS += builtin-stat.o
+OBJS += builtin-pause.o
+OBJS += builtin-resume.o
+OBJS += builtin-run.o
+OBJS += builtin-setup.o
+OBJS += builtin-stop.o
+OBJS += builtin-version.o
+OBJS += devices.o
+OBJS += disk/core.o
+OBJS += framebuffer.o
+OBJS += guest_compat.o
+OBJS += hw/rtc.o
+OBJS += hw/serial.o
+OBJS += ioport.o
+OBJS += irq.o
+OBJS += kvm-cpu.o
+OBJS += kvm.o
+OBJS += main.o
+OBJS += mmio.o
+OBJS += pci.o
+OBJS += term.o
+OBJS += virtio/blk.o
+OBJS += virtio/scsi.o
+OBJS += virtio/console.o
+OBJS += virtio/core.o
+OBJS += virtio/net.o
+OBJS += virtio/rng.o
+OBJS += virtio/balloon.o
+OBJS += virtio/pci.o
+OBJS += disk/blk.o
+OBJS += disk/qcow.o
+OBJS += disk/raw.o
+OBJS += ioeventfd.o
+OBJS += net/uip/core.o
+OBJS += net/uip/arp.o
+OBJS += net/uip/icmp.o
+OBJS += net/uip/ipv4.o
+OBJS += net/uip/tcp.o
+OBJS += net/uip/udp.o
+OBJS += net/uip/buf.o
+OBJS += net/uip/csum.o
+OBJS += net/uip/dhcp.o
+OBJS += kvm-cmd.o
+OBJS += util/init.o
+OBJS += util/iovec.o
+OBJS += util/rbtree.o
+OBJS += util/threadpool.o
+OBJS += util/parse-options.o
+OBJS += util/rbtree-interval.o
+OBJS += util/strbuf.o
+OBJS += util/read-write.o
+OBJS += util/util.o
+OBJS += virtio/9p.o
+OBJS += virtio/9p-pdu.o
+OBJS += hw/vesa.o
+OBJS += hw/pci-shmem.o
+OBJS += kvm-ipc.o
+OBJS += builtin-sandbox.o
+OBJS += virtio/mmio.o
+OBJS += hw/i8042.o
+
+# Translate uname -m into ARCH string
+ARCH ?= $(shell uname -m | sed -e s/i.86/i386/ -e s/ppc.*/powerpc/ \
+ -e s/armv7.*/arm/ -e s/aarch64.*/arm64/ -e s/mips64/mips/)
+
+ifeq ($(ARCH),i386)
+ ARCH := x86
+ DEFINES += -DCONFIG_X86_32
+endif
+ifeq ($(ARCH),x86_64)
+ ARCH := x86
+ DEFINES += -DCONFIG_X86_64
+endif
+
+### Arch-specific stuff
+
+#x86
+ifeq ($(ARCH),x86)
+ DEFINES += -DCONFIG_X86
+ OBJS += x86/boot.o
+ OBJS += x86/cpuid.o
+ OBJS += x86/interrupt.o
+ OBJS += x86/ioport.o
+ OBJS += x86/irq.o
+ OBJS += x86/kvm.o
+ OBJS += x86/kvm-cpu.o
+ OBJS += x86/mptable.o
+# Exclude BIOS object files from header dependencies.
+ OTHEROBJS += x86/bios.o
+ OTHEROBJS += x86/bios/bios-rom.o
+ ARCH_INCLUDE := x86/include
+endif
+# POWER/ppc: Actually only support ppc64 currently.
+ifeq ($(ARCH), powerpc)
+ DEFINES += -DCONFIG_PPC
+ OBJS += powerpc/boot.o
+ OBJS += powerpc/ioport.o
+ OBJS += powerpc/irq.o
+ OBJS += powerpc/kvm.o
+ OBJS += powerpc/cpu_info.o
+ OBJS += powerpc/kvm-cpu.o
+ OBJS += powerpc/spapr_hcall.o
+ OBJS += powerpc/spapr_rtas.o
+ OBJS += powerpc/spapr_hvcons.o
+ OBJS += powerpc/spapr_pci.o
+ OBJS += powerpc/xics.o
+ ARCH_INCLUDE := powerpc/include
+ CFLAGS += -m64
+ LDFLAGS += -m elf64ppc
+
+ ARCH_WANT_LIBFDT := y
+endif
+
+# ARM
+OBJS_ARM_COMMON := arm/fdt.o arm/gic.o arm/ioport.o arm/irq.o \
+ arm/kvm.o arm/kvm-cpu.o arm/pci.o arm/timer.o
+HDRS_ARM_COMMON := arm/include
+ifeq ($(ARCH), arm)
+ DEFINES += -DCONFIG_ARM
+ OBJS += $(OBJS_ARM_COMMON)
+ OBJS += arm/aarch32/arm-cpu.o
+ OBJS += arm/aarch32/kvm-cpu.o
+ ARCH_INCLUDE := $(HDRS_ARM_COMMON)
+ ARCH_INCLUDE += -Iarm/aarch32/include
+ CFLAGS += -march=armv7-a
+
+ ARCH_WANT_LIBFDT := y
+endif
+
+# ARM64
+ifeq ($(ARCH), arm64)
+ DEFINES += -DCONFIG_ARM64
+ OBJS += $(OBJS_ARM_COMMON)
+ OBJS += arm/aarch64/arm-cpu.o
+ OBJS += arm/aarch64/kvm-cpu.o
+ ARCH_INCLUDE := $(HDRS_ARM_COMMON)
+ ARCH_INCLUDE += -Iarm/aarch64/include
+
+ ARCH_WANT_LIBFDT := y
+endif
+
+ifeq ($(ARCH),mips)
+ DEFINES += -DCONFIG_MIPS
+ ARCH_INCLUDE := mips/include
+ OBJS += mips/kvm.o
+ OBJS += mips/kvm-cpu.o
+ OBJS += mips/irq.o
+endif
+###
+
+ifeq (,$(ARCH_INCLUDE))
+ UNSUPP_ERR = @echo "This architecture is not supported in kvmtool." && exit 1
+else
+ UNSUPP_ERR =
+endif
+
+###
+
+# libfdt support
+
+LIBFDT_SRC = fdt.o fdt_ro.o fdt_wip.o fdt_sw.o fdt_rw.o fdt_strerror.o
+LIBFDT_OBJS = $(patsubst %,../../scripts/dtc/libfdt/%,$(LIBFDT_SRC))
+
+ifeq (y,$(ARCH_WANT_LIBFDT))
+ DEFINES += -DCONFIG_HAS_LIBFDT
+ OTHEROBJS += $(LIBFDT_OBJS)
+endif
+
+###
+
+# Detect optional features.
+# On a given system, some libs may link statically, some may not; so, check
+# both and only build those that link!
+
+FLAGS_BFD := $(CFLAGS) -lbfd
+ifeq ($(call try-cc,$(SOURCE_BFD),$(FLAGS_BFD) -static),y)
+ CFLAGS_STATOPT += -DCONFIG_HAS_BFD
+ OBJS_STATOPT += symbol.o
+ LIBS_STATOPT += -lbfd
+endif
+
+FLAGS_GTK3 := $(CFLAGS) $(shell pkg-config --libs --cflags gtk+-3.0 2>/dev/null)
+ifeq ($(call try-cc,$(SOURCE_GTK3),$(FLAGS_GTK3)),y)
+ OBJS_DYNOPT += ui/gtk3.o
+ CFLAGS_DYNOPT += -DCONFIG_HAS_GTK3 $(shell pkg-config --cflags gtk+-3.0 2>/dev/null)
+ LIBS_DYNOPT += $(shell pkg-config --libs gtk+-3.0 2>/dev/null)
+else
+ $(warning warning GTK3 not found, disabling GTK3 support. Please install gtk3-devel or libgtk3.0-dev)
+endif
+
+FLAGS_VNCSERVER := $(CFLAGS) -lvncserver
+ifeq ($(call try-cc,$(SOURCE_VNCSERVER),$(FLAGS_VNCSERVER)),y)
+ OBJS_DYNOPT += ui/vnc.o
+ CFLAGS_DYNOPT += -DCONFIG_HAS_VNCSERVER
+ LIBS_DYNOPT += -lvncserver
+endif
+ifeq ($(call try-cc,$(SOURCE_VNCSERVER),$(FLAGS_VNCSERVER) -static),y)
+ OBJS_STATOPT += ui/vnc.o
+ CFLAGS_STATOPT += -DCONFIG_HAS_VNCSERVER
+ LIBS_STATOPT += -lvncserver
+endif
+
+FLAGS_SDL := $(CFLAGS) -lSDL
+ifeq ($(call try-cc,$(SOURCE_SDL),$(FLAGS_SDL)),y)
+ OBJS_DYNOPT += ui/sdl.o
+ CFLAGS_DYNOPT += -DCONFIG_HAS_SDL
+ LIBS_DYNOPT += -lSDL
+endif
+ifeq ($(call try-cc,$(SOURCE_SDL),$(FLAGS_SDL) -static), y)
+ OBJS_STATOPT += ui/sdl.o
+ CFLAGS_STATOPT += -DCONFIG_HAS_SDL
+ LIBS_STATOPT += -lSDL
+endif
+
+FLAGS_ZLIB := $(CFLAGS) -lz
+ifeq ($(call try-cc,$(SOURCE_ZLIB),$(FLAGS_ZLIB)),y)
+ CFLAGS_DYNOPT += -DCONFIG_HAS_ZLIB
+ LIBS_DYNOPT += -lz
+endif
+ifeq ($(call try-cc,$(SOURCE_ZLIB),$(FLAGS_ZLIB) -static),y)
+ CFLAGS_STATOPT += -DCONFIG_HAS_ZLIB
+ LIBS_STATOPT += -lz
+endif
+
+FLAGS_AIO := $(CFLAGS) -laio
+ifeq ($(call try-cc,$(SOURCE_AIO),$(FLAGS_AIO)),y)
+ CFLAGS_DYNOPT += -DCONFIG_HAS_AIO
+ LIBS_DYNOPT += -laio
+endif
+ifeq ($(call try-cc,$(SOURCE_AIO),$(FLAGS_AIO) -static),y)
+ CFLAGS_STATOPT += -DCONFIG_HAS_AIO
+ LIBS_STATOPT += -laio
+endif
+
+ifeq ($(LTO),1)
+ FLAGS_LTO := -flto
+ ifeq ($(call try-cc,$(SOURCE_HELLO),$(FLAGS_LTO)),y)
+ CFLAGS += $(FLAGS_LTO)
+ endif
+endif
+
+ifneq ($(call try-build,$(SOURCE_STATIC),-static,),y)
+$(error No static libc found. Please install glibc-static package.)
+endif
+###
+
+LIBS += -lrt
+LIBS += -lpthread
+LIBS += -lutil
+
+
+DEPS := $(patsubst %.o,%.d,$(OBJS))
+
+DEFINES += -D_FILE_OFFSET_BITS=64
+DEFINES += -D_GNU_SOURCE
+DEFINES += -DKVMTOOLS_VERSION='"$(KVMTOOLS_VERSION)"'
+DEFINES += -DBUILD_ARCH='"$(ARCH)"'
+
+KVM_INCLUDE := include
+CFLAGS += $(CPPFLAGS) $(DEFINES) -I$(KVM_INCLUDE) -I$(ARCH_INCLUDE) -I$(KINCL_PATH)/include/uapi -I$(KINCL_PATH)/include -I$(KINCL_PATH)/arch/$(ARCH)/include/uapi -I$(KINCL_PATH)/arch/$(ARCH)/include/ -I$(KINCL_PATH)/scripts/dtc/libfdt -O2 -fno-strict-aliasing -g
+
+WARNINGS += -Wall
+WARNINGS += -Wformat=2
+WARNINGS += -Winit-self
+WARNINGS += -Wmissing-declarations
+WARNINGS += -Wmissing-prototypes
+WARNINGS += -Wnested-externs
+WARNINGS += -Wno-system-headers
+WARNINGS += -Wold-style-definition
+WARNINGS += -Wredundant-decls
+WARNINGS += -Wsign-compare
+WARNINGS += -Wstrict-prototypes
+WARNINGS += -Wundef
+WARNINGS += -Wvolatile-register-var
+WARNINGS += -Wwrite-strings
+
+CFLAGS += $(WARNINGS)
+
+# Some targets may use 'external' sources that don't build totally cleanly.
+CFLAGS_EASYGOING := $(CFLAGS)
+
+ifneq ($(WERROR),0)
+ CFLAGS += -Werror
+endif
+
+all: arch_support_check $(PROGRAM) $(PROGRAM_ALIAS) $(GUEST_INIT)
+
+arch_support_check:
+ $(UNSUPP_ERR)
+
+# When building -static all objects are built with appropriate flags, which
+# may differ between static & dynamic .o. The objects are separated into
+# .o and .static.o. See the %.o: %.c rules below.
+#
+# $(OTHEROBJS) are things that do not get substituted like this.
+#
+STATIC_OBJS = $(patsubst %.o,%.static.o,$(OBJS) $(OBJS_STATOPT))
+GUEST_OBJS = guest/guest_init.o
+
+$(PROGRAM)-static: $(DEPS) $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_INIT)
+ $(E) " LINK " $@
+ $(Q) $(CC) -static $(CFLAGS) $(STATIC_OBJS) $(OTHEROBJS) $(GUEST_OBJS) $(LIBS) $(LIBS_STATOPT) -o $@
+
+$(PROGRAM): $(DEPS) $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_INIT)
+ $(E) " LINK " $@
+ $(Q) $(CC) $(CFLAGS) $(OBJS) $(OBJS_DYNOPT) $(OTHEROBJS) $(GUEST_OBJS) $(LIBS) $(LIBS_DYNOPT) -o $@
+
+$(PROGRAM_ALIAS): $(PROGRAM)
+ $(E) " LN " $@
+ $(Q) ln -f $(PROGRAM) $@
+
+$(GUEST_INIT): guest/init.c
+ $(E) " LINK " $@
+ $(Q) $(CC) -static guest/init.c -o $@
+ $(Q) $(LD) $(LDFLAGS) -r -b binary -o guest/guest_init.o $(GUEST_INIT)
+
+$(DEPS):
+
+util/rbtree.d: ../../lib/rbtree.c
+ $(Q) $(CC) -M -MT util/rbtree.o $(CFLAGS) $< -o $@
+
+%.d: %.c
+ $(Q) $(CC) -M -MT $(patsubst %.d,%.o,$@) $(CFLAGS) $< -o $@
+
+%.s: %.c
+ $(Q) $(CC) -o $@ -S $(CFLAGS) -fverbose-asm $<
+
+# The header file common-cmds.h is needed for compilation of builtin-help.c.
+builtin-help.d: $(KVM_INCLUDE)/common-cmds.h
+
+$(OBJS):
+
+# This rule relaxes the -Werror on libfdt, since for now it still has
+# a bunch of warnings. :(
+../../scripts/dtc/libfdt/%.o: ../../scripts/dtc/libfdt/%.c
+ifeq ($(C),1)
+ $(E) " CHECK " $@
+ $(Q) $(CHECK) -c $(CFLAGS_EASYGOING) $< -o $@
+endif
+ $(E) " CC " $@
+ $(Q) $(CC) -c $(CFLAGS_EASYGOING) $< -o $@
+
+util/rbtree.static.o util/rbtree.o: ../../lib/rbtree.c
+ifeq ($(C),1)
+ $(E) " CHECK " $@
+ $(Q) $(CHECK) -c $(CFLAGS) $< -o $@
+endif
+ $(E) " CC " $@
+ $(Q) $(CC) -c $(CFLAGS) $< -o $@
+
+%.static.o: %.c
+ifeq ($(C),1)
+ $(E) " CHECK " $@
+ $(Q) $(CHECK) -c $(CFLAGS) $(CFLAGS_STATOPT) $< -o $@
+endif
+ $(E) " CC " $@
+ $(Q) $(CC) -c $(CFLAGS) $(CFLAGS_STATOPT) $< -o $@
+
+%.o: %.c
+ifeq ($(C),1)
+ $(E) " CHECK " $@
+ $(Q) $(CHECK) -c $(CFLAGS) $(CFLAGS_DYNOPT) $< -o $@
+endif
+ $(E) " CC " $@
+ $(Q) $(CC) -c $(CFLAGS) $(CFLAGS_DYNOPT) $< -o $@
+
+
+$(KVM_INCLUDE)/common-cmds.h: util/generate-cmdlist.sh command-list.txt
+
+$(KVM_INCLUDE)/common-cmds.h: $(wildcard Documentation/kvm-*.txt)
+ $(E) " GEN " $@
+ $(Q) util/generate-cmdlist.sh > $@+ && mv $@+ $@
+
+#
+# BIOS assembly weirdness
+#
+BIOS_CFLAGS += -m32
+BIOS_CFLAGS += -march=i386
+BIOS_CFLAGS += -mregparm=3
+
+BIOS_CFLAGS += -fno-stack-protector
+BIOS_CFLAGS += -I../../arch/$(ARCH)
+
+x86/bios.o: x86/bios/bios.bin x86/bios/bios-rom.h
+
+x86/bios/bios.bin.elf: x86/bios/entry.S x86/bios/e820.c x86/bios/int10.c x86/bios/int15.c x86/bios/rom.ld.S
+ $(E) " CC x86/bios/memcpy.o"
+ $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/memcpy.c -o x86/bios/memcpy.o
+ $(E) " CC x86/bios/e820.o"
+ $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/e820.c -o x86/bios/e820.o
+ $(E) " CC x86/bios/int10.o"
+ $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/int10.c -o x86/bios/int10.o
+ $(E) " CC x86/bios/int15.o"
+ $(Q) $(CC) -include code16gcc.h $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/int15.c -o x86/bios/int15.o
+ $(E) " CC x86/bios/entry.o"
+ $(Q) $(CC) $(CFLAGS) $(BIOS_CFLAGS) -c -s x86/bios/entry.S -o x86/bios/entry.o
+ $(E) " LD " $@
+ $(Q) $(LD) -T x86/bios/rom.ld.S -o x86/bios/bios.bin.elf x86/bios/memcpy.o x86/bios/entry.o x86/bios/e820.o x86/bios/int10.o x86/bios/int15.o
+
+x86/bios/bios.bin: x86/bios/bios.bin.elf
+ $(E) " OBJCOPY " $@
+ $(Q) objcopy -O binary -j .text x86/bios/bios.bin.elf x86/bios/bios.bin
+
+x86/bios/bios-rom.o: x86/bios/bios-rom.S x86/bios/bios.bin x86/bios/bios-rom.h
+ $(E) " CC " $@
+ $(Q) $(CC) -c $(CFLAGS) x86/bios/bios-rom.S -o x86/bios/bios-rom.o
+
+x86/bios/bios-rom.h: x86/bios/bios.bin.elf
+ $(E) " NM " $@
+ $(Q) cd x86/bios && sh gen-offsets.sh > bios-rom.h && cd ..
+
+check: all
+ $(MAKE) -C tests
+ ./$(PROGRAM) run tests/pit/tick.bin
+ ./$(PROGRAM) run -d tests/boot/boot_test.iso -p "init=init"
+.PHONY: check
+
+install: all
+ $(E) " INSTALL"
+ $(Q) $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
+ $(Q) $(INSTALL) $(PROGRAM) '$(DESTDIR_SQ)$(bindir_SQ)'
+.PHONY: install
+
+clean:
+ $(E) " CLEAN"
+ $(Q) rm -f x86/bios/*.bin
+ $(Q) rm -f x86/bios/*.elf
+ $(Q) rm -f x86/bios/*.o
+ $(Q) rm -f x86/bios/bios-rom.h
+ $(Q) rm -f tests/boot/boot_test.iso
+ $(Q) rm -rf tests/boot/rootfs/
+ $(Q) rm -f $(DEPS) $(OBJS) $(OTHEROBJS) $(OBJS_DYNOPT) $(STATIC_OBJS) $(PROGRAM) $(PROGRAM_ALIAS) $(PROGRAM)-static $(GUEST_INIT) $(GUEST_OBJS)
+ $(Q) rm -f cscope.*
+ $(Q) rm -f tags
+ $(Q) rm -f TAGS
+ $(Q) rm -f $(KVM_INCLUDE)/common-cmds.h
+ $(Q) rm -f KVMTOOLS-VERSION-FILE
+.PHONY: clean
+
+KVM_DEV ?= /dev/kvm
+
+$(KVM_DEV):
+ $(E) " MKNOD " $@
+ $(Q) mknod $@ char 10 232
+
+devices: $(KVM_DEV)
+.PHONY: devices
+
+TAGS:
+ $(E) " GEN" $@
+ $(Q) $(RM) -f TAGS
+ $(Q) $(FIND) . -name '*.[hcS]' -print | xargs etags -a
+.PHONY: TAGS
+
+tags:
+ $(E) " GEN" $@
+ $(Q) $(RM) -f tags
+ $(Q) $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
+.PHONY: tags
+
+cscope:
+ $(E) " GEN" $@
+ $(Q) $(FIND) . -name '*.[hcS]' -print > cscope.files
+ $(Q) $(CSCOPE) -bkqu
+.PHONY: cscope
+
+#
+# Escape redundant work on cleaning up
+ifneq ($(MAKECMDGOALS),clean)
+-include $(DEPS)
+
+KVMTOOLS-VERSION-FILE:
+ @$(SHELL_PATH) util/KVMTOOLS-VERSION-GEN $(OUTPUT)
+-include $(OUTPUT)KVMTOOLS-VERSION-FILE
+endif
diff --git a/tools/kvm/README b/tools/kvm/README
new file mode 100644
index 0000000..97747d6
--- /dev/null
+++ b/tools/kvm/README
@@ -0,0 +1,112 @@
+Native Linux KVM tool
+=====================
+The goal of this tool is to provide a clean, from-scratch, lightweight
+KVM host tool implementation that can boot Linux guest images (just a
+hobby, won't be big and professional like QEMU) with no BIOS
+dependencies and with only the minimal amount of legacy device
+emulation.
+
+It's great as a learning tool if you want to get your feet wet in
+virtualization land: it's only 5 KLOC of clean C code that can already
+boot a guest Linux image.
+
+Right now it can boot a Linux image and provide you output via a serial
+console, over the host terminal, i.e. you can use it to boot a guest
+Linux image in a terminal or over ssh and log into the guest without
+much guest or host side setup work needed.
+
+1. To try out the tool, clone the git repository:
+
+ git clone git://github.com/penberg/linux-kvm.git
+
+or alternatively, if you already have a kernel source tree:
+
+ git remote add kvm-tool git://github.com/penberg/linux-kvm.git
+ git remote update
+ git checkout kvm-tool/master -b kvm-tool
+
+2. Compile the tool:
+
+ cd tools/kvm && make
+
+3. Download a raw userspace image:
+
+ wget http://wiki.qemu.org/download/linux-0.2.img.bz2 && bunzip2
+linux-0.2.img.bz2
+
+4. The guest kernel has to be built with the following configuration:
+
+ - For the default console output:
+ CONFIG_SERIAL_8250=y
+ CONFIG_SERIAL_8250_CONSOLE=y
+
+ - For running 32bit images on 64bit hosts:
+ CONFIG_IA32_EMULATION=y
+
+ - Proper FS options according to image FS (e.g. CONFIG_EXT2_FS, CONFIG_EXT4_FS).
+
+ - For all virtio devices listed below:
+ CONFIG_VIRTIO=y
+ CONFIG_VIRTIO_RING=y
+ CONFIG_VIRTIO_PCI=y
+
+ - For virtio-blk devices (--disk, -d):
+ CONFIG_VIRTIO_BLK=y
+
+ - For virtio-net devices ([--network, -n] virtio):
+ CONFIG_VIRTIO_NET=y
+
+ - For virtio-9p devices (--virtio-9p):
+ CONFIG_NET_9P=y
+ CONFIG_NET_9P_VIRTIO=y
+ CONFIG_9P_FS=y
+
+ - For virtio-balloon device (--balloon):
+ CONFIG_VIRTIO_BALLOON=y
+
+ - For virtio-console device (--console virtio):
+ CONFIG_VIRTIO_CONSOLE=y
+
+ - For virtio-rng device (--rng):
+ CONFIG_HW_RANDOM_VIRTIO=y
+
+ - For vesa device (--sdl or --vnc):
+ CONFIG_FB_VESA=y
+
+
+5. And finally, launch the hypervisor:
+
+	./lkvm run --disk linux-0.2.img \
+		--kernel ../../arch/x86/boot/bzImage
+or
+
+ sudo ./lkvm run --disk linux-0.2.img \
+ --kernel ../../arch/x86/boot/bzImage \
+ --network virtio
+
+The tool has been written by Pekka Enberg, Cyrill Gorcunov, Asias He,
+Sasha Levin and Prasad Joshi. Special thanks to Avi Kivity for his help
+on KVM internals and Ingo Molnar for all-around support and encouragement!
+
+See the following thread for original discussion for motivation of this
+project:
+
+http://thread.gmane.org/gmane.linux.kernel/962051/focus=962620
+
+Build dependencies
+=====================
+For deb based systems:
+32-bit:
+sudo apt-get install build-essential
+64-bit:
+sudo apt-get install build-essential libc6-dev-i386
+
+For rpm based systems:
+32-bit:
+yum install glibc-devel
+64-bit:
+yum install glibc-devel glibc-static
+
+On 64-bit Arch Linux make sure the multilib repository is enabled in your
+/etc/pacman.conf and run
+pacman -Sy lib32-glibc
diff --git a/tools/kvm/arm/aarch32/arm-cpu.c b/tools/kvm/arm/aarch32/arm-cpu.c
new file mode 100644
index 0000000..71b98fe
--- /dev/null
+++ b/tools/kvm/arm/aarch32/arm-cpu.c
@@ -0,0 +1,42 @@
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+
+#include "arm-common/gic.h"
+#include "arm-common/timer.h"
+
+#include <linux/byteorder.h>
+#include <linux/types.h>
+
+static void generate_fdt_nodes(void *fdt, struct kvm *kvm, u32 gic_phandle)
+{
+ int timer_interrupts[4] = {13, 14, 11, 10};
+
+ gic__generate_fdt_nodes(fdt, gic_phandle);
+ timer__generate_fdt_nodes(fdt, kvm, timer_interrupts);
+}
+
+static int arm_cpu__vcpu_init(struct kvm_cpu *vcpu)
+{
+ vcpu->generate_fdt_nodes = generate_fdt_nodes;
+ return 0;
+}
+
+static struct kvm_arm_target target_cortex_a15 = {
+ .id = KVM_ARM_TARGET_CORTEX_A15,
+ .compatible = "arm,cortex-a15",
+ .init = arm_cpu__vcpu_init,
+};
+
+static struct kvm_arm_target target_cortex_a7 = {
+ .id = KVM_ARM_TARGET_CORTEX_A7,
+ .compatible = "arm,cortex-a7",
+ .init = arm_cpu__vcpu_init,
+};
+
+static int arm_cpu__core_init(struct kvm *kvm)
+{
+ return (kvm_cpu__register_kvm_arm_target(&target_cortex_a15) ||
+ kvm_cpu__register_kvm_arm_target(&target_cortex_a7));
+}
+core_init(arm_cpu__core_init);
diff --git a/tools/kvm/arm/aarch32/include/kvm/barrier.h b/tools/kvm/arm/aarch32/include/kvm/barrier.h
new file mode 100644
index 0000000..94913a9
--- /dev/null
+++ b/tools/kvm/arm/aarch32/include/kvm/barrier.h
@@ -0,0 +1,10 @@
+#ifndef KVM__KVM_BARRIER_H
+#define KVM__KVM_BARRIER_H
+
+#define dmb() asm volatile ("dmb" : : : "memory")
+
+#define mb() dmb()
+#define rmb() dmb()
+#define wmb() dmb()
+
+#endif /* KVM__KVM_BARRIER_H */
diff --git a/tools/kvm/arm/aarch32/include/kvm/kvm-arch.h b/tools/kvm/arm/aarch32/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..1632e3c
--- /dev/null
+++ b/tools/kvm/arm/aarch32/include/kvm/kvm-arch.h
@@ -0,0 +1,13 @@
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#define ARM_GIC_DIST_SIZE 0x1000
+#define ARM_GIC_CPUI_SIZE 0x2000
+
+#define ARM_KERN_OFFSET(...) 0x8000
+
+#define ARM_MAX_MEMORY(...) ARM_LOMAP_MAX_MEMORY
+
+#include "arm-common/kvm-arch.h"
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/tools/kvm/arm/aarch32/include/kvm/kvm-config-arch.h b/tools/kvm/arm/aarch32/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..acf0d23
--- /dev/null
+++ b/tools/kvm/arm/aarch32/include/kvm/kvm-config-arch.h
@@ -0,0 +1,8 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+#define ARM_OPT_ARCH_RUN(...)
+
+#include "arm-common/kvm-config-arch.h"
+
+#endif /* KVM__KVM_CONFIG_ARCH_H */
diff --git a/tools/kvm/arm/aarch32/include/kvm/kvm-cpu-arch.h b/tools/kvm/arm/aarch32/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..d28ea67
--- /dev/null
+++ b/tools/kvm/arm/aarch32/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,16 @@
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+#include "kvm/kvm.h"
+
+#include "arm-common/kvm-cpu-arch.h"
+
+#define ARM_VCPU_FEATURE_FLAGS(kvm, cpuid) { \
+ [0] = (!!(cpuid) << KVM_ARM_VCPU_POWER_OFF), \
+}
+
+#define ARM_MPIDR_HWID_BITMASK 0xFFFFFF
+#define ARM_CPU_ID 0, 0, 0
+#define ARM_CPU_ID_MPIDR 5
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/tools/kvm/arm/aarch32/kvm-cpu.c b/tools/kvm/arm/aarch32/kvm-cpu.c
new file mode 100644
index 0000000..464b473
--- /dev/null
+++ b/tools/kvm/arm/aarch32/kvm-cpu.c
@@ -0,0 +1,145 @@
+#include "kvm/kvm-cpu.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio.h"
+
+#include <asm/ptrace.h>
+
+#define ARM_CORE_REG(x) (KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | \
+ KVM_REG_ARM_CORE_REG(x))
+
+#define ARM_CP15_REG_SHIFT_MASK(x,n) \
+ (((x) << KVM_REG_ARM_ ## n ## _SHIFT) & KVM_REG_ARM_ ## n ## _MASK)
+
+#define __ARM_CP15_REG(op1,crn,crm,op2) \
+ (KVM_REG_ARM | KVM_REG_SIZE_U32 | \
+ (15 << KVM_REG_ARM_COPROC_SHIFT) | \
+ ARM_CP15_REG_SHIFT_MASK(op1, OPC1) | \
+ ARM_CP15_REG_SHIFT_MASK(crn, 32_CRN) | \
+ ARM_CP15_REG_SHIFT_MASK(crm, CRM) | \
+ ARM_CP15_REG_SHIFT_MASK(op2, 32_OPC2))
+
+#define ARM_CP15_REG(...) __ARM_CP15_REG(__VA_ARGS__)
+
+unsigned long kvm_cpu__get_vcpu_mpidr(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u32 mpidr;
+
+	reg.id = ARM_CP15_REG(ARM_CPU_ID, ARM_CPU_ID_MPIDR);
+	reg.addr = (u64)(unsigned long)&mpidr;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (get_mpidr vcpu%ld)", vcpu->cpu_id);
+
+	return mpidr;
+}
+
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_one_reg reg;
+	u32 data;
+
+	/* Who said future-proofing was a good idea? */
+	reg.addr = (u64)(unsigned long)&data;
+
+	/* cpsr = IRQs/FIQs masked */
+	data = PSR_I_BIT | PSR_F_BIT | SVC_MODE;
+	reg.id = ARM_CORE_REG(usr_regs.ARM_cpsr);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (cpsr)");
+
+	/* Secondary cores are stopped awaiting PSCI wakeup */
+	if (vcpu->cpu_id != 0)
+		return;
+
+	/* r0 = 0 */
+	data = 0;
+	reg.id = ARM_CORE_REG(usr_regs.ARM_r0);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r0)");
+
+	/* r1 = machine type (-1) */
+	data = -1;
+	reg.id = ARM_CORE_REG(usr_regs.ARM_r1);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r1)");
+
+	/* r2 = physical address of the device tree blob */
+	data = kvm->arch.dtb_guest_start;
+	reg.id = ARM_CORE_REG(usr_regs.ARM_r2);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r2)");
+
+	/* pc = start of kernel image */
+	data = kvm->arch.kern_guest_start;
+	reg.id = ARM_CORE_REG(usr_regs.ARM_pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (pc)");
+}
+
+int kvm_cpu__get_endianness(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u32 data;
+
+	reg.id = ARM_CORE_REG(usr_regs.ARM_cpsr);
+	reg.addr = (u64)(unsigned long)&data;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (cpsr)");
+
+	return (data & PSR_E_BIT) ? VIRTIO_ENDIAN_BE : VIRTIO_ENDIAN_LE;
+}
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u32 data;
+	int debug_fd = kvm_cpu__get_debug_fd();
+
+	reg.addr = (u64)(unsigned long)&data;
+
+	dprintf(debug_fd, "\n*pc:\n");
+	reg.id = ARM_CORE_REG(usr_regs.ARM_pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (show_code @ PC)");
+
+	kvm__dump_mem(vcpu->kvm, data, 32, debug_fd);
+
+	dprintf(debug_fd, "\n*lr (svc):\n");
+	reg.id = ARM_CORE_REG(svc_regs[1]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (show_code @ LR_svc)");
+	data &= ~0x1;
+
+	kvm__dump_mem(vcpu->kvm, data, 32, debug_fd);
+}
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u32 data;
+	int debug_fd = kvm_cpu__get_debug_fd();
+
+	reg.addr = (u64)(unsigned long)&data;
+	dprintf(debug_fd, "\n Registers:\n");
+
+	reg.id = ARM_CORE_REG(usr_regs.ARM_pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (pc)");
+	dprintf(debug_fd, " PC:    0x%x\n", data);
+
+	reg.id = ARM_CORE_REG(usr_regs.ARM_cpsr);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (cpsr)");
+	dprintf(debug_fd, " CPSR:  0x%x\n", data);
+
+	reg.id = ARM_CORE_REG(svc_regs[0]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (SP_svc)");
+	dprintf(debug_fd, " SP_svc:  0x%x\n", data);
+
+	reg.id = ARM_CORE_REG(svc_regs[1]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (LR_svc)");
+	dprintf(debug_fd, " LR_svc:  0x%x\n", data);
+}
diff --git a/tools/kvm/arm/aarch64/arm-cpu.c b/tools/kvm/arm/aarch64/arm-cpu.c
new file mode 100644
index 0000000..ce5ea2f
--- /dev/null
+++ b/tools/kvm/arm/aarch64/arm-cpu.c
@@ -0,0 +1,50 @@
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+
+#include "arm-common/gic.h"
+#include "arm-common/timer.h"
+
+#include <linux/byteorder.h>
+#include <linux/types.h>
+
+static void generate_fdt_nodes(void *fdt, struct kvm *kvm, u32 gic_phandle)
+{
+ int timer_interrupts[4] = {13, 14, 11, 10};
+ gic__generate_fdt_nodes(fdt, gic_phandle);
+ timer__generate_fdt_nodes(fdt, kvm, timer_interrupts);
+}
+
+
+static int arm_cpu__vcpu_init(struct kvm_cpu *vcpu)
+{
+ vcpu->generate_fdt_nodes = generate_fdt_nodes;
+ return 0;
+}
+
+static struct kvm_arm_target target_aem_v8 = {
+ .id = KVM_ARM_TARGET_AEM_V8,
+ .compatible = "arm,arm-v8",
+ .init = arm_cpu__vcpu_init,
+};
+
+static struct kvm_arm_target target_foundation_v8 = {
+ .id = KVM_ARM_TARGET_FOUNDATION_V8,
+ .compatible = "arm,arm-v8",
+ .init = arm_cpu__vcpu_init,
+};
+
+static struct kvm_arm_target target_cortex_a57 = {
+ .id = KVM_ARM_TARGET_CORTEX_A57,
+ .compatible = "arm,cortex-a57",
+ .init = arm_cpu__vcpu_init,
+};
+
+static int arm_cpu__core_init(struct kvm *kvm)
+{
+ return (kvm_cpu__register_kvm_arm_target(&target_aem_v8) ||
+ kvm_cpu__register_kvm_arm_target(&target_foundation_v8) ||
+ kvm_cpu__register_kvm_arm_target(&target_cortex_a57));
+}
+core_init(arm_cpu__core_init);
diff --git a/tools/kvm/arm/aarch64/include/kvm/barrier.h b/tools/kvm/arm/aarch64/include/kvm/barrier.h
new file mode 100644
index 0000000..97ab252
--- /dev/null
+++ b/tools/kvm/arm/aarch64/include/kvm/barrier.h
@@ -0,0 +1,8 @@
+#ifndef KVM__KVM_BARRIER_H
+#define KVM__KVM_BARRIER_H
+
+#define mb() asm volatile ("dmb ish" : : : "memory")
+#define rmb() asm volatile ("dmb ishld" : : : "memory")
+#define wmb() asm volatile ("dmb ishst" : : : "memory")
+
+#endif /* KVM__KVM_BARRIER_H */
diff --git a/tools/kvm/arm/aarch64/include/kvm/kvm-arch.h b/tools/kvm/arm/aarch64/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..2f08a26
--- /dev/null
+++ b/tools/kvm/arm/aarch64/include/kvm/kvm-arch.h
@@ -0,0 +1,17 @@
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#define ARM_GIC_DIST_SIZE 0x10000
+#define ARM_GIC_CPUI_SIZE 0x10000
+
+#define ARM_KERN_OFFSET(kvm) ((kvm)->cfg.arch.aarch32_guest ? \
+ 0x8000 : \
+ 0x80000)
+
+#define ARM_MAX_MEMORY(kvm) ((kvm)->cfg.arch.aarch32_guest ? \
+ ARM_LOMAP_MAX_MEMORY : \
+ ARM_HIMAP_MAX_MEMORY)
+
+#include "arm-common/kvm-arch.h"
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/tools/kvm/arm/aarch64/include/kvm/kvm-config-arch.h b/tools/kvm/arm/aarch64/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..89860ae
--- /dev/null
+++ b/tools/kvm/arm/aarch64/include/kvm/kvm-config-arch.h
@@ -0,0 +1,10 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+#define ARM_OPT_ARCH_RUN(cfg) \
+ OPT_BOOLEAN('\0', "aarch32", &(cfg)->aarch32_guest, \
+ "Run AArch32 guest"),
+
+#include "arm-common/kvm-config-arch.h"
+
+#endif /* KVM__KVM_CONFIG_ARCH_H */
diff --git a/tools/kvm/arm/aarch64/include/kvm/kvm-cpu-arch.h b/tools/kvm/arm/aarch64/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..2ffa7ad
--- /dev/null
+++ b/tools/kvm/arm/aarch64/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,19 @@
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+#include "kvm/kvm.h"
+
+#include "arm-common/kvm-cpu-arch.h"
+
+#define ARM_VCPU_FEATURE_FLAGS(kvm, cpuid) { \
+ [0] = ((!!(cpuid) << KVM_ARM_VCPU_POWER_OFF) | \
+ (!!(kvm)->cfg.arch.aarch32_guest << KVM_ARM_VCPU_EL1_32BIT)) \
+}
+
+#define ARM_MPIDR_HWID_BITMASK 0xFF00FFFFFFUL
+#define ARM_CPU_ID 3, 0, 0, 0
+#define ARM_CPU_ID_MPIDR 5
+#define ARM_CPU_CTRL 3, 0, 1, 0
+#define ARM_CPU_CTRL_SCTLR_EL1 0
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/tools/kvm/arm/aarch64/kvm-cpu.c b/tools/kvm/arm/aarch64/kvm-cpu.c
new file mode 100644
index 0000000..71a2a3a
--- /dev/null
+++ b/tools/kvm/arm/aarch64/kvm-cpu.c
@@ -0,0 +1,230 @@
+#include "kvm/kvm-cpu.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio.h"
+
+#include <asm/ptrace.h>
+
+#define COMPAT_PSR_F_BIT 0x00000040
+#define COMPAT_PSR_I_BIT 0x00000080
+#define COMPAT_PSR_E_BIT 0x00000200
+#define COMPAT_PSR_MODE_SVC 0x00000013
+
+#define SCTLR_EL1_E0E_MASK (1 << 24)
+#define SCTLR_EL1_EE_MASK (1 << 25)
+
+#define ARM64_CORE_REG(x) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
+ KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
+
+#define ARM64_SYS_REG_SHIFT_MASK(x,n) \
+ (((x) << KVM_REG_ARM64_SYSREG_ ## n ## _SHIFT) & \
+ KVM_REG_ARM64_SYSREG_ ## n ## _MASK)
+
+#define __ARM64_SYS_REG(op0,op1,crn,crm,op2) \
+ (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
+ KVM_REG_ARM64_SYSREG | \
+ ARM64_SYS_REG_SHIFT_MASK(op0, OP0) | \
+ ARM64_SYS_REG_SHIFT_MASK(op1, OP1) | \
+ ARM64_SYS_REG_SHIFT_MASK(crn, CRN) | \
+ ARM64_SYS_REG_SHIFT_MASK(crm, CRM) | \
+ ARM64_SYS_REG_SHIFT_MASK(op2, OP2))
+
+#define ARM64_SYS_REG(...) __ARM64_SYS_REG(__VA_ARGS__)
+
+unsigned long kvm_cpu__get_vcpu_mpidr(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u64 mpidr;
+
+	reg.id = ARM64_SYS_REG(ARM_CPU_ID, ARM_CPU_ID_MPIDR);
+	reg.addr = (u64)&mpidr;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (get_mpidr vcpu%ld)", vcpu->cpu_id);
+
+	return mpidr;
+}
+
+static void reset_vcpu_aarch32(struct kvm_cpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_one_reg reg;
+	u64 data;
+
+	reg.addr = (u64)&data;
+
+	/* pstate = all interrupts masked */
+	data = COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT | COMPAT_PSR_MODE_SVC;
+	reg.id = ARM64_CORE_REG(regs.pstate);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (spsr[EL1])");
+
+	/* Secondary cores are stopped awaiting PSCI wakeup */
+	if (vcpu->cpu_id != 0)
+		return;
+
+	/* r0 = 0 */
+	data = 0;
+	reg.id = ARM64_CORE_REG(regs.regs[0]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r0)");
+
+	/* r1 = machine type (-1) */
+	data = -1;
+	reg.id = ARM64_CORE_REG(regs.regs[1]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r1)");
+
+	/* r2 = physical address of the device tree blob */
+	data = kvm->arch.dtb_guest_start;
+	reg.id = ARM64_CORE_REG(regs.regs[2]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (r2)");
+
+	/* pc = start of kernel image */
+	data = kvm->arch.kern_guest_start;
+	reg.id = ARM64_CORE_REG(regs.pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (pc)");
+}
+
+static void reset_vcpu_aarch64(struct kvm_cpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_one_reg reg;
+	u64 data;
+
+	reg.addr = (u64)&data;
+
+	/* pstate = all interrupts masked */
+	data = PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL1h;
+	reg.id = ARM64_CORE_REG(regs.pstate);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (spsr[EL1])");
+
+	/* x1...x3 = 0 */
+	data = 0;
+	reg.id = ARM64_CORE_REG(regs.regs[1]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (x1)");
+
+	reg.id = ARM64_CORE_REG(regs.regs[2]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (x2)");
+
+	reg.id = ARM64_CORE_REG(regs.regs[3]);
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed (x3)");
+
+	/* Secondary cores are stopped awaiting PSCI wakeup */
+	if (vcpu->cpu_id == 0) {
+		/* x0 = physical address of the device tree blob */
+		data = kvm->arch.dtb_guest_start;
+		reg.id = ARM64_CORE_REG(regs.regs[0]);
+		if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+			die_perror("KVM_SET_ONE_REG failed (x0)");
+
+		/* pc = start of kernel image */
+		data = kvm->arch.kern_guest_start;
+		reg.id = ARM64_CORE_REG(regs.pc);
+		if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+			die_perror("KVM_SET_ONE_REG failed (pc)");
+	}
+}
+
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+ if (vcpu->kvm->cfg.arch.aarch32_guest)
+ return reset_vcpu_aarch32(vcpu);
+ else
+ return reset_vcpu_aarch64(vcpu);
+}
+
+int kvm_cpu__get_endianness(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	u64 psr;
+	u64 sctlr;
+
+	/*
+	 * Quoting the definition given by Peter Maydell:
+	 *
+	 * "Endianness of the CPU which does the virtio reset at the
+	 * point when it does that reset"
+	 *
+	 * We first check for an AArch32 guest: its endianness can
+	 * change when using SETEND, which affects the CPSR.E bit.
+	 *
+	 * If we're AArch64, use SCTLR_EL1.E0E if access comes from
+	 * EL0, and SCTLR_EL1.EE if access comes from EL1.
+	 */
+	reg.id = ARM64_CORE_REG(regs.pstate);
+	reg.addr = (u64)&psr;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (spsr[EL1])");
+
+	if (psr & PSR_MODE32_BIT)
+		return (psr & COMPAT_PSR_E_BIT) ? VIRTIO_ENDIAN_BE : VIRTIO_ENDIAN_LE;
+
+	reg.id = ARM64_SYS_REG(ARM_CPU_CTRL, ARM_CPU_CTRL_SCTLR_EL1);
+	reg.addr = (u64)&sctlr;
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (SCTLR_EL1)");
+
+	if ((psr & PSR_MODE_MASK) == PSR_MODE_EL0t)
+		sctlr &= SCTLR_EL1_E0E_MASK;
+	else
+		sctlr &= SCTLR_EL1_EE_MASK;
+	return sctlr ? VIRTIO_ENDIAN_BE : VIRTIO_ENDIAN_LE;
+}
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	unsigned long data;
+	int debug_fd = kvm_cpu__get_debug_fd();
+
+	reg.addr = (u64)&data;
+
+	dprintf(debug_fd, "\n*pc:\n");
+	reg.id = ARM64_CORE_REG(regs.pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (show_code @ PC)");
+
+	kvm__dump_mem(vcpu->kvm, data, 32, debug_fd);
+
+	dprintf(debug_fd, "\n*lr:\n");
+	reg.id = ARM64_CORE_REG(regs.regs[30]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (show_code @ LR)");
+
+	kvm__dump_mem(vcpu->kvm, data, 32, debug_fd);
+}
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+	struct kvm_one_reg reg;
+	unsigned long data;
+	int debug_fd = kvm_cpu__get_debug_fd();
+
+	reg.addr = (u64)&data;
+	dprintf(debug_fd, "\n Registers:\n");
+
+	reg.id = ARM64_CORE_REG(regs.pc);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (pc)");
+	dprintf(debug_fd, " PC:    0x%lx\n", data);
+
+	reg.id = ARM64_CORE_REG(regs.pstate);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (pstate)");
+	dprintf(debug_fd, " PSTATE:  0x%lx\n", data);
+
+	reg.id = ARM64_CORE_REG(sp_el1);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (sp_el1)");
+	dprintf(debug_fd, " SP_EL1:  0x%lx\n", data);
+
+	reg.id = ARM64_CORE_REG(regs.regs[30]);
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
+		die("KVM_GET_ONE_REG failed (lr)");
+	dprintf(debug_fd, " LR:    0x%lx\n", data);
+}
diff --git a/tools/kvm/arm/fdt.c b/tools/kvm/arm/fdt.c
new file mode 100644
index 0000000..186a718
--- /dev/null
+++ b/tools/kvm/arm/fdt.c
@@ -0,0 +1,278 @@
+#include "kvm/devices.h"
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/virtio-mmio.h"
+
+#include "arm-common/gic.h"
+#include "arm-common/pci.h"
+
+#include <stdbool.h>
+
+#include <asm/setup.h>
+#include <linux/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/sizes.h>
+
+static char kern_cmdline[COMMAND_LINE_SIZE];
+
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+ return false;
+}
+
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+ return 0;
+}
+
+static void dump_fdt(const char *dtb_file, void *fdt)
+{
+ int count, fd;
+
+ fd = open(dtb_file, O_CREAT | O_TRUNC | O_RDWR, 0666);
+ if (fd < 0)
+ die("Failed to write dtb to %s", dtb_file);
+
+ count = write(fd, fdt, FDT_MAX_SIZE);
+ if (count < 0)
+ die_perror("Failed to dump dtb");
+
+ pr_info("Wrote %d bytes to dtb %s\n", count, dtb_file);
+ close(fd);
+}
+
+#define CPU_NAME_MAX_LEN 8
+static void generate_cpu_nodes(void *fdt, struct kvm *kvm)
+{
+ int cpu;
+
+ _FDT(fdt_begin_node(fdt, "cpus"));
+ _FDT(fdt_property_cell(fdt, "#address-cells", 0x1));
+ _FDT(fdt_property_cell(fdt, "#size-cells", 0x0));
+
+ for (cpu = 0; cpu < kvm->nrcpus; ++cpu) {
+ char cpu_name[CPU_NAME_MAX_LEN];
+ struct kvm_cpu *vcpu = kvm->cpus[cpu];
+ unsigned long mpidr = kvm_cpu__get_vcpu_mpidr(vcpu);
+
+ mpidr &= ARM_MPIDR_HWID_BITMASK;
+ snprintf(cpu_name, CPU_NAME_MAX_LEN, "cpu@%lx", mpidr);
+
+ _FDT(fdt_begin_node(fdt, cpu_name));
+ _FDT(fdt_property_string(fdt, "device_type", "cpu"));
+ _FDT(fdt_property_string(fdt, "compatible", vcpu->cpu_compatible));
+
+ if (kvm->nrcpus > 1)
+ _FDT(fdt_property_string(fdt, "enable-method", "psci"));
+
+ _FDT(fdt_property_cell(fdt, "reg", mpidr));
+ _FDT(fdt_end_node(fdt));
+ }
+
+ _FDT(fdt_end_node(fdt));
+}
+
+static void generate_irq_prop(void *fdt, u8 irq)
+{
+ u32 irq_prop[] = {
+ cpu_to_fdt32(GIC_FDT_IRQ_TYPE_SPI),
+ cpu_to_fdt32(irq - GIC_SPI_IRQ_BASE),
+ cpu_to_fdt32(GIC_FDT_IRQ_FLAGS_EDGE_LO_HI),
+ };
+
+ _FDT(fdt_property(fdt, "interrupts", irq_prop, sizeof(irq_prop)));
+}
+
+static int setup_fdt(struct kvm *kvm)
+{
+	struct device_header *dev_hdr;
+	u8 staging_fdt[FDT_MAX_SIZE];
+	u32 gic_phandle = fdt__alloc_phandle();
+	u64 mem_reg_prop[] = {
+		cpu_to_fdt64(kvm->arch.memory_guest_start),
+		cpu_to_fdt64(kvm->ram_size),
+	};
+	void *fdt = staging_fdt;
+	void *fdt_dest = guest_flat_to_host(kvm,
+					    kvm->arch.dtb_guest_start);
+	void (*generate_mmio_fdt_nodes)(void *, struct device_header *,
+					void (*)(void *, u8));
+	void (*generate_cpu_peripheral_fdt_nodes)(void *, struct kvm *, u32)
+					= kvm->cpus[0]->generate_fdt_nodes;
+
+	/* Create new tree without a reserve map */
+	_FDT(fdt_create(fdt, FDT_MAX_SIZE));
+	_FDT(fdt_finish_reservemap(fdt));
+
+	/* Header */
+	_FDT(fdt_begin_node(fdt, ""));
+	_FDT(fdt_property_cell(fdt, "interrupt-parent", gic_phandle));
+	_FDT(fdt_property_string(fdt, "compatible", "linux,dummy-virt"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x2));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
+
+	/* /chosen */
+	_FDT(fdt_begin_node(fdt, "chosen"));
+	_FDT(fdt_property_cell(fdt, "linux,pci-probe-only", 1));
+	_FDT(fdt_property_string(fdt, "bootargs", kern_cmdline));
+
+	/* Initrd: 64-bit cells, matching #address-cells = 2 above */
+	if (kvm->arch.initrd_size != 0) {
+		u64 ird_st_prop = cpu_to_fdt64(kvm->arch.initrd_guest_start);
+		u64 ird_end_prop = cpu_to_fdt64(kvm->arch.initrd_guest_start +
+					       kvm->arch.initrd_size);
+
+		_FDT(fdt_property(fdt, "linux,initrd-start",
+				   &ird_st_prop, sizeof(ird_st_prop)));
+		_FDT(fdt_property(fdt, "linux,initrd-end",
+				   &ird_end_prop, sizeof(ird_end_prop)));
+	}
+	_FDT(fdt_end_node(fdt));
+
+	/* Memory */
+	_FDT(fdt_begin_node(fdt, "memory"));
+	_FDT(fdt_property_string(fdt, "device_type", "memory"));
+	_FDT(fdt_property(fdt, "reg", mem_reg_prop, sizeof(mem_reg_prop)));
+	_FDT(fdt_end_node(fdt));
+
+	/* CPU and peripherals (interrupt controller, timers, etc) */
+	generate_cpu_nodes(fdt, kvm);
+	if (generate_cpu_peripheral_fdt_nodes)
+		generate_cpu_peripheral_fdt_nodes(fdt, kvm, gic_phandle);
+
+	/* Virtio MMIO devices */
+	dev_hdr = device__first_dev(DEVICE_BUS_MMIO);
+	while (dev_hdr) {
+		generate_mmio_fdt_nodes = dev_hdr->data;
+		generate_mmio_fdt_nodes(fdt, dev_hdr, generate_irq_prop);
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	/* IOPORT devices (!) */
+	dev_hdr = device__first_dev(DEVICE_BUS_IOPORT);
+	while (dev_hdr) {
+		generate_mmio_fdt_nodes = dev_hdr->data;
+		generate_mmio_fdt_nodes(fdt, dev_hdr, generate_irq_prop);
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	/* PCI host controller */
+	pci__generate_fdt_nodes(fdt, gic_phandle);
+
+	/* PSCI firmware */
+	_FDT(fdt_begin_node(fdt, "psci"));
+	_FDT(fdt_property_string(fdt, "compatible", "arm,psci"));
+	_FDT(fdt_property_string(fdt, "method", "hvc"));
+	_FDT(fdt_property_cell(fdt, "cpu_suspend", KVM_PSCI_FN_CPU_SUSPEND));
+	_FDT(fdt_property_cell(fdt, "cpu_off", KVM_PSCI_FN_CPU_OFF));
+	_FDT(fdt_property_cell(fdt, "cpu_on", KVM_PSCI_FN_CPU_ON));
+	_FDT(fdt_property_cell(fdt, "migrate", KVM_PSCI_FN_MIGRATE));
+	_FDT(fdt_end_node(fdt));
+
+	/* Finalise. */
+	_FDT(fdt_end_node(fdt));
+	_FDT(fdt_finish(fdt));
+
+	_FDT(fdt_open_into(fdt, fdt_dest, FDT_MAX_SIZE));
+	_FDT(fdt_pack(fdt_dest));
+
+	if (kvm->cfg.arch.dump_dtb_filename)
+		dump_fdt(kvm->cfg.arch.dump_dtb_filename, fdt_dest);
+	return 0;
+}
+late_init(setup_fdt);
+
+static int read_image(int fd, void **pos, void *limit)
+{
+	int count;
+
+	while (((count = xread(fd, *pos, SZ_64K)) > 0) && *pos <= limit)
+		*pos += count;
+
+	if (count < 0)
+		die_perror("xread");
+
+	return *pos < limit ? 0 : -ENOMEM;
+}
+
+#define FDT_ALIGN SZ_2M
+#define INITRD_ALIGN 4
+int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd,
+ const char *kernel_cmdline)
+{
+ void *pos, *kernel_end, *limit;
+ unsigned long guest_addr;
+
+ if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+ die_perror("lseek");
+
+ /*
+ * Linux requires the initrd and dtb to be mapped inside lowmem,
+ * so we can't just place them at the top of memory.
+ */
+ limit = kvm->ram_start + min(kvm->ram_size, (u64)SZ_256M) - 1;
+
+ pos = kvm->ram_start + ARM_KERN_OFFSET(kvm);
+ kvm->arch.kern_guest_start = host_to_guest_flat(kvm, pos);
+ if (read_image(fd_kernel, &pos, limit) == -ENOMEM)
+ die("kernel image too big to contain in guest memory.");
+
+ kernel_end = pos;
+ pr_info("Loaded kernel to 0x%llx (%llu bytes)",
+ kvm->arch.kern_guest_start,
+ host_to_guest_flat(kvm, pos) - kvm->arch.kern_guest_start);
+
+ /*
+ * Now load backwards from the end of memory so the kernel
+ * decompressor has plenty of space to work with. First up is
+ * the device tree blob...
+ */
+ pos = limit;
+ pos -= (FDT_MAX_SIZE + FDT_ALIGN);
+ guest_addr = ALIGN(host_to_guest_flat(kvm, pos), FDT_ALIGN);
+ pos = guest_flat_to_host(kvm, guest_addr);
+ if (pos < kernel_end)
+ die("fdt overlaps with kernel image.");
+
+ kvm->arch.dtb_guest_start = guest_addr;
+ pr_info("Placing fdt at 0x%llx - 0x%llx",
+ kvm->arch.dtb_guest_start,
+ host_to_guest_flat(kvm, limit));
+ limit = pos;
+
+ /* ... and finally the initrd, if we have one. */
+ if (fd_initrd != -1) {
+ struct stat sb;
+ unsigned long initrd_start;
+
+ if (lseek(fd_initrd, 0, SEEK_SET) < 0)
+ die_perror("lseek");
+
+ if (fstat(fd_initrd, &sb))
+ die_perror("fstat");
+
+ pos -= (sb.st_size + INITRD_ALIGN);
+ guest_addr = ALIGN(host_to_guest_flat(kvm, pos), INITRD_ALIGN);
+ pos = guest_flat_to_host(kvm, guest_addr);
+ if (pos < kernel_end)
+ die("initrd overlaps with kernel image.");
+
+ initrd_start = guest_addr;
+ if (read_image(fd_initrd, &pos, limit) == -ENOMEM)
+ die("initrd too big to contain in guest memory.");
+
+ kvm->arch.initrd_guest_start = initrd_start;
+ kvm->arch.initrd_size = host_to_guest_flat(kvm, pos) - initrd_start;
+ pr_info("Loaded initrd to 0x%llx (%llu bytes)",
+ kvm->arch.initrd_guest_start,
+ kvm->arch.initrd_size);
+ } else {
+ kvm->arch.initrd_size = 0;
+ }
+
+ strncpy(kern_cmdline, kernel_cmdline, COMMAND_LINE_SIZE);
+ kern_cmdline[COMMAND_LINE_SIZE - 1] = '\0';
+
+ return true;
+}
diff --git a/tools/kvm/arm/gic.c b/tools/kvm/arm/gic.c
new file mode 100644
index 0000000..5d8cbe6
--- /dev/null
+++ b/tools/kvm/arm/gic.c
@@ -0,0 +1,80 @@
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio.h"
+
+#include "arm-common/gic.h"
+
+#include <linux/byteorder.h>
+#include <linux/kvm.h>
+
+int gic__init_irqchip(struct kvm *kvm)
+{
+ int err;
+ struct kvm_arm_device_addr gic_addr[] = {
+ [0] = {
+ .id = KVM_VGIC_V2_ADDR_TYPE_DIST |
+ (KVM_ARM_DEVICE_VGIC_V2 << KVM_ARM_DEVICE_ID_SHIFT),
+ .addr = ARM_GIC_DIST_BASE,
+ },
+ [1] = {
+ .id = KVM_VGIC_V2_ADDR_TYPE_CPU |
+ (KVM_ARM_DEVICE_VGIC_V2 << KVM_ARM_DEVICE_ID_SHIFT),
+ .addr = ARM_GIC_CPUI_BASE,
+ }
+ };
+
+ if (kvm->nrcpus > GIC_MAX_CPUS) {
+ pr_warning("%d CPUS greater than maximum of %d -- truncating\n",
+ kvm->nrcpus, GIC_MAX_CPUS);
+ kvm->nrcpus = GIC_MAX_CPUS;
+ }
+
+ err = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
+ if (err)
+ return err;
+
+ err = ioctl(kvm->vm_fd, KVM_ARM_SET_DEVICE_ADDR, &gic_addr[0]);
+ if (err)
+ return err;
+
+ err = ioctl(kvm->vm_fd, KVM_ARM_SET_DEVICE_ADDR, &gic_addr[1]);
+ return err;
+}
+
+void gic__generate_fdt_nodes(void *fdt, u32 phandle)
+{
+ u64 reg_prop[] = {
+ cpu_to_fdt64(ARM_GIC_DIST_BASE), cpu_to_fdt64(ARM_GIC_DIST_SIZE),
+ cpu_to_fdt64(ARM_GIC_CPUI_BASE), cpu_to_fdt64(ARM_GIC_CPUI_SIZE),
+ };
+
+ _FDT(fdt_begin_node(fdt, "intc"));
+ _FDT(fdt_property_string(fdt, "compatible", "arm,cortex-a15-gic"));
+ _FDT(fdt_property_cell(fdt, "#interrupt-cells", GIC_FDT_IRQ_NUM_CELLS));
+ _FDT(fdt_property(fdt, "interrupt-controller", NULL, 0));
+ _FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop)));
+ _FDT(fdt_property_cell(fdt, "phandle", phandle));
+ _FDT(fdt_end_node(fdt));
+}
+
+#define KVM_IRQCHIP_IRQ(x) (KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT) |\
+ ((x) & KVM_ARM_IRQ_NUM_MASK)
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+ struct kvm_irq_level irq_level = {
+ .irq = KVM_IRQCHIP_IRQ(irq),
+ .level = !!level,
+ };
+
+ if (irq < GIC_SPI_IRQ_BASE || irq > GIC_MAX_IRQ)
+ pr_warning("Ignoring invalid GIC IRQ %d", irq);
+ else if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0)
+ pr_warning("Could not KVM_IRQ_LINE for irq %d", irq);
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+ kvm__irq_line(kvm, irq, VIRTIO_IRQ_HIGH);
+ kvm__irq_line(kvm, irq, VIRTIO_IRQ_LOW);
+}
diff --git a/tools/kvm/arm/include/arm-common/gic.h b/tools/kvm/arm/include/arm-common/gic.h
new file mode 100644
index 0000000..850edc7
--- /dev/null
+++ b/tools/kvm/arm/include/arm-common/gic.h
@@ -0,0 +1,35 @@
+#ifndef ARM_COMMON__GIC_H
+#define ARM_COMMON__GIC_H
+
+/* First interrupt ID of each GIC interrupt class (SGI/PPI/SPI). */
+#define GIC_SGI_IRQ_BASE		0
+#define GIC_PPI_IRQ_BASE		16
+#define GIC_SPI_IRQ_BASE		32
+
+/* Each FDT interrupt specifier is <type, number, flags>. */
+#define GIC_FDT_IRQ_NUM_CELLS		3
+
+/* First cell: interrupt type. */
+#define GIC_FDT_IRQ_TYPE_SPI		0
+#define GIC_FDT_IRQ_TYPE_PPI		1
+
+/* Third cell: trigger flags. */
+#define GIC_FDT_IRQ_FLAGS_EDGE_LO_HI	1
+#define GIC_FDT_IRQ_FLAGS_EDGE_HI_LO	2
+#define GIC_FDT_IRQ_FLAGS_LEVEL_HI	4
+#define GIC_FDT_IRQ_FLAGS_LEVEL_LO	8
+
+/* For PPIs, the third cell also carries a target-CPU bitmask. */
+#define GIC_FDT_IRQ_PPI_CPU_SHIFT	8
+#define GIC_FDT_IRQ_PPI_CPU_MASK	(0xff << GIC_FDT_IRQ_PPI_CPU_SHIFT)
+
+/* GIC CPU interface register bits/values. */
+#define GIC_CPUI_CTLR_EN		(1 << 0)
+#define GIC_CPUI_PMR_MIN_PRIO		0xff
+
+/* Byte offset of the Priority Mask Register in the CPU interface. */
+#define GIC_CPUI_OFF_PMR		4
+
+#define GIC_MAX_CPUS			8
+#define GIC_MAX_IRQ			255
+
+struct kvm;
+
+int gic__alloc_irqnum(void);
+int gic__init_irqchip(struct kvm *kvm);
+void gic__generate_fdt_nodes(void *fdt, u32 phandle);
+
+#endif /* ARM_COMMON__GIC_H */
diff --git a/tools/kvm/arm/include/arm-common/kvm-arch.h b/tools/kvm/arm/include/arm-common/kvm-arch.h
new file mode 100644
index 0000000..082131d
--- /dev/null
+++ b/tools/kvm/arm/include/arm-common/kvm-arch.h
@@ -0,0 +1,78 @@
+#ifndef ARM_COMMON__KVM_ARCH_H
+#define ARM_COMMON__KVM_ARCH_H
+
+#include <stdbool.h>
+#include <linux/const.h>
+#include <linux/types.h>
+
+#include "arm-common/gic.h"
+
+/*
+ * Guest physical memory map:
+ *   [IOPORT_AREA, MMIO_AREA)   emulated "port" I/O
+ *   [MMIO_AREA,   AXI_AREA)    virtio-mmio devices (GIC sits just
+ *                              below AXI_AREA, see below)
+ *   [AXI_AREA,    MEMORY_AREA) PCI config space then PCI MMIO
+ *   [MEMORY_AREA, ...)         guest RAM
+ */
+#define ARM_IOPORT_AREA		_AC(0x0000000000000000, UL)
+#define ARM_MMIO_AREA		_AC(0x0000000000010000, UL)
+#define ARM_AXI_AREA		_AC(0x0000000040000000, UL)
+#define ARM_MEMORY_AREA		_AC(0x0000000080000000, UL)
+
+/* RAM limits for 32-bit (4GB) and LPAE/64-bit (1TB) physical maps. */
+#define ARM_LOMAP_MAX_MEMORY	((1ULL << 32) - ARM_MEMORY_AREA)
+#define ARM_HIMAP_MAX_MEMORY	((1ULL << 40) - ARM_MEMORY_AREA)
+
+/* The GIC is packed immediately below the AXI area: CPUI then DIST. */
+#define ARM_GIC_DIST_BASE	(ARM_AXI_AREA - ARM_GIC_DIST_SIZE)
+#define ARM_GIC_CPUI_BASE	(ARM_GIC_DIST_BASE - ARM_GIC_CPUI_SIZE)
+#define ARM_GIC_SIZE		(ARM_GIC_DIST_SIZE + ARM_GIC_CPUI_SIZE)
+
+#define ARM_IOPORT_SIZE		(ARM_MMIO_AREA - ARM_IOPORT_AREA)
+#define ARM_VIRTIO_MMIO_SIZE	(ARM_AXI_AREA - (ARM_MMIO_AREA + ARM_GIC_SIZE))
+#define ARM_PCI_CFG_SIZE	(1ULL << 24)
+#define ARM_PCI_MMIO_SIZE	(ARM_MEMORY_AREA - \
+				(ARM_AXI_AREA + ARM_PCI_CFG_SIZE))
+
+/* Generic names used by the arch-independent core. */
+#define KVM_IOPORT_AREA		ARM_IOPORT_AREA
+#define KVM_PCI_CFG_AREA	ARM_AXI_AREA
+#define KVM_PCI_MMIO_AREA	(KVM_PCI_CFG_AREA + ARM_PCI_CFG_SIZE)
+#define KVM_VIRTIO_MMIO_AREA	ARM_MMIO_AREA
+
+/* Device interrupts are allocated from the SPI space. */
+#define KVM_IRQ_OFFSET		GIC_SPI_IRQ_BASE
+
+#define KVM_VM_TYPE		0
+
+/* virtio transport defaults to MMIO unless --force-pci was given. */
+#define VIRTIO_DEFAULT_TRANS(kvm)	\
+	((kvm)->cfg.arch.virtio_trans_pci ? VIRTIO_PCI : VIRTIO_MMIO)
+
+#define VIRTIO_RING_ENDIAN	(VIRTIO_ENDIAN_LE | VIRTIO_ENDIAN_BE)
+
+/* True iff @phys_addr falls inside the emulated ioport window. */
+static inline bool arm_addr_in_ioport_region(u64 phys_addr)
+{
+	u64 limit = KVM_IOPORT_AREA + ARM_IOPORT_SIZE;
+	return phys_addr >= KVM_IOPORT_AREA && phys_addr < limit;
+}
+
+/* True iff @phys_addr falls inside the virtio-mmio window. */
+static inline bool arm_addr_in_virtio_mmio_region(u64 phys_addr)
+{
+	u64 limit = KVM_VIRTIO_MMIO_AREA + ARM_VIRTIO_MMIO_SIZE;
+	return phys_addr >= KVM_VIRTIO_MMIO_AREA && phys_addr < limit;
+}
+
+/* True iff @phys_addr falls inside PCI config space or PCI MMIO. */
+static inline bool arm_addr_in_pci_region(u64 phys_addr)
+{
+	u64 limit = KVM_PCI_CFG_AREA + ARM_PCI_CFG_SIZE + ARM_PCI_MMIO_SIZE;
+	return phys_addr >= KVM_PCI_CFG_AREA && phys_addr < limit;
+}
+
+struct kvm_arch {
+	/*
+	 * We may have to align the guest memory for virtio, so keep the
+	 * original pointers here for munmap.
+	 */
+	void	*ram_alloc_start;
+	u64	ram_alloc_size;
+
+	/*
+	 * Guest addresses for memory layout.
+	 */
+	u64	memory_guest_start;
+	u64	kern_guest_start;
+	u64	initrd_guest_start;
+	u64	initrd_size;
+	u64	dtb_guest_start;
+};
+
+#endif /* ARM_COMMON__KVM_ARCH_H */
diff --git a/tools/kvm/arm/include/arm-common/kvm-config-arch.h b/tools/kvm/arm/include/arm-common/kvm-config-arch.h
new file mode 100644
index 0000000..a8ebd94
--- /dev/null
+++ b/tools/kvm/arm/include/arm-common/kvm-config-arch.h
@@ -0,0 +1,26 @@
+#ifndef ARM_COMMON__KVM_CONFIG_ARCH_H
+#define ARM_COMMON__KVM_CONFIG_ARCH_H
+
+#include "kvm/parse-options.h"
+
+/* ARM-specific bits of the "lkvm run" configuration. */
+struct kvm_config_arch {
+	const char	*dump_dtb_filename;	/* --dump-dtb target, or NULL */
+	unsigned int	force_cntfrq;		/* 0 = use firmware CNTFRQ */
+	bool		virtio_trans_pci;	/* --force-pci */
+	bool		aarch32_guest;		/* run a 32-bit guest */
+};
+
+/*
+ * Spliced into the generic run-command option table; ARM_OPT_ARCH_RUN
+ * is supplied by the (arm/aarch64) sub-architecture.
+ */
+#define OPT_ARCH_RUN(pfx, cfg)						\
+	pfx,								\
+	ARM_OPT_ARCH_RUN(cfg)						\
+	OPT_STRING('\0', "dump-dtb", &(cfg)->dump_dtb_filename,		\
+		   ".dtb file", "Dump generated .dtb to specified file"),	\
+	OPT_UINTEGER('\0', "override-bad-firmware-cntfrq", &(cfg)->force_cntfrq,\
+		     "Specify Generic Timer frequency in guest DT to "	\
+		     "work around buggy secure firmware *Firmware should be "	\
+		     "updated to program CNTFRQ correctly*"),		\
+	OPT_BOOLEAN('\0', "force-pci", &(cfg)->virtio_trans_pci,	\
+		    "Force virtio devices to use PCI as their default "	\
+		    "transport"),
+
+#endif /* ARM_COMMON__KVM_CONFIG_ARCH_H */
diff --git a/tools/kvm/arm/include/arm-common/kvm-cpu-arch.h b/tools/kvm/arm/include/arm-common/kvm-cpu-arch.h
new file mode 100644
index 0000000..83cd8b8
--- /dev/null
+++ b/tools/kvm/arm/include/arm-common/kvm-cpu-arch.h
@@ -0,0 +1,50 @@
+#ifndef ARM_COMMON__KVM_CPU_ARCH_H
+#define ARM_COMMON__KVM_CPU_ARCH_H
+
+#include <linux/kvm.h>
+#include <pthread.h>
+#include <stdbool.h>
+
+struct kvm;
+
+/* Per-vcpu state for the ARM port. */
+struct kvm_cpu {
+	pthread_t	thread;		/* vcpu execution thread */
+
+	unsigned long	cpu_id;
+	unsigned long	cpu_type;	/* matched kvm_arm_target id */
+	const char	*cpu_compatible;	/* FDT compatible string */
+
+	struct kvm	*kvm;
+	int		vcpu_fd;	/* KVM_CREATE_VCPU fd */
+	struct kvm_run	*kvm_run;	/* mmap'ed shared run structure */
+
+	u8		is_running;
+	u8		paused;
+	u8		needs_nmi;
+
+	/* Coalesced-MMIO ring, or NULL when the capability is absent. */
+	struct kvm_coalesced_mmio_ring	*ring;
+
+	void	(*generate_fdt_nodes)(void *fdt, struct kvm* kvm,
+				      u32 gic_phandle);
+};
+
+/* A supported CPU model: KVM target id plus its init hook. */
+struct kvm_arm_target {
+	u32		id;
+	const char	*compatible;
+	int		(*init)(struct kvm_cpu *vcpu);
+};
+
+int kvm_cpu__register_kvm_arm_target(struct kvm_arm_target *target);
+
+/* ARM has no real port I/O; always report the access as unhandled. */
+static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data,
+				       int direction, int size, u32 count)
+{
+	return false;
+}
+
+bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data,
+			   u32 len, u8 is_write);
+
+unsigned long kvm_cpu__get_vcpu_mpidr(struct kvm_cpu *vcpu);
+
+#endif /* ARM_COMMON__KVM_CPU_ARCH_H */
diff --git a/tools/kvm/arm/include/arm-common/pci.h b/tools/kvm/arm/include/arm-common/pci.h
new file mode 100644
index 0000000..ee87725
--- /dev/null
+++ b/tools/kvm/arm/include/arm-common/pci.h
@@ -0,0 +1,6 @@
+#ifndef ARM_COMMON__PCI_H
+#define ARM_COMMON__PCI_H
+
+/* Emit the PCI host-bridge node (with interrupt-map) into the FDT. */
+void pci__generate_fdt_nodes(void *fdt, u32 gic_phandle);
+
+#endif /* ARM_COMMON__PCI_H */
diff --git a/tools/kvm/arm/include/arm-common/timer.h b/tools/kvm/arm/include/arm-common/timer.h
new file mode 100644
index 0000000..928e9ea
--- /dev/null
+++ b/tools/kvm/arm/include/arm-common/timer.h
@@ -0,0 +1,6 @@
+#ifndef ARM_COMMON__TIMER_H
+#define ARM_COMMON__TIMER_H
+
+/* Emit the arch-timer node; @irqs lists the four timer PPI numbers. */
+void timer__generate_fdt_nodes(void *fdt, struct kvm *kvm, int *irqs);
+
+#endif /* ARM_COMMON__TIMER_H */
diff --git a/tools/kvm/arm/ioport.c b/tools/kvm/arm/ioport.c
new file mode 100644
index 0000000..bdd30b6
--- /dev/null
+++ b/tools/kvm/arm/ioport.c
@@ -0,0 +1,11 @@
+#include "kvm/ioport.h"
+#include "kvm/irq.h"
+
+/* Nothing to set up: ARM has no legacy ioport devices. */
+void ioport__setup_arch(struct kvm *kvm)
+{
+}
+
+/* Hand out the next free interrupt line for an emulated ioport device. */
+void ioport__map_irq(u8 *irq)
+{
+	*irq = irq__alloc_line();
+}
diff --git a/tools/kvm/arm/irq.c b/tools/kvm/arm/irq.c
new file mode 100644
index 0000000..d8f44df
--- /dev/null
+++ b/tools/kvm/arm/irq.c
@@ -0,0 +1,9 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+/*
+ * MSI-X routing is not implemented for the ARM GIC yet; reaching this
+ * is a caller bug, so terminate loudly with the function name.
+ * __func__ is the C99-standard spelling (the old GCC-only __FUNCTION__
+ * alias is retained by compilers purely for backwards compatibility).
+ */
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg)
+{
+	die(__func__);
+	return 0;
+}
diff --git a/tools/kvm/arm/kvm-cpu.c b/tools/kvm/arm/kvm-cpu.c
new file mode 100644
index 0000000..aeaa4cf
--- /dev/null
+++ b/tools/kvm/arm/kvm-cpu.c
@@ -0,0 +1,119 @@
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+/* fd that debug dumps are written to; defaults to fd 0 until set. */
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+	debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+	return debug_fd;
+}
+
+/* Targets register themselves here (first free slot wins). */
+static struct kvm_arm_target *kvm_arm_targets[KVM_ARM_NUM_TARGETS];
+
+/*
+ * Add @target to the probe list used by kvm_cpu__arch_init().
+ * Returns 0 on success, -ENOSPC when the table is full.
+ */
+int kvm_cpu__register_kvm_arm_target(struct kvm_arm_target *target)
+{
+	unsigned int i = 0;
+
+	for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) {
+		if (!kvm_arm_targets[i]) {
+			kvm_arm_targets[i] = target;
+			return 0;
+		}
+	}
+
+	return -ENOSPC;
+}
+
+/*
+ * Create and initialise vcpu @cpu_id.  Probes the registered ARM
+ * targets in order until KVM accepts one via KVM_ARM_VCPU_INIT, then
+ * runs that target's init hook.  Returns NULL only on allocation
+ * failure; every other error is fatal (die()).
+ */
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id)
+{
+	struct kvm_arm_target *target;
+	struct kvm_cpu *vcpu;
+	int coalesced_offset, mmap_size, err = -1;
+	unsigned int i;
+	struct kvm_vcpu_init vcpu_init = {
+		.features = ARM_VCPU_FEATURE_FLAGS(kvm, cpu_id)
+	};
+
+	vcpu = calloc(1, sizeof(struct kvm_cpu));
+	if (!vcpu)
+		return NULL;
+
+	vcpu->vcpu_fd = ioctl(kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+	if (vcpu->vcpu_fd < 0)
+		die_perror("KVM_CREATE_VCPU ioctl");
+
+	mmap_size = ioctl(kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+	if (mmap_size < 0)
+		die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+	vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED,
+			     vcpu->vcpu_fd, 0);
+	if (vcpu->kvm_run == MAP_FAILED)
+		die("unable to mmap vcpu fd");
+
+	/* Find an appropriate target CPU type. */
+	for (i = 0; i < ARRAY_SIZE(kvm_arm_targets); ++i) {
+		if (!kvm_arm_targets[i])
+			continue;
+		target = kvm_arm_targets[i];
+		vcpu_init.target = target->id;
+		err = ioctl(vcpu->vcpu_fd, KVM_ARM_VCPU_INIT, &vcpu_init);
+		if (!err)
+			break;
+	}
+
+	if (err || target->init(vcpu))
+		die("Unable to initialise ARM vcpu");
+
+	/*
+	 * KVM_CHECK_EXTENSION returns the ring offset (in pages) when
+	 * coalesced MMIO is supported and 0 when it is not.  Guard with
+	 * "> 0" so a failing ioctl() (-1) is not misinterpreted as a
+	 * valid offset and turned into a bogus ring pointer.
+	 */
+	coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION,
+				 KVM_CAP_COALESCED_MMIO);
+	if (coalesced_offset > 0)
+		vcpu->ring = (void *)vcpu->kvm_run +
+			     (coalesced_offset * PAGE_SIZE);
+
+	/* Populate the vcpu structure. */
+	vcpu->kvm		= kvm;
+	vcpu->cpu_id		= cpu_id;
+	vcpu->cpu_type		= target->id;
+	vcpu->cpu_compatible	= target->compatible;
+	vcpu->is_running	= true;
+	return vcpu;
+}
+
+/* NMIs do not exist on ARM; nothing to inject. */
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+	free(vcpu);
+}
+
+/* No arch-specific exit reasons are handled here. */
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+	return false;
+}
+
+/*
+ * Dispatch a guest MMIO access by physical address: virtio-mmio and
+ * PCI windows go to the MMIO emulator, the ioport window is translated
+ * into a port access.  Returns false for unclaimed addresses.
+ */
+bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data,
+			   u32 len, u8 is_write)
+{
+	if (arm_addr_in_virtio_mmio_region(phys_addr)) {
+		return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
+	} else if (arm_addr_in_ioport_region(phys_addr)) {
+		int direction = is_write ? KVM_EXIT_IO_OUT : KVM_EXIT_IO_IN;
+		u16 port = (phys_addr - KVM_IOPORT_AREA) & USHRT_MAX;
+		return kvm__emulate_io(vcpu, port, data, direction, len, 1);
+	} else if (arm_addr_in_pci_region(phys_addr)) {
+		return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
+	}
+
+	return false;
+}
+
+/* Page-table dumping is not implemented on ARM. */
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+}
diff --git a/tools/kvm/arm/kvm.c b/tools/kvm/arm/kvm.c
new file mode 100644
index 0000000..58ad9fa
--- /dev/null
+++ b/tools/kvm/arm/kvm.c
@@ -0,0 +1,87 @@
+#include "kvm/kvm.h"
+#include "kvm/term.h"
+#include "kvm/util.h"
+#include "kvm/8250-serial.h"
+#include "kvm/virtio-console.h"
+
+#include "arm-common/gic.h"
+
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/sizes.h>
+
+/* KVM capabilities this port refuses to run without. */
+struct kvm_ext kvm_req_ext[] = {
+	{ DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) },
+	{ DEFINE_KVM_EXT(KVM_CAP_ONE_REG) },
+	{ DEFINE_KVM_EXT(KVM_CAP_ARM_PSCI) },
+	{ 0, 0 },
+};
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+	/* The KVM capability check is enough. */
+	return true;
+}
+
+/* Register the (already mmap'ed) guest RAM with KVM at ARM_MEMORY_AREA. */
+void kvm__init_ram(struct kvm *kvm)
+{
+	int err;
+	u64 phys_start, phys_size;
+	void *host_mem;
+
+	phys_start	= ARM_MEMORY_AREA;
+	phys_size	= kvm->ram_size;
+	host_mem	= kvm->ram_start;
+
+	err = kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+	if (err)
+		die("Failed to register %lld bytes of memory at physical "
+		    "address 0x%llx [err %d]", phys_size, phys_start, err);
+
+	kvm->arch.memory_guest_start = phys_start;
+}
+
+/* Unmap using the original (pre-alignment) pointer and size. */
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+	munmap(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size);
+}
+
+/* Poll the terminal and feed pending input to the consoles. */
+void kvm__arch_read_term(struct kvm *kvm)
+{
+	if (term_readable(0)) {
+		serial8250__update_consoles(kvm);
+		virtio_console__inject_interrupt(kvm);
+	}
+}
+
+/* Kernel command line needs no arch-specific additions. */
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+}
+
+/* Allocate guest RAM and create the in-kernel irqchip. */
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+	/*
+	 * Allocate guest memory. We must align our buffer to 64K to
+	 * correlate with the maximum guest page size for virtio-mmio.
+	 * If using THP, then our minimal alignment becomes 2M.
+	 * 2M trumps 64K, so let's go with that.
+	 */
+	kvm->ram_size = min(ram_size, (u64)ARM_MAX_MEMORY(kvm));
+	kvm->arch.ram_alloc_size = kvm->ram_size + SZ_2M;
+	kvm->arch.ram_alloc_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path,
+						kvm->arch.ram_alloc_size);
+
+	if (kvm->arch.ram_alloc_start == MAP_FAILED)
+		die("Failed to map %lld bytes for guest memory (%d)",
+		    kvm->arch.ram_alloc_size, errno);
+
+	/* Keep the aligned pointer for the guest, the raw one for munmap. */
+	kvm->ram_start = (void *)ALIGN((unsigned long)kvm->arch.ram_alloc_start,
+				SZ_2M);
+
+	madvise(kvm->arch.ram_alloc_start, kvm->arch.ram_alloc_size,
+		MADV_MERGEABLE | MADV_HUGEPAGE);
+
+	/* Initialise the virtual GIC. */
+	if (gic__init_irqchip(kvm))
+		die("Failed to initialise virtual GIC");
+}
diff --git a/tools/kvm/arm/pci.c b/tools/kvm/arm/pci.c
new file mode 100644
index 0000000..9f4dabc
--- /dev/null
+++ b/tools/kvm/arm/pci.c
@@ -0,0 +1,117 @@
+#include "kvm/devices.h"
+#include "kvm/fdt.h"
+#include "kvm/of_pci.h"
+#include "kvm/pci.h"
+#include "kvm/util.h"
+
+#include "arm-common/pci.h"
+
+/*
+ * An entry in the interrupt-map table looks like:
+ * <pci unit address> <pci interrupt pin> <gic phandle> <gic interrupt>
+ */
+
+/* A GIC interrupt specifier as it appears in the FDT (3 cells). */
+struct of_gic_irq {
+	u32 type, num, flags;
+} __attribute__((packed));
+
+/* Packed so the table can be emitted directly as the property value. */
+struct of_interrupt_map_entry {
+	struct of_pci_irq_mask		pci_irq_mask;
+	u32				gic_phandle;
+	struct of_gic_irq		gic_irq;
+} __attribute__((packed));
+
+/*
+ * Emit the PCI host-bridge node: config space, I/O and 32-bit memory
+ * ranges, and an interrupt-map routing each device's INTx pin to a
+ * GIC SPI.  @gic_phandle must match the "intc" node's phandle.
+ */
+void pci__generate_fdt_nodes(void *fdt, u32 gic_phandle)
+{
+	struct device_header *dev_hdr;
+	struct of_interrupt_map_entry irq_map[OF_PCI_IRQ_MAP_MAX];
+	unsigned nentries = 0;
+	/* Bus range */
+	u32 bus_range[] = { cpu_to_fdt32(0), cpu_to_fdt32(1), };
+	/* Configuration Space */
+	u64 cfg_reg_prop[] = { cpu_to_fdt64(KVM_PCI_CFG_AREA),
+			       cpu_to_fdt64(ARM_PCI_CFG_SIZE), };
+	/* Describe the memory ranges */
+	struct of_pci_ranges_entry ranges[] = {
+		{
+			.pci_addr = {
+				.hi	= cpu_to_fdt32(of_pci_b_ss(OF_PCI_SS_IO)),
+				.mid	= 0,
+				.lo	= 0,
+			},
+			.cpu_addr	= cpu_to_fdt64(KVM_IOPORT_AREA),
+			.length		= cpu_to_fdt64(ARM_IOPORT_SIZE),
+		},
+		{
+			.pci_addr = {
+				.hi	= cpu_to_fdt32(of_pci_b_ss(OF_PCI_SS_M32)),
+				.mid	= cpu_to_fdt32(KVM_PCI_MMIO_AREA >> 32),
+				.lo	= cpu_to_fdt32(KVM_PCI_MMIO_AREA),
+			},
+			.cpu_addr	= cpu_to_fdt64(KVM_PCI_MMIO_AREA),
+			.length		= cpu_to_fdt64(ARM_PCI_MMIO_SIZE),
+		},
+	};
+
+	/* Boilerplate PCI properties */
+	_FDT(fdt_begin_node(fdt, "pci"));
+	_FDT(fdt_property_string(fdt, "device_type", "pci"));
+	_FDT(fdt_property_cell(fdt, "#address-cells", 0x3));
+	_FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
+	_FDT(fdt_property_cell(fdt, "#interrupt-cells", 0x1));
+	_FDT(fdt_property_string(fdt, "compatible", "pci-host-cam-generic"));
+
+	_FDT(fdt_property(fdt, "bus-range", bus_range, sizeof(bus_range)));
+	_FDT(fdt_property(fdt, "reg", &cfg_reg_prop, sizeof(cfg_reg_prop)));
+	_FDT(fdt_property(fdt, "ranges", ranges, sizeof(ranges)));
+
+	/* Generate the interrupt map ... */
+	dev_hdr = device__first_dev(DEVICE_BUS_PCI);
+	while (dev_hdr && nentries < ARRAY_SIZE(irq_map)) {
+		struct of_interrupt_map_entry *entry = &irq_map[nentries];
+		struct pci_device_header *pci_hdr = dev_hdr->data;
+		u8 dev_num = dev_hdr->dev_num;
+		u8 pin = pci_hdr->irq_pin;
+		u8 irq = pci_hdr->irq_line;
+
+		*entry = (struct of_interrupt_map_entry) {
+			.pci_irq_mask = {
+				.pci_addr = {
+					.hi	= cpu_to_fdt32(of_pci_b_ddddd(dev_num)),
+					.mid	= 0,
+					.lo	= 0,
+				},
+				.pci_pin	= cpu_to_fdt32(pin),
+			},
+			.gic_phandle	= cpu_to_fdt32(gic_phandle),
+			.gic_irq = {
+				.type	= cpu_to_fdt32(GIC_FDT_IRQ_TYPE_SPI),
+				/* FDT SPI numbers are relative to the SPI base. */
+				.num	= cpu_to_fdt32(irq - GIC_SPI_IRQ_BASE),
+				.flags	= cpu_to_fdt32(GIC_FDT_IRQ_FLAGS_EDGE_LO_HI),
+			},
+		};
+
+		nentries++;
+		dev_hdr = device__next_dev(dev_hdr);
+	}
+
+	_FDT(fdt_property(fdt, "interrupt-map", irq_map,
+			  sizeof(struct of_interrupt_map_entry) * nentries));
+
+	/* ... and the corresponding mask. */
+	if (nentries) {
+		struct of_pci_irq_mask irq_mask = {
+			.pci_addr = {
+				.hi	= cpu_to_fdt32(of_pci_b_ddddd(-1)),
+				.mid	= 0,
+				.lo	= 0,
+			},
+			.pci_pin	= cpu_to_fdt32(7),
+		};
+
+		_FDT(fdt_property(fdt, "interrupt-map-mask", &irq_mask,
+				  sizeof(irq_mask)));
+	}
+
+	_FDT(fdt_end_node(fdt));
+}
diff --git a/tools/kvm/arm/timer.c b/tools/kvm/arm/timer.c
new file mode 100644
index 0000000..209251e
--- /dev/null
+++ b/tools/kvm/arm/timer.c
@@ -0,0 +1,41 @@
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+
+#include "arm-common/gic.h"
+#include "arm-common/timer.h"
+
+/*
+ * Emit the architected timer node.  @irqs holds the four timer PPI
+ * numbers in FDT order: secure, non-secure, virtual, hypervisor.
+ * Each PPI targets all vcpus, hence the cpu_mask in the flags cell.
+ */
+void timer__generate_fdt_nodes(void *fdt, struct kvm *kvm, int *irqs)
+{
+	/* Double NUL-separated list: matches armv8 and armv7 bindings. */
+	const char compatible[] = "arm,armv8-timer\0arm,armv7-timer";
+
+	u32 cpu_mask = (((1 << kvm->nrcpus) - 1) << GIC_FDT_IRQ_PPI_CPU_SHIFT) \
+		       & GIC_FDT_IRQ_PPI_CPU_MASK;
+	u32 irq_prop[] = {
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI),
+		cpu_to_fdt32(irqs[0]),
+		cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI),
+
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI),
+		cpu_to_fdt32(irqs[1]),
+		cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI),
+
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI),
+		cpu_to_fdt32(irqs[2]),
+		cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI),
+
+		cpu_to_fdt32(GIC_FDT_IRQ_TYPE_PPI),
+		cpu_to_fdt32(irqs[3]),
+		cpu_to_fdt32(cpu_mask | GIC_FDT_IRQ_FLAGS_EDGE_LO_HI),
+	};
+
+	_FDT(fdt_begin_node(fdt, "timer"));
+	_FDT(fdt_property(fdt, "compatible", compatible, sizeof(compatible)));
+	_FDT(fdt_property(fdt, "interrupts", irq_prop, sizeof(irq_prop)));
+	_FDT(fdt_property(fdt, "always-on", NULL, 0));
+	/* Only override CNTFRQ in the DT when the user asked for it. */
+	if (kvm->cfg.arch.force_cntfrq > 0)
+		_FDT(fdt_property_cell(fdt, "clock-frequency", kvm->cfg.arch.force_cntfrq));
+	_FDT(fdt_end_node(fdt));
+}
+
diff --git a/tools/kvm/builtin-balloon.c b/tools/kvm/builtin-balloon.c
new file mode 100644
index 0000000..d158ace
--- /dev/null
+++ b/tools/kvm/builtin-balloon.c
@@ -0,0 +1,80 @@
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-balloon.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm.h>
+#include <kvm/kvm-ipc.h>
+
+static const char *instance_name;
+static u64 inflate;	/* MB to take from the guest */
+static u64 deflate;	/* MB to give back to the guest */
+
+static const char * const balloon_usage[] = {
+	"lkvm balloon [-n name] [-p pid] [-i amount] [-d amount]",
+	NULL
+};
+
+static const struct option balloon_options[] = {
+	OPT_GROUP("Instance options:"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_GROUP("Balloon options:"),
+	OPT_U64('i', "inflate", &inflate, "Amount to inflate (in MB)"),
+	OPT_U64('d', "deflate", &deflate, "Amount to deflate (in MB)"),
+	OPT_END(),
+};
+
+void kvm_balloon_help(void)
+{
+	usage_with_options(balloon_usage, balloon_options);
+}
+
+static void parse_balloon_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, balloon_options, balloon_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_balloon_help();
+	}
+}
+
+/*
+ * "lkvm balloon": send a KVM_IPC_BALLOON message to a running instance.
+ * Positive amounts inflate, negative amounts deflate.
+ */
+int kvm_cmd_balloon(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+	/*
+	 * NOTE(review): the IPC payload is a plain int while the options
+	 * are parsed as u64 -- amounts above INT_MAX silently truncate.
+	 * Confirm against the KVM_IPC_BALLOON handler before changing.
+	 */
+	int amount;
+
+	parse_balloon_options(argc, argv);
+
+	if (inflate == 0 && deflate == 0)
+		kvm_balloon_help();
+
+	if (instance_name == NULL)
+		kvm_balloon_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	if (inflate)
+		amount = inflate;
+	else if (deflate)
+		amount = -deflate;
+	else
+		kvm_balloon_help();
+
+	r = kvm_ipc__send_msg(instance, KVM_IPC_BALLOON,
+			sizeof(amount), (u8 *)&amount);
+
+	close(instance);
+
+	if (r < 0)
+		return -1;
+
+	return 0;
+}
diff --git a/tools/kvm/builtin-debug.c b/tools/kvm/builtin-debug.c
new file mode 100644
index 0000000..4ae51d2
--- /dev/null
+++ b/tools/kvm/builtin-debug.c
@@ -0,0 +1,110 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-debug.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+#include <kvm/read-write.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+/* Chunk size for streaming the debug dump back from the instance. */
+#define BUFFER_SIZE 100
+
+static bool all;
+static int nmi = -1;	/* vcpu to NMI, -1 = none */
+static bool dump;
+static const char *instance_name;
+static const char *sysrq;
+
+static const char * const debug_usage[] = {
+	"lkvm debug [--all] [-n name] [-d] [-m vcpu]",
+	NULL
+};
+
+static const struct option debug_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('d', "dump", &dump, "Generate a debug dump from guest"),
+	OPT_INTEGER('m', "nmi", &nmi, "Generate NMI on VCPU"),
+	OPT_STRING('s', "sysrq", &sysrq, "sysrq", "Inject a sysrq"),
+	OPT_GROUP("Instance options:"),
+	OPT_BOOLEAN('a', "all", &all, "Debug all instances"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_END()
+};
+
+static void parse_debug_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, debug_options, debug_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_debug_help();
+	}
+}
+
+void kvm_debug_help(void)
+{
+	usage_with_options(debug_usage, debug_options);
+}
+
+/*
+ * Send the assembled debug command to one instance over @sock and, for
+ * dumps, stream the instance's response to stdout until EOF.
+ */
+static int do_debug(const char *name, int sock)
+{
+	char buff[BUFFER_SIZE];
+	struct debug_cmd_params cmd = {.dbg_type = 0};
+	int r;
+
+	if (dump)
+		cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_DUMP;
+
+	if (nmi != -1) {
+		cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_NMI;
+		cmd.cpu = nmi;
+	}
+
+	if (sysrq) {
+		cmd.dbg_type |= KVM_DEBUG_CMD_TYPE_SYSRQ;
+		cmd.sysrq = sysrq[0];	/* only the first character is sent */
+	}
+
+	r = kvm_ipc__send_msg(sock, KVM_IPC_DEBUG, sizeof(cmd), (u8 *)&cmd);
+	if (r < 0)
+		return r;
+
+	if (!dump)
+		return 0;
+
+	do {
+		r = xread(sock, buff, BUFFER_SIZE);
+		if (r < 0)
+			return 0;
+		printf("%.*s", r, buff);
+	} while (r > 0);
+
+	return 0;
+}
+
+/*
+ * "lkvm debug": send a debug command to one (or --all) running
+ * instances.  Declarations are placed before the first statement,
+ * matching every sibling command (kvm_cmd_pause, kvm_cmd_resume, ...)
+ * and kernel -Wdeclaration-after-statement style; the original mixed
+ * them after the parse_debug_options() call.
+ */
+int kvm_cmd_debug(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+
+	parse_debug_options(argc, argv);
+
+	if (all)
+		return kvm__enumerate_instances(do_debug);
+
+	if (instance_name == NULL)
+		kvm_debug_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	r = do_debug(instance_name, instance);
+
+	close(instance);
+
+	return r;
+}
diff --git a/tools/kvm/builtin-help.c b/tools/kvm/builtin-help.c
new file mode 100644
index 0000000..5970fb7
--- /dev/null
+++ b/tools/kvm/builtin-help.c
@@ -0,0 +1,63 @@
+#include <stdio.h>
+#include <string.h>
+
+/* user defined headers */
+#include <common-cmds.h>
+
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-help.h>
+#include <kvm/kvm.h>
+
+
+const char kvm_usage_string[] =
+	"lkvm COMMAND [ARGS]";
+
+const char kvm_more_info_string[] =
+	"See 'lkvm help COMMAND' for more information on a specific command.";
+
+
+/* Print the common-command table, aligned on the longest name. */
+static void list_common_cmds_help(void)
+{
+	unsigned int i, longest = 0;
+
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		if (longest < strlen(common_cmds[i].name))
+			longest = strlen(common_cmds[i].name);
+	}
+
+	puts(" The most commonly used lkvm commands are:");
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		printf("   %-*s   ", longest, common_cmds[i].name);
+		puts(common_cmds[i].help);
+	}
+}
+
+/* Top-level usage screen shown for bare "lkvm help". */
+static void kvm_help(void)
+{
+	printf("\n To start a simple non-privileged shell run '%s run'\n\n"
+		"usage: %s\n\n", KVM_BINARY_NAME, kvm_usage_string);
+	list_common_cmds_help();
+	printf("\n %s\n\n", kvm_more_info_string);
+}
+
+
+/* Show @cmd's own help, falling back to the general screen. */
+static void help_cmd(const char *cmd)
+{
+	struct cmd_struct *p;
+	p = kvm_get_command(kvm_commands, cmd);
+	if (!p)
+		kvm_help();
+	else if (p->help)
+		p->help();
+}
+
+int kvm_cmd_help(int argc, const char **argv, const char *prefix)
+{
+	if (!argv || !*argv) {
+		kvm_help();
+		return 0;
+	}
+	help_cmd(argv[0]);
+	return 0;
+}
diff --git a/tools/kvm/builtin-list.c b/tools/kvm/builtin-list.c
new file mode 100644
index 0000000..c35be93
--- /dev/null
+++ b/tools/kvm/builtin-list.c
@@ -0,0 +1,155 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-list.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <dirent.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <fcntl.h>
+
+static bool run;	/* -i: list running instances */
+static bool rootfs;	/* -r: list rootfs directories */
+
+static const char * const list_usage[] = {
+	"lkvm list",
+	NULL
+};
+
+static const struct option list_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('i', "run", &run, "List running instances"),
+	OPT_BOOLEAN('r', "rootfs", &rootfs, "List rootfs instances"),
+	OPT_END()
+};
+
+/* State strings shown in the listing. */
+#define KVM_INSTANCE_RUNNING	"running"
+#define KVM_INSTANCE_PAUSED	"paused"
+#define KVM_INSTANCE_SHUTOFF	"shut off"
+
+void kvm_list_help(void)
+{
+	usage_with_options(list_usage, list_options);
+}
+
+/* Ask a running instance for its pid over the IPC socket. */
+static pid_t get_pid(int sock)
+{
+	pid_t pid;
+	int r;
+
+	r = kvm_ipc__send(sock, KVM_IPC_PID);
+	if (r < 0)
+		return r;
+
+	r = read(sock, &pid, sizeof(pid));
+	if (r < 0)
+		return r;
+
+	return pid;
+}
+
+/*
+ * Ask a running instance for its vm state (running/paused).  Exported
+ * (non-static) because the pause and resume builtins reuse it.
+ */
+int get_vmstate(int sock)
+{
+	int vmstate;
+	int r;
+
+	r = kvm_ipc__send(sock, KVM_IPC_VMSTATE);
+	if (r < 0)
+		return r;
+
+	r = read(sock, &vmstate, sizeof(vmstate));
+	if (r < 0)
+		return r;
+
+	return vmstate;
+
+}
+
+/* Print one "<pid> <name> <state>" row for a live instance. */
+static int print_guest(const char *name, int sock)
+{
+	pid_t pid;
+	int vmstate;
+
+	pid = get_pid(sock);
+	vmstate = get_vmstate(sock);
+
+	if ((int)pid < 0 || vmstate < 0)
+		return -1;
+
+	if (vmstate == KVM_VMSTATE_PAUSED)
+		printf("%5d %-20s %s\n", pid, name, KVM_INSTANCE_PAUSED);
+	else
+		printf("%5d %-20s %s\n", pid, name, KVM_INSTANCE_RUNNING);
+
+	return 0;
+}
+
+static int kvm_list_running_instances(void)
+{
+	return kvm__enumerate_instances(print_guest);
+}
+
+/*
+ * List shut-off instances: every subdirectory of the lkvm state dir
+ * except "." and "..".  Returns 0 on success, -1 if the directory
+ * cannot be opened.  The DIR handle is now closed before returning;
+ * the original leaked it on every successful call.
+ */
+static int kvm_list_rootfs(void)
+{
+	DIR *dir;
+	struct dirent *dirent;
+
+	dir = opendir(kvm__get_dir());
+	if (dir == NULL)
+		return -1;
+
+	while ((dirent = readdir(dir))) {
+		if (dirent->d_type == DT_DIR &&
+			strcmp(dirent->d_name, ".") &&
+			strcmp(dirent->d_name, ".."))
+			printf("%5s %-20s %s\n", "", dirent->d_name, KVM_INSTANCE_SHUTOFF);
+	}
+
+	closedir(dir);
+
+	return 0;
+}
+
+static void parse_setup_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, list_options, list_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_list_help();
+	}
+}
+
+/*
+ * "lkvm list": print running and/or shut-off instances.  With no
+ * flags, both categories are shown.
+ */
+int kvm_cmd_list(int argc, const char **argv, const char *prefix)
+{
+	int status, r;
+
+	parse_setup_options(argc, argv);
+
+	if (!run && !rootfs)
+		run = rootfs = true;
+
+	printf("%6s %-20s %s\n", "PID", "NAME", "STATE");
+	printf("------------------------------------\n");
+
+	status = 0;
+
+	if (run) {
+		r = kvm_list_running_instances();
+		if (r < 0)
+			perror("Error listing instances");
+
+		status |= r;
+	}
+
+	if (rootfs) {
+		r = kvm_list_rootfs();
+		if (r < 0)
+			perror("Error listing rootfs");
+
+		status |= r;
+	}
+
+	return status;
+}
diff --git a/tools/kvm/builtin-pause.c b/tools/kvm/builtin-pause.c
new file mode 100644
index 0000000..c08595a
--- /dev/null
+++ b/tools/kvm/builtin-pause.c
@@ -0,0 +1,88 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-pause.h>
+#include <kvm/builtin-list.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static const char *instance_name;
+
+static const char * const pause_usage[] = {
+	"lkvm pause [--all] [-n name]",
+	NULL
+};
+
+static const struct option pause_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('a', "all", &all, "Pause all instances"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_END()
+};
+
+static void parse_pause_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, pause_options, pause_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_pause_help();
+	}
+}
+
+void kvm_pause_help(void)
+{
+	usage_with_options(pause_usage, pause_options);
+}
+
+/*
+ * Pause one instance over its IPC socket.  Already-paused guests are
+ * reported and left alone (get_vmstate comes from builtin-list).
+ */
+static int do_pause(const char *name, int sock)
+{
+	int r;
+	int vmstate;
+
+	vmstate = get_vmstate(sock);
+	if (vmstate < 0)
+		return vmstate;
+	if (vmstate == KVM_VMSTATE_PAUSED) {
+		printf("Guest %s is already paused.\n", name);
+		return 0;
+	}
+
+	r = kvm_ipc__send(sock, KVM_IPC_PAUSE);
+	if (r)
+		return r;
+
+	printf("Guest %s paused\n", name);
+
+	return 0;
+}
+
+/* "lkvm pause": pause a named instance, or every instance with --all. */
+int kvm_cmd_pause(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+
+	parse_pause_options(argc, argv);
+
+	if (all)
+		return kvm__enumerate_instances(do_pause);
+
+	if (instance_name == NULL)
+		kvm_pause_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	r = do_pause(instance_name, instance);
+
+	close(instance);
+
+	return r;
+}
diff --git a/tools/kvm/builtin-resume.c b/tools/kvm/builtin-resume.c
new file mode 100644
index 0000000..0e954b4
--- /dev/null
+++ b/tools/kvm/builtin-resume.c
@@ -0,0 +1,88 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-resume.h>
+#include <kvm/builtin-list.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static const char *instance_name;
+
+static const char * const resume_usage[] = {
+	"lkvm resume [--all] [-n name]",
+	NULL
+};
+
+static const struct option resume_options[] = {
+	OPT_GROUP("General options:"),
+	OPT_BOOLEAN('a', "all", &all, "Resume all instances"),
+	OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+	OPT_END()
+};
+
+static void parse_resume_options(int argc, const char **argv)
+{
+	while (argc != 0) {
+		argc = parse_options(argc, argv, resume_options, resume_usage,
+				PARSE_OPT_STOP_AT_NON_OPTION);
+		if (argc != 0)
+			kvm_resume_help();
+	}
+}
+
+void kvm_resume_help(void)
+{
+	usage_with_options(resume_usage, resume_options);
+}
+
+/*
+ * Resume one instance over its IPC socket.  Guests that are already
+ * running are reported and left alone (get_vmstate from builtin-list).
+ */
+static int do_resume(const char *name, int sock)
+{
+	int r;
+	int vmstate;
+
+	vmstate = get_vmstate(sock);
+	if (vmstate < 0)
+		return vmstate;
+	if (vmstate == KVM_VMSTATE_RUNNING) {
+		printf("Guest %s is still running.\n", name);
+		return 0;
+	}
+
+	r = kvm_ipc__send(sock, KVM_IPC_RESUME);
+	if (r)
+		return r;
+
+	printf("Guest %s resumed\n", name);
+
+	return 0;
+}
+
+/* "lkvm resume": resume a named instance, or every instance with --all. */
+int kvm_cmd_resume(int argc, const char **argv, const char *prefix)
+{
+	int instance;
+	int r;
+
+	parse_resume_options(argc, argv);
+
+	if (all)
+		return kvm__enumerate_instances(do_resume);
+
+	if (instance_name == NULL)
+		kvm_resume_help();
+
+	instance = kvm__get_sock_by_instance(instance_name);
+
+	if (instance <= 0)
+		die("Failed locating instance");
+
+	r = do_resume(instance_name, instance);
+
+	close(instance);
+
+	return r;
+}
diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c
new file mode 100644
index 0000000..1ee75ad
--- /dev/null
+++ b/tools/kvm/builtin-run.c
@@ -0,0 +1,691 @@
+#include "kvm/builtin-run.h"
+
+#include "kvm/builtin-setup.h"
+#include "kvm/virtio-balloon.h"
+#include "kvm/virtio-console.h"
+#include "kvm/parse-options.h"
+#include "kvm/8250-serial.h"
+#include "kvm/framebuffer.h"
+#include "kvm/disk-image.h"
+#include "kvm/threadpool.h"
+#include "kvm/virtio-scsi.h"
+#include "kvm/virtio-blk.h"
+#include "kvm/virtio-net.h"
+#include "kvm/virtio-rng.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/virtio-9p.h"
+#include "kvm/barrier.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/ioport.h"
+#include "kvm/symbol.h"
+#include "kvm/i8042.h"
+#include "kvm/mutex.h"
+#include "kvm/term.h"
+#include "kvm/util.h"
+#include "kvm/strbuf.h"
+#include "kvm/vesa.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/rtc.h"
+#include "kvm/sdl.h"
+#include "kvm/vnc.h"
+#include "kvm/guest_compat.h"
+#include "kvm/pci-shmem.h"
+#include "kvm/kvm-ipc.h"
+#include "kvm/builtin-debug.h"
+
+#include <linux/types.h>
+#include <linux/err.h>
+
+#include <sys/utsname.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <termios.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <stdio.h>
+
+#define MB_SHIFT (20)
+#define KB_SHIFT (10)
+#define GB_SHIFT (30)
+
+__thread struct kvm_cpu *current_kvm_cpu;
+
+static int kvm_run_wrapper;
+
+bool do_debug_print = false;
+
+extern char _binary_guest_init_start;
+extern char _binary_guest_init_size;
+
+static const char * const run_usage[] = {
+ "lkvm run [<options>] [<kernel image>]",
+ NULL
+};
+
+enum {
+ KVM_RUN_DEFAULT,
+ KVM_RUN_SANDBOX,
+};
+
+static int img_name_parser(const struct option *opt, const char *arg, int unset)
+{
+ char path[PATH_MAX];
+ struct stat st;
+
+ snprintf(path, PATH_MAX, "%s%s", kvm__get_dir(), arg);
+
+ if ((stat(arg, &st) == 0 && S_ISDIR(st.st_mode)) ||
+ (stat(path, &st) == 0 && S_ISDIR(st.st_mode)))
+ return virtio_9p_img_name_parser(opt, arg, unset);
+ return disk_img_name_parser(opt, arg, unset);
+}
+
+void kvm_run_set_wrapper_sandbox(void)
+{
+ kvm_run_wrapper = KVM_RUN_SANDBOX;
+}
+
+#ifndef OPT_ARCH_RUN
+#define OPT_ARCH_RUN(...)
+#endif
+
+#define BUILD_OPTIONS(name, cfg, kvm) \
+ struct option name[] = { \
+ OPT_GROUP("Basic options:"), \
+ OPT_STRING('\0', "name", &(cfg)->guest_name, "guest name", \
+ "A name for the guest"), \
+ OPT_INTEGER('c', "cpus", &(cfg)->nrcpus, "Number of CPUs"), \
+ OPT_U64('m', "mem", &(cfg)->ram_size, "Virtual machine memory" \
+ " size in MiB."), \
+ OPT_CALLBACK('\0', "shmem", NULL, \
+ "[pci:]<addr>:<size>[:handle=<handle>][:create]", \
+ "Share host shmem with guest via pci device", \
+ shmem_parser, NULL), \
+ OPT_CALLBACK('d', "disk", kvm, "image or rootfs_dir", "Disk " \
+ " image or rootfs directory", img_name_parser, \
+ kvm), \
+ OPT_BOOLEAN('\0', "balloon", &(cfg)->balloon, "Enable virtio" \
+ " balloon"), \
+ OPT_BOOLEAN('\0', "vnc", &(cfg)->vnc, "Enable VNC framebuffer"),\
+ OPT_BOOLEAN('\0', "gtk", &(cfg)->gtk, "Enable GTK framebuffer"),\
+ OPT_BOOLEAN('\0', "sdl", &(cfg)->sdl, "Enable SDL framebuffer"),\
+ OPT_BOOLEAN('\0', "rng", &(cfg)->virtio_rng, "Enable virtio" \
+ " Random Number Generator"), \
+ OPT_CALLBACK('\0', "9p", NULL, "dir_to_share,tag_name", \
+ "Enable virtio 9p to share files between host and" \
+ " guest", virtio_9p_rootdir_parser, kvm), \
+ OPT_STRING('\0', "console", &(cfg)->console, "serial, virtio or"\
+ " hv", "Console to use"), \
+ OPT_STRING('\0', "dev", &(cfg)->dev, "device_file", \
+ "KVM device file"), \
+ OPT_CALLBACK('\0', "tty", NULL, "tty id", \
+ "Remap guest TTY into a pty on the host", \
+ tty_parser, NULL), \
+ OPT_STRING('\0', "sandbox", &(cfg)->sandbox, "script", \
+ "Run this script when booting into custom" \
+ " rootfs"), \
+ OPT_STRING('\0', "hugetlbfs", &(cfg)->hugetlbfs_path, "path", \
+ "Hugetlbfs path"), \
+ \
+ OPT_GROUP("Kernel options:"), \
+ OPT_STRING('k', "kernel", &(cfg)->kernel_filename, "kernel", \
+ "Kernel to boot in virtual machine"), \
+ OPT_STRING('i', "initrd", &(cfg)->initrd_filename, "initrd", \
+ "Initial RAM disk image"), \
+ OPT_STRING('p', "params", &(cfg)->kernel_cmdline, "params", \
+ "Kernel command line arguments"), \
+ OPT_STRING('f', "firmware", &(cfg)->firmware_filename, "firmware",\
+ "Firmware image to boot in virtual machine"), \
+ \
+ OPT_GROUP("Networking options:"), \
+ OPT_CALLBACK_DEFAULT('n', "network", NULL, "network params", \
+ "Create a new guest NIC", \
+ netdev_parser, NULL, kvm), \
+ OPT_BOOLEAN('\0', "no-dhcp", &(cfg)->no_dhcp, "Disable kernel" \
+ " DHCP in rootfs mode"), \
+ \
+ OPT_GROUP("Debug options:"), \
+ OPT_BOOLEAN('\0', "debug", &do_debug_print, \
+ "Enable debug messages"), \
+ OPT_BOOLEAN('\0', "debug-single-step", &(cfg)->single_step, \
+ "Enable single stepping"), \
+ OPT_BOOLEAN('\0', "debug-ioport", &(cfg)->ioport_debug, \
+ "Enable ioport debugging"), \
+ OPT_BOOLEAN('\0', "debug-mmio", &(cfg)->mmio_debug, \
+ "Enable MMIO debugging"), \
+ OPT_INTEGER('\0', "debug-iodelay", &(cfg)->debug_iodelay, \
+ "Delay IO by millisecond"), \
+ \
+ OPT_ARCH(RUN, cfg) \
+ OPT_END() \
+ };
+
+static void *kvm_cpu_thread(void *arg)
+{
+ char name[16];
+
+ current_kvm_cpu = arg;
+
+ sprintf(name, "kvm-vcpu-%lu", current_kvm_cpu->cpu_id);
+ kvm__set_thread_name(name);
+
+ if (kvm_cpu__start(current_kvm_cpu))
+ goto panic_kvm;
+
+ return (void *) (intptr_t) 0;
+
+panic_kvm:
+ fprintf(stderr, "KVM exit reason: %u (\"%s\")\n",
+ current_kvm_cpu->kvm_run->exit_reason,
+ kvm_exit_reasons[current_kvm_cpu->kvm_run->exit_reason]);
+ if (current_kvm_cpu->kvm_run->exit_reason == KVM_EXIT_UNKNOWN)
+ fprintf(stderr, "KVM exit code: 0x%llu\n",
+ (unsigned long long)current_kvm_cpu->kvm_run->hw.hardware_exit_reason);
+
+ kvm_cpu__set_debug_fd(STDOUT_FILENO);
+ kvm_cpu__show_registers(current_kvm_cpu);
+ kvm_cpu__show_code(current_kvm_cpu);
+ kvm_cpu__show_page_tables(current_kvm_cpu);
+
+ return (void *) (intptr_t) 1;
+}
+
+static char kernel[PATH_MAX];
+
+static const char *host_kernels[] = {
+ "/boot/vmlinuz",
+ "/boot/bzImage",
+ NULL
+};
+
+static const char *default_kernels[] = {
+ "./bzImage",
+ "arch/" BUILD_ARCH "/boot/bzImage",
+ "../../arch/" BUILD_ARCH "/boot/bzImage",
+ NULL
+};
+
+static const char *default_vmlinux[] = {
+ "vmlinux",
+ "../../../vmlinux",
+ "../../vmlinux",
+ NULL
+};
+
+static void kernel_usage_with_options(void)
+{
+ const char **k;
+ struct utsname uts;
+
+ fprintf(stderr, "Fatal: could not find default kernel image in:\n");
+ k = &default_kernels[0];
+ while (*k) {
+ fprintf(stderr, "\t%s\n", *k);
+ k++;
+ }
+
+ if (uname(&uts) < 0)
+ return;
+
+ k = &host_kernels[0];
+ while (*k) {
+ if (snprintf(kernel, PATH_MAX, "%s-%s", *k, uts.release) < 0)
+ return;
+ fprintf(stderr, "\t%s\n", kernel);
+ k++;
+ }
+ fprintf(stderr, "\nPlease see '%s run --help' for more options.\n\n",
+ KVM_BINARY_NAME);
+}
+
+static u64 host_ram_size(void)
+{
+ long page_size;
+ long nr_pages;
+
+ nr_pages = sysconf(_SC_PHYS_PAGES);
+ if (nr_pages < 0) {
+ pr_warning("sysconf(_SC_PHYS_PAGES) failed");
+ return 0;
+ }
+
+ page_size = sysconf(_SC_PAGE_SIZE);
+ if (page_size < 0) {
+ pr_warning("sysconf(_SC_PAGE_SIZE) failed");
+ return 0;
+ }
+
+ return (nr_pages * page_size) >> MB_SHIFT;
+}
+
+/*
+ * If the user didn't specify how much memory they want to allocate for the guest,
+ * avoid filling the whole host RAM.
+ */
+#define RAM_SIZE_RATIO 0.8
+
+static u64 get_ram_size(int nr_cpus)
+{
+ u64 available;
+ u64 ram_size;
+
+ ram_size = 64 * (nr_cpus + 3);
+
+ available = host_ram_size() * RAM_SIZE_RATIO;
+ if (!available)
+ available = MIN_RAM_SIZE_MB;
+
+ if (ram_size > available)
+ ram_size = available;
+
+ return ram_size;
+}
+
+static const char *find_kernel(void)
+{
+ const char **k;
+ struct stat st;
+ struct utsname uts;
+
+ k = &default_kernels[0];
+ while (*k) {
+ if (stat(*k, &st) < 0 || !S_ISREG(st.st_mode)) {
+ k++;
+ continue;
+ }
+ strncpy(kernel, *k, PATH_MAX);
+ return kernel;
+ }
+
+ if (uname(&uts) < 0)
+ return NULL;
+
+ k = &host_kernels[0];
+ while (*k) {
+ if (snprintf(kernel, PATH_MAX, "%s-%s", *k, uts.release) < 0)
+ return NULL;
+
+ if (stat(kernel, &st) < 0 || !S_ISREG(st.st_mode)) {
+ k++;
+ continue;
+ }
+ return kernel;
+
+ }
+ return NULL;
+}
+
+static const char *find_vmlinux(void)
+{
+ const char **vmlinux;
+
+ vmlinux = &default_vmlinux[0];
+ while (*vmlinux) {
+ struct stat st;
+
+ if (stat(*vmlinux, &st) < 0 || !S_ISREG(st.st_mode)) {
+ vmlinux++;
+ continue;
+ }
+ return *vmlinux;
+ }
+ return NULL;
+}
+
+void kvm_run_help(void)
+{
+ struct kvm *kvm = NULL;
+
+ BUILD_OPTIONS(options, &kvm->cfg, kvm);
+ usage_with_options(run_usage, options);
+}
+
+static int kvm_setup_guest_init(struct kvm *kvm)
+{
+ const char *rootfs = kvm->cfg.custom_rootfs_name;
+ char tmp[PATH_MAX];
+ size_t size;
+ int fd, ret;
+ char *data;
+
+ /* Setup /virt/init */
+ size = (size_t)&_binary_guest_init_size;
+ data = (char *)&_binary_guest_init_start;
+ snprintf(tmp, PATH_MAX, "%s%s/virt/init", kvm__get_dir(), rootfs);
+ remove(tmp);
+ fd = open(tmp, O_CREAT | O_WRONLY, 0755);
+ if (fd < 0)
+ die("Fail to setup %s", tmp);
+ ret = xwrite(fd, data, size);
+ if (ret < 0)
+ die("Fail to setup %s", tmp);
+ close(fd);
+
+ return 0;
+}
+
+static int kvm_run_set_sandbox(struct kvm *kvm)
+{
+ const char *guestfs_name = kvm->cfg.custom_rootfs_name;
+ char path[PATH_MAX], script[PATH_MAX], *tmp;
+
+ snprintf(path, PATH_MAX, "%s%s/virt/sandbox.sh", kvm__get_dir(), guestfs_name);
+
+ remove(path);
+
+ if (kvm->cfg.sandbox == NULL)
+ return 0;
+
+ tmp = realpath(kvm->cfg.sandbox, NULL);
+ if (tmp == NULL)
+ return -ENOMEM;
+
+ snprintf(script, PATH_MAX, "/host/%s", tmp);
+ free(tmp);
+
+ return symlink(script, path);
+}
+
+static void kvm_write_sandbox_cmd_exactly(int fd, const char *arg)
+{
+ const char *single_quote;
+
+ if (!*arg) { /* zero length string */
+ if (write(fd, "''", 2) <= 0)
+ die("Failed writing sandbox script");
+ return;
+ }
+
+ while (*arg) {
+ single_quote = strchrnul(arg, '\'');
+
+ /* write non-single-quote string as #('string') */
+ if (arg != single_quote) {
+ if (write(fd, "'", 1) <= 0 ||
+ write(fd, arg, single_quote - arg) <= 0 ||
+ write(fd, "'", 1) <= 0)
+ die("Failed writing sandbox script");
+ }
+
+ /* write single quote as #("'") */
+ if (*single_quote) {
+ if (write(fd, "\"'\"", 3) <= 0)
+ die("Failed writing sandbox script");
+ } else
+ break;
+
+ arg = single_quote + 1;
+ }
+}
+
+static void resolve_program(const char *src, char *dst, size_t len)
+{
+ struct stat st;
+ int err;
+
+ err = stat(src, &st);
+
+ if (!err && S_ISREG(st.st_mode)) {
+ char resolved_path[PATH_MAX];
+
+ if (!realpath(src, resolved_path))
+ die("Unable to resolve program %s: %s\n", src, strerror(errno));
+
+ snprintf(dst, len, "/host%s", resolved_path);
+ } else
+ strncpy(dst, src, len);
+}
+
+static void kvm_run_write_sandbox_cmd(struct kvm *kvm, const char **argv, int argc)
+{
+ const char script_hdr[] = "#! /bin/bash\n\n";
+ char program[PATH_MAX];
+ int fd;
+
+ remove(kvm->cfg.sandbox);
+
+ fd = open(kvm->cfg.sandbox, O_RDWR | O_CREAT, 0777);
+ if (fd < 0)
+ die("Failed creating sandbox script");
+
+ if (write(fd, script_hdr, sizeof(script_hdr) - 1) <= 0)
+ die("Failed writing sandbox script");
+
+ resolve_program(argv[0], program, PATH_MAX);
+ kvm_write_sandbox_cmd_exactly(fd, program);
+
+ argv++;
+ argc--;
+
+ while (argc) {
+ if (write(fd, " ", 1) <= 0)
+ die("Failed writing sandbox script");
+
+ kvm_write_sandbox_cmd_exactly(fd, argv[0]);
+ argv++;
+ argc--;
+ }
+ if (write(fd, "\n", 1) <= 0)
+ die("Failed writing sandbox script");
+
+ close(fd);
+}
+
+static struct kvm *kvm_cmd_run_init(int argc, const char **argv)
+{
+ static char real_cmdline[2048], default_name[20];
+ unsigned int nr_online_cpus;
+ struct kvm *kvm = kvm__new();
+
+ if (IS_ERR(kvm))
+ return kvm;
+
+ nr_online_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+ kvm->cfg.custom_rootfs_name = "default";
+
+ while (argc != 0) {
+ BUILD_OPTIONS(options, &kvm->cfg, kvm);
+ argc = parse_options(argc, argv, options, run_usage,
+ PARSE_OPT_STOP_AT_NON_OPTION |
+ PARSE_OPT_KEEP_DASHDASH);
+ if (argc != 0) {
+ /* Custom options, should have been handled elsewhere */
+ if (strcmp(argv[0], "--") == 0) {
+ if (kvm_run_wrapper == KVM_RUN_SANDBOX) {
+ kvm->cfg.sandbox = DEFAULT_SANDBOX_FILENAME;
+ kvm_run_write_sandbox_cmd(kvm, argv+1, argc-1);
+ break;
+ }
+ }
+
+ if ((kvm_run_wrapper == KVM_RUN_DEFAULT && kvm->cfg.kernel_filename) ||
+ (kvm_run_wrapper == KVM_RUN_SANDBOX && kvm->cfg.sandbox)) {
+ fprintf(stderr, "Cannot handle parameter: "
+ "%s\n", argv[0]);
+ usage_with_options(run_usage, options);
+ free(kvm);
+ return ERR_PTR(-EINVAL);
+ }
+ if (kvm_run_wrapper == KVM_RUN_SANDBOX) {
+ /*
+ * first unhandled parameter is treated as
+ * sandbox command
+ */
+ kvm->cfg.sandbox = DEFAULT_SANDBOX_FILENAME;
+ kvm_run_write_sandbox_cmd(kvm, argv, argc);
+ } else {
+ /*
+ * first unhandled parameter is treated as a kernel
+ * image
+ */
+ kvm->cfg.kernel_filename = argv[0];
+ }
+ argv++;
+ argc--;
+ }
+
+ }
+
+ kvm->nr_disks = kvm->cfg.image_count;
+
+ if (!kvm->cfg.kernel_filename)
+ kvm->cfg.kernel_filename = find_kernel();
+
+ if (!kvm->cfg.kernel_filename) {
+ kernel_usage_with_options();
+ return ERR_PTR(-EINVAL);
+ }
+
+ kvm->cfg.vmlinux_filename = find_vmlinux();
+ kvm->vmlinux = kvm->cfg.vmlinux_filename;
+
+ if (kvm->cfg.nrcpus == 0)
+ kvm->cfg.nrcpus = nr_online_cpus;
+
+ if (!kvm->cfg.ram_size)
+ kvm->cfg.ram_size = get_ram_size(kvm->cfg.nrcpus);
+
+ if (kvm->cfg.ram_size > host_ram_size())
+ pr_warning("Guest memory size %lluMB exceeds host physical RAM size %lluMB",
+ (unsigned long long)kvm->cfg.ram_size,
+ (unsigned long long)host_ram_size());
+
+ kvm->cfg.ram_size <<= MB_SHIFT;
+
+ if (!kvm->cfg.dev)
+ kvm->cfg.dev = DEFAULT_KVM_DEV;
+
+ if (!kvm->cfg.console)
+ kvm->cfg.console = DEFAULT_CONSOLE;
+
+ if (!strncmp(kvm->cfg.console, "virtio", 6))
+ kvm->cfg.active_console = CONSOLE_VIRTIO;
+ else if (!strncmp(kvm->cfg.console, "serial", 6))
+ kvm->cfg.active_console = CONSOLE_8250;
+ else if (!strncmp(kvm->cfg.console, "hv", 2))
+ kvm->cfg.active_console = CONSOLE_HV;
+ else
+ pr_warning("No console!");
+
+ if (!kvm->cfg.host_ip)
+ kvm->cfg.host_ip = DEFAULT_HOST_ADDR;
+
+ if (!kvm->cfg.guest_ip)
+ kvm->cfg.guest_ip = DEFAULT_GUEST_ADDR;
+
+ if (!kvm->cfg.guest_mac)
+ kvm->cfg.guest_mac = DEFAULT_GUEST_MAC;
+
+ if (!kvm->cfg.host_mac)
+ kvm->cfg.host_mac = DEFAULT_HOST_MAC;
+
+ if (!kvm->cfg.script)
+ kvm->cfg.script = DEFAULT_SCRIPT;
+
+ if (!kvm->cfg.network)
+ kvm->cfg.network = DEFAULT_NETWORK;
+
+ memset(real_cmdline, 0, sizeof(real_cmdline));
+ kvm__arch_set_cmdline(real_cmdline, kvm->cfg.vnc || kvm->cfg.sdl || kvm->cfg.gtk);
+
+ if (strlen(real_cmdline) > 0)
+ strcat(real_cmdline, " ");
+
+ if (kvm->cfg.kernel_cmdline)
+ strlcat(real_cmdline, kvm->cfg.kernel_cmdline, sizeof(real_cmdline));
+
+ if (!kvm->cfg.guest_name) {
+ if (kvm->cfg.custom_rootfs) {
+ kvm->cfg.guest_name = kvm->cfg.custom_rootfs_name;
+ } else {
+ sprintf(default_name, "guest-%u", getpid());
+ kvm->cfg.guest_name = default_name;
+ }
+ }
+
+ if (!kvm->cfg.using_rootfs && !kvm->cfg.disk_image[0].filename && !kvm->cfg.initrd_filename) {
+ char tmp[PATH_MAX];
+
+ kvm_setup_create_new(kvm->cfg.custom_rootfs_name);
+ kvm_setup_resolv(kvm->cfg.custom_rootfs_name);
+
+ snprintf(tmp, PATH_MAX, "%s%s", kvm__get_dir(), "default");
+ if (virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+ die("Unable to initialize virtio 9p");
+ if (virtio_9p__register(kvm, "/", "hostfs") < 0)
+ die("Unable to initialize virtio 9p");
+ kvm->cfg.using_rootfs = kvm->cfg.custom_rootfs = 1;
+ }
+
+ if (kvm->cfg.using_rootfs) {
+ strcat(real_cmdline, " root=/dev/root rw rootflags=rw,trans=virtio,version=9p2000.L rootfstype=9p");
+ if (kvm->cfg.custom_rootfs) {
+ kvm_run_set_sandbox(kvm);
+
+ strcat(real_cmdline, " init=/virt/init");
+
+ if (!kvm->cfg.no_dhcp)
+ strcat(real_cmdline, " ip=dhcp");
+ if (kvm_setup_guest_init(kvm))
+ die("Failed to setup init for guest.");
+ }
+ } else if (!strstr(real_cmdline, "root=")) {
+ strlcat(real_cmdline, " root=/dev/vda rw ", sizeof(real_cmdline));
+ }
+
+ kvm->cfg.real_cmdline = real_cmdline;
+
+ printf(" # %s run -k %s -m %Lu -c %d --name %s\n", KVM_BINARY_NAME,
+ kvm->cfg.kernel_filename,
+ (unsigned long long)kvm->cfg.ram_size / 1024 / 1024,
+ kvm->cfg.nrcpus, kvm->cfg.guest_name);
+
+ if (init_list__init(kvm) < 0)
+ die ("Initialisation failed");
+
+ return kvm;
+}
+
+static int kvm_cmd_run_work(struct kvm *kvm)
+{
+ int i;
+ void *ret = NULL;
+
+ for (i = 0; i < kvm->nrcpus; i++) {
+ if (pthread_create(&kvm->cpus[i]->thread, NULL, kvm_cpu_thread, kvm->cpus[i]) != 0)
+ die("unable to create KVM VCPU thread");
+ }
+
+ /* Only VCPU #0 is going to exit by itself when shutting down */
+ return pthread_join(kvm->cpus[0]->thread, &ret);
+}
+
+static void kvm_cmd_run_exit(struct kvm *kvm, int guest_ret)
+{
+ compat__print_all_messages();
+
+ init_list__exit(kvm);
+
+ if (guest_ret == 0)
+ printf("\n # KVM session ended normally.\n");
+}
+
+int kvm_cmd_run(int argc, const char **argv, const char *prefix)
+{
+ int ret = -EFAULT;
+ struct kvm *kvm;
+
+ kvm = kvm_cmd_run_init(argc, argv);
+ if (IS_ERR(kvm))
+ return PTR_ERR(kvm);
+
+ ret = kvm_cmd_run_work(kvm);
+ kvm_cmd_run_exit(kvm, ret);
+
+ return ret;
+}
diff --git a/tools/kvm/builtin-sandbox.c b/tools/kvm/builtin-sandbox.c
new file mode 100644
index 0000000..433f536
--- /dev/null
+++ b/tools/kvm/builtin-sandbox.c
@@ -0,0 +1,9 @@
+#include "kvm/builtin-sandbox.h"
+#include "kvm/builtin-run.h"
+
+int kvm_cmd_sandbox(int argc, const char **argv, const char *prefix)
+{
+ kvm_run_set_wrapper_sandbox();
+
+ return kvm_cmd_run(argc, argv, prefix);
+}
diff --git a/tools/kvm/builtin-setup.c b/tools/kvm/builtin-setup.c
new file mode 100644
index 0000000..8b45c56
--- /dev/null
+++ b/tools/kvm/builtin-setup.c
@@ -0,0 +1,258 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-setup.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/read-write.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+extern char _binary_guest_init_start;
+extern char _binary_guest_init_size;
+
+static const char *instance_name;
+
+static const char * const setup_usage[] = {
+ "lkvm setup [name]",
+ NULL
+};
+
+static const struct option setup_options[] = {
+ OPT_END()
+};
+
+static void parse_setup_options(int argc, const char **argv)
+{
+ while (argc != 0) {
+ argc = parse_options(argc, argv, setup_options, setup_usage,
+ PARSE_OPT_STOP_AT_NON_OPTION);
+ if (argc != 0 && instance_name)
+ kvm_setup_help();
+ else
+ instance_name = argv[0];
+ argv++;
+ argc--;
+ }
+}
+
+void kvm_setup_help(void)
+{
+ printf("\n%s setup creates a new rootfs under %s.\n"
+ "This can be used later by the '-d' parameter of '%s run'.\n",
+ KVM_BINARY_NAME, kvm__get_dir(), KVM_BINARY_NAME);
+ usage_with_options(setup_usage, setup_options);
+}
+
+static int copy_file(const char *from, const char *to)
+{
+ int in_fd, out_fd;
+ void *src, *dst;
+ struct stat st;
+ int err = -1;
+
+ in_fd = open(from, O_RDONLY);
+ if (in_fd < 0)
+ return err;
+
+ if (fstat(in_fd, &st) < 0)
+ goto error_close_in;
+
+ out_fd = open(to, O_RDWR | O_CREAT | O_TRUNC, st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO));
+ if (out_fd < 0)
+ goto error_close_in;
+
+ src = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, in_fd, 0);
+ if (src == MAP_FAILED)
+ goto error_close_out;
+
+ if (ftruncate(out_fd, st.st_size) < 0)
+ goto error_munmap_src;
+
+ dst = mmap(NULL, st.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, out_fd, 0);
+ if (dst == MAP_FAILED)
+ goto error_munmap_src;
+
+ memcpy(dst, src, st.st_size);
+
+ if (fsync(out_fd) < 0)
+ goto error_munmap_dst;
+
+ err = 0;
+
+error_munmap_dst:
+ munmap(dst, st.st_size);
+error_munmap_src:
+ munmap(src, st.st_size);
+error_close_out:
+ close(out_fd);
+error_close_in:
+ close(in_fd);
+
+ return err;
+}
+
+static const char *guestfs_dirs[] = {
+ "/dev",
+ "/etc",
+ "/home",
+ "/host",
+ "/proc",
+ "/root",
+ "/sys",
+ "/tmp",
+ "/var",
+ "/var/lib",
+ "/virt",
+ "/virt/home",
+};
+
+static const char *guestfs_symlinks[] = {
+ "/bin",
+ "/lib",
+ "/lib64",
+ "/sbin",
+ "/usr",
+ "/etc/ld.so.conf",
+};
+
+static int copy_init(const char *guestfs_name)
+{
+ char path[PATH_MAX];
+ size_t size;
+ int fd, ret;
+ char *data;
+
+ size = (size_t)&_binary_guest_init_size;
+ data = (char *)&_binary_guest_init_start;
+ snprintf(path, PATH_MAX, "%s%s/virt/init", kvm__get_dir(), guestfs_name);
+ remove(path);
+ fd = open(path, O_CREAT | O_WRONLY, 0755);
+ if (fd < 0)
+ die("Fail to setup %s", path);
+ ret = xwrite(fd, data, size);
+ if (ret < 0)
+ die("Fail to setup %s", path);
+ close(fd);
+
+ return 0;
+}
+
+static int copy_passwd(const char *guestfs_name)
+{
+ char path[PATH_MAX];
+ FILE *file;
+ int ret;
+
+ snprintf(path, PATH_MAX, "%s%s/etc/passwd", kvm__get_dir(), guestfs_name);
+
+ file = fopen(path, "w");
+ if (!file)
+ return -1;
+
+ ret = fprintf(file, "root:x:0:0:root:/root:/bin/sh\n");
+ if (ret > 0)
+ ret = 0;
+
+ fclose(file);
+
+ return ret;
+}
+
+static int make_guestfs_symlink(const char *guestfs_name, const char *path)
+{
+ char target[PATH_MAX];
+ char name[PATH_MAX];
+
+ snprintf(name, PATH_MAX, "%s%s%s", kvm__get_dir(), guestfs_name, path);
+
+ snprintf(target, PATH_MAX, "/host%s", path);
+
+ return symlink(target, name);
+}
+
+static int make_dir(const char *dir)
+{
+ char name[PATH_MAX];
+
+ snprintf(name, PATH_MAX, "%s%s", kvm__get_dir(), dir);
+
+ return mkdir(name, 0777);
+}
+
+static void make_guestfs_dir(const char *guestfs_name, const char *dir)
+{
+ char name[PATH_MAX];
+
+ snprintf(name, PATH_MAX, "%s%s", guestfs_name, dir);
+
+ make_dir(name);
+}
+
+void kvm_setup_resolv(const char *guestfs_name)
+{
+ char path[PATH_MAX];
+
+ snprintf(path, PATH_MAX, "%s%s/etc/resolv.conf", kvm__get_dir(), guestfs_name);
+
+ copy_file("/etc/resolv.conf", path);
+}
+
+static int do_setup(const char *guestfs_name)
+{
+ unsigned int i;
+ int ret;
+
+ ret = make_dir(guestfs_name);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < ARRAY_SIZE(guestfs_dirs); i++)
+ make_guestfs_dir(guestfs_name, guestfs_dirs[i]);
+
+ for (i = 0; i < ARRAY_SIZE(guestfs_symlinks); i++) {
+ make_guestfs_symlink(guestfs_name, guestfs_symlinks[i]);
+ }
+
+ ret = copy_init(guestfs_name);
+ if (ret < 0)
+ return ret;
+
+ return copy_passwd(guestfs_name);
+}
+
+int kvm_setup_create_new(const char *guestfs_name)
+{
+ return do_setup(guestfs_name);
+}
+
+int kvm_cmd_setup(int argc, const char **argv, const char *prefix)
+{
+ int r;
+
+ parse_setup_options(argc, argv);
+
+ if (instance_name == NULL)
+ kvm_setup_help();
+
+ r = do_setup(instance_name);
+ if (r == 0)
+ printf("A new rootfs '%s' has been created in '%s%s'.\n\n"
+ "You can now start it by running the following command:\n\n"
+ " %s run -d %s\n",
+ instance_name, kvm__get_dir(), instance_name,
+ KVM_BINARY_NAME,instance_name);
+ else
+ printf("Unable to create rootfs in %s%s: %s\n",
+ kvm__get_dir(), instance_name, strerror(errno));
+
+ return r;
+}
diff --git a/tools/kvm/builtin-stat.c b/tools/kvm/builtin-stat.c
new file mode 100644
index 0000000..5d6407e
--- /dev/null
+++ b/tools/kvm/builtin-stat.c
@@ -0,0 +1,127 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-stat.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <sys/select.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+#include <linux/virtio_balloon.h>
+
+static bool mem;
+static bool all;
+static const char *instance_name;
+
+static const char * const stat_usage[] = {
+ "lkvm stat [command] [--all] [-n name]",
+ NULL
+};
+
+static const struct option stat_options[] = {
+ OPT_GROUP("Commands options:"),
+ OPT_BOOLEAN('m', "memory", &mem, "Display memory statistics"),
+ OPT_GROUP("Instance options:"),
+ OPT_BOOLEAN('a', "all", &all, "All instances"),
+ OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+ OPT_END()
+};
+
+static void parse_stat_options(int argc, const char **argv)
+{
+ while (argc != 0) {
+ argc = parse_options(argc, argv, stat_options, stat_usage,
+ PARSE_OPT_STOP_AT_NON_OPTION);
+ if (argc != 0)
+ kvm_stat_help();
+ }
+}
+
+void kvm_stat_help(void)
+{
+ usage_with_options(stat_usage, stat_options);
+}
+
+static int do_memstat(const char *name, int sock)
+{
+ struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
+ fd_set fdset;
+ struct timeval t = { .tv_sec = 1 };
+ int r;
+ u8 i;
+
+ FD_ZERO(&fdset);
+ FD_SET(sock, &fdset);
+ r = kvm_ipc__send(sock, KVM_IPC_STAT);
+ if (r < 0)
+ return r;
+
+ r = select(1, &fdset, NULL, NULL, &t);
+ if (r < 0) {
+ pr_err("Could not retrieve mem stats from %s", name);
+ return r;
+ }
+ r = read(sock, &stats, sizeof(stats));
+ if (r < 0)
+ return r;
+
+ printf("\n\n\t*** Guest memory statistics ***\n\n");
+ for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) {
+ switch (stats[i].tag) {
+ case VIRTIO_BALLOON_S_SWAP_IN:
+ printf("The amount of memory that has been swapped in (in bytes):");
+ break;
+ case VIRTIO_BALLOON_S_SWAP_OUT:
+ printf("The amount of memory that has been swapped out to disk (in bytes):");
+ break;
+ case VIRTIO_BALLOON_S_MAJFLT:
+ printf("The number of major page faults that have occurred:");
+ break;
+ case VIRTIO_BALLOON_S_MINFLT:
+ printf("The number of minor page faults that have occurred:");
+ break;
+ case VIRTIO_BALLOON_S_MEMFREE:
+ printf("The amount of memory not being used for any purpose (in bytes):");
+ break;
+ case VIRTIO_BALLOON_S_MEMTOT:
+ printf("The total amount of memory available (in bytes):");
+ break;
+ }
+ printf("%llu\n", (unsigned long long)stats[i].val);
+ }
+ printf("\n");
+
+ return 0;
+}
+
+int kvm_cmd_stat(int argc, const char **argv, const char *prefix)
+{
+ int instance;
+ int r = 0;
+
+ parse_stat_options(argc, argv);
+
+ if (!mem)
+ usage_with_options(stat_usage, stat_options);
+
+ if (mem && all)
+ return kvm__enumerate_instances(do_memstat);
+
+ if (instance_name == NULL)
+ kvm_stat_help();
+
+ instance = kvm__get_sock_by_instance(instance_name);
+
+ if (instance <= 0)
+ die("Failed locating instance");
+
+ if (mem)
+ r = do_memstat(instance_name, instance);
+
+ close(instance);
+
+ return r;
+}
diff --git a/tools/kvm/builtin-stop.c b/tools/kvm/builtin-stop.c
new file mode 100644
index 0000000..6067630
--- /dev/null
+++ b/tools/kvm/builtin-stop.c
@@ -0,0 +1,70 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-stop.h>
+#include <kvm/kvm.h>
+#include <kvm/parse-options.h>
+#include <kvm/kvm-ipc.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static bool all;
+static const char *instance_name;
+
+static const char * const stop_usage[] = {
+ "lkvm stop [--all] [-n name]",
+ NULL
+};
+
+static const struct option stop_options[] = {
+ OPT_GROUP("General options:"),
+ OPT_BOOLEAN('a', "all", &all, "Stop all instances"),
+ OPT_STRING('n', "name", &instance_name, "name", "Instance name"),
+ OPT_END()
+};
+
+static void parse_stop_options(int argc, const char **argv)
+{
+ while (argc != 0) {
+ argc = parse_options(argc, argv, stop_options, stop_usage,
+ PARSE_OPT_STOP_AT_NON_OPTION);
+ if (argc != 0)
+ kvm_stop_help();
+ }
+}
+
+void kvm_stop_help(void)
+{
+ usage_with_options(stop_usage, stop_options);
+}
+
+static int do_stop(const char *name, int sock)
+{
+ return kvm_ipc__send(sock, KVM_IPC_STOP);
+}
+
+int kvm_cmd_stop(int argc, const char **argv, const char *prefix)
+{
+ int instance;
+ int r;
+
+ parse_stop_options(argc, argv);
+
+ if (all)
+ return kvm__enumerate_instances(do_stop);
+
+ if (instance_name == NULL)
+ kvm_stop_help();
+
+ instance = kvm__get_sock_by_instance(instance_name);
+
+ if (instance <= 0)
+ die("Failed locating instance");
+
+ r = do_stop(instance_name, instance);
+
+ close(instance);
+
+ return r;
+}
diff --git a/tools/kvm/builtin-version.c b/tools/kvm/builtin-version.c
new file mode 100644
index 0000000..b8bb859
--- /dev/null
+++ b/tools/kvm/builtin-version.c
@@ -0,0 +1,15 @@
+#include <kvm/util.h>
+#include <kvm/kvm-cmd.h>
+#include <kvm/builtin-version.h>
+#include <kvm/kvm.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+int kvm_cmd_version(int argc, const char **argv, const char *prefix)
+{
+ printf("kvm tool %s\n", KVMTOOLS_VERSION);
+
+ return 0;
+}
diff --git a/tools/kvm/code16gcc.h b/tools/kvm/code16gcc.h
new file mode 100644
index 0000000..d93e480
--- /dev/null
+++ b/tools/kvm/code16gcc.h
@@ -0,0 +1,15 @@
+/*
+ * code16gcc.h
+ *
+ * This file is -include'd when compiling 16-bit C code.
+ * Note: this asm() needs to be emitted before gcc emits any code.
+ * Depending on gcc version, this requires -fno-unit-at-a-time or
+ * -fno-toplevel-reorder.
+ *
+ * Hopefully gcc will eventually have a real -m16 option so we can
+ * drop this hack long term.
+ */
+
+#ifndef __ASSEMBLY__
+asm(".code16gcc");
+#endif
diff --git a/tools/kvm/command-list.txt b/tools/kvm/command-list.txt
new file mode 100644
index 0000000..d93597d
--- /dev/null
+++ b/tools/kvm/command-list.txt
@@ -0,0 +1,15 @@
+#
+# List of known lkvm commands.
+# command name category [deprecated] [common]
+#
+lkvm-run mainporcelain common
+lkvm-setup mainporcelain common
+lkvm-pause common
+lkvm-resume common
+lkvm-version common
+lkvm-list common
+lkvm-debug common
+lkvm-balloon common
+lkvm-stop common
+lkvm-stat common
+lkvm-sandbox common
diff --git a/tools/kvm/config/feature-tests.mak b/tools/kvm/config/feature-tests.mak
new file mode 100644
index 0000000..927ac54
--- /dev/null
+++ b/tools/kvm/config/feature-tests.mak
@@ -0,0 +1,188 @@
+define SOURCE_HELLO
+#include <stdio.h>
+int main(void)
+{
+ return puts(\"hi\");
+}
+endef
+
+ifndef NO_DWARF
+define SOURCE_DWARF
+#include <dwarf.h>
+#include <elfutils/libdw.h>
+#include <elfutils/version.h>
+#ifndef _ELFUTILS_PREREQ
+#error
+#endif
+
+int main(void)
+{
+ Dwarf *dbg = dwarf_begin(0, DWARF_C_READ);
+ return (long)dbg;
+}
+endef
+endif
+
+define SOURCE_LIBELF
+#include <libelf.h>
+
+int main(void)
+{
+ Elf *elf = elf_begin(0, ELF_C_READ, 0);
+ return (long)elf;
+}
+endef
+
+define SOURCE_GLIBC
+#include <gnu/libc-version.h>
+
+int main(void)
+{
+ const char *version = gnu_get_libc_version();
+ return (long)version;
+}
+endef
+
+define SOURCE_ELF_MMAP
+#include <libelf.h>
+int main(void)
+{
+ Elf *elf = elf_begin(0, ELF_C_READ_MMAP, 0);
+ return (long)elf;
+}
+endef
+
+ifndef NO_NEWT
+define SOURCE_NEWT
+#include <newt.h>
+
+int main(void)
+{
+ newtInit();
+ newtCls();
+ return newtFinished();
+}
+endef
+endif
+
+ifndef NO_LIBPERL
+define SOURCE_PERL_EMBED
+#include <EXTERN.h>
+#include <perl.h>
+
+int main(void)
+{
+perl_alloc();
+return 0;
+}
+endef
+endif
+
+ifndef NO_LIBPYTHON
+define SOURCE_PYTHON_VERSION
+#include <Python.h>
+#if PY_VERSION_HEX >= 0x03000000
+ #error
+#endif
+int main(void){}
+endef
+define SOURCE_PYTHON_EMBED
+#include <Python.h>
+int main(void)
+{
+ Py_Initialize();
+ return 0;
+}
+endef
+endif
+
+define SOURCE_BFD
+#include <bfd.h>
+
+int main(void)
+{
+ bfd_demangle(0, 0, 0);
+ return 0;
+}
+endef
+
+define SOURCE_CPLUS_DEMANGLE
+extern char *cplus_demangle(const char *, int);
+
+int main(void)
+{
+ cplus_demangle(0, 0);
+ return 0;
+}
+endef
+
+define SOURCE_STRLCPY
+#include <stdlib.h>
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+int main(void)
+{
+ strlcpy(NULL, NULL, 0);
+ return 0;
+}
+endef
+
+define SOURCE_VNCSERVER
+#include <rfb/rfb.h>
+
+int main(void)
+{
+ rfbIsActive((void *)0);
+ return 0;
+}
+endef
+
+define SOURCE_SDL
+#include <SDL/SDL.h>
+
+int main(void)
+{
+ SDL_Init(SDL_INIT_VIDEO);
+ return 0;
+}
+endef
+
+define SOURCE_ZLIB
+#include <zlib.h>
+
+int main(void)
+{
+ inflateInit2(NULL, 0);
+ return 0;
+}
+endef
+
+define SOURCE_AIO
+#include <libaio.h>
+
+int main(void)
+{
+ io_setup(0, NULL);
+ return 0;
+}
+endef
+
+define SOURCE_STATIC
+#include <stdlib.h>
+
+int main(void)
+{
+ return 0;
+}
+endef
+
+define SOURCE_GTK3
+#include <gtk/gtk.h>
+
+int main(void)
+{
+ gtk_main();
+
+ return 0;
+}
+endef
diff --git a/tools/kvm/config/utilities.mak b/tools/kvm/config/utilities.mak
new file mode 100644
index 0000000..a70963b
--- /dev/null
+++ b/tools/kvm/config/utilities.mak
@@ -0,0 +1,196 @@
+# This allows us to work with the newline character:
+define newline
+
+
+endef
+newline := $(newline)
+
+# nl-escape
+#
+# Usage: escape = $(call nl-escape[,escape])
+#
+# This is used as the common way to specify
+# what should replace a newline when escaping
+# newlines; the default is a bizarre string.
+#
+nl-escape = $(or $(1),m822df3020w6a44id34bt574ctac44eb9f4n)
+
+# escape-nl
+#
+# Usage: escaped-text = $(call escape-nl,text[,escape])
+#
+# GNU make's $(shell ...) function converts to a
+# single space each newline character in the output
+# produced during the expansion; this may not be
+# desirable.
+#
+# The only solution is to change each newline into
+# something that won't be converted, so that the
+# information can be recovered later with
+# $(call unescape-nl...)
+#
+escape-nl = $(subst $(newline),$(call nl-escape,$(2)),$(1))
+
+# unescape-nl
+#
+# Usage: text = $(call unescape-nl,escaped-text[,escape])
+#
+# See escape-nl.
+#
+unescape-nl = $(subst $(call nl-escape,$(2)),$(newline),$(1))
+
+# shell-escape-nl
+#
+# Usage: $(shell some-command | $(call shell-escape-nl[,escape]))
+#
+# Use this to escape newlines from within a shell call;
+# the default escape is a bizarre string.
+#
+# NOTE: The escape is used directly as a string constant
+# in an `awk' program that is delimited by shell
+# single-quotes, so be wary of the characters
+# that are chosen.
+#
+define shell-escape-nl
+awk 'NR==1 {t=$$0} NR>1 {t=t "$(nl-escape)" $$0} END {printf t}'
+endef
+
+# shell-unescape-nl
+#
+# Usage: $(shell some-command | $(call shell-unescape-nl[,escape]))
+#
+# Use this to unescape newlines from within a shell call;
+# the default escape is a bizarre string.
+#
+# NOTE: The escape is used directly as an extended regular
+# expression constant in an `awk' program that is
+# delimited by shell single-quotes, so be wary
+# of the characters that are chosen.
+#
+# (The bash shell has a bug where `{gsub(...),...}' is
+# misinterpreted as a brace expansion; this can be
+# overcome by putting a space between `{' and `gsub').
+#
+define shell-unescape-nl
+awk 'NR==1 {t=$$0} NR>1 {t=t "\n" $$0} END { gsub(/$(nl-escape)/,"\n",t); printf t }'
+endef
+
+# escape-for-shell-sq
+#
+# Usage: embeddable-text = $(call escape-for-shell-sq,text)
+#
+# This function produces text that is suitable for
+# embedding in a shell string that is delimited by
+# single-quotes.
+#
+escape-for-shell-sq = $(subst ','\'',$(1))
+
+# shell-sq
+#
+# Usage: single-quoted-and-escaped-text = $(call shell-sq,text)
+#
+shell-sq = '$(escape-for-shell-sq)'
+
+# shell-wordify
+#
+# Usage: wordified-text = $(call shell-wordify,text)
+#
+# For instance:
+#
+# |define text
+# |hello
+# |world
+# |endef
+# |
+# |target:
+# | echo $(call shell-wordify,$(text))
+#
+# At least GNU make gets confused by expanding a newline
+# within the context of a command line of a makefile rule
+# (this is in contrast to a `$(shell ...)' function call,
+# which can handle it just fine).
+#
+# This function avoids the problem by producing a string
+# that works as a shell word, regardless of whether or
+# not it contains a newline.
+#
+# If the text to be wordified contains a newline, then
+# an intricate shell command substitution is constructed
+# to render the text as a single line; when the shell
+# processes the resulting escaped text, it transforms
+# it into the original unescaped text.
+#
+# If the text does not contain a newline, then this function
+# produces the same results as the `$(shell-sq)' function.
+#
+shell-wordify = $(if $(findstring $(newline),$(1)),$(_sw-esc-nl),$(shell-sq))
+define _sw-esc-nl
+"$$(echo $(call escape-nl,$(shell-sq),$(2)) | $(call shell-unescape-nl,$(2)))"
+endef
+
+# is-absolute
+#
+# Usage: bool-value = $(call is-absolute,path)
+#
+is-absolute = $(shell echo $(shell-sq) | grep ^/ -q && echo y)
+
+# lookup
+#
+# Usage: absolute-executable-path-or-empty = $(call lookup,path)
+#
+# (It's necessary to use `sh -c' because GNU make messes up by
+# trying too hard and getting things wrong).
+#
+lookup = $(call unescape-nl,$(shell sh -c $(_l-sh)))
+_l-sh = $(call shell-sq,command -v $(shell-sq) | $(call shell-escape-nl,))
+
+# is-executable
+#
+# Usage: bool-value = $(call is-executable,path)
+#
+# (It's necessary to use `sh -c' because GNU make messes up by
+# trying too hard and getting things wrong).
+#
+is-executable = $(call _is-executable-helper,$(shell-sq))
+_is-executable-helper = $(shell sh -c $(_is-executable-sh))
+_is-executable-sh = $(call shell-sq,test -f $(1) -a -x $(1) && echo y)
+
+# get-executable
+#
+# Usage: absolute-executable-path-or-empty = $(call get-executable,path)
+#
+# The goal is to get an absolute path for an executable;
+# the `command -v' is defined by POSIX, but it's not
+# necessarily very portable, so it's only used if
+# relative path resolution is requested, as determined
+# by the presence of a leading `/'.
+#
+get-executable = $(if $(1),$(if $(is-absolute),$(_ge-abspath),$(lookup)))
+_ge-abspath = $(if $(is-executable),$(1))
+
+# get-supplied-or-default-executable
+#
+# Usage: absolute-executable-path-or-empty = $(call get-executable-or-default,variable,default)
+#
+define get-executable-or-default
+$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2)))
+endef
+_ge_attempt = $(or $(get-executable),$(_gea_warn),$(call _gea_err,$(2)))
+_gea_warn = $(warning The path '$(1)' is not executable.)
+_gea_err = $(if $(1),$(error Please set '$(1)' appropriately))
+
+# try-cc
+# Usage: option = $(call try-cc, source-to-build, cc-options)
+try-cc = $(shell sh -c \
+ 'TMP="$(OUTPUT)$(TMPOUT).$$$$"; \
+ echo "$(1)" | \
+ $(CC) -x c - $(2) -o "$$TMP" > /dev/null 2>&1 && echo y; \
+ rm -f "$$TMP"')
+
+# try-build
+# Usage: option = $(call try-build, source-to-build, cc-options, link-options)
+try-build = $(shell sh -c \
+ 'TMP="$(OUTPUT)$(TMPOUT).$$$$"; \
+ echo "$(1)" | \
+ $(CC) -x c - $(2) $(3) -o "$$TMP" > /dev/null 2>&1 && echo y; \
+ rm -f "$$TMP"')
diff --git a/tools/kvm/devices.c b/tools/kvm/devices.c
new file mode 100644
index 0000000..b560a59
--- /dev/null
+++ b/tools/kvm/devices.c
@@ -0,0 +1,105 @@
+#include "kvm/devices.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/virtio-mmio.h"
+
+#include <linux/err.h>
+#include <linux/rbtree.h>
+
+struct device_bus {
+ struct rb_root root;
+ int dev_num;
+};
+
+static struct device_bus device_trees[DEVICE_BUS_MAX] = {
+ [0 ... (DEVICE_BUS_MAX - 1)] = { RB_ROOT, 0 },
+};
+
+int device__register(struct device_header *dev)
+{
+ struct device_bus *bus;
+ struct rb_node **node, *parent = NULL;
+
+ if (dev->bus_type >= DEVICE_BUS_MAX) {
+ pr_warning("Ignoring device registration on unknown bus %d\n",
+ dev->bus_type);
+ return -EINVAL;
+ }
+
+ bus = &device_trees[dev->bus_type];
+ dev->dev_num = bus->dev_num++;
+
+ switch (dev->bus_type) {
+ case DEVICE_BUS_PCI:
+ pci__assign_irq(dev);
+ break;
+ case DEVICE_BUS_MMIO:
+ virtio_mmio_assign_irq(dev);
+ break;
+ default:
+ break;
+ }
+
+ node = &bus->root.rb_node;
+ while (*node) {
+ int num = rb_entry(*node, struct device_header, node)->dev_num;
+ int result = dev->dev_num - num;
+
+ if (result < 0)
+ node = &((*node)->rb_left);
+ else if (result > 0)
+ node = &((*node)->rb_right);
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&dev->node, parent, node);
+ rb_insert_color(&dev->node, &bus->root);
+ return 0;
+}
+
+void device__unregister(struct device_header *dev)
+{
+ struct device_bus *bus = &device_trees[dev->bus_type];
+ rb_erase(&dev->node, &bus->root);
+}
+
+struct device_header *device__find_dev(enum device_bus_type bus_type, u8 dev_num)
+{
+ struct rb_node *node;
+
+ if (bus_type >= DEVICE_BUS_MAX)
+ return ERR_PTR(-EINVAL);
+
+ node = device_trees[bus_type].root.rb_node;
+ while (node) {
+ struct device_header *dev = rb_entry(node, struct device_header,
+ node);
+ if (dev_num < dev->dev_num) {
+ node = node->rb_left;
+ } else if (dev_num > dev->dev_num) {
+ node = node->rb_right;
+ } else {
+ return dev;
+ }
+ }
+
+ return NULL;
+}
+
+struct device_header *device__first_dev(enum device_bus_type bus_type)
+{
+ struct rb_node *node;
+
+ if (bus_type >= DEVICE_BUS_MAX)
+ return NULL;
+
+ node = rb_first(&device_trees[bus_type].root);
+ return node ? rb_entry(node, struct device_header, node) : NULL;
+}
+
+struct device_header *device__next_dev(struct device_header *dev)
+{
+ struct rb_node *node = rb_next(&dev->node);
+ return node ? rb_entry(node, struct device_header, node) : NULL;
+}
diff --git a/tools/kvm/disk/blk.c b/tools/kvm/disk/blk.c
new file mode 100644
index 0000000..37581d3
--- /dev/null
+++ b/tools/kvm/disk/blk.c
@@ -0,0 +1,76 @@
+#include "kvm/disk-image.h"
+
+#include <linux/err.h>
+#include <mntent.h>
+
+/*
+ * raw image and blk dev are similar, so reuse raw image ops.
+ */
+static struct disk_image_operations blk_dev_ops = {
+ .read = raw_image__read,
+ .write = raw_image__write,
+};
+
+static bool is_mounted(struct stat *st)
+{
+ struct stat st_buf;
+ struct mntent *mnt;
+ FILE *f;
+
+ f = setmntent("/proc/mounts", "r");
+ if (!f)
+ return false;
+
+ while ((mnt = getmntent(f)) != NULL) {
+ if (stat(mnt->mnt_fsname, &st_buf) == 0 &&
+ S_ISBLK(st_buf.st_mode) && st->st_rdev == st_buf.st_rdev) {
+ fclose(f);
+ return true;
+ }
+ }
+
+ fclose(f);
+ return false;
+}
+
+struct disk_image *blkdev__probe(const char *filename, int flags, struct stat *st)
+{
+ struct disk_image *disk;
+ int fd, r;
+ u64 size;
+
+ if (!S_ISBLK(st->st_mode))
+ return ERR_PTR(-EINVAL);
+
+ if (is_mounted(st)) {
+ pr_err("Block device %s is already mounted! Unmount before use.",
+ filename);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * Be careful! We are opening host block device!
+ * Open it readonly since we do not want to break user's data on disk.
+ */
+ fd = open(filename, flags);
+ if (fd < 0)
+ return ERR_PTR(fd);
+
+ if (ioctl(fd, BLKGETSIZE64, &size) < 0) {
+ r = -errno;
+ close(fd);
+ return ERR_PTR(r);
+ }
+
+ /*
+ * FIXME: This will not work on 32-bit host because we can not
+ * mmap large disk. There is not enough virtual address space
+ * in 32-bit host. However, this works on 64-bit host.
+ */
+ disk = disk_image__new(fd, size, &blk_dev_ops, DISK_IMAGE_REGULAR);
+#ifdef CONFIG_HAS_AIO
+ if (!IS_ERR_OR_NULL(disk))
+ disk->async = 1;
+#endif
+ return disk;
+}
diff --git a/tools/kvm/disk/core.c b/tools/kvm/disk/core.c
new file mode 100644
index 0000000..309e16c
--- /dev/null
+++ b/tools/kvm/disk/core.c
@@ -0,0 +1,358 @@
+#include "kvm/disk-image.h"
+#include "kvm/qcow.h"
+#include "kvm/virtio-blk.h"
+#include "kvm/kvm.h"
+
+#include <linux/err.h>
+#include <sys/eventfd.h>
+#include <sys/poll.h>
+
+#define AIO_MAX 256
+
+int debug_iodelay;
+
+static int disk_image__close(struct disk_image *disk);
+
+int disk_img_name_parser(const struct option *opt, const char *arg, int unset)
+{
+ const char *cur;
+ char *sep;
+ struct kvm *kvm = opt->ptr;
+
+ if (kvm->cfg.image_count >= MAX_DISK_IMAGES)
+ die("Currently only 4 images are supported");
+
+ kvm->cfg.disk_image[kvm->cfg.image_count].filename = arg;
+ cur = arg;
+
+ if (strncmp(arg, "scsi:", 5) == 0) {
+ sep = strstr(arg, ":");
+ if (sep)
+ kvm->cfg.disk_image[kvm->cfg.image_count].wwpn = sep + 1;
+ sep = strstr(sep + 1, ":");
+ if (sep) {
+ *sep = 0;
+ kvm->cfg.disk_image[kvm->cfg.image_count].tpgt = sep + 1;
+ }
+ cur = sep + 1;
+ }
+
+ do {
+ sep = strstr(cur, ",");
+ if (sep) {
+ if (strncmp(sep + 1, "ro", 2) == 0)
+ kvm->cfg.disk_image[kvm->cfg.image_count].readonly = true;
+ else if (strncmp(sep + 1, "direct", 6) == 0)
+ kvm->cfg.disk_image[kvm->cfg.image_count].direct = true;
+ *sep = 0;
+ cur = sep + 1;
+ }
+ } while (sep);
+
+ kvm->cfg.image_count++;
+
+ return 0;
+}
+
+#ifdef CONFIG_HAS_AIO
+static void *disk_image__thread(void *param)
+{
+ struct disk_image *disk = param;
+ struct io_event event[AIO_MAX];
+ struct timespec notime = {0};
+ int nr, i;
+ u64 dummy;
+
+ kvm__set_thread_name("disk-image-io");
+
+ while (read(disk->evt, &dummy, sizeof(dummy)) > 0) {
+		nr = io_getevents(disk->ctx, 1, ARRAY_SIZE(event), event, &notime);
+ for (i = 0; i < nr; i++)
+ disk->disk_req_cb(event[i].data, event[i].res);
+ }
+
+ return NULL;
+}
+#endif
+
+struct disk_image *disk_image__new(int fd, u64 size,
+ struct disk_image_operations *ops,
+ int use_mmap)
+{
+ struct disk_image *disk;
+ int r;
+
+ disk = malloc(sizeof *disk);
+ if (!disk)
+ return ERR_PTR(-ENOMEM);
+
+ *disk = (struct disk_image) {
+ .fd = fd,
+ .size = size,
+ .ops = ops,
+ };
+
+ if (use_mmap == DISK_IMAGE_MMAP) {
+ /*
+ * The write to disk image will be discarded
+ */
+ disk->priv = mmap(NULL, size, PROT_RW, MAP_PRIVATE | MAP_NORESERVE, fd, 0);
+ if (disk->priv == MAP_FAILED) {
+ r = -errno;
+ free(disk);
+ return ERR_PTR(r);
+ }
+ }
+
+#ifdef CONFIG_HAS_AIO
+ {
+ pthread_t thread;
+
+ disk->evt = eventfd(0, 0);
+ io_setup(AIO_MAX, &disk->ctx);
+ r = pthread_create(&thread, NULL, disk_image__thread, disk);
+ if (r) {
+ r = -errno;
+ free(disk);
+ return ERR_PTR(r);
+ }
+ }
+#endif
+ return disk;
+}
+
+static struct disk_image *disk_image__open(const char *filename, bool readonly, bool direct)
+{
+ struct disk_image *disk;
+ struct stat st;
+ int fd, flags;
+
+ if (readonly)
+ flags = O_RDONLY;
+ else
+ flags = O_RDWR;
+ if (direct)
+ flags |= O_DIRECT;
+
+ if (stat(filename, &st) < 0)
+ return ERR_PTR(-errno);
+
+ /* blk device ?*/
+ disk = blkdev__probe(filename, flags, &st);
+ if (!IS_ERR_OR_NULL(disk))
+ return disk;
+
+ fd = open(filename, flags);
+ if (fd < 0)
+ return ERR_PTR(fd);
+
+ /* qcow image ?*/
+ disk = qcow_probe(fd, true);
+ if (!IS_ERR_OR_NULL(disk)) {
+ pr_warning("Forcing read-only support for QCOW");
+ return disk;
+ }
+
+ /* raw image ?*/
+ disk = raw_image__probe(fd, &st, readonly);
+ if (!IS_ERR_OR_NULL(disk))
+ return disk;
+
+ if (close(fd) < 0)
+ pr_warning("close() failed");
+
+ return ERR_PTR(-ENOSYS);
+}
+
+static struct disk_image **disk_image__open_all(struct kvm *kvm)
+{
+ struct disk_image **disks;
+ const char *filename;
+ const char *wwpn;
+ const char *tpgt;
+ bool readonly;
+ bool direct;
+ void *err;
+ int i;
+ struct disk_image_params *params = (struct disk_image_params *)&kvm->cfg.disk_image;
+ int count = kvm->cfg.image_count;
+
+ if (!count)
+ return ERR_PTR(-EINVAL);
+ if (count > MAX_DISK_IMAGES)
+ return ERR_PTR(-ENOSPC);
+
+ disks = calloc(count, sizeof(*disks));
+ if (!disks)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < count; i++) {
+ filename = params[i].filename;
+ readonly = params[i].readonly;
+ direct = params[i].direct;
+ wwpn = params[i].wwpn;
+ tpgt = params[i].tpgt;
+
+ if (wwpn) {
+ disks[i] = malloc(sizeof(struct disk_image));
+ if (!disks[i])
+ return ERR_PTR(-ENOMEM);
+ disks[i]->wwpn = wwpn;
+ disks[i]->tpgt = tpgt;
+ continue;
+ }
+
+ if (!filename)
+ continue;
+
+ disks[i] = disk_image__open(filename, readonly, direct);
+ if (IS_ERR_OR_NULL(disks[i])) {
+ pr_err("Loading disk image '%s' failed", filename);
+ err = disks[i];
+ goto error;
+ }
+ disks[i]->debug_iodelay = kvm->cfg.debug_iodelay;
+ }
+
+ return disks;
+error:
+ for (i = 0; i < count; i++)
+ if (!IS_ERR_OR_NULL(disks[i]))
+ disk_image__close(disks[i]);
+
+ free(disks);
+ return err;
+}
+
+int disk_image__flush(struct disk_image *disk)
+{
+ if (disk->ops->flush)
+ return disk->ops->flush(disk);
+
+ return fsync(disk->fd);
+}
+
+static int disk_image__close(struct disk_image *disk)
+{
+ /* If there was no disk image then there's nothing to do: */
+ if (!disk)
+ return 0;
+
+ if (disk->ops->close)
+ return disk->ops->close(disk);
+
+ if (close(disk->fd) < 0)
+ pr_warning("close() failed");
+
+ free(disk);
+
+ return 0;
+}
+
+static int disk_image__close_all(struct disk_image **disks, int count)
+{
+ while (count)
+ disk_image__close(disks[--count]);
+
+ free(disks);
+
+ return 0;
+}
+
+/*
+ * Fill iov with disk data, starting from sector 'sector'.
+ * Return amount of bytes read.
+ */
+ssize_t disk_image__read(struct disk_image *disk, u64 sector,
+ const struct iovec *iov, int iovcount, void *param)
+{
+ ssize_t total = 0;
+
+ if (debug_iodelay)
+ msleep(debug_iodelay);
+
+ if (disk->ops->read) {
+ total = disk->ops->read(disk, sector, iov, iovcount, param);
+ if (total < 0) {
+ pr_info("disk_image__read error: total=%ld\n", (long)total);
+ return total;
+ }
+ }
+
+ if (!disk->async && disk->disk_req_cb)
+ disk->disk_req_cb(param, total);
+
+ return total;
+}
+
+/*
+ * Write iov to disk, starting from sector 'sector'.
+ * Return amount of bytes written.
+ */
+ssize_t disk_image__write(struct disk_image *disk, u64 sector,
+ const struct iovec *iov, int iovcount, void *param)
+{
+ ssize_t total = 0;
+
+ if (debug_iodelay)
+ msleep(debug_iodelay);
+
+ if (disk->ops->write) {
+ /*
+ * Try writev based operation first
+ */
+
+ total = disk->ops->write(disk, sector, iov, iovcount, param);
+ if (total < 0) {
+ pr_info("disk_image__write error: total=%ld\n", (long)total);
+ return total;
+ }
+ } else {
+ /* Do nothing */
+ }
+
+ if (!disk->async && disk->disk_req_cb)
+ disk->disk_req_cb(param, total);
+
+ return total;
+}
+
+ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len)
+{
+ struct stat st;
+ int r;
+
+ r = fstat(disk->fd, &st);
+ if (r)
+ return r;
+
+ *len = snprintf(buffer, *len, "%llu%llu%llu",
+ (unsigned long long)st.st_dev,
+ (unsigned long long)st.st_rdev,
+ (unsigned long long)st.st_ino);
+ return *len;
+}
+
+void disk_image__set_callback(struct disk_image *disk,
+ void (*disk_req_cb)(void *param, long len))
+{
+ disk->disk_req_cb = disk_req_cb;
+}
+
+int disk_image__init(struct kvm *kvm)
+{
+ if (kvm->cfg.image_count) {
+ kvm->disks = disk_image__open_all(kvm);
+ if (IS_ERR(kvm->disks))
+ return PTR_ERR(kvm->disks);
+ }
+
+ return 0;
+}
+dev_base_init(disk_image__init);
+
+int disk_image__exit(struct kvm *kvm)
+{
+ return disk_image__close_all(kvm->disks, kvm->nr_disks);
+}
+dev_base_exit(disk_image__exit);
diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
new file mode 100644
index 0000000..64a2550
--- /dev/null
+++ b/tools/kvm/disk/qcow.c
@@ -0,0 +1,1527 @@
+#include "kvm/qcow.h"
+
+#include "kvm/disk-image.h"
+#include "kvm/read-write.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#ifdef CONFIG_HAS_ZLIB
+#include <zlib.h>
+#endif
+
+#include <linux/err.h>
+#include <linux/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append);
+static int qcow_write_refcount_table(struct qcow *q);
+static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref);
+static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size);
+
+static inline int qcow_pwrite_sync(int fd,
+ void *buf, size_t count, off_t offset)
+{
+ if (pwrite_in_full(fd, buf, count, offset) < 0)
+ return -1;
+
+ return fdatasync(fd);
+}
+
+static int l2_table_insert(struct rb_root *root, struct qcow_l2_table *new)
+{
+ struct rb_node **link = &(root->rb_node), *parent = NULL;
+ u64 offset = new->offset;
+
+ /* search the tree */
+ while (*link) {
+ struct qcow_l2_table *t;
+
+ t = rb_entry(*link, struct qcow_l2_table, node);
+ if (!t)
+ goto error;
+
+ parent = *link;
+
+ if (t->offset > offset)
+ link = &(*link)->rb_left;
+ else if (t->offset < offset)
+ link = &(*link)->rb_right;
+ else
+ goto out;
+ }
+
+ /* add new node */
+ rb_link_node(&new->node, parent, link);
+ rb_insert_color(&new->node, root);
+out:
+ return 0;
+error:
+ return -1;
+}
+
+static struct qcow_l2_table *l2_table_lookup(struct rb_root *root, u64 offset)
+{
+ struct rb_node *link = root->rb_node;
+
+ while (link) {
+ struct qcow_l2_table *t;
+
+ t = rb_entry(link, struct qcow_l2_table, node);
+ if (!t)
+ goto out;
+
+ if (t->offset > offset)
+ link = link->rb_left;
+ else if (t->offset < offset)
+ link = link->rb_right;
+ else
+ return t;
+ }
+out:
+ return NULL;
+}
+
+static void l1_table_free_cache(struct qcow_l1_table *l1t)
+{
+ struct rb_root *r = &l1t->root;
+ struct list_head *pos, *n;
+ struct qcow_l2_table *t;
+
+ list_for_each_safe(pos, n, &l1t->lru_list) {
+ /* Remove cache table from the list and RB tree */
+ list_del(pos);
+ t = list_entry(pos, struct qcow_l2_table, list);
+ rb_erase(&t->node, r);
+
+ /* Free the cached node */
+ free(t);
+ }
+}
+
+static int qcow_l2_cache_write(struct qcow *q, struct qcow_l2_table *c)
+{
+ struct qcow_header *header = q->header;
+ u64 size;
+
+ if (!c->dirty)
+ return 0;
+
+ size = 1 << header->l2_bits;
+
+ if (qcow_pwrite_sync(q->fd, c->table,
+ size * sizeof(u64), c->offset) < 0)
+ return -1;
+
+ c->dirty = 0;
+
+ return 0;
+}
+
+static int cache_table(struct qcow *q, struct qcow_l2_table *c)
+{
+ struct qcow_l1_table *l1t = &q->table;
+ struct rb_root *r = &l1t->root;
+ struct qcow_l2_table *lru;
+
+ if (l1t->nr_cached == MAX_CACHE_NODES) {
+ /*
+	 * The node at the head of the list is the least recently used
+	 * node. Remove it from the list and replace it with a new node.
+ */
+ lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list);
+
+ /* Remove the node from the cache */
+ rb_erase(&lru->node, r);
+ list_del_init(&lru->list);
+ l1t->nr_cached--;
+
+ /* Free the LRUed node */
+ free(lru);
+ }
+
+ /* Add new node in RB Tree: Helps in searching faster */
+ if (l2_table_insert(r, c) < 0)
+ goto error;
+
+ /* Add in LRU replacement list */
+ list_add_tail(&c->list, &l1t->lru_list);
+ l1t->nr_cached++;
+
+ return 0;
+error:
+ return -1;
+}
+
+static struct qcow_l2_table *l2_table_search(struct qcow *q, u64 offset)
+{
+ struct qcow_l1_table *l1t = &q->table;
+ struct qcow_l2_table *l2t;
+
+ l2t = l2_table_lookup(&l1t->root, offset);
+ if (!l2t)
+ return NULL;
+
+ /* Update the LRU state, by moving the searched node to list tail */
+ list_move_tail(&l2t->list, &l1t->lru_list);
+
+ return l2t;
+}
+
+/* Allocates a new node for caching L2 table */
+static struct qcow_l2_table *new_cache_table(struct qcow *q, u64 offset)
+{
+ struct qcow_header *header = q->header;
+ struct qcow_l2_table *c;
+ u64 l2t_sz;
+ u64 size;
+
+ l2t_sz = 1 << header->l2_bits;
+ size = sizeof(*c) + l2t_sz * sizeof(u64);
+ c = calloc(1, size);
+ if (!c)
+ goto out;
+
+ c->offset = offset;
+ RB_CLEAR_NODE(&c->node);
+ INIT_LIST_HEAD(&c->list);
+out:
+ return c;
+}
+
+static inline u64 get_l1_index(struct qcow *q, u64 offset)
+{
+ struct qcow_header *header = q->header;
+
+ return offset >> (header->l2_bits + header->cluster_bits);
+}
+
+static inline u64 get_l2_index(struct qcow *q, u64 offset)
+{
+ struct qcow_header *header = q->header;
+
+ return (offset >> (header->cluster_bits)) & ((1 << header->l2_bits)-1);
+}
+
+static inline u64 get_cluster_offset(struct qcow *q, u64 offset)
+{
+ struct qcow_header *header = q->header;
+
+ return offset & ((1 << header->cluster_bits)-1);
+}
+
+static struct qcow_l2_table *qcow_read_l2_table(struct qcow *q, u64 offset)
+{
+ struct qcow_header *header = q->header;
+ struct qcow_l2_table *l2t;
+ u64 size;
+
+ size = 1 << header->l2_bits;
+
+ /* search an entry for offset in cache */
+ l2t = l2_table_search(q, offset);
+ if (l2t)
+ return l2t;
+
+ /* allocate new node for caching l2 table */
+ l2t = new_cache_table(q, offset);
+ if (!l2t)
+ goto error;
+
+ /* table not cached: read from the disk */
+ if (pread_in_full(q->fd, l2t->table, size * sizeof(u64), offset) < 0)
+ goto error;
+
+ /* cache the table */
+ if (cache_table(q, l2t) < 0)
+ goto error;
+
+ return l2t;
+error:
+ free(l2t);
+ return NULL;
+}
+
+static int qcow_decompress_buffer(u8 *out_buf, int out_buf_size,
+ const u8 *buf, int buf_size)
+{
+#ifdef CONFIG_HAS_ZLIB
+ z_stream strm1, *strm = &strm1;
+ int ret, out_len;
+
+ memset(strm, 0, sizeof(*strm));
+
+ strm->next_in = (u8 *)buf;
+ strm->avail_in = buf_size;
+ strm->next_out = out_buf;
+ strm->avail_out = out_buf_size;
+
+ ret = inflateInit2(strm, -12);
+ if (ret != Z_OK)
+ return -1;
+
+ ret = inflate(strm, Z_FINISH);
+ out_len = strm->next_out - out_buf;
+ if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
+ out_len != out_buf_size) {
+ inflateEnd(strm);
+ return -1;
+ }
+
+ inflateEnd(strm);
+ return 0;
+#else
+ return -1;
+#endif
+}
+
+static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset,
+ void *dst, u32 dst_len)
+{
+ struct qcow_header *header = q->header;
+ struct qcow_l1_table *l1t = &q->table;
+ struct qcow_l2_table *l2t;
+ u64 clust_offset;
+ u64 clust_start;
+ u64 l2t_offset;
+ size_t length;
+ u64 l2t_size;
+ u64 l1_idx;
+ u64 l2_idx;
+ int coffset;
+ int csize;
+
+ l1_idx = get_l1_index(q, offset);
+ if (l1_idx >= l1t->table_size)
+ return -1;
+
+ clust_offset = get_cluster_offset(q, offset);
+ if (clust_offset >= q->cluster_size)
+ return -1;
+
+ length = q->cluster_size - clust_offset;
+ if (length > dst_len)
+ length = dst_len;
+
+ mutex_lock(&q->mutex);
+
+ l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]);
+ if (!l2t_offset)
+ goto zero_cluster;
+
+ l2t_size = 1 << header->l2_bits;
+
+ /* read and cache level 2 table */
+ l2t = qcow_read_l2_table(q, l2t_offset);
+ if (!l2t)
+ goto out_error;
+
+ l2_idx = get_l2_index(q, offset);
+ if (l2_idx >= l2t_size)
+ goto out_error;
+
+ clust_start = be64_to_cpu(l2t->table[l2_idx]);
+ if (clust_start & QCOW1_OFLAG_COMPRESSED) {
+ coffset = clust_start & q->cluster_offset_mask;
+ csize = clust_start >> (63 - q->header->cluster_bits);
+ csize &= (q->cluster_size - 1);
+
+ if (pread_in_full(q->fd, q->cluster_data, csize,
+ coffset) < 0)
+ goto out_error;
+
+ if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size,
+ q->cluster_data, csize) < 0)
+ goto out_error;
+
+ memcpy(dst, q->cluster_cache + clust_offset, length);
+ mutex_unlock(&q->mutex);
+ } else {
+ if (!clust_start)
+ goto zero_cluster;
+
+ mutex_unlock(&q->mutex);
+
+ if (pread_in_full(q->fd, dst, length,
+ clust_start + clust_offset) < 0)
+ return -1;
+ }
+
+ return length;
+
+zero_cluster:
+ mutex_unlock(&q->mutex);
+ memset(dst, 0, length);
+ return length;
+
+out_error:
+ mutex_unlock(&q->mutex);
+ length = -1;
+ return -1;
+}
+
+static ssize_t qcow2_read_cluster(struct qcow *q, u64 offset,
+ void *dst, u32 dst_len)
+{
+ struct qcow_header *header = q->header;
+ struct qcow_l1_table *l1t = &q->table;
+ struct qcow_l2_table *l2t;
+ u64 clust_offset;
+ u64 clust_start;
+ u64 l2t_offset;
+ size_t length;
+ u64 l2t_size;
+ u64 l1_idx;
+ u64 l2_idx;
+ int coffset;
+ int sector_offset;
+ int nb_csectors;
+ int csize;
+
+ l1_idx = get_l1_index(q, offset);
+ if (l1_idx >= l1t->table_size)
+ return -1;
+
+ clust_offset = get_cluster_offset(q, offset);
+ if (clust_offset >= q->cluster_size)
+ return -1;
+
+ length = q->cluster_size - clust_offset;
+ if (length > dst_len)
+ length = dst_len;
+
+ mutex_lock(&q->mutex);
+
+ l2t_offset = be64_to_cpu(l1t->l1_table[l1_idx]);
+
+ l2t_offset &= ~QCOW2_OFLAG_COPIED;
+ if (!l2t_offset)
+ goto zero_cluster;
+
+ l2t_size = 1 << header->l2_bits;
+
+ /* read and cache level 2 table */
+ l2t = qcow_read_l2_table(q, l2t_offset);
+ if (!l2t)
+ goto out_error;
+
+ l2_idx = get_l2_index(q, offset);
+ if (l2_idx >= l2t_size)
+ goto out_error;
+
+ clust_start = be64_to_cpu(l2t->table[l2_idx]);
+ if (clust_start & QCOW2_OFLAG_COMPRESSED) {
+ coffset = clust_start & q->cluster_offset_mask;
+ nb_csectors = ((clust_start >> q->csize_shift)
+ & q->csize_mask) + 1;
+ sector_offset = coffset & (SECTOR_SIZE - 1);
+ csize = nb_csectors * SECTOR_SIZE - sector_offset;
+
+ if (pread_in_full(q->fd, q->cluster_data,
+ nb_csectors * SECTOR_SIZE,
+ coffset & ~(SECTOR_SIZE - 1)) < 0) {
+ goto out_error;
+ }
+
+ if (qcow_decompress_buffer(q->cluster_cache, q->cluster_size,
+ q->cluster_data + sector_offset,
+ csize) < 0) {
+ goto out_error;
+ }
+
+ memcpy(dst, q->cluster_cache + clust_offset, length);
+ mutex_unlock(&q->mutex);
+ } else {
+ clust_start &= QCOW2_OFFSET_MASK;
+ if (!clust_start)
+ goto zero_cluster;
+
+ mutex_unlock(&q->mutex);
+
+ if (pread_in_full(q->fd, dst, length,
+ clust_start + clust_offset) < 0)
+ return -1;
+ }
+
+ return length;
+
+zero_cluster:
+ mutex_unlock(&q->mutex);
+ memset(dst, 0, length);
+ return length;
+
+out_error:
+ mutex_unlock(&q->mutex);
+ length = -1;
+ return -1;
+}
+
+static ssize_t qcow_read_sector_single(struct disk_image *disk, u64 sector,
+ void *dst, u32 dst_len)
+{
+ struct qcow *q = disk->priv;
+ struct qcow_header *header = q->header;
+ u32 nr_read;
+ u64 offset;
+ char *buf;
+ u32 nr;
+
+ buf = dst;
+ nr_read = 0;
+
+ while (nr_read < dst_len) {
+ offset = sector << SECTOR_SHIFT;
+ if (offset >= header->size)
+ return -1;
+
+ if (q->version == QCOW1_VERSION)
+ nr = qcow1_read_cluster(q, offset, buf,
+ dst_len - nr_read);
+ else
+ nr = qcow2_read_cluster(q, offset, buf,
+ dst_len - nr_read);
+
+ if (nr <= 0)
+ return -1;
+
+ nr_read += nr;
+ buf += nr;
+ sector += (nr >> SECTOR_SHIFT);
+ }
+
+ return dst_len;
+}
+
+static ssize_t qcow_read_sector(struct disk_image *disk, u64 sector,
+ const struct iovec *iov, int iovcount, void *param)
+{
+ ssize_t nr, total = 0;
+
+ while (iovcount--) {
+ nr = qcow_read_sector_single(disk, sector, iov->iov_base, iov->iov_len);
+ if (nr != (ssize_t)iov->iov_len) {
+ pr_info("qcow_read_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov->iov_len);
+ return -1;
+ }
+
+ sector += iov->iov_len >> SECTOR_SHIFT;
+ total += nr;
+ iov++;
+ }
+
+ return total;
+}
+
+static void refcount_table_free_cache(struct qcow_refcount_table *rft)
+{
+ struct rb_root *r = &rft->root;
+ struct list_head *pos, *n;
+ struct qcow_refcount_block *t;
+
+ list_for_each_safe(pos, n, &rft->lru_list) {
+ list_del(pos);
+ t = list_entry(pos, struct qcow_refcount_block, list);
+ rb_erase(&t->node, r);
+
+ free(t);
+ }
+}
+
+static int refcount_block_insert(struct rb_root *root, struct qcow_refcount_block *new)
+{
+ struct rb_node **link = &(root->rb_node), *parent = NULL;
+ u64 offset = new->offset;
+
+ /* search the tree */
+ while (*link) {
+ struct qcow_refcount_block *t;
+
+ t = rb_entry(*link, struct qcow_refcount_block, node);
+ if (!t)
+ goto error;
+
+ parent = *link;
+
+ if (t->offset > offset)
+ link = &(*link)->rb_left;
+ else if (t->offset < offset)
+ link = &(*link)->rb_right;
+ else
+ goto out;
+ }
+
+ /* add new node */
+ rb_link_node(&new->node, parent, link);
+ rb_insert_color(&new->node, root);
+out:
+ return 0;
+error:
+ return -1;
+}
+
+static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb)
+{
+ if (!rfb->dirty)
+ return 0;
+
+ if (qcow_pwrite_sync(q->fd, rfb->entries,
+ rfb->size * sizeof(u16), rfb->offset) < 0)
+ return -1;
+
+ rfb->dirty = 0;
+
+ return 0;
+}
+
+static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
+{
+ struct qcow_refcount_table *rft = &q->refcount_table;
+ struct rb_root *r = &rft->root;
+ struct qcow_refcount_block *lru;
+
+ if (rft->nr_cached == MAX_CACHE_NODES) {
+ lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
+
+ rb_erase(&lru->node, r);
+ list_del_init(&lru->list);
+ rft->nr_cached--;
+
+ free(lru);
+ }
+
+ if (refcount_block_insert(r, c) < 0)
+ goto error;
+
+ list_add_tail(&c->list, &rft->lru_list);
+ rft->nr_cached++;
+
+ return 0;
+error:
+ return -1;
+}
+
+static struct qcow_refcount_block *new_refcount_block(struct qcow *q, u64 rfb_offset)
+{
+ struct qcow_refcount_block *rfb;
+
+ rfb = malloc(sizeof *rfb + q->cluster_size);
+ if (!rfb)
+ return NULL;
+
+ rfb->offset = rfb_offset;
+ rfb->size = q->cluster_size / sizeof(u16);
+ RB_CLEAR_NODE(&rfb->node);
+ INIT_LIST_HEAD(&rfb->list);
+
+ return rfb;
+}
+
+static struct qcow_refcount_block *refcount_block_lookup(struct rb_root *root, u64 offset)
+{
+ struct rb_node *link = root->rb_node;
+
+ while (link) {
+ struct qcow_refcount_block *t;
+
+ t = rb_entry(link, struct qcow_refcount_block, node);
+ if (!t)
+ goto out;
+
+ if (t->offset > offset)
+ link = link->rb_left;
+ else if (t->offset < offset)
+ link = link->rb_right;
+ else
+ return t;
+ }
+out:
+ return NULL;
+}
+
+/*
+ * Cache lookup for the refcount block at @offset.  On a hit the block
+ * is promoted to most-recently-used (moved to the LRU tail).
+ */
+static struct qcow_refcount_block *refcount_block_search(struct qcow *q, u64 offset)
+{
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *found;
+
+	found = refcount_block_lookup(&rft->root, offset);
+	if (found)
+		list_move_tail(&found->list, &rft->lru_list);
+
+	return found;
+}
+
+/*
+ * Grow the refcount structures to cover @clust_idx: allocate a fresh
+ * cluster, turn it into a zeroed refcount block, hook it into the
+ * refcount table and self-account the cluster the block lives in.
+ * Returns the new block, or NULL on failure.
+ */
+static struct qcow_refcount_block *qcow_grow_refcount_block(struct qcow *q,
+	u64 clust_idx)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+	u64 new_block_offset;
+	u64 rft_idx;
+
+	rft_idx = clust_idx >> (header->cluster_bits -
+		QCOW_REFCOUNT_BLOCK_SHIFT);
+
+	if (rft_idx >= rft->rf_size) {
+		pr_warning("Don't support grow refcount block table");
+		return NULL;
+	}
+
+	/*
+	 * qcow_alloc_clusters() returns (u64)-1 on failure; the previous
+	 * "< 0" test could never fire on an unsigned value.
+	 */
+	new_block_offset = qcow_alloc_clusters(q, q->cluster_size, 0);
+	if (new_block_offset == (u64)-1)
+		return NULL;
+
+	rfb = new_refcount_block(q, new_block_offset);
+	if (!rfb)
+		return NULL;
+
+	memset(rfb->entries, 0x00, q->cluster_size);
+	rfb->dirty = 1;
+
+	/* write refcount block */
+	if (write_refcount_block(q, rfb) < 0)
+		goto free_rfb;
+
+	if (cache_refcount_block(q, rfb) < 0)
+		goto free_rfb;
+
+	rft->rf_table[rft_idx] = cpu_to_be64(new_block_offset);
+	/* account for the cluster now holding the refcount block itself */
+	if (update_cluster_refcount(q, new_block_offset >>
+		header->cluster_bits, 1) < 0)
+		goto recover_rft;
+
+	if (qcow_write_refcount_table(q) < 0)
+		goto recover_rft;
+
+	return rfb;
+
+recover_rft:
+	rft->rf_table[rft_idx] = 0;
+free_rfb:
+	/* NOTE(review): on the recover_rft path rfb may already be cached;
+	 * freeing it leaves a dangling cache entry — pre-existing issue,
+	 * confirm separately. */
+	free(rfb);
+	return NULL;
+}
+
+/*
+ * Return the (possibly cached) refcount block covering cluster
+ * @clust_idx.
+ *
+ * Returns ERR_PTR(-ENOSPC) when the refcount table does not cover the
+ * cluster (caller is expected to grow it), NULL on read/cache/alloc
+ * failure, otherwise the block.  NOTE(review): mixing NULL and
+ * ERR_PTR() returns forces callers to use IS_ERR_OR_NULL(); consider
+ * unifying on ERR_PTR().
+ */
+static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64 clust_idx)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	struct qcow_refcount_block *rfb;
+	u64 rfb_offset;
+	u64 rft_idx;
+
+	rft_idx = clust_idx >> (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT);
+	if (rft_idx >= rft->rf_size)
+		return ERR_PTR(-ENOSPC);
+
+	rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
+	if (!rfb_offset)
+		return ERR_PTR(-ENOSPC);
+
+	/* fast path: block already cached */
+	rfb = refcount_block_search(q, rfb_offset);
+	if (rfb)
+		return rfb;
+
+	rfb = new_refcount_block(q, rfb_offset);
+	if (!rfb)
+		return NULL;
+
+	if (pread_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb_offset) < 0)
+		goto error_free_rfb;
+
+	if (cache_refcount_block(q, rfb) < 0)
+		goto error_free_rfb;
+
+	return rfb;
+
+error_free_rfb:
+	free(rfb);
+
+	return NULL;
+}
+
+/*
+ * Return the refcount of cluster @clust_idx, or 0 when the covering
+ * refcount block has not been allocated yet.
+ *
+ * NOTE(review): the error paths "return -1" from a function declared
+ * to return u16, so callers actually receive 65535; a genuine refcount
+ * of 65535 is indistinguishable from an error, and a signed "< 0"
+ * check in a caller can never fire.  Consider an int return type.
+ */
+static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx)
+{
+	struct qcow_refcount_block *rfb = NULL;
+	struct qcow_header *header = q->header;
+	u64 rfb_idx;
+
+	rfb = qcow_read_refcount_block(q, clust_idx);
+	if (PTR_ERR(rfb) == -ENOSPC)
+		return 0;
+	else if (IS_ERR_OR_NULL(rfb)) {
+		pr_warning("Error while reading refcount table");
+		return -1;
+	}
+
+	/* index of the cluster's counter within the refcount block */
+	rfb_idx = clust_idx & (((1ULL <<
+		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+
+	if (rfb_idx >= rfb->size) {
+		pr_warning("L1: refcount block index out of bounds");
+		return -1;
+	}
+
+	return be16_to_cpu(rfb->entries[rfb_idx]);
+}
+
+/*
+ * Add @append (may be negative, e.g. -1 to free) to the refcount of
+ * cluster @clust_idx, growing the refcount structures on demand, and
+ * sync the modified refcount block to disk.  Returns 0 or -1.
+ */
+static int update_cluster_refcount(struct qcow *q, u64 clust_idx, u16 append)
+{
+	struct qcow_refcount_block *rfb = NULL;
+	struct qcow_header *header = q->header;
+	u16 refcount;
+	u64 rfb_idx;
+
+	rfb = qcow_read_refcount_block(q, clust_idx);
+	if (PTR_ERR(rfb) == -ENOSPC) {
+		/* table does not cover this cluster yet — grow it */
+		rfb = qcow_grow_refcount_block(q, clust_idx);
+		if (!rfb) {
+			pr_warning("error while growing refcount table");
+			return -1;
+		}
+	} else if (IS_ERR_OR_NULL(rfb)) {
+		pr_warning("error while reading refcount table");
+		return -1;
+	}
+
+	rfb_idx = clust_idx & (((1ULL <<
+		(header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+	if (rfb_idx >= rfb->size) {
+		pr_warning("refcount block index out of bounds");
+		return -1;
+	}
+
+	refcount = be16_to_cpu(rfb->entries[rfb_idx]) + append;
+	rfb->entries[rfb_idx] = cpu_to_be16(refcount);
+	rfb->dirty = 1;
+
+	/* write refcount block */
+	if (write_refcount_block(q, rfb) < 0) {
+		/* was a copy-pasted "index out of bounds" message */
+		pr_warning("error while writing refcount block");
+		return -1;
+	}
+
+	/* update free_clust_idx since refcount becomes zero */
+	if (!refcount && clust_idx < q->free_clust_idx)
+		q->free_clust_idx = clust_idx;
+
+	return 0;
+}
+
+/*
+ * Drop one reference from every cluster overlapped by the byte range
+ * [clust_start, clust_start + size).
+ */
+static void qcow_free_clusters(struct qcow *q, u64 clust_start, u64 size)
+{
+	u64 mask = ~(q->cluster_size - 1);
+	u64 first = clust_start & mask;
+	u64 last = (clust_start + size - 1) & mask;
+	u64 pos;
+
+	for (pos = first; pos <= last; pos += q->cluster_size)
+		update_cluster_refcount(q, pos >> q->header->cluster_bits, -1);
+}
+
+/*
+ * Allocate clusters according to the size. Find a position that
+ * can satisfy the size. free_clust_idx is initialized to zero and
+ * records the last scan position.  Returns the byte offset of the
+ * first allocated cluster, or (u64)-1 on failure.
+ */
+static u64 qcow_alloc_clusters(struct qcow *q, u64 size, int update_ref)
+{
+	struct qcow_header *header = q->header;
+	u16 clust_refcount;
+	u32 clust_idx = 0, i;
+	u64 clust_num;
+
+	/* number of whole clusters needed to hold @size bytes */
+	clust_num = (size + (q->cluster_size - 1)) >> header->cluster_bits;
+
+again:
+	/* scan for clust_num consecutive free clusters */
+	for (i = 0; i < clust_num; i++) {
+		clust_idx = q->free_clust_idx++;
+		clust_refcount = qcow_get_refcount(q, clust_idx);
+		/*
+		 * clust_refcount is unsigned, so the old "< 0" error check
+		 * was dead code; qcow_get_refcount()'s "return -1" error
+		 * arrives here as (u16)-1.
+		 */
+		if (clust_refcount == (u16)-1)
+			return -1;
+		else if (clust_refcount > 0)
+			goto again;
+	}
+
+	clust_idx++;
+
+	if (update_ref)
+		for (i = 0; i < clust_num; i++)
+			if (update_cluster_refcount(q,
+				clust_idx - clust_num + i, 1))
+				return -1;
+
+	return (clust_idx - clust_num) << header->cluster_bits;
+}
+
+/* Sync the in-memory L1 table to its on-disk location. */
+static int qcow_write_l1_table(struct qcow *q)
+{
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_header *header = q->header;
+	size_t len = l1t->table_size * sizeof(u64);
+
+	if (qcow_pwrite_sync(q->fd, l1t->l1_table, len,
+			header->l1_table_offset) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Get l2 table. If the table has been copied, read table directly.
+ * If the table exists, allocate a new cluster and copy the table
+ * to the new cluster.
+ */
+static int get_cluster_table(struct qcow *q, u64 offset,
+	struct qcow_l2_table **result_l2t, u64 *result_l2_index)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l1_table *l1t = &q->table;
+	struct qcow_l2_table *l2t;
+	struct qcow_l2_table *old_l2t;
+	u64 l1t_idx;
+	u64 l2t_offset;
+	u64 l2t_idx;
+	u64 l2t_size;
+	u64 l2t_new_offset;
+
+	l2t_size = 1 << header->l2_bits;
+
+	l1t_idx = get_l1_index(q, offset);
+	if (l1t_idx >= l1t->table_size)
+		return -1;
+
+	l2t_idx = get_l2_index(q, offset);
+	if (l2t_idx >= l2t_size)
+		return -1;
+
+	l2t_offset = be64_to_cpu(l1t->l1_table[l1t_idx]);
+	if (l2t_offset & QCOW2_OFLAG_COPIED) {
+		/* table already private to this image — use it in place */
+		l2t_offset &= ~QCOW2_OFLAG_COPIED;
+		l2t = qcow_read_l2_table(q, l2t_offset);
+		if (!l2t)
+			goto error;
+	} else {
+		/*
+		 * Copy-on-write the L2 table into a fresh cluster.  Note
+		 * qcow_alloc_clusters() returns (u64)-1 on failure; the
+		 * old "< 0" check could never fire on an unsigned value.
+		 */
+		l2t_new_offset = qcow_alloc_clusters(q,
+			l2t_size*sizeof(u64), 1);
+
+		if (l2t_new_offset == (u64)-1)
+			goto error;
+
+		l2t = new_cache_table(q, l2t_new_offset);
+		if (!l2t)
+			goto free_cluster;
+
+		if (l2t_offset) {
+			/*
+			 * Copy the existing table's contents into the new
+			 * cluster.  The old code overwrote the l2t pointer
+			 * with the cache-owned old table, leaking the new
+			 * one and never moving the data to l2t_new_offset.
+			 */
+			old_l2t = qcow_read_l2_table(q, l2t_offset);
+			if (!old_l2t)
+				goto free_cache;
+			memcpy(l2t->table, old_l2t->table,
+				l2t_size * sizeof(u64));
+		} else
+			memset(l2t->table, 0x00, l2t_size * sizeof(u64));
+
+		/* write l2 table */
+		l2t->dirty = 1;
+		if (qcow_l2_cache_write(q, l2t) < 0)
+			goto free_cache;
+
+		/* cache l2 table */
+		if (cache_table(q, l2t))
+			goto free_cache;
+
+		/* update the l1 table */
+		l1t->l1_table[l1t_idx] = cpu_to_be64(l2t_new_offset
+			| QCOW2_OFLAG_COPIED);
+		if (qcow_write_l1_table(q)) {
+			pr_warning("Update l1 table error");
+			/* NOTE(review): l2t is already cached here; freeing
+			 * it leaves a dangling cache entry — pre-existing. */
+			goto free_cache;
+		}
+
+		/* free old cluster */
+		qcow_free_clusters(q, l2t_offset, q->cluster_size);
+	}
+
+	*result_l2t = l2t;
+	*result_l2_index = l2t_idx;
+
+	return 0;
+
+free_cache:
+	free(l2t);
+
+free_cluster:
+	qcow_free_clusters(q, l2t_new_offset, q->cluster_size);
+
+error:
+	return -1;
+}
+
+/*
+ * If the cluster has been copied, write data directly. If not,
+ * read the original data and write it to the new cluster with
+ * modification.  Returns the number of bytes written (at most one
+ * cluster's worth), or -1 on failure.
+ */
+static ssize_t qcow_write_cluster(struct qcow *q, u64 offset,
+		void *buf, u32 src_len)
+{
+	struct qcow_l2_table *l2t;
+	u64 clust_new_start;
+	u64 clust_start;
+	u64 clust_flags;
+	u64 clust_off;
+	u64 l2t_idx;
+	u64 len;
+
+	l2t = NULL;
+
+	clust_off = get_cluster_offset(q, offset);
+	if (clust_off >= q->cluster_size)
+		return -1;
+
+	/* clamp the write to the end of this cluster */
+	len = q->cluster_size - clust_off;
+	if (len > src_len)
+		len = src_len;
+
+	mutex_lock(&q->mutex);
+
+	if (get_cluster_table(q, offset, &l2t, &l2t_idx)) {
+		pr_warning("Get l2 table error");
+		goto error;
+	}
+
+	clust_start = be64_to_cpu(l2t->table[l2t_idx]);
+	clust_flags = clust_start & QCOW2_OFLAGS_MASK;
+
+	clust_start &= QCOW2_OFFSET_MASK;
+	if (!(clust_flags & QCOW2_OFLAG_COPIED)) {
+		/*
+		 * Copy-on-write path.  qcow_alloc_clusters() returns
+		 * (u64)-1 on failure; the old "< 0" check could never
+		 * fire on an unsigned value.
+		 */
+		clust_new_start = qcow_alloc_clusters(q, q->cluster_size, 1);
+		if (clust_new_start == (u64)-1) {
+			pr_warning("Cluster alloc error");
+			goto error;
+		}
+
+		offset &= ~(q->cluster_size - 1);
+
+		/* if clust_start is not zero, read the original data*/
+		if (clust_start) {
+			/* drop the lock — presumably qcow2_read_cluster()
+			 * takes q->mutex itself; TODO confirm */
+			mutex_unlock(&q->mutex);
+			if (qcow2_read_cluster(q, offset, q->copy_buff,
+				q->cluster_size) < 0) {
+				pr_warning("Read copy cluster error");
+				qcow_free_clusters(q, clust_new_start,
+					q->cluster_size);
+				return -1;
+			}
+			mutex_lock(&q->mutex);
+		} else
+			memset(q->copy_buff, 0x00, q->cluster_size);
+
+		memcpy(q->copy_buff + clust_off, buf, len);
+
+		/* Write actual data */
+		if (pwrite_in_full(q->fd, q->copy_buff, q->cluster_size,
+			clust_new_start) < 0)
+			goto free_cluster;
+
+		/* update l2 table*/
+		l2t->table[l2t_idx] = cpu_to_be64(clust_new_start
+			| QCOW2_OFLAG_COPIED);
+		l2t->dirty = 1;
+
+		if (qcow_l2_cache_write(q, l2t))
+			goto free_cluster;
+
+		/* free old cluster*/
+		if (clust_flags & QCOW2_OFLAG_COMPRESSED) {
+			int size;
+			/* compressed cluster: decode its sector count */
+			size = ((clust_start >> q->csize_shift) &
+				q->csize_mask) + 1;
+			size *= 512;
+			clust_start &= q->cluster_offset_mask;
+			clust_start &= ~511;
+
+			qcow_free_clusters(q, clust_start, size);
+		} else if (clust_start)
+			qcow_free_clusters(q, clust_start, q->cluster_size);
+
+	} else {
+		/* Write actual data */
+		if (pwrite_in_full(q->fd, buf, len,
+			clust_start + clust_off) < 0)
+			goto error;
+	}
+	mutex_unlock(&q->mutex);
+	return len;
+
+free_cluster:
+	qcow_free_clusters(q, clust_new_start, q->cluster_size);
+
+error:
+	mutex_unlock(&q->mutex);
+	return -1;
+}
+
+/*
+ * Write @src_len bytes starting at @sector, looping cluster by cluster
+ * until everything is written.  Returns bytes written, or -1 on error
+ * or when the write would run past the virtual disk size.
+ */
+static ssize_t qcow_write_sector_single(struct disk_image *disk, u64 sector, void *src, u32 src_len)
+{
+	struct qcow *q = disk->priv;
+	struct qcow_header *header = q->header;
+	u32 nr_written;
+	char *buf;
+	u64 offset;
+	ssize_t nr;
+
+	buf = src;
+	nr_written = 0;
+	offset = sector << SECTOR_SHIFT;
+
+	while (nr_written < src_len) {
+		/* refuse to write past the end of the virtual disk */
+		if (offset >= header->size)
+			return -1;
+
+		/* qcow_write_cluster() writes at most one cluster per call */
+		nr = qcow_write_cluster(q, offset, buf, src_len - nr_written);
+		if (nr < 0)
+			return -1;
+
+		nr_written += nr;
+		buf += nr;
+		offset += nr;
+	}
+
+	return nr_written;
+}
+
+/*
+ * Write each iovec element in turn starting at @sector.  Fails unless
+ * every element is written completely; returns total bytes written.
+ */
+static ssize_t qcow_write_sector(struct disk_image *disk, u64 sector,
+		const struct iovec *iov, int iovcount, void *param)
+{
+	ssize_t total = 0;
+	int i;
+
+	for (i = 0; i < iovcount; i++) {
+		ssize_t nr;
+
+		nr = qcow_write_sector_single(disk, sector, iov[i].iov_base, iov[i].iov_len);
+		if (nr != (ssize_t)iov[i].iov_len) {
+			pr_info("qcow_write_sector error: nr=%ld iov_len=%ld\n", (long)nr, (long)iov[i].iov_len);
+			return -1;
+		}
+
+		sector += iov[i].iov_len >> SECTOR_SHIFT;
+		total += nr;
+	}
+
+	return total;
+}
+
+/*
+ * Flush all dirty metadata — cached refcount blocks, cached L2 tables
+ * and the L1 table — then fsync the backing file.
+ */
+static int qcow_disk_flush(struct disk_image *disk)
+{
+	struct qcow *q = disk->priv;
+	struct qcow_refcount_table *rft;
+	struct list_head *pos, *n;
+	struct qcow_l1_table *l1t;
+
+	l1t = &q->table;
+	rft = &q->refcount_table;
+
+	mutex_lock(&q->mutex);
+
+	list_for_each_safe(pos, n, &rft->lru_list) {
+		struct qcow_refcount_block *c = list_entry(pos, struct qcow_refcount_block, list);
+
+		if (write_refcount_block(q, c) < 0)
+			goto error_unlock;
+	}
+
+	list_for_each_safe(pos, n, &l1t->lru_list) {
+		struct qcow_l2_table *c = list_entry(pos, struct qcow_l2_table, list);
+
+		if (qcow_l2_cache_write(q, c) < 0)
+			goto error_unlock;
+	}
+
+	/*
+	 * Bug fix: the function was compared instead of called
+	 * ("qcow_write_l1_table < 0"), so the condition was always false
+	 * and the L1 table was never flushed here.
+	 */
+	if (qcow_write_l1_table(q) < 0)
+		goto error_unlock;
+
+	mutex_unlock(&q->mutex);
+
+	return fsync(disk->fd);
+
+error_unlock:
+	mutex_unlock(&q->mutex);
+	return -1;
+}
+
+/*
+ * Release everything owned by a qcow disk image.  The caches are torn
+ * down before the tables/buffers they reference are freed.
+ */
+static int qcow_disk_close(struct disk_image *disk)
+{
+	struct qcow *q;
+
+	if (!disk)
+		return 0;
+
+	q = disk->priv;
+
+	refcount_table_free_cache(&q->refcount_table);
+	l1_table_free_cache(&q->table);
+	free(q->copy_buff);
+	free(q->cluster_data);
+	free(q->cluster_cache);
+	free(q->refcount_table.rf_table);
+	free(q->table.l1_table);
+	free(q->header);
+	free(q);
+
+	return 0;
+}
+
+/* Read-only images: no write/flush callbacks are wired up. */
+static struct disk_image_operations qcow_disk_readonly_ops = {
+	.read	= qcow_read_sector,
+	.close	= qcow_disk_close,
+};
+
+/* Writable images: full read/write/flush/close operation set. */
+static struct disk_image_operations qcow_disk_ops = {
+	.read	= qcow_read_sector,
+	.write	= qcow_write_sector,
+	.flush	= qcow_disk_flush,
+	.close	= qcow_disk_close,
+};
+
+/*
+ * Allocate the in-memory refcount table, initialize its cache
+ * structures and populate it from the image file.  Returns the
+ * pread_in_full() result, or -1 on allocation failure.
+ */
+static int qcow_read_refcount_table(struct qcow *q)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_refcount_table *rft = &q->refcount_table;
+
+	/* table size in u64 entries, spanning refcount_table_size clusters */
+	rft->rf_size = (header->refcount_table_size * q->cluster_size)
+		/ sizeof(u64);
+
+	rft->rf_table = calloc(rft->rf_size, sizeof(u64));
+	if (!rft->rf_table)
+		return -1;
+
+	rft->root = RB_ROOT;
+	INIT_LIST_HEAD(&rft->lru_list);
+
+	return pread_in_full(q->fd, rft->rf_table, sizeof(u64) * rft->rf_size, header->refcount_table_offset);
+}
+
+/* Sync the in-memory refcount table to its on-disk location. */
+static int qcow_write_refcount_table(struct qcow *q)
+{
+	struct qcow_refcount_table *rft = &q->refcount_table;
+	size_t len = rft->rf_size * sizeof(u64);
+
+	return qcow_pwrite_sync(q->fd, rft->rf_table, len,
+			q->header->refcount_table_offset);
+}
+
+/* Allocate the L1 table and populate it from the image file. */
+static int qcow_read_l1_table(struct qcow *q)
+{
+	struct qcow_header *header = q->header;
+	struct qcow_l1_table *l1t = &q->table;
+	size_t len;
+
+	l1t->table_size = header->l1_size;
+	len = sizeof(u64) * l1t->table_size;
+
+	l1t->l1_table = calloc(l1t->table_size, sizeof(u64));
+	if (!l1t->l1_table)
+		return -1;
+
+	return pread_in_full(q->fd, l1t->l1_table, len, header->l1_table_offset);
+}
+
+/*
+ * Read the on-disk QCOW2 header, byte-swap it from big-endian and
+ * convert it into the internal struct qcow_header representation.
+ * Returns a malloc'ed header (caller frees), or NULL on error.
+ */
+static void *qcow2_read_header(int fd)
+{
+	struct qcow2_header_disk f_header;
+	struct qcow_header *header;
+
+	header = malloc(sizeof(struct qcow_header));
+	if (!header)
+		return NULL;
+
+	if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0) {
+		free(header);
+		return NULL;
+	}
+
+	/* all on-disk fields are big-endian */
+	be32_to_cpus(&f_header.magic);
+	be32_to_cpus(&f_header.version);
+	be64_to_cpus(&f_header.backing_file_offset);
+	be32_to_cpus(&f_header.backing_file_size);
+	be32_to_cpus(&f_header.cluster_bits);
+	be64_to_cpus(&f_header.size);
+	be32_to_cpus(&f_header.crypt_method);
+	be32_to_cpus(&f_header.l1_size);
+	be64_to_cpus(&f_header.l1_table_offset);
+	be64_to_cpus(&f_header.refcount_table_offset);
+	be32_to_cpus(&f_header.refcount_table_clusters);
+	be32_to_cpus(&f_header.nb_snapshots);
+	be64_to_cpus(&f_header.snapshots_offset);
+
+	*header = (struct qcow_header) {
+		.size			= f_header.size,
+		.l1_table_offset	= f_header.l1_table_offset,
+		.l1_size		= f_header.l1_size,
+		.cluster_bits		= f_header.cluster_bits,
+		/* in QCOW2 an L2 table always fills one cluster of u64s */
+		.l2_bits		= f_header.cluster_bits - 3,
+		.refcount_table_offset	= f_header.refcount_table_offset,
+		.refcount_table_size	= f_header.refcount_table_clusters,
+	};
+
+	return header;
+}
+
+/*
+ * Build a disk_image on top of a QCOW2 file: parse the header, set up
+ * the compressed-cluster decode parameters, allocate working buffers
+ * and load the L1/refcount tables.  Returns NULL on any failure.
+ */
+static struct disk_image *qcow2_probe(int fd, bool readonly)
+{
+	struct disk_image *disk_image;
+	struct qcow_l1_table *l1t;
+	struct qcow_header *h;
+	struct qcow *q;
+
+	q = calloc(1, sizeof(struct qcow));
+	if (!q)
+		return NULL;
+
+	mutex_init(&q->mutex);
+	q->fd = fd;
+
+	l1t = &q->table;
+
+	l1t->root = RB_ROOT;
+	INIT_LIST_HEAD(&l1t->lru_list);
+
+	h = q->header = qcow2_read_header(fd);
+	if (!h)
+		goto free_qcow;
+
+	q->version = QCOW2_VERSION;
+	/* parameters for decoding compressed cluster descriptors */
+	q->csize_shift = (62 - (q->header->cluster_bits - 8));
+	q->csize_mask = (1 << (q->header->cluster_bits - 8)) - 1;
+	q->cluster_offset_mask = (1LL << q->csize_shift) - 1;
+	q->cluster_size = 1 << q->header->cluster_bits;
+
+	q->copy_buff = malloc(q->cluster_size);
+	if (!q->copy_buff) {
+		pr_warning("copy buff malloc error");
+		goto free_header;
+	}
+
+	q->cluster_data = malloc(q->cluster_size);
+	if (!q->cluster_data) {
+		pr_warning("cluster data malloc error");
+		goto free_copy_buff;
+	}
+
+	q->cluster_cache = malloc(q->cluster_size);
+	if (!q->cluster_cache) {
+		pr_warning("cluster cache malloc error");
+		goto free_cluster_data;
+	}
+
+	if (qcow_read_l1_table(q) < 0)
+		goto free_cluster_cache;
+
+	if (qcow_read_refcount_table(q) < 0)
+		goto free_l1_table;
+
+	/*
+	 * Do not use mmap use read/write instead
+	 */
+	if (readonly)
+		disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR);
+	else
+		disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR);
+
+	if (IS_ERR_OR_NULL(disk_image))
+		goto free_refcount_table;
+
+	disk_image->async = 0;
+	disk_image->priv = q;
+
+	return disk_image;
+
+	/* free(NULL) is a no-op, so the old "if (x) free(x)" guards were
+	 * redundant */
+free_refcount_table:
+	free(q->refcount_table.rf_table);
+free_l1_table:
+	free(q->table.l1_table);
+free_cluster_cache:
+	free(q->cluster_cache);
+free_cluster_data:
+	free(q->cluster_data);
+free_copy_buff:
+	free(q->copy_buff);
+free_header:
+	free(q->header);
+free_qcow:
+	free(q);
+
+	return NULL;
+}
+
+/* Return true iff @fd starts with a valid QCOW2 header (magic + version). */
+static bool qcow2_check_image(int fd)
+{
+	struct qcow2_header_disk f_header;
+
+	if (pread_in_full(fd, &f_header, sizeof(struct qcow2_header_disk), 0) < 0)
+		return false;
+
+	be32_to_cpus(&f_header.magic);
+	be32_to_cpus(&f_header.version);
+
+	return f_header.magic == QCOW_MAGIC &&
+	       f_header.version == QCOW2_VERSION;
+}
+
+/*
+ * Read the on-disk QCOW1 header, byte-swap it from big-endian and
+ * convert it to the internal struct qcow_header.  Returns a malloc'ed
+ * header (caller frees), or NULL on error.
+ */
+static void *qcow1_read_header(int fd)
+{
+	struct qcow1_header_disk f_header;
+	struct qcow_header *header;
+
+	header = malloc(sizeof(struct qcow_header));
+	if (!header)
+		return NULL;
+
+	if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0) {
+		free(header);
+		return NULL;
+	}
+
+	/* cluster_bits/l2_bits are not swapped — presumably single-byte
+	 * fields in the QCOW1 layout; confirm against the header struct */
+	be32_to_cpus(&f_header.magic);
+	be32_to_cpus(&f_header.version);
+	be64_to_cpus(&f_header.backing_file_offset);
+	be32_to_cpus(&f_header.backing_file_size);
+	be32_to_cpus(&f_header.mtime);
+	be64_to_cpus(&f_header.size);
+	be32_to_cpus(&f_header.crypt_method);
+	be64_to_cpus(&f_header.l1_table_offset);
+
+	*header = (struct qcow_header) {
+		.size			= f_header.size,
+		.l1_table_offset	= f_header.l1_table_offset,
+		/* number of L1 entries needed to map the whole image */
+		.l1_size		= f_header.size / ((1 << f_header.l2_bits) * (1 << f_header.cluster_bits)),
+		.cluster_bits		= f_header.cluster_bits,
+		.l2_bits		= f_header.l2_bits,
+	};
+
+	return header;
+}
+
+/*
+ * Build a disk_image on top of a QCOW1 file: parse the header,
+ * allocate working buffers and load the L1 table.  Returns NULL on
+ * any failure.
+ */
+static struct disk_image *qcow1_probe(int fd, bool readonly)
+{
+	struct disk_image *disk_image;
+	struct qcow_l1_table *l1t;
+	struct qcow_header *h;
+	struct qcow *q;
+
+	q = calloc(1, sizeof(struct qcow));
+	if (!q)
+		return NULL;
+
+	mutex_init(&q->mutex);
+	q->fd = fd;
+
+	l1t = &q->table;
+
+	l1t->root = RB_ROOT;
+	INIT_LIST_HEAD(&l1t->lru_list);
+
+	h = q->header = qcow1_read_header(fd);
+	if (!h)
+		goto free_qcow;
+
+	q->version = QCOW1_VERSION;
+	q->cluster_size = 1 << q->header->cluster_bits;
+	q->cluster_offset_mask = (1LL << (63 - q->header->cluster_bits)) - 1;
+	q->free_clust_idx = 0;
+
+	q->cluster_data = malloc(q->cluster_size);
+	if (!q->cluster_data) {
+		pr_warning("cluster data malloc error");
+		goto free_header;
+	}
+
+	q->cluster_cache = malloc(q->cluster_size);
+	if (!q->cluster_cache) {
+		pr_warning("cluster cache malloc error");
+		goto free_cluster_data;
+	}
+
+	if (qcow_read_l1_table(q) < 0)
+		goto free_cluster_cache;
+
+	/*
+	 * Do not use mmap use read/write instead
+	 */
+	if (readonly)
+		disk_image = disk_image__new(fd, h->size, &qcow_disk_readonly_ops, DISK_IMAGE_REGULAR);
+	else
+		disk_image = disk_image__new(fd, h->size, &qcow_disk_ops, DISK_IMAGE_REGULAR);
+
+	/* disk_image__new() may return an ERR_PTR; check it like
+	 * qcow2_probe() does instead of a plain NULL test */
+	if (IS_ERR_OR_NULL(disk_image))
+		goto free_l1_table;
+
+	disk_image->async = 1;
+	disk_image->priv = q;
+
+	return disk_image;
+
+	/* free(NULL) is a no-op, so the old "if (x) free(x)" guards were
+	 * redundant */
+free_l1_table:
+	free(q->table.l1_table);
+free_cluster_cache:
+	free(q->cluster_cache);
+free_cluster_data:
+	free(q->cluster_data);
+free_header:
+	free(q->header);
+free_qcow:
+	free(q);
+
+	return NULL;
+}
+
+/* Return true iff @fd starts with a valid QCOW1 header (magic + version). */
+static bool qcow1_check_image(int fd)
+{
+	struct qcow1_header_disk f_header;
+
+	if (pread_in_full(fd, &f_header, sizeof(struct qcow1_header_disk), 0) < 0)
+		return false;
+
+	be32_to_cpus(&f_header.magic);
+	be32_to_cpus(&f_header.version);
+
+	return f_header.magic == QCOW_MAGIC &&
+	       f_header.version == QCOW1_VERSION;
+}
+
+/* Detect the image version on @fd and dispatch to the matching prober. */
+struct disk_image *qcow_probe(int fd, bool readonly)
+{
+	struct disk_image *disk = NULL;
+
+	if (qcow1_check_image(fd))
+		disk = qcow1_probe(fd, readonly);
+	else if (qcow2_check_image(fd))
+		disk = qcow2_probe(fd, readonly);
+
+	return disk;
+}
diff --git a/tools/kvm/disk/raw.c b/tools/kvm/disk/raw.c
new file mode 100644
index 0000000..93b2b4e
--- /dev/null
+++ b/tools/kvm/disk/raw.c
@@ -0,0 +1,141 @@
+#include "kvm/disk-image.h"
+
+#include <linux/err.h>
+
+#ifdef CONFIG_HAS_AIO
+#include <libaio.h>
+#endif
+
+/*
+ * Read scattered buffers starting at @sector.  Uses the async libaio
+ * path when compiled in (@param identifies the request, completion is
+ * signalled on disk->evt), otherwise a blocking preadv.
+ */
+ssize_t raw_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param)
+{
+	u64 offset = sector << SECTOR_SHIFT;
+
+#ifdef CONFIG_HAS_AIO
+	struct iocb iocb;
+
+	return aio_preadv(disk->ctx, &iocb, disk->fd, iov, iovcount, offset,
+			disk->evt, param);
+#else
+	return preadv_in_full(disk->fd, iov, iovcount, offset);
+#endif
+}
+
+/*
+ * Write scattered buffers starting at @sector.  Uses the async libaio
+ * path when compiled in (@param identifies the request, completion is
+ * signalled on disk->evt), otherwise a blocking pwritev.
+ */
+ssize_t raw_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param)
+{
+	u64 offset = sector << SECTOR_SHIFT;
+
+#ifdef CONFIG_HAS_AIO
+	struct iocb iocb;
+
+	return aio_pwritev(disk->ctx, &iocb, disk->fd, iov, iovcount, offset,
+			disk->evt, param);
+#else
+	return pwritev_in_full(disk->fd, iov, iovcount, offset);
+#endif
+}
+
+/*
+ * Read from an mmap'ed raw image: copy straight out of the mapping
+ * (disk->priv) into each iovec buffer.  Returns total bytes copied.
+ */
+ssize_t raw_image__read_mmap(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param)
+{
+	u64 pos = sector << SECTOR_SHIFT;
+	ssize_t copied = 0;
+	int i;
+
+	for (i = 0; i < iovcount; i++) {
+		memcpy(iov[i].iov_base, disk->priv + pos, iov[i].iov_len);
+
+		sector += iov[i].iov_len >> SECTOR_SHIFT;
+		pos += iov[i].iov_len;
+		copied += iov[i].iov_len;
+	}
+
+	return copied;
+}
+
+/*
+ * Write to an mmap'ed raw image: copy each iovec buffer straight into
+ * the mapping (disk->priv).  Returns total bytes copied.
+ */
+ssize_t raw_image__write_mmap(struct disk_image *disk, u64 sector, const struct iovec *iov,
+				int iovcount, void *param)
+{
+	u64 pos = sector << SECTOR_SHIFT;
+	ssize_t copied = 0;
+	int i;
+
+	for (i = 0; i < iovcount; i++) {
+		memcpy(disk->priv + pos, iov[i].iov_base, iov[i].iov_len);
+
+		sector += iov[i].iov_len >> SECTOR_SHIFT;
+		pos += iov[i].iov_len;
+		copied += iov[i].iov_len;
+	}
+
+	return copied;
+}
+
+/*
+ * Tear down a raw image: unmap it when it was mmap'ed, close the
+ * eventfd, and destroy the AIO context when AIO support is built in.
+ */
+int raw_image__close(struct disk_image *disk)
+{
+	int ret = 0;
+
+	if (disk->priv != MAP_FAILED)
+		ret = munmap(disk->priv, disk->size);
+
+	close(disk->evt);
+
+#ifdef CONFIG_HAS_AIO
+	/* io_destroy() is a libaio call; it was wrongly guarded by
+	 * CONFIG_HAS_VIRTIO, unlike the CONFIG_HAS_AIO guards used for
+	 * every other AIO call in this file */
+	io_destroy(disk->ctx);
+#endif
+
+	return ret;
+}
+
+/*
+ * multiple buffer based disk image operations
+ */
+/* writable raw image via read/write syscalls (or AIO) */
+static struct disk_image_operations raw_image_regular_ops = {
+	.read	= raw_image__read,
+	.write	= raw_image__write,
+};
+
+/* read-only image via MAP_PRIVATE mmap; writes stay in memory only */
+struct disk_image_operations ro_ops = {
+	.read	= raw_image__read_mmap,
+	.write	= raw_image__write_mmap,
+	.close	= raw_image__close,
+};
+
+/* read-only fallback with no write callback at all */
+struct disk_image_operations ro_ops_nowrite = {
+	.read	= raw_image__read,
+};
+
+/*
+ * Create a disk image for a raw file.  Read-only images are mapped
+ * MAP_PRIVATE so guest writes stay non-persistent; when mmap setup
+ * fails we fall back to a plain read-only image.  Writable images use
+ * regular read/write (async when AIO is available).
+ */
+struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly)
+{
+	/* the inner re-declaration of "disk" shadowed this one — removed */
+	struct disk_image *disk;
+
+	if (readonly) {
+		/*
+		 * Use mmap's MAP_PRIVATE to implement non-persistent write
+		 * FIXME: This does not work on 32-bit host.
+		 */
+		disk = disk_image__new(fd, st->st_size, &ro_ops, DISK_IMAGE_MMAP);
+		if (IS_ERR_OR_NULL(disk)) {
+			disk = disk_image__new(fd, st->st_size, &ro_ops_nowrite, DISK_IMAGE_REGULAR);
+#ifdef CONFIG_HAS_AIO
+			if (!IS_ERR_OR_NULL(disk))
+				disk->async = 1;
+#endif
+		}
+
+		return disk;
+	} else {
+		/*
+		 * Use read/write instead of mmap
+		 */
+		disk = disk_image__new(fd, st->st_size, &raw_image_regular_ops, DISK_IMAGE_REGULAR);
+#ifdef CONFIG_HAS_AIO
+		if (!IS_ERR_OR_NULL(disk))
+			disk->async = 1;
+#endif
+		return disk;
+	}
+}
diff --git a/tools/kvm/framebuffer.c b/tools/kvm/framebuffer.c
new file mode 100644
index 0000000..fb8f51d
--- /dev/null
+++ b/tools/kvm/framebuffer.c
@@ -0,0 +1,80 @@
+#include "kvm/framebuffer.h"
+#include "kvm/kvm.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <errno.h>
+
+static LIST_HEAD(framebuffers);
+
+/*
+ * Register @fb on the global framebuffer list.  Returns @fb so callers
+ * can chain the result.
+ */
+struct framebuffer *fb__register(struct framebuffer *fb)
+{
+	INIT_LIST_HEAD(&fb->node);
+	list_add(&fb->node, &framebuffers);
+
+	return fb;
+}
+
+/*
+ * Attach a render target to @fb.  Returns 0, or -ENOSPC when all
+ * FB_MAX_TARGETS slots are in use.
+ */
+int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops)
+{
+	if (fb->nr_targets < FB_MAX_TARGETS) {
+		fb->targets[fb->nr_targets++] = ops;
+		return 0;
+	}
+
+	return -ENOSPC;
+}
+
+/*
+ * Invoke the optional ->start() hook of every attached target,
+ * stopping at the first error.  Returns 0 or that error.
+ */
+static int start_targets(struct framebuffer *fb)
+{
+	unsigned long i;
+
+	for (i = 0; i < fb->nr_targets; i++) {
+		struct fb_target_operations *ops = fb->targets[i];
+
+		if (ops->start) {
+			int err = ops->start(fb);
+
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Start every registered framebuffer's targets; run automatically at
+ * firmware init time.  Returns the first target error, or 0.
+ */
+int fb__init(struct kvm *kvm)
+{
+	struct framebuffer *fb;
+
+	list_for_each_entry(fb, &framebuffers, node) {
+		int err;
+
+		err = start_targets(fb);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+firmware_init(fb__init);
+
+/*
+ * Stop every registered framebuffer's targets and unmap the
+ * framebuffer memory; run automatically at firmware exit time.
+ */
+int fb__exit(struct kvm *kvm)
+{
+	struct framebuffer *fb;
+
+	list_for_each_entry(fb, &framebuffers, node) {
+		u32 i;
+
+		for (i = 0; i < fb->nr_targets; i++)
+			if (fb->targets[i]->stop)
+				fb->targets[i]->stop(fb);
+
+		/* NOTE(review): munmap return value ignored; presumably
+		 * acceptable at teardown — confirm */
+		munmap(fb->mem, fb->mem_size);
+	}
+
+	return 0;
+}
+firmware_exit(fb__exit);
diff --git a/tools/kvm/guest/init.c b/tools/kvm/guest/init.c
new file mode 100644
index 0000000..8c49a03
--- /dev/null
+++ b/tools/kvm/guest/init.c
@@ -0,0 +1,76 @@
+/*
+ * This is a simple init for shared rootfs guests. This part should be limited
+ * to doing mounts and running stage 2 of the init process.
+ */
+#include <sys/mount.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include <linux/reboot.h>
+
+/*
+ * Replace this process with @filename, running it with a fixed
+ * environment.  Only returns (execve's error) on failure.
+ */
+static int run_process(char *filename)
+{
+	char *argv[] = { filename, NULL };
+	char *envp[] = { "TERM=linux", "DISPLAY=192.168.33.1:0",
+				"HOME=/virt/home", NULL };
+
+	return execve(filename, argv, envp);
+}
+
+/*
+ * Replace this process with @filename running /virt/sandbox.sh as its
+ * argument.  Only returns (execve's error) on failure.
+ */
+static int run_process_sandbox(char *filename)
+{
+	char *argv[] = { filename, "/virt/sandbox.sh", NULL };
+	char *envp[] = { "TERM=linux", "HOME=/virt/home", NULL };
+
+	return execve(filename, argv, envp);
+}
+
+/*
+ * Mount the pseudo-filesystems and the 9p-shared host rootfs the guest
+ * needs before stage-2 init runs.  All errors are deliberately ignored
+ * (best effort).
+ *
+ * NOTE(review): mkdir() is used but no <sys/stat.h> include is visible
+ * at the top of this file — confirm the include list compiles cleanly.
+ */
+static void do_mounts(void)
+{
+	mount("hostfs", "/host", "9p", MS_RDONLY, "trans=virtio,version=9p2000.L");
+	mount("", "/sys", "sysfs", 0, NULL);
+	mount("proc", "/proc", "proc", 0, NULL);
+	mount("devtmpfs", "/dev", "devtmpfs", 0, NULL);
+	mkdir("/dev/pts", 0755);
+	mount("devpts", "/dev/pts", "devpts", 0, NULL);
+}
+
+/*
+ * Guest stage-1 init: mount filesystems, become session leader, spawn
+ * the stage-2 shell (sandbox script when present), wait for it, then
+ * reboot the VM.
+ *
+ * NOTE(review): ioctl()/waitpid()/TIOCSCTTY are used but the matching
+ * headers (<sys/ioctl.h>, <sys/wait.h>) are not in the visible include
+ * list — confirm this compiles without implicit declarations.
+ */
+int main(int argc, char *argv[])
+{
+	pid_t child;
+	int status;
+
+	puts("Mounting...");
+
+	do_mounts();
+
+	/* get session leader */
+	setsid();
+
+	/* set controlling terminal */
+	ioctl(0, TIOCSCTTY, 1);
+
+	child = fork();
+	if (child < 0) {
+		printf("Fatal: fork() failed with %d\n", child);
+		return 0;
+	} else if (child == 0) {
+		/* child: exec stage 2; the sandbox script takes priority */
+		if (access("/virt/sandbox.sh", R_OK) == 0)
+			run_process_sandbox("/bin/sh");
+		else
+			run_process("/bin/sh");
+	} else {
+		pid_t corpse;
+
+		/* reap children until our stage-2 shell itself exits */
+		do {
+			corpse = waitpid(-1, &status, 0);
+		} while (corpse != child);
+	}
+
+	reboot(LINUX_REBOOT_CMD_RESTART);
+
+	/* only reached if reboot() itself failed */
+	printf("Init failed: %s\n", strerror(errno));
+
+	return 0;
+}
diff --git a/tools/kvm/guest_compat.c b/tools/kvm/guest_compat.c
new file mode 100644
index 0000000..fd4704b
--- /dev/null
+++ b/tools/kvm/guest_compat.c
@@ -0,0 +1,99 @@
+#include "kvm/guest_compat.h"
+
+#include "kvm/mutex.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+/* One queued compatibility warning; owns its title/desc strings. */
+struct compat_message {
+	int id;		/* handle returned by compat__add_message() */
+	char *title;
+	char *desc;
+
+	struct list_head list;
+};
+
+static int id;	/* next message id to hand out; protected by compat_mtx */
+static DEFINE_MUTEX(compat_mtx);
+static LIST_HEAD(messages);	/* FIFO of pending messages */
+
+/* Release a message and the strings it owns (NULL strings are fine). */
+static void compat__free(struct compat_message *msg)
+{
+	free(msg->title);
+	free(msg->desc);
+	free(msg);
+}
+
+/*
+ * Queue a compatibility warning (both strings are copied).  Returns
+ * the new message's id for compat__remove_message(), or -ENOMEM when
+ * the message or either string copy cannot be allocated.
+ */
+int compat__add_message(const char *title, const char *desc)
+{
+	struct compat_message *msg;
+	int msg_id;
+
+	msg = malloc(sizeof(*msg));
+	if (msg == NULL)
+		goto cleanup;
+
+	msg->title = strdup(title);
+	msg->desc = strdup(desc);
+
+	/* compat__free() copes with one of the strdups having failed */
+	if (msg->title == NULL || msg->desc == NULL)
+		goto cleanup;
+
+	mutex_lock(&compat_mtx);
+
+	msg->id = msg_id = id++;
+	list_add_tail(&msg->list, &messages);
+
+	mutex_unlock(&compat_mtx);
+
+	return msg_id;
+
+cleanup:
+	if (msg)
+		compat__free(msg);
+
+	return -ENOMEM;
+}
+
+/*
+ * Remove the queued message with the given id.  Returns 0 on success,
+ * -ENOENT when no such message is pending.
+ *
+ * The parameter is named msg_id so it no longer shadows the file-scope
+ * "id" counter (callers are unaffected by a C parameter rename).
+ */
+int compat__remove_message(int msg_id)
+{
+	struct compat_message *pos, *n;
+
+	mutex_lock(&compat_mtx);
+
+	list_for_each_entry_safe(pos, n, &messages, list) {
+		if (pos->id == msg_id) {
+			list_del(&pos->list);
+			compat__free(pos);
+
+			mutex_unlock(&compat_mtx);
+
+			return 0;
+		}
+	}
+
+	mutex_unlock(&compat_mtx);
+
+	return -ENOENT;
+}
+
+/*
+ * Drain the queue, printing every pending compatibility warning in
+ * FIFO order.  Always returns 0.
+ */
+int compat__print_all_messages(void)
+{
+	mutex_lock(&compat_mtx);
+
+	while (!list_empty(&messages)) {
+		struct compat_message *msg =
+			list_first_entry(&messages, struct compat_message, list);
+
+		printf("\n # KVM compatibility warning.\n\t%s\n\t%s\n",
+			msg->title, msg->desc);
+
+		list_del(&msg->list);
+		compat__free(msg);
+	}
+
+	mutex_unlock(&compat_mtx);
+
+	return 0;
+}
diff --git a/tools/kvm/hw/i8042.c b/tools/kvm/hw/i8042.c
new file mode 100644
index 0000000..3801e20
--- /dev/null
+++ b/tools/kvm/hw/i8042.c
@@ -0,0 +1,363 @@
+#include "kvm/read-write.h"
+#include "kvm/ioport.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+#include "kvm/i8042.h"
+#include "kvm/kvm-cpu.h"
+
+#include <stdint.h>
+
+/*
+ * IRQs
+ */
+#define KBD_IRQ 1
+#define AUX_IRQ 12
+
+/*
+ * Registers
+ */
+#define I8042_DATA_REG 0x60
+#define I8042_PORT_B_REG 0x61
+#define I8042_COMMAND_REG 0x64
+
+/*
+ * Commands
+ */
+#define I8042_CMD_CTL_RCTR 0x20
+#define I8042_CMD_CTL_WCTR 0x60
+#define I8042_CMD_AUX_LOOP 0xD3
+#define I8042_CMD_AUX_SEND 0xD4
+#define I8042_CMD_AUX_TEST 0xA9
+#define I8042_CMD_AUX_DISABLE 0xA7
+#define I8042_CMD_AUX_ENABLE 0xA8
+#define I8042_CMD_SYSTEM_RESET 0xFE
+
+#define RESPONSE_ACK 0xFA
+
+#define MODE_DISABLE_AUX 0x20
+
+#define AUX_ENABLE_REPORTING 0x20
+#define AUX_SCALING_FLAG 0x10
+#define AUX_DEFAULT_RESOLUTION 0x2
+#define AUX_DEFAULT_SAMPLE 100
+
+/*
+ * Status register bits
+ */
+#define I8042_STR_AUXDATA 0x20
+#define I8042_STR_KEYLOCK 0x10
+#define I8042_STR_CMDDAT 0x08
+#define I8042_STR_MUXERR 0x04
+#define I8042_STR_OBF 0x01
+
+#define KBD_MODE_KBD_INT 0x01
+#define KBD_MODE_SYS 0x02
+
+#define QUEUE_SIZE 128
+
+/*
+ * This represents the current state of the PS/2 keyboard system,
+ * including the AUX device (the mouse)
+ */
+struct kbd_state {
+ struct kvm *kvm;
+
+ char kq[QUEUE_SIZE]; /* Keyboard queue */
+ int kread, kwrite; /* Indexes into the queue */
+ int kcount; /* number of elements in queue */
+
+ char mq[QUEUE_SIZE];
+ int mread, mwrite;
+ int mcount;
+
+ u8 mstatus; /* Mouse status byte */
+ u8 mres; /* Current mouse resolution */
+ u8 msample; /* Current mouse samples/second */
+
+ u8 mode; /* i8042 mode register */
+ u8 status; /* i8042 status register */
+ /*
+ * Some commands (on port 0x64) have arguments;
+ * we store the command here while we wait for the argument
+ */
+ u32 write_cmd;
+};
+
+static struct kbd_state state;
+
+/*
+ * If there are packets to be read, set the appropriate IRQs high
+ */
+static void kbd_update_irq(void)
+{
+ u8 klevel = 0;
+ u8 mlevel = 0;
+
+ /* First, clear the kbd and aux output buffer full bits */
+ state.status &= ~(I8042_STR_OBF | I8042_STR_AUXDATA);
+
+ if (state.kcount > 0) {
+ state.status |= I8042_STR_OBF;
+ klevel = 1;
+ }
+
+ /* Keyboard has higher priority than mouse */
+ if (klevel == 0 && state.mcount != 0) {
+ state.status |= I8042_STR_OBF | I8042_STR_AUXDATA;
+ mlevel = 1;
+ }
+
+ kvm__irq_line(state.kvm, KBD_IRQ, klevel);
+ kvm__irq_line(state.kvm, AUX_IRQ, mlevel);
+}
+
+/*
+ * Add a byte to the mouse queue, then set IRQs
+ */
+void mouse_queue(u8 c)
+{
+ if (state.mcount >= QUEUE_SIZE)
+ return;
+
+ state.mq[state.mwrite++ % QUEUE_SIZE] = c;
+
+ state.mcount++;
+ kbd_update_irq();
+}
+
+/*
+ * Add a byte to the keyboard queue, then set IRQs
+ */
+void kbd_queue(u8 c)
+{
+ if (state.kcount >= QUEUE_SIZE)
+ return;
+
+ state.kq[state.kwrite++ % QUEUE_SIZE] = c;
+
+ state.kcount++;
+ kbd_update_irq();
+}
+
+static void kbd_write_command(struct kvm *kvm, u8 val)
+{
+ switch (val) {
+ case I8042_CMD_CTL_RCTR:
+ kbd_queue(state.mode);
+ break;
+ case I8042_CMD_CTL_WCTR:
+ case I8042_CMD_AUX_SEND:
+ case I8042_CMD_AUX_LOOP:
+ state.write_cmd = val;
+ break;
+ case I8042_CMD_AUX_TEST:
+ /* 0 means we're a normal PS/2 mouse */
+ mouse_queue(0);
+ break;
+ case I8042_CMD_AUX_DISABLE:
+ state.mode |= MODE_DISABLE_AUX;
+ break;
+ case I8042_CMD_AUX_ENABLE:
+ state.mode &= ~MODE_DISABLE_AUX;
+ break;
+ case I8042_CMD_SYSTEM_RESET:
+ kvm_cpu__reboot(kvm);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * Called when the OS reads from port 0x60 (PS/2 data)
+ */
+static u32 kbd_read_data(void)
+{
+ u32 ret;
+ int i;
+
+ if (state.kcount != 0) {
+ /* Keyboard data gets read first */
+ ret = state.kq[state.kread++ % QUEUE_SIZE];
+ state.kcount--;
+ kvm__irq_line(state.kvm, KBD_IRQ, 0);
+ kbd_update_irq();
+ } else if (state.mcount > 0) {
+ /* Followed by the mouse */
+ ret = state.mq[state.mread++ % QUEUE_SIZE];
+ state.mcount--;
+ kvm__irq_line(state.kvm, AUX_IRQ, 0);
+ kbd_update_irq();
+ } else {
+ i = state.kread - 1;
+ if (i < 0)
+ i = QUEUE_SIZE;
+ ret = state.kq[i];
+ }
+ return ret;
+}
+
+/*
+ * Called when the OS read from port 0x64, the command port
+ */
+static u32 kbd_read_status(void)
+{
+ return (u32)state.status;
+}
+
+/*
+ * Called when the OS writes to port 0x60 (data port)
+ * Things written here are generally arguments to commands previously
+ * written to port 0x64 and stored in state.write_cmd
+ */
+static void kbd_write_data(u32 val)
+{
+ switch (state.write_cmd) {
+ case I8042_CMD_CTL_WCTR:
+ state.mode = val;
+ kbd_update_irq();
+ break;
+ case I8042_CMD_AUX_LOOP:
+ mouse_queue(val);
+ mouse_queue(RESPONSE_ACK);
+ break;
+ case I8042_CMD_AUX_SEND:
+ /* The OS wants to send a command to the mouse */
+ mouse_queue(RESPONSE_ACK);
+ switch (val) {
+ case 0xe6:
+ /* set scaling = 1:1 */
+ state.mstatus &= ~AUX_SCALING_FLAG;
+ break;
+ case 0xe8:
+ /* set resolution */
+ state.mres = val;
+ break;
+ case 0xe9:
+ /* Report mouse status/config */
+ mouse_queue(state.mstatus);
+ mouse_queue(state.mres);
+ mouse_queue(state.msample);
+ break;
+ case 0xf2:
+ /* send ID */
+ mouse_queue(0); /* normal mouse */
+ break;
+ case 0xf3:
+ /* set sample rate */
+ state.msample = val;
+ break;
+ case 0xf4:
+ /* enable reporting */
+ state.mstatus |= AUX_ENABLE_REPORTING;
+ break;
+ case 0xf5:
+ state.mstatus &= ~AUX_ENABLE_REPORTING;
+ break;
+ case 0xf6:
+ /* set defaults, just fall through to reset */
+ case 0xff:
+ /* reset */
+ state.mstatus = 0x0;
+ state.mres = AUX_DEFAULT_RESOLUTION;
+ state.msample = AUX_DEFAULT_SAMPLE;
+ break;
+ default:
+ break;
+ }
+ break;
+ case 0:
+ /* Just send the ID */
+ kbd_queue(RESPONSE_ACK);
+ kbd_queue(0xab);
+ kbd_queue(0x41);
+ kbd_update_irq();
+ break;
+ default:
+ /* Yeah whatever */
+ break;
+ }
+ state.write_cmd = 0;
+}
+
+static void kbd_reset(void)
+{
+ state = (struct kbd_state) {
+ .status = I8042_STR_MUXERR | I8042_STR_CMDDAT | I8042_STR_KEYLOCK, /* 0x1c */
+ .mode = KBD_MODE_KBD_INT | KBD_MODE_SYS, /* 0x3 */
+ .mres = AUX_DEFAULT_RESOLUTION,
+ .msample = AUX_DEFAULT_SAMPLE,
+ };
+}
+
+/*
+ * Called when the OS has written to one of the keyboard's ports (0x60 or 0x64)
+ */
+static bool kbd_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ switch (port) {
+ case I8042_COMMAND_REG: {
+ u8 value = kbd_read_status();
+ ioport__write8(data, value);
+ break;
+ }
+ case I8042_DATA_REG: {
+ u32 value = kbd_read_data();
+ ioport__write32(data, value);
+ break;
+ }
+ case I8042_PORT_B_REG: {
+ ioport__write8(data, 0x20);
+ break;
+ }
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static bool kbd_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ switch (port) {
+ case I8042_COMMAND_REG: {
+ u8 value = ioport__read8(data);
+ kbd_write_command(vcpu->kvm, value);
+ break;
+ }
+ case I8042_DATA_REG: {
+ u32 value = ioport__read32(data);
+ kbd_write_data(value);
+ break;
+ }
+ case I8042_PORT_B_REG: {
+ break;
+ }
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static struct ioport_operations kbd_ops = {
+ .io_in = kbd_in,
+ .io_out = kbd_out,
+};
+
+int kbd__init(struct kvm *kvm)
+{
+#ifndef CONFIG_X86
+ return 0;
+#endif
+
+ kbd_reset();
+ state.kvm = kvm;
+ ioport__register(kvm, I8042_DATA_REG, &kbd_ops, 2, NULL);
+ ioport__register(kvm, I8042_COMMAND_REG, &kbd_ops, 2, NULL);
+
+ return 0;
+}
+dev_init(kbd__init);
diff --git a/tools/kvm/hw/pci-shmem.c b/tools/kvm/hw/pci-shmem.c
new file mode 100644
index 0000000..a1c5ab7
--- /dev/null
+++ b/tools/kvm/hw/pci-shmem.c
@@ -0,0 +1,403 @@
+#include "kvm/devices.h"
+#include "kvm/pci-shmem.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/util.h"
+#include "kvm/ioport.h"
+#include "kvm/ioeventfd.h"
+
+#include <linux/kvm.h>
+#include <linux/byteorder.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+#define MB_SHIFT (20)
+#define KB_SHIFT (10)
+#define GB_SHIFT (30)
+
+static struct pci_device_header pci_shmem_pci_device = {
+ .vendor_id = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET),
+ .device_id = cpu_to_le16(0x1110),
+ .header_type = PCI_HEADER_TYPE_NORMAL,
+ .class[2] = 0xFF, /* misc pci device */
+ .status = cpu_to_le16(PCI_STATUS_CAP_LIST),
+ .capabilities = (void *)&pci_shmem_pci_device.msix - (void *)&pci_shmem_pci_device,
+ .msix.cap = PCI_CAP_ID_MSIX,
+ .msix.ctrl = cpu_to_le16(1),
+ .msix.table_offset = cpu_to_le32(1), /* Use BAR 1 */
+ .msix.pba_offset = cpu_to_le32(0x1001), /* Use BAR 1 */
+};
+
+static struct device_header pci_shmem_device = {
+ .bus_type = DEVICE_BUS_PCI,
+ .data = &pci_shmem_pci_device,
+};
+
+/* registers for the Inter-VM shared memory device */
+enum ivshmem_registers {
+ INTRMASK = 0,
+ INTRSTATUS = 4,
+ IVPOSITION = 8,
+ DOORBELL = 12,
+};
+
+static struct shmem_info *shmem_region;
+static u16 ivshmem_registers;
+static int local_fd;
+static u32 local_id;
+static u64 msix_block;
+static u64 msix_pba;
+static struct msix_table msix_table[2];
+
+int pci_shmem__register_mem(struct shmem_info *si)
+{
+ if (!shmem_region) {
+ shmem_region = si;
+ } else {
+ pr_warning("only single shmem currently avail. ignoring.\n");
+ free(si);
+ }
+ return 0;
+}
+
+static bool shmem_pci__io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ u16 offset = port - ivshmem_registers;
+
+ switch (offset) {
+ case INTRMASK:
+ break;
+ case INTRSTATUS:
+ break;
+ case IVPOSITION:
+ ioport__write32(data, local_id);
+ break;
+ case DOORBELL:
+ break;
+ };
+
+ return true;
+}
+
+static bool shmem_pci__io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ u16 offset = port - ivshmem_registers;
+
+ switch (offset) {
+ case INTRMASK:
+ break;
+ case INTRSTATUS:
+ break;
+ case IVPOSITION:
+ break;
+ case DOORBELL:
+ break;
+ };
+
+ return true;
+}
+
+static struct ioport_operations shmem_pci__io_ops = {
+ .io_in = shmem_pci__io_in,
+ .io_out = shmem_pci__io_out,
+};
+
+static void callback_mmio_msix(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr)
+{
+ void *mem;
+
+ if (addr - msix_block < 0x1000)
+ mem = &msix_table;
+ else
+ mem = &msix_pba;
+
+ if (is_write)
+ memcpy(mem + addr - msix_block, data, len);
+ else
+ memcpy(data, mem + addr - msix_block, len);
+}
+
+/*
+ * Return an irqfd which can be used by other guests to signal this guest
+ * whenever they need to poke it
+ */
+int pci_shmem__get_local_irqfd(struct kvm *kvm)
+{
+ int fd, gsi, r;
+ struct kvm_irqfd irqfd;
+
+ if (local_fd == 0) {
+ fd = eventfd(0, 0);
+ if (fd < 0)
+ return fd;
+
+ if (pci_shmem_pci_device.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_ENABLE)) {
+ gsi = irq__add_msix_route(kvm, &msix_table[0].msg);
+ } else {
+ gsi = pci_shmem_pci_device.irq_line;
+ }
+
+ irqfd = (struct kvm_irqfd) {
+ .fd = fd,
+ .gsi = gsi,
+ };
+
+ r = ioctl(kvm->vm_fd, KVM_IRQFD, &irqfd);
+ if (r < 0)
+ return r;
+
+ local_fd = fd;
+ }
+
+ return local_fd;
+}
+
+/*
+ * Connect a new client to ivshmem by adding the appropriate datamatch
+ * to the DOORBELL
+ */
+int pci_shmem__add_client(struct kvm *kvm, u32 id, int fd)
+{
+ struct kvm_ioeventfd ioevent;
+
+ ioevent = (struct kvm_ioeventfd) {
+ .addr = ivshmem_registers + DOORBELL,
+ .len = sizeof(u32),
+ .datamatch = id,
+ .fd = fd,
+ .flags = KVM_IOEVENTFD_FLAG_PIO | KVM_IOEVENTFD_FLAG_DATAMATCH,
+ };
+
+ return ioctl(kvm->vm_fd, KVM_IOEVENTFD, &ioevent);
+}
+
+/*
+ * Remove a client connected to ivshmem by removing the appropriate datamatch
+ * from the DOORBELL
+ */
+int pci_shmem__remove_client(struct kvm *kvm, u32 id)
+{
+ struct kvm_ioeventfd ioevent;
+
+ ioevent = (struct kvm_ioeventfd) {
+ .addr = ivshmem_registers + DOORBELL,
+ .len = sizeof(u32),
+ .datamatch = id,
+ .flags = KVM_IOEVENTFD_FLAG_PIO
+ | KVM_IOEVENTFD_FLAG_DATAMATCH
+ | KVM_IOEVENTFD_FLAG_DEASSIGN,
+ };
+
+ return ioctl(kvm->vm_fd, KVM_IOEVENTFD, &ioevent);
+}
+
+static void *setup_shmem(const char *key, size_t len, int creating)
+{
+ int fd;
+ int rtn;
+ void *mem;
+ int flag = O_RDWR;
+
+ if (creating)
+ flag |= O_CREAT;
+
+ fd = shm_open(key, flag, S_IRUSR | S_IWUSR);
+ if (fd < 0) {
+ pr_warning("Failed to open shared memory file %s\n", key);
+ return NULL;
+ }
+
+ if (creating) {
+ rtn = ftruncate(fd, (off_t) len);
+ if (rtn < 0)
+ pr_warning("Can't ftruncate(fd,%zu)\n", len);
+ }
+ mem = mmap(NULL, len,
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_NORESERVE, fd, 0);
+ if (mem == MAP_FAILED) {
+ pr_warning("Failed to mmap shared memory file");
+ mem = NULL;
+ }
+ close(fd);
+
+ return mem;
+}
+
+int shmem_parser(const struct option *opt, const char *arg, int unset)
+{
+ const u64 default_size = SHMEM_DEFAULT_SIZE;
+ const u64 default_phys_addr = SHMEM_DEFAULT_ADDR;
+ const char *default_handle = SHMEM_DEFAULT_HANDLE;
+ struct shmem_info *si = malloc(sizeof(struct shmem_info));
+ u64 phys_addr;
+ u64 size;
+ char *handle = NULL;
+ int create = 0;
+ const char *p = arg;
+ char *next;
+ int base = 10;
+ int verbose = 0;
+
+ const int skip_pci = strlen("pci:");
+ if (verbose)
+ pr_info("shmem_parser(%p,%s,%d)", opt, arg, unset);
+ /* parse out optional addr family */
+ if (strcasestr(p, "pci:")) {
+ p += skip_pci;
+ } else if (strcasestr(p, "mem:")) {
+ die("I can't add to E820 map yet.\n");
+ }
+ /* parse out physical addr */
+ base = 10;
+ if (strcasestr(p, "0x"))
+ base = 16;
+ phys_addr = strtoll(p, &next, base);
+ if (next == p && phys_addr == 0) {
+ pr_info("shmem: no physical addr specified, using default.");
+ phys_addr = default_phys_addr;
+ }
+ if (*next != ':' && *next != '\0')
+ die("shmem: unexpected chars after phys addr.\n");
+ if (*next == '\0')
+ p = next;
+ else
+ p = next + 1;
+ /* parse out size */
+ base = 10;
+ if (strcasestr(p, "0x"))
+ base = 16;
+ size = strtoll(p, &next, base);
+ if (next == p && size == 0) {
+ pr_info("shmem: no size specified, using default.");
+ size = default_size;
+ }
+ /* look for [KMGkmg][Bb]* uses base 2. */
+ int skip_B = 0;
+ if (strspn(next, "KMGkmg")) { /* might have a prefix */
+ if (*(next + 1) == 'B' || *(next + 1) == 'b')
+ skip_B = 1;
+ switch (*next) {
+ case 'K':
+ case 'k':
+ size = size << KB_SHIFT;
+ break;
+ case 'M':
+ case 'm':
+ size = size << MB_SHIFT;
+ break;
+ case 'G':
+ case 'g':
+ size = size << GB_SHIFT;
+ break;
+ default:
+ die("shmem: bug in detecting size prefix.");
+ break;
+ }
+ next += 1 + skip_B;
+ }
+ if (*next != ':' && *next != '\0') {
+ die("shmem: unexpected chars after phys size. <%c><%c>\n",
+ *next, *p);
+ }
+ if (*next == '\0')
+ p = next;
+ else
+ p = next + 1;
+ /* parse out optional shmem handle */
+ const int skip_handle = strlen("handle=");
+ next = strcasestr(p, "handle=");
+ if (*p && next) {
+ if (p != next)
+ die("unexpected chars before handle\n");
+ p += skip_handle;
+ next = strchrnul(p, ':');
+ if (next - p) {
+ handle = malloc(next - p + 1);
+ strncpy(handle, p, next - p);
+ handle[next - p] = '\0'; /* just in case. */
+ }
+ if (*next == '\0')
+ p = next;
+ else
+ p = next + 1;
+ }
+ /* parse optional create flag to see if we should create shm seg. */
+ if (*p && strcasestr(p, "create")) {
+ create = 1;
+ p += strlen("create");
+ }
+ if (*p != '\0')
+ die("shmem: unexpected trailing chars\n");
+ if (handle == NULL) {
+ handle = malloc(strlen(default_handle) + 1);
+ strcpy(handle, default_handle);
+ }
+ if (verbose) {
+ pr_info("shmem: phys_addr = %llx",
+ (unsigned long long)phys_addr);
+ pr_info("shmem: size = %llx", (unsigned long long)size);
+ pr_info("shmem: handle = %s", handle);
+ pr_info("shmem: create = %d", create);
+ }
+
+ si->phys_addr = phys_addr;
+ si->size = size;
+ si->handle = handle;
+ si->create = create;
+ pci_shmem__register_mem(si); /* ownership of si, etc. passed on. */
+ return 0;
+}
+
+int pci_shmem__init(struct kvm *kvm)
+{
+ char *mem;
+ int r;
+
+ if (shmem_region == NULL)
+ return 0;
+
+ /* Register MMIO space for MSI-X */
+ r = ioport__register(kvm, IOPORT_EMPTY, &shmem_pci__io_ops, IOPORT_SIZE, NULL);
+ if (r < 0)
+ return r;
+ ivshmem_registers = (u16)r;
+
+ msix_block = pci_get_io_space_block(0x1010);
+ kvm__register_mmio(kvm, msix_block, 0x1010, false, callback_mmio_msix, NULL);
+
+ /*
+ * This registers 3 BARs:
+ *
+ * 0 - ivshmem registers
+ * 1 - MSI-X MMIO space
+ * 2 - Shared memory block
+ */
+ pci_shmem_pci_device.bar[0] = cpu_to_le32(ivshmem_registers | PCI_BASE_ADDRESS_SPACE_IO);
+ pci_shmem_pci_device.bar_size[0] = shmem_region->size;
+ pci_shmem_pci_device.bar[1] = cpu_to_le32(msix_block | PCI_BASE_ADDRESS_SPACE_MEMORY);
+ pci_shmem_pci_device.bar_size[1] = 0x1010;
+ pci_shmem_pci_device.bar[2] = cpu_to_le32(shmem_region->phys_addr | PCI_BASE_ADDRESS_SPACE_MEMORY);
+ pci_shmem_pci_device.bar_size[2] = shmem_region->size;
+
+ device__register(&pci_shmem_device);
+
+ /* Open shared memory and plug it into the guest */
+ mem = setup_shmem(shmem_region->handle, shmem_region->size,
+ shmem_region->create);
+ if (mem == NULL)
+ return -EINVAL;
+
+ kvm__register_mem(kvm, shmem_region->phys_addr, shmem_region->size,
+ mem);
+ return 0;
+}
+dev_init(pci_shmem__init);
+
/* Teardown hook; nothing to release yet. */
int pci_shmem__exit(struct kvm *kvm)
{
	return 0;
}
diff --git a/tools/kvm/hw/rtc.c b/tools/kvm/hw/rtc.c
new file mode 100644
index 0000000..0649b5d
--- /dev/null
+++ b/tools/kvm/hw/rtc.c
@@ -0,0 +1,155 @@
+#include "kvm/rtc.h"
+
+#include "kvm/ioport.h"
+#include "kvm/kvm.h"
+
+#include <time.h>
+
+/*
+ * MC146818 RTC registers
+ */
+#define RTC_SECONDS 0x00
+#define RTC_SECONDS_ALARM 0x01
+#define RTC_MINUTES 0x02
+#define RTC_MINUTES_ALARM 0x03
+#define RTC_HOURS 0x04
+#define RTC_HOURS_ALARM 0x05
+#define RTC_DAY_OF_WEEK 0x06
+#define RTC_DAY_OF_MONTH 0x07
+#define RTC_MONTH 0x08
+#define RTC_YEAR 0x09
+#define RTC_CENTURY 0x32
+
+#define RTC_REG_A 0x0A
+#define RTC_REG_B 0x0B
+#define RTC_REG_C 0x0C
+#define RTC_REG_D 0x0D
+
+struct rtc_device {
+ u8 cmos_idx;
+ u8 cmos_data[128];
+};
+
+static struct rtc_device rtc;
+
/* Pack a binary value into BCD: tens digit in the high nibble. */
static inline unsigned char bin2bcd(unsigned val)
{
	return (unsigned char)(((val / 10) << 4) | (val % 10));
}
+
+static bool cmos_ram_data_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ struct tm *tm;
+ time_t ti;
+
+ time(&ti);
+
+ tm = gmtime(&ti);
+
+ switch (rtc.cmos_idx) {
+ case RTC_SECONDS:
+ ioport__write8(data, bin2bcd(tm->tm_sec));
+ break;
+ case RTC_MINUTES:
+ ioport__write8(data, bin2bcd(tm->tm_min));
+ break;
+ case RTC_HOURS:
+ ioport__write8(data, bin2bcd(tm->tm_hour));
+ break;
+ case RTC_DAY_OF_WEEK:
+ ioport__write8(data, bin2bcd(tm->tm_wday + 1));
+ break;
+ case RTC_DAY_OF_MONTH:
+ ioport__write8(data, bin2bcd(tm->tm_mday));
+ break;
+ case RTC_MONTH:
+ ioport__write8(data, bin2bcd(tm->tm_mon + 1));
+ break;
+ case RTC_YEAR: {
+ int year;
+
+ year = tm->tm_year + 1900;
+
+ ioport__write8(data, bin2bcd(year % 100));
+
+ break;
+ }
+ case RTC_CENTURY: {
+ int year;
+
+ year = tm->tm_year + 1900;
+
+ ioport__write8(data, bin2bcd(year / 100));
+
+ break;
+ }
+ default:
+ ioport__write8(data, rtc.cmos_data[rtc.cmos_idx]);
+ break;
+ }
+
+ return true;
+}
+
+static bool cmos_ram_data_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ switch (rtc.cmos_idx) {
+ case RTC_REG_C:
+ case RTC_REG_D:
+ /* Read-only */
+ break;
+ default:
+ rtc.cmos_data[rtc.cmos_idx] = ioport__read8(data);
+ break;
+ }
+
+ return true;
+}
+
+static struct ioport_operations cmos_ram_data_ioport_ops = {
+ .io_out = cmos_ram_data_out,
+ .io_in = cmos_ram_data_in,
+};
+
+static bool cmos_ram_index_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ u8 value = ioport__read8(data);
+
+ vcpu->kvm->nmi_disabled = value & (1UL << 7);
+ rtc.cmos_idx = value & ~(1UL << 7);
+
+ return true;
+}
+
+static struct ioport_operations cmos_ram_index_ioport_ops = {
+ .io_out = cmos_ram_index_out,
+};
+
+int rtc__init(struct kvm *kvm)
+{
+ int r = 0;
+
+ /* PORT 0070-007F - CMOS RAM/RTC (REAL TIME CLOCK) */
+ r = ioport__register(kvm, 0x0070, &cmos_ram_index_ioport_ops, 1, NULL);
+ if (r < 0)
+ return r;
+
+ r = ioport__register(kvm, 0x0071, &cmos_ram_data_ioport_ops, 1, NULL);
+ if (r < 0) {
+ ioport__unregister(kvm, 0x0071);
+ return r;
+ }
+
+ return r;
+}
+dev_init(rtc__init);
+
/* Unregister both CMOS/RTC ports. */
int rtc__exit(struct kvm *kvm)
{
	/* PORT 0070-007F - CMOS RAM/RTC (REAL TIME CLOCK) */
	ioport__unregister(kvm, 0x0070);
	ioport__unregister(kvm, 0x0071);

	return 0;
}
diff --git a/tools/kvm/hw/serial.c b/tools/kvm/hw/serial.c
new file mode 100644
index 0000000..60147de
--- /dev/null
+++ b/tools/kvm/hw/serial.c
@@ -0,0 +1,452 @@
+#include "kvm/8250-serial.h"
+
+#include "kvm/read-write.h"
+#include "kvm/ioport.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+#include "kvm/fdt.h"
+
+#include <linux/types.h>
+#include <linux/serial_reg.h>
+
+#include <pthread.h>
+
+/*
+ * This fakes a U6_16550A. The fifo len needs to be 64 as the kernel
+ * expects that for autodetection.
+ */
+#define FIFO_LEN 64
+#define FIFO_MASK (FIFO_LEN - 1)
+
+#define UART_IIR_TYPE_BITS 0xc0
+
+struct serial8250_device {
+ struct mutex mutex;
+ u8 id;
+
+ u16 iobase;
+ u8 irq;
+ u8 irq_state;
+ int txcnt;
+ int rxcnt;
+ int rxdone;
+ char txbuf[FIFO_LEN];
+ char rxbuf[FIFO_LEN];
+
+ u8 dll;
+ u8 dlm;
+ u8 iir;
+ u8 ier;
+ u8 fcr;
+ u8 lcr;
+ u8 mcr;
+ u8 lsr;
+ u8 msr;
+ u8 scr;
+};
+
+#define SERIAL_REGS_SETTING \
+ .iir = UART_IIR_NO_INT, \
+ .lsr = UART_LSR_TEMT | UART_LSR_THRE, \
+ .msr = UART_MSR_DCD | UART_MSR_DSR | UART_MSR_CTS, \
+ .mcr = UART_MCR_OUT2,
+
+static struct serial8250_device devices[] = {
+ /* ttyS0 */
+ [0] = {
+ .mutex = MUTEX_INITIALIZER,
+
+ .id = 0,
+ .iobase = 0x3f8,
+ .irq = 4,
+
+ SERIAL_REGS_SETTING
+ },
+ /* ttyS1 */
+ [1] = {
+ .mutex = MUTEX_INITIALIZER,
+
+ .id = 1,
+ .iobase = 0x2f8,
+ .irq = 3,
+
+ SERIAL_REGS_SETTING
+ },
+ /* ttyS2 */
+ [2] = {
+ .mutex = MUTEX_INITIALIZER,
+
+ .id = 2,
+ .iobase = 0x3e8,
+ .irq = 4,
+
+ SERIAL_REGS_SETTING
+ },
+ /* ttyS3 */
+ [3] = {
+ .mutex = MUTEX_INITIALIZER,
+
+ .id = 3,
+ .iobase = 0x2e8,
+ .irq = 3,
+
+ SERIAL_REGS_SETTING
+ },
+};
+
+static void serial8250_flush_tx(struct kvm *kvm, struct serial8250_device *dev)
+{
+ dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE;
+
+ if (dev->txcnt) {
+ term_putc(dev->txbuf, dev->txcnt, dev->id);
+ dev->txcnt = 0;
+ }
+}
+
+static void serial8250_update_irq(struct kvm *kvm, struct serial8250_device *dev)
+{
+ u8 iir = 0;
+
+ /* Handle clear rx */
+ if (dev->lcr & UART_FCR_CLEAR_RCVR) {
+ dev->lcr &= ~UART_FCR_CLEAR_RCVR;
+ dev->rxcnt = dev->rxdone = 0;
+ dev->lsr &= ~UART_LSR_DR;
+ }
+
+ /* Handle clear tx */
+ if (dev->lcr & UART_FCR_CLEAR_XMIT) {
+ dev->lcr &= ~UART_FCR_CLEAR_XMIT;
+ dev->txcnt = 0;
+ dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE;
+ }
+
+ /* Data ready and rcv interrupt enabled ? */
+ if ((dev->ier & UART_IER_RDI) && (dev->lsr & UART_LSR_DR))
+ iir |= UART_IIR_RDI;
+
+ /* Transmitter empty and interrupt enabled ? */
+ if ((dev->ier & UART_IER_THRI) && (dev->lsr & UART_LSR_TEMT))
+ iir |= UART_IIR_THRI;
+
+ /* Now update the irq line, if necessary */
+ if (!iir) {
+ dev->iir = UART_IIR_NO_INT;
+ if (dev->irq_state)
+ kvm__irq_line(kvm, dev->irq, 0);
+ } else {
+ dev->iir = iir;
+ if (!dev->irq_state)
+ kvm__irq_line(kvm, dev->irq, 1);
+ }
+ dev->irq_state = iir;
+
+ /*
+ * If the kernel disabled the tx interrupt, we know that there
+ * is nothing more to transmit, so we can reset our tx logic
+ * here.
+ */
+ if (!(dev->ier & UART_IER_THRI))
+ serial8250_flush_tx(kvm, dev);
+}
+
+#define SYSRQ_PENDING_NONE 0
+
+static int sysrq_pending;
+
+static void serial8250__sysrq(struct kvm *kvm, struct serial8250_device *dev)
+{
+ dev->lsr |= UART_LSR_DR | UART_LSR_BI;
+ dev->rxbuf[dev->rxcnt++] = sysrq_pending;
+ sysrq_pending = SYSRQ_PENDING_NONE;
+}
+
+static void serial8250__receive(struct kvm *kvm, struct serial8250_device *dev,
+ bool handle_sysrq)
+{
+ int c;
+
+ if (dev->mcr & UART_MCR_LOOP)
+ return;
+
+ if ((dev->lsr & UART_LSR_DR) || dev->rxcnt)
+ return;
+
+ if (handle_sysrq && sysrq_pending) {
+ serial8250__sysrq(kvm, dev);
+ return;
+ }
+
+ if (kvm->cfg.active_console != CONSOLE_8250)
+ return;
+
+ while (term_readable(dev->id) &&
+ dev->rxcnt < FIFO_LEN) {
+
+ c = term_getc(kvm, dev->id);
+
+ if (c < 0)
+ break;
+ dev->rxbuf[dev->rxcnt++] = c;
+ dev->lsr |= UART_LSR_DR;
+ }
+}
+
+void serial8250__update_consoles(struct kvm *kvm)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(devices); i++) {
+ struct serial8250_device *dev = &devices[i];
+
+ mutex_lock(&dev->mutex);
+
+ /* Restrict sysrq injection to the first port */
+ serial8250__receive(kvm, dev, i == 0);
+
+ serial8250_update_irq(kvm, dev);
+
+ mutex_unlock(&dev->mutex);
+ }
+}
+
+void serial8250__inject_sysrq(struct kvm *kvm, char sysrq)
+{
+ sysrq_pending = sysrq;
+}
+
+static bool serial8250_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port,
+ void *data, int size)
+{
+ struct serial8250_device *dev = ioport->priv;
+ u16 offset;
+ bool ret = true;
+ char *addr = data;
+
+ mutex_lock(&dev->mutex);
+
+ offset = port - dev->iobase;
+
+ switch (offset) {
+ case UART_TX:
+ if (dev->lcr & UART_LCR_DLAB) {
+ dev->dll = ioport__read8(data);
+ break;
+ }
+
+ /* Loopback mode */
+ if (dev->mcr & UART_MCR_LOOP) {
+ if (dev->rxcnt < FIFO_LEN) {
+ dev->rxbuf[dev->rxcnt++] = *addr;
+ dev->lsr |= UART_LSR_DR;
+ }
+ break;
+ }
+
+ if (dev->txcnt < FIFO_LEN) {
+ dev->txbuf[dev->txcnt++] = *addr;
+ dev->lsr &= ~UART_LSR_TEMT;
+ if (dev->txcnt == FIFO_LEN / 2)
+ dev->lsr &= ~UART_LSR_THRE;
+ serial8250_flush_tx(vcpu->kvm, dev);
+ } else {
+ /* Should never happpen */
+ dev->lsr &= ~(UART_LSR_TEMT | UART_LSR_THRE);
+ }
+ break;
+ case UART_IER:
+ if (!(dev->lcr & UART_LCR_DLAB))
+ dev->ier = ioport__read8(data) & 0x0f;
+ else
+ dev->dlm = ioport__read8(data);
+ break;
+ case UART_FCR:
+ dev->fcr = ioport__read8(data);
+ break;
+ case UART_LCR:
+ dev->lcr = ioport__read8(data);
+ break;
+ case UART_MCR:
+ dev->mcr = ioport__read8(data);
+ break;
+ case UART_LSR:
+ /* Factory test */
+ break;
+ case UART_MSR:
+ /* Not used */
+ break;
+ case UART_SCR:
+ dev->scr = ioport__read8(data);
+ break;
+ default:
+ ret = false;
+ break;
+ }
+
+ serial8250_update_irq(vcpu->kvm, dev);
+
+ mutex_unlock(&dev->mutex);
+
+ return ret;
+}
+
+static void serial8250_rx(struct serial8250_device *dev, void *data)
+{
+ if (dev->rxdone == dev->rxcnt)
+ return;
+
+ /* Break issued ? */
+ if (dev->lsr & UART_LSR_BI) {
+ dev->lsr &= ~UART_LSR_BI;
+ ioport__write8(data, 0);
+ return;
+ }
+
+ ioport__write8(data, dev->rxbuf[dev->rxdone++]);
+ if (dev->rxcnt == dev->rxdone) {
+ dev->lsr &= ~UART_LSR_DR;
+ dev->rxcnt = dev->rxdone = 0;
+ }
+}
+
+static bool serial8250_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ struct serial8250_device *dev = ioport->priv;
+ u16 offset;
+ bool ret = true;
+
+ mutex_lock(&dev->mutex);
+
+ offset = port - dev->iobase;
+
+ switch (offset) {
+ case UART_RX:
+ if (dev->lcr & UART_LCR_DLAB)
+ ioport__write8(data, dev->dll);
+ else
+ serial8250_rx(dev, data);
+ break;
+ case UART_IER:
+ if (dev->lcr & UART_LCR_DLAB)
+ ioport__write8(data, dev->dlm);
+ else
+ ioport__write8(data, dev->ier);
+ break;
+ case UART_IIR:
+ ioport__write8(data, dev->iir | UART_IIR_TYPE_BITS);
+ break;
+ case UART_LCR:
+ ioport__write8(data, dev->lcr);
+ break;
+ case UART_MCR:
+ ioport__write8(data, dev->mcr);
+ break;
+ case UART_LSR:
+ ioport__write8(data, dev->lsr);
+ break;
+ case UART_MSR:
+ ioport__write8(data, dev->msr);
+ break;
+ case UART_SCR:
+ ioport__write8(data, dev->scr);
+ break;
+ default:
+ ret = false;
+ break;
+ }
+
+ serial8250_update_irq(vcpu->kvm, dev);
+
+ mutex_unlock(&dev->mutex);
+
+ return ret;
+}
+
+#ifdef CONFIG_HAS_LIBFDT
+#define DEVICE_NAME_MAX_LEN 32
+static void serial8250_generate_fdt_node(struct ioport *ioport, void *fdt,
+ void (*generate_irq_prop)(void *fdt,
+ u8 irq))
+{
+ char dev_name[DEVICE_NAME_MAX_LEN];
+ struct serial8250_device *dev = ioport->priv;
+ u64 addr = KVM_IOPORT_AREA + dev->iobase;
+ u64 reg_prop[] = {
+ cpu_to_fdt64(addr),
+ cpu_to_fdt64(8),
+ };
+
+ snprintf(dev_name, DEVICE_NAME_MAX_LEN, "U6_16550A@%llx", addr);
+
+ _FDT(fdt_begin_node(fdt, dev_name));
+ _FDT(fdt_property_string(fdt, "compatible", "ns16550a"));
+ _FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop)));
+ generate_irq_prop(fdt, dev->irq);
+ _FDT(fdt_property_cell(fdt, "clock-frequency", 1843200));
+ _FDT(fdt_end_node(fdt));
+}
+#else
+#define serial8250_generate_fdt_node NULL
+#endif
+
+static struct ioport_operations serial8250_ops = {
+ .io_in = serial8250_in,
+ .io_out = serial8250_out,
+ .generate_fdt_node = serial8250_generate_fdt_node,
+};
+
+static int serial8250__device_init(struct kvm *kvm, struct serial8250_device *dev)
+{
+ int r;
+
+ ioport__map_irq(&dev->irq);
+ r = ioport__register(kvm, dev->iobase, &serial8250_ops, 8, dev);
+ kvm__irq_line(kvm, dev->irq, 0);
+
+ return r;
+}
+
+int serial8250__init(struct kvm *kvm)
+{
+ unsigned int i, j;
+ int r = 0;
+
+ for (i = 0; i < ARRAY_SIZE(devices); i++) {
+ struct serial8250_device *dev = &devices[i];
+
+ r = serial8250__device_init(kvm, dev);
+ if (r < 0)
+ goto cleanup;
+ }
+
+ return r;
+cleanup:
+ for (j = 0; j <= i; j++) {
+ struct serial8250_device *dev = &devices[j];
+
+ ioport__unregister(kvm, dev->iobase);
+ }
+
+ return r;
+}
+dev_init(serial8250__init);
+
+int serial8250__exit(struct kvm *kvm)
+{
+ unsigned int i;
+ int r;
+
+ for (i = 0; i < ARRAY_SIZE(devices); i++) {
+ struct serial8250_device *dev = &devices[i];
+
+ r = ioport__unregister(kvm, dev->iobase);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+dev_exit(serial8250__exit);
diff --git a/tools/kvm/hw/vesa.c b/tools/kvm/hw/vesa.c
new file mode 100644
index 0000000..a9a1d3e
--- /dev/null
+++ b/tools/kvm/hw/vesa.c
@@ -0,0 +1,88 @@
+#include "kvm/vesa.h"
+
+#include "kvm/devices.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/framebuffer.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+
+#include <linux/byteorder.h>
+#include <sys/mman.h>
+#include <linux/err.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <inttypes.h>
+#include <unistd.h>
+
+static bool vesa_pci_io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ return true;
+}
+
+static bool vesa_pci_io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ return true;
+}
+
+static struct ioport_operations vesa_io_ops = {
+ .io_in = vesa_pci_io_in,
+ .io_out = vesa_pci_io_out,
+};
+
+static struct pci_device_header vesa_pci_device = {
+ .vendor_id = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET),
+ .device_id = cpu_to_le16(PCI_DEVICE_ID_VESA),
+ .header_type = PCI_HEADER_TYPE_NORMAL,
+ .revision_id = 0,
+ .class[2] = 0x03,
+ .subsys_vendor_id = cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET),
+ .subsys_id = cpu_to_le16(PCI_SUBSYSTEM_ID_VESA),
+ .bar[1] = cpu_to_le32(VESA_MEM_ADDR | PCI_BASE_ADDRESS_SPACE_MEMORY),
+ .bar_size[1] = VESA_MEM_SIZE,
+};
+
+static struct device_header vesa_device = {
+ .bus_type = DEVICE_BUS_PCI,
+ .data = &vesa_pci_device,
+};
+
+static struct framebuffer vesafb;
+
+struct framebuffer *vesa__init(struct kvm *kvm)
+{
+ u16 vesa_base_addr;
+ char *mem;
+ int r;
+
+ if (!kvm->cfg.vnc && !kvm->cfg.sdl && !kvm->cfg.gtk)
+ return NULL;
+
+ r = ioport__register(kvm, IOPORT_EMPTY, &vesa_io_ops, IOPORT_SIZE, NULL);
+ if (r < 0)
+ return ERR_PTR(r);
+
+ vesa_base_addr = (u16)r;
+ vesa_pci_device.bar[0] = cpu_to_le32(vesa_base_addr | PCI_BASE_ADDRESS_SPACE_IO);
+ device__register(&vesa_device);
+
+ mem = mmap(NULL, VESA_MEM_SIZE, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
+ if (mem == MAP_FAILED)
+ ERR_PTR(-errno);
+
+ kvm__register_mem(kvm, VESA_MEM_ADDR, VESA_MEM_SIZE, mem);
+
+ vesafb = (struct framebuffer) {
+ .width = VESA_WIDTH,
+ .height = VESA_HEIGHT,
+ .depth = VESA_BPP,
+ .mem = mem,
+ .mem_addr = VESA_MEM_ADDR,
+ .mem_size = VESA_MEM_SIZE,
+ .kvm = kvm,
+ };
+ return fb__register(&vesafb);
+}
diff --git a/tools/kvm/include/asm/hweight.h b/tools/kvm/include/asm/hweight.h
new file mode 100644
index 0000000..1a43977
--- /dev/null
+++ b/tools/kvm/include/asm/hweight.h
@@ -0,0 +1,8 @@
+#ifndef _KVM_ASM_HWEIGHT_H_
+#define _KVM_ASM_HWEIGHT_H_
+
+#include <linux/types.h>
+unsigned int hweight32(unsigned int w);
+unsigned long hweight64(__u64 w);
+
+#endif /* _KVM_ASM_HWEIGHT_H_ */
diff --git a/tools/kvm/include/bios/memcpy.h b/tools/kvm/include/bios/memcpy.h
new file mode 100644
index 0000000..e021044
--- /dev/null
+++ b/tools/kvm/include/bios/memcpy.h
@@ -0,0 +1,9 @@
+#ifndef KVM_BIOS_MEMCPY_H
+#define KVM_BIOS_MEMCPY_H
+
+#include <linux/types.h>
+#include <stddef.h>
+
+void memcpy16(u16 dst_seg, void *dst, u16 src_seg, const void *src, size_t len);
+
+#endif /* KVM_BIOS_MEMCPY_H */
diff --git a/tools/kvm/include/kvm/8250-serial.h b/tools/kvm/include/kvm/8250-serial.h
new file mode 100644
index 0000000..e954551
--- /dev/null
+++ b/tools/kvm/include/kvm/8250-serial.h
@@ -0,0 +1,11 @@
+#ifndef KVM__8250_SERIAL_H
+#define KVM__8250_SERIAL_H
+
+struct kvm;
+
+int serial8250__init(struct kvm *kvm);
+int serial8250__exit(struct kvm *kvm);
+void serial8250__update_consoles(struct kvm *kvm);
+void serial8250__inject_sysrq(struct kvm *kvm, char sysrq);
+
+#endif /* KVM__8250_SERIAL_H */
diff --git a/tools/kvm/include/kvm/apic.h b/tools/kvm/include/kvm/apic.h
new file mode 100644
index 0000000..2129997
--- /dev/null
+++ b/tools/kvm/include/kvm/apic.h
@@ -0,0 +1,17 @@
+#ifndef KVM_APIC_H_
+#define KVM_APIC_H_
+
+#include <asm/apicdef.h>
+
+/*
+ * APIC, IOAPIC stuff
+ */
+#define APIC_BASE_ADDR_STEP 0x00400000
+#define IOAPIC_BASE_ADDR_STEP 0x00100000
+
+#define APIC_ADDR(apic) (APIC_DEFAULT_PHYS_BASE + apic * APIC_BASE_ADDR_STEP)
+#define IOAPIC_ADDR(ioapic) (IO_APIC_DEFAULT_PHYS_BASE + ioapic * IOAPIC_BASE_ADDR_STEP)
+
+#define KVM_APIC_VERSION 0x14 /* xAPIC */
+
+#endif /* KVM_APIC_H_ */
diff --git a/tools/kvm/include/kvm/brlock.h b/tools/kvm/include/kvm/brlock.h
new file mode 100644
index 0000000..29f72e0
--- /dev/null
+++ b/tools/kvm/include/kvm/brlock.h
@@ -0,0 +1,41 @@
+#ifndef KVM__BRLOCK_H
+#define KVM__BRLOCK_H
+
+#include "kvm/kvm.h"
+#include "kvm/barrier.h"
+
+/*
+ * brlock is a lock which is very cheap for reads, but very expensive
+ * for writes.
+ * This lock will be used when updates are very rare and reads are common.
+ * This lock is currently implemented by stopping the guest while
+ * performing the updates. We assume that the only threads which read from
+ * the locked data are VCPU threads, and the only writer isn't a VCPU thread.
+ */
+
+#ifndef barrier
+#define barrier() __asm__ __volatile__("": : :"memory")
+#endif
+
+#ifdef KVM_BRLOCK_DEBUG
+
+#include "kvm/rwsem.h"
+
+DECLARE_RWSEM(brlock_sem);
+
+#define br_read_lock(kvm) down_read(&brlock_sem);
+#define br_read_unlock(kvm) up_read(&brlock_sem);
+
+#define br_write_lock(kvm) down_write(&brlock_sem);
+#define br_write_unlock(kvm) up_write(&brlock_sem);
+
+#else
+
+#define br_read_lock(kvm) barrier()
+#define br_read_unlock(kvm) barrier()
+
+#define br_write_lock(kvm) kvm__pause(kvm)
+#define br_write_unlock(kvm) kvm__continue(kvm)
+#endif
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-balloon.h b/tools/kvm/include/kvm/builtin-balloon.h
new file mode 100644
index 0000000..77ee656
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-balloon.h
@@ -0,0 +1,9 @@
+#ifndef KVM__BALLOON_H
+#define KVM__BALLOON_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_balloon(int argc, const char **argv, const char *prefix);
+void kvm_balloon_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-debug.h b/tools/kvm/include/kvm/builtin-debug.h
new file mode 100644
index 0000000..efa0268
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-debug.h
@@ -0,0 +1,20 @@
+#ifndef KVM__DEBUG_H
+#define KVM__DEBUG_H
+
+#include <kvm/util.h>
+#include <linux/types.h>
+
+#define KVM_DEBUG_CMD_TYPE_DUMP (1 << 0)
+#define KVM_DEBUG_CMD_TYPE_NMI (1 << 1)
+#define KVM_DEBUG_CMD_TYPE_SYSRQ (1 << 2)
+
+struct debug_cmd_params {
+ u32 dbg_type;
+ u32 cpu;
+ char sysrq;
+};
+
+int kvm_cmd_debug(int argc, const char **argv, const char *prefix);
+void kvm_debug_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-help.h b/tools/kvm/include/kvm/builtin-help.h
new file mode 100644
index 0000000..2946743
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-help.h
@@ -0,0 +1,6 @@
+#ifndef __KVM_HELP_H__
+#define __KVM_HELP_H__
+
+int kvm_cmd_help(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-list.h b/tools/kvm/include/kvm/builtin-list.h
new file mode 100644
index 0000000..47029ca
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-list.h
@@ -0,0 +1,10 @@
+#ifndef KVM__LIST_H
+#define KVM__LIST_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_list(int argc, const char **argv, const char *prefix);
+void kvm_list_help(void) NORETURN;
+int get_vmstate(int sock);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-pause.h b/tools/kvm/include/kvm/builtin-pause.h
new file mode 100644
index 0000000..84aaee3
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-pause.h
@@ -0,0 +1,9 @@
+#ifndef KVM__PAUSE_H
+#define KVM__PAUSE_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_pause(int argc, const char **argv, const char *prefix);
+void kvm_pause_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-resume.h b/tools/kvm/include/kvm/builtin-resume.h
new file mode 100644
index 0000000..7de999b
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-resume.h
@@ -0,0 +1,9 @@
+#ifndef KVM__RESUME_H
+#define KVM__RESUME_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_resume(int argc, const char **argv, const char *prefix);
+void kvm_resume_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-run.h b/tools/kvm/include/kvm/builtin-run.h
new file mode 100644
index 0000000..91521a58
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-run.h
@@ -0,0 +1,11 @@
+#ifndef __KVM_RUN_H__
+#define __KVM_RUN_H__
+
+#include <kvm/util.h>
+
+int kvm_cmd_run(int argc, const char **argv, const char *prefix);
+void kvm_run_help(void) NORETURN;
+
+void kvm_run_set_wrapper_sandbox(void);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-sandbox.h b/tools/kvm/include/kvm/builtin-sandbox.h
new file mode 100644
index 0000000..98cd6be
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-sandbox.h
@@ -0,0 +1,6 @@
+#ifndef KVM__SANDBOX_H
+#define KVM__SANDBOX_H
+
+int kvm_cmd_sandbox(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-setup.h b/tools/kvm/include/kvm/builtin-setup.h
new file mode 100644
index 0000000..4a8d7ee
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-setup.h
@@ -0,0 +1,11 @@
+#ifndef KVM__SETUP_H
+#define KVM__SETUP_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_setup(int argc, const char **argv, const char *prefix);
+void kvm_setup_help(void) NORETURN;
+int kvm_setup_create_new(const char *guestfs_name);
+void kvm_setup_resolv(const char *guestfs_name);
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-stat.h b/tools/kvm/include/kvm/builtin-stat.h
new file mode 100644
index 0000000..4fecb37
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-stat.h
@@ -0,0 +1,9 @@
+#ifndef KVM__STAT_H
+#define KVM__STAT_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_stat(int argc, const char **argv, const char *prefix);
+void kvm_stat_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-stop.h b/tools/kvm/include/kvm/builtin-stop.h
new file mode 100644
index 0000000..b26b275
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-stop.h
@@ -0,0 +1,9 @@
+#ifndef KVM__STOP_H
+#define KVM__STOP_H
+
+#include <kvm/util.h>
+
+int kvm_cmd_stop(int argc, const char **argv, const char *prefix);
+void kvm_stop_help(void) NORETURN;
+
+#endif
diff --git a/tools/kvm/include/kvm/builtin-version.h b/tools/kvm/include/kvm/builtin-version.h
new file mode 100644
index 0000000..83cac4d
--- /dev/null
+++ b/tools/kvm/include/kvm/builtin-version.h
@@ -0,0 +1,6 @@
+#ifndef KVM__VERSION_H
+#define KVM__VERSION_H
+
+int kvm_cmd_version(int argc, const char **argv, const char *prefix);
+
+#endif
diff --git a/tools/kvm/include/kvm/compiler.h b/tools/kvm/include/kvm/compiler.h
new file mode 100644
index 0000000..2013a83
--- /dev/null
+++ b/tools/kvm/include/kvm/compiler.h
@@ -0,0 +1,10 @@
+#ifndef KVM_COMPILER_H_
+#define KVM_COMPILER_H_
+
+#ifndef __compiletime_error
+# define __compiletime_error(message)
+#endif
+
+#define notrace __attribute__((no_instrument_function))
+
+#endif /* KVM_COMPILER_H_ */
diff --git a/tools/kvm/include/kvm/devices.h b/tools/kvm/include/kvm/devices.h
new file mode 100644
index 0000000..405f195
--- /dev/null
+++ b/tools/kvm/include/kvm/devices.h
@@ -0,0 +1,29 @@
+#ifndef KVM__DEVICES_H
+#define KVM__DEVICES_H
+
+#include <linux/rbtree.h>
+#include <linux/types.h>
+
+enum device_bus_type {
+ DEVICE_BUS_PCI,
+ DEVICE_BUS_MMIO,
+ DEVICE_BUS_IOPORT,
+ DEVICE_BUS_MAX,
+};
+
+struct device_header {
+ enum device_bus_type bus_type;
+ void *data;
+ int dev_num;
+ struct rb_node node;
+};
+
+int device__register(struct device_header *dev);
+void device__unregister(struct device_header *dev);
+struct device_header *device__find_dev(enum device_bus_type bus_type,
+ u8 dev_num);
+
+struct device_header *device__first_dev(enum device_bus_type bus_type);
+struct device_header *device__next_dev(struct device_header *dev);
+
+#endif /* KVM__DEVICES_H */
diff --git a/tools/kvm/include/kvm/disk-image.h b/tools/kvm/include/kvm/disk-image.h
new file mode 100644
index 0000000..b728052
--- /dev/null
+++ b/tools/kvm/include/kvm/disk-image.h
@@ -0,0 +1,96 @@
+#ifndef KVM__DISK_IMAGE_H
+#define KVM__DISK_IMAGE_H
+
+#include "kvm/read-write.h"
+#include "kvm/util.h"
+#include "kvm/parse-options.h"
+
+#include <linux/types.h>
+#include <linux/fs.h> /* for BLKGETSIZE64 */
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <sys/uio.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define SECTOR_SHIFT 9
+#define SECTOR_SIZE (1UL << SECTOR_SHIFT)
+
+enum {
+ DISK_IMAGE_REGULAR,
+ DISK_IMAGE_MMAP,
+};
+
+#define MAX_DISK_IMAGES 4
+
+struct disk_image;
+
+struct disk_image_operations {
+ ssize_t (*read)(struct disk_image *disk, u64 sector, const struct iovec *iov,
+ int iovcount, void *param);
+ ssize_t (*write)(struct disk_image *disk, u64 sector, const struct iovec *iov,
+ int iovcount, void *param);
+ int (*flush)(struct disk_image *disk);
+ int (*close)(struct disk_image *disk);
+};
+
+struct disk_image_params {
+ const char *filename;
+ /*
+ * wwpn == World Wide Port Number
+ * tpgt == Target Portal Group Tag
+ */
+ const char *wwpn;
+ const char *tpgt;
+ bool readonly;
+ bool direct;
+};
+
+struct disk_image {
+ int fd;
+ u64 size;
+ struct disk_image_operations *ops;
+ void *priv;
+ void *disk_req_cb_param;
+ void (*disk_req_cb)(void *param, long len);
+ bool async;
+ int evt;
+#ifdef CONFIG_HAS_AIO
+ io_context_t ctx;
+#endif
+ const char *wwpn;
+ const char *tpgt;
+ int debug_iodelay;
+};
+
+int disk_img_name_parser(const struct option *opt, const char *arg, int unset);
+int disk_image__init(struct kvm *kvm);
+int disk_image__exit(struct kvm *kvm);
+struct disk_image *disk_image__new(int fd, u64 size, struct disk_image_operations *ops, int mmap);
+int disk_image__flush(struct disk_image *disk);
+ssize_t disk_image__read(struct disk_image *disk, u64 sector, const struct iovec *iov,
+ int iovcount, void *param);
+ssize_t disk_image__write(struct disk_image *disk, u64 sector, const struct iovec *iov,
+ int iovcount, void *param);
+ssize_t disk_image__get_serial(struct disk_image *disk, void *buffer, ssize_t *len);
+
+struct disk_image *raw_image__probe(int fd, struct stat *st, bool readonly);
+struct disk_image *blkdev__probe(const char *filename, int flags, struct stat *st);
+
+ssize_t raw_image__read(struct disk_image *disk, u64 sector,
+ const struct iovec *iov, int iovcount, void *param);
+ssize_t raw_image__write(struct disk_image *disk, u64 sector,
+ const struct iovec *iov, int iovcount, void *param);
+ssize_t raw_image__read_mmap(struct disk_image *disk, u64 sector,
+ const struct iovec *iov, int iovcount, void *param);
+ssize_t raw_image__write_mmap(struct disk_image *disk, u64 sector,
+ const struct iovec *iov, int iovcount, void *param);
+int raw_image__close(struct disk_image *disk);
+void disk_image__set_callback(struct disk_image *disk, void (*disk_req_cb)(void *param, long len));
+#endif /* KVM__DISK_IMAGE_H */
diff --git a/tools/kvm/include/kvm/e820.h b/tools/kvm/include/kvm/e820.h
new file mode 100644
index 0000000..15f62cc
--- /dev/null
+++ b/tools/kvm/include/kvm/e820.h
@@ -0,0 +1,13 @@
+#ifndef KVM_E820_H
+#define KVM_E820_H
+
+#include <linux/types.h>
+#include <kvm/bios.h>
+
+#define SMAP 0x534d4150 /* ASCII "SMAP" */
+
+struct biosregs;
+
+extern bioscall void e820_query_map(struct biosregs *regs);
+
+#endif /* KVM_E820_H */
diff --git a/tools/kvm/include/kvm/fdt.h b/tools/kvm/include/kvm/fdt.h
new file mode 100644
index 0000000..19f95ac
--- /dev/null
+++ b/tools/kvm/include/kvm/fdt.h
@@ -0,0 +1,26 @@
+#ifndef KVM__FDT_H
+#define KVM__FDT_H
+
+#include "libfdt.h"
+
+#include <linux/types.h>
+
+#define FDT_MAX_SIZE 0x10000
+
+/* Helper for the various bits of code that generate FDT nodes */
+#define _FDT(exp) \
+ do { \
+ int ret = (exp); \
+ if (ret < 0) { \
+ die("Error creating device tree: %s: %s\n", \
+ #exp, fdt_strerror(ret)); \
+ } \
+ } while (0)
+
+static inline u32 fdt__alloc_phandle(void)
+{
+ static u32 phandle = 0;
+ return ++phandle;
+}
+
+#endif /* KVM__FDT_H */
diff --git a/tools/kvm/include/kvm/framebuffer.h b/tools/kvm/include/kvm/framebuffer.h
new file mode 100644
index 0000000..e3200e5
--- /dev/null
+++ b/tools/kvm/include/kvm/framebuffer.h
@@ -0,0 +1,36 @@
+#ifndef KVM__FRAMEBUFFER_H
+#define KVM__FRAMEBUFFER_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+struct framebuffer;
+
+struct fb_target_operations {
+ int (*start)(struct framebuffer *fb);
+ int (*stop)(struct framebuffer *fb);
+};
+
+#define FB_MAX_TARGETS 2
+
+struct framebuffer {
+ struct list_head node;
+
+ u32 width;
+ u32 height;
+ u8 depth;
+ char *mem;
+ u64 mem_addr;
+ u64 mem_size;
+ struct kvm *kvm;
+
+ unsigned long nr_targets;
+ struct fb_target_operations *targets[FB_MAX_TARGETS];
+};
+
+struct framebuffer *fb__register(struct framebuffer *fb);
+int fb__attach(struct framebuffer *fb, struct fb_target_operations *ops);
+int fb__init(struct kvm *kvm);
+int fb__exit(struct kvm *kvm);
+
+#endif /* KVM__FRAMEBUFFER_H */
diff --git a/tools/kvm/include/kvm/gtk3.h b/tools/kvm/include/kvm/gtk3.h
new file mode 100644
index 0000000..b02dc13
--- /dev/null
+++ b/tools/kvm/include/kvm/gtk3.h
@@ -0,0 +1,28 @@
+#ifndef KVM__GTK3_H
+#define KVM__GTK3_H
+
+#include "kvm/util.h"
+
+struct framebuffer;
+
+#ifdef CONFIG_HAS_GTK3
+int kvm_gtk_init(struct kvm *kvm);
+int kvm_gtk_exit(struct kvm *kvm);
+#else
+static inline int kvm_gtk_init(struct kvm *kvm)
+{
+ if (kvm->cfg.gtk)
+ die("GTK3 support not compiled in. (install the gtk3-devel or libgtk3.0-dev package)");
+
+ return 0;
+}
+static inline int kvm_gtk_exit(struct kvm *kvm)
+{
+ if (kvm->cfg.gtk)
+ die("GTK3 support not compiled in. (install the gtk3-devel or libgtk3.0-dev package)");
+
+ return 0;
+}
+#endif
+
+#endif /* KVM__GTK3_H */
diff --git a/tools/kvm/include/kvm/guest_compat.h b/tools/kvm/include/kvm/guest_compat.h
new file mode 100644
index 0000000..ae7abbd
--- /dev/null
+++ b/tools/kvm/include/kvm/guest_compat.h
@@ -0,0 +1,9 @@
+#ifndef KVM__GUEST_COMPAT_H
+#define KVM__GUEST_COMPAT_H
+
+int compat__print_all_messages(void);
+int compat__remove_message(int id);
+int compat__add_message(const char *title, const char *description);
+
+
+#endif
\ No newline at end of file
diff --git a/tools/kvm/include/kvm/i8042.h b/tools/kvm/include/kvm/i8042.h
new file mode 100644
index 0000000..3b4ab68
--- /dev/null
+++ b/tools/kvm/include/kvm/i8042.h
@@ -0,0 +1,12 @@
+#ifndef KVM__PCKBD_H
+#define KVM__PCKBD_H
+
+#include <linux/types.h>
+
+struct kvm;
+
+void mouse_queue(u8 c);
+void kbd_queue(u8 c);
+int kbd__init(struct kvm *kvm);
+
+#endif
diff --git a/tools/kvm/include/kvm/ioeventfd.h b/tools/kvm/include/kvm/ioeventfd.h
new file mode 100644
index 0000000..bb1f78d
--- /dev/null
+++ b/tools/kvm/include/kvm/ioeventfd.h
@@ -0,0 +1,31 @@
+#ifndef KVM__IOEVENTFD_H
+#define KVM__IOEVENTFD_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <sys/eventfd.h>
+#include "kvm/util.h"
+
+struct kvm;
+
+struct ioevent {
+ u64 io_addr;
+ u8 io_len;
+ void (*fn)(struct kvm *kvm, void *ptr);
+ struct kvm *fn_kvm;
+ void *fn_ptr;
+ int fd;
+ u64 datamatch;
+
+ struct list_head list;
+};
+
+#define IOEVENTFD_FLAG_PIO (1 << 0)
+#define IOEVENTFD_FLAG_USER_POLL (1 << 1)
+
+int ioeventfd__init(struct kvm *kvm);
+int ioeventfd__exit(struct kvm *kvm);
+int ioeventfd__add_event(struct ioevent *ioevent, int flags);
+int ioeventfd__del_event(u64 addr, u64 datamatch);
+
+#endif
diff --git a/tools/kvm/include/kvm/ioport.h b/tools/kvm/include/kvm/ioport.h
new file mode 100644
index 0000000..18da47c
--- /dev/null
+++ b/tools/kvm/include/kvm/ioport.h
@@ -0,0 +1,76 @@
+#ifndef KVM__IOPORT_H
+#define KVM__IOPORT_H
+
+#include "kvm/devices.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/rbtree-interval.h"
+
+#include <stdbool.h>
+#include <limits.h>
+#include <asm/types.h>
+#include <linux/types.h>
+#include <linux/byteorder.h>
+
+/* some ports we reserve for own use */
+#define IOPORT_DBG 0xe0
+#define IOPORT_START 0x6200
+#define IOPORT_SIZE 0x400
+
+#define IOPORT_EMPTY USHRT_MAX
+
+struct kvm;
+
+struct ioport {
+ struct rb_int_node node;
+ struct ioport_operations *ops;
+ void *priv;
+ struct device_header dev_hdr;
+};
+
+struct ioport_operations {
+ bool (*io_in)(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size);
+ bool (*io_out)(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size);
+ void (*generate_fdt_node)(struct ioport *ioport, void *fdt,
+ void (*generate_irq_prop)(void *fdt, u8 irq));
+};
+
+void ioport__setup_arch(struct kvm *kvm);
+void ioport__map_irq(u8 *irq);
+
+int ioport__register(struct kvm *kvm, u16 port, struct ioport_operations *ops,
+ int count, void *param);
+int ioport__unregister(struct kvm *kvm, u16 port);
+int ioport__init(struct kvm *kvm);
+int ioport__exit(struct kvm *kvm);
+
+static inline u8 ioport__read8(u8 *data)
+{
+ return *data;
+}
+/* On BE platforms, PCI I/O is byteswapped, i.e. LE, so swap back. */
+static inline u16 ioport__read16(u16 *data)
+{
+ return le16_to_cpu(*data);
+}
+
+static inline u32 ioport__read32(u32 *data)
+{
+ return le32_to_cpu(*data);
+}
+
+static inline void ioport__write8(u8 *data, u8 value)
+{
+ *data = value;
+}
+
+static inline void ioport__write16(u16 *data, u16 value)
+{
+ *data = cpu_to_le16(value);
+}
+
+static inline void ioport__write32(u32 *data, u32 value)
+{
+ *data = cpu_to_le32(value);
+}
+
+#endif /* KVM__IOPORT_H */
diff --git a/tools/kvm/include/kvm/iovec.h b/tools/kvm/include/kvm/iovec.h
new file mode 100644
index 0000000..fe79dd4
--- /dev/null
+++ b/tools/kvm/include/kvm/iovec.h
@@ -0,0 +1,21 @@
+#ifndef KVM_UTIL_IOVEC_H_
+#define KVM_UTIL_IOVEC_H_
+
+extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len);
+extern int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
+ size_t offset, int len);
+extern int memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len);
+extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata,
+ size_t offset, int len);
+
+static inline size_t iov_size(const struct iovec *iovec, size_t len)
+{
+ size_t size = 0, i;
+
+ for (i = 0; i < len; i++)
+ size += iovec[i].iov_len;
+
+ return size;
+}
+
+#endif
diff --git a/tools/kvm/include/kvm/irq.h b/tools/kvm/include/kvm/irq.h
new file mode 100644
index 0000000..4cec6f0
--- /dev/null
+++ b/tools/kvm/include/kvm/irq.h
@@ -0,0 +1,19 @@
+#ifndef KVM__IRQ_H
+#define KVM__IRQ_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/kvm.h>
+
+#include "kvm/msi.h"
+
+struct kvm;
+
+int irq__alloc_line(void);
+
+int irq__init(struct kvm *kvm);
+int irq__exit(struct kvm *kvm);
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg);
+
+#endif
diff --git a/tools/kvm/include/kvm/kvm-cmd.h b/tools/kvm/include/kvm/kvm-cmd.h
new file mode 100644
index 0000000..0a73bce
--- /dev/null
+++ b/tools/kvm/include/kvm/kvm-cmd.h
@@ -0,0 +1,17 @@
+#ifndef __KVM_CMD_H__
+#define __KVM_CMD_H__
+
+struct cmd_struct {
+ const char *cmd;
+ int (*fn)(int, const char **, const char *);
+ void (*help)(void);
+ int option;
+};
+
+extern struct cmd_struct kvm_commands[];
+struct cmd_struct *kvm_get_command(struct cmd_struct *command,
+ const char *cmd);
+
+int handle_command(struct cmd_struct *command, int argc, const char **argv);
+
+#endif
diff --git a/tools/kvm/include/kvm/kvm-config.h b/tools/kvm/include/kvm/kvm-config.h
new file mode 100644
index 0000000..386fa8c
--- /dev/null
+++ b/tools/kvm/include/kvm/kvm-config.h
@@ -0,0 +1,62 @@
+#ifndef KVM_CONFIG_H_
+#define KVM_CONFIG_H_
+
+#include "kvm/disk-image.h"
+#include "kvm/kvm-config-arch.h"
+
+#define DEFAULT_KVM_DEV "/dev/kvm"
+#define DEFAULT_CONSOLE "serial"
+#define DEFAULT_NETWORK "user"
+#define DEFAULT_HOST_ADDR "192.168.33.1"
+#define DEFAULT_GUEST_ADDR "192.168.33.15"
+#define DEFAULT_GUEST_MAC "02:15:15:15:15:15"
+#define DEFAULT_HOST_MAC "02:01:01:01:01:01"
+#define DEFAULT_SCRIPT "none"
+#define DEFAULT_SANDBOX_FILENAME "guest/sandbox.sh"
+
+#define MIN_RAM_SIZE_MB (64ULL)
+#define MIN_RAM_SIZE_BYTE (MIN_RAM_SIZE_MB << MB_SHIFT)
+
+struct kvm_config {
+ struct kvm_config_arch arch;
+ struct disk_image_params disk_image[MAX_DISK_IMAGES];
+ u64 ram_size;
+ u8 image_count;
+ u8 num_net_devices;
+ bool virtio_rng;
+ int active_console;
+ int debug_iodelay;
+ int nrcpus;
+ const char *kernel_cmdline;
+ const char *kernel_filename;
+ const char *vmlinux_filename;
+ const char *initrd_filename;
+ const char *firmware_filename;
+ const char *console;
+ const char *dev;
+ const char *network;
+ const char *host_ip;
+ const char *guest_ip;
+ const char *guest_mac;
+ const char *host_mac;
+ const char *script;
+ const char *guest_name;
+ const char *sandbox;
+ const char *hugetlbfs_path;
+ const char *custom_rootfs_name;
+ const char *real_cmdline;
+ struct virtio_net_params *net_params;
+ bool single_step;
+ bool vnc;
+ bool gtk;
+ bool sdl;
+ bool balloon;
+ bool using_rootfs;
+ bool custom_rootfs;
+ bool no_net;
+ bool no_dhcp;
+ bool ioport_debug;
+ bool mmio_debug;
+};
+
+#endif
diff --git a/tools/kvm/include/kvm/kvm-cpu.h b/tools/kvm/include/kvm/kvm-cpu.h
new file mode 100644
index 0000000..aa0cb54
--- /dev/null
+++ b/tools/kvm/include/kvm/kvm-cpu.h
@@ -0,0 +1,27 @@
+#ifndef KVM__KVM_CPU_H
+#define KVM__KVM_CPU_H
+
+#include "kvm/kvm-cpu-arch.h"
+#include <stdbool.h>
+
+int kvm_cpu__init(struct kvm *kvm);
+int kvm_cpu__exit(struct kvm *kvm);
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id);
+void kvm_cpu__delete(struct kvm_cpu *vcpu);
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu);
+void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu);
+void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu);
+void kvm_cpu__run(struct kvm_cpu *vcpu);
+void kvm_cpu__reboot(struct kvm *kvm);
+int kvm_cpu__start(struct kvm_cpu *cpu);
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu);
+int kvm_cpu__get_endianness(struct kvm_cpu *vcpu);
+
+int kvm_cpu__get_debug_fd(void);
+void kvm_cpu__set_debug_fd(int fd);
+void kvm_cpu__show_code(struct kvm_cpu *vcpu);
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu);
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu);
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu);
+
+#endif /* KVM__KVM_CPU_H */
diff --git a/tools/kvm/include/kvm/kvm-ipc.h b/tools/kvm/include/kvm/kvm-ipc.h
new file mode 100644
index 0000000..5494da4
--- /dev/null
+++ b/tools/kvm/include/kvm/kvm-ipc.h
@@ -0,0 +1,26 @@
+#ifndef KVM__IPC_H_
+#define KVM__IPC_H_
+
+#include <linux/types.h>
+#include "kvm/kvm.h"
+
+enum {
+ KVM_IPC_BALLOON = 1,
+ KVM_IPC_DEBUG = 2,
+ KVM_IPC_STAT = 3,
+ KVM_IPC_PAUSE = 4,
+ KVM_IPC_RESUME = 5,
+ KVM_IPC_STOP = 6,
+ KVM_IPC_PID = 7,
+ KVM_IPC_VMSTATE = 8,
+};
+
+int kvm_ipc__register_handler(u32 type, void (*cb)(struct kvm *kvm,
+ int fd, u32 type, u32 len, u8 *msg));
+int kvm_ipc__init(struct kvm *kvm);
+int kvm_ipc__exit(struct kvm *kvm);
+
+int kvm_ipc__send(int fd, u32 type);
+int kvm_ipc__send_msg(int fd, u32 type, u32 len, u8 *msg);
+
+#endif
diff --git a/tools/kvm/include/kvm/kvm.h b/tools/kvm/include/kvm/kvm.h
new file mode 100644
index 0000000..754e029
--- /dev/null
+++ b/tools/kvm/include/kvm/kvm.h
@@ -0,0 +1,134 @@
+#ifndef KVM__KVM_H
+#define KVM__KVM_H
+
+#include "kvm/kvm-arch.h"
+#include "kvm/kvm-config.h"
+#include "kvm/util-init.h"
+#include "kvm/kvm.h"
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <time.h>
+#include <signal.h>
+#include <sys/prctl.h>
+
+#define SIGKVMEXIT (SIGRTMIN + 0)
+#define SIGKVMPAUSE (SIGRTMIN + 1)
+
+#define KVM_PID_FILE_PATH "/.lkvm/"
+#define HOME_DIR getenv("HOME")
+#define KVM_BINARY_NAME "lkvm"
+
+#define PAGE_SIZE (sysconf(_SC_PAGE_SIZE))
+
+#define DEFINE_KVM_EXT(ext) \
+ .name = #ext, \
+ .code = ext
+
+enum {
+ KVM_VMSTATE_RUNNING,
+ KVM_VMSTATE_PAUSED,
+};
+
+struct kvm_ext {
+ const char *name;
+ int code;
+};
+
+struct kvm_mem_bank {
+ struct list_head list;
+ u64 guest_phys_addr;
+ void *host_addr;
+ u64 size;
+};
+
+struct kvm {
+ struct kvm_arch arch;
+ struct kvm_config cfg;
+ int sys_fd; /* For system ioctls(), i.e. /dev/kvm */
+ int vm_fd; /* For VM ioctls() */
+ timer_t timerid; /* Posix timer for interrupts */
+
+ int nrcpus; /* Number of cpus to run */
+ struct kvm_cpu **cpus;
+
+ u32 mem_slots; /* for KVM_SET_USER_MEMORY_REGION */
+ u64 ram_size;
+ void *ram_start;
+ u64 ram_pagesize;
+ struct list_head mem_banks;
+
+ bool nmi_disabled;
+
+ const char *vmlinux;
+ struct disk_image **disks;
+ int nr_disks;
+
+ int vm_state;
+};
+
+void kvm__set_dir(const char *fmt, ...);
+const char *kvm__get_dir(void);
+
+int kvm__init(struct kvm *kvm);
+struct kvm *kvm__new(void);
+int kvm__recommended_cpus(struct kvm *kvm);
+int kvm__max_cpus(struct kvm *kvm);
+void kvm__init_ram(struct kvm *kvm);
+int kvm__exit(struct kvm *kvm);
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename);
+bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename,
+ const char *initrd_filename, const char *kernel_cmdline);
+int kvm_timer__init(struct kvm *kvm);
+int kvm_timer__exit(struct kvm *kvm);
+void kvm__irq_line(struct kvm *kvm, int irq, int level);
+void kvm__irq_trigger(struct kvm *kvm, int irq);
+bool kvm__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count);
+bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write);
+int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr);
+int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce,
+ void (*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr),
+ void *ptr);
+bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr);
+void kvm__pause(struct kvm *kvm);
+void kvm__continue(struct kvm *kvm);
+void kvm__notify_paused(void);
+int kvm__get_sock_by_instance(const char *name);
+int kvm__enumerate_instances(int (*callback)(const char *name, int pid));
+void kvm__remove_socket(const char *name);
+
+void kvm__arch_set_cmdline(char *cmdline, bool video);
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size);
+void kvm__arch_delete_ram(struct kvm *kvm);
+int kvm__arch_setup_firmware(struct kvm *kvm);
+int kvm__arch_free_firmware(struct kvm *kvm);
+bool kvm__arch_cpu_supports_vm(void);
+void kvm__arch_read_term(struct kvm *kvm);
+
+void *guest_flat_to_host(struct kvm *kvm, u64 offset);
+u64 host_to_guest_flat(struct kvm *kvm, void *ptr);
+
+int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline);
+int load_elf_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline);
+bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline);
+
+/*
+ * Debugging
+ */
+void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size, int debug_fd);
+
+extern const char *kvm_exit_reasons[];
+
+static inline bool host_ptr_in_ram(struct kvm *kvm, void *p)
+{
+ return kvm->ram_start <= p && p < (kvm->ram_start + kvm->ram_size);
+}
+
+bool kvm__supports_extension(struct kvm *kvm, unsigned int extension);
+
+static inline void kvm__set_thread_name(const char *name)
+{
+ prctl(PR_SET_NAME, name);
+}
+
+#endif /* KVM__KVM_H */
diff --git a/tools/kvm/include/kvm/msi.h b/tools/kvm/include/kvm/msi.h
new file mode 100644
index 0000000..885eb5b
--- /dev/null
+++ b/tools/kvm/include/kvm/msi.h
@@ -0,0 +1,10 @@
+#ifndef LKVM_MSI_H
+#define LKVM_MSI_H
+
+struct msi_msg {
+ u32 address_lo; /* low 32 bits of msi message address */
+ u32 address_hi; /* high 32 bits of msi message address */
+ u32 data; /* 16 bits of msi message data */
+};
+
+#endif /* LKVM_MSI_H */
diff --git a/tools/kvm/include/kvm/mutex.h b/tools/kvm/include/kvm/mutex.h
new file mode 100644
index 0000000..a90584b
--- /dev/null
+++ b/tools/kvm/include/kvm/mutex.h
@@ -0,0 +1,39 @@
+#ifndef KVM__MUTEX_H
+#define KVM__MUTEX_H
+
+#include <pthread.h>
+
+#include "kvm/util.h"
+
+/*
+ * Kernel-alike mutex API - to make it easier for kernel developers
+ * to write user-space code! :-)
+ */
+
+struct mutex {
+ pthread_mutex_t mutex;
+};
+#define MUTEX_INITIALIZER (struct mutex) { .mutex = PTHREAD_MUTEX_INITIALIZER }
+
+#define DEFINE_MUTEX(mtx) struct mutex mtx = MUTEX_INITIALIZER
+
+static inline void mutex_init(struct mutex *lock)
+{
+ if (pthread_mutex_init(&lock->mutex, NULL) != 0)
+ die("unexpected pthread_mutex_init() failure!");
+}
+
+static inline void mutex_lock(struct mutex *lock)
+{
+ if (pthread_mutex_lock(&lock->mutex) != 0)
+ die("unexpected pthread_mutex_lock() failure!");
+
+}
+
+static inline void mutex_unlock(struct mutex *lock)
+{
+ if (pthread_mutex_unlock(&lock->mutex) != 0)
+ die("unexpected pthread_mutex_unlock() failure!");
+}
+
+#endif /* KVM__MUTEX_H */
diff --git a/tools/kvm/include/kvm/of_pci.h b/tools/kvm/include/kvm/of_pci.h
new file mode 100644
index 0000000..c8187ab
--- /dev/null
+++ b/tools/kvm/include/kvm/of_pci.h
@@ -0,0 +1,44 @@
+#ifndef KVM__OF_PCI_H
+#define KVM__OF_PCI_H
+
+#include <linux/types.h>
+
+/*
+ * Definitions for implementing parts of the OpenFirmware PCI Bus Binding
+ * Specification (IEEE Std 1275-1994).
+ */
+
+struct of_pci_unit_address {
+ u32 hi, mid, lo;
+} __attribute__((packed));
+
+struct of_pci_irq_mask {
+ struct of_pci_unit_address pci_addr;
+ u32 pci_pin;
+} __attribute__((packed));
+
+struct of_pci_ranges_entry {
+ struct of_pci_unit_address pci_addr;
+ u64 cpu_addr;
+ u64 length;
+} __attribute__((packed));
+
+/* Macros to operate with address in OF binding to PCI */
+#define __b_x(x, p, l) (((x) & ((1<<(l))-1)) << (p))
+#define of_pci_b_n(x) __b_x((x), 31, 1) /* 0 if relocatable */
+#define of_pci_b_p(x) __b_x((x), 30, 1) /* 1 if prefetchable */
+#define of_pci_b_t(x) __b_x((x), 29, 1) /* 1 if the address is aliased */
+#define of_pci_b_ss(x) __b_x((x), 24, 2) /* the space code */
+#define of_pci_b_bbbbbbbb(x) __b_x((x), 16, 8) /* bus number */
+#define of_pci_b_ddddd(x) __b_x((x), 11, 5) /* device number */
+#define of_pci_b_fff(x) __b_x((x), 8, 3) /* function number */
+#define of_pci_b_rrrrrrrr(x) __b_x((x), 0, 8) /* register number */
+
+#define OF_PCI_SS_CONFIG 0
+#define OF_PCI_SS_IO 1
+#define OF_PCI_SS_M32 2
+#define OF_PCI_SS_M64 3
+
+#define OF_PCI_IRQ_MAP_MAX 256 /* 5 bit device + 3 bit pin */
+
+#endif /* KVM__OF_PCI_H */
diff --git a/tools/kvm/include/kvm/parse-options.h b/tools/kvm/include/kvm/parse-options.h
new file mode 100644
index 0000000..b03f151
--- /dev/null
+++ b/tools/kvm/include/kvm/parse-options.h
@@ -0,0 +1,230 @@
+#ifndef __PARSE_OPTIONS_H__
+#define __PARSE_OPTIONS_H__
+
+#include <inttypes.h>
+#include <kvm/util.h>
+
+enum parse_opt_type {
+ /* special types */
+ OPTION_END,
+ OPTION_ARGUMENT,
+ OPTION_GROUP,
+ /* options with no arguments */
+ OPTION_BIT,
+ OPTION_BOOLEAN,
+ OPTION_INCR,
+ OPTION_SET_UINT,
+ OPTION_SET_PTR,
+ /* options with arguments (usually) */
+ OPTION_STRING,
+ OPTION_INTEGER,
+ OPTION_LONG,
+ OPTION_CALLBACK,
+ OPTION_U64,
+ OPTION_UINTEGER,
+};
+
+enum parse_opt_flags {
+ PARSE_OPT_KEEP_DASHDASH = 1,
+ PARSE_OPT_STOP_AT_NON_OPTION = 2,
+ PARSE_OPT_KEEP_ARGV0 = 4,
+ PARSE_OPT_KEEP_UNKNOWN = 8,
+ PARSE_OPT_NO_INTERNAL_HELP = 16,
+};
+
+enum parse_opt_option_flags {
+ PARSE_OPT_OPTARG = 1,
+ PARSE_OPT_NOARG = 2,
+ PARSE_OPT_NONEG = 4,
+ PARSE_OPT_HIDDEN = 8,
+ PARSE_OPT_LASTARG_DEFAULT = 16,
+};
+
+struct option;
+typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
+/*
+ * `type`::
+ * holds the type of the option, you must have an OPTION_END last in your
+ * array.
+ *
+ * `short_name`::
+ * the character to use as a short option name, '\0' if none.
+ *
+ * `long_name`::
+ * the long option name, without the leading dashes, NULL if none.
+ *
+ * `value`::
+ * stores pointers to the values to be filled.
+ *
+ * `argh`::
+ * token to explain the kind of argument this option wants. Keep it
+ * homogeneous across the repository.
+ *
+ * `help`::
+ * the short help associated to what the option does.
+ * Must never be NULL (except for OPTION_END).
+ * OPTION_GROUP uses this pointer to store the group header.
+ *
+ * `flags`::
+ * mask of parse_opt_option_flags.
+ * PARSE_OPT_OPTARG: says that the argument is optional (not for BOOLEANs)
+ * PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
+ * PARSE_OPT_NONEG: says that this option cannot be negated
+ * PARSE_OPT_HIDDEN: this option is skipped in the default usage, shown in
+ * the long one.
+ *
+ * `callback`::
+ * pointer to the callback to use for OPTION_CALLBACK.
+ *
+ * `defval`::
+ * default value to fill (*->value) with for PARSE_OPT_OPTARG.
+ * OPTION_{BIT,SET_UINT,SET_PTR} store the {mask,integer,pointer} to put in
+ * the value when met.
+ * CALLBACKS can use it like they want.
+ */
+struct option {
+ enum parse_opt_type type;
+ int short_name;
+ const char *long_name;
+ void *value;
+ const char *argh;
+ const char *help;
+ void *ptr;
+
+ int flags;
+ parse_opt_cb *callback;
+ intptr_t defval;
+};
+
+#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+#define check_vtype(v, type) \
+ (BUILD_BUG_ON_ZERO(!__builtin_types_compatible_p(typeof(v), type)) + v)
+
+#define OPT_INTEGER(s, l, v, h) \
+{ \
+ .type = OPTION_INTEGER, \
+ .short_name = (s), \
+ .long_name = (l), \
+ .value = check_vtype(v, int *), \
+ .help = (h) \
+}
+
+#define OPT_UINTEGER(s, l, v, h) \
+{ \
+ .type = OPTION_UINTEGER, \
+ .short_name = (s), \
+ .long_name = (l), \
+ .value = check_vtype(v, unsigned int *), \
+ .help = (h) \
+}
+
+#define OPT_U64(s, l, v, h) \
+{ \
+ .type = OPTION_U64, \
+ .short_name = (s), \
+ .long_name = (l), \
+ .value = check_vtype(v, u64 *), \
+ .help = (h) \
+}
+
+#define OPT_STRING(s, l, v, a, h) \
+{ \
+ .type = OPTION_STRING, \
+ .short_name = (s), \
+ .long_name = (l), \
+ .value = check_vtype(v, const char **), (a), \
+ .help = (h) \
+}
+
+#define OPT_BOOLEAN(s, l, v, h) \
+{ \
+ .type = OPTION_BOOLEAN, \
+ .short_name = (s), \
+ .long_name = (l), \
+ .value = check_vtype(v, bool *), \
+ .help = (h) \
+}
+
+#define OPT_INCR(s, l, v, h) \
+{ \
+ .type = OPTION_INCR, \
+ .short_name = (s), \
+ .long_name = (l), \
+ .value = check_vtype(v, int *), \
+ .help = (h) \
+}
+
+#define OPT_GROUP(h) \
+{ \
+ .type = OPTION_GROUP, \
+ .help = (h) \
+}
+
+#define OPT_CALLBACK(s, l, v, a, h, f, p) \
+{ \
+ .type = OPTION_CALLBACK, \
+ .short_name = (s), \
+ .long_name = (l), \
+ .value = (v), \
+ (a), \
+ .help = (h), \
+ .callback = (f), \
+ .ptr = (p), \
+}
+
+#define OPT_CALLBACK_NOOPT(s, l, v, a, h, f, p) \
+{ \
+ .type = OPTION_CALLBACK, \
+ .short_name = (s), \
+ .long_name = (l), \
+ .value = (v), \
+ (a), \
+ .help = (h), \
+ .callback = (f), \
+ .flags = PARSE_OPT_NOARG, \
+ .ptr = (p), \
+}
+
+#define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d, p) \
+{ \
+ .type = OPTION_CALLBACK, \
+ .short_name = (s), \
+ .long_name = (l), \
+ .value = (v), (a), \
+ .help = (h), \
+ .callback = (f), \
+ .defval = (intptr_t)d, \
+ .flags = PARSE_OPT_LASTARG_DEFAULT, \
+ .ptr = (p) \
+}
+
+#define OPT_END() { .type = OPTION_END }
+
+#define OPT_ARCH(cmd, cfg) \
+ OPT_ARCH_##cmd(OPT_GROUP("Arch-specific options:"), &(cfg)->arch)
+
+enum {
+ PARSE_OPT_HELP = -1,
+ PARSE_OPT_DONE,
+ PARSE_OPT_UNKNOWN,
+};
+
+/*
+ * It's okay for the caller to consume argv/argc in the usual way.
+ * Other fields of that structure are private to parse-options and should not
+ * be modified in any way.
+ **/
+struct parse_opt_ctx_t {
+ const char **argv;
+ const char **out;
+ int argc, cpidx;
+ const char *opt;
+ int flags;
+};
+
+/* global functions */
+void usage_with_options(const char * const *usagestr,
+ const struct option *opts) NORETURN;
+int parse_options(int argc, const char **argv, const struct option *options,
+ const char * const usagestr[], int flags);
+#endif
diff --git a/tools/kvm/include/kvm/pci-shmem.h b/tools/kvm/include/kvm/pci-shmem.h
new file mode 100644
index 0000000..6cff2b8
--- /dev/null
+++ b/tools/kvm/include/kvm/pci-shmem.h
@@ -0,0 +1,32 @@
+#ifndef KVM__PCI_SHMEM_H
+#define KVM__PCI_SHMEM_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+#include "kvm/parse-options.h"
+
+#define SHMEM_DEFAULT_SIZE (16 << MB_SHIFT)
+#define SHMEM_DEFAULT_ADDR (0xc8000000)
+#define SHMEM_DEFAULT_HANDLE "/kvm_shmem"
+
+struct kvm;
+struct shmem_info;
+
+struct shmem_info {
+ u64 phys_addr;
+ u64 size;
+ char *handle;
+ int create;
+};
+
+int pci_shmem__init(struct kvm *kvm);
+int pci_shmem__exit(struct kvm *kvm);
+int pci_shmem__register_mem(struct shmem_info *si);
+int shmem_parser(const struct option *opt, const char *arg, int unset);
+
+int pci_shmem__get_local_irqfd(struct kvm *kvm);
+int pci_shmem__add_client(struct kvm *kvm, u32 id, int fd);
+int pci_shmem__remove_client(struct kvm *kvm, u32 id);
+
+#endif
diff --git a/tools/kvm/include/kvm/pci.h b/tools/kvm/include/kvm/pci.h
new file mode 100644
index 0000000..b0c28a1
--- /dev/null
+++ b/tools/kvm/include/kvm/pci.h
@@ -0,0 +1,96 @@
+#ifndef KVM__PCI_H
+#define KVM__PCI_H
+
+#include <linux/types.h>
+#include <linux/kvm.h>
+#include <linux/pci_regs.h>
+#include <endian.h>
+
+#include "kvm/devices.h"
+#include "kvm/kvm.h"
+#include "kvm/msi.h"
+
+/*
+ * PCI Configuration Mechanism #1 I/O ports. See Section 3.7.4.1.
+ * ("Configuration Mechanism #1") of the PCI Local Bus Specification 2.1 for
+ * details.
+ */
+#define PCI_CONFIG_ADDRESS 0xcf8
+#define PCI_CONFIG_DATA 0xcfc
+#define PCI_CONFIG_BUS_FORWARD 0xcfa
+#define PCI_IO_SIZE 0x100
+#define PCI_CFG_SIZE (1ULL << 24)
+
+union pci_config_address {
+ struct {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ unsigned reg_offset : 2; /* 1 .. 0 */
+ unsigned register_number : 6; /* 7 .. 2 */
+ unsigned function_number : 3; /* 10 .. 8 */
+ unsigned device_number : 5; /* 15 .. 11 */
+ unsigned bus_number : 8; /* 23 .. 16 */
+ unsigned reserved : 7; /* 30 .. 24 */
+ unsigned enable_bit : 1; /* 31 */
+#else
+ unsigned enable_bit : 1; /* 31 */
+ unsigned reserved : 7; /* 30 .. 24 */
+ unsigned bus_number : 8; /* 23 .. 16 */
+ unsigned device_number : 5; /* 15 .. 11 */
+ unsigned function_number : 3; /* 10 .. 8 */
+ unsigned register_number : 6; /* 7 .. 2 */
+ unsigned reg_offset : 2; /* 1 .. 0 */
+#endif
+ };
+ u32 w;
+};
+
+struct msix_table {
+ struct msi_msg msg;
+ u32 ctrl;
+};
+
+struct msix_cap {
+ u8 cap;
+ u8 next;
+ u16 ctrl;
+ u32 table_offset;
+ u32 pba_offset;
+};
+
+struct pci_device_header {
+ u16 vendor_id;
+ u16 device_id;
+ u16 command;
+ u16 status;
+ u8 revision_id;
+ u8 class[3];
+ u8 cacheline_size;
+ u8 latency_timer;
+ u8 header_type;
+ u8 bist;
+ u32 bar[6];
+ u32 card_bus;
+ u16 subsys_vendor_id;
+ u16 subsys_id;
+ u32 exp_rom_bar;
+ u8 capabilities;
+ u8 reserved1[3];
+ u32 reserved2;
+ u8 irq_line;
+ u8 irq_pin;
+ u8 min_gnt;
+ u8 max_lat;
+ struct msix_cap msix;
+ u8 empty[136]; /* Rest of PCI config space */
+ u32 bar_size[6];
+} __attribute__((packed));
+
+int pci__init(struct kvm *kvm);
+int pci__exit(struct kvm *kvm);
+struct pci_device_header *pci__find_dev(u8 dev_num);
+u32 pci_get_io_space_block(u32 size);
+void pci__assign_irq(struct device_header *dev_hdr);
+void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size);
+void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size);
+
+#endif /* KVM__PCI_H */
diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
new file mode 100644
index 0000000..f849246
--- /dev/null
+++ b/tools/kvm/include/kvm/qcow.h
@@ -0,0 +1,133 @@
+#ifndef KVM__QCOW_H
+#define KVM__QCOW_H
+
+#include "kvm/mutex.h"
+
+#include <linux/types.h>
+#include <stdbool.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+
+#define QCOW1_VERSION 1
+#define QCOW2_VERSION 2
+
+#define QCOW1_OFLAG_COMPRESSED (1ULL << 63)
+
+#define QCOW2_OFLAG_COPIED (1ULL << 63)
+#define QCOW2_OFLAG_COMPRESSED (1ULL << 62)
+
+#define QCOW2_OFLAGS_MASK (QCOW2_OFLAG_COPIED|QCOW2_OFLAG_COMPRESSED)
+
+#define QCOW2_OFFSET_MASK (~QCOW2_OFLAGS_MASK)
+
+#define MAX_CACHE_NODES 32
+
+struct qcow_l2_table {
+ u64 offset;
+ struct rb_node node;
+ struct list_head list;
+ u8 dirty;
+ u64 table[];
+};
+
+struct qcow_l1_table {
+ u32 table_size;
+ u64 *l1_table;
+
+ /* Level2 caching data structures */
+ struct rb_root root;
+ struct list_head lru_list;
+ int nr_cached;
+};
+
+#define QCOW_REFCOUNT_BLOCK_SHIFT 1
+
+struct qcow_refcount_block {
+ u64 offset;
+ struct rb_node node;
+ struct list_head list;
+ u64 size;
+ u8 dirty;
+ u16 entries[];
+};
+
+struct qcow_refcount_table {
+ u32 rf_size;
+ u64 *rf_table;
+
+ /* Refcount block caching data structures */
+ struct rb_root root;
+ struct list_head lru_list;
+ int nr_cached;
+};
+
+struct qcow_header {
+ u64 size; /* in bytes */
+ u64 l1_table_offset;
+ u32 l1_size;
+ u8 cluster_bits;
+ u8 l2_bits;
+ u64 refcount_table_offset;
+ u32 refcount_table_size;
+};
+
+struct qcow {
+ struct mutex mutex;
+ struct qcow_header *header;
+ struct qcow_l1_table table;
+ struct qcow_refcount_table refcount_table;
+ int fd;
+ int csize_shift;
+ int csize_mask;
+ u32 version;
+ u64 cluster_size;
+ u64 cluster_offset_mask;
+ u64 free_clust_idx;
+ void *cluster_cache;
+ void *cluster_data;
+ void *copy_buff;
+};
+
+struct qcow1_header_disk {
+ u32 magic;
+ u32 version;
+
+ u64 backing_file_offset;
+ u32 backing_file_size;
+ u32 mtime;
+
+ u64 size; /* in bytes */
+
+ u8 cluster_bits;
+ u8 l2_bits;
+ u32 crypt_method;
+
+ u64 l1_table_offset;
+};
+
+struct qcow2_header_disk {
+ u32 magic;
+ u32 version;
+
+ u64 backing_file_offset;
+ u32 backing_file_size;
+
+ u32 cluster_bits;
+ u64 size; /* in bytes */
+ u32 crypt_method;
+
+ u32 l1_size;
+ u64 l1_table_offset;
+
+ u64 refcount_table_offset;
+ u32 refcount_table_clusters;
+
+ u32 nb_snapshots;
+ u64 snapshots_offset;
+};
+
+struct disk_image *qcow_probe(int fd, bool readonly);
+
+#endif /* KVM__QCOW_H */
diff --git a/tools/kvm/include/kvm/rbtree-interval.h b/tools/kvm/include/kvm/rbtree-interval.h
new file mode 100644
index 0000000..730eb5e
--- /dev/null
+++ b/tools/kvm/include/kvm/rbtree-interval.h
@@ -0,0 +1,30 @@
+#ifndef KVM__INTERVAL_RBTREE_H
+#define KVM__INTERVAL_RBTREE_H
+
+#include <linux/rbtree.h>
+#include <linux/types.h>
+
+#define RB_INT_INIT(l, h) \
+ (struct rb_int_node){.low = l, .high = h}
+#define rb_int(n) rb_entry(n, struct rb_int_node, node)
+
+struct rb_int_node {
+ struct rb_node node;
+ u64 low;
+ u64 high;
+};
+
+/* Return the rb_int_node interval in which 'point' is located. */
+struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point);
+
+/* Return the rb_int_node in which start:len is located. */
+struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high);
+
+int rb_int_insert(struct rb_root *root, struct rb_int_node *data);
+
+static inline void rb_int_erase(struct rb_root *root, struct rb_int_node *node)
+{
+ rb_erase(&node->node, root);
+}
+
+#endif
diff --git a/tools/kvm/include/kvm/read-write.h b/tools/kvm/include/kvm/read-write.h
new file mode 100644
index 0000000..67571f9
--- /dev/null
+++ b/tools/kvm/include/kvm/read-write.h
@@ -0,0 +1,43 @@
+#ifndef KVM_READ_WRITE_H
+#define KVM_READ_WRITE_H
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#ifdef CONFIG_HAS_AIO
+#include <libaio.h>
+#endif
+
+ssize_t xread(int fd, void *buf, size_t count);
+ssize_t xwrite(int fd, const void *buf, size_t count);
+
+ssize_t read_in_full(int fd, void *buf, size_t count);
+ssize_t write_in_full(int fd, const void *buf, size_t count);
+
+ssize_t xpread(int fd, void *buf, size_t count, off_t offset);
+ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset);
+
+ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset);
+ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset);
+
+ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt);
+ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt);
+
+ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt);
+ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt);
+
+ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+
+ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+
+#ifdef CONFIG_HAS_AIO
+int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt,
+ off_t offset, int ev, void *param);
+int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt,
+ off_t offset, int ev, void *param);
+#endif
+
+#endif /* KVM_READ_WRITE_H */
diff --git a/tools/kvm/include/kvm/rtc.h b/tools/kvm/include/kvm/rtc.h
new file mode 100644
index 0000000..6aa9299
--- /dev/null
+++ b/tools/kvm/include/kvm/rtc.h
@@ -0,0 +1,9 @@
+#ifndef KVM__RTC_H
+#define KVM__RTC_H
+
+struct kvm;
+
+int rtc__init(struct kvm *kvm);
+int rtc__exit(struct kvm *kvm);
+
+#endif /* KVM__RTC_H */
diff --git a/tools/kvm/include/kvm/rwsem.h b/tools/kvm/include/kvm/rwsem.h
new file mode 100644
index 0000000..75a22f8
--- /dev/null
+++ b/tools/kvm/include/kvm/rwsem.h
@@ -0,0 +1,39 @@
+#ifndef KVM__RWSEM_H
+#define KVM__RWSEM_H
+
+#include <pthread.h>
+
+#include "kvm/util.h"
+
+/*
+ * Kernel-alike rwsem API - to make it easier for kernel developers
+ * to write user-space code! :-)
+ */
+
+#define DECLARE_RWSEM(sem) pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER
+
+static inline void down_read(pthread_rwlock_t *rwsem)
+{
+ if (pthread_rwlock_rdlock(rwsem) != 0)
+ die("unexpected pthread_rwlock_rdlock() failure!");
+}
+
+static inline void down_write(pthread_rwlock_t *rwsem)
+{
+ if (pthread_rwlock_wrlock(rwsem) != 0)
+ die("unexpected pthread_rwlock_wrlock() failure!");
+}
+
+static inline void up_read(pthread_rwlock_t *rwsem)
+{
+ if (pthread_rwlock_unlock(rwsem) != 0)
+ die("unexpected pthread_rwlock_unlock() failure!");
+}
+
+static inline void up_write(pthread_rwlock_t *rwsem)
+{
+ if (pthread_rwlock_unlock(rwsem) != 0)
+ die("unexpected pthread_rwlock_unlock() failure!");
+}
+
+#endif /* KVM__RWSEM_H */
diff --git a/tools/kvm/include/kvm/sdl.h b/tools/kvm/include/kvm/sdl.h
new file mode 100644
index 0000000..2f0c213
--- /dev/null
+++ b/tools/kvm/include/kvm/sdl.h
@@ -0,0 +1,28 @@
+#ifndef KVM__SDL_H
+#define KVM__SDL_H
+
+#include "kvm/util.h"
+
+struct framebuffer;
+
+#ifdef CONFIG_HAS_SDL
+int sdl__init(struct kvm *kvm);
+int sdl__exit(struct kvm *kvm);
+#else
+static inline int sdl__init(struct kvm *kvm)
+{
+ if (kvm->cfg.sdl)
+ die("SDL support not compiled in. (install the SDL-dev[el] package)");
+
+ return 0;
+}
+static inline int sdl__exit(struct kvm *kvm)
+{
+ if (kvm->cfg.sdl)
+ die("SDL support not compiled in. (install the SDL-dev[el] package)");
+
+ return 0;
+}
+#endif
+
+#endif /* KVM__SDL_H */
diff --git a/tools/kvm/include/kvm/segment.h b/tools/kvm/include/kvm/segment.h
new file mode 100644
index 0000000..9387a82
--- /dev/null
+++ b/tools/kvm/include/kvm/segment.h
@@ -0,0 +1,21 @@
+#ifndef KVM_SEGMENT_H
+#define KVM_SEGMENT_H
+
+#include <linux/types.h>
+
+static inline u32 segment_to_flat(u16 selector, u16 offset)
+{
+ return ((u32)selector << 4) + (u32) offset;
+}
+
+static inline u16 flat_to_seg16(u32 address)
+{
+ return address >> 4;
+}
+
+static inline u16 flat_to_off16(u32 address, u32 segment)
+{
+ return address - (segment << 4);
+}
+
+#endif /* KVM_SEGMENT_H */
diff --git a/tools/kvm/include/kvm/strbuf.h b/tools/kvm/include/kvm/strbuf.h
new file mode 100644
index 0000000..2beefbc
--- /dev/null
+++ b/tools/kvm/include/kvm/strbuf.h
@@ -0,0 +1,20 @@
+#ifndef __STRBUF_H__
+#define __STRBUF_H__
+
+#include <sys/types.h>
+#include <string.h>
+
+int prefixcmp(const char *str, const char *prefix);
+
+extern size_t strlcat(char *dest, const char *src, size_t count);
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+/* some inline functions */
+
+static inline const char *skip_prefix(const char *str, const char *prefix)
+{
+ size_t len = strlen(prefix);
+ return strncmp(str, prefix, len) ? NULL : str + len;
+}
+
+#endif
diff --git a/tools/kvm/include/kvm/symbol.h b/tools/kvm/include/kvm/symbol.h
new file mode 100644
index 0000000..725bbaf
--- /dev/null
+++ b/tools/kvm/include/kvm/symbol.h
@@ -0,0 +1,30 @@
+#ifndef KVM__SYMBOL_H
+#define KVM__SYMBOL_H
+
+#include <stddef.h>
+#include <string.h>
+
+struct kvm;
+
+#define SYMBOL_DEFAULT_UNKNOWN "<unknown>"
+
+#ifdef CONFIG_HAS_BFD
+
+int symbol_init(struct kvm *kvm);
+int symbol_exit(struct kvm *kvm);
+char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size);
+
+#else
+
+static inline int symbol_init(struct kvm *kvm) { return 0; }
+static inline char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size)
+{
+ char *s = strncpy(sym, SYMBOL_DEFAULT_UNKNOWN, size);
+ sym[size - 1] = '\0';
+ return s;
+}
+static inline int symbol_exit(struct kvm *kvm) { return 0; }
+
+#endif
+
+#endif /* KVM__SYMBOL_H */
diff --git a/tools/kvm/include/kvm/term.h b/tools/kvm/include/kvm/term.h
new file mode 100644
index 0000000..dc9882e
--- /dev/null
+++ b/tools/kvm/include/kvm/term.h
@@ -0,0 +1,26 @@
+#ifndef KVM__TERM_H
+#define KVM__TERM_H
+
+#include "kvm/kvm.h"
+
+#include <sys/uio.h>
+#include <stdbool.h>
+
+#define CONSOLE_8250 1
+#define CONSOLE_VIRTIO 2
+#define CONSOLE_HV 3
+
+#define TERM_MAX_DEVS 4
+
+int term_putc_iov(struct iovec *iov, int iovcnt, int term);
+int term_getc_iov(struct kvm *kvm, struct iovec *iov, int iovcnt, int term);
+int term_putc(char *addr, int cnt, int term);
+int term_getc(struct kvm *kvm, int term);
+
+bool term_readable(int term);
+void term_set_tty(int term);
+int term_init(struct kvm *kvm);
+int term_exit(struct kvm *kvm);
+int tty_parser(const struct option *opt, const char *arg, int unset);
+
+#endif /* KVM__TERM_H */
diff --git a/tools/kvm/include/kvm/threadpool.h b/tools/kvm/include/kvm/threadpool.h
new file mode 100644
index 0000000..bacb243
--- /dev/null
+++ b/tools/kvm/include/kvm/threadpool.h
@@ -0,0 +1,38 @@
+#ifndef KVM__THREADPOOL_H
+#define KVM__THREADPOOL_H
+
+#include "kvm/mutex.h"
+
+#include <linux/list.h>
+
+struct kvm;
+
+typedef void (*kvm_thread_callback_fn_t)(struct kvm *kvm, void *data);
+
+struct thread_pool__job {
+ kvm_thread_callback_fn_t callback;
+ struct kvm *kvm;
+ void *data;
+
+ int signalcount;
+ struct mutex mutex;
+
+ struct list_head queue;
+};
+
+static inline void thread_pool__init_job(struct thread_pool__job *job, struct kvm *kvm, kvm_thread_callback_fn_t callback, void *data)
+{
+ *job = (struct thread_pool__job) {
+ .kvm = kvm,
+ .callback = callback,
+ .data = data,
+ .mutex = MUTEX_INITIALIZER,
+ };
+}
+
+int thread_pool__init(struct kvm *kvm);
+int thread_pool__exit(struct kvm *kvm);
+
+void thread_pool__do_job(struct thread_pool__job *job);
+
+#endif
diff --git a/tools/kvm/include/kvm/types.h b/tools/kvm/include/kvm/types.h
new file mode 100644
index 0000000..0cbc5fb
--- /dev/null
+++ b/tools/kvm/include/kvm/types.h
@@ -0,0 +1,7 @@
+#ifndef KVM_TYPES_H
+#define KVM_TYPES_H
+
+/* FIXME: include/linux/if_tun.h and include/linux/if_ether.h complains */
+#define __be16 u16
+
+#endif /* KVM_TYPES_H */
diff --git a/tools/kvm/include/kvm/uip.h b/tools/kvm/include/kvm/uip.h
new file mode 100644
index 0000000..a9a8d85
--- /dev/null
+++ b/tools/kvm/include/kvm/uip.h
@@ -0,0 +1,362 @@
+#ifndef KVM__UIP_H
+#define KVM__UIP_H
+
+#include "linux/types.h"
+#include "kvm/mutex.h"
+
+#include <netinet/in.h>
+#include <sys/uio.h>
+
+#define UIP_BUF_STATUS_FREE 0
+#define UIP_BUF_STATUS_INUSE 1
+#define UIP_BUF_STATUS_USED 2
+
+#define UIP_ETH_P_IP 0X0800
+#define UIP_ETH_P_ARP 0X0806
+
+#define UIP_IP_VER_4 0X40
+#define UIP_IP_HDR_LEN 0X05
+#define UIP_IP_TTL 0X40
+#define UIP_IP_P_UDP 0X11
+#define UIP_IP_P_TCP 0X06
+#define UIP_IP_P_ICMP 0X01
+
+#define UIP_TCP_HDR_LEN 0x50
+#define UIP_TCP_WIN_SIZE 14600
+#define UIP_TCP_FLAG_FIN 1
+#define UIP_TCP_FLAG_SYN 2
+#define UIP_TCP_FLAG_RST 4
+#define UIP_TCP_FLAG_PSH 8
+#define UIP_TCP_FLAG_ACK 16
+#define UIP_TCP_FLAG_URG 32
+
+#define UIP_BOOTP_VENDOR_SPECIFIC_LEN 64
+#define UIP_BOOTP_MAX_PAYLOAD_LEN 300
+#define UIP_DHCP_VENDOR_SPECIFIC_LEN 312
+#define UIP_DHCP_PORT_SERVER 67
+#define UIP_DHCP_PORT_CLIENT 68
+#define UIP_DHCP_MACPAD_LEN 10
+#define UIP_DHCP_HOSTNAME_LEN 64
+#define UIP_DHCP_FILENAME_LEN 128
+#define UIP_DHCP_MAGIC_COOKIE 0x63825363
+#define UIP_DHCP_MAGIC_COOKIE_LEN 4
+#define UIP_DHCP_LEASE_TIME 0x00003840
+#define UIP_DHCP_MAX_PAYLOAD_LEN (UIP_BOOTP_MAX_PAYLOAD_LEN - UIP_BOOTP_VENDOR_SPECIFIC_LEN + UIP_DHCP_VENDOR_SPECIFIC_LEN)
+#define UIP_DHCP_OPTION_LEN (UIP_DHCP_VENDOR_SPECIFIC_LEN - UIP_DHCP_MAGIC_COOKIE_LEN)
+#define UIP_DHCP_DISCOVER 1
+#define UIP_DHCP_OFFER 2
+#define UIP_DHCP_REQUEST 3
+#define UIP_DHCP_ACK 5
+#define UIP_DHCP_MAX_DNS_SERVER_NR 3
+#define UIP_DHCP_MAX_DOMAIN_NAME_LEN 256
+#define UIP_DHCP_TAG_MSG_TYPE 53
+#define UIP_DHCP_TAG_MSG_TYPE_LEN 1
+#define UIP_DHCP_TAG_SERVER_ID 54
+#define UIP_DHCP_TAG_SERVER_ID_LEN 4
+#define UIP_DHCP_TAG_LEASE_TIME 51
+#define UIP_DHCP_TAG_LEASE_TIME_LEN 4
+#define UIP_DHCP_TAG_SUBMASK 1
+#define UIP_DHCP_TAG_SUBMASK_LEN 4
+#define UIP_DHCP_TAG_ROUTER 3
+#define UIP_DHCP_TAG_ROUTER_LEN 4
+#define UIP_DHCP_TAG_ROOT 17
+#define UIP_DHCP_TAG_ROOT_LEN 4
+#define UIP_DHCP_TAG_DNS_SERVER 6
+#define UIP_DHCP_TAG_DNS_SERVER_LEN 4
+#define UIP_DHCP_TAG_DOMAIN_NAME 15
+#define UIP_DHCP_TAG_END 255
+
+/*
+ * IP packet maximum len == 64 KBytes
+ * IP header == 20 Bytes
+ * TCP header == 20 Bytes
+ * UDP header == 8 Bytes
+ */
+#define UIP_MAX_TCP_PAYLOAD (64*1024 - 20 - 20 - 1)
+#define UIP_MAX_UDP_PAYLOAD (64*1024 - 20 - 8 - 1)
+
+struct uip_eth_addr {
+ u8 addr[6];
+};
+
+struct uip_eth {
+ struct uip_eth_addr dst;
+ struct uip_eth_addr src;
+ u16 type;
+} __attribute__((packed));
+
+struct uip_arp {
+ struct uip_eth eth;
+ u16 hwtype;
+ u16 proto;
+ u8 hwlen;
+ u8 protolen;
+ u16 op;
+ struct uip_eth_addr smac;
+ u32 sip;
+ struct uip_eth_addr dmac;
+ u32 dip;
+} __attribute__((packed));
+
+struct uip_ip {
+ struct uip_eth eth;
+ u8 vhl;
+ u8 tos;
+ /*
+ * len = IP hdr + IP payload
+ */
+ u16 len;
+ u16 id;
+ u16 flgfrag;
+ u8 ttl;
+ u8 proto;
+ u16 csum;
+ u32 sip;
+ u32 dip;
+} __attribute__((packed));
+
+struct uip_icmp {
+ struct uip_ip ip;
+ u8 type;
+ u8 code;
+ u16 csum;
+ u16 id;
+ u16 seq;
+} __attribute__((packed));
+
+struct uip_udp {
+ /*
+ * FIXME: IP Options (IP hdr len > 20 bytes) are not supported
+ */
+ struct uip_ip ip;
+ u16 sport;
+ u16 dport;
+ /*
+ * len = UDP hdr + UDP payload
+ */
+ u16 len;
+ u16 csum;
+ u8 payload[0];
+} __attribute__((packed));
+
+struct uip_tcp {
+ /*
+ * FIXME: IP Options (IP hdr len > 20 bytes) are not supported
+ */
+ struct uip_ip ip;
+ u16 sport;
+ u16 dport;
+ u32 seq;
+ u32 ack;
+ u8 off;
+ u8 flg;
+ u16 win;
+ u16 csum;
+ u16 urgent;
+} __attribute__((packed));
+
+struct uip_pseudo_hdr {
+ u32 sip;
+ u32 dip;
+ u8 zero;
+ u8 proto;
+ u16 len;
+} __attribute__((packed));
+
+struct uip_dhcp {
+ struct uip_udp udp;
+ u8 msg_type;
+ u8 hardware_type;
+ u8 hardware_len;
+ u8 hops;
+ u32 id;
+ u16 time;
+ u16 flg;
+ u32 client_ip;
+ u32 your_ip;
+ u32 server_ip;
+ u32 agent_ip;
+ struct uip_eth_addr client_mac;
+ u8 pad[UIP_DHCP_MACPAD_LEN];
+ u8 server_hostname[UIP_DHCP_HOSTNAME_LEN];
+ u8 boot_filename[UIP_DHCP_FILENAME_LEN];
+ u32 magic_cookie;
+ u8 option[UIP_DHCP_OPTION_LEN];
+} __attribute__((packed));
+
+struct uip_info {
+ struct list_head udp_socket_head;
+ struct list_head tcp_socket_head;
+ struct mutex udp_socket_lock;
+ struct mutex tcp_socket_lock;
+ struct uip_eth_addr guest_mac;
+ struct uip_eth_addr host_mac;
+ pthread_cond_t buf_free_cond;
+ pthread_cond_t buf_used_cond;
+ struct list_head buf_head;
+ struct mutex buf_lock;
+ pthread_t udp_thread;
+ int udp_epollfd;
+ int buf_free_nr;
+ int buf_used_nr;
+ u32 guest_ip;
+ u32 guest_netmask;
+ u32 host_ip;
+ u32 dns_ip[UIP_DHCP_MAX_DNS_SERVER_NR];
+ char *domain_name;
+ u32 buf_nr;
+ u32 vnet_hdr_len;
+};
+
+struct uip_buf {
+ struct list_head list;
+ struct uip_info *info;
+ int vnet_len;
+ int eth_len;
+ int status;
+ unsigned char *vnet;
+ unsigned char *eth;
+ int id;
+};
+
+struct uip_udp_socket {
+ struct sockaddr_in addr;
+ struct list_head list;
+ struct mutex *lock;
+ u32 dport, sport;
+ u32 dip, sip;
+ int fd;
+};
+
+struct uip_tcp_socket {
+ struct sockaddr_in addr;
+ struct list_head list;
+ struct uip_info *info;
+ pthread_cond_t cond;
+ struct mutex *lock;
+ pthread_t thread;
+ u32 dport, sport;
+ u32 guest_acked;
+ u16 window_size;
+ /*
+ * Initial Sequence Number
+ */
+ u32 isn_server;
+ u32 isn_guest;
+ u32 ack_server;
+ u32 seq_server;
+ int write_done;
+ int read_done;
+ u32 dip, sip;
+ u8 *payload;
+ int fd;
+};
+
+struct uip_tx_arg {
+ void *vnet;
+ struct uip_info *info;
+ struct uip_eth *eth;
+ int vnet_len;
+ int eth_len;
+};
+
+static inline u16 uip_ip_hdrlen(struct uip_ip *ip)
+{
+ return (ip->vhl & 0x0f) * 4;
+}
+
+static inline u16 uip_ip_len(struct uip_ip *ip)
+{
+ return htons(ip->len);
+}
+
+static inline u16 uip_udp_hdrlen(struct uip_udp *udp)
+{
+ return 8;
+}
+
+static inline u16 uip_udp_len(struct uip_udp *udp)
+{
+ return ntohs(udp->len);
+}
+
+static inline u16 uip_tcp_hdrlen(struct uip_tcp *tcp)
+{
+ return (tcp->off >> 4) * 4;
+}
+
+static inline u16 uip_tcp_len(struct uip_tcp *tcp)
+{
+ struct uip_ip *ip;
+
+ ip = &tcp->ip;
+
+ return uip_ip_len(ip) - uip_ip_hdrlen(ip);
+}
+
+static inline u16 uip_tcp_payloadlen(struct uip_tcp *tcp)
+{
+ return uip_tcp_len(tcp) - uip_tcp_hdrlen(tcp);
+}
+
+static inline u8 *uip_tcp_payload(struct uip_tcp *tcp)
+{
+ return (u8 *)&tcp->sport + uip_tcp_hdrlen(tcp);
+}
+
+static inline bool uip_tcp_is_syn(struct uip_tcp *tcp)
+{
+ return (tcp->flg & UIP_TCP_FLAG_SYN) != 0;
+}
+
+static inline bool uip_tcp_is_fin(struct uip_tcp *tcp)
+{
+ return (tcp->flg & UIP_TCP_FLAG_FIN) != 0;
+}
+
+static inline u32 uip_tcp_isn(struct uip_tcp *tcp)
+{
+ return ntohl(tcp->seq);
+}
+
+static inline u32 uip_tcp_isn_alloc(void)
+{
+ /*
+ * FIXME: should increase every 4ms
+ */
+ return 10000000;
+}
+
+static inline u16 uip_eth_hdrlen(struct uip_eth *eth)
+{
+ return sizeof(*eth);
+}
+
+int uip_tx(struct iovec *iov, u16 out, struct uip_info *info);
+int uip_rx(struct iovec *iov, u16 in, struct uip_info *info);
+void uip_static_init(struct uip_info *info);
+int uip_init(struct uip_info *info);
+
+int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg);
+int uip_tx_do_ipv4(struct uip_tx_arg *arg);
+int uip_tx_do_arp(struct uip_tx_arg *arg);
+
+u16 uip_csum_icmp(struct uip_icmp *icmp);
+u16 uip_csum_udp(struct uip_udp *udp);
+u16 uip_csum_tcp(struct uip_tcp *tcp);
+u16 uip_csum_ip(struct uip_ip *ip);
+
+struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf);
+struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf);
+struct uip_buf *uip_buf_get_used(struct uip_info *info);
+struct uip_buf *uip_buf_get_free(struct uip_info *info);
+struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg);
+
+int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 *payload, int payload_len);
+bool uip_udp_is_dhcp(struct uip_udp *udp);
+
+int uip_dhcp_get_dns(struct uip_info *info);
+#endif /* KVM__UIP_H */
diff --git a/tools/kvm/include/kvm/util-init.h b/tools/kvm/include/kvm/util-init.h
new file mode 100644
index 0000000..13d4f04
--- /dev/null
+++ b/tools/kvm/include/kvm/util-init.h
@@ -0,0 +1,51 @@
+#ifndef KVM__UTIL_INIT_H
+#define KVM__UTIL_INIT_H
+
+struct kvm;
+
+struct init_item {
+ struct hlist_node n;
+ const char *fn_name;
+ int (*init)(struct kvm *);
+};
+
+int init_list__init(struct kvm *kvm);
+int init_list__exit(struct kvm *kvm);
+
+int init_list_add(struct init_item *t, int (*init)(struct kvm *),
+ int priority, const char *name);
+int exit_list_add(struct init_item *t, int (*init)(struct kvm *),
+ int priority, const char *name);
+
+#define __init_list_add(cb, l) \
+static void __attribute__ ((constructor)) __init__##cb(void) \
+{ \
+ static char name[] = #cb; \
+ static struct init_item t; \
+ init_list_add(&t, cb, l, name); \
+}
+
+#define __exit_list_add(cb, l) \
+static void __attribute__ ((constructor)) __init__##cb(void) \
+{ \
+ static char name[] = #cb; \
+ static struct init_item t; \
+ exit_list_add(&t, cb, l, name); \
+}
+
+#define core_init(cb) __init_list_add(cb, 0)
+#define base_init(cb) __init_list_add(cb, 2)
+#define dev_base_init(cb) __init_list_add(cb, 4)
+#define dev_init(cb) __init_list_add(cb, 5)
+#define virtio_dev_init(cb) __init_list_add(cb, 6)
+#define firmware_init(cb) __init_list_add(cb, 7)
+#define late_init(cb) __init_list_add(cb, 9)
+
+#define core_exit(cb) __exit_list_add(cb, 0)
+#define base_exit(cb) __exit_list_add(cb, 2)
+#define dev_base_exit(cb) __exit_list_add(cb, 4)
+#define dev_exit(cb) __exit_list_add(cb, 5)
+#define virtio_dev_exit(cb) __exit_list_add(cb, 6)
+#define firmware_exit(cb) __exit_list_add(cb, 7)
+#define late_exit(cb) __exit_list_add(cb, 9)
+#endif
diff --git a/tools/kvm/include/kvm/util.h b/tools/kvm/include/kvm/util.h
new file mode 100644
index 0000000..0df9f0d
--- /dev/null
+++ b/tools/kvm/include/kvm/util.h
@@ -0,0 +1,97 @@
+#include <linux/stringify.h>
+
+#ifndef KVM__UTIL_H
+#define KVM__UTIL_H
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/*
+ * Some bits are stolen from perf tool :)
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <linux/types.h>
+
+#ifdef __GNUC__
+#define NORETURN __attribute__((__noreturn__))
+#else
+#define NORETURN
+#ifndef __attribute__
+#define __attribute__(x)
+#endif
+#endif
+
+extern bool do_debug_print;
+
+#define PROT_RW (PROT_READ|PROT_WRITE)
+#define MAP_ANON_NORESERVE (MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE)
+
+extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
+extern void die_perror(const char *s) NORETURN;
+extern int pr_err(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void pr_warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void pr_info(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
+
+#define pr_debug(fmt, ...) \
+ do { \
+ if (do_debug_print) \
+ pr_info("(%s) %s:%d: " fmt, __FILE__, \
+ __func__, __LINE__, ##__VA_ARGS__); \
+ } while (0)
+
+
+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+
+#ifndef BUG_ON_HANDLER
+# define BUG_ON_HANDLER(condition) \
+ do { \
+ if ((condition)) { \
+ pr_err("BUG at %s:%d", __FILE__, __LINE__); \
+ raise(SIGABRT); \
+ } \
+ } while (0)
+#endif
+
+#define BUG_ON(condition) BUG_ON_HANDLER((condition))
+
+#define DIE_IF(cnd) \
+do { \
+ if (cnd) \
+ die(" at (" __FILE__ ":" __stringify(__LINE__) "): " \
+ __stringify(cnd) "\n"); \
+} while (0)
+
+#define WARN_ON(condition) ({ \
+ int __ret_warn_on = !!(condition); \
+ if (__ret_warn_on) \
+ pr_warning("(%s) %s:%d: failed condition: %s", \
+ __FILE__, __func__, __LINE__, \
+ __stringify(condition)); \
+ __ret_warn_on; \
+})
+
+#define MSECS_TO_USECS(s) ((s) * 1000)
+
+/* Millisecond sleep */
+static inline void msleep(unsigned int msecs)
+{
+ usleep(MSECS_TO_USECS(msecs));
+}
+
+struct kvm;
+void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size);
+void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size);
+
+#endif /* KVM__UTIL_H */
diff --git a/tools/kvm/include/kvm/vesa.h b/tools/kvm/include/kvm/vesa.h
new file mode 100644
index 0000000..ac041d9
--- /dev/null
+++ b/tools/kvm/include/kvm/vesa.h
@@ -0,0 +1,18 @@
+#ifndef KVM__VESA_H
+#define KVM__VESA_H
+
+#include <linux/types.h>
+
+#define VESA_WIDTH 640
+#define VESA_HEIGHT 480
+
+#define VESA_MEM_ADDR 0xd0000000
+#define VESA_MEM_SIZE (4*VESA_WIDTH*VESA_HEIGHT)
+#define VESA_BPP 32
+
+struct kvm;
+struct biosregs;
+
+struct framebuffer *vesa__init(struct kvm *self);
+
+#endif
diff --git a/tools/kvm/include/kvm/virtio-9p.h b/tools/kvm/include/kvm/virtio-9p.h
new file mode 100644
index 0000000..19ffe50
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-9p.h
@@ -0,0 +1,76 @@
+#ifndef KVM__VIRTIO_9P_H
+#define KVM__VIRTIO_9P_H
+#include "kvm/virtio.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/parse-options.h"
+
+#include <sys/types.h>
+#include <dirent.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+#define NUM_VIRT_QUEUES 1
+#define VIRTQUEUE_NUM 128
+#define VIRTIO_9P_DEFAULT_TAG "kvm_9p"
+#define VIRTIO_9P_HDR_LEN (sizeof(u32)+sizeof(u8)+sizeof(u16))
+#define VIRTIO_9P_VERSION_DOTL "9P2000.L"
+#define MAX_TAG_LEN 32
+
+struct p9_msg {
+ u32 size;
+ u8 cmd;
+ u16 tag;
+ u8 msg[0];
+} __attribute__((packed));
+
+struct p9_fid {
+ u32 fid;
+ u32 uid;
+ char abs_path[PATH_MAX];
+ char *path;
+ DIR *dir;
+ int fd;
+ struct rb_node node;
+};
+
+struct p9_dev_job {
+ struct virt_queue *vq;
+ struct p9_dev *p9dev;
+ struct thread_pool__job job_id;
+};
+
+struct p9_dev {
+ struct list_head list;
+ struct virtio_device vdev;
+ struct rb_root fids;
+
+ struct virtio_9p_config *config;
+ u32 features;
+
+ /* virtio queue */
+ struct virt_queue vqs[NUM_VIRT_QUEUES];
+ struct p9_dev_job jobs[NUM_VIRT_QUEUES];
+ char root_dir[PATH_MAX];
+};
+
+struct p9_pdu {
+ u32 queue_head;
+ size_t read_offset;
+ size_t write_offset;
+ u16 out_iov_cnt;
+ u16 in_iov_cnt;
+ struct iovec in_iov[VIRTQUEUE_NUM];
+ struct iovec out_iov[VIRTQUEUE_NUM];
+};
+
+struct kvm;
+
+int virtio_9p_rootdir_parser(const struct option *opt, const char *arg, int unset);
+int virtio_9p_img_name_parser(const struct option *opt, const char *arg, int unset);
+int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name);
+int virtio_9p__init(struct kvm *kvm);
+int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...);
+int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...);
+
+#endif
diff --git a/tools/kvm/include/kvm/virtio-balloon.h b/tools/kvm/include/kvm/virtio-balloon.h
new file mode 100644
index 0000000..844a1ba
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-balloon.h
@@ -0,0 +1,9 @@
+#ifndef KVM__BLN_VIRTIO_H
+#define KVM__BLN_VIRTIO_H
+
+struct kvm;
+
+int virtio_bln__init(struct kvm *kvm);
+int virtio_bln__exit(struct kvm *kvm);
+
+#endif /* KVM__BLN_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-blk.h b/tools/kvm/include/kvm/virtio-blk.h
new file mode 100644
index 0000000..12e59b6
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-blk.h
@@ -0,0 +1,12 @@
+#ifndef KVM__BLK_VIRTIO_H
+#define KVM__BLK_VIRTIO_H
+
+#include "kvm/disk-image.h"
+
+struct kvm;
+
+int virtio_blk__init(struct kvm *kvm);
+int virtio_blk__exit(struct kvm *kvm);
+void virtio_blk_complete(void *param, long len);
+
+#endif /* KVM__BLK_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-console.h b/tools/kvm/include/kvm/virtio-console.h
new file mode 100644
index 0000000..8980920
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-console.h
@@ -0,0 +1,10 @@
+#ifndef KVM__CONSOLE_VIRTIO_H
+#define KVM__CONSOLE_VIRTIO_H
+
+struct kvm;
+
+int virtio_console__init(struct kvm *kvm);
+void virtio_console__inject_interrupt(struct kvm *kvm);
+int virtio_console__exit(struct kvm *kvm);
+
+#endif /* KVM__CONSOLE_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-mmio.h b/tools/kvm/include/kvm/virtio-mmio.h
new file mode 100644
index 0000000..835f421
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-mmio.h
@@ -0,0 +1,60 @@
+#ifndef KVM__VIRTIO_MMIO_H
+#define KVM__VIRTIO_MMIO_H
+
+#include <linux/types.h>
+#include <linux/virtio_mmio.h>
+
+#define VIRTIO_MMIO_MAX_VQ 32
+#define VIRTIO_MMIO_MAX_CONFIG 1
+#define VIRTIO_MMIO_IO_SIZE 0x200
+
+struct kvm;
+
+struct virtio_mmio_ioevent_param {
+ struct virtio_device *vdev;
+ u32 vq;
+};
+
+struct virtio_mmio_hdr {
+ char magic[4];
+ u32 version;
+ u32 device_id;
+ u32 vendor_id;
+ u32 host_features;
+ u32 host_features_sel;
+ u32 reserved_1[2];
+ u32 guest_features;
+ u32 guest_features_sel;
+ u32 guest_page_size;
+ u32 reserved_2;
+ u32 queue_sel;
+ u32 queue_num_max;
+ u32 queue_num;
+ u32 queue_align;
+ u32 queue_pfn;
+ u32 reserved_3[3];
+ u32 queue_notify;
+ u32 reserved_4[3];
+ u32 interrupt_state;
+ u32 interrupt_ack;
+ u32 reserved_5[2];
+ u32 status;
+} __attribute__((packed));
+
+struct virtio_mmio {
+ u32 addr;
+ void *dev;
+ struct kvm *kvm;
+ u8 irq;
+ struct virtio_mmio_hdr hdr;
+ struct device_header dev_hdr;
+ struct virtio_mmio_ioevent_param ioeventfds[VIRTIO_MMIO_MAX_VQ];
+};
+
+int virtio_mmio_signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq);
+int virtio_mmio_signal_config(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_mmio_exit(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_mmio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+ int device_id, int subsys_id, int class);
+void virtio_mmio_assign_irq(struct device_header *dev_hdr);
+#endif
diff --git a/tools/kvm/include/kvm/virtio-net.h b/tools/kvm/include/kvm/virtio-net.h
new file mode 100644
index 0000000..f435cc3
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-net.h
@@ -0,0 +1,32 @@
+#ifndef KVM__VIRTIO_NET_H
+#define KVM__VIRTIO_NET_H
+
+#include "kvm/parse-options.h"
+
+struct kvm;
+
+struct virtio_net_params {
+ const char *guest_ip;
+ const char *host_ip;
+ const char *script;
+ const char *trans;
+ const char *tapif;
+ char guest_mac[6];
+ char host_mac[6];
+ struct kvm *kvm;
+ int mode;
+ int vhost;
+ int fd;
+ int mq;
+};
+
+int virtio_net__init(struct kvm *kvm);
+int virtio_net__exit(struct kvm *kvm);
+int netdev_parser(const struct option *opt, const char *arg, int unset);
+
+enum {
+ NET_MODE_USER,
+ NET_MODE_TAP
+};
+
+#endif /* KVM__VIRTIO_NET_H */
diff --git a/tools/kvm/include/kvm/virtio-pci-dev.h b/tools/kvm/include/kvm/virtio-pci-dev.h
new file mode 100644
index 0000000..48ae018
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-pci-dev.h
@@ -0,0 +1,38 @@
+#ifndef VIRTIO_PCI_DEV_H_
+#define VIRTIO_PCI_DEV_H_
+
+#include <linux/virtio_ids.h>
+
+/*
+ * Virtio PCI device constants and resources
+ * they do use (such as irqs and pins).
+ */
+
+#define PCI_DEVICE_ID_VIRTIO_NET 0x1000
+#define PCI_DEVICE_ID_VIRTIO_BLK 0x1001
+#define PCI_DEVICE_ID_VIRTIO_CONSOLE 0x1003
+#define PCI_DEVICE_ID_VIRTIO_RNG 0x1004
+#define PCI_DEVICE_ID_VIRTIO_BLN 0x1005
+#define PCI_DEVICE_ID_VIRTIO_SCSI 0x1008
+#define PCI_DEVICE_ID_VIRTIO_9P 0x1009
+#define PCI_DEVICE_ID_VESA 0x2000
+#define PCI_DEVICE_ID_PCI_SHMEM 0x0001
+
+#define PCI_VENDOR_ID_REDHAT_QUMRANET 0x1af4
+#define PCI_VENDOR_ID_PCI_SHMEM 0x0001
+#define PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET 0x1af4
+
+#define PCI_SUBSYSTEM_ID_VESA 0x0004
+#define PCI_SUBSYSTEM_ID_PCI_SHMEM 0x0001
+
+#define PCI_CLASS_BLK 0x018000
+#define PCI_CLASS_NET 0x020000
+#define PCI_CLASS_CONSOLE 0x078000
+/*
+ * 0xFF Device does not fit in any defined classes
+ */
+#define PCI_CLASS_RNG 0xff0000
+#define PCI_CLASS_BLN 0xff0000
+#define PCI_CLASS_9P 0xff0000
+
+#endif /* VIRTIO_PCI_DEV_H_ */
diff --git a/tools/kvm/include/kvm/virtio-pci.h b/tools/kvm/include/kvm/virtio-pci.h
new file mode 100644
index 0000000..c795ce7
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-pci.h
@@ -0,0 +1,53 @@
+#ifndef KVM__VIRTIO_PCI_H
+#define KVM__VIRTIO_PCI_H
+
+#include "kvm/devices.h"
+#include "kvm/pci.h"
+
+#include <linux/types.h>
+
+#define VIRTIO_PCI_MAX_VQ 32
+#define VIRTIO_PCI_MAX_CONFIG 1
+
+struct kvm;
+
+struct virtio_pci_ioevent_param {
+ struct virtio_device *vdev;
+ u32 vq;
+};
+
+#define VIRTIO_PCI_F_SIGNAL_MSI (1 << 0)
+
+struct virtio_pci {
+ struct pci_device_header pci_hdr;
+ struct device_header dev_hdr;
+ void *dev;
+ struct kvm *kvm;
+
+ u16 port_addr;
+ u32 mmio_addr;
+ u8 status;
+ u8 isr;
+ u32 features;
+
+ /* MSI-X */
+ u16 config_vector;
+ u32 config_gsi;
+ u32 vq_vector[VIRTIO_PCI_MAX_VQ];
+ u32 gsis[VIRTIO_PCI_MAX_VQ];
+ u32 msix_io_block;
+ u64 msix_pba;
+ struct msix_table msix_table[VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG];
+
+ /* virtio queue */
+ u16 queue_selector;
+ struct virtio_pci_ioevent_param ioeventfds[VIRTIO_PCI_MAX_VQ];
+};
+
+int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq);
+int virtio_pci__signal_config(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_pci__exit(struct kvm *kvm, struct virtio_device *vdev);
+int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+ int device_id, int subsys_id, int class);
+
+#endif
diff --git a/tools/kvm/include/kvm/virtio-rng.h b/tools/kvm/include/kvm/virtio-rng.h
new file mode 100644
index 0000000..b585b37
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-rng.h
@@ -0,0 +1,9 @@
+#ifndef KVM__RNG_VIRTIO_H
+#define KVM__RNG_VIRTIO_H
+
+struct kvm;
+
+int virtio_rng__init(struct kvm *kvm);
+int virtio_rng__exit(struct kvm *kvm);
+
+#endif /* KVM__RNG_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio-scsi.h b/tools/kvm/include/kvm/virtio-scsi.h
new file mode 100644
index 0000000..d64aa7e
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio-scsi.h
@@ -0,0 +1,11 @@
+#ifndef KVM__SCSI_VIRTIO_H
+#define KVM__SCSI_VIRTIO_H
+
+#include "kvm/disk-image.h"
+
+struct kvm;
+
+int virtio_scsi_init(struct kvm *kvm);
+int virtio_scsi_exit(struct kvm *kvm);
+
+#endif /* KVM__SCSI_VIRTIO_H */
diff --git a/tools/kvm/include/kvm/virtio.h b/tools/kvm/include/kvm/virtio.h
new file mode 100644
index 0000000..8a9eab5
--- /dev/null
+++ b/tools/kvm/include/kvm/virtio.h
@@ -0,0 +1,175 @@
+#ifndef KVM__VIRTIO_H
+#define KVM__VIRTIO_H
+
+#include <endian.h>
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+
+#include <linux/types.h>
+#include <sys/uio.h>
+
+#include "kvm/kvm.h"
+
+#define VIRTIO_IRQ_LOW 0
+#define VIRTIO_IRQ_HIGH 1
+
+#define VIRTIO_PCI_O_CONFIG 0
+#define VIRTIO_PCI_O_MSIX 1
+
+#define VIRTIO_ENDIAN_HOST 0
+#define VIRTIO_ENDIAN_LE (1 << 0)
+#define VIRTIO_ENDIAN_BE (1 << 1)
+
+struct virt_queue {
+ struct vring vring;
+ u32 pfn;
+ /* The last_avail_idx field is an index to ->ring of struct vring_avail.
+ It's where we assume the next request index is at. */
+ u16 last_avail_idx;
+ u16 last_used_signalled;
+ u16 endian;
+};
+
+/*
+ * The default policy is not to cope with the guest endianness.
+ * It also helps not breaking archs that do not care about supporting
+ * such a configuration.
+ */
+#ifndef VIRTIO_RING_ENDIAN
+#define VIRTIO_RING_ENDIAN VIRTIO_ENDIAN_HOST
+#endif
+
+#if (VIRTIO_RING_ENDIAN & (VIRTIO_ENDIAN_LE | VIRTIO_ENDIAN_BE))
+
+static inline __u16 __virtio_g2h_u16(u16 endian, __u16 val)
+{
+ return (endian == VIRTIO_ENDIAN_LE) ? le16toh(val) : be16toh(val);
+}
+
+static inline __u16 __virtio_h2g_u16(u16 endian, __u16 val)
+{
+ return (endian == VIRTIO_ENDIAN_LE) ? htole16(val) : htobe16(val);
+}
+
+static inline __u32 __virtio_g2h_u32(u16 endian, __u32 val)
+{
+ return (endian == VIRTIO_ENDIAN_LE) ? le32toh(val) : be32toh(val);
+}
+
+static inline __u32 __virtio_h2g_u32(u16 endian, __u32 val)
+{
+ return (endian == VIRTIO_ENDIAN_LE) ? htole32(val) : htobe32(val);
+}
+
+static inline __u64 __virtio_g2h_u64(u16 endian, __u64 val)
+{
+ return (endian == VIRTIO_ENDIAN_LE) ? le64toh(val) : be64toh(val);
+}
+
+static inline __u64 __virtio_h2g_u64(u16 endian, __u64 val)
+{
+ return (endian == VIRTIO_ENDIAN_LE) ? htole64(val) : htobe64(val);
+}
+
+#define virtio_guest_to_host_u16(x, v) __virtio_g2h_u16((x)->endian, (v))
+#define virtio_host_to_guest_u16(x, v) __virtio_h2g_u16((x)->endian, (v))
+#define virtio_guest_to_host_u32(x, v) __virtio_g2h_u32((x)->endian, (v))
+#define virtio_host_to_guest_u32(x, v) __virtio_h2g_u32((x)->endian, (v))
+#define virtio_guest_to_host_u64(x, v) __virtio_g2h_u64((x)->endian, (v))
+#define virtio_host_to_guest_u64(x, v) __virtio_h2g_u64((x)->endian, (v))
+
+#else
+
+#define virtio_guest_to_host_u16(x, v) (v)
+#define virtio_host_to_guest_u16(x, v) (v)
+#define virtio_guest_to_host_u32(x, v) (v)
+#define virtio_host_to_guest_u32(x, v) (v)
+#define virtio_guest_to_host_u64(x, v) (v)
+#define virtio_host_to_guest_u64(x, v) (v)
+
+#endif
+
+static inline u16 virt_queue__pop(struct virt_queue *queue)
+{
+ __u16 guest_idx;
+
+ guest_idx = queue->vring.avail->ring[queue->last_avail_idx++ % queue->vring.num];
+ return virtio_guest_to_host_u16(queue, guest_idx);
+}
+
+static inline struct vring_desc *virt_queue__get_desc(struct virt_queue *queue, u16 desc_ndx)
+{
+ return &queue->vring.desc[desc_ndx];
+}
+
+static inline bool virt_queue__available(struct virt_queue *vq)
+{
+ if (!vq->vring.avail)
+ return 0;
+
+ vring_avail_event(&vq->vring) = virtio_host_to_guest_u16(vq, vq->last_avail_idx);
+ return virtio_guest_to_host_u16(vq, vq->vring.avail->idx) != vq->last_avail_idx;
+}
+
+struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len);
+
+bool virtio_queue__should_signal(struct virt_queue *vq);
+u16 virt_queue__get_iov(struct virt_queue *vq, struct iovec iov[],
+ u16 *out, u16 *in, struct kvm *kvm);
+u16 virt_queue__get_head_iov(struct virt_queue *vq, struct iovec iov[],
+ u16 *out, u16 *in, u16 head, struct kvm *kvm);
+u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue,
+ struct iovec in_iov[], struct iovec out_iov[],
+ u16 *in, u16 *out);
+int virtio__get_dev_specific_field(int offset, bool msix, u32 *config_off);
+
+enum virtio_trans {
+ VIRTIO_PCI,
+ VIRTIO_MMIO,
+};
+
+struct virtio_device {
+ bool use_vhost;
+ void *virtio;
+ struct virtio_ops *ops;
+ u16 endian;
+};
+
+struct virtio_ops {
+ u8 *(*get_config)(struct kvm *kvm, void *dev);
+ u32 (*get_host_features)(struct kvm *kvm, void *dev);
+ void (*set_guest_features)(struct kvm *kvm, void *dev, u32 features);
+ int (*init_vq)(struct kvm *kvm, void *dev, u32 vq, u32 page_size,
+ u32 align, u32 pfn);
+ int (*notify_vq)(struct kvm *kvm, void *dev, u32 vq);
+ int (*get_pfn_vq)(struct kvm *kvm, void *dev, u32 vq);
+ int (*get_size_vq)(struct kvm *kvm, void *dev, u32 vq);
+ int (*set_size_vq)(struct kvm *kvm, void *dev, u32 vq, int size);
+ void (*notify_vq_gsi)(struct kvm *kvm, void *dev, u32 vq, u32 gsi);
+ void (*notify_vq_eventfd)(struct kvm *kvm, void *dev, u32 vq, u32 efd);
+ int (*signal_vq)(struct kvm *kvm, struct virtio_device *vdev, u32 queueid);
+ int (*signal_config)(struct kvm *kvm, struct virtio_device *vdev);
+ void (*notify_status)(struct kvm *kvm, void *dev, u8 status);
+ int (*init)(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+ int device_id, int subsys_id, int class);
+ int (*exit)(struct kvm *kvm, struct virtio_device *vdev);
+};
+
+int virtio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+ struct virtio_ops *ops, enum virtio_trans trans,
+ int device_id, int subsys_id, int class);
+int virtio_compat_add_message(const char *device, const char *config);
+
+static inline void *virtio_get_vq(struct kvm *kvm, u32 pfn, u32 page_size)
+{
+ return guest_flat_to_host(kvm, (u64)pfn * page_size);
+}
+
+static inline void virtio_init_device_vq(struct virtio_device *vdev,
+ struct virt_queue *vq)
+{
+ vq->endian = vdev->endian;
+}
+
+#endif /* KVM__VIRTIO_H */
diff --git a/tools/kvm/include/kvm/vnc.h b/tools/kvm/include/kvm/vnc.h
new file mode 100644
index 0000000..c2934a4
--- /dev/null
+++ b/tools/kvm/include/kvm/vnc.h
@@ -0,0 +1,22 @@
+#ifndef KVM__VNC_H
+#define KVM__VNC_H
+
+#include "kvm/kvm.h"
+
+struct framebuffer;
+
+#ifdef CONFIG_HAS_VNCSERVER
+int vnc__init(struct kvm *kvm);
+int vnc__exit(struct kvm *kvm);
+#else
+static inline int vnc__init(struct kvm *kvm)
+{
+ return 0;
+}
+static inline int vnc__exit(struct kvm *kvm)
+{
+ return 0;
+}
+#endif
+
+#endif /* KVM__VNC_H */
diff --git a/tools/kvm/include/linux/bitops.h b/tools/kvm/include/linux/bitops.h
new file mode 100644
index 0000000..56448b7
--- /dev/null
+++ b/tools/kvm/include/linux/bitops.h
@@ -0,0 +1,33 @@
+#ifndef _KVM_LINUX_BITOPS_H_
+#define _KVM_LINUX_BITOPS_H_
+
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <asm/hweight.h>
+
+#define BITS_PER_LONG __WORDSIZE
+#define BITS_PER_BYTE 8
+#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
+
+static inline void set_bit(int nr, unsigned long *addr)
+{
+ addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
+}
+
+static inline void clear_bit(int nr, unsigned long *addr)
+{
+ addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
+}
+
+static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
+{
+ return ((1UL << (nr % BITS_PER_LONG)) &
+ (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+}
+
+static inline unsigned long hweight_long(unsigned long w)
+{
+ return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
+}
+
+#endif
diff --git a/tools/kvm/include/linux/byteorder.h b/tools/kvm/include/linux/byteorder.h
new file mode 100644
index 0000000..c490de8
--- /dev/null
+++ b/tools/kvm/include/linux/byteorder.h
@@ -0,0 +1,7 @@
+#ifndef __BYTE_ORDER_H__
+#define __BYTE_ORDER_H__
+
+#include <asm/byteorder.h>
+#include <linux/byteorder/generic.h>
+
+#endif
diff --git a/tools/kvm/include/linux/compiler.h b/tools/kvm/include/linux/compiler.h
new file mode 100644
index 0000000..898420b
--- /dev/null
+++ b/tools/kvm/include/linux/compiler.h
@@ -0,0 +1,20 @@
+#ifndef _PERF_LINUX_COMPILER_H_
+#define _PERF_LINUX_COMPILER_H_
+
+#ifndef __always_inline
+#define __always_inline inline
+#endif
+#define __user
+
+#ifndef __attribute_const__
+#define __attribute_const__
+#endif
+
+#define __used __attribute__((__unused__))
+#define __packed __attribute__((packed))
+#define __iomem
+#define __force
+#define __must_check
+#define unlikely
+
+#endif
diff --git a/tools/kvm/include/linux/kernel.h b/tools/kvm/include/linux/kernel.h
new file mode 100644
index 0000000..f2bff5f
--- /dev/null
+++ b/tools/kvm/include/linux/kernel.h
@@ -0,0 +1,51 @@
+
+#ifndef KVM__LINUX_KERNEL_H_
+#define KVM__LINUX_KERNEL_H_
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
+#define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1)
+#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
+
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+#ifndef container_of
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr: the pointer to the member.
+ * @type: the type of the container struct this is embedded in.
+ * @member: the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({ \
+ const typeof(((type *)0)->member) * __mptr = (ptr); \
+ (type *)((char *)__mptr - offsetof(type, member)); })
+#endif
+
+#define min(x, y) ({ \
+ typeof(x) _min1 = (x); \
+ typeof(y) _min2 = (y); \
+ (void) (&_min1 == &_min2); \
+ _min1 < _min2 ? _min1 : _min2; })
+
+#define max(x, y) ({ \
+ typeof(x) _max1 = (x); \
+ typeof(y) _max2 = (y); \
+ (void) (&_max1 == &_max2); \
+ _max1 > _max2 ? _max1 : _max2; })
+
+#define min_t(type, x, y) ({ \
+ type __min1 = (x); \
+ type __min2 = (y); \
+ __min1 < __min2 ? __min1: __min2; })
+
+#define max_t(type, x, y) ({ \
+ type __max1 = (x); \
+ type __max2 = (y); \
+ __max1 > __max2 ? __max1: __max2; })
+
+#define true 1
+
+#endif
diff --git a/tools/kvm/include/linux/module.h b/tools/kvm/include/linux/module.h
new file mode 100644
index 0000000..0e4c6a3
--- /dev/null
+++ b/tools/kvm/include/linux/module.h
@@ -0,0 +1,6 @@
+#ifndef KVM__LINUX_MODULE_H
+#define KVM__LINUX_MODULE_H
+
+#define EXPORT_SYMBOL(name)
+
+#endif
diff --git a/tools/kvm/include/linux/prefetch.h b/tools/kvm/include/linux/prefetch.h
new file mode 100644
index 0000000..62f6788
--- /dev/null
+++ b/tools/kvm/include/linux/prefetch.h
@@ -0,0 +1,6 @@
+#ifndef KVM__LINUX_PREFETCH_H
+#define KVM__LINUX_PREFETCH_H
+
+static inline void prefetch(void *a __attribute__((unused))) { }
+
+#endif
diff --git a/tools/kvm/include/linux/stddef.h b/tools/kvm/include/linux/stddef.h
new file mode 100644
index 0000000..39da808
--- /dev/null
+++ b/tools/kvm/include/linux/stddef.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_STDDEF_H
+#define _LINUX_STDDEF_H
+
+#undef NULL
+#define NULL ((void *)0)
+
+#undef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+
+#endif
diff --git a/tools/kvm/include/linux/types.h b/tools/kvm/include/linux/types.h
new file mode 100644
index 0000000..5e20f10
--- /dev/null
+++ b/tools/kvm/include/linux/types.h
@@ -0,0 +1,51 @@
+#ifndef LINUX_TYPES_H
+#define LINUX_TYPES_H
+
+#include <kvm/compiler.h>
+#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */
+#include <asm/types.h>
+
+typedef __u64 u64;
+typedef __s64 s64;
+
+typedef __u32 u32;
+typedef __s32 s32;
+
+typedef __u16 u16;
+typedef __s16 s16;
+
+typedef __u8 u8;
+typedef __s8 s8;
+
+#ifdef __CHECKER__
+#define __bitwise__ __attribute__((bitwise))
+#else
+#define __bitwise__
+#endif
+#ifdef __CHECK_ENDIAN__
+#define __bitwise __bitwise__
+#else
+#define __bitwise
+#endif
+
+
+typedef __u16 __bitwise __le16;
+typedef __u16 __bitwise __be16;
+typedef __u32 __bitwise __le32;
+typedef __u32 __bitwise __be32;
+typedef __u64 __bitwise __le64;
+typedef __u64 __bitwise __be64;
+
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+struct hlist_head {
+ struct hlist_node *first;
+};
+
+struct hlist_node {
+ struct hlist_node *next, **pprev;
+};
+
+#endif /* LINUX_TYPES_H */
diff --git a/tools/kvm/ioeventfd.c b/tools/kvm/ioeventfd.c
new file mode 100644
index 0000000..bce6861
--- /dev/null
+++ b/tools/kvm/ioeventfd.c
@@ -0,0 +1,218 @@
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <signal.h>
+
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/types.h>
+
+#include "kvm/ioeventfd.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#define IOEVENTFD_MAX_EVENTS 20
+
+static struct epoll_event events[IOEVENTFD_MAX_EVENTS];
+static int epoll_fd, epoll_stop_fd;
+static LIST_HEAD(used_ioevents);
+static bool ioeventfd_avail;
+
+static void *ioeventfd__thread(void *param)
+{
+ u64 tmp = 1;
+
+ kvm__set_thread_name("ioeventfd-worker");
+
+ for (;;) {
+ int nfds, i;
+
+ nfds = epoll_wait(epoll_fd, events, IOEVENTFD_MAX_EVENTS, -1);
+ for (i = 0; i < nfds; i++) {
+ struct ioevent *ioevent;
+
+ if (events[i].data.fd == epoll_stop_fd)
+ goto done;
+
+ ioevent = events[i].data.ptr;
+
+ if (read(ioevent->fd, &tmp, sizeof(tmp)) < 0)
+ die("Failed reading event");
+
+ ioevent->fn(ioevent->fn_kvm, ioevent->fn_ptr);
+ }
+ }
+
+done:
+ tmp = write(epoll_stop_fd, &tmp, sizeof(tmp));
+
+ return NULL;
+}
+
+static int ioeventfd__start(void)
+{
+ pthread_t thread;
+
+ if (!ioeventfd_avail)
+ return -ENOSYS;
+
+ return pthread_create(&thread, NULL, ioeventfd__thread, NULL);
+}
+
+int ioeventfd__init(struct kvm *kvm)
+{
+ struct epoll_event epoll_event = {.events = EPOLLIN};
+ int r;
+
+ ioeventfd_avail = kvm__supports_extension(kvm, KVM_CAP_IOEVENTFD);
+ if (!ioeventfd_avail)
+ return 1; /* Not fatal, but let caller determine no-go. */
+
+ epoll_fd = epoll_create(IOEVENTFD_MAX_EVENTS);
+ if (epoll_fd < 0)
+ return -errno;
+
+ epoll_stop_fd = eventfd(0, 0);
+ epoll_event.data.fd = epoll_stop_fd;
+
+ r = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, epoll_stop_fd, &epoll_event);
+ if (r < 0)
+ goto cleanup;
+
+ r = ioeventfd__start();
+ if (r < 0)
+ goto cleanup;
+
+ r = 0;
+
+ return r;
+
+cleanup:
+ close(epoll_stop_fd);
+ close(epoll_fd);
+
+ return r;
+}
+base_init(ioeventfd__init);
+
+int ioeventfd__exit(struct kvm *kvm)
+{
+ u64 tmp = 1;
+ int r;
+
+ if (!ioeventfd_avail)
+ return 0;
+
+ r = write(epoll_stop_fd, &tmp, sizeof(tmp));
+ if (r < 0)
+ return r;
+
+ r = read(epoll_stop_fd, &tmp, sizeof(tmp));
+ if (r < 0)
+ return r;
+
+ close(epoll_fd);
+ close(epoll_stop_fd);
+
+ return 0;
+}
+base_exit(ioeventfd__exit);
+
+int ioeventfd__add_event(struct ioevent *ioevent, int flags)
+{
+ struct kvm_ioeventfd kvm_ioevent;
+ struct epoll_event epoll_event;
+ struct ioevent *new_ioevent;
+ int event, r;
+
+ if (!ioeventfd_avail)
+ return -ENOSYS;
+
+ new_ioevent = malloc(sizeof(*new_ioevent));
+ if (new_ioevent == NULL)
+ return -ENOMEM;
+
+ *new_ioevent = *ioevent;
+ event = new_ioevent->fd;
+
+ kvm_ioevent = (struct kvm_ioeventfd) {
+ .addr = ioevent->io_addr,
+ .len = ioevent->io_len,
+ .datamatch = ioevent->datamatch,
+ .fd = event,
+ .flags = KVM_IOEVENTFD_FLAG_DATAMATCH,
+ };
+
+ if (flags & IOEVENTFD_FLAG_PIO)
+ kvm_ioevent.flags |= KVM_IOEVENTFD_FLAG_PIO;
+
+ r = ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent);
+ if (r) {
+ r = -errno;
+ goto cleanup;
+ }
+
+ if (!(flags & IOEVENTFD_FLAG_USER_POLL))
+ return 0;
+
+ epoll_event = (struct epoll_event) {
+ .events = EPOLLIN,
+ .data.ptr = new_ioevent,
+ };
+
+ r = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, event, &epoll_event);
+ if (r) {
+ r = -errno;
+ goto cleanup;
+ }
+
+ list_add_tail(&new_ioevent->list, &used_ioevents);
+
+ return 0;
+
+cleanup:
+ free(new_ioevent);
+ return r;
+}
+
+int ioeventfd__del_event(u64 addr, u64 datamatch)
+{
+ struct kvm_ioeventfd kvm_ioevent;
+ struct ioevent *ioevent;
+ u8 found = 0;
+
+ if (!ioeventfd_avail)
+ return -ENOSYS;
+
+ list_for_each_entry(ioevent, &used_ioevents, list) {
+ if (ioevent->io_addr == addr) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (found == 0 || ioevent == NULL)
+ return -ENOENT;
+
+ kvm_ioevent = (struct kvm_ioeventfd) {
+ .addr = ioevent->io_addr,
+ .len = ioevent->io_len,
+ .datamatch = ioevent->datamatch,
+ .flags = KVM_IOEVENTFD_FLAG_PIO
+ | KVM_IOEVENTFD_FLAG_DEASSIGN
+ | KVM_IOEVENTFD_FLAG_DATAMATCH,
+ };
+
+ ioctl(ioevent->fn_kvm->vm_fd, KVM_IOEVENTFD, &kvm_ioevent);
+
+ epoll_ctl(epoll_fd, EPOLL_CTL_DEL, ioevent->fd, NULL);
+
+ list_del(&ioevent->list);
+
+ close(ioevent->fd);
+ free(ioevent);
+
+ return 0;
+}
diff --git a/tools/kvm/ioport.c b/tools/kvm/ioport.c
new file mode 100644
index 0000000..5bfb7e2
--- /dev/null
+++ b/tools/kvm/ioport.c
@@ -0,0 +1,227 @@
+#include "kvm/ioport.h"
+
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+#include "kvm/brlock.h"
+#include "kvm/rbtree-interval.h"
+#include "kvm/mutex.h"
+
+#include <linux/kvm.h> /* for KVM_EXIT_* */
+#include <linux/types.h>
+
+#include <stdbool.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define ioport_node(n) rb_entry(n, struct ioport, node)
+
+DEFINE_MUTEX(ioport_mutex);
+
+static u16 free_io_port_idx; /* protected by ioport_mutex */
+
+static struct rb_root ioport_tree = RB_ROOT;
+
+static u16 ioport__find_free_port(void)
+{
+ u16 free_port;
+
+ mutex_lock(&ioport_mutex);
+ free_port = IOPORT_START + free_io_port_idx * IOPORT_SIZE;
+ free_io_port_idx++;
+ mutex_unlock(&ioport_mutex);
+
+ return free_port;
+}
+
+static struct ioport *ioport_search(struct rb_root *root, u64 addr)
+{
+ struct rb_int_node *node;
+
+ node = rb_int_search_single(root, addr);
+ if (node == NULL)
+ return NULL;
+
+ return ioport_node(node);
+}
+
+static int ioport_insert(struct rb_root *root, struct ioport *data)
+{
+ return rb_int_insert(root, &data->node);
+}
+
+static void ioport_remove(struct rb_root *root, struct ioport *data)
+{
+ rb_int_erase(root, &data->node);
+}
+
+#ifdef CONFIG_HAS_LIBFDT
+static void generate_ioport_fdt_node(void *fdt,
+ struct device_header *dev_hdr,
+ void (*generate_irq_prop)(void *fdt,
+ u8 irq))
+{
+ struct ioport *ioport = container_of(dev_hdr, struct ioport, dev_hdr);
+ struct ioport_operations *ops = ioport->ops;
+
+ if (ops->generate_fdt_node)
+ ops->generate_fdt_node(ioport, fdt, generate_irq_prop);
+}
+#else
+static void generate_ioport_fdt_node(void *fdt,
+ struct device_header *dev_hdr,
+ void (*generate_irq_prop)(void *fdt,
+ u8 irq))
+{
+ die("Unable to generate device tree nodes without libfdt\n");
+}
+#endif
+
+int ioport__register(struct kvm *kvm, u16 port, struct ioport_operations *ops, int count, void *param)
+{
+ struct ioport *entry;
+ int r;
+
+ br_write_lock(kvm);
+ if (port == IOPORT_EMPTY)
+ port = ioport__find_free_port();
+
+ entry = ioport_search(&ioport_tree, port);
+ if (entry) {
+ pr_warning("ioport re-registered: %x", port);
+ rb_int_erase(&ioport_tree, &entry->node);
+ }
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL)
+ return -ENOMEM;
+
+ *entry = (struct ioport) {
+ .node = RB_INT_INIT(port, port + count),
+ .ops = ops,
+ .priv = param,
+ .dev_hdr = (struct device_header) {
+ .bus_type = DEVICE_BUS_IOPORT,
+ .data = generate_ioport_fdt_node,
+ },
+ };
+
+ r = ioport_insert(&ioport_tree, entry);
+ if (r < 0) {
+ free(entry);
+ br_write_unlock(kvm);
+ return r;
+ }
+
+ device__register(&entry->dev_hdr);
+ br_write_unlock(kvm);
+
+ return port;
+}
+
+int ioport__unregister(struct kvm *kvm, u16 port)
+{
+ struct ioport *entry;
+ int r;
+
+ br_write_lock(kvm);
+
+ r = -ENOENT;
+ entry = ioport_search(&ioport_tree, port);
+ if (!entry)
+ goto done;
+
+ ioport_remove(&ioport_tree, entry);
+
+ free(entry);
+
+ r = 0;
+
+done:
+ br_write_unlock(kvm);
+
+ return r;
+}
+
+static void ioport__unregister_all(void)
+{
+ struct ioport *entry;
+ struct rb_node *rb;
+ struct rb_int_node *rb_node;
+
+ rb = rb_first(&ioport_tree);
+ while (rb) {
+ rb_node = rb_int(rb);
+ entry = ioport_node(rb_node);
+ ioport_remove(&ioport_tree, entry);
+ free(entry);
+ rb = rb_first(&ioport_tree);
+ }
+}
+
+static const char *to_direction(int direction)
+{
+ if (direction == KVM_EXIT_IO_IN)
+ return "IN";
+ else
+ return "OUT";
+}
+
+static void ioport_error(u16 port, void *data, int direction, int size, u32 count)
+{
+ fprintf(stderr, "IO error: %s port=%x, size=%d, count=%u\n", to_direction(direction), port, size, count);
+}
+
+bool kvm__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count)
+{
+ struct ioport_operations *ops;
+ bool ret = false;
+ struct ioport *entry;
+ void *ptr = data;
+ struct kvm *kvm = vcpu->kvm;
+
+ br_read_lock();
+ entry = ioport_search(&ioport_tree, port);
+ if (!entry)
+ goto error;
+
+ ops = entry->ops;
+
+ while (count--) {
+ if (direction == KVM_EXIT_IO_IN && ops->io_in)
+ ret = ops->io_in(entry, vcpu, port, ptr, size);
+ else if (ops->io_out)
+ ret = ops->io_out(entry, vcpu, port, ptr, size);
+
+ ptr += size;
+ }
+
+ br_read_unlock();
+
+ if (!ret)
+ goto error;
+
+ return true;
+error:
+ br_read_unlock();
+
+ if (kvm->cfg.ioport_debug)
+ ioport_error(port, data, direction, size, count);
+
+ return !kvm->cfg.ioport_debug;
+}
+
+int ioport__init(struct kvm *kvm)
+{
+ ioport__setup_arch(kvm);
+
+ return 0;
+}
+dev_base_init(ioport__init);
+
+int ioport__exit(struct kvm *kvm)
+{
+ ioport__unregister_all();
+ return 0;
+}
+dev_base_exit(ioport__exit);
diff --git a/tools/kvm/irq.c b/tools/kvm/irq.c
new file mode 100644
index 0000000..33ea8d2
--- /dev/null
+++ b/tools/kvm/irq.c
@@ -0,0 +1,9 @@
+#include "kvm/irq.h"
+#include "kvm/kvm-arch.h"
+
+static u8 next_line = KVM_IRQ_OFFSET; /* next free IRQ line; u8 means allocation silently wraps past 255 — NOTE(review): no exhaustion check */
+
+int irq__alloc_line(void)
+{ /* Hand out guest IRQ lines sequentially, starting at the arch's KVM_IRQ_OFFSET. Not thread-safe. */
+ return next_line++;
+}
diff --git a/tools/kvm/kvm-cmd.c b/tools/kvm/kvm-cmd.c
new file mode 100644
index 0000000..2520b08
--- /dev/null
+++ b/tools/kvm/kvm-cmd.c
@@ -0,0 +1,91 @@
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+/* user defined header files */
+#include "kvm/builtin-debug.h"
+#include "kvm/builtin-pause.h"
+#include "kvm/builtin-resume.h"
+#include "kvm/builtin-balloon.h"
+#include "kvm/builtin-list.h"
+#include "kvm/builtin-version.h"
+#include "kvm/builtin-setup.h"
+#include "kvm/builtin-stop.h"
+#include "kvm/builtin-stat.h"
+#include "kvm/builtin-help.h"
+#include "kvm/builtin-sandbox.h"
+#include "kvm/kvm-cmd.h"
+#include "kvm/builtin-run.h"
+#include "kvm/util.h"
+
+struct cmd_struct kvm_commands[] = { /* sub-command dispatch table; terminated by an all-NULL sentinel entry */
+ { "pause", kvm_cmd_pause, kvm_pause_help, 0 },
+ { "resume", kvm_cmd_resume, kvm_resume_help, 0 },
+ { "debug", kvm_cmd_debug, kvm_debug_help, 0 },
+ { "balloon", kvm_cmd_balloon, kvm_balloon_help, 0 },
+ { "list", kvm_cmd_list, kvm_list_help, 0 },
+ { "version", kvm_cmd_version, NULL, 0 },
+ { "--version", kvm_cmd_version, NULL, 0 },
+ { "stop", kvm_cmd_stop, kvm_stop_help, 0 },
+ { "stat", kvm_cmd_stat, kvm_stat_help, 0 },
+ { "help", kvm_cmd_help, NULL, 0 },
+ { "setup", kvm_cmd_setup, kvm_setup_help, 0 },
+ { "run", kvm_cmd_run, kvm_run_help, 0 },
+ { "sandbox", kvm_cmd_sandbox, kvm_run_help, 0 }, /* NOTE(review): reuses kvm_run_help — confirm sandbox intentionally shares run's help text */
+ { NULL, NULL, NULL, 0 },
+};
+
+/*
+ * kvm_get_command: Searches the command in an array of the commands and
+ * returns a pointer to cmd_struct if a match is found.
+ *
+ * Input parameters:
+ * command: Array of possible commands. The last entry in the array must be
+ * NULL.
+ * cmd: A string command to search in the array
+ *
+ * Return Value:
+ * NULL: If the cmd is not matched with any of the command in the command array
+ * p: Pointer to cmd_struct of the matching command
+ */
+struct cmd_struct *kvm_get_command(struct cmd_struct *command,
+ const char *cmd)
+{
+ struct cmd_struct *p = command;
+
+ while (p->cmd) { /* linear scan until the NULL sentinel */
+ if (!strcmp(p->cmd, cmd))
+ return p;
+ p++;
+ }
+ return NULL;
+}
+
+int handle_command(struct cmd_struct *command, int argc, const char **argv)
+{ /* Top-level dispatcher: run argv[0]'s handler, or fall back to "help" when absent/unknown. */
+ struct cmd_struct *p;
+ const char *prefix = NULL;
+ int ret = 0;
+
+ if (!argv || !*argv) { /* no sub-command given: show help */
+ p = kvm_get_command(command, "help");
+ BUG_ON(!p);
+ return p->fn(argc, argv, prefix);
+ }
+
+ p = kvm_get_command(command, argv[0]);
+ if (!p) { /* unknown sub-command: print help, report invalid argument */
+ p = kvm_get_command(command, "help");
+ BUG_ON(!p);
+ p->fn(0, NULL, prefix);
+ return EINVAL; /* NOTE(review): positive errno here, while handlers use negative codes — verify callers */
+ }
+
+ ret = p->fn(argc - 1, &argv[1], prefix); /* strip the sub-command name before delegating */
+ if (ret < 0) {
+ if (errno == EPERM)
+ die("Permission error - are you root?");
+ }
+
+ return ret;
+}
diff --git a/tools/kvm/kvm-cpu.c b/tools/kvm/kvm-cpu.c
new file mode 100644
index 0000000..ee0a8ec
--- /dev/null
+++ b/tools/kvm/kvm-cpu.c
@@ -0,0 +1,248 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/symbol.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/virtio.h"
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+
+extern __thread struct kvm_cpu *current_kvm_cpu; /* per-thread pointer to the VCPU this thread runs */
+
+int __attribute__((weak)) kvm_cpu__get_endianness(struct kvm_cpu *vcpu)
+{ /* Weak default: assume guest endianness equals host; arch code may override. */
+ return VIRTIO_ENDIAN_HOST;
+}
+
+void kvm_cpu__enable_singlestep(struct kvm_cpu *vcpu)
+{ /* Turn on KVM guest single-stepping for this VCPU; failure is only warned about. */
+ struct kvm_guest_debug debug = {
+ .control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP,
+ };
+
+ if (ioctl(vcpu->vcpu_fd, KVM_SET_GUEST_DEBUG, &debug) < 0)
+ pr_warning("KVM_SET_GUEST_DEBUG failed");
+}
+
+void kvm_cpu__run(struct kvm_cpu *vcpu)
+{ /* Enter the guest once. EINTR/EAGAIN are expected (signal-driven wakeups), anything else is fatal. */
+ int err;
+
+ if (!vcpu->is_running)
+ return;
+
+ err = ioctl(vcpu->vcpu_fd, KVM_RUN, 0);
+ if (err < 0 && (errno != EINTR && errno != EAGAIN))
+ die_perror("KVM_RUN failed");
+}
+
+static void kvm_cpu_signal_handler(int signum)
+{ /* Async-signal control plane: SIGKVMEXIT stops the VCPU loop, SIGKVMPAUSE requests a pause. */
+ if (signum == SIGKVMEXIT) {
+ if (current_kvm_cpu && current_kvm_cpu->is_running) {
+ current_kvm_cpu->is_running = false;
+ kvm__continue(current_kvm_cpu->kvm);
+ }
+ } else if (signum == SIGKVMPAUSE) {
+ current_kvm_cpu->paused = 1; /* FIXME: unlike the EXIT branch, no NULL check on current_kvm_cpu */
+ }
+}
+
+static void kvm_cpu__handle_coalesced_mmio(struct kvm_cpu *cpu)
+{ /* Drain the kernel's coalesced-MMIO ring, replaying each buffered write as a normal MMIO emulation. */
+ if (cpu->ring) {
+ while (cpu->ring->first != cpu->ring->last) {
+ struct kvm_coalesced_mmio *m;
+ m = &cpu->ring->coalesced_mmio[cpu->ring->first];
+ kvm_cpu__emulate_mmio(cpu,
+ m->phys_addr,
+ m->data,
+ m->len,
+ 1);
+ cpu->ring->first = (cpu->ring->first + 1) % KVM_COALESCED_MMIO_MAX;
+ }
+ }
+}
+
+void kvm_cpu__reboot(struct kvm *kvm)
+{ /* Ask every VCPU thread to exit its run loop by signalling SIGKVMEXIT. */
+ int i;
+
+ /* The kvm->cpus array contains a null pointer in the last location */
+ for (i = 0; ; i++) {
+ if (kvm->cpus[i])
+ pthread_kill(kvm->cpus[i]->thread, SIGKVMEXIT);
+ else
+ break;
+ }
+}
+
+int kvm_cpu__start(struct kvm_cpu *cpu)
+{ /* Per-VCPU thread body: set up signals, reset the VCPU, then loop on KVM_RUN handling each exit reason. */
+ sigset_t sigset;
+
+ sigemptyset(&sigset);
+ sigaddset(&sigset, SIGALRM); /* SIGALRM is delivered to another thread (timer owner) — TODO confirm */
+
+ pthread_sigmask(SIG_BLOCK, &sigset, NULL);
+
+ signal(SIGKVMEXIT, kvm_cpu_signal_handler);
+ signal(SIGKVMPAUSE, kvm_cpu_signal_handler);
+
+ kvm_cpu__reset_vcpu(cpu);
+
+ if (cpu->kvm->cfg.single_step)
+ kvm_cpu__enable_singlestep(cpu);
+
+ while (cpu->is_running) {
+ if (cpu->paused) { /* pause protocol: ack via kvm__notify_paused(), which blocks on pause_lock */
+ kvm__notify_paused();
+ cpu->paused = 0;
+ }
+
+ if (cpu->needs_nmi) { /* NMI requested by the IPC debug command */
+ kvm_cpu__arch_nmi(cpu);
+ cpu->needs_nmi = 0;
+ }
+
+ kvm_cpu__run(cpu);
+
+ switch (cpu->kvm_run->exit_reason) {
+ case KVM_EXIT_UNKNOWN:
+ break;
+ case KVM_EXIT_DEBUG:
+ kvm_cpu__show_registers(cpu);
+ kvm_cpu__show_code(cpu);
+ break;
+ case KVM_EXIT_IO: {
+ bool ret;
+
+ ret = kvm_cpu__emulate_io(cpu,
+ cpu->kvm_run->io.port,
+ (u8 *)cpu->kvm_run +
+ cpu->kvm_run->io.data_offset,
+ cpu->kvm_run->io.direction,
+ cpu->kvm_run->io.size,
+ cpu->kvm_run->io.count);
+
+ if (!ret)
+ goto panic_kvm;
+ break;
+ }
+ case KVM_EXIT_MMIO: {
+ bool ret;
+
+ /*
+ * If we had MMIO exit, coalesced ring should be processed
+ * *before* processing the exit itself
+ */
+ kvm_cpu__handle_coalesced_mmio(cpu);
+
+ ret = kvm_cpu__emulate_mmio(cpu,
+ cpu->kvm_run->mmio.phys_addr,
+ cpu->kvm_run->mmio.data,
+ cpu->kvm_run->mmio.len,
+ cpu->kvm_run->mmio.is_write);
+
+ if (!ret)
+ goto panic_kvm;
+ break;
+ }
+ case KVM_EXIT_INTR:
+ if (cpu->is_running)
+ break;
+ goto exit_kvm; /* interrupted by SIGKVMEXIT: leave the loop cleanly */
+ case KVM_EXIT_SHUTDOWN:
+ goto exit_kvm;
+ default: {
+ bool ret;
+
+ ret = kvm_cpu__handle_exit(cpu); /* arch-specific exit reasons */
+ if (!ret)
+ goto panic_kvm;
+ break;
+ }
+ }
+ kvm_cpu__handle_coalesced_mmio(cpu);
+ }
+
+exit_kvm:
+ return 0;
+
+panic_kvm:
+ return 1; /* caller treats non-zero as a guest panic */
+}
+
+int kvm_cpu__init(struct kvm *kvm)
+{ /* Allocate and arch-init all VCPUs; clamps nrcpus to the kernel's hard limit, warns past the soft one. */
+ int max_cpus, recommended_cpus, i;
+
+ max_cpus = kvm__max_cpus(kvm);
+ recommended_cpus = kvm__recommended_cpus(kvm);
+
+ if (kvm->cfg.nrcpus > max_cpus) {
+ printf(" # Limit the number of CPUs to %d\n", max_cpus);
+ kvm->cfg.nrcpus = max_cpus;
+ } else if (kvm->cfg.nrcpus > recommended_cpus) {
+ printf(" # Warning: The maximum recommended amount of VCPUs"
+ " is %d\n", recommended_cpus);
+ }
+
+ kvm->nrcpus = kvm->cfg.nrcpus;
+
+ /* Alloc one pointer too many, so array ends up 0-terminated */
+ kvm->cpus = calloc(kvm->nrcpus + 1, sizeof(void *));
+ if (!kvm->cpus) {
+ pr_warning("Couldn't allocate array for %d CPUs", kvm->nrcpus);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < kvm->nrcpus; i++) {
+ kvm->cpus[i] = kvm_cpu__arch_init(kvm, i);
+ if (!kvm->cpus[i]) {
+ pr_warning("unable to initialize KVM VCPU");
+ goto fail_alloc;
+ }
+ }
+
+ return 0;
+
+fail_alloc:
+ for (i = 0; i < kvm->nrcpus; i++)
+ free(kvm->cpus[i]); /* NOTE(review): frees the vcpu structs but leaks the kvm->cpus array itself */
+ return -ENOMEM;
+}
+base_init(kvm_cpu__init);
+
+int kvm_cpu__exit(struct kvm *kvm)
+{ /* Tear down VCPUs: delete cpu 0 in-place, signal+join the rest; frees the cpus array. */
+ int i, r; /* FIXME: r is only assigned inside the loop — returned uninitialized when nrcpus <= 1 */
+ void *ret = NULL;
+
+ kvm_cpu__delete(kvm->cpus[0]);
+ kvm->cpus[0] = NULL;
+
+ for (i = 1; i < kvm->nrcpus; i++) {
+ if (kvm->cpus[i]->is_running) {
+ pthread_kill(kvm->cpus[i]->thread, SIGKVMEXIT);
+ if (pthread_join(kvm->cpus[i]->thread, &ret) != 0)
+ die("pthread_join");
+ kvm_cpu__delete(kvm->cpus[i]); /* NOTE(review): non-running VCPUs are never deleted — possible leak */
+ }
+ if (ret == NULL)
+ r = 0;
+ }
+
+ free(kvm->cpus);
+
+ kvm->nrcpus = 0;
+
+ return r;
+}
+late_exit(kvm_cpu__exit);
diff --git a/tools/kvm/kvm-ipc.c b/tools/kvm/kvm-ipc.c
new file mode 100644
index 0000000..b1c43dd
--- /dev/null
+++ b/tools/kvm/kvm-ipc.c
@@ -0,0 +1,533 @@
+#include <sys/epoll.h>
+#include <sys/un.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/eventfd.h>
+#include <dirent.h>
+
+#include "kvm/kvm-ipc.h"
+#include "kvm/rwsem.h"
+#include "kvm/read-write.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/builtin-debug.h"
+#include "kvm/strbuf.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/8250-serial.h"
+
+struct kvm_ipc_head { /* wire header preceding every IPC message on the control socket */
+ u32 type;
+ u32 len;
+};
+
+#define KVM_IPC_MAX_MSGS 16 /* number of distinct IPC message types (size of msgs[]) */
+
+#define KVM_SOCK_SUFFIX ".sock"
+#define KVM_SOCK_SUFFIX_LEN ((ssize_t)sizeof(KVM_SOCK_SUFFIX) - 1)
+
+extern __thread struct kvm_cpu *current_kvm_cpu;
+static void (*msgs[KVM_IPC_MAX_MSGS])(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg); /* handler table, indexed by message type */
+static DECLARE_RWSEM(msgs_rwlock); /* guards msgs[]: writers register handlers, readers dispatch */
+static int epoll_fd, server_fd, stop_fd;
+static pthread_t thread;
+
+static int kvm__create_socket(struct kvm *kvm)
+{ /* Create and listen on the per-guest UNIX control socket under kvm__get_dir(). */
+ char full_name[PATH_MAX];
+ unsigned int s; /* FIXME: unsigned — the (s < 0) error check below can never fire on socket() failure */
+ struct sockaddr_un local;
+ int len, r;
+
+ /* This usually 108 bytes long */
+ BUILD_BUG_ON(sizeof(local.sun_path) < 32);
+
+ snprintf(full_name, sizeof(full_name), "%s/%s%s",
+ kvm__get_dir(), kvm->cfg.guest_name, KVM_SOCK_SUFFIX);
+ if (access(full_name, F_OK) == 0) {
+ pr_err("Socket file %s already exist", full_name);
+ return -EEXIST;
+ }
+
+ s = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (s < 0) {
+ perror("socket");
+ return s;
+ }
+
+ local.sun_family = AF_UNIX;
+ strlcpy(local.sun_path, full_name, sizeof(local.sun_path));
+ len = strlen(local.sun_path) + sizeof(local.sun_family);
+ r = bind(s, (struct sockaddr *)&local, len);
+ if (r < 0) {
+ perror("bind");
+ goto fail;
+ }
+
+ r = listen(s, 5);
+ if (r < 0) {
+ perror("listen");
+ goto fail;
+ }
+
+ return s;
+
+fail:
+ close(s);
+ return r;
+}
+
+void kvm__remove_socket(const char *name)
+{ /* Unlink the named guest's control-socket file. */
+ char full_name[PATH_MAX];
+
+ snprintf(full_name, sizeof(full_name), "%s/%s%s",
+ kvm__get_dir(), name, KVM_SOCK_SUFFIX);
+ unlink(full_name);
+}
+
+int kvm__get_sock_by_instance(const char *name)
+{ /* Connect to a running instance's control socket; returns the fd or a negative errno-style value. */
+ int s, len, r;
+ char sock_file[PATH_MAX];
+ struct sockaddr_un local;
+
+ snprintf(sock_file, sizeof(sock_file), "%s/%s%s",
+ kvm__get_dir(), name, KVM_SOCK_SUFFIX);
+ s = socket(AF_UNIX, SOCK_STREAM, 0); /* NOTE(review): socket() result is unchecked before connect() */
+
+ local.sun_family = AF_UNIX;
+ strlcpy(local.sun_path, sock_file, sizeof(local.sun_path));
+ len = strlen(local.sun_path) + sizeof(local.sun_family);
+
+ r = connect(s, &local, len); /* NOTE(review): missing (struct sockaddr *) cast — relies on compiler leniency */
+ if (r < 0 && errno == ECONNREFUSED) {
+ /* Tell the user clean ghost socket file */
+ pr_err("\"%s\" could be a ghost socket file, please remove it",
+ sock_file);
+ return r;
+ } else if (r < 0) {
+ return r;
+ }
+
+ return s;
+}
+
+static bool is_socket(const char *base_path, const struct dirent *dent)
+{ /* True if the directory entry is a UNIX socket; stat()s when d_type is unavailable (DT_UNKNOWN). */
+ switch (dent->d_type) {
+ case DT_SOCK:
+ return true;
+
+ case DT_UNKNOWN: {
+ char path[PATH_MAX];
+ struct stat st;
+
+ sprintf(path, "%s/%s", base_path, dent->d_name); /* NOTE(review): unbounded sprintf — snprintf would be safer */
+ if (stat(path, &st))
+ return false;
+
+ return S_ISSOCK(st.st_mode);
+ }
+ default:
+ return false;
+ }
+}
+
+int kvm__enumerate_instances(int (*callback)(const char *name, int fd))
+{ /* Invoke 'callback' once per live instance found in the socket directory; stops on a negative return. */
+ int sock;
+ DIR *dir;
+ struct dirent entry, *result;
+ int ret = 0;
+ const char *path;
+
+ path = kvm__get_dir();
+
+ dir = opendir(path);
+ if (!dir)
+ return -errno;
+
+ for (;;) {
+ readdir_r(dir, &entry, &result); /* NOTE(review): readdir_r is deprecated and its return value is ignored */
+ if (result == NULL)
+ break;
+ if (is_socket(path, &entry)) {
+ ssize_t name_len = strlen(entry.d_name);
+ char *p;
+
+ if (name_len <= KVM_SOCK_SUFFIX_LEN)
+ continue;
+
+ p = &entry.d_name[name_len - KVM_SOCK_SUFFIX_LEN];
+ if (memcmp(KVM_SOCK_SUFFIX, p, KVM_SOCK_SUFFIX_LEN))
+ continue;
+
+ *p = 0; /* strip ".sock" in place to recover the instance name */
+ sock = kvm__get_sock_by_instance(entry.d_name);
+ if (sock < 0)
+ continue;
+ ret = callback(entry.d_name, sock);
+ close(sock);
+ if (ret < 0)
+ break;
+ }
+ }
+
+ closedir(dir);
+
+ return ret;
+}
+
+int kvm_ipc__register_handler(u32 type, void (*cb)(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg))
+{ /* Install (or replace) the handler for one IPC message type. */
+ if (type >= KVM_IPC_MAX_MSGS)
+ return -ENOSPC;
+
+ down_write(&msgs_rwlock);
+ msgs[type] = cb;
+ up_write(&msgs_rwlock);
+
+ return 0;
+}
+
+int kvm_ipc__send(int fd, u32 type)
+{ /* Send a header-only (zero-payload) IPC message. */
+ struct kvm_ipc_head head = {.type = type, .len = 0,};
+
+ if (write_in_full(fd, &head, sizeof(head)) < 0)
+ return -1;
+
+ return 0;
+}
+
+int kvm_ipc__send_msg(int fd, u32 type, u32 len, u8 *msg)
+{ /* Send a header followed by a 'len'-byte payload. */
+ struct kvm_ipc_head head = {.type = type, .len = len,};
+
+ if (write_in_full(fd, &head, sizeof(head)) < 0)
+ return -1;
+
+ if (write_in_full(fd, msg, len) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int kvm_ipc__handle(struct kvm *kvm, int fd, u32 type, u32 len, u8 *data)
+{ /* Look up and invoke the registered handler for a received message. */
+ void (*cb)(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg);
+
+ if (type >= KVM_IPC_MAX_MSGS)
+ return -ENOSPC;
+
+ down_read(&msgs_rwlock);
+ cb = msgs[type];
+ up_read(&msgs_rwlock);
+
+ if (cb == NULL) {
+ pr_warning("No device handles type %u\n", type);
+ return -ENODEV;
+ }
+
+ cb(kvm, fd, type, len, data);
+
+ return 0;
+}
+
+static int kvm_ipc__new_conn(int fd)
+{ /* Accept a client on the listening socket and watch it via epoll; returns the client fd or -1. */
+ int client;
+ struct epoll_event ev;
+
+ client = accept(fd, NULL, NULL);
+ if (client < 0)
+ return -1;
+
+ ev.events = EPOLLIN | EPOLLRDHUP;
+ ev.data.fd = client;
+ if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, client, &ev) < 0) {
+ close(client);
+ return -1;
+ }
+
+ return client;
+}
+
+static void kvm_ipc__close_conn(int fd)
+{ /* Drop a client: remove from epoll and close. */
+ epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, NULL);
+ close(fd);
+}
+
+static int kvm_ipc__receive(struct kvm *kvm, int fd)
+{ /* Read one header+payload message from 'fd' and dispatch it; -1 on short read or alloc failure. */
+ struct kvm_ipc_head head;
+ u8 *msg = NULL;
+ u32 n;
+
+ n = read(fd, &head, sizeof(head));
+ if (n != sizeof(head))
+ goto done;
+
+ msg = malloc(head.len); /* NOTE(review): len comes from the peer, unvalidated — consider capping */
+ if (msg == NULL)
+ goto done;
+
+ n = read_in_full(fd, msg, head.len);
+ if (n != head.len)
+ goto done;
+
+ kvm_ipc__handle(kvm, fd, head.type, head.len, msg); /* NOTE(review): msg appears to leak on this success path — handlers don't own it. TODO confirm */
+
+ return 0;
+
+done:
+ free(msg);
+ return -1;
+}
+
+static void *kvm_ipc__thread(void *param)
+{ /* IPC event loop: accepts clients, reads commands, exits when the stop eventfd fires. */
+ struct epoll_event event;
+ struct kvm *kvm = param;
+
+ kvm__set_thread_name("kvm-ipc");
+
+ for (;;) {
+ int nfds;
+
+ nfds = epoll_wait(epoll_fd, &event, 1, -1);
+ if (nfds > 0) {
+ int fd = event.data.fd;
+
+ if (fd == stop_fd && event.events & EPOLLIN) {
+ break;
+ } else if (fd == server_fd) {
+ int client, r;
+
+ client = kvm_ipc__new_conn(fd); /* NOTE(review): client may be -1; the loop below then just fails out of read() */
+ /*
+ * Handle multiple IPC cmd at a time
+ */
+ do {
+ r = kvm_ipc__receive(kvm, client);
+ } while (r == 0);
+
+ } else if (event.events & (EPOLLERR | EPOLLRDHUP | EPOLLHUP)) {
+ kvm_ipc__close_conn(fd);
+ } else {
+ kvm_ipc__receive(kvm, fd);
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static void kvm__pid(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{ /* KVM_IPC_PID handler: reply with this process's pid. */
+ pid_t pid = getpid();
+ int r = 0;
+
+ if (type == KVM_IPC_PID)
+ r = write(fd, &pid, sizeof(pid));
+
+ if (r < 0)
+ pr_warning("Failed sending PID");
+}
+
+static void handle_stop(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{ /* KVM_IPC_STOP handler: signal every VCPU thread to exit. */
+ if (WARN_ON(type != KVM_IPC_STOP || len))
+ return;
+
+ kvm_cpu__reboot(kvm);
+}
+
+/* Pause/resume the guest using SIGUSR2 */
+static int is_paused; /* tracks pause state so PAUSE/RESUME are idempotent */
+
+static void handle_pause(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{ /* Shared handler for KVM_IPC_PAUSE and KVM_IPC_RESUME; no-ops if already in the requested state. */
+ if (WARN_ON(len))
+ return;
+
+ if (type == KVM_IPC_RESUME && is_paused) {
+ kvm->vm_state = KVM_VMSTATE_RUNNING;
+ kvm__continue(kvm);
+ } else if (type == KVM_IPC_PAUSE && !is_paused) {
+ kvm->vm_state = KVM_VMSTATE_PAUSED;
+ ioctl(kvm->vm_fd, KVM_KVMCLOCK_CTRL); /* NOTE(review): result ignored — kvmclock pause hint is best-effort */
+ kvm__pause(kvm);
+ } else {
+ return;
+ }
+
+ is_paused = !is_paused;
+}
+
+static void handle_vmstate(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{ /* KVM_IPC_VMSTATE handler: reply with the current vm_state value. */
+ int r = 0;
+
+ if (type == KVM_IPC_VMSTATE)
+ r = write(fd, &kvm->vm_state, sizeof(kvm->vm_state));
+
+ if (r < 0)
+ pr_warning("Failed sending VMSTATE");
+}
+
+/*
+ * Serialize debug printout so that the output of multiple vcpus does not
+ * get mixed up:
+ */
+static int printout_done;
+
+static void handle_sigusr1(int sig)
+{ /* Runs on a VCPU thread: dump that VCPU's state, then release the waiter in handle_debug(). */
+ struct kvm_cpu *cpu = current_kvm_cpu;
+ int fd = kvm_cpu__get_debug_fd();
+
+ if (!cpu || cpu->needs_nmi) /* needs_nmi means this SIGUSR1 was an NMI kick, not a dump request */
+ return;
+
+ dprintf(fd, "\n #\n # vCPU #%ld's dump:\n #\n", cpu->cpu_id);
+ kvm_cpu__show_registers(cpu);
+ kvm_cpu__show_code(cpu);
+ kvm_cpu__show_page_tables(cpu);
+ fflush(stdout);
+ printout_done = 1;
+}
+
+static void handle_debug(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{ /* KVM_IPC_DEBUG handler: inject sysrq, kick an NMI on one VCPU, and/or dump every VCPU's state. */
+ int i;
+ struct debug_cmd_params *params;
+ u32 dbg_type;
+ u32 vcpu;
+
+ if (WARN_ON(type != KVM_IPC_DEBUG || len != sizeof(*params)))
+ return;
+
+ params = (void *)msg;
+ dbg_type = params->dbg_type;
+ vcpu = params->cpu;
+
+ if (dbg_type & KVM_DEBUG_CMD_TYPE_SYSRQ)
+ serial8250__inject_sysrq(kvm, params->sysrq);
+
+ if (dbg_type & KVM_DEBUG_CMD_TYPE_NMI) {
+ if ((int)vcpu >= kvm->nrcpus) /* NOTE(review): a u32 >= 2^31 casts negative and passes this check — verify bound */
+ return;
+
+ kvm->cpus[vcpu]->needs_nmi = 1;
+ pthread_kill(kvm->cpus[vcpu]->thread, SIGUSR1);
+ }
+
+ if (!(dbg_type & KVM_DEBUG_CMD_TYPE_DUMP))
+ return;
+
+ for (i = 0; i < kvm->nrcpus; i++) {
+ struct kvm_cpu *cpu = kvm->cpus[i];
+
+ if (!cpu)
+ continue;
+
+ printout_done = 0;
+
+ kvm_cpu__set_debug_fd(fd);
+ pthread_kill(cpu->thread, SIGUSR1);
+ /*
+ * Wait for the vCPU to dump state before signalling
+ * the next thread. Since this is debug code it does
+ * not matter that we are burning CPU time a bit:
+ */
+ while (!printout_done)
+ sleep(0);
+ }
+
+ close(fd);
+
+ serial8250__inject_sysrq(kvm, 'p');
+}
+
+int kvm_ipc__init(struct kvm *kvm)
+{ /* Bring up the IPC subsystem: control socket, epoll set, stop eventfd, worker thread, handlers. */
+ int ret;
+ int sock = kvm__create_socket(kvm);
+ struct epoll_event ev = {0};
+
+ server_fd = sock; /* FIXME: sock < 0 (socket creation failure) is never checked */
+
+ epoll_fd = epoll_create(KVM_IPC_MAX_MSGS);
+ if (epoll_fd < 0) {
+ perror("epoll_create");
+ ret = epoll_fd;
+ goto err;
+ }
+
+ ev.events = EPOLLIN | EPOLLET;
+ ev.data.fd = sock;
+ if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, sock, &ev) < 0) {
+ pr_err("Failed adding socket to epoll");
+ ret = -EFAULT;
+ goto err_epoll;
+ }
+
+ stop_fd = eventfd(0, 0); /* written by kvm_ipc__exit to wake and stop the worker */
+ if (stop_fd < 0) {
+ perror("eventfd");
+ ret = stop_fd;
+ goto err_epoll;
+ }
+
+ ev.events = EPOLLIN | EPOLLET;
+ ev.data.fd = stop_fd;
+ if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, stop_fd, &ev) < 0) {
+ pr_err("Failed adding stop event to epoll");
+ ret = -EFAULT;
+ goto err_stop;
+ }
+
+ if (pthread_create(&thread, NULL, kvm_ipc__thread, kvm) != 0) {
+ pr_err("Failed starting IPC thread");
+ ret = -EFAULT;
+ goto err_stop;
+ }
+
+ kvm_ipc__register_handler(KVM_IPC_PID, kvm__pid);
+ kvm_ipc__register_handler(KVM_IPC_DEBUG, handle_debug);
+ kvm_ipc__register_handler(KVM_IPC_PAUSE, handle_pause);
+ kvm_ipc__register_handler(KVM_IPC_RESUME, handle_pause);
+ kvm_ipc__register_handler(KVM_IPC_STOP, handle_stop);
+ kvm_ipc__register_handler(KVM_IPC_VMSTATE, handle_vmstate);
+ signal(SIGUSR1, handle_sigusr1);
+
+ return 0;
+
+err_stop:
+ close(stop_fd);
+err_epoll:
+ close(epoll_fd);
+err:
+ return ret;
+}
+base_init(kvm_ipc__init);
+
+int kvm_ipc__exit(struct kvm *kvm)
+{ /* Stop the IPC thread via the eventfd, close fds, remove the socket file. Thread is not joined — TODO confirm intended. */
+ u64 val = 1;
+ int ret;
+
+ ret = write(stop_fd, &val, sizeof(val));
+ if (ret < 0)
+ return ret;
+
+ close(server_fd);
+ close(epoll_fd);
+
+ kvm__remove_socket(kvm->cfg.guest_name);
+
+ return ret; /* NOTE: returns write()'s byte count (8), not 0, on success */
+}
+base_exit(kvm_ipc__exit);
diff --git a/tools/kvm/kvm.c b/tools/kvm/kvm.c
new file mode 100644
index 0000000..e1b9f6c
--- /dev/null
+++ b/tools/kvm/kvm.c
@@ -0,0 +1,481 @@
+#include "kvm/kvm.h"
+#include "kvm/read-write.h"
+#include "kvm/util.h"
+#include "kvm/strbuf.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/kvm-ipc.h"
+
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+#include <linux/list.h>
+#include <linux/err.h>
+
+#include <sys/un.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/eventfd.h>
+#include <asm/unistd.h>
+#include <dirent.h>
+
+#define DEFINE_KVM_EXIT_REASON(reason) [reason] = #reason /* designated initializer: index == reason code */
+
+const char *kvm_exit_reasons[] = { /* exit-reason code -> name, for diagnostics */
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_UNKNOWN),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_EXCEPTION),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_IO),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_HYPERCALL),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_DEBUG),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_HLT),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_MMIO),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_IRQ_WINDOW_OPEN),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_SHUTDOWN),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_FAIL_ENTRY),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTR),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_SET_TPR),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_TPR_ACCESS),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_SIEIC),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_S390_RESET),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_DCR),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_NMI),
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_INTERNAL_ERROR),
+#ifdef CONFIG_PPC64
+ DEFINE_KVM_EXIT_REASON(KVM_EXIT_PAPR_HCALL),
+#endif
+};
+
+static int pause_event; /* eventfd: VCPUs signal pause acknowledgement through it */
+static DEFINE_MUTEX(pause_lock); /* held while the VM is paused; VCPUs block on it in kvm__notify_paused() */
+extern struct kvm_ext kvm_req_ext[];
+
+static char kvm_dir[PATH_MAX];
+
+static int set_dir(const char *fmt, va_list args)
+{ /* Resolve (and create) the runtime directory, storing its realpath + "/" in kvm_dir. */
+ char tmp[PATH_MAX];
+
+ vsnprintf(tmp, sizeof(tmp), fmt, args);
+
+ mkdir(tmp, 0777); /* NOTE(review): result ignored — EEXIST expected, other failures surface via realpath below */
+
+ if (!realpath(tmp, kvm_dir))
+ return -errno;
+
+ strcat(kvm_dir, "/");
+
+ return 0;
+}
+
+void kvm__set_dir(const char *fmt, ...)
+{ /* printf-style wrapper around set_dir(); errors are silently dropped. */
+ va_list args;
+
+ va_start(args, fmt);
+ set_dir(fmt, args);
+ va_end(args);
+}
+
+const char *kvm__get_dir(void)
+{ /* Return the runtime directory set by kvm__set_dir() (empty string if never set). */
+ return kvm_dir;
+}
+
+bool kvm__supports_extension(struct kvm *kvm, unsigned int extension)
+{ /* Query KVM_CHECK_EXTENSION; negative ioctl results are treated as "unsupported". */
+ int ret;
+
+ ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, extension);
+ if (ret < 0)
+ return false;
+
+ return ret;
+}
+
+static int kvm__check_extensions(struct kvm *kvm)
+{ /* Verify every extension in the NULL-terminated kvm_req_ext[] table is available. */
+ int i;
+
+ for (i = 0; ; i++) {
+ if (!kvm_req_ext[i].name)
+ break;
+ if (!kvm__supports_extension(kvm, kvm_req_ext[i].code)) {
+ pr_err("Unsuppored KVM extension detected: %s",
+ kvm_req_ext[i].name);
+ return -i; /* FIXME: when i == 0 this returns 0, i.e. success, for a missing extension */
+ }
+ }
+
+ return 0;
+}
+
+struct kvm *kvm__new(void)
+{ /* Allocate a zeroed struct kvm with fds marked invalid; ERR_PTR(-ENOMEM) on allocation failure. */
+ struct kvm *kvm = calloc(1, sizeof(*kvm));
+ if (!kvm)
+ return ERR_PTR(-ENOMEM);
+
+ kvm->sys_fd = -1;
+ kvm->vm_fd = -1;
+
+ return kvm;
+}
+
+int kvm__exit(struct kvm *kvm)
+{ /* Release guest RAM, all registered memory banks, and the kvm object itself. */
+ struct kvm_mem_bank *bank, *tmp;
+
+ kvm__arch_delete_ram(kvm);
+
+ list_for_each_entry_safe(bank, tmp, &kvm->mem_banks, list) {
+ list_del(&bank->list);
+ free(bank);
+ }
+
+ free(kvm);
+ return 0;
+}
+core_exit(kvm__exit);
+
+/*
+ * Note: KVM_SET_USER_MEMORY_REGION assumes that we don't pass overlapping
+ * memory regions to it. Therefore, be careful if you use this function for
+ * registering memory regions for emulating hardware.
+ */
+int kvm__register_mem(struct kvm *kvm, u64 guest_phys, u64 size, void *userspace_addr)
+{
+ struct kvm_userspace_memory_region mem;
+ struct kvm_mem_bank *bank;
+ int ret;
+
+ bank = malloc(sizeof(*bank));
+ if (!bank)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&bank->list);
+ bank->guest_phys_addr = guest_phys;
+ bank->host_addr = userspace_addr;
+ bank->size = size;
+
+ mem = (struct kvm_userspace_memory_region) {
+ .slot = kvm->mem_slots++,
+ .guest_phys_addr = guest_phys,
+ .memory_size = size,
+ .userspace_addr = (unsigned long)userspace_addr,
+ };
+
+ ret = ioctl(kvm->vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
+ if (ret < 0)
+ return -errno; /* FIXME: leaks 'bank' on ioctl failure (and the slot number stays consumed) */
+
+ list_add(&bank->list, &kvm->mem_banks);
+ return 0;
+}
+
+void *guest_flat_to_host(struct kvm *kvm, u64 offset)
+{ /* Translate a guest physical address into the mmap'd host pointer; NULL (with a warning) if unmapped. */
+ struct kvm_mem_bank *bank;
+
+ list_for_each_entry(bank, &kvm->mem_banks, list) {
+ u64 bank_start = bank->guest_phys_addr;
+ u64 bank_end = bank_start + bank->size;
+
+ if (offset >= bank_start && offset < bank_end)
+ return bank->host_addr + (offset - bank_start);
+ }
+
+ pr_warning("unable to translate guest address 0x%llx to host",
+ (unsigned long long)offset);
+ return NULL;
+}
+
+u64 host_to_guest_flat(struct kvm *kvm, void *ptr)
+{ /* Inverse of guest_flat_to_host(). NOTE(review): 0 signals failure but is also a valid guest address. */
+ struct kvm_mem_bank *bank;
+
+ list_for_each_entry(bank, &kvm->mem_banks, list) {
+ void *bank_start = bank->host_addr;
+ void *bank_end = bank_start + bank->size;
+
+ if (ptr >= bank_start && ptr < bank_end)
+ return bank->guest_phys_addr + (ptr - bank_start);
+ }
+
+ pr_warning("unable to translate host address %p to guest", ptr);
+ return 0;
+}
+
+int kvm__recommended_cpus(struct kvm *kvm)
+{ /* Kernel's soft VCPU limit (KVM_CAP_NR_VCPUS), defaulting to 4 per api.txt. */
+ int ret;
+
+ ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
+ if (ret <= 0)
+ /*
+ * api.txt states that if KVM_CAP_NR_VCPUS does not exist,
+ * assume 4.
+ */
+ return 4;
+
+ return ret;
+}
+
+/*
+ * The following hack should be removed once 'x86: Raise the hard
+ * VCPU count limit' makes it's way into the mainline.
+ */
+#ifndef KVM_CAP_MAX_VCPUS
+#define KVM_CAP_MAX_VCPUS 66
+#endif
+
+int kvm__max_cpus(struct kvm *kvm)
+{ /* Kernel's hard VCPU limit, falling back to the recommended count when the cap is absent. */
+ int ret;
+
+ ret = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
+ if (ret <= 0)
+ ret = kvm__recommended_cpus(kvm);
+
+ return ret;
+}
+
+int kvm__init(struct kvm *kvm)
+{ /* Open /dev/kvm, create the VM, check required extensions, set up RAM, and load kernel or firmware. */
+ int ret;
+
+ if (!kvm__arch_cpu_supports_vm()) {
+ pr_err("Your CPU does not support hardware virtualization");
+ ret = -ENOSYS;
+ goto err;
+ }
+
+ kvm->sys_fd = open(kvm->cfg.dev, O_RDWR);
+ if (kvm->sys_fd < 0) {
+ if (errno == ENOENT)
+ pr_err("'%s' not found. Please make sure your kernel has CONFIG_KVM "
+ "enabled and that the KVM modules are loaded.", kvm->cfg.dev);
+ else if (errno == ENODEV)
+ pr_err("'%s' KVM driver not available.\n # (If the KVM "
+ "module is loaded then 'dmesg' may offer further clues "
+ "about the failure.)", kvm->cfg.dev);
+ else
+ pr_err("Could not open %s: ", kvm->cfg.dev);
+
+ ret = -errno;
+ goto err_free;
+ }
+
+ ret = ioctl(kvm->sys_fd, KVM_GET_API_VERSION, 0);
+ if (ret != KVM_API_VERSION) {
+ pr_err("KVM_API_VERSION ioctl");
+ ret = -errno;
+ goto err_sys_fd;
+ }
+
+ kvm->vm_fd = ioctl(kvm->sys_fd, KVM_CREATE_VM, KVM_VM_TYPE);
+ if (kvm->vm_fd < 0) {
+ pr_err("KVM_CREATE_VM ioctl");
+ ret = kvm->vm_fd;
+ goto err_sys_fd;
+ }
+
+ if (kvm__check_extensions(kvm)) {
+ pr_err("A required KVM extension is not supported by OS");
+ ret = -ENOSYS;
+ goto err_vm_fd;
+ }
+
+ kvm__arch_init(kvm, kvm->cfg.hugetlbfs_path, kvm->cfg.ram_size);
+
+ INIT_LIST_HEAD(&kvm->mem_banks);
+ kvm__init_ram(kvm);
+
+ if (!kvm->cfg.firmware_filename) {
+ if (!kvm__load_kernel(kvm, kvm->cfg.kernel_filename,
+ kvm->cfg.initrd_filename, kvm->cfg.real_cmdline))
+ die("unable to load kernel %s", kvm->cfg.kernel_filename);
+ }
+
+ if (kvm->cfg.firmware_filename) {
+ if (!kvm__load_firmware(kvm, kvm->cfg.firmware_filename))
+ die("unable to load firmware image %s: %s", kvm->cfg.firmware_filename, strerror(errno));
+ } else {
+ ret = kvm__arch_setup_firmware(kvm);
+ if (ret < 0)
+ die("kvm__arch_setup_firmware() failed with error %d\n", ret);
+ }
+
+ return 0;
+
+err_vm_fd:
+ close(kvm->vm_fd);
+err_sys_fd:
+ close(kvm->sys_fd);
+err_free:
+ free(kvm); /* NOTE(review): kvm__exit (core_exit) also frees kvm — verify it cannot run after a failed init */
+err:
+ return ret;
+}
+core_init(kvm__init);
+
+/* RFC 1952 */
+#define GZIP_ID1 0x1f
+#define GZIP_ID2 0x8b
+#define CPIO_MAGIC "0707"
+/* initrd may be gzipped, or a plain cpio */
+static bool initrd_check(int fd)
+{ /* Peek at the first 4 bytes for a gzip or cpio magic, rewinding the fd afterwards. */
+ unsigned char id[4];
+
+ if (read_in_full(fd, id, ARRAY_SIZE(id)) < 0)
+ return false;
+
+ if (lseek(fd, 0, SEEK_SET) < 0)
+ die_perror("lseek");
+
+ return (id[0] == GZIP_ID1 && id[1] == GZIP_ID2) ||
+ !memcmp(id, CPIO_MAGIC, 4);
+}
+
+int __attribute__((__weak__)) load_elf_binary(struct kvm *kvm, int fd_kernel,
+ int fd_initrd, const char *kernel_cmdline)
+{ /* Weak stub: arch code overrides. NOTE(review): returns int while load_bzimage returns bool — unify? */
+ return false;
+}
+
+bool __attribute__((__weak__)) load_bzimage(struct kvm *kvm, int fd_kernel,
+ int fd_initrd, const char *kernel_cmdline)
+{ /* Weak stub: only x86 provides a real bzImage loader. */
+ return false;
+}
+
+bool kvm__load_kernel(struct kvm *kvm, const char *kernel_filename,
+ const char *initrd_filename, const char *kernel_cmdline)
+{ /* Try loaders in order (bzImage, ELF, flat binary); dies if none accepts the image. */
+ bool ret;
+ int fd_kernel = -1, fd_initrd = -1;
+
+ fd_kernel = open(kernel_filename, O_RDONLY);
+ if (fd_kernel < 0)
+ die("Unable to open kernel %s", kernel_filename);
+
+ if (initrd_filename) {
+ fd_initrd = open(initrd_filename, O_RDONLY);
+ if (fd_initrd < 0)
+ die("Unable to open initrd %s", initrd_filename);
+
+ if (!initrd_check(fd_initrd))
+ die("%s is not an initrd", initrd_filename);
+ }
+
+ ret = load_bzimage(kvm, fd_kernel, fd_initrd, kernel_cmdline);
+
+ if (ret)
+ goto found_kernel;
+
+ pr_warning("%s is not a bzImage. Trying to load it as a flat binary...", kernel_filename);
+
+ ret = load_elf_binary(kvm, fd_kernel, fd_initrd, kernel_cmdline);
+
+ if (ret)
+ goto found_kernel;
+
+ ret = load_flat_binary(kvm, fd_kernel, fd_initrd, kernel_cmdline);
+
+ if (ret)
+ goto found_kernel;
+
+ if (initrd_filename)
+ close(fd_initrd);
+ close(fd_kernel);
+
+ die("%s is not a valid bzImage or flat binary", kernel_filename);
+
+found_kernel:
+ if (initrd_filename)
+ close(fd_initrd);
+ close(fd_kernel);
+
+ return ret;
+}
+
+
+void kvm__dump_mem(struct kvm *kvm, unsigned long addr, unsigned long size, int debug_fd)
+{ /* Hex-dump guest memory in 8-byte rows to debug_fd; size is rounded down to a multiple of 8. */
+ unsigned char *p;
+ unsigned long n;
+
+ size &= ~7; /* mod 8 */
+ if (!size)
+ return;
+
+ p = guest_flat_to_host(kvm, addr); /* NOTE(review): may be NULL for an unmapped addr — p+n is then checked, but p itself isn't */
+
+ for (n = 0; n < size; n += 8) {
+ if (!host_ptr_in_ram(kvm, p + n)) {
+ dprintf(debug_fd, " 0x%08lx: <unknown>\n", addr + n);
+ continue;
+ }
+ dprintf(debug_fd, " 0x%08lx: %02x %02x %02x %02x %02x %02x %02x %02x\n",
+ addr + n, p[n + 0], p[n + 1], p[n + 2], p[n + 3],
+ p[n + 4], p[n + 5], p[n + 6], p[n + 7]);
+ }
+}
+
+void kvm__pause(struct kvm *kvm)
+{ /* Signal every VCPU to pause and wait (via eventfd) until all have acknowledged. */
+ int i, paused_vcpus = 0;
+
+ /* Check if the guest is running */
+ if (!kvm->cpus[0] || kvm->cpus[0]->thread == 0)
+ return;
+
+ mutex_lock(&pause_lock); /* held until kvm__continue(); cross-thread unlock is UB for ordinary pthread mutexes — TODO confirm mutex type */
+
+ pause_event = eventfd(0, 0);
+ if (pause_event < 0)
+ die("Failed creating pause notification event");
+ for (i = 0; i < kvm->nrcpus; i++)
+ pthread_kill(kvm->cpus[i]->thread, SIGKVMPAUSE);
+
+ while (paused_vcpus < kvm->nrcpus) { /* each VCPU adds 1 to the eventfd counter when it parks */
+ u64 cur_read;
+
+ if (read(pause_event, &cur_read, sizeof(cur_read)) < 0)
+ die("Failed reading pause event");
+ paused_vcpus += cur_read;
+ }
+ close(pause_event);
+}
+
+void kvm__continue(struct kvm *kvm)
+{ /* Release pause_lock so VCPUs blocked in kvm__notify_paused() resume. */
+ /* Check if the guest is running */
+ if (!kvm->cpus[0] || kvm->cpus[0]->thread == 0)
+ return;
+
+ mutex_unlock(&pause_lock);
+}
+
+void kvm__notify_paused(void)
+{ /* Called on a VCPU thread: acknowledge the pause, then park on pause_lock until kvm__continue(). */
+ u64 p = 1;
+
+ if (write(pause_event, &p, sizeof(p)) < 0)
+ die("Failed notifying of paused VCPU.");
+
+ mutex_lock(&pause_lock); /* blocks while the pauser holds the lock */
+ mutex_unlock(&pause_lock);
+}
diff --git a/tools/kvm/main.c b/tools/kvm/main.c
new file mode 100644
index 0000000..05bc82c
--- /dev/null
+++ b/tools/kvm/main.c
@@ -0,0 +1,19 @@
+#include "kvm/kvm.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+/* user defined header files */
+#include <kvm/kvm-cmd.h>
+
+static int handle_kvm_command(int argc, char **argv)
+{ /* Thin adapter: dispatch into the shared sub-command table with a const-qualified argv. */
+ return handle_command(kvm_commands, argc, (const char **) &argv[0]);
+}
+
+int main(int argc, char *argv[])
+{ /* Entry point: establish the per-user runtime directory, then run the requested sub-command. */
+ kvm__set_dir("%s/%s", HOME_DIR, KVM_PID_FILE_PATH);
+
+ return handle_kvm_command(argc - 1, &argv[1]); /* argv[0] (program name) is stripped */
+}
diff --git a/tools/kvm/mips/include/kvm/barrier.h b/tools/kvm/mips/include/kvm/barrier.h
new file mode 100644
index 0000000..45bfa72
--- /dev/null
+++ b/tools/kvm/mips/include/kvm/barrier.h
@@ -0,0 +1,20 @@
+#ifndef _KVM_BARRIER_H_
+#define _KVM_BARRIER_H_
+
+#define barrier() asm volatile("": : :"memory") /* compiler-only barrier */
+
+#define mb() asm volatile (".set push\n\t.set mips2\n\tsync\n\t.set pop": : :"memory") /* full barrier via MIPS 'sync' (forces mips2 ISA for the insn) */
+#define rmb() mb()
+#define wmb() mb()
+
+#ifdef CONFIG_SMP
+#define smp_mb() mb()
+#define smp_rmb() rmb()
+#define smp_wmb() wmb()
+#else
+#define smp_mb() barrier() /* UP: compiler barrier suffices */
+#define smp_rmb() barrier()
+#define smp_wmb() barrier()
+#endif
+
+#endif /* _KVM_BARRIER_H_ */
diff --git a/tools/kvm/mips/include/kvm/kvm-arch.h b/tools/kvm/mips/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..7eadbf4
--- /dev/null
+++ b/tools/kvm/mips/include/kvm/kvm-arch.h
@@ -0,0 +1,38 @@
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#define KVM_MMIO_START 0x10000000
+#define KVM_PCI_CFG_AREA KVM_MMIO_START
+#define KVM_PCI_MMIO_AREA (KVM_MMIO_START + 0x1000000)
+#define KVM_VIRTIO_MMIO_AREA (KVM_MMIO_START + 0x2000000)
+
+/*
+ * Just for reference. This and the above corresponds to what's used
+ * in mipsvz_page_fault() in kvm_mipsvz.c of the host kernel.
+ */
+#define KVM_MIPS_IOPORT_AREA 0x1e000000
+#define KVM_MIPS_IOPORT_SIZE 0x00010000
+#define KVM_MIPS_IRQCHIP_AREA 0x1e010000
+#define KVM_MIPS_IRQCHIP_SIZE 0x00010000
+
+#define KVM_IRQ_OFFSET 1
+
+/*
+ * MIPS-VZ (trap and emulate is 0)
+ */
+#define KVM_VM_TYPE 1
+
+#define VIRTIO_DEFAULT_TRANS(kvm) VIRTIO_PCI
+
+#include <stdbool.h>
+
+#include "linux/types.h"
+
+struct kvm_arch {
+ u64 entry_point;
+ u64 argc;
+ u64 argv;
+ bool is64bit;
+};
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/tools/kvm/mips/include/kvm/kvm-config-arch.h b/tools/kvm/mips/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..8a28f9d
--- /dev/null
+++ b/tools/kvm/mips/include/kvm/kvm-config-arch.h
@@ -0,0 +1,7 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+struct kvm_config_arch {
+};
+
+#endif /* KVM__MIPS_KVM_CONFIG_ARCH_H */
diff --git a/tools/kvm/mips/include/kvm/kvm-cpu-arch.h b/tools/kvm/mips/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..3db7f2b
--- /dev/null
+++ b/tools/kvm/mips/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,42 @@
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+#include <linux/kvm.h> /* for struct kvm_regs */
+#include "kvm/kvm.h" /* for kvm__emulate_{mm}io() */
+#include <pthread.h>
+
+struct kvm;
+
+struct kvm_cpu {
+ pthread_t thread; /* VCPU thread */
+
+ unsigned long cpu_id;
+
+ struct kvm *kvm; /* parent KVM */
+ int vcpu_fd; /* For VCPU ioctls() */
+ struct kvm_run *kvm_run;
+
+ struct kvm_regs regs;
+
+ u8 is_running;
+ u8 paused;
+ u8 needs_nmi;
+
+ struct kvm_coalesced_mmio_ring *ring;
+};
+
+/*
+ * As these are such simple wrappers, let's have them in the header so they'll
+ * be cheaper to call:
+ */
+static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count)
+{
+ return kvm__emulate_io(vcpu, port, data, direction, size, count);
+}
+
+static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+ return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
+}
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/tools/kvm/mips/irq.c b/tools/kvm/mips/irq.c
new file mode 100644
index 0000000..c1ff6bb
--- /dev/null
+++ b/tools/kvm/mips/irq.c
@@ -0,0 +1,10 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+
+#include <stdlib.h>
+
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg)
+{
+ pr_warning("irq__add_msix_route");
+ return 1;
+}
diff --git a/tools/kvm/mips/kvm-cpu.c b/tools/kvm/mips/kvm-cpu.c
new file mode 100644
index 0000000..30a3de1
--- /dev/null
+++ b/tools/kvm/mips/kvm-cpu.c
@@ -0,0 +1,219 @@
+#include "kvm/kvm-cpu.h"
+#include "kvm/term.h"
+
+#include <stdlib.h>
+
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+ debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+ return debug_fd;
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+ free(vcpu);
+}
+
+static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm)
+{
+ struct kvm_cpu *vcpu;
+
+ vcpu = calloc(1, sizeof(*vcpu));
+ if (!vcpu)
+ return NULL;
+
+ vcpu->kvm = kvm;
+
+ return vcpu;
+}
+
+/*
+ * Allocate and initialize one VCPU: create it with KVM_CREATE_VCPU, mmap
+ * the shared kvm_run communication area, and locate the optional
+ * coalesced-MMIO ring.  Returns NULL on allocation failure; ioctl/mmap
+ * failures are fatal (die).
+ */
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id)
+{
+	struct kvm_cpu *vcpu;
+	int mmap_size;
+	int coalesced_offset;
+
+	vcpu = kvm_cpu__new(kvm);
+	if (!vcpu)
+		return NULL;
+
+	vcpu->cpu_id = cpu_id;
+
+	vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+	if (vcpu->vcpu_fd < 0)
+		die_perror("KVM_CREATE_VCPU ioctl");
+
+	/* Size of the kvm_run area is a system-wide constant. */
+	mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+	if (mmap_size < 0)
+		die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+	vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0);
+	if (vcpu->kvm_run == MAP_FAILED)
+		die("unable to mmap vcpu fd");
+
+	vcpu->is_running = true;
+
+	/* A non-zero result is the page offset of the coalesced-MMIO ring. */
+	coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
+	if (coalesced_offset)
+		vcpu->ring = (void *)vcpu->kvm_run + (coalesced_offset * PAGE_SIZE);
+
+	return vcpu;
+}
+
+/*
+ * Program the initial register state: entry point in PC, kernel argc/argv
+ * in gpr4/gpr5 (the MIPS a0/a1 argument registers), then write the CP0
+ * Status register through the ONE_REG interface.
+ */
+static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu)
+{
+	uint32_t v;
+	struct kvm_one_reg one_reg;
+
+	memset(&vcpu->regs, 0, sizeof(vcpu->regs));
+	vcpu->regs.pc = vcpu->kvm->arch.entry_point;
+	vcpu->regs.gpr[4] = vcpu->kvm->arch.argc;
+	vcpu->regs.gpr[5] = vcpu->kvm->arch.argv;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0)
+		die_perror("KVM_SET_REGS failed");
+
+
+	/* 0x10000 + 8 * reg + sel: CP0 register 12, sel 0 = Status. */
+	one_reg.id = KVM_REG_MIPS | KVM_REG_SIZE_U32 | (0x10000 + 8 * 12 + 0); /* Status */
+	one_reg.addr = (unsigned long)(uint32_t *)&v;
+	/* NOTE(review): value 6 -- confirm which Status bits this is meant to set. */
+	v = 6;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &one_reg) < 0)
+		die_perror("KVM_SET_ONE_REG failed");
+}
+
+/**
+ * kvm_cpu__reset_vcpu - reset virtual CPU to a known state
+ */
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+ kvm_cpu__setup_regs(vcpu);
+}
+
+/*
+ * Handle the console-output hypercall.  Arguments in kvm_run->hypercall:
+ * [0] terminal index, [1] guest address of the data, [2] length.
+ * MIPS KSEG0/1 and XKPHYS addresses are converted to physical before
+ * translation into the host mapping.  Returns false on any validation
+ * failure (bad terminal, unmapped address, bad length).
+ */
+static bool kvm_cpu__hypercall_write_cons(struct kvm_cpu *vcpu)
+{
+	int term = (int)vcpu->kvm_run->hypercall.args[0];
+	u64 addr = vcpu->kvm_run->hypercall.args[1];
+	int len = (int)vcpu->kvm_run->hypercall.args[2];
+	char *host_addr;
+
+	if (term < 0 || term >= TERM_MAX_DEVS) {
+		pr_warning("hypercall_write_cons term out of range <%d>", term);
+		return false;
+	}
+
+	if ((addr & 0xffffffffc0000000ull) == 0xffffffff80000000ull)
+		addr &= 0x1ffffffful; /* Convert KSEG{0,1} to physical. */
+	if ((addr & 0xc000000000000000ull) == 0x8000000000000000ull)
+		addr &= 0x07ffffffffffffffull; /* Convert XKPHYS to pysical */
+
+	host_addr = guest_flat_to_host(vcpu->kvm, addr);
+	if (!host_addr) {
+		pr_warning("hypercall_write_cons unmapped physaddr %llx", (unsigned long long)addr);
+		return false;
+	}
+
+	/* Both the length and the end of the buffer must stay inside guest RAM. */
+	if ((len <= 0) || !host_ptr_in_ram(vcpu->kvm, host_addr + len)) {
+		pr_warning("hypercall_write_cons len out of range <%d>", len);
+		return false;
+	}
+
+	term_putc(host_addr, len, term);
+
+	return true;
+}
+
+#define KVM_HC_MIPS_CONSOLE_OUTPUT 8
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+ switch(vcpu->kvm_run->exit_reason) {
+ case KVM_EXIT_HYPERCALL:
+ if (vcpu->kvm_run->hypercall.nr == KVM_HC_MIPS_CONSOLE_OUTPUT) {
+ return kvm_cpu__hypercall_write_cons(vcpu);
+ } else {
+ pr_warning("KVM_EXIT_HYPERCALL unrecognized call %llu",
+ (unsigned long long)vcpu->kvm_run->hypercall.nr);
+ return false;
+ }
+ case KVM_EXIT_EXCEPTION:
+ case KVM_EXIT_INTERNAL_ERROR:
+ return false;
+ default:
+ break;
+ }
+ return false;
+}
+
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+}
+
+/*
+ * Debug dump of the VCPU's general-purpose registers plus hi/lo/pc to
+ * debug_fd, four GPRs per line.
+ */
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+	struct kvm_regs regs;
+
+	/* Fixed: "&regs" had been corrupted into the "registered" entity. */
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &regs) < 0)
+		die("KVM_GET_REGS failed");
+	dprintf(debug_fd, "\n Registers:\n");
+	dprintf(debug_fd, " ----------\n");
+	dprintf(debug_fd, "$0   : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[0],
+		(unsigned long long)regs.gpr[1],
+		(unsigned long long)regs.gpr[2],
+		(unsigned long long)regs.gpr[3]);
+	dprintf(debug_fd, "$4   : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[4],
+		(unsigned long long)regs.gpr[5],
+		(unsigned long long)regs.gpr[6],
+		(unsigned long long)regs.gpr[7]);
+	dprintf(debug_fd, "$8   : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[8],
+		(unsigned long long)regs.gpr[9],
+		(unsigned long long)regs.gpr[10],
+		(unsigned long long)regs.gpr[11]);
+	dprintf(debug_fd, "$12  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[12],
+		(unsigned long long)regs.gpr[13],
+		(unsigned long long)regs.gpr[14],
+		(unsigned long long)regs.gpr[15]);
+	dprintf(debug_fd, "$16  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[16],
+		(unsigned long long)regs.gpr[17],
+		(unsigned long long)regs.gpr[18],
+		(unsigned long long)regs.gpr[19]);
+	dprintf(debug_fd, "$20  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[20],
+		(unsigned long long)regs.gpr[21],
+		(unsigned long long)regs.gpr[22],
+		(unsigned long long)regs.gpr[23]);
+	dprintf(debug_fd, "$24  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[24],
+		(unsigned long long)regs.gpr[25],
+		(unsigned long long)regs.gpr[26],
+		(unsigned long long)regs.gpr[27]);
+	dprintf(debug_fd, "$28  : %016llx %016llx %016llx %016llx\n",
+		(unsigned long long)regs.gpr[28],
+		(unsigned long long)regs.gpr[29],
+		(unsigned long long)regs.gpr[30],
+		(unsigned long long)regs.gpr[31]);
+
+	dprintf(debug_fd, "hi   : %016llx\n", (unsigned long long)regs.hi);
+	dprintf(debug_fd, "lo   : %016llx\n", (unsigned long long)regs.lo);
+	dprintf(debug_fd, "epc  : %016llx\n", (unsigned long long)regs.pc);
+
+	dprintf(debug_fd, "\n");
+}
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+}
+
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+}
diff --git a/tools/kvm/mips/kvm.c b/tools/kvm/mips/kvm.c
new file mode 100644
index 0000000..fc0428b
--- /dev/null
+++ b/tools/kvm/mips/kvm.c
@@ -0,0 +1,328 @@
+#include "kvm/kvm.h"
+#include "kvm/ioport.h"
+#include "kvm/virtio-console.h"
+
+#include <linux/kvm.h>
+
+#include <ctype.h>
+#include <unistd.h>
+#include <elf.h>
+
+struct kvm_ext kvm_req_ext[] = {
+ { 0, 0 }
+};
+
+void kvm__arch_read_term(struct kvm *kvm)
+{
+ virtio_console__inject_interrupt(kvm);
+}
+
+void kvm__init_ram(struct kvm *kvm)
+{
+ u64 phys_start, phys_size;
+ void *host_mem;
+
+ phys_start = 0;
+ phys_size = kvm->ram_size;
+ host_mem = kvm->ram_start;
+
+ kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+}
+
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+ munmap(kvm->ram_start, kvm->ram_size);
+}
+
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+
+}
+
+/* Architecture-specific KVM init */
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+ int ret;
+
+ kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size);
+ kvm->ram_size = ram_size;
+
+ if (kvm->ram_start == MAP_FAILED)
+ die("out of memory");
+
+ madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);
+
+ ret = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
+ if (ret < 0)
+ die_perror("KVM_CREATE_IRQCHIP ioctl");
+}
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+ struct kvm_irq_level irq_level;
+ int ret;
+
+ irq_level.irq = irq;
+ irq_level.level = level ? 1 : 0;
+
+ ret = ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level);
+ if (ret < 0)
+ die_perror("KVM_IRQ_LINE ioctl");
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+ struct kvm_irq_level irq_level;
+ int ret;
+
+ irq_level.irq = irq;
+ irq_level.level = 1;
+
+ ret = ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level);
+ if (ret < 0)
+ die_perror("KVM_IRQ_LINE ioctl");
+}
+
+void ioport__setup_arch(struct kvm *kvm)
+{
+}
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+ return true;
+}
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+ return false;
+}
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+ return 0;
+}
+
+/*
+ * Build the kernel command line directly in guest RAM: the full string
+ * ("mem=..." plus the user command line) is written at offset 0x2000 and
+ * split in place into NUL-terminated words, while an argv[] array of
+ * guest KSEG0 pointers (64- or 32-bit, matching the kernel) is built at
+ * offset 0x3000.  The results are stored in kvm->arch.argc/argv for
+ * kvm_cpu__setup_regs() to hand to the kernel.
+ */
+static void kvm__mips_install_cmdline(struct kvm *kvm)
+{
+	char *p = kvm->ram_start;
+	u64 cmdline_offset = 0x2000;
+	u64 argv_start = 0x3000;
+	u64 argv_offset = argv_start;
+	u64 argc = 0;
+
+	sprintf(p + cmdline_offset, "mem=0x%llx@0 ",
+		(unsigned long long)kvm->ram_size);
+
+	strcat(p + cmdline_offset, kvm->cfg.real_cmdline); /* maximum size is 2K */
+
+	while (p[cmdline_offset]) {
+		if (!isspace(p[cmdline_offset])) {
+			/* Start of a word: record its guest address in argv[]. */
+			if (kvm->arch.is64bit) {
+				*(u64 *)(p + argv_offset) = 0xffffffff80000000ull + cmdline_offset;
+				argv_offset += sizeof(u64);
+			} else {
+				*(u32 *)(p + argv_offset) = 0x80000000u + cmdline_offset;
+				argv_offset += sizeof(u32);
+			}
+			argc++;
+			while(p[cmdline_offset] && !isspace(p[cmdline_offset]))
+				cmdline_offset++;
+			continue;
+		}
+		/* Must be a space character skip over these*/
+		while(p[cmdline_offset] && isspace(p[cmdline_offset])) {
+			p[cmdline_offset] = 0;
+			cmdline_offset++;
+		}
+	}
+	kvm->arch.argc = argc;
+	kvm->arch.argv = 0xffffffff80000000ull + argv_start;
+}
+
+/* Load at the 1M point. */
+#define KERNEL_LOAD_ADDR 0x1000000
+
+/*
+ * Copy a raw (non-ELF) kernel image into guest RAM at KERNEL_LOAD_ADDR and
+ * point the entry state at its KSEG0 mapping.  fd_initrd and
+ * kernel_cmdline are currently unused.  Returns true on success; read
+ * errors are fatal.
+ */
+int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline)
+{
+	void *p;
+	void *k_start;
+	int nr;
+
+	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	p = k_start = guest_flat_to_host(kvm, KERNEL_LOAD_ADDR);
+
+	while ((nr = read(fd_kernel, p, 65536)) > 0)
+		p += nr;
+
+	/* Fail loudly on a read error rather than booting a truncated image. */
+	if (nr < 0)
+		die_perror("read");
+
+	kvm->arch.is64bit = true;
+	kvm->arch.entry_point = 0xffffffff81000000ull;
+
+	pr_info("Loaded kernel to 0x%x (%ld bytes)", KERNEL_LOAD_ADDR, (long int)(p - k_start));
+
+	return true;
+}
+
+struct kvm__arch_elf_info {
+ u64 load_addr;
+ u64 entry_point;
+ size_t len;
+ size_t offset;
+};
+
+/*
+ * Parse a 64-bit ELF header: record the entry point, then scan the program
+ * headers for the first PT_LOAD segment and fill @ei with its physical
+ * load address (KSEG0/1 and XKPHYS converted to physical), file length and
+ * file offset.  Returns false on malformed input.
+ */
+static bool kvm__arch_get_elf_64_info(Elf64_Ehdr *ehdr, int fd_kernel,
+				      struct kvm__arch_elf_info *ei)
+{
+	int i;
+	size_t nr;
+	Elf64_Phdr phdr;
+
+	if (ehdr->e_phentsize != sizeof(phdr)) {
+		pr_info("Incompatible ELF PHENTSIZE %d", ehdr->e_phentsize);
+		return false;
+	}
+
+	ei->entry_point = ehdr->e_entry;
+
+	if (lseek(fd_kernel, ehdr->e_phoff, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	phdr.p_type = PT_NULL;
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		nr = read(fd_kernel, &phdr, sizeof(phdr));
+		if (nr != sizeof(phdr)) {
+			pr_info("Couldn't read %d bytes for ELF PHDR.", (int)sizeof(phdr));
+			return false;
+		}
+		if (phdr.p_type == PT_LOAD)
+			break;
+	}
+	if (phdr.p_type != PT_LOAD) {
+		pr_info("No PT_LOAD Program Header found.");
+		return false;
+	}
+
+	ei->load_addr = phdr.p_paddr;
+
+	if ((ei->load_addr & 0xffffffffc0000000ull) == 0xffffffff80000000ull)
+		ei->load_addr &= 0x1ffffffful; /* Convert KSEG{0,1} to physical. */
+	if ((ei->load_addr & 0xc000000000000000ull) == 0x8000000000000000ull)
+		ei->load_addr &= 0x07ffffffffffffffull; /* Convert XKPHYS to pysical */
+
+
+	ei->len = phdr.p_filesz;
+	ei->offset = phdr.p_offset;
+
+	return true;
+}
+
+/*
+ * 32-bit counterpart of kvm__arch_get_elf_64_info().  Entry point and load
+ * address are sign-extended through (s64)(s32) so 32-bit KSEG addresses
+ * become canonical 64-bit ones before the physical-address conversion.
+ */
+static bool kvm__arch_get_elf_32_info(Elf32_Ehdr *ehdr, int fd_kernel,
+				      struct kvm__arch_elf_info *ei)
+{
+	int i;
+	size_t nr;
+	Elf32_Phdr phdr;
+
+	if (ehdr->e_phentsize != sizeof(phdr)) {
+		pr_info("Incompatible ELF PHENTSIZE %d", ehdr->e_phentsize);
+		return false;
+	}
+
+	ei->entry_point = (s64)((s32)ehdr->e_entry);
+
+	if (lseek(fd_kernel, ehdr->e_phoff, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	phdr.p_type = PT_NULL;
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		nr = read(fd_kernel, &phdr, sizeof(phdr));
+		if (nr != sizeof(phdr)) {
+			pr_info("Couldn't read %d bytes for ELF PHDR.", (int)sizeof(phdr));
+			return false;
+		}
+		if (phdr.p_type == PT_LOAD)
+			break;
+	}
+	if (phdr.p_type != PT_LOAD) {
+		pr_info("No PT_LOAD Program Header found.");
+		return false;
+	}
+
+	ei->load_addr = (s64)((s32)phdr.p_paddr);
+
+	if ((ei->load_addr & 0xffffffffc0000000ull) == 0xffffffff80000000ull)
+		ei->load_addr &= 0x1fffffffull; /* Convert KSEG{0,1} to physical. */
+
+	ei->len = phdr.p_filesz;
+	ei->offset = phdr.p_offset;
+
+	return true;
+}
+
+/*
+ * Load a MIPS ELF executable (32- or 64-bit) into guest RAM: validate the
+ * ELF identification, locate the first PT_LOAD segment, copy it to its
+ * physical load address and install the kernel command line.  Returns
+ * true on success, false on any malformed-image condition.
+ */
+int load_elf_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline)
+{
+	union {
+		Elf64_Ehdr ehdr;
+		Elf32_Ehdr ehdr32;
+	} eh;
+
+	/*
+	 * Must be signed: read() returns ssize_t, and with the old size_t
+	 * declaration the "nr < 0" error check below could never fire.
+	 */
+	ssize_t nr;
+	char *p;
+	struct kvm__arch_elf_info ei;
+
+	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	nr = read(fd_kernel, &eh, sizeof(eh));
+	if (nr != sizeof(eh)) {
+		pr_info("Couldn't read %d bytes for ELF header.", (int)sizeof(eh));
+		return false;
+	}
+
+	if (eh.ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
+	    eh.ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
+	    eh.ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
+	    eh.ehdr.e_ident[EI_MAG3] != ELFMAG3 ||
+	    (eh.ehdr.e_ident[EI_CLASS] != ELFCLASS64 && eh.ehdr.e_ident[EI_CLASS] != ELFCLASS32) ||
+	    eh.ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
+		pr_info("Incompatible ELF header.");
+		return false;
+	}
+	if (eh.ehdr.e_type != ET_EXEC || eh.ehdr.e_machine != EM_MIPS) {
+		pr_info("Incompatible ELF not MIPS EXEC.");
+		return false;
+	}
+
+	if (eh.ehdr.e_ident[EI_CLASS] == ELFCLASS64) {
+		if (!kvm__arch_get_elf_64_info(&eh.ehdr, fd_kernel, &ei))
+			return false;
+		kvm->arch.is64bit = true;
+	} else {
+		if (!kvm__arch_get_elf_32_info(&eh.ehdr32, fd_kernel, &ei))
+			return false;
+		kvm->arch.is64bit = false;
+	}
+
+	kvm->arch.entry_point = ei.entry_point;
+
+	if (lseek(fd_kernel, ei.offset, SEEK_SET) < 0)
+		die_perror("lseek");
+
+	p = guest_flat_to_host(kvm, ei.load_addr);
+
+	pr_info("ELF Loading 0x%lx bytes from 0x%llx to 0x%llx",
+		(unsigned long)ei.len, (unsigned long long)ei.offset, (unsigned long long)ei.load_addr);
+	do {
+		nr = read(fd_kernel, p, ei.len);
+		if (nr < 0)
+			die_perror("read");
+		/* read() == 0 means EOF: a short segment would loop forever. */
+		if (nr == 0)
+			die("unexpected EOF while loading ELF segment");
+		p += nr;
+		ei.len -= nr;
+	} while (ei.len);
+
+	kvm__mips_install_cmdline(kvm);
+
+	return true;
+}
+
+void ioport__map_irq(u8 *irq)
+{
+}
diff --git a/tools/kvm/mmio.c b/tools/kvm/mmio.c
new file mode 100644
index 0000000..c648bec
--- /dev/null
+++ b/tools/kvm/mmio.c
@@ -0,0 +1,141 @@
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/rbtree-interval.h"
+#include "kvm/brlock.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/ioctl.h>
+#include <linux/kvm.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/err.h>
+#include <errno.h>
+
+#define mmio_node(n) rb_entry(n, struct mmio_mapping, node)
+
+struct mmio_mapping {
+ struct rb_int_node node;
+ void (*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr);
+ void *ptr;
+};
+
+static struct rb_root mmio_tree = RB_ROOT;
+
+static struct mmio_mapping *mmio_search(struct rb_root *root, u64 addr, u64 len)
+{
+ struct rb_int_node *node;
+
+ node = rb_int_search_range(root, addr, addr + len);
+ if (node == NULL)
+ return NULL;
+
+ return mmio_node(node);
+}
+
+/* Find lowest match, Check for overlap */
+static struct mmio_mapping *mmio_search_single(struct rb_root *root, u64 addr)
+{
+ struct rb_int_node *node;
+
+ node = rb_int_search_single(root, addr);
+ if (node == NULL)
+ return NULL;
+
+ return mmio_node(node);
+}
+
+static int mmio_insert(struct rb_root *root, struct mmio_mapping *data)
+{
+ return rb_int_insert(root, &data->node);
+}
+
+static const char *to_direction(u8 is_write)
+{
+ if (is_write)
+ return "write";
+
+ return "read";
+}
+
+/*
+ * Register an MMIO handler for [phys_addr, phys_addr + phys_addr_len).
+ * With @coalesce set, the range is also registered with KVM's
+ * coalesced-MMIO mechanism so writes can be batched in the ring.
+ * Returns 0 on success or a negative errno.
+ */
+int kvm__register_mmio(struct kvm *kvm, u64 phys_addr, u64 phys_addr_len, bool coalesce,
+		       void (*mmio_fn)(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len, u8 is_write, void *ptr),
+			void *ptr)
+{
+	struct mmio_mapping *mmio;
+	struct kvm_coalesced_mmio_zone zone;
+	int ret;
+
+	mmio = malloc(sizeof(*mmio));
+	if (mmio == NULL)
+		return -ENOMEM;
+
+	*mmio = (struct mmio_mapping) {
+		.node = RB_INT_INIT(phys_addr, phys_addr + phys_addr_len),
+		.mmio_fn = mmio_fn,
+		.ptr	= ptr,
+	};
+
+	if (coalesce) {
+		zone = (struct kvm_coalesced_mmio_zone) {
+			.addr	= phys_addr,
+			.size	= phys_addr_len,
+		};
+		ret = ioctl(kvm->vm_fd, KVM_REGISTER_COALESCED_MMIO, &zone);
+		if (ret < 0) {
+			free(mmio);
+			return -errno;
+		}
+	}
+	/* The tree is read from VCPU threads; insertion takes the writer lock. */
+	br_write_lock(kvm);
+	ret = mmio_insert(&mmio_tree, mmio);
+	br_write_unlock(kvm);
+
+	return ret;
+}
+
+bool kvm__deregister_mmio(struct kvm *kvm, u64 phys_addr)
+{
+ struct mmio_mapping *mmio;
+ struct kvm_coalesced_mmio_zone zone;
+
+ br_write_lock(kvm);
+ mmio = mmio_search_single(&mmio_tree, phys_addr);
+ if (mmio == NULL) {
+ br_write_unlock(kvm);
+ return false;
+ }
+
+ zone = (struct kvm_coalesced_mmio_zone) {
+ .addr = phys_addr,
+ .size = 1,
+ };
+ ioctl(kvm->vm_fd, KVM_UNREGISTER_COALESCED_MMIO, &zone);
+
+ rb_int_erase(&mmio_tree, &mmio->node);
+ br_write_unlock(kvm);
+
+ free(mmio);
+ return true;
+}
+
+bool kvm__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+ struct mmio_mapping *mmio;
+
+ br_read_lock();
+ mmio = mmio_search(&mmio_tree, phys_addr, len);
+
+ if (mmio)
+ mmio->mmio_fn(vcpu, phys_addr, data, len, is_write, mmio->ptr);
+ else {
+ if (vcpu->kvm->cfg.mmio_debug)
+ fprintf(stderr, "Warning: Ignoring MMIO %s at %016llx (length %u)\n",
+ to_direction(is_write),
+ (unsigned long long)phys_addr, len);
+ }
+ br_read_unlock();
+
+ return true;
+}
diff --git a/tools/kvm/net/uip/arp.c b/tools/kvm/net/uip/arp.c
new file mode 100644
index 0000000..98423da
--- /dev/null
+++ b/tools/kvm/net/uip/arp.c
@@ -0,0 +1,30 @@
+#include "kvm/uip.h"
+
+int uip_tx_do_arp(struct uip_tx_arg *arg)
+{
+ struct uip_arp *arp, *arp2;
+ struct uip_info *info;
+ struct uip_buf *buf;
+
+ info = arg->info;
+ buf = uip_buf_clone(arg);
+
+ arp = (struct uip_arp *)(arg->eth);
+ arp2 = (struct uip_arp *)(buf->eth);
+
+ /*
+ * ARP replay code: 2
+ */
+ arp2->op = htons(0x2);
+ arp2->dmac = arp->smac;
+ arp2->dip = arp->sip;
+
+ if (arp->dip == htonl(info->host_ip)) {
+ arp2->smac = info->host_mac;
+ arp2->sip = htonl(info->host_ip);
+
+ uip_buf_set_used(info, buf);
+ }
+
+ return 0;
+}
diff --git a/tools/kvm/net/uip/buf.c b/tools/kvm/net/uip/buf.c
new file mode 100644
index 0000000..f29ad41
--- /dev/null
+++ b/tools/kvm/net/uip/buf.c
@@ -0,0 +1,114 @@
+#include "kvm/uip.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+struct uip_buf *uip_buf_get_used(struct uip_info *info)
+{
+ struct uip_buf *buf;
+ bool found = false;
+
+ mutex_lock(&info->buf_lock);
+
+ while (!(info->buf_used_nr > 0))
+ pthread_cond_wait(&info->buf_used_cond, &info->buf_lock.mutex);
+
+ list_for_each_entry(buf, &info->buf_head, list) {
+ if (buf->status == UIP_BUF_STATUS_USED) {
+ /*
+ * Set status to INUSE immediately to prevent
+ * someone from using this buf until we free it
+ */
+ buf->status = UIP_BUF_STATUS_INUSE;
+ info->buf_used_nr--;
+ found = true;
+ break;
+ }
+ }
+
+ mutex_unlock(&info->buf_lock);
+
+ return found ? buf : NULL;
+}
+
+struct uip_buf *uip_buf_get_free(struct uip_info *info)
+{
+ struct uip_buf *buf;
+ bool found = false;
+
+ mutex_lock(&info->buf_lock);
+
+ while (!(info->buf_free_nr > 0))
+ pthread_cond_wait(&info->buf_free_cond, &info->buf_lock.mutex);
+
+ list_for_each_entry(buf, &info->buf_head, list) {
+ if (buf->status == UIP_BUF_STATUS_FREE) {
+ /*
+ * Set status to INUSE immediately to prevent
+ * someone from using this buf until we free it
+ */
+ buf->status = UIP_BUF_STATUS_INUSE;
+ info->buf_free_nr--;
+ found = true;
+ break;
+ }
+ }
+
+ mutex_unlock(&info->buf_lock);
+
+ return found ? buf : NULL;
+}
+
+struct uip_buf *uip_buf_set_used(struct uip_info *info, struct uip_buf *buf)
+{
+ mutex_lock(&info->buf_lock);
+
+ buf->status = UIP_BUF_STATUS_USED;
+ info->buf_used_nr++;
+ pthread_cond_signal(&info->buf_used_cond);
+
+ mutex_unlock(&info->buf_lock);
+
+ return buf;
+}
+
+struct uip_buf *uip_buf_set_free(struct uip_info *info, struct uip_buf *buf)
+{
+ mutex_lock(&info->buf_lock);
+
+ buf->status = UIP_BUF_STATUS_FREE;
+ info->buf_free_nr++;
+ pthread_cond_signal(&info->buf_free_cond);
+
+ mutex_unlock(&info->buf_lock);
+
+ return buf;
+}
+
+struct uip_buf *uip_buf_clone(struct uip_tx_arg *arg)
+{
+ struct uip_buf *buf;
+ struct uip_eth *eth2;
+ struct uip_info *info;
+
+ info = arg->info;
+
+ /*
+ * Get buffer from device to guest
+ */
+ buf = uip_buf_get_free(info);
+
+ /*
+ * Clone buffer
+ */
+ memcpy(buf->vnet, arg->vnet, arg->vnet_len);
+ memcpy(buf->eth, arg->eth, arg->eth_len);
+ buf->vnet_len = arg->vnet_len;
+ buf->eth_len = arg->eth_len;
+
+ eth2 = (struct uip_eth *)buf->eth;
+ eth2->src = info->host_mac;
+ eth2->dst = arg->eth->src;
+
+ return buf;
+}
diff --git a/tools/kvm/net/uip/core.c b/tools/kvm/net/uip/core.c
new file mode 100644
index 0000000..e860f3a
--- /dev/null
+++ b/tools/kvm/net/uip/core.c
@@ -0,0 +1,156 @@
+#include "kvm/mutex.h"
+#include "kvm/uip.h"
+
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <kvm/iovec.h>
+
+/*
+ * Guest-to-host transmit path.  iov[0] carries the virtio-net header and
+ * iov[1..out-1] the ethernet frame; frames split across more than one iov
+ * entry are first linearized into a temporary buffer.  Dispatches on
+ * ethertype (ARP / IPv4; everything else is silently dropped).  Returns
+ * the number of bytes consumed, or -ENOMEM.
+ */
+int uip_tx(struct iovec *iov, u16 out, struct uip_info *info)
+{
+	void *vnet;
+	struct uip_tx_arg arg;
+	int eth_len, vnet_len;
+	struct uip_eth *eth;
+	u8 *buf = NULL;
+	u16 proto;
+	int i;
+
+	/*
+	 * Buffer from guest to device
+	 */
+	vnet_len = iov[0].iov_len;
+	vnet	 = iov[0].iov_base;
+
+	eth_len	 = iov[1].iov_len;
+	eth	 = iov[1].iov_base;
+
+	/*
+	 * In case, ethernet frame is in more than one iov entry.
+	 * Copy iov buffer into one linear buffer.
+	 */
+	if (out > 2) {
+		eth_len = 0;
+		for (i = 1; i < out; i++)
+			eth_len += iov[i].iov_len;
+
+		buf = malloc(eth_len);
+		if (!buf)
+			return -ENOMEM;
+
+		eth = (struct uip_eth *)buf;
+		/* buf advances as the copy cursor; eth keeps the start. */
+		for (i = 1; i < out; i++) {
+			memcpy(buf, iov[i].iov_base, iov[i].iov_len);
+			buf += iov[i].iov_len;
+		}
+	}
+
+	memset(&arg, 0, sizeof(arg));
+
+	arg.vnet_len = vnet_len;
+	arg.eth_len = eth_len;
+	arg.info = info;
+	arg.vnet = vnet;
+	arg.eth = eth;
+
+	/*
+	 * Check package type
+	 */
+	proto = ntohs(eth->type);
+
+	switch (proto) {
+	case UIP_ETH_P_ARP:
+		uip_tx_do_arp(&arg);
+		break;
+	case UIP_ETH_P_IP:
+		uip_tx_do_ipv4(&arg);
+		break;
+	default:
+		break;
+	}
+
+	/* eth (not buf, which was advanced) points at the linearized allocation. */
+	if (out > 2 && buf)
+		free(eth);
+
+	return vnet_len + eth_len;
+}
+
+int uip_rx(struct iovec *iov, u16 in, struct uip_info *info)
+{
+ struct uip_buf *buf;
+ int len;
+
+ /*
+ * Sleep until there is a buffer for guest
+ */
+ buf = uip_buf_get_used(info);
+
+ memcpy_toiovecend(iov, buf->vnet, 0, buf->vnet_len);
+ memcpy_toiovecend(iov, buf->eth, buf->vnet_len, buf->eth_len);
+
+ len = buf->vnet_len + buf->eth_len;
+
+ uip_buf_set_free(info, buf);
+ return len;
+}
+
+void uip_static_init(struct uip_info *info)
+{
+ struct list_head *udp_socket_head;
+ struct list_head *tcp_socket_head;
+ struct list_head *buf_head;
+
+ udp_socket_head = &info->udp_socket_head;
+ tcp_socket_head = &info->tcp_socket_head;
+ buf_head = &info->buf_head;
+
+ INIT_LIST_HEAD(udp_socket_head);
+ INIT_LIST_HEAD(tcp_socket_head);
+ INIT_LIST_HEAD(buf_head);
+
+ mutex_init(&info->udp_socket_lock);
+ mutex_init(&info->tcp_socket_lock);
+ mutex_init(&info->buf_lock);
+
+ pthread_cond_init(&info->buf_used_cond, NULL);
+ pthread_cond_init(&info->buf_free_cond, NULL);
+
+ info->buf_used_nr = 0;
+}
+
+/*
+ * Allocate the fixed pool of info->buf_nr packet buffers, mark them free
+ * and seed the free counter, then pick up DNS settings for DHCP replies.
+ * Returns 0 on success, -ENOMEM if any allocation fails.
+ */
+int uip_init(struct uip_info *info)
+{
+	struct list_head *buf_head;
+	struct uip_buf *buf;
+	int buf_nr;
+	int i;
+
+	buf_head = &info->buf_head;
+	buf_nr	 = info->buf_nr;
+
+	for (i = 0; i < buf_nr; i++) {
+		buf = malloc(sizeof(*buf));
+		if (!buf)
+			return -ENOMEM;
+		memset(buf, 0, sizeof(*buf));
+
+		buf->status	= UIP_BUF_STATUS_FREE;
+		buf->info	= info;
+		buf->id		= i;
+		list_add_tail(&buf->list, buf_head);
+	}
+
+	list_for_each_entry(buf, buf_head, list) {
+		buf->vnet_len	= info->vnet_hdr_len;
+		buf->vnet	= malloc(buf->vnet_len);
+		/* Room for a 64 KiB frame plus the checksum pseudo header. */
+		buf->eth_len	= 1024*64 + sizeof(struct uip_pseudo_hdr);
+		buf->eth	= malloc(buf->eth_len);
+		if (!buf->vnet || !buf->eth)
+			return -ENOMEM;
+
+		memset(buf->vnet, 0, buf->vnet_len);
+		memset(buf->eth, 0, buf->eth_len);
+	}
+
+	info->buf_free_nr = buf_nr;
+
+	uip_dhcp_get_dns(info);
+
+	return 0;
+}
diff --git a/tools/kvm/net/uip/csum.c b/tools/kvm/net/uip/csum.c
new file mode 100644
index 0000000..7ca8bad
--- /dev/null
+++ b/tools/kvm/net/uip/csum.c
@@ -0,0 +1,92 @@
+#include "kvm/uip.h"
+
+/*
+ * Fold @count bytes at @addr into the running Internet checksum @csum and
+ * return the folded one's complement (RFC 1071 style: 16-bit words summed,
+ * carries wrapped, odd trailing byte added).
+ * NOTE(review): the u16 loads assume @addr is 2-byte aligned -- confirm
+ * for all callers.
+ */
+static u16 uip_csum(u16 csum, u8 *addr, u16 count)
+{
+	long sum = csum;
+
+	while (count > 1) {
+		sum	+= *(u16 *)addr;
+		addr	+= 2;
+		count	-= 2;
+	}
+
+	if (count > 0)
+		sum += *(unsigned char *)addr;
+
+	/* Fold any carries back into the low 16 bits. */
+	while (sum>>16)
+		sum = (sum & 0xffff) + (sum >> 16);
+
+	return ~sum;
+}
+
+u16 uip_csum_ip(struct uip_ip *ip)
+{
+ return uip_csum(0, &ip->vhl, uip_ip_hdrlen(ip));
+}
+
+u16 uip_csum_icmp(struct uip_icmp *icmp)
+{
+ struct uip_ip *ip;
+
+ ip = &icmp->ip;
+ return icmp->csum = uip_csum(0, &icmp->type, htons(ip->len) - uip_ip_hdrlen(ip) - 8); /* icmp header len = 8 */
+}
+
+u16 uip_csum_udp(struct uip_udp *udp)
+{
+ struct uip_pseudo_hdr hdr;
+ struct uip_ip *ip;
+ int udp_len;
+ u8 *pad;
+
+ ip = &udp->ip;
+
+ hdr.sip = ip->sip;
+ hdr.dip = ip->dip;
+ hdr.zero = 0;
+ hdr.proto = ip->proto;
+ hdr.len = udp->len;
+
+ udp_len = uip_udp_len(udp);
+
+ if (udp_len % 2) {
+ pad = (u8 *)&udp->sport + udp_len;
+ *pad = 0;
+ memcpy((u8 *)&udp->sport + udp_len + 1, &hdr, sizeof(hdr));
+ return uip_csum(0, (u8 *)&udp->sport, udp_len + 1 + sizeof(hdr));
+ } else {
+ memcpy((u8 *)&udp->sport + udp_len, &hdr, sizeof(hdr));
+ return uip_csum(0, (u8 *)&udp->sport, udp_len + sizeof(hdr));
+ }
+
+}
+
+u16 uip_csum_tcp(struct uip_tcp *tcp)
+{
+ struct uip_pseudo_hdr hdr;
+ struct uip_ip *ip;
+ u16 tcp_len;
+ u8 *pad;
+
+ ip = &tcp->ip;
+ tcp_len = ntohs(ip->len) - uip_ip_hdrlen(ip);
+
+ hdr.sip = ip->sip;
+ hdr.dip = ip->dip;
+ hdr.zero = 0;
+ hdr.proto = ip->proto;
+ hdr.len = htons(tcp_len);
+
+ if (tcp_len > UIP_MAX_TCP_PAYLOAD + 20)
+ pr_warning("tcp_len(%d) is too large", tcp_len);
+
+ if (tcp_len % 2) {
+ pad = (u8 *)&tcp->sport + tcp_len;
+ *pad = 0;
+ memcpy((u8 *)&tcp->sport + tcp_len + 1, &hdr, sizeof(hdr));
+ return uip_csum(0, (u8 *)&tcp->sport, tcp_len + 1 + sizeof(hdr));
+ } else {
+ memcpy((u8 *)&tcp->sport + tcp_len, &hdr, sizeof(hdr));
+ return uip_csum(0, (u8 *)&tcp->sport, tcp_len + sizeof(hdr));
+ }
+}
diff --git a/tools/kvm/net/uip/dhcp.c b/tools/kvm/net/uip/dhcp.c
new file mode 100644
index 0000000..b17d352
--- /dev/null
+++ b/tools/kvm/net/uip/dhcp.c
@@ -0,0 +1,202 @@
+#include "kvm/uip.h"
+
+#include <arpa/inet.h>
+
+#define EMPTY_ADDR "0.0.0.0"
+
+static inline bool uip_dhcp_is_discovery(struct uip_dhcp *dhcp)
+{
+ return (dhcp->option[2] == UIP_DHCP_DISCOVER &&
+ dhcp->option[1] == UIP_DHCP_TAG_MSG_TYPE_LEN &&
+ dhcp->option[0] == UIP_DHCP_TAG_MSG_TYPE);
+}
+
+static inline bool uip_dhcp_is_request(struct uip_dhcp *dhcp)
+{
+ return (dhcp->option[2] == UIP_DHCP_REQUEST &&
+ dhcp->option[1] == UIP_DHCP_TAG_MSG_TYPE_LEN &&
+ dhcp->option[0] == UIP_DHCP_TAG_MSG_TYPE);
+}
+
+bool uip_udp_is_dhcp(struct uip_udp *udp)
+{
+ struct uip_dhcp *dhcp;
+
+ if (ntohs(udp->sport) != UIP_DHCP_PORT_CLIENT ||
+ ntohs(udp->dport) != UIP_DHCP_PORT_SERVER)
+ return false;
+
+ dhcp = (struct uip_dhcp *)udp;
+
+ if (ntohl(dhcp->magic_cookie) != UIP_DHCP_MAGIC_COOKIE)
+ return false;
+
+ return true;
+}
+
+/*
+ * Read the host's /etc/resolv.conf to collect the search domain and up to
+ * UIP_DHCP_MAX_DNS_SERVER_NR nameserver addresses for DHCP replies.
+ * Returns 0 if at least one nameserver was found, -1 otherwise.
+ */
+int uip_dhcp_get_dns(struct uip_info *info)
+{
+	char key[256], val[256];
+	struct in_addr addr;
+	int ret = -1;
+	int n = 0;
+	FILE *fp;
+	u32 ip;
+
+	fp = fopen("/etc/resolv.conf", "r");
+	if (!fp)
+		return ret;
+
+	while (!feof(fp)) {
+		/* Bound the %s conversions: unbounded %s can overflow key/val. */
+		if (fscanf(fp, "%255s %255s\n", key, val) != 2)
+			continue;
+		if (strncmp("domain", key, 6) == 0)
+			info->domain_name = strndup(val, UIP_DHCP_MAX_DOMAIN_NAME_LEN);
+		else if (strncmp("nameserver", key, 10) == 0) {
+			if (!inet_aton(val, &addr))
+				continue;
+			ip = ntohl(addr.s_addr);
+			if (n < UIP_DHCP_MAX_DNS_SERVER_NR)
+				info->dns_ip[n++] = ip;
+			ret = 0;
+		}
+	}
+
+	fclose(fp);
+	return ret;
+}
+
+static int uip_dhcp_fill_option_name_and_server(struct uip_info *info, u8 *opt, int i)
+{
+ u8 domain_name_len;
+ u32 *addr;
+ int n;
+
+ if (info->domain_name) {
+ domain_name_len = strlen(info->domain_name);
+ opt[i++] = UIP_DHCP_TAG_DOMAIN_NAME;
+ opt[i++] = domain_name_len;
+ memcpy(&opt[i], info->domain_name, domain_name_len);
+ i += domain_name_len;
+ }
+
+ for (n = 0; n < UIP_DHCP_MAX_DNS_SERVER_NR; n++) {
+ if (info->dns_ip[n] == 0)
+ continue;
+ opt[i++] = UIP_DHCP_TAG_DNS_SERVER;
+ opt[i++] = UIP_DHCP_TAG_DNS_SERVER_LEN;
+ addr = (u32 *)&opt[i];
+ *addr = htonl(info->dns_ip[n]);
+ i += UIP_DHCP_TAG_DNS_SERVER_LEN;
+ }
+
+ return i;
+}
+static int uip_dhcp_fill_option(struct uip_info *info, struct uip_dhcp *dhcp, int reply_msg_type)
+{
+ int i = 0;
+ u32 *addr;
+ u8 *opt;
+
+ opt = dhcp->option;
+
+ opt[i++] = UIP_DHCP_TAG_MSG_TYPE;
+ opt[i++] = UIP_DHCP_TAG_MSG_TYPE_LEN;
+ opt[i++] = reply_msg_type;
+
+ opt[i++] = UIP_DHCP_TAG_SERVER_ID;
+ opt[i++] = UIP_DHCP_TAG_SERVER_ID_LEN;
+ addr = (u32 *)&opt[i];
+ *addr = htonl(info->host_ip);
+ i += UIP_DHCP_TAG_SERVER_ID_LEN;
+
+ opt[i++] = UIP_DHCP_TAG_LEASE_TIME;
+ opt[i++] = UIP_DHCP_TAG_LEASE_TIME_LEN;
+ addr = (u32 *)&opt[i];
+ *addr = htonl(UIP_DHCP_LEASE_TIME);
+ i += UIP_DHCP_TAG_LEASE_TIME_LEN;
+
+ opt[i++] = UIP_DHCP_TAG_SUBMASK;
+ opt[i++] = UIP_DHCP_TAG_SUBMASK_LEN;
+ addr = (u32 *)&opt[i];
+ *addr = htonl(info->guest_netmask);
+ i += UIP_DHCP_TAG_SUBMASK_LEN;
+
+ opt[i++] = UIP_DHCP_TAG_ROUTER;
+ opt[i++] = UIP_DHCP_TAG_ROUTER_LEN;
+ addr = (u32 *)&opt[i];
+ *addr = htonl(info->host_ip);
+ i += UIP_DHCP_TAG_ROUTER_LEN;
+
+ opt[i++] = UIP_DHCP_TAG_ROOT;
+ opt[i++] = strlen(EMPTY_ADDR);
+ addr = (u32 *)&opt[i];
+ strncpy((void *) addr, EMPTY_ADDR, strlen(EMPTY_ADDR));
+ i += strlen(EMPTY_ADDR);
+
+ i = uip_dhcp_fill_option_name_and_server(info, opt, i);
+
+ opt[i++] = UIP_DHCP_TAG_END;
+
+ return 0;
+}
+
+static int uip_dhcp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8 reply_msg_type)
+{
+ struct uip_dhcp *dhcp;
+
+ dhcp = (struct uip_dhcp *)buf->eth;
+
+ dhcp->msg_type = 2;
+ dhcp->client_ip = 0;
+ dhcp->your_ip = htonl(info->guest_ip);
+ dhcp->server_ip = htonl(info->host_ip);
+ dhcp->agent_ip = 0;
+
+ uip_dhcp_fill_option(info, dhcp, reply_msg_type);
+
+ sk->sip = htonl(info->guest_ip);
+ sk->dip = htonl(info->host_ip);
+ sk->sport = htons(UIP_DHCP_PORT_CLIENT);
+ sk->dport = htons(UIP_DHCP_PORT_SERVER);
+
+ return 0;
+}
+
+int uip_tx_do_ipv4_udp_dhcp(struct uip_tx_arg *arg)
+{
+ struct uip_udp_socket sk;
+ struct uip_dhcp *dhcp;
+ struct uip_info *info;
+ struct uip_buf *buf;
+ u8 reply_msg_type;
+
+ dhcp = (struct uip_dhcp *)arg->eth;
+
+ if (uip_dhcp_is_discovery(dhcp))
+ reply_msg_type = UIP_DHCP_OFFER;
+ else if (uip_dhcp_is_request(dhcp))
+ reply_msg_type = UIP_DHCP_ACK;
+ else
+ return -1;
+
+ buf = uip_buf_clone(arg);
+ info = arg->info;
+
+ /*
+ * Cook DHCP pkg
+ */
+ uip_dhcp_make_pkg(info, &sk, buf, reply_msg_type);
+
+ /*
+ * Cook UDP pkg
+ */
+ uip_udp_make_pkg(info, &sk, buf, NULL, UIP_DHCP_MAX_PAYLOAD_LEN);
+
+ /*
+ * Send data received from socket to guest
+ */
+ uip_buf_set_used(info, buf);
+
+ return 0;
+}
diff --git a/tools/kvm/net/uip/icmp.c b/tools/kvm/net/uip/icmp.c
new file mode 100644
index 0000000..233297c
--- /dev/null
+++ b/tools/kvm/net/uip/icmp.c
@@ -0,0 +1,29 @@
+#include "kvm/uip.h"
+
+/*
+ * Answer a guest ICMP request in place: clone the guest frame, swap
+ * source/destination IPs, turn it into an echo reply (type 0) and
+ * recompute both the IP and ICMP checksums before queueing it back to
+ * the guest.  Always returns 0.
+ */
+int uip_tx_do_ipv4_icmp(struct uip_tx_arg *arg)
+{
+	struct uip_ip *ip, *ip2;
+	struct uip_icmp *icmp2;
+	struct uip_buf *buf;
+
+	buf = uip_buf_clone(arg);
+
+	/* icmp2/ip2 alias the same cloned frame at different header depths */
+	icmp2 = (struct uip_icmp *)(buf->eth);
+	ip2 = (struct uip_ip *)(buf->eth);
+	ip = (struct uip_ip *)(arg->eth);
+
+	ip2->sip = ip->dip;
+	ip2->dip = ip->sip;
+	ip2->csum = 0;
+	/*
+	 * ICMP reply: 0
+	 */
+	icmp2->type = 0;
+	icmp2->csum = 0;
+	/* csum fields must be zero while the checksums are computed */
+	ip2->csum = uip_csum_ip(ip2);
+	icmp2->csum = uip_csum_icmp(icmp2);
+
+	uip_buf_set_used(arg->info, buf);
+
+	return 0;
+}
diff --git a/tools/kvm/net/uip/ipv4.c b/tools/kvm/net/uip/ipv4.c
new file mode 100644
index 0000000..58373fd
--- /dev/null
+++ b/tools/kvm/net/uip/ipv4.c
@@ -0,0 +1,29 @@
+#include "kvm/uip.h"
+
+/*
+ * Dispatch a guest-transmitted IPv4 frame to the per-protocol handler
+ * (ICMP/TCP/UDP).  Frames with IP options (header != 20 bytes) are
+ * rejected; unknown protocols are silently ignored.
+ */
+int uip_tx_do_ipv4(struct uip_tx_arg *arg)
+{
+	struct uip_ip *ip;
+
+	ip = (struct uip_ip *)(arg->eth);
+
+	if (uip_ip_hdrlen(ip) != 20) {
+		pr_warning("IP header length is not 20 bytes");
+		return -1;
+	}
+
+	switch (ip->proto) {
+	case UIP_IP_P_ICMP:
+		uip_tx_do_ipv4_icmp(arg);
+		break;
+	case UIP_IP_P_TCP:
+		uip_tx_do_ipv4_tcp(arg);
+		break;
+	case UIP_IP_P_UDP:
+		uip_tx_do_ipv4_udp(arg);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
diff --git a/tools/kvm/net/uip/tcp.c b/tools/kvm/net/uip/tcp.c
new file mode 100644
index 0000000..3c30ade
--- /dev/null
+++ b/tools/kvm/net/uip/tcp.c
@@ -0,0 +1,348 @@
+#include "kvm/uip.h"
+
+#include <kvm/kvm.h>
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <arpa/inet.h>
+
+/*
+ * Shut down one direction (how) of the proxy socket.  Once both the
+ * read and write sides have finished, fully close the fd, unlink the
+ * socket from the per-info list and free it.  Always returns 0.
+ */
+static int uip_tcp_socket_close(struct uip_tcp_socket *sk, int how)
+{
+	shutdown(sk->fd, how);
+
+	if (sk->write_done && sk->read_done) {
+		shutdown(sk->fd, SHUT_RDWR);
+		close(sk->fd);
+
+		mutex_lock(sk->lock);
+		list_del(&sk->list);
+		mutex_unlock(sk->lock);
+
+		free(sk);
+	}
+
+	return 0;
+}
+
+/*
+ * Look up an existing proxy socket by the 4-tuple (network byte order,
+ * as taken from the guest frame).  Returns NULL when no match exists.
+ */
+static struct uip_tcp_socket *uip_tcp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+	struct list_head *sk_head;
+	struct mutex *sk_lock;
+	struct uip_tcp_socket *sk;
+
+	sk_head = &arg->info->tcp_socket_head;
+	sk_lock = &arg->info->tcp_socket_lock;
+
+	mutex_lock(sk_lock);
+	list_for_each_entry(sk, sk_head, list) {
+		if (sk->sip == sip && sk->dip == dip && sk->sport == sport && sk->dport == dport) {
+			mutex_unlock(sk_lock);
+			return sk;
+		}
+	}
+	mutex_unlock(sk_lock);
+
+	return NULL;
+}
+
+/*
+ * Create a proxy connection for a guest SYN: allocate a socket entry,
+ * connect() a host-side TCP socket to the original destination
+ * (redirected to loopback when the guest addresses the host IP) and
+ * link the entry into info->tcp_socket_head.
+ *
+ * Returns the new socket, or NULL on allocation/socket/connect
+ * failure.  Fixes vs. original: malloc() and socket() results are
+ * checked, and the fd is closed on connect() failure (was leaked).
+ */
+static struct uip_tcp_socket *uip_tcp_socket_alloc(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+	struct list_head *sk_head;
+	struct uip_tcp_socket *sk;
+	struct mutex *sk_lock;
+	struct uip_tcp *tcp;
+	struct uip_ip *ip;
+	int ret;
+
+	tcp = (struct uip_tcp *)arg->eth;
+	ip = (struct uip_ip *)arg->eth;
+
+	sk_head = &arg->info->tcp_socket_head;
+	sk_lock = &arg->info->tcp_socket_lock;
+
+	sk = malloc(sizeof(*sk));
+	if (!sk)
+		return NULL;
+	memset(sk, 0, sizeof(*sk));
+
+	sk->lock = sk_lock;
+	sk->info = arg->info;
+
+	sk->fd = socket(AF_INET, SOCK_STREAM, 0);
+	if (sk->fd < 0)
+		goto out_free;
+
+	sk->addr.sin_family = AF_INET;
+	sk->addr.sin_port = dport;
+	sk->addr.sin_addr.s_addr = dip;
+
+	pthread_cond_init(&sk->cond, NULL);
+
+	/* Guest -> host traffic is redirected to the loopback interface */
+	if (ntohl(dip) == arg->info->host_ip)
+		sk->addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+
+	ret = connect(sk->fd, (struct sockaddr *)&sk->addr, sizeof(sk->addr));
+	if (ret)
+		goto out_close;
+
+	sk->sip = ip->sip;
+	sk->dip = ip->dip;
+	sk->sport = tcp->sport;
+	sk->dport = tcp->dport;
+
+	mutex_lock(sk_lock);
+	list_add_tail(&sk->list, sk_head);
+	mutex_unlock(sk_lock);
+
+	return sk;
+
+out_close:
+	close(sk->fd);
+out_free:
+	free(sk);
+	return NULL;
+}
+
+/*
+ * Synthesize a server->guest TCP segment carrying payload_len bytes of
+ * sk->payload with the given TCP flags, cooking the full Ethernet/IP/
+ * TCP headers and advancing the server sequence number.  Always
+ * returns 0.
+ */
+static int uip_tcp_payload_send(struct uip_tcp_socket *sk, u8 flag, u16 payload_len)
+{
+	struct uip_info *info;
+	struct uip_eth *eth2;
+	struct uip_tcp *tcp2;
+	struct uip_buf *buf;
+	struct uip_ip *ip2;
+
+	info = sk->info;
+
+	/*
+	 * Get free buffer to send data to guest
+	 */
+	buf = uip_buf_get_free(info);
+
+	/*
+	 * Cook an Ethernet frame (the three pointers alias the same buffer)
+	 */
+	tcp2 = (struct uip_tcp *)buf->eth;
+	eth2 = (struct uip_eth *)buf->eth;
+	ip2 = (struct uip_ip *)buf->eth;
+
+	eth2->src = info->host_mac;
+	eth2->dst = info->guest_mac;
+	eth2->type = htons(UIP_ETH_P_IP);
+
+	ip2->vhl = UIP_IP_VER_4 | UIP_IP_HDR_LEN;
+	ip2->tos = 0;
+	ip2->id = 0;
+	ip2->flgfrag = 0;
+	ip2->ttl = UIP_IP_TTL;
+	ip2->proto = UIP_IP_P_TCP;
+	ip2->csum = 0;
+	/* Reply direction: swap the guest's source/destination */
+	ip2->sip = sk->dip;
+	ip2->dip = sk->sip;
+
+	tcp2->sport = sk->dport;
+	tcp2->dport = sk->sport;
+	tcp2->seq = htonl(sk->seq_server);
+	tcp2->ack = htonl(sk->ack_server);
+	/*
+	 * Disable TCP options, tcp hdr len equals 20 bytes
+	 */
+	tcp2->off = UIP_TCP_HDR_LEN;
+	tcp2->flg = flag;
+	tcp2->win = htons(UIP_TCP_WIN_SIZE);
+	tcp2->csum = 0;
+	tcp2->urgent = 0;
+
+	if (payload_len > 0)
+		memcpy(uip_tcp_payload(tcp2), sk->payload, payload_len);
+
+	ip2->len = htons(uip_tcp_hdrlen(tcp2) + payload_len + uip_ip_hdrlen(ip2));
+	ip2->csum = uip_csum_ip(ip2);
+	tcp2->csum = uip_csum_tcp(tcp2);
+
+	/*
+	 * virtio_net_hdr
+	 */
+	buf->vnet_len = info->vnet_hdr_len;
+	memset(buf->vnet, 0, buf->vnet_len);
+
+	buf->eth_len = ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
+
+	/*
+	 * Increase server seq
+	 */
+	sk->seq_server += payload_len;
+
+	/*
+	 * Send data received from socket to guest
+	 */
+	uip_buf_set_used(info, buf);
+
+	return 0;
+}
+
+/*
+ * Per-connection receive thread: reads from the host-side socket and
+ * forwards data to the guest in window-sized chunks, blocking on
+ * sk->cond until the guest's receive window opens up.  On EOF/error it
+ * shuts the read side down and sends FIN|ACK to the guest.
+ */
+static void *uip_tcp_socket_thread(void *p)
+{
+	struct uip_tcp_socket *sk;
+	int len, left, ret;
+	u8 *payload, *pos;
+
+	kvm__set_thread_name("uip-tcp");
+
+	sk = p;
+
+	payload = malloc(UIP_MAX_TCP_PAYLOAD);
+	if (!payload)
+		goto out;
+
+	while (1) {
+		pos = payload;
+
+		ret = read(sk->fd, payload, UIP_MAX_TCP_PAYLOAD);
+
+		/* 0 = EOF, <0 = error; either way tear the connection down */
+		if (ret <= 0 || ret > UIP_MAX_TCP_PAYLOAD)
+			goto out;
+
+		left = ret;
+
+		while (left > 0) {
+			/*
+			 * Flow control: wait until the guest has acked enough
+			 * for at least one byte of window space.
+			 */
+			mutex_lock(sk->lock);
+			while ((len = sk->guest_acked + sk->window_size - sk->seq_server) <= 0)
+				pthread_cond_wait(&sk->cond, &sk->lock->mutex);
+			mutex_unlock(sk->lock);
+
+			sk->payload = pos;
+			if (len > left)
+				len = left;
+			if (len > UIP_MAX_TCP_PAYLOAD)
+				len = UIP_MAX_TCP_PAYLOAD;
+			left -= len;
+			pos += len;
+
+			uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, len);
+		}
+	}
+
+out:
+	/*
+	 * Close server to guest TCP connection
+	 */
+	uip_tcp_socket_close(sk, SHUT_RD);
+
+	/* FIN consumes one sequence number */
+	uip_tcp_payload_send(sk, UIP_TCP_FLAG_FIN | UIP_TCP_FLAG_ACK, 0);
+	sk->seq_server += 1;
+
+	sk->read_done = 1;
+
+	free(payload);
+	pthread_exit(NULL);
+
+	return NULL;
+}
+
+/*
+ * Start the per-connection receive thread once; subsequent calls are
+ * no-ops.  Returns pthread_create()'s result, or 0 when already
+ * started.
+ */
+static int uip_tcp_socket_receive(struct uip_tcp_socket *sk)
+{
+	if (sk->thread == 0)
+		return pthread_create(&sk->thread, NULL, uip_tcp_socket_thread, (void *)sk);
+
+	return 0;
+}
+
+/*
+ * Forward the TCP payload of a guest segment to the host-side socket.
+ * Returns the number of bytes written (which the caller acks back to
+ * the guest), or write()'s negative error.  NOTE(review): a short
+ * write is only warned about, not retried -- confirm acceptable.
+ */
+static int uip_tcp_socket_send(struct uip_tcp_socket *sk, struct uip_tcp *tcp)
+{
+	int len;
+	int ret;
+	u8 *payload;
+
+	/* Guest already closed its write side; silently drop */
+	if (sk->write_done)
+		return 0;
+
+	payload = uip_tcp_payload(tcp);
+	len = uip_tcp_payloadlen(tcp);
+
+	ret = write(sk->fd, payload, len);
+	if (ret != len)
+		pr_warning("tcp send error");
+
+	return ret;
+}
+
+/*
+ * Main TCP handler for guest-transmitted frames.  SYN creates a proxy
+ * connection and fakes SYN-ACK; FIN acks and closes the guest->server
+ * direction; data segments are forwarded to the host socket and acked
+ * immediately.  Returns 0 on success, -1 on connect/lookup/send
+ * failure.
+ */
+int uip_tx_do_ipv4_tcp(struct uip_tx_arg *arg)
+{
+	struct uip_tcp_socket *sk;
+	struct uip_tcp *tcp;
+	struct uip_ip *ip;
+	int ret;
+
+	tcp = (struct uip_tcp *)arg->eth;
+	ip = (struct uip_ip *)arg->eth;
+
+	/*
+	 * Guest is trying to start a TCP session, let's fake SYN-ACK to guest
+	 */
+	if (uip_tcp_is_syn(tcp)) {
+		sk = uip_tcp_socket_alloc(arg, ip->sip, ip->dip, tcp->sport, tcp->dport);
+		if (!sk)
+			return -1;
+
+		sk->window_size = ntohs(tcp->win);
+
+		/*
+		 * Setup ISN number
+		 */
+		sk->isn_guest = uip_tcp_isn(tcp);
+		sk->isn_server = uip_tcp_isn_alloc();
+
+		sk->seq_server = sk->isn_server;
+		sk->ack_server = sk->isn_guest + 1;
+		uip_tcp_payload_send(sk, UIP_TCP_FLAG_SYN | UIP_TCP_FLAG_ACK, 0);
+		/* SYN consumes one sequence number */
+		sk->seq_server += 1;
+
+		/*
+		 * Start receive thread for data from remote to guest
+		 */
+		uip_tcp_socket_receive(sk);
+
+		goto out;
+	}
+
+	/*
+	 * Find socket we have allocated
+	 */
+	sk = uip_tcp_socket_find(arg, ip->sip, ip->dip, tcp->sport, tcp->dport);
+	if (!sk)
+		return -1;
+
+	/* Publish window/ack updates and wake the receive thread */
+	mutex_lock(sk->lock);
+	sk->window_size = ntohs(tcp->win);
+	sk->guest_acked = ntohl(tcp->ack);
+	pthread_cond_signal(&sk->cond);
+	mutex_unlock(sk->lock);
+
+	if (uip_tcp_is_fin(tcp)) {
+		if (sk->write_done)
+			goto out;
+
+		sk->write_done = 1;
+		/* FIN consumes one sequence number */
+		sk->ack_server += 1;
+		uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0);
+
+		/*
+		 * Close guest to server TCP connection
+		 */
+		uip_tcp_socket_close(sk, SHUT_WR);
+
+		goto out;
+	}
+
+	/*
+	 * Ignore guest to server frames with zero tcp payload
+	 */
+	if (uip_tcp_payloadlen(tcp) == 0)
+		goto out;
+
+	/*
+	 * Send out TCP data to remote host
+	 */
+	ret = uip_tcp_socket_send(sk, tcp);
+	if (ret < 0)
+		return -1;
+	/*
+	 * Send ACK to guest immediately
+	 */
+	sk->ack_server += ret;
+	uip_tcp_payload_send(sk, UIP_TCP_FLAG_ACK, 0);
+
+out:
+	return 0;
+}
diff --git a/tools/kvm/net/uip/udp.c b/tools/kvm/net/uip/udp.c
new file mode 100644
index 0000000..dd288c5
--- /dev/null
+++ b/tools/kvm/net/uip/udp.c
@@ -0,0 +1,239 @@
+#include "kvm/uip.h"
+
+#include <kvm/kvm.h>
+#include <linux/virtio_net.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <fcntl.h>
+
+#define UIP_UDP_MAX_EVENTS 1000
+
+/*
+ * Find the proxy UDP socket for the 4-tuple, or create one: open a
+ * non-blocking host datagram socket, register it with the shared
+ * epoll instance (created lazily) and add it to the per-info list.
+ * Returns NULL on socket() failure.
+ */
+static struct uip_udp_socket *uip_udp_socket_find(struct uip_tx_arg *arg, u32 sip, u32 dip, u16 sport, u16 dport)
+{
+	struct list_head *sk_head;
+	struct uip_udp_socket *sk;
+	struct mutex *sk_lock;
+	struct epoll_event ev;
+	int flags;
+	int ret;
+
+	sk_head = &arg->info->udp_socket_head;
+	sk_lock = &arg->info->udp_socket_lock;
+
+	/*
+	 * Find existing sk
+	 */
+	mutex_lock(sk_lock);
+	list_for_each_entry(sk, sk_head, list) {
+		if (sk->sip == sip && sk->dip == dip && sk->sport == sport && sk->dport == dport) {
+			mutex_unlock(sk_lock);
+			return sk;
+		}
+	}
+	mutex_unlock(sk_lock);
+
+	/*
+	 * Allocate new one
+	 */
+	/* NOTE(review): malloc() result is not checked before memset */
+	sk = malloc(sizeof(*sk));
+	memset(sk, 0, sizeof(*sk));
+
+	sk->lock = sk_lock;
+
+	sk->fd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (sk->fd < 0)
+		goto out;
+
+	/*
+	 * Set non-blocking
+	 */
+	/* NOTE(review): fcntl() results are not checked */
+	flags = fcntl(sk->fd, F_GETFL, 0);
+	flags |= O_NONBLOCK;
+	fcntl(sk->fd, F_SETFL, flags);
+
+	/*
+	 * Add sk->fd to epoll_wait
+	 */
+	ev.events = EPOLLIN;
+	ev.data.fd = sk->fd;
+	/* data.ptr overwrites data.fd (same union); the thread uses ptr */
+	ev.data.ptr = sk;
+	/* NOTE(review): epoll_create() result is not checked -- confirm */
+	if (arg->info->udp_epollfd <= 0)
+		arg->info->udp_epollfd = epoll_create(UIP_UDP_MAX_EVENTS);
+	ret = epoll_ctl(arg->info->udp_epollfd, EPOLL_CTL_ADD, sk->fd, &ev);
+	if (ret == -1)
+		pr_warning("epoll_ctl error");
+
+	sk->addr.sin_family = AF_INET;
+	sk->addr.sin_addr.s_addr = dip;
+	sk->addr.sin_port = dport;
+
+	sk->sip = sip;
+	sk->dip = dip;
+	sk->sport = sport;
+	sk->dport = dport;
+
+	mutex_lock(sk_lock);
+	list_add_tail(&sk->list, sk_head);
+	mutex_unlock(sk_lock);
+
+	return sk;
+
+out:
+	free(sk);
+	return NULL;
+}
+
+/*
+ * Forward the UDP payload of a guest datagram to the host-side socket.
+ * Returns 0 on a complete send, -1 on error or short send.
+ */
+static int uip_udp_socket_send(struct uip_udp_socket *sk, struct uip_udp *udp)
+{
+	int len;
+	int ret;
+
+	/* udp->len covers header + payload; strip the header length */
+	len = ntohs(udp->len) - uip_udp_hdrlen(udp);
+
+	ret = sendto(sk->fd, udp->payload, len, 0, (struct sockaddr *)&sk->addr, sizeof(sk->addr));
+	if (ret != len)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Cook a complete host->guest Ethernet/IP/UDP frame in buf, optionally
+ * copying payload_len bytes of payload (payload may be NULL when the
+ * caller has already placed the data, e.g. DHCP).  sk supplies the
+ * reversed addressing.  Always returns 0.
+ */
+int uip_udp_make_pkg(struct uip_info *info, struct uip_udp_socket *sk, struct uip_buf *buf, u8* payload, int payload_len)
+{
+	struct uip_eth *eth2;
+	struct uip_udp *udp2;
+	struct uip_ip *ip2;
+
+	/*
+	 * Cook an Ethernet frame (the three pointers alias the same buffer)
+	 */
+	udp2 = (struct uip_udp *)(buf->eth);
+	eth2 = (struct uip_eth *)buf->eth;
+	ip2 = (struct uip_ip *)(buf->eth);
+
+	eth2->src = info->host_mac;
+	eth2->dst = info->guest_mac;
+	eth2->type = htons(UIP_ETH_P_IP);
+
+	ip2->vhl = UIP_IP_VER_4 | UIP_IP_HDR_LEN;
+	ip2->tos = 0;
+	ip2->id = 0;
+	ip2->flgfrag = 0;
+	ip2->ttl = UIP_IP_TTL;
+	ip2->proto = UIP_IP_P_UDP;
+	ip2->csum = 0;
+
+	/* Reply direction: swap the guest's source/destination */
+	ip2->sip = sk->dip;
+	ip2->dip = sk->sip;
+	udp2->sport = sk->dport;
+	udp2->dport = sk->sport;
+
+	udp2->len = htons(payload_len + uip_udp_hdrlen(udp2));
+	udp2->csum = 0;
+
+	if (payload)
+		memcpy(udp2->payload, payload, payload_len);
+
+	ip2->len = udp2->len + htons(uip_ip_hdrlen(ip2));
+	ip2->csum = uip_csum_ip(ip2);
+	udp2->csum = uip_csum_udp(udp2);
+
+	/*
+	 * virtio_net_hdr
+	 */
+	buf->vnet_len = info->vnet_hdr_len;
+	memset(buf->vnet, 0, buf->vnet_len);
+
+	buf->eth_len = ntohs(ip2->len) + uip_eth_hdrlen(&ip2->eth);
+
+	return 0;
+}
+
+/*
+ * Single shared UDP receive thread: epoll-waits on all proxy UDP
+ * sockets and forwards every received datagram back to the guest.
+ * Runs forever once started.
+ */
+static void *uip_udp_socket_thread(void *p)
+{
+	struct epoll_event events[UIP_UDP_MAX_EVENTS];
+	struct uip_udp_socket *sk;
+	struct uip_info *info;
+	struct uip_buf *buf;
+	int payload_len;
+	u8 *payload;
+	int nfds;
+	int i;
+
+	kvm__set_thread_name("uip-udp");
+
+	info = p;
+
+	/* Spin until allocation succeeds; the thread cannot run without it */
+	do {
+		payload = malloc(UIP_MAX_UDP_PAYLOAD);
+	} while (!payload);
+
+	while (1) {
+		nfds = epoll_wait(info->udp_epollfd, events, UIP_UDP_MAX_EVENTS, -1);
+
+		if (nfds == -1)
+			continue;
+
+		for (i = 0; i < nfds; i++) {
+
+			sk = events[i].data.ptr;
+			payload_len = recvfrom(sk->fd, payload, UIP_MAX_UDP_PAYLOAD, 0, NULL, NULL);
+			if (payload_len < 0)
+				continue;
+
+			/*
+			 * Get free buffer to send data to guest
+			 */
+			buf = uip_buf_get_free(info);
+
+			uip_udp_make_pkg(info, sk, buf, payload, payload_len);
+
+			/*
+			 * Send data received from socket to guest
+			 */
+			uip_buf_set_used(info, buf);
+		}
+	}
+
+	/* NOTE(review): unreachable -- the while (1) loop never exits */
+	free(payload);
+	pthread_exit(NULL);
+	return NULL;
+}
+
+/*
+ * Handle a guest-transmitted UDP datagram: DHCP is answered locally,
+ * everything else is forwarded through a proxy socket.  The shared
+ * receive thread is started lazily on first use.  Returns 0 on
+ * success, -1 on socket or send failure.
+ */
+int uip_tx_do_ipv4_udp(struct uip_tx_arg *arg)
+{
+	struct uip_udp_socket *sk;
+	struct uip_info *info;
+	struct uip_udp *udp;
+	struct uip_ip *ip;
+	int ret;
+
+	udp = (struct uip_udp *)(arg->eth);
+	ip = (struct uip_ip *)(arg->eth);
+	info = arg->info;
+
+	if (uip_udp_is_dhcp(udp)) {
+		uip_tx_do_ipv4_udp_dhcp(arg);
+		return 0;
+	}
+
+	/*
+	 * Find socket we have allocated before, otherwise allocate one
+	 */
+	sk = uip_udp_socket_find(arg, ip->sip, ip->dip, udp->sport, udp->dport);
+	if (!sk)
+		return -1;
+
+	/*
+	 * Send out UDP data to remote host
+	 */
+	ret = uip_udp_socket_send(sk, udp);
+	if (ret)
+		return -1;
+
+	if (!info->udp_thread)
+		pthread_create(&info->udp_thread, NULL, uip_udp_socket_thread, (void *)info);
+
+	return 0;
+}
diff --git a/tools/kvm/pci.c b/tools/kvm/pci.c
new file mode 100644
index 0000000..3a6696c
--- /dev/null
+++ b/tools/kvm/pci.c
@@ -0,0 +1,251 @@
+#include "kvm/devices.h"
+#include "kvm/pci.h"
+#include "kvm/ioport.h"
+#include "kvm/irq.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <linux/err.h>
+#include <assert.h>
+
+#define PCI_BAR_OFFSET(b) (offsetof(struct pci_device_header, bar[b]))
+
+static u32 pci_config_address_bits;
+
+/* This is within our PCI gap - in an unused area.
+ * Note this is a PCI *bus address*, is used to assign BARs etc.!
+ * (That's why it can still 32bit even with 64bit guests-- 64bit
+ * PCI isn't currently supported.)
+ */
+static u32 io_space_blocks = KVM_PCI_MMIO_AREA;
+
+/*
+ * BARs must be naturally aligned, so enforce this in the allocator.
+ */
+/*
+ * Bump-allocate a naturally-aligned block of PCI MMIO bus-address
+ * space.  Blocks are never freed.
+ */
+u32 pci_get_io_space_block(u32 size)
+{
+	u32 block = ALIGN(io_space_blocks, size);
+	io_space_blocks = block + size;
+	return block;
+}
+
+/* Assign INTA# and a fresh IRQ line to a newly registered PCI device. */
+void pci__assign_irq(struct device_header *dev_hdr)
+{
+	struct pci_device_header *pci_hdr = dev_hdr->data;
+
+	/*
+	 * PCI supports only INTA#,B#,C#,D# per device.
+	 *
+	 * A#,B#,C#,D# are allowed for multifunctional devices so stick
+	 * with A# for our single function devices.
+	 */
+	pci_hdr->irq_pin = 1;
+	pci_hdr->irq_line = irq__alloc_line();
+}
+
+/*
+ * Byte pointer into the latched CONFIG_ADDRESS register for a given
+ * ioport, so sub-dword accesses to 0xCF8..0xCFB land at the right
+ * offset.
+ */
+static void *pci_config_address_ptr(u16 port)
+{
+	unsigned long offset;
+	void *base;
+
+	offset = port - PCI_CONFIG_ADDRESS;
+	base = &pci_config_address_bits;
+
+	return base + offset;
+}
+
+/* Guest write to CONFIG_ADDRESS (0xCF8): latch the address bits. */
+static bool pci_config_address_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	void *p = pci_config_address_ptr(port);
+
+	memcpy(p, data, size);
+
+	return true;
+}
+
+/* Guest read of CONFIG_ADDRESS (0xCF8): return the latched bits. */
+static bool pci_config_address_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	void *p = pci_config_address_ptr(port);
+
+	memcpy(data, p, size);
+
+	return true;
+}
+
+static struct ioport_operations pci_config_address_ops = {
+	.io_in	= pci_config_address_in,
+	.io_out	= pci_config_address_out,
+};
+
+/*
+ * True when the latched CONFIG_ADDRESS names this bus/function and a
+ * device is registered at device_number on the PCI device bus.
+ */
+static bool pci_device_exists(u8 bus_number, u8 device_number, u8 function_number)
+{
+	union pci_config_address pci_config_address;
+
+	pci_config_address.w = ioport__read32(&pci_config_address_bits);
+
+	if (pci_config_address.bus_number != bus_number)
+		return false;
+
+	if (pci_config_address.function_number != function_number)
+		return false;
+
+	return !IS_ERR_OR_NULL(device__find_dev(DEVICE_BUS_PCI, device_number));
+}
+
+/* Guest write to CONFIG_DATA (0xCFC..0xCFF): route to config space. */
+static bool pci_config_data_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	union pci_config_address pci_config_address;
+
+	pci_config_address.w = ioport__read32(&pci_config_address_bits);
+	/*
+	 * If someone accesses PCI configuration space offsets that are not
+	 * aligned to 4 bytes, it uses ioports to signify that.
+	 */
+	pci_config_address.reg_offset = port - PCI_CONFIG_DATA;
+
+	pci__config_wr(vcpu->kvm, pci_config_address, data, size);
+
+	return true;
+}
+
+/* Guest read of CONFIG_DATA (0xCFC..0xCFF): route to config space. */
+static bool pci_config_data_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+	union pci_config_address pci_config_address;
+
+	pci_config_address.w = ioport__read32(&pci_config_address_bits);
+	/*
+	 * If someone accesses PCI configuration space offsets that are not
+	 * aligned to 4 bytes, it uses ioports to signify that.
+	 */
+	pci_config_address.reg_offset = port - PCI_CONFIG_DATA;
+
+	pci__config_rd(vcpu->kvm, pci_config_address, data, size);
+
+	return true;
+}
+
+static struct ioport_operations pci_config_data_ops = {
+	.io_in	= pci_config_data_in,
+	.io_out	= pci_config_data_out,
+};
+
+/*
+ * Guest write into emulated PCI configuration space.  Implements the
+ * standard BAR-sizing handshake: a write of 0xFFFFFFFF to a BAR is
+ * answered by storing the BAR's size so the next read returns it.
+ */
+void pci__config_wr(struct kvm *kvm, union pci_config_address addr, void *data, int size)
+{
+	u8 dev_num;
+
+	dev_num = addr.device_number;
+
+	if (pci_device_exists(0, dev_num, 0)) {
+		unsigned long offset;
+
+		offset = addr.w & 0xff;
+		if (offset < sizeof(struct pci_device_header)) {
+			void *p = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
+			struct pci_device_header *hdr = p;
+			/* bar is only meaningful for offsets within the BAR array */
+			u8 bar = (offset - PCI_BAR_OFFSET(0)) / (sizeof(u32));
+			u32 sz = cpu_to_le32(PCI_IO_SIZE);
+
+			if (bar < 6 && hdr->bar_size[bar])
+				sz = hdr->bar_size[bar];
+
+			/*
+			 * If the kernel masks the BAR it would expect to find the
+			 * size of the BAR there next time it reads from it.
+			 * When the kernel got the size it would write the address
+			 * back.
+			 */
+			/* NOTE(review): writes to any config dword whose current
+			 * value is 0 are silently dropped by this guard -- confirm
+			 * this is the intended "read-only unless set" behavior. */
+			if (*(u32 *)(p + offset)) {
+				/* See if kernel tries to mask one of the BARs */
+				if ((offset >= PCI_BAR_OFFSET(0)) &&
+				    (offset <= PCI_BAR_OFFSET(6)) &&
+				    (ioport__read32(data) == 0xFFFFFFFF))
+					memcpy(p + offset, &sz, sizeof(sz));
+				else
+					memcpy(p + offset, data, size);
+			}
+		}
+	}
+}
+
+/*
+ * Guest read from emulated PCI configuration space.  Absent devices
+ * read as all-ones (standard PCI "no device" response); offsets past
+ * the emulated header read as zero.
+ */
+void pci__config_rd(struct kvm *kvm, union pci_config_address addr, void *data, int size)
+{
+	u8 dev_num;
+
+	dev_num = addr.device_number;
+
+	if (pci_device_exists(0, dev_num, 0)) {
+		unsigned long offset;
+
+		offset = addr.w & 0xff;
+		if (offset < sizeof(struct pci_device_header)) {
+			void *p = device__find_dev(DEVICE_BUS_PCI, dev_num)->data;
+
+			memcpy(data, p + offset, size);
+		} else {
+			memset(data, 0x00, size);
+		}
+	} else {
+		memset(data, 0xff, size);
+	}
+}
+
+/*
+ * MMIO-mapped (ECAM-style) access to PCI config space: translate the
+ * MMIO offset into a config address and reuse the rd/wr paths.
+ */
+static void pci_config_mmio_access(struct kvm_cpu *vcpu, u64 addr, u8 *data,
+				   u32 len, u8 is_write, void *kvm)
+{
+	union pci_config_address cfg_addr;
+
+	addr			-= KVM_PCI_CFG_AREA;
+	cfg_addr.w		= (u32)addr;
+	cfg_addr.enable_bit	= 1;
+
+	if (is_write)
+		pci__config_wr(kvm, cfg_addr, data, len);
+	else
+		pci__config_rd(kvm, cfg_addr, data, len);
+}
+
+/*
+ * Look up a registered PCI device's header by device number; NULL when
+ * no such device exists.
+ */
+struct pci_device_header *pci__find_dev(u8 dev_num)
+{
+	struct device_header *hdr = device__find_dev(DEVICE_BUS_PCI, dev_num);
+
+	if (IS_ERR_OR_NULL(hdr))
+		return NULL;
+
+	return hdr->data;
+}
+
+/*
+ * Register the legacy 0xCF8/0xCFC ioport pair and the MMIO config
+ * window.  Uses goto-based unwind so a partial registration is rolled
+ * back on failure.  Returns 0 or a negative error.
+ */
+int pci__init(struct kvm *kvm)
+{
+	int r;
+
+	r = ioport__register(kvm, PCI_CONFIG_DATA + 0, &pci_config_data_ops, 4, NULL);
+	if (r < 0)
+		return r;
+
+	r = ioport__register(kvm, PCI_CONFIG_ADDRESS + 0, &pci_config_address_ops, 4, NULL);
+	if (r < 0)
+		goto err_unregister_data;
+
+	r = kvm__register_mmio(kvm, KVM_PCI_CFG_AREA, PCI_CFG_SIZE, false,
+			       pci_config_mmio_access, kvm);
+	if (r < 0)
+		goto err_unregister_addr;
+
+	return 0;
+
+err_unregister_addr:
+	ioport__unregister(kvm, PCI_CONFIG_ADDRESS);
+err_unregister_data:
+	ioport__unregister(kvm, PCI_CONFIG_DATA);
+	return r;
+}
+dev_base_init(pci__init);
+
+/* Tear down the ioport registrations made in pci__init(). */
+int pci__exit(struct kvm *kvm)
+{
+	ioport__unregister(kvm, PCI_CONFIG_DATA);
+	ioport__unregister(kvm, PCI_CONFIG_ADDRESS);
+
+	return 0;
+}
+dev_base_exit(pci__exit);
diff --git a/tools/kvm/powerpc/boot.c b/tools/kvm/powerpc/boot.c
new file mode 100644
index 0000000..2557fc0
--- /dev/null
+++ b/tools/kvm/powerpc/boot.c
@@ -0,0 +1,8 @@
+#include "kvm/kvm.h"
+
+#include <stdbool.h>
+
+/* Firmware loading is not implemented on PPC; always report failure. */
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+	return false;
+}
diff --git a/tools/kvm/powerpc/cpu_info.c b/tools/kvm/powerpc/cpu_info.c
new file mode 100644
index 0000000..a9dfe39
--- /dev/null
+++ b/tools/kvm/powerpc/cpu_info.c
@@ -0,0 +1,210 @@
+/*
+ * PPC CPU identification
+ *
+ * This is a very simple "host CPU info" struct to get us going.
+ * For the little host information we need, I don't want to grub about
+ * parsing stuff in /proc/device-tree so just match host PVR to differentiate
+ * PPC970 and POWER7 (which is all that's currently supported).
+ *
+ * Qemu does something similar but this is MUCH simpler!
+ *
+ * Copyright 2012 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <kvm/kvm.h>
+#include <sys/ioctl.h>
+
+#include "cpu_info.h"
+#include "kvm/util.h"
+
+/* POWER7 */
+
+static struct cpu_info cpu_power7_info = {
+ .name = "POWER7",
+ .tb_freq = 512000000,
+ .d_bsize = 128,
+ .i_bsize = 128,
+ .flags = CPUINFO_FLAG_DFP | CPUINFO_FLAG_VSX | CPUINFO_FLAG_VMX,
+ .mmu_info = {
+ .flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS,
+ .slb_size = 32,
+ },
+};
+
+/* POWER8 */
+
+static struct cpu_info cpu_power8_info = {
+ .name = "POWER8",
+ .tb_freq = 512000000,
+ .d_bsize = 128,
+ .i_bsize = 128,
+ .flags = CPUINFO_FLAG_DFP | CPUINFO_FLAG_VSX | CPUINFO_FLAG_VMX,
+ .mmu_info = {
+ .flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS,
+ .slb_size = 32,
+ },
+};
+
+/* PPC970/G5 */
+
+static struct cpu_info cpu_970_info = {
+ .name = "G5",
+ .tb_freq = 33333333,
+ .d_bsize = 128,
+ .i_bsize = 128,
+ .flags = CPUINFO_FLAG_VMX,
+};
+
+/* This is a default catchall for 'no match' on PVR: */
+static struct cpu_info cpu_dummy_info = { .name = "unknown" };
+
+/* PVR match table: (host_pvr & pvr_mask) == pvr selects cpu_info.
+ * First match wins, so exact-PVR rows precede family rows. */
+static struct pvr_info host_pvr_info[] = {
+	{ 0xffffffff, 0x0f000003, &cpu_power7_info },
+	{ 0xffff0000, 0x003f0000, &cpu_power7_info },
+	{ 0xffff0000, 0x004a0000, &cpu_power7_info },
+	{ 0xffff0000, 0x004b0000, &cpu_power8_info },
+	{ 0xffff0000, 0x00390000, &cpu_970_info },
+	{ 0xffff0000, 0x003c0000, &cpu_970_info },
+	{ 0xffff0000, 0x00440000, &cpu_970_info },
+	{ 0xffff0000, 0x00450000, &cpu_970_info },
+};
+
+/* If we can't query the kernel for supported page sizes assume 4K and 16M */
+static struct kvm_ppc_one_seg_page_size fallback_sps[] = {
+ [0] = {
+ .page_shift = 12,
+ .slb_enc = 0,
+ .enc = {
+ [0] = {
+ .page_shift = 12,
+ .pte_enc = 0,
+ },
+ },
+ },
+ [1] = {
+ .page_shift = 24,
+ .slb_enc = 0x100,
+ .enc = {
+ [0] = {
+ .page_shift = 24,
+ .pte_enc = 0,
+ },
+ },
+ },
+};
+
+
+/*
+ * Populate cpu_info->mmu_info with the segment/page sizes usable by a
+ * guest.  Query KVM_PPC_GET_SMMU_INFO when available, otherwise fall
+ * back to the static 4K+16M table.  When KVM_PPC_PAGE_SIZES_REAL is
+ * set, entries larger than the backing RAM page size are invalidated
+ * and the remaining entries compacted to the front of each array.
+ *
+ * Fix vs. original: mmu_info was a needless `static` local (it is
+ * re-assigned on every call; static only added hidden shared state).
+ */
+static void setup_mmu_info(struct kvm *kvm, struct cpu_info *cpu_info)
+{
+	struct kvm_ppc_smmu_info *mmu_info;
+	struct kvm_ppc_one_seg_page_size *sps;
+	int i, j, k, valid;
+
+	if (!kvm__supports_extension(kvm, KVM_CAP_PPC_GET_SMMU_INFO)) {
+		memcpy(&cpu_info->mmu_info.sps, fallback_sps, sizeof(fallback_sps));
+	} else if (ioctl(kvm->vm_fd, KVM_PPC_GET_SMMU_INFO, &cpu_info->mmu_info) < 0) {
+		die_perror("KVM_PPC_GET_SMMU_INFO failed");
+	}
+
+	mmu_info = &cpu_info->mmu_info;
+
+	if (!(mmu_info->flags & KVM_PPC_PAGE_SIZES_REAL))
+		/* Guest pages are not restricted by the backing page size */
+		return;
+
+	/* Filter based on backing page size */
+
+	for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+		sps = &mmu_info->sps[i];
+
+		if (!sps->page_shift)
+			break;
+
+		if (kvm->ram_pagesize < (1ul << sps->page_shift)) {
+			/* Mark the whole segment size invalid */
+			sps->page_shift = 0;
+			continue;
+		}
+
+		/* Check each page size for the segment */
+		for (j = 0, valid = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+			if (!sps->enc[j].page_shift)
+				break;
+
+			if (kvm->ram_pagesize < (1ul << sps->enc[j].page_shift))
+				sps->enc[j].page_shift = 0;
+			else
+				valid++;
+		}
+
+		if (!valid) {
+			/* Mark the whole segment size invalid */
+			sps->page_shift = 0;
+			continue;
+		}
+
+		/* Mark any trailing entries invalid if we broke out early */
+		for (k = j; k < KVM_PPC_PAGE_SIZES_MAX_SZ; k++)
+			sps->enc[k].page_shift = 0;
+
+		/* Collapse holes */
+		for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+			if (sps->enc[j].page_shift)
+				continue;
+
+			for (k = j + 1; k < KVM_PPC_PAGE_SIZES_MAX_SZ; k++) {
+				if (sps->enc[k].page_shift) {
+					sps->enc[j] = sps->enc[k];
+					sps->enc[k].page_shift = 0;
+					break;
+				}
+			}
+		}
+	}
+
+	/* Mark any trailing entries invalid if we broke out early */
+	for (j = i; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++)
+		mmu_info->sps[j].page_shift = 0;
+
+	/* Collapse holes */
+	for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+		if (mmu_info->sps[i].page_shift)
+			continue;
+
+		for (j = i + 1; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+			if (mmu_info->sps[j].page_shift) {
+				mmu_info->sps[i] = mmu_info->sps[j];
+				mmu_info->sps[j].page_shift = 0;
+				break;
+			}
+		}
+	}
+}
+
+/*
+ * Match the host PVR against host_pvr_info[] and return the first
+ * matching cpu_info (with its MMU info set up); unknown hosts get a
+ * warning and the "unknown" dummy entry.  Never returns NULL.
+ */
+struct cpu_info *find_cpu_info(struct kvm *kvm)
+{
+	struct cpu_info *info;
+	unsigned int i;
+	u32 pvr = kvm->arch.pvr;
+
+	for (info = NULL, i = 0; i < ARRAY_SIZE(host_pvr_info); i++) {
+		if ((pvr & host_pvr_info[i].pvr_mask) == host_pvr_info[i].pvr) {
+			info = host_pvr_info[i].cpu_info;
+			break;
+		}
+	}
+
+	/* Didn't find anything? Rut-ro. */
+	if (!info) {
+		pr_warning("Host CPU unsupported by kvmtool\n");
+		info = &cpu_dummy_info;
+	}
+
+	setup_mmu_info(kvm, info);
+
+	return info;
+}
diff --git a/tools/kvm/powerpc/cpu_info.h b/tools/kvm/powerpc/cpu_info.h
new file mode 100644
index 0000000..f61707a
--- /dev/null
+++ b/tools/kvm/powerpc/cpu_info.h
@@ -0,0 +1,42 @@
+/*
+ * PPC CPU identification
+ *
+ * Copyright 2012 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef CPU_INFO_H
+#define CPU_INFO_H
+
+#include <kvm/kvm.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/kvm.h>
+
+struct cpu_info {
+ const char *name;
+ u32 tb_freq; /* timebase frequency */
+ u32 d_bsize; /* d-cache block size */
+ u32 i_bsize; /* i-cache block size */
+ u32 flags;
+ struct kvm_ppc_smmu_info mmu_info;
+};
+
+struct pvr_info {
+ u32 pvr_mask;
+ u32 pvr;
+ struct cpu_info *cpu_info;
+};
+
+/* Misc capabilities/CPU properties */
+#define CPUINFO_FLAG_DFP 0x00000001
+#define CPUINFO_FLAG_VMX 0x00000002
+#define CPUINFO_FLAG_VSX 0x00000004
+
+struct cpu_info *find_cpu_info(struct kvm *kvm);
+
+#endif
diff --git a/tools/kvm/powerpc/include/kvm/barrier.h b/tools/kvm/powerpc/include/kvm/barrier.h
new file mode 100644
index 0000000..dd5115a
--- /dev/null
+++ b/tools/kvm/powerpc/include/kvm/barrier.h
@@ -0,0 +1,6 @@
+#ifndef _KVM_BARRIER_H_
+#define _KVM_BARRIER_H_
+
+#include <asm/barrier.h>
+
+#endif /* _KVM_BARRIER_H_ */
diff --git a/tools/kvm/powerpc/include/kvm/kvm-arch.h b/tools/kvm/powerpc/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..fdd518f
--- /dev/null
+++ b/tools/kvm/powerpc/include/kvm/kvm-arch.h
@@ -0,0 +1,65 @@
+/*
+ * PPC64 architecture-specific definitions
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <time.h>
+
+/*
+ * MMIO lives after RAM, but it'd be nice if it didn't constantly move.
+ * Choose a suitably high address, e.g. 63T... This limits RAM size.
+ */
+#define PPC_MMIO_START 0x3F0000000000UL
+#define PPC_MMIO_SIZE 0x010000000000UL
+
+#define KERNEL_LOAD_ADDR 0x0000000000000000
+#define KERNEL_START_ADDR 0x0000000000000000
+#define KERNEL_SECONDARY_START_ADDR 0x0000000000000060
+#define INITRD_LOAD_ADDR 0x0000000002800000
+
+#define RTAS_MAX_SIZE 0x10000
+
+#define TIMEBASE_FREQ 512000000ULL
+
+#define KVM_MMIO_START PPC_MMIO_START
+
+/*
+ * This is the address that pci_get_io_space_block() starts allocating
+ * from. Note that this is a PCI bus address.
+ */
+#define KVM_IOPORT_AREA 0x0
+#define KVM_PCI_CFG_AREA 0x1000000
+#define KVM_PCI_MMIO_AREA 0x2000000
+#define KVM_VIRTIO_MMIO_AREA 0x3000000
+
+#define KVM_IRQ_OFFSET 16
+
+#define KVM_VM_TYPE 0
+
+#define VIRTIO_DEFAULT_TRANS(kvm) VIRTIO_PCI
+
+struct spapr_phb;
+
+struct kvm_arch {
+ u64 sdr1;
+ u32 pvr;
+ unsigned long rtas_gra;
+ unsigned long rtas_size;
+ unsigned long fdt_gra;
+ unsigned long initrd_gra;
+ unsigned long initrd_size;
+ struct icp_state *icp;
+ struct spapr_phb *phb;
+};
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/tools/kvm/powerpc/include/kvm/kvm-config-arch.h b/tools/kvm/powerpc/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..60f61de
--- /dev/null
+++ b/tools/kvm/powerpc/include/kvm/kvm-config-arch.h
@@ -0,0 +1,7 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+struct kvm_config_arch {
+};
+
+#endif /* KVM__KVM_CONFIG_ARCH_H */
diff --git a/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h b/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..e256f5d
--- /dev/null
+++ b/tools/kvm/powerpc/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,76 @@
+/*
+ * PPC64 cpu-specific definitions
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+/* Architecture-specific kvm_cpu definitions. */
+
+#include <linux/kvm.h> /* for struct kvm_regs */
+#include <stdbool.h>
+#include <pthread.h>
+
+#define MSR_SF (1UL<<63)
+#define MSR_HV (1UL<<60)
+#define MSR_VEC (1UL<<25)
+#define MSR_VSX (1UL<<23)
+#define MSR_POW (1UL<<18)
+#define MSR_EE (1UL<<15)
+#define MSR_PR (1UL<<14)
+#define MSR_FP (1UL<<13)
+#define MSR_ME (1UL<<12)
+#define MSR_FE0 (1UL<<11)
+#define MSR_SE (1UL<<10)
+#define MSR_BE (1UL<<9)
+#define MSR_FE1 (1UL<<8)
+#define MSR_IR (1UL<<5)
+#define MSR_DR (1UL<<4)
+#define MSR_PMM (1UL<<2)
+#define MSR_RI (1UL<<1)
+#define MSR_LE (1UL<<0)
+
+#define POWER7_EXT_IRQ 0
+
+struct kvm;
+
+struct kvm_cpu {
+ pthread_t thread; /* VCPU thread */
+
+ unsigned long cpu_id;
+
+ struct kvm *kvm; /* parent KVM */
+ int vcpu_fd; /* For VCPU ioctls() */
+ struct kvm_run *kvm_run;
+
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ struct kvm_fpu fpu;
+
+ u8 is_running;
+ u8 paused;
+ u8 needs_nmi;
+ /*
+ * Although PPC KVM doesn't yet support coalesced MMIO, generic code
+ * needs this in our kvm_cpu:
+ */
+ struct kvm_coalesced_mmio_ring *ring;
+};
+
+void kvm_cpu__irq(struct kvm_cpu *vcpu, int pin, int level);
+
+/* This is never actually called on PPC. */
+static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count)
+{
+ return false;
+}
+
+bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write);
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/tools/kvm/powerpc/ioport.c b/tools/kvm/powerpc/ioport.c
new file mode 100644
index 0000000..58dc625
--- /dev/null
+++ b/tools/kvm/powerpc/ioport.c
@@ -0,0 +1,22 @@
+/*
+ * PPC64 ioport platform setup. There isn't any! :-)
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/ioport.h"
+
+#include <stdlib.h>
+
+void ioport__setup_arch(struct kvm *kvm)
+{
+ /* PPC has no legacy ioports to set up */
+}
+
+void ioport__map_irq(u8 *irq)
+{
+}
diff --git a/tools/kvm/powerpc/irq.c b/tools/kvm/powerpc/irq.c
new file mode 100644
index 0000000..03f2fe7
--- /dev/null
+++ b/tools/kvm/powerpc/irq.c
@@ -0,0 +1,31 @@
+/*
+ * PPC64 IRQ routines
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/devices.h"
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/kvm.h>
+#include <sys/ioctl.h>
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "kvm/pci.h"
+
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg)
+{
+ die(__FUNCTION__);
+ return 0;
+}
diff --git a/tools/kvm/powerpc/kvm-cpu.c b/tools/kvm/powerpc/kvm-cpu.c
new file mode 100644
index 0000000..2f5cfc6
--- /dev/null
+++ b/tools/kvm/powerpc/kvm-cpu.c
@@ -0,0 +1,290 @@
+/*
+ * PPC64 processor support
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/symbol.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include "spapr.h"
+#include "spapr_pci.h"
+#include "xics.h"
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <assert.h>
+
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+ debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+ return debug_fd;
+}
+
+static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm)
+{
+ struct kvm_cpu *vcpu;
+
+ vcpu = calloc(1, sizeof *vcpu);
+ if (!vcpu)
+ return NULL;
+
+ vcpu->kvm = kvm;
+
+ return vcpu;
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+ free(vcpu);
+}
+
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id)
+{
+ struct kvm_cpu *vcpu;
+ int mmap_size;
+ struct kvm_enable_cap papr_cap = { .cap = KVM_CAP_PPC_PAPR };
+
+ vcpu = kvm_cpu__new(kvm);
+ if (!vcpu)
+ return NULL;
+
+ vcpu->cpu_id = cpu_id;
+
+ vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+ if (vcpu->vcpu_fd < 0)
+ die_perror("KVM_CREATE_VCPU ioctl");
+
+ mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+ if (mmap_size < 0)
+ die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+ vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0);
+ if (vcpu->kvm_run == MAP_FAILED)
+ die("unable to mmap vcpu fd");
+
+ if (ioctl(vcpu->vcpu_fd, KVM_ENABLE_CAP, &papr_cap) < 0)
+ die("unable to enable PAPR capability");
+
+ /*
+ * We start all CPUs, directing non-primary threads into the kernel's
+ * secondary start point. When we come to support SLOF, we will start
+ * only one and SLOF will RTAS call us to ask for others to be
+ * started. (FIXME: make more generic & interface with whichever
+ * firmware a platform may be using.)
+ */
+ vcpu->is_running = true;
+
+ return vcpu;
+}
+
+static void kvm_cpu__setup_fpu(struct kvm_cpu *vcpu)
+{
+ /* Don't have to do anything, there's no expected FPU state. */
+}
+
+static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu)
+{
+ /*
+ * FIXME: This assumes PPC64 and Linux guest. It doesn't use the
+ * OpenFirmware entry method, but instead the "embedded" entry which
+ * passes the FDT address directly.
+ */
+ struct kvm_regs *r = &vcpu->regs;
+
+ if (vcpu->cpu_id == 0) {
+ r->pc = KERNEL_START_ADDR;
+ r->gpr[3] = vcpu->kvm->arch.fdt_gra;
+ r->gpr[5] = 0;
+ } else {
+ r->pc = KERNEL_SECONDARY_START_ADDR;
+ r->gpr[3] = vcpu->cpu_id;
+ }
+ r->msr = 0x8000000000001000UL; /* 64bit, non-HV, ME */
+
+ if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0)
+ die_perror("KVM_SET_REGS failed");
+}
+
+static void kvm_cpu__setup_sregs(struct kvm_cpu *vcpu)
+{
+ /*
+ * Some sregs setup to initialise SDR1/PVR/HIOR on PPC64 SPAPR
+ * platforms using PR KVM. (Technically, this is all ignored on
+ * SPAPR HV KVM.) Different setup is required for non-PV non-SPAPR
+ * platforms! (FIXME.)
+ */
+ struct kvm_sregs sregs;
+ struct kvm_one_reg reg = {};
+ u64 value;
+
+ if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
+ die("KVM_GET_SREGS failed");
+
+ sregs.u.s.sdr1 = vcpu->kvm->arch.sdr1;
+ sregs.pvr = vcpu->kvm->arch.pvr;
+
+ if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &sregs) < 0)
+ die("KVM_SET_SREGS failed");
+
+ reg.id = KVM_REG_PPC_HIOR;
+ value = 0;
+ reg.addr = (u64)&value;
+	if (ioctl(vcpu->vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
+ die("KVM_SET_ONE_REG failed");
+}
+
+/**
+ * kvm_cpu__reset_vcpu - reset virtual CPU to a known state
+ */
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+ kvm_cpu__setup_regs(vcpu);
+ kvm_cpu__setup_sregs(vcpu);
+ kvm_cpu__setup_fpu(vcpu);
+}
+
+/* kvm_cpu__irq - set KVM's IRQ flag on this vcpu */
+void kvm_cpu__irq(struct kvm_cpu *vcpu, int pin, int level)
+{
+ unsigned int virq = level ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET;
+
+ /* FIXME: POWER-specific */
+ if (pin != POWER7_EXT_IRQ)
+ return;
+ if (ioctl(vcpu->vcpu_fd, KVM_INTERRUPT, &virq) < 0)
+ pr_warning("Could not KVM_INTERRUPT.");
+}
+
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+}
+
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+ bool ret = true;
+ struct kvm_run *run = vcpu->kvm_run;
+ switch(run->exit_reason) {
+ case KVM_EXIT_PAPR_HCALL:
+ run->papr_hcall.ret = spapr_hypercall(vcpu, run->papr_hcall.nr,
+ (target_ulong*)run->papr_hcall.args);
+ break;
+ default:
+ ret = false;
+ }
+ return ret;
+}
+
+bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+ /*
+ * FIXME: This function will need to be split in order to support
+ * various PowerPC platforms/PHB types, etc. It currently assumes SPAPR
+ * PPC64 guest.
+ */
+ bool ret = false;
+
+ if ((phys_addr >= SPAPR_PCI_WIN_START) &&
+ (phys_addr < SPAPR_PCI_WIN_END)) {
+ ret = spapr_phb_mmio(vcpu, phys_addr, data, len, is_write);
+ } else {
+ pr_warning("MMIO %s unknown address %llx (size %d)!\n",
+ is_write ? "write to" : "read from",
+ phys_addr, len);
+ }
+ return ret;
+}
+
+#define CONDSTR_BIT(m, b) (((m) & MSR_##b) ? #b" " : "")
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ int r;
+
+	if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &regs) < 0)
+ die("KVM_GET_REGS failed");
+ if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
+ die("KVM_GET_SREGS failed");
+
+ dprintf(debug_fd, "\n Registers:\n");
+ dprintf(debug_fd, " NIP: %016llx MSR: %016llx "
+ "( %s%s%s%s%s%s%s%s%s%s%s%s)\n",
+ regs.pc, regs.msr,
+ CONDSTR_BIT(regs.msr, SF),
+ CONDSTR_BIT(regs.msr, HV), /* ! */
+ CONDSTR_BIT(regs.msr, VEC),
+ CONDSTR_BIT(regs.msr, VSX),
+ CONDSTR_BIT(regs.msr, EE),
+ CONDSTR_BIT(regs.msr, PR),
+ CONDSTR_BIT(regs.msr, FP),
+ CONDSTR_BIT(regs.msr, ME),
+ CONDSTR_BIT(regs.msr, IR),
+ CONDSTR_BIT(regs.msr, DR),
+ CONDSTR_BIT(regs.msr, RI),
+ CONDSTR_BIT(regs.msr, LE));
+ dprintf(debug_fd, " CTR: %016llx LR: %016llx CR: %08llx\n",
+ regs.ctr, regs.lr, regs.cr);
+ dprintf(debug_fd, " SRR0: %016llx SRR1: %016llx XER: %016llx\n",
+ regs.srr0, regs.srr1, regs.xer);
+ dprintf(debug_fd, " SPRG0: %016llx SPRG1: %016llx\n",
+ regs.sprg0, regs.sprg1);
+ dprintf(debug_fd, " SPRG2: %016llx SPRG3: %016llx\n",
+ regs.sprg2, regs.sprg3);
+ dprintf(debug_fd, " SPRG4: %016llx SPRG5: %016llx\n",
+ regs.sprg4, regs.sprg5);
+ dprintf(debug_fd, " SPRG6: %016llx SPRG7: %016llx\n",
+ regs.sprg6, regs.sprg7);
+ dprintf(debug_fd, " GPRs:\n ");
+ for (r = 0; r < 32; r++) {
+ dprintf(debug_fd, "%016llx ", regs.gpr[r]);
+ if ((r & 3) == 3)
+ dprintf(debug_fd, "\n ");
+ }
+ dprintf(debug_fd, "\n");
+
+ /* FIXME: Assumes SLB-based (book3s) guest */
+ for (r = 0; r < 32; r++) {
+ dprintf(debug_fd, " SLB%02d %016llx %016llx\n", r,
+ sregs.u.s.ppc64.slb[r].slbe,
+ sregs.u.s.ppc64.slb[r].slbv);
+ }
+ dprintf(debug_fd, "----------\n");
+}
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+ if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &vcpu->regs) < 0)
+ die("KVM_GET_REGS failed");
+
+ /* FIXME: Dump/disassemble some code...! */
+
+ dprintf(debug_fd, "\n Stack:\n");
+ dprintf(debug_fd, " ------\n");
+ /* Only works in real mode: */
+ kvm__dump_mem(vcpu->kvm, vcpu->regs.gpr[1], 32, debug_fd);
+}
+
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+ /* Does nothing yet */
+}
diff --git a/tools/kvm/powerpc/kvm.c b/tools/kvm/powerpc/kvm.c
new file mode 100644
index 0000000..2b03a12
--- /dev/null
+++ b/tools/kvm/powerpc/kvm.c
@@ -0,0 +1,524 @@
+/*
+ * PPC64 (SPAPR) platform support
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * Portions of FDT setup borrowed from QEMU, copyright 2010 David Gibson, IBM
+ * Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/fdt.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+#include "cpu_info.h"
+
+#include "spapr.h"
+#include "spapr_hvcons.h"
+#include "spapr_pci.h"
+
+#include <linux/kvm.h>
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <asm/unistd.h>
+#include <errno.h>
+
+#include <linux/byteorder.h>
+
+#define HPT_ORDER 24
+
+#define HUGETLBFS_PATH "/var/lib/hugetlbfs/global/pagesize-16MB/"
+
+#define PHANDLE_XICP 0x00001111
+
+static char kern_cmdline[2048];
+
+struct kvm_ext kvm_req_ext[] = {
+ { DEFINE_KVM_EXT(KVM_CAP_PPC_UNSET_IRQ) },
+ { DEFINE_KVM_EXT(KVM_CAP_PPC_IRQ_LEVEL) },
+ { 0, 0 }
+};
+
+static uint32_t mfpvr(void)
+{
+ uint32_t r;
+ asm volatile ("mfpvr %0" : "=r"(r));
+ return r;
+}
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+ return true;
+}
+
+void kvm__init_ram(struct kvm *kvm)
+{
+ u64 phys_start, phys_size;
+ void *host_mem;
+
+ phys_start = 0;
+ phys_size = kvm->ram_size;
+ host_mem = kvm->ram_start;
+
+ /*
+ * We put MMIO at PPC_MMIO_START, high up. Make sure that this doesn't
+ * crash into the end of RAM -- on PPC64 at least, this is so high
+ * (63TB!) that this is unlikely.
+ */
+ if (phys_size >= PPC_MMIO_START)
+ die("Too much memory (%lld, what a nice problem): "
+ "overlaps MMIO!\n",
+ phys_size);
+
+ kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+}
+
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+ /* We don't need anything unusual in here. */
+}
+
+/* Architecture-specific KVM init */
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+ int cap_ppc_rma;
+ unsigned long hpt;
+
+ kvm->ram_size = ram_size;
+
+ /* Map "default" hugetblfs path to the standard 16M mount point */
+ if (hugetlbfs_path && !strcmp(hugetlbfs_path, "default"))
+ hugetlbfs_path = HUGETLBFS_PATH;
+
+ kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, kvm->ram_size);
+
+ if (kvm->ram_start == MAP_FAILED)
+ die("Couldn't map %lld bytes for RAM (%d)\n",
+ kvm->ram_size, errno);
+
+ /* FDT goes at top of memory, RTAS just below */
+ kvm->arch.fdt_gra = kvm->ram_size - FDT_MAX_SIZE;
+ /* FIXME: Not all PPC systems have RTAS */
+ kvm->arch.rtas_gra = kvm->arch.fdt_gra - RTAS_MAX_SIZE;
+ madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);
+
+ /* FIXME: SPAPR-PR specific; allocate a guest HPT. */
+ if (posix_memalign((void **)&hpt, (1<<HPT_ORDER), (1<<HPT_ORDER)))
+ die("Can't allocate %d bytes for HPT\n", (1<<HPT_ORDER));
+
+ kvm->arch.sdr1 = ((hpt + 0x3ffffULL) & ~0x3ffffULL) | (HPT_ORDER-18);
+
+ kvm->arch.pvr = mfpvr();
+
+ /* FIXME: This is book3s-specific */
+ cap_ppc_rma = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_RMA);
+ if (cap_ppc_rma == 2)
+ die("Need contiguous RMA allocation on this hardware, "
+ "which is not yet supported.");
+
+ /* Do these before FDT setup, IRQ setup, etc. */
+ /* FIXME: SPAPR-specific */
+ hypercall_init();
+ register_core_rtas();
+ /* Now that hypercalls are initialised, register a couple for the console: */
+ spapr_hvcons_init();
+ spapr_create_phb(kvm, "pci", SPAPR_PCI_BUID,
+ SPAPR_PCI_MEM_WIN_ADDR,
+ SPAPR_PCI_MEM_WIN_SIZE,
+ SPAPR_PCI_IO_WIN_ADDR,
+ SPAPR_PCI_IO_WIN_SIZE);
+}
+
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+ munmap(kvm->ram_start, kvm->ram_size);
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+ kvm__irq_line(kvm, irq, 1);
+ kvm__irq_line(kvm, irq, 0);
+}
+
+void kvm__arch_read_term(struct kvm *kvm)
+{
+ /* FIXME: Should register callbacks to platform-specific polls */
+ spapr_hvcons_poll(kvm);
+}
+
+int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline)
+{
+ void *p;
+ void *k_start;
+ void *i_start;
+ int nr;
+
+ if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+ die_perror("lseek");
+
+ p = k_start = guest_flat_to_host(kvm, KERNEL_LOAD_ADDR);
+
+ while ((nr = read(fd_kernel, p, 65536)) > 0)
+ p += nr;
+
+ pr_info("Loaded kernel to 0x%x (%ld bytes)", KERNEL_LOAD_ADDR, p-k_start);
+
+ if (fd_initrd != -1) {
+ if (lseek(fd_initrd, 0, SEEK_SET) < 0)
+ die_perror("lseek");
+
+ if (p-k_start > INITRD_LOAD_ADDR)
+ die("Kernel overlaps initrd!");
+
+ /* Round up kernel size to 8byte alignment, and load initrd right after. */
+ i_start = p = guest_flat_to_host(kvm, INITRD_LOAD_ADDR);
+
+ while (((nr = read(fd_initrd, p, 65536)) > 0) &&
+ p < (kvm->ram_start + kvm->ram_size))
+ p += nr;
+
+ if (p >= (kvm->ram_start + kvm->ram_size))
+ die("initrd too big to contain in guest RAM.\n");
+
+ pr_info("Loaded initrd to 0x%x (%ld bytes)",
+ INITRD_LOAD_ADDR, p-i_start);
+ kvm->arch.initrd_gra = INITRD_LOAD_ADDR;
+ kvm->arch.initrd_size = p-i_start;
+ } else {
+ kvm->arch.initrd_size = 0;
+ }
+ strncpy(kern_cmdline, kernel_cmdline, 2048);
+ kern_cmdline[2047] = '\0';
+
+ return true;
+}
+
+struct fdt_prop {
+ void *value;
+ int size;
+};
+
+static void generate_segment_page_sizes(struct kvm_ppc_smmu_info *info, struct fdt_prop *prop)
+{
+ struct kvm_ppc_one_seg_page_size *sps;
+ int i, j, size;
+ u32 *p;
+
+ for (size = 0, i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+ sps = &info->sps[i];
+
+ if (sps->page_shift == 0)
+ break;
+
+ /* page shift, slb enc & count */
+ size += 3;
+
+ for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+ if (info->sps[i].enc[j].page_shift == 0)
+ break;
+
+ /* page shift & pte enc */
+ size += 2;
+ }
+ }
+
+ if (!size) {
+ prop->value = NULL;
+ prop->size = 0;
+ return;
+ }
+
+ /* Convert size to bytes */
+ prop->size = size * sizeof(u32);
+
+ prop->value = malloc(prop->size);
+ if (!prop->value)
+ die_perror("malloc failed");
+
+ p = (u32 *)prop->value;
+ for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
+ sps = &info->sps[i];
+
+ if (sps->page_shift == 0)
+ break;
+
+ *p++ = sps->page_shift;
+ *p++ = sps->slb_enc;
+
+ for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++)
+ if (!info->sps[i].enc[j].page_shift)
+ break;
+
+ *p++ = j; /* count of enc */
+
+ for (j = 0; j < KVM_PPC_PAGE_SIZES_MAX_SZ; j++) {
+ if (!info->sps[i].enc[j].page_shift)
+ break;
+
+ *p++ = info->sps[i].enc[j].page_shift;
+ *p++ = info->sps[i].enc[j].pte_enc;
+ }
+ }
+}
+
+#define SMT_THREADS 4
+
+/*
+ * Set up the FDT for the kernel: This function is currently fairly SPAPR-heavy,
+ * and whilst most PPC targets will require CPU/memory nodes, others like RTAS
+ * should eventually be added separately.
+ */
+static int setup_fdt(struct kvm *kvm)
+{
+ uint64_t mem_reg_property[] = { 0, cpu_to_be64(kvm->ram_size) };
+ int smp_cpus = kvm->nrcpus;
+ uint32_t int_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
+ char hypertas_prop_kvm[] = "hcall-pft\0hcall-term\0"
+ "hcall-dabr\0hcall-interrupt\0hcall-tce\0hcall-vio\0"
+ "hcall-splpar\0hcall-bulk";
+ int i, j;
+ char cpu_name[30];
+ u8 staging_fdt[FDT_MAX_SIZE];
+ struct cpu_info *cpu_info = find_cpu_info(kvm);
+ struct fdt_prop segment_page_sizes;
+ u32 segment_sizes_1T[] = {0x1c, 0x28, 0xffffffff, 0xffffffff};
+
+ /* Generate an appropriate DT at kvm->arch.fdt_gra */
+ void *fdt_dest = guest_flat_to_host(kvm, kvm->arch.fdt_gra);
+ void *fdt = staging_fdt;
+
+ _FDT(fdt_create(fdt, FDT_MAX_SIZE));
+ _FDT(fdt_finish_reservemap(fdt));
+
+ _FDT(fdt_begin_node(fdt, ""));
+
+ _FDT(fdt_property_string(fdt, "device_type", "chrp"));
+ _FDT(fdt_property_string(fdt, "model", "IBM pSeries (kvmtool)"));
+ _FDT(fdt_property_cell(fdt, "#address-cells", 0x2));
+ _FDT(fdt_property_cell(fdt, "#size-cells", 0x2));
+
+ /* RTAS */
+ _FDT(fdt_begin_node(fdt, "rtas"));
+ /* This is what the kernel uses to switch 'We're an LPAR'! */
+ _FDT(fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop_kvm,
+ sizeof(hypertas_prop_kvm)));
+ _FDT(fdt_property_cell(fdt, "linux,rtas-base", kvm->arch.rtas_gra));
+ _FDT(fdt_property_cell(fdt, "linux,rtas-entry", kvm->arch.rtas_gra));
+ _FDT(fdt_property_cell(fdt, "rtas-size", kvm->arch.rtas_size));
+ /* Now add properties for all RTAS tokens: */
+ if (spapr_rtas_fdt_setup(kvm, fdt))
+ die("Couldn't create RTAS FDT properties\n");
+
+ _FDT(fdt_end_node(fdt));
+
+ /* /chosen */
+ _FDT(fdt_begin_node(fdt, "chosen"));
+ /* cmdline */
+ _FDT(fdt_property_string(fdt, "bootargs", kern_cmdline));
+ /* Initrd */
+ if (kvm->arch.initrd_size != 0) {
+ uint32_t ird_st_prop = cpu_to_be32(kvm->arch.initrd_gra);
+ uint32_t ird_end_prop = cpu_to_be32(kvm->arch.initrd_gra +
+ kvm->arch.initrd_size);
+ _FDT(fdt_property(fdt, "linux,initrd-start",
+ &ird_st_prop, sizeof(ird_st_prop)));
+ _FDT(fdt_property(fdt, "linux,initrd-end",
+ &ird_end_prop, sizeof(ird_end_prop)));
+ }
+
+ /*
+ * stdout-path: This is assuming we're using the HV console. Also, the
+ * address is hardwired until we do a VIO bus.
+ */
+ _FDT(fdt_property_string(fdt, "linux,stdout-path",
+ "/vdevice/vty@30000000"));
+ _FDT(fdt_end_node(fdt));
+
+ /*
+ * Memory: We don't alloc. a separate RMA yet. If we ever need to
+ * (CAP_PPC_RMA == 2) then have one memory node for 0->RMAsize, and
+ * another RMAsize->endOfMem.
+ */
+ _FDT(fdt_begin_node(fdt, "memory@0"));
+ _FDT(fdt_property_string(fdt, "device_type", "memory"));
+ _FDT(fdt_property(fdt, "reg", mem_reg_property,
+ sizeof(mem_reg_property)));
+ _FDT(fdt_end_node(fdt));
+
+ generate_segment_page_sizes(&cpu_info->mmu_info, &segment_page_sizes);
+
+ /* CPUs */
+ _FDT(fdt_begin_node(fdt, "cpus"));
+ _FDT(fdt_property_cell(fdt, "#address-cells", 0x1));
+ _FDT(fdt_property_cell(fdt, "#size-cells", 0x0));
+
+ for (i = 0; i < smp_cpus; i += SMT_THREADS) {
+ int32_t pft_size_prop[] = { 0, HPT_ORDER };
+ uint32_t servers_prop[SMT_THREADS];
+ uint32_t gservers_prop[SMT_THREADS * 2];
+ int threads = (smp_cpus - i) >= SMT_THREADS ? SMT_THREADS :
+ smp_cpus - i;
+
+ sprintf(cpu_name, "PowerPC,%s@%d", cpu_info->name, i);
+ _FDT(fdt_begin_node(fdt, cpu_name));
+ sprintf(cpu_name, "PowerPC,%s", cpu_info->name);
+ _FDT(fdt_property_string(fdt, "name", cpu_name));
+ _FDT(fdt_property_string(fdt, "device_type", "cpu"));
+
+ _FDT(fdt_property_cell(fdt, "reg", i));
+ _FDT(fdt_property_cell(fdt, "cpu-version", kvm->arch.pvr));
+
+ _FDT(fdt_property_cell(fdt, "dcache-block-size", cpu_info->d_bsize));
+ _FDT(fdt_property_cell(fdt, "icache-block-size", cpu_info->i_bsize));
+
+ if (cpu_info->tb_freq)
+ _FDT(fdt_property_cell(fdt, "timebase-frequency", cpu_info->tb_freq));
+
+ /* Lies, but safeish lies! */
+ _FDT(fdt_property_cell(fdt, "clock-frequency", 0xddbab200));
+
+ if (cpu_info->mmu_info.slb_size)
+ _FDT(fdt_property_cell(fdt, "ibm,slb-size", cpu_info->mmu_info.slb_size));
+
+ /*
+ * HPT size is hardwired; KVM currently fixes it at 16MB but the
+ * moment that changes we'll need to read it out of the kernel.
+ */
+ _FDT(fdt_property(fdt, "ibm,pft-size", pft_size_prop,
+ sizeof(pft_size_prop)));
+
+ _FDT(fdt_property_string(fdt, "status", "okay"));
+ _FDT(fdt_property(fdt, "64-bit", NULL, 0));
+ /* A server for each thread in this core */
+ for (j = 0; j < SMT_THREADS; j++) {
+ servers_prop[j] = cpu_to_be32(i+j);
+ /*
+ * Hack borrowed from QEMU, direct the group queues back
+ * to cpu 0:
+ */
+ gservers_prop[j*2] = cpu_to_be32(i+j);
+ gservers_prop[j*2 + 1] = 0;
+ }
+ _FDT(fdt_property(fdt, "ibm,ppc-interrupt-server#s",
+ servers_prop, threads * sizeof(uint32_t)));
+ _FDT(fdt_property(fdt, "ibm,ppc-interrupt-gserver#s",
+ gservers_prop,
+ threads * 2 * sizeof(uint32_t)));
+
+ if (segment_page_sizes.value)
+ _FDT(fdt_property(fdt, "ibm,segment-page-sizes",
+ segment_page_sizes.value,
+ segment_page_sizes.size));
+
+ if (cpu_info->mmu_info.flags & KVM_PPC_1T_SEGMENTS)
+ _FDT(fdt_property(fdt, "ibm,processor-segment-sizes",
+ segment_sizes_1T, sizeof(segment_sizes_1T)));
+
+ /* VSX / DFP options: */
+ if (cpu_info->flags & CPUINFO_FLAG_VMX)
+ _FDT(fdt_property_cell(fdt, "ibm,vmx",
+ (cpu_info->flags &
+ CPUINFO_FLAG_VSX) ? 2 : 1));
+ if (cpu_info->flags & CPUINFO_FLAG_DFP)
+ _FDT(fdt_property_cell(fdt, "ibm,dfp", 0x1));
+ _FDT(fdt_end_node(fdt));
+ }
+ _FDT(fdt_end_node(fdt));
+
+ /* IRQ controller */
+ _FDT(fdt_begin_node(fdt, "interrupt-controller@0"));
+
+ _FDT(fdt_property_string(fdt, "device_type",
+ "PowerPC-External-Interrupt-Presentation"));
+ _FDT(fdt_property_string(fdt, "compatible", "IBM,ppc-xicp"));
+ _FDT(fdt_property_cell(fdt, "reg", 0));
+ _FDT(fdt_property(fdt, "interrupt-controller", NULL, 0));
+ _FDT(fdt_property(fdt, "ibm,interrupt-server-ranges",
+ int_server_ranges_prop,
+ sizeof(int_server_ranges_prop)));
+ _FDT(fdt_property_cell(fdt, "#interrupt-cells", 2));
+ _FDT(fdt_property_cell(fdt, "linux,phandle", PHANDLE_XICP));
+ _FDT(fdt_property_cell(fdt, "phandle", PHANDLE_XICP));
+ _FDT(fdt_end_node(fdt));
+
+ /*
+ * VIO: See comment in linux,stdout-path; we don't yet represent a VIO
+ * bus/address allocation so addresses are hardwired here.
+ */
+ _FDT(fdt_begin_node(fdt, "vdevice"));
+ _FDT(fdt_property_cell(fdt, "#address-cells", 0x1));
+ _FDT(fdt_property_cell(fdt, "#size-cells", 0x0));
+ _FDT(fdt_property_string(fdt, "device_type", "vdevice"));
+ _FDT(fdt_property_string(fdt, "compatible", "IBM,vdevice"));
+ _FDT(fdt_begin_node(fdt, "vty@30000000"));
+ _FDT(fdt_property_string(fdt, "name", "vty"));
+ _FDT(fdt_property_string(fdt, "device_type", "serial"));
+ _FDT(fdt_property_string(fdt, "compatible", "hvterm1"));
+ _FDT(fdt_property_cell(fdt, "reg", 0x30000000));
+ _FDT(fdt_end_node(fdt));
+ _FDT(fdt_end_node(fdt));
+
+ /* Finalise: */
+ _FDT(fdt_end_node(fdt)); /* Root node */
+ _FDT(fdt_finish(fdt));
+
+ _FDT(fdt_open_into(fdt, fdt_dest, FDT_MAX_SIZE));
+
+ /* PCI */
+ if (spapr_populate_pci_devices(kvm, PHANDLE_XICP, fdt_dest))
+ die("Fail populating PCI device nodes");
+
+ _FDT(fdt_add_mem_rsv(fdt_dest, kvm->arch.rtas_gra, kvm->arch.rtas_size));
+ _FDT(fdt_pack(fdt_dest));
+
+ free(segment_page_sizes.value);
+
+ return 0;
+}
+firmware_init(setup_fdt);
+
+/**
+ * kvm__arch_setup_firmware
+ */
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+ /*
+ * Set up RTAS stub. All it is is a single hypercall:
+ * 0: 7c 64 1b 78 mr r4,r3
+ * 4: 3c 60 00 00 lis r3,0
+ * 8: 60 63 f0 00 ori r3,r3,61440
+ * c: 44 00 00 22 sc 1
+ * 10: 4e 80 00 20 blr
+ */
+ uint32_t *rtas = guest_flat_to_host(kvm, kvm->arch.rtas_gra);
+
+ rtas[0] = 0x7c641b78;
+ rtas[1] = 0x3c600000;
+ rtas[2] = 0x6063f000;
+ rtas[3] = 0x44000022;
+ rtas[4] = 0x4e800020;
+ kvm->arch.rtas_size = 20;
+
+ pr_info("Set up %ld bytes of RTAS at 0x%lx\n",
+ kvm->arch.rtas_size, kvm->arch.rtas_gra);
+
+ /* Load SLOF */
+
+ return 0;
+}
+
+int kvm__arch_free_firmware(struct kvm *kvm)
+{
+ return 0;
+}
diff --git a/tools/kvm/powerpc/spapr.h b/tools/kvm/powerpc/spapr.h
new file mode 100644
index 0000000..0537f88
--- /dev/null
+++ b/tools/kvm/powerpc/spapr.h
@@ -0,0 +1,93 @@
+/*
+ * SPAPR definitions and declarations
+ *
+ * Borrowed heavily from QEMU's spapr.h,
+ * Copyright (c) 2010 David Gibson, IBM Corporation.
+ *
+ * Modifications by Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#if !defined(__HW_SPAPR_H__)
+#define __HW_SPAPR_H__
+
+#include <inttypes.h>
+
+/* We need some of the H_ hcall defs, but they're __KERNEL__ only. */
+#define __KERNEL__
+#include <asm/hvcall.h>
+#undef __KERNEL__
+
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+typedef unsigned long target_ulong;
+typedef uintptr_t target_phys_addr_t;
+
+/*
+ * The hcalls above are standardized in PAPR and implemented by pHyp
+ * as well.
+ *
+ * We also need some hcalls which are specific to qemu / KVM-on-POWER.
+ * So far we just need one for H_RTAS, but in future we'll need more
+ * for extensions like virtio. We put those into the 0xf000-0xfffc
+ * range which is reserved by PAPR for "platform-specific" hcalls.
+ */
+#define KVMPPC_HCALL_BASE 0xf000
+#define KVMPPC_H_RTAS (KVMPPC_HCALL_BASE + 0x0)
+#define KVMPPC_HCALL_MAX KVMPPC_H_RTAS
+
+#define DEBUG_SPAPR_HCALLS
+
+#ifdef DEBUG_SPAPR_HCALLS
+#define hcall_dprintf(fmt, ...) \
+ do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define hcall_dprintf(fmt, ...) \
+ do { } while (0)
+#endif
+
+typedef target_ulong (*spapr_hcall_fn)(struct kvm_cpu *vcpu,
+ target_ulong opcode,
+ target_ulong *args);
+
+void hypercall_init(void);
+void register_core_rtas(void);
+
+void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn);
+target_ulong spapr_hypercall(struct kvm_cpu *vcpu, target_ulong opcode,
+ target_ulong *args);
+
+int spapr_rtas_fdt_setup(struct kvm *kvm, void *fdt);
+
+static inline uint32_t rtas_ld(struct kvm *kvm, target_ulong phys, int n)
+{
+ return *((uint32_t *)guest_flat_to_host(kvm, phys + 4*n));
+}
+
+static inline void rtas_st(struct kvm *kvm, target_ulong phys, int n, uint32_t val)
+{
+ *((uint32_t *)guest_flat_to_host(kvm, phys + 4*n)) = val;
+}
+
+typedef void (*spapr_rtas_fn)(struct kvm_cpu *vcpu, uint32_t token,
+ uint32_t nargs, target_ulong args,
+ uint32_t nret, target_ulong rets);
+void spapr_rtas_register(const char *name, spapr_rtas_fn fn);
+target_ulong spapr_rtas_call(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs, target_ulong args,
+ uint32_t nret, target_ulong rets);
+
+#define SPAPR_PCI_BUID 0x800000020000001ULL
+#define SPAPR_PCI_MEM_WIN_ADDR (KVM_MMIO_START + 0xA0000000)
+#define SPAPR_PCI_MEM_WIN_SIZE 0x20000000
+#define SPAPR_PCI_IO_WIN_ADDR (SPAPR_PCI_MEM_WIN_ADDR + SPAPR_PCI_MEM_WIN_SIZE)
+#define SPAPR_PCI_IO_WIN_SIZE 0x2000000
+
+#define SPAPR_PCI_WIN_START SPAPR_PCI_MEM_WIN_ADDR
+#define SPAPR_PCI_WIN_END (SPAPR_PCI_IO_WIN_ADDR + SPAPR_PCI_IO_WIN_SIZE)
+
+#endif /* !defined (__HW_SPAPR_H__) */
diff --git a/tools/kvm/powerpc/spapr_hcall.c b/tools/kvm/powerpc/spapr_hcall.c
new file mode 100644
index 0000000..ff1d63a
--- /dev/null
+++ b/tools/kvm/powerpc/spapr_hcall.c
@@ -0,0 +1,134 @@
+/*
+ * SPAPR hypercalls
+ *
+ * Borrowed heavily from QEMU's spapr_hcall.c,
+ * Copyright (c) 2010 David Gibson, IBM Corporation.
+ *
+ * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "spapr.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1];
+static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX -
+ KVMPPC_HCALL_BASE + 1];
+
+static target_ulong h_set_dabr(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+ /* FIXME: Implement this for -PR. (-HV does this in kernel.) */
+ return H_HARDWARE;
+}
+
+static target_ulong h_rtas(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+ target_ulong rtas_r3 = args[0];
+ /*
+ * Pointer read from phys mem; these ptrs cannot be MMIO (!) so just
+ * reference guest RAM directly.
+ */
+ uint32_t token, nargs, nret;
+
+ token = rtas_ld(vcpu->kvm, rtas_r3, 0);
+ nargs = rtas_ld(vcpu->kvm, rtas_r3, 1);
+ nret = rtas_ld(vcpu->kvm, rtas_r3, 2);
+
+ return spapr_rtas_call(vcpu, token, nargs, rtas_r3 + 12,
+ nret, rtas_r3 + 12 + 4*nargs);
+}
+
+static target_ulong h_logical_load(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+ /* SLOF will require these, though kernel doesn't. */
+ die(__PRETTY_FUNCTION__);
+ return H_PARAMETER;
+}
+
+static target_ulong h_logical_store(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+ /* SLOF will require these, though kernel doesn't. */
+ die(__PRETTY_FUNCTION__);
+ return H_PARAMETER;
+}
+
+static target_ulong h_logical_icbi(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+ /* KVM will trap this in the kernel. Die if it misses. */
+ die(__PRETTY_FUNCTION__);
+ return H_SUCCESS;
+}
+
+static target_ulong h_logical_dcbf(struct kvm_cpu *vcpu, target_ulong opcode, target_ulong *args)
+{
+ /* KVM will trap this in the kernel. Die if it misses. */
+ die(__PRETTY_FUNCTION__);
+ return H_SUCCESS;
+}
+
+void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn)
+{
+ spapr_hcall_fn *slot;
+
+ if (opcode <= MAX_HCALL_OPCODE) {
+ assert((opcode & 0x3) == 0);
+
+ slot = &papr_hypercall_table[opcode / 4];
+ } else {
+ assert((opcode >= KVMPPC_HCALL_BASE) &&
+ (opcode <= KVMPPC_HCALL_MAX));
+
+ slot = &kvmppc_hypercall_table[opcode - KVMPPC_HCALL_BASE];
+ }
+
+ assert(!(*slot) || (fn == *slot));
+ *slot = fn;
+}
+
+target_ulong spapr_hypercall(struct kvm_cpu *vcpu, target_ulong opcode,
+ target_ulong *args)
+{
+ if ((opcode <= MAX_HCALL_OPCODE)
+ && ((opcode & 0x3) == 0)) {
+ spapr_hcall_fn fn = papr_hypercall_table[opcode / 4];
+
+ if (fn) {
+ return fn(vcpu, opcode, args);
+ }
+ } else if ((opcode >= KVMPPC_HCALL_BASE) &&
+ (opcode <= KVMPPC_HCALL_MAX)) {
+ spapr_hcall_fn fn = kvmppc_hypercall_table[opcode -
+ KVMPPC_HCALL_BASE];
+
+ if (fn) {
+ return fn(vcpu, opcode, args);
+ }
+ }
+
+ hcall_dprintf("Unimplemented hcall 0x%lx\n", opcode);
+ return H_FUNCTION;
+}
+
+void hypercall_init(void)
+{
+ /* hcall-dabr */
+ spapr_register_hypercall(H_SET_DABR, h_set_dabr);
+
+ spapr_register_hypercall(H_LOGICAL_CI_LOAD, h_logical_load);
+ spapr_register_hypercall(H_LOGICAL_CI_STORE, h_logical_store);
+ spapr_register_hypercall(H_LOGICAL_CACHE_LOAD, h_logical_load);
+ spapr_register_hypercall(H_LOGICAL_CACHE_STORE, h_logical_store);
+ spapr_register_hypercall(H_LOGICAL_ICBI, h_logical_icbi);
+ spapr_register_hypercall(H_LOGICAL_DCBF, h_logical_dcbf);
+
+ /* KVM-PPC specific hcalls */
+ spapr_register_hypercall(KVMPPC_H_RTAS, h_rtas);
+}
diff --git a/tools/kvm/powerpc/spapr_hvcons.c b/tools/kvm/powerpc/spapr_hvcons.c
new file mode 100644
index 0000000..605367b
--- /dev/null
+++ b/tools/kvm/powerpc/spapr_hvcons.c
@@ -0,0 +1,105 @@
+/*
+ * SPAPR HV console
+ *
+ * Borrowed lightly from QEMU's spapr_vty.c, Copyright (c) 2010 David Gibson,
+ * IBM Corporation.
+ *
+ * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/term.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+#include "spapr.h"
+#include "spapr_hvcons.h"
+
+#include <stdio.h>
+#include <sys/uio.h>
+#include <errno.h>
+
+#include <linux/byteorder.h>
+
+union hv_chario {
+ struct {
+ uint64_t char0_7;
+ uint64_t char8_15;
+ } a;
+ uint8_t buf[16];
+};
+
+static unsigned long h_put_term_char(struct kvm_cpu *vcpu, unsigned long opcode, unsigned long *args)
+{
+ /* To do: Read register from args[0], and check it. */
+ unsigned long len = args[1];
+ union hv_chario data;
+ struct iovec iov;
+
+ if (len > 16) {
+ return H_PARAMETER;
+ }
+ data.a.char0_7 = cpu_to_be64(args[2]);
+ data.a.char8_15 = cpu_to_be64(args[3]);
+
+ iov.iov_base = data.buf;
+ iov.iov_len = len;
+ do {
+ int ret;
+
+ ret = term_putc_iov(&iov, 1, 0);
+ if (ret < 0) {
+ die("term_putc_iov error %d!\n", errno);
+ }
+ iov.iov_base += ret;
+ iov.iov_len -= ret;
+ } while (iov.iov_len > 0);
+
+ return H_SUCCESS;
+}
+
+
+static unsigned long h_get_term_char(struct kvm_cpu *vcpu, unsigned long opcode, unsigned long *args)
+{
+ /* To do: Read register from args[0], and check it. */
+ unsigned long *len = args + 0;
+ unsigned long *char0_7 = args + 1;
+ unsigned long *char8_15 = args + 2;
+ union hv_chario data;
+ struct iovec iov;
+
+ if (vcpu->kvm->cfg.active_console != CONSOLE_HV)
+ return H_SUCCESS;
+
+ if (term_readable(0)) {
+ iov.iov_base = data.buf;
+ iov.iov_len = 16;
+
+ *len = term_getc_iov(vcpu->kvm, &iov, 1, 0);
+ *char0_7 = be64_to_cpu(data.a.char0_7);
+ *char8_15 = be64_to_cpu(data.a.char8_15);
+ } else {
+ *len = 0;
+ }
+
+ return H_SUCCESS;
+}
+
+void spapr_hvcons_poll(struct kvm *kvm)
+{
+ if (term_readable(0)) {
+ /*
+ * We can inject an IRQ to guest here if we want. The guest
+ * will happily poll, though, so not required.
+ */
+ }
+}
+
+void spapr_hvcons_init(void)
+{
+ spapr_register_hypercall(H_PUT_TERM_CHAR, h_put_term_char);
+ spapr_register_hypercall(H_GET_TERM_CHAR, h_get_term_char);
+}
diff --git a/tools/kvm/powerpc/spapr_hvcons.h b/tools/kvm/powerpc/spapr_hvcons.h
new file mode 100644
index 0000000..d3e4414
--- /dev/null
+++ b/tools/kvm/powerpc/spapr_hvcons.h
@@ -0,0 +1,19 @@
+/*
+ * SPAPR HV console
+ *
+ * Copyright (c) 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef spapr_hvcons_H
+#define spapr_hvcons_H
+
+#include "kvm/kvm.h"
+
+void spapr_hvcons_init(void);
+void spapr_hvcons_poll(struct kvm *kvm);
+
+#endif
diff --git a/tools/kvm/powerpc/spapr_pci.c b/tools/kvm/powerpc/spapr_pci.c
new file mode 100644
index 0000000..768e3f2
--- /dev/null
+++ b/tools/kvm/powerpc/spapr_pci.c
@@ -0,0 +1,409 @@
+/*
+ * SPAPR PHB emulation, RTAS interface to PCI config space, device tree nodes
+ * for enumerated devices.
+ *
+ * Borrowed heavily from QEMU's spapr_pci.c,
+ * Copyright (c) 2011 Alexey Kardashevskiy, IBM Corporation.
+ * Copyright (c) 2011 David Gibson, IBM Corporation.
+ *
+ * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "spapr.h"
+#include "spapr_pci.h"
+#include "kvm/devices.h"
+#include "kvm/fdt.h"
+#include "kvm/util.h"
+#include "kvm/of_pci.h"
+#include "kvm/pci.h"
+
+#include <linux/pci_regs.h>
+#include <linux/byteorder.h>
+
+
+/* #define DEBUG_PHB yes */
+#ifdef DEBUG_PHB
+#define phb_dprintf(fmt, ...) \
+ do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define phb_dprintf(fmt, ...) \
+ do { } while (0)
+#endif
+
+static const uint32_t bars[] = {
+ PCI_BASE_ADDRESS_0, PCI_BASE_ADDRESS_1,
+ PCI_BASE_ADDRESS_2, PCI_BASE_ADDRESS_3,
+ PCI_BASE_ADDRESS_4, PCI_BASE_ADDRESS_5
+ /*, PCI_ROM_ADDRESS*/
+};
+
+#define PCI_NUM_REGIONS 7
+
+static struct spapr_phb phb;
+
+static void rtas_ibm_read_pci_config(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ uint32_t val = 0;
+ uint64_t buid = ((uint64_t)rtas_ld(vcpu->kvm, args, 1) << 32) | rtas_ld(vcpu->kvm, args, 2);
+ union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+ struct pci_device_header *dev = pci__find_dev(addr.device_number);
+ uint32_t size = rtas_ld(vcpu->kvm, args, 3);
+
+ if (buid != phb.buid || !dev || (size > 4)) {
+ phb_dprintf("- cfgRd buid 0x%lx cfg addr 0x%x size %d not found\n",
+ buid, addr.w, size);
+
+ rtas_st(vcpu->kvm, rets, 0, -1);
+ return;
+ }
+ pci__config_rd(vcpu->kvm, addr, &val, size);
+ /* It appears this wants a byteswapped result... */
+ switch (size) {
+ case 4:
+ val = le32_to_cpu(val);
+ break;
+ case 2:
+ val = le16_to_cpu(val>>16);
+ break;
+ case 1:
+ val = val >> 24;
+ break;
+ }
+ phb_dprintf("- cfgRd buid 0x%lx addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n",
+ buid, addr.w, size, addr.bus_number, addr.device_number, addr.function_number,
+ addr.register_number, val);
+
+ rtas_st(vcpu->kvm, rets, 0, 0);
+ rtas_st(vcpu->kvm, rets, 1, val);
+}
+
+static void rtas_read_pci_config(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ uint32_t val;
+ union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+ struct pci_device_header *dev = pci__find_dev(addr.device_number);
+ uint32_t size = rtas_ld(vcpu->kvm, args, 1);
+
+ if (!dev || (size > 4)) {
+ rtas_st(vcpu->kvm, rets, 0, -1);
+ return;
+ }
+ pci__config_rd(vcpu->kvm, addr, &val, size);
+ switch (size) {
+ case 4:
+ val = le32_to_cpu(val);
+ break;
+ case 2:
+ val = le16_to_cpu(val>>16); /* We're yuck-endian. */
+ break;
+ case 1:
+ val = val >> 24;
+ break;
+ }
+ phb_dprintf("- cfgRd addr 0x%x size %d, val 0x%x\n", addr.w, size, val);
+ rtas_st(vcpu->kvm, rets, 0, 0);
+ rtas_st(vcpu->kvm, rets, 1, val);
+}
+
+static void rtas_ibm_write_pci_config(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ uint64_t buid = ((uint64_t)rtas_ld(vcpu->kvm, args, 1) << 32) | rtas_ld(vcpu->kvm, args, 2);
+ union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+ struct pci_device_header *dev = pci__find_dev(addr.device_number);
+ uint32_t size = rtas_ld(vcpu->kvm, args, 3);
+ uint32_t val = rtas_ld(vcpu->kvm, args, 4);
+
+ if (buid != phb.buid || !dev || (size > 4)) {
+ phb_dprintf("- cfgWr buid 0x%lx cfg addr 0x%x/%d error (val 0x%x)\n",
+ buid, addr.w, size, val);
+
+ rtas_st(vcpu->kvm, rets, 0, -1);
+ return;
+ }
+ phb_dprintf("- cfgWr buid 0x%lx addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n",
+ buid, addr.w, size, addr.bus_number, addr.device_number, addr.function_number,
+ addr.register_number, val);
+ switch (size) {
+ case 4:
+ val = le32_to_cpu(val);
+ break;
+ case 2:
+ val = le16_to_cpu(val) << 16;
+ break;
+ case 1:
+ val = val >> 24;
+ break;
+ }
+ pci__config_wr(vcpu->kvm, addr, &val, size);
+ rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+static void rtas_write_pci_config(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ union pci_config_address addr = { .w = rtas_ld(vcpu->kvm, args, 0) };
+ struct pci_device_header *dev = pci__find_dev(addr.device_number);
+ uint32_t size = rtas_ld(vcpu->kvm, args, 1);
+ uint32_t val = rtas_ld(vcpu->kvm, args, 2);
+
+ if (!dev || (size > 4)) {
+ rtas_st(vcpu->kvm, rets, 0, -1);
+ return;
+ }
+
+ phb_dprintf("- cfgWr addr 0x%x (/%d): b%d,d%d,f%d,r0x%x, val 0x%x\n",
+ addr.w, size, addr.bus_number, addr.device_number, addr.function_number,
+ addr.register_number, val);
+ switch (size) {
+ case 4:
+ val = le32_to_cpu(val);
+ break;
+ case 2:
+ val = le16_to_cpu(val) << 16;
+ break;
+ case 1:
+ val = val >> 24;
+ break;
+ }
+ pci__config_wr(vcpu->kvm, addr, &val, size);
+ rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+void spapr_create_phb(struct kvm *kvm,
+ const char *busname, uint64_t buid,
+ uint64_t mem_win_addr, uint64_t mem_win_size,
+ uint64_t io_win_addr, uint64_t io_win_size)
+{
+ /*
+ * Since kvmtool doesn't really have any concept of buses etc.,
+ * there's nothing to register here. Just register RTAS.
+ */
+ spapr_rtas_register("read-pci-config", rtas_read_pci_config);
+ spapr_rtas_register("write-pci-config", rtas_write_pci_config);
+ spapr_rtas_register("ibm,read-pci-config", rtas_ibm_read_pci_config);
+ spapr_rtas_register("ibm,write-pci-config", rtas_ibm_write_pci_config);
+
+ phb.buid = buid;
+ phb.mem_addr = mem_win_addr;
+ phb.mem_size = mem_win_size;
+ phb.io_addr = io_win_addr;
+ phb.io_size = io_win_size;
+
+ kvm->arch.phb = &phb;
+}
+
+static uint32_t bar_to_ss(unsigned long bar)
+{
+ if ((bar & PCI_BASE_ADDRESS_SPACE) ==
+ PCI_BASE_ADDRESS_SPACE_IO)
+ return OF_PCI_SS_IO;
+ else if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64)
+ return OF_PCI_SS_M64;
+ else
+ return OF_PCI_SS_M32;
+}
+
+static unsigned long bar_to_addr(unsigned long bar)
+{
+ if ((bar & PCI_BASE_ADDRESS_SPACE) ==
+ PCI_BASE_ADDRESS_SPACE_IO)
+ return bar & PCI_BASE_ADDRESS_IO_MASK;
+ else
+ return bar & PCI_BASE_ADDRESS_MEM_MASK;
+}
+
+int spapr_populate_pci_devices(struct kvm *kvm,
+ uint32_t xics_phandle,
+ void *fdt)
+{
+ int bus_off, node_off = 0, devid, fn, i, n, devices;
+ struct device_header *dev_hdr;
+ char nodename[256];
+ struct of_pci_unit_address reg[PCI_NUM_REGIONS + 1],
+ assigned_addresses[PCI_NUM_REGIONS];
+ uint32_t bus_range[] = { cpu_to_be32(0), cpu_to_be32(0xff) };
+ struct of_pci_ranges_entry ranges[] = {
+ {
+ {
+ cpu_to_be32(of_pci_b_ss(1)),
+ cpu_to_be32(0),
+ cpu_to_be32(0),
+ },
+ cpu_to_be64(phb.io_addr),
+ cpu_to_be64(phb.io_size),
+ },
+ {
+ {
+ cpu_to_be32(of_pci_b_ss(2)),
+ cpu_to_be32(0),
+ cpu_to_be32(0),
+ },
+ cpu_to_be64(phb.mem_addr),
+ cpu_to_be64(phb.mem_size),
+ },
+ };
+ uint64_t bus_reg[] = { cpu_to_be64(phb.buid), 0 };
+ uint32_t interrupt_map_mask[] = {
+ cpu_to_be32(of_pci_b_ddddd(-1)|of_pci_b_fff(-1)), 0x0, 0x0, 0x0};
+ uint32_t interrupt_map[SPAPR_PCI_NUM_LSI][7];
+
+ /* Start populating the FDT */
+ sprintf(nodename, "pci@%" PRIx64, phb.buid);
+ bus_off = fdt_add_subnode(fdt, 0, nodename);
+ if (bus_off < 0) {
+ die("error making bus subnode, %s\n", fdt_strerror(bus_off));
+ return bus_off;
+ }
+
+ /* Write PHB properties */
+ _FDT(fdt_setprop_string(fdt, bus_off, "device_type", "pci"));
+ _FDT(fdt_setprop_string(fdt, bus_off, "compatible", "IBM,Logical_PHB"));
+ _FDT(fdt_setprop_cell(fdt, bus_off, "#address-cells", 0x3));
+ _FDT(fdt_setprop_cell(fdt, bus_off, "#size-cells", 0x2));
+ _FDT(fdt_setprop_cell(fdt, bus_off, "#interrupt-cells", 0x1));
+ _FDT(fdt_setprop(fdt, bus_off, "used-by-rtas", NULL, 0));
+ _FDT(fdt_setprop(fdt, bus_off, "bus-range", &bus_range, sizeof(bus_range)));
+ _FDT(fdt_setprop(fdt, bus_off, "ranges", &ranges, sizeof(ranges)));
+ _FDT(fdt_setprop(fdt, bus_off, "reg", &bus_reg, sizeof(bus_reg)));
+ _FDT(fdt_setprop(fdt, bus_off, "interrupt-map-mask",
+ &interrupt_map_mask, sizeof(interrupt_map_mask)));
+
+ /* Populate PCI devices and allocate IRQs */
+ devices = 0;
+ dev_hdr = device__first_dev(DEVICE_BUS_PCI);
+ while (dev_hdr) {
+ uint32_t *irqmap = interrupt_map[devices];
+ struct pci_device_header *hdr = dev_hdr->data;
+
+ if (!hdr)
+ continue;
+
+ devid = dev_hdr->dev_num;
+ fn = 0; /* kvmtool doesn't yet do multifunction devices */
+
+ sprintf(nodename, "pci@%u,%u", devid, fn);
+
+ /* Allocate interrupt from the map */
+ if (devid > SPAPR_PCI_NUM_LSI) {
+ die("Unexpected behaviour in spapr_populate_pci_devices,"
+ "wrong devid %u\n", devid);
+ }
+ irqmap[0] = cpu_to_be32(of_pci_b_ddddd(devid)|of_pci_b_fff(fn));
+ irqmap[1] = 0;
+ irqmap[2] = 0;
+ irqmap[3] = 0;
+ irqmap[4] = cpu_to_be32(xics_phandle);
+ /*
+ * This is nasty; the PCI devs are set up such that their own
+ * header's irq_line indicates the direct XICS IRQ number to
+ * use. There REALLY needs to be a hierarchical system in place
+ * to 'raise' an IRQ on the bridge which indexes/looks up which
+ * XICS IRQ to fire.
+ */
+ irqmap[5] = cpu_to_be32(hdr->irq_line);
+ irqmap[6] = cpu_to_be32(0x8);
+
+ /* Add node to FDT */
+ node_off = fdt_add_subnode(fdt, bus_off, nodename);
+ if (node_off < 0) {
+ die("error making node subnode, %s\n", fdt_strerror(bus_off));
+ return node_off;
+ }
+
+ _FDT(fdt_setprop_cell(fdt, node_off, "vendor-id",
+ le16_to_cpu(hdr->vendor_id)));
+ _FDT(fdt_setprop_cell(fdt, node_off, "device-id",
+ le16_to_cpu(hdr->device_id)));
+ _FDT(fdt_setprop_cell(fdt, node_off, "revision-id",
+ hdr->revision_id));
+ _FDT(fdt_setprop_cell(fdt, node_off, "class-code",
+ hdr->class[0] | (hdr->class[1] << 8) | (hdr->class[2] << 16)));
+ _FDT(fdt_setprop_cell(fdt, node_off, "subsystem-id",
+ le16_to_cpu(hdr->subsys_id)));
+ _FDT(fdt_setprop_cell(fdt, node_off, "subsystem-vendor-id",
+ le16_to_cpu(hdr->subsys_vendor_id)));
+
+ /* Config space region comes first */
+ reg[0].hi = cpu_to_be32(
+ of_pci_b_n(0) |
+ of_pci_b_p(0) |
+ of_pci_b_t(0) |
+ of_pci_b_ss(OF_PCI_SS_CONFIG) |
+ of_pci_b_bbbbbbbb(0) |
+ of_pci_b_ddddd(devid) |
+ of_pci_b_fff(fn));
+ reg[0].mid = 0;
+ reg[0].lo = 0;
+
+ n = 0;
+ /* Six BARs, no ROM supported, addresses are 32bit */
+ for (i = 0; i < 6; ++i) {
+ if (0 == hdr->bar[i]) {
+ continue;
+ }
+
+ reg[n+1].hi = cpu_to_be32(
+ of_pci_b_n(0) |
+ of_pci_b_p(0) |
+ of_pci_b_t(0) |
+ of_pci_b_ss(bar_to_ss(le32_to_cpu(hdr->bar[i]))) |
+ of_pci_b_bbbbbbbb(0) |
+ of_pci_b_ddddd(devid) |
+ of_pci_b_fff(fn) |
+ of_pci_b_rrrrrrrr(bars[i]));
+ reg[n+1].mid = 0;
+ reg[n+1].lo = cpu_to_be64(hdr->bar_size[i]);
+
+ assigned_addresses[n].hi = cpu_to_be32(
+ of_pci_b_n(1) |
+ of_pci_b_p(0) |
+ of_pci_b_t(0) |
+ of_pci_b_ss(bar_to_ss(le32_to_cpu(hdr->bar[i]))) |
+ of_pci_b_bbbbbbbb(0) |
+ of_pci_b_ddddd(devid) |
+ of_pci_b_fff(fn) |
+ of_pci_b_rrrrrrrr(bars[i]));
+
+ /*
+ * Writing zeroes to assigned_addresses causes the guest kernel to
+ * reassign BARs
+ */
+ assigned_addresses[n].mid = cpu_to_be64(bar_to_addr(le32_to_cpu(hdr->bar[i])));
+ assigned_addresses[n].lo = reg[n+1].lo;
+
+ ++n;
+ }
+ _FDT(fdt_setprop(fdt, node_off, "reg", reg, sizeof(reg[0])*(n+1)));
+ _FDT(fdt_setprop(fdt, node_off, "assigned-addresses",
+ assigned_addresses,
+ sizeof(assigned_addresses[0])*(n)));
+ _FDT(fdt_setprop_cell(fdt, node_off, "interrupts",
+ hdr->irq_pin));
+
+ /* We don't set ibm,dma-window property as we don't have an IOMMU. */
+
+ ++devices;
+ dev_hdr = device__next_dev(dev_hdr);
+ }
+
+ /* Write interrupt map */
+ _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map,
+ devices * sizeof(interrupt_map[0])));
+
+ return 0;
+}
diff --git a/tools/kvm/powerpc/spapr_pci.h b/tools/kvm/powerpc/spapr_pci.h
new file mode 100644
index 0000000..f659eda
--- /dev/null
+++ b/tools/kvm/powerpc/spapr_pci.h
@@ -0,0 +1,57 @@
+/*
+ * SPAPR PHB definitions
+ *
+ * Modifications by Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef SPAPR_PCI_H
+#define SPAPR_PCI_H
+
+#include "kvm/kvm.h"
+#include "spapr.h"
+#include <inttypes.h>
+
+/* With XICS, we can easily accommodate 1 IRQ per PCI device. */
+
+#define SPAPR_PCI_NUM_LSI 256
+
+struct spapr_phb {
+ uint64_t buid;
+ uint64_t mem_addr;
+ uint64_t mem_size;
+ uint64_t io_addr;
+ uint64_t io_size;
+};
+
+void spapr_create_phb(struct kvm *kvm,
+ const char *busname, uint64_t buid,
+ uint64_t mem_win_addr, uint64_t mem_win_size,
+ uint64_t io_win_addr, uint64_t io_win_size);
+
+int spapr_populate_pci_devices(struct kvm *kvm,
+ uint32_t xics_phandle,
+ void *fdt);
+
+static inline bool spapr_phb_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+ if ((phys_addr >= SPAPR_PCI_IO_WIN_ADDR) &&
+ (phys_addr < SPAPR_PCI_IO_WIN_ADDR +
+ SPAPR_PCI_IO_WIN_SIZE)) {
+ return kvm__emulate_io(vcpu, phys_addr - SPAPR_PCI_IO_WIN_ADDR,
+ data, is_write ? KVM_EXIT_IO_OUT :
+ KVM_EXIT_IO_IN,
+ len, 1);
+ } else if ((phys_addr >= SPAPR_PCI_MEM_WIN_ADDR) &&
+ (phys_addr < SPAPR_PCI_MEM_WIN_ADDR +
+ SPAPR_PCI_MEM_WIN_SIZE)) {
+ return kvm__emulate_mmio(vcpu, phys_addr - SPAPR_PCI_MEM_WIN_ADDR,
+ data, len, is_write);
+ }
+ return false;
+}
+
+#endif
diff --git a/tools/kvm/powerpc/spapr_rtas.c b/tools/kvm/powerpc/spapr_rtas.c
new file mode 100644
index 0000000..43e5310
--- /dev/null
+++ b/tools/kvm/powerpc/spapr_rtas.c
@@ -0,0 +1,246 @@
+/*
+ * SPAPR base RTAS calls
+ *
+ * Borrowed heavily from QEMU's spapr_rtas.c
+ * Copyright (c) 2010-2011 David Gibson, IBM Corporation.
+ *
+ * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "libfdt.h"
+
+#include "spapr.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+#define TOKEN_BASE 0x2000
+#define TOKEN_MAX 0x100
+
+#define RTAS_CONSOLE
+
+static struct rtas_call {
+ const char *name;
+ spapr_rtas_fn fn;
+} rtas_table[TOKEN_MAX];
+
+struct rtas_call *rtas_next = rtas_table;
+
+
+static void rtas_display_character(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ char c = rtas_ld(vcpu->kvm, args, 0);
+ term_putc(&c, 1, 0);
+ rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+#ifdef RTAS_CONSOLE
+static void rtas_put_term_char(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ char c = rtas_ld(vcpu->kvm, args, 0);
+
+ term_putc(&c, 1, 0);
+
+ rtas_st(vcpu->kvm, rets, 0, 0);
+}
+
+static void rtas_get_term_char(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ int c;
+
+ if (vcpu->kvm->cfg.active_console == CONSOLE_HV && term_readable(0) &&
+ (c = term_getc(vcpu->kvm, 0)) >= 0) {
+ rtas_st(vcpu->kvm, rets, 0, 0);
+ rtas_st(vcpu->kvm, rets, 1, c);
+ } else {
+ rtas_st(vcpu->kvm, rets, 0, -2);
+ }
+}
+#endif
+
+static void rtas_get_time_of_day(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ struct tm tm;
+ time_t tnow;
+
+ if (nret != 8) {
+ rtas_st(vcpu->kvm, rets, 0, -3);
+ return;
+ }
+
+ tnow = time(NULL);
+ /* Guest time is currently not offset in any way. */
+ gmtime_r(&tnow, &tm);
+
+ rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+ rtas_st(vcpu->kvm, rets, 1, tm.tm_year + 1900);
+ rtas_st(vcpu->kvm, rets, 2, tm.tm_mon + 1);
+ rtas_st(vcpu->kvm, rets, 3, tm.tm_mday);
+ rtas_st(vcpu->kvm, rets, 4, tm.tm_hour);
+ rtas_st(vcpu->kvm, rets, 5, tm.tm_min);
+ rtas_st(vcpu->kvm, rets, 6, tm.tm_sec);
+ rtas_st(vcpu->kvm, rets, 7, 0);
+}
+
+static void rtas_set_time_of_day(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ pr_warning("%s called; TOD set ignored.\n", __FUNCTION__);
+}
+
+static void rtas_power_off(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs, target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ if (nargs != 2 || nret != 1) {
+ rtas_st(vcpu->kvm, rets, 0, -3);
+ return;
+ }
+ kvm_cpu__reboot(vcpu->kvm);
+}
+
+static void rtas_system_reboot(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs, target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ if (nargs != 0 || nret != 1) {
+ rtas_st(vcpu->kvm, rets, 0, -3);
+ return;
+ }
+
+ /* NB this actually halts the VM */
+ kvm_cpu__reboot(vcpu->kvm);
+}
+
+static void rtas_query_cpu_stopped_state(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ if (nargs != 1 || nret != 2) {
+ rtas_st(vcpu->kvm, rets, 0, -3);
+ return;
+ }
+
+ /*
+ * Can read id = rtas_ld(vcpu->kvm, args, 0), but
+ * we currently start all CPUs. So just return true.
+ */
+ rtas_st(vcpu->kvm, rets, 0, 0);
+ rtas_st(vcpu->kvm, rets, 1, 2);
+}
+
+static void rtas_start_cpu(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs,
+ target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ die(__FUNCTION__);
+}
+
+target_ulong spapr_rtas_call(struct kvm_cpu *vcpu,
+ uint32_t token, uint32_t nargs, target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ if ((token >= TOKEN_BASE)
+ && ((token - TOKEN_BASE) < TOKEN_MAX)) {
+ struct rtas_call *call = rtas_table + (token - TOKEN_BASE);
+
+ if (call->fn) {
+ call->fn(vcpu, token, nargs, args, nret, rets);
+ return H_SUCCESS;
+ }
+ }
+
+ /*
+	 * HACK: Some Linux early debug code uses RTAS display-character,
+	 * but assumes the token value is 0xa (which it is on some real
+	 * machines) without looking it up in the device tree. This
+	 * special case makes that early debug code work.
+ if (token == 0xa) {
+ rtas_display_character(vcpu, 0xa, nargs, args, nret, rets);
+ return H_SUCCESS;
+ }
+
+ hcall_dprintf("Unknown RTAS token 0x%x\n", token);
+ rtas_st(vcpu->kvm, rets, 0, -3);
+ return H_PARAMETER;
+}
+
+void spapr_rtas_register(const char *name, spapr_rtas_fn fn)
+{
+ assert(rtas_next < (rtas_table + TOKEN_MAX));
+
+ rtas_next->name = name;
+ rtas_next->fn = fn;
+
+ rtas_next++;
+}
+
+/*
+ * This is called from the context of an open /rtas node, in order to add
+ * properties for the rtas call tokens.
+ */
+int spapr_rtas_fdt_setup(struct kvm *kvm, void *fdt)
+{
+ int ret;
+ int i;
+
+ for (i = 0; i < TOKEN_MAX; i++) {
+ struct rtas_call *call = &rtas_table[i];
+
+ if (!call->fn) {
+ continue;
+ }
+
+ ret = fdt_property_cell(fdt, call->name, i + TOKEN_BASE);
+
+ if (ret < 0) {
+ pr_warning("Couldn't add rtas token for %s: %s\n",
+ call->name, fdt_strerror(ret));
+ return ret;
+ }
+
+ }
+ return 0;
+}
+
+void register_core_rtas(void)
+{
+ spapr_rtas_register("display-character", rtas_display_character);
+ spapr_rtas_register("get-time-of-day", rtas_get_time_of_day);
+ spapr_rtas_register("set-time-of-day", rtas_set_time_of_day);
+ spapr_rtas_register("power-off", rtas_power_off);
+ spapr_rtas_register("system-reboot", rtas_system_reboot);
+ spapr_rtas_register("query-cpu-stopped-state",
+ rtas_query_cpu_stopped_state);
+ spapr_rtas_register("start-cpu", rtas_start_cpu);
+#ifdef RTAS_CONSOLE
+ /* These are unused: We do console I/O via hcalls, not rtas. */
+ spapr_rtas_register("put-term-char", rtas_put_term_char);
+ spapr_rtas_register("get-term-char", rtas_get_term_char);
+#endif
+}
diff --git a/tools/kvm/powerpc/xics.c b/tools/kvm/powerpc/xics.c
new file mode 100644
index 0000000..071b928
--- /dev/null
+++ b/tools/kvm/powerpc/xics.c
@@ -0,0 +1,495 @@
+/*
+ * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics
+ *
+ * Borrowed heavily from QEMU's xics.c,
+ * Copyright (c) 2010,2011 David Gibson, IBM Corporation.
+ *
+ * Modifications copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include "spapr.h"
+#include "xics.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <stdio.h>
+#include <malloc.h>
+
+#define XICS_NUM_IRQS 1024
+
+
+/* #define DEBUG_XICS yes */
+#ifdef DEBUG_XICS
+#define xics_dprintf(fmt, ...) \
+ do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define xics_dprintf(fmt, ...) \
+ do { } while (0)
+#endif
+
+/*
+ * ICP: Presentation layer
+ */
+
+struct icp_server_state {
+ uint32_t xirr;
+ uint8_t pending_priority;
+ uint8_t mfrr;
+ struct kvm_cpu *cpu;
+};
+
+#define XICS_IRQ_OFFSET KVM_IRQ_OFFSET
+#define XISR_MASK 0x00ffffff
+#define CPPR_MASK 0xff000000
+
+#define XISR(ss) (((ss)->xirr) & XISR_MASK)
+#define CPPR(ss) (((ss)->xirr) >> 24)
+
+struct ics_state;
+
+struct icp_state {
+ unsigned long nr_servers;
+ struct icp_server_state *ss;
+ struct ics_state *ics;
+};
+
+static void ics_reject(struct ics_state *ics, int nr);
+static void ics_resend(struct ics_state *ics);
+static void ics_eoi(struct ics_state *ics, int nr);
+
+static inline void cpu_irq_raise(struct kvm_cpu *vcpu)
+{
+ xics_dprintf("INT1[%p]\n", vcpu);
+ kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 1);
+}
+
+static inline void cpu_irq_lower(struct kvm_cpu *vcpu)
+{
+ xics_dprintf("INT0[%p]\n", vcpu);
+ kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 0);
+}
+
+static void icp_check_ipi(struct icp_state *icp, int server)
+{
+ struct icp_server_state *ss = icp->ss + server;
+
+ if (XISR(ss) && (ss->pending_priority <= ss->mfrr)) {
+ return;
+ }
+
+ if (XISR(ss)) {
+ ics_reject(icp->ics, XISR(ss));
+ }
+
+ ss->xirr = (ss->xirr & ~XISR_MASK) | XICS_IPI;
+ ss->pending_priority = ss->mfrr;
+ cpu_irq_raise(ss->cpu);
+}
+
+static void icp_resend(struct icp_state *icp, int server)
+{
+ struct icp_server_state *ss = icp->ss + server;
+
+ if (ss->mfrr < CPPR(ss)) {
+ icp_check_ipi(icp, server);
+ }
+ ics_resend(icp->ics);
+}
+
+static void icp_set_cppr(struct icp_state *icp, int server, uint8_t cppr)
+{
+ struct icp_server_state *ss = icp->ss + server;
+ uint8_t old_cppr;
+ uint32_t old_xisr;
+
+ old_cppr = CPPR(ss);
+ ss->xirr = (ss->xirr & ~CPPR_MASK) | (cppr << 24);
+
+ if (cppr < old_cppr) {
+ if (XISR(ss) && (cppr <= ss->pending_priority)) {
+ old_xisr = XISR(ss);
+ ss->xirr &= ~XISR_MASK; /* Clear XISR */
+ cpu_irq_lower(ss->cpu);
+ ics_reject(icp->ics, old_xisr);
+ }
+ } else {
+ if (!XISR(ss)) {
+ icp_resend(icp, server);
+ }
+ }
+}
+
+static void icp_set_mfrr(struct icp_state *icp, int nr, uint8_t mfrr)
+{
+ struct icp_server_state *ss = icp->ss + nr;
+
+ ss->mfrr = mfrr;
+ if (mfrr < CPPR(ss)) {
+ icp_check_ipi(icp, nr);
+ }
+}
+
+static uint32_t icp_accept(struct icp_server_state *ss)
+{
+ uint32_t xirr;
+
+ cpu_irq_lower(ss->cpu);
+ xirr = ss->xirr;
+ ss->xirr = ss->pending_priority << 24;
+ return xirr;
+}
+
+static void icp_eoi(struct icp_state *icp, int server, uint32_t xirr)
+{
+ struct icp_server_state *ss = icp->ss + server;
+
+ ics_eoi(icp->ics, xirr & XISR_MASK);
+ /* Send EOI -> ICS */
+ ss->xirr = (ss->xirr & ~CPPR_MASK) | (xirr & CPPR_MASK);
+ if (!XISR(ss)) {
+ icp_resend(icp, server);
+ }
+}
+
+static void icp_irq(struct icp_state *icp, int server, int nr, uint8_t priority)
+{
+ struct icp_server_state *ss = icp->ss + server;
+ xics_dprintf("icp_irq(nr %d, server %d, prio 0x%x)\n", nr, server, priority);
+ if ((priority >= CPPR(ss))
+ || (XISR(ss) && (ss->pending_priority <= priority))) {
+ xics_dprintf("reject %d, CPPR 0x%x, XISR 0x%x, pprio 0x%x, prio 0x%x\n",
+ nr, CPPR(ss), XISR(ss), ss->pending_priority, priority);
+ ics_reject(icp->ics, nr);
+ } else {
+ if (XISR(ss)) {
+ xics_dprintf("reject %d, CPPR 0x%x, XISR 0x%x, pprio 0x%x, prio 0x%x\n",
+ nr, CPPR(ss), XISR(ss), ss->pending_priority, priority);
+ ics_reject(icp->ics, XISR(ss));
+ }
+ ss->xirr = (ss->xirr & ~XISR_MASK) | (nr & XISR_MASK);
+ ss->pending_priority = priority;
+ cpu_irq_raise(ss->cpu);
+ }
+}
+
+/*
+ * ICS: Source layer
+ */
+
+struct ics_irq_state {
+ int server;
+ uint8_t priority;
+ uint8_t saved_priority;
+ int rejected:1;
+ int masked_pending:1;
+};
+
+struct ics_state {
+ unsigned int nr_irqs;
+ unsigned int offset;
+ struct ics_irq_state *irqs;
+ struct icp_state *icp;
+};
+
+static int ics_valid_irq(struct ics_state *ics, uint32_t nr)
+{
+ return (nr >= ics->offset)
+ && (nr < (ics->offset + ics->nr_irqs));
+}
+
+static void ics_set_irq_msi(struct ics_state *ics, int srcno, int val)
+{
+ struct ics_irq_state *irq = ics->irqs + srcno;
+
+ if (val) {
+ if (irq->priority == 0xff) {
+ xics_dprintf(" irq pri ff, masked pending\n");
+ irq->masked_pending = 1;
+ } else {
+ icp_irq(ics->icp, irq->server, srcno + ics->offset, irq->priority);
+ }
+ }
+}
+
+static void ics_reject_msi(struct ics_state *ics, int nr)
+{
+ struct ics_irq_state *irq = ics->irqs + nr - ics->offset;
+
+ irq->rejected = 1;
+}
+
+static void ics_resend_msi(struct ics_state *ics)
+{
+ unsigned int i;
+
+ for (i = 0; i < ics->nr_irqs; i++) {
+ struct ics_irq_state *irq = ics->irqs + i;
+
+ /* FIXME: filter by server#? */
+ if (irq->rejected) {
+ irq->rejected = 0;
+ if (irq->priority != 0xff) {
+ icp_irq(ics->icp, irq->server, i + ics->offset, irq->priority);
+ }
+ }
+ }
+}
+
+static void ics_write_xive_msi(struct ics_state *ics, int nr, int server,
+ uint8_t priority)
+{
+ struct ics_irq_state *irq = ics->irqs + nr - ics->offset;
+
+ irq->server = server;
+ irq->priority = priority;
+ xics_dprintf("ics_write_xive_msi(nr %d, server %d, pri 0x%x)\n", nr, server, priority);
+
+ if (!irq->masked_pending || (priority == 0xff)) {
+ return;
+ }
+
+ irq->masked_pending = 0;
+ icp_irq(ics->icp, server, nr, priority);
+}
+
+static void ics_reject(struct ics_state *ics, int nr)
+{
+ ics_reject_msi(ics, nr);
+}
+
+static void ics_resend(struct ics_state *ics)
+{
+ ics_resend_msi(ics);
+}
+
+static void ics_eoi(struct ics_state *ics, int nr)
+{
+}
+
+/*
+ * Exported functions
+ */
+
+static target_ulong h_cppr(struct kvm_cpu *vcpu,
+ target_ulong opcode, target_ulong *args)
+{
+ target_ulong cppr = args[0];
+
+ xics_dprintf("h_cppr(%lx)\n", cppr);
+ icp_set_cppr(vcpu->kvm->arch.icp, vcpu->cpu_id, cppr);
+ return H_SUCCESS;
+}
+
+static target_ulong h_ipi(struct kvm_cpu *vcpu,
+ target_ulong opcode, target_ulong *args)
+{
+ target_ulong server = args[0];
+ target_ulong mfrr = args[1];
+
+ xics_dprintf("h_ipi(%lx, %lx)\n", server, mfrr);
+ if (server >= vcpu->kvm->arch.icp->nr_servers) {
+ return H_PARAMETER;
+ }
+
+ icp_set_mfrr(vcpu->kvm->arch.icp, server, mfrr);
+ return H_SUCCESS;
+}
+
+static target_ulong h_xirr(struct kvm_cpu *vcpu,
+ target_ulong opcode, target_ulong *args)
+{
+ uint32_t xirr = icp_accept(vcpu->kvm->arch.icp->ss + vcpu->cpu_id);
+
+ xics_dprintf("h_xirr() = %x\n", xirr);
+ args[0] = xirr;
+ return H_SUCCESS;
+}
+
+static target_ulong h_eoi(struct kvm_cpu *vcpu,
+ target_ulong opcode, target_ulong *args)
+{
+ target_ulong xirr = args[0];
+
+ xics_dprintf("h_eoi(%lx)\n", xirr);
+ icp_eoi(vcpu->kvm->arch.icp, vcpu->cpu_id, xirr);
+ return H_SUCCESS;
+}
+
+static void rtas_set_xive(struct kvm_cpu *vcpu, uint32_t token,
+ uint32_t nargs, target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ struct ics_state *ics = vcpu->kvm->arch.icp->ics;
+ uint32_t nr, server, priority;
+
+ if ((nargs != 3) || (nret != 1)) {
+ rtas_st(vcpu->kvm, rets, 0, -3);
+ return;
+ }
+
+ nr = rtas_ld(vcpu->kvm, args, 0);
+ server = rtas_ld(vcpu->kvm, args, 1);
+ priority = rtas_ld(vcpu->kvm, args, 2);
+
+ xics_dprintf("rtas_set_xive(%x,%x,%x)\n", nr, server, priority);
+ if (!ics_valid_irq(ics, nr) || (server >= ics->icp->nr_servers)
+ || (priority > 0xff)) {
+ rtas_st(vcpu->kvm, rets, 0, -3);
+ return;
+ }
+
+ ics_write_xive_msi(ics, nr, server, priority);
+
+ rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+}
+
+static void rtas_get_xive(struct kvm_cpu *vcpu, uint32_t token,
+ uint32_t nargs, target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ struct ics_state *ics = vcpu->kvm->arch.icp->ics;
+ uint32_t nr;
+
+ if ((nargs != 1) || (nret != 3)) {
+ rtas_st(vcpu->kvm, rets, 0, -3);
+ return;
+ }
+
+ nr = rtas_ld(vcpu->kvm, args, 0);
+
+ if (!ics_valid_irq(ics, nr)) {
+ rtas_st(vcpu->kvm, rets, 0, -3);
+ return;
+ }
+
+ rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+ rtas_st(vcpu->kvm, rets, 1, ics->irqs[nr - ics->offset].server);
+ rtas_st(vcpu->kvm, rets, 2, ics->irqs[nr - ics->offset].priority);
+}
+
+static void rtas_int_off(struct kvm_cpu *vcpu, uint32_t token,
+ uint32_t nargs, target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ struct ics_state *ics = vcpu->kvm->arch.icp->ics;
+ uint32_t nr;
+
+ if ((nargs != 1) || (nret != 1)) {
+ rtas_st(vcpu->kvm, rets, 0, -3);
+ return;
+ }
+
+ nr = rtas_ld(vcpu->kvm, args, 0);
+
+ if (!ics_valid_irq(ics, nr)) {
+ rtas_st(vcpu->kvm, rets, 0, -3);
+ return;
+ }
+
+ /* ME: QEMU wrote xive_msi here, in #if 0. Deleted. */
+
+ rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+}
+
+static void rtas_int_on(struct kvm_cpu *vcpu, uint32_t token,
+ uint32_t nargs, target_ulong args,
+ uint32_t nret, target_ulong rets)
+{
+ struct ics_state *ics = vcpu->kvm->arch.icp->ics;
+ uint32_t nr;
+
+ if ((nargs != 1) || (nret != 1)) {
+ rtas_st(vcpu->kvm, rets, 0, -3);
+ return;
+ }
+
+ nr = rtas_ld(vcpu->kvm, args, 0);
+
+ if (!ics_valid_irq(ics, nr)) {
+ rtas_st(vcpu->kvm, rets, 0, -3);
+ return;
+ }
+
+ /* ME: QEMU wrote xive_msi here, in #if 0. Deleted. */
+
+ rtas_st(vcpu->kvm, rets, 0, 0); /* Success */
+}
+
+static int xics_init(struct kvm *kvm)
+{
+ unsigned int i;
+ struct icp_state *icp;
+ struct ics_state *ics;
+ int j;
+
+ icp = malloc(sizeof(*icp));
+ icp->nr_servers = kvm->nrcpus;
+ icp->ss = malloc(icp->nr_servers * sizeof(struct icp_server_state));
+
+ for (i = 0; i < icp->nr_servers; i++) {
+ icp->ss[i].xirr = 0;
+ icp->ss[i].pending_priority = 0;
+ icp->ss[i].cpu = 0;
+ icp->ss[i].mfrr = 0xff;
+ }
+
+ /*
+ * icp->ss[env->cpu_index].cpu is set by CPUs calling in to
+ * xics_cpu_register().
+ */
+
+ ics = malloc(sizeof(*ics));
+ ics->nr_irqs = XICS_NUM_IRQS;
+ ics->offset = XICS_IRQ_OFFSET;
+ ics->irqs = malloc(ics->nr_irqs * sizeof(struct ics_irq_state));
+
+ icp->ics = ics;
+ ics->icp = icp;
+
+ for (i = 0; i < ics->nr_irqs; i++) {
+ ics->irqs[i].server = 0;
+ ics->irqs[i].priority = 0xff;
+ ics->irqs[i].saved_priority = 0xff;
+ ics->irqs[i].rejected = 0;
+ ics->irqs[i].masked_pending = 0;
+ }
+
+ spapr_register_hypercall(H_CPPR, h_cppr);
+ spapr_register_hypercall(H_IPI, h_ipi);
+ spapr_register_hypercall(H_XIRR, h_xirr);
+ spapr_register_hypercall(H_EOI, h_eoi);
+
+ spapr_rtas_register("ibm,set-xive", rtas_set_xive);
+ spapr_rtas_register("ibm,get-xive", rtas_get_xive);
+ spapr_rtas_register("ibm,int-off", rtas_int_off);
+ spapr_rtas_register("ibm,int-on", rtas_int_on);
+
+ for (j = 0; j < kvm->nrcpus; j++) {
+ struct kvm_cpu *vcpu = kvm->cpus[j];
+
+ if (vcpu->cpu_id >= icp->nr_servers)
+ die("Invalid server number for cpuid %ld\n", vcpu->cpu_id);
+
+ icp->ss[vcpu->cpu_id].cpu = vcpu;
+ }
+
+ kvm->arch.icp = icp;
+
+ return 0;
+}
+dev_base_init(xics_init);
+
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+ /*
+ * Route event to ICS, which routes to ICP, which eventually does a
+ * kvm_cpu__irq(vcpu, POWER7_EXT_IRQ, 1)
+ */
+ xics_dprintf("Raising IRQ %d -> %d\n", irq, level);
+ ics_set_irq_msi(kvm->arch.icp->ics, irq - kvm->arch.icp->ics->offset, level);
+}
diff --git a/tools/kvm/powerpc/xics.h b/tools/kvm/powerpc/xics.h
new file mode 100644
index 0000000..d5bc6f9
--- /dev/null
+++ b/tools/kvm/powerpc/xics.h
@@ -0,0 +1,18 @@
+/*
+ * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics
+ *
+ * Copyright 2011 Matt Evans <matt@ozlabs.org>, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef XICS_H
+#define XICS_H
+
+#define XICS_IPI 0x2
+
+int xics_alloc_irqnum(void);
+
+#endif
diff --git a/tools/kvm/symbol.c b/tools/kvm/symbol.c
new file mode 100644
index 0000000..07dd9d5
--- /dev/null
+++ b/tools/kvm/symbol.c
@@ -0,0 +1,135 @@
+#include "kvm/symbol.h"
+
+#include "kvm/kvm.h"
+
+#include <linux/err.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <bfd.h>
+
+static bfd *abfd;
+
+int symbol_init(struct kvm *kvm)
+{
+	int ret = 0;
+
+	if (!kvm->vmlinux)
+		return 0;
+
+	bfd_init();
+
+	abfd = bfd_openr(kvm->vmlinux, NULL);
+	if (abfd == NULL) {
+		bfd_error_type err = bfd_get_error();
+
+		switch (err) {
+		case bfd_error_no_memory:
+			ret = -ENOMEM;
+			break;
+		case bfd_error_invalid_target:
+			ret = -EINVAL;
+			break;
+		default:
+			ret = -EFAULT;
+			break;
+		}
+	}
+
+	return ret;
+}
+late_init(symbol_init);
+
+static asymbol *lookup(asymbol **symbols, int nr_symbols, const char *symbol_name)
+{
+	int i, ret;
+
+	ret = -ENOENT;
+
+	for (i = 0; i < nr_symbols; i++) {
+		asymbol *symbol = symbols[i];
+
+		if (!strcmp(bfd_asymbol_name(symbol), symbol_name))
+			return symbol;
+	}
+
+	return ERR_PTR(ret);
+}
+
+char *symbol_lookup(struct kvm *kvm, unsigned long addr, char *sym, size_t size)
+{
+	const char *filename;
+	bfd_vma sym_offset;
+	bfd_vma sym_start;
+	asection *section;
+	unsigned int line;
+	const char *func;
+	long symtab_size;
+	asymbol *symbol;
+	asymbol **syms;
+	int nr_syms, ret;
+
+	ret = -ENOENT;
+	if (!abfd)
+		goto not_found;
+
+	if (!bfd_check_format(abfd, bfd_object))
+		goto not_found;
+
+	/* bfd_get_symtab_upper_bound() returns a negative value on error */
+	symtab_size = bfd_get_symtab_upper_bound(abfd);
+	if (symtab_size <= 0)
+		goto not_found;
+
+	ret = -ENOMEM;
+	syms = malloc(symtab_size);
+	if (!syms)
+		goto not_found;
+
+	nr_syms = bfd_canonicalize_symtab(abfd, syms);
+
+	ret = -ENOENT;
+	section = bfd_get_section_by_name(abfd, ".debug_aranges");
+	if (!section)
+		goto not_found_free;
+
+	if (!bfd_find_nearest_line(abfd, section, NULL, addr, &filename, &func, &line))
+		goto not_found_free;
+
+	if (!func)
+		goto not_found_free;
+
+	symbol = lookup(syms, nr_syms, func);
+	if (IS_ERR(symbol))
+		goto not_found_free;
+
+	sym_start = bfd_asymbol_value(symbol);
+
+	sym_offset = addr - sym_start;
+
+	snprintf(sym, size, "%s+%llx (%s:%i)", func, (long long) sym_offset, filename, line);
+
+	sym[size - 1] = '\0';
+
+	free(syms);
+
+	return sym;
+
+not_found_free:
+	free(syms);
+not_found:
+	return ERR_PTR(ret);
+}
+
+int symbol_exit(struct kvm *kvm)
+{
+	bfd_boolean ret = TRUE;
+
+	if (abfd)
+		ret = bfd_close(abfd);
+
+	if (ret == TRUE)
+		return 0;
+
+	return -EFAULT;
+}
+late_exit(symbol_exit);
diff --git a/tools/kvm/term.c b/tools/kvm/term.c
new file mode 100644
index 0000000..1b8131a
--- /dev/null
+++ b/tools/kvm/term.c
@@ -0,0 +1,210 @@
+#include <poll.h>
+#include <stdbool.h>
+#include <termios.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <signal.h>
+#include <pty.h>
+#include <utmp.h>
+
+#include "kvm/read-write.h"
+#include "kvm/term.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+
+#define TERM_FD_IN 0
+#define TERM_FD_OUT 1
+
+static struct termios orig_term;
+
+int term_escape_char = 0x01; /* ctrl-a is used for escape */
+bool term_got_escape = false;
+
+int term_fds[TERM_MAX_DEVS][2];
+
+static pthread_t term_poll_thread;
+
+int term_getc(struct kvm *kvm, int term)
+{
+ unsigned char c;
+
+ if (read_in_full(term_fds[term][TERM_FD_IN], &c, 1) < 0)
+ return -1;
+
+ if (term_got_escape) {
+ term_got_escape = false;
+ if (c == 'x')
+ kvm_cpu__reboot(kvm);
+ if (c == term_escape_char)
+ return c;
+ }
+
+ if (c == term_escape_char) {
+ term_got_escape = true;
+ return -1;
+ }
+
+ return c;
+}
+
+int term_putc(char *addr, int cnt, int term)
+{
+ int ret;
+ int num_remaining = cnt;
+
+ while (num_remaining) {
+ ret = write(term_fds[term][TERM_FD_OUT], addr, num_remaining);
+ if (ret < 0)
+ return cnt - num_remaining;
+ num_remaining -= ret;
+ addr += ret;
+ }
+
+ return cnt;
+}
+
+int term_getc_iov(struct kvm *kvm, struct iovec *iov, int iovcnt, int term)
+{
+ int c;
+
+ c = term_getc(kvm, term);
+
+ if (c < 0)
+ return 0;
+
+ *((char *)iov[TERM_FD_IN].iov_base) = (char)c;
+
+ return sizeof(char);
+}
+
+int term_putc_iov(struct iovec *iov, int iovcnt, int term)
+{
+ return writev(term_fds[term][TERM_FD_OUT], iov, iovcnt);
+}
+
+bool term_readable(int term)
+{
+ struct pollfd pollfd = (struct pollfd) {
+ .fd = term_fds[term][TERM_FD_IN],
+ .events = POLLIN,
+ .revents = 0,
+ };
+ int err;
+
+ err = poll(&pollfd, 1, 0);
+ return (err > 0 && (pollfd.revents & POLLIN));
+}
+
+static void *term_poll_thread_loop(void *param)
+{
+ struct pollfd fds[TERM_MAX_DEVS];
+ struct kvm *kvm = (struct kvm *) param;
+
+ int i;
+
+ for (i = 0; i < TERM_MAX_DEVS; i++) {
+ fds[i].fd = term_fds[i][TERM_FD_IN];
+ fds[i].events = POLLIN;
+ fds[i].revents = 0;
+ }
+
+ while (1) {
+ /* Poll with infinite timeout */
+ if(poll(fds, TERM_MAX_DEVS, -1) < 1)
+ break;
+ kvm__arch_read_term(kvm);
+ }
+
+ die("term_poll_thread_loop: error polling device fds %d\n", errno);
+ return NULL;
+}
+
+static void term_cleanup(void)
+{
+ int i;
+
+ for (i = 0; i < TERM_MAX_DEVS; i++)
+ tcsetattr(term_fds[i][TERM_FD_IN], TCSANOW, &orig_term);
+}
+
+static void term_sig_cleanup(int sig)
+{
+ term_cleanup();
+ signal(sig, SIG_DFL);
+ raise(sig);
+}
+
+void term_set_tty(int term)
+{
+	struct termios tty_term;
+	int master, slave;
+	char new_pty[PATH_MAX];
+
+	if (tcgetattr(STDIN_FILENO, &tty_term) < 0)
+		die("unable to save initial standard input settings");
+
+	tty_term.c_lflag &= ~(ICANON | ECHO | ISIG);
+
+	if (openpty(&master, &slave, new_pty, &tty_term, NULL) < 0)
+		return;
+
+	close(slave);
+
+	pr_info("Assigned terminal %d to pty %s\n", term, new_pty);
+
+	term_fds[term][TERM_FD_IN] = term_fds[term][TERM_FD_OUT] = master;
+}
+
+int tty_parser(const struct option *opt, const char *arg, int unset)
+{
+ int tty = atoi(arg);
+
+ term_set_tty(tty);
+
+ return 0;
+}
+
+int term_init(struct kvm *kvm)
+{
+ struct termios term;
+ int i, r;
+
+ for (i = 0; i < TERM_MAX_DEVS; i++)
+ if (term_fds[i][TERM_FD_IN] == 0) {
+ term_fds[i][TERM_FD_IN] = STDIN_FILENO;
+ term_fds[i][TERM_FD_OUT] = STDOUT_FILENO;
+ }
+
+ if (!isatty(STDIN_FILENO) || !isatty(STDOUT_FILENO))
+ return 0;
+
+ r = tcgetattr(STDIN_FILENO, &orig_term);
+ if (r < 0) {
+ pr_warning("unable to save initial standard input settings");
+ return r;
+ }
+
+
+ term = orig_term;
+ term.c_lflag &= ~(ICANON | ECHO | ISIG);
+ tcsetattr(STDIN_FILENO, TCSANOW, &term);
+
+
+ /* Use our own blocking thread to read stdin, don't require a tick */
+ if(pthread_create(&term_poll_thread, NULL, term_poll_thread_loop,kvm))
+ die("Unable to create console input poll thread\n");
+
+ signal(SIGTERM, term_sig_cleanup);
+ atexit(term_cleanup);
+
+ return 0;
+}
+dev_init(term_init);
+
+int term_exit(struct kvm *kvm)
+{
+ return 0;
+}
+dev_exit(term_exit);
diff --git a/tools/kvm/tests/Makefile b/tools/kvm/tests/Makefile
new file mode 100644
index 0000000..cad14ec
--- /dev/null
+++ b/tools/kvm/tests/Makefile
@@ -0,0 +1,19 @@
+all: kernel pit boot
+
+kernel:
+ $(MAKE) -C kernel
+.PHONY: kernel
+
+pit:
+ $(MAKE) -C pit
+.PHONY: pit
+
+boot:
+ $(MAKE) -C boot
+.PHONY: boot
+
+clean:
+ $(MAKE) -C kernel clean
+ $(MAKE) -C pit clean
+ $(MAKE) -C boot clean
+.PHONY: clean
diff --git a/tools/kvm/tests/boot/Makefile b/tools/kvm/tests/boot/Makefile
new file mode 100644
index 0000000..40cba68
--- /dev/null
+++ b/tools/kvm/tests/boot/Makefile
@@ -0,0 +1,13 @@
+NAME := init
+
+OBJ := $(NAME).o
+
+all:
+	rm -rf rootfs
+	mkdir rootfs
+	gcc -static init.c -o rootfs/init
+	mkisofs rootfs > boot_test.iso
+
+clean:
+	rm -rf rootfs boot_test.iso
+.PHONY: clean
diff --git a/tools/kvm/tests/boot/init.c b/tools/kvm/tests/boot/init.c
new file mode 100644
index 0000000..094f8ba
--- /dev/null
+++ b/tools/kvm/tests/boot/init.c
@@ -0,0 +1,12 @@
+#include <linux/reboot.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int main(int argc, char *argv[])
+{
+	puts("hello, KVM guest!\r");
+
+	reboot(LINUX_REBOOT_CMD_RESTART);
+
+	return 0;
+}
diff --git a/tools/kvm/tests/kernel/.gitignore b/tools/kvm/tests/kernel/.gitignore
new file mode 100644
index 0000000..d0cd209
--- /dev/null
+++ b/tools/kvm/tests/kernel/.gitignore
@@ -0,0 +1,2 @@
+kernel.bin
+kernel.elf
diff --git a/tools/kvm/tests/kernel/Makefile b/tools/kvm/tests/kernel/Makefile
new file mode 100644
index 0000000..c7dd8da
--- /dev/null
+++ b/tools/kvm/tests/kernel/Makefile
@@ -0,0 +1,20 @@
+NAME := kernel
+
+BIN := $(NAME).bin
+ELF := $(NAME).elf
+OBJ := $(NAME).o
+
+all: $(BIN)
+
+$(BIN): $(ELF)
+ objcopy -O binary $< $@
+
+$(ELF): $(OBJ)
+ ld -Ttext=0x00 -nostdlib -static $< -o $@
+
+%.o: %.S
+ gcc -nostdinc -c $< -o $@
+
+clean:
+ rm -f $(BIN) $(ELF) $(OBJ)
+.PHONY: clean
diff --git a/tools/kvm/tests/kernel/README b/tools/kvm/tests/kernel/README
new file mode 100644
index 0000000..2923777
--- /dev/null
+++ b/tools/kvm/tests/kernel/README
@@ -0,0 +1,16 @@
+Compiling
+---------
+
+You can simply type:
+
+ $ make
+
+to build a 16-bit binary that uses the i8086 instruction set.
+
+Disassembling
+-------------
+
+Use the "-m i8086" command line option with objdump to make sure it knows we're
+dealing with i8086 instruction set:
+
+ $ objdump -d -m i8086 kernel.elf
diff --git a/tools/kvm/tests/kernel/kernel.S b/tools/kvm/tests/kernel/kernel.S
new file mode 100644
index 0000000..2824b64
--- /dev/null
+++ b/tools/kvm/tests/kernel/kernel.S
@@ -0,0 +1,8 @@
+ .code16gcc
+ .text
+ .globl _start
+ .type _start, @function
+_start:
+ # "This is probably the largest possible kernel that is bug free." -- Avi Kivity
+ 1:
+ jmp 1b
diff --git a/tools/kvm/tests/pit/.gitignore b/tools/kvm/tests/pit/.gitignore
new file mode 100644
index 0000000..43f0aa8
--- /dev/null
+++ b/tools/kvm/tests/pit/.gitignore
@@ -0,0 +1,2 @@
+*.bin
+*.elf
diff --git a/tools/kvm/tests/pit/Makefile b/tools/kvm/tests/pit/Makefile
new file mode 100644
index 0000000..2fae9b2
--- /dev/null
+++ b/tools/kvm/tests/pit/Makefile
@@ -0,0 +1,20 @@
+NAME := tick
+
+BIN := $(NAME).bin
+ELF := $(NAME).elf
+OBJ := $(NAME).o
+
+all: $(BIN)
+
+$(BIN): $(ELF)
+ objcopy -O binary $< $@
+
+$(ELF): $(OBJ)
+ ld -Ttext=0x00 -nostdlib -static $< -o $@
+
+%.o: %.S
+ gcc -nostdinc -c $< -o $@
+
+clean:
+ rm -f $(BIN) $(ELF) $(OBJ)
+.PHONY: clean
diff --git a/tools/kvm/tests/pit/README b/tools/kvm/tests/pit/README
new file mode 100644
index 0000000..2923777
--- /dev/null
+++ b/tools/kvm/tests/pit/README
@@ -0,0 +1,16 @@
+Compiling
+---------
+
+You can simply type:
+
+ $ make
+
+to build a 16-bit binary that uses the i8086 instruction set.
+
+Disassembling
+-------------
+
+Use the "-m i8086" command line option with objdump to make sure it knows we're
+dealing with i8086 instruction set:
+
+ $ objdump -d -m i8086 tick.elf
diff --git a/tools/kvm/tests/pit/tick.S b/tools/kvm/tests/pit/tick.S
new file mode 100644
index 0000000..635dc8d
--- /dev/null
+++ b/tools/kvm/tests/pit/tick.S
@@ -0,0 +1,101 @@
+#define IO_PIC 0x20
+#define IRQ_OFFSET 32
+#define IO_PIT 0x40
+#define TIMER_FREQ 1193182
+#define TIMER_DIV(x) ((TIMER_FREQ+(x)/2)/(x))
+
+#define TEST_COUNT 0x0200
+
+ .code16gcc
+ .text
+ .globl _start
+ .type _start, @function
+_start:
+/*
+ * fill up noop handlers
+ */
+ xorw %ax, %ax
+ xorw %di, %di
+ movw %ax, %es
+ movw $256, %cx
+fill_noop_idt:
+ movw $noop_handler, %es:(%di)
+ movw %cs, %es:2(%di)
+ add $4, %di
+ loop fill_noop_idt
+
+set_idt:
+ movw $timer_isr, %es:(IRQ_OFFSET*4)
+ movw %cs, %es:(IRQ_OFFSET*4+2)
+
+set_pic:
+ # ICW1
+ mov $0x11, %al
+ mov $(IO_PIC), %dx
+ out %al,%dx
+ # ICW2
+ mov $(IRQ_OFFSET), %al
+ mov $(IO_PIC+1), %dx
+ out %al, %dx
+ # ICW3
+ mov $0x00, %al
+ mov $(IO_PIC+1), %dx
+ out %al, %dx
+ # ICW4
+ mov $0x3, %al
+ mov $(IO_PIC+1), %dx
+ out %al, %dx
+
+set_pit:
+ # set 8254 mode
+ mov $(IO_PIT+3), %dx
+ mov $0x34, %al
+ outb %al, %dx
+ # set 8254 freq 1KHz
+ mov $(IO_PIT), %dx
+ movb $(TIMER_DIV(1000) % 256), %al
+ outb %al, %dx
+ movb $(TIMER_DIV(1000) / 256), %al
+ outb %al, %dx
+
+enable_irq0:
+ mov $0xfe, %al
+ mov $(IO_PIC+1), %dx
+ out %al, %dx
+ sti
+loop:
+ 1:
+ jmp 1b
+
+test_ok:
+ mov $0x3f8,%dx
+ cs lea msg2, %si
+ mov $(msg2_end-msg2), %cx
+ cs rep/outsb
+
+ /* Reboot by using the i8042 reboot line */
+ mov $0xfe, %al
+ outb %al, $0x64
+
+timer_isr:
+ cli
+ pushaw
+ pushfw
+ mov $0x3f8,%dx
+ mov $0x2e, %al # .
+ out %al,%dx
+ decw count
+ jz test_ok
+ popfw
+ popaw
+ iretw
+
+noop_handler:
+ iretw
+
+count:
+ .word TEST_COUNT
+
+msg2:
+ .asciz "\nTest OK\n"
+msg2_end:
diff --git a/tools/kvm/ui/gtk3.c b/tools/kvm/ui/gtk3.c
new file mode 100644
index 0000000..b2335bc
--- /dev/null
+++ b/tools/kvm/ui/gtk3.c
@@ -0,0 +1,326 @@
+#include "kvm/gtk3.h"
+
+#include "kvm/framebuffer.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/i8042.h"
+#include "kvm/vesa.h"
+#include "kvm/kvm.h"
+
+#include <gtk/gtk.h>
+#include <pthread.h>
+#include <linux/err.h>
+
+#define FRAME_RATE 25
+
+#define SCANCODE_UNKNOWN 0
+#define SCANCODE_NORMAL 1
+#define SCANCODE_ESCAPED 2
+#define SCANCODE_KEY_PAUSE 3
+#define SCANCODE_KEY_PRNTSCRN 4
+
+struct set2_scancode {
+ u8 code;
+ u8 type;
+};
+
+#define DEFINE_SC(_code) { \
+ .code = _code, \
+ .type = SCANCODE_NORMAL, \
+}
+
+/* escaped scancodes */
+#define DEFINE_ESC(_code) { \
+ .code = _code, \
+ .type = SCANCODE_ESCAPED, \
+}
+
+static const struct set2_scancode const keymap[256] = {
+ [9] = DEFINE_SC(0x76), /* <esc> */
+ [10] = DEFINE_SC(0x16), /* 1 */
+ [11] = DEFINE_SC(0x1e), /* 2 */
+ [12] = DEFINE_SC(0x26), /* 3 */
+ [13] = DEFINE_SC(0x25), /* 4 */
+ [14] = DEFINE_SC(0x2e), /* 5 */
+ [15] = DEFINE_SC(0x36), /* 6 */
+ [16] = DEFINE_SC(0x3d), /* 7 */
+ [17] = DEFINE_SC(0x3e), /* 8 */
+ [18] = DEFINE_SC(0x46), /* 9 */
+ [19] = DEFINE_SC(0x45), /* 9 */
+ [20] = DEFINE_SC(0x4e), /* - */
+ [21] = DEFINE_SC(0x55), /* + */
+ [22] = DEFINE_SC(0x66), /* <backspace> */
+ [23] = DEFINE_SC(0x0d), /* <tab> */
+ [24] = DEFINE_SC(0x15), /* q */
+ [25] = DEFINE_SC(0x1d), /* w */
+ [26] = DEFINE_SC(0x24), /* e */
+ [27] = DEFINE_SC(0x2d), /* r */
+ [28] = DEFINE_SC(0x2c), /* t */
+ [29] = DEFINE_SC(0x35), /* y */
+ [30] = DEFINE_SC(0x3c), /* u */
+ [31] = DEFINE_SC(0x43), /* i */
+ [32] = DEFINE_SC(0x44), /* o */
+ [33] = DEFINE_SC(0x4d), /* p */
+ [34] = DEFINE_SC(0x54), /* [ */
+ [35] = DEFINE_SC(0x5b), /* ] */
+ [36] = DEFINE_SC(0x5a), /* <enter> */
+ [37] = DEFINE_SC(0x14), /* <left ctrl> */
+ [38] = DEFINE_SC(0x1c), /* a */
+ [39] = DEFINE_SC(0x1b), /* s */
+ [40] = DEFINE_SC(0x23), /* d */
+ [41] = DEFINE_SC(0x2b), /* f */
+ [42] = DEFINE_SC(0x34), /* g */
+ [43] = DEFINE_SC(0x33), /* h */
+ [44] = DEFINE_SC(0x3b), /* j */
+ [45] = DEFINE_SC(0x42), /* k */
+ [46] = DEFINE_SC(0x4b), /* l */
+ [47] = DEFINE_SC(0x4c), /* ; */
+ [48] = DEFINE_SC(0x52), /* ' */
+ [49] = DEFINE_SC(0x0e), /* ` */
+ [50] = DEFINE_SC(0x12), /* <left shift> */
+ [51] = DEFINE_SC(0x5d), /* \ */
+ [52] = DEFINE_SC(0x1a), /* z */
+ [53] = DEFINE_SC(0x22), /* x */
+ [54] = DEFINE_SC(0x21), /* c */
+ [55] = DEFINE_SC(0x2a), /* v */
+ [56] = DEFINE_SC(0x32), /* b */
+ [57] = DEFINE_SC(0x31), /* n */
+ [58] = DEFINE_SC(0x3a), /* m */
+ [59] = DEFINE_SC(0x41), /* < */
+ [60] = DEFINE_SC(0x49), /* > */
+ [61] = DEFINE_SC(0x4a), /* / */
+ [62] = DEFINE_SC(0x59), /* <right shift> */
+ [63] = DEFINE_SC(0x7c), /* keypad * */
+ [64] = DEFINE_SC(0x11), /* <left alt> */
+ [65] = DEFINE_SC(0x29), /* <space> */
+
+ [67] = DEFINE_SC(0x05), /* <F1> */
+ [68] = DEFINE_SC(0x06), /* <F2> */
+ [69] = DEFINE_SC(0x04), /* <F3> */
+ [70] = DEFINE_SC(0x0c), /* <F4> */
+ [71] = DEFINE_SC(0x03), /* <F5> */
+ [72] = DEFINE_SC(0x0b), /* <F6> */
+ [73] = DEFINE_SC(0x83), /* <F7> */
+ [74] = DEFINE_SC(0x0a), /* <F8> */
+ [75] = DEFINE_SC(0x01), /* <F9> */
+ [76] = DEFINE_SC(0x09), /* <F10> */
+
+ [79] = DEFINE_SC(0x6c), /* keypad 7 */
+ [80] = DEFINE_SC(0x75), /* keypad 8 */
+ [81] = DEFINE_SC(0x7d), /* keypad 9 */
+ [82] = DEFINE_SC(0x7b), /* keypad - */
+ [83] = DEFINE_SC(0x6b), /* keypad 4 */
+ [84] = DEFINE_SC(0x73), /* keypad 5 */
+ [85] = DEFINE_SC(0x74), /* keypad 6 */
+ [86] = DEFINE_SC(0x79), /* keypad + */
+ [87] = DEFINE_SC(0x69), /* keypad 1 */
+ [88] = DEFINE_SC(0x72), /* keypad 2 */
+ [89] = DEFINE_SC(0x7a), /* keypad 3 */
+ [90] = DEFINE_SC(0x70), /* keypad 0 */
+ [91] = DEFINE_SC(0x71), /* keypad . */
+
+ [94] = DEFINE_SC(0x61), /* <INT 1> */
+ [95] = DEFINE_SC(0x78), /* <F11> */
+ [96] = DEFINE_SC(0x07), /* <F12> */
+
+ [104] = DEFINE_ESC(0x5a), /* keypad <enter> */
+ [105] = DEFINE_ESC(0x14), /* <right ctrl> */
+ [106] = DEFINE_ESC(0x4a), /* keypad / */
+ [108] = DEFINE_ESC(0x11), /* <right alt> */
+ [110] = DEFINE_ESC(0x6c), /* <home> */
+ [111] = DEFINE_ESC(0x75), /* <up> */
+ [112] = DEFINE_ESC(0x7d), /* <pag up> */
+ [113] = DEFINE_ESC(0x6b), /* <left> */
+ [114] = DEFINE_ESC(0x74), /* <right> */
+ [115] = DEFINE_ESC(0x69), /* <end> */
+ [116] = DEFINE_ESC(0x72), /* <down> */
+ [117] = DEFINE_ESC(0x7a), /* <pag down> */
+ [118] = DEFINE_ESC(0x70), /* <ins> */
+ [119] = DEFINE_ESC(0x71), /* <delete> */
+};
+
+static cairo_surface_t *surface;
+static bool done;
+
+static const struct set2_scancode *to_code(u8 scancode)
+{
+ return &keymap[scancode];
+}
+
+static gboolean
+kvm_gtk_configure_event(GtkWidget * widget, GdkEventConfigure * event, gpointer data)
+{
+ struct framebuffer *fb = data;
+ int stride;
+
+ if (surface)
+ cairo_surface_destroy(surface);
+
+ stride = cairo_format_stride_for_width(CAIRO_FORMAT_RGB24, fb->width);
+
+ surface =
+ cairo_image_surface_create_for_data((void *) fb->mem,
+ CAIRO_FORMAT_RGB24,
+ fb->width,
+ fb->height,
+ stride);
+
+ return TRUE;
+}
+
+static gboolean kvm_gtk_draw(GtkWidget *widget, cairo_t *cr, gpointer data)
+{
+ cairo_set_source_surface(cr, surface, 0, 0);
+
+ cairo_paint(cr);
+
+ return FALSE;
+}
+
+static void kvm_gtk_destroy(void)
+{
+ if (surface)
+ cairo_surface_destroy(surface);
+
+ gtk_main_quit();
+}
+
+static gboolean kvm_gtk_redraw(GtkWidget * window)
+{
+ gtk_widget_queue_draw(window);
+
+ return TRUE;
+}
+
+static gboolean
+kvm_gtk_key_press(GtkWidget *widget, GdkEventKey *event, gpointer user_data)
+{
+ const struct set2_scancode *sc = to_code(event->hardware_keycode);
+
+ switch (sc->type) {
+ case SCANCODE_ESCAPED:
+ kbd_queue(0xe0);
+ /* fallthrough */
+ case SCANCODE_NORMAL:
+ kbd_queue(sc->code);
+ break;
+ case SCANCODE_KEY_PAUSE:
+ kbd_queue(0xe1);
+ kbd_queue(0x14);
+ kbd_queue(0x77);
+ kbd_queue(0xe1);
+ kbd_queue(0xf0);
+ kbd_queue(0x14);
+ kbd_queue(0x77);
+ break;
+ case SCANCODE_KEY_PRNTSCRN:
+ kbd_queue(0xe0);
+ kbd_queue(0x12);
+ kbd_queue(0xe0);
+ kbd_queue(0x7c);
+ break;
+ }
+
+ return TRUE;
+}
+
+static void *kvm_gtk_thread(void *p)
+{
+ struct framebuffer *fb = p;
+ GtkWidget *window;
+ GtkWidget *frame;
+ GtkWidget *da;
+
+ gtk_init(NULL, NULL);
+
+ window = gtk_window_new(GTK_WINDOW_TOPLEVEL);
+
+ gtk_window_set_title(GTK_WINDOW(window), "VM");
+
+ g_signal_connect(window, "destroy", G_CALLBACK(kvm_gtk_destroy), NULL);
+
+ gtk_container_set_border_width(GTK_CONTAINER(window), 8);
+
+ frame = gtk_frame_new(NULL);
+
+ gtk_frame_set_shadow_type(GTK_FRAME(frame), GTK_SHADOW_IN);
+ gtk_container_add(GTK_CONTAINER(window), frame);
+
+ da = gtk_drawing_area_new();
+
+ gtk_widget_set_size_request(da, 100, 100);
+
+ gtk_container_add(GTK_CONTAINER(frame), da);
+
+ g_signal_connect(da, "draw", G_CALLBACK(kvm_gtk_draw), NULL);
+ g_signal_connect(da, "configure-event",
+ G_CALLBACK(kvm_gtk_configure_event), fb);
+ g_signal_connect(G_OBJECT (window), "key_press_event", G_CALLBACK(kvm_gtk_key_press), NULL);
+
+
+ gtk_widget_set_events(da, gtk_widget_get_events(da)
+ | GDK_BUTTON_PRESS_MASK
+ | GDK_POINTER_MOTION_MASK
+ | GDK_POINTER_MOTION_HINT_MASK);
+
+ gtk_widget_show_all(window);
+
+ g_timeout_add(1000 / FRAME_RATE, (GSourceFunc) kvm_gtk_redraw, window);
+
+ gtk_main();
+
+ done = true;
+
+ return NULL;
+}
+
+static int kvm_gtk_start(struct framebuffer *fb)
+{
+ pthread_t thread;
+
+ if (pthread_create(&thread, NULL, kvm_gtk_thread, fb) != 0)
+ return -1;
+
+ return 0;
+}
+
+static int kvm_gtk_stop(struct framebuffer *fb)
+{
+ gtk_main_quit();
+
+ while (!done)
+ sleep(0);
+
+ return 0;
+}
+
+static struct fb_target_operations kvm_gtk_ops = {
+ .start = kvm_gtk_start,
+ .stop = kvm_gtk_stop,
+};
+
+int kvm_gtk_init(struct kvm *kvm)
+{
+ struct framebuffer *fb;
+
+ if (!kvm->cfg.gtk)
+ return 0;
+
+ fb = vesa__init(kvm);
+ if (IS_ERR(fb)) {
+ pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb));
+ return PTR_ERR(fb);
+ }
+
+ return fb__attach(fb, &kvm_gtk_ops);
+}
+
+int kvm_gtk_exit(struct kvm *kvm)
+{
+ if (kvm->cfg.gtk)
+ return kvm_gtk_stop(NULL);
+
+ return 0;
+}
+
+dev_init(kvm_gtk_init);
+dev_exit(kvm_gtk_exit);
diff --git a/tools/kvm/ui/sdl.c b/tools/kvm/ui/sdl.c
new file mode 100644
index 0000000..a260002
--- /dev/null
+++ b/tools/kvm/ui/sdl.c
@@ -0,0 +1,324 @@
+#include "kvm/sdl.h"
+
+#include "kvm/framebuffer.h"
+#include "kvm/i8042.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/vesa.h"
+
+#include <SDL/SDL.h>
+#include <pthread.h>
+#include <signal.h>
+#include <linux/err.h>
+
+#define FRAME_RATE 25
+
+#define SCANCODE_UNKNOWN 0
+#define SCANCODE_NORMAL 1
+#define SCANCODE_ESCAPED 2
+#define SCANCODE_KEY_PAUSE 3
+#define SCANCODE_KEY_PRNTSCRN 4
+
+struct set2_scancode {
+ u8 code;
+ u8 type;
+};
+
+#define DEFINE_SC(_code) {\
+ .code = _code,\
+ .type = SCANCODE_NORMAL,\
+}
+
+/* escaped scancodes */
+#define DEFINE_ESC(_code) {\
+ .code = _code,\
+ .type = SCANCODE_ESCAPED,\
+}
+
+static const struct set2_scancode const keymap[256] = {
+ [9] = DEFINE_SC(0x76), /* <esc> */
+ [10] = DEFINE_SC(0x16), /* 1 */
+ [11] = DEFINE_SC(0x1e), /* 2 */
+ [12] = DEFINE_SC(0x26), /* 3 */
+ [13] = DEFINE_SC(0x25), /* 4 */
+ [14] = DEFINE_SC(0x2e), /* 5 */
+ [15] = DEFINE_SC(0x36), /* 6 */
+ [16] = DEFINE_SC(0x3d), /* 7 */
+ [17] = DEFINE_SC(0x3e), /* 8 */
+ [18] = DEFINE_SC(0x46), /* 9 */
+ [19] = DEFINE_SC(0x45), /* 9 */
+ [20] = DEFINE_SC(0x4e), /* - */
+ [21] = DEFINE_SC(0x55), /* + */
+ [22] = DEFINE_SC(0x66), /* <backspace> */
+ [23] = DEFINE_SC(0x0d), /* <tab> */
+ [24] = DEFINE_SC(0x15), /* q */
+ [25] = DEFINE_SC(0x1d), /* w */
+ [26] = DEFINE_SC(0x24), /* e */
+ [27] = DEFINE_SC(0x2d), /* r */
+ [28] = DEFINE_SC(0x2c), /* t */
+ [29] = DEFINE_SC(0x35), /* y */
+ [30] = DEFINE_SC(0x3c), /* u */
+ [31] = DEFINE_SC(0x43), /* i */
+ [32] = DEFINE_SC(0x44), /* o */
+ [33] = DEFINE_SC(0x4d), /* p */
+ [34] = DEFINE_SC(0x54), /* [ */
+ [35] = DEFINE_SC(0x5b), /* ] */
+ [36] = DEFINE_SC(0x5a), /* <enter> */
+ [37] = DEFINE_SC(0x14), /* <left ctrl> */
+ [38] = DEFINE_SC(0x1c), /* a */
+ [39] = DEFINE_SC(0x1b), /* s */
+ [40] = DEFINE_SC(0x23), /* d */
+ [41] = DEFINE_SC(0x2b), /* f */
+ [42] = DEFINE_SC(0x34), /* g */
+ [43] = DEFINE_SC(0x33), /* h */
+ [44] = DEFINE_SC(0x3b), /* j */
+ [45] = DEFINE_SC(0x42), /* k */
+ [46] = DEFINE_SC(0x4b), /* l */
+ [47] = DEFINE_SC(0x4c), /* ; */
+ [48] = DEFINE_SC(0x52), /* ' */
+ [49] = DEFINE_SC(0x0e), /* ` */
+ [50] = DEFINE_SC(0x12), /* <left shift> */
+ [51] = DEFINE_SC(0x5d), /* \ */
+ [52] = DEFINE_SC(0x1a), /* z */
+ [53] = DEFINE_SC(0x22), /* x */
+ [54] = DEFINE_SC(0x21), /* c */
+ [55] = DEFINE_SC(0x2a), /* v */
+ [56] = DEFINE_SC(0x32), /* b */
+ [57] = DEFINE_SC(0x31), /* n */
+ [58] = DEFINE_SC(0x3a), /* m */
+ [59] = DEFINE_SC(0x41), /* < */
+ [60] = DEFINE_SC(0x49), /* > */
+ [61] = DEFINE_SC(0x4a), /* / */
+ [62] = DEFINE_SC(0x59), /* <right shift> */
+ [63] = DEFINE_SC(0x7c), /* keypad * */
+ [64] = DEFINE_SC(0x11), /* <left alt> */
+ [65] = DEFINE_SC(0x29), /* <space> */
+
+ [67] = DEFINE_SC(0x05), /* <F1> */
+ [68] = DEFINE_SC(0x06), /* <F2> */
+ [69] = DEFINE_SC(0x04), /* <F3> */
+ [70] = DEFINE_SC(0x0c), /* <F4> */
+ [71] = DEFINE_SC(0x03), /* <F5> */
+ [72] = DEFINE_SC(0x0b), /* <F6> */
+ [73] = DEFINE_SC(0x83), /* <F7> */
+ [74] = DEFINE_SC(0x0a), /* <F8> */
+ [75] = DEFINE_SC(0x01), /* <F9> */
+ [76] = DEFINE_SC(0x09), /* <F10> */
+
+ [79] = DEFINE_SC(0x6c), /* keypad 7 */
+ [80] = DEFINE_SC(0x75), /* keypad 8 */
+ [81] = DEFINE_SC(0x7d), /* keypad 9 */
+ [82] = DEFINE_SC(0x7b), /* keypad - */
+ [83] = DEFINE_SC(0x6b), /* keypad 4 */
+ [84] = DEFINE_SC(0x73), /* keypad 5 */
+ [85] = DEFINE_SC(0x74), /* keypad 6 */
+ [86] = DEFINE_SC(0x79), /* keypad + */
+ [87] = DEFINE_SC(0x69), /* keypad 1 */
+ [88] = DEFINE_SC(0x72), /* keypad 2 */
+ [89] = DEFINE_SC(0x7a), /* keypad 3 */
+ [90] = DEFINE_SC(0x70), /* keypad 0 */
+ [91] = DEFINE_SC(0x71), /* keypad . */
+
+ [94] = DEFINE_SC(0x61), /* <INT 1> */
+ [95] = DEFINE_SC(0x78), /* <F11> */
+ [96] = DEFINE_SC(0x07), /* <F12> */
+
+ [104] = DEFINE_ESC(0x5a), /* keypad <enter> */
+ [105] = DEFINE_ESC(0x14), /* <right ctrl> */
+ [106] = DEFINE_ESC(0x4a), /* keypad / */
+ [108] = DEFINE_ESC(0x11), /* <right alt> */
+ [110] = DEFINE_ESC(0x6c), /* <home> */
+ [111] = DEFINE_ESC(0x75), /* <up> */
+ [112] = DEFINE_ESC(0x7d), /* <pag up> */
+ [113] = DEFINE_ESC(0x6b), /* <left> */
+ [114] = DEFINE_ESC(0x74), /* <right> */
+ [115] = DEFINE_ESC(0x69), /* <end> */
+ [116] = DEFINE_ESC(0x72), /* <down> */
+ [117] = DEFINE_ESC(0x7a), /* <pag down> */
+ [118] = DEFINE_ESC(0x70), /* <ins> */
+ [119] = DEFINE_ESC(0x71), /* <delete> */
+};
+static bool running, done;
+
+static const struct set2_scancode *to_code(u8 scancode)
+{
+ return &keymap[scancode];
+}
+
+static void key_press(const struct set2_scancode *sc)
+{
+ switch (sc->type) {
+ case SCANCODE_ESCAPED:
+ kbd_queue(0xe0);
+ /* fallthrough */
+ case SCANCODE_NORMAL:
+ kbd_queue(sc->code);
+ break;
+ case SCANCODE_KEY_PAUSE:
+ kbd_queue(0xe1);
+ kbd_queue(0x14);
+ kbd_queue(0x77);
+ kbd_queue(0xe1);
+ kbd_queue(0xf0);
+ kbd_queue(0x14);
+ kbd_queue(0x77);
+ break;
+ case SCANCODE_KEY_PRNTSCRN:
+ kbd_queue(0xe0);
+ kbd_queue(0x12);
+ kbd_queue(0xe0);
+ kbd_queue(0x7c);
+ break;
+ }
+}
+
+static void key_release(const struct set2_scancode *sc)
+{
+ switch (sc->type) {
+ case SCANCODE_ESCAPED:
+ kbd_queue(0xe0);
+ /* fallthrough */
+ case SCANCODE_NORMAL:
+ kbd_queue(0xf0);
+ kbd_queue(sc->code);
+ break;
+ case SCANCODE_KEY_PAUSE:
+ /* nothing to do */
+ break;
+ case SCANCODE_KEY_PRNTSCRN:
+ kbd_queue(0xe0);
+ kbd_queue(0xf0);
+ kbd_queue(0x7c);
+ kbd_queue(0xe0);
+ kbd_queue(0xf0);
+ kbd_queue(0x12);
+ break;
+ }
+}
+
+static void *sdl__thread(void *p)
+{
+ Uint32 rmask, gmask, bmask, amask;
+ struct framebuffer *fb = p;
+ SDL_Surface *guest_screen;
+ SDL_Surface *screen;
+ SDL_Event ev;
+ Uint32 flags;
+
+ kvm__set_thread_name("kvm-sdl-worker");
+
+ if (SDL_Init(SDL_INIT_VIDEO) != 0)
+ die("Unable to initialize SDL");
+
+ rmask = 0x000000ff;
+ gmask = 0x0000ff00;
+ bmask = 0x00ff0000;
+ amask = 0x00000000;
+
+ guest_screen = SDL_CreateRGBSurfaceFrom(fb->mem, fb->width, fb->height, fb->depth, fb->width * fb->depth / 8, rmask, gmask, bmask, amask);
+ if (!guest_screen)
+ die("Unable to create SDL RBG surface");
+
+ flags = SDL_HWSURFACE | SDL_ASYNCBLIT | SDL_HWACCEL | SDL_DOUBLEBUF;
+
+ SDL_WM_SetCaption("KVM tool", "KVM tool");
+
+ screen = SDL_SetVideoMode(fb->width, fb->height, fb->depth, flags);
+ if (!screen)
+ die("Unable to set SDL video mode");
+
+ SDL_EnableKeyRepeat(200, 50);
+
+ while (running) {
+ SDL_BlitSurface(guest_screen, NULL, screen, NULL);
+ SDL_Flip(screen);
+
+ while (SDL_PollEvent(&ev)) {
+ switch (ev.type) {
+ case SDL_KEYDOWN: {
+ const struct set2_scancode *sc = to_code(ev.key.keysym.scancode);
+ if (sc->type == SCANCODE_UNKNOWN) {
+ pr_warning("key '%d' not found in keymap", ev.key.keysym.scancode);
+ break;
+ }
+ key_press(sc);
+ break;
+ }
+ case SDL_KEYUP: {
+ const struct set2_scancode *sc = to_code(ev.key.keysym.scancode);
+ if (sc->type == SCANCODE_UNKNOWN)
+ break;
+ key_release(sc);
+ break;
+ }
+ case SDL_QUIT:
+ goto exit;
+ }
+ }
+
+ SDL_Delay(1000 / FRAME_RATE);
+ }
+
+ if (running == false && done == false) {
+ done = true;
+ return NULL;
+ }
+exit:
+ done = true;
+ kvm_cpu__reboot(fb->kvm);
+
+ return NULL;
+}
+
+static int sdl__start(struct framebuffer *fb)
+{
+ pthread_t thread;
+
+ running = true;
+
+ if (pthread_create(&thread, NULL, sdl__thread, fb) != 0)
+ return -1;
+
+ return 0;
+}
+
+static int sdl__stop(struct framebuffer *fb)
+{
+ running = false;
+ while (done == false)
+ sleep(0);
+
+ return 0;
+}
+
+static struct fb_target_operations sdl_ops = {
+ .start = sdl__start,
+ .stop = sdl__stop,
+};
+
+int sdl__init(struct kvm *kvm)
+{
+ struct framebuffer *fb;
+
+ if (!kvm->cfg.sdl)
+ return 0;
+
+ fb = vesa__init(kvm);
+ if (IS_ERR(fb)) {
+ pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb));
+ return PTR_ERR(fb);
+ }
+
+ return fb__attach(fb, &sdl_ops);
+}
+dev_init(sdl__init);
+
+int sdl__exit(struct kvm *kvm)
+{
+ if (kvm->cfg.sdl)
+ return sdl__stop(NULL);
+
+ return 0;
+}
+dev_exit(sdl__exit);
diff --git a/tools/kvm/ui/vnc.c b/tools/kvm/ui/vnc.c
new file mode 100644
index 0000000..12e4bd5
--- /dev/null
+++ b/tools/kvm/ui/vnc.c
@@ -0,0 +1,250 @@
+#include "kvm/vnc.h"
+
+#include "kvm/framebuffer.h"
+#include "kvm/i8042.h"
+#include "kvm/vesa.h"
+
+#include <linux/types.h>
+#include <rfb/keysym.h>
+#include <rfb/rfb.h>
+#include <pthread.h>
+#include <linux/err.h>
+
+#define VESA_QUEUE_SIZE 128
+#define VESA_IRQ 14
+
+/*
+ * This "6000" value is pretty much the result of experimentation
+ * It seems that around this value, things update pretty smoothly
+ */
+#define VESA_UPDATE_TIME 6000
+
+/*
+ * We can map the letters and numbers without a fuss,
+ * but the other characters not so much.
+ */
+static char letters[26] = {
+ 0x1c, 0x32, 0x21, 0x23, 0x24, /* a-e */
+ 0x2b, 0x34, 0x33, 0x43, 0x3b, /* f-j */
+ 0x42, 0x4b, 0x3a, 0x31, 0x44, /* k-o */
+ 0x4d, 0x15, 0x2d, 0x1b, 0x2c, /* p-t */
+ 0x3c, 0x2a, 0x1d, 0x22, 0x35, /* u-y */
+ 0x1a,
+};
+
+static rfbScreenInfoPtr server;
+static char num[10] = {
+ 0x45, 0x16, 0x1e, 0x26, 0x2e, 0x23, 0x36, 0x3d, 0x3e, 0x46,
+};
+
+/*
+ * This is called when the VNC server receives a key event
+ * The reason this function is such a beast is that we have
+ * to convert from ASCII characters (which is what VNC gets)
+ * to PC keyboard scancodes, which is what Linux expects to
+ * get from its keyboard. ASCII and the scancode set don't
+ * really seem to mesh in any good way beyond some basics with
+ * the letters and numbers.
+ */
+static void kbd_handle_key(rfbBool down, rfbKeySym key, rfbClientPtr cl)
+{
+ char tosend = 0;
+
+ if (key >= 0x41 && key <= 0x5a)
+ key += 0x20; /* convert to lowercase */
+
+ if (key >= 0x61 && key <= 0x7a) /* a-z */
+ tosend = letters[key - 0x61];
+
+ if (key >= 0x30 && key <= 0x39)
+ tosend = num[key - 0x30];
+
+ switch (key) {
+ case XK_Insert: kbd_queue(0xe0); tosend = 0x70; break;
+ case XK_Delete: kbd_queue(0xe0); tosend = 0x71; break;
+ case XK_Up: kbd_queue(0xe0); tosend = 0x75; break;
+ case XK_Down: kbd_queue(0xe0); tosend = 0x72; break;
+ case XK_Left: kbd_queue(0xe0); tosend = 0x6b; break;
+ case XK_Right: kbd_queue(0xe0); tosend = 0x74; break;
+ case XK_Page_Up: kbd_queue(0xe0); tosend = 0x7d; break;
+ case XK_Page_Down: kbd_queue(0xe0); tosend = 0x7a; break;
+ case XK_Home: kbd_queue(0xe0); tosend = 0x6c; break;
+ case XK_BackSpace: tosend = 0x66; break;
+ case XK_Tab: tosend = 0x0d; break;
+ case XK_Return: tosend = 0x5a; break;
+ case XK_Escape: tosend = 0x76; break;
+ case XK_End: tosend = 0x69; break;
+ case XK_Shift_L: tosend = 0x12; break;
+ case XK_Shift_R: tosend = 0x59; break;
+ case XK_Control_R: kbd_queue(0xe0);
+ case XK_Control_L: tosend = 0x14; break;
+ case XK_Alt_R: kbd_queue(0xe0);
+ case XK_Alt_L: tosend = 0x11; break;
+ case XK_quoteleft: tosend = 0x0e; break;
+ case XK_minus: tosend = 0x4e; break;
+ case XK_equal: tosend = 0x55; break;
+ case XK_bracketleft: tosend = 0x54; break;
+ case XK_bracketright: tosend = 0x5b; break;
+ case XK_backslash: tosend = 0x5d; break;
+ case XK_Caps_Lock: tosend = 0x58; break;
+ case XK_semicolon: tosend = 0x4c; break;
+ case XK_quoteright: tosend = 0x52; break;
+ case XK_comma: tosend = 0x41; break;
+ case XK_period: tosend = 0x49; break;
+ case XK_slash: tosend = 0x4a; break;
+ case XK_space: tosend = 0x29; break;
+
+ /*
+ * This is where I handle the shifted characters.
+ * They don't really map nicely the way A-Z maps to a-z,
+ * so I'm doing it manually
+ */
+ case XK_exclam: tosend = 0x16; break;
+ case XK_quotedbl: tosend = 0x52; break;
+ case XK_numbersign: tosend = 0x26; break;
+ case XK_dollar: tosend = 0x25; break;
+ case XK_percent: tosend = 0x2e; break;
+ case XK_ampersand: tosend = 0x3d; break;
+ case XK_parenleft: tosend = 0x46; break;
+ case XK_parenright: tosend = 0x45; break;
+ case XK_asterisk: tosend = 0x3e; break;
+ case XK_plus: tosend = 0x55; break;
+ case XK_colon: tosend = 0x4c; break;
+ case XK_less: tosend = 0x41; break;
+ case XK_greater: tosend = 0x49; break;
+ case XK_question: tosend = 0x4a; break;
+ case XK_at: tosend = 0x1e; break;
+ case XK_asciicircum: tosend = 0x36; break;
+ case XK_underscore: tosend = 0x4e; break;
+ case XK_braceleft: tosend = 0x54; break;
+ case XK_braceright: tosend = 0x5b; break;
+ case XK_bar: tosend = 0x5d; break;
+ case XK_asciitilde: tosend = 0x0e; break;
+ default: break;
+ }
+
+ /*
+	 * If this is a "key up" event (the user has released the key), we
+	 * need to send 0xf0 first.
+ */
+ if (!down && tosend != 0x0)
+ kbd_queue(0xf0);
+
+ if (tosend)
+ kbd_queue(tosend);
+}
+
+/* The previous X and Y coordinates of the mouse (-1 = no event seen yet) */
+static int xlast = -1, ylast = -1;
+
+/*
+ * This function is called by the VNC server whenever a mouse event occurs.
+ */
+static void kbd_handle_ptr(int buttonMask, int x, int y, rfbClientPtr cl)
+{
+ int dx, dy;
+ char b1 = 0x8;
+
+ /* The VNC mask and the PS/2 button encoding are the same */
+ b1 |= buttonMask;
+
+ if (xlast >= 0 && ylast >= 0) {
+ /* The PS/2 mouse sends deltas, not absolutes */
+ dx = x - xlast;
+ dy = ylast - y;
+
+ /* Set overflow bits if needed */
+ if (dy > 255)
+ b1 |= 0x80;
+ if (dx > 255)
+ b1 |= 0x40;
+
+ /* Set negative bits if needed */
+ if (dy < 0)
+ b1 |= 0x20;
+ if (dx < 0)
+ b1 |= 0x10;
+
+ mouse_queue(b1);
+ mouse_queue(dx);
+ mouse_queue(dy);
+ }
+
+ xlast = x;
+ ylast = y;
+ rfbDefaultPtrAddEvent(buttonMask, x, y, cl);
+}
+
+static void *vnc__thread(void *p)
+{
+ struct framebuffer *fb = p;
+ /*
+ * Make a fake argc and argv because the getscreen function
+ * seems to want it.
+ */
+ char argv[1][1] = {{0}};
+ int argc = 1;
+
+ kvm__set_thread_name("kvm-vnc-worker");
+
+ server = rfbGetScreen(&argc, (char **) argv, fb->width, fb->height, 8, 3, 4);
+ server->frameBuffer = fb->mem;
+ server->alwaysShared = TRUE;
+ server->kbdAddEvent = kbd_handle_key;
+ server->ptrAddEvent = kbd_handle_ptr;
+ rfbInitServer(server);
+
+ while (rfbIsActive(server)) {
+ rfbMarkRectAsModified(server, 0, 0, fb->width, fb->height);
+ rfbProcessEvents(server, server->deferUpdateTime * VESA_UPDATE_TIME);
+ }
+ return NULL;
+}
+
+static int vnc__start(struct framebuffer *fb)
+{
+ pthread_t thread;
+
+ if (pthread_create(&thread, NULL, vnc__thread, fb) != 0)
+ return -1;
+
+ return 0;
+}
+
+static int vnc__stop(struct framebuffer *fb)
+{
+ rfbShutdownServer(server, TRUE);
+
+ return 0;
+}
+
+static struct fb_target_operations vnc_ops = {
+ .start = vnc__start,
+ .stop = vnc__stop,
+};
+
+int vnc__init(struct kvm *kvm)
+{
+ struct framebuffer *fb;
+
+ if (!kvm->cfg.vnc)
+ return 0;
+
+ fb = vesa__init(kvm);
+ if (IS_ERR(fb)) {
+ pr_err("vesa__init() failed with error %ld\n", PTR_ERR(fb));
+ return PTR_ERR(fb);
+ }
+
+ return fb__attach(fb, &vnc_ops);
+}
+dev_init(vnc__init);
+
+int vnc__exit(struct kvm *kvm)
+{
+ if (kvm->cfg.vnc)
+ return vnc__stop(NULL);
+
+ return 0;
+}
+dev_exit(vnc__exit);
diff --git a/tools/kvm/util/KVMTOOLS-VERSION-GEN b/tools/kvm/util/KVMTOOLS-VERSION-GEN
new file mode 100755
index 0000000..1af9d6c
--- /dev/null
+++ b/tools/kvm/util/KVMTOOLS-VERSION-GEN
@@ -0,0 +1,40 @@
+#!/bin/sh
+
+if [ $# -eq 1 ] ; then
+ OUTPUT=$1
+fi
+
+GVF=${OUTPUT}KVMTOOLS-VERSION-FILE
+
+LF='
+'
+
+# First check if there is a .git to get the version from git describe
+# otherwise try to get the version from the kernel makefile
+if test -d ../../.git -o -f ../../.git &&
+ VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
+ case "$VN" in
+ *$LF*) (exit 1) ;;
+ v[0-9]*)
+ git update-index -q --refresh
+ test -z "$(git diff-index --name-only HEAD --)" ||
+ VN="$VN-dirty" ;;
+ esac
+then
+ VN=$(echo "$VN" | sed -e 's/-/./g');
+else
+ VN=$(MAKEFLAGS= make -sC ../.. kernelversion)
+fi
+
+VN=$(expr "$VN" : v*'\(.*\)')
+
+if test -r $GVF
+then
+ VC=$(sed -e 's/^KVMTOOLS_VERSION = //' <$GVF)
+else
+ VC=unset
+fi
+test "$VN" = "$VC" || {
+ echo >&2 "KVMTOOLS_VERSION = $VN"
+ echo "KVMTOOLS_VERSION = $VN" >$GVF
+}
diff --git a/tools/kvm/util/generate-cmdlist.sh b/tools/kvm/util/generate-cmdlist.sh
new file mode 100755
index 0000000..c8be0bd
--- /dev/null
+++ b/tools/kvm/util/generate-cmdlist.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+echo "/* Automatically generated by $0 */
+struct cmdname_help
+{
+ char name[16];
+ char help[80];
+};
+
+static struct cmdname_help common_cmds[] = {"
+
+sed -n 's/^lkvm-\([^ \t]*\).*common/\1/p' command-list.txt |
+while read cmd
+do
+ # TODO following sed command should be fixed
+ sed -n '/^NAME/,/^lkvm-'"$cmd"'/ {
+ /NAME/d
+ /--/d
+ s/.*kvm-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/
+ p
+ }' "Documentation/kvm-$cmd.txt"
+done
+echo "};"
diff --git a/tools/kvm/util/init.c b/tools/kvm/util/init.c
new file mode 100644
index 0000000..d4ce144
--- /dev/null
+++ b/tools/kvm/util/init.c
@@ -0,0 +1,67 @@
+#include <linux/list.h>
+#include <linux/kernel.h>
+
+#include "kvm/kvm.h"
+#include "kvm/util-init.h"
+
+#define PRIORITY_LISTS 10
+
+static struct hlist_head init_lists[PRIORITY_LISTS];
+static struct hlist_head exit_lists[PRIORITY_LISTS];
+
+int init_list_add(struct init_item *t, int (*init)(struct kvm *),
+ int priority, const char *name)
+{
+ t->init = init;
+ t->fn_name = name;
+ hlist_add_head(&t->n, &init_lists[priority]);
+
+ return 0;
+}
+
+int exit_list_add(struct init_item *t, int (*init)(struct kvm *),
+ int priority, const char *name)
+{
+ t->init = init;
+ t->fn_name = name;
+ hlist_add_head(&t->n, &exit_lists[priority]);
+
+ return 0;
+}
+
+int init_list__init(struct kvm *kvm)
+{
+ unsigned int i;
+ int r = 0;
+ struct init_item *t;
+
+ for (i = 0; i < ARRAY_SIZE(init_lists); i++)
+ hlist_for_each_entry(t, &init_lists[i], n) {
+ r = t->init(kvm);
+ if (r < 0) {
+ pr_warning("Failed init: %s\n", t->fn_name);
+ goto fail;
+ }
+ }
+
+fail:
+ return r;
+}
+
+int init_list__exit(struct kvm *kvm)
+{
+ int i;
+ int r = 0;
+ struct init_item *t;
+
+ for (i = ARRAY_SIZE(exit_lists) - 1; i >= 0; i--)
+ hlist_for_each_entry(t, &exit_lists[i], n) {
+ r = t->init(kvm);
+ if (r < 0) {
+ pr_warning("%s failed.\n", t->fn_name);
+ goto fail;
+ }
+ }
+fail:
+ return r;
+}
diff --git a/tools/kvm/util/iovec.c b/tools/kvm/util/iovec.c
new file mode 100644
index 0000000..0c8b9cf
--- /dev/null
+++ b/tools/kvm/util/iovec.c
@@ -0,0 +1,126 @@
+/*
+ * iovec manipulation routines.
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Andrew Lunn : Errors in iovec copying.
+ * Pedro Roque : Added memcpy_fromiovecend and
+ * csum_..._fromiovecend.
+ * Andi Kleen : fixed error handling for 2.1
+ * Alexey Kuznetsov: 2.1 optimisations
+ * Andi Kleen : Fix csum*fromiovecend for IPv6.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <sys/uio.h>
+#include <kvm/iovec.h>
+#include <string.h>
+
+/*
+ * Copy kernel to iovec. Returns -EFAULT on error.
+ *
+ * Note: this modifies the original iovec.
+ */
+
+int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
+{
+ while (len > 0) {
+ if (iov->iov_len) {
+ int copy = min_t(unsigned int, iov->iov_len, len);
+ memcpy(iov->iov_base, kdata, copy);
+ kdata += copy;
+ len -= copy;
+ iov->iov_len -= copy;
+ iov->iov_base += copy;
+ }
+ iov++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(memcpy_toiovec);
+
+/*
+ * Copy kernel to iovec. Returns -EFAULT on error.
+ */
+
+int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
+ size_t offset, int len)
+{
+ int copy;
+ for (; len > 0; ++iov) {
+ /* Skip over the finished iovecs */
+ if (unlikely(offset >= iov->iov_len)) {
+ offset -= iov->iov_len;
+ continue;
+ }
+ copy = min_t(unsigned int, iov->iov_len - offset, len);
+ memcpy(iov->iov_base + offset, kdata, copy);
+ offset = 0;
+ kdata += copy;
+ len -= copy;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(memcpy_toiovecend);
+
+/*
+ * Copy iovec to kernel. Returns -EFAULT on error.
+ *
+ * Note: this modifies the original iovec.
+ */
+
+int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
+{
+ while (len > 0) {
+ if (iov->iov_len) {
+ int copy = min_t(unsigned int, len, iov->iov_len);
+ memcpy(kdata, iov->iov_base, copy);
+ len -= copy;
+ kdata += copy;
+ iov->iov_base += copy;
+ iov->iov_len -= copy;
+ }
+ iov++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(memcpy_fromiovec);
+
+/*
+ * Copy iovec to kernel, starting at byte 'offset' into the iovec. Returns -EFAULT on error.
+ */
+
+int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
+ size_t offset, int len)
+{
+ /* Skip over the finished iovecs */
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ }
+
+ while (len > 0) {
+ char *base = iov->iov_base + offset;
+ int copy = min_t(unsigned int, len, iov->iov_len - offset);
+
+ offset = 0;
+ memcpy(kdata, base, copy);
+ len -= copy;
+ kdata += copy;
+ iov++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(memcpy_fromiovecend);
diff --git a/tools/kvm/util/kvm-ifup-vbr0 b/tools/kvm/util/kvm-ifup-vbr0
new file mode 100755
index 0000000..a91c37f
--- /dev/null
+++ b/tools/kvm/util/kvm-ifup-vbr0
@@ -0,0 +1,6 @@
+#!/bin/sh
+switch=vbr0
+/sbin/ifconfig $1 0.0.0.0 up
+/usr/sbin/brctl addif ${switch} $1
+/usr/sbin/brctl setfd ${switch} 0
+/usr/sbin/brctl stp ${switch} off
diff --git a/tools/kvm/util/parse-options.c b/tools/kvm/util/parse-options.c
new file mode 100644
index 0000000..9a1bbee
--- /dev/null
+++ b/tools/kvm/util/parse-options.c
@@ -0,0 +1,577 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <stdbool.h>
+
+/* user defined includes */
+#include <linux/types.h>
+#include <kvm/util.h>
+#include <kvm/parse-options.h>
+#include <kvm/strbuf.h>
+
+#define OPT_SHORT 1
+#define OPT_UNSET 2
+
+static int opterror(const struct option *opt, const char *reason, int flags)
+{
+ if (flags & OPT_SHORT)
+ return pr_err("switch `%c' %s", opt->short_name, reason);
+ if (flags & OPT_UNSET)
+ return pr_err("option `no-%s' %s", opt->long_name, reason);
+ return pr_err("option `%s' %s", opt->long_name, reason);
+}
+
+static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
+ int flags, const char **arg)
+{
+ if (p->opt) {
+ *arg = p->opt;
+ p->opt = NULL;
+ } else if ((opt->flags & PARSE_OPT_LASTARG_DEFAULT) && (p->argc == 1 ||
+ **(p->argv + 1) == '-')) {
+ *arg = (const char *)opt->defval;
+ } else if (p->argc > 1) {
+ p->argc--;
+ *arg = *++p->argv;
+ } else
+ return opterror(opt, "requires a value", flags);
+ return 0;
+}
+
+static int readnum(const struct option *opt, int flags,
+ const char *str, char **end)
+{
+ switch (opt->type) {
+ case OPTION_INTEGER:
+ *(int *)opt->value = strtol(str, end, 0);
+ break;
+ case OPTION_UINTEGER:
+ *(unsigned int *)opt->value = strtol(str, end, 0);
+ break;
+ case OPTION_LONG:
+ *(long *)opt->value = strtol(str, end, 0);
+ break;
+ case OPTION_U64:
+ *(u64 *)opt->value = strtoull(str, end, 0);
+ break;
+ default:
+ return opterror(opt, "invalid numeric conversion", flags);
+ }
+
+ return 0;
+}
+
+static int get_value(struct parse_opt_ctx_t *p,
+ const struct option *opt, int flags)
+{
+ const char *s, *arg = NULL;
+ const int unset = flags & OPT_UNSET;
+
+ if (unset && p->opt)
+ return opterror(opt, "takes no value", flags);
+ if (unset && (opt->flags & PARSE_OPT_NONEG))
+ return opterror(opt, "isn't available", flags);
+
+ if (!(flags & OPT_SHORT) && p->opt) {
+ switch (opt->type) {
+ case OPTION_CALLBACK:
+ if (!(opt->flags & PARSE_OPT_NOARG))
+ break;
+ /* FALLTHROUGH */
+ case OPTION_BOOLEAN:
+ case OPTION_INCR:
+ case OPTION_BIT:
+ case OPTION_SET_UINT:
+ case OPTION_SET_PTR:
+ return opterror(opt, "takes no value", flags);
+ case OPTION_END:
+ case OPTION_ARGUMENT:
+ case OPTION_GROUP:
+ case OPTION_STRING:
+ case OPTION_INTEGER:
+ case OPTION_UINTEGER:
+ case OPTION_LONG:
+ case OPTION_U64:
+ default:
+ break;
+ }
+ }
+
+ switch (opt->type) {
+ case OPTION_BIT:
+ if (unset)
+ *(int *)opt->value &= ~opt->defval;
+ else
+ *(int *)opt->value |= opt->defval;
+ return 0;
+
+ case OPTION_BOOLEAN:
+ *(bool *)opt->value = unset ? false : true;
+ return 0;
+
+ case OPTION_INCR:
+ *(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
+ return 0;
+
+ case OPTION_SET_UINT:
+ *(unsigned int *)opt->value = unset ? 0 : opt->defval;
+ return 0;
+
+ case OPTION_SET_PTR:
+ *(void **)opt->value = unset ? NULL : (void *)opt->defval;
+ return 0;
+
+ case OPTION_STRING:
+ if (unset)
+ *(const char **)opt->value = NULL;
+ else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+ *(const char **)opt->value = (const char *)opt->defval;
+ else
+ return get_arg(p, opt, flags,
+ (const char **)opt->value);
+ return 0;
+
+ case OPTION_CALLBACK:
+ if (unset)
+ return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
+ if (opt->flags & PARSE_OPT_NOARG)
+ return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+ if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+ return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+ if (get_arg(p, opt, flags, &arg))
+ return -1;
+ return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
+
+ case OPTION_INTEGER:
+ if (unset) {
+ *(int *)opt->value = 0;
+ return 0;
+ }
+ if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+ *(int *)opt->value = opt->defval;
+ return 0;
+ }
+ if (get_arg(p, opt, flags, &arg))
+ return -1;
+ return readnum(opt, flags, arg, (char **)&s);
+
+ case OPTION_UINTEGER:
+ if (unset) {
+ *(unsigned int *)opt->value = 0;
+ return 0;
+ }
+ if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+ *(unsigned int *)opt->value = opt->defval;
+ return 0;
+ }
+ if (get_arg(p, opt, flags, &arg))
+ return -1;
+ return readnum(opt, flags, arg, (char **)&s);
+
+ case OPTION_LONG:
+ if (unset) {
+ *(long *)opt->value = 0;
+ return 0;
+ }
+ if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+ *(long *)opt->value = opt->defval;
+ return 0;
+ }
+ if (get_arg(p, opt, flags, &arg))
+ return -1;
+ return readnum(opt, flags, arg, (char **)&s);
+
+ case OPTION_U64:
+ if (unset) {
+ *(u64 *)opt->value = 0;
+ return 0;
+ }
+ if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+ *(u64 *)opt->value = opt->defval;
+ return 0;
+ }
+ if (get_arg(p, opt, flags, &arg))
+ return -1;
+ return readnum(opt, flags, arg, (char **)&s);
+
+ case OPTION_END:
+ case OPTION_ARGUMENT:
+ case OPTION_GROUP:
+ default:
+ die("should not happen, someone must be hit on the forehead");
+ }
+}
+
+#define USAGE_OPTS_WIDTH 24
+#define USAGE_GAP 2
+
+static int usage_with_options_internal(const char * const *usagestr,
+ const struct option *opts, int full)
+{
+ if (!usagestr)
+ return PARSE_OPT_HELP;
+
+ fprintf(stderr, "\n usage: %s\n", *usagestr++);
+ while (*usagestr && **usagestr)
+ fprintf(stderr, " or: %s\n", *usagestr++);
+ while (*usagestr) {
+ fprintf(stderr, "%s%s\n",
+ **usagestr ? " " : "",
+ *usagestr);
+ usagestr++;
+ }
+
+ if (opts->type != OPTION_GROUP)
+ fputc('\n', stderr);
+
+ for (; opts->type != OPTION_END; opts++) {
+ size_t pos;
+ int pad;
+
+ if (opts->type == OPTION_GROUP) {
+ fputc('\n', stderr);
+ if (*opts->help)
+ fprintf(stderr, "%s\n", opts->help);
+ continue;
+ }
+ if (!full && (opts->flags & PARSE_OPT_HIDDEN))
+ continue;
+
+ pos = fprintf(stderr, " ");
+ if (opts->short_name)
+ pos += fprintf(stderr, "-%c", opts->short_name);
+ else
+ pos += fprintf(stderr, " ");
+
+ if (opts->long_name && opts->short_name)
+ pos += fprintf(stderr, ", ");
+ if (opts->long_name)
+ pos += fprintf(stderr, "--%s", opts->long_name);
+
+ switch (opts->type) {
+ case OPTION_ARGUMENT:
+ break;
+ case OPTION_LONG:
+ case OPTION_U64:
+ case OPTION_INTEGER:
+ case OPTION_UINTEGER:
+ if (opts->flags & PARSE_OPT_OPTARG)
+ if (opts->long_name)
+ pos += fprintf(stderr, "[=<n>]");
+ else
+ pos += fprintf(stderr, "[<n>]");
+ else
+ pos += fprintf(stderr, " <n>");
+ break;
+ case OPTION_CALLBACK:
+ if (opts->flags & PARSE_OPT_NOARG)
+ break;
+ /* FALLTHROUGH */
+ case OPTION_STRING:
+ if (opts->argh) {
+ if (opts->flags & PARSE_OPT_OPTARG)
+ if (opts->long_name)
+ pos += fprintf(stderr, "[=<%s>]", opts->argh);
+ else
+ pos += fprintf(stderr, "[<%s>]", opts->argh);
+ else
+ pos += fprintf(stderr, " <%s>", opts->argh);
+ } else {
+ if (opts->flags & PARSE_OPT_OPTARG)
+ if (opts->long_name)
+ pos += fprintf(stderr, "[=...]");
+ else
+ pos += fprintf(stderr, "[...]");
+ else
+ pos += fprintf(stderr, " ...");
+ }
+ break;
+ default: /* OPTION_{BIT,BOOLEAN,SET_UINT,SET_PTR} */
+ case OPTION_END:
+ case OPTION_GROUP:
+ case OPTION_BIT:
+ case OPTION_BOOLEAN:
+ case OPTION_INCR:
+ case OPTION_SET_UINT:
+ case OPTION_SET_PTR:
+ break;
+ }
+ if (pos <= USAGE_OPTS_WIDTH)
+ pad = USAGE_OPTS_WIDTH - pos;
+ else {
+ fputc('\n', stderr);
+ pad = USAGE_OPTS_WIDTH;
+ }
+ fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
+ }
+ fputc('\n', stderr);
+
+ return PARSE_OPT_HELP;
+}
+
+void usage_with_options(const char * const *usagestr,
+ const struct option *opts)
+{
+ usage_with_options_internal(usagestr, opts, 0);
+ exit(129);
+}
+
+static void check_typos(const char *arg, const struct option *options)
+{
+ if (strlen(arg) < 3)
+ return;
+
+ if (!prefixcmp(arg, "no-")) {
+ pr_err("did you mean `--%s` (with two dashes ?)", arg);
+ exit(129);
+ }
+
+ for (; options->type != OPTION_END; options++) {
+ if (!options->long_name)
+ continue;
+ if (!prefixcmp(options->long_name, arg)) {
+ pr_err("did you mean `--%s` (with two dashes ?)", arg);
+ exit(129);
+ }
+ }
+}
+
+static int parse_options_usage(const char * const *usagestr,
+ const struct option *opts)
+{
+ return usage_with_options_internal(usagestr, opts, 0);
+}
+
+static int parse_short_opt(struct parse_opt_ctx_t *p,
+ const struct option *options)
+{
+ for (; options->type != OPTION_END; options++) {
+ if (options->short_name == *p->opt) {
+ p->opt = p->opt[1] ? p->opt + 1 : NULL;
+ return get_value(p, options, OPT_SHORT);
+ }
+ }
+ return -2;
+}
+
+static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
+ const struct option *options)
+{
+ const char *arg_end = strchr(arg, '=');
+ const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
+ int abbrev_flags = 0, ambiguous_flags = 0;
+
+ if (!arg_end)
+ arg_end = arg + strlen(arg);
+
+ for (; options->type != OPTION_END; options++) {
+ const char *rest;
+ int flags = 0;
+
+ if (!options->long_name)
+ continue;
+
+ rest = skip_prefix(arg, options->long_name);
+ if (options->type == OPTION_ARGUMENT) {
+ if (!rest)
+ continue;
+ if (*rest == '=')
+ return opterror(options, "takes no value",
+ flags);
+ if (*rest)
+ continue;
+ p->out[p->cpidx++] = arg - 2;
+ return 0;
+ }
+ if (!rest) {
+ /* abbreviated? */
+ if (!strncmp(options->long_name, arg, arg_end - arg)) {
+is_abbreviated:
+ if (abbrev_option) {
+ /*
+ * If this is abbreviated, it is
+ * ambiguous. So when there is no
+ * exact match later, we need to
+ * error out.
+ */
+ ambiguous_option = abbrev_option;
+ ambiguous_flags = abbrev_flags;
+ }
+ if (!(flags & OPT_UNSET) && *arg_end)
+ p->opt = arg_end + 1;
+ abbrev_option = options;
+ abbrev_flags = flags;
+ continue;
+ }
+ /* negated and abbreviated very much? */
+ if (!prefixcmp("no-", arg)) {
+ flags |= OPT_UNSET;
+ goto is_abbreviated;
+ }
+ /* negated? */
+ if (strncmp(arg, "no-", 3))
+ continue;
+ flags |= OPT_UNSET;
+ rest = skip_prefix(arg + 3, options->long_name);
+ /* abbreviated and negated? */
+ if (!rest && !prefixcmp(options->long_name, arg + 3))
+ goto is_abbreviated;
+ if (!rest)
+ continue;
+ }
+ if (*rest) {
+ if (*rest != '=')
+ continue;
+ p->opt = rest + 1;
+ }
+ return get_value(p, options, flags);
+ }
+
+ if (ambiguous_option)
+ return pr_err("Ambiguous option: %s "
+ "(could be --%s%s or --%s%s)",
+ arg,
+ (ambiguous_flags & OPT_UNSET) ? "no-" : "",
+ ambiguous_option->long_name,
+ (abbrev_flags & OPT_UNSET) ? "no-" : "",
+ abbrev_option->long_name);
+ if (abbrev_option)
+ return get_value(p, abbrev_option, abbrev_flags);
+ return -2;
+}
+
+
+static void parse_options_start(struct parse_opt_ctx_t *ctx, int argc,
+ const char **argv, int flags)
+{
+ memset(ctx, 0, sizeof(*ctx));
+ ctx->argc = argc;
+ ctx->argv = argv;
+ ctx->out = argv;
+ ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
+ ctx->flags = flags;
+ if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
+ (flags & PARSE_OPT_STOP_AT_NON_OPTION))
+ die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
+}
+
+static int parse_options_end(struct parse_opt_ctx_t *ctx)
+{
+ memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
+ ctx->out[ctx->cpidx + ctx->argc] = NULL;
+ return ctx->cpidx + ctx->argc;
+}
+
+
+static int parse_options_step(struct parse_opt_ctx_t *ctx,
+ const struct option *options, const char * const usagestr[])
+{
+ int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
+
+ /* we must reset ->opt, unknown short option leave it dangling */
+ ctx->opt = NULL;
+
+ for (; ctx->argc; ctx->argc--, ctx->argv++) {
+ const char *arg = ctx->argv[0];
+
+ if (*arg != '-' || !arg[1]) {
+ if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
+ break;
+ ctx->out[ctx->cpidx++] = ctx->argv[0];
+ continue;
+ }
+
+ if (arg[1] != '-') {
+ ctx->opt = arg + 1;
+ if (internal_help && *ctx->opt == 'h')
+ return parse_options_usage(usagestr, options);
+ switch (parse_short_opt(ctx, options)) {
+ case -1:
+ return parse_options_usage(usagestr, options);
+ case -2:
+ goto unknown;
+ default:
+ break;
+ }
+ if (ctx->opt)
+ check_typos(arg + 1, options);
+ while (ctx->opt) {
+ if (internal_help && *ctx->opt == 'h')
+ return parse_options_usage(usagestr,
+ options);
+ switch (parse_short_opt(ctx, options)) {
+ case -1:
+ return parse_options_usage(usagestr,
+ options);
+ case -2:
+ /* fake a short option thing to hide
+ * the fact that we may have
+ * started to parse aggregated stuff
+ *
+ * This is leaky, too bad.
+ */
+ ctx->argv[0] = strdup(ctx->opt - 1);
+ *(char *)ctx->argv[0] = '-';
+ goto unknown;
+ default:
+ break;
+ }
+ }
+ continue;
+ }
+
+ if (!arg[2]) { /* "--" */
+ if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
+ ctx->argc--;
+ ctx->argv++;
+ }
+ break;
+ }
+
+ if (internal_help && !strcmp(arg + 2, "help-all"))
+ return usage_with_options_internal(usagestr, options,
+ 1);
+ if (internal_help && !strcmp(arg + 2, "help"))
+ return parse_options_usage(usagestr, options);
+ switch (parse_long_opt(ctx, arg + 2, options)) {
+ case -1:
+ return parse_options_usage(usagestr, options);
+ case -2:
+ goto unknown;
+ default:
+ break;
+ }
+ continue;
+unknown:
+ if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
+ return PARSE_OPT_UNKNOWN;
+ ctx->out[ctx->cpidx++] = ctx->argv[0];
+ ctx->opt = NULL;
+ }
+ return PARSE_OPT_DONE;
+}
+
+int parse_options(int argc, const char **argv, const struct option *options,
+ const char * const usagestr[], int flags)
+{
+ struct parse_opt_ctx_t ctx;
+
+ parse_options_start(&ctx, argc, argv, flags);
+ switch (parse_options_step(&ctx, options, usagestr)) {
+ case PARSE_OPT_HELP:
+ exit(129);
+ case PARSE_OPT_DONE:
+ break;
+ default: /* PARSE_OPT_UNKNOWN */
+ if (ctx.argv[0][1] == '-') {
+ pr_err("unknown option `%s'", ctx.argv[0] + 2);
+ } else {
+ pr_err("unknown switch `%c'", *ctx.opt);
+ }
+ usage_with_options(usagestr, options);
+ }
+
+ return parse_options_end(&ctx);
+}
diff --git a/tools/kvm/util/rbtree-interval.c b/tools/kvm/util/rbtree-interval.c
new file mode 100644
index 0000000..3630a6d
--- /dev/null
+++ b/tools/kvm/util/rbtree-interval.c
@@ -0,0 +1,58 @@
+#include <kvm/rbtree-interval.h>
+#include <stddef.h>
+#include <errno.h>
+
+struct rb_int_node *rb_int_search_single(struct rb_root *root, u64 point)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct rb_int_node *cur = rb_int(node);
+
+ if (point < cur->low)
+ node = node->rb_left;
+ else if (cur->high <= point)
+ node = node->rb_right;
+ else
+ return cur;
+ }
+
+ return NULL;
+}
+
+struct rb_int_node *rb_int_search_range(struct rb_root *root, u64 low, u64 high)
+{
+ struct rb_int_node *range;
+
+ range = rb_int_search_single(root, low);
+ if (range == NULL)
+ return NULL;
+
+ /* We simply verify that 'high' is smaller than the end of the range where 'low' is located */
+ if (range->high < high)
+ return NULL;
+
+ return range;
+}
+
+int rb_int_insert(struct rb_root *root, struct rb_int_node *i_node)
+{
+ struct rb_node **node = &root->rb_node, *parent = NULL;
+
+ while (*node) {
+ struct rb_int_node *cur = rb_int(*node);
+
+ parent = *node;
+ if (i_node->high <= cur->low)
+ node = &cur->node.rb_left;
+ else if (cur->high <= i_node->low)
+ node = &cur->node.rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&i_node->node, parent, node);
+ rb_insert_color(&i_node->node, root);
+
+ return 0;
+}
diff --git a/tools/kvm/util/read-write.c b/tools/kvm/util/read-write.c
new file mode 100644
index 0000000..44709df
--- /dev/null
+++ b/tools/kvm/util/read-write.c
@@ -0,0 +1,354 @@
+#include "kvm/read-write.h"
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+/* Same as read(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xread(int fd, void *buf, size_t count)
+{
+ ssize_t nr;
+
+restart:
+ nr = read(fd, buf, count);
+ if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+ goto restart;
+
+ return nr;
+}
+
+/* Same as write(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xwrite(int fd, const void *buf, size_t count)
+{
+ ssize_t nr;
+
+restart:
+ nr = write(fd, buf, count);
+ if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+ goto restart;
+
+ return nr;
+}
+
+ssize_t read_in_full(int fd, void *buf, size_t count)
+{
+ ssize_t total = 0;
+ char *p = buf;
+
+ while (count > 0) {
+ ssize_t nr;
+
+ nr = xread(fd, p, count);
+ if (nr <= 0) {
+ if (total > 0)
+ return total;
+
+ return -1;
+ }
+
+ count -= nr;
+ total += nr;
+ p += nr;
+ }
+
+ return total;
+}
+
+ssize_t write_in_full(int fd, const void *buf, size_t count)
+{
+ const char *p = buf;
+ ssize_t total = 0;
+
+ while (count > 0) {
+ ssize_t nr;
+
+ nr = xwrite(fd, p, count);
+ if (nr < 0)
+ return -1;
+ if (nr == 0) {
+ errno = ENOSPC;
+ return -1;
+ }
+ count -= nr;
+ total += nr;
+ p += nr;
+ }
+
+ return total;
+}
+
+/* Same as pread(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpread(int fd, void *buf, size_t count, off_t offset)
+{
+ ssize_t nr;
+
+restart:
+ nr = pread(fd, buf, count, offset);
+ if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+ goto restart;
+
+ return nr;
+}
+
+/* Same as pwrite(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+ ssize_t nr;
+
+restart:
+ nr = pwrite(fd, buf, count, offset);
+ if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+ goto restart;
+
+ return nr;
+}
+
+ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset)
+{
+ ssize_t total = 0;
+ char *p = buf;
+
+ while (count > 0) {
+ ssize_t nr;
+
+ nr = xpread(fd, p, count, offset);
+ if (nr <= 0) {
+ if (total > 0)
+ return total;
+
+ return -1;
+ }
+
+ count -= nr;
+ total += nr;
+ p += nr;
+ offset += nr;
+ }
+
+ return total;
+}
+
+ssize_t pwrite_in_full(int fd, const void *buf, size_t count, off_t offset)
+{
+ const char *p = buf;
+ ssize_t total = 0;
+
+ while (count > 0) {
+ ssize_t nr;
+
+ nr = xpwrite(fd, p, count, offset);
+ if (nr < 0)
+ return -1;
+ if (nr == 0) {
+ errno = ENOSPC;
+ return -1;
+ }
+ count -= nr;
+ total += nr;
+ p += nr;
+ offset += nr;
+ }
+
+ return total;
+}
+
+/* Same as readv(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xreadv(int fd, const struct iovec *iov, int iovcnt)
+{
+ ssize_t nr;
+
+restart:
+ nr = readv(fd, iov, iovcnt);
+ if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+ goto restart;
+
+ return nr;
+}
+
+/* Same as writev(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xwritev(int fd, const struct iovec *iov, int iovcnt)
+{
+ ssize_t nr;
+
+restart:
+ nr = writev(fd, iov, iovcnt);
+ if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+ goto restart;
+
+ return nr;
+}
+
+static inline ssize_t get_iov_size(const struct iovec *iov, int iovcnt)
+{
+ size_t size = 0;
+ while (iovcnt--)
+ size += (iov++)->iov_len;
+
+ return size;
+}
+
+static inline void shift_iovec(const struct iovec **iov, int *iovcnt,
+ size_t nr, ssize_t *total, size_t *count, off_t *offset)
+{
+ while (nr >= (*iov)->iov_len) {
+ nr -= (*iov)->iov_len;
+ *total += (*iov)->iov_len;
+ *count -= (*iov)->iov_len;
+ if (offset)
+ *offset += (*iov)->iov_len;
+ (*iovcnt)--;
+ (*iov)++;
+ }
+}
+
+ssize_t readv_in_full(int fd, const struct iovec *iov, int iovcnt)
+{
+ ssize_t total = 0;
+ size_t count = get_iov_size(iov, iovcnt);
+
+ while (count > 0) {
+ ssize_t nr;
+
+ nr = xreadv(fd, iov, iovcnt);
+ if (nr <= 0) {
+ if (total > 0)
+ return total;
+
+ return -1;
+ }
+
+ shift_iovec(&iov, &iovcnt, nr, &total, &count, NULL);
+ }
+
+ return total;
+}
+
+ssize_t writev_in_full(int fd, const struct iovec *iov, int iovcnt)
+{
+ ssize_t total = 0;
+ size_t count = get_iov_size(iov, iovcnt);
+
+ while (count > 0) {
+ ssize_t nr;
+
+ nr = xwritev(fd, iov, iovcnt);
+ if (nr < 0)
+ return -1;
+ if (nr == 0) {
+ errno = ENOSPC;
+ return -1;
+ }
+
+ shift_iovec(&iov, &iovcnt, nr, &total, &count, NULL);
+ }
+
+ return total;
+}
+
+/* Same as preadv(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpreadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+ ssize_t nr;
+
+restart:
+ nr = preadv(fd, iov, iovcnt, offset);
+ if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+ goto restart;
+
+ return nr;
+}
+
+/* Same as pwritev(2) except that this function never returns EAGAIN or EINTR. */
+ssize_t xpwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+ ssize_t nr;
+
+restart:
+ nr = pwritev(fd, iov, iovcnt, offset);
+ if ((nr < 0) && ((errno == EAGAIN) || (errno == EINTR)))
+ goto restart;
+
+ return nr;
+}
+
+ssize_t preadv_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+ ssize_t total = 0;
+ size_t count = get_iov_size(iov, iovcnt);
+
+ while (count > 0) {
+ ssize_t nr;
+
+ nr = xpreadv(fd, iov, iovcnt, offset);
+ if (nr <= 0) {
+ if (total > 0)
+ return total;
+
+ return -1;
+ }
+
+ shift_iovec(&iov, &iovcnt, nr, &total, &count, &offset);
+ }
+
+ return total;
+}
+
+ssize_t pwritev_in_full(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+ ssize_t total = 0;
+ size_t count = get_iov_size(iov, iovcnt);
+
+ while (count > 0) {
+ ssize_t nr;
+
+ nr = xpwritev(fd, iov, iovcnt, offset);
+ if (nr < 0)
+ return -1;
+ if (nr == 0) {
+ errno = ENOSPC;
+ return -1;
+ }
+
+ shift_iovec(&iov, &iovcnt, nr, &total, &count, &offset);
+ }
+
+ return total;
+}
+
+#ifdef CONFIG_HAS_AIO
+int aio_pwritev(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt,
+ off_t offset, int ev, void *param)
+{
+ struct iocb *ios[1] = { iocb };
+ int ret;
+
+ io_prep_pwritev(iocb, fd, iov, iovcnt, offset);
+ io_set_eventfd(iocb, ev);
+ iocb->data = param;
+
+restart:
+ ret = io_submit(ctx, 1, ios);
+ if (ret == -EAGAIN)
+ goto restart;
+ return ret;
+}
+
+int aio_preadv(io_context_t ctx, struct iocb *iocb, int fd, const struct iovec *iov, int iovcnt,
+ off_t offset, int ev, void *param)
+{
+ struct iocb *ios[1] = { iocb };
+ int ret;
+
+ io_prep_preadv(iocb, fd, iov, iovcnt, offset);
+ io_set_eventfd(iocb, ev);
+ iocb->data = param;
+
+restart:
+ ret = io_submit(ctx, 1, ios);
+ if (ret == -EAGAIN)
+ goto restart;
+ return ret;
+}
+#endif
diff --git a/tools/kvm/util/set_private_br.sh b/tools/kvm/util/set_private_br.sh
new file mode 100755
index 0000000..49867dd
--- /dev/null
+++ b/tools/kvm/util/set_private_br.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# Author: Amos Kong <kongjianjun@gmail.com>
+# Date: Apr 14, 2011
+# Description: this script is used to create/delete a private bridge,
+# launch a dhcp server on the bridge by dnsmasq.
+#
+# @ ./set_private_br.sh $bridge_name $subnet_prefix
+# @ ./set_private_br.sh vbr0 192.168.33
+
+brname='vbr0'
+subnet='192.168.33'
+
+# Create the private bridge, give it $subnet.1, enable IPv4 forwarding,
+# add a MASQUERADE rule so guests can reach the outside world, then
+# start dnsmasq as the DHCP server for the subnet.
+# NOTE(review): the -s/-d "$subnet.254/24" arguments carry host bits;
+# presumably "$subnet.0/24" was intended -- verify before changing.
+# NOTE(review): $tftp_cmd is never set in this script; it expands to
+# nothing unless exported by the caller.
+add_br()
+{
+    echo "add new private bridge: $brname"
+    /usr/sbin/brctl addbr $brname
+    echo 1 > /proc/sys/net/ipv6/conf/$brname/disable_ipv6
+    echo 1 > /proc/sys/net/ipv4/ip_forward
+    /usr/sbin/brctl stp $brname on
+    /usr/sbin/brctl setfd $brname 0
+    ifconfig $brname $subnet.1
+    ifconfig $brname up
+    # Add forward rule, then guest can access public network
+    iptables -t nat -A POSTROUTING -s $subnet.254/24 ! -d $subnet.254/24 -j MASQUERADE
+    /etc/init.d/dnsmasq stop
+    /etc/init.d/tftpd-hpa stop 2>/dev/null
+    dnsmasq --strict-order --bind-interfaces --listen-address $subnet.1 --dhcp-range $subnet.1,$subnet.254 $tftp_cmd
+}
+
+# Tear down what add_br created: kill the dnsmasq instance, bring the
+# bridge down, delete it and remove the MASQUERADE rule.
+# NOTE(review): `pgrep dnsmasq | tail -1` kills whichever dnsmasq was
+# started last, not necessarily ours -- confirm this is acceptable.
+del_br()
+{
+    echo "cleanup bridge setup"
+    kill -9 `pgrep dnsmasq|tail -1`
+    ifconfig $brname down
+    /usr/sbin/brctl delbr $brname
+    iptables -t nat -D POSTROUTING -s $subnet.254/24 ! -d $subnet.254/24 -j MASQUERADE
+}
+
+
+# No arguments: tear the bridge down.  Otherwise arg 1 overrides the
+# bridge name and arg 2 the subnet prefix.
+if [ $# = 0 ]; then
+    del_br 2>/dev/null
+    exit
+fi
+# Was "[ $# > 1 ]": inside [ ] the ">" is a shell redirection (it
+# created a file named "1" and tested nothing); use -ge to compare.
+if [ $# -ge 1 ]; then
+    brname="$1"
+fi
+if [ $# = 2 ]; then
+    subnet="$2"
+fi
+add_br
diff --git a/tools/kvm/util/strbuf.c b/tools/kvm/util/strbuf.c
new file mode 100644
index 0000000..99d6b0c
--- /dev/null
+++ b/tools/kvm/util/strbuf.c
@@ -0,0 +1,62 @@
+
+/* user defined headers */
+#include <kvm/util.h>
+#include <kvm/strbuf.h>
+
+/*
+ * Return 0 if 'prefix' is a prefix of 'str'; otherwise return the
+ * difference (prefix byte - str byte) at the first mismatch.  Note
+ * the sign convention is the reverse of strcmp(str, prefix).
+ */
+int prefixcmp(const char *str, const char *prefix)
+{
+	for (; ; str++, prefix++) {
+		if (!*prefix)
+			return 0;
+		else if (*str != *prefix)
+			return (unsigned char)*prefix - (unsigned char)*str;
+	}
+}
+
+/**
+ * strlcat - Append a length-limited, %NUL-terminated string to another
+ * @dest: The string to be appended to
+ * @src: The string to append to it
+ * @count: The size of the destination buffer.
+ *
+ * Returns the total length of the string it tried to create, i.e.
+ * strlen(dest) + strlen(src); a return >= @count means the result was
+ * truncated.  Aborts (DIE_IF) if @dest is already not NUL-terminated
+ * within @count bytes.
+ */
+size_t strlcat(char *dest, const char *src, size_t count)
+{
+	size_t dsize = strlen(dest);
+	size_t len = strlen(src);
+	size_t res = dsize + len;
+
+	DIE_IF(dsize >= count);
+
+	dest += dsize;
+	count -= dsize;
+	if (len >= count)
+		len = count - 1;
+
+	memcpy(dest, src, len);
+	dest[len] = 0;
+
+	return res;
+}
+
+/**
+ * strlcpy - Copy a %NUL terminated string into a sized buffer
+ * @dest: Where to copy the string to
+ * @src: Where to copy the string from
+ * @size: size of destination buffer
+ *
+ * Compatible with *BSD: the result is always a valid
+ * NUL-terminated string that fits in the buffer (unless,
+ * of course, the buffer size is zero). It does not pad
+ * out the result like strncpy() does.
+ *
+ * Returns strlen(@src); a return value >= @size indicates truncation.
+ */
+size_t strlcpy(char *dest, const char *src, size_t size)
+{
+	size_t ret = strlen(src);
+
+	if (size) {
+		size_t len = (ret >= size) ? size - 1 : ret;
+		memcpy(dest, src, len);
+		dest[len] = '\0';
+	}
+	return ret;
+}
diff --git a/tools/kvm/util/threadpool.c b/tools/kvm/util/threadpool.c
new file mode 100644
index 0000000..e64aa26
--- /dev/null
+++ b/tools/kvm/util/threadpool.c
@@ -0,0 +1,175 @@
+#include "kvm/threadpool.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm.h"
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <pthread.h>
+#include <stdbool.h>
+
+static DEFINE_MUTEX(job_mutex);
+static DEFINE_MUTEX(thread_mutex);
+static pthread_cond_t job_cond = PTHREAD_COND_INITIALIZER;
+
+static LIST_HEAD(head);
+
+static pthread_t *threads;
+static long threadcount;
+static bool running;
+
+/*
+ * Detach and return the oldest queued job, or NULL if the queue is
+ * empty.  Caller must hold job_mutex.
+ */
+static struct thread_pool__job *thread_pool__job_pop_locked(void)
+{
+	struct thread_pool__job *job;
+
+	if (list_empty(&head))
+		return NULL;
+
+	job = list_first_entry(&head, struct thread_pool__job, queue);
+	list_del(&job->queue);
+
+	return job;
+}
+
+/* Append a job to the queue tail.  Caller must hold job_mutex. */
+static void thread_pool__job_push_locked(struct thread_pool__job *job)
+{
+	list_add_tail(&job->queue, &head);
+}
+
+/* Locking wrapper around thread_pool__job_pop_locked(). */
+static struct thread_pool__job *thread_pool__job_pop(void)
+{
+	struct thread_pool__job *job;
+
+	mutex_lock(&job_mutex);
+	job = thread_pool__job_pop_locked();
+	mutex_unlock(&job_mutex);
+	return job;
+}
+
+/* Locking wrapper around thread_pool__job_push_locked(). */
+static void thread_pool__job_push(struct thread_pool__job *job)
+{
+	mutex_lock(&job_mutex);
+	thread_pool__job_push_locked(job);
+	mutex_unlock(&job_mutex);
+}
+
+/*
+ * Run 'job' and then keep draining the queue.  signalcount counts
+ * pending signals on the job: if thread_pool__do_job() signalled it
+ * again while the callback was running, the job is re-queued so no
+ * signal is lost.
+ */
+static void thread_pool__handle_job(struct thread_pool__job *job)
+{
+	while (job) {
+		job->callback(job->kvm, job->data);
+
+		mutex_lock(&job->mutex);
+
+		if (--job->signalcount > 0)
+			/* If the job was signaled again while we were working */
+			thread_pool__job_push(job);
+
+		mutex_unlock(&job->mutex);
+
+		job = thread_pool__job_pop();
+	}
+}
+
+/*
+ * Cancellation cleanup handler: if a worker is cancelled while
+ * sleeping in pthread_cond_wait() it holds job_mutex; release it.
+ */
+static void thread_pool__threadfunc_cleanup(void *param)
+{
+	mutex_unlock(&job_mutex);
+}
+
+/*
+ * Worker thread main loop: sleep on job_cond until a job is queued or
+ * the pool is shut down, then process jobs via
+ * thread_pool__handle_job().
+ * NOTE(review): 'running' is a plain bool written by another thread
+ * without synchronization on the read in the outer while -- formally a
+ * data race; consider making it atomic.
+ */
+static void *thread_pool__threadfunc(void *param)
+{
+	pthread_cleanup_push(thread_pool__threadfunc_cleanup, NULL);
+
+	kvm__set_thread_name("threadpool-worker");
+
+	while (running) {
+		struct thread_pool__job *curjob = NULL;
+
+		/* wait, with job_mutex held across the predicate check */
+		mutex_lock(&job_mutex);
+		while (running && (curjob = thread_pool__job_pop_locked()) == NULL)
+			pthread_cond_wait(&job_cond, &job_mutex.mutex);
+		mutex_unlock(&job_mutex);
+
+		if (running)
+			thread_pool__handle_job(curjob);
+	}
+
+	pthread_cleanup_pop(0);
+
+	return NULL;
+}
+
+/*
+ * Grow the worker array by one slot (under thread_mutex) and spawn a
+ * new worker thread.  Returns 0 on success, -1 on allocation failure,
+ * or the pthread_create() error code.
+ */
+static int thread_pool__addthread(void)
+{
+	int res;
+	void *newthreads;
+
+	mutex_lock(&thread_mutex);
+	newthreads = realloc(threads, (threadcount + 1) * sizeof(pthread_t));
+	if (newthreads == NULL) {
+		mutex_unlock(&thread_mutex);
+		return -1;
+	}
+
+	threads = newthreads;
+
+	res = pthread_create(threads + threadcount, NULL,
+			     thread_pool__threadfunc, NULL);
+
+	/* only count the thread if it actually started */
+	if (res == 0)
+		threadcount++;
+	mutex_unlock(&thread_mutex);
+
+	return res;
+}
+
+/*
+ * Spawn one worker per online CPU.  Returns the number of workers
+ * actually started (which is also returned when a spawn fails
+ * part-way through).
+ */
+int thread_pool__init(struct kvm *kvm)
+{
+	unsigned long i;
+	long nr_cpus;
+	unsigned int thread_count;
+
+	/*
+	 * sysconf() returns -1 on failure; the old code assigned that
+	 * straight to an unsigned int, which would make the loop try to
+	 * spawn ~UINT_MAX workers.  Fall back to a single thread.
+	 */
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	thread_count = (nr_cpus > 0) ? (unsigned int)nr_cpus : 1;
+
+	running = true;
+
+	for (i = 0; i < thread_count; i++)
+		if (thread_pool__addthread() < 0)
+			return i;
+
+	return i;
+}
+late_init(thread_pool__init);
+
+/*
+ * Shut the pool down: clear 'running', wake every worker so it
+ * observes the flag, then join them all.  Always returns 0.
+ */
+int thread_pool__exit(struct kvm *kvm)
+{
+	int i;
+
+	running = false;
+
+	/* one signal per worker so each blocked thread wakes up */
+	for (i = 0; i < threadcount; i++) {
+		mutex_lock(&job_mutex);
+		pthread_cond_signal(&job_cond);
+		mutex_unlock(&job_mutex);
+	}
+
+	for (i = 0; i < threadcount; i++) {
+		/*
+		 * pthread_join() takes 'void **' for the exit status; the
+		 * old code passed a 'void *NUL' variable, a pointer-type
+		 * mismatch.  We don't need the status, so pass NULL.
+		 */
+		pthread_join(threads[i], NULL);
+	}
+
+	return 0;
+}
+late_exit(thread_pool__exit);
+
+/*
+ * Signal a job for execution.  signalcount going 0 -> 1 means the job
+ * is not queued or running, so queue it; a higher count is handled by
+ * thread_pool__handle_job() re-queueing after the callback.  Finally
+ * wake one worker.
+ */
+void thread_pool__do_job(struct thread_pool__job *job)
+{
+	struct thread_pool__job *jobinfo = job;
+
+	if (jobinfo == NULL || jobinfo->callback == NULL)
+		return;
+
+	mutex_lock(&jobinfo->mutex);
+	if (jobinfo->signalcount++ == 0)
+		thread_pool__job_push(job);
+	mutex_unlock(&jobinfo->mutex);
+
+	mutex_lock(&job_mutex);
+	pthread_cond_signal(&job_cond);
+	mutex_unlock(&job_mutex);
+}
diff --git a/tools/kvm/util/util.c b/tools/kvm/util/util.c
new file mode 100644
index 0000000..1877105
--- /dev/null
+++ b/tools/kvm/util/util.c
@@ -0,0 +1,133 @@
+/*
+ * Taken from perf which in turn take it from GIT
+ */
+
+#include "kvm/util.h"
+
+#include <kvm/kvm.h>
+#include <linux/magic.h> /* For HUGETLBFS_MAGIC */
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+
+/* Format 'err' with 'params' and print it to stderr behind 'prefix'. */
+static void report(const char *prefix, const char *err, va_list params)
+{
+	char msg[1024];
+	vsnprintf(msg, sizeof(msg), err, params);
+	fprintf(stderr, " %s%s\n", prefix, msg);
+}
+
+/* Print a fatal message and terminate with git-style exit code 128. */
+static NORETURN void die_builtin(const char *err, va_list params)
+{
+	report(" Fatal: ", err, params);
+	exit(128);
+}
+
+/* Print a non-fatal error message to stderr. */
+static void error_builtin(const char *err, va_list params)
+{
+	report(" Error: ", err, params);
+}
+
+/* Print a warning message to stderr. */
+static void warn_builtin(const char *warn, va_list params)
+{
+	report(" Warning: ", warn, params);
+}
+
+/* Print an informational message to stderr. */
+static void info_builtin(const char *info, va_list params)
+{
+	report(" Info: ", info, params);
+}
+
+/* printf-style fatal error: report and exit(128); never returns. */
+void die(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	die_builtin(err, params);
+	va_end(params);
+}
+
+/*
+ * printf-style error report; always returns -1 so callers can write
+ * "return pr_err(...)".
+ */
+int pr_err(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	error_builtin(err, params);
+	va_end(params);
+	return -1;
+}
+
+/* printf-style warning to stderr. */
+void pr_warning(const char *warn, ...)
+{
+	va_list params;
+
+	va_start(params, warn);
+	warn_builtin(warn, params);
+	va_end(params);
+}
+
+/* printf-style informational message to stderr. */
+void pr_info(const char *info, ...)
+{
+	va_list params;
+
+	va_start(params, info);
+	info_builtin(info, params);
+	va_end(params);
+}
+
+/* perror()-style fatal exit: print 's' plus strerror(errno), exit(1). */
+void die_perror(const char *s)
+{
+	perror(s);
+	exit(1);
+}
+
+/*
+ * Map 'size' bytes of guest RAM backed by a hugetlbfs file.  Verifies
+ * that 'htlbfs_path' really is hugetlbfs and that one huge page fits
+ * in 'size', records the page size in kvm->ram_pagesize, then creates
+ * an unlinked temp file there and mmaps it (the unlink makes the
+ * backing file anonymous; the mapping keeps it alive).
+ * Dies on any setup failure.
+ * NOTE(review): the mmap() result is returned without checking for
+ * MAP_FAILED -- presumably the caller checks; verify.
+ */
+void *mmap_hugetlbfs(struct kvm *kvm, const char *htlbfs_path, u64 size)
+{
+	char mpath[PATH_MAX];
+	int fd;
+	struct statfs sfs;
+	void *addr;
+	unsigned long blk_size;
+
+	if (statfs(htlbfs_path, &sfs) < 0)
+		die("Can't stat %s\n", htlbfs_path);
+
+	if ((unsigned int)sfs.f_type != HUGETLBFS_MAGIC)
+		die("%s is not hugetlbfs!\n", htlbfs_path);
+
+	blk_size = (unsigned long)sfs.f_bsize;
+	/* f_bsize is the huge page size; it must fit in the request */
+	if (sfs.f_bsize == 0 || blk_size > size) {
+		die("Can't use hugetlbfs pagesize %ld for mem size %lld\n",
+		    blk_size, (unsigned long long)size);
+	}
+
+	kvm->ram_pagesize = blk_size;
+
+	snprintf(mpath, PATH_MAX, "%s/kvmtoolXXXXXX", htlbfs_path);
+	fd = mkstemp(mpath);
+	if (fd < 0)
+		die("Can't open %s for hugetlbfs map\n", mpath);
+	unlink(mpath);
+	if (ftruncate(fd, size) < 0)
+		die("Can't ftruncate for mem mapping size %lld\n",
+		    (unsigned long long)size);
+	addr = mmap(NULL, size, PROT_RW, MAP_PRIVATE, fd, 0);
+	close(fd);
+
+	return addr;
+}
+
+/*
+ * Wraps the decision between a hugetlbfs-backed map (if a path was
+ * given) and a plain anonymous mmap, recording the effective page
+ * size in kvm->ram_pagesize either way.
+ */
+void *mmap_anon_or_hugetlbfs(struct kvm *kvm, const char *hugetlbfs_path, u64 size)
+{
+	if (hugetlbfs_path)
+		/*
+		 * We don't /need/ to map guest RAM from hugetlbfs, but we do so
+		 * if the user specifies a hugetlbfs path.
+		 */
+		return mmap_hugetlbfs(kvm, hugetlbfs_path, size);
+	else {
+		kvm->ram_pagesize = getpagesize();
+		return mmap(NULL, size, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
+	}
+}
diff --git a/tools/kvm/virtio/9p-pdu.c b/tools/kvm/virtio/9p-pdu.c
new file mode 100644
index 0000000..cc2e871
--- /dev/null
+++ b/tools/kvm/virtio/9p-pdu.c
@@ -0,0 +1,290 @@
+#include "kvm/util.h"
+#include "kvm/virtio-9p.h"
+
+#include <endian.h>
+#include <stdint.h>
+
+#include <linux/compiler.h>
+#include <linux/uidgid.h>
+#include <net/9p/9p.h>
+
+/*
+ * Copy 'size' bytes out of the PDU's out_iov scatter list into 'data',
+ * starting at pdu->read_offset, and advance read_offset by the amount
+ * actually copied.  Copies less than 'size' silently if the iovec runs
+ * out.
+ */
+static void virtio_p9_pdu_read(struct p9_pdu *pdu, void *data, size_t size)
+{
+	size_t len;
+	int i, copied = 0;
+	u16 iov_cnt = pdu->out_iov_cnt;
+	size_t offset = pdu->read_offset;
+	struct iovec *iov = pdu->out_iov;
+
+	for (i = 0; i < iov_cnt && size; i++) {
+		if (offset >= iov[i].iov_len) {
+			/* the read cursor starts in a later segment */
+			offset -= iov[i].iov_len;
+			continue;
+		} else {
+			len = MIN(iov[i].iov_len - offset, size);
+			memcpy(data, iov[i].iov_base + offset, len);
+			size -= len;
+			data += len;
+			offset = 0;
+			copied += len;
+		}
+	}
+	pdu->read_offset += copied;
+}
+
+/*
+ * Mirror of virtio_p9_pdu_read() for the reply direction: copy 'size'
+ * bytes from 'data' into the PDU's in_iov scatter list at
+ * pdu->write_offset and advance write_offset by the bytes copied.
+ */
+static void virtio_p9_pdu_write(struct p9_pdu *pdu,
+				const void *data, size_t size)
+{
+	size_t len;
+	int i, copied = 0;
+	u16 iov_cnt = pdu->in_iov_cnt;
+	size_t offset = pdu->write_offset;
+	struct iovec *iov = pdu->in_iov;
+
+	for (i = 0; i < iov_cnt && size; i++) {
+		if (offset >= iov[i].iov_len) {
+			offset -= iov[i].iov_len;
+			continue;
+		} else {
+			len = MIN(iov[i].iov_len - offset, size);
+			memcpy(iov[i].iov_base + offset, data, len);
+			size -= len;
+			data += len;
+			offset = 0;
+			copied += len;
+		}
+	}
+	pdu->write_offset += copied;
+}
+
+/* Free the four strings a decoded 'S' (p9_wstat) owns. */
+static void virtio_p9_wstat_free(struct p9_wstat *stbuf)
+{
+	free(stbuf->name);
+	free(stbuf->uid);
+	free(stbuf->gid);
+	free(stbuf->muid);
+}
+
+/*
+ * Unmarshal little-endian 9P wire data from the PDU according to the
+ * printf-like format string: b/w/d/q = 8/16/32/64-bit integers,
+ * s = length-prefixed string (heap-allocated, caller frees),
+ * Q = qid, S = wstat (strings heap-allocated), I = iattr_dotl.
+ * Returns 0 on success or a positive errno value.
+ * NOTE(review): the 's' length is read as a signed int16_t; a length
+ * with the top bit set would be negative here -- confirm the guest
+ * cannot trigger that path.
+ */
+static int virtio_p9_decode(struct p9_pdu *pdu, const char *fmt, va_list ap)
+{
+	int retval = 0;
+	const char *ptr;
+
+	for (ptr = fmt; *ptr; ptr++) {
+		switch (*ptr) {
+		case 'b':
+		{
+			int8_t *val = va_arg(ap, int8_t *);
+			virtio_p9_pdu_read(pdu, val, sizeof(*val));
+		}
+		break;
+		case 'w':
+		{
+			int16_t le_val;
+			int16_t *val = va_arg(ap, int16_t *);
+			virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+			*val = le16toh(le_val);
+		}
+		break;
+		case 'd':
+		{
+			int32_t le_val;
+			int32_t *val = va_arg(ap, int32_t *);
+			virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+			*val = le32toh(le_val);
+		}
+		break;
+		case 'q':
+		{
+			int64_t le_val;
+			int64_t *val = va_arg(ap, int64_t *);
+			virtio_p9_pdu_read(pdu, &le_val, sizeof(le_val));
+			*val = le64toh(le_val);
+		}
+		break;
+		case 's':
+		{
+			int16_t len;
+			char **str = va_arg(ap, char **);
+
+			virtio_p9_pdu_readf(pdu, "w", &len);
+			*str = malloc(len + 1);
+			if (*str == NULL) {
+				retval = ENOMEM;
+				break;
+			}
+			virtio_p9_pdu_read(pdu, *str, len);
+			(*str)[len] = 0;
+		}
+		break;
+		case 'Q':
+		{
+			struct p9_qid *qid = va_arg(ap, struct p9_qid *);
+			retval = virtio_p9_pdu_readf(pdu, "bdq",
+						     &qid->type, &qid->version,
+						     &qid->path);
+		}
+		break;
+		case 'S':
+		{
+			struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *);
+			memset(stbuf, 0, sizeof(struct p9_wstat));
+			stbuf->n_uid = KUIDT_INIT(-1);
+			stbuf->n_gid = KGIDT_INIT(-1);
+			stbuf->n_muid = KUIDT_INIT(-1);
+			retval = virtio_p9_pdu_readf(pdu, "wwdQdddqssss",
+						     &stbuf->size, &stbuf->type,
+						     &stbuf->dev, &stbuf->qid,
+						     &stbuf->mode, &stbuf->atime,
+						     &stbuf->mtime, &stbuf->length,
+						     &stbuf->name, &stbuf->uid,
+						     &stbuf->gid, &stbuf->muid);
+			/* don't leak partially-decoded strings on failure */
+			if (retval)
+				virtio_p9_wstat_free(stbuf);
+		}
+		break;
+		case 'I':
+		{
+			struct p9_iattr_dotl *p9attr = va_arg(ap,
+							      struct p9_iattr_dotl *);
+
+			retval = virtio_p9_pdu_readf(pdu, "ddddqqqqq",
+						     &p9attr->valid,
+						     &p9attr->mode,
+						     &p9attr->uid,
+						     &p9attr->gid,
+						     &p9attr->size,
+						     &p9attr->atime_sec,
+						     &p9attr->atime_nsec,
+						     &p9attr->mtime_sec,
+						     &p9attr->mtime_nsec);
+		}
+		break;
+		default:
+			retval = EINVAL;
+			break;
+		}
+	}
+	return retval;
+}
+
+/*
+ * Marshal values into the PDU reply buffer in little-endian 9P wire
+ * format; the format characters mirror virtio_p9_decode() plus
+ * 'A' = p9_stat_dotl (the full getattr reply).  Strings ('s') are
+ * written as 16-bit length + bytes, capped at USHRT_MAX.
+ * Returns 0 on success or a positive errno value.
+ */
+static int virtio_p9_pdu_encode(struct p9_pdu *pdu, const char *fmt, va_list ap)
+{
+	int retval = 0;
+	const char *ptr;
+
+	for (ptr = fmt; *ptr; ptr++) {
+		switch (*ptr) {
+		case 'b':
+		{
+			/* char/short promote to int through varargs */
+			int8_t val = va_arg(ap, int);
+			virtio_p9_pdu_write(pdu, &val, sizeof(val));
+		}
+		break;
+		case 'w':
+		{
+			int16_t val = htole16(va_arg(ap, int));
+			virtio_p9_pdu_write(pdu, &val, sizeof(val));
+		}
+		break;
+		case 'd':
+		{
+			int32_t val = htole32(va_arg(ap, int32_t));
+			virtio_p9_pdu_write(pdu, &val, sizeof(val));
+		}
+		break;
+		case 'q':
+		{
+			int64_t val = htole64(va_arg(ap, int64_t));
+			virtio_p9_pdu_write(pdu, &val, sizeof(val));
+		}
+		break;
+		case 's':
+		{
+			uint16_t len = 0;
+			const char *s = va_arg(ap, char *);
+			if (s)
+				len = MIN(strlen(s), USHRT_MAX);
+			virtio_p9_pdu_writef(pdu, "w", len);
+			virtio_p9_pdu_write(pdu, s, len);
+		}
+		break;
+		case 'Q':
+		{
+			struct p9_qid *qid = va_arg(ap, struct p9_qid *);
+			retval = virtio_p9_pdu_writef(pdu, "bdq",
+						      qid->type, qid->version,
+						      qid->path);
+		}
+		break;
+		case 'S':
+		{
+			struct p9_wstat *stbuf = va_arg(ap, struct p9_wstat *);
+			retval = virtio_p9_pdu_writef(pdu, "wwdQdddqssss",
+						      stbuf->size, stbuf->type,
+						      stbuf->dev, &stbuf->qid,
+						      stbuf->mode, stbuf->atime,
+						      stbuf->mtime, stbuf->length,
+						      stbuf->name, stbuf->uid,
+						      stbuf->gid, stbuf->muid);
+		}
+		break;
+		case 'A':
+		{
+			struct p9_stat_dotl *stbuf = va_arg(ap,
+							    struct p9_stat_dotl *);
+			retval = virtio_p9_pdu_writef(pdu,
+						      "qQdddqqqqqqqqqqqqqqq",
+						      stbuf->st_result_mask,
+						      &stbuf->qid,
+						      stbuf->st_mode,
+						      stbuf->st_uid,
+						      stbuf->st_gid,
+						      stbuf->st_nlink,
+						      stbuf->st_rdev,
+						      stbuf->st_size,
+						      stbuf->st_blksize,
+						      stbuf->st_blocks,
+						      stbuf->st_atime_sec,
+						      stbuf->st_atime_nsec,
+						      stbuf->st_mtime_sec,
+						      stbuf->st_mtime_nsec,
+						      stbuf->st_ctime_sec,
+						      stbuf->st_ctime_nsec,
+						      stbuf->st_btime_sec,
+						      stbuf->st_btime_nsec,
+						      stbuf->st_gen,
+						      stbuf->st_data_version);
+		}
+		break;
+		default:
+			retval = EINVAL;
+			break;
+		}
+	}
+	return retval;
+}
+
+/* Variadic front-end for virtio_p9_decode(). */
+int virtio_p9_pdu_readf(struct p9_pdu *pdu, const char *fmt, ...)
+{
+	int ret;
+	va_list ap;
+
+	va_start(ap, fmt);
+	ret = virtio_p9_decode(pdu, fmt, ap);
+	va_end(ap);
+
+	return ret;
+}
+
+/* Variadic front-end for virtio_p9_pdu_encode(). */
+int virtio_p9_pdu_writef(struct p9_pdu *pdu, const char *fmt, ...)
+{
+	int ret;
+	va_list ap;
+
+	va_start(ap, fmt);
+	ret = virtio_p9_pdu_encode(pdu, fmt, ap);
+	va_end(ap);
+
+	return ret;
+}
diff --git a/tools/kvm/virtio/9p.c b/tools/kvm/virtio/9p.c
new file mode 100644
index 0000000..9073a1e
--- /dev/null
+++ b/tools/kvm/virtio/9p.c
@@ -0,0 +1,1446 @@
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/threadpool.h"
+#include "kvm/irq.h"
+#include "kvm/virtio-9p.h"
+#include "kvm/guest_compat.h"
+#include "kvm/builtin-setup.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/vfs.h>
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_9p.h>
+#include <linux/uidgid.h>
+#include <net/9p/9p.h>
+
+static LIST_HEAD(devs);
+static int compat_id = -1;
+
+static int insert_new_fid(struct p9_dev *dev, struct p9_fid *fid);
+/*
+ * Look 'fid' up in the per-device red-black tree; if absent, allocate
+ * a fresh p9_fid rooted at the export directory and insert it.
+ * Returns NULL only on allocation failure.
+ * NOTE(review): insert_new_fid()'s return value is ignored; -EEXIST
+ * cannot happen here because the lookup above just failed.
+ */
+static struct p9_fid *find_or_create_fid(struct p9_dev *dev, u32 fid)
+{
+	struct rb_node *node = dev->fids.rb_node;
+	struct p9_fid *pfid = NULL;
+
+	while (node) {
+		struct p9_fid *cur = rb_entry(node, struct p9_fid, node);
+
+		if (fid < cur->fid) {
+			node = node->rb_left;
+		} else if (fid > cur->fid) {
+			node = node->rb_right;
+		} else {
+			return cur;
+		}
+	}
+
+	pfid = calloc(sizeof(*pfid), 1);
+	if (!pfid)
+		return NULL;
+
+	pfid->fid = fid;
+	/* path points at the part of abs_path after the export root */
+	strcpy(pfid->abs_path, dev->root_dir);
+	pfid->path = pfid->abs_path + strlen(dev->root_dir);
+
+	insert_new_fid(dev, pfid);
+
+	return pfid;
+}
+
+/*
+ * Standard rb-tree insertion keyed on fid->fid.  Returns 0 on
+ * success, -EEXIST if the fid number is already present.
+ */
+static int insert_new_fid(struct p9_dev *dev, struct p9_fid *fid)
+{
+	struct rb_node **node = &(dev->fids.rb_node), *parent = NULL;
+
+	while (*node) {
+		int result = fid->fid - rb_entry(*node, struct p9_fid, node)->fid;
+
+		parent = *node;
+		if (result < 0)
+			node = &((*node)->rb_left);
+		else if (result > 0)
+			node = &((*node)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&fid->node, parent, node);
+	rb_insert_color(&fid->node, &dev->fids);
+	return 0;
+}
+
+/*
+ * Convenience alias for find_or_create_fid().
+ * NOTE(review): callers below dereference the result without checking
+ * for NULL (allocation failure) -- confirm that's acceptable.
+ */
+static struct p9_fid *get_fid(struct p9_dev *p9dev, int fid)
+{
+	struct p9_fid *new;
+
+	new = find_or_create_fid(p9dev, fid);
+
+	return new;
+}
+
+/*
+ * Build "<root_dir>/<path>" into the caller-supplied 'abs_path'
+ * buffer and return it.
+ * Warning: Immediately use value returned from this function -- the
+ * buffer belongs to the caller and is typically reused.
+ */
+static const char *rel_to_abs(struct p9_dev *p9dev,
+			      const char *path, char *abs_path)
+{
+	sprintf(abs_path, "%s/%s", p9dev->root_dir, path);
+
+	return abs_path;
+}
+
+/*
+ * Derive a 9P qid from stat data: inode number as the path, mtime as
+ * the version, and the directory bit in the type field.
+ */
+static void stat2qid(struct stat *st, struct p9_qid *qid)
+{
+	*qid = (struct p9_qid) {
+		.path		= st->st_ino,
+		.version	= st->st_mtime,
+	};
+
+	if (S_ISDIR(st->st_mode))
+		qid->type	|= P9_QTDIR;
+}
+
+/*
+ * Release a fid: close its fd and/or DIR handle, unlink it from the
+ * rb-tree and free it.
+ * NOTE(review): "fd > 0" skips fd 0, which is a valid descriptor --
+ * presumably never handed out here, but "fd >= 0" would be safer.
+ */
+static void close_fid(struct p9_dev *p9dev, u32 fid)
+{
+	struct p9_fid *pfid = get_fid(p9dev, fid);
+
+	if (pfid->fd > 0)
+		close(pfid->fd);
+
+	if (pfid->dir)
+		closedir(pfid->dir);
+
+	rb_erase(&pfid->node, &p9dev->fids);
+	free(pfid);
+}
+
+/*
+ * Overwrite the reply header at offset 0: total 'size', the reply
+ * message type (request type + 1, per the 9P numbering convention)
+ * and the request's tag, which is read back out of the request PDU.
+ */
+static void virtio_p9_set_reply_header(struct p9_pdu *pdu, u32 size)
+{
+	u8 cmd;
+	u16 tag;
+
+	pdu->read_offset = sizeof(u32);
+	virtio_p9_pdu_readf(pdu, "bw", &cmd, &tag);
+	pdu->write_offset = 0;
+	/* cmd + 1 is the reply message */
+	virtio_p9_pdu_writef(pdu, "dbw", size, cmd + 1, tag);
+}
+
+/*
+ * Trim an iovec array so it describes exactly 'count' bytes: shrinks
+ * the final partially-used segment's iov_len in place and returns the
+ * number of segments actually needed.
+ */
+static u16 virtio_p9_update_iov_cnt(struct iovec iov[], u32 count, int iov_cnt)
+{
+	int i;
+	u32 total = 0;
+	for (i = 0; (i < iov_cnt) && (total < count); i++) {
+		if (total + iov[i].iov_len > count) {
+			/* we don't need this iov fully */
+			iov[i].iov_len -= ((total + iov[i].iov_len) - count);
+			i++;
+			break;
+		}
+		total += iov[i].iov_len;
+	}
+	return i;
+}
+
+/*
+ * Turn the PDU into an Rlerror reply carrying 'err' (an errno value),
+ * reusing the tag from the original request.  Sets *outlen to the
+ * reply length.
+ */
+static void virtio_p9_error_reply(struct p9_dev *p9dev,
+				  struct p9_pdu *pdu, int err, u32 *outlen)
+{
+	u16 tag;
+
+	pdu->write_offset = VIRTIO_9P_HDR_LEN;
+	virtio_p9_pdu_writef(pdu, "d", err);
+	*outlen = pdu->write_offset;
+
+	/* read the tag from input */
+	pdu->read_offset = sizeof(u32) + sizeof(u8);
+	virtio_p9_pdu_readf(pdu, "w", &tag);
+
+	/* Update the header */
+	pdu->write_offset = 0;
+	virtio_p9_pdu_writef(pdu, "dbw", *outlen, P9_RLERROR, tag);
+}
+
+/*
+ * Tversion handler: echo the client's msize; accept only 9P2000.L,
+ * answering "unknown" for any other protocol version.
+ */
+static void virtio_p9_version(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 msize;
+	char *version;
+	virtio_p9_pdu_readf(pdu, "ds", &msize, &version);
+	/*
+	 * reply with the same msize the client sent us
+	 * Error out if the request is not for 9P2000.L
+	 */
+	if (!strcmp(version, VIRTIO_9P_VERSION_DOTL))
+		virtio_p9_pdu_writef(pdu, "ds", msize, version);
+	else
+		virtio_p9_pdu_writef(pdu, "ds", msize, "unknown");
+
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(version);
+	return;
+}
+
+/* Tclunk handler: drop the fid and send an empty Rclunk. */
+static void virtio_p9_clunk(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid;
+
+	virtio_p9_pdu_readf(pdu, "d", &fid);
+	close_fid(p9dev, fid);
+
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+}
+
+/*
+ * FIXME!! Need to map to protocol independent value. Upstream
+ * 9p also have the same BUG
+ */
+/*
+ * Sanitize guest-supplied open(2) flags: strip flags the host should
+ * not honor directly and force O_NOFOLLOW so the guest cannot escape
+ * the export via host-side symlinks.
+ */
+static int virtio_p9_openflags(int flags)
+{
+	flags &= ~(O_NOCTTY | O_ASYNC | O_CREAT | O_DIRECT);
+	flags |= O_NOFOLLOW;
+	return flags;
+}
+
+/*
+ * True if the fid's path is a directory on the host.
+ * NOTE(review): stat()'s return value is ignored; on failure st_mode
+ * is read uninitialized -- consider checking it.
+ */
+static bool is_dir(struct p9_fid *fid)
+{
+	struct stat st;
+
+	stat(fid->abs_path, &st);
+
+	return S_ISDIR(st.st_mode);
+}
+
+/*
+ * Topen handler: open the fid's path -- opendir() for directories,
+ * open() with sanitized flags otherwise -- and reply with the qid.
+ * The iounit in the reply is 0 (server default).
+ */
+static void virtio_p9_open(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid, flags;
+	struct stat st;
+	struct p9_qid qid;
+	struct p9_fid *new_fid;
+
+
+	virtio_p9_pdu_readf(pdu, "dd", &fid, &flags);
+	new_fid = get_fid(p9dev, fid);
+
+	if (lstat(new_fid->abs_path, &st) < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+
+	if (is_dir(new_fid)) {
+		new_fid->dir = opendir(new_fid->abs_path);
+		if (!new_fid->dir)
+			goto err_out;
+	} else {
+		new_fid->fd = open(new_fid->abs_path,
+				   virtio_p9_openflags(flags));
+		if (new_fid->fd < 0)
+			goto err_out;
+	}
+	/* FIXME!! need ot send proper iounit */
+	virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+/*
+ * Tlcreate handler: create and open a file under the directory fid,
+ * apply the requested mode, record the new relative path on the fid
+ * and reply with the new file's qid (iounit 0).  On any failure an
+ * Rlerror with errno is returned instead.
+ */
+static void virtio_p9_create(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	int fd, ret;
+	char *name;
+	struct stat st;
+	struct p9_qid qid;
+	struct p9_fid *dfid;
+	char full_path[PATH_MAX];
+	char new_path[PATH_MAX];
+	u32 dfid_val, flags, mode, gid;
+
+	virtio_p9_pdu_readf(pdu, "dsddd", &dfid_val,
+			    &name, &flags, &mode, &gid);
+	dfid = get_fid(p9dev, dfid_val);
+
+	flags = virtio_p9_openflags(flags);
+
+	sprintf(full_path, "%s/%s", dfid->abs_path, name);
+	fd = open(full_path, flags | O_CREAT, mode);
+	if (fd < 0)
+		goto err_out;
+	dfid->fd = fd;
+
+	if (lstat(full_path, &st) < 0)
+		goto err_out;
+
+	/* open() honors umask; chmod to the exact requested permissions */
+	ret = chmod(full_path, mode & 0777);
+	if (ret < 0)
+		goto err_out;
+
+	/*
+	 * The old sprintf(dfid->path, "%s/%s", dfid->path, name) passed
+	 * overlapping source and destination buffers, which is undefined
+	 * behaviour.  Build the updated relative path in a scratch
+	 * buffer first, then copy it back.
+	 */
+	snprintf(new_path, sizeof(new_path), "%s/%s", dfid->path, name);
+	strcpy(dfid->path, new_path);
+	stat2qid(&st, &qid);
+	virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(name);
+	return;
+err_out:
+	free(name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+/*
+ * Tmkdir handler: create a directory under the directory fid, chmod
+ * it to the exact requested mode (mkdir honors umask) and reply with
+ * its qid.  Unlike create, the fid's recorded path is not changed.
+ */
+static void virtio_p9_mkdir(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	char *name;
+	struct stat st;
+	struct p9_qid qid;
+	struct p9_fid *dfid;
+	char full_path[PATH_MAX];
+	u32 dfid_val, mode, gid;
+
+	virtio_p9_pdu_readf(pdu, "dsdd", &dfid_val,
+			    &name, &mode, &gid);
+	dfid = get_fid(p9dev, dfid_val);
+
+	sprintf(full_path, "%s/%s", dfid->abs_path, name);
+	ret = mkdir(full_path, mode);
+	if (ret < 0)
+		goto err_out;
+
+	if (lstat(full_path, &st) < 0)
+		goto err_out;
+
+	ret = chmod(full_path, mode & 0777);
+	if (ret < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+	virtio_p9_pdu_writef(pdu, "Qd", &qid, 0);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(name);
+	return;
+err_out:
+	free(name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+/*
+ * Twalk handler: clone the source fid into newfid, then walk up to
+ * nwname path components, emitting a qid per component successfully
+ * walked.  With nwname == 0 it is a pure fid clone.  The qid count is
+ * patched into the reply after the loop, since it isn't known up
+ * front.
+ * NOTE(review): on a mid-walk lstat failure this jumps to err_out,
+ * discarding the qids already written -- the 9P spec allows a partial
+ * Rwalk instead; confirm the stricter behavior is intended.
+ */
+static void virtio_p9_walk(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	u8 i;
+	u16 nwqid;
+	u16 nwname;
+	struct p9_qid wqid;
+	struct p9_fid *new_fid, *old_fid;
+	u32 fid_val, newfid_val;
+
+
+	virtio_p9_pdu_readf(pdu, "ddw", &fid_val, &newfid_val, &nwname);
+	new_fid	= get_fid(p9dev, newfid_val);
+
+	nwqid = 0;
+	if (nwname) {
+		struct p9_fid *fid = get_fid(p9dev, fid_val);
+
+		strcpy(new_fid->path, fid->path);
+		/* skip the space for count */
+		pdu->write_offset += sizeof(u16);
+		for (i = 0; i < nwname; i++) {
+			struct stat st;
+			char tmp[PATH_MAX] = {0};
+			char full_path[PATH_MAX];
+			char *str;
+
+			virtio_p9_pdu_readf(pdu, "s", &str);
+
+			/* Format the new path we're 'walk'ing into */
+			sprintf(tmp, "%s/%s", new_fid->path, str);
+
+			free(str);
+
+			if (lstat(rel_to_abs(p9dev, tmp, full_path), &st) < 0)
+				goto err_out;
+
+			stat2qid(&st, &wqid);
+			strcpy(new_fid->path, tmp);
+			new_fid->uid = fid->uid;
+			nwqid++;
+			virtio_p9_pdu_writef(pdu, "Q", &wqid);
+		}
+	} else {
+		/*
+		 * update write_offset so our outlen get correct value
+		 */
+		pdu->write_offset += sizeof(u16);
+		old_fid = get_fid(p9dev, fid_val);
+		strcpy(new_fid->path, old_fid->path);
+		new_fid->uid    = old_fid->uid;
+	}
+	*outlen = pdu->write_offset;
+	pdu->write_offset = VIRTIO_9P_HDR_LEN;
+	virtio_p9_pdu_writef(pdu, "d", nwqid);
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+/*
+ * Tattach handler: bind the fid to the export root ("/"), remember
+ * the attaching uid, and reply with the root's qid.  uname/aname are
+ * read (to advance the PDU cursor) but otherwise unused.
+ */
+static void virtio_p9_attach(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	char *uname;
+	char *aname;
+	struct stat st;
+	struct p9_qid qid;
+	struct p9_fid *fid;
+	u32 fid_val, afid, uid;
+
+	virtio_p9_pdu_readf(pdu, "ddssd", &fid_val, &afid,
+			    &uname, &aname, &uid);
+
+	free(uname);
+	free(aname);
+
+	if (lstat(p9dev->root_dir, &st) < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+
+	fid = get_fid(p9dev, fid_val);
+	fid->uid = uid;
+	strcpy(fid->path, "/");
+
+	virtio_p9_pdu_writef(pdu, "Q", &qid);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+/*
+ * Translate a host struct stat into the 9P2000.L p9_stat_dotl reply
+ * structure.  Only the P9_STATS_BASIC field set is populated; the
+ * btime/gen/data_version extras remain zero.
+ */
+static void virtio_p9_fill_stat(struct p9_dev *p9dev,
+				struct stat *st, struct p9_stat_dotl *statl)
+{
+	memset(statl, 0, sizeof(*statl));
+	statl->st_mode		= st->st_mode;
+	statl->st_nlink		= st->st_nlink;
+	statl->st_uid		= KUIDT_INIT(st->st_uid);
+	statl->st_gid		= KGIDT_INIT(st->st_gid);
+	statl->st_rdev		= st->st_rdev;
+	statl->st_size		= st->st_size;
+	statl->st_blksize	= st->st_blksize;
+	statl->st_blocks	= st->st_blocks;
+	statl->st_atime_sec	= st->st_atime;
+	statl->st_atime_nsec	= st->st_atim.tv_nsec;
+	statl->st_mtime_sec	= st->st_mtime;
+	statl->st_mtime_nsec	= st->st_mtim.tv_nsec;
+	statl->st_ctime_sec	= st->st_ctime;
+	statl->st_ctime_nsec	= st->st_ctim.tv_nsec;
+	/* Currently we only support BASIC fields in stat */
+	statl->st_result_mask	= P9_STATS_BASIC;
+	stat2qid(st, &statl->qid);
+}
+
+/*
+ * Tread handler: read up to 'count' bytes from the fid's fd directly
+ * into the guest's reply iovec (zero-copy), then write the actual
+ * count into the reply header.  iov[0] is temporarily advanced past
+ * the reply header and restored afterwards so header formatting still
+ * works.
+ */
+static void virtio_p9_read(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	u64 offset;
+	u32 fid_val;
+	u16 iov_cnt;
+	void *iov_base;
+	size_t iov_len;
+	ssize_t res;
+	u32 count, rcount;
+	struct p9_fid *fid;
+
+
+	rcount = 0;
+	virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+	fid = get_fid(p9dev, fid_val);
+
+	iov_base = pdu->in_iov[0].iov_base;
+	iov_len  = pdu->in_iov[0].iov_len;
+	iov_cnt  = pdu->in_iov_cnt;
+	pdu->in_iov[0].iov_base += VIRTIO_9P_HDR_LEN + sizeof(u32);
+	pdu->in_iov[0].iov_len  -=  VIRTIO_9P_HDR_LEN + sizeof(u32);
+	pdu->in_iov_cnt = virtio_p9_update_iov_cnt(pdu->in_iov,
+						   count,
+						   pdu->in_iov_cnt);
+	res = preadv(fid->fd, pdu->in_iov,
+		     pdu->in_iov_cnt, offset);
+	/*
+	 * Update the iov_base back, so that rest of
+	 * pdu_writef works correctly.
+	 */
+	pdu->in_iov[0].iov_base = iov_base;
+	pdu->in_iov[0].iov_len  = iov_len;
+	pdu->in_iov_cnt         = iov_cnt;
+
+	/*
+	 * The old code stored preadv()'s ssize_t result into a u32, so
+	 * an error (-1) became UINT_MAX, was clamped to 'count' below,
+	 * and the guest was told the read fully succeeded.  Report the
+	 * error instead.
+	 */
+	if (res < 0)
+		goto err_out;
+	rcount = (u32)res;
+	if (rcount > count)
+		rcount = count;
+
+	pdu->write_offset = VIRTIO_9P_HDR_LEN;
+	virtio_p9_pdu_writef(pdu, "d", rcount);
+	*outlen = pdu->write_offset + rcount;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+/* Wire size of one Rreaddir entry for this dirent. */
+static int virtio_p9_dentry_size(struct dirent *dent)
+{
+	/*
+	 * Size of each dirent:
+	 * qid(13) + offset(8) + type(1) + name_len(2) + name
+	 */
+	return 24 + strlen(dent->d_name);
+}
+
+/*
+ * Treaddir handler: seek the fid's DIR stream to 'offset' and emit
+ * directory entries until 'count' bytes would be exceeded; the stream
+ * is rewound to the last fully-written entry so the next request
+ * resumes correctly.  The byte count is patched into the reply after
+ * the loop.
+ * NOTE(review): the qid is computed by lstat'ing d_name relative to
+ * the export root, not relative to the directory being read --
+ * correct only for the root directory; verify.
+ */
+static void virtio_p9_readdir(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid_val;
+	u32 count, rcount;
+	struct stat st;
+	struct p9_fid *fid;
+	struct dirent *dent;
+	char full_path[PATH_MAX];
+	u64 offset, old_offset;
+
+	rcount = 0;
+	virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+	fid = get_fid(p9dev, fid_val);
+
+	if (!is_dir(fid)) {
+		errno = EINVAL;
+		goto err_out;
+	}
+
+	/* Move the offset specified */
+	seekdir(fid->dir, offset);
+
+	old_offset = offset;
+	/* If reading a dir, fill the buffer with p9_stat entries */
+	dent = readdir(fid->dir);
+
+	/* Skip the space for writing count */
+	pdu->write_offset += sizeof(u32);
+	while (dent) {
+		u32 read;
+		struct p9_qid qid;
+
+		if ((rcount + virtio_p9_dentry_size(dent)) > count) {
+			/* seek to the previous offset and return */
+			seekdir(fid->dir, old_offset);
+			break;
+		}
+		old_offset = dent->d_off;
+		lstat(rel_to_abs(p9dev, dent->d_name, full_path), &st);
+		stat2qid(&st, &qid);
+		read = pdu->write_offset;
+		virtio_p9_pdu_writef(pdu, "Qqbs", &qid, dent->d_off,
+				     dent->d_type, dent->d_name);
+		rcount += pdu->write_offset - read;
+		dent = readdir(fid->dir);
+	}
+
+	pdu->write_offset = VIRTIO_9P_HDR_LEN;
+	virtio_p9_pdu_writef(pdu, "d", rcount);
+	*outlen = pdu->write_offset + rcount;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+
+/*
+ * Tgetattr handler: lstat the fid's path and reply with an 'A'
+ * (p9_stat_dotl) record.  The request_mask is read but ignored; only
+ * P9_STATS_BASIC fields are ever returned (see virtio_p9_fill_stat).
+ */
+static void virtio_p9_getattr(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid_val;
+	struct stat st;
+	u64 request_mask;
+	struct p9_fid *fid;
+	struct p9_stat_dotl statl;
+
+	virtio_p9_pdu_readf(pdu, "dq", &fid_val, &request_mask);
+	fid = get_fid(p9dev, fid_val);
+	if (lstat(fid->abs_path, &st) < 0)
+		goto err_out;
+
+	virtio_p9_fill_stat(p9dev, &st, &statl);
+	virtio_p9_pdu_writef(pdu, "A", &statl);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+/* FIXME!! from linux/fs.h */
+/*
+ * Attribute flags. These should be or-ed together to figure out what
+ * has been changed!
+ */
+#define ATTR_MODE (1 << 0)
+#define ATTR_UID (1 << 1)
+#define ATTR_GID (1 << 2)
+#define ATTR_SIZE (1 << 3)
+#define ATTR_ATIME (1 << 4)
+#define ATTR_MTIME (1 << 5)
+#define ATTR_CTIME (1 << 6)
+#define ATTR_ATIME_SET (1 << 7)
+#define ATTR_MTIME_SET (1 << 8)
+#define ATTR_FORCE (1 << 9) /* Not a change, but a change it */
+#define ATTR_ATTR_FLAG (1 << 10)
+#define ATTR_KILL_SUID (1 << 11)
+#define ATTR_KILL_SGID (1 << 12)
+#define ATTR_FILE (1 << 13)
+#define ATTR_KILL_PRIV (1 << 14)
+#define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */
+#define ATTR_TIMES_SET (1 << 16)
+
+#define ATTR_MASK 127
+
+/*
+ * Tsetattr handler: apply the requested attribute changes in order --
+ * mode (chmod), atime/mtime (utimensat with UTIME_NOW/UTIME_OMIT for
+ * unset fields), ownership (lchown, -1 for unchanged ids; also used
+ * to bump ctime when only ATTR_CTIME is requested), and size
+ * (truncate).  The reply is an empty Rsetattr.
+ */
+static void virtio_p9_setattr(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret = 0;
+	u32 fid_val;
+	struct p9_fid *fid;
+	struct p9_iattr_dotl p9attr;
+
+	virtio_p9_pdu_readf(pdu, "dI", &fid_val, &p9attr);
+	fid = get_fid(p9dev, fid_val);
+
+	if (p9attr.valid & ATTR_MODE) {
+		ret = chmod(fid->abs_path, p9attr.mode);
+		if (ret < 0)
+			goto err_out;
+	}
+	if (p9attr.valid & (ATTR_ATIME | ATTR_MTIME)) {
+		struct timespec times[2];
+		if (p9attr.valid & ATTR_ATIME) {
+			if (p9attr.valid & ATTR_ATIME_SET) {
+				times[0].tv_sec = p9attr.atime_sec;
+				times[0].tv_nsec = p9attr.atime_nsec;
+			} else {
+				/* tv_sec is ignored when tv_nsec is UTIME_NOW */
+				times[0].tv_nsec = UTIME_NOW;
+			}
+		} else {
+			times[0].tv_nsec = UTIME_OMIT;
+		}
+		if (p9attr.valid & ATTR_MTIME) {
+			if (p9attr.valid & ATTR_MTIME_SET) {
+				times[1].tv_sec = p9attr.mtime_sec;
+				times[1].tv_nsec = p9attr.mtime_nsec;
+			} else {
+				times[1].tv_nsec = UTIME_NOW;
+			}
+		} else
+			times[1].tv_nsec = UTIME_OMIT;
+
+		ret = utimensat(-1, fid->abs_path, times, AT_SYMLINK_NOFOLLOW);
+		if (ret < 0)
+			goto err_out;
+	}
+	/*
+	 * If the only valid entry in iattr is ctime we can call
+	 * chown(-1,-1) to update the ctime of the file
+	 */
+	if ((p9attr.valid & (ATTR_UID | ATTR_GID)) ||
+	    ((p9attr.valid & ATTR_CTIME)
+	     && !((p9attr.valid & ATTR_MASK) & ~ATTR_CTIME))) {
+		if (!(p9attr.valid & ATTR_UID))
+			p9attr.uid = KUIDT_INIT(-1);
+
+		if (!(p9attr.valid & ATTR_GID))
+			p9attr.gid = KGIDT_INIT(-1);
+
+		ret = lchown(fid->abs_path, __kuid_val(p9attr.uid),
+			     __kgid_val(p9attr.gid));
+		if (ret < 0)
+			goto err_out;
+	}
+	if (p9attr.valid & (ATTR_SIZE)) {
+		ret = truncate(fid->abs_path, p9attr.size);
+		if (ret < 0)
+			goto err_out;
+	}
+	*outlen = VIRTIO_9P_HDR_LEN;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+/*
+ * TWRITE handler: write guest-supplied data to the file behind 'fid' at
+ * the requested offset.  The payload is taken straight from the request
+ * iovecs (zero-copy): the first iovec is temporarily advanced past the
+ * 9P header and Twrite metadata, pwritev() is issued, and the iovec is
+ * restored afterwards so later pdu_readf() calls still work.
+ */
+static void virtio_p9_write(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+
+	u64 offset;
+	u32 fid_val;
+	u32 count;
+	ssize_t res;
+	u16 iov_cnt;
+	void *iov_base;
+	size_t iov_len;
+	struct p9_fid *fid;
+	/* u32 fid + u64 offset + u32 count */
+	int twrite_size = sizeof(u32) + sizeof(u64) + sizeof(u32);
+
+	virtio_p9_pdu_readf(pdu, "dqd", &fid_val, &offset, &count);
+	fid = get_fid(p9dev, fid_val);
+
+	/* Save the original first iovec so it can be restored below. */
+	iov_base = pdu->out_iov[0].iov_base;
+	iov_len = pdu->out_iov[0].iov_len;
+	iov_cnt = pdu->out_iov_cnt;
+
+	/* Adjust the iovec to skip the header and meta data */
+	pdu->out_iov[0].iov_base += (sizeof(struct p9_msg) + twrite_size);
+	pdu->out_iov[0].iov_len -= (sizeof(struct p9_msg) + twrite_size);
+	pdu->out_iov_cnt = virtio_p9_update_iov_cnt(pdu->out_iov, count,
+						    pdu->out_iov_cnt);
+	res = pwritev(fid->fd, pdu->out_iov, pdu->out_iov_cnt, offset);
+	/*
+	 * Update the iov_base back, so that rest of
+	 * pdu_readf works correctly.
+	 */
+	pdu->out_iov[0].iov_base = iov_base;
+	pdu->out_iov[0].iov_len = iov_len;
+	pdu->out_iov_cnt = iov_cnt;
+
+	if (res < 0)
+		goto err_out;
+	/* RWRITE carries the number of bytes actually written. */
+	virtio_p9_pdu_writef(pdu, "d", res);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_remove(struct p9_dev *p9dev,
+ struct p9_pdu *pdu, u32 *outlen)
+{
+ int ret;
+ u32 fid_val;
+ struct p9_fid *fid;
+
+ virtio_p9_pdu_readf(pdu, "d", &fid_val);
+ fid = get_fid(p9dev, fid_val);
+
+ ret = remove(fid->abs_path);
+ if (ret < 0)
+ goto err_out;
+ *outlen = pdu->write_offset;
+ virtio_p9_set_reply_header(pdu, *outlen);
+ return;
+
+err_out:
+ virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+ return;
+}
+
+/*
+ * TRENAME handler: rename the file behind 'fid' into directory
+ * 'new_fid' under the name 'new_name'.
+ *
+ * NOTE(review): 'new_name' is heap-allocated by pdu_readf("s") but is
+ * not freed on either path here — looks like a small per-request leak;
+ * confirm against the pdu_readf allocation contract.
+ */
+static void virtio_p9_rename(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	u32 fid_val, new_fid_val;
+	struct p9_fid *fid, *new_fid;
+	char full_path[PATH_MAX], *new_name;
+
+	virtio_p9_pdu_readf(pdu, "dds", &fid_val, &new_fid_val, &new_name);
+	fid = get_fid(p9dev, fid_val);
+	new_fid = get_fid(p9dev, new_fid_val);
+
+	sprintf(full_path, "%s/%s", new_fid->abs_path, new_name);
+	ret = rename(fid->abs_path, full_path);
+	if (ret < 0)
+		goto err_out;
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+/*
+ * TREADLINK handler: return the target of the symlink behind 'fid'.
+ * The buffer is zeroed and readlink() is bounded to PATH_MAX - 1, so
+ * the result is always NUL-terminated before being written as a "s".
+ */
+static void virtio_p9_readlink(struct p9_dev *p9dev,
+			       struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	u32 fid_val;
+	struct p9_fid *fid;
+	char target_path[PATH_MAX];
+
+	virtio_p9_pdu_readf(pdu, "d", &fid_val);
+	fid = get_fid(p9dev, fid_val);
+
+	memset(target_path, 0, PATH_MAX);
+	ret = readlink(fid->abs_path, target_path, PATH_MAX - 1);
+	if (ret < 0)
+		goto err_out;
+
+	virtio_p9_pdu_writef(pdu, "s", target_path);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+/*
+ * TSTATFS handler: report host filesystem statistics for the mount
+ * behind 'fid'.  The two 32-bit halves of f_fsid are folded into one
+ * u64 before being marshalled into the RSTATFS reply.
+ */
+static void virtio_p9_statfs(struct p9_dev *p9dev,
+			     struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	u64 fsid;
+	u32 fid_val;
+	struct p9_fid *fid;
+	struct statfs stat_buf;
+
+	virtio_p9_pdu_readf(pdu, "d", &fid_val);
+	fid = get_fid(p9dev, fid_val);
+
+	ret = statfs(fid->abs_path, &stat_buf);
+	if (ret < 0)
+		goto err_out;
+	/* FIXME!! f_blocks needs update based on client msize */
+	fsid = (unsigned int) stat_buf.f_fsid.__val[0] |
+		(unsigned long long)stat_buf.f_fsid.__val[1] << 32;
+	virtio_p9_pdu_writef(pdu, "ddqqqqqqd", stat_buf.f_type,
+			     stat_buf.f_bsize, stat_buf.f_blocks,
+			     stat_buf.f_bfree, stat_buf.f_bavail,
+			     stat_buf.f_files, stat_buf.f_ffree,
+			     fsid, stat_buf.f_namelen);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+/*
+ * TMKNOD handler: create a device/special file 'name' inside the
+ * directory behind 'dfid' and reply with its qid.  The extra chmod()
+ * re-applies the permission bits in case the process umask masked
+ * some of them off during mknod().
+ */
+static void virtio_p9_mknod(struct p9_dev *p9dev,
+			    struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	char *name;
+	struct stat st;
+	struct p9_fid *dfid;
+	struct p9_qid qid;
+	char full_path[PATH_MAX];
+	u32 fid_val, mode, major, minor, gid;
+
+	virtio_p9_pdu_readf(pdu, "dsdddd", &fid_val, &name, &mode,
+			    &major, &minor, &gid);
+
+	dfid = get_fid(p9dev, fid_val);
+	sprintf(full_path, "%s/%s", dfid->abs_path, name);
+	ret = mknod(full_path, mode, makedev(major, minor));
+	if (ret < 0)
+		goto err_out;
+
+	if (lstat(full_path, &st) < 0)
+		goto err_out;
+
+	ret = chmod(full_path, mode & 0777);
+	if (ret < 0)
+		goto err_out;
+
+	stat2qid(&st, &qid);
+	virtio_p9_pdu_writef(pdu, "Q", &qid);
+	free(name);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	free(name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_fsync(struct p9_dev *p9dev,
+ struct p9_pdu *pdu, u32 *outlen)
+{
+ int ret;
+ struct p9_fid *fid;
+ u32 fid_val, datasync;
+
+ virtio_p9_pdu_readf(pdu, "dd", &fid_val, &datasync);
+ fid = get_fid(p9dev, fid_val);
+
+ if (datasync)
+ ret = fdatasync(fid->fd);
+ else
+ ret = fsync(fid->fd);
+ if (ret < 0)
+ goto err_out;
+ *outlen = pdu->write_offset;
+ virtio_p9_set_reply_header(pdu, *outlen);
+ return;
+err_out:
+ virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+ return;
+}
+
+static void virtio_p9_symlink(struct p9_dev *p9dev,
+ struct p9_pdu *pdu, u32 *outlen)
+{
+ int ret;
+ struct stat st;
+ u32 fid_val, gid;
+ struct p9_qid qid;
+ struct p9_fid *dfid;
+ char new_name[PATH_MAX];
+ char *old_path, *name;
+
+ virtio_p9_pdu_readf(pdu, "dssd", &fid_val, &name, &old_path, &gid);
+
+ dfid = get_fid(p9dev, fid_val);
+ sprintf(new_name, "%s/%s", dfid->abs_path, name);
+ ret = symlink(old_path, new_name);
+ if (ret < 0)
+ goto err_out;
+
+ if (lstat(new_name, &st) < 0)
+ goto err_out;
+
+ stat2qid(&st, &qid);
+ virtio_p9_pdu_writef(pdu, "Q", &qid);
+ free(name);
+ free(old_path);
+ *outlen = pdu->write_offset;
+ virtio_p9_set_reply_header(pdu, *outlen);
+ return;
+err_out:
+ free(name);
+ free(old_path);
+ virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+ return;
+}
+
+static void virtio_p9_link(struct p9_dev *p9dev,
+ struct p9_pdu *pdu, u32 *outlen)
+{
+ int ret;
+ char *name;
+ u32 fid_val, dfid_val;
+ struct p9_fid *dfid, *fid;
+ char full_path[PATH_MAX];
+
+ virtio_p9_pdu_readf(pdu, "dds", &dfid_val, &fid_val, &name);
+
+ dfid = get_fid(p9dev, dfid_val);
+ fid = get_fid(p9dev, fid_val);
+ sprintf(full_path, "%s/%s", dfid->abs_path, name);
+ ret = link(fid->abs_path, full_path);
+ if (ret < 0)
+ goto err_out;
+ free(name);
+ *outlen = pdu->write_offset;
+ virtio_p9_set_reply_header(pdu, *outlen);
+ return;
+err_out:
+ free(name);
+ virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+ return;
+
+}
+
+/*
+ * TLOCK handler: byte-range locking is not actually implemented; the
+ * request is parsed (to consume the PDU and free client_id) and the
+ * guest is unconditionally told the lock was granted.
+ */
+static void virtio_p9_lock(struct p9_dev *p9dev,
+			   struct p9_pdu *pdu, u32 *outlen)
+{
+	u8 ret;
+	u32 fid_val;
+	struct p9_flock flock;
+
+	virtio_p9_pdu_readf(pdu, "dbdqqds", &fid_val, &flock.type,
+			    &flock.flags, &flock.start, &flock.length,
+			    &flock.proc_id, &flock.client_id);
+
+	/* Just return success */
+	ret = P9_LOCK_SUCCESS;
+	virtio_p9_pdu_writef(pdu, "d", ret);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(flock.client_id);
+	return;
+}
+
+/*
+ * TGETLOCK handler: companion stub to virtio_p9_lock().  Always
+ * reports F_UNLCK (no conflicting lock), echoing back the queried
+ * range and owner so the reply is well-formed.
+ */
+static void virtio_p9_getlock(struct p9_dev *p9dev,
+			      struct p9_pdu *pdu, u32 *outlen)
+{
+	u32 fid_val;
+	struct p9_getlock glock;
+	virtio_p9_pdu_readf(pdu, "dbqqds", &fid_val, &glock.type,
+			    &glock.start, &glock.length, &glock.proc_id,
+			    &glock.client_id);
+
+	/* Just return success */
+	glock.type = F_UNLCK;
+	virtio_p9_pdu_writef(pdu, "bqqds", glock.type,
+			     glock.start, glock.length, glock.proc_id,
+			     glock.client_id);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	free(glock.client_id);
+	return;
+}
+
+/*
+ * Return 1 if 'ancestor' names 'path' itself or a directory component
+ * on the way to it, 0 otherwise.  A plain prefix match is not enough:
+ * "/a/bc" must not count "/a/b" as an ancestor, hence the check that
+ * the prefix ends exactly at a NUL or a '/'.
+ */
+static int virtio_p9_ancestor(char *path, char *ancestor)
+{
+	int size = strlen(ancestor);
+	if (!strncmp(path, ancestor, size)) {
+		/*
+		 * Now check whether ancestor is a full name or
+		 * or directory component and not just part
+		 * of a name.
+		 */
+		if (path[size] == '\0' || path[size] == '/')
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Rewrite 'fid_path' in place after a rename: substitute the leading
+ * 'old_name' prefix with 'new_name', preserving whatever trails it.
+ * Caller guarantees (via virtio_p9_ancestor()) that old_name really is
+ * a prefix of fid_path on a component boundary.
+ */
+static void virtio_p9_fix_path(char *fid_path, char *old_name, char *new_name)
+{
+	char tmp_name[PATH_MAX];
+	size_t rp_sz = strlen(old_name);
+
+	if (rp_sz == strlen(fid_path)) {
+		/* replace the full name */
+		strcpy(fid_path, new_name);
+		return;
+	}
+	/* save the trailing path details */
+	strcpy(tmp_name, fid_path + rp_sz);
+	sprintf(fid_path, "%s%s", new_name, tmp_name);
+	return;
+}
+
+/*
+ * After a rename of 'old_name' to 'new_name', walk every live fid in
+ * the device's rb-tree and patch the stored path of any fid that lived
+ * at or below the renamed entry, so later operations resolve correctly.
+ */
+static void rename_fids(struct p9_dev *p9dev, char *old_name, char *new_name)
+{
+	struct rb_node *node = rb_first(&p9dev->fids);
+
+	while (node) {
+		struct p9_fid *fid = rb_entry(node, struct p9_fid, node);
+
+		if (fid->fid != P9_NOFID && virtio_p9_ancestor(fid->path, old_name)) {
+				virtio_p9_fix_path(fid->path, old_name, new_name);
+		}
+		node = rb_next(node);
+	}
+}
+
+/*
+ * TRENAMEAT handler: rename 'old_name' (relative to old_dfid) to
+ * 'new_name' (relative to new_dfid) on the host, then fix up any other
+ * fids whose cached paths contained the renamed entry.
+ */
+static void virtio_p9_renameat(struct p9_dev *p9dev,
+			       struct p9_pdu *pdu, u32 *outlen)
+{
+	int ret;
+	char *old_name, *new_name;
+	u32 old_dfid_val, new_dfid_val;
+	struct p9_fid *old_dfid, *new_dfid;
+	char old_full_path[PATH_MAX], new_full_path[PATH_MAX];
+
+
+	virtio_p9_pdu_readf(pdu, "dsds", &old_dfid_val, &old_name,
+			    &new_dfid_val, &new_name);
+
+	old_dfid = get_fid(p9dev, old_dfid_val);
+	new_dfid = get_fid(p9dev, new_dfid_val);
+
+	sprintf(old_full_path, "%s/%s", old_dfid->abs_path, old_name);
+	sprintf(new_full_path, "%s/%s", new_dfid->abs_path, new_name);
+	ret = rename(old_full_path, new_full_path);
+	if (ret < 0)
+		goto err_out;
+	/*
+	 * Now fix path in other fids, if the renamed path is part of
+	 * that.
+	 */
+	rename_fids(p9dev, old_name, new_name);
+	free(old_name);
+	free(new_name);
+	*outlen = pdu->write_offset;
+	virtio_p9_set_reply_header(pdu, *outlen);
+	return;
+err_out:
+	free(old_name);
+	free(new_name);
+	virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+	return;
+}
+
+static void virtio_p9_unlinkat(struct p9_dev *p9dev,
+ struct p9_pdu *pdu, u32 *outlen)
+{
+ int ret;
+ char *name;
+ u32 fid_val, flags;
+ struct p9_fid *fid;
+ char full_path[PATH_MAX];
+
+ virtio_p9_pdu_readf(pdu, "dsd", &fid_val, &name, &flags);
+ fid = get_fid(p9dev, fid_val);
+
+ sprintf(full_path, "%s/%s", fid->abs_path, name);
+ ret = remove(full_path);
+ if (ret < 0)
+ goto err_out;
+ free(name);
+ *outlen = pdu->write_offset;
+ virtio_p9_set_reply_header(pdu, *outlen);
+ return;
+err_out:
+ free(name);
+ virtio_p9_error_reply(p9dev, pdu, errno, outlen);
+ return;
+}
+
+static void virtio_p9_flush(struct p9_dev *p9dev,
+ struct p9_pdu *pdu, u32 *outlen)
+{
+ u16 tag, oldtag;
+
+ virtio_p9_pdu_readf(pdu, "ww", &tag, &oldtag);
+ virtio_p9_pdu_writef(pdu, "w", tag);
+ *outlen = pdu->write_offset;
+ virtio_p9_set_reply_header(pdu, *outlen);
+
+ return;
+}
+
+static void virtio_p9_eopnotsupp(struct p9_dev *p9dev,
+ struct p9_pdu *pdu, u32 *outlen)
+{
+ return virtio_p9_error_reply(p9dev, pdu, EOPNOTSUPP, outlen);
+}
+
+typedef void p9_handler(struct p9_dev *p9dev,
+ struct p9_pdu *pdu, u32 *outlen);
+
+/* FIXME should be removed when merging with latest linus tree */
+#define P9_TRENAMEAT 74
+#define P9_TUNLINKAT 76
+
+static p9_handler *virtio_9p_dotl_handler [] = {
+ [P9_TREADDIR] = virtio_p9_readdir,
+ [P9_TSTATFS] = virtio_p9_statfs,
+ [P9_TGETATTR] = virtio_p9_getattr,
+ [P9_TSETATTR] = virtio_p9_setattr,
+ [P9_TXATTRWALK] = virtio_p9_eopnotsupp,
+ [P9_TXATTRCREATE] = virtio_p9_eopnotsupp,
+ [P9_TMKNOD] = virtio_p9_mknod,
+ [P9_TLOCK] = virtio_p9_lock,
+ [P9_TGETLOCK] = virtio_p9_getlock,
+ [P9_TRENAMEAT] = virtio_p9_renameat,
+ [P9_TREADLINK] = virtio_p9_readlink,
+ [P9_TUNLINKAT] = virtio_p9_unlinkat,
+ [P9_TMKDIR] = virtio_p9_mkdir,
+ [P9_TVERSION] = virtio_p9_version,
+ [P9_TLOPEN] = virtio_p9_open,
+ [P9_TATTACH] = virtio_p9_attach,
+ [P9_TWALK] = virtio_p9_walk,
+ [P9_TCLUNK] = virtio_p9_clunk,
+ [P9_TFSYNC] = virtio_p9_fsync,
+ [P9_TREAD] = virtio_p9_read,
+ [P9_TFLUSH] = virtio_p9_flush,
+ [P9_TLINK] = virtio_p9_link,
+ [P9_TSYMLINK] = virtio_p9_symlink,
+ [P9_TLCREATE] = virtio_p9_create,
+ [P9_TWRITE] = virtio_p9_write,
+ [P9_TREMOVE] = virtio_p9_remove,
+ [P9_TRENAME] = virtio_p9_rename,
+};
+
+static struct p9_pdu *virtio_p9_pdu_init(struct kvm *kvm, struct virt_queue *vq)
+{
+ struct p9_pdu *pdu = calloc(1, sizeof(*pdu));
+ if (!pdu)
+ return NULL;
+
+ /* skip the pdu header p9_msg */
+ pdu->read_offset = VIRTIO_9P_HDR_LEN;
+ pdu->write_offset = VIRTIO_9P_HDR_LEN;
+ pdu->queue_head = virt_queue__get_inout_iov(kvm, vq, pdu->in_iov,
+ pdu->out_iov, &pdu->in_iov_cnt, &pdu->out_iov_cnt);
+ return pdu;
+}
+
+/* Peek the 9P command byte out of the first request buffer. */
+static u8 virtio_p9_get_cmd(struct p9_pdu *pdu)
+{
+	struct p9_msg *msg;
+	/*
+	 * we can peek directly into pdu for a u8
+	 * value. The host endianness won't be an issue
+	 */
+	msg = pdu->out_iov[0].iov_base;
+	return msg->cmd;
+}
+
+/*
+ * Process one queued 9P request: build a pdu from the next descriptor
+ * chain, dispatch on the command byte (unknown/unimplemented commands
+ * get EOPNOTSUPP), and push the reply length into the used ring.
+ */
+static bool virtio_p9_do_io_request(struct kvm *kvm, struct p9_dev_job *job)
+{
+	u8 cmd;
+	u32 len = 0;
+	p9_handler *handler;
+	struct p9_dev *p9dev;
+	struct virt_queue *vq;
+	struct p9_pdu *p9pdu;
+
+	vq = job->vq;
+	p9dev = job->p9dev;
+
+	p9pdu = virtio_p9_pdu_init(kvm, vq);
+	cmd = virtio_p9_get_cmd(p9pdu);
+
+	/* Out-of-range or unregistered commands fall back to EOPNOTSUPP. */
+	if ((cmd >= ARRAY_SIZE(virtio_9p_dotl_handler)) ||
+	    !virtio_9p_dotl_handler[cmd])
+		handler = virtio_p9_eopnotsupp;
+	else
+		handler = virtio_9p_dotl_handler[cmd];
+
+	handler(p9dev, p9pdu, &len);
+	virt_queue__set_used_elem(vq, p9pdu->queue_head, len);
+	free(p9pdu);
+	return true;
+}
+
+static void virtio_p9_do_io(struct kvm *kvm, void *param)
+{
+ struct p9_dev_job *job = (struct p9_dev_job *)param;
+ struct p9_dev *p9dev = job->p9dev;
+ struct virt_queue *vq = job->vq;
+
+ while (virt_queue__available(vq)) {
+ virtio_p9_do_io_request(kvm, job);
+ p9dev->vdev.ops->signal_vq(kvm, &p9dev->vdev, vq - p9dev->vqs);
+ }
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+ struct p9_dev *p9dev = dev;
+
+ return ((u8 *)(p9dev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+ return 1 << VIRTIO_9P_MOUNT_TAG;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+ struct p9_dev *p9dev = dev;
+ struct virtio_9p_config *conf = p9dev->config;
+
+ p9dev->features = features;
+ conf->tag_len = virtio_host_to_guest_u16(&p9dev->vdev, conf->tag_len);
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+ u32 pfn)
+{
+ struct p9_dev *p9dev = dev;
+ struct p9_dev_job *job;
+ struct virt_queue *queue;
+ void *p;
+
+ compat__remove_message(compat_id);
+
+ queue = &p9dev->vqs[vq];
+ queue->pfn = pfn;
+ p = virtio_get_vq(kvm, queue->pfn, page_size);
+ job = &p9dev->jobs[vq];
+
+ vring_init(&queue->vring, VIRTQUEUE_NUM, p, align);
+ virtio_init_device_vq(&p9dev->vdev, queue);
+
+ *job = (struct p9_dev_job) {
+ .vq = queue,
+ .p9dev = p9dev,
+ };
+ thread_pool__init_job(&job->job_id, kvm, virtio_p9_do_io, job);
+
+ return 0;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+ struct p9_dev *p9dev = dev;
+
+ thread_pool__do_job(&p9dev->jobs[vq].job_id);
+
+ return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+ struct p9_dev *p9dev = dev;
+
+ return p9dev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+ return VIRTQUEUE_NUM;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+ /* FIXME: dynamic */
+ return size;
+}
+
+struct virtio_ops p9_dev_virtio_ops = (struct virtio_ops) {
+ .get_config = get_config,
+ .get_host_features = get_host_features,
+ .set_guest_features = set_guest_features,
+ .init_vq = init_vq,
+ .notify_vq = notify_vq,
+ .get_pfn_vq = get_pfn_vq,
+ .get_size_vq = get_size_vq,
+ .set_size_vq = set_size_vq,
+};
+
+/*
+ * Option-parser callback for --virtio-9p: register a shared directory,
+ * optionally with a mount tag given as "dirname,tag".
+ *
+ * NOTE(review): 'arg' is declared const but the ',' separator is
+ * overwritten with NUL in place — works only because the caller's
+ * buffer is writable; confirm.
+ */
+int virtio_9p_rootdir_parser(const struct option *opt, const char *arg, int unset)
+{
+	char *tag_name;
+	char tmp[PATH_MAX];
+	struct kvm *kvm = opt->ptr;
+
+	/*
+	 * 9p dir can be of the form dirname,tag_name or
+	 * just dirname. In the latter case we use the
+	 * default tag name
+	 */
+	tag_name = strstr(arg, ",");
+	if (tag_name) {
+		*tag_name = '\0';
+		tag_name++;
+	}
+	if (realpath(arg, tmp)) {
+		if (virtio_9p__register(kvm, tmp, tag_name) < 0)
+			die("Unable to initialize virtio 9p");
+	} else
+		die("Failed resolving 9p path");
+	return 0;
+}
+
+int virtio_9p_img_name_parser(const struct option *opt, const char *arg, int unset)
+{
+ char path[PATH_MAX];
+ struct stat st;
+ struct kvm *kvm = opt->ptr;
+
+ if (stat(arg, &st) == 0 &&
+ S_ISDIR(st.st_mode)) {
+ char tmp[PATH_MAX];
+
+ if (kvm->cfg.using_rootfs)
+ die("Please use only one rootfs directory atmost");
+
+ if (realpath(arg, tmp) == 0 ||
+ virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+ die("Unable to initialize virtio 9p");
+ kvm->cfg.using_rootfs = 1;
+ return 0;
+ }
+
+ snprintf(path, PATH_MAX, "%s%s", kvm__get_dir(), arg);
+
+ if (stat(path, &st) == 0 &&
+ S_ISDIR(st.st_mode)) {
+ char tmp[PATH_MAX];
+
+ if (kvm->cfg.using_rootfs)
+ die("Please use only one rootfs directory atmost");
+
+ if (realpath(path, tmp) == 0 ||
+ virtio_9p__register(kvm, tmp, "/dev/root") < 0)
+ die("Unable to initialize virtio 9p");
+ if (virtio_9p__register(kvm, "/", "hostfs") < 0)
+ die("Unable to initialize virtio 9p");
+ kvm_setup_resolv(arg);
+ kvm->cfg.using_rootfs = kvm->cfg.custom_rootfs = 1;
+ kvm->cfg.custom_rootfs_name = arg;
+ return 0;
+ }
+
+ return -1;
+}
+
+int virtio_9p__init(struct kvm *kvm)
+{
+ struct p9_dev *p9dev;
+
+ list_for_each_entry(p9dev, &devs, list) {
+ virtio_init(kvm, p9dev, &p9dev->vdev, &p9_dev_virtio_ops,
+ VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_9P,
+ VIRTIO_ID_9P, PCI_CLASS_9P);
+ }
+
+ return 0;
+}
+virtio_dev_init(virtio_9p__init);
+
+/*
+ * Allocate and queue a 9p device exporting host directory 'root' under
+ * mount tag 'tag_name' (defaults to VIRTIO_9P_DEFAULT_TAG).  The device
+ * is only added to the 'devs' list here; virtio_9p__init() performs the
+ * actual virtio registration later.  Returns 0 or -errno.
+ */
+int virtio_9p__register(struct kvm *kvm, const char *root, const char *tag_name)
+{
+	struct p9_dev *p9dev;
+	int err = 0;
+
+	p9dev = calloc(1, sizeof(*p9dev));
+	if (!p9dev)
+		return -ENOMEM;
+
+	if (!tag_name)
+		tag_name = VIRTIO_9P_DEFAULT_TAG;
+
+	/* config is followed by the variable-length tag string. */
+	p9dev->config = calloc(1, sizeof(*p9dev->config) + strlen(tag_name) + 1);
+	if (p9dev->config == NULL) {
+		err = -ENOMEM;
+		goto free_p9dev;
+	}
+
+	strcpy(p9dev->root_dir, root);
+	p9dev->config->tag_len = strlen(tag_name);
+	if (p9dev->config->tag_len > MAX_TAG_LEN) {
+		err = -EINVAL;
+		goto free_p9dev_config;
+	}
+
+	memcpy(&p9dev->config->tag, tag_name, strlen(tag_name));
+
+	list_add(&p9dev->list, &devs);
+
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-9p", "CONFIG_NET_9P_VIRTIO");
+
+	return err;
+
+free_p9dev_config:
+	free(p9dev->config);
+free_p9dev:
+	free(p9dev);
+	return err;
+}
diff --git a/tools/kvm/virtio/balloon.c b/tools/kvm/virtio/balloon.c
new file mode 100644
index 0000000..84c4bb0
--- /dev/null
+++ b/tools/kvm/virtio/balloon.c
@@ -0,0 +1,279 @@
+#include "kvm/virtio-balloon.h"
+
+#include "kvm/virtio-pci-dev.h"
+
+#include "kvm/virtio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/guest_compat.h"
+#include "kvm/kvm-ipc.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_balloon.h>
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <sys/eventfd.h>
+
+#define NUM_VIRT_QUEUES 3
+#define VIRTIO_BLN_QUEUE_SIZE 128
+#define VIRTIO_BLN_INFLATE 0
+#define VIRTIO_BLN_DEFLATE 1
+#define VIRTIO_BLN_STATS 2
+
+struct bln_dev {
+ struct list_head list;
+ struct virtio_device vdev;
+
+ u32 features;
+
+ /* virtio queue */
+ struct virt_queue vqs[NUM_VIRT_QUEUES];
+ struct thread_pool__job jobs[NUM_VIRT_QUEUES];
+
+ struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
+ struct virtio_balloon_stat *cur_stat;
+ u32 cur_stat_head;
+ u16 stat_count;
+ int stat_waitfd;
+
+ struct virtio_balloon_config config;
+};
+
+static struct bln_dev bdev;
+static int compat_id = -1;
+
+static bool virtio_bln_do_io_request(struct kvm *kvm, struct bln_dev *bdev, struct virt_queue *queue)
+{
+ struct iovec iov[VIRTIO_BLN_QUEUE_SIZE];
+ unsigned int len = 0;
+ u16 out, in, head;
+ u32 *ptrs, i;
+
+ head = virt_queue__get_iov(queue, iov, &out, &in, kvm);
+ ptrs = iov[0].iov_base;
+ len = iov[0].iov_len / sizeof(u32);
+
+ for (i = 0 ; i < len ; i++) {
+ void *guest_ptr;
+
+ guest_ptr = guest_flat_to_host(kvm, (u64)ptrs[i] << VIRTIO_BALLOON_PFN_SHIFT);
+ if (queue == &bdev->vqs[VIRTIO_BLN_INFLATE]) {
+ madvise(guest_ptr, 1 << VIRTIO_BALLOON_PFN_SHIFT, MADV_DONTNEED);
+ bdev->config.actual++;
+ } else if (queue == &bdev->vqs[VIRTIO_BLN_DEFLATE]) {
+ bdev->config.actual--;
+ }
+ }
+
+ virt_queue__set_used_elem(queue, head, len);
+
+ return true;
+}
+
+/*
+ * Handle a buffer on the balloon stats queue.  The guest hands us one
+ * buffer of virtio_balloon_stat entries; we remember it (cur_stat /
+ * cur_stat_head) so virtio_bln__collect_stats() can hand it back, copy
+ * the stats snapshot, and signal the waiting reader via stat_waitfd.
+ *
+ * NOTE(review): the function is declared bool but returns -EFAULT on a
+ * failed eventfd write — that still converts to 'true'; confirm whether
+ * callers ever check the return value.
+ */
+static bool virtio_bln_do_stat_request(struct kvm *kvm, struct bln_dev *bdev, struct virt_queue *queue)
+{
+	struct iovec iov[VIRTIO_BLN_QUEUE_SIZE];
+	u16 out, in, head;
+	struct virtio_balloon_stat *stat;
+	u64 wait_val = 1;
+
+	head = virt_queue__get_iov(queue, iov, &out, &in, kvm);
+	stat = iov[0].iov_base;
+
+	/* Initial empty stat buffer */
+	if (bdev->cur_stat == NULL) {
+		bdev->cur_stat = stat;
+		bdev->cur_stat_head = head;
+
+		return true;
+	}
+
+	memcpy(bdev->stats, stat, iov[0].iov_len);
+
+	bdev->stat_count = iov[0].iov_len / sizeof(struct virtio_balloon_stat);
+	bdev->cur_stat = stat;
+	bdev->cur_stat_head = head;
+
+	if (write(bdev->stat_waitfd, &wait_val, sizeof(wait_val)) <= 0)
+		return -EFAULT;
+
+	return 1;
+}
+
+static void virtio_bln_do_io(struct kvm *kvm, void *param)
+{
+ struct virt_queue *vq = param;
+
+ if (vq == &bdev.vqs[VIRTIO_BLN_STATS]) {
+ virtio_bln_do_stat_request(kvm, &bdev, vq);
+ bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, VIRTIO_BLN_STATS);
+ return;
+ }
+
+ while (virt_queue__available(vq)) {
+ virtio_bln_do_io_request(kvm, &bdev, vq);
+ bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, vq - bdev.vqs);
+ }
+}
+
+static int virtio_bln__collect_stats(struct kvm *kvm)
+{
+ u64 tmp;
+
+ virt_queue__set_used_elem(&bdev.vqs[VIRTIO_BLN_STATS], bdev.cur_stat_head,
+ sizeof(struct virtio_balloon_stat));
+ bdev.vdev.ops->signal_vq(kvm, &bdev.vdev, VIRTIO_BLN_STATS);
+
+ if (read(bdev.stat_waitfd, &tmp, sizeof(tmp)) <= 0)
+ return -EFAULT;
+
+ return 0;
+}
+
+static void virtio_bln__print_stats(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+ int r;
+
+ if (WARN_ON(type != KVM_IPC_STAT || len))
+ return;
+
+ if (virtio_bln__collect_stats(kvm) < 0)
+ return;
+
+ r = write(fd, bdev.stats, sizeof(bdev.stats));
+ if (r < 0)
+ pr_warning("Failed sending memory stats");
+}
+
+/*
+ * KVM_IPC_BALLOON handler: grow or shrink the balloon target by
+ * 'mem' units of 256 pages (1 MiB with 4 KiB pages) and notify the
+ * guest through a config-change interrupt.  A shrink request larger
+ * than the current target is silently ignored to avoid underflow.
+ */
+static void handle_mem(struct kvm *kvm, int fd, u32 type, u32 len, u8 *msg)
+{
+	int mem;
+
+	if (WARN_ON(type != KVM_IPC_BALLOON || len != sizeof(int)))
+		return;
+
+	mem = *(int *)msg;
+	if (mem > 0) {
+		bdev.config.num_pages += 256 * mem;
+	} else if (mem < 0) {
+		if (bdev.config.num_pages < (u32)(256 * (-mem)))
+			return;
+
+		bdev.config.num_pages += 256 * mem;
+	}
+
+	/* Notify that the configuration space has changed */
+	bdev.vdev.ops->signal_config(kvm, &bdev.vdev);
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+ struct bln_dev *bdev = dev;
+
+ return ((u8 *)(&bdev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+ return 1 << VIRTIO_BALLOON_F_STATS_VQ;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+ struct bln_dev *bdev = dev;
+
+ bdev->features = features;
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+ u32 pfn)
+{
+ struct bln_dev *bdev = dev;
+ struct virt_queue *queue;
+ void *p;
+
+ compat__remove_message(compat_id);
+
+ queue = &bdev->vqs[vq];
+ queue->pfn = pfn;
+ p = virtio_get_vq(kvm, queue->pfn, page_size);
+
+ thread_pool__init_job(&bdev->jobs[vq], kvm, virtio_bln_do_io, queue);
+ vring_init(&queue->vring, VIRTIO_BLN_QUEUE_SIZE, p, align);
+
+ return 0;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+ struct bln_dev *bdev = dev;
+
+ thread_pool__do_job(&bdev->jobs[vq]);
+
+ return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+ struct bln_dev *bdev = dev;
+
+ return bdev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+ return VIRTIO_BLN_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+ /* FIXME: dynamic */
+ return size;
+}
+
+struct virtio_ops bln_dev_virtio_ops = (struct virtio_ops) {
+ .get_config = get_config,
+ .get_host_features = get_host_features,
+ .set_guest_features = set_guest_features,
+ .init_vq = init_vq,
+ .notify_vq = notify_vq,
+ .get_pfn_vq = get_pfn_vq,
+ .get_size_vq = get_size_vq,
+ .set_size_vq = set_size_vq,
+};
+
+int virtio_bln__init(struct kvm *kvm)
+{
+ if (!kvm->cfg.balloon)
+ return 0;
+
+ kvm_ipc__register_handler(KVM_IPC_BALLOON, handle_mem);
+ kvm_ipc__register_handler(KVM_IPC_STAT, virtio_bln__print_stats);
+
+ bdev.stat_waitfd = eventfd(0, 0);
+ memset(&bdev.config, 0, sizeof(struct virtio_balloon_config));
+
+ virtio_init(kvm, &bdev, &bdev.vdev, &bln_dev_virtio_ops,
+ VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_BLN,
+ VIRTIO_ID_BALLOON, PCI_CLASS_BLN);
+
+ if (compat_id == -1)
+ compat_id = virtio_compat_add_message("virtio-balloon", "CONFIG_VIRTIO_BALLOON");
+
+ return 0;
+}
+virtio_dev_init(virtio_bln__init);
+
+int virtio_bln__exit(struct kvm *kvm)
+{
+ return 0;
+}
+virtio_dev_exit(virtio_bln__exit);
diff --git a/tools/kvm/virtio/blk.c b/tools/kvm/virtio/blk.c
new file mode 100644
index 0000000..edfa8e6
--- /dev/null
+++ b/tools/kvm/virtio/blk.c
@@ -0,0 +1,338 @@
+#include "kvm/virtio-blk.h"
+
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/guest_compat.h"
+#include "kvm/virtio-pci.h"
+#include "kvm/virtio.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_blk.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <pthread.h>
+
+#define VIRTIO_BLK_MAX_DEV 4
+
+/*
+ * the header and status consume two entries
+ */
+#define DISK_SEG_MAX (VIRTIO_BLK_QUEUE_SIZE - 2)
+#define VIRTIO_BLK_QUEUE_SIZE 256
+#define NUM_VIRT_QUEUES 1
+
+struct blk_dev_req {
+ struct virt_queue *vq;
+ struct blk_dev *bdev;
+ struct iovec iov[VIRTIO_BLK_QUEUE_SIZE];
+ u16 out, in, head;
+ struct kvm *kvm;
+};
+
+struct blk_dev {
+ struct mutex mutex;
+
+ struct list_head list;
+
+ struct virtio_device vdev;
+ struct virtio_blk_config blk_config;
+ struct disk_image *disk;
+ u32 features;
+
+ struct virt_queue vqs[NUM_VIRT_QUEUES];
+ struct blk_dev_req reqs[VIRTIO_BLK_QUEUE_SIZE];
+
+ pthread_t io_thread;
+ int io_efd;
+
+ struct kvm *kvm;
+};
+
+static LIST_HEAD(bdevs);
+static int compat_id = -1;
+
+/*
+ * Completion callback for an in-flight block request (also invoked by
+ * the AIO layer).  Writes the status byte into the request's final
+ * iovec, publishes the used element under the device mutex, and raises
+ * an interrupt if the event-idx logic says the guest needs one.
+ */
+void virtio_blk_complete(void *param, long len)
+{
+	struct blk_dev_req *req = param;
+	struct blk_dev *bdev = req->bdev;
+	int queueid = req->vq - bdev->vqs;
+	u8 *status;
+
+	/* status */
+	status	= req->iov[req->out + req->in - 1].iov_base;
+	*status	= (len < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+
+	mutex_lock(&bdev->mutex);
+	virt_queue__set_used_elem(req->vq, req->head, len);
+	mutex_unlock(&bdev->mutex);
+
+	if (virtio_queue__should_signal(&bdev->vqs[queueid]))
+		bdev->vdev.ops->signal_vq(req->kvm, &bdev->vdev, queueid);
+}
+
+/*
+ * Dispatch one block request.  iov[0] is the virtio_blk_outhdr and the
+ * last iovec is the status byte, so the data payload spans iov+1 with
+ * in + out - 2 entries.  READ/WRITE complete asynchronously through the
+ * disk-image layer; FLUSH and GET_ID complete synchronously here.
+ */
+static void virtio_blk_do_io_request(struct kvm *kvm, struct virt_queue *vq, struct blk_dev_req *req)
+{
+	struct virtio_blk_outhdr *req_hdr;
+	ssize_t block_cnt;
+	struct blk_dev *bdev;
+	struct iovec *iov;
+	u16 out, in;
+	u32 type;
+	u64 sector;
+
+	block_cnt	= -1;
+	bdev		= req->bdev;
+	iov		= req->iov;
+	out		= req->out;
+	in		= req->in;
+	req_hdr		= iov[0].iov_base;
+
+	/* Header fields arrive in guest byte order. */
+	type = virtio_guest_to_host_u32(vq, req_hdr->type);
+	sector = virtio_guest_to_host_u64(vq, req_hdr->sector);
+
+	switch (type) {
+	case VIRTIO_BLK_T_IN:
+		block_cnt = disk_image__read(bdev->disk, sector,
+				iov + 1, in + out - 2, req);
+		break;
+	case VIRTIO_BLK_T_OUT:
+		block_cnt = disk_image__write(bdev->disk, sector,
+				iov + 1, in + out - 2, req);
+		break;
+	case VIRTIO_BLK_T_FLUSH:
+		block_cnt = disk_image__flush(bdev->disk);
+		virtio_blk_complete(req, block_cnt);
+		break;
+	case VIRTIO_BLK_T_GET_ID:
+		block_cnt = VIRTIO_BLK_ID_BYTES;
+		disk_image__get_serial(bdev->disk,
+				(iov + 1)->iov_base, &block_cnt);
+		virtio_blk_complete(req, block_cnt);
+		break;
+	default:
+		pr_warning("request type %d", type);
+		block_cnt = -1;
+		break;
+	}
+}
+
+static void virtio_blk_do_io(struct kvm *kvm, struct virt_queue *vq, struct blk_dev *bdev)
+{
+ struct blk_dev_req *req;
+ u16 head;
+
+ while (virt_queue__available(vq)) {
+ head = virt_queue__pop(vq);
+ req = &bdev->reqs[head];
+ req->head = virt_queue__get_head_iov(vq, req->iov, &req->out,
+ &req->in, head, kvm);
+ req->vq = vq;
+
+ virtio_blk_do_io_request(kvm, vq, req);
+ }
+}
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+ struct blk_dev *bdev = dev;
+
+ return ((u8 *)(&bdev->blk_config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+ return 1UL << VIRTIO_BLK_F_SEG_MAX
+ | 1UL << VIRTIO_BLK_F_FLUSH
+ | 1UL << VIRTIO_RING_F_EVENT_IDX
+ | 1UL << VIRTIO_RING_F_INDIRECT_DESC;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+ struct blk_dev *bdev = dev;
+ struct virtio_blk_config *conf = &bdev->blk_config;
+ struct virtio_blk_geometry *geo = &conf->geometry;
+
+ bdev->features = features;
+
+ conf->capacity = virtio_host_to_guest_u64(&bdev->vdev, conf->capacity);
+ conf->size_max = virtio_host_to_guest_u32(&bdev->vdev, conf->size_max);
+ conf->seg_max = virtio_host_to_guest_u32(&bdev->vdev, conf->seg_max);
+
+ /* Geometry */
+ geo->cylinders = virtio_host_to_guest_u16(&bdev->vdev, geo->cylinders);
+
+ conf->blk_size = virtio_host_to_guest_u32(&bdev->vdev, conf->blk_size);
+ conf->min_io_size = virtio_host_to_guest_u16(&bdev->vdev, conf->min_io_size);
+ conf->opt_io_size = virtio_host_to_guest_u32(&bdev->vdev, conf->opt_io_size);
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+ u32 pfn)
+{
+ struct blk_dev *bdev = dev;
+ struct virt_queue *queue;
+ void *p;
+
+ compat__remove_message(compat_id);
+
+ queue = &bdev->vqs[vq];
+ queue->pfn = pfn;
+ p = virtio_get_vq(kvm, queue->pfn, page_size);
+
+ vring_init(&queue->vring, VIRTIO_BLK_QUEUE_SIZE, p, align);
+ virtio_init_device_vq(&bdev->vdev, queue);
+
+ return 0;
+}
+
+static void *virtio_blk_thread(void *dev)
+{
+ struct blk_dev *bdev = dev;
+ u64 data;
+ int r;
+
+ kvm__set_thread_name("virtio-blk-io");
+
+ while (1) {
+ r = read(bdev->io_efd, &data, sizeof(u64));
+ if (r < 0)
+ continue;
+ virtio_blk_do_io(bdev->kvm, &bdev->vqs[0], bdev);
+ }
+
+ pthread_exit(NULL);
+ return NULL;
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+ struct blk_dev *bdev = dev;
+ u64 data = 1;
+ int r;
+
+ r = write(bdev->io_efd, &data, sizeof(data));
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+ struct blk_dev *bdev = dev;
+
+ return bdev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+ /* FIXME: dynamic */
+ return VIRTIO_BLK_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+ /* FIXME: dynamic */
+ return size;
+}
+
+static struct virtio_ops blk_dev_virtio_ops = (struct virtio_ops) {
+ .get_config = get_config,
+ .get_host_features = get_host_features,
+ .set_guest_features = set_guest_features,
+ .init_vq = init_vq,
+ .notify_vq = notify_vq,
+ .get_pfn_vq = get_pfn_vq,
+ .get_size_vq = get_size_vq,
+ .set_size_vq = set_size_vq,
+};
+
+static int virtio_blk__init_one(struct kvm *kvm, struct disk_image *disk)
+{
+ struct blk_dev *bdev;
+ unsigned int i;
+
+ if (!disk)
+ return -EINVAL;
+
+ bdev = calloc(1, sizeof(struct blk_dev));
+ if (bdev == NULL)
+ return -ENOMEM;
+
+ *bdev = (struct blk_dev) {
+ .mutex = MUTEX_INITIALIZER,
+ .disk = disk,
+ .blk_config = (struct virtio_blk_config) {
+ .capacity = disk->size / SECTOR_SIZE,
+ .seg_max = DISK_SEG_MAX,
+ },
+ .io_efd = eventfd(0, 0),
+ .kvm = kvm,
+ };
+
+ virtio_init(kvm, bdev, &bdev->vdev, &blk_dev_virtio_ops,
+ VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_BLK,
+ VIRTIO_ID_BLOCK, PCI_CLASS_BLK);
+
+ list_add_tail(&bdev->list, &bdevs);
+
+ for (i = 0; i < ARRAY_SIZE(bdev->reqs); i++) {
+ bdev->reqs[i].bdev = bdev;
+ bdev->reqs[i].kvm = kvm;
+ }
+
+ disk_image__set_callback(bdev->disk, virtio_blk_complete);
+
+ pthread_create(&bdev->io_thread, NULL, virtio_blk_thread, bdev);
+ if (compat_id == -1)
+ compat_id = virtio_compat_add_message("virtio-blk", "CONFIG_VIRTIO_BLK");
+
+ return 0;
+}
+
+static int virtio_blk__exit_one(struct kvm *kvm, struct blk_dev *bdev)
+{
+ list_del(&bdev->list);
+ free(bdev);
+
+ return 0;
+}
+
+int virtio_blk__init(struct kvm *kvm)
+{
+ int i, r = 0;
+
+ for (i = 0; i < kvm->nr_disks; i++) {
+ if (kvm->disks[i]->wwpn)
+ continue;
+ r = virtio_blk__init_one(kvm, kvm->disks[i]);
+ if (r < 0)
+ goto cleanup;
+ }
+
+ return 0;
+cleanup:
+ return virtio_blk__exit(kvm);
+}
+virtio_dev_init(virtio_blk__init);
+
+int virtio_blk__exit(struct kvm *kvm)
+{
+ while (!list_empty(&bdevs)) {
+ struct blk_dev *bdev;
+
+ bdev = list_first_entry(&bdevs, struct blk_dev, list);
+ virtio_blk__exit_one(kvm, bdev);
+ }
+
+ return 0;
+}
+virtio_dev_exit(virtio_blk__exit);
diff --git a/tools/kvm/virtio/console.c b/tools/kvm/virtio/console.c
new file mode 100644
index 0000000..384eac1
--- /dev/null
+++ b/tools/kvm/virtio/console.c
@@ -0,0 +1,232 @@
+#include "kvm/virtio-console.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/virtio.h"
+#include "kvm/ioport.h"
+#include "kvm/util.h"
+#include "kvm/term.h"
+#include "kvm/mutex.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/threadpool.h"
+#include "kvm/irq.h"
+#include "kvm/guest_compat.h"
+
+#include <linux/virtio_console.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_blk.h>
+
+#include <sys/uio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <termios.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define VIRTIO_CONSOLE_QUEUE_SIZE 128
+#define VIRTIO_CONSOLE_NUM_QUEUES 2
+#define VIRTIO_CONSOLE_RX_QUEUE 0
+#define VIRTIO_CONSOLE_TX_QUEUE 1
+
+/* State of the single virtio-console device (one RX and one TX queue). */
+struct con_dev {
+	struct mutex			mutex;
+
+	struct virtio_device		vdev;
+	struct virt_queue		vqs[VIRTIO_CONSOLE_NUM_QUEUES];
+	struct virtio_console_config	config;
+	u32				features;
+
+	/* Signalled once the RX queue is initialized (see init_vq). */
+	pthread_cond_t			poll_cond;
+	int				vq_ready;
+
+	struct thread_pool__job		jobs[VIRTIO_CONSOLE_NUM_QUEUES];
+};
+
+static struct con_dev cdev = {
+	.mutex				= MUTEX_INITIALIZER,
+
+	.vq_ready			= 0,
+
+	.config = {
+		.cols			= 80,
+		.rows			= 24,
+		.max_nr_ports		= 1,
+	},
+};
+
+/* Handle of the "device not detected" compat message; -1 = not posted. */
+static int compat_id = -1;
+
+/*
+ * Interrupts are injected for hvc0 only.
+ */
+static void virtio_console__inject_interrupt_callback(struct kvm *kvm, void *param)
+{
+ struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE];
+ struct virt_queue *vq;
+ u16 out, in;
+ u16 head;
+ int len;
+
+ if (kvm->cfg.active_console != CONSOLE_VIRTIO)
+ return;
+
+ mutex_lock(&cdev.mutex);
+
+ vq = param;
+
+ if (!cdev.vq_ready)
+ pthread_cond_wait(&cdev.poll_cond, &cdev.mutex.mutex);
+
+ if (term_readable(0) && virt_queue__available(vq)) {
+ head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+ len = term_getc_iov(kvm, iov, in, 0);
+ virt_queue__set_used_elem(vq, head, len);
+ cdev.vdev.ops->signal_vq(kvm, &cdev.vdev, vq - cdev.vqs);
+ }
+
+ mutex_unlock(&cdev.mutex);
+}
+
+void virtio_console__inject_interrupt(struct kvm *kvm)
+{
+ virtio_console__inject_interrupt_callback(kvm,
+ &cdev.vqs[VIRTIO_CONSOLE_RX_QUEUE]);
+}
+
+static void virtio_console_handle_callback(struct kvm *kvm, void *param)
+{
+ struct iovec iov[VIRTIO_CONSOLE_QUEUE_SIZE];
+ struct virt_queue *vq;
+ u16 out, in;
+ u16 head;
+ u32 len;
+
+ vq = param;
+
+ /*
+ * The current Linux implementation polls for the buffer
+ * to be used, rather than waiting for an interrupt.
+ * So there is no need to inject an interrupt for the tx path.
+ */
+
+ while (virt_queue__available(vq)) {
+ head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+ len = term_putc_iov(iov, out, 0);
+ virt_queue__set_used_elem(vq, head, len);
+ }
+
+}
+
+/* Expose the virtio-console config space to the transport layer. */
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+	struct con_dev *cdev = dev;
+
+	return ((u8 *)(&cdev->config));
+}
+
+/* The console offers no optional virtio features. */
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+	return 0;
+}
+
+/*
+ * Feature negotiation hook.  @features is ignored (none are offered);
+ * this is used as the point where the config fields are byte-swapped
+ * into guest endianness.
+ */
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+	struct con_dev *cdev = dev;
+	struct virtio_console_config *conf = &cdev->config;
+
+	conf->cols = virtio_host_to_guest_u16(&cdev->vdev, conf->cols);
+	conf->rows = virtio_host_to_guest_u16(&cdev->vdev, conf->rows);
+	conf->max_nr_ports = virtio_host_to_guest_u32(&cdev->vdev, conf->max_nr_ports);
+}
+
+/*
+ * Map the guest-provided ring page into a vring, and attach the
+ * appropriate worker job (terminal output for TX, terminal input for RX).
+ * Marking the RX queue ready wakes the poll thread blocked in
+ * virtio_console__inject_interrupt_callback().
+ */
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+		   u32 pfn)
+{
+	struct virt_queue *queue;
+	void *p;
+
+	BUG_ON(vq >= VIRTIO_CONSOLE_NUM_QUEUES);
+
+	compat__remove_message(compat_id);
+
+	queue		= &cdev.vqs[vq];
+	queue->pfn	= pfn;
+	p		= virtio_get_vq(kvm, queue->pfn, page_size);
+
+	vring_init(&queue->vring, VIRTIO_CONSOLE_QUEUE_SIZE, p, align);
+	virtio_init_device_vq(&cdev.vdev, queue);
+
+	if (vq == VIRTIO_CONSOLE_TX_QUEUE) {
+		thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console_handle_callback, queue);
+	} else if (vq == VIRTIO_CONSOLE_RX_QUEUE) {
+		thread_pool__init_job(&cdev.jobs[vq], kvm, virtio_console__inject_interrupt_callback, queue);
+		/* Tell the waiting poll thread that we're ready to go */
+		mutex_lock(&cdev.mutex);
+		cdev.vq_ready = 1;
+		pthread_cond_signal(&cdev.poll_cond);
+		mutex_unlock(&cdev.mutex);
+	}
+
+	return 0;
+}
+
+/* Guest kicked queue @vq: schedule the matching worker job. */
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct con_dev *cdev = dev;
+
+	thread_pool__do_job(&cdev->jobs[vq]);
+
+	return 0;
+}
+
+/* Report the guest page frame number backing queue @vq. */
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	struct con_dev *cdev = dev;
+
+	return cdev->vqs[vq].pfn;
+}
+
+/* All console queues have a fixed size. */
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+	return VIRTIO_CONSOLE_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+	/* FIXME: dynamic */
+	return size;
+}
+
+/* Transport-facing ops table; signal_vq/init etc. are filled in by virtio_init(). */
+static struct virtio_ops con_dev_virtio_ops = (struct virtio_ops) {
+	.get_config		= get_config,
+	.get_host_features	= get_host_features,
+	.set_guest_features	= set_guest_features,
+	.init_vq		= init_vq,
+	.notify_vq		= notify_vq,
+	.get_pfn_vq		= get_pfn_vq,
+	.get_size_vq		= get_size_vq,
+	.set_size_vq		= set_size_vq,
+};
+
+/*
+ * Register the virtio-console device, unless another console backend is
+ * active.  Always returns 0.
+ */
+int virtio_console__init(struct kvm *kvm)
+{
+	if (kvm->cfg.active_console != CONSOLE_VIRTIO)
+		return 0;
+
+	pthread_cond_init(&cdev.poll_cond, NULL);
+
+	virtio_init(kvm, &cdev, &cdev.vdev, &con_dev_virtio_ops,
+		    VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_CONSOLE,
+		    VIRTIO_ID_CONSOLE, PCI_CLASS_CONSOLE);
+	if (compat_id == -1)
+		compat_id = virtio_compat_add_message("virtio-console", "CONFIG_VIRTIO_CONSOLE");
+
+	return 0;
+}
+virtio_dev_init(virtio_console__init);
+
+/* Nothing to tear down; the device is static. */
+int virtio_console__exit(struct kvm *kvm)
+{
+	return 0;
+}
+virtio_dev_exit(virtio_console__exit);
diff --git a/tools/kvm/virtio/core.c b/tools/kvm/virtio/core.c
new file mode 100644
index 0000000..9ae7887
--- /dev/null
+++ b/tools/kvm/virtio/core.c
@@ -0,0 +1,242 @@
+#include <linux/virtio_ring.h>
+#include <linux/types.h>
+#include <sys/uio.h>
+#include <stdlib.h>
+
+#include "kvm/guest_compat.h"
+#include "kvm/barrier.h"
+#include "kvm/virtio.h"
+#include "kvm/virtio-pci.h"
+#include "kvm/virtio-mmio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+
+/*
+ * Publish a completed buffer to the guest: record (head, len) in the
+ * used ring, then advance used->idx.  Barrier ordering is critical --
+ * the element must be visible before the index that exposes it.
+ */
+struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32 head, u32 len)
+{
+	struct vring_used_elem *used_elem;
+	u16 idx = virtio_guest_to_host_u16(queue, queue->vring.used->idx);
+
+	used_elem	= &queue->vring.used->ring[idx % queue->vring.num];
+	used_elem->id	= virtio_host_to_guest_u32(queue, head);
+	used_elem->len	= virtio_host_to_guest_u32(queue, len);
+
+	/*
+	 * Use wmb to assure that used elem was updated with head and len.
+	 * We need a wmb here since we can't advance idx unless we're ready
+	 * to pass the used element to the guest.
+	 */
+	wmb();
+	idx++;
+	queue->vring.used->idx = virtio_host_to_guest_u16(queue, idx);
+
+	/*
+	 * Use wmb to assure used idx has been increased before we signal the guest.
+	 * Without a wmb here the guest may ignore the queue since it won't see
+	 * an updated idx.
+	 */
+	wmb();
+
+	return used_elem;
+}
+
+/* True if @flag is set in the descriptor's (guest-endian) flags field. */
+static inline bool virt_desc__test_flag(struct virt_queue *vq,
+					struct vring_desc *desc, u16 flag)
+{
+	return !!(virtio_guest_to_host_u16(vq, desc->flags) & flag);
+}
+
+/*
+ * Each buffer in the virtqueues is actually a chain of descriptors.  This
+ * function returns the next descriptor in the chain, or vq->vring.num if we're
+ * at the end.
+ */
+static unsigned next_desc(struct virt_queue *vq, struct vring_desc *desc,
+			  unsigned int i, unsigned int max)
+{
+	unsigned int next;
+
+	/* If this descriptor says it doesn't chain, we're done. */
+	if (!virt_desc__test_flag(vq, &desc[i], VRING_DESC_F_NEXT))
+		return max;
+
+	/* Check they're not leading us off end of descriptors. */
+	next = virtio_guest_to_host_u16(vq, desc[i].next);
+	/* Make sure compiler knows to grab that: we don't want it changing! */
+	wmb();
+
+	return next;
+}
+
+/*
+ * Translate the descriptor chain starting at @head into host iovecs.
+ * *out counts device-readable segments, *in device-writable ones;
+ * writable segments are expected after readable ones per the virtio
+ * spec, so a single iov[] array indexed by *out + *in suffices.
+ * Indirect descriptor tables (VIRTIO_RING_F_INDIRECT_DESC) are followed
+ * transparently.  Returns @head unchanged.
+ */
+u16 virt_queue__get_head_iov(struct virt_queue *vq, struct iovec iov[], u16 *out, u16 *in, u16 head, struct kvm *kvm)
+{
+	struct vring_desc *desc;
+	u16 idx;
+	u16 max;
+
+	idx = head;
+	*out = *in = 0;
+	max = vq->vring.num;
+	desc = vq->vring.desc;
+
+	if (virt_desc__test_flag(vq, &desc[idx], VRING_DESC_F_INDIRECT)) {
+		max = virtio_guest_to_host_u32(vq, desc[idx].len) / sizeof(struct vring_desc);
+		desc = guest_flat_to_host(kvm, virtio_guest_to_host_u64(vq, desc[idx].addr));
+		idx = 0;
+	}
+
+	do {
+		/* Grab the first descriptor, and check it's OK. */
+		iov[*out + *in].iov_len = virtio_guest_to_host_u32(vq, desc[idx].len);
+		iov[*out + *in].iov_base = guest_flat_to_host(kvm,
+							      virtio_guest_to_host_u64(vq, desc[idx].addr));
+		/* If this is an input descriptor, increment that count. */
+		if (virt_desc__test_flag(vq, &desc[idx], VRING_DESC_F_WRITE))
+			(*in)++;
+		else
+			(*out)++;
+	} while ((idx = next_desc(vq, desc, idx, max)) != max);
+
+	return head;
+}
+
+/* Pop the next available head and translate its chain (see above). */
+u16 virt_queue__get_iov(struct virt_queue *vq, struct iovec iov[], u16 *out, u16 *in, struct kvm *kvm)
+{
+	u16 head;
+
+	head = virt_queue__pop(vq);
+
+	return virt_queue__get_head_iov(vq, iov, out, in, head, kvm);
+}
+
+/* in and out are relative to guest */
+/* Like virt_queue__get_iov(), but splits the chain into two iovec arrays. */
+u16 virt_queue__get_inout_iov(struct kvm *kvm, struct virt_queue *queue,
+			      struct iovec in_iov[], struct iovec out_iov[],
+			      u16 *in, u16 *out)
+{
+	struct vring_desc *desc;
+	u16 head, idx;
+
+	idx = head = virt_queue__pop(queue);
+	*out = *in = 0;
+	do {
+		u64 addr;
+		desc = virt_queue__get_desc(queue, idx);
+		addr = virtio_guest_to_host_u64(queue, desc->addr);
+		if (virt_desc__test_flag(queue, desc, VRING_DESC_F_WRITE)) {
+			in_iov[*in].iov_base = guest_flat_to_host(kvm, addr);
+			in_iov[*in].iov_len = virtio_guest_to_host_u32(queue, desc->len);
+			(*in)++;
+		} else {
+			out_iov[*out].iov_base = guest_flat_to_host(kvm, addr);
+			out_iov[*out].iov_len = virtio_guest_to_host_u32(queue, desc->len);
+			(*out)++;
+		}
+		if (virt_desc__test_flag(queue, desc, VRING_DESC_F_NEXT))
+			idx = virtio_guest_to_host_u16(queue, desc->next);
+		else
+			break;
+	} while (1);
+
+	return head;
+}
+
+/*
+ * Split a virtio-pci BAR offset into MSI-X vs device-config regions.
+ * With MSI-X enabled the first 4 bytes are the MSI-X fields; the rest
+ * (rebased into *config_off) is device-specific config.
+ */
+int virtio__get_dev_specific_field(int offset, bool msix, u32 *config_off)
+{
+	if (msix) {
+		if (offset < 4)
+			return VIRTIO_PCI_O_MSIX;
+		else
+			offset -= 4;
+	}
+
+	*config_off = offset;
+
+	return VIRTIO_PCI_O_CONFIG;
+}
+
+/*
+ * Event-index suppression (VIRTIO_RING_F_EVENT_IDX): only signal the
+ * guest if used->idx has passed the guest's used_event marker since the
+ * last signal.
+ */
+bool virtio_queue__should_signal(struct virt_queue *vq)
+{
+	u16 old_idx, new_idx, event_idx;
+
+	old_idx		= vq->last_used_signalled;
+	new_idx		= virtio_guest_to_host_u16(vq, vq->vring.used->idx);
+	event_idx	= virtio_guest_to_host_u16(vq, vring_used_event(&vq->vring));
+
+	if (vring_need_event(event_idx, new_idx, old_idx)) {
+		vq->last_used_signalled = new_idx;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Bind a device to a transport (PCI or MMIO): allocate the transport
+ * state, patch the transport callbacks into the device's ops table and
+ * run the transport init.
+ *
+ * NOTE(review): the signal_vq/init/etc. assignments write through
+ * vdev->ops into the caller's (typically static, shared) ops struct --
+ * fine while every device of a type uses the same transport; confirm
+ * before mixing transports.
+ *
+ * Returns 0 on success, -ENOMEM on OOM, -1 for an unknown transport.
+ */
+int virtio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		struct virtio_ops *ops, enum virtio_trans trans,
+		int device_id, int subsys_id, int class)
+{
+	void *virtio;
+
+	switch (trans) {
+	case VIRTIO_PCI:
+		virtio = calloc(sizeof(struct virtio_pci), 1);
+		if (!virtio)
+			return -ENOMEM;
+		vdev->virtio			= virtio;
+		vdev->ops			= ops;
+		vdev->ops->signal_vq		= virtio_pci__signal_vq;
+		vdev->ops->signal_config	= virtio_pci__signal_config;
+		vdev->ops->init			= virtio_pci__init;
+		vdev->ops->exit			= virtio_pci__exit;
+		vdev->ops->init(kvm, dev, vdev, device_id, subsys_id, class);
+		break;
+	case VIRTIO_MMIO:
+		virtio = calloc(sizeof(struct virtio_mmio), 1);
+		if (!virtio)
+			return -ENOMEM;
+		vdev->virtio			= virtio;
+		vdev->ops			= ops;
+		vdev->ops->signal_vq		= virtio_mmio_signal_vq;
+		vdev->ops->signal_config	= virtio_mmio_signal_config;
+		vdev->ops->init			= virtio_mmio_init;
+		vdev->ops->exit			= virtio_mmio_exit;
+		vdev->ops->init(kvm, dev, vdev, device_id, subsys_id, class);
+		break;
+	default:
+		return -1;
+	};
+
+	return 0;
+}
+
+/*
+ * Post a "guest did not initialize this device" compat warning.
+ * Returns the message id (for compat__remove_message) or -ENOMEM.
+ */
+int virtio_compat_add_message(const char *device, const char *config)
+{
+	int len = 1024;
+	int compat_id;
+	char *title;
+	char *desc;
+
+	title = malloc(len);
+	if (!title)
+		return -ENOMEM;
+
+	desc = malloc(len);
+	if (!desc) {
+		free(title);
+		return -ENOMEM;
+	}
+
+	snprintf(title, len, "%s device was not detected.", device);
+	snprintf(desc,  len, "While you have requested a %s device, "
+			     "the guest kernel did not initialize it.\n"
+			     "\tPlease make sure that the guest kernel was "
+			     "compiled with %s=y enabled in .config.",
+			     device, config);
+
+	compat_id = compat__add_message(title, desc);
+
+	/* compat__add_message copies the strings; release our buffers. */
+	free(desc);
+	free(title);
+
+	return compat_id;
+}
diff --git a/tools/kvm/virtio/mmio.c b/tools/kvm/virtio/mmio.c
new file mode 100644
index 0000000..3a2bd62
--- /dev/null
+++ b/tools/kvm/virtio/mmio.c
@@ -0,0 +1,323 @@
+#include "kvm/devices.h"
+#include "kvm/virtio-mmio.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/ioport.h"
+#include "kvm/virtio.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/irq.h"
+#include "kvm/fdt.h"
+
+#include <linux/virtio_mmio.h>
+#include <string.h>
+
+/* Next free guest-physical address in the virtio-mmio window (bump allocator). */
+static u32 virtio_mmio_io_space_blocks = KVM_VIRTIO_MMIO_AREA;
+
+/* Carve @size bytes out of the MMIO window; never freed or reused. */
+static u32 virtio_mmio_get_io_space_block(u32 size)
+{
+	u32 block = virtio_mmio_io_space_blocks;
+	virtio_mmio_io_space_blocks += size;
+
+	return block;
+}
+
+/* Userspace-poll path: forward an ioeventfd kick to the device's notify_vq. */
+static void virtio_mmio_ioevent_callback(struct kvm *kvm, void *param)
+{
+	struct virtio_mmio_ioevent_param *ioeventfd = param;
+	struct virtio_mmio *vmmio = ioeventfd->vdev->virtio;
+
+	ioeventfd->vdev->ops->notify_vq(kvm, vmmio->dev, ioeventfd->vq);
+}
+
+/*
+ * Attach an ioeventfd to QUEUE_NOTIFY writes matching @vq, so guest
+ * kicks bypass the full MMIO exit path.  With vhost, the kernel polls
+ * the fd; otherwise we poll it ourselves.  Returns 0 or a negative
+ * error from ioeventfd__add_event().
+ */
+static int virtio_mmio_init_ioeventfd(struct kvm *kvm,
+				      struct virtio_device *vdev, u32 vq)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	struct ioevent ioevent;
+	int err;
+
+	vmmio->ioeventfds[vq] = (struct virtio_mmio_ioevent_param) {
+		.vdev		= vdev,
+		.vq		= vq,
+	};
+
+	ioevent = (struct ioevent) {
+		.io_addr	= vmmio->addr + VIRTIO_MMIO_QUEUE_NOTIFY,
+		.io_len		= sizeof(u32),
+		.fn		= virtio_mmio_ioevent_callback,
+		.fn_ptr		= &vmmio->ioeventfds[vq],
+		.datamatch	= vq,
+		.fn_kvm		= kvm,
+		.fd		= eventfd(0, 0),
+	};
+
+	if (vdev->use_vhost)
+		/*
+		 * Vhost will poll the eventfd in host kernel side,
+		 * no need to poll in userspace.
+		 */
+		err = ioeventfd__add_event(&ioevent, 0);
+	else
+		/* Need to poll in userspace. */
+		err = ioeventfd__add_event(&ioevent, IOEVENTFD_FLAG_USER_POLL);
+	if (err)
+		return err;
+
+	/* Hand the fd to devices (e.g. vhost) that consume kicks directly. */
+	if (vdev->ops->notify_vq_eventfd)
+		vdev->ops->notify_vq_eventfd(kvm, vmmio->dev, vq, ioevent.fd);
+
+	return 0;
+}
+
+/* Raise a used-ring interrupt: set the VRING bit and pulse the IRQ line. */
+int virtio_mmio_signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+
+	vmmio->hdr.interrupt_state |= VIRTIO_MMIO_INT_VRING;
+	kvm__irq_trigger(vmmio->kvm, vmmio->irq);
+
+	return 0;
+}
+
+/* Raise a config-change interrupt: set the CONFIG bit and pulse the IRQ line. */
+int virtio_mmio_signal_config(struct kvm *kvm, struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+
+	vmmio->hdr.interrupt_state |= VIRTIO_MMIO_INT_CONFIG;
+	kvm__irq_trigger(vmmio->kvm, vmmio->irq);
+
+	return 0;
+}
+
+/*
+ * Handle guest access to the device-specific config area (offsets at and
+ * beyond VIRTIO_MMIO_CONFIG), one byte at a time through the device's
+ * get_config() buffer.  @addr is already rebased to the config area.
+ *
+ * Fix: the write path previously stored "*(u8 *)data + i" -- the FIRST
+ * byte of the guest buffer plus the loop index, an arithmetic value --
+ * so any multi-byte config write was corrupted.  Byte i of the guest
+ * buffer is simply data[i], mirroring the read path.
+ */
+static void virtio_mmio_device_specific(struct kvm_cpu *vcpu,
+					u64 addr, u8 *data, u32 len,
+					u8 is_write, struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	u32 i;
+
+	for (i = 0; i < len; i++) {
+		if (is_write)
+			vdev->ops->get_config(vmmio->kvm, vmmio->dev)[addr + i] =
+					      data[i];
+		else
+			data[i] = vdev->ops->get_config(vmmio->kvm,
+							vmmio->dev)[addr + i];
+	}
+}
+
+/* Guest read of a virtio-mmio control register (offset < VIRTIO_MMIO_CONFIG). */
+static void virtio_mmio_config_in(struct kvm_cpu *vcpu,
+				  u64 addr, void *data, u32 len,
+				  struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	u32 val = 0;
+
+	switch (addr) {
+	case VIRTIO_MMIO_MAGIC_VALUE:
+	case VIRTIO_MMIO_VERSION:
+	case VIRTIO_MMIO_DEVICE_ID:
+	case VIRTIO_MMIO_VENDOR_ID:
+	case VIRTIO_MMIO_STATUS:
+	case VIRTIO_MMIO_INTERRUPT_STATUS:
+		/* These mirror the header struct directly; hdr layout
+		 * matches the legacy virtio-mmio register map. */
+		ioport__write32(data, *(u32 *)(((void *)&vmmio->hdr) + addr));
+		break;
+	case VIRTIO_MMIO_HOST_FEATURES:
+		/* Only feature word 0 is implemented. */
+		if (vmmio->hdr.host_features_sel == 0)
+			val = vdev->ops->get_host_features(vmmio->kvm,
+							   vmmio->dev);
+		ioport__write32(data, val);
+		break;
+	case VIRTIO_MMIO_QUEUE_PFN:
+		val = vdev->ops->get_pfn_vq(vmmio->kvm, vmmio->dev,
+					    vmmio->hdr.queue_sel);
+		ioport__write32(data, val);
+		break;
+	case VIRTIO_MMIO_QUEUE_NUM_MAX:
+		val = vdev->ops->get_size_vq(vmmio->kvm, vmmio->dev,
+					     vmmio->hdr.queue_sel);
+		ioport__write32(data, val);
+		break;
+	default:
+		break;
+	}
+}
+
+/* Guest write of a virtio-mmio control register (offset < VIRTIO_MMIO_CONFIG). */
+static void virtio_mmio_config_out(struct kvm_cpu *vcpu,
+				   u64 addr, void *data, u32 len,
+				   struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	struct kvm *kvm = vmmio->kvm;
+	u32 val = 0;
+
+	switch (addr) {
+	case VIRTIO_MMIO_HOST_FEATURES_SEL:
+	case VIRTIO_MMIO_GUEST_FEATURES_SEL:
+	case VIRTIO_MMIO_QUEUE_SEL:
+		/* Plain stores into the matching header field. */
+		val = ioport__read32(data);
+		*(u32 *)(((void *)&vmmio->hdr) + addr) = val;
+		break;
+	case VIRTIO_MMIO_STATUS:
+		vmmio->hdr.status = ioport__read32(data);
+		if (!vmmio->hdr.status) /* Sample endianness on reset */
+			vdev->endian = kvm_cpu__get_endianness(vcpu);
+		if (vdev->ops->notify_status)
+			vdev->ops->notify_status(kvm, vmmio->dev, vmmio->hdr.status);
+		break;
+	case VIRTIO_MMIO_GUEST_FEATURES:
+		if (vmmio->hdr.guest_features_sel == 0) {
+			val = ioport__read32(data);
+			vdev->ops->set_guest_features(vmmio->kvm,
+						      vmmio->dev, val);
+		}
+		break;
+	case VIRTIO_MMIO_GUEST_PAGE_SIZE:
+		val = ioport__read32(data);
+		vmmio->hdr.guest_page_size = val;
+		break;
+	case VIRTIO_MMIO_QUEUE_NUM:
+		val = ioport__read32(data);
+		vmmio->hdr.queue_num = val;
+		vdev->ops->set_size_vq(vmmio->kvm, vmmio->dev,
+				       vmmio->hdr.queue_sel, val);
+		break;
+	case VIRTIO_MMIO_QUEUE_ALIGN:
+		val = ioport__read32(data);
+		vmmio->hdr.queue_align = val;
+		break;
+	case VIRTIO_MMIO_QUEUE_PFN:
+		/* Writing the ring PFN activates the selected queue. */
+		val = ioport__read32(data);
+		virtio_mmio_init_ioeventfd(vmmio->kvm, vdev, vmmio->hdr.queue_sel);
+		vdev->ops->init_vq(vmmio->kvm, vmmio->dev,
+				   vmmio->hdr.queue_sel,
+				   vmmio->hdr.guest_page_size,
+				   vmmio->hdr.queue_align,
+				   val);
+		break;
+	case VIRTIO_MMIO_QUEUE_NOTIFY:
+		/* Slow path; fast kicks go through the ioeventfd. */
+		val = ioport__read32(data);
+		vdev->ops->notify_vq(vmmio->kvm, vmmio->dev, val);
+		break;
+	case VIRTIO_MMIO_INTERRUPT_ACK:
+		val = ioport__read32(data);
+		vmmio->hdr.interrupt_state &= ~val;
+		break;
+	default:
+		break;
+	};
+}
+
+/*
+ * MMIO fault handler for the device window: dispatch to the control
+ * registers or, past VIRTIO_MMIO_CONFIG, the device-specific config.
+ */
+static void virtio_mmio_mmio_callback(struct kvm_cpu *vcpu,
+				      u64 addr, u8 *data, u32 len,
+				      u8 is_write, void *ptr)
+{
+	struct virtio_device *vdev = ptr;
+	struct virtio_mmio *vmmio = vdev->virtio;
+	u32 offset = addr - vmmio->addr;
+
+	if (offset >= VIRTIO_MMIO_CONFIG) {
+		offset -= VIRTIO_MMIO_CONFIG;
+		virtio_mmio_device_specific(vcpu, offset, data, len, is_write, ptr);
+		return;
+	}
+
+	if (is_write)
+		virtio_mmio_config_out(vcpu, offset, data, len, ptr);
+	else
+		virtio_mmio_config_in(vcpu, offset, data, len, ptr);
+}
+
+#ifdef CONFIG_HAS_LIBFDT
+#define DEVICE_NAME_MAX_LEN 32
+/* Emit a "virtio,mmio" device-tree node (reg + interrupt) for this device. */
+static void generate_virtio_mmio_fdt_node(void *fdt,
+					  struct device_header *dev_hdr,
+					  void (*generate_irq_prop)(void *fdt,
+								    u8 irq))
+{
+	char dev_name[DEVICE_NAME_MAX_LEN];
+	struct virtio_mmio *vmmio = container_of(dev_hdr,
+						 struct virtio_mmio,
+						 dev_hdr);
+	u64 addr = vmmio->addr;
+	u64 reg_prop[] = {
+		cpu_to_fdt64(addr),
+		cpu_to_fdt64(VIRTIO_MMIO_IO_SIZE),
+	};
+
+	snprintf(dev_name, DEVICE_NAME_MAX_LEN, "virtio@%llx", addr);
+
+	_FDT(fdt_begin_node(fdt, dev_name));
+	_FDT(fdt_property_string(fdt, "compatible", "virtio,mmio"));
+	_FDT(fdt_property(fdt, "reg", reg_prop, sizeof(reg_prop)));
+	generate_irq_prop(fdt, vmmio->irq);
+	_FDT(fdt_end_node(fdt));
+}
+#else
+/* Stub used when libfdt is unavailable at build time. */
+static void generate_virtio_mmio_fdt_node(void *fdt,
+					  struct device_header *dev_hdr,
+					  void (*generate_irq_prop)(void *fdt,
+								    u8 irq))
+{
+	die("Unable to generate device tree nodes without libfdt\n");
+}
+#endif
+
+/* Allocate an interrupt line for the device (called by the device bus). */
+void virtio_mmio_assign_irq(struct device_header *dev_hdr)
+{
+	struct virtio_mmio *vmmio = container_of(dev_hdr,
+						 struct virtio_mmio,
+						 dev_hdr);
+	vmmio->irq = irq__alloc_line();
+}
+
+/*
+ * Transport init: allocate an MMIO window, register its fault handler,
+ * fill in the legacy virtio-mmio header and register the device so an
+ * FDT node can be generated.  Always returns 0.
+ */
+int virtio_mmio_init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
+		     int device_id, int subsys_id, int class)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+
+	vmmio->addr	= virtio_mmio_get_io_space_block(VIRTIO_MMIO_IO_SIZE);
+	vmmio->kvm	= kvm;
+	vmmio->dev	= dev;
+
+	kvm__register_mmio(kvm, vmmio->addr, VIRTIO_MMIO_IO_SIZE,
+			   false, virtio_mmio_mmio_callback, vdev);
+
+	vmmio->hdr = (struct virtio_mmio_hdr) {
+		.magic		= {'v', 'i', 'r', 't'},
+		.version	= 1,
+		.device_id	= subsys_id,
+		.vendor_id	= 0x4d564b4c , /* 'LKVM' */
+		.queue_num_max	= 256,
+	};
+
+	vmmio->dev_hdr = (struct device_header) {
+		.bus_type	= DEVICE_BUS_MMIO,
+		.data		= generate_virtio_mmio_fdt_node,
+	};
+
+	device__register(&vmmio->dev_hdr);
+
+	/*
+	 * Instantiate guest virtio-mmio devices using kernel command line
+	 * (or module) parameter, e.g
+	 *
+	 * virtio_mmio.devices=0x200@0xd2000000:5,0x200@0xd2000200:6
+	 */
+	/* NOTE(review): vmmio->irq is printed here but assigned in
+	 * virtio_mmio_assign_irq() -- confirm that runs first. */
+	pr_info("virtio-mmio.devices=0x%x@0x%x:%d\n", VIRTIO_MMIO_IO_SIZE, vmmio->addr, vmmio->irq);
+
+	return 0;
+}
+
+/* Transport teardown: unmap the window and drop all queue ioeventfds. */
+int virtio_mmio_exit(struct kvm *kvm, struct virtio_device *vdev)
+{
+	struct virtio_mmio *vmmio = vdev->virtio;
+	int i;
+
+	kvm__deregister_mmio(kvm, vmmio->addr);
+
+	for (i = 0; i < VIRTIO_MMIO_MAX_VQ; i++)
+		ioeventfd__del_event(vmmio->addr + VIRTIO_MMIO_QUEUE_NOTIFY, i);
+
+	return 0;
+}
diff --git a/tools/kvm/virtio/net.c b/tools/kvm/virtio/net.c
new file mode 100644
index 0000000..c8af385
--- /dev/null
+++ b/tools/kvm/virtio/net.c
@@ -0,0 +1,855 @@
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/virtio-net.h"
+#include "kvm/virtio.h"
+#include "kvm/types.h"
+#include "kvm/mutex.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/irq.h"
+#include "kvm/uip.h"
+#include "kvm/guest_compat.h"
+#include "kvm/iovec.h"
+
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/if_tun.h>
+#include <linux/types.h>
+
+#include <arpa/inet.h>
+#include <net/if.h>
+
+#include <unistd.h>
+#include <fcntl.h>
+
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/eventfd.h>
+
+#define VIRTIO_NET_QUEUE_SIZE		256
+#define VIRTIO_NET_NUM_QUEUES		8
+
+struct net_dev;
+
+/* Backend I/O: either a tap device or the userspace uip stack. */
+struct net_dev_operations {
+	int (*rx)(struct iovec *iov, u16 in, struct net_dev *ndev);
+	int (*tx)(struct iovec *iov, u16 in, struct net_dev *ndev);
+};
+
+/*
+ * One virtio-net device.  Queues are laid out as RX/TX pairs
+ * (even = RX, odd = TX) with the control queue last.
+ */
+struct net_dev {
+	struct mutex			mutex;
+	struct virtio_device		vdev;
+	struct list_head		list;
+
+	struct virt_queue		vqs[VIRTIO_NET_NUM_QUEUES * 2 + 1];
+	struct virtio_net_config	config;
+	u32				features, rx_vqs, tx_vqs, queue_pairs;
+
+	/* One worker thread + wakeup condvar per queue. */
+	pthread_t			io_thread[VIRTIO_NET_NUM_QUEUES * 2 + 1];
+	struct mutex			io_lock[VIRTIO_NET_NUM_QUEUES * 2 + 1];
+	pthread_cond_t			io_cond[VIRTIO_NET_NUM_QUEUES * 2 + 1];
+
+	int				vhost_fd;
+	int				tap_fd;
+	char				tap_name[IFNAMSIZ];
+
+	int				mode;
+
+	struct uip_info			info;
+	struct net_dev_operations	*ops;
+	struct kvm			*kvm;
+
+	struct virtio_net_params	*params;
+};
+
+static LIST_HEAD(ndevs);
+static int compat_id = -1;
+
+#define MAX_PACKET_SIZE 65550
+
+/* Was @feature (bit number) negotiated by the guest? */
+static bool has_virtio_feature(struct net_dev *ndev, u32 feature)
+{
+	return ndev->features & (1 << feature);
+}
+
+/* Byte-swap a guest-written TX header into host endianness. */
+static void virtio_net_fix_tx_hdr(struct virtio_net_hdr *hdr, struct net_dev *ndev)
+{
+	hdr->hdr_len		= virtio_guest_to_host_u16(&ndev->vdev, hdr->hdr_len);
+	hdr->gso_size		= virtio_guest_to_host_u16(&ndev->vdev, hdr->gso_size);
+	hdr->csum_start		= virtio_guest_to_host_u16(&ndev->vdev, hdr->csum_start);
+	hdr->csum_offset	= virtio_guest_to_host_u16(&ndev->vdev, hdr->csum_offset);
+}
+
+/* Byte-swap a host-filled RX header into guest endianness. */
+static void virtio_net_fix_rx_hdr(struct virtio_net_hdr_mrg_rxbuf *hdr, struct net_dev *ndev)
+{
+	hdr->hdr.hdr_len	= virtio_host_to_guest_u16(&ndev->vdev, hdr->hdr.hdr_len);
+	hdr->hdr.gso_size	= virtio_host_to_guest_u16(&ndev->vdev, hdr->hdr.gso_size);
+	hdr->hdr.csum_start	= virtio_host_to_guest_u16(&ndev->vdev, hdr->hdr.csum_start);
+	hdr->hdr.csum_offset	= virtio_host_to_guest_u16(&ndev->vdev, hdr->hdr.csum_offset);
+	if (has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF))
+		hdr->num_buffers = virtio_host_to_guest_u16(&ndev->vdev, hdr->num_buffers);
+}
+
+/*
+ * Per-RX-queue worker: read one frame at a time from the backend into a
+ * bounce buffer, then scatter it across one or more guest buffer chains
+ * (more than one only with VIRTIO_NET_F_MRG_RXBUF).  Claims its queue id
+ * (even indices are RX) from ndev->rx_vqs under the device mutex.
+ */
+static void *virtio_net_rx_thread(void *p)
+{
+	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
+	struct virt_queue *vq;
+	struct kvm *kvm;
+	struct net_dev *ndev = p;
+	u16 out, in;
+	u16 head;
+	int len, copied;
+	u32 id;
+
+	mutex_lock(&ndev->mutex);
+	id = ndev->rx_vqs++ * 2;
+	mutex_unlock(&ndev->mutex);
+
+	kvm__set_thread_name("virtio-net-rx");
+
+	kvm = ndev->kvm;
+	vq = &ndev->vqs[id];
+
+	while (1) {
+		/* Sleep until the guest posts RX buffers (kick signals io_cond). */
+		mutex_lock(&ndev->io_lock[id]);
+		if (!virt_queue__available(vq))
+			pthread_cond_wait(&ndev->io_cond[id], &ndev->io_lock[id].mutex);
+		mutex_unlock(&ndev->io_lock[id]);
+
+		while (virt_queue__available(vq)) {
+			unsigned char buffer[MAX_PACKET_SIZE + sizeof(struct virtio_net_hdr_mrg_rxbuf)];
+			struct iovec dummy_iov = {
+				.iov_base = buffer,
+				.iov_len  = sizeof(buffer),
+			};
+			struct virtio_net_hdr_mrg_rxbuf *hdr;
+			int i;
+
+			len = ndev->ops->rx(&dummy_iov, 1, ndev);
+			if (len < 0) {
+				pr_warning("%s: rx on vq %u failed (%d), exiting thread\n",
+						__func__, id, len);
+				goto out_err;
+			}
+
+			copied = i = 0;
+			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+			hdr = iov[0].iov_base;
+			while (copied < len) {
+				size_t iovsize = min_t(size_t, len - copied, iov_size(iov, in));
+
+				memcpy_toiovec(iov, buffer + copied, iovsize);
+				copied += iovsize;
+				/* Fix the header endianness once, in the first chain. */
+				if (i++ == 0)
+					virtio_net_fix_rx_hdr(hdr, ndev);
+				if (has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF)) {
+					u16 num_buffers = virtio_guest_to_host_u16(vq, hdr->num_buffers);
+					hdr->num_buffers = virtio_host_to_guest_u16(vq, num_buffers + 1);
+				}
+				virt_queue__set_used_elem(vq, head, iovsize);
+				if (copied == len)
+					break;
+				/* NOTE(review): busy-waits (sleep(0)) for the
+				 * guest to post more buffers mid-frame. */
+				while (!virt_queue__available(vq))
+					sleep(0);
+				head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+			}
+			/* We should interrupt guest right now, otherwise latency is huge. */
+			if (virtio_queue__should_signal(vq))
+				ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, id);
+		}
+	}
+
+out_err:
+	pthread_exit(NULL);
+	return NULL;
+
+}
+
+/*
+ * Per-TX-queue worker: drain guest TX chains to the backend.  Claims its
+ * queue id (odd indices are TX) from ndev->tx_vqs under the device mutex.
+ */
+static void *virtio_net_tx_thread(void *p)
+{
+	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
+	struct virt_queue *vq;
+	struct kvm *kvm;
+	struct net_dev *ndev = p;
+	u16 out, in;
+	u16 head;
+	int len;
+	u32 id;
+
+	mutex_lock(&ndev->mutex);
+	id = ndev->tx_vqs++ * 2 + 1;
+	mutex_unlock(&ndev->mutex);
+
+	kvm__set_thread_name("virtio-net-tx");
+
+	kvm = ndev->kvm;
+	vq = &ndev->vqs[id];
+
+	while (1) {
+		/* Sleep until the guest kicks this queue. */
+		mutex_lock(&ndev->io_lock[id]);
+		if (!virt_queue__available(vq))
+			pthread_cond_wait(&ndev->io_cond[id], &ndev->io_lock[id].mutex);
+		mutex_unlock(&ndev->io_lock[id]);
+
+		while (virt_queue__available(vq)) {
+			struct virtio_net_hdr *hdr;
+			head = virt_queue__get_iov(vq, iov, &out, &in, kvm);
+			hdr = iov[0].iov_base;
+			virtio_net_fix_tx_hdr(hdr, ndev);
+			len = ndev->ops->tx(iov, out, ndev);
+			if (len < 0) {
+				pr_warning("%s: tx on vq %u failed (%d)\n",
+						__func__, id, errno);
+				goto out_err;
+			}
+
+			virt_queue__set_used_elem(vq, head, len);
+		}
+
+		if (virtio_queue__should_signal(vq))
+			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, id);
+	}
+
+out_err:
+	pthread_exit(NULL);
+	return NULL;
+}
+
+/* Acknowledge a VIRTIO_NET_CTRL_MQ request; queue count is fixed at init. */
+static virtio_net_ctrl_ack virtio_net_handle_mq(struct kvm* kvm, struct net_dev *ndev, struct virtio_net_ctrl_hdr *ctrl)
+{
+	/* Not much to do here */
+	return VIRTIO_NET_OK;
+}
+
+/*
+ * Control-queue worker: each request is a ctrl header (first iovec) plus
+ * a one-byte ack status written into the first writable iovec.
+ */
+static void *virtio_net_ctrl_thread(void *p)
+{
+	struct iovec iov[VIRTIO_NET_QUEUE_SIZE];
+	u16 out, in, head;
+	struct net_dev *ndev = p;
+	struct kvm *kvm = ndev->kvm;
+	u32 id = ndev->queue_pairs * 2;
+	struct virt_queue *vq = &ndev->vqs[id];
+	struct virtio_net_ctrl_hdr *ctrl;
+	virtio_net_ctrl_ack *ack;
+
+	while (1) {
+		mutex_lock(&ndev->io_lock[id]);
+		if (!virt_queue__available(vq))
+			pthread_cond_wait(&ndev->io_cond[id], &ndev->io_lock[id].mutex);
+		mutex_unlock(&ndev->io_lock[id]);
+
+		while (virt_queue__available(vq)) {
+			head = virt_queue__get_iov(&ndev->vqs[id], iov, &out, &in, kvm);
+			ctrl = iov[0].iov_base;
+			ack = iov[out].iov_base;
+
+			switch (ctrl->class) {
+			case VIRTIO_NET_CTRL_MQ:
+				*ack = virtio_net_handle_mq(kvm, ndev, ctrl);
+				break;
+			default:
+				*ack = VIRTIO_NET_ERR;
+				break;
+			}
+			virt_queue__set_used_elem(&ndev->vqs[id], head, iov[out].iov_len);
+		}
+
+		if (virtio_queue__should_signal(&ndev->vqs[id]))
+			ndev->vdev.ops->signal_vq(kvm, &ndev->vdev, id);
+	}
+
+	pthread_exit(NULL);
+
+	return NULL;
+}
+
+/* Queue kick: wake the worker thread servicing @queue. */
+static void virtio_net_handle_callback(struct kvm *kvm, struct net_dev *ndev, int queue)
+{
+	if ((u32)queue >= (ndev->queue_pairs * 2 + 1)) {
+		pr_warning("Unknown queue index %u", queue);
+		return;
+	}
+
+	mutex_lock(&ndev->io_lock[queue]);
+	pthread_cond_signal(&ndev->io_cond[queue]);
+	mutex_unlock(&ndev->io_lock[queue]);
+}
+
+/*
+ * Create and configure the host tap device: open /dev/net/tun, set the
+ * vnet header size and offloads, then either run the user's setup script
+ * or configure IP/flags via ioctls.  Returns true (1) on success.
+ */
+static bool virtio_net__tap_init(struct net_dev *ndev)
+{
+	/* NOTE(review): socket() result is not checked; sock == -1 would
+	 * make the SIOC* ioctls below fail -- confirm acceptable. */
+	int sock = socket(AF_INET, SOCK_STREAM, 0);
+	int pid, status, offload, hdr_len;
+	struct sockaddr_in sin = {0};
+	struct ifreq ifr;
+	const struct virtio_net_params *params = ndev->params;
+	bool skipconf = !!params->tapif;
+
+	/* Did the user already gave us the FD? */
+	if (params->fd) {
+		ndev->tap_fd = params->fd;
+		return 1;
+	}
+
+	ndev->tap_fd = open("/dev/net/tun", O_RDWR);
+	if (ndev->tap_fd < 0) {
+		pr_warning("Unable to open /dev/net/tun");
+		goto fail;
+	}
+
+	memset(&ifr, 0, sizeof(ifr));
+	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+	if (params->tapif)
+		strncpy(ifr.ifr_name, params->tapif, sizeof(ifr.ifr_name));
+	if (ioctl(ndev->tap_fd, TUNSETIFF, &ifr) < 0) {
+		pr_warning("Config tap device error. Are you root?");
+		goto fail;
+	}
+
+	/* Kernel may have renamed the interface; remember the real name. */
+	strncpy(ndev->tap_name, ifr.ifr_name, sizeof(ndev->tap_name));
+
+	if (ioctl(ndev->tap_fd, TUNSETNOCSUM, 1) < 0) {
+		pr_warning("Config tap device TUNSETNOCSUM error");
+		goto fail;
+	}
+
+	/* vnet header size depends on whether mergeable RX buffers are on. */
+	hdr_len = has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ?
+			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
+			sizeof(struct virtio_net_hdr);
+	if (ioctl(ndev->tap_fd, TUNSETVNETHDRSZ, &hdr_len) < 0)
+		pr_warning("Config tap device TUNSETVNETHDRSZ error");
+
+	offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_UFO;
+	if (ioctl(ndev->tap_fd, TUNSETOFFLOAD, offload) < 0) {
+		pr_warning("Config tap device TUNSETOFFLOAD error");
+		goto fail;
+	}
+
+	if (strcmp(params->script, "none")) {
+		/* Delegate interface setup to the user-provided script. */
+		pid = fork();
+		if (pid == 0) {
+			execl(params->script, params->script, ndev->tap_name, NULL);
+			_exit(1);
+		} else {
+			waitpid(pid, &status, 0);
+			if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
+				pr_warning("Fail to setup tap by %s", params->script);
+				goto fail;
+			}
+		}
+	} else if (!skipconf) {
+		memset(&ifr, 0, sizeof(ifr));
+		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ndev->tap_name));
+		sin.sin_addr.s_addr = inet_addr(params->host_ip);
+		memcpy(&(ifr.ifr_addr), &sin, sizeof(ifr.ifr_addr));
+		ifr.ifr_addr.sa_family = AF_INET;
+		if (ioctl(sock, SIOCSIFADDR, &ifr) < 0) {
+			pr_warning("Could not set ip address on tap device");
+			goto fail;
+		}
+	}
+
+	if (!skipconf) {
+		memset(&ifr, 0, sizeof(ifr));
+		strncpy(ifr.ifr_name, ndev->tap_name, sizeof(ndev->tap_name));
+		ioctl(sock, SIOCGIFFLAGS, &ifr);
+		ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
+		if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0)
+			pr_warning("Could not bring tap device up");
+	}
+
+	close(sock);
+
+	return 1;
+
+fail:
+	if (sock >= 0)
+		close(sock);
+	if (ndev->tap_fd >= 0)
+		close(ndev->tap_fd);
+
+	return 0;
+}
+
+static inline int tap_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
+{
+ return writev(ndev->tap_fd, iov, out);
+}
+
+static inline int tap_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
+{
+ return readv(ndev->tap_fd, iov, in);
+}
+
+static inline int uip_ops_tx(struct iovec *iov, u16 out, struct net_dev *ndev)
+{
+ return uip_tx(iov, out, &ndev->info);
+}
+
+static inline int uip_ops_rx(struct iovec *iov, u16 in, struct net_dev *ndev)
+{
+ return uip_rx(iov, in, &ndev->info);
+}
+
+static struct net_dev_operations tap_ops = {
+ .rx = tap_ops_rx,
+ .tx = tap_ops_tx,
+};
+
+static struct net_dev_operations uip_ops = {
+ .rx = uip_ops_rx,
+ .tx = uip_ops_tx,
+};
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+ struct net_dev *ndev = dev;
+
+ return ((u8 *)(&ndev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+ struct net_dev *ndev = dev;
+
+ return 1UL << VIRTIO_NET_F_MAC
+ | 1UL << VIRTIO_NET_F_CSUM
+ | 1UL << VIRTIO_NET_F_HOST_UFO
+ | 1UL << VIRTIO_NET_F_HOST_TSO4
+ | 1UL << VIRTIO_NET_F_HOST_TSO6
+ | 1UL << VIRTIO_NET_F_GUEST_UFO
+ | 1UL << VIRTIO_NET_F_GUEST_TSO4
+ | 1UL << VIRTIO_NET_F_GUEST_TSO6
+ | 1UL << VIRTIO_RING_F_EVENT_IDX
+ | 1UL << VIRTIO_RING_F_INDIRECT_DESC
+ | 1UL << VIRTIO_NET_F_CTRL_VQ
+ | 1UL << VIRTIO_NET_F_MRG_RXBUF
+ | 1UL << (ndev->queue_pairs > 1 ? VIRTIO_NET_F_MQ : 0);
+}
+
/*
 * Negotiate the feature set with the in-kernel vhost-net backend.
 * Returns the result of the VHOST_SET_FEATURES ioctl (0 on success).
 */
static int virtio_net__vhost_set_features(struct net_dev *ndev)
{
	u64 features = 1UL << VIRTIO_RING_F_EVENT_IDX;
	u64 vhost_features;

	if (ioctl(ndev->vhost_fd, VHOST_GET_FEATURES, &vhost_features) != 0)
		die_perror("VHOST_GET_FEATURES failed");

	/* make sure both sides support mergeable rx buffers */
	if (vhost_features & 1UL << VIRTIO_NET_F_MRG_RXBUF &&
	    has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF))
		features |= 1UL << VIRTIO_NET_F_MRG_RXBUF;

	return ioctl(ndev->vhost_fd, VHOST_SET_FEATURES, &features);
}
+
/*
 * Called when the guest writes its acked feature set.  Converts config
 * fields to guest byte order and brings up the chosen backend (tap or
 * user-mode uip).
 */
static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
{
	struct net_dev *ndev = dev;
	struct virtio_net_config *conf = &ndev->config;

	ndev->features = features;

	/* config space is presented to the guest in guest byte order */
	conf->status = virtio_host_to_guest_u16(&ndev->vdev, conf->status);
	conf->max_virtqueue_pairs = virtio_host_to_guest_u16(&ndev->vdev,
							     conf->max_virtqueue_pairs);

	if (ndev->mode == NET_MODE_TAP) {
		if (!virtio_net__tap_init(ndev))
			die_perror("You have requested a TAP device, but creation of one has failed because");
		if (ndev->vhost_fd &&
		    virtio_net__vhost_set_features(ndev) != 0)
			die_perror("VHOST_SET_FEATURES failed");
	} else {
		/* vnet header size depends on whether MRG_RXBUF was negotiated */
		ndev->info.vnet_hdr_len = has_virtio_feature(ndev, VIRTIO_NET_F_MRG_RXBUF) ?
						sizeof(struct virtio_net_hdr_mrg_rxbuf) :
						sizeof(struct virtio_net_hdr);
		uip_init(&ndev->info);
	}
}
+
+static bool is_ctrl_vq(struct net_dev *ndev, u32 vq)
+{
+ return vq == (u32)(ndev->queue_pairs * 2);
+}
+
/*
 * Set up virtqueue @vq whose ring the guest placed at page frame @pfn.
 *
 * Three cases:
 *  - the control queue is always served by a userspace thread;
 *  - without vhost (vhost_fd == 0), odd queues get a TX thread and even
 *    queues an RX thread;
 *  - with vhost, the ring geometry and addresses are handed to the
 *    in-kernel backend via the VHOST_SET_VRING_* ioctls.
 */
static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
		   u32 pfn)
{
	struct vhost_vring_state state = { .index = vq };
	struct vhost_vring_addr addr;
	struct net_dev *ndev = dev;
	struct virt_queue *queue;
	void *p;
	int r;

	compat__remove_message(compat_id);

	queue = &ndev->vqs[vq];
	queue->pfn = pfn;
	p = virtio_get_vq(kvm, queue->pfn, page_size);

	vring_init(&queue->vring, VIRTIO_NET_QUEUE_SIZE, p, align);
	virtio_init_device_vq(&ndev->vdev, queue);

	mutex_init(&ndev->io_lock[vq]);
	pthread_cond_init(&ndev->io_cond[vq], NULL);
	if (is_ctrl_vq(ndev, vq)) {
		/* control queue is handled in userspace even with vhost */
		pthread_create(&ndev->io_thread[vq], NULL, virtio_net_ctrl_thread, ndev);

		return 0;
	} else if (ndev->vhost_fd == 0 ) {
		/* userspace I/O: odd queue index = TX, even = RX */
		if (vq & 1)
			pthread_create(&ndev->io_thread[vq], NULL, virtio_net_tx_thread, ndev);
		else
			pthread_create(&ndev->io_thread[vq], NULL, virtio_net_rx_thread, ndev);

		return 0;
	}

	/* vhost path: the ring must be in host byte order */
	if (queue->endian != VIRTIO_ENDIAN_HOST)
		die_perror("VHOST requires VIRTIO_ENDIAN_HOST");

	state.num = queue->vring.num;
	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_NUM, &state);
	if (r < 0)
		die_perror("VHOST_SET_VRING_NUM failed");
	state.num = 0;
	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_BASE, &state);
	if (r < 0)
		die_perror("VHOST_SET_VRING_BASE failed");

	addr = (struct vhost_vring_addr) {
		.index = vq,
		.desc_user_addr = (u64)(unsigned long)queue->vring.desc,
		.avail_user_addr = (u64)(unsigned long)queue->vring.avail,
		.used_user_addr = (u64)(unsigned long)queue->vring.used,
	};

	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_ADDR, &addr);
	if (r < 0)
		die_perror("VHOST_SET_VRING_ADDR failed");

	return 0;
}
+
/*
 * Wire the queue's interrupt (gsi) to an irqfd so the vhost backend can
 * inject it directly, then point vhost at the tap fd as its backend.
 * No-op unless vhost is in use.
 */
static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi)
{
	struct net_dev *ndev = dev;
	struct kvm_irqfd irq;
	struct vhost_vring_file file;
	int r;

	if (ndev->vhost_fd == 0)
		return;

	/* NOTE(review): eventfd() result is unchecked; on failure fd is -1
	 * and the KVM_IRQFD ioctl below dies with a less obvious message. */
	irq = (struct kvm_irqfd) {
		.gsi = gsi,
		.fd = eventfd(0, 0),
	};
	file = (struct vhost_vring_file) {
		.index = vq,
		.fd = irq.fd,
	};

	r = ioctl(kvm->vm_fd, KVM_IRQFD, &irq);
	if (r < 0)
		die_perror("KVM_IRQFD failed");

	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_CALL, &file);
	if (r < 0)
		die_perror("VHOST_SET_VRING_CALL failed");
	file.fd = ndev->tap_fd;
	r = ioctl(ndev->vhost_fd, VHOST_NET_SET_BACKEND, &file);
	if (r != 0)
		die("VHOST_NET_SET_BACKEND failed %d", errno);

}
+
/*
 * Give vhost the eventfd the guest kicks for @vq.  Skipped for the
 * control queue, which stays in userspace, and when vhost is not used.
 */
static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd)
{
	struct net_dev *ndev = dev;
	struct vhost_vring_file file = {
		.index = vq,
		.fd = efd,
	};
	int r;

	if (ndev->vhost_fd == 0 || is_ctrl_vq(ndev, vq))
		return;

	r = ioctl(ndev->vhost_fd, VHOST_SET_VRING_KICK, &file);
	if (r < 0)
		die_perror("VHOST_SET_VRING_KICK failed");
}
+
/* Transport kick: forward the queue index to the generic net callback. */
static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
{
	struct net_dev *ndev = dev;

	virtio_net_handle_callback(kvm, ndev, vq);

	return 0;
}
+
/* Page frame number the guest programmed for queue @vq. */
static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
{
	struct net_dev *ndev = dev;

	return ndev->vqs[vq].pfn;
}
+
/* All net queues currently have the same fixed size. */
static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
{
	/* FIXME: dynamic */
	return VIRTIO_NET_QUEUE_SIZE;
}
+
/* Queue resizing is not supported; echo the requested size back. */
static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
{
	/* FIXME: dynamic */
	return size;
}
+
/* Dispatch table handed to the generic virtio transport layer. */
static struct virtio_ops net_dev_virtio_ops = (struct virtio_ops) {
	.get_config		= get_config,
	.get_host_features	= get_host_features,
	.set_guest_features	= set_guest_features,
	.init_vq		= init_vq,
	.get_pfn_vq		= get_pfn_vq,
	.get_size_vq		= get_size_vq,
	.set_size_vq		= set_size_vq,
	.notify_vq		= notify_vq,
	.notify_vq_gsi		= notify_vq_gsi,
	.notify_vq_eventfd	= notify_vq_eventfd,
};
+
+static void virtio_net__vhost_init(struct kvm *kvm, struct net_dev *ndev)
+{
+ struct vhost_memory *mem;
+ int r;
+
+ ndev->vhost_fd = open("/dev/vhost-net", O_RDWR);
+ if (ndev->vhost_fd < 0)
+ die_perror("Failed openning vhost-net device");
+
+ mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region));
+ if (mem == NULL)
+ die("Failed allocating memory for vhost memory map");
+
+ mem->nregions = 1;
+ mem->regions[0] = (struct vhost_memory_region) {
+ .guest_phys_addr = 0,
+ .memory_size = kvm->ram_size,
+ .userspace_addr = (unsigned long)kvm->ram_start,
+ };
+
+ r = ioctl(ndev->vhost_fd, VHOST_SET_OWNER);
+ if (r != 0)
+ die_perror("VHOST_SET_OWNER failed");
+
+ r = ioctl(ndev->vhost_fd, VHOST_SET_MEM_TABLE, mem);
+ if (r != 0)
+ die_perror("VHOST_SET_MEM_TABLE failed");
+
+ ndev->vdev.use_vhost = true;
+
+ free(mem);
+}
+
/*
 * Parse a colon-separated MAC string ("aa:bb:cc:dd:ee:ff") into mac[0..5].
 * Fields beyond the last successfully parsed one are left untouched.
 */
static inline void str_to_mac(const char *str, char *mac)
{
	/* %hhx requires an unsigned char destination (C99 fscanf); passing
	 * plain char* as before is a conversion-specifier type mismatch. */
	unsigned char *m = (unsigned char *)mac;

	sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
	       m, m + 1, m + 2, m + 3, m + 4, m + 5);
}
/*
 * Apply one "param=val" pair from the -n/--network option to @p.
 * Returns 0 normally, -1 for mode=none (caller stops adding the device).
 * Unknown parameters or modes are fatal.
 */
static int set_net_param(struct kvm *kvm, struct virtio_net_params *p,
			const char *param, const char *val)
{
	if (strcmp(param, "guest_mac") == 0) {
		str_to_mac(val, p->guest_mac);
	} else if (strcmp(param, "mode") == 0) {
		if (!strncmp(val, "user", 4)) {
			int i;

			/* the uip stack keeps global state, so only one instance */
			for (i = 0; i < kvm->cfg.num_net_devices; i++)
				if (kvm->cfg.net_params[i].mode == NET_MODE_USER)
					die("Only one usermode network device allowed at a time");
			p->mode = NET_MODE_USER;
		} else if (!strncmp(val, "tap", 3)) {
			p->mode = NET_MODE_TAP;
		} else if (!strncmp(val, "none", 4)) {
			kvm->cfg.no_net = 1;
			return -1;
		} else
			die("Unknown network mode %s, please use user, tap or none", kvm->cfg.network);
	} else if (strcmp(param, "script") == 0) {
		/* NOTE(review): strdup() results below are not NULL-checked */
		p->script = strdup(val);
	} else if (strcmp(param, "guest_ip") == 0) {
		p->guest_ip = strdup(val);
	} else if (strcmp(param, "host_ip") == 0) {
		p->host_ip = strdup(val);
	} else if (strcmp(param, "trans") == 0) {
		p->trans = strdup(val);
	} else if (strcmp(param, "tapif") == 0) {
		p->tapif = strdup(val);
	} else if (strcmp(param, "vhost") == 0) {
		p->vhost = atoi(val);
	} else if (strcmp(param, "fd") == 0) {
		p->fd = atoi(val);
	} else if (strcmp(param, "mq") == 0) {
		p->mq = atoi(val);
	} else
		die("Unknown network parameter %s", param);

	return 0;
}
+
/*
 * Option callback for --network.  Parses a comma-separated list of
 * key=value pairs into a virtio_net_params record and appends it to
 * kvm->cfg.net_params.  Defaults are filled in first; the default guest
 * MAC is made unique by adding the current device count to its last byte.
 */
int netdev_parser(const struct option *opt, const char *arg, int unset)
{
	struct virtio_net_params p;
	char *buf = NULL, *cmd = NULL, *cur = NULL;
	bool on_cmd = true;
	struct kvm *kvm = opt->ptr;

	if (arg) {
		buf = strdup(arg);
		if (buf == NULL)
			die("Failed allocating new net buffer");
		cur = strtok(buf, ",=");
	}

	p = (struct virtio_net_params) {
		.guest_ip	= DEFAULT_GUEST_ADDR,
		.host_ip	= DEFAULT_HOST_ADDR,
		.script		= DEFAULT_SCRIPT,
		.mode		= NET_MODE_TAP,
	};

	str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac);
	p.guest_mac[5] += kvm->cfg.num_net_devices;

	/* alternate between key tokens and value tokens */
	while (cur) {
		if (on_cmd) {
			cmd = cur;
		} else {
			/* -1 means mode=none: drop this device entirely */
			if (set_net_param(kvm, &p, cmd, cur) < 0)
				goto done;
		}
		on_cmd = !on_cmd;

		cur = strtok(NULL, ",=");
	};

	kvm->cfg.num_net_devices++;

	kvm->cfg.net_params = realloc(kvm->cfg.net_params, kvm->cfg.num_net_devices * sizeof(*kvm->cfg.net_params));
	if (kvm->cfg.net_params == NULL)
		die("Failed adding new network device");

	kvm->cfg.net_params[kvm->cfg.num_net_devices - 1] = p;

done:
	free(buf);
	return 0;
}
+
+static int virtio_net__init_one(struct virtio_net_params *params)
+{
+ int i, err;
+ struct net_dev *ndev;
+ struct virtio_ops *ops;
+
+ ndev = calloc(1, sizeof(struct net_dev));
+ if (ndev == NULL)
+ return -ENOMEM;
+
+ ops = malloc(sizeof(*ops));
+ if (ops == NULL) {
+ err = -ENOMEM;
+ goto err_free_ndev;
+ }
+
+ list_add_tail(&ndev->list, &ndevs);
+
+ ndev->kvm = params->kvm;
+ ndev->params = params;
+
+ mutex_init(&ndev->mutex);
+ ndev->queue_pairs = max(1, min(VIRTIO_NET_NUM_QUEUES, params->mq));
+ ndev->config.status = VIRTIO_NET_S_LINK_UP;
+ if (ndev->queue_pairs > 1)
+ ndev->config.max_virtqueue_pairs = ndev->queue_pairs;
+
+ for (i = 0 ; i < 6 ; i++) {
+ ndev->config.mac[i] = params->guest_mac[i];
+ ndev->info.guest_mac.addr[i] = params->guest_mac[i];
+ ndev->info.host_mac.addr[i] = params->host_mac[i];
+ }
+
+ ndev->mode = params->mode;
+ if (ndev->mode == NET_MODE_TAP) {
+ ndev->ops = &tap_ops;
+ } else {
+ ndev->info.host_ip = ntohl(inet_addr(params->host_ip));
+ ndev->info.guest_ip = ntohl(inet_addr(params->guest_ip));
+ ndev->info.guest_netmask = ntohl(inet_addr("255.255.255.0"));
+ ndev->info.buf_nr = 20,
+ ndev->ops = &uip_ops;
+ uip_static_init(&ndev->info);
+ }
+
+ *ops = net_dev_virtio_ops;
+ if (params->trans && strcmp(params->trans, "mmio") == 0)
+ virtio_init(params->kvm, ndev, &ndev->vdev, ops, VIRTIO_MMIO,
+ PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET);
+ else
+ virtio_init(params->kvm, ndev, &ndev->vdev, ops, VIRTIO_PCI,
+ PCI_DEVICE_ID_VIRTIO_NET, VIRTIO_ID_NET, PCI_CLASS_NET);
+
+ if (params->vhost)
+ virtio_net__vhost_init(params->kvm, ndev);
+
+ if (compat_id == -1)
+ compat_id = virtio_compat_add_message("virtio-net", "CONFIG_VIRTIO_NET");
+
+ return 0;
+
+err_free_ndev:
+ free(ndev);
+ return err;
+}
+
/*
 * Create every configured network device.  When none were configured and
 * networking was not explicitly disabled, fall back to a single
 * user-mode device built from the top-level --guest-ip/--host-ip options.
 */
int virtio_net__init(struct kvm *kvm)
{
	int i;

	for (i = 0; i < kvm->cfg.num_net_devices; i++) {
		kvm->cfg.net_params[i].kvm = kvm;
		virtio_net__init_one(&kvm->cfg.net_params[i]);
	}

	if (kvm->cfg.num_net_devices == 0 && kvm->cfg.no_net == 0) {
		/* static: virtio_net__init_one() stores a pointer to this
		 * (ndev->params), so it must outlive this function */
		static struct virtio_net_params net_params;

		net_params = (struct virtio_net_params) {
			.guest_ip	= kvm->cfg.guest_ip,
			.host_ip	= kvm->cfg.host_ip,
			.kvm		= kvm,
			.script		= kvm->cfg.script,
			.mode		= NET_MODE_USER,
		};
		str_to_mac(kvm->cfg.guest_mac, net_params.guest_mac);
		str_to_mac(kvm->cfg.host_mac, net_params.host_mac);

		virtio_net__init_one(&net_params);
	}

	return 0;
}
virtio_dev_init(virtio_net__init);
+
/* Teardown hook; no per-device cleanup is performed. */
int virtio_net__exit(struct kvm *kvm)
{
	return 0;
}
virtio_dev_exit(virtio_net__exit);
diff --git a/tools/kvm/virtio/pci.c b/tools/kvm/virtio/pci.c
new file mode 100644
index 0000000..7556239
--- /dev/null
+++ b/tools/kvm/virtio/pci.c
@@ -0,0 +1,451 @@
+#include "kvm/virtio-pci.h"
+
+#include "kvm/ioport.h"
+#include "kvm/kvm.h"
+#include "kvm/kvm-cpu.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/irq.h"
+#include "kvm/virtio.h"
+#include "kvm/ioeventfd.h"
+
+#include <sys/ioctl.h>
+#include <linux/virtio_pci.h>
+#include <linux/byteorder.h>
+#include <string.h>
+
+static void virtio_pci__ioevent_callback(struct kvm *kvm, void *param)
+{
+ struct virtio_pci_ioevent_param *ioeventfd = param;
+ struct virtio_pci *vpci = ioeventfd->vdev->virtio;
+
+ ioeventfd->vdev->ops->notify_vq(kvm, vpci->dev, ioeventfd->vq);
+}
+
/*
 * Register ioeventfds for queue @vq on both the ioport and the MMIO
 * notify registers, and hand the fds to the device (so vhost can poll
 * them in-kernel; otherwise kvmtool polls them in userspace).
 * Returns 0 on success or a negative error from ioeventfd__add_event().
 */
static int virtio_pci__init_ioeventfd(struct kvm *kvm, struct virtio_device *vdev, u32 vq)
{
	struct ioevent ioevent;
	struct virtio_pci *vpci = vdev->virtio;
	int i, r, flags = IOEVENTFD_FLAG_PIO;
	int fds[2];

	vpci->ioeventfds[vq] = (struct virtio_pci_ioevent_param) {
		.vdev		= vdev,
		.vq		= vq,
	};

	ioevent = (struct ioevent) {
		.fn		= virtio_pci__ioevent_callback,
		.fn_ptr		= &vpci->ioeventfds[vq],
		.datamatch	= vq,
		.fn_kvm		= kvm,
	};

	/*
	 * Vhost will poll the eventfd in host kernel side, otherwise we
	 * need to poll in userspace.
	 */
	if (!vdev->use_vhost)
		flags |= IOEVENTFD_FLAG_USER_POLL;

	/* ioport */
	/* NOTE(review): eventfd() results are unchecked here and below */
	ioevent.io_addr	= vpci->port_addr + VIRTIO_PCI_QUEUE_NOTIFY;
	ioevent.io_len	= sizeof(u16);
	ioevent.fd	= fds[0] = eventfd(0, 0);
	r = ioeventfd__add_event(&ioevent, flags);
	if (r)
		return r;

	/* mmio */
	ioevent.io_addr	= vpci->mmio_addr + VIRTIO_PCI_QUEUE_NOTIFY;
	ioevent.io_len	= sizeof(u32);
	ioevent.fd	= fds[1] = eventfd(0, 0);
	r = ioeventfd__add_event(&ioevent, flags);
	if (r)
		goto free_ioport_evt;

	if (vdev->ops->notify_vq_eventfd)
		for (i = 0; i < 2; ++i)
			vdev->ops->notify_vq_eventfd(kvm, vpci->dev, vq,
						     fds[i]);
	return 0;

free_ioport_evt:
	ioeventfd__del_event(vpci->port_addr + VIRTIO_PCI_QUEUE_NOTIFY, vq);
	return r;
}
+
/* True when the guest has set the MSI-X Enable bit in the capability. */
static inline bool virtio_pci__msix_enabled(struct virtio_pci *vpci)
{
	return vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_ENABLE);
}
+
/*
 * Handle reads beyond the common legacy header: MSI-X vector registers
 * or device-specific config bytes.  Returns false if @offset maps to
 * neither region.  (offset - 20: 20 is presumably the size of the legacy
 * common header without MSI-X fields - confirm against virtio_pci.h.)
 */
static bool virtio_pci__specific_io_in(struct kvm *kvm, struct virtio_device *vdev, u16 port,
					void *data, int size, int offset)
{
	u32 config_offset;
	struct virtio_pci *vpci = vdev->virtio;
	int type = virtio__get_dev_specific_field(offset - 20,
							virtio_pci__msix_enabled(vpci),
							&config_offset);
	if (type == VIRTIO_PCI_O_MSIX) {
		switch (offset) {
		case VIRTIO_MSI_CONFIG_VECTOR:
			ioport__write16(data, vpci->config_vector);
			break;
		case VIRTIO_MSI_QUEUE_VECTOR:
			ioport__write16(data, vpci->vq_vector[vpci->queue_selector]);
			break;
		};

		return true;
	} else if (type == VIRTIO_PCI_O_CONFIG) {
		u8 cfg;

		/* single byte of the device's config space */
		cfg = vdev->ops->get_config(kvm, vpci->dev)[config_offset];
		ioport__write8(data, cfg);
		return true;
	}

	return false;
}
+
/*
 * Legacy virtio-pci ioport read handler.  @offset selects a register in
 * the common header; anything else is MSI-X/device-specific and is
 * delegated to virtio_pci__specific_io_in().
 */
static bool virtio_pci__io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
{
	unsigned long offset;
	bool ret = true;
	struct virtio_device *vdev;
	struct virtio_pci *vpci;
	struct kvm *kvm;
	u32 val;

	kvm = vcpu->kvm;
	vdev = ioport->priv;
	vpci = vdev->virtio;
	offset = port - vpci->port_addr;

	switch (offset) {
	case VIRTIO_PCI_HOST_FEATURES:
		val = vdev->ops->get_host_features(kvm, vpci->dev);
		ioport__write32(data, val);
		break;
	case VIRTIO_PCI_QUEUE_PFN:
		val = vdev->ops->get_pfn_vq(kvm, vpci->dev, vpci->queue_selector);
		ioport__write32(data, val);
		break;
	case VIRTIO_PCI_QUEUE_NUM:
		val = vdev->ops->get_size_vq(kvm, vpci->dev, vpci->queue_selector);
		ioport__write16(data, val);
		break;
	case VIRTIO_PCI_STATUS:
		ioport__write8(data, vpci->status);
		break;
	case VIRTIO_PCI_ISR:
		/* ISR is read-to-clear: report it, lower INTx, then reset it */
		ioport__write8(data, vpci->isr);
		kvm__irq_line(kvm, vpci->pci_hdr.irq_line, VIRTIO_IRQ_LOW);
		vpci->isr = VIRTIO_IRQ_LOW;
		break;
	default:
		ret = virtio_pci__specific_io_in(kvm, vdev, port, data, size, offset);
		break;
	};

	return ret;
}
+
/*
 * Handle writes beyond the common legacy header: programming MSI-X
 * vectors (which allocates a GSI routing entry for the chosen vector)
 * or writing device-specific config bytes.  Returns false when @offset
 * maps to neither region.
 */
static bool virtio_pci__specific_io_out(struct kvm *kvm, struct virtio_device *vdev, u16 port,
					void *data, int size, int offset)
{
	struct virtio_pci *vpci = vdev->virtio;
	u32 config_offset, gsi, vec;
	int type = virtio__get_dev_specific_field(offset - 20, virtio_pci__msix_enabled(vpci),
							&config_offset);
	if (type == VIRTIO_PCI_O_MSIX) {
		switch (offset) {
		case VIRTIO_MSI_CONFIG_VECTOR:
			vec = vpci->config_vector = ioport__read16(data);
			if (vec == VIRTIO_MSI_NO_VECTOR)
				break;

			/* map the vector's MSI message onto a GSI for injection */
			gsi = irq__add_msix_route(kvm, &vpci->msix_table[vec].msg);

			vpci->config_gsi = gsi;
			break;
		case VIRTIO_MSI_QUEUE_VECTOR:
			vec = vpci->vq_vector[vpci->queue_selector] = ioport__read16(data);

			if (vec == VIRTIO_MSI_NO_VECTOR)
				break;

			gsi = irq__add_msix_route(kvm, &vpci->msix_table[vec].msg);
			vpci->gsis[vpci->queue_selector] = gsi;
			/* let the device (e.g. vhost) wire the GSI directly */
			if (vdev->ops->notify_vq_gsi)
				vdev->ops->notify_vq_gsi(kvm, vpci->dev,
							vpci->queue_selector, gsi);
			break;
		};

		return true;
	} else if (type == VIRTIO_PCI_O_CONFIG) {
		vdev->ops->get_config(kvm, vpci->dev)[config_offset] = *(u8 *)data;

		return true;
	}

	return false;
}
+
/*
 * Legacy virtio-pci ioport write handler for the common header;
 * everything else goes through virtio_pci__specific_io_out().
 */
static bool virtio_pci__io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
{
	unsigned long offset;
	bool ret = true;
	struct virtio_device *vdev;
	struct virtio_pci *vpci;
	struct kvm *kvm;
	u32 val;

	kvm = vcpu->kvm;
	vdev = ioport->priv;
	vpci = vdev->virtio;
	offset = port - vpci->port_addr;

	switch (offset) {
	case VIRTIO_PCI_GUEST_FEATURES:
		val = ioport__read32(data);
		vdev->ops->set_guest_features(kvm, vpci->dev, val);
		break;
	case VIRTIO_PCI_QUEUE_PFN:
		/* guest placed the ring: set up kick eventfds, then the vq */
		val = ioport__read32(data);
		virtio_pci__init_ioeventfd(kvm, vdev, vpci->queue_selector);
		vdev->ops->init_vq(kvm, vpci->dev, vpci->queue_selector,
				   1 << VIRTIO_PCI_QUEUE_ADDR_SHIFT,
				   VIRTIO_PCI_VRING_ALIGN, val);
		break;
	case VIRTIO_PCI_QUEUE_SEL:
		vpci->queue_selector = ioport__read16(data);
		break;
	case VIRTIO_PCI_QUEUE_NOTIFY:
		val = ioport__read16(data);
		vdev->ops->notify_vq(kvm, vpci->dev, val);
		break;
	case VIRTIO_PCI_STATUS:
		vpci->status = ioport__read8(data);
		if (!vpci->status) /* Sample endianness on reset */
			vdev->endian = kvm_cpu__get_endianness(vcpu);
		if (vdev->ops->notify_status)
			vdev->ops->notify_status(kvm, vpci->dev, vpci->status);
		break;
	default:
		ret = virtio_pci__specific_io_out(kvm, vdev, port, data, size, offset);
		break;
	};

	return ret;
}
+
/* ioport handlers registered for the device's legacy I/O BAR. */
static struct ioport_operations virtio_pci__io_ops = {
	.io_in	= virtio_pci__io_in,
	.io_out	= virtio_pci__io_out,
};
+
+static void virtio_pci__msix_mmio_callback(struct kvm_cpu *vcpu,
+ u64 addr, u8 *data, u32 len,
+ u8 is_write, void *ptr)
+{
+ struct virtio_pci *vpci = ptr;
+ void *table;
+ u32 offset;
+
+ if (addr > vpci->msix_io_block + PCI_IO_SIZE) {
+ table = &vpci->msix_pba;
+ offset = vpci->msix_io_block + PCI_IO_SIZE;
+ } else {
+ table = &vpci->msix_table;
+ offset = vpci->msix_io_block;
+ }
+
+ if (is_write)
+ memcpy(table + addr - offset, data, len);
+ else
+ memcpy(data, table + addr - offset, len);
+}
+
/* Inject vector @vec directly via KVM_SIGNAL_MSI (ioctl result ignored). */
static void virtio_pci__signal_msi(struct kvm *kvm, struct virtio_pci *vpci, int vec)
{
	struct kvm_msi msi = {
		.address_lo = vpci->msix_table[vec].msg.address_lo,
		.address_hi = vpci->msix_table[vec].msg.address_hi,
		.data = vpci->msix_table[vec].msg.data,
	};

	ioctl(kvm->vm_fd, KVM_SIGNAL_MSI, &msi);
}
+
/*
 * Raise the interrupt for queue @vq: via MSI-X when enabled and a vector
 * is assigned (deferring to the PBA if the vector is masked), otherwise
 * via legacy INTx with the ISR bit set.
 */
int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_device *vdev, u32 vq)
{
	struct virtio_pci *vpci = vdev->virtio;
	int tbl = vpci->vq_vector[vq];

	if (virtio_pci__msix_enabled(vpci) && tbl != VIRTIO_MSI_NO_VECTOR) {
		/* masked (globally or per-entry): latch in the PBA instead */
		if (vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_MASKALL) ||
		    vpci->msix_table[tbl].ctrl & cpu_to_le16(PCI_MSIX_ENTRY_CTRL_MASKBIT)) {

			vpci->msix_pba |= 1 << tbl;
			return 0;
		}

		if (vpci->features & VIRTIO_PCI_F_SIGNAL_MSI)
			virtio_pci__signal_msi(kvm, vpci, vpci->vq_vector[vq]);
		else
			kvm__irq_trigger(kvm, vpci->gsis[vq]);
	} else {
		vpci->isr = VIRTIO_IRQ_HIGH;
		kvm__irq_trigger(kvm, vpci->pci_hdr.irq_line);
	}
	return 0;
}
+
/*
 * Raise the configuration-change interrupt; same MSI-X/PBA/INTx logic
 * as virtio_pci__signal_vq() but using the config vector and GSI.
 */
int virtio_pci__signal_config(struct kvm *kvm, struct virtio_device *vdev)
{
	struct virtio_pci *vpci = vdev->virtio;
	int tbl = vpci->config_vector;

	if (virtio_pci__msix_enabled(vpci) && tbl != VIRTIO_MSI_NO_VECTOR) {
		/* masked: latch the pending bit instead of injecting */
		if (vpci->pci_hdr.msix.ctrl & cpu_to_le16(PCI_MSIX_FLAGS_MASKALL) ||
		    vpci->msix_table[tbl].ctrl & cpu_to_le16(PCI_MSIX_ENTRY_CTRL_MASKBIT)) {

			vpci->msix_pba |= 1 << tbl;
			return 0;
		}

		if (vpci->features & VIRTIO_PCI_F_SIGNAL_MSI)
			virtio_pci__signal_msi(kvm, vpci, tbl);
		else
			kvm__irq_trigger(kvm, vpci->config_gsi);
	} else {
		vpci->isr = VIRTIO_PCI_ISR_CONFIG;
		kvm__irq_trigger(kvm, vpci->pci_hdr.irq_line);
	}

	return 0;
}
+
/*
 * MMIO mirror of the legacy ioport BAR: translate the MMIO access into
 * an equivalent port I/O emulation so both BARs share one code path.
 */
static void virtio_pci__io_mmio_callback(struct kvm_cpu *vcpu,
					 u64 addr, u8 *data, u32 len,
					 u8 is_write, void *ptr)
{
	struct virtio_pci *vpci = ptr;
	int direction = is_write ? KVM_EXIT_IO_OUT : KVM_EXIT_IO_IN;
	u16 port = vpci->port_addr + (addr & (IOPORT_SIZE - 1));

	kvm__emulate_io(vcpu, port, data, direction, len, 1);
}
+
/*
 * Register a virtio device on the PCI bus: allocate an ioport range
 * (BAR1), an MMIO mirror of it (BAR0) and an MSI-X table/PBA region
 * (BAR2), then fill in and register the PCI config header.
 * Returns 0 on success or a negative error, unwinding registrations.
 */
int virtio_pci__init(struct kvm *kvm, void *dev, struct virtio_device *vdev,
		     int device_id, int subsys_id, int class)
{
	struct virtio_pci *vpci = vdev->virtio;
	int r;

	vpci->kvm = kvm;
	vpci->dev = dev;

	r = ioport__register(kvm, IOPORT_EMPTY, &virtio_pci__io_ops, IOPORT_SIZE, vdev);
	if (r < 0)
		return r;
	vpci->port_addr = (u16)r;

	vpci->mmio_addr = pci_get_io_space_block(IOPORT_SIZE);
	r = kvm__register_mmio(kvm, vpci->mmio_addr, IOPORT_SIZE, false,
			       virtio_pci__io_mmio_callback, vpci);
	if (r < 0)
		goto free_ioport;

	vpci->msix_io_block = pci_get_io_space_block(PCI_IO_SIZE * 2);
	r = kvm__register_mmio(kvm, vpci->msix_io_block, PCI_IO_SIZE * 2, false,
			       virtio_pci__msix_mmio_callback, vpci);
	if (r < 0)
		goto free_mmio;

	vpci->pci_hdr = (struct pci_device_header) {
		.vendor_id		= cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET),
		.device_id		= cpu_to_le16(device_id),
		.command		= PCI_COMMAND_IO | PCI_COMMAND_MEMORY,
		.header_type		= PCI_HEADER_TYPE_NORMAL,
		.revision_id		= 0,
		.class[0]		= class & 0xff,
		.class[1]		= (class >> 8) & 0xff,
		.class[2]		= (class >> 16) & 0xff,
		.subsys_vendor_id	= cpu_to_le16(PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET),
		.subsys_id		= cpu_to_le16(subsys_id),
		.bar[0]			= cpu_to_le32(vpci->mmio_addr
						| PCI_BASE_ADDRESS_SPACE_MEMORY),
		.bar[1]			= cpu_to_le32(vpci->port_addr
						| PCI_BASE_ADDRESS_SPACE_IO),
		.bar[2]			= cpu_to_le32(vpci->msix_io_block
						| PCI_BASE_ADDRESS_SPACE_MEMORY),
		.status			= cpu_to_le16(PCI_STATUS_CAP_LIST),
		/* capability list points at the msix field inside this header */
		.capabilities		= (void *)&vpci->pci_hdr.msix - (void *)&vpci->pci_hdr,
		.bar_size[0]		= cpu_to_le32(IOPORT_SIZE),
		.bar_size[1]		= cpu_to_le32(IOPORT_SIZE),
		.bar_size[2]		= cpu_to_le32(PCI_IO_SIZE*2),
	};

	vpci->dev_hdr = (struct device_header) {
		.bus_type		= DEVICE_BUS_PCI,
		.data			= &vpci->pci_hdr,
	};

	vpci->pci_hdr.msix.cap = PCI_CAP_ID_MSIX;
	vpci->pci_hdr.msix.next = 0;
	/*
	 * We at most have VIRTIO_PCI_MAX_VQ entries for virt queue,
	 * VIRTIO_PCI_MAX_CONFIG entries for config.
	 *
	 * To quote the PCI spec:
	 *
	 * System software reads this field to determine the
	 * MSI-X Table Size N, which is encoded as N-1.
	 * For example, a returned value of "00000000011"
	 * indicates a table size of 4.
	 */
	vpci->pci_hdr.msix.ctrl = cpu_to_le16(VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG - 1);

	/* Both table and PBA are mapped to the same BAR (2) */
	/* low 3 bits of these offsets encode the BAR indicator (BIR = 2) */
	vpci->pci_hdr.msix.table_offset = cpu_to_le32(2);
	vpci->pci_hdr.msix.pba_offset = cpu_to_le32(2 | PCI_IO_SIZE);
	vpci->config_vector = 0;

	if (kvm__supports_extension(kvm, KVM_CAP_SIGNAL_MSI))
		vpci->features |= VIRTIO_PCI_F_SIGNAL_MSI;

	r = device__register(&vpci->dev_hdr);
	if (r < 0)
		goto free_msix_mmio;

	return 0;

free_msix_mmio:
	kvm__deregister_mmio(kvm, vpci->msix_io_block);
free_mmio:
	kvm__deregister_mmio(kvm, vpci->mmio_addr);
free_ioport:
	ioport__unregister(kvm, vpci->port_addr);
	return r;
}
+
/* Unregister all regions and drop every per-queue ioeventfd. */
int virtio_pci__exit(struct kvm *kvm, struct virtio_device *vdev)
{
	struct virtio_pci *vpci = vdev->virtio;
	int i;

	kvm__deregister_mmio(kvm, vpci->mmio_addr);
	kvm__deregister_mmio(kvm, vpci->msix_io_block);
	ioport__unregister(kvm, vpci->port_addr);

	/* remove both the ioport and the MMIO notify eventfds for each vq */
	for (i = 0; i < VIRTIO_PCI_MAX_VQ; i++) {
		ioeventfd__del_event(vpci->port_addr + VIRTIO_PCI_QUEUE_NOTIFY, i);
		ioeventfd__del_event(vpci->mmio_addr + VIRTIO_PCI_QUEUE_NOTIFY, i);
	}

	return 0;
}
diff --git a/tools/kvm/virtio/rng.c b/tools/kvm/virtio/rng.c
new file mode 100644
index 0000000..8031368
--- /dev/null
+++ b/tools/kvm/virtio/rng.c
@@ -0,0 +1,204 @@
+#include "kvm/virtio-rng.h"
+
+#include "kvm/virtio-pci-dev.h"
+
+#include "kvm/virtio.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+#include "kvm/threadpool.h"
+#include "kvm/guest_compat.h"
+
+#include <linux/virtio_ring.h>
+#include <linux/virtio_rng.h>
+
+#include <linux/list.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <pthread.h>
+#include <linux/kernel.h>
+
#define NUM_VIRT_QUEUES		1
#define VIRTIO_RNG_QUEUE_SIZE	128

/* One thread-pool job per queue, feeding entropy into that queue. */
struct rng_dev_job {
	struct virt_queue	*vq;	/* queue this job drains */
	struct rng_dev		*rdev;	/* owning device */
	struct thread_pool__job	job_id;
};

struct rng_dev {
	struct list_head	list;	/* linked into rdevs */
	struct virtio_device	vdev;

	int			fd;	/* /dev/random, opened non-blocking */

	/* virtio queue */
	struct virt_queue	vqs[NUM_VIRT_QUEUES];
	struct rng_dev_job	jobs[NUM_VIRT_QUEUES];
};

static LIST_HEAD(rdevs);
static int compat_id = -1;	/* "driver missing" compat message handle */
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+ /* Unused */
+ return 0;
+}
+
/* virtio-rng offers no feature bits. */
static u32 get_host_features(struct kvm *kvm, void *dev)
{
	/* Unused */
	return 0;
}
+
/* Nothing to record: no features were offered. */
static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
{
	/* Unused */
}
+
+static bool virtio_rng_do_io_request(struct kvm *kvm, struct rng_dev *rdev, struct virt_queue *queue)
+{
+ struct iovec iov[VIRTIO_RNG_QUEUE_SIZE];
+ ssize_t len = 0;
+ u16 out, in, head;
+
+ head = virt_queue__get_iov(queue, iov, &out, &in, kvm);
+ len = readv(rdev->fd, iov, in);
+ if (len < 0 && errno == EAGAIN)
+ len = 0;
+
+ virt_queue__set_used_elem(queue, head, len);
+
+ return true;
+}
+
/* Thread-pool worker: drain all pending requests, then signal the queue. */
static void virtio_rng_do_io(struct kvm *kvm, void *param)
{
	struct rng_dev_job *job = param;
	struct virt_queue *vq = job->vq;
	struct rng_dev *rdev = job->rdev;

	while (virt_queue__available(vq))
		virtio_rng_do_io_request(kvm, rdev, vq);

	/* vq - rdev->vqs: recover the queue index from the element address */
	rdev->vdev.ops->signal_vq(kvm, &rdev->vdev, vq - rdev->vqs);
}
+
/*
 * Set up virtqueue @vq at guest page frame @pfn and prepare the
 * thread-pool job that will service it.
 */
static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
		   u32 pfn)
{
	struct rng_dev *rdev = dev;
	struct virt_queue *queue;
	struct rng_dev_job *job;
	void *p;

	compat__remove_message(compat_id);

	queue		= &rdev->vqs[vq];
	queue->pfn	= pfn;
	p		= virtio_get_vq(kvm, queue->pfn, page_size);

	job = &rdev->jobs[vq];

	vring_init(&queue->vring, VIRTIO_RNG_QUEUE_SIZE, p, align);

	*job = (struct rng_dev_job) {
		.vq	= queue,
		.rdev	= rdev,
	};

	thread_pool__init_job(&job->job_id, kvm, virtio_rng_do_io, job);

	return 0;
}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+ struct rng_dev *rdev = dev;
+
+ thread_pool__do_job(&rdev->jobs[vq].job_id);
+
+ return 0;
+}
+
/* Page frame number the guest programmed for queue @vq. */
static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
{
	struct rng_dev *rdev = dev;

	return rdev->vqs[vq].pfn;
}
+
/* Fixed queue size for all rng queues. */
static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
{
	return VIRTIO_RNG_QUEUE_SIZE;
}
+
/* Queue resizing is not supported; echo the requested size back. */
static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
{
	/* FIXME: dynamic */
	return size;
}
+
/* Dispatch table handed to the generic virtio transport layer. */
static struct virtio_ops rng_dev_virtio_ops = (struct virtio_ops) {
	.get_config		= get_config,
	.get_host_features	= get_host_features,
	.set_guest_features	= set_guest_features,
	.init_vq		= init_vq,
	.notify_vq		= notify_vq,
	.get_pfn_vq		= get_pfn_vq,
	.get_size_vq		= get_size_vq,
	.set_size_vq		= set_size_vq,
};
+
+int virtio_rng__init(struct kvm *kvm)
+{
+ struct rng_dev *rdev;
+ int r;
+
+ if (!kvm->cfg.virtio_rng)
+ return 0;
+
+ rdev = malloc(sizeof(*rdev));
+ if (rdev == NULL)
+ return -ENOMEM;
+
+ rdev->fd = open("/dev/random", O_RDONLY | O_NONBLOCK);
+ if (rdev->fd < 0) {
+ r = rdev->fd;
+ goto cleanup;
+ }
+
+ r = virtio_init(kvm, rdev, &rdev->vdev, &rng_dev_virtio_ops,
+ VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_RNG,
+ VIRTIO_ID_RNG, PCI_CLASS_RNG);
+ if (r < 0)
+ goto cleanup;
+
+ list_add_tail(&rdev->list, &rdevs);
+
+ if (compat_id == -1)
+ compat_id = virtio_compat_add_message("virtio-rng", "CONFIG_HW_RANDOM_VIRTIO");
+ return 0;
+cleanup:
+ close(rdev->fd);
+ free(rdev);
+
+ return r;
+}
+virtio_dev_init(virtio_rng__init);
+
/* Tear down every registered rng device and free its state. */
int virtio_rng__exit(struct kvm *kvm)
{
	struct rng_dev *rdev, *tmp;

	/* _safe variant: entries are removed while iterating */
	list_for_each_entry_safe(rdev, tmp, &rdevs, list) {
		list_del(&rdev->list);
		rdev->vdev.ops->exit(kvm, &rdev->vdev);
		free(rdev);
	}

	return 0;
}
virtio_dev_exit(virtio_rng__exit);
diff --git a/tools/kvm/virtio/scsi.c b/tools/kvm/virtio/scsi.c
new file mode 100644
index 0000000..be254f3
--- /dev/null
+++ b/tools/kvm/virtio/scsi.c
@@ -0,0 +1,311 @@
+#include "kvm/virtio-scsi.h"
+#include "kvm/virtio-pci-dev.h"
+#include "kvm/disk-image.h"
+#include "kvm/kvm.h"
+#include "kvm/pci.h"
+#include "kvm/ioeventfd.h"
+#include "kvm/guest_compat.h"
+#include "kvm/virtio-pci.h"
+#include "kvm/virtio.h"
+
+#include <linux/kernel.h>
+#include <linux/virtio_scsi.h>
+#include <linux/vhost.h>
+
+#define VIRTIO_SCSI_QUEUE_SIZE 128
+#define NUM_VIRT_QUEUES 3
+
+static LIST_HEAD(sdevs);
+static int compat_id = -1;
+
+struct scsi_dev {
+ struct virt_queue vqs[NUM_VIRT_QUEUES];
+ struct virtio_scsi_config config;
+ struct vhost_scsi_target target;
+ u32 features;
+ int vhost_fd;
+ struct virtio_device vdev;
+ struct list_head list;
+ struct kvm *kvm;
+};
+
+static u8 *get_config(struct kvm *kvm, void *dev)
+{
+ struct scsi_dev *sdev = dev;
+
+ return ((u8 *)(&sdev->config));
+}
+
+static u32 get_host_features(struct kvm *kvm, void *dev)
+{
+ return 1UL << VIRTIO_RING_F_EVENT_IDX |
+ 1UL << VIRTIO_RING_F_INDIRECT_DESC;
+}
+
+static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
+{
+ struct scsi_dev *sdev = dev;
+
+ sdev->features = features;
+}
+
+static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 page_size, u32 align,
+ u32 pfn)
+{
+ struct vhost_vring_state state = { .index = vq };
+ struct vhost_vring_addr addr;
+ struct scsi_dev *sdev = dev;
+ struct virt_queue *queue;
+ void *p;
+ int r;
+
+ compat__remove_message(compat_id);
+
+ queue = &sdev->vqs[vq];
+ queue->pfn = pfn;
+ p = virtio_get_vq(kvm, queue->pfn, page_size);
+
+ vring_init(&queue->vring, VIRTIO_SCSI_QUEUE_SIZE, p, align);
+
+ if (sdev->vhost_fd == 0)
+ return 0;
+
+ state.num = queue->vring.num;
+ r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_NUM, &state);
+ if (r < 0)
+ die_perror("VHOST_SET_VRING_NUM failed");
+ state.num = 0;
+ r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_BASE, &state);
+ if (r < 0)
+ die_perror("VHOST_SET_VRING_BASE failed");
+
+ addr = (struct vhost_vring_addr) {
+ .index = vq,
+ .desc_user_addr = (u64)(unsigned long)queue->vring.desc,
+ .avail_user_addr = (u64)(unsigned long)queue->vring.avail,
+ .used_user_addr = (u64)(unsigned long)queue->vring.used,
+ };
+
+ r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_ADDR, &addr);
+ if (r < 0)
+ die_perror("VHOST_SET_VRING_ADDR failed");
+
+ return 0;
+}
+
+static void notify_vq_gsi(struct kvm *kvm, void *dev, u32 vq, u32 gsi)
+{
+ struct vhost_vring_file file;
+ struct scsi_dev *sdev = dev;
+ struct kvm_irqfd irq;
+ int r;
+
+ if (sdev->vhost_fd == 0)
+ return;
+
+ irq = (struct kvm_irqfd) {
+ .gsi = gsi,
+ .fd = eventfd(0, 0),
+ };
+ file = (struct vhost_vring_file) {
+ .index = vq,
+ .fd = irq.fd,
+ };
+
+ r = ioctl(kvm->vm_fd, KVM_IRQFD, &irq);
+ if (r < 0)
+ die_perror("KVM_IRQFD failed");
+
+ r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_CALL, &file);
+ if (r < 0)
+ die_perror("VHOST_SET_VRING_CALL failed");
+
+ if (vq > 0)
+ return;
+
+ r = ioctl(sdev->vhost_fd, VHOST_SCSI_SET_ENDPOINT, &sdev->target);
+ if (r != 0)
+ die("VHOST_SCSI_SET_ENDPOINT failed %d", errno);
+}
+
+static void notify_vq_eventfd(struct kvm *kvm, void *dev, u32 vq, u32 efd)
+{
+ struct scsi_dev *sdev = dev;
+ struct vhost_vring_file file = {
+ .index = vq,
+ .fd = efd,
+ };
+ int r;
+
+ if (sdev->vhost_fd == 0)
+ return;
+
+ r = ioctl(sdev->vhost_fd, VHOST_SET_VRING_KICK, &file);
+ if (r < 0)
+ die_perror("VHOST_SET_VRING_KICK failed");
+}
+
+static int notify_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+ return 0;
+}
+
+static int get_pfn_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+ struct scsi_dev *sdev = dev;
+
+ return sdev->vqs[vq].pfn;
+}
+
+static int get_size_vq(struct kvm *kvm, void *dev, u32 vq)
+{
+ return VIRTIO_SCSI_QUEUE_SIZE;
+}
+
+static int set_size_vq(struct kvm *kvm, void *dev, u32 vq, int size)
+{
+ return size;
+}
+
+static struct virtio_ops scsi_dev_virtio_ops = (struct virtio_ops) {
+ .get_config = get_config,
+ .get_host_features = get_host_features,
+ .set_guest_features = set_guest_features,
+ .init_vq = init_vq,
+ .get_pfn_vq = get_pfn_vq,
+ .get_size_vq = get_size_vq,
+ .set_size_vq = set_size_vq,
+ .notify_vq = notify_vq,
+ .notify_vq_gsi = notify_vq_gsi,
+ .notify_vq_eventfd = notify_vq_eventfd,
+};
+
+static void virtio_scsi_vhost_init(struct kvm *kvm, struct scsi_dev *sdev)
+{
+ struct vhost_memory *mem;
+ u64 features;
+ int r;
+
+ sdev->vhost_fd = open("/dev/vhost-scsi", O_RDWR);
+ if (sdev->vhost_fd < 0)
+ die_perror("Failed openning vhost-scsi device");
+
+ mem = calloc(1, sizeof(*mem) + sizeof(struct vhost_memory_region));
+ if (mem == NULL)
+ die("Failed allocating memory for vhost memory map");
+
+ mem->nregions = 1;
+ mem->regions[0] = (struct vhost_memory_region) {
+ .guest_phys_addr = 0,
+ .memory_size = kvm->ram_size,
+ .userspace_addr = (unsigned long)kvm->ram_start,
+ };
+
+ r = ioctl(sdev->vhost_fd, VHOST_SET_OWNER);
+ if (r != 0)
+ die_perror("VHOST_SET_OWNER failed");
+
+ r = ioctl(sdev->vhost_fd, VHOST_GET_FEATURES, &features);
+ if (r != 0)
+ die_perror("VHOST_GET_FEATURES failed");
+
+ r = ioctl(sdev->vhost_fd, VHOST_SET_FEATURES, &features);
+ if (r != 0)
+ die_perror("VHOST_SET_FEATURES failed");
+ r = ioctl(sdev->vhost_fd, VHOST_SET_MEM_TABLE, mem);
+ if (r != 0)
+ die_perror("VHOST_SET_MEM_TABLE failed");
+
+ sdev->vdev.use_vhost = true;
+
+ free(mem);
+}
+
+
+static int virtio_scsi_init_one(struct kvm *kvm, struct disk_image *disk)
+{
+ struct scsi_dev *sdev;
+
+ if (!disk)
+ return -EINVAL;
+
+ sdev = calloc(1, sizeof(struct scsi_dev));
+ if (sdev == NULL)
+ return -ENOMEM;
+
+ *sdev = (struct scsi_dev) {
+ .config = (struct virtio_scsi_config) {
+ .num_queues = NUM_VIRT_QUEUES - 2,
+ .seg_max = VIRTIO_SCSI_CDB_SIZE - 2,
+ .max_sectors = 65535,
+ .cmd_per_lun = 128,
+ .sense_size = VIRTIO_SCSI_SENSE_SIZE,
+ .cdb_size = VIRTIO_SCSI_CDB_SIZE,
+ .max_channel = 0,
+ .max_target = 0,
+ .max_lun = 16383,
+ .event_info_size = sizeof(struct virtio_scsi_event),
+ },
+ .kvm = kvm,
+ };
+ strncpy((char *)&sdev->target.vhost_wwpn, disk->wwpn, sizeof(sdev->target.vhost_wwpn));
+ sdev->target.vhost_tpgt = strtol(disk->tpgt, NULL, 0);
+
+ virtio_init(kvm, sdev, &sdev->vdev, &scsi_dev_virtio_ops,
+ VIRTIO_DEFAULT_TRANS(kvm), PCI_DEVICE_ID_VIRTIO_SCSI,
+ VIRTIO_ID_SCSI, PCI_CLASS_BLK);
+
+ list_add_tail(&sdev->list, &sdevs);
+
+ virtio_scsi_vhost_init(kvm, sdev);
+
+ if (compat_id == -1)
+ compat_id = virtio_compat_add_message("virtio-scsi", "CONFIG_VIRTIO_SCSI");
+
+ return 0;
+}
+
+static int virtio_scsi_exit_one(struct kvm *kvm, struct scsi_dev *sdev)
+{
+ int r;
+
+ r = ioctl(sdev->vhost_fd, VHOST_SCSI_CLEAR_ENDPOINT, &sdev->target);
+ if (r != 0)
+ die("VHOST_SCSI_CLEAR_ENDPOINT failed %d", errno);
+
+ list_del(&sdev->list);
+ free(sdev);
+
+ return 0;
+}
+
+int virtio_scsi_init(struct kvm *kvm)
+{
+ int i, r = 0;
+
+ for (i = 0; i < kvm->nr_disks; i++) {
+ if (!kvm->disks[i]->wwpn)
+ continue;
+ r = virtio_scsi_init_one(kvm, kvm->disks[i]);
+ if (r < 0)
+ goto cleanup;
+ }
+
+ return 0;
+cleanup:
+ return virtio_scsi_exit(kvm);
+}
+virtio_dev_init(virtio_scsi_init);
+
+int virtio_scsi_exit(struct kvm *kvm)
+{
+ while (!list_empty(&sdevs)) {
+ struct scsi_dev *sdev;
+
+ sdev = list_first_entry(&sdevs, struct scsi_dev, list);
+ virtio_scsi_exit_one(kvm, sdev);
+ }
+
+ return 0;
+}
+virtio_dev_exit(virtio_scsi_exit);
diff --git a/tools/kvm/x86/bios.c b/tools/kvm/x86/bios.c
new file mode 100644
index 0000000..f05cc02
--- /dev/null
+++ b/tools/kvm/x86/bios.c
@@ -0,0 +1,174 @@
+#include "kvm/kvm.h"
+#include "kvm/boot-protocol.h"
+#include "kvm/e820.h"
+#include "kvm/interrupt.h"
+#include "kvm/util.h"
+
+#include <string.h>
+#include <asm/e820.h>
+
+#include "bios/bios-rom.h"
+
+struct irq_handler {
+ unsigned long address;
+ unsigned int irq;
+ void *handler;
+ size_t size;
+};
+
+#define BIOS_IRQ_PA_ADDR(name) (MB_BIOS_BEGIN + BIOS_OFFSET__##name)
+#define BIOS_IRQ_FUNC(name) ((char *)&bios_rom[BIOS_OFFSET__##name])
+#define BIOS_IRQ_SIZE(name) (BIOS_ENTRY_SIZE(BIOS_OFFSET__##name))
+
+#define DEFINE_BIOS_IRQ_HANDLER(_irq, _handler) \
+ { \
+ .irq = _irq, \
+ .address = BIOS_IRQ_PA_ADDR(_handler), \
+ .handler = BIOS_IRQ_FUNC(_handler), \
+ .size = BIOS_IRQ_SIZE(_handler), \
+ }
+
+static struct irq_handler bios_irq_handlers[] = {
+ DEFINE_BIOS_IRQ_HANDLER(0x10, bios_int10),
+ DEFINE_BIOS_IRQ_HANDLER(0x15, bios_int15),
+};
+
+static void setup_irq_handler(struct kvm *kvm, struct irq_handler *handler)
+{
+ struct real_intr_desc intr_desc;
+ void *p;
+
+ p = guest_flat_to_host(kvm, handler->address);
+ memcpy(p, handler->handler, handler->size);
+
+ intr_desc = (struct real_intr_desc) {
+ .segment = REAL_SEGMENT(MB_BIOS_BEGIN),
+ .offset = handler->address - MB_BIOS_BEGIN,
+ };
+
+ DIE_IF((handler->address - MB_BIOS_BEGIN) > 0xffffUL);
+
+ interrupt_table__set(&kvm->arch.interrupt_table, &intr_desc, handler->irq);
+}
+
+/**
+ * e820_setup - setup some simple E820 memory map
+ * @kvm - guest system descriptor
+ */
+static void e820_setup(struct kvm *kvm)
+{
+ struct e820map *e820;
+ struct e820entry *mem_map;
+ unsigned int i = 0;
+
+ e820 = guest_flat_to_host(kvm, E820_MAP_START);
+ mem_map = e820->map;
+
+ mem_map[i++] = (struct e820entry) {
+ .addr = REAL_MODE_IVT_BEGIN,
+ .size = EBDA_START - REAL_MODE_IVT_BEGIN,
+ .type = E820_RAM,
+ };
+ mem_map[i++] = (struct e820entry) {
+ .addr = EBDA_START,
+ .size = VGA_RAM_BEGIN - EBDA_START,
+ .type = E820_RESERVED,
+ };
+ mem_map[i++] = (struct e820entry) {
+ .addr = MB_BIOS_BEGIN,
+ .size = MB_BIOS_END - MB_BIOS_BEGIN,
+ .type = E820_RESERVED,
+ };
+ if (kvm->ram_size < KVM_32BIT_GAP_START) {
+ mem_map[i++] = (struct e820entry) {
+ .addr = BZ_KERNEL_START,
+ .size = kvm->ram_size - BZ_KERNEL_START,
+ .type = E820_RAM,
+ };
+ } else {
+ mem_map[i++] = (struct e820entry) {
+ .addr = BZ_KERNEL_START,
+ .size = KVM_32BIT_GAP_START - BZ_KERNEL_START,
+ .type = E820_RAM,
+ };
+ mem_map[i++] = (struct e820entry) {
+ .addr = KVM_32BIT_MAX_MEM_SIZE,
+ .size = kvm->ram_size - KVM_32BIT_MAX_MEM_SIZE,
+ .type = E820_RAM,
+ };
+ }
+
+ BUG_ON(i > E820_X_MAX);
+
+ e820->nr_map = i;
+}
+
+static void setup_vga_rom(struct kvm *kvm)
+{
+ u16 *mode;
+ void *p;
+
+ p = guest_flat_to_host(kvm, VGA_ROM_OEM_STRING);
+ memset(p, 0, VGA_ROM_OEM_STRING_SIZE);
+ strncpy(p, "KVM VESA", VGA_ROM_OEM_STRING_SIZE);
+
+ mode = guest_flat_to_host(kvm, VGA_ROM_MODES);
+ mode[0] = 0x0112;
+ mode[1] = 0xffff;
+}
+
+/**
+ * setup_bios - inject BIOS into guest memory
+ * @kvm - guest system descriptor
+ */
+void setup_bios(struct kvm *kvm)
+{
+ unsigned long address = MB_BIOS_BEGIN;
+ struct real_intr_desc intr_desc;
+ unsigned int i;
+ void *p;
+
+ /*
+ * before anything else -- clean some known areas
+ * we definitely don't want any trash here
+ */
+ p = guest_flat_to_host(kvm, BDA_START);
+ memset(p, 0, BDA_END - BDA_START);
+
+ p = guest_flat_to_host(kvm, EBDA_START);
+ memset(p, 0, EBDA_END - EBDA_START);
+
+ p = guest_flat_to_host(kvm, MB_BIOS_BEGIN);
+ memset(p, 0, MB_BIOS_END - MB_BIOS_BEGIN);
+
+ p = guest_flat_to_host(kvm, VGA_ROM_BEGIN);
+ memset(p, 0, VGA_ROM_END - VGA_ROM_BEGIN);
+
+ /* just copy the bios rom into the place */
+ p = guest_flat_to_host(kvm, MB_BIOS_BEGIN);
+ memcpy(p, bios_rom, bios_rom_size);
+
+ /* E820 memory map must be present */
+ e820_setup(kvm);
+
+ /* VESA needs own tricks */
+ setup_vga_rom(kvm);
+
+ /*
+ * Setup a *fake* real mode vector table, it has only
+ * one real handler which does just iret
+ */
+ address = BIOS_IRQ_PA_ADDR(bios_intfake);
+ intr_desc = (struct real_intr_desc) {
+ .segment = REAL_SEGMENT(MB_BIOS_BEGIN),
+ .offset = address - MB_BIOS_BEGIN,
+ };
+ interrupt_table__setup(&kvm->arch.interrupt_table, &intr_desc);
+
+ for (i = 0; i < ARRAY_SIZE(bios_irq_handlers); i++)
+ setup_irq_handler(kvm, &bios_irq_handlers[i]);
+
+	/* we're almost done */
+ p = guest_flat_to_host(kvm, 0);
+ interrupt_table__copy(&kvm->arch.interrupt_table, p, REAL_INTR_SIZE);
+}
diff --git a/tools/kvm/x86/bios/.gitignore b/tools/kvm/x86/bios/.gitignore
new file mode 100644
index 0000000..1f0080b
--- /dev/null
+++ b/tools/kvm/x86/bios/.gitignore
@@ -0,0 +1,3 @@
+bios-rom.bin
+bios-rom.bin.elf
+bios-rom.h
diff --git a/tools/kvm/x86/bios/bios-rom.S b/tools/kvm/x86/bios/bios-rom.S
new file mode 100644
index 0000000..3269ce9
--- /dev/null
+++ b/tools/kvm/x86/bios/bios-rom.S
@@ -0,0 +1,12 @@
+#include <kvm/assembly.h>
+
+ .org 0
+#ifdef CONFIG_X86_64
+ .code64
+#else
+ .code32
+#endif
+
+GLOBAL(bios_rom)
+ .incbin "x86/bios/bios.bin"
+END(bios_rom)
diff --git a/tools/kvm/x86/bios/e820.c b/tools/kvm/x86/bios/e820.c
new file mode 100644
index 0000000..a9bca29
--- /dev/null
+++ b/tools/kvm/x86/bios/e820.c
@@ -0,0 +1,72 @@
+#include "kvm/e820.h"
+
+#include "kvm/segment.h"
+#include "kvm/bios.h"
+
+#include <asm/processor-flags.h>
+#include <asm/e820.h>
+
+static inline void set_fs(u16 seg)
+{
+ asm volatile("movw %0,%%fs" : : "rm" (seg));
+}
+
+static inline u8 rdfs8(unsigned long addr)
+{
+ u8 v;
+
+ asm volatile("addr32 movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
+
+ return v;
+}
+
+static inline u32 rdfs32(unsigned long addr)
+{
+ u32 v;
+
+ asm volatile("addr32 movl %%fs:%1,%0" : "=q" (v) : "m" (*(u32 *)addr));
+
+ return v;
+}
+
+bioscall void e820_query_map(struct biosregs *regs)
+{
+ struct e820map *e820;
+ u32 map_size;
+ u16 fs_seg;
+ u32 ndx;
+
+ e820 = (struct e820map *)E820_MAP_START;
+ fs_seg = flat_to_seg16(E820_MAP_START);
+ set_fs(fs_seg);
+
+ ndx = regs->ebx;
+
+ map_size = rdfs32(flat_to_off16((u32)&e820->nr_map, fs_seg));
+
+ if (ndx < map_size) {
+ u32 start;
+ unsigned int i;
+ u8 *p;
+
+ fs_seg = flat_to_seg16(E820_MAP_START);
+ set_fs(fs_seg);
+
+ start = (u32)&e820->map[ndx];
+
+ p = (void *) regs->edi;
+
+ for (i = 0; i < sizeof(struct e820entry); i++)
+ *p++ = rdfs8(flat_to_off16(start + i, fs_seg));
+ }
+
+ regs->eax = SMAP;
+ regs->ecx = sizeof(struct e820entry);
+ regs->ebx = ++ndx;
+
+ /* Clear CF to indicate success. */
+ regs->eflags &= ~X86_EFLAGS_CF;
+
+ if (ndx >= map_size)
+ regs->ebx = 0; /* end of map */
+}
diff --git a/tools/kvm/x86/bios/entry.S b/tools/kvm/x86/bios/entry.S
new file mode 100644
index 0000000..85056e9
--- /dev/null
+++ b/tools/kvm/x86/bios/entry.S
@@ -0,0 +1,92 @@
+/*
+ * Our pretty trivial BIOS emulation
+ */
+
+#include <kvm/bios.h>
+#include <kvm/assembly.h>
+
+ .org 0
+ .code16gcc
+
+#define EFLAGS_CF (1 << 0)
+
+#include "macro.S"
+
+/* If you change these macros, remember to update 'struct biosregs' */
+.macro SAVE_BIOSREGS
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %edi
+ pushl %esi
+ pushl %ebp
+ pushl %esp
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+ pushl %eax
+.endm
+
+.macro RESTORE_BIOSREGS
+ popl %eax
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esp
+ popl %ebp
+ popl %esi
+ popl %edi
+ popl %ds
+ popl %es
+ popl %fs
+.endm
+
+/*
+ * fake interrupt handler, nothing can be faster ever
+ */
+ENTRY(bios_intfake)
+ /*
+ * Set CF to indicate failure. We don't want callers to think that the
+ * interrupt handler succeeded and then treat the return values in
+ * registers as valid data.
+ */
+ orl $EFLAGS_CF, 0x4(%esp)
+
+ IRET
+ENTRY_END(bios_intfake)
+
+/*
+ * int 10 - video - service
+ */
+ENTRY(bios_int10)
+ SAVE_BIOSREGS
+
+ movl %esp, %eax
+ /* this is way easier than doing it in assembly */
+ /* just push all the regs and jump to a C handler */
+ call int10_handler
+
+ RESTORE_BIOSREGS
+
+ /* Clear CF to indicate success. */
+ andl $~EFLAGS_CF, 0x4(%esp)
+
+ IRET
+ENTRY_END(bios_int10)
+
+ENTRY(bios_int15)
+ SAVE_BIOSREGS
+
+ movl %esp, %eax
+ call int15_handler
+
+ RESTORE_BIOSREGS
+
+ IRET
+ENTRY_END(bios_int15)
+
+GLOBAL(__locals)
+
+#include "local.S"
+
+END(__locals)
diff --git a/tools/kvm/x86/bios/gen-offsets.sh b/tools/kvm/x86/bios/gen-offsets.sh
new file mode 100644
index 0000000..8771bbe
--- /dev/null
+++ b/tools/kvm/x86/bios/gen-offsets.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+echo "/* Autogenerated file, don't edit */"
+echo "#ifndef BIOS_OFFSETS_H"
+echo "#define BIOS_OFFSETS_H"
+
+echo ""
+echo "#define BIOS_ENTRY_SIZE(name) (name##_end - name)"
+echo ""
+
+nm bios.bin.elf | grep ' [Tt] ' | awk '{ print "#define BIOS_OFFSET__" $3 " 0x" $1; }'
+
+echo ""
+echo "#endif"
diff --git a/tools/kvm/x86/bios/int10.c b/tools/kvm/x86/bios/int10.c
new file mode 100644
index 0000000..7cc0b3f
--- /dev/null
+++ b/tools/kvm/x86/bios/int10.c
@@ -0,0 +1,110 @@
+#include "kvm/segment.h"
+#include "kvm/bios.h"
+#include "kvm/vesa.h"
+
+#include "bios/memcpy.h"
+
+#include <boot/vesa.h>
+
+static far_ptr gen_far_ptr(unsigned int pa)
+{
+ far_ptr ptr;
+
+ ptr.seg = (pa >> 4);
+ ptr.off = pa - (ptr.seg << 4);
+
+ return ptr;
+}
+
+static inline void outb(unsigned short port, unsigned char val)
+{
+ asm volatile("outb %0, %1" : : "a"(val), "Nd"(port));
+}
+
+/*
+ * It's probably much more useful to make this print to the serial
+ * line rather than print to a non-displayed VGA memory
+ */
+static inline void int10_putchar(struct biosregs *args)
+{
+ u8 al = args->eax & 0xFF;
+
+ outb(0x3f8, al);
+}
+
+static void vbe_get_mode(struct biosregs *args)
+{
+ struct vesa_mode_info *info = (struct vesa_mode_info *) args->edi;
+
+ *info = (struct vesa_mode_info) {
+		.mode_attr	= 0xd9,		/* 11011001 */
+ .logical_scan = VESA_WIDTH*4,
+ .h_res = VESA_WIDTH,
+ .v_res = VESA_HEIGHT,
+ .bpp = VESA_BPP,
+ .memory_layout = 6,
+ .memory_planes = 1,
+ .lfb_ptr = VESA_MEM_ADDR,
+ .rmask = 8,
+ .gmask = 8,
+ .bmask = 8,
+ .resv_mask = 8,
+ .resv_pos = 24,
+ .bpos = 16,
+ .gpos = 8,
+ };
+}
+
+static void vbe_get_info(struct biosregs *args)
+{
+ struct vesa_general_info *infop = (struct vesa_general_info *) args->edi;
+ struct vesa_general_info info;
+
+ info = (struct vesa_general_info) {
+ .signature = VESA_MAGIC,
+ .version = 0x102,
+ .vendor_string = gen_far_ptr(VGA_ROM_BEGIN),
+ .capabilities = 0x10,
+ .video_mode_ptr = gen_far_ptr(VGA_ROM_MODES),
+ .total_memory = (4 * VESA_WIDTH * VESA_HEIGHT) / 0x10000,
+ };
+
+ memcpy16(args->es, infop, args->ds, &info, sizeof(info));
+}
+
+#define VBE_STATUS_OK 0x004F
+
+static void int10_vesa(struct biosregs *args)
+{
+ u8 al;
+
+ al = args->eax & 0xff;
+
+ switch (al) {
+ case 0x00:
+ vbe_get_info(args);
+ break;
+ case 0x01:
+ vbe_get_mode(args);
+ break;
+ }
+
+ args->eax = VBE_STATUS_OK;
+}
+
+bioscall void int10_handler(struct biosregs *args)
+{
+ u8 ah;
+
+ ah = (args->eax & 0xff00) >> 8;
+
+ switch (ah) {
+ case 0x0e:
+ int10_putchar(args);
+ break;
+ case 0x4f:
+ int10_vesa(args);
+ break;
+ }
+
+}
diff --git a/tools/kvm/x86/bios/int15.c b/tools/kvm/x86/bios/int15.c
new file mode 100644
index 0000000..faf5343
--- /dev/null
+++ b/tools/kvm/x86/bios/int15.c
@@ -0,0 +1,18 @@
+#include "kvm/bios.h"
+
+#include "kvm/e820.h"
+
+#include <asm/processor-flags.h>
+
+bioscall void int15_handler(struct biosregs *regs)
+{
+ switch (regs->eax) {
+ case 0xe820:
+ e820_query_map(regs);
+ break;
+ default:
+ /* Set CF to indicate failure. */
+ regs->eflags |= X86_EFLAGS_CF;
+ break;
+ }
+}
diff --git a/tools/kvm/x86/bios/local.S b/tools/kvm/x86/bios/local.S
new file mode 100644
index 0000000..f2cdbf4
--- /dev/null
+++ b/tools/kvm/x86/bios/local.S
@@ -0,0 +1,7 @@
+/*
+ * Local variables for almost every BIOS irq handler
+ * Must be put somewhere inside irq handler body
+ */
+__CALLER_SS: .int 0
+__CALLER_SP: .long 0
+__CALLER_CLOBBER: .long 0
diff --git a/tools/kvm/x86/bios/macro.S b/tools/kvm/x86/bios/macro.S
new file mode 100644
index 0000000..0d5e567e
--- /dev/null
+++ b/tools/kvm/x86/bios/macro.S
@@ -0,0 +1,25 @@
+/*
+ * handy BIOS macros
+ */
+
+/*
+ * switch to BIOS stack
+ */
+.macro stack_swap
+ movw %ss, %cs:(__CALLER_SS)
+ movl %esp, %cs:(__CALLER_SP)
+ movl %edx, %cs:(__CALLER_CLOBBER)
+ movw $MB_BIOS_SS, %dx
+ movw %dx, %ss
+ movw $MB_BIOS_SP, %sp
+ movl %cs:(__CALLER_CLOBBER), %edx
+.endm
+
+/*
+ * restore the original stack
+ */
+.macro stack_restore
+ movl %cs:(__CALLER_SP), %esp
+ movw %cs:(__CALLER_SS), %ss
+.endm
+
diff --git a/tools/kvm/x86/bios/memcpy.c b/tools/kvm/x86/bios/memcpy.c
new file mode 100644
index 0000000..40b9b65
--- /dev/null
+++ b/tools/kvm/x86/bios/memcpy.c
@@ -0,0 +1,23 @@
+#include "bios/memcpy.h"
+
+/*
+ * Copy memory area in 16-bit real mode.
+ */
+void memcpy16(u16 dst_seg, void *dst, u16 src_seg, const void *src, size_t len)
+{
+ __asm__ __volatile__ (
+ "pushw %%ds \n"
+ "pushw %%es \n"
+ "movw %[src_seg], %%ds \n"
+ "movw %[dst_seg], %%es \n"
+ "rep movsb %%ds:(%%si), %%es:(%%di) \n"
+ "popw %%es \n"
+ "popw %%ds \n"
+ :
+ : "S"(src),
+ "D"(dst),
+ "c"(len),
+ [src_seg] "r"(src_seg),
+ [dst_seg] "r"(dst_seg)
+ : "cc", "memory");
+}
diff --git a/tools/kvm/x86/bios/rom.ld.S b/tools/kvm/x86/bios/rom.ld.S
new file mode 100644
index 0000000..f4f1835
--- /dev/null
+++ b/tools/kvm/x86/bios/rom.ld.S
@@ -0,0 +1,16 @@
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+
+SECTIONS {
+ .text 0 : {
+ *(.text)
+ }
+
+ /DISCARD/ : {
+ *(.debug*)
+ *(.data)
+ *(.bss)
+ *(.eh_frame*)
+ }
+}
+
diff --git a/tools/kvm/x86/boot.c b/tools/kvm/x86/boot.c
new file mode 100644
index 0000000..61535eb
--- /dev/null
+++ b/tools/kvm/x86/boot.c
@@ -0,0 +1,41 @@
+#include "kvm/kvm.h"
+
+#include "kvm/util.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <fcntl.h>
+
+#define BIOS_SELECTOR 0xf000
+#define BIOS_IP 0xfff0
+#define BIOS_SP 0x8000
+
+bool kvm__load_firmware(struct kvm *kvm, const char *firmware_filename)
+{
+ struct stat st;
+ void *p;
+ int fd;
+ int nr;
+
+ fd = open(firmware_filename, O_RDONLY);
+ if (fd < 0)
+ return false;
+
+ if (fstat(fd, &st))
+ return false;
+
+ if (st.st_size > MB_FIRMWARE_BIOS_SIZE)
+ die("firmware image %s is too big to fit in memory (%Lu KB).\n", firmware_filename, (u64)(st.st_size / 1024));
+
+ p = guest_flat_to_host(kvm, MB_FIRMWARE_BIOS_BEGIN);
+
+ while ((nr = read(fd, p, st.st_size)) > 0)
+ p += nr;
+
+ kvm->arch.boot_selector = BIOS_SELECTOR;
+ kvm->arch.boot_ip = BIOS_IP;
+ kvm->arch.boot_sp = BIOS_SP;
+
+ return true;
+}
diff --git a/tools/kvm/x86/cpuid.c b/tools/kvm/x86/cpuid.c
new file mode 100644
index 0000000..c3b67d9
--- /dev/null
+++ b/tools/kvm/x86/cpuid.c
@@ -0,0 +1,89 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include <sys/ioctl.h>
+#include <stdlib.h>
+
+#define MAX_KVM_CPUID_ENTRIES 100
+
+static void filter_cpuid(struct kvm_cpuid2 *kvm_cpuid)
+{
+ unsigned int signature[3];
+ unsigned int i;
+
+ /*
+ * Filter CPUID functions that are not supported by the hypervisor.
+ */
+ for (i = 0; i < kvm_cpuid->nent; i++) {
+ struct kvm_cpuid_entry2 *entry = &kvm_cpuid->entries[i];
+
+ switch (entry->function) {
+ case 0:
+ /* Vendor name */
+ memcpy(signature, "LKVMLKVMLKVM", 12);
+ entry->ebx = signature[0];
+ entry->ecx = signature[1];
+ entry->edx = signature[2];
+ break;
+ case 1:
+ /* Set X86_FEATURE_HYPERVISOR */
+ if (entry->index == 0)
+ entry->ecx |= (1 << 31);
+ break;
+ case 6:
+ /* Clear X86_FEATURE_EPB */
+ entry->ecx = entry->ecx & ~(1 << 3);
+ break;
+ case 10: { /* Architectural Performance Monitoring */
+ union cpuid10_eax {
+ struct {
+ unsigned int version_id :8;
+ unsigned int num_counters :8;
+ unsigned int bit_width :8;
+ unsigned int mask_length :8;
+ } split;
+ unsigned int full;
+ } eax;
+
+ /*
+ * If the host has perf system running,
+ * but no architectural events available
+ * through kvm pmu -- disable perf support,
+ * thus guest won't even try to access msr
+ * registers.
+ */
+ if (entry->eax) {
+ eax.full = entry->eax;
+ if (eax.split.version_id != 2 ||
+ !eax.split.num_counters)
+ entry->eax = 0;
+ }
+ break;
+ }
+ default:
+			/* Keep the CPUID function as-is */
+ break;
+ };
+ }
+}
+
+void kvm_cpu__setup_cpuid(struct kvm_cpu *vcpu)
+{
+ struct kvm_cpuid2 *kvm_cpuid;
+
+ kvm_cpuid = calloc(1, sizeof(*kvm_cpuid) +
+ MAX_KVM_CPUID_ENTRIES * sizeof(*kvm_cpuid->entries));
+
+ kvm_cpuid->nent = MAX_KVM_CPUID_ENTRIES;
+ if (ioctl(vcpu->kvm->sys_fd, KVM_GET_SUPPORTED_CPUID, kvm_cpuid) < 0)
+ die_perror("KVM_GET_SUPPORTED_CPUID failed");
+
+ filter_cpuid(kvm_cpuid);
+
+ if (ioctl(vcpu->vcpu_fd, KVM_SET_CPUID2, kvm_cpuid) < 0)
+ die_perror("KVM_SET_CPUID2 failed");
+
+ free(kvm_cpuid);
+}
diff --git a/tools/kvm/x86/include/kvm/assembly.h b/tools/kvm/x86/include/kvm/assembly.h
new file mode 100644
index 0000000..e70baab
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/assembly.h
@@ -0,0 +1,24 @@
+#ifndef ASSEMBLY_H_
+#define ASSEMBLY_H_
+
+#define __ALIGN .p2align 4, 0x90
+#define ENTRY(name) \
+ __ALIGN; \
+ .globl name; \
+ name:
+
+#define GLOBAL(name) \
+ .globl name; \
+ name:
+
+#define ENTRY_END(name) GLOBAL(name##_end)
+#define END(name) GLOBAL(name##_end)
+
+/*
+ * gas produces size override prefix with which
+ * we are unhappy, lets make it hardcoded for
+ * 16 bit mode
+ */
+#define IRET .byte 0xcf
+
+#endif /* ASSEMBLY_H_ */
diff --git a/tools/kvm/x86/include/kvm/barrier.h b/tools/kvm/x86/include/kvm/barrier.h
new file mode 100644
index 0000000..46d14f6
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/barrier.h
@@ -0,0 +1,20 @@
+#ifndef _KVM_BARRIER_H_
+#define _KVM_BARRIER_H_
+
+#define barrier() asm volatile("": : :"memory")
+
+#define mb() asm volatile ("mfence": : :"memory")
+#define rmb() asm volatile ("lfence": : :"memory")
+#define wmb() asm volatile ("sfence": : :"memory")
+
+#ifdef CONFIG_SMP
+#define smp_mb() mb()
+#define smp_rmb() rmb()
+#define smp_wmb() wmb()
+#else
+#define smp_mb() barrier()
+#define smp_rmb() barrier()
+#define smp_wmb() barrier()
+#endif
+
+#endif /* _KVM_BARRIER_H_ */
diff --git a/tools/kvm/x86/include/kvm/bios-export.h b/tools/kvm/x86/include/kvm/bios-export.h
new file mode 100644
index 0000000..23825aa
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/bios-export.h
@@ -0,0 +1,13 @@
+#ifndef BIOS_EXPORT_H_
+#define BIOS_EXPORT_H_
+
+struct kvm;
+
+extern char bios_rom[0];
+extern char bios_rom_end[0];
+
+#define bios_rom_size (bios_rom_end - bios_rom)
+
+extern void setup_bios(struct kvm *kvm);
+
+#endif /* BIOS_EXPORT_H_ */
diff --git a/tools/kvm/x86/include/kvm/bios.h b/tools/kvm/x86/include/kvm/bios.h
new file mode 100644
index 0000000..ec7ed71
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/bios.h
@@ -0,0 +1,93 @@
+#ifndef BIOS_H_
+#define BIOS_H_
+
+/*
+ * X86-32 Memory Map (typical)
+ * start end
+ * Real Mode Interrupt Vector Table 0x00000000 0x000003FF
+ * BDA area 0x00000400 0x000004FF
+ * Conventional Low Memory 0x00000500 0x0009FBFF
+ * EBDA area 0x0009FC00 0x0009FFFF
+ * VIDEO RAM 0x000A0000 0x000BFFFF
+ * VIDEO ROM (BIOS) 0x000C0000 0x000C7FFF
+ * ROMs & unus. space (mapped hw & misc)0x000C8000 0x000EFFFF 160 KiB (typically)
+ * Motherboard BIOS 0x000F0000 0x000FFFFF
+ * Extended Memory 0x00100000 0xFEBFFFFF
+ * Reserved (configs, ACPI, PnP, etc) 0xFEC00000 0xFFFFFFFF
+ */
+
+#define REAL_MODE_IVT_BEGIN 0x00000000
+#define REAL_MODE_IVT_END 0x000003ff
+
+#define BDA_START 0x00000400
+#define BDA_END 0x000004ff
+
+#define EBDA_START 0x0009fc00
+#define EBDA_END 0x0009ffff
+
+#define E820_MAP_START EBDA_START
+
+#define MB_BIOS_BEGIN 0x000f0000
+#define MB_FIRMWARE_BIOS_BEGIN 0x000e0000
+#define MB_BIOS_END 0x000fffff
+
+#define MB_BIOS_SIZE (MB_BIOS_END - MB_BIOS_BEGIN + 1)
+#define MB_FIRMWARE_BIOS_SIZE (MB_BIOS_END - MB_FIRMWARE_BIOS_BEGIN + 1)
+
+#define VGA_RAM_BEGIN 0x000a0000
+#define VGA_RAM_END 0x000bffff
+
+#define VGA_ROM_BEGIN 0x000c0000
+#define VGA_ROM_OEM_STRING VGA_ROM_BEGIN
+#define VGA_ROM_OEM_STRING_SIZE 16
+#define VGA_ROM_MODES (VGA_ROM_OEM_STRING + VGA_ROM_OEM_STRING_SIZE)
+#define VGA_ROM_MODES_SIZE 32
+#define VGA_ROM_END 0x000c7fff
+
+/* we handle one page only */
+#define VGA_RAM_SEG (VGA_RAM_BEGIN >> 4)
+#define VGA_PAGE_SIZE 0x007d0 /* 80x25 */
+
+/* real mode interrupt vector table */
+#define REAL_INTR_BASE REAL_MODE_IVT_BEGIN
+#define REAL_INTR_VECTORS 256
+
+/*
+ * BIOS stack must be at absolute predefined memory address
+ * We reserve 64 bytes for BIOS stack
+ */
+#define MB_BIOS_SS 0xfff7
+#define MB_BIOS_SP 0x40
+
+/*
+ * When interfacing with assembler code we need to be sure how
+ * arguments are passed in real mode.
+ */
+#define bioscall __attribute__((regparm(3)))
+
+#ifndef __ASSEMBLER__
+
+#include <linux/types.h>
+
+struct biosregs {
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+ u32 esp;
+ u32 ebp;
+ u32 esi;
+ u32 edi;
+ u32 ds;
+ u32 es;
+ u32 fs;
+ u32 eip;
+ u32 eflags;
+};
+
+extern bioscall void int10_handler(struct biosregs *regs);
+extern bioscall void int15_handler(struct biosregs *regs);
+
+#endif
+
+#endif /* BIOS_H_ */
diff --git a/tools/kvm/x86/include/kvm/boot-protocol.h b/tools/kvm/x86/include/kvm/boot-protocol.h
new file mode 100644
index 0000000..85b637f
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/boot-protocol.h
@@ -0,0 +1,16 @@
+/*
+ * Linux boot protocol specifics
+ */
+
+#ifndef BOOT_PROTOCOL_H_
+#define BOOT_PROTOCOL_H_
+
+/*
+ * The protected mode kernel part of a modern bzImage is loaded
+ * at 1 MB by default.
+ */
+#define BZ_DEFAULT_SETUP_SECTS 4
+#define BZ_KERNEL_START 0x100000UL
+#define INITRD_START 0x1000000UL
+
+#endif /* BOOT_PROTOCOL_H_ */
diff --git a/tools/kvm/x86/include/kvm/cpufeature.h b/tools/kvm/x86/include/kvm/cpufeature.h
new file mode 100644
index 0000000..bc4abbb
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/cpufeature.h
@@ -0,0 +1,41 @@
+#ifndef KVM__CPUFEATURE_H
+#define KVM__CPUFEATURE_H
+
+#define CPUID_VENDOR_INTEL_1 0x756e6547 /* "Genu" */
+#define CPUID_VENDOR_INTEL_2 0x49656e69 /* "ineI" */
+#define CPUID_VENDOR_INTEL_3 0x6c65746e /* "ntel" */
+
+#define CPUID_VENDOR_AMD_1 0x68747541 /* "Auth" */
+#define CPUID_VENDOR_AMD_2 0x69746e65 /* "enti" */
+#define CPUID_VENDOR_AMD_3 0x444d4163 /* "cAMD" */
+
+/*
+ * CPUID flags we need to deal with
+ */
+#define KVM__X86_FEATURE_VMX 5 /* Hardware virtualization */
+#define KVM__X86_FEATURE_SVM 2 /* Secure virtual machine */
+#define KVM__X86_FEATURE_XSAVE 26 /* XSAVE/XRSTOR/XSETBV/XGETBV */
+
+#define cpu_feature_disable(reg, feature) \
+ ((reg) & ~(1 << (feature)))
+#define cpu_feature_enable(reg, feature) \
+ ((reg) | (1 << (feature)))
+
+struct cpuid_regs {
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+};
+
+static inline void host_cpuid(struct cpuid_regs *regs)
+{
+ asm volatile("cpuid"
+ : "=a" (regs->eax),
+ "=b" (regs->ebx),
+ "=c" (regs->ecx),
+ "=d" (regs->edx)
+ : "0" (regs->eax), "2" (regs->ecx));
+}
+
+#endif /* KVM__CPUFEATURE_H */
diff --git a/tools/kvm/x86/include/kvm/interrupt.h b/tools/kvm/x86/include/kvm/interrupt.h
new file mode 100644
index 0000000..00c7ed7
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/interrupt.h
@@ -0,0 +1,26 @@
+#ifndef KVM__INTERRUPT_H
+#define KVM__INTERRUPT_H
+
+#include <linux/types.h>
+#include "kvm/bios.h"
+#include "kvm/bios-export.h"
+
+struct real_intr_desc {
+ u16 offset;
+ u16 segment;
+} __attribute__((packed));
+
+#define REAL_SEGMENT_SHIFT 4
+#define REAL_SEGMENT(addr) ((addr) >> REAL_SEGMENT_SHIFT)
+#define REAL_OFFSET(addr) ((addr) & ((1 << REAL_SEGMENT_SHIFT) - 1))
+#define REAL_INTR_SIZE (REAL_INTR_VECTORS * sizeof(struct real_intr_desc))
+
+struct interrupt_table {
+ struct real_intr_desc entries[REAL_INTR_VECTORS];
+};
+
+void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size);
+void interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry);
+void interrupt_table__set(struct interrupt_table *itable, struct real_intr_desc *entry, unsigned int num);
+
+#endif /* KVM__INTERRUPT_H */
diff --git a/tools/kvm/x86/include/kvm/kvm-arch.h b/tools/kvm/x86/include/kvm/kvm-arch.h
new file mode 100644
index 0000000..673bdf1
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/kvm-arch.h
@@ -0,0 +1,42 @@
+#ifndef KVM__KVM_ARCH_H
+#define KVM__KVM_ARCH_H
+
+#include "kvm/interrupt.h"
+#include "kvm/segment.h"
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <time.h>
+
+/*
+ * The hole includes VESA framebuffer and PCI memory.
+ */
+#define KVM_32BIT_MAX_MEM_SIZE (1ULL << 32)
+#define KVM_32BIT_GAP_SIZE (768 << 20)
+#define KVM_32BIT_GAP_START (KVM_32BIT_MAX_MEM_SIZE - KVM_32BIT_GAP_SIZE)
+
+#define KVM_MMIO_START KVM_32BIT_GAP_START
+
+/* This is the address that pci_get_io_space_block() starts allocating
+ * from. Note that this is a PCI bus address (though same on x86).
+ */
+#define KVM_IOPORT_AREA 0x0
+#define KVM_PCI_CFG_AREA (KVM_MMIO_START + 0x1000000)
+#define KVM_PCI_MMIO_AREA (KVM_MMIO_START + 0x2000000)
+#define KVM_VIRTIO_MMIO_AREA (KVM_MMIO_START + 0x3000000)
+
+#define KVM_IRQ_OFFSET 5
+
+#define KVM_VM_TYPE 0
+
+#define VIRTIO_DEFAULT_TRANS(kvm) VIRTIO_PCI
+
+struct kvm_arch {
+ u16 boot_selector;
+ u16 boot_ip;
+ u16 boot_sp;
+
+ struct interrupt_table interrupt_table;
+};
+
+#endif /* KVM__KVM_ARCH_H */
diff --git a/tools/kvm/x86/include/kvm/kvm-config-arch.h b/tools/kvm/x86/include/kvm/kvm-config-arch.h
new file mode 100644
index 0000000..3eae8db
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/kvm-config-arch.h
@@ -0,0 +1,15 @@
+#ifndef KVM__KVM_CONFIG_ARCH_H
+#define KVM__KVM_CONFIG_ARCH_H
+
+#include "kvm/parse-options.h"
+
+struct kvm_config_arch {
+ int vidmode;
+};
+
+#define OPT_ARCH_RUN(pfx, cfg) \
+ pfx, \
+ OPT_GROUP("BIOS options:"), \
+ OPT_INTEGER('\0', "vidmode", &(cfg)->vidmode, "Video mode"),
+
+#endif /* KVM__KVM_CONFIG_ARCH_H */
diff --git a/tools/kvm/x86/include/kvm/kvm-cpu-arch.h b/tools/kvm/x86/include/kvm/kvm-cpu-arch.h
new file mode 100644
index 0000000..89c8059
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/kvm-cpu-arch.h
@@ -0,0 +1,49 @@
+#ifndef KVM__KVM_CPU_ARCH_H
+#define KVM__KVM_CPU_ARCH_H
+
+/* Architecture-specific kvm_cpu definitions. */
+
+#include <linux/kvm.h> /* for struct kvm_regs */
+#include "kvm/kvm.h" /* for kvm__emulate_{mm}io() */
+#include <stdbool.h>
+#include <pthread.h>
+
+struct kvm;
+
+struct kvm_cpu {
+ pthread_t thread; /* VCPU thread */
+
+ unsigned long cpu_id;
+
+ struct kvm *kvm; /* parent KVM */
+ int vcpu_fd; /* For VCPU ioctls() */
+ struct kvm_run *kvm_run;
+
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ struct kvm_fpu fpu;
+
+ struct kvm_msrs *msrs; /* dynamically allocated */
+
+ u8 is_running;
+ u8 paused;
+ u8 needs_nmi;
+
+ struct kvm_coalesced_mmio_ring *ring;
+};
+
+/*
+ * As these are such simple wrappers, let's have them in the header so they'll
+ * be cheaper to call:
+ */
+static inline bool kvm_cpu__emulate_io(struct kvm_cpu *vcpu, u16 port, void *data, int direction, int size, u32 count)
+{
+ return kvm__emulate_io(vcpu, port, data, direction, size, count);
+}
+
+static inline bool kvm_cpu__emulate_mmio(struct kvm_cpu *vcpu, u64 phys_addr, u8 *data, u32 len, u8 is_write)
+{
+ return kvm__emulate_mmio(vcpu, phys_addr, data, len, is_write);
+}
+
+#endif /* KVM__KVM_CPU_ARCH_H */
diff --git a/tools/kvm/x86/include/kvm/mptable.h b/tools/kvm/x86/include/kvm/mptable.h
new file mode 100644
index 0000000..9e3cfa6
--- /dev/null
+++ b/tools/kvm/x86/include/kvm/mptable.h
@@ -0,0 +1,9 @@
+#ifndef KVM_MPTABLE_H_
+#define KVM_MPTABLE_H_
+
+struct kvm;
+
+int mptable__init(struct kvm *kvm);
+int mptable__exit(struct kvm *kvm);
+
+#endif /* KVM_MPTABLE_H_ */
diff --git a/tools/kvm/x86/interrupt.c b/tools/kvm/x86/interrupt.c
new file mode 100644
index 0000000..7d47869
--- /dev/null
+++ b/tools/kvm/x86/interrupt.c
@@ -0,0 +1,28 @@
+#include "kvm/interrupt.h"
+
+#include "kvm/util.h"
+
+#include <string.h>
+
+void interrupt_table__copy(struct interrupt_table *itable, void *dst, unsigned int size)
+{
+ if (size < sizeof(itable->entries))
+ die("An attempt to overwrite host memory");
+
+ memcpy(dst, itable->entries, sizeof(itable->entries));
+}
+
+void interrupt_table__setup(struct interrupt_table *itable, struct real_intr_desc *entry)
+{
+ unsigned int i;
+
+ for (i = 0; i < REAL_INTR_VECTORS; i++)
+ itable->entries[i] = *entry;
+}
+
+void interrupt_table__set(struct interrupt_table *itable,
+ struct real_intr_desc *entry, unsigned int num)
+{
+ if (num < REAL_INTR_VECTORS)
+ itable->entries[num] = *entry;
+}
diff --git a/tools/kvm/x86/ioport.c b/tools/kvm/x86/ioport.c
new file mode 100644
index 0000000..8572c758
--- /dev/null
+++ b/tools/kvm/x86/ioport.c
@@ -0,0 +1,118 @@
+#include "kvm/ioport.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+static bool debug_io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ return 0;
+}
+
+static struct ioport_operations debug_ops = {
+ .io_out = debug_io_out,
+};
+
+static bool seabios_debug_io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ char ch;
+
+ ch = ioport__read8(data);
+
+ putchar(ch);
+
+ return true;
+}
+
+static struct ioport_operations seabios_debug_ops = {
+ .io_out = seabios_debug_io_out,
+};
+
+static bool dummy_io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ return true;
+}
+
+static bool dummy_io_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ return true;
+}
+
+static struct ioport_operations dummy_read_write_ioport_ops = {
+ .io_in = dummy_io_in,
+ .io_out = dummy_io_out,
+};
+
+static struct ioport_operations dummy_write_only_ioport_ops = {
+ .io_out = dummy_io_out,
+};
+
+/*
+ * The "fast A20 gate"
+ */
+
+static bool ps2_control_a_io_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
+{
+ /*
+ * A20 is always enabled.
+ */
+ ioport__write8(data, 0x02);
+
+ return true;
+}
+
+static struct ioport_operations ps2_control_a_ops = {
+ .io_in = ps2_control_a_io_in,
+ .io_out = dummy_io_out,
+};
+
+void ioport__map_irq(u8 *irq)
+{
+}
+
+void ioport__setup_arch(struct kvm *kvm)
+{
+ /* Legacy ioport setup */
+
+ /* 0000 - 001F - DMA1 controller */
+ ioport__register(kvm, 0x0000, &dummy_read_write_ioport_ops, 32, NULL);
+
+ /* 0x0020 - 0x003F - 8259A PIC 1 */
+ ioport__register(kvm, 0x0020, &dummy_read_write_ioport_ops, 2, NULL);
+
+ /* PORT 0040-005F - PIT - PROGRAMMABLE INTERVAL TIMER (8253, 8254) */
+ ioport__register(kvm, 0x0040, &dummy_read_write_ioport_ops, 4, NULL);
+
+ /* 0092 - PS/2 system control port A */
+ ioport__register(kvm, 0x0092, &ps2_control_a_ops, 1, NULL);
+
+ /* 0x00A0 - 0x00AF - 8259A PIC 2 */
+ ioport__register(kvm, 0x00A0, &dummy_read_write_ioport_ops, 2, NULL);
+
+ /* 00C0 - 001F - DMA2 controller */
+ ioport__register(kvm, 0x00C0, &dummy_read_write_ioport_ops, 32, NULL);
+
+ /* PORT 00E0-00EF are 'motherboard specific' so we use them for our
+ internal debugging purposes. */
+ ioport__register(kvm, IOPORT_DBG, &debug_ops, 1, NULL);
+
+ /* PORT 00ED - DUMMY PORT FOR DELAY??? */
+ ioport__register(kvm, 0x00ED, &dummy_write_only_ioport_ops, 1, NULL);
+
+ /* 0x00F0 - 0x00FF - Math co-processor */
+ ioport__register(kvm, 0x00F0, &dummy_write_only_ioport_ops, 2, NULL);
+
+ /* PORT 0278-027A - PARALLEL PRINTER PORT (usually LPT1, sometimes LPT2) */
+ ioport__register(kvm, 0x0278, &dummy_read_write_ioport_ops, 3, NULL);
+
+ /* PORT 0378-037A - PARALLEL PRINTER PORT (usually LPT2, sometimes LPT3) */
+ ioport__register(kvm, 0x0378, &dummy_read_write_ioport_ops, 3, NULL);
+
+ /* PORT 03D4-03D5 - COLOR VIDEO - CRT CONTROL REGISTERS */
+ ioport__register(kvm, 0x03D4, &dummy_read_write_ioport_ops, 1, NULL);
+ ioport__register(kvm, 0x03D5, &dummy_write_only_ioport_ops, 1, NULL);
+
+ ioport__register(kvm, 0x402, &seabios_debug_ops, 1, NULL);
+
+ /* 0510 - QEMU BIOS configuration register */
+ ioport__register(kvm, 0x510, &dummy_read_write_ioport_ops, 2, NULL);
+}
diff --git a/tools/kvm/x86/irq.c b/tools/kvm/x86/irq.c
new file mode 100644
index 0000000..72177e7
--- /dev/null
+++ b/tools/kvm/x86/irq.c
@@ -0,0 +1,101 @@
+#include "kvm/irq.h"
+#include "kvm/kvm.h"
+#include "kvm/util.h"
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/list.h>
+#include <linux/kvm.h>
+#include <sys/ioctl.h>
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#define IRQ_MAX_GSI 64
+#define IRQCHIP_MASTER 0
+#define IRQCHIP_SLAVE 1
+#define IRQCHIP_IOAPIC 2
+
+/* First 24 GSIs are routed between IRQCHIPs and IOAPICs */
+static u32 gsi = 24;
+
+struct kvm_irq_routing *irq_routing;
+
+static int irq__add_routing(u32 gsi, u32 type, u32 irqchip, u32 pin)
+{
+ if (gsi >= IRQ_MAX_GSI)
+ return -ENOSPC;
+
+ irq_routing->entries[irq_routing->nr++] =
+ (struct kvm_irq_routing_entry) {
+ .gsi = gsi,
+ .type = type,
+ .u.irqchip.irqchip = irqchip,
+ .u.irqchip.pin = pin,
+ };
+
+ return 0;
+}
+
+int irq__init(struct kvm *kvm)
+{
+ int i, r;
+
+ irq_routing = calloc(sizeof(struct kvm_irq_routing) +
+ IRQ_MAX_GSI * sizeof(struct kvm_irq_routing_entry), 1);
+ if (irq_routing == NULL)
+ return -ENOMEM;
+
+ /* Hook first 8 GSIs to master IRQCHIP */
+ for (i = 0; i < 8; i++)
+ if (i != 2)
+ irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_MASTER, i);
+
+ /* Hook next 8 GSIs to slave IRQCHIP */
+ for (i = 8; i < 16; i++)
+ irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_SLAVE, i - 8);
+
+ /* Last but not least, IOAPIC */
+ for (i = 0; i < 24; i++) {
+ if (i == 0)
+ irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, 2);
+ else if (i != 2)
+ irq__add_routing(i, KVM_IRQ_ROUTING_IRQCHIP, IRQCHIP_IOAPIC, i);
+ }
+
+ r = ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing);
+ if (r) {
+ free(irq_routing);
+ return errno;
+ }
+
+ return 0;
+}
+dev_base_init(irq__init);
+
+int irq__exit(struct kvm *kvm)
+{
+ free(irq_routing);
+ return 0;
+}
+dev_base_exit(irq__exit);
+
+int irq__add_msix_route(struct kvm *kvm, struct msi_msg *msg)
+{
+ int r;
+
+ irq_routing->entries[irq_routing->nr++] =
+ (struct kvm_irq_routing_entry) {
+ .gsi = gsi,
+ .type = KVM_IRQ_ROUTING_MSI,
+ .u.msi.address_hi = msg->address_hi,
+ .u.msi.address_lo = msg->address_lo,
+ .u.msi.data = msg->data,
+ };
+
+ r = ioctl(kvm->vm_fd, KVM_SET_GSI_ROUTING, irq_routing);
+ if (r)
+ return r;
+
+ return gsi++;
+}
diff --git a/tools/kvm/x86/kvm-cpu.c b/tools/kvm/x86/kvm-cpu.c
new file mode 100644
index 0000000..5cc4e1e
--- /dev/null
+++ b/tools/kvm/x86/kvm-cpu.c
@@ -0,0 +1,431 @@
+#include "kvm/kvm-cpu.h"
+
+#include "kvm/symbol.h"
+#include "kvm/util.h"
+#include "kvm/kvm.h"
+
+#include <asm/msr-index.h>
+#include <asm/apicdef.h>
+#include <linux/err.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+
+static int debug_fd;
+
+void kvm_cpu__set_debug_fd(int fd)
+{
+ debug_fd = fd;
+}
+
+int kvm_cpu__get_debug_fd(void)
+{
+ return debug_fd;
+}
+
+static inline bool is_in_protected_mode(struct kvm_cpu *vcpu)
+{
+ return vcpu->sregs.cr0 & 0x01;
+}
+
+static inline u64 ip_to_flat(struct kvm_cpu *vcpu, u64 ip)
+{
+ u64 cs;
+
+ /*
+ * NOTE! We should take code segment base address into account here.
+ * Luckily it's usually zero because Linux uses flat memory model.
+ */
+ if (is_in_protected_mode(vcpu))
+ return ip;
+
+ cs = vcpu->sregs.cs.selector;
+
+ return ip + (cs << 4);
+}
+
+static inline u32 selector_to_base(u16 selector)
+{
+ /*
+ * KVM on Intel requires 'base' to be 'selector * 16' in real mode.
+ */
+ return (u32)selector << 4;
+}
+
+static struct kvm_cpu *kvm_cpu__new(struct kvm *kvm)
+{
+ struct kvm_cpu *vcpu;
+
+ vcpu = calloc(1, sizeof(*vcpu));
+ if (!vcpu)
+ return NULL;
+
+ vcpu->kvm = kvm;
+
+ return vcpu;
+}
+
+void kvm_cpu__delete(struct kvm_cpu *vcpu)
+{
+ if (vcpu->msrs)
+ free(vcpu->msrs);
+
+ free(vcpu);
+}
+
+static int kvm_cpu__set_lint(struct kvm_cpu *vcpu)
+{
+ struct local_apic lapic;
+
+ if (ioctl(vcpu->vcpu_fd, KVM_GET_LAPIC, &lapic))
+ return -1;
+
+ lapic.lvt_lint0.delivery_mode = APIC_MODE_EXTINT;
+ lapic.lvt_lint1.delivery_mode = APIC_MODE_NMI;
+
+ return ioctl(vcpu->vcpu_fd, KVM_SET_LAPIC, &lapic);
+}
+
+struct kvm_cpu *kvm_cpu__arch_init(struct kvm *kvm, unsigned long cpu_id)
+{
+ struct kvm_cpu *vcpu;
+ int mmap_size;
+ int coalesced_offset;
+
+ vcpu = kvm_cpu__new(kvm);
+ if (!vcpu)
+ return NULL;
+
+ vcpu->cpu_id = cpu_id;
+
+ vcpu->vcpu_fd = ioctl(vcpu->kvm->vm_fd, KVM_CREATE_VCPU, cpu_id);
+ if (vcpu->vcpu_fd < 0)
+ die_perror("KVM_CREATE_VCPU ioctl");
+
+ mmap_size = ioctl(vcpu->kvm->sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+ if (mmap_size < 0)
+ die_perror("KVM_GET_VCPU_MMAP_SIZE ioctl");
+
+ vcpu->kvm_run = mmap(NULL, mmap_size, PROT_RW, MAP_SHARED, vcpu->vcpu_fd, 0);
+ if (vcpu->kvm_run == MAP_FAILED)
+ die("unable to mmap vcpu fd");
+
+ coalesced_offset = ioctl(kvm->sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
+ if (coalesced_offset)
+ vcpu->ring = (void *)vcpu->kvm_run + (coalesced_offset * PAGE_SIZE);
+
+ if (kvm_cpu__set_lint(vcpu))
+ die_perror("KVM_SET_LAPIC failed");
+
+ vcpu->is_running = true;
+
+ return vcpu;
+}
+
+static struct kvm_msrs *kvm_msrs__new(size_t nmsrs)
+{
+ struct kvm_msrs *vcpu = calloc(1, sizeof(*vcpu) + (sizeof(struct kvm_msr_entry) * nmsrs));
+
+ if (!vcpu)
+ die("out of memory");
+
+ return vcpu;
+}
+
+#define KVM_MSR_ENTRY(_index, _data) \
+ (struct kvm_msr_entry) { .index = _index, .data = _data }
+
+static void kvm_cpu__setup_msrs(struct kvm_cpu *vcpu)
+{
+ unsigned long ndx = 0;
+
+ vcpu->msrs = kvm_msrs__new(100);
+
+ vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_CS, 0x0);
+ vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_ESP, 0x0);
+ vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_SYSENTER_EIP, 0x0);
+#ifdef CONFIG_X86_64
+ vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_STAR, 0x0);
+ vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_CSTAR, 0x0);
+ vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_KERNEL_GS_BASE, 0x0);
+ vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_SYSCALL_MASK, 0x0);
+ vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_LSTAR, 0x0);
+#endif
+ vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_TSC, 0x0);
+ vcpu->msrs->entries[ndx++] = KVM_MSR_ENTRY(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_FAST_STRING);
+
+ vcpu->msrs->nmsrs = ndx;
+
+ if (ioctl(vcpu->vcpu_fd, KVM_SET_MSRS, vcpu->msrs) < 0)
+ die_perror("KVM_SET_MSRS failed");
+}
+
+static void kvm_cpu__setup_fpu(struct kvm_cpu *vcpu)
+{
+ vcpu->fpu = (struct kvm_fpu) {
+ .fcw = 0x37f,
+ .mxcsr = 0x1f80,
+ };
+
+ if (ioctl(vcpu->vcpu_fd, KVM_SET_FPU, &vcpu->fpu) < 0)
+ die_perror("KVM_SET_FPU failed");
+}
+
+static void kvm_cpu__setup_regs(struct kvm_cpu *vcpu)
+{
+ vcpu->regs = (struct kvm_regs) {
+ /* We start the guest in 16-bit real mode */
+ .rflags = 0x0000000000000002ULL,
+
+ .rip = vcpu->kvm->arch.boot_ip,
+ .rsp = vcpu->kvm->arch.boot_sp,
+ .rbp = vcpu->kvm->arch.boot_sp,
+ };
+
+ if (vcpu->regs.rip > USHRT_MAX)
+ die("ip 0x%llx is too high for real mode", (u64)vcpu->regs.rip);
+
+ if (ioctl(vcpu->vcpu_fd, KVM_SET_REGS, &vcpu->regs) < 0)
+ die_perror("KVM_SET_REGS failed");
+}
+
+static void kvm_cpu__setup_sregs(struct kvm_cpu *vcpu)
+{
+ if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+ die_perror("KVM_GET_SREGS failed");
+
+ vcpu->sregs.cs.selector = vcpu->kvm->arch.boot_selector;
+ vcpu->sregs.cs.base = selector_to_base(vcpu->kvm->arch.boot_selector);
+ vcpu->sregs.ss.selector = vcpu->kvm->arch.boot_selector;
+ vcpu->sregs.ss.base = selector_to_base(vcpu->kvm->arch.boot_selector);
+ vcpu->sregs.ds.selector = vcpu->kvm->arch.boot_selector;
+ vcpu->sregs.ds.base = selector_to_base(vcpu->kvm->arch.boot_selector);
+ vcpu->sregs.es.selector = vcpu->kvm->arch.boot_selector;
+ vcpu->sregs.es.base = selector_to_base(vcpu->kvm->arch.boot_selector);
+ vcpu->sregs.fs.selector = vcpu->kvm->arch.boot_selector;
+ vcpu->sregs.fs.base = selector_to_base(vcpu->kvm->arch.boot_selector);
+ vcpu->sregs.gs.selector = vcpu->kvm->arch.boot_selector;
+ vcpu->sregs.gs.base = selector_to_base(vcpu->kvm->arch.boot_selector);
+
+ if (ioctl(vcpu->vcpu_fd, KVM_SET_SREGS, &vcpu->sregs) < 0)
+ die_perror("KVM_SET_SREGS failed");
+}
+
+/**
+ * kvm_cpu__reset_vcpu - reset virtual CPU to a known state
+ */
+void kvm_cpu__reset_vcpu(struct kvm_cpu *vcpu)
+{
+ kvm_cpu__setup_cpuid(vcpu);
+ kvm_cpu__setup_sregs(vcpu);
+ kvm_cpu__setup_regs(vcpu);
+ kvm_cpu__setup_fpu(vcpu);
+ kvm_cpu__setup_msrs(vcpu);
+}
+
+bool kvm_cpu__handle_exit(struct kvm_cpu *vcpu)
+{
+ return false;
+}
+
+static void print_dtable(const char *name, struct kvm_dtable *dtable)
+{
+ dprintf(debug_fd, " %s %016llx %08hx\n",
+ name, (u64) dtable->base, (u16) dtable->limit);
+}
+
+static void print_segment(const char *name, struct kvm_segment *seg)
+{
+ dprintf(debug_fd, " %s %04hx %016llx %08x %02hhx %x %x %x %x %x %x %x\n",
+ name, (u16) seg->selector, (u64) seg->base, (u32) seg->limit,
+ (u8) seg->type, seg->present, seg->dpl, seg->db, seg->s, seg->l, seg->g, seg->avl);
+}
+
+void kvm_cpu__show_registers(struct kvm_cpu *vcpu)
+{
+ unsigned long cr0, cr2, cr3;
+ unsigned long cr4, cr8;
+ unsigned long rax, rbx, rcx;
+ unsigned long rdx, rsi, rdi;
+ unsigned long rbp, r8, r9;
+ unsigned long r10, r11, r12;
+ unsigned long r13, r14, r15;
+ unsigned long rip, rsp;
+ struct kvm_sregs sregs;
+ unsigned long rflags;
+ struct kvm_regs regs;
+ int i;
+
+ if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, ®s) < 0)
+ die("KVM_GET_REGS failed");
+
+ rflags = regs.rflags;
+
+ rip = regs.rip; rsp = regs.rsp;
+ rax = regs.rax; rbx = regs.rbx; rcx = regs.rcx;
+ rdx = regs.rdx; rsi = regs.rsi; rdi = regs.rdi;
+ rbp = regs.rbp; r8 = regs.r8; r9 = regs.r9;
+ r10 = regs.r10; r11 = regs.r11; r12 = regs.r12;
+ r13 = regs.r13; r14 = regs.r14; r15 = regs.r15;
+
+ dprintf(debug_fd, "\n Registers:\n");
+ dprintf(debug_fd, " ----------\n");
+ dprintf(debug_fd, " rip: %016lx rsp: %016lx flags: %016lx\n", rip, rsp, rflags);
+ dprintf(debug_fd, " rax: %016lx rbx: %016lx rcx: %016lx\n", rax, rbx, rcx);
+ dprintf(debug_fd, " rdx: %016lx rsi: %016lx rdi: %016lx\n", rdx, rsi, rdi);
+ dprintf(debug_fd, " rbp: %016lx r8: %016lx r9: %016lx\n", rbp, r8, r9);
+ dprintf(debug_fd, " r10: %016lx r11: %016lx r12: %016lx\n", r10, r11, r12);
+ dprintf(debug_fd, " r13: %016lx r14: %016lx r15: %016lx\n", r13, r14, r15);
+
+ if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
+ die("KVM_GET_REGS failed");
+
+ cr0 = sregs.cr0; cr2 = sregs.cr2; cr3 = sregs.cr3;
+ cr4 = sregs.cr4; cr8 = sregs.cr8;
+
+ dprintf(debug_fd, " cr0: %016lx cr2: %016lx cr3: %016lx\n", cr0, cr2, cr3);
+ dprintf(debug_fd, " cr4: %016lx cr8: %016lx\n", cr4, cr8);
+ dprintf(debug_fd, "\n Segment registers:\n");
+ dprintf(debug_fd, " ------------------\n");
+ dprintf(debug_fd, " register selector base limit type p dpl db s l g avl\n");
+ print_segment("cs ", &sregs.cs);
+ print_segment("ss ", &sregs.ss);
+ print_segment("ds ", &sregs.ds);
+ print_segment("es ", &sregs.es);
+ print_segment("fs ", &sregs.fs);
+ print_segment("gs ", &sregs.gs);
+ print_segment("tr ", &sregs.tr);
+ print_segment("ldt", &sregs.ldt);
+ print_dtable("gdt", &sregs.gdt);
+ print_dtable("idt", &sregs.idt);
+
+ dprintf(debug_fd, "\n APIC:\n");
+ dprintf(debug_fd, " -----\n");
+ dprintf(debug_fd, " efer: %016llx apic base: %016llx nmi: %s\n",
+ (u64) sregs.efer, (u64) sregs.apic_base,
+ (vcpu->kvm->nmi_disabled ? "disabled" : "enabled"));
+
+ dprintf(debug_fd, "\n Interrupt bitmap:\n");
+ dprintf(debug_fd, " -----------------\n");
+ for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++)
+ dprintf(debug_fd, " %016llx", (u64) sregs.interrupt_bitmap[i]);
+ dprintf(debug_fd, "\n");
+}
+
+#define MAX_SYM_LEN 128
+
+void kvm_cpu__show_code(struct kvm_cpu *vcpu)
+{
+ unsigned int code_bytes = 64;
+ unsigned int code_prologue = 43;
+ unsigned int code_len = code_bytes;
+ char sym[MAX_SYM_LEN] = SYMBOL_DEFAULT_UNKNOWN, *psym;
+ unsigned char c;
+ unsigned int i;
+ u8 *ip;
+
+ if (ioctl(vcpu->vcpu_fd, KVM_GET_REGS, &vcpu->regs) < 0)
+ die("KVM_GET_REGS failed");
+
+ if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+ die("KVM_GET_SREGS failed");
+
+ ip = guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip) - code_prologue);
+
+ dprintf(debug_fd, "\n Code:\n");
+ dprintf(debug_fd, " -----\n");
+
+ psym = symbol_lookup(vcpu->kvm, vcpu->regs.rip, sym, MAX_SYM_LEN);
+ if (IS_ERR(psym))
+ dprintf(debug_fd,
+ "Warning: symbol_lookup() failed to find symbol "
+ "with error: %ld\n", PTR_ERR(psym));
+
+ dprintf(debug_fd, " rip: [<%016lx>] %s\n\n", (unsigned long) vcpu->regs.rip, sym);
+
+ for (i = 0; i < code_len; i++, ip++) {
+ if (!host_ptr_in_ram(vcpu->kvm, ip))
+ break;
+
+ c = *ip;
+
+ if (ip == guest_flat_to_host(vcpu->kvm, ip_to_flat(vcpu, vcpu->regs.rip)))
+ dprintf(debug_fd, " <%02x>", c);
+ else
+ dprintf(debug_fd, " %02x", c);
+ }
+
+ dprintf(debug_fd, "\n");
+
+ dprintf(debug_fd, "\n Stack:\n");
+ dprintf(debug_fd, " ------\n");
+ dprintf(debug_fd, " rsp: [<%016lx>] \n", (unsigned long) vcpu->regs.rsp);
+ kvm__dump_mem(vcpu->kvm, vcpu->regs.rsp, 32, debug_fd);
+}
+
+void kvm_cpu__show_page_tables(struct kvm_cpu *vcpu)
+{
+ u64 *pte1;
+ u64 *pte2;
+ u64 *pte3;
+ u64 *pte4;
+
+ if (!is_in_protected_mode(vcpu)) {
+ dprintf(debug_fd, "\n Page Tables:\n");
+ dprintf(debug_fd, " ------\n");
+ dprintf(debug_fd, " Not in protected mode\n");
+ return;
+ }
+
+ if (ioctl(vcpu->vcpu_fd, KVM_GET_SREGS, &vcpu->sregs) < 0)
+ die("KVM_GET_SREGS failed");
+
+ pte4 = guest_flat_to_host(vcpu->kvm, vcpu->sregs.cr3);
+ if (!host_ptr_in_ram(vcpu->kvm, pte4))
+ return;
+
+ pte3 = guest_flat_to_host(vcpu->kvm, (*pte4 & ~0xfff));
+ if (!host_ptr_in_ram(vcpu->kvm, pte3))
+ return;
+
+ pte2 = guest_flat_to_host(vcpu->kvm, (*pte3 & ~0xfff));
+ if (!host_ptr_in_ram(vcpu->kvm, pte2))
+ return;
+
+ pte1 = guest_flat_to_host(vcpu->kvm, (*pte2 & ~0xfff));
+ if (!host_ptr_in_ram(vcpu->kvm, pte1))
+ return;
+
+ dprintf(debug_fd, "\n Page Tables:\n");
+ dprintf(debug_fd, " ------\n");
+ if (*pte2 & (1 << 7))
+ dprintf(debug_fd, " pte4: %016llx pte3: %016llx"
+ " pte2: %016llx\n",
+ *pte4, *pte3, *pte2);
+ else
+ dprintf(debug_fd, " pte4: %016llx pte3: %016llx pte2: %016"
+ "llx pte1: %016llx\n",
+ *pte4, *pte3, *pte2, *pte1);
+}
+
+void kvm_cpu__arch_nmi(struct kvm_cpu *cpu)
+{
+ struct kvm_lapic_state klapic;
+ struct local_apic *lapic = (void *)&klapic;
+
+ if (ioctl(cpu->vcpu_fd, KVM_GET_LAPIC, &klapic) != 0)
+ return;
+
+ if (lapic->lvt_lint1.mask)
+ return;
+
+ if (lapic->lvt_lint1.delivery_mode != APIC_MODE_NMI)
+ return;
+
+ ioctl(cpu->vcpu_fd, KVM_NMI);
+}
diff --git a/tools/kvm/x86/kvm.c b/tools/kvm/x86/kvm.c
new file mode 100644
index 0000000..d285d1d
--- /dev/null
+++ b/tools/kvm/x86/kvm.c
@@ -0,0 +1,381 @@
+#include "kvm/kvm.h"
+#include "kvm/boot-protocol.h"
+#include "kvm/cpufeature.h"
+#include "kvm/interrupt.h"
+#include "kvm/mptable.h"
+#include "kvm/util.h"
+#include "kvm/8250-serial.h"
+#include "kvm/virtio-console.h"
+
+#include <asm/bootparam.h>
+#include <linux/kvm.h>
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <fcntl.h>
+
+struct kvm_ext kvm_req_ext[] = {
+ { DEFINE_KVM_EXT(KVM_CAP_COALESCED_MMIO) },
+ { DEFINE_KVM_EXT(KVM_CAP_SET_TSS_ADDR) },
+ { DEFINE_KVM_EXT(KVM_CAP_PIT2) },
+ { DEFINE_KVM_EXT(KVM_CAP_USER_MEMORY) },
+ { DEFINE_KVM_EXT(KVM_CAP_IRQ_ROUTING) },
+ { DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) },
+ { DEFINE_KVM_EXT(KVM_CAP_HLT) },
+ { DEFINE_KVM_EXT(KVM_CAP_IRQ_INJECT_STATUS) },
+ { DEFINE_KVM_EXT(KVM_CAP_EXT_CPUID) },
+ { 0, 0 }
+};
+
+bool kvm__arch_cpu_supports_vm(void)
+{
+ struct cpuid_regs regs;
+ u32 eax_base;
+ int feature;
+
+ regs = (struct cpuid_regs) {
+ .eax = 0x00,
+ };
+ host_cpuid(®s);
+
+ switch (regs.ebx) {
+ case CPUID_VENDOR_INTEL_1:
+ eax_base = 0x00;
+ feature = KVM__X86_FEATURE_VMX;
+ break;
+
+ case CPUID_VENDOR_AMD_1:
+ eax_base = 0x80000000;
+ feature = KVM__X86_FEATURE_SVM;
+ break;
+
+ default:
+ return false;
+ }
+
+ regs = (struct cpuid_regs) {
+ .eax = eax_base,
+ };
+ host_cpuid(®s);
+
+ if (regs.eax < eax_base + 0x01)
+ return false;
+
+ regs = (struct cpuid_regs) {
+ .eax = eax_base + 0x01
+ };
+ host_cpuid(®s);
+
+ return regs.ecx & (1 << feature);
+}
+
+/*
+ * Allocating RAM size bigger than 4GB requires us to leave a gap
+ * in the RAM which is used for PCI MMIO, hotplug, and unconfigured
+ * devices (see documentation of e820_setup_gap() for details).
+ *
+ * If we're required to initialize RAM bigger than 4GB, we will create
+ * a gap between 0xe0000000 and 0x100000000 in the guest virtual mem space.
+ */
+
+void kvm__init_ram(struct kvm *kvm)
+{
+ u64 phys_start, phys_size;
+ void *host_mem;
+
+ if (kvm->ram_size < KVM_32BIT_GAP_START) {
+ /* Use a single block of RAM for 32bit RAM */
+
+ phys_start = 0;
+ phys_size = kvm->ram_size;
+ host_mem = kvm->ram_start;
+
+ kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+ } else {
+ /* First RAM range from zero to the PCI gap: */
+
+ phys_start = 0;
+ phys_size = KVM_32BIT_GAP_START;
+ host_mem = kvm->ram_start;
+
+ kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+
+ /* Second RAM range from 4GB to the end of RAM: */
+
+ phys_start = KVM_32BIT_MAX_MEM_SIZE;
+ phys_size = kvm->ram_size - phys_start;
+ host_mem = kvm->ram_start + phys_start;
+
+ kvm__register_mem(kvm, phys_start, phys_size, host_mem);
+ }
+}
+
+/* Arch-specific commandline setup */
+void kvm__arch_set_cmdline(char *cmdline, bool video)
+{
+ strcpy(cmdline, "noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 "
+ "i8042.dumbkbd=1 i8042.nopnp=1");
+ if (video)
+ strcat(cmdline, " video=vesafb console=tty0");
+ else
+ strcat(cmdline, " console=ttyS0 earlyprintk=serial i8042.noaux=1");
+}
+
+/* Architecture-specific KVM init */
+void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
+{
+ struct kvm_pit_config pit_config = { .flags = 0, };
+ int ret;
+
+ ret = ioctl(kvm->vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000);
+ if (ret < 0)
+ die_perror("KVM_SET_TSS_ADDR ioctl");
+
+ ret = ioctl(kvm->vm_fd, KVM_CREATE_PIT2, &pit_config);
+ if (ret < 0)
+ die_perror("KVM_CREATE_PIT2 ioctl");
+
+ if (ram_size < KVM_32BIT_GAP_START) {
+ kvm->ram_size = ram_size;
+ kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size);
+ } else {
+ kvm->ram_start = mmap_anon_or_hugetlbfs(kvm, hugetlbfs_path, ram_size + KVM_32BIT_GAP_SIZE);
+ kvm->ram_size = ram_size + KVM_32BIT_GAP_SIZE;
+ if (kvm->ram_start != MAP_FAILED)
+ /*
+ * We mprotect the gap (see kvm__init_ram() for details) PROT_NONE so that
+			 * if we accidentally write to it, we will know.
+ */
+ mprotect(kvm->ram_start + KVM_32BIT_GAP_START, KVM_32BIT_GAP_SIZE, PROT_NONE);
+ }
+ if (kvm->ram_start == MAP_FAILED)
+ die("out of memory");
+
+ madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);
+
+ ret = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
+ if (ret < 0)
+ die_perror("KVM_CREATE_IRQCHIP ioctl");
+}
+
+void kvm__arch_delete_ram(struct kvm *kvm)
+{
+ munmap(kvm->ram_start, kvm->ram_size);
+}
+
+void kvm__irq_line(struct kvm *kvm, int irq, int level)
+{
+ struct kvm_irq_level irq_level;
+
+ irq_level = (struct kvm_irq_level) {
+ {
+ .irq = irq,
+ },
+ .level = level,
+ };
+
+ if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0)
+ die_perror("KVM_IRQ_LINE failed");
+}
+
+void kvm__irq_trigger(struct kvm *kvm, int irq)
+{
+ kvm__irq_line(kvm, irq, 1);
+ kvm__irq_line(kvm, irq, 0);
+}
+
+#define BOOT_LOADER_SELECTOR 0x1000
+#define BOOT_LOADER_IP 0x0000
+#define BOOT_LOADER_SP 0x8000
+#define BOOT_CMDLINE_OFFSET 0x20000
+
+#define BOOT_PROTOCOL_REQUIRED 0x206
+#define LOAD_HIGH 0x01
+
+static inline void *guest_real_to_host(struct kvm *kvm, u16 selector, u16 offset)
+{
+ unsigned long flat = segment_to_flat(selector, offset);
+
+ return guest_flat_to_host(kvm, flat);
+}
+
+int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline)
+{
+ void *p;
+ int nr;
+
+ /*
+ * Some architectures may support loading an initrd alongside the flat kernel,
+ * but we do not.
+ */
+ if (fd_initrd != -1)
+ pr_warning("Loading initrd with flat binary not supported.");
+
+ if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+ die_perror("lseek");
+
+ p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);
+
+ while ((nr = read(fd_kernel, p, 65536)) > 0)
+ p += nr;
+
+ kvm->arch.boot_selector = BOOT_LOADER_SELECTOR;
+ kvm->arch.boot_ip = BOOT_LOADER_IP;
+ kvm->arch.boot_sp = BOOT_LOADER_SP;
+
+ return true;
+}
+
+static const char *BZIMAGE_MAGIC = "HdrS";
+
+bool load_bzimage(struct kvm *kvm, int fd_kernel, int fd_initrd,
+ const char *kernel_cmdline)
+{
+ struct boot_params *kern_boot;
+ unsigned long setup_sects;
+ struct boot_params boot;
+ size_t cmdline_size;
+ ssize_t setup_size;
+ void *p;
+ int nr;
+ u16 vidmode;
+
+ /*
+	 * See Documentation/x86/boot.txt for details on bzImage on-disk and
+ * memory layout.
+ */
+
+ if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+ die_perror("lseek");
+
+ if (read(fd_kernel, &boot, sizeof(boot)) != sizeof(boot))
+ return false;
+
+ if (memcmp(&boot.hdr.header, BZIMAGE_MAGIC, strlen(BZIMAGE_MAGIC)))
+ return false;
+
+ if (boot.hdr.version < BOOT_PROTOCOL_REQUIRED)
+ die("Too old kernel");
+
+ if (lseek(fd_kernel, 0, SEEK_SET) < 0)
+ die_perror("lseek");
+
+ if (!boot.hdr.setup_sects)
+ boot.hdr.setup_sects = BZ_DEFAULT_SETUP_SECTS;
+ setup_sects = boot.hdr.setup_sects + 1;
+
+ setup_size = setup_sects << 9;
+ p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);
+
+ /* copy setup.bin to mem*/
+ if (read(fd_kernel, p, setup_size) != setup_size)
+ die_perror("read");
+
+ /* copy vmlinux.bin to BZ_KERNEL_START*/
+ p = guest_flat_to_host(kvm, BZ_KERNEL_START);
+
+ while ((nr = read(fd_kernel, p, 65536)) > 0)
+ p += nr;
+
+ p = guest_flat_to_host(kvm, BOOT_CMDLINE_OFFSET);
+ if (kernel_cmdline) {
+ cmdline_size = strlen(kernel_cmdline) + 1;
+ if (cmdline_size > boot.hdr.cmdline_size)
+ cmdline_size = boot.hdr.cmdline_size;
+
+ memset(p, 0, boot.hdr.cmdline_size);
+ memcpy(p, kernel_cmdline, cmdline_size - 1);
+ }
+
+
+ /* vidmode should be either specified or set by default */
+ if (kvm->cfg.vnc || kvm->cfg.sdl || kvm->cfg.gtk) {
+ if (!kvm->cfg.arch.vidmode)
+ vidmode = 0x312;
+ else
+ vidmode = kvm->cfg.arch.vidmode;
+ } else {
+ vidmode = 0;
+ }
+
+ kern_boot = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, 0x00);
+
+ kern_boot->hdr.cmd_line_ptr = BOOT_CMDLINE_OFFSET;
+ kern_boot->hdr.type_of_loader = 0xff;
+ kern_boot->hdr.heap_end_ptr = 0xfe00;
+ kern_boot->hdr.loadflags |= CAN_USE_HEAP;
+ kern_boot->hdr.vid_mode = vidmode;
+
+ /*
+ * Read initrd image into guest memory
+ */
+ if (fd_initrd >= 0) {
+ struct stat initrd_stat;
+ unsigned long addr;
+
+ if (fstat(fd_initrd, &initrd_stat))
+ die_perror("fstat");
+
+ addr = boot.hdr.initrd_addr_max & ~0xfffff;
+ for (;;) {
+ if (addr < BZ_KERNEL_START)
+ die("Not enough memory for initrd");
+ else if (addr < (kvm->ram_size - initrd_stat.st_size))
+ break;
+ addr -= 0x100000;
+ }
+
+ p = guest_flat_to_host(kvm, addr);
+ nr = read(fd_initrd, p, initrd_stat.st_size);
+ if (nr != initrd_stat.st_size)
+ die("Failed to read initrd");
+
+ kern_boot->hdr.ramdisk_image = addr;
+ kern_boot->hdr.ramdisk_size = initrd_stat.st_size;
+ }
+
+ kvm->arch.boot_selector = BOOT_LOADER_SELECTOR;
+ /*
+ * The real-mode setup code starts at offset 0x200 of a bzImage. See
+ * Documentation/x86/boot.txt for details.
+ */
+ kvm->arch.boot_ip = BOOT_LOADER_IP + 0x200;
+ kvm->arch.boot_sp = BOOT_LOADER_SP;
+
+ return true;
+}
+
+/**
+ * kvm__arch_setup_firmware - inject BIOS into guest system memory
+ * @kvm - guest system descriptor
+ *
+ * This function is a main routine where we poke guest memory
+ * and install BIOS there.
+ */
+int kvm__arch_setup_firmware(struct kvm *kvm)
+{
+	/* standard minimal configuration */
+ setup_bios(kvm);
+
+ /* FIXME: SMP, ACPI and friends here */
+
+ return 0;
+}
+
+int kvm__arch_free_firmware(struct kvm *kvm)
+{
+ return 0;
+}
+
+void kvm__arch_read_term(struct kvm *kvm)
+{
+ serial8250__update_consoles(kvm);
+ virtio_console__inject_interrupt(kvm);
+}
diff --git a/tools/kvm/x86/mptable.c b/tools/kvm/x86/mptable.c
new file mode 100644
index 0000000..a984de9
--- /dev/null
+++ b/tools/kvm/x86/mptable.c
@@ -0,0 +1,280 @@
+#include "kvm/kvm.h"
+#include "kvm/bios.h"
+#include "kvm/apic.h"
+#include "kvm/mptable.h"
+#include "kvm/util.h"
+#include "kvm/devices.h"
+#include "kvm/pci.h"
+
+#include <linux/kernel.h>
+#include <string.h>
+
+#include <asm/mpspec_def.h>
+#include <linux/types.h>
+
+/*
+ * FIXME: please make sure the addresses borrowed
+ * for apic/ioapic never overlap! We need a global
+ * tracker of system resources (including io, mmio,
+ * and friends).
+ */
+
+static unsigned int mpf_checksum(unsigned char *mp, int len)
+{
+ unsigned int sum = 0;
+
+ while (len--)
+ sum += *mp++;
+
+ return sum & 0xFF;
+}
+
+static unsigned int gen_cpu_flag(unsigned int cpu, unsigned int ncpu)
+{
+ /* sets enabled/disabled | BSP/AP processor */
+ return ( (cpu < ncpu) ? CPU_ENABLED : 0) |
+ ((cpu == 0) ? CPU_BOOTPROCESSOR : 0x00);
+}
+
+#define MPTABLE_SIG_FLOATING "_MP_"
+#define MPTABLE_OEM "KVMCPU00"
+#define MPTABLE_PRODUCTID "0.1 "
+#define MPTABLE_PCIBUSTYPE "PCI "
+#define MPTABLE_ISABUSTYPE "ISA "
+
+#define MPTABLE_STRNCPY(d, s) memcpy(d, s, sizeof(d))
+
+/* It should be more than enough */
+#define MPTABLE_MAX_SIZE (32 << 20)
+
+/*
+ * Too many cpus will require x2apic mode
+ * and rather ACPI support so we limit it
+ * here for a while.
+ */
+#define MPTABLE_MAX_CPUS 255
+
+static void mptable_add_irq_src(struct mpc_intsrc *mpc_intsrc,
+ u16 srcbusid, u16 srcbusirq,
+ u16 dstapic, u16 dstirq)
+{
+ *mpc_intsrc = (struct mpc_intsrc) {
+ .type = MP_INTSRC,
+ .irqtype = mp_INT,
+ .irqflag = MP_IRQDIR_DEFAULT,
+ .srcbus = srcbusid,
+ .srcbusirq = srcbusirq,
+ .dstapic = dstapic,
+ .dstirq = dstirq
+ };
+}
+
+/**
+ * mptable_setup - create mptable and fill guest memory with it
+ */
+int mptable__init(struct kvm *kvm)
+{
+ unsigned long real_mpc_table, real_mpf_intel, size;
+ struct mpf_intel *mpf_intel;
+ struct mpc_table *mpc_table;
+ struct mpc_cpu *mpc_cpu;
+ struct mpc_bus *mpc_bus;
+ struct mpc_ioapic *mpc_ioapic;
+ struct mpc_intsrc *mpc_intsrc;
+ struct device_header *dev_hdr;
+
+ const int pcibusid = 0;
+ const int isabusid = 1;
+
+ unsigned int i, nentries = 0, ncpus = kvm->nrcpus;
+ unsigned int ioapicid;
+ void *last_addr;
+
+ /* That is where MP table will be in guest memory */
+ real_mpc_table = ALIGN(MB_BIOS_BEGIN + bios_rom_size, 16);
+
+ if (ncpus > MPTABLE_MAX_CPUS) {
+ pr_warning("Too many cpus: %d limited to %d",
+ ncpus, MPTABLE_MAX_CPUS);
+ ncpus = MPTABLE_MAX_CPUS;
+ }
+
+ mpc_table = calloc(1, MPTABLE_MAX_SIZE);
+ if (!mpc_table)
+ return -ENOMEM;
+
+ MPTABLE_STRNCPY(mpc_table->signature, MPC_SIGNATURE);
+ MPTABLE_STRNCPY(mpc_table->oem, MPTABLE_OEM);
+ MPTABLE_STRNCPY(mpc_table->productid, MPTABLE_PRODUCTID);
+
+ mpc_table->spec = 4;
+ mpc_table->lapic = APIC_ADDR(0);
+ mpc_table->oemcount = ncpus; /* will be updated again at end */
+
+ /*
+ * CPUs enumeration. Technically speaking we should
+ * ask either host or HV for apic version supported
+ * but for a while we simply put some random value
+ * here.
+ */
+ mpc_cpu = (void *)&mpc_table[1];
+ for (i = 0; i < ncpus; i++) {
+ mpc_cpu->type = MP_PROCESSOR;
+ mpc_cpu->apicid = i;
+ mpc_cpu->apicver = KVM_APIC_VERSION;
+ mpc_cpu->cpuflag = gen_cpu_flag(i, ncpus);
+ mpc_cpu->cpufeature = 0x600; /* some default value */
+ mpc_cpu->featureflag = 0x201; /* some default value */
+ mpc_cpu++;
+ }
+
+ last_addr = (void *)mpc_cpu;
+ nentries += ncpus;
+
+ /*
+ * PCI buses.
+ * FIXME: Some callback here to obtain real number
+ * of PCI buses present in system.
+ */
+ mpc_bus = last_addr;
+ mpc_bus->type = MP_BUS;
+ mpc_bus->busid = pcibusid;
+ MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_PCIBUSTYPE);
+
+ last_addr = (void *)&mpc_bus[1];
+ nentries++;
+
+ /*
+ * ISA bus.
+ * FIXME: Same issue as for PCI bus.
+ */
+ mpc_bus = last_addr;
+ mpc_bus->type = MP_BUS;
+ mpc_bus->busid = isabusid;
+ MPTABLE_STRNCPY(mpc_bus->bustype, MPTABLE_ISABUSTYPE);
+
+ last_addr = (void *)&mpc_bus[1];
+ nentries++;
+
+ /*
+ * IO-APIC chip.
+ */
+ ioapicid = ncpus + 1;
+ mpc_ioapic = last_addr;
+ mpc_ioapic->type = MP_IOAPIC;
+ mpc_ioapic->apicid = ioapicid;
+ mpc_ioapic->apicver = KVM_APIC_VERSION;
+ mpc_ioapic->flags = MPC_APIC_USABLE;
+ mpc_ioapic->apicaddr = IOAPIC_ADDR(0);
+
+ last_addr = (void *)&mpc_ioapic[1];
+ nentries++;
+
+ /*
+ * IRQ sources.
+ * Also note we use PCI irqs here, not for ISA bus yet.
+ */
+
+ dev_hdr = device__first_dev(DEVICE_BUS_PCI);
+ while (dev_hdr) {
+ unsigned char srcbusirq;
+ struct pci_device_header *pci_hdr = dev_hdr->data;
+
+ srcbusirq = (pci_hdr->subsys_id << 2) | (pci_hdr->irq_pin - 1);
+ mpc_intsrc = last_addr;
+ mptable_add_irq_src(mpc_intsrc, pcibusid, srcbusirq, ioapicid, pci_hdr->irq_line);
+
+ last_addr = (void *)&mpc_intsrc[dev_hdr->dev_num];
+ nentries++;
+ dev_hdr = device__next_dev(dev_hdr);
+ }
+
+ /*
+ * Local IRQs assignment (LINT0, LINT1)
+ */
+ mpc_intsrc = last_addr;
+ mpc_intsrc->type = MP_LINTSRC;
+ mpc_intsrc->irqtype = mp_ExtINT;
+ mpc_intsrc->irqtype = mp_INT;
+ mpc_intsrc->irqflag = MP_IRQDIR_DEFAULT;
+ mpc_intsrc->srcbus = isabusid;
+ mpc_intsrc->srcbusirq = 0;
+ mpc_intsrc->dstapic = 0; /* FIXME: BSP apic */
+ mpc_intsrc->dstirq = 0; /* LINT0 */
+
+ last_addr = (void *)&mpc_intsrc[1];
+ nentries++;
+
+ mpc_intsrc = last_addr;
+ mpc_intsrc->type = MP_LINTSRC;
+ mpc_intsrc->irqtype = mp_NMI;
+ mpc_intsrc->irqflag = MP_IRQDIR_DEFAULT;
+ mpc_intsrc->srcbus = isabusid;
+ mpc_intsrc->srcbusirq = 0;
+ mpc_intsrc->dstapic = 0; /* FIXME: BSP apic */
+ mpc_intsrc->dstirq = 1; /* LINT1 */
+
+ last_addr = (void *)&mpc_intsrc[1];
+ nentries++;
+
+ /*
+ * Floating MP table finally.
+ */
+ real_mpf_intel = ALIGN((unsigned long)last_addr - (unsigned long)mpc_table, 16);
+ mpf_intel = (void *)((unsigned long)mpc_table + real_mpf_intel);
+
+ MPTABLE_STRNCPY(mpf_intel->signature, MPTABLE_SIG_FLOATING);
+ mpf_intel->length = 1;
+ mpf_intel->specification= 4;
+ mpf_intel->physptr = (unsigned int)real_mpc_table;
+ mpf_intel->checksum = -mpf_checksum((unsigned char *)mpf_intel, sizeof(*mpf_intel));
+
+ /*
+	 * No last_addr increment here, please: we need the last
+	 * active position to compute the table size.
+ */
+
+ /*
+ * Don't forget to update header in fixed table.
+ */
+ mpc_table->oemcount = nentries;
+ mpc_table->length = last_addr - (void *)mpc_table;
+ mpc_table->checksum = -mpf_checksum((unsigned char *)mpc_table, mpc_table->length);
+
+
+ /*
+ * We will copy the whole table, no need to separate
+	 * floating structure and table itself.
+ */
+ size = (unsigned long)mpf_intel + sizeof(*mpf_intel) - (unsigned long)mpc_table;
+
+ /*
+	 * The final check -- never get out of the system BIOS
+	 * area. Let's also check for allocated memory overrun;
+	 * in reality it's late, but still useful.
+ */
+
+ if (size > (unsigned long)(MB_BIOS_END - bios_rom_size) ||
+ size > MPTABLE_MAX_SIZE) {
+ free(mpc_table);
+ pr_err("MP table is too big");
+
+ return -E2BIG;
+ }
+
+ /*
+ * OK, it is time to move it to guest memory.
+ */
+ memcpy(guest_flat_to_host(kvm, real_mpc_table), mpc_table, size);
+
+ free(mpc_table);
+
+ return 0;
+}
+firmware_init(mptable__init);
+
+int mptable__exit(struct kvm *kvm)
+{
+ return 0;
+}
+firmware_exit(mptable__exit);
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index d2b59af..d561e02 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -147,7 +147,7 @@
-w::
--column-widths=<width[,width...]>::
Force each column width to the provided list, for large terminal
- readability.
+ readability. 0 means no limit (default behavior).
-t::
--field-separator=::
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 180ae02..28fdee3 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -193,6 +193,12 @@
sum of shown entries will be always 100%. "absolute" means it retains
the original value before and after the filter is applied.
+-w::
+--column-widths=<width[,width...]>::
+ Force each column width to the provided list, for large terminal
+ readability. 0 means no limit (default behavior).
+
+
INTERACTIVE PROMPTING KEYS
--------------------------
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 2240974..95e832b 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -263,6 +263,7 @@
LIB_H += util/header.h
LIB_H += util/help.h
LIB_H += util/session.h
+LIB_H += util/ordered-events.h
LIB_H += util/strbuf.h
LIB_H += util/strlist.h
LIB_H += util/strfilter.h
@@ -347,6 +348,7 @@
LIB_OBJS += $(OUTPUT)util/map.o
LIB_OBJS += $(OUTPUT)util/pstack.o
LIB_OBJS += $(OUTPUT)util/session.o
+LIB_OBJS += $(OUTPUT)util/ordered-events.o
LIB_OBJS += $(OUTPUT)util/comm.o
LIB_OBJS += $(OUTPUT)util/thread.o
LIB_OBJS += $(OUTPUT)util/thread_map.o
@@ -423,6 +425,7 @@
endif
LIB_OBJS += $(OUTPUT)tests/mmap-thread-lookup.o
LIB_OBJS += $(OUTPUT)tests/thread-mg-share.o
+LIB_OBJS += $(OUTPUT)tests/switch-tracking.o
BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o
BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
diff --git a/tools/perf/arch/arm64/include/perf_regs.h b/tools/perf/arch/arm64/include/perf_regs.h
index e9441b9..1d3f39c 100644
--- a/tools/perf/arch/arm64/include/perf_regs.h
+++ b/tools/perf/arch/arm64/include/perf_regs.h
@@ -6,6 +6,8 @@
#include <asm/perf_regs.h>
#define PERF_REGS_MASK ((1ULL << PERF_REG_ARM64_MAX) - 1)
+#define PERF_REGS_MAX PERF_REG_ARM64_MAX
+
#define PERF_REG_IP PERF_REG_ARM64_PC
#define PERF_REG_SP PERF_REG_ARM64_SP
diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c
index 42faf36..49776f1 100644
--- a/tools/perf/arch/common.c
+++ b/tools/perf/arch/common.c
@@ -12,6 +12,11 @@
NULL
};
+const char *const arm64_triplets[] = {
+ "aarch64-linux-android-",
+ NULL
+};
+
const char *const powerpc_triplets[] = {
"powerpc-unknown-linux-gnu-",
"powerpc64-unknown-linux-gnu-",
@@ -105,6 +110,8 @@
return "x86";
if (!strcmp(arch, "sun4u") || !strncmp(arch, "sparc", 5))
return "sparc";
+ if (!strcmp(arch, "aarch64") || !strcmp(arch, "arm64"))
+ return "arm64";
if (!strncmp(arch, "arm", 3) || !strcmp(arch, "sa110"))
return "arm";
if (!strncmp(arch, "s390", 4))
@@ -159,6 +166,8 @@
if (!strcmp(arch, "arm"))
path_list = arm_triplets;
+ else if (!strcmp(arch, "arm64"))
+ path_list = arm64_triplets;
else if (!strcmp(arch, "powerpc"))
path_list = powerpc_triplets;
else if (!strcmp(arch, "sh"))
diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
index a7c23a4..d73ef8b 100644
--- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c
+++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
@@ -15,6 +15,7 @@
#include "util/thread.h"
#include "util/callchain.h"
+#include "util/debug.h"
/*
* When saving the callchain on Power, the kernel conservatively saves
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 1ec429f..d4da692 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -36,7 +36,8 @@
struct perf_annotate {
struct perf_tool tool;
- bool force, use_tui, use_stdio, use_gtk;
+ struct perf_session *session;
+ bool use_tui, use_stdio, use_gtk;
bool full_paths;
bool print_line;
bool skip_missing;
@@ -188,18 +189,9 @@
static int __cmd_annotate(struct perf_annotate *ann)
{
int ret;
- struct perf_session *session;
+ struct perf_session *session = ann->session;
struct perf_evsel *pos;
u64 total_nr_samples;
- struct perf_data_file file = {
- .path = input_name,
- .mode = PERF_DATA_MODE_READ,
- .force = ann->force,
- };
-
- session = perf_session__new(&file, false, &ann->tool);
- if (session == NULL)
- return -ENOMEM;
machines__set_symbol_filter(&session->machines, symbol__annotate_init);
@@ -207,22 +199,22 @@
ret = perf_session__cpu_bitmap(session, ann->cpu_list,
ann->cpu_bitmap);
if (ret)
- goto out_delete;
+ goto out;
}
if (!objdump_path) {
ret = perf_session_env__lookup_objdump(&session->header.env);
if (ret)
- goto out_delete;
+ goto out;
}
ret = perf_session__process_events(session, &ann->tool);
if (ret)
- goto out_delete;
+ goto out;
if (dump_trace) {
perf_session__fprintf_nr_events(session, stdout);
- goto out_delete;
+ goto out;
}
if (verbose > 3)
@@ -250,8 +242,8 @@
}
if (total_nr_samples == 0) {
- ui__error("The %s file has no samples!\n", file.path);
- goto out_delete;
+ ui__error("The %s file has no samples!\n", session->file->path);
+ goto out;
}
if (use_browser == 2) {
@@ -261,24 +253,12 @@
"perf_gtk__show_annotations");
if (show_annotations == NULL) {
ui__error("GTK browser not found!\n");
- goto out_delete;
+ goto out;
}
show_annotations();
}
-out_delete:
- /*
- * Speed up the exit process, for large files this can
- * take quite a while.
- *
- * XXX Enable this when using valgrind or if we ever
- * librarize this command.
- *
- * Also experiment with obstacks to see how much speed
- * up we'll get here.
- *
- * perf_session__delete(session);
- */
+out:
return ret;
}
@@ -297,10 +277,14 @@
.comm = perf_event__process_comm,
.exit = perf_event__process_exit,
.fork = perf_event__process_fork,
- .ordered_samples = true,
+ .ordered_events = true,
.ordering_requires_timestamps = true,
},
};
+ struct perf_data_file file = {
+ .path = input_name,
+ .mode = PERF_DATA_MODE_READ,
+ };
const struct option options[] = {
OPT_STRING('i', "input", &input_name, "file",
"input file name"),
@@ -308,7 +292,7 @@
"only consider symbols in these dsos"),
OPT_STRING('s', "symbol", &annotate.sym_hist_filter, "symbol",
"symbol to annotate"),
- OPT_BOOLEAN('f', "force", &annotate.force, "don't complain, do it"),
+ OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
@@ -341,6 +325,7 @@
"Show event group information together"),
OPT_END()
};
+ int ret;
argc = parse_options(argc, argv, options, annotate_usage, 0);
@@ -353,11 +338,16 @@
setup_browser(true);
+ annotate.session = perf_session__new(&file, false, &annotate.tool);
+ if (annotate.session == NULL)
+ return -ENOMEM;
+
symbol_conf.priv_size = sizeof(struct annotation);
symbol_conf.try_vmlinux_path = true;
- if (symbol__init() < 0)
- return -1;
+ ret = symbol__init(&annotate.session->header.env);
+ if (ret < 0)
+ goto out_delete;
if (setup_sorting() < 0)
usage_with_options(annotate_usage, options);
@@ -373,5 +363,20 @@
annotate.sym_hist_filter = argv[0];
}
- return __cmd_annotate(&annotate);
+ ret = __cmd_annotate(&annotate);
+
+out_delete:
+ /*
+ * Speed up the exit process, for large files this can
+ * take quite a while.
+ *
+ * XXX Enable this when using valgrind or if we ever
+ * librarize this command.
+ *
+ * Also experiment with obstacks to see how much speed
+ * up we'll get here.
+ *
+ * perf_session__delete(session);
+ */
+ return ret;
}
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c
index 2a2c78f..7038575 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -246,20 +246,9 @@
return true;
}
-static int build_id_cache__fprintf_missing(const char *filename, bool force, FILE *fp)
+static int build_id_cache__fprintf_missing(struct perf_session *session, FILE *fp)
{
- struct perf_data_file file = {
- .path = filename,
- .mode = PERF_DATA_MODE_READ,
- .force = force,
- };
- struct perf_session *session = perf_session__new(&file, false, NULL);
- if (session == NULL)
- return -1;
-
perf_session__fprintf_dsos_buildid(session, fp, dso__missing_buildid_cache, 0);
- perf_session__delete(session);
-
return 0;
}
@@ -302,6 +291,12 @@
*missing_filename = NULL,
*update_name_list_str = NULL,
*kcore_filename;
+ char sbuf[STRERR_BUFSIZE];
+
+ struct perf_data_file file = {
+ .mode = PERF_DATA_MODE_READ,
+ };
+ struct perf_session *session = NULL;
const struct option buildid_cache_options[] = {
OPT_STRING('a', "add", &add_name_list_str,
@@ -326,8 +321,17 @@
argc = parse_options(argc, argv, buildid_cache_options,
buildid_cache_usage, 0);
- if (symbol__init() < 0)
- return -1;
+ if (missing_filename) {
+ file.path = missing_filename;
+ file.force = force;
+
+ session = perf_session__new(&file, false, NULL);
+ if (session == NULL)
+ return -1;
+ }
+
+ if (symbol__init(session ? &session->header.env : NULL) < 0)
+ goto out;
setup_pager();
@@ -344,7 +348,7 @@
continue;
}
pr_warning("Couldn't add %s: %s\n",
- pos->s, strerror(errno));
+ pos->s, strerror_r(errno, sbuf, sizeof(sbuf)));
}
strlist__delete(list);
@@ -362,7 +366,7 @@
continue;
}
pr_warning("Couldn't remove %s: %s\n",
- pos->s, strerror(errno));
+ pos->s, strerror_r(errno, sbuf, sizeof(sbuf)));
}
strlist__delete(list);
@@ -370,7 +374,7 @@
}
if (missing_filename)
- ret = build_id_cache__fprintf_missing(missing_filename, force, stdout);
+ ret = build_id_cache__fprintf_missing(session, stdout);
if (update_name_list_str) {
list = strlist__new(true, update_name_list_str);
@@ -383,7 +387,7 @@
continue;
}
pr_warning("Couldn't update %s: %s\n",
- pos->s, strerror(errno));
+ pos->s, strerror_r(errno, sbuf, sizeof(sbuf)));
}
strlist__delete(list);
@@ -394,5 +398,9 @@
build_id_cache__add_kcore(kcore_filename, debugdir, force))
pr_warning("Couldn't add %s\n", kcore_filename);
+out:
+ if (session)
+ perf_session__delete(session);
+
return ret;
}
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 9a5a035..190d0b6 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -360,7 +360,7 @@
.exit = perf_event__process_exit,
.fork = perf_event__process_fork,
.lost = perf_event__process_lost,
- .ordered_samples = true,
+ .ordered_events = true,
.ordering_requires_timestamps = true,
};
@@ -1143,7 +1143,7 @@
argc = parse_options(argc, argv, options, diff_usage, 0);
- if (symbol__init() < 0)
+ if (symbol__init(NULL) < 0)
return -1;
if (data_init(argc, argv) < 0)
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index 0384d93..25d2062 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -103,6 +103,8 @@
static void exec_woman_emacs(const char *path, const char *page)
{
+ char sbuf[STRERR_BUFSIZE];
+
if (!check_emacsclient_version()) {
/* This works only with emacsclient version >= 22. */
struct strbuf man_page = STRBUF_INIT;
@@ -111,16 +113,19 @@
path = "emacsclient";
strbuf_addf(&man_page, "(woman \"%s\")", page);
execlp(path, "emacsclient", "-e", man_page.buf, NULL);
- warning("failed to exec '%s': %s", path, strerror(errno));
+ warning("failed to exec '%s': %s", path,
+ strerror_r(errno, sbuf, sizeof(sbuf)));
}
}
static void exec_man_konqueror(const char *path, const char *page)
{
const char *display = getenv("DISPLAY");
+
if (display && *display) {
struct strbuf man_page = STRBUF_INIT;
const char *filename = "kfmclient";
+ char sbuf[STRERR_BUFSIZE];
/* It's simpler to launch konqueror using kfmclient. */
if (path) {
@@ -139,24 +144,31 @@
path = "kfmclient";
strbuf_addf(&man_page, "man:%s(1)", page);
execlp(path, filename, "newTab", man_page.buf, NULL);
- warning("failed to exec '%s': %s", path, strerror(errno));
+ warning("failed to exec '%s': %s", path,
+ strerror_r(errno, sbuf, sizeof(sbuf)));
}
}
static void exec_man_man(const char *path, const char *page)
{
+ char sbuf[STRERR_BUFSIZE];
+
if (!path)
path = "man";
execlp(path, "man", page, NULL);
- warning("failed to exec '%s': %s", path, strerror(errno));
+ warning("failed to exec '%s': %s", path,
+ strerror_r(errno, sbuf, sizeof(sbuf)));
}
static void exec_man_cmd(const char *cmd, const char *page)
{
struct strbuf shell_cmd = STRBUF_INIT;
+ char sbuf[STRERR_BUFSIZE];
+
strbuf_addf(&shell_cmd, "%s %s", cmd, page);
execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL);
- warning("failed to exec '%s': %s", cmd, strerror(errno));
+ warning("failed to exec '%s': %s", cmd,
+ strerror_r(errno, sbuf, sizeof(sbuf)));
}
static void add_man_viewer(const char *name)
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 9a02807..3a62b6b 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -23,6 +23,7 @@
struct perf_inject {
struct perf_tool tool;
+ struct perf_session *session;
bool build_ids;
bool sched_stat;
const char *input_name;
@@ -340,12 +341,8 @@
static int __cmd_inject(struct perf_inject *inject)
{
- struct perf_session *session;
int ret = -EINVAL;
- struct perf_data_file file = {
- .path = inject->input_name,
- .mode = PERF_DATA_MODE_READ,
- };
+ struct perf_session *session = inject->session;
struct perf_data_file *file_out = &inject->output;
signal(SIGINT, sig_handler);
@@ -357,16 +354,12 @@
inject->tool.tracing_data = perf_event__repipe_tracing_data;
}
- session = perf_session__new(&file, true, &inject->tool);
- if (session == NULL)
- return -ENOMEM;
-
if (inject->build_ids) {
inject->tool.sample = perf_event__inject_buildid;
} else if (inject->sched_stat) {
struct perf_evsel *evsel;
- inject->tool.ordered_samples = true;
+ inject->tool.ordered_events = true;
evlist__for_each(session->evlist, evsel) {
const char *name = perf_evsel__name(evsel);
@@ -396,8 +389,6 @@
perf_session__write_header(session, session->evlist, file_out->fd, true);
}
- perf_session__delete(session);
-
return ret;
}
@@ -427,6 +418,11 @@
.mode = PERF_DATA_MODE_WRITE,
},
};
+ struct perf_data_file file = {
+ .mode = PERF_DATA_MODE_READ,
+ };
+ int ret;
+
const struct option options[] = {
OPT_BOOLEAN('b', "build-ids", &inject.build_ids,
"Inject build-ids into the output stream"),
@@ -461,8 +457,17 @@
return -1;
}
- if (symbol__init() < 0)
+ file.path = inject.input_name;
+ inject.session = perf_session__new(&file, true, &inject.tool);
+ if (inject.session == NULL)
+ return -ENOMEM;
+
+ if (symbol__init(&inject.session->header.env) < 0)
return -1;
- return __cmd_inject(&inject);
+ ret = __cmd_inject(&inject);
+
+ perf_session__delete(inject.session);
+
+ return ret;
}
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index bef3376..2376218 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -256,7 +256,9 @@
static struct perf_tool perf_kmem = {
.sample = process_sample_event,
.comm = perf_event__process_comm,
- .ordered_samples = true,
+ .mmap = perf_event__process_mmap,
+ .mmap2 = perf_event__process_mmap2,
+ .ordered_events = true,
};
static double fragmentation(unsigned long n_req, unsigned long n_alloc)
@@ -403,10 +405,9 @@
__sort_result(&root_caller_stat, &root_caller_sorted, &caller_sort);
}
-static int __cmd_kmem(void)
+static int __cmd_kmem(struct perf_session *session)
{
int err = -EINVAL;
- struct perf_session *session;
const struct perf_evsel_str_handler kmem_tracepoints[] = {
{ "kmem:kmalloc", perf_evsel__process_alloc_event, },
{ "kmem:kmem_cache_alloc", perf_evsel__process_alloc_event, },
@@ -415,34 +416,22 @@
{ "kmem:kfree", perf_evsel__process_free_event, },
{ "kmem:kmem_cache_free", perf_evsel__process_free_event, },
};
- struct perf_data_file file = {
- .path = input_name,
- .mode = PERF_DATA_MODE_READ,
- };
-
- session = perf_session__new(&file, false, &perf_kmem);
- if (session == NULL)
- return -ENOMEM;
-
- if (perf_session__create_kernel_maps(session) < 0)
- goto out_delete;
if (!perf_session__has_traces(session, "kmem record"))
- goto out_delete;
+ goto out;
if (perf_session__set_tracepoints_handlers(session, kmem_tracepoints)) {
pr_err("Initializing perf session tracepoint handlers failed\n");
- return -1;
+ goto out;
}
setup_pager();
err = perf_session__process_events(session, &perf_kmem);
if (err != 0)
- goto out_delete;
+ goto out;
sort_result();
print_result(session);
-out_delete:
- perf_session__delete(session);
+out:
return err;
}
@@ -689,29 +678,46 @@
NULL,
NULL
};
+ struct perf_session *session;
+ struct perf_data_file file = {
+ .path = input_name,
+ .mode = PERF_DATA_MODE_READ,
+ };
+ int ret = -1;
+
argc = parse_options_subcommand(argc, argv, kmem_options,
kmem_subcommands, kmem_usage, 0);
if (!argc)
usage_with_options(kmem_usage, kmem_options);
- symbol__init();
-
if (!strncmp(argv[0], "rec", 3)) {
+ symbol__init(NULL);
return __cmd_record(argc, argv);
- } else if (!strcmp(argv[0], "stat")) {
+ }
+
+ session = perf_session__new(&file, false, &perf_kmem);
+ if (session == NULL)
+ return -ENOMEM;
+
+ symbol__init(&session->header.env);
+
+ if (!strcmp(argv[0], "stat")) {
if (cpu__setup_cpunode_map())
- return -1;
+ goto out_delete;
if (list_empty(&caller_sort))
setup_sorting(&caller_sort, default_sort_order);
if (list_empty(&alloc_sort))
setup_sorting(&alloc_sort, default_sort_order);
- return __cmd_kmem();
+ ret = __cmd_kmem(session);
} else
usage_with_options(kmem_usage, kmem_options);
- return 0;
+out_delete:
+ perf_session__delete(session);
+
+ return ret;
}
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 43367eb..1a4ef9c 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -592,8 +592,8 @@
pr_info("%9s ", "Samples%");
pr_info("%9s ", "Time%");
- pr_info("%10s ", "Min Time");
- pr_info("%10s ", "Max Time");
+ pr_info("%11s ", "Min Time");
+ pr_info("%11s ", "Max Time");
pr_info("%16s ", "Avg time");
pr_info("\n\n");
@@ -610,8 +610,8 @@
pr_info("%10llu ", (unsigned long long)ecount);
pr_info("%8.2f%% ", (double)ecount / kvm->total_count * 100);
pr_info("%8.2f%% ", (double)etime / kvm->total_time * 100);
- pr_info("%8" PRIu64 "us ", min / 1000);
- pr_info("%8" PRIu64 "us ", max / 1000);
+ pr_info("%9.2fus ", (double)min / 1e3);
+ pr_info("%9.2fus ", (double)max / 1e3);
pr_info("%9.2fus ( +-%7.2f%% )", (double)etime / ecount/1e3,
kvm_event_rel_stddev(vcpu, event));
pr_info("\n");
@@ -732,7 +732,7 @@
return -1;
}
- err = perf_session_queue_event(kvm->session, event, &sample, 0);
+ err = perf_session_queue_event(kvm->session, event, &kvm->tool, &sample, 0);
/*
* FIXME: Here we can't consume the event, as perf_session_queue_event will
* point to it, and it'll get possibly overwritten by the kernel.
@@ -785,7 +785,7 @@
/* flush queue after each round in which we processed events */
if (ntotal) {
- kvm->session->ordered_samples.next_flush = flush_time;
+ kvm->session->ordered_events.next_flush = flush_time;
err = kvm->tool.finished_round(&kvm->tool, NULL, kvm->session);
if (err) {
if (kvm->lost_events)
@@ -885,15 +885,11 @@
return 0;
}
-static
-int perf_kvm__handle_stdin(struct termios *tc_now, struct termios *tc_save)
+static int perf_kvm__handle_stdin(void)
{
int c;
- tcsetattr(0, TCSANOW, tc_now);
c = getc(stdin);
- tcsetattr(0, TCSAFLUSH, tc_save);
-
if (c == 'q')
return 1;
@@ -904,7 +900,7 @@
{
struct pollfd *pollfds = NULL;
int nr_fds, nr_stdin, ret, err = -EINVAL;
- struct termios tc, save;
+ struct termios save;
/* live flag must be set first */
kvm->live = true;
@@ -919,14 +915,9 @@
goto out;
}
+ set_term_quiet_input(&save);
init_kvm_event_record(kvm);
- tcgetattr(0, &save);
- tc = save;
- tc.c_lflag &= ~(ICANON | ECHO);
- tc.c_cc[VMIN] = 0;
- tc.c_cc[VTIME] = 0;
-
signal(SIGINT, sig_handler);
signal(SIGTERM, sig_handler);
@@ -972,7 +963,7 @@
goto out;
if (pollfds[nr_stdin].revents & POLLIN)
- done = perf_kvm__handle_stdin(&tc, &save);
+ done = perf_kvm__handle_stdin();
if (!rc && !done)
err = poll(pollfds, nr_fds, 100);
@@ -989,6 +980,7 @@
if (kvm->timerfd >= 0)
close(kvm->timerfd);
+ tcsetattr(0, TCSAFLUSH, &save);
free(pollfds);
return err;
}
@@ -998,6 +990,7 @@
int err, rc = -1;
struct perf_evsel *pos;
struct perf_evlist *evlist = kvm->evlist;
+ char sbuf[STRERR_BUFSIZE];
perf_evlist__config(evlist, &kvm->opts);
@@ -1034,12 +1027,14 @@
err = perf_evlist__open(evlist);
if (err < 0) {
- printf("Couldn't create the events: %s\n", strerror(errno));
+ printf("Couldn't create the events: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out;
}
if (perf_evlist__mmap(evlist, kvm->opts.mmap_pages, false) < 0) {
- ui__error("Failed to mmap the events: %s\n", strerror(errno));
+ ui__error("Failed to mmap the events: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
perf_evlist__close(evlist);
goto out;
}
@@ -1058,7 +1053,7 @@
struct perf_tool eops = {
.sample = process_sample_event,
.comm = perf_event__process_comm,
- .ordered_samples = true,
+ .ordered_events = true,
};
struct perf_data_file file = {
.path = kvm->file_name,
@@ -1072,6 +1067,8 @@
return -EINVAL;
}
+ symbol__init(&kvm->session->header.env);
+
if (!perf_session__has_traces(kvm->session, "kvm record"))
return -EINVAL;
@@ -1201,8 +1198,6 @@
NULL
};
- symbol__init();
-
if (argc) {
argc = parse_options(argc, argv,
kvm_events_report_options,
@@ -1311,7 +1306,7 @@
kvm->tool.exit = perf_event__process_exit;
kvm->tool.fork = perf_event__process_fork;
kvm->tool.lost = process_lost_event;
- kvm->tool.ordered_samples = true;
+ kvm->tool.ordered_events = true;
perf_tool__fill_defaults(&kvm->tool);
/* set defaults */
@@ -1322,7 +1317,7 @@
kvm->opts.target.uid_str = NULL;
kvm->opts.target.uid = UINT_MAX;
- symbol__init();
+ symbol__init(NULL);
disable_buildid_cache();
use_browser = 0;
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 6148afc..92790ed 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -852,7 +852,7 @@
struct perf_tool eops = {
.sample = process_sample_event,
.comm = perf_event__process_comm,
- .ordered_samples = true,
+ .ordered_events = true,
};
struct perf_data_file file = {
.path = input_name,
@@ -865,6 +865,8 @@
return -ENOMEM;
}
+ symbol__init(&session->header.env);
+
if (!perf_session__has_traces(session, "lock record"))
goto out_delete;
@@ -974,7 +976,6 @@
unsigned int i;
int rc = 0;
- symbol__init();
for (i = 0; i < LOCKHASH_SIZE; i++)
INIT_LIST_HEAD(lockhash_table + i);
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 4a1a6c9..8b4a87f 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -133,7 +133,7 @@
goto out_delete;
}
- if (symbol__init() < 0)
+ if (symbol__init(&session->header.env) < 0)
return -1;
printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
@@ -194,7 +194,7 @@
.lost = perf_event__process_lost,
.fork = perf_event__process_fork,
.build_id = perf_event__process_build_id,
- .ordered_samples = true,
+ .ordered_events = true,
},
.input_name = "perf.data",
};
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index c63fa29..347729e 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -290,8 +290,11 @@
static void pr_err_with_code(const char *msg, int err)
{
+ char sbuf[STRERR_BUFSIZE];
+
pr_err("%s", msg);
- pr_debug(" Reason: %s (Code: %d)", strerror(-err), err);
+ pr_debug(" Reason: %s (Code: %d)",
+ strerror_r(-err, sbuf, sizeof(sbuf)), err);
pr_err("\n");
}
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 4869050..87e28a4 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -161,7 +161,7 @@
if (perf_evlist__apply_filters(evlist)) {
error("failed to set filter with %d (%s)\n", errno,
- strerror(errno));
+ strerror_r(errno, msg, sizeof(msg)));
rc = -1;
goto out;
}
@@ -175,7 +175,8 @@
"(current value: %u)\n", opts->mmap_pages);
rc = -errno;
} else {
- pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
+ pr_err("failed to mmap with %d (%s)\n", errno,
+ strerror_r(errno, msg, sizeof(msg)));
rc = -errno;
}
goto out;
@@ -480,7 +481,7 @@
}
if (forks && workload_exec_errno) {
- char msg[512];
+ char msg[STRERR_BUFSIZE];
const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
pr_err("Workload failed: %s\n", emsg);
err = -1;
@@ -781,6 +782,7 @@
*/
static struct record record = {
.opts = {
+ .sample_time = true,
.mmap_pages = UINT_MAX,
.user_freq = UINT_MAX,
.user_interval = ULLONG_MAX,
@@ -907,7 +909,7 @@
usage_with_options(record_usage, record_options);
}
- symbol__init();
+ symbol__init(NULL);
if (symbol_conf.kptr_restrict)
pr_warning(
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 21d830b..3da59a8 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -58,17 +58,19 @@
const char *symbol_filter_str;
float min_percent;
u64 nr_entries;
+ u64 queue_size;
DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
};
static int report__config(const char *var, const char *value, void *cb)
{
+ struct report *rep = cb;
+
if (!strcmp(var, "report.group")) {
symbol_conf.event_group = perf_config_bool(var, value);
return 0;
}
if (!strcmp(var, "report.percent-limit")) {
- struct report *rep = cb;
rep->min_percent = strtof(value, NULL);
return 0;
}
@@ -76,6 +78,10 @@
symbol_conf.cumulate_callchain = perf_config_bool(var, value);
return 0;
}
+ if (!strcmp(var, "report.queue-size")) {
+ rep->queue_size = perf_config_u64(var, value);
+ return 0;
+ }
return perf_default_config(var, value, cb);
}
@@ -578,7 +584,7 @@
.attr = perf_event__process_attr,
.tracing_data = perf_event__process_tracing_data,
.build_id = perf_event__process_build_id,
- .ordered_samples = true,
+ .ordered_events = true,
.ordering_requires_timestamps = true,
},
.max_stack = PERF_MAX_STACK_DEPTH,
@@ -714,12 +720,17 @@
if (session == NULL)
return -ENOMEM;
+ if (report.queue_size) {
+ ordered_events__set_alloc_size(&session->ordered_events,
+ report.queue_size);
+ }
+
report.session = session;
has_br_stack = perf_header__has_feat(&session->header,
HEADER_BRANCH_STACK);
- if (branch_mode == -1 && has_br_stack) {
+ if ((branch_mode == -1 && has_br_stack) || branch_mode == 1) {
sort__mode = SORT_MODE__BRANCH;
symbol_conf.cumulate_callchain = false;
}
@@ -787,7 +798,7 @@
}
}
- if (symbol__init() < 0)
+ if (symbol__init(&session->header.env) < 0)
goto error;
if (argc) {
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index f83c08c..9c9287f 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -428,6 +428,7 @@
static int self_open_counters(void)
{
struct perf_event_attr attr;
+ char sbuf[STRERR_BUFSIZE];
int fd;
memset(&attr, 0, sizeof(attr));
@@ -440,7 +441,8 @@
if (fd < 0)
pr_err("Error: sys_perf_event_open() syscall returned "
- "with %d (%s)\n", fd, strerror(errno));
+ "with %d (%s)\n", fd,
+ strerror_r(errno, sbuf, sizeof(sbuf)));
return fd;
}
@@ -1462,6 +1464,8 @@
return -1;
}
+ symbol__init(&session->header.env);
+
if (perf_session__set_tracepoints_handlers(session, handlers))
goto out_delete;
@@ -1662,7 +1666,7 @@
.comm = perf_event__process_comm,
.lost = perf_event__process_lost,
.fork = perf_sched__process_fork_event,
- .ordered_samples = true,
+ .ordered_events = true,
},
.cmp_pid = LIST_HEAD_INIT(sched.cmp_pid),
.sort_list = LIST_HEAD_INIT(sched.sort_list),
@@ -1747,7 +1751,6 @@
if (!strcmp(argv[0], "script"))
return cmd_script(argc, argv, prefix);
- symbol__init();
if (!strncmp(argv[0], "rec", 3)) {
return __cmd_record(argc, argv);
} else if (!strncmp(argv[0], "lat", 3)) {
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index f57035b..02dce92 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -184,10 +184,6 @@
if (perf_evsel__check_stype(evsel, PERF_SAMPLE_IP, "IP",
PERF_OUTPUT_IP))
return -EINVAL;
-
- if (!no_callchain &&
- !(attr->sample_type & PERF_SAMPLE_CALLCHAIN))
- symbol_conf.use_callchain = false;
}
if (PRINT_FIELD(ADDR) &&
@@ -290,6 +286,19 @@
set_print_ip_opts(&evsel->attr);
}
+ if (!no_callchain) {
+ bool use_callchain = false;
+
+ evlist__for_each(session->evlist, evsel) {
+ if (evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
+ use_callchain = true;
+ break;
+ }
+ }
+ if (!use_callchain)
+ symbol_conf.use_callchain = false;
+ }
+
/*
* set default for tracepoints to print symbols only
* if callchains are present
@@ -476,6 +485,11 @@
return 0;
}
+static int default_flush_script(void)
+{
+ return 0;
+}
+
static int default_stop_script(void)
{
return 0;
@@ -489,6 +503,7 @@
static struct scripting_ops default_scripting_ops = {
.start_script = default_start_script,
+ .flush_script = default_flush_script,
.stop_script = default_stop_script,
.process_event = process_event,
.generate_script = default_generate_script,
@@ -504,6 +519,11 @@
scripting_ops = &default_scripting_ops;
}
+static int flush_scripting(void)
+{
+ return scripting_ops->flush_script();
+}
+
static int cleanup_scripting(void)
{
pr_debug("\nperf script stopped\n");
@@ -1471,12 +1491,13 @@
bool show_full_info = false;
bool header = false;
bool header_only = false;
+ bool script_started = false;
char *rec_script_path = NULL;
char *rep_script_path = NULL;
struct perf_session *session;
char *script_path = NULL;
const char **__argv;
- int i, j, err;
+ int i, j, err = 0;
struct perf_script script = {
.tool = {
.sample = process_sample_event,
@@ -1488,7 +1509,7 @@
.attr = process_attr,
.tracing_data = perf_event__process_tracing_data,
.build_id = perf_event__process_build_id,
- .ordered_samples = true,
+ .ordered_events = true,
.ordering_requires_timestamps = true,
},
};
@@ -1718,8 +1739,6 @@
exit(-1);
}
- if (symbol__init() < 0)
- return -1;
if (!script_name)
setup_pager();
@@ -1730,14 +1749,18 @@
if (header || header_only) {
perf_session__fprintf_info(session, stdout, show_full_info);
if (header_only)
- return 0;
+ goto out_delete;
}
+ if (symbol__init(&session->header.env) < 0)
+ goto out_delete;
+
script.session = session;
if (cpu_list) {
- if (perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap))
- return -1;
+ err = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap);
+ if (err < 0)
+ goto out_delete;
}
if (!no_callchain)
@@ -1752,53 +1775,62 @@
if (output_set_by_user()) {
fprintf(stderr,
"custom fields not supported for generated scripts");
- return -1;
+ err = -EINVAL;
+ goto out_delete;
}
input = open(file.path, O_RDONLY); /* input_name */
if (input < 0) {
+ err = -errno;
perror("failed to open file");
- return -1;
+ goto out_delete;
}
err = fstat(input, &perf_stat);
if (err < 0) {
perror("failed to stat file");
- return -1;
+ goto out_delete;
}
if (!perf_stat.st_size) {
fprintf(stderr, "zero-sized file, nothing to do!\n");
- return 0;
+ goto out_delete;
}
scripting_ops = script_spec__lookup(generate_script_lang);
if (!scripting_ops) {
fprintf(stderr, "invalid language specifier");
- return -1;
+ err = -ENOENT;
+ goto out_delete;
}
err = scripting_ops->generate_script(session->tevent.pevent,
"perf-script");
- goto out;
+ goto out_delete;
}
if (script_name) {
err = scripting_ops->start_script(script_name, argc, argv);
if (err)
- goto out;
+ goto out_delete;
pr_debug("perf script started with script %s\n\n", script_name);
+ script_started = true;
}
err = perf_session__check_output_opt(session);
if (err < 0)
- goto out;
+ goto out_delete;
err = __cmd_script(&script);
+ flush_scripting();
+
+out_delete:
perf_session__delete(session);
- cleanup_scripting();
+
+ if (script_started)
+ cleanup_scripting();
out:
return err;
}
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 3e80aa1..5fe0edb 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -593,7 +593,7 @@
if (perf_evlist__apply_filters(evsel_list)) {
error("failed to set filter with %d (%s)\n", errno,
- strerror(errno));
+ strerror_r(errno, msg, sizeof(msg)));
return -1;
}
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 2f1a522..48eea6c 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -1607,6 +1607,8 @@
if (session == NULL)
return -ENOMEM;
+ symbol__init(&session->header.env);
+
(void)perf_header__process_sections(&session->header,
perf_data_file__fd(session->file),
tchart,
@@ -1920,7 +1922,7 @@
.fork = process_fork_event,
.exit = process_exit_event,
.sample = process_sample_event,
- .ordered_samples = true,
+ .ordered_events = true,
},
.proc_num = 15,
.min_time = 1000000,
@@ -1982,8 +1984,6 @@
return -1;
}
- symbol__init();
-
if (argc && !strncmp(argv[0], "rec", 3)) {
argc = parse_options(argc, argv, record_options, record_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 377971d..9848e27 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -276,11 +276,17 @@
return;
}
+ if (top->zero) {
+ hists__delete_entries(&top->sym_evsel->hists);
+ } else {
+ hists__decay_entries(&top->sym_evsel->hists,
+ top->hide_user_symbols,
+ top->hide_kernel_symbols);
+ }
+
hists__collapse_resort(&top->sym_evsel->hists, NULL);
hists__output_resort(&top->sym_evsel->hists);
- hists__decay_entries(&top->sym_evsel->hists,
- top->hide_user_symbols,
- top->hide_kernel_symbols);
+
hists__output_recalc_col_len(&top->sym_evsel->hists,
top->print_entries - printed);
putchar('\n');
@@ -427,18 +433,13 @@
if (!perf_top__key_mapped(top, c)) {
struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
- struct termios tc, save;
+ struct termios save;
perf_top__print_mapped_keys(top);
fprintf(stdout, "\nEnter selection, or unmapped key to continue: ");
fflush(stdout);
- tcgetattr(0, &save);
- tc = save;
- tc.c_lflag &= ~(ICANON | ECHO);
- tc.c_cc[VMIN] = 0;
- tc.c_cc[VTIME] = 0;
- tcsetattr(0, TCSANOW, &tc);
+ set_term_quiet_input(&save);
poll(&stdin_poll, 1, -1);
c = getc(stdin);
@@ -542,11 +543,16 @@
if (t->evlist->selected != NULL)
t->sym_evsel = t->evlist->selected;
+ if (t->zero) {
+ hists__delete_entries(&t->sym_evsel->hists);
+ } else {
+ hists__decay_entries(&t->sym_evsel->hists,
+ t->hide_user_symbols,
+ t->hide_kernel_symbols);
+ }
+
hists__collapse_resort(&t->sym_evsel->hists, NULL);
hists__output_resort(&t->sym_evsel->hists);
- hists__decay_entries(&t->sym_evsel->hists,
- t->hide_user_symbols,
- t->hide_kernel_symbols);
}
static void *display_thread_tui(void *arg)
@@ -577,23 +583,32 @@
return NULL;
}
+static void display_sig(int sig __maybe_unused)
+{
+ done = 1;
+}
+
+static void display_setup_sig(void)
+{
+ signal(SIGSEGV, display_sig);
+ signal(SIGFPE, display_sig);
+ signal(SIGINT, display_sig);
+ signal(SIGQUIT, display_sig);
+ signal(SIGTERM, display_sig);
+}
+
static void *display_thread(void *arg)
{
struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
- struct termios tc, save;
+ struct termios save;
struct perf_top *top = arg;
int delay_msecs, c;
- tcgetattr(0, &save);
- tc = save;
- tc.c_lflag &= ~(ICANON | ECHO);
- tc.c_cc[VMIN] = 0;
- tc.c_cc[VTIME] = 0;
-
+ display_setup_sig();
pthread__unblock_sigwinch();
repeat:
delay_msecs = top->delay_secs * 1000;
- tcsetattr(0, TCSANOW, &tc);
+ set_term_quiet_input(&save);
/* trash return*/
getc(stdin);
@@ -620,13 +635,16 @@
}
}
+ tcsetattr(0, TCSAFLUSH, &save);
return NULL;
}
-static int symbol_filter(struct map *map __maybe_unused, struct symbol *sym)
+static int symbol_filter(struct map *map, struct symbol *sym)
{
const char *name = sym->name;
+ if (!map->dso->kernel)
+ return 0;
/*
* ppc64 uses function descriptors and appends a '.' to the
* start of every instruction address. Remove it.
@@ -876,7 +894,7 @@
if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
ui__error("Failed to mmap with %d (%s)\n",
- errno, strerror(errno));
+ errno, strerror_r(errno, msg, sizeof(msg)));
goto out_err;
}
@@ -963,7 +981,7 @@
param.sched_priority = top->realtime_prio;
if (sched_setscheduler(0, SCHED_FIFO, ¶m)) {
ui__error("Could not set realtime priority.\n");
- goto out_delete;
+ goto out_join;
}
}
@@ -977,6 +995,8 @@
}
ret = 0;
+out_join:
+ pthread_join(thread, NULL);
out_delete:
perf_session__delete(top->session);
top->session = NULL;
@@ -1131,6 +1151,9 @@
"Don't show entries under that percent", parse_percent_limit),
OPT_CALLBACK(0, "percentage", NULL, "relative|absolute",
"How to display percentage of filtered entries", parse_filter_percentage),
+ OPT_STRING('w', "column-widths", &symbol_conf.col_width_list_str,
+ "width[,width...]",
+ "don't try to adjust column width, use these fixed values"),
OPT_END()
};
const char * const top_usage[] = {
@@ -1217,7 +1240,7 @@
symbol_conf.priv_size = sizeof(struct annotation);
symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
- if (symbol__init() < 0)
+ if (symbol__init(NULL) < 0)
return -1;
sort__setup_elide(stdout);
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index a6c3752..a9e96ff 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -402,6 +402,31 @@
#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
+static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
+ struct syscall_arg *arg)
+{
+ int printed = 0, flags = arg->val;
+
+#define P_MREMAP_FLAG(n) \
+ if (flags & MREMAP_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+ flags &= ~MREMAP_##n; \
+ }
+
+ P_MREMAP_FLAG(MAYMOVE);
+#ifdef MREMAP_FIXED
+ P_MREMAP_FLAG(FIXED);
+#endif
+#undef P_MREMAP_FLAG
+
+ if (flags)
+ printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+ return printed;
+}
+
+#define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
+
static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
struct syscall_arg *arg)
{
@@ -1004,6 +1029,7 @@
[2] = SCA_MMAP_PROT, /* prot */ }, },
{ .name = "mremap", .hexret = true,
.arg_scnprintf = { [0] = SCA_HEX, /* addr */
+ [3] = SCA_MREMAP_FLAGS, /* flags */
[4] = SCA_HEX, /* new_addr */ }, },
{ .name = "munlock", .errmsg = true,
.arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
@@ -1385,7 +1411,7 @@
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
- int err = symbol__init();
+ int err = symbol__init(NULL);
if (err)
return err;
@@ -1724,7 +1750,7 @@
signed_print:
fprintf(trace->output, ") = %d", ret);
} else if (ret < 0 && sc->fmt->errmsg) {
- char bf[256];
+ char bf[STRERR_BUFSIZE];
const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
*e = audit_errno_to_name(-ret);
@@ -2018,6 +2044,7 @@
int err = -1, i;
unsigned long before;
const bool forks = argc > 0;
+ char sbuf[STRERR_BUFSIZE];
trace->live = true;
@@ -2079,7 +2106,8 @@
err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
if (err < 0) {
- fprintf(trace->output, "Couldn't mmap the events: %s\n", strerror(errno));
+ fprintf(trace->output, "Couldn't mmap the events: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_delete_evlist;
}
@@ -2209,19 +2237,19 @@
trace->tool.tracing_data = perf_event__process_tracing_data;
trace->tool.build_id = perf_event__process_build_id;
- trace->tool.ordered_samples = true;
+ trace->tool.ordered_events = true;
trace->tool.ordering_requires_timestamps = true;
/* add tid to output */
trace->multiple_threads = true;
- if (symbol__init() < 0)
- return -1;
-
session = perf_session__new(&file, false, &trace->tool);
if (session == NULL)
return -ENOMEM;
+ if (symbol__init(&session->header.env) < 0)
+ goto out;
+
trace->host = &session->machines.host;
err = perf_session__set_tracepoints_handlers(session, handlers);
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index 1f67aa0..75d4c23 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -120,6 +120,29 @@
CFLAGS += -DPARSER_DEBUG
endif
+ifndef NO_LIBPYTHON
+ # Try different combinations to accommodate systems that only have
+ # python[2][-config] in weird combinations but always preferring
+ # python2 and python2-config as per pep-0394. If we catch a
+ # python[-config] in version 3, the version check will kill it.
+ PYTHON2 := $(if $(call get-executable,python2),python2,python)
+ override PYTHON := $(call get-executable-or-default,PYTHON,$(PYTHON2))
+ PYTHON2_CONFIG := \
+ $(if $(call get-executable,$(PYTHON)-config),$(PYTHON)-config,python-config)
+ override PYTHON_CONFIG := \
+ $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON2_CONFIG))
+
+ PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
+
+ PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
+ PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+
+ FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS)
+ FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
+ FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS)
+ FEATURE_CHECK_LDFLAGS-libpython-version := $(PYTHON_EMBED_LDOPTS)
+endif
+
CFLAGS += -fno-omit-frame-pointer
CFLAGS += -ggdb3
CFLAGS += -funwind-tables
@@ -482,21 +505,14 @@
NO_LIBPYTHON := 1
endef
-override PYTHON := \
- $(call get-executable-or-default,PYTHON,python)
-
-ifndef PYTHON
- $(call disable-python,python interpreter)
+ifdef NO_LIBPYTHON
+ $(call disable-python)
else
- PYTHON_WORD := $(call shell-wordify,$(PYTHON))
-
- ifdef NO_LIBPYTHON
- $(call disable-python)
+ ifndef PYTHON
+ $(call disable-python,python interpreter)
else
-
- override PYTHON_CONFIG := \
- $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON)-config)
+ PYTHON_WORD := $(call shell-wordify,$(PYTHON))
ifndef PYTHON_CONFIG
$(call disable-python,python-config tool)
diff --git a/tools/perf/config/feature-checks/Makefile b/tools/perf/config/feature-checks/Makefile
index 6088f8d..72ab298 100644
--- a/tools/perf/config/feature-checks/Makefile
+++ b/tools/perf/config/feature-checks/Makefile
@@ -101,25 +101,11 @@
test-libperl.bin:
$(BUILD) $(FLAGS_PERL_EMBED)
-override PYTHON := python
-override PYTHON_CONFIG := python-config
-
-escape-for-shell-sq = $(subst ','\'',$(1))
-shell-sq = '$(escape-for-shell-sq)'
-
-PYTHON_CONFIG_SQ = $(call shell-sq,$(PYTHON_CONFIG))
-
-PYTHON_EMBED_LDOPTS = $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
-PYTHON_EMBED_LDFLAGS = $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
-PYTHON_EMBED_LIBADD = $(call grep-libs,$(PYTHON_EMBED_LDOPTS))
-PYTHON_EMBED_CCOPTS = $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
-FLAGS_PYTHON_EMBED = $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
-
test-libpython.bin:
- $(BUILD) $(FLAGS_PYTHON_EMBED)
+ $(BUILD)
test-libpython-version.bin:
- $(BUILD) $(FLAGS_PYTHON_EMBED)
+ $(BUILD)
test-libbfd.bin:
$(BUILD) -DPACKAGE='"perf"' -lbfd -lz -liberty -ldl
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 2282d41..452a847 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -313,6 +313,7 @@
int status;
struct stat st;
const char *prefix;
+ char sbuf[STRERR_BUFSIZE];
prefix = NULL;
if (p->option & RUN_SETUP)
@@ -343,7 +344,8 @@
status = 1;
/* Check for ENOSPC and EIO errors.. */
if (fflush(stdout)) {
- fprintf(stderr, "write failure on standard output: %s", strerror(errno));
+ fprintf(stderr, "write failure on standard output: %s",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out;
}
if (ferror(stdout)) {
@@ -351,7 +353,8 @@
goto out;
}
if (fclose(stdout)) {
- fprintf(stderr, "close failed on standard output: %s", strerror(errno));
+ fprintf(stderr, "close failed on standard output: %s",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out;
}
status = 0;
@@ -466,6 +469,7 @@
int main(int argc, const char **argv)
{
const char *cmd;
+ char sbuf[STRERR_BUFSIZE];
/* The page_size is placed in util object. */
page_size = sysconf(_SC_PAGE_SIZE);
@@ -561,7 +565,7 @@
}
fprintf(stderr, "Failed to run command '%s': %s\n",
- cmd, strerror(errno));
+ cmd, strerror_r(errno, sbuf, sizeof(sbuf)));
out:
return 1;
}
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 6f8b01b..6a4145e 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -154,6 +154,10 @@
.func = test__hists_cumulate,
},
{
+ .desc = "Test tracking with sched_switch",
+ .func = test__switch_tracking,
+ },
+ {
.func = NULL,
},
};
@@ -185,9 +189,11 @@
static int run_test(struct test *test)
{
int status, err = -1, child = fork();
+ char sbuf[STRERR_BUFSIZE];
if (child < 0) {
- pr_err("failed to fork test: %s\n", strerror(errno));
+ pr_err("failed to fork test: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
return -1;
}
@@ -297,7 +303,7 @@
symbol_conf.sort_by_name = true;
symbol_conf.try_vmlinux_path = true;
- if (symbol__init() < 0)
+ if (symbol__init(NULL) < 0)
return -1;
if (skip != NULL)
diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c
index 1422634..9b9622a 100644
--- a/tools/perf/tests/mmap-basic.c
+++ b/tools/perf/tests/mmap-basic.c
@@ -31,6 +31,7 @@
unsigned int nr_events[nsyscalls],
expected_nr_events[nsyscalls], i, j;
struct perf_evsel *evsels[nsyscalls], *evsel;
+ char sbuf[STRERR_BUFSIZE];
threads = thread_map__new(-1, getpid(), UINT_MAX);
if (threads == NULL) {
@@ -49,7 +50,7 @@
sched_setaffinity(0, sizeof(cpu_set), &cpu_set);
if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
pr_debug("sched_setaffinity() failed on CPU %d: %s ",
- cpus->map[0], strerror(errno));
+ cpus->map[0], strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_free_cpus;
}
@@ -79,7 +80,7 @@
if (perf_evsel__open(evsels[i], cpus, threads) < 0) {
pr_debug("failed to open counter: %s, "
"tweak /proc/sys/kernel/perf_event_paranoid?\n",
- strerror(errno));
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_delete_evlist;
}
@@ -89,7 +90,7 @@
if (perf_evlist__mmap(evlist, 128, true) < 0) {
pr_debug("failed to mmap events: %d (%s)\n", errno,
- strerror(errno));
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_delete_evlist;
}
diff --git a/tools/perf/tests/open-syscall-all-cpus.c b/tools/perf/tests/open-syscall-all-cpus.c
index 5fecdbd..8fa82d1 100644
--- a/tools/perf/tests/open-syscall-all-cpus.c
+++ b/tools/perf/tests/open-syscall-all-cpus.c
@@ -12,6 +12,7 @@
unsigned int nr_open_calls = 111, i;
cpu_set_t cpu_set;
struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX);
+ char sbuf[STRERR_BUFSIZE];
if (threads == NULL) {
pr_debug("thread_map__new\n");
@@ -35,7 +36,7 @@
if (perf_evsel__open(evsel, cpus, threads) < 0) {
pr_debug("failed to open counter: %s, "
"tweak /proc/sys/kernel/perf_event_paranoid?\n",
- strerror(errno));
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_evsel_delete;
}
@@ -56,7 +57,7 @@
if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) {
pr_debug("sched_setaffinity() failed on CPU %d: %s ",
cpus->map[cpu],
- strerror(errno));
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_close_fd;
}
for (i = 0; i < ncalls; ++i) {
diff --git a/tools/perf/tests/open-syscall-tp-fields.c b/tools/perf/tests/open-syscall-tp-fields.c
index 0785b64..922bdb6 100644
--- a/tools/perf/tests/open-syscall-tp-fields.c
+++ b/tools/perf/tests/open-syscall-tp-fields.c
@@ -22,6 +22,7 @@
struct perf_evlist *evlist = perf_evlist__new();
struct perf_evsel *evsel;
int err = -1, i, nr_events = 0, nr_polls = 0;
+ char sbuf[STRERR_BUFSIZE];
if (evlist == NULL) {
pr_debug("%s: perf_evlist__new\n", __func__);
@@ -48,13 +49,15 @@
err = perf_evlist__open(evlist);
if (err < 0) {
- pr_debug("perf_evlist__open: %s\n", strerror(errno));
+ pr_debug("perf_evlist__open: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_delete_evlist;
}
err = perf_evlist__mmap(evlist, UINT_MAX, false);
if (err < 0) {
- pr_debug("perf_evlist__mmap: %s\n", strerror(errno));
+ pr_debug("perf_evlist__mmap: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_delete_evlist;
}
diff --git a/tools/perf/tests/open-syscall.c b/tools/perf/tests/open-syscall.c
index c1dc7d2..a33b2da 100644
--- a/tools/perf/tests/open-syscall.c
+++ b/tools/perf/tests/open-syscall.c
@@ -9,6 +9,7 @@
struct perf_evsel *evsel;
unsigned int nr_open_calls = 111, i;
struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX);
+ char sbuf[STRERR_BUFSIZE];
if (threads == NULL) {
pr_debug("thread_map__new\n");
@@ -24,7 +25,7 @@
if (perf_evsel__open_per_thread(evsel, threads) < 0) {
pr_debug("failed to open counter: %s, "
"tweak /proc/sys/kernel/perf_event_paranoid?\n",
- strerror(errno));
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_evsel_delete;
}
diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c
index aca1a83..2ce753c 100644
--- a/tools/perf/tests/perf-record.c
+++ b/tools/perf/tests/perf-record.c
@@ -59,6 +59,7 @@
int err = -1, errs = 0, i, wakeups = 0;
u32 cpu;
int total_events = 0, nr_events[PERF_RECORD_MAX] = { 0, };
+ char sbuf[STRERR_BUFSIZE];
if (evlist == NULL || argv == NULL) {
pr_debug("Not enough memory to create evlist\n");
@@ -100,7 +101,8 @@
err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask);
if (err < 0) {
- pr_debug("sched__get_first_possible_cpu: %s\n", strerror(errno));
+ pr_debug("sched__get_first_possible_cpu: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_delete_evlist;
}
@@ -110,7 +112,8 @@
* So that we can check perf_sample.cpu on all the samples.
*/
if (sched_setaffinity(evlist->workload.pid, cpu_mask_size, &cpu_mask) < 0) {
- pr_debug("sched_setaffinity: %s\n", strerror(errno));
+ pr_debug("sched_setaffinity: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_delete_evlist;
}
@@ -120,7 +123,8 @@
*/
err = perf_evlist__open(evlist);
if (err < 0) {
- pr_debug("perf_evlist__open: %s\n", strerror(errno));
+ pr_debug("perf_evlist__open: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_delete_evlist;
}
@@ -131,7 +135,8 @@
*/
err = perf_evlist__mmap(evlist, opts.mmap_pages, false);
if (err < 0) {
- pr_debug("perf_evlist__mmap: %s\n", strerror(errno));
+ pr_debug("perf_evlist__mmap: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_delete_evlist;
}
diff --git a/tools/perf/tests/rdpmc.c b/tools/perf/tests/rdpmc.c
index c04d1f2..d31f2c4 100644
--- a/tools/perf/tests/rdpmc.c
+++ b/tools/perf/tests/rdpmc.c
@@ -100,6 +100,7 @@
};
u64 delta_sum = 0;
struct sigaction sa;
+ char sbuf[STRERR_BUFSIZE];
sigfillset(&sa.sa_mask);
sa.sa_sigaction = segfault_handler;
@@ -109,14 +110,15 @@
perf_event_open_cloexec_flag());
if (fd < 0) {
pr_err("Error: sys_perf_event_open() syscall returned "
- "with %d (%s)\n", fd, strerror(errno));
+ "with %d (%s)\n", fd,
+ strerror_r(errno, sbuf, sizeof(sbuf)));
return -1;
}
addr = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
if (addr == (void *)(-1)) {
pr_err("Error: mmap() syscall returned with (%s)\n",
- strerror(errno));
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_close;
}
diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c
index 983d6b8..1aa21c9 100644
--- a/tools/perf/tests/sw-clock.c
+++ b/tools/perf/tests/sw-clock.c
@@ -22,6 +22,7 @@
volatile int tmp = 0;
u64 total_periods = 0;
int nr_samples = 0;
+ char sbuf[STRERR_BUFSIZE];
union perf_event *event;
struct perf_evsel *evsel;
struct perf_evlist *evlist;
@@ -62,14 +63,15 @@
err = -errno;
pr_debug("Couldn't open evlist: %s\nHint: check %s, using %" PRIu64 " in this test.\n",
- strerror(errno), knob, (u64)attr.sample_freq);
+ strerror_r(errno, sbuf, sizeof(sbuf)),
+ knob, (u64)attr.sample_freq);
goto out_delete_evlist;
}
err = perf_evlist__mmap(evlist, 128, true);
if (err < 0) {
pr_debug("failed to mmap event: %d (%s)\n", errno,
- strerror(errno));
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_delete_evlist;
}
diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c
new file mode 100644
index 0000000..cc68648
--- /dev/null
+++ b/tools/perf/tests/switch-tracking.c
@@ -0,0 +1,573 @@
+#include <sys/time.h>
+#include <sys/prctl.h>
+#include <time.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "parse-events.h"
+#include "evlist.h"
+#include "evsel.h"
+#include "thread_map.h"
+#include "cpumap.h"
+#include "tests.h"
+
+static int spin_sleep(void)
+{
+ struct timeval start, now, diff, maxtime;
+ struct timespec ts;
+ int err, i;
+
+ maxtime.tv_sec = 0;
+ maxtime.tv_usec = 50000;
+
+ err = gettimeofday(&start, NULL);
+ if (err)
+ return err;
+
+ /* Spin for 50ms */
+ while (1) {
+ for (i = 0; i < 1000; i++)
+ barrier();
+
+ err = gettimeofday(&now, NULL);
+ if (err)
+ return err;
+
+ timersub(&now, &start, &diff);
+ if (timercmp(&diff, &maxtime, > /* For checkpatch */))
+ break;
+ }
+
+ ts.tv_nsec = 50 * 1000 * 1000;
+ ts.tv_sec = 0;
+
+ /* Sleep for 50ms */
+ err = nanosleep(&ts, NULL);
+	if (err && errno == EINTR)
+ err = 0;
+
+ return err;
+}
+
+struct switch_tracking {
+ struct perf_evsel *switch_evsel;
+ struct perf_evsel *cycles_evsel;
+ pid_t *tids;
+ int nr_tids;
+ int comm_seen[4];
+ int cycles_before_comm_1;
+ int cycles_between_comm_2_and_comm_3;
+ int cycles_after_comm_4;
+};
+
+static int check_comm(struct switch_tracking *switch_tracking,
+ union perf_event *event, const char *comm, int nr)
+{
+ if (event->header.type == PERF_RECORD_COMM &&
+ (pid_t)event->comm.pid == getpid() &&
+ (pid_t)event->comm.tid == getpid() &&
+ strcmp(event->comm.comm, comm) == 0) {
+ if (switch_tracking->comm_seen[nr]) {
+ pr_debug("Duplicate comm event\n");
+ return -1;
+ }
+ switch_tracking->comm_seen[nr] = 1;
+ pr_debug3("comm event: %s nr: %d\n", event->comm.comm, nr);
+ return 1;
+ }
+ return 0;
+}
+
+static int check_cpu(struct switch_tracking *switch_tracking, int cpu)
+{
+ int i, nr = cpu + 1;
+
+ if (cpu < 0)
+ return -1;
+
+ if (!switch_tracking->tids) {
+ switch_tracking->tids = calloc(nr, sizeof(pid_t));
+ if (!switch_tracking->tids)
+ return -1;
+ for (i = 0; i < nr; i++)
+ switch_tracking->tids[i] = -1;
+ switch_tracking->nr_tids = nr;
+ return 0;
+ }
+
+ if (cpu >= switch_tracking->nr_tids) {
+ void *addr;
+
+ addr = realloc(switch_tracking->tids, nr * sizeof(pid_t));
+ if (!addr)
+ return -1;
+ switch_tracking->tids = addr;
+ for (i = switch_tracking->nr_tids; i < nr; i++)
+ switch_tracking->tids[i] = -1;
+ switch_tracking->nr_tids = nr;
+ return 0;
+ }
+
+ return 0;
+}
+
+static int process_sample_event(struct perf_evlist *evlist,
+ union perf_event *event,
+ struct switch_tracking *switch_tracking)
+{
+ struct perf_sample sample;
+ struct perf_evsel *evsel;
+ pid_t next_tid, prev_tid;
+ int cpu, err;
+
+ if (perf_evlist__parse_sample(evlist, event, &sample)) {
+ pr_debug("perf_evlist__parse_sample failed\n");
+ return -1;
+ }
+
+ evsel = perf_evlist__id2evsel(evlist, sample.id);
+ if (evsel == switch_tracking->switch_evsel) {
+ next_tid = perf_evsel__intval(evsel, &sample, "next_pid");
+ prev_tid = perf_evsel__intval(evsel, &sample, "prev_pid");
+ cpu = sample.cpu;
+ pr_debug3("sched_switch: cpu: %d prev_tid %d next_tid %d\n",
+ cpu, prev_tid, next_tid);
+ err = check_cpu(switch_tracking, cpu);
+ if (err)
+ return err;
+ /*
+ * Check for no missing sched_switch events i.e. that the
+ * evsel->system_wide flag has worked.
+ */
+ if (switch_tracking->tids[cpu] != -1 &&
+ switch_tracking->tids[cpu] != prev_tid) {
+ pr_debug("Missing sched_switch events\n");
+ return -1;
+ }
+ switch_tracking->tids[cpu] = next_tid;
+ }
+
+ if (evsel == switch_tracking->cycles_evsel) {
+ pr_debug3("cycles event\n");
+ if (!switch_tracking->comm_seen[0])
+ switch_tracking->cycles_before_comm_1 = 1;
+ if (switch_tracking->comm_seen[1] &&
+ !switch_tracking->comm_seen[2])
+ switch_tracking->cycles_between_comm_2_and_comm_3 = 1;
+ if (switch_tracking->comm_seen[3])
+ switch_tracking->cycles_after_comm_4 = 1;
+ }
+
+ return 0;
+}
+
+static int process_event(struct perf_evlist *evlist, union perf_event *event,
+ struct switch_tracking *switch_tracking)
+{
+ if (event->header.type == PERF_RECORD_SAMPLE)
+ return process_sample_event(evlist, event, switch_tracking);
+
+ if (event->header.type == PERF_RECORD_COMM) {
+ int err, done = 0;
+
+ err = check_comm(switch_tracking, event, "Test COMM 1", 0);
+ if (err < 0)
+ return -1;
+ done += err;
+ err = check_comm(switch_tracking, event, "Test COMM 2", 1);
+ if (err < 0)
+ return -1;
+ done += err;
+ err = check_comm(switch_tracking, event, "Test COMM 3", 2);
+ if (err < 0)
+ return -1;
+ done += err;
+ err = check_comm(switch_tracking, event, "Test COMM 4", 3);
+ if (err < 0)
+ return -1;
+ done += err;
+ if (done != 1) {
+ pr_debug("Unexpected comm event\n");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+struct event_node {
+ struct list_head list;
+ union perf_event *event;
+ u64 event_time;
+};
+
+static int add_event(struct perf_evlist *evlist, struct list_head *events,
+ union perf_event *event)
+{
+ struct perf_sample sample;
+ struct event_node *node;
+
+ node = malloc(sizeof(struct event_node));
+ if (!node) {
+ pr_debug("malloc failed\n");
+ return -1;
+ }
+ node->event = event;
+ list_add(&node->list, events);
+
+ if (perf_evlist__parse_sample(evlist, event, &sample)) {
+ pr_debug("perf_evlist__parse_sample failed\n");
+ return -1;
+ }
+
+ if (!sample.time) {
+ pr_debug("event with no time\n");
+ return -1;
+ }
+
+ node->event_time = sample.time;
+
+ return 0;
+}
+
+static void free_event_nodes(struct list_head *events)
+{
+ struct event_node *node;
+
+ while (!list_empty(events)) {
+ node = list_entry(events->next, struct event_node, list);
+ list_del(&node->list);
+ free(node);
+ }
+}
+
+static int compar(const void *a, const void *b)
+{
+	const struct event_node *nodea = a;
+	const struct event_node *nodeb = b;
+	s64 cmp = nodea->event_time - nodeb->event_time;
+
+	return cmp < 0 ? -1 : cmp > 0;
+}
+
+static int process_events(struct perf_evlist *evlist,
+ struct switch_tracking *switch_tracking)
+{
+ union perf_event *event;
+ unsigned pos, cnt = 0;
+ LIST_HEAD(events);
+ struct event_node *events_array, *node;
+ int i, ret;
+
+ for (i = 0; i < evlist->nr_mmaps; i++) {
+ while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
+ cnt += 1;
+ ret = add_event(evlist, &events, event);
+ perf_evlist__mmap_consume(evlist, i);
+ if (ret < 0)
+ goto out_free_nodes;
+ }
+ }
+
+ events_array = calloc(cnt, sizeof(struct event_node));
+ if (!events_array) {
+ pr_debug("calloc failed\n");
+ ret = -1;
+ goto out_free_nodes;
+ }
+
+ pos = 0;
+ list_for_each_entry(node, &events, list)
+ events_array[pos++] = *node;
+
+ qsort(events_array, cnt, sizeof(struct event_node), compar);
+
+ for (pos = 0; pos < cnt; pos++) {
+ ret = process_event(evlist, events_array[pos].event,
+ switch_tracking);
+ if (ret < 0)
+ goto out_free;
+ }
+
+ ret = 0;
+out_free:
+ pr_debug("%u events recorded\n", cnt);
+ free(events_array);
+out_free_nodes:
+ free_event_nodes(&events);
+ return ret;
+}
+
+/**
+ * test__switch_tracking - test using sched_switch and tracking events.
+ *
+ * This function implements a test that checks that sched_switch events and
+ * tracking events can be recorded for a workload (current process) using the
+ * evsel->system_wide and evsel->tracking flags (respectively) with other events
+ * sometimes enabled or disabled.
+ */
+int test__switch_tracking(void)
+{
+ const char *sched_switch = "sched:sched_switch";
+ struct switch_tracking switch_tracking = { .tids = NULL, };
+ struct record_opts opts = {
+ .mmap_pages = UINT_MAX,
+ .user_freq = UINT_MAX,
+ .user_interval = ULLONG_MAX,
+ .freq = 4000,
+ .target = {
+ .uses_mmap = true,
+ },
+ };
+ struct thread_map *threads = NULL;
+ struct cpu_map *cpus = NULL;
+ struct perf_evlist *evlist = NULL;
+ struct perf_evsel *evsel, *cpu_clocks_evsel, *cycles_evsel;
+ struct perf_evsel *switch_evsel, *tracking_evsel;
+ const char *comm;
+ int err = -1;
+
+ threads = thread_map__new(-1, getpid(), UINT_MAX);
+ if (!threads) {
+ pr_debug("thread_map__new failed!\n");
+ goto out_err;
+ }
+
+ cpus = cpu_map__new(NULL);
+ if (!cpus) {
+ pr_debug("cpu_map__new failed!\n");
+ goto out_err;
+ }
+
+ evlist = perf_evlist__new();
+ if (!evlist) {
+ pr_debug("perf_evlist__new failed!\n");
+ goto out_err;
+ }
+
+ perf_evlist__set_maps(evlist, cpus, threads);
+
+ /* First event */
+ err = parse_events(evlist, "cpu-clock:u");
+ if (err) {
+		pr_debug("Failed to parse event cpu-clock:u\n");
+ goto out_err;
+ }
+
+ cpu_clocks_evsel = perf_evlist__last(evlist);
+
+ /* Second event */
+ err = parse_events(evlist, "cycles:u");
+ if (err) {
+ pr_debug("Failed to parse event cycles:u\n");
+ goto out_err;
+ }
+
+ cycles_evsel = perf_evlist__last(evlist);
+
+ /* Third event */
+ if (!perf_evlist__can_select_event(evlist, sched_switch)) {
+ fprintf(stderr, " (no sched_switch)");
+ err = 0;
+ goto out;
+ }
+
+ err = parse_events(evlist, sched_switch);
+ if (err) {
+ pr_debug("Failed to parse event %s\n", sched_switch);
+ goto out_err;
+ }
+
+ switch_evsel = perf_evlist__last(evlist);
+
+ perf_evsel__set_sample_bit(switch_evsel, CPU);
+ perf_evsel__set_sample_bit(switch_evsel, TIME);
+
+ switch_evsel->system_wide = true;
+ switch_evsel->no_aux_samples = true;
+ switch_evsel->immediate = true;
+
+ /* Test moving an event to the front */
+ if (cycles_evsel == perf_evlist__first(evlist)) {
+		pr_debug("cycles event already at front\n");
+ goto out_err;
+ }
+ perf_evlist__to_front(evlist, cycles_evsel);
+ if (cycles_evsel != perf_evlist__first(evlist)) {
+		pr_debug("Failed to move cycles event to front\n");
+ goto out_err;
+ }
+
+ perf_evsel__set_sample_bit(cycles_evsel, CPU);
+ perf_evsel__set_sample_bit(cycles_evsel, TIME);
+
+ /* Fourth event */
+ err = parse_events(evlist, "dummy:u");
+ if (err) {
+ pr_debug("Failed to parse event dummy:u\n");
+ goto out_err;
+ }
+
+ tracking_evsel = perf_evlist__last(evlist);
+
+ perf_evlist__set_tracking_event(evlist, tracking_evsel);
+
+ tracking_evsel->attr.freq = 0;
+ tracking_evsel->attr.sample_period = 1;
+
+ perf_evsel__set_sample_bit(tracking_evsel, TIME);
+
+ /* Config events */
+ perf_evlist__config(evlist, &opts);
+
+ /* Check moved event is still at the front */
+ if (cycles_evsel != perf_evlist__first(evlist)) {
+		pr_debug("Front event no longer at front\n");
+ goto out_err;
+ }
+
+ /* Check tracking event is tracking */
+ if (!tracking_evsel->attr.mmap || !tracking_evsel->attr.comm) {
+ pr_debug("Tracking event not tracking\n");
+ goto out_err;
+ }
+
+ /* Check non-tracking events are not tracking */
+ evlist__for_each(evlist, evsel) {
+ if (evsel != tracking_evsel) {
+ if (evsel->attr.mmap || evsel->attr.comm) {
+ pr_debug("Non-tracking event is tracking\n");
+ goto out_err;
+ }
+ }
+ }
+
+ if (perf_evlist__open(evlist) < 0) {
+ fprintf(stderr, " (not supported)");
+ err = 0;
+ goto out;
+ }
+
+ err = perf_evlist__mmap(evlist, UINT_MAX, false);
+ if (err) {
+ pr_debug("perf_evlist__mmap failed!\n");
+ goto out_err;
+ }
+
+ perf_evlist__enable(evlist);
+
+ err = perf_evlist__disable_event(evlist, cpu_clocks_evsel);
+ if (err) {
+ pr_debug("perf_evlist__disable_event failed!\n");
+ goto out_err;
+ }
+
+ err = spin_sleep();
+ if (err) {
+ pr_debug("spin_sleep failed!\n");
+ goto out_err;
+ }
+
+ comm = "Test COMM 1";
+ err = prctl(PR_SET_NAME, (unsigned long)comm, 0, 0, 0);
+ if (err) {
+ pr_debug("PR_SET_NAME failed!\n");
+ goto out_err;
+ }
+
+ err = perf_evlist__disable_event(evlist, cycles_evsel);
+ if (err) {
+ pr_debug("perf_evlist__disable_event failed!\n");
+ goto out_err;
+ }
+
+ comm = "Test COMM 2";
+ err = prctl(PR_SET_NAME, (unsigned long)comm, 0, 0, 0);
+ if (err) {
+ pr_debug("PR_SET_NAME failed!\n");
+ goto out_err;
+ }
+
+ err = spin_sleep();
+ if (err) {
+ pr_debug("spin_sleep failed!\n");
+ goto out_err;
+ }
+
+ comm = "Test COMM 3";
+ err = prctl(PR_SET_NAME, (unsigned long)comm, 0, 0, 0);
+ if (err) {
+ pr_debug("PR_SET_NAME failed!\n");
+ goto out_err;
+ }
+
+ err = perf_evlist__enable_event(evlist, cycles_evsel);
+ if (err) {
+		pr_debug("perf_evlist__enable_event failed!\n");
+ goto out_err;
+ }
+
+ comm = "Test COMM 4";
+ err = prctl(PR_SET_NAME, (unsigned long)comm, 0, 0, 0);
+ if (err) {
+ pr_debug("PR_SET_NAME failed!\n");
+ goto out_err;
+ }
+
+ err = spin_sleep();
+ if (err) {
+ pr_debug("spin_sleep failed!\n");
+ goto out_err;
+ }
+
+ perf_evlist__disable(evlist);
+
+ switch_tracking.switch_evsel = switch_evsel;
+ switch_tracking.cycles_evsel = cycles_evsel;
+
+ err = process_events(evlist, &switch_tracking);
+
+ zfree(&switch_tracking.tids);
+
+ if (err)
+ goto out_err;
+
+ /* Check all 4 comm events were seen i.e. that evsel->tracking works */
+ if (!switch_tracking.comm_seen[0] || !switch_tracking.comm_seen[1] ||
+ !switch_tracking.comm_seen[2] || !switch_tracking.comm_seen[3]) {
+ pr_debug("Missing comm events\n");
+ goto out_err;
+ }
+
+ /* Check cycles event got enabled */
+ if (!switch_tracking.cycles_before_comm_1) {
+ pr_debug("Missing cycles events\n");
+ goto out_err;
+ }
+
+ /* Check cycles event got disabled */
+ if (switch_tracking.cycles_between_comm_2_and_comm_3) {
+ pr_debug("cycles events even though event was disabled\n");
+ goto out_err;
+ }
+
+ /* Check cycles event got enabled again */
+ if (!switch_tracking.cycles_after_comm_4) {
+ pr_debug("Missing cycles events\n");
+ goto out_err;
+ }
+out:
+ if (evlist) {
+ perf_evlist__disable(evlist);
+ perf_evlist__delete(evlist);
+ } else {
+ cpu_map__delete(cpus);
+ thread_map__delete(threads);
+ }
+
+ return err;
+
+out_err:
+ err = -1;
+ goto out;
+}
diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c
index 5ff3db3..87522f0 100644
--- a/tools/perf/tests/task-exit.c
+++ b/tools/perf/tests/task-exit.c
@@ -42,6 +42,7 @@
.uses_mmap = true,
};
const char *argv[] = { "true", NULL };
+ char sbuf[STRERR_BUFSIZE];
signal(SIGCHLD, sig_handler);
@@ -82,13 +83,14 @@
err = perf_evlist__open(evlist);
if (err < 0) {
- pr_debug("Couldn't open the evlist: %s\n", strerror(-err));
+ pr_debug("Couldn't open the evlist: %s\n",
+ strerror_r(-err, sbuf, sizeof(sbuf)));
goto out_delete_evlist;
}
if (perf_evlist__mmap(evlist, 128, true) < 0) {
pr_debug("failed to mmap events: %d (%s)\n", errno,
- strerror(errno));
+ strerror_r(errno, sbuf, sizeof(sbuf)));
goto out_delete_evlist;
}
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index ed64790..be8be10 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -48,6 +48,7 @@
int test__thread_mg_share(void);
int test__hists_output(void);
int test__hists_cumulate(void);
+int test__switch_tracking(void);
#if defined(__x86_64__) || defined(__i386__) || defined(__arm__)
#ifdef HAVE_DWARF_UNWIND_SUPPORT
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index a94b11fc..d4cef68 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -10,6 +10,7 @@
#include "../../util/pstack.h"
#include "../../util/sort.h"
#include "../../util/util.h"
+#include "../../util/top.h"
#include "../../arch/common.h"
#include "../browser.h"
@@ -228,8 +229,10 @@
{
struct callchain_list *chain;
- list_for_each_entry(chain, &node->val, list)
+ if (!list_empty(&node->val)) {
+ chain = list_entry(node->val.prev, struct callchain_list, list);
chain->ms.has_children = !RB_EMPTY_ROOT(&node->rb_root);
+ }
callchain_node__init_have_children_rb_tree(node);
}
@@ -474,26 +477,87 @@
return bf;
}
+struct callchain_print_arg {
+ /* for hists browser */
+ off_t row_offset;
+ bool is_current_entry;
+
+ /* for file dump */
+ FILE *fp;
+ int printed;
+};
+
+typedef void (*print_callchain_entry_fn)(struct hist_browser *browser,
+ struct callchain_list *chain,
+ const char *str, int offset,
+ unsigned short row,
+ struct callchain_print_arg *arg);
+
+static void hist_browser__show_callchain_entry(struct hist_browser *browser,
+ struct callchain_list *chain,
+ const char *str, int offset,
+ unsigned short row,
+ struct callchain_print_arg *arg)
+{
+ int color, width;
+ char folded_sign = callchain_list__folded(chain);
+
+ color = HE_COLORSET_NORMAL;
+ width = browser->b.width - (offset + 2);
+ if (ui_browser__is_current_entry(&browser->b, row)) {
+ browser->selection = &chain->ms;
+ color = HE_COLORSET_SELECTED;
+ arg->is_current_entry = true;
+ }
+
+ ui_browser__set_color(&browser->b, color);
+ hist_browser__gotorc(browser, row, 0);
+ slsmg_write_nstring(" ", offset);
+ slsmg_printf("%c ", folded_sign);
+ slsmg_write_nstring(str, width);
+}
+
+static void hist_browser__fprintf_callchain_entry(struct hist_browser *b __maybe_unused,
+ struct callchain_list *chain,
+ const char *str, int offset,
+ unsigned short row __maybe_unused,
+ struct callchain_print_arg *arg)
+{
+ char folded_sign = callchain_list__folded(chain);
+
+ arg->printed += fprintf(arg->fp, "%*s%c %s\n", offset, " ",
+ folded_sign, str);
+}
+
+typedef bool (*check_output_full_fn)(struct hist_browser *browser,
+ unsigned short row);
+
+static bool hist_browser__check_output_full(struct hist_browser *browser,
+ unsigned short row)
+{
+ return browser->b.rows == row;
+}
+
+static bool hist_browser__check_dump_full(struct hist_browser *browser __maybe_unused,
+ unsigned short row __maybe_unused)
+{
+ return false;
+}
+
#define LEVEL_OFFSET_STEP 3
-static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *browser,
- struct callchain_node *chain_node,
- u64 total, int level,
- unsigned short row,
- off_t *row_offset,
- bool *is_current_entry)
+static int hist_browser__show_callchain(struct hist_browser *browser,
+ struct rb_root *root, int level,
+ unsigned short row, u64 total,
+ print_callchain_entry_fn print,
+ struct callchain_print_arg *arg,
+ check_output_full_fn is_output_full)
{
struct rb_node *node;
- int first_row = row, width, offset = level * LEVEL_OFFSET_STEP;
- u64 new_total, remaining;
+ int first_row = row, offset = level * LEVEL_OFFSET_STEP;
+ u64 new_total;
- if (callchain_param.mode == CHAIN_GRAPH_REL)
- new_total = chain_node->children_hit;
- else
- new_total = total;
-
- remaining = new_total;
- node = rb_first(&chain_node->rb_root);
+ node = rb_first(root);
while (node) {
struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
struct rb_node *next = rb_next(node);
@@ -503,30 +567,28 @@
int first = true;
int extra_offset = 0;
- remaining -= cumul;
-
list_for_each_entry(chain, &child->val, list) {
char bf[1024], *alloc_str;
const char *str;
- int color;
bool was_first = first;
if (first)
first = false;
- else
+ else if (level > 1)
extra_offset = LEVEL_OFFSET_STEP;
folded_sign = callchain_list__folded(chain);
- if (*row_offset != 0) {
- --*row_offset;
+ if (arg->row_offset != 0) {
+ arg->row_offset--;
goto do_next;
}
alloc_str = NULL;
str = callchain_list__sym_name(chain, bf, sizeof(bf),
browser->show_dso);
- if (was_first) {
- double percent = cumul * 100.0 / new_total;
+
+ if (was_first && level > 1) {
+ double percent = cumul * 100.0 / total;
if (asprintf(&alloc_str, "%2.2f%% %s", percent, str) < 0)
str = "Not enough memory!";
@@ -534,22 +596,11 @@
str = alloc_str;
}
- color = HE_COLORSET_NORMAL;
- width = browser->b.width - (offset + extra_offset + 2);
- if (ui_browser__is_current_entry(&browser->b, row)) {
- browser->selection = &chain->ms;
- color = HE_COLORSET_SELECTED;
- *is_current_entry = true;
- }
+ print(browser, chain, str, offset + extra_offset, row, arg);
- ui_browser__set_color(&browser->b, color);
- hist_browser__gotorc(browser, row, 0);
- slsmg_write_nstring(" ", offset + extra_offset);
- slsmg_printf("%c ", folded_sign);
- slsmg_write_nstring(str, width);
free(alloc_str);
- if (++row == browser->b.rows)
+ if (is_output_full(browser, ++row))
goto out;
do_next:
if (folded_sign == '+')
@@ -558,92 +609,24 @@
if (folded_sign == '-') {
const int new_level = level + (extra_offset ? 2 : 1);
- row += hist_browser__show_callchain_node_rb_tree(browser, child, new_total,
- new_level, row, row_offset,
- is_current_entry);
+
+ if (callchain_param.mode == CHAIN_GRAPH_REL)
+ new_total = child->children_hit;
+ else
+ new_total = total;
+
+ row += hist_browser__show_callchain(browser, &child->rb_root,
+ new_level, row, new_total,
+ print, arg, is_output_full);
}
- if (row == browser->b.rows)
- goto out;
+ if (is_output_full(browser, row))
+ break;
node = next;
}
out:
return row - first_row;
}
-static int hist_browser__show_callchain_node(struct hist_browser *browser,
- struct callchain_node *node,
- int level, unsigned short row,
- off_t *row_offset,
- bool *is_current_entry)
-{
- struct callchain_list *chain;
- int first_row = row,
- offset = level * LEVEL_OFFSET_STEP,
- width = browser->b.width - offset;
- char folded_sign = ' ';
-
- list_for_each_entry(chain, &node->val, list) {
- char bf[1024], *s;
- int color;
-
- folded_sign = callchain_list__folded(chain);
-
- if (*row_offset != 0) {
- --*row_offset;
- continue;
- }
-
- color = HE_COLORSET_NORMAL;
- if (ui_browser__is_current_entry(&browser->b, row)) {
- browser->selection = &chain->ms;
- color = HE_COLORSET_SELECTED;
- *is_current_entry = true;
- }
-
- s = callchain_list__sym_name(chain, bf, sizeof(bf),
- browser->show_dso);
- hist_browser__gotorc(browser, row, 0);
- ui_browser__set_color(&browser->b, color);
- slsmg_write_nstring(" ", offset);
- slsmg_printf("%c ", folded_sign);
- slsmg_write_nstring(s, width - 2);
-
- if (++row == browser->b.rows)
- goto out;
- }
-
- if (folded_sign == '-')
- row += hist_browser__show_callchain_node_rb_tree(browser, node,
- browser->hists->stats.total_period,
- level + 1, row,
- row_offset,
- is_current_entry);
-out:
- return row - first_row;
-}
-
-static int hist_browser__show_callchain(struct hist_browser *browser,
- struct rb_root *chain,
- int level, unsigned short row,
- off_t *row_offset,
- bool *is_current_entry)
-{
- struct rb_node *nd;
- int first_row = row;
-
- for (nd = rb_first(chain); nd; nd = rb_next(nd)) {
- struct callchain_node *node = rb_entry(nd, struct callchain_node, rb_node);
-
- row += hist_browser__show_callchain_node(browser, node, level,
- row, row_offset,
- is_current_entry);
- if (row == browser->b.rows)
- break;
- }
-
- return row - first_row;
-}
-
struct hpp_arg {
struct ui_browser *b;
char folded_sign;
@@ -653,17 +636,18 @@
static int __hpp__slsmg_color_printf(struct perf_hpp *hpp, const char *fmt, ...)
{
struct hpp_arg *arg = hpp->ptr;
- int ret;
+ int ret, len;
va_list args;
double percent;
va_start(args, fmt);
+ len = va_arg(args, int);
percent = va_arg(args, double);
va_end(args);
ui_browser__set_percent_color(arg->b, percent, arg->current_entry);
- ret = scnprintf(hpp->buf, hpp->size, fmt, percent);
+ ret = scnprintf(hpp->buf, hpp->size, fmt, len, percent);
slsmg_printf("%s", hpp->buf);
advance_hpp(hpp, ret);
@@ -677,12 +661,12 @@
} \
\
static int \
-hist_browser__hpp_color_##_type(struct perf_hpp_fmt *fmt __maybe_unused,\
+hist_browser__hpp_color_##_type(struct perf_hpp_fmt *fmt, \
struct perf_hpp *hpp, \
struct hist_entry *he) \
{ \
- return __hpp__fmt(hpp, he, __hpp_get_##_field, " %6.2f%%", \
- __hpp__slsmg_color_printf, true); \
+ return hpp__fmt(fmt, hpp, he, __hpp_get_##_field, " %*.2f%%", \
+ __hpp__slsmg_color_printf, true); \
}
#define __HPP_COLOR_ACC_PERCENT_FN(_type, _field) \
@@ -692,18 +676,20 @@
} \
\
static int \
-hist_browser__hpp_color_##_type(struct perf_hpp_fmt *fmt __maybe_unused,\
+hist_browser__hpp_color_##_type(struct perf_hpp_fmt *fmt, \
struct perf_hpp *hpp, \
struct hist_entry *he) \
{ \
if (!symbol_conf.cumulate_callchain) { \
- int ret = scnprintf(hpp->buf, hpp->size, "%8s", "N/A"); \
+ int len = fmt->user_len ?: fmt->len; \
+ int ret = scnprintf(hpp->buf, hpp->size, \
+ "%*s", len, "N/A"); \
slsmg_printf("%s", hpp->buf); \
\
return ret; \
} \
- return __hpp__fmt(hpp, he, __hpp_get_acc_##_field, " %6.2f%%", \
- __hpp__slsmg_color_printf, true); \
+ return hpp__fmt(fmt, hpp, he, __hpp_get_acc_##_field, \
+ " %*.2f%%", __hpp__slsmg_color_printf, true); \
}
__HPP_COLOR_PERCENT_FN(overhead, period)
@@ -812,10 +798,21 @@
--row_offset;
if (folded_sign == '-' && row != browser->b.rows) {
- printed += hist_browser__show_callchain(browser, &entry->sorted_chain,
- 1, row, &row_offset,
-							&current_entry);
- if (current_entry)
+ u64 total = hists__total_period(entry->hists);
+ struct callchain_print_arg arg = {
+ .row_offset = row_offset,
+ .is_current_entry = current_entry,
+ };
+
+ if (symbol_conf.cumulate_callchain)
+ total = entry->stat_acc->period;
+
+ printed += hist_browser__show_callchain(browser,
+ &entry->sorted_chain, 1, row, total,
+ hist_browser__show_callchain_entry, &arg,
+ hist_browser__check_output_full);
+
+ if (arg.is_current_entry)
browser->he_selection = entry;
}
@@ -847,9 +844,6 @@
if (perf_hpp__should_skip(fmt))
continue;
- /* We need to add the length of the columns header. */
- perf_hpp__reset_width(fmt, hists);
-
ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists));
if (advance_hpp_check(&dummy_hpp, ret))
break;
@@ -1074,113 +1068,21 @@
}
}
-static int hist_browser__fprintf_callchain_node_rb_tree(struct hist_browser *browser,
- struct callchain_node *chain_node,
- u64 total, int level,
- FILE *fp)
-{
- struct rb_node *node;
- int offset = level * LEVEL_OFFSET_STEP;
- u64 new_total, remaining;
- int printed = 0;
-
- if (callchain_param.mode == CHAIN_GRAPH_REL)
- new_total = chain_node->children_hit;
- else
- new_total = total;
-
- remaining = new_total;
- node = rb_first(&chain_node->rb_root);
- while (node) {
- struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
- struct rb_node *next = rb_next(node);
- u64 cumul = callchain_cumul_hits(child);
- struct callchain_list *chain;
- char folded_sign = ' ';
- int first = true;
- int extra_offset = 0;
-
- remaining -= cumul;
-
- list_for_each_entry(chain, &child->val, list) {
- char bf[1024], *alloc_str;
- const char *str;
- bool was_first = first;
-
- if (first)
- first = false;
- else
- extra_offset = LEVEL_OFFSET_STEP;
-
- folded_sign = callchain_list__folded(chain);
-
- alloc_str = NULL;
- str = callchain_list__sym_name(chain, bf, sizeof(bf),
- browser->show_dso);
- if (was_first) {
- double percent = cumul * 100.0 / new_total;
-
- if (asprintf(&alloc_str, "%2.2f%% %s", percent, str) < 0)
- str = "Not enough memory!";
- else
- str = alloc_str;
- }
-
- printed += fprintf(fp, "%*s%c %s\n", offset + extra_offset, " ", folded_sign, str);
- free(alloc_str);
- if (folded_sign == '+')
- break;
- }
-
- if (folded_sign == '-') {
- const int new_level = level + (extra_offset ? 2 : 1);
- printed += hist_browser__fprintf_callchain_node_rb_tree(browser, child, new_total,
- new_level, fp);
- }
-
- node = next;
- }
-
- return printed;
-}
-
-static int hist_browser__fprintf_callchain_node(struct hist_browser *browser,
- struct callchain_node *node,
- int level, FILE *fp)
-{
- struct callchain_list *chain;
- int offset = level * LEVEL_OFFSET_STEP;
- char folded_sign = ' ';
- int printed = 0;
-
- list_for_each_entry(chain, &node->val, list) {
- char bf[1024], *s;
-
- folded_sign = callchain_list__folded(chain);
- s = callchain_list__sym_name(chain, bf, sizeof(bf), browser->show_dso);
- printed += fprintf(fp, "%*s%c %s\n", offset, " ", folded_sign, s);
- }
-
- if (folded_sign == '-')
- printed += hist_browser__fprintf_callchain_node_rb_tree(browser, node,
- browser->hists->stats.total_period,
- level + 1, fp);
- return printed;
-}
-
static int hist_browser__fprintf_callchain(struct hist_browser *browser,
- struct rb_root *chain, int level, FILE *fp)
+ struct hist_entry *he, FILE *fp)
{
- struct rb_node *nd;
- int printed = 0;
+ u64 total = hists__total_period(he->hists);
+ struct callchain_print_arg arg = {
+ .fp = fp,
+ };
- for (nd = rb_first(chain); nd; nd = rb_next(nd)) {
- struct callchain_node *node = rb_entry(nd, struct callchain_node, rb_node);
+ if (symbol_conf.cumulate_callchain)
+ total = he->stat_acc->period;
- printed += hist_browser__fprintf_callchain_node(browser, node, level, fp);
- }
-
- return printed;
+ hist_browser__show_callchain(browser, &he->sorted_chain, 1, 0, total,
+ hist_browser__fprintf_callchain_entry, &arg,
+ hist_browser__check_dump_full);
+ return arg.printed;
}
static int hist_browser__fprintf_entry(struct hist_browser *browser,
@@ -1219,7 +1121,7 @@
printed += fprintf(fp, "%s\n", rtrim(s));
if (folded_sign == '-')
- printed += hist_browser__fprintf_callchain(browser, &he->sorted_chain, 1, fp);
+ printed += hist_browser__fprintf_callchain(browser, he, fp);
return printed;
}
@@ -1498,6 +1400,7 @@
char buf[64];
char script_opt[64];
int delay_secs = hbt ? hbt->refresh : 0;
+ struct perf_hpp_fmt *fmt;
#define HIST_BROWSER_HELP_COMMON \
"h/?/F1 Show this window\n" \
@@ -1529,6 +1432,7 @@
"P Print histograms to perf.hist.N\n"
"t Zoom into current Thread\n"
"V Verbose (DSO names in callchains, etc)\n"
+ "z Toggle zeroing of samples\n"
"/ Filter symbol by name";
if (browser == NULL)
@@ -1547,6 +1451,12 @@
memset(options, 0, sizeof(options));
+ perf_hpp__for_each_format(fmt)
+ perf_hpp__reset_width(fmt, hists);
+
+ if (symbol_conf.col_width_list_str)
+ perf_hpp__set_user_width(symbol_conf.col_width_list_str);
+
while (1) {
const struct thread *thread = NULL;
const struct dso *dso = NULL;
@@ -1623,6 +1533,13 @@
case 'F':
symbol_conf.filter_relative ^= 1;
continue;
+ case 'z':
+ if (!is_report_browser(hbt)) {
+ struct perf_top *top = hbt->arg;
+
+ top->zero = !top->zero;
+ }
+ continue;
case K_F1:
case 'h':
case '?':
diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c
index 6ca60e4..f3fa425 100644
--- a/tools/perf/ui/gtk/hists.c
+++ b/tools/perf/ui/gtk/hists.c
@@ -11,6 +11,7 @@
static int __percent_color_snprintf(struct perf_hpp *hpp, const char *fmt, ...)
{
int ret = 0;
+ int len;
va_list args;
double percent;
const char *markup;
@@ -18,6 +19,7 @@
size_t size = hpp->size;
va_start(args, fmt);
+ len = va_arg(args, int);
percent = va_arg(args, double);
va_end(args);
@@ -25,7 +27,7 @@
if (markup)
ret += scnprintf(buf, size, markup);
- ret += scnprintf(buf + ret, size - ret, fmt, percent);
+ ret += scnprintf(buf + ret, size - ret, fmt, len, percent);
if (markup)
ret += scnprintf(buf + ret, size - ret, "</span>");
@@ -39,12 +41,12 @@
return he->stat._field; \
} \
\
-static int perf_gtk__hpp_color_##_type(struct perf_hpp_fmt *fmt __maybe_unused, \
+static int perf_gtk__hpp_color_##_type(struct perf_hpp_fmt *fmt, \
struct perf_hpp *hpp, \
struct hist_entry *he) \
{ \
- return __hpp__fmt(hpp, he, he_get_##_field, " %6.2f%%", \
- __percent_color_snprintf, true); \
+ return hpp__fmt(fmt, hpp, he, he_get_##_field, " %*.2f%%", \
+ __percent_color_snprintf, true); \
}
#define __HPP_COLOR_ACC_PERCENT_FN(_type, _field) \
@@ -57,8 +59,8 @@
struct perf_hpp *hpp, \
struct hist_entry *he) \
{ \
- return __hpp__fmt_acc(hpp, he, he_get_acc_##_field, " %6.2f%%", \
- __percent_color_snprintf, true); \
+ return hpp__fmt_acc(fmt, hpp, he, he_get_acc_##_field, " %*.2f%%", \
+ __percent_color_snprintf, true); \
}
__HPP_COLOR_PERCENT_FN(overhead, period)
@@ -205,10 +207,8 @@
if (perf_hpp__is_sort_entry(fmt))
sym_col = col_idx;
- fmt->header(fmt, &hpp, hists_to_evsel(hists));
-
gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view),
- -1, ltrim(s),
+ -1, fmt->name,
renderer, "markup",
col_idx++, NULL);
}
diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c
index 498adb2..2af1837 100644
--- a/tools/perf/ui/hist.c
+++ b/tools/perf/ui/hist.c
@@ -15,9 +15,9 @@
__ret; \
})
-int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he,
- hpp_field_fn get_field, const char *fmt,
- hpp_snprint_fn print_fn, bool fmt_percent)
+static int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he,
+ hpp_field_fn get_field, const char *fmt, int len,
+ hpp_snprint_fn print_fn, bool fmt_percent)
{
int ret;
struct hists *hists = he->hists;
@@ -32,9 +32,9 @@
if (total)
percent = 100.0 * get_field(he) / total;
- ret = hpp__call_print_fn(hpp, print_fn, fmt, percent);
+ ret = hpp__call_print_fn(hpp, print_fn, fmt, len, percent);
} else
- ret = hpp__call_print_fn(hpp, print_fn, fmt, get_field(he));
+ ret = hpp__call_print_fn(hpp, print_fn, fmt, len, get_field(he));
if (perf_evsel__is_group_event(evsel)) {
int prev_idx, idx_delta;
@@ -60,19 +60,19 @@
*/
if (fmt_percent) {
ret += hpp__call_print_fn(hpp, print_fn,
- fmt, 0.0);
+ fmt, len, 0.0);
} else {
ret += hpp__call_print_fn(hpp, print_fn,
- fmt, 0ULL);
+ fmt, len, 0ULL);
}
}
if (fmt_percent) {
- ret += hpp__call_print_fn(hpp, print_fn, fmt,
+ ret += hpp__call_print_fn(hpp, print_fn, fmt, len,
100.0 * period / total);
} else {
ret += hpp__call_print_fn(hpp, print_fn, fmt,
- period);
+ len, period);
}
prev_idx = perf_evsel__group_idx(evsel);
@@ -86,10 +86,10 @@
*/
if (fmt_percent) {
ret += hpp__call_print_fn(hpp, print_fn,
- fmt, 0.0);
+ fmt, len, 0.0);
} else {
ret += hpp__call_print_fn(hpp, print_fn,
- fmt, 0ULL);
+ fmt, len, 0ULL);
}
}
}
@@ -104,16 +104,35 @@
return ret;
}
-int __hpp__fmt_acc(struct perf_hpp *hpp, struct hist_entry *he,
- hpp_field_fn get_field, const char *fmt,
- hpp_snprint_fn print_fn, bool fmt_percent)
+int hpp__fmt(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
+ struct hist_entry *he, hpp_field_fn get_field,
+ const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent)
{
- if (!symbol_conf.cumulate_callchain) {
- return snprintf(hpp->buf, hpp->size, "%*s",
- fmt_percent ? 8 : 12, "N/A");
+ int len = fmt->user_len ?: fmt->len;
+
+ if (symbol_conf.field_sep) {
+ return __hpp__fmt(hpp, he, get_field, fmtstr, 1,
+ print_fn, fmt_percent);
}
- return __hpp__fmt(hpp, he, get_field, fmt, print_fn, fmt_percent);
+ if (fmt_percent)
+ len -= 2; /* 2 for a space and a % sign */
+ else
+ len -= 1;
+
+ return __hpp__fmt(hpp, he, get_field, fmtstr, len, print_fn, fmt_percent);
+}
+
+int hpp__fmt_acc(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
+ struct hist_entry *he, hpp_field_fn get_field,
+ const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent)
+{
+ if (!symbol_conf.cumulate_callchain) {
+ int len = fmt->user_len ?: fmt->len;
+ return snprintf(hpp->buf, hpp->size, " %*s", len - 1, "N/A");
+ }
+
+ return hpp__fmt(fmt, hpp, he, get_field, fmtstr, print_fn, fmt_percent);
}
static int field_cmp(u64 field_a, u64 field_b)
@@ -190,30 +209,26 @@
return ret;
}
-#define __HPP_HEADER_FN(_type, _str, _min_width, _unit_width) \
-static int hpp__header_##_type(struct perf_hpp_fmt *fmt __maybe_unused, \
- struct perf_hpp *hpp, \
- struct perf_evsel *evsel) \
-{ \
- int len = _min_width; \
- \
- if (symbol_conf.event_group) \
- len = max(len, evsel->nr_members * _unit_width); \
- \
- return scnprintf(hpp->buf, hpp->size, "%*s", len, _str); \
+static int hpp__width_fn(struct perf_hpp_fmt *fmt,
+ struct perf_hpp *hpp __maybe_unused,
+ struct perf_evsel *evsel)
+{
+ int len = fmt->user_len ?: fmt->len;
+
+ if (symbol_conf.event_group)
+ len = max(len, evsel->nr_members * fmt->len);
+
+ if (len < (int)strlen(fmt->name))
+ len = strlen(fmt->name);
+
+ return len;
}
-#define __HPP_WIDTH_FN(_type, _min_width, _unit_width) \
-static int hpp__width_##_type(struct perf_hpp_fmt *fmt __maybe_unused, \
- struct perf_hpp *hpp __maybe_unused, \
- struct perf_evsel *evsel) \
-{ \
- int len = _min_width; \
- \
- if (symbol_conf.event_group) \
- len = max(len, evsel->nr_members * _unit_width); \
- \
- return len; \
+static int hpp__header_fn(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
+ struct perf_evsel *evsel)
+{
+ int len = hpp__width_fn(fmt, hpp, evsel);
+ return scnprintf(hpp->buf, hpp->size, "%*s", len, fmt->name);
}
static int hpp_color_scnprintf(struct perf_hpp *hpp, const char *fmt, ...)
@@ -221,11 +236,12 @@
va_list args;
ssize_t ssize = hpp->size;
double percent;
- int ret;
+ int ret, len;
va_start(args, fmt);
+ len = va_arg(args, int);
percent = va_arg(args, double);
- ret = value_color_snprintf(hpp->buf, hpp->size, fmt, percent);
+ ret = percent_color_len_snprintf(hpp->buf, hpp->size, fmt, len, percent);
va_end(args);
return (ret >= ssize) ? (ssize - 1) : ret;
@@ -250,20 +266,19 @@
return he->stat._field; \
} \
\
-static int hpp__color_##_type(struct perf_hpp_fmt *fmt __maybe_unused, \
+static int hpp__color_##_type(struct perf_hpp_fmt *fmt, \
struct perf_hpp *hpp, struct hist_entry *he) \
{ \
- return __hpp__fmt(hpp, he, he_get_##_field, " %6.2f%%", \
- hpp_color_scnprintf, true); \
+ return hpp__fmt(fmt, hpp, he, he_get_##_field, " %*.2f%%", \
+ hpp_color_scnprintf, true); \
}
#define __HPP_ENTRY_PERCENT_FN(_type, _field) \
-static int hpp__entry_##_type(struct perf_hpp_fmt *_fmt __maybe_unused, \
+static int hpp__entry_##_type(struct perf_hpp_fmt *fmt, \
struct perf_hpp *hpp, struct hist_entry *he) \
{ \
- const char *fmt = symbol_conf.field_sep ? " %.2f" : " %6.2f%%"; \
- return __hpp__fmt(hpp, he, he_get_##_field, fmt, \
- hpp_entry_scnprintf, true); \
+ return hpp__fmt(fmt, hpp, he, he_get_##_field, " %*.2f%%", \
+ hpp_entry_scnprintf, true); \
}
#define __HPP_SORT_FN(_type, _field) \
@@ -278,20 +293,19 @@
return he->stat_acc->_field; \
} \
\
-static int hpp__color_##_type(struct perf_hpp_fmt *fmt __maybe_unused, \
+static int hpp__color_##_type(struct perf_hpp_fmt *fmt, \
struct perf_hpp *hpp, struct hist_entry *he) \
{ \
- return __hpp__fmt_acc(hpp, he, he_get_acc_##_field, " %6.2f%%", \
- hpp_color_scnprintf, true); \
+ return hpp__fmt_acc(fmt, hpp, he, he_get_acc_##_field, " %*.2f%%", \
+ hpp_color_scnprintf, true); \
}
#define __HPP_ENTRY_ACC_PERCENT_FN(_type, _field) \
-static int hpp__entry_##_type(struct perf_hpp_fmt *_fmt __maybe_unused, \
+static int hpp__entry_##_type(struct perf_hpp_fmt *fmt, \
struct perf_hpp *hpp, struct hist_entry *he) \
{ \
- const char *fmt = symbol_conf.field_sep ? " %.2f" : " %6.2f%%"; \
- return __hpp__fmt_acc(hpp, he, he_get_acc_##_field, fmt, \
- hpp_entry_scnprintf, true); \
+ return hpp__fmt_acc(fmt, hpp, he, he_get_acc_##_field, " %*.2f%%", \
+ hpp_entry_scnprintf, true); \
}
#define __HPP_SORT_ACC_FN(_type, _field) \
@@ -306,12 +320,11 @@
return he->stat._field; \
} \
\
-static int hpp__entry_##_type(struct perf_hpp_fmt *_fmt __maybe_unused, \
+static int hpp__entry_##_type(struct perf_hpp_fmt *fmt, \
struct perf_hpp *hpp, struct hist_entry *he) \
{ \
- const char *fmt = symbol_conf.field_sep ? " %"PRIu64 : " %11"PRIu64; \
- return __hpp__fmt(hpp, he, he_get_raw_##_field, fmt, \
- hpp_entry_scnprintf, false); \
+ return hpp__fmt(fmt, hpp, he, he_get_raw_##_field, " %*"PRIu64, \
+ hpp_entry_scnprintf, false); \
}
#define __HPP_SORT_RAW_FN(_type, _field) \
@@ -321,37 +334,29 @@
}
-#define HPP_PERCENT_FNS(_type, _str, _field, _min_width, _unit_width) \
-__HPP_HEADER_FN(_type, _str, _min_width, _unit_width) \
-__HPP_WIDTH_FN(_type, _min_width, _unit_width) \
+#define HPP_PERCENT_FNS(_type, _field) \
__HPP_COLOR_PERCENT_FN(_type, _field) \
__HPP_ENTRY_PERCENT_FN(_type, _field) \
__HPP_SORT_FN(_type, _field)
-#define HPP_PERCENT_ACC_FNS(_type, _str, _field, _min_width, _unit_width)\
-__HPP_HEADER_FN(_type, _str, _min_width, _unit_width) \
-__HPP_WIDTH_FN(_type, _min_width, _unit_width) \
+#define HPP_PERCENT_ACC_FNS(_type, _field) \
__HPP_COLOR_ACC_PERCENT_FN(_type, _field) \
__HPP_ENTRY_ACC_PERCENT_FN(_type, _field) \
__HPP_SORT_ACC_FN(_type, _field)
-#define HPP_RAW_FNS(_type, _str, _field, _min_width, _unit_width) \
-__HPP_HEADER_FN(_type, _str, _min_width, _unit_width) \
-__HPP_WIDTH_FN(_type, _min_width, _unit_width) \
+#define HPP_RAW_FNS(_type, _field) \
__HPP_ENTRY_RAW_FN(_type, _field) \
__HPP_SORT_RAW_FN(_type, _field)
-__HPP_HEADER_FN(overhead_self, "Self", 8, 8)
+HPP_PERCENT_FNS(overhead, period)
+HPP_PERCENT_FNS(overhead_sys, period_sys)
+HPP_PERCENT_FNS(overhead_us, period_us)
+HPP_PERCENT_FNS(overhead_guest_sys, period_guest_sys)
+HPP_PERCENT_FNS(overhead_guest_us, period_guest_us)
+HPP_PERCENT_ACC_FNS(overhead_acc, period)
-HPP_PERCENT_FNS(overhead, "Overhead", period, 8, 8)
-HPP_PERCENT_FNS(overhead_sys, "sys", period_sys, 8, 8)
-HPP_PERCENT_FNS(overhead_us, "usr", period_us, 8, 8)
-HPP_PERCENT_FNS(overhead_guest_sys, "guest sys", period_guest_sys, 9, 8)
-HPP_PERCENT_FNS(overhead_guest_us, "guest usr", period_guest_us, 9, 8)
-HPP_PERCENT_ACC_FNS(overhead_acc, "Children", period, 8, 8)
-
-HPP_RAW_FNS(samples, "Samples", nr_events, 12, 12)
-HPP_RAW_FNS(period, "Period", period, 12, 12)
+HPP_RAW_FNS(samples, nr_events)
+HPP_RAW_FNS(period, period)
static int64_t hpp__nop_cmp(struct hist_entry *a __maybe_unused,
struct hist_entry *b __maybe_unused)
@@ -359,47 +364,50 @@
return 0;
}
-#define HPP__COLOR_PRINT_FNS(_name) \
+#define HPP__COLOR_PRINT_FNS(_name, _fn) \
{ \
- .header = hpp__header_ ## _name, \
- .width = hpp__width_ ## _name, \
- .color = hpp__color_ ## _name, \
- .entry = hpp__entry_ ## _name, \
+ .name = _name, \
+ .header = hpp__header_fn, \
+ .width = hpp__width_fn, \
+ .color = hpp__color_ ## _fn, \
+ .entry = hpp__entry_ ## _fn, \
.cmp = hpp__nop_cmp, \
.collapse = hpp__nop_cmp, \
- .sort = hpp__sort_ ## _name, \
+ .sort = hpp__sort_ ## _fn, \
}
-#define HPP__COLOR_ACC_PRINT_FNS(_name) \
+#define HPP__COLOR_ACC_PRINT_FNS(_name, _fn) \
{ \
- .header = hpp__header_ ## _name, \
- .width = hpp__width_ ## _name, \
- .color = hpp__color_ ## _name, \
- .entry = hpp__entry_ ## _name, \
+ .name = _name, \
+ .header = hpp__header_fn, \
+ .width = hpp__width_fn, \
+ .color = hpp__color_ ## _fn, \
+ .entry = hpp__entry_ ## _fn, \
.cmp = hpp__nop_cmp, \
.collapse = hpp__nop_cmp, \
- .sort = hpp__sort_ ## _name, \
+ .sort = hpp__sort_ ## _fn, \
}
-#define HPP__PRINT_FNS(_name) \
+#define HPP__PRINT_FNS(_name, _fn) \
{ \
- .header = hpp__header_ ## _name, \
- .width = hpp__width_ ## _name, \
- .entry = hpp__entry_ ## _name, \
+ .name = _name, \
+ .header = hpp__header_fn, \
+ .width = hpp__width_fn, \
+ .entry = hpp__entry_ ## _fn, \
.cmp = hpp__nop_cmp, \
.collapse = hpp__nop_cmp, \
- .sort = hpp__sort_ ## _name, \
+ .sort = hpp__sort_ ## _fn, \
}
struct perf_hpp_fmt perf_hpp__format[] = {
- HPP__COLOR_PRINT_FNS(overhead),
- HPP__COLOR_PRINT_FNS(overhead_sys),
- HPP__COLOR_PRINT_FNS(overhead_us),
- HPP__COLOR_PRINT_FNS(overhead_guest_sys),
- HPP__COLOR_PRINT_FNS(overhead_guest_us),
- HPP__COLOR_ACC_PRINT_FNS(overhead_acc),
- HPP__PRINT_FNS(samples),
- HPP__PRINT_FNS(period)
+ HPP__COLOR_PRINT_FNS("Overhead", overhead),
+ HPP__COLOR_PRINT_FNS("sys", overhead_sys),
+ HPP__COLOR_PRINT_FNS("usr", overhead_us),
+ HPP__COLOR_PRINT_FNS("guest sys", overhead_guest_sys),
+ HPP__COLOR_PRINT_FNS("guest usr", overhead_guest_us),
+ HPP__COLOR_ACC_PRINT_FNS("Children", overhead_acc),
+ HPP__PRINT_FNS("Samples", samples),
+ HPP__PRINT_FNS("Period", period)
};
LIST_HEAD(perf_hpp__list);
@@ -444,14 +452,12 @@
/*
* If user specified field order, no need to setup default fields.
*/
- if (field_order)
+ if (is_strict_order(field_order))
return;
if (symbol_conf.cumulate_callchain) {
perf_hpp__column_enable(PERF_HPP__OVERHEAD_ACC);
-
- perf_hpp__format[PERF_HPP__OVERHEAD].header =
- hpp__header_overhead_self;
+ perf_hpp__format[PERF_HPP__OVERHEAD].name = "Self";
}
perf_hpp__column_enable(PERF_HPP__OVERHEAD);
@@ -513,11 +519,11 @@
void perf_hpp__cancel_cumulate(void)
{
- if (field_order)
+ if (is_strict_order(field_order))
return;
perf_hpp__column_disable(PERF_HPP__OVERHEAD_ACC);
- perf_hpp__format[PERF_HPP__OVERHEAD].header = hpp__header_overhead;
+ perf_hpp__format[PERF_HPP__OVERHEAD].name = "Overhead";
}
void perf_hpp__setup_output_field(void)
@@ -622,3 +628,59 @@
return ret;
}
+
+void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists)
+{
+ int idx;
+
+ if (perf_hpp__is_sort_entry(fmt))
+ return perf_hpp__reset_sort_width(fmt, hists);
+
+ for (idx = 0; idx < PERF_HPP__MAX_INDEX; idx++) {
+ if (fmt == &perf_hpp__format[idx])
+ break;
+ }
+
+ if (idx == PERF_HPP__MAX_INDEX)
+ return;
+
+ switch (idx) {
+ case PERF_HPP__OVERHEAD:
+ case PERF_HPP__OVERHEAD_SYS:
+ case PERF_HPP__OVERHEAD_US:
+ case PERF_HPP__OVERHEAD_ACC:
+ fmt->len = 8;
+ break;
+
+ case PERF_HPP__OVERHEAD_GUEST_SYS:
+ case PERF_HPP__OVERHEAD_GUEST_US:
+ fmt->len = 9;
+ break;
+
+ case PERF_HPP__SAMPLES:
+ case PERF_HPP__PERIOD:
+ fmt->len = 12;
+ break;
+
+ default:
+ break;
+ }
+}
+
+void perf_hpp__set_user_width(const char *width_list_str)
+{
+ struct perf_hpp_fmt *fmt;
+ const char *ptr = width_list_str;
+
+ perf_hpp__for_each_format(fmt) {
+ char *p;
+
+ int len = strtol(ptr, &p, 10);
+ fmt->user_len = len;
+
+ if (*p == ',')
+ ptr = p + 1;
+ else
+ break;
+ }
+}
diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c
index 40af0ac..15b451a 100644
--- a/tools/perf/ui/stdio/hist.c
+++ b/tools/perf/ui/stdio/hist.c
@@ -395,10 +395,12 @@
init_rem_hits();
-
perf_hpp__for_each_format(fmt)
perf_hpp__reset_width(fmt, hists);
+ if (symbol_conf.col_width_list_str)
+ perf_hpp__set_user_width(symbol_conf.col_width_list_str);
+
if (!show_header)
goto print_entries;
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 809b4c5..3643752 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -232,9 +232,16 @@
return -1;
target = ++s;
+ comment = strchr(s, '#');
- while (s[0] != '\0' && !isspace(s[0]))
- ++s;
+ if (comment != NULL)
+ s = comment - 1;
+ else
+ s = strchr(s, '\0') - 1;
+
+ while (s > target && isspace(s[0]))
+ --s;
+ s++;
prev = *s;
*s = '\0';
@@ -244,7 +251,6 @@
if (ops->target.raw == NULL)
goto out_free_source;
- comment = strchr(s, '#');
if (comment == NULL)
return 0;
@@ -899,10 +905,8 @@
struct kcore_extract kce;
bool delete_extract = false;
- if (filename) {
- snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
- symbol_conf.symfs, filename);
- }
+ if (filename)
+ symbol__join_symfs(symfs_filename, filename);
if (filename == NULL) {
if (dso->has_build_id) {
@@ -922,8 +926,7 @@
* DSO is the same as when 'perf record' ran.
*/
filename = (char *)dso->long_name;
- snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
- symbol_conf.symfs, filename);
+ symbol__join_symfs(symfs_filename, filename);
free_filename = false;
}
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index 7b176dd..5cf9e1b 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -22,6 +22,7 @@
extern int perf_default_config(const char *, const char *, void *);
extern int perf_config(config_fn_t fn, void *);
extern int perf_config_int(const char *, const char *);
+extern u64 perf_config_u64(const char *, const char *);
extern int perf_config_bool(const char *, const char *);
extern int config_error_nonbool(const char *);
extern const char *perf_config_dirname(const char *, const char *);
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 437ee09..08f0fbf 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -28,74 +28,55 @@
int
parse_callchain_report_opt(const char *arg)
{
- char *tok, *tok2;
+ char *tok;
char *endptr;
+ bool minpcnt_set = false;
symbol_conf.use_callchain = true;
if (!arg)
return 0;
- tok = strtok((char *)arg, ",");
- if (!tok)
- return -1;
+ while ((tok = strtok((char *)arg, ",")) != NULL) {
+ if (!strncmp(tok, "none", strlen(tok))) {
+ callchain_param.mode = CHAIN_NONE;
+ symbol_conf.use_callchain = false;
+ return 0;
+ }
- /* get the output mode */
- if (!strncmp(tok, "graph", strlen(arg))) {
- callchain_param.mode = CHAIN_GRAPH_ABS;
+ /* try to get the output mode */
+ if (!strncmp(tok, "graph", strlen(tok)))
+ callchain_param.mode = CHAIN_GRAPH_ABS;
+ else if (!strncmp(tok, "flat", strlen(tok)))
+ callchain_param.mode = CHAIN_FLAT;
+ else if (!strncmp(tok, "fractal", strlen(tok)))
+ callchain_param.mode = CHAIN_GRAPH_REL;
+ /* try to get the call chain order */
+ else if (!strncmp(tok, "caller", strlen(tok)))
+ callchain_param.order = ORDER_CALLER;
+ else if (!strncmp(tok, "callee", strlen(tok)))
+ callchain_param.order = ORDER_CALLEE;
+ /* try to get the sort key */
+ else if (!strncmp(tok, "function", strlen(tok)))
+ callchain_param.key = CCKEY_FUNCTION;
+ else if (!strncmp(tok, "address", strlen(tok)))
+ callchain_param.key = CCKEY_ADDRESS;
+ /* try to get the min percent */
+ else if (!minpcnt_set) {
+ callchain_param.min_percent = strtod(tok, &endptr);
+ if (tok == endptr)
+ return -1;
+ minpcnt_set = true;
+ } else {
+ /* try print limit at last */
+ callchain_param.print_limit = strtoul(tok, &endptr, 0);
+ if (tok == endptr)
+ return -1;
+ }
- } else if (!strncmp(tok, "flat", strlen(arg))) {
- callchain_param.mode = CHAIN_FLAT;
- } else if (!strncmp(tok, "fractal", strlen(arg))) {
- callchain_param.mode = CHAIN_GRAPH_REL;
- } else if (!strncmp(tok, "none", strlen(arg))) {
- callchain_param.mode = CHAIN_NONE;
- symbol_conf.use_callchain = false;
- return 0;
- } else {
- return -1;
+ arg = NULL;
}
- /* get the min percentage */
- tok = strtok(NULL, ",");
- if (!tok)
- goto setup;
-
- callchain_param.min_percent = strtod(tok, &endptr);
- if (tok == endptr)
- return -1;
-
- /* get the print limit */
- tok2 = strtok(NULL, ",");
- if (!tok2)
- goto setup;
-
- if (tok2[0] != 'c') {
- callchain_param.print_limit = strtoul(tok2, &endptr, 0);
- tok2 = strtok(NULL, ",");
- if (!tok2)
- goto setup;
- }
-
- /* get the call chain order */
- if (!strncmp(tok2, "caller", strlen("caller")))
- callchain_param.order = ORDER_CALLER;
- else if (!strncmp(tok2, "callee", strlen("callee")))
- callchain_param.order = ORDER_CALLEE;
- else
- return -1;
-
- /* Get the sort key */
- tok2 = strtok(NULL, ",");
- if (!tok2)
- goto setup;
- if (!strncmp(tok2, "function", strlen("function")))
- callchain_param.key = CCKEY_FUNCTION;
- else if (!strncmp(tok2, "address", strlen("address")))
- callchain_param.key = CCKEY_ADDRESS;
- else
- return -1;
-setup:
if (callchain_register_param(&callchain_param) < 0) {
pr_err("Can't register callchain params\n");
return -1;
diff --git a/tools/perf/util/cloexec.c b/tools/perf/util/cloexec.c
index c5d05ec..47b78b3 100644
--- a/tools/perf/util/cloexec.c
+++ b/tools/perf/util/cloexec.c
@@ -1,7 +1,9 @@
+#include <sched.h>
#include "util.h"
#include "../perf.h"
#include "cloexec.h"
#include "asm/bug.h"
+#include "debug.h"
static unsigned long flag = PERF_FLAG_FD_CLOEXEC;
@@ -9,15 +11,30 @@
{
/* use 'safest' configuration as used in perf_evsel__fallback() */
struct perf_event_attr attr = {
- .type = PERF_COUNT_SW_CPU_CLOCK,
+ .type = PERF_TYPE_SOFTWARE,
.config = PERF_COUNT_SW_CPU_CLOCK,
+ .exclude_kernel = 1,
};
int fd;
int err;
+ int cpu;
+ pid_t pid = -1;
+ char sbuf[STRERR_BUFSIZE];
- /* check cloexec flag */
- fd = sys_perf_event_open(&attr, 0, -1, -1,
- PERF_FLAG_FD_CLOEXEC);
+ cpu = sched_getcpu();
+ if (cpu < 0)
+ cpu = 0;
+
+ while (1) {
+ /* check cloexec flag */
+ fd = sys_perf_event_open(&attr, pid, cpu, -1,
+ PERF_FLAG_FD_CLOEXEC);
+ if (fd < 0 && pid == -1 && errno == EACCES) {
+ pid = 0;
+ continue;
+ }
+ break;
+ }
err = errno;
if (fd >= 0) {
@@ -25,17 +42,17 @@
return 1;
}
- WARN_ONCE(err != EINVAL,
+ WARN_ONCE(err != EINVAL && err != EBUSY,
"perf_event_open(..., PERF_FLAG_FD_CLOEXEC) failed with unexpected error %d (%s)\n",
- err, strerror(err));
+ err, strerror_r(err, sbuf, sizeof(sbuf)));
/* not supported, confirm error related to PERF_FLAG_FD_CLOEXEC */
- fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+ fd = sys_perf_event_open(&attr, pid, cpu, -1, 0);
err = errno;
- if (WARN_ONCE(fd < 0,
+ if (WARN_ONCE(fd < 0 && err != EBUSY,
"perf_event_open(..., 0) failed unexpectedly with error %d (%s)\n",
- err, strerror(err)))
+ err, strerror_r(err, sbuf, sizeof(sbuf))))
return -1;
close(fd);
diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c
index 87b8672..f465418 100644
--- a/tools/perf/util/color.c
+++ b/tools/perf/util/color.c
@@ -335,3 +335,19 @@
va_end(args);
return value_color_snprintf(bf, size, fmt, percent);
}
+
+int percent_color_len_snprintf(char *bf, size_t size, const char *fmt, ...)
+{
+ va_list args;
+ int len;
+ double percent;
+ const char *color;
+
+ va_start(args, fmt);
+ len = va_arg(args, int);
+ percent = va_arg(args, double);
+ va_end(args);
+
+ color = get_percent_color(percent);
+ return color_snprintf(bf, size, color, fmt, len, percent);
+}
diff --git a/tools/perf/util/color.h b/tools/perf/util/color.h
index 7ff30a6..0a594b8 100644
--- a/tools/perf/util/color.h
+++ b/tools/perf/util/color.h
@@ -41,6 +41,7 @@
int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf);
int value_color_snprintf(char *bf, size_t size, const char *fmt, double value);
int percent_color_snprintf(char *bf, size_t size, const char *fmt, ...);
+int percent_color_len_snprintf(char *bf, size_t size, const char *fmt, ...);
int percent_color_fprintf(FILE *fp, const char *fmt, double percent);
const char *get_percent_color(double percent);
diff --git a/tools/perf/util/comm.c b/tools/perf/util/comm.c
index f9e7776..b2bb59d 100644
--- a/tools/perf/util/comm.c
+++ b/tools/perf/util/comm.c
@@ -74,7 +74,7 @@
return new;
}
-struct comm *comm__new(const char *str, u64 timestamp)
+struct comm *comm__new(const char *str, u64 timestamp, bool exec)
{
struct comm *comm = zalloc(sizeof(*comm));
@@ -82,6 +82,7 @@
return NULL;
comm->start = timestamp;
+ comm->exec = exec;
comm->comm_str = comm_str__findnew(str, &comm_str_root);
if (!comm->comm_str) {
@@ -94,7 +95,7 @@
return comm;
}
-int comm__override(struct comm *comm, const char *str, u64 timestamp)
+int comm__override(struct comm *comm, const char *str, u64 timestamp, bool exec)
{
struct comm_str *new, *old = comm->comm_str;
@@ -106,6 +107,8 @@
comm_str__put(old);
comm->comm_str = new;
comm->start = timestamp;
+ if (exec)
+ comm->exec = true;
return 0;
}
diff --git a/tools/perf/util/comm.h b/tools/perf/util/comm.h
index fac5bd5..51c10ab 100644
--- a/tools/perf/util/comm.h
+++ b/tools/perf/util/comm.h
@@ -11,11 +11,13 @@
struct comm_str *comm_str;
u64 start;
struct list_head list;
+ bool exec;
};
void comm__free(struct comm *comm);
-struct comm *comm__new(const char *str, u64 timestamp);
+struct comm *comm__new(const char *str, u64 timestamp, bool exec);
const char *comm__str(const struct comm *comm);
-int comm__override(struct comm *comm, const char *str, u64 timestamp);
+int comm__override(struct comm *comm, const char *str, u64 timestamp,
+ bool exec);
#endif /* __PERF_COMM_H */
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 1e5e2e5..9970b8b 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -286,6 +286,21 @@
return 0;
}
+static int perf_parse_llong(const char *value, long long *ret)
+{
+ if (value && *value) {
+ char *end;
+ long long val = strtoll(value, &end, 0);
+ unsigned long factor = 1;
+
+ if (!parse_unit_factor(end, &factor))
+ return 0;
+ *ret = val * factor;
+ return 1;
+ }
+ return 0;
+}
+
static int perf_parse_long(const char *value, long *ret)
{
if (value && *value) {
@@ -307,6 +322,15 @@
die("bad config value for '%s'", name);
}
+u64 perf_config_u64(const char *name, const char *value)
+{
+ long long ret = 0;
+
+ if (!perf_parse_llong(value, &ret))
+ die_bad_config(name);
+ return (u64) ret;
+}
+
int perf_config_int(const char *name, const char *value)
{
long ret = 0;
diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
index 29d720c..1921942 100644
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@@ -50,12 +50,14 @@
{
struct stat st;
int fd;
+ char sbuf[STRERR_BUFSIZE];
fd = open(file->path, O_RDONLY);
if (fd < 0) {
int err = errno;
- pr_err("failed to open %s: %s", file->path, strerror(err));
+ pr_err("failed to open %s: %s", file->path,
+ strerror_r(err, sbuf, sizeof(sbuf)));
if (err == ENOENT && !strcmp(file->path, "perf.data"))
pr_err(" (try 'perf record' first)");
pr_err("\n");
@@ -88,6 +90,7 @@
static int open_file_write(struct perf_data_file *file)
{
int fd;
+ char sbuf[STRERR_BUFSIZE];
if (check_backup(file))
return -1;
@@ -95,7 +98,8 @@
fd = open(file->path, O_CREAT|O_RDWR|O_TRUNC, S_IRUSR|S_IWUSR);
if (fd < 0)
- pr_err("failed to open %s : %s\n", file->path, strerror(errno));
+ pr_err("failed to open %s : %s\n", file->path,
+ strerror_r(errno, sbuf, sizeof(sbuf)));
return fd;
}
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index 71d4193..ba357f3 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -13,8 +13,12 @@
#include "util.h"
#include "target.h"
+#define NSECS_PER_SEC 1000000000ULL
+#define NSECS_PER_USEC 1000ULL
+
int verbose;
bool dump_trace = false, quiet = false;
+int debug_ordered_events;
static int _eprintf(int level, int var, const char *fmt, va_list args)
{
@@ -42,6 +46,35 @@
return ret;
}
+static int __eprintf_time(u64 t, const char *fmt, va_list args)
+{
+ int ret = 0;
+ u64 secs, usecs, nsecs = t;
+
+ secs = nsecs / NSECS_PER_SEC;
+ nsecs -= secs * NSECS_PER_SEC;
+ usecs = nsecs / NSECS_PER_USEC;
+
+ ret = fprintf(stderr, "[%13" PRIu64 ".%06" PRIu64 "] ",
+ secs, usecs);
+ ret += vfprintf(stderr, fmt, args);
+ return ret;
+}
+
+int eprintf_time(int level, int var, u64 t, const char *fmt, ...)
+{
+ int ret = 0;
+ va_list args;
+
+ if (var >= level) {
+ va_start(args, fmt);
+ ret = __eprintf_time(t, fmt, args);
+ va_end(args);
+ }
+
+ return ret;
+}
+
/*
* Overloading libtraceevent standard info print
* function, display with -v in perf.
@@ -110,7 +143,8 @@
const char *name;
int *ptr;
} debug_variables[] = {
- { .name = "verbose", .ptr = &verbose },
+ { .name = "verbose", .ptr = &verbose },
+ { .name = "ordered-events", .ptr = &debug_ordered_events},
{ .name = NULL, }
};
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index 89fb6b0..be264d6 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -3,6 +3,7 @@
#define __PERF_DEBUG_H
#include <stdbool.h>
+#include <string.h>
#include "event.h"
#include "../ui/helpline.h"
#include "../ui/progress.h"
@@ -10,6 +11,7 @@
extern int verbose;
extern bool quiet, dump_trace;
+extern int debug_ordered_events;
#ifndef pr_fmt
#define pr_fmt(fmt) fmt
@@ -29,6 +31,14 @@
#define pr_debug3(fmt, ...) pr_debugN(3, pr_fmt(fmt), ##__VA_ARGS__)
#define pr_debug4(fmt, ...) pr_debugN(4, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_time_N(n, var, t, fmt, ...) \
+ eprintf_time(n, var, t, fmt, ##__VA_ARGS__)
+
+#define pr_oe_time(t, fmt, ...) pr_time_N(1, debug_ordered_events, t, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_oe_time2(t, fmt, ...) pr_time_N(2, debug_ordered_events, t, pr_fmt(fmt), ##__VA_ARGS__)
+
+#define STRERR_BUFSIZE 128 /* For the buffer size of strerror_r */
+
int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
void trace_event(union perf_event *event);
@@ -38,6 +48,7 @@
void pr_stat(const char *fmt, ...);
int eprintf(int level, int var, const char *fmt, ...) __attribute__((format(printf, 3, 4)));
+int eprintf_time(int level, int var, u64 t, const char *fmt, ...) __attribute__((format(printf, 4, 5)));
int perf_debug_option(const char *str);
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 90d02c66..55e39dc 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -37,6 +37,7 @@
{
char build_id_hex[BUILD_ID_SIZE * 2 + 1];
int ret = 0;
+ size_t len;
switch (type) {
case DSO_BINARY_TYPE__DEBUGLINK: {
@@ -60,26 +61,25 @@
break;
case DSO_BINARY_TYPE__FEDORA_DEBUGINFO:
- snprintf(filename, size, "%s/usr/lib/debug%s.debug",
- symbol_conf.symfs, dso->long_name);
+ len = __symbol__join_symfs(filename, size, "/usr/lib/debug");
+ snprintf(filename + len, size - len, "%s.debug", dso->long_name);
break;
case DSO_BINARY_TYPE__UBUNTU_DEBUGINFO:
- snprintf(filename, size, "%s/usr/lib/debug%s",
- symbol_conf.symfs, dso->long_name);
+ len = __symbol__join_symfs(filename, size, "/usr/lib/debug");
+ snprintf(filename + len, size - len, "%s", dso->long_name);
break;
case DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO:
{
const char *last_slash;
- size_t len;
size_t dir_size;
last_slash = dso->long_name + dso->long_name_len;
while (last_slash != dso->long_name && *last_slash != '/')
last_slash--;
- len = scnprintf(filename, size, "%s", symbol_conf.symfs);
+ len = __symbol__join_symfs(filename, size, "");
dir_size = last_slash - dso->long_name + 2;
if (dir_size > (size - len)) {
ret = -1;
@@ -100,26 +100,24 @@
build_id__sprintf(dso->build_id,
sizeof(dso->build_id),
build_id_hex);
- snprintf(filename, size,
- "%s/usr/lib/debug/.build-id/%.2s/%s.debug",
- symbol_conf.symfs, build_id_hex, build_id_hex + 2);
+ len = __symbol__join_symfs(filename, size, "/usr/lib/debug/.build-id/");
+ snprintf(filename + len, size - len, "%.2s/%s.debug",
+ build_id_hex, build_id_hex + 2);
break;
case DSO_BINARY_TYPE__VMLINUX:
case DSO_BINARY_TYPE__GUEST_VMLINUX:
case DSO_BINARY_TYPE__SYSTEM_PATH_DSO:
- snprintf(filename, size, "%s%s",
- symbol_conf.symfs, dso->long_name);
+ __symbol__join_symfs(filename, size, dso->long_name);
break;
case DSO_BINARY_TYPE__GUEST_KMODULE:
- snprintf(filename, size, "%s%s%s", symbol_conf.symfs,
- root_dir, dso->long_name);
+ path__join3(filename, size, symbol_conf.symfs,
+ root_dir, dso->long_name);
break;
case DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE:
- snprintf(filename, size, "%s%s", symbol_conf.symfs,
- dso->long_name);
+ __symbol__join_symfs(filename, size, dso->long_name);
break;
case DSO_BINARY_TYPE__KCORE:
@@ -164,13 +162,15 @@
static int do_open(char *name)
{
int fd;
+ char sbuf[STRERR_BUFSIZE];
do {
fd = open(name, O_RDONLY);
if (fd >= 0)
return fd;
- pr_debug("dso open failed, mmap: %s\n", strerror(errno));
+ pr_debug("dso open failed, mmap: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
if (!dso__data_open_cnt || errno != EMFILE)
break;
@@ -532,10 +532,12 @@
static int data_file_size(struct dso *dso)
{
struct stat st;
+ char sbuf[STRERR_BUFSIZE];
if (!dso->data.file_size) {
if (fstat(dso->data.fd, &st)) {
- pr_err("dso mmap failed, fstat: %s\n", strerror(errno));
+ pr_err("dso mmap failed, fstat: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
return -1;
}
dso->data.file_size = st.st_size;
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 1398c83..ed55819 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -784,9 +784,9 @@
* "[vdso]" dso, but for now lets use the old trick of looking
* in the whole kernel symbol list.
*/
- if ((long long)al->addr < 0 &&
- cpumode == PERF_RECORD_MISC_USER &&
- machine && mg != &machine->kmaps) {
+ if (cpumode == PERF_RECORD_MISC_USER && machine &&
+ mg != &machine->kmaps &&
+ machine__kernel_ip(machine, al->addr)) {
mg = &machine->kmaps;
load_map = true;
goto try_again;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 94d6976..7eb7107 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -156,6 +156,8 @@
u32 cpu;
u32 raw_size;
u64 data_src;
+ u32 flags;
+ u16 insn_len;
void *raw_data;
struct ip_callchain *callchain;
struct branch_stack *branch_stack;
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 814e954..a3e28b4 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -122,6 +122,7 @@
{
list_add_tail(&entry->node, &evlist->entries);
entry->idx = evlist->nr_entries;
+ entry->tracking = !entry->idx;
if (!evlist->nr_entries++)
perf_evlist__set_id_pos(evlist);
@@ -265,17 +266,27 @@
return 0;
}
+static int perf_evlist__nr_threads(struct perf_evlist *evlist,
+ struct perf_evsel *evsel)
+{
+ if (evsel->system_wide)
+ return 1;
+ else
+ return thread_map__nr(evlist->threads);
+}
+
void perf_evlist__disable(struct perf_evlist *evlist)
{
int cpu, thread;
struct perf_evsel *pos;
int nr_cpus = cpu_map__nr(evlist->cpus);
- int nr_threads = thread_map__nr(evlist->threads);
+ int nr_threads;
for (cpu = 0; cpu < nr_cpus; cpu++) {
evlist__for_each(evlist, pos) {
if (!perf_evsel__is_group_leader(pos) || !pos->fd)
continue;
+ nr_threads = perf_evlist__nr_threads(evlist, pos);
for (thread = 0; thread < nr_threads; thread++)
ioctl(FD(pos, cpu, thread),
PERF_EVENT_IOC_DISABLE, 0);
@@ -288,12 +299,13 @@
int cpu, thread;
struct perf_evsel *pos;
int nr_cpus = cpu_map__nr(evlist->cpus);
- int nr_threads = thread_map__nr(evlist->threads);
+ int nr_threads;
for (cpu = 0; cpu < nr_cpus; cpu++) {
evlist__for_each(evlist, pos) {
if (!perf_evsel__is_group_leader(pos) || !pos->fd)
continue;
+ nr_threads = perf_evlist__nr_threads(evlist, pos);
for (thread = 0; thread < nr_threads; thread++)
ioctl(FD(pos, cpu, thread),
PERF_EVENT_IOC_ENABLE, 0);
@@ -305,12 +317,14 @@
struct perf_evsel *evsel)
{
int cpu, thread, err;
+ int nr_cpus = cpu_map__nr(evlist->cpus);
+ int nr_threads = perf_evlist__nr_threads(evlist, evsel);
if (!evsel->fd)
return 0;
- for (cpu = 0; cpu < evlist->cpus->nr; cpu++) {
- for (thread = 0; thread < evlist->threads->nr; thread++) {
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ for (thread = 0; thread < nr_threads; thread++) {
err = ioctl(FD(evsel, cpu, thread),
PERF_EVENT_IOC_DISABLE, 0);
if (err)
@@ -324,12 +338,14 @@
struct perf_evsel *evsel)
{
int cpu, thread, err;
+ int nr_cpus = cpu_map__nr(evlist->cpus);
+ int nr_threads = perf_evlist__nr_threads(evlist, evsel);
if (!evsel->fd)
return -EINVAL;
- for (cpu = 0; cpu < evlist->cpus->nr; cpu++) {
- for (thread = 0; thread < evlist->threads->nr; thread++) {
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ for (thread = 0; thread < nr_threads; thread++) {
err = ioctl(FD(evsel, cpu, thread),
PERF_EVENT_IOC_ENABLE, 0);
if (err)
@@ -339,11 +355,67 @@
return 0;
}
+static int perf_evlist__enable_event_cpu(struct perf_evlist *evlist,
+ struct perf_evsel *evsel, int cpu)
+{
+ int thread, err;
+ int nr_threads = perf_evlist__nr_threads(evlist, evsel);
+
+ if (!evsel->fd)
+ return -EINVAL;
+
+ for (thread = 0; thread < nr_threads; thread++) {
+ err = ioctl(FD(evsel, cpu, thread),
+ PERF_EVENT_IOC_ENABLE, 0);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int perf_evlist__enable_event_thread(struct perf_evlist *evlist,
+ struct perf_evsel *evsel,
+ int thread)
+{
+ int cpu, err;
+ int nr_cpus = cpu_map__nr(evlist->cpus);
+
+ if (!evsel->fd)
+ return -EINVAL;
+
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ err = ioctl(FD(evsel, cpu, thread), PERF_EVENT_IOC_ENABLE, 0);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+int perf_evlist__enable_event_idx(struct perf_evlist *evlist,
+ struct perf_evsel *evsel, int idx)
+{
+ bool per_cpu_mmaps = !cpu_map__empty(evlist->cpus);
+
+ if (per_cpu_mmaps)
+ return perf_evlist__enable_event_cpu(evlist, evsel, idx);
+ else
+ return perf_evlist__enable_event_thread(evlist, evsel, idx);
+}
+
static int perf_evlist__alloc_pollfd(struct perf_evlist *evlist)
{
int nr_cpus = cpu_map__nr(evlist->cpus);
int nr_threads = thread_map__nr(evlist->threads);
- int nfds = nr_cpus * nr_threads * evlist->nr_entries;
+ int nfds = 0;
+ struct perf_evsel *evsel;
+
+ list_for_each_entry(evsel, &evlist->entries, node) {
+ if (evsel->system_wide)
+ nfds += nr_cpus;
+ else
+ nfds += nr_cpus * nr_threads;
+ }
+
evlist->pollfd = malloc(sizeof(struct pollfd) * nfds);
return evlist->pollfd != NULL ? 0 : -ENOMEM;
}
@@ -636,7 +708,12 @@
struct perf_evsel *evsel;
evlist__for_each(evlist, evsel) {
- int fd = FD(evsel, cpu, thread);
+ int fd;
+
+ if (evsel->system_wide && thread)
+ continue;
+
+ fd = FD(evsel, cpu, thread);
if (*output == -1) {
*output = fd;
@@ -1061,6 +1138,8 @@
}
if (!evlist->workload.pid) {
+ int ret;
+
if (pipe_output)
dup2(2, 1);
@@ -1078,8 +1157,22 @@
/*
* Wait until the parent tells us to go.
*/
- if (read(go_pipe[0], &bf, 1) == -1)
- perror("unable to read pipe");
+ ret = read(go_pipe[0], &bf, 1);
+ /*
+ * The parent will ask for the execvp() to be performed by
+ * writing exactly one byte, in workload.cork_fd, usually via
+ * perf_evlist__start_workload().
+ *
+ * For cancelling the workload without actually running it,
+ * the parent will just close workload.cork_fd, without writing
+ * anything, i.e. read will return zero and we just exit()
+ * here.
+ */
+ if (ret != 1) {
+ if (ret == -1)
+ perror("unable to read pipe");
+ exit(ret);
+ }
execvp(argv[0], (char **)argv);
@@ -1202,7 +1295,7 @@
int err, char *buf, size_t size)
{
int printed, value;
- char sbuf[128], *emsg = strerror_r(err, sbuf, sizeof(sbuf));
+ char sbuf[STRERR_BUFSIZE], *emsg = strerror_r(err, sbuf, sizeof(sbuf));
switch (err) {
case EACCES:
@@ -1250,3 +1343,19 @@
list_splice(&move, &evlist->entries);
}
+
+void perf_evlist__set_tracking_event(struct perf_evlist *evlist,
+ struct perf_evsel *tracking_evsel)
+{
+ struct perf_evsel *evsel;
+
+ if (tracking_evsel->tracking)
+ return;
+
+ evlist__for_each(evlist, evsel) {
+ if (evsel != tracking_evsel)
+ evsel->tracking = false;
+ }
+
+ tracking_evsel->tracking = true;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index f5173cd..106de53 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -122,6 +122,8 @@
struct perf_evsel *evsel);
int perf_evlist__enable_event(struct perf_evlist *evlist,
struct perf_evsel *evsel);
+int perf_evlist__enable_event_idx(struct perf_evlist *evlist,
+ struct perf_evsel *evsel, int idx);
void perf_evlist__set_selected(struct perf_evlist *evlist,
struct perf_evsel *evsel);
@@ -262,4 +264,7 @@
#define evlist__for_each_safe(evlist, tmp, evsel) \
__evlist__for_each_safe(&(evlist)->entries, tmp, evsel)
+void perf_evlist__set_tracking_event(struct perf_evlist *evlist,
+ struct perf_evsel *tracking_evsel);
+
#endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 21a373e..b38de58 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -162,6 +162,7 @@
struct perf_event_attr *attr, int idx)
{
evsel->idx = idx;
+ evsel->tracking = !idx;
evsel->attr = *attr;
evsel->leader = evsel;
evsel->unit = "";
@@ -561,7 +562,7 @@
{
struct perf_evsel *leader = evsel->leader;
struct perf_event_attr *attr = &evsel->attr;
- int track = !evsel->idx; /* only the first counter needs these */
+ int track = evsel->tracking;
bool per_cpu = opts->target.default_per_cpu && !opts->target.per_thread;
attr->sample_id_all = perf_missing_features.sample_id_all ? 0 : 1;
@@ -633,9 +634,12 @@
if (opts->period)
perf_evsel__set_sample_bit(evsel, PERIOD);
- if (!perf_missing_features.sample_id_all &&
- (opts->sample_time || !opts->no_inherit ||
- target__has_cpu(&opts->target) || per_cpu))
+ /*
+ * When the user explicitly disabled time, don't force it here.
+ */
+ if (opts->sample_time &&
+ (!perf_missing_features.sample_id_all &&
+ (!opts->no_inherit || target__has_cpu(&opts->target) || per_cpu)))
perf_evsel__set_sample_bit(evsel, TIME);
if (opts->raw_samples && !evsel->no_aux_samples) {
@@ -692,6 +696,10 @@
int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
{
int cpu, thread;
+
+ if (evsel->system_wide)
+ nthreads = 1;
+
evsel->fd = xyarray__new(ncpus, nthreads, sizeof(int));
if (evsel->fd) {
@@ -710,6 +718,9 @@
{
int cpu, thread;
+ if (evsel->system_wide)
+ nthreads = 1;
+
for (cpu = 0; cpu < ncpus; cpu++) {
for (thread = 0; thread < nthreads; thread++) {
int fd = FD(evsel, cpu, thread),
@@ -740,6 +751,9 @@
int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads)
{
+ if (evsel->system_wide)
+ nthreads = 1;
+
evsel->sample_id = xyarray__new(ncpus, nthreads, sizeof(struct perf_sample_id));
if (evsel->sample_id == NULL)
return -ENOMEM;
@@ -784,6 +798,9 @@
{
int cpu, thread;
+ if (evsel->system_wide)
+ nthreads = 1;
+
for (cpu = 0; cpu < ncpus; cpu++)
for (thread = 0; thread < nthreads; ++thread) {
close(FD(evsel, cpu, thread));
@@ -872,6 +889,9 @@
int cpu, thread;
struct perf_counts_values *aggr = &evsel->counts->aggr, count;
+ if (evsel->system_wide)
+ nthreads = 1;
+
aggr->val = aggr->ena = aggr->run = 0;
for (cpu = 0; cpu < ncpus; cpu++) {
@@ -994,13 +1014,18 @@
static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
struct thread_map *threads)
{
- int cpu, thread;
+ int cpu, thread, nthreads;
unsigned long flags = PERF_FLAG_FD_CLOEXEC;
int pid = -1, err;
enum { NO_CHANGE, SET_TO_MAX, INCREASED_MAX } set_rlimit = NO_CHANGE;
+ if (evsel->system_wide)
+ nthreads = 1;
+ else
+ nthreads = threads->nr;
+
if (evsel->fd == NULL &&
- perf_evsel__alloc_fd(evsel, cpus->nr, threads->nr) < 0)
+ perf_evsel__alloc_fd(evsel, cpus->nr, nthreads) < 0)
return -ENOMEM;
if (evsel->cgrp) {
@@ -1024,10 +1049,10 @@
for (cpu = 0; cpu < cpus->nr; cpu++) {
- for (thread = 0; thread < threads->nr; thread++) {
+ for (thread = 0; thread < nthreads; thread++) {
int group_fd;
- if (!evsel->cgrp)
+ if (!evsel->cgrp && !evsel->system_wide)
pid = threads->map[thread];
group_fd = get_group_fd(evsel, cpu, thread);
@@ -1100,7 +1125,7 @@
close(FD(evsel, cpu, thread));
FD(evsel, cpu, thread) = -1;
}
- thread = threads->nr;
+ thread = nthreads;
} while (--cpu >= 0);
return err;
}
@@ -2002,6 +2027,8 @@
int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
int err, char *msg, size_t size)
{
+ char sbuf[STRERR_BUFSIZE];
+
switch (err) {
case EPERM:
case EACCES:
@@ -2036,13 +2063,20 @@
"No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it.");
#endif
break;
+ case EBUSY:
+ if (find_process("oprofiled"))
+ return scnprintf(msg, size,
+ "The PMU counters are busy/taken by another profiler.\n"
+ "We found oprofile daemon running, please stop it and try again.");
+ break;
default:
break;
}
return scnprintf(msg, size,
- "The sys_perf_event_open() syscall returned with %d (%s) for event (%s). \n"
+ "The sys_perf_event_open() syscall returned with %d (%s) for event (%s).\n"
"/bin/dmesg may provide additional information.\n"
"No CONFIG_PERF_EVENTS=y kernel support configured?\n",
- err, strerror(err), perf_evsel__name(evsel));
+ err, strerror_r(err, sbuf, sizeof(sbuf)),
+ perf_evsel__name(evsel));
}
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index d7f93ce..7bc314b 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -85,6 +85,8 @@
bool needs_swap;
bool no_aux_samples;
bool immediate;
+ bool system_wide;
+ bool tracking;
/* parse modifier helper */
int exclude_GH;
int nr_members;
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 30df618..86569fa 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -277,6 +277,28 @@
}
}
+void hists__delete_entries(struct hists *hists)
+{
+ struct rb_node *next = rb_first(&hists->entries);
+ struct hist_entry *n;
+
+ while (next) {
+ n = rb_entry(next, struct hist_entry, rb_node);
+ next = rb_next(&n->rb_node);
+
+ rb_erase(&n->rb_node, &hists->entries);
+
+ if (sort__need_collapse)
+ rb_erase(&n->rb_node_in, &hists->entries_collapsed);
+
+ --hists->nr_entries;
+ if (!n->filtered)
+ --hists->nr_non_filtered_entries;
+
+ hist_entry__free(n);
+ }
+}
+
/*
* histogram, sorted on item, collects periods
*/
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 742f49a..8c9c70e 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -152,6 +152,7 @@
void hists__collapse_resort(struct hists *hists, struct ui_progress *prog);
void hists__decay_entries(struct hists *hists, bool zap_user, bool zap_kernel);
+void hists__delete_entries(struct hists *hists);
void hists__output_recalc_col_len(struct hists *hists, int max_rows);
u64 hists__total_period(struct hists *hists);
@@ -192,6 +193,7 @@
};
struct perf_hpp_fmt {
+ const char *name;
int (*header)(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
struct perf_evsel *evsel);
int (*width)(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
@@ -207,6 +209,8 @@
struct list_head list;
struct list_head sort_list;
bool elide;
+ int len;
+ int user_len;
};
extern struct list_head perf_hpp__list;
@@ -261,17 +265,19 @@
}
void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists);
+void perf_hpp__reset_sort_width(struct perf_hpp_fmt *fmt, struct hists *hists);
+void perf_hpp__set_user_width(const char *width_list_str);
typedef u64 (*hpp_field_fn)(struct hist_entry *he);
typedef int (*hpp_callback_fn)(struct perf_hpp *hpp, bool front);
typedef int (*hpp_snprint_fn)(struct perf_hpp *hpp, const char *fmt, ...);
-int __hpp__fmt(struct perf_hpp *hpp, struct hist_entry *he,
- hpp_field_fn get_field, const char *fmt,
- hpp_snprint_fn print_fn, bool fmt_percent);
-int __hpp__fmt_acc(struct perf_hpp *hpp, struct hist_entry *he,
- hpp_field_fn get_field, const char *fmt,
- hpp_snprint_fn print_fn, bool fmt_percent);
+int hpp__fmt(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
+ struct hist_entry *he, hpp_field_fn get_field,
+ const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent);
+int hpp__fmt_acc(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
+ struct hist_entry *he, hpp_field_fn get_field,
+ const char *fmtstr, hpp_snprint_fn print_fn, bool fmt_percent);
static inline void advance_hpp(struct perf_hpp *hpp, int inc)
{
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 16bba9f..b2ec38b 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -31,6 +31,8 @@
machine->symbol_filter = NULL;
machine->id_hdr_size = 0;
+ machine->comm_exec = false;
+ machine->kernel_start = 0;
machine->root_dir = strdup(root_dir);
if (machine->root_dir == NULL)
@@ -179,6 +181,19 @@
}
}
+void machines__set_comm_exec(struct machines *machines, bool comm_exec)
+{
+ struct rb_node *nd;
+
+ machines->host.comm_exec = comm_exec;
+
+ for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) {
+ struct machine *machine = rb_entry(nd, struct machine, rb_node);
+
+ machine->comm_exec = comm_exec;
+ }
+}
+
struct machine *machines__find(struct machines *machines, pid_t pid)
{
struct rb_node **p = &machines->guests.rb_node;
@@ -398,17 +413,31 @@
return __machine__findnew_thread(machine, pid, tid, false);
}
+struct comm *machine__thread_exec_comm(struct machine *machine,
+ struct thread *thread)
+{
+ if (machine->comm_exec)
+ return thread__exec_comm(thread);
+ else
+ return thread__comm(thread);
+}
+
int machine__process_comm_event(struct machine *machine, union perf_event *event,
struct perf_sample *sample)
{
struct thread *thread = machine__findnew_thread(machine,
event->comm.pid,
event->comm.tid);
+ bool exec = event->header.misc & PERF_RECORD_MISC_COMM_EXEC;
+
+ if (exec)
+ machine->comm_exec = true;
if (dump_trace)
perf_event__fprintf_comm(event, stdout);
- if (thread == NULL || thread__set_comm(thread, event->comm.comm, sample->time)) {
+ if (thread == NULL ||
+ __thread__set_comm(thread, event->comm.comm, sample->time, exec)) {
dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
return -1;
}
@@ -565,8 +594,8 @@
* Returns the name of the start symbol in *symbol_name. Pass in NULL as
* symbol_name if it's not that important.
*/
-static u64 machine__get_kernel_start_addr(struct machine *machine,
- const char **symbol_name)
+static u64 machine__get_running_kernel_start(struct machine *machine,
+ const char **symbol_name)
{
char filename[PATH_MAX];
int i;
@@ -593,7 +622,7 @@
int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel)
{
enum map_type type;
- u64 start = machine__get_kernel_start_addr(machine, NULL);
+ u64 start = machine__get_running_kernel_start(machine, NULL);
for (type = 0; type < MAP__NR_TYPES; ++type) {
struct kmap *kmap;
@@ -912,7 +941,7 @@
{
struct dso *kernel = machine__get_kernel(machine);
const char *name;
- u64 addr = machine__get_kernel_start_addr(machine, &name);
+ u64 addr = machine__get_running_kernel_start(machine, &name);
if (!addr)
return -1;
@@ -1285,6 +1314,16 @@
thread__find_addr_location(thread, machine, m, MAP__VARIABLE, addr,
&al);
+ if (al.map == NULL) {
+ /*
+ * some shared data regions have execute bit set which puts
+ * their mapping in the MAP__FUNCTION type array.
+ * Check there as a fallback option before dropping the sample.
+ */
+ thread__find_addr_location(thread, machine, m, MAP__FUNCTION, addr,
+ &al);
+ }
+
ams->addr = addr;
ams->al_addr = al.addr;
ams->sym = al.sym;
@@ -1531,3 +1570,25 @@
return 0;
}
+
+int machine__get_kernel_start(struct machine *machine)
+{
+ struct map *map = machine__kernel_map(machine, MAP__FUNCTION);
+ int err = 0;
+
+ /*
+ * The only addresses above 2^63 are kernel addresses of a 64-bit
+ * kernel. Note that addresses are unsigned so that on a 32-bit system
+ * all addresses including kernel addresses are less than 2^32. In
+ * that case (32-bit system), if the kernel mapping is unknown, all
+ * addresses will be assumed to be in user space - see
+ * machine__kernel_ip().
+ */
+ machine->kernel_start = 1ULL << 63;
+ if (map) {
+ err = map__load(map, machine->symbol_filter);
+ if (map->start)
+ machine->kernel_start = map->start;
+ }
+ return err;
+}
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index b972824..6a6bcc1 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -26,6 +26,7 @@
struct rb_node rb_node;
pid_t pid;
u16 id_hdr_size;
+ bool comm_exec;
char *root_dir;
struct rb_root threads;
struct list_head dead_threads;
@@ -35,6 +36,7 @@
struct list_head kernel_dsos;
struct map_groups kmaps;
struct map *vmlinux_maps[MAP__NR_TYPES];
+ u64 kernel_start;
symbol_filter_t symbol_filter;
pid_t *current_tid;
};
@@ -45,8 +47,26 @@
return machine->vmlinux_maps[type];
}
+int machine__get_kernel_start(struct machine *machine);
+
+static inline u64 machine__kernel_start(struct machine *machine)
+{
+ if (!machine->kernel_start)
+ machine__get_kernel_start(machine);
+ return machine->kernel_start;
+}
+
+static inline bool machine__kernel_ip(struct machine *machine, u64 ip)
+{
+ u64 kernel_start = machine__kernel_start(machine);
+
+ return ip >= kernel_start;
+}
+
struct thread *machine__find_thread(struct machine *machine, pid_t pid,
pid_t tid);
+struct comm *machine__thread_exec_comm(struct machine *machine,
+ struct thread *thread);
int machine__process_comm_event(struct machine *machine, union perf_event *event,
struct perf_sample *sample);
@@ -88,6 +108,7 @@
void machines__set_symbol_filter(struct machines *machines,
symbol_filter_t symbol_filter);
+void machines__set_comm_exec(struct machines *machines, bool comm_exec);
struct machine *machine__new_host(void);
int machine__init(struct machine *machine, const char *root_dir, pid_t pid);
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 31b8905..b709059 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -31,6 +31,7 @@
static inline int is_no_dso_memory(const char *filename)
{
return !strncmp(filename, "[stack", 6) ||
+ !strncmp(filename, "/SYSV",5) ||
!strcmp(filename, "[heap]");
}
diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c
new file mode 100644
index 0000000..706ce1a
--- /dev/null
+++ b/tools/perf/util/ordered-events.c
@@ -0,0 +1,245 @@
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include "ordered-events.h"
+#include "evlist.h"
+#include "session.h"
+#include "asm/bug.h"
+#include "debug.h"
+
+#define pr_N(n, fmt, ...) \
+ eprintf(n, debug_ordered_events, fmt, ##__VA_ARGS__)
+
+#define pr(fmt, ...) pr_N(1, pr_fmt(fmt), ##__VA_ARGS__)
+
+static void queue_event(struct ordered_events *oe, struct ordered_event *new)
+{
+ struct ordered_event *last = oe->last;
+ u64 timestamp = new->timestamp;
+ struct list_head *p;
+
+ ++oe->nr_events;
+ oe->last = new;
+
+ pr_oe_time2(timestamp, "queue_event nr_events %u\n", oe->nr_events);
+
+ if (!last) {
+ list_add(&new->list, &oe->events);
+ oe->max_timestamp = timestamp;
+ return;
+ }
+
+ /*
+ * last event might point to some random place in the list as it's
+ * the last queued event. We expect that the new event is close to
+ * this.
+ */
+ if (last->timestamp <= timestamp) {
+ while (last->timestamp <= timestamp) {
+ p = last->list.next;
+ if (p == &oe->events) {
+ list_add_tail(&new->list, &oe->events);
+ oe->max_timestamp = timestamp;
+ return;
+ }
+ last = list_entry(p, struct ordered_event, list);
+ }
+ list_add_tail(&new->list, &last->list);
+ } else {
+ while (last->timestamp > timestamp) {
+ p = last->list.prev;
+ if (p == &oe->events) {
+ list_add(&new->list, &oe->events);
+ return;
+ }
+ last = list_entry(p, struct ordered_event, list);
+ }
+ list_add(&new->list, &last->list);
+ }
+}
+
+#define MAX_SAMPLE_BUFFER (64 * 1024 / sizeof(struct ordered_event))
+static struct ordered_event *alloc_event(struct ordered_events *oe)
+{
+ struct list_head *cache = &oe->cache;
+ struct ordered_event *new = NULL;
+
+ if (!list_empty(cache)) {
+ new = list_entry(cache->next, struct ordered_event, list);
+ list_del(&new->list);
+ } else if (oe->buffer) {
+ new = oe->buffer + oe->buffer_idx;
+ if (++oe->buffer_idx == MAX_SAMPLE_BUFFER)
+ oe->buffer = NULL;
+ } else if (oe->cur_alloc_size < oe->max_alloc_size) {
+ size_t size = MAX_SAMPLE_BUFFER * sizeof(*new);
+
+ oe->buffer = malloc(size);
+ if (!oe->buffer)
+ return NULL;
+
+ pr("alloc size %" PRIu64 "B (+%zu), max %" PRIu64 "B\n",
+ oe->cur_alloc_size, size, oe->max_alloc_size);
+
+ oe->cur_alloc_size += size;
+ list_add(&oe->buffer->list, &oe->to_free);
+
+ /* First entry is abused to maintain the to_free list. */
+ oe->buffer_idx = 2;
+ new = oe->buffer + 1;
+ } else {
+ pr("allocation limit reached %" PRIu64 "B\n", oe->max_alloc_size);
+ }
+
+ return new;
+}
+
+struct ordered_event *
+ordered_events__new(struct ordered_events *oe, u64 timestamp)
+{
+ struct ordered_event *new;
+
+ new = alloc_event(oe);
+ if (new) {
+ new->timestamp = timestamp;
+ queue_event(oe, new);
+ }
+
+ return new;
+}
+
+void ordered_events__delete(struct ordered_events *oe, struct ordered_event *event)
+{
+ list_move(&event->list, &oe->cache);
+ oe->nr_events--;
+}
+
+static int __ordered_events__flush(struct perf_session *s,
+ struct perf_tool *tool)
+{
+ struct ordered_events *oe = &s->ordered_events;
+ struct list_head *head = &oe->events;
+ struct ordered_event *tmp, *iter;
+ struct perf_sample sample;
+ u64 limit = oe->next_flush;
+ u64 last_ts = oe->last ? oe->last->timestamp : 0ULL;
+ bool show_progress = limit == ULLONG_MAX;
+ struct ui_progress prog;
+ int ret;
+
+ if (!tool->ordered_events || !limit)
+ return 0;
+
+ if (show_progress)
+ ui_progress__init(&prog, oe->nr_events, "Processing time ordered events...");
+
+ list_for_each_entry_safe(iter, tmp, head, list) {
+ if (session_done())
+ return 0;
+
+ if (iter->timestamp > limit)
+ break;
+
+ ret = perf_evlist__parse_sample(s->evlist, iter->event, &sample);
+ if (ret)
+ pr_err("Can't parse sample, err = %d\n", ret);
+ else {
+ ret = perf_session__deliver_event(s, iter->event, &sample, tool,
+ iter->file_offset);
+ if (ret)
+ return ret;
+ }
+
+ ordered_events__delete(oe, iter);
+ oe->last_flush = iter->timestamp;
+
+ if (show_progress)
+ ui_progress__update(&prog, 1);
+ }
+
+ if (list_empty(head))
+ oe->last = NULL;
+ else if (last_ts <= limit)
+ oe->last = list_entry(head->prev, struct ordered_event, list);
+
+ return 0;
+}
+
+int ordered_events__flush(struct perf_session *s, struct perf_tool *tool,
+ enum oe_flush how)
+{
+ struct ordered_events *oe = &s->ordered_events;
+ static const char * const str[] = {
+ "NONE",
+ "FINAL",
+ "ROUND",
+ "HALF ",
+ };
+ int err;
+
+ switch (how) {
+ case OE_FLUSH__FINAL:
+ oe->next_flush = ULLONG_MAX;
+ break;
+
+ case OE_FLUSH__HALF:
+ {
+ struct ordered_event *first, *last;
+ struct list_head *head = &oe->events;
+
+ first = list_entry(head->next, struct ordered_event, list);
+ last = oe->last;
+
+ /* Warn if we are called before any event got allocated. */
+ if (WARN_ONCE(!last || list_empty(head), "empty queue"))
+ return 0;
+
+ oe->next_flush = first->timestamp;
+ oe->next_flush += (last->timestamp - first->timestamp) / 2;
+ break;
+ }
+
+ case OE_FLUSH__ROUND:
+ case OE_FLUSH__NONE:
+ default:
+ break;
+ };
+
+ pr_oe_time(oe->next_flush, "next_flush - ordered_events__flush PRE %s, nr_events %u\n",
+ str[how], oe->nr_events);
+ pr_oe_time(oe->max_timestamp, "max_timestamp\n");
+
+ err = __ordered_events__flush(s, tool);
+
+ if (!err) {
+ if (how == OE_FLUSH__ROUND)
+ oe->next_flush = oe->max_timestamp;
+
+ oe->last_flush_type = how;
+ }
+
+ pr_oe_time(oe->next_flush, "next_flush - ordered_events__flush POST %s, nr_events %u\n",
+ str[how], oe->nr_events);
+ pr_oe_time(oe->last_flush, "last_flush\n");
+
+ return err;
+}
+
+void ordered_events__init(struct ordered_events *oe)
+{
+ INIT_LIST_HEAD(&oe->events);
+ INIT_LIST_HEAD(&oe->cache);
+ INIT_LIST_HEAD(&oe->to_free);
+ oe->max_alloc_size = (u64) -1;
+ oe->cur_alloc_size = 0;
+}
+
+void ordered_events__free(struct ordered_events *oe)
+{
+ while (!list_empty(&oe->to_free)) {
+ struct ordered_event *event;
+
+ event = list_entry(oe->to_free.next, struct ordered_event, list);
+ list_del(&event->list);
+ free(event);
+ }
+}
diff --git a/tools/perf/util/ordered-events.h b/tools/perf/util/ordered-events.h
new file mode 100644
index 0000000..3b2f205
--- /dev/null
+++ b/tools/perf/util/ordered-events.h
@@ -0,0 +1,51 @@
+#ifndef __ORDERED_EVENTS_H
+#define __ORDERED_EVENTS_H
+
+#include <linux/types.h>
+#include "tool.h"
+
+struct perf_session;
+
+struct ordered_event {
+ u64 timestamp;
+ u64 file_offset;
+ union perf_event *event;
+ struct list_head list;
+};
+
+enum oe_flush {
+ OE_FLUSH__NONE,
+ OE_FLUSH__FINAL,
+ OE_FLUSH__ROUND,
+ OE_FLUSH__HALF,
+};
+
+struct ordered_events {
+ u64 last_flush;
+ u64 next_flush;
+ u64 max_timestamp;
+ u64 max_alloc_size;
+ u64 cur_alloc_size;
+ struct list_head events;
+ struct list_head cache;
+ struct list_head to_free;
+ struct ordered_event *buffer;
+ struct ordered_event *last;
+ int buffer_idx;
+ unsigned int nr_events;
+ enum oe_flush last_flush_type;
+};
+
+struct ordered_event *ordered_events__new(struct ordered_events *oe, u64 timestamp);
+void ordered_events__delete(struct ordered_events *oe, struct ordered_event *event);
+int ordered_events__flush(struct perf_session *s, struct perf_tool *tool,
+ enum oe_flush how);
+void ordered_events__init(struct ordered_events *oe);
+void ordered_events__free(struct ordered_events *oe);
+
+static inline
+void ordered_events__set_alloc_size(struct ordered_events *oe, u64 size)
+{
+ oe->max_alloc_size = size;
+}
+#endif /* __ORDERED_EVENTS_H */
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 1e15df1..e34c81a 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -10,6 +10,7 @@
#include "symbol.h"
#include "cache.h"
#include "header.h"
+#include "debug.h"
#include <api/fs/debugfs.h>
#include "parse-events-bison.h"
#define YY_EXTRA_TYPE int
@@ -1006,9 +1007,11 @@
struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
char evt_path[MAXPATHLEN];
char dir_path[MAXPATHLEN];
+ char sbuf[STRERR_BUFSIZE];
if (debugfs_valid_mountpoint(tracing_events_path)) {
- printf(" [ Tracepoints not available: %s ]\n", strerror(errno));
+ printf(" [ Tracepoints not available: %s ]\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
return;
}
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 7a811eb..9bf5827 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -14,8 +14,8 @@
struct perf_pmu_alias {
char *name;
- struct list_head terms;
- struct list_head list;
+ struct list_head terms; /* HEAD struct parse_events_term -> list */
+ struct list_head list; /* ELEM */
char unit[UNIT_MAX_LEN+1];
double scale;
};
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index c14a543..1c1e2ee 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -17,9 +17,9 @@
char *name;
__u32 type;
struct cpu_map *cpus;
- struct list_head format;
- struct list_head aliases;
- struct list_head list;
+ struct list_head format; /* HEAD struct perf_pmu_format -> list */
+ struct list_head aliases; /* HEAD struct perf_pmu_alias -> list */
+ struct list_head list; /* ELEM */
};
struct perf_pmu *perf_pmu__find(const char *name);
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 9a0a183..f73595f 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -79,7 +79,7 @@
int ret;
symbol_conf.sort_by_name = true;
- ret = symbol__init();
+ ret = symbol__init(NULL);
if (ret < 0) {
pr_debug("Failed to init symbol map.\n");
goto out;
@@ -258,21 +258,33 @@
#ifdef HAVE_DWARF_SUPPORT
/* Open new debuginfo of given module */
-static struct debuginfo *open_debuginfo(const char *module)
+static struct debuginfo *open_debuginfo(const char *module, bool silent)
{
const char *path = module;
+ struct debuginfo *ret;
if (!module || !strchr(module, '/')) {
path = kernel_get_module_path(module);
if (!path) {
- pr_err("Failed to find path of %s module.\n",
- module ?: "kernel");
+ if (!silent)
+ pr_err("Failed to find path of %s module.\n",
+ module ?: "kernel");
return NULL;
}
}
- return debuginfo__new(path);
+ ret = debuginfo__new(path);
+ if (!ret && !silent) {
+ pr_warning("The %s file has no debug information.\n", path);
+ if (!module || !strtailcmp(path, ".ko"))
+ pr_warning("Rebuild with CONFIG_DEBUG_INFO=y, ");
+ else
+ pr_warning("Rebuild with -g, ");
+ pr_warning("or install an appropriate debuginfo package.\n");
+ }
+ return ret;
}
+
static int get_text_start_address(const char *exec, unsigned long *address)
{
Elf *elf;
@@ -333,15 +345,13 @@
pr_debug("try to find information at %" PRIx64 " in %s\n", addr,
tp->module ? : "kernel");
- dinfo = open_debuginfo(tp->module);
+ dinfo = open_debuginfo(tp->module, verbose == 0);
if (dinfo) {
ret = debuginfo__find_probe_point(dinfo,
(unsigned long)addr, pp);
debuginfo__delete(dinfo);
- } else {
- pr_debug("Failed to open debuginfo at 0x%" PRIx64 "\n", addr);
+ } else
ret = -ENOENT;
- }
if (ret > 0) {
pp->retprobe = tp->retprobe;
@@ -457,13 +467,11 @@
struct debuginfo *dinfo;
int ntevs, ret = 0;
- dinfo = open_debuginfo(target);
+ dinfo = open_debuginfo(target, !need_dwarf);
if (!dinfo) {
- if (need_dwarf) {
- pr_warning("Failed to open debuginfo file.\n");
+ if (need_dwarf)
return -ENOENT;
- }
pr_debug("Could not open debuginfo. Try to use symbols.\n");
return 0;
}
@@ -565,7 +573,7 @@
static int __show_one_line(FILE *fp, int l, bool skip, bool show_num)
{
- char buf[LINEBUF_SIZE];
+ char buf[LINEBUF_SIZE], sbuf[STRERR_BUFSIZE];
const char *color = show_num ? "" : PERF_COLOR_BLUE;
const char *prefix = NULL;
@@ -585,7 +593,8 @@
return 1;
error:
if (ferror(fp)) {
- pr_warning("File read error: %s\n", strerror(errno));
+ pr_warning("File read error: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
return -1;
}
return 0;
@@ -618,13 +627,12 @@
FILE *fp;
int ret;
char *tmp;
+ char sbuf[STRERR_BUFSIZE];
/* Search a line range */
- dinfo = open_debuginfo(module);
- if (!dinfo) {
- pr_warning("Failed to open debuginfo file.\n");
+ dinfo = open_debuginfo(module, false);
+ if (!dinfo)
return -ENOENT;
- }
ret = debuginfo__find_line_range(dinfo, lr);
debuginfo__delete(dinfo);
@@ -656,7 +664,7 @@
fp = fopen(lr->path, "r");
if (fp == NULL) {
pr_warning("Failed to open %s: %s\n", lr->path,
- strerror(errno));
+ strerror_r(errno, sbuf, sizeof(sbuf)));
return -errno;
}
/* Skip to starting line number */
@@ -772,9 +780,8 @@
if (ret < 0)
return ret;
- dinfo = open_debuginfo(module);
+ dinfo = open_debuginfo(module, false);
if (!dinfo) {
- pr_warning("Failed to open debuginfo file.\n");
ret = -ENOENT;
goto out;
}
@@ -1405,8 +1412,7 @@
return tmp - buf;
error:
- pr_debug("Failed to synthesize perf probe argument: %s\n",
- strerror(-ret));
+ pr_debug("Failed to synthesize perf probe argument: %d\n", ret);
return ret;
}
@@ -1455,8 +1461,7 @@
return buf;
error:
- pr_debug("Failed to synthesize perf probe point: %s\n",
- strerror(-ret));
+ pr_debug("Failed to synthesize perf probe point: %d\n", ret);
free(buf);
return NULL;
}
@@ -1780,10 +1785,11 @@
memset(tev, 0, sizeof(*tev));
}
-static void print_warn_msg(const char *file, bool is_kprobe)
+static void print_open_warning(int err, bool is_kprobe)
{
+ char sbuf[STRERR_BUFSIZE];
- if (errno == ENOENT) {
+ if (err == -ENOENT) {
const char *config;
if (!is_kprobe)
@@ -1791,25 +1797,43 @@
else
config = "CONFIG_KPROBE_EVENTS";
- pr_warning("%s file does not exist - please rebuild kernel"
- " with %s.\n", file, config);
- } else
- pr_warning("Failed to open %s file: %s\n", file,
- strerror(errno));
+ pr_warning("%cprobe_events file does not exist"
+ " - please rebuild kernel with %s.\n",
+ is_kprobe ? 'k' : 'u', config);
+ } else if (err == -ENOTSUP)
+ pr_warning("Debugfs is not mounted.\n");
+ else
+ pr_warning("Failed to open %cprobe_events: %s\n",
+ is_kprobe ? 'k' : 'u',
+ strerror_r(-err, sbuf, sizeof(sbuf)));
}
-static int open_probe_events(const char *trace_file, bool readwrite,
- bool is_kprobe)
+static void print_both_open_warning(int kerr, int uerr)
+{
+ /* Both kprobes and uprobes are disabled, warn it. */
+ if (kerr == -ENOTSUP && uerr == -ENOTSUP)
+ pr_warning("Debugfs is not mounted.\n");
+ else if (kerr == -ENOENT && uerr == -ENOENT)
+ pr_warning("Please rebuild kernel with CONFIG_KPROBE_EVENTS "
+ "or/and CONFIG_UPROBE_EVENTS.\n");
+ else {
+ char sbuf[STRERR_BUFSIZE];
+ pr_warning("Failed to open kprobe events: %s.\n",
+ strerror_r(-kerr, sbuf, sizeof(sbuf)));
+ pr_warning("Failed to open uprobe events: %s.\n",
+ strerror_r(-uerr, sbuf, sizeof(sbuf)));
+ }
+}
+
+static int open_probe_events(const char *trace_file, bool readwrite)
{
char buf[PATH_MAX];
const char *__debugfs;
int ret;
__debugfs = debugfs_find_mountpoint();
- if (__debugfs == NULL) {
- pr_warning("Debugfs is not mounted.\n");
- return -ENOENT;
- }
+ if (__debugfs == NULL)
+ return -ENOTSUP;
ret = e_snprintf(buf, PATH_MAX, "%s/%s", __debugfs, trace_file);
if (ret >= 0) {
@@ -1820,19 +1844,19 @@
ret = open(buf, O_RDONLY, 0);
if (ret < 0)
- print_warn_msg(buf, is_kprobe);
+ ret = -errno;
}
return ret;
}
static int open_kprobe_events(bool readwrite)
{
- return open_probe_events("tracing/kprobe_events", readwrite, true);
+ return open_probe_events("tracing/kprobe_events", readwrite);
}
static int open_uprobe_events(bool readwrite)
{
- return open_probe_events("tracing/uprobe_events", readwrite, false);
+ return open_probe_events("tracing/uprobe_events", readwrite);
}
/* Get raw string list of current kprobe_events or uprobe_events */
@@ -1857,7 +1881,7 @@
p[idx] = '\0';
ret = strlist__add(sl, buf);
if (ret < 0) {
- pr_debug("strlist__add failed: %s\n", strerror(-ret));
+ pr_debug("strlist__add failed (%d)\n", ret);
strlist__delete(sl);
return NULL;
}
@@ -1916,7 +1940,7 @@
rawlist = get_probe_trace_command_rawlist(fd);
if (!rawlist)
- return -ENOENT;
+ return -ENOMEM;
strlist__for_each(ent, rawlist) {
ret = parse_probe_trace_command(ent->s, &tev);
@@ -1940,27 +1964,34 @@
/* List up current perf-probe events */
int show_perf_probe_events(void)
{
- int fd, ret;
+ int kp_fd, up_fd, ret;
setup_pager();
- fd = open_kprobe_events(false);
-
- if (fd < 0)
- return fd;
ret = init_symbol_maps(false);
if (ret < 0)
return ret;
- ret = __show_perf_probe_events(fd, true);
- close(fd);
-
- fd = open_uprobe_events(false);
- if (fd >= 0) {
- ret = __show_perf_probe_events(fd, false);
- close(fd);
+ kp_fd = open_kprobe_events(false);
+ if (kp_fd >= 0) {
+ ret = __show_perf_probe_events(kp_fd, true);
+ close(kp_fd);
+ if (ret < 0)
+ goto out;
}
+ up_fd = open_uprobe_events(false);
+ if (kp_fd < 0 && up_fd < 0) {
+ print_both_open_warning(kp_fd, up_fd);
+ ret = kp_fd;
+ goto out;
+ }
+
+ if (up_fd >= 0) {
+ ret = __show_perf_probe_events(up_fd, false);
+ close(up_fd);
+ }
+out:
exit_symbol_maps();
return ret;
}
@@ -1976,6 +2007,8 @@
memset(&tev, 0, sizeof(tev));
rawlist = get_probe_trace_command_rawlist(fd);
+ if (!rawlist)
+ return NULL;
sl = strlist__new(true, NULL);
strlist__for_each(ent, rawlist) {
ret = parse_probe_trace_command(ent->s, &tev);
@@ -2005,6 +2038,7 @@
{
int ret = 0;
char *buf = synthesize_probe_trace_command(tev);
+ char sbuf[STRERR_BUFSIZE];
if (!buf) {
pr_debug("Failed to synthesize probe trace event.\n");
@@ -2016,7 +2050,7 @@
ret = write(fd, buf, strlen(buf));
if (ret <= 0)
pr_warning("Failed to write event: %s\n",
- strerror(errno));
+ strerror_r(errno, sbuf, sizeof(sbuf)));
}
free(buf);
return ret;
@@ -2030,7 +2064,7 @@
/* Try no suffix */
ret = e_snprintf(buf, len, "%s", base);
if (ret < 0) {
- pr_debug("snprintf() failed: %s\n", strerror(-ret));
+ pr_debug("snprintf() failed: %d\n", ret);
return ret;
}
if (!strlist__has_entry(namelist, buf))
@@ -2046,7 +2080,7 @@
for (i = 1; i < MAX_EVENT_INDEX; i++) {
ret = e_snprintf(buf, len, "%s_%d", base, i);
if (ret < 0) {
- pr_debug("snprintf() failed: %s\n", strerror(-ret));
+ pr_debug("snprintf() failed: %d\n", ret);
return ret;
}
if (!strlist__has_entry(namelist, buf))
@@ -2075,8 +2109,11 @@
else
fd = open_kprobe_events(true);
- if (fd < 0)
+ if (fd < 0) {
+ print_open_warning(fd, !pev->uprobes);
return fd;
+ }
+
/* Get current event names */
namelist = get_probe_trace_event_names(fd, false);
if (!namelist) {
@@ -2408,7 +2445,8 @@
printf("Removed event: %s\n", ent->s);
return 0;
error:
- pr_warning("Failed to delete event: %s\n", strerror(-ret));
+ pr_warning("Failed to delete event: %s\n",
+ strerror_r(-ret, buf, sizeof(buf)));
return ret;
}
@@ -2449,15 +2487,18 @@
/* Get current event names */
kfd = open_kprobe_events(true);
- if (kfd < 0)
- return kfd;
+ if (kfd >= 0)
+ namelist = get_probe_trace_event_names(kfd, true);
- namelist = get_probe_trace_event_names(kfd, true);
ufd = open_uprobe_events(true);
-
if (ufd >= 0)
unamelist = get_probe_trace_event_names(ufd, true);
+ if (kfd < 0 && ufd < 0) {
+ print_both_open_warning(kfd, ufd);
+ goto error;
+ }
+
if (namelist == NULL && unamelist == NULL)
goto error;
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index dca9145..9c59356 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -281,6 +281,7 @@
struct probe_trace_arg_ref **ref_ptr = &tvar->ref;
Dwarf_Die type;
char buf[16];
+ char sbuf[STRERR_BUFSIZE];
int bsize, boffs, total;
int ret;
@@ -367,7 +368,7 @@
if (ret >= 16)
ret = -E2BIG;
pr_warning("Failed to convert variable type: %s\n",
- strerror(-ret));
+ strerror_r(-ret, sbuf, sizeof(sbuf)));
return ret;
}
tvar->type = strdup(buf);
@@ -779,10 +780,12 @@
size_t line_len;
ssize_t len;
int count = 0, linenum = 1;
+ char sbuf[STRERR_BUFSIZE];
fp = fopen(fname, "r");
if (!fp) {
- pr_warning("Failed to open %s: %s\n", fname, strerror(errno));
+ pr_warning("Failed to open %s: %s\n", fname,
+ strerror_r(errno, sbuf, sizeof(sbuf)));
return -errno;
}
diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c
index fe8079e..cf69325 100644
--- a/tools/perf/util/record.c
+++ b/tools/perf/util/record.c
@@ -14,6 +14,7 @@
struct perf_evsel *evsel;
unsigned long flags = perf_event_open_cloexec_flag();
int err = -EAGAIN, fd;
+ static pid_t pid = -1;
evlist = perf_evlist__new();
if (!evlist)
@@ -24,14 +25,22 @@
evsel = perf_evlist__first(evlist);
- fd = sys_perf_event_open(&evsel->attr, -1, cpu, -1, flags);
- if (fd < 0)
- goto out_delete;
+ while (1) {
+ fd = sys_perf_event_open(&evsel->attr, pid, cpu, -1, flags);
+ if (fd < 0) {
+ if (pid == -1 && errno == EACCES) {
+ pid = 0;
+ continue;
+ }
+ goto out_delete;
+ }
+ break;
+ }
close(fd);
fn(evsel);
- fd = sys_perf_event_open(&evsel->attr, -1, cpu, -1, flags);
+ fd = sys_perf_event_open(&evsel->attr, pid, cpu, -1, flags);
if (fd < 0) {
if (errno == EINVAL)
err = -EINVAL;
@@ -47,7 +56,7 @@
static bool perf_probe_api(setup_probe_fn_t fn)
{
- const char *try[] = {"cycles:u", "instructions:u", "cpu-clock", NULL};
+ const char *try[] = {"cycles:u", "instructions:u", "cpu-clock:u", NULL};
struct cpu_map *cpus;
int cpu, ret, i = 0;
@@ -106,7 +115,7 @@
evlist__for_each(evlist, evsel) {
perf_evsel__config(evsel, opts);
- if (!evsel->idx && use_comm_exec)
+ if (evsel->tracking && use_comm_exec)
evsel->attr.comm_exec = 1;
}
@@ -201,6 +210,7 @@
struct perf_evsel *evsel;
int err, fd, cpu;
bool ret = false;
+ pid_t pid = -1;
temp_evlist = perf_evlist__new();
if (!temp_evlist)
@@ -221,12 +231,20 @@
cpu = evlist->cpus->map[0];
}
- fd = sys_perf_event_open(&evsel->attr, -1, cpu, -1,
- perf_event_open_cloexec_flag());
- if (fd >= 0) {
- close(fd);
- ret = true;
+ while (1) {
+ fd = sys_perf_event_open(&evsel->attr, pid, cpu, -1,
+ perf_event_open_cloexec_flag());
+ if (fd < 0) {
+ if (pid == -1 && errno == EACCES) {
+ pid = 0;
+ continue;
+ }
+ goto out_delete;
+ }
+ break;
}
+ close(fd);
+ ret = true;
out_delete:
perf_evlist__delete(temp_evlist);
diff --git a/tools/perf/util/run-command.c b/tools/perf/util/run-command.c
index da8e9b2..34622b5 100644
--- a/tools/perf/util/run-command.c
+++ b/tools/perf/util/run-command.c
@@ -1,6 +1,7 @@
#include "cache.h"
#include "run-command.h"
#include "exec_cmd.h"
+#include "debug.h"
static inline void close_pair(int fd[2])
{
@@ -19,6 +20,7 @@
{
int need_in, need_out, need_err;
int fdin[2], fdout[2], fderr[2];
+ char sbuf[STRERR_BUFSIZE];
/*
* In case of errors we must keep the promise to close FDs
@@ -99,7 +101,7 @@
if (cmd->dir && chdir(cmd->dir))
die("exec %s: cd to %s failed (%s)", cmd->argv[0],
- cmd->dir, strerror(errno));
+ cmd->dir, strerror_r(errno, sbuf, sizeof(sbuf)));
if (cmd->env) {
for (; *cmd->env; cmd->env++) {
if (strchr(*cmd->env, '='))
@@ -153,6 +155,8 @@
static int wait_or_whine(pid_t pid)
{
+ char sbuf[STRERR_BUFSIZE];
+
for (;;) {
int status, code;
pid_t waiting = waitpid(pid, &status, 0);
@@ -160,7 +164,8 @@
if (waiting < 0) {
if (errno == EINTR)
continue;
- error("waitpid failed (%s)", strerror(errno));
+ error("waitpid failed (%s)",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
return -ERR_RUN_COMMAND_WAITPID;
}
if (waiting != pid)
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index b2dba9c..0a01bac 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -432,6 +432,11 @@
return err;
}
+static int perl_flush_script(void)
+{
+ return 0;
+}
+
/*
* Stop trace script
*/
@@ -633,6 +638,7 @@
struct scripting_ops perl_scripting_ops = {
.name = "Perl",
.start_script = perl_start_script,
+ .flush_script = perl_flush_script,
.stop_script = perl_stop_script,
.process_event = perl_process_event,
.generate_script = perl_generate_script,
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index cbce254..56ba07c 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -73,6 +73,35 @@
Py_DECREF(val);
}
+static PyObject *get_handler(const char *handler_name)
+{
+ PyObject *handler;
+
+ handler = PyDict_GetItemString(main_dict, handler_name);
+ if (handler && !PyCallable_Check(handler))
+ return NULL;
+ return handler;
+}
+
+static void call_object(PyObject *handler, PyObject *args, const char *die_msg)
+{
+ PyObject *retval;
+
+ retval = PyObject_CallObject(handler, args);
+ if (retval == NULL)
+ handler_call_die(die_msg);
+ Py_DECREF(retval);
+}
+
+static void try_call_object(const char *handler_name, PyObject *args)
+{
+ PyObject *handler;
+
+ handler = get_handler(handler_name);
+ if (handler)
+ call_object(handler, args, handler_name);
+}
+
static void define_value(enum print_arg_type field_type,
const char *ev_name,
const char *field_name,
@@ -80,7 +109,7 @@
const char *field_str)
{
const char *handler_name = "define_flag_value";
- PyObject *handler, *t, *retval;
+ PyObject *t;
unsigned long long value;
unsigned n = 0;
@@ -98,13 +127,7 @@
PyTuple_SetItem(t, n++, PyInt_FromLong(value));
PyTuple_SetItem(t, n++, PyString_FromString(field_str));
- handler = PyDict_GetItemString(main_dict, handler_name);
- if (handler && PyCallable_Check(handler)) {
- retval = PyObject_CallObject(handler, t);
- if (retval == NULL)
- handler_call_die(handler_name);
- Py_DECREF(retval);
- }
+ try_call_object(handler_name, t);
Py_DECREF(t);
}
@@ -127,7 +150,7 @@
const char *delim)
{
const char *handler_name = "define_flag_field";
- PyObject *handler, *t, *retval;
+ PyObject *t;
unsigned n = 0;
if (field_type == PRINT_SYMBOL)
@@ -145,13 +168,7 @@
if (field_type == PRINT_FLAGS)
PyTuple_SetItem(t, n++, PyString_FromString(delim));
- handler = PyDict_GetItemString(main_dict, handler_name);
- if (handler && PyCallable_Check(handler)) {
- retval = PyObject_CallObject(handler, t);
- if (retval == NULL)
- handler_call_die(handler_name);
- Py_DECREF(retval);
- }
+ try_call_object(handler_name, t);
Py_DECREF(t);
}
@@ -362,7 +379,7 @@
struct thread *thread,
struct addr_location *al)
{
- PyObject *handler, *retval, *context, *t, *obj, *callchain;
+ PyObject *handler, *context, *t, *obj, *callchain;
PyObject *dict = NULL;
static char handler_name[256];
struct format_field *field;
@@ -387,9 +404,7 @@
sprintf(handler_name, "%s__%s", event->system, event->name);
- handler = PyDict_GetItemString(main_dict, handler_name);
- if (handler && !PyCallable_Check(handler))
- handler = NULL;
+ handler = get_handler(handler_name);
if (!handler) {
dict = PyDict_New();
if (!dict)
@@ -450,19 +465,9 @@
Py_FatalError("error resizing Python tuple");
if (handler) {
- retval = PyObject_CallObject(handler, t);
- if (retval == NULL)
- handler_call_die(handler_name);
- Py_DECREF(retval);
+ call_object(handler, t, handler_name);
} else {
- handler = PyDict_GetItemString(main_dict, "trace_unhandled");
- if (handler && PyCallable_Check(handler)) {
-
- retval = PyObject_CallObject(handler, t);
- if (retval == NULL)
- handler_call_die("trace_unhandled");
- Py_DECREF(retval);
- }
+ try_call_object("trace_unhandled", t);
Py_DECREF(dict);
}
@@ -474,7 +479,7 @@
struct thread *thread,
struct addr_location *al)
{
- PyObject *handler, *retval, *t, *dict, *callchain, *dict_sample;
+ PyObject *handler, *t, *dict, *callchain, *dict_sample;
static char handler_name[64];
unsigned n = 0;
@@ -496,8 +501,8 @@
snprintf(handler_name, sizeof(handler_name), "%s", "process_event");
- handler = PyDict_GetItemString(main_dict, handler_name);
- if (!handler || !PyCallable_Check(handler))
+ handler = get_handler(handler_name);
+ if (!handler)
goto exit;
pydict_set_item_string_decref(dict, "ev_name", PyString_FromString(perf_evsel__name(evsel)));
@@ -539,10 +544,7 @@
if (_PyTuple_Resize(&t, n) == -1)
Py_FatalError("error resizing Python tuple");
- retval = PyObject_CallObject(handler, t);
- if (retval == NULL)
- handler_call_die(handler_name);
- Py_DECREF(retval);
+ call_object(handler, t, handler_name);
exit:
Py_DECREF(dict);
Py_DECREF(t);
@@ -566,36 +568,24 @@
static int run_start_sub(void)
{
- PyObject *handler, *retval;
- int err = 0;
-
main_module = PyImport_AddModule("__main__");
if (main_module == NULL)
return -1;
Py_INCREF(main_module);
main_dict = PyModule_GetDict(main_module);
- if (main_dict == NULL) {
- err = -1;
+ if (main_dict == NULL)
goto error;
- }
Py_INCREF(main_dict);
- handler = PyDict_GetItemString(main_dict, "trace_begin");
- if (handler == NULL || !PyCallable_Check(handler))
- goto out;
+ try_call_object("trace_begin", NULL);
- retval = PyObject_CallObject(handler, NULL);
- if (retval == NULL)
- handler_call_die("trace_begin");
+ return 0;
- Py_DECREF(retval);
- return err;
error:
Py_XDECREF(main_dict);
Py_XDECREF(main_module);
-out:
- return err;
+ return -1;
}
/*
@@ -649,28 +639,23 @@
return err;
}
+static int python_flush_script(void)
+{
+ return 0;
+}
+
/*
* Stop trace script
*/
static int python_stop_script(void)
{
- PyObject *handler, *retval;
- int err = 0;
+ try_call_object("trace_end", NULL);
- handler = PyDict_GetItemString(main_dict, "trace_end");
- if (handler == NULL || !PyCallable_Check(handler))
- goto out;
-
- retval = PyObject_CallObject(handler, NULL);
- if (retval == NULL)
- handler_call_die("trace_end");
- Py_DECREF(retval);
-out:
Py_XDECREF(main_dict);
Py_XDECREF(main_module);
Py_Finalize();
- return err;
+ return 0;
}
static int python_generate_script(struct pevent *pevent, const char *outfile)
@@ -843,6 +828,7 @@
struct scripting_ops python_scripting_ops = {
.name = "Python",
.start_script = python_start_script,
+ .flush_script = python_flush_script,
.stop_script = python_stop_script,
.process_event = python_process_event,
.generate_script = python_generate_script,
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 88dfef7..6d2d50d 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -14,6 +14,7 @@
#include "util.h"
#include "cpumap.h"
#include "perf_regs.h"
+#include "asm/bug.h"
static int perf_session__open(struct perf_session *session)
{
@@ -66,6 +67,25 @@
machines__destroy_kernel_maps(&session->machines);
}
+static bool perf_session__has_comm_exec(struct perf_session *session)
+{
+ struct perf_evsel *evsel;
+
+ evlist__for_each(session->evlist, evsel) {
+ if (evsel->attr.comm_exec)
+ return true;
+ }
+
+ return false;
+}
+
+static void perf_session__set_comm_exec(struct perf_session *session)
+{
+ bool comm_exec = perf_session__has_comm_exec(session);
+
+ machines__set_comm_exec(&session->machines, comm_exec);
+}
+
struct perf_session *perf_session__new(struct perf_data_file *file,
bool repipe, struct perf_tool *tool)
{
@@ -75,9 +95,7 @@
goto out;
session->repipe = repipe;
- INIT_LIST_HEAD(&session->ordered_samples.samples);
- INIT_LIST_HEAD(&session->ordered_samples.sample_cache);
- INIT_LIST_HEAD(&session->ordered_samples.to_free);
+ ordered_events__init(&session->ordered_events);
machines__init(&session->machines);
if (file) {
@@ -91,6 +109,7 @@
goto out_close;
perf_session__set_id_hdr_size(session);
+ perf_session__set_comm_exec(session);
}
}
@@ -104,9 +123,9 @@
}
if (tool && tool->ordering_requires_timestamps &&
- tool->ordered_samples && !perf_evlist__sample_id_all(session->evlist)) {
+ tool->ordered_events && !perf_evlist__sample_id_all(session->evlist)) {
dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n");
- tool->ordered_samples = false;
+ tool->ordered_events = false;
}
return session;
@@ -238,7 +257,7 @@
if (tool->build_id == NULL)
tool->build_id = process_finished_round_stub;
if (tool->finished_round == NULL) {
- if (tool->ordered_samples)
+ if (tool->ordered_events)
tool->finished_round = process_finished_round;
else
tool->finished_round = process_finished_round_stub;
@@ -444,87 +463,6 @@
[PERF_RECORD_HEADER_MAX] = NULL,
};
-struct sample_queue {
- u64 timestamp;
- u64 file_offset;
- union perf_event *event;
- struct list_head list;
-};
-
-static void perf_session_free_sample_buffers(struct perf_session *session)
-{
- struct ordered_samples *os = &session->ordered_samples;
-
- while (!list_empty(&os->to_free)) {
- struct sample_queue *sq;
-
- sq = list_entry(os->to_free.next, struct sample_queue, list);
- list_del(&sq->list);
- free(sq);
- }
-}
-
-static int perf_session_deliver_event(struct perf_session *session,
- union perf_event *event,
- struct perf_sample *sample,
- struct perf_tool *tool,
- u64 file_offset);
-
-static int flush_sample_queue(struct perf_session *s,
- struct perf_tool *tool)
-{
- struct ordered_samples *os = &s->ordered_samples;
- struct list_head *head = &os->samples;
- struct sample_queue *tmp, *iter;
- struct perf_sample sample;
- u64 limit = os->next_flush;
- u64 last_ts = os->last_sample ? os->last_sample->timestamp : 0ULL;
- bool show_progress = limit == ULLONG_MAX;
- struct ui_progress prog;
- int ret;
-
- if (!tool->ordered_samples || !limit)
- return 0;
-
- if (show_progress)
- ui_progress__init(&prog, os->nr_samples, "Processing time ordered events...");
-
- list_for_each_entry_safe(iter, tmp, head, list) {
- if (session_done())
- return 0;
-
- if (iter->timestamp > limit)
- break;
-
- ret = perf_evlist__parse_sample(s->evlist, iter->event, &sample);
- if (ret)
- pr_err("Can't parse sample, err = %d\n", ret);
- else {
- ret = perf_session_deliver_event(s, iter->event, &sample, tool,
- iter->file_offset);
- if (ret)
- return ret;
- }
-
- os->last_flush = iter->timestamp;
- list_del(&iter->list);
- list_add(&iter->list, &os->sample_cache);
- os->nr_samples--;
-
- if (show_progress)
- ui_progress__update(&prog, 1);
- }
-
- if (list_empty(head)) {
- os->last_sample = NULL;
- } else if (last_ts <= limit) {
- os->last_sample =
- list_entry(head->prev, struct sample_queue, list);
- }
-
- return 0;
-}
-
/*
* When perf record finishes a pass on every buffers, it records this pseudo
* event.
@@ -568,99 +506,43 @@
union perf_event *event __maybe_unused,
struct perf_session *session)
{
- int ret = flush_sample_queue(session, tool);
- if (!ret)
- session->ordered_samples.next_flush = session->ordered_samples.max_timestamp;
-
- return ret;
+ return ordered_events__flush(session, tool, OE_FLUSH__ROUND);
}
-/* The queue is ordered by time */
-static void __queue_event(struct sample_queue *new, struct perf_session *s)
-{
- struct ordered_samples *os = &s->ordered_samples;
- struct sample_queue *sample = os->last_sample;
- u64 timestamp = new->timestamp;
- struct list_head *p;
-
- ++os->nr_samples;
- os->last_sample = new;
-
- if (!sample) {
- list_add(&new->list, &os->samples);
- os->max_timestamp = timestamp;
- return;
- }
-
- /*
- * last_sample might point to some random place in the list as it's
- * the last queued event. We expect that the new event is close to
- * this.
- */
- if (sample->timestamp <= timestamp) {
- while (sample->timestamp <= timestamp) {
- p = sample->list.next;
- if (p == &os->samples) {
- list_add_tail(&new->list, &os->samples);
- os->max_timestamp = timestamp;
- return;
- }
- sample = list_entry(p, struct sample_queue, list);
- }
- list_add_tail(&new->list, &sample->list);
- } else {
- while (sample->timestamp > timestamp) {
- p = sample->list.prev;
- if (p == &os->samples) {
- list_add(&new->list, &os->samples);
- return;
- }
- sample = list_entry(p, struct sample_queue, list);
- }
- list_add(&new->list, &sample->list);
- }
-}
-
-#define MAX_SAMPLE_BUFFER (64 * 1024 / sizeof(struct sample_queue))
-
int perf_session_queue_event(struct perf_session *s, union perf_event *event,
- struct perf_sample *sample, u64 file_offset)
+ struct perf_tool *tool, struct perf_sample *sample,
+ u64 file_offset)
{
- struct ordered_samples *os = &s->ordered_samples;
- struct list_head *sc = &os->sample_cache;
+ struct ordered_events *oe = &s->ordered_events;
u64 timestamp = sample->time;
- struct sample_queue *new;
+ struct ordered_event *new;
if (!timestamp || timestamp == ~0ULL)
return -ETIME;
- if (timestamp < s->ordered_samples.last_flush) {
- printf("Warning: Timestamp below last timeslice flush\n");
- return -EINVAL;
+ if (timestamp < oe->last_flush) {
+ WARN_ONCE(1, "Timestamp below last timeslice flush\n");
+
+ pr_oe_time(timestamp, "out of order event");
+ pr_oe_time(oe->last_flush, "last flush, last_flush_type %d\n",
+ oe->last_flush_type);
+
+ /* We could get out of order messages after forced flush. */
+ if (oe->last_flush_type != OE_FLUSH__HALF)
+ return -EINVAL;
}
- if (!list_empty(sc)) {
- new = list_entry(sc->next, struct sample_queue, list);
- list_del(&new->list);
- } else if (os->sample_buffer) {
- new = os->sample_buffer + os->sample_buffer_idx;
- if (++os->sample_buffer_idx == MAX_SAMPLE_BUFFER)
- os->sample_buffer = NULL;
- } else {
- os->sample_buffer = malloc(MAX_SAMPLE_BUFFER * sizeof(*new));
- if (!os->sample_buffer)
- return -ENOMEM;
- list_add(&os->sample_buffer->list, &os->to_free);
- os->sample_buffer_idx = 2;
- new = os->sample_buffer + 1;
+ new = ordered_events__new(oe, timestamp);
+ if (!new) {
+ ordered_events__flush(s, tool, OE_FLUSH__HALF);
+ new = ordered_events__new(oe, timestamp);
}
- new->timestamp = timestamp;
+ if (!new)
+ return -ENOMEM;
+
new->file_offset = file_offset;
new->event = event;
-
- __queue_event(new, s);
-
return 0;
}
@@ -920,11 +802,10 @@
&sample->read.one, machine);
}
-static int perf_session_deliver_event(struct perf_session *session,
- union perf_event *event,
- struct perf_sample *sample,
- struct perf_tool *tool,
- u64 file_offset)
+int perf_session__deliver_event(struct perf_session *session,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_tool *tool, u64 file_offset)
{
struct perf_evsel *evsel;
struct machine *machine;
@@ -1005,8 +886,10 @@
switch (event->header.type) {
case PERF_RECORD_HEADER_ATTR:
err = tool->attr(tool, event, &session->evlist);
- if (err == 0)
+ if (err == 0) {
perf_session__set_id_hdr_size(session);
+ perf_session__set_comm_exec(session);
+ }
return err;
case PERF_RECORD_HEADER_EVENT_TYPE:
/*
@@ -1036,6 +919,61 @@
swap(event, sample_id_all);
}
+int perf_session__peek_event(struct perf_session *session, off_t file_offset,
+ void *buf, size_t buf_sz,
+ union perf_event **event_ptr,
+ struct perf_sample *sample)
+{
+ union perf_event *event;
+ size_t hdr_sz, rest;
+ int fd;
+
+ if (session->one_mmap && !session->header.needs_swap) {
+ event = file_offset - session->one_mmap_offset +
+ session->one_mmap_addr;
+ goto out_parse_sample;
+ }
+
+ if (perf_data_file__is_pipe(session->file))
+ return -1;
+
+ fd = perf_data_file__fd(session->file);
+ hdr_sz = sizeof(struct perf_event_header);
+
+ if (buf_sz < hdr_sz)
+ return -1;
+
+ if (lseek(fd, file_offset, SEEK_SET) == (off_t)-1 ||
+ readn(fd, &buf, hdr_sz) != (ssize_t)hdr_sz)
+ return -1;
+
+ event = (union perf_event *)buf;
+
+ if (session->header.needs_swap)
+ perf_event_header__bswap(&event->header);
+
+ if (event->header.size < hdr_sz)
+ return -1;
+
+ rest = event->header.size - hdr_sz;
+
+ if (readn(fd, &buf, rest) != (ssize_t)rest)
+ return -1;
+
+ if (session->header.needs_swap)
+ event_swap(event, perf_evlist__sample_id_all(session->evlist));
+
+out_parse_sample:
+
+ if (sample && event->header.type < PERF_RECORD_USER_TYPE_START &&
+ perf_evlist__parse_sample(session->evlist, event, sample))
+ return -1;
+
+ *event_ptr = event;
+
+ return 0;
+}
+
static s64 perf_session__process_event(struct perf_session *session,
union perf_event *event,
struct perf_tool *tool,
@@ -1062,15 +1000,15 @@
if (ret)
return ret;
- if (tool->ordered_samples) {
- ret = perf_session_queue_event(session, event, &sample,
+ if (tool->ordered_events) {
+ ret = perf_session_queue_event(session, event, tool, &sample,
file_offset);
if (ret != -ETIME)
return ret;
}
- return perf_session_deliver_event(session, event, &sample, tool,
- file_offset);
+ return perf_session__deliver_event(session, event, &sample, tool,
+ file_offset);
}
void perf_event_header__bswap(struct perf_event_header *hdr)
@@ -1222,12 +1160,11 @@
goto more;
done:
/* do the final flush for ordered samples */
- session->ordered_samples.next_flush = ULLONG_MAX;
- err = flush_sample_queue(session, tool);
+ err = ordered_events__flush(session, tool, OE_FLUSH__FINAL);
out_err:
free(buf);
perf_session__warn_about_errors(session, tool);
- perf_session_free_sample_buffers(session);
+ ordered_events__free(&session->ordered_events);
return err;
}
@@ -1368,12 +1305,11 @@
out:
/* do the final flush for ordered samples */
- session->ordered_samples.next_flush = ULLONG_MAX;
- err = flush_sample_queue(session, tool);
+ err = ordered_events__flush(session, tool, OE_FLUSH__FINAL);
out_err:
ui_progress__finish();
perf_session__warn_about_errors(session, tool);
- perf_session_free_sample_buffers(session);
+ ordered_events__free(&session->ordered_events);
session->one_mmap = false;
return err;
}
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 0321013..8dd41ca 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -9,26 +9,13 @@
#include "symbol.h"
#include "thread.h"
#include "data.h"
+#include "ordered-events.h"
#include <linux/rbtree.h>
#include <linux/perf_event.h>
-struct sample_queue;
struct ip_callchain;
struct thread;
-struct ordered_samples {
- u64 last_flush;
- u64 next_flush;
- u64 max_timestamp;
- struct list_head samples;
- struct list_head sample_cache;
- struct list_head to_free;
- struct sample_queue *sample_buffer;
- struct sample_queue *last_sample;
- int sample_buffer_idx;
- unsigned int nr_samples;
-};
-
struct perf_session {
struct perf_header header;
struct machines machines;
@@ -39,7 +26,7 @@
bool one_mmap;
void *one_mmap_addr;
u64 one_mmap_offset;
- struct ordered_samples ordered_samples;
+ struct ordered_events ordered_events;
struct perf_data_file *file;
};
@@ -58,6 +45,11 @@
void perf_event_header__bswap(struct perf_event_header *hdr);
+int perf_session__peek_event(struct perf_session *session, off_t file_offset,
+ void *buf, size_t buf_sz,
+ union perf_event **event_ptr,
+ struct perf_sample *sample);
+
int __perf_session__process_events(struct perf_session *session,
u64 data_offset, u64 data_size, u64 size,
struct perf_tool *tool);
@@ -65,10 +57,16 @@
struct perf_tool *tool);
int perf_session_queue_event(struct perf_session *s, union perf_event *event,
- struct perf_sample *sample, u64 file_offset);
+ struct perf_tool *tool, struct perf_sample *sample,
+ u64 file_offset);
void perf_tool__fill_defaults(struct perf_tool *tool);
+int perf_session__deliver_event(struct perf_session *session,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_tool *tool, u64 file_offset);
+
int perf_session__resolve_callchain(struct perf_session *session,
struct perf_evsel *evsel,
struct thread *thread,
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 14e5a03..1958637 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -70,12 +70,14 @@
size_t size, unsigned int width)
{
const char *comm = thread__comm_str(he->thread);
- return repsep_snprintf(bf, size, "%*s:%5d", width - 6,
- comm ?: "", he->thread->tid);
+
+ width = max(7U, width) - 6;
+ return repsep_snprintf(bf, size, "%5d:%-*.*s", he->thread->tid,
+ width, width, comm ?: "");
}
struct sort_entry sort_thread = {
- .se_header = "Command: Pid",
+ .se_header = " Pid:Command",
.se_cmp = sort__thread_cmp,
.se_snprintf = hist_entry__thread_snprintf,
.se_width_idx = HISTC_THREAD,
@@ -106,7 +108,7 @@
static int hist_entry__comm_snprintf(struct hist_entry *he, char *bf,
size_t size, unsigned int width)
{
- return repsep_snprintf(bf, size, "%*s", width, comm__str(he->comm));
+ return repsep_snprintf(bf, size, "%-*.*s", width, width, comm__str(he->comm));
}
struct sort_entry sort_comm = {
@@ -152,10 +154,10 @@
if (map && map->dso) {
const char *dso_name = !verbose ? map->dso->short_name :
map->dso->long_name;
- return repsep_snprintf(bf, size, "%-*s", width, dso_name);
+ return repsep_snprintf(bf, size, "%-*.*s", width, width, dso_name);
}
- return repsep_snprintf(bf, size, "%-*s", width, "[unknown]");
+ return repsep_snprintf(bf, size, "%-*.*s", width, width, "[unknown]");
}
static int hist_entry__dso_snprintf(struct hist_entry *he, char *bf,
@@ -257,7 +259,10 @@
width - ret, "");
}
- return ret;
+ if (ret > width)
+ bf[width] = '\0';
+
+ return width;
}
static int hist_entry__sym_snprintf(struct hist_entry *he, char *bf,
@@ -302,10 +307,9 @@
}
static int hist_entry__srcline_snprintf(struct hist_entry *he, char *bf,
- size_t size,
- unsigned int width __maybe_unused)
+ size_t size, unsigned int width)
{
- return repsep_snprintf(bf, size, "%s", he->srcline);
+ return repsep_snprintf(bf, size, "%*.*-s", width, width, he->srcline);
}
struct sort_entry sort_srcline = {
@@ -332,7 +336,7 @@
static int hist_entry__parent_snprintf(struct hist_entry *he, char *bf,
size_t size, unsigned int width)
{
- return repsep_snprintf(bf, size, "%-*s", width,
+ return repsep_snprintf(bf, size, "%-*.*s", width, width,
he->parent ? he->parent->name : "[other]");
}
@@ -354,7 +358,7 @@
static int hist_entry__cpu_snprintf(struct hist_entry *he, char *bf,
size_t size, unsigned int width)
{
- return repsep_snprintf(bf, size, "%*d", width, he->cpu);
+ return repsep_snprintf(bf, size, "%*.*d", width, width, he->cpu);
}
struct sort_entry sort_cpu = {
@@ -484,7 +488,7 @@
else if (he->branch_info->flags.mispred)
out = "Y";
- return repsep_snprintf(bf, size, "%-*s", width, out);
+ return repsep_snprintf(bf, size, "%-*.*s", width, width, out);
}
/* --sort daddr_sym */
@@ -1194,7 +1198,7 @@
return hse_a->se == hse_b->se;
}
-void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists)
+void perf_hpp__reset_sort_width(struct perf_hpp_fmt *fmt, struct hists *hists)
{
struct hpp_sort_entry *hse;
@@ -1202,20 +1206,21 @@
return;
hse = container_of(fmt, struct hpp_sort_entry, hpp);
- hists__new_col_len(hists, hse->se->se_width_idx,
- strlen(hse->se->se_header));
+ hists__new_col_len(hists, hse->se->se_width_idx, strlen(fmt->name));
}
static int __sort__hpp_header(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
struct perf_evsel *evsel)
{
struct hpp_sort_entry *hse;
- size_t len;
+ size_t len = fmt->user_len;
hse = container_of(fmt, struct hpp_sort_entry, hpp);
- len = hists__col_len(&evsel->hists, hse->se->se_width_idx);
- return scnprintf(hpp->buf, hpp->size, "%-*s", len, hse->se->se_header);
+ if (!len)
+ len = hists__col_len(&evsel->hists, hse->se->se_width_idx);
+
+ return scnprintf(hpp->buf, hpp->size, "%-*.*s", len, len, fmt->name);
}
static int __sort__hpp_width(struct perf_hpp_fmt *fmt,
@@ -1223,20 +1228,26 @@
struct perf_evsel *evsel)
{
struct hpp_sort_entry *hse;
+ size_t len = fmt->user_len;
hse = container_of(fmt, struct hpp_sort_entry, hpp);
- return hists__col_len(&evsel->hists, hse->se->se_width_idx);
+ if (!len)
+ len = hists__col_len(&evsel->hists, hse->se->se_width_idx);
+
+ return len;
}
static int __sort__hpp_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
struct hist_entry *he)
{
struct hpp_sort_entry *hse;
- size_t len;
+ size_t len = fmt->user_len;
hse = container_of(fmt, struct hpp_sort_entry, hpp);
- len = hists__col_len(he->hists, hse->se->se_width_idx);
+
+ if (!len)
+ len = hists__col_len(he->hists, hse->se->se_width_idx);
return hse->se->se_snprintf(he, hpp->buf, hpp->size, len);
}
@@ -1253,6 +1264,7 @@
}
hse->se = sd->entry;
+ hse->hpp.name = sd->entry->se_header;
hse->hpp.header = __sort__hpp_header;
hse->hpp.width = __sort__hpp_width;
hse->hpp.entry = __sort__hpp_entry;
@@ -1265,6 +1277,8 @@
INIT_LIST_HEAD(&hse->hpp.list);
INIT_LIST_HEAD(&hse->hpp.sort_list);
hse->hpp.elide = false;
+ hse->hpp.len = 0;
+ hse->hpp.user_len = 0;
return hse;
}
@@ -1439,7 +1453,7 @@
int ret = 0;
if (sort_keys == NULL) {
- if (field_order) {
+ if (is_strict_order(field_order)) {
/*
* If user specified field order but no sort order,
* we'll honor it and not add default sort orders.
@@ -1625,23 +1639,36 @@
memory_sort_dimensions[i].taken = 0;
}
+bool is_strict_order(const char *order)
+{
+ return order && (*order != '+');
+}
+
static int __setup_output_field(void)
{
- char *tmp, *tok, *str;
- int ret = 0;
+ char *tmp, *tok, *str, *strp;
+ int ret = -EINVAL;
if (field_order == NULL)
return 0;
reset_dimensions();
- str = strdup(field_order);
+ strp = str = strdup(field_order);
if (str == NULL) {
error("Not enough memory to setup output fields");
return -ENOMEM;
}
- for (tok = strtok_r(str, ", ", &tmp);
+ if (!is_strict_order(field_order))
+ strp++;
+
+ if (!strlen(strp)) {
+ error("Invalid --fields key: `+'");
+ goto out;
+ }
+
+ for (tok = strtok_r(strp, ", ", &tmp);
tok; tok = strtok_r(NULL, ", ", &tmp)) {
ret = output_field_add(tok);
if (ret == -EINVAL) {
@@ -1653,6 +1680,7 @@
}
}
+out:
free(str);
return ret;
}
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 041f0c9..c03e4ff 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -218,4 +218,5 @@
int report_parse_ignore_callees_opt(const struct option *opt, const char *arg, int unset);
+bool is_strict_order(const char *order);
#endif /* __PERF_SORT_H */
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index d753499..9fb5e9e 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -736,7 +736,7 @@
if (symstrs == NULL)
goto out_elf_end;
- sec_strndx = elf_getscn(elf, ehdr.e_shstrndx);
+ sec_strndx = elf_getscn(runtime_ss->elf, runtime_ss->ehdr.e_shstrndx);
if (sec_strndx == NULL)
goto out_elf_end;
@@ -939,8 +939,11 @@
* to it...
*/
if (symbol_conf.demangle) {
- demangled = bfd_demangle(NULL, elf_name,
- DMGL_PARAMS | DMGL_ANSI);
+ int demangle_flags = DMGL_NO_OPTS;
+ if (verbose)
+ demangle_flags = DMGL_PARAMS | DMGL_ANSI;
+
+ demangled = bfd_demangle(NULL, elf_name, demangle_flags);
if (demangled != NULL)
elf_name = demangled;
}
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index eb06746..ac098a3 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -15,6 +15,7 @@
#include "machine.h"
#include "symbol.h"
#include "strlist.h"
+#include "header.h"
#include <elf.h>
#include <limits.h>
@@ -523,10 +524,15 @@
struct dso *dso;
};
+/*
+ * These are symbols in the kernel image, so make sure that
+ * sym is from a kernel DSO.
+ */
bool symbol__is_idle(struct symbol *sym)
{
const char * const idle_symbols[] = {
"cpu_idle",
+ "cpu_startup_entry",
"intel_idle",
"default_idle",
"native_safe_halt",
@@ -1468,8 +1474,7 @@
if (vmlinux[0] == '/')
snprintf(symfs_vmlinux, sizeof(symfs_vmlinux), "%s", vmlinux);
else
- snprintf(symfs_vmlinux, sizeof(symfs_vmlinux), "%s%s",
- symbol_conf.symfs, vmlinux);
+ symbol__join_symfs(symfs_vmlinux, vmlinux);
if (dso->kernel == DSO_TYPE_GUEST_KERNEL)
symtab_type = DSO_BINARY_TYPE__GUEST_VMLINUX;
@@ -1745,10 +1750,11 @@
zfree(&vmlinux_path);
}
-static int vmlinux_path__init(void)
+static int vmlinux_path__init(struct perf_session_env *env)
{
struct utsname uts;
char bf[PATH_MAX];
+ char *kernel_version;
vmlinux_path = malloc(sizeof(char *) * 5);
if (vmlinux_path == NULL)
@@ -1763,25 +1769,31 @@
goto out_fail;
++vmlinux_path__nr_entries;
- /* only try running kernel version if no symfs was given */
+ /* only try kernel version if no symfs was given */
if (symbol_conf.symfs[0] != 0)
return 0;
- if (uname(&uts) < 0)
- return -1;
+ if (env) {
+ kernel_version = env->os_release;
+ } else {
+ if (uname(&uts) < 0)
+ goto out_fail;
- snprintf(bf, sizeof(bf), "/boot/vmlinux-%s", uts.release);
+ kernel_version = uts.release;
+ }
+
+ snprintf(bf, sizeof(bf), "/boot/vmlinux-%s", kernel_version);
vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
goto out_fail;
++vmlinux_path__nr_entries;
- snprintf(bf, sizeof(bf), "/lib/modules/%s/build/vmlinux", uts.release);
+ snprintf(bf, sizeof(bf), "/lib/modules/%s/build/vmlinux", kernel_version);
vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
goto out_fail;
++vmlinux_path__nr_entries;
snprintf(bf, sizeof(bf), "/usr/lib/debug/lib/modules/%s/vmlinux",
- uts.release);
+ kernel_version);
vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
goto out_fail;
@@ -1827,7 +1839,7 @@
return value;
}
-int symbol__init(void)
+int symbol__init(struct perf_session_env *env)
{
const char *symfs;
@@ -1842,7 +1854,7 @@
symbol_conf.priv_size += (sizeof(struct symbol_name_rb_node) -
sizeof(struct symbol));
- if (symbol_conf.try_vmlinux_path && vmlinux_path__init() < 0)
+ if (symbol_conf.try_vmlinux_path && vmlinux_path__init(env) < 0)
return -1;
if (symbol_conf.field_sep && *symbol_conf.field_sep == '.') {
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index e7295e9..3f95ea0 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -13,6 +13,7 @@
#include <libgen.h>
#include "build-id.h"
#include "event.h"
+#include "util.h"
#ifdef HAVE_LIBELF_SUPPORT
#include <libelf.h>
@@ -59,6 +60,7 @@
#endif
#ifndef DMGL_PARAMS
+#define DMGL_NO_OPTS 0 /* For readability... */
#define DMGL_PARAMS (1 << 0) /* Include function args */
#define DMGL_ANSI (1 << 1) /* Include const, volatile, etc */
#endif
@@ -143,6 +145,14 @@
};
extern struct symbol_conf symbol_conf;
+
+static inline int __symbol__join_symfs(char *bf, size_t size, const char *path)
+{
+ return path__join(bf, size, symbol_conf.symfs, path);
+}
+
+#define symbol__join_symfs(bf, path) __symbol__join_symfs(bf, sizeof(bf), path)
+
extern int vmlinux_path__nr_entries;
extern char **vmlinux_path;
@@ -253,7 +263,8 @@
int filename__read_debuglink(const char *filename, char *debuglink,
size_t size);
-int symbol__init(void);
+struct perf_session_env;
+int symbol__init(struct perf_session_env *env);
void symbol__exit(void);
void symbol__elf_init(void);
struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name);
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 12c7a25..a9df7f2 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -42,7 +42,7 @@
goto err_thread;
snprintf(comm_str, 32, ":%d", tid);
- comm = comm__new(comm_str, 0);
+ comm = comm__new(comm_str, 0, false);
free(comm_str);
if (!comm)
goto err_thread;
@@ -81,19 +81,33 @@
return list_first_entry(&thread->comm_list, struct comm, list);
}
+struct comm *thread__exec_comm(const struct thread *thread)
+{
+ struct comm *comm, *last = NULL;
+
+ list_for_each_entry(comm, &thread->comm_list, list) {
+ if (comm->exec)
+ return comm;
+ last = comm;
+ }
+
+ return last;
+}
+
/* CHECKME: time should always be 0 if event aren't ordered */
-int thread__set_comm(struct thread *thread, const char *str, u64 timestamp)
+int __thread__set_comm(struct thread *thread, const char *str, u64 timestamp,
+ bool exec)
{
struct comm *new, *curr = thread__comm(thread);
int err;
/* Override latest entry if it had no specific time coverage */
- if (!curr->start) {
- err = comm__override(curr, str, timestamp);
+ if (!curr->start && !curr->exec) {
+ err = comm__override(curr, str, timestamp, exec);
if (err)
return err;
} else {
- new = comm__new(str, timestamp);
+ new = comm__new(str, timestamp, exec);
if (!new)
return -ENOMEM;
list_add(&new->list, &thread->comm_list);
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 716b772..8c75fa7 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -38,9 +38,17 @@
thread->dead = true;
}
-int thread__set_comm(struct thread *thread, const char *comm, u64 timestamp);
+int __thread__set_comm(struct thread *thread, const char *comm, u64 timestamp,
+ bool exec);
+static inline int thread__set_comm(struct thread *thread, const char *comm,
+ u64 timestamp)
+{
+ return __thread__set_comm(thread, comm, timestamp, false);
+}
+
int thread__comm_len(struct thread *thread);
struct comm *thread__comm(const struct thread *thread);
+struct comm *thread__exec_comm(const struct thread *thread);
const char *thread__comm_str(const struct thread *thread);
void thread__insert_map(struct thread *thread, struct map *map);
int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp);
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index 4385816..f116369 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -40,7 +40,7 @@
event_op2 tracing_data;
event_op2 finished_round,
build_id;
- bool ordered_samples;
+ bool ordered_events;
bool ordering_requires_timestamps;
};
diff --git a/tools/perf/util/trace-event-scripting.c b/tools/perf/util/trace-event-scripting.c
index 57aaccc..5c9bdd1 100644
--- a/tools/perf/util/trace-event-scripting.c
+++ b/tools/perf/util/trace-event-scripting.c
@@ -30,6 +30,11 @@
struct scripting_context *scripting_context;
+static int flush_script_unsupported(void)
+{
+ return 0;
+}
+
static int stop_script_unsupported(void)
{
return 0;
@@ -74,6 +79,7 @@
struct scripting_ops python_scripting_unsupported_ops = {
.name = "Python",
.start_script = python_start_script_unsupported,
+ .flush_script = flush_script_unsupported,
.stop_script = stop_script_unsupported,
.process_event = process_event_unsupported,
.generate_script = python_generate_script_unsupported,
@@ -137,6 +143,7 @@
struct scripting_ops perl_scripting_unsupported_ops = {
.name = "Perl",
.start_script = perl_start_script_unsupported,
+ .flush_script = flush_script_unsupported,
.stop_script = stop_script_unsupported,
.process_event = process_event_unsupported,
.generate_script = perl_generate_script_unsupported,
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 7b6d686..52aaa19 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -64,6 +64,7 @@
struct scripting_ops {
const char *name;
int (*start_script) (const char *script, int argc, const char **argv);
+ int (*flush_script) (void);
int (*stop_script) (void);
void (*process_event) (union perf_event *event,
struct perf_sample *sample,
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index e52e746..24e8d87 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -13,6 +13,7 @@
#include <limits.h>
#include <byteswap.h>
#include <linux/kernel.h>
+#include <unistd.h>
/*
* XXX We need to find a better place for these things...
@@ -282,6 +283,18 @@
ws->ws_col = 80;
}
+void set_term_quiet_input(struct termios *old)
+{
+ struct termios tc;
+
+ tcgetattr(0, old);
+ tc = *old;
+ tc.c_lflag &= ~(ICANON | ECHO);
+ tc.c_cc[VMIN] = 0;
+ tc.c_cc[VTIME] = 0;
+ tcsetattr(0, TCSANOW, &tc);
+}
+
static void set_tracing_events_path(const char *mountpoint)
{
snprintf(tracing_events_path, sizeof(tracing_events_path), "%s/%s",
@@ -443,6 +456,7 @@
size_t size = 0, alloc_size = 0;
void *bf = NULL, *nbf;
int fd, n, err = 0;
+ char sbuf[STRERR_BUFSIZE];
fd = open(filename, O_RDONLY);
if (fd < 0)
@@ -463,8 +477,8 @@
n = read(fd, bf + size, alloc_size - size);
if (n < 0) {
if (size) {
- pr_warning("read failed %d: %s\n",
- errno, strerror(errno));
+ pr_warning("read failed %d: %s\n", errno,
+ strerror_r(errno, sbuf, sizeof(sbuf)));
err = 0;
} else
err = -errno;
@@ -536,3 +550,39 @@
++m;
}
}
+
+bool find_process(const char *name)
+{
+ size_t len = strlen(name);
+ DIR *dir;
+ struct dirent *d;
+ int ret = -1;
+
+ dir = opendir(procfs__mountpoint());
+ if (!dir)
+ return false;
+
+ /* Scan /proc; a comm whose first len bytes match counts as found. */
+ while (ret && (d = readdir(dir)) != NULL) {
+ char path[PATH_MAX];
+ char *data;
+ size_t size;
+
+ if ((d->d_type != DT_DIR) ||
+ !strcmp(".", d->d_name) ||
+ !strcmp("..", d->d_name))
+ continue;
+
+ scnprintf(path, sizeof(path), "%s/%s/comm",
+ procfs__mountpoint(), d->d_name);
+
+ if (filename__read_str(path, &data, &size))
+ continue;
+
+ ret = strncmp(name, data, len);
+ free(data);
+ }
+
+ closedir(dir);
+ return ret ? false : true;
+}
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 6686436..d6a79b1 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -68,12 +68,13 @@
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <inttypes.h>
+#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/types.h>
#include <sys/ttydefaults.h>
#include <api/fs/debugfs.h>
#include <termios.h>
#include <linux/bitops.h>
extern const char *graph_line;
extern const char *graph_dotted_line;
@@ -307,6 +309,7 @@
extern int cacheline_size;
void get_term_dimensions(struct winsize *ws);
+void set_term_quiet_input(struct termios *old);
struct parse_tag {
char tag;
@@ -317,6 +320,21 @@
#define SRCLINE_UNKNOWN ((char *) "??:0")
+static inline int path__join(char *bf, size_t size,
+ const char *path1, const char *path2)
+{
+ return scnprintf(bf, size, "%s%s%s", path1, path1[0] ? "/" : "", path2);
+}
+
+static inline int path__join3(char *bf, size_t size,
+ const char *path1, const char *path2,
+ const char *path3)
+{
+ return scnprintf(bf, size, "%s%s%s%s%s",
+ path1, path1[0] ? "/" : "",
+ path2, path2[0] ? "/" : "", path3);
+}
+
struct dso;
char *get_srcline(struct dso *dso, unsigned long addr);
@@ -330,4 +348,5 @@
void mem_bswap_32(void *src, int byte_size);
const char *get_filename_for_perf_kvm(void);
+bool find_process(const char *name);
#endif /* GIT_COMPAT_UTIL_H */