Merge branch 'slab/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/penberg/linux

* 'slab/for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/penberg/linux:
  tools, slub: Fix off-by-one buffer corruption after readlink() call
  slub: Discard slab page when node partial > minimum partial number
  slub: correct comments error for per cpu partial
  mm: restrict access to slab files under procfs and sysfs
  slub: Code optimization in get_partial_node()
  slub: doc: update the slabinfo.c file path
  slub: explicitly document position of inserting slab to partial list
  slub: update slabinfo tools to report per cpu partial list statistics
  slub: per cpu cache for partial pages
  slub: return object pointer from get_partial() / new_slab().
  slub: pass kmem_cache_cpu pointer to get_partial()
  slub: Prepare inuse field in new_slab()
  slub: Remove useless statements in __slab_alloc
  slub: free slabs without holding locks
  slub: use print_hex_dump
  slab: use print_hex_dump
diff --git a/Documentation/RCU/NMI-RCU.txt b/Documentation/RCU/NMI-RCU.txt
index bf82851..687777f 100644
--- a/Documentation/RCU/NMI-RCU.txt
+++ b/Documentation/RCU/NMI-RCU.txt
@@ -95,7 +95,7 @@
 to free up the handler's data as soon as synchronize_sched() returns.
 
 Important note: for this to work, the architecture in question must
-invoke irq_enter() and irq_exit() on NMI entry and exit, respectively.
+invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
 
 
 Answer to Quick Quiz
diff --git a/Documentation/RCU/lockdep-splat.txt b/Documentation/RCU/lockdep-splat.txt
new file mode 100644
index 0000000..bf90611
--- /dev/null
+++ b/Documentation/RCU/lockdep-splat.txt
@@ -0,0 +1,110 @@
+Lockdep-RCU was added to the Linux kernel in early 2010
+(http://lwn.net/Articles/371986/).  This facility checks for some common
+misuses of the RCU API, most notably using one of the rcu_dereference()
+family to access an RCU-protected pointer without the proper protection.
+When such misuse is detected, a lockdep-RCU splat is emitted.
+
+The usual cause of a lockdep-RCU splat is someone accessing an
+RCU-protected data structure without either (1) being in the right kind of
+RCU read-side critical section or (2) holding the right update-side lock.
+This problem can therefore be serious: it might result in random memory
+overwriting or worse.  There can of course be false positives, this
+being the real world and all that.
+
+So let's look at an example RCU lockdep splat from 3.0-rc5, one that
+has long since been fixed:
+
+===============================
+[ INFO: suspicious RCU usage. ]
+-------------------------------
+block/cfq-iosched.c:2776 suspicious rcu_dereference_protected() usage!
+
+other info that might help us debug this:
+
+
+rcu_scheduler_active = 1, debug_locks = 0
+3 locks held by scsi_scan_6/1552:
+ #0:  (&shost->scan_mutex){+.+.+.}, at: [<ffffffff8145efca>]
+scsi_scan_host_selected+0x5a/0x150
+ #1:  (&eq->sysfs_lock){+.+...}, at: [<ffffffff812a5032>]
+elevator_exit+0x22/0x60
+ #2:  (&(&q->__queue_lock)->rlock){-.-...}, at: [<ffffffff812b6233>]
+cfq_exit_queue+0x43/0x190
+
+stack backtrace:
+Pid: 1552, comm: scsi_scan_6 Not tainted 3.0.0-rc5 #17
+Call Trace:
+ [<ffffffff810abb9b>] lockdep_rcu_dereference+0xbb/0xc0
+ [<ffffffff812b6139>] __cfq_exit_single_io_context+0xe9/0x120
+ [<ffffffff812b626c>] cfq_exit_queue+0x7c/0x190
+ [<ffffffff812a5046>] elevator_exit+0x36/0x60
+ [<ffffffff812a802a>] blk_cleanup_queue+0x4a/0x60
+ [<ffffffff8145cc09>] scsi_free_queue+0x9/0x10
+ [<ffffffff81460944>] __scsi_remove_device+0x84/0xd0
+ [<ffffffff8145dca3>] scsi_probe_and_add_lun+0x353/0xb10
+ [<ffffffff817da069>] ? error_exit+0x29/0xb0
+ [<ffffffff817d98ed>] ? _raw_spin_unlock_irqrestore+0x3d/0x80
+ [<ffffffff8145e722>] __scsi_scan_target+0x112/0x680
+ [<ffffffff812c690d>] ? trace_hardirqs_off_thunk+0x3a/0x3c
+ [<ffffffff817da069>] ? error_exit+0x29/0xb0
+ [<ffffffff812bcc60>] ? kobject_del+0x40/0x40
+ [<ffffffff8145ed16>] scsi_scan_channel+0x86/0xb0
+ [<ffffffff8145f0b0>] scsi_scan_host_selected+0x140/0x150
+ [<ffffffff8145f149>] do_scsi_scan_host+0x89/0x90
+ [<ffffffff8145f170>] do_scan_async+0x20/0x160
+ [<ffffffff8145f150>] ? do_scsi_scan_host+0x90/0x90
+ [<ffffffff810975b6>] kthread+0xa6/0xb0
+ [<ffffffff817db154>] kernel_thread_helper+0x4/0x10
+ [<ffffffff81066430>] ? finish_task_switch+0x80/0x110
+ [<ffffffff817d9c04>] ? retint_restore_args+0xe/0xe
+ [<ffffffff81097510>] ? __init_kthread_worker+0x70/0x70
+ [<ffffffff817db150>] ? gs_change+0xb/0xb
+
+Line 2776 of block/cfq-iosched.c in v3.0-rc5 is as follows:
+
+	if (rcu_dereference(ioc->ioc_data) == cic) {
+
+This form says that it must be in a plain vanilla RCU read-side critical
+section, but the "other info" list above shows that this is not the
+case.  Instead, we hold three locks, one of which might be RCU related.
+And maybe that lock really does protect this reference.  If so, the fix
+is to inform RCU, perhaps by changing __cfq_exit_single_io_context() to
+take the struct request_queue "q" from cfq_exit_queue() as an argument,
+which would permit us to invoke rcu_dereference_protected() as follows:
+
+	if (rcu_dereference_protected(ioc->ioc_data,
+				      lockdep_is_held(&q->queue_lock)) == cic) {
+
+With this change, there would be no lockdep-RCU splat emitted if this
+code was invoked either from within an RCU read-side critical section
+or with the ->queue_lock held.  In particular, this would have suppressed
+the above lockdep-RCU splat because ->queue_lock is held (see #2 in the
+list above).
+
+On the other hand, perhaps we really do need an RCU read-side critical
+section.  In this case, the critical section must span the use of the
+return value from rcu_dereference(), or at least until there is some
+reference count incremented or some such.  One way to handle this is to
+add rcu_read_lock() and rcu_read_unlock() as follows:
+
+	rcu_read_lock();
+	if (rcu_dereference(ioc->ioc_data) == cic) {
+		spin_lock(&ioc->lock);
+		rcu_assign_pointer(ioc->ioc_data, NULL);
+		spin_unlock(&ioc->lock);
+	}
+	rcu_read_unlock();
+
+With this change, the rcu_dereference() is always within an RCU
+read-side critical section, which again would have suppressed the
+above lockdep-RCU splat.
+
+But in this particular case, we don't actually dereference the pointer
+returned from rcu_dereference().  Instead, that pointer is just compared
+to the cic pointer, which means that the rcu_dereference() can be replaced
+by rcu_access_pointer() as follows:
+
+	if (rcu_access_pointer(ioc->ioc_data) == cic) {
+
+Because it is legal to invoke rcu_access_pointer() without protection,
+this change would also suppress the above lockdep-RCU splat.
diff --git a/Documentation/RCU/lockdep.txt b/Documentation/RCU/lockdep.txt
index d7a49b2..a102d4b 100644
--- a/Documentation/RCU/lockdep.txt
+++ b/Documentation/RCU/lockdep.txt
@@ -32,9 +32,27 @@
 	srcu_dereference(p, sp):
 		Check for SRCU read-side critical section.
 	rcu_dereference_check(p, c):
-		Use explicit check expression "c".  This is useful in
-		code that is invoked by both readers and updaters.
-	rcu_dereference_raw(p)
+		Use explicit check expression "c" along with
+		rcu_read_lock_held().  This is useful in code that is
+		invoked by both RCU readers and updaters.
+	rcu_dereference_bh_check(p, c):
+		Use explicit check expression "c" along with
+		rcu_read_lock_bh_held().  This is useful in code that
+		is invoked by both RCU-bh readers and updaters.
+	rcu_dereference_sched_check(p, c):
+		Use explicit check expression "c" along with
+		rcu_read_lock_sched_held().  This is useful in code that
+		is invoked by both RCU-sched readers and updaters.
+	srcu_dereference_check(p, c):
+		Use explicit check expression "c" along with
+		srcu_read_lock_held().  This is useful in code that
+		is invoked by both SRCU readers and updaters.
+	rcu_dereference_index_check(p, c):
+		Use explicit check expression "c", but the caller
+		must supply one of the rcu_read_lock_held() functions.
+		This is useful in code that uses RCU-protected arrays
+		and is invoked by both RCU readers and updaters.
+	rcu_dereference_raw(p):
 		Don't check.  (Use sparingly, if at all.)
 	rcu_dereference_protected(p, c):
 		Use explicit check expression "c", and omit all barriers
@@ -48,13 +66,11 @@
 		value of the pointer itself, for example, against NULL.
 
 The rcu_dereference_check() check expression can be any boolean
-expression, but would normally include one of the rcu_read_lock_held()
-family of functions and a lockdep expression.  However, any boolean
-expression can be used.  For a moderately ornate example, consider
-the following:
+expression, but would normally include a lockdep expression.  However,
+any boolean expression can be used.  For a moderately ornate example,
+consider the following:
 
 	file = rcu_dereference_check(fdt->fd[fd],
-				     rcu_read_lock_held() ||
 				     lockdep_is_held(&files->file_lock) ||
 				     atomic_read(&files->count) == 1);
 
@@ -62,7 +78,7 @@
 and, if CONFIG_PROVE_RCU is configured, verifies that this expression
 is used in:
 
-1.	An RCU read-side critical section, or
+1.	An RCU read-side critical section (implicit), or
 2.	with files->file_lock held, or
 3.	on an unshared files_struct.
 
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 5d90167..783d6c1 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -42,7 +42,7 @@
 fqs_stutter	Wait time (in seconds) between consecutive bursts
 		of calls to force_quiescent_state().
 
-irqreaders	Says to invoke RCU readers from irq level.  This is currently
+irqreader	Says to invoke RCU readers from irq level.  This is currently
 		done via timers.  Defaults to "1" for variants of RCU that
 		permit this.  (Or, more accurately, variants of RCU that do
 		-not- permit this know to ignore this variable.)
@@ -79,19 +79,68 @@
 		Specifying "stutter=0" causes the test to run continuously
 		without pausing, which is the old default behavior.
 
+test_boost	Whether or not to test the ability of RCU to do priority
+		boosting.  Defaults to "test_boost=1", which performs
+		RCU priority-inversion testing only if the selected
+		RCU implementation supports priority boosting.  Specifying
+		"test_boost=0" never performs RCU priority-inversion
+		testing.  Specifying "test_boost=2" performs RCU
+		priority-inversion testing even if the selected RCU
+		implementation does not support RCU priority boosting,
+		which can be used to test rcutorture's ability to
+		carry out RCU priority-inversion testing.
+
+test_boost_interval
+		The number of seconds in an RCU priority-inversion test
+		cycle.	Defaults to "test_boost_interval=7".  It is
+		usually wise for this value to be relatively prime to
+		the value selected for "stutter".
+
+test_boost_duration
+		The number of seconds to do RCU priority-inversion testing
+		within any given "test_boost_interval".  Defaults to
+		"test_boost_duration=4".
+
 test_no_idle_hz	Whether or not to test the ability of RCU to operate in
 		a kernel that disables the scheduling-clock interrupt to
 		idle CPUs.  Boolean parameter, "1" to test, "0" otherwise.
 		Defaults to omitting this test.
 
-torture_type	The type of RCU to test: "rcu" for the rcu_read_lock() API,
-		"rcu_sync" for rcu_read_lock() with synchronous reclamation,
-		"rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for
-		rcu_read_lock_bh() with synchronous reclamation, "srcu" for
-		the "srcu_read_lock()" API, "sched" for the use of
-		preempt_disable() together with synchronize_sched(),
-		and "sched_expedited" for the use of preempt_disable()
-		with synchronize_sched_expedited().
+torture_type	The type of RCU to test, with string values as follows:
+
+		"rcu":  rcu_read_lock(), rcu_read_unlock() and call_rcu().
+
+		"rcu_sync":  rcu_read_lock(), rcu_read_unlock(), and
+			synchronize_rcu().
+
+		"rcu_expedited": rcu_read_lock(), rcu_read_unlock(), and
+			synchronize_rcu_expedited().
+
+		"rcu_bh": rcu_read_lock_bh(), rcu_read_unlock_bh(), and
+			call_rcu_bh().
+
+		"rcu_bh_sync": rcu_read_lock_bh(), rcu_read_unlock_bh(),
+			and synchronize_rcu_bh().
+
+		"rcu_bh_expedited": rcu_read_lock_bh(), rcu_read_unlock_bh(),
+			and synchronize_rcu_bh_expedited().
+
+		"srcu": srcu_read_lock(), srcu_read_unlock() and
+			synchronize_srcu().
+
+		"srcu_expedited": srcu_read_lock(), srcu_read_unlock() and
+			synchronize_srcu_expedited().
+
+		"sched": preempt_disable(), preempt_enable(), and
+			call_rcu_sched().
+
+		"sched_sync": preempt_disable(), preempt_enable(), and
+			synchronize_sched().
+
+		"sched_expedited": preempt_disable(), preempt_enable(), and
+			synchronize_sched_expedited().
+
+		Defaults to "rcu".
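+
+		For example (illustrative invocation), the RCU-bh flavor
+		could be exercised by loading the module with:
+
+			modprobe rcutorture torture_type=rcu_bh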
 
 verbose		Enable debug printk()s.  Default is disabled.
 
@@ -100,12 +149,12 @@
 
 The statistics output is as follows:
 
-	rcu-torture: --- Start of test: nreaders=16 stat_interval=0 verbose=0
-	rcu-torture: rtc: 0000000000000000 ver: 1916 tfle: 0 rta: 1916 rtaf: 0 rtf: 1915
-	rcu-torture: Reader Pipe:  1466408 9747 0 0 0 0 0 0 0 0 0
-	rcu-torture: Reader Batch:  1464477 11678 0 0 0 0 0 0 0 0
-	rcu-torture: Free-Block Circulation:  1915 1915 1915 1915 1915 1915 1915 1915 1915 1915 0
-	rcu-torture: --- End of test
+	rcu-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4
+	rcu-torture: rtc:           (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767
+	rcu-torture: Reader Pipe:  727860534 34213 0 0 0 0 0 0 0 0 0
+	rcu-torture: Reader Batch:  727877838 17003 0 0 0 0 0 0 0 0 0
+	rcu-torture: Free-Block Circulation:  155440 155440 155440 155440 155440 155440 155440 155440 155440 155440 0
+	rcu-torture:--- End of test: SUCCESS: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4
 
 The command "dmesg | grep torture:" will extract this information on
 most systems.  On more esoteric configurations, it may be necessary to
@@ -113,26 +162,55 @@
 the RCU torture test.  The printk()s use KERN_ALERT, so they should
 be evident.  ;-)
 
+The first and last lines show the rcutorture module parameters, and the
+last line shows either "SUCCESS" or "FAILURE", based on rcutorture's
+automatic determination as to whether RCU operated correctly.
+
 The entries are as follows:
 
 o	"rtc": The hexadecimal address of the structure currently visible
 	to readers.
 
-o	"ver": The number of times since boot that the rcutw writer task
+o	"ver": The number of times since boot that the RCU writer task
 	has changed the structure visible to readers.
 
 o	"tfle": If non-zero, indicates that the "torture freelist"
-	containing structure to be placed into the "rtc" area is empty.
+	containing structures to be placed into the "rtc" area is empty.
 	This condition is important, since it can fool you into thinking
 	that RCU is working when it is not.  :-/
 
 o	"rta": Number of structures allocated from the torture freelist.
 
 o	"rtaf": Number of allocations from the torture freelist that have
-	failed due to the list being empty.
+	failed due to the list being empty.  It is not unusual for this
+	to be non-zero, but it is bad for it to be a large fraction of
+	the value indicated by "rta".
 
 o	"rtf": Number of frees into the torture freelist.
 
+o	"rtmbe": A non-zero value indicates that rcutorture believes that
+	rcu_assign_pointer() and rcu_dereference() are not working
+	correctly.  This value should be zero.
+
+o	"rtbke": rcutorture was unable to create the real-time kthreads
+	used to force RCU priority inversion.  This value should be zero.
+
+o	"rtbre": Although rcutorture successfully created the kthreads
+	used to force RCU priority inversion, it was unable to set them
+	to the real-time priority level of 1.  This value should be zero.
+
+o	"rtbf": The number of times that RCU priority boosting failed
+	to resolve RCU priority inversion.
+
+o	"rtb": The number of times that rcutorture attempted to force
+	an RCU priority inversion condition.  If you are testing RCU
+	priority boosting via the "test_boost" module parameter, this
+	value should be non-zero.
+
+o	"nt": The number of times rcutorture ran RCU read-side code from
+	within a timer handler.  This value should be non-zero only
+	if you specified the "irqreader" module parameter.
+
 o	"Reader Pipe": Histogram of "ages" of structures seen by readers.
 	If any entries past the first two are non-zero, RCU is broken.
 	And rcutorture prints the error flag string "!!!" to make sure
@@ -162,26 +240,15 @@
 	somehow gets incremented farther than it should.
 
 Different implementations of RCU can provide implementation-specific
-additional information.  For example, SRCU provides the following:
+additional information.  For example, SRCU provides the following
+additional line:
 
-	srcu-torture: rtc: f8cf46a8 ver: 355 tfle: 0 rta: 356 rtaf: 0 rtf: 346 rtmbe: 0
-	srcu-torture: Reader Pipe:  559738 939 0 0 0 0 0 0 0 0 0
-	srcu-torture: Reader Batch:  560434 243 0 0 0 0 0 0 0 0
-	srcu-torture: Free-Block Circulation:  355 354 353 352 351 350 349 348 347 346 0
 	srcu-torture: per-CPU(idx=1): 0(0,1) 1(0,1) 2(0,0) 3(0,1)
 
-The first four lines are similar to those for RCU.  The last line shows
-the per-CPU counter state.  The numbers in parentheses are the values
-of the "old" and "current" counters for the corresponding CPU.  The
-"idx" value maps the "old" and "current" values to the underlying array,
-and is useful for debugging.
-
-Similarly, sched_expedited RCU provides the following:
-
-	sched_expedited-torture: rtc: d0000000016c1880 ver: 1090796 tfle: 0 rta: 1090796 rtaf: 0 rtf: 1090787 rtmbe: 0 nt: 27713319
-	sched_expedited-torture: Reader Pipe:  12660320201 95875 0 0 0 0 0 0 0 0 0
-	sched_expedited-torture: Reader Batch:  12660424885 0 0 0 0 0 0 0 0 0 0
-	sched_expedited-torture: Free-Block Circulation:  1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
+This line shows the per-CPU counter state.  The numbers in parentheses are
+the values of the "old" and "current" counters for the corresponding CPU.
+The "idx" value maps the "old" and "current" values to the underlying
+array, and is useful for debugging.
 
 
 USAGE
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 8173cec..aaf65f6 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -33,23 +33,23 @@
 The output of "cat rcu/rcudata" looks as follows:
 
 rcu_sched:
-  0 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=545/1/0 df=50 of=0 ri=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0
-  1 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=967/1/0 df=58 of=0 ri=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0
-  2 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=1081/1/0 df=175 of=0 ri=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0
-  3 c=20942 g=20943 pq=1 pqc=20942 qp=1 dt=1846/0/0 df=404 of=0 ri=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0
-  4 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=369/1/0 df=83 of=0 ri=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0
-  5 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=381/1/0 df=64 of=0 ri=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0
-  6 c=20972 g=20973 pq=1 pqc=20972 qp=0 dt=1037/1/0 df=183 of=0 ri=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0
-  7 c=20897 g=20897 pq=1 pqc=20896 qp=0 dt=1572/0/0 df=382 of=0 ri=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0
+  0 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=545/1/0 df=50 of=0 ri=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0
+  1 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=967/1/0 df=58 of=0 ri=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0
+  2 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1081/1/0 df=175 of=0 ri=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0
+  3 c=20942 g=20943 pq=1 pgp=20942 qp=1 dt=1846/0/0 df=404 of=0 ri=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0
+  4 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=369/1/0 df=83 of=0 ri=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0
+  5 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=381/1/0 df=64 of=0 ri=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0
+  6 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1037/1/0 df=183 of=0 ri=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0
+  7 c=20897 g=20897 pq=1 pgp=20896 qp=0 dt=1572/0/0 df=382 of=0 ri=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0
 rcu_bh:
-  0 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=545/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0
-  1 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=967/1/0 df=3 of=0 ri=1 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0
-  2 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=1081/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0
-  3 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=1846/0/0 df=8 of=0 ri=1 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0
-  4 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=369/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0
-  5 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=381/1/0 df=4 of=0 ri=1 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0
-  6 c=1480 g=1480 pq=1 pqc=1479 qp=0 dt=1037/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0
-  7 c=1474 g=1474 pq=1 pqc=1473 qp=0 dt=1572/0/0 df=8 of=0 ri=1 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0
+  0 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=545/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0
+  1 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=967/1/0 df=3 of=0 ri=1 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0
+  2 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1081/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0
+  3 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1846/0/0 df=8 of=0 ri=1 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0
+  4 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=369/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0
+  5 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=381/1/0 df=4 of=0 ri=1 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0
+  6 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1037/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0
+  7 c=1474 g=1474 pq=1 pgp=1473 qp=0 dt=1572/0/0 df=8 of=0 ri=1 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0
 
 The first section lists the rcu_data structures for rcu_sched, the second
 for rcu_bh.  Note that CONFIG_TREE_PREEMPT_RCU kernels will have an
@@ -84,7 +84,7 @@
 	CPU has not yet reported that fact, (2) some other CPU has not
 	yet reported for this grace period, or (3) both.
 
-o	"pqc" indicates which grace period the last-observed quiescent
+o	"pgp" indicates which grace period the last-observed quiescent
 	state for this CPU corresponds to.  This is important for handling
 	the race between CPU 0 reporting an extended dynticks-idle
 	quiescent state for CPU 1 and CPU 1 suddenly waking up and
@@ -184,10 +184,14 @@
 	The number after the final slash is the CPU that the kthread
 	is actually running on.
 
+	This field is displayed only for CONFIG_RCU_BOOST kernels.
+
 o	"ktl" is the low-order 16 bits (in hexadecimal) of the count of
 	the number of times that this CPU's per-CPU kthread has gone
 	through its loop servicing invoke_rcu_cpu_kthread() requests.
 
+	This field is displayed only for CONFIG_RCU_BOOST kernels.
+
 o	"b" is the batch limit for this CPU.  If more than this number
 	of RCU callbacks is ready to invoke, then the remainder will
 	be deferred.
diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt
new file mode 100644
index 0000000..f6b1873
--- /dev/null
+++ b/Documentation/scheduler/sched-bwc.txt
@@ -0,0 +1,122 @@
+CFS Bandwidth Control
+=====================
+
+[ This document only discusses CPU bandwidth control for SCHED_NORMAL.
+  The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ]
+
+CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the
+specification of the maximum CPU bandwidth available to a group or hierarchy.
+
+The bandwidth allowed for a group is specified using a quota and period. Within
+each given "period" (microseconds), a group is allowed to consume only up to
+"quota" microseconds of CPU time.  When the CPU bandwidth consumption of a
+group exceeds this limit (for that period), the tasks belonging to its
+hierarchy will be throttled and are not allowed to run again until the next
+period.
+
+A group's unused runtime is globally tracked, being refreshed with quota units
+above at each period boundary.  As threads consume this bandwidth it is
+transferred to cpu-local "silos" on a demand basis.  The amount transferred
+within each of these updates is tunable and described as the "slice".
+
+Management
+----------
+Quota and period are managed within the cpu subsystem via cgroupfs.
+
+cpu.cfs_quota_us: the total available run-time within a period (in microseconds)
+cpu.cfs_period_us: the length of a period (in microseconds)
+cpu.stat: exports throttling statistics [explained further below]
+
+The default values are:
+	cpu.cfs_period_us=100ms
+	cpu.cfs_quota_us=-1
+
+A value of -1 for cpu.cfs_quota_us indicates that the group does not have any
+bandwidth restriction in place; such a group is described as an unconstrained
+bandwidth group.  This represents the traditional work-conserving behavior for
+CFS.
+
+Writing any (valid) positive value(s) will enact the specified bandwidth limit.
+The minimum allowed for either the quota or the period is 1ms.  There is also
+an upper bound on the period length of 1s.  Additional restrictions exist when
+bandwidth limits are used in a hierarchical fashion; these are explained in
+more detail below.
+
+Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit
+and return the group to an unconstrained state once more.
+
+Any updates to a group's bandwidth specification will result in it becoming
+unthrottled if it is in a constrained state.
+
+System wide settings
+--------------------
+For efficiency, run-time is transferred between the global pool and CPU local
+"silos" in a batch fashion.  This greatly reduces global accounting pressure
+on large systems.  The amount transferred each time such an update is required
+is described as the "slice".
+
+This is tunable via procfs:
+	/proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms)
+
+Larger slice values will reduce transfer overheads, while smaller values allow
+for more fine-grained consumption.
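+
+For example, a sketch of raising the slice to 10ms (an illustrative value,
+not a recommendation):
+
+	# echo 10000 > /proc/sys/kernel/sched_cfs_bandwidth_slice_us /* slice = 10ms */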
+
+Statistics
+----------
+A group's bandwidth statistics are exported via 3 fields in cpu.stat.
+
+cpu.stat:
+- nr_periods: Number of enforcement intervals that have elapsed.
+- nr_throttled: Number of times the group has been throttled/limited.
+- throttled_time: The total time duration (in nanoseconds) for which entities
+  of the group have been throttled.
+
+This interface is read-only.
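+
+As an illustration (the values below are invented, not measured), the
+statistics might be read as follows:
+
+	# cat cpu.stat
+	nr_periods 200
+	nr_throttled 40
+	throttled_time 123456789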
+
+Hierarchical considerations
+---------------------------
+The interface enforces that an individual entity's bandwidth is always
+attainable, that is: max(c_i) <= C. However, over-subscription in the
+aggregate case is explicitly allowed to enable work-conserving semantics
+within a hierarchy.
+  e.g. \Sum (c_i) may exceed C
+[ Where C is the parent's bandwidth, and c_i its children ]
+
+
+There are two ways in which a group may become throttled:
+	a. it fully consumes its own quota within a period
+	b. a parent's quota is fully consumed within its period
+
+In case b) above, even though the child may have runtime remaining, it will
+not be allowed to run until the parent's runtime is refreshed.
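+
+As a worked example (illustrative numbers): give a parent group quota=250ms
+with period=250ms (1 CPU), and two children each quota=200ms with period=250ms.
+Each child individually satisfies max(c_i) <= C, yet if both children run
+flat out they exhaust the parent's 250ms before either exhausts its own 200ms,
+and both are throttled as in case b) until the parent's quota is refreshed.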
+
+Examples
+--------
+1. Limit a group to 1 CPU worth of runtime.
+
+	If period is 250ms and quota is also 250ms, the group will get
+	1 CPU worth of runtime every 250ms.
+
+	# echo 250000 > cpu.cfs_quota_us /* quota = 250ms */
+	# echo 250000 > cpu.cfs_period_us /* period = 250ms */
+
+2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine.
+
+	With 500ms period and 1000ms quota, the group can get 2 CPUs worth of
+	runtime every 500ms.
+
+	# echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */
+	# echo 500000 > cpu.cfs_period_us /* period = 500ms */
+
+	The larger period here allows for increased burst capacity.
+
+3. Limit a group to 20% of 1 CPU.
+
+	With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU.
+
+	# echo 10000 > cpu.cfs_quota_us /* quota = 10ms */
+	# echo 50000 > cpu.cfs_period_us /* period = 50ms */
+
+	By using a small period here we are ensuring a consistent latency
+	response at the expense of burst capacity.
+
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 795126e..8090cad 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -346,7 +346,6 @@
 config ARCH_PRIMA2
 	bool "CSR SiRFSoC PRIMA2 ARM Cortex A9 Platform"
 	select CPU_V7
-	select GENERIC_TIME
 	select NO_IOPORT
 	select GENERIC_CLOCKEVENTS
 	select CLKDEV_LOOKUP
@@ -520,7 +519,6 @@
 	select ARM_AMBA
 	select USB_ARCH_HAS_OHCI
 	select CLKDEV_LOOKUP
-	select GENERIC_TIME
 	select GENERIC_CLOCKEVENTS
 	help
 	  Support for the NXP LPC32XX family of processors
@@ -599,7 +597,6 @@
 	bool "NVIDIA Tegra"
 	select CLKDEV_LOOKUP
 	select CLKSRC_MMIO
-	select GENERIC_TIME
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_GPIO
 	select HAVE_CLK
@@ -914,7 +911,6 @@
 config ARCH_ZYNQ
 	bool "Xilinx Zynq ARM Cortex A9 Platform"
 	select CPU_V7
-	select GENERIC_TIME
 	select GENERIC_CLOCKEVENTS
 	select CLKDEV_LOOKUP
 	select ARM_GIC
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index c747629..abe5a9e 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -248,10 +248,6 @@
 	depends on SMP && HOTPLUG
 	default y
 
-config HAVE_LEGACY_PER_CPU_AREA
-	def_bool y
-	depends on SMP
-
 config BF_REV_MIN
 	int
 	default 0 if (BF51x || BF52x || (BF54x && !BF54xM))
diff --git a/arch/blackfin/configs/BF548-EZKIT_defconfig b/arch/blackfin/configs/BF548-EZKIT_defconfig
index 56151b5..0e6d841 100644
--- a/arch/blackfin/configs/BF548-EZKIT_defconfig
+++ b/arch/blackfin/configs/BF548-EZKIT_defconfig
@@ -4,7 +4,6 @@
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
-# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_EXPERT=y
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_ELF_CORE is not set
@@ -40,7 +39,6 @@
 CONFIG_EBIU_FCTLVAL=0x6
 CONFIG_BINFMT_FLAT=y
 CONFIG_BINFMT_ZFLAT=y
-CONFIG_PM=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -55,7 +53,6 @@
 CONFIG_CAN=m
 CONFIG_CAN_RAW=m
 CONFIG_CAN_BCM=m
-CONFIG_CAN_DEV=m
 CONFIG_CAN_BFIN=m
 CONFIG_IRDA=m
 CONFIG_IRLAN=m
@@ -67,7 +64,6 @@
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_FW_LOADER=m
 CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
 CONFIG_MTD_CMDLINE_PARTS=y
 CONFIG_MTD_CHAR=y
 CONFIG_MTD_BLOCK=y
@@ -105,12 +101,12 @@
 CONFIG_TOUCHSCREEN_AD7877=m
 CONFIG_INPUT_MISC=y
 # CONFIG_SERIO is not set
-# CONFIG_DEVKMEM is not set
+# CONFIG_LEGACY_PTYS is not set
 CONFIG_BFIN_JTAG_COMM=m
+# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_BFIN=y
 CONFIG_SERIAL_BFIN_CONSOLE=y
 CONFIG_SERIAL_BFIN_UART1=y
-# CONFIG_LEGACY_PTYS is not set
 # CONFIG_HW_RANDOM is not set
 CONFIG_I2C=y
 CONFIG_I2C_CHARDEV=y
@@ -163,6 +159,7 @@
 CONFIG_USB_OTG_BLACKLIST_HUB=y
 CONFIG_USB_MON=y
 CONFIG_USB_MUSB_HDRC=y
+CONFIG_USB_MUSB_BLACKFIN=y
 CONFIG_USB_STORAGE=y
 CONFIG_MMC=y
 CONFIG_MMC_BLOCK=m
@@ -185,8 +182,6 @@
 CONFIG_NFS_V3=y
 CONFIG_NFSD=m
 CONFIG_NFSD_V3=y
-CONFIG_SMB_FS=m
-CONFIG_SMB_NLS_DEFAULT=y
 CONFIG_CIFS=y
 CONFIG_NLS_CODEPAGE_437=m
 CONFIG_NLS_CODEPAGE_936=m
@@ -196,7 +191,6 @@
 CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_DEBUG_INFO=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set
 # CONFIG_FTRACE is not set
 CONFIG_DEBUG_MMRS=y
 CONFIG_DEBUG_HWERR=y
@@ -206,5 +200,4 @@
 CONFIG_EARLY_PRINTK=y
 CONFIG_CPLB_INFO=y
 CONFIG_BFIN_PSEUDODBG_INSNS=y
-CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild
index 7a075ea..5a0625a 100644
--- a/arch/blackfin/include/asm/Kbuild
+++ b/arch/blackfin/include/asm/Kbuild
@@ -21,6 +21,7 @@
 generic-y += local.h
 generic-y += mman.h
 generic-y += msgbuf.h
+generic-y += mutex.h
 generic-y += param.h
 generic-y += percpu.h
 generic-y += pgalloc.h
diff --git a/arch/blackfin/include/asm/atomic.h b/arch/blackfin/include/asm/atomic.h
index 1352256..54c6e28 100644
--- a/arch/blackfin/include/asm/atomic.h
+++ b/arch/blackfin/include/asm/atomic.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2004-2009 Analog Devices Inc.
+ * Copyright 2004-2011 Analog Devices Inc.
  *
  * Licensed under the GPL-2 or later.
  */
@@ -7,111 +7,27 @@
 #ifndef __ARCH_BLACKFIN_ATOMIC__
 #define __ARCH_BLACKFIN_ATOMIC__
 
-#ifndef CONFIG_SMP
-# include <asm-generic/atomic.h>
-#else
+#ifdef CONFIG_SMP
 
-#include <linux/types.h>
-#include <asm/system.h>	/* local_irq_XXX() */
-
-/*
- * Atomic operations that C can't guarantee us.  Useful for
- * resource counting etc..
- */
-
-#define ATOMIC_INIT(i)	{ (i) }
-#define atomic_set(v, i)	(((v)->counter) = i)
-
-#define atomic_read(v)	__raw_uncached_fetch_asm(&(v)->counter)
+#include <linux/linkage.h>
 
 asmlinkage int __raw_uncached_fetch_asm(const volatile int *ptr);
-
 asmlinkage int __raw_atomic_update_asm(volatile int *ptr, int value);
-
 asmlinkage int __raw_atomic_clear_asm(volatile int *ptr, int value);
-
 asmlinkage int __raw_atomic_set_asm(volatile int *ptr, int value);
-
 asmlinkage int __raw_atomic_xor_asm(volatile int *ptr, int value);
-
 asmlinkage int __raw_atomic_test_asm(const volatile int *ptr, int value);
 
-static inline void atomic_add(int i, atomic_t *v)
-{
-	__raw_atomic_update_asm(&v->counter, i);
-}
+#define atomic_read(v) __raw_uncached_fetch_asm(&(v)->counter)
 
-static inline void atomic_sub(int i, atomic_t *v)
-{
-	__raw_atomic_update_asm(&v->counter, -i);
-}
+#define atomic_add_return(i, v) __raw_atomic_update_asm(&(v)->counter, i)
+#define atomic_sub_return(i, v) __raw_atomic_update_asm(&(v)->counter, -(i))
 
-static inline int atomic_add_return(int i, atomic_t *v)
-{
-	return __raw_atomic_update_asm(&v->counter, i);
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
-	return __raw_atomic_update_asm(&v->counter, -i);
-}
-
-static inline void atomic_inc(volatile atomic_t *v)
-{
-	__raw_atomic_update_asm(&v->counter, 1);
-}
-
-static inline void atomic_dec(volatile atomic_t *v)
-{
-	__raw_atomic_update_asm(&v->counter, -1);
-}
-
-static inline void atomic_clear_mask(int mask, atomic_t *v)
-{
-	__raw_atomic_clear_asm(&v->counter, mask);
-}
-
-static inline void atomic_set_mask(int mask, atomic_t *v)
-{
-	__raw_atomic_set_asm(&v->counter, mask);
-}
-
-/* Atomic operations are already serializing */
-#define smp_mb__before_atomic_dec()    barrier()
-#define smp_mb__after_atomic_dec() barrier()
-#define smp_mb__before_atomic_inc()    barrier()
-#define smp_mb__after_atomic_inc() barrier()
-
-#define atomic_add_negative(a, v)	(atomic_add_return((a), (v)) < 0)
-#define atomic_dec_return(v) atomic_sub_return(1,(v))
-#define atomic_inc_return(v) atomic_add_return(1,(v))
-
-#define atomic_cmpxchg(v, o, n) ((int)cmpxchg(&((v)->counter), (o), (n)))
-#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
-
-#define __atomic_add_unless(v, a, u)				\
-({								\
-	int c, old;						\
-	c = atomic_read(v);					\
-	while (c != (u) && (old = atomic_cmpxchg((v), c, c + (a))) != c) \
-		c = old;					\
-	c;							\
-})
-
-/*
- * atomic_inc_and_test - increment and test
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-#define atomic_inc_and_test(v) (atomic_inc_return(v) == 0)
-
-#define atomic_sub_and_test(i,v) (atomic_sub_return((i), (v)) == 0)
-#define atomic_dec_and_test(v) (atomic_sub_return(1, (v)) == 0)
-
+#define atomic_clear_mask(m, v) __raw_atomic_clear_asm(&(v)->counter, m)
+#define atomic_set_mask(m, v)   __raw_atomic_set_asm(&(v)->counter, m)
 
 #endif
 
+#include <asm-generic/atomic.h>
+
 #endif
diff --git a/arch/blackfin/include/asm/mutex.h b/arch/blackfin/include/asm/mutex.h
deleted file mode 100644
index ff6101a..0000000
--- a/arch/blackfin/include/asm/mutex.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/mutex-dec.h>
diff --git a/arch/blackfin/include/asm/uaccess.h b/arch/blackfin/include/asm/uaccess.h
index 1c0d190..5cc1115 100644
--- a/arch/blackfin/include/asm/uaccess.h
+++ b/arch/blackfin/include/asm/uaccess.h
@@ -195,17 +195,17 @@
 copy_from_user(void *to, const void __user *from, unsigned long n)
 {
 	if (access_ok(VERIFY_READ, from, n))
-		memcpy(to, from, n);
+		memcpy(to, (const void __force *)from, n);
 	else
 		return n;
 	return 0;
 }
 
 static inline unsigned long __must_check
-copy_to_user(void *to, const void __user *from, unsigned long n)
+copy_to_user(void __user *to, const void *from, unsigned long n)
 {
 	if (access_ok(VERIFY_WRITE, to, n))
-		memcpy(to, from, n);
+		memcpy((void __force *)to, from, n);
 	else
 		return n;
 	return 0;
diff --git a/arch/blackfin/kernel/Makefile b/arch/blackfin/kernel/Makefile
index b7bdc42..1f88edd 100644
--- a/arch/blackfin/kernel/Makefile
+++ b/arch/blackfin/kernel/Makefile
@@ -38,6 +38,6 @@
 
 # the kgdb test puts code into L2 and without linker
 # relaxation, we need to force long calls to/from it
-CFLAGS_kgdb_test.o := -mlong-calls -O0
+CFLAGS_kgdb_test.o := -mlong-calls
 
 obj-$(CONFIG_DEBUG_MMRS)             += debug-mmrs.o
diff --git a/arch/blackfin/kernel/kgdb_test.c b/arch/blackfin/kernel/kgdb_test.c
index 2a6e9db..4a7dcfe 100644
--- a/arch/blackfin/kernel/kgdb_test.c
+++ b/arch/blackfin/kernel/kgdb_test.c
@@ -50,8 +50,7 @@
 
 #endif
 
-
-int kgdb_test(char *name, int len, int count, int z)
+noinline int kgdb_test(char *name, int len, int count, int z)
 {
 	pr_alert("kgdb name(%d): %s, %d, %d\n", len, name, count, z);
 	count = z;
diff --git a/arch/blackfin/kernel/time-ts.c b/arch/blackfin/kernel/time-ts.c
index 9e9b60d..1bcf3a3 100644
--- a/arch/blackfin/kernel/time-ts.c
+++ b/arch/blackfin/kernel/time-ts.c
@@ -188,8 +188,7 @@
 
 static struct irqaction gptmr0_irq = {
 	.name		= "Blackfin GPTimer0",
-	.flags		= IRQF_DISABLED | IRQF_TIMER | \
-			  IRQF_IRQPOLL | IRQF_PERCPU,
+	.flags		= IRQF_TIMER | IRQF_IRQPOLL | IRQF_PERCPU,
 	.handler	= bfin_gptmr0_interrupt,
 };
 
@@ -297,8 +296,7 @@
 
 static struct irqaction coretmr_irq = {
 	.name		= "Blackfin CoreTimer",
-	.flags		= IRQF_DISABLED | IRQF_TIMER | \
-			  IRQF_IRQPOLL | IRQF_PERCPU,
+	.flags		= IRQF_TIMER | IRQF_IRQPOLL | IRQF_PERCPU,
 	.handler	= bfin_coretmr_interrupt,
 };
 
diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c
index ceb2bf6..2310b24 100644
--- a/arch/blackfin/kernel/time.c
+++ b/arch/blackfin/kernel/time.c
@@ -25,7 +25,6 @@
 
 static struct irqaction bfin_timer_irq = {
 	.name = "Blackfin Timer Tick",
-	.flags = IRQF_DISABLED
 };
 
 #if defined(CONFIG_IPIPE)
diff --git a/arch/blackfin/mach-bf533/boards/H8606.c b/arch/blackfin/mach-bf533/boards/H8606.c
index eb325ed..5da5787 100644
--- a/arch/blackfin/mach-bf533/boards/H8606.c
+++ b/arch/blackfin/mach-bf533/boards/H8606.c
@@ -54,7 +54,8 @@
 	[2] = {
 		.start	= IRQ_PF10,
 		.end	= IRQ_PF10,
-		.flags	= (IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHEDGE | IRQF_SHARED | IRQF_TRIGGER_HIGH),
+		.flags	= (IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHEDGE |
+		           IORESOURCE_IRQ_SHAREABLE),
 	},
 };
 
diff --git a/arch/blackfin/mach-bf537/boards/cm_bf537e.c b/arch/blackfin/mach-bf537/boards/cm_bf537e.c
index 44fd840..9fb20d6 100644
--- a/arch/blackfin/mach-bf537/boards/cm_bf537e.c
+++ b/arch/blackfin/mach-bf537/boards/cm_bf537e.c
@@ -605,7 +605,7 @@
 
 static struct pata_platform_info bfin_pata_platform_data = {
 	.ioport_shift = 2,
-	.irq_type = IRQF_TRIGGER_HIGH | IRQF_DISABLED,
+	.irq_type = IRQF_TRIGGER_HIGH,
 };
 
 static struct resource bfin_pata_resources[] = {
diff --git a/arch/blackfin/mach-bf537/boards/cm_bf537u.c b/arch/blackfin/mach-bf537/boards/cm_bf537u.c
index 1b4ac5c..5ba389f 100644
--- a/arch/blackfin/mach-bf537/boards/cm_bf537u.c
+++ b/arch/blackfin/mach-bf537/boards/cm_bf537u.c
@@ -570,7 +570,7 @@
 
 static struct pata_platform_info bfin_pata_platform_data = {
 	.ioport_shift = 2,
-	.irq_type = IRQF_TRIGGER_HIGH | IRQF_DISABLED,
+	.irq_type = IRQF_TRIGGER_HIGH,
 };
 
 static struct resource bfin_pata_resources[] = {
diff --git a/arch/blackfin/mach-bf537/boards/stamp.c b/arch/blackfin/mach-bf537/boards/stamp.c
index b52e672..6c916a6 100644
--- a/arch/blackfin/mach-bf537/boards/stamp.c
+++ b/arch/blackfin/mach-bf537/boards/stamp.c
@@ -962,10 +962,10 @@
 	},
 #endif
 
-#if defined(CONFIG_SND_BF5XX_SOC_AD183X) \
-	|| defined(CONFIG_SND_BF5XX_SOC_AD183X_MODULE)
+#if defined(CONFIG_SND_BF5XX_SOC_AD1836) \
+	|| defined(CONFIG_SND_BF5XX_SOC_AD1836_MODULE)
 	{
-		.modalias = "ad183x",
+		.modalias = "ad1836",
 		.max_speed_hz = 3125000,     /* max spi clock (SCK) speed in HZ */
 		.bus_num = 0,
 		.chip_select = 4,
@@ -984,9 +984,9 @@
 	},
 #endif
 
-#if defined(CONFIG_SND_BF5XX_SOC_ADAV80X) || defined(CONFIG_SND_BF5XX_SOC_ADAV80X_MODULE)
+#if defined(CONFIG_SND_SOC_ADAV80X) || defined(CONFIG_SND_SOC_ADAV80X_MODULE)
 	{
-		.modalias = "adav80x",
+		.modalias = "adav801",
 		.max_speed_hz = 3125000,     /* max spi clock (SCK) speed in HZ */
 		.bus_num = 0,
 		.chip_select = 1,
@@ -2101,7 +2101,7 @@
 	},
 #endif
 
-#if defined(CONFIG_SND_BF5XX_SOC_ADAV80X) || defined(CONFIG_SND_BF5XX_SOC_ADAV80X_MODULE)
+#if defined(CONFIG_SND_SOC_ADAV80X) || defined(CONFIG_SND_SOC_ADAV80X_MODULE)
 	{
 		I2C_BOARD_INFO("adav803", 0x10),
 	},
@@ -2134,23 +2134,6 @@
 	},
 #endif
 
-#if defined(CONFIG_AD7414) || defined(CONFIG_AD7414_MODULE)
-	{
-		I2C_BOARD_INFO("ad7414", 0x9),
-		.irq = IRQ_PG5,
-		.irq_flags = IRQF_TRIGGER_LOW,
-	},
-#endif
-
-#if defined(CONFIG_AD7416) || defined(CONFIG_AD7416_MODULE)
-	{
-		I2C_BOARD_INFO("ad7417", 0xb),
-		.irq = IRQ_PG5,
-		.irq_flags = IRQF_TRIGGER_LOW,
-		.platform_data = (void *)GPIO_PF4,
-	},
-#endif
-
 #if defined(CONFIG_ADE7854_I2C) || defined(CONFIG_ADE7854_I2C_MODULE)
 	{
 		I2C_BOARD_INFO("ade7854", 0x38),
@@ -2161,15 +2144,6 @@
 	{
 		I2C_BOARD_INFO("adt75", 0x9),
 		.irq = IRQ_PG5,
-		.irq_flags = IRQF_TRIGGER_LOW,
-	},
-#endif
-
-#if defined(CONFIG_ADT7408) || defined(CONFIG_ADT7408_MODULE)
-	{
-		I2C_BOARD_INFO("adt7408", 0x18),
-		.irq = IRQ_PG5,
-		.irq_flags = IRQF_TRIGGER_LOW,
 	},
 #endif
 
@@ -2178,7 +2152,6 @@
 		I2C_BOARD_INFO("adt7410", 0x48),
 		/* CT critical temperature event. line 0 */
 		.irq = IRQ_PG5,
-		.irq_flags = IRQF_TRIGGER_LOW,
 		.platform_data = (void *)&adt7410_platform_data,
 	},
 #endif
@@ -2187,7 +2160,6 @@
 	{
 		I2C_BOARD_INFO("ad7291", 0x20),
 		.irq = IRQ_PG5,
-		.irq_flags = IRQF_TRIGGER_LOW,
 	},
 #endif
 
@@ -2275,6 +2247,11 @@
 		I2C_BOARD_INFO("adau1361", 0x38),
 	},
 #endif
+#if defined(CONFIG_SND_SOC_ADAU1701) || defined(CONFIG_SND_SOC_ADAU1701_MODULE)
+	{
+		I2C_BOARD_INFO("adau1701", 0x34),
+	},
+#endif
 #if defined(CONFIG_AD525X_DPOT) || defined(CONFIG_AD525X_DPOT_MODULE)
 	{
 		I2C_BOARD_INFO("ad5258", 0x18),
@@ -2388,7 +2365,7 @@
 #define PATA_INT	IRQ_PF5
 static struct pata_platform_info bfin_pata_platform_data = {
 	.ioport_shift = 1,
-	.irq_flags = IRQF_TRIGGER_HIGH | IRQF_DISABLED,
+	.irq_flags = IRQF_TRIGGER_HIGH,
 };
 
 static struct resource bfin_pata_resources[] = {
@@ -2540,13 +2517,21 @@
 };
 #endif
 
-#if defined(CONFIG_SND_BF5XX_SOC_AD73311) || defined(CONFIG_SND_BF5XX_SOC_AD73311_MODULE)
+#if defined(CONFIG_SND_SOC_AD73311) || defined(CONFIG_SND_SOC_AD73311_MODULE)
 static struct platform_device bfin_ad73311_codec_device = {
 	.name = "ad73311",
 	.id = -1,
 };
 #endif
 
+#if defined(CONFIG_SND_SOC_BFIN_EVAL_ADAV80X) || \
+	defined(CONFIG_SND_SOC_BFIN_EVAL_ADAV80X_MODULE)
+static struct platform_device bfin_eval_adav801_device = {
+	.name = "bfin-eval-adav801",
+	.id = -1,
+};
+#endif
+
 #if defined(CONFIG_SND_BF5XX_SOC_I2S) || defined(CONFIG_SND_BF5XX_SOC_I2S_MODULE)
 static struct platform_device bfin_i2s = {
 	.name = "bfin-i2s",
@@ -2661,6 +2646,20 @@
 };
 #endif
 
+#if defined(CONFIG_SND_SOC_BFIN_EVAL_ADAU1373) || \
+	defined(CONFIG_SND_SOC_BFIN_EVAL_ADAU1373_MODULE)
+static struct platform_device bf5xx_adau1373_device = {
+	.name = "bfin-eval-adau1373",
+};
+#endif
+
+#if defined(CONFIG_SND_SOC_BFIN_EVAL_ADAU1701) || \
+	defined(CONFIG_SND_SOC_BFIN_EVAL_ADAU1701_MODULE)
+static struct platform_device bf5xx_adau1701_device = {
+	.name = "bfin-eval-adau1701",
+};
+#endif
+
 static struct platform_device *stamp_devices[] __initdata = {
 
 	&bfin_dpmc,
@@ -2782,7 +2781,7 @@
 	&bfin_ac97_pcm,
 #endif
 
-#if defined(CONFIG_SND_BF5XX_SOC_AD73311) || defined(CONFIG_SND_BF5XX_SOC_AD73311_MODULE)
+#if defined(CONFIG_SND_SOC_AD73311) || defined(CONFIG_SND_SOC_AD73311_MODULE)
 	&bfin_ad73311_codec_device,
 #endif
 
@@ -2821,6 +2820,21 @@
 	defined(CONFIG_IIO_GPIO_TRIGGER_MODULE)
 	&iio_gpio_trigger,
 #endif
+
+#if defined(CONFIG_SND_SOC_BFIN_EVAL_ADAU1373) || \
+	defined(CONFIG_SND_SOC_BFIN_EVAL_ADAU1373_MODULE)
+	&bf5xx_adau1373_device,
+#endif
+
+#if defined(CONFIG_SND_SOC_BFIN_EVAL_ADAU1701) || \
+	defined(CONFIG_SND_SOC_BFIN_EVAL_ADAU1701_MODULE)
+	&bf5xx_adau1701_device,
+#endif
+
+#if defined(CONFIG_SND_SOC_BFIN_EVAL_ADAV80X) || \
+	defined(CONFIG_SND_SOC_BFIN_EVAL_ADAV80X_MODULE)
+	&bfin_eval_adav801_device,
+#endif
 };
 
 static int __init net2272_init(void)
diff --git a/arch/blackfin/mach-bf537/boards/tcm_bf537.c b/arch/blackfin/mach-bf537/boards/tcm_bf537.c
index 9b7287a..2da0316 100644
--- a/arch/blackfin/mach-bf537/boards/tcm_bf537.c
+++ b/arch/blackfin/mach-bf537/boards/tcm_bf537.c
@@ -572,7 +572,7 @@
 
 static struct pata_platform_info bfin_pata_platform_data = {
 	.ioport_shift = 2,
-	.irq_type = IRQF_TRIGGER_HIGH | IRQF_DISABLED,
+	.irq_type = IRQF_TRIGGER_HIGH,
 };
 
 static struct resource bfin_pata_resources[] = {
diff --git a/arch/blackfin/mach-bf561/boards/cm_bf561.c b/arch/blackfin/mach-bf561/boards/cm_bf561.c
index e4f397d..c1b72f2 100644
--- a/arch/blackfin/mach-bf561/boards/cm_bf561.c
+++ b/arch/blackfin/mach-bf561/boards/cm_bf561.c
@@ -348,7 +348,7 @@
 
 static struct pata_platform_info bfin_pata_platform_data = {
 	.ioport_shift = 2,
-	.irq_type = IRQF_TRIGGER_HIGH | IRQF_DISABLED,
+	.irq_type = IRQF_TRIGGER_HIGH,
 };
 
 static struct resource bfin_pata_resources[] = {
diff --git a/arch/blackfin/mach-bf561/smp.c b/arch/blackfin/mach-bf561/smp.c
index 85abd8b..db22401 100644
--- a/arch/blackfin/mach-bf561/smp.c
+++ b/arch/blackfin/mach-bf561/smp.c
@@ -114,7 +114,7 @@
 	int ret;
 	const char *name = (irq == IRQ_SUPPLE_0) ? supple0 : supple1;
 
-	ret = request_irq(irq, handler, IRQF_DISABLED | IRQF_PERCPU, name, handler);
+	ret = request_irq(irq, handler, IRQF_PERCPU, name, handler);
 	if (ret)
 		panic("Cannot request %s for IPI service", name);
 }
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index 107622a..0784a52 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -295,10 +295,15 @@
 
 void smp_send_reschedule(int cpu)
 {
+	cpumask_t callmap;
 	/* simply trigger an ipi */
 	if (cpu_is_offline(cpu))
 		return;
-	platform_send_ipi_cpu(cpu, IRQ_SUPPLE_0);
+
+	cpumask_clear(&callmap);
+	cpumask_set_cpu(cpu, &callmap);
+
+	smp_send_message(callmap, BFIN_IPI_RESCHEDULE, NULL, NULL, 0);
 
 	return;
 }
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index b92b944..6c4e9aa 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -10,6 +10,7 @@
 	select HAVE_GENERIC_HARDIRQS
 	select GENERIC_IRQ_PROBE
 	select GENERIC_IRQ_SHOW
+	select GENERIC_ATOMIC64
 
 config SBUS
 	bool
diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index f093b3a..438db84 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -47,9 +47,6 @@
 config GENERIC_HWEIGHT
 	def_bool y
 
-config GENERIC_TIME
-	def_bool y
-
 config GENERIC_CLOCKEVENTS
 	def_bool y
 
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index dff9330..8d65bd0 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -109,10 +109,14 @@
 	set_clock_comparator(S390_lowcore.clock_comparator);
 }
 
-static int s390_next_event(unsigned long delta,
+static int s390_next_ktime(ktime_t expires,
 			   struct clock_event_device *evt)
 {
-	S390_lowcore.clock_comparator = get_clock() + delta;
+	u64 nsecs;
+
+	nsecs = ktime_to_ns(ktime_sub(expires, ktime_get_monotonic_offset()));
+	do_div(nsecs, 125);
+	S390_lowcore.clock_comparator = TOD_UNIX_EPOCH + (nsecs << 9);
 	set_clock_comparator(S390_lowcore.clock_comparator);
 	return 0;
 }
@@ -137,14 +141,15 @@
 	cpu = smp_processor_id();
 	cd = &per_cpu(comparators, cpu);
 	cd->name		= "comparator";
-	cd->features		= CLOCK_EVT_FEAT_ONESHOT;
+	cd->features		= CLOCK_EVT_FEAT_ONESHOT |
+				  CLOCK_EVT_FEAT_KTIME;
 	cd->mult		= 16777;
 	cd->shift		= 12;
 	cd->min_delta_ns	= 1;
 	cd->max_delta_ns	= LONG_MAX;
 	cd->rating		= 400;
 	cd->cpumask		= cpumask_of(cpu);
-	cd->set_next_event	= s390_next_event;
+	cd->set_next_ktime	= s390_next_ktime;
 	cd->set_mode		= s390_set_mode;
 
 	clockevents_register_device(cd);
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index b30f71a..70a0de4 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -46,9 +46,6 @@
 config SYS_SUPPORTS_HUGETLBFS
 	def_bool y
 
-config GENERIC_TIME
-	def_bool y
-
 config GENERIC_CLOCKEVENTS
 	def_bool y
 
diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig
index 2ad73fb..dafdbba 100644
--- a/arch/tile/configs/tilegx_defconfig
+++ b/arch/tile/configs/tilegx_defconfig
@@ -11,7 +11,6 @@
 CONFIG_HAVE_SETUP_PER_CPU_AREA=y
 CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y
 CONFIG_SYS_SUPPORTS_HUGETLBFS=y
-CONFIG_GENERIC_TIME=y
 CONFIG_GENERIC_CLOCKEVENTS=y
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_DEFAULT_MIGRATION_COST=10000000
diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig
index f58dc36..6f05f96 100644
--- a/arch/tile/configs/tilepro_defconfig
+++ b/arch/tile/configs/tilepro_defconfig
@@ -11,7 +11,6 @@
 CONFIG_HAVE_SETUP_PER_CPU_AREA=y
 CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y
 CONFIG_SYS_SUPPORTS_HUGETLBFS=y
-CONFIG_GENERIC_TIME=y
 CONFIG_GENERIC_CLOCKEVENTS=y
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_DEFAULT_MIGRATION_COST=10000000
diff --git a/arch/um/defconfig b/arch/um/defconfig
index 9f7634f..761f5e1 100644
--- a/arch/um/defconfig
+++ b/arch/um/defconfig
@@ -13,7 +13,6 @@
 # CONFIG_STACKTRACE_SUPPORT is not set
 CONFIG_GENERIC_CALIBRATE_DELAY=y
 CONFIG_GENERIC_BUG=y
-CONFIG_GENERIC_TIME=y
 CONFIG_GENERIC_CLOCKEVENTS=y
 CONFIG_IRQ_RELEASE_METHOD=y
 CONFIG_HZ=100
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9037289..f497677 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -64,10 +64,12 @@
 	select HAVE_TEXT_POKE_SMP
 	select HAVE_GENERIC_HARDIRQS
 	select HAVE_SPARSE_IRQ
+	select SPARSE_IRQ
 	select GENERIC_FIND_FIRST_BIT
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
 	select GENERIC_IRQ_SHOW
+	select GENERIC_CLOCKEVENTS_MIN_ADJUST
 	select IRQ_FORCED_THREADING
 	select USE_GENERIC_SMP_HELPERS if SMP
 	select HAVE_BPF_JIT if (X86_64 && NET)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 7b3ca83..9b7273c 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -495,7 +495,7 @@
 	return;
 }
 
-extern struct apic *generic_bigsmp_probe(void);
+extern void generic_bigsmp_probe(void);
 
 
 #ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 3260991..f6f1598 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -27,6 +27,7 @@
 #define CFI_REMEMBER_STATE	.cfi_remember_state
 #define CFI_RESTORE_STATE	.cfi_restore_state
 #define CFI_UNDEFINED		.cfi_undefined
+#define CFI_ESCAPE		.cfi_escape
 
 #ifdef CONFIG_AS_CFI_SIGNAL_FRAME
 #define CFI_SIGNAL_FRAME	.cfi_signal_frame
@@ -68,6 +69,7 @@
 #define CFI_REMEMBER_STATE	cfi_ignore
 #define CFI_RESTORE_STATE	cfi_ignore
 #define CFI_UNDEFINED		cfi_ignore
+#define CFI_ESCAPE		cfi_ignore
 #define CFI_SIGNAL_FRAME	cfi_ignore
 
 #endif
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 7e50f06..4b44487 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -160,19 +160,11 @@
 #define IO_APIC_VECTOR_LIMIT		( 32 * MAX_IO_APICS )
 
 #ifdef CONFIG_X86_IO_APIC
-# ifdef CONFIG_SPARSE_IRQ
-#  define CPU_VECTOR_LIMIT		(64 * NR_CPUS)
-#  define NR_IRQS					\
+# define CPU_VECTOR_LIMIT		(64 * NR_CPUS)
+# define NR_IRQS					\
 	(CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ?	\
 		(NR_VECTORS + CPU_VECTOR_LIMIT)  :	\
 		(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
-# else
-#  define CPU_VECTOR_LIMIT		(32 * NR_CPUS)
-#  define NR_IRQS					\
-	(CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT ?	\
-		(NR_VECTORS + CPU_VECTOR_LIMIT)  :	\
-		(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
-# endif
 #else /* !CONFIG_X86_IO_APIC: */
 # define NR_IRQS			NR_IRQS_LEGACY
 #endif
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 4886a68..fd3f9f1 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -22,27 +22,26 @@
 #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
 #endif
 
-/*
- * Define some priorities for the nmi notifier call chain.
- *
- * Create a local nmi bit that has a higher priority than
- * external nmis, because the local ones are more frequent.
- *
- * Also setup some default high/normal/low settings for
- * subsystems to registers with.  Using 4 bits to separate
- * the priorities.  This can go a lot higher if needed be.
- */
+#define NMI_FLAG_FIRST	1
 
-#define NMI_LOCAL_SHIFT		16	/* randomly picked */
-#define NMI_LOCAL_BIT		(1ULL << NMI_LOCAL_SHIFT)
-#define NMI_HIGH_PRIOR		(1ULL << 8)
-#define NMI_NORMAL_PRIOR	(1ULL << 4)
-#define NMI_LOW_PRIOR		(1ULL << 0)
-#define NMI_LOCAL_HIGH_PRIOR	(NMI_LOCAL_BIT | NMI_HIGH_PRIOR)
-#define NMI_LOCAL_NORMAL_PRIOR	(NMI_LOCAL_BIT | NMI_NORMAL_PRIOR)
-#define NMI_LOCAL_LOW_PRIOR	(NMI_LOCAL_BIT | NMI_LOW_PRIOR)
+enum {
+	NMI_LOCAL=0,
+	NMI_UNKNOWN,
+	NMI_MAX
+};
+
+#define NMI_DONE	0
+#define NMI_HANDLED	1
+
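+/*
+ * A handler returns NMI_HANDLED if it claimed the NMI and NMI_DONE otherwise.
+ * Passing NMI_FLAG_FIRST to register_nmi_handler() asks for the handler to be
+ * called before any other handler registered for the same NMI type.
+ */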
+typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *);
+
+int register_nmi_handler(unsigned int, nmi_handler_t, unsigned long,
+			 const char *);
+
+void unregister_nmi_handler(unsigned int, const char *);
 
 void stop_nmi(void);
 void restart_nmi(void);
+void local_touch_nmi(void);
 
 #endif /* _ASM_X86_NMI_H */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 094fb30..f61c62f 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -29,6 +29,9 @@
 #define ARCH_PERFMON_EVENTSEL_INV			(1ULL << 23)
 #define ARCH_PERFMON_EVENTSEL_CMASK			0xFF000000ULL
 
+#define AMD_PERFMON_EVENTSEL_GUESTONLY			(1ULL << 40)
+#define AMD_PERFMON_EVENTSEL_HOSTONLY			(1ULL << 41)
+
 #define AMD64_EVENTSEL_EVENT	\
 	(ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
 #define INTEL_ARCH_EVENT_MASK	\
@@ -43,14 +46,17 @@
 #define AMD64_RAW_EVENT_MASK		\
 	(X86_RAW_EVENT_MASK          |  \
 	 AMD64_EVENTSEL_EVENT)
+#define AMD64_NUM_COUNTERS				4
+#define AMD64_NUM_COUNTERS_F15H				6
+#define AMD64_NUM_COUNTERS_MAX				AMD64_NUM_COUNTERS_F15H
 
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX			 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX		0
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
 		(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
 
-#define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED		6
 
 /*
  * Intel "Architectural Performance Monitoring" CPUID
@@ -110,6 +116,35 @@
  */
 #define X86_PMC_IDX_FIXED_BTS				(X86_PMC_IDX_FIXED + 16)
 
+/*
+ * IBS cpuid feature detection
+ */
+
+#define IBS_CPUID_FEATURES		0x8000001b
+
+/*
+ * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
+ * bit 0 is used to indicate the existence of IBS.
+ */
+#define IBS_CAPS_AVAIL			(1U<<0)
+#define IBS_CAPS_FETCHSAM		(1U<<1)
+#define IBS_CAPS_OPSAM			(1U<<2)
+#define IBS_CAPS_RDWROPCNT		(1U<<3)
+#define IBS_CAPS_OPCNT			(1U<<4)
+#define IBS_CAPS_BRNTRGT		(1U<<5)
+#define IBS_CAPS_OPCNTEXT		(1U<<6)
+
+#define IBS_CAPS_DEFAULT		(IBS_CAPS_AVAIL		\
+					 | IBS_CAPS_FETCHSAM	\
+					 | IBS_CAPS_OPSAM)
+
+/*
+ * IBS APIC setup
+ */
+#define IBSCTL				0x1cc
+#define IBSCTL_LVT_OFFSET_VALID		(1ULL<<8)
+#define IBSCTL_LVT_OFFSET_MASK		0x0F
+
 /* IbsFetchCtl bits/masks */
 #define IBS_FETCH_RAND_EN	(1ULL<<57)
 #define IBS_FETCH_VAL		(1ULL<<49)
@@ -124,6 +159,8 @@
 #define IBS_OP_MAX_CNT		0x0000FFFFULL
 #define IBS_OP_MAX_CNT_EXT	0x007FFFFFULL	/* not a register bit mask */
 
+extern u32 get_ibs_caps(void);
+
 #ifdef CONFIG_PERF_EVENTS
 extern void perf_events_lapic_init(void);
 
@@ -159,7 +196,19 @@
 	);							\
 }
 
+struct perf_guest_switch_msr {
+	unsigned msr;
+	u64 host, guest;
+};
+
+extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
 #else
+static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+{
+	*nr = 0;
+	return NULL;
+}
+
 static inline void perf_events_lapic_init(void)	{ }
 #endif
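
A note on perf_guest_get_msrs() declared above: it hands back an array of {msr, host, guest} triples, *nr entries long (zero entries and a NULL pointer from the !CONFIG_PERF_EVENTS stub). A hedged sketch of the consumer pattern it is designed for, switching PMU MSRs around guest entry; the caller shown here is illustrative and not part of this merge:

void switch_pmu_msrs_to_guest(void)		/* hypothetical caller */
{
	struct perf_guest_switch_msr *msrs;
	int i, nr;

	msrs = perf_guest_get_msrs(&nr);
	for (i = 0; i < nr; i++) {
		if (msrs[i].host != msrs[i].guest)
			wrmsrl(msrs[i].msr, msrs[i].guest);
		/* the .host value is written back on the way out of the guest */
	}
}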
 
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 3250e3d..92f29706 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -23,7 +23,7 @@
 #define MRR_BIOS	0
 #define MRR_APM		1
 
-typedef void (*nmi_shootdown_cb)(int, struct die_args*);
+typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
 void nmi_shootdown_cpus(nmi_shootdown_cb callback);
 
 #endif /* _ASM_X86_REBOOT_H */
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 2010405..0a6ba33 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -624,7 +624,6 @@
 __SYSCALL(__NR_move_pages, sys_move_pages)
 #define __NR_utimensat				280
 __SYSCALL(__NR_utimensat, sys_utimensat)
-#define __IGNORE_getcpu		/* implemented as a vsyscall */
 #define __NR_epoll_pwait			281
 __SYSCALL(__NR_epoll_pwait, sys_epoll_pwait)
 #define __NR_signalfd				282
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 82f2912..8baca3c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -19,7 +19,7 @@
 
 obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y			+= time.o ioport.o ldt.o dumpstack.o
+obj-y			+= time.o ioport.o ldt.o dumpstack.o nmi.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
 obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y			+= probe_roms.o
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index efd737e..521bead 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -255,12 +255,24 @@
 	.x86_32_early_logical_apicid	= bigsmp_early_logical_apicid,
 };
 
-struct apic * __init generic_bigsmp_probe(void)
+void __init generic_bigsmp_probe(void)
 {
-	if (probe_bigsmp())
-		return &apic_bigsmp;
+	unsigned int cpu;
 
-	return NULL;
+	if (!probe_bigsmp())
+		return;
+
+	apic = &apic_bigsmp;
+
+	for_each_possible_cpu(cpu) {
+		if (early_per_cpu(x86_cpu_to_logical_apicid,
+				  cpu) == BAD_APICID)
+			continue;
+		early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+			bigsmp_early_logical_apicid(cpu);
+	}
+
+	pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name);
 }
 
 apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index d5e57db0..31cb9ae 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -60,22 +60,10 @@
 }
 
 static int __kprobes
-arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
-			 unsigned long cmd, void *__args)
+arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
 {
-	struct die_args *args = __args;
-	struct pt_regs *regs;
 	int cpu;
 
-	switch (cmd) {
-	case DIE_NMI:
-		break;
-
-	default:
-		return NOTIFY_DONE;
-	}
-
-	regs = args->regs;
 	cpu = smp_processor_id();
 
 	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
@@ -86,21 +74,16 @@
 		show_regs(regs);
 		arch_spin_unlock(&lock);
 		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
-		return NOTIFY_STOP;
+		return NMI_HANDLED;
 	}
 
-	return NOTIFY_DONE;
+	return NMI_DONE;
 }
 
-static __read_mostly struct notifier_block backtrace_notifier = {
-	.notifier_call          = arch_trigger_all_cpu_backtrace_handler,
-	.next                   = NULL,
-	.priority               = NMI_LOCAL_LOW_PRIOR,
-};
-
 static int __init register_trigger_all_cpu_backtrace(void)
 {
-	register_die_notifier(&backtrace_notifier);
+	register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler,
+				0, "arch_bt");
 	return 0;
 }
 early_initcall(register_trigger_all_cpu_backtrace);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 229e19f..3c31fa9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -92,21 +92,21 @@
 	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
 } ioapics[MAX_IO_APICS];
 
-#define mpc_ioapic_ver(id)		ioapics[id].mp_config.apicver
+#define mpc_ioapic_ver(ioapic_idx)	ioapics[ioapic_idx].mp_config.apicver
 
-int mpc_ioapic_id(int id)
+int mpc_ioapic_id(int ioapic_idx)
 {
-	return ioapics[id].mp_config.apicid;
+	return ioapics[ioapic_idx].mp_config.apicid;
 }
 
-unsigned int mpc_ioapic_addr(int id)
+unsigned int mpc_ioapic_addr(int ioapic_idx)
 {
-	return ioapics[id].mp_config.apicaddr;
+	return ioapics[ioapic_idx].mp_config.apicaddr;
 }
 
-struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id)
+struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
 {
-	return &ioapics[id].gsi_config;
+	return &ioapics[ioapic_idx].gsi_config;
 }
 
 int nr_ioapics;
@@ -186,11 +186,7 @@
 
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-#ifdef CONFIG_SPARSE_IRQ
 static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
-#else
-static struct irq_cfg irq_cfgx[NR_IRQS];
-#endif
 
 int __init arch_early_irq_init(void)
 {
@@ -234,7 +230,6 @@
 	return 0;
 }
 
-#ifdef CONFIG_SPARSE_IRQ
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
 	return irq_get_chip_data(irq);
@@ -269,22 +264,6 @@
 	kfree(cfg);
 }
 
-#else
-
-struct irq_cfg *irq_cfg(unsigned int irq)
-{
-	return irq < nr_irqs ? irq_cfgx + irq : NULL;
-}
-
-static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
-{
-	return irq_cfgx + irq;
-}
-
-static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { }
-
-#endif
-
 static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
 {
 	int res = irq_alloc_desc_at(at, node);
@@ -802,13 +781,13 @@
 /*
  * Find the IRQ entry number of a certain pin.
  */
-static int find_irq_entry(int apic, int pin, int type)
+static int find_irq_entry(int ioapic_idx, int pin, int type)
 {
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++)
 		if (mp_irqs[i].irqtype == type &&
-		    (mp_irqs[i].dstapic == mpc_ioapic_id(apic) ||
+		    (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) ||
 		     mp_irqs[i].dstapic == MP_APIC_ALL) &&
 		    mp_irqs[i].dstirq == pin)
 			return i;
@@ -847,12 +826,13 @@
 		    (mp_irqs[i].srcbusirq == irq))
 			break;
 	}
+
 	if (i < mp_irq_entries) {
-		int apic;
-		for(apic = 0; apic < nr_ioapics; apic++) {
-			if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic)
-				return apic;
-		}
+		int ioapic_idx;
+
+		for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+			if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
+				return ioapic_idx;
 	}
 
 	return -1;
@@ -1067,7 +1047,7 @@
 int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
 				struct io_apic_irq_attr *irq_attr)
 {
-	int apic, i, best_guess = -1;
+	int ioapic_idx, i, best_guess = -1;
 
 	apic_printk(APIC_DEBUG,
 		    "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
@@ -1080,8 +1060,8 @@
 	for (i = 0; i < mp_irq_entries; i++) {
 		int lbus = mp_irqs[i].srcbus;
 
-		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic ||
+		for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+			if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic ||
 			    mp_irqs[i].dstapic == MP_APIC_ALL)
 				break;
 
@@ -1089,13 +1069,13 @@
 		    !mp_irqs[i].irqtype &&
 		    (bus == lbus) &&
 		    (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
-			int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
+			int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq);
 
-			if (!(apic || IO_APIC_IRQ(irq)))
+			if (!(ioapic_idx || IO_APIC_IRQ(irq)))
 				continue;
 
 			if (pin == (mp_irqs[i].srcbusirq & 3)) {
-				set_io_apic_irq_attr(irq_attr, apic,
+				set_io_apic_irq_attr(irq_attr, ioapic_idx,
 						     mp_irqs[i].dstirq,
 						     irq_trigger(i),
 						     irq_polarity(i));
@@ -1106,7 +1086,7 @@
 			 * best-guess fuzzy result for broken mptables.
 			 */
 			if (best_guess < 0) {
-				set_io_apic_irq_attr(irq_attr, apic,
+				set_io_apic_irq_attr(irq_attr, ioapic_idx,
 						     mp_irqs[i].dstirq,
 						     irq_trigger(i),
 						     irq_polarity(i));
@@ -1344,77 +1324,100 @@
 				      fasteoi ? "fasteoi" : "edge");
 }
 
-static int setup_ioapic_entry(int apic_id, int irq,
-			      struct IO_APIC_route_entry *entry,
-			      unsigned int destination, int trigger,
-			      int polarity, int vector, int pin)
+
+static int setup_ir_ioapic_entry(int irq,
+			      struct IR_IO_APIC_route_entry *entry,
+			      unsigned int destination, int vector,
+			      struct io_apic_irq_attr *attr)
 {
-	/*
-	 * add it to the IO-APIC irq-routing table:
-	 */
-	memset(entry,0,sizeof(*entry));
+	int index;
+	struct irte irte;
+	int ioapic_id = mpc_ioapic_id(attr->ioapic);
+	struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id);
 
-	if (intr_remapping_enabled) {
-		struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
-		struct irte irte;
-		struct IR_IO_APIC_route_entry *ir_entry =
-			(struct IR_IO_APIC_route_entry *) entry;
-		int index;
-
-		if (!iommu)
-			panic("No mapping iommu for ioapic %d\n", apic_id);
-
-		index = alloc_irte(iommu, irq, 1);
-		if (index < 0)
-			panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
-
-		prepare_irte(&irte, vector, destination);
-
-		/* Set source-id of interrupt request */
-		set_ioapic_sid(&irte, apic_id);
-
-		modify_irte(irq, &irte);
-
-		ir_entry->index2 = (index >> 15) & 0x1;
-		ir_entry->zero = 0;
-		ir_entry->format = 1;
-		ir_entry->index = (index & 0x7fff);
-		/*
-		 * IO-APIC RTE will be configured with virtual vector.
-		 * irq handler will do the explicit EOI to the io-apic.
-		 */
-		ir_entry->vector = pin;
-
-		apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
-			"Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
-			"Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
-			"Avail:%X Vector:%02X Dest:%08X "
-			"SID:%04X SQ:%X SVT:%X)\n",
-			apic_id, irte.present, irte.fpd, irte.dst_mode,
-			irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
-			irte.avail, irte.vector, irte.dest_id,
-			irte.sid, irte.sq, irte.svt);
-	} else {
-		entry->delivery_mode = apic->irq_delivery_mode;
-		entry->dest_mode = apic->irq_dest_mode;
-		entry->dest = destination;
-		entry->vector = vector;
+	if (!iommu) {
+		pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
+		return -ENODEV;
 	}
 
-	entry->mask = 0;				/* enable IRQ */
-	entry->trigger = trigger;
-	entry->polarity = polarity;
+	index = alloc_irte(iommu, irq, 1);
+	if (index < 0) {
+		pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id);
+		return -ENOMEM;
+	}
+
+	prepare_irte(&irte, vector, destination);
+
+	/* Set source-id of interrupt request */
+	set_ioapic_sid(&irte, ioapic_id);
+
+	modify_irte(irq, &irte);
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
+		"Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
+		"Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
+		"Avail:%X Vector:%02X Dest:%08X "
+		"SID:%04X SQ:%X SVT:%X)\n",
+		attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
+		irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
+		irte.avail, irte.vector, irte.dest_id,
+		irte.sid, irte.sq, irte.svt);
+
+	memset(entry, 0, sizeof(*entry));
+
+	entry->index2	= (index >> 15) & 0x1;
+	entry->zero	= 0;
+	entry->format	= 1;
+	entry->index	= (index & 0x7fff);
+	/*
+	 * IO-APIC RTE will be configured with virtual vector.
+	 * irq handler will do the explicit EOI to the io-apic.
+	 */
+	entry->vector	= attr->ioapic_pin;
+	entry->mask	= 0;			/* enable IRQ */
+	entry->trigger	= attr->trigger;
+	entry->polarity	= attr->polarity;
 
 	/* Mask level triggered irqs.
 	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
 	 */
-	if (trigger)
+	if (attr->trigger)
 		entry->mask = 1;
+
 	return 0;
 }
 
-static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
-			     struct irq_cfg *cfg, int trigger, int polarity)
+static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
+			       unsigned int destination, int vector,
+			       struct io_apic_irq_attr *attr)
+{
+	if (intr_remapping_enabled)
+		return setup_ir_ioapic_entry(irq,
+			 (struct IR_IO_APIC_route_entry *)entry,
+			 destination, vector, attr);
+
+	memset(entry, 0, sizeof(*entry));
+
+	entry->delivery_mode = apic->irq_delivery_mode;
+	entry->dest_mode     = apic->irq_dest_mode;
+	entry->dest	     = destination;
+	entry->vector	     = vector;
+	entry->mask	     = 0;			/* enable IRQ */
+	entry->trigger	     = attr->trigger;
+	entry->polarity	     = attr->polarity;
+
+	/*
+	 * Mask level triggered irqs.
+	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+	 */
+	if (attr->trigger)
+		entry->mask = 1;
+
+	return 0;
+}
+
+static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
+				struct io_apic_irq_attr *attr)
 {
 	struct IO_APIC_route_entry entry;
 	unsigned int dest;
@@ -1437,49 +1440,48 @@
 	apic_printk(APIC_VERBOSE,KERN_DEBUG
 		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
 		    "IRQ %d Mode:%i Active:%i Dest:%d)\n",
-		    apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
-		    irq, trigger, polarity, dest);
+		    attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin,
+		    cfg->vector, irq, attr->trigger, attr->polarity, dest);
 
-
-	if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
-			       dest, trigger, polarity, cfg->vector, pin)) {
-		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
-		       mpc_ioapic_id(apic_id), pin);
+	if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) {
+		pr_warn("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
+			mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
 		__clear_irq_vector(irq, cfg);
+
 		return;
 	}
 
-	ioapic_register_intr(irq, cfg, trigger);
+	ioapic_register_intr(irq, cfg, attr->trigger);
 	if (irq < legacy_pic->nr_legacy_irqs)
 		legacy_pic->mask(irq);
 
-	ioapic_write_entry(apic_id, pin, entry);
+	ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
 }
 
-static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
+static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin)
 {
 	if (idx != -1)
 		return false;
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
-		    mpc_ioapic_id(apic_id), pin);
+		    mpc_ioapic_id(ioapic_idx), pin);
 	return true;
 }
 
-static void __init __io_apic_setup_irqs(unsigned int apic_id)
+static void __init __io_apic_setup_irqs(unsigned int ioapic_idx)
 {
 	int idx, node = cpu_to_node(0);
 	struct io_apic_irq_attr attr;
 	unsigned int pin, irq;
 
-	for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) {
-		idx = find_irq_entry(apic_id, pin, mp_INT);
-		if (io_apic_pin_not_connected(idx, apic_id, pin))
+	for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) {
+		idx = find_irq_entry(ioapic_idx, pin, mp_INT);
+		if (io_apic_pin_not_connected(idx, ioapic_idx, pin))
 			continue;
 
-		irq = pin_2_irq(idx, apic_id, pin);
+		irq = pin_2_irq(idx, ioapic_idx, pin);
 
-		if ((apic_id > 0) && (irq > 16))
+		if ((ioapic_idx > 0) && (irq > 16))
 			continue;
 
 		/*
@@ -1487,10 +1489,10 @@
 		 * installed and if it returns 1:
 		 */
 		if (apic->multi_timer_check &&
-		    apic->multi_timer_check(apic_id, irq))
+		    apic->multi_timer_check(ioapic_idx, irq))
 			continue;
 
-		set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
+		set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
 				     irq_polarity(idx));
 
 		io_apic_setup_irq_pin(irq, node, &attr);
@@ -1499,12 +1501,12 @@
 
 static void __init setup_IO_APIC_irqs(void)
 {
-	unsigned int apic_id;
+	unsigned int ioapic_idx;
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
-	for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
-		__io_apic_setup_irqs(apic_id);
+	for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+		__io_apic_setup_irqs(ioapic_idx);
 }
 
 /*
@@ -1514,28 +1516,28 @@
  */
 void setup_IO_APIC_irq_extra(u32 gsi)
 {
-	int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
+	int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0);
 	struct io_apic_irq_attr attr;
 
 	/*
 	 * Convert 'gsi' to 'ioapic.pin'.
 	 */
-	apic_id = mp_find_ioapic(gsi);
-	if (apic_id < 0)
+	ioapic_idx = mp_find_ioapic(gsi);
+	if (ioapic_idx < 0)
 		return;
 
-	pin = mp_find_ioapic_pin(apic_id, gsi);
-	idx = find_irq_entry(apic_id, pin, mp_INT);
+	pin = mp_find_ioapic_pin(ioapic_idx, gsi);
+	idx = find_irq_entry(ioapic_idx, pin, mp_INT);
 	if (idx == -1)
 		return;
 
-	irq = pin_2_irq(idx, apic_id, pin);
+	irq = pin_2_irq(idx, ioapic_idx, pin);
 
 	/* Only handle the non legacy irqs on secondary ioapics */
-	if (apic_id == 0 || irq < NR_IRQS_LEGACY)
+	if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY)
 		return;
 
-	set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
+	set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
 			     irq_polarity(idx));
 
 	io_apic_setup_irq_pin_once(irq, node, &attr);
@@ -1544,8 +1546,8 @@
 /*
  * Set up the timer pin, possibly with the 8259A-master behind.
  */
-static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
-					int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
+					 unsigned int pin, int vector)
 {
 	struct IO_APIC_route_entry entry;
 
@@ -1576,45 +1578,29 @@
 	/*
 	 * Add it to the IO-APIC irq-routing table:
 	 */
-	ioapic_write_entry(apic_id, pin, entry);
+	ioapic_write_entry(ioapic_idx, pin, entry);
 }
 
-
-__apicdebuginit(void) print_IO_APIC(void)
+__apicdebuginit(void) print_IO_APIC(int ioapic_idx)
 {
-	int apic, i;
+	int i;
 	union IO_APIC_reg_00 reg_00;
 	union IO_APIC_reg_01 reg_01;
 	union IO_APIC_reg_02 reg_02;
 	union IO_APIC_reg_03 reg_03;
 	unsigned long flags;
-	struct irq_cfg *cfg;
-	unsigned int irq;
-
-	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
-	for (i = 0; i < nr_ioapics; i++)
-		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-		       mpc_ioapic_id(i), ioapics[i].nr_registers);
-
-	/*
-	 * We are a bit conservative about what we expect.  We have to
-	 * know about every hardware change ASAP.
-	 */
-	printk(KERN_INFO "testing the IO APIC.......................\n");
-
-	for (apic = 0; apic < nr_ioapics; apic++) {
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	reg_00.raw = io_apic_read(apic, 0);
-	reg_01.raw = io_apic_read(apic, 1);
+	reg_00.raw = io_apic_read(ioapic_idx, 0);
+	reg_01.raw = io_apic_read(ioapic_idx, 1);
 	if (reg_01.bits.version >= 0x10)
-		reg_02.raw = io_apic_read(apic, 2);
+		reg_02.raw = io_apic_read(ioapic_idx, 2);
 	if (reg_01.bits.version >= 0x20)
-		reg_03.raw = io_apic_read(apic, 3);
+		reg_03.raw = io_apic_read(ioapic_idx, 3);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	printk("\n");
-	printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic));
+	printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
 	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
 	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
 	printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1664,7 +1650,7 @@
 			struct IO_APIC_route_entry entry;
 			struct IR_IO_APIC_route_entry *ir_entry;
 
-			entry = ioapic_read_entry(apic, i);
+			entry = ioapic_read_entry(ioapic_idx, i);
 			ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
 			printk(KERN_DEBUG " %02x %04X ",
 				i,
@@ -1685,7 +1671,7 @@
 		} else {
 			struct IO_APIC_route_entry entry;
 
-			entry = ioapic_read_entry(apic, i);
+			entry = ioapic_read_entry(ioapic_idx, i);
 			printk(KERN_DEBUG " %02x %02X  ",
 				i,
 				entry.dest
@@ -1703,7 +1689,28 @@
 			);
 		}
 	}
-	}
+}
+
+__apicdebuginit(void) print_IO_APICs(void)
+{
+	int ioapic_idx;
+	struct irq_cfg *cfg;
+	unsigned int irq;
+
+	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+	for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+		       mpc_ioapic_id(ioapic_idx),
+		       ioapics[ioapic_idx].nr_registers);
+
+	/*
+	 * We are a bit conservative about what we expect.  We have to
+	 * know about every hardware change ASAP.
+	 */
+	printk(KERN_INFO "testing the IO APIC.......................\n");
+
+	for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+		print_IO_APIC(ioapic_idx);
 
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
 	for_each_active_irq(irq) {
@@ -1722,8 +1729,6 @@
 	}
 
 	printk(KERN_INFO ".................................... done.\n");
-
-	return;
 }
 
 __apicdebuginit(void) print_APIC_field(int base)
@@ -1917,7 +1922,7 @@
 		return 0;
 
 	print_local_APICs(show_lapic);
-	print_IO_APIC();
+	print_IO_APICs();
 
 	return 0;
 }
@@ -2042,7 +2047,7 @@
 {
 	union IO_APIC_reg_00 reg_00;
 	physid_mask_t phys_id_present_map;
-	int apic_id;
+	int ioapic_idx;
 	int i;
 	unsigned char old_id;
 	unsigned long flags;
@@ -2056,21 +2061,20 @@
 	/*
 	 * Set the IOAPIC ID to the value stored in the MPC table.
 	 */
-	for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
-
+	for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
 		/* Read the register 0 value */
 		raw_spin_lock_irqsave(&ioapic_lock, flags);
-		reg_00.raw = io_apic_read(apic_id, 0);
+		reg_00.raw = io_apic_read(ioapic_idx, 0);
 		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
-		old_id = mpc_ioapic_id(apic_id);
+		old_id = mpc_ioapic_id(ioapic_idx);
 
-		if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) {
+		if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
-				apic_id, mpc_ioapic_id(apic_id));
+				ioapic_idx, mpc_ioapic_id(ioapic_idx));
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
 				reg_00.bits.ID);
-			ioapics[apic_id].mp_config.apicid = reg_00.bits.ID;
+			ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID;
 		}
 
 		/*
@@ -2079,9 +2083,9 @@
 		 * 'stuck on smp_invalidate_needed IPI wait' messages.
 		 */
 		if (apic->check_apicid_used(&phys_id_present_map,
-					    mpc_ioapic_id(apic_id))) {
+					    mpc_ioapic_id(ioapic_idx))) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
-				apic_id, mpc_ioapic_id(apic_id));
+				ioapic_idx, mpc_ioapic_id(ioapic_idx));
 			for (i = 0; i < get_physical_broadcast(); i++)
 				if (!physid_isset(i, phys_id_present_map))
 					break;
@@ -2090,14 +2094,14 @@
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
 				i);
 			physid_set(i, phys_id_present_map);
-			ioapics[apic_id].mp_config.apicid = i;
+			ioapics[ioapic_idx].mp_config.apicid = i;
 		} else {
 			physid_mask_t tmp;
-			apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id),
+			apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx),
 						    &tmp);
 			apic_printk(APIC_VERBOSE, "Setting %d in the "
 					"phys_id_present_map\n",
-					mpc_ioapic_id(apic_id));
+					mpc_ioapic_id(ioapic_idx));
 			physids_or(phys_id_present_map, phys_id_present_map, tmp);
 		}
 
@@ -2105,35 +2109,35 @@
 		 * We need to adjust the IRQ routing table
 		 * if the ID changed.
 		 */
-		if (old_id != mpc_ioapic_id(apic_id))
+		if (old_id != mpc_ioapic_id(ioapic_idx))
 			for (i = 0; i < mp_irq_entries; i++)
 				if (mp_irqs[i].dstapic == old_id)
 					mp_irqs[i].dstapic
-						= mpc_ioapic_id(apic_id);
+						= mpc_ioapic_id(ioapic_idx);
 
 		/*
 		 * Update the ID register according to the right value
 		 * from the MPC table if they are different.
 		 */
-		if (mpc_ioapic_id(apic_id) == reg_00.bits.ID)
+		if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID)
 			continue;
 
 		apic_printk(APIC_VERBOSE, KERN_INFO
 			"...changing IO-APIC physical APIC ID to %d ...",
-			mpc_ioapic_id(apic_id));
+			mpc_ioapic_id(ioapic_idx));
 
-		reg_00.bits.ID = mpc_ioapic_id(apic_id);
+		reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
 		raw_spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(apic_id, 0, reg_00.raw);
+		io_apic_write(ioapic_idx, 0, reg_00.raw);
 		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
 		/*
 		 * Sanity check
 		 */
 		raw_spin_lock_irqsave(&ioapic_lock, flags);
-		reg_00.raw = io_apic_read(apic_id, 0);
+		reg_00.raw = io_apic_read(ioapic_idx, 0);
 		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-		if (reg_00.bits.ID != mpc_ioapic_id(apic_id))
+		if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
 			printk("could not set ID!\n");
 		else
 			apic_printk(APIC_VERBOSE, " ok.\n");
@@ -3001,27 +3005,26 @@
 
 late_initcall(io_apic_bug_finalize);
 
-static void resume_ioapic_id(int ioapic_id)
+static void resume_ioapic_id(int ioapic_idx)
 {
 	unsigned long flags;
 	union IO_APIC_reg_00 reg_00;
 
-
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	reg_00.raw = io_apic_read(ioapic_id, 0);
-	if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) {
-		reg_00.bits.ID = mpc_ioapic_id(ioapic_id);
-		io_apic_write(ioapic_id, 0, reg_00.raw);
+	reg_00.raw = io_apic_read(ioapic_idx, 0);
+	if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) {
+		reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+		io_apic_write(ioapic_idx, 0, reg_00.raw);
 	}
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
 static void ioapic_resume(void)
 {
-	int ioapic_id;
+	int ioapic_idx;
 
-	for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--)
-		resume_ioapic_id(ioapic_id);
+	for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--)
+		resume_ioapic_id(ioapic_idx);
 
 	restore_ioapic_entries();
 }
@@ -3558,26 +3561,25 @@
 		return -EINVAL;
 	ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
 	if (!ret)
-		setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg,
-				 attr->trigger, attr->polarity);
+		setup_ioapic_irq(irq, cfg, attr);
 	return ret;
 }
 
 int io_apic_setup_irq_pin_once(unsigned int irq, int node,
 			       struct io_apic_irq_attr *attr)
 {
-	unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
+	unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin;
 	int ret;
 
 	/* Avoid redundant programming */
-	if (test_bit(pin, ioapics[id].pin_programmed)) {
+	if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) {
 		pr_debug("Pin %d-%d already programmed\n",
-			 mpc_ioapic_id(id), pin);
+			 mpc_ioapic_id(ioapic_idx), pin);
 		return 0;
 	}
 	ret = io_apic_setup_irq_pin(irq, node, attr);
 	if (!ret)
-		set_bit(pin, ioapics[id].pin_programmed);
+		set_bit(pin, ioapics[ioapic_idx].pin_programmed);
 	return ret;
 }
 
@@ -3613,7 +3615,6 @@
 	return nr_irqs_gsi;
 }
 
-#ifdef CONFIG_SPARSE_IRQ
 int __init arch_probe_nr_irqs(void)
 {
 	int nr;
@@ -3633,7 +3634,6 @@
 
 	return NR_IRQS_LEGACY;
 }
-#endif
 
 int io_apic_set_pci_routing(struct device *dev, int irq,
 			    struct io_apic_irq_attr *irq_attr)
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index b5254ad..0787bb3 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -200,14 +200,8 @@
 	 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
 	 */
 
-	if (!cmdline_apic && apic == &apic_default) {
-		struct apic *bigsmp = generic_bigsmp_probe();
-		if (bigsmp) {
-			apic = bigsmp;
-			printk(KERN_INFO "Overriding APIC driver with %s\n",
-			       apic->name);
-		}
-	}
+	if (!cmdline_apic && apic == &apic_default)
+		generic_bigsmp_probe();
 #endif
 
 	if (apic->setup_apic_routing)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 34b1859..75be00e 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -672,18 +672,11 @@
 /*
  * When NMI is received, print a stack trace.
  */
-int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
+int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
 {
 	unsigned long real_uv_nmi;
 	int bid;
 
-	if (reason != DIE_NMIUNKNOWN)
-		return NOTIFY_OK;
-
-	if (in_crash_kexec)
-		/* do nothing if entering the crash kernel */
-		return NOTIFY_OK;
-
 	/*
 	 * Each blade has an MMR that indicates when an NMI has been sent
 	 * to cpus on the blade. If an NMI is detected, atomically
@@ -704,7 +697,7 @@
 	}
 
 	if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
-		return NOTIFY_DONE;
+		return NMI_DONE;
 
 	__get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
 
@@ -717,17 +710,12 @@
 	dump_stack();
 	spin_unlock(&uv_nmi_lock);
 
-	return NOTIFY_STOP;
+	return NMI_HANDLED;
 }
 
-static struct notifier_block uv_dump_stack_nmi_nb = {
-	.notifier_call	= uv_handle_nmi,
-	.priority = NMI_LOCAL_LOW_PRIOR - 1,
-};
-
 void uv_register_nmi_notifier(void)
 {
-	if (register_die_notifier(&uv_dump_stack_nmi_nb))
+	if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv"))
 		printk(KERN_WARNING "UV NMI handler failed to register\n");
 }
 
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 6042981..fe6eb19 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -28,10 +28,15 @@
 
 obj-$(CONFIG_PERF_EVENTS)		+= perf_event.o
 
+ifdef CONFIG_PERF_EVENTS
+obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_amd.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
+endif
+
 obj-$(CONFIG_X86_MCE)			+= mcheck/
 obj-$(CONFIG_MTRR)			+= mtrr/
 
-obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o
+obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o perf_event_amd_ibs.o
 
 quiet_cmd_mkcapflags = MKCAP   $@
       cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
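
perf_event_amd_ibs.o, added to the build above, is the consumer of the IBS_CPUID_FEATURES/IBS_CAPS_* definitions introduced earlier in asm/perf_event.h. Its get_ibs_caps() body is not part of the hunks shown here; the following is only a sketch of how the detection those definitions describe would typically look (CPUID leaf 0x8000001b, with EAX bit 0 indicating that the remaining capability bits are valid):

static u32 __get_ibs_caps(void)			/* illustrative sketch only */
{
	u32 caps;

	if (!boot_cpu_has(X86_FEATURE_IBS))
		return 0;

	/* Is the IBS capability leaf implemented at all? */
	if (cpuid_eax(0x80000000) < IBS_CPUID_FEATURES)
		return IBS_CAPS_DEFAULT;

	caps = cpuid_eax(IBS_CPUID_FEATURES);
	if (!(caps & IBS_CAPS_AVAIL))
		/* leaf present but flags not valid: assume base features */
		return IBS_CAPS_DEFAULT;

	return caps;
}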
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 0ed633c..6199232 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -78,27 +78,20 @@
 
 static cpumask_var_t mce_inject_cpumask;
 
-static int mce_raise_notify(struct notifier_block *self,
-			    unsigned long val, void *data)
+static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
 {
-	struct die_args *args = (struct die_args *)data;
 	int cpu = smp_processor_id();
 	struct mce *m = &__get_cpu_var(injectm);
-	if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
-		return NOTIFY_DONE;
+	if (!cpumask_test_cpu(cpu, mce_inject_cpumask))
+		return NMI_DONE;
 	cpumask_clear_cpu(cpu, mce_inject_cpumask);
 	if (m->inject_flags & MCJ_EXCEPTION)
-		raise_exception(m, args->regs);
+		raise_exception(m, regs);
 	else if (m->status)
 		raise_poll(m);
-	return NOTIFY_STOP;
+	return NMI_HANDLED;
 }
 
-static struct notifier_block mce_raise_nb = {
-	.notifier_call = mce_raise_notify,
-	.priority = NMI_LOCAL_NORMAL_PRIOR,
-};
-
 /* Inject mce on current CPU */
 static int raise_local(void)
 {
@@ -216,7 +209,8 @@
 		return -ENOMEM;
 	printk(KERN_INFO "Machine check injector initialized\n");
 	mce_chrdev_ops.write = mce_write;
-	register_die_notifier(&mce_raise_nb);
+	register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0,
+				"mce_notify");
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 08363b0..fce51ad1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -908,9 +908,6 @@
 
 	percpu_inc(mce_exception_count);
 
-	if (notify_die(DIE_NMI, "machine check", regs, error_code,
-			   18, SIGKILL) == NOTIFY_STOP)
-		goto out;
 	if (!banks)
 		goto out;
 
@@ -1140,6 +1137,15 @@
 	add_timer_on(t, smp_processor_id());
 }
 
+/* Must not be called in IRQ context where del_timer_sync() can deadlock */
+static void mce_timer_delete_all(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		del_timer_sync(&per_cpu(mce_timer, cpu));
+}
+
 static void mce_do_trigger(struct work_struct *work)
 {
 	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
@@ -1750,7 +1756,6 @@
 
 static void mce_cpu_restart(void *data)
 {
-	del_timer_sync(&__get_cpu_var(mce_timer));
 	if (!mce_available(__this_cpu_ptr(&cpu_info)))
 		return;
 	__mcheck_cpu_init_generic();
@@ -1760,16 +1765,15 @@
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
+	mce_timer_delete_all();
 	on_each_cpu(mce_cpu_restart, NULL, 1);
 }
 
 /* Toggle features for corrected errors */
-static void mce_disable_ce(void *all)
+static void mce_disable_cmci(void *data)
 {
 	if (!mce_available(__this_cpu_ptr(&cpu_info)))
 		return;
-	if (all)
-		del_timer_sync(&__get_cpu_var(mce_timer));
 	cmci_clear();
 }
 
@@ -1852,7 +1856,8 @@
 	if (mce_ignore_ce ^ !!new) {
 		if (new) {
 			/* disable ce features */
-			on_each_cpu(mce_disable_ce, (void *)1, 1);
+			mce_timer_delete_all();
+			on_each_cpu(mce_disable_cmci, NULL, 1);
 			mce_ignore_ce = 1;
 		} else {
 			/* enable ce features */
@@ -1875,7 +1880,7 @@
 	if (mce_cmci_disabled ^ !!new) {
 		if (new) {
 			/* disable cmci */
-			on_each_cpu(mce_disable_ce, NULL, 1);
+			on_each_cpu(mce_disable_cmci, NULL, 1);
 			mce_cmci_disabled = 1;
 		} else {
 			/* enable cmci */
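
The mce.c changes above move every del_timer_sync() call out of the on_each_cpu() callbacks, because those callbacks run in IPI (interrupt) context where del_timer_sync() can deadlock, as the new comment on mce_timer_delete_all() notes. The resulting ordering, spelled out in sketch form (this mirrors mce_restart() above rather than introducing anything new):

	/* 1) From process context: tear down every per-CPU timer ... */
	mce_timer_delete_all();
	/* 2) ... then do the remaining per-CPU reinit via IPI, where
	 *    synchronizing/sleeping primitives are off limits. */
	on_each_cpu(mce_cpu_restart, NULL, 1);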
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index cfa62ec..6408910 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -32,6 +32,8 @@
 #include <asm/smp.h>
 #include <asm/alternative.h>
 
+#include "perf_event.h"
+
 #if 0
 #undef wrmsrl
 #define wrmsrl(msr, val) 					\
@@ -43,283 +45,17 @@
 } while (0)
 #endif
 
-/*
- *          |   NHM/WSM    |      SNB     |
- * register -------------------------------
- *          |  HT  | no HT |  HT  | no HT |
- *-----------------------------------------
- * offcore  | core | core  | cpu  | core  |
- * lbr_sel  | core | core  | cpu  | core  |
- * ld_lat   | cpu  | core  | cpu  | core  |
- *-----------------------------------------
- *
- * Given that there is a small number of shared regs,
- * we can pre-allocate their slot in the per-cpu
- * per-core reg tables.
- */
-enum extra_reg_type {
-	EXTRA_REG_NONE  = -1,	/* not used */
+struct x86_pmu x86_pmu __read_mostly;
 
-	EXTRA_REG_RSP_0 = 0,	/* offcore_response_0 */
-	EXTRA_REG_RSP_1 = 1,	/* offcore_response_1 */
-
-	EXTRA_REG_MAX		/* number of entries needed */
-};
-
-struct event_constraint {
-	union {
-		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-		u64		idxmsk64;
-	};
-	u64	code;
-	u64	cmask;
-	int	weight;
-};
-
-struct amd_nb {
-	int nb_id;  /* NorthBridge id */
-	int refcnt; /* reference count */
-	struct perf_event *owners[X86_PMC_IDX_MAX];
-	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
-};
-
-struct intel_percore;
-
-#define MAX_LBR_ENTRIES		16
-
-struct cpu_hw_events {
-	/*
-	 * Generic x86 PMC bits
-	 */
-	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
-	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	unsigned long		running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	int			enabled;
-
-	int			n_events;
-	int			n_added;
-	int			n_txn;
-	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
-	u64			tags[X86_PMC_IDX_MAX];
-	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
-
-	unsigned int		group_flag;
-
-	/*
-	 * Intel DebugStore bits
-	 */
-	struct debug_store	*ds;
-	u64			pebs_enabled;
-
-	/*
-	 * Intel LBR bits
-	 */
-	int				lbr_users;
-	void				*lbr_context;
-	struct perf_branch_stack	lbr_stack;
-	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
-
-	/*
-	 * manage shared (per-core, per-cpu) registers
-	 * used on Intel NHM/WSM/SNB
-	 */
-	struct intel_shared_regs	*shared_regs;
-
-	/*
-	 * AMD specific bits
-	 */
-	struct amd_nb		*amd_nb;
-};
-
-#define __EVENT_CONSTRAINT(c, n, m, w) {\
-	{ .idxmsk64 = (n) },		\
-	.code = (c),			\
-	.cmask = (m),			\
-	.weight = (w),			\
-}
-
-#define EVENT_CONSTRAINT(c, n, m)	\
-	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
-
-/*
- * Constraint on the Event code.
- */
-#define INTEL_EVENT_CONSTRAINT(c, n)	\
-	EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
-
-/*
- * Constraint on the Event code + UMask + fixed-mask
- *
- * filter mask to validate fixed counter events.
- * the following filters disqualify for fixed counters:
- *  - inv
- *  - edge
- *  - cnt-mask
- *  The other filters are supported by fixed counters.
- *  The any-thread option is supported starting with v3.
- */
-#define FIXED_EVENT_CONSTRAINT(c, n)	\
-	EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
-
-/*
- * Constraint on the Event code + UMask
- */
-#define INTEL_UEVENT_CONSTRAINT(c, n)	\
-	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
-
-#define EVENT_CONSTRAINT_END		\
-	EVENT_CONSTRAINT(0, 0, 0)
-
-#define for_each_event_constraint(e, c)	\
-	for ((e) = (c); (e)->weight; (e)++)
-
-/*
- * Per register state.
- */
-struct er_account {
-	raw_spinlock_t		lock;	/* per-core: protect structure */
-	u64			config;	/* extra MSR config */
-	u64			reg;	/* extra MSR number */
-	atomic_t		ref;	/* reference count */
-};
-
-/*
- * Extra registers for specific events.
- *
- * Some events need large masks and require external MSRs.
- * Those extra MSRs end up being shared for all events on
- * a PMU and sometimes between PMU of sibling HT threads.
- * In either case, the kernel needs to handle conflicting
- * accesses to those extra, shared, regs. The data structure
- * to manage those registers is stored in cpu_hw_event.
- */
-struct extra_reg {
-	unsigned int		event;
-	unsigned int		msr;
-	u64			config_mask;
-	u64			valid_mask;
-	int			idx;  /* per_xxx->regs[] reg index */
-};
-
-#define EVENT_EXTRA_REG(e, ms, m, vm, i) {	\
-	.event = (e),		\
-	.msr = (ms),		\
-	.config_mask = (m),	\
-	.valid_mask = (vm),	\
-	.idx = EXTRA_REG_##i	\
-	}
-
-#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)	\
-	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
-
-#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
-
-union perf_capabilities {
-	struct {
-		u64	lbr_format    : 6;
-		u64	pebs_trap     : 1;
-		u64	pebs_arch_reg : 1;
-		u64	pebs_format   : 4;
-		u64	smm_freeze    : 1;
-	};
-	u64	capabilities;
-};
-
-/*
- * struct x86_pmu - generic x86 pmu
- */
-struct x86_pmu {
-	/*
-	 * Generic x86 PMC bits
-	 */
-	const char	*name;
-	int		version;
-	int		(*handle_irq)(struct pt_regs *);
-	void		(*disable_all)(void);
-	void		(*enable_all)(int added);
-	void		(*enable)(struct perf_event *);
-	void		(*disable)(struct perf_event *);
-	int		(*hw_config)(struct perf_event *event);
-	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
-	unsigned	eventsel;
-	unsigned	perfctr;
-	u64		(*event_map)(int);
-	int		max_events;
-	int		num_counters;
-	int		num_counters_fixed;
-	int		cntval_bits;
-	u64		cntval_mask;
-	int		apic;
-	u64		max_period;
-	struct event_constraint *
-			(*get_event_constraints)(struct cpu_hw_events *cpuc,
-						 struct perf_event *event);
-
-	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
-						 struct perf_event *event);
-	struct event_constraint *event_constraints;
-	void		(*quirks)(void);
-	int		perfctr_second_write;
-
-	int		(*cpu_prepare)(int cpu);
-	void		(*cpu_starting)(int cpu);
-	void		(*cpu_dying)(int cpu);
-	void		(*cpu_dead)(int cpu);
-
-	/*
-	 * Intel Arch Perfmon v2+
-	 */
-	u64			intel_ctrl;
-	union perf_capabilities intel_cap;
-
-	/*
-	 * Intel DebugStore bits
-	 */
-	int		bts, pebs;
-	int		bts_active, pebs_active;
-	int		pebs_record_size;
-	void		(*drain_pebs)(struct pt_regs *regs);
-	struct event_constraint *pebs_constraints;
-
-	/*
-	 * Intel LBR
-	 */
-	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
-	int		lbr_nr;			   /* hardware stack size */
-
-	/*
-	 * Extra registers for events
-	 */
-	struct extra_reg *extra_regs;
-	unsigned int er_flags;
-};
-
-#define ERF_NO_HT_SHARING	1
-#define ERF_HAS_RSP_1		2
-
-static struct x86_pmu x86_pmu __read_mostly;
-
-static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
+DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
 	.enabled = 1,
 };
 
-static int x86_perf_event_set_period(struct perf_event *event);
-
-/*
- * Generalized hw caching related hw_event table, filled
- * in on a per model basis. A value of 0 means
- * 'not supported', -1 means 'hw_event makes no sense on
- * this CPU', any other value means the raw hw_event
- * ID.
- */
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-static u64 __read_mostly hw_cache_event_ids
+u64 __read_mostly hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX];
-static u64 __read_mostly hw_cache_extra_regs
+u64 __read_mostly hw_cache_extra_regs
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX];
@@ -329,8 +65,7 @@
  * Can only be executed on the CPU where the event is active.
  * Returns the delta events processed.
  */
-static u64
-x86_perf_event_update(struct perf_event *event)
+u64 x86_perf_event_update(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	int shift = 64 - x86_pmu.cntval_bits;
@@ -373,30 +108,6 @@
 	return new_raw_count;
 }
 
-static inline int x86_pmu_addr_offset(int index)
-{
-	int offset;
-
-	/* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
-	alternative_io(ASM_NOP2,
-		       "shll $1, %%eax",
-		       X86_FEATURE_PERFCTR_CORE,
-		       "=a" (offset),
-		       "a"  (index));
-
-	return offset;
-}
-
-static inline unsigned int x86_pmu_config_addr(int index)
-{
-	return x86_pmu.eventsel + x86_pmu_addr_offset(index);
-}
-
-static inline unsigned int x86_pmu_event_addr(int index)
-{
-	return x86_pmu.perfctr + x86_pmu_addr_offset(index);
-}
-
 /*
  * Find and validate any extra registers to set up.
  */
@@ -532,9 +243,6 @@
 	return false;
 }
 
-static void reserve_ds_buffers(void);
-static void release_ds_buffers(void);
-
 static void hw_perf_event_destroy(struct perf_event *event)
 {
 	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
@@ -583,7 +291,7 @@
 	return x86_pmu_extra_regs(val, event);
 }
 
-static int x86_setup_perfctr(struct perf_event *event)
+int x86_setup_perfctr(struct perf_event *event)
 {
 	struct perf_event_attr *attr = &event->attr;
 	struct hw_perf_event *hwc = &event->hw;
@@ -647,7 +355,7 @@
 	return 0;
 }
 
-static int x86_pmu_hw_config(struct perf_event *event)
+int x86_pmu_hw_config(struct perf_event *event)
 {
 	if (event->attr.precise_ip) {
 		int precise = 0;
@@ -723,7 +431,7 @@
 	return x86_pmu.hw_config(event);
 }
 
-static void x86_pmu_disable_all(void)
+void x86_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
@@ -758,15 +466,7 @@
 	x86_pmu.disable_all();
 }
 
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
-					  u64 enable_mask)
-{
-	if (hwc->extra_reg.reg)
-		wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
-	wrmsrl(hwc->config_base, hwc->config | enable_mask);
-}
-
-static void x86_pmu_enable_all(int added)
+void x86_pmu_enable_all(int added)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
@@ -788,7 +488,7 @@
 	return event->pmu == &pmu;
 }
 
-static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
 	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
@@ -959,7 +659,6 @@
 }
 
 static void x86_pmu_start(struct perf_event *event, int flags);
-static void x86_pmu_stop(struct perf_event *event, int flags);
 
 static void x86_pmu_enable(struct pmu *pmu)
 {
@@ -1031,21 +730,13 @@
 	x86_pmu.enable_all(added);
 }
 
-static inline void x86_pmu_disable_event(struct perf_event *event)
-{
-	struct hw_perf_event *hwc = &event->hw;
-
-	wrmsrl(hwc->config_base, hwc->config);
-}
-
 static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
 
 /*
  * Set the next IRQ period, based on the hwc->period_left value.
  * To be called with the event disabled in hw:
  */
-static int
-x86_perf_event_set_period(struct perf_event *event)
+int x86_perf_event_set_period(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	s64 left = local64_read(&hwc->period_left);
@@ -1105,7 +796,7 @@
 	return ret;
 }
 
-static void x86_pmu_enable_event(struct perf_event *event)
+void x86_pmu_enable_event(struct perf_event *event)
 {
 	if (__this_cpu_read(cpu_hw_events.enabled))
 		__x86_pmu_enable_event(&event->hw,
@@ -1244,7 +935,7 @@
 	local_irq_restore(flags);
 }
 
-static void x86_pmu_stop(struct perf_event *event, int flags)
+void x86_pmu_stop(struct perf_event *event, int flags)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
@@ -1297,7 +988,7 @@
 	perf_event_update_userpage(event);
 }
 
-static int x86_pmu_handle_irq(struct pt_regs *regs)
+int x86_pmu_handle_irq(struct pt_regs *regs)
 {
 	struct perf_sample_data data;
 	struct cpu_hw_events *cpuc;
@@ -1367,109 +1058,28 @@
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 }
 
-struct pmu_nmi_state {
-	unsigned int	marked;
-	int		handled;
-};
-
-static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
-
 static int __kprobes
-perf_event_nmi_handler(struct notifier_block *self,
-			 unsigned long cmd, void *__args)
+perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 {
-	struct die_args *args = __args;
-	unsigned int this_nmi;
-	int handled;
-
 	if (!atomic_read(&active_events))
-		return NOTIFY_DONE;
+		return NMI_DONE;
 
-	switch (cmd) {
-	case DIE_NMI:
-		break;
-	case DIE_NMIUNKNOWN:
-		this_nmi = percpu_read(irq_stat.__nmi_count);
-		if (this_nmi != __this_cpu_read(pmu_nmi.marked))
-			/* let the kernel handle the unknown nmi */
-			return NOTIFY_DONE;
-		/*
-		 * This one is a PMU back-to-back nmi. Two events
-		 * trigger 'simultaneously' raising two back-to-back
-		 * NMIs. If the first NMI handles both, the latter
-		 * will be empty and daze the CPU. So, we drop it to
-		 * avoid false-positive 'unknown nmi' messages.
-		 */
-		return NOTIFY_STOP;
-	default:
-		return NOTIFY_DONE;
-	}
-
-	handled = x86_pmu.handle_irq(args->regs);
-	if (!handled)
-		return NOTIFY_DONE;
-
-	this_nmi = percpu_read(irq_stat.__nmi_count);
-	if ((handled > 1) ||
-		/* the next nmi could be a back-to-back nmi */
-	    ((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
-	     (__this_cpu_read(pmu_nmi.handled) > 1))) {
-		/*
-		 * We could have two subsequent back-to-back nmis: The
-		 * first handles more than one counter, the 2nd
-		 * handles only one counter and the 3rd handles no
-		 * counter.
-		 *
-		 * This is the 2nd nmi because the previous was
-		 * handling more than one counter. We will mark the
-		 * next (3rd) and then drop it if unhandled.
-		 */
-		__this_cpu_write(pmu_nmi.marked, this_nmi + 1);
-		__this_cpu_write(pmu_nmi.handled, handled);
-	}
-
-	return NOTIFY_STOP;
+	return x86_pmu.handle_irq(regs);
 }
 
-static __read_mostly struct notifier_block perf_event_nmi_notifier = {
-	.notifier_call		= perf_event_nmi_handler,
-	.next			= NULL,
-	.priority		= NMI_LOCAL_LOW_PRIOR,
-};
-
-static struct event_constraint unconstrained;
-static struct event_constraint emptyconstraint;
-
-static struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
-{
-	struct event_constraint *c;
-
-	if (x86_pmu.event_constraints) {
-		for_each_event_constraint(c, x86_pmu.event_constraints) {
-			if ((event->hw.config & c->cmask) == c->code)
-				return c;
-		}
-	}
-
-	return &unconstrained;
-}
-
-#include "perf_event_amd.c"
-#include "perf_event_p6.c"
-#include "perf_event_p4.c"
-#include "perf_event_intel_lbr.c"
-#include "perf_event_intel_ds.c"
-#include "perf_event_intel.c"
+struct event_constraint emptyconstraint;
+struct event_constraint unconstrained;
 
 static int __cpuinit
 x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (long)hcpu;
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 	int ret = NOTIFY_OK;
 
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_UP_PREPARE:
+		cpuc->kfree_on_online = NULL;
 		if (x86_pmu.cpu_prepare)
 			ret = x86_pmu.cpu_prepare(cpu);
 		break;
@@ -1479,6 +1089,10 @@
 			x86_pmu.cpu_starting(cpu);
 		break;
 
+	case CPU_ONLINE:
+		kfree(cpuc->kfree_on_online);
+		break;
+
 	case CPU_DYING:
 		if (x86_pmu.cpu_dying)
 			x86_pmu.cpu_dying(cpu);
@@ -1557,7 +1171,7 @@
 		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
 	perf_events_lapic_init();
-	register_die_notifier(&perf_event_nmi_notifier);
+	register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
 
 	unconstrained = (struct event_constraint)
 		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
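
One more note on the hotplug path touched above: cpu_hw_events gains a kfree_on_online slot (see the new perf_event.h below), cleared at CPU_UP_PREPARE and kfree()d at CPU_ONLINE. The apparent intent is to let ->cpu_starting() callbacks, which run too early in bring-up for freeing to be safe, park memory they turn out not to need instead of freeing it inline. A hedged sketch of that usage; the callback and the helpers are illustrative, not code from this merge:

static void my_cpu_starting(int cpu)	/* illustrative ->cpu_starting() */
{
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);

	if (sibling_already_provides_shared_state(cpu)) {	/* hypothetical */
		/* Freeing here is unsafe/undesirable; defer to CPU_ONLINE. */
		cpuc->kfree_on_online = cpuc->shared_regs;
		cpuc->shared_regs = sibling_shared_state(cpu);	/* hypothetical */
	}
}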
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
new file mode 100644
index 0000000..b9698d4
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -0,0 +1,505 @@
+/*
+ * Performance events x86 architecture header
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ *  Copyright (C) 2009 Google, Inc., Stephane Eranian
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+
+/*
+ *          |   NHM/WSM    |      SNB     |
+ * register -------------------------------
+ *          |  HT  | no HT |  HT  | no HT |
+ *-----------------------------------------
+ * offcore  | core | core  | cpu  | core  |
+ * lbr_sel  | core | core  | cpu  | core  |
+ * ld_lat   | cpu  | core  | cpu  | core  |
+ *-----------------------------------------
+ *
+ * Given that there is a small number of shared regs,
+ * we can pre-allocate their slot in the per-cpu
+ * per-core reg tables.
+ */
+enum extra_reg_type {
+	EXTRA_REG_NONE  = -1,	/* not used */
+
+	EXTRA_REG_RSP_0 = 0,	/* offcore_response_0 */
+	EXTRA_REG_RSP_1 = 1,	/* offcore_response_1 */
+
+	EXTRA_REG_MAX		/* number of entries needed */
+};
+
+struct event_constraint {
+	union {
+		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+		u64		idxmsk64;
+	};
+	u64	code;
+	u64	cmask;
+	int	weight;
+};
+
+struct amd_nb {
+	int nb_id;  /* NorthBridge id */
+	int refcnt; /* reference count */
+	struct perf_event *owners[X86_PMC_IDX_MAX];
+	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
+};
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS		4
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+	u64	bts_buffer_base;
+	u64	bts_index;
+	u64	bts_absolute_maximum;
+	u64	bts_interrupt_threshold;
+	u64	pebs_buffer_base;
+	u64	pebs_index;
+	u64	pebs_absolute_maximum;
+	u64	pebs_interrupt_threshold;
+	u64	pebs_event_reset[MAX_PEBS_EVENTS];
+};
+
+/*
+ * Per register state.
+ */
+struct er_account {
+	raw_spinlock_t		lock;	/* per-core: protect structure */
+	u64                 config;	/* extra MSR config */
+	u64                 reg;	/* extra MSR number */
+	atomic_t            ref;	/* reference count */
+};
+
+/*
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
+ */
+struct intel_shared_regs {
+	struct er_account       regs[EXTRA_REG_MAX];
+	int                     refcnt;		/* per-core: #HT threads */
+	unsigned                core_id;	/* per-core: core id */
+};
+
+#define MAX_LBR_ENTRIES		16
+
+struct cpu_hw_events {
+	/*
+	 * Generic x86 PMC bits
+	 */
+	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
+	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	int			enabled;
+
+	int			n_events;
+	int			n_added;
+	int			n_txn;
+	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
+	u64			tags[X86_PMC_IDX_MAX];
+	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+
+	unsigned int		group_flag;
+
+	/*
+	 * Intel DebugStore bits
+	 */
+	struct debug_store	*ds;
+	u64			pebs_enabled;
+
+	/*
+	 * Intel LBR bits
+	 */
+	int				lbr_users;
+	void				*lbr_context;
+	struct perf_branch_stack	lbr_stack;
+	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
+
+	/*
+	 * Intel host/guest exclude bits
+	 */
+	u64				intel_ctrl_guest_mask;
+	u64				intel_ctrl_host_mask;
+	struct perf_guest_switch_msr	guest_switch_msrs[X86_PMC_IDX_MAX];
+
+	/*
+	 * manage shared (per-core, per-cpu) registers
+	 * used on Intel NHM/WSM/SNB
+	 */
+	struct intel_shared_regs	*shared_regs;
+
+	/*
+	 * AMD specific bits
+	 */
+	struct amd_nb		*amd_nb;
+
+	void				*kfree_on_online;
+};
+
+#define __EVENT_CONSTRAINT(c, n, m, w) {\
+	{ .idxmsk64 = (n) },		\
+	.code = (c),			\
+	.cmask = (m),			\
+	.weight = (w),			\
+}
+
+#define EVENT_CONSTRAINT(c, n, m)	\
+	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
+
+/*
+ * Constraint on the Event code.
+ */
+#define INTEL_EVENT_CONSTRAINT(c, n)	\
+	EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
+
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ *
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ *  - inv
+ *  - edge
+ *  - cnt-mask
+ *  The other filters are supported by fixed counters.
+ *  The any-thread option is supported starting with v3.
+ */
+#define FIXED_EVENT_CONSTRAINT(c, n)	\
+	EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
+
+/*
+ * Constraint on the Event code + UMask
+ */
+#define INTEL_UEVENT_CONSTRAINT(c, n)	\
+	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+
+#define EVENT_CONSTRAINT_END		\
+	EVENT_CONSTRAINT(0, 0, 0)
+
+#define for_each_event_constraint(e, c)	\
+	for ((e) = (c); (e)->weight; (e)++)
+
+/*
+ * Extra registers for specific events.
+ *
+ * Some events need large masks and require external MSRs.
+ * Those extra MSRs end up being shared for all events on
+ * a PMU and sometimes between PMU of sibling HT threads.
+ * In either case, the kernel needs to handle conflicting
+ * accesses to those extra, shared, regs. The data structure
+ * to manage those registers is stored in cpu_hw_event.
+ */
+struct extra_reg {
+	unsigned int		event;
+	unsigned int		msr;
+	u64			config_mask;
+	u64			valid_mask;
+	int			idx;  /* per_xxx->regs[] reg index */
+};
+
+#define EVENT_EXTRA_REG(e, ms, m, vm, i) {	\
+	.event = (e),		\
+	.msr = (ms),		\
+	.config_mask = (m),	\
+	.valid_mask = (vm),	\
+	.idx = EXTRA_REG_##i	\
+	}
+
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)	\
+	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
+
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
+
+union perf_capabilities {
+	struct {
+		u64	lbr_format:6;
+		u64	pebs_trap:1;
+		u64	pebs_arch_reg:1;
+		u64	pebs_format:4;
+		u64	smm_freeze:1;
+	};
+	u64	capabilities;
+};
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ */
+struct x86_pmu {
+	/*
+	 * Generic x86 PMC bits
+	 */
+	const char	*name;
+	int		version;
+	int		(*handle_irq)(struct pt_regs *);
+	void		(*disable_all)(void);
+	void		(*enable_all)(int added);
+	void		(*enable)(struct perf_event *);
+	void		(*disable)(struct perf_event *);
+	int		(*hw_config)(struct perf_event *event);
+	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
+	unsigned	eventsel;
+	unsigned	perfctr;
+	u64		(*event_map)(int);
+	int		max_events;
+	int		num_counters;
+	int		num_counters_fixed;
+	int		cntval_bits;
+	u64		cntval_mask;
+	int		apic;
+	u64		max_period;
+	struct event_constraint *
+			(*get_event_constraints)(struct cpu_hw_events *cpuc,
+						 struct perf_event *event);
+
+	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
+						 struct perf_event *event);
+	struct event_constraint *event_constraints;
+	void		(*quirks)(void);
+	int		perfctr_second_write;
+
+	int		(*cpu_prepare)(int cpu);
+	void		(*cpu_starting)(int cpu);
+	void		(*cpu_dying)(int cpu);
+	void		(*cpu_dead)(int cpu);
+
+	/*
+	 * Intel Arch Perfmon v2+
+	 */
+	u64			intel_ctrl;
+	union perf_capabilities intel_cap;
+
+	/*
+	 * Intel DebugStore bits
+	 */
+	int		bts, pebs;
+	int		bts_active, pebs_active;
+	int		pebs_record_size;
+	void		(*drain_pebs)(struct pt_regs *regs);
+	struct event_constraint *pebs_constraints;
+
+	/*
+	 * Intel LBR
+	 */
+	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
+	int		lbr_nr;			   /* hardware stack size */
+
+	/*
+	 * Extra registers for events
+	 */
+	struct extra_reg *extra_regs;
+	unsigned int er_flags;
+
+	/*
+	 * Intel host/guest support (KVM)
+	 */
+	struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
+};
+
+#define ERF_NO_HT_SHARING	1
+#define ERF_HAS_RSP_1		2
+
+extern struct x86_pmu x86_pmu __read_mostly;
+
+DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
+
+int x86_perf_event_set_period(struct perf_event *event);
+
+/*
+ * Generalized hw caching related hw_event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'hw_event makes no sense on
+ * this CPU', any other value means the raw hw_event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+extern u64 __read_mostly hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX];
+extern u64 __read_mostly hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+u64 x86_perf_event_update(struct perf_event *event);
+
+static inline int x86_pmu_addr_offset(int index)
+{
+	int offset;
+
+	/* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
+	alternative_io(ASM_NOP2,
+		       "shll $1, %%eax",
+		       X86_FEATURE_PERFCTR_CORE,
+		       "=a" (offset),
+		       "a"  (index));
+
+	return offset;
+}
+
+static inline unsigned int x86_pmu_config_addr(int index)
+{
+	return x86_pmu.eventsel + x86_pmu_addr_offset(index);
+}
+
+static inline unsigned int x86_pmu_event_addr(int index)
+{
+	return x86_pmu.perfctr + x86_pmu_addr_offset(index);
+}
+
+int x86_setup_perfctr(struct perf_event *event);
+
+int x86_pmu_hw_config(struct perf_event *event);
+
+void x86_pmu_disable_all(void);
+
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+					  u64 enable_mask)
+{
+	if (hwc->extra_reg.reg)
+		wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
+	wrmsrl(hwc->config_base, hwc->config | enable_mask);
+}
+
+void x86_pmu_enable_all(int added);
+
+int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
+
+void x86_pmu_stop(struct perf_event *event, int flags);
+
+static inline void x86_pmu_disable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	wrmsrl(hwc->config_base, hwc->config);
+}
+
+void x86_pmu_enable_event(struct perf_event *event);
+
+int x86_pmu_handle_irq(struct pt_regs *regs);
+
+extern struct event_constraint emptyconstraint;
+
+extern struct event_constraint unconstrained;
+
+#ifdef CONFIG_CPU_SUP_AMD
+
+int amd_pmu_init(void);
+
+#else /* CONFIG_CPU_SUP_AMD */
+
+static inline int amd_pmu_init(void)
+{
+	return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_AMD */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+
+int intel_pmu_save_and_restart(struct perf_event *event);
+
+struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event);
+
+struct intel_shared_regs *allocate_shared_regs(int cpu);
+
+int intel_pmu_init(void);
+
+void init_debug_store_on_cpu(int cpu);
+
+void fini_debug_store_on_cpu(int cpu);
+
+void release_ds_buffers(void);
+
+void reserve_ds_buffers(void);
+
+extern struct event_constraint bts_constraint;
+
+void intel_pmu_enable_bts(u64 config);
+
+void intel_pmu_disable_bts(void);
+
+int intel_pmu_drain_bts_buffer(void);
+
+extern struct event_constraint intel_core2_pebs_event_constraints[];
+
+extern struct event_constraint intel_atom_pebs_event_constraints[];
+
+extern struct event_constraint intel_nehalem_pebs_event_constraints[];
+
+extern struct event_constraint intel_westmere_pebs_event_constraints[];
+
+extern struct event_constraint intel_snb_pebs_event_constraints[];
+
+struct event_constraint *intel_pebs_constraints(struct perf_event *event);
+
+void intel_pmu_pebs_enable(struct perf_event *event);
+
+void intel_pmu_pebs_disable(struct perf_event *event);
+
+void intel_pmu_pebs_enable_all(void);
+
+void intel_pmu_pebs_disable_all(void);
+
+void intel_ds_init(void);
+
+void intel_pmu_lbr_reset(void);
+
+void intel_pmu_lbr_enable(struct perf_event *event);
+
+void intel_pmu_lbr_disable(struct perf_event *event);
+
+void intel_pmu_lbr_enable_all(void);
+
+void intel_pmu_lbr_disable_all(void);
+
+void intel_pmu_lbr_read(void);
+
+void intel_pmu_lbr_init_core(void);
+
+void intel_pmu_lbr_init_nhm(void);
+
+void intel_pmu_lbr_init_atom(void);
+
+int p4_pmu_init(void);
+
+int p6_pmu_init(void);
+
+#else /* CONFIG_CPU_SUP_INTEL */
+
+static inline void reserve_ds_buffers(void)
+{
+}
+
+static inline void release_ds_buffers(void)
+{
+}
+
+static inline int intel_pmu_init(void)
+{
+	return 0;
+}
+
+static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+	return NULL;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
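
As a usage sketch of the constraint macros above (the table name and the specific event codes here are illustrative, not taken from this patch), a model-specific table is declared and terminated like the PEBS tables later in this series:

	/*
	 * Illustrative only: event 0xc0 with umask 0x01 may only use
	 * counter 0; any event with code 0xcb may use counters 0-3.
	 */
	static struct event_constraint example_constraints[] = {
		INTEL_UEVENT_CONSTRAINT(0x01c0, 0x1),
		INTEL_EVENT_CONSTRAINT(0xcb, 0xf),
		EVENT_CONSTRAINT_END
	};

A get_event_constraints() implementation then walks such a table with for_each_event_constraint() and returns the first entry whose (event->hw.config & c->cmask) == c->code, exactly as x86_get_event_constraints() does in the perf_event_intel.c hunk below.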
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 941caa2..aeefd45 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,4 +1,10 @@
-#ifdef CONFIG_CPU_SUP_AMD
+#include <linux/perf_event.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/apicdef.h>
+
+#include "perf_event.h"
 
 static __initconst const u64 amd_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
@@ -132,6 +138,19 @@
 	if (ret)
 		return ret;
 
+	if (event->attr.exclude_host && event->attr.exclude_guest)
+		/*
+		 * When HO == GO == 1 the hardware treats that as GO == HO == 0
+		 * and will count in both modes. We don't want to count in that
+		 * case so we emulate no-counting by setting US = OS = 0.
+		 */
+		event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
+				      ARCH_PERFMON_EVENTSEL_OS);
+	else if (event->attr.exclude_host)
+		event->hw.config |= AMD_PERFMON_EVENTSEL_GUESTONLY;
+	else if (event->attr.exclude_guest)
+		event->hw.config |= AMD_PERFMON_EVENTSEL_HOSTONLY;
+
 	if (event->attr.type != PERF_TYPE_RAW)
 		return 0;
 
@@ -350,7 +369,7 @@
 			continue;
 
 		if (nb->nb_id == nb_id) {
-			kfree(cpuc->amd_nb);
+			cpuc->kfree_on_online = cpuc->amd_nb;
 			cpuc->amd_nb = nb;
 			break;
 		}
@@ -392,7 +411,7 @@
 	.perfctr		= MSR_K7_PERFCTR0,
 	.event_map		= amd_pmu_event_map,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
-	.num_counters		= 4,
+	.num_counters		= AMD64_NUM_COUNTERS,
 	.cntval_bits		= 48,
 	.cntval_mask		= (1ULL << 48) - 1,
 	.apic			= 1,
@@ -556,7 +575,7 @@
 	.perfctr		= MSR_F15H_PERF_CTR,
 	.event_map		= amd_pmu_event_map,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
-	.num_counters		= 6,
+	.num_counters		= AMD64_NUM_COUNTERS_F15H,
 	.cntval_bits		= 48,
 	.cntval_mask		= (1ULL << 48) - 1,
 	.apic			= 1,
@@ -573,7 +592,7 @@
 #endif
 };
 
-static __init int amd_pmu_init(void)
+__init int amd_pmu_init(void)
 {
 	/* Performance-monitoring supported from K7 and later: */
 	if (boot_cpu_data.x86 < 6)
@@ -602,12 +621,3 @@
 
 	return 0;
 }
-
-#else /* CONFIG_CPU_SUP_AMD */
-
-static int amd_pmu_init(void)
-{
-	return 0;
-}
-
-#endif
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
new file mode 100644
index 0000000..ab6343d
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -0,0 +1,294 @@
+/*
+ * Performance events - AMD IBS
+ *
+ *  Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include <asm/apic.h>
+
+static u32 ibs_caps;
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
+
+static struct pmu perf_ibs;
+
+static int perf_ibs_init(struct perf_event *event)
+{
+	if (perf_ibs.type != event->attr.type)
+		return -ENOENT;
+	return 0;
+}
+
+static int perf_ibs_add(struct perf_event *event, int flags)
+{
+	return 0;
+}
+
+static void perf_ibs_del(struct perf_event *event, int flags)
+{
+}
+
+static struct pmu perf_ibs = {
+	.event_init	= perf_ibs_init,
+	.add		= perf_ibs_add,
+	.del		= perf_ibs_del,
+};
+
+static __init int perf_event_ibs_init(void)
+{
+	if (!ibs_caps)
+		return -ENODEV;	/* ibs not supported by the cpu */
+
+	perf_pmu_register(&perf_ibs, "ibs", -1);
+	printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
+
+	return 0;
+}
+
+#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
+
+static __init int perf_event_ibs_init(void) { return 0; }
+
+#endif
+
+/* IBS - apic initialization, for perf and oprofile */
+
+static __init u32 __get_ibs_caps(void)
+{
+	u32 caps;
+	unsigned int max_level;
+
+	if (!boot_cpu_has(X86_FEATURE_IBS))
+		return 0;
+
+	/* check IBS cpuid feature flags */
+	max_level = cpuid_eax(0x80000000);
+	if (max_level < IBS_CPUID_FEATURES)
+		return IBS_CAPS_DEFAULT;
+
+	caps = cpuid_eax(IBS_CPUID_FEATURES);
+	if (!(caps & IBS_CAPS_AVAIL))
+		/* cpuid flags not valid */
+		return IBS_CAPS_DEFAULT;
+
+	return caps;
+}
+
+u32 get_ibs_caps(void)
+{
+	return ibs_caps;
+}
+
+EXPORT_SYMBOL(get_ibs_caps);
+
+static inline int get_eilvt(int offset)
+{
+	return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
+}
+
+static inline int put_eilvt(int offset)
+{
+	return !setup_APIC_eilvt(offset, 0, 0, 1);
+}
+
+/*
+ * Check and reserve APIC extended interrupt LVT offset for IBS if available.
+ */
+static inline int ibs_eilvt_valid(void)
+{
+	int offset;
+	u64 val;
+	int valid = 0;
+
+	preempt_disable();
+
+	rdmsrl(MSR_AMD64_IBSCTL, val);
+	offset = val & IBSCTL_LVT_OFFSET_MASK;
+
+	if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
+		pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
+		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+		goto out;
+	}
+
+	if (!get_eilvt(offset)) {
+		pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
+		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+		goto out;
+	}
+
+	valid = 1;
+out:
+	preempt_enable();
+
+	return valid;
+}
+
+static int setup_ibs_ctl(int ibs_eilvt_off)
+{
+	struct pci_dev *cpu_cfg;
+	int nodes;
+	u32 value = 0;
+
+	nodes = 0;
+	cpu_cfg = NULL;
+	do {
+		cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
+					 PCI_DEVICE_ID_AMD_10H_NB_MISC,
+					 cpu_cfg);
+		if (!cpu_cfg)
+			break;
+		++nodes;
+		pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
+				       | IBSCTL_LVT_OFFSET_VALID);
+		pci_read_config_dword(cpu_cfg, IBSCTL, &value);
+		if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
+			pci_dev_put(cpu_cfg);
+			printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
+			       "IBSCTL = 0x%08x\n", value);
+			return -EINVAL;
+		}
+	} while (1);
+
+	if (!nodes) {
+		printk(KERN_DEBUG "No CPU node configured for IBS\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/*
+ * This runs only on the current cpu. We try to find an LVT offset and
+ * set up the local APIC. For this we must disable preemption. On
+ * success we initialize all nodes with this offset, which then updates
+ * the offset in the per-node IBS_CTL MSR. The per-core APIC setup of
+ * the IBS interrupt vector is handled by perf_ibs_cpu_notifier, which
+ * uses the new offset.
+ */
+static int force_ibs_eilvt_setup(void)
+{
+	int offset;
+	int ret;
+
+	preempt_disable();
+	/* find the next free available EILVT entry, skip offset 0 */
+	for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
+		if (get_eilvt(offset))
+			break;
+	}
+	preempt_enable();
+
+	if (offset == APIC_EILVT_NR_MAX) {
+		printk(KERN_DEBUG "No EILVT entry available\n");
+		return -EBUSY;
+	}
+
+	ret = setup_ibs_ctl(offset);
+	if (ret)
+		goto out;
+
+	if (!ibs_eilvt_valid()) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset);
+	pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
+
+	return 0;
+out:
+	preempt_disable();
+	put_eilvt(offset);
+	preempt_enable();
+	return ret;
+}
+
+static inline int get_ibs_lvt_offset(void)
+{
+	u64 val;
+
+	rdmsrl(MSR_AMD64_IBSCTL, val);
+	if (!(val & IBSCTL_LVT_OFFSET_VALID))
+		return -EINVAL;
+
+	return val & IBSCTL_LVT_OFFSET_MASK;
+}
+
+static void setup_APIC_ibs(void *dummy)
+{
+	int offset;
+
+	offset = get_ibs_lvt_offset();
+	if (offset < 0)
+		goto failed;
+
+	if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
+		return;
+failed:
+	pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
+		smp_processor_id());
+}
+
+static void clear_APIC_ibs(void *dummy)
+{
+	int offset;
+
+	offset = get_ibs_lvt_offset();
+	if (offset >= 0)
+		setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
+}
+
+static int __cpuinit
+perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_STARTING:
+		setup_APIC_ibs(NULL);
+		break;
+	case CPU_DYING:
+		clear_APIC_ibs(NULL);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static __init int amd_ibs_init(void)
+{
+	u32 caps;
+	int ret;
+
+	caps = __get_ibs_caps();
+	if (!caps)
+		return -ENODEV;	/* ibs not supported by the cpu */
+
+	if (!ibs_eilvt_valid()) {
+		ret = force_ibs_eilvt_setup();
+		if (ret) {
+			pr_err("Failed to setup IBS, %d\n", ret);
+			return ret;
+		}
+	}
+
+	get_online_cpus();
+	ibs_caps = caps;
+	/* make ibs_caps visible to other cpus: */
+	smp_mb();
+	perf_cpu_notifier(perf_ibs_cpu_notifier);
+	smp_call_function(setup_APIC_ibs, NULL, 1);
+	put_online_cpus();
+
+	return perf_event_ibs_init();
+}
+
+/* Since we need the pci subsystem to init ibs we can't do this earlier: */
+device_initcall(amd_ibs_init);
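
As the comment above notes, the APIC/IBSCTL initialization here serves both perf and oprofile; get_ibs_caps() is exported so other IBS users can query the capability word instead of re-running the CPUID checks. A hedged sketch of a consumer, assuming the declaration is made visible through <asm/perf_event.h> (not shown in this hunk); the probe function is hypothetical:

	static int __init mydrv_ibs_probe(void)
	{
		u32 caps = get_ibs_caps();

		if (!caps)
			return -ENODEV;	/* IBS absent, or amd_ibs_init() has not run yet */

		pr_info("mydrv: IBS caps 0x%08x\n", caps);
		return 0;
	}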
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index f88af2c..e09ca20 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,16 +1,19 @@
-#ifdef CONFIG_CPU_SUP_INTEL
-
 /*
  * Per core/cpu state
  *
  * Used to coordinate shared registers between HT threads or
  * among events on a single PMU.
  */
-struct intel_shared_regs {
-	struct er_account       regs[EXTRA_REG_MAX];
-	int                     refcnt;		/* per-core: #HT threads */
-	unsigned                core_id;	/* per-core: core id */
-};
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "perf_event.h"
 
 /*
  * Intel PerfMon, used on Core and later.
@@ -746,7 +749,8 @@
 
 	intel_pmu_pebs_enable_all();
 	intel_pmu_lbr_enable_all();
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
+			x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
 
 	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
 		struct perf_event *event =
@@ -869,6 +873,7 @@
 static void intel_pmu_disable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
 	if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
 		intel_pmu_disable_bts();
@@ -876,6 +881,9 @@
 		return;
 	}
 
+	cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
+	cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
+
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
 		intel_pmu_disable_fixed(hwc);
 		return;
@@ -921,6 +929,7 @@
 static void intel_pmu_enable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
 	if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
 		if (!__this_cpu_read(cpu_hw_events.enabled))
@@ -930,6 +939,11 @@
 		return;
 	}
 
+	if (event->attr.exclude_host)
+		cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
+	if (event->attr.exclude_guest)
+		cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
+
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
 		intel_pmu_enable_fixed(hwc);
 		return;
@@ -945,7 +959,7 @@
  * Save and restart an expired event. Called by NMI contexts,
  * so it has to be careful about preempting normal event ops:
  */
-static int intel_pmu_save_and_restart(struct perf_event *event)
+int intel_pmu_save_and_restart(struct perf_event *event)
 {
 	x86_perf_event_update(event);
 	return x86_perf_event_set_period(event);
@@ -1197,6 +1211,21 @@
 	return c;
 }
 
+struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+	struct event_constraint *c;
+
+	if (x86_pmu.event_constraints) {
+		for_each_event_constraint(c, x86_pmu.event_constraints) {
+			if ((event->hw.config & c->cmask) == c->code)
+				return c;
+		}
+	}
+
+	return &unconstrained;
+}
+
 static struct event_constraint *
 intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 {
@@ -1284,12 +1313,84 @@
 	return 0;
 }
 
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+{
+	if (x86_pmu.guest_get_msrs)
+		return x86_pmu.guest_get_msrs(nr);
+	*nr = 0;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
+
+static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+
+	arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
+	arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
+	arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
+
+	*nr = 1;
+	return arr;
+}
+
+static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+	int idx;
+
+	for (idx = 0; idx < x86_pmu.num_counters; idx++)  {
+		struct perf_event *event = cpuc->events[idx];
+
+		arr[idx].msr = x86_pmu_config_addr(idx);
+		arr[idx].host = arr[idx].guest = 0;
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		arr[idx].host = arr[idx].guest =
+			event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
+
+		if (event->attr.exclude_host)
+			arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+		else if (event->attr.exclude_guest)
+			arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+	}
+
+	*nr = x86_pmu.num_counters;
+	return arr;
+}
+
+static void core_pmu_enable_event(struct perf_event *event)
+{
+	if (!event->attr.exclude_host)
+		x86_pmu_enable_event(event);
+}
+
+static void core_pmu_enable_all(int added)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx;
+
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
+
+		if (!test_bit(idx, cpuc->active_mask) ||
+				cpuc->events[idx]->attr.exclude_host)
+			continue;
+
+		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+	}
+}
+
 static __initconst const struct x86_pmu core_pmu = {
 	.name			= "core",
 	.handle_irq		= x86_pmu_handle_irq,
 	.disable_all		= x86_pmu_disable_all,
-	.enable_all		= x86_pmu_enable_all,
-	.enable			= x86_pmu_enable_event,
+	.enable_all		= core_pmu_enable_all,
+	.enable			= core_pmu_enable_event,
 	.disable		= x86_pmu_disable_event,
 	.hw_config		= x86_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
@@ -1307,9 +1408,10 @@
 	.get_event_constraints	= intel_get_event_constraints,
 	.put_event_constraints	= intel_put_event_constraints,
 	.event_constraints	= intel_core_event_constraints,
+	.guest_get_msrs		= core_guest_get_msrs,
 };
 
-static struct intel_shared_regs *allocate_shared_regs(int cpu)
+struct intel_shared_regs *allocate_shared_regs(int cpu)
 {
 	struct intel_shared_regs *regs;
 	int i;
@@ -1362,7 +1464,7 @@
 
 		pc = per_cpu(cpu_hw_events, i).shared_regs;
 		if (pc && pc->core_id == core_id) {
-			kfree(cpuc->shared_regs);
+			cpuc->kfree_on_online = cpuc->shared_regs;
 			cpuc->shared_regs = pc;
 			break;
 		}
@@ -1413,6 +1515,7 @@
 	.cpu_prepare		= intel_pmu_cpu_prepare,
 	.cpu_starting		= intel_pmu_cpu_starting,
 	.cpu_dying		= intel_pmu_cpu_dying,
+	.guest_get_msrs		= intel_guest_get_msrs,
 };
 
 static void intel_clovertown_quirks(void)
@@ -1441,7 +1544,7 @@
 	x86_pmu.pebs_constraints = NULL;
 }
 
-static __init int intel_pmu_init(void)
+__init int intel_pmu_init(void)
 {
 	union cpuid10_edx edx;
 	union cpuid10_eax eax;
@@ -1597,7 +1700,7 @@
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_snb_event_constraints;
-		x86_pmu.pebs_constraints = intel_snb_pebs_events;
+		x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_snb_extra_regs;
 		/* all extra regs are per-cpu when HT is on */
 		x86_pmu.er_flags |= ERF_HAS_RSP_1;
@@ -1628,16 +1731,3 @@
 	}
 	return 0;
 }
-
-#else /* CONFIG_CPU_SUP_INTEL */
-
-static int intel_pmu_init(void)
-{
-	return 0;
-}
-
-static struct intel_shared_regs *allocate_shared_regs(int cpu)
-{
-	return NULL;
-}
-#endif /* CONFIG_CPU_SUP_INTEL */
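
perf_guest_get_msrs() is the hook a hypervisor's context-switching code is expected to call around guest entry; the consumer-side sketch below is conceptual and not part of this patch:

	struct perf_guest_switch_msr *msrs;
	int i, nr;

	msrs = perf_guest_get_msrs(&nr);
	for (i = 0; i < nr; i++)	/* just before entering the guest */
		wrmsrl(msrs[i].msr, msrs[i].guest);
	/* ... guest runs ... */
	for (i = 0; i < nr; i++)	/* right after the VM exit */
		wrmsrl(msrs[i].msr, msrs[i].host);

With the intel_guest_get_msrs() implementation above, this amounts to clearing exclude_host counters from MSR_CORE_PERF_GLOBAL_CTRL in the host value and exclude_guest counters in the guest value.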
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 1b1ef3a..c0d238f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -1,7 +1,10 @@
-#ifdef CONFIG_CPU_SUP_INTEL
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
 
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS		4
+#include <asm/perf_event.h>
+
+#include "perf_event.h"
 
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE		24
@@ -37,24 +40,7 @@
 	u64 status, dla, dse, lat;
 };
 
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-	u64	bts_buffer_base;
-	u64	bts_index;
-	u64	bts_absolute_maximum;
-	u64	bts_interrupt_threshold;
-	u64	pebs_buffer_base;
-	u64	pebs_index;
-	u64	pebs_absolute_maximum;
-	u64	pebs_interrupt_threshold;
-	u64	pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
-static void init_debug_store_on_cpu(int cpu)
+void init_debug_store_on_cpu(int cpu)
 {
 	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 
@@ -66,7 +52,7 @@
 		     (u32)((u64)(unsigned long)ds >> 32));
 }
 
-static void fini_debug_store_on_cpu(int cpu)
+void fini_debug_store_on_cpu(int cpu)
 {
 	if (!per_cpu(cpu_hw_events, cpu).ds)
 		return;
@@ -175,7 +161,7 @@
 	kfree(ds);
 }
 
-static void release_ds_buffers(void)
+void release_ds_buffers(void)
 {
 	int cpu;
 
@@ -194,7 +180,7 @@
 	put_online_cpus();
 }
 
-static void reserve_ds_buffers(void)
+void reserve_ds_buffers(void)
 {
 	int bts_err = 0, pebs_err = 0;
 	int cpu;
@@ -260,10 +246,10 @@
  * BTS
  */
 
-static struct event_constraint bts_constraint =
+struct event_constraint bts_constraint =
 	EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
 
-static void intel_pmu_enable_bts(u64 config)
+void intel_pmu_enable_bts(u64 config)
 {
 	unsigned long debugctlmsr;
 
@@ -282,7 +268,7 @@
 	update_debugctlmsr(debugctlmsr);
 }
 
-static void intel_pmu_disable_bts(void)
+void intel_pmu_disable_bts(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	unsigned long debugctlmsr;
@@ -299,7 +285,7 @@
 	update_debugctlmsr(debugctlmsr);
 }
 
-static int intel_pmu_drain_bts_buffer(void)
+int intel_pmu_drain_bts_buffer(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
@@ -361,7 +347,7 @@
 /*
  * PEBS
  */
-static struct event_constraint intel_core2_pebs_event_constraints[] = {
+struct event_constraint intel_core2_pebs_event_constraints[] = {
 	INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
 	INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
 	INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
@@ -370,14 +356,14 @@
 	EVENT_CONSTRAINT_END
 };
 
-static struct event_constraint intel_atom_pebs_event_constraints[] = {
+struct event_constraint intel_atom_pebs_event_constraints[] = {
 	INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
 	INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
 	INTEL_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
 	EVENT_CONSTRAINT_END
 };
 
-static struct event_constraint intel_nehalem_pebs_event_constraints[] = {
+struct event_constraint intel_nehalem_pebs_event_constraints[] = {
 	INTEL_EVENT_CONSTRAINT(0x0b, 0xf),    /* MEM_INST_RETIRED.* */
 	INTEL_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
 	INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
@@ -392,7 +378,7 @@
 	EVENT_CONSTRAINT_END
 };
 
-static struct event_constraint intel_westmere_pebs_event_constraints[] = {
+struct event_constraint intel_westmere_pebs_event_constraints[] = {
 	INTEL_EVENT_CONSTRAINT(0x0b, 0xf),    /* MEM_INST_RETIRED.* */
 	INTEL_EVENT_CONSTRAINT(0x0f, 0xf),    /* MEM_UNCORE_RETIRED.* */
 	INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
@@ -407,7 +393,7 @@
 	EVENT_CONSTRAINT_END
 };
 
-static struct event_constraint intel_snb_pebs_events[] = {
+struct event_constraint intel_snb_pebs_event_constraints[] = {
 	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
 	INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
 	INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
@@ -428,8 +414,7 @@
 	EVENT_CONSTRAINT_END
 };
 
-static struct event_constraint *
-intel_pebs_constraints(struct perf_event *event)
+struct event_constraint *intel_pebs_constraints(struct perf_event *event)
 {
 	struct event_constraint *c;
 
@@ -446,7 +431,7 @@
 	return &emptyconstraint;
 }
 
-static void intel_pmu_pebs_enable(struct perf_event *event)
+void intel_pmu_pebs_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
@@ -460,7 +445,7 @@
 		intel_pmu_lbr_enable(event);
 }
 
-static void intel_pmu_pebs_disable(struct perf_event *event)
+void intel_pmu_pebs_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
@@ -475,7 +460,7 @@
 		intel_pmu_lbr_disable(event);
 }
 
-static void intel_pmu_pebs_enable_all(void)
+void intel_pmu_pebs_enable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -483,7 +468,7 @@
 		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 }
 
-static void intel_pmu_pebs_disable_all(void)
+void intel_pmu_pebs_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -576,8 +561,6 @@
 	return 0;
 }
 
-static int intel_pmu_save_and_restart(struct perf_event *event);
-
 static void __intel_pmu_pebs_event(struct perf_event *event,
 				   struct pt_regs *iregs, void *__pebs)
 {
@@ -716,7 +699,7 @@
  * BTS, PEBS probe and setup
  */
 
-static void intel_ds_init(void)
+void intel_ds_init(void)
 {
 	/*
 	 * No support for 32bit formats
@@ -749,15 +732,3 @@
 		}
 	}
 }
-
-#else /* CONFIG_CPU_SUP_INTEL */
-
-static void reserve_ds_buffers(void)
-{
-}
-
-static void release_ds_buffers(void)
-{
-}
-
-#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index d202c1b..3fab3de 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -1,4 +1,10 @@
-#ifdef CONFIG_CPU_SUP_INTEL
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+
+#include "perf_event.h"
 
 enum {
 	LBR_FORMAT_32		= 0x00,
@@ -48,7 +54,7 @@
 	}
 }
 
-static void intel_pmu_lbr_reset(void)
+void intel_pmu_lbr_reset(void)
 {
 	if (!x86_pmu.lbr_nr)
 		return;
@@ -59,7 +65,7 @@
 		intel_pmu_lbr_reset_64();
 }
 
-static void intel_pmu_lbr_enable(struct perf_event *event)
+void intel_pmu_lbr_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -81,7 +87,7 @@
 	cpuc->lbr_users++;
 }
 
-static void intel_pmu_lbr_disable(struct perf_event *event)
+void intel_pmu_lbr_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -95,7 +101,7 @@
 		__intel_pmu_lbr_disable();
 }
 
-static void intel_pmu_lbr_enable_all(void)
+void intel_pmu_lbr_enable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -103,7 +109,7 @@
 		__intel_pmu_lbr_enable();
 }
 
-static void intel_pmu_lbr_disable_all(void)
+void intel_pmu_lbr_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -178,7 +184,7 @@
 	cpuc->lbr_stack.nr = i;
 }
 
-static void intel_pmu_lbr_read(void)
+void intel_pmu_lbr_read(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -191,7 +197,7 @@
 		intel_pmu_lbr_read_64(cpuc);
 }
 
-static void intel_pmu_lbr_init_core(void)
+void intel_pmu_lbr_init_core(void)
 {
 	x86_pmu.lbr_nr     = 4;
 	x86_pmu.lbr_tos    = 0x01c9;
@@ -199,7 +205,7 @@
 	x86_pmu.lbr_to     = 0x60;
 }
 
-static void intel_pmu_lbr_init_nhm(void)
+void intel_pmu_lbr_init_nhm(void)
 {
 	x86_pmu.lbr_nr     = 16;
 	x86_pmu.lbr_tos    = 0x01c9;
@@ -207,12 +213,10 @@
 	x86_pmu.lbr_to     = 0x6c0;
 }
 
-static void intel_pmu_lbr_init_atom(void)
+void intel_pmu_lbr_init_atom(void)
 {
 	x86_pmu.lbr_nr	   = 8;
 	x86_pmu.lbr_tos    = 0x01c9;
 	x86_pmu.lbr_from   = 0x40;
 	x86_pmu.lbr_to     = 0x60;
 }
-
-#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 7809d2b..492bf13 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -7,9 +7,13 @@
  *  For licencing details see kernel-base/COPYING
  */
 
-#ifdef CONFIG_CPU_SUP_INTEL
+#include <linux/perf_event.h>
 
 #include <asm/perf_event_p4.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "perf_event.h"
 
 #define P4_CNTR_LIMIT 3
 /*
@@ -1303,7 +1307,7 @@
 	.perfctr_second_write	= 1,
 };
 
-static __init int p4_pmu_init(void)
+__init int p4_pmu_init(void)
 {
 	unsigned int low, high;
 
@@ -1326,5 +1330,3 @@
 
 	return 0;
 }
-
-#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 20c097e..c7181be 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -1,4 +1,7 @@
-#ifdef CONFIG_CPU_SUP_INTEL
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include "perf_event.h"
 
 /*
  * Not sure about some of these
@@ -114,7 +117,7 @@
 	.event_constraints	= p6_event_constraints,
 };
 
-static __init int p6_pmu_init(void)
+__init int p6_pmu_init(void)
 {
 	switch (boot_cpu_data.x86_model) {
 	case 1:
@@ -138,5 +141,3 @@
 
 	return 0;
 }
-
-#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 764c7c2..13ad899 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -32,15 +32,12 @@
 
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 
-static void kdump_nmi_callback(int cpu, struct die_args *args)
+static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
 {
-	struct pt_regs *regs;
 #ifdef CONFIG_X86_32
 	struct pt_regs fixed_regs;
 #endif
 
-	regs = args->regs;
-
 #ifdef CONFIG_X86_32
 	if (!user_mode_vm(regs)) {
 		crash_fixup_ss_esp(&fixed_regs, regs);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 6419bb0..faf8d5e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -331,10 +331,15 @@
 1:	incl PER_CPU_VAR(irq_count)
 	jne 2f
 	mov PER_CPU_VAR(irq_stack_ptr),%rsp
-	EMPTY_FRAME 0
+	CFI_DEF_CFA_REGISTER	rsi
 
 2:	/* Store previous stack value */
 	pushq %rsi
+	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
+			0x77 /* DW_OP_breg7 */, 0, \
+			0x06 /* DW_OP_deref */, \
+			0x08 /* DW_OP_const1u */, SS+8-RBP, \
+			0x22 /* DW_OP_plus */
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
 	.endm
@@ -788,7 +793,6 @@
 	subq $ORIG_RAX-RBP, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
 	SAVE_ARGS_IRQ
-	PARTIAL_FRAME 0
 	call \func
 	.endm
 
@@ -813,10 +817,10 @@
 
 	/* Restore saved previous stack */
 	popq %rsi
-	leaq 16(%rsi), %rsp
-
+	CFI_DEF_CFA_REGISTER	rsi
+	leaq ARGOFFSET-RBP(%rsi), %rsp
 	CFI_DEF_CFA_REGISTER	rsp
-	CFI_ADJUST_CFA_OFFSET	-16
+	CFI_ADJUST_CFA_OFFSET	RBP-ARGOFFSET
 
 exit_intr:
 	GET_THREAD_INFO(%rcx)
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 3fee346..cacdd46 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -42,7 +42,7 @@
 	put_online_cpus();
 }
 
-void arch_jump_label_text_poke_early(jump_label_t addr)
+void __init_or_module arch_jump_label_text_poke_early(jump_label_t addr)
 {
 	text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5],
 			JUMP_LABEL_NOP_SIZE);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 00354d4..faba577 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -511,28 +511,37 @@
 
 static int was_in_debug_nmi[NR_CPUS];
 
-static int __kgdb_notify(struct die_args *args, unsigned long cmd)
+static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 {
-	struct pt_regs *regs = args->regs;
-
 	switch (cmd) {
-	case DIE_NMI:
+	case NMI_LOCAL:
 		if (atomic_read(&kgdb_active) != -1) {
 			/* KGDB CPU roundup */
 			kgdb_nmicallback(raw_smp_processor_id(), regs);
 			was_in_debug_nmi[raw_smp_processor_id()] = 1;
 			touch_nmi_watchdog();
-			return NOTIFY_STOP;
+			return NMI_HANDLED;
 		}
-		return NOTIFY_DONE;
+		break;
 
-	case DIE_NMIUNKNOWN:
+	case NMI_UNKNOWN:
 		if (was_in_debug_nmi[raw_smp_processor_id()]) {
 			was_in_debug_nmi[raw_smp_processor_id()] = 0;
-			return NOTIFY_STOP;
+			return NMI_HANDLED;
 		}
-		return NOTIFY_DONE;
+		break;
+	default:
+		/* do nothing */
+		break;
+	}
+	return NMI_DONE;
+}
 
+static int __kgdb_notify(struct die_args *args, unsigned long cmd)
+{
+	struct pt_regs *regs = args->regs;
+
+	switch (cmd) {
 	case DIE_DEBUG:
 		if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
 			if (user_mode(regs))
@@ -590,11 +599,6 @@
 
 static struct notifier_block kgdb_notifier = {
 	.notifier_call	= kgdb_notify,
-
-	/*
-	 * Lowest-prio notifier priority, we want to be notified last:
-	 */
-	.priority	= NMI_LOCAL_LOW_PRIOR,
 };
 
 /**
@@ -605,7 +609,31 @@
  */
 int kgdb_arch_init(void)
 {
-	return register_die_notifier(&kgdb_notifier);
+	int retval;
+
+	retval = register_die_notifier(&kgdb_notifier);
+	if (retval)
+		goto out;
+
+	retval = register_nmi_handler(NMI_LOCAL, kgdb_nmi_handler,
+					0, "kgdb");
+	if (retval)
+		goto out1;
+
+	retval = register_nmi_handler(NMI_UNKNOWN, kgdb_nmi_handler,
+					0, "kgdb");
+
+	if (retval)
+		goto out2;
+
+	return retval;
+
+out2:
+	unregister_nmi_handler(NMI_LOCAL, "kgdb");
+out1:
+	unregister_die_notifier(&kgdb_notifier);
+out:
+	return retval;
 }
 
 static void kgdb_hw_overflow_handler(struct perf_event *event,
@@ -673,6 +701,8 @@
 			breakinfo[i].pev = NULL;
 		}
 	}
+	unregister_nmi_handler(NMI_UNKNOWN, "kgdb");
+	unregister_nmi_handler(NMI_LOCAL, "kgdb");
 	unregister_die_notifier(&kgdb_notifier);
 }
 
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 794bc95..7da647d 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -75,10 +75,11 @@
 	/*
 	 * Undefined/reserved opcodes, conditional jump, Opcode Extension
 	 * Groups, and some special opcodes can not boost.
-	 * This is non-const to keep gcc from statically optimizing it out, as
-	 * variable_test_bit makes gcc think only *(unsigned long*) is used.
+	 * This is non-const and volatile to keep gcc from statically
+	 * optimizing it out, as variable_test_bit makes gcc think only
+	 * *(unsigned long*) is used.
 	 */
-static u32 twobyte_is_boostable[256 / 32] = {
+static volatile u32 twobyte_is_boostable[256 / 32] = {
 	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
 	/*      ----------------------------------------------          */
 	W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
new file mode 100644
index 0000000..7ec5bd1
--- /dev/null
+++ b/arch/x86/kernel/nmi.c
@@ -0,0 +1,433 @@
+/*
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ *  Copyright (C) 2011	Don Zickus Red Hat, Inc.
+ *
+ *  Pentium III FXSR, SSE support
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * Handle hardware traps and faults.
+ */
+#include <linux/spinlock.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/nmi.h>
+#include <linux/delay.h>
+#include <linux/hardirq.h>
+#include <linux/slab.h>
+
+#include <linux/mca.h>
+
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
+#include <linux/atomic.h>
+#include <asm/traps.h>
+#include <asm/mach_traps.h>
+#include <asm/nmi.h>
+
+#define NMI_MAX_NAMELEN	16
+struct nmiaction {
+	struct list_head list;
+	nmi_handler_t handler;
+	unsigned int flags;
+	char *name;
+};
+
+struct nmi_desc {
+	spinlock_t lock;
+	struct list_head head;
+};
+
+static struct nmi_desc nmi_desc[NMI_MAX] =
+{
+	{
+		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
+		.head = LIST_HEAD_INIT(nmi_desc[0].head),
+	},
+	{
+		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
+		.head = LIST_HEAD_INIT(nmi_desc[1].head),
+	},
+
+};
+
+struct nmi_stats {
+	unsigned int normal;
+	unsigned int unknown;
+	unsigned int external;
+	unsigned int swallow;
+};
+
+static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
+
+static int ignore_nmis;
+
+int unknown_nmi_panic;
+/*
+ * Prevent NMI reason port (0x61) being accessed simultaneously, can
+ * only be used in NMI handler.
+ */
+static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
+
+static int __init setup_unknown_nmi_panic(char *str)
+{
+	unknown_nmi_panic = 1;
+	return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
+#define nmi_to_desc(type) (&nmi_desc[type])
+
+static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+{
+	struct nmi_desc *desc = nmi_to_desc(type);
+	struct nmiaction *a;
+	int handled = 0;
+
+	rcu_read_lock();
+
+	/*
+	 * NMIs are edge-triggered, which means if you have enough
+	 * of them concurrently, you can lose some because only one
+	 * can be latched at any given time.  Walk the whole list
+	 * to handle those situations.
+	 */
+	list_for_each_entry_rcu(a, &desc->head, list)
+		handled += a->handler(type, regs);
+
+	rcu_read_unlock();
+
+	/* return total number of NMI events handled */
+	return handled;
+}
+
+static int __setup_nmi(unsigned int type, struct nmiaction *action)
+{
+	struct nmi_desc *desc = nmi_to_desc(type);
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+
+	/*
+	 * Most handlers of type NMI_UNKNOWN never return because
+	 * they just assume the NMI is theirs.  This is just a sanity
+	 * check to manage expectations.
+	 */
+	WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
+
+	/*
+	 * Some handlers need to be executed first, otherwise a fake
+	 * event confuses some handlers (kdump uses this flag).
+	 */
+	if (action->flags & NMI_FLAG_FIRST)
+		list_add_rcu(&action->list, &desc->head);
+	else
+		list_add_tail_rcu(&action->list, &desc->head);
+
+	spin_unlock_irqrestore(&desc->lock, flags);
+	return 0;
+}
+
+static struct nmiaction *__free_nmi(unsigned int type, const char *name)
+{
+	struct nmi_desc *desc = nmi_to_desc(type);
+	struct nmiaction *n;
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+
+	list_for_each_entry_rcu(n, &desc->head, list) {
+		/*
+		 * the name passed in to describe the nmi handler
+		 * is used as the lookup key
+		 */
+		if (!strcmp(n->name, name)) {
+			WARN(in_nmi(),
+				"Trying to free NMI (%s) from NMI context!\n", n->name);
+			list_del_rcu(&n->list);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&desc->lock, flags);
+	synchronize_rcu();
+	return (n);
+}
+
+int register_nmi_handler(unsigned int type, nmi_handler_t handler,
+			unsigned long nmiflags, const char *devname)
+{
+	struct nmiaction *action;
+	int retval = -ENOMEM;
+
+	if (!handler)
+		return -EINVAL;
+
+	action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL);
+	if (!action)
+		goto fail_action;
+
+	action->handler = handler;
+	action->flags = nmiflags;
+	action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL);
+	if (!action->name)
+		goto fail_action_name;
+
+	retval = __setup_nmi(type, action);
+
+	if (retval)
+		goto fail_setup_nmi;
+
+	return retval;
+
+fail_setup_nmi:
+	kfree(action->name);
+fail_action_name:
+	kfree(action);
+fail_action:
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(register_nmi_handler);
+
+void unregister_nmi_handler(unsigned int type, const char *name)
+{
+	struct nmiaction *a;
+
+	a = __free_nmi(type, name);
+	if (a) {
+		kfree(a->name);
+		kfree(a);
+	}
+}
+
+EXPORT_SYMBOL_GPL(unregister_nmi_handler);
+
+static notrace __kprobes void
+pci_serr_error(unsigned char reason, struct pt_regs *regs)
+{
+	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
+		 reason, smp_processor_id());
+
+	/*
+	 * On some machines, PCI SERR line is used to report memory
+	 * errors. EDAC makes use of it.
+	 */
+#if defined(CONFIG_EDAC)
+	if (edac_handler_set()) {
+		edac_atomic_assert_error();
+		return;
+	}
+#endif
+
+	if (panic_on_unrecovered_nmi)
+		panic("NMI: Not continuing");
+
+	pr_emerg("Dazed and confused, but trying to continue\n");
+
+	/* Clear and disable the PCI SERR error line. */
+	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
+	outb(reason, NMI_REASON_PORT);
+}
+
+static notrace __kprobes void
+io_check_error(unsigned char reason, struct pt_regs *regs)
+{
+	unsigned long i;
+
+	pr_emerg(
+	"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
+		 reason, smp_processor_id());
+	show_registers(regs);
+
+	if (panic_on_io_nmi)
+		panic("NMI IOCK error: Not continuing");
+
+	/* Re-enable the IOCK line, wait for a few seconds */
+	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
+	outb(reason, NMI_REASON_PORT);
+
+	i = 20000;
+	while (--i) {
+		touch_nmi_watchdog();
+		udelay(100);
+	}
+
+	reason &= ~NMI_REASON_CLEAR_IOCHK;
+	outb(reason, NMI_REASON_PORT);
+}
+
+static notrace __kprobes void
+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
+{
+	int handled;
+
+	/*
+	 * Use 'false' as back-to-back NMIs are dealt with one level up.
+	 * Of course this makes having multiple 'unknown' handlers useless
+	 * as only the first one is ever run (unless it can actually determine
+	 * if it caused the NMI).
+	 */
+	handled = nmi_handle(NMI_UNKNOWN, regs, false);
+	if (handled) {
+		__this_cpu_add(nmi_stats.unknown, handled);
+		return;
+	}
+
+	__this_cpu_add(nmi_stats.unknown, 1);
+
+#ifdef CONFIG_MCA
+	/*
+	 * Might actually be able to figure out what the guilty party
+	 * is:
+	 */
+	if (MCA_bus) {
+		mca_handle_nmi();
+		return;
+	}
+#endif
+	pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+		 reason, smp_processor_id());
+
+	pr_emerg("Do you have a strange power saving mode enabled?\n");
+	if (unknown_nmi_panic || panic_on_unrecovered_nmi)
+		panic("NMI: Not continuing");
+
+	pr_emerg("Dazed and confused, but trying to continue\n");
+}
+
+static DEFINE_PER_CPU(bool, swallow_nmi);
+static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
+
+static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+{
+	unsigned char reason = 0;
+	int handled;
+	bool b2b = false;
+
+	/*
+	 * CPU-specific NMI must be processed before non-CPU-specific
+	 * NMI, otherwise we may lose it, because the CPU-specific
+	 * NMI can not be detected/processed on other CPUs.
+	 */
+
+	/*
+	 * Back-to-back NMIs are interesting because they can either
+	 * be two NMIs or more than two NMIs (anything over two is dropped
+	 * due to NMI being edge-triggered).  If this is the second half
+	 * of a back-to-back NMI, assume we dropped things and process
+	 * more handlers.  Otherwise reset the 'swallow' NMI behaviour.
+	 */
+	if (regs->ip == __this_cpu_read(last_nmi_rip))
+		b2b = true;
+	else
+		__this_cpu_write(swallow_nmi, false);
+
+	__this_cpu_write(last_nmi_rip, regs->ip);
+
+	handled = nmi_handle(NMI_LOCAL, regs, b2b);
+	__this_cpu_add(nmi_stats.normal, handled);
+	if (handled) {
+		/*
+		 * There are cases when an NMI handler handles multiple
+		 * events in the current NMI.  One of these events may
+		 * be queued for the next NMI.  Because the event is
+		 * already handled, the next NMI will result in an unknown
+		 * NMI.  Instead let's flag this as a potential NMI to
+		 * swallow.
+		 */
+		if (handled > 1)
+			__this_cpu_write(swallow_nmi, true);
+		return;
+	}
+
+	/* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
+	raw_spin_lock(&nmi_reason_lock);
+	reason = get_nmi_reason();
+
+	if (reason & NMI_REASON_MASK) {
+		if (reason & NMI_REASON_SERR)
+			pci_serr_error(reason, regs);
+		else if (reason & NMI_REASON_IOCHK)
+			io_check_error(reason, regs);
+#ifdef CONFIG_X86_32
+		/*
+		 * Reassert NMI in case it became active
+		 * meanwhile as it's edge-triggered:
+		 */
+		reassert_nmi();
+#endif
+		__this_cpu_add(nmi_stats.external, 1);
+		raw_spin_unlock(&nmi_reason_lock);
+		return;
+	}
+	raw_spin_unlock(&nmi_reason_lock);
+
+	/*
+	 * Only one NMI can be latched at a time.  To handle
+	 * this we may process multiple NMI handlers at once to
+	 * cover the case where an NMI is dropped.  The downside
+	 * to this approach is that we may process an NMI prematurely,
+	 * while its real NMI is sitting latched.  This will cause
+	 * an unknown NMI on the next run of the NMI processing.
+	 *
+	 * We tried to flag that condition above, by setting the
+	 * swallow_nmi flag when we process more than one event.
+	 * This condition is also only present on the second half
+	 * of a back-to-back NMI, so we flag that condition too.
+	 *
+	 * If both are true, we assume we already processed this
+	 * NMI previously and we swallow it.  Otherwise we reset
+	 * the logic.
+	 *
+	 * There are scenarios where we may accidentally swallow
+	 * a 'real' unknown NMI.  For example, while processing
+	 * a perf NMI another perf NMI comes in along with a
+	 * 'real' unknown NMI.  These two NMIs get combined into
+	 * one (as described above).  When the next NMI gets
+	 * processed, it will be flagged by perf as handled, but
+	 * no one will know that there was a 'real' unknown NMI sent
+	 * also.  As a result it gets swallowed.  Or if the first
+	 * perf NMI returns two events handled then the second
+	 * NMI will get eaten by the logic below, again losing a
+	 * 'real' unknown NMI.  But this is the best we can do
+	 * for now.
+	 */
+	if (b2b && __this_cpu_read(swallow_nmi))
+		__this_cpu_add(nmi_stats.swallow, 1);
+	else
+		unknown_nmi_error(reason, regs);
+}
+
+dotraplinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
+{
+	nmi_enter();
+
+	inc_irq_stat(__nmi_count);
+
+	if (!ignore_nmis)
+		default_do_nmi(regs);
+
+	nmi_exit();
+}
+
+void stop_nmi(void)
+{
+	ignore_nmis++;
+}
+
+void restart_nmi(void)
+{
+	ignore_nmis--;
+}
+
+/* reset the back-to-back NMI logic */
+void local_touch_nmi(void)
+{
+	__this_cpu_write(last_nmi_rip, 0);
+}
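
The register_nmi_handler()/unregister_nmi_handler() pair above replaces the NMI uses of the die-notifier chain, as the kgdb, reboot and oprofile conversions elsewhere in this series show. A minimal sketch of a new-style handler (the device name and ownership check are hypothetical):

	#include <asm/nmi.h>

	static int mydev_nmi_handler(unsigned int type, struct pt_regs *regs)
	{
		if (!mydev_owns_this_nmi())	/* hypothetical device-specific check */
			return NMI_DONE;	/* not ours, let the list walk continue */

		/* ... service the device ... */
		return NMI_HANDLED;
	}

	/* "mydev" is the key later passed to unregister_nmi_handler() */
	err = register_nmi_handler(NMI_LOCAL, mydev_nmi_handler, 0, "mydev");
	...
	unregister_nmi_handler(NMI_LOCAL, "mydev");

Handlers that must run before everyone else (the kdump crash callback, for instance) pass NMI_FLAG_FIRST as the flags argument.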
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e7e3b01..b9b3b1a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -49,7 +49,7 @@
 void free_thread_info(struct thread_info *ti)
 {
 	free_thread_xstate(ti->task);
-	free_pages((unsigned long)ti, get_order(THREAD_SIZE));
+	free_pages((unsigned long)ti, THREAD_ORDER);
 }
 
 void arch_task_cache_init(void)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 2196c70..795b79f 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,6 +57,7 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
+#include <asm/nmi.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -107,6 +108,7 @@
 			if (cpu_is_offline(cpu))
 				play_dead();
 
+			local_touch_nmi();
 			local_irq_disable();
 			/* Don't trace irqs off for idle */
 			stop_critical_timings();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index f693e44..3bd7e6e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,6 +51,7 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
+#include <asm/nmi.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -133,6 +134,7 @@
 			 * from here on, until they go to idle.
 			 * Otherwise, idle callbacks can misfire.
 			 */
+			local_touch_nmi();
 			local_irq_disable();
 			enter_idle();
 			/* Don't trace irqs off for idle */
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 9242436..e334be1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -464,7 +464,7 @@
 	}
 }
 
-static void vmxoff_nmi(int cpu, struct die_args *args)
+static void vmxoff_nmi(int cpu, struct pt_regs *regs)
 {
 	cpu_emergency_vmxoff();
 }
@@ -736,14 +736,10 @@
 
 static atomic_t waiting_for_crash_ipi;
 
-static int crash_nmi_callback(struct notifier_block *self,
-			unsigned long val, void *data)
+static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
 {
 	int cpu;
 
-	if (val != DIE_NMI)
-		return NOTIFY_OK;
-
 	cpu = raw_smp_processor_id();
 
 	/* Don't do anything if this handler is invoked on crashing cpu.
@@ -751,10 +747,10 @@
 	 * an NMI if system was initially booted with nmi_watchdog parameter.
 	 */
 	if (cpu == crashing_cpu)
-		return NOTIFY_STOP;
+		return NMI_HANDLED;
 	local_irq_disable();
 
-	shootdown_callback(cpu, (struct die_args *)data);
+	shootdown_callback(cpu, regs);
 
 	atomic_dec(&waiting_for_crash_ipi);
 	/* Assume hlt works */
@@ -762,7 +758,7 @@
 	for (;;)
 		cpu_relax();
 
-	return 1;
+	return NMI_HANDLED;
 }
 
 static void smp_send_nmi_allbutself(void)
@@ -770,12 +766,6 @@
 	apic->send_IPI_allbutself(NMI_VECTOR);
 }
 
-static struct notifier_block crash_nmi_nb = {
-	.notifier_call = crash_nmi_callback,
-	/* we want to be the first one called */
-	.priority = NMI_LOCAL_HIGH_PRIOR+1,
-};
-
 /* Halt all other CPUs, calling the specified function on each of them
  *
  * This function can be used to halt all other CPUs on crash
@@ -794,7 +784,8 @@
 
 	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
 	/* Would it be better to replace the trap vector here? */
-	if (register_die_notifier(&crash_nmi_nb))
+	if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
+				 NMI_FLAG_FIRST, "crash"))
 		return;		/* return what? */
 	/* Ensure the new callback function is set before sending
 	 * out the NMI
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 6913369..a8e3eb8 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -81,15 +81,6 @@
 DECLARE_BITMAP(used_vectors, NR_VECTORS);
 EXPORT_SYMBOL_GPL(used_vectors);
 
-static int ignore_nmis;
-
-int unknown_nmi_panic;
-/*
- * Prevent NMI reason port (0x61) being accessed simultaneously, can
- * only be used in NMI handler.
- */
-static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
-
 static inline void conditional_sti(struct pt_regs *regs)
 {
 	if (regs->flags & X86_EFLAGS_IF)
@@ -307,152 +298,6 @@
 	die("general protection fault", regs, error_code);
 }
 
-static int __init setup_unknown_nmi_panic(char *str)
-{
-	unknown_nmi_panic = 1;
-	return 1;
-}
-__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
-
-static notrace __kprobes void
-pci_serr_error(unsigned char reason, struct pt_regs *regs)
-{
-	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
-		 reason, smp_processor_id());
-
-	/*
-	 * On some machines, PCI SERR line is used to report memory
-	 * errors. EDAC makes use of it.
-	 */
-#if defined(CONFIG_EDAC)
-	if (edac_handler_set()) {
-		edac_atomic_assert_error();
-		return;
-	}
-#endif
-
-	if (panic_on_unrecovered_nmi)
-		panic("NMI: Not continuing");
-
-	pr_emerg("Dazed and confused, but trying to continue\n");
-
-	/* Clear and disable the PCI SERR error line. */
-	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
-	outb(reason, NMI_REASON_PORT);
-}
-
-static notrace __kprobes void
-io_check_error(unsigned char reason, struct pt_regs *regs)
-{
-	unsigned long i;
-
-	pr_emerg(
-	"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
-		 reason, smp_processor_id());
-	show_registers(regs);
-
-	if (panic_on_io_nmi)
-		panic("NMI IOCK error: Not continuing");
-
-	/* Re-enable the IOCK line, wait for a few seconds */
-	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
-	outb(reason, NMI_REASON_PORT);
-
-	i = 20000;
-	while (--i) {
-		touch_nmi_watchdog();
-		udelay(100);
-	}
-
-	reason &= ~NMI_REASON_CLEAR_IOCHK;
-	outb(reason, NMI_REASON_PORT);
-}
-
-static notrace __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
-{
-	if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
-			NOTIFY_STOP)
-		return;
-#ifdef CONFIG_MCA
-	/*
-	 * Might actually be able to figure out what the guilty party
-	 * is:
-	 */
-	if (MCA_bus) {
-		mca_handle_nmi();
-		return;
-	}
-#endif
-	pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
-		 reason, smp_processor_id());
-
-	pr_emerg("Do you have a strange power saving mode enabled?\n");
-	if (unknown_nmi_panic || panic_on_unrecovered_nmi)
-		panic("NMI: Not continuing");
-
-	pr_emerg("Dazed and confused, but trying to continue\n");
-}
-
-static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
-{
-	unsigned char reason = 0;
-
-	/*
-	 * CPU-specific NMI must be processed before non-CPU-specific
-	 * NMI, otherwise we may lose it, because the CPU-specific
-	 * NMI can not be detected/processed on other CPUs.
-	 */
-	if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
-		return;
-
-	/* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
-	raw_spin_lock(&nmi_reason_lock);
-	reason = get_nmi_reason();
-
-	if (reason & NMI_REASON_MASK) {
-		if (reason & NMI_REASON_SERR)
-			pci_serr_error(reason, regs);
-		else if (reason & NMI_REASON_IOCHK)
-			io_check_error(reason, regs);
-#ifdef CONFIG_X86_32
-		/*
-		 * Reassert NMI in case it became active
-		 * meanwhile as it's edge-triggered:
-		 */
-		reassert_nmi();
-#endif
-		raw_spin_unlock(&nmi_reason_lock);
-		return;
-	}
-	raw_spin_unlock(&nmi_reason_lock);
-
-	unknown_nmi_error(reason, regs);
-}
-
-dotraplinkage notrace __kprobes void
-do_nmi(struct pt_regs *regs, long error_code)
-{
-	nmi_enter();
-
-	inc_irq_stat(__nmi_count);
-
-	if (!ignore_nmis)
-		default_do_nmi(regs);
-
-	nmi_exit();
-}
-
-void stop_nmi(void)
-{
-	ignore_nmis++;
-}
-
-void restart_nmi(void)
-{
-	ignore_nmis--;
-}
-
 /* May run on IST stack. */
 dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 9f33b98..374562e 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -22,14 +22,23 @@
 #include <asm/inat.h>
 #include <asm/insn.h>
 
-#define get_next(t, insn)	\
-	({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+/* Verify next sizeof(t) bytes can be on the same instruction */
+#define validate_next(t, insn, n)	\
+	((insn)->next_byte + sizeof(t) + n - (insn)->kaddr <= MAX_INSN_SIZE)
 
-#define peek_next(t, insn)	\
-	({t r; r = *(t*)insn->next_byte; r; })
+#define __get_next(t, insn)	\
+	({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+
+#define __peek_nbyte_next(t, insn, n)	\
+	({ t r = *(t*)((insn)->next_byte + n); r; })
+
+#define get_next(t, insn)	\
+	({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); })
 
 #define peek_nbyte_next(t, insn, n)	\
-	({t r; r = *(t*)((insn)->next_byte + n); r; })
+	({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); })
+
+#define peek_next(t, insn)	peek_nbyte_next(t, insn, 0)
 
 /**
  * insn_init() - initialize struct insn
@@ -158,6 +167,8 @@
 	insn->vex_prefix.got = 1;
 
 	prefixes->got = 1;
+
+err_out:
 	return;
 }
 
@@ -208,6 +219,9 @@
 		insn->attr = 0;	/* This instruction is bad */
 end:
 	opcode->got = 1;
+
+err_out:
+	return;
 }
 
 /**
@@ -241,6 +255,9 @@
 	if (insn->x86_64 && inat_is_force64(insn->attr))
 		insn->opnd_bytes = 8;
 	modrm->got = 1;
+
+err_out:
+	return;
 }
 
 
@@ -290,6 +307,9 @@
 		}
 	}
 	insn->sib.got = 1;
+
+err_out:
+	return;
 }
 
 
@@ -351,6 +371,9 @@
 	}
 out:
 	insn->displacement.got = 1;
+
+err_out:
+	return;
 }
 
 /* Decode moffset16/32/64 */
@@ -373,6 +396,9 @@
 		break;
 	}
 	insn->moffset1.got = insn->moffset2.got = 1;
+
+err_out:
+	return;
 }
 
 /* Decode imm v32(Iz) */
@@ -389,6 +415,9 @@
 		insn->immediate.nbytes = 4;
 		break;
 	}
+
+err_out:
+	return;
 }
 
 /* Decode imm v64(Iv/Ov) */
@@ -411,6 +440,9 @@
 		break;
 	}
 	insn->immediate1.got = insn->immediate2.got = 1;
+
+err_out:
+	return;
 }
 
 /* Decode ptr16:16/32(Ap) */
@@ -432,6 +464,9 @@
 	insn->immediate2.value = get_next(unsigned short, insn);
 	insn->immediate2.nbytes = 2;
 	insn->immediate1.got = insn->immediate2.got = 1;
+
+err_out:
+	return;
 }
 
 /**
@@ -496,6 +531,9 @@
 	}
 done:
 	insn->immediate.got = 1;
+
+err_out:
+	return;
 }
 
 /**
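The insn.c change above bounds-checks every byte fetch: validate_next() verifies that the read stays within MAX_INSN_SIZE of the buffer start, and a failed check bails out through the err_out label added to each decoder function rather than reading past the end. A minimal user-space sketch of the same pattern; the buffer size, struct and names here are illustrative, and like the kernel macros it relies on GCC/clang statement expressions:

#include <stdio.h>
#include <string.h>

#define MAX_BUF 16

struct decoder {
	const unsigned char *kaddr;	/* start of the buffer */
	const unsigned char *next_byte;	/* current read position */
};

#define validate_next(t, d, n) \
	((d)->next_byte + sizeof(t) + (n) - (d)->kaddr <= MAX_BUF)

#define get_next(t, d) \
	({ if (!validate_next(t, d, 0)) goto err_out; \
	   t r = *(const t *)(d)->next_byte; (d)->next_byte += sizeof(t); r; })

static void dump_words(const unsigned char *buf)
{
	struct decoder d = { .kaddr = buf, .next_byte = buf };
	int i;

	for (i = 0; i < 10; i++) {	/* deliberately asks for too much */
		unsigned short w = get_next(unsigned short, &d);
		printf("word %d: 0x%04x\n", i, w);
	}
	return;
err_out:
	printf("stopped: next read would run past the %d-byte buffer\n",
	       MAX_BUF);
}

int main(void)
{
	unsigned char buf[MAX_BUF];

	memset(buf, 0xab, sizeof(buf));
	dump_words(buf);
	return 0;
}

Built with gcc, this prints the first eight 16-bit words and then stops as soon as the next fetch would overrun the buffer, which is exactly the failure mode the kernel macros are closing off.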
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 0d17c8c..9c7378d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -420,12 +420,14 @@
 	return 0;
 }
 
+#ifdef CONFIG_CPU_SUP_AMD
 static const char errata93_warning[] =
 KERN_ERR 
 "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
 "******* Working around it, but it may cause SEGVs or burn power.\n"
 "******* Please consider a BIOS update.\n"
 "******* Disabling USB legacy in the BIOS may also help.\n";
+#endif
 
 /*
  * No vm86 mode in 64-bit mode:
@@ -505,7 +507,11 @@
  */
 static int is_errata93(struct pt_regs *regs, unsigned long address)
 {
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
+	    || boot_cpu_data.x86 != 0xf)
+		return 0;
+
 	if (address != regs->ip)
 		return 0;
 
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 96646b3..75f9528 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -61,26 +61,15 @@
 }
 
 
-static int profile_exceptions_notify(struct notifier_block *self,
-				     unsigned long val, void *data)
+static int profile_exceptions_notify(unsigned int val, struct pt_regs *regs)
 {
-	struct die_args *args = (struct die_args *)data;
-	int ret = NOTIFY_DONE;
-
-	switch (val) {
-	case DIE_NMI:
-		if (ctr_running)
-			model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
-		else if (!nmi_enabled)
-			break;
-		else
-			model->stop(&__get_cpu_var(cpu_msrs));
-		ret = NOTIFY_STOP;
-		break;
-	default:
-		break;
-	}
-	return ret;
+	if (ctr_running)
+		model->check_ctrs(regs, &__get_cpu_var(cpu_msrs));
+	else if (!nmi_enabled)
+		return NMI_DONE;
+	else
+		model->stop(&__get_cpu_var(cpu_msrs));
+	return NMI_HANDLED;
 }
 
 static void nmi_cpu_save_registers(struct op_msrs *msrs)
@@ -363,12 +352,6 @@
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 }
 
-static struct notifier_block profile_exceptions_nb = {
-	.notifier_call = profile_exceptions_notify,
-	.next = NULL,
-	.priority = NMI_LOCAL_LOW_PRIOR,
-};
-
 static void nmi_cpu_restore_registers(struct op_msrs *msrs)
 {
 	struct op_msr *counters = msrs->counters;
@@ -402,8 +385,6 @@
 	apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
 	apic_write(APIC_LVTERR, v);
 	nmi_cpu_restore_registers(msrs);
-	if (model->cpu_down)
-		model->cpu_down();
 }
 
 static void nmi_cpu_up(void *dummy)
@@ -508,7 +489,8 @@
 	ctr_running = 0;
 	/* make variables visible to the nmi handler: */
 	smp_mb();
-	err = register_die_notifier(&profile_exceptions_nb);
+	err = register_nmi_handler(NMI_LOCAL, profile_exceptions_notify,
+					0, "oprofile");
 	if (err)
 		goto fail;
 
@@ -538,7 +520,7 @@
 	put_online_cpus();
 	/* make variables visible to the nmi handler: */
 	smp_mb();
-	unregister_die_notifier(&profile_exceptions_nb);
+	unregister_nmi_handler(NMI_LOCAL, "oprofile");
 	msrs = &get_cpu_var(cpu_msrs);
 	model->shutdown(msrs);
 	free_msrs();
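This is the first of several conversions in this series away from the DIE_NMI die-notifier chain to the dedicated x86 NMI handler API: a handler takes (type, regs), returns NMI_HANDLED if it consumed the NMI or NMI_DONE otherwise, and is registered and unregistered by name on a per-type list. A sketch of the registration pattern as a stand-alone x86 module; the module, its name and the counter are illustrative, only the API calls mirror the ones used in this diff:

#include <linux/module.h>
#include <linux/atomic.h>
#include <linux/ptrace.h>
#include <asm/nmi.h>

static atomic_t example_nmi_count = ATOMIC_INIT(0);

static int example_nmi_handler(unsigned int type, struct pt_regs *regs)
{
	/* A real handler would check whether the NMI is really its own;
	 * this one just counts and lets the rest of the NMI_LOCAL list
	 * see the event too. */
	atomic_inc(&example_nmi_count);
	return NMI_DONE;
}

static int __init example_init(void)
{
	/* 0 = default flags; "nmi-example" is the name used later to
	 * unregister the handler */
	return register_nmi_handler(NMI_LOCAL, example_nmi_handler,
				    0, "nmi-example");
}

static void __exit example_exit(void)
{
	unregister_nmi_handler(NMI_LOCAL, "nmi-example");
	pr_info("nmi-example saw %d NMIs\n",
		atomic_read(&example_nmi_count));
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

The ghes and ipmi_watchdog hunks further down use the same calls; ipmi_watchdog registers on NMI_UNKNOWN rather than NMI_LOCAL so it only sees NMIs that no other handler claimed, which replaces its old DIE_NMIUNKNOWN filtering.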
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
index 720bf5a..7f8052c 100644
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -18,32 +18,16 @@
 #include <asm/apic.h>
 #include <asm/ptrace.h>
 
-static int profile_timer_exceptions_notify(struct notifier_block *self,
-					   unsigned long val, void *data)
+static int profile_timer_exceptions_notify(unsigned int val, struct pt_regs *regs)
 {
-	struct die_args *args = (struct die_args *)data;
-	int ret = NOTIFY_DONE;
-
-	switch (val) {
-	case DIE_NMI:
-		oprofile_add_sample(args->regs, 0);
-		ret = NOTIFY_STOP;
-		break;
-	default:
-		break;
-	}
-	return ret;
+	oprofile_add_sample(regs, 0);
+	return NMI_HANDLED;
 }
 
-static struct notifier_block profile_timer_exceptions_nb = {
-	.notifier_call = profile_timer_exceptions_notify,
-	.next = NULL,
-	.priority = NMI_LOW_PRIOR,
-};
-
 static int timer_start(void)
 {
-	if (register_die_notifier(&profile_timer_exceptions_nb))
+	if (register_nmi_handler(NMI_LOCAL, profile_timer_exceptions_notify,
+					0, "oprofile-timer"))
 		return 1;
 	return 0;
 }
@@ -51,7 +35,7 @@
 
 static void timer_stop(void)
 {
-	unregister_die_notifier(&profile_timer_exceptions_nb);
+	unregister_nmi_handler(NMI_LOCAL, "oprofile-timer");
 	synchronize_sched();  /* Allow already-started NMIs to complete. */
 }
 
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 9cbb710..303f086 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -29,8 +29,6 @@
 #include "op_x86_model.h"
 #include "op_counter.h"
 
-#define NUM_COUNTERS		4
-#define NUM_COUNTERS_F15H	6
 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
 #define NUM_VIRT_COUNTERS	32
 #else
@@ -70,62 +68,12 @@
 static struct ibs_state ibs_state;
 
 /*
- * IBS cpuid feature detection
- */
-
-#define IBS_CPUID_FEATURES		0x8000001b
-
-/*
- * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
- * bit 0 is used to indicate the existence of IBS.
- */
-#define IBS_CAPS_AVAIL			(1U<<0)
-#define IBS_CAPS_FETCHSAM		(1U<<1)
-#define IBS_CAPS_OPSAM			(1U<<2)
-#define IBS_CAPS_RDWROPCNT		(1U<<3)
-#define IBS_CAPS_OPCNT			(1U<<4)
-#define IBS_CAPS_BRNTRGT		(1U<<5)
-#define IBS_CAPS_OPCNTEXT		(1U<<6)
-
-#define IBS_CAPS_DEFAULT		(IBS_CAPS_AVAIL		\
-					 | IBS_CAPS_FETCHSAM	\
-					 | IBS_CAPS_OPSAM)
-
-/*
- * IBS APIC setup
- */
-#define IBSCTL				0x1cc
-#define IBSCTL_LVT_OFFSET_VALID		(1ULL<<8)
-#define IBSCTL_LVT_OFFSET_MASK		0x0F
-
-/*
  * IBS randomization macros
  */
 #define IBS_RANDOM_BITS			12
 #define IBS_RANDOM_MASK			((1ULL << IBS_RANDOM_BITS) - 1)
 #define IBS_RANDOM_MAXCNT_OFFSET	(1ULL << (IBS_RANDOM_BITS - 5))
 
-static u32 get_ibs_caps(void)
-{
-	u32 ibs_caps;
-	unsigned int max_level;
-
-	if (!boot_cpu_has(X86_FEATURE_IBS))
-		return 0;
-
-	/* check IBS cpuid feature flags */
-	max_level = cpuid_eax(0x80000000);
-	if (max_level < IBS_CPUID_FEATURES)
-		return IBS_CAPS_DEFAULT;
-
-	ibs_caps = cpuid_eax(IBS_CPUID_FEATURES);
-	if (!(ibs_caps & IBS_CAPS_AVAIL))
-		/* cpuid flags not valid */
-		return IBS_CAPS_DEFAULT;
-
-	return ibs_caps;
-}
-
 /*
  * 16-bit Linear Feedback Shift Register (LFSR)
  *
@@ -316,81 +264,6 @@
 		wrmsrl(MSR_AMD64_IBSOPCTL, 0);
 }
 
-static inline int get_eilvt(int offset)
-{
-	return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
-}
-
-static inline int put_eilvt(int offset)
-{
-	return !setup_APIC_eilvt(offset, 0, 0, 1);
-}
-
-static inline int ibs_eilvt_valid(void)
-{
-	int offset;
-	u64 val;
-	int valid = 0;
-
-	preempt_disable();
-
-	rdmsrl(MSR_AMD64_IBSCTL, val);
-	offset = val & IBSCTL_LVT_OFFSET_MASK;
-
-	if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
-		pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
-		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
-		goto out;
-	}
-
-	if (!get_eilvt(offset)) {
-		pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
-		       smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
-		goto out;
-	}
-
-	valid = 1;
-out:
-	preempt_enable();
-
-	return valid;
-}
-
-static inline int get_ibs_offset(void)
-{
-	u64 val;
-
-	rdmsrl(MSR_AMD64_IBSCTL, val);
-	if (!(val & IBSCTL_LVT_OFFSET_VALID))
-		return -EINVAL;
-
-	return val & IBSCTL_LVT_OFFSET_MASK;
-}
-
-static void setup_APIC_ibs(void)
-{
-	int offset;
-
-	offset = get_ibs_offset();
-	if (offset < 0)
-		goto failed;
-
-	if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
-		return;
-failed:
-	pr_warn("oprofile: IBS APIC setup failed on cpu #%d\n",
-		smp_processor_id());
-}
-
-static void clear_APIC_ibs(void)
-{
-	int offset;
-
-	offset = get_ibs_offset();
-	if (offset >= 0)
-		setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
-}
-
 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
 
 static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
@@ -439,7 +312,7 @@
 			goto fail;
 		}
 		/* both registers must be reserved */
-		if (num_counters == NUM_COUNTERS_F15H) {
+		if (num_counters == AMD64_NUM_COUNTERS_F15H) {
 			msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
 			msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
 		} else {
@@ -504,15 +377,6 @@
 		val |= op_x86_get_ctrl(model, &counter_config[virt]);
 		wrmsrl(msrs->controls[i].addr, val);
 	}
-
-	if (ibs_caps)
-		setup_APIC_ibs();
-}
-
-static void op_amd_cpu_shutdown(void)
-{
-	if (ibs_caps)
-		clear_APIC_ibs();
 }
 
 static int op_amd_check_ctrs(struct pt_regs * const regs,
@@ -575,86 +439,6 @@
 	op_amd_stop_ibs();
 }
 
-static int setup_ibs_ctl(int ibs_eilvt_off)
-{
-	struct pci_dev *cpu_cfg;
-	int nodes;
-	u32 value = 0;
-
-	nodes = 0;
-	cpu_cfg = NULL;
-	do {
-		cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
-					 PCI_DEVICE_ID_AMD_10H_NB_MISC,
-					 cpu_cfg);
-		if (!cpu_cfg)
-			break;
-		++nodes;
-		pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
-				       | IBSCTL_LVT_OFFSET_VALID);
-		pci_read_config_dword(cpu_cfg, IBSCTL, &value);
-		if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
-			pci_dev_put(cpu_cfg);
-			printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
-			       "IBSCTL = 0x%08x\n", value);
-			return -EINVAL;
-		}
-	} while (1);
-
-	if (!nodes) {
-		printk(KERN_DEBUG "No CPU node configured for IBS\n");
-		return -ENODEV;
-	}
-
-	return 0;
-}
-
-/*
- * This runs only on the current cpu. We try to find an LVT offset and
- * setup the local APIC. For this we must disable preemption. On
- * success we initialize all nodes with this offset. This updates then
- * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
- * the IBS interrupt vector is called from op_amd_setup_ctrs()/op_-
- * amd_cpu_shutdown() using the new offset.
- */
-static int force_ibs_eilvt_setup(void)
-{
-	int offset;
-	int ret;
-
-	preempt_disable();
-	/* find the next free available EILVT entry, skip offset 0 */
-	for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
-		if (get_eilvt(offset))
-			break;
-	}
-	preempt_enable();
-
-	if (offset == APIC_EILVT_NR_MAX) {
-		printk(KERN_DEBUG "No EILVT entry available\n");
-		return -EBUSY;
-	}
-
-	ret = setup_ibs_ctl(offset);
-	if (ret)
-		goto out;
-
-	if (!ibs_eilvt_valid()) {
-		ret = -EFAULT;
-		goto out;
-	}
-
-	pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset);
-	pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
-
-	return 0;
-out:
-	preempt_disable();
-	put_eilvt(offset);
-	preempt_enable();
-	return ret;
-}
-
 /*
  * check and reserve APIC extended interrupt LVT offset for IBS if
  * available
@@ -667,17 +451,6 @@
 	if (!ibs_caps)
 		return;
 
-	if (ibs_eilvt_valid())
-		goto out;
-
-	if (!force_ibs_eilvt_setup())
-		goto out;
-
-	/* Failed to setup ibs */
-	ibs_caps = 0;
-	return;
-
-out:
 	printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
 }
 
@@ -741,9 +514,9 @@
 	ops->create_files = setup_ibs_files;
 
 	if (boot_cpu_data.x86 == 0x15) {
-		num_counters = NUM_COUNTERS_F15H;
+		num_counters = AMD64_NUM_COUNTERS_F15H;
 	} else {
-		num_counters = NUM_COUNTERS;
+		num_counters = AMD64_NUM_COUNTERS;
 	}
 
 	op_amd_spec.num_counters = num_counters;
@@ -760,7 +533,6 @@
 	.init			= op_amd_init,
 	.fill_in_addresses	= &op_amd_fill_in_addresses,
 	.setup_ctrs		= &op_amd_setup_ctrs,
-	.cpu_down		= &op_amd_cpu_shutdown,
 	.check_ctrs		= &op_amd_check_ctrs,
 	.start			= &op_amd_start,
 	.stop			= &op_amd_stop,
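The IBS cpuid detection and the IBSCTL/EILVT APIC setup are deleted from the oprofile driver here; the remaining code relies on them being provided by shared code elsewhere in the tree, and the private NUM_COUNTERS/NUM_COUNTERS_F15H defines give way to the shared AMD64_NUM_COUNTERS constants with the same values. A trivial stand-alone sketch of the counter selection by CPU family, with the constants mirroring the values visible in the removed defines:

#include <stdio.h>

/* Values mirror the removed NUM_COUNTERS/NUM_COUNTERS_F15H defines and
 * the shared AMD64_* constants that replace them. */
#define AMD64_NUM_COUNTERS		4
#define AMD64_NUM_COUNTERS_F15H		6

static int amd_num_counters(unsigned int family)
{
	return family == 0x15 ? AMD64_NUM_COUNTERS_F15H
			      : AMD64_NUM_COUNTERS;
}

int main(void)
{
	printf("family 0x10: %d counters\n", amd_num_counters(0x10));
	printf("family 0x15: %d counters\n", amd_num_counters(0x15));
	return 0;
}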
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 94b7450..d90528e 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -28,7 +28,7 @@
 
 #define MSR_PPRO_EVENTSEL_RESERVED	((0xFFFFFFFFULL<<32)|(1ULL<<21))
 
-static u64 *reset_value;
+static u64 reset_value[OP_MAX_COUNTER];
 
 static void ppro_shutdown(struct op_msrs const * const msrs)
 {
@@ -40,10 +40,6 @@
 		release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
 		release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
 	}
-	if (reset_value) {
-		kfree(reset_value);
-		reset_value = NULL;
-	}
 }
 
 static int ppro_fill_in_addresses(struct op_msrs * const msrs)
@@ -79,13 +75,6 @@
 	u64 val;
 	int i;
 
-	if (!reset_value) {
-		reset_value = kzalloc(sizeof(reset_value[0]) * num_counters,
-					GFP_ATOMIC);
-		if (!reset_value)
-			return;
-	}
-
 	if (cpu_has_arch_perfmon) {
 		union cpuid10_eax eax;
 		eax.full = cpuid_eax(0xa);
@@ -141,13 +130,6 @@
 	u64 val;
 	int i;
 
-	/*
-	 * This can happen if perf counters are in use when
-	 * we steal the die notifier NMI.
-	 */
-	if (unlikely(!reset_value))
-		goto out;
-
 	for (i = 0; i < num_counters; ++i) {
 		if (!reset_value[i])
 			continue;
@@ -158,7 +140,6 @@
 		wrmsrl(msrs->counters[i].addr, -reset_value[i]);
 	}
 
-out:
 	/* Only P6 based Pentium M need to re-unmask the apic vector but it
 	 * doesn't hurt other P6 variant */
 	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
@@ -179,8 +160,6 @@
 	u64 val;
 	int i;
 
-	if (!reset_value)
-		return;
 	for (i = 0; i < num_counters; ++i) {
 		if (reset_value[i]) {
 			rdmsrl(msrs->controls[i].addr, val);
@@ -196,8 +175,6 @@
 	u64 val;
 	int i;
 
-	if (!reset_value)
-		return;
 	for (i = 0; i < num_counters; ++i) {
 		if (!reset_value[i])
 			continue;
@@ -242,7 +219,7 @@
 		eax.split.bit_width = 40;
 	}
 
-	num_counters = eax.split.num_counters;
+	num_counters = min((int)eax.split.num_counters, OP_MAX_COUNTER);
 
 	op_arch_perfmon_spec.num_counters = num_counters;
 	op_arch_perfmon_spec.num_controls = num_counters;
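In op_model_ppro.c the kzalloc'd reset_value buffer, which had to be NULL-checked on every NMI in case the GFP_ATOMIC allocation had failed or not happened yet, becomes a fixed OP_MAX_COUNTER-sized array, and the detected counter count is clamped with min() so it can never index past it. A tiny stand-alone sketch of that size-for-the-maximum-and-clamp pattern; the constant and the detection stub are made up:

#include <stdio.h>

/* MAX_COUNTERS and detect_counters() are illustrative stand-ins. */
#define MAX_COUNTERS 8

static unsigned long long reset_value[MAX_COUNTERS];
static int num_counters;

static int detect_counters(void)
{
	return 13;	/* pretend the hardware reports more than we support */
}

int main(void)
{
	int detected = detect_counters();
	int i;

	/* same effect as the kernel's min(detected, MAX_COUNTERS) */
	num_counters = detected < MAX_COUNTERS ? detected : MAX_COUNTERS;
	printf("using %d of %d reported counters\n", num_counters, detected);

	for (i = 0; i < num_counters; i++)
		reset_value[i] = 0;	/* always within the static array */
	return 0;
}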
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 89017fa..71e8a67 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -43,7 +43,6 @@
 	int		(*fill_in_addresses)(struct op_msrs * const msrs);
 	void		(*setup_ctrs)(struct op_x86_model_spec const *model,
 				      struct op_msrs const * const msrs);
-	void		(*cpu_down)(void);
 	int		(*check_ctrs)(struct pt_regs * const regs,
 				      struct op_msrs const * const msrs);
 	void		(*start)(struct op_msrs const * const msrs);
diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig
index 0234cd1..f932b30 100644
--- a/arch/xtensa/configs/iss_defconfig
+++ b/arch/xtensa/configs/iss_defconfig
@@ -15,7 +15,6 @@
 # CONFIG_ARCH_HAS_ILOG2_U64 is not set
 CONFIG_NO_IOPORT=y
 CONFIG_HZ=100
-CONFIG_GENERIC_TIME=y
 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
 CONFIG_CONSTRUCTORS=y
 
diff --git a/arch/xtensa/configs/s6105_defconfig b/arch/xtensa/configs/s6105_defconfig
index 4891abb..550e8ed 100644
--- a/arch/xtensa/configs/s6105_defconfig
+++ b/arch/xtensa/configs/s6105_defconfig
@@ -15,7 +15,6 @@
 # CONFIG_ARCH_HAS_ILOG2_U64 is not set
 CONFIG_NO_IOPORT=y
 CONFIG_HZ=100
-CONFIG_GENERIC_TIME=y
 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
 
 #
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index e3f4787..f0c1ce9 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -14,7 +14,6 @@
 	depends on ACPI_APEI && X86
 	select ACPI_HED
 	select IRQ_WORK
-	select LLIST
 	select GENERIC_ALLOCATOR
 	help
 	  Generic Hardware Error Source provides a way to report
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 0784f99..b8e08cb 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -50,6 +50,7 @@
 #include <acpi/hed.h>
 #include <asm/mce.h>
 #include <asm/tlbflush.h>
+#include <asm/nmi.h>
 
 #include "apei-internal.h"
 
@@ -749,15 +750,11 @@
 	}
 }
 
-static int ghes_notify_nmi(struct notifier_block *this,
-				  unsigned long cmd, void *data)
+static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
 {
 	struct ghes *ghes, *ghes_global = NULL;
 	int sev, sev_global = -1;
-	int ret = NOTIFY_DONE;
-
-	if (cmd != DIE_NMI)
-		return ret;
+	int ret = NMI_DONE;
 
 	raw_spin_lock(&ghes_nmi_lock);
 	list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
@@ -770,10 +767,10 @@
 			sev_global = sev;
 			ghes_global = ghes;
 		}
-		ret = NOTIFY_STOP;
+		ret = NMI_HANDLED;
 	}
 
-	if (ret == NOTIFY_DONE)
+	if (ret == NMI_DONE)
 		goto out;
 
 	if (sev_global >= GHES_SEV_PANIC) {
@@ -825,10 +822,6 @@
 	.notifier_call = ghes_notify_sci,
 };
 
-static struct notifier_block ghes_notifier_nmi = {
-	.notifier_call = ghes_notify_nmi,
-};
-
 static unsigned long ghes_esource_prealloc_size(
 	const struct acpi_hest_generic *generic)
 {
@@ -918,7 +911,8 @@
 		ghes_estatus_pool_expand(len);
 		mutex_lock(&ghes_list_mutex);
 		if (list_empty(&ghes_nmi))
-			register_die_notifier(&ghes_notifier_nmi);
+			register_nmi_handler(NMI_LOCAL, ghes_notify_nmi, 0,
+						"ghes");
 		list_add_rcu(&ghes->list, &ghes_nmi);
 		mutex_unlock(&ghes_list_mutex);
 		break;
@@ -964,7 +958,7 @@
 		mutex_lock(&ghes_list_mutex);
 		list_del_rcu(&ghes->list);
 		if (list_empty(&ghes_nmi))
-			unregister_die_notifier(&ghes_notifier_nmi);
+			unregister_nmi_handler(NMI_LOCAL, "ghes");
 		mutex_unlock(&ghes_list_mutex);
 		/*
 		 * To synchronize with NMI handler, ghes can only be
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 3302586..c2917ffa 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -65,6 +65,7 @@
  * mechanism for it at that time.
  */
 #include <asm/kdebug.h>
+#include <asm/nmi.h>
 #define HAVE_DIE_NMI
 #endif
 
@@ -1077,17 +1078,8 @@
 
 #ifdef HAVE_DIE_NMI
 static int
-ipmi_nmi(struct notifier_block *self, unsigned long val, void *data)
+ipmi_nmi(unsigned int val, struct pt_regs *regs)
 {
-	struct die_args *args = data;
-
-	if (val != DIE_NMIUNKNOWN)
-		return NOTIFY_OK;
-
-	/* Hack, if it's a memory or I/O error, ignore it. */
-	if (args->err & 0xc0)
-		return NOTIFY_OK;
-
 	/*
 	 * If we get here, it's an NMI that's not a memory or I/O
 	 * error.  We can't truly tell if it's from IPMI or not
@@ -1097,15 +1089,15 @@
 
 	if (testing_nmi) {
 		testing_nmi = 2;
-		return NOTIFY_STOP;
+		return NMI_HANDLED;
 	}
 
 	/* If we are not expecting a timeout, ignore it. */
 	if (ipmi_watchdog_state == WDOG_TIMEOUT_NONE)
-		return NOTIFY_OK;
+		return NMI_DONE;
 
 	if (preaction_val != WDOG_PRETIMEOUT_NMI)
-		return NOTIFY_OK;
+		return NMI_DONE;
 
 	/*
 	 * If no one else handled the NMI, we assume it was the IPMI
@@ -1120,12 +1112,8 @@
 			panic(PFX "pre-timeout");
 	}
 
-	return NOTIFY_STOP;
+	return NMI_HANDLED;
 }
-
-static struct notifier_block ipmi_nmi_handler = {
-	.notifier_call = ipmi_nmi
-};
 #endif
 
 static int wdog_reboot_handler(struct notifier_block *this,
@@ -1290,7 +1278,8 @@
 		}
 	}
 	if (do_nmi && !nmi_handler_registered) {
-		rv = register_die_notifier(&ipmi_nmi_handler);
+		rv = register_nmi_handler(NMI_UNKNOWN, ipmi_nmi, 0,
+						"ipmi");
 		if (rv) {
 			printk(KERN_WARNING PFX
 			       "Can't register nmi handler\n");
@@ -1298,7 +1287,7 @@
 		} else
 			nmi_handler_registered = 1;
 	} else if (!do_nmi && nmi_handler_registered) {
-		unregister_die_notifier(&ipmi_nmi_handler);
+		unregister_nmi_handler(NMI_UNKNOWN, "ipmi");
 		nmi_handler_registered = 0;
 	}
 #endif
@@ -1336,7 +1325,7 @@
 	if (rv) {
 #ifdef HAVE_DIE_NMI
 		if (nmi_handler_registered)
-			unregister_die_notifier(&ipmi_nmi_handler);
+			unregister_nmi_handler(NMI_UNKNOWN, "ipmi");
 #endif
 		atomic_notifier_chain_unregister(&panic_notifier_list,
 						 &wdog_panic_notifier);
@@ -1357,7 +1346,7 @@
 
 #ifdef HAVE_DIE_NMI
 	if (nmi_handler_registered)
-		unregister_die_notifier(&ipmi_nmi_handler);
+		unregister_nmi_handler(NMI_UNKNOWN, "ipmi");
 #endif
 
 	atomic_notifier_chain_unregister(&panic_notifier_list,
diff --git a/drivers/clocksource/dw_apb_timer.c b/drivers/clocksource/dw_apb_timer.c
index 580f870..8c2a35f 100644
--- a/drivers/clocksource/dw_apb_timer.c
+++ b/drivers/clocksource/dw_apb_timer.c
@@ -348,7 +348,7 @@
  * dw_apb_clocksource_register() as the next step.
  */
 struct dw_apb_clocksource *
-dw_apb_clocksource_init(unsigned rating, char *name, void __iomem *base,
+dw_apb_clocksource_init(unsigned rating, const char *name, void __iomem *base,
 			unsigned long freq)
 {
 	struct dw_apb_clocksource *dw_cs = kzalloc(sizeof(*dw_cs), GFP_KERNEL);
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 33b56e5..c97b468 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -120,10 +120,12 @@
 
 static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
 {
-	u64 idle_time = get_cpu_idle_time_us(cpu, wall);
+	u64 idle_time = get_cpu_idle_time_us(cpu, NULL);
 
 	if (idle_time == -1ULL)
 		return get_cpu_idle_time_jiffy(cpu, wall);
+	else
+		idle_time += get_cpu_iowait_time_us(cpu, wall);
 
 	return idle_time;
 }
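With this hunk (and the identical one in cpufreq_ondemand.c just below) the governors ask get_cpu_idle_time_us() for pure idle time and add the iowait time from get_cpu_iowait_time_us() on top, so time spent waiting for I/O is accounted as idle rather than busy. A rough stand-alone illustration of how that lowers the computed load; the formula approximates the governors' busy-versus-wall calculation and the numbers are made up:

#include <stdio.h>

static unsigned int load_percent(unsigned long long wall_delta,
				 unsigned long long idle_delta)
{
	if (!wall_delta || idle_delta > wall_delta)
		return 0;
	/* busy time is wall time minus whatever counts as idle */
	return (unsigned int)(100 * (wall_delta - idle_delta) / wall_delta);
}

int main(void)
{
	unsigned long long wall = 100000;	/* us elapsed in the sample */
	unsigned long long idle = 30000;	/* us truly idle */
	unsigned long long iowait = 20000;	/* us waiting on I/O */

	printf("load with iowait counted as busy: %u%%\n",
	       load_percent(wall, idle));
	printf("load with iowait counted as idle: %u%%\n",
	       load_percent(wall, idle + iowait));
	return 0;
}

Here the sample drops from 70% to 50% load once iowait counts as idle, which is why the governors ramp the frequency up less aggressively for I/O-bound workloads after this change.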
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 629b3ec..fa8af4e 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -144,10 +144,12 @@
 
 static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
 {
-	u64 idle_time = get_cpu_idle_time_us(cpu, wall);
+	u64 idle_time = get_cpu_idle_time_us(cpu, NULL);
 
 	if (idle_time == -1ULL)
 		return get_cpu_idle_time_jiffy(cpu, wall);
+	else
+		idle_time += get_cpu_iowait_time_us(cpu, wall);
 
 	return idle_time;
 }
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 0dc6546..7878712 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -29,35 +29,6 @@
 #include "md.h"
 #include "bitmap.h"
 
-/* debug macros */
-
-#define DEBUG 0
-
-#if DEBUG
-/* these are for debugging purposes only! */
-
-/* define one and only one of these */
-#define INJECT_FAULTS_1 0 /* cause bitmap_alloc_page to fail always */
-#define INJECT_FAULTS_2 0 /* cause bitmap file to be kicked when first bit set*/
-#define INJECT_FAULTS_3 0 /* treat bitmap file as kicked at init time */
-#define INJECT_FAULTS_4 0 /* undef */
-#define INJECT_FAULTS_5 0 /* undef */
-#define INJECT_FAULTS_6 0
-
-/* if these are defined, the driver will fail! debug only */
-#define INJECT_FATAL_FAULT_1 0 /* fail kmalloc, causing bitmap_create to fail */
-#define INJECT_FATAL_FAULT_2 0 /* undef */
-#define INJECT_FATAL_FAULT_3 0 /* undef */
-#endif
-
-#ifndef PRINTK
-#  if DEBUG > 0
-#    define PRINTK(x...) printk(KERN_DEBUG x)
-#  else
-#    define PRINTK(x...)
-#  endif
-#endif
-
 static inline char *bmname(struct bitmap *bitmap)
 {
 	return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
@@ -70,16 +41,12 @@
 {
 	unsigned char *page;
 
-#ifdef INJECT_FAULTS_1
-	page = NULL;
-#else
 	page = kzalloc(PAGE_SIZE, GFP_NOIO);
-#endif
 	if (!page)
 		printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
 	else
-		PRINTK("%s: bitmap_alloc_page: allocated page at %p\n",
-			bmname(bitmap), page);
+		pr_debug("%s: bitmap_alloc_page: allocated page at %p\n",
+			 bmname(bitmap), page);
 	return page;
 }
 
@@ -88,7 +55,7 @@
  */
 static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
 {
-	PRINTK("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
+	pr_debug("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
 	kfree(page);
 }
 
@@ -133,8 +100,8 @@
 	spin_lock_irq(&bitmap->lock);
 
 	if (mappage == NULL) {
-		PRINTK("%s: bitmap map page allocation failed, hijacking\n",
-			bmname(bitmap));
+		pr_debug("%s: bitmap map page allocation failed, hijacking\n",
+			 bmname(bitmap));
 		/* failed - set the hijacked flag so that we can use the
 		 * pointer as a counter */
 		if (!bitmap->bp[page].map)
@@ -187,13 +154,13 @@
  */
 
 /* IO operations when bitmap is stored near all superblocks */
-static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
+static struct page *read_sb_page(struct mddev *mddev, loff_t offset,
 				 struct page *page,
 				 unsigned long index, int size)
 {
 	/* choose a good rdev and read the page from there */
 
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	sector_t target;
 	int did_alloc = 0;
 
@@ -226,7 +193,7 @@
 
 }
 
-static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
+static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
 {
 	/* Iterate the disks of an mddev, using rcu to protect access to the
 	 * linked list, and raising the refcount of devices we return to ensure
@@ -247,7 +214,7 @@
 		pos = &rdev->same_set;
 	}
 	list_for_each_continue_rcu(pos, &mddev->disks) {
-		rdev = list_entry(pos, mdk_rdev_t, same_set);
+		rdev = list_entry(pos, struct md_rdev, same_set);
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			/* this is a usable devices */
@@ -262,9 +229,9 @@
 
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	mdk_rdev_t *rdev = NULL;
+	struct md_rdev *rdev = NULL;
 	struct block_device *bdev;
-	mddev_t *mddev = bitmap->mddev;
+	struct mddev *mddev = bitmap->mddev;
 
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
@@ -409,8 +376,8 @@
 	struct buffer_head *bh;
 	sector_t block;
 
-	PRINTK("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
-			(unsigned long long)index << PAGE_SHIFT);
+	pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
+		 (unsigned long long)index << PAGE_SHIFT);
 
 	page = alloc_page(GFP_KERNEL);
 	if (!page)
@@ -868,7 +835,8 @@
 
 enum bitmap_page_attr {
 	BITMAP_PAGE_DIRTY = 0,     /* there are set bits that need to be synced */
-	BITMAP_PAGE_CLEAN = 1,     /* there are bits that might need to be cleared */
+	BITMAP_PAGE_PENDING = 1,   /* there are bits that are being cleaned.
+				    * i.e. counter is 1 or 2. */
 	BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */
 };
 
@@ -919,7 +887,7 @@
 	else
 		__set_bit_le(bit, kaddr);
 	kunmap_atomic(kaddr, KM_USER0);
-	PRINTK("set file bit %lu page %lu\n", bit, page->index);
+	pr_debug("set file bit %lu page %lu\n", bit, page->index);
 	/* record page number so it gets flushed to disk when unplug occurs */
 	set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
 }
@@ -997,11 +965,7 @@
 
 	BUG_ON(!file && !bitmap->mddev->bitmap_info.offset);
 
-#ifdef INJECT_FAULTS_3
-	outofdate = 1;
-#else
 	outofdate = bitmap->flags & BITMAP_STALE;
-#endif
 	if (outofdate)
 		printk(KERN_INFO "%s: bitmap file is out of date, doing full "
 			"recovery\n", bmname(bitmap));
@@ -1111,7 +1075,6 @@
 					       (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
 					       needed);
 			bit_cnt++;
-			set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
 		}
 	}
 
@@ -1146,6 +1109,7 @@
 	for (i = 0; i < bitmap->file_pages; i++)
 		set_page_attr(bitmap, bitmap->filemap[i],
 			      BITMAP_PAGE_NEEDWRITE);
+	bitmap->allclean = 0;
 }
 
 static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
@@ -1164,7 +1128,7 @@
  *			out to disk
  */
 
-void bitmap_daemon_work(mddev_t *mddev)
+void bitmap_daemon_work(struct mddev *mddev)
 {
 	struct bitmap *bitmap;
 	unsigned long j;
@@ -1204,17 +1168,15 @@
 
 		if (page != lastpage) {
 			/* skip this page unless it's marked as needing cleaning */
-			if (!test_page_attr(bitmap, page, BITMAP_PAGE_CLEAN)) {
+			if (!test_page_attr(bitmap, page, BITMAP_PAGE_PENDING)) {
 				int need_write = test_page_attr(bitmap, page,
 								BITMAP_PAGE_NEEDWRITE);
 				if (need_write)
 					clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
 
 				spin_unlock_irqrestore(&bitmap->lock, flags);
-				if (need_write) {
+				if (need_write)
 					write_page(bitmap, page, 0);
-					bitmap->allclean = 0;
-				}
 				spin_lock_irqsave(&bitmap->lock, flags);
 				j |= (PAGE_BITS - 1);
 				continue;
@@ -1222,12 +1184,16 @@
 
 			/* grab the new page, sync and release the old */
 			if (lastpage != NULL) {
-				if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
-					clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
+				if (test_page_attr(bitmap, lastpage,
+						   BITMAP_PAGE_NEEDWRITE)) {
+					clear_page_attr(bitmap, lastpage,
+							BITMAP_PAGE_NEEDWRITE);
 					spin_unlock_irqrestore(&bitmap->lock, flags);
 					write_page(bitmap, lastpage, 0);
 				} else {
-					set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
+					set_page_attr(bitmap, lastpage,
+						      BITMAP_PAGE_NEEDWRITE);
+					bitmap->allclean = 0;
 					spin_unlock_irqrestore(&bitmap->lock, flags);
 				}
 			} else
@@ -1249,19 +1215,17 @@
 			}
 			spin_lock_irqsave(&bitmap->lock, flags);
 			if (!bitmap->need_sync)
-				clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
+				clear_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
+			else
+				bitmap->allclean = 0;
 		}
 		bmc = bitmap_get_counter(bitmap,
 					 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
 					 &blocks, 0);
-		if (bmc) {
-			if (*bmc)
-				bitmap->allclean = 0;
-
-			if (*bmc == 2) {
-				*bmc = 1; /* maybe clear the bit next time */
-				set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
-			} else if (*bmc == 1 && !bitmap->need_sync) {
+		if (!bmc)
+			j |= PAGE_COUNTER_MASK;
+		else if (*bmc) {
+			if (*bmc == 1 && !bitmap->need_sync) {
 				/* we can clear the bit */
 				*bmc = 0;
 				bitmap_count_page(bitmap,
@@ -1275,13 +1239,16 @@
 						  paddr);
 				else
 					__clear_bit_le(
-							file_page_offset(bitmap,
-									 j),
-							paddr);
+						file_page_offset(bitmap,
+								 j),
+						paddr);
 				kunmap_atomic(paddr, KM_USER0);
+			} else if (*bmc <= 2) {
+				*bmc = 1; /* maybe clear the bit next time */
+				set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
+				bitmap->allclean = 0;
 			}
-		} else
-			j |= PAGE_COUNTER_MASK;
+		}
 	}
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 
@@ -1294,6 +1261,7 @@
 			write_page(bitmap, lastpage, 0);
 		} else {
 			set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
+			bitmap->allclean = 0;
 			spin_unlock_irqrestore(&bitmap->lock, flags);
 		}
 	}
@@ -1359,8 +1327,8 @@
 		if (bw > bitmap->behind_writes_used)
 			bitmap->behind_writes_used = bw;
 
-		PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n",
-		       bw, bitmap->max_write_behind);
+		pr_debug("inc write-behind count %d/%lu\n",
+			 bw, bitmap->mddev->bitmap_info.max_write_behind);
 	}
 
 	while (sectors) {
@@ -1407,7 +1375,6 @@
 		else
 			sectors = 0;
 	}
-	bitmap->allclean = 0;
 	return 0;
 }
 EXPORT_SYMBOL(bitmap_startwrite);
@@ -1420,8 +1387,9 @@
 	if (behind) {
 		if (atomic_dec_and_test(&bitmap->behind_writes))
 			wake_up(&bitmap->behind_wait);
-		PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
-		  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
+		pr_debug("dec write-behind count %d/%lu\n",
+			 atomic_read(&bitmap->behind_writes),
+			 bitmap->mddev->bitmap_info.max_write_behind);
 	}
 	if (bitmap->mddev->degraded)
 		/* Never clear bits or update events_cleared when degraded */
@@ -1453,13 +1421,14 @@
 			wake_up(&bitmap->overflow_wait);
 
 		(*bmc)--;
-		if (*bmc <= 2)
+		if (*bmc <= 2) {
 			set_page_attr(bitmap,
 				      filemap_get_page(
 					      bitmap,
 					      offset >> CHUNK_BLOCK_SHIFT(bitmap)),
-				      BITMAP_PAGE_CLEAN);
-
+				      BITMAP_PAGE_PENDING);
+			bitmap->allclean = 0;
+		}
 		spin_unlock_irqrestore(&bitmap->lock, flags);
 		offset += blocks;
 		if (sectors > blocks)
@@ -1495,7 +1464,6 @@
 		}
 	}
 	spin_unlock_irq(&bitmap->lock);
-	bitmap->allclean = 0;
 	return rv;
 }
 
@@ -1543,15 +1511,16 @@
 		if (!NEEDED(*bmc) && aborted)
 			*bmc |= NEEDED_MASK;
 		else {
-			if (*bmc <= 2)
+			if (*bmc <= 2) {
 				set_page_attr(bitmap,
 					      filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
-					      BITMAP_PAGE_CLEAN);
+					      BITMAP_PAGE_PENDING);
+				bitmap->allclean = 0;
+			}
 		}
 	}
  unlock:
 	spin_unlock_irqrestore(&bitmap->lock, flags);
-	bitmap->allclean = 0;
 }
 EXPORT_SYMBOL(bitmap_end_sync);
 
@@ -1622,10 +1591,10 @@
 		*bmc = 1 | (needed ? NEEDED_MASK : 0);
 		bitmap_count_page(bitmap, offset, 1);
 		page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
-		set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
+		set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
+		bitmap->allclean = 0;
 	}
 	spin_unlock_irq(&bitmap->lock);
-	bitmap->allclean = 0;
 }
 
 /* dirty the memory and file bits for bitmap chunks "s" to "e" */
@@ -1649,7 +1618,7 @@
 /*
  * flush out any pending updates
  */
-void bitmap_flush(mddev_t *mddev)
+void bitmap_flush(struct mddev *mddev)
 {
 	struct bitmap *bitmap = mddev->bitmap;
 	long sleep;
@@ -1697,7 +1666,7 @@
 	kfree(bitmap);
 }
 
-void bitmap_destroy(mddev_t *mddev)
+void bitmap_destroy(struct mddev *mddev)
 {
 	struct bitmap *bitmap = mddev->bitmap;
 
@@ -1720,7 +1689,7 @@
  * initialize the bitmap structure
  * if this returns an error, bitmap_destroy must be called to do clean up
  */
-int bitmap_create(mddev_t *mddev)
+int bitmap_create(struct mddev *mddev)
 {
 	struct bitmap *bitmap;
 	sector_t blocks = mddev->resync_max_sectors;
@@ -1802,11 +1771,8 @@
 	bitmap->pages = pages;
 	bitmap->missing_pages = pages;
 
-#ifdef INJECT_FATAL_FAULT_1
-	bitmap->bp = NULL;
-#else
 	bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
-#endif
+
 	err = -ENOMEM;
 	if (!bitmap->bp)
 		goto error;
@@ -1824,7 +1790,7 @@
 	return err;
 }
 
-int bitmap_load(mddev_t *mddev)
+int bitmap_load(struct mddev *mddev)
 {
 	int err = 0;
 	sector_t start = 0;
@@ -1870,7 +1836,7 @@
 EXPORT_SYMBOL_GPL(bitmap_load);
 
 static ssize_t
-location_show(mddev_t *mddev, char *page)
+location_show(struct mddev *mddev, char *page)
 {
 	ssize_t len;
 	if (mddev->bitmap_info.file)
@@ -1884,7 +1850,7 @@
 }
 
 static ssize_t
-location_store(mddev_t *mddev, const char *buf, size_t len)
+location_store(struct mddev *mddev, const char *buf, size_t len)
 {
 
 	if (mddev->pers) {
@@ -1961,7 +1927,7 @@
 __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
 
 static ssize_t
-timeout_show(mddev_t *mddev, char *page)
+timeout_show(struct mddev *mddev, char *page)
 {
 	ssize_t len;
 	unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ;
@@ -1975,7 +1941,7 @@
 }
 
 static ssize_t
-timeout_store(mddev_t *mddev, const char *buf, size_t len)
+timeout_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	/* timeout can be set at any time */
 	unsigned long timeout;
@@ -2011,13 +1977,13 @@
 __ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store);
 
 static ssize_t
-backlog_show(mddev_t *mddev, char *page)
+backlog_show(struct mddev *mddev, char *page)
 {
 	return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind);
 }
 
 static ssize_t
-backlog_store(mddev_t *mddev, const char *buf, size_t len)
+backlog_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	unsigned long backlog;
 	int rv = strict_strtoul(buf, 10, &backlog);
@@ -2033,13 +1999,13 @@
 __ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store);
 
 static ssize_t
-chunksize_show(mddev_t *mddev, char *page)
+chunksize_show(struct mddev *mddev, char *page)
 {
 	return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize);
 }
 
 static ssize_t
-chunksize_store(mddev_t *mddev, const char *buf, size_t len)
+chunksize_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	/* Can only be changed when no bitmap is active */
 	int rv;
@@ -2059,13 +2025,13 @@
 static struct md_sysfs_entry bitmap_chunksize =
 __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
 
-static ssize_t metadata_show(mddev_t *mddev, char *page)
+static ssize_t metadata_show(struct mddev *mddev, char *page)
 {
 	return sprintf(page, "%s\n", (mddev->bitmap_info.external
 				      ? "external" : "internal"));
 }
 
-static ssize_t metadata_store(mddev_t *mddev, const char *buf, size_t len)
+static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	if (mddev->bitmap ||
 	    mddev->bitmap_info.file ||
@@ -2083,7 +2049,7 @@
 static struct md_sysfs_entry bitmap_metadata =
 __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
 
-static ssize_t can_clear_show(mddev_t *mddev, char *page)
+static ssize_t can_clear_show(struct mddev *mddev, char *page)
 {
 	int len;
 	if (mddev->bitmap)
@@ -2094,7 +2060,7 @@
 	return len;
 }
 
-static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len)
+static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	if (mddev->bitmap == NULL)
 		return -ENOENT;
@@ -2113,7 +2079,7 @@
 __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
 
 static ssize_t
-behind_writes_used_show(mddev_t *mddev, char *page)
+behind_writes_used_show(struct mddev *mddev, char *page)
 {
 	if (mddev->bitmap == NULL)
 		return sprintf(page, "0\n");
@@ -2122,7 +2088,7 @@
 }
 
 static ssize_t
-behind_writes_used_reset(mddev_t *mddev, const char *buf, size_t len)
+behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len)
 {
 	if (mddev->bitmap)
 		mddev->bitmap->behind_writes_used = 0;
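Besides the type renames, bitmap.c drops its private DEBUG/PRINTK scaffolding and the INJECT_FAULTS blocks in favour of plain pr_debug(), which produces no output unless DEBUG is defined for the file or dynamic debug enables the call site. A minimal sketch of turning such messages on; the module is illustrative:

#define DEBUG	/* or ccflags-y += -DDEBUG in the Makefile */
#include <linux/module.h>
#include <linux/printk.h>

static int __init prdebug_demo_init(void)
{
	/* emitted only because DEBUG is defined above, or when dynamic
	 * debug enables this call site at run time */
	pr_debug("prdebug-demo: loaded\n");
	return 0;
}

static void __exit prdebug_demo_exit(void)
{
	pr_debug("prdebug-demo: unloaded\n");
}

module_init(prdebug_demo_init);
module_exit(prdebug_demo_exit);
MODULE_LICENSE("GPL");

With CONFIG_DYNAMIC_DEBUG the same messages can instead be switched on at run time through /sys/kernel/debug/dynamic_debug/control, so no per-driver debug macros are needed.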
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index a28f2e5..a15436d 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -193,7 +193,7 @@
 	unsigned long pages; /* total number of pages in the bitmap */
 	unsigned long missing_pages; /* number of pages not yet allocated */
 
-	mddev_t *mddev; /* the md device that the bitmap is for */
+	struct mddev *mddev; /* the md device that the bitmap is for */
 
 	/* bitmap chunksize -- how much data does each bit represent? */
 	unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
@@ -238,10 +238,10 @@
 /* the bitmap API */
 
 /* these are used only by md/bitmap */
-int  bitmap_create(mddev_t *mddev);
-int bitmap_load(mddev_t *mddev);
-void bitmap_flush(mddev_t *mddev);
-void bitmap_destroy(mddev_t *mddev);
+int  bitmap_create(struct mddev *mddev);
+int bitmap_load(struct mddev *mddev);
+void bitmap_flush(struct mddev *mddev);
+void bitmap_destroy(struct mddev *mddev);
 
 void bitmap_print_sb(struct bitmap *bitmap);
 void bitmap_update_sb(struct bitmap *bitmap);
@@ -262,7 +262,7 @@
 void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
 
 void bitmap_unplug(struct bitmap *bitmap);
-void bitmap_daemon_work(mddev_t *mddev);
+void bitmap_daemon_work(struct mddev *mddev);
 #endif
 
 #endif
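The bitmap.h prototypes show the pattern applied throughout the md changes here: the mddev_t, mdk_rdev_t and mdk_personality typedefs are replaced by bare struct tags (struct mddev, struct md_rdev, struct md_personality), in line with the kernel coding style's advice against typedefs for plain structures. A before/after sketch with an illustrative structure:

#include <linux/types.h>

/* before: the structure is hidden behind a _t typedef */
typedef struct example_conf_s {
	sector_t	array_sectors;
} example_conf_t;

/* after: a plain struct tag, no typedef; callers spell out
 * "struct example_conf *conf" (see Documentation/CodingStyle,
 * chapter 5, "Typedefs") */
struct example_conf {
	sector_t	array_sectors;
};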
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 86df8b2..37a3726 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -37,7 +37,7 @@
 	 */
 	struct dm_dev *meta_dev;
 	struct dm_dev *data_dev;
-	struct mdk_rdev_s rdev;
+	struct md_rdev rdev;
 };
 
 /*
@@ -57,7 +57,7 @@
 
 	uint64_t print_flags;
 
-	struct mddev_s md;
+	struct mddev md;
 	struct raid_type *raid_type;
 	struct dm_target_callbacks callbacks;
 
@@ -594,7 +594,7 @@
 				/* Always set to 0 when writing. */
 } __packed;
 
-static int read_disk_sb(mdk_rdev_t *rdev, int size)
+static int read_disk_sb(struct md_rdev *rdev, int size)
 {
 	BUG_ON(!rdev->sb_page);
 
@@ -611,9 +611,9 @@
 	return 0;
 }
 
-static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 {
-	mdk_rdev_t *r, *t;
+	struct md_rdev *r, *t;
 	uint64_t failed_devices;
 	struct dm_raid_superblock *sb;
 
@@ -651,7 +651,7 @@
  *
  * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
  */
-static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
+static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 {
 	int ret;
 	struct dm_raid_superblock *sb;
@@ -689,7 +689,7 @@
 	return (events_sb > events_refsb) ? 1 : 0;
 }
 
-static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
+static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 {
 	int role;
 	struct raid_set *rs = container_of(mddev, struct raid_set, md);
@@ -698,7 +698,7 @@
 	struct dm_raid_superblock *sb;
 	uint32_t new_devs = 0;
 	uint32_t rebuilds = 0;
-	mdk_rdev_t *r, *t;
+	struct md_rdev *r, *t;
 	struct dm_raid_superblock *sb2;
 
 	sb = page_address(rdev->sb_page);
@@ -809,7 +809,7 @@
 	return 0;
 }
 
-static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct dm_raid_superblock *sb = page_address(rdev->sb_page);
 
@@ -849,8 +849,8 @@
 static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 {
 	int ret;
-	mdk_rdev_t *rdev, *freshest, *tmp;
-	mddev_t *mddev = &rs->md;
+	struct md_rdev *rdev, *freshest, *tmp;
+	struct mddev *mddev = &rs->md;
 
 	freshest = NULL;
 	rdev_for_each(rdev, tmp, mddev) {
@@ -1004,7 +1004,7 @@
 static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
 {
 	struct raid_set *rs = ti->private;
-	mddev_t *mddev = &rs->md;
+	struct mddev *mddev = &rs->md;
 
 	mddev->pers->make_request(mddev, bio);
 
@@ -1097,7 +1097,7 @@
 			       rs->md.bitmap_info.max_write_behind);
 
 		if (rs->print_flags & DMPF_STRIPE_CACHE) {
-			raid5_conf_t *conf = rs->md.private;
+			struct r5conf *conf = rs->md.private;
 
 			/* convert from kiB to sectors */
 			DMEMIT(" stripe_cache %d",
@@ -1146,7 +1146,7 @@
 {
 	struct raid_set *rs = ti->private;
 	unsigned chunk_size = rs->md.chunk_sectors << 9;
-	raid5_conf_t *conf = rs->md.private;
+	struct r5conf *conf = rs->md.private;
 
 	blk_limits_io_min(limits, chunk_size);
 	blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 23078da..60816b1 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -81,16 +81,16 @@
 	bio_io_error(b);
 }
 
-typedef struct faulty_conf {
+struct faulty_conf {
 	int period[Modes];
 	atomic_t counters[Modes];
 	sector_t faults[MaxFault];
 	int	modes[MaxFault];
 	int nfaults;
-	mdk_rdev_t *rdev;
-} conf_t;
+	struct md_rdev *rdev;
+};
 
-static int check_mode(conf_t *conf, int mode)
+static int check_mode(struct faulty_conf *conf, int mode)
 {
 	if (conf->period[mode] == 0 &&
 	    atomic_read(&conf->counters[mode]) <= 0)
@@ -105,7 +105,7 @@
 	return 0;
 }
 
-static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir)
+static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir)
 {
 	/* If we find a ReadFixable sector, we fix it ... */
 	int i;
@@ -129,7 +129,7 @@
 	return 0;
 }
 
-static void add_sector(conf_t *conf, sector_t start, int mode)
+static void add_sector(struct faulty_conf *conf, sector_t start, int mode)
 {
 	int i;
 	int n = conf->nfaults;
@@ -169,9 +169,9 @@
 		conf->nfaults = n+1;
 }
 
-static int make_request(mddev_t *mddev, struct bio *bio)
+static int make_request(struct mddev *mddev, struct bio *bio)
 {
-	conf_t *conf = mddev->private;
+	struct faulty_conf *conf = mddev->private;
 	int failit = 0;
 
 	if (bio_data_dir(bio) == WRITE) {
@@ -222,9 +222,9 @@
 	}
 }
 
-static void status(struct seq_file *seq, mddev_t *mddev)
+static void status(struct seq_file *seq, struct mddev *mddev)
 {
-	conf_t *conf = mddev->private;
+	struct faulty_conf *conf = mddev->private;
 	int n;
 
 	if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
@@ -255,11 +255,11 @@
 }
 
 
-static int reshape(mddev_t *mddev)
+static int reshape(struct mddev *mddev)
 {
 	int mode = mddev->new_layout & ModeMask;
 	int count = mddev->new_layout >> ModeShift;
-	conf_t *conf = mddev->private;
+	struct faulty_conf *conf = mddev->private;
 
 	if (mddev->new_layout < 0)
 		return 0;
@@ -284,7 +284,7 @@
 	return 0;
 }
 
-static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 {
 	WARN_ONCE(raid_disks,
 		  "%s does not support generic reshape\n", __func__);
@@ -295,11 +295,11 @@
 	return sectors;
 }
 
-static int run(mddev_t *mddev)
+static int run(struct mddev *mddev)
 {
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	int i;
-	conf_t *conf;
+	struct faulty_conf *conf;
 
 	if (md_check_no_bitmap(mddev))
 		return -EINVAL;
@@ -325,16 +325,16 @@
 	return 0;
 }
 
-static int stop(mddev_t *mddev)
+static int stop(struct mddev *mddev)
 {
-	conf_t *conf = mddev->private;
+	struct faulty_conf *conf = mddev->private;
 
 	kfree(conf);
 	mddev->private = NULL;
 	return 0;
 }
 
-static struct mdk_personality faulty_personality =
+static struct md_personality faulty_personality =
 {
 	.name		= "faulty",
 	.level		= LEVEL_FAULTY,
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 6cd2c31..10c5844 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -26,10 +26,10 @@
 /*
  * find which device holds a particular offset 
  */
-static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
+static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
 {
 	int lo, mid, hi;
-	linear_conf_t *conf;
+	struct linear_conf *conf;
 
 	lo = 0;
 	hi = mddev->raid_disks - 1;
@@ -63,8 +63,8 @@
 				 struct bvec_merge_data *bvm,
 				 struct bio_vec *biovec)
 {
-	mddev_t *mddev = q->queuedata;
-	dev_info_t *dev0;
+	struct mddev *mddev = q->queuedata;
+	struct dev_info *dev0;
 	unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 
@@ -89,8 +89,8 @@
 
 static int linear_congested(void *data, int bits)
 {
-	mddev_t *mddev = data;
-	linear_conf_t *conf;
+	struct mddev *mddev = data;
+	struct linear_conf *conf;
 	int i, ret = 0;
 
 	if (mddev_congested(mddev, bits))
@@ -108,9 +108,9 @@
 	return ret;
 }
 
-static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 {
-	linear_conf_t *conf;
+	struct linear_conf *conf;
 	sector_t array_sectors;
 
 	rcu_read_lock();
@@ -123,13 +123,13 @@
 	return array_sectors;
 }
 
-static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
+static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 {
-	linear_conf_t *conf;
-	mdk_rdev_t *rdev;
+	struct linear_conf *conf;
+	struct md_rdev *rdev;
 	int i, cnt;
 
-	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
+	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
 			GFP_KERNEL);
 	if (!conf)
 		return NULL;
@@ -139,7 +139,7 @@
 
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		int j = rdev->raid_disk;
-		dev_info_t *disk = conf->disks + j;
+		struct dev_info *disk = conf->disks + j;
 		sector_t sectors;
 
 		if (j < 0 || j >= raid_disks || disk->rdev) {
@@ -194,9 +194,9 @@
 	return NULL;
 }
 
-static int linear_run (mddev_t *mddev)
+static int linear_run (struct mddev *mddev)
 {
-	linear_conf_t *conf;
+	struct linear_conf *conf;
 
 	if (md_check_no_bitmap(mddev))
 		return -EINVAL;
@@ -213,7 +213,7 @@
 	return md_integrity_register(mddev);
 }
 
-static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
+static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
 {
 	/* Adding a drive to a linear array allows the array to grow.
 	 * It is permitted if the new drive has a matching superblock
@@ -223,7 +223,7 @@
 	 * The current one is never freed until the array is stopped.
 	 * This avoids races.
 	 */
-	linear_conf_t *newconf, *oldconf;
+	struct linear_conf *newconf, *oldconf;
 
 	if (rdev->saved_raid_disk != mddev->raid_disks)
 		return -EINVAL;
@@ -245,9 +245,9 @@
 	return 0;
 }
 
-static int linear_stop (mddev_t *mddev)
+static int linear_stop (struct mddev *mddev)
 {
-	linear_conf_t *conf = mddev->private;
+	struct linear_conf *conf = mddev->private;
 
 	/*
 	 * We do not require rcu protection here since
@@ -264,9 +264,9 @@
 	return 0;
 }
 
-static int linear_make_request (mddev_t *mddev, struct bio *bio)
+static int linear_make_request (struct mddev *mddev, struct bio *bio)
 {
-	dev_info_t *tmp_dev;
+	struct dev_info *tmp_dev;
 	sector_t start_sector;
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
@@ -323,14 +323,14 @@
 	return 1;
 }
 
-static void linear_status (struct seq_file *seq, mddev_t *mddev)
+static void linear_status (struct seq_file *seq, struct mddev *mddev)
 {
 
 	seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
 }
 
 
-static struct mdk_personality linear_personality =
+static struct md_personality linear_personality =
 {
 	.name		= "linear",
 	.level		= LEVEL_LINEAR,
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index 2f2da05..b685ddd 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -2,20 +2,14 @@
 #define _LINEAR_H
 
 struct dev_info {
-	mdk_rdev_t	*rdev;
+	struct md_rdev	*rdev;
 	sector_t	end_sector;
 };
 
-typedef struct dev_info dev_info_t;
-
-struct linear_private_data
+struct linear_conf
 {
 	struct rcu_head		rcu;
 	sector_t		array_sectors;
-	dev_info_t		disks[0];
+	struct dev_info		disks[0];
 };
-
-
-typedef struct linear_private_data linear_conf_t;
-
 #endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5c95ccb..266e82e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -54,9 +54,6 @@
 #include "md.h"
 #include "bitmap.h"
 
-#define DEBUG 0
-#define dprintk(x...) ((void)(DEBUG && printk(x)))
-
 #ifndef MODULE
 static void autostart_arrays(int part);
 #endif
@@ -98,13 +95,13 @@
 
 static int sysctl_speed_limit_min = 1000;
 static int sysctl_speed_limit_max = 200000;
-static inline int speed_min(mddev_t *mddev)
+static inline int speed_min(struct mddev *mddev)
 {
 	return mddev->sync_speed_min ?
 		mddev->sync_speed_min : sysctl_speed_limit_min;
 }
 
-static inline int speed_max(mddev_t *mddev)
+static inline int speed_max(struct mddev *mddev)
 {
 	return mddev->sync_speed_max ?
 		mddev->sync_speed_max : sysctl_speed_limit_max;
@@ -160,7 +157,7 @@
 
 static void mddev_bio_destructor(struct bio *bio)
 {
-	mddev_t *mddev, **mddevp;
+	struct mddev *mddev, **mddevp;
 
 	mddevp = (void*)bio;
 	mddev = mddevp[-1];
@@ -169,10 +166,10 @@
 }
 
 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
-			    mddev_t *mddev)
+			    struct mddev *mddev)
 {
 	struct bio *b;
-	mddev_t **mddevp;
+	struct mddev **mddevp;
 
 	if (!mddev || !mddev->bio_set)
 		return bio_alloc(gfp_mask, nr_iovecs);
@@ -189,10 +186,10 @@
 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
 
 struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
-			    mddev_t *mddev)
+			    struct mddev *mddev)
 {
 	struct bio *b;
-	mddev_t **mddevp;
+	struct mddev **mddevp;
 
 	if (!mddev || !mddev->bio_set)
 		return bio_clone(bio, gfp_mask);
@@ -281,7 +278,7 @@
  */
 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
 static atomic_t md_event_count;
-void md_new_event(mddev_t *mddev)
+void md_new_event(struct mddev *mddev)
 {
 	atomic_inc(&md_event_count);
 	wake_up(&md_event_waiters);
@@ -291,7 +288,7 @@
 /* Alternate version that can be called from interrupts
  * when calling sysfs_notify isn't needed.
  */
-static void md_new_event_inintr(mddev_t *mddev)
+static void md_new_event_inintr(struct mddev *mddev)
 {
 	atomic_inc(&md_event_count);
 	wake_up(&md_event_waiters);
@@ -312,19 +309,19 @@
  * Any code which breaks out of this loop while own
  * a reference to the current mddev and must mddev_put it.
  */
-#define for_each_mddev(mddev,tmp)					\
+#define for_each_mddev(_mddev,_tmp)					\
 									\
 	for (({ spin_lock(&all_mddevs_lock); 				\
-		tmp = all_mddevs.next;					\
-		mddev = NULL;});					\
-	     ({ if (tmp != &all_mddevs)					\
-			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
+		_tmp = all_mddevs.next;					\
+		_mddev = NULL;});					\
+	     ({ if (_tmp != &all_mddevs)				\
+			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
 		spin_unlock(&all_mddevs_lock);				\
-		if (mddev) mddev_put(mddev);				\
-		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
-		tmp != &all_mddevs;});					\
+		if (_mddev) mddev_put(_mddev);				\
+		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
+		_tmp != &all_mddevs;});					\
 	     ({ spin_lock(&all_mddevs_lock);				\
-		tmp = tmp->next;})					\
+		_tmp = _tmp->next;})					\
 		)
 
 
@@ -338,7 +335,7 @@
 static int md_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int rw = bio_data_dir(bio);
-	mddev_t *mddev = q->queuedata;
+	struct mddev *mddev = q->queuedata;
 	int rv;
 	int cpu;
 	unsigned int sectors;
@@ -390,7 +387,7 @@
  * Once ->stop is called and completes, the module will be completely
  * unused.
  */
-void mddev_suspend(mddev_t *mddev)
+void mddev_suspend(struct mddev *mddev)
 {
 	BUG_ON(mddev->suspended);
 	mddev->suspended = 1;
@@ -400,7 +397,7 @@
 }
 EXPORT_SYMBOL_GPL(mddev_suspend);
 
-void mddev_resume(mddev_t *mddev)
+void mddev_resume(struct mddev *mddev)
 {
 	mddev->suspended = 0;
 	wake_up(&mddev->sb_wait);
@@ -411,7 +408,7 @@
 }
 EXPORT_SYMBOL_GPL(mddev_resume);
 
-int mddev_congested(mddev_t *mddev, int bits)
+int mddev_congested(struct mddev *mddev, int bits)
 {
 	return mddev->suspended;
 }
@@ -423,8 +420,8 @@
 
 static void md_end_flush(struct bio *bio, int err)
 {
-	mdk_rdev_t *rdev = bio->bi_private;
-	mddev_t *mddev = rdev->mddev;
+	struct md_rdev *rdev = bio->bi_private;
+	struct mddev *mddev = rdev->mddev;
 
 	rdev_dec_pending(rdev, mddev);
 
@@ -439,8 +436,8 @@
 
 static void submit_flushes(struct work_struct *ws)
 {
-	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
-	mdk_rdev_t *rdev;
+	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
+	struct md_rdev *rdev;
 
 	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
 	atomic_set(&mddev->flush_pending, 1);
@@ -472,7 +469,7 @@
 
 static void md_submit_flush_data(struct work_struct *ws)
 {
-	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 	struct bio *bio = mddev->flush_bio;
 
 	if (bio->bi_size == 0)
@@ -488,7 +485,7 @@
 	wake_up(&mddev->sb_wait);
 }
 
-void md_flush_request(mddev_t *mddev, struct bio *bio)
+void md_flush_request(struct mddev *mddev, struct bio *bio)
 {
 	spin_lock_irq(&mddev->write_lock);
 	wait_event_lock_irq(mddev->sb_wait,
@@ -512,7 +509,7 @@
  */
 struct md_plug_cb {
 	struct blk_plug_cb cb;
-	mddev_t *mddev;
+	struct mddev *mddev;
 };
 
 static void plugger_unplug(struct blk_plug_cb *cb)
@@ -526,7 +523,7 @@
 /* Check that an unplug wakeup will come shortly.
  * If not, wakeup the md thread immediately
  */
-int mddev_check_plugged(mddev_t *mddev)
+int mddev_check_plugged(struct mddev *mddev)
 {
 	struct blk_plug *plug = current->plug;
 	struct md_plug_cb *mdcb;
@@ -558,7 +555,7 @@
 }
 EXPORT_SYMBOL_GPL(mddev_check_plugged);
 
-static inline mddev_t *mddev_get(mddev_t *mddev)
+static inline struct mddev *mddev_get(struct mddev *mddev)
 {
 	atomic_inc(&mddev->active);
 	return mddev;
@@ -566,7 +563,7 @@
 
 static void mddev_delayed_delete(struct work_struct *ws);
 
-static void mddev_put(mddev_t *mddev)
+static void mddev_put(struct mddev *mddev)
 {
 	struct bio_set *bs = NULL;
 
@@ -595,7 +592,7 @@
 		bioset_free(bs);
 }
 
-void mddev_init(mddev_t *mddev)
+void mddev_init(struct mddev *mddev)
 {
 	mutex_init(&mddev->open_mutex);
 	mutex_init(&mddev->reconfig_mutex);
@@ -618,9 +615,9 @@
 }
 EXPORT_SYMBOL_GPL(mddev_init);
 
-static mddev_t * mddev_find(dev_t unit)
+static struct mddev * mddev_find(dev_t unit)
 {
-	mddev_t *mddev, *new = NULL;
+	struct mddev *mddev, *new = NULL;
 
 	if (unit && MAJOR(unit) != MD_MAJOR)
 		unit &= ~((1<<MdpMinorShift)-1);
@@ -692,24 +689,24 @@
 	goto retry;
 }
 
-static inline int mddev_lock(mddev_t * mddev)
+static inline int mddev_lock(struct mddev * mddev)
 {
 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
-static inline int mddev_is_locked(mddev_t *mddev)
+static inline int mddev_is_locked(struct mddev *mddev)
 {
 	return mutex_is_locked(&mddev->reconfig_mutex);
 }
 
-static inline int mddev_trylock(mddev_t * mddev)
+static inline int mddev_trylock(struct mddev * mddev)
 {
 	return mutex_trylock(&mddev->reconfig_mutex);
 }
 
 static struct attribute_group md_redundancy_group;
 
-static void mddev_unlock(mddev_t * mddev)
+static void mddev_unlock(struct mddev * mddev)
 {
 	if (mddev->to_remove) {
 		/* These cannot be removed under reconfig_mutex as
@@ -744,17 +741,17 @@
 	} else
 		mutex_unlock(&mddev->reconfig_mutex);
 
-	/* was we've dropped the mutex we need a spinlock to
-	 * make sur the thread doesn't disappear
+	/* As we've dropped the mutex we need a spinlock to
+	 * make sure the thread doesn't disappear
 	 */
 	spin_lock(&pers_lock);
 	md_wakeup_thread(mddev->thread);
 	spin_unlock(&pers_lock);
 }
 
-static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
 {
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->desc_nr == nr)
@@ -763,9 +760,9 @@
 	return NULL;
 }
 
-static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
+static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev)
 {
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->bdev->bd_dev == dev)
@@ -774,9 +771,9 @@
 	return NULL;
 }
 
-static struct mdk_personality *find_pers(int level, char *clevel)
+static struct md_personality *find_pers(int level, char *clevel)
 {
-	struct mdk_personality *pers;
+	struct md_personality *pers;
 	list_for_each_entry(pers, &pers_list, list) {
 		if (level != LEVEL_NONE && pers->level == level)
 			return pers;
@@ -787,13 +784,13 @@
 }
 
 /* return the offset of the super block in 512byte sectors */
-static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
+static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
 {
 	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
 	return MD_NEW_SIZE_SECTORS(num_sectors);
 }
 
-static int alloc_disk_sb(mdk_rdev_t * rdev)
+static int alloc_disk_sb(struct md_rdev * rdev)
 {
 	if (rdev->sb_page)
 		MD_BUG();
@@ -807,7 +804,7 @@
 	return 0;
 }
 
-static void free_disk_sb(mdk_rdev_t * rdev)
+static void free_disk_sb(struct md_rdev * rdev)
 {
 	if (rdev->sb_page) {
 		put_page(rdev->sb_page);
@@ -825,8 +822,8 @@
 
 static void super_written(struct bio *bio, int error)
 {
-	mdk_rdev_t *rdev = bio->bi_private;
-	mddev_t *mddev = rdev->mddev;
+	struct md_rdev *rdev = bio->bi_private;
+	struct mddev *mddev = rdev->mddev;
 
 	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
 		printk("md: super_written gets error=%d, uptodate=%d\n",
@@ -840,7 +837,7 @@
 	bio_put(bio);
 }
 
-void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 		   sector_t sector, int size, struct page *page)
 {
 	/* write first size bytes of page to sector of rdev
@@ -861,7 +858,7 @@
 	submit_bio(WRITE_FLUSH_FUA, bio);
 }
 
-void md_super_wait(mddev_t *mddev)
+void md_super_wait(struct mddev *mddev)
 {
 	/* wait for all superblock writes that were scheduled to complete */
 	DEFINE_WAIT(wq);
@@ -879,7 +876,7 @@
 	complete((struct completion*)bio->bi_private);
 }
 
-int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
+int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 		 struct page *page, int rw, bool metadata_op)
 {
 	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
@@ -907,7 +904,7 @@
 }
 EXPORT_SYMBOL_GPL(sync_page_io);
 
-static int read_disk_sb(mdk_rdev_t * rdev, int size)
+static int read_disk_sb(struct md_rdev * rdev, int size)
 {
 	char b[BDEVNAME_SIZE];
 	if (!rdev->sb_page) {
@@ -1014,7 +1011,7 @@
  * We rely on user-space to write the initial superblock, and support
  * reading and updating of superblocks.
  * Interface methods are:
- *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
+ *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
  *      loads and validates a superblock on dev.
  *      if refdev != NULL, compare superblocks on both devices
  *    Return:
@@ -1024,13 +1021,13 @@
  *     -EINVAL superblock incompatible or invalid
  *     -othererror e.g. -EIO
  *
- *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
+ *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
  *      Verify that dev is acceptable into mddev.
  *       The first time, mddev->raid_disks will be 0, and data from
  *       dev should be merged in.  Subsequent calls check that dev
  *       is new enough.  Return 0 or -EINVAL
  *
- *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
+ *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
  *     Update the superblock for rdev with data in mddev
  *     This does not write to disc.
  *
@@ -1039,11 +1036,11 @@
 struct super_type  {
 	char		    *name;
 	struct module	    *owner;
-	int		    (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
+	int		    (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev,
 					  int minor_version);
-	int		    (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
-	void		    (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
-	unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
+	int		    (*validate_super)(struct mddev *mddev, struct md_rdev *rdev);
+	void		    (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
+	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
 						sector_t num_sectors);
 };
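
Illustrative aside, not part of the patch: struct super_type is the per-metadata-format operations table; the super_90_* and super_1_* functions below fill in two instances of it, and callers dispatch through super_types[] indexed by the metadata major version, as several converted call sites later in this patch show. A hedged sketch of that dispatch (the surrounding error handling is omitted):

	/* Sketch only. */
	err = super_types[mddev->major_version].load_super(rdev, refdev,
							   mddev->minor_version);
	if (err == 0)
		err = super_types[mddev->major_version].validate_super(mddev, rdev);
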
 
@@ -1055,7 +1052,7 @@
  * has a bitmap. Otherwise, it returns 0.
  *
  */
-int md_check_no_bitmap(mddev_t *mddev)
+int md_check_no_bitmap(struct mddev *mddev)
 {
 	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
 		return 0;
@@ -1068,7 +1065,7 @@
 /*
  * load_super for 0.90.0 
  */
-static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
 {
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 	mdp_super_t *sb;
@@ -1163,7 +1160,7 @@
 /*
  * validate_super for 0.90.0
  */
-static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
 {
 	mdp_disk_t *desc;
 	mdp_super_t *sb = page_address(rdev->sb_page);
@@ -1275,10 +1272,10 @@
 /*
  * sync_super for 0.90.0
  */
-static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
 {
 	mdp_super_t *sb;
-	mdk_rdev_t *rdev2;
+	struct md_rdev *rdev2;
 	int next_spare = mddev->raid_disks;
 
 
@@ -1419,7 +1416,7 @@
  * rdev_size_change for 0.90.0
  */
 static unsigned long long
-super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
+super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 {
 	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
@@ -1469,7 +1466,7 @@
 
 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
 			    int acknowledged);
-static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
 {
 	struct mdp_superblock_1 *sb;
 	int ret;
@@ -1625,7 +1622,7 @@
 	return ret;
 }
 
-static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
 	__u64 ev1 = le64_to_cpu(sb->events);
@@ -1726,10 +1723,10 @@
 	return 0;
 }
 
-static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct mdp_superblock_1 *sb;
-	mdk_rdev_t *rdev2;
+	struct md_rdev *rdev2;
 	int max_dev, i;
 	/* make rdev->sb match mddev and rdev data. */
 
@@ -1851,7 +1848,7 @@
 }
 
 static unsigned long long
-super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
+super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 {
 	struct mdp_superblock_1 *sb;
 	sector_t max_sectors;
@@ -1905,7 +1902,7 @@
 	},
 };
 
-static void sync_super(mddev_t *mddev, mdk_rdev_t *rdev)
+static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
 {
 	if (mddev->sync_super) {
 		mddev->sync_super(mddev, rdev);
@@ -1917,9 +1914,9 @@
 	super_types[mddev->major_version].sync_super(mddev, rdev);
 }
 
-static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
 {
-	mdk_rdev_t *rdev, *rdev2;
+	struct md_rdev *rdev, *rdev2;
 
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev1)
@@ -1942,9 +1939,9 @@
  * from the array. It only succeeds if all working and active component devices
  * are integrity capable with matching profiles.
  */
-int md_integrity_register(mddev_t *mddev)
+int md_integrity_register(struct mddev *mddev)
 {
-	mdk_rdev_t *rdev, *reference = NULL;
+	struct md_rdev *rdev, *reference = NULL;
 
 	if (list_empty(&mddev->disks))
 		return 0; /* nothing to do */
@@ -1989,7 +1986,7 @@
 EXPORT_SYMBOL(md_integrity_register);
 
 /* Disable data integrity if non-capable/non-matching disk is being added */
-void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
+void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
 {
 	struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
 	struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
@@ -2006,7 +2003,7 @@
 }
 EXPORT_SYMBOL(md_integrity_add_rdev);
 
-static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev)
 {
 	char b[BDEVNAME_SIZE];
 	struct kobject *ko;
@@ -2086,12 +2083,12 @@
 
 static void md_delayed_delete(struct work_struct *ws)
 {
-	mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
+	struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
 	kobject_del(&rdev->kobj);
 	kobject_put(&rdev->kobj);
 }
 
-static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+static void unbind_rdev_from_array(struct md_rdev * rdev)
 {
 	char b[BDEVNAME_SIZE];
 	if (!rdev->mddev) {
@@ -2123,14 +2120,14 @@
  * otherwise reused by a RAID array (or any other kernel
  * subsystem), by bd_claiming the device.
  */
-static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
+static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
 {
 	int err = 0;
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
 	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
-				 shared ? (mdk_rdev_t *)lock_rdev : rdev);
+				 shared ? (struct md_rdev *)lock_rdev : rdev);
 	if (IS_ERR(bdev)) {
 		printk(KERN_ERR "md: could not open %s.\n",
 			__bdevname(dev, b));
@@ -2140,7 +2137,7 @@
 	return err;
 }
 
-static void unlock_rdev(mdk_rdev_t *rdev)
+static void unlock_rdev(struct md_rdev *rdev)
 {
 	struct block_device *bdev = rdev->bdev;
 	rdev->bdev = NULL;
@@ -2151,7 +2148,7 @@
 
 void md_autodetect_dev(dev_t dev);
 
-static void export_rdev(mdk_rdev_t * rdev)
+static void export_rdev(struct md_rdev * rdev)
 {
 	char b[BDEVNAME_SIZE];
 	printk(KERN_INFO "md: export_rdev(%s)\n",
@@ -2167,15 +2164,15 @@
 	kobject_put(&rdev->kobj);
 }
 
-static void kick_rdev_from_array(mdk_rdev_t * rdev)
+static void kick_rdev_from_array(struct md_rdev * rdev)
 {
 	unbind_rdev_from_array(rdev);
 	export_rdev(rdev);
 }
 
-static void export_array(mddev_t *mddev)
+static void export_array(struct mddev *mddev)
 {
-	mdk_rdev_t *rdev, *tmp;
+	struct md_rdev *rdev, *tmp;
 
 	rdev_for_each(rdev, tmp, mddev) {
 		if (!rdev->mddev) {
@@ -2271,7 +2268,7 @@
 		);
 }
 
-static void print_rdev(mdk_rdev_t *rdev, int major_version)
+static void print_rdev(struct md_rdev *rdev, int major_version)
 {
 	char b[BDEVNAME_SIZE];
 	printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
@@ -2295,8 +2292,8 @@
 static void md_print_devices(void)
 {
 	struct list_head *tmp;
-	mdk_rdev_t *rdev;
-	mddev_t *mddev;
+	struct md_rdev *rdev;
+	struct mddev *mddev;
 	char b[BDEVNAME_SIZE];
 
 	printk("\n");
@@ -2321,7 +2318,7 @@
 }
 
 
-static void sync_sbs(mddev_t * mddev, int nospares)
+static void sync_sbs(struct mddev * mddev, int nospares)
 {
 	/* Update each superblock (in-memory image), but
 	 * if we are allowed to, skip spares which already
@@ -2329,7 +2326,7 @@
 	 * (which would mean they aren't being marked as dirty
 	 * with the rest of the array)
 	 */
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		if (rdev->sb_events == mddev->events ||
 		    (nospares &&
@@ -2344,9 +2341,9 @@
 	}
 }
 
-static void md_update_sb(mddev_t * mddev, int force_change)
+static void md_update_sb(struct mddev * mddev, int force_change)
 {
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	int sync_req;
 	int nospares = 0;
 	int any_badblocks_changed = 0;
@@ -2442,27 +2439,24 @@
 	sync_sbs(mddev, nospares);
 	spin_unlock_irq(&mddev->write_lock);
 
-	dprintk(KERN_INFO 
-		"md: updating %s RAID superblock on device (in sync %d)\n",
-		mdname(mddev),mddev->in_sync);
+	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
+		 mdname(mddev), mddev->in_sync);
 
 	bitmap_update_sb(mddev->bitmap);
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		char b[BDEVNAME_SIZE];
-		dprintk(KERN_INFO "md: ");
+
 		if (rdev->sb_loaded != 1)
 			continue; /* no noise on spare devices */
-		if (test_bit(Faulty, &rdev->flags))
-			dprintk("(skipping faulty ");
 
-		dprintk("%s ", bdevname(rdev->bdev,b));
-		if (!test_bit(Faulty, &rdev->flags)) {
+		if (!test_bit(Faulty, &rdev->flags) &&
+		    rdev->saved_raid_disk == -1) {
 			md_super_write(mddev,rdev,
 				       rdev->sb_start, rdev->sb_size,
 				       rdev->sb_page);
-			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
-				bdevname(rdev->bdev,b),
-				(unsigned long long)rdev->sb_start);
+			pr_debug("md: (write) %s's sb offset: %llu\n",
+				 bdevname(rdev->bdev, b),
+				 (unsigned long long)rdev->sb_start);
 			rdev->sb_events = mddev->events;
 			if (rdev->badblocks.size) {
 				md_super_write(mddev, rdev,
@@ -2472,8 +2466,12 @@
 				rdev->badblocks.size = 0;
 			}
 
-		} else
-			dprintk(")\n");
+		} else if (test_bit(Faulty, &rdev->flags))
+			pr_debug("md: %s (skipping faulty)\n",
+				 bdevname(rdev->bdev, b));
+		else
+			pr_debug("md: %s (skipping incremental s/r)\n", bdevname(rdev->bdev, b));
+
 		if (mddev->level == LEVEL_MULTIPATH)
 			/* only need to write one superblock... */
 			break;
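
Illustrative aside, not part of the patch: the dprintk() calls removed in this hunk were gated on a local compile-time DEBUG macro in md.c, whereas pr_debug() routes the same messages through the kernel's standard dynamic-debug machinery, so individual call sites can be enabled at run time without rebuilding md. A hedged sketch of the general pattern (the pr_fmt() prefix is an assumption for illustration; this patch keeps explicit "md: " prefixes in the strings instead):

	/* Sketch only: placed at the top of a file, before any #include,
	 * a pr_fmt() definition prefixes every pr_*() message from it.
	 */
	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	/* ...and each former dprintk() becomes a plain pr_debug() call: */
	pr_debug("updating %s superblock (in_sync=%d)\n",
		 mdname(mddev), mddev->in_sync);
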
@@ -2527,12 +2525,12 @@
 
 struct rdev_sysfs_entry {
 	struct attribute attr;
-	ssize_t (*show)(mdk_rdev_t *, char *);
-	ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
+	ssize_t (*show)(struct md_rdev *, char *);
+	ssize_t (*store)(struct md_rdev *, const char *, size_t);
 };
 
 static ssize_t
-state_show(mdk_rdev_t *rdev, char *page)
+state_show(struct md_rdev *rdev, char *page)
 {
 	char *sep = "";
 	size_t len = 0;
@@ -2568,7 +2566,7 @@
 }
 
 static ssize_t
-state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+state_store(struct md_rdev *rdev, const char *buf, size_t len)
 {
 	/* can write
 	 *  faulty  - simulates an error
@@ -2592,7 +2590,7 @@
 		if (rdev->raid_disk >= 0)
 			err = -EBUSY;
 		else {
-			mddev_t *mddev = rdev->mddev;
+			struct mddev *mddev = rdev->mddev;
 			kick_rdev_from_array(rdev);
 			if (mddev->pers)
 				md_update_sb(mddev, 1);
@@ -2641,13 +2639,13 @@
 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
 
 static ssize_t
-errors_show(mdk_rdev_t *rdev, char *page)
+errors_show(struct md_rdev *rdev, char *page)
 {
 	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
 }
 
 static ssize_t
-errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+errors_store(struct md_rdev *rdev, const char *buf, size_t len)
 {
 	char *e;
 	unsigned long n = simple_strtoul(buf, &e, 10);
@@ -2661,7 +2659,7 @@
 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
 
 static ssize_t
-slot_show(mdk_rdev_t *rdev, char *page)
+slot_show(struct md_rdev *rdev, char *page)
 {
 	if (rdev->raid_disk < 0)
 		return sprintf(page, "none\n");
@@ -2670,7 +2668,7 @@
 }
 
 static ssize_t
-slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+slot_store(struct md_rdev *rdev, const char *buf, size_t len)
 {
 	char *e;
 	int err;
@@ -2701,7 +2699,7 @@
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
 	} else if (rdev->mddev->pers) {
-		mdk_rdev_t *rdev2;
+		struct md_rdev *rdev2;
 		/* Activating a spare .. or possibly reactivating
 		 * if we ever get bitmaps working here.
 		 */
@@ -2728,6 +2726,7 @@
 			rdev->saved_raid_disk = slot;
 		else
 			rdev->saved_raid_disk = -1;
+		clear_bit(In_sync, &rdev->flags);
 		err = rdev->mddev->pers->
 			hot_add_disk(rdev->mddev, rdev);
 		if (err) {
@@ -2757,13 +2756,13 @@
 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
 
 static ssize_t
-offset_show(mdk_rdev_t *rdev, char *page)
+offset_show(struct md_rdev *rdev, char *page)
 {
 	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
 }
 
 static ssize_t
-offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+offset_store(struct md_rdev *rdev, const char *buf, size_t len)
 {
 	char *e;
 	unsigned long long offset = simple_strtoull(buf, &e, 10);
@@ -2783,7 +2782,7 @@
 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
 
 static ssize_t
-rdev_size_show(mdk_rdev_t *rdev, char *page)
+rdev_size_show(struct md_rdev *rdev, char *page)
 {
 	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
 }
@@ -2818,9 +2817,9 @@
 }
 
 static ssize_t
-rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 {
-	mddev_t *my_mddev = rdev->mddev;
+	struct mddev *my_mddev = rdev->mddev;
 	sector_t oldsectors = rdev->sectors;
 	sector_t sectors;
 
@@ -2846,13 +2845,13 @@
 		 * a deadlock.  We have already changed rdev->sectors, and if
 		 * we have to change it back, we will have the lock again.
 		 */
-		mddev_t *mddev;
+		struct mddev *mddev;
 		int overlap = 0;
 		struct list_head *tmp;
 
 		mddev_unlock(my_mddev);
 		for_each_mddev(mddev, tmp) {
-			mdk_rdev_t *rdev2;
+			struct md_rdev *rdev2;
 
 			mddev_lock(mddev);
 			list_for_each_entry(rdev2, &mddev->disks, same_set)
@@ -2889,7 +2888,7 @@
 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
 
 
-static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page)
+static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
 {
 	unsigned long long recovery_start = rdev->recovery_offset;
 
@@ -2900,7 +2899,7 @@
 	return sprintf(page, "%llu\n", recovery_start);
 }
 
-static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
 {
 	unsigned long long recovery_start;
 
@@ -2930,11 +2929,11 @@
 static ssize_t
 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
 
-static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
+static ssize_t bb_show(struct md_rdev *rdev, char *page)
 {
 	return badblocks_show(&rdev->badblocks, page, 0);
 }
-static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
+static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
 {
 	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
 	/* Maybe that ack was all we needed */
@@ -2946,11 +2945,11 @@
 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
 
 
-static ssize_t ubb_show(mdk_rdev_t *rdev, char *page)
+static ssize_t ubb_show(struct md_rdev *rdev, char *page)
 {
 	return badblocks_show(&rdev->badblocks, page, 1);
 }
-static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len)
+static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
 {
 	return badblocks_store(&rdev->badblocks, page, len, 1);
 }
@@ -2972,8 +2971,8 @@
 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 {
 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
-	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
-	mddev_t *mddev = rdev->mddev;
+	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
+	struct mddev *mddev = rdev->mddev;
 	ssize_t rv;
 
 	if (!entry->show)
@@ -2995,9 +2994,9 @@
 	      const char *page, size_t length)
 {
 	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
-	mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
+	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
 	ssize_t rv;
-	mddev_t *mddev = rdev->mddev;
+	struct mddev *mddev = rdev->mddev;
 
 	if (!entry->store)
 		return -EIO;
@@ -3016,7 +3015,7 @@
 
 static void rdev_free(struct kobject *ko)
 {
-	mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
+	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
 	kfree(rdev);
 }
 static const struct sysfs_ops rdev_sysfs_ops = {
@@ -3029,7 +3028,7 @@
 	.default_attrs	= rdev_default_attrs,
 };
 
-int md_rdev_init(mdk_rdev_t *rdev)
+int md_rdev_init(struct md_rdev *rdev)
 {
 	rdev->desc_nr = -1;
 	rdev->saved_raid_disk = -1;
@@ -3072,11 +3071,11 @@
  *
  * a faulty rdev _never_ has rdev->sb set.
  */
-static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
+static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
 {
 	char b[BDEVNAME_SIZE];
 	int err;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	sector_t size;
 
 	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
@@ -3145,10 +3144,10 @@
  */
 
 
-static void analyze_sbs(mddev_t * mddev)
+static void analyze_sbs(struct mddev * mddev)
 {
 	int i;
-	mdk_rdev_t *rdev, *freshest, *tmp;
+	struct md_rdev *rdev, *freshest, *tmp;
 	char b[BDEVNAME_SIZE];
 
 	freshest = NULL;
@@ -3248,13 +3247,13 @@
 static void md_safemode_timeout(unsigned long data);
 
 static ssize_t
-safe_delay_show(mddev_t *mddev, char *page)
+safe_delay_show(struct mddev *mddev, char *page)
 {
 	int msec = (mddev->safemode_delay*1000)/HZ;
 	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
 }
 static ssize_t
-safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
+safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
 {
 	unsigned long msec;
 
@@ -3276,9 +3275,9 @@
 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
 
 static ssize_t
-level_show(mddev_t *mddev, char *page)
+level_show(struct mddev *mddev, char *page)
 {
-	struct mdk_personality *p = mddev->pers;
+	struct md_personality *p = mddev->pers;
 	if (p)
 		return sprintf(page, "%s\n", p->name);
 	else if (mddev->clevel[0])
@@ -3290,14 +3289,14 @@
 }
 
 static ssize_t
-level_store(mddev_t *mddev, const char *buf, size_t len)
+level_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	char clevel[16];
 	ssize_t rv = len;
-	struct mdk_personality *pers;
+	struct md_personality *pers;
 	long level;
 	void *priv;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	if (mddev->pers == NULL) {
 		if (len == 0)
@@ -3471,7 +3470,7 @@
 
 
 static ssize_t
-layout_show(mddev_t *mddev, char *page)
+layout_show(struct mddev *mddev, char *page)
 {
 	/* just a number, not meaningful for all levels */
 	if (mddev->reshape_position != MaxSector &&
@@ -3482,7 +3481,7 @@
 }
 
 static ssize_t
-layout_store(mddev_t *mddev, const char *buf, size_t len)
+layout_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	char *e;
 	unsigned long n = simple_strtoul(buf, &e, 10);
@@ -3512,7 +3511,7 @@
 
 
 static ssize_t
-raid_disks_show(mddev_t *mddev, char *page)
+raid_disks_show(struct mddev *mddev, char *page)
 {
 	if (mddev->raid_disks == 0)
 		return 0;
@@ -3523,10 +3522,10 @@
 	return sprintf(page, "%d\n", mddev->raid_disks);
 }
 
-static int update_raid_disks(mddev_t *mddev, int raid_disks);
+static int update_raid_disks(struct mddev *mddev, int raid_disks);
 
 static ssize_t
-raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
+raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	char *e;
 	int rv = 0;
@@ -3549,7 +3548,7 @@
 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
 
 static ssize_t
-chunk_size_show(mddev_t *mddev, char *page)
+chunk_size_show(struct mddev *mddev, char *page)
 {
 	if (mddev->reshape_position != MaxSector &&
 	    mddev->chunk_sectors != mddev->new_chunk_sectors)
@@ -3560,7 +3559,7 @@
 }
 
 static ssize_t
-chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
+chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	char *e;
 	unsigned long n = simple_strtoul(buf, &e, 10);
@@ -3589,7 +3588,7 @@
 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
 
 static ssize_t
-resync_start_show(mddev_t *mddev, char *page)
+resync_start_show(struct mddev *mddev, char *page)
 {
 	if (mddev->recovery_cp == MaxSector)
 		return sprintf(page, "none\n");
@@ -3597,7 +3596,7 @@
 }
 
 static ssize_t
-resync_start_store(mddev_t *mddev, const char *buf, size_t len)
+resync_start_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	char *e;
 	unsigned long long n = simple_strtoull(buf, &e, 10);
@@ -3667,7 +3666,7 @@
 }
 
 static ssize_t
-array_state_show(mddev_t *mddev, char *page)
+array_state_show(struct mddev *mddev, char *page)
 {
 	enum array_state st = inactive;
 
@@ -3700,13 +3699,13 @@
 	return sprintf(page, "%s\n", array_states[st]);
 }
 
-static int do_md_stop(mddev_t * mddev, int ro, int is_open);
-static int md_set_readonly(mddev_t * mddev, int is_open);
-static int do_md_run(mddev_t * mddev);
-static int restart_array(mddev_t *mddev);
+static int do_md_stop(struct mddev * mddev, int ro, int is_open);
+static int md_set_readonly(struct mddev * mddev, int is_open);
+static int do_md_run(struct mddev * mddev);
+static int restart_array(struct mddev *mddev);
 
 static ssize_t
-array_state_store(mddev_t *mddev, const char *buf, size_t len)
+array_state_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	int err = -EINVAL;
 	enum array_state st = match_word(buf, array_states);
@@ -3800,13 +3799,13 @@
 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
 
 static ssize_t
-max_corrected_read_errors_show(mddev_t *mddev, char *page) {
+max_corrected_read_errors_show(struct mddev *mddev, char *page) {
 	return sprintf(page, "%d\n",
 		       atomic_read(&mddev->max_corr_read_errors));
 }
 
 static ssize_t
-max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
+max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	char *e;
 	unsigned long n = simple_strtoul(buf, &e, 10);
@@ -3823,13 +3822,13 @@
 	max_corrected_read_errors_store);
 
 static ssize_t
-null_show(mddev_t *mddev, char *page)
+null_show(struct mddev *mddev, char *page)
 {
 	return -EINVAL;
 }
 
 static ssize_t
-new_dev_store(mddev_t *mddev, const char *buf, size_t len)
+new_dev_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	/* buf must be %d:%d\n? giving major and minor numbers */
 	/* The new device is added to the array.
@@ -3842,7 +3841,7 @@
 	int major = simple_strtoul(buf, &e, 10);
 	int minor;
 	dev_t dev;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	int err;
 
 	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
@@ -3860,8 +3859,9 @@
 		rdev = md_import_device(dev, mddev->major_version,
 					mddev->minor_version);
 		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
-			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
-						       mdk_rdev_t, same_set);
+			struct md_rdev *rdev0
+				= list_entry(mddev->disks.next,
+					     struct md_rdev, same_set);
 			err = super_types[mddev->major_version]
 				.load_super(rdev, rdev0, mddev->minor_version);
 			if (err < 0)
@@ -3885,7 +3885,7 @@
 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
 
 static ssize_t
-bitmap_store(mddev_t *mddev, const char *buf, size_t len)
+bitmap_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	char *end;
 	unsigned long chunk, end_chunk;
@@ -3914,16 +3914,16 @@
 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
 
 static ssize_t
-size_show(mddev_t *mddev, char *page)
+size_show(struct mddev *mddev, char *page)
 {
 	return sprintf(page, "%llu\n",
 		(unsigned long long)mddev->dev_sectors / 2);
 }
 
-static int update_size(mddev_t *mddev, sector_t num_sectors);
+static int update_size(struct mddev *mddev, sector_t num_sectors);
 
 static ssize_t
-size_store(mddev_t *mddev, const char *buf, size_t len)
+size_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	/* If array is inactive, we can reduce the component size, but
 	 * not increase it (except from 0).
@@ -3958,7 +3958,7 @@
  * or N.M for internally known formats
  */
 static ssize_t
-metadata_show(mddev_t *mddev, char *page)
+metadata_show(struct mddev *mddev, char *page)
 {
 	if (mddev->persistent)
 		return sprintf(page, "%d.%d\n",
@@ -3970,7 +3970,7 @@
 }
 
 static ssize_t
-metadata_store(mddev_t *mddev, const char *buf, size_t len)
+metadata_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	int major, minor;
 	char *e;
@@ -4024,7 +4024,7 @@
 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
 
 static ssize_t
-action_show(mddev_t *mddev, char *page)
+action_show(struct mddev *mddev, char *page)
 {
 	char *type = "idle";
 	if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
@@ -4046,10 +4046,10 @@
 	return sprintf(page, "%s\n", type);
 }
 
-static void reap_sync_thread(mddev_t *mddev);
+static void reap_sync_thread(struct mddev *mddev);
 
 static ssize_t
-action_store(mddev_t *mddev, const char *page, size_t len)
+action_store(struct mddev *mddev, const char *page, size_t len)
 {
 	if (!mddev->pers || !mddev->pers->sync_request)
 		return -EINVAL;
@@ -4095,7 +4095,7 @@
 }
 
 static ssize_t
-mismatch_cnt_show(mddev_t *mddev, char *page)
+mismatch_cnt_show(struct mddev *mddev, char *page)
 {
 	return sprintf(page, "%llu\n",
 		       (unsigned long long) mddev->resync_mismatches);
@@ -4108,14 +4108,14 @@
 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
 
 static ssize_t
-sync_min_show(mddev_t *mddev, char *page)
+sync_min_show(struct mddev *mddev, char *page)
 {
 	return sprintf(page, "%d (%s)\n", speed_min(mddev),
 		       mddev->sync_speed_min ? "local": "system");
 }
 
 static ssize_t
-sync_min_store(mddev_t *mddev, const char *buf, size_t len)
+sync_min_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	int min;
 	char *e;
@@ -4134,14 +4134,14 @@
 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
 
 static ssize_t
-sync_max_show(mddev_t *mddev, char *page)
+sync_max_show(struct mddev *mddev, char *page)
 {
 	return sprintf(page, "%d (%s)\n", speed_max(mddev),
 		       mddev->sync_speed_max ? "local": "system");
 }
 
 static ssize_t
-sync_max_store(mddev_t *mddev, const char *buf, size_t len)
+sync_max_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	int max;
 	char *e;
@@ -4160,20 +4160,20 @@
 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
 
 static ssize_t
-degraded_show(mddev_t *mddev, char *page)
+degraded_show(struct mddev *mddev, char *page)
 {
 	return sprintf(page, "%d\n", mddev->degraded);
 }
 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
 
 static ssize_t
-sync_force_parallel_show(mddev_t *mddev, char *page)
+sync_force_parallel_show(struct mddev *mddev, char *page)
 {
 	return sprintf(page, "%d\n", mddev->parallel_resync);
 }
 
 static ssize_t
-sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
+sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	long n;
 
@@ -4197,7 +4197,7 @@
        sync_force_parallel_show, sync_force_parallel_store);
 
 static ssize_t
-sync_speed_show(mddev_t *mddev, char *page)
+sync_speed_show(struct mddev *mddev, char *page)
 {
 	unsigned long resync, dt, db;
 	if (mddev->curr_resync == 0)
@@ -4212,7 +4212,7 @@
 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
 
 static ssize_t
-sync_completed_show(mddev_t *mddev, char *page)
+sync_completed_show(struct mddev *mddev, char *page)
 {
 	unsigned long long max_sectors, resync;
 
@@ -4231,13 +4231,13 @@
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
 
 static ssize_t
-min_sync_show(mddev_t *mddev, char *page)
+min_sync_show(struct mddev *mddev, char *page)
 {
 	return sprintf(page, "%llu\n",
 		       (unsigned long long)mddev->resync_min);
 }
 static ssize_t
-min_sync_store(mddev_t *mddev, const char *buf, size_t len)
+min_sync_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	unsigned long long min;
 	if (strict_strtoull(buf, 10, &min))
@@ -4262,7 +4262,7 @@
 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
 
 static ssize_t
-max_sync_show(mddev_t *mddev, char *page)
+max_sync_show(struct mddev *mddev, char *page)
 {
 	if (mddev->resync_max == MaxSector)
 		return sprintf(page, "max\n");
@@ -4271,7 +4271,7 @@
 			       (unsigned long long)mddev->resync_max);
 }
 static ssize_t
-max_sync_store(mddev_t *mddev, const char *buf, size_t len)
+max_sync_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	if (strncmp(buf, "max", 3) == 0)
 		mddev->resync_max = MaxSector;
@@ -4302,13 +4302,13 @@
 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
 
 static ssize_t
-suspend_lo_show(mddev_t *mddev, char *page)
+suspend_lo_show(struct mddev *mddev, char *page)
 {
 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
 }
 
 static ssize_t
-suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
+suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	char *e;
 	unsigned long long new = simple_strtoull(buf, &e, 10);
@@ -4336,13 +4336,13 @@
 
 
 static ssize_t
-suspend_hi_show(mddev_t *mddev, char *page)
+suspend_hi_show(struct mddev *mddev, char *page)
 {
 	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
 }
 
 static ssize_t
-suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
+suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	char *e;
 	unsigned long long new = simple_strtoull(buf, &e, 10);
@@ -4369,7 +4369,7 @@
 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
 
 static ssize_t
-reshape_position_show(mddev_t *mddev, char *page)
+reshape_position_show(struct mddev *mddev, char *page)
 {
 	if (mddev->reshape_position != MaxSector)
 		return sprintf(page, "%llu\n",
@@ -4379,7 +4379,7 @@
 }
 
 static ssize_t
-reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
+reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	char *e;
 	unsigned long long new = simple_strtoull(buf, &e, 10);
@@ -4400,7 +4400,7 @@
        reshape_position_store);
 
 static ssize_t
-array_size_show(mddev_t *mddev, char *page)
+array_size_show(struct mddev *mddev, char *page)
 {
 	if (mddev->external_size)
 		return sprintf(page, "%llu\n",
@@ -4410,7 +4410,7 @@
 }
 
 static ssize_t
-array_size_store(mddev_t *mddev, const char *buf, size_t len)
+array_size_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	sector_t sectors;
 
@@ -4485,7 +4485,7 @@
 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 {
 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
-	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
+	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
 	ssize_t rv;
 
 	if (!entry->show)
@@ -4503,7 +4503,7 @@
 	      const char *page, size_t length)
 {
 	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
-	mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
+	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
 	ssize_t rv;
 
 	if (!entry->store)
@@ -4522,7 +4522,7 @@
 
 static void md_free(struct kobject *ko)
 {
-	mddev_t *mddev = container_of(ko, mddev_t, kobj);
+	struct mddev *mddev = container_of(ko, struct mddev, kobj);
 
 	if (mddev->sysfs_state)
 		sysfs_put(mddev->sysfs_state);
@@ -4551,7 +4551,7 @@
 
 static void mddev_delayed_delete(struct work_struct *ws)
 {
-	mddev_t *mddev = container_of(ws, mddev_t, del_work);
+	struct mddev *mddev = container_of(ws, struct mddev, del_work);
 
 	sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
 	kobject_del(&mddev->kobj);
@@ -4561,7 +4561,7 @@
 static int md_alloc(dev_t dev, char *name)
 {
 	static DEFINE_MUTEX(disks_mutex);
-	mddev_t *mddev = mddev_find(dev);
+	struct mddev *mddev = mddev_find(dev);
 	struct gendisk *disk;
 	int partitioned;
 	int shift;
@@ -4588,7 +4588,7 @@
 	if (name) {
 		/* Need to ensure that 'name' is not a duplicate.
 		 */
-		mddev_t *mddev2;
+		struct mddev *mddev2;
 		spin_lock(&all_mddevs_lock);
 
 		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
@@ -4689,7 +4689,7 @@
 
 static void md_safemode_timeout(unsigned long data)
 {
-	mddev_t *mddev = (mddev_t *) data;
+	struct mddev *mddev = (struct mddev *) data;
 
 	if (!atomic_read(&mddev->writes_pending)) {
 		mddev->safemode = 1;
@@ -4701,11 +4701,11 @@
 
 static int start_dirty_degraded;
 
-int md_run(mddev_t *mddev)
+int md_run(struct mddev *mddev)
 {
 	int err;
-	mdk_rdev_t *rdev;
-	struct mdk_personality *pers;
+	struct md_rdev *rdev;
+	struct md_personality *pers;
 
 	if (list_empty(&mddev->disks))
 		/* cannot run an array with no devices.. */
@@ -4769,7 +4769,7 @@
 
 	if (mddev->bio_set == NULL)
 		mddev->bio_set = bioset_create(BIO_POOL_SIZE,
-					       sizeof(mddev_t *));
+					       sizeof(struct mddev *));
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
@@ -4804,7 +4804,7 @@
 		 * configuration.
 		 */
 		char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
-		mdk_rdev_t *rdev2;
+		struct md_rdev *rdev2;
 		int warned = 0;
 
 		list_for_each_entry(rdev, &mddev->disks, same_set)
@@ -4903,7 +4903,7 @@
 }
 EXPORT_SYMBOL_GPL(md_run);
 
-static int do_md_run(mddev_t *mddev)
+static int do_md_run(struct mddev *mddev)
 {
 	int err;
 
@@ -4927,7 +4927,7 @@
 	return err;
 }
 
-static int restart_array(mddev_t *mddev)
+static int restart_array(struct mddev *mddev)
 {
 	struct gendisk *disk = mddev->gendisk;
 
@@ -4977,7 +4977,7 @@
 	spin_unlock(&inode->i_lock);
 }
 
-static void md_clean(mddev_t *mddev)
+static void md_clean(struct mddev *mddev)
 {
 	mddev->array_sectors = 0;
 	mddev->external_size = 0;
@@ -5020,7 +5020,7 @@
 	mddev->bitmap_info.max_write_behind = 0;
 }
 
-static void __md_stop_writes(mddev_t *mddev)
+static void __md_stop_writes(struct mddev *mddev)
 {
 	if (mddev->sync_thread) {
 		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -5040,7 +5040,7 @@
 	}
 }
 
-void md_stop_writes(mddev_t *mddev)
+void md_stop_writes(struct mddev *mddev)
 {
 	mddev_lock(mddev);
 	__md_stop_writes(mddev);
@@ -5048,7 +5048,7 @@
 }
 EXPORT_SYMBOL_GPL(md_stop_writes);
 
-void md_stop(mddev_t *mddev)
+void md_stop(struct mddev *mddev)
 {
 	mddev->ready = 0;
 	mddev->pers->stop(mddev);
@@ -5060,7 +5060,7 @@
 }
 EXPORT_SYMBOL_GPL(md_stop);
 
-static int md_set_readonly(mddev_t *mddev, int is_open)
+static int md_set_readonly(struct mddev *mddev, int is_open)
 {
 	int err = 0;
 	mutex_lock(&mddev->open_mutex);
@@ -5090,10 +5090,10 @@
  *   0 - completely stop and dis-assemble array
  *   2 - stop but do not disassemble array
  */
-static int do_md_stop(mddev_t * mddev, int mode, int is_open)
+static int do_md_stop(struct mddev * mddev, int mode, int is_open)
 {
 	struct gendisk *disk = mddev->gendisk;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	mutex_lock(&mddev->open_mutex);
 	if (atomic_read(&mddev->openers) > is_open ||
@@ -5156,9 +5156,9 @@
 }
 
 #ifndef MODULE
-static void autorun_array(mddev_t *mddev)
+static void autorun_array(struct mddev *mddev)
 {
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	int err;
 
 	if (list_empty(&mddev->disks))
@@ -5193,8 +5193,8 @@
  */
 static void autorun_devices(int part)
 {
-	mdk_rdev_t *rdev0, *rdev, *tmp;
-	mddev_t *mddev;
+	struct md_rdev *rdev0, *rdev, *tmp;
+	struct mddev *mddev;
 	char b[BDEVNAME_SIZE];
 
 	printk(KERN_INFO "md: autorun ...\n");
@@ -5203,7 +5203,7 @@
 		dev_t dev;
 		LIST_HEAD(candidates);
 		rdev0 = list_entry(pending_raid_disks.next,
-					 mdk_rdev_t, same_set);
+					 struct md_rdev, same_set);
 
 		printk(KERN_INFO "md: considering %s ...\n",
 			bdevname(rdev0->bdev,b));
@@ -5289,11 +5289,11 @@
 	return 0;
 }
 
-static int get_array_info(mddev_t * mddev, void __user * arg)
+static int get_array_info(struct mddev * mddev, void __user * arg)
 {
 	mdu_array_info_t info;
 	int nr,working,insync,failed,spare;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	nr=working=insync=failed=spare=0;
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
@@ -5342,7 +5342,7 @@
 	return 0;
 }
 
-static int get_bitmap_file(mddev_t * mddev, void __user * arg)
+static int get_bitmap_file(struct mddev * mddev, void __user * arg)
 {
 	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
 	char *ptr, *buf = NULL;
@@ -5382,10 +5382,10 @@
 	return err;
 }
 
-static int get_disk_info(mddev_t * mddev, void __user * arg)
+static int get_disk_info(struct mddev * mddev, void __user * arg)
 {
 	mdu_disk_info_t info;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	if (copy_from_user(&info, arg, sizeof(info)))
 		return -EFAULT;
@@ -5416,10 +5416,10 @@
 	return 0;
 }
 
-static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
 {
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	dev_t dev = MKDEV(info->major,info->minor);
 
 	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
@@ -5436,8 +5436,9 @@
 			return PTR_ERR(rdev);
 		}
 		if (!list_empty(&mddev->disks)) {
-			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
-							mdk_rdev_t, same_set);
+			struct md_rdev *rdev0
+				= list_entry(mddev->disks.next,
+					     struct md_rdev, same_set);
 			err = super_types[mddev->major_version]
 				.load_super(rdev, rdev0, mddev->minor_version);
 			if (err < 0) {
@@ -5587,10 +5588,10 @@
 	return 0;
 }
 
-static int hot_remove_disk(mddev_t * mddev, dev_t dev)
+static int hot_remove_disk(struct mddev * mddev, dev_t dev)
 {
 	char b[BDEVNAME_SIZE];
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	rdev = find_rdev(mddev, dev);
 	if (!rdev)
@@ -5610,11 +5611,11 @@
 	return -EBUSY;
 }
 
-static int hot_add_disk(mddev_t * mddev, dev_t dev)
+static int hot_add_disk(struct mddev * mddev, dev_t dev)
 {
 	char b[BDEVNAME_SIZE];
 	int err;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	if (!mddev->pers)
 		return -ENODEV;
@@ -5684,7 +5685,7 @@
 	return err;
 }
 
-static int set_bitmap_file(mddev_t *mddev, int fd)
+static int set_bitmap_file(struct mddev *mddev, int fd)
 {
 	int err;
 
@@ -5757,7 +5758,7 @@
  *  The minor and patch _version numbers are also kept incase the
  *  super_block handler wishes to interpret them.
  */
-static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
 {
 
 	if (info->raid_disks == 0) {
@@ -5827,7 +5828,7 @@
 	return 0;
 }
 
-void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
+void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
 {
 	WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
 
@@ -5838,9 +5839,9 @@
 }
 EXPORT_SYMBOL(md_set_array_sectors);
 
-static int update_size(mddev_t *mddev, sector_t num_sectors)
+static int update_size(struct mddev *mddev, sector_t num_sectors)
 {
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	int rv;
 	int fit = (num_sectors == 0);
 
@@ -5876,7 +5877,7 @@
 	return rv;
 }
 
-static int update_raid_disks(mddev_t *mddev, int raid_disks)
+static int update_raid_disks(struct mddev *mddev, int raid_disks)
 {
 	int rv;
 	/* change the number of raid disks */
@@ -5904,7 +5905,7 @@
  * Any differences that cannot be handled will cause an error.
  * Normally, only one change can be managed at a time.
  */
-static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
+static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 {
 	int rv = 0;
 	int cnt = 0;
@@ -5997,9 +5998,9 @@
 	return rv;
 }
 
-static int set_disk_faulty(mddev_t *mddev, dev_t dev)
+static int set_disk_faulty(struct mddev *mddev, dev_t dev)
 {
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	if (mddev->pers == NULL)
 		return -ENODEV;
@@ -6022,7 +6023,7 @@
  */
 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
-	mddev_t *mddev = bdev->bd_disk->private_data;
+	struct mddev *mddev = bdev->bd_disk->private_data;
 
 	geo->heads = 2;
 	geo->sectors = 4;
@@ -6035,7 +6036,7 @@
 {
 	int err = 0;
 	void __user *argp = (void __user *)arg;
-	mddev_t *mddev = NULL;
+	struct mddev *mddev = NULL;
 	int ro;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -6298,7 +6299,7 @@
 	 * Succeed if we can lock the mddev, which confirms that
 	 * it isn't being stopped right now.
 	 */
-	mddev_t *mddev = mddev_find(bdev->bd_dev);
+	struct mddev *mddev = mddev_find(bdev->bd_dev);
 	int err;
 
 	if (mddev->gendisk != bdev->bd_disk) {
@@ -6327,7 +6328,7 @@
 
 static int md_release(struct gendisk *disk, fmode_t mode)
 {
- 	mddev_t *mddev = disk->private_data;
+ 	struct mddev *mddev = disk->private_data;
 
 	BUG_ON(!mddev);
 	atomic_dec(&mddev->openers);
@@ -6338,14 +6339,14 @@
 
 static int md_media_changed(struct gendisk *disk)
 {
-	mddev_t *mddev = disk->private_data;
+	struct mddev *mddev = disk->private_data;
 
 	return mddev->changed;
 }
 
 static int md_revalidate(struct gendisk *disk)
 {
-	mddev_t *mddev = disk->private_data;
+	struct mddev *mddev = disk->private_data;
 
 	mddev->changed = 0;
 	return 0;
@@ -6366,7 +6367,7 @@
 
 static int md_thread(void * arg)
 {
-	mdk_thread_t *thread = arg;
+	struct md_thread *thread = arg;
 
 	/*
 	 * md_thread is a 'system-thread', it's priority should be very
@@ -6405,21 +6406,21 @@
 	return 0;
 }
 
-void md_wakeup_thread(mdk_thread_t *thread)
+void md_wakeup_thread(struct md_thread *thread)
 {
 	if (thread) {
-		dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
+		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
 		set_bit(THREAD_WAKEUP, &thread->flags);
 		wake_up(&thread->wqueue);
 	}
 }
 
-mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
+struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev,
 				 const char *name)
 {
-	mdk_thread_t *thread;
+	struct md_thread *thread;
 
-	thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
+	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
 	if (!thread)
 		return NULL;
 
@@ -6439,12 +6440,12 @@
 	return thread;
 }
 
-void md_unregister_thread(mdk_thread_t **threadp)
+void md_unregister_thread(struct md_thread **threadp)
 {
-	mdk_thread_t *thread = *threadp;
+	struct md_thread *thread = *threadp;
 	if (!thread)
 		return;
-	dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
+	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
 	/* Locking ensures that mddev_unlock does not wake_up a
 	 * non-existent thread
 	 */
@@ -6456,7 +6457,7 @@
 	kfree(thread);
 }
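
Illustrative aside, not part of the patch: with the mdk_thread_t typedef gone, per-array service threads are created and torn down through struct md_thread. A hedged usage sketch (md_do_something() and the "example" name suffix are hypothetical):

	/* Sketch only. */
	mddev->thread = md_register_thread(md_do_something, mddev, "example");
	if (!mddev->thread)
		return -ENOMEM;
	/* ... later, during shutdown ... */
	md_unregister_thread(&mddev->thread);	/* takes &thread so the pointer is cleared under lock */
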
 
-void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
+void md_error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	if (!mddev) {
 		MD_BUG();
@@ -6485,7 +6486,7 @@
 static void status_unused(struct seq_file *seq)
 {
 	int i = 0;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	seq_printf(seq, "unused devices: ");
 
@@ -6502,7 +6503,7 @@
 }
 
 
-static void status_resync(struct seq_file *seq, mddev_t * mddev)
+static void status_resync(struct seq_file *seq, struct mddev * mddev)
 {
 	sector_t max_sectors, resync, res;
 	unsigned long dt, db;
@@ -6593,7 +6594,7 @@
 {
 	struct list_head *tmp;
 	loff_t l = *pos;
-	mddev_t *mddev;
+	struct mddev *mddev;
 
 	if (l >= 0x10000)
 		return NULL;
@@ -6604,7 +6605,7 @@
 	spin_lock(&all_mddevs_lock);
 	list_for_each(tmp,&all_mddevs)
 		if (!l--) {
-			mddev = list_entry(tmp, mddev_t, all_mddevs);
+			mddev = list_entry(tmp, struct mddev, all_mddevs);
 			mddev_get(mddev);
 			spin_unlock(&all_mddevs_lock);
 			return mddev;
@@ -6618,7 +6619,7 @@
 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct list_head *tmp;
-	mddev_t *next_mddev, *mddev = v;
+	struct mddev *next_mddev, *mddev = v;
 	
 	++*pos;
 	if (v == (void*)2)
@@ -6630,7 +6631,7 @@
 	else
 		tmp = mddev->all_mddevs.next;
 	if (tmp != &all_mddevs)
-		next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
+		next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
 	else {
 		next_mddev = (void*)2;
 		*pos = 0x10000;
@@ -6645,7 +6646,7 @@
 
 static void md_seq_stop(struct seq_file *seq, void *v)
 {
-	mddev_t *mddev = v;
+	struct mddev *mddev = v;
 
 	if (mddev && v != (void*)1 && v != (void*)2)
 		mddev_put(mddev);
@@ -6653,13 +6654,13 @@
 
 static int md_seq_show(struct seq_file *seq, void *v)
 {
-	mddev_t *mddev = v;
+	struct mddev *mddev = v;
 	sector_t sectors;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	struct bitmap *bitmap;
 
 	if (v == (void*)1) {
-		struct mdk_personality *pers;
+		struct md_personality *pers;
 		seq_printf(seq, "Personalities : ");
 		spin_lock(&pers_lock);
 		list_for_each_entry(pers, &pers_list, list)
@@ -6815,7 +6816,7 @@
 	.poll		= mdstat_poll,
 };
 
-int register_md_personality(struct mdk_personality *p)
+int register_md_personality(struct md_personality *p)
 {
 	spin_lock(&pers_lock);
 	list_add_tail(&p->list, &pers_list);
@@ -6824,7 +6825,7 @@
 	return 0;
 }
 
-int unregister_md_personality(struct mdk_personality *p)
+int unregister_md_personality(struct md_personality *p)
 {
 	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
 	spin_lock(&pers_lock);
@@ -6833,9 +6834,9 @@
 	return 0;
 }
 
-static int is_mddev_idle(mddev_t *mddev, int init)
+static int is_mddev_idle(struct mddev *mddev, int init)
 {
-	mdk_rdev_t * rdev;
+	struct md_rdev * rdev;
 	int idle;
 	int curr_events;
 
@@ -6877,7 +6878,7 @@
 	return idle;
 }
 
-void md_done_sync(mddev_t *mddev, int blocks, int ok)
+void md_done_sync(struct mddev *mddev, int blocks, int ok)
 {
 	/* another "blocks" (512byte) blocks have been synced */
 	atomic_sub(blocks, &mddev->recovery_active);
@@ -6895,7 +6896,7 @@
  * in superblock) before writing, schedule a superblock update
  * and wait for it to complete.
  */
-void md_write_start(mddev_t *mddev, struct bio *bi)
+void md_write_start(struct mddev *mddev, struct bio *bi)
 {
 	int did_change = 0;
 	if (bio_data_dir(bi) != WRITE)
@@ -6930,7 +6931,7 @@
 		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
 }
 
-void md_write_end(mddev_t *mddev)
+void md_write_end(struct mddev *mddev)
 {
 	if (atomic_dec_and_test(&mddev->writes_pending)) {
 		if (mddev->safemode == 2)
@@ -6949,7 +6950,7 @@
  * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
  * is dropped, so return -EAGAIN after notifying userspace.
  */
-int md_allow_write(mddev_t *mddev)
+int md_allow_write(struct mddev *mddev)
 {
 	if (!mddev->pers)
 		return 0;
@@ -6981,9 +6982,9 @@
 
 #define SYNC_MARKS	10
 #define	SYNC_MARK_STEP	(3*HZ)
-void md_do_sync(mddev_t *mddev)
+void md_do_sync(struct mddev *mddev)
 {
-	mddev_t *mddev2;
+	struct mddev *mddev2;
 	unsigned int currspeed = 0,
 		 window;
 	sector_t max_sectors,j, io_sectors;
@@ -6993,7 +6994,7 @@
 	struct list_head *tmp;
 	sector_t last_check;
 	int skipped = 0;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	char *desc;
 
 	/* just incase thread restarts... */
@@ -7308,9 +7309,9 @@
 }
 EXPORT_SYMBOL_GPL(md_do_sync);
 
-static int remove_and_add_spares(mddev_t *mddev)
+static int remove_and_add_spares(struct mddev *mddev)
 {
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	int spares = 0;
 
 	mddev->curr_resync_completed = 0;
@@ -7352,9 +7353,9 @@
 	return spares;
 }
 
-static void reap_sync_thread(mddev_t *mddev)
+static void reap_sync_thread(struct mddev *mddev)
 {
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	/* resync has finished, collect result */
 	md_unregister_thread(&mddev->sync_thread);
@@ -7369,15 +7370,19 @@
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    mddev->pers->finish_reshape)
 		mddev->pers->finish_reshape(mddev);
-	md_update_sb(mddev, 1);
 
-	/* if array is no-longer degraded, then any saved_raid_disk
-	 * information must be scrapped
+	/* If array is no-longer degraded, then any saved_raid_disk
+	 * information must be scrapped.  Also if any device is now
+	 * In_sync we must scrap the saved_raid_disk for that device
+	 * so that the superblock for an incrementally recovered device
+	 * is written out.
 	 */
-	if (!mddev->degraded)
-		list_for_each_entry(rdev, &mddev->disks, same_set)
+	list_for_each_entry(rdev, &mddev->disks, same_set)
+		if (!mddev->degraded ||
+		    test_bit(In_sync, &rdev->flags))
 			rdev->saved_raid_disk = -1;
 
+	md_update_sb(mddev, 1);
 	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -7413,7 +7418,7 @@
  *  5/ If array is degraded, try to add spares devices
  *  6/ If array has spares or is not in-sync, start a resync thread.
  */
-void md_check_recovery(mddev_t *mddev)
+void md_check_recovery(struct mddev *mddev)
 {
 	if (mddev->suspended)
 		return;
@@ -7449,7 +7454,7 @@
 			/* Only thing we do on a ro array is remove
 			 * failed devices.
 			 */
-			mdk_rdev_t *rdev;
+			struct md_rdev *rdev;
 			list_for_each_entry(rdev, &mddev->disks, same_set)
 				if (rdev->raid_disk >= 0 &&
 				    !test_bit(Blocked, &rdev->flags) &&
@@ -7573,7 +7578,7 @@
 	}
 }
 
-void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
+void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
 {
 	sysfs_notify_dirent_safe(rdev->sysfs_state);
 	wait_event_timeout(rdev->blocked_wait,
@@ -7831,7 +7836,7 @@
 	return rv;
 }
 
-int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
+int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 		       int acknowledged)
 {
 	int rv = md_set_badblocks(&rdev->badblocks,
@@ -7940,7 +7945,7 @@
 	return rv;
 }
 
-int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors)
+int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors)
 {
 	return md_clear_badblocks(&rdev->badblocks,
 				  s + rdev->data_offset,
@@ -8074,13 +8079,14 @@
 			    unsigned long code, void *x)
 {
 	struct list_head *tmp;
-	mddev_t *mddev;
+	struct mddev *mddev;
+	int need_delay = 0;
 
 	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
 
 		printk(KERN_INFO "md: stopping all md devices.\n");
 
-		for_each_mddev(mddev, tmp)
+		for_each_mddev(mddev, tmp) {
 			if (mddev_trylock(mddev)) {
 				/* Force a switch to readonly even array
 				 * appears to still be in use.  Hence
@@ -8089,13 +8095,16 @@
 				md_set_readonly(mddev, 100);
 				mddev_unlock(mddev);
 			}
+			need_delay = 1;
+		}
 		/*
 		 * certain more exotic SCSI devices are known to be
 		 * volatile wrt too early system reboots. While the
 		 * right place to handle this issue is the given
 		 * driver, we do want to have a safe RAID driver ...
 		 */
-		mdelay(1000*1);
+		if (need_delay)
+			mdelay(1000*1);
 	}
 	return NOTIFY_DONE;
 }
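
Illustrative aside, not part of the patch: the hunk above changes md_notify_reboot() so the one-second settling delay is only paid when at least one md array was actually found. The handler itself is hooked in through the standard reboot-notifier interface; a hedged sketch of that registration (the field values here are illustrative of the existing md notifier, not something this patch adds):

	/* Sketch only. */
	static struct notifier_block md_notifier = {
		.notifier_call	= md_notify_reboot,
		.priority	= INT_MAX,	/* run before real block devices go away */
	};

	register_reboot_notifier(&md_notifier);
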
@@ -8108,7 +8117,7 @@
 
 static void md_geninit(void)
 {
-	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
 
 	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
 }
@@ -8183,7 +8192,7 @@
 
 static void autostart_arrays(int part)
 {
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	struct detected_devices_node *node_detected_dev;
 	dev_t dev;
 	int i_scanned, i_passed;
@@ -8223,7 +8232,7 @@
 
 static __exit void md_exit(void)
 {
-	mddev_t *mddev;
+	struct mddev *mddev;
 	struct list_head *tmp;
 
 	blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 0a309dc..51c1d91 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -1,5 +1,5 @@
 /*
-   md_k.h : kernel internal structure of the Linux MD driver
+   md.h : kernel internal structure of the Linux MD driver
           Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
 	  
    This program is free software; you can redistribute it and/or modify
@@ -26,9 +26,6 @@
 
 #define MaxSector (~(sector_t)0)
 
-typedef struct mddev_s mddev_t;
-typedef struct mdk_rdev_s mdk_rdev_t;
-
 /* Bad block numbers are stored sorted in a single page.
  * 64bits is used for each block or extent.
  * 54 bits are sector number, 9 bits are extent size,
@@ -39,12 +36,11 @@
 /*
  * MD's 'extended' device
  */
-struct mdk_rdev_s
-{
+struct md_rdev {
 	struct list_head same_set;	/* RAID devices within the same set */
 
 	sector_t sectors;		/* Device size (in 512bytes sectors) */
-	mddev_t *mddev;			/* RAID array if running */
+	struct mddev *mddev;		/* RAID array if running */
 	int last_events;		/* IO event timestamp */
 
 	/*
@@ -168,7 +164,7 @@
 
 extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
 			  sector_t *first_bad, int *bad_sectors);
-static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors,
+static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
 			      sector_t *first_bad, int *bad_sectors)
 {
 	if (unlikely(rdev->badblocks.count)) {
@@ -181,15 +177,14 @@
 	}
 	return 0;
 }
-extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
+extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 			      int acknowledged);
-extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors);
+extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors);
 extern void md_ack_all_badblocks(struct badblocks *bb);
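
Illustrative aside, not part of the patch: is_badblock() above is the fast-path check a personality can use before issuing I/O to a member device; on a hit, the out parameters report where the first bad range in the queried region starts and how many sectors it covers. A hedged usage sketch (the mirror-selection caller is hypothetical):

	/* Sketch only. */
	sector_t first_bad;
	int bad_sectors;
	int usable = !is_badblock(rdev, this_sector, sectors,
				  &first_bad, &bad_sectors);
	/* if !usable, a caller such as a read-balance loop would pick another device */
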
 
-struct mddev_s
-{
+struct mddev {
 	void				*private;
-	struct mdk_personality		*pers;
+	struct md_personality		*pers;
 	dev_t				unit;
 	int				md_minor;
 	struct list_head 		disks;
@@ -256,8 +251,8 @@
 	atomic_t			plug_cnt;	/* If device is expecting
 							 * more bios soon.
 							 */
-	struct mdk_thread_s		*thread;	/* management thread */
-	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
+	struct md_thread		*thread;	/* management thread */
+	struct md_thread		*sync_thread;	/* doing resync or reconstruct */
 	sector_t			curr_resync;	/* last block scheduled */
 	/* As resync requests can complete out of order, we cannot easily track
 	 * how much resync has been completed.  So we occasionally pause until
@@ -402,11 +397,11 @@
 	atomic_t flush_pending;
 	struct work_struct flush_work;
 	struct work_struct event_work;	/* used by dm to report failure event */
-	void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
 };
 
 
-static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
+static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
 {
 	int faulty = test_bit(Faulty, &rdev->flags);
 	if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
@@ -418,35 +413,35 @@
         atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
 }
 
-struct mdk_personality
+struct md_personality
 {
 	char *name;
 	int level;
 	struct list_head list;
 	struct module *owner;
-	int (*make_request)(mddev_t *mddev, struct bio *bio);
-	int (*run)(mddev_t *mddev);
-	int (*stop)(mddev_t *mddev);
-	void (*status)(struct seq_file *seq, mddev_t *mddev);
+	int (*make_request)(struct mddev *mddev, struct bio *bio);
+	int (*run)(struct mddev *mddev);
+	int (*stop)(struct mddev *mddev);
+	void (*status)(struct seq_file *seq, struct mddev *mddev);
 	/* error_handler must set ->faulty and clear ->in_sync
 	 * if appropriate, and should abort recovery if needed 
 	 */
-	void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
-	int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
-	int (*hot_remove_disk) (mddev_t *mddev, int number);
-	int (*spare_active) (mddev_t *mddev);
-	sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
-	int (*resize) (mddev_t *mddev, sector_t sectors);
-	sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
-	int (*check_reshape) (mddev_t *mddev);
-	int (*start_reshape) (mddev_t *mddev);
-	void (*finish_reshape) (mddev_t *mddev);
+	void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
+	int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
+	int (*hot_remove_disk) (struct mddev *mddev, int number);
+	int (*spare_active) (struct mddev *mddev);
+	sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster);
+	int (*resize) (struct mddev *mddev, sector_t sectors);
+	sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks);
+	int (*check_reshape) (struct mddev *mddev);
+	int (*start_reshape) (struct mddev *mddev);
+	void (*finish_reshape) (struct mddev *mddev);
 	/* quiesce moves between quiescence states
 	 * 0 - fully active
 	 * 1 - no new requests allowed
 	 * others - reserved
 	 */
-	void (*quiesce) (mddev_t *mddev, int state);
+	void (*quiesce) (struct mddev *mddev, int state);
 	/* takeover is used to transition an array from one
 	 * personality to another.  The new personality must be able
 	 * to handle the data in the current layout.
@@ -456,14 +451,14 @@
 	 * This needs to be installed and then ->run used to activate the
 	 * array.
 	 */
-	void *(*takeover) (mddev_t *mddev);
+	void *(*takeover) (struct mddev *mddev);
 };
 
 
 struct md_sysfs_entry {
 	struct attribute attr;
-	ssize_t (*show)(mddev_t *, char *);
-	ssize_t (*store)(mddev_t *, const char *, size_t);
+	ssize_t (*show)(struct mddev *, char *);
+	ssize_t (*store)(struct mddev *, const char *, size_t);
 };
 extern struct attribute_group md_bitmap_group;
 
@@ -479,19 +474,19 @@
 		sysfs_notify_dirent(sd);
 }
 
-static inline char * mdname (mddev_t * mddev)
+static inline char * mdname (struct mddev * mddev)
 {
 	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
 }
 
-static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
+static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char nm[20];
 	sprintf(nm, "rd%d", rdev->raid_disk);
 	return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
 }
 
-static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
+static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char nm[20];
 	sprintf(nm, "rd%d", rdev->raid_disk);
@@ -514,14 +509,14 @@
 #define rdev_for_each_rcu(rdev, mddev)				\
 	list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
 
-typedef struct mdk_thread_s {
-	void			(*run) (mddev_t *mddev);
-	mddev_t			*mddev;
+struct md_thread {
+	void			(*run) (struct mddev *mddev);
+	struct mddev		*mddev;
 	wait_queue_head_t	wqueue;
 	unsigned long           flags;
 	struct task_struct	*tsk;
 	unsigned long		timeout;
-} mdk_thread_t;
+};
 
 #define THREAD_WAKEUP  0
 
@@ -556,48 +551,50 @@
 	if (p) put_page(p);
 }
 
-extern int register_md_personality(struct mdk_personality *p);
-extern int unregister_md_personality(struct mdk_personality *p);
-extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
-				mddev_t *mddev, const char *name);
-extern void md_unregister_thread(mdk_thread_t **threadp);
-extern void md_wakeup_thread(mdk_thread_t *thread);
-extern void md_check_recovery(mddev_t *mddev);
-extern void md_write_start(mddev_t *mddev, struct bio *bi);
-extern void md_write_end(mddev_t *mddev);
-extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
-extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
+extern int register_md_personality(struct md_personality *p);
+extern int unregister_md_personality(struct md_personality *p);
+extern struct md_thread *md_register_thread(
+	void (*run)(struct mddev *mddev),
+	struct mddev *mddev,
+	const char *name);
+extern void md_unregister_thread(struct md_thread **threadp);
+extern void md_wakeup_thread(struct md_thread *thread);
+extern void md_check_recovery(struct mddev *mddev);
+extern void md_write_start(struct mddev *mddev, struct bio *bi);
+extern void md_write_end(struct mddev *mddev);
+extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
+extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
 
-extern int mddev_congested(mddev_t *mddev, int bits);
-extern void md_flush_request(mddev_t *mddev, struct bio *bio);
-extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+extern int mddev_congested(struct mddev *mddev, int bits);
+extern void md_flush_request(struct mddev *mddev, struct bio *bio);
+extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 			   sector_t sector, int size, struct page *page);
-extern void md_super_wait(mddev_t *mddev);
-extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, 
+extern void md_super_wait(struct mddev *mddev);
+extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 
 			struct page *page, int rw, bool metadata_op);
-extern void md_do_sync(mddev_t *mddev);
-extern void md_new_event(mddev_t *mddev);
-extern int md_allow_write(mddev_t *mddev);
-extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
-extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
-extern int md_check_no_bitmap(mddev_t *mddev);
-extern int md_integrity_register(mddev_t *mddev);
-extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
+extern void md_do_sync(struct mddev *mddev);
+extern void md_new_event(struct mddev *mddev);
+extern int md_allow_write(struct mddev *mddev);
+extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
+extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
+extern int md_check_no_bitmap(struct mddev *mddev);
+extern int md_integrity_register(struct mddev *mddev);
+extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
 extern void restore_bitmap_write_access(struct file *file);
 
-extern void mddev_init(mddev_t *mddev);
-extern int md_run(mddev_t *mddev);
-extern void md_stop(mddev_t *mddev);
-extern void md_stop_writes(mddev_t *mddev);
-extern int md_rdev_init(mdk_rdev_t *rdev);
+extern void mddev_init(struct mddev *mddev);
+extern int md_run(struct mddev *mddev);
+extern void md_stop(struct mddev *mddev);
+extern void md_stop_writes(struct mddev *mddev);
+extern int md_rdev_init(struct md_rdev *rdev);
 
-extern void mddev_suspend(mddev_t *mddev);
-extern void mddev_resume(mddev_t *mddev);
+extern void mddev_suspend(struct mddev *mddev);
+extern void mddev_resume(struct mddev *mddev);
 extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
-				   mddev_t *mddev);
+				   struct mddev *mddev);
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
-				   mddev_t *mddev);
-extern int mddev_check_plugged(mddev_t *mddev);
+				   struct mddev *mddev);
+extern int mddev_check_plugged(struct mddev *mddev);
 extern void md_trim_bio(struct bio *bio, int offset, int size);
 #endif /* _MD_MD_H */
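
With the typedefs gone, every RAID level now ties into md through the plain struct md_personality and register_md_personality() declared above. A hedged sketch of what a minimal personality module looks like after this rename; the "example" name, the level number and the do-nothing stubs are invented for illustration and are not part of the patch.

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/seq_file.h>
#include "md.h"

/* Illustrative stubs only -- a real personality does real work here. */
static int example_make_request(struct mddev *mddev, struct bio *bio)
{
	bio_endio(bio, -EOPNOTSUPP);
	return 0;
}

static int example_run(struct mddev *mddev)  { return 0; }
static int example_stop(struct mddev *mddev) { return 0; }

static void example_status(struct seq_file *seq, struct mddev *mddev)
{
	seq_printf(seq, " (example personality)");
}

static struct md_personality example_personality = {
	.name		= "example",
	.level		= -10,		/* made-up level number */
	.owner		= THIS_MODULE,
	.make_request	= example_make_request,
	.run		= example_run,
	.stop		= example_stop,
	.status		= example_status,
};

static int __init example_md_init(void)
{
	return register_md_personality(&example_personality);
}

static void __exit example_md_exit(void)
{
	unregister_md_personality(&example_personality);
}

module_init(example_md_init);
module_exit(example_md_exit);
MODULE_LICENSE("GPL");
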
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index d5b5fb3..d32c785 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -31,7 +31,7 @@
 #define	NR_RESERVED_BUFS	32
 
 
-static int multipath_map (multipath_conf_t *conf)
+static int multipath_map (struct mpconf *conf)
 {
 	int i, disks = conf->raid_disks;
 
@@ -42,7 +42,7 @@
 
 	rcu_read_lock();
 	for (i = 0; i < disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
+		struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
 		if (rdev && test_bit(In_sync, &rdev->flags)) {
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
@@ -58,8 +58,8 @@
 static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
 {
 	unsigned long flags;
-	mddev_t *mddev = mp_bh->mddev;
-	multipath_conf_t *conf = mddev->private;
+	struct mddev *mddev = mp_bh->mddev;
+	struct mpconf *conf = mddev->private;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&mp_bh->retry_list, &conf->retry_list);
@@ -76,7 +76,7 @@
 static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
 {
 	struct bio *bio = mp_bh->master_bio;
-	multipath_conf_t *conf = mp_bh->mddev->private;
+	struct mpconf *conf = mp_bh->mddev->private;
 
 	bio_endio(bio, err);
 	mempool_free(mp_bh, conf->pool);
@@ -86,8 +86,8 @@
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct multipath_bh *mp_bh = bio->bi_private;
-	multipath_conf_t *conf = mp_bh->mddev->private;
-	mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
+	struct mpconf *conf = mp_bh->mddev->private;
+	struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev;
 
 	if (uptodate)
 		multipath_end_bh_io(mp_bh, 0);
@@ -106,9 +106,9 @@
 	rdev_dec_pending(rdev, conf->mddev);
 }
 
-static int multipath_make_request(mddev_t *mddev, struct bio * bio)
+static int multipath_make_request(struct mddev *mddev, struct bio * bio)
 {
-	multipath_conf_t *conf = mddev->private;
+	struct mpconf *conf = mddev->private;
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
 
@@ -140,9 +140,9 @@
 	return 0;
 }
 
-static void multipath_status (struct seq_file *seq, mddev_t *mddev)
+static void multipath_status (struct seq_file *seq, struct mddev *mddev)
 {
-	multipath_conf_t *conf = mddev->private;
+	struct mpconf *conf = mddev->private;
 	int i;
 	
 	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
@@ -156,8 +156,8 @@
 
 static int multipath_congested(void *data, int bits)
 {
-	mddev_t *mddev = data;
-	multipath_conf_t *conf = mddev->private;
+	struct mddev *mddev = data;
+	struct mpconf *conf = mddev->private;
 	int i, ret = 0;
 
 	if (mddev_congested(mddev, bits))
@@ -165,7 +165,7 @@
 
 	rcu_read_lock();
 	for (i = 0; i < mddev->raid_disks ; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
+		struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
 		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 			struct request_queue *q = bdev_get_queue(rdev->bdev);
 
@@ -183,9 +183,9 @@
 /*
  * Careful, this can execute in IRQ contexts as well!
  */
-static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
+static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
 {
-	multipath_conf_t *conf = mddev->private;
+	struct mpconf *conf = mddev->private;
 	char b[BDEVNAME_SIZE];
 
 	if (conf->raid_disks - mddev->degraded <= 1) {
@@ -218,7 +218,7 @@
 	       conf->raid_disks - mddev->degraded);
 }
 
-static void print_multipath_conf (multipath_conf_t *conf)
+static void print_multipath_conf (struct mpconf *conf)
 {
 	int i;
 	struct multipath_info *tmp;
@@ -242,9 +242,9 @@
 }
 
 
-static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
-	multipath_conf_t *conf = mddev->private;
+	struct mpconf *conf = mddev->private;
 	struct request_queue *q;
 	int err = -EEXIST;
 	int path;
@@ -291,11 +291,11 @@
 	return err;
 }
 
-static int multipath_remove_disk(mddev_t *mddev, int number)
+static int multipath_remove_disk(struct mddev *mddev, int number)
 {
-	multipath_conf_t *conf = mddev->private;
+	struct mpconf *conf = mddev->private;
 	int err = 0;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	struct multipath_info *p = conf->multipaths + number;
 
 	print_multipath_conf(conf);
@@ -335,12 +335,12 @@
 *	3.	Performs writes following reads for array synchronising.
  */
 
-static void multipathd (mddev_t *mddev)
+static void multipathd (struct mddev *mddev)
 {
 	struct multipath_bh *mp_bh;
 	struct bio *bio;
 	unsigned long flags;
-	multipath_conf_t *conf = mddev->private;
+	struct mpconf *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 
 	md_check_recovery(mddev);
@@ -379,7 +379,7 @@
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 }
 
-static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 {
 	WARN_ONCE(sectors || raid_disks,
 		  "%s does not support generic reshape\n", __func__);
@@ -387,12 +387,12 @@
 	return mddev->dev_sectors;
 }
 
-static int multipath_run (mddev_t *mddev)
+static int multipath_run (struct mddev *mddev)
 {
-	multipath_conf_t *conf;
+	struct mpconf *conf;
 	int disk_idx;
 	struct multipath_info *disk;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	int working_disks;
 
 	if (md_check_no_bitmap(mddev))
@@ -409,7 +409,7 @@
 	 * should be freed in multipath_stop()]
 	 */
 
-	conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
+	conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
 	mddev->private = conf;
 	if (!conf) {
 		printk(KERN_ERR 
@@ -510,9 +510,9 @@
 }
 
 
-static int multipath_stop (mddev_t *mddev)
+static int multipath_stop (struct mddev *mddev)
 {
-	multipath_conf_t *conf = mddev->private;
+	struct mpconf *conf = mddev->private;
 
 	md_unregister_thread(&mddev->thread);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -523,7 +523,7 @@
 	return 0;
 }
 
-static struct mdk_personality multipath_personality =
+static struct md_personality multipath_personality =
 {
 	.name		= "multipath",
 	.level		= LEVEL_MULTIPATH,
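
One pattern worth calling out in the multipath code above: an rdev pointer obtained with rcu_dereference() is only guaranteed to stay valid inside the rcu_read_lock() section, so multipath_map() bumps nr_pending before unlocking and the reference is dropped later with rdev_dec_pending(). A condensed restatement of that idea, assuming it sits next to the md headers; the helper name is made up.

#include <linux/rcupdate.h>
#include "md.h"
#include "multipath.h"

/* Pick the first in-sync path and pin it so it cannot go away under us. */
static struct md_rdev *pick_in_sync_rdev(struct mpconf *conf)
{
	struct md_rdev *rdev;
	int i;

	rcu_read_lock();
	for (i = 0; i < conf->raid_disks; i++) {
		rdev = rcu_dereference(conf->multipaths[i].rdev);
		if (rdev && test_bit(In_sync, &rdev->flags)) {
			/* take a reference before leaving the RCU section */
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			return rdev;
		}
	}
	rcu_read_unlock();
	return NULL;	/* no usable path */
}

/*
 * The completion path is responsible for dropping the pin, e.g.
 * rdev_dec_pending(rdev, conf->mddev); per the inline in md.h above,
 * that helper also checks the Faulty bit once the last pending I/O
 * on the device has finished.
 */
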
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
index 3c5a45e..717c60f 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/multipath.h
@@ -2,11 +2,11 @@
 #define _MULTIPATH_H
 
 struct multipath_info {
-	mdk_rdev_t	*rdev;
+	struct md_rdev	*rdev;
 };
 
-struct multipath_private_data {
-	mddev_t			*mddev;
+struct mpconf {
+	struct mddev			*mddev;
 	struct multipath_info	*multipaths;
 	int			raid_disks;
 	spinlock_t		device_lock;
@@ -15,8 +15,6 @@
 	mempool_t		*pool;
 };
 
-typedef struct multipath_private_data multipath_conf_t;
-
 /*
  * this is our 'private' 'collective' MULTIPATH buffer head.
  * it contains information about what kind of IO operations were started
@@ -24,7 +22,7 @@
  */
 
 struct multipath_bh {
-	mddev_t			*mddev;
+	struct mddev			*mddev;
 	struct bio		*master_bio;
 	struct bio		bio;
 	int			path;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e86bf36..0eb08a4 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -27,9 +27,9 @@
 
 static int raid0_congested(void *data, int bits)
 {
-	mddev_t *mddev = data;
-	raid0_conf_t *conf = mddev->private;
-	mdk_rdev_t **devlist = conf->devlist;
+	struct mddev *mddev = data;
+	struct r0conf *conf = mddev->private;
+	struct md_rdev **devlist = conf->devlist;
 	int raid_disks = conf->strip_zone[0].nb_dev;
 	int i, ret = 0;
 
@@ -47,52 +47,53 @@
 /*
  * inform the user of the raid configuration
 */
-static void dump_zones(mddev_t *mddev)
+static void dump_zones(struct mddev *mddev)
 {
-	int j, k, h;
+	int j, k;
 	sector_t zone_size = 0;
 	sector_t zone_start = 0;
 	char b[BDEVNAME_SIZE];
-	raid0_conf_t *conf = mddev->private;
+	struct r0conf *conf = mddev->private;
 	int raid_disks = conf->strip_zone[0].nb_dev;
-	printk(KERN_INFO "******* %s configuration *********\n",
-		mdname(mddev));
-	h = 0;
+	printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n",
+	       mdname(mddev),
+	       conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s");
 	for (j = 0; j < conf->nr_strip_zones; j++) {
-		printk(KERN_INFO "zone%d=[", j);
+		printk(KERN_INFO "md: zone%d=[", j);
 		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
-			printk(KERN_CONT "%s/",
+			printk(KERN_CONT "%s%s", k?"/":"",
 			bdevname(conf->devlist[j*raid_disks
 						+ k]->bdev, b));
 		printk(KERN_CONT "]\n");
 
 		zone_size  = conf->strip_zone[j].zone_end - zone_start;
-		printk(KERN_INFO "        zone offset=%llukb "
-				"device offset=%llukb size=%llukb\n",
+		printk(KERN_INFO "      zone-offset=%10lluKB, "
+				"device-offset=%10lluKB, size=%10lluKB\n",
 			(unsigned long long)zone_start>>1,
 			(unsigned long long)conf->strip_zone[j].dev_start>>1,
 			(unsigned long long)zone_size>>1);
 		zone_start = conf->strip_zone[j].zone_end;
 	}
-	printk(KERN_INFO "**********************************\n\n");
+	printk(KERN_INFO "\n");
 }
 
-static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
+static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 {
 	int i, c, err;
 	sector_t curr_zone_end, sectors;
-	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev;
+	struct md_rdev *smallest, *rdev1, *rdev2, *rdev, **dev;
 	struct strip_zone *zone;
 	int cnt;
 	char b[BDEVNAME_SIZE];
-	raid0_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+	char b2[BDEVNAME_SIZE];
+	struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
 
 	if (!conf)
 		return -ENOMEM;
 	list_for_each_entry(rdev1, &mddev->disks, same_set) {
-		printk(KERN_INFO "md/raid0:%s: looking at %s\n",
-		       mdname(mddev),
-		       bdevname(rdev1->bdev, b));
+		pr_debug("md/raid0:%s: looking at %s\n",
+			 mdname(mddev),
+			 bdevname(rdev1->bdev, b));
 		c = 0;
 
 		/* round size to chunk_size */
@@ -101,16 +102,16 @@
 		rdev1->sectors = sectors * mddev->chunk_sectors;
 
 		list_for_each_entry(rdev2, &mddev->disks, same_set) {
-			printk(KERN_INFO "md/raid0:%s:   comparing %s(%llu)",
-			       mdname(mddev),
-			       bdevname(rdev1->bdev,b),
-			       (unsigned long long)rdev1->sectors);
-			printk(KERN_CONT " with %s(%llu)\n",
-			       bdevname(rdev2->bdev,b),
-			       (unsigned long long)rdev2->sectors);
+			pr_debug("md/raid0:%s:   comparing %s(%llu)"
+				 " with %s(%llu)\n",
+				 mdname(mddev),
+				 bdevname(rdev1->bdev,b),
+				 (unsigned long long)rdev1->sectors,
+				 bdevname(rdev2->bdev,b2),
+				 (unsigned long long)rdev2->sectors);
 			if (rdev2 == rdev1) {
-				printk(KERN_INFO "md/raid0:%s:   END\n",
-				       mdname(mddev));
+				pr_debug("md/raid0:%s:   END\n",
+					 mdname(mddev));
 				break;
 			}
 			if (rdev2->sectors == rdev1->sectors) {
@@ -118,30 +119,30 @@
 				 * Not unique, don't count it as a new
 				 * group
 				 */
-				printk(KERN_INFO "md/raid0:%s:   EQUAL\n",
-				       mdname(mddev));
+				pr_debug("md/raid0:%s:   EQUAL\n",
+					 mdname(mddev));
 				c = 1;
 				break;
 			}
-			printk(KERN_INFO "md/raid0:%s:   NOT EQUAL\n",
-			       mdname(mddev));
+			pr_debug("md/raid0:%s:   NOT EQUAL\n",
+				 mdname(mddev));
 		}
 		if (!c) {
-			printk(KERN_INFO "md/raid0:%s:   ==> UNIQUE\n",
-			       mdname(mddev));
+			pr_debug("md/raid0:%s:   ==> UNIQUE\n",
+				 mdname(mddev));
 			conf->nr_strip_zones++;
-			printk(KERN_INFO "md/raid0:%s: %d zones\n",
-			       mdname(mddev), conf->nr_strip_zones);
+			pr_debug("md/raid0:%s: %d zones\n",
+				 mdname(mddev), conf->nr_strip_zones);
 		}
 	}
-	printk(KERN_INFO "md/raid0:%s: FINAL %d zones\n",
-	       mdname(mddev), conf->nr_strip_zones);
+	pr_debug("md/raid0:%s: FINAL %d zones\n",
+		 mdname(mddev), conf->nr_strip_zones);
 	err = -ENOMEM;
 	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
 				conf->nr_strip_zones, GFP_KERNEL);
 	if (!conf->strip_zone)
 		goto abort;
-	conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
+	conf->devlist = kzalloc(sizeof(struct md_rdev*)*
 				conf->nr_strip_zones*mddev->raid_disks,
 				GFP_KERNEL);
 	if (!conf->devlist)
@@ -218,44 +219,45 @@
 		zone = conf->strip_zone + i;
 		dev = conf->devlist + i * mddev->raid_disks;
 
-		printk(KERN_INFO "md/raid0:%s: zone %d\n",
-		       mdname(mddev), i);
+		pr_debug("md/raid0:%s: zone %d\n", mdname(mddev), i);
 		zone->dev_start = smallest->sectors;
 		smallest = NULL;
 		c = 0;
 
 		for (j=0; j<cnt; j++) {
 			rdev = conf->devlist[j];
-			printk(KERN_INFO "md/raid0:%s: checking %s ...",
-			       mdname(mddev),
-			       bdevname(rdev->bdev, b));
 			if (rdev->sectors <= zone->dev_start) {
-				printk(KERN_CONT " nope.\n");
+				pr_debug("md/raid0:%s: checking %s ... nope\n",
+					 mdname(mddev),
+					 bdevname(rdev->bdev, b));
 				continue;
 			}
-			printk(KERN_CONT " contained as device %d\n", c);
+			pr_debug("md/raid0:%s: checking %s ..."
+				 " contained as device %d\n",
+				 mdname(mddev),
+				 bdevname(rdev->bdev, b), c);
 			dev[c] = rdev;
 			c++;
 			if (!smallest || rdev->sectors < smallest->sectors) {
 				smallest = rdev;
-				printk(KERN_INFO "md/raid0:%s:  (%llu) is smallest!.\n",
-				       mdname(mddev),
-				       (unsigned long long)rdev->sectors);
+				pr_debug("md/raid0:%s:  (%llu) is smallest!.\n",
+					 mdname(mddev),
+					 (unsigned long long)rdev->sectors);
 			}
 		}
 
 		zone->nb_dev = c;
 		sectors = (smallest->sectors - zone->dev_start) * c;
-		printk(KERN_INFO "md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n",
-		       mdname(mddev),
-		       zone->nb_dev, (unsigned long long)sectors);
+		pr_debug("md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n",
+			 mdname(mddev),
+			 zone->nb_dev, (unsigned long long)sectors);
 
 		curr_zone_end += sectors;
 		zone->zone_end = curr_zone_end;
 
-		printk(KERN_INFO "md/raid0:%s: current zone start: %llu\n",
-		       mdname(mddev),
-		       (unsigned long long)smallest->sectors);
+		pr_debug("md/raid0:%s: current zone start: %llu\n",
+			 mdname(mddev),
+			 (unsigned long long)smallest->sectors);
 	}
 	mddev->queue->backing_dev_info.congested_fn = raid0_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
@@ -275,7 +277,7 @@
 	blk_queue_io_opt(mddev->queue,
 			 (mddev->chunk_sectors << 9) * mddev->raid_disks);
 
-	printk(KERN_INFO "md/raid0:%s: done.\n", mdname(mddev));
+	pr_debug("md/raid0:%s: done.\n", mdname(mddev));
 	*private_conf = conf;
 
 	return 0;
@@ -299,7 +301,7 @@
 				struct bvec_merge_data *bvm,
 				struct bio_vec *biovec)
 {
-	mddev_t *mddev = q->queuedata;
+	struct mddev *mddev = q->queuedata;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
 	unsigned int chunk_sectors = mddev->chunk_sectors;
@@ -318,10 +320,10 @@
 		return max;
 }
 
-static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 {
 	sector_t array_sectors = 0;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	WARN_ONCE(sectors || raid_disks,
 		  "%s does not support generic reshape\n", __func__);
@@ -332,9 +334,9 @@
 	return array_sectors;
 }
 
-static int raid0_run(mddev_t *mddev)
+static int raid0_run(struct mddev *mddev)
 {
-	raid0_conf_t *conf;
+	struct r0conf *conf;
 	int ret;
 
 	if (mddev->chunk_sectors == 0) {
@@ -382,9 +384,9 @@
 	return md_integrity_register(mddev);
 }
 
-static int raid0_stop(mddev_t *mddev)
+static int raid0_stop(struct mddev *mddev)
 {
-	raid0_conf_t *conf = mddev->private;
+	struct r0conf *conf = mddev->private;
 
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	kfree(conf->strip_zone);
@@ -397,7 +399,7 @@
 /* Find the zone which holds a particular offset
  * Update *sectorp to be an offset in that zone
  */
-static struct strip_zone *find_zone(struct raid0_private_data *conf,
+static struct strip_zone *find_zone(struct r0conf *conf,
 				    sector_t *sectorp)
 {
 	int i;
@@ -417,12 +419,12 @@
  * remaps the bio to the target device. we separate two flows.
 * power 2 flow and a general flow for the sake of performance

 */
-static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
+static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
 				sector_t sector, sector_t *sector_offset)
 {
 	unsigned int sect_in_chunk;
 	sector_t chunk;
-	raid0_conf_t *conf = mddev->private;
+	struct r0conf *conf = mddev->private;
 	int raid_disks = conf->strip_zone[0].nb_dev;
 	unsigned int chunk_sects = mddev->chunk_sectors;
 
@@ -453,7 +455,7 @@
 /*
 * Is io distributed over 1 or more chunks ?
 */
-static inline int is_io_in_chunk_boundary(mddev_t *mddev,
+static inline int is_io_in_chunk_boundary(struct mddev *mddev,
 			unsigned int chunk_sects, struct bio *bio)
 {
 	if (likely(is_power_of_2(chunk_sects))) {
@@ -466,12 +468,12 @@
 	}
 }
 
-static int raid0_make_request(mddev_t *mddev, struct bio *bio)
+static int raid0_make_request(struct mddev *mddev, struct bio *bio)
 {
 	unsigned int chunk_sects;
 	sector_t sector_offset;
 	struct strip_zone *zone;
-	mdk_rdev_t *tmp_dev;
+	struct md_rdev *tmp_dev;
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
@@ -526,43 +528,16 @@
 	return 0;
 }
 
-static void raid0_status(struct seq_file *seq, mddev_t *mddev)
+static void raid0_status(struct seq_file *seq, struct mddev *mddev)
 {
-#undef MD_DEBUG
-#ifdef MD_DEBUG
-	int j, k, h;
-	char b[BDEVNAME_SIZE];
-	raid0_conf_t *conf = mddev->private;
-	int raid_disks = conf->strip_zone[0].nb_dev;
-
-	sector_t zone_size;
-	sector_t zone_start = 0;
-	h = 0;
-
-	for (j = 0; j < conf->nr_strip_zones; j++) {
-		seq_printf(seq, "      z%d", j);
-		seq_printf(seq, "=[");
-		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
-			seq_printf(seq, "%s/", bdevname(
-				conf->devlist[j*raid_disks + k]
-						->bdev, b));
-
-		zone_size  = conf->strip_zone[j].zone_end - zone_start;
-		seq_printf(seq, "] ze=%lld ds=%lld s=%lld\n",
-			(unsigned long long)zone_start>>1,
-			(unsigned long long)conf->strip_zone[j].dev_start>>1,
-			(unsigned long long)zone_size>>1);
-		zone_start = conf->strip_zone[j].zone_end;
-	}
-#endif
 	seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2);
 	return;
 }
 
-static void *raid0_takeover_raid45(mddev_t *mddev)
+static void *raid0_takeover_raid45(struct mddev *mddev)
 {
-	mdk_rdev_t *rdev;
-	raid0_conf_t *priv_conf;
+	struct md_rdev *rdev;
+	struct r0conf *priv_conf;
 
 	if (mddev->degraded != 1) {
 		printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
@@ -593,9 +568,9 @@
 	return priv_conf;
 }
 
-static void *raid0_takeover_raid10(mddev_t *mddev)
+static void *raid0_takeover_raid10(struct mddev *mddev)
 {
-	raid0_conf_t *priv_conf;
+	struct r0conf *priv_conf;
 
 	/* Check layout:
 	 *  - far_copies must be 1
@@ -634,9 +609,9 @@
 	return priv_conf;
 }
 
-static void *raid0_takeover_raid1(mddev_t *mddev)
+static void *raid0_takeover_raid1(struct mddev *mddev)
 {
-	raid0_conf_t *priv_conf;
+	struct r0conf *priv_conf;
 
 	/* Check layout:
 	 *  - (N - 1) mirror drives must be already faulty
@@ -660,7 +635,7 @@
 	return priv_conf;
 }
 
-static void *raid0_takeover(mddev_t *mddev)
+static void *raid0_takeover(struct mddev *mddev)
 {
 	/* raid0 can take over:
 	 *  raid4 - if all data disks are active.
@@ -691,11 +666,11 @@
 	return ERR_PTR(-EINVAL);
 }
 
-static void raid0_quiesce(mddev_t *mddev, int state)
+static void raid0_quiesce(struct mddev *mddev, int state)
 {
 }
 
-static struct mdk_personality raid0_personality=
+static struct md_personality raid0_personality=
 {
 	.name		= "raid0",
 	.level		= 0,
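
The raid0.c conversion above swaps the verbose KERN_INFO zone-construction chatter for pr_debug(), so it costs nothing in normal builds but is still available when debugging. A small illustration of that convention follows; the pr_fmt prefix and message text here are examples, not lines from the patch.

/* Define pr_fmt before any include so pr_*() calls pick up the prefix. */
#define pr_fmt(fmt) "md/example: " fmt

#include <linux/kernel.h>
#include <linux/printk.h>

static void report_zone(int zone, unsigned long long start_kb)
{
	/* Always printed, at KERN_INFO. */
	pr_info("zone%d starts at %lluKB\n", zone, start_kb);

	/*
	 * Compiled out unless DEBUG is defined for this file; with
	 * CONFIG_DYNAMIC_DEBUG it can instead be switched on at run time.
	 */
	pr_debug("zone%d debug detail: start=%lluKB\n", zone, start_kb);
}

With dynamic debug enabled (and debugfs mounted), something like
echo 'file raid0.c +p' > /sys/kernel/debug/dynamic_debug/control
turns such messages back on without rebuilding the module.
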
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 91f8e87..0884bba 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -1,20 +1,16 @@
 #ifndef _RAID0_H
 #define _RAID0_H
 
-struct strip_zone
-{
+struct strip_zone {
 	sector_t zone_end;	/* Start of the next zone (in sectors) */
 	sector_t dev_start;	/* Zone offset in real dev (in sectors) */
 	int nb_dev;		/* # of devices attached to the zone */
 };
 
-struct raid0_private_data
-{
+struct r0conf {
 	struct strip_zone *strip_zone;
-	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
+	struct md_rdev **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
 	int nr_strip_zones;
 };
 
-typedef struct raid0_private_data raid0_conf_t;
-
 #endif
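
Since raid0.h above now exposes struct r0conf directly, it is easier to see how the zone table is meant to be walked: strip_zone entries are built in ascending zone_end order, and find_zone() in raid0.c returns the zone covering a sector while rewriting the sector as an offset into that zone. A hedged restatement of that lookup, with an invented function name and without the error handling of the real code:

#include <linux/types.h>
#include "md.h"
#include "raid0.h"

/*
 * Return the zone containing *sectorp and rewrite *sectorp as an
 * offset within that zone -- the same contract as find_zone() above.
 */
static struct strip_zone *lookup_zone(struct r0conf *conf, sector_t *sectorp)
{
	sector_t zone_start = 0;
	int i;

	for (i = 0; i < conf->nr_strip_zones; i++) {
		struct strip_zone *z = &conf->strip_zone[i];

		if (*sectorp < z->zone_end) {
			*sectorp -= zone_start;	/* offset inside the zone */
			return z;
		}
		zone_start = z->zone_end;
	}
	return NULL;	/* past the end of the array */
}
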
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d9587df..4602fc5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -40,22 +40,24 @@
 #include "raid1.h"
 #include "bitmap.h"
 
-#define DEBUG 0
-#define PRINTK(x...) do { if (DEBUG) printk(x); } while (0)
-
 /*
  * Number of guaranteed r1bios in case of extreme VM load:
  */
 #define	NR_RAID1_BIOS 256
 
+/* When there are this many requests queued to be written by
+ * the raid1 thread, we become 'congested' to provide back-pressure
+ * for writeback.
+ */
+static int max_queued_requests = 1024;
 
-static void allow_barrier(conf_t *conf);
-static void lower_barrier(conf_t *conf);
+static void allow_barrier(struct r1conf *conf);
+static void lower_barrier(struct r1conf *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct pool_info *pi = data;
-	int size = offsetof(r1bio_t, bios[pi->raid_disks]);
+	int size = offsetof(struct r1bio, bios[pi->raid_disks]);
 
 	/* allocate a r1bio with room for raid_disks entries in the bios array */
 	return kzalloc(size, gfp_flags);
@@ -76,7 +78,7 @@
 {
 	struct pool_info *pi = data;
 	struct page *page;
-	r1bio_t *r1_bio;
+	struct r1bio *r1_bio;
 	struct bio *bio;
 	int i, j;
 
@@ -142,7 +144,7 @@
 {
 	struct pool_info *pi = data;
 	int i,j;
-	r1bio_t *r1bio = __r1_bio;
+	struct r1bio *r1bio = __r1_bio;
 
 	for (i = 0; i < RESYNC_PAGES; i++)
 		for (j = pi->raid_disks; j-- ;) {
@@ -157,7 +159,7 @@
 	r1bio_pool_free(r1bio, data);
 }
 
-static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
+static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
 {
 	int i;
 
@@ -169,17 +171,17 @@
 	}
 }
 
-static void free_r1bio(r1bio_t *r1_bio)
+static void free_r1bio(struct r1bio *r1_bio)
 {
-	conf_t *conf = r1_bio->mddev->private;
+	struct r1conf *conf = r1_bio->mddev->private;
 
 	put_all_bios(conf, r1_bio);
 	mempool_free(r1_bio, conf->r1bio_pool);
 }
 
-static void put_buf(r1bio_t *r1_bio)
+static void put_buf(struct r1bio *r1_bio)
 {
-	conf_t *conf = r1_bio->mddev->private;
+	struct r1conf *conf = r1_bio->mddev->private;
 	int i;
 
 	for (i=0; i<conf->raid_disks; i++) {
@@ -193,11 +195,11 @@
 	lower_barrier(conf);
 }
 
-static void reschedule_retry(r1bio_t *r1_bio)
+static void reschedule_retry(struct r1bio *r1_bio)
 {
 	unsigned long flags;
-	mddev_t *mddev = r1_bio->mddev;
-	conf_t *conf = mddev->private;
+	struct mddev *mddev = r1_bio->mddev;
+	struct r1conf *conf = mddev->private;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r1_bio->retry_list, &conf->retry_list);
@@ -213,11 +215,11 @@
  * operation and are ready to return a success/failure code to the buffer
  * cache layer.
  */
-static void call_bio_endio(r1bio_t *r1_bio)
+static void call_bio_endio(struct r1bio *r1_bio)
 {
 	struct bio *bio = r1_bio->master_bio;
 	int done;
-	conf_t *conf = r1_bio->mddev->private;
+	struct r1conf *conf = r1_bio->mddev->private;
 
 	if (bio->bi_phys_segments) {
 		unsigned long flags;
@@ -240,17 +242,17 @@
 	}
 }
 
-static void raid_end_bio_io(r1bio_t *r1_bio)
+static void raid_end_bio_io(struct r1bio *r1_bio)
 {
 	struct bio *bio = r1_bio->master_bio;
 
 	/* if nobody has done the final endio yet, do it now */
 	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
-		PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
-			(bio_data_dir(bio) == WRITE) ? "write" : "read",
-			(unsigned long long) bio->bi_sector,
-			(unsigned long long) bio->bi_sector +
-				(bio->bi_size >> 9) - 1);
+		pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
+			 (bio_data_dir(bio) == WRITE) ? "write" : "read",
+			 (unsigned long long) bio->bi_sector,
+			 (unsigned long long) bio->bi_sector +
+			 (bio->bi_size >> 9) - 1);
 
 		call_bio_endio(r1_bio);
 	}
@@ -260,20 +262,38 @@
 /*
  * Update disk head position estimator based on IRQ completion info.
  */
-static inline void update_head_pos(int disk, r1bio_t *r1_bio)
+static inline void update_head_pos(int disk, struct r1bio *r1_bio)
 {
-	conf_t *conf = r1_bio->mddev->private;
+	struct r1conf *conf = r1_bio->mddev->private;
 
 	conf->mirrors[disk].head_position =
 		r1_bio->sector + (r1_bio->sectors);
 }
 
+/*
+ * Find the disk number which triggered given bio
+ */
+static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
+{
+	int mirror;
+	int raid_disks = r1_bio->mddev->raid_disks;
+
+	for (mirror = 0; mirror < raid_disks; mirror++)
+		if (r1_bio->bios[mirror] == bio)
+			break;
+
+	BUG_ON(mirror == raid_disks);
+	update_head_pos(mirror, r1_bio);
+
+	return mirror;
+}
+
 static void raid1_end_read_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	r1bio_t *r1_bio = bio->bi_private;
+	struct r1bio *r1_bio = bio->bi_private;
 	int mirror;
-	conf_t *conf = r1_bio->mddev->private;
+	struct r1conf *conf = r1_bio->mddev->private;
 
 	mirror = r1_bio->read_disk;
 	/*
@@ -318,7 +338,7 @@
 	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 }
 
-static void close_write(r1bio_t *r1_bio)
+static void close_write(struct r1bio *r1_bio)
 {
 	/* it really is the end of this request */
 	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
@@ -337,7 +357,7 @@
 	md_write_end(r1_bio->mddev);
 }
 
-static void r1_bio_write_done(r1bio_t *r1_bio)
+static void r1_bio_write_done(struct r1bio *r1_bio)
 {
 	if (!atomic_dec_and_test(&r1_bio->remaining))
 		return;
@@ -356,15 +376,12 @@
 static void raid1_end_write_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	r1bio_t *r1_bio = bio->bi_private;
+	struct r1bio *r1_bio = bio->bi_private;
 	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
-	conf_t *conf = r1_bio->mddev->private;
+	struct r1conf *conf = r1_bio->mddev->private;
 	struct bio *to_put = NULL;
 
-
-	for (mirror = 0; mirror < conf->raid_disks; mirror++)
-		if (r1_bio->bios[mirror] == bio)
-			break;
+	mirror = find_bio_disk(r1_bio, bio);
 
 	/*
 	 * 'one mirror IO has finished' event handler:
@@ -400,8 +417,6 @@
 		}
 	}
 
-	update_head_pos(mirror, r1_bio);
-
 	if (behind) {
 		if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
 			atomic_dec(&r1_bio->behind_remaining);
@@ -418,10 +433,11 @@
 			/* Maybe we can return now */
 			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
 				struct bio *mbio = r1_bio->master_bio;
-				PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
-				       (unsigned long long) mbio->bi_sector,
-				       (unsigned long long) mbio->bi_sector +
-				       (mbio->bi_size >> 9) - 1);
+				pr_debug("raid1: behind end write sectors"
+					 " %llu-%llu\n",
+					 (unsigned long long) mbio->bi_sector,
+					 (unsigned long long) mbio->bi_sector +
+					 (mbio->bi_size >> 9) - 1);
 				call_bio_endio(r1_bio);
 			}
 		}
@@ -455,7 +471,7 @@
  *
  * The rdev for the device selected will have nr_pending incremented.
  */
-static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors)
+static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
 {
 	const sector_t this_sector = r1_bio->sector;
 	int sectors;
@@ -464,7 +480,7 @@
 	int best_disk;
 	int i;
 	sector_t best_dist;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	int choose_first;
 
 	rcu_read_lock();
@@ -582,14 +598,18 @@
 	return best_disk;
 }
 
-int md_raid1_congested(mddev_t *mddev, int bits)
+int md_raid1_congested(struct mddev *mddev, int bits)
 {
-	conf_t *conf = mddev->private;
+	struct r1conf *conf = mddev->private;
 	int i, ret = 0;
 
+	if ((bits & (1 << BDI_async_congested)) &&
+	    conf->pending_count >= max_queued_requests)
+		return 1;
+
 	rcu_read_lock();
 	for (i = 0; i < mddev->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 			struct request_queue *q = bdev_get_queue(rdev->bdev);
 
@@ -611,13 +631,13 @@
 
 static int raid1_congested(void *data, int bits)
 {
-	mddev_t *mddev = data;
+	struct mddev *mddev = data;
 
 	return mddev_congested(mddev, bits) ||
 		md_raid1_congested(mddev, bits);
 }
 
-static void flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(struct r1conf *conf)
 {
 	/* Any writes that have been queued but are awaiting
 	 * bitmap updates get flushed here.
@@ -627,10 +647,12 @@
 	if (conf->pending_bio_list.head) {
 		struct bio *bio;
 		bio = bio_list_get(&conf->pending_bio_list);
+		conf->pending_count = 0;
 		spin_unlock_irq(&conf->device_lock);
 		/* flush any pending bitmap writes to
 		 * disk before proceeding w/ I/O */
 		bitmap_unplug(conf->mddev->bitmap);
+		wake_up(&conf->wait_barrier);
 
 		while (bio) { /* submit pending writes */
 			struct bio *next = bio->bi_next;
@@ -665,7 +687,7 @@
  */
 #define RESYNC_DEPTH 32
 
-static void raise_barrier(conf_t *conf)
+static void raise_barrier(struct r1conf *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
 
@@ -684,7 +706,7 @@
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static void lower_barrier(conf_t *conf)
+static void lower_barrier(struct r1conf *conf)
 {
 	unsigned long flags;
 	BUG_ON(conf->barrier <= 0);
@@ -694,7 +716,7 @@
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(conf_t *conf)
+static void wait_barrier(struct r1conf *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
 	if (conf->barrier) {
@@ -708,7 +730,7 @@
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static void allow_barrier(conf_t *conf)
+static void allow_barrier(struct r1conf *conf)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&conf->resync_lock, flags);
@@ -717,7 +739,7 @@
 	wake_up(&conf->wait_barrier);
 }
 
-static void freeze_array(conf_t *conf)
+static void freeze_array(struct r1conf *conf)
 {
 	/* stop syncio and normal IO and wait for everything to
 	 * go quiet.
@@ -740,7 +762,7 @@
 			    flush_pending_writes(conf));
 	spin_unlock_irq(&conf->resync_lock);
 }
-static void unfreeze_array(conf_t *conf)
+static void unfreeze_array(struct r1conf *conf)
 {
 	/* reverse the effect of the freeze */
 	spin_lock_irq(&conf->resync_lock);
@@ -753,7 +775,7 @@
 
 /* duplicate the data pages for behind I/O 
  */
-static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
+static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
 {
 	int i;
 	struct bio_vec *bvec;
@@ -782,14 +804,14 @@
 		if (bvecs[i].bv_page)
 			put_page(bvecs[i].bv_page);
 	kfree(bvecs);
-	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
+	pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 }
 
-static int make_request(mddev_t *mddev, struct bio * bio)
+static int make_request(struct mddev *mddev, struct bio * bio)
 {
-	conf_t *conf = mddev->private;
-	mirror_info_t *mirror;
-	r1bio_t *r1_bio;
+	struct r1conf *conf = mddev->private;
+	struct mirror_info *mirror;
+	struct r1bio *r1_bio;
 	struct bio *read_bio;
 	int i, disks;
 	struct bitmap *bitmap;
@@ -797,7 +819,7 @@
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
-	mdk_rdev_t *blocked_rdev;
+	struct md_rdev *blocked_rdev;
 	int plugged;
 	int first_clone;
 	int sectors_handled;
@@ -934,6 +956,11 @@
 	/*
 	 * WRITE:
 	 */
+	if (conf->pending_count >= max_queued_requests) {
+		md_wakeup_thread(mddev->thread);
+		wait_event(conf->wait_barrier,
+			   conf->pending_count < max_queued_requests);
+	}
 	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
@@ -952,7 +979,7 @@
 	rcu_read_lock();
 	max_sectors = r1_bio->sectors;
 	for (i = 0;  i < disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 			atomic_inc(&rdev->nr_pending);
 			blocked_rdev = rdev;
@@ -1097,6 +1124,7 @@
 		atomic_inc(&r1_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
+		conf->pending_count++;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	/* Mustn't call r1_bio_write_done before this next test,
@@ -1127,16 +1155,16 @@
 	return 0;
 }
 
-static void status(struct seq_file *seq, mddev_t *mddev)
+static void status(struct seq_file *seq, struct mddev *mddev)
 {
-	conf_t *conf = mddev->private;
+	struct r1conf *conf = mddev->private;
 	int i;
 
 	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
 		   conf->raid_disks - mddev->degraded);
 	rcu_read_lock();
 	for (i = 0; i < conf->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		seq_printf(seq, "%s",
 			   rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
 	}
@@ -1145,10 +1173,10 @@
 }
 
 
-static void error(mddev_t *mddev, mdk_rdev_t *rdev)
+static void error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
-	conf_t *conf = mddev->private;
+	struct r1conf *conf = mddev->private;
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
@@ -1188,7 +1216,7 @@
 	       mdname(mddev), conf->raid_disks - mddev->degraded);
 }
 
-static void print_conf(conf_t *conf)
+static void print_conf(struct r1conf *conf)
 {
 	int i;
 
@@ -1203,7 +1231,7 @@
 	rcu_read_lock();
 	for (i = 0; i < conf->raid_disks; i++) {
 		char b[BDEVNAME_SIZE];
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev)
 			printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
 			       i, !test_bit(In_sync, &rdev->flags),
@@ -1213,7 +1241,7 @@
 	rcu_read_unlock();
 }
 
-static void close_sync(conf_t *conf)
+static void close_sync(struct r1conf *conf)
 {
 	wait_barrier(conf);
 	allow_barrier(conf);
@@ -1222,10 +1250,10 @@
 	conf->r1buf_pool = NULL;
 }
 
-static int raid1_spare_active(mddev_t *mddev)
+static int raid1_spare_active(struct mddev *mddev)
 {
 	int i;
-	conf_t *conf = mddev->private;
+	struct r1conf *conf = mddev->private;
 	int count = 0;
 	unsigned long flags;
 
@@ -1235,7 +1263,7 @@
 	 * Called under mddev lock, so rcu protection not needed.
 	 */
 	for (i = 0; i < conf->raid_disks; i++) {
-		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+		struct md_rdev *rdev = conf->mirrors[i].rdev;
 		if (rdev
 		    && !test_bit(Faulty, &rdev->flags)
 		    && !test_and_set_bit(In_sync, &rdev->flags)) {
@@ -1252,12 +1280,12 @@
 }
 
 
-static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
-	conf_t *conf = mddev->private;
+	struct r1conf *conf = mddev->private;
 	int err = -EEXIST;
 	int mirror = 0;
-	mirror_info_t *p;
+	struct mirror_info *p;
 	int first = 0;
 	int last = mddev->raid_disks - 1;
 
@@ -1300,12 +1328,12 @@
 	return err;
 }
 
-static int raid1_remove_disk(mddev_t *mddev, int number)
+static int raid1_remove_disk(struct mddev *mddev, int number)
 {
-	conf_t *conf = mddev->private;
+	struct r1conf *conf = mddev->private;
 	int err = 0;
-	mdk_rdev_t *rdev;
-	mirror_info_t *p = conf->mirrors+ number;
+	struct md_rdev *rdev;
+	struct mirror_info *p = conf->mirrors+ number;
 
 	print_conf(conf);
 	rdev = p->rdev;
@@ -1343,14 +1371,10 @@
 
 static void end_sync_read(struct bio *bio, int error)
 {
-	r1bio_t *r1_bio = bio->bi_private;
-	int i;
+	struct r1bio *r1_bio = bio->bi_private;
 
-	for (i=r1_bio->mddev->raid_disks; i--; )
-		if (r1_bio->bios[i] == bio)
-			break;
-	BUG_ON(i < 0);
-	update_head_pos(i, r1_bio);
+	update_head_pos(r1_bio->read_disk, r1_bio);
+
 	/*
 	 * we have read a block, now it needs to be re-written,
 	 * or re-read if the read failed.
@@ -1366,19 +1390,15 @@
 static void end_sync_write(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	r1bio_t *r1_bio = bio->bi_private;
-	mddev_t *mddev = r1_bio->mddev;
-	conf_t *conf = mddev->private;
-	int i;
+	struct r1bio *r1_bio = bio->bi_private;
+	struct mddev *mddev = r1_bio->mddev;
+	struct r1conf *conf = mddev->private;
 	int mirror=0;
 	sector_t first_bad;
 	int bad_sectors;
 
-	for (i = 0; i < conf->raid_disks; i++)
-		if (r1_bio->bios[i] == bio) {
-			mirror = i;
-			break;
-		}
+	mirror = find_bio_disk(r1_bio, bio);
+
 	if (!uptodate) {
 		sector_t sync_blocks = 0;
 		sector_t s = r1_bio->sector;
@@ -1404,8 +1424,6 @@
 		)
 		set_bit(R1BIO_MadeGood, &r1_bio->state);
 
-	update_head_pos(mirror, r1_bio);
-
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
 		int s = r1_bio->sectors;
 		if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
@@ -1418,7 +1436,7 @@
 	}
 }
 
-static int r1_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
+static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
 			    int sectors, struct page *page, int rw)
 {
 	if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
@@ -1432,7 +1450,7 @@
 	return 0;
 }
 
-static int fix_sync_read_error(r1bio_t *r1_bio)
+static int fix_sync_read_error(struct r1bio *r1_bio)
 {
 	/* Try some synchronous reads of other devices to get
 	 * good data, much like with normal read errors.  Only
@@ -1445,8 +1463,8 @@
 	 * made sure that anything with a bad block in range
 	 * will have bi_end_io clear.
 	 */
-	mddev_t *mddev = r1_bio->mddev;
-	conf_t *conf = mddev->private;
+	struct mddev *mddev = r1_bio->mddev;
+	struct r1conf *conf = mddev->private;
 	struct bio *bio = r1_bio->bios[r1_bio->read_disk];
 	sector_t sect = r1_bio->sector;
 	int sectors = r1_bio->sectors;
@@ -1456,7 +1474,7 @@
 		int s = sectors;
 		int d = r1_bio->read_disk;
 		int success = 0;
-		mdk_rdev_t *rdev;
+		struct md_rdev *rdev;
 		int start;
 
 		if (s > (PAGE_SIZE>>9))
@@ -1501,7 +1519,8 @@
 					abort = 1;
 			}
 			if (abort) {
-				mddev->recovery_disabled = 1;
+				conf->recovery_disabled =
+					mddev->recovery_disabled;
 				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 				md_done_sync(mddev, r1_bio->sectors, 0);
 				put_buf(r1_bio);
@@ -1552,7 +1571,7 @@
 	return 1;
 }
 
-static int process_checks(r1bio_t *r1_bio)
+static int process_checks(struct r1bio *r1_bio)
 {
 	/* We have read all readable devices.  If we haven't
 	 * got the block, then there is no hope left.
@@ -1561,8 +1580,8 @@
 	 * If any blocks failed to read, then we need to
 	 * attempt an over-write
 	 */
-	mddev_t *mddev = r1_bio->mddev;
-	conf_t *conf = mddev->private;
+	struct mddev *mddev = r1_bio->mddev;
+	struct r1conf *conf = mddev->private;
 	int primary;
 	int i;
 
@@ -1634,9 +1653,9 @@
 	return 0;
 }
 
-static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 {
-	conf_t *conf = mddev->private;
+	struct r1conf *conf = mddev->private;
 	int i;
 	int disks = conf->raid_disks;
 	struct bio *bio, *wbio;
@@ -1686,16 +1705,16 @@
  *	3.	Performs writes following reads for array synchronising.
  */
 
-static void fix_read_error(conf_t *conf, int read_disk,
+static void fix_read_error(struct r1conf *conf, int read_disk,
 			   sector_t sect, int sectors)
 {
-	mddev_t *mddev = conf->mddev;
+	struct mddev *mddev = conf->mddev;
 	while(sectors) {
 		int s = sectors;
 		int d = read_disk;
 		int success = 0;
 		int start;
-		mdk_rdev_t *rdev;
+		struct md_rdev *rdev;
 
 		if (s > (PAGE_SIZE>>9))
 			s = PAGE_SIZE >> 9;
@@ -1726,7 +1745,7 @@
 
 		if (!success) {
 			/* Cannot read from anywhere - mark it bad */
-			mdk_rdev_t *rdev = conf->mirrors[read_disk].rdev;
+			struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
 			if (!rdev_set_badblocks(rdev, sect, s, 0))
 				md_error(mddev, rdev);
 			break;
@@ -1789,11 +1808,11 @@
 	return test_bit(BIO_UPTODATE, &bio->bi_flags);
 }
 
-static int narrow_write_error(r1bio_t *r1_bio, int i)
+static int narrow_write_error(struct r1bio *r1_bio, int i)
 {
-	mddev_t *mddev = r1_bio->mddev;
-	conf_t *conf = mddev->private;
-	mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+	struct mddev *mddev = r1_bio->mddev;
+	struct r1conf *conf = mddev->private;
+	struct md_rdev *rdev = conf->mirrors[i].rdev;
 	int vcnt, idx;
 	struct bio_vec *vec;
 
@@ -1865,12 +1884,12 @@
 	return ok;
 }
 
-static void handle_sync_write_finished(conf_t *conf, r1bio_t *r1_bio)
+static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
 	int m;
 	int s = r1_bio->sectors;
 	for (m = 0; m < conf->raid_disks ; m++) {
-		mdk_rdev_t *rdev = conf->mirrors[m].rdev;
+		struct md_rdev *rdev = conf->mirrors[m].rdev;
 		struct bio *bio = r1_bio->bios[m];
 		if (bio->bi_end_io == NULL)
 			continue;
@@ -1888,12 +1907,12 @@
 	md_done_sync(conf->mddev, s, 1);
 }
 
-static void handle_write_finished(conf_t *conf, r1bio_t *r1_bio)
+static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
 	int m;
 	for (m = 0; m < conf->raid_disks ; m++)
 		if (r1_bio->bios[m] == IO_MADE_GOOD) {
-			mdk_rdev_t *rdev = conf->mirrors[m].rdev;
+			struct md_rdev *rdev = conf->mirrors[m].rdev;
 			rdev_clear_badblocks(rdev,
 					     r1_bio->sector,
 					     r1_bio->sectors);
@@ -1917,14 +1936,14 @@
 	raid_end_bio_io(r1_bio);
 }
 
-static void handle_read_error(conf_t *conf, r1bio_t *r1_bio)
+static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 {
 	int disk;
 	int max_sectors;
-	mddev_t *mddev = conf->mddev;
+	struct mddev *mddev = conf->mddev;
 	struct bio *bio;
 	char b[BDEVNAME_SIZE];
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	clear_bit(R1BIO_ReadError, &r1_bio->state);
 	/* we got a read error. Maybe the drive is bad.  Maybe just
@@ -2007,11 +2026,11 @@
 	}
 }
 
-static void raid1d(mddev_t *mddev)
+static void raid1d(struct mddev *mddev)
 {
-	r1bio_t *r1_bio;
+	struct r1bio *r1_bio;
 	unsigned long flags;
-	conf_t *conf = mddev->private;
+	struct r1conf *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 	struct blk_plug plug;
 
@@ -2028,7 +2047,7 @@
 			spin_unlock_irqrestore(&conf->device_lock, flags);
 			break;
 		}
-		r1_bio = list_entry(head->prev, r1bio_t, retry_list);
+		r1_bio = list_entry(head->prev, struct r1bio, retry_list);
 		list_del(head->prev);
 		conf->nr_queued--;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -2060,7 +2079,7 @@
 }
 
 
-static int init_resync(conf_t *conf)
+static int init_resync(struct r1conf *conf)
 {
 	int buffs;
 
@@ -2084,10 +2103,10 @@
  * that can be installed to exclude normal IO requests.
  */
 
-static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
-	conf_t *conf = mddev->private;
-	r1bio_t *r1_bio;
+	struct r1conf *conf = mddev->private;
+	struct r1bio *r1_bio;
 	struct bio *bio;
 	sector_t max_sector, nr_sectors;
 	int disk = -1;
@@ -2167,7 +2186,7 @@
 	set_bit(R1BIO_IsSync, &r1_bio->state);
 
 	for (i=0; i < conf->raid_disks; i++) {
-		mdk_rdev_t *rdev;
+		struct md_rdev *rdev;
 		bio = r1_bio->bios[i];
 
 		/* take from bio_init */
@@ -2239,7 +2258,7 @@
 		int ok = 1;
 		for (i = 0 ; i < conf->raid_disks ; i++)
 			if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
-				mdk_rdev_t *rdev =
+				struct md_rdev *rdev =
 					rcu_dereference(conf->mirrors[i].rdev);
 				ok = rdev_set_badblocks(rdev, sector_nr,
 							min_bad, 0
@@ -2356,7 +2375,7 @@
 	return nr_sectors;
 }
 
-static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 {
 	if (sectors)
 		return sectors;
@@ -2364,15 +2383,15 @@
 	return mddev->dev_sectors;
 }
 
-static conf_t *setup_conf(mddev_t *mddev)
+static struct r1conf *setup_conf(struct mddev *mddev)
 {
-	conf_t *conf;
+	struct r1conf *conf;
 	int i;
-	mirror_info_t *disk;
-	mdk_rdev_t *rdev;
+	struct mirror_info *disk;
+	struct md_rdev *rdev;
 	int err = -ENOMEM;
 
-	conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
+	conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
 	if (!conf)
 		goto abort;
 
@@ -2417,6 +2436,8 @@
 	init_waitqueue_head(&conf->wait_barrier);
 
 	bio_list_init(&conf->pending_bio_list);
+	conf->pending_count = 0;
+	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
 	conf->last_used = -1;
 	for (i = 0; i < conf->raid_disks; i++) {
@@ -2465,11 +2486,11 @@
 	return ERR_PTR(err);
 }
 
-static int run(mddev_t *mddev)
+static int run(struct mddev *mddev)
 {
-	conf_t *conf;
+	struct r1conf *conf;
 	int i;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	if (mddev->level != 1) {
 		printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
@@ -2545,9 +2566,9 @@
 	return md_integrity_register(mddev);
 }
 
-static int stop(mddev_t *mddev)
+static int stop(struct mddev *mddev)
 {
-	conf_t *conf = mddev->private;
+	struct r1conf *conf = mddev->private;
 	struct bitmap *bitmap = mddev->bitmap;
 
 	/* wait for behind writes to complete */
@@ -2572,7 +2593,7 @@
 	return 0;
 }
 
-static int raid1_resize(mddev_t *mddev, sector_t sectors)
+static int raid1_resize(struct mddev *mddev, sector_t sectors)
 {
 	/* no resync is happening, and there is enough space
 	 * on all devices, so we can resize.
@@ -2596,7 +2617,7 @@
 	return 0;
 }
 
-static int raid1_reshape(mddev_t *mddev)
+static int raid1_reshape(struct mddev *mddev)
 {
 	/* We need to:
 	 * 1/ resize the r1bio_pool
@@ -2611,8 +2632,8 @@
 	 */
 	mempool_t *newpool, *oldpool;
 	struct pool_info *newpoolinfo;
-	mirror_info_t *newmirrors;
-	conf_t *conf = mddev->private;
+	struct mirror_info *newmirrors;
+	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
 	int d, d2, err;
@@ -2668,7 +2689,7 @@
 	conf->r1bio_pool = newpool;
 
 	for (d = d2 = 0; d < conf->raid_disks; d++) {
-		mdk_rdev_t *rdev = conf->mirrors[d].rdev;
+		struct md_rdev *rdev = conf->mirrors[d].rdev;
 		if (rdev && rdev->raid_disk != d2) {
 			sysfs_unlink_rdev(mddev, rdev);
 			rdev->raid_disk = d2;
@@ -2702,9 +2723,9 @@
 	return 0;
 }
 
-static void raid1_quiesce(mddev_t *mddev, int state)
+static void raid1_quiesce(struct mddev *mddev, int state)
 {
-	conf_t *conf = mddev->private;
+	struct r1conf *conf = mddev->private;
 
 	switch(state) {
 	case 2: /* wake for suspend */
@@ -2719,13 +2740,13 @@
 	}
 }
 
-static void *raid1_takeover(mddev_t *mddev)
+static void *raid1_takeover(struct mddev *mddev)
 {
 	/* raid1 can take over:
 	 *  raid5 with 2 devices, any layout or chunk size
 	 */
 	if (mddev->level == 5 && mddev->raid_disks == 2) {
-		conf_t *conf;
+		struct r1conf *conf;
 		mddev->new_level = 1;
 		mddev->new_layout = 0;
 		mddev->new_chunk_sectors = 0;
@@ -2737,7 +2758,7 @@
 	return ERR_PTR(-EINVAL);
 }
 
-static struct mdk_personality raid1_personality =
+static struct md_personality raid1_personality =
 {
 	.name		= "raid1",
 	.level		= 1,
@@ -2775,3 +2796,5 @@
 MODULE_ALIAS("md-personality-3"); /* RAID1 */
 MODULE_ALIAS("md-raid1");
 MODULE_ALIAS("md-level-1");
+
+module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
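
The pending_count / max_queued_requests machinery added to raid1.c above is a producer/consumer back-pressure scheme: md_raid1_congested() reports async congestion once too many writes are queued for the raid1 thread, make_request() wakes the thread and sleeps on wait_barrier until the backlog shrinks, and flush_pending_writes() zeroes the count and wakes the sleepers. A stripped-down sketch of that shape, with invented names and the bio plumbing elided:

#include <linux/spinlock.h>
#include <linux/wait.h>

#define MAX_QUEUED	1024			/* cf. max_queued_requests */

struct write_backlog {
	spinlock_t		lock;
	int			pending;	/* cf. conf->pending_count */
	wait_queue_head_t	wait;		/* cf. conf->wait_barrier */
};

static void backlog_init(struct write_backlog *b)
{
	spin_lock_init(&b->lock);
	b->pending = 0;
	init_waitqueue_head(&b->wait);
}

/* Producer (make_request path): throttle once the backlog is full. */
static void backlog_queue(struct write_backlog *b)
{
	/* the real code first kicks the daemon with md_wakeup_thread() */
	wait_event(b->wait, b->pending < MAX_QUEUED);

	spin_lock_irq(&b->lock);
	b->pending++;
	/* ...add the bio to the pending list under the lock... */
	spin_unlock_irq(&b->lock);
}

/* Consumer (daemon thread): drain the whole list, then release writers. */
static void backlog_flush(struct write_backlog *b)
{
	spin_lock_irq(&b->lock);
	/* ...detach the full pending list for submission... */
	b->pending = 0;
	spin_unlock_irq(&b->lock);

	wake_up(&b->wait);
}
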
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e0d676b..c732b6c 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -1,10 +1,8 @@
 #ifndef _RAID1_H
 #define _RAID1_H
 
-typedef struct mirror_info mirror_info_t;
-
 struct mirror_info {
-	mdk_rdev_t	*rdev;
+	struct md_rdev	*rdev;
 	sector_t	head_position;
 };
 
@@ -17,61 +15,82 @@
  */
 
 struct pool_info {
-	mddev_t *mddev;
+	struct mddev *mddev;
 	int	raid_disks;
 };
 
-
-typedef struct r1bio_s r1bio_t;
-
-struct r1_private_data_s {
-	mddev_t			*mddev;
-	mirror_info_t		*mirrors;
+struct r1conf {
+	struct mddev		*mddev;
+	struct mirror_info		*mirrors;
 	int			raid_disks;
+
+	/* When choosing the best device for a read (read_balance())
+	 * we try to keep sequential reads on the same device
+	 * using 'last_used' and 'next_seq_sect'
+	 */
 	int			last_used;
 	sector_t		next_seq_sect;
+	/* During resync, read_balancing is only allowed on the part
+	 * of the array that has been resynced.  'next_resync' tells us
+	 * where that is.
+	 */
+	sector_t		next_resync;
+
 	spinlock_t		device_lock;
 
+	/* list of 'struct r1bio' that need to be processed by raid1d,
+	 * whether to retry a read, write out a resync or recovery
+	 * block, or anything else.
+	 */
 	struct list_head	retry_list;
-	/* queue pending writes and submit them on unplug */
+
+	/* queue pending writes to be submitted on unplug */
 	struct bio_list		pending_bio_list;
+	int			pending_count;
 
-	/* for use when syncing mirrors: */
-
+	/* for use when syncing mirrors:
+	 * We don't allow both normal IO and resync/recovery IO at
+	 * the same time - resync/recovery can only happen when there
+	 * is no other IO.  So when either is active, the other has to wait.
+	 * See a more detailed description in raid1.c near raise_barrier().
+	 */
+	wait_queue_head_t	wait_barrier;
 	spinlock_t		resync_lock;
 	int			nr_pending;
 	int			nr_waiting;
 	int			nr_queued;
 	int			barrier;
-	sector_t		next_resync;
-	int			fullsync;  /* set to 1 if a full sync is needed,
-					    * (fresh device added).
-					    * Cleared when a sync completes.
-					    */
-	int			recovery_disabled; /* when the same as
-						    * mddev->recovery_disabled
-						    * we don't allow recovery
-						    * to be attempted as we
-						    * expect a read error
-						    */
 
-	wait_queue_head_t	wait_barrier;
+	/* Set to 1 if a full sync is needed (fresh device added).
+	 * Cleared when a sync completes.
+	 */
+	int			fullsync;
 
+	/* When the same as mddev->recovery_disabled we don't allow
+	 * recovery to be attempted as we expect a read error.
+	 */
+	int			recovery_disabled;
+
+
+	/* poolinfo contains information about the content of the
+	 * mempools - it changes when the array grows or shrinks
+	 */
 	struct pool_info	*poolinfo;
+	mempool_t		*r1bio_pool;
+	mempool_t		*r1buf_pool;
 
+	/* temporary buffer for synchronous IO when attempting to repair
+	 * a read error.
+	 */
 	struct page		*tmppage;
 
-	mempool_t *r1bio_pool;
-	mempool_t *r1buf_pool;
 
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.
 	 */
-	struct mdk_thread_s	*thread;
+	struct md_thread	*thread;
 };
 
-typedef struct r1_private_data_s conf_t;
-
 /*
  * this is our 'private' RAID1 bio.
  *
@@ -79,7 +98,7 @@
  * for this RAID1 operation, and about their status:
  */
 
-struct r1bio_s {
+struct r1bio {
 	atomic_t		remaining; /* 'have we finished' count,
 					    * used from IRQ handlers
 					    */
@@ -89,7 +108,7 @@
 	sector_t		sector;
 	int			sectors;
 	unsigned long		state;
-	mddev_t			*mddev;
+	struct mddev		*mddev;
 	/*
 	 * original bio going to /dev/mdx
 	 */
@@ -148,6 +167,6 @@
 #define	R1BIO_MadeGood 7
 #define	R1BIO_WriteError 8
 
-extern int md_raid1_congested(mddev_t *mddev, int bits);
+extern int md_raid1_congested(struct mddev *mddev, int bits);
 
 #endif
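The comment block added to struct r1conf above summarises the barrier scheme: resync/recovery I/O and normal I/O never run at the same time, so whichever arrives while the other is active has to wait. A self-contained userspace sketch of that scheme, assuming pthreads; the function names mirror the kernel ones for readability, but this is a simplified illustration (no barrier depth, no nr_waiting fairness), not the raid1.c implementation:

/* Illustrative sketch, not kernel code. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t resync_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  barrier_cond = PTHREAD_COND_INITIALIZER;
static int barrier;		/* nonzero while resync/recovery is active */
static int nr_pending;		/* normal I/O currently in flight */

static void raise_barrier(void)	/* resync side: wait for normal I/O to drain */
{
	pthread_mutex_lock(&resync_lock);
	while (nr_pending)
		pthread_cond_wait(&barrier_cond, &resync_lock);
	barrier = 1;
	pthread_mutex_unlock(&resync_lock);
}

static void lower_barrier(void)	/* resync side: let normal I/O resume */
{
	pthread_mutex_lock(&resync_lock);
	barrier = 0;
	pthread_cond_broadcast(&barrier_cond);
	pthread_mutex_unlock(&resync_lock);
}

static void wait_barrier(void)	/* normal I/O entry point */
{
	pthread_mutex_lock(&resync_lock);
	while (barrier)
		pthread_cond_wait(&barrier_cond, &resync_lock);
	nr_pending++;
	pthread_mutex_unlock(&resync_lock);
}

static void allow_barrier(void)	/* normal I/O completion */
{
	pthread_mutex_lock(&resync_lock);
	nr_pending--;
	pthread_cond_broadcast(&barrier_cond);
	pthread_mutex_unlock(&resync_lock);
}

int main(void)
{
	wait_barrier();  allow_barrier();	/* one normal request */
	raise_barrier(); lower_barrier();	/* one resync window */
	printf("barrier=%d nr_pending=%d\n", barrier, nr_pending);
	return 0;
}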
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0cd9672..132c18e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -58,13 +58,19 @@
  */
 #define	NR_RAID10_BIOS 256
 
-static void allow_barrier(conf_t *conf);
-static void lower_barrier(conf_t *conf);
+/* When there are this many requests queued to be written by
+ * the raid10 thread, we become 'congested' to provide back-pressure
+ * for writeback.
+ */
+static int max_queued_requests = 1024;
+
+static void allow_barrier(struct r10conf *conf);
+static void lower_barrier(struct r10conf *conf);
 
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
-	conf_t *conf = data;
-	int size = offsetof(struct r10bio_s, devs[conf->copies]);
+	struct r10conf *conf = data;
+	int size = offsetof(struct r10bio, devs[conf->copies]);
 
 	/* allocate a r10bio with room for raid_disks entries in the bios array */
 	return kzalloc(size, gfp_flags);
@@ -92,9 +98,9 @@
  */
 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
-	conf_t *conf = data;
+	struct r10conf *conf = data;
 	struct page *page;
-	r10bio_t *r10_bio;
+	struct r10bio *r10_bio;
 	struct bio *bio;
 	int i, j;
 	int nalloc;
@@ -158,8 +164,8 @@
 static void r10buf_pool_free(void *__r10_bio, void *data)
 {
 	int i;
-	conf_t *conf = data;
-	r10bio_t *r10bio = __r10_bio;
+	struct r10conf *conf = data;
+	struct r10bio *r10bio = __r10_bio;
 	int j;
 
 	for (j=0; j < conf->copies; j++) {
@@ -175,7 +181,7 @@
 	r10bio_pool_free(r10bio, conf);
 }
 
-static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
+static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
 {
 	int i;
 
@@ -187,28 +193,28 @@
 	}
 }
 
-static void free_r10bio(r10bio_t *r10_bio)
+static void free_r10bio(struct r10bio *r10_bio)
 {
-	conf_t *conf = r10_bio->mddev->private;
+	struct r10conf *conf = r10_bio->mddev->private;
 
 	put_all_bios(conf, r10_bio);
 	mempool_free(r10_bio, conf->r10bio_pool);
 }
 
-static void put_buf(r10bio_t *r10_bio)
+static void put_buf(struct r10bio *r10_bio)
 {
-	conf_t *conf = r10_bio->mddev->private;
+	struct r10conf *conf = r10_bio->mddev->private;
 
 	mempool_free(r10_bio, conf->r10buf_pool);
 
 	lower_barrier(conf);
 }
 
-static void reschedule_retry(r10bio_t *r10_bio)
+static void reschedule_retry(struct r10bio *r10_bio)
 {
 	unsigned long flags;
-	mddev_t *mddev = r10_bio->mddev;
-	conf_t *conf = mddev->private;
+	struct mddev *mddev = r10_bio->mddev;
+	struct r10conf *conf = mddev->private;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r10_bio->retry_list, &conf->retry_list);
@@ -226,11 +232,11 @@
  * operation and are ready to return a success/failure code to the buffer
  * cache layer.
  */
-static void raid_end_bio_io(r10bio_t *r10_bio)
+static void raid_end_bio_io(struct r10bio *r10_bio)
 {
 	struct bio *bio = r10_bio->master_bio;
 	int done;
-	conf_t *conf = r10_bio->mddev->private;
+	struct r10conf *conf = r10_bio->mddev->private;
 
 	if (bio->bi_phys_segments) {
 		unsigned long flags;
@@ -256,9 +262,9 @@
 /*
  * Update disk head position estimator based on IRQ completion info.
  */
-static inline void update_head_pos(int slot, r10bio_t *r10_bio)
+static inline void update_head_pos(int slot, struct r10bio *r10_bio)
 {
-	conf_t *conf = r10_bio->mddev->private;
+	struct r10conf *conf = r10_bio->mddev->private;
 
 	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
 		r10_bio->devs[slot].addr + (r10_bio->sectors);
@@ -267,7 +273,7 @@
 /*
  * Find the disk number which triggered given bio
  */
-static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio,
+static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
 			 struct bio *bio, int *slotp)
 {
 	int slot;
@@ -287,9 +293,9 @@
 static void raid10_end_read_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	r10bio_t *r10_bio = bio->bi_private;
+	struct r10bio *r10_bio = bio->bi_private;
 	int slot, dev;
-	conf_t *conf = r10_bio->mddev->private;
+	struct r10conf *conf = r10_bio->mddev->private;
 
 
 	slot = r10_bio->read_slot;
@@ -327,7 +333,7 @@
 	}
 }
 
-static void close_write(r10bio_t *r10_bio)
+static void close_write(struct r10bio *r10_bio)
 {
 	/* clear the bitmap if all writes complete successfully */
 	bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
@@ -337,7 +343,7 @@
 	md_write_end(r10_bio->mddev);
 }
 
-static void one_write_done(r10bio_t *r10_bio)
+static void one_write_done(struct r10bio *r10_bio)
 {
 	if (atomic_dec_and_test(&r10_bio->remaining)) {
 		if (test_bit(R10BIO_WriteError, &r10_bio->state))
@@ -355,10 +361,10 @@
 static void raid10_end_write_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	r10bio_t *r10_bio = bio->bi_private;
+	struct r10bio *r10_bio = bio->bi_private;
 	int dev;
 	int dec_rdev = 1;
-	conf_t *conf = r10_bio->mddev->private;
+	struct r10conf *conf = r10_bio->mddev->private;
 	int slot;
 
 	dev = find_bio_disk(conf, r10_bio, bio, &slot);
@@ -433,7 +439,7 @@
  * sector offset to a virtual address
  */
 
-static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
+static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
 {
 	int n,f;
 	sector_t sector;
@@ -481,7 +487,7 @@
 	BUG_ON(slot != conf->copies);
 }
 
-static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
+static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 {
 	sector_t offset, chunk, vchunk;
 
@@ -522,7 +528,7 @@
 				 struct bvec_merge_data *bvm,
 				 struct bio_vec *biovec)
 {
-	mddev_t *mddev = q->queuedata;
+	struct mddev *mddev = q->queuedata;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
 	unsigned int chunk_sectors = mddev->chunk_sectors;
@@ -555,14 +561,14 @@
  * FIXME: possibly should rethink readbalancing and do it differently
  * depending on near_copies / far_copies geometry.
  */
-static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
+static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors)
 {
 	const sector_t this_sector = r10_bio->sector;
 	int disk, slot;
 	int sectors = r10_bio->sectors;
 	int best_good_sectors;
 	sector_t new_distance, best_dist;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	int do_balance;
 	int best_slot;
 
@@ -677,15 +683,19 @@
 
 static int raid10_congested(void *data, int bits)
 {
-	mddev_t *mddev = data;
-	conf_t *conf = mddev->private;
+	struct mddev *mddev = data;
+	struct r10conf *conf = mddev->private;
 	int i, ret = 0;
 
+	if ((bits & (1 << BDI_async_congested)) &&
+	    conf->pending_count >= max_queued_requests)
+		return 1;
+
 	if (mddev_congested(mddev, bits))
 		return 1;
 	rcu_read_lock();
 	for (i = 0; i < conf->raid_disks && ret == 0; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && !test_bit(Faulty, &rdev->flags)) {
 			struct request_queue *q = bdev_get_queue(rdev->bdev);
 
@@ -696,7 +706,7 @@
 	return ret;
 }
 
-static void flush_pending_writes(conf_t *conf)
+static void flush_pending_writes(struct r10conf *conf)
 {
 	/* Any writes that have been queued but are awaiting
 	 * bitmap updates get flushed here.
@@ -706,10 +716,12 @@
 	if (conf->pending_bio_list.head) {
 		struct bio *bio;
 		bio = bio_list_get(&conf->pending_bio_list);
+		conf->pending_count = 0;
 		spin_unlock_irq(&conf->device_lock);
 		/* flush any pending bitmap writes to disk
 		 * before proceeding w/ I/O */
 		bitmap_unplug(conf->mddev->bitmap);
+		wake_up(&conf->wait_barrier);
 
 		while (bio) { /* submit pending writes */
 			struct bio *next = bio->bi_next;
@@ -743,7 +755,7 @@
  *    lower_barrier when the particular background IO completes.
  */
 
-static void raise_barrier(conf_t *conf, int force)
+static void raise_barrier(struct r10conf *conf, int force)
 {
 	BUG_ON(force && !conf->barrier);
 	spin_lock_irq(&conf->resync_lock);
@@ -763,7 +775,7 @@
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static void lower_barrier(conf_t *conf)
+static void lower_barrier(struct r10conf *conf)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&conf->resync_lock, flags);
@@ -772,7 +784,7 @@
 	wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(conf_t *conf)
+static void wait_barrier(struct r10conf *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
 	if (conf->barrier) {
@@ -786,7 +798,7 @@
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static void allow_barrier(conf_t *conf)
+static void allow_barrier(struct r10conf *conf)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&conf->resync_lock, flags);
@@ -795,7 +807,7 @@
 	wake_up(&conf->wait_barrier);
 }
 
-static void freeze_array(conf_t *conf)
+static void freeze_array(struct r10conf *conf)
 {
 	/* stop syncio and normal IO and wait for everything to
 	 * go quiet.
@@ -820,7 +832,7 @@
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static void unfreeze_array(conf_t *conf)
+static void unfreeze_array(struct r10conf *conf)
 {
 	/* reverse the effect of the freeze */
 	spin_lock_irq(&conf->resync_lock);
@@ -830,11 +842,11 @@
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static int make_request(mddev_t *mddev, struct bio * bio)
+static int make_request(struct mddev *mddev, struct bio * bio)
 {
-	conf_t *conf = mddev->private;
-	mirror_info_t *mirror;
-	r10bio_t *r10_bio;
+	struct r10conf *conf = mddev->private;
+	struct mirror_info *mirror;
+	struct r10bio *r10_bio;
 	struct bio *read_bio;
 	int i;
 	int chunk_sects = conf->chunk_mask + 1;
@@ -842,7 +854,7 @@
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	unsigned long flags;
-	mdk_rdev_t *blocked_rdev;
+	struct md_rdev *blocked_rdev;
 	int plugged;
 	int sectors_handled;
 	int max_sectors;
@@ -996,6 +1008,11 @@
 	/*
 	 * WRITE:
 	 */
+	if (conf->pending_count >= max_queued_requests) {
+		md_wakeup_thread(mddev->thread);
+		wait_event(conf->wait_barrier,
+			   conf->pending_count < max_queued_requests);
+	}
 	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
@@ -1017,7 +1034,7 @@
 
 	for (i = 0;  i < conf->copies; i++) {
 		int d = r10_bio->devs[i].devnum;
-		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
+		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 			atomic_inc(&rdev->nr_pending);
 			blocked_rdev = rdev;
@@ -1129,6 +1146,7 @@
 		atomic_inc(&r10_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
+		conf->pending_count++;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
@@ -1161,9 +1179,9 @@
 	return 0;
 }
 
-static void status(struct seq_file *seq, mddev_t *mddev)
+static void status(struct seq_file *seq, struct mddev *mddev)
 {
-	conf_t *conf = mddev->private;
+	struct r10conf *conf = mddev->private;
 	int i;
 
 	if (conf->near_copies < conf->raid_disks)
@@ -1190,7 +1208,7 @@
  * Don't consider the device numbered 'ignore'
  * as we might be about to remove it.
  */
-static int enough(conf_t *conf, int ignore)
+static int enough(struct r10conf *conf, int ignore)
 {
 	int first = 0;
 
@@ -1209,10 +1227,10 @@
 	return 1;
 }
 
-static void error(mddev_t *mddev, mdk_rdev_t *rdev)
+static void error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
-	conf_t *conf = mddev->private;
+	struct r10conf *conf = mddev->private;
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
@@ -1246,10 +1264,10 @@
 	       mdname(mddev), conf->raid_disks - mddev->degraded);
 }
 
-static void print_conf(conf_t *conf)
+static void print_conf(struct r10conf *conf)
 {
 	int i;
-	mirror_info_t *tmp;
+	struct mirror_info *tmp;
 
 	printk(KERN_DEBUG "RAID10 conf printout:\n");
 	if (!conf) {
@@ -1270,7 +1288,7 @@
 	}
 }
 
-static void close_sync(conf_t *conf)
+static void close_sync(struct r10conf *conf)
 {
 	wait_barrier(conf);
 	allow_barrier(conf);
@@ -1279,11 +1297,11 @@
 	conf->r10buf_pool = NULL;
 }
 
-static int raid10_spare_active(mddev_t *mddev)
+static int raid10_spare_active(struct mddev *mddev)
 {
 	int i;
-	conf_t *conf = mddev->private;
-	mirror_info_t *tmp;
+	struct r10conf *conf = mddev->private;
+	struct mirror_info *tmp;
 	int count = 0;
 	unsigned long flags;
 
@@ -1309,9 +1327,9 @@
 }
 
 
-static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
-	conf_t *conf = mddev->private;
+	struct r10conf *conf = mddev->private;
 	int err = -EEXIST;
 	int mirror;
 	int first = 0;
@@ -1334,7 +1352,7 @@
 	else
 		mirror = first;
 	for ( ; mirror <= last ; mirror++) {
-		mirror_info_t *p = &conf->mirrors[mirror];
+		struct mirror_info *p = &conf->mirrors[mirror];
 		if (p->recovery_disabled == mddev->recovery_disabled)
 			continue;
 		if (!p->rdev)
@@ -1355,6 +1373,7 @@
 		}
 
 		p->head_position = 0;
+		p->recovery_disabled = mddev->recovery_disabled - 1;
 		rdev->raid_disk = mirror;
 		err = 0;
 		if (rdev->saved_raid_disk != mirror)
@@ -1368,12 +1387,12 @@
 	return err;
 }
 
-static int raid10_remove_disk(mddev_t *mddev, int number)
+static int raid10_remove_disk(struct mddev *mddev, int number)
 {
-	conf_t *conf = mddev->private;
+	struct r10conf *conf = mddev->private;
 	int err = 0;
-	mdk_rdev_t *rdev;
-	mirror_info_t *p = conf->mirrors+ number;
+	struct md_rdev *rdev;
+	struct mirror_info *p = conf->mirrors+ number;
 
 	print_conf(conf);
 	rdev = p->rdev;
@@ -1411,8 +1430,8 @@
 
 static void end_sync_read(struct bio *bio, int error)
 {
-	r10bio_t *r10_bio = bio->bi_private;
-	conf_t *conf = r10_bio->mddev->private;
+	struct r10bio *r10_bio = bio->bi_private;
+	struct r10conf *conf = r10_bio->mddev->private;
 	int d;
 
 	d = find_bio_disk(conf, r10_bio, bio, NULL);
@@ -1439,9 +1458,9 @@
 	}
 }
 
-static void end_sync_request(r10bio_t *r10_bio)
+static void end_sync_request(struct r10bio *r10_bio)
 {
-	mddev_t *mddev = r10_bio->mddev;
+	struct mddev *mddev = r10_bio->mddev;
 
 	while (atomic_dec_and_test(&r10_bio->remaining)) {
 		if (r10_bio->master_bio == NULL) {
@@ -1455,7 +1474,7 @@
 			md_done_sync(mddev, s, 1);
 			break;
 		} else {
-			r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
+			struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
 			if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
 			    test_bit(R10BIO_WriteError, &r10_bio->state))
 				reschedule_retry(r10_bio);
@@ -1469,9 +1488,9 @@
 static void end_sync_write(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	r10bio_t *r10_bio = bio->bi_private;
-	mddev_t *mddev = r10_bio->mddev;
-	conf_t *conf = mddev->private;
+	struct r10bio *r10_bio = bio->bi_private;
+	struct mddev *mddev = r10_bio->mddev;
+	struct r10conf *conf = mddev->private;
 	int d;
 	sector_t first_bad;
 	int bad_sectors;
@@ -1509,9 +1528,9 @@
  * We check if all blocks are in-sync and only write to blocks that
  * aren't in sync
  */
-static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
+static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 {
-	conf_t *conf = mddev->private;
+	struct r10conf *conf = mddev->private;
 	int i, first;
 	struct bio *tbio, *fbio;
 
@@ -1609,7 +1628,7 @@
  * The second for writing.
  *
  */
-static void fix_recovery_read_error(r10bio_t *r10_bio)
+static void fix_recovery_read_error(struct r10bio *r10_bio)
 {
 	/* We got a read error during recovery.
 	 * We repeat the read in smaller page-sized sections.
@@ -1618,8 +1637,8 @@
 	 * If a read fails, record a bad block on both old and
 	 * new devices.
 	 */
-	mddev_t *mddev = r10_bio->mddev;
-	conf_t *conf = mddev->private;
+	struct mddev *mddev = r10_bio->mddev;
+	struct r10conf *conf = mddev->private;
 	struct bio *bio = r10_bio->devs[0].bio;
 	sector_t sect = 0;
 	int sectors = r10_bio->sectors;
@@ -1629,7 +1648,7 @@
 
 	while (sectors) {
 		int s = sectors;
-		mdk_rdev_t *rdev;
+		struct md_rdev *rdev;
 		sector_t addr;
 		int ok;
 
@@ -1663,7 +1682,7 @@
 
 			if (rdev != conf->mirrors[dw].rdev) {
 				/* need bad block on destination too */
-				mdk_rdev_t *rdev2 = conf->mirrors[dw].rdev;
+				struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
 				addr = r10_bio->devs[1].addr + sect;
 				ok = rdev_set_badblocks(rdev2, addr, s, 0);
 				if (!ok) {
@@ -1688,9 +1707,9 @@
 	}
 }
 
-static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
+static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 {
-	conf_t *conf = mddev->private;
+	struct r10conf *conf = mddev->private;
 	int d;
 	struct bio *wbio;
 
@@ -1719,7 +1738,7 @@
  * since the last recorded read error.
  *
  */
-static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
+static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct timespec cur_time_mon;
 	unsigned long hours_since_last;
@@ -1750,7 +1769,7 @@
 		atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
 }
 
-static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
+static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
 			    int sectors, struct page *page, int rw)
 {
 	sector_t first_bad;
@@ -1778,11 +1797,11 @@
  *	3.	Performs writes following reads for array synchronising.
  */
 
-static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
+static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
 {
 	int sect = 0; /* Offset from r10_bio->sector */
 	int sectors = r10_bio->sectors;
-	mdk_rdev_t*rdev;
+	struct md_rdev*rdev;
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
 	int d = r10_bio->devs[r10_bio->read_slot].devnum;
 
@@ -1983,12 +2002,12 @@
 	return test_bit(BIO_UPTODATE, &bio->bi_flags);
 }
 
-static int narrow_write_error(r10bio_t *r10_bio, int i)
+static int narrow_write_error(struct r10bio *r10_bio, int i)
 {
 	struct bio *bio = r10_bio->master_bio;
-	mddev_t *mddev = r10_bio->mddev;
-	conf_t *conf = mddev->private;
-	mdk_rdev_t *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
+	struct mddev *mddev = r10_bio->mddev;
+	struct r10conf *conf = mddev->private;
+	struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
 	/* bio has the data to be written to slot 'i' where
 	 * we just recently had a write error.
 	 * We repeatedly clone the bio and trim down to one block,
@@ -2040,13 +2059,13 @@
 	return ok;
 }
 
-static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
+static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 {
 	int slot = r10_bio->read_slot;
 	int mirror = r10_bio->devs[slot].devnum;
 	struct bio *bio;
-	conf_t *conf = mddev->private;
-	mdk_rdev_t *rdev;
+	struct r10conf *conf = mddev->private;
+	struct md_rdev *rdev;
 	char b[BDEVNAME_SIZE];
 	unsigned long do_sync;
 	int max_sectors;
@@ -2139,7 +2158,7 @@
 		generic_make_request(bio);
 }
 
-static void handle_write_completed(conf_t *conf, r10bio_t *r10_bio)
+static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 {
 	/* Some sort of write request has finished and it
 	 * succeeded in writing where we thought there was a
@@ -2148,7 +2167,7 @@
 	 * a bad block.
 	 */
 	int m;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
 	    test_bit(R10BIO_IsRecover, &r10_bio->state)) {
@@ -2200,11 +2219,11 @@
 	}
 }
 
-static void raid10d(mddev_t *mddev)
+static void raid10d(struct mddev *mddev)
 {
-	r10bio_t *r10_bio;
+	struct r10bio *r10_bio;
 	unsigned long flags;
-	conf_t *conf = mddev->private;
+	struct r10conf *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 	struct blk_plug plug;
 
@@ -2220,7 +2239,7 @@
 			spin_unlock_irqrestore(&conf->device_lock, flags);
 			break;
 		}
-		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
+		r10_bio = list_entry(head->prev, struct r10bio, retry_list);
 		list_del(head->prev);
 		conf->nr_queued--;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -2252,7 +2271,7 @@
 }
 
 
-static int init_resync(conf_t *conf)
+static int init_resync(struct r10conf *conf)
 {
 	int buffs;
 
@@ -2297,11 +2316,11 @@
  *
  */
 
-static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
+static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			     int *skipped, int go_faster)
 {
-	conf_t *conf = mddev->private;
-	r10bio_t *r10_bio;
+	struct r10conf *conf = mddev->private;
+	struct r10bio *r10_bio;
 	struct bio *biolist = NULL, *bio;
 	sector_t max_sector, nr_sectors;
 	int i;
@@ -2393,7 +2412,7 @@
 
 		for (i=0 ; i<conf->raid_disks; i++) {
 			int still_degraded;
-			r10bio_t *rb2;
+			struct r10bio *rb2;
 			sector_t sect;
 			int must_sync;
 			int any_working;
@@ -2453,7 +2472,7 @@
 				int k;
 				int d = r10_bio->devs[j].devnum;
 				sector_t from_addr, to_addr;
-				mdk_rdev_t *rdev;
+				struct md_rdev *rdev;
 				sector_t sector, first_bad;
 				int bad_sectors;
 				if (!conf->mirrors[d].rdev ||
@@ -2547,8 +2566,8 @@
 		}
 		if (biolist == NULL) {
 			while (r10_bio) {
-				r10bio_t *rb2 = r10_bio;
-				r10_bio = (r10bio_t*) rb2->master_bio;
+				struct r10bio *rb2 = r10_bio;
+				r10_bio = (struct r10bio*) rb2->master_bio;
 				rb2->master_bio = NULL;
 				put_buf(rb2);
 			}
@@ -2714,10 +2733,10 @@
 }
 
 static sector_t
-raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 {
 	sector_t size;
-	conf_t *conf = mddev->private;
+	struct r10conf *conf = mddev->private;
 
 	if (!raid_disks)
 		raid_disks = conf->raid_disks;
@@ -2733,9 +2752,9 @@
 }
 
 
-static conf_t *setup_conf(mddev_t *mddev)
+static struct r10conf *setup_conf(struct mddev *mddev)
 {
-	conf_t *conf = NULL;
+	struct r10conf *conf = NULL;
 	int nc, fc, fo;
 	sector_t stride, size;
 	int err = -EINVAL;
@@ -2760,7 +2779,7 @@
 	}
 
 	err = -ENOMEM;
-	conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
+	conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
 	if (!conf)
 		goto out;
 
@@ -2836,12 +2855,12 @@
 	return ERR_PTR(err);
 }
 
-static int run(mddev_t *mddev)
+static int run(struct mddev *mddev)
 {
-	conf_t *conf;
+	struct r10conf *conf;
 	int i, disk_idx, chunk_size;
-	mirror_info_t *disk;
-	mdk_rdev_t *rdev;
+	struct mirror_info *disk;
+	struct md_rdev *rdev;
 	sector_t size;
 
 	/*
@@ -2913,6 +2932,7 @@
 			if (disk->rdev)
 				conf->fullsync = 1;
 		}
+		disk->recovery_disabled = mddev->recovery_disabled - 1;
 	}
 
 	if (mddev->recovery_cp != MaxSector)
@@ -2966,9 +2986,9 @@
 	return -EIO;
 }
 
-static int stop(mddev_t *mddev)
+static int stop(struct mddev *mddev)
 {
-	conf_t *conf = mddev->private;
+	struct r10conf *conf = mddev->private;
 
 	raise_barrier(conf, 0);
 	lower_barrier(conf);
@@ -2983,9 +3003,9 @@
 	return 0;
 }
 
-static void raid10_quiesce(mddev_t *mddev, int state)
+static void raid10_quiesce(struct mddev *mddev, int state)
 {
-	conf_t *conf = mddev->private;
+	struct r10conf *conf = mddev->private;
 
 	switch(state) {
 	case 1:
@@ -2997,10 +3017,10 @@
 	}
 }
 
-static void *raid10_takeover_raid0(mddev_t *mddev)
+static void *raid10_takeover_raid0(struct mddev *mddev)
 {
-	mdk_rdev_t *rdev;
-	conf_t *conf;
+	struct md_rdev *rdev;
+	struct r10conf *conf;
 
 	if (mddev->degraded > 0) {
 		printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
@@ -3029,17 +3049,17 @@
 	return conf;
 }
 
-static void *raid10_takeover(mddev_t *mddev)
+static void *raid10_takeover(struct mddev *mddev)
 {
-	struct raid0_private_data *raid0_priv;
+	struct r0conf *raid0_conf;
 
 	/* raid10 can take over:
 	 *  raid0 - providing it has only two drives
 	 */
 	if (mddev->level == 0) {
 		/* for raid0 takeover only one zone is supported */
-		raid0_priv = mddev->private;
-		if (raid0_priv->nr_strip_zones > 1) {
+		raid0_conf = mddev->private;
+		if (raid0_conf->nr_strip_zones > 1) {
 			printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
 			       " with more than one zone.\n",
 			       mdname(mddev));
@@ -3050,7 +3070,7 @@
 	return ERR_PTR(-EINVAL);
 }
 
-static struct mdk_personality raid10_personality =
+static struct md_personality raid10_personality =
 {
 	.name		= "raid10",
 	.level		= 10,
@@ -3086,3 +3106,5 @@
 MODULE_ALIAS("md-personality-9"); /* RAID10 */
 MODULE_ALIAS("md-raid10");
 MODULE_ALIAS("md-level-10");
+
+module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
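Most of the remaining churn in raid10.h and raid5.c below is mechanical: the private typedefs (conf_t, r10bio_t, mddev_t, mdk_rdev_t and friends) are dropped in favour of plain struct tags, which lets other files forward-declare the types instead of including the header that defined the typedef. A toy before/after of that pattern, using an invented example_conf rather than the md types:

#include <stdio.h>

/* Before: an opaque typedef hides the struct tag.
 *   typedef struct example_private_data_s conf_t;
 * After: use the struct tag directly; a prototype elsewhere only needs
 * the forward declaration 'struct example_conf;'.
 */
struct example_conf {
	int raid_disks;
};

static int count_disks(struct example_conf *conf)
{
	return conf->raid_disks;
}

int main(void)
{
	struct example_conf conf = { .raid_disks = 2 };
	printf("raid_disks=%d\n", count_disks(&conf));
	return 0;
}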
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 79cb52a..7facfdf 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -1,10 +1,8 @@
 #ifndef _RAID10_H
 #define _RAID10_H
 
-typedef struct mirror_info mirror_info_t;
-
 struct mirror_info {
-	mdk_rdev_t	*rdev;
+	struct md_rdev	*rdev;
 	sector_t	head_position;
 	int		recovery_disabled;	/* matches
 						 * mddev->recovery_disabled
@@ -13,11 +11,9 @@
 						 */
 };
 
-typedef struct r10bio_s r10bio_t;
-
-struct r10_private_data_s {
-	mddev_t			*mddev;
-	mirror_info_t		*mirrors;
+struct r10conf {
+	struct mddev		*mddev;
+	struct mirror_info	*mirrors;
 	int			raid_disks;
 	spinlock_t		device_lock;
 
@@ -46,7 +42,7 @@
 	struct list_head	retry_list;
 	/* queue pending writes and submit them on unplug */
 	struct bio_list		pending_bio_list;
-
+	int			pending_count;
 
 	spinlock_t		resync_lock;
 	int nr_pending;
@@ -68,11 +64,9 @@
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.
 	 */
-	struct mdk_thread_s	*thread;
+	struct md_thread	*thread;
 };
 
-typedef struct r10_private_data_s conf_t;
-
 /*
  * this is our 'private' RAID10 bio.
  *
@@ -80,14 +74,14 @@
  * for this RAID10 operation, and about their status:
  */
 
-struct r10bio_s {
+struct r10bio {
 	atomic_t		remaining; /* 'have we finished' count,
 					    * used from IRQ handlers
 					    */
 	sector_t		sector;	/* virtual sector number */
 	int			sectors;
 	unsigned long		state;
-	mddev_t			*mddev;
+	struct mddev		*mddev;
 	/*
 	 * original bio going to /dev/mdx
 	 */
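The raid5.c diff that follows also replaces the stripe_hash() and r5_next_bio() macros with static inline functions. A standalone sketch of why that conversion is generally preferred (argument type checking, arguments evaluated exactly once, a real symbol for debuggers), using a toy hash table with invented names rather than the stripe cache:

#include <stdio.h>

#define HASH_BUCKETS 16
#define HASH_MASK (HASH_BUCKETS - 1)

struct toy_conf {
	int table[HASH_BUCKETS];
};

/* Macro version: no type checking of 'conf' or 'sect', and compile
 * errors are reported at the expansion site. */
#define toy_hash_macro(conf, sect) \
	(&((conf)->table[(unsigned long)(sect) & HASH_MASK]))

/* Inline-function version: arguments are type-checked and evaluated
 * once, and the compiler can still inline the call. */
static inline int *toy_hash(struct toy_conf *conf, unsigned long sect)
{
	return &conf->table[sect & HASH_MASK];
}

int main(void)
{
	struct toy_conf conf = { { 0 } };

	*toy_hash(&conf, 42UL) = 1;
	printf("bucket value: %d\n", *toy_hash_macro(&conf, 42UL));
	return 0;
}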
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ac5e8b5..f6fe053 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -70,7 +70,11 @@
 #define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
 #define HASH_MASK		(NR_HASH - 1)
 
-#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
+static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
+{
+	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
+	return &conf->stripe_hashtbl[hash];
+}
 
 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
  * order without overlap.  There may be several bio's per stripe+device, and
@@ -78,24 +82,17 @@
  * When walking this list for a particular stripe+device, we must never proceed
  * beyond a bio that extends past this device, as the next bio might no longer
  * be valid.
- * This macro is used to determine the 'next' bio in the list, given the sector
+ * This function is used to determine the 'next' bio in the list, given the sector
  * of the current stripe+device
  */
-#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
-/*
- * The following can be used to debug the driver
- */
-#define RAID5_PARANOIA	1
-#if RAID5_PARANOIA && defined(CONFIG_SMP)
-# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
-#else
-# define CHECK_DEVLOCK()
-#endif
-
-#ifdef DEBUG
-#define inline
-#define __inline__
-#endif
+static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
+{
+	int sectors = bio->bi_size >> 9;
+	if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
+		return bio->bi_next;
+	else
+		return NULL;
+}
 
 /*
  * We maintain a biased count of active stripes in the bottom 16 bits of
@@ -183,7 +180,7 @@
 	}
 }
 
-static void print_raid5_conf (raid5_conf_t *conf);
+static void print_raid5_conf (struct r5conf *conf);
 
 static int stripe_operations_active(struct stripe_head *sh)
 {
@@ -192,7 +189,7 @@
 	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
-static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
+static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
 {
 	if (atomic_dec_and_test(&sh->count)) {
 		BUG_ON(!list_empty(&sh->lru));
@@ -228,7 +225,7 @@
 
 static void release_stripe(struct stripe_head *sh)
 {
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5conf *conf = sh->raid_conf;
 	unsigned long flags;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
@@ -244,25 +241,23 @@
 	hlist_del_init(&sh->hash);
 }
 
-static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
+static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
 {
 	struct hlist_head *hp = stripe_hash(conf, sh->sector);
 
 	pr_debug("insert_hash(), stripe %llu\n",
 		(unsigned long long)sh->sector);
 
-	CHECK_DEVLOCK();
 	hlist_add_head(&sh->hash, hp);
 }
 
 
 /* find an idle stripe, make sure it is unhashed, and return it. */
-static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
+static struct stripe_head *get_free_stripe(struct r5conf *conf)
 {
 	struct stripe_head *sh = NULL;
 	struct list_head *first;
 
-	CHECK_DEVLOCK();
 	if (list_empty(&conf->inactive_list))
 		goto out;
 	first = conf->inactive_list.next;
@@ -306,19 +301,18 @@
 }
 
 static void raid5_build_block(struct stripe_head *sh, int i, int previous);
-static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
+static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 			    struct stripe_head *sh);
 
 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 {
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5conf *conf = sh->raid_conf;
 	int i;
 
 	BUG_ON(atomic_read(&sh->count) != 0);
 	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
 	BUG_ON(stripe_operations_active(sh));
 
-	CHECK_DEVLOCK();
 	pr_debug("init_stripe called, stripe %llu\n",
 		(unsigned long long)sh->sector);
 
@@ -348,13 +342,12 @@
 	insert_hash(conf, sh);
 }
 
-static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
+static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
 					 short generation)
 {
 	struct stripe_head *sh;
 	struct hlist_node *hn;
 
-	CHECK_DEVLOCK();
 	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
 	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
 		if (sh->sector == sector && sh->generation == generation)
@@ -376,7 +369,7 @@
  * of the two sections, and some non-in_sync devices may
  * be insync in the section most affected by failed devices.
  */
-static int has_failed(raid5_conf_t *conf)
+static int has_failed(struct r5conf *conf)
 {
 	int degraded;
 	int i;
@@ -386,7 +379,7 @@
 	rcu_read_lock();
 	degraded = 0;
 	for (i = 0; i < conf->previous_raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
 		if (!rdev || test_bit(Faulty, &rdev->flags))
 			degraded++;
 		else if (test_bit(In_sync, &rdev->flags))
@@ -410,7 +403,7 @@
 	rcu_read_lock();
 	degraded = 0;
 	for (i = 0; i < conf->raid_disks; i++) {
-		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
 		if (!rdev || test_bit(Faulty, &rdev->flags))
 			degraded++;
 		else if (test_bit(In_sync, &rdev->flags))
@@ -431,7 +424,7 @@
 }
 
 static struct stripe_head *
-get_active_stripe(raid5_conf_t *conf, sector_t sector,
+get_active_stripe(struct r5conf *conf, sector_t sector,
 		  int previous, int noblock, int noquiesce)
 {
 	struct stripe_head *sh;
@@ -491,7 +484,7 @@
 
 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 {
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5conf *conf = sh->raid_conf;
 	int i, disks = sh->disks;
 
 	might_sleep();
@@ -499,7 +492,7 @@
 	for (i = disks; i--; ) {
 		int rw;
 		struct bio *bi;
-		mdk_rdev_t *rdev;
+		struct md_rdev *rdev;
 		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
 			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
 				rw = WRITE_FUA;
@@ -650,7 +643,7 @@
 {
 	struct stripe_head *sh = stripe_head_ref;
 	struct bio *return_bi = NULL;
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5conf *conf = sh->raid_conf;
 	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
@@ -695,7 +688,7 @@
 static void ops_run_biofill(struct stripe_head *sh)
 {
 	struct dma_async_tx_descriptor *tx = NULL;
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5conf *conf = sh->raid_conf;
 	struct async_submit_ctl submit;
 	int i;
 
@@ -1246,7 +1239,7 @@
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5conf *conf = sh->raid_conf;
 	int level = conf->level;
 	struct raid5_percpu *percpu;
 	unsigned long cpu;
@@ -1337,7 +1330,7 @@
 #define raid_run_ops __raid_run_ops
 #endif
 
-static int grow_one_stripe(raid5_conf_t *conf)
+static int grow_one_stripe(struct r5conf *conf)
 {
 	struct stripe_head *sh;
 	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1362,7 +1355,7 @@
 	return 1;
 }
 
-static int grow_stripes(raid5_conf_t *conf, int num)
+static int grow_stripes(struct r5conf *conf, int num)
 {
 	struct kmem_cache *sc;
 	int devs = max(conf->raid_disks, conf->previous_raid_disks);
@@ -1411,7 +1404,7 @@
 	return len;
 }
 
-static int resize_stripes(raid5_conf_t *conf, int newsize)
+static int resize_stripes(struct r5conf *conf, int newsize)
 {
 	/* Make all the stripes able to hold 'newsize' devices.
 	 * New slots in each stripe get 'page' set to a new page.
@@ -1556,7 +1549,7 @@
 	return err;
 }
 
-static int drop_one_stripe(raid5_conf_t *conf)
+static int drop_one_stripe(struct r5conf *conf)
 {
 	struct stripe_head *sh;
 
@@ -1572,7 +1565,7 @@
 	return 1;
 }
 
-static void shrink_stripes(raid5_conf_t *conf)
+static void shrink_stripes(struct r5conf *conf)
 {
 	while (drop_one_stripe(conf))
 		;
@@ -1585,11 +1578,11 @@
 static void raid5_end_read_request(struct bio * bi, int error)
 {
 	struct stripe_head *sh = bi->bi_private;
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5conf *conf = sh->raid_conf;
 	int disks = sh->disks, i;
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 	char b[BDEVNAME_SIZE];
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 
 	for (i=0 ; i<disks; i++)
@@ -1672,7 +1665,7 @@
 static void raid5_end_write_request(struct bio *bi, int error)
 {
 	struct stripe_head *sh = bi->bi_private;
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5conf *conf = sh->raid_conf;
 	int disks = sh->disks, i;
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 	sector_t first_bad;
@@ -1726,10 +1719,10 @@
 	dev->sector = compute_blocknr(sh, i, previous);
 }
 
-static void error(mddev_t *mddev, mdk_rdev_t *rdev)
+static void error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	pr_debug("raid456: error called\n");
 
 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
@@ -1758,7 +1751,7 @@
  * Input: a 'big' sector number,
  * Output: index of the data and parity disk, and the sector # in them.
  */
-static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
+static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
 				     int previous, int *dd_idx,
 				     struct stripe_head *sh)
 {
@@ -1963,7 +1956,7 @@
 
 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 {
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5conf *conf = sh->raid_conf;
 	int raid_disks = sh->disks;
 	int data_disks = raid_disks - conf->max_degraded;
 	sector_t new_sector = sh->sector, check;
@@ -2088,7 +2081,7 @@
 			 int rcw, int expand)
 {
 	int i, pd_idx = sh->pd_idx, disks = sh->disks;
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5conf *conf = sh->raid_conf;
 	int level = conf->level;
 
 	if (rcw) {
@@ -2173,7 +2166,7 @@
 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
 {
 	struct bio **bip;
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5conf *conf = sh->raid_conf;
 	int firstwrite=0;
 
 	pr_debug("adding bi b#%llu to stripe s#%llu\n",
@@ -2235,9 +2228,9 @@
 	return 0;
 }
 
-static void end_reshape(raid5_conf_t *conf);
+static void end_reshape(struct r5conf *conf);
 
-static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
+static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 			    struct stripe_head *sh)
 {
 	int sectors_per_chunk =
@@ -2254,7 +2247,7 @@
 }
 
 static void
-handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
+handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				struct stripe_head_state *s, int disks,
 				struct bio **return_bi)
 {
@@ -2264,7 +2257,7 @@
 		int bitmap_end = 0;
 
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
-			mdk_rdev_t *rdev;
+			struct md_rdev *rdev;
 			rcu_read_lock();
 			rdev = rcu_dereference(conf->disks[i].rdev);
 			if (rdev && test_bit(In_sync, &rdev->flags))
@@ -2359,7 +2352,7 @@
 }
 
 static void
-handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh,
+handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
 		   struct stripe_head_state *s)
 {
 	int abort = 0;
@@ -2378,7 +2371,7 @@
 	 * refcounting of rdevs is not needed
 	 */
 	for (i = 0; i < conf->raid_disks; i++) {
-		mdk_rdev_t *rdev = conf->disks[i].rdev;
+		struct md_rdev *rdev = conf->disks[i].rdev;
 		if (!rdev
 		    || test_bit(Faulty, &rdev->flags)
 		    || test_bit(In_sync, &rdev->flags))
@@ -2508,7 +2501,7 @@
  * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
  * never LOCKED, so we don't need to test 'failed' directly.
  */
-static void handle_stripe_clean_event(raid5_conf_t *conf,
+static void handle_stripe_clean_event(struct r5conf *conf,
 	struct stripe_head *sh, int disks, struct bio **return_bi)
 {
 	int i;
@@ -2553,7 +2546,7 @@
 			md_wakeup_thread(conf->mddev->thread);
 }
 
-static void handle_stripe_dirtying(raid5_conf_t *conf,
+static void handle_stripe_dirtying(struct r5conf *conf,
 				   struct stripe_head *sh,
 				   struct stripe_head_state *s,
 				   int disks)
@@ -2655,7 +2648,7 @@
 		schedule_reconstruction(sh, s, rcw == 0, 0);
 }
 
-static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
+static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
 				struct stripe_head_state *s, int disks)
 {
 	struct r5dev *dev = NULL;
@@ -2743,7 +2736,7 @@
 }
 
 
-static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
+static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
 				  struct stripe_head_state *s,
 				  int disks)
 {
@@ -2906,7 +2899,7 @@
 	}
 }
 
-static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh)
+static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
 {
 	int i;
 
@@ -2985,7 +2978,7 @@
 
 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 {
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5conf *conf = sh->raid_conf;
 	int disks = sh->disks;
 	struct r5dev *dev;
 	int i;
@@ -3002,7 +2995,7 @@
 	rcu_read_lock();
 	spin_lock_irq(&conf->device_lock);
 	for (i=disks; i--; ) {
-		mdk_rdev_t *rdev;
+		struct md_rdev *rdev;
 		sector_t first_bad;
 		int bad_sectors;
 		int is_bad = 0;
@@ -3069,7 +3062,7 @@
 			}
 		} else if (test_bit(In_sync, &rdev->flags))
 			set_bit(R5_Insync, &dev->flags);
-		else {
+		else if (!test_bit(Faulty, &rdev->flags)) {
 			/* in sync if before recovery_offset */
 			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
 				set_bit(R5_Insync, &dev->flags);
@@ -3109,7 +3102,7 @@
 static void handle_stripe(struct stripe_head *sh)
 {
 	struct stripe_head_state s;
-	raid5_conf_t *conf = sh->raid_conf;
+	struct r5conf *conf = sh->raid_conf;
 	int i;
 	int prexor;
 	int disks = sh->disks;
@@ -3341,7 +3334,7 @@
 
 	if (s.handle_bad_blocks)
 		for (i = disks; i--; ) {
-			mdk_rdev_t *rdev;
+			struct md_rdev *rdev;
 			struct r5dev *dev = &sh->dev[i];
 			if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
 				/* We own a safe reference to the rdev */
@@ -3380,7 +3373,7 @@
 	clear_bit(STRIPE_ACTIVE, &sh->state);
 }
 
-static void raid5_activate_delayed(raid5_conf_t *conf)
+static void raid5_activate_delayed(struct r5conf *conf)
 {
 	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
 		while (!list_empty(&conf->delayed_list)) {
@@ -3396,7 +3389,7 @@
 	}
 }
 
-static void activate_bit_delay(raid5_conf_t *conf)
+static void activate_bit_delay(struct r5conf *conf)
 {
 	/* device_lock is held */
 	struct list_head head;
@@ -3410,9 +3403,9 @@
 	}
 }
 
-int md_raid5_congested(mddev_t *mddev, int bits)
+int md_raid5_congested(struct mddev *mddev, int bits)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 
 	/* No difference between reads and writes.  Just check
 	 * how busy the stripe_cache is
@@ -3431,7 +3424,7 @@
 
 static int raid5_congested(void *data, int bits)
 {
-	mddev_t *mddev = data;
+	struct mddev *mddev = data;
 
 	return mddev_congested(mddev, bits) ||
 		md_raid5_congested(mddev, bits);
@@ -3444,7 +3437,7 @@
 				struct bvec_merge_data *bvm,
 				struct bio_vec *biovec)
 {
-	mddev_t *mddev = q->queuedata;
+	struct mddev *mddev = q->queuedata;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
 	unsigned int chunk_sectors = mddev->chunk_sectors;
@@ -3464,7 +3457,7 @@
 }
 
 
-static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
+static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
 {
 	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
 	unsigned int chunk_sectors = mddev->chunk_sectors;
@@ -3480,7 +3473,7 @@
  *  add bio to the retry LIFO  ( in O(1) ... we are in interrupt )
  *  later sampled by raid5d.
  */
-static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf)
+static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
 {
 	unsigned long flags;
 
@@ -3494,7 +3487,7 @@
 }
 
 
-static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
+static struct bio *remove_bio_from_retry(struct r5conf *conf)
 {
 	struct bio *bi;
 
@@ -3527,10 +3520,10 @@
 static void raid5_align_endio(struct bio *bi, int error)
 {
 	struct bio* raid_bi  = bi->bi_private;
-	mddev_t *mddev;
-	raid5_conf_t *conf;
+	struct mddev *mddev;
+	struct r5conf *conf;
 	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	bio_put(bi);
 
@@ -3574,12 +3567,12 @@
 }
 
 
-static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
+static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	int dd_idx;
 	struct bio* align_bi;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 
 	if (!in_chunk_boundary(mddev, raid_bio)) {
 		pr_debug("chunk_aligned_read : non aligned\n");
@@ -3652,7 +3645,7 @@
  * head of the hold_list has changed, i.e. the head was promoted to the
  * handle_list.
  */
-static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
+static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
 {
 	struct stripe_head *sh;
 
@@ -3695,9 +3688,9 @@
 	return sh;
 }
 
-static int make_request(mddev_t *mddev, struct bio * bi)
+static int make_request(struct mddev *mddev, struct bio * bi)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	int dd_idx;
 	sector_t new_sector;
 	sector_t logical_sector, last_sector;
@@ -3855,9 +3848,9 @@
 	return 0;
 }
 
-static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
+static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
 
-static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
+static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
 {
 	/* reshaping is quite different to recovery/resync so it is
 	 * handled quite separately ... here.
@@ -3868,7 +3861,7 @@
 	 * As the reads complete, handle_stripe will copy the data
 	 * into the destination stripe and release that stripe.
 	 */
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	struct stripe_head *sh;
 	sector_t first_sector, last_sector;
 	int raid_disks = conf->previous_raid_disks;
@@ -4075,9 +4068,9 @@
 }
 
 /* FIXME go_faster isn't used */
-static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	struct stripe_head *sh;
 	sector_t max_sector = mddev->dev_sectors;
 	sector_t sync_blocks;
@@ -4162,7 +4155,7 @@
 	return STRIPE_SECTORS;
 }
 
-static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
+static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 {
 	/* We may not be able to submit a whole bio at once as there
 	 * may not be enough stripe_heads available.
@@ -4234,10 +4227,10 @@
  * During the scan, completed stripes are saved for us by the interrupt
  * handler, so that they will not have to wait for our next wakeup.
  */
-static void raid5d(mddev_t *mddev)
+static void raid5d(struct mddev *mddev)
 {
 	struct stripe_head *sh;
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	int handled;
 	struct blk_plug plug;
 
@@ -4301,9 +4294,9 @@
 }
 
 static ssize_t
-raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
+raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	if (conf)
 		return sprintf(page, "%d\n", conf->max_nr_stripes);
 	else
@@ -4311,9 +4304,9 @@
 }
 
 int
-raid5_set_cache_size(mddev_t *mddev, int size)
+raid5_set_cache_size(struct mddev *mddev, int size)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	int err;
 
 	if (size <= 16 || size > 32768)
@@ -4337,9 +4330,9 @@
 EXPORT_SYMBOL(raid5_set_cache_size);
 
 static ssize_t
-raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
+raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	unsigned long new;
 	int err;
 
@@ -4362,9 +4355,9 @@
 				raid5_store_stripe_cache_size);
 
 static ssize_t
-raid5_show_preread_threshold(mddev_t *mddev, char *page)
+raid5_show_preread_threshold(struct mddev *mddev, char *page)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	if (conf)
 		return sprintf(page, "%d\n", conf->bypass_threshold);
 	else
@@ -4372,9 +4365,9 @@
 }
 
 static ssize_t
-raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
+raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	unsigned long new;
 	if (len >= PAGE_SIZE)
 		return -EINVAL;
@@ -4396,9 +4389,9 @@
 					raid5_store_preread_threshold);
 
 static ssize_t
-stripe_cache_active_show(mddev_t *mddev, char *page)
+stripe_cache_active_show(struct mddev *mddev, char *page)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	if (conf)
 		return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
 	else
@@ -4420,9 +4413,9 @@
 };
 
 static sector_t
-raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 
 	if (!sectors)
 		sectors = mddev->dev_sectors;
@@ -4435,7 +4428,7 @@
 	return sectors * (raid_disks - conf->max_degraded);
 }
 
-static void raid5_free_percpu(raid5_conf_t *conf)
+static void raid5_free_percpu(struct r5conf *conf)
 {
 	struct raid5_percpu *percpu;
 	unsigned long cpu;
@@ -4457,7 +4450,7 @@
 	free_percpu(conf->percpu);
 }
 
-static void free_conf(raid5_conf_t *conf)
+static void free_conf(struct r5conf *conf)
 {
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
@@ -4470,7 +4463,7 @@
 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 			      void *hcpu)
 {
-	raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
+	struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
 	long cpu = (long)hcpu;
 	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
 
@@ -4505,7 +4498,7 @@
 }
 #endif
 
-static int raid5_alloc_percpu(raid5_conf_t *conf)
+static int raid5_alloc_percpu(struct r5conf *conf)
 {
 	unsigned long cpu;
 	struct page *spare_page;
@@ -4547,11 +4540,11 @@
 	return err;
 }
 
-static raid5_conf_t *setup_conf(mddev_t *mddev)
+static struct r5conf *setup_conf(struct mddev *mddev)
 {
-	raid5_conf_t *conf;
+	struct r5conf *conf;
 	int raid_disk, memory, max_disks;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	struct disk_info *disk;
 
 	if (mddev->new_level != 5
@@ -4583,7 +4576,7 @@
 		return ERR_PTR(-EINVAL);
 	}
 
-	conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
+	conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
 	if (conf == NULL)
 		goto abort;
 	spin_lock_init(&conf->device_lock);
@@ -4598,6 +4591,7 @@
 	atomic_set(&conf->preread_active_stripes, 0);
 	atomic_set(&conf->active_aligned_reads, 0);
 	conf->bypass_threshold = BYPASS_THRESHOLD;
+	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
 	conf->raid_disks = mddev->raid_disks;
 	if (mddev->reshape_position == MaxSector)
@@ -4712,12 +4706,12 @@
 	return 0;
 }
 
-static int run(mddev_t *mddev)
+static int run(struct mddev *mddev)
 {
-	raid5_conf_t *conf;
+	struct r5conf *conf;
 	int working_disks = 0;
 	int dirty_parity_disks = 0;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	sector_t reshape_offset = 0;
 
 	if (mddev->recovery_cp != MaxSector)
@@ -4942,18 +4936,16 @@
 	return 0;
 abort:
 	md_unregister_thread(&mddev->thread);
-	if (conf) {
-		print_raid5_conf(conf);
-		free_conf(conf);
-	}
+	print_raid5_conf(conf);
+	free_conf(conf);
 	mddev->private = NULL;
 	printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
 	return -EIO;
 }
 
-static int stop(mddev_t *mddev)
+static int stop(struct mddev *mddev)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 
 	md_unregister_thread(&mddev->thread);
 	if (mddev->queue)
@@ -4964,44 +4956,9 @@
 	return 0;
 }
 
-#ifdef DEBUG
-static void print_sh(struct seq_file *seq, struct stripe_head *sh)
+static void status(struct seq_file *seq, struct mddev *mddev)
 {
-	int i;
-
-	seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
-		   (unsigned long long)sh->sector, sh->pd_idx, sh->state);
-	seq_printf(seq, "sh %llu,  count %d.\n",
-		   (unsigned long long)sh->sector, atomic_read(&sh->count));
-	seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
-	for (i = 0; i < sh->disks; i++) {
-		seq_printf(seq, "(cache%d: %p %ld) ",
-			   i, sh->dev[i].page, sh->dev[i].flags);
-	}
-	seq_printf(seq, "\n");
-}
-
-static void printall(struct seq_file *seq, raid5_conf_t *conf)
-{
-	struct stripe_head *sh;
-	struct hlist_node *hn;
-	int i;
-
-	spin_lock_irq(&conf->device_lock);
-	for (i = 0; i < NR_HASH; i++) {
-		hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
-			if (sh->raid_conf != conf)
-				continue;
-			print_sh(seq, sh);
-		}
-	}
-	spin_unlock_irq(&conf->device_lock);
-}
-#endif
-
-static void status(struct seq_file *seq, mddev_t *mddev)
-{
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	int i;
 
 	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
@@ -5012,13 +4969,9 @@
 			       conf->disks[i].rdev &&
 			       test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
 	seq_printf (seq, "]");
-#ifdef DEBUG
-	seq_printf (seq, "\n");
-	printall(seq, conf);
-#endif
 }
 
-static void print_raid5_conf (raid5_conf_t *conf)
+static void print_raid5_conf (struct r5conf *conf)
 {
 	int i;
 	struct disk_info *tmp;
@@ -5042,10 +4995,10 @@
 	}
 }
 
-static int raid5_spare_active(mddev_t *mddev)
+static int raid5_spare_active(struct mddev *mddev)
 {
 	int i;
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	struct disk_info *tmp;
 	int count = 0;
 	unsigned long flags;
@@ -5067,11 +5020,11 @@
 	return count;
 }
 
-static int raid5_remove_disk(mddev_t *mddev, int number)
+static int raid5_remove_disk(struct mddev *mddev, int number)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	int err = 0;
-	mdk_rdev_t *rdev;
+	struct md_rdev *rdev;
 	struct disk_info *p = conf->disks + number;
 
 	print_raid5_conf(conf);
@@ -5110,9 +5063,9 @@
 	return err;
 }
 
-static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	int err = -EEXIST;
 	int disk;
 	struct disk_info *p;
@@ -5153,7 +5106,7 @@
 	return err;
 }
 
-static int raid5_resize(mddev_t *mddev, sector_t sectors)
+static int raid5_resize(struct mddev *mddev, sector_t sectors)
 {
 	/* no resync is happening, and there is enough space
 	 * on all devices, so we can resize.
@@ -5180,7 +5133,7 @@
 	return 0;
 }
 
-static int check_stripe_cache(mddev_t *mddev)
+static int check_stripe_cache(struct mddev *mddev)
 {
 	/* Can only proceed if there are plenty of stripe_heads.
 	 * We need a minimum of one full stripe, and for sensible progress
@@ -5190,7 +5143,7 @@
 	 * If the chunk size is greater, user-space should request more
 	 * stripe_heads first.
 	 */
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
 	    > conf->max_nr_stripes ||
 	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
@@ -5204,9 +5157,9 @@
 	return 1;
 }
 
-static int check_reshape(mddev_t *mddev)
+static int check_reshape(struct mddev *mddev)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 
 	if (mddev->delta_disks == 0 &&
 	    mddev->new_layout == mddev->layout &&
@@ -5236,10 +5189,10 @@
 	return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
 }
 
-static int raid5_start_reshape(mddev_t *mddev)
+static int raid5_start_reshape(struct mddev *mddev)
 {
-	raid5_conf_t *conf = mddev->private;
-	mdk_rdev_t *rdev;
+	struct r5conf *conf = mddev->private;
+	struct md_rdev *rdev;
 	int spares = 0;
 	unsigned long flags;
 
@@ -5353,7 +5306,7 @@
 /* This is called from the reshape thread and should make any
  * changes needed in 'conf'
  */
-static void end_reshape(raid5_conf_t *conf)
+static void end_reshape(struct r5conf *conf)
 {
 
 	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
@@ -5380,9 +5333,9 @@
 /* This is called from the raid5d thread with mddev_lock held.
  * It makes config changes to the device.
  */
-static void raid5_finish_reshape(mddev_t *mddev)
+static void raid5_finish_reshape(struct mddev *mddev)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 
@@ -5401,7 +5354,7 @@
 			for (d = conf->raid_disks ;
 			     d < conf->raid_disks - mddev->delta_disks;
 			     d++) {
-				mdk_rdev_t *rdev = conf->disks[d].rdev;
+				struct md_rdev *rdev = conf->disks[d].rdev;
 				if (rdev && raid5_remove_disk(mddev, d) == 0) {
 					sysfs_unlink_rdev(mddev, rdev);
 					rdev->raid_disk = -1;
@@ -5415,9 +5368,9 @@
 	}
 }
 
-static void raid5_quiesce(mddev_t *mddev, int state)
+static void raid5_quiesce(struct mddev *mddev, int state)
 {
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 
 	switch(state) {
 	case 2: /* resume for a suspend */
@@ -5451,20 +5404,20 @@
 }
 
 
-static void *raid45_takeover_raid0(mddev_t *mddev, int level)
+static void *raid45_takeover_raid0(struct mddev *mddev, int level)
 {
-	struct raid0_private_data *raid0_priv = mddev->private;
+	struct r0conf *raid0_conf = mddev->private;
 	sector_t sectors;
 
 	/* for raid0 takeover only one zone is supported */
-	if (raid0_priv->nr_strip_zones > 1) {
+	if (raid0_conf->nr_strip_zones > 1) {
 		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
 		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
 
-	sectors = raid0_priv->strip_zone[0].zone_end;
-	sector_div(sectors, raid0_priv->strip_zone[0].nb_dev);
+	sectors = raid0_conf->strip_zone[0].zone_end;
+	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
 	mddev->dev_sectors = sectors;
 	mddev->new_level = level;
 	mddev->new_layout = ALGORITHM_PARITY_N;
@@ -5478,7 +5431,7 @@
 }
 
 
-static void *raid5_takeover_raid1(mddev_t *mddev)
+static void *raid5_takeover_raid1(struct mddev *mddev)
 {
 	int chunksect;
 
@@ -5505,7 +5458,7 @@
 	return setup_conf(mddev);
 }
 
-static void *raid5_takeover_raid6(mddev_t *mddev)
+static void *raid5_takeover_raid6(struct mddev *mddev)
 {
 	int new_layout;
 
@@ -5539,14 +5492,14 @@
 }
 
 
-static int raid5_check_reshape(mddev_t *mddev)
+static int raid5_check_reshape(struct mddev *mddev)
 {
 	/* For a 2-drive array, the layout and chunk size can be changed
 	 * immediately as not restriping is needed.
 	 * For larger arrays we record the new value - after validation
 	 * to be used by a reshape pass.
 	 */
-	raid5_conf_t *conf = mddev->private;
+	struct r5conf *conf = mddev->private;
 	int new_chunk = mddev->new_chunk_sectors;
 
 	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
@@ -5579,7 +5532,7 @@
 	return check_reshape(mddev);
 }
 
-static int raid6_check_reshape(mddev_t *mddev)
+static int raid6_check_reshape(struct mddev *mddev)
 {
 	int new_chunk = mddev->new_chunk_sectors;
 
@@ -5599,7 +5552,7 @@
 	return check_reshape(mddev);
 }
 
-static void *raid5_takeover(mddev_t *mddev)
+static void *raid5_takeover(struct mddev *mddev)
 {
 	/* raid5 can take over:
 	 *  raid0 - if there is only one strip zone - make it a raid4 layout
@@ -5622,7 +5575,7 @@
 	return ERR_PTR(-EINVAL);
 }
 
-static void *raid4_takeover(mddev_t *mddev)
+static void *raid4_takeover(struct mddev *mddev)
 {
 	/* raid4 can take over:
 	 *  raid0 - if there is only one strip zone
@@ -5639,9 +5592,9 @@
 	return ERR_PTR(-EINVAL);
 }
 
-static struct mdk_personality raid5_personality;
+static struct md_personality raid5_personality;
 
-static void *raid6_takeover(mddev_t *mddev)
+static void *raid6_takeover(struct mddev *mddev)
 {
 	/* Currently can only take over a raid5.  We map the
 	 * personality to an equivalent raid6 personality
@@ -5688,7 +5641,7 @@
 }
 
 
-static struct mdk_personality raid6_personality =
+static struct md_personality raid6_personality =
 {
 	.name		= "raid6",
 	.level		= 6,
@@ -5710,7 +5663,7 @@
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid6_takeover,
 };
-static struct mdk_personality raid5_personality =
+static struct md_personality raid5_personality =
 {
 	.name		= "raid5",
 	.level		= 5,
@@ -5733,7 +5686,7 @@
 	.takeover	= raid5_takeover,
 };
 
-static struct mdk_personality raid4_personality =
+static struct md_personality raid4_personality =
 {
 	.name		= "raid4",
 	.level		= 4,
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 11b9566..e10c553 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -197,7 +197,7 @@
 struct stripe_head {
 	struct hlist_node	hash;
 	struct list_head	lru;	      /* inactive_list or handle_list */
-	struct raid5_private_data *raid_conf;
+	struct r5conf		*raid_conf;
 	short			generation;	/* increments with every
 						 * reshape */
 	sector_t		sector;		/* sector of this row */
@@ -248,7 +248,7 @@
 	unsigned long ops_request;
 
 	struct bio *return_bi;
-	mdk_rdev_t *blocked_rdev;
+	struct md_rdev *blocked_rdev;
 	int handle_bad_blocks;
 };
 
@@ -344,12 +344,12 @@
 
 
 struct disk_info {
-	mdk_rdev_t	*rdev;
+	struct md_rdev	*rdev;
 };
 
-struct raid5_private_data {
+struct r5conf {
 	struct hlist_head	*stripe_hashtbl;
-	mddev_t			*mddev;
+	struct mddev		*mddev;
 	struct disk_info	*spare;
 	int			chunk_sectors;
 	int			level, algorithm;
@@ -436,11 +436,9 @@
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.
 	 */
-	struct mdk_thread_s	*thread;
+	struct md_thread	*thread;
 };
 
-typedef struct raid5_private_data raid5_conf_t;
-
 /*
  * Our supported algorithms
  */
@@ -503,7 +501,7 @@
 	return layout >= 8 && layout <= 10;
 }
 
-extern int md_raid5_congested(mddev_t *mddev, int bits);
-extern void md_raid5_kick_device(raid5_conf_t *conf);
-extern int raid5_set_cache_size(mddev_t *mddev, int size);
+extern int md_raid5_congested(struct mddev *mddev, int bits);
+extern void md_raid5_kick_device(struct r5conf *conf);
+extern int raid5_set_cache_size(struct mddev *mddev, int size);
 #endif
diff --git a/drivers/scsi/osd/Kconfig b/drivers/scsi/osd/Kconfig
index 861b5ce..a070351 100644
--- a/drivers/scsi/osd/Kconfig
+++ b/drivers/scsi/osd/Kconfig
@@ -11,10 +11,6 @@
 # it under the terms of the GNU General Public version 2 License as
 # published by the Free Software Foundation
 #
-# FIXME: SCSI_OSD_INITIATOR should select CONFIG (HMAC) SHA1 somehow.
-#        How is it done properly?
-#
-
 config SCSI_OSD_INITIATOR
 	tristate "OSD-Initiator library"
 	depends on SCSI
diff --git a/drivers/staging/et131x/Kconfig b/drivers/staging/et131x/Kconfig
index f0ed08d..9e1864c 100644
--- a/drivers/staging/et131x/Kconfig
+++ b/drivers/staging/et131x/Kconfig
@@ -1,6 +1,6 @@
 config ET131X
 	tristate "Agere ET-1310 Gigabit Ethernet support"
-	depends on NETDEV_1000 && PCI
+	depends on PCI
 	default n
 	---help---
 	  This driver supports Agere ET-1310 ethernet adapters.
diff --git a/drivers/staging/slicoss/Kconfig b/drivers/staging/slicoss/Kconfig
index d2993d3..5cde96b 100644
--- a/drivers/staging/slicoss/Kconfig
+++ b/drivers/staging/slicoss/Kconfig
@@ -1,6 +1,6 @@
 config SLICOSS
 	tristate "Alacritech Gigabit IS-NIC support"
-	depends on PCI && X86 && NETDEV_1000
+	depends on PCI && X86
 	default n
 	help
 	  This driver supports Alacritech's IS-NIC gigabit ethernet cards.
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index 809cbda..3774c9b 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -35,6 +35,7 @@
 #include <linux/notifier.h>
 #include <asm/cacheflush.h>
 #endif /* CONFIG_HPWDT_NMI_DECODING */
+#include <asm/nmi.h>
 
 #define HPWDT_VERSION			"1.3.0"
 #define SECS_TO_TICKS(secs)		((secs) * 1000 / 128)
@@ -477,15 +478,11 @@
 /*
  *	NMI Handler
  */
-static int hpwdt_pretimeout(struct notifier_block *nb, unsigned long ulReason,
-				void *data)
+static int hpwdt_pretimeout(unsigned int ulReason, struct pt_regs *regs)
 {
 	unsigned long rom_pl;
 	static int die_nmi_called;
 
-	if (ulReason != DIE_NMIUNKNOWN)
-		goto out;
-
 	if (!hpwdt_nmi_decoding)
 		goto out;
 
@@ -508,7 +505,7 @@
 		"Management Log for details.\n");
 
 out:
-	return NOTIFY_OK;
+	return NMI_DONE;
 }
 #endif /* CONFIG_HPWDT_NMI_DECODING */
 
@@ -648,13 +645,6 @@
 	.fops = &hpwdt_fops,
 };
 
-#ifdef CONFIG_HPWDT_NMI_DECODING
-static struct notifier_block die_notifier = {
-	.notifier_call = hpwdt_pretimeout,
-	.priority = 0,
-};
-#endif /* CONFIG_HPWDT_NMI_DECODING */
-
 /*
  *	Init & Exit
  */
@@ -740,10 +730,9 @@
 	 * die notify list to handle a critical NMI. The default is to
 	 * be last so other users of the NMI signal can function.
 	 */
-	if (priority)
-		die_notifier.priority = 0x7FFFFFFF;
-
-	retval = register_die_notifier(&die_notifier);
+	retval = register_nmi_handler(NMI_UNKNOWN, hpwdt_pretimeout,
+					(priority) ? NMI_FLAG_FIRST : 0,
+					"hpwdt");
 	if (retval != 0) {
 		dev_warn(&dev->dev,
 			"Unable to register a die notifier (err=%d).\n",
@@ -763,7 +752,7 @@
 
 static void hpwdt_exit_nmi_decoding(void)
 {
-	unregister_die_notifier(&die_notifier);
+	unregister_nmi_handler(NMI_UNKNOWN, "hpwdt");
 	if (cru_rom_addr)
 		iounmap(cru_rom_addr);
 }
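
The hpwdt change above drops the die-notifier plumbing in favour of the
register_nmi_handler()/unregister_nmi_handler() API. As a point of
reference, here is a minimal, hedged sketch of an unknown-NMI consumer
written against that same API; my_event_pending(), my_nmi_handler() and the
"mydrv" name are illustrative placeholders. NMI_UNKNOWN, NMI_FLAG_FIRST and
NMI_DONE appear in the hunk above; NMI_HANDLED is that API's counterpart
return value for a consumed event.

	#include <linux/module.h>
	#include <asm/nmi.h>

	/* Placeholder for a real "is this NMI ours?" hardware check. */
	static bool my_event_pending(void)
	{
		return false;
	}

	static int my_nmi_handler(unsigned int reason, struct pt_regs *regs)
	{
		if (!my_event_pending())
			return NMI_DONE;	/* not ours, let others look */

		/* decode/log the event here */
		return NMI_HANDLED;		/* consumed, stop the chain */
	}

	static int __init my_init(void)
	{
		/* Pass NMI_FLAG_FIRST instead of 0 to be called first, as
		 * hpwdt does when its "priority" parameter is set. */
		return register_nmi_handler(NMI_UNKNOWN, my_nmi_handler, 0,
					    "mydrv");
	}

	static void __exit my_exit(void)
	{
		unregister_nmi_handler(NMI_UNKNOWN, "mydrv");
	}

	module_init(my_init);
	module_exit(my_exit);
	MODULE_LICENSE("GPL");
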
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 7a55b29..0eb8a57 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -1049,7 +1049,7 @@
 	if (irq < 0)
 		return irq;
 
-	irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME;
+	irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME | IRQF_EARLY_RESUME;
 	retval = request_irq(irq, handler, irqflags, devname, dev_id);
 	if (retval != 0) {
 		unbind_from_irq(irq);
diff --git a/fs/Makefile b/fs/Makefile
index afc1096..5c30a13 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -120,6 +120,6 @@
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
-obj-$(CONFIG_EXOFS_FS)          += exofs/
+obj-$(y)          		+= exofs/ # Multiple mods, used by nfs/objlayout
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index c5a5855..352ba14 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -13,7 +13,8 @@
 #
 
 # ore module library
-obj-$(CONFIG_ORE) += ore.o
+libore-y := ore.o ore_raid.o
+obj-$(CONFIG_ORE) += libore.o
 
 exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index 70bae41..fa9a286 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,10 +1,17 @@
+# Note ORE needs to "select ASYNC_XOR". So as not to force multiple selects
+# for every ORE user we do it like this. Any user should add itself here
+# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
+# selected here, and we default to "ON". So in effect it is as if ORE were
+# selected by any of the users.
 config ORE
 	tristate
+	depends on EXOFS_FS
+	select ASYNC_XOR
+	default SCSI_OSD_ULD
 
 config EXOFS_FS
 	tristate "exofs: OSD based file system support"
 	depends on SCSI_OSD_ULD
-	select ORE
 	help
 	  EXOFS is a file system that uses an OSD storage device,
 	  as its backing storage.
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index f4e442e..51f4b4c 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -53,6 +53,10 @@
 /* u64 has problems with printk this will cast it to unsigned long long */
 #define _LLU(x) (unsigned long long)(x)
 
+struct exofs_dev {
+	struct ore_dev ored;
+	unsigned did;
+};
 /*
  * our extension to the in-memory superblock
  */
@@ -66,13 +70,9 @@
 	u32		s_next_generation;	/* next gen # to use          */
 	atomic_t	s_curr_pending;		/* number of pending commands */
 
-	struct pnfs_osd_data_map data_map;	/* Default raid to use
-						 * FIXME: Needed ?
-						 */
 	struct ore_layout	layout;		/* Default files layout       */
 	struct ore_comp one_comp;		/* id & cred of partition id=0*/
-	struct ore_components comps;		/* comps for the partition    */
-	struct osd_dev	*_min_one_dev[1];	/* Place holder for one dev   */
+	struct ore_components oc;		/* comps for the partition    */
 };
 
 /*
@@ -86,7 +86,7 @@
 	uint32_t       i_dir_start_lookup; /* which page to start lookup      */
 	uint64_t       i_commit_size;      /* the object's written length     */
 	struct ore_comp one_comp;	   /* same component for all devices  */
-	struct ore_components comps;	   /* inode view of the device table  */
+	struct ore_components oc;	   /* inode view of the device table  */
 };
 
 static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -207,7 +207,7 @@
  * bigger and that the device table repeats twice.
  * See: exofs_read_lookup_dev_table()
  */
-static inline void exofs_init_comps(struct ore_components *comps,
+static inline void exofs_init_comps(struct ore_components *oc,
 				    struct ore_comp *one_comp,
 				    struct exofs_sb_info *sbi, osd_id oid)
 {
@@ -217,13 +217,15 @@
 	one_comp->obj.id = oid;
 	exofs_make_credential(one_comp->cred, &one_comp->obj);
 
-	comps->numdevs = sbi->comps.numdevs;
-	comps->single_comp = EC_SINGLE_COMP;
-	comps->comps = one_comp;
+	oc->first_dev = 0;
+	oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 *
+							sbi->layout.group_count;
+	oc->single_comp = EC_SINGLE_COMP;
+	oc->comps = one_comp;
 
 	/* Round robin device view of the table */
-	first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs;
-	comps->ods = sbi->comps.ods + first_dev;
+	first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs;
+	oc->ods = &sbi->oc.ods[first_dev];
 }
 
 #endif
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f39a38f..3e5f3a6 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,11 +37,7 @@
 
 #define EXOFS_DBGMSG2(M...) do {} while (0)
 
-enum { BIO_MAX_PAGES_KMALLOC =
-		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
-	MAX_PAGES_KMALLOC =
-		PAGE_SIZE / sizeof(struct page *),
-};
+enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), };
 
 unsigned exofs_max_io_pages(struct ore_layout *layout,
 			    unsigned expected_pages)
@@ -49,8 +45,7 @@
 	unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
 
 	/* TODO: easily support bio chaining */
-	pages =  min_t(unsigned, pages,
-		       layout->group_width * BIO_MAX_PAGES_KMALLOC);
+	pages =  min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE);
 	return pages;
 }
 
@@ -68,6 +63,7 @@
 	bool read_4_write; /* This means two things: that the read is sync
 			    * And the pages should not be unlocked.
 			    */
+	struct page *that_locked_page;
 };
 
 static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -86,6 +82,7 @@
 	pcol->length = 0;
 	pcol->pg_first = -1;
 	pcol->read_4_write = false;
+	pcol->that_locked_page = NULL;
 }
 
 static void _pcol_reset(struct page_collect *pcol)
@@ -98,6 +95,7 @@
 	pcol->length = 0;
 	pcol->pg_first = -1;
 	pcol->ios = NULL;
+	pcol->that_locked_page = NULL;
 
 	/* this is probably the end of the loop but in writes
 	 * it might not end here. don't be left with nothing
@@ -149,14 +147,17 @@
 	return 0;
 }
 
+enum {PAGE_WAS_NOT_IN_IO = 17};
 static int update_read_page(struct page *page, int ret)
 {
-	if (ret == 0) {
+	switch (ret) {
+	case 0:
 		/* Everything is OK */
 		SetPageUptodate(page);
 		if (PageError(page))
 			ClearPageError(page);
-	} else if (ret == -EFAULT) {
+		break;
+	case -EFAULT:
 		/* In this case we were trying to read something that wasn't on
 		 * disk yet - return a page full of zeroes.  This should be OK,
 		 * because the object should be empty (if there was a write
@@ -167,16 +168,22 @@
 		SetPageUptodate(page);
 		if (PageError(page))
 			ClearPageError(page);
-		ret = 0; /* recovered error */
 		EXOFS_DBGMSG("recovered read error\n");
-	} else /* Error */
+		/* fall through */
+	case PAGE_WAS_NOT_IN_IO:
+		ret = 0; /* recovered error */
+		break;
+	default:
 		SetPageError(page);
-
+	}
 	return ret;
 }
 
 static void update_write_page(struct page *page, int ret)
 {
+	if (unlikely(ret == PAGE_WAS_NOT_IN_IO))
+		return; /* don't pass start don't collect $200 */
+
 	if (ret) {
 		mapping_set_error(page->mapping, ret);
 		SetPageError(page);
@@ -190,15 +197,16 @@
 static int __readpages_done(struct page_collect *pcol)
 {
 	int i;
-	u64 resid;
 	u64 good_bytes;
 	u64 length = 0;
-	int ret = ore_check_io(pcol->ios, &resid);
+	int ret = ore_check_io(pcol->ios, NULL);
 
-	if (likely(!ret))
+	if (likely(!ret)) {
 		good_bytes = pcol->length;
-	else
-		good_bytes = pcol->length - resid;
+		ret = PAGE_WAS_NOT_IN_IO;
+	} else {
+		good_bytes = 0;
+	}
 
 	EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
 		     " length=0x%lx nr_pages=%u\n",
@@ -259,6 +267,46 @@
 	}
 }
 
+static int _maybe_not_all_in_one_io(struct ore_io_state *ios,
+	struct page_collect *pcol_src, struct page_collect *pcol)
+{
+	/* length was wrong or offset was not page aligned */
+	BUG_ON(pcol_src->nr_pages < ios->nr_pages);
+
+	if (pcol_src->nr_pages > ios->nr_pages) {
+		struct page **src_page;
+		unsigned pages_less = pcol_src->nr_pages - ios->nr_pages;
+		unsigned long len_less = pcol_src->length - ios->length;
+		unsigned i;
+		int ret;
+
+		/* This IO was trimmed */
+		pcol_src->nr_pages = ios->nr_pages;
+		pcol_src->length = ios->length;
+
+		/* Left over pages are passed to the next io */
+		pcol->expected_pages += pages_less;
+		pcol->nr_pages = pages_less;
+		pcol->length = len_less;
+		src_page = pcol_src->pages + pcol_src->nr_pages;
+		pcol->pg_first = (*src_page)->index;
+
+		ret = pcol_try_alloc(pcol);
+		if (unlikely(ret))
+			return ret;
+
+		for (i = 0; i < pages_less; ++i)
+			pcol->pages[i] = *src_page++;
+
+		EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x "
+			"pages_less=0x%x expected_pages=0x%x "
+			"next_offset=0x%llx next_len=0x%lx\n",
+			pcol_src->nr_pages, pages_less, pcol->expected_pages,
+			pcol->pg_first * PAGE_SIZE, pcol->length);
+	}
+	return 0;
+}
+
 static int read_exec(struct page_collect *pcol)
 {
 	struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -270,7 +318,7 @@
 		return 0;
 
 	if (!pcol->ios) {
-		int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true,
+		int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
 					     pcol->pg_first << PAGE_CACHE_SHIFT,
 					     pcol->length, &pcol->ios);
 
@@ -280,7 +328,6 @@
 
 	ios = pcol->ios;
 	ios->pages = pcol->pages;
-	ios->nr_pages = pcol->nr_pages;
 
 	if (pcol->read_4_write) {
 		ore_read(pcol->ios);
@@ -296,17 +343,23 @@
 	*pcol_copy = *pcol;
 	ios->done = readpages_done;
 	ios->private = pcol_copy;
+
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+
+	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+	if (unlikely(ret))
+		goto err;
+
+	EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+		pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
 	ret = ore_read(ios);
 	if (unlikely(ret))
 		goto err;
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
 
-	EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
-		  oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
-
-	/* pages ownership was passed to pcol_copy */
-	_pcol_reset(pcol);
 	return 0;
 
 err:
@@ -341,6 +394,8 @@
 		EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
 			  page->index);
 
+	pcol->that_locked_page = page;
+
 	if (page->index < end_index)
 		len = PAGE_CACHE_SIZE;
 	else if (page->index == end_index)
@@ -429,6 +484,10 @@
 		return ret;
 	}
 
+	ret = read_exec(&pcol);
+	if (unlikely(ret))
+		return ret;
+
 	return read_exec(&pcol);
 }
 
@@ -462,17 +521,18 @@
 {
 	struct page_collect *pcol = p;
 	int i;
-	u64 resid;
 	u64  good_bytes;
 	u64  length = 0;
-	int ret = ore_check_io(ios, &resid);
+	int ret = ore_check_io(ios, NULL);
 
 	atomic_dec(&pcol->sbi->s_curr_pending);
 
-	if (likely(!ret))
+	if (likely(!ret)) {
 		good_bytes = pcol->length;
-	else
-		good_bytes = pcol->length - resid;
+		ret = PAGE_WAS_NOT_IN_IO;
+	} else {
+		good_bytes = 0;
+	}
 
 	EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
 		     " length=0x%lx nr_pages=%u\n",
@@ -505,6 +565,56 @@
 	EXOFS_DBGMSG2("writepages_done END\n");
 }
 
+static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
+{
+	struct page_collect *pcol = priv;
+	pgoff_t index = offset / PAGE_SIZE;
+
+	if (!pcol->that_locked_page ||
+	    (pcol->that_locked_page->index != index)) {
+		struct page *page = find_get_page(pcol->inode->i_mapping, index);
+
+		if (!page) {
+			page = find_or_create_page(pcol->inode->i_mapping,
+						   index, GFP_NOFS);
+			if (unlikely(!page)) {
+				EXOFS_DBGMSG("grab_cache_page Failed "
+					"index=0x%llx\n", _LLU(index));
+				return NULL;
+			}
+			unlock_page(page);
+		}
+		if (PageDirty(page) || PageWriteback(page))
+			*uptodate = true;
+		else
+			*uptodate = PageUptodate(page);
+		EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate);
+		return page;
+	} else {
+		EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n",
+			     pcol->that_locked_page->index);
+		*uptodate = true;
+		return pcol->that_locked_page;
+	}
+}
+
+static void __r4w_put_page(void *priv, struct page *page)
+{
+	struct page_collect *pcol = priv;
+
+	if (pcol->that_locked_page != page) {
+		EXOFS_DBGMSG("index=0x%lx\n", page->index);
+		page_cache_release(page);
+		return;
+	}
+	EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index);
+}
+
+static const struct _ore_r4w_op _r4w_op = {
+	.get_page = &__r4w_get_page,
+	.put_page = &__r4w_put_page,
+};
+
 static int write_exec(struct page_collect *pcol)
 {
 	struct exofs_i_info *oi = exofs_i(pcol->inode);
@@ -516,10 +626,9 @@
 		return 0;
 
 	BUG_ON(pcol->ios);
-	ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false,
+	ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
 				 pcol->pg_first << PAGE_CACHE_SHIFT,
 				 pcol->length, &pcol->ios);
-
 	if (unlikely(ret))
 		goto err;
 
@@ -534,10 +643,20 @@
 
 	ios = pcol->ios;
 	ios->pages = pcol_copy->pages;
-	ios->nr_pages = pcol_copy->nr_pages;
 	ios->done = writepages_done;
+	ios->r4w = &_r4w_op;
 	ios->private = pcol_copy;
 
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+
+	ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol);
+	if (unlikely(ret))
+		goto err;
+
+	EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n",
+		pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length));
+
 	ret = ore_write(ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("write_exec: ore_write() Failed\n");
@@ -545,11 +664,6 @@
 	}
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
-	EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
-		  pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
-		  pcol->length);
-	/* pages ownership was passed to pcol_copy */
-	_pcol_reset(pcol);
 	return 0;
 
 err:
@@ -689,14 +803,33 @@
 	_pcol_init(&pcol, expected_pages, mapping->host);
 
 	ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
-	if (ret) {
+	if (unlikely(ret)) {
 		EXOFS_ERR("write_cache_pages => %d\n", ret);
 		return ret;
 	}
 
-	return write_exec(&pcol);
+	ret = write_exec(&pcol);
+	if (unlikely(ret))
+		return ret;
+
+	if (wbc->sync_mode == WB_SYNC_ALL) {
+		return write_exec(&pcol); /* pump the last remainder */
+	} else if (pcol.nr_pages) {
+		/* not SYNC, let the remainder join the next writeout */
+		unsigned i;
+
+		for (i = 0; i < pcol.nr_pages; i++) {
+			struct page *page = pcol.pages[i];
+
+			end_page_writeback(page);
+			set_page_dirty(page);
+			unlock_page(page);
+		}
+	}
+	return 0;
 }
 
+/*
 static int exofs_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct page_collect pcol;
@@ -712,7 +845,7 @@
 
 	return write_exec(&pcol);
 }
-
+*/
 /* i_mutex held using inode->i_size directly */
 static void _write_failed(struct inode *inode, loff_t to)
 {
@@ -818,7 +951,7 @@
 const struct address_space_operations exofs_aops = {
 	.readpage	= exofs_readpage,
 	.readpages	= exofs_readpages,
-	.writepage	= exofs_writepage,
+	.writepage	= NULL,
 	.writepages	= exofs_writepages,
 	.write_begin	= exofs_write_begin_export,
 	.write_end	= exofs_write_end,
@@ -860,7 +993,7 @@
 
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
-	ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize);
+	ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize);
 	if (likely(!ret))
 		truncate_setsize(inode, newsize);
 
@@ -927,14 +1060,14 @@
 	struct exofs_on_disk_inode_layout *layout;
 	int ret;
 
-	ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
 		return ret;
 	}
 
-	attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
-	attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
+	attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
+	attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs);
 
 	ios->in_attr = attrs;
 	ios->in_attr_len = ARRAY_SIZE(attrs);
@@ -1018,7 +1151,7 @@
 		return inode;
 	oi = exofs_i(inode);
 	__oi_init(oi);
-	exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
+	exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
 			 exofs_oi_objno(oi));
 
 	/* read the inode from the osd */
@@ -1172,13 +1305,13 @@
 	spin_unlock(&sbi->s_next_gen_lock);
 	insert_inode_hash(inode);
 
-	exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
+	exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info,
 			 exofs_oi_objno(oi));
 	exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
 
 	mark_inode_dirty(inode);
 
-	ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
 		return ERR_PTR(ret);
@@ -1267,7 +1400,7 @@
 	} else
 		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
 
-	ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
 		goto free_args;
@@ -1350,7 +1483,7 @@
 	/* ignore the error, attempt a remove anyway */
 
 	/* Now Remove the OSD objects */
-	ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
 		return;
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 25305af..fcfa86a 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -24,76 +24,287 @@
 
 #include <linux/slab.h>
 #include <asm/div64.h>
+#include <linux/lcm.h>
 
-#include <scsi/osd_ore.h>
-
-#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
-
-#ifdef CONFIG_EXOFS_DEBUG
-#define ORE_DBGMSG(fmt, a...) \
-	printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define ORE_DBGMSG(fmt, a...) \
-	do { if (0) printk(fmt, ##a); } while (0)
-#endif
-
-/* u64 has problems with printk this will cast it to unsigned long long */
-#define _LLU(x) (unsigned long long)(x)
-
-#define ORE_DBGMSG2(M...) do {} while (0)
-/* #define ORE_DBGMSG2 ORE_DBGMSG */
+#include "ore_raid.h"
 
 MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
 MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
 MODULE_LICENSE("GPL");
 
+/* ore_verify_layout does a few things:
+ * 1. Given a minimum number of needed parameters, fixes up the rest of the
+ *    members to be operational for the ore. The needed parameters are those
+ *    that are defined by the pnfs-objects layout STD.
+ * 2. Checks that the current ore code actually supports these parameters,
+ *    for example that stripe_unit is a multiple of the system PAGE_SIZE,
+ *    and so on.
+ * 3. Caches some heavily used calculations that will be needed by users.
+ */
+
+enum { BIO_MAX_PAGES_KMALLOC =
+		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};
+
+int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
+{
+	u64 stripe_length;
+
+	switch (layout->raid_algorithm) {
+	case PNFS_OSD_RAID_0:
+		layout->parity = 0;
+		break;
+	case PNFS_OSD_RAID_5:
+		layout->parity = 1;
+		break;
+	case PNFS_OSD_RAID_PQ:
+	case PNFS_OSD_RAID_4:
+	default:
+		ORE_ERR("Only RAID_0/5 for now\n");
+		return -EINVAL;
+	}
+	if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
+		ORE_ERR("Stripe Unit(0x%llx)"
+			  " must be a multiple of PAGE_SIZE(0x%lx)\n",
+			  _LLU(layout->stripe_unit), PAGE_SIZE);
+		return -EINVAL;
+	}
+	if (layout->group_width) {
+		if (!layout->group_depth) {
+			ORE_ERR("group_depth == 0 && group_width != 0\n");
+			return -EINVAL;
+		}
+		if (total_comps < (layout->group_width * layout->mirrors_p1)) {
+			ORE_ERR("Data Map wrong, "
+				"numdevs=%d < group_width=%d * mirrors=%d\n",
+				total_comps, layout->group_width,
+				layout->mirrors_p1);
+			return -EINVAL;
+		}
+		layout->group_count = total_comps / layout->mirrors_p1 /
+						layout->group_width;
+	} else {
+		if (layout->group_depth) {
+			printk(KERN_NOTICE "Warning: group_depth ignored "
+				"group_width == 0 && group_depth == %lld\n",
+				_LLU(layout->group_depth));
+		}
+		layout->group_width = total_comps / layout->mirrors_p1;
+		layout->group_depth = -1;
+		layout->group_count = 1;
+	}
+
+	stripe_length = (u64)layout->group_width * layout->stripe_unit;
+	if (stripe_length >= (1ULL << 32)) {
+		ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
+			_LLU(stripe_length));
+		return -EINVAL;
+	}
+
+	layout->max_io_length =
+		(BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
+							layout->group_width;
+	if (layout->parity) {
+		unsigned stripe_length =
+				(layout->group_width - layout->parity) *
+				layout->stripe_unit;
+
+		layout->max_io_length /= stripe_length;
+		layout->max_io_length *= stripe_length;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ore_verify_layout);
+
 static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
 {
-	return ios->comps->comps[index & ios->comps->single_comp].cred;
+	return ios->oc->comps[index & ios->oc->single_comp].cred;
 }
 
 static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
 {
-	return &ios->comps->comps[index & ios->comps->single_comp].obj;
+	return &ios->oc->comps[index & ios->oc->single_comp].obj;
 }
 
 static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
 {
-	return ios->comps->ods[index];
+	ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
+		    ios->oc->first_dev, ios->oc->numdevs, index,
+		    ios->oc->ods);
+
+	return ore_comp_dev(ios->oc, index);
 }
 
-int  ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
+int  _ore_get_io_state(struct ore_layout *layout,
+			struct ore_components *oc, unsigned numdevs,
+			unsigned sgs_per_dev, unsigned num_par_pages,
+			struct ore_io_state **pios)
+{
+	struct ore_io_state *ios;
+	struct page **pages;
+	struct osd_sg_entry *sgilist;
+	struct __alloc_all_io_state {
+		struct ore_io_state ios;
+		struct ore_per_dev_state per_dev[numdevs];
+		union {
+			struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+			struct page *pages[num_par_pages];
+		};
+	} *_aios;
+
+	if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
+		_aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
+		if (unlikely(!_aios)) {
+			ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
+				   sizeof(*_aios));
+			*pios = NULL;
+			return -ENOMEM;
+		}
+		pages = num_par_pages ? _aios->pages : NULL;
+		sgilist = sgs_per_dev ? _aios->sglist : NULL;
+		ios = &_aios->ios;
+	} else {
+		struct __alloc_small_io_state {
+			struct ore_io_state ios;
+			struct ore_per_dev_state per_dev[numdevs];
+		} *_aio_small;
+		union __extra_part {
+			struct osd_sg_entry sglist[sgs_per_dev * numdevs];
+			struct page *pages[num_par_pages];
+		} *extra_part;
+
+		_aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
+		if (unlikely(!_aio_small)) {
+			ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
+				   sizeof(*_aio_small));
+			*pios = NULL;
+			return -ENOMEM;
+		}
+		extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
+		if (unlikely(!extra_part)) {
+			ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
+				   sizeof(*extra_part));
+			kfree(_aio_small);
+			*pios = NULL;
+			return -ENOMEM;
+		}
+
+		pages = num_par_pages ? extra_part->pages : NULL;
+		sgilist = sgs_per_dev ? extra_part->sglist : NULL;
+		/* In this case per_dev[0].sglist holds the pointer to
+		 * be freed
+		 */
+		ios = &_aio_small->ios;
+		ios->extra_part_alloc = true;
+	}
+
+	if (pages) {
+		ios->parity_pages = pages;
+		ios->max_par_pages = num_par_pages;
+	}
+	if (sgilist) {
+		unsigned d;
+
+		for (d = 0; d < numdevs; ++d) {
+			ios->per_dev[d].sglist = sgilist;
+			sgilist += sgs_per_dev;
+		}
+		ios->sgs_per_dev = sgs_per_dev;
+	}
+
+	ios->layout = layout;
+	ios->oc = oc;
+	*pios = ios;
+	return 0;
+}
+
+/* Allocate an io_state for only a single group of devices
+ *
+ * If a user needs to call ore_read/write() this version must be used because
+ * it allocates extra stuff for striping and raid.
+ * The ore might decide to only IO less than @length bytes due to alignments
+ * and constraints, as follows:
+ * - The IO cannot cross a group boundary.
+ * - In raid5/6 the end of the IO must align at the end of a stripe, e.g.
+ *   (@offset + @length) % strip_size == 0, or the complete range is within a
+ *   single stripe.
+ * - Memory conditions may only permit a shorter IO. (A user can use
+ *   @length=~0 and check the returned ios->length for max_io_size.)
+ *
+ * The caller must check the returned ios->length (and/or ios->nr_pages) and
+ * re-issue the pages that fall outside of ios->length.
+ */
+int  ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
 		      bool is_reading, u64 offset, u64 length,
 		      struct ore_io_state **pios)
 {
 	struct ore_io_state *ios;
+	unsigned numdevs = layout->group_width * layout->mirrors_p1;
+	unsigned sgs_per_dev = 0, max_par_pages = 0;
+	int ret;
 
-	/*TODO: Maybe use kmem_cach per sbi of size
-	 * exofs_io_state_size(layout->s_numdevs)
-	 */
-	ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL);
-	if (unlikely(!ios)) {
-		ORE_DBGMSG("Failed kzalloc bytes=%d\n",
-			     ore_io_state_size(comps->numdevs));
-		*pios = NULL;
-		return -ENOMEM;
+	if (layout->parity && length) {
+		unsigned data_devs = layout->group_width - layout->parity;
+		unsigned stripe_size = layout->stripe_unit * data_devs;
+		unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+		u32 remainder;
+		u64 num_stripes;
+		u64 num_raid_units;
+
+		num_stripes = div_u64_rem(length, stripe_size, &remainder);
+		if (remainder)
+			++num_stripes;
+
+		num_raid_units =  num_stripes * layout->parity;
+
+		if (is_reading) {
+			/* For reads add per_dev sglist array */
+			/* TODO: Raid 6 we need twice more. Actually:
+			*         num_stripes / LCMdP(W,P);
+			*         if (W%P != 0) num_stripes *= parity;
+			*/
+
+			/* first/last seg is split */
+			num_raid_units += layout->group_width;
+			sgs_per_dev = div_u64(num_raid_units, data_devs);
+		} else {
+			/* For Writes add parity pages array. */
+			max_par_pages = num_raid_units * pages_in_unit *
+						sizeof(struct page *);
+		}
 	}
 
-	ios->layout = layout;
-	ios->comps = comps;
-	ios->offset = offset;
-	ios->length = length;
-	ios->reading = is_reading;
+	ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
+				pios);
+	if (unlikely(ret))
+		return ret;
 
-	*pios = ios;
+	ios = *pios;
+	ios->reading = is_reading;
+	ios->offset = offset;
+
+	if (length) {
+		ore_calc_stripe_info(layout, offset, length, &ios->si);
+		ios->length = ios->si.length;
+		ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+		if (layout->parity)
+			_ore_post_alloc_raid_stuff(ios);
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL(ore_get_rw_state);
 
-int  ore_get_io_state(struct ore_layout *layout, struct ore_components *comps,
-		      struct ore_io_state **ios)
+/* Allocate an io_state for all the devices in the comps array
+ *
+ * This version of io_state allocation is used mostly by create/remove
+ * and trunc where we currently need all the devices. The only wasteful
+ * bit is the read/write_attributes with no IO. Those sites should
+ * be converted to use ore_get_rw_state() with length=0.
+ */
+int  ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
+		      struct ore_io_state **pios)
 {
-	return ore_get_rw_state(layout, comps, true, 0, 0, ios);
+	return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
 }
 EXPORT_SYMBOL(ore_get_io_state);
 
@@ -111,6 +322,7 @@
 				bio_put(per_dev->bio);
 		}
 
+		_ore_free_raid_stuff(ios);
 		kfree(ios);
 	}
 }
@@ -138,7 +350,7 @@
 	kref_put(&ios->kref, _last_io);
 }
 
-static int ore_io_execute(struct ore_io_state *ios)
+int ore_io_execute(struct ore_io_state *ios)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	bool sync = (ios->done == NULL);
@@ -198,7 +410,7 @@
 	}
 }
 
-int ore_check_io(struct ore_io_state *ios, u64 *resid)
+int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
 {
 	enum osd_err_priority acumulated_osd_err = 0;
 	int acumulated_lin_err = 0;
@@ -206,7 +418,8 @@
 
 	for (i = 0; i < ios->numdevs; i++) {
 		struct osd_sense_info osi;
-		struct osd_request *or = ios->per_dev[i].or;
+		struct ore_per_dev_state *per_dev = &ios->per_dev[i];
+		struct osd_request *or = per_dev->or;
 		int ret;
 
 		if (unlikely(!or))
@@ -218,29 +431,31 @@
 
 		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
 			/* start read offset passed endof file */
-			_clear_bio(ios->per_dev[i].bio);
+			_clear_bio(per_dev->bio);
 			ORE_DBGMSG("start read offset passed end of file "
 				"offset=0x%llx, length=0x%llx\n",
-				_LLU(ios->per_dev[i].offset),
-				_LLU(ios->per_dev[i].length));
+				_LLU(per_dev->offset),
+				_LLU(per_dev->length));
 
 			continue; /* we recovered */
 		}
 
+		if (on_dev_error) {
+			u64 residual = ios->reading ?
+					or->in.residual : or->out.residual;
+			u64 offset = (ios->offset + ios->length) - residual;
+			struct ore_dev *od = ios->oc->ods[
+					per_dev->dev - ios->oc->first_dev];
+
+			on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri,
+				     offset, residual);
+		}
 		if (osi.osd_err_pri >= acumulated_osd_err) {
 			acumulated_osd_err = osi.osd_err_pri;
 			acumulated_lin_err = ret;
 		}
 	}
 
-	/* TODO: raid specific residual calculations */
-	if (resid) {
-		if (likely(!acumulated_lin_err))
-			*resid = 0;
-		else
-			*resid = ios->length;
-	}
-
 	return acumulated_lin_err;
 }
 EXPORT_SYMBOL(ore_check_io);
@@ -248,61 +463,65 @@
 /*
  * L - logical offset into the file
  *
- * U - The number of bytes in a stripe within a group
+ * D - number of Data devices
+ *	D = group_width - parity
  *
- *	U = stripe_unit * group_width
+ * U - The number of bytes in a stripe within a group
+ *	U =  stripe_unit * D
  *
  * T - The number of bytes striped within a group of component objects
  *     (before advancing to the next group)
- *
- *	T = stripe_unit * group_width * group_depth
+ *	T = U * group_depth
  *
  * S - The number of bytes striped across all component objects
  *     before the pattern repeats
+ *	S = T * group_count
  *
- *	S = stripe_unit * group_width * group_depth * group_count
- *
- * M - The "major" (i.e., across all components) stripe number
- *
+ * M - The "major" (i.e., across all components) cycle number
  *	M = L / S
  *
- * G - Counts the groups from the beginning of the major stripe
- *
+ * G - Counts the groups from the beginning of the major cycle
  *	G = (L - (M * S)) / T	[or (L % S) / T]
  *
  * H - The byte offset within the group
- *
  *	H = (L - (M * S)) % T	[or (L % S) % T]
  *
  * N - The "minor" (i.e., across the group) stripe number
- *
  *	N = H / U
  *
  * C - The component index coresponding to L
  *
- *	C = (H - (N * U)) / stripe_unit + G * group_width
- *	[or (L % U) / stripe_unit + G * group_width]
+ *	C = (H - (N * U)) / stripe_unit + G * D
+ *	[or (L % U) / stripe_unit + G * D]
  *
  * O - The component offset coresponding to L
- *
  *	O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
+ *
+ * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity
+ *          divide by parity
+ *	LCMdP = lcm(group_width, parity) / parity
+ *
+ * R - The parity Rotation stripe
+ *     (Note parity cycle always starts at a group's boundary)
+ *	R = N % LCMdP
+ *
+ * I = the first parity device index
+ *	I = (group_width + group_width - R*parity - parity) % group_width
+ *
+ * Craid - The component index Rotated
+ *	Craid = (group_width + C - R*parity) % group_width
+ *      (We add the group_width to avoid negative numbers modulo math)
  */
-struct _striping_info {
-	u64 obj_offset;
-	u64 group_length;
-	u64 M; /* for truncate */
-	unsigned dev;
-	unsigned unit_off;
-};
-
-static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
-			      struct _striping_info *si)
+void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+			  u64 length, struct ore_striping_info *si)
 {
 	u32	stripe_unit = layout->stripe_unit;
 	u32	group_width = layout->group_width;
 	u64	group_depth = layout->group_depth;
+	u32	parity      = layout->parity;
 
-	u32	U = stripe_unit * group_width;
+	u32	D = group_width - parity;
+	u32	U = D * stripe_unit;
 	u64	T = U * group_depth;
 	u64	S = T * layout->group_count;
 	u64	M = div64_u64(file_offset, S);
@@ -318,39 +537,65 @@
 	u32	N = div_u64(H, U);
 
 	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
-	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
-	si->dev *= layout->mirrors_p1;
+	u32	C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
 
 	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
 
 	si->obj_offset = si->unit_off + (N * stripe_unit) +
 				  (M * group_depth * stripe_unit);
 
-	si->group_length = T - H;
+	if (parity) {
+		u32 LCMdP = lcm(group_width, parity) / parity;
+		/* R     = N % LCMdP; */
+		u32 RxP   = (N % LCMdP) * parity;
+		u32 first_dev = C - C % group_width;
+
+		si->par_dev = (group_width + group_width - parity - RxP) %
+			      group_width + first_dev;
+		si->dev = (group_width + C - RxP) % group_width + first_dev;
+		si->bytes_in_stripe = U;
+		si->first_stripe_start = M * S + G * T + N * U;
+	} else {
+		/* Make the math correct; see _prepare_one_group */
+		si->par_dev = group_width;
+		si->dev = C;
+	}
+
+	si->dev *= layout->mirrors_p1;
+	si->par_dev *= layout->mirrors_p1;
+	si->offset = file_offset;
+	si->length = T - H;
+	if (si->length > length)
+		si->length = length;
 	si->M = M;
 }
+EXPORT_SYMBOL(ore_calc_stripe_info);
 
-static int _add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
-		unsigned pgbase, struct ore_per_dev_state *per_dev,
-		int cur_len)
+int _ore_add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
+			 unsigned pgbase, struct page **pages,
+			 struct ore_per_dev_state *per_dev, int cur_len)
 {
 	unsigned pg = *cur_pg;
 	struct request_queue *q =
 			osd_request_queue(_ios_od(ios, per_dev->dev));
-
-	per_dev->length += cur_len;
+	unsigned len = cur_len;
+	int ret;
 
 	if (per_dev->bio == NULL) {
 		unsigned pages_in_stripe = ios->layout->group_width *
 					(ios->layout->stripe_unit / PAGE_SIZE);
-		unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
-						ios->layout->group_width;
+		unsigned nr_pages = ios->nr_pages * ios->layout->group_width /
+					(ios->layout->group_width -
+					 ios->layout->parity);
+		unsigned bio_size = (nr_pages + pages_in_stripe) /
+					ios->layout->group_width;
 
 		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
 		if (unlikely(!per_dev->bio)) {
 			ORE_DBGMSG("Failed to allocate BIO size=%u\n",
 				     bio_size);
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto out;
 		}
 	}
 
@@ -358,64 +603,90 @@
 		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
 		unsigned added_len;
 
-		BUG_ON(ios->nr_pages <= pg);
 		cur_len -= pglen;
 
-		added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
+		added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
 					    pglen, pgbase);
-		if (unlikely(pglen != added_len))
-			return -ENOMEM;
+		if (unlikely(pglen != added_len)) {
+			ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n",
+				   per_dev->bio->bi_vcnt);
+			ret = -ENOMEM;
+			goto out;
+		}
+		_add_stripe_page(ios->sp2d, &ios->si, pages[pg]);
+
 		pgbase = 0;
 		++pg;
 	}
 	BUG_ON(cur_len);
 
+	per_dev->length += len;
 	*cur_pg = pg;
-	return 0;
+	ret = 0;
+out:	/* we fail the complete unit on an error, i.e. don't advance
+	 * per_dev->length and cur_pg. This means that we might have a bigger
+	 * bio than the CDB requested length (per_dev->length). That's fine;
+	 * only the opposite is fatal.
+	 */
+	return ret;
 }
 
-static int _prepare_one_group(struct ore_io_state *ios, u64 length,
-			      struct _striping_info *si)
+static int _prepare_for_striping(struct ore_io_state *ios)
 {
+	struct ore_striping_info *si = &ios->si;
 	unsigned stripe_unit = ios->layout->stripe_unit;
 	unsigned mirrors_p1 = ios->layout->mirrors_p1;
-	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
+	unsigned group_width = ios->layout->group_width;
+	unsigned devs_in_group = group_width * mirrors_p1;
 	unsigned dev = si->dev;
 	unsigned first_dev = dev - (dev % devs_in_group);
-	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+	unsigned dev_order;
 	unsigned cur_pg = ios->pages_consumed;
+	u64 length = ios->length;
 	int ret = 0;
 
+	if (!ios->pages) {
+		ios->numdevs = ios->layout->mirrors_p1;
+		return 0;
+	}
+
+	BUG_ON(length > si->length);
+
+	dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
+	si->cur_comp = dev_order;
+	si->cur_pg = si->unit_off / PAGE_SIZE;
+
 	while (length) {
-		struct ore_per_dev_state *per_dev = &ios->per_dev[dev];
+		unsigned comp = dev - first_dev;
+		struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
 		unsigned cur_len, page_off = 0;
 
 		if (!per_dev->length) {
 			per_dev->dev = dev;
-			if (dev < si->dev) {
-				per_dev->offset = si->obj_offset + stripe_unit -
-								   si->unit_off;
-				cur_len = stripe_unit;
-			} else if (dev == si->dev) {
+			if (dev == si->dev) {
+				WARN_ON(dev == si->par_dev);
 				per_dev->offset = si->obj_offset;
 				cur_len = stripe_unit - si->unit_off;
 				page_off = si->unit_off & ~PAGE_MASK;
 				BUG_ON(page_off && (page_off != ios->pgbase));
-			} else { /* dev > si->dev */
-				per_dev->offset = si->obj_offset - si->unit_off;
+			} else {
+				if (si->cur_comp > dev_order)
+					per_dev->offset =
+						si->obj_offset - si->unit_off;
+				else /* si->cur_comp < dev_order */
+					per_dev->offset =
+						si->obj_offset + stripe_unit -
+								   si->unit_off;
 				cur_len = stripe_unit;
 			}
-
-			if (max_comp < dev)
-				max_comp = dev;
 		} else {
 			cur_len = stripe_unit;
 		}
 		if (cur_len >= length)
 			cur_len = length;
 
-		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
-				       cur_len);
+		ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
+					   per_dev, cur_len);
 		if (unlikely(ret))
 			goto out;
 
@@ -423,60 +694,60 @@
 		dev = (dev % devs_in_group) + first_dev;
 
 		length -= cur_len;
-	}
-out:
-	ios->numdevs = max_comp + mirrors_p1;
-	ios->pages_consumed = cur_pg;
-	return ret;
-}
 
-static int _prepare_for_striping(struct ore_io_state *ios)
-{
-	u64 length = ios->length;
-	u64 offset = ios->offset;
-	struct _striping_info si;
-	int ret = 0;
+		si->cur_comp = (si->cur_comp + 1) % group_width;
+		if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
+			if (!length && ios->sp2d) {
+				/* If we are writing and this is the very last
+				 * stripe, then operate on the parity dev.
+				 */
+				dev = si->par_dev;
+			}
+			if (ios->sp2d)
+				/* In writes cur_len just means if it's the
+				 * last one. See _ore_add_parity_unit.
+				 */
+				cur_len = length;
+			per_dev = &ios->per_dev[dev - first_dev];
+			if (!per_dev->length) {
+				/* Only/always the parity unit of the first
+				 * stripe will be empty. So this is a chance to
+				 * initialize the per_dev info.
+				 */
+				per_dev->dev = dev;
+				per_dev->offset = si->obj_offset - si->unit_off;
+			}
 
-	if (!ios->pages) {
-		if (ios->kern_buff) {
-			struct ore_per_dev_state *per_dev = &ios->per_dev[0];
+			ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
+			if (unlikely(ret))
+					goto out;
 
-			_calc_stripe_info(ios->layout, ios->offset, &si);
-			per_dev->offset = si.obj_offset;
-			per_dev->dev = si.dev;
-
-			/* no cross device without page array */
-			BUG_ON((ios->layout->group_width > 1) &&
-			       (si.unit_off + ios->length >
-				ios->layout->stripe_unit));
+			/* Rotate next par_dev backwards with wrapping */
+			si->par_dev = (devs_in_group + si->par_dev -
+				       ios->layout->parity * mirrors_p1) %
+				      devs_in_group + first_dev;
+			/* Next stripe, start fresh */
+			si->cur_comp = 0;
+			si->cur_pg = 0;
 		}
-		ios->numdevs = ios->layout->mirrors_p1;
-		return 0;
 	}
-
-	while (length) {
-		_calc_stripe_info(ios->layout, offset, &si);
-
-		if (length < si.group_length)
-			si.group_length = length;
-
-		ret = _prepare_one_group(ios, si.group_length, &si);
-		if (unlikely(ret))
-			goto out;
-
-		offset += si.group_length;
-		length -= si.group_length;
-	}
-
 out:
-	return ret;
+	ios->numdevs = devs_in_group;
+	ios->pages_consumed = cur_pg;
+	if (unlikely(ret)) {
+		if (length == ios->length)
+			return ret;
+		else
+			ios->length -= length;
+	}
+	return 0;
 }
 
 int ore_create(struct ore_io_state *ios)
 {
 	int i, ret;
 
-	for (i = 0; i < ios->comps->numdevs; i++) {
+	for (i = 0; i < ios->oc->numdevs; i++) {
 		struct osd_request *or;
 
 		or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -501,7 +772,7 @@
 {
 	int i, ret;
 
-	for (i = 0; i < ios->comps->numdevs; i++) {
+	for (i = 0; i < ios->oc->numdevs; i++) {
 		struct osd_request *or;
 
 		or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
@@ -543,7 +814,6 @@
 			goto out;
 		}
 		per_dev->or = or;
-		per_dev->offset = master_dev->offset;
 
 		if (ios->pages) {
 			struct bio *bio;
@@ -562,6 +832,7 @@
 				__bio_clone(bio, master_dev->bio);
 				bio->bi_bdev = NULL;
 				bio->bi_next = NULL;
+				per_dev->offset = master_dev->offset;
 				per_dev->length = master_dev->length;
 				per_dev->bio =  bio;
 				per_dev->dev = dev;
@@ -579,7 +850,15 @@
 				     _LLU(per_dev->offset),
 				     _LLU(per_dev->length), dev);
 		} else if (ios->kern_buff) {
-			ret = osd_req_write_kern(or, _ios_obj(ios, dev),
+			per_dev->offset = ios->si.obj_offset;
+			per_dev->dev = ios->si.dev + dev;
+
+			/* no cross device without page array */
+			BUG_ON((ios->layout->group_width > 1) &&
+			       (ios->si.unit_off + ios->length >
+				ios->layout->stripe_unit));
+
+			ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev),
 						 per_dev->offset,
 						 ios->kern_buff, ios->length);
 			if (unlikely(ret))
@@ -588,7 +867,7 @@
 				      "length=0x%llx dev=%d\n",
 				     _LLU(_ios_obj(ios, dev)->id),
 				     _LLU(per_dev->offset),
-				     _LLU(ios->length), dev);
+				     _LLU(ios->length), per_dev->dev);
 		} else {
 			osd_req_set_attributes(or, _ios_obj(ios, dev));
 			ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
@@ -614,6 +893,14 @@
 	int i;
 	int ret;
 
+	if (unlikely(ios->sp2d && !ios->r4w)) {
+		/* A library is attempting a RAID-write without providing
+		 * a pages lock interface.
+		 */
+		WARN_ON_ONCE(1);
+		return -ENOTSUPP;
+	}
+
 	ret = _prepare_for_striping(ios);
 	if (unlikely(ret))
 		return ret;
@@ -629,7 +916,7 @@
 }
 EXPORT_SYMBOL(ore_write);
 
-static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
 {
 	struct osd_request *or;
 	struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
@@ -648,22 +935,27 @@
 	per_dev->or = or;
 
 	if (ios->pages) {
-		osd_req_read(or, obj, per_dev->offset,
-				per_dev->bio, per_dev->length);
+		if (per_dev->cur_sg) {
+			/* finalize the last sg_entry */
+			_ore_add_sg_seg(per_dev, 0, false);
+			if (unlikely(!per_dev->cur_sg))
+				return 0; /* Skip parity only device */
+
+			osd_req_read_sg(or, obj, per_dev->bio,
+					per_dev->sglist, per_dev->cur_sg);
+		} else {
+			/* The no raid case */
+			osd_req_read(or, obj, per_dev->offset,
+				     per_dev->bio, per_dev->length);
+		}
+
 		ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
-			     " dev=%d\n", _LLU(obj->id),
+			     " dev=%d sg_len=%d\n", _LLU(obj->id),
 			     _LLU(per_dev->offset), _LLU(per_dev->length),
-			     first_dev);
-	} else if (ios->kern_buff) {
-		int ret = osd_req_read_kern(or, obj, per_dev->offset,
-					    ios->kern_buff, ios->length);
-		ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
-			      "length=0x%llx dev=%d ret=>%d\n",
-			      _LLU(obj->id), _LLU(per_dev->offset),
-			      _LLU(ios->length), first_dev, ret);
-		if (unlikely(ret))
-			return ret;
+			     first_dev, per_dev->cur_sg);
 	} else {
+		BUG_ON(ios->kern_buff);
+
 		osd_req_get_attributes(or, obj);
 		ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
 			      _LLU(obj->id),
@@ -688,7 +980,7 @@
 		return ret;
 
 	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
-		ret = _read_mirror(ios, i);
+		ret = _ore_read_mirror(ios, i);
 		if (unlikely(ret))
 			return ret;
 	}
@@ -744,31 +1036,29 @@
 }
 
 struct _trunc_info {
-	struct _striping_info si;
+	struct ore_striping_info si;
 	u64 prev_group_obj_off;
 	u64 next_group_obj_off;
 
 	unsigned first_group_dev;
 	unsigned nex_group_dev;
-	unsigned max_devs;
 };
 
-void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
-		       struct _trunc_info *ti)
+static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
+			     struct _trunc_info *ti)
 {
 	unsigned stripe_unit = layout->stripe_unit;
 
-	_calc_stripe_info(layout, file_offset, &ti->si);
+	ore_calc_stripe_info(layout, file_offset, 0, &ti->si);
 
 	ti->prev_group_obj_off = ti->si.M * stripe_unit;
 	ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
 
 	ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
 	ti->nex_group_dev = ti->first_group_dev + layout->group_width;
-	ti->max_devs = layout->group_width * layout->group_count;
 }
 
-int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
+int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
 		   u64 size)
 {
 	struct ore_io_state *ios;
@@ -779,22 +1069,22 @@
 	struct _trunc_info ti;
 	int i, ret;
 
-	ret = ore_get_io_state(layout, comps, &ios);
+	ret = ore_get_io_state(layout, oc, &ios);
 	if (unlikely(ret))
 		return ret;
 
 	_calc_trunk_info(ios->layout, size, &ti);
 
-	size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs),
+	size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
 			     GFP_KERNEL);
 	if (unlikely(!size_attrs)) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	ios->numdevs = ios->comps->numdevs;
+	ios->numdevs = ios->oc->numdevs;
 
-	for (i = 0; i < ti.max_devs; ++i) {
+	for (i = 0; i < ios->numdevs; ++i) {
 		struct exofs_trunc_attr *size_attr = &size_attrs[i];
 		u64 obj_size;
 
@@ -815,7 +1105,7 @@
 		size_attr->attr.val_ptr = &size_attr->newsize;
 
 		ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
-			     _LLU(comps->comps->obj.id), _LLU(obj_size), i);
+			     _LLU(oc->comps->obj.id), _LLU(obj_size), i);
 		ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
 					&size_attr->attr);
 		if (unlikely(ret))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
new file mode 100644
index 0000000..29c47e5
--- /dev/null
+++ b/fs/exofs/ore_raid.c
@@ -0,0 +1,660 @@
+/*
+ * Copyright (C) 2011
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ *	"Free Software Foundation <info@fsf.org>"
+ */
+
+#include <linux/gfp.h>
+#include <linux/async_tx.h>
+
+#include "ore_raid.h"
+
+#undef ORE_DBGMSG2
+#define ORE_DBGMSG2 ORE_DBGMSG
+
+struct page *_raid_page_alloc(void)
+{
+	return alloc_page(GFP_KERNEL);
+}
+
+void _raid_page_free(struct page *p)
+{
+	__free_page(p);
+}
+
+/* This struct is forward declared in ore_io_state, but is private to here.
+ * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit.
+ *
+ * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn.
+ * Ascending page index access is sp2d(p-minor, c-major). But storage is
+ * sp2d[p-minor][c-major], so it can be properly presented to the async-xor
+ * API.
+ */
+struct __stripe_pages_2d {
+	/* Cache some hot path repeated calculations */
+	unsigned parity;
+	unsigned data_devs;
+	unsigned pages_in_unit;
+
+	bool needed;
+
+	/* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */
+	struct __1_page_stripe {
+		bool alloc;
+		unsigned write_count;
+		struct async_submit_ctl submit;
+		struct dma_async_tx_descriptor *tx;
+
+		/* The size of this array is data_devs + parity */
+		struct page **pages;
+		struct page **scribble;
+		/* bool array, size of this array is data_devs */
+		char *page_is_read;
+	} _1p_stripes[];
+};
+
+/* This can get bigger than a page. So support multiple page allocations.
+ * _sp2d_free should be called even if _sp2d_alloc fails (by returning
+ * non-zero).
+ */
+static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
+		       unsigned parity, struct __stripe_pages_2d **psp2d)
+{
+	struct __stripe_pages_2d *sp2d;
+	unsigned data_devs = group_width - parity;
+	struct _alloc_all_bytes {
+		struct __alloc_stripe_pages_2d {
+			struct __stripe_pages_2d sp2d;
+			struct __1_page_stripe _1p_stripes[pages_in_unit];
+		} __asp2d;
+		struct __alloc_1p_arrays {
+			struct page *pages[group_width];
+			struct page *scribble[group_width];
+			char page_is_read[data_devs];
+		} __a1pa[pages_in_unit];
+	} *_aab;
+	struct __alloc_1p_arrays *__a1pa;
+	struct __alloc_1p_arrays *__a1pa_end;
+	const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]);
+	unsigned num_a1pa, alloc_size, i;
+
+	/* FIXME: check these numbers in ore_verify_layout */
+	BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE);
+	BUG_ON(sizeof__a1pa > PAGE_SIZE);
+
+	if (sizeof(*_aab) > PAGE_SIZE) {
+		num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa;
+		alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa;
+	} else {
+		num_a1pa = pages_in_unit;
+		alloc_size = sizeof(*_aab);
+	}
+
+	_aab = kzalloc(alloc_size, GFP_KERNEL);
+	if (unlikely(!_aab)) {
+		ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size);
+		return -ENOMEM;
+	}
+
+	sp2d = &_aab->__asp2d.sp2d;
+	*psp2d = sp2d; /* From here Just call _sp2d_free */
+
+	__a1pa = _aab->__a1pa;
+	__a1pa_end = __a1pa + num_a1pa;
+
+	for (i = 0; i < pages_in_unit; ++i) {
+		if (unlikely(__a1pa >= __a1pa_end)) {
+			num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
+							pages_in_unit - i);
+
+			__a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL);
+			if (unlikely(!__a1pa)) {
+				ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
+					   num_a1pa);
+				return -ENOMEM;
+			}
+			__a1pa_end = __a1pa + num_a1pa;
+			/* First *pages is marked for kfree of the buffer */
+			sp2d->_1p_stripes[i].alloc = true;
+		}
+
+		sp2d->_1p_stripes[i].pages = __a1pa->pages;
+		sp2d->_1p_stripes[i].scribble = __a1pa->scribble;
+		sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read;
+		++__a1pa;
+	}
+
+	sp2d->parity = parity;
+	sp2d->data_devs = data_devs;
+	sp2d->pages_in_unit = pages_in_unit;
+	return 0;
+}
+
+static void _sp2d_reset(struct __stripe_pages_2d *sp2d,
+			const struct _ore_r4w_op *r4w, void *priv)
+{
+	unsigned data_devs = sp2d->data_devs;
+	unsigned group_width = data_devs + sp2d->parity;
+	unsigned p;
+
+	if (!sp2d->needed)
+		return;
+
+	for (p = 0; p < sp2d->pages_in_unit; p++) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if (_1ps->write_count < group_width) {
+			unsigned c;
+
+			for (c = 0; c < data_devs; c++)
+				if (_1ps->page_is_read[c]) {
+					struct page *page = _1ps->pages[c];
+
+					r4w->put_page(priv, page);
+					_1ps->page_is_read[c] = false;
+				}
+		}
+
+		memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages));
+		_1ps->write_count = 0;
+		_1ps->tx = NULL;
+	}
+
+	sp2d->needed = false;
+}
+
+static void _sp2d_free(struct __stripe_pages_2d *sp2d)
+{
+	unsigned i;
+
+	if (!sp2d)
+		return;
+
+	for (i = 0; i < sp2d->pages_in_unit; ++i) {
+		if (sp2d->_1p_stripes[i].alloc)
+			kfree(sp2d->_1p_stripes[i].pages);
+	}
+
+	kfree(sp2d);
+}
+
+static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d)
+{
+	unsigned p;
+
+	for (p = 0; p < sp2d->pages_in_unit; p++) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if (_1ps->write_count)
+			return p;
+	}
+
+	return ~0;
+}
+
+static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
+{
+	int p; /* must be signed: the countdown loop below exits at -1 */
+
+	for (p = sp2d->pages_in_unit - 1; p >= 0; --p) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if (_1ps->write_count)
+			return p;
+	}
+
+	return ~0;
+}
+
+static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
+{
+	unsigned p;
+	for (p = 0; p < sp2d->pages_in_unit; p++) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if (!_1ps->write_count)
+			continue;
+
+		init_async_submit(&_1ps->submit,
+			ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
+			NULL,
+			NULL, NULL,
+			(addr_conv_t *)_1ps->scribble);
+
+		/* TODO: raid6 */
+		_1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages,
+				     0, sp2d->data_devs, PAGE_SIZE,
+				     &_1ps->submit);
+	}
+
+	for (p = 0; p < sp2d->pages_in_unit; p++) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+		/* NOTE: We wait for HW synchronously (I don't have such HW
+		 * to test with.) Is parallelism needed with today's multi
+		 * cores?
+		 */
+		async_tx_issue_pending(_1ps->tx);
+	}
+}
+
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+		       struct ore_striping_info *si, struct page *page)
+{
+	struct __1_page_stripe *_1ps;
+
+	sp2d->needed = true;
+
+	_1ps = &sp2d->_1p_stripes[si->cur_pg];
+	_1ps->pages[si->cur_comp] = page;
+	++_1ps->write_count;
+
+	si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit;
+	/* si->cur_comp is advanced outside, in the main loop */
+}
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+		     bool not_last)
+{
+	struct osd_sg_entry *sge;
+
+	ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d "
+		     "offset=0x%llx length=0x%x last_sgs_total=0x%x\n",
+		     per_dev->dev, cur_len, not_last, per_dev->cur_sg,
+		     _LLU(per_dev->offset), per_dev->length,
+		     per_dev->last_sgs_total);
+
+	if (!per_dev->cur_sg) {
+		sge = per_dev->sglist;
+
+		/* First time we prepare two entries */
+		if (per_dev->length) {
+			++per_dev->cur_sg;
+			sge->offset = per_dev->offset;
+			sge->len = per_dev->length;
+		} else {
+			/* Here the parity is the first unit of this object.
+			 * This happens every time we reach a parity device on
+			 * the same stripe as the per_dev->offset. We need to
+			 * just skip this unit.
+			 */
+			per_dev->offset += cur_len;
+			return;
+		}
+	} else {
+		/* finalize the last one */
+		sge = &per_dev->sglist[per_dev->cur_sg - 1];
+		sge->len = per_dev->length - per_dev->last_sgs_total;
+	}
+
+	if (not_last) {
+		/* Partly prepare the next one */
+		struct osd_sg_entry *next_sge = sge + 1;
+
+		++per_dev->cur_sg;
+		next_sge->offset = sge->offset + sge->len + cur_len;
+		/* Save cur len so we know how much was added next time */
+		per_dev->last_sgs_total = per_dev->length;
+		next_sge->len = 0;
+	} else if (!sge->len) {
+		/* Optimize for when the last unit is a parity */
+		--per_dev->cur_sg;
+	}
+}
+
+static int _alloc_read_4_write(struct ore_io_state *ios)
+{
+	struct ore_layout *layout = ios->layout;
+	int ret;
+	/* We want to read only those pages not in cache, so the worst case
+	 * is a stripe populated with every other page
+	 */
+	unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2;
+
+	ret = _ore_get_io_state(layout, ios->oc,
+				layout->group_width * layout->mirrors_p1,
+				sgs_per_dev, 0, &ios->ios_read_4_write);
+	return ret;
+}
+
+/* @si contains info of the to-be-inserted page. Update of @si should be
+ * maintained by the caller. Specifically si->dev, si->obj_offset, ...
+ */
+static int _add_to_read_4_write(struct ore_io_state *ios,
+				struct ore_striping_info *si, struct page *page)
+{
+	struct request_queue *q;
+	struct ore_per_dev_state *per_dev;
+	struct ore_io_state *read_ios;
+	unsigned first_dev = si->dev - (si->dev %
+			  (ios->layout->group_width * ios->layout->mirrors_p1));
+	unsigned comp = si->dev - first_dev;
+	unsigned added_len;
+
+	if (!ios->ios_read_4_write) {
+		int ret = _alloc_read_4_write(ios);
+
+		if (unlikely(ret))
+			return ret;
+	}
+
+	read_ios = ios->ios_read_4_write;
+	read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1;
+
+	per_dev = &read_ios->per_dev[comp];
+	if (!per_dev->length) {
+		per_dev->bio = bio_kmalloc(GFP_KERNEL,
+					   ios->sp2d->pages_in_unit);
+		if (unlikely(!per_dev->bio)) {
+			ORE_DBGMSG("Failed to allocate BIO size=%u\n",
+				     ios->sp2d->pages_in_unit);
+			return -ENOMEM;
+		}
+		per_dev->offset = si->obj_offset;
+		per_dev->dev = si->dev;
+	} else if (si->obj_offset != (per_dev->offset + per_dev->length)) {
+		u64 gap = si->obj_offset - (per_dev->offset + per_dev->length);
+
+		_ore_add_sg_seg(per_dev, gap, true);
+	}
+	q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
+	added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0);
+	if (unlikely(added_len != PAGE_SIZE)) {
+		ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
+			      per_dev->bio->bi_vcnt);
+		return -ENOMEM;
+	}
+
+	per_dev->length += PAGE_SIZE;
+	return 0;
+}
+
+static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
+{
+	struct bio_vec *bv;
+	unsigned i, d;
+
+	/* loop on all devices all pages */
+	for (d = 0; d < ios->numdevs; d++) {
+		struct bio *bio = ios->per_dev[d].bio;
+
+		if (!bio)
+			continue;
+
+		__bio_for_each_segment(bv, bio, i, 0) {
+			struct page *page = bv->bv_page;
+
+			SetPageUptodate(page);
+			if (PageError(page))
+				ClearPageError(page);
+		}
+	}
+}
+
+/* read_4_write is hacked to read the start of the first stripe and/or
+ * the end of the last stripe. If needed, with an sg-gap at each device/page.
+ * It is assumed to be called after the to_be_written pages of the first stripe
+ * have populated ios->sp2d[][].
+ *
+ * NOTE: We call ios->r4w->get_page for all pages needed for the parity
+ * calculations. These pages are held at sp2d[p].pages[c] but with
+ * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are returned
+ * via ios->r4w->put_page(). The ios->r4w->get_page might signal that the page
+ * is @uptodate=true, so we don't need to read it, only unlock it after IO.
+ *
+ * TODO: The read_4_write should calc a need_to_read_pages_count; if bigger
+ * than the to-be-written count, we should consider the xor-in-place mode.
+ * need_to_read_pages_count is the actual number of pages not present in cache.
+ * Maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough
+ * approximation? In this mode the read pages are put in the empty places of
+ * ios->sp2d[p][*], and the xor is calculated the same way. These pages are
+ * allocated/freed and don't go through the cache.
+ */
+static int _read_4_write(struct ore_io_state *ios)
+{
+	struct ore_io_state *ios_read;
+	struct ore_striping_info read_si;
+	struct __stripe_pages_2d *sp2d = ios->sp2d;
+	u64 offset = ios->si.first_stripe_start;
+	u64 last_stripe_end;
+	unsigned bytes_in_stripe = ios->si.bytes_in_stripe;
+	unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1;
+	int ret;
+
+	if (offset == ios->offset) /* Go to start collect $200 */
+		goto read_last_stripe;
+
+	min_p = _sp2d_min_pg(sp2d);
+	max_p = _sp2d_max_pg(sp2d);
+
+	for (c = 0; ; c++) {
+		ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+		read_si.obj_offset += min_p * PAGE_SIZE;
+		offset += min_p * PAGE_SIZE;
+		for (p = min_p; p <= max_p; p++) {
+			struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+			struct page **pp = &_1ps->pages[c];
+			bool uptodate;
+
+			if (*pp)
+				/* to-be-written pages start here */
+				goto read_last_stripe;
+
+			*pp = ios->r4w->get_page(ios->private, offset,
+						 &uptodate);
+			if (unlikely(!*pp))
+				return -ENOMEM;
+
+			if (!uptodate)
+				_add_to_read_4_write(ios, &read_si, *pp);
+
+			/* Mark read-pages to be cache_released */
+			_1ps->page_is_read[c] = true;
+			read_si.obj_offset += PAGE_SIZE;
+			offset += PAGE_SIZE;
+		}
+		offset += (sp2d->pages_in_unit - p) * PAGE_SIZE;
+	}
+
+read_last_stripe:
+	offset = ios->offset + (ios->length + PAGE_SIZE - 1) /
+				PAGE_SIZE * PAGE_SIZE;
+	last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
+				 * bytes_in_stripe;
+	if (offset == last_stripe_end) /* Optimize for the aligned case */
+		goto read_it;
+
+	ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+	p = read_si.unit_off / PAGE_SIZE;
+	c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
+		       ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
+
+	BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end);
+	/* unaligned IO must be within a single stripe */
+
+	if (min_p == sp2d->pages_in_unit) {
+		/* Didn't do it yet */
+		min_p = _sp2d_min_pg(sp2d);
+		max_p = _sp2d_max_pg(sp2d);
+	}
+
+	while (offset < last_stripe_end) {
+		struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
+
+		if ((min_p <= p) && (p <= max_p)) {
+			struct page *page;
+			bool uptodate;
+
+			BUG_ON(_1ps->pages[c]);
+			page = ios->r4w->get_page(ios->private, offset,
+						  &uptodate);
+			if (unlikely(!page))
+				return -ENOMEM;
+
+			_1ps->pages[c] = page;
+			/* Mark read-pages to be cache_released */
+			_1ps->page_is_read[c] = true;
+			if (!uptodate)
+				_add_to_read_4_write(ios, &read_si, page);
+		}
+
+		offset += PAGE_SIZE;
+		if (p == (sp2d->pages_in_unit - 1)) {
+			++c;
+			p = 0;
+			ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
+		} else {
+			read_si.obj_offset += PAGE_SIZE;
+			++p;
+		}
+	}
+
+read_it:
+	ios_read = ios->ios_read_4_write;
+	if (!ios_read)
+		return 0;
+
+	/* FIXME: Ugly to signal _ore_read_mirror that we have bio(s). Change
+	 * to check for per_dev->bio
+	 */
+	ios_read->pages = ios->pages;
+
+	/* Now read these devices */
+	for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) {
+		ret = _ore_read_mirror(ios_read, i);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	ret = ore_io_execute(ios_read); /* Synchronous execution */
+	if (unlikely(ret)) {
+		ORE_DBGMSG("!! ore_io_execute => %d\n", ret);
+		return ret;
+	}
+
+	_mark_read4write_pages_uptodate(ios_read, ret);
+	return 0;
+}
+
+/* In writes @cur_len means length left, i.e. cur_len==0 is the last parity unit */
+int _ore_add_parity_unit(struct ore_io_state *ios,
+			    struct ore_striping_info *si,
+			    struct ore_per_dev_state *per_dev,
+			    unsigned cur_len)
+{
+	if (ios->reading) {
+		BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev);
+		_ore_add_sg_seg(per_dev, cur_len, true);
+	} else {
+		struct __stripe_pages_2d *sp2d = ios->sp2d;
+		struct page **pages = ios->parity_pages + ios->cur_par_page;
+		unsigned num_pages;
+		unsigned array_start = 0;
+		unsigned i;
+		int ret;
+
+		si->cur_pg = _sp2d_min_pg(sp2d);
+		num_pages  = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
+
+		if (!cur_len) /* If last stripe, operate on the parity comp */
+			si->cur_comp = sp2d->data_devs;
+
+		if (!per_dev->length) {
+			per_dev->offset += si->cur_pg * PAGE_SIZE;
+			/* If first stripe, read in all read4write pages
+			 * (if needed) before we calculate the first parity.
+			 */
+			_read_4_write(ios);
+		}
+
+		for (i = 0; i < num_pages; i++) {
+			pages[i] = _raid_page_alloc();
+			if (unlikely(!pages[i]))
+				return -ENOMEM;
+
+			++(ios->cur_par_page);
+		}
+
+		BUG_ON(si->cur_comp != sp2d->data_devs);
+		BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
+
+		ret = _ore_add_stripe_unit(ios,  &array_start, 0, pages,
+					   per_dev, num_pages * PAGE_SIZE);
+		if (unlikely(ret))
+			return ret;
+
+		/* TODO: raid6 if (last_parity_dev) */
+		_gen_xor_unit(sp2d);
+		_sp2d_reset(sp2d, ios->r4w, ios->private);
+	}
+	return 0;
+}
+
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
+{
+	struct ore_layout *layout = ios->layout;
+
+	if (ios->parity_pages) {
+		unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
+		unsigned stripe_size = ios->si.bytes_in_stripe;
+		u64 last_stripe, first_stripe;
+
+		if (_sp2d_alloc(pages_in_unit, layout->group_width,
+				layout->parity, &ios->sp2d)) {
+			return -ENOMEM;
+		}
+
+		BUG_ON(ios->offset % PAGE_SIZE);
+
+		/* Round io down to the last full stripe */
+		first_stripe = div_u64(ios->offset, stripe_size);
+		last_stripe = div_u64(ios->offset + ios->length, stripe_size);
+
+		/* If an IO spans more than a single stripe it must end at
+		 * a stripe boundary. The remainder at the end is pushed into the
+		 * next IO.
+		 */
+		if (last_stripe != first_stripe) {
+			ios->length = last_stripe * stripe_size - ios->offset;
+
+			BUG_ON(!ios->length);
+			ios->nr_pages = (ios->length + PAGE_SIZE - 1) /
+					PAGE_SIZE;
+			ios->si.length = ios->length; /* make it consistent */
+		}
+	}
+	return 0;
+}
+
+void _ore_free_raid_stuff(struct ore_io_state *ios)
+{
+	if (ios->sp2d) { /* writing and raid */
+		unsigned i;
+
+		for (i = 0; i < ios->cur_par_page; i++) {
+			struct page *page = ios->parity_pages[i];
+
+			if (page)
+				_raid_page_free(page);
+		}
+		if (ios->extra_part_alloc)
+			kfree(ios->parity_pages);
+		/* If IO returned an error pages might need unlocking */
+		_sp2d_reset(ios->sp2d, ios->r4w, ios->private);
+		_sp2d_free(ios->sp2d);
+	} else {
+		/* Will only be set if raid reading && sglist is big */
+		if (ios->extra_part_alloc)
+			kfree(ios->per_dev[0].sglist);
+	}
+	if (ios->ios_read_4_write)
+		ore_put_io_state(ios->ios_read_4_write);
+}
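
The parity generation in _gen_xor_unit() above reduces, for each page offset of the stripe unit, to XOR-ing the data_devs data pages into the one extra parity page (ASYNC_TX_XOR_ZERO_DST asks the engine to zero the destination first). A hedged, stand-alone C sketch of that per-page computation, assuming plain buffers instead of struct page and a synchronous loop instead of the async_tx API (all names here are illustrative, not from this patch):

#include <stdint.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096	/* illustrative stand-in for PAGE_SIZE */

/* pages[0..data_devs-1] hold data, pages[data_devs] receives the parity */
static void sketch_gen_xor_unit(uint8_t **pages, unsigned data_devs)
{
	uint8_t *parity = pages[data_devs];
	unsigned c, i;

	memset(parity, 0, SKETCH_PAGE_SIZE);	/* like ASYNC_TX_XOR_ZERO_DST */
	for (c = 0; c < data_devs; c++)
		for (i = 0; i < SKETCH_PAGE_SIZE; i++)
			parity[i] ^= pages[c][i];
}
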
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
new file mode 100644
index 0000000..2ffd2c3
--- /dev/null
+++ b/fs/exofs/ore_raid.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) from 2011
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of the objects raid engine (ore).
+ *
+ * It is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "ore". If not, write to the Free Software Foundation, Inc:
+ *	"Free Software Foundation <info@fsf.org>"
+ */
+
+#include <scsi/osd_ore.h>
+
+#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
+
+#ifdef CONFIG_EXOFS_DEBUG
+#define ORE_DBGMSG(fmt, a...) \
+	printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define ORE_DBGMSG(fmt, a...) \
+	do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+/* u64 has problems with printk; this will cast it to unsigned long long */
+#define _LLU(x) (unsigned long long)(x)
+
+#define ORE_DBGMSG2(M...) do {} while (0)
+/* #define ORE_DBGMSG2 ORE_DBGMSG */
+
+/* Calculate the component order in a stripe, i.e. the logical data unit
+ * address within the stripe of @dev given the @par_dev of this stripe.
+ */
+static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
+				  unsigned par_dev, unsigned dev)
+{
+	unsigned first_dev = dev - dev % devs_in_group;
+
+	dev -= first_dev;
+	par_dev -= first_dev;
+
+	if (devs_in_group == par_dev) /* The raid 0 case */
+		return dev / mirrors_p1;
+	/* raid4/5/6 case */
+	return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
+	       mirrors_p1;
+}
+
+/* ore_raid.c stuff needed by ore.c */
+int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
+void _ore_free_raid_stuff(struct ore_io_state *ios);
+
+void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
+		 bool not_last);
+int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
+		     struct ore_per_dev_state *per_dev, unsigned cur_len);
+void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
+		       struct ore_striping_info *si, struct page *page);
+static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
+				struct ore_striping_info *si, struct page *page)
+{
+	if (!sp2d) /* Inline the fast path */
+		return; /* Hay no raid stuff */
+	_ore_add_stripe_page(sp2d, si, page);
+}
+
+/* ore.c stuff needed by ore_raid.c */
+int  _ore_get_io_state(struct ore_layout *layout,
+			struct ore_components *oc, unsigned numdevs,
+			unsigned sgs_per_dev, unsigned num_par_pages,
+			struct ore_io_state **pios);
+int _ore_add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
+		unsigned pgbase, struct page **pages,
+		struct ore_per_dev_state *per_dev, int cur_len);
+int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp);
+int ore_io_execute(struct ore_io_state *ios);
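
To make the _dev_order() arithmetic above concrete, here is a hedged stand-alone sketch that copies the same formula into user space and prints the logical data-unit order of every device in one small group; the group geometry (4 devices, no mirrors, parity on device 3) is made up purely for illustration:

#include <stdio.h>

static unsigned dev_order(unsigned devs_in_group, unsigned mirrors_p1,
			  unsigned par_dev, unsigned dev)
{
	unsigned first_dev = dev - dev % devs_in_group;

	dev -= first_dev;
	par_dev -= first_dev;

	if (devs_in_group == par_dev)	/* the raid0 case */
		return dev / mirrors_p1;
	/* raid4/5/6 case */
	return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
	       mirrors_p1;
}

int main(void)
{
	unsigned devs_in_group = 4, mirrors_p1 = 1, par_dev = 3, dev;

	for (dev = 0; dev < devs_in_group; dev++)
		printf("dev=%u order=%u\n", dev,
		       dev_order(devs_in_group, mirrors_p1, par_dev, dev));
	return 0;
}
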
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 2748940..057b237 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -266,7 +266,7 @@
 	struct ore_io_state *ios;
 	int ret;
 
-	ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
 		return ret;
@@ -321,7 +321,7 @@
 	struct ore_io_state *ios;
 	int ret;
 
-	ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
 		return ret;
@@ -355,12 +355,12 @@
 /*
  * Write the superblock to the OSD
  */
-int exofs_sync_fs(struct super_block *sb, int wait)
+static int exofs_sync_fs(struct super_block *sb, int wait)
 {
 	struct exofs_sb_info *sbi;
 	struct exofs_fscb *fscb;
 	struct ore_comp one_comp;
-	struct ore_components comps;
+	struct ore_components oc;
 	struct ore_io_state *ios;
 	int ret = -ENOMEM;
 
@@ -378,9 +378,9 @@
 	 * the writeable info is set in exofs_sbi_write_stats() above.
 	 */
 
-	exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID);
+	exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID);
 
-	ret = ore_get_io_state(&sbi->layout, &comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &oc, &ios);
 	if (unlikely(ret))
 		goto out;
 
@@ -429,19 +429,20 @@
 		msg, dev_path ?: "", odi->osdname, _LLU(pid));
 }
 
-void exofs_free_sbi(struct exofs_sb_info *sbi)
+static void exofs_free_sbi(struct exofs_sb_info *sbi)
 {
-	while (sbi->comps.numdevs) {
-		int i = --sbi->comps.numdevs;
-		struct osd_dev *od = sbi->comps.ods[i];
+	unsigned numdevs = sbi->oc.numdevs;
+
+	while (numdevs) {
+		unsigned i = --numdevs;
+		struct osd_dev *od = ore_comp_dev(&sbi->oc, i);
 
 		if (od) {
-			sbi->comps.ods[i] = NULL;
+			ore_comp_set_dev(&sbi->oc, i, NULL);
 			osduld_put_device(od);
 		}
 	}
-	if (sbi->comps.ods != sbi->_min_one_dev)
-		kfree(sbi->comps.ods);
+	kfree(sbi->oc.ods);
 	kfree(sbi);
 }
 
@@ -468,7 +469,7 @@
 				  msecs_to_jiffies(100));
 	}
 
-	_exofs_print_device("Unmounting", NULL, sbi->comps.ods[0],
+	_exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0),
 			    sbi->one_comp.obj.partition);
 
 	bdi_destroy(&sbi->bdi);
@@ -479,76 +480,20 @@
 static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 				    struct exofs_device_table *dt)
 {
-	u64 stripe_length;
+	int ret;
 
-	sbi->data_map.odm_num_comps   =
-				le32_to_cpu(dt->dt_data_map.cb_num_comps);
-	sbi->data_map.odm_stripe_unit =
+	sbi->layout.stripe_unit =
 				le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
-	sbi->data_map.odm_group_width =
+	sbi->layout.group_width =
 				le32_to_cpu(dt->dt_data_map.cb_group_width);
-	sbi->data_map.odm_group_depth =
+	sbi->layout.group_depth =
 				le32_to_cpu(dt->dt_data_map.cb_group_depth);
-	sbi->data_map.odm_mirror_cnt  =
-				le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
-	sbi->data_map.odm_raid_algorithm  =
+	sbi->layout.mirrors_p1  =
+				le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1;
+	sbi->layout.raid_algorithm  =
 				le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
 
-/* FIXME: Only raid0 for now. if not so, do not mount */
-	if (sbi->data_map.odm_num_comps != numdevs) {
-		EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
-			  sbi->data_map.odm_num_comps, numdevs);
-		return -EINVAL;
-	}
-	if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
-		EXOFS_ERR("Only RAID_0 for now\n");
-		return -EINVAL;
-	}
-	if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
-		EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
-			  numdevs, sbi->data_map.odm_mirror_cnt);
-		return -EINVAL;
-	}
-
-	if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
-		EXOFS_ERR("Stripe Unit(0x%llx)"
-			  " must be Multples of PAGE_SIZE(0x%lx)\n",
-			  _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
-		return -EINVAL;
-	}
-
-	sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
-	sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
-
-	if (sbi->data_map.odm_group_width) {
-		sbi->layout.group_width = sbi->data_map.odm_group_width;
-		sbi->layout.group_depth = sbi->data_map.odm_group_depth;
-		if (!sbi->layout.group_depth) {
-			EXOFS_ERR("group_depth == 0 && group_width != 0\n");
-			return -EINVAL;
-		}
-		sbi->layout.group_count = sbi->data_map.odm_num_comps /
-						sbi->layout.mirrors_p1 /
-						sbi->data_map.odm_group_width;
-	} else {
-		if (sbi->data_map.odm_group_depth) {
-			printk(KERN_NOTICE "Warning: group_depth ignored "
-				"group_width == 0 && group_depth == %d\n",
-				sbi->data_map.odm_group_depth);
-			sbi->data_map.odm_group_depth = 0;
-		}
-		sbi->layout.group_width = sbi->data_map.odm_num_comps /
-							sbi->layout.mirrors_p1;
-		sbi->layout.group_depth = -1;
-		sbi->layout.group_count = 1;
-	}
-
-	stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
-	if (stripe_length >= (1ULL << 32)) {
-		EXOFS_ERR("Total Stripe length(0x%llx)"
-			  " >= 32bit is not supported\n", _LLU(stripe_length));
-		return -EINVAL;
-	}
+	ret = ore_verify_layout(numdevs, &sbi->layout);
 
 	EXOFS_DBGMSG("exofs: layout: "
 		"num_comps=%u stripe_unit=0x%x group_width=%u "
@@ -558,8 +503,8 @@
 		sbi->layout.group_width,
 		_LLU(sbi->layout.group_depth),
 		sbi->layout.mirrors_p1,
-		sbi->data_map.odm_raid_algorithm);
-	return 0;
+		sbi->layout.raid_algorithm);
+	return ret;
 }
 
 static unsigned __ra_pages(struct ore_layout *layout)
@@ -605,12 +550,40 @@
 	return !(odi->systemid_len || odi->osdname_len);
 }
 
+int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
+		      struct exofs_dev **peds)
+{
+	struct __alloc_ore_devs_and_exofs_devs {
+		/* Twice bigger table: See exofs_init_comps() and comment at
+		 * exofs_read_lookup_dev_table()
+		 */
+		struct ore_dev *oreds[numdevs * 2 - 1];
+		struct exofs_dev eds[numdevs];
+	} *aoded;
+	struct exofs_dev *eds;
+	unsigned i;
+
+	aoded = kzalloc(sizeof(*aoded), GFP_KERNEL);
+	if (unlikely(!aoded)) {
+		EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
+			  numdevs);
+		return -ENOMEM;
+	}
+
+	sbi->oc.ods = aoded->oreds;
+	*peds = eds = aoded->eds;
+	for (i = 0; i < numdevs; ++i)
+		aoded->oreds[i] = &eds[i].ored;
+	return 0;
+}
+
 static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
 				       struct osd_dev *fscb_od,
 				       unsigned table_count)
 {
 	struct ore_comp comp;
 	struct exofs_device_table *dt;
+	struct exofs_dev *eds;
 	unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
 					     sizeof(*dt);
 	unsigned numdevs, i;
@@ -623,7 +596,7 @@
 		return -ENOMEM;
 	}
 
-	sbi->comps.numdevs = 0;
+	sbi->oc.numdevs = 0;
 
 	comp.obj.partition = sbi->one_comp.obj.partition;
 	comp.obj.id = EXOFS_DEVTABLE_ID;
@@ -647,20 +620,16 @@
 	if (unlikely(ret))
 		goto out;
 
-	if (likely(numdevs > 1)) {
-		unsigned size = numdevs * sizeof(sbi->comps.ods[0]);
-
-		/* Twice bigger table: See exofs_init_comps() and below
-		 * comment
-		 */
-		sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL);
-		if (unlikely(!sbi->comps.ods)) {
-			EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
-				  numdevs);
-			ret = -ENOMEM;
-			goto out;
-		}
-	}
+	ret = __alloc_dev_table(sbi, numdevs, &eds);
+	if (unlikely(ret))
+		goto out;
+	/* exofs round-robins the device table view according to inode
+	 * number. We hold a twice-bigger table, hence inodes can point
+	 * to any device and have a sequential view of the table
+	 * starting at this device. See exofs_init_comps()
+	 */
+	memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0],
+		(numdevs - 1) * sizeof(sbi->oc.ods[0]));
 
 	for (i = 0; i < numdevs; i++) {
 		struct exofs_fscb fscb;
@@ -676,13 +645,16 @@
 		printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
 		       i, odi.osdname);
 
+		/* the exofs id is currently the table index */
+		eds[i].did = i;
+
 		/* On all devices the device table is identical. The user can
 		 * specify any one of the participating devices on the command
 		 * line. We always keep them in device-table order.
 		 */
 		if (fscb_od && osduld_device_same(fscb_od, &odi)) {
-			sbi->comps.ods[i] = fscb_od;
-			++sbi->comps.numdevs;
+			eds[i].ored.od = fscb_od;
+			++sbi->oc.numdevs;
 			fscb_od = NULL;
 			continue;
 		}
@@ -695,8 +667,8 @@
 			goto out;
 		}
 
-		sbi->comps.ods[i] = od;
-		++sbi->comps.numdevs;
+		eds[i].ored.od = od;
+		++sbi->oc.numdevs;
 
 		/* Read the fscb of the other devices to make sure the FS
 		 * partition is there.
@@ -718,21 +690,10 @@
 
 out:
 	kfree(dt);
-	if (likely(!ret)) {
-		unsigned numdevs = sbi->comps.numdevs;
-
-		if (unlikely(fscb_od)) {
+	if (unlikely(fscb_od && !ret)) {
 			EXOFS_ERR("ERROR: Bad device-table container device not present\n");
 			osduld_put_device(fscb_od);
 			return -EINVAL;
-		}
-		/* exofs round-robins the device table view according to inode
-		 * number. We hold a: twice bigger table hence inodes can point
-		 * to any device and have a sequential view of the table
-		 * starting at this device. See exofs_init_comps()
-		 */
-		for (i = 0; i < numdevs - 1; ++i)
-			sbi->comps.ods[i + numdevs] = sbi->comps.ods[i];
 	}
 	return ret;
 }
@@ -783,10 +744,9 @@
 	sbi->one_comp.obj.partition = opts->pid;
 	sbi->one_comp.obj.id = 0;
 	exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
-	sbi->comps.numdevs = 1;
-	sbi->comps.single_comp = EC_SINGLE_COMP;
-	sbi->comps.comps = &sbi->one_comp;
-	sbi->comps.ods = sbi->_min_one_dev;
+	sbi->oc.numdevs = 1;
+	sbi->oc.single_comp = EC_SINGLE_COMP;
+	sbi->oc.comps = &sbi->one_comp;
 
 	/* fill in some other data by hand */
 	memset(sb->s_id, 0, sizeof(sb->s_id));
@@ -835,7 +795,13 @@
 		if (unlikely(ret))
 			goto free_sbi;
 	} else {
-		sbi->comps.ods[0] = od;
+		struct exofs_dev *eds;
+
+		ret = __alloc_dev_table(sbi, 1, &eds);
+		if (unlikely(ret))
+			goto free_sbi;
+
+		ore_comp_set_dev(&sbi->oc, 0, od);
 	}
 
 	__sbi_read_stats(sbi);
@@ -875,7 +841,8 @@
 		goto free_sbi;
 	}
 
-	_exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0],
+	_exofs_print_device("Mounting", opts->dev_name,
+			    ore_comp_dev(&sbi->oc, 0),
 			    sbi->one_comp.obj.partition);
 	return 0;
 
@@ -924,7 +891,7 @@
 	uint64_t used = ULLONG_MAX;
 	int ret;
 
-	ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
+	ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios);
 	if (ret) {
 		EXOFS_DBGMSG("ore_get_io_state failed.\n");
 		return ret;
@@ -981,7 +948,7 @@
  * EXPORT OPERATIONS
  *****************************************************************************/
 
-struct dentry *exofs_get_parent(struct dentry *child)
+static struct dentry *exofs_get_parent(struct dentry *child)
 {
 	unsigned long ino = exofs_parent_ino(child);
 
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 9758b65..42b274d 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -10,6 +10,7 @@
 #include <linux/time.h>
 #include <linux/irqnr.h>
 #include <asm/cputime.h>
+#include <linux/tick.h>
 
 #ifndef arch_irq_stat_cpu
 #define arch_irq_stat_cpu(cpu) 0
@@ -21,6 +22,35 @@
 #define arch_idle_time(cpu) 0
 #endif
 
+static cputime64_t get_idle_time(int cpu)
+{
+	u64 idle_time = get_cpu_idle_time_us(cpu, NULL);
+	cputime64_t idle;
+
+	if (idle_time == -1ULL) {
+		/* !NO_HZ so we can rely on cpustat.idle */
+		idle = kstat_cpu(cpu).cpustat.idle;
+		idle = cputime64_add(idle, arch_idle_time(cpu));
+	} else
+		idle = usecs_to_cputime(idle_time);
+
+	return idle;
+}
+
+static cputime64_t get_iowait_time(int cpu)
+{
+	u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL);
+	cputime64_t iowait;
+
+	if (iowait_time == -1ULL)
+		/* !NO_HZ so we can rely on cpustat.iowait */
+		iowait = kstat_cpu(cpu).cpustat.iowait;
+	else
+		iowait = usecs_to_cputime(iowait_time);
+
+	return iowait;
+}
+
 static int show_stat(struct seq_file *p, void *v)
 {
 	int i, j;
@@ -42,9 +72,8 @@
 		user = cputime64_add(user, kstat_cpu(i).cpustat.user);
 		nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
 		system = cputime64_add(system, kstat_cpu(i).cpustat.system);
-		idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
-		idle = cputime64_add(idle, arch_idle_time(i));
-		iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
+		idle = cputime64_add(idle, get_idle_time(i));
+		iowait = cputime64_add(iowait, get_iowait_time(i));
 		irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
@@ -76,14 +105,12 @@
 		(unsigned long long)cputime64_to_clock_t(guest),
 		(unsigned long long)cputime64_to_clock_t(guest_nice));
 	for_each_online_cpu(i) {
-
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
 		user = kstat_cpu(i).cpustat.user;
 		nice = kstat_cpu(i).cpustat.nice;
 		system = kstat_cpu(i).cpustat.system;
-		idle = kstat_cpu(i).cpustat.idle;
-		idle = cputime64_add(idle, arch_idle_time(i));
-		iowait = kstat_cpu(i).cpustat.iowait;
+		idle = get_idle_time(i);
+		iowait = get_iowait_time(i);
 		irq = kstat_cpu(i).cpustat.irq;
 		softirq = kstat_cpu(i).cpustat.softirq;
 		steal = kstat_cpu(i).cpustat.steal;
diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
index 61e03dd..62ce682 100644
--- a/include/asm-generic/cputime.h
+++ b/include/asm-generic/cputime.h
@@ -38,8 +38,8 @@
 /*
  * Convert cputime to microseconds and back.
  */
-#define cputime_to_usecs(__ct)		jiffies_to_usecs(__ct);
-#define usecs_to_cputime(__msecs)	usecs_to_jiffies(__msecs);
+#define cputime_to_usecs(__ct)		jiffies_to_usecs(__ct)
+#define usecs_to_cputime(__msecs)	usecs_to_jiffies(__msecs)
 
 /*
  * Convert cputime to seconds and back.
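
The cputime.h change above is not cosmetic: a trailing semicolon inside a function-like macro expands to an extra empty statement at the call site, which breaks otherwise-legal code. A hedged stand-alone illustration (the macro and variable names below are made up):

#include <stdio.h>

#define BAD_TO_USECS(x)		((x) * 1000);	/* trailing ';', as before the fix */
#define GOOD_TO_USECS(x)	((x) * 1000)	/* as after the fix */

int main(void)
{
	long a = 0;

	/* With BAD_TO_USECS this would not compile: the expansion becomes
	 *   if (1) a = ((2) * 1000);; else a = 0;
	 * and the stray ';' detaches the 'else' from the 'if'.
	 */
	if (1)
		a = GOOD_TO_USECS(2);
	else
		a = 0;

	printf("%ld\n", a);
	return 0;
}
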
diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h
index c5d6095..975009e 100644
--- a/include/linux/alarmtimer.h
+++ b/include/linux/alarmtimer.h
@@ -13,6 +13,16 @@
 	ALARM_NUMTYPE,
 };
 
+enum alarmtimer_restart {
+	ALARMTIMER_NORESTART,
+	ALARMTIMER_RESTART,
+};
+
+
+#define ALARMTIMER_STATE_INACTIVE	0x00
+#define ALARMTIMER_STATE_ENQUEUED	0x01
+#define ALARMTIMER_STATE_CALLBACK	0x02
+
 /**
  * struct alarm - Alarm timer structure
  * @node:	timerqueue node for adding to the event list this value
@@ -25,16 +35,45 @@
  */
 struct alarm {
 	struct timerqueue_node	node;
-	ktime_t			period;
-	void			(*function)(struct alarm *);
+	enum alarmtimer_restart	(*function)(struct alarm *, ktime_t now);
 	enum alarmtimer_type	type;
-	bool			enabled;
+	int			state;
 	void			*data;
 };
 
 void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
-		void (*function)(struct alarm *));
-void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period);
-void alarm_cancel(struct alarm *alarm);
+		enum alarmtimer_restart (*function)(struct alarm *, ktime_t));
+void alarm_start(struct alarm *alarm, ktime_t start);
+int alarm_try_to_cancel(struct alarm *alarm);
+int alarm_cancel(struct alarm *alarm);
+
+u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval);
+
+/*
+ * An alarmtimer is active when it is enqueued into the timerqueue or the
+ * callback function is running.
+ */
+static inline int alarmtimer_active(const struct alarm *timer)
+{
+	return timer->state != ALARMTIMER_STATE_INACTIVE;
+}
+
+/*
+ * Helper function to check whether the timer is on one of the queues
+ */
+static inline int alarmtimer_is_queued(struct alarm *timer)
+{
+	return timer->state & ALARMTIMER_STATE_ENQUEUED;
+}
+
+/*
+ * Helper function to check whether the timer is running the callback
+ * function
+ */
+static inline int alarmtimer_callback_running(struct alarm *timer)
+{
+	return timer->state & ALARMTIMER_STATE_CALLBACK;
+}
+
 
 #endif
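
With the signature change above, an alarm callback now receives the current time and tells the core whether to re-arm. A hedged sketch of a periodic user, based only on the declarations in this header; the one-second interval, the my_-prefixed names, and the assumption that ALARM_REALTIME is one of the alarmtimer_type values are illustrative, and error handling is omitted:

#include <linux/alarmtimer.h>
#include <linux/ktime.h>

static ktime_t my_interval;	/* illustrative period, set in my_setup() */

static enum alarmtimer_restart my_alarm_fn(struct alarm *a, ktime_t now)
{
	/* Push the expiry forward by whole intervals past 'now'... */
	alarm_forward(a, now, my_interval);
	/* ...and ask the core to re-queue us; NORESTART would stop here. */
	return ALARMTIMER_RESTART;
}

static void my_setup(void)
{
	static struct alarm my_alarm;

	my_interval = ktime_set(1, 0);		/* 1 second, illustrative */
	alarm_init(&my_alarm, ALARM_REALTIME, my_alarm_fn);
	alarm_start(&my_alarm, ktime_add(ktime_get_real(), my_interval));
}
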
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index d6733e2..81e803e 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -45,20 +45,22 @@
  */
 #define CLOCK_EVT_FEAT_PERIODIC		0x000001
 #define CLOCK_EVT_FEAT_ONESHOT		0x000002
+#define CLOCK_EVT_FEAT_KTIME		0x000004
 /*
  * x86(64) specific misfeatures:
  *
  * - Clockevent source stops in C3 State and needs broadcast support.
  * - Local APIC timer is used as a dummy device.
  */
-#define CLOCK_EVT_FEAT_C3STOP		0x000004
-#define CLOCK_EVT_FEAT_DUMMY		0x000008
+#define CLOCK_EVT_FEAT_C3STOP		0x000008
+#define CLOCK_EVT_FEAT_DUMMY		0x000010
 
 /**
  * struct clock_event_device - clock event device descriptor
  * @event_handler:	Assigned by the framework to be called by the low
  *			level handler of the event source
- * @set_next_event:	set next event function
+ * @set_next_event:	set next event function using a clocksource delta
+ * @set_next_ktime:	set next event function using a direct ktime value
  * @next_event:		local storage for the next event in oneshot mode
  * @max_delta_ns:	maximum delta value in ns
  * @min_delta_ns:	minimum delta value in ns
@@ -81,6 +83,8 @@
 	void			(*event_handler)(struct clock_event_device *);
 	int			(*set_next_event)(unsigned long evt,
 						  struct clock_event_device *);
+	int			(*set_next_ktime)(ktime_t expires,
+						  struct clock_event_device *);
 	ktime_t			next_event;
 	u64			max_delta_ns;
 	u64			min_delta_ns;
@@ -140,7 +144,7 @@
 				 enum clock_event_mode mode);
 extern int clockevents_register_notifier(struct notifier_block *nb);
 extern int clockevents_program_event(struct clock_event_device *dev,
-				     ktime_t expires, ktime_t now);
+				     ktime_t expires, bool force);
 
 extern void clockevents_handle_noop(struct clock_event_device *dev);
 
diff --git a/include/linux/dw_apb_timer.h b/include/linux/dw_apb_timer.h
index 49638ea..07261d5 100644
--- a/include/linux/dw_apb_timer.h
+++ b/include/linux/dw_apb_timer.h
@@ -46,7 +46,7 @@
 dw_apb_clockevent_init(int cpu, const char *name, unsigned rating,
 		       void __iomem *base, int irq, unsigned long freq);
 struct dw_apb_clocksource *
-dw_apb_clocksource_init(unsigned rating, char *name, void __iomem *base,
+dw_apb_clocksource_init(unsigned rating, const char *name, void __iomem *base,
 			unsigned long freq);
 void dw_apb_clocksource_register(struct dw_apb_clocksource *dw_cs);
 void dw_apb_clocksource_start(struct dw_apb_clocksource *dw_cs);
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a103732..a64b00e 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -59,6 +59,8 @@
  * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend
  * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set
  * IRQF_NO_THREAD - Interrupt cannot be threaded
+ * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device
+ *                resume time.
  */
 #define IRQF_DISABLED		0x00000020
 #define IRQF_SAMPLE_RANDOM	0x00000040
@@ -72,6 +74,7 @@
 #define IRQF_NO_SUSPEND		0x00004000
 #define IRQF_FORCE_RESUME	0x00008000
 #define IRQF_NO_THREAD		0x00010000
+#define IRQF_EARLY_RESUME	0x00020000
 
 #define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
 
@@ -95,6 +98,7 @@
  * @flags:	flags (see IRQF_* above)
  * @name:	name of the device
  * @dev_id:	cookie to identify the device
+ * @percpu_dev_id:	per-cpu cookie to identify the device
  * @next:	pointer to the next irqaction for shared interrupts
  * @irq:	interrupt number
  * @dir:	pointer to the proc/irq/NN/name entry
@@ -104,17 +108,18 @@
  * @thread_mask:	bitmask for keeping track of @thread activity
  */
 struct irqaction {
-	irq_handler_t handler;
-	unsigned long flags;
-	void *dev_id;
-	struct irqaction *next;
-	int irq;
-	irq_handler_t thread_fn;
-	struct task_struct *thread;
-	unsigned long thread_flags;
-	unsigned long thread_mask;
-	const char *name;
-	struct proc_dir_entry *dir;
+	irq_handler_t		handler;
+	unsigned long		flags;
+	void			*dev_id;
+	void __percpu		*percpu_dev_id;
+	struct irqaction	*next;
+	int			irq;
+	irq_handler_t		thread_fn;
+	struct task_struct	*thread;
+	unsigned long		thread_flags;
+	unsigned long		thread_mask;
+	const char		*name;
+	struct proc_dir_entry	*dir;
 } ____cacheline_internodealigned_in_smp;
 
 extern irqreturn_t no_action(int cpl, void *dev_id);
@@ -136,6 +141,10 @@
 request_any_context_irq(unsigned int irq, irq_handler_t handler,
 			unsigned long flags, const char *name, void *dev_id);
 
+extern int __must_check
+request_percpu_irq(unsigned int irq, irq_handler_t handler,
+		   const char *devname, void __percpu *percpu_dev_id);
+
 extern void exit_irq_thread(void);
 #else
 
@@ -164,10 +173,18 @@
 	return request_irq(irq, handler, flags, name, dev_id);
 }
 
+static inline int __must_check
+request_percpu_irq(unsigned int irq, irq_handler_t handler,
+		   const char *devname, void __percpu *percpu_dev_id)
+{
+	return request_irq(irq, handler, 0, devname, percpu_dev_id);
+}
+
 static inline void exit_irq_thread(void) { }
 #endif
 
 extern void free_irq(unsigned int, void *);
+extern void free_percpu_irq(unsigned int, void __percpu *);
 
 struct device;
 
@@ -207,7 +224,9 @@
 
 extern void disable_irq_nosync(unsigned int irq);
 extern void disable_irq(unsigned int irq);
+extern void disable_percpu_irq(unsigned int irq);
 extern void enable_irq(unsigned int irq);
+extern void enable_percpu_irq(unsigned int irq, unsigned int type);
 
 /* The following three functions are for the core kernel use only. */
 #ifdef CONFIG_GENERIC_HARDIRQS
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 5951730..59e49c8 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -66,6 +66,7 @@
  * IRQ_NO_BALANCING		- Interrupt cannot be balanced (affinity set)
  * IRQ_MOVE_PCNTXT		- Interrupt can be migrated from process context
  * IRQ_NESTED_TRHEAD		- Interrupt nests into another thread
+ * IRQ_PER_CPU_DEVID		- Dev_id is a per-cpu variable
  */
 enum {
 	IRQ_TYPE_NONE		= 0x00000000,
@@ -88,12 +89,13 @@
 	IRQ_MOVE_PCNTXT		= (1 << 14),
 	IRQ_NESTED_THREAD	= (1 << 15),
 	IRQ_NOTHREAD		= (1 << 16),
+	IRQ_PER_CPU_DEVID	= (1 << 17),
 };
 
 #define IRQF_MODIFY_MASK	\
 	(IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
 	 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
-	 IRQ_PER_CPU | IRQ_NESTED_THREAD)
+	 IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID)
 
 #define IRQ_NO_BALANCING_MASK	(IRQ_PER_CPU | IRQ_NO_BALANCING)
 
@@ -336,12 +338,14 @@
  * IRQCHIP_MASK_ON_SUSPEND:	Mask non wake irqs in the suspend path
  * IRQCHIP_ONOFFLINE_ENABLED:	Only call irq_on/off_line callbacks
  *				when irq enabled
+ * IRQCHIP_SKIP_SET_WAKE:	Skip chip.irq_set_wake() for this irq chip
  */
 enum {
 	IRQCHIP_SET_TYPE_MASKED		= (1 <<  0),
 	IRQCHIP_EOI_IF_HANDLED		= (1 <<  1),
 	IRQCHIP_MASK_ON_SUSPEND		= (1 <<  2),
 	IRQCHIP_ONOFFLINE_ENABLED	= (1 <<  3),
+	IRQCHIP_SKIP_SET_WAKE		= (1 <<  4),
 };
 
 /* This include will go away once we isolated irq_desc usage to core code */
@@ -365,6 +369,8 @@
 struct irqaction;
 extern int setup_irq(unsigned int irq, struct irqaction *new);
 extern void remove_irq(unsigned int irq, struct irqaction *act);
+extern int setup_percpu_irq(unsigned int irq, struct irqaction *new);
+extern void remove_percpu_irq(unsigned int irq, struct irqaction *act);
 
 extern void irq_cpu_online(void);
 extern void irq_cpu_offline(void);
@@ -392,6 +398,7 @@
 extern void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc);
 extern void handle_simple_irq(unsigned int irq, struct irq_desc *desc);
 extern void handle_percpu_irq(unsigned int irq, struct irq_desc *desc);
+extern void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc);
 extern void handle_bad_irq(unsigned int irq, struct irq_desc *desc);
 extern void handle_nested_irq(unsigned int irq);
 
@@ -420,6 +427,8 @@
 	irq_set_chip_and_handler_name(irq, chip, handle, NULL);
 }
 
+extern int irq_set_percpu_devid(unsigned int irq);
+
 extern void
 __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		  const char *name);
@@ -481,6 +490,13 @@
 		irq_clear_status_flags(irq, IRQ_NESTED_THREAD);
 }
 
+static inline void irq_set_percpu_devid_flags(unsigned int irq)
+{
+	irq_set_status_flags(irq,
+			     IRQ_NOAUTOEN | IRQ_PER_CPU | IRQ_NOTHREAD |
+			     IRQ_NOPROBE | IRQ_PER_CPU_DEVID);
+}
+
 /* Handle dynamic irq creation and destruction */
 extern unsigned int create_irq_nr(unsigned int irq_want, int node);
 extern int create_irq(void);
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 4fa09d4..6a9e8f5 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -1,20 +1,23 @@
 #ifndef _LINUX_IRQ_WORK_H
 #define _LINUX_IRQ_WORK_H
 
+#include <linux/llist.h>
+
 struct irq_work {
-	struct irq_work *next;
+	unsigned long flags;
+	struct llist_node llnode;
 	void (*func)(struct irq_work *);
 };
 
 static inline
-void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *))
+void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
 {
-	entry->next = NULL;
-	entry->func = func;
+	work->flags = 0;
+	work->func = func;
 }
 
-bool irq_work_queue(struct irq_work *entry);
+bool irq_work_queue(struct irq_work *work);
 void irq_work_run(void);
-void irq_work_sync(struct irq_work *entry);
+void irq_work_sync(struct irq_work *work);
 
 #endif /* _LINUX_IRQ_WORK_H */
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 150134a..6b69c2c 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -53,6 +53,7 @@
 	unsigned long		last_unhandled;	/* Aging timer for unhandled count */
 	unsigned int		irqs_unhandled;
 	raw_spinlock_t		lock;
+	struct cpumask		*percpu_enabled;
 #ifdef CONFIG_SMP
 	const struct cpumask	*affinity_hint;
 	struct irq_affinity_notify *affinity_notify;
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index f97672a..265e2c3 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -303,7 +303,7 @@
 extern unsigned long timeval_to_jiffies(const struct timeval *value);
 extern void jiffies_to_timeval(const unsigned long jiffies,
 			       struct timeval *value);
-extern clock_t jiffies_to_clock_t(long x);
+extern clock_t jiffies_to_clock_t(unsigned long x);
 extern unsigned long clock_t_to_jiffies(unsigned long x);
 extern u64 jiffies_64_to_clock_t(u64 x);
 extern u64 nsec_to_clock_t(u64 x);
diff --git a/include/linux/llist.h b/include/linux/llist.h
index aa0c8b5..7287734 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -35,10 +35,30 @@
  *
  * The basic atomic operation of this list is cmpxchg on long.  On
  * architectures that don't have NMI-safe cmpxchg implementation, the
- * list can NOT be used in NMI handler.  So code uses the list in NMI
- * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ * list can NOT be used in NMI handlers.  So code that uses the list in
+ * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ *
+ * Copyright 2010,2011 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/kernel.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+
 struct llist_head {
 	struct llist_node *first;
 };
@@ -113,14 +133,55 @@
  * test whether the list is empty without deleting something from the
  * list.
  */
-static inline int llist_empty(const struct llist_head *head)
+static inline bool llist_empty(const struct llist_head *head)
 {
 	return ACCESS_ONCE(head->first) == NULL;
 }
 
-void llist_add(struct llist_node *new, struct llist_head *head);
-void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
-		     struct llist_head *head);
-struct llist_node *llist_del_first(struct llist_head *head);
-struct llist_node *llist_del_all(struct llist_head *head);
+static inline struct llist_node *llist_next(struct llist_node *node)
+{
+	return node->next;
+}
+
+/**
+ * llist_add - add a new entry
+ * @new:	new entry to be added
+ * @head:	the head for your lock-less list
+ *
+ * Returns whether the list was empty before adding.
+ */
+static inline bool llist_add(struct llist_node *new, struct llist_head *head)
+{
+	struct llist_node *entry, *old_entry;
+
+	entry = head->first;
+	for (;;) {
+		old_entry = entry;
+		new->next = entry;
+		entry = cmpxchg(&head->first, old_entry, new);
+		if (entry == old_entry)
+			break;
+	}
+
+	return old_entry == NULL;
+}
+
+/**
+ * llist_del_all - delete all entries from lock-less list
+ * @head:	the head of the lock-less list to delete all entries from
+ *
+ * If list is empty, return NULL, otherwise, delete all entries and
+ * return the pointer to the first entry.  The order of entries
+ * deleted is from the newest to the oldest added one.
+ */
+static inline struct llist_node *llist_del_all(struct llist_head *head)
+{
+	return xchg(&head->first, NULL);
+}
+
+extern bool llist_add_batch(struct llist_node *new_first,
+			    struct llist_node *new_last,
+			    struct llist_head *head);
+extern struct llist_node *llist_del_first(struct llist_head *head);
+
 #endif /* LLIST_H */
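
The cmpxchg() loop in llist_add() above is the classic lock-free stack push: retry until the head you read is still the head you swap. A hedged user-space equivalent using C11 atomics (the sketch_ type and function names are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct sketch_node {
	struct sketch_node *next;
};

struct sketch_head {
	_Atomic(struct sketch_node *) first;
};

/* Returns true if the list was empty before this push, like llist_add(). */
static bool sketch_push(struct sketch_node *new, struct sketch_head *head)
{
	struct sketch_node *old = atomic_load(&head->first);

	do {
		new->next = old;
		/* on failure, 'old' is reloaded with the current head */
	} while (!atomic_compare_exchange_weak(&head->first, &old, new));

	return old == NULL;
}

/* llist_del_all() maps to a single exchange of the head with NULL. */
static struct sketch_node *sketch_del_all(struct sketch_head *head)
{
	return atomic_exchange(&head->first, NULL);
}
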
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index ef820a3..b6a56e3 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -548,7 +548,7 @@
 #endif
 
 #ifdef CONFIG_PROVE_RCU
-extern void lockdep_rcu_dereference(const char *file, const int line);
+void lockdep_rcu_suspicious(const char *file, const int line, const char *s);
 #endif
 
 #endif /* __LINUX_LOCKDEP_H */
diff --git a/include/linux/module.h b/include/linux/module.h
index 1c30087..8639216 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -580,9 +580,6 @@
 
 extern void print_modules(void);
 
-extern void module_update_tracepoints(void);
-extern int module_get_iter_tracepoints(struct tracepoint_iter *iter);
-
 #else /* !CONFIG_MODULES... */
 #define EXPORT_SYMBOL(sym)
 #define EXPORT_SYMBOL_GPL(sym)
@@ -698,15 +695,6 @@
 static inline void print_modules(void)
 {
 }
-
-static inline void module_update_tracepoints(void)
-{
-}
-
-static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter)
-{
-	return 0;
-}
 #endif /* CONFIG_MODULES */
 
 #ifdef CONFIG_SYSFS
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c816075..1e9ebe5 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -220,7 +220,10 @@
 				mmap_data      :  1, /* non-exec mmap data    */
 				sample_id_all  :  1, /* sample_type all events */
 
-				__reserved_1   : 45;
+				exclude_host   :  1, /* don't count in host   */
+				exclude_guest  :  1, /* don't count in guest  */
+
+				__reserved_1   : 43;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 959c141..042058f 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -81,7 +81,10 @@
 			unsigned long incr;
 			unsigned long expires;
 		} mmtimer;
-		struct alarm alarmtimer;
+		struct {
+			struct alarm alarmtimer;
+			ktime_t interval;
+		} alarm;
 		struct rcu_head rcu;
 	} it;
 };
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 8f4f881..2cf4226 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -33,6 +33,7 @@
 #ifndef __LINUX_RCUPDATE_H
 #define __LINUX_RCUPDATE_H
 
+#include <linux/types.h>
 #include <linux/cache.h>
 #include <linux/spinlock.h>
 #include <linux/threads.h>
@@ -64,32 +65,74 @@
 #define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
 #define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
 
-/**
- * struct rcu_head - callback structure for use with RCU
- * @next: next update requests in a list
- * @func: actual update function to call after the grace period.
- */
-struct rcu_head {
-	struct rcu_head *next;
-	void (*func)(struct rcu_head *head);
-};
-
 /* Exported common interfaces */
+
+#ifdef CONFIG_PREEMPT_RCU
+
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.  However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+extern void call_rcu(struct rcu_head *head,
+			      void (*func)(struct rcu_head *head));
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+/* In classic RCU, call_rcu() is just call_rcu_sched(). */
+#define	call_rcu	call_rcu_sched
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+/**
+ * call_rcu_bh() - Queue an RCU callback after a quicker grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_bh() assumes
+ * that the read-side critical sections end on completion of a softirq
+ * handler. This means that read-side critical sections in process
+ * context must not be interrupted by softirqs. This interface is to be
+ * used when most of the read-side critical sections are in softirq context.
+ * RCU read-side critical sections are delimited by:
+ *  - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
+ *  OR
+ *  - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
+ *  These may be nested.
+ */
+extern void call_rcu_bh(struct rcu_head *head,
+			void (*func)(struct rcu_head *head));
+
+/**
+ * call_rcu_sched() - Queue an RCU callback after a sched grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_sched() assumes
+ * that the read-side critical sections end on enabling of preemption
+ * or on voluntary preemption.
+ * RCU read-side critical sections are delimited by :
+ *  - rcu_read_lock_sched() and  rcu_read_unlock_sched(),
+ *  OR
+ *  anything that disables preemption.
+ *  These may be nested.
+ */
 extern void call_rcu_sched(struct rcu_head *head,
 			   void (*func)(struct rcu_head *rcu));
+
 extern void synchronize_sched(void);
-extern void rcu_barrier_bh(void);
-extern void rcu_barrier_sched(void);
-
-static inline void __rcu_read_lock_bh(void)
-{
-	local_bh_disable();
-}
-
-static inline void __rcu_read_unlock_bh(void)
-{
-	local_bh_enable();
-}
 
 #ifdef CONFIG_PREEMPT_RCU
 
@@ -152,6 +195,15 @@
 
 #endif /* #else #ifdef CONFIG_NO_HZ */
 
+/*
+ * Infrastructure to implement the synchronize_() primitives in
+ * TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
+ */
+
+typedef void call_rcu_func_t(struct rcu_head *head,
+			     void (*func)(struct rcu_head *head));
+void wait_rcu_gp(call_rcu_func_t crf);
+
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
 #include <linux/rcutree.h>
 #elif defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
@@ -297,19 +349,31 @@
 /**
  * rcu_lockdep_assert - emit lockdep splat if specified condition not met
  * @c: condition to check
+ * @s: informative message
  */
-#define rcu_lockdep_assert(c)						\
+#define rcu_lockdep_assert(c, s)					\
 	do {								\
 		static bool __warned;					\
 		if (debug_lockdep_rcu_enabled() && !__warned && !(c)) {	\
 			__warned = true;				\
-			lockdep_rcu_dereference(__FILE__, __LINE__);	\
+			lockdep_rcu_suspicious(__FILE__, __LINE__, s);	\
 		}							\
 	} while (0)
 
+#define rcu_sleep_check()						\
+	do {								\
+		rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),	\
+				   "Illegal context switch in RCU-bh"	\
+				   " read-side critical section");	\
+		rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),	\
+				   "Illegal context switch in RCU-sched"\
+				   " read-side critical section");	\
+	} while (0)
+
 #else /* #ifdef CONFIG_PROVE_RCU */
 
-#define rcu_lockdep_assert(c) do { } while (0)
+#define rcu_lockdep_assert(c, s) do { } while (0)
+#define rcu_sleep_check() do { } while (0)
 
 #endif /* #else #ifdef CONFIG_PROVE_RCU */
 
@@ -338,14 +402,16 @@
 #define __rcu_dereference_check(p, c, space) \
 	({ \
 		typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
-		rcu_lockdep_assert(c); \
+		rcu_lockdep_assert(c, "suspicious rcu_dereference_check()" \
+				      " usage"); \
 		rcu_dereference_sparse(p, space); \
 		smp_read_barrier_depends(); \
 		((typeof(*p) __force __kernel *)(_________p1)); \
 	})
 #define __rcu_dereference_protected(p, c, space) \
 	({ \
-		rcu_lockdep_assert(c); \
+		rcu_lockdep_assert(c, "suspicious rcu_dereference_protected()" \
+				      " usage"); \
 		rcu_dereference_sparse(p, space); \
 		((typeof(*p) __force __kernel *)(p)); \
 	})
@@ -359,15 +425,15 @@
 #define __rcu_dereference_index_check(p, c) \
 	({ \
 		typeof(p) _________p1 = ACCESS_ONCE(p); \
-		rcu_lockdep_assert(c); \
+		rcu_lockdep_assert(c, \
+				   "suspicious rcu_dereference_index_check()" \
+				   " usage"); \
 		smp_read_barrier_depends(); \
 		(_________p1); \
 	})
 #define __rcu_assign_pointer(p, v, space) \
 	({ \
-		if (!__builtin_constant_p(v) || \
-		    ((v) != NULL)) \
-			smp_wmb(); \
+		smp_wmb(); \
 		(p) = (typeof(*v) __force space *)(v); \
 	})
 
@@ -500,26 +566,6 @@
 #define rcu_dereference_protected(p, c) \
 	__rcu_dereference_protected((p), (c), __rcu)
 
-/**
- * rcu_dereference_bh_protected() - fetch RCU-bh pointer when updates prevented
- * @p: The pointer to read, prior to dereferencing
- * @c: The conditions under which the dereference will take place
- *
- * This is the RCU-bh counterpart to rcu_dereference_protected().
- */
-#define rcu_dereference_bh_protected(p, c) \
-	__rcu_dereference_protected((p), (c), __rcu)
-
-/**
- * rcu_dereference_sched_protected() - fetch RCU-sched pointer when updates prevented
- * @p: The pointer to read, prior to dereferencing
- * @c: The conditions under which the dereference will take place
- *
- * This is the RCU-sched counterpart to rcu_dereference_protected().
- */
-#define rcu_dereference_sched_protected(p, c) \
-	__rcu_dereference_protected((p), (c), __rcu)
-
 
 /**
  * rcu_dereference() - fetch RCU-protected pointer for dereferencing
@@ -630,7 +676,7 @@
  */
 static inline void rcu_read_lock_bh(void)
 {
-	__rcu_read_lock_bh();
+	local_bh_disable();
 	__acquire(RCU_BH);
 	rcu_read_acquire_bh();
 }
@@ -644,7 +690,7 @@
 {
 	rcu_read_release_bh();
 	__release(RCU_BH);
-	__rcu_read_unlock_bh();
+	local_bh_enable();
 }
 
 /**
@@ -698,11 +744,18 @@
  * any prior initialization.  Returns the value assigned.
  *
  * Inserts memory barriers on architectures that require them
- * (pretty much all of them other than x86), and also prevents
- * the compiler from reordering the code that initializes the
- * structure after the pointer assignment.  More importantly, this
- * call documents which pointers will be dereferenced by RCU read-side
- * code.
+ * (which is most of them), and also prevents the compiler from
+ * reordering the code that initializes the structure after the pointer
+ * assignment.  More importantly, this call documents which pointers
+ * will be dereferenced by RCU read-side code.
+ *
+ * In some special cases, you may use RCU_INIT_POINTER() instead
+ * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
+ * to the fact that it does not constrain either the CPU or the compiler.
+ * That said, using RCU_INIT_POINTER() when you should have used
+ * rcu_assign_pointer() is a very bad thing that results in
+ * impossible-to-diagnose memory corruption.  So please be careful.
+ * See the RCU_INIT_POINTER() comment header for details.
  */
 #define rcu_assign_pointer(p, v) \
 	__rcu_assign_pointer((p), (v), __rcu)
@@ -710,105 +763,38 @@
 /**
  * RCU_INIT_POINTER() - initialize an RCU protected pointer
  *
- * Initialize an RCU-protected pointer in such a way to avoid RCU-lockdep
- * splats.
+ * Initialize an RCU-protected pointer in special cases where readers
+ * do not need ordering constraints on the CPU or the compiler.  These
+ * special cases are:
+ *
+ * 1.	This use of RCU_INIT_POINTER() is NULLing out the pointer -or-
+ * 2.	The caller has taken whatever steps are required to prevent
+ *	RCU readers from concurrently accessing this pointer -or-
+ * 3.	The referenced data structure has already been exposed to
+ *	readers either at compile time or via rcu_assign_pointer() -and-
+ *	a.	You have not made -any- reader-visible changes to
+ *		this structure since then -or-
+ *	b.	It is OK for readers accessing this structure from its
+ *		new location to see the old state of the structure.  (For
+ *		example, the changes were to statistical counters or to
+ *		other state where exact synchronization is not required.)
+ *
+ * Failure to follow these rules governing use of RCU_INIT_POINTER() will
+ * result in impossible-to-diagnose memory corruption.  As in the structures
+ * will look OK in crash dumps, but any concurrent RCU readers might
+ * see pre-initialized values of the referenced data structure.  So
+ * please be very careful how you use RCU_INIT_POINTER()!!!
+ *
+ * If you are creating an RCU-protected linked structure that is accessed
+ * by a single external-to-structure RCU-protected pointer, then you may
+ * use RCU_INIT_POINTER() to initialize the internal RCU-protected
+ * pointers, but you must use rcu_assign_pointer() to initialize the
+ * external-to-structure pointer -after- you have completely initialized
+ * the reader-accessible portions of the linked structure.
  */
 #define RCU_INIT_POINTER(p, v) \
 		p = (typeof(*v) __force __rcu *)(v)
 
-/* Infrastructure to implement the synchronize_() primitives. */
-
-struct rcu_synchronize {
-	struct rcu_head head;
-	struct completion completion;
-};
-
-extern void wakeme_after_rcu(struct rcu_head  *head);
-
-#ifdef CONFIG_PREEMPT_RCU
-
-/**
- * call_rcu() - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all pre-existing RCU read-side
- * critical sections have completed.  However, the callback function
- * might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- */
-extern void call_rcu(struct rcu_head *head,
-			      void (*func)(struct rcu_head *head));
-
-#else /* #ifdef CONFIG_PREEMPT_RCU */
-
-/* In classic RCU, call_rcu() is just call_rcu_sched(). */
-#define	call_rcu	call_rcu_sched
-
-#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
-
-/**
- * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_bh() assumes
- * that the read-side critical sections end on completion of a softirq
- * handler. This means that read-side critical sections in process
- * context must not be interrupted by softirqs. This interface is to be
- * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by :
- *  - rcu_read_lock() and  rcu_read_unlock(), if in interrupt context.
- *  OR
- *  - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
- *  These may be nested.
- */
-extern void call_rcu_bh(struct rcu_head *head,
-			void (*func)(struct rcu_head *head));
-
-/*
- * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
- * by call_rcu() and rcu callback execution, and are therefore not part of the
- * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
- */
-
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-# define STATE_RCU_HEAD_READY	0
-# define STATE_RCU_HEAD_QUEUED	1
-
-extern struct debug_obj_descr rcuhead_debug_descr;
-
-static inline void debug_rcu_head_queue(struct rcu_head *head)
-{
-	WARN_ON_ONCE((unsigned long)head & 0x3);
-	debug_object_activate(head, &rcuhead_debug_descr);
-	debug_object_active_state(head, &rcuhead_debug_descr,
-				  STATE_RCU_HEAD_READY,
-				  STATE_RCU_HEAD_QUEUED);
-}
-
-static inline void debug_rcu_head_unqueue(struct rcu_head *head)
-{
-	debug_object_active_state(head, &rcuhead_debug_descr,
-				  STATE_RCU_HEAD_QUEUED,
-				  STATE_RCU_HEAD_READY);
-	debug_object_deactivate(head, &rcuhead_debug_descr);
-}
-#else	/* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-static inline void debug_rcu_head_queue(struct rcu_head *head)
-{
-}
-
-static inline void debug_rcu_head_unqueue(struct rcu_head *head)
-{
-}
-#endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-
 static __always_inline bool __is_kfree_rcu_offset(unsigned long offset)
 {
 	return offset < 4096;
@@ -827,18 +813,6 @@
 	call_rcu(head, (rcu_callback)offset);
 }
 
-extern void kfree(const void *);
-
-static inline void __rcu_reclaim(struct rcu_head *head)
-{
-	unsigned long offset = (unsigned long)head->func;
-
-	if (__is_kfree_rcu_offset(offset))
-		kfree((void *)head - offset);
-	else
-		head->func(head);
-}
-
 /**
  * kfree_rcu() - kfree an object after a grace period.
  * @ptr:	pointer to kfree
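To make the rcu_assign_pointer()/RCU_INIT_POINTER() rules documented above concrete, here is a minimal sketch that is not part of the patch; struct foo, gp, foo_mutex and the helper names are invented for illustration. It publishes a fully initialized structure with rcu_assign_pointer() and NULLs the pointer with RCU_INIT_POINTER(), which rule 1 above permits:

	#include <linux/rcupdate.h>
	#include <linux/mutex.h>

	struct foo {
		int a;
	};

	static struct foo __rcu *gp;
	static DEFINE_MUTEX(foo_mutex);		/* update-side lock for gp */

	/* Publication path: initialize first, then order the pointer store. */
	static void publish_foo(struct foo *newp)
	{
		mutex_lock(&foo_mutex);
		newp->a = 42;
		rcu_assign_pointer(gp, newp);
		mutex_unlock(&foo_mutex);
	}

	/* NULLing path: rule 1 above, so no ordering is needed. */
	static struct foo *retract_foo(void)
	{
		struct foo *old;

		mutex_lock(&foo_mutex);
		old = rcu_dereference_protected(gp, lockdep_is_held(&foo_mutex));
		RCU_INIT_POINTER(gp, NULL);
		mutex_unlock(&foo_mutex);
		return old;	/* caller waits a grace period before freeing */
	}

The returned old pointer must still survive a grace period (synchronize_rcu() or call_rcu()) before it may be freed.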
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 52b3e02..00b7a5e 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,9 +27,23 @@
 
 #include <linux/cache.h>
 
+#ifdef CONFIG_RCU_BOOST
 static inline void rcu_init(void)
 {
 }
+#else /* #ifdef CONFIG_RCU_BOOST */
+void rcu_init(void);
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+static inline void rcu_barrier_bh(void)
+{
+	wait_rcu_gp(call_rcu_bh);
+}
+
+static inline void rcu_barrier_sched(void)
+{
+	wait_rcu_gp(call_rcu_sched);
+}
 
 #ifdef CONFIG_TINY_RCU
 
@@ -45,9 +59,13 @@
 
 #else /* #ifdef CONFIG_TINY_RCU */
 
-void rcu_barrier(void);
 void synchronize_rcu_expedited(void);
 
+static inline void rcu_barrier(void)
+{
+	wait_rcu_gp(call_rcu);
+}
+
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 static inline void synchronize_rcu_bh(void)
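The rcu_barrier_bh()/rcu_barrier_sched() wrappers above lean on the new wait_rcu_gp() helper, which queues a callback through the supplied call_rcu_func_t and blocks until that callback runs. A rough sketch of such a helper, assuming the struct rcu_synchronize/wakeme_after_rcu() pair that this series moves out of rcupdate.h (a sketch, not a quote of the actual common code):

	#include <linux/completion.h>
	#include <linux/rcupdate.h>

	struct rcu_synchronize {
		struct rcu_head head;
		struct completion completion;
	};

	/* The queued callback just wakes up whoever is waiting. */
	static void wakeme_after_rcu(struct rcu_head *head)
	{
		struct rcu_synchronize *rcu =
			container_of(head, struct rcu_synchronize, head);

		complete(&rcu->completion);
	}

	void wait_rcu_gp(call_rcu_func_t crf)
	{
		struct rcu_synchronize rcu;

		init_completion(&rcu.completion);
		crf(&rcu.head, wakeme_after_rcu);	/* e.g. call_rcu_bh */
		wait_for_completion(&rcu.completion);	/* grace period done */
	}

On TINY_RCU there is only one CPU and callbacks run in queue order, so waiting for this single callback is enough to turn rcu_barrier_bh() and rcu_barrier_sched() into the inline wrappers shown above.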
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index e65d066..6745846 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -67,6 +67,8 @@
 }
 
 extern void rcu_barrier(void);
+extern void rcu_barrier_bh(void);
+extern void rcu_barrier_sched(void);
 
 extern unsigned long rcutorture_testseq;
 extern unsigned long rcutorture_vernum;
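A common reason code needs rcu_barrier() (as opposed to synchronize_rcu()) is module unload: every callback the module queued must run before its text and data disappear. A hedged sketch, where foo_cache and foo_unhook_all() are invented placeholders:

	static void __exit foo_exit(void)
	{
		foo_unhook_all();	/* stop queueing new call_rcu() callbacks */
		rcu_barrier();		/* wait for all queued callbacks to finish */
		kmem_cache_destroy(foo_cache);
	}
	module_exit(foo_exit);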
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index b891de9..67be037 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -154,6 +154,8 @@
 void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu);
 void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu);
 
+unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu);
+unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu);
 unsigned long ring_buffer_entries(struct ring_buffer *buffer);
 unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
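The two new per-CPU accessors declared above could be used from instrumentation code roughly as follows; dump_cpu_buffer_stats() is a made-up helper, not part of the patch:

	#include <linux/ring_buffer.h>
	#include <linux/printk.h>

	static void dump_cpu_buffer_stats(struct ring_buffer *buffer, int cpu)
	{
		unsigned long ts = ring_buffer_oldest_event_ts(buffer, cpu);
		unsigned long bytes = ring_buffer_bytes_cpu(buffer, cpu);

		pr_info("cpu%d: oldest event ts=%lu, %lu bytes in buffer\n",
			cpu, ts, bytes);
	}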
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1be699d..e8acce7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -90,6 +90,7 @@
 #include <linux/task_io_accounting.h>
 #include <linux/latencytop.h>
 #include <linux/cred.h>
+#include <linux/llist.h>
 
 #include <asm/processor.h>
 
@@ -270,7 +271,6 @@
 
 extern int runqueue_is_locked(int cpu);
 
-extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern void select_nohz_load_balancer(int stop_tick);
 extern int get_nohz_timer_target(void);
@@ -1225,7 +1225,7 @@
 	unsigned int ptrace;
 
 #ifdef CONFIG_SMP
-	struct task_struct *wake_entry;
+	struct llist_node wake_entry;
 	int on_cpu;
 #endif
 	int on_rq;
@@ -1260,9 +1260,6 @@
 #ifdef CONFIG_PREEMPT_RCU
 	int rcu_read_lock_nesting;
 	char rcu_read_unlock_special;
-#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU)
-	int rcu_boosted;
-#endif /* #if defined(CONFIG_RCU_BOOST) && defined(CONFIG_TREE_PREEMPT_RCU) */
 	struct list_head rcu_node_entry;
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 #ifdef CONFIG_TREE_PREEMPT_RCU
@@ -2039,6 +2036,10 @@
 static inline void sched_autogroup_exit(struct signal_struct *sig) { }
 #endif
 
+#ifdef CONFIG_CFS_BANDWIDTH
+extern unsigned int sysctl_sched_cfs_bandwidth_slice;
+#endif
+
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h
index 7a81303..4eb4902 100644
--- a/include/linux/trace_clock.h
+++ b/include/linux/trace_clock.h
@@ -15,5 +15,6 @@
 extern u64 notrace trace_clock_local(void);
 extern u64 notrace trace_clock(void);
 extern u64 notrace trace_clock_global(void);
+extern u64 notrace trace_clock_counter(void);
 
 #endif /* _LINUX_TRACE_CLOCK_H */
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index d530a44..df0a779 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -54,8 +54,18 @@
 						void *data);
 extern void tracepoint_probe_update_all(void);
 
+#ifdef CONFIG_MODULES
+struct tp_module {
+	struct list_head list;
+	unsigned int num_tracepoints;
+	struct tracepoint * const *tracepoints_ptrs;
+};
+#endif /* CONFIG_MODULES */
+
 struct tracepoint_iter {
-	struct module *module;
+#ifdef CONFIG_MODULES
+	struct tp_module *module;
+#endif /* CONFIG_MODULES */
 	struct tracepoint * const *tracepoint;
 };
 
@@ -63,8 +73,6 @@
 extern void tracepoint_iter_next(struct tracepoint_iter *iter);
 extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
 extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
-extern int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
-	struct tracepoint * const *begin, struct tracepoint * const *end);
 
 /*
  * tracepoint_synchronize_unregister must be called between the last tracepoint
@@ -78,17 +86,6 @@
 
 #define PARAMS(args...) args
 
-#ifdef CONFIG_TRACEPOINTS
-extern
-void tracepoint_update_probe_range(struct tracepoint * const *begin,
-	struct tracepoint * const *end);
-#else
-static inline
-void tracepoint_update_probe_range(struct tracepoint * const *begin,
-	struct tracepoint * const *end)
-{ }
-#endif /* CONFIG_TRACEPOINTS */
-
 #endif /* _LINUX_TRACEPOINT_H */
 
 /*
diff --git a/include/linux/types.h b/include/linux/types.h
index 176da8c..57a9723 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -238,6 +238,16 @@
 	char			f_fpack[6];
 };
 
+/**
+ * struct rcu_head - callback structure for use with RCU
+ * @next: next update requests in a list
+ * @func: actual update function to call after the grace period.
+ */
+struct rcu_head {
+	struct rcu_head *next;
+	void (*func)(struct rcu_head *head);
+};
+
 #endif	/* __KERNEL__ */
 #endif /*  __ASSEMBLY__ */
 #endif /* _LINUX_TYPES_H */
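With struct rcu_head now defined in linux/types.h, any structure can embed it without further header dependencies, and the kfree_rcu() path built on __is_kfree_rcu_offset() frees such objects without an explicit callback. A small sketch (struct foo and foo_del() are invented names):

	#include <linux/rculist.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct foo {
		struct list_head list;
		struct rcu_head rcu;	/* visible via linux/types.h alone */
		int data;
	};

	static void foo_del(struct foo *fp)
	{
		list_del_rcu(&fp->list);
		kfree_rcu(fp, rcu);	/* encodes offsetof(struct foo, rcu) */
	}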
diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h
index c5c5e00..f05fa82 100644
--- a/include/scsi/osd_ore.h
+++ b/include/scsi/osd_ore.h
@@ -34,15 +34,30 @@
 
 struct ore_layout {
 	/* Our way of looking at the data_map */
+	enum pnfs_osd_raid_algorithm4
+		 raid_algorithm;
 	unsigned stripe_unit;
 	unsigned mirrors_p1;
 
 	unsigned group_width;
+	unsigned parity;
 	u64	 group_depth;
 	unsigned group_count;
+
+	/* Cached often needed calculations filled in by
+	 * ore_verify_layout
+	 */
+	unsigned long max_io_length;	/* Max length that should be passed to
+					 * ore_get_rw_state
+					 */
+};
+
+struct ore_dev {
+	struct osd_dev *od;
 };
 
 struct ore_components {
+	unsigned	first_dev;		/* First logical device no    */
 	unsigned	numdevs;		/* Num of devices in array    */
 	/* If @single_comp == EC_SINGLE_COMP, @comps points to a single
 	 * component. else there are @numdevs components
@@ -51,20 +66,60 @@
 		EC_SINGLE_COMP = 0, EC_MULTPLE_COMPS = 0xffffffff
 	}		single_comp;
 	struct ore_comp	*comps;
-	struct osd_dev	**ods;			/* osd_dev array              */
+
+	/* Array of pointers to ore_dev. Users will usually have these point
+	 * to a bigger struct which contains an "ore_dev ored" member and use
+	 * container_of(oc->ods[i], struct foo_dev, ored) to access the bigger
+	 * structure.
+	 */
+	struct ore_dev	**ods;
+};
+
+/* ore_comp_dev receives a logical device index */
+static inline struct osd_dev *ore_comp_dev(
+	const struct ore_components *oc, unsigned i)
+{
+	BUG_ON((i < oc->first_dev) || (oc->first_dev + oc->numdevs <= i));
+	return oc->ods[i - oc->first_dev]->od;
+}
+
+static inline void ore_comp_set_dev(
+	struct ore_components *oc, unsigned i, struct osd_dev *od)
+{
+	oc->ods[i - oc->first_dev]->od = od;
+}
+
+struct ore_striping_info {
+	u64 offset;
+	u64 obj_offset;
+	u64 length;
+	u64 first_stripe_start; /* only used in raid writes */
+	u64 M; /* for truncate */
+	unsigned bytes_in_stripe;
+	unsigned dev;
+	unsigned par_dev;
+	unsigned unit_off;
+	unsigned cur_pg;
+	unsigned cur_comp;
 };
 
 struct ore_io_state;
 typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private);
+struct _ore_r4w_op {
+	/* @priv given here is passed as ios->private */
+	struct page * (*get_page)(void *priv, u64 page_index, bool *uptodate);
+	void (*put_page)(void *priv, struct page *page);
+};
 
 struct ore_io_state {
 	struct kref		kref;
+	struct ore_striping_info si;
 
 	void			*private;
 	ore_io_done_fn	done;
 
 	struct ore_layout	*layout;
-	struct ore_components	*comps;
+	struct ore_components	*oc;
 
 	/* Global read/write IO*/
 	loff_t			offset;
@@ -84,6 +139,16 @@
 
 	bool			reading;
 
+	/* House keeping of Parity pages */
+	bool			extra_part_alloc;
+	struct page		**parity_pages;
+	unsigned		max_par_pages;
+	unsigned		cur_par_page;
+	unsigned		sgs_per_dev;
+	struct __stripe_pages_2d *sp2d;
+	struct ore_io_state	 *ios_read_4_write;
+	const struct _ore_r4w_op *r4w;
+
 	/* Variable array of size numdevs */
 	unsigned numdevs;
 	struct ore_per_dev_state {
@@ -91,7 +156,10 @@
 		struct bio *bio;
 		loff_t offset;
 		unsigned length;
+		unsigned last_sgs_total;
 		unsigned dev;
+		struct osd_sg_entry *sglist;
+		unsigned cur_sg;
 	} per_dev[];
 };
 
@@ -102,6 +170,9 @@
 }
 
 /* ore.c */
+int ore_verify_layout(unsigned total_comps, struct ore_layout *layout);
+void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
+			  u64 length, struct ore_striping_info *si);
 int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
 		     bool is_reading, u64 offset, u64 length,
 		     struct ore_io_state **ios);
@@ -109,7 +180,10 @@
 		     struct ore_io_state **ios);
 void ore_put_io_state(struct ore_io_state *ios);
 
-int ore_check_io(struct ore_io_state *ios, u64 *resid);
+typedef void (*ore_on_dev_error)(struct ore_io_state *ios, struct ore_dev *od,
+	unsigned dev_index, enum osd_err_priority oep,
+	u64 dev_offset, u64  dev_len);
+int ore_check_io(struct ore_io_state *ios, ore_on_dev_error rep);
 
 int ore_create(struct ore_io_state *ios);
 int ore_remove(struct ore_io_state *ios);
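The switch from a raw osd_dev array to ore_dev pointers exists so that callers can wrap each device in a larger structure, as the comment above describes; a hedged sketch of that embedding, with struct foo_dev as the hypothetical wrapper:

	#include <linux/kernel.h>
	#include <scsi/osd_ore.h>

	struct foo_dev {
		struct ore_dev ored;		/* what oc->ods[i] points at */
		unsigned long private_flags;
	};

	static inline struct foo_dev *to_foo_dev(struct ore_dev *od)
	{
		return container_of(od, struct foo_dev, ored);
	}

	/* Recover the wrapper for logical device number i. */
	static struct foo_dev *foo_dev_at(struct ore_components *oc, unsigned i)
	{
		return to_foo_dev(oc->ods[i - oc->first_dev]);
	}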
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
new file mode 100644
index 0000000..669fbd6
--- /dev/null
+++ b/include/trace/events/rcu.h
@@ -0,0 +1,459 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rcu
+
+#if !defined(_TRACE_RCU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RCU_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Tracepoint for start/end markers used for utilization calculations.
+ * By convention, the string is of the following forms:
+ *
+ * "Start <activity>" -- Mark the start of the specified activity,
+ *			 such as "context switch".  Nesting is permitted.
+ * "End <activity>" -- Mark the end of the specified activity.
+ *
+ * An "@" character within "<activity>" is a comment character: Data
+ * reduction scripts will ignore the "@" and the remainder of the line.
+ */
+TRACE_EVENT(rcu_utilization,
+
+	TP_PROTO(char *s),
+
+	TP_ARGS(s),
+
+	TP_STRUCT__entry(
+		__field(char *, s)
+	),
+
+	TP_fast_assign(
+		__entry->s = s;
+	),
+
+	TP_printk("%s", __entry->s)
+);
+
+#ifdef CONFIG_RCU_TRACE
+
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
+
+/*
+ * Tracepoint for grace-period events: starting and ending a grace
+ * period ("start" and "end", respectively), a CPU noting the start
+ * of a new grace period or the end of an old grace period ("cpustart"
+ * and "cpuend", respectively), a CPU passing through a quiescent
+ * state ("cpuqs"), a CPU coming online or going offline ("cpuonl"
+ * and "cpuofl", respectively), and a CPU being kicked for being too
+ * long in dyntick-idle mode ("kick").
+ */
+TRACE_EVENT(rcu_grace_period,
+
+	TP_PROTO(char *rcuname, unsigned long gpnum, char *gpevent),
+
+	TP_ARGS(rcuname, gpnum, gpevent),
+
+	TP_STRUCT__entry(
+		__field(char *, rcuname)
+		__field(unsigned long, gpnum)
+		__field(char *, gpevent)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->gpnum = gpnum;
+		__entry->gpevent = gpevent;
+	),
+
+	TP_printk("%s %lu %s",
+		  __entry->rcuname, __entry->gpnum, __entry->gpevent)
+);
+
+/*
+ * Tracepoint for grace-period-initialization events.  These are
+ * distinguished by the type of RCU, the new grace-period number, the
+ * rcu_node structure level, the starting and ending CPU covered by the
+ * rcu_node structure, and the mask of CPUs that will be waited for.
+ * All but the type of RCU are extracted from the rcu_node structure.
+ */
+TRACE_EVENT(rcu_grace_period_init,
+
+	TP_PROTO(char *rcuname, unsigned long gpnum, u8 level,
+		 int grplo, int grphi, unsigned long qsmask),
+
+	TP_ARGS(rcuname, gpnum, level, grplo, grphi, qsmask),
+
+	TP_STRUCT__entry(
+		__field(char *, rcuname)
+		__field(unsigned long, gpnum)
+		__field(u8, level)
+		__field(int, grplo)
+		__field(int, grphi)
+		__field(unsigned long, qsmask)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->gpnum = gpnum;
+		__entry->level = level;
+		__entry->grplo = grplo;
+		__entry->grphi = grphi;
+		__entry->qsmask = qsmask;
+	),
+
+	TP_printk("%s %lu %u %d %d %lx",
+		  __entry->rcuname, __entry->gpnum, __entry->level,
+		  __entry->grplo, __entry->grphi, __entry->qsmask)
+);
+
+/*
+ * Tracepoint for tasks blocking within preemptible-RCU read-side
+ * critical sections.  Track the type of RCU (which one day might
+ * include SRCU), the grace-period number that the task is blocking
+ * (the current or the next), and the task's PID.
+ */
+TRACE_EVENT(rcu_preempt_task,
+
+	TP_PROTO(char *rcuname, int pid, unsigned long gpnum),
+
+	TP_ARGS(rcuname, pid, gpnum),
+
+	TP_STRUCT__entry(
+		__field(char *, rcuname)
+		__field(unsigned long, gpnum)
+		__field(int, pid)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->gpnum = gpnum;
+		__entry->pid = pid;
+	),
+
+	TP_printk("%s %lu %d",
+		  __entry->rcuname, __entry->gpnum, __entry->pid)
+);
+
+/*
+ * Tracepoint for tasks that blocked within a given preemptible-RCU
+ * read-side critical section exiting that critical section.  Track the
+ * type of RCU (which one day might include SRCU) and the task's PID.
+ */
+TRACE_EVENT(rcu_unlock_preempted_task,
+
+	TP_PROTO(char *rcuname, unsigned long gpnum, int pid),
+
+	TP_ARGS(rcuname, gpnum, pid),
+
+	TP_STRUCT__entry(
+		__field(char *, rcuname)
+		__field(unsigned long, gpnum)
+		__field(int, pid)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->gpnum = gpnum;
+		__entry->pid = pid;
+	),
+
+	TP_printk("%s %lu %d", __entry->rcuname, __entry->gpnum, __entry->pid)
+);
+
+/*
+ * Tracepoint for quiescent-state-reporting events.  These are
+ * distinguished by the type of RCU, the grace-period number, the
+ * mask of quiescent lower-level entities, the rcu_node structure level,
+ * the starting and ending CPU covered by the rcu_node structure, and
+ * whether there are any blocked tasks blocking the current grace period.
+ * All but the type of RCU are extracted from the rcu_node structure.
+ */
+TRACE_EVENT(rcu_quiescent_state_report,
+
+	TP_PROTO(char *rcuname, unsigned long gpnum,
+		 unsigned long mask, unsigned long qsmask,
+		 u8 level, int grplo, int grphi, int gp_tasks),
+
+	TP_ARGS(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks),
+
+	TP_STRUCT__entry(
+		__field(char *, rcuname)
+		__field(unsigned long, gpnum)
+		__field(unsigned long, mask)
+		__field(unsigned long, qsmask)
+		__field(u8, level)
+		__field(int, grplo)
+		__field(int, grphi)
+		__field(u8, gp_tasks)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->gpnum = gpnum;
+		__entry->mask = mask;
+		__entry->qsmask = qsmask;
+		__entry->level = level;
+		__entry->grplo = grplo;
+		__entry->grphi = grphi;
+		__entry->gp_tasks = gp_tasks;
+	),
+
+	TP_printk("%s %lu %lx>%lx %u %d %d %u",
+		  __entry->rcuname, __entry->gpnum,
+		  __entry->mask, __entry->qsmask, __entry->level,
+		  __entry->grplo, __entry->grphi, __entry->gp_tasks)
+);
+
+/*
+ * Tracepoint for quiescent states detected by force_quiescent_state().
+ * These trace events include the type of RCU, the grace-period number
+ * that was blocked by the CPU, the CPU itself, and the type of quiescent
+ * state, which can be "dti" for dyntick-idle mode, "ofl" for CPU offline,
+ * or "kick" when kicking a CPU that has been in dyntick-idle mode for
+ * too long.
+ */
+TRACE_EVENT(rcu_fqs,
+
+	TP_PROTO(char *rcuname, unsigned long gpnum, int cpu, char *qsevent),
+
+	TP_ARGS(rcuname, gpnum, cpu, qsevent),
+
+	TP_STRUCT__entry(
+		__field(char *, rcuname)
+		__field(unsigned long, gpnum)
+		__field(int, cpu)
+		__field(char *, qsevent)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->gpnum = gpnum;
+		__entry->cpu = cpu;
+		__entry->qsevent = qsevent;
+	),
+
+	TP_printk("%s %lu %d %s",
+		  __entry->rcuname, __entry->gpnum,
+		  __entry->cpu, __entry->qsevent)
+);
+
+#endif /* #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) */
+
+/*
+ * Tracepoint for dyntick-idle entry/exit events.  These take a string
+ * as argument: "Start" for entering dyntick-idle mode and "End" for
+ * leaving it.
+ */
+TRACE_EVENT(rcu_dyntick,
+
+	TP_PROTO(char *polarity),
+
+	TP_ARGS(polarity),
+
+	TP_STRUCT__entry(
+		__field(char *, polarity)
+	),
+
+	TP_fast_assign(
+		__entry->polarity = polarity;
+	),
+
+	TP_printk("%s", __entry->polarity)
+);
+
+/*
+ * Tracepoint for the registration of a single RCU callback function.
+ * The first argument is the type of RCU, the second argument is
+ * a pointer to the RCU callback itself, and the third argument is the
+ * new RCU callback queue length for the current CPU.
+ */
+TRACE_EVENT(rcu_callback,
+
+	TP_PROTO(char *rcuname, struct rcu_head *rhp, long qlen),
+
+	TP_ARGS(rcuname, rhp, qlen),
+
+	TP_STRUCT__entry(
+		__field(char *, rcuname)
+		__field(void *, rhp)
+		__field(void *, func)
+		__field(long, qlen)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->rhp = rhp;
+		__entry->func = rhp->func;
+		__entry->qlen = qlen;
+	),
+
+	TP_printk("%s rhp=%p func=%pf %ld",
+		  __entry->rcuname, __entry->rhp, __entry->func, __entry->qlen)
+);
+
+/*
+ * Tracepoint for the registration of a single RCU callback of the special
+ * kfree() form.  The first argument is the RCU type, the second argument
+ * is a pointer to the RCU callback, the third argument is the offset
+ * of the callback within the enclosing RCU-protected data structure,
+ * and the fourth argument is the new RCU callback queue length for the
+ * current CPU.
+ */
+TRACE_EVENT(rcu_kfree_callback,
+
+	TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset,
+		 long qlen),
+
+	TP_ARGS(rcuname, rhp, offset, qlen),
+
+	TP_STRUCT__entry(
+		__field(char *, rcuname)
+		__field(void *, rhp)
+		__field(unsigned long, offset)
+		__field(long, qlen)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->rhp = rhp;
+		__entry->offset = offset;
+		__entry->qlen = qlen;
+	),
+
+	TP_printk("%s rhp=%p func=%ld %ld",
+		  __entry->rcuname, __entry->rhp, __entry->offset,
+		  __entry->qlen)
+);
+
+/*
+ * Tracepoint for marking the beginning of rcu_do_batch, performed to start
+ * RCU callback invocation.  The first argument is the RCU flavor,
+ * the second is the total number of callbacks (including those that
+ * are not yet ready to be invoked), and the third argument is the
+ * current RCU-callback batch limit.
+ */
+TRACE_EVENT(rcu_batch_start,
+
+	TP_PROTO(char *rcuname, long qlen, int blimit),
+
+	TP_ARGS(rcuname, qlen, blimit),
+
+	TP_STRUCT__entry(
+		__field(char *, rcuname)
+		__field(long, qlen)
+		__field(int, blimit)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->qlen = qlen;
+		__entry->blimit = blimit;
+	),
+
+	TP_printk("%s CBs=%ld bl=%d",
+		  __entry->rcuname, __entry->qlen, __entry->blimit)
+);
+
+/*
+ * Tracepoint for the invocation of a single RCU callback function.
+ * The first argument is the type of RCU, and the second argument is
+ * a pointer to the RCU callback itself.
+ */
+TRACE_EVENT(rcu_invoke_callback,
+
+	TP_PROTO(char *rcuname, struct rcu_head *rhp),
+
+	TP_ARGS(rcuname, rhp),
+
+	TP_STRUCT__entry(
+		__field(char *, rcuname)
+		__field(void *, rhp)
+		__field(void *, func)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->rhp = rhp;
+		__entry->func = rhp->func;
+	),
+
+	TP_printk("%s rhp=%p func=%pf",
+		  __entry->rcuname, __entry->rhp, __entry->func)
+);
+
+/*
+ * Tracepoint for the invocation of a single RCU callback of the special
+ * kfree() form.  The first argument is the RCU flavor, the second
+ * argument is a pointer to the RCU callback, and the third argument
+ * is the offset of the callback within the enclosing RCU-protected
+ * data structure.
+ */
+TRACE_EVENT(rcu_invoke_kfree_callback,
+
+	TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset),
+
+	TP_ARGS(rcuname, rhp, offset),
+
+	TP_STRUCT__entry(
+		__field(char *, rcuname)
+		__field(void *, rhp)
+		__field(unsigned long, offset)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->rhp = rhp;
+		__entry->offset	= offset;
+	),
+
+	TP_printk("%s rhp=%p func=%ld",
+		  __entry->rcuname, __entry->rhp, __entry->offset)
+);
+
+/*
+ * Tracepoint for exiting rcu_do_batch after RCU callbacks have been
+ * invoked.  The first argument is the name of the RCU flavor and
+ * the second argument is the number of callbacks actually invoked.
+ */
+TRACE_EVENT(rcu_batch_end,
+
+	TP_PROTO(char *rcuname, int callbacks_invoked),
+
+	TP_ARGS(rcuname, callbacks_invoked),
+
+	TP_STRUCT__entry(
+		__field(char *, rcuname)
+		__field(int, callbacks_invoked)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->callbacks_invoked = callbacks_invoked;
+	),
+
+	TP_printk("%s CBs-invoked=%d",
+		  __entry->rcuname, __entry->callbacks_invoked)
+);
+
+#else /* #ifdef CONFIG_RCU_TRACE */
+
+#define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0)
+#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, qsmask) do { } while (0)
+#define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0)
+#define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0)
+#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0)
+#define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0)
+#define trace_rcu_dyntick(polarity) do { } while (0)
+#define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0)
+#define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0)
+#define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0)
+#define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0)
+#define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0)
+#define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0)
+
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+#endif /* _TRACE_RCU_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
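As an illustration of the "Start <activity>"/"End <activity>" convention documented for rcu_utilization above, a caller would bracket the measured activity with two events; the strings here are examples only, not events emitted by this patch:

	static void example_measured_section(void)
	{
		trace_rcu_utilization("Start example activity");
		/* ... the work whose utilization is being measured ... */
		trace_rcu_utilization("End example activity @ optional comment");
	}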
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index f633478..959ff18 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -100,7 +100,7 @@
 	 * For all intents and purposes a preempted task is a running task.
 	 */
 	if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)
-		state = TASK_RUNNING;
+		state = TASK_RUNNING | TASK_STATE_MAX;
 #endif
 
 	return state;
@@ -137,13 +137,14 @@
 		__entry->next_prio	= next->prio;
 	),
 
-	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> next_comm=%s next_pid=%d next_prio=%d",
+	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
 		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-		__entry->prev_state ?
-		  __print_flags(__entry->prev_state, "|",
+		__entry->prev_state & (TASK_STATE_MAX-1) ?
+		  __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
 				{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
 				{ 16, "Z" }, { 32, "X" }, { 64, "x" },
 				{ 128, "W" }) : "R",
+		__entry->prev_state & TASK_STATE_MAX ? "+" : "",
 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
 
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 533c49f..7697249 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -711,6 +711,9 @@
 #undef __perf_count
 #define __perf_count(c) __count = (c)
 
+#undef TP_perf_assign
+#define TP_perf_assign(args...) args
+
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static notrace void							\
diff --git a/init/Kconfig b/init/Kconfig
index d627783..31ba0fd 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -391,7 +391,7 @@
 
 config TREE_PREEMPT_RCU
 	bool "Preemptible tree-based hierarchical RCU"
-	depends on PREEMPT
+	depends on PREEMPT && SMP
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP systems with hundreds or
@@ -401,7 +401,7 @@
 
 config TINY_RCU
 	bool "UP-only small-memory-footprint RCU"
-	depends on !SMP
+	depends on !PREEMPT && !SMP
 	help
 	  This option selects the RCU implementation that is
 	  designed for UP systems from which real-time response
@@ -410,7 +410,7 @@
 
 config TINY_PREEMPT_RCU
 	bool "Preemptible UP-only small-memory-footprint RCU"
-	depends on !SMP && PREEMPT
+	depends on PREEMPT && !SMP
 	help
 	  This option selects the RCU implementation that is designed
 	  for real-time UP systems.  This option greatly reduces the
@@ -715,6 +715,18 @@
 	depends on CGROUP_SCHED
 	default CGROUP_SCHED
 
+config CFS_BANDWIDTH
+	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED
+	default n
+	help
+	  This option allows users to define CPU bandwidth rates (limits) for
+	  tasks running within the fair group scheduler.  Groups with no limit
+	  set are considered to be unconstrained and will run with no
+	  restriction.
+	  See tip/Documentation/scheduler/sched-bwc.txt for more information.
+
 config RT_GROUP_SCHED
 	bool "Group scheduling for SCHED_RR/FIFO"
 	depends on EXPERIMENTAL
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0f85778..d1a1bee 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -29,6 +29,7 @@
 #include <linux/hardirq.h>
 #include <linux/rculist.h>
 #include <linux/uaccess.h>
+#include <linux/suspend.h>
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
@@ -5758,6 +5759,7 @@
 	pmu = idr_find(&pmu_idr, event->attr.type);
 	rcu_read_unlock();
 	if (pmu) {
+		event->pmu = pmu;
 		ret = pmu->event_init(event);
 		if (ret)
 			pmu = ERR_PTR(ret);
@@ -5765,6 +5767,7 @@
 	}
 
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		event->pmu = pmu;
 		ret = pmu->event_init(event);
 		if (!ret)
 			goto unlock;
@@ -5891,8 +5894,6 @@
 		return ERR_PTR(err);
 	}
 
-	event->pmu = pmu;
-
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
 			jump_label_inc(&perf_sched_events);
@@ -6852,7 +6853,7 @@
 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
 	mutex_lock(&swhash->hlist_mutex);
-	if (swhash->hlist_refcount > 0) {
+	if (swhash->hlist_refcount > 0 && !swhash->swevent_hlist) {
 		struct swevent_hlist *hlist;
 
 		hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
@@ -6941,7 +6942,14 @@
 {
 	unsigned int cpu = (long)hcpu;
 
-	switch (action & ~CPU_TASKS_FROZEN) {
+	/*
+	 * Ignore suspend/resume action, the perf_pm_notifier will
+	 * take care of that.
+	 */
+	if (action & CPU_TASKS_FROZEN)
+		return NOTIFY_OK;
+
+	switch (action) {
 
 	case CPU_UP_PREPARE:
 	case CPU_DOWN_FAILED:
@@ -6960,6 +6968,90 @@
 	return NOTIFY_OK;
 }
 
+static void perf_pm_resume_cpu(void *unused)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+	struct pmu *pmu;
+	int idx;
+
+	idx = srcu_read_lock(&pmus_srcu);
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+		ctx = cpuctx->task_ctx;
+
+		perf_ctx_lock(cpuctx, ctx);
+		perf_pmu_disable(cpuctx->ctx.pmu);
+
+		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+		if (ctx)
+			ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+
+		perf_pmu_enable(cpuctx->ctx.pmu);
+		perf_ctx_unlock(cpuctx, ctx);
+	}
+	srcu_read_unlock(&pmus_srcu, idx);
+}
+
+static void perf_pm_suspend_cpu(void *unused)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+	struct pmu *pmu;
+	int idx;
+
+	idx = srcu_read_lock(&pmus_srcu);
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+		ctx = cpuctx->task_ctx;
+
+		perf_ctx_lock(cpuctx, ctx);
+		perf_pmu_disable(cpuctx->ctx.pmu);
+
+		perf_event_sched_in(cpuctx, ctx, current);
+
+		perf_pmu_enable(cpuctx->ctx.pmu);
+		perf_ctx_unlock(cpuctx, ctx);
+	}
+	srcu_read_unlock(&pmus_srcu, idx);
+}
+
+static int perf_resume(void)
+{
+	get_online_cpus();
+	smp_call_function(perf_pm_resume_cpu, NULL, 1);
+	put_online_cpus();
+
+	return NOTIFY_OK;
+}
+
+static int perf_suspend(void)
+{
+	get_online_cpus();
+	smp_call_function(perf_pm_suspend_cpu, NULL, 1);
+	put_online_cpus();
+
+	return NOTIFY_OK;
+}
+
+static int perf_pm(struct notifier_block *self, unsigned long action, void *ptr)
+{
+	switch (action) {
+	case PM_POST_HIBERNATION:
+	case PM_POST_SUSPEND:
+		return perf_resume();
+	case PM_HIBERNATION_PREPARE:
+	case PM_SUSPEND_PREPARE:
+		return perf_suspend();
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct notifier_block perf_pm_notifier = {
+	.notifier_call = perf_pm,
+};
+
 void __init perf_event_init(void)
 {
 	int ret;
@@ -6974,6 +7066,7 @@
 	perf_tp_register();
 	perf_cpu_notifier(perf_cpu_notify);
 	register_reboot_notifier(&perf_reboot_notifier);
+	register_pm_notifier(&perf_pm_notifier);
 
 	ret = init_hw_breakpoint();
 	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index dc5114b..f7c543a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -26,7 +26,7 @@
 int irq_set_chip(unsigned int irq, struct irq_chip *chip)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
 
 	if (!desc)
 		return -EINVAL;
@@ -54,7 +54,7 @@
 int irq_set_irq_type(unsigned int irq, unsigned int type)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 	int ret = 0;
 
 	if (!desc)
@@ -78,7 +78,7 @@
 int irq_set_handler_data(unsigned int irq, void *data)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
 
 	if (!desc)
 		return -EINVAL;
@@ -98,7 +98,7 @@
 int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 
 	if (!desc)
 		return -EINVAL;
@@ -119,7 +119,7 @@
 int irq_set_chip_data(unsigned int irq, void *data)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
 
 	if (!desc)
 		return -EINVAL;
@@ -204,6 +204,24 @@
 	}
 }
 
+void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
+{
+	if (desc->irq_data.chip->irq_enable)
+		desc->irq_data.chip->irq_enable(&desc->irq_data);
+	else
+		desc->irq_data.chip->irq_unmask(&desc->irq_data);
+	cpumask_set_cpu(cpu, desc->percpu_enabled);
+}
+
+void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
+{
+	if (desc->irq_data.chip->irq_disable)
+		desc->irq_data.chip->irq_disable(&desc->irq_data);
+	else
+		desc->irq_data.chip->irq_mask(&desc->irq_data);
+	cpumask_clear_cpu(cpu, desc->percpu_enabled);
+}
+
 static inline void mask_ack_irq(struct irq_desc *desc)
 {
 	if (desc->irq_data.chip->irq_mask_ack)
@@ -544,12 +562,44 @@
 		chip->irq_eoi(&desc->irq_data);
 }
 
+/**
+ * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
+ * @irq:	the interrupt number
+ * @desc:	the interrupt description structure for this irq
+ *
+ * Per CPU interrupts on SMP machines without locking requirements. Same as
+ * handle_percpu_irq() above but with the following extras:
+ *
+ * action->percpu_dev_id is a pointer to percpu variables which
+ * contain the real device id for the CPU on which this handler is
+ * called.
+ */
+void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
+{
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+	struct irqaction *action = desc->action;
+	void *dev_id = __this_cpu_ptr(action->percpu_dev_id);
+	irqreturn_t res;
+
+	kstat_incr_irqs_this_cpu(irq, desc);
+
+	if (chip->irq_ack)
+		chip->irq_ack(&desc->irq_data);
+
+	trace_irq_handler_entry(irq, action);
+	res = action->handler(irq, dev_id);
+	trace_irq_handler_exit(irq, action, res);
+
+	if (chip->irq_eoi)
+		chip->irq_eoi(&desc->irq_data);
+}
+
 void
 __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		  const char *name)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
 
 	if (!desc)
 		return;
@@ -593,7 +643,7 @@
 void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
 
 	if (!desc)
 		return;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 6546431..a73dd6c 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -71,6 +71,8 @@
 extern void irq_shutdown(struct irq_desc *desc);
 extern void irq_enable(struct irq_desc *desc);
 extern void irq_disable(struct irq_desc *desc);
+extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
+extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
 extern void mask_irq(struct irq_desc *desc);
 extern void unmask_irq(struct irq_desc *desc);
 
@@ -114,14 +116,21 @@
 		desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
 }
 
+#define _IRQ_DESC_CHECK		(1 << 0)
+#define _IRQ_DESC_PERCPU	(1 << 1)
+
+#define IRQ_GET_DESC_CHECK_GLOBAL	(_IRQ_DESC_CHECK)
+#define IRQ_GET_DESC_CHECK_PERCPU	(_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
+
 struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+		    unsigned int check);
 void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
 
 static inline struct irq_desc *
-irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
+irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
 {
-	return __irq_get_desc_lock(irq, flags, true);
+	return __irq_get_desc_lock(irq, flags, true, check);
 }
 
 static inline void
@@ -131,9 +140,9 @@
 }
 
 static inline struct irq_desc *
-irq_get_desc_lock(unsigned int irq, unsigned long *flags)
+irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
 {
-	return __irq_get_desc_lock(irq, flags, false);
+	return __irq_get_desc_lock(irq, flags, false, check);
 }
 
 static inline void
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 039b889..1550e84 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -424,11 +424,22 @@
 }
 
 struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+		    unsigned int check)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
 	if (desc) {
+		if (check & _IRQ_DESC_CHECK) {
+			if ((check & _IRQ_DESC_PERCPU) &&
+			    !irq_settings_is_per_cpu_devid(desc))
+				return NULL;
+
+			if (!(check & _IRQ_DESC_PERCPU) &&
+			    irq_settings_is_per_cpu_devid(desc))
+				return NULL;
+		}
+
 		if (bus)
 			chip_bus_lock(desc);
 		raw_spin_lock_irqsave(&desc->lock, *flags);
@@ -443,6 +454,25 @@
 		chip_bus_sync_unlock(desc);
 }
 
+int irq_set_percpu_devid(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (!desc)
+		return -EINVAL;
+
+	if (desc->percpu_enabled)
+		return -EINVAL;
+
+	desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
+
+	if (!desc->percpu_enabled)
+		return -ENOMEM;
+
+	irq_set_percpu_devid_flags(irq);
+	return 0;
+}
+
 /**
  * dynamic_irq_cleanup - cleanup a dynamically allocated irq
  * @irq:	irq number to initialize
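irq_set_percpu_devid() is intended to be called by interrupt-controller setup code before any driver requests the line; a hedged sketch of that path (my_chip and my_irqchip_init_ppi() are invented, and error handling is omitted):

	static struct irq_chip my_chip;		/* placeholder irqchip */

	static void __init my_irqchip_init_ppi(unsigned int irq)
	{
		/* Mark the descriptor as carrying per-CPU device ids... */
		irq_set_percpu_devid(irq);
		/* ...and route it through the per-CPU flow handler. */
		irq_set_chip_and_handler(irq, &my_chip, handle_percpu_devid_irq);
	}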
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9b956fa..67ce837 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -195,7 +195,7 @@
 int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 
 	if (!desc)
 		return -EINVAL;
@@ -356,7 +356,7 @@
 static int __disable_irq_nosync(unsigned int irq)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 
 	if (!desc)
 		return -EINVAL;
@@ -448,7 +448,7 @@
 void enable_irq(unsigned int irq)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 
 	if (!desc)
 		return;
@@ -467,6 +467,9 @@
 	struct irq_desc *desc = irq_to_desc(irq);
 	int ret = -ENXIO;
 
+	if (irq_desc_get_chip(desc)->flags &  IRQCHIP_SKIP_SET_WAKE)
+		return 0;
+
 	if (desc->irq_data.chip->irq_set_wake)
 		ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
 
@@ -488,7 +491,7 @@
 int irq_set_irq_wake(unsigned int irq, unsigned int on)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
 	int ret = 0;
 
 	if (!desc)
@@ -529,7 +532,7 @@
 int can_request_irq(unsigned int irq, unsigned long irqflags)
 {
 	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
 	int canrequest = 0;
 
 	if (!desc)
@@ -1118,6 +1121,8 @@
 	int retval;
 	struct irq_desc *desc = irq_to_desc(irq);
 
+	if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+		return -EINVAL;
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, act);
 	chip_bus_sync_unlock(desc);
@@ -1126,7 +1131,7 @@
 }
 EXPORT_SYMBOL_GPL(setup_irq);
 
- /*
+/*
  * Internal function to unregister an irqaction - used to free
  * regular and special interrupts that are part of the architecture.
  */
@@ -1224,7 +1229,10 @@
  */
 void remove_irq(unsigned int irq, struct irqaction *act)
 {
-	__free_irq(irq, act->dev_id);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+	    __free_irq(irq, act->dev_id);
 }
 EXPORT_SYMBOL_GPL(remove_irq);
 
@@ -1246,7 +1254,7 @@
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (!desc)
+	if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
 		return;
 
 #ifdef CONFIG_SMP
@@ -1324,7 +1332,8 @@
 	if (!desc)
 		return -EINVAL;
 
-	if (!irq_settings_can_request(desc))
+	if (!irq_settings_can_request(desc) ||
+	    WARN_ON(irq_settings_is_per_cpu_devid(desc)))
 		return -EINVAL;
 
 	if (!handler) {
@@ -1409,3 +1418,194 @@
 	return !ret ? IRQC_IS_HARDIRQ : ret;
 }
 EXPORT_SYMBOL_GPL(request_any_context_irq);
+
+void enable_percpu_irq(unsigned int irq, unsigned int type)
+{
+	unsigned int cpu = smp_processor_id();
+	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+
+	if (!desc)
+		return;
+
+	type &= IRQ_TYPE_SENSE_MASK;
+	if (type != IRQ_TYPE_NONE) {
+		int ret;
+
+		ret = __irq_set_trigger(desc, irq, type);
+
+		if (ret) {
+			WARN(1, "failed to set type for IRQ%d\n", irq);
+			goto out;
+		}
+	}
+
+	irq_percpu_enable(desc, cpu);
+out:
+	irq_put_desc_unlock(desc, flags);
+}
+
+void disable_percpu_irq(unsigned int irq)
+{
+	unsigned int cpu = smp_processor_id();
+	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+
+	if (!desc)
+		return;
+
+	irq_percpu_disable(desc, cpu);
+	irq_put_desc_unlock(desc, flags);
+}
+
+/*
+ * Internal function to unregister a percpu irqaction.
+ */
+static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irqaction *action;
+	unsigned long flags;
+
+	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
+
+	if (!desc)
+		return NULL;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+
+	action = desc->action;
+	if (!action || action->percpu_dev_id != dev_id) {
+		WARN(1, "Trying to free already-free IRQ %d\n", irq);
+		goto bad;
+	}
+
+	if (!cpumask_empty(desc->percpu_enabled)) {
+		WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
+		     irq, cpumask_first(desc->percpu_enabled));
+		goto bad;
+	}
+
+	/* Found it - now remove it from the list of entries: */
+	desc->action = NULL;
+
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+	unregister_handler_proc(irq, action);
+
+	module_put(desc->owner);
+	return action;
+
+bad:
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+	return NULL;
+}
+
+/**
+ *	remove_percpu_irq - free a per-cpu interrupt
+ *	@irq: Interrupt line to free
+ *	@act: irqaction for the interrupt
+ *
+ * Used to remove interrupts statically set up by the early boot process.
+ */
+void remove_percpu_irq(unsigned int irq, struct irqaction *act)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (desc && irq_settings_is_per_cpu_devid(desc))
+	    __free_percpu_irq(irq, act->percpu_dev_id);
+}
+
+/**
+ *	free_percpu_irq - free an interrupt allocated with request_percpu_irq
+ *	@irq: Interrupt line to free
+ *	@dev_id: Device identity to free
+ *
+ *	Remove a percpu interrupt handler. The handler is removed, but
+ *	the interrupt line is not disabled. This must be done on each
+ *	CPU before calling this function. The function does not return
+ *	until any executing interrupts for this IRQ have completed.
+ *
+ *	This function must not be called from interrupt context.
+ */
+void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (!desc || !irq_settings_is_per_cpu_devid(desc))
+		return;
+
+	chip_bus_lock(desc);
+	kfree(__free_percpu_irq(irq, dev_id));
+	chip_bus_sync_unlock(desc);
+}
+
+/**
+ *	setup_percpu_irq - setup a per-cpu interrupt
+ *	@irq: Interrupt line to setup
+ *	@act: irqaction for the interrupt
+ *
+ * Used to statically set up per-cpu interrupts in the early boot process.
+ */
+int setup_percpu_irq(unsigned int irq, struct irqaction *act)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	int retval;
+
+	if (!desc || !irq_settings_is_per_cpu_devid(desc))
+		return -EINVAL;
+	chip_bus_lock(desc);
+	retval = __setup_irq(irq, desc, act);
+	chip_bus_sync_unlock(desc);
+
+	return retval;
+}
+
+/**
+ *	request_percpu_irq - allocate a percpu interrupt line
+ *	@irq: Interrupt line to allocate
+ *	@handler: Function to be called when the IRQ occurs.
+ *	@devname: An ascii name for the claiming device
+ *	@dev_id: A percpu cookie passed back to the handler function
+ *
+ *	This call allocates interrupt resources, but doesn't
+ *	automatically enable the interrupt. It has to be done on each
+ *	CPU using enable_percpu_irq().
+ *
+ *	Dev_id must be globally unique. It is a per-cpu variable, and
+ *	the handler gets called with the interrupted CPU's instance of
+ *	that variable.
+ */
+int request_percpu_irq(unsigned int irq, irq_handler_t handler,
+		       const char *devname, void __percpu *dev_id)
+{
+	struct irqaction *action;
+	struct irq_desc *desc;
+	int retval;
+
+	if (!dev_id)
+		return -EINVAL;
+
+	desc = irq_to_desc(irq);
+	if (!desc || !irq_settings_can_request(desc) ||
+	    !irq_settings_is_per_cpu_devid(desc))
+		return -EINVAL;
+
+	action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+	if (!action)
+		return -ENOMEM;
+
+	action->handler = handler;
+	action->flags = IRQF_PERCPU;
+	action->name = devname;
+	action->percpu_dev_id = dev_id;
+
+	chip_bus_lock(desc);
+	retval = __setup_irq(irq, desc, action);
+	chip_bus_sync_unlock(desc);
+
+	if (retval)
+		kfree(action);
+
+	return retval;
+}
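Tying the new driver-facing calls together, a hedged sketch of how a per-CPU timer driver might use them; every my_* name is invented, the IRQ number is assumed to come from platform code, and teardown (disable_percpu_irq() plus free_percpu_irq()) is left out:

	#include <linux/interrupt.h>
	#include <linux/irq.h>
	#include <linux/percpu.h>

	struct my_percpu_state {
		unsigned long count;
	};

	static DEFINE_PER_CPU(struct my_percpu_state, my_state);
	static unsigned int my_timer_irq;	/* assumed set by platform code */

	static irqreturn_t my_timer_handler(int irq, void *dev_id)
	{
		struct my_percpu_state *state = dev_id;	/* this CPU's instance */

		state->count++;
		return IRQ_HANDLED;
	}

	static int my_timer_init(void)
	{
		int err;

		err = request_percpu_irq(my_timer_irq, my_timer_handler,
					 "my-timer", &my_state);
		if (err)
			return err;

		/* Each CPU must enable its own copy of the line. */
		enable_percpu_irq(my_timer_irq, IRQ_TYPE_NONE);
		return 0;
	}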
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f76fc00..15e53b1 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,6 +9,7 @@
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
+#include <linux/syscore_ops.h>
 
 #include "internals.h"
 
@@ -39,25 +40,58 @@
 }
 EXPORT_SYMBOL_GPL(suspend_device_irqs);
 
-/**
- * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
- *
- * Enable all interrupt lines previously disabled by suspend_device_irqs() that
- * have the IRQS_SUSPENDED flag set.
- */
-void resume_device_irqs(void)
+static void resume_irqs(bool want_early)
 {
 	struct irq_desc *desc;
 	int irq;
 
 	for_each_irq_desc(irq, desc) {
 		unsigned long flags;
+		bool is_early = desc->action &&
+			desc->action->flags & IRQF_EARLY_RESUME;
+
+		if (is_early != want_early)
+			continue;
 
 		raw_spin_lock_irqsave(&desc->lock, flags);
 		__enable_irq(desc, irq, true);
 		raw_spin_unlock_irqrestore(&desc->lock, flags);
 	}
 }
+
+/**
+ * irq_pm_syscore_resume - enable interrupt lines early
+ *
+ * Enable all interrupt lines with %IRQF_EARLY_RESUME set.
+ */
+static void irq_pm_syscore_resume(void)
+{
+	resume_irqs(true);
+}
+
+static struct syscore_ops irq_pm_syscore_ops = {
+	.resume		= irq_pm_syscore_resume,
+};
+
+static int __init irq_pm_init_ops(void)
+{
+	register_syscore_ops(&irq_pm_syscore_ops);
+	return 0;
+}
+
+device_initcall(irq_pm_init_ops);
+
+/**
+ * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
+ *
+ * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously
+ * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag
+ * set as well as those with %IRQF_FORCE_RESUME.
+ */
+void resume_device_irqs(void)
+{
+	resume_irqs(false);
+}
 EXPORT_SYMBOL_GPL(resume_device_irqs);
 
 /**
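
A hypothetical consumer of this path: a driver whose interrupt must work before
ordinary device resume requests it with IRQF_EARLY_RESUME, so that
irq_pm_syscore_resume() above re-enables it during the syscore phase, ahead of
resume_device_irqs() (device, handler and IRQ names below are invented):

	err = request_irq(my_dev->irq, my_pmic_irq_handler,
			  IRQF_TRIGGER_LOW | IRQF_EARLY_RESUME,
			  "my-pmic", my_dev);
	if (err)
		return err;
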
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index f166783..1162f10 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -13,6 +13,7 @@
 	_IRQ_MOVE_PCNTXT	= IRQ_MOVE_PCNTXT,
 	_IRQ_NO_BALANCING	= IRQ_NO_BALANCING,
 	_IRQ_NESTED_THREAD	= IRQ_NESTED_THREAD,
+	_IRQ_PER_CPU_DEVID	= IRQ_PER_CPU_DEVID,
 	_IRQF_MODIFY_MASK	= IRQF_MODIFY_MASK,
 };
 
@@ -24,6 +25,7 @@
 #define IRQ_NOTHREAD		GOT_YOU_MORON
 #define IRQ_NOAUTOEN		GOT_YOU_MORON
 #define IRQ_NESTED_THREAD	GOT_YOU_MORON
+#define IRQ_PER_CPU_DEVID	GOT_YOU_MORON
 #undef IRQF_MODIFY_MASK
 #define IRQF_MODIFY_MASK	GOT_YOU_MORON
 
@@ -39,6 +41,11 @@
 	return desc->status_use_accessors & _IRQ_PER_CPU;
 }
 
+static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc)
+{
+	return desc->status_use_accessors & _IRQ_PER_CPU_DEVID;
+}
+
 static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
 {
 	desc->status_use_accessors |= _IRQ_PER_CPU;
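
For completeness, the flag tested by irq_settings_is_per_cpu_devid() is set by
irqchip code at mapping time.  irq_set_percpu_devid() and
handle_percpu_devid_irq() belong to the same series but are not part of this
hunk, so the lines below are an assumption about the intended usage, not taken
from this patch:

	/* Hypothetical irqchip setup for a banked per-CPU interrupt. */
	irq_set_percpu_devid(virq);
	irq_set_chip_and_handler(virq, &my_irq_chip, handle_percpu_devid_irq);
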
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c58fa7d..0e2cde4 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -17,54 +17,34 @@
  * claimed   NULL, 3 -> {pending}       : claimed to be enqueued
  * pending   next, 3 -> {busy}          : queued, pending callback
  * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
- *
- * We use the lower two bits of the next pointer to keep PENDING and BUSY
- * flags.
  */
 
 #define IRQ_WORK_PENDING	1UL
 #define IRQ_WORK_BUSY		2UL
 #define IRQ_WORK_FLAGS		3UL
 
-static inline bool irq_work_is_set(struct irq_work *entry, int flags)
-{
-	return (unsigned long)entry->next & flags;
-}
-
-static inline struct irq_work *irq_work_next(struct irq_work *entry)
-{
-	unsigned long next = (unsigned long)entry->next;
-	next &= ~IRQ_WORK_FLAGS;
-	return (struct irq_work *)next;
-}
-
-static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
-{
-	unsigned long next = (unsigned long)entry;
-	next |= flags;
-	return (struct irq_work *)next;
-}
-
-static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
+static DEFINE_PER_CPU(struct llist_head, irq_work_list);
 
 /*
  * Claim the entry so that no one else will poke at it.
  */
-static bool irq_work_claim(struct irq_work *entry)
+static bool irq_work_claim(struct irq_work *work)
 {
-	struct irq_work *next, *nflags;
+	unsigned long flags, nflags;
 
-	do {
-		next = entry->next;
-		if ((unsigned long)next & IRQ_WORK_PENDING)
+	for (;;) {
+		flags = work->flags;
+		if (flags & IRQ_WORK_PENDING)
 			return false;
-		nflags = next_flags(next, IRQ_WORK_FLAGS);
-	} while (cmpxchg(&entry->next, next, nflags) != next);
+		nflags = flags | IRQ_WORK_FLAGS;
+		if (cmpxchg(&work->flags, flags, nflags) == flags)
+			break;
+		cpu_relax();
+	}
 
 	return true;
 }
 
-
 void __weak arch_irq_work_raise(void)
 {
 	/*
@@ -75,20 +55,15 @@
 /*
  * Queue the entry and raise the IPI if needed.
  */
-static void __irq_work_queue(struct irq_work *entry)
+static void __irq_work_queue(struct irq_work *work)
 {
-	struct irq_work *next;
+	bool empty;
 
 	preempt_disable();
 
-	do {
-		next = __this_cpu_read(irq_work_list);
-		/* Can assign non-atomic because we keep the flags set. */
-		entry->next = next_flags(next, IRQ_WORK_FLAGS);
-	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
-
+	empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
 	/* The list was empty, raise self-interrupt to start processing. */
-	if (!irq_work_next(entry))
+	if (empty)
 		arch_irq_work_raise();
 
 	preempt_enable();
@@ -100,16 +75,16 @@
  *
  * Can be re-enqueued while the callback is still in progress.
  */
-bool irq_work_queue(struct irq_work *entry)
+bool irq_work_queue(struct irq_work *work)
 {
-	if (!irq_work_claim(entry)) {
+	if (!irq_work_claim(work)) {
 		/*
 		 * Already enqueued, can't do!
 		 */
 		return false;
 	}
 
-	__irq_work_queue(entry);
+	__irq_work_queue(work);
 	return true;
 }
 EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -120,34 +95,34 @@
  */
 void irq_work_run(void)
 {
-	struct irq_work *list;
+	struct irq_work *work;
+	struct llist_head *this_list;
+	struct llist_node *llnode;
 
-	if (this_cpu_read(irq_work_list) == NULL)
+	this_list = &__get_cpu_var(irq_work_list);
+	if (llist_empty(this_list))
 		return;
 
 	BUG_ON(!in_irq());
 	BUG_ON(!irqs_disabled());
 
-	list = this_cpu_xchg(irq_work_list, NULL);
+	llnode = llist_del_all(this_list);
+	while (llnode != NULL) {
+		work = llist_entry(llnode, struct irq_work, llnode);
 
-	while (list != NULL) {
-		struct irq_work *entry = list;
-
-		list = irq_work_next(list);
+		llnode = llist_next(llnode);
 
 		/*
-		 * Clear the PENDING bit, after this point the @entry
+		 * Clear the PENDING bit, after this point the @work
 		 * can be re-used.
 		 */
-		entry->next = next_flags(NULL, IRQ_WORK_BUSY);
-		entry->func(entry);
+		work->flags = IRQ_WORK_BUSY;
+		work->func(work);
 		/*
 		 * Clear the BUSY bit and return to the free state if
 		 * no-one else claimed it meanwhile.
 		 */
-		(void)cmpxchg(&entry->next,
-			      next_flags(NULL, IRQ_WORK_BUSY),
-			      NULL);
+		(void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
 	}
 }
 EXPORT_SYMBOL_GPL(irq_work_run);
@@ -156,11 +131,11 @@
  * Synchronize against the irq_work @entry, ensures the entry is not
  * currently in use.
  */
-void irq_work_sync(struct irq_work *entry)
+void irq_work_sync(struct irq_work *work)
 {
 	WARN_ON_ONCE(irqs_disabled());
 
-	while (irq_work_is_set(entry, IRQ_WORK_BUSY))
+	while (work->flags & IRQ_WORK_BUSY)
 		cpu_relax();
 }
 EXPORT_SYMBOL_GPL(irq_work_sync);
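
For context, a minimal (hypothetical) user of this API; the claim protocol above
is what makes the single call below safe from NMI or hard-IRQ context:

	#include <linux/irq_work.h>

	static void my_work_func(struct irq_work *work)
	{
		/* Runs from the self-IPI raised by arch_irq_work_raise(). */
	}

	static struct irq_work my_work = {
		.flags	= 0,
		.func	= my_work_func,
	};

	static void my_nmi_handler(void)
	{
		/*
		 * irq_work_claim() sets PENDING|BUSY in ->flags; if the work
		 * is already pending this is a no-op, otherwise llist_add()
		 * pushes ->llnode onto this CPU's irq_work_list and the
		 * callback runs from the next irq_work_run().
		 */
		irq_work_queue(&my_work);
	}
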
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c081fa9..e69434b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1145,10 +1145,11 @@
 	if (debug_locks_silent)
 		return 0;
 
-	printk("\n=======================================================\n");
-	printk(  "[ INFO: possible circular locking dependency detected ]\n");
+	printk("\n");
+	printk("======================================================\n");
+	printk("[ INFO: possible circular locking dependency detected ]\n");
 	print_kernel_version();
-	printk(  "-------------------------------------------------------\n");
+	printk("-------------------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, task_pid_nr(curr));
 	print_lock(check_src);
@@ -1482,11 +1483,12 @@
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	printk("\n======================================================\n");
-	printk(  "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
+	printk("\n");
+	printk("======================================================\n");
+	printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
 		irqclass, irqclass);
 	print_kernel_version();
-	printk(  "------------------------------------------------------\n");
+	printk("------------------------------------------------------\n");
 	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
 		curr->comm, task_pid_nr(curr),
 		curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1711,10 +1713,11 @@
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	printk("\n=============================================\n");
-	printk(  "[ INFO: possible recursive locking detected ]\n");
+	printk("\n");
+	printk("=============================================\n");
+	printk("[ INFO: possible recursive locking detected ]\n");
 	print_kernel_version();
-	printk(  "---------------------------------------------\n");
+	printk("---------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, task_pid_nr(curr));
 	print_lock(next);
@@ -2217,10 +2220,11 @@
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	printk("\n=================================\n");
-	printk(  "[ INFO: inconsistent lock state ]\n");
+	printk("\n");
+	printk("=================================\n");
+	printk("[ INFO: inconsistent lock state ]\n");
 	print_kernel_version();
-	printk(  "---------------------------------\n");
+	printk("---------------------------------\n");
 
 	printk("inconsistent {%s} -> {%s} usage.\n",
 		usage_str[prev_bit], usage_str[new_bit]);
@@ -2281,10 +2285,11 @@
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	printk("\n=========================================================\n");
-	printk(  "[ INFO: possible irq lock inversion dependency detected ]\n");
+	printk("\n");
+	printk("=========================================================\n");
+	printk("[ INFO: possible irq lock inversion dependency detected ]\n");
 	print_kernel_version();
-	printk(  "---------------------------------------------------------\n");
+	printk("---------------------------------------------------------\n");
 	printk("%s/%d just changed the state of lock:\n",
 		curr->comm, task_pid_nr(curr));
 	print_lock(this);
@@ -3161,9 +3166,10 @@
 	if (debug_locks_silent)
 		return 0;
 
-	printk("\n=====================================\n");
-	printk(  "[ BUG: bad unlock balance detected! ]\n");
-	printk(  "-------------------------------------\n");
+	printk("\n");
+	printk("=====================================\n");
+	printk("[ BUG: bad unlock balance detected! ]\n");
+	printk("-------------------------------------\n");
 	printk("%s/%d is trying to release lock (",
 		curr->comm, task_pid_nr(curr));
 	print_lockdep_cache(lock);
@@ -3604,9 +3610,10 @@
 	if (debug_locks_silent)
 		return 0;
 
-	printk("\n=================================\n");
-	printk(  "[ BUG: bad contention detected! ]\n");
-	printk(  "---------------------------------\n");
+	printk("\n");
+	printk("=================================\n");
+	printk("[ BUG: bad contention detected! ]\n");
+	printk("---------------------------------\n");
 	printk("%s/%d is trying to contend lock (",
 		curr->comm, task_pid_nr(curr));
 	print_lockdep_cache(lock);
@@ -3977,9 +3984,10 @@
 	if (debug_locks_silent)
 		return;
 
-	printk("\n=========================\n");
-	printk(  "[ BUG: held lock freed! ]\n");
-	printk(  "-------------------------\n");
+	printk("\n");
+	printk("=========================\n");
+	printk("[ BUG: held lock freed! ]\n");
+	printk("-------------------------\n");
 	printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
 		curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
 	print_lock(hlock);
@@ -4033,9 +4041,10 @@
 	if (debug_locks_silent)
 		return;
 
-	printk("\n=====================================\n");
-	printk(  "[ BUG: lock held at task exit time! ]\n");
-	printk(  "-------------------------------------\n");
+	printk("\n");
+	printk("=====================================\n");
+	printk("[ BUG: lock held at task exit time! ]\n");
+	printk("-------------------------------------\n");
 	printk("%s/%d is exiting with locks still held!\n",
 		curr->comm, task_pid_nr(curr));
 	lockdep_print_held_locks(curr);
@@ -4129,16 +4138,17 @@
 	if (unlikely(curr->lockdep_depth)) {
 		if (!debug_locks_off())
 			return;
-		printk("\n================================================\n");
-		printk(  "[ BUG: lock held when returning to user space! ]\n");
-		printk(  "------------------------------------------------\n");
+		printk("\n");
+		printk("================================================\n");
+		printk("[ BUG: lock held when returning to user space! ]\n");
+		printk("------------------------------------------------\n");
 		printk("%s/%d is leaving the kernel with locks still held!\n",
 				curr->comm, curr->pid);
 		lockdep_print_held_locks(curr);
 	}
 }
 
-void lockdep_rcu_dereference(const char *file, const int line)
+void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
 {
 	struct task_struct *curr = current;
 
@@ -4147,15 +4157,15 @@
 		return;
 #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
 	/* Note: the following can be executed concurrently, so be careful. */
-	printk("\n===================================================\n");
-	printk(  "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
-	printk(  "---------------------------------------------------\n");
-	printk("%s:%d invoked rcu_dereference_check() without protection!\n",
-			file, line);
+	printk("\n");
+	printk("===============================\n");
+	printk("[ INFO: suspicious RCU usage. ]\n");
+	printk("-------------------------------\n");
+	printk("%s:%d %s!\n", file, line, s);
 	printk("\nother info that might help us debug this:\n\n");
 	printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
 	lockdep_print_held_locks(curr);
 	printk("\nstack backtrace:\n");
 	dump_stack();
 }
-EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
+EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
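
The message string comes from the callers; the caller-side helper in rcupdate.h
is assumed to look roughly like the sketch below (see the kernel/pid.c hunk
further down for a converted call site), so each check can now say which lock
or read-side critical section it expected:

	#define rcu_lockdep_assert(c, s)					\
		do {								\
			static bool __warned;					\
			if (debug_lockdep_rcu_enabled() && !__warned && !(c)) {	\
				__warned = true;				\
				lockdep_rcu_suspicious(__FILE__, __LINE__, s);	\
			}							\
		} while (0)
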
diff --git a/kernel/module.c b/kernel/module.c
index 04379f92..93342d9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3487,50 +3487,3 @@
 }
 EXPORT_SYMBOL(module_layout);
 #endif
-
-#ifdef CONFIG_TRACEPOINTS
-void module_update_tracepoints(void)
-{
-	struct module *mod;
-
-	mutex_lock(&module_mutex);
-	list_for_each_entry(mod, &modules, list)
-		if (!mod->taints)
-			tracepoint_update_probe_range(mod->tracepoints_ptrs,
-				mod->tracepoints_ptrs + mod->num_tracepoints);
-	mutex_unlock(&module_mutex);
-}
-
-/*
- * Returns 0 if current not found.
- * Returns 1 if current found.
- */
-int module_get_iter_tracepoints(struct tracepoint_iter *iter)
-{
-	struct module *iter_mod;
-	int found = 0;
-
-	mutex_lock(&module_mutex);
-	list_for_each_entry(iter_mod, &modules, list) {
-		if (!iter_mod->taints) {
-			/*
-			 * Sorted module list
-			 */
-			if (iter_mod < iter->module)
-				continue;
-			else if (iter_mod > iter->module)
-				iter->tracepoint = NULL;
-			found = tracepoint_get_iter_range(&iter->tracepoint,
-				iter_mod->tracepoints_ptrs,
-				iter_mod->tracepoints_ptrs
-					+ iter_mod->num_tracepoints);
-			if (found) {
-				iter->module = iter_mod;
-				break;
-			}
-		}
-	}
-	mutex_unlock(&module_mutex);
-	return found;
-}
-#endif
diff --git a/kernel/pid.c b/kernel/pid.c
index e432057..8cafe7e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -418,7 +418,9 @@
  */
 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
 {
-	rcu_lockdep_assert(rcu_read_lock_held());
+	rcu_lockdep_assert(rcu_read_lock_held(),
+			   "find_task_by_pid_ns() needs rcu_read_lock()"
+			   " protection");
 	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
 }
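
An illustrative caller that satisfies the assertion (standard pattern, not taken
from this patch):

	struct task_struct *tsk;

	rcu_read_lock();
	tsk = find_task_by_pid_ns(nr, ns);
	if (tsk)
		get_task_struct(tsk);	/* pin before leaving the read side */
	rcu_read_unlock();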
 
diff --git a/kernel/rcu.h b/kernel/rcu.h
new file mode 100644
index 0000000..f600868
--- /dev/null
+++ b/kernel/rcu.h
@@ -0,0 +1,85 @@
+/*
+ * Read-Copy Update definitions shared among RCU implementations.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2011
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#ifndef __LINUX_RCU_H
+#define __LINUX_RCU_H
+
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(stmt) stmt
+#else /* #ifdef CONFIG_RCU_TRACE */
+#define RCU_TRACE(stmt)
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
+ * by call_rcu() and rcu callback execution, and are therefore not part of the
+ * RCU API.  They live here in kernel/rcu.h because they are used by all RCU flavors.
+ */
+
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+# define STATE_RCU_HEAD_READY	0
+# define STATE_RCU_HEAD_QUEUED	1
+
+extern struct debug_obj_descr rcuhead_debug_descr;
+
+static inline void debug_rcu_head_queue(struct rcu_head *head)
+{
+	WARN_ON_ONCE((unsigned long)head & 0x3);
+	debug_object_activate(head, &rcuhead_debug_descr);
+	debug_object_active_state(head, &rcuhead_debug_descr,
+				  STATE_RCU_HEAD_READY,
+				  STATE_RCU_HEAD_QUEUED);
+}
+
+static inline void debug_rcu_head_unqueue(struct rcu_head *head)
+{
+	debug_object_active_state(head, &rcuhead_debug_descr,
+				  STATE_RCU_HEAD_QUEUED,
+				  STATE_RCU_HEAD_READY);
+	debug_object_deactivate(head, &rcuhead_debug_descr);
+}
+#else	/* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+static inline void debug_rcu_head_queue(struct rcu_head *head)
+{
+}
+
+static inline void debug_rcu_head_unqueue(struct rcu_head *head)
+{
+}
+#endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+
+extern void kfree(const void *);
+
+static inline void __rcu_reclaim(char *rn, struct rcu_head *head)
+{
+	unsigned long offset = (unsigned long)head->func;
+
+	if (__is_kfree_rcu_offset(offset)) {
+		RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
+		kfree((void *)head - offset);
+	} else {
+		RCU_TRACE(trace_rcu_invoke_callback(rn, head));
+		head->func(head);
+	}
+}
+
+#endif /* __LINUX_RCU_H */
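
__rcu_reclaim() above decodes callbacks whose "function pointer" is really a
small offset into the enclosing structure; the producer side is kfree_rcu(),
sketched here for context (struct foo is invented):

	struct foo {
		int data;
		struct rcu_head rcu;
	};

	static void release_foo(struct foo *p)
	{
		/*
		 * Roughly call_rcu(&p->rcu, (fn)offsetof(struct foo, rcu)):
		 * structure offsets are far smaller than any kernel text
		 * address, which is what __is_kfree_rcu_offset() relies on,
		 * and kfree((void *)head - offset) then recovers p.
		 */
		kfree_rcu(p, rcu);
	}
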
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ddddb32..ca0d23b 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,6 +46,11 @@
 #include <linux/module.h>
 #include <linux/hardirq.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/rcu.h>
+
+#include "rcu.h"
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
 struct lockdep_map rcu_lock_map =
@@ -94,11 +99,16 @@
 
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
+struct rcu_synchronize {
+	struct rcu_head head;
+	struct completion completion;
+};
+
 /*
  * Awaken the corresponding synchronize_rcu() instance now that a
  * grace period has elapsed.
  */
-void wakeme_after_rcu(struct rcu_head  *head)
+static void wakeme_after_rcu(struct rcu_head  *head)
 {
 	struct rcu_synchronize *rcu;
 
@@ -106,6 +116,20 @@
 	complete(&rcu->completion);
 }
 
+void wait_rcu_gp(call_rcu_func_t crf)
+{
+	struct rcu_synchronize rcu;
+
+	init_rcu_head_on_stack(&rcu.head);
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished. */
+	crf(&rcu.head, wakeme_after_rcu);
+	/* Wait for it. */
+	wait_for_completion(&rcu.completion);
+	destroy_rcu_head_on_stack(&rcu.head);
+}
+EXPORT_SYMBOL_GPL(wait_rcu_gp);
+
 #ifdef CONFIG_PROVE_RCU
 /*
  * wrapper function to avoid #include problems.
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 7bbac7d..da775c8 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -37,16 +37,17 @@
 #include <linux/cpu.h>
 #include <linux/prefetch.h>
 
-/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
-static struct task_struct *rcu_kthread_task;
-static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
-static unsigned long have_rcu_kthread_work;
+#ifdef CONFIG_RCU_TRACE
+#include <trace/events/rcu.h>
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+#include "rcu.h"
 
 /* Forward declarations for rcutiny_plugin.h. */
 struct rcu_ctrlblk;
-static void invoke_rcu_kthread(void);
-static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
-static int rcu_kthread(void *arg);
+static void invoke_rcu_callbacks(void);
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static void rcu_process_callbacks(struct softirq_action *unused);
 static void __call_rcu(struct rcu_head *head,
 		       void (*func)(struct rcu_head *rcu),
 		       struct rcu_ctrlblk *rcp);
@@ -96,16 +97,6 @@
 }
 
 /*
- * Wake up rcu_kthread() to process callbacks now eligible for invocation
- * or to boost readers.
- */
-static void invoke_rcu_kthread(void)
-{
-	have_rcu_kthread_work = 1;
-	wake_up(&rcu_kthread_wq);
-}
-
-/*
  * Record an rcu quiescent state.  And an rcu_bh quiescent state while we
  * are at it, given that any rcu quiescent state is also an rcu_bh
  * quiescent state.  Use "+" instead of "||" to defeat short circuiting.
@@ -117,7 +108,7 @@
 	local_irq_save(flags);
 	if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
 	    rcu_qsctr_help(&rcu_bh_ctrlblk))
-		invoke_rcu_kthread();
+		invoke_rcu_callbacks();
 	local_irq_restore(flags);
 }
 
@@ -130,7 +121,7 @@
 
 	local_irq_save(flags);
 	if (rcu_qsctr_help(&rcu_bh_ctrlblk))
-		invoke_rcu_kthread();
+		invoke_rcu_callbacks();
 	local_irq_restore(flags);
 }
 
@@ -154,18 +145,23 @@
  * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
  * whose grace period has elapsed.
  */
-static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 {
+	char *rn = NULL;
 	struct rcu_head *next, *list;
 	unsigned long flags;
 	RCU_TRACE(int cb_count = 0);
 
 	/* If no RCU callbacks ready to invoke, just return. */
-	if (&rcp->rcucblist == rcp->donetail)
+	if (&rcp->rcucblist == rcp->donetail) {
+		RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
+		RCU_TRACE(trace_rcu_batch_end(rcp->name, 0));
 		return;
+	}
 
 	/* Move the ready-to-invoke callbacks to a local list. */
 	local_irq_save(flags);
+	RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
 	list = rcp->rcucblist;
 	rcp->rcucblist = *rcp->donetail;
 	*rcp->donetail = NULL;
@@ -176,49 +172,26 @@
 	local_irq_restore(flags);
 
 	/* Invoke the callbacks on the local list. */
+	RCU_TRACE(rn = rcp->name);
 	while (list) {
 		next = list->next;
 		prefetch(next);
 		debug_rcu_head_unqueue(list);
 		local_bh_disable();
-		__rcu_reclaim(list);
+		__rcu_reclaim(rn, list);
 		local_bh_enable();
 		list = next;
 		RCU_TRACE(cb_count++);
 	}
 	RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
+	RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count));
 }
 
-/*
- * This kthread invokes RCU callbacks whose grace periods have
- * elapsed.  It is awakened as needed, and takes the place of the
- * RCU_SOFTIRQ that was used previously for this purpose.
- * This is a kthread, but it is never stopped, at least not until
- * the system goes down.
- */
-static int rcu_kthread(void *arg)
+static void rcu_process_callbacks(struct softirq_action *unused)
 {
-	unsigned long work;
-	unsigned long morework;
-	unsigned long flags;
-
-	for (;;) {
-		wait_event_interruptible(rcu_kthread_wq,
-					 have_rcu_kthread_work != 0);
-		morework = rcu_boost();
-		local_irq_save(flags);
-		work = have_rcu_kthread_work;
-		have_rcu_kthread_work = morework;
-		local_irq_restore(flags);
-		if (work) {
-			rcu_process_callbacks(&rcu_sched_ctrlblk);
-			rcu_process_callbacks(&rcu_bh_ctrlblk);
-			rcu_preempt_process_callbacks();
-		}
-		schedule_timeout_interruptible(1); /* Leave CPU for others. */
-	}
-
-	return 0;  /* Not reached, but needed to shut gcc up. */
+	__rcu_process_callbacks(&rcu_sched_ctrlblk);
+	__rcu_process_callbacks(&rcu_bh_ctrlblk);
+	rcu_preempt_process_callbacks();
 }
 
 /*
@@ -280,45 +253,3 @@
 	__call_rcu(head, func, &rcu_bh_ctrlblk);
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
-
-void rcu_barrier_bh(void)
-{
-	struct rcu_synchronize rcu;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu_bh(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_bh);
-
-void rcu_barrier_sched(void)
-{
-	struct rcu_synchronize rcu;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu_sched(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_sched);
-
-/*
- * Spawn the kthread that invokes RCU callbacks.
- */
-static int __init rcu_spawn_kthreads(void)
-{
-	struct sched_param sp;
-
-	rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
-	sp.sched_priority = RCU_BOOST_PRIO;
-	sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
-	return 0;
-}
-early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f259c67..02aa713 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -26,29 +26,26 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
-#ifdef CONFIG_RCU_TRACE
-#define RCU_TRACE(stmt)	stmt
-#else /* #ifdef CONFIG_RCU_TRACE */
-#define RCU_TRACE(stmt)
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
-
 /* Global control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
 	struct rcu_head *rcucblist;	/* List of pending callbacks (CBs). */
 	struct rcu_head **donetail;	/* ->next pointer of last "done" CB. */
 	struct rcu_head **curtail;	/* ->next pointer of last CB. */
 	RCU_TRACE(long qlen);		/* Number of pending CBs. */
+	RCU_TRACE(char *name);		/* Name of RCU type. */
 };
 
 /* Definition for rcupdate control block. */
 static struct rcu_ctrlblk rcu_sched_ctrlblk = {
 	.donetail	= &rcu_sched_ctrlblk.rcucblist,
 	.curtail	= &rcu_sched_ctrlblk.rcucblist,
+	RCU_TRACE(.name = "rcu_sched")
 };
 
 static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.donetail	= &rcu_bh_ctrlblk.rcucblist,
 	.curtail	= &rcu_bh_ctrlblk.rcucblist,
+	RCU_TRACE(.name = "rcu_bh")
 };
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -131,6 +128,7 @@
 	.rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
 	.nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
 	.blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
+	RCU_TRACE(.rcb.name = "rcu_preempt")
 };
 
 static int rcu_preempted_readers_exp(void);
@@ -247,6 +245,13 @@
 
 #include "rtmutex_common.h"
 
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+
+/* Controls for rcu_kthread() kthread. */
+static struct task_struct *rcu_kthread_task;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
+static unsigned long have_rcu_kthread_work;
+
 /*
  * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
  * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
@@ -334,7 +339,7 @@
 		if (rcu_preempt_ctrlblk.exp_tasks == NULL)
 			rcu_preempt_ctrlblk.boost_tasks =
 				rcu_preempt_ctrlblk.gp_tasks;
-		invoke_rcu_kthread();
+		invoke_rcu_callbacks();
 	} else
 		RCU_TRACE(rcu_initiate_boost_trace());
 	return 1;
@@ -353,14 +358,6 @@
 #else /* #ifdef CONFIG_RCU_BOOST */
 
 /*
- * If there is no RCU priority boosting, we don't boost.
- */
-static int rcu_boost(void)
-{
-	return 0;
-}
-
-/*
  * If there is no RCU priority boosting, we don't initiate boosting,
  * but we do indicate whether there are blocked readers blocking the
  * current grace period.
@@ -427,7 +424,7 @@
 
 	/* If there are done callbacks, cause them to be invoked. */
 	if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
-		invoke_rcu_kthread();
+		invoke_rcu_callbacks();
 }
 
 /*
@@ -648,7 +645,7 @@
 		rcu_preempt_cpu_qs();
 	if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
 	    rcu_preempt_ctrlblk.rcb.donetail)
-		invoke_rcu_kthread();
+		invoke_rcu_callbacks();
 	if (rcu_preempt_gp_in_progress() &&
 	    rcu_cpu_blocking_cur_gp() &&
 	    rcu_preempt_running_reader())
@@ -674,7 +671,7 @@
  */
 static void rcu_preempt_process_callbacks(void)
 {
-	rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+	__rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
 }
 
 /*
@@ -697,20 +694,6 @@
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
-void rcu_barrier(void)
-{
-	struct rcu_synchronize rcu;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier);
-
 /*
  * synchronize_rcu - wait until a grace period has elapsed.
  *
@@ -864,15 +847,6 @@
 #endif /* #ifdef CONFIG_RCU_TRACE */
 
 /*
- * Because preemptible RCU does not exist, it is never necessary to
- * boost preempted RCU readers.
- */
-static int rcu_boost(void)
-{
-	return 0;
-}
-
-/*
  * Because preemptible RCU does not exist, it never has any callbacks
  * to check.
  */
@@ -898,6 +872,78 @@
 
 #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
 
+#ifdef CONFIG_RCU_BOOST
+
+/*
+ * Wake up rcu_kthread() to process callbacks now eligible for invocation
+ * or to boost readers.
+ */
+static void invoke_rcu_callbacks(void)
+{
+	have_rcu_kthread_work = 1;
+	wake_up(&rcu_kthread_wq);
+}
+
+/*
+ * This kthread invokes RCU callbacks whose grace periods have
+ * elapsed.  It is awakened as needed, and takes the place of the
+ * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
+ * This is a kthread, but it is never stopped, at least not until
+ * the system goes down.
+ */
+static int rcu_kthread(void *arg)
+{
+	unsigned long work;
+	unsigned long morework;
+	unsigned long flags;
+
+	for (;;) {
+		wait_event_interruptible(rcu_kthread_wq,
+					 have_rcu_kthread_work != 0);
+		morework = rcu_boost();
+		local_irq_save(flags);
+		work = have_rcu_kthread_work;
+		have_rcu_kthread_work = morework;
+		local_irq_restore(flags);
+		if (work)
+			rcu_process_callbacks(NULL);
+		schedule_timeout_interruptible(1); /* Leave CPU for others. */
+	}
+
+	return 0;  /* Not reached, but needed to shut gcc up. */
+}
+
+/*
+ * Spawn the kthread that invokes RCU callbacks.
+ */
+static int __init rcu_spawn_kthreads(void)
+{
+	struct sched_param sp;
+
+	rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
+	sp.sched_priority = RCU_BOOST_PRIO;
+	sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
+	return 0;
+}
+early_initcall(rcu_spawn_kthreads);
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Start up softirq processing of callbacks.
+ */
+void invoke_rcu_callbacks(void)
+{
+	raise_softirq(RCU_SOFTIRQ);
+}
+
+void rcu_init(void)
+{
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #include <linux/kernel_stat.h>
 
@@ -913,12 +959,6 @@
 
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
-#ifdef CONFIG_RCU_BOOST
-#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
-#else /* #ifdef CONFIG_RCU_BOOST */
-#define RCU_BOOST_PRIO 1
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
-
 #ifdef CONFIG_RCU_TRACE
 
 #ifdef CONFIG_RCU_BOOST
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 98f51b1..764825c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -73,7 +73,7 @@
 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
 module_param(nfakewriters, int, 0444);
 MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
-module_param(stat_interval, int, 0444);
+module_param(stat_interval, int, 0644);
 MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
 module_param(verbose, bool, 0444);
 MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
@@ -480,30 +480,6 @@
 	call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
 }
 
-struct rcu_bh_torture_synchronize {
-	struct rcu_head head;
-	struct completion completion;
-};
-
-static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
-{
-	struct rcu_bh_torture_synchronize *rcu;
-
-	rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
-	complete(&rcu->completion);
-}
-
-static void rcu_bh_torture_synchronize(void)
-{
-	struct rcu_bh_torture_synchronize rcu;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
-}
-
 static struct rcu_torture_ops rcu_bh_ops = {
 	.init		= NULL,
 	.cleanup	= NULL,
@@ -512,7 +488,7 @@
 	.readunlock	= rcu_bh_torture_read_unlock,
 	.completed	= rcu_bh_torture_completed,
 	.deferred_free	= rcu_bh_torture_deferred_free,
-	.sync		= rcu_bh_torture_synchronize,
+	.sync		= synchronize_rcu_bh,
 	.cb_barrier	= rcu_barrier_bh,
 	.fqs		= rcu_bh_force_quiescent_state,
 	.stats		= NULL,
@@ -528,7 +504,7 @@
 	.readunlock	= rcu_bh_torture_read_unlock,
 	.completed	= rcu_bh_torture_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
-	.sync		= rcu_bh_torture_synchronize,
+	.sync		= synchronize_rcu_bh,
 	.cb_barrier	= NULL,
 	.fqs		= rcu_bh_force_quiescent_state,
 	.stats		= NULL,
@@ -536,6 +512,22 @@
 	.name		= "rcu_bh_sync"
 };
 
+static struct rcu_torture_ops rcu_bh_expedited_ops = {
+	.init		= rcu_sync_torture_init,
+	.cleanup	= NULL,
+	.readlock	= rcu_bh_torture_read_lock,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= rcu_bh_torture_read_unlock,
+	.completed	= rcu_bh_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= synchronize_rcu_bh_expedited,
+	.cb_barrier	= NULL,
+	.fqs		= rcu_bh_force_quiescent_state,
+	.stats		= NULL,
+	.irq_capable	= 1,
+	.name		= "rcu_bh_expedited"
+};
+
 /*
  * Definitions for srcu torture testing.
  */
@@ -659,11 +651,6 @@
 	call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
 }
 
-static void sched_torture_synchronize(void)
-{
-	synchronize_sched();
-}
-
 static struct rcu_torture_ops sched_ops = {
 	.init		= rcu_sync_torture_init,
 	.cleanup	= NULL,
@@ -672,7 +659,7 @@
 	.readunlock	= sched_torture_read_unlock,
 	.completed	= rcu_no_completed,
 	.deferred_free	= rcu_sched_torture_deferred_free,
-	.sync		= sched_torture_synchronize,
+	.sync		= synchronize_sched,
 	.cb_barrier	= rcu_barrier_sched,
 	.fqs		= rcu_sched_force_quiescent_state,
 	.stats		= NULL,
@@ -688,7 +675,7 @@
 	.readunlock	= sched_torture_read_unlock,
 	.completed	= rcu_no_completed,
 	.deferred_free	= rcu_sync_torture_deferred_free,
-	.sync		= sched_torture_synchronize,
+	.sync		= synchronize_sched,
 	.cb_barrier	= NULL,
 	.fqs		= rcu_sched_force_quiescent_state,
 	.stats		= NULL,
@@ -754,7 +741,7 @@
 	do {
 		/* Wait for the next test interval. */
 		oldstarttime = boost_starttime;
-		while (jiffies - oldstarttime > ULONG_MAX / 2) {
+		while (ULONG_CMP_LT(jiffies, oldstarttime)) {
 			schedule_timeout_uninterruptible(1);
 			rcu_stutter_wait("rcu_torture_boost");
 			if (kthread_should_stop() ||
@@ -765,7 +752,7 @@
 		/* Do one boost-test interval. */
 		endtime = oldstarttime + test_boost_duration * HZ;
 		call_rcu_time = jiffies;
-		while (jiffies - endtime > ULONG_MAX / 2) {
+		while (ULONG_CMP_LT(jiffies, endtime)) {
 			/* If we don't have a callback in flight, post one. */
 			if (!rbi.inflight) {
 				smp_mb(); /* RCU core before ->inflight = 1. */
@@ -792,7 +779,8 @@
 		 * interval.  Besides, we are running at RT priority,
 		 * so delays should be relatively rare.
 		 */
-		while (oldstarttime == boost_starttime) {
+		while (oldstarttime == boost_starttime &&
+		       !kthread_should_stop()) {
 			if (mutex_trylock(&boost_mutex)) {
 				boost_starttime = jiffies +
 						  test_boost_interval * HZ;
@@ -809,11 +797,11 @@
 
 	/* Clean up and exit. */
 	VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
-	destroy_rcu_head_on_stack(&rbi.rcu);
 	rcutorture_shutdown_absorb("rcu_torture_boost");
 	while (!kthread_should_stop() || rbi.inflight)
 		schedule_timeout_uninterruptible(1);
 	smp_mb(); /* order accesses to ->inflight before stack-frame death. */
+	destroy_rcu_head_on_stack(&rbi.rcu);
 	return 0;
 }
 
@@ -831,11 +819,13 @@
 	VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
 	do {
 		fqs_resume_time = jiffies + fqs_stutter * HZ;
-		while (jiffies - fqs_resume_time > LONG_MAX) {
+		while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
+		       !kthread_should_stop()) {
 			schedule_timeout_interruptible(1);
 		}
 		fqs_burst_remaining = fqs_duration;
-		while (fqs_burst_remaining > 0) {
+		while (fqs_burst_remaining > 0 &&
+		       !kthread_should_stop()) {
 			cur_ops->fqs();
 			udelay(fqs_holdoff);
 			fqs_burst_remaining -= fqs_holdoff;
@@ -1280,8 +1270,9 @@
 	/* Don't allow time recalculation while creating a new task. */
 	mutex_lock(&boost_mutex);
 	VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
-	boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
-					  "rcu_torture_boost");
+	boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
+						  cpu_to_node(cpu),
+						  "rcu_torture_boost");
 	if (IS_ERR(boost_tasks[cpu])) {
 		retval = PTR_ERR(boost_tasks[cpu]);
 		VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
@@ -1424,7 +1415,7 @@
 	int firsterr = 0;
 	static struct rcu_torture_ops *torture_ops[] =
 		{ &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
-		  &rcu_bh_ops, &rcu_bh_sync_ops,
+		  &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
 		  &srcu_ops, &srcu_expedited_ops,
 		  &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
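
The open-coded "jiffies - t > ULONG_MAX / 2" tests above are replaced with the
wrap-safe comparison helpers; their definitions (assumed from rcupdate.h of this
era) and the resulting idiom:

	#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
	#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

	/* "deadline not yet reached", correct across jiffies wraparound: */
	while (ULONG_CMP_LT(jiffies, endtime) && !kthread_should_stop())
		schedule_timeout_uninterruptible(1);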
 
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ba06207..e234eb9 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -52,13 +52,16 @@
 #include <linux/prefetch.h>
 
 #include "rcutree.h"
+#include <trace/events/rcu.h>
+
+#include "rcu.h"
 
 /* Data structures. */
 
 static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
 
 #define RCU_STATE_INITIALIZER(structname) { \
-	.level = { &structname.node[0] }, \
+	.level = { &structname##_state.node[0] }, \
 	.levelcnt = { \
 		NUM_RCU_LVL_0,  /* root of hierarchy. */ \
 		NUM_RCU_LVL_1, \
@@ -69,17 +72,17 @@
 	.signaled = RCU_GP_IDLE, \
 	.gpnum = -300, \
 	.completed = -300, \
-	.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
-	.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
+	.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
+	.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
 	.n_force_qs = 0, \
 	.n_force_qs_ngp = 0, \
 	.name = #structname, \
 }
 
-struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
+struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
 DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 static struct rcu_state *rcu_state;
@@ -128,8 +131,6 @@
 static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 
-#define RCU_KTHREAD_PRIO 1	/* RT priority for per-CPU kthreads. */
-
 /*
  * Track the rcutorture test sequence number and the update version
  * number within a given test.  The rcutorture_testseq is incremented
@@ -156,33 +157,41 @@
  * Note a quiescent state.  Because we do not need to know
  * how many quiescent states passed, just if there was at least
  * one since the start of the grace period, this just sets a flag.
+ * The caller must have disabled preemption.
  */
 void rcu_sched_qs(int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
 
-	rdp->passed_quiesc_completed = rdp->gpnum - 1;
+	rdp->passed_quiesce_gpnum = rdp->gpnum;
 	barrier();
-	rdp->passed_quiesc = 1;
+	if (rdp->passed_quiesce == 0)
+		trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
+	rdp->passed_quiesce = 1;
 }
 
 void rcu_bh_qs(int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
 
-	rdp->passed_quiesc_completed = rdp->gpnum - 1;
+	rdp->passed_quiesce_gpnum = rdp->gpnum;
 	barrier();
-	rdp->passed_quiesc = 1;
+	if (rdp->passed_quiesce == 0)
+		trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
+	rdp->passed_quiesce = 1;
 }
 
 /*
  * Note a context switch.  This is a quiescent state for RCU-sched,
  * and requires special handling for preemptible RCU.
+ * The caller must have disabled preemption.
  */
 void rcu_note_context_switch(int cpu)
 {
+	trace_rcu_utilization("Start context switch");
 	rcu_sched_qs(cpu);
 	rcu_preempt_note_context_switch(cpu);
+	trace_rcu_utilization("End context switch");
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 
@@ -193,7 +202,7 @@
 };
 #endif /* #ifdef CONFIG_NO_HZ */
 
-static int blimit = 10;		/* Maximum callbacks per softirq. */
+static int blimit = 10;		/* Maximum callbacks per rcu_do_batch. */
 static int qhimark = 10000;	/* If this many pending, ignore blimit. */
 static int qlowmark = 100;	/* Once only this many pending, use blimit. */
 
@@ -314,6 +323,7 @@
 	 * trust its state not to change because interrupts are disabled.
 	 */
 	if (cpu_is_offline(rdp->cpu)) {
+		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
 		rdp->offline_fqs++;
 		return 1;
 	}
@@ -354,19 +364,13 @@
 		local_irq_restore(flags);
 		return;
 	}
+	trace_rcu_dyntick("Start");
 	/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
 	smp_mb__before_atomic_inc();  /* See above. */
 	atomic_inc(&rdtp->dynticks);
 	smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */
 	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
 	local_irq_restore(flags);
-
-	/* If the interrupt queued a callback, get out of dyntick mode. */
-	if (in_irq() &&
-	    (__get_cpu_var(rcu_sched_data).nxtlist ||
-	     __get_cpu_var(rcu_bh_data).nxtlist ||
-	     rcu_preempt_needs_cpu(smp_processor_id())))
-		set_need_resched();
 }
 
 /*
@@ -391,6 +395,7 @@
 	/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
 	smp_mb__after_atomic_inc();  /* See above. */
 	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+	trace_rcu_dyntick("End");
 	local_irq_restore(flags);
 }
 
@@ -481,11 +486,11 @@
  */
 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 {
-	unsigned long curr;
-	unsigned long snap;
+	unsigned int curr;
+	unsigned int snap;
 
-	curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
-	snap = (unsigned long)rdp->dynticks_snap;
+	curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
+	snap = (unsigned int)rdp->dynticks_snap;
 
 	/*
 	 * If the CPU passed through or entered a dynticks idle phase with
@@ -495,7 +500,8 @@
 	 * read-side critical section that started before the beginning
 	 * of the current RCU grace period.
 	 */
-	if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
+	if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
+		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
 		rdp->dynticks_fqs++;
 		return 1;
 	}
@@ -537,6 +543,7 @@
 	int cpu;
 	long delta;
 	unsigned long flags;
+	int ndetected;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	/* Only let one CPU complain about others per time interval. */
@@ -553,7 +560,7 @@
 	 * Now rat on any tasks that got kicked up to the root rcu_node
 	 * due to CPU offlining.
 	 */
-	rcu_print_task_stall(rnp);
+	ndetected = rcu_print_task_stall(rnp);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 	/*
@@ -565,17 +572,22 @@
 	       rsp->name);
 	rcu_for_each_leaf_node(rsp, rnp) {
 		raw_spin_lock_irqsave(&rnp->lock, flags);
-		rcu_print_task_stall(rnp);
+		ndetected += rcu_print_task_stall(rnp);
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		if (rnp->qsmask == 0)
 			continue;
 		for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
-			if (rnp->qsmask & (1UL << cpu))
+			if (rnp->qsmask & (1UL << cpu)) {
 				printk(" %d", rnp->grplo + cpu);
+				ndetected++;
+			}
 	}
 	printk("} (detected by %d, t=%ld jiffies)\n",
 	       smp_processor_id(), (long)(jiffies - rsp->gp_start));
-	trigger_all_cpu_backtrace();
+	if (ndetected == 0)
+		printk(KERN_ERR "INFO: Stall ended before state dump start\n");
+	else if (!trigger_all_cpu_backtrace())
+		dump_stack();
 
 	/* If so configured, complain about tasks blocking the grace period. */
 
@@ -596,7 +608,8 @@
 	 */
 	printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
 	       rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
-	trigger_all_cpu_backtrace();
+	if (!trigger_all_cpu_backtrace())
+		dump_stack();
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
@@ -678,9 +691,10 @@
 		 * go looking for one.
 		 */
 		rdp->gpnum = rnp->gpnum;
+		trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
 		if (rnp->qsmask & rdp->grpmask) {
 			rdp->qs_pending = 1;
-			rdp->passed_quiesc = 0;
+			rdp->passed_quiesce = 0;
 		} else
 			rdp->qs_pending = 0;
 	}
@@ -741,6 +755,7 @@
 
 		/* Remember that we saw this grace-period completion. */
 		rdp->completed = rnp->completed;
+		trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
 
 		/*
 		 * If we were in an extended quiescent state, we may have
@@ -826,31 +841,31 @@
 	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
-		if (cpu_needs_another_gp(rsp, rdp))
-			rsp->fqs_need_gp = 1;
-		if (rnp->completed == rsp->completed) {
-			raw_spin_unlock_irqrestore(&rnp->lock, flags);
-			return;
-		}
-		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
-
+	if (!rcu_scheduler_fully_active ||
+	    !cpu_needs_another_gp(rsp, rdp)) {
 		/*
-		 * Propagate new ->completed value to rcu_node structures
-		 * so that other CPUs don't have to wait until the start
-		 * of the next grace period to process their callbacks.
+		 * Either the scheduler hasn't yet spawned the first
+		 * non-idle task or this CPU does not need another
+		 * grace period.  Either way, don't start a new grace
+		 * period.
 		 */
-		rcu_for_each_node_breadth_first(rsp, rnp) {
-			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-			rnp->completed = rsp->completed;
-			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-		}
-		local_irq_restore(flags);
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	if (rsp->fqs_active) {
+		/*
+		 * This CPU needs a grace period, but force_quiescent_state()
+		 * is running.  Tell it to start one on this CPU's behalf.
+		 */
+		rsp->fqs_need_gp = 1;
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
 
 	/* Advance to a new grace period and initialize state. */
 	rsp->gpnum++;
+	trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
 	WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
 	rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
 	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
@@ -865,6 +880,9 @@
 		rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
 		rcu_start_gp_per_cpu(rsp, rnp, rdp);
 		rcu_preempt_boost_start_gp(rnp);
+		trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
+					    rnp->level, rnp->grplo,
+					    rnp->grphi, rnp->qsmask);
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
@@ -901,6 +919,9 @@
 		if (rnp == rdp->mynode)
 			rcu_start_gp_per_cpu(rsp, rnp, rdp);
 		rcu_preempt_boost_start_gp(rnp);
+		trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
+					    rnp->level, rnp->grplo,
+					    rnp->grphi, rnp->qsmask);
 		raw_spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 	}
 
@@ -922,6 +943,8 @@
 	__releases(rcu_get_root(rsp)->lock)
 {
 	unsigned long gp_duration;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 
@@ -933,7 +956,41 @@
 	gp_duration = jiffies - rsp->gp_start;
 	if (gp_duration > rsp->gp_max)
 		rsp->gp_max = gp_duration;
-	rsp->completed = rsp->gpnum;
+
+	/*
+	 * We know the grace period is complete, but to everyone else
+	 * it appears to still be ongoing.  But it is also the case
+	 * that to everyone else it looks like there is nothing that
+	 * they can do to advance the grace period.  It is therefore
+	 * safe for us to drop the lock in order to mark the grace
+	 * period as completed in all of the rcu_node structures.
+	 *
+	 * But if this CPU needs another grace period, it will take
+	 * care of this while initializing the next grace period.
+	 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
+	 * because the callbacks have not yet been advanced: Those
+	 * callbacks are waiting on the grace period that just now
+	 * completed.
+	 */
+	if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
+		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
+
+		/*
+		 * Propagate new ->completed value to rcu_node structures
+		 * so that other CPUs don't have to wait until the start
+		 * of the next grace period to process their callbacks.
+		 */
+		rcu_for_each_node_breadth_first(rsp, rnp) {
+			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+			rnp->completed = rsp->gpnum;
+			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+		}
+		rnp = rcu_get_root(rsp);
+		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+	}
+
+	rsp->completed = rsp->gpnum;  /* Declare the grace period complete. */
+	trace_rcu_grace_period(rsp->name, rsp->completed, "end");
 	rsp->signaled = RCU_GP_IDLE;
 	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
 }
@@ -962,6 +1019,10 @@
 			return;
 		}
 		rnp->qsmask &= ~mask;
+		trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
+						 mask, rnp->qsmask, rnp->level,
+						 rnp->grplo, rnp->grphi,
+						 !!rnp->gp_tasks);
 		if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
 
 			/* Other bits still set at this level, so done. */
@@ -1000,7 +1061,7 @@
  * based on quiescent states detected in an earlier grace period!
  */
 static void
-rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
+rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp)
 {
 	unsigned long flags;
 	unsigned long mask;
@@ -1008,17 +1069,15 @@
 
 	rnp = rdp->mynode;
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	if (lastcomp != rnp->completed) {
+	if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) {
 
 		/*
-		 * Someone beat us to it for this grace period, so leave.
-		 * The race with GP start is resolved by the fact that we
-		 * hold the leaf rcu_node lock, so that the per-CPU bits
-		 * cannot yet be initialized -- so we would simply find our
-		 * CPU's bit already cleared in rcu_report_qs_rnp() if this
-		 * race occurred.
+		 * The grace period in which this quiescent state was
+		 * recorded has ended, so don't report it upwards.
+		 * We will instead need a new quiescent state that lies
+		 * within the current grace period.
 		 */
-		rdp->passed_quiesc = 0;	/* try again later! */
+		rdp->passed_quiesce = 0;	/* need qs for new gp. */
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
@@ -1062,14 +1121,14 @@
 	 * Was there a quiescent state since the beginning of the grace
 	 * period? If no, then exit and wait for the next call.
 	 */
-	if (!rdp->passed_quiesc)
+	if (!rdp->passed_quiesce)
 		return;
 
 	/*
 	 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
 	 * judge of that).
 	 */
-	rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
+	rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1130,11 +1189,20 @@
 		if (rnp->qsmaskinit != 0) {
 			if (rnp != rdp->mynode)
 				raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+			else
+				trace_rcu_grace_period(rsp->name,
+						       rnp->gpnum + 1 -
+						       !!(rnp->qsmask & mask),
+						       "cpuofl");
 			break;
 		}
-		if (rnp == rdp->mynode)
+		if (rnp == rdp->mynode) {
+			trace_rcu_grace_period(rsp->name,
+					       rnp->gpnum + 1 -
+					       !!(rnp->qsmask & mask),
+					       "cpuofl");
 			need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
-		else
+		} else
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		mask = rnp->grpmask;
 		rnp = rnp->parent;
@@ -1190,17 +1258,22 @@
 {
 	unsigned long flags;
 	struct rcu_head *next, *list, **tail;
-	int count;
+	int bl, count;
 
 	/* If no callbacks are ready, just return.*/
-	if (!cpu_has_callbacks_ready_to_invoke(rdp))
+	if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
+		trace_rcu_batch_start(rsp->name, 0, 0);
+		trace_rcu_batch_end(rsp->name, 0);
 		return;
+	}
 
 	/*
 	 * Extract the list of ready callbacks, disabling to prevent
 	 * races with call_rcu() from interrupt handlers.
 	 */
 	local_irq_save(flags);
+	bl = rdp->blimit;
+	trace_rcu_batch_start(rsp->name, rdp->qlen, bl);
 	list = rdp->nxtlist;
 	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
 	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1216,13 +1289,14 @@
 		next = list->next;
 		prefetch(next);
 		debug_rcu_head_unqueue(list);
-		__rcu_reclaim(list);
+		__rcu_reclaim(rsp->name, list);
 		list = next;
-		if (++count >= rdp->blimit)
+		if (++count >= bl)
 			break;
 	}
 
 	local_irq_save(flags);
+	trace_rcu_batch_end(rsp->name, count);
 
 	/* Update count, and requeue any remaining callbacks. */
 	rdp->qlen -= count;
@@ -1250,7 +1324,7 @@
 
 	local_irq_restore(flags);
 
-	/* Re-raise the RCU softirq if there are callbacks remaining. */
+	/* Re-invoke RCU core processing if there are callbacks remaining. */
 	if (cpu_has_callbacks_ready_to_invoke(rdp))
 		invoke_rcu_core();
 }
@@ -1258,7 +1332,7 @@
 /*
  * Check to see if this CPU is in a non-context-switch quiescent state
  * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
- * Also schedule the RCU softirq handler.
+ * Also schedule RCU core processing.
  *
  * This function must be called with hardirqs disabled.  It is normally
  * invoked from the scheduling-clock interrupt.  If rcu_pending returns
@@ -1266,6 +1340,7 @@
  */
 void rcu_check_callbacks(int cpu, int user)
 {
+	trace_rcu_utilization("Start scheduler-tick");
 	if (user ||
 	    (idle_cpu(cpu) && rcu_scheduler_active &&
 	     !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1299,6 +1374,7 @@
 	rcu_preempt_check_callbacks(cpu);
 	if (rcu_pending(cpu))
 		invoke_rcu_core();
+	trace_rcu_utilization("End scheduler-tick");
 }
 
 #ifdef CONFIG_SMP
@@ -1360,10 +1436,14 @@
 	unsigned long flags;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	if (!rcu_gp_in_progress(rsp))
+	trace_rcu_utilization("Start fqs");
+	if (!rcu_gp_in_progress(rsp)) {
+		trace_rcu_utilization("End fqs");
 		return;  /* No grace period in progress, nothing to force. */
+	}
 	if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
 		rsp->n_force_qs_lh++; /* Inexact, can lose counts.  Tough! */
+		trace_rcu_utilization("End fqs");
 		return;	/* Someone else is already on the job. */
 	}
 	if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
@@ -1412,11 +1492,13 @@
 		raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
 		rsp->fqs_need_gp = 0;
 		rcu_start_gp(rsp, flags); /* releases rnp->lock */
+		trace_rcu_utilization("End fqs");
 		return;
 	}
 	raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
 unlock_fqs_ret:
 	raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
+	trace_rcu_utilization("End fqs");
 }
 
 #else /* #ifdef CONFIG_SMP */
@@ -1429,9 +1511,9 @@
 #endif /* #else #ifdef CONFIG_SMP */
 
 /*
- * This does the RCU processing work from softirq context for the
- * specified rcu_state and rcu_data structures.  This may be called
- * only from the CPU to whom the rdp belongs.
+ * This does the RCU core processing work for the specified rcu_state
+ * and rcu_data structures.  This may be called only from the CPU to
+ * whom the rdp belongs.
  */
 static void
 __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -1468,24 +1550,24 @@
 }
 
 /*
- * Do softirq processing for the current CPU.
+ * Do RCU core processing for the current CPU.
  */
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
+	trace_rcu_utilization("Start RCU core");
 	__rcu_process_callbacks(&rcu_sched_state,
 				&__get_cpu_var(rcu_sched_data));
 	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
 	rcu_preempt_process_callbacks();
-
-	/* If we are last CPU on way to dyntick-idle mode, accelerate it. */
-	rcu_needs_cpu_flush();
+	trace_rcu_utilization("End RCU core");
 }
 
 /*
- * Wake up the current CPU's kthread.  This replaces raise_softirq()
- * in earlier versions of RCU.  Note that because we are running on
- * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
- * cannot disappear out from under us.
+ * Schedule RCU callback invocation.  If the specified type of RCU
+ * does not support RCU priority boosting, just do a direct call,
+ * otherwise wake up the per-CPU kernel kthread.  Note that because we
+ * are running on the current CPU with interrupts disabled, the
+ * rcu_cpu_kthread_task cannot disappear out from under us.
  */
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 {
@@ -1530,6 +1612,12 @@
 	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
 	rdp->qlen++;
 
+	if (__is_kfree_rcu_offset((unsigned long)func))
+		trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
+					 rdp->qlen);
+	else
+		trace_rcu_callback(rsp->name, head, rdp->qlen);
+
 	/* If interrupts were disabled, don't dive into RCU core. */
 	if (irqs_disabled_flags(flags)) {
 		local_irq_restore(flags);
@@ -1613,18 +1701,9 @@
  */
 void synchronize_sched(void)
 {
-	struct rcu_synchronize rcu;
-
 	if (rcu_blocking_is_gp())
 		return;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu_sched(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
+	wait_rcu_gp(call_rcu_sched);
 }
 EXPORT_SYMBOL_GPL(synchronize_sched);
 
@@ -1639,18 +1718,9 @@
  */
 void synchronize_rcu_bh(void)
 {
-	struct rcu_synchronize rcu;
-
 	if (rcu_blocking_is_gp())
 		return;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu_bh(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
+	wait_rcu_gp(call_rcu_bh);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
 
@@ -1671,7 +1741,8 @@
 	check_cpu_stall(rsp, rdp);
 
 	/* Is the RCU core waiting for a quiescent state from this CPU? */
-	if (rdp->qs_pending && !rdp->passed_quiesc) {
+	if (rcu_scheduler_fully_active &&
+	    rdp->qs_pending && !rdp->passed_quiesce) {
 
 		/*
 		 * If force_quiescent_state() coming soon and this CPU
@@ -1683,7 +1754,7 @@
 		    ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
 				 jiffies))
 			set_need_resched();
-	} else if (rdp->qs_pending && rdp->passed_quiesc) {
+	} else if (rdp->qs_pending && rdp->passed_quiesce) {
 		rdp->n_rp_report_qs++;
 		return 1;
 	}
@@ -1846,6 +1917,7 @@
 	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
 #endif /* #ifdef CONFIG_NO_HZ */
 	rdp->cpu = cpu;
+	rdp->rsp = rsp;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
 
@@ -1865,8 +1937,6 @@
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
-	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */
 	rdp->beenonline = 1;	 /* We have now been online. */
 	rdp->preemptible = preemptible;
 	rdp->qlen_last_fqs_check = 0;
@@ -1891,9 +1961,17 @@
 		rnp->qsmaskinit |= mask;
 		mask = rnp->grpmask;
 		if (rnp == rdp->mynode) {
-			rdp->gpnum = rnp->completed; /* if GP in progress... */
+			/*
+			 * If there is a grace period in progress, we will
+			 * set up to wait for it next time we run the
+			 * RCU core code.
+			 */
+			rdp->gpnum = rnp->completed;
 			rdp->completed = rnp->completed;
-			rdp->passed_quiesc_completed = rnp->completed - 1;
+			rdp->passed_quiesce = 0;
+			rdp->qs_pending = 0;
+			rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
+			trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
 		}
 		raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
 		rnp = rnp->parent;
@@ -1919,6 +1997,7 @@
 	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;
 
+	trace_rcu_utilization("Start CPU hotplug");
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
@@ -1954,6 +2033,7 @@
 	default:
 		break;
 	}
+	trace_rcu_utilization("End CPU hotplug");
 	return NOTIFY_OK;
 }
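
The synchronize_sched() and synchronize_rcu_bh() conversions above (and the
synchronize_rcu() one in rcutree_plugin.h further down) replace the same
open-coded completion dance with a shared wait_rcu_gp() helper.  For
reference, a minimal sketch of such a helper, reconstructed from the sequence
deleted above; the real helper is added elsewhere in this series and may
differ in detail:

/* Sketch only: wait for one grace period of the flavor whose call_rcu()
 * variant is passed in. */
typedef void (*call_rcu_func_t)(struct rcu_head *head,
				void (*func)(struct rcu_head *head));

void wait_rcu_gp(call_rcu_func_t crf)
{
	struct rcu_synchronize rcu;

	init_rcu_head_on_stack(&rcu.head);
	init_completion(&rcu.completion);
	/* Will wake me after RCU finished. */
	crf(&rcu.head, wakeme_after_rcu);
	/* Wait for it. */
	wait_for_completion(&rcu.completion);
	destroy_rcu_head_on_stack(&rcu.head);
}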
 
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 01b2ccd..849ce9e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -230,9 +230,9 @@
 					/*  in order to detect GP end. */
 	unsigned long	gpnum;		/* Highest gp number that this CPU */
 					/*  is aware of having started. */
-	unsigned long	passed_quiesc_completed;
-					/* Value of completed at time of qs. */
-	bool		passed_quiesc;	/* User-mode/idle loop etc. */
+	unsigned long	passed_quiesce_gpnum;
+					/* gpnum at time of quiescent state. */
+	bool		passed_quiesce;	/* User-mode/idle loop etc. */
 	bool		qs_pending;	/* Core waits for quiesc state. */
 	bool		beenonline;	/* CPU online at least once. */
 	bool		preemptible;	/* Preemptible RCU? */
@@ -299,6 +299,7 @@
 	unsigned long n_rp_need_nothing;
 
 	int cpu;
+	struct rcu_state *rsp;
 };
 
 /* Values for signaled field in struct rcu_state. */
@@ -417,6 +418,13 @@
 DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 
+#ifdef CONFIG_RCU_BOOST
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+DECLARE_PER_CPU(char, rcu_cpu_has_work);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
 #ifndef RCU_TREE_NONCORE
 
 /* Forward declarations for rcutree_plugin.h */
@@ -430,7 +438,7 @@
 static void rcu_stop_cpu_kthread(int cpu);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_print_detail_task_stall(struct rcu_state *rsp);
-static void rcu_print_task_stall(struct rcu_node *rnp);
+static int rcu_print_task_stall(struct rcu_node *rnp);
 static void rcu_preempt_stall_reset(void);
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
@@ -450,7 +458,6 @@
 static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
 static void rcu_preempt_send_cbs_to_online(void);
 static void __init __rcu_init_preempt(void);
-static void rcu_needs_cpu_flush(void);
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
 static void invoke_rcu_callbacks_kthread(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 8aafbb8..4b9b9f8 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -27,6 +27,14 @@
 #include <linux/delay.h>
 #include <linux/stop_machine.h>
 
+#define RCU_KTHREAD_PRIO 1
+
+#ifdef CONFIG_RCU_BOOST
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+#else
+#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
+#endif
+
 /*
  * Check the RCU kernel configuration parameters and print informative
  * messages about anything out of the ordinary.  If you like #ifdef, you
@@ -64,7 +72,7 @@
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
-struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
+struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt);
 DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
 static struct rcu_state *rcu_state = &rcu_preempt_state;
 
@@ -122,9 +130,11 @@
 {
 	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
 
-	rdp->passed_quiesc_completed = rdp->gpnum - 1;
+	rdp->passed_quiesce_gpnum = rdp->gpnum;
 	barrier();
-	rdp->passed_quiesc = 1;
+	if (rdp->passed_quiesce == 0)
+		trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
+	rdp->passed_quiesce = 1;
 	current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 }
 
@@ -190,6 +200,11 @@
 			if (rnp->qsmask & rdp->grpmask)
 				rnp->gp_tasks = &t->rcu_node_entry;
 		}
+		trace_rcu_preempt_task(rdp->rsp->name,
+				       t->pid,
+				       (rnp->qsmask & rdp->grpmask)
+				       ? rnp->gpnum
+				       : rnp->gpnum + 1);
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	} else if (t->rcu_read_lock_nesting < 0 &&
 		   t->rcu_read_unlock_special) {
@@ -299,6 +314,9 @@
 	int empty_exp;
 	unsigned long flags;
 	struct list_head *np;
+#ifdef CONFIG_RCU_BOOST
+	struct rt_mutex *rbmp = NULL;
+#endif /* #ifdef CONFIG_RCU_BOOST */
 	struct rcu_node *rnp;
 	int special;
 
@@ -344,6 +362,9 @@
 		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
 		np = rcu_next_node_entry(t, rnp);
 		list_del_init(&t->rcu_node_entry);
+		t->rcu_blocked_node = NULL;
+		trace_rcu_unlock_preempted_task("rcu_preempt",
+						rnp->gpnum, t->pid);
 		if (&t->rcu_node_entry == rnp->gp_tasks)
 			rnp->gp_tasks = np;
 		if (&t->rcu_node_entry == rnp->exp_tasks)
@@ -351,30 +372,34 @@
 #ifdef CONFIG_RCU_BOOST
 		if (&t->rcu_node_entry == rnp->boost_tasks)
 			rnp->boost_tasks = np;
-		/* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
-		if (t->rcu_boosted) {
-			special |= RCU_READ_UNLOCK_BOOSTED;
-			t->rcu_boosted = 0;
+		/* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
+		if (t->rcu_boost_mutex) {
+			rbmp = t->rcu_boost_mutex;
+			t->rcu_boost_mutex = NULL;
 		}
 #endif /* #ifdef CONFIG_RCU_BOOST */
-		t->rcu_blocked_node = NULL;
 
 		/*
 		 * If this was the last task on the current list, and if
 		 * we aren't waiting on any CPUs, report the quiescent state.
 		 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
 		 */
-		if (empty)
-			raw_spin_unlock_irqrestore(&rnp->lock, flags);
-		else
+		if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
+			trace_rcu_quiescent_state_report("preempt_rcu",
+							 rnp->gpnum,
+							 0, rnp->qsmask,
+							 rnp->level,
+							 rnp->grplo,
+							 rnp->grphi,
+							 !!rnp->gp_tasks);
 			rcu_report_unblock_qs_rnp(rnp, flags);
+		} else
+			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 #ifdef CONFIG_RCU_BOOST
 		/* Unboost if we were boosted. */
-		if (special & RCU_READ_UNLOCK_BOOSTED) {
-			rt_mutex_unlock(t->rcu_boost_mutex);
-			t->rcu_boost_mutex = NULL;
-		}
+		if (rbmp)
+			rt_mutex_unlock(rbmp);
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
 		/*
@@ -399,10 +424,10 @@
 {
 	struct task_struct *t = current;
 
-	barrier();  /* needed if we ever invoke rcu_read_unlock in rcutree.c */
 	if (t->rcu_read_lock_nesting != 1)
 		--t->rcu_read_lock_nesting;
 	else {
+		barrier();  /* critical section before exit code. */
 		t->rcu_read_lock_nesting = INT_MIN;
 		barrier();  /* assign before ->rcu_read_unlock_special load */
 		if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
@@ -466,16 +491,20 @@
  * Scan the current list of tasks blocked within RCU read-side critical
  * sections, printing out the tid of each.
  */
-static void rcu_print_task_stall(struct rcu_node *rnp)
+static int rcu_print_task_stall(struct rcu_node *rnp)
 {
 	struct task_struct *t;
+	int ndetected = 0;
 
 	if (!rcu_preempt_blocked_readers_cgp(rnp))
-		return;
+		return 0;
 	t = list_entry(rnp->gp_tasks,
 		       struct task_struct, rcu_node_entry);
-	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
+	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
 		printk(" P%d", t->pid);
+		ndetected++;
+	}
+	return ndetected;
 }
 
 /*
@@ -656,18 +685,9 @@
  */
 void synchronize_rcu(void)
 {
-	struct rcu_synchronize rcu;
-
 	if (!rcu_scheduler_active)
 		return;
-
-	init_rcu_head_on_stack(&rcu.head);
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished. */
-	call_rcu(&rcu.head, wakeme_after_rcu);
-	/* Wait for it. */
-	wait_for_completion(&rcu.completion);
-	destroy_rcu_head_on_stack(&rcu.head);
+	wait_rcu_gp(call_rcu);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
@@ -968,8 +988,9 @@
  * Because preemptible RCU does not exist, we never have to check for
  * tasks blocked within RCU read-side critical sections.
  */
-static void rcu_print_task_stall(struct rcu_node *rnp)
+static int rcu_print_task_stall(struct rcu_node *rnp)
 {
+	return 0;
 }
 
 /*
@@ -1136,6 +1157,8 @@
 
 #endif /* #else #ifdef CONFIG_RCU_TRACE */
 
+static struct lock_class_key rcu_boost_class;
+
 /*
  * Carry out RCU priority boosting on the task indicated by ->exp_tasks
  * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1198,8 +1221,10 @@
 	 */
 	t = container_of(tb, struct task_struct, rcu_node_entry);
 	rt_mutex_init_proxy_locked(&mtx, t);
+	/* Avoid lockdep false positives.  This rt_mutex is its own thing. */
+	lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
+				   "rcu_boost_mutex");
 	t->rcu_boost_mutex = &mtx;
-	t->rcu_boosted = 1;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
 	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
@@ -1228,9 +1253,12 @@
 	int spincnt = 0;
 	int more2boost;
 
+	trace_rcu_utilization("Start boost kthread@init");
 	for (;;) {
 		rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
+		trace_rcu_utilization("End boost kthread@rcu_wait");
 		rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
+		trace_rcu_utilization("Start boost kthread@rcu_wait");
 		rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
 		more2boost = rcu_boost(rnp);
 		if (more2boost)
@@ -1238,11 +1266,14 @@
 		else
 			spincnt = 0;
 		if (spincnt > 10) {
+			trace_rcu_utilization("End boost kthread@rcu_yield");
 			rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
+			trace_rcu_utilization("Start boost kthread@rcu_yield");
 			spincnt = 0;
 		}
 	}
 	/* NOTREACHED */
+	trace_rcu_utilization("End boost kthread@notreached");
 	return 0;
 }
 
@@ -1291,11 +1322,9 @@
 
 	local_irq_save(flags);
 	__this_cpu_write(rcu_cpu_has_work, 1);
-	if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
-		local_irq_restore(flags);
-		return;
-	}
-	wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
+	if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
+	    current != __this_cpu_read(rcu_cpu_kthread_task))
+		wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
 	local_irq_restore(flags);
 }
 
@@ -1343,13 +1372,13 @@
 	if (rnp->boost_kthread_task != NULL)
 		return 0;
 	t = kthread_create(rcu_boost_kthread, (void *)rnp,
-			   "rcub%d", rnp_index);
+			   "rcub/%d", rnp_index);
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	rnp->boost_kthread_task = t;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
-	sp.sched_priority = RCU_KTHREAD_PRIO;
+	sp.sched_priority = RCU_BOOST_PRIO;
 	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
 	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
 	return 0;
@@ -1444,6 +1473,7 @@
 {
 	struct sched_param sp;
 	struct timer_list yield_timer;
+	int prio = current->rt_priority;
 
 	setup_timer_on_stack(&yield_timer, f, arg);
 	mod_timer(&yield_timer, jiffies + 2);
@@ -1451,7 +1481,8 @@
 	sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
 	set_user_nice(current, 19);
 	schedule();
-	sp.sched_priority = RCU_KTHREAD_PRIO;
+	set_user_nice(current, 0);
+	sp.sched_priority = prio;
 	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
 	del_timer(&yield_timer);
 }
@@ -1489,7 +1520,8 @@
 
 /*
  * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
- * earlier RCU softirq.
+ * RCU softirq used in flavors and configurations of RCU that do not
+ * support RCU priority boosting.
  */
 static int rcu_cpu_kthread(void *arg)
 {
@@ -1500,9 +1532,12 @@
 	char work;
 	char *workp = &per_cpu(rcu_cpu_has_work, cpu);
 
+	trace_rcu_utilization("Start CPU kthread@init");
 	for (;;) {
 		*statusp = RCU_KTHREAD_WAITING;
+		trace_rcu_utilization("End CPU kthread@rcu_wait");
 		rcu_wait(*workp != 0 || kthread_should_stop());
+		trace_rcu_utilization("Start CPU kthread@rcu_wait");
 		local_bh_disable();
 		if (rcu_cpu_kthread_should_stop(cpu)) {
 			local_bh_enable();
@@ -1523,11 +1558,14 @@
 			spincnt = 0;
 		if (spincnt > 10) {
 			*statusp = RCU_KTHREAD_YIELDING;
+			trace_rcu_utilization("End CPU kthread@rcu_yield");
 			rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
+			trace_rcu_utilization("Start CPU kthread@rcu_yield");
 			spincnt = 0;
 		}
 	}
 	*statusp = RCU_KTHREAD_STOPPED;
+	trace_rcu_utilization("End CPU kthread@term");
 	return 0;
 }
 
@@ -1560,7 +1598,10 @@
 	if (!rcu_scheduler_fully_active ||
 	    per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
 		return 0;
-	t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
+	t = kthread_create_on_node(rcu_cpu_kthread,
+				   (void *)(long)cpu,
+				   cpu_to_node(cpu),
+				   "rcuc/%d", cpu);
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 	if (cpu_online(cpu))
@@ -1669,7 +1710,7 @@
 		return 0;
 	if (rnp->node_kthread_task == NULL) {
 		t = kthread_create(rcu_node_kthread, (void *)rnp,
-				   "rcun%d", rnp_index);
+				   "rcun/%d", rnp_index);
 		if (IS_ERR(t))
 			return PTR_ERR(t);
 		raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1907,15 +1948,6 @@
 	return rcu_needs_cpu_quick_check(cpu);
 }
 
-/*
- * Check to see if we need to continue a callback-flush operations to
- * allow the last CPU to enter dyntick-idle mode.  But fast dyntick-idle
- * entry is not configured, so we never do need to.
- */
-static void rcu_needs_cpu_flush(void)
-{
-}
-
 #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
 
 #define RCU_NEEDS_CPU_FLUSHES 5
@@ -1991,20 +2023,4 @@
 	return c;
 }
 
-/*
- * Check to see if we need to continue a callback-flush operations to
- * allow the last CPU to enter dyntick-idle mode.
- */
-static void rcu_needs_cpu_flush(void)
-{
-	int cpu = smp_processor_id();
-	unsigned long flags;
-
-	if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
-		return;
-	local_irq_save(flags);
-	(void)rcu_needs_cpu(cpu);
-	local_irq_restore(flags);
-}
-
 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
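
The rcu_boost() change above sidesteps lockdep false positives by giving the
on-stack rt_mutex's wait_lock its own class via a file-static
lock_class_key.  The same idiom works for any lock that would otherwise share
a class with locks it legitimately nests inside; a minimal sketch, with
my_obj and its field names invented purely for illustration:

#include <linux/spinlock.h>
#include <linux/lockdep.h>

struct my_obj {
	spinlock_t lock;
	/* ... payload ... */
};

/* One key per distinct locking context, typically file-static. */
static struct lock_class_key my_obj_lock_key;

static void my_obj_init(struct my_obj *obj)
{
	spin_lock_init(&obj->lock);
	/* Give this lock its own lockdep class and report name. */
	lockdep_set_class_and_name(&obj->lock, &my_obj_lock_key,
				   "my_obj_lock");
}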
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 3b0c098..9feffa4 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -48,11 +48,6 @@
 
 #ifdef CONFIG_RCU_BOOST
 
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DECLARE_PER_CPU(char, rcu_cpu_has_work);
-
 static char convert_kthread_status(unsigned int kthread_status)
 {
 	if (kthread_status > RCU_KTHREAD_MAX)
@@ -66,11 +61,11 @@
 {
 	if (!rdp->beenonline)
 		return;
-	seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
+	seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d",
 		   rdp->cpu,
 		   cpu_is_offline(rdp->cpu) ? '!' : ' ',
 		   rdp->completed, rdp->gpnum,
-		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
+		   rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
 		   rdp->qs_pending);
 #ifdef CONFIG_NO_HZ
 	seq_printf(m, " dt=%d/%d/%d df=%lu",
@@ -144,7 +139,7 @@
 		   rdp->cpu,
 		   cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
 		   rdp->completed, rdp->gpnum,
-		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
+		   rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
 		   rdp->qs_pending);
 #ifdef CONFIG_NO_HZ
 	seq_printf(m, ",%d,%d,%d,%lu",
@@ -175,7 +170,7 @@
 
 static int show_rcudata_csv(struct seq_file *m, void *unused)
 {
-	seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
+	seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
 #ifdef CONFIG_NO_HZ
 	seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
 #endif /* #ifdef CONFIG_NO_HZ */
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 255e166..5e8d9cc 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -579,6 +579,7 @@
 		    struct rt_mutex_waiter *waiter)
 {
 	int ret = 0;
+	int was_disabled;
 
 	for (;;) {
 		/* Try to acquire the lock: */
@@ -601,10 +602,17 @@
 
 		raw_spin_unlock(&lock->wait_lock);
 
+		was_disabled = irqs_disabled();
+		if (was_disabled)
+			local_irq_enable();
+
 		debug_rt_mutex_print_deadlock(waiter);
 
 		schedule_rt_mutex(lock);
 
+		if (was_disabled)
+			local_irq_disable();
+
 		raw_spin_lock(&lock->wait_lock);
 		set_current_state(state);
 	}
diff --git a/kernel/sched.c b/kernel/sched.c
index 8aa0080..d87c6e5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -196,10 +196,28 @@
 	return sysctl_sched_rt_runtime >= 0;
 }
 
+static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
+{
+	unsigned long delta;
+	ktime_t soft, hard, now;
+
+	for (;;) {
+		if (hrtimer_active(period_timer))
+			break;
+
+		now = hrtimer_cb_get_time(period_timer);
+		hrtimer_forward(period_timer, now, period);
+
+		soft = hrtimer_get_softexpires(period_timer);
+		hard = hrtimer_get_expires(period_timer);
+		delta = ktime_to_ns(ktime_sub(hard, soft));
+		__hrtimer_start_range_ns(period_timer, soft, delta,
+					 HRTIMER_MODE_ABS_PINNED, 0);
+	}
+}
+
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
-	ktime_t now;
-
 	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
@@ -207,22 +225,7 @@
 		return;
 
 	raw_spin_lock(&rt_b->rt_runtime_lock);
-	for (;;) {
-		unsigned long delta;
-		ktime_t soft, hard;
-
-		if (hrtimer_active(&rt_b->rt_period_timer))
-			break;
-
-		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
-		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-
-		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
-		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
-		delta = ktime_to_ns(ktime_sub(hard, soft));
-		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
-				HRTIMER_MODE_ABS_PINNED, 0);
-	}
+	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
@@ -247,6 +250,24 @@
 
 static LIST_HEAD(task_groups);
 
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+	raw_spinlock_t lock;
+	ktime_t period;
+	u64 quota, runtime;
+	s64 hierarchal_quota;
+	u64 runtime_expires;
+
+	int idle, timer_active;
+	struct hrtimer period_timer, slack_timer;
+	struct list_head throttled_cfs_rq;
+
+	/* statistics */
+	int nr_periods, nr_throttled;
+	u64 throttled_time;
+#endif
+};
+
 /* task group related information */
 struct task_group {
 	struct cgroup_subsys_state css;
@@ -278,6 +299,8 @@
 #ifdef CONFIG_SCHED_AUTOGROUP
 	struct autogroup *autogroup;
 #endif
+
+	struct cfs_bandwidth cfs_bandwidth;
 };
 
 /* task_group_lock serializes the addition/removal of task groups */
@@ -311,7 +334,7 @@
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight load;
-	unsigned long nr_running;
+	unsigned long nr_running, h_nr_running;
 
 	u64 exec_clock;
 	u64 min_vruntime;
@@ -377,9 +400,120 @@
 
 	unsigned long load_contribution;
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	int runtime_enabled;
+	u64 runtime_expires;
+	s64 runtime_remaining;
+
+	u64 throttled_timestamp;
+	int throttled, throttle_count;
+	struct list_head throttled_list;
+#endif
 #endif
 };
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return &tg->cfs_bandwidth;
+}
+
+static inline u64 default_cfs_period(void);
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, slack_timer);
+	do_sched_cfs_slack_timer(cfs_b);
+
+	return HRTIMER_NORESTART;
+}
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, period_timer);
+	ktime_t now;
+	int overrun;
+	int idle = 0;
+
+	for (;;) {
+		now = hrtimer_cb_get_time(timer);
+		overrun = hrtimer_forward(timer, now, cfs_b->period);
+
+		if (!overrun)
+			break;
+
+		idle = do_sched_cfs_period_timer(cfs_b, overrun);
+	}
+
+	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	raw_spin_lock_init(&cfs_b->lock);
+	cfs_b->runtime = 0;
+	cfs_b->quota = RUNTIME_INF;
+	cfs_b->period = ns_to_ktime(default_cfs_period());
+
+	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
+	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_b->period_timer.function = sched_cfs_period_timer;
+	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_b->slack_timer.function = sched_cfs_slack_timer;
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	cfs_rq->runtime_enabled = 0;
+	INIT_LIST_HEAD(&cfs_rq->throttled_list);
+}
+
+/* requires cfs_b->lock, may release to reprogram timer */
+static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	/*
+	 * The timer may be active because we're trying to set a new bandwidth
+	 * period or because we're racing with the tear-down path
+	 * (timer_active==0 becomes visible before the hrtimer call-back
+	 * terminates).  In either case we ensure that it's re-programmed.
+	 */
+	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+		raw_spin_unlock(&cfs_b->lock);
+		/* ensure cfs_b->lock is available while we wait */
+		hrtimer_cancel(&cfs_b->period_timer);
+
+		raw_spin_lock(&cfs_b->lock);
+		/* if someone else restarted the timer then we're done */
+		if (cfs_b->timer_active)
+			return;
+	}
+
+	cfs_b->timer_active = 1;
+	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	hrtimer_cancel(&cfs_b->period_timer);
+	hrtimer_cancel(&cfs_b->slack_timer);
+}
+#else
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return NULL;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 /* Real-Time classes' related field in a runqueue: */
 struct rt_rq {
 	struct rt_prio_array active;
@@ -510,7 +644,7 @@
 
 	unsigned long cpu_power;
 
-	unsigned char idle_at_tick;
+	unsigned char idle_balance;
 	/* For active balancing */
 	int post_schedule;
 	int active_balance;
@@ -520,8 +654,6 @@
 	int cpu;
 	int online;
 
-	unsigned long avg_load_per_task;
-
 	u64 rt_avg;
 	u64 age_stamp;
 	u64 idle_stamp;
@@ -570,7 +702,7 @@
 #endif
 
 #ifdef CONFIG_SMP
-	struct task_struct *wake_list;
+	struct llist_head wake_list;
 #endif
 };
 
@@ -1272,6 +1404,18 @@
 		smp_send_reschedule(cpu);
 }
 
+static inline bool got_nohz_idle_kick(void)
+{
+	return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
+}
+
+#else /* CONFIG_NO_HZ */
+
+static inline bool got_nohz_idle_kick(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_NO_HZ */
 
 static u64 sched_avg_period(void)
@@ -1471,24 +1615,28 @@
 	update_load_sub(&rq->load, load);
 }
 
-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
+			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
 typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
+ * Iterate task_group tree rooted at *from, calling @down when first entering a
+ * node and @up when leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
  */
-static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+static int walk_tg_tree_from(struct task_group *from,
+			     tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
 	int ret;
 
-	rcu_read_lock();
-	parent = &root_task_group;
+	parent = from;
+
 down:
 	ret = (*down)(parent, data);
 	if (ret)
-		goto out_unlock;
+		goto out;
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1497,19 +1645,29 @@
 		continue;
 	}
 	ret = (*up)(parent, data);
-	if (ret)
-		goto out_unlock;
+	if (ret || parent == from)
+		goto out;
 
 	child = parent;
 	parent = parent->parent;
 	if (parent)
 		goto up;
-out_unlock:
-	rcu_read_unlock();
-
+out:
 	return ret;
 }
 
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+
+static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+{
+	return walk_tg_tree_from(&root_task_group, down, up, data);
+}
+
 static int tg_nop(struct task_group *tg, void *data)
 {
 	return 0;
@@ -1569,11 +1727,9 @@
 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
 
 	if (nr_running)
-		rq->avg_load_per_task = rq->load.weight / nr_running;
-	else
-		rq->avg_load_per_task = 0;
+		return rq->load.weight / nr_running;
 
-	return rq->avg_load_per_task;
+	return 0;
 }
 
 #ifdef CONFIG_PREEMPT
@@ -1806,7 +1962,6 @@
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, flags);
-	inc_nr_running(rq);
 }
 
 /*
@@ -1818,7 +1973,6 @@
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, flags);
-	dec_nr_running(rq);
 }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2390,11 +2544,11 @@
 
 	/* Look for allowed, online CPU in same node. */
 	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
-		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 			return dest_cpu;
 
 	/* Any allowed, online CPU? */
-	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
+	dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
 	if (dest_cpu < nr_cpu_ids)
 		return dest_cpu;
 
@@ -2431,7 +2585,7 @@
 	 * [ this allows ->select_task() to simply return task_cpu(p) and
 	 *   not worry about this generic constraint ]
 	 */
-	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
+	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
 		     !cpu_online(cpu)))
 		cpu = select_fallback_rq(task_cpu(p), p);
 
@@ -2556,42 +2710,26 @@
 }
 
 #ifdef CONFIG_SMP
-static void sched_ttwu_do_pending(struct task_struct *list)
+static void sched_ttwu_pending(void)
 {
 	struct rq *rq = this_rq();
+	struct llist_node *llist = llist_del_all(&rq->wake_list);
+	struct task_struct *p;
 
 	raw_spin_lock(&rq->lock);
 
-	while (list) {
-		struct task_struct *p = list;
-		list = list->wake_entry;
+	while (llist) {
+		p = llist_entry(llist, struct task_struct, wake_entry);
+		llist = llist_next(llist);
 		ttwu_do_activate(rq, p, 0);
 	}
 
 	raw_spin_unlock(&rq->lock);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void sched_ttwu_pending(void)
-{
-	struct rq *rq = this_rq();
-	struct task_struct *list = xchg(&rq->wake_list, NULL);
-
-	if (!list)
-		return;
-
-	sched_ttwu_do_pending(list);
-}
-
-#endif /* CONFIG_HOTPLUG_CPU */
-
 void scheduler_ipi(void)
 {
-	struct rq *rq = this_rq();
-	struct task_struct *list = xchg(&rq->wake_list, NULL);
-
-	if (!list)
+	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
 		return;
 
 	/*
@@ -2608,25 +2746,21 @@
 	 * somewhat pessimize the simple resched case.
 	 */
 	irq_enter();
-	sched_ttwu_do_pending(list);
+	sched_ttwu_pending();
+
+	/*
+	 * Check if someone kicked us for doing the nohz idle load balance.
+	 */
+	if (unlikely(got_nohz_idle_kick() && !need_resched())) {
+		this_rq()->idle_balance = 1;
+		raise_softirq_irqoff(SCHED_SOFTIRQ);
+	}
 	irq_exit();
 }
 
 static void ttwu_queue_remote(struct task_struct *p, int cpu)
 {
-	struct rq *rq = cpu_rq(cpu);
-	struct task_struct *next = rq->wake_list;
-
-	for (;;) {
-		struct task_struct *old = next;
-
-		p->wake_entry = next;
-		next = cmpxchg(&rq->wake_list, old, p);
-		if (next == old)
-			break;
-	}
-
-	if (!next)
+	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
 		smp_send_reschedule(cpu);
 }
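
The wake_list conversion above leans on llist_add() returning true only when
the list was previously empty, so exactly one producer sends the IPI while
the consumer drains everything with a single llist_del_all().  A minimal
sketch of the same pattern for a generic deferred-work item; the names and
the kick_consumer() stub are invented for illustration:

#include <linux/llist.h>

struct remote_work {
	struct llist_node node;
	void (*fn)(struct remote_work *rw);
};

static struct llist_head pending;	/* one per consumer CPU in real code */

static void kick_consumer(void)
{
	/* Stand-in for smp_send_reschedule()/wake_up_process() in real code. */
}

static void queue_remote_work(struct remote_work *rw)
{
	/* Only the item that found the list empty needs to kick. */
	if (llist_add(&rw->node, &pending))
		kick_consumer();
}

static void drain_remote_work(void)
{
	struct llist_node *node = llist_del_all(&pending);

	while (node) {
		struct remote_work *rw =
			llist_entry(node, struct remote_work, node);

		node = llist_next(node);
		rw->fn(rw);
	}
}

Note that llist_del_all() hands the entries back in reverse insertion order,
which is fine here because, as in sched_ttwu_pending(), every pending item is
processed before returning.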
 
@@ -2848,19 +2982,23 @@
 	p->state = TASK_RUNNING;
 
 	/*
+	 * Make sure we do not leak PI boosting priority to the child.
+	 */
+	p->prio = current->normal_prio;
+
+	/*
 	 * Revert to default priority/policy on fork if requested.
 	 */
 	if (unlikely(p->sched_reset_on_fork)) {
-		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
+		if (task_has_rt_policy(p)) {
 			p->policy = SCHED_NORMAL;
-			p->normal_prio = p->static_prio;
-		}
-
-		if (PRIO_TO_NICE(p->static_prio) < 0) {
 			p->static_prio = NICE_TO_PRIO(0);
-			p->normal_prio = p->static_prio;
-			set_load_weight(p);
-		}
+			p->rt_priority = 0;
+		} else if (PRIO_TO_NICE(p->static_prio) < 0)
+			p->static_prio = NICE_TO_PRIO(0);
+
+		p->prio = p->normal_prio = __normal_prio(p);
+		set_load_weight(p);
 
 		/*
 		 * We don't need the reset flag anymore after the fork. It has
@@ -2869,11 +3007,6 @@
 		p->sched_reset_on_fork = 0;
 	}
 
-	/*
-	 * Make sure we do not leak PI boosting priority to the child.
-	 */
-	p->prio = current->normal_prio;
-
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
@@ -4116,7 +4249,7 @@
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
-	rq->idle_at_tick = idle_cpu(cpu);
+	rq->idle_balance = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
 #endif
 }
@@ -4213,6 +4346,7 @@
 	 */
 	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
 		__schedule_bug(prev);
+	rcu_sleep_check();
 
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 
@@ -4239,7 +4373,7 @@
 	 * Optimization: we know that if all tasks are in
 	 * the fair class we can call that function directly:
 	 */
-	if (likely(rq->nr_running == rq->cfs.nr_running)) {
+	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
 		p = fair_sched_class.pick_next_task(rq);
 		if (likely(p))
 			return p;
@@ -5025,7 +5159,20 @@
  */
 int idle_cpu(int cpu)
 {
-	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+	struct rq *rq = cpu_rq(cpu);
+
+	if (rq->curr != rq->idle)
+		return 0;
+
+	if (rq->nr_running)
+		return 0;
+
+#ifdef CONFIG_SMP
+	if (!llist_empty(&rq->wake_list))
+		return 0;
+#endif
+
+	return 1;
 }
 
 /**
@@ -5875,7 +6022,7 @@
 	printk(KERN_INFO
 		"  task                        PC stack   pid father\n");
 #endif
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	do_each_thread(g, p) {
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
@@ -5891,7 +6038,7 @@
 #ifdef CONFIG_SCHED_DEBUG
 	sysrq_sched_debug_show();
 #endif
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	/*
 	 * Only show locks if all tasks are dumped:
 	 */
@@ -5955,15 +6102,6 @@
 }
 
 /*
- * In a system that switches off the HZ timer nohz_cpu_mask
- * indicates which cpus entered this state. This is used
- * in the rcu update to wait only for active cpus. For system
- * which do not switch off the HZ timer nohz_cpu_mask should
- * always be CPU_BITS_NONE.
- */
-cpumask_var_t nohz_cpu_mask;
-
-/*
  * Increase the granularity value when there are more CPUs,
  * because with more CPUs the 'effective latency' as visible
  * to users decreases. But the relationship is not linear,
@@ -6015,10 +6153,9 @@
 {
 	if (p->sched_class && p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
-	else {
-		cpumask_copy(&p->cpus_allowed, new_mask);
-		p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
-	}
+
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
 /*
@@ -6116,7 +6253,7 @@
 	if (task_cpu(p) != src_cpu)
 		goto done;
 	/* Affinity changed (again). */
-	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 		goto fail;
 
 	/*
@@ -6197,6 +6334,30 @@
 	rq->calc_load_active = 0;
 }
 
+#ifdef CONFIG_CFS_BANDWIDTH
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
+{
+	struct cfs_rq *cfs_rq;
+
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+
+		if (!cfs_rq->runtime_enabled)
+			continue;
+
+		/*
+		 * clock_task is not advancing so we just need to make sure
+		 * there's some valid quota amount
+		 */
+		cfs_rq->runtime_remaining = cfs_b->quota;
+		if (cfs_rq_throttled(cfs_rq))
+			unthrottle_cfs_rq(cfs_rq);
+	}
+}
+#else
+static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+#endif
+
 /*
  * Migrate all tasks from the rq, sleeping tasks will be migrated by
  * try_to_wake_up()->select_task_rq().
@@ -6222,6 +6383,9 @@
 	 */
 	rq->stop = NULL;
 
+	/* Ensure any throttled groups are reachable by pick_next_task */
+	unthrottle_offline_cfs_rqs(rq);
+
 	for ( ; ; ) {
 		/*
 		 * There's this thread running, bail when that's the only
@@ -7965,6 +8129,7 @@
 	/* allow initial update_cfs_load() to truncate */
 	cfs_rq->load_stamp = 1;
 #endif
+	init_cfs_rq_runtime(cfs_rq);
 
 	tg->cfs_rq[cpu] = cfs_rq;
 	tg->se[cpu] = se;
@@ -8104,6 +8269,7 @@
 		 * We achieve this by letting root_task_group's tasks sit
 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
 		 */
+		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -8133,7 +8299,6 @@
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
 		rq->nohz_balance_kick = 0;
-		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
 #endif
 #endif
 		init_rq_hrtick(rq);
@@ -8175,8 +8340,6 @@
 	 */
 	current->sched_class = &fair_sched_class;
 
-	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-	zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
 #ifdef CONFIG_NO_HZ
@@ -8206,6 +8369,7 @@
 {
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
+	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
 	    system_state != SYSTEM_RUNNING || oops_in_progress)
 		return;
@@ -8345,6 +8509,8 @@
 {
 	int i;
 
+	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
 	for_each_possible_cpu(i) {
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
@@ -8372,6 +8538,8 @@
 
 	tg->shares = NICE_0_LOAD;
 
+	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
 	for_each_possible_cpu(i) {
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
@@ -8647,12 +8815,7 @@
 }
 #endif
 
-#ifdef CONFIG_RT_GROUP_SCHED
-/*
- * Ensure that the real time constraints are schedulable.
- */
-static DEFINE_MUTEX(rt_constraints_mutex);
-
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
 	if (runtime == RUNTIME_INF)
@@ -8660,6 +8823,13 @@
 
 	return div64_u64(runtime << 20, period);
 }
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
 
 /* Must be called with tasklist_lock held */
 static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8680,7 +8850,7 @@
 	u64 rt_runtime;
 };
 
-static int tg_schedulable(struct task_group *tg, void *data)
+static int tg_rt_schedulable(struct task_group *tg, void *data)
 {
 	struct rt_schedulable_data *d = data;
 	struct task_group *child;
@@ -8738,16 +8908,22 @@
 
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
+	int ret;
+
 	struct rt_schedulable_data data = {
 		.tg = tg,
 		.rt_period = period,
 		.rt_runtime = runtime,
 	};
 
-	return walk_tg_tree(tg_schedulable, tg_nop, &data);
+	rcu_read_lock();
+	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+	rcu_read_unlock();
+
+	return ret;
 }
 
-static int tg_set_bandwidth(struct task_group *tg,
+static int tg_set_rt_bandwidth(struct task_group *tg,
 		u64 rt_period, u64 rt_runtime)
 {
 	int i, err = 0;
@@ -8786,7 +8962,7 @@
 	if (rt_runtime_us < 0)
 		rt_runtime = RUNTIME_INF;
 
-	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
 long sched_group_rt_runtime(struct task_group *tg)
@@ -8811,7 +8987,7 @@
 	if (rt_period == 0)
 		return -EINVAL;
 
-	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
 long sched_group_rt_period(struct task_group *tg)
@@ -9001,6 +9177,238 @@
 
 	return (u64) scale_load_down(tg->shares);
 }
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static DEFINE_MUTEX(cfs_constraints_mutex);
+
+const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
+const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+	int i, ret = 0, runtime_enabled;
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	/*
+	 * Ensure we have at least some amount of bandwidth every period.  This is
+	 * to prevent reaching a state of large arrears when throttled via
+	 * entity_tick() resulting in prolonged exit starvation.
+	 */
+	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
+		return -EINVAL;
+
+	/*
+	 * Likewise, bound things on the other side by preventing insane quota
+	 * periods.  This also allows us to normalize in computing quota
+	 * feasibility.
+	 */
+	if (period > max_cfs_quota_period)
+		return -EINVAL;
+
+	mutex_lock(&cfs_constraints_mutex);
+	ret = __cfs_schedulable(tg, period, quota);
+	if (ret)
+		goto out_unlock;
+
+	runtime_enabled = quota != RUNTIME_INF;
+	raw_spin_lock_irq(&cfs_b->lock);
+	cfs_b->period = ns_to_ktime(period);
+	cfs_b->quota = quota;
+
+	__refill_cfs_bandwidth_runtime(cfs_b);
+	/* restart the period timer (if active) to handle new period expiry */
+	if (runtime_enabled && cfs_b->timer_active) {
+		/* force a reprogram */
+		cfs_b->timer_active = 0;
+		__start_cfs_bandwidth(cfs_b);
+	}
+	raw_spin_unlock_irq(&cfs_b->lock);
+
+	for_each_possible_cpu(i) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+		struct rq *rq = rq_of(cfs_rq);
+
+		raw_spin_lock_irq(&rq->lock);
+		cfs_rq->runtime_enabled = runtime_enabled;
+		cfs_rq->runtime_remaining = 0;
+
+		if (cfs_rq_throttled(cfs_rq))
+			unthrottle_cfs_rq(cfs_rq);
+		raw_spin_unlock_irq(&rq->lock);
+	}
+out_unlock:
+	mutex_unlock(&cfs_constraints_mutex);
+
+	return ret;
+}
+
+int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+{
+	u64 quota, period;
+
+	period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	if (cfs_quota_us < 0)
+		quota = RUNTIME_INF;
+	else
+		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+
+	return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_quota(struct task_group *tg)
+{
+	u64 quota_us;
+
+	if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
+		return -1;
+
+	quota_us = tg_cfs_bandwidth(tg)->quota;
+	do_div(quota_us, NSEC_PER_USEC);
+
+	return quota_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+	u64 quota, period;
+
+	period = (u64)cfs_period_us * NSEC_PER_USEC;
+	quota = tg_cfs_bandwidth(tg)->quota;
+
+	if (period <= 0)
+		return -EINVAL;
+
+	return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+	u64 cfs_period_us;
+
+	cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	do_div(cfs_period_us, NSEC_PER_USEC);
+
+	return cfs_period_us;
+}
+
+static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_quota(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
+				s64 cfs_quota_us)
+{
+	return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_period(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+				u64 cfs_period_us)
+{
+	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+}
+
+struct cfs_schedulable_data {
+	struct task_group *tg;
+	u64 period, quota;
+};
+
+/*
+ * normalize group quota/period to be quota/max_period
+ * note: units are usecs
+ */
+static u64 normalize_cfs_quota(struct task_group *tg,
+			       struct cfs_schedulable_data *d)
+{
+	u64 quota, period;
+
+	if (tg == d->tg) {
+		period = d->period;
+		quota = d->quota;
+	} else {
+		period = tg_get_cfs_period(tg);
+		quota = tg_get_cfs_quota(tg);
+	}
+
+	/* note: these should typically be equivalent */
+	if (quota == RUNTIME_INF || quota == -1)
+		return RUNTIME_INF;
+
+	return to_ratio(period, quota);
+}
+
+static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
+{
+	struct cfs_schedulable_data *d = data;
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	s64 quota = 0, parent_quota = -1;
+
+	if (!tg->parent) {
+		quota = RUNTIME_INF;
+	} else {
+		struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
+
+		quota = normalize_cfs_quota(tg, d);
+		parent_quota = parent_b->hierarchal_quota;
+
+		/*
+		 * ensure max(child_quota) <= parent_quota, inherit when no
+		 * limit is set
+		 */
+		if (quota == RUNTIME_INF)
+			quota = parent_quota;
+		else if (parent_quota != RUNTIME_INF && quota > parent_quota)
+			return -EINVAL;
+	}
+	cfs_b->hierarchal_quota = quota;
+
+	return 0;
+}
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
+{
+	int ret;
+	struct cfs_schedulable_data data = {
+		.tg = tg,
+		.period = period,
+		.quota = quota,
+	};
+
+	if (quota != RUNTIME_INF) {
+		do_div(data.period, NSEC_PER_USEC);
+		do_div(data.quota, NSEC_PER_USEC);
+	}
+
+	rcu_read_lock();
+	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
+		struct cgroup_map_cb *cb)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+
+	cb->fill(cb, "nr_periods", cfs_b->nr_periods);
+	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
+	cb->fill(cb, "throttled_time", cfs_b->throttled_time);
+
+	return 0;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -9035,6 +9443,22 @@
 		.write_u64 = cpu_shares_write_u64,
 	},
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "cfs_quota_us",
+		.read_s64 = cpu_cfs_quota_read_s64,
+		.write_s64 = cpu_cfs_quota_write_s64,
+	},
+	{
+		.name = "cfs_period_us",
+		.read_u64 = cpu_cfs_period_read_u64,
+		.write_u64 = cpu_cfs_period_write_u64,
+	},
+	{
+		.name = "stat",
+		.read_map = cpu_stats_show,
+	},
+#endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
 		.name = "rt_runtime_us",
@@ -9344,4 +9768,3 @@
 	.subsys_id = cpuacct_subsys_id,
 };
 #endif	/* CONFIG_CGROUP_CPUACCT */
-
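
The cpu-cgroup files exported above (cfs_quota_us, cfs_period_us and the stat
map) are the user-space side of CFS bandwidth control.  A minimal sketch of
capping a group to half a CPU; the mount point and the "capped" group
directory are assumptions for illustration, not part of this patch:

#include <stdio.h>
#include <stdlib.h>

/* Assumed layout: cpu controller mounted here, group already created. */
#define GRP "/sys/fs/cgroup/cpu/capped/"

static void write_val(const char *file, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), GRP "%s", file);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		exit(1);
	}
	fprintf(f, "%s\n", val);
	fclose(f);
}

int main(void)
{
	/* 50 ms of runtime per 100 ms period: at most half a CPU overall. */
	write_val("cpu.cfs_period_us", "100000");
	write_val("cpu.cfs_quota_us", "50000");
	return 0;
}

Writing a negative quota restores the unlimited (RUNTIME_INF) default, and
reading cpu.stat from the same directory reports the nr_periods, nr_throttled
and throttled_time counters filled in by cpu_stats_show().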
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 2722dc1..a86cf9d 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,6 @@
 	return cpupri;
 }
 
-#define for_each_cpupri_active(array, idx)                    \
-	for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
-
 /**
  * cpupri_find - find the best (lowest-pri) CPU in the system
  * @cp: The cpupri context
@@ -71,11 +68,38 @@
 	int                  idx      = 0;
 	int                  task_pri = convert_prio(p->prio);
 
-	for_each_cpupri_active(cp->pri_active, idx) {
-		struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
+	if (task_pri >= MAX_RT_PRIO)
+		return 0;
 
-		if (idx >= task_pri)
-			break;
+	for (idx = 0; idx < task_pri; idx++) {
+		struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
+		int skip = 0;
+
+		if (!atomic_read(&(vec)->count))
+			skip = 1;
+		/*
+		 * When looking at the vector, we need to read the counter,
+		 * do a memory barrier, then read the mask.
+		 *
+		 * Note: This is still all racy, but we can deal with it.
+		 *  Ideally, we only want to look at masks that are set.
+		 *
+		 *  If a mask is not set, then the only thing wrong is that we
+		 *  did a little more work than necessary.
+		 *
+		 *  If we read a zero count but the mask is set, because of the
+		 *  memory barriers, that can only happen when the highest prio
+		 *  task for a run queue has left the run queue, in which case,
+		 *  it will be followed by a pull. If the task we are processing
+		 *  fails to find a proper place to go, that pull request will
+		 *  pull this task if the run queue is running at a lower
+		 *  priority.
+		 */
+		smp_rmb();
+
+		/* Need to do the rmb for every iteration */
+		if (skip)
+			continue;
 
 		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
 			continue;
@@ -115,7 +139,7 @@
 {
 	int                 *currpri = &cp->cpu_to_pri[cpu];
 	int                  oldpri  = *currpri;
-	unsigned long        flags;
+	int                  do_mb = 0;
 
 	newpri = convert_prio(newpri);
 
@@ -128,32 +152,46 @@
 	 * If the cpu was currently mapped to a different value, we
 	 * need to map it to the new value then remove the old value.
 	 * Note, we must add the new value first, otherwise we risk the
-	 * cpu being cleared from pri_active, and this cpu could be
-	 * missed for a push or pull.
+	 * cpu being missed by the priority loop in cpupri_find.
 	 */
 	if (likely(newpri != CPUPRI_INVALID)) {
 		struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
 
-		raw_spin_lock_irqsave(&vec->lock, flags);
-
 		cpumask_set_cpu(cpu, vec->mask);
-		vec->count++;
-		if (vec->count == 1)
-			set_bit(newpri, cp->pri_active);
-
-		raw_spin_unlock_irqrestore(&vec->lock, flags);
+		/*
+		 * When adding a new vector, we update the mask first,
+		 * do a write memory barrier, and then update the count, to
+		 * make sure the vector is visible when count is set.
+		 */
+		smp_mb__before_atomic_inc();
+		atomic_inc(&(vec)->count);
+		do_mb = 1;
 	}
 	if (likely(oldpri != CPUPRI_INVALID)) {
 		struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
 
-		raw_spin_lock_irqsave(&vec->lock, flags);
+		/*
+		 * Because the order of modification of the vec->count
+		 * is important, we must make sure that the update
+		 * of the new prio is seen before we decrement the
+		 * old prio. This makes sure that the loop sees
+		 * one or the other when we raise the priority of
+		 * the run queue. We don't care about when we lower the
+		 * priority, as that will trigger an rt pull anyway.
+		 *
+		 * We only need to do a memory barrier if we updated
+		 * the new priority vec.
+		 */
+		if (do_mb)
+			smp_mb__after_atomic_inc();
 
-		vec->count--;
-		if (!vec->count)
-			clear_bit(oldpri, cp->pri_active);
+		/*
+		 * When removing from the vector, we decrement the counter first,
+		 * do a memory barrier, and then clear the mask.
+		 */
+		atomic_dec(&(vec)->count);
+		smp_mb__after_atomic_inc();
 		cpumask_clear_cpu(cpu, vec->mask);
-
-		raw_spin_unlock_irqrestore(&vec->lock, flags);
 	}
 
 	*currpri = newpri;
@@ -175,8 +213,7 @@
 	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
 		struct cpupri_vec *vec = &cp->pri_to_cpu[i];
 
-		raw_spin_lock_init(&vec->lock);
-		vec->count = 0;
+		atomic_set(&vec->count, 0);
 		if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
 			goto cleanup;
 	}
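
The ordering contract spelled out in the comments above (writer: update the
mask, full barrier, bump the count; reader: check the count, barrier, read
the mask) is what lets cpupri_find() walk the vectors without the old
per-vector lock.  A stand-alone sketch of the same publish/consume shape
using C11 atomics instead of the kernel's smp_mb() helpers (illustration
only, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>

struct vec {
	atomic_int count;		/* number of published entries */
	_Atomic unsigned long mask;	/* stand-in for the cpumask */
};

static void publish(struct vec *v, int bit)
{
	/* Writer: make the mask bit visible before the count increment. */
	atomic_fetch_or_explicit(&v->mask, 1UL << bit, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_mb() */
	atomic_fetch_add_explicit(&v->count, 1, memory_order_relaxed);
}

static bool lookup(struct vec *v, int bit)
{
	/* Reader: skip empty vectors, as cpupri_find() does. */
	if (!atomic_load_explicit(&v->count, memory_order_relaxed))
		return false;
	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_rmb() */
	/* Seeing the count guarantees the mask update is visible too. */
	return atomic_load_explicit(&v->mask, memory_order_relaxed)
		& (1UL << bit);
}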
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9fc7d38..f6d7561 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -4,7 +4,6 @@
 #include <linux/sched.h>
 
 #define CPUPRI_NR_PRIORITIES	(MAX_RT_PRIO + 2)
-#define CPUPRI_NR_PRI_WORDS	BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
 
 #define CPUPRI_INVALID -1
 #define CPUPRI_IDLE     0
@@ -12,14 +11,12 @@
 /* values 2-101 are RT priorities 0-99 */
 
 struct cpupri_vec {
-	raw_spinlock_t lock;
-	int        count;
-	cpumask_var_t mask;
+	atomic_t	count;
+	cpumask_var_t	mask;
 };
 
 struct cpupri {
 	struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
-	long              pri_active[CPUPRI_NR_PRI_WORDS];
 	int               cpu_to_pri[NR_CPUS];
 };
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee99..5c9e679 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,20 @@
  */
 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
+ * each time a cfs_rq requests quota.
+ *
+ * Note: in the case that the slice exceeds the runtime remaining (either due
+ * to consumption or the quota being specified to be smaller than the slice)
+ * we will always only issue the remaining available time.
+ *
+ * default: 5 msec, units: microseconds
+ */
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+#endif
+
 static const struct sched_class fair_sched_class;
 
 /**************************************************************
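
The slice sysctl introduced above sets how much of the group-wide quota each
cfs_rq pulls into its local pool per top-up; the actual transfer lives in
assign_cfs_rq_runtime() further down in this file.  A tiny stand-alone sketch
of that top-up rule (not the kernel code; values in microseconds):

/* The local pool asks for one slice plus whatever deficit it has already
 * accumulated, and receives at most what the global pool still holds. */
static long long refill(long long *local_remaining, long long *global_runtime,
			long long slice)
{
	long long want = slice - *local_remaining;	/* remaining <= 0 here */
	long long got = want < *global_runtime ? want : *global_runtime;

	*global_runtime -= got;
	*local_remaining += got;
	return got;
}

With the default 5000 us slice and, say, a 20000 us quota per 100000 us
period, one busy cfs_rq drains the period's runtime in roughly four such
refills; once the global pool is empty the next shortfall leads to throttling
rather than another top-up.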
@@ -292,6 +306,8 @@
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				   unsigned long delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -583,6 +599,8 @@
 		cpuacct_charge(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
 	}
+
+	account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
 static inline void
@@ -688,6 +706,8 @@
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+/* we need this in update_cfs_load and load-balance functions below */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 # ifdef CONFIG_SMP
 static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
 					    int global_update)
@@ -710,7 +730,7 @@
 	u64 now, delta;
 	unsigned long load = cfs_rq->load.weight;
 
-	if (cfs_rq->tg == &root_task_group)
+	if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
 		return;
 
 	now = rq_of(cfs_rq)->clock_task;
@@ -819,7 +839,7 @@
 
 	tg = cfs_rq->tg;
 	se = tg->se[cpu_of(rq_of(cfs_rq))];
-	if (!se)
+	if (!se || throttled_hierarchy(cfs_rq))
 		return;
 #ifndef CONFIG_SMP
 	if (likely(se->load.weight == tg->shares))
@@ -950,6 +970,8 @@
 	se->vruntime = vruntime;
 }
 
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -979,8 +1001,10 @@
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
 
-	if (cfs_rq->nr_running == 1)
+	if (cfs_rq->nr_running == 1) {
 		list_add_leaf_cfs_rq(cfs_rq);
+		check_enqueue_throttle(cfs_rq);
+	}
 }
 
 static void __clear_buddies_last(struct sched_entity *se)
@@ -1028,6 +1052,8 @@
 		__clear_buddies_skip(se);
 }
 
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -1066,6 +1092,9 @@
 	if (!(flags & DEQUEUE_SLEEP))
 		se->vruntime -= cfs_rq->min_vruntime;
 
+	/* return excess runtime on last dequeue */
+	return_cfs_rq_runtime(cfs_rq);
+
 	update_min_vruntime(cfs_rq);
 	update_cfs_shares(cfs_rq);
 }
@@ -1077,6 +1106,8 @@
 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
 	unsigned long ideal_runtime, delta_exec;
+	struct sched_entity *se;
+	s64 delta;
 
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -1095,22 +1126,17 @@
 	 * narrow margin doesn't have to wait for a full slice.
 	 * This also mitigates buddy induced latencies under load.
 	 */
-	if (!sched_feat(WAKEUP_PREEMPT))
-		return;
-
 	if (delta_exec < sysctl_sched_min_granularity)
 		return;
 
-	if (cfs_rq->nr_running > 1) {
-		struct sched_entity *se = __pick_first_entity(cfs_rq);
-		s64 delta = curr->vruntime - se->vruntime;
+	se = __pick_first_entity(cfs_rq);
+	delta = curr->vruntime - se->vruntime;
 
-		if (delta < 0)
-			return;
+	if (delta < 0)
+		return;
 
-		if (delta > ideal_runtime)
-			resched_task(rq_of(cfs_rq)->curr);
-	}
+	if (delta > ideal_runtime)
+		resched_task(rq_of(cfs_rq)->curr);
 }
 
 static void
@@ -1185,6 +1211,8 @@
 	return se;
 }
 
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
 	/*
@@ -1194,6 +1222,9 @@
 	if (prev->on_rq)
 		update_curr(cfs_rq);
 
+	/* throttle cfs_rqs exceeding runtime */
+	check_cfs_rq_runtime(cfs_rq);
+
 	check_spread(cfs_rq, prev);
 	if (prev->on_rq) {
 		update_stats_wait_start(cfs_rq, prev);
@@ -1233,10 +1264,583 @@
 		return;
 #endif
 
-	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
+	if (cfs_rq->nr_running > 1)
 		check_preempt_tick(cfs_rq, curr);
 }
 
+
+/**************************************************
+ * CFS bandwidth control machinery
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * default period for cfs group bandwidth.
+ * default: 0.1s, units: nanoseconds
+ */
+static inline u64 default_cfs_period(void)
+{
+	return 100000000ULL;
+}
+
+static inline u64 sched_cfs_bandwidth_slice(void)
+{
+	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
+}
+
+/*
+ * Replenish runtime according to assigned quota and update expiration time.
+ * We use sched_clock_cpu directly instead of rq->clock to avoid adding
+ * additional synchronization around rq->lock.
+ *
+ * requires cfs_b->lock
+ */
+static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+{
+	u64 now;
+
+	if (cfs_b->quota == RUNTIME_INF)
+		return;
+
+	now = sched_clock_cpu(smp_processor_id());
+	cfs_b->runtime = cfs_b->quota;
+	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+}
+
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	u64 amount = 0, min_amount, expires;
+
+	/* note: this is a positive sum as runtime_remaining <= 0 */
+	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota == RUNTIME_INF)
+		amount = min_amount;
+	else {
+		/*
+		 * If the bandwidth pool has become inactive, then at least one
+		 * period must have elapsed since the last consumption.
+		 * Refresh the global state and ensure bandwidth timer becomes
+		 * active.
+		 */
+		if (!cfs_b->timer_active) {
+			__refill_cfs_bandwidth_runtime(cfs_b);
+			__start_cfs_bandwidth(cfs_b);
+		}
+
+		if (cfs_b->runtime > 0) {
+			amount = min(cfs_b->runtime, min_amount);
+			cfs_b->runtime -= amount;
+			cfs_b->idle = 0;
+		}
+	}
+	expires = cfs_b->runtime_expires;
+	raw_spin_unlock(&cfs_b->lock);
+
+	cfs_rq->runtime_remaining += amount;
+	/*
+	 * we may have advanced our local expiration to account for allowed
+	 * spread between our sched_clock and the one on which runtime was
+	 * issued.
+	 */
+	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+		cfs_rq->runtime_expires = expires;
+
+	return cfs_rq->runtime_remaining > 0;
+}
+
+/*
+ * Note: This depends on the synchronization provided by sched_clock and the
+ * fact that rq->clock snapshots this value.
+ */
+static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct rq *rq = rq_of(cfs_rq);
+
+	/* if the deadline is ahead of our clock, nothing to do */
+	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
+		return;
+
+	if (cfs_rq->runtime_remaining < 0)
+		return;
+
+	/*
+	 * If the local deadline has passed we have to consider the
+	 * possibility that our sched_clock is 'fast' and the global deadline
+	 * has not truly expired.
+	 *
+	 * Fortunately we can determine whether this is the case by checking
+	 * whether the global deadline has advanced.
+	 */
+
+	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+		/* extend local deadline, drift is bounded above by 2 ticks */
+		cfs_rq->runtime_expires += TICK_NSEC;
+	} else {
+		/* global deadline is ahead, expiration has passed */
+		cfs_rq->runtime_remaining = 0;
+	}
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				     unsigned long delta_exec)
+{
+	/* dock delta_exec before expiring quota (as it could span periods) */
+	cfs_rq->runtime_remaining -= delta_exec;
+	expire_cfs_rq_runtime(cfs_rq);
+
+	if (likely(cfs_rq->runtime_remaining > 0))
+		return;
+
+	/*
+	 * if we're unable to extend our runtime we resched so that the active
+	 * hierarchy can be throttled
+	 */
+	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
+		resched_task(rq_of(cfs_rq)->curr);
+}
+
+static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+						   unsigned long delta_exec)
+{
+	if (!cfs_rq->runtime_enabled)
+		return;
+
+	__account_cfs_rq_runtime(cfs_rq, delta_exec);
+}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->throttled;
+}
+
+/* check whether cfs_rq, or any parent, is throttled */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->throttle_count;
+}
+
+/*
+ * Ensure that neither of the group entities corresponding to src_cpu or
+ * dest_cpu are members of a throttled hierarchy when performing group
+ * load-balance operations.
+ */
+static inline int throttled_lb_pair(struct task_group *tg,
+				    int src_cpu, int dest_cpu)
+{
+	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+
+	src_cfs_rq = tg->cfs_rq[src_cpu];
+	dest_cfs_rq = tg->cfs_rq[dest_cpu];
+
+	return throttled_hierarchy(src_cfs_rq) ||
+	       throttled_hierarchy(dest_cfs_rq);
+}
+
+/* updated child weight may affect parent so we have to do this bottom up */
+static int tg_unthrottle_up(struct task_group *tg, void *data)
+{
+	struct rq *rq = data;
+	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+	cfs_rq->throttle_count--;
+#ifdef CONFIG_SMP
+	if (!cfs_rq->throttle_count) {
+		u64 delta = rq->clock_task - cfs_rq->load_stamp;
+
+		/* leaving throttled state, advance shares averaging windows */
+		cfs_rq->load_stamp += delta;
+		cfs_rq->load_last += delta;
+
+		/* update entity weight now that we are on_rq again */
+		update_cfs_shares(cfs_rq);
+	}
+#endif
+
+	return 0;
+}
+
+static int tg_throttle_down(struct task_group *tg, void *data)
+{
+	struct rq *rq = data;
+	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+	/* group is entering throttled state, record last load */
+	if (!cfs_rq->throttle_count)
+		update_cfs_load(cfs_rq, 0);
+	cfs_rq->throttle_count++;
+
+	return 0;
+}
+
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct sched_entity *se;
+	long task_delta, dequeue = 1;
+
+	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+	/* account load preceding throttle */
+	rcu_read_lock();
+	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
+	rcu_read_unlock();
+
+	task_delta = cfs_rq->h_nr_running;
+	for_each_sched_entity(se) {
+		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+		/* throttled entity or throttle-on-deactivate */
+		if (!se->on_rq)
+			break;
+
+		if (dequeue)
+			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+		qcfs_rq->h_nr_running -= task_delta;
+
+		if (qcfs_rq->load.weight)
+			dequeue = 0;
+	}
+
+	if (!se)
+		rq->nr_running -= task_delta;
+
+	cfs_rq->throttled = 1;
+	cfs_rq->throttled_timestamp = rq->clock;
+	raw_spin_lock(&cfs_b->lock);
+	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	raw_spin_unlock(&cfs_b->lock);
+}
+
+static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct sched_entity *se;
+	int enqueue = 1;
+	long task_delta;
+
+	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+	cfs_rq->throttled = 0;
+	raw_spin_lock(&cfs_b->lock);
+	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
+	list_del_rcu(&cfs_rq->throttled_list);
+	raw_spin_unlock(&cfs_b->lock);
+	cfs_rq->throttled_timestamp = 0;
+
+	update_rq_clock(rq);
+	/* update hierarchical throttle state */
+	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
+
+	if (!cfs_rq->load.weight)
+		return;
+
+	task_delta = cfs_rq->h_nr_running;
+	for_each_sched_entity(se) {
+		if (se->on_rq)
+			enqueue = 0;
+
+		cfs_rq = cfs_rq_of(se);
+		if (enqueue)
+			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+		cfs_rq->h_nr_running += task_delta;
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+	}
+
+	if (!se)
+		rq->nr_running += task_delta;
+
+	/* determine whether we need to wake up potentially idle cpu */
+	if (rq->curr == rq->idle && rq->cfs.nr_running)
+		resched_task(rq->curr);
+}
+
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
+		u64 remaining, u64 expires)
+{
+	struct cfs_rq *cfs_rq;
+	u64 runtime = remaining;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
+				throttled_list) {
+		struct rq *rq = rq_of(cfs_rq);
+
+		raw_spin_lock(&rq->lock);
+		if (!cfs_rq_throttled(cfs_rq))
+			goto next;
+
+		runtime = -cfs_rq->runtime_remaining + 1;
+		if (runtime > remaining)
+			runtime = remaining;
+		remaining -= runtime;
+
+		cfs_rq->runtime_remaining += runtime;
+		cfs_rq->runtime_expires = expires;
+
+		/* we check whether we're throttled above */
+		if (cfs_rq->runtime_remaining > 0)
+			unthrottle_cfs_rq(cfs_rq);
+
+next:
+		raw_spin_unlock(&rq->lock);
+
+		if (!remaining)
+			break;
+	}
+	rcu_read_unlock();
+
+	return remaining;
+}
+
+/*
+ * Responsible for refilling a task_group's bandwidth and unthrottling its
+ * cfs_rqs as appropriate. If there has been no activity within the last
+ * period the timer is deactivated until scheduling resumes; cfs_b->idle is
+ * used to track this state.
+ */
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+{
+	u64 runtime, runtime_expires;
+	int idle = 1, throttled;
+
+	raw_spin_lock(&cfs_b->lock);
+	/* no need to continue the timer with no bandwidth constraint */
+	if (cfs_b->quota == RUNTIME_INF)
+		goto out_unlock;
+
+	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+	/* idle depends on !throttled (for the case of a large deficit) */
+	idle = cfs_b->idle && !throttled;
+	cfs_b->nr_periods += overrun;
+
+	/* if we're going inactive then everything else can be deferred */
+	if (idle)
+		goto out_unlock;
+
+	__refill_cfs_bandwidth_runtime(cfs_b);
+
+	if (!throttled) {
+		/* mark as potentially idle for the upcoming period */
+		cfs_b->idle = 1;
+		goto out_unlock;
+	}
+
+	/* account preceding periods in which throttling occurred */
+	cfs_b->nr_throttled += overrun;
+
+	/*
+	 * There are throttled entities so we must first use the new bandwidth
+	 * to unthrottle them before making it generally available.  This
+	 * ensures that all existing debts will be paid before a new cfs_rq is
+	 * allowed to run.
+	 */
+	runtime = cfs_b->runtime;
+	runtime_expires = cfs_b->runtime_expires;
+	cfs_b->runtime = 0;
+
+	/*
+	 * This check is repeated as we are holding onto the new bandwidth
+	 * while we unthrottle.  This can potentially race with an unthrottled
+	 * group trying to acquire new bandwidth from the global pool.
+	 */
+	while (throttled && runtime > 0) {
+		raw_spin_unlock(&cfs_b->lock);
+		/* we can't nest cfs_b->lock while distributing bandwidth */
+		runtime = distribute_cfs_runtime(cfs_b, runtime,
+						 runtime_expires);
+		raw_spin_lock(&cfs_b->lock);
+
+		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+	}
+
+	/* return (any) remaining runtime */
+	cfs_b->runtime = runtime;
+	/*
+	 * While we are ensured activity in the period following an
+	 * unthrottle, this also covers the case in which the new bandwidth is
+	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
+	 * timer to remain active while there are any throttled entities.)
+	 */
+	cfs_b->idle = 0;
+out_unlock:
+	if (idle)
+		cfs_b->timer_active = 0;
+	raw_spin_unlock(&cfs_b->lock);
+
+	return idle;
+}
+
+/* a cfs_rq won't donate quota below this amount */
+static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
+/* minimum remaining period time to redistribute slack quota */
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
+/* how long we wait to gather additional slack before distributing */
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+/* are we near the end of the current quota period? */
+static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+{
+	struct hrtimer *refresh_timer = &cfs_b->period_timer;
+	u64 remaining;
+
+	/* if the call-back is running a quota refresh is already occurring */
+	if (hrtimer_callback_running(refresh_timer))
+		return 1;
+
+	/* is a quota refresh about to occur? */
+	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
+	if (remaining < min_expire)
+		return 1;
+
+	return 0;
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
+
+	/* if there's a quota refresh soon don't bother with slack */
+	if (runtime_refresh_within(cfs_b, min_left))
+		return;
+
+	start_bandwidth_timer(&cfs_b->slack_timer,
+				ns_to_ktime(cfs_bandwidth_slack_period));
+}
+
+/* we know any runtime found here is valid as update_curr() precedes return */
+static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
+
+	if (slack_runtime <= 0)
+		return;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota != RUNTIME_INF &&
+	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+		cfs_b->runtime += slack_runtime;
+
+		/* we are under rq->lock, defer unthrottling using a timer */
+		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
+		    !list_empty(&cfs_b->throttled_cfs_rq))
+			start_cfs_slack_bandwidth(cfs_b);
+	}
+	raw_spin_unlock(&cfs_b->lock);
+
+	/* even if it's not valid for return we don't want to try again */
+	cfs_rq->runtime_remaining -= slack_runtime;
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+		return;
+
+	__return_cfs_rq_runtime(cfs_rq);
+}
+
+/*
+ * This is done with a timer (instead of inline with bandwidth return) since
+ * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
+ */
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+	u64 expires;
+
+	/* confirm we're still not at a refresh boundary */
+	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+		return;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+		runtime = cfs_b->runtime;
+		cfs_b->runtime = 0;
+	}
+	expires = cfs_b->runtime_expires;
+	raw_spin_unlock(&cfs_b->lock);
+
+	if (!runtime)
+		return;
+
+	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+
+	raw_spin_lock(&cfs_b->lock);
+	if (expires == cfs_b->runtime_expires)
+		cfs_b->runtime = runtime;
+	raw_spin_unlock(&cfs_b->lock);
+}
+
+/*
+ * When a group wakes up we want to make sure that its quota is not already
+ * expired/exceeded, otherwise it may be allowed to steal additional ticks of
+ * runtime as update_curr() throttling can not trigger until it's on-rq.
+ */
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
+{
+	/* an active group must be handled by the update_curr()->put() path */
+	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+		return;
+
+	/* ensure the group is not already throttled */
+	if (cfs_rq_throttled(cfs_rq))
+		return;
+
+	/* update runtime allocation */
+	account_cfs_rq_runtime(cfs_rq, 0);
+	if (cfs_rq->runtime_remaining <= 0)
+		throttle_cfs_rq(cfs_rq);
+}
+
+/* conditionally throttle active cfs_rq's from put_prev_entity() */
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+		return;
+
+	/*
+	 * it's possible for a throttled entity to be forced into a running
+	 * state (e.g. set_curr_task), in this case we're finished.
+	 */
+	if (cfs_rq_throttled(cfs_rq))
+		return;
+
+	throttle_cfs_rq(cfs_rq);
+}
+#else
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				     unsigned long delta_exec) {}
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
+static inline int throttled_lb_pair(struct task_group *tg,
+				    int src_cpu, int dest_cpu)
+{
+	return 0;
+}
+#endif
+
 /**************************************************
  * CFS operations on tasks:
  */
@@ -1313,16 +1917,33 @@
 			break;
 		cfs_rq = cfs_rq_of(se);
 		enqueue_entity(cfs_rq, se, flags);
+
+		/*
+		 * end evaluation on encountering a throttled cfs_rq
+		 *
+		 * note: in the case of encountering a throttled cfs_rq we will
+		 * post the final h_nr_running increment below.
+		 */
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+		cfs_rq->h_nr_running++;
+
 		flags = ENQUEUE_WAKEUP;
 	}
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
+		cfs_rq->h_nr_running++;
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 	}
 
+	if (!se)
+		inc_nr_running(rq);
 	hrtick_update(rq);
 }
 
@@ -1343,6 +1964,16 @@
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
 
+		/*
+		 * end evaluation on encountering a throttled cfs_rq
+		 *
+		 * note: in the case of encountering a throttled cfs_rq we will
+		 * post the final h_nr_running decrement below.
+		 */
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+		cfs_rq->h_nr_running--;
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
 			/*
@@ -1361,11 +1992,17 @@
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
+		cfs_rq->h_nr_running--;
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 	}
 
+	if (!se)
+		dec_nr_running(rq);
 	hrtick_update(rq);
 }
 
@@ -1434,7 +2071,6 @@
 
 	return wl;
 }
-
 #else
 
 static inline unsigned long effective_load(struct task_group *tg, int cpu,
@@ -1547,7 +2183,7 @@
 
 		/* Skip over this group if it has no CPUs allowed */
 		if (!cpumask_intersects(sched_group_cpus(group),
-					&p->cpus_allowed))
+					tsk_cpus_allowed(p)))
 			continue;
 
 		local_group = cpumask_test_cpu(this_cpu,
@@ -1593,7 +2229,7 @@
 	int i;
 
 	/* Traverse only the allowed CPUs */
-	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
 		load = weighted_cpuload(i);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1637,7 +2273,7 @@
 		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
 			break;
 
-		for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+		for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
 			if (idle_cpu(i)) {
 				target = i;
 				break;
@@ -1680,7 +2316,7 @@
 	int sync = wake_flags & WF_SYNC;
 
 	if (sd_flag & SD_BALANCE_WAKE) {
-		if (cpumask_test_cpu(cpu, &p->cpus_allowed))
+		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
 			want_affine = 1;
 		new_cpu = prev_cpu;
 	}
@@ -1875,6 +2511,15 @@
 	if (unlikely(se == pse))
 		return;
 
+	/*
+	 * This is possible from callers such as pull_task(), in which we
+	 * unconditionally check_preempt_curr() after an enqueue (which may have
+	 * led to a throttle).  This both saves work and prevents false
+	 * next-buddy nomination below.
+	 */
+	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+		return;
+
 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
 		set_next_buddy(pse);
 		next_buddy_marked = 1;
@@ -1883,6 +2528,12 @@
 	/*
 	 * We can come here with TIF_NEED_RESCHED already set from new task
 	 * wake up path.
+	 *
+	 * Note: this also catches the edge-case of curr being in a throttled
+	 * group (e.g. via set_curr_task), since update_curr() (in the
+	 * enqueue of curr) will have resulted in resched being set.  This
+	 * prevents us from potentially nominating it as a false LAST_BUDDY
+	 * below.
 	 */
 	if (test_tsk_need_resched(curr))
 		return;
@@ -1899,10 +2550,6 @@
 	if (unlikely(p->policy != SCHED_NORMAL))
 		return;
 
-
-	if (!sched_feat(WAKEUP_PREEMPT))
-		return;
-
 	find_matching_se(&se, &pse);
 	update_curr(cfs_rq_of(se));
 	BUG_ON(!pse);
@@ -2005,7 +2652,8 @@
 {
 	struct sched_entity *se = &p->se;
 
-	if (!se->on_rq)
+	/* throttled hierarchies are not runnable */
+	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
 		return false;
 
 	/* Tell the scheduler that we'd really like pse to run next. */
@@ -2049,7 +2697,7 @@
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
+	if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 		return 0;
 	}
@@ -2102,6 +2750,9 @@
 
 	for_each_leaf_cfs_rq(busiest, cfs_rq) {
 		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
+			if (throttled_lb_pair(task_group(p),
+					      busiest->cpu, this_cpu))
+				break;
 
 			if (!can_migrate_task(p, busiest, this_cpu,
 						sd, idle, &pinned))
@@ -2217,8 +2868,13 @@
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq)
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		/* throttled entities do not contribute to load */
+		if (throttled_hierarchy(cfs_rq))
+			continue;
+
 		update_shares_cpu(cfs_rq->tg, cpu);
+	}
 	rcu_read_unlock();
 }
 
@@ -2268,9 +2924,10 @@
 		u64 rem_load, moved_load;
 
 		/*
-		 * empty group
+		 * empty group or part of a throttled hierarchy
 		 */
-		if (!busiest_cfs_rq->task_weight)
+		if (!busiest_cfs_rq->task_weight ||
+		    throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
 			continue;
 
 		rem_load = (u64)rem_load_move * busiest_weight;
@@ -3430,7 +4087,7 @@
 			 * moved to this_cpu
 			 */
 			if (!cpumask_test_cpu(this_cpu,
-					      &busiest->curr->cpus_allowed)) {
+					tsk_cpus_allowed(busiest->curr))) {
 				raw_spin_unlock_irqrestore(&busiest->lock,
 							    flags);
 				all_pinned = 1;
@@ -3612,22 +4269,6 @@
 }
 
 #ifdef CONFIG_NO_HZ
-
-static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
-
-static void trigger_sched_softirq(void *data)
-{
-	raise_softirq_irqoff(SCHED_SOFTIRQ);
-}
-
-static inline void init_sched_softirq_csd(struct call_single_data *csd)
-{
-	csd->func = trigger_sched_softirq;
-	csd->info = NULL;
-	csd->flags = 0;
-	csd->priv = 0;
-}
-
 /*
  * idle load balancing details
  * - One of the idle CPUs nominates itself as idle load_balancer, while
@@ -3667,7 +4308,7 @@
 	struct sched_domain *sd;
 
 	for_each_domain(cpu, sd)
-		if (sd && (sd->flags & flag))
+		if (sd->flags & flag)
 			break;
 
 	return sd;
@@ -3793,11 +4434,16 @@
 	}
 
 	if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
-		struct call_single_data *cp;
-
 		cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
-		cp = &per_cpu(remote_sched_softirq_cb, cpu);
-		__smp_call_function_single(ilb_cpu, cp, 0);
+
+		smp_mb();
+		/*
+		 * Use smp_send_reschedule() instead of resched_cpu().
+		 * This way we generate a sched IPI on the target cpu which
+		 * is idle. And the softirq performing nohz idle load balance
+		 * will be run before returning from the IPI.
+		 */
+		smp_send_reschedule(ilb_cpu);
 	}
 	return;
 }
@@ -4030,7 +4676,7 @@
 	if (time_before(now, nohz.next_balance))
 		return 0;
 
-	if (rq->idle_at_tick)
+	if (idle_cpu(cpu))
 		return 0;
 
 	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -4066,7 +4712,7 @@
 {
 	int this_cpu = smp_processor_id();
 	struct rq *this_rq = cpu_rq(this_cpu);
-	enum cpu_idle_type idle = this_rq->idle_at_tick ?
+	enum cpu_idle_type idle = this_rq->idle_balance ?
 						CPU_IDLE : CPU_NOT_IDLE;
 
 	rebalance_domains(this_cpu, idle);
@@ -4251,8 +4897,13 @@
 {
 	struct sched_entity *se = &rq->curr->se;
 
-	for_each_sched_entity(se)
-		set_next_entity(cfs_rq_of(se), se);
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		set_next_entity(cfs_rq, se);
+		/* ensure bandwidth has been allocated on our new cfs_rq */
+		account_cfs_rq_runtime(cfs_rq, 0);
+	}
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
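The bandwidth machinery added above is essentially a token bucket: the group-wide pool is refilled to its quota once per period, each per-cpu cfs_rq pulls a slice from the pool as it runs, and the cfs_rq is throttled once both its local runtime and the pool are exhausted. The standalone C sketch below models only that bookkeeping; the struct layout, the 20 ms quota, the 5 ms slice and the 4 ms ticks are illustrative assumptions, not the kernel's types or defaults.

/*
 * Minimal userspace sketch of the quota/slice bookkeeping: a per-group
 * pool is refilled to "quota" every period and each runqueue pulls a
 * "slice" from it.  Illustration only; names and numbers are made up.
 */
#include <stdio.h>

#define NSEC_PER_MSEC	1000000LL

struct pool { long long quota, runtime; };	/* per task group */
struct rq   { long long runtime_remaining; int throttled; };

static void refill(struct pool *p)		/* once per period */
{
	p->runtime = p->quota;
}

static void account(struct pool *p, struct rq *rq, long long delta_exec,
		    long long slice)
{
	rq->runtime_remaining -= delta_exec;
	if (rq->runtime_remaining > 0)
		return;

	/* try to top up to a full slice from the global pool */
	if (p->runtime > 0) {
		long long amount = slice - rq->runtime_remaining;

		if (amount > p->runtime)
			amount = p->runtime;
		p->runtime -= amount;
		rq->runtime_remaining += amount;
	}
	/* nothing left locally or globally: throttle */
	if (rq->runtime_remaining <= 0)
		rq->throttled = 1;
}

int main(void)
{
	struct pool p = { .quota = 20 * NSEC_PER_MSEC };
	struct rq rq = { 0, 0 };
	int tick;

	refill(&p);
	for (tick = 0; tick < 8 && !rq.throttled; tick++)
		account(&p, &rq, 4 * NSEC_PER_MSEC, 5 * NSEC_PER_MSEC);

	printf("throttled after %d ticks, pool left %lld ms\n",
	       tick, p.runtime / NSEC_PER_MSEC);
	return 0;
}

With the numbers chosen here the runqueue drains the pool on its fifth tick and throttles, the point at which the real code would dequeue the hierarchy and place it on cfs_b->throttled_cfs_rq.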
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 2e74677..efa0a7b 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,11 +12,6 @@
 SCHED_FEAT(START_DEBIT, 1)
 
 /*
- * Should wakeups try to preempt running tasks.
- */
-SCHED_FEAT(WAKEUP_PREEMPT, 1)
-
-/*
  * Based on load and program behaviour, see if it makes sense to place
  * a newly woken task on the same cpu as the task that woke it --
  * improve cache locality. Typically used with SYNC wakeups as
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index af11778..056cbd2 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -124,21 +124,33 @@
 	update_rt_migration(rt_rq);
 }
 
+static inline int has_pushable_tasks(struct rq *rq)
+{
+	return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 {
 	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 	plist_node_init(&p->pushable_tasks, p->prio);
 	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+
+	/* Update the highest prio pushable task */
+	if (p->prio < rq->rt.highest_prio.next)
+		rq->rt.highest_prio.next = p->prio;
 }
 
 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 {
 	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
-}
 
-static inline int has_pushable_tasks(struct rq *rq)
-{
-	return !plist_head_empty(&rq->rt.pushable_tasks);
+	/* Update the new highest prio pushable task */
+	if (has_pushable_tasks(rq)) {
+		p = plist_first_entry(&rq->rt.pushable_tasks,
+				      struct task_struct, pushable_tasks);
+		rq->rt.highest_prio.next = p->prio;
+	} else
+		rq->rt.highest_prio.next = MAX_RT_PRIO;
 }
 
 #else
@@ -643,6 +655,7 @@
 
 	if (rt_rq->rt_time > runtime) {
 		rt_rq->rt_throttled = 1;
+		printk_once(KERN_WARNING "sched: RT throttling activated\n");
 		if (rt_rq_throttled(rt_rq)) {
 			sched_rt_rq_dequeue(rt_rq);
 			return 1;
@@ -698,47 +711,13 @@
 
 #if defined CONFIG_SMP
 
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
-
-static inline int next_prio(struct rq *rq)
-{
-	struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
-
-	if (next && rt_prio(next->prio))
-		return next->prio;
-	else
-		return MAX_RT_PRIO;
-}
-
 static void
 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
 	struct rq *rq = rq_of_rt_rq(rt_rq);
 
-	if (prio < prev_prio) {
-
-		/*
-		 * If the new task is higher in priority than anything on the
-		 * run-queue, we know that the previous high becomes our
-		 * next-highest.
-		 */
-		rt_rq->highest_prio.next = prev_prio;
-
-		if (rq->online)
-			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
-
-	} else if (prio == rt_rq->highest_prio.curr)
-		/*
-		 * If the next task is equal in priority to the highest on
-		 * the run-queue, then we implicitly know that the next highest
-		 * task cannot be any lower than current
-		 */
-		rt_rq->highest_prio.next = prio;
-	else if (prio < rt_rq->highest_prio.next)
-		/*
-		 * Otherwise, we need to recompute next-highest
-		 */
-		rt_rq->highest_prio.next = next_prio(rq);
+	if (rq->online && prio < prev_prio)
+		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 }
 
 static void
@@ -746,9 +725,6 @@
 {
 	struct rq *rq = rq_of_rt_rq(rt_rq);
 
-	if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
-		rt_rq->highest_prio.next = next_prio(rq);
-
 	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
 		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 }
@@ -961,6 +937,8 @@
 
 	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
 		enqueue_pushable_task(rq, p);
+
+	inc_nr_running(rq);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -971,6 +949,8 @@
 	dequeue_rt_entity(rt_se);
 
 	dequeue_pushable_task(rq, p);
+
+	dec_nr_running(rq);
 }
 
 /*
@@ -1017,10 +997,12 @@
 	struct rq *rq;
 	int cpu;
 
-	if (sd_flag != SD_BALANCE_WAKE)
-		return smp_processor_id();
-
 	cpu = task_cpu(p);
+
+	/* For anything but wake ups, just return the task_cpu */
+	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+		goto out;
+
 	rq = cpu_rq(cpu);
 
 	rcu_read_lock();
@@ -1059,6 +1041,7 @@
 	}
 	rcu_read_unlock();
 
+out:
 	return cpu;
 }
 
@@ -1178,7 +1161,6 @@
 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
 	update_curr_rt(rq);
-	p->se.exec_start = 0;
 
 	/*
 	 * The previous task needs to be made eligible for pushing
@@ -1198,7 +1180,7 @@
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
+	    (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
 	    (p->rt.nr_cpus_allowed > 1))
 		return 1;
 	return 0;
@@ -1343,7 +1325,7 @@
 			 */
 			if (unlikely(task_rq(task) != rq ||
 				     !cpumask_test_cpu(lowest_rq->cpu,
-						       &task->cpus_allowed) ||
+						       tsk_cpus_allowed(task)) ||
 				     task_running(rq, task) ||
 				     !task->on_rq)) {
 
@@ -1394,6 +1376,7 @@
 {
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
+	int ret = 0;
 
 	if (!rq->rt.overloaded)
 		return 0;
@@ -1426,7 +1409,7 @@
 	if (!lowest_rq) {
 		struct task_struct *task;
 		/*
-		 * find lock_lowest_rq releases rq->lock
+		 * find_lock_lowest_rq releases rq->lock
 		 * so it is possible that next_task has migrated.
 		 *
 		 * We need to make sure that the task is still on the same
@@ -1436,12 +1419,11 @@
 		task = pick_next_pushable_task(rq);
 		if (task_cpu(next_task) == rq->cpu && task == next_task) {
 			/*
-			 * If we get here, the task hasn't moved at all, but
-			 * it has failed to push.  We will not try again,
-			 * since the other cpus will pull from us when they
-			 * are ready.
+			 * The task hasn't migrated, and is still the next
+			 * eligible task, but we failed to find a run-queue
+			 * to push it to.  Do not retry in this case, since
+			 * other cpus will pull from us when ready.
 			 */
-			dequeue_pushable_task(rq, next_task);
 			goto out;
 		}
 
@@ -1460,6 +1442,7 @@
 	deactivate_task(rq, next_task, 0);
 	set_task_cpu(next_task, lowest_rq->cpu);
 	activate_task(lowest_rq, next_task, 0);
+	ret = 1;
 
 	resched_task(lowest_rq->curr);
 
@@ -1468,7 +1451,7 @@
 out:
 	put_task_struct(next_task);
 
-	return 1;
+	return ret;
 }
 
 static void push_rt_tasks(struct rq *rq)
@@ -1626,9 +1609,6 @@
 
 		update_rt_migration(&rq->rt);
 	}
-
-	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->rt.nr_cpus_allowed = weight;
 }
 
 /* Assumes rq->lock is held */
@@ -1863,4 +1843,3 @@
 	rcu_read_unlock();
 }
 #endif /* CONFIG_SCHED_DEBUG */
-
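The sched_rt.c hunks above stop recomputing rt.highest_prio.next from scratch and instead maintain it from the priority-sorted pushable list: enqueue compares against the new task's priority, dequeue re-reads the list head, falling back to MAX_RT_PRIO when the list empties. A minimal userspace model of that idea follows; the array-backed list and the helper names are assumptions standing in for the kernel's plist.

/*
 * Keep "next highest pushable priority" by peeking at the head of a
 * sorted list instead of recomputing it.  A plain array stands in for
 * the kernel's plist; illustrative only.
 */
#include <stdio.h>

#define MAX_RT_PRIO	100

static int prios[16], nr;

static void enqueue(int prio, int *next)
{
	int i = nr++;

	/* insertion sort keeps the lowest numeric (highest RT) prio first */
	while (i > 0 && prios[i - 1] > prio) {
		prios[i] = prios[i - 1];
		i--;
	}
	prios[i] = prio;
	if (prio < *next)
		*next = prio;
}

static void dequeue_first(int *next)
{
	for (int i = 1; i < nr; i++)
		prios[i - 1] = prios[i];
	nr--;
	*next = nr ? prios[0] : MAX_RT_PRIO;
}

int main(void)
{
	int next = MAX_RT_PRIO;

	enqueue(40, &next);
	enqueue(10, &next);
	printf("next=%d\n", next);	/* 10 */
	dequeue_first(&next);
	printf("next=%d\n", next);	/* 40 */
	return 0;
}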
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 6f43763..8b44e7f 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -34,11 +34,13 @@
 static void
 enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
+	inc_nr_running(rq);
 }
 
 static void
 dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
+	dec_nr_running(rq);
 }
 
 static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b5..2d2ecdc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -379,6 +379,16 @@
 		.extra2		= &one,
 	},
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.procname	= "sched_cfs_bandwidth_slice_us",
+		.data		= &sysctl_sched_cfs_bandwidth_slice,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
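The sysctl added above exposes the bandwidth slice size to userspace. Assuming the entry lands in the usual kernel table, it appears as /proc/sys/kernel/sched_cfs_bandwidth_slice_us; the short program below merely reads it the way any proc file is read, and writing it requires root on a CONFIG_CFS_BANDWIDTH kernel.

/*
 * Illustration only: the path below assumes the new entry sits in the
 * kernel sysctl table.
 */
#include <stdio.h>

int main(void)
{
	unsigned int slice_us;
	FILE *f = fopen("/proc/sys/kernel/sched_cfs_bandwidth_slice_us", "r");

	if (!f) {
		perror("open");
		return 1;
	}
	if (fscanf(f, "%u", &slice_us) == 1)
		printf("bandwidth slice: %u us\n", slice_us);
	fclose(f);
	return 0;
}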
diff --git a/kernel/time.c b/kernel/time.c
index 8e8dc6d..d776062 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -575,7 +575,7 @@
 /*
  * Convert jiffies/jiffies_64 to clock_t and back.
  */
-clock_t jiffies_to_clock_t(long x)
+clock_t jiffies_to_clock_t(unsigned long x)
 {
 #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
 # if HZ < USER_HZ
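The prototype change above makes the conversion well defined for large jiffies counts that would appear negative in a signed long. The sketch below reproduces only the evenly-dividing branch of the conversion, for an assumed HZ=1000, USER_HZ=100 configuration.

/*
 * Demo only: models the branch where TICK_NSEC divides evenly and
 * HZ >= USER_HZ, with assumed HZ/USER_HZ values.
 */
#include <stdio.h>

#define HZ	1000UL
#define USER_HZ	100UL

static unsigned long jiffies_to_clock_t_demo(unsigned long x)
{
	return x / (HZ / USER_HZ);
}

int main(void)
{
	unsigned long x = 3000000000UL;	/* ~34.7 days of jiffies at HZ=1000 */

	printf("%lu jiffies -> %lu clock ticks\n",
	       x, jiffies_to_clock_t_demo(x));
	return 0;
}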
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f06a8a3..b26c222 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -27,3 +27,5 @@
 	default y
 	depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
 
+config GENERIC_CLOCKEVENTS_MIN_ADJUST
+	bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ea5e1a9..c436e79 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -53,27 +53,6 @@
 static DEFINE_SPINLOCK(rtcdev_lock);
 
 /**
- * has_wakealarm - check rtc device has wakealarm ability
- * @dev: current device
- * @name_ptr: name to be returned
- *
- * This helper function checks to see if the rtc device can wake
- * from suspend.
- */
-static int has_wakealarm(struct device *dev, void *name_ptr)
-{
-	struct rtc_device *candidate = to_rtc_device(dev);
-
-	if (!candidate->ops->set_alarm)
-		return 0;
-	if (!device_may_wakeup(candidate->dev.parent))
-		return 0;
-
-	*(const char **)name_ptr = dev_name(dev);
-	return 1;
-}
-
-/**
  * alarmtimer_get_rtcdev - Return selected rtcdevice
  *
  * This function returns the rtc device to use for wakealarms.
@@ -82,37 +61,64 @@
  */
 static struct rtc_device *alarmtimer_get_rtcdev(void)
 {
-	struct device *dev;
-	char *str;
 	unsigned long flags;
 	struct rtc_device *ret;
 
 	spin_lock_irqsave(&rtcdev_lock, flags);
-	if (!rtcdev) {
-		/* Find an rtc device and init the rtc_timer */
-		dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
-		/* If we have a device then str is valid. See has_wakealarm() */
-		if (dev) {
-			rtcdev = rtc_class_open(str);
-			/*
-			 * Drop the reference we got in class_find_device,
-			 * rtc_open takes its own.
-			 */
-			put_device(dev);
-			rtc_timer_init(&rtctimer, NULL, NULL);
-		}
-	}
 	ret = rtcdev;
 	spin_unlock_irqrestore(&rtcdev_lock, flags);
 
 	return ret;
 }
-#else
-#define alarmtimer_get_rtcdev() (0)
-#define rtcdev (0)
-#endif
 
 
+static int alarmtimer_rtc_add_device(struct device *dev,
+				struct class_interface *class_intf)
+{
+	unsigned long flags;
+	struct rtc_device *rtc = to_rtc_device(dev);
+
+	if (rtcdev)
+		return -EBUSY;
+
+	if (!rtc->ops->set_alarm)
+		return -1;
+	if (!device_may_wakeup(rtc->dev.parent))
+		return -1;
+
+	spin_lock_irqsave(&rtcdev_lock, flags);
+	if (!rtcdev) {
+		rtcdev = rtc;
+		/* hold a reference so it doesn't go away */
+		get_device(dev);
+	}
+	spin_unlock_irqrestore(&rtcdev_lock, flags);
+	return 0;
+}
+
+static struct class_interface alarmtimer_rtc_interface = {
+	.add_dev = &alarmtimer_rtc_add_device,
+};
+
+static int alarmtimer_rtc_interface_setup(void)
+{
+	alarmtimer_rtc_interface.class = rtc_class;
+	return class_interface_register(&alarmtimer_rtc_interface);
+}
+static void alarmtimer_rtc_interface_remove(void)
+{
+	class_interface_unregister(&alarmtimer_rtc_interface);
+}
+#else
+static inline struct rtc_device *alarmtimer_get_rtcdev(void)
+{
+	return NULL;
+}
+#define rtcdev (NULL)
+static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
+static inline void alarmtimer_rtc_interface_remove(void) { }
+#endif
+
 /**
  * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
  * @base: pointer to the base where the timer is being run
@@ -126,6 +132,8 @@
 static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
 {
 	timerqueue_add(&base->timerqueue, &alarm->node);
+	alarm->state |= ALARMTIMER_STATE_ENQUEUED;
+
 	if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
 		hrtimer_try_to_cancel(&base->timer);
 		hrtimer_start(&base->timer, alarm->node.expires,
@@ -147,7 +155,12 @@
 {
 	struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
 
+	if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
+		return;
+
 	timerqueue_del(&base->timerqueue, &alarm->node);
+	alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
+
 	if (next == &alarm->node) {
 		hrtimer_try_to_cancel(&base->timer);
 		next = timerqueue_getnext(&base->timerqueue);
@@ -174,6 +187,7 @@
 	unsigned long flags;
 	ktime_t now;
 	int ret = HRTIMER_NORESTART;
+	int restart = ALARMTIMER_NORESTART;
 
 	spin_lock_irqsave(&base->lock, flags);
 	now = base->gettime();
@@ -187,17 +201,19 @@
 		alarm = container_of(next, struct alarm, node);
 
 		timerqueue_del(&base->timerqueue, &alarm->node);
-		alarm->enabled = 0;
-		/* Re-add periodic timers */
-		if (alarm->period.tv64) {
-			alarm->node.expires = ktime_add(expired, alarm->period);
-			timerqueue_add(&base->timerqueue, &alarm->node);
-			alarm->enabled = 1;
-		}
+		alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
+
+		alarm->state |= ALARMTIMER_STATE_CALLBACK;
 		spin_unlock_irqrestore(&base->lock, flags);
 		if (alarm->function)
-			alarm->function(alarm);
+			restart = alarm->function(alarm, now);
 		spin_lock_irqsave(&base->lock, flags);
+		alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
+
+		if (restart != ALARMTIMER_NORESTART) {
+			timerqueue_add(&base->timerqueue, &alarm->node);
+			alarm->state |= ALARMTIMER_STATE_ENQUEUED;
+		}
 	}
 
 	if (next) {
@@ -234,7 +250,7 @@
 	freezer_delta = ktime_set(0, 0);
 	spin_unlock_irqrestore(&freezer_delta_lock, flags);
 
-	rtc = rtcdev;
+	rtc = alarmtimer_get_rtcdev();
 	/* If we have no rtcdev, just return */
 	if (!rtc)
 		return 0;
@@ -299,54 +315,112 @@
  * @function: callback that is run when the alarm fires
  */
 void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
-		void (*function)(struct alarm *))
+		enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
 {
 	timerqueue_init(&alarm->node);
-	alarm->period = ktime_set(0, 0);
 	alarm->function = function;
 	alarm->type = type;
-	alarm->enabled = 0;
+	alarm->state = ALARMTIMER_STATE_INACTIVE;
 }
 
 /**
  * alarm_start - Sets an alarm to fire
  * @alarm: ptr to alarm to set
  * @start: time to run the alarm
- * @period: period at which the alarm will recur
  */
-void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
+void alarm_start(struct alarm *alarm, ktime_t start)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 	unsigned long flags;
 
 	spin_lock_irqsave(&base->lock, flags);
-	if (alarm->enabled)
+	if (alarmtimer_active(alarm))
 		alarmtimer_remove(base, alarm);
 	alarm->node.expires = start;
-	alarm->period = period;
 	alarmtimer_enqueue(base, alarm);
-	alarm->enabled = 1;
 	spin_unlock_irqrestore(&base->lock, flags);
 }
 
 /**
- * alarm_cancel - Tries to cancel an alarm timer
+ * alarm_try_to_cancel - Tries to cancel an alarm timer
  * @alarm: ptr to alarm to be canceled
+ *
+ * Returns 1 if the timer was canceled, 0 if it was not running,
+ * and -1 if the callback was running
  */
-void alarm_cancel(struct alarm *alarm)
+int alarm_try_to_cancel(struct alarm *alarm)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 	unsigned long flags;
-
+	int ret = -1;
 	spin_lock_irqsave(&base->lock, flags);
-	if (alarm->enabled)
+
+	if (alarmtimer_callback_running(alarm))
+		goto out;
+
+	if (alarmtimer_is_queued(alarm)) {
 		alarmtimer_remove(base, alarm);
-	alarm->enabled = 0;
+		ret = 1;
+	} else
+		ret = 0;
+out:
 	spin_unlock_irqrestore(&base->lock, flags);
+	return ret;
 }
 
 
 /**
+ * alarm_cancel - Spins trying to cancel an alarm timer until it is done
+ * @alarm: ptr to alarm to be canceled
+ *
+ * Returns 1 if the timer was canceled, 0 if it was not active.
+ */
+int alarm_cancel(struct alarm *alarm)
+{
+	for (;;) {
+		int ret = alarm_try_to_cancel(alarm);
+		if (ret >= 0)
+			return ret;
+		cpu_relax();
+	}
+}
+
+u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
+{
+	u64 overrun = 1;
+	ktime_t delta;
+
+	delta = ktime_sub(now, alarm->node.expires);
+
+	if (delta.tv64 < 0)
+		return 0;
+
+	if (unlikely(delta.tv64 >= interval.tv64)) {
+		s64 incr = ktime_to_ns(interval);
+
+		overrun = ktime_divns(delta, incr);
+
+		alarm->node.expires = ktime_add_ns(alarm->node.expires,
+							incr*overrun);
+
+		if (alarm->node.expires.tv64 > now.tv64)
+			return overrun;
+		/*
+		 * This (and the ktime_add() below) is the
+		 * correction for exact:
+		 */
+		overrun++;
+	}
+
+	alarm->node.expires = ktime_add(alarm->node.expires, interval);
+	return overrun;
+}
+
+
+/**
  * clock2alarm - helper that converts from clockid to alarmtypes
  * @clockid: clockid.
  */
@@ -365,12 +439,21 @@
  *
  * Posix timer callback for expired alarm timers.
  */
-static void alarm_handle_timer(struct alarm *alarm)
+static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
+							ktime_t now)
 {
 	struct k_itimer *ptr = container_of(alarm, struct k_itimer,
-						it.alarmtimer);
+						it.alarm.alarmtimer);
 	if (posix_timer_event(ptr, 0) != 0)
 		ptr->it_overrun++;
+
+	/* Re-add periodic timers */
+	if (ptr->it.alarm.interval.tv64) {
+		ptr->it_overrun += alarm_forward(alarm, now,
+						ptr->it.alarm.interval);
+		return ALARMTIMER_RESTART;
+	}
+	return ALARMTIMER_NORESTART;
 }
 
 /**
@@ -427,7 +510,7 @@
 
 	type = clock2alarm(new_timer->it_clock);
 	base = &alarm_bases[type];
-	alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer);
+	alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
 	return 0;
 }
 
@@ -444,9 +527,9 @@
 	memset(cur_setting, 0, sizeof(struct itimerspec));
 
 	cur_setting->it_interval =
-			ktime_to_timespec(timr->it.alarmtimer.period);
+			ktime_to_timespec(timr->it.alarm.interval);
 	cur_setting->it_value =
-			ktime_to_timespec(timr->it.alarmtimer.node.expires);
+		ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires);
 	return;
 }
 
@@ -461,7 +544,9 @@
 	if (!rtcdev)
 		return -ENOTSUPP;
 
-	alarm_cancel(&timr->it.alarmtimer);
+	if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
+		return TIMER_RETRY;
+
 	return 0;
 }
 
@@ -481,25 +566,17 @@
 	if (!rtcdev)
 		return -ENOTSUPP;
 
-	/*
-	 * XXX HACK! Currently we can DOS a system if the interval
-	 * period on alarmtimers is too small. Cap the interval here
-	 * to 100us and solve this properly in a future patch! -jstultz
-	 */
-	if ((new_setting->it_interval.tv_sec == 0) &&
-			(new_setting->it_interval.tv_nsec < 100000))
-		new_setting->it_interval.tv_nsec = 100000;
-
 	if (old_setting)
 		alarm_timer_get(timr, old_setting);
 
 	/* If the timer was already set, cancel it */
-	alarm_cancel(&timr->it.alarmtimer);
+	if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
+		return TIMER_RETRY;
 
 	/* start the timer */
-	alarm_start(&timr->it.alarmtimer,
-			timespec_to_ktime(new_setting->it_value),
-			timespec_to_ktime(new_setting->it_interval));
+	timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
+	alarm_start(&timr->it.alarm.alarmtimer,
+			timespec_to_ktime(new_setting->it_value));
 	return 0;
 }
 
@@ -509,13 +586,15 @@
  *
  * Wakes up the task that set the alarmtimer
  */
-static void alarmtimer_nsleep_wakeup(struct alarm *alarm)
+static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
+								ktime_t now)
 {
 	struct task_struct *task = (struct task_struct *)alarm->data;
 
 	alarm->data = NULL;
 	if (task)
 		wake_up_process(task);
+	return ALARMTIMER_NORESTART;
 }
 
 /**
@@ -530,7 +609,7 @@
 	alarm->data = (void *)current;
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
-		alarm_start(alarm, absexp, ktime_set(0, 0));
+		alarm_start(alarm, absexp);
 		if (likely(alarm->data))
 			schedule();
 
@@ -691,6 +770,7 @@
  */
 static int __init alarmtimer_init(void)
 {
+	struct platform_device *pdev;
 	int error = 0;
 	int i;
 	struct k_clock alarm_clock = {
@@ -719,10 +799,26 @@
 				HRTIMER_MODE_ABS);
 		alarm_bases[i].timer.function = alarmtimer_fired;
 	}
-	error = platform_driver_register(&alarmtimer_driver);
-	platform_device_register_simple("alarmtimer", -1, NULL, 0);
 
+	error = alarmtimer_rtc_interface_setup();
+	if (error)
+		return error;
+
+	error = platform_driver_register(&alarmtimer_driver);
+	if (error)
+		goto out_if;
+
+	pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
+	if (IS_ERR(pdev)) {
+		error = PTR_ERR(pdev);
+		goto out_drv;
+	}
+	return 0;
+
+out_drv:
+	platform_driver_unregister(&alarmtimer_driver);
+out_if:
+	alarmtimer_rtc_interface_remove();
 	return error;
 }
 device_initcall(alarmtimer_init);
-
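alarm_forward(), added above, advances an expired alarm by whole intervals and returns how many periods were missed, mirroring hrtimer_forward(). The userspace sketch below reproduces that arithmetic with plain signed 64-bit nanoseconds standing in for ktime_t; the function and variable names are assumptions.

/*
 * Userspace model of the overrun arithmetic: push an expired timer
 * forward by whole intervals and report the missed periods.  ktime_t
 * is modelled as plain int64_t nanoseconds; illustrative only.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t forward(int64_t *expires, int64_t now, int64_t interval)
{
	uint64_t overrun = 1;
	int64_t delta = now - *expires;

	if (delta < 0)
		return 0;

	if (delta >= interval) {
		overrun = delta / interval;
		*expires += overrun * interval;
		if (*expires > now)
			return overrun;
		overrun++;		/* correction for exact multiples */
	}
	*expires += interval;
	return overrun;
}

int main(void)
{
	int64_t expires = 1000, now = 4500, interval = 1000;
	uint64_t n = forward(&expires, now, interval);

	printf("overrun=%llu new expiry=%lld\n",
	       (unsigned long long)n, (long long)expires);
	return 0;
}

For an alarm that expired at t=1000 with a 1000 ns interval, observed at t=4500, four firings were missed and the next expiry lands at 5000.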
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index e4c699d..1ecd6ba 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,42 +94,143 @@
 	dev->next_event.tv64 = KTIME_MAX;
 }
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
+
+/* Limit min_delta to a jiffie */
+#define MIN_DELTA_LIMIT		(NSEC_PER_SEC / HZ)
+
+/**
+ * clockevents_increase_min_delta - raise minimum delta of a clock event device
+ * @dev:       device to increase the minimum delta
+ *
+ * Returns 0 on success, -ETIME when the minimum delta reached the limit.
+ */
+static int clockevents_increase_min_delta(struct clock_event_device *dev)
+{
+	/* Nothing to do if we already reached the limit */
+	if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
+		printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
+		dev->next_event.tv64 = KTIME_MAX;
+		return -ETIME;
+	}
+
+	if (dev->min_delta_ns < 5000)
+		dev->min_delta_ns = 5000;
+	else
+		dev->min_delta_ns += dev->min_delta_ns >> 1;
+
+	if (dev->min_delta_ns > MIN_DELTA_LIMIT)
+		dev->min_delta_ns = MIN_DELTA_LIMIT;
+
+	printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
+	       dev->name ? dev->name : "?",
+	       (unsigned long long) dev->min_delta_ns);
+	return 0;
+}
+
+/**
+ * clockevents_program_min_delta - Set clock event device to the minimum delay.
+ * @dev:	device to program
+ *
+ * Returns 0 on success, -ETIME when the retry loop failed.
+ */
+static int clockevents_program_min_delta(struct clock_event_device *dev)
+{
+	unsigned long long clc;
+	int64_t delta;
+	int i;
+
+	for (i = 0;;) {
+		delta = dev->min_delta_ns;
+		dev->next_event = ktime_add_ns(ktime_get(), delta);
+
+		if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+			return 0;
+
+		dev->retries++;
+		clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+		if (dev->set_next_event((unsigned long) clc, dev) == 0)
+			return 0;
+
+		if (++i > 2) {
+			/*
+			 * We tried 3 times to program the device with the
+			 * given min_delta_ns. Try to increase the minimum
+			 * delta, if that fails as well get out of here.
+			 */
+			if (clockevents_increase_min_delta(dev))
+				return -ETIME;
+			i = 0;
+		}
+	}
+}
+
+#else  /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
+
+/**
+ * clockevents_program_min_delta - Set clock event device to the minimum delay.
+ * @dev:	device to program
+ *
+ * Returns 0 on success, -ETIME when the retry loop failed.
+ */
+static int clockevents_program_min_delta(struct clock_event_device *dev)
+{
+	unsigned long long clc;
+	int64_t delta;
+
+	delta = dev->min_delta_ns;
+	dev->next_event = ktime_add_ns(ktime_get(), delta);
+
+	if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+		return 0;
+
+	dev->retries++;
+	clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+	return dev->set_next_event((unsigned long) clc, dev);
+}
+
+#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
+
 /**
  * clockevents_program_event - Reprogram the clock event device.
+ * @dev:	device to program
  * @expires:	absolute expiry time (monotonic clock)
+ * @force:	program minimum delay if expires can not be set
  *
  * Returns 0 on success, -ETIME when the event is in the past.
  */
 int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
-			      ktime_t now)
+			      bool force)
 {
 	unsigned long long clc;
 	int64_t delta;
+	int rc;
 
 	if (unlikely(expires.tv64 < 0)) {
 		WARN_ON_ONCE(1);
 		return -ETIME;
 	}
 
-	delta = ktime_to_ns(ktime_sub(expires, now));
-
-	if (delta <= 0)
-		return -ETIME;
-
 	dev->next_event = expires;
 
 	if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
 		return 0;
 
-	if (delta > dev->max_delta_ns)
-		delta = dev->max_delta_ns;
-	if (delta < dev->min_delta_ns)
-		delta = dev->min_delta_ns;
+	/* Shortcut for clockevent devices that can deal with ktime. */
+	if (dev->features & CLOCK_EVT_FEAT_KTIME)
+		return dev->set_next_ktime(expires, dev);
 
-	clc = delta * dev->mult;
-	clc >>= dev->shift;
+	delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
+	if (delta <= 0)
+		return force ? clockevents_program_min_delta(dev) : -ETIME;
 
-	return dev->set_next_event((unsigned long) clc, dev);
+	delta = min(delta, (int64_t) dev->max_delta_ns);
+	delta = max(delta, (int64_t) dev->min_delta_ns);
+
+	clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+	rc = dev->set_next_event((unsigned long) clc, dev);
+
+	return (rc && force) ? clockevents_program_min_delta(dev) : rc;
 }
 
 /**
@@ -258,7 +359,7 @@
 	if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
 		return 0;
 
-	return clockevents_program_event(dev, dev->next_event, ktime_get());
+	return clockevents_program_event(dev, dev->next_event, false);
 }
 
 /*
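The reworked clockevents_program_event() converts the nanosecond delta into device cycles with the usual mult/shift trick, clc = (delta * mult) >> shift, avoiding a division on the program path. A toy calculation for an assumed 1 MHz event device with shift = 32 follows; note that truncation can shave a cycle off the result.

/*
 * Toy ns-to-cycles conversion, assuming a 1 MHz event device and
 * shift = 32: mult = (freq << shift) / NSEC_PER_SEC.  Illustration only.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t shift = 32;
	uint64_t freq = 1000000;				/* 1 MHz */
	uint64_t mult = (freq << shift) / 1000000000ULL;	/* ns -> cycles */
	int64_t delta = 1500000;				/* 1.5 ms in ns */
	uint64_t clc = ((uint64_t)delta * mult) >> shift;

	printf("mult=%llu, %lld ns -> %llu cycles\n",
	       (unsigned long long)mult,
	       (long long)delta, (unsigned long long)clc);
	return 0;
}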
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e0980f0..cf52fda 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -186,6 +186,7 @@
 static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
 static DEFINE_SPINLOCK(watchdog_lock);
 static int watchdog_running;
+static atomic_t watchdog_reset_pending;
 
 static int clocksource_watchdog_kthread(void *data);
 static void __clocksource_change_rating(struct clocksource *cs, int rating);
@@ -247,12 +248,14 @@
 	struct clocksource *cs;
 	cycle_t csnow, wdnow;
 	int64_t wd_nsec, cs_nsec;
-	int next_cpu;
+	int next_cpu, reset_pending;
 
 	spin_lock(&watchdog_lock);
 	if (!watchdog_running)
 		goto out;
 
+	reset_pending = atomic_read(&watchdog_reset_pending);
+
 	list_for_each_entry(cs, &watchdog_list, wd_list) {
 
 		/* Clocksource already marked unstable? */
@@ -268,7 +271,8 @@
 		local_irq_enable();
 
 		/* Clocksource initialized ? */
-		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
+		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
+		    atomic_read(&watchdog_reset_pending)) {
 			cs->flags |= CLOCK_SOURCE_WATCHDOG;
 			cs->wd_last = wdnow;
 			cs->cs_last = csnow;
@@ -283,8 +287,11 @@
 		cs->cs_last = csnow;
 		cs->wd_last = wdnow;
 
+		if (atomic_read(&watchdog_reset_pending))
+			continue;
+
 		/* Check the deviation from the watchdog clocksource. */
-		if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
+		if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
 			clocksource_unstable(cs, cs_nsec - wd_nsec);
 			continue;
 		}
@@ -303,6 +310,13 @@
 	}
 
 	/*
+	 * We only clear watchdog_reset_pending when we have done a
+	 * full cycle through all clocksources.
+	 */
+	if (reset_pending)
+		atomic_dec(&watchdog_reset_pending);
+
+	/*
 	 * Cycle through CPUs to check if the CPUs stay synchronized
 	 * to each other.
 	 */
@@ -344,23 +358,7 @@
 
 static void clocksource_resume_watchdog(void)
 {
-	unsigned long flags;
-
-	/*
-	 * We use trylock here to avoid a potential dead lock when
-	 * kgdb calls this code after the kernel has been stopped with
-	 * watchdog_lock held. When watchdog_lock is held we just
-	 * return and accept, that the watchdog might trigger and mark
-	 * the monitored clock source (usually TSC) unstable.
-	 *
-	 * This does not affect the other caller clocksource_resume()
-	 * because at this point the kernel is UP, interrupts are
-	 * disabled and nothing can hold watchdog_lock.
-	 */
-	if (!spin_trylock_irqsave(&watchdog_lock, flags))
-		return;
-	clocksource_reset_watchdog();
-	spin_unlock_irqrestore(&watchdog_lock, flags);
+	atomic_inc(&watchdog_reset_pending);
 }
 
 static void clocksource_enqueue_watchdog(struct clocksource *cs)
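clocksource_resume_watchdog() now just bumps an atomic counter instead of trylocking watchdog_lock, and the watchdog pass skips the deviation check, re-captures its reference cycles and drops the counter only after a full cycle over all clocksources. The C11 sketch below models that handshake; it illustrates the pattern only, it is not the kernel's code.

/*
 * Lockless "reset pending" handshake: the resume hook may run with
 * other locks held, so it only increments a counter; the periodic pass
 * notices it, re-syncs, and decrements after a full cycle.  C11 atomics
 * stand in for the kernel's atomic_t; illustrative only.
 */
#include <stdio.h>
#include <stdatomic.h>

static atomic_int reset_pending;

static void resume_watchdog(void)
{
	atomic_fetch_add(&reset_pending, 1);
}

static void watchdog_pass(void)
{
	int pending = atomic_load(&reset_pending);

	if (pending)
		printf("skip deviation check, re-sync reference values\n");
	else
		printf("normal deviation check\n");

	/* only drop the flag after a full pass over all clocksources */
	if (pending)
		atomic_fetch_sub(&reset_pending, 1);
}

int main(void)
{
	resume_watchdog();
	watchdog_pass();	/* skips the check once */
	watchdog_pass();	/* back to normal */
	return 0;
}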
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c7218d1..f954282 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -194,7 +194,7 @@
 	for (next = dev->next_event; ;) {
 		next = ktime_add(next, tick_period);
 
-		if (!clockevents_program_event(dev, next, ktime_get()))
+		if (!clockevents_program_event(dev, next, false))
 			return;
 		tick_do_periodic_broadcast();
 	}
@@ -373,7 +373,7 @@
 {
 	struct clock_event_device *bc = tick_broadcast_device.evtdev;
 
-	return tick_dev_program_event(bc, expires, force);
+	return clockevents_program_event(bc, expires, force);
 }
 
 int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 119528d..da6c9ec 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -94,7 +94,7 @@
 	 */
 	next = ktime_add(dev->next_event, tick_period);
 	for (;;) {
-		if (!clockevents_program_event(dev, next, ktime_get()))
+		if (!clockevents_program_event(dev, next, false))
 			return;
 		/*
 		 * Have to be careful here. If we're in oneshot mode,
@@ -137,7 +137,7 @@
 		clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
 
 		for (;;) {
-			if (!clockevents_program_event(dev, next, ktime_get()))
+			if (!clockevents_program_event(dev, next, false))
 				return;
 			next = ktime_add(next, tick_period);
 		}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 1009b06..4e265b9 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -26,8 +26,6 @@
 extern void tick_setup_oneshot(struct clock_event_device *newdev,
 			       void (*handler)(struct clock_event_device *),
 			       ktime_t nextevt);
-extern int tick_dev_program_event(struct clock_event_device *dev,
-				  ktime_t expires, int force);
 extern int tick_program_event(ktime_t expires, int force);
 extern void tick_oneshot_notify(void);
 extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2d04411..8241090 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -21,74 +21,6 @@
 
 #include "tick-internal.h"
 
-/* Limit min_delta to a jiffie */
-#define MIN_DELTA_LIMIT		(NSEC_PER_SEC / HZ)
-
-static int tick_increase_min_delta(struct clock_event_device *dev)
-{
-	/* Nothing to do if we already reached the limit */
-	if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
-		return -ETIME;
-
-	if (dev->min_delta_ns < 5000)
-		dev->min_delta_ns = 5000;
-	else
-		dev->min_delta_ns += dev->min_delta_ns >> 1;
-
-	if (dev->min_delta_ns > MIN_DELTA_LIMIT)
-		dev->min_delta_ns = MIN_DELTA_LIMIT;
-
-	printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
-	       dev->name ? dev->name : "?",
-	       (unsigned long long) dev->min_delta_ns);
-	return 0;
-}
-
-/**
- * tick_program_event internal worker function
- */
-int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
-			   int force)
-{
-	ktime_t now = ktime_get();
-	int i;
-
-	for (i = 0;;) {
-		int ret = clockevents_program_event(dev, expires, now);
-
-		if (!ret || !force)
-			return ret;
-
-		dev->retries++;
-		/*
-		 * We tried 3 times to program the device with the given
-		 * min_delta_ns. If that's not working then we increase it
-		 * and emit a warning.
-		 */
-		if (++i > 2) {
-			/* Increase the min. delta and try again */
-			if (tick_increase_min_delta(dev)) {
-				/*
-				 * Get out of the loop if min_delta_ns
-				 * hit the limit already. That's
-				 * better than staying here forever.
-				 *
-				 * We clear next_event so we have a
-				 * chance that the box survives.
-				 */
-				printk(KERN_WARNING
-				       "CE: Reprogramming failure. Giving up\n");
-				dev->next_event.tv64 = KTIME_MAX;
-				return -ETIME;
-			}
-			i = 0;
-		}
-
-		now = ktime_get();
-		expires = ktime_add_ns(now, dev->min_delta_ns);
-	}
-}
-
 /**
  * tick_program_event
  */
@@ -96,7 +28,7 @@
 {
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 
-	return tick_dev_program_event(dev, expires, force);
+	return clockevents_program_event(dev, expires, force);
 }
 
 /**
@@ -104,11 +36,10 @@
  */
 void tick_resume_oneshot(void)
 {
-	struct tick_device *td = &__get_cpu_var(tick_cpu_device);
-	struct clock_event_device *dev = td->evtdev;
+	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 
 	clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
-	tick_program_event(ktime_get(), 1);
+	clockevents_program_event(dev, ktime_get(), true);
 }
 
 /**
@@ -120,7 +51,7 @@
 {
 	newdev->event_handler = handler;
 	clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
-	tick_dev_program_event(newdev, next_event, 1);
+	clockevents_program_event(newdev, next_event, true);
 }
 
 /**
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d5097c4..4042064 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -139,7 +139,6 @@
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	unsigned long flags;
 
-	cpumask_clear_cpu(cpu, nohz_cpu_mask);
 	ts->idle_waketime = now;
 
 	local_irq_save(flags);
@@ -159,9 +158,10 @@
 
 	if (ts->idle_active) {
 		delta = ktime_sub(now, ts->idle_entrytime);
-		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 		if (nr_iowait_cpu(cpu) > 0)
 			ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+		else
+			ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 		ts->idle_entrytime = now;
 	}
 
@@ -197,11 +197,11 @@
 /**
  * get_cpu_idle_time_us - get the total idle time of a cpu
  * @cpu: CPU number to query
- * @last_update_time: variable to store update time in
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
  *
  * Return the cummulative idle time (since boot) for a given
- * CPU, in microseconds. The idle time returned includes
- * the iowait time (unlike what "top" and co report).
+ * CPU, in microseconds.
  *
  * This time is measured via accounting rather than sampling,
  * and is as accurate as ktime_get() is.
@@ -211,20 +211,35 @@
 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t now, idle;
 
 	if (!tick_nohz_enabled)
 		return -1;
 
-	update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
+	now = ktime_get();
+	if (last_update_time) {
+		update_ts_time_stats(cpu, ts, now, last_update_time);
+		idle = ts->idle_sleeptime;
+	} else {
+		if (ts->idle_active && !nr_iowait_cpu(cpu)) {
+			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
 
-	return ktime_to_us(ts->idle_sleeptime);
+			idle = ktime_add(ts->idle_sleeptime, delta);
+		} else {
+			idle = ts->idle_sleeptime;
+		}
+	}
+
+	return ktime_to_us(idle);
+
 }
 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 
-/*
+/**
  * get_cpu_iowait_time_us - get the total iowait time of a cpu
  * @cpu: CPU number to query
- * @last_update_time: variable to store update time in
+ * @last_update_time: variable to store update time in. Do not update
+ * counters if NULL.
  *
 * Return the cumulative iowait time (since boot) for a given
  * CPU, in microseconds.
@@ -237,13 +252,26 @@
 u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t now, iowait;
 
 	if (!tick_nohz_enabled)
 		return -1;
 
-	update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
+	now = ktime_get();
+	if (last_update_time) {
+		update_ts_time_stats(cpu, ts, now, last_update_time);
+		iowait = ts->iowait_sleeptime;
+	} else {
+		if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
+			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
 
-	return ktime_to_us(ts->iowait_sleeptime);
+			iowait = ktime_add(ts->iowait_sleeptime, delta);
+		} else {
+			iowait = ts->iowait_sleeptime;
+		}
+	}
+
+	return ktime_to_us(iowait);
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 
@@ -389,9 +417,6 @@
 		else
 			expires.tv64 = KTIME_MAX;
 
-		if (delta_jiffies > 1)
-			cpumask_set_cpu(cpu, nohz_cpu_mask);
-
 		/* Skip reprogram of event if its not changed */
 		if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
 			goto out;
@@ -441,7 +466,6 @@
 		 * softirq.
 		 */
 		tick_do_update_jiffies64(ktime_get());
-		cpumask_clear_cpu(cpu, nohz_cpu_mask);
 	}
 	raise_softirq_irqoff(TIMER_SOFTIRQ);
 out:
@@ -524,7 +548,6 @@
 	/* Update jiffies first */
 	select_nohz_load_balancer(0);
 	tick_do_update_jiffies64(now);
-	cpumask_clear_cpu(cpu, nohz_cpu_mask);
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	/*
@@ -640,8 +663,6 @@
 		next = ktime_add(next, tick_period);
 	}
 	local_irq_enable();
-
-	printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
 }
 
 /*
@@ -793,10 +814,8 @@
 	}
 
 #ifdef CONFIG_NO_HZ
-	if (tick_nohz_enabled) {
+	if (tick_nohz_enabled)
 		ts->nohz_mode = NOHZ_MODE_HIGHRES;
-		printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
-	}
 #endif
 }
 #endif /* HIGH_RES_TIMERS */
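A minimal sketch (illustration only, not part of the patch) of how a caller could use the new semantics above: passing a NULL last_update_time reads the counters without updating them, and idle time no longer includes iowait, so both have to be subtracted from wall time. The helper below is hypothetical and assumes a NO_HZ kernel.

/*
 * Illustration only: sample the cumulative counters and estimate busy
 * time over an interval, without touching the idle accounting state.
 */
static u64 busy_delta_us(int cpu, u64 *prev_idle, u64 *prev_iowait,
			 u64 wall_delta_us)
{
	u64 idle = get_cpu_idle_time_us(cpu, NULL);	/* no counter update */
	u64 iowait = get_cpu_iowait_time_us(cpu, NULL);	/* no counter update */
	u64 idle_delta, iowait_delta;

	/* both accessors return -1 when NO_HZ is disabled */
	if ((s64)idle < 0 || (s64)iowait < 0)
		return wall_delta_us;

	idle_delta = idle - *prev_idle;
	iowait_delta = iowait - *prev_iowait;
	*prev_idle = idle;
	*prev_iowait = iowait;

	/* idle and iowait are now accounted separately, so subtract both */
	return wall_delta_us - idle_delta - iowait_delta;
}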
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index f49405f..5f39a07 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,6 +15,8 @@
 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 endif
 
+CFLAGS_trace_events_filter.o := -I$(src)
+
 #
 # Make the trace clocks available generally: it's infrastructure
 # relied on by ptrace for example:
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c3e4575..077d853 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3863,6 +3863,14 @@
 }
 
 /**
+ * ftrace_is_dead - Test if ftrace is dead or not.
+ */
+int ftrace_is_dead(void)
+{
+	return ftrace_disabled;
+}
+
+/**
  * register_ftrace_function - register a function for profiling
  * @ops - ops structure that holds the function for profiling.
  *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f2f821a..f5b7b5c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -488,12 +488,14 @@
 	struct buffer_page		*reader_page;
 	unsigned long			lost_events;
 	unsigned long			last_overrun;
+	local_t				entries_bytes;
 	local_t				commit_overrun;
 	local_t				overrun;
 	local_t				entries;
 	local_t				committing;
 	local_t				commits;
 	unsigned long			read;
+	unsigned long			read_bytes;
 	u64				write_stamp;
 	u64				read_stamp;
 };
@@ -1708,6 +1710,7 @@
 		 * the counters.
 		 */
 		local_add(entries, &cpu_buffer->overrun);
+		local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
 
 		/*
 		 * The entries will be zeroed out when we move the
@@ -1863,6 +1866,9 @@
 	event = __rb_page_index(tail_page, tail);
 	kmemcheck_annotate_bitfield(event, bitfield);
 
+	/* account for padding bytes */
+	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+
 	/*
 	 * Save the original length to the meta data.
 	 * This will be used by the reader to add lost event
@@ -2054,6 +2060,9 @@
 	if (!tail)
 		tail_page->page->time_stamp = ts;
 
+	/* account for these added bytes */
+	local_add(length, &cpu_buffer->entries_bytes);
+
 	return event;
 }
 
@@ -2076,6 +2085,7 @@
 	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
 		unsigned long write_mask =
 			local_read(&bpage->write) & ~RB_WRITE_MASK;
+		unsigned long event_length = rb_event_length(event);
 		/*
 		 * This is on the tail page. It is possible that
 		 * a write could come in and move the tail page
@@ -2085,8 +2095,11 @@
 		old_index += write_mask;
 		new_index += write_mask;
 		index = local_cmpxchg(&bpage->write, old_index, new_index);
-		if (index == old_index)
+		if (index == old_index) {
+			/* update counters */
+			local_sub(event_length, &cpu_buffer->entries_bytes);
 			return 1;
+		}
 	}
 
 	/* could not discard */
@@ -2661,6 +2674,58 @@
 }
 
 /**
+ * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to read from.
+ */
+unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
+{
+	unsigned long flags;
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct buffer_page *bpage;
+	unsigned long ret;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return 0;
+
+	cpu_buffer = buffer->buffers[cpu];
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	/*
+	 * if the tail is on reader_page, oldest time stamp is on the reader
+	 * page
+	 */
+	if (cpu_buffer->tail_page == cpu_buffer->reader_page)
+		bpage = cpu_buffer->reader_page;
+	else
+		bpage = rb_set_head_page(cpu_buffer);
+	ret = bpage->page->time_stamp;
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
+
+/**
+ * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to read from.
+ */
+unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long ret;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return 0;
+
+	cpu_buffer = buffer->buffers[cpu];
+	ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
+
+/**
  * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
  * @buffer: The ring buffer
  * @cpu: The per CPU buffer to get the entries from.
@@ -3527,11 +3592,13 @@
 	cpu_buffer->reader_page->read = 0;
 
 	local_set(&cpu_buffer->commit_overrun, 0);
+	local_set(&cpu_buffer->entries_bytes, 0);
 	local_set(&cpu_buffer->overrun, 0);
 	local_set(&cpu_buffer->entries, 0);
 	local_set(&cpu_buffer->committing, 0);
 	local_set(&cpu_buffer->commits, 0);
 	cpu_buffer->read = 0;
+	cpu_buffer->read_bytes = 0;
 
 	cpu_buffer->write_stamp = 0;
 	cpu_buffer->read_stamp = 0;
@@ -3918,6 +3985,7 @@
 	} else {
 		/* update the entry counter */
 		cpu_buffer->read += rb_page_entries(reader);
+		cpu_buffer->read_bytes += BUF_PAGE_SIZE;
 
 		/* swap the pages */
 		rb_init_page(bpage);
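The byte accounting added above boils down to one invariant: entries_bytes grows when events and page padding are committed and shrinks when pages are overwritten or events discarded, read_bytes advances a page at a time as the reader swaps pages out, and ring_buffer_bytes_cpu() reports the difference. A standalone, single-threaded model (illustration only):

#include <assert.h>

#define BUF_PAGE_SIZE	4096UL

struct cpu_buffer_model {
	unsigned long entries_bytes;	/* bytes committed, minus overwritten/discarded */
	unsigned long read_bytes;	/* bytes already handed to the reader */
};

/* mirrors ring_buffer_bytes_cpu() */
static unsigned long bytes_cpu(const struct cpu_buffer_model *b)
{
	return b->entries_bytes - b->read_bytes;
}

int main(void)
{
	struct cpu_buffer_model b = { 0, 0 };

	b.entries_bytes += 100;				/* commit a 100-byte event */
	b.entries_bytes += BUF_PAGE_SIZE - 100;		/* pad out the rest of the page */
	assert(bytes_cpu(&b) == BUF_PAGE_SIZE);

	b.read_bytes += BUF_PAGE_SIZE;			/* reader swaps the full page out */
	assert(bytes_cpu(&b) == 0);
	return 0;
}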
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0c8bdee..f2bd275 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -435,6 +435,7 @@
 } trace_clocks[] = {
 	{ trace_clock_local,	"local" },
 	{ trace_clock_global,	"global" },
+	{ trace_clock_counter,	"counter" },
 };
 
 int trace_clock_id;
@@ -2159,6 +2160,14 @@
 	}
 }
 
+static void test_ftrace_alive(struct seq_file *m)
+{
+	if (!ftrace_is_dead())
+		return;
+	seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
+	seq_printf(m, "#          MAY BE MISSING FUNCTION EVENTS\n");
+}
+
 static int s_show(struct seq_file *m, void *v)
 {
 	struct trace_iterator *iter = v;
@@ -2168,6 +2177,7 @@
 		if (iter->tr) {
 			seq_printf(m, "# tracer: %s\n", iter->trace->name);
 			seq_puts(m, "#\n");
+			test_ftrace_alive(m);
 		}
 		if (iter->trace && iter->trace->print_header)
 			iter->trace->print_header(m);
@@ -2710,9 +2720,9 @@
 	"# cat /sys/kernel/debug/tracing/trace_options\n"
 	"noprint-parent nosym-offset nosym-addr noverbose\n"
 	"# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
-	"# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
+	"# echo 1 > /sys/kernel/debug/tracing/tracing_on\n"
 	"# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
-	"# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
+	"# echo 0 > /sys/kernel/debug/tracing/tracing_on\n"
 ;
 
 static ssize_t
@@ -3569,6 +3579,30 @@
 }
 
 static ssize_t
+tracing_total_entries_read(struct file *filp, char __user *ubuf,
+				size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r, cpu;
+	unsigned long size = 0, expanded_size = 0;
+
+	mutex_lock(&trace_types_lock);
+	for_each_tracing_cpu(cpu) {
+		size += tr->entries >> 10;
+		if (!ring_buffer_expanded)
+			expanded_size += trace_buf_size >> 10;
+	}
+	if (ring_buffer_expanded)
+		r = sprintf(buf, "%lu\n", size);
+	else
+		r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
+	mutex_unlock(&trace_types_lock);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
 tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
 			  size_t cnt, loff_t *ppos)
 {
@@ -3594,22 +3628,24 @@
 	return 0;
 }
 
-static int mark_printk(const char *fmt, ...)
-{
-	int ret;
-	va_list args;
-	va_start(args, fmt);
-	ret = trace_vprintk(0, fmt, args);
-	va_end(args);
-	return ret;
-}
-
 static ssize_t
 tracing_mark_write(struct file *filp, const char __user *ubuf,
 					size_t cnt, loff_t *fpos)
 {
-	char *buf;
-	size_t written;
+	unsigned long addr = (unsigned long)ubuf;
+	struct ring_buffer_event *event;
+	struct ring_buffer *buffer;
+	struct print_entry *entry;
+	unsigned long irq_flags;
+	struct page *pages[2];
+	int nr_pages = 1;
+	ssize_t written;
+	void *page1;
+	void *page2;
+	int offset;
+	int size;
+	int len;
+	int ret;
 
 	if (tracing_disabled)
 		return -EINVAL;
@@ -3617,28 +3653,81 @@
 	if (cnt > TRACE_BUF_SIZE)
 		cnt = TRACE_BUF_SIZE;
 
-	buf = kmalloc(cnt + 2, GFP_KERNEL);
-	if (buf == NULL)
-		return -ENOMEM;
+	/*
+	 * Userspace is injecting traces into the kernel trace buffer.
+	 * We want to be as non-intrusive as possible.
+	 * To do so, we do not want to allocate any special buffers
+	 * or take any locks, but instead write the userspace data
+	 * straight into the ring buffer.
+	 *
+	 * First we need to pin the userspace buffer into memory,
+	 * which it most likely is, because userspace just referenced it.
+	 * But there's no guarantee that it is. By using get_user_pages_fast()
+	 * and kmap_atomic/kunmap_atomic() we can get access to the
+	 * pages directly. We then write the data directly into the
+	 * ring buffer.
+	 */
+	BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
 
-	if (copy_from_user(buf, ubuf, cnt)) {
-		kfree(buf);
-		return -EFAULT;
+	/* check if we cross pages */
+	if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
+		nr_pages = 2;
+
+	offset = addr & (PAGE_SIZE - 1);
+	addr &= PAGE_MASK;
+
+	ret = get_user_pages_fast(addr, nr_pages, 0, pages);
+	if (ret < nr_pages) {
+		while (--ret >= 0)
+			put_page(pages[ret]);
+		written = -EFAULT;
+		goto out;
 	}
-	if (buf[cnt-1] != '\n') {
-		buf[cnt] = '\n';
-		buf[cnt+1] = '\0';
-	} else
-		buf[cnt] = '\0';
 
-	written = mark_printk("%s", buf);
-	kfree(buf);
+	page1 = kmap_atomic(pages[0]);
+	if (nr_pages == 2)
+		page2 = kmap_atomic(pages[1]);
+
+	local_save_flags(irq_flags);
+	size = sizeof(*entry) + cnt + 2; /* possible \n added */
+	buffer = global_trace.buffer;
+	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+					  irq_flags, preempt_count());
+	if (!event) {
+		/* Ring buffer disabled, return as if not open for write */
+		written = -EBADF;
+		goto out_unlock;
+	}
+
+	entry = ring_buffer_event_data(event);
+	entry->ip = _THIS_IP_;
+
+	if (nr_pages == 2) {
+		len = PAGE_SIZE - offset;
+		memcpy(&entry->buf, page1 + offset, len);
+		memcpy(&entry->buf[len], page2, cnt - len);
+	} else
+		memcpy(&entry->buf, page1 + offset, cnt);
+
+	if (entry->buf[cnt - 1] != '\n') {
+		entry->buf[cnt] = '\n';
+		entry->buf[cnt + 1] = '\0';
+	} else
+		entry->buf[cnt] = '\0';
+
+	ring_buffer_unlock_commit(buffer, event);
+
+	written = cnt;
+
 	*fpos += written;
 
-	/* don't tell userspace we wrote more - it might confuse them */
-	if (written > cnt)
-		written = cnt;
-
+ out_unlock:
+	if (nr_pages == 2)
+		kunmap_atomic(page2);
+	kunmap_atomic(page1);
+	while (nr_pages > 0)
+		put_page(pages[--nr_pages]);
+ out:
 	return written;
 }
 
@@ -3739,6 +3828,12 @@
 	.llseek		= generic_file_llseek,
 };
 
+static const struct file_operations tracing_total_entries_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_total_entries_read,
+	.llseek		= generic_file_llseek,
+};
+
 static const struct file_operations tracing_free_buffer_fops = {
 	.write		= tracing_free_buffer_write,
 	.release	= tracing_free_buffer_release,
@@ -3808,8 +3903,6 @@
 	if (info->read < PAGE_SIZE)
 		goto read;
 
-	info->read = 0;
-
 	trace_access_lock(info->cpu);
 	ret = ring_buffer_read_page(info->tr->buffer,
 				    &info->spare,
@@ -3819,6 +3912,8 @@
 	if (ret < 0)
 		return 0;
 
+	info->read = 0;
+
 read:
 	size = PAGE_SIZE - info->read;
 	if (size > count)
@@ -4026,6 +4121,8 @@
 	struct trace_array *tr = &global_trace;
 	struct trace_seq *s;
 	unsigned long cnt;
+	unsigned long long t;
+	unsigned long usec_rem;
 
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s)
@@ -4042,6 +4139,17 @@
 	cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
 	trace_seq_printf(s, "commit overrun: %ld\n", cnt);
 
+	cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "bytes: %ld\n", cnt);
+
+	t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
+	usec_rem = do_div(t, USEC_PER_SEC);
+	trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem);
+
+	t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
+	usec_rem = do_div(t, USEC_PER_SEC);
+	trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
+
 	count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -4450,6 +4558,9 @@
 	trace_create_file("buffer_size_kb", 0644, d_tracer,
 			&global_trace, &tracing_entries_fops);
 
+	trace_create_file("buffer_total_size_kb", 0444, d_tracer,
+			&global_trace, &tracing_total_entries_fops);
+
 	trace_create_file("free_buffer", 0644, d_tracer,
 			&global_trace, &tracing_free_buffer_fops);
 
@@ -4566,6 +4677,12 @@
 
 	tracing_off();
 
+	/* Did function tracer already get disabled? */
+	if (ftrace_is_dead()) {
+		printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
+		printk("#          MAY BE MISSING FUNCTION EVENTS\n");
+	}
+
 	if (disable_tracing)
 		ftrace_kill();
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 616846b..092e1f8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -579,11 +579,13 @@
 
 	return test_tsk_trace_trace(task);
 }
+extern int ftrace_is_dead(void);
 #else
 static inline int ftrace_trace_task(struct task_struct *task)
 {
 	return 1;
 }
+static inline int ftrace_is_dead(void) { return 0; }
 #endif
 
 /*
@@ -761,16 +763,10 @@
 	filter_pred_fn_t 	fn;
 	u64 			val;
 	struct regex		regex;
-	/*
-	 * Leaf nodes use field_name, ops is used by AND and OR
-	 * nodes. The field_name is always freed when freeing a pred.
-	 * We can overload field_name for ops and have it freed
-	 * as well.
-	 */
-	union {
-		char		*field_name;
-		unsigned short	*ops;
-	};
+	unsigned short		*ops;
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+	struct ftrace_event_field *field;
+#endif
 	int 			offset;
 	int 			not;
 	int 			op;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 6302747..3947835 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -113,3 +113,15 @@
 
 	return now;
 }
+
+static atomic64_t trace_counter;
+
+/*
+ * trace_clock_counter(): simply an atomic counter.
+ * Use the trace_counter "counter" for cases where you do not care
+ * about timings, but are interested in strict ordering.
+ */
+u64 notrace trace_clock_counter(void)
+{
+	return atomic64_add_return(1, &trace_counter);
+}
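The new "counter" clock is just an atomic increment: strict ordering between events, no timing information. A userspace model (illustration only), with C11 atomics standing in for atomic64_add_return():

#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint_fast64_t counter_model;

/* mirrors trace_clock_counter(): each call returns the next value */
static uint64_t clock_counter_model(void)
{
	return atomic_fetch_add(&counter_model, 1) + 1;
}

int main(void)
{
	uint64_t a = clock_counter_model();	/* 1 */
	uint64_t b = clock_counter_model();	/* 2 */

	printf("%" PRIu64 " %" PRIu64 "\n", a, b);
	return 0;
}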
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 256764e..816d3d0 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -381,6 +381,63 @@
 	return pred;
 }
 
+enum walk_return {
+	WALK_PRED_ABORT,
+	WALK_PRED_PARENT,
+	WALK_PRED_DEFAULT,
+};
+
+typedef int (*filter_pred_walkcb_t) (enum move_type move,
+				     struct filter_pred *pred,
+				     int *err, void *data);
+
+static int walk_pred_tree(struct filter_pred *preds,
+			  struct filter_pred *root,
+			  filter_pred_walkcb_t cb, void *data)
+{
+	struct filter_pred *pred = root;
+	enum move_type move = MOVE_DOWN;
+	int done = 0;
+
+	if (!preds)
+		return -EINVAL;
+
+	do {
+		int err = 0, ret;
+
+		ret = cb(move, pred, &err, data);
+		if (ret == WALK_PRED_ABORT)
+			return err;
+		if (ret == WALK_PRED_PARENT)
+			goto get_parent;
+
+		switch (move) {
+		case MOVE_DOWN:
+			if (pred->left != FILTER_PRED_INVALID) {
+				pred = &preds[pred->left];
+				continue;
+			}
+			goto get_parent;
+		case MOVE_UP_FROM_LEFT:
+			pred = &preds[pred->right];
+			move = MOVE_DOWN;
+			continue;
+		case MOVE_UP_FROM_RIGHT:
+ get_parent:
+			if (pred == root)
+				break;
+			pred = get_pred_parent(pred, preds,
+					       pred->parent,
+					       &move);
+			continue;
+		}
+		done = 1;
+	} while (!done);
+
+	/* We are fine. */
+	return 0;
+}
+
 /*
  * A series of AND or ORs where found together. Instead of
  * climbing up and down the tree branches, an array of the
@@ -410,99 +467,91 @@
 
 	for (i = 0; i < op->val; i++) {
 		pred = &preds[op->ops[i]];
-		match = pred->fn(pred, rec);
+		if (!WARN_ON_ONCE(!pred->fn))
+			match = pred->fn(pred, rec);
 		if (!!match == type)
 			return match;
 	}
 	return match;
 }
 
+struct filter_match_preds_data {
+	struct filter_pred *preds;
+	int match;
+	void *rec;
+};
+
+static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
+				 int *err, void *data)
+{
+	struct filter_match_preds_data *d = data;
+
+	*err = 0;
+	switch (move) {
+	case MOVE_DOWN:
+		/* only AND and OR have children */
+		if (pred->left != FILTER_PRED_INVALID) {
+			/* If ops is set, then it was folded. */
+			if (!pred->ops)
+				return WALK_PRED_DEFAULT;
+			/* We can treat folded ops as a leaf node */
+			d->match = process_ops(d->preds, pred, d->rec);
+		} else {
+			if (!WARN_ON_ONCE(!pred->fn))
+				d->match = pred->fn(pred, d->rec);
+		}
+
+		return WALK_PRED_PARENT;
+	case MOVE_UP_FROM_LEFT:
+		/*
+		 * Check for short circuits.
+		 *
+		 * Optimization: !!match == (pred->op == OP_OR)
+		 *   is the same as:
+		 * if ((match && pred->op == OP_OR) ||
+		 *     (!match && pred->op == OP_AND))
+		 */
+		if (!!d->match == (pred->op == OP_OR))
+			return WALK_PRED_PARENT;
+		break;
+	case MOVE_UP_FROM_RIGHT:
+		break;
+	}
+
+	return WALK_PRED_DEFAULT;
+}
+
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct event_filter *filter, void *rec)
 {
-	int match = -1;
-	enum move_type move = MOVE_DOWN;
 	struct filter_pred *preds;
-	struct filter_pred *pred;
 	struct filter_pred *root;
-	int n_preds;
-	int done = 0;
+	struct filter_match_preds_data data = {
+		/* match is currently meaningless */
+		.match = -1,
+		.rec   = rec,
+	};
+	int n_preds, ret;
 
 	/* no filter is considered a match */
 	if (!filter)
 		return 1;
 
 	n_preds = filter->n_preds;
-
 	if (!n_preds)
 		return 1;
 
 	/*
 	 * n_preds, root and filter->preds are protected with preemption disabled.
 	 */
-	preds = rcu_dereference_sched(filter->preds);
 	root = rcu_dereference_sched(filter->root);
 	if (!root)
 		return 1;
 
-	pred = root;
-
-	/* match is currently meaningless */
-	match = -1;
-
-	do {
-		switch (move) {
-		case MOVE_DOWN:
-			/* only AND and OR have children */
-			if (pred->left != FILTER_PRED_INVALID) {
-				/* If ops is set, then it was folded. */
-				if (!pred->ops) {
-					/* keep going to down the left side */
-					pred = &preds[pred->left];
-					continue;
-				}
-				/* We can treat folded ops as a leaf node */
-				match = process_ops(preds, pred, rec);
-			} else
-				match = pred->fn(pred, rec);
-			/* If this pred is the only pred */
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		case MOVE_UP_FROM_LEFT:
-			/*
-			 * Check for short circuits.
-			 *
-			 * Optimization: !!match == (pred->op == OP_OR)
-			 *   is the same as:
-			 * if ((match && pred->op == OP_OR) ||
-			 *     (!match && pred->op == OP_AND))
-			 */
-			if (!!match == (pred->op == OP_OR)) {
-				if (pred == root)
-					break;
-				pred = get_pred_parent(pred, preds,
-						       pred->parent, &move);
-				continue;
-			}
-			/* now go down the right side of the tree. */
-			pred = &preds[pred->right];
-			move = MOVE_DOWN;
-			continue;
-		case MOVE_UP_FROM_RIGHT:
-			/* We finished this equation. */
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		}
-		done = 1;
-	} while (!done);
-
-	return match;
+	data.preds = preds = rcu_dereference_sched(filter->preds);
+	ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data);
+	WARN_ON(ret);
+	return data.match;
 }
 EXPORT_SYMBOL_GPL(filter_match_preds);
 
@@ -628,22 +677,6 @@
 	return __find_event_field(head, name);
 }
 
-static void filter_free_pred(struct filter_pred *pred)
-{
-	if (!pred)
-		return;
-
-	kfree(pred->field_name);
-	kfree(pred);
-}
-
-static void filter_clear_pred(struct filter_pred *pred)
-{
-	kfree(pred->field_name);
-	pred->field_name = NULL;
-	pred->regex.len = 0;
-}
-
 static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
 {
 	stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
@@ -689,20 +722,13 @@
 static int filter_set_pred(struct event_filter *filter,
 			   int idx,
 			   struct pred_stack *stack,
-			   struct filter_pred *src,
-			   filter_pred_fn_t fn)
+			   struct filter_pred *src)
 {
 	struct filter_pred *dest = &filter->preds[idx];
 	struct filter_pred *left;
 	struct filter_pred *right;
 
 	*dest = *src;
-	if (src->field_name) {
-		dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
-		if (!dest->field_name)
-			return -ENOMEM;
-	}
-	dest->fn = fn;
 	dest->index = idx;
 
 	if (dest->op == OP_OR || dest->op == OP_AND) {
@@ -743,11 +769,7 @@
 
 static void __free_preds(struct event_filter *filter)
 {
-	int i;
-
 	if (filter->preds) {
-		for (i = 0; i < filter->a_preds; i++)
-			kfree(filter->preds[i].field_name);
 		kfree(filter->preds);
 		filter->preds = NULL;
 	}
@@ -840,23 +862,19 @@
 	}
 }
 
-static int filter_add_pred_fn(struct filter_parse_state *ps,
-			      struct ftrace_event_call *call,
-			      struct event_filter *filter,
-			      struct filter_pred *pred,
-			      struct pred_stack *stack,
-			      filter_pred_fn_t fn)
+static int filter_add_pred(struct filter_parse_state *ps,
+			   struct event_filter *filter,
+			   struct filter_pred *pred,
+			   struct pred_stack *stack)
 {
-	int idx, err;
+	int err;
 
 	if (WARN_ON(filter->n_preds == filter->a_preds)) {
 		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
 		return -ENOSPC;
 	}
 
-	idx = filter->n_preds;
-	filter_clear_pred(&filter->preds[idx]);
-	err = filter_set_pred(filter, idx, stack, pred, fn);
+	err = filter_set_pred(filter, filter->n_preds, stack, pred);
 	if (err)
 		return err;
 
@@ -937,31 +955,15 @@
 	return fn;
 }
 
-static int filter_add_pred(struct filter_parse_state *ps,
-			   struct ftrace_event_call *call,
-			   struct event_filter *filter,
-			   struct filter_pred *pred,
-			   struct pred_stack *stack,
-			   bool dry_run)
+static int init_pred(struct filter_parse_state *ps,
+		     struct ftrace_event_field *field,
+		     struct filter_pred *pred)
+
 {
-	struct ftrace_event_field *field;
-	filter_pred_fn_t fn;
+	filter_pred_fn_t fn = filter_pred_none;
 	unsigned long long val;
 	int ret;
 
-	fn = pred->fn = filter_pred_none;
-
-	if (pred->op == OP_AND)
-		goto add_pred_fn;
-	else if (pred->op == OP_OR)
-		goto add_pred_fn;
-
-	field = find_event_field(call, pred->field_name);
-	if (!field) {
-		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
-		return -EINVAL;
-	}
-
 	pred->offset = field->offset;
 
 	if (!is_legal_op(field, pred->op)) {
@@ -1001,9 +1003,7 @@
 	if (pred->op == OP_NE)
 		pred->not = 1;
 
-add_pred_fn:
-	if (!dry_run)
-		return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
+	pred->fn = fn;
 	return 0;
 }
 
@@ -1302,39 +1302,37 @@
 	return 0;
 }
 
-static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
+static struct filter_pred *create_pred(struct filter_parse_state *ps,
+				       struct ftrace_event_call *call,
+				       int op, char *operand1, char *operand2)
 {
-	struct filter_pred *pred;
+	struct ftrace_event_field *field;
+	static struct filter_pred pred;
 
-	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-	if (!pred)
-		return NULL;
+	memset(&pred, 0, sizeof(pred));
+	pred.op = op;
 
-	pred->field_name = kstrdup(operand1, GFP_KERNEL);
-	if (!pred->field_name) {
-		kfree(pred);
+	if (op == OP_AND || op == OP_OR)
+		return &pred;
+
+	if (!operand1 || !operand2) {
+		parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
 		return NULL;
 	}
 
-	strcpy(pred->regex.pattern, operand2);
-	pred->regex.len = strlen(pred->regex.pattern);
-
-	pred->op = op;
-
-	return pred;
-}
-
-static struct filter_pred *create_logical_pred(int op)
-{
-	struct filter_pred *pred;
-
-	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-	if (!pred)
+	field = find_event_field(call, operand1);
+	if (!field) {
+		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
 		return NULL;
+	}
 
-	pred->op = op;
+	strcpy(pred.regex.pattern, operand2);
+	pred.regex.len = strlen(pred.regex.pattern);
 
-	return pred;
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+	pred.field = field;
+#endif
+	return init_pred(ps, field, &pred) ? NULL : &pred;
 }
 
 static int check_preds(struct filter_parse_state *ps)
@@ -1375,6 +1373,23 @@
 	return n_preds;
 }
 
+struct check_pred_data {
+	int count;
+	int max;
+};
+
+static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
+			      int *err, void *data)
+{
+	struct check_pred_data *d = data;
+
+	if (WARN_ON(d->count++ > d->max)) {
+		*err = -EINVAL;
+		return WALK_PRED_ABORT;
+	}
+	return WALK_PRED_DEFAULT;
+}
+
 /*
  * The tree is walked at filtering of an event. If the tree is not correctly
  * built, it may cause an infinite loop. Check here that the tree does
@@ -1383,107 +1398,76 @@
 static int check_pred_tree(struct event_filter *filter,
 			   struct filter_pred *root)
 {
-	struct filter_pred *preds;
-	struct filter_pred *pred;
-	enum move_type move = MOVE_DOWN;
-	int count = 0;
-	int done = 0;
-	int max;
+	struct check_pred_data data = {
+		/*
+		 * The max that we can hit a node is three times.
+		 * Once going down, once coming up from left, and
+		 * once coming up from right. This is more than enough
+		 * since leafs are only hit a single time.
+		 */
+		.max   = 3 * filter->n_preds,
+		.count = 0,
+	};
 
-	/*
-	 * The max that we can hit a node is three times.
-	 * Once going down, once coming up from left, and
-	 * once coming up from right. This is more than enough
-	 * since leafs are only hit a single time.
-	 */
-	max = 3 * filter->n_preds;
+	return walk_pred_tree(filter->preds, root,
+			      check_pred_tree_cb, &data);
+}
 
-	preds = filter->preds;
-	if  (!preds)
-		return -EINVAL;
-	pred = root;
+static int count_leafs_cb(enum move_type move, struct filter_pred *pred,
+			  int *err, void *data)
+{
+	int *count = data;
 
-	do {
-		if (WARN_ON(count++ > max))
-			return -EINVAL;
+	if ((move == MOVE_DOWN) &&
+	    (pred->left == FILTER_PRED_INVALID))
+		(*count)++;
 
-		switch (move) {
-		case MOVE_DOWN:
-			if (pred->left != FILTER_PRED_INVALID) {
-				pred = &preds[pred->left];
-				continue;
-			}
-			/* A leaf at the root is just a leaf in the tree */
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		case MOVE_UP_FROM_LEFT:
-			pred = &preds[pred->right];
-			move = MOVE_DOWN;
-			continue;
-		case MOVE_UP_FROM_RIGHT:
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		}
-		done = 1;
-	} while (!done);
-
-	/* We are fine. */
-	return 0;
+	return WALK_PRED_DEFAULT;
 }
 
 static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
 {
-	struct filter_pred *pred;
-	enum move_type move = MOVE_DOWN;
-	int count = 0;
-	int done = 0;
+	int count = 0, ret;
 
-	pred = root;
-
-	do {
-		switch (move) {
-		case MOVE_DOWN:
-			if (pred->left != FILTER_PRED_INVALID) {
-				pred = &preds[pred->left];
-				continue;
-			}
-			/* A leaf at the root is just a leaf in the tree */
-			if (pred == root)
-				return 1;
-			count++;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		case MOVE_UP_FROM_LEFT:
-			pred = &preds[pred->right];
-			move = MOVE_DOWN;
-			continue;
-		case MOVE_UP_FROM_RIGHT:
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		}
-		done = 1;
-	} while (!done);
-
+	ret = walk_pred_tree(preds, root, count_leafs_cb, &count);
+	WARN_ON(ret);
 	return count;
 }
 
+struct fold_pred_data {
+	struct filter_pred *root;
+	int count;
+	int children;
+};
+
+static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
+			int *err, void *data)
+{
+	struct fold_pred_data *d = data;
+	struct filter_pred *root = d->root;
+
+	if (move != MOVE_DOWN)
+		return WALK_PRED_DEFAULT;
+	if (pred->left != FILTER_PRED_INVALID)
+		return WALK_PRED_DEFAULT;
+
+	if (WARN_ON(d->count == d->children)) {
+		*err = -EINVAL;
+		return WALK_PRED_ABORT;
+	}
+
+	pred->index &= ~FILTER_PRED_FOLD;
+	root->ops[d->count++] = pred->index;
+	return WALK_PRED_DEFAULT;
+}
+
 static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
 {
-	struct filter_pred *pred;
-	enum move_type move = MOVE_DOWN;
-	int count = 0;
+	struct fold_pred_data data = {
+		.root  = root,
+		.count = 0,
+	};
 	int children;
-	int done = 0;
 
 	/* No need to keep the fold flag */
 	root->index &= ~FILTER_PRED_FOLD;
@@ -1501,37 +1485,26 @@
 		return -ENOMEM;
 
 	root->val = children;
+	data.children = children;
+	return walk_pred_tree(preds, root, fold_pred_cb, &data);
+}
 
-	pred = root;
-	do {
-		switch (move) {
-		case MOVE_DOWN:
-			if (pred->left != FILTER_PRED_INVALID) {
-				pred = &preds[pred->left];
-				continue;
-			}
-			if (WARN_ON(count == children))
-				return -EINVAL;
-			pred->index &= ~FILTER_PRED_FOLD;
-			root->ops[count++] = pred->index;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		case MOVE_UP_FROM_LEFT:
-			pred = &preds[pred->right];
-			move = MOVE_DOWN;
-			continue;
-		case MOVE_UP_FROM_RIGHT:
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		}
-		done = 1;
-	} while (!done);
+static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
+			     int *err, void *data)
+{
+	struct filter_pred *preds = data;
 
-	return 0;
+	if (move != MOVE_DOWN)
+		return WALK_PRED_DEFAULT;
+	if (!(pred->index & FILTER_PRED_FOLD))
+		return WALK_PRED_DEFAULT;
+
+	*err = fold_pred(preds, pred);
+	if (*err)
+		return WALK_PRED_ABORT;
+
+	/* everything below is folded, continue with parent */
+	return WALK_PRED_PARENT;
 }
 
 /*
@@ -1542,51 +1515,8 @@
 static int fold_pred_tree(struct event_filter *filter,
 			   struct filter_pred *root)
 {
-	struct filter_pred *preds;
-	struct filter_pred *pred;
-	enum move_type move = MOVE_DOWN;
-	int done = 0;
-	int err;
-
-	preds = filter->preds;
-	if  (!preds)
-		return -EINVAL;
-	pred = root;
-
-	do {
-		switch (move) {
-		case MOVE_DOWN:
-			if (pred->index & FILTER_PRED_FOLD) {
-				err = fold_pred(preds, pred);
-				if (err)
-					return err;
-				/* Folded nodes are like leafs */
-			} else if (pred->left != FILTER_PRED_INVALID) {
-				pred = &preds[pred->left];
-				continue;
-			}
-
-			/* A leaf at the root is just a leaf in the tree */
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		case MOVE_UP_FROM_LEFT:
-			pred = &preds[pred->right];
-			move = MOVE_DOWN;
-			continue;
-		case MOVE_UP_FROM_RIGHT:
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent, &move);
-			continue;
-		}
-		done = 1;
-	} while (!done);
-
-	return 0;
+	return walk_pred_tree(filter->preds, root, fold_pred_tree_cb,
+			      filter->preds);
 }
 
 static int replace_preds(struct ftrace_event_call *call,
@@ -1643,27 +1573,17 @@
 			goto fail;
 		}
 
-		if (elt->op == OP_AND || elt->op == OP_OR) {
-			pred = create_logical_pred(elt->op);
-			goto add_pred;
-		}
-
-		if (!operand1 || !operand2) {
-			parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
+		pred = create_pred(ps, call, elt->op, operand1, operand2);
+		if (!pred) {
 			err = -EINVAL;
 			goto fail;
 		}
 
-		pred = create_pred(elt->op, operand1, operand2);
-add_pred:
-		if (!pred) {
-			err = -ENOMEM;
-			goto fail;
+		if (!dry_run) {
+			err = filter_add_pred(ps, filter, pred, &stack);
+			if (err)
+				goto fail;
 		}
-		err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
-		filter_free_pred(pred);
-		if (err)
-			goto fail;
 
 		operand1 = operand2 = NULL;
 	}
@@ -1958,17 +1878,14 @@
 	int err;
 	struct event_filter *filter;
 	struct filter_parse_state *ps;
-	struct ftrace_event_call *call = NULL;
+	struct ftrace_event_call *call;
 
 	mutex_lock(&event_mutex);
 
-	list_for_each_entry(call, &ftrace_events, list) {
-		if (call->event.type == event_id)
-			break;
-	}
+	call = event->tp_event;
 
 	err = -EINVAL;
-	if (&call->list == &ftrace_events)
+	if (!call)
 		goto out_unlock;
 
 	err = -EEXIST;
@@ -2012,3 +1929,215 @@
 
 #endif /* CONFIG_PERF_EVENTS */
 
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace_events_filter_test.h"
+
+static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
+			   struct event_filter **pfilter)
+{
+	struct event_filter *filter;
+	struct filter_parse_state *ps;
+	int err = -ENOMEM;
+
+	filter = __alloc_filter();
+	if (!filter)
+		goto out;
+
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		goto free_filter;
+
+	parse_init(ps, filter_ops, filter_str);
+	err = filter_parse(ps);
+	if (err)
+		goto free_ps;
+
+	err = replace_preds(call, filter, ps, filter_str, false);
+	if (!err)
+		*pfilter = filter;
+
+ free_ps:
+	filter_opstack_clear(ps);
+	postfix_clear(ps);
+	kfree(ps);
+
+ free_filter:
+	if (err)
+		__free_filter(filter);
+
+ out:
+	return err;
+}
+
+#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
+{ \
+	.filter = FILTER, \
+	.rec    = { .a = va, .b = vb, .c = vc, .d = vd, \
+		    .e = ve, .f = vf, .g = vg, .h = vh }, \
+	.match  = m, \
+	.not_visited = nvisit, \
+}
+#define YES 1
+#define NO  0
+
+static struct test_filter_data_t {
+	char *filter;
+	struct ftrace_raw_ftrace_test_filter rec;
+	int match;
+	char *not_visited;
+} test_filter_data[] = {
+#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \
+	       "e == 1 && f == 1 && g == 1 && h == 1"
+	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""),
+	DATA_REC(NO,  0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"),
+	DATA_REC(NO,  1, 1, 1, 1, 1, 1, 1, 0, ""),
+#undef FILTER
+#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \
+	       "e == 1 || f == 1 || g == 1 || h == 1"
+	DATA_REC(NO,  0, 0, 0, 0, 0, 0, 0, 0, ""),
+	DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""),
+	DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"),
+#undef FILTER
+#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \
+	       "(e == 1 || f == 1) && (g == 1 || h == 1)"
+	DATA_REC(NO,  0, 0, 1, 1, 1, 1, 1, 1, "dfh"),
+	DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
+	DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"),
+	DATA_REC(NO,  1, 0, 1, 0, 0, 1, 0, 0, "bd"),
+#undef FILTER
+#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \
+	       "(e == 1 && f == 1) || (g == 1 && h == 1)"
+	DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"),
+	DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""),
+	DATA_REC(NO,  0, 0, 0, 0, 0, 0, 0, 1, ""),
+#undef FILTER
+#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \
+	       "(e == 1 && f == 1) || (g == 1 && h == 1)"
+	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"),
+	DATA_REC(NO,  0, 0, 0, 0, 0, 0, 0, 1, ""),
+	DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""),
+#undef FILTER
+#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \
+	       "(e == 1 || f == 1)) && (g == 1 || h == 1)"
+	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"),
+	DATA_REC(NO,  0, 0, 0, 0, 0, 0, 0, 0, ""),
+	DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"),
+#undef FILTER
+#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \
+	       "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))"
+	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"),
+	DATA_REC(NO,  0, 1, 0, 1, 0, 1, 0, 1, ""),
+	DATA_REC(NO,  1, 0, 1, 0, 1, 0, 1, 0, ""),
+#undef FILTER
+#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \
+	       "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))"
+	DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"),
+	DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
+	DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"),
+};
+
+#undef DATA_REC
+#undef FILTER
+#undef YES
+#undef NO
+
+#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
+
+static int test_pred_visited;
+
+static int test_pred_visited_fn(struct filter_pred *pred, void *event)
+{
+	struct ftrace_event_field *field = pred->field;
+
+	test_pred_visited = 1;
+	printk(KERN_INFO "\npred visited %s\n", field->name);
+	return 1;
+}
+
+static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
+			     int *err, void *data)
+{
+	char *fields = data;
+
+	if ((move == MOVE_DOWN) &&
+	    (pred->left == FILTER_PRED_INVALID)) {
+		struct ftrace_event_field *field = pred->field;
+
+		if (!field) {
+			WARN(1, "all leafs should have field defined");
+			return WALK_PRED_DEFAULT;
+		}
+		if (!strchr(fields, *field->name))
+			return WALK_PRED_DEFAULT;
+
+		WARN_ON(!pred->fn);
+		pred->fn = test_pred_visited_fn;
+	}
+	return WALK_PRED_DEFAULT;
+}
+
+static __init int ftrace_test_event_filter(void)
+{
+	int i;
+
+	printk(KERN_INFO "Testing ftrace filter: ");
+
+	for (i = 0; i < DATA_CNT; i++) {
+		struct event_filter *filter = NULL;
+		struct test_filter_data_t *d = &test_filter_data[i];
+		int err;
+
+		err = test_get_filter(d->filter, &event_ftrace_test_filter,
+				      &filter);
+		if (err) {
+			printk(KERN_INFO
+			       "Failed to get filter for '%s', err %d\n",
+			       d->filter, err);
+			break;
+		}
+
+		/*
+		 * The preemption disabling is not really needed for self
+		 * tests, but the rcu dereference will complain without it.
+		 */
+		preempt_disable();
+		if (*d->not_visited)
+			walk_pred_tree(filter->preds, filter->root,
+				       test_walk_pred_cb,
+				       d->not_visited);
+
+		test_pred_visited = 0;
+		err = filter_match_preds(filter, &d->rec);
+		preempt_enable();
+
+		__free_filter(filter);
+
+		if (test_pred_visited) {
+			printk(KERN_INFO
+			       "Failed, unwanted pred visited for filter %s\n",
+			       d->filter);
+			break;
+		}
+
+		if (err != d->match) {
+			printk(KERN_INFO
+			       "Failed to match filter '%s', expected %d\n",
+			       d->filter, d->match);
+			break;
+		}
+	}
+
+	if (i == DATA_CNT)
+		printk(KERN_CONT "OK\n");
+
+	return 0;
+}
+
+late_initcall(ftrace_test_event_filter);
+
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
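The refactoring above funnels every walk over the predicate tree through walk_pred_tree() plus a small callback. As an illustration only, a hypothetical new walker that counts the nodes in a tree needs nothing beyond the types and helpers added in this file:

struct count_nodes_data {
	int count;
};

static int count_nodes_cb(enum move_type move, struct filter_pred *pred,
			  int *err, void *data)
{
	struct count_nodes_data *d = data;

	/* each node is entered exactly once on the way down */
	if (move == MOVE_DOWN)
		d->count++;
	return WALK_PRED_DEFAULT;
}

static int count_nodes(struct filter_pred *preds, struct filter_pred *root)
{
	struct count_nodes_data data = { .count = 0 };
	int ret;

	ret = walk_pred_tree(preds, root, count_nodes_cb, &data);
	return ret ? ret : data.count;
}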
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
new file mode 100644
index 0000000..bfd4dba
--- /dev/null
+++ b/kernel/trace/trace_events_filter_test.h
@@ -0,0 +1,50 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM test
+
+#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TEST_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(ftrace_test_filter,
+
+	TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h),
+
+	TP_ARGS(a, b, c, d, e, f, g, h),
+
+	TP_STRUCT__entry(
+		__field(int, a)
+		__field(int, b)
+		__field(int, c)
+		__field(int, d)
+		__field(int, e)
+		__field(int, f)
+		__field(int, g)
+		__field(int, h)
+	),
+
+	TP_fast_assign(
+		__entry->a = a;
+		__entry->b = b;
+		__entry->c = c;
+		__entry->d = d;
+		__entry->e = e;
+		__entry->f = f;
+		__entry->g = g;
+		__entry->h = h;
+	),
+
+	TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d",
+		  __entry->a, __entry->b, __entry->c, __entry->d,
+		  __entry->e, __entry->f, __entry->g, __entry->h)
+);
+
+#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_events_filter_test
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
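For reference (illustration only; the self-test above never actually fires the event): TRACE_EVENT(ftrace_test_filter, ...) generates a trace_ftrace_test_filter() static inline, so emitting the event from kernel code would look like this:

static void fire_test_filter_event(void)
{
	/* arguments map to fields a..h of the event defined above */
	trace_ftrace_test_filter(1, 0, 1, 0, 1, 0, 1, 0);
}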
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 1118621..20dad0d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -505,13 +505,13 @@
 #ifdef CONFIG_PREEMPT_TRACER
 void trace_preempt_on(unsigned long a0, unsigned long a1)
 {
-	if (preempt_trace())
+	if (preempt_trace() && !irq_trace())
 		stop_critical_timing(a0, a1);
 }
 
 void trace_preempt_off(unsigned long a0, unsigned long a1)
 {
-	if (preempt_trace())
+	if (preempt_trace() && !irq_trace())
 		start_critical_timing(a0, a1);
 }
 #endif /* CONFIG_PREEMPT_TRACER */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5fb3697..00d527c 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -836,11 +836,17 @@
 }
 
 /* Unregister a trace_probe and probe_event: call with locking probe_lock */
-static void unregister_trace_probe(struct trace_probe *tp)
+static int unregister_trace_probe(struct trace_probe *tp)
 {
+	/* An enabled event cannot be unregistered */
+	if (trace_probe_is_enabled(tp))
+		return -EBUSY;
+
 	__unregister_trace_probe(tp);
 	list_del(&tp->list);
 	unregister_probe_event(tp);
+
+	return 0;
 }
 
 /* Register a trace_probe and probe_event */
@@ -854,7 +860,9 @@
 	/* Delete old (same name) event if exist */
 	old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
 	if (old_tp) {
-		unregister_trace_probe(old_tp);
+		ret = unregister_trace_probe(old_tp);
+		if (ret < 0)
+			goto end;
 		free_trace_probe(old_tp);
 	}
 
@@ -892,6 +900,7 @@
 	mutex_lock(&probe_lock);
 	list_for_each_entry(tp, &probe_list, list) {
 		if (trace_probe_within_module(tp, mod)) {
+			/* Don't need to check busy - this should have gone. */
 			__unregister_trace_probe(tp);
 			ret = __register_trace_probe(tp);
 			if (ret)
@@ -1205,10 +1214,11 @@
 			return -ENOENT;
 		}
 		/* delete an event */
-		unregister_trace_probe(tp);
-		free_trace_probe(tp);
+		ret = unregister_trace_probe(tp);
+		if (ret == 0)
+			free_trace_probe(tp);
 		mutex_unlock(&probe_lock);
-		return 0;
+		return ret;
 	}
 
 	if (argc < 2) {
@@ -1317,18 +1327,29 @@
 	return ret;
 }
 
-static void release_all_trace_probes(void)
+static int release_all_trace_probes(void)
 {
 	struct trace_probe *tp;
+	int ret = 0;
 
 	mutex_lock(&probe_lock);
+	/* Ensure no probe is in use. */
+	list_for_each_entry(tp, &probe_list, list)
+		if (trace_probe_is_enabled(tp)) {
+			ret = -EBUSY;
+			goto end;
+		}
 	/* TODO: Use batch unregistration */
 	while (!list_empty(&probe_list)) {
 		tp = list_entry(probe_list.next, struct trace_probe, list);
 		unregister_trace_probe(tp);
 		free_trace_probe(tp);
 	}
+
+end:
 	mutex_unlock(&probe_lock);
+
+	return ret;
 }
 
 /* Probes listing interfaces */
@@ -1380,9 +1401,13 @@
 
 static int probes_open(struct inode *inode, struct file *file)
 {
-	if ((file->f_mode & FMODE_WRITE) &&
-	    (file->f_flags & O_TRUNC))
-		release_all_trace_probes();
+	int ret;
+
+	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+		ret = release_all_trace_probes();
+		if (ret < 0)
+			return ret;
+	}
 
 	return seq_open(file, &probes_seq_op);
 }
@@ -2055,6 +2080,21 @@
 
 	ret = target(1, 2, 3, 4, 5, 6);
 
+	/* Disable trace points before removing it */
+	tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
+	if (WARN_ON_ONCE(tp == NULL)) {
+		pr_warning("error on getting test probe.\n");
+		warn++;
+	} else
+		disable_trace_probe(tp, TP_FLAG_TRACE);
+
+	tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
+	if (WARN_ON_ONCE(tp == NULL)) {
+		pr_warning("error on getting 2nd test probe.\n");
+		warn++;
+	} else
+		disable_trace_probe(tp, TP_FLAG_TRACE);
+
 	ret = command_trace_probe("-:testprobe");
 	if (WARN_ON_ONCE(ret)) {
 		pr_warning("error on deleting a probe.\n");
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 1f06468..6fd4ffd 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -59,18 +59,19 @@
 			continue;
 		}
 
+		fmt = NULL;
 		tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
-		if (tb_fmt)
+		if (tb_fmt) {
 			fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
-		if (tb_fmt && fmt) {
-			list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
-			strcpy(fmt, *iter);
-			tb_fmt->fmt = fmt;
-			*iter = tb_fmt->fmt;
-		} else {
-			kfree(tb_fmt);
-			*iter = NULL;
+			if (fmt) {
+				list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
+				strcpy(fmt, *iter);
+				tb_fmt->fmt = fmt;
+			} else
+				kfree(tb_fmt);
 		}
+		*iter = fmt;
+
 	}
 	mutex_unlock(&btrace_mutex);
 }
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index b219f14..db110b8 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -34,11 +34,16 @@
 static const int tracepoint_debug;
 
 /*
- * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the
- * builtin and module tracepoints and the hash table.
+ * Tracepoints mutex protects the builtin and module tracepoints and the hash
+ * table, as well as the local module list.
  */
 static DEFINE_MUTEX(tracepoints_mutex);
 
+#ifdef CONFIG_MODULES
+/* Local list of struct module */
+static LIST_HEAD(tracepoint_module_list);
+#endif /* CONFIG_MODULES */
+
 /*
  * Tracepoint hash table, containing the active tracepoints.
  * Protected by tracepoints_mutex.
@@ -292,9 +297,10 @@
  * @end: end of the range
  *
  * Updates the probe callback corresponding to a range of tracepoints.
+ * Called with tracepoints_mutex held.
  */
-void tracepoint_update_probe_range(struct tracepoint * const *begin,
-				   struct tracepoint * const *end)
+static void tracepoint_update_probe_range(struct tracepoint * const *begin,
+					  struct tracepoint * const *end)
 {
 	struct tracepoint * const *iter;
 	struct tracepoint_entry *mark_entry;
@@ -302,7 +308,6 @@
 	if (!begin)
 		return;
 
-	mutex_lock(&tracepoints_mutex);
 	for (iter = begin; iter < end; iter++) {
 		mark_entry = get_tracepoint((*iter)->name);
 		if (mark_entry) {
@@ -312,11 +317,27 @@
 			disable_tracepoint(*iter);
 		}
 	}
-	mutex_unlock(&tracepoints_mutex);
 }
 
+#ifdef CONFIG_MODULES
+void module_update_tracepoints(void)
+{
+	struct tp_module *tp_mod;
+
+	list_for_each_entry(tp_mod, &tracepoint_module_list, list)
+		tracepoint_update_probe_range(tp_mod->tracepoints_ptrs,
+			tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints);
+}
+#else /* CONFIG_MODULES */
+void module_update_tracepoints(void)
+{
+}
+#endif /* CONFIG_MODULES */
+
+
 /*
  * Update probes, removing the faulty probes.
+ * Called with tracepoints_mutex held.
  */
 static void tracepoint_update_probes(void)
 {
@@ -359,11 +380,12 @@
 
 	mutex_lock(&tracepoints_mutex);
 	old = tracepoint_add_probe(name, probe, data);
-	mutex_unlock(&tracepoints_mutex);
-	if (IS_ERR(old))
+	if (IS_ERR(old)) {
+		mutex_unlock(&tracepoints_mutex);
 		return PTR_ERR(old);
-
+	}
 	tracepoint_update_probes();		/* may update entry */
+	mutex_unlock(&tracepoints_mutex);
 	release_probes(old);
 	return 0;
 }
@@ -402,11 +424,12 @@
 
 	mutex_lock(&tracepoints_mutex);
 	old = tracepoint_remove_probe(name, probe, data);
-	mutex_unlock(&tracepoints_mutex);
-	if (IS_ERR(old))
+	if (IS_ERR(old)) {
+		mutex_unlock(&tracepoints_mutex);
 		return PTR_ERR(old);
-
+	}
 	tracepoint_update_probes();		/* may update entry */
+	mutex_unlock(&tracepoints_mutex);
 	release_probes(old);
 	return 0;
 }
@@ -489,9 +512,8 @@
 	if (!list_empty(&old_probes))
 		list_replace_init(&old_probes, &release_probes);
 	need_update = 0;
-	mutex_unlock(&tracepoints_mutex);
-
 	tracepoint_update_probes();
+	mutex_unlock(&tracepoints_mutex);
 	list_for_each_entry_safe(pos, next, &release_probes, u.list) {
 		list_del(&pos->u.list);
 		call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
@@ -509,7 +531,7 @@
  * Will return the first tracepoint in the range if the input tracepoint is
  * NULL.
  */
-int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
+static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
 	struct tracepoint * const *begin, struct tracepoint * const *end)
 {
 	if (!*tracepoint && begin != end) {
@@ -520,11 +542,12 @@
 		return 1;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
 
+#ifdef CONFIG_MODULES
 static void tracepoint_get_iter(struct tracepoint_iter *iter)
 {
 	int found = 0;
+	struct tp_module *iter_mod;
 
 	/* Core kernel tracepoints */
 	if (!iter->module) {
@@ -534,12 +557,43 @@
 		if (found)
 			goto end;
 	}
-	/* tracepoints in modules. */
-	found = module_get_iter_tracepoints(iter);
+	/* Tracepoints in modules */
+	mutex_lock(&tracepoints_mutex);
+	list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
+		/*
+		 * Sorted module list
+		 */
+		if (iter_mod < iter->module)
+			continue;
+		else if (iter_mod > iter->module)
+			iter->tracepoint = NULL;
+		found = tracepoint_get_iter_range(&iter->tracepoint,
+			iter_mod->tracepoints_ptrs,
+			iter_mod->tracepoints_ptrs
+				+ iter_mod->num_tracepoints);
+		if (found) {
+			iter->module = iter_mod;
+			break;
+		}
+	}
+	mutex_unlock(&tracepoints_mutex);
 end:
 	if (!found)
 		tracepoint_iter_reset(iter);
 }
+#else /* CONFIG_MODULES */
+static void tracepoint_get_iter(struct tracepoint_iter *iter)
+{
+	int found = 0;
+
+	/* Core kernel tracepoints */
+	found = tracepoint_get_iter_range(&iter->tracepoint,
+			__start___tracepoints_ptrs,
+			__stop___tracepoints_ptrs);
+	if (!found)
+		tracepoint_iter_reset(iter);
+}
+#endif /* CONFIG_MODULES */
 
 void tracepoint_iter_start(struct tracepoint_iter *iter)
 {
@@ -566,26 +620,98 @@
 
 void tracepoint_iter_reset(struct tracepoint_iter *iter)
 {
+#ifdef CONFIG_MODULES
 	iter->module = NULL;
+#endif /* CONFIG_MODULES */
 	iter->tracepoint = NULL;
 }
 EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
 
 #ifdef CONFIG_MODULES
+static int tracepoint_module_coming(struct module *mod)
+{
+	struct tp_module *tp_mod, *iter;
+	int ret = 0;
+
+	/*
+	 * We skip modules that taint the kernel, especially those with different
+	 * module headers (for forced load), to make sure we don't cause a crash.
+	 */
+	if (mod->taints)
+		return 0;
+	mutex_lock(&tracepoints_mutex);
+	tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
+	if (!tp_mod) {
+		ret = -ENOMEM;
+		goto end;
+	}
+	tp_mod->num_tracepoints = mod->num_tracepoints;
+	tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
+
+	/*
+	 * tracepoint_module_list is kept sorted by struct module pointer
+	 * address for iteration on tracepoints from a seq_file that can release
+	 * the mutex between calls.
+	 */
+	list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
+		BUG_ON(iter == tp_mod);	/* Should never be in the list twice */
+		if (iter < tp_mod) {
+			/* We belong to the location right after iter. */
+			list_add(&tp_mod->list, &iter->list);
+			goto module_added;
+		}
+	}
+	/* We belong to the beginning of the list */
+	list_add(&tp_mod->list, &tracepoint_module_list);
+module_added:
+	tracepoint_update_probe_range(mod->tracepoints_ptrs,
+		mod->tracepoints_ptrs + mod->num_tracepoints);
+end:
+	mutex_unlock(&tracepoints_mutex);
+	return ret;
+}
+
+static int tracepoint_module_going(struct module *mod)
+{
+	struct tp_module *pos;
+
+	mutex_lock(&tracepoints_mutex);
+	tracepoint_update_probe_range(mod->tracepoints_ptrs,
+		mod->tracepoints_ptrs + mod->num_tracepoints);
+	list_for_each_entry(pos, &tracepoint_module_list, list) {
+		if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) {
+			list_del(&pos->list);
+			kfree(pos);
+			break;
+		}
+	}
+	/*
+	 * In the case of modules that were tainted at "coming", we'll simply
+	 * walk through the list without finding it. We cannot use the "tainted"
+	 * flag on "going", in case a module taints the kernel only after being
+	 * loaded.
+	 */
+	mutex_unlock(&tracepoints_mutex);
+	return 0;
+}
 
 int tracepoint_module_notify(struct notifier_block *self,
 			     unsigned long val, void *data)
 {
 	struct module *mod = data;
+	int ret = 0;
 
 	switch (val) {
 	case MODULE_STATE_COMING:
+		ret = tracepoint_module_coming(mod);
+		break;
+	case MODULE_STATE_LIVE:
+		break;
 	case MODULE_STATE_GOING:
-		tracepoint_update_probe_range(mod->tracepoints_ptrs,
-			mod->tracepoints_ptrs + mod->num_tracepoints);
+		ret = tracepoint_module_going(mod);
 		break;
 	}
-	return 0;
+	return ret;
 }
 
 struct notifier_block tracepoint_module_nb = {
@@ -598,7 +724,6 @@
 	return register_module_notifier(&tracepoint_module_nb);
 }
 __initcall(init_tracepoints);
-
 #endif /* CONFIG_MODULES */
 
 #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 36491cd..d680381 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -321,7 +321,7 @@
  */
 static int watchdog(void *unused)
 {
-	static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
 
 	sched_setscheduler(current, SCHED_FIFO, &param);
@@ -350,7 +350,8 @@
 		set_current_state(TASK_INTERRUPTIBLE);
 	}
 	__set_current_state(TASK_RUNNING);
-
+	param.sched_priority = 0;
+	sched_setscheduler(current, SCHED_NORMAL, &param);
 	return 0;
 }
 
@@ -438,7 +439,7 @@
 
 	/* create the watchdog thread */
 	if (!p) {
-		p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
+		p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
 		if (IS_ERR(p)) {
 			printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
 			if (!err) {
diff --git a/lib/Kconfig b/lib/Kconfig
index 6c695ff..32f3e5a 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -276,7 +276,4 @@
 	  so its calculations are in fixed point. Modules can select this
 	  when they require this function. Module will be called cordic.
 
-config LLIST
-	bool
-
 endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 3f5bc6d..a4da283 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -22,7 +22,7 @@
 obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
 	 string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o \
-	 bsearch.o find_last_bit.o find_next_bit.o
+	 bsearch.o find_last_bit.o find_next_bit.o llist.o
 obj-y += kstrtox.o
 obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
 
@@ -115,8 +115,6 @@
 
 obj-$(CONFIG_CORDIC) += cordic.o
 
-obj-$(CONFIG_LLIST) += llist.o
-
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
 
diff --git a/lib/llist.c b/lib/llist.c
index da44572..700cff7 100644
--- a/lib/llist.c
+++ b/lib/llist.c
@@ -3,8 +3,8 @@
  *
  * The basic atomic operation of this list is cmpxchg on long.  On
  * architectures that don't have NMI-safe cmpxchg implementation, the
- * list can NOT be used in NMI handler.  So code uses the list in NMI
- * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
+ * list can NOT be used in NMI handlers.  So code that uses the list in
+ * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
  *
  * Copyright 2010,2011 Intel Corp.
  *   Author: Huang Ying <ying.huang@intel.com>
@@ -30,48 +30,28 @@
 #include <asm/system.h>
 
 /**
- * llist_add - add a new entry
- * @new:	new entry to be added
- * @head:	the head for your lock-less list
- */
-void llist_add(struct llist_node *new, struct llist_head *head)
-{
-	struct llist_node *entry, *old_entry;
-
-#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
-	BUG_ON(in_nmi());
-#endif
-
-	entry = head->first;
-	do {
-		old_entry = entry;
-		new->next = entry;
-		cpu_relax();
-	} while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry);
-}
-EXPORT_SYMBOL_GPL(llist_add);
-
-/**
  * llist_add_batch - add several linked entries in batch
  * @new_first:	first entry in batch to be added
  * @new_last:	last entry in batch to be added
  * @head:	the head for your lock-less list
+ *
+ * Return whether the list was empty before adding.
  */
-void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
+bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
 		     struct llist_head *head)
 {
 	struct llist_node *entry, *old_entry;
 
-#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
-	BUG_ON(in_nmi());
-#endif
-
 	entry = head->first;
-	do {
+	for (;;) {
 		old_entry = entry;
 		new_last->next = entry;
-		cpu_relax();
-	} while ((entry = cmpxchg(&head->first, old_entry, new_first)) != old_entry);
+		entry = cmpxchg(&head->first, old_entry, new_first);
+		if (entry == old_entry)
+			break;
+	}
+
+	return old_entry == NULL;
 }
 EXPORT_SYMBOL_GPL(llist_add_batch);
 
@@ -93,37 +73,17 @@
 {
 	struct llist_node *entry, *old_entry, *next;
 
-#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
-	BUG_ON(in_nmi());
-#endif
-
 	entry = head->first;
-	do {
+	for (;;) {
 		if (entry == NULL)
 			return NULL;
 		old_entry = entry;
 		next = entry->next;
-		cpu_relax();
-	} while ((entry = cmpxchg(&head->first, old_entry, next)) != old_entry);
+		entry = cmpxchg(&head->first, old_entry, next);
+		if (entry == old_entry)
+			break;
+	}
 
 	return entry;
 }
 EXPORT_SYMBOL_GPL(llist_del_first);
-
-/**
- * llist_del_all - delete all entries from lock-less list
- * @head:	the head of lock-less list to delete all entries
- *
- * If list is empty, return NULL, otherwise, delete all entries and
- * return the pointer to the first entry.  The order of entries
- * deleted is from the newest to the oldest added one.
- */
-struct llist_node *llist_del_all(struct llist_head *head)
-{
-#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
-	BUG_ON(in_nmi());
-#endif
-
-	return xchg(&head->first, NULL);
-}
-EXPORT_SYMBOL_GPL(llist_del_all);
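
With the change above, llist_add_batch() reports whether the list was empty before the insertion, so a producer can tell when a sleeping consumer actually needs to be notified. A sketch of that pattern against the interface shown in this diff; my_work, my_queue_work and wake_up_consumer() are illustrative names, not kernel API:

#include <linux/llist.h>

struct my_work {
	struct llist_node node;
	void (*fn)(struct my_work *w);
};

static struct llist_head my_pending = LLIST_HEAD_INIT(my_pending);

static void wake_up_consumer(void);	/* hypothetical notification hook */

/* Producer: push one entry; kick the consumer only on empty -> non-empty. */
static void my_queue_work(struct my_work *w)
{
	if (llist_add_batch(&w->node, &w->node, &my_pending))
		wake_up_consumer();
}

/* Consumer: detach the whole list in one operation and walk it. */
static void my_run_pending(void)
{
	struct llist_node *n = llist_del_all(&my_pending);

	while (n) {
		struct my_work *w = llist_entry(n, struct my_work, node);

		n = n->next;	/* read before fn() possibly frees w */
		w->fn(w);
	}
}

Returning the empty-before state from the add side is what lets lock-less producers skip redundant wakeups without taking any lock.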
diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc
index d1e276a..5b50f8d 100644
--- a/lib/raid6/int.uc
+++ b/lib/raid6/int.uc
@@ -11,7 +11,7 @@
  * ----------------------------------------------------------------------- */
 
 /*
- * raid6int$#.c
+ * int$#.c
  *
  * $#-way unrolled portable integer math RAID-6 instruction set
  *
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 4689cb0..503f087 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -22,7 +22,7 @@
 	 * Kernel threads bound to a single CPU can safely use
 	 * smp_processor_id():
 	 */
-	if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu)))
+	if (cpumask_equal(tsk_cpus_allowed(current), cpumask_of(this_cpu)))
 		goto out;
 
 	/*
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index 85c5f02..fe6762e 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -72,6 +72,19 @@
 	CPUs are specified with -: 0-2. Default is to report samples on all
 	CPUs.
 
+--asm-raw::
+	Show raw instruction encoding of assembly instructions.
+
+--source::
+	Interleave source code with assembly code. Enabled by default,
+	disable with --no-source.
+
+--symfs=<directory>::
+	Look for files with symbols relative to this directory.
+
+-M::
+--disassembler-style=:: Set disassembler style for objdump.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-buildid-list.txt b/tools/perf/Documentation/perf-buildid-list.txt
index 5eaac6f..cc22325 100644
--- a/tools/perf/Documentation/perf-buildid-list.txt
+++ b/tools/perf/Documentation/perf-buildid-list.txt
@@ -16,6 +16,9 @@
 tools can be used to fetch packages with matching symbol tables for use by
 perf report.
 
+It can also be used to show the build id of the running kernel, or the build
+id found in an ELF file specified with -i/--input.
+
 OPTIONS
 -------
 -H::
@@ -27,6 +30,9 @@
 -f::
 --force::
 	Don't do ownership validation.
+-k::
+--kernel::
+	Show running kernel build id.
 -v::
 --verbose::
 	Be more verbose.
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 04253c0..212f24d 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -134,6 +134,24 @@
 	CPUs are specified with -: 0-2. Default is to report samples on all
 	CPUs.
 
+-M::
+--disassembler-style=:: Set disassembler style for objdump.
+
+--source::
+	Interleave source code with assembly code. Enabled by default,
+	disable with --no-source.
+
+--asm-raw::
+	Show raw instruction encoding of assembly instructions.
+
+--show-total-period:: Show a column with the sum of periods.
+
+-I::
+--show-info::
+	Display extended information about the perf.data file. This adds
+	information which may be very large and thus may clutter the display.
+	It currently includes: cpu and numa topology of the host system.
+
 SEE ALSO
 --------
-linkperf:perf-stat[1]
+linkperf:perf-stat[1], linkperf:perf-annotate[1]
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 46822d5..5b212b5 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -8,7 +8,7 @@
 SYNOPSIS
 --------
 [verse]
-'perf sched' {record|latency|map|replay|trace}
+'perf sched' {record|latency|map|replay|script}
 
 DESCRIPTION
 -----------
@@ -20,8 +20,8 @@
   'perf sched latency' to report the per task scheduling latencies
   and other scheduling properties of the workload.
 
-  'perf sched trace' to see a detailed trace of the workload that
-  was recorded.
+  'perf sched script' to see a detailed trace of the workload that
+  was recorded (aliased to 'perf script' for now).
 
   'perf sched replay' to simulate the workload that was recorded
   via perf sched record. (this is done by starting up mockup threads
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index db01786..dec87ec 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -188,6 +188,13 @@
 	CPUs are specified with -: 0-2. Default is to report samples on all
 	CPUs.
 
+-I::
+--show-info::
+	Display extended information about the perf.data file. This adds
+	information which may be very large and thus may clutter the display.
+	It currently includes: cpu and numa topology of the host system.
+	It can only be used with the perf script report mode.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 918cc38..8966b9a 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -94,6 +94,22 @@
 corresponding events, i.e., they always refer to events defined earlier on the command
 line.
 
+-o file::
+--output file::
+Print the output into the designated file.
+
+--append::
+Append to the output file designated with the -o option. Ignored if -o is not specified.
+
+--log-fd::
+Log output to fd, instead of stderr.  Complementary to --output, and mutually exclusive
+with it.  --append may be used here.  Examples:
+     3>results  perf stat --log-fd 3          -- $cmd
+     3>>results perf stat --log-fd 3 --append -- $cmd
+
 EXAMPLES
 --------
 
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index f6eb1cd..b1a5bbb 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -106,6 +106,51 @@
 --zero::
 	Zero history across display updates.
 
+-s::
+--sort::
+	Sort by key(s): pid, comm, dso, symbol, parent
+
+-n::
+--show-nr-samples::
+	Show a column with the number of samples.
+
+--show-total-period::
+	Show a column with the sum of periods.
+
+--dsos::
+	Only consider symbols in these dsos.
+
+--comms::
+	Only consider symbols in these comms.
+
+--symbols::
+	Only consider these symbols.
+
+-M::
+--disassembler-style=:: Set disassembler style for objdump.
+
+--source::
+	Interleave source code with assembly code. Enabled by default,
+	disable with --no-source.
+
+--asm-raw::
+	Show raw instruction encoding of assembly instructions.
+
+-G [type,min,order]::
+--call-graph::
+	Display call chains using type, min percent threshold and order.
+	type can be either:
+	- flat: single column, linear exposure of call chains.
+	- graph: use a graph tree, displaying absolute overhead rates.
+	- fractal: like graph, but displays relative rates. Each branch of
+		 the tree is considered as a new profiled object.
+
+	order can be either:
+	- callee: callee based call graph.
+	- caller: inverted caller based call graph.
+
+	Default: fractal,0.5,callee.
+
 INTERACTIVE PROMPTING KEYS
 --------------------------
 
@@ -130,9 +175,6 @@
 [S]::
 	Stop annotation, return to full profile display.
 
-[w]::
-	Toggle between weighted sum and individual count[E]r profile.
-
 [z]::
 	Toggle event count zeroing across display updates.
 
diff --git a/tools/perf/Documentation/perfconfig.example b/tools/perf/Documentation/perfconfig.example
new file mode 100644
index 0000000..d144866
--- /dev/null
+++ b/tools/perf/Documentation/perfconfig.example
@@ -0,0 +1,20 @@
+[colors]
+
+	# These were the old defaults
+	top = red, lightgray
+	medium = green, lightgray
+	normal = black, lightgray
+	selected = lightgray, magenta
+	code = blue, lightgray
+
+[tui]
+
+	# Defaults if linked with libslang
+	report = on
+	annotate = on
+	top = on
+
+[buildid]
+
+	# Default, disable using /dev/null
+	dir = /root/.debug
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index e9d5c27..b98e307 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -466,13 +466,13 @@
 		LIB_OBJS += $(OUTPUT)util/ui/browsers/annotate.o
 		LIB_OBJS += $(OUTPUT)util/ui/browsers/hists.o
 		LIB_OBJS += $(OUTPUT)util/ui/browsers/map.o
-		LIB_OBJS += $(OUTPUT)util/ui/browsers/top.o
 		LIB_OBJS += $(OUTPUT)util/ui/helpline.o
 		LIB_OBJS += $(OUTPUT)util/ui/progress.o
 		LIB_OBJS += $(OUTPUT)util/ui/util.o
 		LIB_H += util/ui/browser.h
 		LIB_H += util/ui/browsers/map.h
 		LIB_H += util/ui/helpline.h
+		LIB_H += util/ui/keysyms.h
 		LIB_H += util/ui/libslang.h
 		LIB_H += util/ui/progress.h
 		LIB_H += util/ui/util.h
@@ -729,9 +729,6 @@
 $(OUTPUT)util/ui/browsers/annotate.o: util/ui/browsers/annotate.c $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $<
 
-$(OUTPUT)util/ui/browsers/top.o: util/ui/browsers/top.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $<
-
 $(OUTPUT)util/ui/browsers/hists.o: util/ui/browsers/hists.c $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $<
 
diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile
index 15130b5..744e629 100644
--- a/tools/perf/arch/powerpc/Makefile
+++ b/tools/perf/arch/powerpc/Makefile
@@ -2,3 +2,4 @@
 PERF_HAVE_DWARF_REGS := 1
 LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
 endif
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/header.o
diff --git a/tools/perf/arch/powerpc/util/header.c b/tools/perf/arch/powerpc/util/header.c
new file mode 100644
index 0000000..eba80c2
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/header.c
@@ -0,0 +1,36 @@
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../../util/header.h"
+
+#define __stringify_1(x)        #x
+#define __stringify(x)          __stringify_1(x)
+
+#define mfspr(rn)       ({unsigned long rval; \
+			 asm volatile("mfspr %0," __stringify(rn) \
+				      : "=r" (rval)); rval; })
+
+#define SPRN_PVR        0x11F	/* Processor Version Register */
+#define PVR_VER(pvr)    (((pvr) >>  16) & 0xFFFF) /* Version field */
+#define PVR_REV(pvr)    (((pvr) >>   0) & 0xFFFF) /* Revision field */
+
+int
+get_cpuid(char *buffer, size_t sz)
+{
+	unsigned long pvr;
+	int nb;
+
+	pvr = mfspr(SPRN_PVR);
+
+	nb = snprintf(buffer, sz, "%lu,%lu$", PVR_VER(pvr), PVR_REV(pvr));
+
+	/* look for the end marker to ensure the entire string fit */
+	if (strchr(buffer, '$')) {
+		buffer[nb-1] = '\0';
+		return 0;
+	}
+	return -1;
+}
diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile
index 15130b5..744e629 100644
--- a/tools/perf/arch/x86/Makefile
+++ b/tools/perf/arch/x86/Makefile
@@ -2,3 +2,4 @@
 PERF_HAVE_DWARF_REGS := 1
 LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
 endif
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/header.o
diff --git a/tools/perf/arch/x86/util/header.c b/tools/perf/arch/x86/util/header.c
new file mode 100644
index 0000000..f940060
--- /dev/null
+++ b/tools/perf/arch/x86/util/header.c
@@ -0,0 +1,59 @@
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../../util/header.h"
+
+static inline void
+cpuid(unsigned int op, unsigned int *a, unsigned int *b, unsigned int *c,
+      unsigned int *d)
+{
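+	/*
+	 * .byte 0x53 / .byte 0x5b are push/pop of the bx register: %ebx can
+	 * be the PIC base register, so preserve it by hand around cpuid and
+	 * hand its value back through %esi ("=S") instead.
+	 */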
+	__asm__ __volatile__ (".byte 0x53\n\tcpuid\n\t"
+			      "movl %%ebx, %%esi\n\t.byte 0x5b"
+			: "=a" (*a),
+			"=S" (*b),
+			"=c" (*c),
+			"=d" (*d)
+			: "a" (op));
+}
+
+int
+get_cpuid(char *buffer, size_t sz)
+{
+	unsigned int a, b, c, d, lvl;
+	int family = -1, model = -1, step = -1;
+	int nb;
+	char vendor[16];
+
+	cpuid(0, &lvl, &b, &c, &d);
+	strncpy(&vendor[0], (char *)(&b), 4);
+	strncpy(&vendor[4], (char *)(&d), 4);
+	strncpy(&vendor[8], (char *)(&c), 4);
+	vendor[12] = '\0';
+
+	if (lvl >= 1) {
+		cpuid(1, &a, &b, &c, &d);
+
+		family = (a >> 8) & 0xf;  /* bits 11 - 8 */
+		model  = (a >> 4) & 0xf;  /* Bits  7 - 4 */
+		step   = a & 0xf;
+
+		/* extended family */
+		if (family == 0xf)
+			family += (a >> 20) & 0xff;
+
+		/* extended model */
+		if (family >= 0x6)
+			model += ((a >> 16) & 0xf) << 4;
+	}
+	nb = snprintf(buffer, sz, "%s,%u,%u,%u$", vendor, family, model, step);
+
+	/* look for the end marker to ensure the entire string fit */
+	if (strchr(buffer, '$')) {
+		buffer[nb-1] = '\0';
+		return 0;
+	}
+	return -1;
+}
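
Both arch helpers above write a '$'-terminated CPU description into a caller-supplied buffer and then replace the marker with a NUL, returning -1 if the string was truncated. A small user-space sketch of the calling convention; the prototype is assumed to match the definitions above and the 128-byte buffer is arbitrary:

#include <stdio.h>

int get_cpuid(char *buffer, size_t sz);	/* as defined in arch/*/util/header.c */

int main(void)
{
	char cpuid[128];

	/* On x86 this prints something like "GenuineIntel,6,42,7". */
	if (get_cpuid(cpuid, sizeof(cpuid)) == 0)
		printf("cpuid: %s\n", cpuid);
	else
		fprintf(stderr, "cpuid string truncated (%zu bytes)\n",
			sizeof(cpuid));
	return 0;
}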
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 555aefd..46b4c24 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -114,10 +114,11 @@
 				    print_line, full_paths, 0, 0);
 }
 
-static void hists__find_annotations(struct hists *self, int evidx)
+static void hists__find_annotations(struct hists *self, int evidx,
+				    int nr_events)
 {
 	struct rb_node *nd = rb_first(&self->entries), *next;
-	int key = KEY_RIGHT;
+	int key = K_RIGHT;
 
 	while (nd) {
 		struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
@@ -129,7 +130,7 @@
 		notes = symbol__annotation(he->ms.sym);
 		if (notes->src == NULL) {
 find_next:
-			if (key == KEY_LEFT)
+			if (key == K_LEFT)
 				nd = rb_prev(nd);
 			else
 				nd = rb_next(nd);
@@ -137,12 +138,13 @@
 		}
 
 		if (use_browser > 0) {
-			key = hist_entry__tui_annotate(he, evidx);
+			key = hist_entry__tui_annotate(he, evidx, nr_events,
+						       NULL, NULL, 0);
 			switch (key) {
-			case KEY_RIGHT:
+			case K_RIGHT:
 				next = rb_next(nd);
 				break;
-			case KEY_LEFT:
+			case K_LEFT:
 				next = rb_prev(nd);
 				break;
 			default:
@@ -215,7 +217,8 @@
 			total_nr_samples += nr_samples;
 			hists__collapse_resort(hists);
 			hists__output_resort(hists);
-			hists__find_annotations(hists, pos->idx);
+			hists__find_annotations(hists, pos->idx,
+						session->evlist->nr_entries);
 		}
 	}
 
@@ -267,6 +270,14 @@
 	OPT_BOOLEAN('P', "full-paths", &full_paths,
 		    "Don't shorten the displayed pathnames"),
 	OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
+	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
+		   "Look for files with symbols relative to this directory"),
+	OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
+		    "Interleave source code with assembly code (default)"),
+	OPT_BOOLEAN(0, "asm-raw", &symbol_conf.annotate_asm_raw,
+		    "Display raw encoding of assembly instructions (default)"),
+	OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style",
+		   "Specify disassembler style (e.g. -M intel for intel syntax)"),
 	OPT_END()
 };
 
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index 5af32ae..cb690a6 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -1,7 +1,8 @@
 /*
  * builtin-buildid-list.c
  *
- * Builtin buildid-list command: list buildids in perf.data
+ * Builtin buildid-list command: list buildids in perf.data, in the running
+ * kernel and in ELF files.
  *
  * Copyright (C) 2009, Red Hat Inc.
  * Copyright (C) 2009, Arnaldo Carvalho de Melo <acme@redhat.com>
@@ -15,8 +16,11 @@
 #include "util/session.h"
 #include "util/symbol.h"
 
+#include <libelf.h>
+
 static char const *input_name = "perf.data";
 static bool force;
+static bool show_kernel;
 static bool with_hits;
 
 static const char * const buildid_list_usage[] = {
@@ -29,12 +33,13 @@
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
 	OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
+	OPT_BOOLEAN('k', "kernel", &show_kernel, "Show current kernel build id"),
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose"),
 	OPT_END()
 };
 
-static int __cmd_buildid_list(void)
+static int perf_session__list_build_ids(void)
 {
 	struct perf_session *session;
 
@@ -52,6 +57,49 @@
 	return 0;
 }
 
+static int sysfs__fprintf_build_id(FILE *fp)
+{
+	u8 kallsyms_build_id[BUILD_ID_SIZE];
+	char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+
+	if (sysfs__read_build_id("/sys/kernel/notes", kallsyms_build_id,
+				 sizeof(kallsyms_build_id)) != 0)
+		return -1;
+
+	build_id__sprintf(kallsyms_build_id, sizeof(kallsyms_build_id),
+			  sbuild_id);
+	fprintf(fp, "%s\n", sbuild_id);
+	return 0;
+}
+
+static int filename__fprintf_build_id(const char *name, FILE *fp)
+{
+	u8 build_id[BUILD_ID_SIZE];
+	char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+
+	if (filename__read_build_id(name, build_id,
+				    sizeof(build_id)) != sizeof(build_id))
+		return 0;
+
+	build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
+	return fprintf(fp, "%s\n", sbuild_id);
+}
+
+static int __cmd_buildid_list(void)
+{
+	if (show_kernel)
+		return sysfs__fprintf_build_id(stdout);
+
+	elf_version(EV_CURRENT);
+	/*
+	 * See if this is an ELF file first:
+	 */
+	if (filename__fprintf_build_id(input_name, stdout))
+		return 0;
+
+	return perf_session__list_build_ids();
+}
+
 int cmd_buildid_list(int argc, const char **argv, const char *prefix __used)
 {
 	argc = parse_options(argc, argv, options, buildid_list_usage, 0);
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index e821999..b39f3a1 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -162,7 +162,7 @@
 
 	hists__match(&session[0]->hists, &session[1]->hists);
 	hists__fprintf(&session[1]->hists, &session[0]->hists,
-		       show_displacement, stdout);
+		       show_displacement, true, 0, 0, stdout);
 out_delete:
 	for (i = 0; i < 2; ++i)
 		perf_session__delete(session[i]);
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index f4c3fbe..f82480f 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -73,6 +73,7 @@
 
 static struct perf_session	*session;
 static const char		*cpu_list;
+static const char               *progname;
 
 static void advance_output(size_t size)
 {
@@ -137,17 +138,29 @@
 
 static volatile int done = 0;
 static volatile int signr = -1;
+static volatile int child_finished = 0;
 
 static void sig_handler(int sig)
 {
+	if (sig == SIGCHLD)
+		child_finished = 1;
+
 	done = 1;
 	signr = sig;
 }
 
 static void sig_atexit(void)
 {
-	if (child_pid > 0)
-		kill(child_pid, SIGTERM);
+	int status;
+
+	if (child_pid > 0) {
+		if (!child_finished)
+			kill(child_pid, SIGTERM);
+
+		wait(&status);
+		if (WIFSIGNALED(status))
+			psignal(WTERMSIG(status), progname);
+	}
 
 	if (signr == -1 || signr == SIGUSR1)
 		return;
@@ -446,6 +459,8 @@
 	char buf;
 	struct machine *machine;
 
+	progname = argv[0];
+
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	atexit(sig_atexit);
@@ -514,6 +529,19 @@
 	if (have_tracepoints(&evsel_list->entries))
 		perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
 
+	perf_header__set_feat(&session->header, HEADER_HOSTNAME);
+	perf_header__set_feat(&session->header, HEADER_OSRELEASE);
+	perf_header__set_feat(&session->header, HEADER_ARCH);
+	perf_header__set_feat(&session->header, HEADER_CPUDESC);
+	perf_header__set_feat(&session->header, HEADER_NRCPUS);
+	perf_header__set_feat(&session->header, HEADER_EVENT_DESC);
+	perf_header__set_feat(&session->header, HEADER_CMDLINE);
+	perf_header__set_feat(&session->header, HEADER_VERSION);
+	perf_header__set_feat(&session->header, HEADER_CPU_TOPOLOGY);
+	perf_header__set_feat(&session->header, HEADER_TOTAL_MEM);
+	perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY);
+	perf_header__set_feat(&session->header, HEADER_CPUID);
+
 	/* 512 kiB: default amount of unprivileged mlocked memory */
 	if (mmap_pages == UINT_MAX)
 		mmap_pages = (512 * 1024) / page_size;
@@ -785,6 +813,8 @@
 	int err = -ENOMEM;
 	struct perf_evsel *pos;
 
+	perf_header__set_cmdline(argc, argv);
+
 	evsel_list = perf_evlist__new(NULL, NULL);
 	if (evsel_list == NULL)
 		return -ENOMEM;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index d7ff277..4d7c834 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -40,6 +40,7 @@
 static bool		force, use_tui, use_stdio;
 static bool		hide_unresolved;
 static bool		dont_use_callchains;
+static bool		show_full_info;
 
 static bool		show_threads;
 static struct perf_read_values	show_threads_values;
@@ -229,13 +230,10 @@
 
 	list_for_each_entry(pos, &evlist->entries, node) {
 		struct hists *hists = &pos->hists;
-		const char *evname = NULL;
-
-		if (rb_first(&hists->entries) != rb_last(&hists->entries))
-			evname = event_name(pos);
+		const char *evname = event_name(pos);
 
 		hists__fprintf_nr_sample_events(hists, evname, stdout);
-		hists__fprintf(hists, NULL, false, stdout);
+		hists__fprintf(hists, NULL, false, true, 0, 0, stdout);
 		fprintf(stdout, "\n\n");
 	}
 
@@ -276,6 +274,9 @@
 			goto out_delete;
 	}
 
+	if (use_browser <= 0)
+		perf_session__fprintf_info(session, stdout, show_full_info);
+
 	if (show_threads)
 		perf_read_values_init(&show_threads_values);
 
@@ -330,9 +331,10 @@
 		goto out_delete;
 	}
 
-	if (use_browser > 0)
-		perf_evlist__tui_browse_hists(session->evlist, help);
-	else
+	if (use_browser > 0) {
+		perf_evlist__tui_browse_hists(session->evlist, help,
+					      NULL, NULL, 0);
+	} else
 		perf_evlist__tty_browse_hists(session->evlist, help);
 
 out_delete:
@@ -487,6 +489,16 @@
 	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
 		    "Look for files with symbols relative to this directory"),
 	OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
+	OPT_BOOLEAN('I', "show-info", &show_full_info,
+		    "Display extended information about perf.data file"),
+	OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
+		    "Interleave source code with assembly code (default)"),
+	OPT_BOOLEAN(0, "asm-raw", &symbol_conf.annotate_asm_raw,
+		    "Display raw encoding of assembly instructions (default)"),
+	OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style",
+		   "Specify disassembler style (e.g. -M intel for intel syntax)"),
+	OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
+		    "Show a column with the sum of periods"),
 	OPT_END()
 };
 
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 09024ec..2f62a29 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -22,6 +22,7 @@
 static u64			nr_unordered;
 extern const struct option	record_options[];
 static bool			no_callchain;
+static bool			show_full_info;
 static const char		*cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 
@@ -1083,7 +1084,8 @@
 		     "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,addr",
 		     parse_output_fields),
 	OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
-
+	OPT_BOOLEAN('I', "show-info", &show_full_info,
+		    "display extended information from perf.data file"),
 	OPT_END()
 };
 
@@ -1268,6 +1270,8 @@
 			return -1;
 	}
 
+	perf_session__fprintf_info(session, stdout, show_full_info);
+
 	if (!no_callchain)
 		symbol_conf.use_callchain = true;
 	else
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 5deb17d..7ce65f5 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -194,6 +194,9 @@
 static const char		*csv_sep			= NULL;
 static bool			csv_output			= false;
 static bool			group				= false;
+static const char		*output_name			= NULL;
+static FILE			*output				= NULL;
+static int			output_fd;
 
 static volatile int done = 0;
 
@@ -251,8 +254,13 @@
  */
 static double stddev_stats(struct stats *stats)
 {
-	double variance = stats->M2 / (stats->n - 1);
-	double variance_mean = variance / stats->n;
+	double variance, variance_mean;
+
+	if (!stats->n)
+		return 0.0;
+
+	variance = stats->M2 / (stats->n - 1);
+	variance_mean = variance / stats->n;
 
 	return sqrt(variance_mean);
 }
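
stddev_stats() above reports the standard error of the mean from running statistics kept as n, mean and M2 (the layout suggests a Welford-style online update), and the fix guards the division when no samples were recorded. A self-contained sketch of that accumulation with hypothetical names, not perf's actual struct stats code:

#include <math.h>
#include <stdio.h>

/* Running statistics: sample count, mean, and sum of squared deviations. */
struct running_stats {
	unsigned long n;
	double mean, M2;
};

static void running_update(struct running_stats *s, double x)
{
	double delta = x - s->mean;

	s->n++;
	s->mean += delta / s->n;
	s->M2 += delta * (x - s->mean);	/* Welford's online update */
}

/* Standard error of the mean, guarded against too few samples. */
static double running_stddev_mean(const struct running_stats *s)
{
	if (s->n < 2)
		return 0.0;
	return sqrt((s->M2 / (s->n - 1)) / s->n);
}

int main(void)
{
	static const double samples[] = { 9.8, 10.1, 10.0, 9.9 };
	struct running_stats s = { 0, 0.0, 0.0 };
	unsigned long i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		running_update(&s, samples[i]);

	printf("mean %.3f ( +- %.3f )\n", s.mean, running_stddev_mean(&s));
	return 0;
}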
@@ -352,7 +360,7 @@
 		update_stats(&ps->res_stats[i], count[i]);
 
 	if (verbose) {
-		fprintf(stderr, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
+		fprintf(output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
 			event_name(counter), count[0], count[1], count[2]);
 	}
 
@@ -487,6 +495,8 @@
 	if (forks) {
 		close(go_pipe[1]);
 		wait(&status);
+		if (WIFSIGNALED(status))
+			psignal(WTERMSIG(status), argv[0]);
 	} else {
 		while(!done) sleep(1);
 	}
@@ -519,9 +529,9 @@
 		pct = 100.0*total/avg;
 
 	if (csv_output)
-		fprintf(stderr, "%s%.2f%%", csv_sep, pct);
-	else
-		fprintf(stderr, "  ( +-%6.2f%% )", pct);
+		fprintf(output, "%s%.2f%%", csv_sep, pct);
+	else if (pct)
+		fprintf(output, "  ( +-%6.2f%% )", pct);
 }
 
 static void print_noise(struct perf_evsel *evsel, double avg)
@@ -546,16 +556,17 @@
 			csv_output ? 0 : -4,
 			evsel_list->cpus->map[cpu], csv_sep);
 
-	fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel));
+	fprintf(output, fmt, cpustr, msecs, csv_sep, event_name(evsel));
 
 	if (evsel->cgrp)
-		fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);
+		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
 
 	if (csv_output)
 		return;
 
 	if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
-		fprintf(stderr, " # %8.3f CPUs utilized          ", avg / avg_stats(&walltime_nsecs_stats));
+		fprintf(output, " # %8.3f CPUs utilized          ",
+			avg / avg_stats(&walltime_nsecs_stats));
 }
 
 static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg)
@@ -576,9 +587,9 @@
 	else if (ratio > 10.0)
 		color = PERF_COLOR_YELLOW;
 
-	fprintf(stderr, " #  ");
-	color_fprintf(stderr, color, "%6.2f%%", ratio);
-	fprintf(stderr, " frontend cycles idle   ");
+	fprintf(output, " #  ");
+	color_fprintf(output, color, "%6.2f%%", ratio);
+	fprintf(output, " frontend cycles idle   ");
 }
 
 static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __used, double avg)
@@ -599,9 +610,9 @@
 	else if (ratio > 20.0)
 		color = PERF_COLOR_YELLOW;
 
-	fprintf(stderr, " #  ");
-	color_fprintf(stderr, color, "%6.2f%%", ratio);
-	fprintf(stderr, " backend  cycles idle   ");
+	fprintf(output, " #  ");
+	color_fprintf(output, color, "%6.2f%%", ratio);
+	fprintf(output, " backend  cycles idle   ");
 }
 
 static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg)
@@ -622,9 +633,9 @@
 	else if (ratio > 5.0)
 		color = PERF_COLOR_YELLOW;
 
-	fprintf(stderr, " #  ");
-	color_fprintf(stderr, color, "%6.2f%%", ratio);
-	fprintf(stderr, " of all branches        ");
+	fprintf(output, " #  ");
+	color_fprintf(output, color, "%6.2f%%", ratio);
+	fprintf(output, " of all branches        ");
 }
 
 static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
@@ -645,9 +656,9 @@
 	else if (ratio > 5.0)
 		color = PERF_COLOR_YELLOW;
 
-	fprintf(stderr, " #  ");
-	color_fprintf(stderr, color, "%6.2f%%", ratio);
-	fprintf(stderr, " of all L1-dcache hits  ");
+	fprintf(output, " #  ");
+	color_fprintf(output, color, "%6.2f%%", ratio);
+	fprintf(output, " of all L1-dcache hits  ");
 }
 
 static void print_l1_icache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
@@ -668,9 +679,9 @@
 	else if (ratio > 5.0)
 		color = PERF_COLOR_YELLOW;
 
-	fprintf(stderr, " #  ");
-	color_fprintf(stderr, color, "%6.2f%%", ratio);
-	fprintf(stderr, " of all L1-icache hits  ");
+	fprintf(output, " #  ");
+	color_fprintf(output, color, "%6.2f%%", ratio);
+	fprintf(output, " of all L1-icache hits  ");
 }
 
 static void print_dtlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
@@ -691,9 +702,9 @@
 	else if (ratio > 5.0)
 		color = PERF_COLOR_YELLOW;
 
-	fprintf(stderr, " #  ");
-	color_fprintf(stderr, color, "%6.2f%%", ratio);
-	fprintf(stderr, " of all dTLB cache hits ");
+	fprintf(output, " #  ");
+	color_fprintf(output, color, "%6.2f%%", ratio);
+	fprintf(output, " of all dTLB cache hits ");
 }
 
 static void print_itlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
@@ -714,9 +725,9 @@
 	else if (ratio > 5.0)
 		color = PERF_COLOR_YELLOW;
 
-	fprintf(stderr, " #  ");
-	color_fprintf(stderr, color, "%6.2f%%", ratio);
-	fprintf(stderr, " of all iTLB cache hits ");
+	fprintf(output, " #  ");
+	color_fprintf(output, color, "%6.2f%%", ratio);
+	fprintf(output, " of all iTLB cache hits ");
 }
 
 static void print_ll_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
@@ -737,9 +748,9 @@
 	else if (ratio > 5.0)
 		color = PERF_COLOR_YELLOW;
 
-	fprintf(stderr, " #  ");
-	color_fprintf(stderr, color, "%6.2f%%", ratio);
-	fprintf(stderr, " of all LL-cache hits   ");
+	fprintf(output, " #  ");
+	color_fprintf(output, color, "%6.2f%%", ratio);
+	fprintf(output, " of all LL-cache hits   ");
 }
 
 static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
@@ -762,10 +773,10 @@
 	else
 		cpu = 0;
 
-	fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel));
+	fprintf(output, fmt, cpustr, avg, csv_sep, event_name(evsel));
 
 	if (evsel->cgrp)
-		fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);
+		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
 
 	if (csv_output)
 		return;
@@ -776,14 +787,14 @@
 		if (total)
 			ratio = avg / total;
 
-		fprintf(stderr, " #   %5.2f  insns per cycle        ", ratio);
+		fprintf(output, " #   %5.2f  insns per cycle        ", ratio);
 
 		total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
 		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
 
 		if (total && avg) {
 			ratio = total / avg;
-			fprintf(stderr, "\n                                             #   %5.2f  stalled cycles per insn", ratio);
+			fprintf(output, "\n                                             #   %5.2f  stalled cycles per insn", ratio);
 		}
 
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
@@ -831,7 +842,7 @@
 		if (total)
 			ratio = avg * 100 / total;
 
-		fprintf(stderr, " # %8.3f %% of all cache refs    ", ratio);
+		fprintf(output, " # %8.3f %% of all cache refs    ", ratio);
 
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
 		print_stalled_cycles_frontend(cpu, evsel, avg);
@@ -843,16 +854,16 @@
 		if (total)
 			ratio = 1.0 * avg / total;
 
-		fprintf(stderr, " # %8.3f GHz                    ", ratio);
+		fprintf(output, " # %8.3f GHz                    ", ratio);
 	} else if (runtime_nsecs_stats[cpu].n != 0) {
 		total = avg_stats(&runtime_nsecs_stats[cpu]);
 
 		if (total)
 			ratio = 1000.0 * avg / total;
 
-		fprintf(stderr, " # %8.3f M/sec                  ", ratio);
+		fprintf(output, " # %8.3f M/sec                  ", ratio);
 	} else {
-		fprintf(stderr, "                                   ");
+		fprintf(output, "                                   ");
 	}
 }
 
@@ -867,7 +878,7 @@
 	int scaled = counter->counts->scaled;
 
 	if (scaled == -1) {
-		fprintf(stderr, "%*s%s%*s",
+		fprintf(output, "%*s%s%*s",
 			csv_output ? 0 : 18,
 			counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
 			csv_sep,
@@ -875,9 +886,9 @@
 			event_name(counter));
 
 		if (counter->cgrp)
-			fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);
+			fprintf(output, "%s%s", csv_sep, counter->cgrp->name);
 
-		fputc('\n', stderr);
+		fputc('\n', output);
 		return;
 	}
 
@@ -889,7 +900,7 @@
 	print_noise(counter, avg);
 
 	if (csv_output) {
-		fputc('\n', stderr);
+		fputc('\n', output);
 		return;
 	}
 
@@ -899,9 +910,9 @@
 		avg_enabled = avg_stats(&ps->res_stats[1]);
 		avg_running = avg_stats(&ps->res_stats[2]);
 
-		fprintf(stderr, " [%5.2f%%]", 100 * avg_running / avg_enabled);
+		fprintf(output, " [%5.2f%%]", 100 * avg_running / avg_enabled);
 	}
-	fprintf(stderr, "\n");
+	fprintf(output, "\n");
 }
 
 /*
@@ -918,7 +929,7 @@
 		ena = counter->counts->cpu[cpu].ena;
 		run = counter->counts->cpu[cpu].run;
 		if (run == 0 || ena == 0) {
-			fprintf(stderr, "CPU%*d%s%*s%s%*s",
+			fprintf(output, "CPU%*d%s%*s%s%*s",
 				csv_output ? 0 : -4,
 				evsel_list->cpus->map[cpu], csv_sep,
 				csv_output ? 0 : 18,
@@ -928,9 +939,10 @@
 				event_name(counter));
 
 			if (counter->cgrp)
-				fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);
+				fprintf(output, "%s%s",
+					csv_sep, counter->cgrp->name);
 
-			fputc('\n', stderr);
+			fputc('\n', output);
 			continue;
 		}
 
@@ -943,9 +955,10 @@
 			print_noise(counter, 1.0);
 
 			if (run != ena)
-				fprintf(stderr, "  (%.2f%%)", 100.0 * run / ena);
+				fprintf(output, "  (%.2f%%)",
+					100.0 * run / ena);
 		}
-		fputc('\n', stderr);
+		fputc('\n', output);
 	}
 }
 
@@ -957,21 +970,21 @@
 	fflush(stdout);
 
 	if (!csv_output) {
-		fprintf(stderr, "\n");
-		fprintf(stderr, " Performance counter stats for ");
+		fprintf(output, "\n");
+		fprintf(output, " Performance counter stats for ");
 		if(target_pid == -1 && target_tid == -1) {
-			fprintf(stderr, "\'%s", argv[0]);
+			fprintf(output, "\'%s", argv[0]);
 			for (i = 1; i < argc; i++)
-				fprintf(stderr, " %s", argv[i]);
+				fprintf(output, " %s", argv[i]);
 		} else if (target_pid != -1)
-			fprintf(stderr, "process id \'%d", target_pid);
+			fprintf(output, "process id \'%d", target_pid);
 		else
-			fprintf(stderr, "thread id \'%d", target_tid);
+			fprintf(output, "thread id \'%d", target_tid);
 
-		fprintf(stderr, "\'");
+		fprintf(output, "\'");
 		if (run_count > 1)
-			fprintf(stderr, " (%d runs)", run_count);
-		fprintf(stderr, ":\n\n");
+			fprintf(output, " (%d runs)", run_count);
+		fprintf(output, ":\n\n");
 	}
 
 	if (no_aggr) {
@@ -984,15 +997,15 @@
 
 	if (!csv_output) {
 		if (!null_run)
-			fprintf(stderr, "\n");
-		fprintf(stderr, " %17.9f seconds time elapsed",
+			fprintf(output, "\n");
+		fprintf(output, " %17.9f seconds time elapsed",
 				avg_stats(&walltime_nsecs_stats)/1e9);
 		if (run_count > 1) {
-			fprintf(stderr, "                                        ");
+			fprintf(output, "                                        ");
 			print_noise_pct(stddev_stats(&walltime_nsecs_stats),
 					avg_stats(&walltime_nsecs_stats));
 		}
-		fprintf(stderr, "\n\n");
+		fprintf(output, "\n\n");
 	}
 }
 
@@ -1030,6 +1043,8 @@
 	return 0;
 }
 
+static bool append_file;
+
 static const struct option options[] = {
 	OPT_CALLBACK('e', "event", &evsel_list, "event",
 		     "event selector. use 'perf list' to list available events",
@@ -1070,6 +1085,11 @@
 	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
 		     "monitor event in cgroup name only",
 		     parse_cgroups),
+	OPT_STRING('o', "output", &output_name, "file",
+		    "output file name"),
+	OPT_BOOLEAN(0, "append", &append_file, "append to the output file"),
+	OPT_INTEGER(0, "log-fd", &output_fd,
+		    "log output to fd, instead of stderr"),
 	OPT_END()
 };
 
@@ -1141,6 +1161,7 @@
 {
 	struct perf_evsel *pos;
 	int status = -ENOMEM;
+	const char *mode;
 
 	setlocale(LC_ALL, "");
 
@@ -1151,16 +1172,46 @@
 	argc = parse_options(argc, argv, options, stat_usage,
 		PARSE_OPT_STOP_AT_NON_OPTION);
 
-	if (csv_sep)
+	output = stderr;
+	if (output_name && strcmp(output_name, "-"))
+		output = NULL;
+
+	if (output_name && output_fd) {
+		fprintf(stderr, "cannot use both --output and --log-fd\n");
+		usage_with_options(stat_usage, options);
+	}
+	if (!output) {
+		struct timespec tm;
+		mode = append_file ? "a" : "w";
+
+		output = fopen(output_name, mode);
+		if (!output) {
+			perror("failed to create output file");
+			exit(-1);
+		}
+		clock_gettime(CLOCK_REALTIME, &tm);
+		fprintf(output, "# started on %s\n", ctime(&tm.tv_sec));
+	} else if (output_fd != 2) {
+		mode = append_file ? "a" : "w";
+		output = fdopen(output_fd, mode);
+		if (!output) {
+			perror("Failed opening logfd");
+			return -errno;
+		}
+	}
+
+	if (csv_sep) {
 		csv_output = true;
-	else
+		if (!strcmp(csv_sep, "\\t"))
+			csv_sep = "\t";
+	} else
 		csv_sep = DEFAULT_SEPARATOR;
 
 	/*
 	 * let the spreadsheet do the pretty-printing
 	 */
 	if (csv_output) {
-		/* User explicitely passed -B? */
+		/* User explicitly passed -B? */
 		if (big_num_opt == 1) {
 			fprintf(stderr, "-B option not supported with -x\n");
 			usage_with_options(stat_usage, options);
@@ -1226,7 +1277,8 @@
 	status = 0;
 	for (run_idx = 0; run_idx < run_count; run_idx++) {
 		if (run_count != 1 && verbose)
-			fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx + 1);
+			fprintf(output, "[ perf stat: executing run #%d ... ]\n",
+				run_idx + 1);
 
 		if (sync_run)
 			sync();
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index d28013b..7a87171 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -5,6 +5,7 @@
  * any workload, CPU or specific PID.
  *
  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ *		 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
  *
  * Improvements and fixes by:
  *
@@ -36,6 +37,7 @@
 #include "util/parse-events.h"
 #include "util/cpumap.h"
 #include "util/xyarray.h"
+#include "util/sort.h"
 
 #include "util/debug.h"
 
@@ -65,12 +67,8 @@
 static struct perf_top top = {
 	.count_filter		= 5,
 	.delay_secs		= 2,
-	.display_weighted	= -1,
 	.target_pid		= -1,
 	.target_tid		= -1,
-	.active_symbols		= LIST_HEAD_INIT(top.active_symbols),
-	.active_symbols_lock	= PTHREAD_MUTEX_INITIALIZER,
-	.active_symbols_cond	= PTHREAD_COND_INITIALIZER,
 	.freq			= 1000, /* 1 KHz */
 };
 
@@ -78,6 +76,12 @@
 
 static bool			use_tui, use_stdio;
 
+static bool			sort_has_symbols;
+
+static bool			dont_use_callchains;
+static char			callchain_default_opt[]		= "fractal,0.5,callee";
+
+
 static int			default_interval		=      0;
 
 static bool			kptr_restrict_warned;
@@ -85,7 +89,6 @@
 static bool			inherit				=  false;
 static int			realtime_prio			=      0;
 static bool			group				=  false;
-static unsigned int		page_size;
 static unsigned int		mmap_pages			=    128;
 
 static bool			dump_symtab                     =  false;
@@ -93,7 +96,6 @@
 static struct winsize		winsize;
 
 static const char		*sym_filter			=   NULL;
-struct sym_entry		*sym_filter_entry_sched		=   NULL;
 static int			sym_pcnt_filter			=      5;
 
 /*
@@ -136,18 +138,18 @@
 	update_print_entries(&winsize);
 }
 
-static int parse_source(struct sym_entry *syme)
+static int parse_source(struct hist_entry *he)
 {
 	struct symbol *sym;
 	struct annotation *notes;
 	struct map *map;
 	int err = -1;
 
-	if (!syme)
+	if (!he || !he->ms.sym)
 		return -1;
 
-	sym = sym_entry__symbol(syme);
-	map = syme->map;
+	sym = he->ms.sym;
+	map = he->ms.map;
 
 	/*
 	 * We can't annotate with just /proc/kallsyms
@@ -175,53 +177,62 @@
 		return err;
 	}
 
-	err = symbol__annotate(sym, syme->map, 0);
+	err = symbol__annotate(sym, map, 0);
 	if (err == 0) {
 out_assign:
-		top.sym_filter_entry = syme;
+		top.sym_filter_entry = he;
 	}
 
 	pthread_mutex_unlock(&notes->lock);
 	return err;
 }
 
-static void __zero_source_counters(struct sym_entry *syme)
+static void __zero_source_counters(struct hist_entry *he)
 {
-	struct symbol *sym = sym_entry__symbol(syme);
+	struct symbol *sym = he->ms.sym;
 	symbol__annotate_zero_histograms(sym);
 }
 
-static void record_precise_ip(struct sym_entry *syme, struct map *map,
-			      int counter, u64 ip)
+static void record_precise_ip(struct hist_entry *he, int counter, u64 ip)
 {
 	struct annotation *notes;
 	struct symbol *sym;
 
-	if (syme != top.sym_filter_entry)
+	if (he == NULL || he->ms.sym == NULL ||
+	    (he != top.sym_filter_entry && use_browser != 1))
 		return;
 
-	sym = sym_entry__symbol(syme);
+	sym = he->ms.sym;
 	notes = symbol__annotation(sym);
 
 	if (pthread_mutex_trylock(&notes->lock))
 		return;
 
-	ip = map->map_ip(map, ip);
-	symbol__inc_addr_samples(sym, map, counter, ip);
+	if (notes->src == NULL &&
+	    symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) {
+		pthread_mutex_unlock(&notes->lock);
+		pr_err("Not enough memory for annotating '%s' symbol!\n",
+		       sym->name);
+		sleep(1);
+		return;
+	}
+
+	ip = he->ms.map->map_ip(he->ms.map, ip);
+	symbol__inc_addr_samples(sym, he->ms.map, counter, ip);
 
 	pthread_mutex_unlock(&notes->lock);
 }
 
-static void show_details(struct sym_entry *syme)
+static void show_details(struct hist_entry *he)
 {
 	struct annotation *notes;
 	struct symbol *symbol;
 	int more;
 
-	if (!syme)
+	if (!he)
 		return;
 
-	symbol = sym_entry__symbol(syme);
+	symbol = he->ms.sym;
 	notes = symbol__annotation(symbol);
 
 	pthread_mutex_lock(&notes->lock);
@@ -232,7 +243,7 @@
 	printf("Showing %s for %s\n", event_name(top.sym_evsel), symbol->name);
 	printf("  Events  Pcnt (>=%d%%)\n", sym_pcnt_filter);
 
-	more = symbol__annotate_printf(symbol, syme->map, top.sym_evsel->idx,
+	more = symbol__annotate_printf(symbol, he->ms.map, top.sym_evsel->idx,
 				       0, sym_pcnt_filter, top.print_entries, 4);
 	if (top.zero)
 		symbol__annotate_zero_histogram(symbol, top.sym_evsel->idx);
@@ -246,21 +257,28 @@
 
 static const char		CONSOLE_CLEAR[] = "";
 
-static void __list_insert_active_sym(struct sym_entry *syme)
+static struct hist_entry *
+	perf_session__add_hist_entry(struct perf_session *session,
+				     struct addr_location *al,
+				     struct perf_sample *sample,
+				     struct perf_evsel *evsel)
 {
-	list_add(&syme->node, &top.active_symbols);
+	struct hist_entry *he;
+
+	he = __hists__add_entry(&evsel->hists, al, NULL, sample->period);
+	if (he == NULL)
+		return NULL;
+
+	session->hists.stats.total_period += sample->period;
+	hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
+	return he;
 }
 
-static void print_sym_table(struct perf_session *session)
+static void print_sym_table(void)
 {
 	char bf[160];
 	int printed = 0;
-	struct rb_node *nd;
-	struct sym_entry *syme;
-	struct rb_root tmp = RB_ROOT;
 	const int win_width = winsize.ws_col - 1;
-	int sym_width, dso_width, dso_short_width;
-	float sum_ksamples = perf_top__decay_samples(&top, &tmp);
 
 	puts(CONSOLE_CLEAR);
 
@@ -271,10 +289,12 @@
 
 	printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
 
-	if (session->hists.stats.total_lost != 0) {
+	if (top.total_lost_warned != top.session->hists.stats.total_lost) {
+		top.total_lost_warned = top.session->hists.stats.total_lost;
 		color_fprintf(stdout, PERF_COLOR_RED, "WARNING:");
 		printf(" LOST %" PRIu64 " events, Check IO/CPU overload\n",
-		       session->hists.stats.total_lost);
+		       top.total_lost_warned);
+		++printed;
 	}
 
 	if (top.sym_filter_entry) {
@@ -282,58 +302,15 @@
 		return;
 	}
 
-	perf_top__find_widths(&top, &tmp, &dso_width, &dso_short_width,
-			      &sym_width);
-
-	if (sym_width + dso_width > winsize.ws_col - 29) {
-		dso_width = dso_short_width;
-		if (sym_width + dso_width > winsize.ws_col - 29)
-			sym_width = winsize.ws_col - dso_width - 29;
-	}
+	hists__collapse_resort_threaded(&top.sym_evsel->hists);
+	hists__output_resort_threaded(&top.sym_evsel->hists);
+	hists__decay_entries_threaded(&top.sym_evsel->hists,
+				      top.hide_user_symbols,
+				      top.hide_kernel_symbols);
+	hists__output_recalc_col_len(&top.sym_evsel->hists, winsize.ws_row - 3);
 	putchar('\n');
-	if (top.evlist->nr_entries == 1)
-		printf("             samples  pcnt");
-	else
-		printf("   weight    samples  pcnt");
-
-	if (verbose)
-		printf("         RIP       ");
-	printf(" %-*.*s DSO\n", sym_width, sym_width, "function");
-	printf("   %s    _______ _____",
-	       top.evlist->nr_entries == 1 ? "      " : "______");
-	if (verbose)
-		printf(" ________________");
-	printf(" %-*.*s", sym_width, sym_width, graph_line);
-	printf(" %-*.*s", dso_width, dso_width, graph_line);
-	puts("\n");
-
-	for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
-		struct symbol *sym;
-		double pcnt;
-
-		syme = rb_entry(nd, struct sym_entry, rb_node);
-		sym = sym_entry__symbol(syme);
-		if (++printed > top.print_entries ||
-		    (int)syme->snap_count < top.count_filter)
-			continue;
-
-		pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
-					 sum_ksamples));
-
-		if (top.evlist->nr_entries == 1 || !top.display_weighted)
-			printf("%20.2f ", syme->weight);
-		else
-			printf("%9.1f %10ld ", syme->weight, syme->snap_count);
-
-		percent_color_fprintf(stdout, "%4.1f%%", pcnt);
-		if (verbose)
-			printf(" %016" PRIx64, sym->start);
-		printf(" %-*.*s", sym_width, sym_width, sym->name);
-		printf(" %-*.*s\n", dso_width, dso_width,
-		       dso_width >= syme->map->dso->long_name_len ?
-					syme->map->dso->long_name :
-					syme->map->dso->short_name);
-	}
+	hists__fprintf(&top.sym_evsel->hists, NULL, false, false,
+		       winsize.ws_row - 4 - printed, win_width, stdout);
 }
 
 static void prompt_integer(int *target, const char *msg)
@@ -371,10 +348,11 @@
 		*target = tmp;
 }
 
-static void prompt_symbol(struct sym_entry **target, const char *msg)
+static void prompt_symbol(struct hist_entry **target, const char *msg)
 {
 	char *buf = malloc(0), *p;
-	struct sym_entry *syme = *target, *n, *found = NULL;
+	struct hist_entry *syme = *target, *n, *found = NULL;
+	struct rb_node *next;
 	size_t dummy = 0;
 
 	/* zero counters of active symbol */
@@ -391,17 +369,14 @@
 	if (p)
 		*p = 0;
 
-	pthread_mutex_lock(&top.active_symbols_lock);
-	syme = list_entry(top.active_symbols.next, struct sym_entry, node);
-	pthread_mutex_unlock(&top.active_symbols_lock);
-
-	list_for_each_entry_safe_from(syme, n, &top.active_symbols, node) {
-		struct symbol *sym = sym_entry__symbol(syme);
-
-		if (!strcmp(buf, sym->name)) {
-			found = syme;
+	next = rb_first(&top.sym_evsel->hists.entries);
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		if (n->ms.sym && !strcmp(buf, n->ms.sym->name)) {
+			found = n;
 			break;
 		}
+		next = rb_next(&n->rb_node);
 	}
 
 	if (!found) {
@@ -420,7 +395,7 @@
 	char *name = NULL;
 
 	if (top.sym_filter_entry) {
-		struct symbol *sym = sym_entry__symbol(top.sym_filter_entry);
+		struct symbol *sym = top.sym_filter_entry->ms.sym;
 		name = sym->name;
 	}
 
@@ -437,9 +412,6 @@
 	fprintf(stdout, "\t[s]     annotate symbol.                   \t(%s)\n", name?: "NULL");
 	fprintf(stdout, "\t[S]     stop annotation.\n");
 
-	if (top.evlist->nr_entries > 1)
-		fprintf(stdout, "\t[w]     toggle display weighted/count[E]r. \t(%d)\n", top.display_weighted ? 1 : 0);
-
 	fprintf(stdout,
 		"\t[K]     hide kernel_symbols symbols.     \t(%s)\n",
 		top.hide_kernel_symbols ? "yes" : "no");
@@ -466,7 +438,6 @@
 		case 'S':
 			return 1;
 		case 'E':
-		case 'w':
 			return top.evlist->nr_entries > 1 ? 1 : 0;
 		default:
 			break;
@@ -475,7 +446,7 @@
 	return 0;
 }
 
-static void handle_keypress(struct perf_session *session, int c)
+static void handle_keypress(int c)
 {
 	if (!key_mapped(c)) {
 		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
@@ -551,7 +522,7 @@
 		case 'Q':
 			printf("exiting.\n");
 			if (dump_symtab)
-				perf_session__fprintf_dsos(session, stderr);
+				perf_session__fprintf_dsos(top.session, stderr);
 			exit(0);
 		case 's':
 			prompt_symbol(&top.sym_filter_entry, "Enter details symbol");
@@ -560,7 +531,7 @@
 			if (!top.sym_filter_entry)
 				break;
 			else {
-				struct sym_entry *syme = top.sym_filter_entry;
+				struct hist_entry *syme = top.sym_filter_entry;
 
 				top.sym_filter_entry = NULL;
 				__zero_source_counters(syme);
@@ -569,9 +540,6 @@
 		case 'U':
 			top.hide_user_symbols = !top.hide_user_symbols;
 			break;
-		case 'w':
-			top.display_weighted = ~top.display_weighted;
-			break;
 		case 'z':
 			top.zero = !top.zero;
 			break;
@@ -580,19 +548,31 @@
 	}
 }
 
+static void perf_top__sort_new_samples(void *arg)
+{
+	struct perf_top *t = arg;
+	perf_top__reset_sample_counters(t);
+
+	if (t->evlist->selected != NULL)
+		t->sym_evsel = t->evlist->selected;
+
+	hists__collapse_resort_threaded(&t->sym_evsel->hists);
+	hists__output_resort_threaded(&t->sym_evsel->hists);
+	hists__decay_entries_threaded(&t->sym_evsel->hists,
+				      top.hide_user_symbols,
+				      top.hide_kernel_symbols);
+	hists__output_recalc_col_len(&t->sym_evsel->hists, winsize.ws_row - 3);
+}
+
 static void *display_thread_tui(void *arg __used)
 {
-	int err = 0;
-	pthread_mutex_lock(&top.active_symbols_lock);
-	while (list_empty(&top.active_symbols)) {
-		err = pthread_cond_wait(&top.active_symbols_cond,
-					&top.active_symbols_lock);
-		if (err)
-			break;
-	}
-	pthread_mutex_unlock(&top.active_symbols_lock);
-	if (!err)
-		perf_top__tui_browser(&top);
+	const char *help = "For a higher level overview, try: perf top --sort comm,dso";
+
+	perf_top__sort_new_samples(&top);
+	perf_evlist__tui_browse_hists(top.evlist, help,
+				      perf_top__sort_new_samples,
+				      &top, top.delay_secs);
+
 	exit_browser(0);
 	exit(0);
 	return NULL;
@@ -603,7 +583,6 @@
 	struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
 	struct termios tc, save;
 	int delay_msecs, c;
-	struct perf_session *session = (struct perf_session *) arg;
 
 	tcgetattr(0, &save);
 	tc = save;
@@ -611,20 +590,35 @@
 	tc.c_cc[VMIN] = 0;
 	tc.c_cc[VTIME] = 0;
 
+	pthread__unblock_sigwinch();
 repeat:
 	delay_msecs = top.delay_secs * 1000;
 	tcsetattr(0, TCSANOW, &tc);
 	/* trash return*/
 	getc(stdin);
 
-	do {
-		print_sym_table(session);
-	} while (!poll(&stdin_poll, 1, delay_msecs) == 1);
-
+	while (1) {
+		print_sym_table();
+		/*
+		 * Either timeout expired or we got an EINTR due to SIGWINCH,
+		 * refresh screen in both cases.
+		 */
+		switch (poll(&stdin_poll, 1, delay_msecs)) {
+		case 0:
+			continue;
+		case -1:
+			if (errno == EINTR)
+				continue;
+			/* Fall thru */
+		default:
+			goto process_hotkey;
+		}
+	}
+process_hotkey:
 	c = getc(stdin);
 	tcsetattr(0, TCSAFLUSH, &save);
 
-	handle_keypress(session, c);
+	handle_keypress(c);
 	goto repeat;
 
 	return NULL;
@@ -645,9 +639,8 @@
 	NULL
 };
 
-static int symbol_filter(struct map *map, struct symbol *sym)
+static int symbol_filter(struct map *map __used, struct symbol *sym)
 {
-	struct sym_entry *syme;
 	const char *name = sym->name;
 	int i;
 
@@ -667,16 +660,6 @@
 	    strstr(name, "_text_end"))
 		return 1;
 
-	syme = symbol__priv(sym);
-	syme->map = map;
-	symbol__annotate_init(map, sym);
-
-	if (!top.sym_filter_entry && sym_filter && !strcmp(name, sym_filter)) {
-		/* schedule initial sym_filter_entry setup */
-		sym_filter_entry_sched = syme;
-		sym_filter = NULL;
-	}
-
 	for (i = 0; skip_symbols[i]; i++) {
 		if (!strcmp(skip_symbols[i], name)) {
 			sym->ignore = true;
@@ -691,10 +674,11 @@
 				       struct perf_sample *sample,
 				       struct perf_session *session)
 {
+	struct symbol *parent = NULL;
 	u64 ip = event->ip.ip;
-	struct sym_entry *syme;
 	struct addr_location al;
 	struct machine *machine;
+	int err;
 	u8 origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 
 	++top.samples;
@@ -783,46 +767,41 @@
 				sleep(5);
 			vmlinux_warned = true;
 		}
-
-		return;
 	}
 
-	/* let's see, whether we need to install initial sym_filter_entry */
-	if (sym_filter_entry_sched) {
-		top.sym_filter_entry = sym_filter_entry_sched;
-		sym_filter_entry_sched = NULL;
-		if (parse_source(top.sym_filter_entry) < 0) {
-			struct symbol *sym = sym_entry__symbol(top.sym_filter_entry);
-
-			pr_err("Can't annotate %s", sym->name);
-			if (top.sym_filter_entry->map->dso->symtab_type == SYMTAB__KALLSYMS) {
-				pr_err(": No vmlinux file was found in the path:\n");
-				machine__fprintf_vmlinux_path(machine, stderr);
-			} else
-				pr_err(".\n");
-			exit(1);
-		}
-	}
-
-	syme = symbol__priv(al.sym);
-	if (!al.sym->ignore) {
+	if (al.sym == NULL || !al.sym->ignore) {
 		struct perf_evsel *evsel;
+		struct hist_entry *he;
 
 		evsel = perf_evlist__id2evsel(top.evlist, sample->id);
 		assert(evsel != NULL);
-		syme->count[evsel->idx]++;
-		record_precise_ip(syme, al.map, evsel->idx, ip);
-		pthread_mutex_lock(&top.active_symbols_lock);
-		if (list_empty(&syme->node) || !syme->node.next) {
-			static bool first = true;
-			__list_insert_active_sym(syme);
-			if (first) {
-				pthread_cond_broadcast(&top.active_symbols_cond);
-				first = false;
-			}
+
+		if ((sort__has_parent || symbol_conf.use_callchain) &&
+		    sample->callchain) {
+			err = perf_session__resolve_callchain(session, al.thread,
+							      sample->callchain, &parent);
+			if (err)
+				return;
 		}
-		pthread_mutex_unlock(&top.active_symbols_lock);
+
+		he = perf_session__add_hist_entry(session, &al, sample, evsel);
+		if (he == NULL) {
+			pr_err("Problem incrementing symbol period, skipping event\n");
+			return;
+		}
+
+		if (symbol_conf.use_callchain) {
+			err = callchain_append(he->callchain, &session->callchain_cursor,
+					       sample->period);
+			if (err)
+				return;
+		}
+
+		if (sort_has_symbols)
+			record_precise_ip(he, evsel->idx, ip);
 	}
+
+	return;
 }
 
 static void perf_session__mmap_read_idx(struct perf_session *self, int idx)
@@ -873,7 +852,11 @@
 			attr->read_format |= PERF_FORMAT_ID;
 		}
 
+		if (symbol_conf.use_callchain)
+			attr->sample_type |= PERF_SAMPLE_CALLCHAIN;
+
 		attr->mmap = 1;
+		attr->comm = 1;
 		attr->inherit = inherit;
 try_again:
 		if (perf_evsel__open(counter, top.evlist->cpus,
@@ -928,35 +911,56 @@
 	exit(0);
 }
 
+static int setup_sample_type(void)
+{
+	if (!sort_has_symbols) {
+		if (symbol_conf.use_callchain) {
+			ui__warning("Selected -g but \"sym\" not present in --sort/-s.");
+			return -EINVAL;
+		}
+	} else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE) {
+		if (callchain_register_param(&callchain_param) < 0) {
+			ui__warning("Can't register callchain params.\n");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int __cmd_top(void)
 {
 	pthread_t thread;
-	int ret __used;
+	int ret;
 	/*
 	 * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
 	 * mmap reading, etc is encapsulated in it. Use O_WRONLY for now.
 	 */
-	struct perf_session *session = perf_session__new(NULL, O_WRONLY, false, false, NULL);
-	if (session == NULL)
+	top.session = perf_session__new(NULL, O_WRONLY, false, false, NULL);
+	if (top.session == NULL)
 		return -ENOMEM;
 
+	ret = setup_sample_type();
+	if (ret)
+		goto out_delete;
+
 	if (top.target_tid != -1)
 		perf_event__synthesize_thread_map(top.evlist->threads,
-						  perf_event__process, session);
+						  perf_event__process, top.session);
 	else
-		perf_event__synthesize_threads(perf_event__process, session);
+		perf_event__synthesize_threads(perf_event__process, top.session);
 
 	start_counters(top.evlist);
-	session->evlist = top.evlist;
-	perf_session__update_sample_type(session);
+	top.session->evlist = top.evlist;
+	perf_session__update_sample_type(top.session);
 
 	/* Wait for a minimal set of events before starting the snapshot */
 	poll(top.evlist->pollfd, top.evlist->nr_fds, 100);
 
-	perf_session__mmap_read(session);
+	perf_session__mmap_read(top.session);
 
 	if (pthread_create(&thread, NULL, (use_browser > 0 ? display_thread_tui :
-							     display_thread), session)) {
+							     display_thread), NULL)) {
 		printf("Could not create display thread.\n");
 		exit(-1);
 	}
@@ -974,12 +978,96 @@
 	while (1) {
 		u64 hits = top.samples;
 
-		perf_session__mmap_read(session);
+		perf_session__mmap_read(top.session);
 
 		if (hits == top.samples)
 			ret = poll(top.evlist->pollfd, top.evlist->nr_fds, 100);
 	}
 
+out_delete:
+	perf_session__delete(top.session);
+	top.session = NULL;
+
+	return 0;
+}
+
+static int
+parse_callchain_opt(const struct option *opt __used, const char *arg,
+		    int unset)
+{
+	char *tok, *tok2;
+	char *endptr;
+
+	/*
+	 * --no-call-graph
+	 */
+	if (unset) {
+		dont_use_callchains = true;
+		return 0;
+	}
+
+	symbol_conf.use_callchain = true;
+
+	if (!arg)
+		return 0;
+
+	tok = strtok((char *)arg, ",");
+	if (!tok)
+		return -1;
+
+	/* get the output mode */
+	if (!strncmp(tok, "graph", strlen(arg)))
+		callchain_param.mode = CHAIN_GRAPH_ABS;
+
+	else if (!strncmp(tok, "flat", strlen(arg)))
+		callchain_param.mode = CHAIN_FLAT;
+
+	else if (!strncmp(tok, "fractal", strlen(arg)))
+		callchain_param.mode = CHAIN_GRAPH_REL;
+
+	else if (!strncmp(tok, "none", strlen(arg))) {
+		callchain_param.mode = CHAIN_NONE;
+		symbol_conf.use_callchain = false;
+
+		return 0;
+	}
+
+	else
+		return -1;
+
+	/* get the min percentage */
+	tok = strtok(NULL, ",");
+	if (!tok)
+		goto setup;
+
+	callchain_param.min_percent = strtod(tok, &endptr);
+	if (tok == endptr)
+		return -1;
+
+	/* get the print limit */
+	tok2 = strtok(NULL, ",");
+	if (!tok2)
+		goto setup;
+
+	if (tok2[0] != 'c') {
+		callchain_param.print_limit = strtod(tok2, &endptr);
+		tok2 = strtok(NULL, ",");
+		if (!tok2)
+			goto setup;
+	}
+
+	/* get the call chain order */
+	if (!strcmp(tok2, "caller"))
+		callchain_param.order = ORDER_CALLER;
+	else if (!strcmp(tok2, "callee"))
+		callchain_param.order = ORDER_CALLEE;
+	else
+		return -1;
+setup:
+	if (callchain_register_param(&callchain_param) < 0) {
+		fprintf(stderr, "Can't register callchain params\n");
+		return -1;
+	}
 	return 0;
 }
 
@@ -1019,7 +1107,7 @@
 			    "put the counters into a counter group"),
 	OPT_BOOLEAN('i', "inherit", &inherit,
 		    "child tasks inherit counters"),
-	OPT_STRING('s', "sym-annotate", &sym_filter, "symbol name",
+	OPT_STRING(0, "sym-annotate", &sym_filter, "symbol name",
 		    "symbol to annotate"),
 	OPT_BOOLEAN('z', "zero", &top.zero,
 		    "zero history across updates"),
@@ -1033,6 +1121,28 @@
 	OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
+	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
+		   "sort by key(s): pid, comm, dso, symbol, parent"),
+	OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples,
+		    "Show a column with the number of samples"),
+	OPT_CALLBACK_DEFAULT('G', "call-graph", NULL, "output_type,min_percent,call_order",
+		     "Display callchains using output_type (graph, flat, fractal, or none), min percent threshold and callchain order. "
+		     "Default: fractal,0.5,callee", &parse_callchain_opt,
+		     callchain_default_opt),
+	OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
+		    "Show a column with the sum of periods"),
+	OPT_STRING(0, "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
+		   "only consider symbols in these dsos"),
+	OPT_STRING(0, "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
+		   "only consider symbols in these comms"),
+	OPT_STRING(0, "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
+		   "only consider these symbols"),
+	OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
+		    "Interleave source code with assembly code (default)"),
+	OPT_BOOLEAN(0, "asm-raw", &symbol_conf.annotate_asm_raw,
+		    "Display raw encoding of assembly instructions (default)"),
+	OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style",
+		   "Specify disassembler style (e.g. -M intel for intel syntax)"),
 	OPT_END()
 };
 
@@ -1045,18 +1155,16 @@
 	if (top.evlist == NULL)
 		return -ENOMEM;
 
-	page_size = sysconf(_SC_PAGE_SIZE);
+	symbol_conf.exclude_other = false;
 
 	argc = parse_options(argc, argv, options, top_usage, 0);
 	if (argc)
 		usage_with_options(top_usage, options);
 
-	/*
- 	 * XXX For now start disabled, only using TUI if explicitely asked for.
- 	 * Change that when handle_keys equivalent gets written, live annotation
- 	 * done, etc.
- 	 */
-	use_browser = 0;
+	if (sort_order == default_sort_order)
+		sort_order = "dso,symbol";
+
+	setup_sorting(top_usage, options);
 
 	if (use_stdio)
 		use_browser = 0;
@@ -1119,13 +1227,22 @@
 
 	top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node);
 
-	symbol_conf.priv_size = (sizeof(struct sym_entry) + sizeof(struct annotation) +
-				 (top.evlist->nr_entries + 1) * sizeof(unsigned long));
+	symbol_conf.priv_size = sizeof(struct annotation);
 
 	symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
 	if (symbol__init() < 0)
 		return -1;
 
+	sort_entry__setup_elide(&sort_dso, symbol_conf.dso_list, "dso", stdout);
+	sort_entry__setup_elide(&sort_comm, symbol_conf.comm_list, "comm", stdout);
+	sort_entry__setup_elide(&sort_sym, symbol_conf.sym_list, "symbol", stdout);
+
+	/*
+	 * Avoid annotation data structures overhead when symbols aren't on the
+	 * sort list.
+	 */
+	sort_has_symbols = sort_sym.list.next != NULL;
+
 	get_term_dimensions(&winsize);
 	if (top.print_entries == 0) {
 		update_print_entries(&winsize);
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 4702e24..b382bd5 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -4,7 +4,6 @@
 #include "util/util.h"
 #include "util/strbuf.h"
 
-extern const char perf_version_string[];
 extern const char perf_usage_string[];
 extern const char perf_more_info_string[];
 
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index ec635b7..73d0cac 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -427,6 +427,24 @@
 		debugfs_mntpt[0] = '\0';
 }
 
+static void pthread__block_sigwinch(void)
+{
+	sigset_t set;
+
+	sigemptyset(&set);
+	sigaddset(&set, SIGWINCH);
+	pthread_sigmask(SIG_BLOCK, &set, NULL);
+}
+
+void pthread__unblock_sigwinch(void)
+{
+	sigset_t set;
+
+	sigemptyset(&set);
+	sigaddset(&set, SIGWINCH);
+	pthread_sigmask(SIG_UNBLOCK, &set, NULL);
+}
+
 int main(int argc, const char **argv)
 {
 	const char *cmd;
@@ -480,6 +498,12 @@
 	 * time.
 	 */
 	setup_path();
+	/*
+	 * Block SIGWINCH notifications so that the thread that wants them can
+	 * unblock and have syscalls like select interrupted, instead of waiting
+	 * forever while the signal goes to some other, uninterested thread.
+	 */
+	pthread__block_sigwinch();
 
 	while (1) {
 		static int done_help;
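The two hunks above add the block/unblock helpers and block SIGWINCH before any thread is created. Below is a minimal, self-contained sketch of the same pattern (the names, the empty handler and the 1-second poll are illustrative, not taken from perf): the main thread blocks SIGWINCH up front, and only the display thread unblocks it, so a terminal resize interrupts that thread's poll() with EINTR instead of being delivered to some arbitrary thread.

#include <errno.h>
#include <poll.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>

static void sigwinch_handler(int sig)
{
	(void)sig;	/* nothing to do: we only want poll() to fail with EINTR */
}

static void block_sigwinch(void)
{
	sigset_t set;

	sigemptyset(&set);
	sigaddset(&set, SIGWINCH);
	pthread_sigmask(SIG_BLOCK, &set, NULL);	/* inherited by threads created later */
}

static void *display_thread(void *arg)
{
	sigset_t set;

	(void)arg;
	signal(SIGWINCH, sigwinch_handler);

	/* only this thread unblocks SIGWINCH, so only its poll() is interrupted */
	sigemptyset(&set);
	sigaddset(&set, SIGWINCH);
	pthread_sigmask(SIG_UNBLOCK, &set, NULL);

	for (;;) {
		if (poll(NULL, 0, 1000) < 0 && errno == EINTR)
			printf("terminal resized, redraw\n");
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	block_sigwinch();	/* must happen before any thread is created */
	pthread_create(&t, NULL, display_thread, NULL);
	pthread_join(t, NULL);
	return 0;
}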
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index a5fc660..914c895 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -9,18 +9,21 @@
 #include "../../arch/x86/include/asm/unistd.h"
 #define rmb()		asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#define CPUINFO_PROC	"model name"
 #endif
 
 #if defined(__x86_64__)
 #include "../../arch/x86/include/asm/unistd.h"
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#define CPUINFO_PROC	"model name"
 #endif
 
 #ifdef __powerpc__
 #include "../../arch/powerpc/include/asm/unistd.h"
 #define rmb()		asm volatile ("sync" ::: "memory")
 #define cpu_relax()	asm volatile ("" ::: "memory");
+#define CPUINFO_PROC	"cpu"
 #endif
 
 #ifdef __s390__
@@ -37,30 +40,35 @@
 # define rmb()		asm volatile("" ::: "memory")
 #endif
 #define cpu_relax()	asm volatile("" ::: "memory")
+#define CPUINFO_PROC	"cpu type"
 #endif
 
 #ifdef __hppa__
 #include "../../arch/parisc/include/asm/unistd.h"
 #define rmb()		asm volatile("" ::: "memory")
 #define cpu_relax()	asm volatile("" ::: "memory");
+#define CPUINFO_PROC	"cpu"
 #endif
 
 #ifdef __sparc__
 #include "../../arch/sparc/include/asm/unistd.h"
 #define rmb()		asm volatile("":::"memory")
 #define cpu_relax()	asm volatile("":::"memory")
+#define CPUINFO_PROC	"cpu"
 #endif
 
 #ifdef __alpha__
 #include "../../arch/alpha/include/asm/unistd.h"
 #define rmb()		asm volatile("mb" ::: "memory")
 #define cpu_relax()	asm volatile("" ::: "memory")
+#define CPUINFO_PROC	"cpu model"
 #endif
 
 #ifdef __ia64__
 #include "../../arch/ia64/include/asm/unistd.h"
 #define rmb()		asm volatile ("mf" ::: "memory")
 #define cpu_relax()	asm volatile ("hint @pause" ::: "memory")
+#define CPUINFO_PROC	"model name"
 #endif
 
 #ifdef __arm__
@@ -71,6 +79,7 @@
  */
 #define rmb()		((void(*)(void))0xffff0fa0)()
 #define cpu_relax()	asm volatile("":::"memory")
+#define CPUINFO_PROC	"Processor"
 #endif
 
 #ifdef __mips__
@@ -83,6 +92,7 @@
 				: /* no input */			\
 				: "memory")
 #define cpu_relax()	asm volatile("" ::: "memory")
+#define CPUINFO_PROC	"cpu model"
 #endif
 
 #include <time.h>
@@ -171,5 +181,8 @@
 };
 
 extern bool perf_host, perf_guest;
+extern const char perf_version_string[];
+
+void pthread__unblock_sigwinch(void);
 
 #endif
diff --git a/tools/perf/scripts/python/bin/net_dropmonitor-record b/tools/perf/scripts/python/bin/net_dropmonitor-record
new file mode 100755
index 0000000..423fb81
--- /dev/null
+++ b/tools/perf/scripts/python/bin/net_dropmonitor-record
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -e skb:kfree_skb $@
diff --git a/tools/perf/scripts/python/bin/net_dropmonitor-report b/tools/perf/scripts/python/bin/net_dropmonitor-report
new file mode 100755
index 0000000..8d698f5
--- /dev/null
+++ b/tools/perf/scripts/python/bin/net_dropmonitor-report
@@ -0,0 +1,4 @@
+#!/bin/bash
+# description: display a table of dropped frames
+
+perf script -s "$PERF_EXEC_PATH"/scripts/python/net_dropmonitor.py $@
diff --git a/tools/perf/scripts/python/net_dropmonitor.py b/tools/perf/scripts/python/net_dropmonitor.py
new file mode 100755
index 0000000..a4ffc95
--- /dev/null
+++ b/tools/perf/scripts/python/net_dropmonitor.py
@@ -0,0 +1,72 @@
+# Monitor the system for dropped packets and produce a report of drop locations and counts
+
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+		'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+from Util import *
+
+drop_log = {}
+kallsyms = []
+
+def get_kallsyms_table():
+	global kallsyms
+	try:
+		f = open("/proc/kallsyms", "r")
+		linecount = 0
+		for line in f:
+			linecount = linecount+1
+		f.seek(0)
+	except:
+		return
+
+
+	j = 0
+	for line in f:
+		loc = int(line.split()[0], 16)
+		name = line.split()[2]
+		j = j +1
+		if ((j % 100) == 0):
+			print "\r" + str(j) + "/" + str(linecount),
+		kallsyms.append({ 'loc': loc, 'name' : name})
+
+	print "\r" + str(j) + "/" + str(linecount)
+	kallsyms.sort()
+	return
+
+def get_sym(sloc):
+	loc = int(sloc)
+	for i in kallsyms:
+		if (i['loc'] >= loc):
+			return (i['name'], i['loc']-loc)
+	return (None, 0)
+
+def print_drop_table():
+	print "%25s %25s %25s" % ("LOCATION", "OFFSET", "COUNT")
+	for i in drop_log.keys():
+		(sym, off) = get_sym(i)
+		if sym == None:
+			sym = i
+		print "%25s %25s %25s" % (sym, off, drop_log[i])
+
+
+def trace_begin():
+	print "Starting trace (Ctrl-C to dump results)"
+
+def trace_end():
+	print "Gathering kallsyms data"
+	get_kallsyms_table()
+	print_drop_table()
+
+# called from perf, when it finds a corresponding event
+def skb__kfree_skb(name, context, cpu, sec, nsec, pid, comm,
+			skbaddr, protocol, location):
+	slocation = str(location)
+	try:
+		drop_log[slocation] = drop_log[slocation] + 1
+	except:
+		drop_log[slocation] = 1
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index e01af2b..bc8f477 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -16,6 +16,8 @@
 #include "annotate.h"
 #include <pthread.h>
 
+const char 	*disassembler_style;
+
 int symbol__annotate_init(struct map *map __used, struct symbol *sym)
 {
 	struct annotation *notes = symbol__annotation(sym);
@@ -323,10 +325,15 @@
 		 dso, dso->long_name, sym, sym->name);
 
 	snprintf(command, sizeof(command),
-		 "objdump --start-address=0x%016" PRIx64
-		 " --stop-address=0x%016" PRIx64 " -dS -C %s|grep -v %s|expand",
+		 "objdump %s%s --start-address=0x%016" PRIx64
+		 " --stop-address=0x%016" PRIx64
+		 " -d %s %s -C %s|grep -v %s|expand",
+		 disassembler_style ? "-M " : "",
+		 disassembler_style ? disassembler_style : "",
 		 map__rip_2objdump(map, sym->start),
 		 map__rip_2objdump(map, sym->end),
+		 symbol_conf.annotate_asm_raw ? "" : "--no-show-raw",
+		 symbol_conf.annotate_src ? "-S" : "",
 		 symfs_filename, filename);
 
 	pr_debug("Executing: %s\n", command);
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index c2c2868..d907252 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -91,13 +91,18 @@
 #ifdef NO_NEWT_SUPPORT
 static inline int symbol__tui_annotate(struct symbol *sym __used,
 				       struct map *map __used,
-				       int evidx __used, int refresh __used)
+				       int evidx __used,
+				       void(*timer)(void *arg) __used,
+				       void *arg __used, int delay_secs __used)
 {
 	return 0;
 }
 #else
 int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx,
-			 int refresh);
+			 int nr_events, void(*timer)(void *arg), void *arg,
+			 int delay_secs);
 #endif
 
+extern const char	*disassembler_style;
+
 #endif	/* __PERF_ANNOTATE_H */
diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c
index e191eb9..521c38a 100644
--- a/tools/perf/util/color.c
+++ b/tools/perf/util/color.c
@@ -200,7 +200,7 @@
 	 * Auto-detect:
 	 */
 	if (perf_use_color_default < 0) {
-		if (isatty(1) || pager_in_use())
+		if (isatty(fileno(fp)) || pager_in_use())
 			perf_use_color_default = 1;
 		else
 			perf_use_color_default = 0;
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 72e9f48..2f6bc89 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -533,3 +533,9 @@
 	first = list_entry(evlist->entries.next, struct perf_evsel, node);
 	return first->attr.sample_id_all;
 }
+
+void perf_evlist__set_selected(struct perf_evlist *evlist,
+			       struct perf_evsel *evsel)
+{
+	evlist->selected = evsel;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index f349150..6be71fc 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -25,6 +25,7 @@
 	struct pollfd	 *pollfd;
 	struct thread_map *threads;
 	struct cpu_map	  *cpus;
+	struct perf_evsel *selected;
 };
 
 struct perf_evsel;
@@ -56,6 +57,9 @@
 void perf_evlist__disable(struct perf_evlist *evlist);
 void perf_evlist__enable(struct perf_evlist *evlist);
 
+void perf_evlist__set_selected(struct perf_evlist *evlist,
+			       struct perf_evsel *evsel);
+
 static inline void perf_evlist__set_maps(struct perf_evlist *evlist,
 					 struct cpu_map *cpus,
 					 struct thread_map *threads)
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index e389815..b46f6e4 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -39,6 +39,7 @@
 	evsel->idx	   = idx;
 	evsel->attr	   = *attr;
 	INIT_LIST_HEAD(&evsel->node);
+	hists__init(&evsel->hists);
 }
 
 struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index b6c1ad1..76c0b2c 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -7,6 +7,7 @@
 #include <stdlib.h>
 #include <linux/list.h>
 #include <linux/kernel.h>
+#include <sys/utsname.h>
 
 #include "evlist.h"
 #include "evsel.h"
@@ -17,12 +18,19 @@
 #include "session.h"
 #include "symbol.h"
 #include "debug.h"
+#include "cpumap.h"
 
 static bool no_buildid_cache = false;
 
 static int event_count;
 static struct perf_trace_event_type *events;
 
+static u32 header_argc;
+static const char **header_argv;
+
+static int dsos__write_buildid_table(struct perf_header *header, int fd);
+static int perf_session__cache_build_ids(struct perf_session *session);
+
 int perf_header__push_event(u64 id, const char *name)
 {
 	if (strlen(name) > MAX_EVENT_NAME)
@@ -110,6 +118,1020 @@
 	return err;
 }
 
+static int do_write_string(int fd, const char *str)
+{
+	u32 len, olen;
+	int ret;
+
+	olen = strlen(str) + 1;
+	len = ALIGN(olen, NAME_ALIGN);
+
+	/* write len, incl. \0 */
+	ret = do_write(fd, &len, sizeof(len));
+	if (ret < 0)
+		return ret;
+
+	return write_padded(fd, str, olen, len);
+}
+
+static char *do_read_string(int fd, struct perf_header *ph)
+{
+	ssize_t sz, ret;
+	u32 len;
+	char *buf;
+
+	sz = read(fd, &len, sizeof(len));
+	if (sz < (ssize_t)sizeof(len))
+		return NULL;
+
+	if (ph->needs_swap)
+		len = bswap_32(len);
+
+	buf = malloc(len);
+	if (!buf)
+		return NULL;
+
+	ret = read(fd, buf, len);
+	if (ret == (ssize_t)len) {
+		/*
+		 * strings are padded by zeroes
+		 * thus the actual strlen of buf
+		 * may be less than len
+		 */
+		return buf;
+	}
+
+	free(buf);
+	return NULL;
+}
+
+int
+perf_header__set_cmdline(int argc, const char **argv)
+{
+	int i;
+
+	header_argc = (u32)argc;
+
+	/* do not include NULL termination */
+	header_argv = calloc(argc, sizeof(char *));
+	if (!header_argv)
+		return -ENOMEM;
+
+	/*
+	 * must copy argv contents because it gets moved
+	 * around during option parsing
+	 */
+	for (i = 0; i < argc ; i++)
+		header_argv[i] = argv[i];
+
+	return 0;
+}
+
+static int write_trace_info(int fd, struct perf_header *h __used,
+			    struct perf_evlist *evlist)
+{
+	return read_tracing_data(fd, &evlist->entries);
+}
+
+
+static int write_build_id(int fd, struct perf_header *h,
+			  struct perf_evlist *evlist __used)
+{
+	struct perf_session *session;
+	int err;
+
+	session = container_of(h, struct perf_session, header);
+
+	err = dsos__write_buildid_table(h, fd);
+	if (err < 0) {
+		pr_debug("failed to write buildid table\n");
+		return err;
+	}
+	if (!no_buildid_cache)
+		perf_session__cache_build_ids(session);
+
+	return 0;
+}
+
+static int write_hostname(int fd, struct perf_header *h __used,
+			  struct perf_evlist *evlist __used)
+{
+	struct utsname uts;
+	int ret;
+
+	ret = uname(&uts);
+	if (ret < 0)
+		return -1;
+
+	return do_write_string(fd, uts.nodename);
+}
+
+static int write_osrelease(int fd, struct perf_header *h __used,
+			   struct perf_evlist *evlist __used)
+{
+	struct utsname uts;
+	int ret;
+
+	ret = uname(&uts);
+	if (ret < 0)
+		return -1;
+
+	return do_write_string(fd, uts.release);
+}
+
+static int write_arch(int fd, struct perf_header *h __used,
+		      struct perf_evlist *evlist __used)
+{
+	struct utsname uts;
+	int ret;
+
+	ret = uname(&uts);
+	if (ret < 0)
+		return -1;
+
+	return do_write_string(fd, uts.machine);
+}
+
+static int write_version(int fd, struct perf_header *h __used,
+			 struct perf_evlist *evlist __used)
+{
+	return do_write_string(fd, perf_version_string);
+}
+
+static int write_cpudesc(int fd, struct perf_header *h __used,
+		       struct perf_evlist *evlist __used)
+{
+#ifndef CPUINFO_PROC
+#define CPUINFO_PROC NULL
+#endif
+	FILE *file;
+	char *buf = NULL;
+	char *s, *p;
+	const char *search = CPUINFO_PROC;
+	size_t len = 0;
+	int ret = -1;
+
+	if (!search)
+		return -1;
+
+	file = fopen("/proc/cpuinfo", "r");
+	if (!file)
+		return -1;
+
+	while (getline(&buf, &len, file) > 0) {
+		ret = strncmp(buf, search, strlen(search));
+		if (!ret)
+			break;
+	}
+
+	if (ret)
+		goto done;
+
+	s = buf;
+
+	p = strchr(buf, ':');
+	if (p && *(p+1) == ' ' && *(p+2))
+		s = p + 2;
+	p = strchr(s, '\n');
+	if (p)
+		*p = '\0';
+
+	/* squash extra space characters (branding string) */
+	p = s;
+	while (*p) {
+		if (isspace(*p)) {
+			char *r = p + 1;
+			char *q = r;
+			*p = ' ';
+			while (*q && isspace(*q))
+				q++;
+			if (q != (p+1))
+				while ((*r++ = *q++));
+		}
+		p++;
+	}
+	ret = do_write_string(fd, s);
+done:
+	free(buf);
+	fclose(file);
+	return ret;
+}
+
+static int write_nrcpus(int fd, struct perf_header *h __used,
+			struct perf_evlist *evlist __used)
+{
+	long nr;
+	u32 nrc, nra;
+	int ret;
+
+	nr = sysconf(_SC_NPROCESSORS_CONF);
+	if (nr < 0)
+		return -1;
+
+	nrc = (u32)(nr & UINT_MAX);
+
+	nr = sysconf(_SC_NPROCESSORS_ONLN);
+	if (nr < 0)
+		return -1;
+
+	nra = (u32)(nr & UINT_MAX);
+
+	ret = do_write(fd, &nrc, sizeof(nrc));
+	if (ret < 0)
+		return ret;
+
+	return do_write(fd, &nra, sizeof(nra));
+}
+
+static int write_event_desc(int fd, struct perf_header *h __used,
+			    struct perf_evlist *evlist)
+{
+	struct perf_evsel *attr;
+	u32 nre = 0, nri, sz;
+	int ret;
+
+	list_for_each_entry(attr, &evlist->entries, node)
+		nre++;
+
+	/*
+	 * write number of events
+	 */
+	ret = do_write(fd, &nre, sizeof(nre));
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * size of perf_event_attr struct
+	 */
+	sz = (u32)sizeof(attr->attr);
+	ret = do_write(fd, &sz, sizeof(sz));
+	if (ret < 0)
+		return ret;
+
+	list_for_each_entry(attr, &evlist->entries, node) {
+
+		ret = do_write(fd, &attr->attr, sz);
+		if (ret < 0)
+			return ret;
+		/*
+		 * write the number of unique ids per event;
+		 * there is one id per instance of an event.
+		 *
+		 * copy into nri to be independent of the
+		 * type of the ids
+		 */
+		nri = attr->ids;
+		ret = do_write(fd, &nri, sizeof(nri));
+		if (ret < 0)
+			return ret;
+
+		/*
+		 * write event string as passed on cmdline
+		 */
+		ret = do_write_string(fd, attr->name);
+		if (ret < 0)
+			return ret;
+		/*
+		 * write unique ids for this event
+		 */
+		ret = do_write(fd, attr->id, attr->ids * sizeof(u64));
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
+static int write_cmdline(int fd, struct perf_header *h __used,
+			 struct perf_evlist *evlist __used)
+{
+	char buf[MAXPATHLEN];
+	char proc[32];
+	u32 i, n;
+	int ret;
+
+	/*
+	 * actual path to the perf binary
+	 */
+	sprintf(proc, "/proc/%d/exe", getpid());
+	ret = readlink(proc, buf, sizeof(buf));
+	if (ret <= 0)
+		return -1;
+
+	/* readlink() does not add null termination */
+	buf[ret] = '\0';
+
+	/* account for binary path */
+	n = header_argc + 1;
+
+	ret = do_write(fd, &n, sizeof(n));
+	if (ret < 0)
+		return ret;
+
+	ret = do_write_string(fd, buf);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0 ; i < header_argc; i++) {
+		ret = do_write_string(fd, header_argv[i]);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
+#define CORE_SIB_FMT \
+	"/sys/devices/system/cpu/cpu%d/topology/core_siblings_list"
+#define THRD_SIB_FMT \
+	"/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list"
+
+struct cpu_topo {
+	u32 core_sib;
+	u32 thread_sib;
+	char **core_siblings;
+	char **thread_siblings;
+};
+
+static int build_cpu_topo(struct cpu_topo *tp, int cpu)
+{
+	FILE *fp;
+	char filename[MAXPATHLEN];
+	char *buf = NULL, *p;
+	size_t len = 0;
+	u32 i = 0;
+	int ret = -1;
+
+	sprintf(filename, CORE_SIB_FMT, cpu);
+	fp = fopen(filename, "r");
+	if (!fp)
+		return -1;
+
+	if (getline(&buf, &len, fp) <= 0)
+		goto done;
+
+	fclose(fp);
+
+	p = strchr(buf, '\n');
+	if (p)
+		*p = '\0';
+
+	for (i = 0; i < tp->core_sib; i++) {
+		if (!strcmp(buf, tp->core_siblings[i]))
+			break;
+	}
+	if (i == tp->core_sib) {
+		tp->core_siblings[i] = buf;
+		tp->core_sib++;
+		buf = NULL;
+		len = 0;
+	}
+
+	sprintf(filename, THRD_SIB_FMT, cpu);
+	fp = fopen(filename, "r");
+	if (!fp)
+		goto done;
+
+	if (getline(&buf, &len, fp) <= 0)
+		goto done;
+
+	p = strchr(buf, '\n');
+	if (p)
+		*p = '\0';
+
+	for (i = 0; i < tp->thread_sib; i++) {
+		if (!strcmp(buf, tp->thread_siblings[i]))
+			break;
+	}
+	if (i == tp->thread_sib) {
+		tp->thread_siblings[i] = buf;
+		tp->thread_sib++;
+		buf = NULL;
+	}
+	ret = 0;
+done:
+	if (fp)
+		fclose(fp);
+	free(buf);
+	return ret;
+}
+
+static void free_cpu_topo(struct cpu_topo *tp)
+{
+	u32 i;
+
+	if (!tp)
+		return;
+
+	for (i = 0 ; i < tp->core_sib; i++)
+		free(tp->core_siblings[i]);
+
+	for (i = 0 ; i < tp->thread_sib; i++)
+		free(tp->thread_siblings[i]);
+
+	free(tp);
+}
+
+static struct cpu_topo *build_cpu_topology(void)
+{
+	struct cpu_topo *tp;
+	void *addr;
+	u32 nr, i;
+	size_t sz;
+	long ncpus;
+	int ret = -1;
+
+	ncpus = sysconf(_SC_NPROCESSORS_CONF);
+	if (ncpus < 0)
+		return NULL;
+
+	nr = (u32)(ncpus & UINT_MAX);
+
+	sz = nr * sizeof(char *);
+
+	addr = calloc(1, sizeof(*tp) + 2 * sz);
+	if (!addr)
+		return NULL;
+
+	tp = addr;
+
+	addr += sizeof(*tp);
+	tp->core_siblings = addr;
+	addr += sz;
+	tp->thread_siblings = addr;
+
+	for (i = 0; i < nr; i++) {
+		ret = build_cpu_topo(tp, i);
+		if (ret < 0)
+			break;
+	}
+	if (ret) {
+		free_cpu_topo(tp);
+		tp = NULL;
+	}
+	return tp;
+}
+
+static int write_cpu_topology(int fd, struct perf_header *h __used,
+			  struct perf_evlist *evlist __used)
+{
+	struct cpu_topo *tp;
+	u32 i;
+	int ret;
+
+	tp = build_cpu_topology();
+	if (!tp)
+		return -1;
+
+	ret = do_write(fd, &tp->core_sib, sizeof(tp->core_sib));
+	if (ret < 0)
+		goto done;
+
+	for (i = 0; i < tp->core_sib; i++) {
+		ret = do_write_string(fd, tp->core_siblings[i]);
+		if (ret < 0)
+			goto done;
+	}
+	ret = do_write(fd, &tp->thread_sib, sizeof(tp->thread_sib));
+	if (ret < 0)
+		goto done;
+
+	for (i = 0; i < tp->thread_sib; i++) {
+		ret = do_write_string(fd, tp->thread_siblings[i]);
+		if (ret < 0)
+			break;
+	}
+done:
+	free_cpu_topo(tp);
+	return ret;
+}
+
+
+
+static int write_total_mem(int fd, struct perf_header *h __used,
+			  struct perf_evlist *evlist __used)
+{
+	char *buf = NULL;
+	FILE *fp;
+	size_t len = 0;
+	int ret = -1, n;
+	uint64_t mem;
+
+	fp = fopen("/proc/meminfo", "r");
+	if (!fp)
+		return -1;
+
+	while (getline(&buf, &len, fp) > 0) {
+		ret = strncmp(buf, "MemTotal:", 9);
+		if (!ret)
+			break;
+	}
+	if (!ret) {
+		n = sscanf(buf, "%*s %"PRIu64, &mem);
+		if (n == 1)
+			ret = do_write(fd, &mem, sizeof(mem));
+	}
+	free(buf);
+	fclose(fp);
+	return ret;
+}
+
+static int write_topo_node(int fd, int node)
+{
+	char str[MAXPATHLEN];
+	char field[32];
+	char *buf = NULL, *p;
+	size_t len = 0;
+	FILE *fp;
+	u64 mem_total, mem_free, mem;
+	int ret = -1;
+
+	sprintf(str, "/sys/devices/system/node/node%d/meminfo", node);
+	fp = fopen(str, "r");
+	if (!fp)
+		return -1;
+
+	while (getline(&buf, &len, fp) > 0) {
+		/* skip over invalid lines */
+		if (!strchr(buf, ':'))
+			continue;
+		if (sscanf(buf, "%*s %*d %s %"PRIu64, field, &mem) != 2)
+			goto done;
+		if (!strcmp(field, "MemTotal:"))
+			mem_total = mem;
+		if (!strcmp(field, "MemFree:"))
+			mem_free = mem;
+	}
+
+	fclose(fp);
+
+	ret = do_write(fd, &mem_total, sizeof(u64));
+	if (ret)
+		goto done;
+
+	ret = do_write(fd, &mem_free, sizeof(u64));
+	if (ret)
+		goto done;
+
+	ret = -1;
+	sprintf(str, "/sys/devices/system/node/node%d/cpulist", node);
+
+	fp = fopen(str, "r");
+	if (!fp)
+		goto done;
+
+	if (getline(&buf, &len, fp) <= 0)
+		goto done;
+
+	p = strchr(buf, '\n');
+	if (p)
+		*p = '\0';
+
+	ret = do_write_string(fd, buf);
+done:
+	free(buf);
+	fclose(fp);
+	return ret;
+}
+
+static int write_numa_topology(int fd, struct perf_header *h __used,
+			  struct perf_evlist *evlist __used)
+{
+	char *buf = NULL;
+	size_t len = 0;
+	FILE *fp;
+	struct cpu_map *node_map = NULL;
+	char *c;
+	u32 nr, i, j;
+	int ret = -1;
+
+	fp = fopen("/sys/devices/system/node/online", "r");
+	if (!fp)
+		return -1;
+
+	if (getline(&buf, &len, fp) <= 0)
+		goto done;
+
+	c = strchr(buf, '\n');
+	if (c)
+		*c = '\0';
+
+	node_map = cpu_map__new(buf);
+	if (!node_map)
+		goto done;
+
+	nr = (u32)node_map->nr;
+
+	ret = do_write(fd, &nr, sizeof(nr));
+	if (ret < 0)
+		goto done;
+
+	for (i = 0; i < nr; i++) {
+		j = (u32)node_map->map[i];
+		ret = do_write(fd, &j, sizeof(j));
+		if (ret < 0)
+			break;
+
+		ret = write_topo_node(fd, i);
+		if (ret < 0)
+			break;
+	}
+done:
+	free(buf);
+	fclose(fp);
+	free(node_map);
+	return ret;
+}
+
+/*
+ * default get_cpuid(): nothing gets recorded
+ * actual implementation must be in arch/$(ARCH)/util/header.c
+ */
+int __attribute__((weak)) get_cpuid(char *buffer __used, size_t sz __used)
+{
+	return -1;
+}
+
+static int write_cpuid(int fd, struct perf_header *h __used,
+		       struct perf_evlist *evlist __used)
+{
+	char buffer[64];
+	int ret;
+
+	ret = get_cpuid(buffer, sizeof(buffer));
+	if (!ret)
+		goto write_it;
+
+	return -1;
+write_it:
+	return do_write_string(fd, buffer);
+}
+
+static void print_hostname(struct perf_header *ph, int fd, FILE *fp)
+{
+	char *str = do_read_string(fd, ph);
+	fprintf(fp, "# hostname : %s\n", str);
+	free(str);
+}
+
+static void print_osrelease(struct perf_header *ph, int fd, FILE *fp)
+{
+	char *str = do_read_string(fd, ph);
+	fprintf(fp, "# os release : %s\n", str);
+	free(str);
+}
+
+static void print_arch(struct perf_header *ph, int fd, FILE *fp)
+{
+	char *str = do_read_string(fd, ph);
+	fprintf(fp, "# arch : %s\n", str);
+	free(str);
+}
+
+static void print_cpudesc(struct perf_header *ph, int fd, FILE *fp)
+{
+	char *str = do_read_string(fd, ph);
+	fprintf(fp, "# cpudesc : %s\n", str);
+	free(str);
+}
+
+static void print_nrcpus(struct perf_header *ph, int fd, FILE *fp)
+{
+	ssize_t ret;
+	u32 nr;
+
+	ret = read(fd, &nr, sizeof(nr));
+	if (ret != (ssize_t)sizeof(nr))
+		nr = -1; /* interpreted as error */
+
+	if (ph->needs_swap)
+		nr = bswap_32(nr);
+
+	fprintf(fp, "# nrcpus online : %u\n", nr);
+
+	ret = read(fd, &nr, sizeof(nr));
+	if (ret != (ssize_t)sizeof(nr))
+		nr = -1; /* interpreted as error */
+
+	if (ph->needs_swap)
+		nr = bswap_32(nr);
+
+	fprintf(fp, "# nrcpus avail : %u\n", nr);
+}
+
+static void print_version(struct perf_header *ph, int fd, FILE *fp)
+{
+	char *str = do_read_string(fd, ph);
+	fprintf(fp, "# perf version : %s\n", str);
+	free(str);
+}
+
+static void print_cmdline(struct perf_header *ph, int fd, FILE *fp)
+{
+	ssize_t ret;
+	char *str;
+	u32 nr, i;
+
+	ret = read(fd, &nr, sizeof(nr));
+	if (ret != (ssize_t)sizeof(nr))
+		return;
+
+	if (ph->needs_swap)
+		nr = bswap_32(nr);
+
+	fprintf(fp, "# cmdline : ");
+
+	for (i = 0; i < nr; i++) {
+		str = do_read_string(fd, ph);
+		fprintf(fp, "%s ", str);
+		free(str);
+	}
+	fputc('\n', fp);
+}
+
+static void print_cpu_topology(struct perf_header *ph, int fd, FILE *fp)
+{
+	ssize_t ret;
+	u32 nr, i;
+	char *str;
+
+	ret = read(fd, &nr, sizeof(nr));
+	if (ret != (ssize_t)sizeof(nr))
+		return;
+
+	if (ph->needs_swap)
+		nr = bswap_32(nr);
+
+	for (i = 0; i < nr; i++) {
+		str = do_read_string(fd, ph);
+		fprintf(fp, "# sibling cores   : %s\n", str);
+		free(str);
+	}
+
+	ret = read(fd, &nr, sizeof(nr));
+	if (ret != (ssize_t)sizeof(nr))
+		return;
+
+	if (ph->needs_swap)
+		nr = bswap_32(nr);
+
+	for (i = 0; i < nr; i++) {
+		str = do_read_string(fd, ph);
+		fprintf(fp, "# sibling threads : %s\n", str);
+		free(str);
+	}
+}
+
+static void print_event_desc(struct perf_header *ph, int fd, FILE *fp)
+{
+	struct perf_event_attr attr;
+	uint64_t id;
+	void *buf = NULL;
+	char *str;
+	u32 nre, sz, nr, i, j, msz;
+	int ret;
+
+	/* number of events */
+	ret = read(fd, &nre, sizeof(nre));
+	if (ret != (ssize_t)sizeof(nre))
+		goto error;
+
+	if (ph->needs_swap)
+		nre = bswap_32(nre);
+
+	ret = read(fd, &sz, sizeof(sz));
+	if (ret != (ssize_t)sizeof(sz))
+		goto error;
+
+	if (ph->needs_swap)
+		sz = bswap_32(sz);
+
+	/*
+	 * ensure it is at least as large as our ABI rev
+	 */
+	if (sz < (u32)sizeof(attr))
+		goto error;
+
+	memset(&attr, 0, sizeof(attr));
+
+	/* read entire region to sync up to next field */
+	buf = malloc(sz);
+	if (!buf)
+		goto error;
+
+	msz = sizeof(attr);
+	if (sz < msz)
+		msz = sz;
+
+	for (i = 0 ; i < nre; i++) {
+
+		ret = read(fd, buf, sz);
+		if (ret != (ssize_t)sz)
+			goto error;
+
+		if (ph->needs_swap)
+			perf_event__attr_swap(buf);
+
+		memcpy(&attr, buf, msz);
+
+		ret = read(fd, &nr, sizeof(nr));
+		if (ret != (ssize_t)sizeof(nr))
+			goto error;
+
+		if (ph->needs_swap)
+			nr = bswap_32(nr);
+
+		str = do_read_string(fd, ph);
+		fprintf(fp, "# event : name = %s, ", str);
+		free(str);
+
+		fprintf(fp, "type = %d, config = 0x%"PRIx64
+			    ", config1 = 0x%"PRIx64", config2 = 0x%"PRIx64,
+				attr.type,
+				(u64)attr.config,
+				(u64)attr.config1,
+				(u64)attr.config2);
+
+		fprintf(fp, ", excl_usr = %d, excl_kern = %d",
+				attr.exclude_user,
+				attr.exclude_kernel);
+
+		if (nr)
+			fprintf(fp, ", id = {");
+
+		for (j = 0 ; j < nr; j++) {
+			ret = read(fd, &id, sizeof(id));
+			if (ret != (ssize_t)sizeof(id))
+				goto error;
+
+			if (ph->needs_swap)
+				id = bswap_64(id);
+
+			if (j)
+				fputc(',', fp);
+
+			fprintf(fp, " %"PRIu64, id);
+		}
+		if (nr && j == nr)
+			fprintf(fp, " }");
+		fputc('\n', fp);
+	}
+	free(buf);
+	return;
+error:
+	fprintf(fp, "# event desc: not available or unable to read\n");
+}
+
+static void print_total_mem(struct perf_header *h __used, int fd, FILE *fp)
+{
+	uint64_t mem;
+	ssize_t ret;
+
+	ret = read(fd, &mem, sizeof(mem));
+	if (ret != sizeof(mem))
+		goto error;
+
+	if (h->needs_swap)
+		mem = bswap_64(mem);
+
+	fprintf(fp, "# total memory : %"PRIu64" kB\n", mem);
+	return;
+error:
+	fprintf(fp, "# total memory : unknown\n");
+}
+
+static void print_numa_topology(struct perf_header *h __used, int fd, FILE *fp)
+{
+	ssize_t ret;
+	u32 nr, c, i;
+	char *str;
+	uint64_t mem_total, mem_free;
+
+	/* nr nodes */
+	ret = read(fd, &nr, sizeof(nr));
+	if (ret != (ssize_t)sizeof(nr))
+		goto error;
+
+	if (h->needs_swap)
+		nr = bswap_32(nr);
+
+	for (i = 0; i < nr; i++) {
+
+		/* node number */
+		ret = read(fd, &c, sizeof(c));
+		if (ret != (ssize_t)sizeof(c))
+			goto error;
+
+		if (h->needs_swap)
+			c = bswap_32(c);
+
+		ret = read(fd, &mem_total, sizeof(u64));
+		if (ret != sizeof(u64))
+			goto error;
+
+		ret = read(fd, &mem_free, sizeof(u64));
+		if (ret != sizeof(u64))
+			goto error;
+
+		if (h->needs_swap) {
+			mem_total = bswap_64(mem_total);
+			mem_free = bswap_64(mem_free);
+		}
+
+		fprintf(fp, "# node%u meminfo  : total = %"PRIu64" kB,"
+			    " free = %"PRIu64" kB\n",
+			c,
+			mem_total,
+			mem_free);
+
+		str = do_read_string(fd, h);
+		fprintf(fp, "# node%u cpu list : %s\n", c, str);
+		free(str);
+	}
+	return;
+error:
+	fprintf(fp, "# numa topology : not available\n");
+}
+
+static void print_cpuid(struct perf_header *ph, int fd, FILE *fp)
+{
+	char *str = do_read_string(fd, ph);
+	fprintf(fp, "# cpuid : %s\n", str);
+	free(str);
+}
+
+struct feature_ops {
+	int (*write)(int fd, struct perf_header *h, struct perf_evlist *evlist);
+	void (*print)(struct perf_header *h, int fd, FILE *fp);
+	const char *name;
+	bool full_only;
+};
+
+#define FEAT_OPA(n, w, p) \
+	[n] = { .name = #n, .write = w, .print = p }
+#define FEAT_OPF(n, w, p) \
+	[n] = { .name = #n, .write = w, .print = p, .full_only = true }
+
+static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
+	FEAT_OPA(HEADER_TRACE_INFO, write_trace_info, NULL),
+	FEAT_OPA(HEADER_BUILD_ID, write_build_id, NULL),
+	FEAT_OPA(HEADER_HOSTNAME, write_hostname, print_hostname),
+	FEAT_OPA(HEADER_OSRELEASE, write_osrelease, print_osrelease),
+	FEAT_OPA(HEADER_VERSION, write_version, print_version),
+	FEAT_OPA(HEADER_ARCH, write_arch, print_arch),
+	FEAT_OPA(HEADER_NRCPUS, write_nrcpus, print_nrcpus),
+	FEAT_OPA(HEADER_CPUDESC, write_cpudesc, print_cpudesc),
+	FEAT_OPA(HEADER_CPUID, write_cpuid, print_cpuid),
+	FEAT_OPA(HEADER_TOTAL_MEM, write_total_mem, print_total_mem),
+	FEAT_OPA(HEADER_EVENT_DESC, write_event_desc, print_event_desc),
+	FEAT_OPA(HEADER_CMDLINE, write_cmdline, print_cmdline),
+	FEAT_OPF(HEADER_CPU_TOPOLOGY, write_cpu_topology, print_cpu_topology),
+	FEAT_OPF(HEADER_NUMA_TOPOLOGY, write_numa_topology, print_numa_topology),
+};
+
+struct header_print_data {
+	FILE *fp;
+	bool full; /* extended list of headers */
+};
+
+static int perf_file_section__fprintf_info(struct perf_file_section *section,
+					   struct perf_header *ph,
+					   int feat, int fd, void *data)
+{
+	struct header_print_data *hd = data;
+
+	if (lseek(fd, section->offset, SEEK_SET) == (off_t)-1) {
+		pr_debug("Failed to lseek to %" PRIu64 " offset for feature "
+				"%d, continuing...\n", section->offset, feat);
+		return 0;
+	}
+	if (feat < HEADER_TRACE_INFO || feat >= HEADER_LAST_FEATURE) {
+		pr_warning("unknown feature %d\n", feat);
+		return -1;
+	}
+	if (!feat_ops[feat].print)
+		return 0;
+
+	if (!feat_ops[feat].full_only || hd->full)
+		feat_ops[feat].print(ph, fd, hd->fp);
+	else
+		fprintf(hd->fp, "# %s info available, use -I to display\n",
+			feat_ops[feat].name);
+
+	return 0;
+}
+
+int perf_header__fprintf_info(struct perf_session *session, FILE *fp, bool full)
+{
+	struct header_print_data hd;
+	struct perf_header *header = &session->header;
+	int fd = session->fd;
+	hd.fp = fp;
+	hd.full = full;
+
+	perf_header__process_sections(header, fd, &hd,
+				      perf_file_section__fprintf_info);
+	return 0;
+}
+
 #define dsos__for_each_with_build_id(pos, head)	\
 	list_for_each_entry(pos, head, node)	\
 		if (!pos->has_build_id)		\
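All of the string-valued feature sections added above share one on-disk encoding, implemented by do_write_string()/do_read_string(): a u32 length (the string length including the trailing NUL, rounded up to an alignment boundary) followed by the string padded with zeroes to that length. A tiny sketch of the arithmetic, assuming NAME_ALIGN is 64 as in perf's header code (the macro and the sample value here are illustrative, not copied from the patch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* assumed to match perf's header code; defined locally for this sketch */
#define NAME_ALIGN	64
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	const char *str = "x86_64";			/* e.g. uts.machine from write_arch() */
	uint32_t olen = strlen(str) + 1;		/* 7: includes the '\0' */
	uint32_t len  = ALIGN(olen, NAME_ALIGN);	/* 64: the length stored on disk */

	printf("payload %u bytes, stored length %u, padding %u\n",
	       olen, len, len - olen);
	/* on disk: [u32 len=64]["x86_64\0"][57 zero bytes] */
	return 0;
}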
@@ -267,7 +1289,7 @@
 	if (access(linkname, F_OK))
 		goto out_free;
 
-	if (readlink(linkname, filename, size) < 0)
+	if (readlink(linkname, filename, size - 1) < 0)
 		goto out_free;
 
 	if (unlink(linkname))
@@ -356,15 +1378,41 @@
 	return ret;
 }
 
+static int do_write_feat(int fd, struct perf_header *h, int type,
+			 struct perf_file_section **p,
+			 struct perf_evlist *evlist)
+{
+	int err;
+	int ret = 0;
+
+	if (perf_header__has_feat(h, type)) {
+
+		(*p)->offset = lseek(fd, 0, SEEK_CUR);
+
+		err = feat_ops[type].write(fd, h, evlist);
+		if (err < 0) {
+			pr_debug("failed to write feature %d\n", type);
+
+			/* undo anything written */
+			lseek(fd, (*p)->offset, SEEK_SET);
+
+			return -1;
+		}
+		(*p)->size = lseek(fd, 0, SEEK_CUR) - (*p)->offset;
+		(*p)++;
+	}
+	return ret;
+}
+
 static int perf_header__adds_write(struct perf_header *header,
 				   struct perf_evlist *evlist, int fd)
 {
 	int nr_sections;
 	struct perf_session *session;
-	struct perf_file_section *feat_sec;
+	struct perf_file_section *feat_sec, *p;
 	int sec_size;
 	u64 sec_start;
-	int idx = 0, err;
+	int err;
 
 	session = container_of(header, struct perf_session, header);
 
@@ -376,7 +1424,7 @@
 	if (!nr_sections)
 		return 0;
 
-	feat_sec = calloc(sizeof(*feat_sec), nr_sections);
+	feat_sec = p = calloc(sizeof(*feat_sec), nr_sections);
 	if (feat_sec == NULL)
 		return -ENOMEM;
 
@@ -385,36 +1433,69 @@
 	sec_start = header->data_offset + header->data_size;
 	lseek(fd, sec_start + sec_size, SEEK_SET);
 
-	if (perf_header__has_feat(header, HEADER_TRACE_INFO)) {
-		struct perf_file_section *trace_sec;
+	err = do_write_feat(fd, header, HEADER_TRACE_INFO, &p, evlist);
+	if (err)
+		goto out_free;
 
-		trace_sec = &feat_sec[idx++];
-
-		/* Write trace info */
-		trace_sec->offset = lseek(fd, 0, SEEK_CUR);
-		read_tracing_data(fd, &evlist->entries);
-		trace_sec->size = lseek(fd, 0, SEEK_CUR) - trace_sec->offset;
+	err = do_write_feat(fd, header, HEADER_BUILD_ID, &p, evlist);
+	if (err) {
+		perf_header__clear_feat(header, HEADER_BUILD_ID);
+		goto out_free;
 	}
 
-	if (perf_header__has_feat(header, HEADER_BUILD_ID)) {
-		struct perf_file_section *buildid_sec;
+	err = do_write_feat(fd, header, HEADER_HOSTNAME, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_HOSTNAME);
 
-		buildid_sec = &feat_sec[idx++];
+	err = do_write_feat(fd, header, HEADER_OSRELEASE, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_OSRELEASE);
 
-		/* Write build-ids */
-		buildid_sec->offset = lseek(fd, 0, SEEK_CUR);
-		err = dsos__write_buildid_table(header, fd);
-		if (err < 0) {
-			pr_debug("failed to write buildid table\n");
-			goto out_free;
-		}
-		buildid_sec->size = lseek(fd, 0, SEEK_CUR) -
-					  buildid_sec->offset;
-		if (!no_buildid_cache)
-			perf_session__cache_build_ids(session);
-	}
+	err = do_write_feat(fd, header, HEADER_VERSION, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_VERSION);
+
+	err = do_write_feat(fd, header, HEADER_ARCH, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_ARCH);
+
+	err = do_write_feat(fd, header, HEADER_NRCPUS, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_NRCPUS);
+
+	err = do_write_feat(fd, header, HEADER_CPUDESC, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_CPUDESC);
+
+	err = do_write_feat(fd, header, HEADER_CPUID, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_CPUID);
+
+	err = do_write_feat(fd, header, HEADER_TOTAL_MEM, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_TOTAL_MEM);
+
+	err = do_write_feat(fd, header, HEADER_CMDLINE, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_CMDLINE);
+
+	err = do_write_feat(fd, header, HEADER_EVENT_DESC, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_EVENT_DESC);
+
+	err = do_write_feat(fd, header, HEADER_CPU_TOPOLOGY, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_CPU_TOPOLOGY);
+
+	err = do_write_feat(fd, header, HEADER_NUMA_TOPOLOGY, &p, evlist);
+	if (err)
+		perf_header__clear_feat(header, HEADER_NUMA_TOPOLOGY);
 
 	lseek(fd, sec_start, SEEK_SET);
+	/*
+	 * may write more than needed due to a dropped feature, but
+	 * this is okay: the reader will skip the missing entries
+	 */
 	err = do_write(fd, feat_sec, sec_size);
 	if (err < 0)
 		pr_debug("failed to write feature section\n");
@@ -554,9 +1635,10 @@
 }
 
 int perf_header__process_sections(struct perf_header *header, int fd,
+				  void *data,
 				  int (*process)(struct perf_file_section *section,
-						 struct perf_header *ph,
-						 int feat, int fd))
+				  struct perf_header *ph,
+				  int feat, int fd, void *data))
 {
 	struct perf_file_section *feat_sec;
 	int nr_sections;
@@ -584,7 +1666,7 @@
 		if (perf_header__has_feat(header, feat)) {
 			struct perf_file_section *sec = &feat_sec[idx++];
 
-			err = process(sec, header, feat, fd);
+			err = process(sec, header, feat, fd, data);
 			if (err < 0)
 				break;
 		}
@@ -621,21 +1703,41 @@
 			bitmap_zero(header->adds_features, HEADER_FEAT_BITS);
 		else
 			return -1;
+	} else if (ph->needs_swap) {
+		unsigned int i;
+		/*
+		 * feature bitmap is declared as an array of unsigned longs --
+		 * not good since its size can differ between the host that
+		 * generated the data file and the host analyzing the file.
+		 *
+		 * We need to handle endianness, but we don't know the size of
+		 * the unsigned long where the file was generated. Take a best
+		 * guess at determining it: try 64-bit swap first (i.e., file
+		 * created on a 64-bit host), and check if the hostname feature
+		 * bit is set (this feature bit is forced on as of fbe96f2).
+		 * If the bit is not set, undo the 64-bit swap and try a 32-bit
+		 * swap. If the hostname bit is still not set (e.g., older data
+		 * file), punt and fallback to the original behavior --
+		 * clearing all feature bits and setting buildid.
+		 */
+		for (i = 0; i < BITS_TO_LONGS(HEADER_FEAT_BITS); ++i)
+			header->adds_features[i] = bswap_64(header->adds_features[i]);
+
+		if (!test_bit(HEADER_HOSTNAME, header->adds_features)) {
+			for (i = 0; i < BITS_TO_LONGS(HEADER_FEAT_BITS); ++i) {
+				header->adds_features[i] = bswap_64(header->adds_features[i]);
+				header->adds_features[i] = bswap_32(header->adds_features[i]);
+			}
+		}
+
+		if (!test_bit(HEADER_HOSTNAME, header->adds_features)) {
+			bitmap_zero(header->adds_features, HEADER_FEAT_BITS);
+			set_bit(HEADER_BUILD_ID, header->adds_features);
+		}
 	}
 
 	memcpy(&ph->adds_features, &header->adds_features,
 	       sizeof(ph->adds_features));
-	/*
-	 * FIXME: hack that assumes that if we need swap the perf.data file
-	 * may be coming from an arch with a different word-size, ergo different
-	 * DEFINE_BITMAP format, investigate more later, but for now its mostly
-	 * safe to assume that we have a build-id section. Trace files probably
-	 * have several other issues in this realm anyway...
-	 */
-	if (ph->needs_swap) {
-		memset(&ph->adds_features, 0, sizeof(ph->adds_features));
-		perf_header__set_feat(ph, HEADER_BUILD_ID);
-	}
 
 	ph->event_offset = header->event_types.offset;
 	ph->event_size   = header->event_types.size;
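The comment in the hunk above describes the endian-detection heuristic in prose; the sketch below spells out the intent on a plain uint64_t array. It is not a line-for-line copy of the perf code, which operates on unsigned-long bitmaps through the kernel bitmap helpers; FEAT_WORDS, the bit numbers and the test/set helpers are stand-ins:

#include <byteswap.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define FEAT_WORDS	4	/* illustrative bitmap size */
#define BUILDID_BIT	1	/* stand-in for HEADER_BUILD_ID */
#define HOSTNAME_BIT	2	/* stand-in for HEADER_HOSTNAME */

static bool test_feat_bit(const uint64_t *f, unsigned int bit)
{
	return f[bit / 64] & (1ULL << (bit % 64));
}

static void set_feat_bit(uint64_t *f, unsigned int bit)
{
	f[bit / 64] |= 1ULL << (bit % 64);
}

static void fixup_feature_bitmap(uint64_t *feat, size_t words)
{
	size_t i;

	/* 1) guess: the file came from a 64-bit host of the other endianness */
	for (i = 0; i < words; i++)
		feat[i] = bswap_64(feat[i]);

	if (test_feat_bit(feat, HOSTNAME_BIT))
		return;		/* the guess was right: recent files always set this bit */

	/* 2) undo, then retry treating each word as two 32-bit longs,
	 *    byte-swapping each half in place */
	for (i = 0; i < words; i++) {
		feat[i] = bswap_64(feat[i]);
		feat[i] = ((uint64_t)bswap_32((uint32_t)(feat[i] >> 32)) << 32) |
			   bswap_32((uint32_t)feat[i]);
	}

	/* 3) older file without the hostname bit: punt to build-id only */
	if (!test_feat_bit(feat, HOSTNAME_BIT)) {
		memset(feat, 0, words * sizeof(*feat));
		set_feat_bit(feat, BUILDID_BIT);
	}
}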
@@ -796,7 +1898,7 @@
 
 static int perf_file_section__process(struct perf_file_section *section,
 				      struct perf_header *ph,
-				      int feat, int fd)
+				      int feat, int fd, void *data __used)
 {
 	if (lseek(fd, section->offset, SEEK_SET) == (off_t)-1) {
 		pr_debug("Failed to lseek to %" PRIu64 " offset for feature "
@@ -813,6 +1915,21 @@
 		if (perf_header__read_build_ids(ph, fd, section->offset, section->size))
 			pr_debug("Failed to read buildids, continuing...\n");
 		break;
+
+	case HEADER_HOSTNAME:
+	case HEADER_OSRELEASE:
+	case HEADER_VERSION:
+	case HEADER_ARCH:
+	case HEADER_NRCPUS:
+	case HEADER_CPUDESC:
+	case HEADER_CPUID:
+	case HEADER_TOTAL_MEM:
+	case HEADER_CMDLINE:
+	case HEADER_EVENT_DESC:
+	case HEADER_CPU_TOPOLOGY:
+	case HEADER_NUMA_TOPOLOGY:
+		break;
+
 	default:
 		pr_debug("unknown feature %d, continuing...\n", feat);
 	}
@@ -935,7 +2052,8 @@
 		event_count =  f_header.event_types.size / sizeof(struct perf_trace_event_type);
 	}
 
-	perf_header__process_sections(header, fd, perf_file_section__process);
+	perf_header__process_sections(header, fd, NULL,
+				      perf_file_section__process);
 
 	lseek(fd, header->data_offset, SEEK_SET);
 
@@ -1100,15 +2218,29 @@
 				   struct perf_session *session __unused)
 {
 	union perf_event ev;
+	struct tracing_data *tdata;
 	ssize_t size = 0, aligned_size = 0, padding;
 	int err __used = 0;
 
+	/*
+	 * We are going to store the size of the data followed
+	 * by the data contents. Since the fd is a pipe,
+	 * we cannot seek back to store the size of the data once
+	 * we know it. Instead we:
+	 *
+	 * - write the tracing data to the temp file
+	 * - get/write the data size to pipe
+	 * - write the tracing data from the temp file
+	 *   to the pipe
+	 */
+	tdata = tracing_data_get(&evlist->entries, fd, true);
+	if (!tdata)
+		return -1;
+
 	memset(&ev, 0, sizeof(ev));
 
 	ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA;
-	size = read_tracing_data_size(fd, &evlist->entries);
-	if (size <= 0)
-		return size;
+	size = tdata->size;
 	aligned_size = ALIGN(size, sizeof(u64));
 	padding = aligned_size - size;
 	ev.tracing_data.header.size = sizeof(ev.tracing_data);
@@ -1116,7 +2248,12 @@
 
 	process(&ev, NULL, session);
 
-	err = read_tracing_data(fd, &evlist->entries);
+	/*
+	 * The put function will copy all the tracing data
+	 * stored in temp file to the pipe.
+	 */
+	tracing_data_put(tdata);
+
 	write_padded(fd, NULL, 0, padding);
 
 	return aligned_size;
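The temp-file dance described in the comment above is a general way to get "size, then payload" framing onto a non-seekable descriptor. A minimal sketch using plain stdio (illustrative only; it does not use perf's tracing_data_get()/tracing_data_put() API):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Write "payload size, then payload" to a stream that cannot seek (a pipe):
 * spool the payload to a temporary file first, so its size is known before
 * anything is written to the pipe.
 */
static int write_sized_payload(FILE *pipe_out, const void *data, size_t len)
{
	FILE *tmp = tmpfile();
	char buf[4096];
	uint64_t size;
	size_t n;

	if (!tmp)
		return -1;

	fwrite(data, 1, len, tmp);			/* 1) spool payload to the temp file */
	size = (uint64_t)ftell(tmp);			/* 2) now the size is known */
	rewind(tmp);

	fwrite(&size, sizeof(size), 1, pipe_out);	/* 3) size first ... */
	while ((n = fread(buf, 1, sizeof(buf), tmp)) > 0)
		fwrite(buf, 1, n, pipe_out);		/* ... then the payload */

	fclose(tmp);
	return 0;
}

int main(void)
{
	const char msg[] = "tracing data stand-in";

	/* stdout stands in for the pipe written by perf record */
	return write_sized_payload(stdout, msg, sizeof(msg));
}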
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 1886256..3d5a742 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -12,6 +12,20 @@
 enum {
 	HEADER_TRACE_INFO = 1,
 	HEADER_BUILD_ID,
+
+	HEADER_HOSTNAME,
+	HEADER_OSRELEASE,
+	HEADER_VERSION,
+	HEADER_ARCH,
+	HEADER_NRCPUS,
+	HEADER_CPUDESC,
+	HEADER_CPUID,
+	HEADER_TOTAL_MEM,
+	HEADER_CMDLINE,
+	HEADER_EVENT_DESC,
+	HEADER_CPU_TOPOLOGY,
+	HEADER_NUMA_TOPOLOGY,
+
 	HEADER_LAST_FEATURE,
 };
 
@@ -68,10 +82,15 @@
 void perf_header__clear_feat(struct perf_header *header, int feat);
 bool perf_header__has_feat(const struct perf_header *header, int feat);
 
+int perf_header__set_cmdline(int argc, const char **argv);
+
 int perf_header__process_sections(struct perf_header *header, int fd,
+				  void *data,
 				  int (*process)(struct perf_file_section *section,
-						 struct perf_header *ph,
-						 int feat, int fd));
+				  struct perf_header *ph,
+				  int feat, int fd, void *data));
+
+int perf_header__fprintf_info(struct perf_session *s, FILE *fp, bool full);
 
 int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
 			  const char *name, bool is_kallsyms);
@@ -104,4 +123,10 @@
 				    struct perf_session *session);
 int perf_event__process_build_id(union perf_event *event,
 				 struct perf_session *session);
+
+/*
+ * arch specific callback
+ */
+int get_cpuid(char *buffer, size_t sz);
+
 #endif /* __PERF_HEADER_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 677e1da..f6a9939 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -6,6 +6,11 @@
 #include "sort.h"
 #include <math.h>
 
+static bool hists__filter_entry_by_dso(struct hists *hists,
+				       struct hist_entry *he);
+static bool hists__filter_entry_by_thread(struct hists *hists,
+					  struct hist_entry *he);
+
 enum hist_filter {
 	HIST_FILTER__DSO,
 	HIST_FILTER__THREAD,
@@ -18,56 +23,56 @@
 	.order  = ORDER_CALLEE
 };
 
-u16 hists__col_len(struct hists *self, enum hist_column col)
+u16 hists__col_len(struct hists *hists, enum hist_column col)
 {
-	return self->col_len[col];
+	return hists->col_len[col];
 }
 
-void hists__set_col_len(struct hists *self, enum hist_column col, u16 len)
+void hists__set_col_len(struct hists *hists, enum hist_column col, u16 len)
 {
-	self->col_len[col] = len;
+	hists->col_len[col] = len;
 }
 
-bool hists__new_col_len(struct hists *self, enum hist_column col, u16 len)
+bool hists__new_col_len(struct hists *hists, enum hist_column col, u16 len)
 {
-	if (len > hists__col_len(self, col)) {
-		hists__set_col_len(self, col, len);
+	if (len > hists__col_len(hists, col)) {
+		hists__set_col_len(hists, col, len);
 		return true;
 	}
 	return false;
 }
 
-static void hists__reset_col_len(struct hists *self)
+static void hists__reset_col_len(struct hists *hists)
 {
 	enum hist_column col;
 
 	for (col = 0; col < HISTC_NR_COLS; ++col)
-		hists__set_col_len(self, col, 0);
+		hists__set_col_len(hists, col, 0);
 }
 
-static void hists__calc_col_len(struct hists *self, struct hist_entry *h)
+static void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 {
 	u16 len;
 
 	if (h->ms.sym)
-		hists__new_col_len(self, HISTC_SYMBOL, h->ms.sym->namelen);
+		hists__new_col_len(hists, HISTC_SYMBOL, h->ms.sym->namelen);
 	else {
 		const unsigned int unresolved_col_width = BITS_PER_LONG / 4;
 
-		if (hists__col_len(self, HISTC_DSO) < unresolved_col_width &&
+		if (hists__col_len(hists, HISTC_DSO) < unresolved_col_width &&
 		    !symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
 		    !symbol_conf.dso_list)
-			hists__set_col_len(self, HISTC_DSO,
+			hists__set_col_len(hists, HISTC_DSO,
 					   unresolved_col_width);
 	}
 
 	len = thread__comm_len(h->thread);
-	if (hists__new_col_len(self, HISTC_COMM, len))
-		hists__set_col_len(self, HISTC_THREAD, len + 6);
+	if (hists__new_col_len(hists, HISTC_COMM, len))
+		hists__set_col_len(hists, HISTC_THREAD, len + 6);
 
 	if (h->ms.map) {
 		len = dso__name_len(h->ms.map->dso);
-		hists__new_col_len(self, HISTC_DSO, len);
+		hists__new_col_len(hists, HISTC_DSO, len);
 	}
 }
 
@@ -92,6 +97,67 @@
 	}
 }
 
+static void hist_entry__decay(struct hist_entry *he)
+{
+	he->period = (he->period * 7) / 8;
+	he->nr_events = (he->nr_events * 7) / 8;
+}
+
+static bool hists__decay_entry(struct hists *hists, struct hist_entry *he)
+{
+	u64 prev_period = he->period;
+
+	if (prev_period == 0)
+		return true;
+
+	hist_entry__decay(he);
+
+	if (!he->filtered)
+		hists->stats.total_period -= prev_period - he->period;
+
+	return he->period == 0;
+}
+
+static void __hists__decay_entries(struct hists *hists, bool zap_user,
+				   bool zap_kernel, bool threaded)
+{
+	struct rb_node *next = rb_first(&hists->entries);
+	struct hist_entry *n;
+
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+		/*
+		 * We may be annotating this, for instance, so keep it here in
+		 * case it gets new samples; we'll eventually free it when
+		 * the user stops browsing and it again gets fully decayed.
+		 */
+		if (((zap_user && n->level == '.') ||
+		     (zap_kernel && n->level != '.') ||
+		     hists__decay_entry(hists, n)) &&
+		    !n->used) {
+			rb_erase(&n->rb_node, &hists->entries);
+
+			if (sort__need_collapse || threaded)
+				rb_erase(&n->rb_node_in, &hists->entries_collapsed);
+
+			hist_entry__free(n);
+			--hists->nr_entries;
+		}
+	}
+}
+
+void hists__decay_entries(struct hists *hists, bool zap_user, bool zap_kernel)
+{
+	return __hists__decay_entries(hists, zap_user, zap_kernel, false);
+}
+
+void hists__decay_entries_threaded(struct hists *hists,
+				   bool zap_user, bool zap_kernel)
+{
+	return __hists__decay_entries(hists, zap_user, zap_kernel, true);
+}
+
 /*
  * histogram, sorted on item, collects periods
  */
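For a sense of the decay rate used by hist_entry__decay() above: multiplying by 7/8 on every refresh roughly halves an idle entry's period every five refreshes, and with integer arithmetic it reaches zero after a few dozen refreshes, at which point hists__decay_entry() reports it as prunable. A throwaway sketch (the starting period is arbitrary):

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t period = 1000;		/* illustrative starting period */
	int refresh;

	for (refresh = 1; period; refresh++) {
		period = period * 7 / 8;	/* same factor as hist_entry__decay() */
		printf("after refresh %2d: period = %" PRIu64 "\n", refresh, period);
	}
	printf("entry fully decayed; it would be pruned unless still in use\n");
	return 0;
}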
@@ -113,11 +179,12 @@
 	return self;
 }
 
-static void hists__inc_nr_entries(struct hists *self, struct hist_entry *h)
+static void hists__inc_nr_entries(struct hists *hists, struct hist_entry *h)
 {
 	if (!h->filtered) {
-		hists__calc_col_len(self, h);
-		++self->nr_entries;
+		hists__calc_col_len(hists, h);
+		++hists->nr_entries;
+		hists->stats.total_period += h->period;
 	}
 }
 
@@ -128,11 +195,11 @@
 	return 0;
 }
 
-struct hist_entry *__hists__add_entry(struct hists *self,
+struct hist_entry *__hists__add_entry(struct hists *hists,
 				      struct addr_location *al,
 				      struct symbol *sym_parent, u64 period)
 {
-	struct rb_node **p = &self->entries.rb_node;
+	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct hist_entry *he;
 	struct hist_entry entry = {
@@ -150,9 +217,13 @@
 	};
 	int cmp;
 
+	pthread_mutex_lock(&hists->lock);
+
+	p = &hists->entries_in->rb_node;
+
 	while (*p != NULL) {
 		parent = *p;
-		he = rb_entry(parent, struct hist_entry, rb_node);
+		he = rb_entry(parent, struct hist_entry, rb_node_in);
 
 		cmp = hist_entry__cmp(&entry, he);
 
@@ -170,12 +241,14 @@
 
 	he = hist_entry__new(&entry);
 	if (!he)
-		return NULL;
-	rb_link_node(&he->rb_node, parent, p);
-	rb_insert_color(&he->rb_node, &self->entries);
-	hists__inc_nr_entries(self, he);
+		goto out_unlock;
+
+	rb_link_node(&he->rb_node_in, parent, p);
+	rb_insert_color(&he->rb_node_in, hists->entries_in);
 out:
 	hist_entry__add_cpumode_period(he, al->cpumode, period);
+out_unlock:
+	pthread_mutex_unlock(&hists->lock);
 	return he;
 }
 
@@ -222,7 +295,7 @@
  * collapse the histogram
  */
 
-static bool hists__collapse_insert_entry(struct hists *self,
+static bool hists__collapse_insert_entry(struct hists *hists,
 					 struct rb_root *root,
 					 struct hist_entry *he)
 {
@@ -233,15 +306,16 @@
 
 	while (*p != NULL) {
 		parent = *p;
-		iter = rb_entry(parent, struct hist_entry, rb_node);
+		iter = rb_entry(parent, struct hist_entry, rb_node_in);
 
 		cmp = hist_entry__collapse(iter, he);
 
 		if (!cmp) {
 			iter->period += he->period;
+			iter->nr_events += he->nr_events;
 			if (symbol_conf.use_callchain) {
-				callchain_cursor_reset(&self->callchain_cursor);
-				callchain_merge(&self->callchain_cursor, iter->callchain,
+				callchain_cursor_reset(&hists->callchain_cursor);
+				callchain_merge(&hists->callchain_cursor, iter->callchain,
 						he->callchain);
 			}
 			hist_entry__free(he);
@@ -254,35 +328,70 @@
 			p = &(*p)->rb_right;
 	}
 
-	rb_link_node(&he->rb_node, parent, p);
-	rb_insert_color(&he->rb_node, root);
+	rb_link_node(&he->rb_node_in, parent, p);
+	rb_insert_color(&he->rb_node_in, root);
 	return true;
 }
 
-void hists__collapse_resort(struct hists *self)
+static struct rb_root *hists__get_rotate_entries_in(struct hists *hists)
 {
-	struct rb_root tmp;
+	struct rb_root *root;
+
+	pthread_mutex_lock(&hists->lock);
+
+	root = hists->entries_in;
+	if (++hists->entries_in > &hists->entries_in_array[1])
+		hists->entries_in = &hists->entries_in_array[0];
+
+	pthread_mutex_unlock(&hists->lock);
+
+	return root;
+}
+
+static void hists__apply_filters(struct hists *hists, struct hist_entry *he)
+{
+	hists__filter_entry_by_dso(hists, he);
+	hists__filter_entry_by_thread(hists, he);
+}
+
+static void __hists__collapse_resort(struct hists *hists, bool threaded)
+{
+	struct rb_root *root;
 	struct rb_node *next;
 	struct hist_entry *n;
 
-	if (!sort__need_collapse)
+	if (!sort__need_collapse && !threaded)
 		return;
 
-	tmp = RB_ROOT;
-	next = rb_first(&self->entries);
-	self->nr_entries = 0;
-	hists__reset_col_len(self);
+	root = hists__get_rotate_entries_in(hists);
+	next = rb_first(root);
+	hists->stats.total_period = 0;
 
 	while (next) {
-		n = rb_entry(next, struct hist_entry, rb_node);
-		next = rb_next(&n->rb_node);
+		n = rb_entry(next, struct hist_entry, rb_node_in);
+		next = rb_next(&n->rb_node_in);
 
-		rb_erase(&n->rb_node, &self->entries);
-		if (hists__collapse_insert_entry(self, &tmp, n))
-			hists__inc_nr_entries(self, n);
+		rb_erase(&n->rb_node_in, root);
+		if (hists__collapse_insert_entry(hists, &hists->entries_collapsed, n)) {
+			/*
+			 * If it wasn't combined with one of the entries already
+			 * collapsed, we need to apply the filters that may have
+			 * been set by, say, the hist_browser.
+			 */
+			hists__apply_filters(hists, n);
+			hists__inc_nr_entries(hists, n);
+		}
 	}
+}
 
-	self->entries = tmp;
+void hists__collapse_resort(struct hists *hists)
+{
+	return __hists__collapse_resort(hists, false);
+}
+
+void hists__collapse_resort_threaded(struct hists *hists)
+{
+	return __hists__collapse_resort(hists, true);
 }
 
 /*
@@ -315,31 +424,43 @@
 	rb_insert_color(&he->rb_node, entries);
 }
 
-void hists__output_resort(struct hists *self)
+static void __hists__output_resort(struct hists *hists, bool threaded)
 {
-	struct rb_root tmp;
+	struct rb_root *root;
 	struct rb_node *next;
 	struct hist_entry *n;
 	u64 min_callchain_hits;
 
-	min_callchain_hits = self->stats.total_period * (callchain_param.min_percent / 100);
+	min_callchain_hits = hists->stats.total_period * (callchain_param.min_percent / 100);
 
-	tmp = RB_ROOT;
-	next = rb_first(&self->entries);
+	if (sort__need_collapse || threaded)
+		root = &hists->entries_collapsed;
+	else
+		root = hists->entries_in;
 
-	self->nr_entries = 0;
-	hists__reset_col_len(self);
+	next = rb_first(root);
+	hists->entries = RB_ROOT;
+
+	hists->nr_entries = 0;
+	hists__reset_col_len(hists);
 
 	while (next) {
-		n = rb_entry(next, struct hist_entry, rb_node);
-		next = rb_next(&n->rb_node);
+		n = rb_entry(next, struct hist_entry, rb_node_in);
+		next = rb_next(&n->rb_node_in);
 
-		rb_erase(&n->rb_node, &self->entries);
-		__hists__insert_output_entry(&tmp, n, min_callchain_hits);
-		hists__inc_nr_entries(self, n);
+		__hists__insert_output_entry(&hists->entries, n, min_callchain_hits);
+		hists__inc_nr_entries(hists, n);
 	}
+}
 
-	self->entries = tmp;
+void hists__output_resort(struct hists *hists)
+{
+	return __hists__output_resort(hists, false);
+}
+
+void hists__output_resort_threaded(struct hists *hists)
+{
+	return __hists__output_resort(hists, true);
 }
 
 static size_t callchain__fprintf_left_margin(FILE *fp, int left_margin)
@@ -594,12 +715,27 @@
 	return ret;
 }
 
-int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size,
-			 struct hists *hists, struct hists *pair_hists,
-			 bool show_displacement, long displacement,
-			 bool color, u64 session_total)
+void hists__output_recalc_col_len(struct hists *hists, int max_rows)
 {
-	struct sort_entry *se;
+	struct rb_node *next = rb_first(&hists->entries);
+	struct hist_entry *n;
+	int row = 0;
+
+	hists__reset_col_len(hists);
+
+	while (next && row++ < max_rows) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		if (!n->filtered)
+			hists__calc_col_len(hists, n);
+		next = rb_next(&n->rb_node);
+	}
+}
+
+static int hist_entry__pcnt_snprintf(struct hist_entry *self, char *s,
+				     size_t size, struct hists *pair_hists,
+				     bool show_displacement, long displacement,
+				     bool color, u64 session_total)
+{
 	u64 period, total, period_sys, period_us, period_guest_sys, period_guest_us;
 	u64 nr_events;
 	const char *sep = symbol_conf.field_sep;
@@ -664,6 +800,13 @@
 			ret += snprintf(s + ret, size - ret, "%11" PRIu64, nr_events);
 	}
 
+	if (symbol_conf.show_total_period) {
+		if (sep)
+			ret += snprintf(s + ret, size - ret, "%c%" PRIu64, *sep, period);
+		else
+			ret += snprintf(s + ret, size - ret, " %12" PRIu64, period);
+	}
+
 	if (pair_hists) {
 		char bf[32];
 		double old_percent = 0, new_percent = 0, diff;
@@ -698,26 +841,42 @@
 		}
 	}
 
+	return ret;
+}
+
+int hist_entry__snprintf(struct hist_entry *he, char *s, size_t size,
+			 struct hists *hists)
+{
+	const char *sep = symbol_conf.field_sep;
+	struct sort_entry *se;
+	int ret = 0;
+
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		if (se->elide)
 			continue;
 
 		ret += snprintf(s + ret, size - ret, "%s", sep ?: "  ");
-		ret += se->se_snprintf(self, s + ret, size - ret,
+		ret += se->se_snprintf(he, s + ret, size - ret,
 				       hists__col_len(hists, se->se_width_idx));
 	}
 
 	return ret;
 }
 
-int hist_entry__fprintf(struct hist_entry *self, struct hists *hists,
+int hist_entry__fprintf(struct hist_entry *he, size_t size, struct hists *hists,
 			struct hists *pair_hists, bool show_displacement,
 			long displacement, FILE *fp, u64 session_total)
 {
 	char bf[512];
-	hist_entry__snprintf(self, bf, sizeof(bf), hists, pair_hists,
-			     show_displacement, displacement,
-			     true, session_total);
+	int ret;
+
+	if (size == 0 || size > sizeof(bf))
+		size = sizeof(bf);
+
+	ret = hist_entry__pcnt_snprintf(he, bf, size, pair_hists,
+					show_displacement, displacement,
+					true, session_total);
+	hist_entry__snprintf(he, bf + ret, size - ret, hists);
 	return fprintf(fp, "%s\n", bf);
 }
 
@@ -738,8 +897,9 @@
 					     left_margin);
 }
 
-size_t hists__fprintf(struct hists *self, struct hists *pair,
-		      bool show_displacement, FILE *fp)
+size_t hists__fprintf(struct hists *hists, struct hists *pair,
+		      bool show_displacement, bool show_header, int max_rows,
+		      int max_cols, FILE *fp)
 {
 	struct sort_entry *se;
 	struct rb_node *nd;
@@ -749,9 +909,13 @@
 	unsigned int width;
 	const char *sep = symbol_conf.field_sep;
 	const char *col_width = symbol_conf.col_width_list_str;
+	int nr_rows = 0;
 
 	init_rem_hits();
 
+	if (!show_header)
+		goto print_entries;
+
 	fprintf(fp, "# %s", pair ? "Baseline" : "Overhead");
 
 	if (symbol_conf.show_nr_samples) {
@@ -761,6 +925,13 @@
 			fputs("  Samples  ", fp);
 	}
 
+	if (symbol_conf.show_total_period) {
+		if (sep)
+			ret += fprintf(fp, "%cPeriod", *sep);
+		else
+			ret += fprintf(fp, "   Period    ");
+	}
+
 	if (symbol_conf.show_cpu_utilization) {
 		if (sep) {
 			ret += fprintf(fp, "%csys", *sep);
@@ -803,18 +974,21 @@
 		width = strlen(se->se_header);
 		if (symbol_conf.col_width_list_str) {
 			if (col_width) {
-				hists__set_col_len(self, se->se_width_idx,
+				hists__set_col_len(hists, se->se_width_idx,
 						   atoi(col_width));
 				col_width = strchr(col_width, ',');
 				if (col_width)
 					++col_width;
 			}
 		}
-		if (!hists__new_col_len(self, se->se_width_idx, width))
-			width = hists__col_len(self, se->se_width_idx);
+		if (!hists__new_col_len(hists, se->se_width_idx, width))
+			width = hists__col_len(hists, se->se_width_idx);
 		fprintf(fp, "  %*s", width, se->se_header);
 	}
+
 	fprintf(fp, "\n");
+	if (max_rows && ++nr_rows >= max_rows)
+		goto out;
 
 	if (sep)
 		goto print_entries;
@@ -822,6 +996,8 @@
 	fprintf(fp, "# ........");
 	if (symbol_conf.show_nr_samples)
 		fprintf(fp, " ..........");
+	if (symbol_conf.show_total_period)
+		fprintf(fp, " ............");
 	if (pair) {
 		fprintf(fp, " ..........");
 		if (show_displacement)
@@ -834,17 +1010,23 @@
 			continue;
 
 		fprintf(fp, "  ");
-		width = hists__col_len(self, se->se_width_idx);
+		width = hists__col_len(hists, se->se_width_idx);
 		if (width == 0)
 			width = strlen(se->se_header);
 		for (i = 0; i < width; i++)
 			fprintf(fp, ".");
 	}
 
-	fprintf(fp, "\n#\n");
+	fprintf(fp, "\n");
+	if (max_rows && ++nr_rows >= max_rows)
+		goto out;
+
+	fprintf(fp, "#\n");
+	if (max_rows && ++nr_rows >= max_rows)
+		goto out;
 
 print_entries:
-	for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
 		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
 
 		if (h->filtered)
@@ -858,19 +1040,22 @@
 				displacement = 0;
 			++position;
 		}
-		ret += hist_entry__fprintf(h, self, pair, show_displacement,
-					   displacement, fp, self->stats.total_period);
+		ret += hist_entry__fprintf(h, max_cols, hists, pair, show_displacement,
+					   displacement, fp, hists->stats.total_period);
 
 		if (symbol_conf.use_callchain)
-			ret += hist_entry__fprintf_callchain(h, self, fp,
-							     self->stats.total_period);
+			ret += hist_entry__fprintf_callchain(h, hists, fp,
+							     hists->stats.total_period);
+		if (max_rows && ++nr_rows >= max_rows)
+			goto out;
+
 		if (h->ms.map == NULL && verbose > 1) {
 			__map_groups__fprintf_maps(&h->thread->mg,
 						   MAP__FUNCTION, verbose, fp);
 			fprintf(fp, "%.10s end\n", graph_dotted_line);
 		}
 	}
-
+out:
 	free(rem_sq_bracket);
 
 	return ret;
@@ -879,7 +1064,7 @@
 /*
  * See hists__fprintf to match the column widths
  */
-unsigned int hists__sort_list_width(struct hists *self)
+unsigned int hists__sort_list_width(struct hists *hists)
 {
 	struct sort_entry *se;
 	int ret = 9; /* total % */
@@ -896,9 +1081,12 @@
 	if (symbol_conf.show_nr_samples)
 		ret += 11;
 
+	if (symbol_conf.show_total_period)
+		ret += 13;
+
 	list_for_each_entry(se, &hist_entry__sort_list, list)
 		if (!se->elide)
-			ret += 2 + hists__col_len(self, se->se_width_idx);
+			ret += 2 + hists__col_len(hists, se->se_width_idx);
 
 	if (verbose) /* Addr + origin */
 		ret += 3 + BITS_PER_LONG / 4;
@@ -906,63 +1094,84 @@
 	return ret;
 }
 
-static void hists__remove_entry_filter(struct hists *self, struct hist_entry *h,
+static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h,
 				       enum hist_filter filter)
 {
 	h->filtered &= ~(1 << filter);
 	if (h->filtered)
 		return;
 
-	++self->nr_entries;
+	++hists->nr_entries;
 	if (h->ms.unfolded)
-		self->nr_entries += h->nr_rows;
+		hists->nr_entries += h->nr_rows;
 	h->row_offset = 0;
-	self->stats.total_period += h->period;
-	self->stats.nr_events[PERF_RECORD_SAMPLE] += h->nr_events;
+	hists->stats.total_period += h->period;
+	hists->stats.nr_events[PERF_RECORD_SAMPLE] += h->nr_events;
 
-	hists__calc_col_len(self, h);
+	hists__calc_col_len(hists, h);
 }
 
-void hists__filter_by_dso(struct hists *self, const struct dso *dso)
+
+static bool hists__filter_entry_by_dso(struct hists *hists,
+				       struct hist_entry *he)
+{
+	if (hists->dso_filter != NULL &&
+	    (he->ms.map == NULL || he->ms.map->dso != hists->dso_filter)) {
+		he->filtered |= (1 << HIST_FILTER__DSO);
+		return true;
+	}
+
+	return false;
+}
+
+void hists__filter_by_dso(struct hists *hists)
 {
 	struct rb_node *nd;
 
-	self->nr_entries = self->stats.total_period = 0;
-	self->stats.nr_events[PERF_RECORD_SAMPLE] = 0;
-	hists__reset_col_len(self);
+	hists->nr_entries = hists->stats.total_period = 0;
+	hists->stats.nr_events[PERF_RECORD_SAMPLE] = 0;
+	hists__reset_col_len(hists);
 
-	for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
 		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
 
 		if (symbol_conf.exclude_other && !h->parent)
 			continue;
 
-		if (dso != NULL && (h->ms.map == NULL || h->ms.map->dso != dso)) {
-			h->filtered |= (1 << HIST_FILTER__DSO);
+		if (hists__filter_entry_by_dso(hists, h))
 			continue;
-		}
 
-		hists__remove_entry_filter(self, h, HIST_FILTER__DSO);
+		hists__remove_entry_filter(hists, h, HIST_FILTER__DSO);
 	}
 }
 
-void hists__filter_by_thread(struct hists *self, const struct thread *thread)
+static bool hists__filter_entry_by_thread(struct hists *hists,
+					  struct hist_entry *he)
+{
+	if (hists->thread_filter != NULL &&
+	    he->thread != hists->thread_filter) {
+		he->filtered |= (1 << HIST_FILTER__THREAD);
+		return true;
+	}
+
+	return false;
+}
+
+void hists__filter_by_thread(struct hists *hists)
 {
 	struct rb_node *nd;
 
-	self->nr_entries = self->stats.total_period = 0;
-	self->stats.nr_events[PERF_RECORD_SAMPLE] = 0;
-	hists__reset_col_len(self);
+	hists->nr_entries = hists->stats.total_period = 0;
+	hists->stats.nr_events[PERF_RECORD_SAMPLE] = 0;
+	hists__reset_col_len(hists);
 
-	for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
 		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
 
-		if (thread != NULL && h->thread != thread) {
-			h->filtered |= (1 << HIST_FILTER__THREAD);
+		if (hists__filter_entry_by_thread(hists, h))
 			continue;
-		}
 
-		hists__remove_entry_filter(self, h, HIST_FILTER__THREAD);
+		hists__remove_entry_filter(hists, h, HIST_FILTER__THREAD);
 	}
 }
 
@@ -976,13 +1185,13 @@
 	return symbol__annotate(he->ms.sym, he->ms.map, privsize);
 }
 
-void hists__inc_nr_events(struct hists *self, u32 type)
+void hists__inc_nr_events(struct hists *hists, u32 type)
 {
-	++self->stats.nr_events[0];
-	++self->stats.nr_events[type];
+	++hists->stats.nr_events[0];
+	++hists->stats.nr_events[type];
 }
 
-size_t hists__fprintf_nr_events(struct hists *self, FILE *fp)
+size_t hists__fprintf_nr_events(struct hists *hists, FILE *fp)
 {
 	int i;
 	size_t ret = 0;
@@ -990,7 +1199,7 @@
 	for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
 		const char *name;
 
-		if (self->stats.nr_events[i] == 0)
+		if (hists->stats.nr_events[i] == 0)
 			continue;
 
 		name = perf_event__name(i);
@@ -998,8 +1207,18 @@
 			continue;
 
 		ret += fprintf(fp, "%16s events: %10d\n", name,
-			       self->stats.nr_events[i]);
+			       hists->stats.nr_events[i]);
 	}
 
 	return ret;
 }
+
+void hists__init(struct hists *hists)
+{
+	memset(hists, 0, sizeof(*hists));
+	hists->entries_in_array[0] = hists->entries_in_array[1] = RB_ROOT;
+	hists->entries_in = &hists->entries_in_array[0];
+	hists->entries_collapsed = RB_ROOT;
+	hists->entries = RB_ROOT;
+	pthread_mutex_init(&hists->lock, NULL);
+}
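
The hist.c changes above double-buffer the input: new hist entries are linked into whichever tree hists->entries_in points at, and __hists__collapse_resort() calls hists__get_rotate_entries_in() to swap that pointer to the other rb_root under hists->lock, so it can drain the previously active tree while new samples keep arriving. The standalone sketch below illustrates only that rotation pattern; it is not perf code -- the twobuf type, the function names and the linked list standing in for the rbtree are invented for the illustration.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	struct item *next;
	int value;
};

struct twobuf {
	struct item *bufs[2];	/* the two input buffers */
	struct item **in;	/* producers prepend to *in */
	pthread_mutex_t lock;	/* serializes inserts and the rotation */
};

static void twobuf_init(struct twobuf *tb)
{
	tb->bufs[0] = tb->bufs[1] = NULL;
	tb->in = &tb->bufs[0];
	pthread_mutex_init(&tb->lock, NULL);
}

/* Producer side, analogous to __hists__add_entry() taking hists->lock. */
static void twobuf_add(struct twobuf *tb, int value)
{
	struct item *it = malloc(sizeof(*it));

	if (!it)
		return;

	it->value = value;
	pthread_mutex_lock(&tb->lock);
	it->next = *tb->in;
	*tb->in = it;
	pthread_mutex_unlock(&tb->lock);
}

/*
 * Consumer side, analogous to hists__get_rotate_entries_in(): detach the
 * buffer that was being filled and point producers at the other one.
 */
static struct item *twobuf_rotate(struct twobuf *tb)
{
	struct item *full;

	pthread_mutex_lock(&tb->lock);
	full = *tb->in;
	*tb->in = NULL;
	tb->in = (tb->in == &tb->bufs[0]) ? &tb->bufs[1] : &tb->bufs[0];
	pthread_mutex_unlock(&tb->lock);

	return full;
}

int main(void)
{
	struct twobuf tb;
	struct item *it, *next;

	twobuf_init(&tb);
	twobuf_add(&tb, 1);
	twobuf_add(&tb, 2);

	/* Drain the detached buffer without holding the lock. */
	for (it = twobuf_rotate(&tb); it; it = next) {
		next = it->next;
		printf("collapsing entry %d\n", it->value);
		free(it);
	}
	return 0;
}
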
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 3beb97c..ff93ddc 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -2,6 +2,7 @@
 #define __PERF_HIST_H
 
 #include <linux/types.h>
+#include <pthread.h>
 #include "callchain.h"
 
 extern struct callchain_param callchain_param;
@@ -42,9 +43,18 @@
 	HISTC_NR_COLS, /* Last entry */
 };
 
+struct thread;
+struct dso;
+
 struct hists {
+	struct rb_root		entries_in_array[2];
+	struct rb_root		*entries_in;
 	struct rb_root		entries;
+	struct rb_root		entries_collapsed;
 	u64			nr_entries;
+	const struct thread	*thread_filter;
+	const struct dso	*dso_filter;
+	pthread_mutex_t		lock;
 	struct events_stats	stats;
 	u64			event_stream;
 	u16			col_len[HISTC_NR_COLS];
@@ -52,34 +62,42 @@
 	struct callchain_cursor	callchain_cursor;
 };
 
+void hists__init(struct hists *hists);
+
 struct hist_entry *__hists__add_entry(struct hists *self,
 				      struct addr_location *al,
 				      struct symbol *parent, u64 period);
 extern int64_t hist_entry__cmp(struct hist_entry *, struct hist_entry *);
 extern int64_t hist_entry__collapse(struct hist_entry *, struct hist_entry *);
-int hist_entry__fprintf(struct hist_entry *self, struct hists *hists,
+int hist_entry__fprintf(struct hist_entry *he, size_t size, struct hists *hists,
 			struct hists *pair_hists, bool show_displacement,
-			long displacement, FILE *fp, u64 total);
+			long displacement, FILE *fp, u64 session_total);
 int hist_entry__snprintf(struct hist_entry *self, char *bf, size_t size,
-			 struct hists *hists, struct hists *pair_hists,
-			 bool show_displacement, long displacement,
-			 bool color, u64 total);
+			 struct hists *hists);
 void hist_entry__free(struct hist_entry *);
 
 void hists__output_resort(struct hists *self);
+void hists__output_resort_threaded(struct hists *hists);
 void hists__collapse_resort(struct hists *self);
+void hists__collapse_resort_threaded(struct hists *hists);
+
+void hists__decay_entries(struct hists *hists, bool zap_user, bool zap_kernel);
+void hists__decay_entries_threaded(struct hists *hists, bool zap_user,
+				   bool zap_kernel);
+void hists__output_recalc_col_len(struct hists *hists, int max_rows);
 
 void hists__inc_nr_events(struct hists *self, u32 type);
 size_t hists__fprintf_nr_events(struct hists *self, FILE *fp);
 
 size_t hists__fprintf(struct hists *self, struct hists *pair,
-		      bool show_displacement, FILE *fp);
+		      bool show_displacement, bool show_header,
+		      int max_rows, int max_cols, FILE *fp);
 
 int hist_entry__inc_addr_samples(struct hist_entry *self, int evidx, u64 addr);
 int hist_entry__annotate(struct hist_entry *self, size_t privsize);
 
-void hists__filter_by_dso(struct hists *self, const struct dso *dso);
-void hists__filter_by_thread(struct hists *self, const struct thread *thread);
+void hists__filter_by_dso(struct hists *hists);
+void hists__filter_by_thread(struct hists *hists);
 
 u16 hists__col_len(struct hists *self, enum hist_column col);
 void hists__set_col_len(struct hists *self, enum hist_column col, u16 len);
@@ -90,26 +108,33 @@
 #ifdef NO_NEWT_SUPPORT
 static inline
 int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __used,
-				  const char *help __used)
+				  const char *help __used,
+				  void(*timer)(void *arg) __used,
+				  void *arg __used,
+				  int refresh __used)
 {
 	return 0;
 }
 
 static inline int hist_entry__tui_annotate(struct hist_entry *self __used,
-					   int evidx __used)
+					   int evidx __used,
+					   int nr_events __used,
+					   void(*timer)(void *arg) __used,
+					   void *arg __used,
+					   int delay_secs __used)
 {
 	return 0;
 }
-#define KEY_LEFT -1
-#define KEY_RIGHT -2
+#define K_LEFT -1
+#define K_RIGHT -2
 #else
-#include <newt.h>
-int hist_entry__tui_annotate(struct hist_entry *self, int evidx);
+#include "ui/keysyms.h"
+int hist_entry__tui_annotate(struct hist_entry *he, int evidx, int nr_events,
+			     void(*timer)(void *arg), void *arg, int delay_secs);
 
-#define KEY_LEFT NEWT_KEY_LEFT
-#define KEY_RIGHT NEWT_KEY_RIGHT
-
-int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help);
+int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help,
+				  void(*timer)(void *arg), void *arg,
+				  int refresh);
 #endif
 
 unsigned int hists__sort_list_width(struct hists *self);
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index a16ecab..78284b1 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -18,6 +18,13 @@
 	return strcmp(filename, "//anon") == 0;
 }
 
+static inline int is_no_dso_memory(const char *filename)
+{
+	return !strcmp(filename, "[stack]") ||
+	       !strcmp(filename, "[vdso]")  ||
+	       !strcmp(filename, "[heap]");
+}
+
 void map__init(struct map *self, enum map_type type,
 	       u64 start, u64 end, u64 pgoff, struct dso *dso)
 {
@@ -42,9 +49,10 @@
 	if (self != NULL) {
 		char newfilename[PATH_MAX];
 		struct dso *dso;
-		int anon;
+		int anon, no_dso;
 
 		anon = is_anon_memory(filename);
+		no_dso = is_no_dso_memory(filename);
 
 		if (anon) {
 			snprintf(newfilename, sizeof(newfilename), "/tmp/perf-%d.map", pid);
@@ -57,12 +65,16 @@
 
 		map__init(self, type, start, start + len, pgoff, dso);
 
-		if (anon) {
-set_identity:
+		if (anon || no_dso) {
 			self->map_ip = self->unmap_ip = identity__map_ip;
-		} else if (strcmp(filename, "[vdso]") == 0) {
-			dso__set_loaded(dso, self->type);
-			goto set_identity;
+
+			/*
+			 * Set memory without DSO as loaded. All map__find_*
+			 * functions still return NULL, and we avoid the
+			 * unnecessary map__load warning.
+			 */
+			if (no_dso)
+				dso__set_loaded(dso, self->type);
 		}
 	}
 	return self;
@@ -127,8 +139,8 @@
 
 		if (len > sizeof(DSO__DELETED) &&
 		    strcmp(name + real_len + 1, DSO__DELETED) == 0) {
-			pr_warning("%.*s was updated, restart the long "
-				   "running apps that use it!\n",
+			pr_warning("%.*s was updated (is prelink enabled?). "
+				"Restart the long running apps that use it!\n",
 				   (int)real_len, name);
 		} else {
 			pr_warning("no symbols found in %s, maybe install "
@@ -220,55 +232,55 @@
 	return ip;
 }
 
-void map_groups__init(struct map_groups *self)
+void map_groups__init(struct map_groups *mg)
 {
 	int i;
 	for (i = 0; i < MAP__NR_TYPES; ++i) {
-		self->maps[i] = RB_ROOT;
-		INIT_LIST_HEAD(&self->removed_maps[i]);
+		mg->maps[i] = RB_ROOT;
+		INIT_LIST_HEAD(&mg->removed_maps[i]);
 	}
-	self->machine = NULL;
+	mg->machine = NULL;
 }
 
-static void maps__delete(struct rb_root *self)
+static void maps__delete(struct rb_root *maps)
 {
-	struct rb_node *next = rb_first(self);
+	struct rb_node *next = rb_first(maps);
 
 	while (next) {
 		struct map *pos = rb_entry(next, struct map, rb_node);
 
 		next = rb_next(&pos->rb_node);
-		rb_erase(&pos->rb_node, self);
+		rb_erase(&pos->rb_node, maps);
 		map__delete(pos);
 	}
 }
 
-static void maps__delete_removed(struct list_head *self)
+static void maps__delete_removed(struct list_head *maps)
 {
 	struct map *pos, *n;
 
-	list_for_each_entry_safe(pos, n, self, node) {
+	list_for_each_entry_safe(pos, n, maps, node) {
 		list_del(&pos->node);
 		map__delete(pos);
 	}
 }
 
-void map_groups__exit(struct map_groups *self)
+void map_groups__exit(struct map_groups *mg)
 {
 	int i;
 
 	for (i = 0; i < MAP__NR_TYPES; ++i) {
-		maps__delete(&self->maps[i]);
-		maps__delete_removed(&self->removed_maps[i]);
+		maps__delete(&mg->maps[i]);
+		maps__delete_removed(&mg->removed_maps[i]);
 	}
 }
 
-void map_groups__flush(struct map_groups *self)
+void map_groups__flush(struct map_groups *mg)
 {
 	int type;
 
 	for (type = 0; type < MAP__NR_TYPES; type++) {
-		struct rb_root *root = &self->maps[type];
+		struct rb_root *root = &mg->maps[type];
 		struct rb_node *next = rb_first(root);
 
 		while (next) {
@@ -280,17 +292,17 @@
 			 * instance in some hist_entry instances, so
 			 * just move them to a separate list.
 			 */
-			list_add_tail(&pos->node, &self->removed_maps[pos->type]);
+			list_add_tail(&pos->node, &mg->removed_maps[pos->type]);
 		}
 	}
 }
 
-struct symbol *map_groups__find_symbol(struct map_groups *self,
+struct symbol *map_groups__find_symbol(struct map_groups *mg,
 				       enum map_type type, u64 addr,
 				       struct map **mapp,
 				       symbol_filter_t filter)
 {
-	struct map *map = map_groups__find(self, type, addr);
+	struct map *map = map_groups__find(mg, type, addr);
 
 	if (map != NULL) {
 		if (mapp != NULL)
@@ -301,7 +313,7 @@
 	return NULL;
 }
 
-struct symbol *map_groups__find_symbol_by_name(struct map_groups *self,
+struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg,
 					       enum map_type type,
 					       const char *name,
 					       struct map **mapp,
@@ -309,7 +321,7 @@
 {
 	struct rb_node *nd;
 
-	for (nd = rb_first(&self->maps[type]); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(&mg->maps[type]); nd; nd = rb_next(nd)) {
 		struct map *pos = rb_entry(nd, struct map, rb_node);
 		struct symbol *sym = map__find_symbol_by_name(pos, name, filter);
 
@@ -323,13 +335,13 @@
 	return NULL;
 }
 
-size_t __map_groups__fprintf_maps(struct map_groups *self,
+size_t __map_groups__fprintf_maps(struct map_groups *mg,
 				  enum map_type type, int verbose, FILE *fp)
 {
 	size_t printed = fprintf(fp, "%s:\n", map_type__name[type]);
 	struct rb_node *nd;
 
-	for (nd = rb_first(&self->maps[type]); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(&mg->maps[type]); nd; nd = rb_next(nd)) {
 		struct map *pos = rb_entry(nd, struct map, rb_node);
 		printed += fprintf(fp, "Map:");
 		printed += map__fprintf(pos, fp);
@@ -342,22 +354,22 @@
 	return printed;
 }
 
-size_t map_groups__fprintf_maps(struct map_groups *self, int verbose, FILE *fp)
+size_t map_groups__fprintf_maps(struct map_groups *mg, int verbose, FILE *fp)
 {
 	size_t printed = 0, i;
 	for (i = 0; i < MAP__NR_TYPES; ++i)
-		printed += __map_groups__fprintf_maps(self, i, verbose, fp);
+		printed += __map_groups__fprintf_maps(mg, i, verbose, fp);
 	return printed;
 }
 
-static size_t __map_groups__fprintf_removed_maps(struct map_groups *self,
+static size_t __map_groups__fprintf_removed_maps(struct map_groups *mg,
 						 enum map_type type,
 						 int verbose, FILE *fp)
 {
 	struct map *pos;
 	size_t printed = 0;
 
-	list_for_each_entry(pos, &self->removed_maps[type], node) {
+	list_for_each_entry(pos, &mg->removed_maps[type], node) {
 		printed += fprintf(fp, "Map:");
 		printed += map__fprintf(pos, fp);
 		if (verbose > 1) {
@@ -368,26 +380,26 @@
 	return printed;
 }
 
-static size_t map_groups__fprintf_removed_maps(struct map_groups *self,
+static size_t map_groups__fprintf_removed_maps(struct map_groups *mg,
 					       int verbose, FILE *fp)
 {
 	size_t printed = 0, i;
 	for (i = 0; i < MAP__NR_TYPES; ++i)
-		printed += __map_groups__fprintf_removed_maps(self, i, verbose, fp);
+		printed += __map_groups__fprintf_removed_maps(mg, i, verbose, fp);
 	return printed;
 }
 
-size_t map_groups__fprintf(struct map_groups *self, int verbose, FILE *fp)
+size_t map_groups__fprintf(struct map_groups *mg, int verbose, FILE *fp)
 {
-	size_t printed = map_groups__fprintf_maps(self, verbose, fp);
+	size_t printed = map_groups__fprintf_maps(mg, verbose, fp);
 	printed += fprintf(fp, "Removed maps:\n");
-	return printed + map_groups__fprintf_removed_maps(self, verbose, fp);
+	return printed + map_groups__fprintf_removed_maps(mg, verbose, fp);
 }
 
-int map_groups__fixup_overlappings(struct map_groups *self, struct map *map,
+int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
 				   int verbose, FILE *fp)
 {
-	struct rb_root *root = &self->maps[map->type];
+	struct rb_root *root = &mg->maps[map->type];
 	struct rb_node *next = rb_first(root);
 	int err = 0;
 
@@ -418,7 +430,7 @@
 			}
 
 			before->end = map->start - 1;
-			map_groups__insert(self, before);
+			map_groups__insert(mg, before);
 			if (verbose >= 2)
 				map__fprintf(before, fp);
 		}
@@ -432,7 +444,7 @@
 			}
 
 			after->start = map->end + 1;
-			map_groups__insert(self, after);
+			map_groups__insert(mg, after);
 			if (verbose >= 2)
 				map__fprintf(after, fp);
 		}
@@ -441,7 +453,7 @@
 		 * If we have references, just move them to a separate list.
 		 */
 		if (pos->referenced)
-			list_add_tail(&pos->node, &self->removed_maps[map->type]);
+			list_add_tail(&pos->node, &mg->removed_maps[map->type]);
 		else
 			map__delete(pos);
 
@@ -455,7 +467,7 @@
 /*
  * XXX This should not really _copy_ te maps, but refcount them.
  */
-int map_groups__clone(struct map_groups *self,
+int map_groups__clone(struct map_groups *mg,
 		      struct map_groups *parent, enum map_type type)
 {
 	struct rb_node *nd;
@@ -464,7 +476,7 @@
 		struct map *new = map__clone(map);
 		if (new == NULL)
 			return -ENOMEM;
-		map_groups__insert(self, new);
+		map_groups__insert(mg, new);
 	}
 	return 0;
 }
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index b397c03..890d855 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -123,17 +123,17 @@
 
 void map__reloc_vmlinux(struct map *self);
 
-size_t __map_groups__fprintf_maps(struct map_groups *self,
+size_t __map_groups__fprintf_maps(struct map_groups *mg,
 				  enum map_type type, int verbose, FILE *fp);
 void maps__insert(struct rb_root *maps, struct map *map);
-void maps__remove(struct rb_root *self, struct map *map);
+void maps__remove(struct rb_root *maps, struct map *map);
 struct map *maps__find(struct rb_root *maps, u64 addr);
-void map_groups__init(struct map_groups *self);
-void map_groups__exit(struct map_groups *self);
-int map_groups__clone(struct map_groups *self,
+void map_groups__init(struct map_groups *mg);
+void map_groups__exit(struct map_groups *mg);
+int map_groups__clone(struct map_groups *mg,
 		      struct map_groups *parent, enum map_type type);
-size_t map_groups__fprintf(struct map_groups *self, int verbose, FILE *fp);
-size_t map_groups__fprintf_maps(struct map_groups *self, int verbose, FILE *fp);
+size_t map_groups__fprintf(struct map_groups *mg, int verbose, FILE *fp);
+size_t map_groups__fprintf_maps(struct map_groups *mg, int verbose, FILE *fp);
 
 typedef void (*machine__process_t)(struct machine *self, void *data);
 
@@ -162,29 +162,29 @@
 	return self ? self->pid == HOST_KERNEL_ID : false;
 }
 
-static inline void map_groups__insert(struct map_groups *self, struct map *map)
+static inline void map_groups__insert(struct map_groups *mg, struct map *map)
 {
-	maps__insert(&self->maps[map->type], map);
-	map->groups = self;
+	maps__insert(&mg->maps[map->type], map);
+	map->groups = mg;
 }
 
-static inline void map_groups__remove(struct map_groups *self, struct map *map)
+static inline void map_groups__remove(struct map_groups *mg, struct map *map)
 {
-	maps__remove(&self->maps[map->type], map);
+	maps__remove(&mg->maps[map->type], map);
 }
 
-static inline struct map *map_groups__find(struct map_groups *self,
+static inline struct map *map_groups__find(struct map_groups *mg,
 					   enum map_type type, u64 addr)
 {
-	return maps__find(&self->maps[type], addr);
+	return maps__find(&mg->maps[type], addr);
 }
 
-struct symbol *map_groups__find_symbol(struct map_groups *self,
+struct symbol *map_groups__find_symbol(struct map_groups *mg,
 				       enum map_type type, u64 addr,
 				       struct map **mapp,
 				       symbol_filter_t filter);
 
-struct symbol *map_groups__find_symbol_by_name(struct map_groups *self,
+struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg,
 					       enum map_type type,
 					       const char *name,
 					       struct map **mapp,
@@ -208,11 +208,11 @@
 }
 
 static inline
-struct symbol *map_groups__find_function_by_name(struct map_groups *self,
+struct symbol *map_groups__find_function_by_name(struct map_groups *mg,
 						 const char *name, struct map **mapp,
 						 symbol_filter_t filter)
 {
-	return map_groups__find_symbol_by_name(self, MAP__FUNCTION, name, mapp, filter);
+	return map_groups__find_symbol_by_name(mg, MAP__FUNCTION, name, mapp, filter);
 }
 
 static inline
@@ -225,13 +225,13 @@
 						 filter);
 }
 
-int map_groups__fixup_overlappings(struct map_groups *self, struct map *map,
+int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
 				   int verbose, FILE *fp);
 
-struct map *map_groups__find_by_name(struct map_groups *self,
+struct map *map_groups__find_by_name(struct map_groups *mg,
 				     enum map_type type, const char *name);
 struct map *machine__new_module(struct machine *self, u64 start, const char *filename);
 
-void map_groups__flush(struct map_groups *self);
+void map_groups__flush(struct map_groups *mg);
 
 #endif /* __PERF_MAP_H */
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 1c7bfa5..eb25900 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -1956,8 +1956,10 @@
 
 	pr_debug("Writing event: %s\n", buf);
 	ret = write(fd, buf, strlen(buf));
-	if (ret < 0)
+	if (ret < 0) {
+		ret = -errno;
 		goto error;
+	}
 
 	printf("Remove event: %s\n", ent->s);
 	return 0;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 72458d9..20e011c 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1326,3 +1326,22 @@
 
 	return 0;
 }
+
+void perf_session__fprintf_info(struct perf_session *session, FILE *fp,
+				bool full)
+{
+	struct stat st;
+	int ret;
+
+	if (session == NULL || fp == NULL)
+		return;
+
+	ret = fstat(session->fd, &st);
+	if (ret == -1)
+		return;
+
+	fprintf(fp, "# ========\n");
+	fprintf(fp, "# captured on: %s", ctime(&st.st_ctime));
+	perf_header__fprintf_info(session, fp, full);
+	fprintf(fp, "# ========\n#\n");
+}
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 974d0cb..514b06d 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -177,4 +177,5 @@
 int perf_session__cpu_bitmap(struct perf_session *session,
 			     const char *cpu_list, unsigned long *cpu_bitmap);
 
+void perf_session__fprintf_info(struct perf_session *s, FILE *fp, bool full);
 #endif /* __PERF_SESSION_H */
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 1ee8f1e..16da30d 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -177,7 +177,9 @@
 				       BITS_PER_LONG / 4, self->ip, o);
 	}
 
-	ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", self->level);
+	if (!sort_dso.elide)
+		ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", self->level);
+
 	if (self->ms.sym)
 		ret += repsep_snprintf(bf + ret, size - ret, "%s",
 				       self->ms.sym->name);
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 77d0388..3f67ae3 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -45,6 +45,7 @@
  * @nr_rows - rows expanded in callchain, recalculated on folding/unfolding
  */
 struct hist_entry {
+	struct rb_node		rb_node_in;
 	struct rb_node		rb_node;
 	u64			period;
 	u64			period_sys;
@@ -63,6 +64,7 @@
 
 	bool			init_have_children;
 	char			level;
+	bool			used;
 	u8			filtered;
 	struct symbol		*parent;
 	union {
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 40eeaf0..632b50c 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -24,7 +24,7 @@
 #include <sys/utsname.h>
 
 #ifndef KSYM_NAME_LEN
-#define KSYM_NAME_LEN 128
+#define KSYM_NAME_LEN 256
 #endif
 
 #ifndef NT_GNU_BUILD_ID
@@ -46,6 +46,7 @@
 	.exclude_other	  = true,
 	.use_modules	  = true,
 	.try_vmlinux_path = true,
+	.annotate_src	  = true,
 	.symfs            = "",
 };
 
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 4f377d9..29f8d74 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -72,11 +72,14 @@
 			use_modules,
 			sort_by_name,
 			show_nr_samples,
+			show_total_period,
 			use_callchain,
 			exclude_other,
 			show_cpu_utilization,
 			initialized,
-			kptr_restrict;
+			kptr_restrict,
+			annotate_asm_raw,
+			annotate_src;
 	const char	*vmlinux_name,
 			*kallsyms_name,
 			*source_prefix,
diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c
index a11f607..500471d 100644
--- a/tools/perf/util/top.c
+++ b/tools/perf/util/top.c
@@ -15,52 +15,6 @@
 #include "top.h"
 #include <inttypes.h>
 
-/*
- * Ordering weight: count-1 * count-2 * ... / count-n
- */
-static double sym_weight(const struct sym_entry *sym, struct perf_top *top)
-{
-	double weight = sym->snap_count;
-	int counter;
-
-	if (!top->display_weighted)
-		return weight;
-
-	for (counter = 1; counter < top->evlist->nr_entries - 1; counter++)
-		weight *= sym->count[counter];
-
-	weight /= (sym->count[counter] + 1);
-
-	return weight;
-}
-
-static void perf_top__remove_active_sym(struct perf_top *top, struct sym_entry *syme)
-{
-	pthread_mutex_lock(&top->active_symbols_lock);
-	list_del_init(&syme->node);
-	pthread_mutex_unlock(&top->active_symbols_lock);
-}
-
-static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
-{
-	struct rb_node **p = &tree->rb_node;
-	struct rb_node *parent = NULL;
-	struct sym_entry *iter;
-
-	while (*p != NULL) {
-		parent = *p;
-		iter = rb_entry(parent, struct sym_entry, rb_node);
-
-		if (se->weight > iter->weight)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	rb_link_node(&se->rb_node, parent, p);
-	rb_insert_color(&se->rb_node, tree);
-}
-
 #define SNPRINTF(buf, size, fmt, args...) \
 ({ \
 	size_t r = snprintf(buf, size, fmt, ## args); \
@@ -69,7 +23,6 @@
 
 size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size)
 {
-	struct perf_evsel *counter;
 	float samples_per_sec = top->samples / top->delay_secs;
 	float ksamples_per_sec = top->kernel_samples / top->delay_secs;
 	float esamples_percent = (100.0 * top->exact_samples) / top->samples;
@@ -104,7 +57,7 @@
 			       esamples_percent);
 	}
 
-	if (top->evlist->nr_entries == 1 || !top->display_weighted) {
+	if (top->evlist->nr_entries == 1) {
 		struct perf_evsel *first;
 		first = list_entry(top->evlist->entries.next, struct perf_evsel, node);
 		ret += SNPRINTF(bf + ret, size - ret, "%" PRIu64 "%s ",
@@ -112,27 +65,7 @@
 				top->freq ? "Hz" : "");
 	}
 
-	if (!top->display_weighted) {
-		ret += SNPRINTF(bf + ret, size - ret, "%s",
-				event_name(top->sym_evsel));
-	} else {
-		/*
-		 * Don't let events eat all the space. Leaving 30 bytes
-		 * for the rest should be enough.
-		 */
-		size_t last_pos = size - 30;
-
-		list_for_each_entry(counter, &top->evlist->entries, node) {
-			ret += SNPRINTF(bf + ret, size - ret, "%s%s",
-					counter->idx ? "/" : "",
-					event_name(counter));
-			if (ret > last_pos) {
-				sprintf(bf + last_pos - 3, "..");
-				ret = last_pos - 1;
-				break;
-			}
-		}
-	}
+	ret += SNPRINTF(bf + ret, size - ret, "%s", event_name(top->sym_evsel));
 
 	ret += SNPRINTF(bf + ret, size - ret, "], ");
 
@@ -166,73 +99,3 @@
 	top->exact_samples = top->guest_kernel_samples =
 	top->guest_us_samples = 0;
 }
-
-float perf_top__decay_samples(struct perf_top *top, struct rb_root *root)
-{
-	struct sym_entry *syme, *n;
-	float sum_ksamples = 0.0;
-	int snap = !top->display_weighted ? top->sym_evsel->idx : 0, j;
-
-	/* Sort the active symbols */
-	pthread_mutex_lock(&top->active_symbols_lock);
-	syme = list_entry(top->active_symbols.next, struct sym_entry, node);
-	pthread_mutex_unlock(&top->active_symbols_lock);
-
-	top->rb_entries = 0;
-	list_for_each_entry_safe_from(syme, n, &top->active_symbols, node) {
-		syme->snap_count = syme->count[snap];
-		if (syme->snap_count != 0) {
-
-			if ((top->hide_user_symbols &&
-			     syme->map->dso->kernel == DSO_TYPE_USER) ||
-			    (top->hide_kernel_symbols &&
-			     syme->map->dso->kernel == DSO_TYPE_KERNEL)) {
-				perf_top__remove_active_sym(top, syme);
-				continue;
-			}
-			syme->weight = sym_weight(syme, top);
-
-			if ((int)syme->snap_count >= top->count_filter) {
-				rb_insert_active_sym(root, syme);
-				++top->rb_entries;
-			}
-			sum_ksamples += syme->snap_count;
-
-			for (j = 0; j < top->evlist->nr_entries; j++)
-				syme->count[j] = top->zero ? 0 : syme->count[j] * 7 / 8;
-		} else
-			perf_top__remove_active_sym(top, syme);
-	}
-
-	return sum_ksamples;
-}
-
-/*
- * Find the longest symbol name that will be displayed
- */
-void perf_top__find_widths(struct perf_top *top, struct rb_root *root,
-			   int *dso_width, int *dso_short_width, int *sym_width)
-{
-	struct rb_node *nd;
-	int printed = 0;
-
-	*sym_width = *dso_width = *dso_short_width = 0;
-
-	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
-		struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node);
-		struct symbol *sym = sym_entry__symbol(syme);
-
-		if (++printed > top->print_entries ||
-		    (int)syme->snap_count < top->count_filter)
-			continue;
-
-		if (syme->map->dso->long_name_len > *dso_width)
-			*dso_width = syme->map->dso->long_name_len;
-
-		if (syme->map->dso->short_name_len > *dso_short_width)
-			*dso_short_width = syme->map->dso->short_name_len;
-
-		if (sym->namelen > *sym_width)
-			*sym_width = sym->namelen;
-	}
-}
diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h
index bfbf95b..01d1057 100644
--- a/tools/perf/util/top.h
+++ b/tools/perf/util/top.h
@@ -4,26 +4,10 @@
 #include "types.h"
 #include "../perf.h"
 #include <stddef.h>
-#include <pthread.h>
-#include <linux/list.h>
-#include <linux/rbtree.h>
 
 struct perf_evlist;
 struct perf_evsel;
-
-struct sym_entry {
-	struct rb_node		rb_node;
-	struct list_head	node;
-	unsigned long		snap_count;
-	double			weight;
-	struct map		*map;
-	unsigned long		count[0];
-};
-
-static inline struct symbol *sym_entry__symbol(struct sym_entry *self)
-{
-       return ((void *)self) + symbol_conf.priv_size;
-}
+struct perf_session;
 
 struct perf_top {
 	struct perf_evlist *evlist;
@@ -31,34 +15,21 @@
 	 * Symbols will be added here in perf_event__process_sample and will
 	 * get out after decayed.
 	 */
-	struct list_head   active_symbols;
-	pthread_mutex_t	   active_symbols_lock;
-	pthread_cond_t	   active_symbols_cond;
 	u64		   samples;
 	u64		   kernel_samples, us_samples;
 	u64		   exact_samples;
 	u64		   guest_us_samples, guest_kernel_samples;
+	u64		   total_lost_warned;
 	int		   print_entries, count_filter, delay_secs;
-	int		   display_weighted, freq, rb_entries;
+	int		   freq;
 	pid_t		   target_pid, target_tid;
 	bool		   hide_kernel_symbols, hide_user_symbols, zero;
 	const char	   *cpu_list;
-	struct sym_entry   *sym_filter_entry;
+	struct hist_entry  *sym_filter_entry;
 	struct perf_evsel  *sym_evsel;
+	struct perf_session *session;
 };
 
 size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size);
 void perf_top__reset_sample_counters(struct perf_top *top);
-float perf_top__decay_samples(struct perf_top *top, struct rb_root *root);
-void perf_top__find_widths(struct perf_top *top, struct rb_root *root,
-			   int *dso_width, int *dso_short_width, int *sym_width);
-
-#ifdef NO_NEWT_SUPPORT
-static inline int perf_top__tui_browser(struct perf_top *top __used)
-{
-	return 0;
-}
-#else
-int perf_top__tui_browser(struct perf_top *top);
-#endif
 #endif /* __PERF_TOP_H */
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index 3403f81..2d530cf 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -196,7 +196,8 @@
 		die("Can't read '%s'", file);
 
 	/* put in zeros for file size, then fill true size later */
-	write_or_die(&size, hdr_sz);
+	if (hdr_sz)
+		write_or_die(&size, hdr_sz);
 
 	do {
 		r = read(fd, buf, BUFSIZ);
@@ -212,7 +213,7 @@
 	if (bigendian())
 		sizep += sizeof(u64) - hdr_sz;
 
-	if (pwrite(output_fd, sizep, hdr_sz, hdr_pos) < 0)
+	if (hdr_sz && pwrite(output_fd, sizep, hdr_sz, hdr_pos) < 0)
 		die("writing to %s", output_file);
 }
 
@@ -428,6 +429,19 @@
 	return nr_tracepoints > 0 ? path.next : NULL;
 }
 
+static void
+put_tracepoints_path(struct tracepoint_path *tps)
+{
+	while (tps) {
+		struct tracepoint_path *t = tps;
+
+		tps = tps->next;
+		free(t->name);
+		free(t->system);
+		free(t);
+	}
+}
+
 bool have_tracepoints(struct list_head *pattrs)
 {
 	struct perf_evsel *pos;
@@ -439,19 +453,11 @@
 	return false;
 }
 
-int read_tracing_data(int fd, struct list_head *pattrs)
+static void tracing_data_header(void)
 {
-	char buf[BUFSIZ];
-	struct tracepoint_path *tps = get_tracepoints_path(pattrs);
+	char buf[20];
 
-	/*
-	 * What? No tracepoints? No sense writing anything here, bail out.
-	 */
-	if (tps == NULL)
-		return -1;
-
-	output_fd = fd;
-
+	/* just guessing this is someone's birthday.. ;) */
 	buf[0] = 23;
 	buf[1] = 8;
 	buf[2] = 68;
@@ -476,28 +482,86 @@
 	/* save page_size */
 	page_size = sysconf(_SC_PAGESIZE);
 	write_or_die(&page_size, 4);
+}
 
+struct tracing_data *tracing_data_get(struct list_head *pattrs,
+				      int fd, bool temp)
+{
+	struct tracepoint_path *tps;
+	struct tracing_data *tdata;
+
+	output_fd = fd;
+
+	tps = get_tracepoints_path(pattrs);
+	if (!tps)
+		return NULL;
+
+	tdata = malloc_or_die(sizeof(*tdata));
+	tdata->temp = temp;
+	tdata->size = 0;
+
+	if (temp) {
+		int temp_fd;
+
+		snprintf(tdata->temp_file, sizeof(tdata->temp_file),
+			 "/tmp/perf-XXXXXX");
+		if (!mkstemp(tdata->temp_file))
+			die("Can't make temp file");
+
+		temp_fd = open(tdata->temp_file, O_RDWR);
+		if (temp_fd < 0)
+			die("Can't read '%s'", tdata->temp_file);
+
+		/*
+		 * Set the temp file as the default output, so all the
+		 * tracing data are stored into it.
+		 */
+		output_fd = temp_fd;
+	}
+
+	tracing_data_header();
 	read_header_files();
 	read_ftrace_files(tps);
 	read_event_files(tps);
 	read_proc_kallsyms();
 	read_ftrace_printk();
 
-	return 0;
+	/*
+	 * All tracing data are stored by now, so we can restore
+	 * the default output file in case we used a temp file.
+	 */
+	if (temp) {
+		tdata->size = lseek(output_fd, 0, SEEK_CUR);
+		close(output_fd);
+		output_fd = fd;
+	}
+
+	put_tracepoints_path(tps);
+	return tdata;
 }
 
-ssize_t read_tracing_data_size(int fd, struct list_head *pattrs)
+void tracing_data_put(struct tracing_data *tdata)
 {
-	ssize_t size;
-	int err = 0;
+	if (tdata->temp) {
+		record_file(tdata->temp_file, 0);
+		unlink(tdata->temp_file);
+	}
 
-	calc_data_size = 1;
-	err = read_tracing_data(fd, pattrs);
-	size = calc_data_size - 1;
-	calc_data_size = 0;
+	free(tdata);
+}
 
-	if (err < 0)
-		return err;
+int read_tracing_data(int fd, struct list_head *pattrs)
+{
+	struct tracing_data *tdata;
 
-	return size;
+	/*
+	 * We work over the real file, so we can write data
+	 * directly; no temp file is needed.
+	 */
+	tdata = tracing_data_get(pattrs, fd, false);
+	if (!tdata)
+		return -ENOMEM;
+
+	tracing_data_put(tdata);
+	return 0;
 }
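
tracing_data_get() above can point output_fd at an mkstemp()-created temp file so the size of the tracing data is known before tracing_data_put() copies it into the real output and unlinks it. The sketch below shows that stage-then-splice pattern in isolation, using only POSIX I/O; the function names are invented and a plain read/write loop stands in for record_file().

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/* Write the payload into a temp file and report its size, as tracing_data_get() does. */
static int stage_payload(char *template_path, const char *payload, off_t *sizep)
{
	int fd = mkstemp(template_path);	/* returns an fd, -1 on failure */

	if (fd < 0)
		return -1;

	if (write(fd, payload, strlen(payload)) < 0) {
		close(fd);
		return -1;
	}

	*sizep = lseek(fd, 0, SEEK_CUR);	/* how much was staged */
	return fd;
}

/* Copy the staged data into the real output, as tracing_data_put()/record_file() do. */
static int splice_payload(int tmp_fd, int out_fd)
{
	char buf[BUFSIZ];
	ssize_t n;

	if (lseek(tmp_fd, 0, SEEK_SET) < 0)
		return -1;

	while ((n = read(tmp_fd, buf, sizeof(buf))) > 0) {
		if (write(out_fd, buf, n) != n)
			return -1;
	}
	return n < 0 ? -1 : 0;
}

int main(void)
{
	char path[] = "/tmp/perf-sketch-XXXXXX";
	off_t size = 0;
	int tmp_fd = stage_payload(path, "tracing data\n", &size);

	if (tmp_fd < 0)
		return 1;

	printf("staged %lld bytes in %s\n", (long long)size, path);
	splice_payload(tmp_fd, STDOUT_FILENO);

	close(tmp_fd);
	unlink(path);	/* like tracing_data_put() removing the temp file */
	return 0;
}
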
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index f674dda..a841008 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -263,7 +263,18 @@
 unsigned long long eval_flag(const char *flag);
 
 int read_tracing_data(int fd, struct list_head *pattrs);
-ssize_t read_tracing_data_size(int fd, struct list_head *pattrs);
+
+struct tracing_data {
+	/* size is only valid if temp is 'true' */
+	ssize_t size;
+	bool temp;
+	char temp_file[50];
+};
+
+struct tracing_data *tracing_data_get(struct list_head *pattrs,
+				      int fd, bool temp);
+void tracing_data_put(struct tracing_data *tdata);
+
 
 /* taken from kernel/trace/trace.h */
 enum trace_flag_type {
diff --git a/tools/perf/util/ui/browser.c b/tools/perf/util/ui/browser.c
index 611219f..5359f37 100644
--- a/tools/perf/util/ui/browser.c
+++ b/tools/perf/util/ui/browser.c
@@ -1,4 +1,8 @@
+#include "../util.h"
+#include "../cache.h"
+#include "../../perf.h"
 #include "libslang.h"
+#include <newt.h>
 #include "ui.h"
 #include <linux/compiler.h>
 #include <linux/list.h>
@@ -7,13 +11,13 @@
 #include <sys/ttydefaults.h>
 #include "browser.h"
 #include "helpline.h"
+#include "keysyms.h"
 #include "../color.h"
-#include "../util.h"
-#include <stdio.h>
 
-static int ui_browser__percent_color(double percent, bool current)
+static int ui_browser__percent_color(struct ui_browser *browser,
+				     double percent, bool current)
 {
-	if (current)
+	if (current && (!browser->use_navkeypressed || browser->navkeypressed))
 		return HE_COLORSET_SELECTED;
 	if (percent >= MIN_RED)
 		return HE_COLORSET_TOP;
@@ -30,7 +34,7 @@
 void ui_browser__set_percent_color(struct ui_browser *self,
 				   double percent, bool current)
 {
-	 int color = ui_browser__percent_color(percent, current);
+	 int color = ui_browser__percent_color(self, percent, current);
 	 ui_browser__set_color(self, color);
 }
 
@@ -39,31 +43,62 @@
 	SLsmg_gotorc(self->y + y, self->x + x);
 }
 
+static struct list_head *
+ui_browser__list_head_filter_entries(struct ui_browser *browser,
+				     struct list_head *pos)
+{
+	do {
+		if (!browser->filter || !browser->filter(browser, pos))
+			return pos;
+		pos = pos->next;
+	} while (pos != browser->entries);
+
+	return NULL;
+}
+
+static struct list_head *
+ui_browser__list_head_filter_prev_entries(struct ui_browser *browser,
+					  struct list_head *pos)
+{
+	do {
+		if (!browser->filter || !browser->filter(browser, pos))
+			return pos;
+		pos = pos->prev;
+	} while (pos != browser->entries);
+
+	return NULL;
+}
+
 void ui_browser__list_head_seek(struct ui_browser *self, off_t offset, int whence)
 {
 	struct list_head *head = self->entries;
 	struct list_head *pos;
 
+	if (self->nr_entries == 0)
+		return;
+
 	switch (whence) {
 	case SEEK_SET:
-		pos = head->next;
+		pos = ui_browser__list_head_filter_entries(self, head->next);
 		break;
 	case SEEK_CUR:
 		pos = self->top;
 		break;
 	case SEEK_END:
-		pos = head->prev;
+		pos = ui_browser__list_head_filter_prev_entries(self, head->prev);
 		break;
 	default:
 		return;
 	}
 
+	assert(pos != NULL);
+
 	if (offset > 0) {
 		while (offset-- != 0)
-			pos = pos->next;
+			pos = ui_browser__list_head_filter_entries(self, pos->next);
 	} else {
 		while (offset++ != 0)
-			pos = pos->prev;
+			pos = ui_browser__list_head_filter_prev_entries(self, pos->prev);
 	}
 
 	self->top = pos;
@@ -127,11 +162,8 @@
 
 void ui_browser__refresh_dimensions(struct ui_browser *self)
 {
-	int cols, rows;
-	newtGetScreenSize(&cols, &rows);
-
-	self->width = cols - 1;
-	self->height = rows - 2;
+	self->width = SLtt_Screen_Cols - 1;
+	self->height = SLtt_Screen_Rows - 2;
 	self->y = 1;
 	self->x = 0;
 }
@@ -142,26 +174,11 @@
 	self->seek(self, 0, SEEK_SET);
 }
 
-void ui_browser__add_exit_key(struct ui_browser *self, int key)
-{
-	newtFormAddHotKey(self->form, key);
-}
-
-void ui_browser__add_exit_keys(struct ui_browser *self, int keys[])
-{
-	int i = 0;
-
-	while (keys[i] && i < 64) {
-		ui_browser__add_exit_key(self, keys[i]);
-		++i;
-	}
-}
-
 void __ui_browser__show_title(struct ui_browser *browser, const char *title)
 {
 	SLsmg_gotorc(0, 0);
 	ui_browser__set_color(browser, NEWT_COLORSET_ROOT);
-	slsmg_write_nstring(title, browser->width);
+	slsmg_write_nstring(title, browser->width + 1);
 }
 
 void ui_browser__show_title(struct ui_browser *browser, const char *title)
@@ -174,78 +191,189 @@
 int ui_browser__show(struct ui_browser *self, const char *title,
 		     const char *helpline, ...)
 {
+	int err;
 	va_list ap;
-	int keys[] = { NEWT_KEY_UP, NEWT_KEY_DOWN, NEWT_KEY_PGUP,
-		       NEWT_KEY_PGDN, NEWT_KEY_HOME, NEWT_KEY_END, ' ',
-		       NEWT_KEY_LEFT, NEWT_KEY_ESCAPE, 'q', CTRL('c'), 0 };
-
-	if (self->form != NULL)
-		newtFormDestroy(self->form);
 
 	ui_browser__refresh_dimensions(self);
-	self->form = newtForm(NULL, NULL, 0);
-	if (self->form == NULL)
-		return -1;
-
-	self->sb = newtVerticalScrollbar(self->width, 1, self->height,
-					 HE_COLORSET_NORMAL,
-					 HE_COLORSET_SELECTED);
-	if (self->sb == NULL)
-		return -1;
 
 	pthread_mutex_lock(&ui__lock);
 	__ui_browser__show_title(self, title);
 
-	ui_browser__add_exit_keys(self, keys);
-	newtFormAddComponent(self->form, self->sb);
+	self->title = title;
+	free(self->helpline);
+	self->helpline = NULL;
 
 	va_start(ap, helpline);
-	ui_helpline__vpush(helpline, ap);
+	err = vasprintf(&self->helpline, helpline, ap);
 	va_end(ap);
+	if (err > 0)
+		ui_helpline__push(self->helpline);
 	pthread_mutex_unlock(&ui__lock);
-	return 0;
+	return err ? 0 : -1;
 }
 
-void ui_browser__hide(struct ui_browser *self)
+void ui_browser__hide(struct ui_browser *browser __used)
 {
 	pthread_mutex_lock(&ui__lock);
-	newtFormDestroy(self->form);
-	self->form = NULL;
 	ui_helpline__pop();
 	pthread_mutex_unlock(&ui__lock);
 }
 
-int ui_browser__refresh(struct ui_browser *self)
+static void ui_browser__scrollbar_set(struct ui_browser *browser)
+{
+	int height = browser->height, h = 0, pct = 0,
+	    col = browser->width,
+	    row = browser->y - 1;
+
+	if (browser->nr_entries > 1) {
+		pct = ((browser->index * (browser->height - 1)) /
+		       (browser->nr_entries - 1));
+	}
+
+	while (h < height) {
+	        ui_browser__gotorc(browser, row++, col);
+		SLsmg_set_char_set(1);
+		SLsmg_write_char(h == pct ? SLSMG_DIAMOND_CHAR : SLSMG_BOARD_CHAR);
+		SLsmg_set_char_set(0);
+		++h;
+	}
+}
+
+static int __ui_browser__refresh(struct ui_browser *browser)
 {
 	int row;
+	int width = browser->width;
 
+	row = browser->refresh(browser);
+	ui_browser__set_color(browser, HE_COLORSET_NORMAL);
+
+	if (!browser->use_navkeypressed || browser->navkeypressed)
+		ui_browser__scrollbar_set(browser);
+	else
+		width += 1;
+
+	SLsmg_fill_region(browser->y + row, browser->x,
+			  browser->height - row, width, ' ');
+
+	return 0;
+}
+
+int ui_browser__refresh(struct ui_browser *browser)
+{
 	pthread_mutex_lock(&ui__lock);
-	newtScrollbarSet(self->sb, self->index, self->nr_entries - 1);
-	row = self->refresh(self);
-	ui_browser__set_color(self, HE_COLORSET_NORMAL);
-	SLsmg_fill_region(self->y + row, self->x,
-			  self->height - row, self->width, ' ');
+	__ui_browser__refresh(browser);
 	pthread_mutex_unlock(&ui__lock);
 
 	return 0;
 }
 
-int ui_browser__run(struct ui_browser *self)
+/*
+ * Here we're updating nr_entries _after_ we started browsing, i.e. we have to
+ * forget about any reference to any entry in the underlying data structure,
+ * which is why we do a SEEK_SET. Think about 'perf top' in the hists browser
+ * after an output_resort and hist decay.
+ */
+void ui_browser__update_nr_entries(struct ui_browser *browser, u32 nr_entries)
 {
-	struct newtExitStruct es;
+	off_t offset = nr_entries - browser->nr_entries;
 
-	if (ui_browser__refresh(self) < 0)
-		return -1;
+	browser->nr_entries = nr_entries;
+
+	if (offset < 0) {
+		if (browser->top_idx < (u64)-offset)
+			offset = -browser->top_idx;
+
+		browser->index += offset;
+		browser->top_idx += offset;
+	}
+
+	browser->top = NULL;
+	browser->seek(browser, browser->top_idx, SEEK_SET);
+}
+
+static int ui__getch(int delay_secs)
+{
+	struct timeval timeout, *ptimeout = delay_secs ? &timeout : NULL;
+	fd_set read_set;
+	int err, key;
+
+	FD_ZERO(&read_set);
+	FD_SET(0, &read_set);
+
+	if (delay_secs) {
+		timeout.tv_sec = delay_secs;
+		timeout.tv_usec = 0;
+	}
+
+        err = select(1, &read_set, NULL, NULL, ptimeout);
+
+	if (err == 0)
+		return K_TIMER;
+
+	if (err == -1) {
+		if (errno == EINTR)
+			return K_RESIZE;
+		return K_ERROR;
+	}
+
+	key = SLang_getkey();
+	if (key != K_ESC)
+		return key;
+
+	FD_ZERO(&read_set);
+	FD_SET(0, &read_set);
+	timeout.tv_sec = 0;
+	timeout.tv_usec = 20;
+        err = select(1, &read_set, NULL, NULL, &timeout);
+	if (err == 0)
+		return K_ESC;
+
+	SLang_ungetkey(key);
+	return SLkp_getkey();
+}
+
+int ui_browser__run(struct ui_browser *self, int delay_secs)
+{
+	int err, key;
+
+	pthread__unblock_sigwinch();
 
 	while (1) {
 		off_t offset;
 
-		newtFormRun(self->form, &es);
-
-		if (es.reason != NEWT_EXIT_HOTKEY)
+		pthread_mutex_lock(&ui__lock);
+		err = __ui_browser__refresh(self);
+		SLsmg_refresh();
+		pthread_mutex_unlock(&ui__lock);
+		if (err < 0)
 			break;
-		switch (es.u.key) {
-		case NEWT_KEY_DOWN:
+
+		key = ui__getch(delay_secs);
+
+		if (key == K_RESIZE) {
+			pthread_mutex_lock(&ui__lock);
+			SLtt_get_screen_size();
+			SLsmg_reinit_smg();
+			pthread_mutex_unlock(&ui__lock);
+			ui_browser__refresh_dimensions(self);
+			__ui_browser__show_title(self, self->title);
+			ui_helpline__puts(self->helpline);
+			continue;
+		}
+
+		if (self->use_navkeypressed && !self->navkeypressed) {
+			if (key == K_DOWN || key == K_UP ||
+			    key == K_PGDN || key == K_PGUP ||
+			    key == K_HOME || key == K_END ||
+			    key == ' ') {
+				self->navkeypressed = true;
+				continue;
+			} else
+				return key;
+		}
+
+		switch (key) {
+		case K_DOWN:
 			if (self->index == self->nr_entries - 1)
 				break;
 			++self->index;
@@ -254,7 +382,7 @@
 				self->seek(self, +1, SEEK_CUR);
 			}
 			break;
-		case NEWT_KEY_UP:
+		case K_UP:
 			if (self->index == 0)
 				break;
 			--self->index;
@@ -263,7 +391,7 @@
 				self->seek(self, -1, SEEK_CUR);
 			}
 			break;
-		case NEWT_KEY_PGDN:
+		case K_PGDN:
 		case ' ':
 			if (self->top_idx + self->height > self->nr_entries - 1)
 				break;
@@ -275,7 +403,7 @@
 			self->top_idx += offset;
 			self->seek(self, +offset, SEEK_CUR);
 			break;
-		case NEWT_KEY_PGUP:
+		case K_PGUP:
 			if (self->top_idx == 0)
 				break;
 
@@ -288,10 +416,10 @@
 			self->top_idx -= offset;
 			self->seek(self, -offset, SEEK_CUR);
 			break;
-		case NEWT_KEY_HOME:
+		case K_HOME:
 			ui_browser__reset_index(self);
 			break;
-		case NEWT_KEY_END:
+		case K_END:
 			offset = self->height - 1;
 			if (offset >= self->nr_entries)
 				offset = self->nr_entries - 1;
@@ -301,10 +429,8 @@
 			self->seek(self, -offset, SEEK_END);
 			break;
 		default:
-			return es.u.key;
+			return key;
 		}
-		if (ui_browser__refresh(self) < 0)
-			return -1;
 	}
 	return -1;
 }
@@ -316,41 +442,105 @@
 	int row = 0;
 
 	if (self->top == NULL || self->top == self->entries)
-                self->top = head->next;
+                self->top = ui_browser__list_head_filter_entries(self, head->next);
 
 	pos = self->top;
 
 	list_for_each_from(pos, head) {
-		ui_browser__gotorc(self, row, 0);
-		self->write(self, pos, row);
-		if (++row == self->height)
-			break;
+		if (!self->filter || !self->filter(self, pos)) {
+			ui_browser__gotorc(self, row, 0);
+			self->write(self, pos, row);
+			if (++row == self->height)
+				break;
+		}
 	}
 
 	return row;
 }
 
-static struct newtPercentTreeColors {
-	const char *topColorFg, *topColorBg;
-	const char *mediumColorFg, *mediumColorBg;
-	const char *normalColorFg, *normalColorBg;
-	const char *selColorFg, *selColorBg;
-	const char *codeColorFg, *codeColorBg;
-} defaultPercentTreeColors = {
-	"red",       "lightgray",
-	"green",     "lightgray",
-	"black",     "lightgray",
-	"lightgray", "magenta",
-	"blue",	     "lightgray",
+static struct ui_browser__colorset {
+	const char *name, *fg, *bg;
+	int colorset;
+} ui_browser__colorsets[] = {
+	{
+		.colorset = HE_COLORSET_TOP,
+		.name	  = "top",
+		.fg	  = "red",
+		.bg	  = "default",
+	},
+	{
+		.colorset = HE_COLORSET_MEDIUM,
+		.name	  = "medium",
+		.fg	  = "green",
+		.bg	  = "default",
+	},
+	{
+		.colorset = HE_COLORSET_NORMAL,
+		.name	  = "normal",
+		.fg	  = "default",
+		.bg	  = "default",
+	},
+	{
+		.colorset = HE_COLORSET_SELECTED,
+		.name	  = "selected",
+		.fg	  = "black",
+		.bg	  = "lightgray",
+	},
+	{
+		.colorset = HE_COLORSET_CODE,
+		.name	  = "code",
+		.fg	  = "blue",
+		.bg	  = "default",
+	},
+	{
+		.name = NULL,
+	}
 };
 
+
+static int ui_browser__color_config(const char *var, const char *value,
+				    void *data __used)
+{
+	char *fg = NULL, *bg;
+	int i;
+
+	/* same dir for all commands */
+	if (prefixcmp(var, "colors.") != 0)
+		return 0;
+
+	for (i = 0; ui_browser__colorsets[i].name != NULL; ++i) {
+		const char *name = var + 7;
+
+		if (strcmp(ui_browser__colorsets[i].name, name) != 0)
+			continue;
+
+		fg = strdup(value);
+		if (fg == NULL)
+			break;
+
+		bg = strchr(fg, ',');
+		if (bg == NULL)
+			break;
+
+		*bg = '\0';
+		while (isspace(*++bg));
+		ui_browser__colorsets[i].bg = bg;
+		ui_browser__colorsets[i].fg = fg;
+		return 0;
+	}
+
+	free(fg);
+	return -1;
+}
+
 void ui_browser__init(void)
 {
-	struct newtPercentTreeColors *c = &defaultPercentTreeColors;
+	int i = 0;
 
-	sltt_set_color(HE_COLORSET_TOP, NULL, c->topColorFg, c->topColorBg);
-	sltt_set_color(HE_COLORSET_MEDIUM, NULL, c->mediumColorFg, c->mediumColorBg);
-	sltt_set_color(HE_COLORSET_NORMAL, NULL, c->normalColorFg, c->normalColorBg);
-	sltt_set_color(HE_COLORSET_SELECTED, NULL, c->selColorFg, c->selColorBg);
-	sltt_set_color(HE_COLORSET_CODE, NULL, c->codeColorFg, c->codeColorBg);
+	perf_config(ui_browser__color_config, NULL);
+
+	while (ui_browser__colorsets[i].name) {
+		struct ui_browser__colorset *c = &ui_browser__colorsets[i++];
+		sltt_set_color(c->colorset, c->name, c->fg, c->bg);
+	}
 }
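
The ui_browser__color_config() hook above means the five color sets are no
longer hard-coded: they can be overridden from the perf config file (typically
~/.perfconfig).  A minimal sketch of such an override, assuming the
"colors.<name> = <fg>, <bg>" layout the parser above expects; the concrete
names and colors here are only examples, not part of the patch:

	[colors]
		top = red, default
		medium = green, lightgray
		selected = black, yellow

Each value is split at the comma: the first token becomes the foreground and
the remainder (with leading spaces skipped) the background handed to
sltt_set_color().
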
diff --git a/tools/perf/util/ui/browser.h b/tools/perf/util/ui/browser.h
index fc63dda..a2c707d 100644
--- a/tools/perf/util/ui/browser.h
+++ b/tools/perf/util/ui/browser.h
@@ -2,7 +2,6 @@
 #define _PERF_UI_BROWSER_H_ 1
 
 #include <stdbool.h>
-#include <newt.h>
 #include <sys/types.h>
 #include "../types.h"
 
@@ -13,15 +12,19 @@
 #define HE_COLORSET_CODE	54
 
 struct ui_browser {
-	newtComponent form, sb;
 	u64	      index, top_idx;
 	void	      *top, *entries;
 	u16	      y, x, width, height;
 	void	      *priv;
+	const char    *title;
+	char	      *helpline;
 	unsigned int  (*refresh)(struct ui_browser *self);
 	void	      (*write)(struct ui_browser *self, void *entry, int row);
 	void	      (*seek)(struct ui_browser *self, off_t offset, int whence);
+	bool	      (*filter)(struct ui_browser *self, void *entry);
 	u32	      nr_entries;
+	bool	      navkeypressed;
+	bool	      use_navkeypressed;
 };
 
 void ui_browser__set_color(struct ui_browser *self, int color);
@@ -32,15 +35,14 @@
 void ui_browser__reset_index(struct ui_browser *self);
 
 void ui_browser__gotorc(struct ui_browser *self, int y, int x);
-void ui_browser__add_exit_key(struct ui_browser *self, int key);
-void ui_browser__add_exit_keys(struct ui_browser *self, int keys[]);
 void __ui_browser__show_title(struct ui_browser *browser, const char *title);
 void ui_browser__show_title(struct ui_browser *browser, const char *title);
 int ui_browser__show(struct ui_browser *self, const char *title,
 		     const char *helpline, ...);
 void ui_browser__hide(struct ui_browser *self);
 int ui_browser__refresh(struct ui_browser *self);
-int ui_browser__run(struct ui_browser *self);
+int ui_browser__run(struct ui_browser *browser, int delay_secs);
+void ui_browser__update_nr_entries(struct ui_browser *browser, u32 nr_entries);
 
 void ui_browser__rb_tree_seek(struct ui_browser *self, off_t offset, int whence);
 unsigned int ui_browser__rb_tree_refresh(struct ui_browser *self);
diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c
index 0229723..4e0cb7fe 100644
--- a/tools/perf/util/ui/browsers/annotate.c
+++ b/tools/perf/util/ui/browsers/annotate.c
@@ -6,6 +6,7 @@
 #include "../../sort.h"
 #include "../../symbol.h"
 #include <pthread.h>
+#include <newt.h>
 
 static void ui__error_window(const char *fmt, ...)
 {
@@ -20,12 +21,17 @@
 	struct ui_browser b;
 	struct rb_root	  entries;
 	struct rb_node	  *curr_hot;
+	struct objdump_line *selection;
+	int		    nr_asm_entries;
+	int		    nr_entries;
+	bool		    hide_src_code;
 };
 
 struct objdump_line_rb_node {
 	struct rb_node	rb_node;
 	double		percent;
 	u32		idx;
+	int		idx_asm;
 };
 
 static inline
@@ -34,9 +40,22 @@
 	return (struct objdump_line_rb_node *)(self + 1);
 }
 
+static bool objdump_line__filter(struct ui_browser *browser, void *entry)
+{
+	struct annotate_browser *ab = container_of(browser, struct annotate_browser, b);
+
+	if (ab->hide_src_code) {
+		struct objdump_line *ol = list_entry(entry, struct objdump_line, node);
+		return ol->offset == -1;
+	}
+
+	return false;
+}
+
 static void annotate_browser__write(struct ui_browser *self, void *entry, int row)
 {
-	struct objdump_line *ol = rb_entry(entry, struct objdump_line, node);
+	struct annotate_browser *ab = container_of(self, struct annotate_browser, b);
+	struct objdump_line *ol = list_entry(entry, struct objdump_line, node);
 	bool current_entry = ui_browser__is_current_entry(self, row);
 	int width = self->width;
 
@@ -51,6 +70,11 @@
 
 	SLsmg_write_char(':');
 	slsmg_write_nstring(" ", 8);
+
+	/* The scroll bar isn't being used */
+	if (!self->navkeypressed)
+		width += 1;
+
 	if (!*ol->line)
 		slsmg_write_nstring(" ", width - 18);
 	else
@@ -58,6 +82,8 @@
 
 	if (!current_entry)
 		ui_browser__set_color(self, HE_COLORSET_CODE);
+	else
+		ab->selection = ol;
 }
 
 static double objdump_line__calc_percent(struct objdump_line *self,
@@ -141,7 +167,8 @@
 static void annotate_browser__calc_percent(struct annotate_browser *browser,
 					   int evidx)
 {
-	struct symbol *sym = browser->b.priv;
+	struct map_symbol *ms = browser->b.priv;
+	struct symbol *sym = ms->sym;
 	struct annotation *notes = symbol__annotation(sym);
 	struct objdump_line *pos;
 
@@ -163,25 +190,60 @@
 	browser->curr_hot = rb_last(&browser->entries);
 }
 
+static bool annotate_browser__toggle_source(struct annotate_browser *browser)
+{
+	struct objdump_line *ol;
+	struct objdump_line_rb_node *olrb;
+	off_t offset = browser->b.index - browser->b.top_idx;
+
+	browser->b.seek(&browser->b, offset, SEEK_CUR);
+	ol = list_entry(browser->b.top, struct objdump_line, node);
+	olrb = objdump_line__rb(ol);
+
+	if (browser->hide_src_code) {
+		if (olrb->idx_asm < offset)
+			offset = olrb->idx;
+
+		browser->b.nr_entries = browser->nr_entries;
+		browser->hide_src_code = false;
+		browser->b.seek(&browser->b, -offset, SEEK_CUR);
+		browser->b.top_idx = olrb->idx - offset;
+		browser->b.index = olrb->idx;
+	} else {
+		if (olrb->idx_asm < 0) {
+			ui_helpline__puts("Only available for assembly lines.");
+			browser->b.seek(&browser->b, -offset, SEEK_CUR);
+			return false;
+		}
+
+		if (olrb->idx_asm < offset)
+			offset = olrb->idx_asm;
+
+		browser->b.nr_entries = browser->nr_asm_entries;
+		browser->hide_src_code = true;
+		browser->b.seek(&browser->b, -offset, SEEK_CUR);
+		browser->b.top_idx = olrb->idx_asm - offset;
+		browser->b.index = olrb->idx_asm;
+	}
+
+	return true;
+}
+
 static int annotate_browser__run(struct annotate_browser *self, int evidx,
-				 int refresh)
+				 int nr_events, void(*timer)(void *arg),
+				 void *arg, int delay_secs)
 {
 	struct rb_node *nd = NULL;
-	struct symbol *sym = self->b.priv;
-	/*
-	 * RIGHT To allow builtin-annotate to cycle thru multiple symbols by
-	 * examining the exit key for this function.
-	 */
-	int exit_keys[] = { 'H', NEWT_KEY_TAB, NEWT_KEY_UNTAB,
-			    NEWT_KEY_RIGHT, 0 };
+	struct map_symbol *ms = self->b.priv;
+	struct symbol *sym = ms->sym;
+	const char *help = "<-, ESC: exit, TAB/shift+TAB: cycle hottest lines, "
+			   "H: Go to hottest line, ->: Line action, "
+			   "S: Toggle source code view";
 	int key;
 
-	if (ui_browser__show(&self->b, sym->name,
-			     "<-, -> or ESC: exit, TAB/shift+TAB: "
-			     "cycle hottest lines, H: Hottest") < 0)
+	if (ui_browser__show(&self->b, sym->name, help) < 0)
 		return -1;
 
-	ui_browser__add_exit_keys(&self->b, exit_keys);
 	annotate_browser__calc_percent(self, evidx);
 
 	if (self->curr_hot)
@@ -189,13 +251,10 @@
 
 	nd = self->curr_hot;
 
-	if (refresh != 0)
-		newtFormSetTimer(self->b.form, refresh);
-
 	while (1) {
-		key = ui_browser__run(&self->b);
+		key = ui_browser__run(&self->b, delay_secs);
 
-		if (refresh != 0) {
+		if (delay_secs != 0) {
 			annotate_browser__calc_percent(self, evidx);
 			/*
 			 * Current line focus got out of the list of most active
@@ -207,15 +266,14 @@
 		}
 
 		switch (key) {
-		case -1:
-			/*
- 			 * FIXME we need to check if it was
- 			 * es.reason == NEWT_EXIT_TIMER
- 			 */
-			if (refresh != 0)
+		case K_TIMER:
+			if (timer != NULL)
+				timer(arg);
+
+			if (delay_secs != 0)
 				symbol__annotate_decay_histogram(sym, evidx);
 			continue;
-		case NEWT_KEY_TAB:
+		case K_TAB:
 			if (nd != NULL) {
 				nd = rb_prev(nd);
 				if (nd == NULL)
@@ -223,7 +281,7 @@
 			} else
 				nd = self->curr_hot;
 			break;
-		case NEWT_KEY_UNTAB:
+		case K_UNTAB:
 			if (nd != NULL)
 				nd = rb_next(nd);
 				if (nd == NULL)
@@ -234,8 +292,68 @@
 		case 'H':
 			nd = self->curr_hot;
 			break;
-		default:
+		case 'S':
+			if (annotate_browser__toggle_source(self))
+				ui_helpline__puts(help);
+			continue;
+		case K_ENTER:
+		case K_RIGHT:
+			if (self->selection == NULL) {
+				ui_helpline__puts("Huh? No selection. Report to linux-kernel@vger.kernel.org");
+				continue;
+			}
+
+			if (self->selection->offset == -1) {
+				ui_helpline__puts("Actions are only available for assembly lines.");
+				continue;
+			} else {
+				char *s = strstr(self->selection->line, "callq ");
+				struct annotation *notes;
+				struct symbol *target;
+				u64 ip;
+
+				if (s == NULL) {
+					ui_helpline__puts("Actions are only available for the 'callq' instruction.");
+					continue;
+				}
+
+				s = strchr(s, ' ');
+				if (s++ == NULL) {
+					ui_helpline__puts("Invalid callq instruction.");
+					continue;
+				}
+
+				ip = strtoull(s, NULL, 16);
+				ip = ms->map->map_ip(ms->map, ip);
+				target = map__find_symbol(ms->map, ip, NULL);
+				if (target == NULL) {
+					ui_helpline__puts("The called function was not found.");
+					continue;
+				}
+
+				notes = symbol__annotation(target);
+				pthread_mutex_lock(&notes->lock);
+
+				if (notes->src == NULL &&
+				    symbol__alloc_hist(target, nr_events) < 0) {
+					pthread_mutex_unlock(&notes->lock);
+					ui__warning("Not enough memory for annotating '%s' symbol!\n",
+						    target->name);
+					continue;
+				}
+
+				pthread_mutex_unlock(&notes->lock);
+				symbol__tui_annotate(target, ms->map, evidx, nr_events,
+						     timer, arg, delay_secs);
+			}
+			continue;
+		case K_LEFT:
+		case K_ESC:
+		case 'q':
+		case CTRL('c'):
 			goto out;
+		default:
+			continue;
 		}
 
 		if (nd != NULL)
@@ -246,22 +364,31 @@
 	return key;
 }
 
-int hist_entry__tui_annotate(struct hist_entry *he, int evidx)
+int hist_entry__tui_annotate(struct hist_entry *he, int evidx, int nr_events,
+			     void(*timer)(void *arg), void *arg, int delay_secs)
 {
-	return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx, 0);
+	return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx, nr_events,
+				    timer, arg, delay_secs);
 }
 
 int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx,
-			 int refresh)
+			 int nr_events, void(*timer)(void *arg), void *arg,
+			 int delay_secs)
 {
 	struct objdump_line *pos, *n;
 	struct annotation *notes;
+	struct map_symbol ms = {
+		.map = map,
+		.sym = sym,
+	};
 	struct annotate_browser browser = {
 		.b = {
 			.refresh = ui_browser__list_head_refresh,
 			.seek	 = ui_browser__list_head_seek,
 			.write	 = annotate_browser__write,
-			.priv	 = sym,
+			.filter  = objdump_line__filter,
+			.priv	 = &ms,
+			.use_navkeypressed = true,
 		},
 	};
 	int ret;
@@ -288,12 +415,18 @@
 		if (browser.b.width < line_len)
 			browser.b.width = line_len;
 		rbpos = objdump_line__rb(pos);
-		rbpos->idx = browser.b.nr_entries++;
+		rbpos->idx = browser.nr_entries++;
+		if (pos->offset != -1)
+			rbpos->idx_asm = browser.nr_asm_entries++;
+		else
+			rbpos->idx_asm = -1;
 	}
 
+	browser.b.nr_entries = browser.nr_entries;
 	browser.b.entries = &notes->src->source,
 	browser.b.width += 18; /* Percentage */
-	ret = annotate_browser__run(&browser, evidx, refresh);
+	ret = annotate_browser__run(&browser, evidx, nr_events,
+				    timer, arg, delay_secs);
 	list_for_each_entry_safe(pos, n, &notes->src->source, node) {
 		list_del(&pos->node);
 		objdump_line__free(pos);
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index 5d767c6..4663dcb 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -24,8 +24,12 @@
 	struct hists	    *hists;
 	struct hist_entry   *he_selection;
 	struct map_symbol   *selection;
+	bool		     has_symbols;
 };
 
+static int hists__browser_title(struct hists *self, char *bf, size_t size,
+				const char *ev_name);
+
 static void hist_browser__refresh_dimensions(struct hist_browser *self)
 {
 	/* 3 == +/- toggle symbol before actual hist_entry rendering */
@@ -290,28 +294,34 @@
 	ui_browser__reset_index(&self->b);
 }
 
-static int hist_browser__run(struct hist_browser *self, const char *title)
+static int hist_browser__run(struct hist_browser *self, const char *ev_name,
+			     void(*timer)(void *arg), void *arg, int delay_secs)
 {
 	int key;
-	int exit_keys[] = { 'a', '?', 'h', 'C', 'd', 'D', 'E', 't',
-			    NEWT_KEY_ENTER, NEWT_KEY_RIGHT, NEWT_KEY_LEFT,
-			    NEWT_KEY_TAB, NEWT_KEY_UNTAB, 0, };
+	char title[160];
 
 	self->b.entries = &self->hists->entries;
 	self->b.nr_entries = self->hists->nr_entries;
 
 	hist_browser__refresh_dimensions(self);
+	hists__browser_title(self->hists, title, sizeof(title), ev_name);
 
 	if (ui_browser__show(&self->b, title,
 			     "Press '?' for help on key bindings") < 0)
 		return -1;
 
-	ui_browser__add_exit_keys(&self->b, exit_keys);
-
 	while (1) {
-		key = ui_browser__run(&self->b);
+		key = ui_browser__run(&self->b, delay_secs);
 
 		switch (key) {
+		case K_TIMER:
+			/* the periodic timer expired: refresh entries and title */
+			timer(arg);
+			ui_browser__update_nr_entries(&self->b, self->hists->nr_entries);
+			hists__browser_title(self->hists, title, sizeof(title),
+					     ev_name);
+			ui_browser__show_title(&self->b, title);
+			continue;
 		case 'D': { /* Debug */
 			static int seq;
 			struct hist_entry *h = rb_entry(self->b.top,
@@ -334,7 +344,7 @@
 			/* Expand the whole world. */
 			hist_browser__set_folding(self, true);
 			break;
-		case NEWT_KEY_ENTER:
+		case K_ENTER:
 			if (hist_browser__toggle_fold(self))
 				break;
 			/* fall thru */
@@ -532,7 +542,7 @@
 	char s[256];
 	double percent;
 	int printed = 0;
-	int color, width = self->b.width;
+	int width = self->b.width - 6; /* The percentage */
 	char folded_sign = ' ';
 	bool current_entry = ui_browser__is_current_entry(&self->b, row);
 	off_t row_offset = entry->row_offset;
@@ -548,26 +558,35 @@
 	}
 
 	if (row_offset == 0) {
-		hist_entry__snprintf(entry, s, sizeof(s), self->hists, NULL, false,
-				     0, false, self->hists->stats.total_period);
+		hist_entry__snprintf(entry, s, sizeof(s), self->hists);
 		percent = (entry->period * 100.0) / self->hists->stats.total_period;
 
-		color = HE_COLORSET_SELECTED;
-		if (!current_entry) {
-			if (percent >= MIN_RED)
-				color = HE_COLORSET_TOP;
-			else if (percent >= MIN_GREEN)
-				color = HE_COLORSET_MEDIUM;
-			else
-				color = HE_COLORSET_NORMAL;
-		}
-
-		ui_browser__set_color(&self->b, color);
+		ui_browser__set_percent_color(&self->b, percent, current_entry);
 		ui_browser__gotorc(&self->b, row, 0);
 		if (symbol_conf.use_callchain) {
 			slsmg_printf("%c ", folded_sign);
 			width -= 2;
 		}
+
+		slsmg_printf(" %5.2f%%", percent);
+
+		/* The scroll bar isn't being used */
+		if (!self->b.navkeypressed)
+			width += 1;
+
+		if (!current_entry || !self->b.navkeypressed)
+			ui_browser__set_color(&self->b, HE_COLORSET_NORMAL);
+
+		if (symbol_conf.show_nr_samples) {
+			slsmg_printf(" %11u", entry->nr_events);
+			width -= 12;
+		}
+
+		if (symbol_conf.show_total_period) {
+			slsmg_printf(" %12" PRIu64, entry->period);
+			width -= 13;
+		}
+
 		slsmg_write_nstring(s, width);
 		++row;
 		++printed;
@@ -585,14 +604,23 @@
 	return printed;
 }
 
+static void ui_browser__hists_init_top(struct ui_browser *browser)
+{
+	if (browser->top == NULL) {
+		struct hist_browser *hb;
+
+		hb = container_of(browser, struct hist_browser, b);
+		browser->top = rb_first(&hb->hists->entries);
+	}
+}
+
 static unsigned int hist_browser__refresh(struct ui_browser *self)
 {
 	unsigned row = 0;
 	struct rb_node *nd;
 	struct hist_browser *hb = container_of(self, struct hist_browser, b);
 
-	if (self->top == NULL)
-		self->top = rb_first(&hb->hists->entries);
+	ui_browser__hists_init_top(self);
 
 	for (nd = self->top; nd; nd = rb_next(nd)) {
 		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
@@ -644,6 +672,8 @@
 	if (self->nr_entries == 0)
 		return;
 
+	ui_browser__hists_init_top(self);
+
 	switch (whence) {
 	case SEEK_SET:
 		nd = hists__filter_entries(rb_first(self->entries));
@@ -761,6 +791,8 @@
 		self->hists = hists;
 		self->b.refresh = hist_browser__refresh;
 		self->b.seek = ui_browser__hists_seek;
+		self->b.use_navkeypressed = true;
+		self->has_symbols = sort_sym.list.next != NULL;
 	}
 
 	return self;
@@ -782,11 +814,12 @@
 }
 
 static int hists__browser_title(struct hists *self, char *bf, size_t size,
-				const char *ev_name, const struct dso *dso,
-				const struct thread *thread)
+				const char *ev_name)
 {
 	char unit;
 	int printed;
+	const struct dso *dso = self->dso_filter;
+	const struct thread *thread = self->thread_filter;
 	unsigned long nr_events = self->stats.nr_events[PERF_RECORD_SAMPLE];
 
 	nr_events = convert_unit(nr_events, &unit);
@@ -803,16 +836,15 @@
 	return printed;
 }
 
-static int perf_evsel__hists_browse(struct perf_evsel *evsel,
+static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 				    const char *helpline, const char *ev_name,
-				    bool left_exits)
+				    bool left_exits,
+				    void(*timer)(void *arg), void *arg,
+				    int delay_secs)
 {
 	struct hists *self = &evsel->hists;
 	struct hist_browser *browser = hist_browser__new(self);
 	struct pstack *fstack;
-	const struct thread *thread_filter = NULL;
-	const struct dso *dso_filter = NULL;
-	char msg[160];
 	int key = -1;
 
 	if (browser == NULL)
@@ -824,8 +856,6 @@
 
 	ui_helpline__push(helpline);
 
-	hists__browser_title(self, msg, sizeof(msg), ev_name,
-			     dso_filter, thread_filter);
 	while (1) {
 		const struct thread *thread = NULL;
 		const struct dso *dso = NULL;
@@ -834,7 +864,7 @@
 		    annotate = -2, zoom_dso = -2, zoom_thread = -2,
 		    browse_map = -2;
 
-		key = hist_browser__run(browser, msg);
+		key = hist_browser__run(browser, ev_name, timer, arg, delay_secs);
 
 		if (browser->he_selection != NULL) {
 			thread = hist_browser__selected_thread(browser);
@@ -842,14 +872,23 @@
 		}
 
 		switch (key) {
-		case NEWT_KEY_TAB:
-		case NEWT_KEY_UNTAB:
+		case K_TAB:
+		case K_UNTAB:
+			if (nr_events == 1)
+				continue;
 			/*
 			 * Exit the browser, let hists__browser_tree
 			 * go to the next or previous
 			 */
 			goto out_free_stack;
 		case 'a':
+			if (!browser->has_symbols) {
+				ui__warning(
+			"Annotation is only available for symbolic views, "
+			"include \"sym\" in --sort to use it.");
+				continue;
+			}
+
 			if (browser->selection == NULL ||
 			    browser->selection->sym == NULL ||
 			    browser->selection->map->dso->annotate_warned)
@@ -859,25 +898,29 @@
 			goto zoom_dso;
 		case 't':
 			goto zoom_thread;
-		case NEWT_KEY_F1:
+		case K_F1:
 		case 'h':
 		case '?':
-			ui__help_window("->        Zoom into DSO/Threads & Annotate current symbol\n"
-					"<-        Zoom out\n"
-					"a         Annotate current symbol\n"
-					"h/?/F1    Show this window\n"
-					"C         Collapse all callchains\n"
-					"E         Expand all callchains\n"
-					"d         Zoom into current DSO\n"
-					"t         Zoom into current Thread\n"
-					"TAB/UNTAB Switch events\n"
-					"q/CTRL+C  Exit browser");
+			ui__help_window("h/?/F1        Show this window\n"
+					"UP/DOWN/PGUP\n"
+					"PGDN/SPACE    Navigate\n"
+					"q/ESC/CTRL+C  Exit browser\n\n"
+					"For multiple event sessions:\n\n"
+					"TAB/UNTAB Switch events\n\n"
+					"For symbolic views (--sort has sym):\n\n"
+					"->            Zoom into DSO/Threads & Annotate current symbol\n"
+					"<-            Zoom out\n"
+					"a             Annotate current symbol\n"
+					"C             Collapse all callchains\n"
+					"E             Expand all callchains\n"
+					"d             Zoom into current DSO\n"
+					"t             Zoom into current Thread\n");
 			continue;
-		case NEWT_KEY_ENTER:
-		case NEWT_KEY_RIGHT:
+		case K_ENTER:
+		case K_RIGHT:
 			/* menu */
 			break;
-		case NEWT_KEY_LEFT: {
+		case K_LEFT: {
 			const void *top;
 
 			if (pstack__empty(fstack)) {
@@ -889,21 +932,27 @@
 				continue;
 			}
 			top = pstack__pop(fstack);
-			if (top == &dso_filter)
+			if (top == &browser->hists->dso_filter)
 				goto zoom_out_dso;
-			if (top == &thread_filter)
+			if (top == &browser->hists->thread_filter)
 				goto zoom_out_thread;
 			continue;
 		}
-		case NEWT_KEY_ESCAPE:
+		case K_ESC:
 			if (!left_exits &&
 			    !ui__dialog_yesno("Do you really want to exit?"))
 				continue;
 			/* Fall thru */
-		default:
+		case 'q':
+		case CTRL('c'):
 			goto out_free_stack;
+		default:
+			continue;
 		}
 
+		if (!browser->has_symbols)
+			goto add_exit_option;
+
 		if (browser->selection != NULL &&
 		    browser->selection->sym != NULL &&
 		    !browser->selection->map->dso->annotate_warned &&
@@ -913,14 +962,14 @@
 
 		if (thread != NULL &&
 		    asprintf(&options[nr_options], "Zoom %s %s(%d) thread",
-			     (thread_filter ? "out of" : "into"),
+			     (browser->hists->thread_filter ? "out of" : "into"),
 			     (thread->comm_set ? thread->comm : ""),
 			     thread->pid) > 0)
 			zoom_thread = nr_options++;
 
 		if (dso != NULL &&
 		    asprintf(&options[nr_options], "Zoom %s %s DSO",
-			     (dso_filter ? "out of" : "into"),
+			     (browser->hists->dso_filter ? "out of" : "into"),
 			     (dso->kernel ? "the Kernel" : dso->short_name)) > 0)
 			zoom_dso = nr_options++;
 
@@ -928,7 +977,7 @@
 		    browser->selection->map != NULL &&
 		    asprintf(&options[nr_options], "Browse map details") > 0)
 			browse_map = nr_options++;
-
+add_exit_option:
 		options[nr_options++] = (char *)"Exit";
 
 		choice = ui__popup_menu(nr_options, options);
@@ -948,46 +997,52 @@
 			he = hist_browser__selected_entry(browser);
 			if (he == NULL)
 				continue;
-
-			hist_entry__tui_annotate(he, evsel->idx);
+			/*
+			 * Don't let this be freed, say, by hists__decay_entry.
+			 */
+			he->used = true;
+			hist_entry__tui_annotate(he, evsel->idx, nr_events,
+						 timer, arg, delay_secs);
+			he->used = false;
+			ui_browser__update_nr_entries(&browser->b, browser->hists->nr_entries);
 		} else if (choice == browse_map)
 			map__browse(browser->selection->map);
 		else if (choice == zoom_dso) {
 zoom_dso:
-			if (dso_filter) {
-				pstack__remove(fstack, &dso_filter);
+			if (browser->hists->dso_filter) {
+				pstack__remove(fstack, &browser->hists->dso_filter);
 zoom_out_dso:
 				ui_helpline__pop();
-				dso_filter = NULL;
+				browser->hists->dso_filter = NULL;
+				sort_dso.elide = false;
 			} else {
 				if (dso == NULL)
 					continue;
 				ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s DSO\"",
 						   dso->kernel ? "the Kernel" : dso->short_name);
-				dso_filter = dso;
-				pstack__push(fstack, &dso_filter);
+				browser->hists->dso_filter = dso;
+				sort_dso.elide = true;
+				pstack__push(fstack, &browser->hists->dso_filter);
 			}
-			hists__filter_by_dso(self, dso_filter);
-			hists__browser_title(self, msg, sizeof(msg), ev_name,
-					     dso_filter, thread_filter);
+			hists__filter_by_dso(self);
 			hist_browser__reset(browser);
 		} else if (choice == zoom_thread) {
 zoom_thread:
-			if (thread_filter) {
-				pstack__remove(fstack, &thread_filter);
+			if (browser->hists->thread_filter) {
+				pstack__remove(fstack, &browser->hists->thread_filter);
 zoom_out_thread:
 				ui_helpline__pop();
-				thread_filter = NULL;
+				browser->hists->thread_filter = NULL;
+				sort_thread.elide = false;
 			} else {
 				ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s(%d) thread\"",
 						   thread->comm_set ? thread->comm : "",
 						   thread->pid);
-				thread_filter = thread;
-				pstack__push(fstack, &thread_filter);
+				browser->hists->thread_filter = thread;
+				sort_thread.elide = true;
+				pstack__push(fstack, &browser->hists->thread_filter);
 			}
-			hists__filter_by_thread(self, thread_filter);
-			hists__browser_title(self, msg, sizeof(msg), ev_name,
-					     dso_filter, thread_filter);
+			hists__filter_by_thread(self);
 			hist_browser__reset(browser);
 		}
 	}
@@ -1026,9 +1081,10 @@
 		menu->selection = evsel;
 }
 
-static int perf_evsel_menu__run(struct perf_evsel_menu *menu, const char *help)
+static int perf_evsel_menu__run(struct perf_evsel_menu *menu,
+				int nr_events, const char *help,
+				void(*timer)(void *arg), void *arg, int delay_secs)
 {
-	int exit_keys[] = { NEWT_KEY_ENTER, NEWT_KEY_RIGHT, 0, };
 	struct perf_evlist *evlist = menu->b.priv;
 	struct perf_evsel *pos;
 	const char *ev_name, *title = "Available samples";
@@ -1038,50 +1094,65 @@
 			     "ESC: exit, ENTER|->: Browse histograms") < 0)
 		return -1;
 
-	ui_browser__add_exit_keys(&menu->b, exit_keys);
-
 	while (1) {
-		key = ui_browser__run(&menu->b);
+		key = ui_browser__run(&menu->b, delay_secs);
 
 		switch (key) {
-		case NEWT_KEY_RIGHT:
-		case NEWT_KEY_ENTER:
+		case K_TIMER:
+			timer(arg);
+			continue;
+		case K_RIGHT:
+		case K_ENTER:
 			if (!menu->selection)
 				continue;
 			pos = menu->selection;
 browse_hists:
+			perf_evlist__set_selected(evlist, pos);
+			/*
+			 * Give the calling tool a chance to populate the
+			 * resorted hists tree for the newly selected evsel.
+			 */
+			if (timer)
+				timer(arg);
 			ev_name = event_name(pos);
-			key = perf_evsel__hists_browse(pos, help, ev_name, true);
+			key = perf_evsel__hists_browse(pos, nr_events, help,
+						       ev_name, true, timer,
+						       arg, delay_secs);
 			ui_browser__show_title(&menu->b, title);
-			break;
-		case NEWT_KEY_LEFT:
+			switch (key) {
+			case K_TAB:
+				if (pos->node.next == &evlist->entries)
+					pos = list_entry(evlist->entries.next, struct perf_evsel, node);
+				else
+					pos = list_entry(pos->node.next, struct perf_evsel, node);
+				goto browse_hists;
+			case K_UNTAB:
+				if (pos->node.prev == &evlist->entries)
+					pos = list_entry(evlist->entries.prev, struct perf_evsel, node);
+				else
+					pos = list_entry(pos->node.prev, struct perf_evsel, node);
+				goto browse_hists;
+			case K_ESC:
+				if (!ui__dialog_yesno("Do you really want to exit?"))
+					continue;
+				/* Fall thru */
+			case 'q':
+			case CTRL('c'):
+				goto out;
+			default:
+				continue;
+			}
+		case K_LEFT:
 			continue;
-		case NEWT_KEY_ESCAPE:
+		case K_ESC:
 			if (!ui__dialog_yesno("Do you really want to exit?"))
 				continue;
 			/* Fall thru */
-		default:
-			goto out;
-		}
-
-		switch (key) {
-		case NEWT_KEY_TAB:
-			if (pos->node.next == &evlist->entries)
-				pos = list_entry(evlist->entries.next, struct perf_evsel, node);
-			else
-				pos = list_entry(pos->node.next, struct perf_evsel, node);
-			goto browse_hists;
-		case NEWT_KEY_UNTAB:
-			if (pos->node.prev == &evlist->entries)
-				pos = list_entry(evlist->entries.prev, struct perf_evsel, node);
-			else
-				pos = list_entry(pos->node.prev, struct perf_evsel, node);
-			goto browse_hists;
 		case 'q':
 		case CTRL('c'):
 			goto out;
 		default:
-			break;
+			continue;
 		}
 	}
 
@@ -1091,7 +1162,9 @@
 }
 
 static int __perf_evlist__tui_browse_hists(struct perf_evlist *evlist,
-					   const char *help)
+					   const char *help,
+					   void(*timer)(void *arg), void *arg,
+					   int delay_secs)
 {
 	struct perf_evsel *pos;
 	struct perf_evsel_menu menu = {
@@ -1121,18 +1194,24 @@
 			pos->name = strdup(ev_name);
 	}
 
-	return perf_evsel_menu__run(&menu, help);
+	return perf_evsel_menu__run(&menu, evlist->nr_entries, help, timer,
+				    arg, delay_secs);
 }
 
-int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help)
+int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help,
+				  void(*timer)(void *arg), void *arg,
+				  int delay_secs)
 {
 
 	if (evlist->nr_entries == 1) {
 		struct perf_evsel *first = list_entry(evlist->entries.next,
 						      struct perf_evsel, node);
 		const char *ev_name = event_name(first);
-		return perf_evsel__hists_browse(first, help, ev_name, false);
+		return perf_evsel__hists_browse(first, evlist->nr_entries, help,
+						ev_name, false, timer, arg,
+						delay_secs);
 	}
 
-	return __perf_evlist__tui_browse_hists(evlist, help);
+	return __perf_evlist__tui_browse_hists(evlist, help,
+					       timer, arg, delay_secs);
 }
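
With the timer/arg/delay_secs triplet added above, callers such as the live
tools can have the hists browser refresh itself periodically.  A rough sketch
of a caller, assuming a hypothetical example_refresh callback (not part of
this patch) that resorts the entries before the browser redraws:

	static void example_refresh(void *arg)
	{
		struct perf_top *top = arg;

		/* e.g. decay samples and resort the per-evsel hists here */
	}

	...
	perf_evlist__tui_browse_hists(top->evlist, "Press '?' for help",
				      example_refresh, top, top->delay_secs);
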
diff --git a/tools/perf/util/ui/browsers/map.c b/tools/perf/util/ui/browsers/map.c
index 8462bff..6905bcc 100644
--- a/tools/perf/util/ui/browsers/map.c
+++ b/tools/perf/util/ui/browsers/map.c
@@ -1,5 +1,6 @@
 #include "../libslang.h"
 #include <elf.h>
+#include <newt.h>
 #include <inttypes.h>
 #include <sys/ttydefaults.h>
 #include <ctype.h>
@@ -108,11 +109,8 @@
 			     verbose ? "" : "restart with -v to use") < 0)
 		return -1;
 
-	if (verbose)
-		ui_browser__add_exit_key(&self->b, '/');
-
 	while (1) {
-		key = ui_browser__run(&self->b);
+		key = ui_browser__run(&self->b, 0);
 
 		if (verbose && key == '/')
 			map_browser__search(self);
diff --git a/tools/perf/util/ui/browsers/top.c b/tools/perf/util/ui/browsers/top.c
deleted file mode 100644
index 88403cf..0000000
--- a/tools/perf/util/ui/browsers/top.c
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
- *
- * Parts came from builtin-{top,stat,record}.c, see those files for further
- * copyright notes.
- *
- * Released under the GPL v2. (and only v2, not any later version)
- */
-#include "../browser.h"
-#include "../../annotate.h"
-#include "../helpline.h"
-#include "../libslang.h"
-#include "../util.h"
-#include "../../evlist.h"
-#include "../../hist.h"
-#include "../../sort.h"
-#include "../../symbol.h"
-#include "../../top.h"
-
-struct perf_top_browser {
-	struct ui_browser b;
-	struct rb_root	  root;
-	struct sym_entry  *selection;
-	float		  sum_ksamples;
-	int		  dso_width;
-	int		  dso_short_width;
-	int		  sym_width;
-};
-
-static void perf_top_browser__write(struct ui_browser *browser, void *entry, int row)
-{
-	struct perf_top_browser *top_browser = container_of(browser, struct perf_top_browser, b);
-	struct sym_entry *syme = rb_entry(entry, struct sym_entry, rb_node);
-	bool current_entry = ui_browser__is_current_entry(browser, row);
-	struct symbol *symbol = sym_entry__symbol(syme);
-	struct perf_top *top = browser->priv;
-	int width = browser->width;
-	double pcnt;
-
-	pcnt = 100.0 - (100.0 * ((top_browser->sum_ksamples - syme->snap_count) /
-				 top_browser->sum_ksamples));
-	ui_browser__set_percent_color(browser, pcnt, current_entry);
-
-	if (top->evlist->nr_entries == 1 || !top->display_weighted) {
-		slsmg_printf("%20.2f ", syme->weight);
-		width -= 24;
-	} else {
-		slsmg_printf("%9.1f %10ld ", syme->weight, syme->snap_count);
-		width -= 23;
-	}
-
-	slsmg_printf("%4.1f%%", pcnt);
-	width -= 7;
-
-	if (verbose) {
-		slsmg_printf(" %016" PRIx64, symbol->start);
-		width -= 17;
-	}
-
-	slsmg_printf(" %-*.*s ", top_browser->sym_width, top_browser->sym_width,
-		     symbol->name);
-	width -= top_browser->sym_width;
-	slsmg_write_nstring(width >= syme->map->dso->long_name_len ?
-				syme->map->dso->long_name :
-				syme->map->dso->short_name, width);
-
-	if (current_entry)
-		top_browser->selection = syme;
-}
-
-static void perf_top_browser__update_rb_tree(struct perf_top_browser *browser)
-{
-	struct perf_top *top = browser->b.priv;
-	u64 top_idx = browser->b.top_idx;
-
-	browser->root = RB_ROOT;
-	browser->b.top = NULL;
-	browser->sum_ksamples = perf_top__decay_samples(top, &browser->root);
-	/*
- 	 * No active symbols
- 	 */
-	if (top->rb_entries == 0)
-		return;
-
-	perf_top__find_widths(top, &browser->root, &browser->dso_width,
-			      &browser->dso_short_width,
-                              &browser->sym_width);
-	if (browser->sym_width + browser->dso_width > browser->b.width - 29) {
-		browser->dso_width = browser->dso_short_width;
-		if (browser->sym_width + browser->dso_width > browser->b.width - 29)
-			browser->sym_width = browser->b.width - browser->dso_width - 29;
-	}
-
-	/*
-	 * Adjust the ui_browser indexes since the entries in the browser->root
-	 * rb_tree may have changed, then seek it from start, so that we get a
-	 * possible new top of the screen.
- 	 */
-	browser->b.nr_entries = top->rb_entries;
-
-	if (top_idx >= browser->b.nr_entries) {
-		if (browser->b.height >= browser->b.nr_entries)
-			top_idx = browser->b.nr_entries - browser->b.height;
-		else
-			top_idx = 0;
-	}
-
-	if (browser->b.index >= top_idx + browser->b.height)
-		browser->b.index = top_idx + browser->b.index - browser->b.top_idx;
-
-	if (browser->b.index >= browser->b.nr_entries)
-		browser->b.index = browser->b.nr_entries - 1;
-
-	browser->b.top_idx = top_idx;
-	browser->b.seek(&browser->b, top_idx, SEEK_SET);
-}
-
-static void perf_top_browser__annotate(struct perf_top_browser *browser)
-{
-	struct sym_entry *syme = browser->selection;
-	struct symbol *sym = sym_entry__symbol(syme);
-	struct annotation *notes = symbol__annotation(sym);
-	struct perf_top *top = browser->b.priv;
-
-	if (notes->src != NULL)
-		goto do_annotation;
-
-	pthread_mutex_lock(&notes->lock);
-
-	top->sym_filter_entry = NULL;
-
-	if (symbol__alloc_hist(sym, top->evlist->nr_entries) < 0) {
-		pr_err("Not enough memory for annotating '%s' symbol!\n",
-		       sym->name);
-		pthread_mutex_unlock(&notes->lock);
-		return;
-	}
-
-	top->sym_filter_entry = syme;
-
-	pthread_mutex_unlock(&notes->lock);
-do_annotation:
-	symbol__tui_annotate(sym, syme->map, 0, top->delay_secs * 1000);
-}
-
-static int perf_top_browser__run(struct perf_top_browser *browser)
-{
-	int key;
-	char title[160];
-	struct perf_top *top = browser->b.priv;
-	int delay_msecs = top->delay_secs * 1000;
-	int exit_keys[] = { 'a', NEWT_KEY_ENTER, NEWT_KEY_RIGHT, 0, };
-
-	perf_top_browser__update_rb_tree(browser);
-        perf_top__header_snprintf(top, title, sizeof(title));
-        perf_top__reset_sample_counters(top);
-
-	if (ui_browser__show(&browser->b, title,
-			     "ESC: exit, ENTER|->|a: Live Annotate") < 0)
-		return -1;
-
-	newtFormSetTimer(browser->b.form, delay_msecs);
-	ui_browser__add_exit_keys(&browser->b, exit_keys);
-
-	while (1) {
-		key = ui_browser__run(&browser->b);
-
-		switch (key) {
-		case -1:
-			/* FIXME we need to check if it was es.reason == NEWT_EXIT_TIMER */
-			perf_top_browser__update_rb_tree(browser);
-			perf_top__header_snprintf(top, title, sizeof(title));
-			perf_top__reset_sample_counters(top);
-			ui_browser__set_color(&browser->b, NEWT_COLORSET_ROOT);
-			SLsmg_gotorc(0, 0);
-			slsmg_write_nstring(title, browser->b.width);
-			break;
-		case 'a':
-		case NEWT_KEY_RIGHT:
-		case NEWT_KEY_ENTER:
-			if (browser->selection)
-				perf_top_browser__annotate(browser);
-			break;
-		case NEWT_KEY_LEFT:
-			continue;
-		case NEWT_KEY_ESCAPE:
-			if (!ui__dialog_yesno("Do you really want to exit?"))
-				continue;
-			/* Fall thru */
-		default:
-			goto out;
-		}
-	}
-out:
-	ui_browser__hide(&browser->b);
-	return key;
-}
-
-int perf_top__tui_browser(struct perf_top *top)
-{
-	struct perf_top_browser browser = {
-		.b = {
-			.entries = &browser.root,
-			.refresh = ui_browser__rb_tree_refresh,
-			.seek	 = ui_browser__rb_tree_seek,
-			.write	 = perf_top_browser__write,
-			.priv	 = top,
-		},
-	};
-
-	return perf_top_browser__run(&browser);
-}
diff --git a/tools/perf/util/ui/helpline.h b/tools/perf/util/ui/helpline.h
index ab6028d..fdcbc02 100644
--- a/tools/perf/util/ui/helpline.h
+++ b/tools/perf/util/ui/helpline.h
@@ -1,6 +1,9 @@
 #ifndef _PERF_UI_HELPLINE_H_
 #define _PERF_UI_HELPLINE_H_ 1
 
+#include <stdio.h>
+#include <stdarg.h>
+
 void ui_helpline__init(void);
 void ui_helpline__pop(void);
 void ui_helpline__push(const char *msg);
diff --git a/tools/perf/util/ui/keysyms.h b/tools/perf/util/ui/keysyms.h
new file mode 100644
index 0000000..3458b19
--- /dev/null
+++ b/tools/perf/util/ui/keysyms.h
@@ -0,0 +1,25 @@
+#ifndef _PERF_KEYSYMS_H_
+#define _PERF_KEYSYMS_H_ 1
+
+#include "libslang.h"
+
+#define K_DOWN	SL_KEY_DOWN
+#define K_END	SL_KEY_END
+#define K_ENTER	'\r'
+#define K_ESC	033
+#define K_F1	SL_KEY_F(1)
+#define K_HOME	SL_KEY_HOME
+#define K_LEFT	SL_KEY_LEFT
+#define K_PGDN	SL_KEY_NPAGE
+#define K_PGUP	SL_KEY_PPAGE
+#define K_RIGHT	SL_KEY_RIGHT
+#define K_TAB	'\t'
+#define K_UNTAB	SL_KEY_UNTAB
+#define K_UP	SL_KEY_UP
+
+/* Not really keys */
+#define K_TIMER	 -1
+#define K_ERROR	 -2
+#define K_RESIZE -3
+
+#endif /* _PERF_KEYSYMS_H_ */
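
These keysyms replace the newt NEWT_KEY_* constants in all the browsers above.
A minimal sketch of the run-loop pattern they are meant for, using only the
ui_browser API shown in this series; the browser instance is assumed to be
already set up, and this is an illustration, not code from the patch:

	#include "keysyms.h"

	static int example__run(struct ui_browser *browser, int delay_secs)
	{
		int key;

		if (ui_browser__show(browser, "example", "ESC: exit") < 0)
			return -1;

		while (1) {
			key = ui_browser__run(browser, delay_secs);

			switch (key) {
			case K_TIMER:
				/* delay_secs elapsed: refresh whatever is displayed */
				continue;
			case K_ESC:
			case 'q':
				goto out;
			default:
				continue;
			}
		}
	out:
		ui_browser__hide(browser);
		return key;
	}
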
diff --git a/tools/perf/util/ui/libslang.h b/tools/perf/util/ui/libslang.h
index 2b63e1c..4d54b64 100644
--- a/tools/perf/util/ui/libslang.h
+++ b/tools/perf/util/ui/libslang.h
@@ -24,4 +24,6 @@
 #define sltt_set_color SLtt_set_color
 #endif
 
+#define SL_KEY_UNTAB 0x1000
+
 #endif /* _PERF_UI_SLANG_H_ */
diff --git a/tools/perf/util/ui/setup.c b/tools/perf/util/ui/setup.c
index ee46d67..1e6ba06 100644
--- a/tools/perf/util/ui/setup.c
+++ b/tools/perf/util/ui/setup.c
@@ -7,6 +7,7 @@
 #include "browser.h"
 #include "helpline.h"
 #include "ui.h"
+#include "libslang.h"
 
 pthread_mutex_t ui__lock = PTHREAD_MUTEX_INITIALIZER;
 
@@ -17,6 +18,33 @@
 	newtResume();
 }
 
+static int ui__init(void)
+{
+	int err = SLkp_init();
+
+	if (err < 0)
+		goto out;
+
+	SLkp_define_keysym((char *)"^(kB)", SL_KEY_UNTAB);
+out:
+	return err;
+}
+
+static void ui__exit(void)
+{
+	SLtt_set_cursor_visibility(1);
+	SLsmg_refresh();
+	SLsmg_reset_smg();
+	SLang_reset_tty();
+}
+
+static void ui__signal(int sig)
+{
+	ui__exit();
+	psignal(sig, "perf");
+	exit(0);
+}
+
 void setup_browser(bool fallback_to_pager)
 {
 	if (!isatty(1) || !use_browser || dump_trace) {
@@ -28,10 +56,16 @@
 
 	use_browser = 1;
 	newtInit();
-	newtCls();
+	ui__init();
 	newtSetSuspendCallback(newt_suspend, NULL);
 	ui_helpline__init();
 	ui_browser__init();
+
+	signal(SIGSEGV, ui__signal);
+	signal(SIGFPE, ui__signal);
+	signal(SIGINT, ui__signal);
+	signal(SIGQUIT, ui__signal);
+	signal(SIGTERM, ui__signal);
 }
 
 void exit_browser(bool wait_for_ok)
@@ -41,6 +75,6 @@
 			char title[] = "Fatal Error", ok[] = "Ok";
 			newtWinMessage(title, ok, ui_helpline__last_msg);
 		}
-		newtFinished();
+		ui__exit();
 	}
 }