3.0 patches

commit: 410449452dd3868bff8971f247c7f42652d7bab3 [log] [tgz]
author: Greg Kroah-Hartman <gregkh@suse.de> Mon Oct 03 14:36:48 2011 -0700
committer: Greg Kroah-Hartman <gregkh@suse.de> Mon Oct 03 14:36:48 2011 -0700
tree: c85585ed990683c9ef2ce96a98e1a7bcb1fd86e7
parent: 380dd8e0ffba79547f9346ebb6a943eae837c2b5 [diff]
diff --git a/queue-3.0/ide-disk-fix-request-requeuing.patch b/queue-3.0/ide-disk-fix-request-requeuing.patch
new file mode 100644
index 0000000..13b2644
--- /dev/null
+++ b/queue-3.0/ide-disk-fix-request-requeuing.patch

@@ -0,0 +1,47 @@
+From 2c8fc867602e385fd2abe76da0b6bda8ed907547 Mon Sep 17 00:00:00 2001
+From: Borislav Petkov <bp@alien8.de>
+Date: Mon, 3 Oct 2011 14:28:18 -0400
+Subject: ide-disk: Fix request requeuing
+
+From: Borislav Petkov <bp@alien8.de>
+
+commit 2c8fc867602e385fd2abe76da0b6bda8ed907547 upstream.
+
+Simon Kirby reported that on his RAID setup with idedisk underneath
+the box OOMs after a couple of days of runtime. Running with
+CONFIG_DEBUG_KMEMLEAK pointed to idedisk_prep_fn() which unconditionally
+allocates an ide_cmd struct. However, ide_requeue_and_plug() can be
+called more than once per request, either from the request issue or the
+IRQ handler path and so blk_peek_request() ends up in idedisk_prep_fn()
+repeatedly, allocating a struct ide_cmd every time and "forgetting" the
+previous pointer.
+
+Make sure the code reuses the old allocated chunk.
+
+Reported-and-tested-by: Simon Kirby <sim@hostway.ca>
+Link: http://marc.info/?l=linux-kernel&m=131667641517919
+Link: http://lkml.kernel.org/r/20110922072643.GA27232@hostway.ca
+Signed-off-by: Borislav Petkov <bp@alien8.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/ide/ide-disk.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/drivers/ide/ide-disk.c
++++ b/drivers/ide/ide-disk.c
+@@ -435,7 +435,12 @@ static int idedisk_prep_fn(struct reques
+ 	if (!(rq->cmd_flags & REQ_FLUSH))
+ 		return BLKPREP_OK;
+ 
+-	cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC);
++	if (rq->special) {
++		cmd = rq->special;
++		memset(cmd, 0, sizeof(*cmd));
++	} else {
++		cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC);
++	}
+ 
+ 	/* FIXME: map struct ide_taskfile on rq->cmd[] */
+ 	BUG_ON(cmd == NULL);

diff --git a/queue-3.0/pci-don-t-crash-when-reading-mpss-from-root-complex.patch b/queue-3.0/pci-don-t-crash-when-reading-mpss-from-root-complex.patch
new file mode 100644
index 0000000..9fcb7fd
--- /dev/null
+++ b/queue-3.0/pci-don-t-crash-when-reading-mpss-from-root-complex.patch

@@ -0,0 +1,41 @@
+From 1a4b1a41b8a3d5256019854e851beed063b34344 Mon Sep 17 00:00:00 2001
+From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Date: Tue, 13 Sep 2011 15:16:33 -0300
+Subject: pci: Don't crash when reading mpss from root complex
+
+From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+
+commit 1a4b1a41b8a3d5256019854e851beed063b34344 upstream.
+
+In pcie_find_smpss(), we have the following statement:
+
+ 	if (dev->is_hotplug_bridge && (!list_is_singular(&dev->bus->devices) ||
+	    dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT))
+
+The problem is that at least on my machine, this gets called for the
+root complex (virtual P2P bridge), and dev->bus->self is NULL since
+the parent bus for this is not itself anchored to a PCI device.
+
+This adds the necessary NULL check.
+
+Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Acked-by: Jon Mason <mason@myri.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/pci/probe.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/pci/probe.c
++++ b/drivers/pci/probe.c
+@@ -1352,7 +1352,8 @@ static int pcie_find_smpss(struct pci_de
+ 	 * will occur as normal.
+ 	 */
+ 	if (dev->is_hotplug_bridge && (!list_is_singular(&dev->bus->devices) ||
+-	    dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT))
++	     (dev->bus->self &&
++	      dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT)))
+ 		*smpss = 0;
+ 
+ 	if (*smpss > dev->pcie_mpss)

diff --git a/queue-3.0/pci-export-pcie_bus_configure_settings-symbol.patch b/queue-3.0/pci-export-pcie_bus_configure_settings-symbol.patch
new file mode 100644
index 0000000..1812baf
--- /dev/null
+++ b/queue-3.0/pci-export-pcie_bus_configure_settings-symbol.patch

@@ -0,0 +1,31 @@
+From debc3b778508f59696ff188f0feca271dcbfa7d9 Mon Sep 17 00:00:00 2001
+From: Jon Mason <mason@myri.com>
+Date: Tue, 2 Aug 2011 00:01:18 -0500
+Subject: PCI: export pcie_bus_configure_settings symbol
+
+From: Jon Mason <mason@myri.com>
+
+commit debc3b778508f59696ff188f0feca271dcbfa7d9 upstream.
+
+pcie_bus_configure_settings needs to be exported if the PCI hotplug
+driver is being compiled as a module.
+
+Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
+Signed-off-by: Jon Mason <mason@myri.com>
+Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/pci/probe.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/pci/probe.c
++++ b/drivers/pci/probe.c
+@@ -1471,6 +1471,7 @@ void pcie_bus_configure_settings(struct
+ 	pcie_bus_configure_set(bus->self, &smpss);
+ 	pci_walk_bus(bus, pcie_bus_configure_set, &smpss);
+ }
++EXPORT_SYMBOL_GPL(pcie_bus_configure_settings);
+ 
+ unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
+ {

diff --git a/queue-3.0/pci-remove-mrrs-modification-from-mps-setting-code.patch b/queue-3.0/pci-remove-mrrs-modification-from-mps-setting-code.patch
new file mode 100644
index 0000000..7c0f5cd
--- /dev/null
+++ b/queue-3.0/pci-remove-mrrs-modification-from-mps-setting-code.patch

@@ -0,0 +1,117 @@
+From ed2888e906b56769b4ffabb9c577190438aa68b8 Mon Sep 17 00:00:00 2001
+From: Jon Mason <mason@myri.com>
+Date: Thu, 8 Sep 2011 16:41:18 -0500
+Subject: PCI: Remove MRRS modification from MPS setting code
+
+From: Jon Mason <mason@myri.com>
+
+commit ed2888e906b56769b4ffabb9c577190438aa68b8 upstream.
+
+Modifying the Maximum Read Request Size to 0 (value of 128Bytes) has
+massive negative ramifications on some devices.  Without knowing which
+devices have this issue, do not modify from the default value when
+walking the PCI-E bus in pcie_bus_safe mode.  Also, make pcie_bus_safe
+the default procedure.
+
+Tested-by: Sven Schnelle <svens@stackframe.org>
+Tested-by: Simon Kirby <sim@hostway.ca>
+Tested-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
+Reported-and-tested-by: Eric Dumazet <eric.dumazet@gmail.com>
+Reported-and-tested-by: Niels Ole Salscheider <niels_ole@salscheider-online.de>
+References: https://bugzilla.kernel.org/show_bug.cgi?id=42162
+Signed-off-by: Jon Mason <mason@myri.com>
+Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/pci/pci.c   |    2 +-
+ drivers/pci/probe.c |   45 ++++++++++++++++++++++++---------------------
+ 2 files changed, 25 insertions(+), 22 deletions(-)
+
+--- a/drivers/pci/pci.c
++++ b/drivers/pci/pci.c
+@@ -77,7 +77,7 @@ unsigned long pci_cardbus_mem_size = DEF
+ unsigned long pci_hotplug_io_size  = DEFAULT_HOTPLUG_IO_SIZE;
+ unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE;
+ 
+-enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE;
++enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_SAFE;
+ 
+ /*
+  * The default CLS is used if arch didn't set CLS explicitly and not
+--- a/drivers/pci/probe.c
++++ b/drivers/pci/probe.c
+@@ -1397,34 +1397,37 @@ static void pcie_write_mps(struct pci_de
+ 
+ static void pcie_write_mrrs(struct pci_dev *dev, int mps)
+ {
+-	int rc, mrrs;
+-
+-	if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
+-		int dev_mpss = 128 << dev->pcie_mpss;
+-
+-		/* For Max performance, the MRRS must be set to the largest
+-		 * supported value.  However, it cannot be configured larger
+-		 * than the MPS the device or the bus can support.  This assumes
+-		 * that the largest MRRS available on the device cannot be
+-		 * smaller than the device MPSS.
+-		 */
+-		mrrs = mps < dev_mpss ? mps : dev_mpss;
+-	} else
+-		/* In the "safe" case, configure the MRRS for fairness on the
+-		 * bus by making all devices have the same size
+-		 */
+-		mrrs = mps;
++	int rc, mrrs, dev_mpss;
+ 
++	/* In the "safe" case, do not configure the MRRS.  There appear to be
++	 * issues with setting MRRS to 0 on a number of devices.
++	 */
++
++	if (pcie_bus_config != PCIE_BUS_PERFORMANCE)
++		return;
++
++	dev_mpss = 128 << dev->pcie_mpss;
++
++	/* For Max performance, the MRRS must be set to the largest supported
++	 * value.  However, it cannot be configured larger than the MPS the
++	 * device or the bus can support.  This assumes that the largest MRRS
++	 * available on the device cannot be smaller than the device MPSS.
++	 */
++	mrrs = min(mps, dev_mpss);
+ 
+ 	/* MRRS is a R/W register.  Invalid values can be written, but a
+-	 * subsiquent read will verify if the value is acceptable or not.
++	 * subsequent read will verify if the value is acceptable or not.
+ 	 * If the MRRS value provided is not acceptable (e.g., too large),
+ 	 * shrink the value until it is acceptable to the HW.
+  	 */
+ 	while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) {
++		dev_warn(&dev->dev, "Attempting to modify the PCI-E MRRS value"
++			 " to %d.  If any issues are encountered, please try "
++			 "running with pci=pcie_bus_safe\n", mrrs);
+ 		rc = pcie_set_readrq(dev, mrrs);
+ 		if (rc)
+-			dev_err(&dev->dev, "Failed attempting to set the MRRS\n");
++			dev_err(&dev->dev,
++				"Failed attempting to set the MRRS\n");
+ 
+ 		mrrs /= 2;
+ 	}
+@@ -1437,13 +1440,13 @@ static int pcie_bus_configure_set(struct
+ 	if (!pci_is_pcie(dev))
+ 		return 0;
+ 
+-	dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
++	dev_dbg(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
+ 		 pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
+ 
+ 	pcie_write_mps(dev, mps);
+ 	pcie_write_mrrs(dev, mps);
+ 
+-	dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
++	dev_dbg(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
+ 		 pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
+ 
+ 	return 0;

diff --git a/queue-3.0/pci-series b/queue-3.0/pci-series
new file mode 100644
index 0000000..2d8d8ad
--- /dev/null
+++ b/queue-3.0/pci-series

@@ -0,0 +1,4 @@
+pci-set-pci-e-max-payload-size-on-fabric.patch
+pci-export-pcie_bus_configure_settings-symbol.patch
+pci-remove-mrrs-modification-from-mps-setting-code.patch
+pci-don-t-crash-when-reading-mpss-from-root-complex.patch

diff --git a/queue-3.0/pci-set-pci-e-max-payload-size-on-fabric.patch b/queue-3.0/pci-set-pci-e-max-payload-size-on-fabric.patch
new file mode 100644
index 0000000..6ca45f3
--- /dev/null
+++ b/queue-3.0/pci-set-pci-e-max-payload-size-on-fabric.patch

@@ -0,0 +1,445 @@
+From b03e7495a862b028294f59fc87286d6d78ee7fa1 Mon Sep 17 00:00:00 2001
+From: Jon Mason <mason@myri.com>
+Date: Wed, 20 Jul 2011 15:20:54 -0500
+Subject: PCI: Set PCI-E Max Payload Size on fabric
+
+From: Jon Mason <mason@myri.com>
+
+commit b03e7495a862b028294f59fc87286d6d78ee7fa1 upstream.
+
+On a given PCI-E fabric, each device, bridge, and root port can have a
+different PCI-E maximum payload size.  There is a sizable performance
+boost for having the largest possible maximum payload size on each PCI-E
+device.  However, if improperly configured, fatal bus errors can occur.
+Thus, it is important to ensure that PCI-E payloads sent by a device
+are never larger than the MPS setting of all devices on the way to the
+destination.
+
+This can be achieved two ways:
+
+- A conservative approach is to use the smallest common denominator of
+  the entire tree below a root complex for every device on that fabric.
+
+This means for example that having a 128 bytes MPS USB controller on one
+leg of a switch will dramatically reduce the performance of a video card or
+10GE adapter on another leg of that same switch.
+
+It also means that any hierarchy supporting hotplug slots (including
+expresscard or thunderbolt I suppose, dbl check that) will have to be
+entirely clamped to 128 bytes since we cannot predict what will be
+plugged into those slots, and we cannot change the MPS on a "live"
+system.
+
+- A more optimal way is possible, if it falls within a couple of
+  constraints:
+* The top-level host bridge will never generate packets larger than the
+  smallest TLP (or if it can be controlled independently from its MPS at
+  least)
+* The device will never generate packets larger than MPS (which can be
+  configured via MRRS)
+* No support of direct PCI-E <-> PCI-E transfers between devices without
+  some additional code to specifically deal with that case
+
+Then we can use an approach that basically ignores downstream requests
+and focuses exclusively on upstream requests. In that case, all we need
+to care about is that a device MPS is no larger than its parent MPS,
+which allows us to keep all switches/bridges to the max MPS supported by
+their parent and eventually the PHB.
+
+In this case, your USB controller would no longer "starve" your 10GE
+Ethernet and your hotplug slots won't affect your global MPS.
+Additionally, the hotplugged devices themselves can be configured to a
+larger MPS up to the value configured in the hotplug bridge.
+
+To choose between the two available options, two PCI kernel boot args
+have been added to the PCI calls.  "pcie_bus_safe" will provide the
+former behavior, while "pcie_bus_perf" will perform the latter behavior.
+By default, the latter behavior is used.
+
+NOTE: due to the location of the enablement, each arch will need to add
+calls to this function.  This patch only enables x86.
+
+This patch includes a number of changes recommended by Benjamin
+Herrenschmidt.
+
+Tested-by: Jordan_Hargrave@dell.com
+Signed-off-by: Jon Mason <mason@myri.com>
+Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ arch/x86/pci/acpi.c              |    9 ++
+ drivers/pci/hotplug/pcihp_slot.c |   45 ------------
+ drivers/pci/pci.c                |   67 ++++++++++++++++++
+ drivers/pci/probe.c              |  145 +++++++++++++++++++++++++++++++++++++++
+ include/linux/pci.h              |   15 +++-
+ 5 files changed, 236 insertions(+), 45 deletions(-)
+
+--- a/arch/x86/pci/acpi.c
++++ b/arch/x86/pci/acpi.c
+@@ -361,6 +361,15 @@ struct pci_bus * __devinit pci_acpi_scan
+ 		}
+ 	}
+ 
++	/* After the PCI-E bus has been walked and all devices discovered,
++	 * configure any settings of the fabric that might be necessary.
++	 */
++	if (bus) {
++		struct pci_bus *child;
++		list_for_each_entry(child, &bus->children, node)
++			pcie_bus_configure_settings(child, child->self->pcie_mpss);
++	}
++
+ 	if (!bus)
+ 		kfree(sd);
+ 
+--- a/drivers/pci/hotplug/pcihp_slot.c
++++ b/drivers/pci/hotplug/pcihp_slot.c
+@@ -158,47 +158,6 @@ static void program_hpp_type2(struct pci
+ 	 */
+ }
+ 
+-/* Program PCIE MaxPayload setting on device: ensure parent maxpayload <= device */
+-static int pci_set_payload(struct pci_dev *dev)
+-{
+-       int pos, ppos;
+-       u16 pctl, psz;
+-       u16 dctl, dsz, dcap, dmax;
+-       struct pci_dev *parent;
+-
+-       parent = dev->bus->self;
+-       pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
+-       if (!pos)
+-               return 0;
+-
+-       /* Read Device MaxPayload capability and setting */
+-       pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &dctl);
+-       pci_read_config_word(dev, pos + PCI_EXP_DEVCAP, &dcap);
+-       dsz = (dctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;
+-       dmax = (dcap & PCI_EXP_DEVCAP_PAYLOAD);
+-
+-       /* Read Parent MaxPayload setting */
+-       ppos = pci_find_capability(parent, PCI_CAP_ID_EXP);
+-       if (!ppos)
+-               return 0;
+-       pci_read_config_word(parent, ppos + PCI_EXP_DEVCTL, &pctl);
+-       psz = (pctl &  PCI_EXP_DEVCTL_PAYLOAD) >> 5;
+-
+-       /* If parent payload > device max payload -> error
+-        * If parent payload > device payload -> set speed
+-        * If parent payload <= device payload -> do nothing
+-        */
+-       if (psz > dmax)
+-               return -1;
+-       else if (psz > dsz) {
+-               dev_info(&dev->dev, "Setting MaxPayload to %d\n", 128 << psz);
+-               pci_write_config_word(dev, pos + PCI_EXP_DEVCTL,
+-                                     (dctl & ~PCI_EXP_DEVCTL_PAYLOAD) +
+-                                     (psz << 5));
+-       }
+-       return 0;
+-}
+-
+ void pci_configure_slot(struct pci_dev *dev)
+ {
+ 	struct pci_dev *cdev;
+@@ -210,9 +169,7 @@ void pci_configure_slot(struct pci_dev *
+ 			(dev->class >> 8) == PCI_CLASS_BRIDGE_PCI)))
+ 		return;
+ 
+-       ret = pci_set_payload(dev);
+-       if (ret)
+-               dev_warn(&dev->dev, "could not set device max payload\n");
++	pcie_bus_configure_settings(dev->bus, dev->bus->self->pcie_mpss);
+ 
+ 	memset(&hpp, 0, sizeof(hpp));
+ 	ret = pci_get_hp_params(dev, &hpp);
+--- a/drivers/pci/pci.c
++++ b/drivers/pci/pci.c
+@@ -77,6 +77,8 @@ unsigned long pci_cardbus_mem_size = DEF
+ unsigned long pci_hotplug_io_size  = DEFAULT_HOTPLUG_IO_SIZE;
+ unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE;
+ 
++enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE;
++
+ /*
+  * The default CLS is used if arch didn't set CLS explicitly and not
+  * all pci devices agree on the same value.  Arch can override either
+@@ -3223,6 +3225,67 @@ out:
+ EXPORT_SYMBOL(pcie_set_readrq);
+ 
+ /**
++ * pcie_get_mps - get PCI Express maximum payload size
++ * @dev: PCI device to query
++ *
++ * Returns maximum payload size in bytes
++ *    or appropriate error value.
++ */
++int pcie_get_mps(struct pci_dev *dev)
++{
++	int ret, cap;
++	u16 ctl;
++
++	cap = pci_pcie_cap(dev);
++	if (!cap)
++		return -EINVAL;
++
++	ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
++	if (!ret)
++		ret = 128 << ((ctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5);
++
++	return ret;
++}
++
++/**
++ * pcie_set_mps - set PCI Express maximum payload size
++ * @dev: PCI device to query
++ * @mps: maximum payload size in bytes
++ *    valid values are 128, 256, 512, 1024, 2048, 4096
++ *
++ * If possible sets maximum payload size
++ */
++int pcie_set_mps(struct pci_dev *dev, int mps)
++{
++	int cap, err = -EINVAL;
++	u16 ctl, v;
++
++	if (mps < 128 || mps > 4096 || !is_power_of_2(mps))
++		goto out;
++
++	v = ffs(mps) - 8;
++	if (v > dev->pcie_mpss)
++		goto out;
++	v <<= 5;
++
++	cap = pci_pcie_cap(dev);
++	if (!cap)
++		goto out;
++
++	err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
++	if (err)
++		goto out;
++
++	if ((ctl & PCI_EXP_DEVCTL_PAYLOAD) != v) {
++		ctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
++		ctl |= v;
++		err = pci_write_config_word(dev, cap + PCI_EXP_DEVCTL, ctl);
++	}
++out:
++	return err;
++}
++
++/**
+  * pci_select_bars - Make BAR mask from the type of resource
+  * @dev: the PCI device for which BAR mask is made
+  * @flags: resource type mask to be selected
+@@ -3505,6 +3568,10 @@ static int __init pci_setup(char *str)
+ 				pci_hotplug_io_size = memparse(str + 9, &str);
+ 			} else if (!strncmp(str, "hpmemsize=", 10)) {
+ 				pci_hotplug_mem_size = memparse(str + 10, &str);
++			} else if (!strncmp(str, "pcie_bus_safe", 13)) {
++				pcie_bus_config = PCIE_BUS_SAFE;
++			} else if (!strncmp(str, "pcie_bus_perf", 13)) {
++				pcie_bus_config = PCIE_BUS_PERFORMANCE;
+ 			} else {
+ 				printk(KERN_ERR "PCI: Unknown option `%s'\n",
+ 						str);
+--- a/drivers/pci/probe.c
++++ b/drivers/pci/probe.c
+@@ -860,6 +860,8 @@ void set_pcie_port_type(struct pci_dev *
+ 	pdev->pcie_cap = pos;
+ 	pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
+ 	pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
++	pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
++	pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
+ }
+ 
+ void set_pcie_hotplug_bridge(struct pci_dev *pdev)
+@@ -1327,6 +1329,149 @@ int pci_scan_slot(struct pci_bus *bus, i
+ 	return nr;
+ }
+ 
++static int pcie_find_smpss(struct pci_dev *dev, void *data)
++{
++	u8 *smpss = data;
++
++	if (!pci_is_pcie(dev))
++		return 0;
++
++	/* For PCIE hotplug enabled slots not connected directly to a
++	 * PCI-E root port, there can be problems when hotplugging
++	 * devices.  This is due to the possibility of hotplugging a
++	 * device into the fabric with a smaller MPS that the devices
++	 * currently running have configured.  Modifying the MPS on the
++	 * running devices could cause a fatal bus error due to an
++	 * incoming frame being larger than the newly configured MPS.
++	 * To work around this, the MPS for the entire fabric must be
++	 * set to the minimum size.  Any devices hotplugged into this
++	 * fabric will have the minimum MPS set.  If the PCI hotplug
++	 * slot is directly connected to the root port and there are not
++	 * other devices on the fabric (which seems to be the most
++	 * common case), then this is not an issue and MPS discovery
++	 * will occur as normal.
++	 */
++	if (dev->is_hotplug_bridge && (!list_is_singular(&dev->bus->devices) ||
++	    dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT))
++		*smpss = 0;
++
++	if (*smpss > dev->pcie_mpss)
++		*smpss = dev->pcie_mpss;
++
++	return 0;
++}
++
++static void pcie_write_mps(struct pci_dev *dev, int mps)
++{
++	int rc, dev_mpss;
++
++	dev_mpss = 128 << dev->pcie_mpss;
++
++	if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
++		if (dev->bus->self) {
++			dev_dbg(&dev->bus->dev, "Bus MPSS %d\n",
++				128 << dev->bus->self->pcie_mpss);
++
++			/* For "MPS Force Max", the assumption is made that
++			 * downstream communication will never be larger than
++			 * the MRRS.  So, the MPS only needs to be configured
++			 * for the upstream communication.  This being the case,
++			 * walk from the top down and set the MPS of the child
++			 * to that of the parent bus.
++			 */
++			mps = 128 << dev->bus->self->pcie_mpss;
++			if (mps > dev_mpss)
++				dev_warn(&dev->dev, "MPS configured higher than"
++					 " maximum supported by the device.  If"
++					 " a bus issue occurs, try running with"
++					 " pci=pcie_bus_safe.\n");
++		}
++
++		dev->pcie_mpss = ffs(mps) - 8;
++	}
++
++	rc = pcie_set_mps(dev, mps);
++	if (rc)
++		dev_err(&dev->dev, "Failed attempting to set the MPS\n");
++}
++
++static void pcie_write_mrrs(struct pci_dev *dev, int mps)
++{
++	int rc, mrrs;
++
++	if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
++		int dev_mpss = 128 << dev->pcie_mpss;
++
++		/* For Max performance, the MRRS must be set to the largest
++		 * supported value.  However, it cannot be configured larger
++		 * than the MPS the device or the bus can support.  This assumes
++		 * that the largest MRRS available on the device cannot be
++		 * smaller than the device MPSS.
++		 */
++		mrrs = mps < dev_mpss ? mps : dev_mpss;
++	} else
++		/* In the "safe" case, configure the MRRS for fairness on the
++		 * bus by making all devices have the same size
++		 */
++		mrrs = mps;
++
++
++	/* MRRS is a R/W register.  Invalid values can be written, but a
++	 * subsiquent read will verify if the value is acceptable or not.
++	 * If the MRRS value provided is not acceptable (e.g., too large),
++	 * shrink the value until it is acceptable to the HW.
++ 	 */
++	while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) {
++		rc = pcie_set_readrq(dev, mrrs);
++		if (rc)
++			dev_err(&dev->dev, "Failed attempting to set the MRRS\n");
++
++		mrrs /= 2;
++	}
++}
++
++static int pcie_bus_configure_set(struct pci_dev *dev, void *data)
++{
++	int mps = 128 << *(u8 *)data;
++
++	if (!pci_is_pcie(dev))
++		return 0;
++
++	dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
++		 pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
++
++	pcie_write_mps(dev, mps);
++	pcie_write_mrrs(dev, mps);
++
++	dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
++		 pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
++
++	return 0;
++}
++
++/* pcie_bus_configure_mps requires that pci_walk_bus work in a top-down,
++ * parents then children fashion.  If this changes, then this code will not
++ * work as designed.
++ */
++void pcie_bus_configure_settings(struct pci_bus *bus, u8 mpss)
++{
++	u8 smpss = mpss;
++
++	if (!bus->self)
++		return;
++
++	if (!pci_is_pcie(bus->self))
++		return;
++
++	if (pcie_bus_config == PCIE_BUS_SAFE) {
++		pcie_find_smpss(bus->self, &smpss);
++		pci_walk_bus(bus, pcie_find_smpss, &smpss);
++	}
++
++	pcie_bus_configure_set(bus->self, &smpss);
++	pci_walk_bus(bus, pcie_bus_configure_set, &smpss);
++}
++
+ unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
+ {
+ 	unsigned int devfn, pass, max = bus->secondary;
+--- a/include/linux/pci.h
++++ b/include/linux/pci.h
+@@ -251,7 +251,8 @@ struct pci_dev {
+ 	u8		revision;	/* PCI revision, low byte of class word */
+ 	u8		hdr_type;	/* PCI header type (`multi' flag masked out) */
+ 	u8		pcie_cap;	/* PCI-E capability offset */
+-	u8		pcie_type;	/* PCI-E device/port type */
++	u8		pcie_type:4;	/* PCI-E device/port type */
++	u8		pcie_mpss:3;	/* PCI-E Max Payload Size Supported */
+ 	u8		rom_base_reg;	/* which config register controls the ROM */
+ 	u8		pin;  		/* which interrupt pin this device uses */
+ 
+@@ -617,6 +618,16 @@ struct pci_driver {
+ /* these external functions are only available when PCI support is enabled */
+ #ifdef CONFIG_PCI
+ 
++extern void pcie_bus_configure_settings(struct pci_bus *bus, u8 smpss);
++
++enum pcie_bus_config_types {
++	PCIE_BUS_PERFORMANCE,
++	PCIE_BUS_SAFE,
++	PCIE_BUS_PEER2PEER,
++};
++
++extern enum pcie_bus_config_types pcie_bus_config;
++
+ extern struct bus_type pci_bus_type;
+ 
+ /* Do NOT directly access these two variables, unless you are arch specific pci
+@@ -796,6 +807,8 @@ int pcix_get_mmrbc(struct pci_dev *dev);
+ int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc);
+ int pcie_get_readrq(struct pci_dev *dev);
+ int pcie_set_readrq(struct pci_dev *dev, int rq);
++int pcie_get_mps(struct pci_dev *dev);
++int pcie_set_mps(struct pci_dev *dev, int mps);
+ int __pci_reset_function(struct pci_dev *dev);
+ int pci_reset_function(struct pci_dev *dev);
+ void pci_update_resource(struct pci_dev *dev, int resno);

diff --git a/queue-3.0/sched-fix-up-wchan-borkage.patch b/queue-3.0/sched-fix-up-wchan-borkage.patch
new file mode 100644
index 0000000..bc7eff7
--- /dev/null
+++ b/queue-3.0/sched-fix-up-wchan-borkage.patch

@@ -0,0 +1,35 @@
+From 6ebbe7a07b3bc40b168d2afc569a6543c020d2e3 Mon Sep 17 00:00:00 2001
+From: Simon Kirby <sim@hostway.ca>
+Date: Thu, 22 Sep 2011 17:03:46 -0700
+Subject: sched: Fix up wchan borkage
+
+From: Simon Kirby <sim@hostway.ca>
+
+commit 6ebbe7a07b3bc40b168d2afc569a6543c020d2e3 upstream.
+
+Commit c259e01a1ec ("sched: Separate the scheduler entry for
+preemption") contained a boo-boo wrecking wchan output. It forgot to
+put the new schedule() function in the __sched section and thereby
+doesn't get properly ignored for things like wchan.
+
+Tested-by: Simon Kirby <sim@hostway.ca>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110923000346.GA25425@hostway.ca
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ kernel/sched.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -4335,7 +4335,7 @@ static inline void sched_submit_work(str
+ 		blk_schedule_flush_plug(tsk);
+ }
+ 
+-asmlinkage void schedule(void)
++asmlinkage void __sched schedule(void)
+ {
+ 	struct task_struct *tsk = current;
+ 

diff --git a/queue-3.0/sched-rt-migrate-equal-priority-tasks-to-available-cpus.patch b/queue-3.0/sched-rt-migrate-equal-priority-tasks-to-available-cpus.patch
new file mode 100644
index 0000000..4f3aba4
--- /dev/null
+++ b/queue-3.0/sched-rt-migrate-equal-priority-tasks-to-available-cpus.patch

@@ -0,0 +1,57 @@
+From 3be209a8e22cedafc1b6945608b7bb8d9887ab61 Mon Sep 17 00:00:00 2001
+From: Shawn Bohrer <sbohrer@rgmadvisors.com>
+Date: Mon, 12 Sep 2011 09:28:04 -0500
+Subject: sched/rt: Migrate equal priority tasks to available CPUs
+
+From: Shawn Bohrer <sbohrer@rgmadvisors.com>
+
+commit 3be209a8e22cedafc1b6945608b7bb8d9887ab61 upstream.
+
+Commit 43fa5460fe60dea5c610490a1d263415419c60f6 ("sched: Try not to
+migrate higher priority RT tasks") also introduced a change in behavior
+which keeps RT tasks on the same CPU if there is an equal priority RT
+task currently running even if there are empty CPUs available.
+
+This can cause unnecessary wakeup latencies, and can prevent the
+scheduler from balancing all RT tasks across available CPUs.
+
+This change causes an RT task to search for a new CPU if an equal
+priority RT task is already running on wakeup.  Lower priority tasks
+will still have to wait on higher priority tasks, but the system should
+still balance out because there is always the possibility that if there
+are both high and low priority RT tasks on a given CPU that the high
+priority task could wakeup while the low priority task is running and
+force it to search for a better runqueue.
+
+Signed-off-by: Shawn Bohrer <sbohrer@rgmadvisors.com>
+Acked-by: Steven Rostedt <rostedt@goodmis.org>
+Tested-by: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/1315837684-18733-1-git-send-email-sbohrer@rgmadvisors.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ kernel/sched_rt.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/kernel/sched_rt.c
++++ b/kernel/sched_rt.c
+@@ -1038,7 +1038,7 @@ select_task_rq_rt(struct task_struct *p,
+ 	 */
+ 	if (curr && unlikely(rt_task(curr)) &&
+ 	    (curr->rt.nr_cpus_allowed < 2 ||
+-	     curr->prio < p->prio) &&
++	     curr->prio <= p->prio) &&
+ 	    (p->rt.nr_cpus_allowed > 1)) {
+ 		int target = find_lowest_rq(p);
+ 
+@@ -1569,7 +1569,7 @@ static void task_woken_rt(struct rq *rq,
+ 	    p->rt.nr_cpus_allowed > 1 &&
+ 	    rt_task(rq->curr) &&
+ 	    (rq->curr->rt.nr_cpus_allowed < 2 ||
+-	     rq->curr->prio < p->prio))
++	     rq->curr->prio <= p->prio))
+ 		push_rt_tasks(rq);
+ }
+ 

diff --git a/queue-3.0/series b/queue-3.0/series
new file mode 100644
index 0000000..bd2b856
--- /dev/null
+++ b/queue-3.0/series

@@ -0,0 +1,9 @@
+pci-set-pci-e-max-payload-size-on-fabric.patch
+pci-export-pcie_bus_configure_settings-symbol.patch
+pci-remove-mrrs-modification-from-mps-setting-code.patch
+pci-don-t-crash-when-reading-mpss-from-root-complex.patch
+sparc64-force-the-execute-bit-in-openfirmware-s-translation-entries.patch
+
+sched-rt-migrate-equal-priority-tasks-to-available-cpus.patch
+sched-fix-up-wchan-borkage.patch
+ide-disk-fix-request-requeuing.patch

diff --git a/queue-3.0/sparc64-force-the-execute-bit-in-openfirmware-s-translation-entries.patch b/queue-3.0/sparc64-force-the-execute-bit-in-openfirmware-s-translation-entries.patch
new file mode 100644
index 0000000..19527e0
--- /dev/null
+++ b/queue-3.0/sparc64-force-the-execute-bit-in-openfirmware-s-translation-entries.patch

@@ -0,0 +1,58 @@
+From f4142cba4e4065a416e78ade905bea29ff3930e6 Mon Sep 17 00:00:00 2001
+From: "David S. Miller" <davem@davemloft.net>
+Date: Thu, 29 Sep 2011 12:18:59 -0700
+Subject: sparc64: Force the execute bit in OpenFirmware's translation entries.
+Status: RO
+Content-Length: 1822
+Lines: 48
+
+From: "David S. Miller" <davem@davemloft.net>
+
+In the OF 'translations' property, the template TTEs in the mappings
+never specify the executable bit.  This is the case even though some
+of these mappings are for OF's code segment.
+
+Therefore, we need to force the execute bit on in every mapping.
+
+This problem can only really trigger on Niagara/sun4v machines and the
+history behind this is a little complicated.
+
+Previous to sun4v, the sun4u TTE entries lacked a hardware execute
+permission bit.  So OF didn't have to ever worry about setting
+anything to handle executable pages.  Any valid TTE loaded into the
+I-TLB would be respected by the chip.
+
+But sun4v Niagara chips have a real hardware enforced executable bit
+in their TTEs.  So it has to be set or else the I-TLB throws an
+instruction access exception with type code 6 (protection violation).
+
+We've been extremely fortunate to not get bitten by this in the past.
+
+The best I can tell is that the OF's mappings for it's executable code
+were mapped using permanent locked mappings on sun4v in the past.
+Therefore, the fact that we didn't have the exec bit set in the OF
+translations we would use did not matter in practice.
+
+Thanks to Greg Onufer for helping me track this down.
+
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ arch/sparc/mm/init_64.c |    5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/arch/sparc/mm/init_64.c
++++ b/arch/sparc/mm/init_64.c
+@@ -511,6 +511,11 @@ static void __init read_obp_translations
+ 		for (i = 0; i < prom_trans_ents; i++)
+ 			prom_trans[i].data &= ~0x0003fe0000000000UL;
+ 	}
++
++	/* Force execute bit on.  */
++	for (i = 0; i < prom_trans_ents; i++)
++		prom_trans[i].data |= (tlb_type == hypervisor ?
++				       _PAGE_EXEC_4V : _PAGE_EXEC_4U);
+ }
+ 
+ static void __init hypervisor_tlb_lock(unsigned long vaddr,
commit	410449452dd3868bff8971f247c7f42652d7bab3	[log] [tgz]
author	Greg Kroah-Hartman <gregkh@suse.de>	Mon Oct 03 14:36:48 2011 -0700
committer	Greg Kroah-Hartman <gregkh@suse.de>	Mon Oct 03 14:36:48 2011 -0700
tree	c85585ed990683c9ef2ce96a98e1a7bcb1fd86e7
parent	380dd8e0ffba79547f9346ebb6a943eae837c2b5 [diff]