From: Bruce Allan <bruce.w.allan@intel.com>
Date: Fri, 24 Aug 2012 20:38:11 +0000
Subject: e1000e: DoS while TSO enabled caused by link partner with small MSS

commit d821a4c4d11ad160925dab2bb009b8444beff484 upstream.

With a low enough MSS on the link partner and TSO enabled locally, the
networking stack can periodically send a very large (e.g. 64KB) TCP
message for which the driver will attempt to use more Tx descriptors than
are available by default in the Tx ring. This is due to a workaround in
the code that imposes a limit of only 4 MSS-sized segments per descriptor,
which appears to be a carry-over from the older e1000 driver and may be
applicable only to some older PCI or PCI-X parts which are not supported in
e1000e. When the driver gets a message that is too large to fit across the
configured number of Tx descriptors, it stops the upper stack from queueing
any more and gets stuck in this state. After a timeout, the upper stack
assumes the adapter is hung and calls the driver to reset it.
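
For illustration only (not part of the patch), a small user-space sketch of
the failing arithmetic under the old 4*MSS cap, assuming a link partner MSS
of 88 and the default 256-entry Tx ring; fls32() stands in for the kernel's
fls():

  #include <stdio.h>

  /* 1-based index of the most significant set bit, like the kernel's fls() */
  static unsigned int fls32(unsigned int x)
  {
          unsigned int r = 0;
          while (x) {
                  x >>= 1;
                  r++;
          }
          return r;
  }

  int main(void)
  {
          unsigned int mss = 88;          /* assumed small peer MSS */
          /* old cap: max_per_txd = min(mss << 2, 8192) = 352 bytes */
          unsigned int max_per_txd = (mss << 2) < 8192 ? (mss << 2) : 8192;
          /* rounded down to a power of two: 2^8 = 256 bytes per descriptor */
          unsigned int max_txd_pwr = fls32(max_per_txd) - 1;
          unsigned int len = 64 * 1024;   /* one large TSO message */
          /* old TXD_USE_COUNT(S, X) = ((S) >> (X)) + 1 */
          unsigned int count = (len >> max_txd_pwr) + 1;

          printf("%u descriptors needed vs. a 256-entry ring\n", count); /* 257 */
          return 0;
  }

Since the ring can never have 257 free entries, the queue stays stopped
until the watchdog declares the adapter hung and resets it.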

Remove the unnecessary limitation of using up to only 4 MSS-sized segments
per Tx descriptor, and put in a hard failure test (a BUG_ON) to catch any
attempt to check for a message size larger than the whole Tx ring can hold.
Refactor the remaining logic that limits the size of data per Tx descriptor
from a seemingly arbitrary 8KB to a limit based on the dynamic size of the
Tx packet buffer as described in the hardware specification.
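
As a rough, user-space sketch of the new per-descriptor limit (the PBA
value below is an assumed example, not taken from any particular part):
the upper 16 bits of the PBA register give the Tx packet buffer allocation
in KB, from which 96 bytes are subtracted and the result capped at 24KB:

  #include <stdio.h>

  int main(void)
  {
          unsigned int pba = 0x00140014;  /* assumed: 20KB Tx, 20KB Rx split */
          unsigned int tx_alloc = (pba >> 16) << 10; /* Tx share in bytes */
          unsigned int limit = tx_alloc - 96;
          if (limit > (24u << 10))
                  limit = 24u << 10;      /* receive synchronization bound */

          printf("tx_fifo_limit = %u bytes\n", limit); /* 20384 */
          return 0;
  }

With a limit like that, the same 64KB message counts as only
DIV_ROUND_UP(65536, 20384) = 4 descriptors instead of 257.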

Also, fix the logic in the check for space in the Tx ring for the next
largest possible packet after the current one has been successfully queued
for transmit, and use the appropriate defines for default ring sizes in
e1000_probe instead of magic values.
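
The reworked reservation can be checked in isolation (sketch only; the
PAGE_SIZE and MAX_SKB_FRAGS values below assume 4KB pages on a 3.2 kernel,
and the tx_fifo_limit value comes from the sketch above):

  #include <stdio.h>

  #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

  int main(void)
  {
          unsigned int page_size = 4096;          /* assumed PAGE_SIZE */
          unsigned int max_skb_frags = 18;        /* 65536 / 4096 + 2 */
          unsigned int tx_fifo_limit = 20384;     /* assumed, see above */

          /* worst case: each frag needs ceil(PAGE_SIZE / tx_fifo_limit)
           * descriptors, plus 2 spare descriptors */
          unsigned int reserve = max_skb_frags *
                  DIV_ROUND_UP(page_size, tx_fifo_limit) + 2;

          printf("stop the queue unless %u descriptors are free\n", reserve);
          return 0;
  }

The old fixed reservation of MAX_SKB_FRAGS + 2 did not account for a frag
spanning more than one descriptor.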

This issue goes back to the introduction of e1000e in 2.6.24 when it was
split off from e1000.

Reported-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
[bwh: Backported to 3.2:
 - Adjust context
 - Adjust for use of net_device vs e1000_ring parameter]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
--- a/drivers/net/ethernet/intel/e1000e/e1000.h
+++ b/drivers/net/ethernet/intel/e1000e/e1000.h
@@ -302,6 +302,7 @@ struct e1000_adapter {
 	 */
 	struct e1000_ring *tx_ring /* One per active queue */
 						____cacheline_aligned_in_smp;
+	u32 tx_fifo_limit;
 
 	struct napi_struct napi;
 
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -3386,6 +3386,15 @@ void e1000e_reset(struct e1000_adapter *
 	}
 
 	/*
+	 * Alignment of Tx data is on an arbitrary byte boundary with the
+	 * maximum size per Tx descriptor limited only to the transmit
+	 * allocation of the packet buffer minus 96 bytes with an upper
+	 * limit of 24KB due to receive synchronization limitations.
+	 */
+	adapter->tx_fifo_limit = min_t(u32, ((er32(PBA) >> 16) << 10) - 96,
+				       24 << 10);
+
+	/*
 	 * Disable Adaptive Interrupt Moderation if 2 full packets cannot
 	 * fit in receive buffer and early-receive not supported.
 	 */
@@ -4647,13 +4656,9 @@ static bool e1000_tx_csum(struct e1000_a
 	return 1;
 }
 
-#define E1000_MAX_PER_TXD	8192
-#define E1000_MAX_TXD_PWR	12
-
 static int e1000_tx_map(struct e1000_adapter *adapter,
 			struct sk_buff *skb, unsigned int first,
-			unsigned int max_per_txd, unsigned int nr_frags,
-			unsigned int mss)
+			unsigned int max_per_txd, unsigned int nr_frags)
 {
 	struct e1000_ring *tx_ring = adapter->tx_ring;
 	struct pci_dev *pdev = adapter->pdev;
@@ -4882,20 +4887,19 @@ static int e1000_maybe_stop_tx(struct ne
 {
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 
+	BUG_ON(size > adapter->tx_ring->count);
+
 	if (e1000_desc_unused(adapter->tx_ring) >= size)
 		return 0;
 	return __e1000_maybe_stop_tx(netdev, size);
 }
 
-#define TXD_USE_COUNT(S, X) (((S) >> (X)) + 1 )
 static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 				    struct net_device *netdev)
 {
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 	struct e1000_ring *tx_ring = adapter->tx_ring;
 	unsigned int first;
-	unsigned int max_per_txd = E1000_MAX_PER_TXD;
-	unsigned int max_txd_pwr = E1000_MAX_TXD_PWR;
 	unsigned int tx_flags = 0;
 	unsigned int len = skb_headlen(skb);
 	unsigned int nr_frags;
@@ -4915,18 +4919,8 @@ static netdev_tx_t e1000_xmit_frame(stru
 	}
 
 	mss = skb_shinfo(skb)->gso_size;
-	/*
-	 * The controller does a simple calculation to
-	 * make sure there is enough room in the FIFO before
-	 * initiating the DMA for each buffer. The calc is:
-	 * 4 = ceil(buffer len/mss). To make sure we don't
-	 * overrun the FIFO, adjust the max buffer len if mss
-	 * drops.
-	 */
 	if (mss) {
 		u8 hdr_len;
-		max_per_txd = min(mss << 2, max_per_txd);
-		max_txd_pwr = fls(max_per_txd) - 1;
 
 		/*
 		 * TSO Workaround for 82571/2/3 Controllers -- if skb->data
@@ -4956,12 +4950,12 @@ static netdev_tx_t e1000_xmit_frame(stru
 	count++;
 	count++;
 
-	count += TXD_USE_COUNT(len, max_txd_pwr);
+	count += DIV_ROUND_UP(len, adapter->tx_fifo_limit);
 
 	nr_frags = skb_shinfo(skb)->nr_frags;
 	for (f = 0; f < nr_frags; f++)
-		count += TXD_USE_COUNT(skb_frag_size(&skb_shinfo(skb)->frags[f]),
-				       max_txd_pwr);
+		count += DIV_ROUND_UP(skb_frag_size(&skb_shinfo(skb)->frags[f]),
+				      adapter->tx_fifo_limit);
 
 	if (adapter->hw.mac.tx_pkt_filtering)
 		e1000_transfer_dhcp_info(adapter, skb);
@@ -5000,12 +4994,15 @@ static netdev_tx_t e1000_xmit_frame(stru
 		tx_flags |= E1000_TX_FLAGS_IPV4;
 
 	/* if count is 0 then mapping error has occurred */
-	count = e1000_tx_map(adapter, skb, first, max_per_txd, nr_frags, mss);
+	count = e1000_tx_map(adapter, skb, first, adapter->tx_fifo_limit,
+			     nr_frags);
 	if (count) {
 		e1000_tx_queue(adapter, tx_flags, count);
 		/* Make sure there is space in the ring for the next send. */
-		e1000_maybe_stop_tx(netdev, MAX_SKB_FRAGS + 2);
-
+		e1000_maybe_stop_tx(netdev,
+				    (MAX_SKB_FRAGS *
+				     DIV_ROUND_UP(PAGE_SIZE,
+						  adapter->tx_fifo_limit) + 2));
 	} else {
 		dev_kfree_skb_any(skb);
 		tx_ring->buffer_info[first].time_stamp = 0;
@@ -6150,8 +6147,8 @@ static int __devinit e1000_probe(struct
 	adapter->hw.phy.autoneg_advertised = 0x2f;
 
 	/* ring size defaults */
-	adapter->rx_ring->count = 256;
-	adapter->tx_ring->count = 256;
+	adapter->rx_ring->count = E1000_DEFAULT_RXD;
+	adapter->tx_ring->count = E1000_DEFAULT_TXD;
 
 	/*
 	 * Initial Wake on LAN setting - If APM wake is enabled in