| From bhutchings@solarflare.com Mon Jan 14 13:03:31 2013 |
| From: Greg Rose <gregory.v.rose@intel.com> |
| Date: Fri, 4 Jan 2013 00:33:34 +0000 |
| Subject: rtnetlink: Fix problem with buffer allocation |
| To: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| Message-ID: <1357259614.2685.62.camel@bwh-desktop.uk.solarflarecom.com> |
| |
| |
| From: Greg Rose <gregory.v.rose@intel.com> |
| |
| commit 115c9b81928360d769a76c632bae62d15206a94a upstream. |
| |
| Implement a new netlink attribute type IFLA_EXT_MASK. The mask |
| is a 32 bit value that can be used to indicate to the kernel that |
| certain extended ifinfo values are requested by the user application. |
| At this time the only mask value defined is RTEXT_FILTER_VF to |
| indicate that the user wants the ifinfo dump to send information |
| about the VFs belonging to the interface. |
| |
| This patch fixes a bug in which certain applications do not have |
| large enough buffers to accommodate the extra information returned |
| by the kernel with large numbers of SR-IOV virtual functions. |
| Those applications will not send the new netlink attribute with |
| the interface info dump request netlink messages so they will |
| not get unexpectedly large request buffers returned by the kernel. |
| |
| Modifies the rtnl_calcit function to traverse the list of net |
| devices and compute the minimum buffer size that can hold the |
| info dumps of all matching devices based upon the filter passed |
| in via the new netlink attribute filter mask. If no filter |
| mask is sent then the buffer allocation defaults to NLMSG_GOODSIZE. |
| |
| With this change it is possible to add yet to be defined netlink |
| attributes to the dump request which should make it fairly extensible |
| in the future. |
| |
| Signed-off-by: Greg Rose <gregory.v.rose@intel.com> |
| Acked-by: Greg Rose <gregory.v.rose@intel.com> |
| Signed-off-by: David S. Miller <davem@davemloft.net> |
| [bwh: Backported to 3.0: |
| - Adjust context |
| - Drop the change in do_setlink() that reverts commit f18da1456581 |
| ('net: RTNETLINK adjusting values of min_ifinfo_dump_size'), which |
| was never applied here] |
| Signed-off-by: Ben Hutchings <bhutchings@solarflare.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| include/linux/if_link.h | 1 |
| include/linux/rtnetlink.h | 3 + |
| include/net/rtnetlink.h | 2 - |
| net/core/rtnetlink.c | 77 ++++++++++++++++++++++++++++++++++------------ |
| 4 files changed, 63 insertions(+), 20 deletions(-) |
| |
| --- a/include/linux/if_link.h |
| +++ b/include/linux/if_link.h |
| @@ -137,6 +137,7 @@ enum { |
| IFLA_AF_SPEC, |
| IFLA_GROUP, /* Group the device belongs to */ |
| IFLA_NET_NS_FD, |
| + IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ |
| __IFLA_MAX |
| }; |
| |
| --- a/include/linux/rtnetlink.h |
| +++ b/include/linux/rtnetlink.h |
| @@ -600,6 +600,9 @@ struct tcamsg { |
| #define TCA_ACT_TAB 1 /* attr type must be >=1 */ |
| #define TCAA_MAX 1 |
| |
| +/* New extended info filters for IFLA_EXT_MASK */ |
| +#define RTEXT_FILTER_VF (1 << 0) |
| + |
| /* End of information exported to user level */ |
| |
| #ifdef __KERNEL__ |
| --- a/include/net/rtnetlink.h |
| +++ b/include/net/rtnetlink.h |
| @@ -6,7 +6,7 @@ |
| |
| typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, void *); |
| typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); |
| -typedef u16 (*rtnl_calcit_func)(struct sk_buff *); |
| +typedef u16 (*rtnl_calcit_func)(struct sk_buff *, struct nlmsghdr *); |
| |
| extern int __rtnl_register(int protocol, int msgtype, |
| rtnl_doit_func, rtnl_dumpit_func, |
| --- a/net/core/rtnetlink.c |
| +++ b/net/core/rtnetlink.c |
| @@ -60,7 +60,6 @@ struct rtnl_link { |
| }; |
| |
| static DEFINE_MUTEX(rtnl_mutex); |
| -static u16 min_ifinfo_dump_size; |
| |
| void rtnl_lock(void) |
| { |
| @@ -727,10 +726,11 @@ static void copy_rtnl_link_stats64(void |
| } |
| |
| /* All VF info */ |
| -static inline int rtnl_vfinfo_size(const struct net_device *dev) |
| +static inline int rtnl_vfinfo_size(const struct net_device *dev, |
| + u32 ext_filter_mask) |
| { |
| - if (dev->dev.parent && dev_is_pci(dev->dev.parent)) { |
| - |
| + if (dev->dev.parent && dev_is_pci(dev->dev.parent) && |
| + (ext_filter_mask & RTEXT_FILTER_VF)) { |
| int num_vfs = dev_num_vf(dev->dev.parent); |
| size_t size = nla_total_size(sizeof(struct nlattr)); |
| size += nla_total_size(num_vfs * sizeof(struct nlattr)); |
| @@ -768,7 +768,8 @@ static size_t rtnl_port_size(const struc |
| return port_self_size; |
| } |
| |
| -static noinline size_t if_nlmsg_size(const struct net_device *dev) |
| +static noinline size_t if_nlmsg_size(const struct net_device *dev, |
| + u32 ext_filter_mask) |
| { |
| return NLMSG_ALIGN(sizeof(struct ifinfomsg)) |
| + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ |
| @@ -786,8 +787,9 @@ static noinline size_t if_nlmsg_size(con |
| + nla_total_size(4) /* IFLA_MASTER */ |
| + nla_total_size(1) /* IFLA_OPERSTATE */ |
| + nla_total_size(1) /* IFLA_LINKMODE */ |
| - + nla_total_size(4) /* IFLA_NUM_VF */ |
| - + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ |
| + + nla_total_size(ext_filter_mask |
| + & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ |
| + + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ |
| + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ |
| + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ |
| + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ |
| @@ -870,7 +872,7 @@ static int rtnl_port_fill(struct sk_buff |
| |
| static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, |
| int type, u32 pid, u32 seq, u32 change, |
| - unsigned int flags) |
| + unsigned int flags, u32 ext_filter_mask) |
| { |
| struct ifinfomsg *ifm; |
| struct nlmsghdr *nlh; |
| @@ -943,10 +945,11 @@ static int rtnl_fill_ifinfo(struct sk_bu |
| goto nla_put_failure; |
| copy_rtnl_link_stats64(nla_data(attr), stats); |
| |
| - if (dev->dev.parent) |
| + if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) |
| NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); |
| |
| - if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { |
| + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent |
| + && (ext_filter_mask & RTEXT_FILTER_VF)) { |
| int i; |
| |
| struct nlattr *vfinfo, *vf; |
| @@ -1033,11 +1036,20 @@ static int rtnl_dump_ifinfo(struct sk_bu |
| struct net_device *dev; |
| struct hlist_head *head; |
| struct hlist_node *node; |
| + struct nlattr *tb[IFLA_MAX+1]; |
| + u32 ext_filter_mask = 0; |
| |
| s_h = cb->args[0]; |
| s_idx = cb->args[1]; |
| |
| rcu_read_lock(); |
| + |
| + nlmsg_parse(cb->nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, |
| + ifla_policy); |
| + |
| + if (tb[IFLA_EXT_MASK]) |
| + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); |
| + |
| for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { |
| idx = 0; |
| head = &net->dev_index_head[h]; |
| @@ -1047,7 +1059,8 @@ static int rtnl_dump_ifinfo(struct sk_bu |
| if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, |
| NETLINK_CB(cb->skb).pid, |
| cb->nlh->nlmsg_seq, 0, |
| - NLM_F_MULTI) <= 0) |
| + NLM_F_MULTI, |
| + ext_filter_mask) <= 0) |
| goto out; |
| cont: |
| idx++; |
| @@ -1081,6 +1094,7 @@ const struct nla_policy ifla_policy[IFLA |
| [IFLA_VF_PORTS] = { .type = NLA_NESTED }, |
| [IFLA_PORT_SELF] = { .type = NLA_NESTED }, |
| [IFLA_AF_SPEC] = { .type = NLA_NESTED }, |
| + [IFLA_EXT_MASK] = { .type = NLA_U32 }, |
| }; |
| EXPORT_SYMBOL(ifla_policy); |
| |
| @@ -1813,6 +1827,7 @@ static int rtnl_getlink(struct sk_buff * |
| struct net_device *dev = NULL; |
| struct sk_buff *nskb; |
| int err; |
| + u32 ext_filter_mask = 0; |
| |
| err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); |
| if (err < 0) |
| @@ -1821,6 +1836,9 @@ static int rtnl_getlink(struct sk_buff * |
| if (tb[IFLA_IFNAME]) |
| nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); |
| |
| + if (tb[IFLA_EXT_MASK]) |
| + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); |
| + |
| ifm = nlmsg_data(nlh); |
| if (ifm->ifi_index > 0) |
| dev = __dev_get_by_index(net, ifm->ifi_index); |
| @@ -1832,12 +1850,12 @@ static int rtnl_getlink(struct sk_buff * |
| if (dev == NULL) |
| return -ENODEV; |
| |
| - nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); |
| + nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); |
| if (nskb == NULL) |
| return -ENOBUFS; |
| |
| err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, |
| - nlh->nlmsg_seq, 0, 0); |
| + nlh->nlmsg_seq, 0, 0, ext_filter_mask); |
| if (err < 0) { |
| /* -EMSGSIZE implies BUG in if_nlmsg_size */ |
| WARN_ON(err == -EMSGSIZE); |
| @@ -1848,8 +1866,31 @@ static int rtnl_getlink(struct sk_buff * |
| return err; |
| } |
| |
| -static u16 rtnl_calcit(struct sk_buff *skb) |
| +static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) |
| { |
| + struct net *net = sock_net(skb->sk); |
| + struct net_device *dev; |
| + struct nlattr *tb[IFLA_MAX+1]; |
| + u32 ext_filter_mask = 0; |
| + u16 min_ifinfo_dump_size = 0; |
| + |
| + nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, ifla_policy); |
| + |
| + if (tb[IFLA_EXT_MASK]) |
| + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); |
| + |
| + if (!ext_filter_mask) |
| + return NLMSG_GOODSIZE; |
| + /* |
| + * traverse the list of net devices and compute the minimum |
| + * buffer size based upon the filter mask. |
| + */ |
| + list_for_each_entry(dev, &net->dev_base_head, dev_list) { |
| + min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size, |
| + if_nlmsg_size(dev, |
| + ext_filter_mask)); |
| + } |
| + |
| return min_ifinfo_dump_size; |
| } |
| |
| @@ -1884,13 +1925,11 @@ void rtmsg_ifinfo(int type, struct net_d |
| int err = -ENOBUFS; |
| size_t if_info_size; |
| |
| - skb = nlmsg_new((if_info_size = if_nlmsg_size(dev)), GFP_KERNEL); |
| + skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL); |
| if (skb == NULL) |
| goto errout; |
| |
| - min_ifinfo_dump_size = max_t(u16, if_info_size, min_ifinfo_dump_size); |
| - |
| - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0); |
| + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); |
| if (err < 0) { |
| /* -EMSGSIZE implies BUG in if_nlmsg_size() */ |
| WARN_ON(err == -EMSGSIZE); |
| @@ -1948,7 +1987,7 @@ static int rtnetlink_rcv_msg(struct sk_b |
| return -EOPNOTSUPP; |
| calcit = rtnl_get_calcit(family, type); |
| if (calcit) |
| - min_dump_alloc = calcit(skb); |
| + min_dump_alloc = calcit(skb, nlh); |
| |
| __rtnl_unlock(); |
| rtnl = net->rtnl; |